Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9125726
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
7 KB
Subscribers
None
View Options
diff --git a/debian/control b/debian/control
index 0bec5e7..d111162 100644
--- a/debian/control
+++ b/debian/control
@@ -1,24 +1,25 @@
Source: swh-loader-tar
Maintainer: Software Heritage developers <swh-devel@inria.fr>
Section: python
Priority: optional
Build-Depends: debhelper (>= 9),
dh-python,
python3-all,
python3-nose,
python3-setuptools,
python3-swh.core (>= 0.0.14~),
+ python3-swh.model (>= 0.0.13~),
python3-swh.scheduler,
python3-swh.storage (>= 0.0.76~),
python3-swh.loader.dir (>= 0.0.24~),
python3-vcversioner
Standards-Version: 3.9.6
Homepage: https://forge.softwareheritage.org/diffusion/DLDTAR/
Package: python3-swh.loader.tar
Architecture: all
Depends: python3-swh.core (>= 0.0.14~), python3-swh.storage (>= 0.0.76~),
python3-swh.loader.dir (>= 0.0.24~), python3-swh.scheduler,
${misc:Depends},
${python3:Depends}
Description: Software Heritage Tarball Loader
diff --git a/requirements-swh.txt b/requirements-swh.txt
index cd41b69..08432c5 100644
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,4 +1,5 @@
swh.core >= 0.0.14
+swh.model >= 0.0.13
swh.scheduler
swh.storage >= 0.0.76
swh.loader.dir >= 0.0.24
diff --git a/swh/loader/tar/loader.py b/swh/loader/tar/loader.py
index 7d320f4..9d2ac5d 100644
--- a/swh/loader/tar/loader.py
+++ b/swh/loader/tar/loader.py
@@ -1,100 +1,100 @@
# Copyright (C) 2015-2017 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os
import tempfile
import shutil
-from swh.core import hashutil
from swh.loader.dir import loader
from swh.loader.tar import tarball, utils
+from swh.model import hashutil
class TarLoader(loader.DirLoader):
"""A tarball loader:
- creates an origin if it does not exist
- creates a fetch_history entry
- creates an origin_visit
- uncompress locally the tarballs in a temporary location
- process the content of the tarballs to persist on swh storage
- clean up the temporary location
- write an entry in fetch_history to mark the loading tarball end
(success or failure)
Args:
- tarpath: path to the tarball to uncompress
- origin: Dictionary origin
- url: url origin we fetched
- type: type of the origin
- visit_date (str): To override the visit date
- revision: Dictionary of information needed, keys are:
- author_name: revision's author name
- author_email: revision's author email
- author_date: timestamp (e.g. 1444054085)
- author_offset: date offset e.g. -0220, +0100
- committer_name: revision's committer name
- committer_email: revision's committer email
- committer_date: timestamp
- committer_offset: date offset e.g. -0220, +0100
- type: type of revision dir, tar
- message: synthetic message for the revision
- occurrences: List of occurrence dictionary.
Information needed, keys are:
- branch: occurrence's branch name
- authority_id: authority id (e.g. 1 for swh)
- validity: validity date (e.g. 2015-01-01 00:00:00+00)
"""
CONFIG_BASE_FILENAME = 'loader/tar'
ADDITIONAL_CONFIG = {
'extraction_dir': ('string', '/tmp')
}
def __init__(self):
    """Initialise the parent directory loader with this loader's
       logging class name.

    """
    logging_class = 'swh.loader.tar.TarLoader'
    super().__init__(logging_class=logging_class)
def prepare(self, *args, **kwargs):
"""1. Uncompress the tarball in a temporary directory.
2. Compute some metadata to update the revision.
"""
tarpath, origin, visit_date, revision, occs = args
if 'type' not in origin: # let the type flow if present
origin['type'] = 'tar'
# Prepare the extraction path
extraction_dir = self.config['extraction_dir']
os.makedirs(extraction_dir, 0o755, exist_ok=True)
dir_path = tempfile.mkdtemp(prefix='swh.loader.tar-',
dir=extraction_dir)
# add checksums in revision
- artifact = utils.convert_to_hex(hashutil.hashfile(tarpath))
+ artifact = utils.convert_to_hex(hashutil.hash_path(tarpath))
artifact['name'] = os.path.basename(tarpath)
self.log.info('Uncompress %s to %s' % (tarpath, dir_path))
nature = tarball.uncompress(tarpath, dir_path)
artifact['archive_type'] = nature
artifact['length'] = os.path.getsize(tarpath)
revision['metadata'] = {
'original_artifact': [artifact],
}
self.dir_path = dir_path
super().prepare(dir_path, origin, visit_date, revision, None, occs)
def cleanup(self):
    """Clean up temporary directory where we uncompress the tarball.

    Safe to call when no extraction directory was recorded (for example
    when `prepare` did not run to completion) or when the directory has
    already been removed: in both cases this does nothing.

    """
    # `dir_path` is only assigned at the end of `prepare`, so it may be
    # missing altogether; treat that the same as "nothing to clean".
    dir_path = getattr(self, 'dir_path', None)
    if not dir_path:
        return
    if os.path.exists(dir_path):
        shutil.rmtree(dir_path)
diff --git a/swh/loader/tar/utils.py b/swh/loader/tar/utils.py
index 67706d6..1edbdfc 100644
--- a/swh/loader/tar/utils.py
+++ b/swh/loader/tar/utils.py
@@ -1,78 +1,78 @@
# Copyright (C) 2015-2017 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import itertools
import random
-from swh.core import hashutil
+from swh.model import hashutil
def commonname(path0, path1, as_str=False):
    """Compute the commonname between the path0 and path1.

    Args:
        path0: the leading part to strip from path1
        path1: a path in which path0 occurs
        as_str: not used by this function; kept for existing callers

    Returns:
        Everything in path1 that follows the first occurrence of path0.

    Raises:
        IndexError: if path0 does not occur in path1
        ValueError: if path0 is empty

    """
    # Split on the first occurrence only: without maxsplit, a path1 that
    # contains path0 a second time would be cut short at that point.
    return path1.split(path0, 1)[1]
def convert_to_hex(d):
    """Build a copy of a flat dictionary whose values are hex strings.

    Args:
        d: flat dictionary whose values are passed to
           hashutil.hash_to_hex.

    Returns:
        A new dictionary with the same keys and converted values. A falsy
        input (None, empty dictionary) is returned as is.

    """
    if d:
        return {
            name: hashutil.hash_to_hex(digest)
            for name, digest in d.items()
        }
    return d
def grouper(iterable, n, fillvalue=None):
    """Split an iterable into consecutive tuples of n elements.

    Args:
        iterable: an iterable
        n: number of elements per tuple
        fillvalue: padding used to complete the last tuple

    Returns:
        An iterator of n-sized tuples; the last one is padded with
        fillvalue when the input runs out.

    """
    # The very same iterator is referenced n times, so each output tuple
    # consumes n consecutive elements of the input.
    source = iter(iterable)
    slots = (source,) * n
    return itertools.zip_longest(*slots, fillvalue=fillvalue)
def random_blocks(iterable, block=100, fillvalue=None):
    """Given an iterable:
    - slice the iterable into data sets of block-sized elements
    - randomize each data set
    - yield each element

    Elements are only shuffled within their own block; the blocks
    themselves are yielded in the original order.

    Args:
        iterable: iterable of data
        block: number of elements per block
        fillvalue: a fillvalue for the last block if not enough values in
        last block

    Returns:
        An iterable of randomized per block-size elements.

    """
    for chunk in grouper(iterable, block, fillvalue=fillvalue):
        # grouper yields tuples; shuffle needs a mutable sequence
        shuffled = list(chunk)
        random.shuffle(shuffled)
        for element in shuffled:
            yield element
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Sat, Jun 21, 9:15 PM (4 w, 1 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3344582
Attached To
rDLDTAR Tarball Loader
Event Timeline
Log In to Comment