diff --git a/PKG-INFO b/PKG-INFO index 37fe6f6..6e91a52 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.loader.tar -Version: 0.0.20 +Version: 0.0.21 Summary: Software Heritage Tarball Loader Home-page: https://forge.softwareheritage.org/diffusion/DLDTAR Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/debian/control b/debian/control index fd13d3b..5fe4dc6 100644 --- a/debian/control +++ b/debian/control @@ -1,23 +1,22 @@ Source: swh-loader-tar Maintainer: Software Heritage developers Section: python Priority: optional Build-Depends: debhelper (>= 9), dh-python, python3-all, python3-nose, python3-setuptools, python3-swh.core (>= 0.0.14~), python3-swh.scheduler, python3-swh.storage (>= 0.0.31~), - python3-swh.loader.dir (>= 0.0.21~), - python3-swh.loader.core (>= 0.0.10~), + python3-swh.loader.dir (>= 0.0.22~), python3-vcversioner Standards-Version: 3.9.6 Homepage: https://forge.softwareheritage.org/diffusion/DLDTAR/ Package: python3-swh.loader.tar Architecture: all Depends: ${misc:Depends}, ${python3:Depends} Description: Software Heritage Tarball Loader diff --git a/requirements.txt b/requirements.txt index f45e2c0..cfc0175 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,9 @@ # Add here external Python modules dependencies, one per line. Module names # should match https://pypi.python.org/pypi names. 
For the full spec or # dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html vcversioner swh.core >= 0.0.14 swh.scheduler swh.storage >= 0.0.31 -swh.loader.dir >= 0.0.21 -swh.loader.core >= 0.0.10 +swh.loader.dir >= 0.0.22 retrying diff --git a/swh.loader.tar.egg-info/PKG-INFO b/swh.loader.tar.egg-info/PKG-INFO index 37fe6f6..6e91a52 100644 --- a/swh.loader.tar.egg-info/PKG-INFO +++ b/swh.loader.tar.egg-info/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.loader.tar -Version: 0.0.20 +Version: 0.0.21 Summary: Software Heritage Tarball Loader Home-page: https://forge.softwareheritage.org/diffusion/DLDTAR Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/swh.loader.tar.egg-info/requires.txt b/swh.loader.tar.egg-info/requires.txt index f8992da..c895c5d 100644 --- a/swh.loader.tar.egg-info/requires.txt +++ b/swh.loader.tar.egg-info/requires.txt @@ -1,7 +1,6 @@ retrying swh.core>=0.0.14 -swh.loader.core>=0.0.10 -swh.loader.dir>=0.0.21 +swh.loader.dir>=0.0.22 swh.scheduler swh.storage>=0.0.31 vcversioner diff --git a/swh/loader/tar/loader.py b/swh/loader/tar/loader.py index 002f091..9ef1376 100644 --- a/swh/loader/tar/loader.py +++ b/swh/loader/tar/loader.py @@ -1,93 +1,130 @@ # Copyright (C) 2015-2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import datetime import os import tempfile import shutil from swh.core import hashutil from swh.loader.dir import loader from swh.loader.tar import tarball, utils class TarLoader(loader.DirLoader): """A tarball loader. 
""" CONFIG_BASE_FILENAME = 'loader/tar.ini' ADDITIONAL_CONFIG = { 'extraction_dir': ('string', '/tmp') } - def __init__(self, origin_id): - super().__init__(origin_id, - logging_class='swh.loader.tar.TarLoader') + def __init__(self): + super().__init__(logging_class='swh.loader.tar.TarLoader') - def process(self, tarpath, origin, revision, release, occurrences): - """Load a tarball in backend. + def load(self, tarpath, origin, visit, revision, release, occurrences): + """ + Load a tarball in backend. This will: - - persist the origin if it does not exist. - - write an entry in fetch_history to mark the loading tarball start - uncompress locally the tarballs in a temporary location - process the content of the tarballs to persist on swh storage - clean up the temporary location - write an entry in fetch_history to mark the loading tarball end Args: - tarpath: path to the tarball to uncompress - origin: Dictionary origin - url: url origin we fetched - type: type of the origin + - visit: Numbered visit - revision: Dictionary of information needed, keys are: - author_name: revision's author name - author_email: revision's author email - author_date: timestamp (e.g. 1444054085) - author_offset: date offset e.g. -0220, +0100 - committer_name: revision's committer name - committer_email: revision's committer email - committer_date: timestamp - committer_offset: date offset e.g. -0220, +0100 - type: type of revision dir, tar - message: synthetic message for the revision - release: Dictionary of information needed, keys are: - name: release name - date: release timestamp (e.g. 1444054085) - offset: release date offset e.g. -0220, +0100 - author_name: release author's name - author_email: release author's email - comment: release's comment message - occurrences: List of occurrence dictionary. Information needed, keys are: - branch: occurrence's branch name - authority_id: authority id (e.g. 1 for swh) - validity: validity date (e.g. 
2015-01-01 00:00:00+00) """ # Prepare the extraction path extraction_dir = self.config['extraction_dir'] os.makedirs(extraction_dir, 0o755, exist_ok=True) dir_path = tempfile.mkdtemp(prefix='swh.loader.tar-', dir=extraction_dir) # add checksums in revision artifact = utils.convert_to_hex(hashutil.hashfile(tarpath)) artifact['name'] = os.path.basename(tarpath) try: self.log.info('Uncompress %s to %s' % (tarpath, dir_path)) nature = tarball.uncompress(tarpath, dir_path) artifact['archive_type'] = nature artifact['length'] = os.path.getsize(tarpath) revision['metadata'] = { 'original_artifact': [artifact], } - return super().process(dir_path, origin, revision, release, - occurrences) + return super().load( + dir_path, origin, visit, revision, release, occurrences) finally: shutil.rmtree(dir_path) + + def prepare_and_load(self, + tarpath, origin, revision, release, occurrences): + """ + Prepare origin, fetch_history, origin_visit + Then load a tarball 'tarpath'. + Then close origin_visit, fetch_history + + First: + - creates an origin if it does not exist + - creates a fetch_history entry + - creates an origin_visit + - Then loads the tarball + + """ + if 'type' not in origin: # let the type flow if present + origin['type'] = 'tar' + + self.origin_id = self.storage.origin_add_one(origin) + origin['id'] = self.origin_id + + date_visit = datetime.datetime.now(tz=datetime.timezone.utc) + origin_visit = self.storage.origin_visit_add(origin['id'], date_visit) + visit = origin_visit['visit'] + + fetch_history_id = self.open_fetch_history() + + try: + self.load(tarpath, origin, visit, revision, release, occurrences) + self.close_fetch_history_success(fetch_history_id) + self.storage.origin_visit_update( + self.origin_id, origin_visit['visit'], status='full') + except: + self.close_fetch_history_failure(fetch_history_id) + self.storage.origin_visit_update( + self.origin_id, origin_visit['visit'], status='partial') + raise diff --git a/swh/loader/tar/tasks.py
b/swh/loader/tar/tasks.py index d40cb2a..c6ea825 100644 --- a/swh/loader/tar/tasks.py +++ b/swh/loader/tar/tasks.py @@ -1,40 +1,27 @@ # Copyright (C) 2015-2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from swh.loader.core import tasks +from swh.scheduler.task import Task from swh.loader.tar.loader import TarLoader -class LoadTarRepository(tasks.LoaderCoreTask): +class LoadTarRepository(Task): """Import a directory to Software Heritage """ task_queue = 'swh_loader_tar' - CONFIG_BASE_FILENAME = 'loader/tar.ini' def run(self, tarpath, origin, revision, release, occurrences): """Import a tarball into swh. Args: - tarpath: path to a tarball file - origin, revision, release, occurrences: cf. swh.loader.dir.loader.run docstring """ - if 'type' not in origin: # let the type flow if present - origin['type'] = 'tar' - - origin['id'] = self.storage.origin_add_one(origin) - - fetch_history_id = self.open_fetch_history(origin['id']) - - result = TarLoader(origin['id']).process(tarpath, - origin, - revision, - release, - occurrences) - - self.close_fetch_history(fetch_history_id, result) + TarLoader().prepare_and_load( + tarpath, origin, revision, release, occurrences) diff --git a/version.txt b/version.txt index 58f39a6..b9937ac 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.20-0-ge777e55 \ No newline at end of file +v0.0.21-0-gf330178 \ No newline at end of file