diff --git a/PKG-INFO b/PKG-INFO
index 2475c0a..bab54c2 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,79 +1,82 @@
 Metadata-Version: 2.1
 Name: swh.loader.tar
-Version: 0.0.39
+Version: 0.0.40
 Summary: Software Heritage Tarball Loader
 Home-page: https://forge.softwareheritage.org/diffusion/DLDTAR
 Author: Software Heritage developers
 Author-email: swh-devel@inria.fr
 License: UNKNOWN
 Project-URL: Funding, https://www.softwareheritage.org/donate
-Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
 Project-URL: Source, https://forge.softwareheritage.org/source/swh-loader-tar
+Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
 Description: # SWH Tarball Loader

-        The Software Heritage Tarball Loader is in charge of ingesting the
-        directory representation of the tarball into the Software Heritage
-        archive.
+        The Software Heritage Tarball Loader is in charge of ingesting the directory
+        representation of the tarball into the Software Heritage archive.

-        ## Configuration
+        ## Sample configuration

-        This is the loader's (or task's) configuration file.
+        The loader's configuration will be taken from the default configuration file:
+        `~/.config/swh/loader/tar.yml` (you can choose a different path by setting the
+        `SWH_CONFIG_FILENAME` environment variable).

-        *`{/etc/softwareheritage | ~/.config/swh | ~/.swh}`/loader/tar.yml*:
+        This file holds the information the loader needs, including the celery
+        configuration:

         ```YAML
         working_dir: /home/storage/tmp/
         storage:
           cls: remote
           args:
             url: http://localhost:5002/
+        celery:
+          task_modules:
+            - swh.loader.tar.tasks
+          task_queues:
+            - swh.loader.tar.tasks.LoadTarRepository
         ```

-        ## API
-
-        ### local
+        ### Local

         Load local tarball directly from code or python3's toplevel:

         ``` Python
         # Fill in those
         repo = '8sync.tar.gz'
         tarpath = '/home/storage/tar/%s' % repo
         origin = {'url': 'file://%s' % repo, 'type': 'tar'}
         visit_date = 'Tue, 3 May 2017 17:16:32 +0200'
         last_modified = 'Tue, 10 May 2016 16:16:32 +0200'

         import logging
         logging.basicConfig(level=logging.DEBUG)

-        from swh.loader.tar.tasks import LoadTarRepository
-        l = LoadTarRepository()
-        l.run_task(origin=origin, visit_date=visit_date,
-                   last_modified=last_modified)
+        from swh.loader.tar.tasks import load_tar
+        load_tar(origin=origin, visit_date=visit_date,
+                 last_modified=last_modified)
         ```

-        ### remote
+        ### Remote

-        Load remote tarball is the same sample
+        Loading a remote tarball works the same way:

         ```Python
         url = 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz'
         origin = {'url': url, 'type': 'tar'}
         visit_date = 'Tue, 3 May 2017 17:16:32 +0200'
         last_modified = '2016-04-22 16:35'

         import logging
         logging.basicConfig(level=logging.DEBUG)

-        from swh.loader.tar.tasks import LoadTarRepository
-        l = LoadTarRepository()
-        l.run_task(origin=origin, visit_date=visit_date,
-                   last_modified=last_modified)
+        from swh.loader.tar.tasks import load_tar
+        load_tar(origin=origin, visit_date=visit_date,
+                 last_modified=last_modified)
         ```

 Platform: UNKNOWN
 Classifier: Programming Language :: Python :: 3
 Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
 Classifier: Operating System :: OS Independent
 Classifier: Development Status :: 5 - Production/Stable
 Description-Content-Type: text/markdown
 Provides-Extra: testing
diff --git a/README.md b/README.md
index f43db04..137f16f 100644
--- a/README.md
+++ b/README.md
@@ -1,59 +1,62 @@
 # SWH Tarball Loader

-The Software Heritage Tarball Loader is in charge of ingesting the
-directory representation of the tarball into the Software Heritage
-archive.
+The Software Heritage Tarball Loader is in charge of ingesting the directory
+representation of the tarball into the Software Heritage archive.

-## Configuration
+## Sample configuration

-This is the loader's (or task's) configuration file.
+The loader's configuration will be taken from the default configuration file:
+`~/.config/swh/loader/tar.yml` (you can choose a different path by setting the
+`SWH_CONFIG_FILENAME` environment variable).

-*`{/etc/softwareheritage | ~/.config/swh | ~/.swh}`/loader/tar.yml*:
+This file holds the information the loader needs, including the celery
+configuration:

 ```YAML
 working_dir: /home/storage/tmp/
 storage:
   cls: remote
   args:
     url: http://localhost:5002/
+celery:
+  task_modules:
+    - swh.loader.tar.tasks
+  task_queues:
+    - swh.loader.tar.tasks.LoadTarRepository
 ```

-## API
-
-### local
+### Local

 Load local tarball directly from code or python3's toplevel:

 ``` Python
 # Fill in those
 repo = '8sync.tar.gz'
 tarpath = '/home/storage/tar/%s' % repo
 origin = {'url': 'file://%s' % repo, 'type': 'tar'}
 visit_date = 'Tue, 3 May 2017 17:16:32 +0200'
 last_modified = 'Tue, 10 May 2016 16:16:32 +0200'

 import logging
 logging.basicConfig(level=logging.DEBUG)

-from swh.loader.tar.tasks import LoadTarRepository
-l = LoadTarRepository()
-l.run_task(origin=origin, visit_date=visit_date,
-           last_modified=last_modified)
+from swh.loader.tar.tasks import load_tar
+load_tar(origin=origin, visit_date=visit_date,
+         last_modified=last_modified)
 ```

-### remote
+### Remote

-Load remote tarball is the same sample
+Loading a remote tarball works the same way:

 ```Python
 url = 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz'
 origin = {'url': url, 'type': 'tar'}
 visit_date = 'Tue, 3 May 2017 17:16:32 +0200'
 last_modified = '2016-04-22 16:35'

 import logging
 logging.basicConfig(level=logging.DEBUG)

-from swh.loader.tar.tasks import LoadTarRepository
-l = LoadTarRepository()
-l.run_task(origin=origin, visit_date=visit_date,
-           last_modified=last_modified)
+from swh.loader.tar.tasks import load_tar
+load_tar(origin=origin, visit_date=visit_date,
+         last_modified=last_modified)
 ```
diff --git a/debian/changelog b/debian/changelog
index ac7adc7..59c17ec 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,316 +1,319 @@
-swh-loader-tar (0.0.39-1~swh1~bpo9+1) stretch-swh; urgency=medium
+swh-loader-tar (0.0.40-1~swh1) unstable-swh; urgency=medium

-  * Rebuild for stretch-swh
+  * New upstream release 0.0.40 - (tagged by Nicolas Dandrimont
+    on 2019-04-19 08:51:58 +0200)
+  * Upstream changes: - Release swh.loader.tar v0.0.40 - Pass
+    through metadata from the caller

- -- Software Heritage autobuilder (on jenkins-debian1)  Mon, 18 Feb 2019 10:50:51 +0000
+ -- Software Heritage autobuilder (on jenkins-debian1)  Fri, 19 Apr 2019 06:58:41 +0000

 swh-loader-tar (0.0.39-1~swh1) unstable-swh; urgency=medium

   * New upstream release 0.0.39 - (tagged by Antoine R. Dumont
     (@ardumont) on 2019-02-18 11:44:02 +0100)
   * Upstream changes: - v0.0.39 - tar.loader: Fix default value
     initialization caveat

 -- Software Heritage autobuilder (on jenkins-debian1)  Mon, 18 Feb 2019 10:49:32 +0000

 swh-loader-tar (0.0.38-1~swh2) unstable-swh; urgency=medium

   * New upstream release, fix debian tree

 -- Antoine R.
Dumont (@ardumont) Thu, 14 Feb 2019 17:56:44 +0100 swh-loader-tar (0.0.38-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.38 - (tagged by David Douard on 2019-02-07 17:34:18 +0100) * Upstream changes: - v0.0.38 -- Software Heritage autobuilder (on jenkins-debian1) Thu, 07 Feb 2019 16:39:05 +0000 swh-loader-tar (0.0.35-1~swh1) unstable-swh; urgency=medium * v0.0.35 * Improve origin_visit initialization step * Properly sandbox the prepare statement so that if it breaks, we can * update appropriately the visit with the correct status -- Antoine R. Dumont (@ardumont) Wed, 07 Mar 2018 12:00:13 +0100 swh-loader-tar (0.0.34-1~swh1) unstable-swh; urgency=medium * v0.0.34 * loader: replace occurrences with snapshot -- Antoine R. Dumont (@ardumont) Mon, 12 Feb 2018 11:17:17 +0100 swh-loader-tar (0.0.33-1~swh1) unstable-swh; urgency=medium * Release swh.loader.tar v0.0.33 * Use snapshots instead of occurrences -- Nicolas Dandrimont Tue, 06 Feb 2018 14:34:47 +0100 swh-loader-tar (0.0.32-1~swh1) unstable-swh; urgency=medium * v0.0.32 * Migrate to latest swh-core -- Antoine R. Dumont (@ardumont) Wed, 06 Dec 2017 12:04:02 +0100 swh-loader-tar (0.0.31-1~swh1) unstable-swh; urgency=medium * v0.0.31 * Bump to latest swh.loader.dir * Fix test -- Antoine R. Dumont (@ardumont) Mon, 04 Dec 2017 18:55:50 +0100 swh-loader-tar (0.0.30-1~swh1) unstable-swh; urgency=medium * v0.0.30 * Do not override metadata if already provided -- Antoine R. Dumont (@ardumont) Mon, 04 Dec 2017 18:13:05 +0100 swh-loader-tar (0.0.29-1~swh1) unstable-swh; urgency=medium * v0.0.29 * Permit inheritance and configuration from loader-tar -- Antoine R. Dumont (@ardumont) Wed, 29 Nov 2017 14:39:35 +0100 swh-loader-tar (0.0.28-1~swh1) unstable-swh; urgency=medium * Release swh.loader.tar version 0.0.28 * Update packaging runes -- Nicolas Dandrimont Thu, 12 Oct 2017 18:07:58 +0200 swh-loader-tar (0.0.27-1~swh1) unstable-swh; urgency=medium * Release swh.loader.tar v0.0.27 * Update to new swh.loader.core -- Nicolas Dandrimont Fri, 06 Oct 2017 15:11:40 +0200 swh-loader-tar (0.0.26-1~swh1) unstable-swh; urgency=medium * v0.0.26 * Bump dependency on swh.loader.dir * Add a one no-persistence test -- Antoine R. Dumont (@ardumont) Fri, 29 Sep 2017 14:26:10 +0200 swh-loader-tar (0.0.25-1~swh1) unstable-swh; urgency=medium * Release swh.loader.tar 0.0.25 * Update tasks to new swh.scheduler API -- Nicolas Dandrimont Mon, 12 Jun 2017 18:05:35 +0200 swh-loader-tar (0.0.24-1~swh1) unstable-swh; urgency=medium * Release swh.loader.tar v0.0.24 * migrate to swh.model.hashutil -- Nicolas Dandrimont Thu, 04 May 2017 16:29:50 +0200 swh-loader-tar (0.0.23-1~swh1) unstable-swh; urgency=medium * v0.0.23 * Simplify loader's logic -- Antoine R. Dumont (@ardumont) Wed, 22 Feb 2017 13:50:05 +0100 swh-loader-tar (0.0.22-1~swh1) unstable-swh; urgency=medium * v0.0.22 * Update storage configuration reading -- Antoine R. Dumont (@ardumont) Thu, 15 Dec 2016 18:53:35 +0100 swh-loader-tar (0.0.21-1~swh1) unstable-swh; urgency=medium * v0.0.21 * Update tar loader to register origin_visit's state -- Antoine R. Dumont (@ardumont) Wed, 24 Aug 2016 14:52:57 +0200 swh-loader-tar (0.0.20-1~swh1) unstable-swh; urgency=medium * v0.0.20 * Stabilize dependency on swh-loader-dir -- Antoine R. Dumont (@ardumont) Sat, 11 Jun 2016 02:35:02 +0200 swh-loader-tar (0.0.19-1~swh1) unstable-swh; urgency=medium * v0.0.19 * Simplify task initialization -- Antoine R. 
Dumont (@ardumont) Fri, 10 Jun 2016 15:16:26 +0200 swh-loader-tar (0.0.18-1~swh1) unstable-swh; urgency=medium * v0.0.18 * Migrate to latest swh-loader-dir * Remove try except which break silently * Add missing tar.ini configuration file option -- Antoine R. Dumont (@ardumont) Wed, 08 Jun 2016 18:17:20 +0200 swh-loader-tar (0.0.17-1~swh1) unstable-swh; urgency=medium * v0.0.17 * Simplify bootstrap tar loading * Bump dependencies version -- Antoine R. Dumont (@ardumont) Wed, 25 May 2016 12:34:52 +0200 swh-loader-tar (0.0.16-1~swh1) unstable-swh; urgency=medium * v0.0.16 * Fix length to add in metadata at load time * Fix origin_visit time at load time * Add a script to help updating missing length field in db -- Antoine R. Dumont (@ardumont) Tue, 08 Mar 2016 19:00:24 +0100 swh-loader-tar (0.0.15-1~swh1) unstable-swh; urgency=medium * Release swh.loader.tar v0.0.15 * Move swh.core.scheduling to swh.scheduler -- Nicolas Dandrimont Fri, 19 Feb 2016 19:05:21 +0100 swh-loader-tar (0.0.14-1~swh1) unstable-swh; urgency=medium * Prepare swh.loader.tar release 0.0.14 * Update for new swh.storage -- Nicolas Dandrimont Mon, 07 Dec 2015 18:53:00 +0100 swh-loader-tar (0.0.13-1~swh1) unstable-swh; urgency=medium * v0.0.13 * Add a binary to build a tarball from storage -- Antoine R. Dumont (@ardumont) Thu, 19 Nov 2015 11:27:31 +0100 swh-loader-tar (0.0.12-1~swh1) unstable-swh; urgency=medium * v0.0.12 * archive_type is the original artifact's property * unify property names with _ (to respect existing conventions) -- Antoine R. Dumont (@ardumont) Thu, 05 Nov 2015 10:12:39 +0100 swh-loader-tar (0.0.11-1~swh1) unstable-swh; urgency=medium * v0.0.11 * Keep tarball nature (zip, tar) as revision metadata -- Antoine R. Dumont (@ardumont) Wed, 04 Nov 2015 17:18:41 +0100 swh-loader-tar (0.0.10-1~swh1) unstable-swh; urgency=medium * v0.0.10 * Synthetic revision use author and committer date with tarball's mtime -- Antoine R. Dumont (@ardumont) Wed, 04 Nov 2015 11:57:03 +0100 swh-loader-tar (0.0.9-1~swh1) unstable-swh; urgency=medium * v0.0.9 * Uncompress safe archive only * Bump dependency to swh.loader.dir to v0.0.12 * Fix miscounting the number of sent messages -- Antoine R. Dumont (@ardumont) Mon, 02 Nov 2015 15:42:33 +0100 swh-loader-tar (0.0.8-1~swh1) unstable-swh; urgency=medium * v0.0.8 * Stream random blocks of tarballs message -- Antoine R. Dumont (@ardumont) Fri, 30 Oct 2015 12:12:53 +0100 swh-loader-tar (0.0.7-1~swh1) unstable-swh; urgency=medium * v0.0.7 * Send tarball's checksums as metadata in revision table * Use fetch_history table * Update README * Load authorities from configuration file * Improve tar loader architecture * Improve celery task (by simplifying it to a shim layer around the tar loader) * Improve error messages policy * Fix time used (use ctime instead of atime) for authority swh * debian packaging: * clarify dependency list -- Antoine R. Dumont (@ardumont) Thu, 29 Oct 2015 11:09:36 +0100 swh-loader-tar (0.0.6-1~swh1) unstable-swh; urgency=medium * v0.0.6 * Add swh-diff-db-mirror script to compute diff between db and mirror * root directory * Enhance documentation * Fix producer's missing import * debian packaging: * align requirements.txt with debian/control files regarding version deps -- Antoine R. Dumont (@ardumont) Mon, 26 Oct 2015 13:45:21 +0100 swh-loader-tar (0.0.5-1~swh1) unstable-swh; urgency=medium * v0.0.5 * Properly instantiate celery app * Prepare extraction_dir if it does not exist * Fix typos -- Antoine R. 
Dumont (@ardumont) Fri, 23 Oct 2015 16:43:44 +0200 swh-loader-tar (0.0.4-1~swh1) unstable-swh; urgency=medium * v0.0.4 * debian package: * Upgrade minimum dependency version on swh.loader.dir -- Antoine R. Dumont (@ardumont) Fri, 23 Oct 2015 15:02:47 +0200 swh-loader-tar (0.0.3-1~swh1) unstable-swh; urgency=medium * v0.0.3 * Debian packaging: * Fix depends version requirement on swh-loader-dir and swh-core -- Antoine R. Dumont (@ardumont) Fri, 23 Oct 2015 14:17:43 +0200 swh-loader-tar (0.0.2-1~swh1) unstable-swh; urgency=medium * v0.0.2 * Producer: * Improve release number extraction * Improve producer's cli option * Add option to load tarball subsets from a file * Add tarball format (dmg, apk, xpi, jar, etc...) * Add coverage * Loader: * Fix permission error when cleaning up * Improve uncompress policy -- Antoine R. Dumont (@ardumont) Fri, 23 Oct 2015 11:46:20 +0200 swh-loader-tar (0.0.1-1~swh1) unstable-swh; urgency=medium * Initial release * v0.0.1 * Load tarballs in swh -- Antoine R. Dumont (@ardumont) Wed, 21 Oct 2015 12:35:53 +0200 diff --git a/swh.loader.tar.egg-info/PKG-INFO b/swh.loader.tar.egg-info/PKG-INFO index 2475c0a..bab54c2 100644 --- a/swh.loader.tar.egg-info/PKG-INFO +++ b/swh.loader.tar.egg-info/PKG-INFO @@ -1,79 +1,82 @@ Metadata-Version: 2.1 Name: swh.loader.tar -Version: 0.0.39 +Version: 0.0.40 Summary: Software Heritage Tarball Loader Home-page: https://forge.softwareheritage.org/diffusion/DLDTAR Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Funding, https://www.softwareheritage.org/donate -Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Source, https://forge.softwareheritage.org/source/swh-loader-tar +Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Description: # SWH Tarball Loader - The Software Heritage Tarball Loader is in charge of ingesting the - directory representation of the tarball into the Software Heritage - archive. + The Software Heritage Tarball Loader is in charge of ingesting the directory + representation of the tarball into the Software Heritage archive. - ## Configuration + ## Sample configuration - This is the loader's (or task's) configuration file. + The loader's configuration will be taken from the default configuration file: + `~/.config/swh/loader/tar.yml` (you can choose a different path by setting the + `SWH_CONFIG_FILENAME` environment variable). 
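For a quick local check, that environment variable can point the loader at a throwaway configuration before triggering a load. A minimal sketch, assuming `swh.loader.tar` and its celery/scheduler dependencies are installed and the storage endpoint in the file is reachable; the config path is just an example:

```Python
import os

# hypothetical path to a YAML file with the keys shown in the sample below
os.environ['SWH_CONFIG_FILENAME'] = '/tmp/swh-loader-tar-test.yml'

from swh.loader.tar.tasks import load_tar

# runs the load synchronously, exactly as in the Local/Remote samples
load_tar(origin={'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
                 'type': 'tar'},
         visit_date='Tue, 3 May 2017 17:16:32 +0200',
         last_modified='2016-04-22 16:35')
```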
- *`{/etc/softwareheritage | ~/.config/swh | ~/.swh}`/loader/tar.yml*: + This file holds information for the loader to work, including celery + configuration: ```YAML working_dir: /home/storage/tmp/ storage: cls: remote args: url: http://localhost:5002/ + celery: + task_modules: + - swh.loader.tar.tasks + task_queues: + - swh.loader.tar.tasks.LoadTarRepository ``` - ## API - - ### local + ### Local Load local tarball directly from code or python3's toplevel: ``` Python # Fill in those repo = '8sync.tar.gz' tarpath = '/home/storage/tar/%s' % repo origin = {'url': 'file://%s' % repo, 'type': 'tar'} visit_date = 'Tue, 3 May 2017 17:16:32 +0200' last_modified = 'Tue, 10 May 2016 16:16:32 +0200' import logging logging.basicConfig(level=logging.DEBUG) - from swh.loader.tar.tasks import LoadTarRepository - l = LoadTarRepository() - l.run_task(origin=origin, visit_date=visit_date, - last_modified=last_modified) + from swh.loader.tar.tasks import load_tar + load_tar(origin=origin, visit_date=visit_date, + last_modified=last_modified) ``` - ### remote + ### Remote - Load remote tarball is the same sample + Load remote tarball is the same sample: ```Python url = 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz' origin = {'url': url, 'type': 'tar'} visit_date = 'Tue, 3 May 2017 17:16:32 +0200' last_modified = '2016-04-22 16:35' import logging logging.basicConfig(level=logging.DEBUG) - from swh.loader.tar.tasks import LoadTarRepository - l = LoadTarRepository() - l.run_task(origin=origin, visit_date=visit_date, - last_modified=last_modified) + from swh.loader.tar.tasks import load_tar + load_tar(origin=origin, visit_date=visit_date, + last_modified=last_modified) ``` Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh/loader/tar/_version.py b/swh/loader/tar/_version.py index 6ba5964..8ce9c0f 100644 --- a/swh/loader/tar/_version.py +++ b/swh/loader/tar/_version.py @@ -1,5 +1,5 @@ # This file is automatically generated by setup.py. -__version__ = '0.0.39' -__sha__ = 'g4fa8401' -__revision__ = 'g4fa8401' +__version__ = '0.0.40' +__sha__ = 'gf338e4c' +__revision__ = 'gf338e4c' diff --git a/swh/loader/tar/build.py b/swh/loader/tar/build.py index 92d4090..7d3f4b4 100755 --- a/swh/loader/tar/build.py +++ b/swh/loader/tar/build.py @@ -1,76 +1,108 @@ # Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import copy +import logging +import os + import arrow +logger = logging.getLogger(__name__) + + # Static setup EPOCH = 0 UTC_OFFSET = 0 SWH_PERSON = { 'name': 'Software Heritage', 'fullname': 'Software Heritage', 'email': 'robot@softwareheritage.org' } REVISION_MESSAGE = 'swh-loader-tar: synthetic revision message' REVISION_TYPE = 'tar' def _time_from_last_modified(last_modified): """Compute the modification time from the tarpath. 
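    For example, an ISO-8601 ``last_modified`` value is parsed with ``arrow``
    and its float timestamp is split on the decimal point (illustrative
    values):

        _time_from_last_modified('2016-04-22T16:35:00+00:00')
        # -> {'seconds': 1461342900, 'microseconds': 0}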
Args: last_modified (str): Last modification time Returns: dict representing a timestamp with keys {seconds, microseconds} """ last_modified = arrow.get(last_modified) mtime = last_modified.float_timestamp normalized_time = list(map(int, str(mtime).split('.'))) return { 'seconds': normalized_time[0], 'microseconds': normalized_time[1] } def compute_revision(tarpath, last_modified): """Compute a revision. Args: tarpath (str): absolute path to the tarball last_modified (str): Time of last modification read from the source remote (most probably by the lister) Returns: Revision as dict: - date (dict): the modification timestamp as returned by _time_from_path function - committer_date: the modification timestamp as returned by _time_from_path function - author: cf. SWH_PERSON - committer: cf. SWH_PERSON - type: cf. REVISION_TYPE - message: cf. REVISION_MESSAGE """ ts = _time_from_last_modified(last_modified) return { 'date': { 'timestamp': ts, 'offset': UTC_OFFSET, }, 'committer_date': { 'timestamp': ts, 'offset': UTC_OFFSET, }, 'author': SWH_PERSON, 'committer': SWH_PERSON, 'type': REVISION_TYPE, 'message': REVISION_MESSAGE, 'synthetic': True, } + + +def set_original_artifact(*, revision, filepath, nature, hashes): + """Set the original artifact data on the given revision for + the tarball currently being loaded.""" + + revision = copy.deepcopy(revision) + if 'metadata' not in revision or not revision['metadata']: + revision['metadata'] = {} + if 'original_artifact' in revision['metadata']: + oa = revision['metadata']['original_artifact'] + if oa: + logger.warning( + 'Revision already contains original_artifact metadata, ' + 'replacing: %r', + oa, + ) + + revision['metadata']['original_artifact'] = [{ + 'name': os.path.basename(filepath), + 'archive_type': nature, + **hashes, + }] + + return revision diff --git a/swh/loader/tar/loader.py b/swh/loader/tar/loader.py index 8ab67a0..96dfc99 100644 --- a/swh/loader/tar/loader.py +++ b/swh/loader/tar/loader.py @@ -1,346 +1,338 @@ # Copyright (C) 2015-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import tempfile import requests import shutil from urllib.parse import urlparse from tempfile import mkdtemp from swh.core import tarball from swh.loader.core.loader import BufferedLoader from swh.loader.dir.loader import revision_from, snapshot_from from swh.model.hashutil import MultiHash, HASH_BLOCK_SIZE from swh.model.from_disk import Directory -from .build import compute_revision +from .build import compute_revision, set_original_artifact try: from _version import __version__ except ImportError: __version__ = 'devel' TEMPORARY_DIR_PREFIX_PATTERN = 'swh.loader.tar.' DEBUG_MODE = '** DEBUG MODE **' class LocalResponse: """Local Response class with iter_content api """ def __init__(self, path): self.path = path def iter_content(self, chunk_size=None): with open(self.path, 'rb') as f: for chunk in f: yield chunk class ArchiveFetcher: """Http/Local client in charge of downloading archives from a remote/local server. 
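    Example (illustrative; the loader normally drives this class itself, and
    the directory and URL below are made up):

        fetcher = ArchiveFetcher(temp_directory='/tmp/swh-loader-tar')
        filepath, hashes = fetcher.download('file:///home/storage/tar/8sync.tar.gz')
        # hashes holds 'length' plus the hex digests computed by MultiHash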
Args: temp_directory (str): Path to the temporary disk location used for downloading the release artifacts """ def __init__(self, temp_directory=None): self.temp_directory = temp_directory self.session = requests.session() self.params = { 'headers': { 'User-Agent': 'Software Heritage Tar Loader (%s)' % ( __version__ ) } } def download(self, url): """Download the remote tarball url locally. Args: url (str): Url (file or http*) Raises: ValueError in case of failing to query Returns: Tuple of local (filepath, hashes of filepath) """ url_parsed = urlparse(url) if url_parsed.scheme == 'file': path = url_parsed.path response = LocalResponse(path) length = os.path.getsize(path) else: response = self.session.get(url, **self.params, stream=True) if response.status_code != 200: raise ValueError("Fail to query '%s'. Reason: %s" % ( url, response.status_code)) length = int(response.headers['content-length']) filepath = os.path.join(self.temp_directory, os.path.basename(url)) h = MultiHash(length=length) with open(filepath, 'wb') as f: for chunk in response.iter_content(chunk_size=HASH_BLOCK_SIZE): h.update(chunk) f.write(chunk) actual_length = os.path.getsize(filepath) if length != actual_length: raise ValueError('Error when checking size: %s != %s' % ( length, actual_length)) hashes = { 'length': length, **h.hexdigest() } return filepath, hashes class BaseTarLoader(BufferedLoader): """Base Tarball Loader class. This factorizes multiple loader implementations: - :class:`RemoteTarLoader`: New implementation able to deal with remote archives. - :class:`TarLoader`: Old implementation which dealt with only local archive. It also was only passing along objects to persist (revision, etc...) """ CONFIG_BASE_FILENAME = 'loader/tar' ADDITIONAL_CONFIG = { 'working_dir': ('string', '/tmp'), 'debug': ('bool', False), # NOT FOR PRODUCTION } def __init__(self, logging_class='swh.loader.tar.TarLoader', config=None): super().__init__(logging_class=logging_class, config=config) self.local_cache = None self.dir_path = None working_dir = self.config.get('working_dir', tempfile.gettempdir()) os.makedirs(working_dir, exist_ok=True) self.temp_directory = mkdtemp( suffix='-%s' % os.getpid(), prefix=TEMPORARY_DIR_PREFIX_PATTERN, dir=working_dir) self.client = ArchiveFetcher(temp_directory=self.temp_directory) os.makedirs(working_dir, 0o755, exist_ok=True) self.dir_path = tempfile.mkdtemp(prefix='swh.loader.tar-', dir=self.temp_directory) self.debug = self.config.get('debug', False) def cleanup(self): """Clean up temporary disk folders used. """ if self.debug: self.log.warn('%s Will not clean up temp dir %s' % ( DEBUG_MODE, self.temp_directory )) return if os.path.exists(self.temp_directory): self.log.debug('Clean up %s' % self.temp_directory) shutil.rmtree(self.temp_directory) def prepare_origin_visit(self, *, origin, visit_date=None, **kwargs): """Prepare the origin visit information. Args: origin (dict): Dict with keys {url, type} visit_date (str): Date representing the date of the visit. None by default will make it the current time during the loading process. """ self.origin = origin if 'type' not in self.origin: # let the type flow if present self.origin['type'] = 'tar' self.visit_date = visit_date def get_tarball_url_to_retrieve(self): """Compute the tarball url to allow retrieval """ raise NotImplementedError() def fetch_data(self): """Retrieve, uncompress archive and fetch objects from the tarball. The actual ingestion takes place in the :meth:`store_data` implementation below. 
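    The collected objects end up in ``self.objects``, keyed by object type and
    then by identifier, roughly (illustrative shape):

        {
            'content':   {content_id: content, ...},
            'directory': {directory_id: directory, ...},
            'revision':  {revision_id: revision},
            'snapshot':  {snapshot_id: snapshot},
        }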
""" url = self.get_tarball_url_to_retrieve() filepath, hashes = self.client.download(url) nature = tarball.uncompress(filepath, self.dir_path) dir_path = self.dir_path.encode('utf-8') directory = Directory.from_disk(path=dir_path, save_path=True) objects = directory.collect() if 'content' not in objects: objects['content'] = {} if 'directory' not in objects: objects['directory'] = {} # compute the full revision (with ids) revision = self.build_revision(filepath, nature, hashes) revision = revision_from(directory.hash, revision) objects['revision'] = { revision['id']: revision, } snapshot = self.build_snapshot(revision) objects['snapshot'] = { snapshot['id']: snapshot } self.objects = objects def store_data(self): """Store the objects in the swh archive. """ objects = self.objects self.maybe_load_contents(objects['content'].values()) self.maybe_load_directories(objects['directory'].values()) self.maybe_load_revisions(objects['revision'].values()) snapshot = list(objects['snapshot'].values())[0] self.maybe_load_snapshot(snapshot) class RemoteTarLoader(BaseTarLoader): """This is able to load from remote/local archive into the swh archive. This will: - create an origin (if it does not exist) and a visit - fetch the tarball in a temporary location - uncompress it locally in a temporary location - process the content of the tarball to persist on swh storage - clean up the temporary location """ def prepare(self, *, last_modified, **kwargs): """last_modified is the time of last modification of the tarball. E.g https://ftp.gnu.org/gnu/8sync/: [ ] 8sync-0.1.0.tar.gz 2016-04-22 16:35 217K [ ] 8sync-0.1.0.tar.gz.sig 2016-04-22 16:35 543 [ ] ... Args: origin (dict): Dict with keys {url, type} last_modified (str): The date of last modification of the archive to ingest. visit_date (str): Date representing the date of the visit. None by default will make it the current time during the loading process. """ self.last_modified = last_modified def get_tarball_url_to_retrieve(self): return self.origin['url'] def build_revision(self, filepath, nature, hashes): """Build the revision with identifier We use the `last_modified` date provided by the caller to build the revision. """ - return { - **compute_revision(filepath, self.last_modified), - 'metadata': { - 'original_artifact': [{ - 'name': os.path.basename(filepath), - 'archive_type': nature, - **hashes, - }], - } - } + return set_original_artifact( + revision=compute_revision(filepath, self.last_modified), + filepath=filepath, + nature=nature, + hashes=hashes, + ) def build_snapshot(self, revision): """Build the snapshot targeting the revision. """ branch_name = os.path.basename(self.dir_path) return snapshot_from(revision['id'], branch_name) class LegacyLocalTarLoader(BaseTarLoader): """This loads local tarball into the swh archive. It's using the revision and branch provided by the caller as scaffolding to create the full revision and snapshot (with identifiers). This is what's: - been used to ingest our 2015 rsync copy of gnu.org - still used by the loader deposit This will: - create an origin (if it does not exist) and a visit - uncompress a tarball in a local and temporary location - process the content of the tarball to persist on swh storage - associate it to a passed revision and snapshot - clean up the temporary location """ def prepare(self, *, tar_path, revision, branch_name, **kwargs): """Prepare the data prior to ingest it in SWH archive. 
Args: tar_path (str): Path to the archive to ingest revision (dict): The synthetic revision to associate the archive to (no identifiers within) branch_name (str): The branch name to use for the snapshot. """ self.tar_path = tar_path self.revision = revision self.branch_name = branch_name def get_tarball_url_to_retrieve(self): return 'file://%s' % self.tar_path def build_revision(self, filepath, nature, hashes): """Build the revision with identifier We use the revision provided by the caller as a scaffolding revision. """ - return { - **self.revision, - 'metadata': { - 'original_artifact': [{ - 'name': os.path.basename(filepath), - 'archive_type': nature, - **hashes, - }], - } - } + return set_original_artifact( + revision=self.revision, + filepath=filepath, + nature=nature, + hashes=hashes, + ) def build_snapshot(self, revision): """Build the snapshot targeting the revision. We use the branch_name provided by the caller as a scaffolding as well. """ return snapshot_from(revision['id'], self.branch_name) diff --git a/swh/loader/tar/tests/test_loader.py b/swh/loader/tar/tests/test_loader.py index 30a2e70..64e89a6 100644 --- a/swh/loader/tar/tests/test_loader.py +++ b/swh/loader/tar/tests/test_loader.py @@ -1,239 +1,250 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import pytest import requests_mock from swh.model import hashutil from swh.loader.core.tests import BaseLoaderTest from swh.loader.tar.build import SWH_PERSON from swh.loader.tar.loader import RemoteTarLoader, LegacyLocalTarLoader TEST_CONFIG = { 'working_dir': '/tmp/tests/loader-tar/', # where to extract the tarball 'debug': False, 'storage': { # we instantiate it but we don't use it in test context 'cls': 'memory', 'args': { } }, 'send_contents': True, 'send_directories': True, 'send_revisions': True, 'send_releases': True, 'send_snapshot': True, 'content_packet_size': 100, 'content_packet_block_size_bytes': 104857600, 'content_packet_size_bytes': 1073741824, 'directory_packet_size': 250, 'revision_packet_size': 100, 'release_packet_size': 100, 'content_size_limit': 1000000000 } class RemoteTarLoaderForTest(RemoteTarLoader): def parse_config_file(self, *args, **kwargs): return TEST_CONFIG @pytest.mark.fs class PrepareDataForTestLoader(BaseLoaderTest): """Prepare the archive to load (test fixture). 
""" def setUp(self): super().setUp('sample-folder.tgz', start_path=os.path.dirname(__file__), uncompress_archive=False) self.tarpath = self.destination_path def assert_data_ok(self): # then self.assertCountContents(8, "3 files + 5 links") self.assertCountDirectories(6, "4 subdirs + 1 empty + 1 main dir") self.assertCountRevisions(1, "synthetic revision") rev_id = hashutil.hash_to_bytes( '67a7d7dda748f9a86b56a13d9218d16f5cc9ab3d') actual_revision = next(self.storage.revision_get([rev_id])) self.assertTrue(actual_revision['synthetic']) self.assertEqual(actual_revision['parents'], []) self.assertEqual(actual_revision['type'], 'tar') self.assertEqual(actual_revision['message'], b'swh-loader-tar: synthetic revision message') self.assertEqual(actual_revision['directory'], b'\xa7A\xfcM\x96\x8c{\x8e<\x94\xff\x86\xe7\x04\x80\xc5\xc7\xe5r\xa9') # noqa self.assertEqual( actual_revision['metadata']['original_artifact'][0], { 'sha1_git': 'cc848944a0d3e71d287027347e25467e61b07428', 'archive_type': 'tar', 'blake2s256': '5d70923443ad36377cd58e993aff0e3c1b9ef14f796c69569105d3a99c64f075', # noqa 'name': 'sample-folder.tgz', 'sha1': '3ca0d0a5c6833113bd532dc5c99d9648d618f65a', 'length': 555, 'sha256': '307ebda0071ca5975f618e192c8417161e19b6c8bf581a26061b76dc8e85321d' # noqa }) self.assertCountReleases(0) self.assertCountSnapshots(1) + return actual_revision + class TestRemoteTarLoader(PrepareDataForTestLoader): """Test the remote loader scenario (local/remote) """ def setUp(self): super().setUp() self.loader = RemoteTarLoaderForTest() self.storage = self.loader.storage def test_load_local(self): """Load a local tarball should result in persisted swh data """ # given origin = { 'url': self.repo_url, 'type': 'tar' } visit_date = 'Tue, 3 May 2016 17:16:32 +0200' last_modified = '2018-12-05T12:35:23+00:00' # when self.loader.load( origin=origin, visit_date=visit_date, last_modified=last_modified) # then self.assert_data_ok() @requests_mock.Mocker() def test_load_remote(self, mock_requests): """Load a remote tarball should result in persisted swh data """ # setup the mock to stream the content of the tarball local_url = self.repo_url.replace('file:///', '/') url = 'https://nowhere.org/%s' % local_url with open(local_url, 'rb') as f: data = f.read() mock_requests.get(url, content=data, headers={ 'content-length': str(len(data)) }) # given origin = { 'url': url, 'type': 'tar' } visit_date = 'Tue, 3 May 2016 17:16:32 +0200' last_modified = '2018-12-05T12:35:23+00:00' # when self.loader.load( origin=origin, visit_date=visit_date, last_modified=last_modified) self.assert_data_ok() @requests_mock.Mocker() def test_load_remote_download_failure(self, mock_requests): """Load a remote tarball with download failure should result in no data """ # setup the mock to stream the content of the tarball local_url = self.repo_url.replace('file:///', '/') url = 'https://nowhere.org/%s' % local_url with open(local_url, 'rb') as f: data = f.read() wrong_length = len(data) - 10 mock_requests.get(url, content=data, headers={ 'content-length': str(wrong_length) }) # given origin = { 'url': url, 'type': 'tar' } visit_date = 'Tue, 3 May 2016 17:16:32 +0200' last_modified = '2018-12-05T12:35:23+00:00' # when r = self.loader.load( origin=origin, visit_date=visit_date, last_modified=last_modified) self.assertEqual(r, {'status': 'failed'}) self.assertCountContents(0) self.assertCountDirectories(0) self.assertCountRevisions(0) self.assertCountSnapshots(0) class TarLoaderForTest(LegacyLocalTarLoader): def parse_config_file(self, *args, **kwargs): 
return TEST_CONFIG class TestTarLoader(PrepareDataForTestLoader): """Test the legacy tar loader """ def setUp(self): super().setUp() self.loader = TarLoaderForTest() self.storage = self.loader.storage def test_load(self): """Load a local tarball should result in persisted swh data - """ # given origin = { 'url': self.repo_url, 'type': 'tar' } visit_date = 'Tue, 3 May 2016 17:16:32 +0200' import datetime commit_time = int(datetime.datetime( 2018, 12, 5, 13, 35, 23, 0, tzinfo=datetime.timezone(datetime.timedelta(hours=1)) ).timestamp()) revision_message = 'swh-loader-tar: synthetic revision message' revision_type = 'tar' revision = { 'date': { 'timestamp': commit_time, 'offset': 0, }, 'committer_date': { 'timestamp': commit_time, 'offset': 0, }, 'author': SWH_PERSON, 'committer': SWH_PERSON, 'type': revision_type, 'message': revision_message, 'synthetic': True, + 'metadata': { + 'foo': 'bar', + 'original_artifact': ['bogus_original_artifact'], + } } branch_name = os.path.basename(self.tarpath) # when self.loader.load(tar_path=self.tarpath, origin=origin, visit_date=visit_date, revision=revision, branch_name=branch_name) # then - self.assert_data_ok() + actual_revision = self.assert_data_ok() + + # Check metadata passthrough + assert actual_revision['metadata']['foo'] == 'bar' + + # FIXME: use the caplog pytest fixture to check that the clobbering of + # original artifact sent a warning diff --git a/version.txt b/version.txt index 725ef10..dcfbe1e 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.39-0-g4fa8401 \ No newline at end of file +v0.0.40-0-gf338e4c \ No newline at end of file
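Regarding the FIXME left in the test above, a possible caplog-based check could look like the sketch below. It assumes pytest with its standard `caplog` fixture and exercises `set_original_artifact` directly rather than the whole loader:

```Python
import logging

from swh.loader.tar.build import set_original_artifact


def test_original_artifact_clobbering_warns(caplog):
    revision = {'metadata': {'foo': 'bar',
                             'original_artifact': ['bogus_original_artifact']}}
    with caplog.at_level(logging.WARNING, logger='swh.loader.tar.build'):
        updated = set_original_artifact(
            revision=revision,
            filepath='/tmp/sample-folder.tgz',  # made-up path; only its basename is used
            nature='tar',
            hashes={'length': 555})
    # unrelated metadata is passed through untouched
    assert updated['metadata']['foo'] == 'bar'
    # original_artifact is replaced with the newly computed entry
    assert updated['metadata']['original_artifact'][0]['name'] == 'sample-folder.tgz'
    # and the clobbering is reported as a warning
    assert 'original_artifact' in caplog.text
```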