diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py
index 9610b7d..d4b0402 100644
--- a/swh/loader/package/loader.py
+++ b/swh/loader/package/loader.py
@@ -1,268 +1,277 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import datetime
+import logging
 import tempfile
 import os

 from typing import Generator, Dict, Tuple, Sequence

 from swh.core.tarball import uncompress
 from swh.core.config import SWHConfig
 from swh.model.from_disk import Directory
 from swh.model.identifiers import (
     revision_identifier, snapshot_identifier, identifier_to_bytes
 )
 from swh.storage import get_storage
 from swh.loader.core.converters import content_for_storage


+logger = logging.getLogger(__name__)
+
+
 # Not implemented yet:
 # - clean up disk routines from previous killed workers (when OOMkilled)
 #   -> separation of concerns would like this to be abstracted from the code
 #   -> experience tells us it's complicated to do as such (T903, T964, T982,
 #      etc...)
 #
 # - splitting into groups too many objects sent to storage
 #   -> could be a specialized collaborator or storage implementation or
 #      proxy which deals with this
 #
 # - model: swh.model.merkle.from_disk should output swh.model.model.* objects
 #   to avoid this layer's conversion routine calls
 #   -> Take this up within swh.model's current implementation
 #
 # - Does not trap exceptions yet within the PackageLoader.load method


 class PackageLoader:
     # Origin visit type (str) set by the loader
     visit_type = None

     def __init__(self, url):
         """Loader's constructor. This raises an exception if the minimal
            required configuration is missing (cf. the fn:`check` method).

         Args:
             url (str): Origin url to load data from

         """
         # This expects to use the environment variable SWH_CONFIG_FILENAME
         self.config = SWHConfig.parse_config_file()
         self._check_configuration()
         self.storage = get_storage(**self.config['storage'])
         self.url = url

     def _check_configuration(self):
         """Checks that the minimal configuration required for the loader is
            set. If some required configuration is missing, an exception
            detailing the issue is raised.

         """
         if 'storage' not in self.config:
             raise ValueError(
                 'Misconfiguration, at least the storage key should be set')

     def get_versions(self) -> Sequence[str]:
         """Return the list of all published package versions.

         Returns:
             Sequence of published versions

         """
         return []

     def get_artifacts(self, version: str) -> Generator[
             Tuple[str, str, Dict], None, None]:
         """Given a release version of a package, retrieve the associated
            artifact information for that version.

         Args:
             version: Package version

         Yields:
             (artifact filename, artifact uri, raw artifact metadata) tuples

         """
         return []

     def fetch_artifact_archive(
             self, artifact_archive_path: str, dest: str) -> Tuple[str, Dict]:
         """Fetch an artifact archive to a temporary folder and return its
            local path along with the hashes computed on the downloaded
            archive.

         Args:
             artifact_archive_path: Path to the artifact archive to uncompress
             dest: Directory to write the downloaded archive to

         Returns:
             the locally retrieved artifact path and its computed hashes

         """
         return '', {}

     def build_revision(
             self, a_metadata: Dict, a_uncompressed_path: str) -> Dict:
         """Build the revision dict

         Returns:
             SWH data dict

         """
         return {}

     def get_default_release(self) -> str:
         """Retrieve the latest release version

         Returns:
             Latest version

         """
         return ''

     def load(self) -> Dict:
         """Load for a specific origin the associated contents.

         for each package version of the origin

         1. Fetch the files for one package version. By default, this can be
            implemented as a simple HTTP request. Loaders with more specific
            requirements can override this, e.g.: the PyPI loader checks the
            integrity of the downloaded files; the Debian loader has to
            download and check several files for one package version.

         2. Extract the downloaded files. By default, this would be a
            universal archive/tarball extraction. Loaders for specific
            formats can override this method (for instance, the Debian
            loader uses dpkg-source -x).

         3. Convert the extracted directory to a set of Software Heritage
            objects, using swh.model.from_disk.

         4. Extract the metadata from the unpacked directories. This would
            only be applicable for "smart" loaders like npm (parsing the
            package.json), PyPI (parsing the PKG-INFO file) or Debian
            (parsing debian/changelog and debian/control). On
            "minimal-metadata" sources such as the GNU archive, the lister
            should provide the minimal set of metadata needed to populate
            the revision/release objects (authors, dates) as an argument to
            the task.

         5. Generate the revision/release objects for the given version,
            from the data generated at steps 3 and 4.

         end for each

         6. Generate and load the snapshot for the visit. Using the
            revisions/releases collected at step 5., and the branch
            information from step 0., generate a snapshot and load it into
            the Software Heritage archive.

         """
         status_load = 'uneventful'  # either: eventful, uneventful, failed
         status_visit = 'partial'  # either: partial, full
         tmp_revisions = {}

-        # Prepare origin and origin_visit
-        origin = {'url': self.url}
-        self.storage.origin_add([origin])
-        visit_date = datetime.datetime.now(tz=datetime.timezone.utc)
-        visit_id = self.storage.origin_visit_add(
-            origin=self.url,
-            date=visit_date,
-            type=self.visit_type)['visit']
-
-        # Retrieve the default release (the "latest" one)
-        default_release = self.get_default_release()
-
-        # FIXME: Add load exceptions handling
-        for version in self.get_versions():  # for each
-            tmp_revisions[version] = []
-            # `a_` stands for `artifact_`
-            for a_filename, a_uri, a_metadata in self.get_artifacts(version):
-                with tempfile.TemporaryDirectory() as tmpdir:
-                    a_path, a_computed_metadata = self.fetch_artifact_archive(
-                        a_uri, dest=tmpdir)
-
-                    uncompressed_path = os.path.join(tmpdir, 'src')
-                    uncompress(a_path, dest=uncompressed_path)
-
-                    directory = Directory.from_disk(
-                        path=uncompressed_path.encode('utf-8'), data=True)
-                    # FIXME: Try not to load the full raw content in memory
-                    objects = directory.collect()
-
-                    contents = objects['content'].values()
-                    self.storage.content_add(
-                        map(content_for_storage, contents))
-
-                    status_load = 'eventful'
-                    directories = objects['directory'].values()
-
-                    self.storage.directory_add(directories)
-
-                    # FIXME: This should be release. cf. D409 discussion
-                    revision = self.build_revision(
-                        a_metadata, uncompressed_path)
-                    revision.update({
-                        'type': 'tar',
-                        'synthetic': True,
-                        'directory': directory.hash,
-                    })
-                    revision['metadata'].update({
-                        'original_artifact': a_metadata,
-                        'hashes_artifact': a_computed_metadata
-                    })
-
-                    revision['id'] = identifier_to_bytes(
-                        revision_identifier(revision))
-                    self.storage.revision_add([revision])
-
-                    tmp_revisions[version].append({
-                        'filename': a_filename,
-                        'target': revision['id'],
-                    })
-
-        # Build and load the snapshot
-        branches = {}
-        for version, v_branches in tmp_revisions.items():
-            if len(v_branches) == 1:
-                branch_name = ('releases/%s' % version).encode('utf-8')
-                if version == default_release:
-                    branches[b'HEAD'] = {
-                        'target_type': 'alias',
-                        'target': branch_name,
-                    }
+        try:
+            # Prepare origin and origin_visit
+            origin = {'url': self.url}
+            self.storage.origin_add([origin])
+            visit_date = datetime.datetime.now(tz=datetime.timezone.utc)
+            visit_id = self.storage.origin_visit_add(
+                origin=self.url,
+                date=visit_date,
+                type=self.visit_type)['visit']
+
+            # Retrieve the default release (the "latest" one)
+            default_release = self.get_default_release()
+
+            # FIXME: Add load exceptions handling
+            for version in self.get_versions():  # for each
+                tmp_revisions[version] = []
+                # `a_` stands for `artifact_`
+                for a_filename, a_uri, a_metadata in self.get_artifacts(
+                        version):
+                    with tempfile.TemporaryDirectory() as tmpdir:
+                        # a_c_: archive_computed_
+                        a_path, a_c_metadata = self.fetch_artifact_archive(
+                            a_uri, dest=tmpdir)
+
+                        uncompressed_path = os.path.join(tmpdir, 'src')
+                        uncompress(a_path, dest=uncompressed_path)
+
+                        directory = Directory.from_disk(
+                            path=uncompressed_path.encode('utf-8'), data=True)
+                        # FIXME: Try not to load the full raw content in memory
+                        objects = directory.collect()
+
+                        contents = objects['content'].values()
+                        self.storage.content_add(
+                            map(content_for_storage, contents))
+
+                        status_load = 'eventful'
+                        directories = objects['directory'].values()
+
+                        self.storage.directory_add(directories)
+
+                        # FIXME: This should be release. cf. D409 discussion
+                        revision = self.build_revision(
+                            a_metadata, uncompressed_path)
+                        revision.update({
+                            'type': 'tar',
+                            'synthetic': True,
+                            'directory': directory.hash,
+                        })
+                        revision['metadata'].update({
+                            'original_artifact': a_metadata,
+                            'hashes_artifact': a_c_metadata
+                        })
+
+                        revision['id'] = identifier_to_bytes(
+                            revision_identifier(revision))
+                        self.storage.revision_add([revision])
+
+                        tmp_revisions[version].append({
+                            'filename': a_filename,
+                            'target': revision['id'],
+                        })
+
+            # Build and load the snapshot
+            branches = {}
+            for version, v_branches in tmp_revisions.items():
+                if len(v_branches) == 1:
+                    branch_name = ('releases/%s' % version).encode('utf-8')
+                    if version == default_release:
+                        branches[b'HEAD'] = {
+                            'target_type': 'alias',
+                            'target': branch_name,
+                        }
-                branches[branch_name] = {
-                    'target_type': 'revision',
-                    'target': v_branches[0]['target'],
-                }
-            else:
-                for x in v_branches:
-                    branch_name = ('releases/%s/%s' % (
-                        version, v_branches['filename'])).encode('utf-8')
                     branches[branch_name] = {
                         'target_type': 'revision',
-                        'target': x['target'],
+                        'target': v_branches[0]['target'],
                     }
-        snapshot = {
-            'branches': branches
-        }
-        snapshot['id'] = identifier_to_bytes(
-            snapshot_identifier(snapshot))
-        self.storage.snapshot_add([snapshot])
-
-        # come so far, we actually reached a full visit
-        status_visit = 'full'
-
-        # Update the visit's state
-        self.storage.origin_visit_update(
-            origin=self.url, visit_id=visit_id, status=status_visit,
-            snapshot=snapshot)
-
-        return {'status': status_load}
+                else:
+                    for x in v_branches:
+                        branch_name = ('releases/%s/%s' % (
+                            version, x['filename'])).encode('utf-8')
+                        branches[branch_name] = {
+                            'target_type': 'revision',
+                            'target': x['target'],
+                        }
+            snapshot = {
+                'branches': branches
+            }
+            snapshot['id'] = identifier_to_bytes(
+                snapshot_identifier(snapshot))
+            self.storage.snapshot_add([snapshot])
+
+            # Having come this far, we have reached a full visit
+            status_visit = 'full'
+
+            # Update the visit's state
+            self.storage.origin_visit_update(
+                origin=self.url, visit_id=visit_id, status=status_visit,
+                snapshot=snapshot)
+        except ValueError as e:
+            logger.warning('Failed to load %s. Reason: %s', self.url, e)
+        return {'status': status_load}
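
Note (illustrative, not part of the diff): the hooks defined in loader.py above are meant to be overridden by concrete loaders such as the GNU one exercised below. A minimal sketch of such a subclass, assuming a hypothetical forge that serves one hard-coded artifact per version; the class name `MyForgeLoader`, its visit type and its artifact table are invented for illustration:

    from typing import Dict, Generator, Sequence, Tuple

    from swh.loader.package.loader import PackageLoader


    class MyForgeLoader(PackageLoader):
        """Illustrative subclass serving one hard-coded artifact per version."""
        visit_type = 'myforge'  # hypothetical visit type

        def __init__(self, url, artifacts):
            super().__init__(url)
            # artifacts: version -> (filename, uri, raw metadata) tuple
            self.artifacts = artifacts

        def get_versions(self) -> Sequence[str]:
            return list(self.artifacts)

        def get_default_release(self) -> str:
            # naive notion of "latest": the highest version string
            return max(self.artifacts)

        def get_artifacts(self, version: str) -> Generator[
                Tuple[str, str, Dict], None, None]:
            yield self.artifacts[version]

        def fetch_artifact_archive(
                self, artifact_archive_path: str,
                dest: str) -> Tuple[str, Dict]:
            # a real loader would download artifact_archive_path into dest
            # and compute hashes over the downloaded bytes here
            raise NotImplementedError

        def build_revision(
                self, a_metadata: Dict, a_uncompressed_path: str) -> Dict:
            # minimal skeleton: load() fills in 'type', 'synthetic',
            # 'directory' and 'id', and updates 'metadata', so the dict must
            # at least carry a 'metadata' key; a real loader would also
            # supply author/committer dates from the artifact metadata
            return {
                'message': b'synthetic revision message',
                'author': {'name': b'', 'email': b'', 'fullname': b''},
                'committer': {'name': b'', 'email': b'', 'fullname': b''},
                'date': None,
                'committer_date': None,
                'parents': [],
                'metadata': {},
            }
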
diff --git a/swh/loader/package/tests/test_gnu.py b/swh/loader/package/tests/test_gnu.py
index 870d3db..ae7067a 100644
--- a/swh/loader/package/tests/test_gnu.py
+++ b/swh/loader/package/tests/test_gnu.py
@@ -1,207 +1,210 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import os
 import re

 from swh.model.hashutil import hash_to_bytes
 from swh.loader.package.gnu import GNULoader, get_version
 from swh.loader.package.tests.common import get_response_cb, check_snapshot


 def test_get_version():
     """From url to branch name should yield something relevant

     """
     for url, expected_branchname in [
             ('https://gnu.org/sthg/info-2.1.0.tar.gz', '2.1.0'),
             ('https://gnu.org/sthg/info-2.1.2.zip', '2.1.2'),
             ('https://sthg.org/gnu/sthg.tar.gz', 'sthg'),
             ('https://sthg.org/gnu/DLDF-1.1.4.tar.gz', '1.1.4'),
             ('https://sthg.org/gnu/anubis-latest.tar.bz2', 'latest'),
             ('https://ftp.org/gnu/aris-w32.zip', 'w32'),
             ('https://ftp.org/gnu/aris-w32-2.2.zip', 'w32-2.2'),
             ('https://ftp.org/gnu/autogen.info.tar.gz', 'autogen.info'),
             ('https://ftp.org/gnu/crypto-build-demo.tar.gz',
              'crypto-build-demo'),
             ('https://ftp.org/gnu/clue+clio+xit.clisp.tar.gz',
              'clue+clio+xit.clisp'),
             ('https://ftp.org/gnu/clue+clio.for-pcl.tar.gz',
              'clue+clio.for-pcl'),
             ('https://ftp.org/gnu/clisp-hppa2.0-hp-hpux10.20.tar.gz',
              'hppa2.0-hp-hpux10.20'),
             ('clisp-i386-solaris2.6.tar.gz', 'i386-solaris2.6'),
             ('clisp-mips-sgi-irix6.5.tar.gz', 'mips-sgi-irix6.5'),
             ('clisp-powerpc-apple-macos.tar.gz', 'powerpc-apple-macos'),
             ('clisp-powerpc-unknown-linuxlibc6.tar.gz',
              'powerpc-unknown-linuxlibc6'),
             ('clisp-rs6000-ibm-aix3.2.5.tar.gz', 'rs6000-ibm-aix3.2.5'),
             ('clisp-sparc-redhat51-linux.tar.gz', 'sparc-redhat51-linux'),
             ('clisp-sparc-sun-solaris2.4.tar.gz', 'sparc-sun-solaris2.4'),
             ('clisp-sparc-sun-sunos4.1.3_U1.tar.gz',
              'sparc-sun-sunos4.1.3_U1'),
             ('clisp-2.25.1-powerpc-apple-MacOSX.tar.gz',
              '2.25.1-powerpc-apple-MacOSX'),
             ('clisp-2.27-PowerMacintosh-powerpc-Darwin-1.3.7.tar.gz',
              '2.27-PowerMacintosh-powerpc-Darwin-1.3.7'),
             ('clisp-2.27-i686-unknown-Linux-2.2.19.tar.gz',
              '2.27-i686-unknown-Linux-2.2.19'),
             ('clisp-2.28-i386-i386-freebsd-4.3-RELEASE.tar.gz',
              '2.28-i386-i386-freebsd-4.3-RELEASE'),
             ('clisp-2.28-i686-unknown-cygwin_me-4.90-1.3.10.tar.gz',
              '2.28-i686-unknown-cygwin_me-4.90-1.3.10'),
             ('clisp-2.29-i386-i386-freebsd-4.6-STABLE.tar.gz',
              '2.29-i386-i386-freebsd-4.6-STABLE'),
             ('clisp-2.29-i686-unknown-cygwin_nt-5.0-1.3.12.tar.gz',
              '2.29-i686-unknown-cygwin_nt-5.0-1.3.12'),
             ('gcl-2.5.3-ansi-japi-xdr.20030701_mingw32.zip',
              '2.5.3-ansi-japi-xdr.20030701_mingw32'),
             ('gettext-runtime-0.13.1.bin.woe32.zip', '0.13.1.bin.woe32'),
             ('sather-logo_images.tar.gz', 'sather-logo_images'),
             ('sather-specification-000328.html.tar.gz', '000328.html'),
     ]:
         actual_branchname = get_version(url)

         assert actual_branchname == expected_branchname


 _expected_new_contents_first_visit = [
     'e9258d81faf5881a2f96a77ba609396f82cb97ad',
     '1170cf105b04b7e2822a0e09d2acf71da7b9a130',
     'fbd27c3f41f2668624ffc80b7ba5db9b92ff27ac',
     '0057bec9b5422aff9256af240b177ac0e3ac2608',
     '2b8d0d0b43a1078fc708930c8ddc2956a86c566e',
     '27de3b3bc6545d2a797aeeb4657c0e215a0c2e55',
     '2e6db43f5cd764e677f416ff0d0c78c7a82ef19b',
     'ae9be03bd2a06ed8f4f118d3fe76330bb1d77f62',
     'edeb33282b2bffa0e608e9d2fd960fd08093c0ea',
     'd64e64d4c73679323f8d4cde2643331ba6c20af9',
     '7a756602914be889c0a2d3952c710144b3e64cb0',
     '84fb589b554fcb7f32b806951dcf19518d67b08f',
     '8624bcdae55baeef00cd11d5dfcfa60f68710a02',
     'e08441aeab02704cfbd435d6445f7c072f8f524e',
     'f67935bc3a83a67259cda4b2d43373bd56703844',
     '809788434b433eb2e3cfabd5d591c9a659d5e3d8',
     '7d7c6c8c5ebaeff879f61f37083a3854184f6c41',
     'b99fec102eb24bffd53ab61fc30d59e810f116a2',
     '7d149b28eaa228b3871c91f0d5a95a2fa7cb0c68',
     'f0c97052e567948adf03e641301e9983c478ccff',
     '7fb724242e2b62b85ca64190c31dcae5303e19b3',
     '4f9709e64a9134fe8aefb36fd827b84d8b617ab5',
     '7350628ccf194c2c3afba4ac588c33e3f3ac778d',
     '0bb892d9391aa706dc2c3b1906567df43cbe06a2',
     '49d4c0ce1a16601f1e265d446b6c5ea6b512f27c',
     '6b5cc594ac466351450f7f64a0b79fdaf4435ad3',
     '3046e5d1f70297e2a507b98224b6222c9688d610',
     '1572607d456d7f633bc6065a2b3048496d679a31',
 ]

 _expected_new_directories_first_visit = [
     'daabc65ec75d487b1335ffc101c0ac11c803f8fc',
     '263be23b4a8101d3ad0d9831319a3e0f2b065f36',
     '7f6e63ba6eb3e2236f65892cd822041f1a01dd5c',
     '4db0a3ecbc976083e2dac01a62f93729698429a3',
     'dfef1c80e1098dd5deda664bb44a9ab1f738af13',
     'eca971d346ea54d95a6e19d5051f900237fafdaa',
     '3aebc29ed1fccc4a6f2f2010fb8e57882406b528',
 ]

 _expected_new_revisions_first_visit = {
     '44183488c0774ce3c957fa19ba695cf18a4a42b3':
     '3aebc29ed1fccc4a6f2f2010fb8e57882406b528'
 }

 _expected_branches_first_visit = {
     'HEAD': {
         'target_type': 'alias',
         'target': 'releases/0.1.0',
     },
     'releases/0.1.0': {
         'target_type': 'revision',
         'target': '44183488c0774ce3c957fa19ba695cf18a4a42b3',
     },
 }

 # hash is different than before as we changed the snapshot
 # (gnu used to use `release/` (singular) instead of the plural `releases/`)
 _expected_new_snapshot_first_visit_id = 'c419397fd912039825ebdbea378bc6283f006bf5'  # noqa


-# def test_release_artifact_not_found(requests_mock):
-#     package = '8sync'
-#     package_url = 'https://ftp.gnu.org/gnu/8sync/'
-#     tarballs = [{
-#         'date': '944729610',
-#         'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
-#     }]
-
-#     loader = GNULoader(package, package_url, tarballs)
-#     requests_mock.get(re.compile('https://'), status_code=404)
-
-#     assert actual_load_status == {'status': 'uneventful'}
-#     stats = loader.storage.stat_counters()
-
-#     assert {
-#         'content': 0,
-#         'directory': 0,
-#         'origin': 1,
-#         'origin_visit': 1,
-#         'person': 0,
-#         'release': 0,
-#         'revision': 0,
-#         'skipped_content': 0,
-#         'snapshot': 0,
-#     } == stats
+
+def test_release_artifact_not_found(requests_mock):
+    package = '8sync'
+    package_url = 'https://ftp.gnu.org/gnu/8sync/'
+    tarballs = [{
+        'date': '944729610',
+        'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
+    }]
+
+    loader = GNULoader(package, package_url, tarballs)
+    requests_mock.get(re.compile('https://'), status_code=404)
+
+    actual_load_status = loader.load()
+
+    assert actual_load_status == {'status': 'uneventful'}
+    stats = loader.storage.stat_counters()
+
+    assert {
+        'content': 0,
+        'directory': 0,
+        'origin': 1,
+        'origin_visit': 1,
+        'person': 0,
+        'release': 0,
+        'revision': 0,
+        'skipped_content': 0,
+        'snapshot': 0,
+    } == stats


 def test_release_artifact_no_prior_visit(requests_mock):
     """With no prior visit, loading a gnu project ends up with 1 snapshot

     """
     assert 'SWH_CONFIG_FILENAME' in os.environ  # cf. tox.ini

     package = '8sync'
     package_url = 'https://ftp.gnu.org/gnu/8sync/'
     tarballs = [{
         'date': '944729610',
         'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
     }]

     loader = GNULoader(package, package_url, tarballs)
     requests_mock.get(re.compile('https://'), body=get_response_cb)

     actual_load_status = loader.load()

     assert actual_load_status == {'status': 'eventful'}

     stats = loader.storage.stat_counters()
     assert {
         'content': len(_expected_new_contents_first_visit),
         'directory': len(_expected_new_directories_first_visit),
         'origin': 1,
         'origin_visit': 1,
         'person': 1,
         'release': 0,
         'revision': len(_expected_new_revisions_first_visit),
         'skipped_content': 0,
         'snapshot': 1
     } == stats

     expected_contents = map(hash_to_bytes, _expected_new_contents_first_visit)
     assert list(loader.storage.content_missing_per_sha1(expected_contents))\
         == []

     expected_dirs = map(hash_to_bytes, _expected_new_directories_first_visit)
     assert list(loader.storage.directory_missing(expected_dirs)) == []

     expected_revs = map(hash_to_bytes, _expected_new_revisions_first_visit)
     assert list(loader.storage.revision_missing(expected_revs)) == []

     check_snapshot(
         _expected_new_snapshot_first_visit_id,
         _expected_branches_first_visit,
         storage=loader.storage)
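
Note (illustrative, not part of the diff): the test expectations above follow the branch-building logic in load(): a version with a single artifact yields one `releases/<version>` branch, the default release additionally gets a `HEAD` alias, and a version with several artifacts would yield one `releases/<version>/<filename>` branch per artifact. A minimal sketch of the snapshot branches loaded for the 8sync first visit, written with the bytes keys loader.py actually uses (the test fixture above spells them as hex strings):

    from swh.model.hashutil import hash_to_bytes

    # branches of the snapshot built by PackageLoader.load for 8sync 0.1.0,
    # mirroring _expected_branches_first_visit above
    branches = {
        b'HEAD': {
            'target_type': 'alias',
            'target': b'releases/0.1.0',
        },
        b'releases/0.1.0': {
            'target_type': 'revision',
            'target': hash_to_bytes(
                '44183488c0774ce3c957fa19ba695cf18a4a42b3'),
        },
    }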