diff --git a/docs/package-loader-specifications.rst b/docs/package-loader-specifications.rst index 25d92c4..b277287 100644 --- a/docs/package-loader-specifications.rst +++ b/docs/package-loader-specifications.rst @@ -1,106 +1,112 @@ .. _package-loader-specifications: Package loader specifications ============================= Release fields -------------- Here is an overview of the fields (+ internal version name + branch name) used by each package loader, after D6616: .. list-table:: Fields used by each package loader :header-rows: 1 * - Loader - internal version - branch name - name - message - synthetic - author - date - Notes * - archive - passed as arg - ``release_name(​version)`` - =version - - "swh-loader-package: - synthetic revision message" + - "Synthetic release for archive at {p_info.url}" - true - - SWH robot + - "" - passed as arg - * - cran - ``metadata.get(​"Version", passed as arg)`` - ``release_name(​version)`` - =version - - =version + - standard message - true - ``metadata.get(​"Maintainer", "")`` - ``metadata.get(​"Date")`` - metadata is intrinsic * - debian - passed as arg (e.g. ``stretch/contrib/0.7.2-3``) - ``release_name(​version)`` - =version - - "Synthetic revision for Debian source package %s version %s" + - standard message (using full version) - true - ``metadata​.changelog​.person`` - ``metadata​.changelog​.date`` - metadata is intrinsic. Old revisions have ``dsc`` as type * - deposit - HEAD - only HEAD - HEAD - "{client}: Deposit {id} in collection {collection}" - true - - SWH robot + - original author - ```` from SWORD XML - revisions had parents * - nixguix - URL - URL - URL - - "" + - None - true - "" - None - it's the URL of the artifact referenced by the derivation * - npm - ``metadata​["version"]`` - ``release_name(​version)`` - =version - - =version + - standard message - true - from int metadata or "" - from ext metadata or None - * - opam - as given by opam - "{opam_package}​.{version}" - =version - - =version + - standard message - true - from metadata - None - "{self.opam_package}​.{version}" matches the version names used by opam's backend. metadata is extrinsic * - pypi - ``metadata​["version"]`` - ``release_name(​version)`` or ``release_name(​version, filename)`` - =version - - "{version}: {metadata[​'comment_text']}" or just version + - ``metadata[​'comment_text']`` or standard message - true - from int metadata or "" - from ext metadata or None - metadata is intrinsic using this function:: def release_name(version: str, filename: Optional[str] = None) -> str: if filename: return "releases/%s/%s" % (version, filename) return "releases/%s" % version +and "standard message" being defined as:: + + msg = ( + f"Synthetic release for {PACKAGE_MANAGER} source package {name} " + f"version {version}" + ) + The ``target_type`` field is always ``dir``, and the target is the id of a directory loaded by unpacking a tarball/zip file/...
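For illustration, here is a minimal sketch (not part of this changeset) of how the fields in the table combine when a loader builds one of these synthetic releases; ``build_synthetic_release`` and its ``package_manager``, ``name``, ``version``, ``directory``, ``date`` and ``author`` parameters are placeholders, not actual loader code::

    from typing import Optional

    from swh.model.model import (
        ObjectType,
        Person,
        Release,
        Sha1Git,
        TimestampWithTimezone,
    )

    def build_synthetic_release(
        package_manager: str,
        name: str,
        version: str,
        directory: Sha1Git,
        date: Optional[TimestampWithTimezone] = None,
        author: Person = Person.from_fullname(b""),
    ) -> Release:
        # the "standard message" defined above
        msg = (
            f"Synthetic release for {package_manager} source package {name} "
            f"version {version}"
        )
        return Release(
            name=version.encode(),  # pointed to by the "releases/{version}" branch
            message=msg.encode(),
            author=author,
            date=date,
            target=directory,  # id of the directory unpacked from the artifact
            target_type=ObjectType.DIRECTORY,
            synthetic=True,
        )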
diff --git a/swh/loader/package/archive/loader.py b/swh/loader/package/archive/loader.py index 2f83858..997736a 100644 --- a/swh/loader/package/archive/loader.py +++ b/swh/loader/package/archive/loader.py @@ -1,167 +1,162 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import hashlib import logging from os import path import string from typing import Any, Dict, Iterator, Mapping, Optional, Sequence, Tuple, Union import attr import iso8601 from swh.loader.package.loader import BasePackageInfo, PackageLoader, PartialExtID -from swh.loader.package.utils import release_name -from swh.model.model import ObjectType, Person, Release, Sha1Git, TimestampWithTimezone +from swh.loader.package.utils import EMPTY_AUTHOR, release_name +from swh.model.model import ObjectType, Release, Sha1Git, TimestampWithTimezone from swh.storage.interface import StorageInterface logger = logging.getLogger(__name__) -SWH_PERSON = Person( - name=b"Software Heritage", - fullname=b"Software Heritage", - email=b"robot@softwareheritage.org", -) -REVISION_MESSAGE = b"swh-loader-package: synthetic revision message" @attr.s class ArchivePackageInfo(BasePackageInfo): raw_info = attr.ib(type=Dict[str, Any]) length = attr.ib(type=int) """Size of the archive file""" time = attr.ib(type=Union[str, datetime.datetime]) """Timestamp of the archive file on the server""" # default format for gnu MANIFEST_FORMAT = string.Template("$time $length $version $url") def extid(self, manifest_format: Optional[string.Template] = None) -> PartialExtID: """Returns a unique intrinsic identifier of this package info ``manifest_format`` allows overriding the class' default MANIFEST_FORMAT""" manifest_format = manifest_format or self.MANIFEST_FORMAT # TODO: use parsed attributes instead of self.raw_info manifest = manifest_format.substitute( {k: str(v) for (k, v) in self.raw_info.items()} ) return (self.EXTID_TYPE, hashlib.sha256(manifest.encode()).digest()) @classmethod def from_metadata(cls, a_metadata: Dict[str, Any]) -> "ArchivePackageInfo": url = a_metadata["url"] filename = a_metadata.get("filename") return cls( url=url, filename=filename if filename else path.split(url)[-1], raw_info=a_metadata, length=a_metadata["length"], time=a_metadata["time"], version=a_metadata["version"], ) class ArchiveLoader(PackageLoader[ArchivePackageInfo]): """Load archive origin's artifact files into swh archive """ visit_type = "tar" def __init__( self, storage: StorageInterface, url: str, artifacts: Sequence[Dict[str, Any]], extid_manifest_format: Optional[str] = None, max_content_size: Optional[int] = None, snapshot_append: bool = False, ): f"""Loader constructor. For now, this is the lister's task output. Args: url: Origin url artifacts: List of artifact information with keys: - **time**: last modification time as either isoformat date string or timestamp - **url**: the artifact url to retrieve filename - **filename**: optionally, the file's name - **version**: artifact's version - **length**: artifact's length extid_manifest_format: template string used to format a manifest, which is hashed to get the extid of a package. 
Defaults to {ArchivePackageInfo.MANIFEST_FORMAT!r} snapshot_append: if :const:`True`, append latest snapshot content to the new snapshot created by the loader """ super().__init__(storage=storage, url=url, max_content_size=max_content_size) self.artifacts = artifacts # assume order is enforced in the lister self.extid_manifest_format = ( None if extid_manifest_format is None else string.Template(extid_manifest_format) ) self.snapshot_append = snapshot_append def get_versions(self) -> Sequence[str]: versions = [] for archive in self.artifacts: v = archive.get("version") if v: versions.append(v) return versions def get_default_version(self) -> str: # It's the most recent, so for this loader, it's the last one return self.artifacts[-1]["version"] def get_package_info( self, version: str ) -> Iterator[Tuple[str, ArchivePackageInfo]]: for a_metadata in self.artifacts: p_info = ArchivePackageInfo.from_metadata(a_metadata) if version == p_info.version: # FIXME: this code assumes we have only 1 artifact per # versioned package yield release_name(version), p_info def new_packageinfo_to_extid( self, p_info: ArchivePackageInfo ) -> Optional[PartialExtID]: return p_info.extid(manifest_format=self.extid_manifest_format) def build_release( self, p_info: ArchivePackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Release]: time = p_info.time # assume it's a timestamp if isinstance(time, str): # otherwise, assume it's a parsable date parsed_time = iso8601.parse_date(time) else: parsed_time = time normalized_time = TimestampWithTimezone.from_datetime(parsed_time) + msg = f"Synthetic release for archive at {p_info.url}" return Release( name=p_info.version.encode(), - message=REVISION_MESSAGE, + message=msg.encode(), date=normalized_time, - author=SWH_PERSON, + author=EMPTY_AUTHOR, target=directory, target_type=ObjectType.DIRECTORY, synthetic=True, ) def extra_branches(self) -> Dict[bytes, Mapping[str, Any]]: if not self.snapshot_append: return {} last_snapshot = self.last_snapshot() return last_snapshot.to_dict()["branches"] if last_snapshot else {} diff --git a/swh/loader/package/archive/tests/test_archive.py b/swh/loader/package/archive/tests/test_archive.py index bc148c2..26ccf6c 100644 --- a/swh/loader/package/archive/tests/test_archive.py +++ b/swh/loader/package/archive/tests/test_archive.py @@ -1,465 +1,490 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import hashlib from io import BytesIO from pathlib import Path import string import attr import pytest from requests.exceptions import ContentDecodingError from swh.loader.package.archive.loader import ArchiveLoader, ArchivePackageInfo from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats -from swh.model.hashutil import hash_to_bytes -from swh.model.model import Snapshot, SnapshotBranch, TargetType +from swh.model.hashutil import hash_to_bytes, hash_to_hex +from swh.model.model import ( + ObjectType, + Person, + Release, + Snapshot, + SnapshotBranch, + TargetType, + Timestamp, + TimestampWithTimezone, +) URL = "https://ftp.gnu.org/gnu/8sync/" GNU_ARTIFACTS = [ { "time": 944729610, "url": "https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz", "length": 221837, "filename": "8sync-0.1.0.tar.gz", "version": "0.1.0", }, { "time": 1480991830, "url": "https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz", "length": 
238466, "filename": "8sync-0.2.0.tar.gz", "version": "0.2.0", }, ] _expected_new_contents_first_visit = [ "e9258d81faf5881a2f96a77ba609396f82cb97ad", "1170cf105b04b7e2822a0e09d2acf71da7b9a130", "fbd27c3f41f2668624ffc80b7ba5db9b92ff27ac", "0057bec9b5422aff9256af240b177ac0e3ac2608", "2b8d0d0b43a1078fc708930c8ddc2956a86c566e", "27de3b3bc6545d2a797aeeb4657c0e215a0c2e55", "2e6db43f5cd764e677f416ff0d0c78c7a82ef19b", "ae9be03bd2a06ed8f4f118d3fe76330bb1d77f62", "edeb33282b2bffa0e608e9d2fd960fd08093c0ea", "d64e64d4c73679323f8d4cde2643331ba6c20af9", "7a756602914be889c0a2d3952c710144b3e64cb0", "84fb589b554fcb7f32b806951dcf19518d67b08f", "8624bcdae55baeef00cd11d5dfcfa60f68710a02", "e08441aeab02704cfbd435d6445f7c072f8f524e", "f67935bc3a83a67259cda4b2d43373bd56703844", "809788434b433eb2e3cfabd5d591c9a659d5e3d8", "7d7c6c8c5ebaeff879f61f37083a3854184f6c41", "b99fec102eb24bffd53ab61fc30d59e810f116a2", "7d149b28eaa228b3871c91f0d5a95a2fa7cb0c68", "f0c97052e567948adf03e641301e9983c478ccff", "7fb724242e2b62b85ca64190c31dcae5303e19b3", "4f9709e64a9134fe8aefb36fd827b84d8b617ab5", "7350628ccf194c2c3afba4ac588c33e3f3ac778d", "0bb892d9391aa706dc2c3b1906567df43cbe06a2", "49d4c0ce1a16601f1e265d446b6c5ea6b512f27c", "6b5cc594ac466351450f7f64a0b79fdaf4435ad3", "3046e5d1f70297e2a507b98224b6222c9688d610", "1572607d456d7f633bc6065a2b3048496d679a31", ] _expected_new_directories_first_visit = [ "daabc65ec75d487b1335ffc101c0ac11c803f8fc", "263be23b4a8101d3ad0d9831319a3e0f2b065f36", "7f6e63ba6eb3e2236f65892cd822041f1a01dd5c", "4db0a3ecbc976083e2dac01a62f93729698429a3", "dfef1c80e1098dd5deda664bb44a9ab1f738af13", "eca971d346ea54d95a6e19d5051f900237fafdaa", "3aebc29ed1fccc4a6f2f2010fb8e57882406b528", ] _expected_new_releases_first_visit = { - "c9786c1e3b46f52779c727d3509d66ebf8948d88": ( + "97c2ada10ca9b7876a8b5b17858b0518309170fd": ( "3aebc29ed1fccc4a6f2f2010fb8e57882406b528" ) } def test_archive_visit_with_no_artifact_found(swh_storage, requests_mock_datadir): url = URL unknown_artifact_url = "https://ftp.g.o/unknown/8sync-0.1.0.tar.gz" loader = ArchiveLoader( swh_storage, url, artifacts=[ { "time": 944729610, "url": unknown_artifact_url, # unknown artifact "length": 221837, "filename": "8sync-0.1.0.tar.gz", "version": "0.1.0", } ], ) actual_load_status = loader.load() assert actual_load_status["status"] == "uneventful" assert actual_load_status["snapshot_id"] is not None stats = get_stats(swh_storage) assert { "content": 0, "directory": 0, "origin": 1, "origin_visit": 1, "release": 0, "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats assert_last_visit_matches(swh_storage, url, status="partial", type="tar") def test_archive_visit_with_release_artifact_no_prior_visit( swh_storage, requests_mock_datadir ): """With no prior visit, load a gnu project ends up with 1 snapshot """ loader = ArchiveLoader(swh_storage, URL, artifacts=GNU_ARTIFACTS[:1]) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" expected_snapshot_first_visit_id = hash_to_bytes( - "cdf8f335fa0c81c8ad089870ec14f52b1980eb6c" + "af62f6f6d464f9b29f270d1bbefa355af38946c4" ) - assert ( - hash_to_bytes(actual_load_status["snapshot_id"]) - == expected_snapshot_first_visit_id + assert actual_load_status["snapshot_id"] == hash_to_hex( + expected_snapshot_first_visit_id ) assert_last_visit_matches(swh_storage, URL, status="full", type="tar") stats = get_stats(swh_storage) assert { "content": len(_expected_new_contents_first_visit), "directory": len(_expected_new_directories_first_visit), "origin": 1, "origin_visit": 1, 
"release": len(_expected_new_releases_first_visit), "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats + release_id = hash_to_bytes(list(_expected_new_releases_first_visit)[0]) expected_snapshot = Snapshot( id=expected_snapshot_first_visit_id, branches={ b"HEAD": SnapshotBranch( target_type=TargetType.ALIAS, target=b"releases/0.1.0", ), b"releases/0.1.0": SnapshotBranch( - target_type=TargetType.RELEASE, - target=hash_to_bytes(list(_expected_new_releases_first_visit)[0]), + target_type=TargetType.RELEASE, target=release_id, ), }, ) - check_snapshot(expected_snapshot, swh_storage) + assert swh_storage.release_get([release_id])[0] == Release( + id=release_id, + name=b"0.1.0", + message=( + b"Synthetic release for archive at " + b"https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz" + ), + target=hash_to_bytes("3aebc29ed1fccc4a6f2f2010fb8e57882406b528"), + target_type=ObjectType.DIRECTORY, + synthetic=True, + author=Person.from_fullname(b""), + date=TimestampWithTimezone( + timestamp=Timestamp(seconds=944729610, microseconds=0), + offset=0, + negative_utc=False, + ), + ) + expected_contents = map(hash_to_bytes, _expected_new_contents_first_visit) assert list(swh_storage.content_missing_per_sha1(expected_contents)) == [] expected_dirs = map(hash_to_bytes, _expected_new_directories_first_visit) assert list(swh_storage.directory_missing(expected_dirs)) == [] expected_rels = map(hash_to_bytes, _expected_new_releases_first_visit) assert list(swh_storage.release_missing(expected_rels)) == [] def test_archive_2_visits_without_change(swh_storage, requests_mock_datadir): """With no prior visit, load a gnu project ends up with 1 snapshot """ url = URL loader = ArchiveLoader(swh_storage, url, artifacts=GNU_ARTIFACTS[:1]) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None assert_last_visit_matches(swh_storage, url, status="full", type="tar") actual_load_status2 = loader.load() assert actual_load_status2["status"] == "uneventful" assert actual_load_status2["snapshot_id"] is not None assert actual_load_status["snapshot_id"] == actual_load_status2["snapshot_id"] assert_last_visit_matches(swh_storage, url, status="full", type="tar") urls = [ m.url for m in requests_mock_datadir.request_history if m.url.startswith("https://ftp.gnu.org") ] assert len(urls) == 1 def test_archive_2_visits_with_new_artifact(swh_storage, requests_mock_datadir): """With no prior visit, load a gnu project ends up with 1 snapshot """ url = URL artifact1 = GNU_ARTIFACTS[0] loader = ArchiveLoader(swh_storage, url, [artifact1]) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None assert_last_visit_matches(swh_storage, url, status="full", type="tar") stats = get_stats(swh_storage) assert { "content": len(_expected_new_contents_first_visit), "directory": len(_expected_new_directories_first_visit), "origin": 1, "origin_visit": 1, "release": len(_expected_new_releases_first_visit), "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats urls = [ m.url for m in requests_mock_datadir.request_history if m.url.startswith("https://ftp.gnu.org") ] assert len(urls) == 1 artifact2 = GNU_ARTIFACTS[1] loader2 = ArchiveLoader(swh_storage, url, [artifact1, artifact2]) stats2 = get_stats(swh_storage) assert stats == stats2 # ensure we share the storage actual_load_status2 = loader2.load() assert actual_load_status2["status"] == "eventful" assert 
actual_load_status2["snapshot_id"] is not None stats2 = get_stats(swh_storage) assert { "content": len(_expected_new_contents_first_visit) + 14, "directory": len(_expected_new_directories_first_visit) + 8, "origin": 1, "origin_visit": 1 + 1, "release": len(_expected_new_releases_first_visit) + 1, "revision": 0, "skipped_content": 0, "snapshot": 1 + 1, } == stats2 assert_last_visit_matches(swh_storage, url, status="full", type="tar") urls = [ m.url for m in requests_mock_datadir.request_history if m.url.startswith("https://ftp.gnu.org") ] # 1 artifact (2nd time no modification) + 1 new artifact assert len(urls) == 2 def test_archive_2_visits_without_change_not_gnu(swh_storage, requests_mock_datadir): """Load a project archive (not gnu) ends up with 1 snapshot """ url = "https://something.else.org/8sync/" artifacts = [ # this is not a gnu artifact { "time": "1999-12-09T09:53:30+00:00", # it's also not a timestamp "sha256": "d5d1051e59b2be6f065a9fc6aedd3a391e44d0274b78b9bb4e2b57a09134dbe4", # noqa # keep a gnu artifact reference to avoid adding other test files "url": "https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz", "length": 238466, "filename": "8sync-0.2.0.tar.gz", "version": "0.2.0", } ] # Here the loader defines the id_keys to use for existence in the snapshot # It's not the default archive loader which loader = ArchiveLoader( swh_storage, url, artifacts=artifacts, extid_manifest_format="$sha256 $length $url", ) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None assert_last_visit_matches(swh_storage, url, status="full", type="tar") actual_load_status2 = loader.load() assert actual_load_status2["status"] == "uneventful" assert actual_load_status2["snapshot_id"] == actual_load_status["snapshot_id"] assert_last_visit_matches(swh_storage, url, status="full", type="tar") urls = [ m.url for m in requests_mock_datadir.request_history if m.url.startswith("https://ftp.gnu.org") ] assert len(urls) == 1 def test_archive_extid(): """Compute primary key should return the right identity """ @attr.s class TestPackageInfo(ArchivePackageInfo): a = attr.ib() b = attr.ib() metadata = GNU_ARTIFACTS[0] p_info = TestPackageInfo( raw_info={**metadata, "a": 1, "b": 2}, a=1, b=2, **metadata, ) for manifest_format, expected_manifest in [ (string.Template("$a $b"), b"1 2"), (string.Template(""), b""), (None, "{time} {length} {version} {url}".format(**metadata).encode()), ]: actual_id = p_info.extid(manifest_format=manifest_format) assert actual_id == ( "package-manifest-sha256", hashlib.sha256(expected_manifest).digest(), ) with pytest.raises(KeyError): p_info.extid(manifest_format=string.Template("$a $unknown_key")) def test_archive_snapshot_append(swh_storage, requests_mock_datadir): # first loading with a first artifact artifact1 = GNU_ARTIFACTS[0] loader = ArchiveLoader(swh_storage, URL, [artifact1], snapshot_append=True) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None assert_last_visit_matches(swh_storage, URL, status="full", type="tar") # check expected snapshot snapshot = loader.last_snapshot() assert len(snapshot.branches) == 2 branch_artifact1_name = f"releases/{artifact1['version']}".encode() assert b"HEAD" in snapshot.branches assert branch_artifact1_name in snapshot.branches assert snapshot.branches[b"HEAD"].target == branch_artifact1_name # second loading with a second artifact artifact2 = GNU_ARTIFACTS[1] loader = 
ArchiveLoader(swh_storage, URL, [artifact2], snapshot_append=True) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None assert_last_visit_matches(swh_storage, URL, status="full", type="tar") # check expected snapshot, should contain a new branch and the # branch for the first artifact snapshot = loader.last_snapshot() assert len(snapshot.branches) == 3 branch_artifact2_name = f"releases/{artifact2['version']}".encode() assert b"HEAD" in snapshot.branches assert branch_artifact2_name in snapshot.branches assert branch_artifact1_name in snapshot.branches assert snapshot.branches[b"HEAD"].target == branch_artifact2_name def test_archive_snapshot_append_branch_override(swh_storage, requests_mock_datadir): # first loading for a first artifact artifact1 = GNU_ARTIFACTS[0] loader = ArchiveLoader(swh_storage, URL, [artifact1], snapshot_append=True) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None assert_last_visit_matches(swh_storage, URL, status="full", type="tar") # check expected snapshot snapshot = loader.last_snapshot() assert len(snapshot.branches) == 2 branch_artifact1_name = f"releases/{artifact1['version']}".encode() assert branch_artifact1_name in snapshot.branches branch_target_first_visit = snapshot.branches[branch_artifact1_name].target # second loading for a second artifact with same version as the first one # but with different tarball content artifact2 = dict(GNU_ARTIFACTS[0]) artifact2["url"] = GNU_ARTIFACTS[1]["url"] artifact2["time"] = GNU_ARTIFACTS[1]["time"] artifact2["length"] = GNU_ARTIFACTS[1]["length"] loader = ArchiveLoader(swh_storage, URL, [artifact2], snapshot_append=True) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None assert_last_visit_matches(swh_storage, URL, status="full", type="tar") # check expected snapshot, should contain the same branch as previously # but with different target snapshot = loader.last_snapshot() assert len(snapshot.branches) == 2 assert branch_artifact1_name in snapshot.branches branch_target_second_visit = snapshot.branches[branch_artifact1_name].target assert branch_target_first_visit != branch_target_second_visit @pytest.fixture def not_gzipped_tarball_bytes(datadir): return Path(datadir, "not_gzipped_tarball.tar.gz").read_bytes() def test_archive_not_gzipped_tarball( swh_storage, requests_mock, not_gzipped_tarball_bytes ): """Check that a tarball erroneously marked as gzip compressed can still be downloaded and processed. 
""" filename = "not_gzipped_tarball.tar.gz" url = f"https://example.org/ftp/{filename}" requests_mock.get( url, [ {"exc": ContentDecodingError,}, {"body": BytesIO(not_gzipped_tarball_bytes),}, ], ) loader = ArchiveLoader( swh_storage, url, artifacts=[ { "time": 944729610, "url": url, "length": 221837, "filename": filename, "version": "0.1.0", } ], ) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None snapshot = loader.last_snapshot() assert len(snapshot.branches) == 2 assert b"releases/0.1.0" in snapshot.branches diff --git a/swh/loader/package/cran/loader.py b/swh/loader/package/cran/loader.py index 601be34..534e284 100644 --- a/swh/loader/package/cran/loader.py +++ b/swh/loader/package/cran/loader.py @@ -1,175 +1,181 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime from datetime import timezone import logging import os from os import path import re import string from typing import Any, Dict, Iterator, List, Optional, Tuple import attr import dateutil.parser from debian.deb822 import Deb822 from swh.loader.package.loader import BasePackageInfo, PackageLoader from swh.loader.package.utils import release_name from swh.model.model import ObjectType, Person, Release, Sha1Git, TimestampWithTimezone from swh.storage.interface import StorageInterface logger = logging.getLogger(__name__) DATE_PATTERN = re.compile(r"^(?P\d{4})-(?P\d{2})$") @attr.s class CRANPackageInfo(BasePackageInfo): raw_info = attr.ib(type=Dict[str, Any]) + name = attr.ib(type=str) EXTID_TYPE = "cran-sha256" MANIFEST_FORMAT = string.Template("$version $url") @classmethod def from_metadata(cls, a_metadata: Dict[str, Any]) -> "CRANPackageInfo": url = a_metadata["url"] return CRANPackageInfo( url=url, filename=path.basename(url), raw_info=a_metadata, + name=a_metadata["package"], version=a_metadata["version"], ) class CRANLoader(PackageLoader[CRANPackageInfo]): visit_type = "cran" def __init__( self, storage: StorageInterface, url: str, artifacts: List[Dict], max_content_size: Optional[int] = None, ): """Loader constructor. 
Args: url: Origin url to retrieve cran artifact(s) from artifacts: List of associated artifacts for the origin url """ super().__init__(storage=storage, url=url, max_content_size=max_content_size) # make explicit what we consider the artifact identity self.artifacts = artifacts def get_versions(self) -> List[str]: versions = [] for artifact in self.artifacts: versions.append(artifact["version"]) return versions def get_default_version(self) -> str: return self.artifacts[-1]["version"] def get_package_info(self, version: str) -> Iterator[Tuple[str, CRANPackageInfo]]: for a_metadata in self.artifacts: p_info = CRANPackageInfo.from_metadata(a_metadata) if version == p_info.version: yield release_name(version), p_info def build_release( self, p_info: CRANPackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Release]: # a_metadata is empty metadata = extract_intrinsic_metadata(uncompressed_path) date = parse_date(metadata.get("Date")) author = Person.from_fullname(metadata.get("Maintainer", "").encode()) + msg = ( + f"Synthetic release for CRAN source package {p_info.name} " + f"version {p_info.version}" + ) return Release( name=p_info.version.encode(), - message=p_info.version.encode(), + message=msg.encode(), date=date, author=author, target_type=ObjectType.DIRECTORY, target=directory, synthetic=True, ) def parse_debian_control(filepath: str) -> Dict[str, Any]: """Parse debian control at filepath""" metadata: Dict = {} logger.debug("Debian control file %s", filepath) for paragraph in Deb822.iter_paragraphs(open(filepath, "rb")): logger.debug("paragraph: %s", paragraph) metadata.update(**paragraph) logger.debug("metadata parsed: %s", metadata) return metadata def extract_intrinsic_metadata(dir_path: str) -> Dict[str, Any]: """Given an uncompressed path holding the DESCRIPTION file, returns a DESCRIPTION parsed structure as a dict. CRAN origins describe their intrinsic metadata within a DESCRIPTION file at the root tree of a tarball. This DESCRIPTION uses a simple file format called DCF, the Debian control format. Release artifacts contain a single folder at their root. For example: $ tar tvf zprint-0.0.6.tar.gz drwxr-xr-x root/root 0 2018-08-22 11:01 zprint-0.0.6/ ... Args: dir_path (str): Path to the uncompressed directory representing a release artifact from CRAN. Returns: the DESCRIPTION parsed structure as a dict (or empty dict if missing) """ # Retrieve the root folder of the archive if not os.path.exists(dir_path): return {} lst = os.listdir(dir_path) if len(lst) != 1: return {} project_dirname = lst[0] description_path = os.path.join(dir_path, project_dirname, "DESCRIPTION") if not os.path.exists(description_path): return {} return parse_debian_control(description_path) def parse_date(date: Optional[str]) -> Optional[TimestampWithTimezone]: """Parse a date into a datetime """ assert not date or isinstance(date, str) dt: Optional[datetime.datetime] = None if not date: return None try: specific_date = DATE_PATTERN.match(date) if specific_date: year = int(specific_date.group("year")) month = int(specific_date.group("month")) dt = datetime.datetime(year, month, 1) else: dt = dateutil.parser.parse(date) if not dt.tzinfo: # up for discussion: the timezone needs to be set or # normalize_timestamp is not happy: ValueError: normalize_timestamp # received datetime without timezone: 2001-06-08 00:00:00 dt = dt.replace(tzinfo=timezone.utc) except Exception as e: logger.warning("Failed to parse date %s. 
Reason: %s", date, e) if dt: return TimestampWithTimezone.from_datetime(dt) else: return None diff --git a/swh/loader/package/cran/tests/test_cran.py b/swh/loader/package/cran/tests/test_cran.py index fdf0bac..442100e 100644 --- a/swh/loader/package/cran/tests/test_cran.py +++ b/swh/loader/package/cran/tests/test_cran.py @@ -1,369 +1,423 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime, timezone import os from os import path from unittest.mock import patch from dateutil.tz import tzlocal import pytest from swh.core.tarball import uncompress from swh.loader.package.cran.loader import ( CRANLoader, extract_intrinsic_metadata, parse_date, parse_debian_control, ) from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats from swh.model.hashutil import hash_to_bytes -from swh.model.model import Snapshot, SnapshotBranch, TargetType, TimestampWithTimezone +from swh.model.model import ( + ObjectType, + Person, + Release, + Snapshot, + SnapshotBranch, + TargetType, + Timestamp, + TimestampWithTimezone, +) + +RELEASE_ID = hash_to_bytes("9a977f6415e6997fd9dd53c6dcb540ff0a7bff26") SNAPSHOT = Snapshot( - id=hash_to_bytes("56ed00938d83892bd5b42f2f368ae38a1dbfa718"), + id=hash_to_bytes("3787efc620c55b1e18889cfa561d9bcdc62c4cb2"), branches={ b"HEAD": SnapshotBranch( target=b"releases/2.22-6", target_type=TargetType.ALIAS ), b"releases/2.22-6": SnapshotBranch( - target=hash_to_bytes("42993a72eac50a4a83523c9327a52be3593755a8"), - target_type=TargetType.RELEASE, + target=RELEASE_ID, target_type=TargetType.RELEASE, ), }, ) def test_cran_parse_date(): data = [ # parsable, some have debatable results though ("2001-June-08", datetime(2001, 6, 8, 0, 0, tzinfo=timezone.utc)), ( "Tue Dec 27 15:06:08 PST 2011", datetime(2011, 12, 27, 15, 6, 8, tzinfo=timezone.utc), ), ("8-14-2013", datetime(2013, 8, 14, 0, 0, tzinfo=timezone.utc)), ("2011-01", datetime(2011, 1, 1, 0, 0, tzinfo=timezone.utc)), ("201109", datetime(2009, 11, 20, 0, 0, tzinfo=timezone.utc)), ("04-12-2014", datetime(2014, 4, 12, 0, 0, tzinfo=timezone.utc)), ( "2018-08-24, 10:40:10", datetime(2018, 8, 24, 10, 40, 10, tzinfo=timezone.utc), ), ("2013-October-16", datetime(2013, 10, 16, 0, 0, tzinfo=timezone.utc)), ("Aug 23, 2013", datetime(2013, 8, 23, 0, 0, tzinfo=timezone.utc)), ("27-11-2014", datetime(2014, 11, 27, 0, 0, tzinfo=timezone.utc)), ("2019-09-26,", datetime(2019, 9, 26, 0, 0, tzinfo=timezone.utc)), ("9/25/2014", datetime(2014, 9, 25, 0, 0, tzinfo=timezone.utc)), ( "Fri Jun 27 17:23:53 2014", datetime(2014, 6, 27, 17, 23, 53, tzinfo=timezone.utc), ), ("28-04-2014", datetime(2014, 4, 28, 0, 0, tzinfo=timezone.utc)), ("04-14-2014", datetime(2014, 4, 14, 0, 0, tzinfo=timezone.utc)), ( "2019-05-08 14:17:31 UTC", datetime(2019, 5, 8, 14, 17, 31, tzinfo=timezone.utc), ), ( "Wed May 21 13:50:39 CEST 2014", datetime(2014, 5, 21, 13, 50, 39, tzinfo=tzlocal()), ), ( "2018-04-10 00:01:04 KST", datetime(2018, 4, 10, 0, 1, 4, tzinfo=timezone.utc), ), ("2019-08-25 10:45", datetime(2019, 8, 25, 10, 45, tzinfo=timezone.utc)), ("March 9, 2015", datetime(2015, 3, 9, 0, 0, tzinfo=timezone.utc)), ("Aug. 
18, 2012", datetime(2012, 8, 18, 0, 0, tzinfo=timezone.utc)), ("2014-Dec-17", datetime(2014, 12, 17, 0, 0, tzinfo=timezone.utc)), ("March 01, 2013", datetime(2013, 3, 1, 0, 0, tzinfo=timezone.utc)), ("2017-04-08.", datetime(2017, 4, 8, 0, 0, tzinfo=timezone.utc)), ("2014-Apr-22", datetime(2014, 4, 22, 0, 0, tzinfo=timezone.utc)), ( "Mon Jan 12 19:54:04 2015", datetime(2015, 1, 12, 19, 54, 4, tzinfo=timezone.utc), ), ("May 22, 2014", datetime(2014, 5, 22, 0, 0, tzinfo=timezone.utc)), ( "2014-08-12 09:55:10 EDT", datetime(2014, 8, 12, 9, 55, 10, tzinfo=timezone.utc), ), # unparsable ("Fabruary 21, 2012", None), ('2019-05-28"', None), ("2017-03-01 today", None), ("2016-11-0110.1093/icesjms/fsw182", None), ("2019-07-010", None), ("2015-02.23", None), ("20013-12-30", None), ("2016-08-017", None), ("2019-02-07l", None), ("2018-05-010", None), ("2019-09-27 KST", None), ("$Date$", None), ("2019-09-27 KST", None), ("2019-06-22 $Date$", None), ("$Date: 2013-01-18 12:49:03 -0600 (Fri, 18 Jan 2013) $", None), ("2015-7-013", None), ("2018-05-023", None), ("Check NEWS file for changes: news(package='simSummary')", None), ] for date, expected_date in data: actual_tstz = parse_date(date) if expected_date is None: assert actual_tstz is None, date else: expected_tstz = TimestampWithTimezone.from_datetime(expected_date) assert actual_tstz == expected_tstz, date @pytest.mark.fs def test_cran_extract_intrinsic_metadata(tmp_path, datadir): """Parsing existing archive's PKG-INFO should yield results""" uncompressed_archive_path = str(tmp_path) # sample url # https://cran.r-project.org/src_contrib_1.4.0_Recommended_KernSmooth_2.22-6.tar.gz # noqa archive_path = path.join( datadir, "https_cran.r-project.org", "src_contrib_1.4.0_Recommended_KernSmooth_2.22-6.tar.gz", ) uncompress(archive_path, dest=uncompressed_archive_path) actual_metadata = extract_intrinsic_metadata(uncompressed_archive_path) expected_metadata = { "Package": "KernSmooth", "Priority": "recommended", "Version": "2.22-6", "Date": "2001-June-08", "Title": "Functions for kernel smoothing for Wand & Jones (1995)", "Author": "S original by Matt Wand.\n\tR port by Brian Ripley .", # noqa "Maintainer": "Brian Ripley ", "Description": 'functions for kernel smoothing (and density estimation)\n corresponding to the book: \n Wand, M.P. and Jones, M.C. 
(1995) "Kernel Smoothing".', # noqa "License": "Unlimited use and distribution (see LICENCE).", "URL": "http://www.biostat.harvard.edu/~mwand", } assert actual_metadata == expected_metadata @pytest.mark.fs def test_cran_extract_intrinsic_metadata_failures(tmp_path): """Parsing inexistent path/archive/PKG-INFO yield None""" # inexistent first level path assert extract_intrinsic_metadata("/something-inexistent") == {} # inexistent second level path (as expected by pypi archives) assert extract_intrinsic_metadata(tmp_path) == {} # inexistent PKG-INFO within second level path existing_path_no_pkginfo = str(tmp_path / "something") os.mkdir(existing_path_no_pkginfo) assert extract_intrinsic_metadata(tmp_path) == {} def test_cran_one_visit(swh_storage, requests_mock_datadir): version = "2.22-6" base_url = "https://cran.r-project.org" origin_url = f"{base_url}/Packages/Recommended_KernSmooth/index.html" artifact_url = ( f"{base_url}/src_contrib_1.4.0_Recommended_KernSmooth_{version}.tar.gz" # noqa ) loader = CRANLoader( - swh_storage, origin_url, artifacts=[{"url": artifact_url, "version": version,}] + swh_storage, + origin_url, + artifacts=[ + { + "url": artifact_url, + "version": version, + "package": "Recommended_KernSmooth", + } + ], ) actual_load_status = loader.load() assert actual_load_status == { "status": "eventful", "snapshot_id": SNAPSHOT.id.hex(), } assert_last_visit_matches( swh_storage, origin_url, status="full", type="cran", snapshot=SNAPSHOT.id ) check_snapshot(SNAPSHOT, swh_storage) + assert swh_storage.release_get([RELEASE_ID])[0] == Release( + id=RELEASE_ID, + name=b"2.22-6", + message=( + b"Synthetic release for CRAN source package " + b"Recommended_KernSmooth version 2.22-6" + ), + target=hash_to_bytes("ff64177fea3f4a5136b9caf7581a4f7d4cf65296"), + target_type=ObjectType.DIRECTORY, + synthetic=True, + author=Person( + fullname=b"Brian Ripley ", + name=b"Brian Ripley", + email=b"ripley@stats.ox.ac.uk", + ), + date=TimestampWithTimezone( + timestamp=Timestamp(seconds=991958400, microseconds=0), + offset=0, + negative_utc=False, + ), + ) + visit_stats = get_stats(swh_storage) assert { "content": 33, "directory": 7, "origin": 1, "origin_visit": 1, "release": 1, "revision": 0, "skipped_content": 0, "snapshot": 1, } == visit_stats urls = [ m.url for m in requests_mock_datadir.request_history if m.url.startswith(base_url) ] # visited each artifact once across 2 visits assert len(urls) == 1 def test_cran_2_visits_same_origin(swh_storage, requests_mock_datadir): """Multiple visits on the same origin, only 1 archive fetch""" version = "2.22-6" base_url = "https://cran.r-project.org" origin_url = f"{base_url}/Packages/Recommended_KernSmooth/index.html" artifact_url = ( f"{base_url}/src_contrib_1.4.0_Recommended_KernSmooth_{version}.tar.gz" # noqa ) loader = CRANLoader( - swh_storage, origin_url, artifacts=[{"url": artifact_url, "version": version}] + swh_storage, + origin_url, + artifacts=[ + { + "url": artifact_url, + "version": version, + "package": "Recommended_KernSmooth", + } + ], ) # first visit actual_load_status = loader.load() assert actual_load_status == { "status": "eventful", "snapshot_id": SNAPSHOT.id.hex(), } check_snapshot(SNAPSHOT, swh_storage) assert_last_visit_matches( swh_storage, origin_url, status="full", type="cran", snapshot=SNAPSHOT.id ) visit_stats = get_stats(swh_storage) assert { "content": 33, "directory": 7, "origin": 1, "origin_visit": 1, "release": 1, "revision": 0, "skipped_content": 0, "snapshot": 1, } == visit_stats # second visit actual_load_status2 = 
loader.load() assert actual_load_status2 == { "status": "uneventful", "snapshot_id": SNAPSHOT.id.hex(), } assert_last_visit_matches( swh_storage, origin_url, status="full", type="cran", snapshot=SNAPSHOT.id, ) visit_stats2 = get_stats(swh_storage) visit_stats["origin_visit"] += 1 assert visit_stats2 == visit_stats, "same stats as 1st visit, +1 visit" urls = [ m.url for m in requests_mock_datadir.request_history if m.url.startswith(base_url) ] assert len(urls) == 1, "visited one time artifact url (across 2 visits)" def test_cran_parse_debian_control(datadir): description_file = os.path.join(datadir, "description", "acepack") actual_metadata = parse_debian_control(description_file) assert actual_metadata == { "Package": "acepack", "Maintainer": "Shawn Garbett", "Version": "1.4.1", "Author": "Phil Spector, Jerome Friedman, Robert Tibshirani...", "Description": "Two nonparametric methods for multiple regression...", "Title": "ACE & AVAS 4 Selecting Multiple Regression Transformations", "License": "MIT + file LICENSE", "Suggests": "testthat", "Packaged": "2016-10-28 15:38:59 UTC; garbetsp", "Repository": "CRAN", "Date/Publication": "2016-10-29 00:11:52", "NeedsCompilation": "yes", } def test_cran_parse_debian_control_unicode_issue(datadir): # iso-8859-1 caused failure, now fixed description_file = os.path.join(datadir, "description", "KnownBR") actual_metadata = parse_debian_control(description_file) assert actual_metadata == { "Package": "KnowBR", "Version": "2.0", "Title": """Discriminating Well Surveyed Spatial Units from Exhaustive Biodiversity Databases""", "Author": "Cástor Guisande González and Jorge M. Lobo", "Maintainer": "Cástor Guisande González ", "Description": "It uses species accumulation curves and diverse estimators...", "License": "GPL (>= 2)", "Encoding": "latin1", "Depends": "R (>= 3.0), fossil, mgcv, plotrix, sp, vegan", "Suggests": "raster, rgbif", "NeedsCompilation": "no", "Packaged": "2019-01-30 13:27:29 UTC; castor", "Repository": "CRAN", "Date/Publication": "2019-01-31 20:53:50 UTC", } @pytest.mark.parametrize( "method_name", ["build_extrinsic_snapshot_metadata", "build_extrinsic_origin_metadata",], ) def test_cran_fail_to_build_or_load_extrinsic_metadata( method_name, swh_storage, requests_mock_datadir ): """problem during loading: {visit: failed, status: failed, no snapshot} """ version = "2.22-6" base_url = "https://cran.r-project.org" origin_url = f"{base_url}/Packages/Recommended_KernSmooth/index.html" artifact_url = ( f"{base_url}/src_contrib_1.4.0_Recommended_KernSmooth_{version}.tar.gz" # noqa ) full_method_name = f"swh.loader.package.cran.loader.CRANLoader.{method_name}" with patch( full_method_name, side_effect=ValueError("Fake to fail to build or load extrinsic metadata"), ): loader = CRANLoader( swh_storage, origin_url, - artifacts=[{"url": artifact_url, "version": version}], + artifacts=[ + { + "url": artifact_url, + "version": version, + "package": "Recommended_KernSmooth", + } + ], ) actual_load_status = loader.load() assert actual_load_status == { "status": "failed", "snapshot_id": SNAPSHOT.id.hex(), } visit_stats = get_stats(swh_storage) assert { "content": 33, "directory": 7, "origin": 1, "origin_visit": 1, "release": 1, "revision": 0, "skipped_content": 0, "snapshot": 1, } == visit_stats assert_last_visit_matches( swh_storage, origin_url, status="partial", type="cran", snapshot=SNAPSHOT.id ) diff --git a/swh/loader/package/debian/loader.py b/swh/loader/package/debian/loader.py index f9ad852..a9b5137 100644 --- a/swh/loader/package/debian/loader.py 
+++ b/swh/loader/package/debian/loader.py @@ -1,464 +1,464 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import email.utils import logging from os import path import re import subprocess from typing import Any, Dict, Iterator, List, Mapping, Optional, Sequence, Tuple import attr from dateutil.parser import parse as parse_date from debian.changelog import Changelog from debian.deb822 import Dsc from swh.loader.package.loader import BasePackageInfo, PackageLoader, PartialExtID from swh.loader.package.utils import download, release_name from swh.model.hashutil import hash_to_bytes from swh.model.model import ObjectType, Person, Release, Sha1Git, TimestampWithTimezone from swh.storage.interface import StorageInterface logger = logging.getLogger(__name__) UPLOADERS_SPLIT = re.compile(r"(?<=\>)\s*,\s*") EXTID_TYPE = "dsc-sha256" class DscCountError(ValueError): """Raised when an unexpected number of .dsc files is seen""" pass @attr.s class DebianFileMetadata: name = attr.ib(type=str) """Filename""" sha256 = attr.ib(type=str) size = attr.ib(type=int) uri = attr.ib(type=str) """URL of this specific file""" # md5sum is not always available, make it optional md5sum = attr.ib(type=str, default="") # Some of the DSC files imported in swh apparently had a Checksums-SHA512 # field which got recorded in the archive. Current versions of dpkg-source # don't seem to generate them, but keep the field available for # future-proofing. sha512 = attr.ib(type=str, default="") @attr.s class DebianPackageChangelog: person = attr.ib(type=Dict[str, str]) """A dict with fields like model.Person, except they are str instead of bytes, and 'email' is optional.""" date = attr.ib(type=str) """Date of the changelog entry.""" history = attr.ib(type=List[Tuple[str, str]]) """List of tuples (package_name, version)""" @attr.s class DebianPackageInfo(BasePackageInfo): raw_info = attr.ib(type=Dict[str, Any]) files = attr.ib(type=Dict[str, DebianFileMetadata]) """Metadata of the files (.deb, .dsc, ...) of the package.""" name = attr.ib(type=str) full_version = attr.ib(type=str) """e.g. stretch/contrib/0.7.2-3""" @classmethod def from_metadata( cls, a_metadata: Dict[str, Any], url: str, version: str ) -> "DebianPackageInfo": return cls( url=url, filename=None, version=version, raw_info=a_metadata, files={ file_name: DebianFileMetadata(**file_metadata) for (file_name, file_metadata) in a_metadata.get("files", {}).items() }, name=a_metadata["name"], full_version=a_metadata["version"], ) def extid(self) -> Optional[PartialExtID]: dsc_files = [ file for (name, file) in self.files.items() if name.endswith(".dsc") ] if len(dsc_files) != 1: raise DscCountError( f"Expected exactly one .dsc file for package {self.name}, " f"got {len(dsc_files)}" ) return (EXTID_TYPE, hash_to_bytes(dsc_files[0].sha256)) @attr.s class IntrinsicPackageMetadata: """Metadata extracted from a package's .dsc file.""" name = attr.ib(type=str) version = attr.ib(type=str) changelog = attr.ib(type=DebianPackageChangelog) maintainers = attr.ib(type=List[Dict[str, str]]) """A list of dicts with fields like model.Person, except they are str instead of bytes, and 'email' is optional.""" class DebianLoader(PackageLoader[DebianPackageInfo]): """Load debian origins into swh archive. 
""" visit_type = "deb" def __init__( self, storage: StorageInterface, url: str, date: str, packages: Mapping[str, Any], max_content_size: Optional[int] = None, ): """Debian Loader implementation. Args: url: Origin url (e.g. deb://Debian/packages/cicero) date: Ignored packages: versioned packages and associated artifacts, example:: { 'stretch/contrib/0.7.2-3': { 'name': 'cicero', 'version': '0.7.2-3' 'files': { 'cicero_0.7.2-3.diff.gz': { 'md5sum': 'a93661b6a48db48d59ba7d26796fc9ce', 'name': 'cicero_0.7.2-3.diff.gz', 'sha256': 'f039c9642fe15c75bed5254315e2a29f...', 'size': 3964, 'uri': 'http://d.d.o/cicero_0.7.2-3.diff.gz', }, 'cicero_0.7.2-3.dsc': { 'md5sum': 'd5dac83eb9cfc9bb52a15eb618b4670a', 'name': 'cicero_0.7.2-3.dsc', 'sha256': '35b7f1048010c67adfd8d70e4961aefb...', 'size': 1864, 'uri': 'http://d.d.o/cicero_0.7.2-3.dsc', }, 'cicero_0.7.2.orig.tar.gz': { 'md5sum': '4353dede07c5728319ba7f5595a7230a', 'name': 'cicero_0.7.2.orig.tar.gz', 'sha256': '63f40f2436ea9f67b44e2d4bd669dbab...', 'size': 96527, 'uri': 'http://d.d.o/cicero_0.7.2.orig.tar.gz', } }, }, # ... } """ super().__init__(storage=storage, url=url, max_content_size=max_content_size) self.packages = packages def get_versions(self) -> Sequence[str]: """Returns the keys of the packages input (e.g. stretch/contrib/0.7.2-3, etc...) """ return list(self.packages.keys()) def get_package_info(self, version: str) -> Iterator[Tuple[str, DebianPackageInfo]]: meta = self.packages[version] p_info = DebianPackageInfo.from_metadata(meta, url=self.url, version=version) yield release_name(version), p_info def download_package( self, p_info: DebianPackageInfo, tmpdir: str ) -> List[Tuple[str, Mapping]]: """Contrary to other package loaders (1 package, 1 artifact), `p_info.files` represents the package's datafiles set to fetch: - .orig.tar.gz - .dsc - .diff.gz This is delegated to the `download_package` function. """ all_hashes = download_package(p_info, tmpdir) logger.debug("all_hashes: %s", all_hashes) res = [] for hashes in all_hashes.values(): res.append((tmpdir, hashes)) logger.debug("res: %s", res) return res def uncompress( self, dl_artifacts: List[Tuple[str, Mapping[str, Any]]], dest: str ) -> str: logger.debug("dl_artifacts: %s", dl_artifacts) return extract_package(dl_artifacts, dest=dest) def build_release( self, p_info: DebianPackageInfo, uncompressed_path: str, directory: Sha1Git, ) -> Optional[Release]: dsc_url, dsc_name = dsc_information(p_info) if not dsc_name: raise ValueError("dsc name for url %s should not be None" % dsc_url) dsc_path = path.join(path.dirname(uncompressed_path), dsc_name) intrinsic_metadata = get_intrinsic_package_metadata( p_info, dsc_path, uncompressed_path ) logger.debug("intrinsic_metadata: %s", intrinsic_metadata) logger.debug("p_info: %s", p_info) - msg = "Synthetic revision for Debian source package %s version %s" % ( - p_info.name, - p_info.full_version, + msg = ( + f"Synthetic release for Debian source package {p_info.name} " + f"version {p_info.full_version}" ) author = prepare_person(intrinsic_metadata.changelog.person) date = TimestampWithTimezone.from_iso8601(intrinsic_metadata.changelog.date) # inspired from swh.loader.debian.converters.package_metadata_to_revision return Release( name=p_info.version.encode(), - message=msg.encode("utf-8"), + message=msg.encode(), author=author, date=date, target=directory, target_type=ObjectType.DIRECTORY, synthetic=True, ) def uid_to_person(uid: str) -> Dict[str, str]: """Convert an uid to a person suitable for insertion. 
Args: uid: a uid of the form "Name <email@address>" Returns: a dictionary with the following keys: - name: the name associated with the uid - email: the mail associated with the uid - fullname: the actual uid input """ logger.debug("uid: %s", uid) ret = { "name": "", "email": "", "fullname": uid, } name, mail = email.utils.parseaddr(uid) if name and mail: # check the parsed address, not the email module ret["name"] = name ret["email"] = mail else: ret["name"] = uid return ret def prepare_person(person: Mapping[str, str]) -> Person: """Prepare person for swh serialization... Args: person: A person dict Returns: A person ready for storage """ return Person.from_dict( {key: value.encode("utf-8") for (key, value) in person.items()} ) def download_package(p_info: DebianPackageInfo, tmpdir: Any) -> Mapping[str, Any]: """Fetch a source package in a temporary directory and check the checksums for all files. Args: p_info: Information on a package tmpdir: Where to download and extract the files to ingest Returns: Dict of swh hashes per filename key """ all_hashes = {} for filename, fileinfo in p_info.files.items(): uri = fileinfo.uri logger.debug("fileinfo: %s", fileinfo) extrinsic_hashes = {"sha256": fileinfo.sha256} logger.debug("extrinsic_hashes(%s): %s", filename, extrinsic_hashes) filepath, hashes = download( uri, dest=tmpdir, filename=filename, hashes=extrinsic_hashes ) all_hashes[filename] = hashes logger.debug("all_hashes: %s", all_hashes) return all_hashes def dsc_information(p_info: DebianPackageInfo) -> Tuple[Optional[str], Optional[str]]: """Retrieve dsc information from a package. Args: p_info: Package metadata information Returns: Tuple of the dsc file's uri and name """ dsc_name = None dsc_url = None for filename, fileinfo in p_info.files.items(): if filename.endswith(".dsc"): if dsc_name: raise DscCountError( "Package %s_%s references several dsc files." % (p_info.name, p_info.version) ) dsc_url = fileinfo.uri dsc_name = filename return dsc_url, dsc_name def extract_package(dl_artifacts: List[Tuple[str, Mapping]], dest: str) -> str: """Extract a Debian source package to a given directory. Note that after extraction the target directory will be the root of the extracted package, rather than containing it. Args: dl_artifacts: list of (path, hashes) tuples for the downloaded package files dest: directory where the package files are stored Returns: Package extraction directory """ a_path = dl_artifacts[0][0] logger.debug("dl_artifacts: %s", dl_artifacts) for _, hashes in dl_artifacts: logger.debug("hashes: %s", hashes) filename = hashes["filename"] if filename.endswith(".dsc"): dsc_name = filename break dsc_path = path.join(a_path, dsc_name) destdir = path.join(dest, "extracted") logfile = path.join(dest, "extract.log") logger.debug( "extract Debian source package %s in %s" % (dsc_path, destdir), extra={"swh_type": "deb_extract", "swh_dsc": dsc_path, "swh_destdir": destdir,}, ) cmd = [ "dpkg-source", "--no-copy", "--no-check", "--ignore-bad-version", "-x", dsc_path, destdir, ] try: with open(logfile, "w") as stdout: subprocess.check_call(cmd, stdout=stdout, stderr=subprocess.STDOUT) except subprocess.CalledProcessError as e: logdata = open(logfile, "r").read() raise ValueError( "dpkg-source exited with code %s: %s" % (e.returncode, logdata) ) from None return destdir def get_intrinsic_package_metadata( p_info: DebianPackageInfo, dsc_path: str, extracted_path: str ) -> IntrinsicPackageMetadata: """Get the package metadata from the source package at dsc_path, extracted in extracted_path. 
Args: p_info: the package information dsc_path: path to the package's dsc file extracted_path: the path where the package got extracted Returns: IntrinsicPackageMetadata: the parsed package metadata, whose changelog history is a list of (package_name, package_version) tuples """ with open(dsc_path, "rb") as dsc: parsed_dsc = Dsc(dsc) # Parse the changelog to retrieve the rest of the package information changelog_path = path.join(extracted_path, "debian/changelog") with open(changelog_path, "rb") as changelog_file: try: parsed_changelog = Changelog(changelog_file) except UnicodeDecodeError: logger.warning( "Unknown encoding for changelog %s," " falling back to iso" % changelog_path, extra={ "swh_type": "deb_changelog_encoding", "swh_name": p_info.name, "swh_version": str(p_info.version), "swh_changelog": changelog_path, }, ) # need to reset as Changelog scrolls to the end of the file changelog_file.seek(0) parsed_changelog = Changelog(changelog_file, encoding="iso-8859-15") history: List[Tuple[str, str]] = [] for block in parsed_changelog: assert block.package is not None history.append((block.package, str(block.version))) changelog = DebianPackageChangelog( person=uid_to_person(parsed_changelog.author), date=parse_date(parsed_changelog.date).isoformat(), history=history[1:], ) maintainers = [ uid_to_person(parsed_dsc["Maintainer"]), ] maintainers.extend( uid_to_person(person) for person in UPLOADERS_SPLIT.split(parsed_dsc.get("Uploaders", "")) ) return IntrinsicPackageMetadata( name=p_info.name, version=str(p_info.version), changelog=changelog, maintainers=maintainers, ) diff --git a/swh/loader/package/debian/tests/test_debian.py b/swh/loader/package/debian/tests/test_debian.py index 13da1c0..4beded3 100644 --- a/swh/loader/package/debian/tests/test_debian.py +++ b/swh/loader/package/debian/tests/test_debian.py @@ -1,449 +1,478 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from copy import deepcopy import logging from os import path import pytest from swh.loader.package.debian.loader import ( DebianLoader, DebianPackageChangelog, DebianPackageInfo, IntrinsicPackageMetadata, download_package, dsc_information, extract_package, get_intrinsic_package_metadata, prepare_person, uid_to_person, ) from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats from swh.model.hashutil import hash_to_bytes -from swh.model.model import Person, Snapshot, SnapshotBranch, TargetType +from swh.model.model import ( + ObjectType, + Person, + Release, + Snapshot, + SnapshotBranch, + TargetType, + Timestamp, + TimestampWithTimezone, +) logger = logging.getLogger(__name__) URL = "deb://Debian/packages/cicero" PACKAGE_FILES = { "name": "cicero", "version": "0.7.2-3", "files": { "cicero_0.7.2-3.diff.gz": { "md5sum": "a93661b6a48db48d59ba7d26796fc9ce", "name": "cicero_0.7.2-3.diff.gz", "sha256": "f039c9642fe15c75bed5254315e2a29f9f2700da0e29d9b0729b3ffc46c8971c", # noqa "size": 3964, "uri": "http://deb.debian.org/debian/pool/contrib/c/cicero/cicero_0.7.2-3.diff.gz", # noqa }, "cicero_0.7.2-3.dsc": { "md5sum": "d5dac83eb9cfc9bb52a15eb618b4670a", "name": "cicero_0.7.2-3.dsc", "sha256": "35b7f1048010c67adfd8d70e4961aefd8800eb9a83a4d1cc68088da0009d9a03", # noqa "size": 1864, "uri": "http://deb.debian.org/debian/pool/contrib/c/cicero/cicero_0.7.2-3.dsc", # noqa }, # noqa 
"cicero_0.7.2.orig.tar.gz": { "md5sum": "4353dede07c5728319ba7f5595a7230a", "name": "cicero_0.7.2.orig.tar.gz", "sha256": "63f40f2436ea9f67b44e2d4bd669dbabe90e2635a204526c20e0b3c8ee957786", # noqa "size": 96527, "uri": "http://deb.debian.org/debian/pool/contrib/c/cicero/cicero_0.7.2.orig.tar.gz", # noqa }, }, } PACKAGE_FILES2 = { "name": "cicero", "version": "0.7.2-4", "files": { "cicero_0.7.2-4.diff.gz": { "md5sum": "1e7e6fc4a59d57c98082a3af78145734", "name": "cicero_0.7.2-4.diff.gz", "sha256": "2e6fa296ee7005473ff58d0971f4fd325617b445671480e9f2cfb738d5dbcd01", # noqa "size": 4038, "uri": "http://deb.debian.org/debian/pool/contrib/c/cicero/cicero_0.7.2-4.diff.gz", # noqa }, "cicero_0.7.2-4.dsc": { "md5sum": "1a6c8855a73b4282bb31d15518f18cde", "name": "cicero_0.7.2-4.dsc", "sha256": "913ee52f7093913420de5cbe95d63cfa817f1a1daf997961149501894e754f8b", # noqa "size": 1881, "uri": "http://deb.debian.org/debian/pool/contrib/c/cicero/cicero_0.7.2-4.dsc", # noqa }, # noqa "cicero_0.7.2.orig.tar.gz": { "md5sum": "4353dede07c5728319ba7f5595a7230a", "name": "cicero_0.7.2.orig.tar.gz", "sha256": "63f40f2436ea9f67b44e2d4bd669dbabe90e2635a204526c20e0b3c8ee957786", # noqa "size": 96527, "uri": "http://deb.debian.org/debian/pool/contrib/c/cicero/cicero_0.7.2.orig.tar.gz", # noqa }, }, } PACKAGE_PER_VERSION = { "stretch/contrib/0.7.2-3": PACKAGE_FILES, } PACKAGES_PER_VERSION = { "stretch/contrib/0.7.2-3": PACKAGE_FILES, "buster/contrib/0.7.2-4": PACKAGE_FILES2, } def test_debian_first_visit(swh_storage, requests_mock_datadir): """With no prior visit, load a gnu project ends up with 1 snapshot """ loader = DebianLoader( swh_storage, URL, date="2019-10-12T05:58:09.165557+00:00", packages=PACKAGE_PER_VERSION, ) actual_load_status = loader.load() - expected_snapshot_id = "8bc5d12e2443ab216fdd2f969b25b39e96c20fef" + expected_snapshot_id = "20073c91e85b8bcbd2639990e76765d25bd2c0a6" assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id, } assert_last_visit_matches( swh_storage, URL, status="full", type="deb", snapshot=hash_to_bytes(expected_snapshot_id), ) + release_id = hash_to_bytes("ed191d99e070a33458a4a402becd0b4bba09cd1e") + expected_snapshot = Snapshot( id=hash_to_bytes(expected_snapshot_id), branches={ b"releases/stretch/contrib/0.7.2-3": SnapshotBranch( - target_type=TargetType.RELEASE, - target=hash_to_bytes("5a99736512d381700c5f54d7fdd6b46e136535a2"), + target_type=TargetType.RELEASE, target=release_id, ) }, ) # different than the previous loader as no release is done check_snapshot(expected_snapshot, swh_storage) + assert swh_storage.release_get([release_id])[0] == Release( + id=release_id, + name=b"stretch/contrib/0.7.2-3", + message=b"Synthetic release for Debian source package cicero version 0.7.2-3", + target=hash_to_bytes("798df511408c53bf842a8e54d4d335537836bdc3"), + target_type=ObjectType.DIRECTORY, + synthetic=True, + author=Person( + fullname=b"Samuel Thibault ", + name=b"Samuel Thibault", + email=b"sthibault@debian.org", + ), + date=TimestampWithTimezone( + timestamp=Timestamp(seconds=1413730355, microseconds=0), + offset=120, + negative_utc=False, + ), + ) + stats = get_stats(swh_storage) assert { "content": 42, "directory": 2, "origin": 1, "origin_visit": 1, "release": 1, # all artifacts under 1 release "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats def test_debian_first_visit_then_another_visit(swh_storage, requests_mock_datadir): """With no prior visit, load a debian project ends up with 1 snapshot """ loader = DebianLoader( swh_storage, 
URL, date="2019-10-12T05:58:09.165557+00:00", packages=PACKAGE_PER_VERSION, ) actual_load_status = loader.load() - expected_snapshot_id = "8bc5d12e2443ab216fdd2f969b25b39e96c20fef" + expected_snapshot_id = "20073c91e85b8bcbd2639990e76765d25bd2c0a6" assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id, } assert_last_visit_matches( swh_storage, URL, status="full", type="deb", snapshot=hash_to_bytes(expected_snapshot_id), ) expected_snapshot = Snapshot( id=hash_to_bytes(expected_snapshot_id), branches={ b"releases/stretch/contrib/0.7.2-3": SnapshotBranch( target_type=TargetType.RELEASE, - target=hash_to_bytes("5a99736512d381700c5f54d7fdd6b46e136535a2"), + target=hash_to_bytes("ed191d99e070a33458a4a402becd0b4bba09cd1e"), ) }, ) # different than the previous loader as no release is done check_snapshot(expected_snapshot, swh_storage) stats = get_stats(swh_storage) assert { "content": 42, "directory": 2, "origin": 1, "origin_visit": 1, "release": 1, # all artifacts under 1 release "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats # No change in between load actual_load_status2 = loader.load() assert actual_load_status2["status"] == "uneventful" assert_last_visit_matches( swh_storage, URL, status="full", type="deb", snapshot=hash_to_bytes(expected_snapshot_id), ) stats2 = get_stats(swh_storage) assert { "content": 42 + 0, "directory": 2 + 0, "origin": 1, "origin_visit": 1 + 1, # a new visit occurred "release": 1, "revision": 0, "skipped_content": 0, "snapshot": 1, # same snapshot across 2 visits } == stats2 urls = [ m.url for m in requests_mock_datadir.request_history if m.url.startswith("http://deb.debian.org") ] # visited each package artifact twice across 2 visits assert len(urls) == len(set(urls)) def test_debian_uid_to_person(): uid = "Someone Name " actual_person = uid_to_person(uid) assert actual_person == { "name": "Someone Name", "email": "someone@orga.org", "fullname": uid, } def test_debian_prepare_person(): actual_author = prepare_person( { "name": "Someone Name", "email": "someone@orga.org", "fullname": "Someone Name ", } ) assert actual_author == Person( name=b"Someone Name", email=b"someone@orga.org", fullname=b"Someone Name ", ) def test_debian_download_package(datadir, tmpdir, requests_mock_datadir): tmpdir = str(tmpdir) # py3.5 work around (LocalPath issue) p_info = DebianPackageInfo.from_metadata(PACKAGE_FILES, url=URL, version="0.7.2-3") all_hashes = download_package(p_info, tmpdir) assert all_hashes == { "cicero_0.7.2-3.diff.gz": { "checksums": { "sha1": "0815282053f21601b0ec4adf7a8fe47eace3c0bc", "sha256": "f039c9642fe15c75bed5254315e2a29f9f2700da0e29d9b0729b3ffc46c8971c", # noqa }, "filename": "cicero_0.7.2-3.diff.gz", "length": 3964, "url": ( "http://deb.debian.org/debian/pool/contrib/c/cicero/" "cicero_0.7.2-3.diff.gz" ), }, "cicero_0.7.2-3.dsc": { "checksums": { "sha1": "abbec4e8efbbc80278236e1dd136831eac08accd", "sha256": "35b7f1048010c67adfd8d70e4961aefd8800eb9a83a4d1cc68088da0009d9a03", # noqa }, "filename": "cicero_0.7.2-3.dsc", "length": 1864, "url": ( "http://deb.debian.org/debian/pool/contrib/c/cicero/cicero_0.7.2-3.dsc" ), }, "cicero_0.7.2.orig.tar.gz": { "checksums": { "sha1": "a286efd63fe2c9c9f7bb30255c3d6fcdcf390b43", "sha256": "63f40f2436ea9f67b44e2d4bd669dbabe90e2635a204526c20e0b3c8ee957786", # noqa }, "filename": "cicero_0.7.2.orig.tar.gz", "length": 96527, "url": ( "http://deb.debian.org/debian/pool/contrib/c/cicero/" "cicero_0.7.2.orig.tar.gz" ), }, } def test_debian_dsc_information_ok(): fname = 
"cicero_0.7.2-3.dsc" p_info = DebianPackageInfo.from_metadata(PACKAGE_FILES, url=URL, version="0.7.2-3") dsc_url, dsc_name = dsc_information(p_info) assert dsc_url == PACKAGE_FILES["files"][fname]["uri"] assert dsc_name == PACKAGE_FILES["files"][fname]["name"] def test_debian_dsc_information_not_found(): fname = "cicero_0.7.2-3.dsc" p_info = DebianPackageInfo.from_metadata(PACKAGE_FILES, url=URL, version="0.7.2-3") p_info.files.pop(fname) dsc_url, dsc_name = dsc_information(p_info) assert dsc_url is None assert dsc_name is None def test_debian_dsc_information_missing_md5sum(): package_files = deepcopy(PACKAGE_FILES) for package_metadata in package_files["files"].values(): del package_metadata["md5sum"] p_info = DebianPackageInfo.from_metadata(package_files, url=URL, version="0.7.2-3") for debian_file_metadata in p_info.files.values(): assert not debian_file_metadata.md5sum def test_debian_dsc_information_too_many_dsc_entries(): # craft an extra dsc file fname = "cicero_0.7.2-3.dsc" p_info = DebianPackageInfo.from_metadata(PACKAGE_FILES, url=URL, version="0.7.2-3") data = p_info.files[fname] fname2 = fname.replace("cicero", "ciceroo") p_info.files[fname2] = data with pytest.raises( ValueError, match="Package %s_%s references several dsc" % (PACKAGE_FILES["name"], PACKAGE_FILES["version"]), ): dsc_information(p_info) def test_debian_get_intrinsic_package_metadata( requests_mock_datadir, datadir, tmp_path ): tmp_path = str(tmp_path) # py3.5 compat. p_info = DebianPackageInfo.from_metadata(PACKAGE_FILES, url=URL, version="0.7.2-3") logger.debug("p_info: %s", p_info) # download the packages all_hashes = download_package(p_info, tmp_path) # Retrieve information from package _, dsc_name = dsc_information(p_info) dl_artifacts = [(tmp_path, hashes) for hashes in all_hashes.values()] # Extract information from package extracted_path = extract_package(dl_artifacts, tmp_path) # Retrieve information on package dsc_path = path.join(path.dirname(extracted_path), dsc_name) actual_package_info = get_intrinsic_package_metadata( p_info, dsc_path, extracted_path ) logger.debug("actual_package_info: %s", actual_package_info) assert actual_package_info == IntrinsicPackageMetadata( changelog=DebianPackageChangelog( date="2014-10-19T16:52:35+02:00", history=[ ("cicero", "0.7.2-2"), ("cicero", "0.7.2-1"), ("cicero", "0.7-1"), ], person={ "email": "sthibault@debian.org", "fullname": "Samuel Thibault ", "name": "Samuel Thibault", }, ), maintainers=[ { "email": "debian-accessibility@lists.debian.org", "fullname": "Debian Accessibility Team " "", "name": "Debian Accessibility Team", }, { "email": "sthibault@debian.org", "fullname": "Samuel Thibault ", "name": "Samuel Thibault", }, ], name="cicero", version="0.7.2-3", ) def test_debian_multiple_packages(swh_storage, requests_mock_datadir): loader = DebianLoader( swh_storage, URL, date="2019-10-12T05:58:09.165557+00:00", packages=PACKAGES_PER_VERSION, ) actual_load_status = loader.load() - expected_snapshot_id = "3d26243c91eb084c350627a5a102cfe039c5b92a" + expected_snapshot_id = "3e423d7889ebd8df0ed0373016f035dfed8541cb" assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id, } assert_last_visit_matches( swh_storage, URL, status="full", type="deb", snapshot=hash_to_bytes(expected_snapshot_id), ) expected_snapshot = Snapshot( id=hash_to_bytes(expected_snapshot_id), branches={ b"releases/stretch/contrib/0.7.2-3": SnapshotBranch( target_type=TargetType.RELEASE, - target=hash_to_bytes("5a99736512d381700c5f54d7fdd6b46e136535a2"), + 
target=hash_to_bytes("ed191d99e070a33458a4a402becd0b4bba09cd1e"), ), b"releases/buster/contrib/0.7.2-4": SnapshotBranch( target_type=TargetType.RELEASE, - target=hash_to_bytes("192fc7ccce80f64a0d3cf33d379133af067ec721"), + target=hash_to_bytes("d3dff4a416816c36dc284e49c1c9eed52c2d2ef4"), ), }, ) check_snapshot(expected_snapshot, swh_storage) diff --git a/swh/loader/package/nixguix/loader.py b/swh/loader/package/nixguix/loader.py index 201ba3e..72ea922 100644 --- a/swh/loader/package/nixguix/loader.py +++ b/swh/loader/package/nixguix/loader.py @@ -1,282 +1,282 @@ # Copyright (C) 2020-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import copy import json import logging import re from typing import Any, Dict, Iterator, List, Mapping, Optional, Tuple import attr from swh.loader.package.loader import ( BasePackageInfo, PackageLoader, PartialExtID, RawExtrinsicMetadataCore, ) from swh.loader.package.utils import EMPTY_AUTHOR, api_info, cached_method from swh.model import hashutil from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, ObjectType, Release, Sha1Git, ) from swh.storage.interface import StorageInterface logger = logging.getLogger(__name__) EXTID_TYPE = "subresource-integrity" """The ExtID is an ASCII string, as defined by https://w3c.github.io/webappsec-subresource-integrity/""" @attr.s class NixGuixPackageInfo(BasePackageInfo): raw_info = attr.ib(type=Dict[str, Any]) integrity = attr.ib(type=str) """Hash of the archive, formatted as in the Subresource Integrity specification.""" @classmethod def from_metadata( cls, metadata: Dict[str, Any], version: str ) -> "NixGuixPackageInfo": return cls( url=metadata["url"], filename=None, version=version, integrity=metadata["integrity"], raw_info=metadata, ) def extid(self) -> PartialExtID: return (EXTID_TYPE, self.integrity.encode("ascii")) class NixGuixLoader(PackageLoader[NixGuixPackageInfo]): """Load sources from a sources.json file. This loader is used to load sources used by functional package manager (eg. Nix and Guix). """ visit_type = "nixguix" def __init__( self, storage: StorageInterface, url: str, unsupported_file_extensions: List[str] = [], max_content_size: Optional[int] = None, ): super().__init__(storage=storage, url=url, max_content_size=max_content_size) self.provider_url = url self.unsupported_file_extensions = unsupported_file_extensions # Note: this could be renamed get_artifacts in the PackageLoader # base class. @cached_method def raw_sources(self): return retrieve_sources(self.url) @cached_method def supported_sources(self): raw_sources = self.raw_sources() return clean_sources( parse_sources(raw_sources), self.unsupported_file_extensions ) @cached_method def integrity_by_url(self) -> Dict[str, str]: sources = self.supported_sources() return {s["urls"][0]: s["integrity"] for s in sources["sources"]} def get_versions(self) -> List[str]: """The first mirror of the mirror list is used as branch name in the snapshot. """ return list(self.integrity_by_url().keys()) def get_metadata_authority(self): return MetadataAuthority( type=MetadataAuthorityType.FORGE, url=self.url, metadata={}, ) def get_extrinsic_snapshot_metadata(self): return [ RawExtrinsicMetadataCore( format="nixguix-sources-json", metadata=self.raw_sources(), ), ] # Note: this could be renamed get_artifact_info in the PackageLoader # base class. 
def get_package_info(self, url) -> Iterator[Tuple[str, NixGuixPackageInfo]]: # TODO: try all mirrors and not only the first one. A source # can be fetched from several urls, called mirrors. We # currently only use the first one, but if the first one # fails, we should try the second one and so on. integrity = self.integrity_by_url()[url] p_info = NixGuixPackageInfo.from_metadata( {"url": url, "integrity": integrity}, version=url ) yield url, p_info def extra_branches(self) -> Dict[bytes, Mapping[str, Any]]: """We add a branch to the snapshot called 'evaluation' pointing to the revision used to generate the sources.json file. This revision is specified in the sources.json file itself. For the nixpkgs origin, this revision is coming from the github.com/nixos/nixpkgs repository. Note this repository is not loaded explicitly. So, this pointer can target a nonexistent revision for a time. However, the github and gnu loaders are supposed to load this revision and should create the revision pointed by this branch. This branch can be used to identify the snapshot associated to a Nix/Guix evaluation. """ # The revision used to create the sources.json file. For Nix, # this revision belongs to the github.com/nixos/nixpkgs # repository revision = self.supported_sources()["revision"] return { b"evaluation": { "target_type": "revision", "target": hashutil.hash_to_bytes(revision), } } def build_release( self, p_info: NixGuixPackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Release]: return Release( name=p_info.version.encode(), - message=b"", + message=None, author=EMPTY_AUTHOR, date=None, target=directory, target_type=ObjectType.DIRECTORY, synthetic=True, ) def retrieve_sources(url: str) -> bytes: """Retrieve sources. Potentially raise NotFound error.""" return api_info(url, allow_redirects=True) def parse_sources(raw_sources: bytes) -> Dict[str, Any]: return json.loads(raw_sources.decode("utf-8")) def make_pattern_unsupported_file_extension(unsupported_file_extensions: List[str],): """Make a regexp pattern for unsupported file extension out of a list of unsupported archive extension list. """ return re.compile( rf".*\.({'|'.join(map(re.escape, unsupported_file_extensions))})$", re.DOTALL ) def clean_sources( sources: Dict[str, Any], unsupported_file_extensions=[] ) -> Dict[str, Any]: """Validate and clean the sources structure. First, ensure all top level keys are present. Then, walk the sources list and remove sources that do not contain required keys. Filter out source entries whose: - required keys are missing - source type is not supported - urls attribute type is not a list - extension is known not to be supported by the loader Raises: ValueError if: - a required top level key is missing - top-level version is not 1 Returns: source Dict cleaned up """ pattern_unsupported_file = make_pattern_unsupported_file_extension( unsupported_file_extensions ) # Required top level keys required_keys = ["version", "revision", "sources"] missing_keys = [] for required_key in required_keys: if required_key not in sources: missing_keys.append(required_key) if missing_keys != []: raise ValueError( f"sources structure invalid, missing: {','.join(missing_keys)}" ) # Only the version 1 is currently supported version = int(sources["version"]) if version != 1: raise ValueError( f"The sources structure version '{sources['version']}' is not supported" ) # If a source doesn't contain required attributes, this source is # skipped but others could still be archived. 
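# For illustration only (hypothetical values): a minimal entry that survives all the checks below is {"type": "url", "urls": ["https://example.org/pkg-1.0.tar.gz"], "integrity": "sha256-..."}; entries missing a required key, with a non-"url" type, a non-list urls attribute, or urls referencing only unsupported file extensions are dropped.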
verified_sources = [] for source in sources["sources"]: valid = True required_keys = ["urls", "integrity", "type"] for required_key in required_keys: if required_key not in source: logger.info( f"Skip source '{source}' because key '{required_key}' is missing", ) valid = False if valid and source["type"] != "url": logger.info( f"Skip source '{source}' because the type {source['type']} " "is not supported", ) valid = False if valid and not isinstance(source["urls"], list): logger.info( f"Skip source {source} because the urls attribute is not a list" ) valid = False if valid and len(source["urls"]) > 0: # Filter out unsupported archives supported_sources: List[str] = [] for source_url in source["urls"]: if pattern_unsupported_file.match(source_url): logger.info(f"Skip unsupported artifact url {source_url}") continue supported_sources.append(source_url) if len(supported_sources) == 0: logger.info( f"Skip source {source} because urls only reference " "unsupported artifacts. Unsupported " f"artifacts so far: {pattern_unsupported_file}" ) continue new_source = copy.deepcopy(source) new_source["urls"] = supported_sources verified_sources.append(new_source) sources["sources"] = verified_sources return sources diff --git a/swh/loader/package/nixguix/tests/test_nixguix.py b/swh/loader/package/nixguix/tests/test_nixguix.py index 3f830b7..e8fd1f0 100644 --- a/swh/loader/package/nixguix/tests/test_nixguix.py +++ b/swh/loader/package/nixguix/tests/test_nixguix.py @@ -1,621 +1,655 @@ # Copyright (C) 2020-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import logging import os from typing import Dict, Optional, Tuple import pytest from swh.loader.package import __version__ from swh.loader.package.archive.loader import ArchiveLoader from swh.loader.package.nixguix.loader import ( NixGuixLoader, clean_sources, make_pattern_unsupported_file_extension, parse_sources, retrieve_sources, ) from swh.loader.package.utils import download from swh.loader.tests import assert_last_visit_matches from swh.loader.tests import check_snapshot as check_snapshot_full from swh.loader.tests import get_stats from swh.model.hashutil import hash_to_bytes from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, MetadataFetcher, + ObjectType, + Person, RawExtrinsicMetadata, + Release, Snapshot, SnapshotBranch, TargetType, ) from swh.model.swhids import ExtendedObjectType, ExtendedSWHID from swh.storage.algos.origin import origin_get_latest_visit_status from swh.storage.algos.snapshot import snapshot_get_all_branches from swh.storage.exc import HashCollision from swh.storage.interface import PagedResult, StorageInterface sources_url = "https://nix-community.github.io/nixpkgs-swh/sources.json" @pytest.fixture def raw_sources(datadir) -> bytes: with open( os.path.join( datadir, "https_nix-community.github.io", "nixpkgs-swh_sources.json" ), "rb", ) as f: return f.read() SNAPSHOT1 = Snapshot( - id=hash_to_bytes("771d13ae4e799755c22d1e05da8fc39cf215de58"), + id=hash_to_bytes("efe5145f85af3fc87f34102d8b8481cd5198f4f8"), branches={ b"evaluation": SnapshotBranch( target=hash_to_bytes("cc4e04c26672dd74e5fd0fecb78b435fb55368f7"), target_type=TargetType.REVISION, ), b"https://github.com/owner-1/repository-1/revision-1.tgz": SnapshotBranch( - target=hash_to_bytes("24853190589d26d0ea2b6c0330b553ff39176e0c"), + 
target=hash_to_bytes("df7811b9644ed8ef088e2e7add62ed32b0bab15f"), target_type=TargetType.RELEASE, ), }, ) def check_snapshot(snapshot: Snapshot, storage: StorageInterface): # The `evaluation` branch is allowed to be unresolvable. It's possible at current # nixguix visit time, it is not yet visited (the git loader is in charge of its # visit for now). For more details, check the # swh.loader.package.nixguix.NixGuixLoader.extra_branches docstring. check_snapshot_full( snapshot, storage, allowed_empty=[(TargetType.REVISION, b"evaluation")] ) assert isinstance(snapshot, Snapshot) # then ensure the snapshot revisions are structurally as expected revision_ids = [] for name, branch in snapshot.branches.items(): if name == b"evaluation": continue # skipping that particular branch (cf. previous comment) if branch.target_type == TargetType.REVISION: revision_ids.append(branch.target) revisions = storage.revision_get(revision_ids) for rev in revisions: assert rev is not None metadata = rev.metadata assert not metadata def test_retrieve_sources(swh_storage, requests_mock_datadir): j = parse_sources(retrieve_sources(sources_url)) assert "sources" in j.keys() assert len(j["sources"]) == 2 def test_nixguix_url_not_found(swh_storage, requests_mock_datadir): """When failing to read from the url, the visit is marked as not_found. Here the sources url does not exist, so requests_mock_datadir returns a 404. Resulting in a NotFound raised within the package loader's main loop. This results in the task with status failed and a visit_status with status "not_found". """ unknown_url = "https://non-existing-url/" loader = NixGuixLoader(swh_storage, unknown_url) # during the retrieval step load_status = loader.load() assert load_status == {"status": "failed"} assert_last_visit_matches( swh_storage, unknown_url, status="not_found", type="nixguix", snapshot=None ) assert len(requests_mock_datadir.request_history) == 1 assert requests_mock_datadir.request_history[0].url == unknown_url def test_nixguix_url_with_decoding_error(swh_storage, requests_mock_datadir): """Other errors during communication with the url, the visit is marked as failed requests_mock_datadir will intercept the requests to sources_url. Since the file exists, returns a 200 with the requested content of the query. As file.txt is no json, fails do decode and raises a JSONDecodeError. In effect failing the visit. 
""" sources_url = "https://example.com/file.txt" loader = NixGuixLoader(swh_storage, sources_url) load_status = loader.load() assert load_status == {"status": "failed"} assert_last_visit_matches( swh_storage, sources_url, status="failed", type="nixguix", snapshot=None ) assert len(requests_mock_datadir.request_history) == 1 assert requests_mock_datadir.request_history[0].url == sources_url def test_clean_sources_invalid_schema(swh_storage, requests_mock_datadir): sources = {} with pytest.raises(ValueError, match="sources structure invalid, missing: .*"): clean_sources(sources) def test_clean_sources_invalid_version(swh_storage, requests_mock_datadir): for version_ok in [1, "1"]: # Check those versions are fine clean_sources({"version": version_ok, "sources": [], "revision": "my-revision"}) for version_ko in [0, "0", 2, "2"]: # Check version != 1 raise an error with pytest.raises( ValueError, match="sources structure version .* is not supported" ): clean_sources( {"version": version_ko, "sources": [], "revision": "my-revision"} ) def test_clean_sources_invalid_sources(swh_storage, requests_mock_datadir): valid_sources = [ # 1 valid source {"type": "url", "urls": ["my-url.tar.gz"], "integrity": "my-integrity"}, ] sources = { "version": 1, "sources": valid_sources + [ # integrity is missing {"type": "url", "urls": ["my-url.tgz"],}, # urls is not a list {"type": "url", "urls": "my-url.zip", "integrity": "my-integrity"}, # type is not url {"type": "git", "urls": ["my-url.zip"], "integrity": "my-integrity"}, # missing fields which got double-checked nonetheless... {"integrity": "my-integrity"}, ], "revision": "my-revision", } clean = clean_sources(sources) assert len(clean["sources"]) == len(valid_sources) def test_make_pattern_unsupported_file_extension(): unsupported_extensions = ["el", "c", "txt"] supported_extensions = ["Z", "7z"] # for test actual_unsupported_pattern = make_pattern_unsupported_file_extension( unsupported_extensions ) for supported_ext in supported_extensions: assert supported_ext not in unsupported_extensions supported_filepath = f"anything.{supported_ext}" actual_match = actual_unsupported_pattern.match(supported_filepath) assert not actual_match for unsupported_ext in unsupported_extensions: unsupported_filepath = f"something.{unsupported_ext}" actual_match = actual_unsupported_pattern.match(unsupported_filepath) assert actual_match def test_clean_sources_unsupported_artifacts(swh_storage, requests_mock_datadir): unsupported_file_extensions = [ "iso", "whl", "gem", "pom", "msi", "pod", "png", "rock", "ttf", "jar", "c", "el", "rpm", "diff", "patch", ] supported_sources = [ { "type": "url", "urls": [f"https://server.org/my-url.{ext}"], "integrity": "my-integrity", } for ext in [ "known-unknown-but-ok", # this is fine as well with the current approach "zip", "tar.gz", "tgz", "tar.bz2", "tbz", "tbz2", "tar.xz", "tar", "zip", "7z", "Z", ] ] unsupported_sources = [ { "type": "url", "urls": [f"https://server.org/my-url.{ext}"], "integrity": "my-integrity", } for ext in unsupported_file_extensions ] sources = { "version": 1, "sources": supported_sources + unsupported_sources, "revision": "my-revision", } clean = clean_sources(sources, unsupported_file_extensions) assert len(clean["sources"]) == len(supported_sources) def test_loader_one_visit(swh_storage, requests_mock_datadir, raw_sources): loader = NixGuixLoader(swh_storage, sources_url) - res = loader.load() - assert res["status"] == "eventful" + load_status = loader.load() + expected_snapshot_id_hex = 
"efe5145f85af3fc87f34102d8b8481cd5198f4f8" + expected_snapshot_id = hash_to_bytes(expected_snapshot_id_hex) + assert load_status == { + "status": "eventful", + "snapshot_id": expected_snapshot_id_hex, + } + + release_id = hash_to_bytes("df7811b9644ed8ef088e2e7add62ed32b0bab15f") + expected_snapshot = Snapshot( + id=expected_snapshot_id, + branches={ + b"evaluation": SnapshotBranch( + target=hash_to_bytes("cc4e04c26672dd74e5fd0fecb78b435fb55368f7"), + target_type=TargetType.REVISION, + ), + b"https://github.com/owner-1/repository-1/revision-1.tgz": SnapshotBranch( + target=release_id, target_type=TargetType.RELEASE, + ), + }, + ) + check_snapshot(expected_snapshot, storage=swh_storage) + + assert swh_storage.release_get([release_id])[0] == Release( + id=release_id, + name=b"https://github.com/owner-1/repository-1/revision-1.tgz", + message=None, + target=hash_to_bytes("4de2e07d3742718d928e974b8a4c721b9f7b33bf"), + target_type=ObjectType.DIRECTORY, + synthetic=True, + author=Person.from_fullname(b""), + date=None, + ) stats = get_stats(swh_storage) assert { "content": 1, "directory": 3, "origin": 1, "origin_visit": 1, "release": 1, "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats # The visit is partial because urls pointing to non tarball file # are not handled yet assert_last_visit_matches( swh_storage, sources_url, status="partial", type="nixguix" ) visit_status = origin_get_latest_visit_status(swh_storage, sources_url) snapshot_swhid = ExtendedSWHID( object_type=ExtendedObjectType.SNAPSHOT, object_id=visit_status.snapshot ) metadata_authority = MetadataAuthority( type=MetadataAuthorityType.FORGE, url=sources_url, ) expected_metadata = [ RawExtrinsicMetadata( target=snapshot_swhid, authority=metadata_authority, fetcher=MetadataFetcher( name="swh.loader.package.nixguix.loader.NixGuixLoader", version=__version__, ), discovery_date=loader.visit_date, format="nixguix-sources-json", metadata=raw_sources, origin=sources_url, ) ] assert swh_storage.raw_extrinsic_metadata_get( snapshot_swhid, metadata_authority, ) == PagedResult(next_page_token=None, results=expected_metadata,) def test_uncompress_failure(swh_storage, requests_mock_datadir): """Non tarball files are currently not supported and the uncompress function fails on such kind of files. However, even in this case of failure (because of the url https://example.com/file.txt), a snapshot and a visit has to be created (with a status partial since all files are not archived). """ loader = NixGuixLoader(swh_storage, sources_url) loader_status = loader.load() sources = loader.supported_sources()["sources"] urls = [s["urls"][0] for s in sources] assert "https://example.com/file.txt" in urls assert loader_status["status"] == "eventful" # The visit is partial because urls pointing to non tarball files # are not handled yet assert_last_visit_matches( swh_storage, sources_url, status="partial", type="nixguix" ) def test_loader_incremental(swh_storage, requests_mock_datadir): """Ensure a second visit do not download artifact already downloaded by the previous visit. 
""" loader = NixGuixLoader(swh_storage, sources_url) load_status = loader.load() loader.load() assert load_status == {"status": "eventful", "snapshot_id": SNAPSHOT1.id.hex()} assert_last_visit_matches( swh_storage, sources_url, status="partial", type="nixguix", snapshot=SNAPSHOT1.id, ) check_snapshot(SNAPSHOT1, storage=swh_storage) urls = [ m.url for m in requests_mock_datadir.request_history if m.url == ("https://github.com/owner-1/repository-1/revision-1.tgz") ] # The artifact # 'https://github.com/owner-1/repository-1/revision-1.tgz' is only # visited one time assert len(urls) == 1 def test_loader_two_visits(swh_storage, requests_mock_datadir_visits): """To ensure there is only one origin, but two visits, two revisions and two snapshots are created. The first visit creates a snapshot containing one tarball. The second visit creates a snapshot containing the same tarball and another tarball. """ loader = NixGuixLoader(swh_storage, sources_url) load_status = loader.load() assert load_status == {"status": "eventful", "snapshot_id": SNAPSHOT1.id.hex()} assert_last_visit_matches( swh_storage, sources_url, status="partial", type="nixguix", snapshot=SNAPSHOT1.id, ) check_snapshot(SNAPSHOT1, storage=swh_storage) stats = get_stats(swh_storage) assert { "content": 1, "directory": 3, "origin": 1, "origin_visit": 1, "release": 1, "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats loader = NixGuixLoader(swh_storage, sources_url) load_status = loader.load() - expected_snapshot_id_hex = "c5bba84fd5ac3342566effb86190619092d34e79" + expected_snapshot_id_hex = "c1983a0a3f647548e1fb92f30339da6848fe9f7a" expected_snapshot_id = hash_to_bytes(expected_snapshot_id_hex) assert load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id_hex, } assert_last_visit_matches( swh_storage, sources_url, status="partial", type="nixguix", snapshot=expected_snapshot_id, ) # This ensures visits are incremental. Indeed, if we request a # second time an url, because of the requests_mock_datadir_visits # fixture, the file has to end with `_visit1`. expected_snapshot = Snapshot( id=expected_snapshot_id, branches={ b"evaluation": SnapshotBranch( target=hash_to_bytes("602140776b2ce6c9159bcf52ada73a297c063d5e"), target_type=TargetType.REVISION, ), b"https://github.com/owner-1/repository-1/revision-1.tgz": SnapshotBranch( - target=hash_to_bytes("24853190589d26d0ea2b6c0330b553ff39176e0c"), + target=hash_to_bytes("df7811b9644ed8ef088e2e7add62ed32b0bab15f"), target_type=TargetType.RELEASE, ), b"https://github.com/owner-2/repository-1/revision-1.tgz": SnapshotBranch( - target=hash_to_bytes("3d44fbe814ba802cfd77f83975e45766d3a2ba85"), + target=hash_to_bytes("5cc0115cd643902b837cb6cfbc9f5865bc5a7cb2"), target_type=TargetType.RELEASE, ), }, ) check_snapshot(expected_snapshot, storage=swh_storage) stats = get_stats(swh_storage) assert { "content": 2, "directory": 5, "origin": 1, "origin_visit": 2, "release": 2, "revision": 0, "skipped_content": 0, "snapshot": 2, } == stats def test_evaluation_branch(swh_storage, requests_mock_datadir): loader = NixGuixLoader(swh_storage, sources_url) res = loader.load() assert res["status"] == "eventful" assert_last_visit_matches( swh_storage, sources_url, status="partial", type="nixguix", snapshot=SNAPSHOT1.id, ) check_snapshot(SNAPSHOT1, storage=swh_storage) def test_eoferror(swh_storage, requests_mock_datadir): """Load a truncated archive which is invalid to make the uncompress function raising the exception EOFError. 
We then check that a snapshot is created, meaning this error is well managed. """ sources = ( "https://nix-community.github.io/nixpkgs-swh/sources-EOFError.json" # noqa ) loader = NixGuixLoader(swh_storage, sources) loader.load() expected_snapshot = Snapshot( id=hash_to_bytes("4257fa2350168c6bfec726a06452ea27a2c0cb33"), branches={ b"evaluation": SnapshotBranch( target=hash_to_bytes("cc4e04c26672dd74e5fd0fecb78b435fb55368f7"), target_type=TargetType.REVISION, ), }, ) check_snapshot(expected_snapshot, storage=swh_storage) def fake_download( url: str, dest: str, hashes: Dict = {}, filename: Optional[str] = None, auth: Optional[Tuple[str, str]] = None, ) -> Tuple[str, Dict]: """Fake download which raises HashCollision (for the sake of test simplicity, let's accept that it makes sense) For test purposes only. """ if url == "https://example.com/file.txt": # instead of failing because it's a file not dealt with by the nix guix # loader, make it raise a hash collision raise HashCollision("sha1", "f92d74e3874587aaf443d1db961d4e26dde13e9c", []) return download(url, dest, hashes, filename, auth) def test_raise_exception(swh_storage, requests_mock_datadir, mocker): mock_download = mocker.patch("swh.loader.package.loader.download") mock_download.side_effect = fake_download loader = NixGuixLoader(swh_storage, sources_url) res = loader.load() assert res == { "status": "eventful", "snapshot_id": SNAPSHOT1.id.hex(), } # The visit is partial because some artifact downloads failed assert_last_visit_matches( swh_storage, sources_url, status="partial", type="nixguix", snapshot=SNAPSHOT1.id, ) check_snapshot(SNAPSHOT1, storage=swh_storage) assert len(mock_download.mock_calls) == 2 def test_load_nixguix_one_common_artifact_from_other_loader( swh_storage, datadir, requests_mock_datadir_visits, caplog ): """A misformatted revision should be caught and logged; loading then continues """ caplog.set_level(logging.ERROR, "swh.loader.package.nixguix.loader") # 1. first ingest with, for example, the archive loader gnu_url = "https://ftp.gnu.org/gnu/8sync/" release = "0.1.0" artifact_url = f"https://ftp.gnu.org/gnu/8sync/8sync-{release}.tar.gz" gnu_artifacts = [ { "time": 944729610, "url": artifact_url, "length": 221837, "filename": f"8sync-{release}.tar.gz", "version": release, } ] archive_loader = ArchiveLoader(swh_storage, url=gnu_url, artifacts=gnu_artifacts) actual_load_status = archive_loader.load() - expected_snapshot_id = "cdf8f335fa0c81c8ad089870ec14f52b1980eb6c" + expected_snapshot_id = "af62f6f6d464f9b29f270d1bbefa355af38946c4" assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] == expected_snapshot_id # noqa assert_last_visit_matches( archive_loader.storage, gnu_url, status="full", type="tar", snapshot=hash_to_bytes(expected_snapshot_id), ) # 2.
Then ingest with the nixguix loader which lists the same artifact within its # sources.json # ensure test setup is ok data_sources = os.path.join( datadir, "https_nix-community.github.io", "nixpkgs-swh_sources_special.json" ) all_sources = json.loads(open(data_sources).read()) found = False for source in all_sources["sources"]: if source["urls"][0] == artifact_url: found = True assert ( found is True ), f"test setup error: {artifact_url} must be in {data_sources}" # first visit with a snapshot, ok sources_url = "https://nix-community.github.io/nixpkgs-swh/sources_special.json" loader = NixGuixLoader(swh_storage, sources_url) actual_load_status2 = loader.load() assert actual_load_status2["status"] == "eventful" snapshot_id = actual_load_status2["snapshot_id"] assert_last_visit_matches( swh_storage, sources_url, status="full", type="nixguix", snapshot=hash_to_bytes(snapshot_id), ) snapshot = snapshot_get_all_branches(swh_storage, hash_to_bytes(snapshot_id)) assert snapshot diff --git a/swh/loader/package/npm/loader.py b/swh/loader/package/npm/loader.py index 8964dd7..68e5f55 100644 --- a/swh/loader/package/npm/loader.py +++ b/swh/loader/package/npm/loader.py @@ -1,292 +1,295 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from codecs import BOM_UTF8 import json import logging import os from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple, Union from urllib.parse import quote import attr import chardet from swh.loader.package.loader import ( BasePackageInfo, PackageLoader, PartialExtID, RawExtrinsicMetadataCore, ) from swh.loader.package.utils import api_info, cached_method, release_name from swh.model.hashutil import hash_to_bytes from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, ObjectType, Person, Release, Sha1Git, TimestampWithTimezone, ) from swh.storage.interface import StorageInterface logger = logging.getLogger(__name__) EMPTY_PERSON = Person(fullname=b"", name=None, email=None) EXTID_TYPE = "npm-archive-sha1" @attr.s class NpmPackageInfo(BasePackageInfo): raw_info = attr.ib(type=Dict[str, Any]) date = attr.ib(type=Optional[str]) shasum = attr.ib(type=str) """sha1 checksum""" @classmethod def from_metadata( cls, project_metadata: Dict[str, Any], version: str ) -> "NpmPackageInfo": package_metadata = project_metadata["versions"][version] url = package_metadata["dist"]["tarball"] # No date available in intrinsic metadata: retrieve it from the API # metadata, using the version number that the API claims this package # has. extrinsic_version = package_metadata["version"] if "time" in project_metadata: date = project_metadata["time"][extrinsic_version] elif "mtime" in package_metadata: date = package_metadata["mtime"] else: date = None return cls( url=url, filename=os.path.basename(url), date=date, shasum=package_metadata["dist"]["shasum"], version=extrinsic_version, raw_info=package_metadata, directory_extrinsic_metadata=[ RawExtrinsicMetadataCore( format="replicate-npm-package-json", metadata=json.dumps(package_metadata).encode(), ) ], ) def extid(self) -> PartialExtID: return (EXTID_TYPE, hash_to_bytes(self.shasum)) class NpmLoader(PackageLoader[NpmPackageInfo]): """Load npm origin's artifact releases into swh archive. 
""" visit_type = "npm" def __init__( self, storage: StorageInterface, url: str, max_content_size: Optional[int] = None, ): """Constructor Args str: origin url (e.g. https://www.npmjs.com/package/) """ super().__init__(storage=storage, url=url, max_content_size=max_content_size) - package_name = url.split("https://www.npmjs.com/package/")[1] - safe_name = quote(package_name, safe="") + self.package_name = url.split("https://www.npmjs.com/package/")[1] + safe_name = quote(self.package_name, safe="") self.provider_url = f"https://replicate.npmjs.com/{safe_name}/" self._info: Dict[str, Any] = {} self._versions = None @cached_method def _raw_info(self) -> bytes: return api_info(self.provider_url) @cached_method def info(self) -> Dict: """Return the project metadata information (fetched from npm registry) """ return json.loads(self._raw_info()) def get_versions(self) -> Sequence[str]: return sorted(list(self.info()["versions"].keys())) def get_default_version(self) -> str: return self.info()["dist-tags"].get("latest", "") def get_metadata_authority(self): return MetadataAuthority( type=MetadataAuthorityType.FORGE, url="https://npmjs.com/", metadata={}, ) def get_package_info(self, version: str) -> Iterator[Tuple[str, NpmPackageInfo]]: p_info = NpmPackageInfo.from_metadata( project_metadata=self.info(), version=version ) yield release_name(version), p_info def build_release( self, p_info: NpmPackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Release]: i_metadata = extract_intrinsic_metadata(uncompressed_path) if not i_metadata: return None author = extract_npm_package_author(i_metadata) - message = i_metadata["version"].encode("ascii") + msg = ( + f"Synthetic release for NPM source package {self.package_name} " + f"version {p_info.version}" + ) if p_info.date is None: url = p_info.url artifact_name = os.path.basename(url) raise ValueError( "Origin %s: Cannot determine upload time for artifact %s." % (p_info.url, artifact_name) ) date = TimestampWithTimezone.from_iso8601(p_info.date) # FIXME: this is to remain bug-compatible with earlier versions: date = attr.evolve(date, timestamp=attr.evolve(date.timestamp, microseconds=0)) r = Release( name=p_info.version.encode(), - message=message, + message=msg.encode(), author=author, date=date, target=directory, target_type=ObjectType.DIRECTORY, synthetic=True, ) return r def _author_str(author_data: Union[Dict, List, str]) -> str: """Parse author from package.json author fields """ if isinstance(author_data, dict): author_str = "" name = author_data.get("name") if name is not None: if isinstance(name, str): author_str += name elif isinstance(name, list): author_str += _author_str(name[0]) if len(name) > 0 else "" email = author_data.get("email") if email is not None: author_str += f" <{email}>" result = author_str elif isinstance(author_data, list): result = _author_str(author_data[0]) if len(author_data) > 0 else "" else: result = author_data return result def extract_npm_package_author(package_json: Dict[str, Any]) -> Person: """ Extract package author from a ``package.json`` file content and return it in swh format. 
Args: package_json: Dict holding the content of parsed ``package.json`` file Returns: Person """ for author_key in ("author", "authors"): if author_key in package_json: author_data = package_json[author_key] if author_data is None: return EMPTY_PERSON author_str = _author_str(author_data) return Person.from_fullname(author_str.encode()) return EMPTY_PERSON def _lstrip_bom(s, bom=BOM_UTF8): if s.startswith(bom): return s[len(bom) :] else: return s def load_json(json_bytes): """ Try to load JSON from bytes and return a dictionary. First try to decode from utf-8. If the decoding failed, try to detect the encoding and decode again with replace error handling. If JSON is malformed, an empty dictionary will be returned. Args: json_bytes (bytes): binary content of a JSON file Returns: dict: JSON data loaded in a dictionary """ json_data = {} try: json_str = _lstrip_bom(json_bytes).decode("utf-8") except UnicodeDecodeError: encoding = chardet.detect(json_bytes)["encoding"] if encoding: json_str = json_bytes.decode(encoding, "replace") try: json_data = json.loads(json_str) except json.decoder.JSONDecodeError: pass return json_data def extract_intrinsic_metadata(dir_path: str) -> Dict: """Given an uncompressed path holding the pkginfo file, returns a pkginfo parsed structure as a dict. The release artifact contains at their root one folder. For example: $ tar tvf zprint-0.0.6.tar.gz drwxr-xr-x root/root 0 2018-08-22 11:01 zprint-0.0.6/ ... Args: dir_path (str): Path to the uncompressed directory representing a release artifact from npm. Returns: the pkginfo parsed structure as a dict if any or None if none was present. """ # Retrieve the root folder of the archive if not os.path.exists(dir_path): return {} lst = os.listdir(dir_path) if len(lst) == 0: return {} project_dirname = lst[0] package_json_path = os.path.join(dir_path, project_dirname, "package.json") if not os.path.exists(package_json_path): return {} with open(package_json_path, "rb") as package_json_file: package_json_bytes = package_json_file.read() return load_json(package_json_bytes) diff --git a/swh/loader/package/npm/tests/test_npm.py b/swh/loader/package/npm/tests/test_npm.py index 796f58c..c6f4df0 100644 --- a/swh/loader/package/npm/tests/test_npm.py +++ b/swh/loader/package/npm/tests/test_npm.py @@ -1,620 +1,644 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import os import pytest from swh.loader.package import __version__ from swh.loader.package.npm.loader import ( NpmLoader, _author_str, extract_npm_package_author, ) from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats from swh.model.hashutil import hash_to_bytes from swh.model.model import ( Person, RawExtrinsicMetadata, + Release, Snapshot, SnapshotBranch, TargetType, + Timestamp, + TimestampWithTimezone, ) from swh.model.model import MetadataAuthority, MetadataAuthorityType, MetadataFetcher from swh.model.model import ObjectType as ModelObjectType from swh.model.swhids import CoreSWHID, ExtendedObjectType, ExtendedSWHID, ObjectType from swh.storage.interface import PagedResult @pytest.fixture def org_api_info(datadir) -> bytes: with open(os.path.join(datadir, "https_replicate.npmjs.com", "org"), "rb",) as f: return f.read() def test_npm_author_str(): for author, expected_author in [ ("author", "author"), ( ["Al from quantum leap", 
"hal from 2001 space odyssey"], "Al from quantum leap", ), ([], ""), ({"name": "groot", "email": "groot@galaxy.org",}, "groot "), ({"name": "somebody",}, "somebody"), ({"email": "no@one.org"}, " "), # note first elt is an extra blank ({"name": "no one", "email": None,}, "no one"), ({"email": None,}, ""), ({"name": None}, ""), ({"name": None, "email": None,}, ""), ({}, ""), (None, None), ({"name": []}, "",), ( {"name": ["Susan McSween", "William H. Bonney", "Doc Scurlock",]}, "Susan McSween", ), (None, None), ]: assert _author_str(author) == expected_author def test_npm_extract_npm_package_author(datadir): package_metadata_filepath = os.path.join( datadir, "https_replicate.npmjs.com", "org_visit1" ) with open(package_metadata_filepath) as json_file: package_metadata = json.load(json_file) extract_npm_package_author(package_metadata["versions"]["0.0.2"]) == Person( fullname=b"mooz ", name=b"mooz", email=b"stillpedant@gmail.com", ) assert extract_npm_package_author(package_metadata["versions"]["0.0.3"]) == Person( fullname=b"Masafumi Oyamada ", name=b"Masafumi Oyamada", email=b"stillpedant@gmail.com", ) package_json = json.loads( """ { "name": "highlightjs-line-numbers.js", "version": "2.7.0", "description": "Highlight.js line numbers plugin.", "main": "src/highlightjs-line-numbers.js", "dependencies": {}, "devDependencies": { "gulp": "^4.0.0", "gulp-rename": "^1.4.0", "gulp-replace": "^0.6.1", "gulp-uglify": "^1.2.0" }, "repository": { "type": "git", "url": "https://github.com/wcoder/highlightjs-line-numbers.js.git" }, "author": "Yauheni Pakala ", "license": "MIT", "bugs": { "url": "https://github.com/wcoder/highlightjs-line-numbers.js/issues" }, "homepage": "http://wcoder.github.io/highlightjs-line-numbers.js/" }""" ) assert extract_npm_package_author(package_json) == Person( fullname=b"Yauheni Pakala ", name=b"Yauheni Pakala", email=b"evgeniy.pakalo@gmail.com", ) package_json = json.loads( """ { "name": "3-way-diff", "version": "0.0.1", "description": "3-way diffing of JavaScript objects", "main": "index.js", "authors": [ { "name": "Shawn Walsh", "url": "https://github.com/shawnpwalsh" }, { "name": "Markham F Rollins IV", "url": "https://github.com/mrollinsiv" } ], "keywords": [ "3-way diff", "3 way diff", "three-way diff", "three way diff" ], "devDependencies": { "babel-core": "^6.20.0", "babel-preset-es2015": "^6.18.0", "mocha": "^3.0.2" }, "dependencies": { "lodash": "^4.15.0" } }""" ) assert extract_npm_package_author(package_json) == Person( fullname=b"Shawn Walsh", name=b"Shawn Walsh", email=None ) package_json = json.loads( """ { "name": "yfe-ynpm", "version": "1.0.0", "homepage": "http://gitlab.ywwl.com/yfe/yfe-ynpm", "repository": { "type": "git", "url": "git@gitlab.ywwl.com:yfe/yfe-ynpm.git" }, "author": [ "fengmk2 (https://fengmk2.com)", "xufuzi (https://7993.org)" ], "license": "MIT" }""" ) assert extract_npm_package_author(package_json) == Person( fullname=b"fengmk2 (https://fengmk2.com)", name=b"fengmk2", email=b"fengmk2@gmail.com", ) package_json = json.loads( """ { "name": "umi-plugin-whale", "version": "0.0.8", "description": "Internal contract component", "authors": { "name": "xiaohuoni", "email": "448627663@qq.com" }, "repository": "alitajs/whale", "devDependencies": { "np": "^3.0.4", "umi-tools": "*" }, "license": "MIT" }""" ) assert extract_npm_package_author(package_json) == Person( fullname=b"xiaohuoni <448627663@qq.com>", name=b"xiaohuoni", email=b"448627663@qq.com", ) package_json_no_authors = json.loads( """{ "authors": null, "license": "MIT" }""" ) assert 
extract_npm_package_author(package_json_no_authors) == Person( fullname=b"", name=None, email=None ) def normalize_hashes(hashes): if isinstance(hashes, str): return hash_to_bytes(hashes) if isinstance(hashes, list): return [hash_to_bytes(x) for x in hashes] return {hash_to_bytes(k): hash_to_bytes(v) for k, v in hashes.items()} _expected_new_contents_first_visit = normalize_hashes( [ "4ce3058e16ab3d7e077f65aabf855c34895bf17c", "858c3ceee84c8311adc808f8cdb30d233ddc9d18", "0fa33b4f5a4e0496da6843a38ff1af8b61541996", "85a410f8ef8eb8920f2c384a9555566ad4a2e21b", "9163ac8025923d5a45aaac482262893955c9b37b", "692cf623b8dd2c5df2c2998fd95ae4ec99882fb4", "18c03aac6d3e910efb20039c15d70ab5e0297101", "41265c42446aac17ca769e67d1704f99e5a1394d", "783ff33f5882813dca9239452c4a7cadd4dba778", "b029cfb85107aee4590c2434a3329bfcf36f8fa1", "112d1900b4c2e3e9351050d1b542c9744f9793f3", "5439bbc4bd9a996f1a38244e6892b71850bc98fd", "d83097a2f994b503185adf4e719d154123150159", "d0939b4898e83090ee55fd9d8a60e312cfadfbaf", "b3523a26f7147e4af40d9d462adaae6d49eda13e", "cd065fb435d6fb204a8871bcd623d0d0e673088c", "2854a40855ad839a54f4b08f5cff0cf52fca4399", "b8a53bbaac34ebb8c6169d11a4b9f13b05c583fe", "0f73d56e1cf480bded8a1ecf20ec6fc53c574713", "0d9882b2dfafdce31f4e77fe307d41a44a74cefe", "585fc5caab9ead178a327d3660d35851db713df1", "e8cd41a48d79101977e3036a87aeb1aac730686f", "5414efaef33cceb9f3c9eb5c4cc1682cd62d14f7", "9c3cc2763bf9e9e37067d3607302c4776502df98", "3649a68410e354c83cd4a38b66bd314de4c8f5c9", "e96ed0c091de1ebdf587104eaf63400d1974a1fe", "078ca03d2f99e4e6eab16f7b75fbb7afb699c86c", "38de737da99514de6559ff163c988198bc91367a", ] ) _expected_new_directories_first_visit = normalize_hashes( [ "3370d20d6f96dc1c9e50f083e2134881db110f4f", "42753c0c2ab00c4501b552ac4671c68f3cf5aece", "d7895533ef5edbcffdea3f057d9fef3a1ef845ce", "80579be563e2ef3e385226fe7a3f079b377f142c", "3b0ddc6a9e58b4b53c222da4e27b280b6cda591c", "bcad03ce58ac136f26f000990fc9064e559fe1c0", "5fc7e82a1bc72e074665c6078c6d3fad2f13d7ca", "e3cd26beba9b1e02f6762ef54bd9ac80cc5f25fd", "584b5b4b6cf7f038095e820b99386a9c232de931", "184c8d6d0d242f2b1792ef9d3bf396a5434b7f7a", "bb5f4ee143c970367eb409f2e4c1104898048b9d", "1b95491047add1103db0dfdfa84a9735dcb11e88", "a00c6de13471a2d66e64aca140ddb21ef5521e62", "5ce6c1cd5cda2d546db513aaad8c72a44c7771e2", "c337091e349b6ac10d38a49cdf8c2401ef9bb0f2", "202fafcd7c0f8230e89d5496ad7f44ab12b807bf", "775cc516543be86c15c1dc172f49c0d4e6e78235", "ff3d1ead85a14f891e8b3fa3a89de39db1b8de2e", ] ) _expected_new_releases_first_visit = normalize_hashes( { - "d25e722a32c145b3eb88b416049dd35d27759a87": ( + "adcc40ee87a3ebb1b5a82edd692cf52aa5099cee": ( "42753c0c2ab00c4501b552ac4671c68f3cf5aece" ), - "3522e846b97c0b8434c565fe891c0f082a357e5d": ( + "c781147df0e4963a0f9859134abd28296b702233": ( "3370d20d6f96dc1c9e50f083e2134881db110f4f" ), - "54f6c1711c6aedb6de3cf2d6347b9f772e343784": ( + "f544812dac98e7589155be7dfaef64477a408ec0": ( "d7895533ef5edbcffdea3f057d9fef3a1ef845ce" ), } ) def package_url(package): return "https://www.npmjs.com/package/%s" % package def package_metadata_url(package): return "https://replicate.npmjs.com/%s/" % package def test_npm_loader_first_visit(swh_storage, requests_mock_datadir, org_api_info): package = "org" url = package_url(package) loader = NpmLoader(swh_storage, url) actual_load_status = loader.load() - expected_snapshot_id = hash_to_bytes("ddaad89b0b4edb7eefe7c92e9b1166caa776ebbc") + expected_snapshot_id = hash_to_bytes("d24e3f10492ade1e9462ec701370fef4a79a40f1") assert actual_load_status == { "status": "eventful", 
"snapshot_id": expected_snapshot_id.hex(), } assert_last_visit_matches( swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id ) versions = [ - ("0.0.2", "d25e722a32c145b3eb88b416049dd35d27759a87"), - ("0.0.3", "3522e846b97c0b8434c565fe891c0f082a357e5d"), - ("0.0.4", "54f6c1711c6aedb6de3cf2d6347b9f772e343784"), + ("0.0.2", "adcc40ee87a3ebb1b5a82edd692cf52aa5099cee"), + ("0.0.3", "c781147df0e4963a0f9859134abd28296b702233"), + ("0.0.4", "f544812dac98e7589155be7dfaef64477a408ec0"), ] expected_snapshot = Snapshot( id=expected_snapshot_id, branches={ b"HEAD": SnapshotBranch( target=b"releases/0.0.4", target_type=TargetType.ALIAS ), **{ b"releases/" + version_name.encode(): SnapshotBranch( target=hash_to_bytes(version_id), target_type=TargetType.RELEASE, ) for (version_name, version_id) in versions }, }, ) check_snapshot(expected_snapshot, swh_storage) + assert swh_storage.release_get( + [hash_to_bytes("adcc40ee87a3ebb1b5a82edd692cf52aa5099cee")] + )[0] == Release( + name=b"0.0.2", + message=b"Synthetic release for NPM source package org version 0.0.2", + target=hash_to_bytes("42753c0c2ab00c4501b552ac4671c68f3cf5aece"), + target_type=ModelObjectType.DIRECTORY, + synthetic=True, + author=Person( + fullname=b"mooz ", + name=b"mooz", + email=b"stillpedant@gmail.com", + ), + date=TimestampWithTimezone( + timestamp=Timestamp(seconds=1388590833, microseconds=0), + offset=0, + negative_utc=False, + ), + id=hash_to_bytes("adcc40ee87a3ebb1b5a82edd692cf52aa5099cee"), + ) + contents = swh_storage.content_get(_expected_new_contents_first_visit) count = sum(0 if content is None else 1 for content in contents) assert count == len(_expected_new_contents_first_visit) assert ( list(swh_storage.directory_missing(_expected_new_directories_first_visit)) == [] ) assert list(swh_storage.release_missing(_expected_new_releases_first_visit)) == [] metadata_authority = MetadataAuthority( type=MetadataAuthorityType.FORGE, url="https://npmjs.com/", ) for (version_name, release_id) in versions: release = swh_storage.release_get([hash_to_bytes(release_id)])[0] assert release.target_type == ModelObjectType.DIRECTORY directory_id = release.target directory_swhid = ExtendedSWHID( object_type=ExtendedObjectType.DIRECTORY, object_id=directory_id, ) release_swhid = CoreSWHID( object_type=ObjectType.RELEASE, object_id=hash_to_bytes(release_id), ) expected_metadata = [ RawExtrinsicMetadata( target=directory_swhid, authority=metadata_authority, fetcher=MetadataFetcher( name="swh.loader.package.npm.loader.NpmLoader", version=__version__, ), discovery_date=loader.visit_date, format="replicate-npm-package-json", metadata=json.dumps( json.loads(org_api_info)["versions"][version_name] ).encode(), origin="https://www.npmjs.com/package/org", release=release_swhid, ) ] assert swh_storage.raw_extrinsic_metadata_get( directory_swhid, metadata_authority, ) == PagedResult(next_page_token=None, results=expected_metadata,) stats = get_stats(swh_storage) assert { "content": len(_expected_new_contents_first_visit), "directory": len(_expected_new_directories_first_visit), "origin": 1, "origin_visit": 1, "release": len(_expected_new_releases_first_visit), "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats def test_npm_loader_incremental_visit(swh_storage, requests_mock_datadir_visits): package = "org" url = package_url(package) loader = NpmLoader(swh_storage, url) - expected_snapshot_id = hash_to_bytes("ddaad89b0b4edb7eefe7c92e9b1166caa776ebbc") + expected_snapshot_id = 
hash_to_bytes("d24e3f10492ade1e9462ec701370fef4a79a40f1") actual_load_status = loader.load() assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id.hex(), } assert_last_visit_matches( swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id ) stats = get_stats(swh_storage) assert { "content": len(_expected_new_contents_first_visit), "directory": len(_expected_new_directories_first_visit), "origin": 1, "origin_visit": 1, "release": len(_expected_new_releases_first_visit), "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats # reset loader internal state del loader._cached_info del loader._cached__raw_info actual_load_status2 = loader.load() assert actual_load_status2["status"] == "eventful" snap_id2 = actual_load_status2["snapshot_id"] assert snap_id2 is not None assert snap_id2 != actual_load_status["snapshot_id"] assert_last_visit_matches(swh_storage, url, status="full", type="npm") stats = get_stats(swh_storage) assert { # 3 new releases artifacts "content": len(_expected_new_contents_first_visit) + 14, "directory": len(_expected_new_directories_first_visit) + 15, "origin": 1, "origin_visit": 2, "release": len(_expected_new_releases_first_visit) + 3, "revision": 0, "skipped_content": 0, "snapshot": 2, } == stats urls = [ m.url for m in requests_mock_datadir_visits.request_history if m.url.startswith("https://registry.npmjs.org") ] assert len(urls) == len(set(urls)) # we visited each artifact once across @pytest.mark.usefixtures("requests_mock_datadir") def test_npm_loader_version_divergence(swh_storage): package = "@aller_shared" url = package_url(package) loader = NpmLoader(swh_storage, url) actual_load_status = loader.load() - expected_snapshot_id = hash_to_bytes("7a89bc3cb51ff1d3213b2151c745d82c3b9d69b1") + expected_snapshot_id = hash_to_bytes("92ff37da8045f0088ed35bce0bc34e2025202825") assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id.hex(), } assert_last_visit_matches( swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id ) expected_snapshot = Snapshot( id=expected_snapshot_id, branches={ b"HEAD": SnapshotBranch( target_type=TargetType.ALIAS, target=b"releases/0.1.0" ), b"releases/0.1.0": SnapshotBranch( target_type=TargetType.RELEASE, - target=hash_to_bytes("103fa6d0a1abb405468e3590dcf634bcb77f67be"), + target=hash_to_bytes("c5e0f0e185660b6bdd694ca5c68babe5bab20e24"), ), b"releases/0.1.1-alpha.14": SnapshotBranch( target_type=TargetType.RELEASE, - target=hash_to_bytes("c00b54143582a4e963e0b86e8dfa58eedd260020"), + target=hash_to_bytes("2f89c709eacc974b587e13f90d10a826b23a550e"), ), }, ) check_snapshot(expected_snapshot, swh_storage) stats = get_stats(swh_storage) assert { # 1 new releases artifacts "content": 534, "directory": 153, "origin": 1, "origin_visit": 1, "release": 2, "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats def test_npm_artifact_with_no_intrinsic_metadata(swh_storage, requests_mock_datadir): """Skip artifact with no intrinsic metadata during ingestion """ package = "nativescript-telerik-analytics" url = package_url(package) loader = NpmLoader(swh_storage, url) actual_load_status = loader.load() # no branch as one artifact without any intrinsic metadata expected_snapshot = Snapshot( id=hash_to_bytes("1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"), branches={}, ) assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot.id.hex(), } assert_last_visit_matches( swh_storage, url, status="full", type="npm", 
snapshot=expected_snapshot.id ) check_snapshot(expected_snapshot, swh_storage) def test_npm_artifact_with_no_upload_time(swh_storage, requests_mock_datadir): """With no upload time, the artifact is skipped """ package = "jammit-no-time" url = package_url(package) loader = NpmLoader(swh_storage, url) actual_load_status = loader.load() # no branch as the single artifact has no upload time expected_snapshot = Snapshot( id=hash_to_bytes("1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"), branches={}, ) assert actual_load_status == { "status": "uneventful", "snapshot_id": expected_snapshot.id.hex(), } assert_last_visit_matches( swh_storage, url, status="partial", type="npm", snapshot=expected_snapshot.id ) check_snapshot(expected_snapshot, swh_storage) def test_npm_artifact_use_mtime_if_no_time(swh_storage, requests_mock_datadir): """With no upload time, the artifact's modification time (mtime) is used instead """ package = "jammit-express" url = package_url(package) loader = NpmLoader(swh_storage, url) actual_load_status = loader.load() - expected_snapshot_id = hash_to_bytes("7f5e591dd3c4754abca4db1cc18355671e2c014c") + expected_snapshot_id = hash_to_bytes("2a7a67725f9c7134f56612281e8d1638f1386118") assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id.hex(), } # artifact is used expected_snapshot = Snapshot( id=expected_snapshot_id, branches={ b"HEAD": SnapshotBranch( target_type=TargetType.ALIAS, target=b"releases/0.0.1" ), b"releases/0.0.1": SnapshotBranch( target_type=TargetType.RELEASE, - target=hash_to_bytes("199bf0ad020617357d608655e6549e526a65dc36"), + target=hash_to_bytes("68b2a100103cecec06b8dd780228bb751f2dc6f3"), ), }, ) assert_last_visit_matches( swh_storage, url, status="full", type="npm", snapshot=expected_snapshot.id ) check_snapshot(expected_snapshot, swh_storage) def test_npm_no_artifact(swh_storage, requests_mock_datadir): """If no artifacts at all are found for the origin, the visit fails completely """ package = "catify" url = package_url(package) loader = NpmLoader(swh_storage, url) actual_load_status = loader.load() assert actual_load_status == { "status": "failed", } assert_last_visit_matches(swh_storage, url, status="failed", type="npm") def test_npm_origin_not_found(swh_storage, requests_mock_datadir): url = package_url("non-existent-url") loader = NpmLoader(swh_storage, url) assert loader.load() == {"status": "failed"} assert_last_visit_matches( swh_storage, url, status="not_found", type="npm", snapshot=None ) diff --git a/swh/loader/package/opam/loader.py b/swh/loader/package/opam/loader.py index 9c47603..ec2b119 100644 --- a/swh/loader/package/opam/loader.py +++ b/swh/loader/package/opam/loader.py @@ -1,255 +1,259 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import io import os from subprocess import PIPE, Popen, call from typing import Iterator, List, Optional, Tuple import attr from swh.loader.package.loader import ( BasePackageInfo, PackageLoader, RawExtrinsicMetadataCore, ) from swh.loader.package.utils import cached_method from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, ObjectType, Person, Release, Sha1Git, ) from swh.storage.interface import StorageInterface @attr.s class OpamPackageInfo(BasePackageInfo): author = attr.ib(type=Person) committer = attr.ib(type=Person) def opam_read( cmd: List[str], init_error_msg_if_any: Optional[str] = None ) ->
Optional[str]: """This executes an opam command and returns the first line of the output. Args: cmd: Opam command to execute as a list of strings init_error_msg_if_any: Error message to raise in case a problem occurs during initialization Raises: ValueError with the init_error_msg_if_any content if stdout cannot be consumed and that message is provided with a non-empty value. Returns: the first line of the executed command output """ with Popen(cmd, stdout=PIPE) as proc: if proc.stdout is not None: for line in io.TextIOWrapper(proc.stdout): # only care about the first line of output (mostly blank-separated # values; callers will deal with parsing the line) return line elif init_error_msg_if_any: raise ValueError(init_error_msg_if_any) return None class OpamLoader(PackageLoader[OpamPackageInfo]): """Load all versions of a given package in a given opam repository. The state of the opam repository is stored in a directory called an opam root. This folder is a requisite for the opam binary to actually list information on packages. When initialize_opam_root is False (the default for production workers), the opam root must already have been configured outside of the loading process. If it has not been, an error is raised, thus failing the loading. For standalone workers, initialize_opam_root must be set to True, so the ingestion can take care of installing the required opam root properly. The remaining ingestion uses the opam binary to give the versions of the given package. Then, for each version, the loader uses the opam binary to list the tarball url to fetch and ingest. """ visit_type = "opam" def __init__( self, storage: StorageInterface, url: str, opam_root: str, opam_instance: str, opam_url: str, opam_package: str, max_content_size: Optional[int] = None, initialize_opam_root: bool = False, ): super().__init__(storage=storage, url=url, max_content_size=max_content_size) self.opam_root = opam_root self.opam_instance = opam_instance self.opam_url = opam_url self.opam_package = opam_package self.initialize_opam_root = initialize_opam_root def get_package_dir(self) -> str: return ( f"{self.opam_root}/repo/{self.opam_instance}/packages/{self.opam_package}" ) def get_package_name(self, version: str) -> str: return f"{self.opam_package}.{version}" def get_package_file(self, version: str) -> str: return f"{self.get_package_dir()}/{self.get_package_name(version)}/opam" def get_metadata_authority(self): return MetadataAuthority(type=MetadataAuthorityType.FORGE, url=self.opam_url) @cached_method def _compute_versions(self) -> List[str]: """Compute the versions using opam internals Raises: ValueError in case the loader is not able to determine the list of versions Returns: The list of versions for the package """ # TODO: use `opam show` instead of this workaround when it supports the `--repo` # flag package_dir = self.get_package_dir() if not os.path.exists(package_dir): raise ValueError( f"can't get versions for package {self.opam_package} " f"(at url {self.url})." ) versions = [ ".".join(version.split(".")[1:]) for version in os.listdir(package_dir) ] if not versions: raise ValueError( f"can't get versions for package {self.opam_package} " f"(at url {self.url})" ) versions.sort() return versions def get_versions(self) -> List[str]: """First initialize the opam root directory if needed, then list the package versions. Raises: ValueError in case the loader is not able to determine the list of versions or if the opam root directory is invalid.
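For example, a package directory listing the subdirectories ``agrid.0.1`` and ``agrid.0.2`` yields the versions ``["0.1", "0.2"]``.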
""" if self.initialize_opam_root: # for standalone loader (e.g docker), loader must initialize the opam root # folder call( [ "opam", "init", "--reinit", "--bare", "--no-setup", "--root", self.opam_root, self.opam_instance, self.opam_url, ] ) else: # for standard/production loaders, no need to initialize the opam root # folder. It must be present though so check for it, if not present, raise if not os.path.isfile(os.path.join(self.opam_root, "config")): # so if not correctly setup, raise immediately raise ValueError("Invalid opam root") return self._compute_versions() def get_default_version(self) -> str: """Return the most recent version of the package as default.""" return self._compute_versions()[-1] def _opam_show_args(self, version: str): package_file = self.get_package_file(version) return [ "opam", "show", "--color", "never", "--safe", "--normalise", "--root", self.opam_root, "--file", package_file, ] def get_enclosed_single_line_field(self, field, version) -> Optional[str]: result = opam_read(self._opam_show_args(version) + ["--field", field]) # Sanitize the result if any (remove trailing \n and enclosing ") return result.strip().strip('"') if result else None def get_package_info(self, version: str) -> Iterator[Tuple[str, OpamPackageInfo]]: url = self.get_enclosed_single_line_field("url.src:", version) if url is None: raise ValueError( f"can't get field url.src: for version {version} of package {self.opam_package} \ (at url {self.url}) from `opam show`" ) authors_field = self.get_enclosed_single_line_field("authors:", version) fullname = b"" if authors_field is None else str.encode(authors_field) author = Person(fullname=fullname, name=None, email=None) maintainer_field = self.get_enclosed_single_line_field("maintainer:", version) fullname = b"" if maintainer_field is None else str.encode(maintainer_field) committer = Person(fullname=fullname, name=None, email=None) with Popen(self._opam_show_args(version) + ["--raw"], stdout=PIPE) as proc: assert proc.stdout is not None metadata = proc.stdout.read() yield self.get_package_name(version), OpamPackageInfo( url=url, filename=None, author=author, committer=committer, version=version, directory_extrinsic_metadata=[ RawExtrinsicMetadataCore( metadata=metadata, format="opam-package-definition", ) ], ) def build_release( self, p_info: OpamPackageInfo, uncompressed_path: str, directory: Sha1Git, ) -> Optional[Release]: + msg = ( + f"Synthetic release for OPAM source package {self.opam_package} " + f"version {p_info.version}" + ) return Release( name=p_info.version.encode(), author=p_info.author, - message=str.encode(p_info.version), + message=msg.encode(), date=None, target=directory, target_type=ObjectType.DIRECTORY, synthetic=True, ) diff --git a/swh/loader/package/opam/tests/test_opam.py b/swh/loader/package/opam/tests/test_opam.py index 15bf035..ba30023 100644 --- a/swh/loader/package/opam/tests/test_opam.py +++ b/swh/loader/package/opam/tests/test_opam.py @@ -1,347 +1,361 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.loader.package import __version__ from swh.loader.package.loader import RawExtrinsicMetadataCore from swh.loader.package.opam.loader import OpamLoader, OpamPackageInfo from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats from swh.model.hashutil import hash_to_bytes from 
swh.model.model import ( - MetadataAuthority, - MetadataAuthorityType, - MetadataFetcher, Person, RawExtrinsicMetadata, + Release, Snapshot, SnapshotBranch, TargetType, ) +from swh.model.model import MetadataAuthority, MetadataAuthorityType, MetadataFetcher +from swh.model.model import ObjectType as ModelObjectType from swh.model.swhids import CoreSWHID, ExtendedObjectType, ExtendedSWHID, ObjectType from swh.storage.interface import PagedResult OCB_METADATA = b"""\ opam-version: "2.0" name: "ocb" version: "0.1" synopsis: "SVG badge generator" description: "An OCaml library for SVG badge generation. There\'s also a command-line tool provided." maintainer: "OCamlPro <contact@ocamlpro.com>" authors: "OCamlPro <contact@ocamlpro.com>" license: "ISC" homepage: "https://ocamlpro.github.io/ocb/" doc: "https://ocamlpro.github.io/ocb/api/" bug-reports: "https://github.com/OCamlPro/ocb/issues" depends: [ "ocaml" {>= "4.05"} "dune" {>= "2.0"} "odoc" {with-doc} ] build: [ ["dune" "subst"] {dev} [ "dune" "build" "-p" name "-j" jobs "@install" "@runtest" {with-test} "@doc" {with-doc} ] ] dev-repo: "git+https://github.com/OCamlPro/ocb.git" url { src: "https://github.com/OCamlPro/ocb/archive/0.1.tar.gz" checksum: [ "sha256=aa27684fbda1b8036ae7e3c87de33a98a9cd2662bcc91c8447e00e41476b6a46" "sha512=1260344f184dd8c8074b0439dbcc8a5d59550a654c249cd61913d4c150c664f37b76195ddca38f7f6646d08bddb320ceb8d420508450b4f09a233cd5c22e6b9b" ] } """ # noqa def test_opam_loader_no_opam_repository_fails(swh_storage, tmpdir, datadir): """Running opam loader without a prepared opam repository fails""" opam_url = f"file://{datadir}/fake_opam_repo" opam_root = tmpdir opam_instance = "loadertest" opam_package = "agrid" url = f"opam+{opam_url}/packages/{opam_package}" loader = OpamLoader( swh_storage, url, opam_root, opam_instance, opam_url, opam_package, initialize_opam_root=False, # The opam directory must be present ) # The loader does not initialize the opam root directory itself. So, as the opam # root does not exist, the loading fails. That's the expected use for the # production workers (whose opam_root maintenance will be externally managed).
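# The load below is thus expected to fail cleanly, reporting a "failed" status rather than raising.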
actual_load_status = loader.load() assert actual_load_status == {"status": "failed"} def test_opam_loader_one_version(tmpdir, requests_mock_datadir, datadir, swh_storage): opam_url = f"file://{datadir}/fake_opam_repo" opam_root = tmpdir opam_instance = "loadertest" opam_package = "agrid" url = f"opam+{opam_url}/packages/{opam_package}" loader = OpamLoader( swh_storage, url, opam_root, opam_instance, opam_url, opam_package, initialize_opam_root=True, ) actual_load_status = loader.load() - expected_snapshot_id = hash_to_bytes("50b5961c27dd4f8b138acce8bac4f90d1e33081f") + expected_snapshot_id = hash_to_bytes("e480958fa7851268be2bcc8d01145c0c9624b34b") assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id.hex(), } + release_id = hash_to_bytes("03db7f0d572509f1c7ce18c847db83070e26fd5e") + expected_snapshot = Snapshot( id=expected_snapshot_id, branches={ b"HEAD": SnapshotBranch(target=b"agrid.0.1", target_type=TargetType.ALIAS,), b"agrid.0.1": SnapshotBranch( - target=hash_to_bytes("efcb9ef9d0f2a85312463251732b42f9e45a5c12"), - target_type=TargetType.RELEASE, + target=release_id, target_type=TargetType.RELEASE, ), }, ) + assert swh_storage.release_get([release_id])[0] == Release( + name=b"0.1", + message=b"Synthetic release for OPAM source package agrid version 0.1", + target=hash_to_bytes("00412ee5bc601deb462e55addd1004715116785e"), + target_type=ModelObjectType.DIRECTORY, + synthetic=True, + author=Person( + fullname=b"OCamlPro <contact@ocamlpro.com>", name=None, email=None + ), + date=None, + id=release_id, + ) + assert_last_visit_matches( swh_storage, url, status="full", type="opam", snapshot=expected_snapshot_id ) check_snapshot(expected_snapshot, swh_storage) stats = get_stats(swh_storage) assert { "content": 18, "directory": 8, "origin": 1, "origin_visit": 1, "release": 1, "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats def test_opam_loader_many_version(tmpdir, requests_mock_datadir, datadir, swh_storage): opam_url = f"file://{datadir}/fake_opam_repo" opam_root = tmpdir opam_instance = "loadertest" opam_package = "directories" url = f"opam+{opam_url}/packages/{opam_package}" loader = OpamLoader( swh_storage, url, opam_root, opam_instance, opam_url, opam_package, initialize_opam_root=True, ) actual_load_status = loader.load() - expected_snapshot_id = hash_to_bytes("f0a974e47999e74d323f1fb9604fde72527bda28") + expected_snapshot_id = hash_to_bytes("1a70631bee44c86dded71e0a091b1c91c110f812") assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id.hex(), } expected_snapshot = Snapshot( id=expected_snapshot_id, branches={ b"HEAD": SnapshotBranch( target=b"directories.0.3", target_type=TargetType.ALIAS, ), b"directories.0.1": SnapshotBranch( - target=hash_to_bytes("1f839cb1f4720d6b33fdd856e3ff1119497979d9"), + target=hash_to_bytes("013d53d7e1aedbe03aaa3d5c0e6d1d780ef2634d"), target_type=TargetType.RELEASE, ), b"directories.0.2": SnapshotBranch( - target=hash_to_bytes("4133834d966381804347efbc41e35dd2bdd48962"), + target=hash_to_bytes("4fdcc3606c0af33cb4d733b70074e79f03e928a1"), target_type=TargetType.RELEASE, ), b"directories.0.3": SnapshotBranch( - target=hash_to_bytes("2f20cabfbacfe447b80dc2a4eb14d461775100c8"), + target=hash_to_bytes("5de72a60f81649157d267773c30e897b7005dcdb"), target_type=TargetType.RELEASE, ), }, ) assert_last_visit_matches( swh_storage, url, status="full", type="opam", snapshot=expected_snapshot_id ) check_snapshot(expected_snapshot, swh_storage) def test_opam_release(tmpdir, requests_mock_datadir, swh_storage,
datadir): opam_url = f"file://{datadir}/fake_opam_repo" opam_root = tmpdir opam_instance = "loadertest" opam_package = "ocb" url = f"opam+{opam_url}/packages/{opam_package}" loader = OpamLoader( swh_storage, url, opam_root, opam_instance, opam_url, opam_package, initialize_opam_root=True, ) actual_load_status = loader.load() - expected_snapshot_id = hash_to_bytes("987425c6fe94d3972c4c4e97ee27a6a7c8b68e82") + expected_snapshot_id = hash_to_bytes("96246035587354a71f429d5b9b8dcc98afad3708") assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id.hex(), } info_iter = loader.get_package_info("0.1") branch_name, package_info = next(info_iter) expected_branch_name = "ocb.0.1" expected_package_info = OpamPackageInfo( url="https://github.com/OCamlPro/ocb/archive/0.1.tar.gz", filename=None, author=Person( fullname=b"OCamlPro <contact@ocamlpro.com>", name=None, email=None ), committer=Person( fullname=b"OCamlPro <contact@ocamlpro.com>", name=None, email=None ), version="0.1", directory_extrinsic_metadata=[ RawExtrinsicMetadataCore( metadata=OCB_METADATA, format="opam-package-definition", ) ], ) assert branch_name == expected_branch_name assert package_info == expected_package_info - release_id = hash_to_bytes("8d0612cdf172e5dff3d876ca2bbc0f6003cc36cc") + release_id = hash_to_bytes("4904ad9d0f3b3f84cec2b899d0d05c682b0efdcb") expected_snapshot = Snapshot( id=hash_to_bytes(actual_load_status["snapshot_id"]), branches={ b"HEAD": SnapshotBranch(target=b"ocb.0.1", target_type=TargetType.ALIAS,), b"ocb.0.1": SnapshotBranch( target=release_id, target_type=TargetType.RELEASE, ), }, ) assert_last_visit_matches( swh_storage, url, status="full", type="opam", snapshot=expected_snapshot.id ) check_snapshot(expected_snapshot, swh_storage) release = swh_storage.release_get([release_id])[0] assert release is not None assert release.author == expected_package_info.author def test_opam_metadata(tmpdir, requests_mock_datadir, swh_storage, datadir): opam_url = f"file://{datadir}/fake_opam_repo" opam_root = tmpdir opam_instance = "loadertest" opam_package = "ocb" url = f"opam+{opam_url}/packages/{opam_package}" loader = OpamLoader( swh_storage, url, opam_root, opam_instance, opam_url, opam_package, initialize_opam_root=True, ) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" - expected_release_id = hash_to_bytes("8d0612cdf172e5dff3d876ca2bbc0f6003cc36cc") + expected_release_id = hash_to_bytes("4904ad9d0f3b3f84cec2b899d0d05c682b0efdcb") expected_snapshot = Snapshot( id=hash_to_bytes(actual_load_status["snapshot_id"]), branches={ b"HEAD": SnapshotBranch(target=b"ocb.0.1", target_type=TargetType.ALIAS,), b"ocb.0.1": SnapshotBranch( target=expected_release_id, target_type=TargetType.RELEASE, ), }, ) assert_last_visit_matches( swh_storage, url, status="full", type="opam", snapshot=expected_snapshot.id ) check_snapshot(expected_snapshot, swh_storage) release = swh_storage.release_get([expected_release_id])[0] assert release is not None release_swhid = CoreSWHID( object_type=ObjectType.RELEASE, object_id=expected_release_id ) directory_swhid = ExtendedSWHID( object_type=ExtendedObjectType.DIRECTORY, object_id=release.target ) metadata_authority = MetadataAuthority( type=MetadataAuthorityType.FORGE, url=opam_url, ) expected_metadata = [ RawExtrinsicMetadata( target=directory_swhid, authority=metadata_authority, fetcher=MetadataFetcher( name="swh.loader.package.opam.loader.OpamLoader", version=__version__, ), discovery_date=loader.visit_date, format="opam-package-definition", metadata=OCB_METADATA,
origin=url, release=release_swhid, ) ] assert swh_storage.raw_extrinsic_metadata_get( directory_swhid, metadata_authority, ) == PagedResult(next_page_token=None, results=expected_metadata,) diff --git a/swh/loader/package/pypi/loader.py b/swh/loader/package/pypi/loader.py index 4e8688c..d5f8094 100644 --- a/swh/loader/package/pypi/loader.py +++ b/swh/loader/package/pypi/loader.py @@ -1,239 +1,250 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import logging import os from typing import Any, Dict, Iterator, Optional, Sequence, Tuple from urllib.parse import urlparse import attr from pkginfo import UnpackedSDist from swh.loader.package.loader import ( BasePackageInfo, PackageLoader, PartialExtID, RawExtrinsicMetadataCore, ) from swh.loader.package.utils import EMPTY_AUTHOR, api_info, cached_method, release_name from swh.model.hashutil import hash_to_bytes from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, ObjectType, Person, Release, Sha1Git, TimestampWithTimezone, ) from swh.storage.interface import StorageInterface logger = logging.getLogger(__name__) EXTID_TYPE = "pypi-archive-sha256" @attr.s class PyPIPackageInfo(BasePackageInfo): raw_info = attr.ib(type=Dict[str, Any]) + name = attr.ib(type=str) comment_text = attr.ib(type=Optional[str]) sha256 = attr.ib(type=str) upload_time = attr.ib(type=str) @classmethod - def from_metadata(cls, metadata: Dict[str, Any], version: str) -> "PyPIPackageInfo": + def from_metadata( + cls, metadata: Dict[str, Any], name: str, version: str + ) -> "PyPIPackageInfo": return cls( url=metadata["url"], filename=metadata["filename"], version=version, raw_info=metadata, + name=name, comment_text=metadata.get("comment_text"), sha256=metadata["digests"]["sha256"], upload_time=metadata["upload_time"], directory_extrinsic_metadata=[ RawExtrinsicMetadataCore( format="pypi-project-json", metadata=json.dumps(metadata).encode(), ) ], ) def extid(self) -> PartialExtID: return (EXTID_TYPE, hash_to_bytes(self.sha256)) class PyPILoader(PackageLoader[PyPIPackageInfo]): """Load pypi origin's artifact releases into swh archive. 
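Each sdist artifact is loaded as a synthetic release targeting the directory obtained by unpacking it.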
""" visit_type = "pypi" def __init__( self, storage: StorageInterface, url: str, max_content_size: Optional[int] = None, ): super().__init__(storage=storage, url=url, max_content_size=max_content_size) self.provider_url = pypi_api_url(self.url) @cached_method def _raw_info(self) -> bytes: return api_info(self.provider_url) @cached_method def info(self) -> Dict: """Return the project metadata information (fetched from pypi registry) """ return json.loads(self._raw_info()) def get_versions(self) -> Sequence[str]: return self.info()["releases"].keys() def get_default_version(self) -> str: return self.info()["info"]["version"] def get_metadata_authority(self): p_url = urlparse(self.url) return MetadataAuthority( type=MetadataAuthorityType.FORGE, url=f"{p_url.scheme}://{p_url.netloc}/", metadata={}, ) def get_package_info(self, version: str) -> Iterator[Tuple[str, PyPIPackageInfo]]: res = [] for meta in self.info()["releases"][version]: # process only standard sdist archives if meta["packagetype"] != "sdist" or meta["filename"].lower().endswith( (".deb", ".egg", ".rpm", ".whl") ): continue - p_info = PyPIPackageInfo.from_metadata(meta, version=version) + p_info = PyPIPackageInfo.from_metadata( + meta, name=self.info()["info"]["name"], version=version + ) res.append((version, p_info)) if len(res) == 1: version, p_info = res[0] yield release_name(version), p_info else: for version, p_info in res: yield release_name(version, p_info.filename), p_info def build_release( self, p_info: PyPIPackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Release]: i_metadata = extract_intrinsic_metadata(uncompressed_path) if not i_metadata: return None # from intrinsic metadata - version_ = i_metadata.get("version", "") + version_ = i_metadata.get("version", p_info.version) author_ = author(i_metadata) - # from extrinsic metadata - message = p_info.comment_text or "" - message = "%s: %s" % (version_, message) if message else version_ + if p_info.comment_text: + msg = p_info.comment_text + else: + msg = ( + f"Synthetic release for PyPI source package {p_info.name} " + f"version {version_}" + ) + date = TimestampWithTimezone.from_iso8601(p_info.upload_time) return Release( name=p_info.version.encode(), - message=message.encode(), + message=msg.encode(), author=author_, date=date, target=directory, target_type=ObjectType.DIRECTORY, synthetic=True, ) def pypi_api_url(url: str) -> str: """Compute api url from a project url Args: url (str): PyPI instance's url (e.g: https://pypi.org/project/requests) This deals with correctly transforming the project's api url (e.g https://pypi.org/pypi/requests/json) Returns: api url """ p_url = urlparse(url) project_name = p_url.path.rstrip("/").split("/")[-1] url = "%s://%s/pypi/%s/json" % (p_url.scheme, p_url.netloc, project_name) return url def extract_intrinsic_metadata(dir_path: str) -> Dict: """Given an uncompressed path holding the pkginfo file, returns a pkginfo parsed structure as a dict. The release artifact contains at their root one folder. For example: $ tar tvf zprint-0.0.6.tar.gz drwxr-xr-x root/root 0 2018-08-22 11:01 zprint-0.0.6/ ... Args: dir_path (str): Path to the uncompressed directory representing a release artifact from pypi. Returns: the pkginfo parsed structure as a dict if any or None if none was present. 
""" # Retrieve the root folder of the archive if not os.path.exists(dir_path): return {} lst = os.listdir(dir_path) if len(lst) != 1: return {} project_dirname = lst[0] pkginfo_path = os.path.join(dir_path, project_dirname, "PKG-INFO") if not os.path.exists(pkginfo_path): return {} pkginfo = UnpackedSDist(pkginfo_path) raw = pkginfo.__dict__ raw.pop("filename") # this gets added with the ondisk location return raw def author(data: Dict) -> Person: """Given a dict of project/release artifact information (coming from PyPI), returns an author subset. Args: data (dict): Representing either artifact information or release information. Returns: swh-model dict representing a person. """ name = data.get("author") email = data.get("author_email") fullname = None # type: Optional[str] if email: fullname = "%s <%s>" % (name, email) else: fullname = name if not fullname: return EMPTY_AUTHOR if name is not None: name = name.encode("utf-8") if email is not None: email = email.encode("utf-8") return Person(fullname=fullname.encode("utf-8"), name=name, email=email) diff --git a/swh/loader/package/pypi/tests/test_pypi.py b/swh/loader/package/pypi/tests/test_pypi.py index bad6177..4dada3b 100644 --- a/swh/loader/package/pypi/tests/test_pypi.py +++ b/swh/loader/package/pypi/tests/test_pypi.py @@ -1,802 +1,806 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import os from os import path from unittest.mock import patch import pytest from swh.core.pytest_plugin import requests_mock_datadir_factory from swh.core.tarball import uncompress from swh.loader.package import __version__ from swh.loader.package.pypi.loader import ( PyPILoader, PyPIPackageInfo, author, extract_intrinsic_metadata, pypi_api_url, ) from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats from swh.model.hashutil import hash_to_bytes from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, MetadataFetcher, Person, RawExtrinsicMetadata, Snapshot, SnapshotBranch, TargetType, ) from swh.model.swhids import CoreSWHID, ExtendedObjectType, ExtendedSWHID, ObjectType from swh.storage.interface import PagedResult @pytest.fixture def _0805nexter_api_info(datadir) -> bytes: with open( os.path.join(datadir, "https_pypi.org", "pypi_0805nexter_json"), "rb", ) as f: return f.read() def test_pypi_author_basic(): data = { "author": "i-am-groot", "author_email": "iam@groot.org", } actual_author = author(data) expected_author = Person( fullname=b"i-am-groot ", name=b"i-am-groot", email=b"iam@groot.org", ) assert actual_author == expected_author def test_pypi_author_empty_email(): data = { "author": "i-am-groot", "author_email": "", } actual_author = author(data) expected_author = Person(fullname=b"i-am-groot", name=b"i-am-groot", email=b"",) assert actual_author == expected_author def test_pypi_author_empty_name(): data = { "author": "", "author_email": "iam@groot.org", } actual_author = author(data) expected_author = Person( fullname=b" ", name=b"", email=b"iam@groot.org", ) assert actual_author == expected_author def test_pypi_author_malformed(): data = { "author": "['pierre', 'paul', 'jacques']", "author_email": None, } actual_author = author(data) expected_author = Person( fullname=b"['pierre', 'paul', 'jacques']", name=b"['pierre', 'paul', 'jacques']", email=None, ) assert actual_author == 
expected_author def test_pypi_author_malformed_2(): data = { "author": "[marie, jeanne]", "author_email": "[marie@some, jeanne@thing]", } actual_author = author(data) expected_author = Person( fullname=b"[marie, jeanne] <[marie@some, jeanne@thing]>", name=b"[marie, jeanne]", email=b"[marie@some, jeanne@thing]", ) assert actual_author == expected_author def test_pypi_author_malformed_3(): data = { "author": "[marie, jeanne, pierre]", "author_email": "[marie@somewhere.org, jeanne@somewhere.org]", } actual_author = author(data) expected_author = Person( fullname=( b"[marie, jeanne, pierre] " b"<[marie@somewhere.org, jeanne@somewhere.org]>" ), name=b"[marie, jeanne, pierre]", email=b"[marie@somewhere.org, jeanne@somewhere.org]", ) assert actual_author == expected_author # configuration error # def test_pypi_api_url(): """Computing the pypi api url from the pypi project url should be ok""" url = pypi_api_url("https://pypi.org/project/requests") assert url == "https://pypi.org/pypi/requests/json" def test_pypi_api_url_with_slash(): """Computing the pypi api url from the pypi project url should be ok""" url = pypi_api_url("https://pypi.org/project/requests/") assert url == "https://pypi.org/pypi/requests/json" @pytest.mark.fs def test_pypi_extract_intrinsic_metadata(tmp_path, datadir): """Parsing existing archive's PKG-INFO should yield results""" uncompressed_archive_path = str(tmp_path) archive_path = path.join( datadir, "https_files.pythonhosted.org", "0805nexter-1.1.0.zip" ) uncompress(archive_path, dest=uncompressed_archive_path) actual_metadata = extract_intrinsic_metadata(uncompressed_archive_path) expected_metadata = { "metadata_version": "1.0", "name": "0805nexter", "version": "1.1.0", "summary": "a simple printer of nested lest", "home_page": "http://www.hp.com", "author": "hgtkpython", "author_email": "2868989685@qq.com", "platforms": ["UNKNOWN"], } assert actual_metadata == expected_metadata @pytest.mark.fs def test_pypi_extract_intrinsic_metadata_failures(tmp_path): """Parsing a nonexistent path/archive/PKG-INFO yields an empty dict""" tmp_path = str(tmp_path) # py3.5 workaround (PosixPath issue) # nonexistent first level path assert extract_intrinsic_metadata("/something-inexistent") == {} # nonexistent second level path (as expected by pypi archives) assert extract_intrinsic_metadata(tmp_path) == {} # nonexistent PKG-INFO within second level path existing_path_no_pkginfo = path.join(tmp_path, "something") os.mkdir(existing_path_no_pkginfo) assert extract_intrinsic_metadata(tmp_path) == {} # LOADER SCENARIO # # "edge" cases (for the same origin) # # no release artifact: # {visit full, status: uneventful, no contents, etc...} requests_mock_datadir_missing_all = requests_mock_datadir_factory( ignore_urls=[ "https://files.pythonhosted.org/packages/ec/65/c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d/0805nexter-1.1.0.zip", # noqa "https://files.pythonhosted.org/packages/c4/a0/4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4/0805nexter-1.2.0.zip", # noqa ] ) def test_pypi_no_release_artifact(swh_storage, requests_mock_datadir_missing_all): """Load a pypi project with all artifacts missing ends up with no snapshot """ url = "https://pypi.org/project/0805nexter" loader = PyPILoader(swh_storage, url) actual_load_status = loader.load() assert actual_load_status["status"] == "uneventful" assert actual_load_status["snapshot_id"] is not None empty_snapshot = Snapshot(branches={}) assert_last_visit_matches( swh_storage, url, status="partial", type="pypi", snapshot=empty_snapshot.id ) stats =
get_stats(swh_storage) assert { "content": 0, "directory": 0, "origin": 1, "origin_visit": 1, "release": 0, "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats def test_pypi_fail__load_snapshot(swh_storage, requests_mock_datadir): """problem during loading: {visit: failed, status: failed, no snapshot} """ url = "https://pypi.org/project/0805nexter" with patch( "swh.loader.package.pypi.loader.PyPILoader._load_snapshot", side_effect=ValueError("Fake problem to fail visit"), ): loader = PyPILoader(swh_storage, url) actual_load_status = loader.load() assert actual_load_status == {"status": "failed"} assert_last_visit_matches(swh_storage, url, status="failed", type="pypi") stats = get_stats(loader.storage) assert { "content": 6, "directory": 4, "origin": 1, "origin_visit": 1, "release": 2, "revision": 0, "skipped_content": 0, "snapshot": 0, } == stats # problem during loading: # {visit: partial, status: uneventful, no snapshot} def test_pypi_release_with_traceback(swh_storage, requests_mock_datadir): url = "https://pypi.org/project/0805nexter" with patch( "swh.loader.package.pypi.loader.PyPILoader.last_snapshot", side_effect=ValueError("Fake problem to fail the visit"), ): loader = PyPILoader(swh_storage, url) actual_load_status = loader.load() assert actual_load_status == {"status": "failed"} assert_last_visit_matches(swh_storage, url, status="failed", type="pypi") stats = get_stats(swh_storage) assert { "content": 0, "directory": 0, "origin": 1, "origin_visit": 1, "release": 0, "revision": 0, "skipped_content": 0, "snapshot": 0, } == stats # problem during loading: failure early enough in between swh contents... # some contents (contents, directories, etc...) have been written in storage # {visit: partial, status: eventful, no snapshot} # problem during loading: failure late enough we can have snapshots (some # revisions are written in storage already) # {visit: partial, status: eventful, snapshot} # "normal" cases (for the same origin) # requests_mock_datadir_missing_one = requests_mock_datadir_factory( ignore_urls=[ "https://files.pythonhosted.org/packages/ec/65/c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d/0805nexter-1.1.0.zip", # noqa ] ) # some missing release artifacts: # {visit partial, status: eventful, 1 snapshot} def test_pypi_release_metadata_structure( swh_storage, requests_mock_datadir, _0805nexter_api_info ): url = "https://pypi.org/project/0805nexter" loader = PyPILoader(swh_storage, url) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None - expected_release_id = hash_to_bytes("a1e10745d375be66c1b65e55c0c15fe98776b53c") + expected_release_id = hash_to_bytes("e05d81600f3db1f905d23ab2a06ea64460c7e3f4") expected_snapshot = Snapshot( id=hash_to_bytes(actual_load_status["snapshot_id"]), branches={ b"HEAD": SnapshotBranch( target=b"releases/1.2.0", target_type=TargetType.ALIAS, ), b"releases/1.1.0": SnapshotBranch( - target=hash_to_bytes("9478c9981887fdf5ada3f1fcb20c81069cdf4c44"), + target=hash_to_bytes("ed4132a0160d97752a6ce5716722fb937a2e00b1"), target_type=TargetType.RELEASE, ), b"releases/1.2.0": SnapshotBranch( - target=hash_to_bytes("a1e10745d375be66c1b65e55c0c15fe98776b53c"), - target_type=TargetType.RELEASE, + target=expected_release_id, target_type=TargetType.RELEASE, ), }, ) assert_last_visit_matches( swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot.id ) check_snapshot(expected_snapshot, swh_storage) release = 
swh_storage.release_get([expected_release_id])[0] assert release is not None release_swhid = CoreSWHID( object_type=ObjectType.RELEASE, object_id=expected_release_id ) directory_swhid = ExtendedSWHID( object_type=ExtendedObjectType.DIRECTORY, object_id=release.target ) metadata_authority = MetadataAuthority( type=MetadataAuthorityType.FORGE, url="https://pypi.org/", ) expected_metadata = [ RawExtrinsicMetadata( target=directory_swhid, authority=metadata_authority, fetcher=MetadataFetcher( name="swh.loader.package.pypi.loader.PyPILoader", version=__version__, ), discovery_date=loader.visit_date, format="pypi-project-json", metadata=json.dumps( json.loads(_0805nexter_api_info)["releases"]["1.2.0"][0] ).encode(), origin=url, release=release_swhid, ) ] assert swh_storage.raw_extrinsic_metadata_get( directory_swhid, metadata_authority, ) == PagedResult(next_page_token=None, results=expected_metadata,) def test_pypi_visit_with_missing_artifact( swh_storage, requests_mock_datadir_missing_one ): """Load a pypi project with some missing artifacts ends up with 1 snapshot """ url = "https://pypi.org/project/0805nexter" loader = PyPILoader(swh_storage, url) actual_load_status = loader.load() - expected_snapshot_id = hash_to_bytes("eee24d5b0c156ebb4ece0c810c9dce636ebe881f") + expected_snapshot_id = hash_to_bytes("1838a3d6fff760338ab14b95c43c2dabcbb03c5a") assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id.hex(), } assert_last_visit_matches( swh_storage, url, status="partial", type="pypi", snapshot=expected_snapshot_id, ) expected_snapshot = Snapshot( id=hash_to_bytes(expected_snapshot_id), branches={ b"releases/1.2.0": SnapshotBranch( - target=hash_to_bytes("a1e10745d375be66c1b65e55c0c15fe98776b53c"), + target=hash_to_bytes("e05d81600f3db1f905d23ab2a06ea64460c7e3f4"), target_type=TargetType.RELEASE, ), b"HEAD": SnapshotBranch( target=b"releases/1.2.0", target_type=TargetType.ALIAS, ), }, ) check_snapshot(expected_snapshot, storage=swh_storage) stats = get_stats(swh_storage) assert { "content": 3, "directory": 2, "origin": 1, "origin_visit": 1, "release": 1, "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats def test_pypi_visit_with_1_release_artifact(swh_storage, requests_mock_datadir): """With no prior visit, load a pypi project ends up with 1 snapshot """ url = "https://pypi.org/project/0805nexter" loader = PyPILoader(swh_storage, url) actual_load_status = loader.load() - expected_snapshot_id = hash_to_bytes("62d957f2b5cdc515bea0a46252a3ab29ee271636") + expected_snapshot_id = hash_to_bytes("7e34abda294fb80e6d4e64637ae43fed112079ca") assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id.hex(), } assert_last_visit_matches( swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot_id ) expected_snapshot = Snapshot( id=expected_snapshot_id, branches={ b"releases/1.1.0": SnapshotBranch( - target=hash_to_bytes("9478c9981887fdf5ada3f1fcb20c81069cdf4c44"), + target=hash_to_bytes("ed4132a0160d97752a6ce5716722fb937a2e00b1"), target_type=TargetType.RELEASE, ), b"releases/1.2.0": SnapshotBranch( - target=hash_to_bytes("a1e10745d375be66c1b65e55c0c15fe98776b53c"), + target=hash_to_bytes("e05d81600f3db1f905d23ab2a06ea64460c7e3f4"), target_type=TargetType.RELEASE, ), b"HEAD": SnapshotBranch( target=b"releases/1.2.0", target_type=TargetType.ALIAS, ), }, ) check_snapshot(expected_snapshot, swh_storage) stats = get_stats(swh_storage) assert { "content": 6, "directory": 4, "origin": 1, "origin_visit": 1, "release": 2, 
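# the loader now produces releases rather than revisions, hence "revision" stays at 0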
"revision": 0, "skipped_content": 0, "snapshot": 1, } == stats def test_pypi_multiple_visits_with_no_change(swh_storage, requests_mock_datadir): """Multiple visits with no changes results in 1 same snapshot """ url = "https://pypi.org/project/0805nexter" loader = PyPILoader(swh_storage, url) actual_load_status = loader.load() - snapshot_id = hash_to_bytes("62d957f2b5cdc515bea0a46252a3ab29ee271636") + snapshot_id = hash_to_bytes("7e34abda294fb80e6d4e64637ae43fed112079ca") assert actual_load_status == { "status": "eventful", "snapshot_id": snapshot_id.hex(), } assert_last_visit_matches( swh_storage, url, status="full", type="pypi", snapshot=snapshot_id ) expected_snapshot = Snapshot( id=snapshot_id, branches={ b"releases/1.1.0": SnapshotBranch( - target=hash_to_bytes("9478c9981887fdf5ada3f1fcb20c81069cdf4c44"), + target=hash_to_bytes("ed4132a0160d97752a6ce5716722fb937a2e00b1"), target_type=TargetType.RELEASE, ), b"releases/1.2.0": SnapshotBranch( - target=hash_to_bytes("a1e10745d375be66c1b65e55c0c15fe98776b53c"), + target=hash_to_bytes("e05d81600f3db1f905d23ab2a06ea64460c7e3f4"), target_type=TargetType.RELEASE, ), b"HEAD": SnapshotBranch( target=b"releases/1.2.0", target_type=TargetType.ALIAS, ), }, ) check_snapshot(expected_snapshot, swh_storage) stats = get_stats(swh_storage) assert { "content": 6, "directory": 4, "origin": 1, "origin_visit": 1, "release": 2, "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats actual_load_status2 = loader.load() assert actual_load_status2 == { "status": "uneventful", "snapshot_id": actual_load_status2["snapshot_id"], } visit_status2 = assert_last_visit_matches( swh_storage, url, status="full", type="pypi" ) stats2 = get_stats(swh_storage) expected_stats2 = stats.copy() expected_stats2["origin_visit"] = 1 + 1 assert expected_stats2 == stats2 # same snapshot assert visit_status2.snapshot == snapshot_id def test_pypi_incremental_visit(swh_storage, requests_mock_datadir_visits): """With prior visit, 2nd load will result with a different snapshot """ url = "https://pypi.org/project/0805nexter" loader = PyPILoader(swh_storage, url) visit1_actual_load_status = loader.load() visit1_stats = get_stats(swh_storage) - expected_snapshot_id = hash_to_bytes("62d957f2b5cdc515bea0a46252a3ab29ee271636") + expected_snapshot_id = hash_to_bytes("7e34abda294fb80e6d4e64637ae43fed112079ca") assert visit1_actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id.hex(), } assert_last_visit_matches( swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot_id ) assert { "content": 6, "directory": 4, "origin": 1, "origin_visit": 1, "release": 2, "revision": 0, "skipped_content": 0, "snapshot": 1, } == visit1_stats # Reset internal state del loader._cached__raw_info del loader._cached_info visit2_actual_load_status = loader.load() visit2_stats = get_stats(swh_storage) assert visit2_actual_load_status["status"] == "eventful", visit2_actual_load_status - expected_snapshot_id2 = hash_to_bytes("6a8a84e7f765bed4362315fb054adb2466598636") + expected_snapshot_id2 = hash_to_bytes("6636693213eab9000b8eee1c5bbb3f1b675a4c70") assert visit2_actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id2.hex(), } assert_last_visit_matches( swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot_id2 ) expected_snapshot = Snapshot( id=expected_snapshot_id2, branches={ b"releases/1.1.0": SnapshotBranch( - target=hash_to_bytes("9478c9981887fdf5ada3f1fcb20c81069cdf4c44"), + 
target=hash_to_bytes("ed4132a0160d97752a6ce5716722fb937a2e00b1"), target_type=TargetType.RELEASE, ), b"releases/1.2.0": SnapshotBranch( - target=hash_to_bytes("a1e10745d375be66c1b65e55c0c15fe98776b53c"), + target=hash_to_bytes("e05d81600f3db1f905d23ab2a06ea64460c7e3f4"), target_type=TargetType.RELEASE, ), b"releases/1.3.0": SnapshotBranch( - target=hash_to_bytes("d46442e99bb6e05df5f75a7f0f7f61a4f2098147"), + target=hash_to_bytes("a21b09cbec8e31f47307f196bb1f939effc26e11"), target_type=TargetType.RELEASE, ), b"HEAD": SnapshotBranch( target=b"releases/1.3.0", target_type=TargetType.ALIAS, ), }, ) assert_last_visit_matches( swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot.id ) check_snapshot(expected_snapshot, swh_storage) assert { "content": 6 + 1, # 1 more content "directory": 4 + 2, # 2 more directories "origin": 1, "origin_visit": 1 + 1, "release": 2 + 1, # 1 more release "revision": 0, "skipped_content": 0, "snapshot": 1 + 1, # 1 more snapshot } == visit2_stats urls = [ m.url for m in requests_mock_datadir_visits.request_history if m.url.startswith("https://files.pythonhosted.org") ] # visited each artifact once across 2 visits assert len(urls) == len(set(urls)) # release artifact, no new artifact # {visit full, status uneventful, same snapshot as before} # release artifact, old artifact with different checksums # {visit full, status full, new snapshot with shared history and some new # different history} # release with multiple sdist artifacts per pypi "version" # snapshot branch output is different def test_pypi_visit_1_release_with_2_artifacts(swh_storage, requests_mock_datadir): """With no prior visit, load a pypi project ends up with 1 snapshot """ url = "https://pypi.org/project/nexter" loader = PyPILoader(swh_storage, url) actual_load_status = loader.load() - expected_snapshot_id = hash_to_bytes("a136ee226316276c347d7be3da07df5828605927") + expected_snapshot_id = hash_to_bytes("a65c03a837b24720fa95622de07074e279eddd0d") assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id.hex(), } assert_last_visit_matches( swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot_id ) expected_snapshot = Snapshot( id=expected_snapshot_id, branches={ b"releases/1.1.0/nexter-1.1.0.zip": SnapshotBranch( - target=hash_to_bytes("9478c9981887fdf5ada3f1fcb20c81069cdf4c44"), + target=hash_to_bytes("18d0087b1e1a3a31070d54bf3e9edbd44ab01cb5"), target_type=TargetType.RELEASE, ), b"releases/1.1.0/nexter-1.1.0.tar.gz": SnapshotBranch( - target=hash_to_bytes("b3391cb4007fb6872c4dfab476a7cfe7443a1bb4"), + target=hash_to_bytes("b2b379b3eb61adcde22e10788b1fc5f985e938d2"), target_type=TargetType.RELEASE, ), }, ) check_snapshot(expected_snapshot, swh_storage) def test_pypi_artifact_with_no_intrinsic_metadata(swh_storage, requests_mock_datadir): """Skip artifact with no intrinsic metadata during ingestion """ url = "https://pypi.org/project/upymenu" loader = PyPILoader(swh_storage, url) actual_load_status = loader.load() expected_snapshot_id = hash_to_bytes("1a8893e6a86f444e8be8e7bda6cb34fb1735a00e") assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id.hex(), } # no branch as one artifact without any intrinsic metadata expected_snapshot = Snapshot(id=expected_snapshot_id, branches={}) assert_last_visit_matches( swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot.id ) check_snapshot(expected_snapshot, swh_storage) def test_pypi_origin_not_found(swh_storage, requests_mock_datadir): url = 
"https://pypi.org/project/unknown" loader = PyPILoader(swh_storage, url) assert loader.load() == {"status": "failed"} assert_last_visit_matches( swh_storage, url, status="not_found", type="pypi", snapshot=None ) def test_pypi_build_release_missing_version_in_pkg_info(swh_storage, tmp_path): """Simulate release build when Version field is missing in PKG-INFO file.""" url = "https://pypi.org/project/GermlineFilter" # create package info p_info = PyPIPackageInfo( url=url, filename="GermlineFilter-1.2.tar.gz", version="1.2", + name="GermlineFilter", directory_extrinsic_metadata=[], raw_info={}, comment_text="", sha256="e4982353c544d94b34f02c5690ab3d3ebc93480d5b62fe6f3317f23c515acc05", upload_time="2015-02-18T20:39:13", ) # create PKG-INFO file with missing Version field package_path = tmp_path / "GermlineFilter-1.2" pkg_info_path = package_path / "PKG-INFO" package_path.mkdir() pkg_info_path.write_text( """Metadata-Version: 1.2 Name: germline_filter Home-page: Author: Cristian Caloian (OICR) Author-email: cristian.caloian@oicr.on.ca License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN""" ) directory = hash_to_bytes("8b864d66f356afe35033d58f8e03b7c23a66751f") # attempt to build release loader = PyPILoader(swh_storage, url) release = loader.build_release(p_info, str(tmp_path), directory) # without comment_text and version in PKG-INFO, message should be empty - assert release.message == b"" + assert ( + release.message + == b"Synthetic release for PyPI source package GermlineFilter version 1.2" + ) def test_filter_out_invalid_sdists(swh_storage, requests_mock): project_name = "swh-test-sdist-filtering" version = "1.0.0" url = f"https://pypi.org/project/{project_name}" json_url = f"https://pypi.org/pypi/{project_name}/json" common_sdist_entries = { "url": "", "comment_text": "", "digests": {"sha256": ""}, "upload_time": "", "packagetype": "sdist", } requests_mock.get( json_url, json={ + "info": {"name": project_name,}, "releases": { version: [ { **common_sdist_entries, "filename": f"{project_name}-{version}.{ext}", } for ext in ("tar.gz", "deb", "egg", "rpm", "whl") ] }, }, ) loader = PyPILoader(swh_storage, url) packages = list(loader.get_package_info(version=version)) assert len(packages) == 1 assert packages[0][1].filename.endswith(".tar.gz")