diff --git a/swh/loader/package/archive/loader.py b/swh/loader/package/archive/loader.py
--- a/swh/loader/package/archive/loader.py
+++ b/swh/loader/package/archive/loader.py
@@ -3,15 +3,16 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+import datetime
 import logging
 from os import path
-from typing import Any, Dict, Iterator, Mapping, Optional, Sequence, Tuple
+from typing import Any, Dict, Iterator, Optional, Sequence, Tuple, Union
 
 import attr
 import iso8601
 
 from swh.loader.package.loader import PackageLoader, BasePackageInfo
-from swh.loader.package.utils import release_name, artifact_identity
+from swh.loader.package.utils import release_name
 from swh.model.model import (
     Sha1Git,
     Person,
@@ -30,8 +31,36 @@
 
 REVISION_MESSAGE = b"swh-loader-package: synthetic revision message"
 
 
+@attr.s
 class ArchivePackageInfo(BasePackageInfo):
     raw = attr.ib(type=Dict[str, Any])
+    length = attr.ib(type=int)
+    """Size of the archive file"""
+    time = attr.ib(type=Union[str, datetime.datetime])
+    """Timestamp of the archive file on the server"""
+    version = attr.ib(type=str)
+
+    # default keys for gnu
+    ID_KEYS = ["time", "url", "length", "version"]
+
+    def artifact_identity(self, id_keys=None):
+        if id_keys is None:
+            id_keys = self.ID_KEYS
+        # TODO: use parsed attributes instead of self.raw
+        return [self.raw.get(k) for k in id_keys]
+
+    @classmethod
+    def from_metadata(cls, a_metadata: Dict[str, Any]) -> "ArchivePackageInfo":
+        url = a_metadata["url"]
+        filename = a_metadata.get("filename")
+        return cls(
+            url=url,
+            filename=filename if filename else path.split(url)[-1],
+            raw=a_metadata,
+            length=a_metadata["length"],
+            time=a_metadata["time"],
+            version=a_metadata["version"],
+        )
 
 
 class ArchiveLoader(PackageLoader[ArchivePackageInfo]):
@@ -44,7 +73,7 @@
     def __init__(
         self,
         url: str,
-        artifacts: Sequence[Mapping[str, Any]],
+        artifacts: Sequence[Dict[str, Any]],
         identity_artifact_keys: Optional[Sequence[str]] = None,
     ):
         """Loader constructor.
@@ -70,9 +99,6 @@
         """
         super().__init__(url=url)
         self.artifacts = artifacts  # assume order is enforced in the lister
-        if not identity_artifact_keys:
-            # default keys for gnu
-            identity_artifact_keys = ["time", "url", "length", "version"]
         self.identity_artifact_keys = identity_artifact_keys
 
     def get_versions(self) -> Sequence[str]:
@@ -91,42 +117,38 @@
         self, version: str
     ) -> Iterator[Tuple[str, ArchivePackageInfo]]:
         for a_metadata in self.artifacts:
-            url = a_metadata["url"]
-            package_version = a_metadata["version"]
-            if version == package_version:
-                filename = a_metadata.get("filename")
-                p_info = ArchivePackageInfo(
-                    url=url,
-                    filename=filename if filename else path.split(url)[-1],
-                    raw=a_metadata,
-                )
+            p_info = ArchivePackageInfo.from_metadata(a_metadata)
+            if version == p_info.version:
                 # FIXME: this code assumes we have only 1 artifact per
                 # versioned package
                 yield release_name(version), p_info
 
     def resolve_revision_from(
-        self, known_artifacts: Dict, artifact_metadata: Dict
+        self, known_artifacts: Dict, p_info: ArchivePackageInfo
    ) -> Optional[bytes]:
-        identity = artifact_identity(
-            artifact_metadata, id_keys=self.identity_artifact_keys
-        )
+        identity = p_info.artifact_identity(id_keys=self.identity_artifact_keys)
         for rev_id, known_artifact in known_artifacts.items():
             logging.debug("known_artifact: %s", known_artifact)
             reference_artifact = known_artifact["extrinsic"]["raw"]
-            known_identity = artifact_identity(
-                reference_artifact, id_keys=self.identity_artifact_keys
+            reference_artifact_info = ArchivePackageInfo.from_metadata(
+                reference_artifact
+            )
+            known_identity = reference_artifact_info.artifact_identity(
+                id_keys=self.identity_artifact_keys
             )
             if identity == known_identity:
                 return rev_id
         return None
 
     def build_revision(
-        self, a_metadata: Mapping[str, Any], uncompressed_path: str, directory: Sha1Git
+        self, p_info: ArchivePackageInfo, uncompressed_path: str, directory: Sha1Git
     ) -> Optional[Revision]:
-        time = a_metadata["time"]  # assume it's a timestamp
+        time = p_info.time  # assume it's a timestamp
         if isinstance(time, str):  # otherwise, assume it's a parsable date
-            time = iso8601.parse_date(time)
-        normalized_time = TimestampWithTimezone.from_datetime(time)
+            parsed_time = iso8601.parse_date(time)
+        else:
+            parsed_time = time
+        normalized_time = TimestampWithTimezone.from_datetime(parsed_time)
         return Revision(
             type=RevisionType.TAR,
             message=REVISION_MESSAGE,
@@ -142,7 +164,7 @@
                 "extrinsic": {
                     "provider": self.url,
                     "when": self.visit_date.isoformat(),
-                    "raw": a_metadata,
+                    "raw": p_info.raw,
                 },
             },
         )
diff --git a/swh/loader/package/archive/tasks.py b/swh/loader/package/archive/tasks.py
--- a/swh/loader/package/archive/tasks.py
+++ b/swh/loader/package/archive/tasks.py
@@ -9,8 +9,6 @@
 
 
 @shared_task(name=__name__ + ".LoadArchive")
-def load_archive_files(*, url=None, artifacts=None, identity_artifact_keys=None):
+def load_archive_files(*, url=None, artifacts=None):
     """Load archive's artifacts (e.g gnu, etc...)"""
-    return ArchiveLoader(
-        url, artifacts, identity_artifact_keys=identity_artifact_keys
-    ).load()
+    return ArchiveLoader(url, artifacts).load()
diff --git a/swh/loader/package/archive/tests/test_archive.py b/swh/loader/package/archive/tests/test_archive.py
--- a/swh/loader/package/archive/tests/test_archive.py
+++ b/swh/loader/package/archive/tests/test_archive.py
@@ -3,10 +3,12 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+import attr
+
 from swh.model.hashutil import hash_to_bytes
 from swh.model.model import Snapshot, SnapshotBranch, TargetType
 
-from swh.loader.package.archive.loader import ArchiveLoader
+from swh.loader.package.archive.loader import ArchiveLoader, ArchivePackageInfo
 from swh.loader.package.tests.common import check_metadata_paths
 from swh.loader.tests import (
     assert_last_visit_matches,
@@ -343,3 +345,35 @@
         if m.url.startswith("https://ftp.gnu.org")
     ]
     assert len(urls) == 1
+
+
+def test_artifact_identity():
+    """Computing the primary key should return the right identity
+
+    """
+
+    @attr.s
+    class TestPackageInfo(ArchivePackageInfo):
+        a = attr.ib()
+        b = attr.ib()
+
+    metadata = GNU_ARTIFACTS[0]
+
+    p_info = TestPackageInfo(raw={**metadata, "a": 1, "b": 2}, a=1, b=2, **metadata,)
+
+    for id_keys, expected_id in [
+        (["a", "b"], [1, 2]),
+        ([], []),
+        (["a", "key-that-does-not-exist"], [1, None]),
+        (
+            None,
+            [
+                metadata["time"],
+                metadata["url"],
+                metadata["length"],
+                metadata["version"],
+            ],
+        ),
+    ]:
+        actual_id = p_info.artifact_identity(id_keys=id_keys)
+        assert actual_id == expected_id
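
A minimal usage sketch of the new ArchivePackageInfo API (illustrative only, not part of the patch; the artifact values below are made up in the GNU lister's format):

from swh.loader.package.archive.loader import ArchivePackageInfo

artifact = {  # made-up values, shaped like a GNU lister entry
    "url": "https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz",
    "time": "2016-04-22T16:35:11+00:00",
    "length": 221837,
    "version": "0.1.0",
    "filename": "8sync-0.1.0.tar.gz",
}

p_info = ArchivePackageInfo.from_metadata(artifact)
assert p_info.version == "0.1.0"

# With no argument, the default ID_KEYS composite is used:
assert p_info.artifact_identity() == [
    artifact["time"],
    artifact["url"],
    artifact["length"],
    artifact["version"],
]

# A custom composite key can still be passed, as ArchiveLoader does with its
# identity_artifact_keys constructor argument:
assert p_info.artifact_identity(id_keys=["url", "version"]) == [
    artifact["url"],
    artifact["version"],
]
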
diff --git a/swh/loader/package/cran/loader.py b/swh/loader/package/cran/loader.py
--- a/swh/loader/package/cran/loader.py
+++ b/swh/loader/package/cran/loader.py
@@ -16,7 +16,7 @@
 from debian.deb822 import Deb822
 
 from swh.loader.package.loader import BasePackageInfo, PackageLoader
-from swh.loader.package.utils import release_name, artifact_identity
+from swh.loader.package.utils import release_name
 from swh.model.model import (
     Person,
     TimestampWithTimezone,
@@ -32,8 +32,22 @@
 
 DATE_PATTERN = re.compile(r"^(?P<year>\d{4})-(?P<month>\d{2})$")
 
 
+@attr.s
 class CRANPackageInfo(BasePackageInfo):
     raw = attr.ib(type=Dict[str, Any])
+    version = attr.ib(type=str)
+
+    ID_KEYS = ["url", "version"]
+
+    @classmethod
+    def from_metadata(cls, a_metadata: Dict[str, Any]) -> "CRANPackageInfo":
+        url = a_metadata["url"]
+        return CRANPackageInfo(
+            url=url,
+            filename=path.basename(url),
+            raw=a_metadata,
+            version=a_metadata["version"],
+        )
 
 
 class CRANLoader(PackageLoader[CRANPackageInfo]):
@@ -49,7 +63,6 @@
         """
         super().__init__(url=url)
         # explicit what we consider the artifact identity
-        self.id_keys = ["url", "version"]
         self.artifacts = artifacts
 
     def get_versions(self) -> List[str]:
@@ -63,40 +76,36 @@
 
     def get_package_info(self, version: str) -> Iterator[Tuple[str, CRANPackageInfo]]:
         for a_metadata in self.artifacts:
-            url = a_metadata["url"]
-            package_version = a_metadata["version"]
-            if version == package_version:
-                p_info = CRANPackageInfo(
-                    url=url, filename=path.basename(url), raw=a_metadata,
-                )
+            p_info = CRANPackageInfo.from_metadata(a_metadata)
+            if version == p_info.version:
                 yield release_name(version), p_info
 
     def resolve_revision_from(
-        self,
-        known_artifacts: Mapping[bytes, Mapping],
-        artifact_metadata: Mapping[str, Any],
+        self, known_artifacts: Mapping[bytes, Mapping], p_info: CRANPackageInfo,
     ) -> Optional[bytes]:
         """Given known_artifacts per revision, try to determine the revision for
         artifact_metadata
 
         """
-        new_identity = artifact_identity(artifact_metadata, self.id_keys)
+        new_identity = p_info.artifact_identity()
         for rev_id, known_artifact_meta in known_artifacts.items():
             logging.debug("known_artifact_meta: %s", known_artifact_meta)
             known_artifact = known_artifact_meta["extrinsic"]["raw"]
-            known_identity = artifact_identity(known_artifact, self.id_keys)
+            known_identity = CRANPackageInfo.from_metadata(
+                known_artifact
+            ).artifact_identity()
             if new_identity == known_identity:
                 return rev_id
         return None
 
     def build_revision(
-        self, a_metadata: Mapping[str, Any], uncompressed_path: str, directory: Sha1Git
+        self, p_info: CRANPackageInfo, uncompressed_path: str, directory: Sha1Git
     ) -> Optional[Revision]:
         # a_metadata is empty
         metadata = extract_intrinsic_metadata(uncompressed_path)
         date = parse_date(metadata.get("Date"))
         author = Person.from_fullname(metadata.get("Maintainer", "").encode())
-        version = metadata.get("Version", a_metadata["version"])
+        version = metadata.get("Version", p_info.version)
         return Revision(
             message=version.encode("utf-8"),
             type=RevisionType.TAR,
@@ -112,7 +121,7 @@
                 "extrinsic": {
                     "provider": self.url,
                     "when": self.visit_date.isoformat(),
-                    "raw": a_metadata,
+                    "raw": p_info.raw,
                 },
             },
         )
diff --git a/swh/loader/package/debian/loader.py b/swh/loader/package/debian/loader.py
--- a/swh/loader/package/debian/loader.py
+++ b/swh/loader/package/debian/loader.py
@@ -8,7 +8,17 @@
 from os import path
 import re
 import subprocess
-from typing import Any, Dict, Iterator, List, Mapping, Optional, Sequence, Tuple
+from typing import (
+    Any,
+    Dict,
+    FrozenSet,
+    Iterator,
+    List,
+    Mapping,
+    Optional,
+    Sequence,
+    Tuple,
+)
 
 import attr
 from dateutil.parser import parse as parse_date
@@ -30,8 +40,61 @@
 
 UPLOADERS_SPLIT = re.compile(r"(?<=\>)\s*,\s*")
 
 
+@attr.s
+class DebianFileMetadata:
+    md5sum = attr.ib(type=str)
+    name = attr.ib(type=str)
+    """Filename"""
+    sha256 = attr.ib(type=str)
+    size = attr.ib(type=int)
+    uri = attr.ib(type=str)
+    """URL of this specific file"""
+
+
+@attr.s
+class DebianPackageChangelog:
+    person = attr.ib(type=Dict[str, str])
+    """A dict with the same fields as model.Person, except the values are
+    str instead of bytes, and 'email' is optional."""
+    date = attr.ib(type=str)
+    """Date of the changelog entry."""
+    history = attr.ib(type=List[Tuple[str, str]])
+    """List of tuples (package_name, version)"""
+
+
+@attr.s
 class DebianPackageInfo(BasePackageInfo):
     raw = attr.ib(type=Dict[str, Any])
+    files = attr.ib(type=Dict[str, DebianFileMetadata])
+    """Metadata of the files (.deb, .dsc, ...) of the package."""
+    name = attr.ib(type=str)
+    version = attr.ib(type=str)
+
+    @classmethod
+    def from_metadata(cls, a_metadata: Dict[str, Any], url: str) -> "DebianPackageInfo":
+        return cls(
+            url=url,
+            filename=None,
+            raw=a_metadata,
+            files={
+                file_name: DebianFileMetadata(**file_metadata)
+                for (file_name, file_metadata) in a_metadata.get("files", {}).items()
+            },
+            name=a_metadata["name"],
+            version=a_metadata["version"],
+        )
+
+
+@attr.s
+class IntrinsicPackageMetadata:
+    """Metadata extracted from a package's .dsc file."""
+
+    name = attr.ib(type=str)
+    version = attr.ib(type=str)
+    changelog = attr.ib(type=DebianPackageChangelog)
+    maintainers = attr.ib(type=List[Dict[str, str]])
+    """A list of dicts with the same fields as model.Person, except the values
+    are str instead of bytes, and 'email' is optional."""
 
 
 class DebianLoader(PackageLoader[DebianPackageInfo]):
@@ -93,13 +156,13 @@
 
     def get_package_info(self, version: str) -> Iterator[Tuple[str, DebianPackageInfo]]:
         meta = self.packages[version]
-        p_info = DebianPackageInfo(url=self.url, filename=None, raw=meta,)
+        p_info = DebianPackageInfo.from_metadata(meta, url=self.url)
         yield release_name(version), p_info
 
     def resolve_revision_from(
-        self, known_package_artifacts: Mapping, artifact_metadata: Mapping
+        self, known_package_artifacts: Mapping, p_info: DebianPackageInfo
     ) -> Optional[bytes]:
-        return resolve_revision_from(known_package_artifacts, artifact_metadata)
+        return resolve_revision_from(known_package_artifacts, p_info)
 
     def download_package(
         self, p_info: DebianPackageInfo, tmpdir: str
@@ -113,7 +176,7 @@
 
         This is delegated to the `download_package` function.
 
         """
-        all_hashes = download_package(p_info.raw, tmpdir)
+        all_hashes = download_package(p_info, tmpdir)
         logger.debug("all_hashes: %s", all_hashes)
         res = []
         for hashes in all_hashes.values():
@@ -128,24 +191,26 @@
         return extract_package(dl_artifacts, dest=dest)
 
     def build_revision(
-        self, a_metadata: Mapping[str, Any], uncompressed_path: str, directory: Sha1Git
+        self, p_info: DebianPackageInfo, uncompressed_path: str, directory: Sha1Git
     ) -> Optional[Revision]:
-        dsc_url, dsc_name = dsc_information(a_metadata)
+        dsc_url, dsc_name = dsc_information(p_info)
         if not dsc_name:
             raise ValueError("dsc name for url %s should not be None" % dsc_url)
         dsc_path = path.join(path.dirname(uncompressed_path), dsc_name)
-        i_metadata = get_package_metadata(a_metadata, dsc_path, uncompressed_path)
+        intrinsic_metadata = get_intrinsic_package_metadata(
+            p_info, dsc_path, uncompressed_path
+        )
 
-        logger.debug("i_metadata: %s", i_metadata)
-        logger.debug("a_metadata: %s", a_metadata)
+        logger.debug("intrinsic_metadata: %s", intrinsic_metadata)
+        logger.debug("p_info: %s", p_info)
 
         msg = "Synthetic revision for Debian source package %s version %s" % (
-            a_metadata["name"],
-            a_metadata["version"],
+            p_info.name,
+            p_info.version,
         )
 
-        date = TimestampWithTimezone.from_iso8601(i_metadata["changelog"]["date"])
-        author = prepare_person(i_metadata["changelog"]["person"])
+        author = prepare_person(intrinsic_metadata.changelog.person)
+        date = TimestampWithTimezone.from_iso8601(intrinsic_metadata.changelog.date)
 
         # inspired from swh.loader.debian.converters.package_metadata_to_revision  # noqa
         return Revision(
@@ -159,38 +224,35 @@
             directory=directory,
             synthetic=True,
             metadata={
-                "intrinsic": {"tool": "dsc", "raw": i_metadata,},
+                "intrinsic": {"tool": "dsc", "raw": attr.asdict(intrinsic_metadata),},
                 "extrinsic": {
                     "provider": dsc_url,
                     "when": self.visit_date.isoformat(),
-                    "raw": a_metadata,
+                    "raw": p_info.raw,
                 },
             },
         )
 
 
 def resolve_revision_from(
-    known_package_artifacts: Mapping, artifact_metadata: Mapping
+    known_package_artifacts: Mapping, p_info: DebianPackageInfo
 ) -> Optional[bytes]:
     """Given known package artifacts (resolved from the snapshot of previous
     visit) and the new artifact to fetch, try to solve the corresponding
     revision.
 
     """
-    artifacts_to_fetch = artifact_metadata.get("files")
+    artifacts_to_fetch = p_info.files
     if not artifacts_to_fetch:
         return None
 
-    def to_set(data):
+    def to_set(data: DebianPackageInfo) -> FrozenSet[Tuple[str, str, int]]:
         return frozenset(
-            [
-                (name, meta["sha256"], meta["size"])
-                for name, meta in data["files"].items()
-            ]
+            (name, meta.sha256, meta.size) for name, meta in data.files.items()
         )
 
     # what we want to avoid downloading back if we have them already
-    set_new_artifacts = to_set(artifact_metadata)
+    set_new_artifacts = to_set(p_info)
 
     known_artifacts_revision_id = {}
     for rev_id, known_artifacts in known_package_artifacts.items():
@@ -198,13 +260,13 @@
         if not extrinsic:
             continue
 
-        s = to_set(extrinsic["raw"])
+        s = to_set(DebianPackageInfo.from_metadata(extrinsic["raw"], url=p_info.url))
         known_artifacts_revision_id[s] = rev_id
 
     return known_artifacts_revision_id.get(set_new_artifacts)
 
 
-def uid_to_person(uid: str) -> Mapping[str, str]:
+def uid_to_person(uid: str) -> Dict[str, str]:
     """Convert an uid to a person suitable for insertion.
 
     Args:
@@ -249,12 +311,12 @@
     )
 
 
-def download_package(package: Mapping[str, Any], tmpdir: Any) -> Mapping[str, Any]:
+def download_package(p_info: DebianPackageInfo, tmpdir: Any) -> Mapping[str, Any]:
     """Fetch a source package in a temporary directory and check the checksums
     for all files.
 
     Args:
-        package: Dict defining the set of files representing a debian package
+        p_info: Information on a package
         tmpdir: Where to download and extract the files to ingest
 
     Returns:
@@ -262,10 +324,10 @@
 
     """
     all_hashes = {}
-    for filename, fileinfo in package["files"].items():
-        uri = fileinfo["uri"]
+    for filename, fileinfo in p_info.files.items():
+        uri = fileinfo.uri
         logger.debug("fileinfo: %s", fileinfo)
-        extrinsic_hashes = {"sha256": fileinfo["sha256"]}
+        extrinsic_hashes = {"sha256": fileinfo.sha256}
         logger.debug("extrinsic_hashes(%s): %s", filename, extrinsic_hashes)
         filepath, hashes = download(
             uri, dest=tmpdir, filename=filename, hashes=extrinsic_hashes
@@ -276,11 +338,11 @@
 
     return all_hashes
 
 
-def dsc_information(package: Mapping[str, Any]) -> Tuple[Optional[str], Optional[str]]:
+def dsc_information(p_info: DebianPackageInfo) -> Tuple[Optional[str], Optional[str]]:
     """Retrieve dsc information from a package.
 
     Args:
-        package: Package metadata information
+        p_info: Package metadata information
 
     Returns:
         Tuple of dsc file's uri, dsc's full disk path
@@ -288,14 +350,14 @@
 
     """
     dsc_name = None
     dsc_url = None
-    for filename, fileinfo in package["files"].items():
+    for filename, fileinfo in p_info.files.items():
         if filename.endswith(".dsc"):
             if dsc_name:
                 raise ValueError(
                     "Package %s_%s references several dsc files."
-                    % (package["name"], package["version"])
+                    % (p_info.name, p_info.version)
                 )
-            dsc_url = fileinfo["uri"]
+            dsc_url = fileinfo.uri
             dsc_name = filename
 
     return dsc_url, dsc_name
@@ -354,14 +416,14 @@
     return destdir
 
 
-def get_package_metadata(
-    package: Mapping[str, Any], dsc_path: str, extracted_path: str
-) -> Mapping[str, Any]:
+def get_intrinsic_package_metadata(
+    p_info: DebianPackageInfo, dsc_path: str, extracted_path: str
+) -> IntrinsicPackageMetadata:
     """Get the package metadata from the source package at dsc_path,
     extracted in extracted_path.
 
     Args:
-        package: the package dict (with a dsc_path key)
+        p_info: the package information
        dsc_path: path to the package's dsc file
        extracted_path: the path where the package got extracted
 
@@ -377,36 +439,30 @@
 
     # Parse the changelog to retrieve the rest of the package information
     changelog_path = path.join(extracted_path, "debian/changelog")
-    with open(changelog_path, "rb") as changelog:
+    with open(changelog_path, "rb") as changelog_file:
         try:
-            parsed_changelog = Changelog(changelog)
+            parsed_changelog = Changelog(changelog_file)
         except UnicodeDecodeError:
             logger.warning(
                 "Unknown encoding for changelog %s,"
                 " falling back to iso" % changelog_path,
                 extra={
                     "swh_type": "deb_changelog_encoding",
-                    "swh_name": package["name"],
-                    "swh_version": str(package["version"]),
+                    "swh_name": p_info.name,
+                    "swh_version": str(p_info.version),
                     "swh_changelog": changelog_path,
                 },
             )
 
             # need to reset as Changelog scrolls to the end of the file
-            changelog.seek(0)
-            parsed_changelog = Changelog(changelog, encoding="iso-8859-15")
-
-    package_info = {
-        "name": package["name"],
-        "version": str(package["version"]),
-        "changelog": {
-            "person": uid_to_person(parsed_changelog.author),
-            "date": parse_date(parsed_changelog.date).isoformat(),
-            "history": [
-                (block.package, str(block.version)) for block in parsed_changelog
-            ][1:],
-        },
-    }
+            changelog_file.seek(0)
+            parsed_changelog = Changelog(changelog_file, encoding="iso-8859-15")
+
+    changelog = DebianPackageChangelog(
+        person=uid_to_person(parsed_changelog.author),
+        date=parse_date(parsed_changelog.date).isoformat(),
+        history=[(block.package, str(block.version)) for block in parsed_changelog][1:],
+    )
 
     maintainers = [
         uid_to_person(parsed_dsc["Maintainer"]),
@@ -415,6 +471,10 @@
         uid_to_person(person)
         for person in UPLOADERS_SPLIT.split(parsed_dsc.get("Uploaders", ""))
     )
-    package_info["maintainers"] = maintainers
-    return package_info
+    return IntrinsicPackageMetadata(
+        name=p_info.name,
+        version=str(p_info.version),
+        changelog=changelog,
+        maintainers=maintainers,
+    )
diff --git a/swh/loader/package/debian/tests/test_debian.py b/swh/loader/package/debian/tests/test_debian.py
--- a/swh/loader/package/debian/tests/test_debian.py
+++ b/swh/loader/package/debian/tests/test_debian.py
@@ -3,7 +3,6 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-import copy
 import logging
 import pytest
 import random
@@ -12,11 +11,14 @@
 
 from swh.loader.package.debian.loader import (
     DebianLoader,
+    DebianPackageInfo,
+    DebianPackageChangelog,
+    IntrinsicPackageMetadata,
     download_package,
     dsc_information,
     uid_to_person,
     prepare_person,
-    get_package_metadata,
+    get_intrinsic_package_metadata,
     extract_package,
 )
 from swh.loader.tests import (
@@ -34,6 +36,8 @@
 
 logger = logging.getLogger(__name__)
 
+URL = "deb://Debian/packages/cicero"
+
 PACKAGE_FILES = {
     "name": "cicero",
     "version": "0.7.2-3",
@@ -106,9 +110,8 @@
     """With no prior visit, load a gnu project ends up with 1 snapshot
 
     """
-    url = "deb://Debian/packages/cicero"
     loader = DebianLoader(
-        url=url, date="2019-10-12T05:58:09.165557+00:00", packages=PACKAGE_PER_VERSION,
+        url=URL, date="2019-10-12T05:58:09.165557+00:00", packages=PACKAGE_PER_VERSION,
     )
 
     actual_load_status = loader.load()
@@ -118,7 +121,7 @@
         "snapshot_id": expected_snapshot_id,
     }
 
-    assert_last_visit_matches(loader.storage, url, status="full", type="deb")
+    assert_last_visit_matches(loader.storage, URL, status="full", type="deb")
 
     stats = get_stats(loader.storage)
     assert {
@@ -150,9 +153,8 @@
     """With no prior visit, load a debian project ends up with 1 snapshot
 
     """
-    url = "deb://Debian/packages/cicero"
     loader = DebianLoader(
-        url=url, date="2019-10-12T05:58:09.165557+00:00", packages=PACKAGE_PER_VERSION
+        url=URL, date="2019-10-12T05:58:09.165557+00:00", packages=PACKAGE_PER_VERSION
     )
 
     actual_load_status = loader.load()
@@ -163,7 +165,7 @@
         "snapshot_id": expected_snapshot_id,
     }
 
-    assert_last_visit_matches(loader.storage, url, status="full", type="deb")
+    assert_last_visit_matches(loader.storage, URL, status="full", type="deb")
 
     stats = get_stats(loader.storage)
     assert {
@@ -193,7 +195,7 @@
     # No change in between load
     actual_load_status2 = loader.load()
     assert actual_load_status2["status"] == "uneventful"
-    assert_last_visit_matches(loader.storage, url, status="full", type="deb")
+    assert_last_visit_matches(loader.storage, URL, status="full", type="deb")
 
     stats2 = get_stats(loader.storage)
     assert {
@@ -246,7 +248,8 @@
 
 def test_download_package(datadir, tmpdir, requests_mock_datadir):
     tmpdir = str(tmpdir)  # py3.5 work around (LocalPath issue)
-    all_hashes = download_package(PACKAGE_FILES, tmpdir)
+    p_info = DebianPackageInfo.from_metadata(PACKAGE_FILES, url=URL)
+    all_hashes = download_package(p_info, tmpdir)
     assert all_hashes == {
         "cicero_0.7.2-3.diff.gz": {
             "checksums": {
@@ -277,7 +280,8 @@
 
 def test_dsc_information_ok():
     fname = "cicero_0.7.2-3.dsc"
-    dsc_url, dsc_name = dsc_information(PACKAGE_FILES)
+    p_info = DebianPackageInfo.from_metadata(PACKAGE_FILES, url=URL)
+    dsc_url, dsc_name = dsc_information(p_info)
 
     assert dsc_url == PACKAGE_FILES["files"][fname]["uri"]
     assert dsc_name == PACKAGE_FILES["files"][fname]["name"]
@@ -285,10 +289,10 @@
 
 def test_dsc_information_not_found():
     fname = "cicero_0.7.2-3.dsc"
-    package_files = copy.deepcopy(PACKAGE_FILES)
-    package_files["files"].pop(fname)
+    p_info = DebianPackageInfo.from_metadata(PACKAGE_FILES, url=URL)
+    p_info.files.pop(fname)
 
-    dsc_url, dsc_name = dsc_information(package_files)
+    dsc_url, dsc_name = dsc_information(p_info)
 
     assert dsc_url is None
     assert dsc_name is None
@@ -297,30 +301,30 @@
 
 def test_dsc_information_too_many_dsc_entries():
     # craft an extra dsc file
     fname = "cicero_0.7.2-3.dsc"
-    package_files = copy.deepcopy(PACKAGE_FILES)
-    data = package_files["files"][fname]
+    p_info = DebianPackageInfo.from_metadata(PACKAGE_FILES, url=URL)
+    data = p_info.files[fname]
     fname2 = fname.replace("cicero", "ciceroo")
-    package_files["files"][fname2] = data
+    p_info.files[fname2] = data
 
     with pytest.raises(
         ValueError,
         match="Package %s_%s references several dsc"
-        % (package_files["name"], package_files["version"]),
+        % (PACKAGE_FILES["name"], PACKAGE_FILES["version"]),
     ):
-        dsc_information(package_files)
+        dsc_information(p_info)
 
 
-def test_get_package_metadata(requests_mock_datadir, datadir, tmp_path):
+def test_get_intrinsic_package_metadata(requests_mock_datadir, datadir, tmp_path):
     tmp_path = str(tmp_path)  # py3.5 compat.
-    package = PACKAGE_FILES
+    p_info = DebianPackageInfo.from_metadata(PACKAGE_FILES, url=URL)
 
-    logger.debug("package: %s", package)
+    logger.debug("p_info: %s", p_info)
 
     # download the packages
-    all_hashes = download_package(package, tmp_path)
+    all_hashes = download_package(p_info, tmp_path)
 
     # Retrieve information from package
-    _, dsc_name = dsc_information(package)
+    _, dsc_name = dsc_information(p_info)
 
     dl_artifacts = [(tmp_path, hashes) for hashes in all_hashes.values()]
 
@@ -329,25 +333,27 @@
 
     # Retrieve information on package
     dsc_path = path.join(path.dirname(extracted_path), dsc_name)
-    actual_package_info = get_package_metadata(package, dsc_path, extracted_path)
+    actual_package_info = get_intrinsic_package_metadata(
+        p_info, dsc_path, extracted_path
+    )
 
     logger.debug("actual_package_info: %s", actual_package_info)
 
-    assert actual_package_info == {
-        "changelog": {
-            "date": "2014-10-19T16:52:35+02:00",
-            "history": [
+    assert actual_package_info == IntrinsicPackageMetadata(
+        changelog=DebianPackageChangelog(
+            date="2014-10-19T16:52:35+02:00",
+            history=[
                 ("cicero", "0.7.2-2"),
                 ("cicero", "0.7.2-1"),
                 ("cicero", "0.7-1"),
             ],
-            "person": {
+            person={
                 "email": "sthibault@debian.org",
                 "fullname": "Samuel Thibault <sthibault@debian.org>",
                 "name": "Samuel Thibault",
             },
-        },
-        "maintainers": [
+        ),
+        maintainers=[
             {
                 "email": "debian-accessibility@lists.debian.org",
                 "fullname": "Debian Accessibility Team "
@@ -360,15 +366,14 @@
                 "name": "Samuel Thibault",
             },
         ],
-        "name": "cicero",
-        "version": "0.7.2-3",
-    }
+        name="cicero",
+        version="0.7.2-3",
+    )
 
 
 def test_debian_multiple_packages(swh_config, requests_mock_datadir):
-    url = "deb://Debian/packages/cicero"
     loader = DebianLoader(
-        url=url, date="2019-10-12T05:58:09.165557+00:00", packages=PACKAGES_PER_VERSION
+        url=URL, date="2019-10-12T05:58:09.165557+00:00", packages=PACKAGES_PER_VERSION
     )
 
     actual_load_status = loader.load()
@@ -378,7 +383,7 @@
         "snapshot_id": expected_snapshot_id,
     }
 
-    assert_last_visit_matches(loader.storage, url, status="full", type="deb")
+    assert_last_visit_matches(loader.storage, URL, status="full", type="deb")
 
     expected_snapshot = Snapshot(
         id=hash_to_bytes(expected_snapshot_id),
@@ -401,12 +406,19 @@
     """Solving revision with empty data will result in unknown revision
 
     """
-    for package_artifacts in [{}, PACKAGE_FILES]:
-        actual_revision = resolve_revision_from({}, package_artifacts)
+    empty_artifact = {
+        "name": PACKAGE_FILES["name"],
+        "version": PACKAGE_FILES["version"],
+    }
+    for package_artifacts in [empty_artifact, PACKAGE_FILES]:
+        p_info = DebianPackageInfo.from_metadata(package_artifacts, url=URL)
+        actual_revision = resolve_revision_from({}, p_info)
         assert actual_revision is None
 
     for known_artifacts in [{}, PACKAGE_FILES]:
-        actual_revision = resolve_revision_from(known_artifacts, {})
+        actual_revision = resolve_revision_from(
+            known_artifacts, DebianPackageInfo.from_metadata(empty_artifact, url=URL)
+        )
         assert actual_revision is None
 
     known_package_artifacts = {
@@ -417,7 +429,9 @@
             # ... removed the unnecessary intermediary data
         }
     }
-    assert not resolve_revision_from(known_package_artifacts, PACKAGE_FILES)
+    assert not resolve_revision_from(
+        known_package_artifacts, DebianPackageInfo.from_metadata(PACKAGE_FILES, url=URL)
+    )
 
 
 def test_resolve_revision_from_edge_cases_hit_and_miss():
@@ -425,6 +439,7 @@
     """
 
     artifact_metadata = PACKAGE_FILES2
+    p_info = DebianPackageInfo.from_metadata(artifact_metadata, url=URL)
     expected_revision_id = (
         b"(\x08\xf5\xb3\xf8Ch\xb4\x88\x9a\x9a\xe8'\xff\x85\x85O\xfe\xcf\x07"  # noqa
     )
@@ -435,7 +450,7 @@
         }
     }
 
-    actual_revision = resolve_revision_from(known_package_artifacts, artifact_metadata)
+    actual_revision = resolve_revision_from(known_package_artifacts, p_info)
 
     assert actual_revision is None
 
@@ -445,6 +460,7 @@
 
     """
     artifact_metadata = PACKAGE_FILES
+    p_info = DebianPackageInfo.from_metadata(artifact_metadata, url=URL)
     expected_revision_id = (
         b"(\x07\xf5\xb3\xf8Ch\xb4\x88\x9a\x9a\xe8'\xfe\x85\x85O\xfe\xcf\x07"  # noqa
     )
@@ -453,7 +469,11 @@
     # shuffling dict's keys
     keys = list(files.keys())
     random.shuffle(keys)
-    package_files = {"files": {k: files[k] for k in keys}}
+    package_files = {
+        "name": PACKAGE_FILES["name"],
+        "version": PACKAGE_FILES["version"],
+        "files": {k: files[k] for k in keys},
+    }
 
     known_package_artifacts = {
         expected_revision_id: {
@@ -462,6 +482,6 @@
         }
     }
 
-    actual_revision = resolve_revision_from(known_package_artifacts, artifact_metadata)
+    actual_revision = resolve_revision_from(known_package_artifacts, p_info)
 
     assert actual_revision == expected_revision_id
diff --git a/swh/loader/package/deposit/loader.py b/swh/loader/package/deposit/loader.py
--- a/swh/loader/package/deposit/loader.py
+++ b/swh/loader/package/deposit/loader.py
@@ -3,6 +3,7 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+import datetime
 import json
 import logging
 import requests
@@ -31,10 +32,53 @@
 
 logger = logging.getLogger(__name__)
 
 
+@attr.s
 class DepositPackageInfo(BasePackageInfo):
     filename = attr.ib(type=str)  # instead of Optional[str]
     raw = attr.ib(type=Dict[str, Any])
+    author_date = attr.ib(type=datetime.datetime)
+    """codemeta:dateCreated if any, deposit completed_date otherwise"""
+    commit_date = attr.ib(type=datetime.datetime)
+    """codemeta:datePublished if any, deposit completed_date otherwise"""
+    client = attr.ib(type=str)
+    id = attr.ib(type=int)
+    """Internal ID of the deposit in the deposit DB"""
+    collection = attr.ib(type=str)
+    """The collection in the deposit; see SWORD specification."""
+    author = attr.ib(type=Person)
+    committer = attr.ib(type=Person)
+    revision_parents = attr.ib(type=Tuple[Sha1Git, ...])
+    """Revisions created from previous deposits, that will be used as parents of the
+    revision created for this deposit."""
+
+    @classmethod
+    def from_metadata(
+        cls, metadata: Dict[str, Any], url: str, filename: str
+    ) -> "DepositPackageInfo":
+        # Note:
+        # `date` and `committer_date` are always transmitted by the deposit read api
+        # which computes itself the values. The loader needs to use those to create the
+        # revision.
+
+        metadata = metadata.copy()
+        # FIXME: this removes information from 'raw' metadata
+        depo = metadata.pop("deposit")
+
+        return cls(
+            url=url,
+            filename=filename,
+            author_date=depo["author_date"],
+            commit_date=depo["committer_date"],
+            client=depo["client"],
+            id=depo["id"],
+            collection=depo["collection"],
+            author=parse_author(depo["author"]),
+            committer=parse_author(depo["committer"]),
+            revision_parents=tuple(hash_to_bytes(p) for p in depo["revision_parents"]),
+            raw=metadata,
+        )
+
 
 class DepositLoader(PackageLoader[DepositPackageInfo]):
     """Load pypi origin's artifact releases into swh archive.
@@ -66,8 +110,8 @@
     def get_package_info(
         self, version: str
     ) -> Iterator[Tuple[str, DepositPackageInfo]]:
-        p_info = DepositPackageInfo(
-            url=self.url, filename="archive.zip", raw=self.metadata,
+        p_info = DepositPackageInfo.from_metadata(
+            self.metadata, url=self.url, filename="archive.zip",
         )
         yield "HEAD", p_info
 
@@ -80,41 +124,27 @@
         return [self.client.archive_get(self.deposit_id, tmpdir, p_info.filename)]
 
     def build_revision(
-        self, a_metadata: Dict, uncompressed_path: str, directory: Sha1Git
+        self, p_info: DepositPackageInfo, uncompressed_path: str, directory: Sha1Git
     ) -> Optional[Revision]:
-        depo = a_metadata.pop("deposit")
-
-        # Note:
-        # `date` and `committer_date` are always transmitted by the deposit read api
-        # which computes itself the values. The loader needs to use those to create the
-        # revision.
-
-        # date: codemeta:dateCreated if any, deposit completed_date otherwise
-        date = TimestampWithTimezone.from_dict(depo["author_date"])
-        # commit_date: codemeta:datePublished if any, deposit completed_date otherwise
-        commit_date = TimestampWithTimezone.from_dict(depo["committer_date"])
-
-        client, id, collection = [depo[k] for k in ["client", "id", "collection"]]
-        message = f"{client}: Deposit {id} in collection {collection}".encode("utf-8")
-
-        author = parse_author(depo["author"])
-        committer = parse_author(depo["committer"])
+        message = (
+            f"{p_info.client}: Deposit {p_info.id} in collection {p_info.collection}"
+        ).encode("utf-8")
 
         return Revision(
             type=RevisionType.TAR,
             message=message,
-            author=author,
-            date=date,
-            committer=committer,
-            committer_date=commit_date,
-            parents=tuple([hash_to_bytes(p) for p in depo["revision_parents"]]),
+            author=p_info.author,
+            date=TimestampWithTimezone.from_dict(p_info.author_date),
+            committer=p_info.committer,
+            committer_date=TimestampWithTimezone.from_dict(p_info.commit_date),
+            parents=p_info.revision_parents,
             directory=directory,
             synthetic=True,
            metadata={
                 "extrinsic": {
                     "provider": self.client.metadata_url(self.deposit_id),
                     "when": self.visit_date.isoformat(),
-                    "raw": a_metadata,  # Actually the processed metadata instead of raw
+                    "raw": p_info.raw,
                 },
             },
         )
diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py
--- a/swh/loader/package/loader.py
+++ b/swh/loader/package/loader.py
@@ -51,10 +51,29 @@
 
 @attr.s
 class BasePackageInfo:
+    """Information needed by the loader to load one version of a package:
+    where to fetch it from, plus the raw extrinsic metadata it was built
+    from."""
+
     url = attr.ib(type=str)
     filename = attr.ib(type=Optional[str])
     raw = attr.ib(type=Any)
 
+    @property
+    def ID_KEYS(self):
+        raise NotImplementedError(f"{self.__class__.__name__} is missing ID_KEYS")
+
+    def artifact_identity(self):
+        """Compute the primary key of this package info, using the attribute
+        names listed in ID_KEYS as a composite key.
+
+        Returns:
+            The identity of this package info
+        """
+        return [getattr(self, k) for k in self.ID_KEYS]
+
 
 TPackageInfo = TypeVar("TPackageInfo", bound=BasePackageInfo)
@@ -112,13 +131,13 @@
         yield from {}
 
     def build_revision(
-        self, a_metadata: Dict, uncompressed_path: str, directory: Sha1Git
+        self, p_info: TPackageInfo, uncompressed_path: str, directory: Sha1Git
     ) -> Optional[Revision]:
         """Build the revision from the archive metadata (extrinsic
         artifact metadata) and the intrinsic metadata.
 
         Args:
-            a_metadata: Artifact metadata
+            p_info: Package information
             uncompressed_path: Artifact uncompressed path on disk
 
         Returns:
@@ -170,7 +189,7 @@
         }
 
     def resolve_revision_from(
-        self, known_artifacts: Dict, artifact_metadata: Dict
+        self, known_artifacts: Dict, p_info: TPackageInfo,
     ) -> Optional[bytes]:
         """Resolve the revision from a snapshot and an artifact metadata dict.
 
@@ -180,7 +199,7 @@
 
         Args:
             snapshot: Snapshot
-            artifact_metadata: Information dict
+            p_info: Package information
 
         Returns:
             None or revision identifier
@@ -345,7 +364,7 @@
             # `p_` stands for `package_`
             for branch_name, p_info in self.get_package_info(version):
                 logger.debug("package_info: %s", p_info)
-                revision_id = self.resolve_revision_from(known_artifacts, p_info.raw)
+                revision_id = self.resolve_revision_from(known_artifacts, p_info)
                 if revision_id is None:
                     try:
                         revision_id = self._load_revision(p_info, origin)
@@ -431,7 +450,7 @@
 
         # FIXME: This should be release. cf. D409
         revision = self.build_revision(
-            p_info.raw, uncompressed_path, directory=directory.hash
+            p_info, uncompressed_path, directory=directory.hash
         )
         if not revision:
             # Some artifacts are missing intrinsic metadata
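
An illustrative sketch (not part of the patch) of the contract BasePackageInfo now defines for its subclasses: ID_KEYS names the attributes forming the artifact identity, as CRANPackageInfo does above. The subclass here is hypothetical:

import attr

from swh.loader.package.loader import BasePackageInfo


@attr.s
class MyPackageInfo(BasePackageInfo):  # hypothetical subclass
    version = attr.ib(type=str)

    # Shadows the base-class ID_KEYS property with a concrete composite key:
    ID_KEYS = ["url", "version"]


p_info = MyPackageInfo(
    url="https://example.org/pkg-1.0.tar.gz", filename=None, raw={}, version="1.0",
)
assert p_info.artifact_identity() == ["https://example.org/pkg-1.0.tar.gz", "1.0"]

# A subclass that never defines ID_KEYS inherits the base property, so
# calling artifact_identity() on it raises NotImplementedError.
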
diff --git a/swh/loader/package/nixguix/loader.py b/swh/loader/package/nixguix/loader.py
--- a/swh/loader/package/nixguix/loader.py
+++ b/swh/loader/package/nixguix/loader.py
@@ -27,9 +27,23 @@
 
 logger = logging.getLogger(__name__)
 
 
+@attr.s
 class NixGuixPackageInfo(BasePackageInfo):
     raw = attr.ib(type=Dict[str, Any])
+    integrity = attr.ib(type=str)
+    """Hash of the archive, formatted as in the Subresource Integrity
+    specification."""
+
+    @classmethod
+    def from_metadata(cls, metadata: Dict[str, Any]) -> "NixGuixPackageInfo":
+        return cls(
+            url=metadata["url"],
+            filename=None,
+            integrity=metadata["integrity"],
+            raw=metadata,
+        )
+
 
 class NixGuixLoader(PackageLoader[NixGuixPackageInfo]):
     """Load sources from a sources.json file. This loader is used to load
@@ -70,9 +84,7 @@
         # currently only use the first one, but if the first one
         # fails, we should try the second one and so on.
         integrity = self._integrityByUrl[url]
-        p_info = NixGuixPackageInfo(
-            url=url, filename=None, raw={"url": url, "integrity": integrity},
-        )
+        p_info = NixGuixPackageInfo.from_metadata({"url": url, "integrity": integrity})
         yield url, p_info
 
     def known_artifacts(self, snapshot: Optional[Snapshot]) -> Dict[Sha1Git, BaseModel]:
@@ -103,7 +115,7 @@
         return ret
 
     def resolve_revision_from(
-        self, known_artifacts: Dict, artifact_metadata: Dict
+        self, known_artifacts: Dict, p_info: NixGuixPackageInfo,
     ) -> Optional[bytes]:
         for rev_id, known_artifact in known_artifacts.items():
             try:
@@ -124,7 +136,7 @@
                 # the other revisions
                 continue
             else:
-                if artifact_metadata["integrity"] == known_integrity:
+                if p_info.integrity == known_integrity:
                     return rev_id
         return None
 
@@ -152,7 +164,7 @@
         }
 
     def build_revision(
-        self, a_metadata: Dict, uncompressed_path: str, directory: Sha1Git
+        self, p_info: NixGuixPackageInfo, uncompressed_path: str, directory: Sha1Git
     ) -> Optional[Revision]:
         return Revision(
             type=RevisionType.TAR,
@@ -168,7 +180,7 @@
                 "extrinsic": {
                     "provider": self.provider_url,
                     "when": self.visit_date.isoformat(),
-                    "raw": a_metadata,
+                    "raw": p_info.raw,
                 },
             },
         )
diff --git a/swh/loader/package/nixguix/tests/test_nixguix.py b/swh/loader/package/nixguix/tests/test_nixguix.py
--- a/swh/loader/package/nixguix/tests/test_nixguix.py
+++ b/swh/loader/package/nixguix/tests/test_nixguix.py
@@ -18,6 +18,7 @@
 from swh.model.model import Snapshot, SnapshotBranch, TargetType
 from swh.loader.package.archive.loader import ArchiveLoader
 from swh.loader.package.nixguix.loader import (
+    NixGuixPackageInfo,
     NixGuixLoader,
     retrieve_sources,
     clean_sources,
@@ -312,10 +313,14 @@
         "id2": {"extrinsic": {"raw": {"url": "url2", "integrity": "integrity2"}}},
     }
 
-    metadata = {"url": "url1", "integrity": "integrity1"}
-    assert loader.resolve_revision_from(known_artifacts, metadata) == "id1"
-    metadata = {"url": "url3", "integrity": "integrity3"}
-    assert loader.resolve_revision_from(known_artifacts, metadata) == None  # noqa
+    p_info = NixGuixPackageInfo.from_metadata(
+        {"url": "url1", "integrity": "integrity1"}
+    )
+    assert loader.resolve_revision_from(known_artifacts, p_info) == "id1"
+    p_info = NixGuixPackageInfo.from_metadata(
+        {"url": "url3", "integrity": "integrity3"}
+    )
+    assert loader.resolve_revision_from(known_artifacts, p_info) == None  # noqa
 
 
 def test_evaluation_branch(swh_config, requests_mock_datadir):
diff --git a/swh/loader/package/npm/loader.py b/swh/loader/package/npm/loader.py
--- a/swh/loader/package/npm/loader.py
+++ b/swh/loader/package/npm/loader.py
@@ -32,9 +32,43 @@
 
 EMPTY_PERSON = Person(fullname=b"", name=None, email=None)
 
 
+@attr.s
 class NpmPackageInfo(BasePackageInfo):
     raw = attr.ib(type=Dict[str, Any])
+    date = attr.ib(type=Optional[str])
+    shasum = attr.ib(type=str)
+    """sha1 checksum"""
+    version = attr.ib(type=str)
+
+    @classmethod
+    def from_metadata(
+        cls, project_metadata: Dict[str, Any], version: str
+    ) -> "NpmPackageInfo":
+        package_metadata = project_metadata["versions"][version]
+        url = package_metadata["dist"]["tarball"]
+
+        # No date available in intrinsic metadata: retrieve it from the API
+        # metadata, using the version number that the API claims this package
+        # has.
+        extrinsic_version = package_metadata["version"]
+
+        if "time" in project_metadata:
+            date = project_metadata["time"][extrinsic_version]
+        elif "mtime" in package_metadata:
+            date = package_metadata["mtime"]
+        else:
+            date = None
+
+        return cls(
+            url=url,
+            filename=os.path.basename(url),
+            date=date,
+            shasum=package_metadata["dist"]["shasum"],
+            version=extrinsic_version,
+            raw=package_metadata,  # FIXME: we're losing some of the project metadata
+        )
+
 
 class NpmLoader(PackageLoader[NpmPackageInfo]):
     """Load npm origin's artifact releases into swh archive.
@@ -71,46 +105,37 @@
     def get_default_version(self) -> str:
         return self.info["dist-tags"].get("latest", "")
 
-    def get_package_info(self, version: str) -> Iterator[Tuple[str, NpmPackageInfo]]:
-        meta = self.info["versions"][version]
-        url = meta["dist"]["tarball"]
-        p_info = NpmPackageInfo(url=url, filename=os.path.basename(url), raw=meta,)
+    def get_package_info(
+        self, version: str
+    ) -> Iterator[Tuple[str, NpmPackageInfo]]:
+        p_info = NpmPackageInfo.from_metadata(
+            project_metadata=self.info, version=version
+        )
         yield release_name(version), p_info
 
     def resolve_revision_from(
-        self, known_artifacts: Dict, artifact_metadata: Dict
+        self, known_artifacts: Dict, p_info: NpmPackageInfo
     ) -> Optional[bytes]:
-        return artifact_to_revision_id(known_artifacts, artifact_metadata)
+        return artifact_to_revision_id(known_artifacts, p_info)
 
     def build_revision(
-        self, a_metadata: Dict, uncompressed_path: str, directory: Sha1Git
+        self, p_info: NpmPackageInfo, uncompressed_path: str, directory: Sha1Git
     ) -> Optional[Revision]:
         i_metadata = extract_intrinsic_metadata(uncompressed_path)
         if not i_metadata:
             return None
 
-        # from intrinsic metadata
         author = extract_npm_package_author(i_metadata)
         message = i_metadata["version"].encode("ascii")
 
-        # from extrinsic metadata
-
-        # No date available in intrinsic metadata: retrieve it from the API
-        # metadata, using the version number that the API claims this package
-        # has.
-        extrinsic_version = a_metadata["version"]
-
-        if "time" in self.info:
-            date = self.info["time"][extrinsic_version]
-        elif "mtime" in a_metadata:
-            date = a_metadata["mtime"]
-        else:
-            artifact_name = os.path.basename(a_metadata["dist"]["tarball"])
+        if p_info.date is None:
+            url = p_info.url
+            artifact_name = os.path.basename(url)
             raise ValueError(
                 "Origin %s: Cannot determine upload time for artifact %s."
-                % (self.url, artifact_name)
+                % (p_info.url, artifact_name)
             )
 
-        date = TimestampWithTimezone.from_iso8601(date)
+        date = TimestampWithTimezone.from_iso8601(p_info.date)
 
         # FIXME: this is to remain bug-compatible with earlier versions:
         date = attr.evolve(date, timestamp=attr.evolve(date.timestamp, microseconds=0))
@@ -130,7 +155,7 @@
                 "extrinsic": {
                     "provider": self.provider_url,
                     "when": self.visit_date.isoformat(),
-                    "raw": a_metadata,
+                    "raw": p_info.raw,
                 },
             },
         )
@@ -138,7 +163,7 @@
 
 def artifact_to_revision_id(
-    known_artifacts: Dict, artifact_metadata: Dict
+    known_artifacts: Dict, p_info: NpmPackageInfo
 ) -> Optional[bytes]:
     """Given metadata artifact, solves the associated revision id.
 
@@ -165,7 +190,7 @@
     }
 
     """
-    shasum = artifact_metadata["dist"]["shasum"]
+    shasum = p_info.shasum
     for rev_id, known_artifact in known_artifacts.items():
         known_original_artifact = known_artifact.get("original_artifact")
         if not known_original_artifact:
diff --git a/swh/loader/package/npm/tests/test_npm.py b/swh/loader/package/npm/tests/test_npm.py
--- a/swh/loader/package/npm/tests/test_npm.py
+++ b/swh/loader/package/npm/tests/test_npm.py
@@ -501,9 +501,9 @@
     """Current loader version should stop soon if nothing can be found
 
     """
-    artifact_metadata = {
-        "dist": {"shasum": "05181c12cd8c22035dd31155656826b85745da37",},
-    }
+
+    class artifact_metadata:
+        shasum = "05181c12cd8c22035dd31155656826b85745da37"
 
     known_artifacts = {
         "b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92": {},
@@ -516,9 +516,9 @@
     """Current loader version should solve old metadata scheme
 
     """
-    artifact_metadata = {
-        "dist": {"shasum": "05181c12cd8c22035dd31155656826b85745da37",}
-    }
+
+    class artifact_metadata:
+        shasum = "05181c12cd8c22035dd31155656826b85745da37"
 
     known_artifacts = {
         hash_to_bytes("b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92"): {
@@ -538,9 +538,9 @@
     """Current loader version should be able to solve current metadata scheme
 
     """
-    artifact_metadata = {
-        "dist": {"shasum": "05181c12cd8c22035dd31155656826b85745da37",}
-    }
+
+    class artifact_metadata:
+        shasum = "05181c12cd8c22035dd31155656826b85745da37"
 
     known_artifacts = {
         hash_to_bytes("b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92"): {
diff --git a/swh/loader/package/pypi/loader.py b/swh/loader/package/pypi/loader.py
--- a/swh/loader/package/pypi/loader.py
+++ b/swh/loader/package/pypi/loader.py
@@ -25,9 +25,25 @@
 
 logger = logging.getLogger(__name__)
 
 
+@attr.s
 class PyPIPackageInfo(BasePackageInfo):
     raw = attr.ib(type=Dict[str, Any])
+    comment_text = attr.ib(type=Optional[str])
+    sha256 = attr.ib(type=str)
+    upload_time = attr.ib(type=str)
+
+    @classmethod
+    def from_metadata(cls, metadata: Dict[str, Any]) -> "PyPIPackageInfo":
+        return cls(
+            url=metadata["url"],
+            filename=metadata["filename"],
+            raw=metadata,
+            comment_text=metadata.get("comment_text"),
+            sha256=metadata["digests"]["sha256"],
+            upload_time=metadata["upload_time"],
+        )
+
 
 class PyPILoader(PackageLoader[PyPIPackageInfo]):
     """Load pypi origin's artifact releases into swh archive.
@@ -61,8 +77,7 @@
             for meta in self.info["releases"][version]:
                 if meta["packagetype"] != "sdist":
                     continue
-                filename = meta["filename"]
-                p_info = PyPIPackageInfo(url=meta["url"], filename=filename, raw=meta,)
+                p_info = PyPIPackageInfo.from_metadata(meta)
                 res.append((version, p_info))
 
         if len(res) == 1:
@@ -73,12 +88,12 @@
             yield release_name(version, p_info.filename), p_info
 
     def resolve_revision_from(
-        self, known_artifacts: Dict, artifact_metadata: Dict
+        self, known_artifacts: Dict, p_info: PyPIPackageInfo
     ) -> Optional[bytes]:
-        return artifact_to_revision_id(known_artifacts, artifact_metadata)
+        return artifact_to_revision_id(known_artifacts, p_info)
 
     def build_revision(
-        self, a_metadata: Dict, uncompressed_path: str, directory: Sha1Git
+        self, p_info: PyPIPackageInfo, uncompressed_path: str, directory: Sha1Git
     ) -> Optional[Revision]:
         i_metadata = extract_intrinsic_metadata(uncompressed_path)
         if not i_metadata:
@@ -89,9 +104,9 @@
         _author = author(i_metadata)
 
         # from extrinsic metadata
-        message = a_metadata.get("comment_text", "")
+        message = p_info.comment_text or ""
         message = "%s: %s" % (name, message) if message else name
-        date = TimestampWithTimezone.from_iso8601(a_metadata["upload_time"])
+        date = TimestampWithTimezone.from_iso8601(p_info.upload_time)
 
         return Revision(
             type=RevisionType.TAR,
@@ -108,14 +123,14 @@
                 "extrinsic": {
                     "provider": self.provider_url,
                     "when": self.visit_date.isoformat(),
-                    "raw": a_metadata,
+                    "raw": p_info.raw,
                 },
             },
         )
 
 
 def artifact_to_revision_id(
-    known_artifacts: Dict, artifact_metadata: Dict
+    known_artifacts: Dict, p_info: PyPIPackageInfo
 ) -> Optional[bytes]:
     """Given metadata artifact, solves the associated revision id.
 
@@ -145,7 +160,7 @@
 
     """
-    sha256 = artifact_metadata["digests"]["sha256"]
+    sha256 = p_info.sha256
     for rev_id, known_artifact in known_artifacts.items():
         original_artifact = known_artifact["original_artifact"]
         if isinstance(original_artifact, dict):
diff --git a/swh/loader/package/pypi/tests/test_pypi.py b/swh/loader/package/pypi/tests/test_pypi.py
--- a/swh/loader/package/pypi/tests/test_pypi.py
+++ b/swh/loader/package/pypi/tests/test_pypi.py
@@ -739,11 +739,9 @@
     """Current loader version should stop soon if nothing can be found
 
     """
-    artifact_metadata = {
-        "digests": {
-            "sha256": "6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec",  # noqa
-        },
-    }
+
+    class artifact_metadata:
+        sha256 = "6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec"
 
     assert artifact_to_revision_id({}, artifact_metadata) is None
 
@@ -760,11 +758,9 @@
     """Current loader version should solve old metadata scheme
 
     """
-    artifact_metadata = {
-        "digests": {
-            "sha256": "6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec",  # noqa
-        }
-    }
+
+    class artifact_metadata:
+        sha256 = "6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec"
 
     known_artifacts = {
         hash_to_bytes("b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92"): {
@@ -786,11 +782,9 @@
     """Current loader version should be able to solve current metadata scheme
 
     """
-    artifact_metadata = {
-        "digests": {
-            "sha256": "6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec",  # noqa
-        }
-    }
+
+    class artifact_metadata:
+        sha256 = "6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec"
 
     known_artifacts = {
         hash_to_bytes("b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92"): {
@@ -812,20 +806,6 @@
     )
 
 
-def test_pypi_artifact_to_revision_id_failures():
-    with pytest.raises(KeyError, match="sha256"):
-        artifact_metadata = {
-            "digests": {},
-        }
-        assert artifact_to_revision_id({}, artifact_metadata)
-
-    with pytest.raises(KeyError, match="digests"):
-        artifact_metadata = {
-            "something": "wrong",
-        }
-        assert artifact_to_revision_id({}, artifact_metadata)
-
-
 def test_pypi_artifact_with_no_intrinsic_metadata(swh_config, requests_mock_datadir):
     """Skip artifact with no intrinsic metadata during ingestion
 
diff --git a/swh/loader/package/tests/test_loader.py b/swh/loader/package/tests/test_loader.py
--- a/swh/loader/package/tests/test_loader.py
+++ b/swh/loader/package/tests/test_loader.py
@@ -3,7 +3,9 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-from swh.loader.package.loader import PackageLoader
+import attr
+
+from swh.loader.package.loader import BasePackageInfo, PackageLoader
 
 
 class FakeStorage:
@@ -36,3 +38,32 @@
 
     actual_load_status2 = loader.load()
     assert actual_load_status2 == {"status": "failed"}
+
+
+def test_artifact_identity():
+    """Computing the primary key should return the right identity
+
+    """
+
+    @attr.s
+    class TestPackageInfo(BasePackageInfo):
+        a = attr.ib()
+        b = attr.ib()
+        length = attr.ib()
+        filename = attr.ib()
+        version = attr.ib()
+
+        ID_KEYS = ["a", "b"]
+
+    p_info = TestPackageInfo(
+        url="http://example.org/",
+        raw={},
+        a=1,
+        b=2,
+        length=221837,
+        filename="8sync-0.1.0.tar.gz",
+        version="0.1.0",
+    )
+
+    actual_id = p_info.artifact_identity()
+    assert actual_id == [1, 2]
diff --git a/swh/loader/package/tests/test_utils.py b/swh/loader/package/tests/test_utils.py
--- a/swh/loader/package/tests/test_utils.py
+++ b/swh/loader/package/tests/test_utils.py
@@ -9,7 +9,7 @@
 
 import swh.loader.package
 
-from swh.loader.package.utils import download, api_info, release_name, artifact_identity
+from swh.loader.package.utils import download, api_info, release_name
 
 
 def test_version_generation():
@@ -156,24 +156,3 @@
         ("0.0.2", "something", "releases/0.0.2/something"),
     ]:
         assert release_name(version, filename) == expected_release
-
-
-def test_artifact_identity():
-    """Compute primary key should return the right identity
-
-    """
-    data = {
-        "a": 1,
-        "b": 2,
-        "length": 221837,
-        "filename": "8sync-0.1.0.tar.gz",
-        "version": "0.1.0",
-    }
-
-    for id_keys, expected_id in [
-        (["a", "b"], [1, 2]),
-        ([], []),
-        (["a", "key-that-does-not-exist"], [1, None]),
-    ]:
-        actual_id = artifact_identity(data, id_keys=id_keys)
-        assert actual_id == expected_id
diff --git a/swh/loader/package/utils.py b/swh/loader/package/utils.py
--- a/swh/loader/package/utils.py
+++ b/swh/loader/package/utils.py
@@ -8,7 +8,7 @@
 import os
 import requests
 
-from typing import Any, Dict, List, Mapping, Optional, Sequence, Tuple
+from typing import Dict, Optional, Tuple
 
 from swh.model.hashutil import MultiHash, HASH_BLOCK_SIZE
 from swh.model.model import Person
@@ -121,18 +121,3 @@
     if filename:
         return "releases/%s/%s" % (version, filename)
     return "releases/%s" % version
-
-
-def artifact_identity(d: Mapping[str, Any], id_keys: Sequence[str]) -> List[Any]:
-    """Compute the primary key for a dict using the id_keys as primary key
-    composite.
-
-    Args:
-        d: A dict entry to compute the primary key on
-        id_keys: Sequence of keys to use as primary key
-
-    Returns:
-        The identity for that dict entry
-
-    """
-    return [d.get(k) for k in id_keys]
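
An illustrative sketch (not part of the patch) of why the stub classes in the npm and pypi tests above suffice: after this refactoring, the resolution helpers only read attributes off the package info object, so any object exposing the right attribute can stand in. The value is made up, and a lookup with no usable known artifact is assumed to resolve to None, as the first npm test above exercises:

from swh.loader.package.npm.loader import artifact_to_revision_id


class stub_p_info:  # duck-typed stand-in for NpmPackageInfo
    shasum = "05181c12cd8c22035dd31155656826b85745da37"


# No known artifact carries an "original_artifact" entry, so no revision
# can be reused and the helper returns None:
assert artifact_to_revision_id({"some-revision-id": {}}, stub_p_info) is None
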