diff --git a/swh/loader/package/archive/loader.py b/swh/loader/package/archive/loader.py --- a/swh/loader/package/archive/loader.py +++ b/swh/loader/package/archive/loader.py @@ -4,8 +4,10 @@ # See top-level LICENSE file for more information import datetime +import hashlib import logging from os import path +import string from typing import Any, Dict, Iterator, Optional, Sequence, Tuple, Union import attr @@ -40,14 +42,19 @@ """Timestamp of the archive file on the server""" version = attr.ib(type=str) - # default keys for gnu - ID_KEYS = ["time", "url", "length", "version"] + # default format for gnu + MANIFEST_FORMAT = string.Template("$time $length $version $url") - def artifact_identity(self, id_keys=None): - if id_keys is None: - id_keys = self.ID_KEYS + def extid(self, manifest_format: Optional[string.Template] = None) -> bytes: + """Returns a unique intrinsic identifier of this package info + + ``manifest_format`` allows overriding the class' default MANIFEST_FORMAT""" + manifest_format = manifest_format or self.MANIFEST_FORMAT # TODO: use parsed attributes instead of self.raw_info - return [self.raw_info.get(k) for k in id_keys] + manifest = manifest_format.substitute( + {k: str(v) for (k, v) in self.raw_info.items()} + ) + return hashlib.sha256(manifest.encode()).digest() @classmethod def from_metadata(cls, a_metadata: Dict[str, Any]) -> "ArchivePackageInfo": @@ -75,10 +82,10 @@ storage: StorageInterface, url: str, artifacts: Sequence[Dict[str, Any]], - identity_artifact_keys: Optional[Sequence[str]] = None, + extid_manifest_format: Optional[str] = None, max_content_size: Optional[int] = None, ): - """Loader constructor. + f"""Loader constructor. For now, this is the lister's task output. @@ -97,13 +104,18 @@ - **length**: artifact's length - identity_artifact_keys: Optional List of keys forming the - "identity" of an artifact + extid_manifest_format: template string used to format a manifest, + which is hashed to get the extid of a package. + Defaults to {ArchivePackageInfo.MANIFEST_FORMAT!r} """ super().__init__(storage=storage, url=url, max_content_size=max_content_size) self.artifacts = artifacts # assume order is enforced in the lister - self.identity_artifact_keys = identity_artifact_keys + self.extid_manifest_format = ( + None + if extid_manifest_format is None + else string.Template(extid_manifest_format) + ) def get_versions(self) -> Sequence[str]: versions = [] @@ -127,20 +139,19 @@ # versioned package yield release_name(version), p_info + def extid_from_reference_artifact(self, reference_artifact: Dict) -> bytes: + reference_artifact_info = ArchivePackageInfo.from_metadata(reference_artifact) + return reference_artifact_info.extid(manifest_format=self.extid_manifest_format) + def resolve_revision_from( self, known_artifacts: Dict, p_info: ArchivePackageInfo ) -> Optional[bytes]: - identity = p_info.artifact_identity(id_keys=self.identity_artifact_keys) + extid = p_info.extid(manifest_format=self.extid_manifest_format) for rev_id, known_artifact in known_artifacts.items(): logging.debug("known_artifact: %s", known_artifact) reference_artifact = known_artifact["extrinsic"]["raw"] - reference_artifact_info = ArchivePackageInfo.from_metadata( - reference_artifact - ) - known_identity = reference_artifact_info.artifact_identity( - id_keys=self.identity_artifact_keys - ) - if identity == known_identity: + known_extid = self.extid_from_reference_artifact(reference_artifact) + if extid == known_extid: return rev_id return None diff --git a/swh/loader/package/archive/tests/test_archive.py b/swh/loader/package/archive/tests/test_archive.py --- a/swh/loader/package/archive/tests/test_archive.py +++ b/swh/loader/package/archive/tests/test_archive.py @@ -3,7 +3,11 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import hashlib +import string + import attr +import pytest from swh.loader.package.archive.loader import ArchiveLoader, ArchivePackageInfo from swh.loader.package.tests.common import check_metadata_paths @@ -319,7 +323,7 @@ swh_storage, url, artifacts=artifacts, - identity_artifact_keys=["sha256", "length", "url"], + extid_manifest_format="$sha256 $length $url", ) actual_load_status = loader.load() @@ -340,7 +344,7 @@ assert len(urls) == 1 -def test_archive_artifact_identity(): +def test_archive_extid(): """Compute primary key should return the right identity """ @@ -356,19 +360,13 @@ raw_info={**metadata, "a": 1, "b": 2}, a=1, b=2, **metadata, ) - for id_keys, expected_id in [ - (["a", "b"], [1, 2]), - ([], []), - (["a", "key-that-does-not-exist"], [1, None]), - ( - None, - [ - metadata["time"], - metadata["url"], - metadata["length"], - metadata["version"], - ], - ), + for manifest_format, expected_manifest in [ + (string.Template("$a $b"), b"1 2"), + (string.Template(""), b""), + (None, "{time} {length} {version} {url}".format(**metadata).encode()), ]: - actual_id = p_info.artifact_identity(id_keys=id_keys) - assert actual_id == expected_id + actual_id = p_info.extid(manifest_format=manifest_format) + assert actual_id == hashlib.sha256(expected_manifest).digest() + + with pytest.raises(KeyError): + p_info.extid(manifest_format=string.Template("$a $unknown_key")) diff --git a/swh/loader/package/cran/loader.py b/swh/loader/package/cran/loader.py --- a/swh/loader/package/cran/loader.py +++ b/swh/loader/package/cran/loader.py @@ -9,6 +9,7 @@ import os from os import path import re +import string from typing import Any, Dict, Iterator, List, Mapping, Optional, Tuple import attr @@ -37,7 +38,7 @@ raw_info = attr.ib(type=Dict[str, Any]) version = attr.ib(type=str) - ID_KEYS = ["url", "version"] + MANIFEST_FORMAT = string.Template("$version $url") @classmethod def from_metadata(cls, a_metadata: Dict[str, Any]) -> "CRANPackageInfo": @@ -86,6 +87,9 @@ if version == p_info.version: yield release_name(version), p_info + def extid_from_known_artifact(self, known_artifact: Dict) -> bytes: + return CRANPackageInfo.from_metadata(known_artifact).extid() + def resolve_revision_from( self, known_artifacts: Mapping[bytes, Mapping], p_info: CRANPackageInfo, ) -> Optional[bytes]: @@ -93,14 +97,12 @@ artifact_metadata """ - new_identity = p_info.artifact_identity() + new_extid = p_info.extid() for rev_id, known_artifact_meta in known_artifacts.items(): logging.debug("known_artifact_meta: %s", known_artifact_meta) known_artifact = known_artifact_meta["extrinsic"]["raw"] - known_identity = CRANPackageInfo.from_metadata( - known_artifact - ).artifact_identity() - if new_identity == known_identity: + known_extid = self.extid_from_known_artifact(known_artifact) + if new_extid == known_extid: return rev_id return None diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py --- a/swh/loader/package/loader.py +++ b/swh/loader/package/loader.py @@ -4,10 +4,12 @@ # See top-level LICENSE file for more information import datetime +import hashlib from itertools import islice import json import logging import os +import string import sys import tempfile from typing import ( @@ -111,11 +113,20 @@ # TODO: add support for metadata for directories and contents @property - def ID_KEYS(self): - raise NotImplementedError(f"{self.__class__.__name__} is missing ID_KEYS") + def MANIFEST_FORMAT(self) -> string.Template: + """A string.Template object used to format a manifest, which is hashed + to get the extid of this package info object""" + raise NotImplementedError( + f"{self.__class__.__name__} is missing MANIFEST_FORMAT " + f"or an override of extid()" + ) - def artifact_identity(self): - return [getattr(self, k) for k in self.ID_KEYS] + def extid(self) -> bytes: + """Returns a unique intrinsic identifier of this package info""" + manifest = self.MANIFEST_FORMAT.substitute( + {k: str(v) for (k, v) in attr.asdict(self).items()} + ) + return hashlib.sha256(manifest.encode()).digest() TPackageInfo = TypeVar("TPackageInfo", bound=BasePackageInfo) diff --git a/swh/loader/package/tests/test_loader.py b/swh/loader/package/tests/test_loader.py --- a/swh/loader/package/tests/test_loader.py +++ b/swh/loader/package/tests/test_loader.py @@ -3,6 +3,9 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import hashlib +import string + import attr import pytest @@ -41,7 +44,7 @@ assert actual_load_status2 == {"status": "failed"} -def test_artifact_identity(): +def test_extid(): """Compute primary key should return the right identity """ @@ -54,7 +57,7 @@ filename = attr.ib() version = attr.ib() - ID_KEYS = ["a", "b"] + MANIFEST_FORMAT = string.Template("$a $b") p_info = TestPackageInfo( url="http://example.org/", @@ -65,8 +68,8 @@ version="0.1.0", ) - actual_id = p_info.artifact_identity() - assert actual_id == [1, 2] + actual_id = p_info.extid() + assert actual_id == hashlib.sha256(b"1 2").digest() def test_no_env_swh_config_filename_raise(monkeypatch):