Page MenuHomeSoftware Heritage

D5289.id19000.diff
No OneTemporary

D5289.id19000.diff

diff --git a/swh/loader/package/archive/loader.py b/swh/loader/package/archive/loader.py
--- a/swh/loader/package/archive/loader.py
+++ b/swh/loader/package/archive/loader.py
@@ -4,8 +4,10 @@
# See top-level LICENSE file for more information
import datetime
+import hashlib
import logging
from os import path
+import string
from typing import Any, Dict, Iterator, Optional, Sequence, Tuple, Union
import attr
@@ -40,14 +42,19 @@
"""Timestamp of the archive file on the server"""
version = attr.ib(type=str)
- # default keys for gnu
- ID_KEYS = ["time", "url", "length", "version"]
+ # default format for gnu
+ MANIFEST_FORMAT = string.Template("$time $length $version $url")
- def artifact_identity(self, id_keys=None):
- if id_keys is None:
- id_keys = self.ID_KEYS
+ def extid(self, manifest_format: Optional[string.Template] = None) -> bytes:
+ """Returns a unique intrinsic identifier of this package info
+
+ ``manifest_format`` allows overriding the class' default MANIFEST_FORMAT"""
+ manifest_format = manifest_format or self.MANIFEST_FORMAT
# TODO: use parsed attributes instead of self.raw_info
- return [self.raw_info.get(k) for k in id_keys]
+ manifest = manifest_format.substitute(
+ {k: str(v) for (k, v) in self.raw_info.items()}
+ )
+ return hashlib.sha256(manifest.encode()).digest()
@classmethod
def from_metadata(cls, a_metadata: Dict[str, Any]) -> "ArchivePackageInfo":
@@ -75,10 +82,10 @@
storage: StorageInterface,
url: str,
artifacts: Sequence[Dict[str, Any]],
- identity_artifact_keys: Optional[Sequence[str]] = None,
+ extid_manifest_format: Optional[str] = None,
max_content_size: Optional[int] = None,
):
- """Loader constructor.
+ f"""Loader constructor.
For now, this is the lister's task output.
@@ -97,13 +104,18 @@
- **length**: artifact's length
- identity_artifact_keys: Optional List of keys forming the
- "identity" of an artifact
+ extid_manifest_format: template string used to format a manifest,
+ which is hashed to get the extid of a package.
+ Defaults to {ArchivePackageInfo.MANIFEST_FORMAT!r}
"""
super().__init__(storage=storage, url=url, max_content_size=max_content_size)
self.artifacts = artifacts # assume order is enforced in the lister
- self.identity_artifact_keys = identity_artifact_keys
+ self.extid_manifest_format = (
+ None
+ if extid_manifest_format is None
+ else string.Template(extid_manifest_format)
+ )
def get_versions(self) -> Sequence[str]:
versions = []
@@ -127,20 +139,19 @@
# versioned package
yield release_name(version), p_info
+ def extid_from_reference_artifact(self, reference_artifact: Dict) -> bytes:
+ reference_artifact_info = ArchivePackageInfo.from_metadata(reference_artifact)
+ return reference_artifact_info.extid(manifest_format=self.extid_manifest_format)
+
def resolve_revision_from(
self, known_artifacts: Dict, p_info: ArchivePackageInfo
) -> Optional[bytes]:
- identity = p_info.artifact_identity(id_keys=self.identity_artifact_keys)
+ extid = p_info.extid(manifest_format=self.extid_manifest_format)
for rev_id, known_artifact in known_artifacts.items():
logging.debug("known_artifact: %s", known_artifact)
reference_artifact = known_artifact["extrinsic"]["raw"]
- reference_artifact_info = ArchivePackageInfo.from_metadata(
- reference_artifact
- )
- known_identity = reference_artifact_info.artifact_identity(
- id_keys=self.identity_artifact_keys
- )
- if identity == known_identity:
+ known_extid = self.extid_from_reference_artifact(reference_artifact)
+ if extid == known_extid:
return rev_id
return None
diff --git a/swh/loader/package/archive/tests/test_archive.py b/swh/loader/package/archive/tests/test_archive.py
--- a/swh/loader/package/archive/tests/test_archive.py
+++ b/swh/loader/package/archive/tests/test_archive.py
@@ -3,7 +3,11 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+import hashlib
+import string
+
import attr
+import pytest
from swh.loader.package.archive.loader import ArchiveLoader, ArchivePackageInfo
from swh.loader.package.tests.common import check_metadata_paths
@@ -319,7 +323,7 @@
swh_storage,
url,
artifacts=artifacts,
- identity_artifact_keys=["sha256", "length", "url"],
+ extid_manifest_format="$sha256 $length $url",
)
actual_load_status = loader.load()
@@ -340,7 +344,7 @@
assert len(urls) == 1
-def test_archive_artifact_identity():
+def test_archive_extid():
"""Compute primary key should return the right identity
"""
@@ -356,19 +360,13 @@
raw_info={**metadata, "a": 1, "b": 2}, a=1, b=2, **metadata,
)
- for id_keys, expected_id in [
- (["a", "b"], [1, 2]),
- ([], []),
- (["a", "key-that-does-not-exist"], [1, None]),
- (
- None,
- [
- metadata["time"],
- metadata["url"],
- metadata["length"],
- metadata["version"],
- ],
- ),
+ for manifest_format, expected_manifest in [
+ (string.Template("$a $b"), b"1 2"),
+ (string.Template(""), b""),
+ (None, "{time} {length} {version} {url}".format(**metadata).encode()),
]:
- actual_id = p_info.artifact_identity(id_keys=id_keys)
- assert actual_id == expected_id
+ actual_id = p_info.extid(manifest_format=manifest_format)
+ assert actual_id == hashlib.sha256(expected_manifest).digest()
+
+ with pytest.raises(KeyError):
+ p_info.extid(manifest_format=string.Template("$a $unknown_key"))
diff --git a/swh/loader/package/cran/loader.py b/swh/loader/package/cran/loader.py
--- a/swh/loader/package/cran/loader.py
+++ b/swh/loader/package/cran/loader.py
@@ -9,6 +9,7 @@
import os
from os import path
import re
+import string
from typing import Any, Dict, Iterator, List, Mapping, Optional, Tuple
import attr
@@ -37,7 +38,7 @@
raw_info = attr.ib(type=Dict[str, Any])
version = attr.ib(type=str)
- ID_KEYS = ["url", "version"]
+ MANIFEST_FORMAT = string.Template("$version $url")
@classmethod
def from_metadata(cls, a_metadata: Dict[str, Any]) -> "CRANPackageInfo":
@@ -86,6 +87,9 @@
if version == p_info.version:
yield release_name(version), p_info
+ def extid_from_known_artifact(self, known_artifact: Dict) -> bytes:
+ return CRANPackageInfo.from_metadata(known_artifact).extid()
+
def resolve_revision_from(
self, known_artifacts: Mapping[bytes, Mapping], p_info: CRANPackageInfo,
) -> Optional[bytes]:
@@ -93,14 +97,12 @@
artifact_metadata
"""
- new_identity = p_info.artifact_identity()
+ new_extid = p_info.extid()
for rev_id, known_artifact_meta in known_artifacts.items():
logging.debug("known_artifact_meta: %s", known_artifact_meta)
known_artifact = known_artifact_meta["extrinsic"]["raw"]
- known_identity = CRANPackageInfo.from_metadata(
- known_artifact
- ).artifact_identity()
- if new_identity == known_identity:
+ known_extid = self.extid_from_known_artifact(known_artifact)
+ if new_extid == known_extid:
return rev_id
return None
diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py
--- a/swh/loader/package/loader.py
+++ b/swh/loader/package/loader.py
@@ -4,10 +4,12 @@
# See top-level LICENSE file for more information
import datetime
+import hashlib
from itertools import islice
import json
import logging
import os
+import string
import sys
import tempfile
from typing import (
@@ -111,11 +113,20 @@
# TODO: add support for metadata for directories and contents
@property
- def ID_KEYS(self):
- raise NotImplementedError(f"{self.__class__.__name__} is missing ID_KEYS")
+ def MANIFEST_FORMAT(self) -> string.Template:
+ """A string.Template object used to format a manifest, which is hashed
+ to get the extid of this package info object"""
+ raise NotImplementedError(
+ f"{self.__class__.__name__} is missing MANIFEST_FORMAT "
+ f"or an override of extid()"
+ )
- def artifact_identity(self):
- return [getattr(self, k) for k in self.ID_KEYS]
+ def extid(self) -> bytes:
+ """Returns a unique intrinsic identifier of this package info"""
+ manifest = self.MANIFEST_FORMAT.substitute(
+ {k: str(v) for (k, v) in attr.asdict(self).items()}
+ )
+ return hashlib.sha256(manifest.encode()).digest()
TPackageInfo = TypeVar("TPackageInfo", bound=BasePackageInfo)
diff --git a/swh/loader/package/tests/test_loader.py b/swh/loader/package/tests/test_loader.py
--- a/swh/loader/package/tests/test_loader.py
+++ b/swh/loader/package/tests/test_loader.py
@@ -3,6 +3,9 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+import hashlib
+import string
+
import attr
import pytest
@@ -41,7 +44,7 @@
assert actual_load_status2 == {"status": "failed"}
-def test_artifact_identity():
+def test_extid():
"""Compute primary key should return the right identity
"""
@@ -54,7 +57,7 @@
filename = attr.ib()
version = attr.ib()
- ID_KEYS = ["a", "b"]
+ MANIFEST_FORMAT = string.Template("$a $b")
p_info = TestPackageInfo(
url="http://example.org/",
@@ -65,8 +68,8 @@
version="0.1.0",
)
- actual_id = p_info.artifact_identity()
- assert actual_id == [1, 2]
+ actual_id = p_info.extid()
+ assert actual_id == hashlib.sha256(b"1 2").digest()
def test_no_env_swh_config_filename_raise(monkeypatch):

File Metadata

Mime Type
text/plain
Expires
Wed, Jul 2, 11:50 AM (4 d, 11 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3222405

Event Timeline