Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9313679
D5289.id19000.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
10 KB
Subscribers
None
D5289.id19000.diff
View Options
diff --git a/swh/loader/package/archive/loader.py b/swh/loader/package/archive/loader.py
--- a/swh/loader/package/archive/loader.py
+++ b/swh/loader/package/archive/loader.py
@@ -4,8 +4,10 @@
# See top-level LICENSE file for more information
import datetime
+import hashlib
import logging
from os import path
+import string
from typing import Any, Dict, Iterator, Optional, Sequence, Tuple, Union
import attr
@@ -40,14 +42,19 @@
"""Timestamp of the archive file on the server"""
version = attr.ib(type=str)
- # default keys for gnu
- ID_KEYS = ["time", "url", "length", "version"]
+ # default format for gnu
+ MANIFEST_FORMAT = string.Template("$time $length $version $url")
- def artifact_identity(self, id_keys=None):
- if id_keys is None:
- id_keys = self.ID_KEYS
+ def extid(self, manifest_format: Optional[string.Template] = None) -> bytes:
+ """Returns a unique intrinsic identifier of this package info
+
+ ``manifest_format`` allows overriding the class' default MANIFEST_FORMAT"""
+ manifest_format = manifest_format or self.MANIFEST_FORMAT
# TODO: use parsed attributes instead of self.raw_info
- return [self.raw_info.get(k) for k in id_keys]
+ manifest = manifest_format.substitute(
+ {k: str(v) for (k, v) in self.raw_info.items()}
+ )
+ return hashlib.sha256(manifest.encode()).digest()
@classmethod
def from_metadata(cls, a_metadata: Dict[str, Any]) -> "ArchivePackageInfo":
@@ -75,10 +82,10 @@
storage: StorageInterface,
url: str,
artifacts: Sequence[Dict[str, Any]],
- identity_artifact_keys: Optional[Sequence[str]] = None,
+ extid_manifest_format: Optional[str] = None,
max_content_size: Optional[int] = None,
):
- """Loader constructor.
+ """Loader constructor.
For now, this is the lister's task output.
@@ -97,13 +104,18 @@
- **length**: artifact's length
- identity_artifact_keys: Optional List of keys forming the
- "identity" of an artifact
+ extid_manifest_format: template string used to format a manifest,
+ which is hashed to get the extid of a package.
+ Defaults to ``"$time $length $version $url"`` (ArchivePackageInfo.MANIFEST_FORMAT)
"""
super().__init__(storage=storage, url=url, max_content_size=max_content_size)
self.artifacts = artifacts # assume order is enforced in the lister
- self.identity_artifact_keys = identity_artifact_keys
+ self.extid_manifest_format = (
+ None
+ if extid_manifest_format is None
+ else string.Template(extid_manifest_format)
+ )
def get_versions(self) -> Sequence[str]:
versions = []
@@ -127,20 +139,19 @@
# versioned package
yield release_name(version), p_info
+ def extid_from_reference_artifact(self, reference_artifact: Dict) -> bytes:
+ reference_artifact_info = ArchivePackageInfo.from_metadata(reference_artifact)
+ return reference_artifact_info.extid(manifest_format=self.extid_manifest_format)
+
def resolve_revision_from(
self, known_artifacts: Dict, p_info: ArchivePackageInfo
) -> Optional[bytes]:
- identity = p_info.artifact_identity(id_keys=self.identity_artifact_keys)
+ extid = p_info.extid(manifest_format=self.extid_manifest_format)
for rev_id, known_artifact in known_artifacts.items():
logging.debug("known_artifact: %s", known_artifact)
reference_artifact = known_artifact["extrinsic"]["raw"]
- reference_artifact_info = ArchivePackageInfo.from_metadata(
- reference_artifact
- )
- known_identity = reference_artifact_info.artifact_identity(
- id_keys=self.identity_artifact_keys
- )
- if identity == known_identity:
+ known_extid = self.extid_from_reference_artifact(reference_artifact)
+ if extid == known_extid:
return rev_id
return None
diff --git a/swh/loader/package/archive/tests/test_archive.py b/swh/loader/package/archive/tests/test_archive.py
--- a/swh/loader/package/archive/tests/test_archive.py
+++ b/swh/loader/package/archive/tests/test_archive.py
@@ -3,7 +3,11 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+import hashlib
+import string
+
import attr
+import pytest
from swh.loader.package.archive.loader import ArchiveLoader, ArchivePackageInfo
from swh.loader.package.tests.common import check_metadata_paths
@@ -319,7 +323,7 @@
swh_storage,
url,
artifacts=artifacts,
- identity_artifact_keys=["sha256", "length", "url"],
+ extid_manifest_format="$sha256 $length $url",
)
actual_load_status = loader.load()
@@ -340,7 +344,7 @@
assert len(urls) == 1
-def test_archive_artifact_identity():
+def test_archive_extid():
"""Compute primary key should return the right identity
"""
@@ -356,19 +360,13 @@
raw_info={**metadata, "a": 1, "b": 2}, a=1, b=2, **metadata,
)
- for id_keys, expected_id in [
- (["a", "b"], [1, 2]),
- ([], []),
- (["a", "key-that-does-not-exist"], [1, None]),
- (
- None,
- [
- metadata["time"],
- metadata["url"],
- metadata["length"],
- metadata["version"],
- ],
- ),
+ for manifest_format, expected_manifest in [
+ (string.Template("$a $b"), b"1 2"),
+ (string.Template(""), b""),
+ (None, "{time} {length} {version} {url}".format(**metadata).encode()),
]:
- actual_id = p_info.artifact_identity(id_keys=id_keys)
- assert actual_id == expected_id
+ actual_id = p_info.extid(manifest_format=manifest_format)
+ assert actual_id == hashlib.sha256(expected_manifest).digest()
+
+ with pytest.raises(KeyError):
+ p_info.extid(manifest_format=string.Template("$a $unknown_key"))
diff --git a/swh/loader/package/cran/loader.py b/swh/loader/package/cran/loader.py
--- a/swh/loader/package/cran/loader.py
+++ b/swh/loader/package/cran/loader.py
@@ -9,6 +9,7 @@
import os
from os import path
import re
+import string
from typing import Any, Dict, Iterator, List, Mapping, Optional, Tuple
import attr
@@ -37,7 +38,7 @@
raw_info = attr.ib(type=Dict[str, Any])
version = attr.ib(type=str)
- ID_KEYS = ["url", "version"]
+ MANIFEST_FORMAT = string.Template("$version $url")
@classmethod
def from_metadata(cls, a_metadata: Dict[str, Any]) -> "CRANPackageInfo":
@@ -86,6 +87,9 @@
if version == p_info.version:
yield release_name(version), p_info
+ def extid_from_known_artifact(self, known_artifact: Dict) -> bytes:
+ return CRANPackageInfo.from_metadata(known_artifact).extid()
+
def resolve_revision_from(
self, known_artifacts: Mapping[bytes, Mapping], p_info: CRANPackageInfo,
) -> Optional[bytes]:
@@ -93,14 +97,12 @@
artifact_metadata
"""
- new_identity = p_info.artifact_identity()
+ new_extid = p_info.extid()
for rev_id, known_artifact_meta in known_artifacts.items():
logging.debug("known_artifact_meta: %s", known_artifact_meta)
known_artifact = known_artifact_meta["extrinsic"]["raw"]
- known_identity = CRANPackageInfo.from_metadata(
- known_artifact
- ).artifact_identity()
- if new_identity == known_identity:
+ known_extid = self.extid_from_known_artifact(known_artifact)
+ if new_extid == known_extid:
return rev_id
return None
diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py
--- a/swh/loader/package/loader.py
+++ b/swh/loader/package/loader.py
@@ -4,10 +4,12 @@
# See top-level LICENSE file for more information
import datetime
+import hashlib
from itertools import islice
import json
import logging
import os
+import string
import sys
import tempfile
from typing import (
@@ -111,11 +113,20 @@
# TODO: add support for metadata for directories and contents
@property
- def ID_KEYS(self):
- raise NotImplementedError(f"{self.__class__.__name__} is missing ID_KEYS")
+ def MANIFEST_FORMAT(self) -> string.Template:
+ """A string.Template object used to format a manifest, which is hashed
+ to get the extid of this package info object"""
+ raise NotImplementedError(
+ f"{self.__class__.__name__} is missing MANIFEST_FORMAT "
+ f"or an override of extid()"
+ )
- def artifact_identity(self):
- return [getattr(self, k) for k in self.ID_KEYS]
+ def extid(self) -> bytes:
+ """Returns a unique intrinsic identifier of this package info"""
+ manifest = self.MANIFEST_FORMAT.substitute(
+ {k: str(v) for (k, v) in attr.asdict(self).items()}
+ )
+ return hashlib.sha256(manifest.encode()).digest()
TPackageInfo = TypeVar("TPackageInfo", bound=BasePackageInfo)
diff --git a/swh/loader/package/tests/test_loader.py b/swh/loader/package/tests/test_loader.py
--- a/swh/loader/package/tests/test_loader.py
+++ b/swh/loader/package/tests/test_loader.py
@@ -3,6 +3,9 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+import hashlib
+import string
+
import attr
import pytest
@@ -41,7 +44,7 @@
assert actual_load_status2 == {"status": "failed"}
-def test_artifact_identity():
+def test_extid():
"""Compute primary key should return the right identity
"""
@@ -54,7 +57,7 @@
filename = attr.ib()
version = attr.ib()
- ID_KEYS = ["a", "b"]
+ MANIFEST_FORMAT = string.Template("$a $b")
p_info = TestPackageInfo(
url="http://example.org/",
@@ -65,8 +68,8 @@
version="0.1.0",
)
- actual_id = p_info.artifact_identity()
- assert actual_id == [1, 2]
+ actual_id = p_info.extid()
+ assert actual_id == hashlib.sha256(b"1 2").digest()
def test_no_env_swh_config_filename_raise(monkeypatch):
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Wed, Jul 2, 11:50 AM (4 d, 11 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3222405
Attached To
D5289: archive, cran: Replace 'artifact_identity' with extid to detect known packages
Event Timeline
Log In to Comment