Page MenuHomeSoftware Heritage

D3613.diff
No OneTemporary

D3613.diff

diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py
--- a/swh/loader/package/loader.py
+++ b/swh/loader/package/loader.py
@@ -7,7 +7,7 @@
import logging
import tempfile
import os
-
+import sys
from typing import (
Any,
Dict,
@@ -38,7 +38,12 @@
Origin,
OriginVisit,
OriginVisitStatus,
+ MetadataAuthority,
+ MetadataFetcher,
+ MetadataTargetType,
+ RawExtrinsicMetadata,
)
+from swh.model.identifiers import SWHID
from swh.storage import get_storage
from swh.storage.utils import now
from swh.storage.algos.snapshot import snapshot_get_latest
@@ -66,6 +71,19 @@
url = attr.ib(type=str)
filename = attr.ib(type=Optional[str])
+ # The following attribute has kw_only=True in order to allow subclasses
+ # to add attributes. Without kw_only, attributes without default values cannot
+ # go after attributes with default values.
+ # See <https://github.com/python-attrs/attrs/issues/38>
+
+    # ``factory=list`` gives every instance its own list; a literal ``[]``
+    # default would be a single list object shared by all instances.
+    revision_extrinsic_metadata = attr.ib(
+        type=List[Tuple[datetime.datetime, str, bytes]], factory=list, kw_only=True,
+    )
+    """Tuple elements are respectively the 'discovery_date', 'format',
+    and 'metadata' fields of RawExtrinsicMetadata"""
+
+ # TODO: add support for metadata for origins, directories, and contents
+
@property
def ID_KEYS(self):
raise NotImplementedError(f"{self.__class__.__name__} is missing ID_KEYS")
@@ -81,6 +99,11 @@
# Origin visit type (str) set by the loader
visit_type = ""
+ # Default configuration entries, each given as a (type name, default value)
+ # pair. Presumably these flags control whether the loader registers metadata
+ # authorities/fetchers in storage before writing extrinsic metadata --
+ # TODO confirm against the code that reads them.
+ DEFAULT_CONFIG = {
+ "create_authorities": ("bool", True),
+ "create_fetchers": ("bool", True),
+ }
+
def __init__(self, url):
"""Loader's constructor. This raises exception if the minimal required
configuration is missing (cf. fn:`check` method).
@@ -367,6 +390,8 @@
if revision_id is None:
try:
revision_id = self._load_revision(p_info, origin)
+ if revision_id:
+ self._load_extrinsic_revision_metadata(p_info, revision_id)
self.storage.flush()
status_load = "eventful"
except Exception as e:
@@ -517,3 +542,97 @@
self.storage.snapshot_add([snapshot])
return snapshot
+
+    def get_loader_name(self) -> str:
+        """Return this loader's fully qualified name, as ``<module>.<class>``."""
+        loader_cls = self.__class__
+        return f"{loader_cls.__module__}.{loader_cls.__name__}"
+
+    def get_loader_version(self) -> str:
+        """Return the version of the package this loader belongs to.
+
+        Starts at the loader's own module and walks up through its parent
+        packages, returning the first ``__version__`` attribute found.
+
+        Raises:
+            NotImplementedError: if neither the module nor any of its parent
+                packages exposes ``__version__``; such loaders must override
+                this method.
+        """
+        module_name = self.__class__.__module__ or ""
+        module_name_parts = module_name.split(".")
+
+        # Iterate rootward through the package hierarchy until we find a parent of this
+        # loader's module with a __version__ attribute.
+        for prefix_size in range(len(module_name_parts), 0, -1):
+            package_name = ".".join(module_name_parts[:prefix_size])
+            # Use .get(): a name missing from sys.modules (empty __module__,
+            # dynamically created class, ...) must fall through to the
+            # NotImplementedError below rather than leak a bare KeyError.
+            module = sys.modules.get(package_name)
+            if hasattr(module, "__version__"):
+                return module.__version__  # type: ignore
+
+        # If this loader's class has no parent package with a __version__,
+        # it should implement it itself.
+        raise NotImplementedError(
+            f"Could not dynamically find the version of {self.get_loader_name()}."
+        )
+
+    def get_metadata_fetcher(self) -> MetadataFetcher:
+        """Build the MetadataFetcher instance representing this package loader.
+
+        It is used for adding provenance information to extracted extrinsic
+        metadata, if any."""
+        loader_name = self.get_loader_name()
+        loader_version = self.get_loader_version()
+        return MetadataFetcher(name=loader_name, version=loader_version, metadata={})
+
+    def get_metadata_authority(self) -> MetadataAuthority:
+        """Return the authority that the extrinsic metadata come from.
+
+        Package loaders that collect extrinsic metadata are expected to
+        override this; the base implementation always raises
+        NotImplementedError.
+        """
+        raise NotImplementedError("get_metadata_authority")
+
+    def build_extrinsic_revision_metadata(
+        self, p_info: TPackageInfo, revision_id: Sha1Git
+    ) -> List[RawExtrinsicMetadata]:
+        """Turn the raw metadata tuples of ``p_info`` into RawExtrinsicMetadata.
+
+        Each ``(discovery_date, format, metadata)`` tuple found in
+        ``p_info.revision_extrinsic_metadata`` becomes one object targeting
+        the revision ``revision_id``, with ``self.url`` as its origin.
+        Returns an empty list when ``p_info`` carries no such tuples.
+        """
+        raw_entries = p_info.revision_extrinsic_metadata
+        if not raw_entries:
+            # Bail out before asking for the authority, so that loaders which
+            # never write metadata need not implement get_metadata_authority.
+            return []
+
+        authority = self.get_metadata_authority()
+        fetcher = self.get_metadata_fetcher()
+
+        return [
+            RawExtrinsicMetadata(
+                type=MetadataTargetType.REVISION,
+                id=SWHID(object_type="revision", object_id=revision_id),
+                discovery_date=discovery_date,
+                authority=authority,
+                fetcher=fetcher,
+                format=metadata_format,
+                metadata=raw_metadata,
+                origin=self.url,
+            )
+            for (discovery_date, metadata_format, raw_metadata) in raw_entries
+        ]
+
+    def _load_extrinsic_revision_metadata(
+        self, p_info: TPackageInfo, revision_id: Sha1Git
+    ) -> None:
+        """Write the extrinsic metadata attached to ``p_info`` to the storage.
+
+        The metadata objects are built by build_extrinsic_revision_metadata.
+        Their distinct authorities and fetchers are registered first (unless
+        disabled by configuration), then the metadata objects themselves are
+        added. Nothing is written when there are no metadata objects.
+        """
+        metadata_objects = self.build_extrinsic_revision_metadata(p_info, revision_id)
+        if not metadata_objects:
+            return
+
+        # Honour the "create_authorities" / "create_fetchers" flags declared
+        # in DEFAULT_CONFIG. A missing config or key falls back to True, i.e.
+        # to always registering them.
+        # NOTE(review): assumes self.config is dict-like -- confirm.
+        config = getattr(self, "config", None) or {}
+
+        if config.get("create_authorities", True):
+            # Deduplicate on the (type, url) pair.
+            authorities = {
+                (
+                    metadata_object.authority.type,
+                    metadata_object.authority.url,
+                ): metadata_object.authority
+                for metadata_object in metadata_objects
+            }
+            self.storage.metadata_authority_add(list(authorities.values()))
+
+        if config.get("create_fetchers", True):
+            # Deduplicate on the (name, version) pair.
+            fetchers = {
+                (
+                    metadata_object.fetcher.name,
+                    metadata_object.fetcher.version,
+                ): metadata_object.fetcher
+                for metadata_object in metadata_objects
+            }
+            self.storage.metadata_fetcher_add(list(fetchers.values()))
+
+        self.storage.object_metadata_add(metadata_objects)
diff --git a/swh/loader/package/tests/test_loader_metadata.py b/swh/loader/package/tests/test_loader_metadata.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/tests/test_loader_metadata.py
@@ -0,0 +1,158 @@
+# Copyright (C) 2019-2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import datetime
+from typing import Iterator, Optional, Sequence, Tuple
+
+import attr
+
+from swh.loader.package.loader import BasePackageInfo, PackageLoader
+from swh.model.hashutil import hash_to_bytes
+from swh.model.identifiers import SWHID
+from swh.model.model import (
+ MetadataAuthority,
+ MetadataAuthorityType,
+ MetadataFetcher,
+ MetadataTargetType,
+ RawExtrinsicMetadata,
+ Sha1Git,
+)
+from swh.storage import get_storage
+
+from swh.loader.package import __version__
+
+# Snapshot ids as hex strings. FULL_SNAPSHOT_ID is the "snapshot_id" the tests
+# below expect load() to report; EMPTY_SNAPSHOT_ID is not referenced by the
+# tests visible in this file.
+EMPTY_SNAPSHOT_ID = "1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"
+FULL_SNAPSHOT_ID = "4a9b608c9f01860a627237dd2409d1d50ec4b054"
+
+# Authority used both by the test loader and as the lookup key when reading
+# the metadata back from storage.
+AUTHORITY = MetadataAuthority(
+ type=MetadataAuthorityType.FORGE, url="http://example.org/",
+)
+ORIGIN_URL = "http://example.org/archive.tgz"
+
+# Revision id returned by the test loader's _load_revision, and its SWHID.
+REVISION_ID = hash_to_bytes("8ff44f081d43176474b267de5451f2c2e88089d0")
+REVISION_SWHID = SWHID(object_type="revision", object_id=REVISION_ID)
+
+
+# Fetcher expected on the stored metadata: name is the dotted path of
+# MetadataTestLoader, version is swh.loader.package.__version__.
+FETCHER = MetadataFetcher(
+ name="swh.loader.package.tests.test_loader_metadata.MetadataTestLoader",
+ version=__version__,
+)
+
+# The two metadata objects the tests expect to read back from storage.
+# NOTE(review): discovery dates are naive datetime.now() values, the second
+# one second later than the first -- confirm naive datetimes are acceptable.
+METADATA = [
+ RawExtrinsicMetadata(
+ type=MetadataTargetType.REVISION,
+ id=REVISION_SWHID,
+ discovery_date=datetime.datetime.now(),
+ authority=AUTHORITY,
+ fetcher=FETCHER,
+ format="test-format1",
+ metadata=b"foo bar",
+ origin=ORIGIN_URL,
+ ),
+ RawExtrinsicMetadata(
+ type=MetadataTargetType.REVISION,
+ id=REVISION_SWHID,
+ discovery_date=datetime.datetime.now() + datetime.timedelta(seconds=1),
+ authority=AUTHORITY,
+ fetcher=FETCHER,
+ format="test-format2",
+ metadata=b"bar baz",
+ origin=ORIGIN_URL,
+ ),
+]
+
+
+class MetadataTestLoader(PackageLoader[BasePackageInfo]):
+    """Minimal package loader yielding one version that carries the
+    extrinsic metadata entries listed in METADATA."""
+
+    def get_versions(self) -> Sequence[str]:
+        """Return the single version known to this loader."""
+        return ["v1.0.0"]
+
+    def _load_revision(self, p_info: BasePackageInfo, origin) -> Optional[Sha1Git]:
+        """Return the fixed REVISION_ID, whatever the arguments."""
+        return REVISION_ID
+
+    def get_metadata_authority(self):
+        """Return a copy of AUTHORITY whose metadata is an empty dict."""
+        return attr.evolve(AUTHORITY, metadata={})
+
+    def get_package_info(self, version: str) -> Iterator[Tuple[str, BasePackageInfo]]:
+        """Yield one package info holding a tuple for each METADATA entry."""
+        raw_entries = [
+            (entry.discovery_date, entry.format, entry.metadata) for entry in METADATA
+        ]
+        package_info = BasePackageInfo(
+            url=ORIGIN_URL,
+            filename="archive.tgz",
+            revision_extrinsic_metadata=raw_entries,
+        )
+
+        yield (version, package_info)
+
+
+def test_load_revision_metadata(swh_config, caplog):
+    """Loading stores every METADATA entry for the revision and logs nothing."""
+    memory_storage = get_storage("memory")
+
+    test_loader = MetadataTestLoader(ORIGIN_URL)
+    test_loader.storage = memory_storage
+
+    expected_status = {
+        "status": "eventful",
+        "snapshot_id": FULL_SNAPSHOT_ID,
+    }
+    assert test_loader.load() == expected_status
+
+    page = memory_storage.object_metadata_get(
+        MetadataTargetType.REVISION, REVISION_SWHID, AUTHORITY,
+    )
+    assert page["next_page_token"] is None
+    assert page["results"] == METADATA
+
+    assert caplog.text == ""
+
+
+def test_existing_authority(swh_config, caplog):
+    """Loading works when the authority is already registered in storage and
+    the "create_authorities" config flag is switched off."""
+    memory_storage = get_storage("memory")
+
+    test_loader = MetadataTestLoader(ORIGIN_URL)
+    test_loader.storage = memory_storage
+    test_loader.config["create_authorities"] = False
+
+    preexisting_authority = attr.evolve(AUTHORITY, metadata={})
+    memory_storage.metadata_authority_add([preexisting_authority])
+
+    expected_status = {
+        "status": "eventful",
+        "snapshot_id": FULL_SNAPSHOT_ID,
+    }
+    assert test_loader.load() == expected_status
+
+    page = memory_storage.object_metadata_get(
+        MetadataTargetType.REVISION, REVISION_SWHID, AUTHORITY,
+    )
+    assert page["next_page_token"] is None
+    assert page["results"] == METADATA
+
+    assert caplog.text == ""
+
+
+def test_existing_fetcher(swh_config, caplog):
+    """Loading works when the fetcher is already registered in storage and
+    the "create_fetchers" config flag is switched off."""
+    memory_storage = get_storage("memory")
+
+    test_loader = MetadataTestLoader(ORIGIN_URL)
+    test_loader.storage = memory_storage
+    test_loader.config["create_fetchers"] = False
+
+    preexisting_fetcher = attr.evolve(FETCHER, metadata={})
+    memory_storage.metadata_fetcher_add([preexisting_fetcher])
+
+    expected_status = {
+        "status": "eventful",
+        "snapshot_id": FULL_SNAPSHOT_ID,
+    }
+    assert test_loader.load() == expected_status
+
+    page = memory_storage.object_metadata_get(
+        MetadataTargetType.REVISION, REVISION_SWHID, AUTHORITY,
+    )
+    assert page["next_page_token"] is None
+    assert page["results"] == METADATA
+
+    assert caplog.text == ""

File Metadata

Mime Type
text/plain
Expires
Thu, Jul 3, 5:06 PM (3 w, 3 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3216892

Event Timeline