Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9347199
D3613.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
10 KB
Subscribers
None
D3613.diff
View Options
diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py
--- a/swh/loader/package/loader.py
+++ b/swh/loader/package/loader.py
@@ -7,7 +7,7 @@
import logging
import tempfile
import os
-
+import sys
from typing import (
Any,
Dict,
@@ -38,7 +38,12 @@
Origin,
OriginVisit,
OriginVisitStatus,
+ MetadataAuthority,
+ MetadataFetcher,
+ MetadataTargetType,
+ RawExtrinsicMetadata,
)
+from swh.model.identifiers import SWHID
from swh.storage import get_storage
from swh.storage.utils import now
from swh.storage.algos.snapshot import snapshot_get_latest
@@ -66,6 +71,19 @@
url = attr.ib(type=str)
filename = attr.ib(type=Optional[str])
+ # The following attribute has kw_only=True in order to allow subclasses
+ # to add attributes. Without kw_only, attributes without default values cannot
+ # go after attributes with default values.
+ # See <https://github.com/python-attrs/attrs/issues/38>
+
+ revision_extrinsic_metadata = attr.ib(
+ type=List[Tuple[datetime.datetime, str, bytes]], factory=list, kw_only=True,
+ )
+ """Tuple elements are respectively the 'discovery_date', 'format',
+ and 'metadata' fields of RawExtrinsicMetadata"""
+
+ # TODO: add support for metadata for origins, directories, and contents
+
@property
def ID_KEYS(self):
raise NotImplementedError(f"{self.__class__.__name__} is missing ID_KEYS")
@@ -81,6 +99,11 @@
# Origin visit type (str) set by the loader
visit_type = ""
+ DEFAULT_CONFIG = {
+ "create_authorities": ("bool", True),
+ "create_fetchers": ("bool", True),
+ }
+
def __init__(self, url):
"""Loader's constructor. This raises exception if the minimal required
configuration is missing (cf. fn:`check` method).
@@ -367,6 +390,8 @@
if revision_id is None:
try:
revision_id = self._load_revision(p_info, origin)
+ if revision_id:
+ self._load_extrinsic_revision_metadata(p_info, revision_id)
self.storage.flush()
status_load = "eventful"
except Exception as e:
@@ -517,3 +542,97 @@
self.storage.snapshot_add([snapshot])
return snapshot
+
+ def get_loader_name(self) -> str:
+ """Returns a fully qualified name of this loader."""
+ return f"{self.__class__.__module__}.{self.__class__.__name__}"
+
+ def get_loader_version(self) -> str:
+ """Returns the version of the current loader."""
+ module_name = self.__class__.__module__ or ""
+ module_name_parts = module_name.split(".")
+
+ # Iterate rootward through the package hierarchy until we find a parent of this
+ # loader's module with a __version__ attribute.
+ for prefix_size in range(len(module_name_parts), 0, -1):
+ package_name = ".".join(module_name_parts[0:prefix_size])
+ module = sys.modules[package_name]
+ if hasattr(module, "__version__"):
+ return module.__version__ # type: ignore
+
+ # If this loader's class has no parent package with a __version__,
+ # it should implement it itself.
+ raise NotImplementedError(
+ f"Could not dynamically find the version of {self.get_loader_name()}."
+ )
+
+ def get_metadata_fetcher(self) -> MetadataFetcher:
+ """Returns a MetadataFetcher instance representing this package loader,
+ which is used for adding provenance information to extracted
+ extrinsic metadata, if any."""
+ return MetadataFetcher(
+ name=self.get_loader_name(), version=self.get_loader_version(), metadata={},
+ )
+
+ def get_metadata_authority(self) -> MetadataAuthority:
+ """For package loaders that get extrinsic metadata, returns the authority
+ the metadata are coming from.
+ """
+ raise NotImplementedError("get_metadata_authority")
+
+ def build_extrinsic_revision_metadata(
+ self, p_info: TPackageInfo, revision_id: Sha1Git
+ ) -> List[RawExtrinsicMetadata]:
+ if not p_info.revision_extrinsic_metadata:
+ # If this package loader doesn't write metadata, no need to require
+ # an implementation for get_metadata_authority.
+ return []
+
+ authority = self.get_metadata_authority()
+ fetcher = self.get_metadata_fetcher()
+
+ metadata_objects = []
+
+ for (discovery_date, format, metadata) in p_info.revision_extrinsic_metadata:
+ metadata_objects.append(
+ RawExtrinsicMetadata(
+ type=MetadataTargetType.REVISION,
+ id=SWHID(object_type="revision", object_id=revision_id),
+ discovery_date=discovery_date,
+ authority=authority,
+ fetcher=fetcher,
+ format=format,
+ metadata=metadata,
+ origin=self.url,
+ )
+ )
+
+ return metadata_objects
+
+ def _load_extrinsic_revision_metadata(
+ self, p_info: TPackageInfo, revision_id: Sha1Git
+ ) -> None:
+ metadata_objects = self.build_extrinsic_revision_metadata(p_info, revision_id)
+
+ authorities = {
+ (
+ metadata_object.authority.type,
+ metadata_object.authority.url,
+ ): metadata_object.authority
+ for metadata_object in metadata_objects
+ }
+ if authorities:
+ self.storage.metadata_authority_add(authorities.values())
+
+ fetchers = {
+ (
+ metadata_object.fetcher.name,
+ metadata_object.fetcher.version,
+ ): metadata_object.fetcher
+ for metadata_object in metadata_objects
+ }
+ if fetchers:
+ self.storage.metadata_fetcher_add(fetchers.values())
+
+ if metadata_objects:
+ self.storage.object_metadata_add(metadata_objects)
diff --git a/swh/loader/package/tests/test_loader_metadata.py b/swh/loader/package/tests/test_loader_metadata.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/tests/test_loader_metadata.py
@@ -0,0 +1,158 @@
+# Copyright (C) 2019-2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import datetime
+from typing import Iterator, Optional, Sequence, Tuple
+
+import attr
+
+from swh.loader.package.loader import BasePackageInfo, PackageLoader
+from swh.model.hashutil import hash_to_bytes
+from swh.model.identifiers import SWHID
+from swh.model.model import (
+ MetadataAuthority,
+ MetadataAuthorityType,
+ MetadataFetcher,
+ MetadataTargetType,
+ RawExtrinsicMetadata,
+ Sha1Git,
+)
+from swh.storage import get_storage
+
+from swh.loader.package import __version__
+
+EMPTY_SNAPSHOT_ID = "1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"
+FULL_SNAPSHOT_ID = "4a9b608c9f01860a627237dd2409d1d50ec4b054"
+
+AUTHORITY = MetadataAuthority(
+ type=MetadataAuthorityType.FORGE, url="http://example.org/",
+)
+ORIGIN_URL = "http://example.org/archive.tgz"
+
+REVISION_ID = hash_to_bytes("8ff44f081d43176474b267de5451f2c2e88089d0")
+REVISION_SWHID = SWHID(object_type="revision", object_id=REVISION_ID)
+
+
+FETCHER = MetadataFetcher(
+ name="swh.loader.package.tests.test_loader_metadata.MetadataTestLoader",
+ version=__version__,
+)
+
+METADATA = [
+ RawExtrinsicMetadata(
+ type=MetadataTargetType.REVISION,
+ id=REVISION_SWHID,
+ discovery_date=datetime.datetime.now(),
+ authority=AUTHORITY,
+ fetcher=FETCHER,
+ format="test-format1",
+ metadata=b"foo bar",
+ origin=ORIGIN_URL,
+ ),
+ RawExtrinsicMetadata(
+ type=MetadataTargetType.REVISION,
+ id=REVISION_SWHID,
+ discovery_date=datetime.datetime.now() + datetime.timedelta(seconds=1),
+ authority=AUTHORITY,
+ fetcher=FETCHER,
+ format="test-format2",
+ metadata=b"bar baz",
+ origin=ORIGIN_URL,
+ ),
+]
+
+
+class MetadataTestLoader(PackageLoader[BasePackageInfo]):
+ def get_versions(self) -> Sequence[str]:
+ return ["v1.0.0"]
+
+ def _load_revision(self, p_info: BasePackageInfo, origin) -> Optional[Sha1Git]:
+ return REVISION_ID
+
+ def get_metadata_authority(self):
+ return attr.evolve(AUTHORITY, metadata={})
+
+ def get_package_info(self, version: str) -> Iterator[Tuple[str, BasePackageInfo]]:
+ m0 = METADATA[0]
+ m1 = METADATA[1]
+ p_info = BasePackageInfo(
+ url=ORIGIN_URL,
+ filename="archive.tgz",
+ revision_extrinsic_metadata=[
+ (m0.discovery_date, m0.format, m0.metadata),
+ (m1.discovery_date, m1.format, m1.metadata),
+ ],
+ )
+
+ yield (version, p_info)
+
+
+def test_load_revision_metadata(swh_config, caplog):
+ storage = get_storage("memory")
+
+ loader = MetadataTestLoader(ORIGIN_URL)
+ loader.storage = storage
+
+ load_status = loader.load()
+ assert load_status == {
+ "status": "eventful",
+ "snapshot_id": FULL_SNAPSHOT_ID,
+ }
+
+ result = storage.object_metadata_get(
+ MetadataTargetType.REVISION, REVISION_SWHID, AUTHORITY,
+ )
+ assert result["next_page_token"] is None
+ assert result["results"] == METADATA
+
+ assert caplog.text == ""
+
+
+def test_existing_authority(swh_config, caplog):
+ storage = get_storage("memory")
+
+ loader = MetadataTestLoader(ORIGIN_URL)
+ loader.storage = storage
+ loader.config["create_authorities"] = False
+
+ storage.metadata_authority_add([attr.evolve(AUTHORITY, metadata={})])
+
+ load_status = loader.load()
+ assert load_status == {
+ "status": "eventful",
+ "snapshot_id": FULL_SNAPSHOT_ID,
+ }
+
+ result = storage.object_metadata_get(
+ MetadataTargetType.REVISION, REVISION_SWHID, AUTHORITY,
+ )
+ assert result["next_page_token"] is None
+ assert result["results"] == METADATA
+
+ assert caplog.text == ""
+
+
+def test_existing_fetcher(swh_config, caplog):
+ storage = get_storage("memory")
+
+ loader = MetadataTestLoader(ORIGIN_URL)
+ loader.storage = storage
+ loader.config["create_fetchers"] = False
+
+ storage.metadata_fetcher_add([attr.evolve(FETCHER, metadata={})])
+
+ load_status = loader.load()
+ assert load_status == {
+ "status": "eventful",
+ "snapshot_id": FULL_SNAPSHOT_ID,
+ }
+
+ result = storage.object_metadata_get(
+ MetadataTargetType.REVISION, REVISION_SWHID, AUTHORITY,
+ )
+ assert result["next_page_token"] is None
+ assert result["results"] == METADATA
+
+ assert caplog.text == ""
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Jul 3, 5:06 PM (3 w, 3 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3216892
Attached To
D3613: Make the base PackageLoader write extrinsic revision metadata.
Event Timeline
Log In to Comment