Page MenuHomeSoftware Heritage

D3616.diff
No OneTemporary

D3616.diff

diff --git a/swh/loader/package/deposit/loader.py b/swh/loader/package/deposit/loader.py
--- a/swh/loader/package/deposit/loader.py
+++ b/swh/loader/package/deposit/loader.py
@@ -22,8 +22,6 @@
MetadataAuthority,
MetadataAuthorityType,
MetadataFetcher,
- MetadataTargetType,
- RawExtrinsicMetadata,
)
from swh.loader.package.loader import (
BasePackageInfo,
@@ -181,6 +179,16 @@
},
)
+ def get_extrinsic_origin_metadata(self) -> List[RawExtrinsicMetadataCore]:
+ origin_metadata = self.metadata["origin_metadata"]
+ return [
+ RawExtrinsicMetadataCore(
+ format="sword-v2-atom-codemeta-v2-in-json",
+ metadata=json.dumps(origin_metadata["metadata"]).encode(),
+ discovery_date=None,
+ )
+ ]
+
def load(self) -> Dict:
# First making sure the deposit is known prior to trigger a loading
try:
@@ -192,31 +200,6 @@
r = super().load()
success = r["status"] != "failed"
- if success:
- # Update archive with metadata information
- origin_metadata = self.metadata["origin_metadata"]
- logger.debug("origin_metadata: %s", origin_metadata)
-
- authority = self.get_metadata_authority()
- self.storage.metadata_authority_add([authority])
-
- fetcher = self.get_metadata_fetcher()
- self.storage.metadata_fetcher_add([fetcher])
-
- self.storage.object_metadata_add(
- [
- RawExtrinsicMetadata(
- type=MetadataTargetType.ORIGIN,
- id=self.url,
- discovery_date=self.visit_date,
- authority=authority,
- fetcher=fetcher,
- format="sword-v2-atom-codemeta-v2-in-json",
- metadata=json.dumps(origin_metadata["metadata"]).encode(),
- )
- ]
- )
-
# Update deposit status
try:
if not success:
diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py
--- a/swh/loader/package/loader.py
+++ b/swh/loader/package/loader.py
@@ -13,6 +13,7 @@
Dict,
Iterator,
Generic,
+ Iterable,
List,
Mapping,
Optional,
@@ -92,7 +93,7 @@
type=List[RawExtrinsicMetadataCore], default=[], kw_only=True,
)
- # TODO: add support for metadata for origins, directories, and contents
+ # TODO: add support for metadata for directories and contents
@property
def ID_KEYS(self):
@@ -438,13 +439,22 @@
snapshot = self._load_snapshot(
default_version, tmp_revisions, extra_branches
)
-
+ self.storage.flush()
except Exception as e:
logger.exception("Failed to build snapshot for origin %s", self.url)
sentry_sdk.capture_exception(e)
status_visit = "partial"
status_load = "failed"
+ try:
+ metadata_objects = self.build_extrinsic_origin_metadata()
+ self._load_metadata_objects(metadata_objects)
+ except Exception as e:
+ logger.exception("Failed to load extrinsic origin metadata for %s", self.url)
+ sentry_sdk.capture_exception(e)
+ status_visit = "partial"
+ status_load = "failed"
+
return finalize_visit()
def _load_revision(self, p_info: TPackageInfo, origin) -> Optional[Sha1Git]:
@@ -590,6 +600,39 @@
"""
raise NotImplementedError("get_metadata_authority")
+ def get_extrinsic_origin_metadata(self) -> List[RawExtrinsicMetadataCore]:
+ """Returns metadata items, used by build_extrinsic_origin_metadata."""
+ return []
+
+ def build_extrinsic_origin_metadata(self) -> List[RawExtrinsicMetadata]:
+ """Builds a list of full RawExtrinsicMetadata objects, using
+ metadata returned by get_extrinsic_origin_metadata."""
+ metadata_items = self.get_extrinsic_origin_metadata()
+ if not metadata_items:
+ # If this package loader doesn't write metadata, no need to require
+ # an implementation for get_metadata_authority.
+ return []
+
+ authority = self.get_metadata_authority()
+ fetcher = self.get_metadata_fetcher()
+
+ metadata_objects = []
+
+ for item in metadata_items:
+ metadata_objects.append(
+ RawExtrinsicMetadata(
+ type=MetadataTargetType.ORIGIN,
+ id=self.url,
+ discovery_date=item.discovery_date or self.visit_date,
+ authority=authority,
+ fetcher=fetcher,
+ format=item.format,
+ metadata=item.metadata,
+ )
+ )
+
+ return metadata_objects
+
def build_extrinsic_revision_metadata(
self, p_info: TPackageInfo, revision_id: Sha1Git
) -> List[RawExtrinsicMetadata]:
@@ -623,26 +666,31 @@
self, p_info: TPackageInfo, revision_id: Sha1Git
) -> None:
metadata_objects = self.build_extrinsic_revision_metadata(p_info, revision_id)
+ self._load_metadata_objects(metadata_objects)
+
+ def _load_metadata_objects(
+ self, metadata_objects: List[RawExtrinsicMetadata]
+ ) -> None:
+ if not metadata_objects:
+ # Nothing to store: skip registering authorities and fetchers, and
+ # avoid calling storage.object_metadata_add with an empty list.
+ return
- authorities = {
- (
- metadata_object.authority.type,
- metadata_object.authority.url,
- ): metadata_object.authority
- for metadata_object in metadata_objects
+ self._create_authorities(mo.authority for mo in metadata_objects)
+ self._create_fetchers(mo.fetcher for mo in metadata_objects)
+
+ self.storage.object_metadata_add(metadata_objects)
+
+ def _create_authorities(self, authorities: Iterable[MetadataAuthority]) -> None:
+ deduplicated_authorities = {
+ (authority.type, authority.url): authority for authority in authorities
}
if authorities:
- self.storage.metadata_authority_add(authorities.values())
-
- fetchers = {
- (
- metadata_object.fetcher.name,
- metadata_object.fetcher.version,
- ): metadata_object.fetcher
- for metadata_object in metadata_objects
+ self.storage.metadata_authority_add(deduplicated_authorities.values())
+
+ def _create_fetchers(self, fetchers: Iterable[MetadataFetcher]) -> None:
+ deduplicated_fetchers = {
+ (fetcher.name, fetcher.version): fetcher for fetcher in fetchers
}
if fetchers:
- self.storage.metadata_fetcher_add(fetchers.values())
-
- if metadata_objects:
- self.storage.object_metadata_add(metadata_objects)
+ self.storage.metadata_fetcher_add(deduplicated_fetchers.values())
diff --git a/swh/loader/package/tests/test_loader_metadata.py b/swh/loader/package/tests/test_loader_metadata.py
--- a/swh/loader/package/tests/test_loader_metadata.py
+++ b/swh/loader/package/tests/test_loader_metadata.py
@@ -4,7 +4,7 @@
# See top-level LICENSE file for more information
import datetime
-from typing import Iterator, Optional, Sequence, Tuple
+from typing import Iterator, List, Optional, Sequence, Tuple
import attr
@@ -44,7 +44,7 @@
version=__version__,
)
-METADATA = [
+REVISION_METADATA = [
RawExtrinsicMetadata(
type=MetadataTargetType.REVISION,
id=REVISION_SWHID,
@@ -67,6 +67,18 @@
),
]
+ORIGIN_METADATA = [
+ RawExtrinsicMetadata(
+ type=MetadataTargetType.ORIGIN,
+ id=ORIGIN_URL,
+ discovery_date=datetime.datetime.now(),
+ authority=AUTHORITY,
+ fetcher=FETCHER,
+ format="test-format3",
+ metadata=b"baz qux",
+ ),
+]
+
class MetadataTestLoader(PackageLoader[BasePackageInfo]):
def get_versions(self) -> Sequence[str]:
@@ -79,8 +91,8 @@
return attr.evolve(AUTHORITY, metadata={})
def get_package_info(self, version: str) -> Iterator[Tuple[str, BasePackageInfo]]:
- m0 = METADATA[0]
- m1 = METADATA[1]
+ m0 = REVISION_METADATA[0]
+ m1 = REVISION_METADATA[1]
p_info = BasePackageInfo(
url=ORIGIN_URL,
filename="archive.tgz",
@@ -92,8 +104,12 @@
yield (version, p_info)
+ def get_extrinsic_origin_metadata(self) -> List[RawExtrinsicMetadataCore]:
+ m = ORIGIN_METADATA[0]
+ return [RawExtrinsicMetadataCore(m.format, m.metadata, m.discovery_date)]
+
-def test_load_revision_metadata(swh_config, caplog):
+def test_load_metadata(swh_config, caplog):
storage = get_storage("memory")
loader = MetadataTestLoader(ORIGIN_URL)
@@ -109,7 +125,13 @@
MetadataTargetType.REVISION, REVISION_SWHID, AUTHORITY,
)
assert result["next_page_token"] is None
- assert result["results"] == METADATA
+ assert result["results"] == REVISION_METADATA
+
+ result = storage.object_metadata_get(
+ MetadataTargetType.ORIGIN, ORIGIN_URL, AUTHORITY,
+ )
+ assert result["next_page_token"] is None
+ assert result["results"] == ORIGIN_METADATA
assert caplog.text == ""
@@ -133,7 +155,7 @@
MetadataTargetType.REVISION, REVISION_SWHID, AUTHORITY,
)
assert result["next_page_token"] is None
- assert result["results"] == METADATA
+ assert result["results"] == REVISION_METADATA
assert caplog.text == ""
@@ -157,6 +179,6 @@
MetadataTargetType.REVISION, REVISION_SWHID, AUTHORITY,
)
assert result["next_page_token"] is None
- assert result["results"] == METADATA
+ assert result["results"] == REVISION_METADATA
assert caplog.text == ""

File Metadata

Mime Type
text/plain
Expires
Tue, Dec 17, 3:19 PM (2 d, 21 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3220590

Event Timeline