diff --git a/swh/loader/package/deposit/loader.py b/swh/loader/package/deposit/loader.py
--- a/swh/loader/package/deposit/loader.py
+++ b/swh/loader/package/deposit/loader.py
@@ -22,8 +22,6 @@
     MetadataAuthority,
     MetadataAuthorityType,
     MetadataFetcher,
-    MetadataTargetType,
-    RawExtrinsicMetadata,
 )
 from swh.loader.package.loader import (
     BasePackageInfo,
@@ -181,6 +179,17 @@
             },
         )
 
+    def get_extrinsic_origin_metadata(self) -> List[RawExtrinsicMetadataCore]:
+        """Returns the deposit's origin metadata as a single JSON-encoded item."""
+        origin_metadata = self.metadata["origin_metadata"]
+        return [
+            RawExtrinsicMetadataCore(
+                format="sword-v2-atom-codemeta-v2-in-json",
+                metadata=json.dumps(origin_metadata["metadata"]).encode(),
+                discovery_date=None,
+            )
+        ]
+
     def load(self) -> Dict:
         # First making sure the deposit is known prior to trigger a loading
         try:
@@ -192,31 +201,6 @@
         r = super().load()
         success = r["status"] != "failed"
 
-        if success:
-            # Update archive with metadata information
-            origin_metadata = self.metadata["origin_metadata"]
-            logger.debug("origin_metadata: %s", origin_metadata)
-
-            authority = self.get_metadata_authority()
-            self.storage.metadata_authority_add([authority])
-
-            fetcher = self.get_metadata_fetcher()
-            self.storage.metadata_fetcher_add([fetcher])
-
-            self.storage.object_metadata_add(
-                [
-                    RawExtrinsicMetadata(
-                        type=MetadataTargetType.ORIGIN,
-                        id=self.url,
-                        discovery_date=self.visit_date,
-                        authority=authority,
-                        fetcher=fetcher,
-                        format="sword-v2-atom-codemeta-v2-in-json",
-                        metadata=json.dumps(origin_metadata["metadata"]).encode(),
-                    )
-                ]
-            )
-
         # Update deposit status
         try:
             if not success:
diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py
--- a/swh/loader/package/loader.py
+++ b/swh/loader/package/loader.py
@@ -13,6 +13,7 @@
     Dict,
     Iterator,
     Generic,
+    Iterable,
     List,
     Mapping,
     Optional,
@@ -92,7 +93,7 @@
         type=List[RawExtrinsicMetadataCore], default=[], kw_only=True,
     )
 
-    # TODO: add support for metadata for origins, directories, and contents
+    # TODO: add support for metadata for directories and contents
 
     @property
     def ID_KEYS(self):
@@ -438,13 +439,24 @@
             snapshot = self._load_snapshot(
                 default_version, tmp_revisions, extra_branches
             )
-
+            self.storage.flush()
         except Exception as e:
             logger.exception("Failed to build snapshot for origin %s", self.url)
             sentry_sdk.capture_exception(e)
             status_visit = "partial"
             status_load = "failed"
 
+        try:
+            metadata_objects = self.build_extrinsic_origin_metadata()
+            self._load_metadata_objects(metadata_objects)
+        except Exception as e:
+            logger.exception(
+                "Failed to load extrinsic origin metadata for %s", self.url
+            )
+            sentry_sdk.capture_exception(e)
+            status_visit = "partial"
+            status_load = "failed"
+
         return finalize_visit()
 
     def _load_revision(self, p_info: TPackageInfo, origin) -> Optional[Sha1Git]:
@@ -590,6 +602,39 @@
         """
         raise NotImplementedError("get_metadata_authority")
 
+    def get_extrinsic_origin_metadata(self) -> List[RawExtrinsicMetadataCore]:
+        """Returns metadata items, used by build_extrinsic_origin_metadata."""
+        return []
+
+    def build_extrinsic_origin_metadata(self) -> List[RawExtrinsicMetadata]:
+        """Builds a list of full RawExtrinsicMetadata objects, using
+        metadata returned by get_extrinsic_origin_metadata."""
+        metadata_items = self.get_extrinsic_origin_metadata()
+        if not metadata_items:
+            # If this package loader doesn't write metadata, no need to require
+            # an implementation for get_metadata_authority.
+            return []
+
+        authority = self.get_metadata_authority()
+        fetcher = self.get_metadata_fetcher()
+
+        metadata_objects = []
+
+        for item in metadata_items:
+            metadata_objects.append(
+                RawExtrinsicMetadata(
+                    type=MetadataTargetType.ORIGIN,
+                    id=self.url,
+                    discovery_date=item.discovery_date or self.visit_date,
+                    authority=authority,
+                    fetcher=fetcher,
+                    format=item.format,
+                    metadata=item.metadata,
+                )
+            )
+
+        return metadata_objects
+
     def build_extrinsic_revision_metadata(
         self, p_info: TPackageInfo, revision_id: Sha1Git
     ) -> List[RawExtrinsicMetadata]:
@@ -623,26 +668,36 @@
         self, p_info: TPackageInfo, revision_id: Sha1Git
     ) -> None:
         metadata_objects = self.build_extrinsic_revision_metadata(p_info, revision_id)
+        self._load_metadata_objects(metadata_objects)
+
+    def _load_metadata_objects(
+        self, metadata_objects: List[RawExtrinsicMetadata]
+    ) -> None:
+        """Writes metadata objects, and their authorities and fetchers, to storage."""
+        if not metadata_objects:
+            # If this package loader doesn't write metadata, no need to require
+            # an implementation for get_metadata_authority.
+            return
 
-        authorities = {
-            (
-                metadata_object.authority.type,
-                metadata_object.authority.url,
-            ): metadata_object.authority
-            for metadata_object in metadata_objects
+        self._create_authorities(mo.authority for mo in metadata_objects)
+        self._create_fetchers(mo.fetcher for mo in metadata_objects)
+
+        self.storage.object_metadata_add(metadata_objects)
+
+    def _create_authorities(self, authorities: Iterable[MetadataAuthority]) -> None:
+        """Adds the given authorities to storage, deduplicated by (type, url)."""
+        deduplicated_authorities = {
+            (authority.type, authority.url): authority for authority in authorities
         }
-        if authorities:
-            self.storage.metadata_authority_add(authorities.values())
-
-        fetchers = {
-            (
-                metadata_object.fetcher.name,
-                metadata_object.fetcher.version,
-            ): metadata_object.fetcher
-            for metadata_object in metadata_objects
+        # Check the dict: `authorities` may be a generator, which is always truthy.
+        if deduplicated_authorities:
+            self.storage.metadata_authority_add(deduplicated_authorities.values())
+
+    def _create_fetchers(self, fetchers: Iterable[MetadataFetcher]) -> None:
+        """Adds the given fetchers to storage, deduplicated by (name, version)."""
+        deduplicated_fetchers = {
+            (fetcher.name, fetcher.version): fetcher for fetcher in fetchers
         }
-        if fetchers:
-            self.storage.metadata_fetcher_add(fetchers.values())
-
-        if metadata_objects:
-            self.storage.object_metadata_add(metadata_objects)
+        # Check the dict: `fetchers` may be a generator, which is always truthy.
+        if deduplicated_fetchers:
+            self.storage.metadata_fetcher_add(deduplicated_fetchers.values())
diff --git a/swh/loader/package/tests/test_loader_metadata.py b/swh/loader/package/tests/test_loader_metadata.py
--- a/swh/loader/package/tests/test_loader_metadata.py
+++ b/swh/loader/package/tests/test_loader_metadata.py
@@ -4,7 +4,7 @@
 # See top-level LICENSE file for more information
 
 import datetime
-from typing import Iterator, Optional, Sequence, Tuple
+from typing import Iterator, List, Optional, Sequence, Tuple
 
 import attr
 
@@ -44,7 +44,7 @@
     version=__version__,
 )
 
-METADATA = [
+REVISION_METADATA = [
     RawExtrinsicMetadata(
         type=MetadataTargetType.REVISION,
         id=REVISION_SWHID,
@@ -67,6 +67,18 @@
     ),
 ]
 
+ORIGIN_METADATA = [
+    RawExtrinsicMetadata(
+        type=MetadataTargetType.ORIGIN,
+        id=ORIGIN_URL,
+        discovery_date=datetime.datetime.now(),
+        authority=AUTHORITY,
+        fetcher=FETCHER,
+        format="test-format3",
+        metadata=b"baz qux",
+    ),
+]
+
 
 class MetadataTestLoader(PackageLoader[BasePackageInfo]):
     def get_versions(self) -> Sequence[str]:
@@ -79,8 +91,8 @@
         return attr.evolve(AUTHORITY, metadata={})
 
     def get_package_info(self, version: str) -> Iterator[Tuple[str, BasePackageInfo]]:
-        m0 = METADATA[0]
-        m1 = METADATA[1]
+        m0 = REVISION_METADATA[0]
+        m1 = REVISION_METADATA[1]
         p_info = BasePackageInfo(
             url=ORIGIN_URL,
             filename="archive.tgz",
@@ -92,8 +104,12 @@
 
         yield (version, p_info)
 
+    def get_extrinsic_origin_metadata(self) -> List[RawExtrinsicMetadataCore]:
+        m = ORIGIN_METADATA[0]
+        return [RawExtrinsicMetadataCore(m.format, m.metadata, m.discovery_date)]
+
 
-def test_load_revision_metadata(swh_config, caplog):
+def test_load_metadata(swh_config, caplog):
     storage = get_storage("memory")
 
     loader = MetadataTestLoader(ORIGIN_URL)
@@ -109,7 +125,13 @@
         MetadataTargetType.REVISION, REVISION_SWHID, AUTHORITY,
     )
     assert result["next_page_token"] is None
-    assert result["results"] == METADATA
+    assert result["results"] == REVISION_METADATA
+
+    result = storage.object_metadata_get(
+        MetadataTargetType.ORIGIN, ORIGIN_URL, AUTHORITY,
+    )
+    assert result["next_page_token"] is None
+    assert result["results"] == ORIGIN_METADATA
 
     assert caplog.text == ""
 
@@ -133,7 +155,7 @@
         MetadataTargetType.REVISION, REVISION_SWHID, AUTHORITY,
     )
     assert result["next_page_token"] is None
-    assert result["results"] == METADATA
+    assert result["results"] == REVISION_METADATA
 
     assert caplog.text == ""
 
@@ -157,6 +179,6 @@
         MetadataTargetType.REVISION, REVISION_SWHID, AUTHORITY,
     )
     assert result["next_page_token"] is None
-    assert result["results"] == METADATA
+    assert result["results"] == REVISION_METADATA
 
     assert caplog.text == ""