def get_extrinsic_snapshot_metadata(self) -> List[RawExtrinsicMetadataCore]:
    """Hook supplying the raw metadata items attached to the snapshot.

    The default implementation supplies nothing; the result is consumed by
    build_extrinsic_snapshot_metadata."""
    return list()


def build_extrinsic_snapshot_metadata(
    self, snapshot_id: Sha1Git
) -> List[RawExtrinsicMetadata]:
    """Wrap every item from get_extrinsic_snapshot_metadata into a complete
    RawExtrinsicMetadata object targeting the snapshot ``snapshot_id``.

    An item without a discovery date falls back to the visit date. The
    result is an empty list when there are no items."""
    core_items = self.get_extrinsic_snapshot_metadata()
    if not core_items:
        # Bail out before asking for the authority: a loader that writes no
        # snapshot metadata should not have to implement
        # get_metadata_authority.
        return []

    # Resolved once, in this order, and shared by every produced object.
    authority = self.get_metadata_authority()
    fetcher = self.get_metadata_fetcher()

    return [
        RawExtrinsicMetadata(
            type=MetadataTargetType.SNAPSHOT,
            id=SWHID(object_type="snapshot", object_id=snapshot_id),
            discovery_date=core.discovery_date or self.visit_date,
            authority=authority,
            fetcher=fetcher,
            format=core.format,
            metadata=core.metadata,
            origin=self.url,
        )
        for core in core_items
    ]
def retrieve_sources(url: str) -> bytes:
    """Download the sources file located at ``url``.

    Redirects are followed. The body is returned as raw, undecoded bytes;
    use parse_sources to turn it into a dict.

    Raises:
        ValueError: if the final response status is anything other than 200.
    """
    response = requests.get(url, allow_redirects=True)
    if response.status_code != 200:
        # Unlike logging calls, exceptions never interpolate printf-style
        # arguments: passing them separately would render the error as a raw
        # tuple, so the message is formatted before raising.
        raise ValueError(f"Got {response.status_code} HTTP code on {url}")

    return response.content


def parse_sources(raw_sources: bytes) -> Dict[str, Any]:
    """Decode ``raw_sources`` as UTF-8 and parse it as a JSON document.

    Raises:
        UnicodeDecodeError: if the payload is not valid UTF-8.
        json.JSONDecodeError: if the decoded text is not valid JSON.
    """
    return json.loads(raw_sources.decode("utf-8"))
), + "rb", + ) as f: + return f.read() + + SNAPSHOT1 = Snapshot( id=hash_to_bytes("0c5881c74283793ebe9a09a105a9381e41380383"), branches={ @@ -80,7 +106,7 @@ def test_retrieve_sources(swh_config, requests_mock_datadir): - j = retrieve_sources(sources_url) + j = parse_sources(retrieve_sources(sources_url)) assert "sources" in j.keys() assert len(j["sources"]) == 2 @@ -136,7 +162,7 @@ assert len(clean["sources"]) == 1 -def test_loader_one_visit(swh_config, requests_mock_datadir): +def test_loader_one_visit(swh_config, requests_mock_datadir, raw_sources): loader = NixGuixLoader(sources_url) res = loader.load() assert res["status"] == "eventful" @@ -160,6 +186,34 @@ loader.storage, sources_url, status="partial", type="nixguix" ) + (_, visit_status) = origin_get_latest_visit_status(loader.storage, sources_url) + snapshot_swhid = SWHID( + object_type="snapshot", object_id=hash_to_hex(visit_status.snapshot) + ) + metadata_authority = MetadataAuthority( + type=MetadataAuthorityType.FORGE, url=sources_url, + ) + expected_metadata = [ + RawExtrinsicMetadata( + type=MetadataTargetType.SNAPSHOT, + id=snapshot_swhid, + authority=metadata_authority, + fetcher=MetadataFetcher( + name="swh.loader.package.nixguix.loader.NixGuixLoader", + version=__version__, + ), + discovery_date=loader.visit_date, + format="nixguix-sources-json", + metadata=raw_sources, + origin=sources_url, + ) + ] + assert loader.storage.raw_extrinsic_metadata_get( + type=MetadataTargetType.SNAPSHOT, + id=snapshot_swhid, + authority=metadata_authority, + ) == PagedResult(next_page_token=None, results=expected_metadata,) + def test_uncompress_failure(swh_config, requests_mock_datadir): """Non tarball files are currently not supported and the uncompress @@ -305,7 +359,7 @@ } == stats -def test_resolve_revision_from(swh_config, requests_mock_datadir): +def test_resolve_revision_from(swh_config, requests_mock_datadir, datadir): loader = NixGuixLoader(sources_url) known_artifacts = { diff --git 
a/swh/loader/package/nixguix/tests/test_tasks.py b/swh/loader/package/nixguix/tests/test_tasks.py --- a/swh/loader/package/nixguix/tests/test_tasks.py +++ b/swh/loader/package/nixguix/tests/test_tasks.py @@ -3,6 +3,8 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import json + def test_nixguix_loader( mocker, swh_scheduler_celery_app, swh_scheduler_celery_worker, swh_config @@ -13,11 +15,9 @@ mock_retrieve_sources = mocker.patch( "swh.loader.package.nixguix.loader.retrieve_sources" ) - mock_retrieve_sources.return_value = { - "version": 1, - "sources": [], - "revision": "some-revision", - } + mock_retrieve_sources.return_value = json.dumps( + {"version": 1, "sources": [], "revision": "some-revision",} + ).encode() res = swh_scheduler_celery_app.send_task( "swh.loader.package.nixguix.tasks.LoadNixguix", kwargs=dict(url="some-url") diff --git a/swh/loader/tests/__init__.py b/swh/loader/tests/__init__.py --- a/swh/loader/tests/__init__.py +++ b/swh/loader/tests/__init__.py @@ -59,6 +59,7 @@ f"Visit_status points to snapshot {visit_status.snapshot.hex()} " f"instead of {snapshot.hex()}" ) + return visit_status