NOTE(review): this patch was stored with its line breaks collapsed. The layout
below is rebuilt from the hunk line counts; leading whitespace on context
lines is inferred from Python syntax and must be confirmed against the tree
before applying.

Summary: bumps the swh.storage requirement to >= 0.27.0. In
swh/loader/package/loader.py, an ExtID targeting the revision is collected for
each package info whose extid() is not None, and the collected set is passed to
the new _load_extids() helper just before finalize_visit(). The helper calls
storage.extid_add(); on any exception it logs and reports to sentry without
re-raising.

diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -2,4 +2,4 @@
 swh.model >= 1.0.0
 swh.objstorage >= 0.2.2
 swh.scheduler >= 0.4.0
-swh.storage >= 0.13.1
+swh.storage >= 0.27.0
diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py
--- a/swh/loader/package/loader.py
+++ b/swh/loader/package/loader.py
@@ -44,6 +44,7 @@
     ObjectType,
 )
 from swh.model.model import (
+    ExtID,
     MetadataAuthority,
     MetadataAuthorityType,
     MetadataFetcher,
@@ -588,6 +589,7 @@
                 branch.target for branch in last_snapshot.branches.values()
             }
 
+        new_extids: Set[ExtID] = set()
         tmp_revisions: Dict[str, List[Tuple[str, Sha1Git]]] = {
             version: [] for version in versions
         }
@@ -633,6 +635,16 @@
                 if revision_id is None:
                     continue
 
+                partial_extid = p_info.extid()
+                if partial_extid is not None:
+                    (extid_type, extid) = partial_extid
+                    revision_swhid = CoreSWHID(
+                        object_type=ObjectType.REVISION, object_id=revision_id
+                    )
+                    new_extids.add(
+                        ExtID(extid_type=extid_type, extid=extid, target=revision_swhid)
+                    )
+
             tmp_revisions[version].append((branch_name, revision_id))
 
         if load_exceptions:
@@ -689,6 +701,8 @@
             status_visit = "partial"
             status_load = "failed"
 
+        self._load_extids(new_extids)
+
         return self.finalize_visit(
             snapshot=snapshot,
             visit=visit,
@@ -1001,3 +1015,14 @@
         }
         if fetchers:
             self.storage.metadata_fetcher_add(list(deduplicated_fetchers.values()))
+
+    def _load_extids(self, extids: Set[ExtID]) -> None:
+        if not extids:
+            return
+        try:
+            self.storage.extid_add(list(extids))
+        except Exception as e:
+            logger.exception("Failed to load new ExtIDs for %s", self.url)
+            sentry_sdk.capture_exception(e)
+            # No big deal, it just means the next visit will load the same versions
+            # again.
NOTE(review): layout rebuilt from the hunk line counts of a collapsed patch;
context-line whitespace and blank-line placement are inferred and must be
confirmed against the tree before applying.

diff --git a/swh/loader/package/nixguix/tests/test_nixguix.py b/swh/loader/package/nixguix/tests/test_nixguix.py
--- a/swh/loader/package/nixguix/tests/test_nixguix.py
+++ b/swh/loader/package/nixguix/tests/test_nixguix.py
@@ -697,10 +697,10 @@
 
     expected_detections = [
         {"reason": "'integrity'", "known_artifact": old_revision.metadata,},
-        {"reason": "'integrity'", "known_artifact": old_revision.metadata,},
     ]
 
-    # as many calls as there are sources listed in the sources.json
-    assert len(expected_detections) == len(all_sources["sources"])
+    # fewer calls than there are sources listed in the sources.json,
+    # as some of them are skipped using the ExtID from a previous run
+    assert len(expected_detections) <= len(all_sources["sources"])
 
     assert actual_detections == expected_detections
diff --git a/swh/loader/package/tests/test_loader.py b/swh/loader/package/tests/test_loader.py
--- a/swh/loader/package/tests/test_loader.py
+++ b/swh/loader/package/tests/test_loader.py
@@ -202,8 +202,9 @@
     )
 
 
-def test_load_skip_extids() -> None:
-    """Checks PackageLoader.load() skips iff it should."""
+def test_load_extids() -> None:
+    """Checks PackageLoader.load() skips iff it should, and writes (only)
+    the new ExtIDs"""
     storage = get_storage("memory")
 
     origin = "http://example.org"
@@ -295,6 +296,26 @@
     )
     assert snapshot_get_latest(storage, origin) == snapshot
 
+    extids = storage.extid_get_from_target(
+        ObjectType.REVISION,
+        [
+            rev1_swhid.object_id,
+            rev2_swhid.object_id,
+            rev3_swhid.object_id,
+            rev4_swhid.object_id,
+        ],
+    )
+
+    assert set(extids) == {
+        # What we inserted at the beginning of the test:
+        ExtID("extid-type1", b"extid-of-v1.0", rev1_swhid),
+        ExtID("extid-type2", b"extid-of-v2.0", rev2_swhid),
+        # Added by the loader:
+        ExtID("extid-type1", b"extid-of-v2.0", rev4_swhid),
+        ExtID("extid-type2", b"extid-of-v3.0", rev4_swhid),
+        ExtID("extid-type2", b"extid-of-v4.0", rev4_swhid),
+    }
+
 
 def test_manifest_extid():
     """Compute primary key should return the right identity