diff --git a/swh/loader/package/archive/loader.py b/swh/loader/package/archive/loader.py --- a/swh/loader/package/archive/loader.py +++ b/swh/loader/package/archive/loader.py @@ -144,12 +144,6 @@ ) -> Optional[PartialExtID]: return p_info.extid(manifest_format=self.extid_manifest_format) - def known_artifact_to_extid(self, known_artifact: Dict) -> Optional[PartialExtID]: - known_artifact_info = ArchivePackageInfo.from_metadata( - known_artifact["extrinsic"]["raw"] - ) - return known_artifact_info.extid(manifest_format=self.extid_manifest_format) - def build_revision( self, p_info: ArchivePackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Revision]: @@ -169,12 +163,4 @@ parents=(), directory=directory, synthetic=True, - metadata={ - "intrinsic": {}, - "extrinsic": { - "provider": self.url, - "when": self.visit_date.isoformat(), - "raw": p_info.raw_info, - }, - }, ) diff --git a/swh/loader/package/archive/tests/test_archive.py b/swh/loader/package/archive/tests/test_archive.py --- a/swh/loader/package/archive/tests/test_archive.py +++ b/swh/loader/package/archive/tests/test_archive.py @@ -10,7 +10,6 @@ import pytest from swh.loader.package.archive.loader import ArchiveLoader, ArchivePackageInfo -from swh.loader.package.tests.common import check_metadata_paths from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats from swh.model.hashutil import hash_to_bytes from swh.model.model import Snapshot, SnapshotBranch, TargetType @@ -110,37 +109,6 @@ assert_last_visit_matches(swh_storage, url, status="partial", type="tar") -def test_archive_check_revision_metadata_structure(swh_storage, requests_mock_datadir): - loader = ArchiveLoader(swh_storage, URL, artifacts=GNU_ARTIFACTS) - - actual_load_status = loader.load() - assert actual_load_status["status"] == "eventful" - assert actual_load_status["snapshot_id"] is not None - - assert_last_visit_matches(swh_storage, URL, status="full", type="tar") - - expected_revision_id = hash_to_bytes("44183488c0774ce3c957fa19ba695cf18a4a42b3") - revision = swh_storage.revision_get([expected_revision_id])[0] - assert revision is not None - - check_metadata_paths( - revision.metadata, - paths=[ - ("intrinsic", dict), - ("extrinsic.provider", str), - ("extrinsic.when", str), - ("extrinsic.raw", dict), - ("original_artifact", list), - ], - ) - - for original_artifact in revision.metadata["original_artifact"]: - check_metadata_paths( - original_artifact, - paths=[("filename", str), ("length", int), ("checksums", dict),], - ) - - def test_archive_visit_with_release_artifact_no_prior_visit( swh_storage, requests_mock_datadir ): diff --git a/swh/loader/package/cran/loader.py b/swh/loader/package/cran/loader.py --- a/swh/loader/package/cran/loader.py +++ b/swh/loader/package/cran/loader.py @@ -16,7 +16,7 @@ import dateutil.parser from debian.deb822 import Deb822 -from swh.loader.package.loader import BasePackageInfo, PackageLoader, PartialExtID +from swh.loader.package.loader import BasePackageInfo, PackageLoader from swh.loader.package.utils import release_name from swh.model.model import ( Person, @@ -88,10 +88,6 @@ if version == p_info.version: yield release_name(version), p_info - @staticmethod - def known_artifact_to_extid(known_artifact: Dict) -> Optional[PartialExtID]: - return CRANPackageInfo.from_metadata(known_artifact["extrinsic"]["raw"]).extid() - def build_revision( self, p_info: CRANPackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Revision]: @@ -110,14 +106,6 @@ parents=(), directory=directory, 
synthetic=True, - metadata={ - "intrinsic": {"tool": "DESCRIPTION", "raw": metadata,}, - "extrinsic": { - "provider": self.url, - "when": self.visit_date.isoformat(), - "raw": p_info.raw_info, - }, - }, ) diff --git a/swh/loader/package/debian/loader.py b/swh/loader/package/debian/loader.py --- a/swh/loader/package/debian/loader.py +++ b/swh/loader/package/debian/loader.py @@ -178,21 +178,6 @@ p_info = DebianPackageInfo.from_metadata(meta, url=self.url) yield release_name(version), p_info - def known_artifact_to_extid(self, known_artifact: Dict) -> Optional[PartialExtID]: - sha256 = _artifact_to_dsc_sha256(known_artifact, url=self.url) - if sha256 is None: - return None - return (EXTID_TYPE, hash_to_bytes(sha256)) - - def resolve_revision_from_artifacts( - self, known_artifacts: Dict, p_info: DebianPackageInfo, - ) -> Optional[bytes]: - try: - return super().resolve_revision_from_artifacts(known_artifacts, p_info) - except DscCountError: - # known_artifacts are corrupted, ignore them instead of crashing - return None - def download_package( self, p_info: DebianPackageInfo, tmpdir: str ) -> List[Tuple[str, Mapping]]: @@ -252,14 +237,6 @@ parents=(), directory=directory, synthetic=True, - metadata={ - "intrinsic": {"tool": "dsc", "raw": attr.asdict(intrinsic_metadata),}, - "extrinsic": { - "provider": dsc_url, - "when": self.visit_date.isoformat(), - "raw": p_info.raw_info, - }, - }, ) diff --git a/swh/loader/package/debian/tests/test_debian.py b/swh/loader/package/debian/tests/test_debian.py --- a/swh/loader/package/debian/tests/test_debian.py +++ b/swh/loader/package/debian/tests/test_debian.py @@ -5,7 +5,6 @@ import logging from os import path -import random import pytest @@ -411,152 +410,3 @@ ) check_snapshot(expected_snapshot, swh_storage) - - -def test_debian_resolve_revision_from_artifacts_edge_cases(): - """Solving revision with empty data will result in unknown revision - - """ - loader = DebianLoader(None, None, None, None) - empty_artifact = { - "name": PACKAGE_FILES["name"], - "version": PACKAGE_FILES["version"], - } - for package_artifacts in [empty_artifact, PACKAGE_FILES]: - p_info = DebianPackageInfo.from_metadata(package_artifacts, url=URL) - actual_revision = loader.resolve_revision_from_artifacts({}, p_info) - assert actual_revision is None - - for known_artifacts in [{}, PACKAGE_FILES]: - actual_revision = loader.resolve_revision_from_artifacts( - known_artifacts, DebianPackageInfo.from_metadata(empty_artifact, url=URL) - ) - assert actual_revision is None - - known_package_artifacts = { - b"(\x07\xf5\xb3\xf8Ch\xb4\x88\x9a\x9a\xe8'\xfe\x85\x85O\xfe\xcf\x07": { - "extrinsic": { - # empty - }, - # ... removed the unnecessary intermediary data - } - } - assert not loader.resolve_revision_from_artifacts( - known_package_artifacts, DebianPackageInfo.from_metadata(PACKAGE_FILES, url=URL) - ) - - -def test_debian_resolve_revision_from_artifacts_edge_cases_hit_and_miss(): - """Solving revision with inconsistent data will result in unknown revision - - """ - loader = DebianLoader(None, None, None, None) - artifact_metadata = PACKAGE_FILES2 - p_info = DebianPackageInfo.from_metadata(artifact_metadata, url=URL) - expected_revision_id = ( - b"(\x08\xf5\xb3\xf8Ch\xb4\x88\x9a\x9a\xe8'\xff\x85\x85O\xfe\xcf\x07" # noqa - ) - known_package_artifacts = { - expected_revision_id: { - "extrinsic": {"raw": PACKAGE_FILES,}, - # ... 
removed the unnecessary intermediary data - } - } - - actual_revision = loader.resolve_revision_from_artifacts( - known_package_artifacts, p_info - ) - - assert actual_revision is None - - -def test_debian_resolve_revision_from_artifacts(): - """Solving revision with consistent data will solve the revision - - """ - loader = DebianLoader(None, None, None, None) - artifact_metadata = PACKAGE_FILES - p_info = DebianPackageInfo.from_metadata(artifact_metadata, url=URL) - expected_revision_id = ( - b"(\x07\xf5\xb3\xf8Ch\xb4\x88\x9a\x9a\xe8'\xfe\x85\x85O\xfe\xcf\x07" # noqa - ) - - files = artifact_metadata["files"] - # shuffling dict's keys - keys = list(files.keys()) - random.shuffle(keys) - package_files = { - "name": PACKAGE_FILES["name"], - "version": PACKAGE_FILES["version"], - "files": {k: files[k] for k in keys}, - } - - known_package_artifacts = { - expected_revision_id: { - "extrinsic": {"raw": package_files,}, - # ... removed the unnecessary intermediary data - } - } - - actual_revision = loader.resolve_revision_from_artifacts( - known_package_artifacts, p_info - ) - - assert actual_revision == expected_revision_id - - -def test_debian_resolve_revision_from_artifacts_corrupt_known_artifact(): - """To many or not enough .dsc files in the known_artifacts dict""" - loader = DebianLoader(None, None, None, None) - artifact_metadata = PACKAGE_FILES - p_info = DebianPackageInfo.from_metadata(artifact_metadata, url=URL) - expected_revision_id = ( - b"(\x07\xf5\xb3\xf8Ch\xb4\x88\x9a\x9a\xe8'\xfe\x85\x85O\xfe\xcf\x07" - ) - - files = dict(artifact_metadata["files"]) - package_files = { - "name": PACKAGE_FILES["name"], - "version": PACKAGE_FILES["version"], - "files": files, - } - - known_package_artifacts = { - expected_revision_id: { - "extrinsic": {"raw": package_files,}, - # ... 
removed the unnecessary intermediary data - } - } - - # Too many .dsc - files["another.dsc"] = files["cicero_0.7.2-3.dsc"] - assert ( - loader.resolve_revision_from_artifacts(known_package_artifacts, p_info) is None - ) - - # Not enough .dsc - del files["another.dsc"] - del files["cicero_0.7.2-3.dsc"] - assert ( - loader.resolve_revision_from_artifacts(known_package_artifacts, p_info) is None - ) - - -def test_debian_resolve_revision_from_artifacts_corrupt_new_artifact(): - loader = DebianLoader(None, None, None, None) - artifact_metadata = PACKAGE_FILES - - files = PACKAGE_FILES["files"] - files = {**files, "another.dsc": files["cicero_0.7.2-3.dsc"]} - artifact_metadata = {**PACKAGE_FILES, "files": files} - - # Too many .dsc - files["another.dsc"] = files["cicero_0.7.2-3.dsc"] - p_info = DebianPackageInfo.from_metadata(artifact_metadata, url=URL) - assert loader.resolve_revision_from_artifacts(PACKAGE_FILES, p_info) is None - - # Not enough .dsc - del files["another.dsc"] - del files["cicero_0.7.2-3.dsc"] - p_info = DebianPackageInfo.from_metadata(artifact_metadata, url=URL) - assert loader.resolve_revision_from_artifacts(PACKAGE_FILES, p_info) is None diff --git a/swh/loader/package/deposit/loader.py b/swh/loader/package/deposit/loader.py --- a/swh/loader/package/deposit/loader.py +++ b/swh/loader/package/deposit/loader.py @@ -209,13 +209,6 @@ parents=p_info.revision_parents, directory=directory, synthetic=True, - metadata={ - "extrinsic": { - "provider": self.client.metadata_url(self.deposit_id), - "when": self.visit_date.isoformat(), - "raw": p_info.raw_info, - }, - }, ) def get_extrinsic_origin_metadata(self) -> List[RawExtrinsicMetadataCore]: diff --git a/swh/loader/package/deposit/tests/test_deposit.py b/swh/loader/package/deposit/tests/test_deposit.py --- a/swh/loader/package/deposit/tests/test_deposit.py +++ b/swh/loader/package/deposit/tests/test_deposit.py @@ -12,7 +12,6 @@ from swh.core.pytest_plugin import requests_mock_datadir_factory from swh.loader.package.deposit.loader import ApiClient, DepositLoader from swh.loader.package.loader import now -from swh.loader.package.tests.common import check_metadata_paths from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats from swh.model.hashutil import hash_to_bytes, hash_to_hex from swh.model.identifiers import ( @@ -158,42 +157,6 @@ assert body == expected_body -def test_deposit_revision_metadata_structure( - swh_storage, deposit_client, requests_mock_datadir -): - url = "https://hal-test.archives-ouvertes.fr/some-external-id" - deposit_id = 666 - loader = DepositLoader( - swh_storage, url, deposit_id, deposit_client, default_filename="archive.zip" - ) - - actual_load_status = loader.load() - assert actual_load_status["status"] == "eventful" - assert actual_load_status["snapshot_id"] is not None - expected_revision_id = hash_to_bytes("637318680351f5d78856d13264faebbd91efe9bb") - revision = loader.storage.revision_get([expected_revision_id])[0] - assert revision is not None - - check_metadata_paths( - revision.metadata, - paths=[ - ("extrinsic.provider", str), - ("extrinsic.when", str), - ("extrinsic.raw", dict), - ("original_artifact", list), - ], - ) - - # Only 2 top-level keys now - assert set(revision.metadata.keys()) == {"extrinsic", "original_artifact"} - - for original_artifact in revision.metadata["original_artifact"]: - check_metadata_paths( - original_artifact, - paths=[("filename", str), ("length", int), ("checksums", dict),], - ) - - def test_deposit_loading_ok(swh_storage, deposit_client, 
requests_mock_datadir): url = "https://hal-test.archives-ouvertes.fr/some-external-id" deposit_id = 666 @@ -336,8 +299,7 @@ assert revision assert revision.date.to_dict() == raw_meta["deposit"]["author_date"] assert revision.committer_date.to_dict() == raw_meta["deposit"]["committer_date"] - - read_api = f"{DEPOSIT_URL}/{deposit_id}/meta/" + assert not revision.metadata provider = { "provider_name": "hal", @@ -350,31 +312,6 @@ "version": "0.0.1", "configuration": {"sword_version": "2"}, } - assert revision.metadata == { - "extrinsic": { - "provider": read_api, - "raw": { - "origin": {"type": "deposit", "url": url,}, - "origin_metadata": { - "metadata": raw_meta["metadata_dict"], - "provider": provider, - "tool": tool, - }, - }, - "when": revision.metadata["extrinsic"]["when"], # dynamic - }, - "original_artifact": [ - { - "checksums": { - "sha1": "f8c63d7c890a7453498e6cf9fef215d85ec6801d", - "sha256": "474bf646aeeff6d945eb752b1a9f8a40f3d81a88909ee7bd2d08cc822aa361e6", # noqa - }, - "filename": "archive.zip", - "length": 956830, - "url": "https://deposit.softwareheritage.org/1/private/777/raw/", - } - ], - } fetcher = MetadataFetcher(name="swh-deposit", version="0.0.1",) diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py --- a/swh/loader/package/loader.py +++ b/swh/loader/package/loader.py @@ -252,45 +252,6 @@ def new_packageinfo_to_extid(self, p_info: TPackageInfo) -> Optional[PartialExtID]: return p_info.extid() - def known_artifact_to_extid(self, known_artifact: Dict) -> Optional[PartialExtID]: - """Returns a unique intrinsic identifier of a downloaded artifact, - used to check if a new artifact is the same.""" - return None - - def resolve_revision_from_artifacts( - self, known_artifacts: Dict[Sha1Git, Any], p_info: TPackageInfo, - ) -> Optional[Sha1Git]: - """Resolve the revision from known artifact metadata and a package info object. - - If the artifact has already been downloaded, this will return the - existing revision targeting that uncompressed artifact directory. - Otherwise, this returns None. - - Args: - known_artifacts: dict from revision ids to revision metadata - p_info: Package information - - Returns: - None or revision identifier - - """ - if not known_artifacts: - # No known artifact, no need to compute the artifact's extid - return None - - new_extid = self.new_packageinfo_to_extid(p_info) - if new_extid is None: - # This loader does not support deduplication, at least not for this - # artifact. - return None - - for rev_id, known_artifact in known_artifacts.items(): - known_extid = self.known_artifact_to_extid(known_artifact) - if new_extid == known_extid: - return rev_id - - return None - def _get_known_extids( self, packages_info: List[TPackageInfo] ) -> Dict[PartialExtID, List[CoreSWHID]]: @@ -609,14 +570,6 @@ known_extids, p_info, last_snapshot_targets ) - if revision_id is None: - # No existing revision found from an acceptable ExtID, - # search in the artifact data instead. - # TODO: remove this after we finished migrating to ExtIDs. - revision_id = self.resolve_revision_from_artifacts( - known_artifacts, p_info - ) - if revision_id is None: # No matching revision found in the last snapshot, load it. 
try: @@ -771,18 +724,6 @@ return None metadata = [metadata for (filepath, metadata) in dl_artifacts] - extra_metadata: Tuple[str, Any] = ( - "original_artifact", - metadata, - ) - - if revision.metadata is not None: - full_metadata = list(revision.metadata.items()) + [extra_metadata] - else: - full_metadata = [extra_metadata] - - # TODO: don't add these extrinsic metadata to the revision. - revision = attr.evolve(revision, metadata=ImmutableDict(full_metadata)) original_artifact_metadata = RawExtrinsicMetadata( target=ExtendedSWHID( diff --git a/swh/loader/package/nixguix/loader.py b/swh/loader/package/nixguix/loader.py --- a/swh/loader/package/nixguix/loader.py +++ b/swh/loader/package/nixguix/loader.py @@ -155,21 +155,6 @@ ret[revision.id] = revision.metadata return ret - @staticmethod - def known_artifact_to_extid(known_artifact: Dict) -> Optional[PartialExtID]: - try: - value = known_artifact["extrinsic"]["raw"]["integrity"].encode("ascii") - except KeyError as e: - logger.exception( - "Unexpected metadata revision structure detected: %(context)s", - {"context": {"reason": str(e), "known_artifact": known_artifact,}}, - ) - # metadata field for the revision is not as expected by the loader - # nixguix. We consider this not the right revision and continue checking - # the other revisions - return None - return (EXTID_TYPE, value) - def extra_branches(self) -> Dict[bytes, Mapping[str, Any]]: """We add a branch to the snapshot called 'evaluation' pointing to the revision used to generate the sources.json file. This revision @@ -210,13 +195,6 @@ parents=(), directory=directory, synthetic=True, - metadata={ - "extrinsic": { - "provider": self.provider_url, - "when": self.visit_date.isoformat(), - "raw": p_info.raw_info, - }, - }, ) diff --git a/swh/loader/package/nixguix/tests/test_nixguix.py b/swh/loader/package/nixguix/tests/test_nixguix.py --- a/swh/loader/package/nixguix/tests/test_nixguix.py +++ b/swh/loader/package/nixguix/tests/test_nixguix.py @@ -6,17 +6,14 @@ import json import logging import os -from typing import Dict, List, Optional, Tuple -from unittest.mock import patch +from typing import Dict, Optional, Tuple -import attr import pytest from swh.loader.package import __version__ from swh.loader.package.archive.loader import ArchiveLoader from swh.loader.package.nixguix.loader import ( NixGuixLoader, - NixGuixPackageInfo, clean_sources, make_pattern_unsupported_file_extension, parse_sources, @@ -93,10 +90,7 @@ for rev in revisions: assert rev is not None metadata = rev.metadata - assert metadata is not None - raw = metadata["extrinsic"]["raw"] - assert "url" in raw - assert "integrity" in raw + assert not metadata def test_retrieve_sources(swh_storage, requests_mock_datadir): @@ -469,24 +463,6 @@ } == stats -def test_resolve_revision_from_artifacts(swh_storage, requests_mock_datadir, datadir): - loader = NixGuixLoader(swh_storage, sources_url) - - known_artifacts = { - "id1": {"extrinsic": {"raw": {"url": "url1", "integrity": "integrity1"}}}, - "id2": {"extrinsic": {"raw": {"url": "url2", "integrity": "integrity2"}}}, - } - - p_info = NixGuixPackageInfo.from_metadata( - {"url": "url1", "integrity": "integrity1"} - ) - assert loader.resolve_revision_from_artifacts(known_artifacts, p_info) == "id1" - p_info = NixGuixPackageInfo.from_metadata( - {"url": "url3", "integrity": "integrity3"} - ) - assert loader.resolve_revision_from_artifacts(known_artifacts, p_info) is None - - def test_evaluation_branch(swh_storage, requests_mock_datadir): loader = NixGuixLoader(swh_storage, 
sources_url) res = loader.load() @@ -601,12 +577,6 @@ archive_loader.storage, gnu_url, status="full", type="tar" ) - gnu_snapshot: Snapshot = snapshot_get_all_branches( - archive_loader.storage, hash_to_bytes(expected_snapshot_id) - ) - - first_revision = gnu_snapshot.branches[f"releases/{release}".encode("utf-8")] - # 2. Then ingest with the nixguix loader which lists the same artifact within its # sources.json @@ -634,73 +604,3 @@ snapshot_id = actual_load_status2["snapshot_id"] snapshot = snapshot_get_all_branches(swh_storage, hash_to_bytes(snapshot_id)) assert snapshot - - # 3. Then ingest again with the nixguix loader, with a different snapshot - # and different source - - # simulate a snapshot already seen with a revision with the wrong metadata structure - # This revision should be skipped, thus making the artifact being ingested again. - with patch( - "swh.loader.package.loader.PackageLoader.last_snapshot" - ) as last_snapshot: - # mutate the snapshot to target a revision with the wrong metadata structure - # snapshot["branches"][artifact_url.encode("utf-8")] = first_revision - old_revision = swh_storage.revision_get([first_revision.target])[0] - # assert that revision is not in the right format - assert old_revision.metadata["extrinsic"]["raw"].get("integrity", {}) == {} - - # mutate snapshot to create a clash - snapshot = attr.evolve( - snapshot, - branches={ - **snapshot.branches, - artifact_url.encode("utf-8"): SnapshotBranch( - target_type=TargetType.REVISION, - target=hash_to_bytes(old_revision.id), - ), - }, - ) - - # modify snapshot to actually change revision metadata structure so we simulate - # a revision written by somebody else (structure different) - last_snapshot.return_value = snapshot - - loader = NixGuixLoader(swh_storage, sources_url) - actual_load_status3 = loader.load() - assert last_snapshot.called - assert actual_load_status3["status"] == "eventful" - - assert_last_visit_matches( - swh_storage, sources_url, status="full", type="nixguix" - ) - - new_snapshot_id = "32ff641e510aceefc3a6d0dcbf208b2854d2e965" - assert actual_load_status3["snapshot_id"] == new_snapshot_id - - last_snapshot = snapshot_get_all_branches( - swh_storage, hash_to_bytes(new_snapshot_id) - ) - new_revision_branch = last_snapshot.branches[artifact_url.encode("utf-8")] - assert new_revision_branch.target_type == TargetType.REVISION - - new_revision = swh_storage.revision_get([new_revision_branch.target])[0] - - # the new revision has the correct structure, so it got ingested alright by the - # new run - assert new_revision.metadata["extrinsic"]["raw"]["integrity"] is not None - - actual_detections: List[Dict] = [] - for record in caplog.records: - logtext = record.getMessage() - if "Unexpected metadata revision structure detected:" in logtext: - actual_detections.append(record.args["context"]) - - expected_detections = [ - {"reason": "'integrity'", "known_artifact": old_revision.metadata,}, - ] - - # less calls than there are sources listed in the sources.json; - # as some of them are skipped using the ExtID from a previous run - assert len(expected_detections) <= len(all_sources["sources"]) - - assert actual_detections == expected_detections diff --git a/swh/loader/package/npm/loader.py b/swh/loader/package/npm/loader.py --- a/swh/loader/package/npm/loader.py +++ b/swh/loader/package/npm/loader.py @@ -141,16 +141,6 @@ ) yield release_name(version), p_info - @staticmethod - def known_artifact_to_extid(known_artifact: Dict) -> Optional[PartialExtID]: - extid_str = 
_artifact_to_sha1(known_artifact) - if extid_str is None: - return None - try: - return (EXTID_TYPE, hash_to_bytes(extid_str)) - except ValueError: - return None - def build_revision( self, p_info: NpmPackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Revision]: @@ -183,56 +173,10 @@ parents=(), directory=directory, synthetic=True, - metadata={ - "intrinsic": {"tool": "package.json", "raw": i_metadata,}, - "extrinsic": { - "provider": self.provider_url, - "when": self.visit_date.isoformat(), - "raw": p_info.raw_info, - }, - }, ) return r -def _artifact_to_sha1(known_artifact: Dict) -> Optional[str]: - """Returns the sha1 from an NPM 'original_artifact' dict - - The following code allows to deal with 2 metadata formats: - - - old format sample:: - - { - 'package_source': { - 'sha1': '05181c12cd8c22035dd31155656826b85745da37', - } - } - - - new format sample:: - - { - 'original_artifact': [{ - 'checksums': { - 'sha256': '6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec', # noqa - ... - }, - }], - ... - } - - """ - known_original_artifact = known_artifact.get("original_artifact") - if not known_original_artifact: - # previous loader-npm version kept original artifact elsewhere - known_original_artifact = known_artifact.get("package_source") - if not known_original_artifact: - return None - return known_original_artifact["sha1"] - else: - assert isinstance(known_original_artifact, list) - return known_original_artifact[0]["checksums"]["sha1"] - - def _author_str(author_data: Union[Dict, List, str]) -> str: """Parse author from package.json author fields diff --git a/swh/loader/package/npm/tests/test_npm.py b/swh/loader/package/npm/tests/test_npm.py --- a/swh/loader/package/npm/tests/test_npm.py +++ b/swh/loader/package/npm/tests/test_npm.py @@ -14,7 +14,6 @@ _author_str, extract_npm_package_author, ) -from swh.loader.package.tests.common import check_metadata_paths from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats from swh.model.hashutil import hash_to_bytes from swh.model.identifiers import ( @@ -308,37 +307,6 @@ return "https://replicate.npmjs.com/%s/" % package -def test_npm_revision_metadata_structure(swh_storage, requests_mock_datadir): - package = "org" - loader = NpmLoader(swh_storage, package_url(package)) - - actual_load_status = loader.load() - assert actual_load_status["status"] == "eventful" - assert actual_load_status["snapshot_id"] is not None - - expected_revision_id = hash_to_bytes("d8a1c7474d2956ac598a19f0f27d52f7015f117e") - revision = swh_storage.revision_get([expected_revision_id])[0] - assert revision is not None - - check_metadata_paths( - revision.metadata, - paths=[ - ("intrinsic.tool", str), - ("intrinsic.raw", dict), - ("extrinsic.provider", str), - ("extrinsic.when", str), - ("extrinsic.raw", dict), - ("original_artifact", list), - ], - ) - - for original_artifact in revision.metadata["original_artifact"]: - check_metadata_paths( - original_artifact, - paths=[("filename", str), ("length", int), ("checksums", dict),], - ) - - def test_npm_loader_first_visit(swh_storage, requests_mock_datadir, org_api_info): package = "org" url = package_url(package) @@ -544,41 +512,6 @@ check_snapshot(expected_snapshot, swh_storage) -def test_npm__known_artifact_to_extid__old_loader_version(): - """Current loader version should parse old metadata scheme - - """ - assert ( - NpmLoader.known_artifact_to_extid( - {"package_source": {"sha1": "something-wrong"}} - ) - is None - ) - - sha1 = 
"05181c12cd8c22035dd31155656826b85745da37" - assert NpmLoader.known_artifact_to_extid({"package_source": {"sha1": sha1,}}) == ( - "npm-archive-sha1", - hash_to_bytes(sha1), - ) - - -def test_npm__known_artifact_to_extid__current_loader_version(): - """Current loader version should be able to parse current metadata scheme - - """ - sha1 = "05181c12cd8c22035dd31155656826b85745da37" - assert NpmLoader.known_artifact_to_extid( - {"original_artifact": [{"checksums": {"sha1": sha1},}],} - ) == ("npm-archive-sha1", hash_to_bytes(sha1)) - - assert ( - NpmLoader.known_artifact_to_extid( - {"original_artifact": [{"checksums": {"sha1": "something-wrong"},}],}, - ) - is None - ) - - def test_npm_artifact_with_no_intrinsic_metadata(swh_storage, requests_mock_datadir): """Skip artifact with no intrinsic metadata during ingestion diff --git a/swh/loader/package/pypi/loader.py b/swh/loader/package/pypi/loader.py --- a/swh/loader/package/pypi/loader.py +++ b/swh/loader/package/pypi/loader.py @@ -121,16 +121,6 @@ for version, p_info in res: yield release_name(version, p_info.filename), p_info - @staticmethod - def known_artifact_to_extid(known_artifact: Dict) -> Optional[PartialExtID]: - extid_str = _artifact_to_sha256(known_artifact) - if extid_str is None: - return None - try: - return (EXTID_TYPE, hash_to_bytes(extid_str)) if extid_str else None - except ValueError: - return None - def build_revision( self, p_info: PyPIPackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Revision]: @@ -157,61 +147,6 @@ parents=(), directory=directory, synthetic=True, - metadata={ - "intrinsic": {"tool": "PKG-INFO", "raw": i_metadata,}, - "extrinsic": { - "provider": self.provider_url, - "when": self.visit_date.isoformat(), - "raw": p_info.raw_info, - }, - }, - ) - - -def _artifact_to_sha256(known_artifact: Dict) -> Optional[str]: - """Returns the sha256 from a PyPI 'original_artifact' dict - - The following code allows to deal with 2 metadata formats (column metadata - in 'revision') - - - old format sample:: - - { - 'original_artifact': { - 'sha256': '6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec', # noqa - ... - }, - ... - } - - - new format sample:: - - { - 'original_artifact': [{ - 'checksums': { - 'sha256': '6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec', # noqa - ... - }, - }], - ... - } - - """ - original_artifact = known_artifact["original_artifact"] - if isinstance(original_artifact, dict): - # previous loader-pypi version stored metadata as dict - return original_artifact["sha256"] - # new pypi loader actually store metadata dict differently... 
- assert isinstance(original_artifact, list) - # current loader-pypi stores metadata as list of dict - if len(known_artifact["original_artifact"]) == 0: - return None - elif len(known_artifact["original_artifact"]) == 1: - return original_artifact[0]["checksums"]["sha256"] - else: - raise ValueError( - f"Expected exactly one PyPI original_artifact, got " - f"{len(known_artifact['original_artifact'])}" ) diff --git a/swh/loader/package/pypi/tests/test_pypi.py b/swh/loader/package/pypi/tests/test_pypi.py --- a/swh/loader/package/pypi/tests/test_pypi.py +++ b/swh/loader/package/pypi/tests/test_pypi.py @@ -19,7 +19,6 @@ extract_intrinsic_metadata, pypi_api_url, ) -from swh.loader.package.tests.common import check_metadata_paths from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats from swh.model.hashutil import hash_to_bytes from swh.model.identifiers import ( @@ -334,24 +333,6 @@ revision = swh_storage.revision_get([expected_revision_id])[0] assert revision is not None - check_metadata_paths( - revision.metadata, - paths=[ - ("intrinsic.tool", str), - ("intrinsic.raw", dict), - ("extrinsic.provider", str), - ("extrinsic.when", str), - ("extrinsic.raw", dict), - ("original_artifact", list), - ], - ) - - for original_artifact in revision.metadata["original_artifact"]: - check_metadata_paths( - original_artifact, - paths=[("filename", str), ("length", int), ("checksums", dict),], - ) - revision_swhid = CoreSWHID( object_type=ObjectType.REVISION, object_id=expected_revision_id ) @@ -796,52 +777,6 @@ ) -def test_pypi__known_artifact_to_extid__old_loader_version(): - """Current loader version should solve old metadata scheme - - """ - assert ( - PyPILoader.known_artifact_to_extid( - {"original_artifact": {"sha256": "something-wrong",},} - ) - is None - ) - - sha256 = "6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec" - assert PyPILoader.known_artifact_to_extid( - {"original_artifact": {"sha256": sha256}} - ) == ("pypi-archive-sha256", hash_to_bytes(sha256)) - - -def test_pypi__known_artifact_to_extid__current_loader_version(): - """Current loader version should be able to solve current metadata scheme - - """ - sha256 = "6975816f2c5ad4046acc676ba112f2fff945b01522d63948531f11f11e0892ec" - - assert PyPILoader.known_artifact_to_extid( - {"original_artifact": [{"checksums": {"sha256": sha256,},}],} - ) == ("pypi-archive-sha256", hash_to_bytes(sha256)) - - assert ( - PyPILoader.known_artifact_to_extid( - {"original_artifact": [{"checksums": {"sha256": "something-wrong"},}],}, - ) - is None - ) - - # there should not be more than one artifact - with pytest.raises(ValueError): - PyPILoader.known_artifact_to_extid( - { - "original_artifact": [ - {"checksums": {"sha256": sha256,},}, - {"checksums": {"sha256": sha256,},}, - ], - } - ) - - def test_pypi_artifact_with_no_intrinsic_metadata(swh_storage, requests_mock_datadir): """Skip artifact with no intrinsic metadata during ingestion diff --git a/swh/loader/package/tests/common.py b/swh/loader/package/tests/common.py --- a/swh/loader/package/tests/common.py +++ b/swh/loader/package/tests/common.py @@ -5,50 +5,8 @@ import logging from os import path -from typing import Dict, List, Tuple logger = logging.getLogger(__file__) DATADIR = path.join(path.abspath(path.dirname(__file__)), "resources") - - -def check_metadata(metadata: Dict, key_path: str, raw_type: str): - """Given a metadata dict, ensure the associated key_path value is of type - raw_type. 
- - Args: - metadata: Dict to check - key_path: Path to check - raw_type: Type to check the path with - - Raises: - Assertion error in case of mismatch - - """ - data = metadata - keys = key_path.split(".") - for k in keys: - try: - data = data[k] - except (TypeError, KeyError) as e: - # KeyError: because path too long - # TypeError: data is not a dict - raise AssertionError(e) - assert isinstance(data, raw_type) # type: ignore - - -def check_metadata_paths(metadata: Dict, paths: List[Tuple[str, str]]): - """Given a metadata dict, ensure the keys are of expected types - - Args: - metadata: Dict to check - key_path: Path to check - raw_type: Type to check the path with - - Raises: - Assertion error in case of mismatch - - """ - for key_path, raw_type in paths: - check_metadata(metadata, key_path, raw_type) diff --git a/swh/loader/package/tests/test_common.py b/swh/loader/package/tests/test_common.py deleted file mode 100644 --- a/swh/loader/package/tests/test_common.py +++ /dev/null @@ -1,62 +0,0 @@ -# Copyright (C) 2019-2020 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import pytest - -from swh.loader.package.tests.common import check_metadata, check_metadata_paths - - -def test_check_metadata(): - metadata = { - "a": {"raw": {"time": "something",},}, - "b": [], - "c": 1, - } - - for raw_path, raw_type in [ - ("a.raw", dict), - ("a.raw.time", str), - ("b", list), - ("c", int), - ]: - check_metadata(metadata, raw_path, raw_type) - - -def test_check_metadata_ko(): - metadata = { - "a": {"raw": "hello",}, - "b": [], - "c": 1, - } - - for raw_path, raw_type in [ - ("a.b", dict), - ("a.raw.time", str), - ]: - with pytest.raises(AssertionError): - check_metadata(metadata, raw_path, raw_type) - - -def test_check_metadata_paths(): - metadata = { - "a": {"raw": {"time": "something",},}, - "b": [], - "c": 1, - } - - check_metadata_paths( - metadata, [("a.raw", dict), ("a.raw.time", str), ("b", list), ("c", int),] - ) - - -def test_check_metadata_paths_ko(): - metadata = { - "a": {"raw": "hello",}, - "b": [], - "c": 1, - } - - with pytest.raises(AssertionError): - check_metadata_paths(metadata, [("a.b", dict), ("a.raw.time", str),]) diff --git a/swh/loader/package/tests/test_loader.py b/swh/loader/package/tests/test_loader.py --- a/swh/loader/package/tests/test_loader.py +++ b/swh/loader/package/tests/test_loader.py @@ -83,51 +83,6 @@ assert actual_load_status2 == {"status": "failed"} -def test_resolve_revision_from_artifacts() -> None: - loader = PackageLoader(None, None) # type: ignore - loader.known_artifact_to_extid = Mock( # type: ignore - wraps=lambda known_artifact: ("extid-type", known_artifact["key"].encode()) - ) - - known_artifacts = { - b"a" * 20: {"key": "extid-of-aaaa"}, - b"b" * 20: {"key": "extid-of-bbbb"}, - } - - p_info = Mock(wraps=BasePackageInfo(None, None)) # type: ignore - - # No known artifact -> it would be useless to compute the extid - assert loader.resolve_revision_from_artifacts({}, p_info) is None - p_info.extid.assert_not_called() - loader.known_artifact_to_extid.assert_not_called() - - p_info.extid.reset_mock() - - # Some artifacts, but the PackageInfo does not support extids - p_info.extid.return_value = None - assert loader.resolve_revision_from_artifacts(known_artifacts, p_info) is None - p_info.extid.assert_called_once() - loader.known_artifact_to_extid.assert_not_called() - - 
p_info.extid.reset_mock() - - # Some artifacts, and the PackageInfo is not one of them (ie. cache miss) - p_info.extid.return_value = ("extid-type", b"extid-of-cccc") - assert loader.resolve_revision_from_artifacts(known_artifacts, p_info) is None - p_info.extid.assert_called_once() - loader.known_artifact_to_extid.assert_any_call({"key": "extid-of-aaaa"}) - loader.known_artifact_to_extid.assert_any_call({"key": "extid-of-bbbb"}) - - p_info.extid.reset_mock() - loader.known_artifact_to_extid.reset_mock() - - # Some artifacts, and the PackageInfo is one of them (ie. cache hit) - p_info.extid.return_value = ("extid-type", b"extid-of-aaaa") - assert loader.resolve_revision_from_artifacts(known_artifacts, p_info) == b"a" * 20 - p_info.extid.assert_called_once() - loader.known_artifact_to_extid.assert_called_once_with({"key": "extid-of-aaaa"}) - - def test_resolve_revision_from_extids() -> None: loader = PackageLoader(None, None) # type: ignore
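[Reviewer note, not part of the patch] With `known_artifact_to_extid` and `resolve_revision_from_artifacts` gone, deduplication rests entirely on ExtID objects: `new_packageinfo_to_extid` advertises a `PartialExtID` for the candidate artifact, `_get_known_extids` maps ExtIDs to previously loaded revisions, and `resolve_revision_from_extids` picks a match among the last snapshot's targets. A minimal sketch of that flow, assuming raw sha1_git bytes where the real code handles `CoreSWHID`s (the function below is illustrative, not the actual `swh.loader.package.loader` implementation):

from typing import Dict, List, Optional, Set, Tuple

PartialExtID = Tuple[str, bytes]  # (extid type, extid value)


def resolve_revision_from_extids_sketch(
    known_extids: Dict[PartialExtID, List[bytes]],
    new_extid: Optional[PartialExtID],
    last_snapshot_targets: Set[bytes],
) -> Optional[bytes]:
    # A loader that opts out of deduplication returns None from
    # new_packageinfo_to_extid(); there is nothing to match in that case.
    if new_extid is None:
        return None
    # Reuse a revision only if the previous snapshot of this origin already
    # targets it; otherwise the artifact is loaded from scratch.
    for revision_id in known_extids.get(new_extid, []):
        if revision_id in last_snapshot_targets:
            return revision_id
    return None

For concrete ExtID values, the removed tests above pin down the shapes: npm uses ("npm-archive-sha1", sha1 bytes) and PyPI uses ("pypi-archive-sha256", sha256 bytes), both computed from artifact checksums rather than parsed back out of revision metadata.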
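The dropped `original_artifact` payload is not lost either: the tail of the loader.py hunk keeps emitting it as `RawExtrinsicMetadata` targeting the directory. A hedged sketch of that object's shape, with the authority, fetcher, and discovery date values assumed for illustration (field names as used by `swh.model.model` around the time of this change):

import datetime
import json

from swh.model.identifiers import ExtendedObjectType, ExtendedSWHID
from swh.model.model import (
    MetadataAuthority,
    MetadataAuthorityType,
    MetadataFetcher,
    RawExtrinsicMetadata,
)


def original_artifacts_metadata_sketch(
    directory_id: bytes, artifacts: list
) -> RawExtrinsicMetadata:
    # All concrete values below are placeholders; the loader derives its own
    # authority, fetcher, and visit date.
    return RawExtrinsicMetadata(
        target=ExtendedSWHID(
            object_type=ExtendedObjectType.DIRECTORY, object_id=directory_id
        ),
        discovery_date=datetime.datetime.now(tz=datetime.timezone.utc),
        authority=MetadataAuthority(
            type=MetadataAuthorityType.REGISTRY, url="https://softwareheritage.org/"
        ),
        fetcher=MetadataFetcher(name="swh.loader.package", version="0.0.0"),
        format="original-artifacts-json",
        metadata=json.dumps(artifacts).encode(),
    )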