diff --git a/swh/loader/package/deposit/loader.py b/swh/loader/package/deposit/loader.py index 0f940a4..34e5ab5 100644 --- a/swh/loader/package/deposit/loader.py +++ b/swh/loader/package/deposit/loader.py @@ -1,265 +1,261 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging import requests import types from typing import Any, Dict, Generator, List, Mapping, Optional, Sequence, Tuple, Union from swh.model.hashutil import hash_to_hex, hash_to_bytes from swh.model.model import ( Person, Revision, RevisionType, TimestampWithTimezone, Sha1Git, ) from swh.loader.package.loader import PackageLoader from swh.loader.package.utils import download logger = logging.getLogger(__name__) class DepositLoader(PackageLoader): """Load pypi origin's artifact releases into swh archive. """ visit_type = "deposit" def __init__(self, url: str, deposit_id: str): """Constructor Args: url: Origin url to associate the artifacts/metadata to deposit_id: Deposit identity """ super().__init__(url=url) config_deposit = self.config["deposit"] self.deposit_id = deposit_id self.client = ApiClient(url=config_deposit["url"], auth=config_deposit["auth"]) self.metadata: Dict[str, Any] = {} def get_versions(self) -> Sequence[str]: # only 1 branch 'HEAD' with no alias since we only have 1 snapshot # branch return ["HEAD"] def get_package_info( self, version: str ) -> Generator[Tuple[str, Mapping[str, Any]], None, None]: p_info = { "filename": "archive.zip", "raw": self.metadata, } yield "HEAD", p_info def download_package( self, p_info: Mapping[str, Any], tmpdir: str ) -> List[Tuple[str, Mapping]]: """Override to allow use of the dedicated deposit client """ return [self.client.archive_get(self.deposit_id, tmpdir, p_info["filename"])] def build_revision( self, a_metadata: Dict, uncompressed_path: str, directory: Sha1Git ) -> Optional[Revision]: # FIXME: the deposit read api should no longer need to build the revision entry # as this would avoid unnecessary indirection. This would also align with what # other package loaders do revision_data = a_metadata.pop("revision") # Note: # `date` and `committer_date` are always transmitted by the deposit read api # which computes itself the values. The loader needs to use those to create the # revision. # date: codemeta:dateCreated if any, deposit completed_date otherwise date = TimestampWithTimezone.from_dict(revision_data["date"]) # commit_date: codemeta:datePublished if any, deposit completed_date otherwise commit_date = TimestampWithTimezone.from_dict(revision_data["committer_date"]) - metadata = revision_data["metadata"] - metadata.update( - { - "extrinsic": { - "provider": self.client.metadata_url(self.deposit_id), - "when": self.visit_date.isoformat(), - "raw": a_metadata, - }, - } - ) return Revision( type=RevisionType.TAR, message=revision_data["message"].encode("utf-8"), author=parse_author(revision_data["author"]), date=date, committer=parse_author(revision_data["committer"]), committer_date=commit_date, parents=[hash_to_bytes(p) for p in revision_data.get("parents", [])], directory=directory, synthetic=True, - metadata=metadata, + metadata={ + "extrinsic": { + "provider": self.client.metadata_url(self.deposit_id), + "when": self.visit_date.isoformat(), + "raw": a_metadata, + }, + }, ) def load(self) -> Dict: # First making sure the deposit is known prior to trigger a loading try: self.metadata = self.client.metadata_get(self.deposit_id) except ValueError: logger.error(f"Unknown deposit {self.deposit_id}, ignoring") return {"status": "failed"} # Then usual loading r = super().load() success = r["status"] != "failed" if success: # Update archive with metadata information origin_metadata = self.metadata["origin_metadata"] logger.debug("origin_metadata: %s", origin_metadata) tools = self.storage.tool_add([origin_metadata["tool"]]) logger.debug("tools: %s", tools) tool_id = tools[0]["id"] provider = origin_metadata["provider"] # FIXME: Shall we delete this info? provider_id = self.storage.metadata_provider_add( provider["provider_name"], provider["provider_type"], provider["provider_url"], metadata=None, ) metadata = origin_metadata["metadata"] self.storage.origin_metadata_add( self.url, self.visit_date, provider_id, tool_id, metadata ) # Update deposit status try: if not success: self.client.status_update(self.deposit_id, status="failed") return r snapshot_id = hash_to_bytes(r["snapshot_id"]) branches = self.storage.snapshot_get(snapshot_id)["branches"] logger.debug("branches: %s", branches) if not branches: return r rev_id = branches[b"HEAD"]["target"] revisions = self.storage.revision_get([rev_id]) # FIXME: inconsistency between tests and production code if isinstance(revisions, types.GeneratorType): revisions = list(revisions) revision = revisions[0] # Retrieve the revision identifier dir_id = revision["directory"] # update the deposit's status to success with its # revision-id and directory-id self.client.status_update( self.deposit_id, status="done", revision_id=hash_to_hex(rev_id), directory_id=hash_to_hex(dir_id), origin_url=self.url, ) except Exception: logger.exception("Problem when trying to update the deposit's status") return {"status": "failed"} return r def parse_author(author) -> Person: """See prior fixme """ return Person( fullname=author["fullname"].encode("utf-8"), name=author["name"].encode("utf-8"), email=author["email"].encode("utf-8"), ) class ApiClient: """Private Deposit Api client """ def __init__(self, url, auth: Optional[Mapping[str, str]]): self.base_url = url.rstrip("/") self.auth = None if not auth else (auth["username"], auth["password"]) def do(self, method: str, url: str, *args, **kwargs): """Internal method to deal with requests, possibly with basic http authentication. Args: method (str): supported http methods as in get/post/put Returns: The request's execution output """ method_fn = getattr(requests, method) if self.auth: kwargs["auth"] = self.auth return method_fn(url, *args, **kwargs) def archive_get( self, deposit_id: Union[int, str], tmpdir: str, filename: str ) -> Tuple[str, Dict]: """Retrieve deposit's archive artifact locally """ url = f"{self.base_url}/{deposit_id}/raw/" return download(url, dest=tmpdir, filename=filename, auth=self.auth) def metadata_url(self, deposit_id: Union[int, str]) -> str: return f"{self.base_url}/{deposit_id}/meta/" def metadata_get(self, deposit_id: Union[int, str]) -> Dict[str, Any]: """Retrieve deposit's metadata artifact as json """ url = self.metadata_url(deposit_id) r = self.do("get", url) if r.ok: return r.json() msg = f"Problem when retrieving deposit metadata at {url}" logger.error(msg) raise ValueError(msg) def status_update( self, deposit_id: Union[int, str], status: str, revision_id: Optional[str] = None, directory_id: Optional[str] = None, origin_url: Optional[str] = None, ): """Update deposit's information including status, and persistent identifiers result of the loading. """ url = f"{self.base_url}/{deposit_id}/update/" payload = {"status": status} if revision_id: payload["revision_id"] = revision_id if directory_id: payload["directory_id"] = directory_id if origin_url: payload["origin_url"] = origin_url self.do("put", url, json=payload) diff --git a/swh/loader/package/deposit/tests/test_deposit.py b/swh/loader/package/deposit/tests/test_deposit.py index fbfa5cd..5ca57da 100644 --- a/swh/loader/package/deposit/tests/test_deposit.py +++ b/swh/loader/package/deposit/tests/test_deposit.py @@ -1,254 +1,330 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import re import pytest from swh.model.hashutil import hash_to_bytes from swh.loader.package.deposit.loader import DepositLoader from swh.loader.package.tests.common import ( check_snapshot, check_metadata_paths, get_stats, ) from swh.core.pytest_plugin import requests_mock_datadir_factory @pytest.fixture def requests_mock_datadir(requests_mock_datadir): """Enhance default mock data to mock put requests as the loader does some internal update queries there. """ requests_mock_datadir.put(re.compile("https")) return requests_mock_datadir def test_deposit_init_ok(swh_config, swh_loader_config): url = "some-url" deposit_id = 999 loader = DepositLoader(url, deposit_id) # Something that does not exist assert loader.url == url assert loader.client is not None assert loader.client.base_url == swh_loader_config["deposit"]["url"] def test_deposit_loading_unknown_deposit(swh_config, requests_mock_datadir): """Loading an unknown deposit should fail no origin, no visit, no snapshot """ # private api url form: 'https://deposit.s.o/1/private/hal/666/raw/' url = "some-url" unknown_deposit_id = 667 loader = DepositLoader(url, unknown_deposit_id) # does not exist actual_load_status = loader.load() assert actual_load_status == {"status": "failed"} stats = get_stats(loader.storage) assert { "content": 0, "directory": 0, "origin": 0, "origin_visit": 0, "person": 0, "release": 0, "revision": 0, "skipped_content": 0, "snapshot": 0, } == stats requests_mock_datadir_missing_one = requests_mock_datadir_factory( ignore_urls=["https://deposit.softwareheritage.org/1/private/666/raw/",] ) def test_deposit_loading_failure_to_retrieve_1_artifact( swh_config, requests_mock_datadir_missing_one ): """Deposit with missing artifact ends up with an uneventful/partial visit """ # private api url form: 'https://deposit.s.o/1/private/hal/666/raw/' url = "some-url-2" deposit_id = 666 loader = DepositLoader(url, deposit_id) actual_load_status = loader.load() assert actual_load_status["status"] == "uneventful" assert actual_load_status["snapshot_id"] is not None stats = get_stats(loader.storage) assert { "content": 0, "directory": 0, "origin": 1, "origin_visit": 1, "person": 0, "release": 0, "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats origin_visit = loader.storage.origin_visit_get_latest(url) assert origin_visit["status"] == "partial" assert origin_visit["type"] == "deposit" def test_revision_metadata_structure(swh_config, requests_mock_datadir): url = "https://hal-test.archives-ouvertes.fr/some-external-id" deposit_id = 666 loader = DepositLoader(url, deposit_id) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None expected_revision_id = hash_to_bytes("637318680351f5d78856d13264faebbd91efe9bb") revision = list(loader.storage.revision_get([expected_revision_id]))[0] assert revision is not None check_metadata_paths( revision["metadata"], paths=[ ("extrinsic.provider", str), ("extrinsic.when", str), ("extrinsic.raw", dict), ("original_artifact", list), ], ) + # Only 2 top-level keys now + assert set(revision["metadata"].keys()) == {"extrinsic", "original_artifact"} + for original_artifact in revision["metadata"]["original_artifact"]: check_metadata_paths( original_artifact, paths=[("filename", str), ("length", int), ("checksums", dict),], ) def test_deposit_loading_ok(swh_config, requests_mock_datadir): url = "https://hal-test.archives-ouvertes.fr/some-external-id" deposit_id = 666 loader = DepositLoader(url, deposit_id) actual_load_status = loader.load() expected_snapshot_id = "b2b327b33dc85818bd23c3ccda8b7e675a66ecbd" assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id, } stats = get_stats(loader.storage) assert { "content": 303, "directory": 12, "origin": 1, "origin_visit": 1, "person": 1, "release": 0, "revision": 1, "skipped_content": 0, "snapshot": 1, } == stats origin_visit = loader.storage.origin_visit_get_latest(url) assert origin_visit["status"] == "full" assert origin_visit["type"] == "deposit" expected_branches = { "HEAD": { "target": "637318680351f5d78856d13264faebbd91efe9bb", "target_type": "revision", }, } expected_snapshot = { "id": expected_snapshot_id, "branches": expected_branches, } check_snapshot(expected_snapshot, storage=loader.storage) # check metadata tool = { "name": "swh-deposit", "version": "0.0.1", "configuration": {"sword_version": "2",}, } tool = loader.storage.tool_get(tool) assert tool is not None assert tool["id"] is not None provider = { "provider_name": "hal", "provider_type": "deposit_client", "provider_url": "https://hal-test.archives-ouvertes.fr/", "metadata": None, } provider = loader.storage.metadata_provider_get_by(provider) assert provider is not None assert provider["id"] is not None metadata = list( loader.storage.origin_metadata_get_by(url, provider_type="deposit_client") ) assert metadata is not None assert isinstance(metadata, list) assert len(metadata) == 1 metadata0 = metadata[0] assert metadata0["provider_id"] == provider["id"] assert metadata0["provider_type"] == "deposit_client" assert metadata0["tool_id"] == tool["id"] def test_deposit_loading_ok_2(swh_config, requests_mock_datadir): """Field dates should be se appropriately """ - url = "https://hal-test.archives-ouvertes.fr/some-external-id" + external_id = "some-external-id" + url = f"https://hal-test.archives-ouvertes.fr/{external_id}" deposit_id = 777 loader = DepositLoader(url, deposit_id) actual_load_status = loader.load() expected_snapshot_id = "3e68440fdd7c81d283f8f3aebb6f0c8657864192" assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id, } revision_id = "564d18943d71be80d0d73b43a77cfb205bcde96c" expected_branches = {"HEAD": {"target": revision_id, "target_type": "revision"}} expected_snapshot = { "id": expected_snapshot_id, "branches": expected_branches, } check_snapshot(expected_snapshot, storage=loader.storage) origin_visit = loader.storage.origin_visit_get_latest(url) # The visit is partial because some hash collision were detected assert origin_visit["status"] == "full" assert origin_visit["type"] == "deposit" raw_meta = loader.client.metadata_get(deposit_id) # Ensure the date fields are set appropriately in the revision # Retrieve the revision revision = next(loader.storage.revision_get([hash_to_bytes(revision_id)])) assert revision - for field_date in ["committer_date", "date"]: - assert revision[field_date] == raw_meta["revision"][field_date] + assert revision["committer_date"] == raw_meta["revision"]["committer_date"] + assert revision["date"] == raw_meta["revision"]["date"] + + read_api = f"https://deposit.softwareheritage.org/1/private/{deposit_id}/meta/" + + assert revision["metadata"] == { + "extrinsic": { + "provider": read_api, + "raw": { + "branch_name": "master", + "origin": {"type": "deposit", "url": url,}, + "origin_metadata": { + "metadata": { + "@xmlns": ["http://www.w3.org/2005/Atom"], + "author": ["some awesome author", "another one", "no one",], + "codemeta:dateCreated": "2017-10-07T15:17:08Z", + "codemeta:datePublished": "2017-10-08T15:00:00Z", + "external_identifier": "some-external-id", + "url": url, + }, + "provider": { + "metadata": None, + "provider_name": "hal", + "provider_type": "deposit_client", + "provider_url": "https://hal-test.archives-ouvertes.fr/", + }, + "tool": { + "configuration": {"sword_version": "2"}, + "name": "swh-deposit", + "version": "0.0.1", + }, + }, + }, + "when": revision["metadata"]["extrinsic"]["when"], # dynamic + }, + "original_artifact": [ + { + "checksums": { + "sha1": "f8c63d7c890a7453498e6cf9fef215d85ec6801d", + "sha256": "474bf646aeeff6d945eb752b1a9f8a40f3d81a88909ee7bd2d08cc822aa361e6", # noqa + }, + "filename": "archive.zip", + "length": 956830, + } + ], + } + + # Check the metadata swh side + origin_meta = list( + loader.storage.origin_metadata_get_by(url, provider_type="deposit_client") + ) + + assert len(origin_meta) == 1 + + origin_meta = origin_meta[0] + # dynamic, a pain to display and not that interesting + origin_meta.pop("discovery_date") + + assert origin_meta == { + "metadata": { + "@xmlns": ["http://www.w3.org/2005/Atom"], + "author": ["some awesome author", "another one", "no one"], + "codemeta:dateCreated": "2017-10-07T15:17:08Z", + "codemeta:datePublished": "2017-10-08T15:00:00Z", + "external_identifier": "some-external-id", + "url": "https://hal-test.archives-ouvertes.fr/some-external-id", + }, + "origin_url": "https://hal-test.archives-ouvertes.fr/some-external-id", + "provider_id": 1, + "provider_name": "hal", + "provider_type": "deposit_client", + "provider_url": "https://hal-test.archives-ouvertes.fr/", + "tool_id": 1, + }