diff --git a/swh/loader/package/deposit/loader.py b/swh/loader/package/deposit/loader.py
index f5888fa..bf8d118 100644
--- a/swh/loader/package/deposit/loader.py
+++ b/swh/loader/package/deposit/loader.py
@@ -1,325 +1,351 @@
# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import datetime
+from datetime import timezone
import json
import logging
from typing import Any, Dict, Iterator, List, Mapping, Optional, Sequence, Tuple, Union
import attr
import requests
from swh.loader.package.loader import (
BasePackageInfo,
PackageLoader,
RawExtrinsicMetadataCore,
)
from swh.loader.package.utils import cached_method, download
from swh.model.hashutil import hash_to_bytes, hash_to_hex
from swh.model.model import (
MetadataAuthority,
MetadataAuthorityType,
MetadataFetcher,
Person,
Revision,
RevisionType,
Sha1Git,
TimestampWithTimezone,
)
from swh.storage.algos.snapshot import snapshot_get_all_branches
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Return the current time as a timezone-aware datetime in UTC.
+def now() -> datetime.datetime:
+ return datetime.datetime.now(tz=timezone.utc)
+
+
# Package information describing a single deposit, as handled by DepositLoader.
@attr.s
class DepositPackageInfo(BasePackageInfo):
filename = attr.ib(type=str) # instead of Optional[str]
raw_info = attr.ib(type=Dict[str, Any])
# NOTE(review): author_date and commit_date are declared as datetime.datetime, but
# DepositLoader.build_revision passes them to TimestampWithTimezone.from_dict --
# confirm the actual runtime type of these two attributes.
author_date = attr.ib(type=datetime.datetime)
"""codemeta:dateCreated if any, deposit completed_date otherwise"""
commit_date = attr.ib(type=datetime.datetime)
"""codemeta:datePublished if any, deposit completed_date otherwise"""
client = attr.ib(type=str)
id = attr.ib(type=int)
"""Internal ID of the deposit in the deposit DB"""
collection = attr.ib(type=str)
"""The collection in the deposit; see SWORD specification."""
author = attr.ib(type=Person)
committer = attr.ib(type=Person)
revision_parents = attr.ib(type=Tuple[Sha1Git, ...])
"""Revisions created from previous deposits, that will be used as parents of the
revision created for this deposit."""
# Alternate constructor: build the package info from the metadata dict returned by
# the deposit server (see ApiClient.metadata_get).
@classmethod
def from_metadata(
cls, metadata: Dict[str, Any], url: str, filename: str
) -> "DepositPackageInfo":
# Note:
# `date` and `committer_date` are always transmitted by the deposit read api
# which computes itself the values. The loader needs to use those to create the
# revision.
- raw_metadata_from_origin = json.dumps(
- metadata["origin_metadata"]["metadata"]
- ).encode()
- metadata = metadata.copy()
- # FIXME: this removes information from 'raw' metadata
- depo = metadata.pop("deposit")
-
# Added lines: raw_info is assembled explicitly from the "origin", "metadata_dict"
# (serialized to a JSON string), "provider" and "tool" entries; the input dict is
# no longer copied or mutated.
+ all_metadata_raw: List[str] = metadata["metadata_raw"]
+ raw_info = {
+ "origin": metadata["origin"],
+ "origin_metadata": {
+ "metadata": json.dumps(metadata["metadata_dict"]),
+ "provider": metadata["provider"],
+ "tool": metadata["tool"],
+ },
+ }
+ depo = metadata["deposit"]
return cls(
url=url,
filename=filename,
author_date=depo["author_date"],
commit_date=depo["committer_date"],
client=depo["client"],
id=depo["id"],
collection=depo["collection"],
author=parse_author(depo["author"]),
committer=parse_author(depo["committer"]),
revision_parents=tuple(hash_to_bytes(p) for p in depo["revision_parents"]),
- raw_info=metadata,
+ raw_info=raw_info,
# Added lines: one RawExtrinsicMetadataCore per entry of "metadata_raw", each
# encoded to bytes and stamped with the current UTC time as discovery date.
revision_extrinsic_metadata=[
RawExtrinsicMetadataCore(
- format="sword-v2-atom-codemeta-v2-in-json",
- metadata=raw_metadata_from_origin,
- ),
+ discovery_date=now(),
+ metadata=raw_metadata.encode(),
+ format="sword-v2-atom-codemeta-v2",
+ )
+ for raw_metadata in all_metadata_raw
],
)
class DepositLoader(PackageLoader[DepositPackageInfo]):
"""Load a deposit's artifact (fetched through the private deposit api) into swh archive.
"""
visit_type = "deposit"
def __init__(self, url: str, deposit_id: str):
"""Constructor
Args:
url: Origin url to associate the artifacts/metadata to
deposit_id: Deposit identity
"""
super().__init__(url=url)
# The "deposit" configuration section provides the api url and its credentials
config_deposit = self.config["deposit"]
self.deposit_id = deposit_id
self.client = ApiClient(url=config_deposit["url"], auth=config_deposit["auth"])
def get_versions(self) -> Sequence[str]:
# only 1 branch 'HEAD' with no alias since we only have 1 snapshot
# branch
return ["HEAD"]
# Build the metadata authority from the "provider" entry of the deposit metadata;
# the provider is asserted to be of the deposit client type.
def get_metadata_authority(self) -> MetadataAuthority:
- provider = self.metadata()["origin_metadata"]["provider"]
- assert provider["provider_type"] == "deposit_client"
+ provider = self.metadata()["provider"]
+ assert provider["provider_type"] == MetadataAuthorityType.DEPOSIT_CLIENT.value
return MetadataAuthority(
type=MetadataAuthorityType.DEPOSIT_CLIENT,
url=provider["provider_url"],
metadata={
"name": provider["provider_name"],
**(provider["metadata"] or {}),
},
)
# Build the metadata fetcher from the "tool" entry of the deposit metadata.
def get_metadata_fetcher(self) -> MetadataFetcher:
- tool = self.metadata()["origin_metadata"]["tool"]
+ tool = self.metadata()["tool"]
return MetadataFetcher(
name=tool["name"], version=tool["version"], metadata=tool["configuration"],
)
# Yield the single ("HEAD", package info) pair for this deposit; the archive is
# always named "archive.zip".
def get_package_info(
self, version: str
) -> Iterator[Tuple[str, DepositPackageInfo]]:
p_info = DepositPackageInfo.from_metadata(
self.metadata(), url=self.url, filename="archive.zip",
)
yield "HEAD", p_info
def download_package(
self, p_info: DepositPackageInfo, tmpdir: str
) -> List[Tuple[str, Mapping]]:
"""Override to allow use of the dedicated deposit client
"""
return [self.client.archive_get(self.deposit_id, tmpdir, p_info.filename)]
# Build the synthetic revision for the deposit: the message names the client, the
# deposit id and the collection; extrinsic metadata records the metadata url, the
# visit date and the raw info.
def build_revision(
self, p_info: DepositPackageInfo, uncompressed_path: str, directory: Sha1Git
) -> Optional[Revision]:
message = (
f"{p_info.client}: Deposit {p_info.id} in collection {p_info.collection}"
).encode("utf-8")
return Revision(
type=RevisionType.TAR,
message=message,
author=p_info.author,
date=TimestampWithTimezone.from_dict(p_info.author_date),
committer=p_info.committer,
committer_date=TimestampWithTimezone.from_dict(p_info.commit_date),
parents=p_info.revision_parents,
directory=directory,
synthetic=True,
metadata={
"extrinsic": {
"provider": self.client.metadata_url(self.deposit_id),
"when": self.visit_date.isoformat(),
"raw": p_info.raw_info,
},
},
)
# Added lines: return one entry per raw metadata document, followed by one JSON
# entry bundling the raw metadata list, the provider and the tool.
# NOTE(review): the bundled JSON entry uses format "original-artifacts-json" --
# confirm this format name is the intended one for that payload.
def get_extrinsic_origin_metadata(self) -> List[RawExtrinsicMetadataCore]:
- origin_metadata = self.metadata()["origin_metadata"]
+ metadata = self.metadata()
+ all_metadata_raw: List[str] = metadata["metadata_raw"]
+ origin_metadata = json.dumps(
+ {
+ "metadata": all_metadata_raw,
+ "provider": metadata["provider"],
+ "tool": metadata["tool"],
+ }
+ ).encode()
return [
RawExtrinsicMetadataCore(
- format="sword-v2-atom-codemeta-v2-in-json",
- metadata=json.dumps(origin_metadata["metadata"]).encode(),
+ discovery_date=now(),
+ metadata=raw_meta.encode(),
+ format="sword-v2-atom-codemeta-v2",
+ )
+ for raw_meta in all_metadata_raw
+ ] + [
+ RawExtrinsicMetadataCore(
+ discovery_date=now(),
+ metadata=origin_metadata,
+ format="original-artifacts-json",
)
]
@cached_method
def metadata(self):
"""Returns metadata from the deposit server"""
return self.client.metadata_get(self.deposit_id)
# Run the load, then report the outcome back to the deposit server.
def load(self) -> Dict:
# First making sure the deposit is known prior to trigger a loading
try:
self.metadata()
except ValueError:
logger.error(f"Unknown deposit {self.deposit_id}, ignoring")
return {"status": "failed"}
# Then usual loading
r = super().load()
success = r["status"] != "failed"
# Update deposit status
try:
if not success:
self.client.status_update(self.deposit_id, status="failed")
return r
snapshot_id = hash_to_bytes(r["snapshot_id"])
snapshot = snapshot_get_all_branches(self.storage, snapshot_id)
# In the early returns below, no status update is sent to the deposit server
if not snapshot:
return r
branches = snapshot.branches
logger.debug("branches: %s", branches)
if not branches:
return r
rev_id = branches[b"HEAD"].target
revision = self.storage.revision_get([rev_id])[0]
if not revision:
return r
# update the deposit's status to success with its
# revision-id and directory-id
self.client.status_update(
self.deposit_id,
status="done",
revision_id=hash_to_hex(rev_id),
directory_id=hash_to_hex(revision.directory),
snapshot_id=r["snapshot_id"],
origin_url=self.url,
)
# Any error raised while updating the status turns the whole load into a failure
except Exception:
logger.exception("Problem when trying to update the deposit's status")
return {"status": "failed"}
return r
def parse_author(author) -> Person:
    """Build a Person out of an author mapping.

    Args:
        author: mapping holding string values under the "fullname", "name"
            and "email" keys

    Returns:
        A Person whose three fields are the UTF-8 encoded values.
    """
    encoded_fields = {
        field: author[field].encode("utf-8")
        for field in ("fullname", "name", "email")
    }
    return Person(**encoded_fields)
class ApiClient:
    """Private Deposit Api client

    Wraps the http calls the loader makes to the private deposit api: archive
    download, metadata retrieval and status update. Requests are authenticated
    with basic http authentication when credentials are configured.
    """

    def __init__(self, url, auth: Optional[Mapping[str, str]]):
        """Constructor

        Args:
            url: Base url of the private deposit api (trailing "/" are dropped)
            auth: Optional mapping with "username" and "password" keys; a falsy
                value disables authentication

        """
        self.base_url = url.rstrip("/")
        self.auth = None if not auth else (auth["username"], auth["password"])

    def do(self, method: str, url: str, *args, **kwargs):
        """Internal method to deal with requests, possibly with basic http
        authentication.

        Args:
            method (str): supported http methods as in get/post/put

        Returns:
            The request's execution output

        """
        method_fn = getattr(requests, method)
        if self.auth:
            kwargs["auth"] = self.auth
        return method_fn(url, *args, **kwargs)

    def archive_get(
        self, deposit_id: Union[int, str], tmpdir: str, filename: str
    ) -> Tuple[str, Dict]:
        """Retrieve deposit's archive artifact locally

        """
        url = f"{self.base_url}/{deposit_id}/raw/"
        return download(url, dest=tmpdir, filename=filename, auth=self.auth)

    def metadata_url(self, deposit_id: Union[int, str]) -> str:
        """Compute the url of the deposit's metadata endpoint"""
        return f"{self.base_url}/{deposit_id}/meta/"

    def metadata_get(self, deposit_id: Union[int, str]) -> Dict[str, Any]:
        """Retrieve deposit's metadata artifact as json

        Raises:
            ValueError: when the server's response is not a success

        """
        url = self.metadata_url(deposit_id)
        r = self.do("get", url)
        if r.ok:
            return r.json()

        msg = f"Problem when retrieving deposit metadata at {url}"
        logger.error(msg)
        raise ValueError(msg)

    def status_update(
        self,
        deposit_id: Union[int, str],
        status: str,
        revision_id: Optional[str] = None,
        directory_id: Optional[str] = None,
        snapshot_id: Optional[str] = None,
        origin_url: Optional[str] = None,
    ):
        """Update deposit's information including status, and persistent
        identifiers result of the loading.

        Only the identifiers that are actually provided are sent. A non
        successful response from the server is logged; it used to be
        discarded silently.

        """
        url = f"{self.base_url}/{deposit_id}/update/"
        payload = {"status": status}
        optional_fields = (
            ("revision_id", revision_id),
            ("directory_id", directory_id),
            ("snapshot_id", snapshot_id),
            ("origin_url", origin_url),
        )
        payload.update({key: value for key, value in optional_fields if value})

        response = self.do("put", url, json=payload)
        # Stay best-effort (no exception raised), but leave a trace when the
        # server rejected the update.
        if response is not None and not response.ok:
            logger.error(
                "Problem when updating deposit status at %s (http status %s)",
                url,
                response.status_code,
            )
diff --git a/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.10.json b/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.10.json
index 4af7553..d054b58 100644
--- a/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.10.json
+++ b/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.10.json
@@ -1,66 +1,62 @@
{
"origin": {
"url": "https://hal-test.archives-ouvertes.fr/some-external-id",
"type": "deposit"
},
- "origin_metadata": {
- "metadata": {
- "@xmlns": [
- "http://www.w3.org/2005/Atom"
- ],
- "author": [
- "some awesome author",
- "another one",
- "no one"
- ],
- "codemeta:dateCreated": "2017-10-07T15:17:08Z",
- "external_identifier": "some-external-id",
- "url": "https://hal-test.archives-ouvertes.fr/some-external-id"
- },
- "provider": {
- "provider_name": "hal",
- "provider_type": "deposit_client",
- "provider_url": "https://hal-test.archives-ouvertes.fr/",
- "metadata": null
- },
- "tool": {
- "name": "swh-deposit",
- "version": "0.0.1",
- "configuration": {
- "sword_version": "2"
- }
+ "metadata_raw" : ["some-external-idhttps://hal-test.archives-ouvertes.fr/some-external-id2017-10-07T15:17:08Zsome awesome authoranother one"],
+ "metadata_dict": {
+ "author": [
+ "some awesome author",
+ "another one",
+ "no one"
+ ],
+ "codemeta:dateCreated": "2017-10-07T15:17:08Z",
+ "external_identifier": "some-external-id",
+ "url": "https://hal-test.archives-ouvertes.fr/some-external-id"
+ },
+ "provider": {
+ "provider_name": "hal",
+ "provider_type": "deposit_client",
+ "provider_url": "https://hal-test.archives-ouvertes.fr/",
+ "metadata": null
+ },
+ "tool": {
+ "name": "swh-deposit",
+ "version": "0.0.1",
+ "configuration": {
+ "sword_version": "2"
}
},
"deposit": {
"id": "666",
"client": "hal",
"collection": "hal",
"author": {
"name": "Software Heritage",
"fullname": "Software Heritage",
"email": "robot@softwareheritage.org"
},
"author_date": {
"timestamp": {
"seconds": 1507389428,
"microseconds": 0
},
"offset": 0,
"negative_utc": false
},
"committer": {
"name": "Software Heritage",
"fullname": "Software Heritage",
"email": "robot@softwareheritage.org"
},
"committer_date": {
"timestamp": {
"seconds": 1507389428,
"microseconds": 0
},
"offset": 0,
"negative_utc": false
},
"revision_parents": []
}
}
diff --git a/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.11.json b/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.11.json
index cbfab58..9d73fac 100644
--- a/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.11.json
+++ b/swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.11.json
@@ -1,67 +1,65 @@
{
"origin": {
"url": "https://hal-test.archives-ouvertes.fr/some-external-id",
"type": "deposit"
},
- "origin_metadata": {
- "metadata": {
- "@xmlns": [
- "http://www.w3.org/2005/Atom"
- ],
- "author": [
- "some awesome author",
- "another one",
- "no one"
- ],
- "codemeta:dateCreated": "2017-10-07T15:17:08Z",
- "codemeta:datePublished": "2017-10-08T15:00:00Z",
- "external_identifier": "some-external-id",
- "url": "https://hal-test.archives-ouvertes.fr/some-external-id"
- },
- "provider": {
- "provider_name": "hal",
- "provider_type": "deposit_client",
- "provider_url": "https://hal-test.archives-ouvertes.fr/",
- "metadata": null
- },
- "tool": {
- "name": "swh-deposit",
- "version": "0.0.1",
- "configuration": {
- "sword_version": "2"
- }
+ "metadata_raw": ["some-external-idhttps://hal-test.archives-ouvertes.fr/some-external-id2017-10-07T15:17:08Zsome awesome authoranother oneno one",
+"someone"
+ ],
+ "metadata_dict": {
+ "author": [
+ "some awesome author",
+ "another one",
+ "no one"
+ ],
+ "codemeta:dateCreated": "2017-10-07T15:17:08Z",
+ "codemeta:datePublished": "2017-10-08T15:00:00Z",
+ "external_identifier": "some-external-id",
+ "url": "https://hal-test.archives-ouvertes.fr/some-external-id"
+ },
+ "provider": {
+ "provider_name": "hal",
+ "provider_type": "deposit_client",
+ "provider_url": "https://hal-test.archives-ouvertes.fr/",
+ "metadata": null
+ },
+ "tool": {
+ "name": "swh-deposit",
+ "version": "0.0.1",
+ "configuration": {
+ "sword_version": "2"
}
},
"deposit": {
"id": 777,
"client": "hal",
"collection": "hal",
"author": {
"name": "Software Heritage",
"fullname": "Software Heritage",
"email": "robot@softwareheritage.org"
},
"author_date": {
"timestamp": {
"seconds": 1507389428,
"microseconds": 0
},
"offset": 0,
"negative_utc": false
},
"committer": {
"name": "Software Heritage",
"fullname": "Software Heritage",
"email": "robot@softwareheritage.org"
},
"committer_date": {
"timestamp": {
"seconds": 1507474800,
"microseconds": 0
},
"offset": 0,
"negative_utc": false
},
"revision_parents": []
}
}
diff --git a/swh/loader/package/deposit/tests/test_deposit.py b/swh/loader/package/deposit/tests/test_deposit.py
index f29d3f9..f70ae98 100644
--- a/swh/loader/package/deposit/tests/test_deposit.py
+++ b/swh/loader/package/deposit/tests/test_deposit.py
@@ -1,398 +1,429 @@
# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import json
import re
+from typing import List
import attr
import pytest
from swh.core.pytest_plugin import requests_mock_datadir_factory
from swh.loader.package.deposit.loader import DepositLoader
+from swh.loader.package.loader import now
from swh.loader.package.tests.common import check_metadata_paths
from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats
from swh.model.hashutil import hash_to_bytes, hash_to_hex
from swh.model.identifiers import SWHID
from swh.model.model import (
MetadataAuthority,
MetadataAuthorityType,
MetadataFetcher,
MetadataTargetType,
RawExtrinsicMetadata,
Snapshot,
SnapshotBranch,
TargetType,
)
# Base url of the private deposit api; the tests below build the "<id>/raw/" and
# "<id>/update/" endpoint urls from it.
DEPOSIT_URL = "https://deposit.softwareheritage.org/1/private"
@pytest.fixture
def requests_mock_datadir(requests_mock_datadir):
    """Extend the default mocked data so that put requests are mocked as well.

    The loader issues update (put) queries to the deposit server once the
    loading is done.

    """
    any_https_url = re.compile("https")
    requests_mock_datadir.put(any_https_url)
    return requests_mock_datadir
def test_deposit_init_ok(swh_config, swh_loader_config):
    """Instantiating the loader sets its url and configures its api client"""
    origin_url = "some-url"
    # Something that does not exist
    loader = DepositLoader(origin_url, 999)

    assert loader.url == origin_url
    api_client = loader.client
    assert api_client is not None
    assert api_client.base_url == swh_loader_config["deposit"]["url"]
def test_deposit_loading_unknown_deposit(swh_config, requests_mock_datadir):
    """Loading an unknown deposit should fail

    no origin, no visit, no snapshot
    """
    # private api url form: 'https://deposit.s.o/1/private/hal/666/raw/'
    loader = DepositLoader("some-url", 667)  # does not exist

    assert loader.load() == {"status": "failed"}

    # Nothing at all must have been written to the storage
    object_types = (
        "content",
        "directory",
        "origin",
        "origin_visit",
        "release",
        "revision",
        "skipped_content",
        "snapshot",
    )
    expected_stats = dict.fromkeys(object_types, 0)
    assert expected_stats == get_stats(loader.storage)
# Mocked data fixture in which the archive of deposit 666 cannot be retrieved
requests_mock_datadir_missing_one = requests_mock_datadir_factory(
    ignore_urls=[f"{DEPOSIT_URL}/666/raw/"],
)
def test_deposit_loading_failure_to_retrieve_1_artifact(
    swh_config, requests_mock_datadir_missing_one
):
    """Deposit with missing artifact ends up with an uneventful/partial visit

    """
    # private api url form: 'https://deposit.s.o/1/private/hal/666/raw/'
    origin_url = "some-url-2"
    loader = DepositLoader(origin_url, 666)

    load_status = loader.load()
    assert load_status["status"] == "uneventful"
    assert load_status["snapshot_id"] is not None

    assert_last_visit_matches(
        loader.storage, origin_url, status="partial", type="deposit"
    )

    # Only the origin, its visit and a snapshot got written
    expected_stats = {
        "content": 0,
        "directory": 0,
        "origin": 1,
        "origin_visit": 1,
        "release": 0,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    }
    assert expected_stats == get_stats(loader.storage)
def test_revision_metadata_structure(swh_config, requests_mock_datadir):
    """The revision created for a deposit carries the expected metadata layout"""
    origin_url = "https://hal-test.archives-ouvertes.fr/some-external-id"
    loader = DepositLoader(origin_url, 666)

    load_status = loader.load()
    assert load_status["status"] == "eventful"
    assert load_status["snapshot_id"] is not None

    rev_id = hash_to_bytes("637318680351f5d78856d13264faebbd91efe9bb")
    revision = loader.storage.revision_get([rev_id])[0]
    assert revision is not None

    revision_paths = [
        ("extrinsic.provider", str),
        ("extrinsic.when", str),
        ("extrinsic.raw", dict),
        ("original_artifact", list),
    ]
    check_metadata_paths(revision.metadata, paths=revision_paths)

    # Only 2 top-level keys now
    assert set(revision.metadata) == {"extrinsic", "original_artifact"}

    artifact_paths = [("filename", str), ("length", int), ("checksums", dict)]
    for artifact in revision.metadata["original_artifact"]:
        check_metadata_paths(artifact, paths=artifact_paths)
# Nominal loading of deposit 666: checks the load status, the storage counters, the
# snapshot, the origin/revision extrinsic metadata and the status update query.
def test_deposit_loading_ok(swh_config, requests_mock_datadir):
url = "https://hal-test.archives-ouvertes.fr/some-external-id"
deposit_id = 666
loader = DepositLoader(url, deposit_id)
actual_load_status = loader.load()
expected_snapshot_id = "b2b327b33dc85818bd23c3ccda8b7e675a66ecbd"
assert actual_load_status == {
"status": "eventful",
"snapshot_id": expected_snapshot_id,
}
assert_last_visit_matches(loader.storage, url, status="full", type="deposit")
stats = get_stats(loader.storage)
assert {
"content": 303,
"directory": 12,
"origin": 1,
"origin_visit": 1,
"release": 0,
"revision": 1,
"skipped_content": 0,
"snapshot": 1,
} == stats
revision_id_hex = "637318680351f5d78856d13264faebbd91efe9bb"
revision_id = hash_to_bytes(revision_id_hex)
expected_snapshot = Snapshot(
id=hash_to_bytes(expected_snapshot_id),
branches={
b"HEAD": SnapshotBranch(
target=revision_id, target_type=TargetType.REVISION,
),
},
)
check_snapshot(expected_snapshot, storage=loader.storage)
revision = loader.storage.revision_get([revision_id])[0]
assert revision is not None
# check metadata
fetcher = MetadataFetcher(name="swh-deposit", version="0.0.1",)
authority = MetadataAuthority(
type=MetadataAuthorityType.DEPOSIT_CLIENT,
url="https://hal-test.archives-ouvertes.fr/",
)
# Check origin metadata
orig_meta = loader.storage.raw_extrinsic_metadata_get(
MetadataTargetType.ORIGIN, url, authority
)
assert orig_meta.next_page_token is None
- assert len(orig_meta.results) == 1
+ raw_meta = loader.client.metadata_get(deposit_id)
+ all_metadata_raw: List[str] = raw_meta["metadata_raw"]
+ # 1 entry per raw metadata xml + 1 json dict
+ assert len(orig_meta.results) == len(all_metadata_raw) + 1
orig_meta0 = orig_meta.results[0]
assert orig_meta0.authority == authority
assert orig_meta0.fetcher == fetcher
# Check revision metadata
revision_swhid = SWHID(object_type="revision", object_id=revision_id)
- rev_meta = loader.storage.raw_extrinsic_metadata_get(
+ actual_rev_meta = loader.storage.raw_extrinsic_metadata_get(
MetadataTargetType.REVISION, revision_swhid, authority
)
- assert rev_meta.next_page_token is None
- assert len(rev_meta.results) == 1
- rev_meta0 = rev_meta.results[0]
- assert rev_meta0.authority == authority
- assert rev_meta0.fetcher == fetcher
+ assert actual_rev_meta.next_page_token is None
+ assert len(actual_rev_meta.results) == len(all_metadata_raw)
+ for rev_meta in actual_rev_meta.results:
+ assert rev_meta.authority == authority
+ assert rev_meta.fetcher == fetcher
+ assert rev_meta.metadata.decode() in all_metadata_raw
# Retrieve the information for deposit status update query to the deposit
urls = [
m
for m in requests_mock_datadir.request_history
if m.url == f"{DEPOSIT_URL}/{deposit_id}/update/"
]
assert len(urls) == 1
update_query = urls[0]
body = update_query.json()
expected_body = {
"status": "done",
"revision_id": revision_id_hex,
"directory_id": hash_to_hex(revision.directory),
"snapshot_id": expected_snapshot_id,
"origin_url": url,
}
assert body == expected_body
def test_deposit_loading_ok_2(swh_config, requests_mock_datadir):
"""Field dates should be set appropriately
"""
external_id = "some-external-id"
url = f"https://hal-test.archives-ouvertes.fr/{external_id}"
deposit_id = 777
loader = DepositLoader(url, deposit_id)
actual_load_status = loader.load()
expected_snapshot_id = "3e68440fdd7c81d283f8f3aebb6f0c8657864192"
assert actual_load_status == {
"status": "eventful",
"snapshot_id": expected_snapshot_id,
}
assert_last_visit_matches(loader.storage, url, status="full", type="deposit")
revision_id = "564d18943d71be80d0d73b43a77cfb205bcde96c"
expected_snapshot = Snapshot(
id=hash_to_bytes(expected_snapshot_id),
branches={
b"HEAD": SnapshotBranch(
target=hash_to_bytes(revision_id), target_type=TargetType.REVISION
)
},
)
check_snapshot(expected_snapshot, storage=loader.storage)
raw_meta = loader.client.metadata_get(deposit_id)
# Ensure the date fields are set appropriately in the revision
# Retrieve the revision
revision = loader.storage.revision_get([hash_to_bytes(revision_id)])[0]
assert revision
assert revision.date.to_dict() == raw_meta["deposit"]["author_date"]
assert revision.committer_date.to_dict() == raw_meta["deposit"]["committer_date"]
read_api = f"{DEPOSIT_URL}/{deposit_id}/meta/"
+ provider = {
+ "provider_name": "hal",
+ "provider_type": "deposit_client",
+ "provider_url": "https://hal-test.archives-ouvertes.fr/",
+ "metadata": None,
+ }
+ tool = {
+ "name": "swh-deposit",
+ "version": "0.0.1",
+ "configuration": {"sword_version": "2"},
+ }
assert revision.metadata == {
"extrinsic": {
"provider": read_api,
"raw": {
"origin": {"type": "deposit", "url": url,},
"origin_metadata": {
- "metadata": {
- "@xmlns": ["http://www.w3.org/2005/Atom"],
- "author": ["some awesome author", "another one", "no one",],
- "codemeta:dateCreated": "2017-10-07T15:17:08Z",
- "codemeta:datePublished": "2017-10-08T15:00:00Z",
- "external_identifier": "some-external-id",
- "url": url,
- },
- "provider": {
- "metadata": None,
- "provider_name": "hal",
- "provider_type": "deposit_client",
- "provider_url": "https://hal-test.archives-ouvertes.fr/",
- },
- "tool": {
- "configuration": {"sword_version": "2"},
- "name": "swh-deposit",
- "version": "0.0.1",
- },
+ "metadata": json.dumps(raw_meta["metadata_dict"]),
+ "provider": provider,
+ "tool": tool,
},
},
"when": revision.metadata["extrinsic"]["when"], # dynamic
},
"original_artifact": [
{
"checksums": {
"sha1": "f8c63d7c890a7453498e6cf9fef215d85ec6801d",
"sha256": "474bf646aeeff6d945eb752b1a9f8a40f3d81a88909ee7bd2d08cc822aa361e6", # noqa
},
"filename": "archive.zip",
"length": 956830,
"url": "https://deposit.softwareheritage.org/1/private/777/raw/",
}
],
}
fetcher = MetadataFetcher(name="swh-deposit", version="0.0.1",)
authority = MetadataAuthority(
type=MetadataAuthorityType.DEPOSIT_CLIENT,
url="https://hal-test.archives-ouvertes.fr/",
)
# Check the origin metadata swh side
- orig_meta = loader.storage.raw_extrinsic_metadata_get(
+ origin_extrinsic_metadata = loader.storage.raw_extrinsic_metadata_get(
MetadataTargetType.ORIGIN, url, authority
)
- assert orig_meta.next_page_token is None
- assert len(orig_meta.results) == 1
-
- orig_meta0 = orig_meta.results[0]
+ assert origin_extrinsic_metadata.next_page_token is None
+ all_metadata_raw: List[str] = raw_meta["metadata_raw"]
+ # 1 entry per raw metadata xml + 1 json dict
+ assert len(origin_extrinsic_metadata.results) == len(all_metadata_raw) + 1
+
# NOTE(review): the loop variable below rebinds `raw_meta`, which until here held the
# metadata dict returned by the deposit client; the dict is not read again afterwards.
+ expected_metadata = []
+ for idx, raw_meta in enumerate(all_metadata_raw):
+ origin_meta = origin_extrinsic_metadata.results[idx]
+ expected_metadata.append(
+ RawExtrinsicMetadata(
+ type=MetadataTargetType.ORIGIN,
+ id=url,
+ discovery_date=origin_meta.discovery_date,
+ metadata=raw_meta.encode(),
+ format="sword-v2-atom-codemeta-v2",
+ authority=authority,
+ fetcher=fetcher,
+ )
+ )
- expected_metadata = RawExtrinsicMetadata(
- type=MetadataTargetType.ORIGIN,
- id=url,
- discovery_date=orig_meta0.discovery_date,
- metadata=json.dumps(
- {
- "@xmlns": ["http://www.w3.org/2005/Atom"],
- "author": ["some awesome author", "another one", "no one"],
- "codemeta:dateCreated": "2017-10-07T15:17:08Z",
- "codemeta:datePublished": "2017-10-08T15:00:00Z",
- "external_identifier": "some-external-id",
- "url": "https://hal-test.archives-ouvertes.fr/some-external-id",
- }
- ).encode(),
- format="sword-v2-atom-codemeta-v2-in-json",
- authority=authority,
- fetcher=fetcher,
+ origin_metadata = {
+ "metadata": all_metadata_raw,
+ "provider": provider,
+ "tool": tool,
+ }
+ expected_metadata.append(
+ RawExtrinsicMetadata(
+ type=MetadataTargetType.ORIGIN,
+ id=url,
+ discovery_date=origin_extrinsic_metadata.results[-1].discovery_date,
+ metadata=json.dumps(origin_metadata).encode(),
+ format="original-artifacts-json",
+ authority=authority,
+ fetcher=fetcher,
+ )
)
- assert orig_meta0 == expected_metadata
+ assert len(origin_extrinsic_metadata.results) == len(expected_metadata)
+ for orig_meta in origin_extrinsic_metadata.results:
+ assert orig_meta in expected_metadata
# Check the revision metadata swh side
# NOTE(review): revision_id is a hex string here, whereas test_deposit_loading_ok
# passes bytes as object_id -- confirm SWHID accepts both.
revision_swhid = SWHID(object_type="revision", object_id=revision_id)
- rev_meta = loader.storage.raw_extrinsic_metadata_get(
+ actual_revision_metadata = loader.storage.raw_extrinsic_metadata_get(
MetadataTargetType.REVISION, revision_swhid, authority
)
- assert rev_meta.next_page_token is None
+ assert actual_revision_metadata.next_page_token is None
+ assert len(actual_revision_metadata.results) == len(all_metadata_raw)
- assert len(rev_meta.results) == 1
-
- rev_meta0 = rev_meta.results[0]
-
- assert rev_meta0 == attr.evolve(
- expected_metadata,
+ rev_metadata_template = RawExtrinsicMetadata(
type=MetadataTargetType.REVISION,
id=revision_swhid,
+ format="sword-v2-atom-codemeta-v2",
+ authority=authority,
+ fetcher=fetcher,
origin=url,
+ # to satisfy the constructor
+ discovery_date=now(),
+ metadata=b"",
)
# Added lines: expected revision metadata reuse the discovery date of the stored
# entry at the same index, so only the other fields are really compared.
+ expected_revision_metadata = []
+ for idx, raw_meta in enumerate(all_metadata_raw):
+ rev_metadata = actual_revision_metadata.results[idx]
+ expected_revision_metadata.append(
+ attr.evolve(
+ rev_metadata_template,
+ discovery_date=rev_metadata.discovery_date,
+ metadata=raw_meta.encode(),
+ )
+ )
+
+ assert actual_revision_metadata.results == expected_revision_metadata
+
# Retrieve the information for deposit status update query to the deposit
urls = [
m
for m in requests_mock_datadir.request_history
if m.url == f"{DEPOSIT_URL}/{deposit_id}/update/"
]
assert len(urls) == 1
update_query = urls[0]
body = update_query.json()
expected_body = {
"status": "done",
"revision_id": revision_id,
"directory_id": hash_to_hex(revision.directory),
"snapshot_id": expected_snapshot_id,
"origin_url": url,
}
assert body == expected_body