diff --git a/swh/loader/package/deposit/loader.py b/swh/loader/package/deposit/loader.py
index 0f940a4..34e5ab5 100644
--- a/swh/loader/package/deposit/loader.py
+++ b/swh/loader/package/deposit/loader.py
@@ -1,265 +1,261 @@
# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
import requests
import types
from typing import Any, Dict, Generator, List, Mapping, Optional, Sequence, Tuple, Union
from swh.model.hashutil import hash_to_hex, hash_to_bytes
from swh.model.model import (
Person,
Revision,
RevisionType,
TimestampWithTimezone,
Sha1Git,
)
from swh.loader.package.loader import PackageLoader
from swh.loader.package.utils import download
logger = logging.getLogger(__name__)
class DepositLoader(PackageLoader):
"""Load pypi origin's artifact releases into swh archive.
"""
visit_type = "deposit"
def __init__(self, url: str, deposit_id: str):
"""Constructor
Args:
url: Origin url to associate the artifacts/metadata to
deposit_id: Deposit identity
"""
super().__init__(url=url)
config_deposit = self.config["deposit"]
self.deposit_id = deposit_id
self.client = ApiClient(url=config_deposit["url"], auth=config_deposit["auth"])
self.metadata: Dict[str, Any] = {}
def get_versions(self) -> Sequence[str]:
# only 1 branch 'HEAD' with no alias since we only have 1 snapshot
# branch
return ["HEAD"]
def get_package_info(
self, version: str
) -> Generator[Tuple[str, Mapping[str, Any]], None, None]:
p_info = {
"filename": "archive.zip",
"raw": self.metadata,
}
yield "HEAD", p_info
def download_package(
self, p_info: Mapping[str, Any], tmpdir: str
) -> List[Tuple[str, Mapping]]:
"""Override to allow use of the dedicated deposit client
"""
return [self.client.archive_get(self.deposit_id, tmpdir, p_info["filename"])]
def build_revision(
self, a_metadata: Dict, uncompressed_path: str, directory: Sha1Git
) -> Optional[Revision]:
# FIXME: the deposit read api should no longer need to build the revision entry
# as this would avoid unnecessary indirection. This would also align with what
# other package loaders do
revision_data = a_metadata.pop("revision")
# Note:
# `date` and `committer_date` are always transmitted by the deposit read api,
# which computes the values itself. The loader must use them to create the
# revision.
# date: codemeta:dateCreated if any, deposit completed_date otherwise
date = TimestampWithTimezone.from_dict(revision_data["date"])
# commit_date: codemeta:datePublished if any, deposit completed_date otherwise
commit_date = TimestampWithTimezone.from_dict(revision_data["committer_date"])
- metadata = revision_data["metadata"]
- metadata.update(
- {
- "extrinsic": {
- "provider": self.client.metadata_url(self.deposit_id),
- "when": self.visit_date.isoformat(),
- "raw": a_metadata,
- },
- }
- )
return Revision(
type=RevisionType.TAR,
message=revision_data["message"].encode("utf-8"),
author=parse_author(revision_data["author"]),
date=date,
committer=parse_author(revision_data["committer"]),
committer_date=commit_date,
parents=[hash_to_bytes(p) for p in revision_data.get("parents", [])],
directory=directory,
synthetic=True,
- metadata=metadata,
+ metadata={
+ "extrinsic": {
+ "provider": self.client.metadata_url(self.deposit_id),
+ "when": self.visit_date.isoformat(),
+ "raw": a_metadata,
+ },
+ },
)
def load(self) -> Dict:
# First, make sure the deposit is known before triggering a load
try:
self.metadata = self.client.metadata_get(self.deposit_id)
except ValueError:
logger.error(f"Unknown deposit {self.deposit_id}, ignoring")
return {"status": "failed"}
# Then usual loading
r = super().load()
success = r["status"] != "failed"
if success:
# Update archive with metadata information
origin_metadata = self.metadata["origin_metadata"]
logger.debug("origin_metadata: %s", origin_metadata)
tools = self.storage.tool_add([origin_metadata["tool"]])
logger.debug("tools: %s", tools)
tool_id = tools[0]["id"]
provider = origin_metadata["provider"]
# FIXME: Shall we delete this info?
provider_id = self.storage.metadata_provider_add(
provider["provider_name"],
provider["provider_type"],
provider["provider_url"],
metadata=None,
)
metadata = origin_metadata["metadata"]
self.storage.origin_metadata_add(
self.url, self.visit_date, provider_id, tool_id, metadata
)
# Update deposit status
try:
if not success:
self.client.status_update(self.deposit_id, status="failed")
return r
snapshot_id = hash_to_bytes(r["snapshot_id"])
branches = self.storage.snapshot_get(snapshot_id)["branches"]
logger.debug("branches: %s", branches)
if not branches:
return r
rev_id = branches[b"HEAD"]["target"]
revisions = self.storage.revision_get([rev_id])
# FIXME: inconsistency between tests and production code
if isinstance(revisions, types.GeneratorType):
revisions = list(revisions)
revision = revisions[0]
# Retrieve the directory identifier
dir_id = revision["directory"]
# update the deposit's status to success with its
# revision-id and directory-id
self.client.status_update(
self.deposit_id,
status="done",
revision_id=hash_to_hex(rev_id),
directory_id=hash_to_hex(dir_id),
origin_url=self.url,
)
except Exception:
logger.exception("Problem when trying to update the deposit's status")
return {"status": "failed"}
return r
def parse_author(author) -> Person:
"""See prior fixme
"""
return Person(
fullname=author["fullname"].encode("utf-8"),
name=author["name"].encode("utf-8"),
email=author["email"].encode("utf-8"),
)
class ApiClient:
"""Private Deposit Api client
"""
def __init__(self, url, auth: Optional[Mapping[str, str]]):
self.base_url = url.rstrip("/")
self.auth = None if not auth else (auth["username"], auth["password"])
def do(self, method: str, url: str, *args, **kwargs):
"""Internal method to deal with requests, possibly with basic http
authentication.
Args:
method (str): supported http methods as in get/post/put
Returns:
The request's execution output
"""
method_fn = getattr(requests, method)
if self.auth:
kwargs["auth"] = self.auth
return method_fn(url, *args, **kwargs)
def archive_get(
self, deposit_id: Union[int, str], tmpdir: str, filename: str
) -> Tuple[str, Dict]:
"""Retrieve deposit's archive artifact locally
"""
url = f"{self.base_url}/{deposit_id}/raw/"
return download(url, dest=tmpdir, filename=filename, auth=self.auth)
def metadata_url(self, deposit_id: Union[int, str]) -> str:
return f"{self.base_url}/{deposit_id}/meta/"
def metadata_get(self, deposit_id: Union[int, str]) -> Dict[str, Any]:
"""Retrieve deposit's metadata artifact as json
"""
url = self.metadata_url(deposit_id)
r = self.do("get", url)
if r.ok:
return r.json()
msg = f"Problem when retrieving deposit metadata at {url}"
logger.error(msg)
raise ValueError(msg)
def status_update(
self,
deposit_id: Union[int, str],
status: str,
revision_id: Optional[str] = None,
directory_id: Optional[str] = None,
origin_url: Optional[str] = None,
):
"""Update deposit's information including status, and persistent
identifiers result of the loading.
"""
url = f"{self.base_url}/{deposit_id}/update/"
payload = {"status": status}
if revision_id:
payload["revision_id"] = revision_id
if directory_id:
payload["directory_id"] = directory_id
if origin_url:
payload["origin_url"] = origin_url
self.do("put", url, json=payload)
diff --git a/swh/loader/package/deposit/tests/test_deposit.py b/swh/loader/package/deposit/tests/test_deposit.py
index fbfa5cd..5ca57da 100644
--- a/swh/loader/package/deposit/tests/test_deposit.py
+++ b/swh/loader/package/deposit/tests/test_deposit.py
@@ -1,254 +1,330 @@
# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import re
import pytest
from swh.model.hashutil import hash_to_bytes
from swh.loader.package.deposit.loader import DepositLoader
from swh.loader.package.tests.common import (
check_snapshot,
check_metadata_paths,
get_stats,
)
from swh.core.pytest_plugin import requests_mock_datadir_factory
@pytest.fixture
def requests_mock_datadir(requests_mock_datadir):
"""Enhance default mock data to mock put requests as the loader does some
internal update queries there.
"""
requests_mock_datadir.put(re.compile("https"))
return requests_mock_datadir
def test_deposit_init_ok(swh_config, swh_loader_config):
url = "some-url"
deposit_id = 999
loader = DepositLoader(url, deposit_id) # Something that does not exist
assert loader.url == url
assert loader.client is not None
assert loader.client.base_url == swh_loader_config["deposit"]["url"]
def test_deposit_loading_unknown_deposit(swh_config, requests_mock_datadir):
"""Loading an unknown deposit should fail
no origin, no visit, no snapshot
"""
# private api url form: 'https://deposit.s.o/1/private/hal/666/raw/'
url = "some-url"
unknown_deposit_id = 667
loader = DepositLoader(url, unknown_deposit_id) # does not exist
actual_load_status = loader.load()
assert actual_load_status == {"status": "failed"}
stats = get_stats(loader.storage)
assert {
"content": 0,
"directory": 0,
"origin": 0,
"origin_visit": 0,
"person": 0,
"release": 0,
"revision": 0,
"skipped_content": 0,
"snapshot": 0,
} == stats
requests_mock_datadir_missing_one = requests_mock_datadir_factory(
ignore_urls=["https://deposit.softwareheritage.org/1/private/666/raw/",]
)
def test_deposit_loading_failure_to_retrieve_1_artifact(
swh_config, requests_mock_datadir_missing_one
):
"""Deposit with missing artifact ends up with an uneventful/partial visit
"""
# private api url form: 'https://deposit.s.o/1/private/hal/666/raw/'
url = "some-url-2"
deposit_id = 666
loader = DepositLoader(url, deposit_id)
actual_load_status = loader.load()
assert actual_load_status["status"] == "uneventful"
assert actual_load_status["snapshot_id"] is not None
stats = get_stats(loader.storage)
assert {
"content": 0,
"directory": 0,
"origin": 1,
"origin_visit": 1,
"person": 0,
"release": 0,
"revision": 0,
"skipped_content": 0,
"snapshot": 1,
} == stats
origin_visit = loader.storage.origin_visit_get_latest(url)
assert origin_visit["status"] == "partial"
assert origin_visit["type"] == "deposit"
def test_revision_metadata_structure(swh_config, requests_mock_datadir):
url = "https://hal-test.archives-ouvertes.fr/some-external-id"
deposit_id = 666
loader = DepositLoader(url, deposit_id)
actual_load_status = loader.load()
assert actual_load_status["status"] == "eventful"
assert actual_load_status["snapshot_id"] is not None
expected_revision_id = hash_to_bytes("637318680351f5d78856d13264faebbd91efe9bb")
revision = list(loader.storage.revision_get([expected_revision_id]))[0]
assert revision is not None
check_metadata_paths(
revision["metadata"],
paths=[
("extrinsic.provider", str),
("extrinsic.when", str),
("extrinsic.raw", dict),
("original_artifact", list),
],
)
+ # Only 2 top-level keys now
+ assert set(revision["metadata"].keys()) == {"extrinsic", "original_artifact"}
+
for original_artifact in revision["metadata"]["original_artifact"]:
check_metadata_paths(
original_artifact,
paths=[("filename", str), ("length", int), ("checksums", dict),],
)
def test_deposit_loading_ok(swh_config, requests_mock_datadir):
url = "https://hal-test.archives-ouvertes.fr/some-external-id"
deposit_id = 666
loader = DepositLoader(url, deposit_id)
actual_load_status = loader.load()
expected_snapshot_id = "b2b327b33dc85818bd23c3ccda8b7e675a66ecbd"
assert actual_load_status == {
"status": "eventful",
"snapshot_id": expected_snapshot_id,
}
stats = get_stats(loader.storage)
assert {
"content": 303,
"directory": 12,
"origin": 1,
"origin_visit": 1,
"person": 1,
"release": 0,
"revision": 1,
"skipped_content": 0,
"snapshot": 1,
} == stats
origin_visit = loader.storage.origin_visit_get_latest(url)
assert origin_visit["status"] == "full"
assert origin_visit["type"] == "deposit"
expected_branches = {
"HEAD": {
"target": "637318680351f5d78856d13264faebbd91efe9bb",
"target_type": "revision",
},
}
expected_snapshot = {
"id": expected_snapshot_id,
"branches": expected_branches,
}
check_snapshot(expected_snapshot, storage=loader.storage)
# check metadata
tool = {
"name": "swh-deposit",
"version": "0.0.1",
"configuration": {"sword_version": "2",},
}
tool = loader.storage.tool_get(tool)
assert tool is not None
assert tool["id"] is not None
provider = {
"provider_name": "hal",
"provider_type": "deposit_client",
"provider_url": "https://hal-test.archives-ouvertes.fr/",
"metadata": None,
}
provider = loader.storage.metadata_provider_get_by(provider)
assert provider is not None
assert provider["id"] is not None
metadata = list(
loader.storage.origin_metadata_get_by(url, provider_type="deposit_client")
)
assert metadata is not None
assert isinstance(metadata, list)
assert len(metadata) == 1
metadata0 = metadata[0]
assert metadata0["provider_id"] == provider["id"]
assert metadata0["provider_type"] == "deposit_client"
assert metadata0["tool_id"] == tool["id"]
def test_deposit_loading_ok_2(swh_config, requests_mock_datadir):
"""Field dates should be se appropriately
"""
- url = "https://hal-test.archives-ouvertes.fr/some-external-id"
+ external_id = "some-external-id"
+ url = f"https://hal-test.archives-ouvertes.fr/{external_id}"
deposit_id = 777
loader = DepositLoader(url, deposit_id)
actual_load_status = loader.load()
expected_snapshot_id = "3e68440fdd7c81d283f8f3aebb6f0c8657864192"
assert actual_load_status == {
"status": "eventful",
"snapshot_id": expected_snapshot_id,
}
revision_id = "564d18943d71be80d0d73b43a77cfb205bcde96c"
expected_branches = {"HEAD": {"target": revision_id, "target_type": "revision"}}
expected_snapshot = {
"id": expected_snapshot_id,
"branches": expected_branches,
}
check_snapshot(expected_snapshot, storage=loader.storage)
origin_visit = loader.storage.origin_visit_get_latest(url)
# The visit completed in full (no hash collisions were detected)
assert origin_visit["status"] == "full"
assert origin_visit["type"] == "deposit"
raw_meta = loader.client.metadata_get(deposit_id)
# Ensure the date fields are set appropriately in the revision
# Retrieve the revision
revision = next(loader.storage.revision_get([hash_to_bytes(revision_id)]))
assert revision
- for field_date in ["committer_date", "date"]:
- assert revision[field_date] == raw_meta["revision"][field_date]
+ assert revision["committer_date"] == raw_meta["revision"]["committer_date"]
+ assert revision["date"] == raw_meta["revision"]["date"]
+
+ read_api = f"https://deposit.softwareheritage.org/1/private/{deposit_id}/meta/"
+
+ assert revision["metadata"] == {
+ "extrinsic": {
+ "provider": read_api,
+ "raw": {
+ "branch_name": "master",
+ "origin": {"type": "deposit", "url": url,},
+ "origin_metadata": {
+ "metadata": {
+ "@xmlns": ["http://www.w3.org/2005/Atom"],
+ "author": ["some awesome author", "another one", "no one",],
+ "codemeta:dateCreated": "2017-10-07T15:17:08Z",
+ "codemeta:datePublished": "2017-10-08T15:00:00Z",
+ "external_identifier": "some-external-id",
+ "url": url,
+ },
+ "provider": {
+ "metadata": None,
+ "provider_name": "hal",
+ "provider_type": "deposit_client",
+ "provider_url": "https://hal-test.archives-ouvertes.fr/",
+ },
+ "tool": {
+ "configuration": {"sword_version": "2"},
+ "name": "swh-deposit",
+ "version": "0.0.1",
+ },
+ },
+ },
+ "when": revision["metadata"]["extrinsic"]["when"], # dynamic
+ },
+ "original_artifact": [
+ {
+ "checksums": {
+ "sha1": "f8c63d7c890a7453498e6cf9fef215d85ec6801d",
+ "sha256": "474bf646aeeff6d945eb752b1a9f8a40f3d81a88909ee7bd2d08cc822aa361e6", # noqa
+ },
+ "filename": "archive.zip",
+ "length": 956830,
+ }
+ ],
+ }
+
+ # Check the metadata swh side
+ origin_meta = list(
+ loader.storage.origin_metadata_get_by(url, provider_type="deposit_client")
+ )
+
+ assert len(origin_meta) == 1
+
+ origin_meta = origin_meta[0]
+ # dynamic, a pain to display and not that interesting
+ origin_meta.pop("discovery_date")
+
+ assert origin_meta == {
+ "metadata": {
+ "@xmlns": ["http://www.w3.org/2005/Atom"],
+ "author": ["some awesome author", "another one", "no one"],
+ "codemeta:dateCreated": "2017-10-07T15:17:08Z",
+ "codemeta:datePublished": "2017-10-08T15:00:00Z",
+ "external_identifier": "some-external-id",
+ "url": "https://hal-test.archives-ouvertes.fr/some-external-id",
+ },
+ "origin_url": "https://hal-test.archives-ouvertes.fr/some-external-id",
+ "provider_id": 1,
+ "provider_name": "hal",
+ "provider_type": "deposit_client",
+ "provider_url": "https://hal-test.archives-ouvertes.fr/",
+ "tool_id": 1,
+ }
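A note on the requests_mock_datadir override at the top of this test module: it is needed because the loader issues PUT requests to the private deposit api (ApiClient.status_update), which the default GET-only datadir mock does not answer. Below is a standalone sketch of the same idea using requests-mock directly; the URL is illustrative.

    import re
    import requests
    import requests_mock

    with requests_mock.Mocker() as m:
        # Answer any https PUT with an empty 200 response, mirroring what
        # the overridden requests_mock_datadir fixture does for the
        # loader's status_update calls.
        m.put(re.compile("https"))
        r = requests.put(
            "https://deposit.softwareheritage.org/1/private/666/update/",
            json={"status": "done"},
        )
        assert r.status_code == 200
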