Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/package/deposit/loader.py
# Copyright (C) 2019-2020 The Software Heritage developers | # Copyright (C) 2019-2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import datetime | import datetime | ||||
from datetime import timezone | |||||
import json | import json | ||||
import logging | import logging | ||||
from typing import Any, Dict, Iterator, List, Mapping, Optional, Sequence, Tuple, Union | from typing import Any, Dict, Iterator, List, Mapping, Optional, Sequence, Tuple, Union | ||||
import attr | import attr | ||||
import requests | import requests | ||||
from swh.loader.package.loader import ( | from swh.loader.package.loader import ( | ||||
Show All 13 Lines | from swh.model.model import ( | ||||
Sha1Git, | Sha1Git, | ||||
TimestampWithTimezone, | TimestampWithTimezone, | ||||
) | ) | ||||
from swh.storage.algos.snapshot import snapshot_get_all_branches | from swh.storage.algos.snapshot import snapshot_get_all_branches | ||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
def now() -> datetime.datetime: | |||||
return datetime.datetime.now(tz=timezone.utc) | |||||
@attr.s | @attr.s | ||||
class DepositPackageInfo(BasePackageInfo): | class DepositPackageInfo(BasePackageInfo): | ||||
filename = attr.ib(type=str) # instead of Optional[str] | filename = attr.ib(type=str) # instead of Optional[str] | ||||
raw_info = attr.ib(type=Dict[str, Any]) | raw_info = attr.ib(type=Dict[str, Any]) | ||||
author_date = attr.ib(type=datetime.datetime) | author_date = attr.ib(type=datetime.datetime) | ||||
"""codemeta:dateCreated if any, deposit completed_date otherwise""" | """codemeta:dateCreated if any, deposit completed_date otherwise""" | ||||
commit_date = attr.ib(type=datetime.datetime) | commit_date = attr.ib(type=datetime.datetime) | ||||
Show All 13 Lines | class DepositPackageInfo(BasePackageInfo): | ||||
def from_metadata( | def from_metadata( | ||||
cls, metadata: Dict[str, Any], url: str, filename: str | cls, metadata: Dict[str, Any], url: str, filename: str | ||||
) -> "DepositPackageInfo": | ) -> "DepositPackageInfo": | ||||
# Note: | # Note: | ||||
# `date` and `committer_date` are always transmitted by the deposit read api | # `date` and `committer_date` are always transmitted by the deposit read api | ||||
# which computes itself the values. The loader needs to use those to create the | # which computes itself the values. The loader needs to use those to create the | ||||
# revision. | # revision. | ||||
raw_metadata_from_origin = json.dumps( | all_metadata_raw: List[str] = metadata["metadata_raw"] | ||||
ardumont: This is the join of all the raw XML files that I mention in the diff description.
It's either that or sending one… | |||||
Done Inline Actions: No longer relevant ^ (this now sends one RawExtrinsicMetadataCore per XML file). ardumont: No longer relevant ^ (this now sends one RawExtrinsicMetadataCore per XML file). | |||||
metadata["origin_metadata"]["metadata"] | raw_info = { | ||||
).encode() | "origin": metadata["origin"], | ||||
metadata = metadata.copy() | "origin_metadata": { | ||||
# FIXME: this removes information from 'raw' metadata | "metadata": json.dumps(metadata["metadata_dict"]), | ||||
depo = metadata.pop("deposit") | "provider": metadata["provider"], | ||||
"tool": metadata["tool"], | |||||
}, | |||||
} | |||||
depo = metadata["deposit"] | |||||
return cls( | return cls( | ||||
url=url, | url=url, | ||||
filename=filename, | filename=filename, | ||||
author_date=depo["author_date"], | author_date=depo["author_date"], | ||||
commit_date=depo["committer_date"], | commit_date=depo["committer_date"], | ||||
client=depo["client"], | client=depo["client"], | ||||
id=depo["id"], | id=depo["id"], | ||||
collection=depo["collection"], | collection=depo["collection"], | ||||
author=parse_author(depo["author"]), | author=parse_author(depo["author"]), | ||||
committer=parse_author(depo["committer"]), | committer=parse_author(depo["committer"]), | ||||
revision_parents=tuple(hash_to_bytes(p) for p in depo["revision_parents"]), | revision_parents=tuple(hash_to_bytes(p) for p in depo["revision_parents"]), | ||||
raw_info=metadata, | raw_info=raw_info, | ||||
revision_extrinsic_metadata=[ | revision_extrinsic_metadata=[ | ||||
RawExtrinsicMetadataCore( | RawExtrinsicMetadataCore( | ||||
format="sword-v2-atom-codemeta-v2-in-json", | discovery_date=now(), | ||||
metadata=raw_metadata_from_origin, | metadata=raw_metadata.encode(), | ||||
), | format="sword-v2-atom-codemeta-v2", | ||||
) | |||||
for raw_metadata in all_metadata_raw | |||||
], | ], | ||||
) | ) | ||||
class DepositLoader(PackageLoader[DepositPackageInfo]): | class DepositLoader(PackageLoader[DepositPackageInfo]): | ||||
"""Load pypi origin's artifact releases into swh archive. | """Load pypi origin's artifact releases into swh archive. | ||||
""" | """ | ||||
Show All 15 Lines | def __init__(self, url: str, deposit_id: str): | ||||
self.client = ApiClient(url=config_deposit["url"], auth=config_deposit["auth"]) | self.client = ApiClient(url=config_deposit["url"], auth=config_deposit["auth"]) | ||||
def get_versions(self) -> Sequence[str]: | def get_versions(self) -> Sequence[str]: | ||||
# only 1 branch 'HEAD' with no alias since we only have 1 snapshot | # only 1 branch 'HEAD' with no alias since we only have 1 snapshot | ||||
# branch | # branch | ||||
return ["HEAD"] | return ["HEAD"] | ||||
def get_metadata_authority(self) -> MetadataAuthority: | def get_metadata_authority(self) -> MetadataAuthority: | ||||
provider = self.metadata()["origin_metadata"]["provider"] | provider = self.metadata()["provider"] | ||||
assert provider["provider_type"] == "deposit_client" | assert provider["provider_type"] == MetadataAuthorityType.DEPOSIT_CLIENT.value | ||||
return MetadataAuthority( | return MetadataAuthority( | ||||
type=MetadataAuthorityType.DEPOSIT_CLIENT, | type=MetadataAuthorityType.DEPOSIT_CLIENT, | ||||
url=provider["provider_url"], | url=provider["provider_url"], | ||||
metadata={ | metadata={ | ||||
"name": provider["provider_name"], | "name": provider["provider_name"], | ||||
**(provider["metadata"] or {}), | **(provider["metadata"] or {}), | ||||
}, | }, | ||||
) | ) | ||||
def get_metadata_fetcher(self) -> MetadataFetcher: | def get_metadata_fetcher(self) -> MetadataFetcher: | ||||
tool = self.metadata()["origin_metadata"]["tool"] | tool = self.metadata()["tool"] | ||||
return MetadataFetcher( | return MetadataFetcher( | ||||
name=tool["name"], version=tool["version"], metadata=tool["configuration"], | name=tool["name"], version=tool["version"], metadata=tool["configuration"], | ||||
) | ) | ||||
def get_package_info( | def get_package_info( | ||||
self, version: str | self, version: str | ||||
) -> Iterator[Tuple[str, DepositPackageInfo]]: | ) -> Iterator[Tuple[str, DepositPackageInfo]]: | ||||
p_info = DepositPackageInfo.from_metadata( | p_info = DepositPackageInfo.from_metadata( | ||||
Show All 31 Lines | ) -> Optional[Revision]: | ||||
"provider": self.client.metadata_url(self.deposit_id), | "provider": self.client.metadata_url(self.deposit_id), | ||||
"when": self.visit_date.isoformat(), | "when": self.visit_date.isoformat(), | ||||
"raw": p_info.raw_info, | "raw": p_info.raw_info, | ||||
}, | }, | ||||
}, | }, | ||||
) | ) | ||||
def get_extrinsic_origin_metadata(self) -> List[RawExtrinsicMetadataCore]: | def get_extrinsic_origin_metadata(self) -> List[RawExtrinsicMetadataCore]: | ||||
origin_metadata = self.metadata()["origin_metadata"] | metadata = self.metadata() | ||||
all_metadata_raw: List[str] = metadata["metadata_raw"] | |||||
origin_metadata = json.dumps( | |||||
{ | |||||
"metadata": all_metadata_raw, | |||||
"provider": metadata["provider"], | |||||
"tool": metadata["tool"], | |||||
} | |||||
Done Inline Actions: Kept the existing JSON format as it was before... ardumont: Kept the existing JSON format as it was before... | |||||
).encode() | |||||
return [ | return [ | ||||
RawExtrinsicMetadataCore( | RawExtrinsicMetadataCore( | ||||
format="sword-v2-atom-codemeta-v2-in-json", | discovery_date=now(), | ||||
metadata=json.dumps(origin_metadata["metadata"]).encode(), | metadata=raw_meta.encode(), | ||||
format="sword-v2-atom-codemeta-v2", | |||||
) | |||||
for raw_meta in all_metadata_raw | |||||
] + [ | |||||
RawExtrinsicMetadataCore( | |||||
discovery_date=now(), | |||||
metadata=origin_metadata, | |||||
format="original-artifacts-json", | |||||
) | ) | ||||
Done Inline Actions: Here, I did not join the XML files into one (as I did for the revision one); shout out if that is not OK ;) ardumont: Here, I did not join the XML files into one (as I did for the revision one); shout out if that is not OK… | |||||
Done Inline Actions: The other part was aligned with this. ardumont: The other part was aligned with this.
| |||||
] | ] | ||||
@cached_method | @cached_method | ||||
def metadata(self): | def metadata(self): | ||||
"""Returns metadata from the deposit server""" | """Returns metadata from the deposit server""" | ||||
return self.client.metadata_get(self.deposit_id) | return self.client.metadata_get(self.deposit_id) | ||||
def load(self) -> Dict: | def load(self) -> Dict: | ||||
▲ Show 20 Lines • Show All 132 Lines • Show Last 20 Lines |
This is the join of all the raw XML files that I mention in the diff description.
It's either that or sending one RawExtrinsicMetadataCore per XML file below (around line 95) [1],
as I did for the origin metadata.
I don't know which is best.
I'm now inclined to think that sending multiple raw metadata objects may be better.
Bottom line: it implies less work when reading.