Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/package/deposit/loader.py
# Copyright (C) 2019-2020 The Software Heritage developers | # Copyright (C) 2019-2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import datetime | |||||
import json | import json | ||||
import logging | import logging | ||||
import requests | import requests | ||||
from typing import Any, Dict, Iterator, List, Mapping, Optional, Sequence, Tuple, Union | from typing import Any, Dict, Iterator, List, Mapping, Optional, Sequence, Tuple, Union | ||||
import types | import types | ||||
import attr | import attr | ||||
Show All 12 Lines | |||||
) | ) | ||||
from swh.loader.package.loader import PackageLoader, BasePackageInfo | from swh.loader.package.loader import PackageLoader, BasePackageInfo | ||||
from swh.loader.package.utils import download | from swh.loader.package.utils import download | ||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
@attr.s | |||||
class DepositPackageInfo(BasePackageInfo): | class DepositPackageInfo(BasePackageInfo): | ||||
filename = attr.ib(type=str) # instead of Optional[str] | filename = attr.ib(type=str) # instead of Optional[str] | ||||
raw = attr.ib(type=Dict[str, Any]) | raw = attr.ib(type=Dict[str, Any]) | ||||
author_date = attr.ib(type=datetime.datetime) | |||||
"""codemeta:dateCreated if any, deposit completed_date otherwise""" | |||||
commit_date = attr.ib(type=datetime.datetime) | |||||
"""codemeta:datePublished if any, deposit completed_date otherwise""" | |||||
client = attr.ib(type=str) | |||||
id = attr.ib(type=int) | |||||
"""Internal ID of the deposit in the deposit DB""" | |||||
collection = attr.ib(type=str) | |||||
"""The collection in the deposit; see SWORD specification.""" | |||||
author = attr.ib(type=Person) | |||||
committer = attr.ib(type=Person) | |||||
revision_parents = attr.ib(type=Tuple[Sha1Git, ...]) | |||||
"""Revisions created from previous deposits, that will be used as parents of the | |||||
revision created for this deposit.""" | |||||
@classmethod | |||||
def from_metadata( | |||||
cls, metadata: Dict[str, Any], url: str, filename: str | |||||
) -> "DepositPackageInfo": | |||||
# Note: | |||||
# `date` and `committer_date` are always transmitted by the deposit read api | |||||
# which computes itself the values. The loader needs to use those to create the | |||||
# revision. | |||||
metadata = metadata.copy() | |||||
# FIXME: this removes information from 'raw' metadata | |||||
depo = metadata.pop("deposit") | |||||
return cls( | |||||
url=url, | |||||
filename=filename, | |||||
author_date=depo["author_date"], | |||||
commit_date=depo["committer_date"], | |||||
client=depo["client"], | |||||
id=depo["id"], | |||||
collection=depo["collection"], | |||||
author=parse_author(depo["author"]), | |||||
committer=parse_author(depo["committer"]), | |||||
revision_parents=tuple(hash_to_bytes(p) for p in depo["revision_parents"]), | |||||
raw=metadata, | |||||
) | |||||
class DepositLoader(PackageLoader[DepositPackageInfo]): | class DepositLoader(PackageLoader[DepositPackageInfo]): | ||||
"""Load pypi origin's artifact releases into swh archive. | """Load pypi origin's artifact releases into swh archive. | ||||
""" | """ | ||||
visit_type = "deposit" | visit_type = "deposit" | ||||
Show All 15 Lines | class DepositLoader(PackageLoader[DepositPackageInfo]): | ||||
def get_versions(self) -> Sequence[str]: | def get_versions(self) -> Sequence[str]: | ||||
# only 1 branch 'HEAD' with no alias since we only have 1 snapshot | # only 1 branch 'HEAD' with no alias since we only have 1 snapshot | ||||
# branch | # branch | ||||
return ["HEAD"] | return ["HEAD"] | ||||
def get_package_info( | def get_package_info( | ||||
self, version: str | self, version: str | ||||
) -> Iterator[Tuple[str, DepositPackageInfo]]: | ) -> Iterator[Tuple[str, DepositPackageInfo]]: | ||||
p_info = DepositPackageInfo( | p_info = DepositPackageInfo.from_metadata( | ||||
url=self.url, filename="archive.zip", raw=self.metadata, | self.metadata, url=self.url, filename="archive.zip", | ||||
) | ) | ||||
yield "HEAD", p_info | yield "HEAD", p_info | ||||
def download_package( | def download_package( | ||||
self, p_info: DepositPackageInfo, tmpdir: str | self, p_info: DepositPackageInfo, tmpdir: str | ||||
) -> List[Tuple[str, Mapping]]: | ) -> List[Tuple[str, Mapping]]: | ||||
"""Override to allow use of the dedicated deposit client | """Override to allow use of the dedicated deposit client | ||||
""" | """ | ||||
return [self.client.archive_get(self.deposit_id, tmpdir, p_info.filename)] | return [self.client.archive_get(self.deposit_id, tmpdir, p_info.filename)] | ||||
def build_revision( | def build_revision( | ||||
self, a_metadata: Dict, uncompressed_path: str, directory: Sha1Git | self, p_info: DepositPackageInfo, uncompressed_path: str, directory: Sha1Git | ||||
) -> Optional[Revision]: | ) -> Optional[Revision]: | ||||
depo = a_metadata.pop("deposit") | message = ( | ||||
f"{p_info.client}: Deposit {p_info.id} in collection {p_info.collection}" | |||||
# Note: | ).encode("utf-8") | ||||
# `date` and `committer_date` are always transmitted by the deposit read api | |||||
# which computes itself the values. The loader needs to use those to create the | |||||
# revision. | |||||
# date: codemeta:dateCreated if any, deposit completed_date otherwise | |||||
date = TimestampWithTimezone.from_dict(depo["author_date"]) | |||||
# commit_date: codemeta:datePublished if any, deposit completed_date otherwise | |||||
commit_date = TimestampWithTimezone.from_dict(depo["committer_date"]) | |||||
client, id, collection = [depo[k] for k in ["client", "id", "collection"]] | |||||
message = f"{client}: Deposit {id} in collection {collection}".encode("utf-8") | |||||
author = parse_author(depo["author"]) | |||||
committer = parse_author(depo["committer"]) | |||||
return Revision( | return Revision( | ||||
type=RevisionType.TAR, | type=RevisionType.TAR, | ||||
message=message, | message=message, | ||||
author=author, | author=p_info.author, | ||||
date=date, | date=TimestampWithTimezone.from_dict(p_info.author_date), | ||||
committer=committer, | committer=p_info.committer, | ||||
ardumont: Maybe centralize the conversion at the same location.
Here you are converting one part into… | |||||
Done Inline Actions vlorentz: That's what I did initially, but it makes more sense here, as I'm not using model objects in the *PackageInfo structures, and TimestampWithTimezone is only really useful for making the Revision. | |||||
committer_date=commit_date, | committer_date=TimestampWithTimezone.from_dict(p_info.commit_date), | ||||
parents=tuple([hash_to_bytes(p) for p in depo["revision_parents"]]), | parents=p_info.revision_parents, | ||||
directory=directory, | directory=directory, | ||||
synthetic=True, | synthetic=True, | ||||
metadata={ | metadata={ | ||||
"extrinsic": { | "extrinsic": { | ||||
"provider": self.client.metadata_url(self.deposit_id), | "provider": self.client.metadata_url(self.deposit_id), | ||||
"when": self.visit_date.isoformat(), | "when": self.visit_date.isoformat(), | ||||
"raw": a_metadata, # Actually the processed metadata instead of raw | "raw": p_info.raw, | ||||
}, | }, | ||||
}, | }, | ||||
) | ) | ||||
def load(self) -> Dict: | def load(self) -> Dict: | ||||
# First making sure the deposit is known prior to trigger a loading | # First making sure the deposit is known prior to trigger a loading | ||||
try: | try: | ||||
self.metadata = self.client.metadata_get(self.deposit_id) | self.metadata = self.client.metadata_get(self.deposit_id) | ||||
▲ Show 20 Lines • Show All 170 Lines • Show Last 20 Lines |
Maybe centralize the conversion in a single location.
Here you are converting one part in the package info class (parent ids to bytes), and the other in build_revision (timestamp stuff).
Maybe convert everything in the package info class (the same goes for the other loaders).