Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/package/archive/loader.py
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import datetime
import logging
from os import path
from typing import Any, Dict, Iterator, Mapping, Optional, Sequence, Tuple, Union

import attr
import iso8601

from swh.loader.package.loader import PackageLoader, BasePackageInfo
from swh.loader.package.utils import release_name, artifact_identity
from swh.model.model import (
    Sha1Git,
    Person,
    TimestampWithTimezone,
    Revision,
    RevisionType,
)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Identity used as both author and committer of the synthetic revisions
# built by this loader.
SWH_PERSON = Person(
    name=b"Software Heritage",
    fullname=b"Software Heritage",
    email=b"robot@softwareheritage.org",
)

# Message attached to every synthetic revision built by this loader.
REVISION_MESSAGE = b"swh-loader-package: synthetic revision message"
@attr.s
class ArchivePackageInfo(BasePackageInfo):
    """Information about one archive artifact, parsed from its metadata dict.

    Instances are normally built with :meth:`from_metadata`; the original
    dict is kept in ``raw``.
    """

    # Unparsed metadata dict this object was built from.
    raw = attr.ib(type=Dict[str, Any])
    length = attr.ib(type=int)
    """Size of the archive file"""
    time = attr.ib(type=Union[str, datetime.datetime])
    """Timestamp of the archive file on the server"""
    version = attr.ib(type=str)

    # default keys for gnu
    ID_KEYS = ["time", "url", "length", "version"]

    def artifact_identity(self, id_keys=None):
        """Compute the "identity" of this artifact.

        Args:
            id_keys: names of the ``raw`` metadata keys forming the identity.
                When ``None`` or empty, ``ID_KEYS`` is used; an empty key
                list would otherwise yield ``[]``, which compares equal to
                the identity of every other artifact.

        Returns:
            The list of ``raw`` values for each key, in ``id_keys`` order;
            a key missing from ``raw`` contributes ``None``.
        """
        if not id_keys:
            id_keys = self.ID_KEYS
        # TODO: use parsed attributes instead of self.raw
        return [self.raw.get(k) for k in id_keys]

    @classmethod
    def from_metadata(cls, a_metadata: Dict[str, Any]) -> "ArchivePackageInfo":
        """Build an instance from an artifact metadata dict.

        Args:
            a_metadata: dict with the keys ``url``, ``length``, ``time`` and
                ``version``, and optionally ``filename``.

        Returns:
            The parsed package info, with ``a_metadata`` kept as ``raw``.

        Raises:
            KeyError: if one of the required keys is missing.
        """
        url = a_metadata["url"]
        filename = a_metadata.get("filename")
        return cls(
            url=url,
            # Without an explicit (non-empty) filename, fall back to the
            # last path component of the url.
            filename=filename if filename else path.split(url)[-1],
            raw=a_metadata,
            length=a_metadata["length"],
            time=a_metadata["time"],
            version=a_metadata["version"],
        )
class ArchiveLoader(PackageLoader[ArchivePackageInfo]):
    """Load archive origin's artifact files into swh archive

    """

    visit_type = "tar"

    def __init__(
        self,
        url: str,
        artifacts: Sequence[Dict[str, Any]],
        identity_artifact_keys: Optional[Sequence[str]] = None,
    ):
        """Loader constructor.

        For now, this is the lister's task output.

        Args:
            url: Origin url
            artifacts: List of artifact metadata dicts. Each dict is parsed
                by ``ArchivePackageInfo.from_metadata``, which reads:

                - **url**: artifact's url
                - **filename**: (optional) artifact's filename; defaults to
                  the last path component of the url
                - **time**: artifact's time, either a datetime or an
                  ISO 8601 string
                - **version**: artifact's version
                - **length**: artifact's length

            identity_artifact_keys: Optional List of keys forming the
                "identity" of an artifact. When ``None`` or empty, the
                default keys of ``ArchivePackageInfo`` are used.

        """
        super().__init__(url=url)
        self.artifacts = artifacts  # assume order is enforced in the lister
        # Normalize an empty key list to None so the default identity keys
        # apply, instead of an empty identity that matches every artifact.
        self.identity_artifact_keys = identity_artifact_keys or None

    def get_versions(self) -> Sequence[str]:
        """Return the versions of the artifacts, in artifact order.

        Artifacts without a (truthy) ``version`` entry are skipped.
        """
        return [
            archive["version"]
            for archive in self.artifacts
            if archive.get("version")
        ]

    def get_default_version(self) -> str:
        """Return the version of the last artifact.

        Raises:
            IndexError: if there is no artifact.
            KeyError: if the last artifact has no ``version`` entry.
        """
        # It's the most recent, so for this loader, it's the last one
        return self.artifacts[-1]["version"]

    def get_package_info(
        self, version: str
    ) -> Iterator[Tuple[str, ArchivePackageInfo]]:
        """Yield ``(release name, package info)`` for each artifact whose
        version is ``version``.
        """
        for a_metadata in self.artifacts:
            # Compare versions before parsing, so that an unrelated artifact
            # with incomplete metadata cannot make this lookup fail.
            if a_metadata.get("version") != version:
                continue
            p_info = ArchivePackageInfo.from_metadata(a_metadata)
            # FIXME: this code assumes we have only 1 artifact per
            # versioned package
            yield release_name(version), p_info

    def resolve_revision_from(
        self, known_artifacts: Dict, p_info: ArchivePackageInfo
    ) -> Optional[bytes]:
        """Look for an already known revision matching ``p_info``.

        Args:
            known_artifacts: mapping from revision id to revision metadata;
                each value must hold the artifact's raw metadata under
                ``["extrinsic"]["raw"]``.
            p_info: the artifact to look for.

        Returns:
            The id of the first known revision whose artifact identity
            equals the identity of ``p_info``, or ``None`` if there is none.
        """
        identity = p_info.artifact_identity(id_keys=self.identity_artifact_keys)
        for rev_id, known_artifact in known_artifacts.items():
            logger.debug("known_artifact: %s", known_artifact)
            reference_artifact = known_artifact["extrinsic"]["raw"]
            reference_artifact_info = ArchivePackageInfo.from_metadata(
                reference_artifact
            )
            known_identity = reference_artifact_info.artifact_identity(
                id_keys=self.identity_artifact_keys
            )
            if identity == known_identity:
                return rev_id
        return None

    def build_revision(
        self, p_info: ArchivePackageInfo, uncompressed_path: str, directory: Sha1Git
    ) -> Optional[Revision]:
        """Build the synthetic revision for one loaded artifact.

        Args:
            p_info: the artifact the revision is built for.
            uncompressed_path: not used by this loader.
            directory: id of the directory the revision points to.

        Returns:
            A ``tar`` revision dated with the artifact's time, authored and
            committed by ``SWH_PERSON``, whose extrinsic metadata records
            the artifact's raw metadata.
        """
        time = p_info.time
        if isinstance(time, str):
            # A string is assumed to be a parsable ISO 8601 date.
            parsed_time = iso8601.parse_date(time)
        else:
            # Otherwise it is assumed to already be a datetime.
            parsed_time = time
        normalized_time = TimestampWithTimezone.from_datetime(parsed_time)
        return Revision(
            type=RevisionType.TAR,
            message=REVISION_MESSAGE,
            date=normalized_time,
            author=SWH_PERSON,
            committer=SWH_PERSON,
            committer_date=normalized_time,
            parents=(),
            directory=directory,
            synthetic=True,
            metadata={
                "intrinsic": {},
                "extrinsic": {
                    "provider": self.url,
                    "when": self.visit_date.isoformat(),
                    "raw": p_info.raw,
                },
            },
        )