Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/package/archive/loader.py
# Copyright (C) 2019-2021 The Software Heritage developers | # Copyright (C) 2019-2021 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import datetime | import datetime | ||||
import hashlib | |||||
import logging | import logging | ||||
from os import path | from os import path | ||||
import string | |||||
from typing import Any, Dict, Iterator, Optional, Sequence, Tuple, Union | from typing import Any, Dict, Iterator, Optional, Sequence, Tuple, Union | ||||
import attr | import attr | ||||
import iso8601 | import iso8601 | ||||
from swh.loader.package.loader import BasePackageInfo, PackageLoader | from swh.loader.package.loader import BasePackageInfo, PackageLoader | ||||
from swh.loader.package.utils import release_name | from swh.loader.package.utils import release_name | ||||
from swh.model.model import ( | from swh.model.model import ( | ||||
Show All 18 Lines | |||||
class ArchivePackageInfo(BasePackageInfo): | class ArchivePackageInfo(BasePackageInfo): | ||||
raw_info = attr.ib(type=Dict[str, Any]) | raw_info = attr.ib(type=Dict[str, Any]) | ||||
length = attr.ib(type=int) | length = attr.ib(type=int) | ||||
"""Size of the archive file""" | """Size of the archive file""" | ||||
time = attr.ib(type=Union[str, datetime.datetime]) | time = attr.ib(type=Union[str, datetime.datetime]) | ||||
"""Timestamp of the archive file on the server""" | """Timestamp of the archive file on the server""" | ||||
version = attr.ib(type=str) | version = attr.ib(type=str) | ||||
# default keys for gnu | # default format for gnu | ||||
ID_KEYS = ["time", "url", "length", "version"] | MANIFEST_FORMAT = string.Template("$time $length $version $url") | ||||
def extid(self, manifest_format: Optional[string.Template] = None) -> bytes:
    """Compute the extid of this package info as a SHA-256 digest.

    A textual manifest is rendered from ``self.raw_info`` through a
    ``string.Template``; the encoded manifest is then hashed.

    Args:
        manifest_format: template used instead of the class-level
            ``MANIFEST_FORMAT`` when given.

    Returns:
        The binary SHA-256 digest of the rendered manifest.

    Raises:
        KeyError: if the template references a placeholder that has no
            matching key in ``self.raw_info``.
    """
    template = manifest_format or self.MANIFEST_FORMAT
    # TODO: use parsed attributes instead of self.raw_info
    fields = {key: str(value) for key, value in self.raw_info.items()}
    manifest = template.substitute(fields)
    return hashlib.sha256(manifest.encode()).digest()
@classmethod | @classmethod | ||||
def from_metadata(cls, a_metadata: Dict[str, Any]) -> "ArchivePackageInfo": | def from_metadata(cls, a_metadata: Dict[str, Any]) -> "ArchivePackageInfo": | ||||
url = a_metadata["url"] | url = a_metadata["url"] | ||||
filename = a_metadata.get("filename") | filename = a_metadata.get("filename") | ||||
return cls( | return cls( | ||||
url=url, | url=url, | ||||
filename=filename if filename else path.split(url)[-1], | filename=filename if filename else path.split(url)[-1], | ||||
Show All 11 Lines | class ArchiveLoader(PackageLoader[ArchivePackageInfo]): | ||||
visit_type = "tar" | visit_type = "tar" | ||||
def __init__(
    self,
    storage: StorageInterface,
    url: str,
    artifacts: Sequence[Dict[str, Any]],
    extid_manifest_format: Optional[str] = None,
    max_content_size: Optional[int] = None,
):
    """Loader constructor.

    For now, this is the lister's task output.

    Args:
        storage: forwarded unchanged to the parent constructor
        url: Origin url
        artifacts: List of artifact information with keys:

           - **time**: last modification time as either isoformat date
             string or timestamp
           - **url**: the artifact url to retrieve filename
           - **filename**: optionally, the file's name
           - **version**: artifact's version
           - **length**: artifact's length

        extid_manifest_format: template string used to format a manifest,
            which is hashed to get the extid of a package.
            When ``None``, ``ArchivePackageInfo.MANIFEST_FORMAT``
            (``"$time $length $version $url"``) is used.
        max_content_size: forwarded unchanged to the parent constructor

    """
    # NOTE: the docstring above is deliberately a plain string literal.
    # An f-string is an ordinary expression, not a docstring, so it would
    # leave ``__doc__`` empty and be re-evaluated on every call.
    super().__init__(storage=storage, url=url, max_content_size=max_content_size)
    self.artifacts = artifacts  # assume order is enforced in the lister
    # Keep ``None`` as-is so the package info can fall back on its own
    # default template; otherwise compile the string once, here.
    self.extid_manifest_format = (
        None
        if extid_manifest_format is None
        else string.Template(extid_manifest_format)
    )
def get_versions(self) -> Sequence[str]:
    """List the versions declared by the artifacts, in artifact order.

    Artifacts whose ``version`` entry is missing or falsy are left out.
    """
    declared = (artifact.get("version") for artifact in self.artifacts)
    return [version for version in declared if version]
def get_default_version(self) -> str:
    """Return the version of the last artifact.

    The last artifact is taken to be the most recent one for this loader.
    """
    latest_artifact = self.artifacts[-1]
    return latest_artifact["version"]
def get_package_info(
    self, version: str
) -> Iterator[Tuple[str, ArchivePackageInfo]]:
    """Yield ``(release name, package info)`` pairs for ``version``.

    Every artifact is turned into an ``ArchivePackageInfo``; only those
    whose version equals ``version`` are yielded.
    """
    for artifact in self.artifacts:
        candidate = ArchivePackageInfo.from_metadata(artifact)
        if version != candidate.version:
            continue
        # FIXME: this code assumes we have only 1 artifact per
        # versioned package
        yield release_name(version), candidate
def extid_from_reference_artifact(self, reference_artifact: Dict) -> bytes:
    """Compute the extid of a previously recorded artifact description.

    The raw artifact dict is parsed into an ``ArchivePackageInfo`` and
    hashed with this loader's manifest format.
    """
    return ArchivePackageInfo.from_metadata(reference_artifact).extid(
        manifest_format=self.extid_manifest_format
    )
def resolve_revision_from(
    self, known_artifacts: Dict, p_info: ArchivePackageInfo
) -> Optional[bytes]:
    """Find the revision already recorded for the same artifact, if any.

    Args:
        known_artifacts: mapping from revision id to a dict whose
            ``["extrinsic"]["raw"]`` entry is the reference artifact.
        p_info: package info of the artifact being looked up.

    Returns:
        The id of the first known revision whose reference artifact has
        the same extid as ``p_info``, or ``None`` when none matches.
    """
    wanted = p_info.extid(manifest_format=self.extid_manifest_format)
    for rev_id, known_artifact in known_artifacts.items():
        logging.debug("known_artifact: %s", known_artifact)
        raw_reference = known_artifact["extrinsic"]["raw"]
        if wanted == self.extid_from_reference_artifact(raw_reference):
            return rev_id
    return None
def build_revision( | def build_revision( | ||||
self, p_info: ArchivePackageInfo, uncompressed_path: str, directory: Sha1Git | self, p_info: ArchivePackageInfo, uncompressed_path: str, directory: Sha1Git | ||||
) -> Optional[Revision]: | ) -> Optional[Revision]: | ||||
time = p_info.time # assume it's a timestamp | time = p_info.time # assume it's a timestamp | ||||
if isinstance(time, str): # otherwise, assume it's a parsable date | if isinstance(time, str): # otherwise, assume it's a parsable date | ||||
Show All 23 Lines |