Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/package/archive/loader.py
# Copyright (C) 2019-2021 The Software Heritage developers | # Copyright (C) 2019-2021 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import datetime | import datetime | ||||
import hashlib | import hashlib | ||||
import logging | import logging | ||||
from os import path | from os import path | ||||
import string | import string | ||||
from typing import Any, Dict, Iterator, Optional, Sequence, Tuple, Union | from typing import Any, Dict, Iterator, Optional, Sequence, Tuple, Union | ||||
import attr | import attr | ||||
import iso8601 | import iso8601 | ||||
from swh.loader.package.loader import BasePackageInfo, PackageLoader | from swh.loader.package.loader import BaseManifestPackageInfo, PackageLoader | ||||
from swh.loader.package.utils import release_name | from swh.loader.package.utils import release_name | ||||
from swh.model.model import ( | from swh.model.model import ( | ||||
Person, | Person, | ||||
Revision, | Revision, | ||||
RevisionType, | RevisionType, | ||||
Sha1Git, | Sha1Git, | ||||
TimestampWithTimezone, | TimestampWithTimezone, | ||||
) | ) | ||||
from swh.storage.interface import StorageInterface | from swh.storage.interface import StorageInterface | ||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
SWH_PERSON = Person( | SWH_PERSON = Person( | ||||
name=b"Software Heritage", | name=b"Software Heritage", | ||||
fullname=b"Software Heritage", | fullname=b"Software Heritage", | ||||
email=b"robot@softwareheritage.org", | email=b"robot@softwareheritage.org", | ||||
) | ) | ||||
REVISION_MESSAGE = b"swh-loader-package: synthetic revision message" | REVISION_MESSAGE = b"swh-loader-package: synthetic revision message" | ||||
@attr.s | @attr.s | ||||
class ArchivePackageInfo(BasePackageInfo): | class ArchivePackageInfo(BaseManifestPackageInfo): | ||||
raw_info = attr.ib(type=Dict[str, Any]) | raw_info = attr.ib(type=Dict[str, Any]) | ||||
length = attr.ib(type=int) | length = attr.ib(type=int) | ||||
"""Size of the archive file""" | """Size of the archive file""" | ||||
time = attr.ib(type=Union[str, datetime.datetime]) | time = attr.ib(type=Union[str, datetime.datetime]) | ||||
"""Timestamp of the archive file on the server""" | """Timestamp of the archive file on the server""" | ||||
version = attr.ib(type=str) | version = attr.ib(type=str) | ||||
# default format for gnu | # default format for gnu | ||||
MANIFEST_FORMAT = string.Template("$time $length $version $url") | MANIFEST_FORMAT = string.Template("$time $length $version $url") | ||||
def extid(self, manifest_format: Optional[string.Template] = None) -> bytes:
    """Return a unique intrinsic identifier of this package info.

    The identifier is the SHA-256 digest of a manifest string, obtained by
    substituting the stringified values of ``self.raw_info`` into a
    ``string.Template``.

    Args:
        manifest_format: template overriding the class' default
            ``MANIFEST_FORMAT``; the default is used when this is ``None``.

    Returns:
        The raw (binary) SHA-256 digest of the rendered manifest.

    Raises:
        KeyError: if the template references a placeholder that is not a
            key of ``self.raw_info`` (raised by ``Template.substitute``).
    """
    manifest_format = manifest_format or self.MANIFEST_FORMAT
    # TODO: use parsed attributes instead of self.raw_info
    manifest = manifest_format.substitute(
        {k: str(v) for (k, v) in self.raw_info.items()}
    )
    # Deliberately no print() here: this runs for every identifier that is
    # computed, and library code must not write debug output to stdout.
    return hashlib.sha256(manifest.encode()).digest()
@classmethod | @classmethod | ||||
def from_metadata(cls, a_metadata: Dict[str, Any]) -> "ArchivePackageInfo": | def from_metadata(cls, a_metadata: Dict[str, Any]) -> "ArchivePackageInfo": | ||||
url = a_metadata["url"] | url = a_metadata["url"] | ||||
filename = a_metadata.get("filename") | filename = a_metadata.get("filename") | ||||
return cls( | return cls( | ||||
url=url, | url=url, | ||||
▲ Show 20 Lines • Show All 69 Lines • ▼ Show 20 Lines | class ArchiveLoader(PackageLoader[ArchivePackageInfo]): | ||||
) -> Iterator[Tuple[str, ArchivePackageInfo]]: | ) -> Iterator[Tuple[str, ArchivePackageInfo]]: | ||||
for a_metadata in self.artifacts: | for a_metadata in self.artifacts: | ||||
p_info = ArchivePackageInfo.from_metadata(a_metadata) | p_info = ArchivePackageInfo.from_metadata(a_metadata) | ||||
if version == p_info.version: | if version == p_info.version: | ||||
# FIXME: this code assumes we have only 1 artifact per | # FIXME: this code assumes we have only 1 artifact per | ||||
# versioned package | # versioned package | ||||
yield release_name(version), p_info | yield release_name(version), p_info | ||||
def new_packageinfo_to_extid(self, p_info: ArchivePackageInfo) -> Optional[bytes]:
    """Compute the extid of a package info about to be loaded.

    The identifier is produced by ``p_info.extid`` using this loader's
    ``extid_manifest_format`` as the manifest template.
    """
    template = self.extid_manifest_format
    return p_info.extid(manifest_format=template)

def known_artifact_to_extid(self, known_artifact: Dict) -> Optional[bytes]:
    """Compute the extid of a previously recorded artifact.

    The raw metadata found under ``known_artifact["extrinsic"]["raw"]`` is
    turned back into an ``ArchivePackageInfo``, whose extid is then computed
    with this loader's ``extid_manifest_format``.
    """
    raw_metadata = known_artifact["extrinsic"]["raw"]
    rebuilt_info = ArchivePackageInfo.from_metadata(raw_metadata)
    template = self.extid_manifest_format
    return rebuilt_info.extid(manifest_format=template)
def build_revision( | def build_revision( | ||||
self, p_info: ArchivePackageInfo, uncompressed_path: str, directory: Sha1Git | self, p_info: ArchivePackageInfo, uncompressed_path: str, directory: Sha1Git | ||||
) -> Optional[Revision]: | ) -> Optional[Revision]: | ||||
time = p_info.time # assume it's a timestamp | time = p_info.time # assume it's a timestamp | ||||
if isinstance(time, str): # otherwise, assume it's a parsable date | if isinstance(time, str): # otherwise, assume it's a parsable date | ||||
parsed_time = iso8601.parse_date(time) | parsed_time = iso8601.parse_date(time) | ||||
else: | else: | ||||
Show All 21 Lines |