Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/package/loader.py
# Copyright (C) 2019-2020 The Software Heritage developers | # Copyright (C) 2019-2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import datetime | import datetime | ||||
import logging | import logging | ||||
import tempfile | import tempfile | ||||
import os | import os | ||||
from typing import Any, Dict, Generator, List, Mapping, Optional, Sequence, Tuple | from typing import ( | ||||
Any, | |||||
Dict, | |||||
Iterator, | |||||
Generic, | |||||
List, | |||||
Mapping, | |||||
Optional, | |||||
Sequence, | |||||
Tuple, | |||||
TypeVar, | |||||
) | |||||
import attr | import attr | ||||
import sentry_sdk | import sentry_sdk | ||||
from swh.core.tarball import uncompress | from swh.core.tarball import uncompress | ||||
from swh.core.config import SWHConfig | from swh.core.config import SWHConfig | ||||
from swh.model import from_disk | from swh.model import from_disk | ||||
from swh.model.collections import ImmutableDict | from swh.model.collections import ImmutableDict | ||||
Show All 13 Lines | |||||
from swh.storage.algos.snapshot import snapshot_get_latest | from swh.storage.algos.snapshot import snapshot_get_latest | ||||
from swh.loader.package.utils import download | from swh.loader.package.utils import download | ||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
class PackageLoader: | @attr.s | ||||
class BasePackageInfo: | |||||
url = attr.ib(type=str) | |||||
filename = attr.ib(type=Optional[str]) | |||||
raw = attr.ib(type=Any) | |||||
TPackageInfo = TypeVar("TPackageInfo", bound=BasePackageInfo) | |||||
class PackageLoader(Generic[TPackageInfo]): | |||||
# Origin visit type (str) set by the loader | # Origin visit type (str) set by the loader | ||||
visit_type = "" | visit_type = "" | ||||
def __init__(self, url): | def __init__(self, url): | ||||
"""Loader's constructor. This raises exception if the minimal required | """Loader's constructor. This raises exception if the minimal required | ||||
configuration is missing (cf. fn:`check` method). | configuration is missing (cf. fn:`check` method). | ||||
Args: | Args: | ||||
Show All 22 Lines | def get_versions(self) -> Sequence[str]: | ||||
"""Return the list of all published package versions. | """Return the list of all published package versions. | ||||
Returns: | Returns: | ||||
Sequence of published versions | Sequence of published versions | ||||
""" | """ | ||||
return [] | return [] | ||||
def get_package_info( | def get_package_info(self, version: str) -> Iterator[Tuple[str, TPackageInfo]]: | ||||
ardumont: Iterator... | |||||
self, version: str | |||||
) -> Generator[Tuple[str, Mapping[str, Any]], None, None]: | |||||
"""Given a release version of a package, retrieve the associated | """Given a release version of a package, retrieve the associated | ||||
package information for such version. | package information for such version. | ||||
Args: | Args: | ||||
version: Package version | version: Package version | ||||
Returns: | Returns: | ||||
(branch name, package metadata) | (branch name, package metadata) | ||||
▲ Show 20 Lines • Show All 74 Lines • ▼ Show 20 Lines | ) -> Optional[bytes]: | ||||
Returns: | Returns: | ||||
None or revision identifier | None or revision identifier | ||||
""" | """ | ||||
return None | return None | ||||
def download_package( | def download_package( | ||||
self, p_info: Mapping[str, Any], tmpdir: str | self, p_info: TPackageInfo, tmpdir: str | ||||
) -> List[Tuple[str, Mapping]]: | ) -> List[Tuple[str, Mapping]]: | ||||
"""Download artifacts for a specific package. All downloads happen in | """Download artifacts for a specific package. All downloads happen in | ||||
the tmpdir folder. | the tmpdir folder. | ||||
Default implementation expects the artifacts package info to be | Default implementation expects the artifacts package info to be | ||||
about one artifact per package. | about one artifact per package. | ||||
Note that most implementations have 1 artifact per package. But some | Note that most implementations have 1 artifact per package. But some | ||||
implementations have multiple artifacts per package (debian), some have | implementations have multiple artifacts per package (debian), some have | ||||
none, the package is the artifact (gnu). | none, the package is the artifact (gnu). | ||||
Args: | Args: | ||||
p_info: Information on the package artifacts to | p_info: Information on the package artifacts to | ||||
download (url, filename, etc...) | download (url, filename, etc...) | ||||
tmpdir: Location to retrieve such artifacts | tmpdir: Location to retrieve such artifacts | ||||
Returns: | Returns: | ||||
List of (path, computed hashes) | List of (path, computed hashes) | ||||
""" | """ | ||||
a_uri = p_info["url"] | return [download(p_info.url, dest=tmpdir, filename=p_info.filename)] | ||||
filename = p_info.get("filename") | |||||
return [download(a_uri, dest=tmpdir, filename=filename)] | |||||
def uncompress( | def uncompress( | ||||
self, dl_artifacts: List[Tuple[str, Mapping[str, Any]]], dest: str | self, dl_artifacts: List[Tuple[str, Mapping[str, Any]]], dest: str | ||||
) -> str: | ) -> str: | ||||
"""Uncompress the artifact(s) in the destination folder dest. | """Uncompress the artifact(s) in the destination folder dest. | ||||
Optionally, this could need to use the p_info dict for some more | Optionally, this could need to use the p_info dict for some more | ||||
information (debian). | information (debian). | ||||
▲ Show 20 Lines • Show All 118 Lines • ▼ Show 20 Lines | def load(self) -> Dict: | ||||
load_exceptions: List[Exception] = [] | load_exceptions: List[Exception] = [] | ||||
for version in self.get_versions(): # for each | for version in self.get_versions(): # for each | ||||
logger.debug("version: %s", version) | logger.debug("version: %s", version) | ||||
tmp_revisions[version] = [] | tmp_revisions[version] = [] | ||||
# `p_` stands for `package_` | # `p_` stands for `package_` | ||||
for branch_name, p_info in self.get_package_info(version): | for branch_name, p_info in self.get_package_info(version): | ||||
logger.debug("package_info: %s", p_info) | logger.debug("package_info: %s", p_info) | ||||
revision_id = self.resolve_revision_from(known_artifacts, p_info["raw"]) | revision_id = self.resolve_revision_from(known_artifacts, p_info.raw) | ||||
if revision_id is None: | if revision_id is None: | ||||
try: | try: | ||||
revision_id = self._load_revision(p_info, origin) | revision_id = self._load_revision(p_info, origin) | ||||
self.storage.flush() | self.storage.flush() | ||||
status_load = "eventful" | status_load = "eventful" | ||||
except Exception as e: | except Exception as e: | ||||
self.storage.clear_buffers() | self.storage.clear_buffers() | ||||
load_exceptions.append(e) | load_exceptions.append(e) | ||||
Show All 32 Lines | def load(self) -> Dict: | ||||
except Exception as e: | except Exception as e: | ||||
logger.exception("Failed to build snapshot for origin %s", self.url) | logger.exception("Failed to build snapshot for origin %s", self.url) | ||||
sentry_sdk.capture_exception(e) | sentry_sdk.capture_exception(e) | ||||
status_visit = "partial" | status_visit = "partial" | ||||
status_load = "failed" | status_load = "failed" | ||||
return finalize_visit() | return finalize_visit() | ||||
def _load_revision(self, p_info, origin) -> Optional[Sha1Git]: | def _load_revision(self, p_info: TPackageInfo, origin) -> Optional[Sha1Git]: | ||||
"""Does all the loading of a revision itself: | """Does all the loading of a revision itself: | ||||
* downloads a package and uncompresses it | * downloads a package and uncompresses it | ||||
* loads it from disk | * loads it from disk | ||||
* adds contents, directories, and revision to self.storage | * adds contents, directories, and revision to self.storage | ||||
* returns the revision identifier, or None if no revision was built | * returns the revision identifier, or None if no revision was built | ||||
Raises | Raises | ||||
Show All 20 Lines | def _load_revision(self, p_info: TPackageInfo, origin) -> Optional[Sha1Git]: | ||||
logger.debug("Number of contents: %s", len(contents)) | logger.debug("Number of contents: %s", len(contents)) | ||||
self.storage.content_add(contents) | self.storage.content_add(contents) | ||||
logger.debug("Number of directories: %s", len(directories)) | logger.debug("Number of directories: %s", len(directories)) | ||||
self.storage.directory_add(directories) | self.storage.directory_add(directories) | ||||
# FIXME: This should be release. cf. D409 | # FIXME: This should be release. cf. D409 | ||||
revision = self.build_revision( | revision = self.build_revision( | ||||
p_info["raw"], uncompressed_path, directory=directory.hash | p_info.raw, uncompressed_path, directory=directory.hash | ||||
) | ) | ||||
if not revision: | if not revision: | ||||
# Some artifacts are missing intrinsic metadata | # Some artifacts are missing intrinsic metadata | ||||
# skipping those | # skipping those | ||||
return None | return None | ||||
extra_metadata: Tuple[str, Any] = ( | extra_metadata: Tuple[str, Any] = ( | ||||
"original_artifact", | "original_artifact", | ||||
▲ Show 20 Lines • Show All 59 Lines • Show Last 20 Lines |
Iterator...