Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/package/loader.py
# Copyright (C) 2019-2020 The Software Heritage developers | # Copyright (C) 2019-2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import datetime | import datetime | ||||
import logging | import logging | ||||
import tempfile | import tempfile | ||||
import os | import os | ||||
import sys | |||||
from typing import ( | from typing import ( | ||||
Any, | Any, | ||||
Dict, | Dict, | ||||
Iterator, | Iterator, | ||||
Generic, | Generic, | ||||
List, | List, | ||||
Mapping, | Mapping, | ||||
Optional, | Optional, | ||||
Show All 14 Lines | from swh.model.model import ( | ||||
BaseModel, | BaseModel, | ||||
Sha1Git, | Sha1Git, | ||||
Revision, | Revision, | ||||
TargetType, | TargetType, | ||||
Snapshot, | Snapshot, | ||||
Origin, | Origin, | ||||
OriginVisit, | OriginVisit, | ||||
OriginVisitStatus, | OriginVisitStatus, | ||||
MetadataAuthority, | |||||
MetadataFetcher, | |||||
MetadataTargetType, | |||||
RawExtrinsicMetadata, | |||||
) | ) | ||||
from swh.model.identifiers import SWHID | |||||
from swh.storage import get_storage | from swh.storage import get_storage | ||||
from swh.storage.utils import now | from swh.storage.utils import now | ||||
from swh.storage.algos.snapshot import snapshot_get_latest | from swh.storage.algos.snapshot import snapshot_get_latest | ||||
from swh.loader.package.utils import download | from swh.loader.package.utils import download | ||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
Show All 11 Lines | class BasePackageInfo: | ||||
Returns: | Returns: | ||||
The identity for that dict entry | The identity for that dict entry | ||||
""" | """ | ||||
url = attr.ib(type=str) | url = attr.ib(type=str) | ||||
filename = attr.ib(type=Optional[str]) | filename = attr.ib(type=Optional[str]) | ||||
# The following attribute has kw_only=True in order to allow subclasses | |||||
# to add attributes. Without kw_only, attributes without default values cannot | |||||
# go after attributes with default values. | |||||
# See <https://github.com/python-attrs/attrs/issues/38> | |||||
# Built with a factory so that every instance owns its own list: a literal
# `default=[]` is a single list object that attrs would hand to all instances,
# so appending metadata to one package info would leak into the others.
revision_extrinsic_metadata = attr.ib(
    type=List[Tuple[datetime.datetime, str, bytes]], factory=list, kw_only=True,
)
"""Tuple elements are respectively the 'discovery_date', 'format', | |||||
and 'metadata' fields of RawExtrinsicMetadata""" | |||||
# TODO: add support for metadata for origins, directories, and contents | |||||
@property
def ID_KEYS(self):
    """Names of the attributes that identify an artifact.

    This base implementation always raises; the error message names the
    concrete class on which the property was accessed.

    Raises:
        NotImplementedError: always.
    """
    owner_name = self.__class__.__name__
    raise NotImplementedError(f"{owner_name} is missing ID_KEYS")
def artifact_identity(self):
    """Return the values of the attributes listed in ``ID_KEYS``.

    Returns:
        A list holding, in ``ID_KEYS`` order, the value of each named attribute.
    """
    identity = []
    for key in self.ID_KEYS:
        identity.append(getattr(self, key))
    return identity
TPackageInfo = TypeVar("TPackageInfo", bound=BasePackageInfo) | TPackageInfo = TypeVar("TPackageInfo", bound=BasePackageInfo) | ||||
class PackageLoader(Generic[TPackageInfo]): | class PackageLoader(Generic[TPackageInfo]): | ||||
# Origin visit type (str) set by the loader | # Origin visit type (str) set by the loader | ||||
ardumont: erf, i tried to remove it once...
Can't we do without? | |||||
Done Inline ActionsYes, thanks to the comment below vlorentz: Yes, thanks to the comment below | |||||
visit_type = "" | visit_type = "" | ||||
DEFAULT_CONFIG = { | |||||
"create_authorities": ("bool", True), | |||||
Done Inline ActionsShouldn't this be false by default, and opening it once it's ok instead? ardumont: Shouldn't this be false by default, and opening it once it's ok instead? | |||||
Done Inline Actionswell, maybe it's fine, i don't know. Why do you think this is needed? What package loaders would not want those created? ardumont: well, maybe it's fine, i don't know.
Why do you think this is needed? What package loaders… | |||||
Done Inline ActionsThe current behavior is equivalent to True. But you're right, it doesn't need to be configurable. I'll update the diff to remove it. vlorentz: The current behavior is equivalent to True. But you're right, it doesn't need to be… | |||||
"create_fetchers": ("bool", True), | |||||
} | |||||
def __init__(self, url): | def __init__(self, url): | ||||
"""Loader's constructor. This raises exception if the minimal required | """Loader's constructor. This raises exception if the minimal required | ||||
configuration is missing (cf. fn:`check` method). | configuration is missing (cf. fn:`check` method). | ||||
Args: | Args: | ||||
url (str): Origin url to load data from | url (str): Origin url to load data from | ||||
""" | """ | ||||
▲ Show 20 Lines • Show All 270 Lines • ▼ Show 20 Lines | def load(self) -> Dict: | ||||
tmp_revisions[version] = [] | tmp_revisions[version] = [] | ||||
# `p_` stands for `package_` | # `p_` stands for `package_` | ||||
for branch_name, p_info in self.get_package_info(version): | for branch_name, p_info in self.get_package_info(version): | ||||
logger.debug("package_info: %s", p_info) | logger.debug("package_info: %s", p_info) | ||||
revision_id = self.resolve_revision_from(known_artifacts, p_info) | revision_id = self.resolve_revision_from(known_artifacts, p_info) | ||||
if revision_id is None: | if revision_id is None: | ||||
try: | try: | ||||
revision_id = self._load_revision(p_info, origin) | revision_id = self._load_revision(p_info, origin) | ||||
if revision_id: | |||||
self._load_extrinsic_revision_metadata(p_info, revision_id) | |||||
self.storage.flush() | self.storage.flush() | ||||
status_load = "eventful" | status_load = "eventful" | ||||
except Exception as e: | except Exception as e: | ||||
self.storage.clear_buffers() | self.storage.clear_buffers() | ||||
load_exceptions.append(e) | load_exceptions.append(e) | ||||
sentry_sdk.capture_exception(e) | sentry_sdk.capture_exception(e) | ||||
logger.exception( | logger.exception( | ||||
"Failed loading branch %s for %s", branch_name, self.url | "Failed loading branch %s for %s", branch_name, self.url | ||||
▲ Show 20 Lines • Show All 134 Lines • ▼ Show 20 Lines | ) -> Optional[Snapshot]: | ||||
snapshot_data = {"branches": branches} | snapshot_data = {"branches": branches} | ||||
logger.debug("snapshot: %s", snapshot_data) | logger.debug("snapshot: %s", snapshot_data) | ||||
snapshot = Snapshot.from_dict(snapshot_data) | snapshot = Snapshot.from_dict(snapshot_data) | ||||
logger.debug("snapshot: %s", snapshot) | logger.debug("snapshot: %s", snapshot) | ||||
self.storage.snapshot_add([snapshot]) | self.storage.snapshot_add([snapshot]) | ||||
return snapshot | return snapshot | ||||
def get_loader_name(self) -> str:
    """Return the loader's fully qualified name, as ``<module>.<class name>``."""
    loader_cls = self.__class__
    return "{}.{}".format(loader_cls.__module__, loader_cls.__name__)
def get_loader_version(self) -> str:
    """Returns the version of the current loader.

    The version is the ``__version__`` attribute of the module defining this
    loader's class or, failing that, of the closest parent package that has
    one.

    Raises:
        NotImplementedError: if neither the module nor any of its parent
            packages exposes a ``__version__`` attribute.
    """
    module_name = self.__class__.__module__ or ""
    module_name_parts = module_name.split(".")

    # Iterate rootward through the package hierarchy until we find a parent of this
    # loader's module with a __version__ attribute.
    for prefix_size in range(len(module_name_parts), 0, -1):
        package_name = ".".join(module_name_parts[:prefix_size])
        # A prefix may be missing from sys.modules (empty __module__, class
        # built dynamically, ...): skip it rather than let a KeyError escape,
        # so the walk continues and the documented error is raised if needed.
        module = sys.modules.get(package_name)
        if module is not None and hasattr(module, "__version__"):
            return module.__version__  # type: ignore

    # If this loader's class has no parent package with a __version__,
    # it should implement it itself.
    raise NotImplementedError(
        f"Could not dynamically find the version of {self.get_loader_name()}."
    )
def get_metadata_fetcher(self) -> MetadataFetcher:
    """Returns a MetadataFetcher instance representing this package loader,
    which is used for adding provenance information to extracted
    extrinsic metadata, if any.

    The fetcher is named after :meth:`get_loader_name`, versioned with
    :meth:`get_loader_version`, and carries an empty ``metadata`` dict."""
    fetcher_name = self.get_loader_name()
    fetcher_version = self.get_loader_version()
    return MetadataFetcher(name=fetcher_name, version=fetcher_version, metadata={})
def get_metadata_authority(self) -> MetadataAuthority:
    """For package loaders that get extrinsic metadata, returns the authority
    the metadata are coming from.

    This base implementation always raises.

    Raises:
        NotImplementedError: always.
    """
    method_name = "get_metadata_authority"
    raise NotImplementedError(method_name)
def build_extrinsic_revision_metadata(
    self, p_info: TPackageInfo, revision_id: Sha1Git
) -> List[RawExtrinsicMetadata]:
    """Build the RawExtrinsicMetadata objects describing a revision.

    Args:
        p_info: package information; its ``revision_extrinsic_metadata``
            entries are ``(discovery_date, format, metadata)`` tuples
        revision_id: identifier of the revision the metadata are attached to

    Returns:
        One RawExtrinsicMetadata per entry, all sharing this loader's metadata
        authority and fetcher and using ``self.url`` as origin; an empty list
        when ``p_info`` carries no entry.
    """
    if not p_info.revision_extrinsic_metadata:
        # If this package loader doesn't write metadata, no need to require
        # an implementation for get_metadata_authority.
        return []

    authority = self.get_metadata_authority()
    fetcher = self.get_metadata_fetcher()

    return [
        RawExtrinsicMetadata(
            type=MetadataTargetType.REVISION,
            id=SWHID(object_type="revision", object_id=revision_id),
            discovery_date=discovered_at,
            authority=authority,
            fetcher=fetcher,
            format=metadata_format,
            metadata=payload,
            origin=self.url,
        )
        for (
            discovered_at,
            metadata_format,
            payload,
        ) in p_info.revision_extrinsic_metadata
    ]
def _load_extrinsic_revision_metadata(
    self, p_info: TPackageInfo, revision_id: Sha1Git
) -> None:
    """Send the extrinsic metadata of a revision to the storage.

    The metadata objects come from :meth:`build_extrinsic_revision_metadata`.
    Their authorities (deduplicated on ``(type, url)``) and fetchers
    (deduplicated on ``(name, version)``) are added first, then the metadata
    objects themselves.

    Args:
        p_info: package information carrying the raw extrinsic metadata
        revision_id: identifier of the revision the metadata are attached to
    """
    metadata_objects = self.build_extrinsic_revision_metadata(p_info, revision_id)
    if not metadata_objects:
        # Nothing to write: without metadata objects there are no authorities
        # or fetchers to declare either.
        return

    unique_authorities = {}
    for entry in metadata_objects:
        authority = entry.authority
        unique_authorities[(authority.type, authority.url)] = authority
    self.storage.metadata_authority_add(unique_authorities.values())

    unique_fetchers = {}
    for entry in metadata_objects:
        fetcher = entry.fetcher
        unique_fetchers[(fetcher.name, fetcher.version)] = fetcher
    self.storage.metadata_fetcher_add(unique_fetchers.values())

    self.storage.object_metadata_add(metadata_objects)
erf, i tried to remove it once...
Can't we do without?