Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/package/loader.py
# Copyright (C) 2019-2020 The Software Heritage developers | # Copyright (C) 2019-2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import datetime | import datetime | ||||
import logging | import logging | ||||
import tempfile | import tempfile | ||||
import os | import os | ||||
import sys | import sys | ||||
from typing import ( | from typing import ( | ||||
Any, | Any, | ||||
Dict, | Dict, | ||||
Iterator, | Iterator, | ||||
Generic, | Generic, | ||||
Iterable, | |||||
List, | List, | ||||
Mapping, | Mapping, | ||||
Optional, | Optional, | ||||
Sequence, | Sequence, | ||||
Tuple, | Tuple, | ||||
TypeVar, | TypeVar, | ||||
) | ) | ||||
▲ Show 20 Lines • Show All 63 Lines • ▼ Show 20 Lines | class BasePackageInfo: | ||||
# to add attributes. Without kw_only, attributes without default values cannot | # to add attributes. Without kw_only, attributes without default values cannot | ||||
# go after attributes with default values. | # go after attributes with default values. | ||||
# See <https://github.com/python-attrs/attrs/issues/38> | # See <https://github.com/python-attrs/attrs/issues/38> | ||||
revision_extrinsic_metadata = attr.ib( | revision_extrinsic_metadata = attr.ib( | ||||
type=List[RawExtrinsicMetadataCore], default=[], kw_only=True, | type=List[RawExtrinsicMetadataCore], default=[], kw_only=True, | ||||
) | ) | ||||
# TODO: add support for metadata for origins, directories, and contents | # TODO: add support for metadata for directories and contents | ||||
@property | @property | ||||
def ID_KEYS(self): | def ID_KEYS(self): | ||||
raise NotImplementedError(f"{self.__class__.__name__} is missing ID_KEYS") | raise NotImplementedError(f"{self.__class__.__name__} is missing ID_KEYS") | ||||
def artifact_identity(self): | def artifact_identity(self): | ||||
return [getattr(self, k) for k in self.ID_KEYS] | return [getattr(self, k) for k in self.ID_KEYS] | ||||
▲ Show 20 Lines • Show All 329 Lines • ▼ Show 20 Lines | def load(self) -> Dict: | ||||
logger.debug("default version: %s", default_version) | logger.debug("default version: %s", default_version) | ||||
# Retrieve extra branches | # Retrieve extra branches | ||||
extra_branches = self.extra_branches() | extra_branches = self.extra_branches() | ||||
logger.debug("extra branches: %s", extra_branches) | logger.debug("extra branches: %s", extra_branches) | ||||
snapshot = self._load_snapshot( | snapshot = self._load_snapshot( | ||||
default_version, tmp_revisions, extra_branches | default_version, tmp_revisions, extra_branches | ||||
) | ) | ||||
self.storage.flush() | |||||
except Exception as e: | except Exception as e: | ||||
logger.exception("Failed to build snapshot for origin %s", self.url) | logger.exception("Failed to build snapshot for origin %s", self.url) | ||||
sentry_sdk.capture_exception(e) | sentry_sdk.capture_exception(e) | ||||
status_visit = "partial" | status_visit = "partial" | ||||
ardumont: why do you need that part? | |||||
vlorentz (done): What part? (also, the git diff is a bit confusing; I'm going to push another commit that should make it clearer as a side effect) | |||||
vlorentz (done): (done) | |||||
ardumont (not done): I saw what seems to be a new try: catch: block around a seemingly unrelated part of your diff ;) | |||||
status_load = "failed" | status_load = "failed" | ||||
try: | |||||
self._load_extrinsic_origin_metadata() | |||||
except Exception as e: | |||||
logger.exception("Failed to extrinsic origin metadata for %s", self.url) | |||||
sentry_sdk.capture_exception(e) | |||||
status_visit = "partial" | |||||
status_load = "failed" | |||||
return finalize_visit() | return finalize_visit() | ||||
def _load_revision(self, p_info: TPackageInfo, origin) -> Optional[Sha1Git]: | def _load_revision(self, p_info: TPackageInfo, origin) -> Optional[Sha1Git]: | ||||
"""Does all the loading of a revision itself: | """Does all the loading of a revision itself: | ||||
* downloads a package and uncompresses it | * downloads a package and uncompresses it | ||||
* loads it from disk | * loads it from disk | ||||
* adds contents, directories, and revision to self.storage | * adds contents, directories, and revision to self.storage | ||||
▲ Show 20 Lines • Show All 129 Lines • ▼ Show 20 Lines | def get_metadata_fetcher(self) -> MetadataFetcher: | ||||
) | ) | ||||
def get_metadata_authority(self) -> MetadataAuthority: | def get_metadata_authority(self) -> MetadataAuthority: | ||||
"""For package loaders that get extrinsic metadata, returns the authority | """For package loaders that get extrinsic metadata, returns the authority | ||||
the metadata are coming from. | the metadata are coming from. | ||||
""" | """ | ||||
raise NotImplementedError("get_metadata_authority") | raise NotImplementedError("get_metadata_authority") | ||||
def get_extrinsic_origin_metadata(self) -> List[RawExtrinsicMetadataCore]: | |||||
"""Returns metadata items, used by build_extrinsic_origin_metadata.""" | |||||
return [] | |||||
def build_extrinsic_origin_metadata(self) -> List[RawExtrinsicMetadata]: | |||||
"""Builds a list of full RawExtrinsicMetadata objects, using | |||||
metadata returned by get_extrinsic_origin_metadata.""" | |||||
metadata_items = self.get_extrinsic_origin_metadata() | |||||
if not metadata_items: | |||||
# If this package loader doesn't write metadata, no need to require | |||||
# an implementation for get_metadata_authority. | |||||
return [] | |||||
authority = self.get_metadata_authority() | |||||
fetcher = self.get_metadata_fetcher() | |||||
metadata_objects = [] | |||||
for item in metadata_items: | |||||
metadata_objects.append( | |||||
RawExtrinsicMetadata( | |||||
type=MetadataTargetType.ORIGIN, | |||||
id=self.url, | |||||
discovery_date=item.discovery_date or self.visit_date, | |||||
authority=authority, | |||||
fetcher=fetcher, | |||||
format=item.format, | |||||
metadata=item.metadata, | |||||
) | |||||
) | |||||
return metadata_objects | |||||
def _load_extrinsic_origin_metadata(self) -> None: | |||||
metadata_objects = self.build_extrinsic_origin_metadata() | |||||
self._load_metadata_objects(metadata_objects) | |||||
ardumont (unsubmitted, not done): Please inline within the try-catch.
It only adds indirection as it's not reused imo
| |||||
def build_extrinsic_revision_metadata( | def build_extrinsic_revision_metadata( | ||||
self, p_info: TPackageInfo, revision_id: Sha1Git | self, p_info: TPackageInfo, revision_id: Sha1Git | ||||
) -> List[RawExtrinsicMetadata]: | ) -> List[RawExtrinsicMetadata]: | ||||
if not p_info.revision_extrinsic_metadata: | if not p_info.revision_extrinsic_metadata: | ||||
# If this package loader doesn't write metadata, no need to require | # If this package loader doesn't write metadata, no need to require | ||||
# an implementation for get_metadata_authority. | # an implementation for get_metadata_authority. | ||||
return [] | return [] | ||||
Show All 17 Lines | ) -> List[RawExtrinsicMetadata]: | ||||
) | ) | ||||
return metadata_objects | return metadata_objects | ||||
def _load_extrinsic_revision_metadata( | def _load_extrinsic_revision_metadata( | ||||
self, p_info: TPackageInfo, revision_id: Sha1Git | self, p_info: TPackageInfo, revision_id: Sha1Git | ||||
) -> None: | ) -> None: | ||||
metadata_objects = self.build_extrinsic_revision_metadata(p_info, revision_id) | metadata_objects = self.build_extrinsic_revision_metadata(p_info, revision_id) | ||||
self._load_metadata_objects(metadata_objects) | |||||
def _load_metadata_objects( | |||||
self, metadata_objects: List[RawExtrinsicMetadata] | |||||
) -> None: | |||||
if not metadata_objects: | |||||
# If this package loader doesn't write metadata, no need to require | |||||
# an implementation for get_metadata_authority. | |||||
return | |||||
authorities = { | self._create_authorities(mo.authority for mo in metadata_objects) | ||||
( | self._create_fetchers(mo.fetcher for mo in metadata_objects) | ||||
ardumont (unsubmitted, not done): same inline those 2 extra "_create_*" methods here.
It's not reused. | |||||
metadata_object.authority.type, | |||||
metadata_object.authority.url, | self.storage.object_metadata_add(metadata_objects) | ||||
): metadata_object.authority | |||||
for metadata_object in metadata_objects | def _create_authorities(self, authorities: Iterable[MetadataAuthority]) -> None: | ||||
deduplicated_authorities = { | |||||
(authority.type, authority.url): authority for authority in authorities | |||||
} | } | ||||
if authorities: | if authorities: | ||||
self.storage.metadata_authority_add(authorities.values()) | self.storage.metadata_authority_add(deduplicated_authorities.values()) | ||||
fetchers = { | def _create_fetchers(self, fetchers: Iterable[MetadataFetcher]) -> None: | ||||
( | deduplicated_fetchers = { | ||||
metadata_object.fetcher.name, | (fetcher.name, fetcher.version): fetcher for fetcher in fetchers | ||||
metadata_object.fetcher.version, | |||||
): metadata_object.fetcher | |||||
for metadata_object in metadata_objects | |||||
} | } | ||||
if fetchers: | if fetchers: | ||||
self.storage.metadata_fetcher_add(fetchers.values()) | self.storage.metadata_fetcher_add(deduplicated_fetchers.values()) | ||||
if metadata_objects: | |||||
self.storage.object_metadata_add(metadata_objects) |
why do you need that part?