Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/package/loader.py
Show First 20 Lines • Show All 246 Lines • ▼ Show 20 Lines | ) -> Dict[Sha1Git, Optional[ImmutableDict[str, object]]]: | ||||
known_revisions = self.storage.revision_get(revs) | known_revisions = self.storage.revision_get(revs) | ||||
return { | return { | ||||
revision.id: revision.metadata for revision in known_revisions if revision | revision.id: revision.metadata for revision in known_revisions if revision | ||||
} | } | ||||
def new_packageinfo_to_extid(self, p_info: TPackageInfo) -> Optional[PartialExtID]:
    """Compute the partial ExtID of a not-yet-loaded package.

    Delegates to the package info object itself; loaders that do not
    support ExtID-based deduplication return None from ``p_info.extid()``.
    """
    extid = p_info.extid()
    return extid
def known_artifact_to_extid(self, known_artifact: Dict) -> Optional[PartialExtID]:
    """Compute a unique intrinsic identifier for an already-downloaded
    artifact, so new artifacts can be compared against it.

    Base implementation: no deduplication support — always None.
    Subclasses override this to extract an ExtID from the stored
    artifact metadata.
    """
    return None
def resolve_revision_from_artifacts(
    self, known_artifacts: Dict[Sha1Git, Any], p_info: TPackageInfo,
) -> Optional[Sha1Git]:
    """Look up a previously-loaded revision for this package's artifact.

    If an artifact with the same intrinsic identifier was already
    downloaded, return the id of the existing revision targeting its
    uncompressed directory; otherwise return None.

    Args:
        known_artifacts: dict from revision ids to revision metadata
        p_info: Package information

    Returns:
        None or revision identifier
    """
    # Fast path: nothing loaded before, so there is nothing to match
    # and no point computing the new artifact's extid.
    if not known_artifacts:
        return None

    new_extid = self.new_packageinfo_to_extid(p_info)
    if new_extid is None:
        # This loader does not support deduplication, at least not for
        # this artifact.
        return None

    # First revision (in insertion order) whose stored artifact metadata
    # yields the same extid, or None when no known artifact matches.
    return next(
        (
            rev_id
            for rev_id, known_artifact in known_artifacts.items()
            if self.known_artifact_to_extid(known_artifact) == new_extid
        ),
        None,
    )
def _get_known_extids( | def _get_known_extids( | ||||
self, packages_info: List[TPackageInfo] | self, packages_info: List[TPackageInfo] | ||||
) -> Dict[PartialExtID, List[CoreSWHID]]: | ) -> Dict[PartialExtID, List[CoreSWHID]]: | ||||
"""Compute the ExtIDs from new PackageInfo objects, searches which are already | """Compute the ExtIDs from new PackageInfo objects, searches which are already | ||||
loaded in the archive, and returns them if any.""" | loaded in the archive, and returns them if any.""" | ||||
# Compute the ExtIDs of all the new packages, grouped by extid type | # Compute the ExtIDs of all the new packages, grouped by extid type | ||||
new_extids: Dict[str, List[bytes]] = {} | new_extids: Dict[str, List[bytes]] = {} | ||||
▲ Show 20 Lines • Show All 303 Lines • ▼ Show 20 Lines | def load(self) -> Dict: | ||||
logger.debug("package_info: %s", p_info) | logger.debug("package_info: %s", p_info) | ||||
# Check if the package was already loaded, using its ExtID | # Check if the package was already loaded, using its ExtID | ||||
revision_id = self.resolve_revision_from_extids( | revision_id = self.resolve_revision_from_extids( | ||||
known_extids, p_info, last_snapshot_targets | known_extids, p_info, last_snapshot_targets | ||||
) | ) | ||||
if revision_id is None: | if revision_id is None: | ||||
# No existing revision found from an acceptable ExtID, | |||||
# search in the artifact data instead. | |||||
# TODO: remove this after we finished migrating to ExtIDs. | |||||
revision_id = self.resolve_revision_from_artifacts( | |||||
known_artifacts, p_info | |||||
) | |||||
if revision_id is None: | |||||
# No matching revision found in the last snapshot, load it. | # No matching revision found in the last snapshot, load it. | ||||
try: | try: | ||||
res = self._load_revision(p_info, origin) | res = self._load_revision(p_info, origin) | ||||
if res: | if res: | ||||
(revision_id, directory_id) = res | (revision_id, directory_id) = res | ||||
assert revision_id | assert revision_id | ||||
assert directory_id | assert directory_id | ||||
self._load_extrinsic_directory_metadata( | self._load_extrinsic_directory_metadata( | ||||
▲ Show 20 Lines • Show All 137 Lines • ▼ Show 20 Lines | ) -> Optional[Tuple[Sha1Git, Sha1Git]]: | ||||
p_info, uncompressed_path, directory=directory.hash | p_info, uncompressed_path, directory=directory.hash | ||||
) | ) | ||||
if not revision: | if not revision: | ||||
# Some artifacts are missing intrinsic metadata | # Some artifacts are missing intrinsic metadata | ||||
# skipping those | # skipping those | ||||
return None | return None | ||||
metadata = [metadata for (filepath, metadata) in dl_artifacts] | metadata = [metadata for (filepath, metadata) in dl_artifacts] | ||||
extra_metadata: Tuple[str, Any] = ( | |||||
"original_artifact", | |||||
metadata, | |||||
) | |||||
if revision.metadata is not None: | |||||
full_metadata = list(revision.metadata.items()) + [extra_metadata] | |||||
else: | |||||
full_metadata = [extra_metadata] | |||||
# TODO: don't add these extrinsic metadata to the revision. | |||||
revision = attr.evolve(revision, metadata=ImmutableDict(full_metadata)) | |||||
original_artifact_metadata = RawExtrinsicMetadata( | original_artifact_metadata = RawExtrinsicMetadata( | ||||
target=ExtendedSWHID( | target=ExtendedSWHID( | ||||
object_type=ExtendedObjectType.DIRECTORY, object_id=revision.directory | object_type=ExtendedObjectType.DIRECTORY, object_id=revision.directory | ||||
), | ), | ||||
discovery_date=self.visit_date, | discovery_date=self.visit_date, | ||||
authority=SWH_METADATA_AUTHORITY, | authority=SWH_METADATA_AUTHORITY, | ||||
fetcher=self.get_metadata_fetcher(), | fetcher=self.get_metadata_fetcher(), | ||||
▲ Show 20 Lines • Show All 243 Lines • Show Last 20 Lines |