Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/metadata.py
# Copyright (C) 2017-2022 The Software Heritage developers | # Copyright (C) 2017-2022 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from copy import deepcopy | from copy import deepcopy | ||||
import hashlib | |||||
import itertools | import itertools | ||||
import logging | import logging | ||||
import time | import time | ||||
from typing import ( | from typing import ( | ||||
Any, | Any, | ||||
Callable, | Callable, | ||||
Dict, | Dict, | ||||
Iterable, | Iterable, | ||||
▲ Show 20 Lines • Show All 89 Lines • ▼ Show 20 Lines | def index( | ||||
id: Sha1Git, | id: Sha1Git, | ||||
data: Optional[RawExtrinsicMetadata], | data: Optional[RawExtrinsicMetadata], | ||||
**kwargs, | **kwargs, | ||||
) -> List[OriginExtrinsicMetadataRow]: | ) -> List[OriginExtrinsicMetadataRow]: | ||||
if data is None: | if data is None: | ||||
raise NotImplementedError( | raise NotImplementedError( | ||||
"ExtrinsicMetadataIndexer.index() without RawExtrinsicMetadata data" | "ExtrinsicMetadataIndexer.index() without RawExtrinsicMetadata data" | ||||
) | ) | ||||
if data.target.object_type != ExtendedObjectType.ORIGIN: | if data.target.object_type == ExtendedObjectType.ORIGIN: | ||||
origin_sha1 = data.target.object_id | |||||
elif data.origin is not None: | |||||
# HACK: As swh-search does not (yet?) support searching on directories | |||||
# and traversing back to origins, we index metadata on non-origins with | |||||
# an origin context as if they were on the origin itself. | |||||
origin_sha1 = hashlib.sha1(data.origin.encode()).digest() | |||||
else: | |||||
# other types are not supported yet | # other types are not supported yet | ||||
return [] | return [] | ||||
if data.authority.type == MetadataAuthorityType.REGISTRY: | if data.authority.type == MetadataAuthorityType.REGISTRY: | ||||
# metadata provided by a third-party; don't trust it | # metadata provided by a third-party; don't trust it | ||||
# (technically this could be handled below, but we check it here | # (technically this could be handled below, but we check it here | ||||
# to return early; sparing a translation and origin lookup) | # to return early; sparing a translation and origin lookup) | ||||
# TODO: add ways to define trusted authorities | # TODO: add ways to define trusted authorities | ||||
Show All 10 Lines | ) -> List[OriginExtrinsicMetadataRow]: | ||||
mappings.append(mapping.name) | mappings.append(mapping.name) | ||||
if not metadata_items: | if not metadata_items: | ||||
# Don't have any mapping to parse it, ignore | # Don't have any mapping to parse it, ignore | ||||
return [] | return [] | ||||
# TODO: batch requests to origin_get_by_sha1() | # TODO: batch requests to origin_get_by_sha1() | ||||
for _ in range(6): | for _ in range(6): | ||||
origins = self.storage.origin_get_by_sha1([data.target.object_id]) | origins = self.storage.origin_get_by_sha1([origin_sha1]) | ||||
try: | try: | ||||
(origin,) = origins | (origin,) = origins | ||||
if origin is not None: | if origin is not None: | ||||
break | break | ||||
except ValueError: | except ValueError: | ||||
pass | pass | ||||
# The origin does not exist. This may be due to some replication lag | # The origin does not exist. This may be due to some replication lag | ||||
# between the loader's DB/journal and the DB we are consuming from. | # between the loader's DB/journal and the DB we are consuming from. | ||||
▲ Show 20 Lines • Show All 412 Lines • Show Last 20 Lines |