Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/metadata.py
Show All 26 Lines | |||||
) -> Iterator[str]: | ) -> Iterator[str]: | ||||
"""Calls a function with batches of args, and concatenates the results. | """Calls a function with batches of args, and concatenates the results. | ||||
""" | """ | ||||
groups = grouper(args, batch_size) | groups = grouper(args, batch_size) | ||||
for group in groups: | for group in groups: | ||||
yield from f(list(group)) | yield from f(list(group)) | ||||
class ContentMetadataIndexer(ContentIndexer): | class ContentMetadataIndexer(ContentIndexer[Dict]): | ||||
"""Content-level indexer | """Content-level indexer | ||||
This indexer is in charge of: | This indexer is in charge of: | ||||
- filtering out content already indexed in content_metadata | - filtering out content already indexed in content_metadata | ||||
- reading content from objstorage with the content's id sha1 | - reading content from objstorage with the content's id sha1 | ||||
- computing metadata by given context | - computing metadata by given context | ||||
- using the metadata_dictionary as the 'swh-metadata-translator' tool | - using the metadata_dictionary as the 'swh-metadata-translator' tool | ||||
▲ Show 20 Lines • Show All 62 Lines • ▼ Show 20 Lines | DEFAULT_CONFIG: Dict[str, Any] = { | ||||
"tools": { | "tools": { | ||||
"name": "swh-metadata-detector", | "name": "swh-metadata-detector", | ||||
"version": "0.0.2", | "version": "0.0.2", | ||||
"configuration": {}, | "configuration": {}, | ||||
}, | }, | ||||
} | } | ||||
class RevisionMetadataIndexer(RevisionIndexer): | class RevisionMetadataIndexer(RevisionIndexer[Dict]): | ||||
"""Revision-level indexer | """Revision-level indexer | ||||
This indexer is in charge of: | This indexer is in charge of: | ||||
- filtering revisions already indexed in revision_intrinsic_metadata table | - filtering revisions already indexed in revision_intrinsic_metadata table | ||||
with defined computation tool | with defined computation tool | ||||
- retrieve all entry_files in root directory | - retrieve all entry_files in root directory | ||||
- use metadata_detector for file_names containing metadata | - use metadata_detector for file_names containing metadata | ||||
▲ Show 20 Lines • Show All 140 Lines • ▼ Show 20 Lines | ) -> Tuple[List[Any], List[Any]]: | ||||
try: | try: | ||||
c_metadata_indexer.run( | c_metadata_indexer.run( | ||||
sha1s_filtered, | sha1s_filtered, | ||||
policy_update="ignore-dups", | policy_update="ignore-dups", | ||||
log_suffix=log_suffix, | log_suffix=log_suffix, | ||||
) | ) | ||||
# on the fly possibility: | # on the fly possibility: | ||||
for result in c_metadata_indexer.results: | for result in c_metadata_indexer.results: | ||||
assert isinstance(result, dict) # TODO: remove this | |||||
local_metadata = result["metadata"] | local_metadata = result["metadata"] | ||||
metadata.append(local_metadata) | metadata.append(local_metadata) | ||||
except Exception: | except Exception: | ||||
self.log.exception("Exception while indexing metadata on contents") | self.log.exception("Exception while indexing metadata on contents") | ||||
metadata = merge_documents(metadata) | metadata = merge_documents(metadata) | ||||
return (used_mappings, metadata) | return (used_mappings, metadata) | ||||
class OriginMetadataIndexer(OriginIndexer): | class OriginMetadataIndexer(OriginIndexer[Dict]): | ||||
USE_TOOLS = False | USE_TOOLS = False | ||||
def __init__(self, config=None, **kwargs) -> None: | def __init__(self, config=None, **kwargs) -> None: | ||||
super().__init__(config=config, **kwargs) | super().__init__(config=config, **kwargs) | ||||
self.origin_head_indexer = OriginHeadIndexer(config=config) | self.origin_head_indexer = OriginHeadIndexer(config=config) | ||||
self.revision_metadata_indexer = RevisionMetadataIndexer(config=config) | self.revision_metadata_indexer = RevisionMetadataIndexer(config=config) | ||||
def index_list(self, origin_urls, **kwargs): | def index_list(self, origin_urls, **kwargs): | ||||
▲ Show 20 Lines • Show All 92 Lines • Show Last 20 Lines |