Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/metadata.py
# Copyright (C) 2017-2018 The Software Heritage developers | # Copyright (C) 2017-2018 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import click | import click | ||||
import itertools | import itertools | ||||
import logging | import logging | ||||
from copy import deepcopy | |||||
from swh.indexer.indexer import ContentIndexer, RevisionIndexer, OriginIndexer | from swh.indexer.indexer import ContentIndexer, RevisionIndexer, OriginIndexer | ||||
from swh.indexer.metadata_dictionary import MAPPINGS | from swh.indexer.metadata_dictionary import MAPPINGS | ||||
from swh.indexer.metadata_detector import detect_metadata | from swh.indexer.metadata_detector import detect_metadata | ||||
from swh.indexer.metadata_detector import extract_minimal_metadata_dict | from swh.indexer.metadata_detector import extract_minimal_metadata_dict | ||||
from swh.indexer.storage import INDEXER_CFG_KEY | from swh.indexer.storage import INDEXER_CFG_KEY | ||||
from swh.model import hashutil | from swh.model import hashutil | ||||
Show All 10 Lines | class ContentMetadataIndexer(ContentIndexer): | ||||
- using the metadata_dictionary as the 'swh-metadata-translator' tool | - using the metadata_dictionary as the 'swh-metadata-translator' tool | ||||
- store result in content_metadata table | - store result in content_metadata table | ||||
""" | """ | ||||
# Note: This is used when the content metadata indexer is used alone | # Note: This is used when the content metadata indexer is used alone | ||||
# (not the case for example in the case of the RevisionMetadataIndexer) | # (not the case for example in the case of the RevisionMetadataIndexer) | ||||
CONFIG_BASE_FILENAME = 'indexer/content_metadata' | CONFIG_BASE_FILENAME = 'indexer/content_metadata' | ||||
def __init__(self, tool, config): | |||||
# FIXME: Simplify this twisted way to use the exact same | |||||
# config of RevisionMetadataIndexer object that uses | |||||
# internally ContentMetadataIndexer | |||||
self.config = config | |||||
self.config['tools'] = tool | |||||
self.results = [] | |||||
super().__init__() | |||||
self.tool = self.tools[0] # Tool is now registered (cf. prepare call) | |||||
def filter(self, ids): | def filter(self, ids): | ||||
"""Filter out known sha1s and return only missing ones. | """Filter out known sha1s and return only missing ones. | ||||
""" | """ | ||||
yield from self.idx_storage.content_metadata_missing(( | yield from self.idx_storage.content_metadata_missing(( | ||||
{ | { | ||||
'id': sha1, | 'id': sha1, | ||||
'indexer_configuration_id': self.tool['id'], | 'indexer_configuration_id': self.tool['id'], | ||||
} for sha1 in ids | } for sha1 in ids | ||||
▲ Show 20 Lines • Show All 67 Lines • ▼ Show 20 Lines | ADDITIONAL_CONFIG = { | ||||
'version': '0.0.2', | 'version': '0.0.2', | ||||
'configuration': { | 'configuration': { | ||||
'type': 'local', | 'type': 'local', | ||||
'context': ['NpmMapping', 'CodemetaMapping'] | 'context': ['NpmMapping', 'CodemetaMapping'] | ||||
}, | }, | ||||
}), | }), | ||||
} | } | ||||
ContentMetadataIndexer = ContentMetadataIndexer | |||||
def prepare(self): | |||||
super().prepare() | |||||
self.tool = self.tools[0] | |||||
def filter(self, sha1_gits): | def filter(self, sha1_gits): | ||||
"""Filter out known sha1s and return only missing ones. | """Filter out known sha1s and return only missing ones. | ||||
""" | """ | ||||
yield from self.idx_storage.revision_metadata_missing(( | yield from self.idx_storage.revision_metadata_missing(( | ||||
{ | { | ||||
'id': sha1_git, | 'id': sha1_git, | ||||
'indexer_configuration_id': self.tool['id'], | 'indexer_configuration_id': self.tool['id'], | ||||
▲ Show 20 Lines • Show All 79 Lines • ▼ Show 20 Lines | def translate_revision_metadata(self, detected_files): | ||||
} | } | ||||
# TODO: iterate on each context, on each file | # TODO: iterate on each context, on each file | ||||
# -> get raw_contents | # -> get raw_contents | ||||
# -> translate each content | # -> translate each content | ||||
config = { | config = { | ||||
k: self.config[k] | k: self.config[k] | ||||
for k in [INDEXER_CFG_KEY, 'objstorage', 'storage'] | for k in [INDEXER_CFG_KEY, 'objstorage', 'storage'] | ||||
} | } | ||||
config['tools'] = [tool] | |||||
for context in detected_files.keys(): | for context in detected_files.keys(): | ||||
tool['configuration']['context'] = context | cfg = deepcopy(config) | ||||
c_metadata_indexer = self.ContentMetadataIndexer(tool, config) | cfg['tools'][0]['configuration']['context'] = context | ||||
c_metadata_indexer = ContentMetadataIndexer(config=cfg) | |||||
# sha1s that are in content_metadata table | # sha1s that are in content_metadata table | ||||
sha1s_in_storage = [] | sha1s_in_storage = [] | ||||
metadata_generator = self.idx_storage.content_metadata_get( | metadata_generator = self.idx_storage.content_metadata_get( | ||||
detected_files[context]) | detected_files[context]) | ||||
for c in metadata_generator: | for c in metadata_generator: | ||||
# extracting translated_metadata | # extracting translated_metadata | ||||
sha1 = c['id'] | sha1 = c['id'] | ||||
sha1s_in_storage.append(sha1) | sha1s_in_storage.append(sha1) | ||||
Show All 26 Lines | |||||
class OriginMetadataIndexer(OriginIndexer): | class OriginMetadataIndexer(OriginIndexer): | ||||
CONFIG_BASE_FILENAME = 'indexer/origin_intrinsic_metadata' | CONFIG_BASE_FILENAME = 'indexer/origin_intrinsic_metadata' | ||||
ADDITIONAL_CONFIG = { | ADDITIONAL_CONFIG = { | ||||
'tools': ('list', []) | 'tools': ('list', []) | ||||
} | } | ||||
def check(self, **kwargs): | USE_TOOLS = False | ||||
kwargs['check_tools'] = False | |||||
super().check(**kwargs) | |||||
def filter(self, ids): | |||||
return ids | |||||
def run(self, origin_head, policy_update): | def run(self, origin_head, policy_update): | ||||
"""Expected to be called with the result of RevisionMetadataIndexer | """Expected to be called with the result of RevisionMetadataIndexer | ||||
as first argument; i.e. not a list of ids as other indexers would. | as first argument; i.e. not a list of ids as other indexers would. | ||||
Args: | Args: | ||||
origin_head (dict): {str(origin_id): rev_id} | origin_head (dict): {str(origin_id): rev_id} | ||||
keys `origin_id` and `revision_id`, which is the result | keys `origin_id` and `revision_id`, which is the result | ||||
▲ Show 20 Lines • Show All 52 Lines • Show Last 20 Lines |