Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/metadata.py
Show All 29 Lines | class ContentMetadataIndexer(ContentIndexer): | ||||
def __init__(self, tool, config): | def __init__(self, tool, config): | ||||
# twisted way to use the exact same config of RevisionMetadataIndexer | # twisted way to use the exact same config of RevisionMetadataIndexer | ||||
# object that uses internally ContentMetadataIndexer | # object that uses internally ContentMetadataIndexer | ||||
self.config = config | self.config = config | ||||
self.config['tools'] = tool | self.config['tools'] = tool | ||||
super().__init__() | super().__init__() | ||||
def prepare(self): | |||||
self.results = [] | |||||
if self.config[INDEXER_CFG_KEY]: | |||||
self.idx_storage = self.config[INDEXER_CFG_KEY] | |||||
if self.config['objstorage']: | |||||
self.objstorage = self.config['objstorage'] | |||||
_log = logging.getLogger('requests.packages.urllib3.connectionpool') | |||||
_log.setLevel(logging.WARN) | |||||
self.log = logging.getLogger('swh.indexer') | |||||
self.tools = self.register_tools(self.config['tools']) | |||||
# NOTE: only one tool so far, change when no longer true | |||||
self.tool = self.tools[0] | |||||
def filter(self, ids): | def filter(self, ids): | ||||
"""Filter out known sha1s and return only missing ones. | """Filter out known sha1s and return only missing ones. | ||||
""" | """ | ||||
yield from self.idx_storage.content_metadata_missing(( | yield from self.idx_storage.content_metadata_missing(( | ||||
{ | { | ||||
'id': sha1, | 'id': sha1, | ||||
'indexer_configuration_id': self.tool['id'], | 'indexer_configuration_id': self.tool['id'], | ||||
} for sha1 in ids | } for sha1 in ids | ||||
▲ Show 20 Lines • Show All 75 Lines • ▼ Show 20 Lines | ADDITIONAL_CONFIG = { | ||||
'version': '0.0.1', | 'version': '0.0.1', | ||||
'configuration': { | 'configuration': { | ||||
'type': 'local', | 'type': 'local', | ||||
'context': ['npm', 'codemeta'] | 'context': ['npm', 'codemeta'] | ||||
}, | }, | ||||
}), | }), | ||||
} | } | ||||
ContentMetadataIndexer = ContentMetadataIndexer | |||||
def prepare(self): | def prepare(self): | ||||
super().prepare() | super().prepare() | ||||
self.tool = self.tools[0] | self.tool = self.tools[0] | ||||
def filter(self, sha1_gits): | def filter(self, sha1_gits): | ||||
"""Filter out known sha1s and return only missing ones. | """Filter out known sha1s and return only missing ones. | ||||
""" | """ | ||||
▲ Show 20 Lines • Show All 85 Lines • ▼ Show 20 Lines | def translate_revision_metadata(self, detected_files): | ||||
# -> get raw_contents | # -> get raw_contents | ||||
# -> translate each content | # -> translate each content | ||||
config = { | config = { | ||||
INDEXER_CFG_KEY: self.idx_storage, | INDEXER_CFG_KEY: self.idx_storage, | ||||
'objstorage': self.objstorage | 'objstorage': self.objstorage | ||||
} | } | ||||
for context in detected_files.keys(): | for context in detected_files.keys(): | ||||
tool['configuration']['context'] = context | tool['configuration']['context'] = context | ||||
c_metadata_indexer = ContentMetadataIndexer(tool, config) | c_metadata_indexer = self.ContentMetadataIndexer(tool, config) | ||||
# sha1s that are in content_metadata table | # sha1s that are in content_metadata table | ||||
sha1s_in_storage = [] | sha1s_in_storage = [] | ||||
metadata_generator = self.idx_storage.content_metadata_get( | metadata_generator = self.idx_storage.content_metadata_get( | ||||
detected_files[context]) | detected_files[context]) | ||||
for c in metadata_generator: | for c in metadata_generator: | ||||
# extracting translated_metadata | # extracting translated_metadata | ||||
sha1 = c['id'] | sha1 = c['id'] | ||||
sha1s_in_storage.append(sha1) | sha1s_in_storage.append(sha1) | ||||
▲ Show 20 Lines • Show All 43 Lines • Show Last 20 Lines |