swh/indexer/metadata.py
 # Copyright (C) 2017-2018  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

-from copy import deepcopy
-
 from swh.core.utils import grouper

 from swh.indexer.codemeta import merge_documents
 from swh.indexer.indexer import ContentIndexer, RevisionIndexer, OriginIndexer
 from swh.indexer.origin_head import OriginHeadIndexer
 from swh.indexer.metadata_dictionary import MAPPINGS
 from swh.indexer.metadata_detector import detect_metadata
-from swh.indexer.storage import INDEXER_CFG_KEY

 from swh.model import hashutil

 REVISION_GET_BATCH_SIZE = 10
 ORIGIN_GET_BATCH_SIZE = 10
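The two batch-size constants are consumed together with grouper (imported from swh.core.utils) to page storage reads in the parts of the file not shown in this changeset. A minimal sketch of that pattern, assuming the usual storage.revision_get(ids) iterable API; fetch_revisions itself is a hypothetical helper, not code from this diff:

    from swh.core.utils import grouper

    REVISION_GET_BATCH_SIZE = 10

    def fetch_revisions(storage, revision_ids):
        # Read revisions in chunks of REVISION_GET_BATCH_SIZE instead of
        # one storage round-trip per revision.
        for batch in grouper(revision_ids, REVISION_GET_BATCH_SIZE):
            yield from storage.revision_get(list(batch))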
[... 131 lines elided ...]

     def index(self, rev):
         result = {
             'id': rev['id'],
             'indexer_configuration_id': self.tool['id'],
             'mappings': None,
             'metadata': None
         }
         try:
+            # Get the root directory of the revision
             root_dir = rev['directory']
             dir_ls = list(self.storage.directory_ls(root_dir, recursive=False))
             if [entry['type'] for entry in dir_ls] == ['dir']:
                 # If the root is just a single directory, recurse into it
                 # eg. PyPI packages, GNU tarballs
                 subdir = dir_ls[0]['target']
                 dir_ls = self.storage.directory_ls(subdir, recursive=False)
+            # List files, and find which ones are metadata files
             files = [entry for entry in dir_ls if entry['type'] == 'file']
             detected_files = detect_metadata(files)
+            # Translate the metadata from these files
             (mappings, metadata) = self.translate_revision_intrinsic_metadata(
                 detected_files,
                 log_suffix='revision=%s' % hashutil.hash_to_hex(rev['id']))
             result['mappings'] = mappings
             result['metadata'] = metadata
         except Exception as e:
             self.log.exception(
                 'Problem when indexing rev: %r', e)
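The single-subdirectory rule in this hunk (unwrapping tarball-style roots) can be illustrated in isolation. A minimal sketch; unwrap_single_root and the listings dict are hypothetical stand-ins for storage.directory_ls(), not part of the changeset:

    def unwrap_single_root(dir_ls, list_dir):
        # If the root listing is exactly one directory entry (e.g. a PyPI
        # sdist or a GNU tarball), descend into it; otherwise keep the root.
        if [entry['type'] for entry in dir_ls] == ['dir']:
            return list_dir(dir_ls[0]['target'])
        return dir_ls

    listings = {
        'root': [{'type': 'dir', 'target': 'pkg-1.0'}],
        'pkg-1.0': [{'type': 'file', 'name': b'PKG-INFO'},
                    {'type': 'file', 'name': b'setup.py'}],
    }
    entries = unwrap_single_root(listings['root'], listings.get)
    assert [e['name'] for e in entries] == [b'PKG-INFO', b'setup.py']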
     def persist_index_computations(self, results, policy_update):
[... 12 lines elided ...]
             respectively update duplicates or ignore them

         """
         # TODO: add functions in storage to keep data in
         # revision_intrinsic_metadata
         self.idx_storage.revision_intrinsic_metadata_add(
             results, conflict_update=(policy_update == 'update-dups'))
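For reference, the policy_update string collapses into a boolean conflict_update flag here; a one-line illustration (to_conflict_update is a hypothetical helper mirroring the expression above):

    def to_conflict_update(policy_update):
        # Same mapping as in persist_index_computations above.
        return policy_update == 'update-dups'

    assert to_conflict_update('update-dups') is True    # overwrite duplicates
    assert to_conflict_update('ignore-dups') is False   # keep existing rows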
+    def get_cached_content_metadata(self, sha1s):
+        """Queries the 'content_metadata' table for metadata on contents
+        already indexed by the exact same tool as this one."""
+        cached_metadata = self.idx_storage.content_metadata_get(sha1s)
+        sha1_to_metadata = {}
+        for c in cached_metadata:
+            if c['tool']['id'] == self.tool['id']:
+                # Only reuse the cache if the content was indexed by the
+                # same version of this very tool; entries from other tools
+                # (or other versions) are ignored.
+                assert c['id'] not in sha1_to_metadata, (
+                    'Content %s has more than one metadata entry for tool %s.'
+                    % (hashutil.hash_to_hex(c['id']), self.tool))
+                sha1_to_metadata[c['id']] = c['metadata']
+            else:
+                # TODO: if same tool but older version, delete the cache
+                # entry?
+                pass
+        return sha1_to_metadata
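A hedged sketch of the filtering this new method performs, with an in-memory stand-in for idx_storage.content_metadata_get(); FakeIdxStorage and the literal rows are illustrative only:

    class FakeIdxStorage:
        def __init__(self, rows):
            self.rows = rows

        def content_metadata_get(self, sha1s):
            return (r for r in self.rows if r['id'] in sha1s)

    tool = {'id': 7}
    idx_storage = FakeIdxStorage([
        {'id': b'\x01', 'tool': {'id': 7}, 'metadata': {'name': 'foo'}},
        {'id': b'\x02', 'tool': {'id': 8}, 'metadata': {'name': 'bar'}},
    ])

    # Same filtering as get_cached_content_metadata: keep only rows
    # produced by this exact tool (same id, hence same name/version).
    cached = {c['id']: c['metadata']
              for c in idx_storage.content_metadata_get([b'\x01', b'\x02'])
              if c['tool']['id'] == tool['id']}

    assert cached == {b'\x01': {'name': 'foo'}}
    # The content indexed by the other tool is not reused; the caller
    # re-indexes it instead.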
     def translate_revision_intrinsic_metadata(
             self, detected_files, log_suffix):
         """
         Determine how to translate the metadata from one or more
         detected metadata files:

         Args:
             detected_files (dict): dictionary mapping context names (e.g.,
               "npm", "authors") to lists of sha1

         Returns:
             (List[str], dict): list of mappings used and dict with
             translated metadata according to the CodeMeta vocabulary

         """
         used_mappings = [MAPPINGS[context].name for context in detected_files]
         metadata = []
-        tool = {
-            'name': 'swh-metadata-translator',
-            'version': '0.0.2',
-            'configuration': {
-            },
-        }
-        # TODO: iterate on each context, on each file
-        # -> get raw_contents
-        # -> translate each content
-        config = {
-            k: self.config[k]
-            for k in [INDEXER_CFG_KEY, 'objstorage', 'storage']
-        }
-        config['tools'] = [tool]
         for context in detected_files.keys():
-            cfg = deepcopy(config)
-            cfg['tools'][0]['configuration']['context'] = context
-            c_metadata_indexer = ContentMetadataIndexer(config=cfg)
             # sha1s that are in content_metadata table
-            sha1s_in_storage = []
-            metadata_generator = self.idx_storage.content_metadata_get(
-                detected_files[context])
-            for c in metadata_generator:
-                # extracting metadata
-                sha1 = c['id']
-                sha1s_in_storage.append(sha1)
-                local_metadata = c['metadata']
-                # local metadata is aggregated
-                if local_metadata:
-                    metadata.append(local_metadata)
+            cached_metadata_for_files = self.get_cached_content_metadata(
+                detected_files[context])
+            metadata.extend(cached_metadata_for_files.values())
             sha1s_filtered = [item for item in detected_files[context]
-                              if item not in sha1s_in_storage]
+                              if item not in cached_metadata_for_files]
+            # If some metadata is not already cached, run the
+            # ContentMetadataIndexer on the remaining contents.
             if sha1s_filtered:
+                c_metadata_indexer = ContentMetadataIndexer(config=self.config)
                 # content indexing
                 try:
                     c_metadata_indexer.run(sha1s_filtered,
                                            policy_update='ignore-dups',
                                            log_suffix=log_suffix)
-                    # on the fly possibility:
                     for result in c_metadata_indexer.results:
                         local_metadata = result['metadata']
                         metadata.append(local_metadata)
                 except Exception:
                     self.log.exception(
                         "Exception while indexing metadata on contents")
[... 91 lines elided ...]
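Read end to end, the rewritten loop does: cache lookup, filter, re-index only the misses. A compressed, hedged sketch of that control flow, with hypothetical stand-ins (get_cached, run_content_indexer) replacing the storage and indexer calls:

    detected_files = {'npm': [b'\x01', b'\x02']}

    def get_cached(sha1s):
        # Pretend only b'\x01' was already indexed by this exact tool.
        return {b'\x01': {'name': 'foo'}}

    def run_content_indexer(sha1s):
        # Pretend the content indexer translated the remaining contents.
        return [{'metadata': {'name': 'bar'}} for _ in sha1s]

    metadata = []
    for context, sha1s in detected_files.items():
        cached = get_cached(sha1s)
        metadata.extend(cached.values())
        to_index = [s for s in sha1s if s not in cached]
        if to_index:  # only the cache misses are (re)indexed
            metadata.extend(r['metadata'] for r in run_content_indexer(to_index))

    assert metadata == [{'name': 'foo'}, {'name': 'bar'}]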