diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -3,8 +3,6 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-from copy import deepcopy
-
 from swh.core.utils import grouper
 from swh.indexer.codemeta import merge_documents
 
@@ -12,7 +10,6 @@
 from swh.indexer.origin_head import OriginHeadIndexer
 from swh.indexer.metadata_dictionary import MAPPINGS
 from swh.indexer.metadata_detector import detect_metadata
-from swh.indexer.storage import INDEXER_CFG_KEY
 from swh.model import hashutil
 
 
@@ -160,15 +157,21 @@
         }
 
         try:
+            # Get the root directory of the revision
            root_dir = rev['directory']
            dir_ls = list(self.storage.directory_ls(root_dir, recursive=False))
+
            if [entry['type'] for entry in dir_ls] == ['dir']:
                # If the root is just a single directory, recurse into it
                # eg. PyPI packages, GNU tarballs
                subdir = dir_ls[0]['target']
                dir_ls = self.storage.directory_ls(subdir, recursive=False)
+
+            # List files, and find which ones are metadata files
            files = [entry for entry in dir_ls if entry['type'] == 'file']
            detected_files = detect_metadata(files)
+
+            # Translate the metadata from these files
            (mappings, metadata) = self.translate_revision_intrinsic_metadata(
                detected_files,
                log_suffix='revision=%s' % hashutil.hash_to_hex(rev['id']))
@@ -197,6 +200,27 @@
         self.idx_storage.revision_intrinsic_metadata_add(
             results, conflict_update=(policy_update == 'update-dups'))
 
+    def get_cached_content_metadata(self, sha1s):
+        """Fetches the 'content_metadata' table to find metadata for
+        content indexed by the exact same tool as this one."""
+        cached_metadata = self.idx_storage.content_metadata_get(sha1s)
+        sha1_to_metadata = {}
+        for c in cached_metadata:
+            if c['tool']['id'] == self.tool['id']:
+                # It was indexed by the exact same tool (and the same
+                # version), so the cached metadata can be reused.
+                assert c['id'] not in sha1_to_metadata, (
+                    'Content %s has more than one metadata entry for tool %s.'
+                    % (hashutil.hash_to_hex(c['id']), self.tool))
+
+                sha1_to_metadata[c['id']] = c['metadata']
+            else:
+                # TODO: if same tool but older version, let's delete
+                # the cache?
+                pass
+
+        return sha1_to_metadata
+
     def translate_revision_intrinsic_metadata(
             self, detected_files, log_suffix):
         """
@@ -214,47 +238,24 @@
         """
         used_mappings = [MAPPINGS[context].name for context in detected_files]
         metadata = []
-        tool = {
-            'name': 'swh-metadata-translator',
-            'version': '0.0.2',
-            'configuration': {
-            },
-        }
-        # TODO: iterate on each context, on each file
-        # -> get raw_contents
-        # -> translate each content
-        config = {
-            k: self.config[k]
-            for k in [INDEXER_CFG_KEY, 'objstorage', 'storage']
-        }
-        config['tools'] = [tool]
         for context in detected_files.keys():
-            cfg = deepcopy(config)
-            cfg['tools'][0]['configuration']['context'] = context
-            c_metadata_indexer = ContentMetadataIndexer(config=cfg)
             # sha1s that are in content_metadata table
-            sha1s_in_storage = []
-            metadata_generator = self.idx_storage.content_metadata_get(
+            cached_metadata_for_files = self.get_cached_content_metadata(
                 detected_files[context])
-            for c in metadata_generator:
-                # extracting metadata
-                sha1 = c['id']
-                sha1s_in_storage.append(sha1)
-                local_metadata = c['metadata']
-                # local metadata is aggregated
-                if local_metadata:
-                    metadata.append(local_metadata)
+            metadata.extend(cached_metadata_for_files.values())
 
             sha1s_filtered = [item for item in detected_files[context]
-                              if item not in sha1s_in_storage]
+                              if item not in cached_metadata_for_files]
 
+            # If some of the metadata is not cached yet, run the
+            # ContentMetadataIndexer on the remaining contents.
             if sha1s_filtered:
+                c_metadata_indexer = ContentMetadataIndexer(config=self.config)
                 # content indexing
                 try:
                     c_metadata_indexer.run(sha1s_filtered,
                                            policy_update='ignore-dups',
                                            log_suffix=log_suffix)
-                    # on the fly possibility:
                     for result in c_metadata_indexer.results:
                         local_metadata = result['metadata']
                         metadata.append(local_metadata)
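
Note for reviewers: below is a minimal, self-contained sketch of the cache-then-index flow this patch introduces. FakeIndexerStorage, FakeContentMetadataIndexer, and the sha1 values are stand-ins invented for illustration and are not part of swh-indexer; only the control flow mirrors the patched get_cached_content_metadata() and translate_revision_intrinsic_metadata(): rows from the content_metadata table are reused only when they were produced by the same tool id, and the content indexer runs only on the sha1s that missed the cache.

# Hypothetical, self-contained illustration of the cache-then-index flow.
# The Fake* classes are stand-ins for swh-indexer components; only the
# control flow is the point.

TOOL = {'id': 42, 'name': 'swh-metadata-detector'}


class FakeIndexerStorage:
    """In-memory stand-in for idx_storage.content_metadata_get()."""
    def __init__(self, rows):
        self.rows = rows  # list of {'id', 'tool', 'metadata'} dicts

    def content_metadata_get(self, sha1s):
        return [row for row in self.rows if row['id'] in sha1s]


class FakeContentMetadataIndexer:
    """Stand-in for ContentMetadataIndexer: 'indexes' the uncached contents."""
    def __init__(self):
        self.results = []

    def run(self, sha1s):
        self.results = [{'id': sha1, 'metadata': {'name': 'fresh-%s' % sha1}}
                        for sha1 in sha1s]


def get_cached_content_metadata(idx_storage, tool, sha1s):
    """Same shape as the new helper: keep only rows produced by `tool`."""
    sha1_to_metadata = {}
    for row in idx_storage.content_metadata_get(sha1s):
        if row['tool']['id'] == tool['id']:
            sha1_to_metadata[row['id']] = row['metadata']
    return sha1_to_metadata


def translate(idx_storage, tool, sha1s):
    """Mirror of the patched loop body: use the cache first, index the rest."""
    metadata = []
    cached = get_cached_content_metadata(idx_storage, tool, sha1s)
    metadata.extend(cached.values())

    sha1s_filtered = [sha1 for sha1 in sha1s if sha1 not in cached]
    if sha1s_filtered:
        indexer = FakeContentMetadataIndexer()
        indexer.run(sha1s_filtered)
        metadata.extend(result['metadata'] for result in indexer.results)
    return metadata


if __name__ == '__main__':
    storage = FakeIndexerStorage([
        {'id': 'sha1-a', 'tool': TOOL, 'metadata': {'name': 'cached-a'}},
        {'id': 'sha1-b', 'tool': {'id': 7}, 'metadata': {'name': 'other-tool'}},
    ])
    # 'sha1-a' comes from the cache; 'sha1-b' (indexed by another tool) and
    # 'sha1-c' (never indexed) are handled by the content indexer.
    print(translate(storage, TOOL, ['sha1-a', 'sha1-b', 'sha1-c']))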