diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -293,14 +293,39 @@
         # Deduplicate revisions
         rev_metadata = []
         orig_metadata = []
+        revs_to_delete = set()
+        origs_to_delete = set()
         for (orig_item, rev_item) in results:
-            if rev_item not in rev_metadata:
-                rev_metadata.append(rev_item)
-            if orig_item not in orig_metadata:
-                orig_metadata.append(orig_item)
-
-        self.idx_storage.revision_metadata_add(
-            rev_metadata, conflict_update=conflict_update)
-
-        self.idx_storage.origin_intrinsic_metadata_add(
-            orig_metadata, conflict_update=conflict_update)
+            assert rev_item['mappings'] == orig_item['mappings']
+            if rev_item['mappings']:
+                # Only store translated metadata if we found a metadata file.
+                # Otherwise it's just an empty dict with a "@context" key.
+                if rev_item not in rev_metadata:
+                    rev_metadata.append(rev_item)
+                if orig_item not in orig_metadata:
+                    orig_metadata.append(orig_item)
+            else:
+                revs_to_delete.add((
+                    rev_item['id'],
+                    rev_item['indexer_configuration_id'],
+                ))
+                origs_to_delete.add((
+                    orig_item['origin_id'],
+                    orig_item['indexer_configuration_id'],
+                ))
+
+        if rev_metadata:
+            self.idx_storage.revision_metadata_add(
+                rev_metadata, conflict_update=conflict_update)
+        if orig_metadata:
+            self.idx_storage.origin_intrinsic_metadata_add(
+                orig_metadata, conflict_update=conflict_update)
+
+        # Both sets are filled in lockstep above. Presumably the revision
+        # deletions only matter if a mapping was changed to detect fewer
+        # files, whereas the origin deletions matter whenever an upstream
+        # deletes a metadata file.
+        for (orig_id, tool_id) in origs_to_delete:
+            self.idx_storage.origin_intrinsic_metadata_delete(orig_id, tool_id)
+        for (rev_id, tool_id) in revs_to_delete:
+            self.idx_storage.revision_metadata_delete(rev_id, tool_id)
diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py
--- a/swh/indexer/tests/test_origin_metadata.py
+++ b/swh/indexer/tests/test_origin_metadata.py
@@ -3,6 +3,8 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+from unittest.mock import patch
+
 from swh.model.hashutil import hash_to_bytes
 
 from swh.indexer.metadata import OriginMetadataIndexer
@@ -155,3 +157,54 @@
     results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
         origin1['id'], origin2['id']]))
     assert len(results) == 2
+
+
+def test_origin_metadata_indexer_no_metadata(
+        idx_storage, storage, obj_storage):
+    """Indexing with NpmMapping.filename patched stores no metadata."""
+    indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
+    with patch('swh.indexer.metadata_dictionary.NpmMapping.filename',
+               b'foo.json'):
+        indexer.run(["git+https://github.com/librariesio/yarn-parser"])
+
+    origin = storage.origin_get({
+        'type': 'git',
+        'url': 'https://github.com/librariesio/yarn-parser'})
+    rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
+
+    results = list(indexer.idx_storage.revision_metadata_get([rev_id]))
+    assert results == []
+
+    results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
+        origin['id']]))
+    assert results == []
+
+
+def test_origin_metadata_indexer_delete_metadata(
+        idx_storage, storage, obj_storage):
+    """Re-indexing with NpmMapping.filename patched removes metadata."""
+    indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
+    indexer.run(["git+https://github.com/librariesio/yarn-parser"])
+
+    origin = storage.origin_get({
+        'type': 'git',
+        'url': 'https://github.com/librariesio/yarn-parser'})
+    rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
+
+    results = list(indexer.idx_storage.revision_metadata_get([rev_id]))
+    assert results != []
+
+    results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
+        origin['id']]))
+    assert results != []
+
+    with patch('swh.indexer.metadata_dictionary.NpmMapping.filename',
+               b'foo.json'):
+        indexer.run(["git+https://github.com/librariesio/yarn-parser"])
+
+    results = list(indexer.idx_storage.revision_metadata_get([rev_id]))
+    assert results == []
+
+    results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
+        origin['id']]))
+    assert results == []