Page MenuHomeSoftware Heritage

D1206.id3803.diff
No OneTemporary

D1206.id3803.diff

diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -293,14 +293,37 @@
# Deduplicate revisions
rev_metadata = []
orig_metadata = []
+ revs_to_delete = set()
+ origs_to_delete = set()
for (orig_item, rev_item) in results:
- if rev_item not in rev_metadata:
- rev_metadata.append(rev_item)
- if orig_item not in orig_metadata:
- orig_metadata.append(orig_item)
-
- self.idx_storage.revision_metadata_add(
- rev_metadata, conflict_update=conflict_update)
-
- self.idx_storage.origin_intrinsic_metadata_add(
- orig_metadata, conflict_update=conflict_update)
+ assert rev_item['mappings'] == orig_item['mappings']
+ if rev_item['mappings']:
+ if rev_item not in rev_metadata:
+ rev_metadata.append(rev_item)
+ if orig_item not in orig_metadata:
+ orig_metadata.append(orig_item)
+ else:
+ revs_to_delete.add((
+ rev_item['id'],
+ rev_item['indexer_configuration_id'],
+ ))
+ origs_to_delete.add((
+ orig_item['origin_id'],
+ orig_item['indexer_configuration_id'],
+ ))
+
+ if rev_metadata:
+ self.idx_storage.revision_metadata_add(
+ rev_metadata, conflict_update=conflict_update)
+ if orig_metadata:
+ self.idx_storage.origin_intrinsic_metadata_add(
+ orig_metadata, conflict_update=conflict_update)
+
+ # revs_to_delete should always be empty unless we changed a mapping
+ # to detect fewer files.
+ # However, origs_to_delete may be non-empty whenever an upstream
+ # deletes a metadata file.
+ for (orig_id, tool_id) in origs_to_delete:
+ self.idx_storage.origin_intrinsic_metadata_delete(orig_id, tool_id)
+ for (rev_id, tool_id) in revs_to_delete:
+ self.idx_storage.revision_metadata_delete(rev_id, tool_id)
diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py
--- a/swh/indexer/tests/test_origin_metadata.py
+++ b/swh/indexer/tests/test_origin_metadata.py
@@ -3,6 +3,8 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from unittest.mock import patch
+
from swh.model.hashutil import hash_to_bytes
from swh.indexer.metadata import OriginMetadataIndexer
@@ -155,3 +157,54 @@
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
origin1['id'], origin2['id']]))
assert len(results) == 2
+
+
+def test_origin_metadata_indexer_no_metadata(
+ idx_storage, storage, obj_storage):
+
+ indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
+ with patch('swh.indexer.metadata_dictionary.NpmMapping.filename',
+ b'foo.json'):
+ indexer.run(["git+https://github.com/librariesio/yarn-parser"])
+
+ origin = storage.origin_get({
+ 'type': 'git',
+ 'url': 'https://github.com/librariesio/yarn-parser'})
+ rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
+
+ results = list(indexer.idx_storage.revision_metadata_get([rev_id]))
+ assert results == []
+
+ results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
+ origin['id']]))
+ assert results == []
+
+
+def test_origin_metadata_indexer_delete_metadata(
+ idx_storage, storage, obj_storage):
+
+ indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
+ indexer.run(["git+https://github.com/librariesio/yarn-parser"])
+
+ origin = storage.origin_get({
+ 'type': 'git',
+ 'url': 'https://github.com/librariesio/yarn-parser'})
+ rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
+
+ results = list(indexer.idx_storage.revision_metadata_get([rev_id]))
+ assert results != []
+
+ results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
+ origin['id']]))
+ assert results != []
+
+ with patch('swh.indexer.metadata_dictionary.NpmMapping.filename',
+ b'foo.json'):
+ indexer.run(["git+https://github.com/librariesio/yarn-parser"])
+
+ results = list(indexer.idx_storage.revision_metadata_get([rev_id]))
+ assert results == []
+
+ results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
+ origin['id']]))
+ assert results == []

File Metadata

Mime Type
text/plain
Expires
Jul 3 2025, 6:05 PM (4 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3226239

Event Timeline