diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -303,13 +303,22 @@
         return results
 
     def persist_index_computations(self, results, policy_update):
+        conflict_update = (policy_update == 'update-dups')
+
+        # Deduplicate revisions
+        rev_metadata = []
+        orig_metadata = []
+        for (orig_item, rev_item) in results:
+            if rev_item not in rev_metadata:
+                rev_metadata.append(rev_item)
+            if orig_item not in orig_metadata:
+                orig_metadata.append(orig_item)
+
         self.idx_storage.revision_metadata_add(
-            [rev_item for (orig_item, rev_item) in results],
-            conflict_update=(policy_update == 'update-dups'))
+            rev_metadata, conflict_update=conflict_update)
         self.idx_storage.origin_intrinsic_metadata_add(
-            [orig_item for (orig_item, rev_item) in results],
-            conflict_update=(policy_update == 'update-dups'))
+            orig_metadata, conflict_update=conflict_update)
 
 
 
 @click.command()
diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py
--- a/swh/indexer/tests/test_origin_metadata.py
+++ b/swh/indexer/tests/test_origin_metadata.py
@@ -75,3 +75,25 @@
     for result in results:
         del result['tool']
     assert results == [origin_metadata]
+
+
+def test_origin_metadata_indexer_duplicates(
+        idx_storage, storage, obj_storage, origin_metadata_indexer):
+    indexer = OriginMetadataIndexer()
+    indexer.storage = storage
+    indexer.idx_storage = idx_storage
+    indexer.run(["git+https://github.com/librariesio/yarn-parser"])
+
+    indexer.run(["git+https://github.com/librariesio/yarn-parser"]*2)
+
+    origin = storage.origin_get({
+        'type': 'git',
+        'url': 'https://github.com/librariesio/yarn-parser'})
+    rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
+
+    results = list(indexer.idx_storage.revision_metadata_get([rev_id]))
+    assert len(results) == 1
+
+    results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
+        origin['id']]))
+    assert len(results) == 1