diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -297,13 +297,32 @@
         return (orig_metadata, rev_metadata)
 
     def persist_index_computations(self, results, policy_update):
+        """Store deduplicated revision and origin metadata.
+
+        Args:
+            results: iterable of (orig_item, rev_item) pairs.
+            policy_update: both storage calls get conflict_update=True
+                when this equals 'update-dups', and False otherwise.
+
+        """
+        conflict_update = (policy_update == 'update-dups')
+
+        # Deduplicate each list on its own. Skipping orig_item whenever
+        # its rev_item was already seen would drop origins that share a
+        # revision with an earlier result.
+        rev_metadata = []
+        orig_metadata = []
+        for (orig_item, rev_item) in results:
+            if rev_item not in rev_metadata:
+                rev_metadata.append(rev_item)
+            if orig_item not in orig_metadata:
+                orig_metadata.append(orig_item)
+
         self.idx_storage.revision_metadata_add(
-            [rev_item for (orig_item, rev_item) in results],
-            conflict_update=(policy_update == 'update-dups'))
+            rev_metadata, conflict_update=conflict_update)
 
         self.idx_storage.origin_intrinsic_metadata_add(
-            [orig_item for (orig_item, rev_item) in results],
-            conflict_update=(policy_update == 'update-dups'))
+            orig_metadata, conflict_update=conflict_update)
 
 
 @click.command()
diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py
--- a/swh/indexer/tests/test_origin_metadata.py
+++ b/swh/indexer/tests/test_origin_metadata.py
@@ -122,3 +122,14 @@
     for result in results:
         del result['tool']
     assert results == [origin_metadata]
+
+
+@origin_metadata_indexer_test
+def test_origin_metadata_indexer_duplicates(storage, idx_storage):
+    """Smoke test: indexing a duplicated origin must not raise."""
+    indexer = OriginMetadataIndexer()
+    indexer.storage = storage
+    indexer.idx_storage = idx_storage
+    indexer.run(["git+https://github.com/librariesio/yarn-parser"])
+
+    indexer.run(["git+https://github.com/librariesio/yarn-parser"]*2)