Page MenuHomeSoftware Heritage

D8417.id30384.diff
No OneTemporary

D8417.id30384.diff

diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -535,25 +535,27 @@
results: List[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]],
) -> Dict[str, int]:
# Deduplicate directories
- dir_metadata: List[DirectoryIntrinsicMetadataRow] = []
- orig_metadata: List[OriginIntrinsicMetadataRow] = []
+ dir_metadata: Dict[bytes, DirectoryIntrinsicMetadataRow] = {}
+ orig_metadata: Dict[str, OriginIntrinsicMetadataRow] = {}
summary: Dict = {}
for (orig_item, dir_item) in results:
assert dir_item.metadata == orig_item.metadata
if dir_item.metadata and not (dir_item.metadata.keys() <= {"@context"}):
# Only store non-empty metadata sets
- if dir_item not in dir_metadata:
- dir_metadata.append(dir_item)
- if orig_item not in orig_metadata:
- orig_metadata.append(orig_item)
+ if dir_item.id not in dir_metadata:
+ dir_metadata[dir_item.id] = dir_item
+ if orig_item.id not in orig_metadata:
+ orig_metadata[orig_item.id] = orig_item
if dir_metadata:
summary_dir = self.idx_storage.directory_intrinsic_metadata_add(
- dir_metadata
+ list(dir_metadata.values())
)
summary.update(summary_dir)
if orig_metadata:
- summary_ori = self.idx_storage.origin_intrinsic_metadata_add(orig_metadata)
+ summary_ori = self.idx_storage.origin_intrinsic_metadata_add(
+ list(orig_metadata.values())
+ )
summary.update(summary_ori)
return summary
diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py
--- a/swh/indexer/tests/test_origin_metadata.py
+++ b/swh/indexer/tests/test_origin_metadata.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2020 The Software Heritage developers
+# Copyright (C) 2018-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -6,6 +6,7 @@
import copy
from unittest.mock import patch
+import attr
import pytest
from swh.indexer.metadata import OriginMetadataIndexer
@@ -213,6 +214,58 @@
assert len(orig_results) == 2
+def test_origin_metadata_indexer_duplicate_directory_different_result(
+ swh_indexer_config,
+ idx_storage: IndexerStorageInterface,
+ storage: StorageInterface,
+ obj_storage,
+ mocker,
+) -> None:
+ """Same as above, but indexing the same directory twice resulted in different
+ data (because list order differs).
+ """
+ indexer = OriginMetadataIndexer(config=swh_indexer_config)
+ indexer.storage = storage
+ indexer.idx_storage = idx_storage
+ indexer.catch_exceptions = False
+ origin1 = "https://github.com/librariesio/yarn-parser"
+ origin2 = "https://github.com/librariesio/yarn-parser.git"
+
+ directory_index = indexer.directory_metadata_indexer.index
+
+ nb_calls = 0
+
+ def side_effect(dir_id):
+ nonlocal nb_calls
+ if nb_calls == 0:
+ keywords = ["foo", "bar"]
+ elif nb_calls == 1:
+ keywords = ["bar", "foo"]
+ else:
+ assert False, nb_calls
+ nb_calls += 1
+ return [
+ attr.evolve(row, metadata={**row.metadata, "keywords": keywords})
+ for row in directory_index(dir_id)
+ ]
+
+ mocker.patch.object(
+ indexer.directory_metadata_indexer, "index", side_effect=side_effect
+ )
+
+ indexer.run([origin1, origin2])
+
+ dir_id = DIRECTORY2.id
+
+ dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
+ assert len(dir_results) == 1
+
+ orig_results = list(
+ indexer.idx_storage.origin_intrinsic_metadata_get([origin1, origin2])
+ )
+ assert len(orig_results) == 2
+
+
def test_origin_metadata_indexer_no_metadata_file(
swh_indexer_config,
idx_storage: IndexerStorageInterface,

File Metadata

Mime Type
text/plain
Expires
Dec 19 2024, 11:16 PM (11 w, 3 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3228030

Event Timeline