Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7123743
D8417.id30384.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
4 KB
Subscribers
None
D8417.id30384.diff
View Options
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -535,25 +535,27 @@
results: List[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]],
) -> Dict[str, int]:
# Deduplicate directories
- dir_metadata: List[DirectoryIntrinsicMetadataRow] = []
- orig_metadata: List[OriginIntrinsicMetadataRow] = []
+ dir_metadata: Dict[bytes, DirectoryIntrinsicMetadataRow] = {}
+ orig_metadata: Dict[str, OriginIntrinsicMetadataRow] = {}
summary: Dict = {}
for (orig_item, dir_item) in results:
assert dir_item.metadata == orig_item.metadata
if dir_item.metadata and not (dir_item.metadata.keys() <= {"@context"}):
# Only store non-empty metadata sets
- if dir_item not in dir_metadata:
- dir_metadata.append(dir_item)
- if orig_item not in orig_metadata:
- orig_metadata.append(orig_item)
+ if dir_item.id not in dir_metadata:
+ dir_metadata[dir_item.id] = dir_item
+ if orig_item.id not in orig_metadata:
+ orig_metadata[orig_item.id] = orig_item
if dir_metadata:
summary_dir = self.idx_storage.directory_intrinsic_metadata_add(
- dir_metadata
+ list(dir_metadata.values())
)
summary.update(summary_dir)
if orig_metadata:
- summary_ori = self.idx_storage.origin_intrinsic_metadata_add(orig_metadata)
+ summary_ori = self.idx_storage.origin_intrinsic_metadata_add(
+ list(orig_metadata.values())
+ )
summary.update(summary_ori)
return summary
diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py
--- a/swh/indexer/tests/test_origin_metadata.py
+++ b/swh/indexer/tests/test_origin_metadata.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2020 The Software Heritage developers
+# Copyright (C) 2018-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -6,6 +6,7 @@
import copy
from unittest.mock import patch
+import attr
import pytest
from swh.indexer.metadata import OriginMetadataIndexer
@@ -213,6 +214,58 @@
assert len(orig_results) == 2
+def test_origin_metadata_indexer_duplicate_directory_different_result(
+ swh_indexer_config,
+ idx_storage: IndexerStorageInterface,
+ storage: StorageInterface,
+ obj_storage,
+ mocker,
+) -> None:
+ """Same as above, but indexing the same directory twice resulted in different
+ data (because list order differs).
+ """
+ indexer = OriginMetadataIndexer(config=swh_indexer_config)
+ indexer.storage = storage
+ indexer.idx_storage = idx_storage
+ indexer.catch_exceptions = False
+ origin1 = "https://github.com/librariesio/yarn-parser"
+ origin2 = "https://github.com/librariesio/yarn-parser.git"
+
+ directory_index = indexer.directory_metadata_indexer.index
+
+ nb_calls = 0
+
+ def side_effect(dir_id):
+ nonlocal nb_calls
+ if nb_calls == 0:
+ keywords = ["foo", "bar"]
+ elif nb_calls == 1:
+ keywords = ["bar", "foo"]
+ else:
+ assert False, nb_calls
+ nb_calls += 1
+ return [
+ attr.evolve(row, metadata={**row.metadata, "keywords": keywords})
+ for row in directory_index(dir_id)
+ ]
+
+ mocker.patch.object(
+ indexer.directory_metadata_indexer, "index", side_effect=side_effect
+ )
+
+ indexer.run([origin1, origin2])
+
+ dir_id = DIRECTORY2.id
+
+ dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id]))
+ assert len(dir_results) == 1
+
+ orig_results = list(
+ indexer.idx_storage.origin_intrinsic_metadata_get([origin1, origin2])
+ )
+ assert len(orig_results) == 2
+
+
def test_origin_metadata_indexer_no_metadata_file(
swh_indexer_config,
idx_storage: IndexerStorageInterface,
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Dec 19 2024, 11:16 PM (11 w, 3 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3228030
Attached To
D8417: Fix crash when indexing the same directory twice with non-deterministic order
Event Timeline
Log In to Comment