diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -505,8 +505,7 @@
                 the results
 
         Yields:
-            an iterable of missing id for the tuple (id,
-            indexer_configuration_id)
+            missing sha1s
 
         """
         for obj in db.content_metadata_missing_from_list(metadata, cur):
@@ -521,7 +520,7 @@
             ids (iterable): sha1 checksums
 
         Yields:
-            list: dictionaries with the following keys:
+            dictionaries with the following keys:
 
                 id (bytes)
                 translated_metadata (str): associated metadata
@@ -567,8 +566,8 @@
             - **indexer_configuration_id** (int): tool used to compute
               the results
 
-        Returns:
-            iterable: missing ids
+        Yields:
+            missing ids
 
         """
         for obj in db.revision_metadata_missing_from_list(metadata, cur):
@@ -583,7 +582,7 @@
             ids (iterable): sha1 checksums
 
         Yields:
-            list: dictionaries with the following keys:
+            dictionaries with the following keys:
 
             - **id** (bytes)
             - **translated_metadata** (str): associated metadata
diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py
--- a/swh/indexer/storage/in_memory.py
+++ b/swh/indexer/storage/in_memory.py
@@ -3,14 +3,201 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+from collections import defaultdict
 import json
 
+
+class MetadataStorage:
+    """Implements missing/get/add logic for both content_metadata and
+    revision_metadata."""
+    def __init__(self, tools):
+        self._tools = tools
+        self._metadata = {}  # map (id_, tool_id) -> metadata_dict
+        self._tools_per_id = defaultdict(set)  # map id_ -> Set[tool_id]
+
+    def _transform_tool(self, tool):
+        return {
+            'id': tool['id'],
+            'name': tool['tool_name'],
+            'version': tool['tool_version'],
+            'configuration': tool['tool_configuration'],
+        }
+
+    def missing(self, ids):
+        """List metadata missing from storage.
+
+        Args:
+            ids (iterable): dictionaries with keys:
+
+            - **id** (bytes): sha1 identifier
+            - **indexer_configuration_id** (int): tool used to compute
+              the results
+
+        Yields:
+            missing sha1s
+
+        """
+        for id_ in ids:
+            tool_id = id_['indexer_configuration_id']
+            id_ = id_['id']
+            if tool_id not in self._tools_per_id.get(id_, set()):
+                yield id_
+
+    def get(self, ids):
+        """Retrieve metadata per id.
+
+        Args:
+            ids (iterable): sha1 checksums
+
+        Yields:
+            dictionaries with the following keys:
+
+            id (bytes)
+            translated_metadata (str): associated metadata
+            tool (dict): tool used to compute metadata
+
+        """
+        for id_ in ids:
+            for tool_id in self._tools_per_id.get(id_, set()):
+                key = (id_, tool_id)
+                yield {
+                    'id': id_,
+                    'tool': self._transform_tool(self._tools[tool_id]),
+                    'translated_metadata': self._metadata[key],
+                }
+
+    def add(self, metadata, conflict_update):
+        """Add metadata not present in storage.
+
+        Args:
+            metadata (iterable): dictionaries with keys:
+
+            - **id**: sha1
+            - **translated_metadata**: arbitrary dict
+            - **indexer_configuration_id**: tool used to compute the
+              results
+
+            conflict_update: Flag to determine if we want to overwrite (true)
+              or skip duplicates (false)
+
+        """
+        for item in metadata:
+            tool_id = item['indexer_configuration_id']
+            data = item['translated_metadata']
+            id_ = item['id']
+            if not conflict_update and \
+                    tool_id in self._tools_per_id.get(id_, set()):
+                # Duplicate, should not be updated
+                continue
+            key = (id_, tool_id)
+            self._metadata[key] = data
+            self._tools_per_id[id_].add(tool_id)
+
+
 class IndexerStorage:
     """In-memory SWH indexer storage."""
 
     def __init__(self):
         self._tools = {}
+        self._content_metadata = MetadataStorage(self._tools)
+        self._revision_metadata = MetadataStorage(self._tools)
+
+    def content_metadata_missing(self, metadata):
+        """List metadata missing from storage.
+
+        Args:
+            metadata (iterable): dictionaries with keys:
+
+            - **id** (bytes): sha1 identifier
+            - **indexer_configuration_id** (int): tool used to compute
+              the results
+
+        Yields:
+            missing sha1s
+
+        """
+        yield from self._content_metadata.missing(metadata)
+
+    def content_metadata_get(self, ids):
+        """Retrieve metadata per id.
+
+        Args:
+            ids (iterable): sha1 checksums
+
+        Yields:
+            dictionaries with the following keys:
+
+            id (bytes)
+            translated_metadata (str): associated metadata
+            tool (dict): tool used to compute metadata
+
+        """
+        yield from self._content_metadata.get(ids)
+
+    def content_metadata_add(self, metadata, conflict_update=False):
+        """Add metadata not present in storage.
+
+        Args:
+            metadata (iterable): dictionaries with keys:
+
+            - **id**: sha1
+            - **translated_metadata**: arbitrary dict
+            - **indexer_configuration_id**: tool used to compute the
+              results
+
+            conflict_update: Flag to determine if we want to overwrite (true)
+              or skip duplicates (false, the default)
+
+        """
+        self._content_metadata.add(metadata, conflict_update)
+
+    def revision_metadata_missing(self, metadata):
+        """List metadata missing from storage.
+
+        Args:
+            metadata (iterable): dictionaries with keys:
+
+            - **id** (bytes): sha1_git revision identifier
+            - **indexer_configuration_id** (int): tool used to compute
+              the results
+
+        Yields:
+            missing ids
+
+        """
+        yield from self._revision_metadata.missing(metadata)
+
+    def revision_metadata_get(self, ids):
+        """Retrieve revision metadata per id.
+
+        Args:
+            ids (iterable): sha1 checksums
+
+        Yields:
+            dictionaries with the following keys:
+
+            - **id** (bytes)
+            - **translated_metadata** (str): associated metadata
+            - **tool** (dict): tool used to compute metadata
+
+        """
+        yield from self._revision_metadata.get(ids)
+
+    def revision_metadata_add(self, metadata, conflict_update=False):
+        """Add metadata not present in storage.
+
+        Args:
+            metadata (iterable): dictionaries with keys:
+
+            - **id**: sha1_git of revision
+            - **translated_metadata**: arbitrary dict
+            - **indexer_configuration_id**: tool used to compute metadata
+
+            conflict_update: Flag to determine if we want to overwrite (true)
+              or skip duplicates (false, the default)
+
+        """
+        self._revision_metadata.add(metadata, conflict_update)
 
     def indexer_configuration_add(self, tools):
         """Add new tools to the storage.
@@ -33,12 +220,13 @@
         inserted = []
         for tool in tools:
             tool = tool.copy()
-            tool['id'] = self._tool_key(tool)
-            self._tools[tool['id']] = tool
+            id_ = self._tool_key(tool)
+            tool['id'] = id_
+            self._tools[id_] = tool
             inserted.append(tool)
         return inserted
 
-    def indexer_configuration_get(self, tool, db=None, cur=None):
+    def indexer_configuration_get(self, tool):
         """Retrieve tool information.
 
         Args:
diff --git a/swh/indexer/tests/storage/test_in_memory.py b/swh/indexer/tests/storage/test_in_memory.py
--- a/swh/indexer/tests/storage/test_in_memory.py
+++ b/swh/indexer/tests/storage/test_in_memory.py
@@ -86,38 +86,6 @@
         pass
 
     @pytest.mark.xfail
-    def test_content_metadata_missing(self):
-        pass
-
-    @pytest.mark.xfail
-    def test_content_metadata_get(self):
-        pass
-
-    @pytest.mark.xfail
-    def test_content_metadata_add_drop_duplicate(self):
-        pass
-
-    @pytest.mark.xfail
-    def test_content_metadata_add_update_in_place_duplicate(self):
-        pass
-
-    @pytest.mark.xfail
-    def test_revision_metadata_missing(self):
-        pass
-
-    @pytest.mark.xfail
-    def test_revision_metadata_get(self):
-        pass
-
-    @pytest.mark.xfail
-    def test_revision_metadata_add_drop_duplicate(self):
-        pass
-
-    @pytest.mark.xfail
-    def test_revision_metadata_add_update_in_place_duplicate(self):
-        pass
-
-    @pytest.mark.xfail
     def test_origin_intrinsic_metadata_get(self):
         pass
 
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
--- a/swh/indexer/tests/storage/test_storage.py
+++ b/swh/indexer/tests/storage/test_storage.py
@@ -85,15 +85,6 @@
             },
         }
 
-        self.sha1_1 = hash_to_bytes('34973274ccef6ab4dfaaf86599792fa9c3fe4689')
-        self.sha1_2 = hash_to_bytes('61c2b3a30496d329e21af70dd2d7e097046d07b7')
-        self.revision_id_1 = hash_to_bytes(
-            '7026b7c1a2af56521e951c01ed20f255fa054238')
-        self.revision_id_2 = hash_to_bytes(
-            '7026b7c1a2af56521e9587659012345678904321')
-        self.origin_id_1 = 54974445
-        self.origin_id_2 = 44434342
-
     def tearDown(self):
         self.reset_storage_tables()
         self.storage = None
@@ -127,6 +118,15 @@
             'configuration': tool['tool_configuration'],
         }
 
+        self.sha1_1 = hash_to_bytes('34973274ccef6ab4dfaaf86599792fa9c3fe4689')
+        self.sha1_2 = hash_to_bytes('61c2b3a30496d329e21af70dd2d7e097046d07b7')
+        self.revision_id_1 = hash_to_bytes(
+            '7026b7c1a2af56521e951c01ed20f255fa054238')
+        self.revision_id_2 = hash_to_bytes(
+            '7026b7c1a2af56521e9587659012345678904321')
+        self.origin_id_1 = 54974445
+        self.origin_id_2 = 44434342
+
     def test_check_config(self):
         self.assertTrue(self.storage.check_config(check_write=True))
         self.assertTrue(self.storage.check_config(check_write=False))