diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -13,6 +13,7 @@
 from swh.storage.common import db_transaction_generator, db_transaction
 from swh.storage.exc import StorageDBError
 from .db import Db
+from ..metadata_dictionary import MAPPINGS
 from . import converters
 
 
@@ -698,6 +699,57 @@
             yield converters.db_to_metadata(
                 dict(zip(db.origin_intrinsic_metadata_cols, c)))
 
+    @remote_api_endpoint('origin_intrinsic_metadata/stats')
+    @db_transaction()
+    def origin_intrinsic_metadata_stats(
+            self, db=None, cur=None):
+        """Returns counts of indexed metadata per origins, broken down
+        into metadata types.
+
+        Every count is 0 when no origin has been indexed yet.
+
+        Returns:
+            dict: dictionary with keys:
+
+            - total (int): total number of origins that were indexed
+              (possibly yielding an empty metadata dictionary)
+            - non_empty (int): total number of origins that we extracted
+              a non-empty metadata dictionary from
+            - per_mapping (dict): a dictionary with mapping names as
+              keys and number of origins whose indexing used this
+              mapping. Note that indexing a given origin may use
+              0, 1, or many mappings.
+        """
+        mapping_names = [m.name for m in MAPPINGS.values()]
+
+        # Count rows for each mapping. The mapping name is bound as a
+        # query parameter instead of being formatted into the SQL text.
+        # coalesce() is needed because sum() yields NULL, not 0, when
+        # the table is empty.
+        select_parts = [
+            "coalesce(sum(case when (mappings @> ARRAY[%s]) "
+            "                  then 1 else 0 end), 0)"
+        ] * len(mapping_names)
+
+        # Total
+        select_parts.append("count(*)")
+
+        # Rows whose metadata has at least one key that is not '@context'
+        select_parts.append(
+            "coalesce(sum(case when ('{}'::jsonb @> (metadata - '@context')) "
+            "                  then 0 else 1 end), 0)")
+        cur.execute(
+            'select ' + ', '.join(select_parts)
+            + ' from origin_intrinsic_metadata',
+            mapping_names)
+        results = dict(zip(mapping_names + ['total', 'non_empty'],
+                           cur.fetchone()))
+        return {
+            'total': results.pop('total'),
+            'non_empty': results.pop('non_empty'),
+            'per_mapping': results,
+        }
+
     @remote_api_endpoint('indexer_configuration/add')
     @db_transaction_generator()
     def indexer_configuration_add(self, tools, db=None, cur=None):
diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py
--- a/swh/indexer/storage/in_memory.py
+++ b/swh/indexer/storage/in_memory.py
@@ -11,6 +11,8 @@
 import math
 import re
 
+from ..metadata_dictionary import MAPPINGS
+
 SHA1_DIGEST_SIZE = 160
 
 
@@ -662,6 +664,38 @@
             result['origin_id'] = result.pop('id')
             yield result
 
+    def origin_intrinsic_metadata_stats(self):
+        """Returns statistics on stored intrinsic metadata.
+
+        Returns:
+            dict: dictionary with keys:
+
+            - total (int): total number of origins that were indexed
+              (possibly yielding an empty metadata dictionary)
+            - non_empty (int): total number of origins that we extracted
+              a non-empty metadata dictionary from
+            - per_mapping (dict): a dictionary with mapping names as
+              keys and number of origins whose indexing used this
+              mapping. Note that indexing a given origin may use
+              0, 1, or many mappings.
+        """
+        mapping_count = {m.name: 0 for m in MAPPINGS.values()}
+        total = non_empty = 0
+        for data in self._origin_intrinsic_metadata.get_all():
+            total += 1
+            if set(data['metadata']) - {'@context'}:
+                non_empty += 1
+            # Like the PostgreSQL backend: an origin counts at most once
+            # per mapping, and only known mappings are reported.
+            for mapping in set(data['mappings']):
+                if mapping in mapping_count:
+                    mapping_count[mapping] += 1
+        return {
+            'per_mapping': mapping_count,
+            'total': total,
+            'non_empty': non_empty
+        }
+
     def indexer_configuration_add(self, tools):
         """Add new tools to the storage.
 
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
--- a/swh/indexer/tests/storage/test_storage.py
+++ b/swh/indexer/tests/storage/test_storage.py
@@ -124,8 +124,11 @@
             '7026b7c1a2af56521e951c01ed20f255fa054238')
         self.revision_id_2 = hash_to_bytes(
             '7026b7c1a2af56521e9587659012345678904321')
+        self.revision_id_3 = hash_to_bytes(
+            '7026b7c1a2af56521e9587659012345678904320')
         self.origin_id_1 = 54974445
         self.origin_id_2 = 44434342
+        self.origin_id_3 = 44434341
 
     def test_check_config(self):
         self.assertTrue(self.storage.check_config(check_write=True))
@@ -1739,6 +1742,83 @@
             [res['origin_id'] for res in search(['John', 'Jane'])],
             [self.origin_id_1])
 
+    def test_origin_intrinsic_metadata_stats(self):
+        # given
+        tool_id = self.tools['swh-metadata-detector']['id']
+
+        metadata1 = {
+            '@context': 'foo',
+            'author': 'John Doe',
+        }
+        metadata1_rev = {
+            'id': self.revision_id_1,
+            'translated_metadata': metadata1,
+            'mappings': ['npm'],
+            'indexer_configuration_id': tool_id,
+        }
+        metadata1_origin = {
+            'origin_id': self.origin_id_1,
+            'metadata': metadata1,
+            'mappings': ['npm'],
+            'indexer_configuration_id': tool_id,
+            'from_revision': self.revision_id_1,
+        }
+        metadata2 = {
+            '@context': 'foo',
+            'author': 'Jane Doe',
+        }
+        metadata2_rev = {
+            'id': self.revision_id_2,
+            'translated_metadata': metadata2,
+            'mappings': ['npm', 'gemspec'],
+            'indexer_configuration_id': tool_id,
+        }
+        metadata2_origin = {
+            'origin_id': self.origin_id_2,
+            'metadata': metadata2,
+            'mappings': ['npm', 'gemspec'],
+            'indexer_configuration_id': tool_id,
+            'from_revision': self.revision_id_2,
+        }
+        metadata3 = {
+            '@context': 'foo',
+        }
+        metadata3_rev = {
+            'id': self.revision_id_3,
+            'translated_metadata': metadata3,
+            'mappings': ['pkg-info'],
+            'indexer_configuration_id': tool_id,
+        }
+        metadata3_origin = {
+            'origin_id': self.origin_id_3,
+            'metadata': metadata3,
+            'mappings': ['pkg-info'],
+            'indexer_configuration_id': tool_id,
+            'from_revision': self.revision_id_3,
+        }
+
+        # when
+        self.storage.revision_metadata_add([metadata1_rev])
+        self.storage.origin_intrinsic_metadata_add([metadata1_origin])
+        self.storage.revision_metadata_add([metadata2_rev])
+        self.storage.origin_intrinsic_metadata_add([metadata2_origin])
+        self.storage.revision_metadata_add([metadata3_rev])
+        self.storage.origin_intrinsic_metadata_add([metadata3_origin])
+
+        # then
+        result = self.storage.origin_intrinsic_metadata_stats()
+        self.assertEqual(result, {
+            'per_mapping': {
+                'gemspec': 1,
+                'npm': 2,
+                'pkg-info': 1,
+                'codemeta': 0,
+                'maven': 0,
+            },
+            'total': 3,
+            'non_empty': 2,
+        })
+
     def test_indexer_configuration_add(self):
         tool = {
             'tool_name': 'some-unknown-tool',