diff --git a/sql/upgrades/123.sql b/sql/upgrades/123.sql new file mode 100644 --- /dev/null +++ b/sql/upgrades/123.sql @@ -0,0 +1,12 @@ +-- SWH Indexer DB schema upgrade +-- from_version: 122 +-- to_version: 123 +-- description: fix heterogeneity of names in metadata tables + +insert into dbversion(version, release, description) +values(123, now(), 'Work In Progress'); + +alter table revision_metadata rename column translated_metadata to metadata; +alter table origin_intrinsic_metadata rename column origin_id to id; + +alter table revision_metadata rename to revision_intrinsic_metadata; diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -89,8 +89,8 @@ This indexer is in charge of: - - filtering revisions already indexed in revision_metadata table with - defined computation tool + - filtering revisions already indexed in revision_intrinsic_metadata table + with defined computation tool - retrieve all entry_files in root directory - use metadata_detector for file_names containing metadata - compute metadata translation if necessary and possible (depends on tool) @@ -111,7 +111,7 @@ """Filter out known sha1s and return only missing ones. """ - yield from self.idx_storage.revision_metadata_missing(( + yield from self.idx_storage.revision_intrinsic_metadata_missing(( { 'id': sha1_git, 'indexer_configuration_id': self.tool['id'], @@ -130,18 +130,19 @@ rev (dict): revision artifact from storage Returns: - dict: dictionary representing a revision_metadata, with keys: + dict: dictionary representing a revision_intrinsic_metadata, with + keys: - id (str): rev's identifier (sha1_git) - indexer_configuration_id (bytes): tool used - - translated_metadata: dict of retrieved metadata + - metadata: dict of retrieved metadata """ result = { 'id': rev['id'], 'indexer_configuration_id': self.tool['id'], 'mappings': None, - 'translated_metadata': None + 'metadata': None } try: @@ -149,11 +150,11 @@ dir_ls = self.storage.directory_ls(root_dir, recursive=False) files = [entry for entry in dir_ls if entry['type'] == 'file'] detected_files = detect_metadata(files) - (mappings, metadata) = self.translate_revision_metadata( + (mappings, metadata) = self.translate_revision_intrinsic_metadata( detected_files, log_suffix='revision=%s' % hashutil.hash_to_hex(rev['id'])) result['mappings'] = mappings - result['translated_metadata'] = metadata + result['metadata'] = metadata except Exception as e: self.log.exception( 'Problem when indexing rev: %r', e) @@ -172,11 +173,13 @@ respectively update duplicates or ignore them """ - # TODO: add functions in storage to keep data in revision_metadata - self.idx_storage.revision_metadata_add( + # TODO: add functions in storage to keep data in + # revision_intrinsic_metadata + self.idx_storage.revision_intrinsic_metadata_add( results, conflict_update=(policy_update == 'update-dups')) - def translate_revision_metadata(self, detected_files, log_suffix): + def translate_revision_intrinsic_metadata( + self, detected_files, log_suffix): """ Determine plan of action to translate metadata when containing one or multiple detected files: @@ -191,7 +194,7 @@ """ used_mappings = [MAPPINGS[context].name for context in detected_files] - translated_metadata = [] + metadata = [] tool = { 'name': 'swh-metadata-translator', 'version': '0.0.2', @@ -215,13 +218,13 @@ metadata_generator = self.idx_storage.content_metadata_get( detected_files[context]) for c in metadata_generator: - # extracting translated_metadata + # extracting 
metadata sha1 = c['id'] sha1s_in_storage.append(sha1) local_metadata = c['translated_metadata'] # local metadata is aggregated if local_metadata: - translated_metadata.append(local_metadata) + metadata.append(local_metadata) sha1s_filtered = [item for item in detected_files[context] if item not in sha1s_in_storage] @@ -235,14 +238,14 @@ # on the fly possibility: for result in c_metadata_indexer.results: local_metadata = result['translated_metadata'] - translated_metadata.append(local_metadata) + metadata.append(local_metadata) except Exception: self.log.exception( "Exception while indexing metadata on contents") - # transform translated_metadata into min set with swh-metadata-detector - min_metadata = extract_minimal_metadata_dict(translated_metadata) + # transform metadata into min set with swh-metadata-detector + min_metadata = extract_minimal_metadata_dict(metadata) return (used_mappings, min_metadata) @@ -278,8 +281,8 @@ rev_metadata = self.revision_metadata_indexer.index(rev) orig_metadata = { 'from_revision': rev_metadata['id'], - 'origin_id': origin['id'], - 'metadata': rev_metadata['translated_metadata'], + 'id': origin['id'], + 'metadata': rev_metadata['metadata'], 'mappings': rev_metadata['mappings'], 'indexer_configuration_id': rev_metadata['indexer_configuration_id'], @@ -311,7 +314,7 @@ origs_to_delete.append(orig_item) if rev_metadata: - self.idx_storage.revision_metadata_add( + self.idx_storage.revision_intrinsic_metadata_add( rev_metadata, conflict_update=conflict_update) if orig_metadata: self.idx_storage.origin_intrinsic_metadata_add( @@ -324,4 +327,4 @@ if origs_to_delete: self.idx_storage.origin_intrinsic_metadata_delete(origs_to_delete) if revs_to_delete: - self.idx_storage.revision_metadata_delete(revs_to_delete) + self.idx_storage.revision_intrinsic_metadata_delete(revs_to_delete) diff --git a/swh/indexer/sql/30-swh-schema.sql b/swh/indexer/sql/30-swh-schema.sql --- a/swh/indexer/sql/30-swh-schema.sql +++ b/swh/indexer/sql/30-swh-schema.sql @@ -112,24 +112,24 @@ comment on column content_metadata.translated_metadata is 'result of translation with defined format'; comment on column content_metadata.indexer_configuration_id is 'tool used for translation'; --- The table revision_metadata provides a minimal set of intrinsic metadata --- detected with the detection tool (indexer_configuration_id) and aggregated --- from the content_metadata translation. -create table revision_metadata( +-- The table revision_intrinsic_metadata provides a minimal set of intrinsic +-- metadata detected with the detection tool (indexer_configuration_id) and +-- aggregated from the content_metadata translation. +create table revision_intrinsic_metadata( id sha1_git not null, - translated_metadata jsonb not null, + metadata jsonb not null, indexer_configuration_id bigint not null, mappings text array not null ); -comment on table revision_metadata is 'metadata semantically detected and translated in a revision'; -comment on column revision_metadata.id is 'sha1_git of revision'; -comment on column revision_metadata.translated_metadata is 'result of detection and translation with defined format'; -comment on column revision_metadata.indexer_configuration_id is 'tool used for detection'; -comment on column revision_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. 
pkg-info, npm)'; +comment on table revision_intrinsic_metadata is 'metadata semantically detected and translated in a revision'; +comment on column revision_intrinsic_metadata.id is 'sha1_git of revision'; +comment on column revision_intrinsic_metadata.metadata is 'result of detection and translation with defined format'; +comment on column revision_intrinsic_metadata.indexer_configuration_id is 'tool used for detection'; +comment on column revision_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)'; create table origin_intrinsic_metadata( - origin_id bigserial not null, + id bigserial not null, metadata jsonb, indexer_configuration_id bigint not null, from_revision sha1_git not null, @@ -138,7 +138,7 @@ ); comment on table origin_intrinsic_metadata is 'keeps intrinsic metadata for an origin'; -comment on column origin_intrinsic_metadata.origin_id is 'the entry id in origin'; +comment on column origin_intrinsic_metadata.id is 'the entry id in origin'; comment on column origin_intrinsic_metadata.metadata is 'metadata extracted from a revision'; comment on column origin_intrinsic_metadata.indexer_configuration_id is 'tool used to generate this metadata'; comment on column origin_intrinsic_metadata.from_revision is 'sha1 of the revision this metadata was copied from.'; diff --git a/swh/indexer/sql/40-swh-func.sql b/swh/indexer/sql/40-swh-func.sql --- a/swh/indexer/sql/40-swh-func.sql +++ b/swh/indexer/sql/40-swh-func.sql @@ -300,33 +300,34 @@ -- end content_metadata functions --- add tmp_revision_metadata entries to revision_metadata, overwriting --- duplicates if conflict_update is true, skipping duplicates otherwise. +-- add tmp_revision_intrinsic_metadata entries to revision_intrinsic_metadata, +-- overwriting duplicates if conflict_update is true, skipping duplicates +-- otherwise. -- -- If filtering duplicates is in order, the call to --- swh_revision_metadata_missing must take place before calling this +-- swh_revision_intrinsic_metadata_missing must take place before calling this -- function. -- -- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to --- tmp_revision_metadata, 2. call this function -create or replace function swh_revision_metadata_add(conflict_update boolean) +-- tmp_revision_intrinsic_metadata, 2. 
call this function +create or replace function swh_revision_intrinsic_metadata_add(conflict_update boolean) returns void language plpgsql as $$ begin if conflict_update then - insert into revision_metadata (id, translated_metadata, mappings, indexer_configuration_id) - select id, translated_metadata, mappings, indexer_configuration_id - from tmp_revision_metadata tcm + insert into revision_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id) + select id, metadata, mappings, indexer_configuration_id + from tmp_revision_intrinsic_metadata tcm on conflict(id, indexer_configuration_id) do update set - translated_metadata = excluded.translated_metadata, + metadata = excluded.metadata, mappings = excluded.mappings; else - insert into revision_metadata (id, translated_metadata, mappings, indexer_configuration_id) - select id, translated_metadata, mappings, indexer_configuration_id - from tmp_revision_metadata tcm + insert into revision_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id) + select id, metadata, mappings, indexer_configuration_id + from tmp_revision_intrinsic_metadata tcm on conflict(id, indexer_configuration_id) do nothing; end if; @@ -334,19 +335,19 @@ end $$; -comment on function swh_revision_metadata_add(boolean) IS 'Add new revision metadata'; +comment on function swh_revision_intrinsic_metadata_add(boolean) IS 'Add new revision intrinsic metadata'; --- create a temporary table for retrieving revision_metadata -create or replace function swh_mktemp_revision_metadata() +-- create a temporary table for retrieving revision_intrinsic_metadata +create or replace function swh_mktemp_revision_intrinsic_metadata() returns void language sql as $$ - create temporary table tmp_revision_metadata ( - like revision_metadata including defaults + create temporary table tmp_revision_intrinsic_metadata ( + like revision_intrinsic_metadata including defaults ) on commit drop; $$; -comment on function swh_mktemp_revision_metadata() is 'Helper table to add revision metadata'; +comment on function swh_mktemp_revision_intrinsic_metadata() is 'Helper table to add revision intrinsic metadata'; -- create a temporary table for retrieving origin_intrinsic_metadata create or replace function swh_mktemp_origin_intrinsic_metadata() @@ -412,21 +413,21 @@ begin perform swh_origin_intrinsic_metadata_compute_tsvector(); if conflict_update then - insert into origin_intrinsic_metadata (origin_id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) - select origin_id, metadata, indexer_configuration_id, from_revision, + insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) + select id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings from tmp_origin_intrinsic_metadata - on conflict(origin_id, indexer_configuration_id) + on conflict(id, indexer_configuration_id) do update set metadata = excluded.metadata, mappings = excluded.mappings; else - insert into origin_intrinsic_metadata (origin_id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) - select origin_id, metadata, indexer_configuration_id, from_revision, + insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) + select id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings from tmp_origin_intrinsic_metadata - on conflict(origin_id, indexer_configuration_id) + on conflict(id, 
indexer_configuration_id) do nothing; end if; return; diff --git a/swh/indexer/sql/60-swh-indexes.sql b/swh/indexer/sql/60-swh-indexes.sql --- a/swh/indexer/sql/60-swh-indexes.sql +++ b/swh/indexer/sql/60-swh-indexes.sql @@ -25,12 +25,12 @@ alter table content_metadata add constraint content_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; alter table content_metadata validate constraint content_metadata_indexer_configuration_id_fkey; --- revision_metadata -create unique index revision_metadata_pkey on revision_metadata(id, indexer_configuration_id); -alter table revision_metadata add primary key using index revision_metadata_pkey; +-- revision_intrinsic_metadata +create unique index revision_intrinsic_metadata_pkey on revision_intrinsic_metadata(id, indexer_configuration_id); +alter table revision_intrinsic_metadata add primary key using index revision_intrinsic_metadata_pkey; -alter table revision_metadata add constraint revision_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; -alter table revision_metadata validate constraint revision_metadata_indexer_configuration_id_fkey; +alter table revision_intrinsic_metadata add constraint revision_intrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; +alter table revision_intrinsic_metadata validate constraint revision_intrinsic_metadata_indexer_configuration_id_fkey; -- content_mimetype create unique index content_mimetype_pkey on content_mimetype(id, indexer_configuration_id); @@ -57,12 +57,12 @@ alter table content_fossology_license validate constraint content_fossology_license_indexer_configuration_id_fkey; -- origin_intrinsic_metadata -create unique index origin_intrinsic_metadata_pkey on origin_intrinsic_metadata(origin_id, indexer_configuration_id); +create unique index origin_intrinsic_metadata_pkey on origin_intrinsic_metadata(id, indexer_configuration_id); alter table origin_intrinsic_metadata add primary key using index origin_intrinsic_metadata_pkey; alter table origin_intrinsic_metadata add constraint origin_intrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; alter table origin_intrinsic_metadata validate constraint origin_intrinsic_metadata_indexer_configuration_id_fkey; -alter table origin_intrinsic_metadata add constraint origin_intrinsic_metadata_revision_metadata_fkey foreign key (from_revision, indexer_configuration_id) references revision_metadata(id, indexer_configuration_id) not valid; +alter table origin_intrinsic_metadata add constraint origin_intrinsic_metadata_revision_metadata_fkey foreign key (from_revision, indexer_configuration_id) references revision_intrinsic_metadata(id, indexer_configuration_id) not valid; alter table origin_intrinsic_metadata validate constraint origin_intrinsic_metadata_revision_metadata_fkey; create index origin_intrinsic_metadata_fulltext_idx on origin_intrinsic_metadata using gin (metadata_tsvector); diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py --- a/swh/indexer/storage/__init__.py +++ b/swh/indexer/storage/__init__.py @@ -590,9 +590,9 @@ cur) db.content_metadata_add_from_temp(conflict_update, cur) - @remote_api_endpoint('revision_metadata/missing') + @remote_api_endpoint('revision_intrinsic_metadata/missing') @db_transaction_generator() - def 
revision_metadata_missing(self, metadata, db=None, cur=None): + def revision_intrinsic_metadata_missing(self, metadata, db=None, cur=None): """List metadata missing from storage. Args: @@ -606,12 +606,13 @@ missing ids """ - for obj in db.revision_metadata_missing_from_list(metadata, cur): + for obj in db.revision_intrinsic_metadata_missing_from_list( + metadata, cur): yield obj[0] - @remote_api_endpoint('revision_metadata') + @remote_api_endpoint('revision_intrinsic_metadata') @db_transaction_generator() - def revision_metadata_get(self, ids, db=None, cur=None): + def revision_intrinsic_metadata_get(self, ids, db=None, cur=None): """Retrieve revision metadata per id. Args: @@ -621,27 +622,27 @@ dictionaries with the following keys: - **id** (bytes) - - **translated_metadata** (str): associated metadata + - **metadata** (str): associated metadata - **tool** (dict): tool used to compute metadata - **mappings** (List[str]): list of mappings used to translate these metadata """ - for c in db.revision_metadata_get_from_list(ids, cur): + for c in db.revision_intrinsic_metadata_get_from_list(ids, cur): yield converters.db_to_metadata( - dict(zip(db.revision_metadata_cols, c))) + dict(zip(db.revision_intrinsic_metadata_cols, c))) - @remote_api_endpoint('revision_metadata/add') + @remote_api_endpoint('revision_intrinsic_metadata/add') @db_transaction() - def revision_metadata_add(self, metadata, conflict_update=False, db=None, - cur=None): + def revision_intrinsic_metadata_add(self, metadata, conflict_update=False, + db=None, cur=None): """Add metadata not present in storage. Args: metadata (iterable): dictionaries with keys: - **id**: sha1_git of revision - - **translated_metadata**: arbitrary dict + - **metadata**: arbitrary dict - **indexer_configuration_id**: tool used to compute metadata - **mappings** (List[str]): list of mappings used to translate these metadata @@ -652,17 +653,17 @@ """ _check_duplicates(metadata, 'id') - db.mktemp_revision_metadata(cur) + db.mktemp_revision_intrinsic_metadata(cur) - db.copy_to(metadata, 'tmp_revision_metadata', - ['id', 'translated_metadata', 'mappings', + db.copy_to(metadata, 'tmp_revision_intrinsic_metadata', + ['id', 'metadata', 'mappings', 'indexer_configuration_id'], cur) - db.revision_metadata_add_from_temp(conflict_update, cur) + db.revision_intrinsic_metadata_add_from_temp(conflict_update, cur) - @remote_api_endpoint('revision_metadata/delete') + @remote_api_endpoint('revision_intrinsic_metadata/delete') @db_transaction() - def revision_metadata_delete(self, entries, db=None, cur=None): + def revision_intrinsic_metadata_delete(self, entries, db=None, cur=None): """Remove revision metadata from the storage. Args: @@ -671,7 +672,7 @@ - **indexer_configuration_id** (int): tool used to compute metadata """ - db.revision_metadata_delete(entries, cur) + db.revision_intrinsic_metadata_delete(entries, cur) @remote_api_endpoint('origin_intrinsic_metadata') @db_transaction_generator() @@ -684,7 +685,7 @@ Yields: list: dictionaries with the following keys: - - **origin_id** (int) + - **id** (int) - **metadata** (str): associated metadata - **tool** (dict): tool used to compute metadata - **mappings** (List[str]): list of mappings used to translate @@ -705,7 +706,7 @@ Args: metadata (iterable): dictionaries with keys: - - **origin_id**: origin identifier + - **id**: origin identifier - **from_revision**: sha1 id of the revision used to generate these metadata. 
- **metadata**: arbitrary dict @@ -717,12 +718,12 @@ or skip duplicates (false, the default) """ - _check_duplicates(metadata, 'origin_id') + _check_duplicates(metadata, 'id') db.mktemp_origin_intrinsic_metadata(cur) db.copy_to(metadata, 'tmp_origin_intrinsic_metadata', - ['origin_id', 'metadata', 'indexer_configuration_id', + ['id', 'metadata', 'indexer_configuration_id', 'from_revision', 'mappings'], cur) db.origin_intrinsic_metadata_add_from_temp(conflict_update, cur) diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py --- a/swh/indexer/storage/db.py +++ b/swh/indexer/storage/db.py @@ -295,44 +295,48 @@ yield from self._get_from_list( 'content_metadata', ids, self.content_metadata_cols, cur=cur) - revision_metadata_hash_keys = [ + revision_intrinsic_metadata_hash_keys = [ 'id', 'indexer_configuration_id'] - def revision_metadata_missing_from_list(self, metadata, cur=None): + def revision_intrinsic_metadata_missing_from_list( + self, metadata, cur=None): """List missing metadata. """ yield from self._missing_from_list( - 'revision_metadata', metadata, self.revision_metadata_hash_keys, - cur=cur) + 'revision_intrinsic_metadata', metadata, + self.revision_intrinsic_metadata_hash_keys, cur=cur) - revision_metadata_cols = [ - 'id', 'translated_metadata', 'mappings', + revision_intrinsic_metadata_cols = [ + 'id', 'metadata', 'mappings', 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] - @stored_procedure('swh_mktemp_revision_metadata') - def mktemp_revision_metadata(self, cur=None): pass + @stored_procedure('swh_mktemp_revision_intrinsic_metadata') + def mktemp_revision_intrinsic_metadata(self, cur=None): pass - def revision_metadata_add_from_temp(self, conflict_update, cur=None): - self._cursor(cur).execute("SELECT swh_revision_metadata_add(%s)", - (conflict_update, )) + def revision_intrinsic_metadata_add_from_temp( + self, conflict_update, cur=None): + self._cursor(cur).execute( + "SELECT swh_revision_intrinsic_metadata_add(%s)", + (conflict_update, )) - def revision_metadata_delete( + def revision_intrinsic_metadata_delete( self, entries, cur=None): cur = self._cursor(cur) cur.execute( - "DELETE from revision_metadata " + "DELETE from revision_intrinsic_metadata " "WHERE (id, indexer_configuration_id) IN " " (VALUES %s)" % (', '.join('%s' for _ in entries)), tuple((e['id'], e['indexer_configuration_id']) for e in entries),) - def revision_metadata_get_from_list(self, ids, cur=None): + def revision_intrinsic_metadata_get_from_list(self, ids, cur=None): yield from self._get_from_list( - 'revision_metadata', ids, self.revision_metadata_cols, cur=cur) + 'revision_intrinsic_metadata', ids, + self.revision_intrinsic_metadata_cols, cur=cur) origin_intrinsic_metadata_cols = [ - 'origin_id', 'metadata', 'from_revision', 'mappings', + 'id', 'metadata', 'from_revision', 'mappings', 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] origin_intrinsic_metadata_regconfig = 'pg_catalog.simple' @@ -357,16 +361,16 @@ cur = self._cursor(cur) cur.execute( "DELETE from origin_intrinsic_metadata " - "WHERE (origin_id, indexer_configuration_id) IN" + "WHERE (id, indexer_configuration_id) IN" " (VALUES %s)" % (', '.join('%s' for _ in entries)), - tuple((e['origin_id'], e['indexer_configuration_id']) + tuple((e['id'], e['indexer_configuration_id']) for e in entries),) def origin_intrinsic_metadata_get_from_list(self, orig_ids, cur=None): yield from self._get_from_list( 'origin_intrinsic_metadata', orig_ids, self.origin_intrinsic_metadata_cols, cur=cur, - id_col='origin_id') 
+ id_col='id') def origin_intrinsic_metadata_search_fulltext(self, terms, *, limit, cur): regconfig = self.origin_intrinsic_metadata_regconfig @@ -390,7 +394,7 @@ def origin_intrinsic_metadata_search_by_producer( self, start, end, limit, ids_only, mappings, tool_ids, cur): if ids_only: - keys = 'oim.origin_id' + keys = 'oim.id' else: keys = ', '.join(map(self._convert_key, self.origin_intrinsic_metadata_cols)) @@ -404,10 +408,10 @@ where = [] if start: - where.append('oim.origin_id >= %s') + where.append('oim.id >= %s') args.append(start) if end: - where.append('oim.origin_id <= %s') + where.append('oim.id <= %s') args.append(end) if mappings is not None: where.append('oim.mappings && %s') diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py --- a/swh/indexer/storage/in_memory.py +++ b/swh/indexer/storage/in_memory.py @@ -196,7 +196,7 @@ self._content_ctags = SubStorage(self._tools) self._licenses = SubStorage(self._tools) self._content_metadata = SubStorage(self._tools) - self._revision_metadata = SubStorage(self._tools) + self._revision_intrinsic_metadata = SubStorage(self._tools) self._origin_intrinsic_metadata = SubStorage(self._tools) def content_mimetype_missing(self, mimetypes): @@ -537,7 +537,7 @@ raise TypeError('identifiers must be bytes.') self._content_metadata.add(metadata, conflict_update) - def revision_metadata_missing(self, metadata): + def revision_intrinsic_metadata_missing(self, metadata): """List metadata missing from storage. Args: @@ -551,9 +551,9 @@ missing ids """ - yield from self._revision_metadata.missing(metadata) + yield from self._revision_intrinsic_metadata.missing(metadata) - def revision_metadata_get(self, ids): + def revision_intrinsic_metadata_get(self, ids): """Retrieve revision metadata per id. Args: @@ -563,22 +563,22 @@ dictionaries with the following keys: - **id** (bytes) - - **translated_metadata** (str): associated metadata + - **metadata** (str): associated metadata - **tool** (dict): tool used to compute metadata - **mappings** (List[str]): list of mappings used to translate these metadata """ - yield from self._revision_metadata.get(ids) + yield from self._revision_intrinsic_metadata.get(ids) - def revision_metadata_add(self, metadata, conflict_update=False): + def revision_intrinsic_metadata_add(self, metadata, conflict_update=False): """Add metadata not present in storage. Args: metadata (iterable): dictionaries with keys: - **id**: sha1_git of revision - - **translated_metadata**: arbitrary dict + - **metadata**: arbitrary dict - **indexer_configuration_id**: tool used to compute metadata - **mappings** (List[str]): list of mappings used to translate these metadata @@ -589,9 +589,9 @@ """ if not all(isinstance(x['id'], bytes) for x in metadata): raise TypeError('identifiers must be bytes.') - self._revision_metadata.add(metadata, conflict_update) + self._revision_intrinsic_metadata.add(metadata, conflict_update) - def revision_metadata_delete(self, entries): + def revision_intrinsic_metadata_delete(self, entries): """Remove revision metadata from the storage. Args: @@ -599,7 +599,7 @@ - **revision** (int): origin identifier - **id** (int): tool used to compute metadata """ - self._revision_metadata.delete(entries) + self._revision_intrinsic_metadata.delete(entries) def origin_intrinsic_metadata_get(self, ids): """Retrieve origin metadata per id.
@@ -610,7 +610,7 @@ Yields: list: dictionaries with the following keys: - - **origin_id** (int) + - **id** (int) - **translated_metadata** (str): associated metadata - **tool** (dict): tool used to compute metadata - **mappings** (List[str]): list of mappings used to translate @@ -618,7 +618,7 @@ """ for item in self._origin_intrinsic_metadata.get(ids): - item['origin_id'] = item.pop('id') + item['id'] = item.pop('id') yield item def origin_intrinsic_metadata_add(self, metadata, @@ -628,7 +628,7 @@ Args: metadata (iterable): dictionaries with keys: - - **origin_id**: origin identifier + - **id**: origin identifier - **from_revision**: sha1 id of the revision used to generate these metadata. - **metadata**: arbitrary dict @@ -644,7 +644,7 @@ items = [] for item in metadata: item = item.copy() - item['id'] = item.pop('origin_id') + item['id'] = item.pop('id') items.append(item) self._origin_intrinsic_metadata.add(items, conflict_update) @@ -653,14 +653,14 @@ Args: entries (dict): dictionaries with the following keys: - - **origin_id** (int): origin identifier + - **id** (int): origin identifier - **indexer_configuration_id** (int): tool used to compute metadata """ items = [] for entry in entries: item = entry.copy() - item['id'] = item.pop('origin_id') + item['id'] = item.pop('id') items.append(item) self._origin_intrinsic_metadata.delete(items) @@ -712,7 +712,7 @@ reverse=True) for (rank_, result) in results[:limit]: result = result.copy() - result['origin_id'] = result.pop('id') + result['id'] = result.pop('id') yield result def origin_intrinsic_metadata_search_by_producer( @@ -759,7 +759,7 @@ yield entry['id'] else: entry = entry.copy() - entry['origin_id'] = entry.pop('id') + entry['id'] = entry.pop('id') yield entry nb_results += 1 diff --git a/swh/indexer/tests/conftest.py b/swh/indexer/tests/conftest.py --- a/swh/indexer/tests/conftest.py +++ b/swh/indexer/tests/conftest.py @@ -12,7 +12,7 @@ from .utils import fill_storage, fill_obj_storage -TASK_NAMES = ['revision_metadata', 'origin_intrinsic_metadata'] +TASK_NAMES = ['revision_intrinsic_metadata', 'origin_intrinsic_metadata'] @pytest.fixture diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py --- a/swh/indexer/tests/storage/test_storage.py +++ b/swh/indexer/tests/storage/test_storage.py @@ -866,20 +866,20 @@ }, ) - # revision_metadata tests + # revision_intrinsic_metadata tests ( - test_revision_metadata_missing, - test_revision_metadata_add__drop_duplicate, - test_revision_metadata_add__update_in_place_duplicate, - test_revision_metadata_add__duplicate_twice, - test_revision_metadata_get, - test_revision_metadata_delete, - test_revision_metadata_delete_nonexisting, + test_revision_intrinsic_metadata_missing, + test_revision_intrinsic_metadata_add__drop_duplicate, + test_revision_intrinsic_metadata_add__update_in_place_duplicate, + test_revision_intrinsic_metadata_add__duplicate_twice, + test_revision_intrinsic_metadata_get, + test_revision_intrinsic_metadata_delete, + test_revision_intrinsic_metadata_delete_nonexisting, ) = gen_generic_endpoint_tests( - endpoint_type='revision_metadata', + endpoint_type='revision_intrinsic_metadata', tool_name='swh-metadata-detector', example_data1={ - 'translated_metadata': { + 'metadata': { 'other': {}, 'codeRepository': { 'type': 'git', @@ -892,7 +892,7 @@ 'mappings': ['mapping1'], }, example_data2={ - 'translated_metadata': { + 'metadata': { 'other': {}, 'name': 'test_metadata', 'version': '0.0.1' @@ -911,12 +911,12 @@ } metadata_rev = { 'id': 
self.revision_id_2, - 'translated_metadata': metadata, + 'metadata': metadata, 'mappings': ['mapping1'], 'indexer_configuration_id': tool_id, } metadata_origin = { - 'origin_id': self.origin_id_1, + 'id': self.origin_id_1, 'metadata': metadata, 'indexer_configuration_id': tool_id, 'mappings': ['mapping1'], @@ -924,7 +924,7 @@ } # when - self.storage.revision_metadata_add([metadata_rev]) + self.storage.revision_intrinsic_metadata_add([metadata_rev]) self.storage.origin_intrinsic_metadata_add([metadata_origin]) # then @@ -932,7 +932,7 @@ [self.origin_id_1, 42])) expected_metadata = [{ - 'origin_id': self.origin_id_1, + 'id': self.origin_id_1, 'metadata': metadata, 'tool': self.tools['swh-metadata-detector'], 'from_revision': self.revision_id_2, @@ -951,12 +951,12 @@ } metadata_rev = { 'id': self.revision_id_2, - 'translated_metadata': metadata, + 'metadata': metadata, 'mappings': ['mapping1'], 'indexer_configuration_id': tool_id, } metadata_origin = { - 'origin_id': self.origin_id_1, + 'id': self.origin_id_1, 'metadata': metadata, 'indexer_configuration_id': tool_id, 'mappings': ['mapping1'], @@ -964,11 +964,11 @@ } # when - self.storage.revision_metadata_add([metadata_rev]) + self.storage.revision_intrinsic_metadata_add([metadata_rev]) self.storage.origin_intrinsic_metadata_add([metadata_origin]) self.storage.origin_intrinsic_metadata_delete([ { - 'origin_id': self.origin_id_1, + 'id': self.origin_id_1, 'indexer_configuration_id': tool_id } ]) @@ -983,7 +983,7 @@ tool_id = self.tools['swh-metadata-detector']['id'] self.storage.origin_intrinsic_metadata_delete([ { - 'origin_id': self.origin_id_1, + 'id': self.origin_id_1, 'indexer_configuration_id': tool_id } ]) @@ -998,12 +998,12 @@ } metadata_rev_v1 = { 'id': self.revision_id_1, - 'translated_metadata': metadata_v1.copy(), + 'metadata': metadata_v1.copy(), 'mappings': [], 'indexer_configuration_id': tool_id, } metadata_origin_v1 = { - 'origin_id': self.origin_id_1, + 'id': self.origin_id_1, 'metadata': metadata_v1.copy(), 'indexer_configuration_id': tool_id, 'mappings': [], @@ -1011,7 +1011,7 @@ } # given - self.storage.revision_metadata_add([metadata_rev_v1]) + self.storage.revision_intrinsic_metadata_add([metadata_rev_v1]) self.storage.origin_intrinsic_metadata_add([metadata_origin_v1]) # when @@ -1019,7 +1019,7 @@ [self.origin_id_1, 42])) expected_metadata_v1 = [{ - 'origin_id': self.origin_id_1, + 'id': self.origin_id_1, 'metadata': metadata_v1, 'tool': self.tools['swh-metadata-detector'], 'from_revision': self.revision_id_1, @@ -1036,10 +1036,10 @@ }) metadata_rev_v2 = metadata_rev_v1.copy() metadata_origin_v2 = metadata_origin_v1.copy() - metadata_rev_v2['translated_metadata'] = metadata_v2 - metadata_origin_v2['translated_metadata'] = metadata_v2 + metadata_rev_v2['metadata'] = metadata_v2 + metadata_origin_v2['metadata'] = metadata_v2 - self.storage.revision_metadata_add([metadata_rev_v2]) + self.storage.revision_intrinsic_metadata_add([metadata_rev_v2]) self.storage.origin_intrinsic_metadata_add([metadata_origin_v2]) # then @@ -1059,12 +1059,12 @@ } metadata_rev_v1 = { 'id': self.revision_id_2, - 'translated_metadata': metadata_v1, + 'metadata': metadata_v1, 'mappings': [], 'indexer_configuration_id': tool_id, } metadata_origin_v1 = { - 'origin_id': self.origin_id_1, + 'id': self.origin_id_1, 'metadata': metadata_v1.copy(), 'indexer_configuration_id': tool_id, 'mappings': [], @@ -1072,7 +1072,7 @@ } # given - self.storage.revision_metadata_add([metadata_rev_v1]) + self.storage.revision_intrinsic_metadata_add([metadata_rev_v1]) 
self.storage.origin_intrinsic_metadata_add([metadata_origin_v1]) # when @@ -1081,7 +1081,7 @@ # then expected_metadata_v1 = [{ - 'origin_id': self.origin_id_1, + 'id': self.origin_id_1, 'metadata': metadata_v1, 'tool': self.tools['swh-metadata-detector'], 'from_revision': self.revision_id_2, @@ -1097,19 +1097,19 @@ }) metadata_rev_v2 = metadata_rev_v1.copy() metadata_origin_v2 = metadata_origin_v1.copy() - metadata_rev_v2['translated_metadata'] = metadata_v2 + metadata_rev_v2['metadata'] = metadata_v2 metadata_origin_v2['metadata'] = metadata_v2 - self.storage.revision_metadata_add([metadata_rev_v2], - conflict_update=True) - self.storage.origin_intrinsic_metadata_add([metadata_origin_v2], - conflict_update=True) + self.storage.revision_intrinsic_metadata_add( + [metadata_rev_v2], conflict_update=True) + self.storage.origin_intrinsic_metadata_add( + [metadata_origin_v2], conflict_update=True) actual_metadata = list(self.storage.origin_intrinsic_metadata_get( [self.origin_id_1])) expected_metadata_v2 = [{ - 'origin_id': self.origin_id_1, + 'id': self.origin_id_1, 'metadata': metadata_v2, 'tool': self.tools['swh-metadata-detector'], 'from_revision': self.revision_id_2, @@ -1129,12 +1129,12 @@ } metadata_rev = { 'id': self.revision_id_2, - 'translated_metadata': metadata, + 'metadata': metadata, 'mappings': ['mapping1'], 'indexer_configuration_id': tool_id, } metadata_origin = { - 'origin_id': self.origin_id_1, + 'id': self.origin_id_1, 'metadata': metadata, 'indexer_configuration_id': tool_id, 'mappings': ['mapping1'], @@ -1142,7 +1142,7 @@ } # when - self.storage.revision_metadata_add([metadata_rev]) + self.storage.revision_intrinsic_metadata_add([metadata_rev]) with self.assertRaises(ValueError): self.storage.origin_intrinsic_metadata_add([ @@ -1157,12 +1157,12 @@ } metadata1_rev = { 'id': self.revision_id_1, - 'translated_metadata': metadata1, + 'metadata': metadata1, 'mappings': [], 'indexer_configuration_id': tool_id, } metadata1_origin = { - 'origin_id': self.origin_id_1, + 'id': self.origin_id_1, 'metadata': metadata1, 'mappings': [], 'indexer_configuration_id': tool_id, @@ -1173,12 +1173,12 @@ } metadata2_rev = { 'id': self.revision_id_2, - 'translated_metadata': metadata2, + 'metadata': metadata2, 'mappings': [], 'indexer_configuration_id': tool_id, } metadata2_origin = { - 'origin_id': self.origin_id_2, + 'id': self.origin_id_2, 'metadata': metadata2, 'mappings': [], 'indexer_configuration_id': tool_id, @@ -1186,24 +1186,24 @@ } # when - self.storage.revision_metadata_add([metadata1_rev]) + self.storage.revision_intrinsic_metadata_add([metadata1_rev]) self.storage.origin_intrinsic_metadata_add([metadata1_origin]) - self.storage.revision_metadata_add([metadata2_rev]) + self.storage.revision_intrinsic_metadata_add([metadata2_rev]) self.storage.origin_intrinsic_metadata_add([metadata2_origin]) # then search = self.storage.origin_intrinsic_metadata_search_fulltext self.assertCountEqual( - [res['origin_id'] for res in search(['Doe'])], + [res['id'] for res in search(['Doe'])], [self.origin_id_1, self.origin_id_2]) self.assertEqual( - [res['origin_id'] for res in search(['John', 'Doe'])], + [res['id'] for res in search(['John', 'Doe'])], [self.origin_id_1]) self.assertEqual( - [res['origin_id'] for res in search(['John'])], + [res['id'] for res in search(['John'])], [self.origin_id_1]) self.assertEqual( - [res['origin_id'] for res in search(['John', 'Jane'])], + [res['id'] for res in search(['John', 'Jane'])], []) def test_origin_intrinsic_metadata_search_fulltext_rank(self): @@ -1223,12 
+1223,12 @@ } metadata1_rev = { 'id': self.revision_id_1, - 'translated_metadata': metadata1, + 'metadata': metadata1, 'mappings': [], 'indexer_configuration_id': tool_id, } metadata1_origin = { - 'origin_id': self.origin_id_1, + 'id': self.origin_id_1, 'metadata': metadata1, 'mappings': [], 'indexer_configuration_id': tool_id, @@ -1242,12 +1242,12 @@ } metadata2_rev = { 'id': self.revision_id_2, - 'translated_metadata': metadata2, + 'metadata': metadata2, 'mappings': [], 'indexer_configuration_id': tool_id, } metadata2_origin = { - 'origin_id': self.origin_id_2, + 'id': self.origin_id_2, 'metadata': metadata2, 'mappings': [], 'indexer_configuration_id': tool_id, @@ -1255,27 +1255,27 @@ } # when - self.storage.revision_metadata_add([metadata1_rev]) + self.storage.revision_intrinsic_metadata_add([metadata1_rev]) self.storage.origin_intrinsic_metadata_add([metadata1_origin]) - self.storage.revision_metadata_add([metadata2_rev]) + self.storage.revision_intrinsic_metadata_add([metadata2_rev]) self.storage.origin_intrinsic_metadata_add([metadata2_origin]) # then search = self.storage.origin_intrinsic_metadata_search_fulltext self.assertEqual( - [res['origin_id'] for res in search(['Doe'])], + [res['id'] for res in search(['Doe'])], [self.origin_id_1, self.origin_id_2]) self.assertEqual( - [res['origin_id'] for res in search(['Doe'], limit=1)], + [res['id'] for res in search(['Doe'], limit=1)], [self.origin_id_1]) self.assertEqual( - [res['origin_id'] for res in search(['John'])], + [res['id'] for res in search(['John'])], [self.origin_id_1]) self.assertEqual( - [res['origin_id'] for res in search(['Jane'])], + [res['id'] for res in search(['Jane'])], [self.origin_id_2, self.origin_id_1]) self.assertEqual( - [res['origin_id'] for res in search(['John', 'Jane'])], + [res['id'] for res in search(['John', 'Jane'])], [self.origin_id_1]) def _fill_origin_intrinsic_metadata(self): @@ -1288,12 +1288,12 @@ } metadata1_rev = { 'id': self.revision_id_1, - 'translated_metadata': metadata1, + 'metadata': metadata1, 'mappings': ['npm'], 'indexer_configuration_id': tool1_id, } metadata1_origin = { - 'origin_id': self.origin_id_1, + 'id': self.origin_id_1, 'metadata': metadata1, 'mappings': ['npm'], 'indexer_configuration_id': tool1_id, @@ -1305,12 +1305,12 @@ } metadata2_rev = { 'id': self.revision_id_2, - 'translated_metadata': metadata2, + 'metadata': metadata2, 'mappings': ['npm', 'gemspec'], 'indexer_configuration_id': tool2_id, } metadata2_origin = { - 'origin_id': self.origin_id_2, + 'id': self.origin_id_2, 'metadata': metadata2, 'mappings': ['npm', 'gemspec'], 'indexer_configuration_id': tool2_id, @@ -1321,23 +1321,23 @@ } metadata3_rev = { 'id': self.revision_id_3, - 'translated_metadata': metadata3, + 'metadata': metadata3, 'mappings': ['npm', 'gemspec'], 'indexer_configuration_id': tool2_id, } metadata3_origin = { - 'origin_id': self.origin_id_3, + 'id': self.origin_id_3, 'metadata': metadata3, 'mappings': ['pkg-info'], 'indexer_configuration_id': tool2_id, 'from_revision': self.revision_id_3, } - self.storage.revision_metadata_add([metadata1_rev]) + self.storage.revision_intrinsic_metadata_add([metadata1_rev]) self.storage.origin_intrinsic_metadata_add([metadata1_origin]) - self.storage.revision_metadata_add([metadata2_rev]) + self.storage.revision_intrinsic_metadata_add([metadata2_rev]) self.storage.origin_intrinsic_metadata_add([metadata2_origin]) - self.storage.revision_metadata_add([metadata3_rev]) + self.storage.revision_intrinsic_metadata_add([metadata3_rev]) 
self.storage.origin_intrinsic_metadata_add([metadata3_origin]) def test_origin_intrinsic_metadata_search_by_producer(self): @@ -1399,7 +1399,7 @@ # test ids_only=False self.assertEqual(list(endpoint(mappings=['gemspec'])), [{ - 'origin_id': self.origin_id_2, + 'id': self.origin_id_2, 'metadata': { '@context': 'foo', 'author': 'Jane Doe', diff --git a/swh/indexer/tests/test_cli.py b/swh/indexer/tests/test_cli.py --- a/swh/indexer/tests/test_cli.py +++ b/swh/indexer/tests/test_cli.py @@ -40,7 +40,7 @@ origin_metadata = [ { - 'origin_id': origin_id, + 'id': origin_id, 'from_revision': hash_to_bytes('abcd{:0>4}'.format(origin_id)), 'indexer_configuration_id': tools[origin_id % 2]['id'], 'metadata': {'name': 'origin %d' % origin_id}, @@ -58,7 +58,7 @@ for origin_id in range(nb_rows) ] - idx_storage.revision_metadata_add(revision_metadata) + idx_storage.revision_intrinsic_metadata_add(revision_metadata) idx_storage.origin_intrinsic_metadata_add(origin_metadata) return [tool['id'] for tool in tools] diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -1119,13 +1119,14 @@ ] metadata_indexer.run(sha1_gits, 'update-dups') - results = list(metadata_indexer.idx_storage.revision_metadata_get( - sha1_gits)) + results = list( + metadata_indexer.idx_storage. + revision_intrinsic_metadata_get(sha1_gits)) expected_results = [{ 'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), 'tool': TRANSLATOR_TOOL, - 'translated_metadata': YARN_PARSER_METADATA, + 'metadata': YARN_PARSER_METADATA, 'mappings': ['npm'], }] diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py --- a/swh/indexer/tests/test_origin_metadata.py +++ b/swh/indexer/tests/test_origin_metadata.py @@ -26,17 +26,18 @@ rev_metadata = { 'id': rev_id, - 'translated_metadata': YARN_PARSER_METADATA, + 'metadata': YARN_PARSER_METADATA, 'mappings': ['npm'], } origin_metadata = { - 'origin_id': origin['id'], + 'id': origin['id'], 'from_revision': rev_id, 'metadata': YARN_PARSER_METADATA, 'mappings': ['npm'], } - results = list(indexer.idx_storage.revision_metadata_get([rev_id])) + results = list( + indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) for result in results: del result['tool'] assert results == [rev_metadata] @@ -62,7 +63,8 @@ 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') - results = list(indexer.idx_storage.revision_metadata_get([rev_id])) + results = list( + indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert len(results) == 1 results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ @@ -112,17 +114,18 @@ rev_metadata = { 'id': rev_id, - 'translated_metadata': YARN_PARSER_METADATA, + 'metadata': YARN_PARSER_METADATA, 'mappings': ['npm'], } origin_metadata = { - 'origin_id': origin2['id'], + 'id': origin2['id'], 'from_revision': rev_id, 'metadata': YARN_PARSER_METADATA, 'mappings': ['npm'], } - results = list(indexer.idx_storage.revision_metadata_get([rev_id])) + results = list( + indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) for result in results: del result['tool'] assert results == [rev_metadata] @@ -151,7 +154,8 @@ assert origin1['id'] != origin2['id'] rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') - results = list(indexer.idx_storage.revision_metadata_get([rev_id])) + results = list( + 
indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert len(results) == 1 results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ @@ -172,7 +176,8 @@ 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') - results = list(indexer.idx_storage.revision_metadata_get([rev_id])) + results = list( + indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert results == [] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ @@ -191,7 +196,8 @@ 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') - results = list(indexer.idx_storage.revision_metadata_get([rev_id])) + results = list( + indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert results != [] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ @@ -202,7 +208,8 @@ b'foo.json'): indexer.run(["git+https://github.com/librariesio/yarn-parser"]) - results = list(indexer.idx_storage.revision_metadata_get([rev_id])) + results = list( + indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert results == [] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
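Note on calling code: the rename is mechanical, but every producer and consumer of these endpoints has to move from the old endpoint and key names (revision_metadata_*, 'translated_metadata', 'origin_id') to the new ones (revision_intrinsic_metadata_*, 'metadata', 'id'). The following is a rough, illustrative sketch of the resulting call pattern, modelled on the test fixtures above; the in-memory IndexerStorage class name, the indexer_configuration_add() registration step, and the example values are assumptions drawn from those fixtures, not part of this diff.

# Illustrative sketch only; assumes the in-memory backend behaves as the
# test fixtures above use it (class name and tool registration are assumed).
from swh.indexer.storage.in_memory import IndexerStorage

idx_storage = IndexerStorage()

# Register a tool first, as the test fixtures do; its 'id' is referenced below.
tool = idx_storage.indexer_configuration_add([{
    'tool_name': 'swh-metadata-detector',
    'tool_version': '0.0.2',
    'tool_configuration': {},
}])[0]

# Revision sha1_git taken from the tests above.
rev_id = bytes.fromhex('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')

# Old: revision_metadata_add(...) with a 'translated_metadata' key.
# New: revision_intrinsic_metadata_add(...) with a 'metadata' key.
idx_storage.revision_intrinsic_metadata_add([{
    'id': rev_id,
    'metadata': {'name': 'yarn-parser'},
    'mappings': ['npm'],
    'indexer_configuration_id': tool['id'],
}])

# Old: 'origin_id' key; new: 'id' key. The other keys are unchanged.
idx_storage.origin_intrinsic_metadata_add([{
    'id': 42,
    'from_revision': rev_id,
    'metadata': {'name': 'yarn-parser'},
    'mappings': ['npm'],
    'indexer_configuration_id': tool['id'],
}])

# Reads come back under the new names as well.
print(list(idx_storage.revision_intrinsic_metadata_get([rev_id])))
print(list(idx_storage.origin_intrinsic_metadata_get([42])))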