diff --git a/sql/upgrades/123.sql b/sql/upgrades/123.sql new file mode 100644 --- /dev/null +++ b/sql/upgrades/123.sql @@ -0,0 +1,103 @@ +-- SWH Indexer DB schema upgrade +-- from_version: 122 +-- to_version: 123 +-- description: fix heterogeneity of names in metadata tables + +insert into dbversion(version, release, description) +values(123, now(), 'fix heterogeneity of names in metadata tables'); + +create or replace function swh_content_metadata_add(conflict_update boolean) + returns void + language plpgsql +as $$ +begin + if conflict_update then + insert into content_metadata (id, metadata, indexer_configuration_id) + select id, metadata, indexer_configuration_id + from tmp_content_metadata tcm + on conflict(id, indexer_configuration_id) + do update set metadata = excluded.metadata; + + else + insert into content_metadata (id, metadata, indexer_configuration_id) + select id, metadata, indexer_configuration_id + from tmp_content_metadata tcm + on conflict(id, indexer_configuration_id) + do nothing; + end if; + return; +end +$$; + +alter function swh_revision_metadata_add(boolean) rename to swh_revision_intrinsic_metadata_add; +create or replace function swh_revision_intrinsic_metadata_add(conflict_update boolean) + returns void + language plpgsql +as $$ +begin + if conflict_update then + insert into revision_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id) + select id, metadata, mappings, indexer_configuration_id + from tmp_revision_intrinsic_metadata tcm + on conflict(id, indexer_configuration_id) + do update set + metadata = excluded.metadata, + mappings = excluded.mappings; + + else + insert into revision_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id) + select id, metadata, mappings, indexer_configuration_id + from tmp_revision_intrinsic_metadata tcm + on conflict(id, indexer_configuration_id) + do nothing; + end if; + return; +end +$$; + +alter function swh_mktemp_revision_metadata() rename to swh_mktemp_revision_intrinsic_metadata; +create or replace 
function swh_mktemp_revision_intrinsic_metadata() + returns void + language sql +as $$ + create temporary table tmp_revision_intrinsic_metadata ( + like revision_intrinsic_metadata including defaults + ) on commit drop; +$$; + +create or replace function swh_origin_intrinsic_metadata_add( + conflict_update boolean) + returns void + language plpgsql +as $$ +begin + perform swh_origin_intrinsic_metadata_compute_tsvector(); + if conflict_update then + insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) + select id, metadata, indexer_configuration_id, from_revision, + metadata_tsvector, mappings + from tmp_origin_intrinsic_metadata + on conflict(id, indexer_configuration_id) + do update set + metadata = excluded.metadata, + mappings = excluded.mappings; + + else + insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) + select id, metadata, indexer_configuration_id, from_revision, + metadata_tsvector, mappings + from tmp_origin_intrinsic_metadata + on conflict(id, indexer_configuration_id) + do nothing; + end if; + return; +end +$$; + +alter index revision_metadata_pkey rename to revision_intrinsic_metadata_pkey; + +alter table revision_metadata rename column translated_metadata to metadata; +alter table content_metadata rename column translated_metadata to metadata; +alter table origin_intrinsic_metadata rename column origin_id to id; + +alter table revision_metadata rename to revision_intrinsic_metadata; diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -22,7 +22,7 @@ - filtering out content already indexed in content_metadata - reading content from objstorage with the content's id sha1 - - computing translated_metadata by given context + - computing metadata by given context - using the metadata_dictionary as the 'swh-metadata-translator' tool - store 
result in content_metadata table @@ -46,25 +46,25 @@ Returns: dict: dictionary representing a content_metadata. If the - translation wasn't successful the translated_metadata keys will + translation wasn't successful the metadata keys will be returned as None """ result = { 'id': id, 'indexer_configuration_id': self.tool['id'], - 'translated_metadata': None + 'metadata': None } try: mapping_name = self.tool['tool_configuration']['context'] log_suffix += ', content_id=%s' % hashutil.hash_to_hex(id) - result['translated_metadata'] = \ + result['metadata'] = \ MAPPINGS[mapping_name](log_suffix).translate(data) except Exception: self.log.exception( "Problem during metadata translation " "for content %s" % hashutil.hash_to_hex(id)) - if result['translated_metadata'] is None: + if result['metadata'] is None: return None return result @@ -75,7 +75,7 @@ results ([dict]): list of content_metadata, dict with the following keys: - id (bytes): content's identifier (sha1) - - translated_metadata (jsonb): detected metadata + - metadata (jsonb): detected metadata policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them @@ -89,8 +89,8 @@ This indexer is in charge of: - - filtering revisions already indexed in revision_metadata table with - defined computation tool + - filtering revisions already indexed in revision_intrinsic_metadata table + with defined computation tool - retrieve all entry_files in root directory - use metadata_detector for file_names containing metadata - compute metadata translation if necessary and possible (depends on tool) @@ -111,7 +111,7 @@ """Filter out known sha1s and return only missing ones. 
""" - yield from self.idx_storage.revision_metadata_missing(( + yield from self.idx_storage.revision_intrinsic_metadata_missing(( { 'id': sha1_git, 'indexer_configuration_id': self.tool['id'], @@ -130,18 +130,19 @@ rev (dict): revision artifact from storage Returns: - dict: dictionary representing a revision_metadata, with keys: + dict: dictionary representing a revision_intrinsic_metadata, with + keys: - id (str): rev's identifier (sha1_git) - indexer_configuration_id (bytes): tool used - - translated_metadata: dict of retrieved metadata + - metadata: dict of retrieved metadata """ result = { 'id': rev['id'], 'indexer_configuration_id': self.tool['id'], 'mappings': None, - 'translated_metadata': None + 'metadata': None } try: @@ -149,11 +150,11 @@ dir_ls = self.storage.directory_ls(root_dir, recursive=False) files = [entry for entry in dir_ls if entry['type'] == 'file'] detected_files = detect_metadata(files) - (mappings, metadata) = self.translate_revision_metadata( + (mappings, metadata) = self.translate_revision_intrinsic_metadata( detected_files, log_suffix='revision=%s' % hashutil.hash_to_hex(rev['id'])) result['mappings'] = mappings - result['translated_metadata'] = metadata + result['metadata'] = metadata except Exception as e: self.log.exception( 'Problem when indexing rev: %r', e) @@ -172,11 +173,13 @@ respectively update duplicates or ignore them """ - # TODO: add functions in storage to keep data in revision_metadata - self.idx_storage.revision_metadata_add( + # TODO: add functions in storage to keep data in + # revision_intrinsic_metadata + self.idx_storage.revision_intrinsic_metadata_add( results, conflict_update=(policy_update == 'update-dups')) - def translate_revision_metadata(self, detected_files, log_suffix): + def translate_revision_intrinsic_metadata( + self, detected_files, log_suffix): """ Determine plan of action to translate metadata when containing one or multiple detected files: @@ -191,7 +194,7 @@ """ used_mappings = 
[MAPPINGS[context].name for context in detected_files] - translated_metadata = [] + metadata = [] tool = { 'name': 'swh-metadata-translator', 'version': '0.0.2', @@ -215,13 +218,13 @@ metadata_generator = self.idx_storage.content_metadata_get( detected_files[context]) for c in metadata_generator: - # extracting translated_metadata + # extracting metadata sha1 = c['id'] sha1s_in_storage.append(sha1) - local_metadata = c['translated_metadata'] + local_metadata = c['metadata'] # local metadata is aggregated if local_metadata: - translated_metadata.append(local_metadata) + metadata.append(local_metadata) sha1s_filtered = [item for item in detected_files[context] if item not in sha1s_in_storage] @@ -234,15 +237,15 @@ log_suffix=log_suffix) # on the fly possibility: for result in c_metadata_indexer.results: - local_metadata = result['translated_metadata'] - translated_metadata.append(local_metadata) + local_metadata = result['metadata'] + metadata.append(local_metadata) except Exception: self.log.exception( "Exception while indexing metadata on contents") - # transform translated_metadata into min set with swh-metadata-detector - min_metadata = extract_minimal_metadata_dict(translated_metadata) + # transform metadata into min set with swh-metadata-detector + min_metadata = extract_minimal_metadata_dict(metadata) return (used_mappings, min_metadata) @@ -278,8 +281,8 @@ rev_metadata = self.revision_metadata_indexer.index(rev) orig_metadata = { 'from_revision': rev_metadata['id'], - 'origin_id': origin['id'], - 'metadata': rev_metadata['translated_metadata'], + 'id': origin['id'], + 'metadata': rev_metadata['metadata'], 'mappings': rev_metadata['mappings'], 'indexer_configuration_id': rev_metadata['indexer_configuration_id'], @@ -311,7 +314,7 @@ origs_to_delete.append(orig_item) if rev_metadata: - self.idx_storage.revision_metadata_add( + self.idx_storage.revision_intrinsic_metadata_add( rev_metadata, conflict_update=conflict_update) if orig_metadata: 
self.idx_storage.origin_intrinsic_metadata_add( @@ -324,4 +327,4 @@ if origs_to_delete: self.idx_storage.origin_intrinsic_metadata_delete(origs_to_delete) if revs_to_delete: - self.idx_storage.revision_metadata_delete(revs_to_delete) + self.idx_storage.revision_intrinsic_metadata_delete(revs_to_delete) diff --git a/swh/indexer/sql/30-swh-schema.sql b/swh/indexer/sql/30-swh-schema.sql --- a/swh/indexer/sql/30-swh-schema.sql +++ b/swh/indexer/sql/30-swh-schema.sql @@ -103,33 +103,33 @@ -- identified as potentially containning metadata with a translation tool (indexer_configuration_id) create table content_metadata( id sha1 not null, - translated_metadata jsonb not null, + metadata jsonb not null, indexer_configuration_id bigint not null ); comment on table content_metadata is 'metadata semantically translated from a content file'; comment on column content_metadata.id is 'sha1 of content file'; -comment on column content_metadata.translated_metadata is 'result of translation with defined format'; +comment on column content_metadata.metadata is 'result of translation with defined format'; comment on column content_metadata.indexer_configuration_id is 'tool used for translation'; --- The table revision_metadata provides a minimal set of intrinsic metadata --- detected with the detection tool (indexer_configuration_id) and aggregated --- from the content_metadata translation. -create table revision_metadata( +-- The table revision_intrinsic_metadata provides a minimal set of intrinsic +-- metadata detected with the detection tool (indexer_configuration_id) and +-- aggregated from the content_metadata translation. 
+create table revision_intrinsic_metadata( id sha1_git not null, - translated_metadata jsonb not null, + metadata jsonb not null, indexer_configuration_id bigint not null, mappings text array not null ); -comment on table revision_metadata is 'metadata semantically detected and translated in a revision'; -comment on column revision_metadata.id is 'sha1_git of revision'; -comment on column revision_metadata.translated_metadata is 'result of detection and translation with defined format'; -comment on column revision_metadata.indexer_configuration_id is 'tool used for detection'; -comment on column revision_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)'; +comment on table revision_intrinsic_metadata is 'metadata semantically detected and translated in a revision'; +comment on column revision_intrinsic_metadata.id is 'sha1_git of revision'; +comment on column revision_intrinsic_metadata.metadata is 'result of detection and translation with defined format'; +comment on column revision_intrinsic_metadata.indexer_configuration_id is 'tool used for detection'; +comment on column revision_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. 
pkg-info, npm)'; create table origin_intrinsic_metadata( - origin_id bigserial not null, + id bigserial not null, metadata jsonb, indexer_configuration_id bigint not null, from_revision sha1_git not null, @@ -138,7 +138,7 @@ ); comment on table origin_intrinsic_metadata is 'keeps intrinsic metadata for an origin'; -comment on column origin_intrinsic_metadata.origin_id is 'the entry id in origin'; +comment on column origin_intrinsic_metadata.id is 'the entry id in origin'; comment on column origin_intrinsic_metadata.metadata is 'metadata extracted from a revision'; comment on column origin_intrinsic_metadata.indexer_configuration_id is 'tool used to generate this metadata'; comment on column origin_intrinsic_metadata.from_revision is 'sha1 of the revision this metadata was copied from.'; diff --git a/swh/indexer/sql/40-swh-func.sql b/swh/indexer/sql/40-swh-func.sql --- a/swh/indexer/sql/40-swh-func.sql +++ b/swh/indexer/sql/40-swh-func.sql @@ -267,15 +267,15 @@ as $$ begin if conflict_update then - insert into content_metadata (id, translated_metadata, indexer_configuration_id) - select id, translated_metadata, indexer_configuration_id + insert into content_metadata (id, metadata, indexer_configuration_id) + select id, metadata, indexer_configuration_id from tmp_content_metadata tcm on conflict(id, indexer_configuration_id) - do update set translated_metadata = excluded.translated_metadata; + do update set metadata = excluded.metadata; else - insert into content_metadata (id, translated_metadata, indexer_configuration_id) - select id, translated_metadata, indexer_configuration_id + insert into content_metadata (id, metadata, indexer_configuration_id) + select id, metadata, indexer_configuration_id from tmp_content_metadata tcm on conflict(id, indexer_configuration_id) do nothing; @@ -300,33 +300,34 @@ -- end content_metadata functions --- add tmp_revision_metadata entries to revision_metadata, overwriting --- duplicates if conflict_update is true, skipping 
duplicates otherwise. +-- add tmp_revision_intrinsic_metadata entries to revision_intrinsic_metadata, +-- overwriting duplicates if conflict_update is true, skipping duplicates +-- otherwise. -- -- If filtering duplicates is in order, the call to --- swh_revision_metadata_missing must take place before calling this +-- swh_revision_intrinsic_metadata_missing must take place before calling this -- function. -- -- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to --- tmp_revision_metadata, 2. call this function -create or replace function swh_revision_metadata_add(conflict_update boolean) +-- tmp_revision_intrinsic_metadata, 2. call this function +create or replace function swh_revision_intrinsic_metadata_add(conflict_update boolean) returns void language plpgsql as $$ begin if conflict_update then - insert into revision_metadata (id, translated_metadata, mappings, indexer_configuration_id) - select id, translated_metadata, mappings, indexer_configuration_id - from tmp_revision_metadata tcm + insert into revision_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id) + select id, metadata, mappings, indexer_configuration_id + from tmp_revision_intrinsic_metadata tcm on conflict(id, indexer_configuration_id) do update set - translated_metadata = excluded.translated_metadata, + metadata = excluded.metadata, mappings = excluded.mappings; else - insert into revision_metadata (id, translated_metadata, mappings, indexer_configuration_id) - select id, translated_metadata, mappings, indexer_configuration_id - from tmp_revision_metadata tcm + insert into revision_intrinsic_metadata (id, metadata, mappings, indexer_configuration_id) + select id, metadata, mappings, indexer_configuration_id + from tmp_revision_intrinsic_metadata tcm on conflict(id, indexer_configuration_id) do nothing; end if; @@ -334,19 +335,19 @@ end $$; -comment on function swh_revision_metadata_add(boolean) IS 'Add new revision metadata'; +comment on function 
swh_revision_intrinsic_metadata_add(boolean) IS 'Add new revision intrinsic metadata'; --- create a temporary table for retrieving revision_metadata -create or replace function swh_mktemp_revision_metadata() +-- create a temporary table for retrieving revision_intrinsic_metadata +create or replace function swh_mktemp_revision_intrinsic_metadata() returns void language sql as $$ - create temporary table tmp_revision_metadata ( - like revision_metadata including defaults + create temporary table tmp_revision_intrinsic_metadata ( + like revision_intrinsic_metadata including defaults ) on commit drop; $$; -comment on function swh_mktemp_revision_metadata() is 'Helper table to add revision metadata'; +comment on function swh_mktemp_revision_intrinsic_metadata() is 'Helper table to add revision intrinsic metadata'; -- create a temporary table for retrieving origin_intrinsic_metadata create or replace function swh_mktemp_origin_intrinsic_metadata() @@ -412,21 +413,21 @@ begin perform swh_origin_intrinsic_metadata_compute_tsvector(); if conflict_update then - insert into origin_intrinsic_metadata (origin_id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) - select origin_id, metadata, indexer_configuration_id, from_revision, + insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) + select id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings from tmp_origin_intrinsic_metadata - on conflict(origin_id, indexer_configuration_id) + on conflict(id, indexer_configuration_id) do update set metadata = excluded.metadata, mappings = excluded.mappings; else - insert into origin_intrinsic_metadata (origin_id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) - select origin_id, metadata, indexer_configuration_id, from_revision, + insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, 
metadata_tsvector, mappings) + select id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings from tmp_origin_intrinsic_metadata - on conflict(origin_id, indexer_configuration_id) + on conflict(id, indexer_configuration_id) do nothing; end if; return; diff --git a/swh/indexer/sql/60-swh-indexes.sql b/swh/indexer/sql/60-swh-indexes.sql --- a/swh/indexer/sql/60-swh-indexes.sql +++ b/swh/indexer/sql/60-swh-indexes.sql @@ -25,12 +25,12 @@ alter table content_metadata add constraint content_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; alter table content_metadata validate constraint content_metadata_indexer_configuration_id_fkey; --- revision_metadata -create unique index revision_metadata_pkey on revision_metadata(id, indexer_configuration_id); -alter table revision_metadata add primary key using index revision_metadata_pkey; +-- revision_intrinsic_metadata +create unique index revision_intrinsic_metadata_pkey on revision_intrinsic_metadata(id, indexer_configuration_id); +alter table revision_intrinsic_metadata add primary key using index revision_intrinsic_metadata_pkey; -alter table revision_metadata add constraint revision_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; -alter table revision_metadata validate constraint revision_metadata_indexer_configuration_id_fkey; +alter table revision_intrinsic_metadata add constraint revision_intrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; +alter table revision_intrinsic_metadata validate constraint revision_intrinsic_metadata_indexer_configuration_id_fkey; -- content_mimetype create unique index content_mimetype_pkey on content_mimetype(id, indexer_configuration_id); @@ -57,12 +57,12 @@ alter table content_fossology_license validate constraint 
content_fossology_license_indexer_configuration_id_fkey; -- origin_intrinsic_metadata -create unique index origin_intrinsic_metadata_pkey on origin_intrinsic_metadata(origin_id, indexer_configuration_id); +create unique index origin_intrinsic_metadata_pkey on origin_intrinsic_metadata(id, indexer_configuration_id); alter table origin_intrinsic_metadata add primary key using index origin_intrinsic_metadata_pkey; alter table origin_intrinsic_metadata add constraint origin_intrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; alter table origin_intrinsic_metadata validate constraint origin_intrinsic_metadata_indexer_configuration_id_fkey; -alter table origin_intrinsic_metadata add constraint origin_intrinsic_metadata_revision_metadata_fkey foreign key (from_revision, indexer_configuration_id) references revision_metadata(id, indexer_configuration_id) not valid; +alter table origin_intrinsic_metadata add constraint origin_intrinsic_metadata_revision_metadata_fkey foreign key (from_revision, indexer_configuration_id) references revision_intrinsic_metadata(id, indexer_configuration_id) not valid; alter table origin_intrinsic_metadata validate constraint origin_intrinsic_metadata_revision_metadata_fkey; create index origin_intrinsic_metadata_fulltext_idx on origin_intrinsic_metadata using gin (metadata_tsvector); diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py --- a/swh/indexer/storage/__init__.py +++ b/swh/indexer/storage/__init__.py @@ -51,32 +51,30 @@ return IndexerStorage(**args) -def _check_duplicates(data, key): +def _check_id_duplicates(data): """ - If any two dictionaries in `data` have the same value for the - key, raises a `ValueError`. + If any two dictionaries in `data` have the same id, raises + a `ValueError`. Values associated to the key must be hashable. 
Args: data (List[dict]): List of dictionaries to be inserted - key (str): Name of the key that acts as id. - >>> _check_duplicates([ + >>> _check_id_duplicates([ ... {'id': 'foo', 'data': 'spam'}, ... {'id': 'bar', 'data': 'egg'}, - ... ], 'id') - >>> _check_duplicates([ + ... ]) + >>> _check_id_duplicates([ ... {'id': 'foo', 'data': 'spam'}, ... {'id': 'foo', 'data': 'egg'}, - ... ], 'id') + ... ]) Traceback (most recent call last): ... ValueError: The same id is present more than once. """ - if len({item[key] for item in data}) < len(data): - raise ValueError( - 'The same {} is present more than once.'.format(key)) + if len({item['id'] for item in data}) < len(data): + raise ValueError('The same id is present more than once.') class IndexerStorage: @@ -246,7 +244,7 @@ default) """ - _check_duplicates(mimetypes, 'id') + _check_id_duplicates(mimetypes) mimetypes.sort(key=lambda m: m['id']) db.mktemp_content_mimetype(cur) db.copy_to(mimetypes, 'tmp_content_mimetype', @@ -332,7 +330,7 @@ default) """ - _check_duplicates(languages, 'id') + _check_id_duplicates(languages) languages.sort(key=lambda m: m['id']) db.mktemp_content_language(cur) # empty language is mapped to 'unknown' @@ -403,7 +401,7 @@ line, lang """ - _check_duplicates(ctags, 'id') + _check_id_duplicates(ctags) ctags.sort(key=lambda m: m['id']) def _convert_ctags(__ctags): @@ -487,7 +485,7 @@ list: content_license entries which failed due to unknown licenses """ - _check_duplicates(licenses, 'id') + _check_id_duplicates(licenses) licenses.sort(key=lambda m: m['id']) db.mktemp_content_fossology_license(cur) db.copy_to( @@ -562,7 +560,7 @@ dictionaries with the following keys: id (bytes) - translated_metadata (str): associated metadata + metadata (str): associated metadata tool (dict): tool used to compute metadata """ @@ -580,25 +578,25 @@ metadata (iterable): dictionaries with keys: - **id**: sha1 - - **translated_metadata**: arbitrary dict + - **metadata**: arbitrary dict conflict_update: Flag to 
determine if we want to overwrite (true) or skip duplicates (false, the default) """ - _check_duplicates(metadata, 'id') + _check_id_duplicates(metadata) metadata.sort(key=lambda m: m['id']) db.mktemp_content_metadata(cur) db.copy_to(metadata, 'tmp_content_metadata', - ['id', 'translated_metadata', 'indexer_configuration_id'], + ['id', 'metadata', 'indexer_configuration_id'], cur) db.content_metadata_add_from_temp(conflict_update, cur) - @remote_api_endpoint('revision_metadata/missing') + @remote_api_endpoint('revision_intrinsic_metadata/missing') @db_transaction_generator() - def revision_metadata_missing(self, metadata, db=None, cur=None): + def revision_intrinsic_metadata_missing(self, metadata, db=None, cur=None): """List metadata missing from storage. Args: @@ -612,12 +610,13 @@ missing ids """ - for obj in db.revision_metadata_missing_from_list(metadata, cur): + for obj in db.revision_intrinsic_metadata_missing_from_list( + metadata, cur): yield obj[0] - @remote_api_endpoint('revision_metadata') + @remote_api_endpoint('revision_intrinsic_metadata') @db_transaction_generator() - def revision_metadata_get(self, ids, db=None, cur=None): + def revision_intrinsic_metadata_get(self, ids, db=None, cur=None): """Retrieve revision metadata per id. 
Args: @@ -627,27 +626,27 @@ dictionaries with the following keys: - **id** (bytes) - - **translated_metadata** (str): associated metadata + - **metadata** (str): associated metadata - **tool** (dict): tool used to compute metadata - **mappings** (List[str]): list of mappings used to translate these metadata """ - for c in db.revision_metadata_get_from_list(ids, cur): + for c in db.revision_intrinsic_metadata_get_from_list(ids, cur): yield converters.db_to_metadata( - dict(zip(db.revision_metadata_cols, c))) + dict(zip(db.revision_intrinsic_metadata_cols, c))) - @remote_api_endpoint('revision_metadata/add') + @remote_api_endpoint('revision_intrinsic_metadata/add') @db_transaction() - def revision_metadata_add(self, metadata, conflict_update=False, db=None, - cur=None): + def revision_intrinsic_metadata_add(self, metadata, conflict_update=False, + db=None, cur=None): """Add metadata not present in storage. Args: metadata (iterable): dictionaries with keys: - **id**: sha1_git of revision - - **translated_metadata**: arbitrary dict + - **metadata**: arbitrary dict - **indexer_configuration_id**: tool used to compute metadata - **mappings** (List[str]): list of mappings used to translate these metadata @@ -656,20 +655,20 @@ or skip duplicates (false, the default) """ - _check_duplicates(metadata, 'id') + _check_id_duplicates(metadata) metadata.sort(key=lambda m: m['id']) - db.mktemp_revision_metadata(cur) + db.mktemp_revision_intrinsic_metadata(cur) - db.copy_to(metadata, 'tmp_revision_metadata', - ['id', 'translated_metadata', 'mappings', + db.copy_to(metadata, 'tmp_revision_intrinsic_metadata', + ['id', 'metadata', 'mappings', 'indexer_configuration_id'], cur) - db.revision_metadata_add_from_temp(conflict_update, cur) + db.revision_intrinsic_metadata_add_from_temp(conflict_update, cur) - @remote_api_endpoint('revision_metadata/delete') + @remote_api_endpoint('revision_intrinsic_metadata/delete') @db_transaction() - def revision_metadata_delete(self, entries, db=None, 
cur=None): + def revision_intrinsic_metadata_delete(self, entries, db=None, cur=None): """Remove revision metadata from the storage. Args: @@ -678,7 +677,7 @@ - **indexer_configuration_id** (int): tool used to compute metadata """ - db.revision_metadata_delete(entries, cur) + db.revision_intrinsic_metadata_delete(entries, cur) @remote_api_endpoint('origin_intrinsic_metadata') @db_transaction_generator() @@ -691,7 +690,7 @@ Yields: list: dictionaries with the following keys: - - **origin_id** (int) + - **id** (int) - **metadata** (str): associated metadata - **tool** (dict): tool used to compute metadata - **mappings** (List[str]): list of mappings used to translate @@ -712,7 +711,7 @@ Args: metadata (iterable): dictionaries with keys: - - **origin_id**: origin identifier + - **id**: origin identifier - **from_revision**: sha1 id of the revision used to generate these metadata. - **metadata**: arbitrary dict @@ -724,13 +723,13 @@ or skip duplicates (false, the default) """ - _check_duplicates(metadata, 'origin_id') - metadata.sort(key=lambda m: m['origin_id']) + _check_id_duplicates(metadata) + metadata.sort(key=lambda m: m['id']) db.mktemp_origin_intrinsic_metadata(cur) db.copy_to(metadata, 'tmp_origin_intrinsic_metadata', - ['origin_id', 'metadata', 'indexer_configuration_id', + ['id', 'metadata', 'indexer_configuration_id', 'from_revision', 'mappings'], cur) db.origin_intrinsic_metadata_add_from_temp(conflict_update, cur) diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py --- a/swh/indexer/storage/db.py +++ b/swh/indexer/storage/db.py @@ -281,7 +281,7 @@ cur=cur) content_metadata_cols = [ - 'id', 'translated_metadata', + 'id', 'metadata', 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] @stored_procedure('swh_mktemp_content_metadata') @@ -295,44 +295,48 @@ yield from self._get_from_list( 'content_metadata', ids, self.content_metadata_cols, cur=cur) - revision_metadata_hash_keys = [ + revision_intrinsic_metadata_hash_keys = [ 'id', 
'indexer_configuration_id'] - def revision_metadata_missing_from_list(self, metadata, cur=None): + def revision_intrinsic_metadata_missing_from_list( + self, metadata, cur=None): """List missing metadata. """ yield from self._missing_from_list( - 'revision_metadata', metadata, self.revision_metadata_hash_keys, - cur=cur) + 'revision_intrinsic_metadata', metadata, + self.revision_intrinsic_metadata_hash_keys, cur=cur) - revision_metadata_cols = [ - 'id', 'translated_metadata', 'mappings', + revision_intrinsic_metadata_cols = [ + 'id', 'metadata', 'mappings', 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] - @stored_procedure('swh_mktemp_revision_metadata') - def mktemp_revision_metadata(self, cur=None): pass + @stored_procedure('swh_mktemp_revision_intrinsic_metadata') + def mktemp_revision_intrinsic_metadata(self, cur=None): pass - def revision_metadata_add_from_temp(self, conflict_update, cur=None): - self._cursor(cur).execute("SELECT swh_revision_metadata_add(%s)", - (conflict_update, )) + def revision_intrinsic_metadata_add_from_temp( + self, conflict_update, cur=None): + self._cursor(cur).execute( + "SELECT swh_revision_intrinsic_metadata_add(%s)", + (conflict_update, )) - def revision_metadata_delete( + def revision_intrinsic_metadata_delete( self, entries, cur=None): cur = self._cursor(cur) cur.execute( - "DELETE from revision_metadata " + "DELETE from revision_intrinsic_metadata " "WHERE (id, indexer_configuration_id) IN " " (VALUES %s)" % (', '.join('%s' for _ in entries)), tuple((e['id'], e['indexer_configuration_id']) for e in entries),) - def revision_metadata_get_from_list(self, ids, cur=None): + def revision_intrinsic_metadata_get_from_list(self, ids, cur=None): yield from self._get_from_list( - 'revision_metadata', ids, self.revision_metadata_cols, cur=cur) + 'revision_intrinsic_metadata', ids, + self.revision_intrinsic_metadata_cols, cur=cur) origin_intrinsic_metadata_cols = [ - 'origin_id', 'metadata', 'from_revision', 'mappings', + 
'id', 'metadata', 'from_revision', 'mappings', 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] origin_intrinsic_metadata_regconfig = 'pg_catalog.simple' @@ -357,23 +361,25 @@ cur = self._cursor(cur) cur.execute( "DELETE from origin_intrinsic_metadata " - "WHERE (origin_id, indexer_configuration_id) IN" + "WHERE (id, indexer_configuration_id) IN" " (VALUES %s)" % (', '.join('%s' for _ in entries)), - tuple((e['origin_id'], e['indexer_configuration_id']) + tuple((e['id'], e['indexer_configuration_id']) for e in entries),) def origin_intrinsic_metadata_get_from_list(self, orig_ids, cur=None): yield from self._get_from_list( 'origin_intrinsic_metadata', orig_ids, self.origin_intrinsic_metadata_cols, cur=cur, - id_col='origin_id') + id_col='id') def origin_intrinsic_metadata_search_fulltext(self, terms, *, limit, cur): regconfig = self.origin_intrinsic_metadata_regconfig tsquery_template = ' && '.join("plainto_tsquery('%s', %%s)" % regconfig for _ in terms) tsquery_args = [(term,) for term in terms] - keys = map(self._convert_key, self.origin_intrinsic_metadata_cols) + keys = (self._convert_key(col, 'oim') for col in + self.origin_intrinsic_metadata_cols) + query = ("SELECT {keys} FROM origin_intrinsic_metadata AS oim " "INNER JOIN indexer_configuration AS i " "ON oim.indexer_configuration_id=i.id " @@ -390,10 +396,10 @@ def origin_intrinsic_metadata_search_by_producer( self, start, end, limit, ids_only, mappings, tool_ids, cur): if ids_only: - keys = 'oim.origin_id' + keys = 'oim.id' else: - keys = ', '.join(map(self._convert_key, - self.origin_intrinsic_metadata_cols)) + keys = ', '.join((self._convert_key(col, 'oim') for col in + self.origin_intrinsic_metadata_cols)) query_parts = [ "SELECT %s" % keys, "FROM origin_intrinsic_metadata AS oim", @@ -404,10 +410,10 @@ where = [] if start: - where.append('oim.origin_id >= %s') + where.append('oim.id >= %s') args.append(start) if end: - where.append('oim.origin_id <= %s') + where.append('oim.id <= %s') 
args.append(end) if mappings is not None: where.append('oim.mappings && %s') diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py --- a/swh/indexer/storage/in_memory.py +++ b/swh/indexer/storage/in_memory.py @@ -196,7 +196,7 @@ self._content_ctags = SubStorage(self._tools) self._licenses = SubStorage(self._tools) self._content_metadata = SubStorage(self._tools) - self._revision_metadata = SubStorage(self._tools) + self._revision_intrinsic_metadata = SubStorage(self._tools) self._origin_intrinsic_metadata = SubStorage(self._tools) def content_mimetype_missing(self, mimetypes): @@ -513,7 +513,7 @@ dictionaries with the following keys: - **id** (bytes) - - **translated_metadata** (str): associated metadata + - **metadata** (str): associated metadata - **tool** (dict): tool used to compute metadata """ @@ -526,7 +526,7 @@ metadata (iterable): dictionaries with keys: - **id**: sha1 - - **translated_metadata**: arbitrary dict + - **metadata**: arbitrary dict - **indexer_configuration_id**: tool used to compute the results @@ -538,7 +538,7 @@ raise TypeError('identifiers must be bytes.') self._content_metadata.add(metadata, conflict_update) - def revision_metadata_missing(self, metadata): + def revision_intrinsic_metadata_missing(self, metadata): """List metadata missing from storage. Args: @@ -552,9 +552,9 @@ missing ids """ - yield from self._revision_metadata.missing(metadata) + yield from self._revision_intrinsic_metadata.missing(metadata) - def revision_metadata_get(self, ids): + def revision_intrinsic_metadata_get(self, ids): """Retrieve revision metadata per id. 
Args: @@ -564,22 +564,22 @@ dictionaries with the following keys: - **id** (bytes) - - **translated_metadata** (str): associated metadata + - **metadata** (str): associated metadata - **tool** (dict): tool used to compute metadata - **mappings** (List[str]): list of mappings used to translate these metadata """ - yield from self._revision_metadata.get(ids) + yield from self._revision_intrinsic_metadata.get(ids) - def revision_metadata_add(self, metadata, conflict_update=False): + def revision_intrinsic_metadata_add(self, metadata, conflict_update=False): """Add metadata not present in storage. Args: metadata (iterable): dictionaries with keys: - **id**: sha1_git of revision - - **translated_metadata**: arbitrary dict + - **metadata**: arbitrary dict - **indexer_configuration_id**: tool used to compute metadata - **mappings** (List[str]): list of mappings used to translate these metadata @@ -590,9 +590,9 @@ """ if not all(isinstance(x['id'], bytes) for x in metadata): raise TypeError('identifiers must be bytes.') - self._revision_metadata.add(metadata, conflict_update) + self._revision_intrinsic_metadata.add(metadata, conflict_update) - def revision_metadata_delete(self, entries): + def revision_intrinsic_metadata_delete(self, entries): """Remove revision metadata from the storage. Args: @@ -600,7 +600,7 @@ - **revision** (int): origin identifier - **id** (int): tool used to compute metadata """ - self._revision_metadata.delete(entries) + self._revision_intrinsic_metadata.delete(entries) def origin_intrinsic_metadata_get(self, ids): """Retrieve origin metadata per id. 
@@ -611,16 +611,14 @@ Yields: list: dictionaries with the following keys: - - **origin_id** (int) - - **translated_metadata** (str): associated metadata + - **id** (int) + - **metadata** (str): associated metadata - **tool** (dict): tool used to compute metadata - **mappings** (List[str]): list of mappings used to translate these metadata """ - for item in self._origin_intrinsic_metadata.get(ids): - item['origin_id'] = item.pop('id') - yield item + yield from self._origin_intrinsic_metadata.get(ids) def origin_intrinsic_metadata_add(self, metadata, conflict_update=False): @@ -629,7 +627,7 @@ Args: metadata (iterable): dictionaries with keys: - - **origin_id**: origin identifier + - **id**: origin identifier - **from_revision**: sha1 id of the revision used to generate these metadata. - **metadata**: arbitrary dict @@ -641,29 +639,18 @@ or skip duplicates (false, the default) """ - - items = [] - for item in metadata: - item = item.copy() - item['id'] = item.pop('origin_id') - items.append(item) - self._origin_intrinsic_metadata.add(items, conflict_update) + self._origin_intrinsic_metadata.add(metadata, conflict_update) def origin_intrinsic_metadata_delete(self, entries): """Remove origin metadata from the storage. 
Args: entries (dict): dictionaries with the following keys: - - **origin_id** (int): origin identifier + - **id** (int): origin identifier - **indexer_configuration_id** (int): tool used to compute metadata """ - items = [] - for entry in entries: - item = entry.copy() - item['id'] = item.pop('origin_id') - items.append(item) - self._origin_intrinsic_metadata.delete(items) + self._origin_intrinsic_metadata.delete(entries) def origin_intrinsic_metadata_search_fulltext( self, conjunction, limit=100): @@ -712,8 +699,6 @@ results.sort(key=operator.itemgetter(0), # Don't try to order 'data' reverse=True) for (rank_, result) in results[:limit]: - result = result.copy() - result['origin_id'] = result.pop('id') yield result def origin_intrinsic_metadata_search_by_producer( @@ -759,8 +744,6 @@ if ids_only: yield entry['id'] else: - entry = entry.copy() - entry['origin_id'] = entry.pop('id') yield entry nb_results += 1 diff --git a/swh/indexer/tests/conftest.py b/swh/indexer/tests/conftest.py --- a/swh/indexer/tests/conftest.py +++ b/swh/indexer/tests/conftest.py @@ -12,7 +12,7 @@ from .utils import fill_storage, fill_obj_storage -TASK_NAMES = ['revision_metadata', 'origin_intrinsic_metadata'] +TASK_NAMES = ['revision_intrinsic_metadata', 'origin_intrinsic_metadata'] @pytest.fixture diff --git a/swh/indexer/tests/storage/test_converters.py b/swh/indexer/tests/storage/test_converters.py --- a/swh/indexer/tests/storage/test_converters.py +++ b/swh/indexer/tests/storage/test_converters.py @@ -169,12 +169,12 @@ 'tool_name': 'some-toolname', 'tool_version': 'some-toolversion', 'tool_configuration': {}, - 'translated_metadata': b'translated_metadata', + 'metadata': b'metadata', } expected_metadata = { 'id': b'some-id', - 'translated_metadata': b'translated_metadata', + 'metadata': b'metadata', 'tool': { 'id': 20, 'name': 'some-toolname', diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py --- 
a/swh/indexer/tests/storage/test_storage.py +++ b/swh/indexer/tests/storage/test_storage.py @@ -931,7 +931,7 @@ endpoint_type='content_metadata', tool_name='swh-metadata-detector', example_data1={ - 'translated_metadata': { + 'metadata': { 'other': {}, 'codeRepository': { 'type': 'git', @@ -943,7 +943,7 @@ }, }, example_data2={ - 'translated_metadata': { + 'metadata': { 'other': {}, 'name': 'test_metadata', 'version': '0.0.1' @@ -951,21 +951,21 @@ }, ) - # revision_metadata tests + # revision_intrinsic_metadata tests ( - test_revision_metadata_missing, - test_revision_metadata_add__drop_duplicate, - test_revision_metadata_add__update_in_place_duplicate, - test_revision_metadata_add__update_in_place_deadlock, - test_revision_metadata_add__duplicate_twice, - test_revision_metadata_get, - test_revision_metadata_delete, - test_revision_metadata_delete_nonexisting, + test_revision_intrinsic_metadata_missing, + test_revision_intrinsic_metadata_add__drop_duplicate, + test_revision_intrinsic_metadata_add__update_in_place_duplicate, + test_revision_intrinsic_metadata_add__update_in_place_deadlock, + test_revision_intrinsic_metadata_add__duplicate_twice, + test_revision_intrinsic_metadata_get, + test_revision_intrinsic_metadata_delete, + test_revision_intrinsic_metadata_delete_nonexisting, ) = gen_generic_endpoint_tests( - endpoint_type='revision_metadata', + endpoint_type='revision_intrinsic_metadata', tool_name='swh-metadata-detector', example_data1={ - 'translated_metadata': { + 'metadata': { 'other': {}, 'codeRepository': { 'type': 'git', @@ -978,7 +978,7 @@ 'mappings': ['mapping1'], }, example_data2={ - 'translated_metadata': { + 'metadata': { 'other': {}, 'name': 'test_metadata', 'version': '0.0.1' @@ -997,12 +997,12 @@ } metadata_rev = { 'id': self.revision_id_2, - 'translated_metadata': metadata, + 'metadata': metadata, 'mappings': ['mapping1'], 'indexer_configuration_id': tool_id, } metadata_origin = { - 'origin_id': self.origin_id_1, + 'id': self.origin_id_1, 
'metadata': metadata, 'indexer_configuration_id': tool_id, 'mappings': ['mapping1'], @@ -1010,7 +1010,7 @@ } # when - self.storage.revision_metadata_add([metadata_rev]) + self.storage.revision_intrinsic_metadata_add([metadata_rev]) self.storage.origin_intrinsic_metadata_add([metadata_origin]) # then @@ -1018,7 +1018,7 @@ [self.origin_id_1, 42])) expected_metadata = [{ - 'origin_id': self.origin_id_1, + 'id': self.origin_id_1, 'metadata': metadata, 'tool': self.tools['swh-metadata-detector'], 'from_revision': self.revision_id_2, @@ -1037,28 +1037,28 @@ } metadata_rev = { 'id': self.revision_id_2, - 'translated_metadata': metadata, + 'metadata': metadata, 'mappings': ['mapping1'], 'indexer_configuration_id': tool_id, } metadata_origin = { - 'origin_id': self.origin_id_1, + 'id': self.origin_id_1, 'metadata': metadata, 'indexer_configuration_id': tool_id, 'mappings': ['mapping1'], 'from_revision': self.revision_id_2, } metadata_origin2 = metadata_origin.copy() - metadata_origin2['origin_id'] = self.origin_id_2 + metadata_origin2['id'] = self.origin_id_2 # when - self.storage.revision_metadata_add([metadata_rev]) + self.storage.revision_intrinsic_metadata_add([metadata_rev]) self.storage.origin_intrinsic_metadata_add([ metadata_origin, metadata_origin2]) self.storage.origin_intrinsic_metadata_delete([ { - 'origin_id': self.origin_id_1, + 'id': self.origin_id_1, 'indexer_configuration_id': tool_id } ]) @@ -1074,7 +1074,7 @@ tool_id = self.tools['swh-metadata-detector']['id'] self.storage.origin_intrinsic_metadata_delete([ { - 'origin_id': self.origin_id_1, + 'id': self.origin_id_1, 'indexer_configuration_id': tool_id } ]) @@ -1089,12 +1089,12 @@ } metadata_rev_v1 = { 'id': self.revision_id_1, - 'translated_metadata': metadata_v1.copy(), + 'metadata': metadata_v1.copy(), 'mappings': [], 'indexer_configuration_id': tool_id, } metadata_origin_v1 = { - 'origin_id': self.origin_id_1, + 'id': self.origin_id_1, 'metadata': metadata_v1.copy(), 'indexer_configuration_id': 
tool_id, 'mappings': [], @@ -1102,7 +1102,7 @@ } # given - self.storage.revision_metadata_add([metadata_rev_v1]) + self.storage.revision_intrinsic_metadata_add([metadata_rev_v1]) self.storage.origin_intrinsic_metadata_add([metadata_origin_v1]) # when @@ -1110,7 +1110,7 @@ [self.origin_id_1, 42])) expected_metadata_v1 = [{ - 'origin_id': self.origin_id_1, + 'id': self.origin_id_1, 'metadata': metadata_v1, 'tool': self.tools['swh-metadata-detector'], 'from_revision': self.revision_id_1, @@ -1127,10 +1127,10 @@ }) metadata_rev_v2 = metadata_rev_v1.copy() metadata_origin_v2 = metadata_origin_v1.copy() - metadata_rev_v2['translated_metadata'] = metadata_v2 - metadata_origin_v2['translated_metadata'] = metadata_v2 + metadata_rev_v2['metadata'] = metadata_v2 + metadata_origin_v2['metadata'] = metadata_v2 - self.storage.revision_metadata_add([metadata_rev_v2]) + self.storage.revision_intrinsic_metadata_add([metadata_rev_v2]) self.storage.origin_intrinsic_metadata_add([metadata_origin_v2]) # then @@ -1150,12 +1150,12 @@ } metadata_rev_v1 = { 'id': self.revision_id_2, - 'translated_metadata': metadata_v1, + 'metadata': metadata_v1, 'mappings': [], 'indexer_configuration_id': tool_id, } metadata_origin_v1 = { - 'origin_id': self.origin_id_1, + 'id': self.origin_id_1, 'metadata': metadata_v1.copy(), 'indexer_configuration_id': tool_id, 'mappings': [], @@ -1163,7 +1163,7 @@ } # given - self.storage.revision_metadata_add([metadata_rev_v1]) + self.storage.revision_intrinsic_metadata_add([metadata_rev_v1]) self.storage.origin_intrinsic_metadata_add([metadata_origin_v1]) # when @@ -1172,7 +1172,7 @@ # then expected_metadata_v1 = [{ - 'origin_id': self.origin_id_1, + 'id': self.origin_id_1, 'metadata': metadata_v1, 'tool': self.tools['swh-metadata-detector'], 'from_revision': self.revision_id_2, @@ -1188,19 +1188,19 @@ }) metadata_rev_v2 = metadata_rev_v1.copy() metadata_origin_v2 = metadata_origin_v1.copy() - metadata_rev_v2['translated_metadata'] = metadata_v2 + 
metadata_rev_v2['metadata'] = metadata_v2 metadata_origin_v2['metadata'] = metadata_v2 - self.storage.revision_metadata_add([metadata_rev_v2], - conflict_update=True) - self.storage.origin_intrinsic_metadata_add([metadata_origin_v2], - conflict_update=True) + self.storage.revision_intrinsic_metadata_add( + [metadata_rev_v2], conflict_update=True) + self.storage.origin_intrinsic_metadata_add( + [metadata_origin_v2], conflict_update=True) actual_metadata = list(self.storage.origin_intrinsic_metadata_get( [self.origin_id_1])) expected_metadata_v2 = [{ - 'origin_id': self.origin_id_1, + 'id': self.origin_id_1, 'metadata': metadata_v2, 'tool': self.tools['swh-metadata-detector'], 'from_revision': self.revision_id_2, @@ -1233,7 +1233,7 @@ metadata_rev_v1 = { 'id': self.revision_id_2, - 'translated_metadata': { + 'metadata': { 'version': None, 'name': None, }, @@ -1243,7 +1243,7 @@ data_v1 = [ { - 'origin_id': id_, + 'id': id_, 'from_revision': self.revision_id_2, **example_data1, 'indexer_configuration_id': tool_id, @@ -1252,7 +1252,7 @@ ] data_v2 = [ { - 'origin_id': id_, + 'id': id_, 'from_revision': self.revision_id_2, **example_data2, 'indexer_configuration_id': tool_id, @@ -1266,7 +1266,7 @@ data_v2b = list(reversed(data_v2[0:-1])) # given - self.storage.revision_metadata_add([metadata_rev_v1]) + self.storage.revision_intrinsic_metadata_add([metadata_rev_v1]) self.storage.origin_intrinsic_metadata_add(data_v1) # when @@ -1274,7 +1274,7 @@ expected_data_v1 = [ { - 'origin_id': id_, + 'id': id_, 'from_revision': self.revision_id_2, **example_data1, 'tool': self.tools['swh-metadata-detector'], @@ -1306,7 +1306,7 @@ expected_data_v2 = [ { - 'origin_id': id_, + 'id': id_, 'from_revision': self.revision_id_2, **example_data2, 'tool': self.tools['swh-metadata-detector'], @@ -1327,12 +1327,12 @@ } metadata_rev = { 'id': self.revision_id_2, - 'translated_metadata': metadata, + 'metadata': metadata, 'mappings': ['mapping1'], 'indexer_configuration_id': tool_id, } 
metadata_origin = { - 'origin_id': self.origin_id_1, + 'id': self.origin_id_1, 'metadata': metadata, 'indexer_configuration_id': tool_id, 'mappings': ['mapping1'], @@ -1340,7 +1340,7 @@ } # when - self.storage.revision_metadata_add([metadata_rev]) + self.storage.revision_intrinsic_metadata_add([metadata_rev]) with self.assertRaises(ValueError): self.storage.origin_intrinsic_metadata_add([ @@ -1355,12 +1355,12 @@ } metadata1_rev = { 'id': self.revision_id_1, - 'translated_metadata': metadata1, + 'metadata': metadata1, 'mappings': [], 'indexer_configuration_id': tool_id, } metadata1_origin = { - 'origin_id': self.origin_id_1, + 'id': self.origin_id_1, 'metadata': metadata1, 'mappings': [], 'indexer_configuration_id': tool_id, @@ -1371,12 +1371,12 @@ } metadata2_rev = { 'id': self.revision_id_2, - 'translated_metadata': metadata2, + 'metadata': metadata2, 'mappings': [], 'indexer_configuration_id': tool_id, } metadata2_origin = { - 'origin_id': self.origin_id_2, + 'id': self.origin_id_2, 'metadata': metadata2, 'mappings': [], 'indexer_configuration_id': tool_id, @@ -1384,24 +1384,24 @@ } # when - self.storage.revision_metadata_add([metadata1_rev]) + self.storage.revision_intrinsic_metadata_add([metadata1_rev]) self.storage.origin_intrinsic_metadata_add([metadata1_origin]) - self.storage.revision_metadata_add([metadata2_rev]) + self.storage.revision_intrinsic_metadata_add([metadata2_rev]) self.storage.origin_intrinsic_metadata_add([metadata2_origin]) # then search = self.storage.origin_intrinsic_metadata_search_fulltext self.assertCountEqual( - [res['origin_id'] for res in search(['Doe'])], + [res['id'] for res in search(['Doe'])], [self.origin_id_1, self.origin_id_2]) self.assertEqual( - [res['origin_id'] for res in search(['John', 'Doe'])], + [res['id'] for res in search(['John', 'Doe'])], [self.origin_id_1]) self.assertEqual( - [res['origin_id'] for res in search(['John'])], + [res['id'] for res in search(['John'])], [self.origin_id_1]) self.assertEqual( - 
[res['origin_id'] for res in search(['John', 'Jane'])], + [res['id'] for res in search(['John', 'Jane'])], []) def test_origin_intrinsic_metadata_search_fulltext_rank(self): @@ -1421,12 +1421,12 @@ } metadata1_rev = { 'id': self.revision_id_1, - 'translated_metadata': metadata1, + 'metadata': metadata1, 'mappings': [], 'indexer_configuration_id': tool_id, } metadata1_origin = { - 'origin_id': self.origin_id_1, + 'id': self.origin_id_1, 'metadata': metadata1, 'mappings': [], 'indexer_configuration_id': tool_id, @@ -1440,12 +1440,12 @@ } metadata2_rev = { 'id': self.revision_id_2, - 'translated_metadata': metadata2, + 'metadata': metadata2, 'mappings': [], 'indexer_configuration_id': tool_id, } metadata2_origin = { - 'origin_id': self.origin_id_2, + 'id': self.origin_id_2, 'metadata': metadata2, 'mappings': [], 'indexer_configuration_id': tool_id, @@ -1453,27 +1453,27 @@ } # when - self.storage.revision_metadata_add([metadata1_rev]) + self.storage.revision_intrinsic_metadata_add([metadata1_rev]) self.storage.origin_intrinsic_metadata_add([metadata1_origin]) - self.storage.revision_metadata_add([metadata2_rev]) + self.storage.revision_intrinsic_metadata_add([metadata2_rev]) self.storage.origin_intrinsic_metadata_add([metadata2_origin]) # then search = self.storage.origin_intrinsic_metadata_search_fulltext self.assertEqual( - [res['origin_id'] for res in search(['Doe'])], + [res['id'] for res in search(['Doe'])], [self.origin_id_1, self.origin_id_2]) self.assertEqual( - [res['origin_id'] for res in search(['Doe'], limit=1)], + [res['id'] for res in search(['Doe'], limit=1)], [self.origin_id_1]) self.assertEqual( - [res['origin_id'] for res in search(['John'])], + [res['id'] for res in search(['John'])], [self.origin_id_1]) self.assertEqual( - [res['origin_id'] for res in search(['Jane'])], + [res['id'] for res in search(['Jane'])], [self.origin_id_2, self.origin_id_1]) self.assertEqual( - [res['origin_id'] for res in search(['John', 'Jane'])], + [res['id'] for res in 
search(['John', 'Jane'])], [self.origin_id_1]) def _fill_origin_intrinsic_metadata(self): @@ -1486,12 +1486,12 @@ } metadata1_rev = { 'id': self.revision_id_1, - 'translated_metadata': metadata1, + 'metadata': metadata1, 'mappings': ['npm'], 'indexer_configuration_id': tool1_id, } metadata1_origin = { - 'origin_id': self.origin_id_1, + 'id': self.origin_id_1, 'metadata': metadata1, 'mappings': ['npm'], 'indexer_configuration_id': tool1_id, @@ -1503,12 +1503,12 @@ } metadata2_rev = { 'id': self.revision_id_2, - 'translated_metadata': metadata2, + 'metadata': metadata2, 'mappings': ['npm', 'gemspec'], 'indexer_configuration_id': tool2_id, } metadata2_origin = { - 'origin_id': self.origin_id_2, + 'id': self.origin_id_2, 'metadata': metadata2, 'mappings': ['npm', 'gemspec'], 'indexer_configuration_id': tool2_id, @@ -1519,23 +1519,23 @@ } metadata3_rev = { 'id': self.revision_id_3, - 'translated_metadata': metadata3, + 'metadata': metadata3, 'mappings': ['npm', 'gemspec'], 'indexer_configuration_id': tool2_id, } metadata3_origin = { - 'origin_id': self.origin_id_3, + 'id': self.origin_id_3, 'metadata': metadata3, 'mappings': ['pkg-info'], 'indexer_configuration_id': tool2_id, 'from_revision': self.revision_id_3, } - self.storage.revision_metadata_add([metadata1_rev]) + self.storage.revision_intrinsic_metadata_add([metadata1_rev]) self.storage.origin_intrinsic_metadata_add([metadata1_origin]) - self.storage.revision_metadata_add([metadata2_rev]) + self.storage.revision_intrinsic_metadata_add([metadata2_rev]) self.storage.origin_intrinsic_metadata_add([metadata2_origin]) - self.storage.revision_metadata_add([metadata3_rev]) + self.storage.revision_intrinsic_metadata_add([metadata3_rev]) self.storage.origin_intrinsic_metadata_add([metadata3_origin]) def test_origin_intrinsic_metadata_search_by_producer(self): @@ -1597,7 +1597,7 @@ # test ids_only=False self.assertEqual(list(endpoint(mappings=['gemspec'])), [{ - 'origin_id': self.origin_id_2, + 'id': self.origin_id_2, 
'metadata': { '@context': 'foo', 'author': 'Jane Doe', diff --git a/swh/indexer/tests/tasks.py b/swh/indexer/tests/tasks.py --- a/swh/indexer/tests/tasks.py +++ b/swh/indexer/tests/tasks.py @@ -41,7 +41,7 @@ @app.task -def revision_metadata(*args, **kwargs): +def revision_intrinsic_metadata(*args, **kwargs): indexer = RevisionMetadataTestIndexer() indexer.run(*args, **kwargs) print('REV RESULT=', indexer.results) diff --git a/swh/indexer/tests/test_cli.py b/swh/indexer/tests/test_cli.py --- a/swh/indexer/tests/test_cli.py +++ b/swh/indexer/tests/test_cli.py @@ -41,7 +41,7 @@ origin_metadata = [ { - 'origin_id': origin_id, + 'id': origin_id, 'from_revision': hash_to_bytes('abcd{:0>4}'.format(origin_id)), 'indexer_configuration_id': tools[origin_id % 2]['id'], 'metadata': {'name': 'origin %d' % origin_id}, @@ -59,7 +59,7 @@ for origin_id in range(nb_rows) ] - idx_storage.revision_metadata_add(revision_metadata) + idx_storage.revision_intrinsic_metadata_add(revision_metadata) idx_storage.origin_intrinsic_metadata_add(origin_metadata) return [tool['id'] for tool in tools] diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -251,7 +251,7 @@ sha1s)) expected_results = [{ - 'translated_metadata': { + 'metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'codeRepository': @@ -262,7 +262,7 @@ }, 'id': hash_to_bytes('26a9f72a7c87cc9205725cfd879f514ff4f3d8d5'), }, { - 'translated_metadata': { + 'metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'issueTracker': @@ -1111,7 +1111,7 @@ metadata_indexer.idx_storage.content_metadata_add([{ 'indexer_configuration_id': tool['id'], 'id': b'cde', - 'translated_metadata': YARN_PARSER_METADATA, + 'metadata': YARN_PARSER_METADATA, }]) sha1_gits = [ @@ -1119,13 +1119,14 @@ ] metadata_indexer.run(sha1_gits, 'update-dups') - 
results = list(metadata_indexer.idx_storage.revision_metadata_get( - sha1_gits)) + results = list( + metadata_indexer.idx_storage. + revision_intrinsic_metadata_get(sha1_gits)) expected_results = [{ 'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), 'tool': TRANSLATOR_TOOL, - 'translated_metadata': YARN_PARSER_METADATA, + 'metadata': YARN_PARSER_METADATA, 'mappings': ['npm'], }] diff --git a/swh/indexer/tests/test_origin_head.py b/swh/indexer/tests/test_origin_head.py --- a/swh/indexer/tests/test_origin_head.py +++ b/swh/indexer/tests/test_origin_head.py @@ -18,7 +18,7 @@ 'configuration': {}, }, 'tasks': { - 'revision_metadata': None, + 'revision_intrinsic_metadata': None, 'origin_intrinsic_metadata': None, } } diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py --- a/swh/indexer/tests/test_origin_metadata.py +++ b/swh/indexer/tests/test_origin_metadata.py @@ -26,17 +26,18 @@ rev_metadata = { 'id': rev_id, - 'translated_metadata': YARN_PARSER_METADATA, + 'metadata': YARN_PARSER_METADATA, 'mappings': ['npm'], } origin_metadata = { - 'origin_id': origin['id'], + 'id': origin['id'], 'from_revision': rev_id, 'metadata': YARN_PARSER_METADATA, 'mappings': ['npm'], } - results = list(indexer.idx_storage.revision_metadata_get([rev_id])) + results = list( + indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) for result in results: del result['tool'] assert results == [rev_metadata] @@ -62,7 +63,8 @@ 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') - results = list(indexer.idx_storage.revision_metadata_get([rev_id])) + results = list( + indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert len(results) == 1 results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ @@ -112,17 +114,18 @@ rev_metadata = { 'id': rev_id, - 'translated_metadata': YARN_PARSER_METADATA, + 'metadata': YARN_PARSER_METADATA, 'mappings': 
['npm'], } origin_metadata = { - 'origin_id': origin2['id'], + 'id': origin2['id'], 'from_revision': rev_id, 'metadata': YARN_PARSER_METADATA, 'mappings': ['npm'], } - results = list(indexer.idx_storage.revision_metadata_get([rev_id])) + results = list( + indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) for result in results: del result['tool'] assert results == [rev_metadata] @@ -151,7 +154,8 @@ assert origin1['id'] != origin2['id'] rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') - results = list(indexer.idx_storage.revision_metadata_get([rev_id])) + results = list( + indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert len(results) == 1 results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ @@ -172,7 +176,8 @@ 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') - results = list(indexer.idx_storage.revision_metadata_get([rev_id])) + results = list( + indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert results == [] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ @@ -191,7 +196,8 @@ 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') - results = list(indexer.idx_storage.revision_metadata_get([rev_id])) + results = list( + indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert results != [] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ @@ -202,7 +208,8 @@ b'foo.json'): indexer.run(["git+https://github.com/librariesio/yarn-parser"]) - results = list(indexer.idx_storage.revision_metadata_get([rev_id])) + results = list( + indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert results == [] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([