diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -2,5 +2,5 @@
 swh.model >= 0.0.15
 swh.objstorage >= 0.0.28
 swh.scheduler >= 0.0.47
-swh.storage >= 0.0.155, < 0.0.156
+swh.storage >= 0.0.156
 swh.journal >= 0.0.17
diff --git a/sql/upgrades/127.sql b/sql/upgrades/127.sql
new file mode 100644
--- /dev/null
+++ b/sql/upgrades/127.sql
@@ -0,0 +1,63 @@
+-- SWH Indexer DB schema upgrade
+-- from_version: 126
+-- to_version: 127
+-- description: Remove the origin_intrinsic_metadata.origin_url column and
+--   replace the id column with its former content
+
+insert into dbversion(version, release, description)
+values(127, now(), 'Work In Progress');
+
+-- replace the id column with origin_url
+alter table origin_intrinsic_metadata
+  drop constraint origin_intrinsic_metadata_indexer_configuration_id_fkey;
+alter table origin_intrinsic_metadata
+  drop constraint origin_intrinsic_metadata_pkey;
+alter table origin_intrinsic_metadata
+  drop column id;
+alter table origin_intrinsic_metadata
+  rename column origin_url to id;
+comment on column origin_intrinsic_metadata.id is 'url of the origin';
+
+-- replace functions that operate on this table
+create or replace function swh_origin_intrinsic_metadata_add(
+        conflict_update boolean)
+    returns void
+    language plpgsql
+as $$
+begin
+    perform swh_origin_intrinsic_metadata_compute_tsvector();
+    if conflict_update then
+      insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings)
+      select id, metadata, indexer_configuration_id, from_revision,
+             metadata_tsvector, mappings
+        from tmp_origin_intrinsic_metadata
+      on conflict(id, indexer_configuration_id)
+      do update set
+          metadata = excluded.metadata,
+          metadata_tsvector = excluded.metadata_tsvector,
+          mappings = excluded.mappings,
+          from_revision = excluded.from_revision;
+
+    else
+        insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings)
+        select id, metadata, indexer_configuration_id, from_revision,
+               metadata_tsvector, mappings
+        from tmp_origin_intrinsic_metadata
+        on conflict(id, indexer_configuration_id)
+        do nothing;
+    end if;
+    return;
+end
+$$;
+comment on function swh_origin_intrinsic_metadata_add(boolean) IS 'Add new origin intrinsic metadata';
+
+-- recreate indexes/constraints on this table
+create unique index origin_intrinsic_metadata_pkey
+  on origin_intrinsic_metadata(id, indexer_configuration_id);
+alter table origin_intrinsic_metadata
+  add primary key using index origin_intrinsic_metadata_pkey;
+
+alter table origin_intrinsic_metadata
+  add constraint origin_intrinsic_metadata_indexer_configuration_id_fkey
+    foreign key (indexer_configuration_id)
+    references indexer_configuration(id)
+  not valid;
+alter table origin_intrinsic_metadata
+  validate constraint origin_intrinsic_metadata_indexer_configuration_id_fkey;
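
Note on the migration above: the rewritten swh_origin_intrinsic_metadata_add keeps the usual upsert semantics, now keyed on (id, indexer_configuration_id) with id holding the origin URL. A minimal Python sketch of the same conflict_update logic over a plain dict; the function and variable names are illustrative, not part of this patch:

# Sketch only: mirrors the two "on conflict" branches of the plpgsql above.
def upsert_origin_intrinsic_metadata(store, rows, conflict_update):
    """store: dict mapping (origin_url, tool_id) -> metadata row."""
    for row in rows:
        key = (row['id'], row['indexer_configuration_id'])  # id is a URL now
        if key not in store or conflict_update:
            store[key] = row  # insert, or "on conflict ... do update"
        # else: "on conflict ... do nothing"
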
diff --git a/swh/indexer/cli.py b/swh/indexer/cli.py
--- a/swh/indexer/cli.py
+++ b/swh/indexer/cli.py
@@ -145,7 +145,7 @@


 def list_origins_by_producer(idx_storage, mappings, tool_ids):
-    start = 0
+    start = ''
     limit = 10000
     while True:
         origins = list(
@@ -154,7 +154,7 @@
                 mappings=mappings or None, tool_ids=tool_ids or None))
         if not origins:
             break
-        start = origins[-1]+1
+        start = origins[-1] + '\x00'  # first possible string after this
        yield from origins
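
Why the NUL byte works: '\x00' is the smallest possible character, so for any string s, s + '\x00' is the first string that sorts strictly after s. It plays the role origins[-1] + 1 played for integer ids, letting the closed [start, end] interval paginate without skipping or repeating URLs. A self-contained sketch of the same loop over a sorted list, standing in for the storage endpoint (names are illustrative):

import bisect

def paginate(sorted_urls, limit):
    """Yield sorted_urls page by page using the NUL-byte cursor."""
    start = ''
    while True:
        i = bisect.bisect_left(sorted_urls, start)  # first url >= start
        page = sorted_urls[i:i + limit]
        if not page:
            break
        yield from page
        start = page[-1] + '\x00'  # first possible string after page[-1]
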
diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -535,15 +535,14 @@
     """
     def run(self, origin_urls, policy_update='update-dups',
             next_step=None, **kwargs):
-        """Given a list of origin ids:
+        """Given a list of origin urls:

         - retrieve origins from storage
         - execute the indexing computations
         - store the results (according to policy_update)

         Args:
-            ids ([Union[int, Tuple[str, bytes]]]): list of origin ids or
-                (type, url) tuples.
+            origin_urls ([str]): list of origin urls.
             policy_update (str): either 'update-dups' or 'ignore-dups' to
                 respectively update duplicates (default) or ignore them
             next_step (dict): a dict in the form expected by
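
With this change callers pass origin URLs directly to run(). A hypothetical invocation, borrowing REVISION_METADATA_CONFIG from the test suite further down (the import paths are assumptions based on this repository's layout):

from swh.indexer.metadata import OriginMetadataIndexer
from swh.indexer.tests.test_metadata import REVISION_METADATA_CONFIG

indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
# origin urls, not numeric ids, as of this patch:
indexer.run(['https://github.com/librariesio/yarn-parser'],
            policy_update='update-dups')
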
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -286,7 +286,6 @@
         for origin in origins:
             head_result = self.origin_head_indexer.index(origin['url'])
             if head_result:
-                head_result['origin_id'] = origin['id']
                 origins_with_head.append(origin)
                 head_rev_ids.append(head_result['revision_id'])
@@ -305,8 +304,7 @@
             rev_metadata = self.revision_metadata_indexer.index(rev)
             orig_metadata = {
                 'from_revision': rev_metadata['id'],
-                'id': origin['id'],
-                'origin_url': origin['url'],
+                'id': origin['url'],
                 'metadata': rev_metadata['metadata'],
                 'mappings': rev_metadata['mappings'],
                 'indexer_configuration_id':
diff --git a/swh/indexer/sql/30-swh-schema.sql b/swh/indexer/sql/30-swh-schema.sql
--- a/swh/indexer/sql/30-swh-schema.sql
+++ b/swh/indexer/sql/30-swh-schema.sql
@@ -14,7 +14,7 @@
 );

 insert into dbversion(version, release, description)
-      values(126, now(), 'Work In Progress');
+      values(127, now(), 'Work In Progress');

 -- Computing metadata on sha1's contents
 -- a SHA1 checksum (not necessarily originating from Git)
@@ -129,8 +129,7 @@
 comment on column revision_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)';

 create table origin_intrinsic_metadata(
-  id bigserial not null,
-  origin_url text,
+  id text not null,  -- origin url
   metadata jsonb,
   indexer_configuration_id bigint not null,
   from_revision sha1_git not null,
@@ -139,7 +138,7 @@
 );

 comment on table origin_intrinsic_metadata is 'keeps intrinsic metadata for an origin';
-comment on column origin_intrinsic_metadata.id is 'the entry id in origin';
+comment on column origin_intrinsic_metadata.id is 'url of the origin';
 comment on column origin_intrinsic_metadata.metadata is 'metadata extracted from a revision';
 comment on column origin_intrinsic_metadata.indexer_configuration_id is 'tool used to generate this metadata';
 comment on column origin_intrinsic_metadata.from_revision is 'sha1 of the revision this metadata was copied from.';
diff --git a/swh/indexer/sql/40-swh-func.sql b/swh/indexer/sql/40-swh-func.sql
--- a/swh/indexer/sql/40-swh-func.sql
+++ b/swh/indexer/sql/40-swh-func.sql
@@ -413,8 +413,8 @@
 begin
     perform swh_origin_intrinsic_metadata_compute_tsvector();
     if conflict_update then
-      insert into origin_intrinsic_metadata (id, origin_url, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings)
-      select id, origin_url, metadata, indexer_configuration_id, from_revision,
+      insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings)
+      select id, metadata, indexer_configuration_id, from_revision,
             metadata_tsvector, mappings
        from tmp_origin_intrinsic_metadata
       on conflict(id, indexer_configuration_id)
@@ -422,12 +422,11 @@
            metadata = excluded.metadata,
            metadata_tsvector = excluded.metadata_tsvector,
            mappings = excluded.mappings,
-           origin_url = excluded.origin_url,
            from_revision = excluded.from_revision;

     else
-        insert into origin_intrinsic_metadata (id, origin_url, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings)
-        select id, origin_url, metadata, indexer_configuration_id, from_revision,
+        insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings)
+        select id, metadata, indexer_configuration_id, from_revision,
               metadata_tsvector, mappings
        from tmp_origin_intrinsic_metadata
        on conflict(id, indexer_configuration_id)
@@ -453,4 +452,3 @@
        set metadata_tsvector = to_tsvector('pg_catalog.simple', metadata);
 end
 $$;
-
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -692,8 +692,7 @@
         Yields:
             list: dictionaries with the following keys:

-                - **id** (int): (legacy) origin identifier
-                - **origin_url** (str)
+                - **id** (str): origin url
                 - **from_revision** (bytes): which revision this metadata
                   was extracted from
                 - **metadata** (str): associated metadata
@@ -716,8 +715,7 @@
         Args:
             metadata (iterable): dictionaries with keys:

-                - **id**: legacy origin identifier
-                - **origin_url**
+                - **id**: origin url
                 - **from_revision**: sha1 id of the revision used to generate
                   these metadata.
                 - **metadata**: arbitrary dict
@@ -735,7 +733,7 @@
         db.mktemp_origin_intrinsic_metadata(cur)

         db.copy_to(metadata, 'tmp_origin_intrinsic_metadata',
-                   ['id', 'origin_url', 'metadata',
+                   ['id', 'metadata',
                    'indexer_configuration_id', 'from_revision',
                    'mappings'],
                   cur)
@@ -749,7 +747,7 @@
         Args:
             entries (dict): dictionaries with the following keys:

-                - **id** (int): origin identifier
+                - **id** (str): origin url
                 - **indexer_configuration_id** (int): tool used to compute
                   metadata
         """
@@ -768,8 +766,7 @@
         Yields:
             list: dictionaries with the following keys:

-                - **id** (int): legacy origin identifier
-                - **origin_url** (str)
+                - **id** (str): origin url
                 - **from_revision**: sha1 id of the revision used to generate
                   these metadata.
                 - **metadata** (str): associated metadata
@@ -786,17 +783,17 @@
     @remote_api_endpoint('origin_intrinsic_metadata/search/by_producer')
     @db_transaction_generator()
     def origin_intrinsic_metadata_search_by_producer(
-            self, start=0, end=None, limit=100, ids_only=False,
+            self, start='', end=None, limit=100, ids_only=False,
             mappings=None, tool_ids=None, db=None, cur=None):
         """Returns the list of origins whose metadata contain all the terms.

         Args:
-            start (int): The minimum origin id to return
-            end (int): The maximum origin id to return
+            start (str): The minimum origin url to return
+            end (str): The maximum origin url to return
             limit (int): The maximum number of results to return
-            ids_only (bool): Determines whether only origin ids are returned
-                or the content as well
+            ids_only (bool): Determines whether only origin urls are
+                returned or the content as well
             mappings (List[str]): Returns origins whose intrinsic metadata
                 were generated using at least one of these mappings.
@@ -804,8 +801,7 @@
             list: list of origin ids (int) if `ids_only=True`, else
                 dictionaries with the following keys:

-                - **id** (int): legacy origin identifier
-                - **origin_url** (str)
+                - **id** (str): origin url
                 - **from_revision**: sha1 id of the revision used to generate
                   these metadata.
                 - **metadata** (str): associated metadata
@@ -817,8 +813,8 @@
         res = db.origin_intrinsic_metadata_search_by_producer(
             start, end, limit, ids_only, mappings, tool_ids, cur)
         if ids_only:
-            for (origin_id,) in res:
-                yield origin_id
+            for (origin,) in res:
+                yield origin
         else:
             for c in res:
                 yield converters.db_to_metadata(
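
The start/end bounds above are now plain lexicographic string comparisons, which Python's built-in string ordering models directly. A sketch using the three URLs from the test suite below (the filtering expression is illustrative, not the endpoint's actual implementation):

# Lexicographic interval selection over origin urls.
urls = ['file:///dev/0/zero', 'file:///dev/1/one', 'file:///dev/2/two']
start, end = 'file:///dev/0/zero2', 'file:///dev/2/tw'
selected = [u for u in sorted(urls)
            if u >= start and (end is None or u <= end)]
assert selected == ['file:///dev/1/one']
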
diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py
--- a/swh/indexer/storage/db.py
+++ b/swh/indexer/storage/db.py
@@ -336,7 +336,7 @@
             self.revision_intrinsic_metadata_cols, cur=cur)

     origin_intrinsic_metadata_cols = [
-        'id', 'origin_url', 'metadata', 'from_revision', 'mappings',
+        'id', 'metadata', 'from_revision', 'mappings',
         'tool_id', 'tool_name', 'tool_version', 'tool_configuration']

     origin_intrinsic_metadata_regconfig = 'pg_catalog.simple'
@@ -366,9 +366,9 @@
             tuple((e['id'], e['indexer_configuration_id'])
                   for e in entries),)

-    def origin_intrinsic_metadata_get_from_list(self, orig_ids, cur=None):
+    def origin_intrinsic_metadata_get_from_list(self, ids, cur=None):
         yield from self._get_from_list(
-            'origin_intrinsic_metadata', orig_ids,
+            'origin_intrinsic_metadata', ids,
             self.origin_intrinsic_metadata_cols, cur=cur,
             id_col='id')
diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py
--- a/swh/indexer/storage/in_memory.py
+++ b/swh/indexer/storage/in_memory.py
@@ -77,7 +77,7 @@
         }

     def get_all(self):
-        yield from self.get(list(self._tools_per_id))
+        yield from self.get(self._sorted_ids)

     def get_range(self, start, end, indexer_configuration_id, limit):
         """Retrieve data within range [start, end] bound by limit.
@@ -611,8 +611,7 @@
         Yields:
             list: dictionaries with the following keys:

-                - **id** (int)
-                - **origin_url** (str)
+                - **id** (str): origin url
                 - **from_revision** (bytes): which revision this metadata
                   was extracted from
                 - **metadata** (str): associated metadata
@@ -630,8 +629,7 @@
         Args:
             metadata (iterable): dictionaries with keys:

-                - **id**: origin identifier
-                - **origin_url**
+                - **id**: origin url
                 - **from_revision**: sha1 id of the revision used to generate
                   these metadata.
                 - **metadata**: arbitrary dict
@@ -650,7 +648,7 @@
         Args:
             entries (dict): dictionaries with the following keys:

-                - **id** (int): origin identifier
+                - **id** (str): origin url
                 - **indexer_configuration_id** (int): tool used to compute
                   metadata
         """
@@ -667,8 +665,7 @@
         Yields:
             list: dictionaries with the following keys:

-                - **id** (int)
-                - **origin_url** (str)
+                - **id** (str): origin url
                 - **from_revision** (bytes): which revision this metadata
                   was extracted from
                 - **metadata** (str): associated metadata
@@ -709,14 +706,14 @@
             yield result

     def origin_intrinsic_metadata_search_by_producer(
-            self, start=0, end=None, limit=100, ids_only=False,
+            self, start='', end=None, limit=100, ids_only=False,
             mappings=None, tool_ids=None, db=None, cur=None):
         """Returns the list of origins whose metadata contain all the terms.

         Args:
-            start (int): The minimum origin id to return
-            end (int): The maximum origin id to return
+            start (str): The minimum origin url to return
+            end (str): The maximum origin url to return
             limit (int): The maximum number of results to return
             ids_only (bool): Determines whether only origin ids are returned
                 or the content as well
@@ -727,8 +724,7 @@
             list: list of origin ids (int) if `ids_only=True`, else
                 dictionaries with the following keys:

-                - **id** (int)
-                - **origin_url** (str)
+                - **id** (str): origin url
                 - **from_revision**: sha1 id of the revision used to generate
                   these metadata.
                 - **metadata** (str): associated metadata
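
The get_all change assumes the in-memory SubStorage keeps a _sorted_ids list next to its per-id dict, so iteration is deterministic and range scans over URLs stay cheap. A minimal sketch of that bookkeeping; the attribute names follow the file, the rest is guessed for illustration:

import bisect

class SubStorageSketch:
    """Assumed shape of the in-memory store: dict plus sorted key list."""
    def __init__(self):
        self._data = {}          # id -> row
        self._sorted_ids = []    # ids kept sorted for deterministic scans

    def add(self, id_, row):
        if id_ not in self._data:
            bisect.insort(self._sorted_ids, id_)  # keep keys ordered
        self._data[id_] = row

    def get(self, ids):
        for id_ in ids:
            if id_ in self._data:
                yield self._data[id_]

    def get_all(self):
        yield from self.get(self._sorted_ids)
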
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
--- a/swh/indexer/tests/storage/test_storage.py
+++ b/swh/indexer/tests/storage/test_storage.py
@@ -450,9 +450,9 @@
             '7026b7c1a2af56521e9587659012345678904321')
         self.revision_id_3 = hash_to_bytes(
             '7026b7c1a2af56521e9587659012345678904320')
-        self.origin_id_1 = 44434341
-        self.origin_id_2 = 44434342
-        self.origin_id_3 = 54974445
+        self.origin_url_1 = 'file:///dev/0/zero'  # 44434341
+        self.origin_url_2 = 'file:///dev/1/one'   # 44434342
+        self.origin_url_3 = 'file:///dev/2/two'   # 54974445

     def test_check_config(self):
         self.assertTrue(self.storage.check_config(check_write=True))
@@ -1002,8 +1002,7 @@
             'indexer_configuration_id': tool_id,
         }
         metadata_origin = {
-            'id': self.origin_id_1,
-            'origin_url': 'file:///dev/zero',
+            'id': self.origin_url_1,
             'metadata': metadata,
             'indexer_configuration_id': tool_id,
             'mappings': ['mapping1'],
@@ -1016,11 +1015,10 @@

         # then
         actual_metadata = list(self.storage.origin_intrinsic_metadata_get(
-            [self.origin_id_1, 42]))
+            [self.origin_url_1, 'no://where']))

         expected_metadata = [{
-            'id': self.origin_id_1,
-            'origin_url': 'file:///dev/zero',
+            'id': self.origin_url_1,
             'metadata': metadata,
             'tool': self.tools['swh-metadata-detector'],
             'from_revision': self.revision_id_2,
@@ -1044,15 +1042,14 @@
             'indexer_configuration_id': tool_id,
         }
         metadata_origin = {
-            'id': self.origin_id_1,
-            'origin_url': 'file:///dev/zero',
+            'id': self.origin_url_1,
             'metadata': metadata,
             'indexer_configuration_id': tool_id,
             'mappings': ['mapping1'],
             'from_revision': self.revision_id_2,
         }
         metadata_origin2 = metadata_origin.copy()
-        metadata_origin2['id'] = self.origin_id_2
+        metadata_origin2['id'] = self.origin_url_2

         # when
         self.storage.revision_intrinsic_metadata_add([metadata_rev])
@@ -1061,14 +1058,14 @@

         self.storage.origin_intrinsic_metadata_delete([
             {
-                'id': self.origin_id_1,
+                'id': self.origin_url_1,
                 'indexer_configuration_id': tool_id
             }
         ])

         # then
         actual_metadata = list(self.storage.origin_intrinsic_metadata_get(
-            [self.origin_id_1, self.origin_id_2, 42]))
+            [self.origin_url_1, self.origin_url_2, 'no://where']))
         for item in actual_metadata:
             item['indexer_configuration_id'] = item.pop('tool')['id']
         self.assertEqual(actual_metadata, [metadata_origin2])
@@ -1077,7 +1074,7 @@
         tool_id = self.tools['swh-metadata-detector']['id']
         self.storage.origin_intrinsic_metadata_delete([
             {
-                'id': self.origin_id_1,
+                'id': self.origin_url_1,
                 'indexer_configuration_id': tool_id
             }
         ])
@@ -1092,14 +1089,12 @@
         }
         metadata_rev_v1 = {
             'id': self.revision_id_1,
-            'origin_url': 'file:///dev/zero',
             'metadata': metadata_v1.copy(),
             'mappings': [],
             'indexer_configuration_id': tool_id,
         }
         metadata_origin_v1 = {
-            'id': self.origin_id_1,
-            'origin_url': 'file:///dev/zero',
+            'id': self.origin_url_1,
             'metadata': metadata_v1.copy(),
             'indexer_configuration_id': tool_id,
             'mappings': [],
@@ -1112,11 +1107,10 @@

         # when
         actual_metadata = list(self.storage.origin_intrinsic_metadata_get(
-            [self.origin_id_1, 42]))
+            [self.origin_url_1, 'no://where']))

         expected_metadata_v1 = [{
-            'id': self.origin_id_1,
-            'origin_url': 'file:///dev/zero',
+            'id': self.origin_url_1,
             'metadata': metadata_v1,
             'tool': self.tools['swh-metadata-detector'],
             'from_revision': self.revision_id_1,
@@ -1141,7 +1135,7 @@

         # then
         actual_metadata = list(self.storage.origin_intrinsic_metadata_get(
-            [self.origin_id_1]))
+            [self.origin_url_1]))

         # metadata did not change as the v2 was dropped.
         self.assertEqual(actual_metadata, expected_metadata_v1)
@@ -1161,8 +1155,7 @@
             'indexer_configuration_id': tool_id,
         }
         metadata_origin_v1 = {
-            'id': self.origin_id_1,
-            'origin_url': 'file:///dev/zero',
+            'id': self.origin_url_1,
             'metadata': metadata_v1.copy(),
             'indexer_configuration_id': tool_id,
             'mappings': [],
@@ -1175,12 +1168,11 @@

         # when
         actual_metadata = list(self.storage.origin_intrinsic_metadata_get(
-            [self.origin_id_1]))
+            [self.origin_url_1]))

         # then
         expected_metadata_v1 = [{
-            'id': self.origin_id_1,
-            'origin_url': 'file:///dev/zero',
+            'id': self.origin_url_1,
             'metadata': metadata_v1,
             'tool': self.tools['swh-metadata-detector'],
             'from_revision': self.revision_id_2,
@@ -1198,8 +1190,7 @@
         metadata_origin_v2 = metadata_origin_v1.copy()
         metadata_rev_v2['metadata'] = metadata_v2
         metadata_origin_v2 = {
-            'id': self.origin_id_1,
-            'origin_url': 'file:///dev/null',
+            'id': self.origin_url_1,
             'metadata': metadata_v2.copy(),
             'indexer_configuration_id': tool_id,
             'mappings': ['npm'],
@@ -1212,11 +1203,10 @@
             [metadata_origin_v2], conflict_update=True)

         actual_metadata = list(self.storage.origin_intrinsic_metadata_get(
-            [self.origin_id_1]))
+            [self.origin_url_1]))

         expected_metadata_v2 = [{
-            'id': self.origin_id_1,
-            'origin_url': 'file:///dev/null',
+            'id': self.origin_url_1,
             'metadata': metadata_v2,
             'tool': self.tools['swh-metadata-detector'],
             'from_revision': self.revision_id_1,
@@ -1259,8 +1249,7 @@

         data_v1 = [
             {
-                'id': id_,
-                'origin_url': 'file:///tmp/origin%d' % id_,
+                'id': 'file:///tmp/origin%d' % id_,
                 'from_revision': self.revision_id_2,
                 **example_data1,
                 'indexer_configuration_id': tool_id,
             }
@@ -1269,8 +1258,7 @@
         ]
         data_v2 = [
             {
-                'id': id_,
-                'origin_url': 'file:///tmp/origin%d' % id_,
+                'id': 'file:///tmp/origin%d' % id_,
                 'from_revision': self.revision_id_2,
                 **example_data2,
                 'indexer_configuration_id': tool_id,
@@ -1288,12 +1276,12 @@
         self.storage.origin_intrinsic_metadata_add(data_v1)

         # when
-        actual_data = list(self.storage.origin_intrinsic_metadata_get(ids))
+        origins = ['file:///tmp/origin%d' % i for i in ids]
+        actual_data = list(self.storage.origin_intrinsic_metadata_get(origins))

         expected_data_v1 = [
             {
-                'id': id_,
-                'origin_url': 'file:///tmp/origin%d' % id_,
+                'id': 'file:///tmp/origin%d' % id_,
                 'from_revision': self.revision_id_2,
                 **example_data1,
                 'tool': self.tools['swh-metadata-detector'],
@@ -1321,12 +1309,11 @@
         t1.join()
         t2.join()

-        actual_data = list(self.storage.origin_intrinsic_metadata_get(ids))
+        actual_data = list(self.storage.origin_intrinsic_metadata_get(origins))

         expected_data_v2 = [
             {
-                'id': id_,
-                'origin_url': 'file:///tmp/origin%d' % id_,
+                'id': 'file:///tmp/origin%d' % id_,
                 'from_revision': self.revision_id_2,
                 **example_data2,
                 'tool': self.tools['swh-metadata-detector'],
@@ -1352,8 +1339,7 @@
             'indexer_configuration_id': tool_id,
         }
         metadata_origin = {
-            'id': self.origin_id_1,
-            'origin_url': 'file:///dev/zero',
+            'id': self.origin_url_1,
             'metadata': metadata,
             'indexer_configuration_id': tool_id,
             'mappings': ['mapping1'],
@@ -1381,8 +1367,7 @@
             'indexer_configuration_id': tool_id,
         }
         metadata1_origin = {
-            'id': self.origin_id_1,
-            'origin_url': 'file:///dev/zero',
+            'id': self.origin_url_1,
             'metadata': metadata1,
             'mappings': [],
             'indexer_configuration_id': tool_id,
@@ -1393,14 +1378,13 @@
         }
         metadata2_rev = {
             'id': self.revision_id_2,
-            'origin_url': 'file:///dev/zero',
+            'origin': self.origin_url_1,
             'metadata': metadata2,
             'mappings': [],
             'indexer_configuration_id': tool_id,
         }
         metadata2_origin = {
-            'id': self.origin_id_2,
-            'origin_url': 'file:///dev/zero',
+            'id': self.origin_url_2,
             'metadata': metadata2,
             'mappings': [],
             'indexer_configuration_id': tool_id,
@@ -1417,13 +1401,13 @@
         search = self.storage.origin_intrinsic_metadata_search_fulltext
         self.assertCountEqual(
             [res['id'] for res in search(['Doe'])],
-            [self.origin_id_1, self.origin_id_2])
+            [self.origin_url_1, self.origin_url_2])
         self.assertEqual(
             [res['id'] for res in search(['John', 'Doe'])],
-            [self.origin_id_1])
+            [self.origin_url_1])
         self.assertEqual(
             [res['id'] for res in search(['John'])],
-            [self.origin_id_1])
+            [self.origin_url_1])
         self.assertEqual(
             [res['id'] for res in search(['John', 'Jane'])],
             [])
@@ -1450,8 +1434,7 @@
             'indexer_configuration_id': tool_id,
         }
         metadata1_origin = {
-            'id': self.origin_id_1,
-            'origin_url': 'file:///dev/zero',
+            'id': self.origin_url_1,
             'metadata': metadata1,
             'mappings': [],
             'indexer_configuration_id': tool_id,
@@ -1470,8 +1453,7 @@
             'indexer_configuration_id': tool_id,
         }
         metadata2_origin = {
-            'id': self.origin_id_2,
-            'origin_url': 'file:///dev/zero',
+            'id': self.origin_url_2,
             'metadata': metadata2,
             'mappings': [],
             'indexer_configuration_id': tool_id,
@@ -1488,19 +1470,19 @@
         search = self.storage.origin_intrinsic_metadata_search_fulltext
         self.assertEqual(
             [res['id'] for res in search(['Doe'])],
-            [self.origin_id_1, self.origin_id_2])
+            [self.origin_url_1, self.origin_url_2])
         self.assertEqual(
             [res['id'] for res in search(['Doe'], limit=1)],
-            [self.origin_id_1])
+            [self.origin_url_1])
         self.assertEqual(
             [res['id'] for res in search(['John'])],
-            [self.origin_id_1])
+            [self.origin_url_1])
         self.assertEqual(
             [res['id'] for res in search(['Jane'])],
-            [self.origin_id_2, self.origin_id_1])
+            [self.origin_url_2, self.origin_url_1])
         self.assertEqual(
             [res['id'] for res in search(['John', 'Jane'])],
-            [self.origin_id_1])
+            [self.origin_url_1])

     def _fill_origin_intrinsic_metadata(self):
         tool1_id = self.tools['swh-metadata-detector']['id']
@@ -1517,8 +1499,7 @@
             'indexer_configuration_id': tool1_id,
         }
         metadata1_origin = {
-            'id': self.origin_id_1,
-            'origin_url': 'file:///dev/zero',
+            'id': self.origin_url_1,
             'metadata': metadata1,
             'mappings': ['npm'],
             'indexer_configuration_id': tool1_id,
@@ -1535,8 +1516,7 @@
             'indexer_configuration_id': tool2_id,
         }
         metadata2_origin = {
-            'id': self.origin_id_2,
-            'origin_url': 'file:///dev/zero',
+            'id': self.origin_url_2,
             'metadata': metadata2,
             'mappings': ['npm', 'gemspec'],
             'indexer_configuration_id': tool2_id,
@@ -1552,8 +1532,7 @@
             'indexer_configuration_id': tool2_id,
         }
         metadata3_origin = {
-            'id': self.origin_id_3,
-            'origin_url': 'file:///dev/zero',
+            'id': self.origin_url_3,
             'metadata': metadata3,
             'mappings': ['pkg-info'],
             'indexer_configuration_id': tool2_id,
@@ -1576,34 +1555,34 @@
         # test pagination
         self.assertCountEqual(
             endpoint(ids_only=True),
-            [self.origin_id_1, self.origin_id_2, self.origin_id_3])
+            [self.origin_url_1, self.origin_url_2, self.origin_url_3])
         self.assertCountEqual(
-            endpoint(start=0, ids_only=True),
-            [self.origin_id_1, self.origin_id_2, self.origin_id_3])
+            endpoint(start=self.origin_url_1, ids_only=True),
+            [self.origin_url_1, self.origin_url_2, self.origin_url_3])
         self.assertCountEqual(
-            endpoint(start=0, limit=2, ids_only=True),
-            [self.origin_id_1, self.origin_id_2])
+            endpoint(start=self.origin_url_1, limit=2, ids_only=True),
+            [self.origin_url_1, self.origin_url_2])
         self.assertCountEqual(
-            endpoint(start=self.origin_id_1+1, ids_only=True),
-            [self.origin_id_2, self.origin_id_3])
+            endpoint(start=self.origin_url_1+'2', ids_only=True),
+            [self.origin_url_2, self.origin_url_3])
         self.assertCountEqual(
-            endpoint(start=self.origin_id_1+1, end=self.origin_id_3-1,
+            endpoint(start=self.origin_url_1+'2', end=self.origin_url_3[:-1],
                      ids_only=True),
-            [self.origin_id_2])
+            [self.origin_url_2])

         # test mappings filtering
         self.assertCountEqual(
             endpoint(mappings=['npm'], ids_only=True),
-            [self.origin_id_1, self.origin_id_2])
+            [self.origin_url_1, self.origin_url_2])
         self.assertCountEqual(
             endpoint(mappings=['npm', 'gemspec'], ids_only=True),
-            [self.origin_id_1, self.origin_id_2])
+            [self.origin_url_1, self.origin_url_2])
         self.assertCountEqual(
             endpoint(mappings=['gemspec'], ids_only=True),
-            [self.origin_id_2])
+            [self.origin_url_2])
         self.assertCountEqual(
             endpoint(mappings=['pkg-info'], ids_only=True),
-            [self.origin_id_3])
+            [self.origin_url_3])
         self.assertCountEqual(
             endpoint(mappings=['foobar'], ids_only=True),
             [])
@@ -1611,23 +1590,22 @@
         # test pagination + mappings
         self.assertCountEqual(
             endpoint(mappings=['npm'], limit=1, ids_only=True),
-            [self.origin_id_1])
+            [self.origin_url_1])

         # test tool filtering
         self.assertCountEqual(
             endpoint(tool_ids=[tool1['id']], ids_only=True),
-            [self.origin_id_1])
+            [self.origin_url_1])
         self.assertCountEqual(
             endpoint(tool_ids=[tool2['id']], ids_only=True),
-            [self.origin_id_2, self.origin_id_3])
+            [self.origin_url_2, self.origin_url_3])
         self.assertCountEqual(
             endpoint(tool_ids=[tool1['id'], tool2['id']], ids_only=True),
-            [self.origin_id_1, self.origin_id_2, self.origin_id_3])
+            [self.origin_url_1, self.origin_url_2, self.origin_url_3])

         # test ids_only=False
         self.assertEqual(list(endpoint(mappings=['gemspec'])), [{
-            'id': self.origin_id_2,
-            'origin_url': 'file:///dev/zero',
+            'id': self.origin_url_2,
             'metadata': {
                 '@context': 'foo',
                 'author': 'Jane Doe',
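
A note on the pagination bounds used in the tests above: origin_url_1 + '2' sorts strictly between url_1 and url_2, and origin_url_3[:-1] sorts strictly between url_2 and url_3, so the exclusive bounds the old id+1 / id-1 arithmetic produced are recovered with string surgery. A quick check:

url_1 = 'file:///dev/0/zero'
url_2 = 'file:///dev/1/one'
url_3 = 'file:///dev/2/two'
assert url_1 < url_1 + '2' < url_2    # start bound excluding url_1
assert url_2 < url_3[:-1] < url_3     # end bound excluding url_3
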
diff --git a/swh/indexer/tests/test_cli.py b/swh/indexer/tests/test_cli.py
--- a/swh/indexer/tests/test_cli.py
+++ b/swh/indexer/tests/test_cli.py
@@ -42,8 +42,7 @@
     origin_metadata = [
         {
-            'id': origin_id,
-            'origin_url': 'file:///dev/zero',
+            'id': 'file://dev/%04d' % origin_id,
             'from_revision': hash_to_bytes('abcd{:0>4}'.format(origin_id)),
             'indexer_configuration_id': tools[origin_id % 2]['id'],
             'metadata': {'name': 'origin %d' % origin_id},
@@ -83,7 +82,8 @@
     assert all(len(task['arguments']['args']) == 1 for task in tasks)
     for task in tasks:
         assert task['arguments']['kwargs'] == expected_kwargs, task
-    assert _origins_in_task_args(tasks) == set(origins)
+    assert _origins_in_task_args(tasks) == set([
+        'file://dev/%04d' % i for i in origins])


 def invoke(scheduler, catch_exceptions, args):
@@ -325,12 +325,11 @@
 def test_journal_client(storage, indexer_scheduler):
-    """Tests the re-indexing when origin_batch_size*task_batch_size is a
-    divisor of nb_origins."""
+    """Test the 'swh indexer journal-client' cli tool."""
     message = FakeKafkaMessage('swh.journal.objects.origin_visit', 'bogus', {
         'status': 'full',
         'origin': {
-            'url': 'file:///dev/zero',
+            'url': 'file://dev/0000',
         }
     })
@@ -359,4 +358,4 @@
     assert len(tasks) == 1
     _assert_tasks_for_origins(
         tasks,
-        ['file:///dev/zero'])
+        [0])
indexer.run(["https://github.com/librariesio/yarn-parser"]) - origin = storage.origin_get({ - 'url': 'https://github.com/librariesio/yarn-parser'}) + origin = 'https://github.com/librariesio/yarn-parser' rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') rev_metadata = { @@ -29,8 +27,7 @@ 'mappings': ['npm'], } origin_metadata = { - 'id': origin['id'], - 'origin_url': origin['url'], + 'id': origin, 'from_revision': rev_id, 'metadata': YARN_PARSER_METADATA, 'mappings': ['npm'], @@ -43,7 +40,7 @@ assert results == [rev_metadata] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ - origin['id']])) + origin])) for result in results: del result['tool'] assert results == [origin_metadata] @@ -58,8 +55,7 @@ indexer.run(["https://github.com/librariesio/yarn-parser"]*2) - origin = storage.origin_get({ - 'url': 'https://github.com/librariesio/yarn-parser'}) + origin = 'https://github.com/librariesio/yarn-parser' rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') results = list( @@ -67,7 +63,7 @@ assert len(results) == 1 results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ - origin['id']])) + origin])) assert len(results) == 1 @@ -82,11 +78,10 @@ indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) indexer.run(["https://example.com"]) - origin = storage.origin_get({ - 'url': 'https://example.com'}) + origin = 'https://example.com' results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ - origin['id']])) + origin])) assert results == [] @@ -102,10 +97,8 @@ indexer.run(["https://example.com", "https://github.com/librariesio/yarn-parser"]) - origin1 = storage.origin_get({ - 'url': 'https://example.com'}) - origin2 = storage.origin_get({ - 'url': 'https://github.com/librariesio/yarn-parser'}) + origin1 = 'https://example.com' + origin2 = 'https://github.com/librariesio/yarn-parser' rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') rev_metadata = { @@ -114,8 +107,7 @@ 'mappings': ['npm'], } origin_metadata = { - 'id': origin2['id'], - 'origin_url': origin2['url'], + 'id': origin2, 'from_revision': rev_id, 'metadata': YARN_PARSER_METADATA, 'mappings': ['npm'], @@ -128,7 +120,7 @@ assert results == [rev_metadata] results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ - origin1['id'], origin2['id']])) + origin1, origin2])) for result in results: del result['tool'] assert results == [origin_metadata] @@ -142,19 +134,16 @@ indexer.run(["https://github.com/librariesio/yarn-parser", "https://github.com/librariesio/yarn-parser.git"]) - origin1 = storage.origin_get({ - 'url': 'https://github.com/librariesio/yarn-parser'}) - origin2 = storage.origin_get({ - 'url': 'https://github.com/librariesio/yarn-parser.git'}) - assert origin1['id'] != origin2['id'] + origin1 = 'https://github.com/librariesio/yarn-parser' + origin2 = 'https://github.com/librariesio/yarn-parser.git' rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') results = list( indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) assert len(results) == 1 - results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ - origin1['id'], origin2['id']])) + results = list(indexer.idx_storage.origin_intrinsic_metadata_get( + [origin1, origin2])) assert len(results) == 2 @@ -166,8 +155,7 @@ b'foo.json'): indexer.run(["https://github.com/librariesio/yarn-parser"]) - origin = storage.origin_get({ - 'url': 'https://github.com/librariesio/yarn-parser'}) + origin = 'https://github.com/librariesio/yarn-parser' rev_id = 
@@ -166,8 +155,7 @@
             b'foo.json'):
         indexer.run(["https://github.com/librariesio/yarn-parser"])

-    origin = storage.origin_get({
-        'url': 'https://github.com/librariesio/yarn-parser'})
+    origin = 'https://github.com/librariesio/yarn-parser'
     rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')

     results = list(
@@ -175,7 +163,7 @@
     assert results == []

     results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
-        origin['id']]))
+        origin]))
     assert results == []
@@ -188,8 +176,7 @@
             return_value=(['npm'], {'@context': 'foo'})):
         indexer.run(["https://github.com/librariesio/yarn-parser"])

-    origin = storage.origin_get({
-        'url': 'https://github.com/librariesio/yarn-parser'})
+    origin = 'https://github.com/librariesio/yarn-parser'
     rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')

     results = list(
@@ -197,7 +184,7 @@
     assert results == []

     results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
-        origin['id']]))
+        origin]))
     assert results == []
@@ -210,8 +197,7 @@
             return_value=None):
         indexer.run(["https://github.com/librariesio/yarn-parser"])

-    origin = storage.origin_get({
-        'url': 'https://github.com/librariesio/yarn-parser'})
+    origin = 'https://github.com/librariesio/yarn-parser'
     rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')

     results = list(
@@ -219,7 +205,7 @@
     assert results == []

     results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
-        origin['id']]))
+        origin]))
     assert results == []
@@ -229,8 +215,7 @@
     indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
     indexer.run(["https://github.com/librariesio/yarn-parser"])

-    origin = storage.origin_get({
-        'url': 'https://github.com/librariesio/yarn-parser'})
+    origin = 'https://github.com/librariesio/yarn-parser'
     rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')

     results = list(
@@ -238,7 +223,7 @@
     assert results != []

     results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
-        origin['id']]))
+        origin]))
     assert results != []

     with patch('swh.indexer.metadata_dictionary.npm.NpmMapping.filename',
@@ -250,5 +235,5 @@
         assert results == []

         results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
-            origin['id']]))
+            origin]))
         assert results == []