D2207.id7651.diff

diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -2,5 +2,5 @@
swh.model >= 0.0.15
swh.objstorage >= 0.0.28
swh.scheduler >= 0.0.47
-swh.storage >= 0.0.155, < 0.0.156
+swh.storage >= 0.0.156
swh.journal >= 0.0.17
diff --git a/sql/upgrades/127.sql b/sql/upgrades/127.sql
new file mode 100644
--- /dev/null
+++ b/sql/upgrades/127.sql
@@ -0,0 +1,63 @@
+-- SWH Indexer DB schema upgrade
+-- from_version: 126
+-- to_version: 127
+-- description: Drop the origin_intrinsic_metadata origin_url column and
+-- replace the id column with the former content of origin_url
+
+insert into dbversion(version, release, description)
+values(127, now(), 'Work In Progress');
+
+-- replace id column by origin_url
+alter table origin_intrinsic_metadata
+ drop constraint origin_intrinsic_metadata_indexer_configuration_id_fkey;
+alter table origin_intrinsic_metadata
+ drop constraint origin_intrinsic_metadata_pkey;
+alter table origin_intrinsic_metadata
+ drop column id;
+alter table origin_intrinsic_metadata
+ rename column origin_url to id;
+comment on column origin_intrinsic_metadata.id is 'url of the origin';
+
+-- replace functions that operate on this table
+create or replace function swh_origin_intrinsic_metadata_add(
+ conflict_update boolean)
+ returns void
+ language plpgsql
+as $$
+begin
+ perform swh_origin_intrinsic_metadata_compute_tsvector();
+ if conflict_update then
+ insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings)
+ select id, metadata, indexer_configuration_id, from_revision,
+ metadata_tsvector, mappings
+ from tmp_origin_intrinsic_metadata
+ on conflict(id, indexer_configuration_id)
+ do update set
+ metadata = excluded.metadata,
+ metadata_tsvector = excluded.metadata_tsvector,
+ mappings = excluded.mappings,
+ from_revision = excluded.from_revision;
+
+ else
+ insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings)
+ select id, metadata, indexer_configuration_id, from_revision,
+ metadata_tsvector, mappings
+ from tmp_origin_intrinsic_metadata
+ on conflict(id, indexer_configuration_id)
+ do nothing;
+ end if;
+ return;
+end
+$$;
+comment on function swh_origin_intrinsic_metadata_add(boolean) is 'Add new origin intrinsic metadata';
+
+-- recreate indexes/constraints on this table
+create unique index origin_intrinsic_metadata_pkey
+ on origin_intrinsic_metadata(id, indexer_configuration_id);
+alter table origin_intrinsic_metadata
+ add primary key using index origin_intrinsic_metadata_pkey;
+
+alter table origin_intrinsic_metadata
+ add constraint origin_intrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
+alter table origin_intrinsic_metadata
+ validate constraint origin_intrinsic_metadata_indexer_configuration_id_fkey;
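The foreign key above is restored in two steps on purpose: adding it NOT VALID takes only a brief lock, and VALIDATE CONSTRAINT then checks the existing rows under a weaker lock, so concurrent writes are not blocked while the table is scanned. A minimal post-migration check, assuming direct SQL access (not part of the patch):

    select conname, convalidated
    from pg_constraint
    where conrelid = 'origin_intrinsic_metadata'::regclass;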
diff --git a/swh/indexer/cli.py b/swh/indexer/cli.py
--- a/swh/indexer/cli.py
+++ b/swh/indexer/cli.py
@@ -145,7 +145,7 @@
def list_origins_by_producer(idx_storage, mappings, tool_ids):
- start = 0
+ start = ''
limit = 10000
while True:
origins = list(
@@ -154,7 +154,7 @@
mappings=mappings or None, tool_ids=tool_ids or None))
if not origins:
break
- start = origins[-1]+1
+ start = origins[-1] + '\x00' # first possible string after this
yield from origins
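The pagination cursor changes from integers to strings here: where integer paging resumed at origins[-1] + 1, string paging resumes at origins[-1] + '\x00', the smallest string that sorts strictly after the last url. A self-contained sketch of the idea, assuming only a sorted key space (this is not the swh API):

    import bisect

    def paginate(fetch_page, start='', limit=10000):
        # fetch_page(start, limit) returns up to `limit` sorted keys >= start
        while True:
            page = fetch_page(start, limit)
            if not page:
                break
            yield from page
            start = page[-1] + '\x00'  # first possible string after the last key

    urls = sorted(['file:///dev/0/zero', 'file:///dev/1/one', 'file:///dev/2/two'])

    def fetch(start, limit):
        i = bisect.bisect_left(urls, start)
        return urls[i:i + limit]

    assert list(paginate(fetch, limit=2)) == urls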
diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -535,15 +535,14 @@
"""
def run(self, origin_urls, policy_update='update-dups',
next_step=None, **kwargs):
- """Given a list of origin ids:
+ """Given a list of origin urls:
- retrieve origins from storage
- execute the indexing computations
- store the results (according to policy_update)
Args:
- ids ([Union[int, Tuple[str, bytes]]]): list of origin ids or
- (type, url) tuples.
+ origin_urls ([str]): list of origin urls.
policy_update (str): either 'update-dups' or 'ignore-dups' to
respectively update duplicates (default) or ignore them
next_step (dict): a dict in the form expected by
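Call sites now pass urls straight through; the tests later in this diff exercise exactly this shape. A hedged usage sketch (import paths assumed from the swh.indexer layout; the config is the same fixture the tests use):

    from swh.indexer.metadata import OriginMetadataIndexer
    from swh.indexer.tests.test_metadata import REVISION_METADATA_CONFIG

    indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
    indexer.run(['https://github.com/librariesio/yarn-parser'],
                policy_update='update-dups')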
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -286,7 +286,6 @@
for origin in origins:
head_result = self.origin_head_indexer.index(origin['url'])
if head_result:
- head_result['origin_id'] = origin['id']
origins_with_head.append(origin)
head_rev_ids.append(head_result['revision_id'])
@@ -305,8 +304,7 @@
rev_metadata = self.revision_metadata_indexer.index(rev)
orig_metadata = {
'from_revision': rev_metadata['id'],
- 'id': origin['id'],
- 'origin_url': origin['url'],
+ 'id': origin['url'],
'metadata': rev_metadata['metadata'],
'mappings': rev_metadata['mappings'],
'indexer_configuration_id':
diff --git a/swh/indexer/sql/30-swh-schema.sql b/swh/indexer/sql/30-swh-schema.sql
--- a/swh/indexer/sql/30-swh-schema.sql
+++ b/swh/indexer/sql/30-swh-schema.sql
@@ -14,7 +14,7 @@
);
insert into dbversion(version, release, description)
- values(126, now(), 'Work In Progress');
+ values(127, now(), 'Work In Progress');
-- Computing metadata on sha1's contents
-- a SHA1 checksum (not necessarily originating from Git)
@@ -129,8 +129,7 @@
comment on column revision_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)';
create table origin_intrinsic_metadata(
- id bigserial not null,
- origin_url text,
+ id text not null, -- origin url
metadata jsonb,
indexer_configuration_id bigint not null,
from_revision sha1_git not null,
@@ -139,7 +138,7 @@
);
comment on table origin_intrinsic_metadata is 'keeps intrinsic metadata for an origin';
-comment on column origin_intrinsic_metadata.id is 'the entry id in origin';
+comment on column origin_intrinsic_metadata.id is 'url of the origin';
comment on column origin_intrinsic_metadata.metadata is 'metadata extracted from a revision';
comment on column origin_intrinsic_metadata.indexer_configuration_id is 'tool used to generate this metadata';
comment on column origin_intrinsic_metadata.from_revision is 'sha1 of the revision this metadata was copied from.';
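With the origin url doubling as the table key, a metadata lookup no longer round-trips through an integer origin id. A hedged query sketch against the new schema (the tool id 1 is illustrative):

    select metadata, mappings
    from origin_intrinsic_metadata
    where id = 'https://github.com/librariesio/yarn-parser'
      and indexer_configuration_id = 1;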
diff --git a/swh/indexer/sql/40-swh-func.sql b/swh/indexer/sql/40-swh-func.sql
--- a/swh/indexer/sql/40-swh-func.sql
+++ b/swh/indexer/sql/40-swh-func.sql
@@ -413,8 +413,8 @@
begin
perform swh_origin_intrinsic_metadata_compute_tsvector();
if conflict_update then
- insert into origin_intrinsic_metadata (id, origin_url, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings)
- select id, origin_url, metadata, indexer_configuration_id, from_revision,
+ insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings)
+ select id, metadata, indexer_configuration_id, from_revision,
metadata_tsvector, mappings
from tmp_origin_intrinsic_metadata
on conflict(id, indexer_configuration_id)
@@ -422,12 +422,11 @@
metadata = excluded.metadata,
metadata_tsvector = excluded.metadata_tsvector,
mappings = excluded.mappings,
- origin_url = excluded.origin_url,
from_revision = excluded.from_revision;
else
- insert into origin_intrinsic_metadata (id, origin_url, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings)
- select id, origin_url, metadata, indexer_configuration_id, from_revision,
+ insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings)
+ select id, metadata, indexer_configuration_id, from_revision,
metadata_tsvector, mappings
from tmp_origin_intrinsic_metadata
on conflict(id, indexer_configuration_id)
@@ -453,4 +452,3 @@
set metadata_tsvector = to_tsvector('pg_catalog.simple', metadata);
end
$$;
-
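The function reads from a tmp_origin_intrinsic_metadata table that the Python storage layer fills via COPY (see storage/__init__.py below). A hedged sketch of that calling sequence in plain SQL; the swh_mktemp_origin_intrinsic_metadata helper name is an assumption inferred from the mktemp_origin_intrinsic_metadata call in the Python code:

    select swh_mktemp_origin_intrinsic_metadata();
    copy tmp_origin_intrinsic_metadata
      (id, metadata, indexer_configuration_id, from_revision, mappings)
      from stdin;
    select swh_origin_intrinsic_metadata_add(true);  -- true = update duplicates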
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -692,8 +692,7 @@
Yields:
list: dictionaries with the following keys:
- - **id** (int): (legacy) origin identifier
- - **origin_url** (str)
+ - **id** (str): origin url
- **from_revision** (bytes): which revision this metadata
was extracted from
- **metadata** (str): associated metadata
@@ -716,8 +715,7 @@
Args:
metadata (iterable): dictionaries with keys:
- - **id**: legacy origin identifier
- - **origin_url**
+ - **id**: origin url
- **from_revision**: sha1 id of the revision used to generate
these metadata.
- **metadata**: arbitrary dict
@@ -735,7 +733,7 @@
db.mktemp_origin_intrinsic_metadata(cur)
db.copy_to(metadata, 'tmp_origin_intrinsic_metadata',
- ['id', 'origin_url', 'metadata',
+ ['id', 'metadata',
'indexer_configuration_id',
'from_revision', 'mappings'],
cur)
@@ -749,7 +747,7 @@
Args:
entries (dict): dictionaries with the following keys:
- - **id** (int): origin identifier
+ - **id** (str): origin url
- **indexer_configuration_id** (int): tool used to compute
metadata
"""
@@ -768,8 +766,7 @@
Yields:
list: dictionaries with the following keys:
- - **id** (int): legacy origin identifier
- - **origin_url** (str)
+ - **id** (str): origin url
- **from_revision**: sha1 id of the revision used to generate
these metadata.
- **metadata** (str): associated metadata
@@ -786,17 +783,17 @@
@remote_api_endpoint('origin_intrinsic_metadata/search/by_producer')
@db_transaction_generator()
def origin_intrinsic_metadata_search_by_producer(
- self, start=0, end=None, limit=100, ids_only=False,
+ self, start='', end=None, limit=100, ids_only=False,
mappings=None, tool_ids=None,
db=None, cur=None):
"""Returns the list of origins whose metadata contain all the terms.
Args:
- start (int): The minimum origin id to return
- end (int): The maximum origin id to return
+ start (str): The minimum origin url to return
+ end (str): The maximum origin url to return
limit (int): The maximum number of results to return
- ids_only (bool): Determines whether only origin ids are returned
- or the content as well
+ ids_only (bool): Determines whether only origin urls are
+ returned or the content as well
mappings (List[str]): Returns origins whose intrinsic metadata
were generated using at least one of these mappings.
@@ -804,8 +801,7 @@
list: list of origin ids (int) if `ids_only=True`, else
dictionaries with the following keys:
- - **id** (int): legacy origin identifier
- - **origin_url** (str)
+ - **id** (str): origin url
- **from_revision**: sha1 id of the revision used to generate
these metadata.
- **metadata** (str): associated metadata
@@ -817,8 +813,8 @@
res = db.origin_intrinsic_metadata_search_by_producer(
start, end, limit, ids_only, mappings, tool_ids, cur)
if ids_only:
- for (origin_id,) in res:
- yield origin_id
+ for (origin,) in res:
+ yield origin
else:
for c in res:
yield converters.db_to_metadata(
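A hedged sketch of consuming the endpoint above with ids_only=False; idx_storage stands for any client exposing this API, and the keys follow the docstring:

    for entry in idx_storage.origin_intrinsic_metadata_search_by_producer(
            mappings=['npm'], ids_only=False):
        origin_url = entry['id']            # now the origin url, not an integer
        revision_sha1 = entry['from_revision']
        used_mappings = entry['mappings']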
diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py
--- a/swh/indexer/storage/db.py
+++ b/swh/indexer/storage/db.py
@@ -336,7 +336,7 @@
self.revision_intrinsic_metadata_cols, cur=cur)
origin_intrinsic_metadata_cols = [
- 'id', 'origin_url', 'metadata', 'from_revision', 'mappings',
+ 'id', 'metadata', 'from_revision', 'mappings',
'tool_id', 'tool_name', 'tool_version', 'tool_configuration']
origin_intrinsic_metadata_regconfig = 'pg_catalog.simple'
@@ -366,9 +366,9 @@
tuple((e['id'], e['indexer_configuration_id'])
for e in entries),)
- def origin_intrinsic_metadata_get_from_list(self, orig_ids, cur=None):
+ def origin_intrinsic_metadata_get_from_list(self, ids, cur=None):
yield from self._get_from_list(
- 'origin_intrinsic_metadata', orig_ids,
+ 'origin_intrinsic_metadata', ids,
self.origin_intrinsic_metadata_cols, cur=cur,
id_col='id')
diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py
--- a/swh/indexer/storage/in_memory.py
+++ b/swh/indexer/storage/in_memory.py
@@ -77,7 +77,7 @@
}
def get_all(self):
- yield from self.get(list(self._tools_per_id))
+ yield from self.get(self._sorted_ids)
def get_range(self, start, end, indexer_configuration_id, limit):
"""Retrieve data within range [start, end] bound by limit.
@@ -611,8 +611,7 @@
Yields:
list: dictionaries with the following keys:
- - **id** (int)
- - **origin_url** (str)
+ - **id** (str): origin url
- **from_revision** (bytes): which revision this metadata
was extracted from
- **metadata** (str): associated metadata
@@ -630,8 +629,7 @@
Args:
metadata (iterable): dictionaries with keys:
- - **id**: origin identifier
- - **origin_url**
+ - **id**: origin url
- **from_revision**: sha1 id of the revision used to generate
these metadata.
- **metadata**: arbitrary dict
@@ -650,7 +648,7 @@
Args:
entries (dict): dictionaries with the following keys:
- - **id** (int): origin identifier
+ - **id** (str): origin url
- **indexer_configuration_id** (int): tool used to compute
metadata
"""
@@ -667,8 +665,7 @@
Yields:
list: dictionaries with the following keys:
- - **id** (int)
- - **origin_url** (str)
+ - **id** (str): origin url
- **from_revision** (bytes): which revision this metadata
was extracted from
- **metadata** (str): associated metadata
@@ -709,14 +706,14 @@
yield result
def origin_intrinsic_metadata_search_by_producer(
- self, start=0, end=None, limit=100, ids_only=False,
+ self, start='', end=None, limit=100, ids_only=False,
mappings=None, tool_ids=None,
db=None, cur=None):
"""Returns the list of origins whose metadata contain all the terms.
Args:
- start (int): The minimum origin id to return
- end (int): The maximum origin id to return
+ start (str): The minimum origin url to return
+ end (str): The maximum origin url to return
limit (int): The maximum number of results to return
ids_only (bool): Determines whether only origin ids are returned
or the content as well
@@ -727,8 +724,7 @@
list: list of origin ids (int) if `ids_only=True`, else
dictionaries with the following keys:
- - **id** (int)
- - **origin_url** (str)
+ - **id** (str): origin url
- **from_revision**: sha1 id of the revision used to generate
these metadata.
- **metadata** (str): associated metadata
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
--- a/swh/indexer/tests/storage/test_storage.py
+++ b/swh/indexer/tests/storage/test_storage.py
@@ -450,9 +450,9 @@
'7026b7c1a2af56521e9587659012345678904321')
self.revision_id_3 = hash_to_bytes(
'7026b7c1a2af56521e9587659012345678904320')
- self.origin_id_1 = 44434341
- self.origin_id_2 = 44434342
- self.origin_id_3 = 54974445
+ self.origin_url_1 = 'file:///dev/0/zero' # 44434341
+ self.origin_url_2 = 'file:///dev/1/one' # 44434342
+ self.origin_url_3 = 'file:///dev/2/two' # 54974445
def test_check_config(self):
self.assertTrue(self.storage.check_config(check_write=True))
@@ -1002,8 +1002,7 @@
'indexer_configuration_id': tool_id,
}
metadata_origin = {
- 'id': self.origin_id_1,
- 'origin_url': 'file:///dev/zero',
+ 'id': self.origin_url_1,
'metadata': metadata,
'indexer_configuration_id': tool_id,
'mappings': ['mapping1'],
@@ -1016,11 +1015,10 @@
# then
actual_metadata = list(self.storage.origin_intrinsic_metadata_get(
- [self.origin_id_1, 42]))
+ [self.origin_url_1, 'no://where']))
expected_metadata = [{
- 'id': self.origin_id_1,
- 'origin_url': 'file:///dev/zero',
+ 'id': self.origin_url_1,
'metadata': metadata,
'tool': self.tools['swh-metadata-detector'],
'from_revision': self.revision_id_2,
@@ -1044,15 +1042,14 @@
'indexer_configuration_id': tool_id,
}
metadata_origin = {
- 'id': self.origin_id_1,
- 'origin_url': 'file:///dev/zero',
+ 'id': self.origin_url_1,
'metadata': metadata,
'indexer_configuration_id': tool_id,
'mappings': ['mapping1'],
'from_revision': self.revision_id_2,
}
metadata_origin2 = metadata_origin.copy()
- metadata_origin2['id'] = self.origin_id_2
+ metadata_origin2['id'] = self.origin_url_2
# when
self.storage.revision_intrinsic_metadata_add([metadata_rev])
@@ -1061,14 +1058,14 @@
self.storage.origin_intrinsic_metadata_delete([
{
- 'id': self.origin_id_1,
+ 'id': self.origin_url_1,
'indexer_configuration_id': tool_id
}
])
# then
actual_metadata = list(self.storage.origin_intrinsic_metadata_get(
- [self.origin_id_1, self.origin_id_2, 42]))
+ [self.origin_url_1, self.origin_url_2, 'no://where']))
for item in actual_metadata:
item['indexer_configuration_id'] = item.pop('tool')['id']
self.assertEqual(actual_metadata, [metadata_origin2])
@@ -1077,7 +1074,7 @@
tool_id = self.tools['swh-metadata-detector']['id']
self.storage.origin_intrinsic_metadata_delete([
{
- 'id': self.origin_id_1,
+ 'id': self.origin_url_1,
'indexer_configuration_id': tool_id
}
])
@@ -1092,14 +1089,12 @@
}
metadata_rev_v1 = {
'id': self.revision_id_1,
- 'origin_url': 'file:///dev/zero',
'metadata': metadata_v1.copy(),
'mappings': [],
'indexer_configuration_id': tool_id,
}
metadata_origin_v1 = {
- 'id': self.origin_id_1,
- 'origin_url': 'file:///dev/zero',
+ 'id': self.origin_url_1,
'metadata': metadata_v1.copy(),
'indexer_configuration_id': tool_id,
'mappings': [],
@@ -1112,11 +1107,10 @@
# when
actual_metadata = list(self.storage.origin_intrinsic_metadata_get(
- [self.origin_id_1, 42]))
+ [self.origin_url_1, 'no://where']))
expected_metadata_v1 = [{
- 'id': self.origin_id_1,
- 'origin_url': 'file:///dev/zero',
+ 'id': self.origin_url_1,
'metadata': metadata_v1,
'tool': self.tools['swh-metadata-detector'],
'from_revision': self.revision_id_1,
@@ -1141,7 +1135,7 @@
# then
actual_metadata = list(self.storage.origin_intrinsic_metadata_get(
- [self.origin_id_1]))
+ [self.origin_url_1]))
# metadata did not change as the v2 was dropped.
self.assertEqual(actual_metadata, expected_metadata_v1)
@@ -1161,8 +1155,7 @@
'indexer_configuration_id': tool_id,
}
metadata_origin_v1 = {
- 'id': self.origin_id_1,
- 'origin_url': 'file:///dev/zero',
+ 'id': self.origin_url_1,
'metadata': metadata_v1.copy(),
'indexer_configuration_id': tool_id,
'mappings': [],
@@ -1175,12 +1168,11 @@
# when
actual_metadata = list(self.storage.origin_intrinsic_metadata_get(
- [self.origin_id_1]))
+ [self.origin_url_1]))
# then
expected_metadata_v1 = [{
- 'id': self.origin_id_1,
- 'origin_url': 'file:///dev/zero',
+ 'id': self.origin_url_1,
'metadata': metadata_v1,
'tool': self.tools['swh-metadata-detector'],
'from_revision': self.revision_id_2,
@@ -1198,8 +1190,7 @@
metadata_origin_v2 = metadata_origin_v1.copy()
metadata_rev_v2['metadata'] = metadata_v2
metadata_origin_v2 = {
- 'id': self.origin_id_1,
- 'origin_url': 'file:///dev/null',
+ 'id': self.origin_url_1,
'metadata': metadata_v2.copy(),
'indexer_configuration_id': tool_id,
'mappings': ['npm'],
@@ -1212,11 +1203,10 @@
[metadata_origin_v2], conflict_update=True)
actual_metadata = list(self.storage.origin_intrinsic_metadata_get(
- [self.origin_id_1]))
+ [self.origin_url_1]))
expected_metadata_v2 = [{
- 'id': self.origin_id_1,
- 'origin_url': 'file:///dev/null',
+ 'id': self.origin_url_1,
'metadata': metadata_v2,
'tool': self.tools['swh-metadata-detector'],
'from_revision': self.revision_id_1,
@@ -1259,8 +1249,7 @@
data_v1 = [
{
- 'id': id_,
- 'origin_url': 'file:///tmp/origin%d' % id_,
+ 'id': 'file:///tmp/origin%d' % id_,
'from_revision': self.revision_id_2,
**example_data1,
'indexer_configuration_id': tool_id,
@@ -1269,8 +1258,7 @@
]
data_v2 = [
{
- 'id': id_,
- 'origin_url': 'file:///tmp/origin%d' % id_,
+ 'id': 'file:///tmp/origin%d' % id_,
'from_revision': self.revision_id_2,
**example_data2,
'indexer_configuration_id': tool_id,
@@ -1288,12 +1276,12 @@
self.storage.origin_intrinsic_metadata_add(data_v1)
# when
- actual_data = list(self.storage.origin_intrinsic_metadata_get(ids))
+ origins = ['file:///tmp/origin%d' % i for i in ids]
+ actual_data = list(self.storage.origin_intrinsic_metadata_get(origins))
expected_data_v1 = [
{
- 'id': id_,
- 'origin_url': 'file:///tmp/origin%d' % id_,
+ 'id': 'file:///tmp/origin%d' % id_,
'from_revision': self.revision_id_2,
**example_data1,
'tool': self.tools['swh-metadata-detector'],
@@ -1321,12 +1309,11 @@
t1.join()
t2.join()
- actual_data = list(self.storage.origin_intrinsic_metadata_get(ids))
+ actual_data = list(self.storage.origin_intrinsic_metadata_get(origins))
expected_data_v2 = [
{
- 'id': id_,
- 'origin_url': 'file:///tmp/origin%d' % id_,
+ 'id': 'file:///tmp/origin%d' % id_,
'from_revision': self.revision_id_2,
**example_data2,
'tool': self.tools['swh-metadata-detector'],
@@ -1352,8 +1339,7 @@
'indexer_configuration_id': tool_id,
}
metadata_origin = {
- 'id': self.origin_id_1,
- 'origin_url': 'file:///dev/zero',
+ 'id': self.origin_url_1,
'metadata': metadata,
'indexer_configuration_id': tool_id,
'mappings': ['mapping1'],
@@ -1381,8 +1367,7 @@
'indexer_configuration_id': tool_id,
}
metadata1_origin = {
- 'id': self.origin_id_1,
- 'origin_url': 'file:///dev/zero',
+ 'id': self.origin_url_1,
'metadata': metadata1,
'mappings': [],
'indexer_configuration_id': tool_id,
@@ -1393,14 +1378,13 @@
}
metadata2_rev = {
'id': self.revision_id_2,
- 'origin_url': 'file:///dev/zero',
+ 'origin': self.origin_url_1,
'metadata': metadata2,
'mappings': [],
'indexer_configuration_id': tool_id,
}
metadata2_origin = {
- 'id': self.origin_id_2,
- 'origin_url': 'file:///dev/zero',
+ 'id': self.origin_url_2,
'metadata': metadata2,
'mappings': [],
'indexer_configuration_id': tool_id,
@@ -1417,13 +1401,13 @@
search = self.storage.origin_intrinsic_metadata_search_fulltext
self.assertCountEqual(
[res['id'] for res in search(['Doe'])],
- [self.origin_id_1, self.origin_id_2])
+ [self.origin_url_1, self.origin_url_2])
self.assertEqual(
[res['id'] for res in search(['John', 'Doe'])],
- [self.origin_id_1])
+ [self.origin_url_1])
self.assertEqual(
[res['id'] for res in search(['John'])],
- [self.origin_id_1])
+ [self.origin_url_1])
self.assertEqual(
[res['id'] for res in search(['John', 'Jane'])],
[])
@@ -1450,8 +1434,7 @@
'indexer_configuration_id': tool_id,
}
metadata1_origin = {
- 'id': self.origin_id_1,
- 'origin_url': 'file:///dev/zero',
+ 'id': self.origin_url_1,
'metadata': metadata1,
'mappings': [],
'indexer_configuration_id': tool_id,
@@ -1470,8 +1453,7 @@
'indexer_configuration_id': tool_id,
}
metadata2_origin = {
- 'id': self.origin_id_2,
- 'origin_url': 'file:///dev/zero',
+ 'id': self.origin_url_2,
'metadata': metadata2,
'mappings': [],
'indexer_configuration_id': tool_id,
@@ -1488,19 +1470,19 @@
search = self.storage.origin_intrinsic_metadata_search_fulltext
self.assertEqual(
[res['id'] for res in search(['Doe'])],
- [self.origin_id_1, self.origin_id_2])
+ [self.origin_url_1, self.origin_url_2])
self.assertEqual(
[res['id'] for res in search(['Doe'], limit=1)],
- [self.origin_id_1])
+ [self.origin_url_1])
self.assertEqual(
[res['id'] for res in search(['John'])],
- [self.origin_id_1])
+ [self.origin_url_1])
self.assertEqual(
[res['id'] for res in search(['Jane'])],
- [self.origin_id_2, self.origin_id_1])
+ [self.origin_url_2, self.origin_url_1])
self.assertEqual(
[res['id'] for res in search(['John', 'Jane'])],
- [self.origin_id_1])
+ [self.origin_url_1])
def _fill_origin_intrinsic_metadata(self):
tool1_id = self.tools['swh-metadata-detector']['id']
@@ -1517,8 +1499,7 @@
'indexer_configuration_id': tool1_id,
}
metadata1_origin = {
- 'id': self.origin_id_1,
- 'origin_url': 'file:///dev/zero',
+ 'id': self.origin_url_1,
'metadata': metadata1,
'mappings': ['npm'],
'indexer_configuration_id': tool1_id,
@@ -1535,8 +1516,7 @@
'indexer_configuration_id': tool2_id,
}
metadata2_origin = {
- 'id': self.origin_id_2,
- 'origin_url': 'file:///dev/zero',
+ 'id': self.origin_url_2,
'metadata': metadata2,
'mappings': ['npm', 'gemspec'],
'indexer_configuration_id': tool2_id,
@@ -1552,8 +1532,7 @@
'indexer_configuration_id': tool2_id,
}
metadata3_origin = {
- 'id': self.origin_id_3,
- 'origin_url': 'file:///dev/zero',
+ 'id': self.origin_url_3,
'metadata': metadata3,
'mappings': ['pkg-info'],
'indexer_configuration_id': tool2_id,
@@ -1576,34 +1555,34 @@
# test pagination
self.assertCountEqual(
endpoint(ids_only=True),
- [self.origin_id_1, self.origin_id_2, self.origin_id_3])
+ [self.origin_url_1, self.origin_url_2, self.origin_url_3])
self.assertCountEqual(
- endpoint(start=0, ids_only=True),
- [self.origin_id_1, self.origin_id_2, self.origin_id_3])
+ endpoint(start=self.origin_url_1, ids_only=True),
+ [self.origin_url_1, self.origin_url_2, self.origin_url_3])
self.assertCountEqual(
- endpoint(start=0, limit=2, ids_only=True),
- [self.origin_id_1, self.origin_id_2])
+ endpoint(start=self.origin_url_1, limit=2, ids_only=True),
+ [self.origin_url_1, self.origin_url_2])
self.assertCountEqual(
- endpoint(start=self.origin_id_1+1, ids_only=True),
- [self.origin_id_2, self.origin_id_3])
+ endpoint(start=self.origin_url_1+'2', ids_only=True),
+ [self.origin_url_2, self.origin_url_3])
self.assertCountEqual(
- endpoint(start=self.origin_id_1+1, end=self.origin_id_3-1,
+ endpoint(start=self.origin_url_1+'2', end=self.origin_url_3[:-1],
ids_only=True),
- [self.origin_id_2])
+ [self.origin_url_2])
# test mappings filtering
self.assertCountEqual(
endpoint(mappings=['npm'], ids_only=True),
- [self.origin_id_1, self.origin_id_2])
+ [self.origin_url_1, self.origin_url_2])
self.assertCountEqual(
endpoint(mappings=['npm', 'gemspec'], ids_only=True),
- [self.origin_id_1, self.origin_id_2])
+ [self.origin_url_1, self.origin_url_2])
self.assertCountEqual(
endpoint(mappings=['gemspec'], ids_only=True),
- [self.origin_id_2])
+ [self.origin_url_2])
self.assertCountEqual(
endpoint(mappings=['pkg-info'], ids_only=True),
- [self.origin_id_3])
+ [self.origin_url_3])
self.assertCountEqual(
endpoint(mappings=['foobar'], ids_only=True),
[])
@@ -1611,23 +1590,22 @@
# test pagination + mappings
self.assertCountEqual(
endpoint(mappings=['npm'], limit=1, ids_only=True),
- [self.origin_id_1])
+ [self.origin_url_1])
# test tool filtering
self.assertCountEqual(
endpoint(tool_ids=[tool1['id']], ids_only=True),
- [self.origin_id_1])
+ [self.origin_url_1])
self.assertCountEqual(
endpoint(tool_ids=[tool2['id']], ids_only=True),
- [self.origin_id_2, self.origin_id_3])
+ [self.origin_url_2, self.origin_url_3])
self.assertCountEqual(
endpoint(tool_ids=[tool1['id'], tool2['id']], ids_only=True),
- [self.origin_id_1, self.origin_id_2, self.origin_id_3])
+ [self.origin_url_1, self.origin_url_2, self.origin_url_3])
# test ids_only=False
self.assertEqual(list(endpoint(mappings=['gemspec'])), [{
- 'id': self.origin_id_2,
- 'origin_url': 'file:///dev/zero',
+ 'id': self.origin_url_2,
'metadata': {
'@context': 'foo',
'author': 'Jane Doe',
diff --git a/swh/indexer/tests/test_cli.py b/swh/indexer/tests/test_cli.py
--- a/swh/indexer/tests/test_cli.py
+++ b/swh/indexer/tests/test_cli.py
@@ -42,8 +42,7 @@
origin_metadata = [
{
- 'id': origin_id,
- 'origin_url': 'file:///dev/zero',
+ 'id': 'file://dev/%04d' % origin_id,
'from_revision': hash_to_bytes('abcd{:0>4}'.format(origin_id)),
'indexer_configuration_id': tools[origin_id % 2]['id'],
'metadata': {'name': 'origin %d' % origin_id},
@@ -83,7 +82,8 @@
assert all(len(task['arguments']['args']) == 1 for task in tasks)
for task in tasks:
assert task['arguments']['kwargs'] == expected_kwargs, task
- assert _origins_in_task_args(tasks) == set(origins)
+ assert _origins_in_task_args(tasks) == set([
+ 'file://dev/%04d' % i for i in origins])
def invoke(scheduler, catch_exceptions, args):
@@ -325,12 +325,11 @@
def test_journal_client(storage, indexer_scheduler):
- """Tests the re-indexing when origin_batch_size*task_batch_size is a
- divisor of nb_origins."""
+ """Test the 'swh indexer journal-client' cli tool."""
message = FakeKafkaMessage('swh.journal.objects.origin_visit', 'bogus', {
'status': 'full',
'origin': {
- 'url': 'file:///dev/zero',
+ 'url': 'file://dev/0000',
}
})
@@ -359,4 +358,4 @@
assert len(tasks) == 1
_assert_tasks_for_origins(
tasks,
- ['file:///dev/zero'])
+ [0])
diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py
--- a/swh/indexer/tests/test_origin_metadata.py
+++ b/swh/indexer/tests/test_origin_metadata.py
@@ -13,14 +13,12 @@
from .test_metadata import REVISION_METADATA_CONFIG
-def test_origin_metadata_indexer(
- idx_storage, storage, obj_storage):
+def test_origin_metadata_indexer(idx_storage, storage, obj_storage):
indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
indexer.run(["https://github.com/librariesio/yarn-parser"])
- origin = storage.origin_get({
- 'url': 'https://github.com/librariesio/yarn-parser'})
+ origin = 'https://github.com/librariesio/yarn-parser'
rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
rev_metadata = {
@@ -29,8 +27,7 @@
'mappings': ['npm'],
}
origin_metadata = {
- 'id': origin['id'],
- 'origin_url': origin['url'],
+ 'id': origin,
'from_revision': rev_id,
'metadata': YARN_PARSER_METADATA,
'mappings': ['npm'],
@@ -43,7 +40,7 @@
assert results == [rev_metadata]
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
- origin['id']]))
+ origin]))
for result in results:
del result['tool']
assert results == [origin_metadata]
@@ -58,8 +55,7 @@
indexer.run(["https://github.com/librariesio/yarn-parser"]*2)
- origin = storage.origin_get({
- 'url': 'https://github.com/librariesio/yarn-parser'})
+ origin = 'https://github.com/librariesio/yarn-parser'
rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
results = list(
@@ -67,7 +63,7 @@
assert len(results) == 1
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
- origin['id']]))
+ origin]))
assert len(results) == 1
@@ -82,11 +78,10 @@
indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
indexer.run(["https://example.com"])
- origin = storage.origin_get({
- 'url': 'https://example.com'})
+ origin = 'https://example.com'
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
- origin['id']]))
+ origin]))
assert results == []
@@ -102,10 +97,8 @@
indexer.run(["https://example.com",
"https://github.com/librariesio/yarn-parser"])
- origin1 = storage.origin_get({
- 'url': 'https://example.com'})
- origin2 = storage.origin_get({
- 'url': 'https://github.com/librariesio/yarn-parser'})
+ origin1 = 'https://example.com'
+ origin2 = 'https://github.com/librariesio/yarn-parser'
rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
rev_metadata = {
@@ -114,8 +107,7 @@
'mappings': ['npm'],
}
origin_metadata = {
- 'id': origin2['id'],
- 'origin_url': origin2['url'],
+ 'id': origin2,
'from_revision': rev_id,
'metadata': YARN_PARSER_METADATA,
'mappings': ['npm'],
@@ -128,7 +120,7 @@
assert results == [rev_metadata]
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
- origin1['id'], origin2['id']]))
+ origin1, origin2]))
for result in results:
del result['tool']
assert results == [origin_metadata]
@@ -142,19 +134,16 @@
indexer.run(["https://github.com/librariesio/yarn-parser",
"https://github.com/librariesio/yarn-parser.git"])
- origin1 = storage.origin_get({
- 'url': 'https://github.com/librariesio/yarn-parser'})
- origin2 = storage.origin_get({
- 'url': 'https://github.com/librariesio/yarn-parser.git'})
- assert origin1['id'] != origin2['id']
+ origin1 = 'https://github.com/librariesio/yarn-parser'
+ origin2 = 'https://github.com/librariesio/yarn-parser.git'
rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
results = list(
indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
assert len(results) == 1
- results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
- origin1['id'], origin2['id']]))
+ results = list(indexer.idx_storage.origin_intrinsic_metadata_get(
+ [origin1, origin2]))
assert len(results) == 2
@@ -166,8 +155,7 @@
b'foo.json'):
indexer.run(["https://github.com/librariesio/yarn-parser"])
- origin = storage.origin_get({
- 'url': 'https://github.com/librariesio/yarn-parser'})
+ origin = 'https://github.com/librariesio/yarn-parser'
rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
results = list(
@@ -175,7 +163,7 @@
assert results == []
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
- origin['id']]))
+ origin]))
assert results == []
@@ -188,8 +176,7 @@
return_value=(['npm'], {'@context': 'foo'})):
indexer.run(["https://github.com/librariesio/yarn-parser"])
- origin = storage.origin_get({
- 'url': 'https://github.com/librariesio/yarn-parser'})
+ origin = 'https://github.com/librariesio/yarn-parser'
rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
results = list(
@@ -197,7 +184,7 @@
assert results == []
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
- origin['id']]))
+ origin]))
assert results == []
@@ -210,8 +197,7 @@
return_value=None):
indexer.run(["https://github.com/librariesio/yarn-parser"])
- origin = storage.origin_get({
- 'url': 'https://github.com/librariesio/yarn-parser'})
+ origin = 'https://github.com/librariesio/yarn-parser'
rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
results = list(
@@ -219,7 +205,7 @@
assert results == []
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
- origin['id']]))
+ origin]))
assert results == []
@@ -229,8 +215,7 @@
indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
indexer.run(["https://github.com/librariesio/yarn-parser"])
- origin = storage.origin_get({
- 'url': 'https://github.com/librariesio/yarn-parser'})
+ origin = 'https://github.com/librariesio/yarn-parser'
rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
results = list(
@@ -238,7 +223,7 @@
assert results != []
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
- origin['id']]))
+ origin]))
assert results != []
with patch('swh.indexer.metadata_dictionary.npm.NpmMapping.filename',
@@ -250,5 +235,5 @@
assert results == []
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
- origin['id']]))
+ origin]))
assert results == []
