D2207.id7651.diff
Attached to D2207: update indexer for storage 0.0.156
diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -2,5 +2,5 @@
swh.model >= 0.0.15
swh.objstorage >= 0.0.28
swh.scheduler >= 0.0.47
-swh.storage >= 0.0.155, < 0.0.156
+swh.storage >= 0.0.156
swh.journal >= 0.0.17
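
Note (not part of the patch): the swh.storage pin moves from the 0.0.155 series to 0.0.156 and later, the release this revision adapts to. A minimal sketch of how the old and new specifiers differ, using the packaging library:

    from packaging.specifiers import SpecifierSet

    old = SpecifierSet(">=0.0.155,<0.0.156")
    new = SpecifierSet(">=0.0.156")

    assert "0.0.155" in old and "0.0.155" not in new
    assert "0.0.156" not in old and "0.0.156" in new
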
diff --git a/sql/upgrades/127.sql b/sql/upgrades/127.sql
new file mode 100644
--- /dev/null
+++ b/sql/upgrades/127.sql
@@ -0,0 +1,63 @@
+-- SWH Indexer DB schema upgrade
+-- from_version: 126
+-- to_version: 127
+-- description: Remove swh_origin_intrinsic_metadata_add origin_url field and
+-- replace id by the former content of origin_url
+
+insert into dbversion(version, release, description)
+values(127, now(), 'Work In Progress');
+
+-- replace id column by origin_url
+alter table origin_intrinsic_metadata
+ drop constraint origin_intrinsic_metadata_indexer_configuration_id_fkey;
+alter table origin_intrinsic_metadata
+ drop constraint origin_intrinsic_metadata_pkey;
+alter table origin_intrinsic_metadata
+ drop column id;
+alter table origin_intrinsic_metadata
+ rename column origin_url to id;
+comment on column origin_intrinsic_metadata.id is 'url of the origin';
+
+-- replace functions that operate on this table
+create or replace function swh_origin_intrinsic_metadata_add(
+ conflict_update boolean)
+ returns void
+ language plpgsql
+as $$
+begin
+ perform swh_origin_intrinsic_metadata_compute_tsvector();
+ if conflict_update then
+ insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings)
+ select id, metadata, indexer_configuration_id, from_revision,
+ metadata_tsvector, mappings
+ from tmp_origin_intrinsic_metadata
+ on conflict(id, indexer_configuration_id)
+ do update set
+ metadata = excluded.metadata,
+ metadata_tsvector = excluded.metadata_tsvector,
+ mappings = excluded.mappings,
+ from_revision = excluded.from_revision;
+
+ else
+ insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings)
+ select id, metadata, indexer_configuration_id, from_revision,
+ metadata_tsvector, mappings
+ from tmp_origin_intrinsic_metadata
+ on conflict(id, indexer_configuration_id)
+ do nothing;
+ end if;
+ return;
+end
+$$;
+comment on function swh_origin_intrinsic_metadata_add(boolean) IS 'Add new origin intrinsic metadata';
+
+-- recreate indexes/constraints on this table
+create unique index origin_intrinsic_metadata_pkey
+ on origin_intrinsic_metadata(id, indexer_configuration_id);
+alter table origin_intrinsic_metadata
+ add primary key using index origin_intrinsic_metadata_pkey;
+
+alter table origin_intrinsic_metadata
+ add constraint origin_intrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
+alter table origin_intrinsic_metadata
+ validate constraint origin_intrinsic_metadata_indexer_configuration_id_fkey;
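
A hedged sanity check for this upgrade (not part of the patch; the DSN is a placeholder for the indexer database), confirming the version bump and the renamed column:

    import psycopg2

    with psycopg2.connect("dbname=softwareheritage-indexer") as conn:
        with conn.cursor() as cur:
            cur.execute("select max(version) from dbversion")
            assert cur.fetchone()[0] == 127
            cur.execute(
                "select data_type from information_schema.columns"
                " where table_name = 'origin_intrinsic_metadata'"
                " and column_name = 'id'")
            assert cur.fetchone()[0] == 'text'
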
diff --git a/swh/indexer/cli.py b/swh/indexer/cli.py
--- a/swh/indexer/cli.py
+++ b/swh/indexer/cli.py
@@ -145,7 +145,7 @@
def list_origins_by_producer(idx_storage, mappings, tool_ids):
- start = 0
+ start = ''
limit = 10000
while True:
origins = list(
@@ -154,7 +154,7 @@
mappings=mappings or None, tool_ids=tool_ids or None))
if not origins:
break
- start = origins[-1]+1
+ start = origins[-1] + '\x00' # first possible string after this
yield from origins
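
Since origin identifiers are now urls, the cursor moves from last+1 to last+'\x00': NUL is the lowest code point, so s+'\x00' is the first string sorting strictly after s, and the scan resumes without skipping or repeating anything. A standalone sketch of the same loop (fetch_page is a hypothetical range query returning at most limit sorted keys >= start):

    def iter_all(fetch_page, limit=10000):
        start = ''                     # '' sorts before every non-empty string
        while True:
            page = fetch_page(start=start, limit=limit)
            if not page:
                break
            yield from page
            start = page[-1] + '\x00'  # first possible string after the last url

    urls = ['file:///a', 'file:///b', 'file:///c']
    pages = lambda start, limit: [u for u in urls if u >= start][:limit]
    assert list(iter_all(pages, limit=2)) == urls
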
diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -535,15 +535,14 @@
"""
def run(self, origin_urls, policy_update='update-dups',
next_step=None, **kwargs):
- """Given a list of origin ids:
+ """Given a list of origin urls:
- retrieve origins from storage
- execute the indexing computations
- store the results (according to policy_update)
Args:
- ids ([Union[int, Tuple[str, bytes]]]): list of origin ids or
- (type, url) tuples.
+ origin_urls ([str]): list of origin urls.
policy_update (str): either 'update-dups' or 'ignore-dups' to
respectively update duplicates (default) or ignore them
next_step (dict): a dict in the form expected by
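
The docstring change reflects that run() now takes origin urls directly. A sketch mirroring the tests later in this diff (REVISION_METADATA_CONFIG and the import paths are taken from those tests, which normally run against in-memory storage fixtures):

    from swh.indexer.metadata import OriginMetadataIndexer
    from swh.indexer.tests.test_metadata import REVISION_METADATA_CONFIG

    indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
    indexer.run(['https://github.com/librariesio/yarn-parser'])  # urls, not ids
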
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -286,7 +286,6 @@
for origin in origins:
head_result = self.origin_head_indexer.index(origin['url'])
if head_result:
- head_result['origin_id'] = origin['id']
origins_with_head.append(origin)
head_rev_ids.append(head_result['revision_id'])
@@ -305,8 +304,7 @@
rev_metadata = self.revision_metadata_indexer.index(rev)
orig_metadata = {
'from_revision': rev_metadata['id'],
- 'id': origin['id'],
- 'origin_url': origin['url'],
+ 'id': origin['url'],
'metadata': rev_metadata['metadata'],
'mappings': rev_metadata['mappings'],
'indexer_configuration_id':
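
With the numeric origin id gone, the per-origin record is keyed by the url alone. An illustrative record shape (made-up values; the sha1 is the one the tests below use):

    orig_metadata = {
        'id': 'https://example.com/repo',    # origin url (was a numeric id)
        'from_revision': bytes.fromhex(
            '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
        'metadata': {'name': 'example'},
        'mappings': ['npm'],
        'indexer_configuration_id': 42,      # hypothetical tool id
    }
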
diff --git a/swh/indexer/sql/30-swh-schema.sql b/swh/indexer/sql/30-swh-schema.sql
--- a/swh/indexer/sql/30-swh-schema.sql
+++ b/swh/indexer/sql/30-swh-schema.sql
@@ -14,7 +14,7 @@
);
insert into dbversion(version, release, description)
- values(126, now(), 'Work In Progress');
+ values(127, now(), 'Work In Progress');
-- Computing metadata on sha1's contents
-- a SHA1 checksum (not necessarily originating from Git)
@@ -129,8 +129,7 @@
comment on column revision_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)';
create table origin_intrinsic_metadata(
- id bigserial not null,
- origin_url text,
+ id text not null, -- origin url
metadata jsonb,
indexer_configuration_id bigint not null,
from_revision sha1_git not null,
@@ -139,7 +138,7 @@
);
comment on table origin_intrinsic_metadata is 'keeps intrinsic metadata for an origin';
-comment on column origin_intrinsic_metadata.id is 'the entry id in origin';
+comment on column origin_intrinsic_metadata.id is 'url of the origin';
comment on column origin_intrinsic_metadata.metadata is 'metadata extracted from a revision';
comment on column origin_intrinsic_metadata.indexer_configuration_id is 'tool used to generate this metadata';
comment on column origin_intrinsic_metadata.from_revision is 'sha1 of the revision this metadata was copied from.';
diff --git a/swh/indexer/sql/40-swh-func.sql b/swh/indexer/sql/40-swh-func.sql
--- a/swh/indexer/sql/40-swh-func.sql
+++ b/swh/indexer/sql/40-swh-func.sql
@@ -413,8 +413,8 @@
begin
perform swh_origin_intrinsic_metadata_compute_tsvector();
if conflict_update then
- insert into origin_intrinsic_metadata (id, origin_url, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings)
- select id, origin_url, metadata, indexer_configuration_id, from_revision,
+ insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings)
+ select id, metadata, indexer_configuration_id, from_revision,
metadata_tsvector, mappings
from tmp_origin_intrinsic_metadata
on conflict(id, indexer_configuration_id)
@@ -422,12 +422,11 @@
metadata = excluded.metadata,
metadata_tsvector = excluded.metadata_tsvector,
mappings = excluded.mappings,
- origin_url = excluded.origin_url,
from_revision = excluded.from_revision;
else
- insert into origin_intrinsic_metadata (id, origin_url, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings)
- select id, origin_url, metadata, indexer_configuration_id, from_revision,
+ insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings)
+ select id, metadata, indexer_configuration_id, from_revision,
metadata_tsvector, mappings
from tmp_origin_intrinsic_metadata
on conflict(id, indexer_configuration_id)
@@ -453,4 +452,3 @@
set metadata_tsvector = to_tsvector('pg_catalog.simple', metadata);
end
$$;
-
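
The stored procedure keeps its upsert semantics, now keyed on (id, indexer_configuration_id) with id a url. A pure-Python model of the conflict_update switch (a sketch of the semantics, not the real code):

    def add(table, rows, conflict_update):
        for row in rows:
            key = (row['id'], row['indexer_configuration_id'])
            if key in table and not conflict_update:
                continue          # on conflict ... do nothing
            table[key] = row      # plain insert, or do update on conflict

    table = {}
    row = {'id': 'https://x', 'indexer_configuration_id': 1, 'metadata': {'v': 1}}
    add(table, [row], conflict_update=False)
    add(table, [{**row, 'metadata': {'v': 2}}], conflict_update=False)
    assert table[('https://x', 1)]['metadata'] == {'v': 1}   # duplicate ignored
    add(table, [{**row, 'metadata': {'v': 2}}], conflict_update=True)
    assert table[('https://x', 1)]['metadata'] == {'v': 2}   # duplicate updated
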
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -692,8 +692,7 @@
Yields:
list: dictionaries with the following keys:
- - **id** (int): (legacy) origin identifier
- - **origin_url** (str)
+ - **id** (str): origin url
- **from_revision** (bytes): which revision this metadata
was extracted from
- **metadata** (str): associated metadata
@@ -716,8 +715,7 @@
Args:
metadata (iterable): dictionaries with keys:
- - **id**: legacy origin identifier
- - **origin_url**
+ - **id**: origin url
- **from_revision**: sha1 id of the revision used to generate
these metadata.
- **metadata**: arbitrary dict
@@ -735,7 +733,7 @@
db.mktemp_origin_intrinsic_metadata(cur)
db.copy_to(metadata, 'tmp_origin_intrinsic_metadata',
- ['id', 'origin_url', 'metadata',
+ ['id', 'metadata',
'indexer_configuration_id',
'from_revision', 'mappings'],
cur)
@@ -749,7 +747,7 @@
Args:
entries (dict): dictionaries with the following keys:
- - **id** (int): origin identifier
+ - **id** (str): origin url
- **indexer_configuration_id** (int): tool used to compute
metadata
"""
@@ -768,8 +766,7 @@
Yields:
list: dictionaries with the following keys:
- - **id** (int): legacy origin identifier
- - **origin_url** (str)
+ - **id** (str): origin url
- **from_revision**: sha1 id of the revision used to generate
these metadata.
- **metadata** (str): associated metadata
@@ -786,17 +783,17 @@
@remote_api_endpoint('origin_intrinsic_metadata/search/by_producer')
@db_transaction_generator()
def origin_intrinsic_metadata_search_by_producer(
- self, start=0, end=None, limit=100, ids_only=False,
+ self, start='', end=None, limit=100, ids_only=False,
mappings=None, tool_ids=None,
db=None, cur=None):
"""Returns the list of origins whose metadata contain all the terms.
Args:
- start (int): The minimum origin id to return
- end (int): The maximum origin id to return
+ start (str): The minimum origin url to return
+ end (str): The maximum origin url to return
limit (int): The maximum number of results to return
- ids_only (bool): Determines whether only origin ids are returned
- or the content as well
+ ids_only (bool): Determines whether only origin urls are
+ returned or the content as well
mappings (List[str]): Returns origins whose intrinsic metadata
were generated using at least one of these mappings.
@@ -804,8 +801,7 @@
list: list of origin ids (int) if `ids_only=True`, else
dictionaries with the following keys:
- - **id** (int): legacy origin identifier
- - **origin_url** (str)
+ - **id** (str): origin url
- **from_revision**: sha1 id of the revision used to generate
these metadata.
- **metadata** (str): associated metadata
@@ -817,8 +813,8 @@
res = db.origin_intrinsic_metadata_search_by_producer(
start, end, limit, ids_only, mappings, tool_ids, cur)
if ids_only:
- for (origin_id,) in res:
- yield origin_id
+ for (origin,) in res:
+ yield origin
else:
for c in res:
yield converters.db_to_metadata(
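
A sketch of paging through the updated producer search with string cursors; idx_storage stands for any indexer-storage client (for instance the in-memory backend the tests use):

    def all_origins_by_mapping(idx_storage, mapping, page_size=100):
        start = ''
        while True:
            page = list(idx_storage.origin_intrinsic_metadata_search_by_producer(
                start=start, limit=page_size, ids_only=True, mappings=[mapping]))
            if not page:
                return
            yield from page
            start = page[-1] + '\x00'   # next possible url after the last one
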
diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py
--- a/swh/indexer/storage/db.py
+++ b/swh/indexer/storage/db.py
@@ -336,7 +336,7 @@
self.revision_intrinsic_metadata_cols, cur=cur)
origin_intrinsic_metadata_cols = [
- 'id', 'origin_url', 'metadata', 'from_revision', 'mappings',
+ 'id', 'metadata', 'from_revision', 'mappings',
'tool_id', 'tool_name', 'tool_version', 'tool_configuration']
origin_intrinsic_metadata_regconfig = 'pg_catalog.simple'
@@ -366,9 +366,9 @@
tuple((e['id'], e['indexer_configuration_id'])
for e in entries),)
- def origin_intrinsic_metadata_get_from_list(self, orig_ids, cur=None):
+ def origin_intrinsic_metadata_get_from_list(self, ids, cur=None):
yield from self._get_from_list(
- 'origin_intrinsic_metadata', orig_ids,
+ 'origin_intrinsic_metadata', ids,
self.origin_intrinsic_metadata_cols, cur=cur,
id_col='id')
diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py
--- a/swh/indexer/storage/in_memory.py
+++ b/swh/indexer/storage/in_memory.py
@@ -77,7 +77,7 @@
}
def get_all(self):
- yield from self.get(list(self._tools_per_id))
+ yield from self.get(self._sorted_ids)
def get_range(self, start, end, indexer_configuration_id, limit):
"""Retrieve data within range [start, end] bound by limit.
@@ -611,8 +611,7 @@
Yields:
list: dictionaries with the following keys:
- - **id** (int)
- - **origin_url** (str)
+ - **id** (str): origin url
- **from_revision** (bytes): which revision this metadata
was extracted from
- **metadata** (str): associated metadata
@@ -630,8 +629,7 @@
Args:
metadata (iterable): dictionaries with keys:
- - **id**: origin identifier
- - **origin_url**
+ - **id**: origin url
- **from_revision**: sha1 id of the revision used to generate
these metadata.
- **metadata**: arbitrary dict
@@ -650,7 +648,7 @@
Args:
entries (dict): dictionaries with the following keys:
- - **id** (int): origin identifier
+ - **id** (str): origin url
- **indexer_configuration_id** (int): tool used to compute
metadata
"""
@@ -667,8 +665,7 @@
Yields:
list: dictionaries with the following keys:
- - **id** (int)
- - **origin_url** (str)
+ - **id** (str): origin url
- **from_revision** (bytes): which revision this metadata
was extracted from
- **metadata** (str): associated metadata
@@ -709,14 +706,14 @@
yield result
def origin_intrinsic_metadata_search_by_producer(
- self, start=0, end=None, limit=100, ids_only=False,
+ self, start='', end=None, limit=100, ids_only=False,
mappings=None, tool_ids=None,
db=None, cur=None):
"""Returns the list of origins whose metadata contain all the terms.
Args:
- start (int): The minimum origin id to return
- end (int): The maximum origin id to return
+ start (str): The minimum origin url to return
+ end (str): The maximum origin url to return
limit (int): The maximum number of results to return
ids_only (bool): Determines whether only origin ids are returned
or the content as well
@@ -727,8 +724,7 @@
list: list of origin ids (int) if `ids_only=True`, else
dictionaries with the following keys:
- - **id** (int)
- - **origin_url** (str)
+ - **id** (str): origin url
- **from_revision**: sha1 id of the revision used to generate
these metadata.
- **metadata** (str): associated metadata
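
get_all now iterates self._sorted_ids, keeping results in key order, which the string-cursor pagination relies on. A sketch of the underlying idea (an assumption about intent, not the actual in-memory code):

    import bisect

    class SortedStore:
        def __init__(self):
            self._data = {}
            self._sorted_ids = []     # keys kept in lexicographic order

        def add(self, id_, value):
            if id_ not in self._data:
                bisect.insort(self._sorted_ids, id_)  # keep range scans cheap
            self._data[id_] = value

        def get_range(self, start, limit):
            i = bisect.bisect_left(self._sorted_ids, start)
            return self._sorted_ids[i:i + limit]
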
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
--- a/swh/indexer/tests/storage/test_storage.py
+++ b/swh/indexer/tests/storage/test_storage.py
@@ -450,9 +450,9 @@
'7026b7c1a2af56521e9587659012345678904321')
self.revision_id_3 = hash_to_bytes(
'7026b7c1a2af56521e9587659012345678904320')
- self.origin_id_1 = 44434341
- self.origin_id_2 = 44434342
- self.origin_id_3 = 54974445
+ self.origin_url_1 = 'file:///dev/0/zero' # 44434341
+ self.origin_url_2 = 'file:///dev/1/one' # 44434342
+ self.origin_url_3 = 'file:///dev/2/two' # 54974445
def test_check_config(self):
self.assertTrue(self.storage.check_config(check_write=True))
@@ -1002,8 +1002,7 @@
'indexer_configuration_id': tool_id,
}
metadata_origin = {
- 'id': self.origin_id_1,
- 'origin_url': 'file:///dev/zero',
+ 'id': self.origin_url_1,
'metadata': metadata,
'indexer_configuration_id': tool_id,
'mappings': ['mapping1'],
@@ -1016,11 +1015,10 @@
# then
actual_metadata = list(self.storage.origin_intrinsic_metadata_get(
- [self.origin_id_1, 42]))
+ [self.origin_url_1, 'no://where']))
expected_metadata = [{
- 'id': self.origin_id_1,
- 'origin_url': 'file:///dev/zero',
+ 'id': self.origin_url_1,
'metadata': metadata,
'tool': self.tools['swh-metadata-detector'],
'from_revision': self.revision_id_2,
@@ -1044,15 +1042,14 @@
'indexer_configuration_id': tool_id,
}
metadata_origin = {
- 'id': self.origin_id_1,
- 'origin_url': 'file:///dev/zero',
+ 'id': self.origin_url_1,
'metadata': metadata,
'indexer_configuration_id': tool_id,
'mappings': ['mapping1'],
'from_revision': self.revision_id_2,
}
metadata_origin2 = metadata_origin.copy()
- metadata_origin2['id'] = self.origin_id_2
+ metadata_origin2['id'] = self.origin_url_2
# when
self.storage.revision_intrinsic_metadata_add([metadata_rev])
@@ -1061,14 +1058,14 @@
self.storage.origin_intrinsic_metadata_delete([
{
- 'id': self.origin_id_1,
+ 'id': self.origin_url_1,
'indexer_configuration_id': tool_id
}
])
# then
actual_metadata = list(self.storage.origin_intrinsic_metadata_get(
- [self.origin_id_1, self.origin_id_2, 42]))
+ [self.origin_url_1, self.origin_url_2, 'no://where']))
for item in actual_metadata:
item['indexer_configuration_id'] = item.pop('tool')['id']
self.assertEqual(actual_metadata, [metadata_origin2])
@@ -1077,7 +1074,7 @@
tool_id = self.tools['swh-metadata-detector']['id']
self.storage.origin_intrinsic_metadata_delete([
{
- 'id': self.origin_id_1,
+ 'id': self.origin_url_1,
'indexer_configuration_id': tool_id
}
])
@@ -1092,14 +1089,12 @@
}
metadata_rev_v1 = {
'id': self.revision_id_1,
- 'origin_url': 'file:///dev/zero',
'metadata': metadata_v1.copy(),
'mappings': [],
'indexer_configuration_id': tool_id,
}
metadata_origin_v1 = {
- 'id': self.origin_id_1,
- 'origin_url': 'file:///dev/zero',
+ 'id': self.origin_url_1,
'metadata': metadata_v1.copy(),
'indexer_configuration_id': tool_id,
'mappings': [],
@@ -1112,11 +1107,10 @@
# when
actual_metadata = list(self.storage.origin_intrinsic_metadata_get(
- [self.origin_id_1, 42]))
+ [self.origin_url_1, 'no://where']))
expected_metadata_v1 = [{
- 'id': self.origin_id_1,
- 'origin_url': 'file:///dev/zero',
+ 'id': self.origin_url_1,
'metadata': metadata_v1,
'tool': self.tools['swh-metadata-detector'],
'from_revision': self.revision_id_1,
@@ -1141,7 +1135,7 @@
# then
actual_metadata = list(self.storage.origin_intrinsic_metadata_get(
- [self.origin_id_1]))
+ [self.origin_url_1]))
# metadata did not change as the v2 was dropped.
self.assertEqual(actual_metadata, expected_metadata_v1)
@@ -1161,8 +1155,7 @@
'indexer_configuration_id': tool_id,
}
metadata_origin_v1 = {
- 'id': self.origin_id_1,
- 'origin_url': 'file:///dev/zero',
+ 'id': self.origin_url_1,
'metadata': metadata_v1.copy(),
'indexer_configuration_id': tool_id,
'mappings': [],
@@ -1175,12 +1168,11 @@
# when
actual_metadata = list(self.storage.origin_intrinsic_metadata_get(
- [self.origin_id_1]))
+ [self.origin_url_1]))
# then
expected_metadata_v1 = [{
- 'id': self.origin_id_1,
- 'origin_url': 'file:///dev/zero',
+ 'id': self.origin_url_1,
'metadata': metadata_v1,
'tool': self.tools['swh-metadata-detector'],
'from_revision': self.revision_id_2,
@@ -1198,8 +1190,7 @@
metadata_origin_v2 = metadata_origin_v1.copy()
metadata_rev_v2['metadata'] = metadata_v2
metadata_origin_v2 = {
- 'id': self.origin_id_1,
- 'origin_url': 'file:///dev/null',
+ 'id': self.origin_url_1,
'metadata': metadata_v2.copy(),
'indexer_configuration_id': tool_id,
'mappings': ['npm'],
@@ -1212,11 +1203,10 @@
[metadata_origin_v2], conflict_update=True)
actual_metadata = list(self.storage.origin_intrinsic_metadata_get(
- [self.origin_id_1]))
+ [self.origin_url_1]))
expected_metadata_v2 = [{
- 'id': self.origin_id_1,
- 'origin_url': 'file:///dev/null',
+ 'id': self.origin_url_1,
'metadata': metadata_v2,
'tool': self.tools['swh-metadata-detector'],
'from_revision': self.revision_id_1,
@@ -1259,8 +1249,7 @@
data_v1 = [
{
- 'id': id_,
- 'origin_url': 'file:///tmp/origin%d' % id_,
+ 'id': 'file:///tmp/origin%d' % id_,
'from_revision': self.revision_id_2,
**example_data1,
'indexer_configuration_id': tool_id,
@@ -1269,8 +1258,7 @@
]
data_v2 = [
{
- 'id': id_,
- 'origin_url': 'file:///tmp/origin%d' % id_,
+ 'id': 'file:///tmp/origin%d' % id_,
'from_revision': self.revision_id_2,
**example_data2,
'indexer_configuration_id': tool_id,
@@ -1288,12 +1276,12 @@
self.storage.origin_intrinsic_metadata_add(data_v1)
# when
- actual_data = list(self.storage.origin_intrinsic_metadata_get(ids))
+ origins = ['file:///tmp/origin%d' % i for i in ids]
+ actual_data = list(self.storage.origin_intrinsic_metadata_get(origins))
expected_data_v1 = [
{
- 'id': id_,
- 'origin_url': 'file:///tmp/origin%d' % id_,
+ 'id': 'file:///tmp/origin%d' % id_,
'from_revision': self.revision_id_2,
**example_data1,
'tool': self.tools['swh-metadata-detector'],
@@ -1321,12 +1309,11 @@
t1.join()
t2.join()
- actual_data = list(self.storage.origin_intrinsic_metadata_get(ids))
+ actual_data = list(self.storage.origin_intrinsic_metadata_get(origins))
expected_data_v2 = [
{
- 'id': id_,
- 'origin_url': 'file:///tmp/origin%d' % id_,
+ 'id': 'file:///tmp/origin%d' % id_,
'from_revision': self.revision_id_2,
**example_data2,
'tool': self.tools['swh-metadata-detector'],
@@ -1352,8 +1339,7 @@
'indexer_configuration_id': tool_id,
}
metadata_origin = {
- 'id': self.origin_id_1,
- 'origin_url': 'file:///dev/zero',
+ 'id': self.origin_url_1,
'metadata': metadata,
'indexer_configuration_id': tool_id,
'mappings': ['mapping1'],
@@ -1381,8 +1367,7 @@
'indexer_configuration_id': tool_id,
}
metadata1_origin = {
- 'id': self.origin_id_1,
- 'origin_url': 'file:///dev/zero',
+ 'id': self.origin_url_1,
'metadata': metadata1,
'mappings': [],
'indexer_configuration_id': tool_id,
@@ -1393,14 +1378,13 @@
}
metadata2_rev = {
'id': self.revision_id_2,
- 'origin_url': 'file:///dev/zero',
+ 'origin': self.origin_url_1,
'metadata': metadata2,
'mappings': [],
'indexer_configuration_id': tool_id,
}
metadata2_origin = {
- 'id': self.origin_id_2,
- 'origin_url': 'file:///dev/zero',
+ 'id': self.origin_url_2,
'metadata': metadata2,
'mappings': [],
'indexer_configuration_id': tool_id,
@@ -1417,13 +1401,13 @@
search = self.storage.origin_intrinsic_metadata_search_fulltext
self.assertCountEqual(
[res['id'] for res in search(['Doe'])],
- [self.origin_id_1, self.origin_id_2])
+ [self.origin_url_1, self.origin_url_2])
self.assertEqual(
[res['id'] for res in search(['John', 'Doe'])],
- [self.origin_id_1])
+ [self.origin_url_1])
self.assertEqual(
[res['id'] for res in search(['John'])],
- [self.origin_id_1])
+ [self.origin_url_1])
self.assertEqual(
[res['id'] for res in search(['John', 'Jane'])],
[])
@@ -1450,8 +1434,7 @@
'indexer_configuration_id': tool_id,
}
metadata1_origin = {
- 'id': self.origin_id_1,
- 'origin_url': 'file:///dev/zero',
+ 'id': self.origin_url_1,
'metadata': metadata1,
'mappings': [],
'indexer_configuration_id': tool_id,
@@ -1470,8 +1453,7 @@
'indexer_configuration_id': tool_id,
}
metadata2_origin = {
- 'id': self.origin_id_2,
- 'origin_url': 'file:///dev/zero',
+ 'id': self.origin_url_2,
'metadata': metadata2,
'mappings': [],
'indexer_configuration_id': tool_id,
@@ -1488,19 +1470,19 @@
search = self.storage.origin_intrinsic_metadata_search_fulltext
self.assertEqual(
[res['id'] for res in search(['Doe'])],
- [self.origin_id_1, self.origin_id_2])
+ [self.origin_url_1, self.origin_url_2])
self.assertEqual(
[res['id'] for res in search(['Doe'], limit=1)],
- [self.origin_id_1])
+ [self.origin_url_1])
self.assertEqual(
[res['id'] for res in search(['John'])],
- [self.origin_id_1])
+ [self.origin_url_1])
self.assertEqual(
[res['id'] for res in search(['Jane'])],
- [self.origin_id_2, self.origin_id_1])
+ [self.origin_url_2, self.origin_url_1])
self.assertEqual(
[res['id'] for res in search(['John', 'Jane'])],
- [self.origin_id_1])
+ [self.origin_url_1])
def _fill_origin_intrinsic_metadata(self):
tool1_id = self.tools['swh-metadata-detector']['id']
@@ -1517,8 +1499,7 @@
'indexer_configuration_id': tool1_id,
}
metadata1_origin = {
- 'id': self.origin_id_1,
- 'origin_url': 'file:///dev/zero',
+ 'id': self.origin_url_1,
'metadata': metadata1,
'mappings': ['npm'],
'indexer_configuration_id': tool1_id,
@@ -1535,8 +1516,7 @@
'indexer_configuration_id': tool2_id,
}
metadata2_origin = {
- 'id': self.origin_id_2,
- 'origin_url': 'file:///dev/zero',
+ 'id': self.origin_url_2,
'metadata': metadata2,
'mappings': ['npm', 'gemspec'],
'indexer_configuration_id': tool2_id,
@@ -1552,8 +1532,7 @@
'indexer_configuration_id': tool2_id,
}
metadata3_origin = {
- 'id': self.origin_id_3,
- 'origin_url': 'file:///dev/zero',
+ 'id': self.origin_url_3,
'metadata': metadata3,
'mappings': ['pkg-info'],
'indexer_configuration_id': tool2_id,
@@ -1576,34 +1555,34 @@
# test pagination
self.assertCountEqual(
endpoint(ids_only=True),
- [self.origin_id_1, self.origin_id_2, self.origin_id_3])
+ [self.origin_url_1, self.origin_url_2, self.origin_url_3])
self.assertCountEqual(
- endpoint(start=0, ids_only=True),
- [self.origin_id_1, self.origin_id_2, self.origin_id_3])
+ endpoint(start=self.origin_url_1, ids_only=True),
+ [self.origin_url_1, self.origin_url_2, self.origin_url_3])
self.assertCountEqual(
- endpoint(start=0, limit=2, ids_only=True),
- [self.origin_id_1, self.origin_id_2])
+ endpoint(start=self.origin_url_1, limit=2, ids_only=True),
+ [self.origin_url_1, self.origin_url_2])
self.assertCountEqual(
- endpoint(start=self.origin_id_1+1, ids_only=True),
- [self.origin_id_2, self.origin_id_3])
+ endpoint(start=self.origin_url_1+'2', ids_only=True),
+ [self.origin_url_2, self.origin_url_3])
self.assertCountEqual(
- endpoint(start=self.origin_id_1+1, end=self.origin_id_3-1,
+ endpoint(start=self.origin_url_1+'2', end=self.origin_url_3[:-1],
ids_only=True),
- [self.origin_id_2])
+ [self.origin_url_2])
# test mappings filtering
self.assertCountEqual(
endpoint(mappings=['npm'], ids_only=True),
- [self.origin_id_1, self.origin_id_2])
+ [self.origin_url_1, self.origin_url_2])
self.assertCountEqual(
endpoint(mappings=['npm', 'gemspec'], ids_only=True),
- [self.origin_id_1, self.origin_id_2])
+ [self.origin_url_1, self.origin_url_2])
self.assertCountEqual(
endpoint(mappings=['gemspec'], ids_only=True),
- [self.origin_id_2])
+ [self.origin_url_2])
self.assertCountEqual(
endpoint(mappings=['pkg-info'], ids_only=True),
- [self.origin_id_3])
+ [self.origin_url_3])
self.assertCountEqual(
endpoint(mappings=['foobar'], ids_only=True),
[])
@@ -1611,23 +1590,22 @@
# test pagination + mappings
self.assertCountEqual(
endpoint(mappings=['npm'], limit=1, ids_only=True),
- [self.origin_id_1])
+ [self.origin_url_1])
# test tool filtering
self.assertCountEqual(
endpoint(tool_ids=[tool1['id']], ids_only=True),
- [self.origin_id_1])
+ [self.origin_url_1])
self.assertCountEqual(
endpoint(tool_ids=[tool2['id']], ids_only=True),
- [self.origin_id_2, self.origin_id_3])
+ [self.origin_url_2, self.origin_url_3])
self.assertCountEqual(
endpoint(tool_ids=[tool1['id'], tool2['id']], ids_only=True),
- [self.origin_id_1, self.origin_id_2, self.origin_id_3])
+ [self.origin_url_1, self.origin_url_2, self.origin_url_3])
# test ids_only=False
self.assertEqual(list(endpoint(mappings=['gemspec'])), [{
- 'id': self.origin_id_2,
- 'origin_url': 'file:///dev/zero',
+ 'id': self.origin_url_2,
'metadata': {
'@context': 'foo',
'author': 'Jane Doe',
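
The pagination tests above choose string bounds that sit strictly between the fixture urls; a quick illustration of why url_1+'2' and url_3[:-1] work:

    url_1 = 'file:///dev/0/zero'
    url_2 = 'file:///dev/1/one'
    url_3 = 'file:///dev/2/two'
    assert url_1 < url_1 + '2' < url_2     # a start just after url_1
    assert url_2 < url_3[:-1] < url_3      # an end just before url_3
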
diff --git a/swh/indexer/tests/test_cli.py b/swh/indexer/tests/test_cli.py
--- a/swh/indexer/tests/test_cli.py
+++ b/swh/indexer/tests/test_cli.py
@@ -42,8 +42,7 @@
origin_metadata = [
{
- 'id': origin_id,
- 'origin_url': 'file:///dev/zero',
+ 'id': 'file://dev/%04d' % origin_id,
'from_revision': hash_to_bytes('abcd{:0>4}'.format(origin_id)),
'indexer_configuration_id': tools[origin_id % 2]['id'],
'metadata': {'name': 'origin %d' % origin_id},
@@ -83,7 +82,8 @@
assert all(len(task['arguments']['args']) == 1 for task in tasks)
for task in tasks:
assert task['arguments']['kwargs'] == expected_kwargs, task
- assert _origins_in_task_args(tasks) == set(origins)
+ assert _origins_in_task_args(tasks) == set([
+ 'file://dev/%04d' % i for i in origins])
def invoke(scheduler, catch_exceptions, args):
@@ -325,12 +325,11 @@
def test_journal_client(storage, indexer_scheduler):
- """Tests the re-indexing when origin_batch_size*task_batch_size is a
- divisor of nb_origins."""
+ """Test the 'swh indexer journal-client' cli tool."""
message = FakeKafkaMessage('swh.journal.objects.origin_visit', 'bogus', {
'status': 'full',
'origin': {
- 'url': 'file:///dev/zero',
+ 'url': 'file://dev/0000',
}
})
@@ -359,4 +358,4 @@
assert len(tasks) == 1
_assert_tasks_for_origins(
tasks,
- ['file:///dev/zero'])
+ [0])
diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py
--- a/swh/indexer/tests/test_origin_metadata.py
+++ b/swh/indexer/tests/test_origin_metadata.py
@@ -13,14 +13,12 @@
from .test_metadata import REVISION_METADATA_CONFIG
-def test_origin_metadata_indexer(
- idx_storage, storage, obj_storage):
+def test_origin_metadata_indexer(idx_storage, storage, obj_storage):
indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
indexer.run(["https://github.com/librariesio/yarn-parser"])
- origin = storage.origin_get({
- 'url': 'https://github.com/librariesio/yarn-parser'})
+ origin = 'https://github.com/librariesio/yarn-parser'
rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
rev_metadata = {
@@ -29,8 +27,7 @@
'mappings': ['npm'],
}
origin_metadata = {
- 'id': origin['id'],
- 'origin_url': origin['url'],
+ 'id': origin,
'from_revision': rev_id,
'metadata': YARN_PARSER_METADATA,
'mappings': ['npm'],
@@ -43,7 +40,7 @@
assert results == [rev_metadata]
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
- origin['id']]))
+ origin]))
for result in results:
del result['tool']
assert results == [origin_metadata]
@@ -58,8 +55,7 @@
indexer.run(["https://github.com/librariesio/yarn-parser"]*2)
- origin = storage.origin_get({
- 'url': 'https://github.com/librariesio/yarn-parser'})
+ origin = 'https://github.com/librariesio/yarn-parser'
rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
results = list(
@@ -67,7 +63,7 @@
assert len(results) == 1
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
- origin['id']]))
+ origin]))
assert len(results) == 1
@@ -82,11 +78,10 @@
indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
indexer.run(["https://example.com"])
- origin = storage.origin_get({
- 'url': 'https://example.com'})
+ origin = 'https://example.com'
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
- origin['id']]))
+ origin]))
assert results == []
@@ -102,10 +97,8 @@
indexer.run(["https://example.com",
"https://github.com/librariesio/yarn-parser"])
- origin1 = storage.origin_get({
- 'url': 'https://example.com'})
- origin2 = storage.origin_get({
- 'url': 'https://github.com/librariesio/yarn-parser'})
+ origin1 = 'https://example.com'
+ origin2 = 'https://github.com/librariesio/yarn-parser'
rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
rev_metadata = {
@@ -114,8 +107,7 @@
'mappings': ['npm'],
}
origin_metadata = {
- 'id': origin2['id'],
- 'origin_url': origin2['url'],
+ 'id': origin2,
'from_revision': rev_id,
'metadata': YARN_PARSER_METADATA,
'mappings': ['npm'],
@@ -128,7 +120,7 @@
assert results == [rev_metadata]
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
- origin1['id'], origin2['id']]))
+ origin1, origin2]))
for result in results:
del result['tool']
assert results == [origin_metadata]
@@ -142,19 +134,16 @@
indexer.run(["https://github.com/librariesio/yarn-parser",
"https://github.com/librariesio/yarn-parser.git"])
- origin1 = storage.origin_get({
- 'url': 'https://github.com/librariesio/yarn-parser'})
- origin2 = storage.origin_get({
- 'url': 'https://github.com/librariesio/yarn-parser.git'})
- assert origin1['id'] != origin2['id']
+ origin1 = 'https://github.com/librariesio/yarn-parser'
+ origin2 = 'https://github.com/librariesio/yarn-parser.git'
rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
results = list(
indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
assert len(results) == 1
- results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
- origin1['id'], origin2['id']]))
+ results = list(indexer.idx_storage.origin_intrinsic_metadata_get(
+ [origin1, origin2]))
assert len(results) == 2
@@ -166,8 +155,7 @@
b'foo.json'):
indexer.run(["https://github.com/librariesio/yarn-parser"])
- origin = storage.origin_get({
- 'url': 'https://github.com/librariesio/yarn-parser'})
+ origin = 'https://github.com/librariesio/yarn-parser'
rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
results = list(
@@ -175,7 +163,7 @@
assert results == []
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
- origin['id']]))
+ origin]))
assert results == []
@@ -188,8 +176,7 @@
return_value=(['npm'], {'@context': 'foo'})):
indexer.run(["https://github.com/librariesio/yarn-parser"])
- origin = storage.origin_get({
- 'url': 'https://github.com/librariesio/yarn-parser'})
+ origin = 'https://github.com/librariesio/yarn-parser'
rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
results = list(
@@ -197,7 +184,7 @@
assert results == []
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
- origin['id']]))
+ origin]))
assert results == []
@@ -210,8 +197,7 @@
return_value=None):
indexer.run(["https://github.com/librariesio/yarn-parser"])
- origin = storage.origin_get({
- 'url': 'https://github.com/librariesio/yarn-parser'})
+ origin = 'https://github.com/librariesio/yarn-parser'
rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
results = list(
@@ -219,7 +205,7 @@
assert results == []
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
- origin['id']]))
+ origin]))
assert results == []
@@ -229,8 +215,7 @@
indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
indexer.run(["https://github.com/librariesio/yarn-parser"])
- origin = storage.origin_get({
- 'url': 'https://github.com/librariesio/yarn-parser'})
+ origin = 'https://github.com/librariesio/yarn-parser'
rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
results = list(
@@ -238,7 +223,7 @@
assert results != []
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
- origin['id']]))
+ origin]))
assert results != []
with patch('swh.indexer.metadata_dictionary.npm.NpmMapping.filename',
@@ -250,5 +235,5 @@
assert results == []
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
- origin['id']]))
+ origin]))
assert results == []