diff --git a/requirements-swh.txt b/requirements-swh.txt --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -2,5 +2,5 @@ swh.model >= 0.0.15 swh.objstorage >= 0.0.28 swh.scheduler >= 0.0.47 -swh.storage >= 0.0.123 +swh.storage >= 0.0.143 swh.journal >= 0.0.6 diff --git a/sql/upgrades/125.sql b/sql/upgrades/125.sql new file mode 100644 --- /dev/null +++ b/sql/upgrades/125.sql @@ -0,0 +1,11 @@ +-- SWH Indexer DB schema upgrade +-- from_version: 124 +-- to_version: 125 +-- description: Add 'origin_url' column to origin_intrinsic_metadata. + +insert into dbversion(version, release, description) +values(125, now(), 'Work In Progress'); + +alter table origin_intrinsic_metadata + add column origin_url text; + diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py --- a/swh/indexer/indexer.py +++ b/swh/indexer/indexer.py @@ -4,7 +4,6 @@ # See top-level LICENSE file for more information import abc -import ast import os import logging import shutil @@ -526,37 +525,6 @@ return with_indexed_data -def origin_get_params(id_): - """From any of the two types of origin identifiers (int or - type+url), returns a dict that can be passed to Storage.origin_get. - Also accepts JSON-encoded forms of these (used via the task scheduler). 
- - >>> from pprint import pprint - >>> origin_get_params(123) - {'id': 123} - >>> pprint(origin_get_params(['git', 'https://example.com/foo.git'])) - {'type': 'git', 'url': 'https://example.com/foo.git'} - >>> origin_get_params("123") - {'id': 123} - >>> pprint(origin_get_params('["git", "https://example.com/foo.git"]')) - {'type': 'git', 'url': 'https://example.com/foo.git'} - """ - if isinstance(id_, str): - # Data coming from JSON, which requires string keys, so - # one extra level of deserialization is needed - id_ = ast.literal_eval(id_) - if isinstance(id_, (tuple, list)): - if len(id_) != 2: - raise TypeError('Expected a (type, url) tuple.') - (type_, url) = id_ - params = {'type': type_, 'url': url} - elif isinstance(id_, int): - params = {'id': id_} - else: - raise TypeError('Invalid value in "ids": %r' % id_) - return params - - class OriginIndexer(BaseIndexer): """An object type indexer, inherits from the :class:`BaseIndexer` and implements Origin indexing using the run method @@ -567,7 +535,7 @@ class. 
""" - def run(self, ids, policy_update='update-dups', parse_ids=True, + def run(self, origin_urls, policy_update='update-dups', next_step=None, **kwargs): """Given a list of origin ids: @@ -587,21 +555,7 @@ **kwargs: passed to the `index` method """ - if parse_ids: - ids = [o.split('+', 1) if ':' in o else int(o) # type+url or id - for o in ids] - - origins_filtered = [] - origins = self.storage.origin_get( - [origin_get_params(id_) for id_ in ids]) - for (id_, origin) in zip(ids, origins): - if not origin: - self.log.warning('Origin %s not found in storage' % - id_) - continue - origins_filtered.append(origin) - - results = self.index_list(origins_filtered, **kwargs) + results = self.index_list(origin_urls, **kwargs) self.persist_index_computations(results, policy_update) self.results = results diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -264,12 +264,15 @@ self.origin_head_indexer = OriginHeadIndexer(config=config) self.revision_metadata_indexer = RevisionMetadataIndexer(config=config) - def index_list(self, origins): + def index_list(self, origin_urls): head_rev_ids = [] origins_with_head = [] + origins = self.storage.origin_get( + [{'url': url} for url in origin_urls]) for origin in origins: - head_result = self.origin_head_indexer.index(origin) + head_result = self.origin_head_indexer.index(origin['url']) if head_result: + head_result['origin_id'] = origin['id'] origins_with_head.append(origin) head_rev_ids.append(head_result['revision_id']) @@ -280,13 +283,14 @@ for (origin, rev) in zip(origins_with_head, head_revs): if not rev: self.log.warning('Missing head revision of origin %r', - origin) + origin['url']) continue rev_metadata = self.revision_metadata_indexer.index(rev) orig_metadata = { 'from_revision': rev_metadata['id'], 'id': origin['id'], + 'origin_url': origin['url'], 'metadata': rev_metadata['metadata'], 'mappings': rev_metadata['mappings'], 'indexer_configuration_id': 
diff --git a/swh/indexer/origin_head.py b/swh/indexer/origin_head.py --- a/swh/indexer/origin_head.py +++ b/swh/indexer/origin_head.py @@ -27,22 +27,25 @@ # Dispatch - def index(self, origin): - origin_id = origin['id'] - latest_snapshot = self.storage.snapshot_get_latest(origin_id) - if latest_snapshot is None: + def index(self, origin_url): + latest_visit = self.storage.origin_visit_get_latest( + origin_url, allowed_statuses=['full'], require_snapshot=True) + if latest_visit is None: return None - method = getattr(self, '_try_get_%s_head' % origin['type'], None) - if method is None: - method = self._try_get_head_generic + latest_snapshot = self.storage.snapshot_get(latest_visit['snapshot']) + method = getattr( + self, '_try_get_%s_head' % latest_visit['type'], + self._try_get_head_generic) + rev_id = method(latest_snapshot) - if rev_id is None: - return None - result = { - 'origin_id': origin_id, + if rev_id is not None: + return { + 'origin_url': origin_url, 'revision_id': rev_id, } - return result + + # could not find a head revision + return None # VCSs diff --git a/swh/indexer/sql/30-swh-schema.sql b/swh/indexer/sql/30-swh-schema.sql --- a/swh/indexer/sql/30-swh-schema.sql +++ b/swh/indexer/sql/30-swh-schema.sql @@ -14,7 +14,7 @@ ); insert into dbversion(version, release, description) - values(124, now(), 'Work In Progress'); + values(125, now(), 'Work In Progress'); -- Computing metadata on sha1's contents -- a SHA1 checksum (not necessarily originating from Git) @@ -130,6 +130,7 @@ create table origin_intrinsic_metadata( id bigserial not null, + origin_url text, metadata jsonb, indexer_configuration_id bigint not null, from_revision sha1_git not null, diff --git a/swh/indexer/sql/40-swh-func.sql b/swh/indexer/sql/40-swh-func.sql --- a/swh/indexer/sql/40-swh-func.sql +++ b/swh/indexer/sql/40-swh-func.sql @@ -413,8 +413,8 @@ begin perform swh_origin_intrinsic_metadata_compute_tsvector(); if conflict_update then - insert into origin_intrinsic_metadata (id, 
metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) - select id, metadata, indexer_configuration_id, from_revision, + insert into origin_intrinsic_metadata (id, origin_url, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) + select id, origin_url, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings from tmp_origin_intrinsic_metadata on conflict(id, indexer_configuration_id) @@ -423,8 +423,8 @@ mappings = excluded.mappings; else - insert into origin_intrinsic_metadata (id, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) - select id, metadata, indexer_configuration_id, from_revision, + insert into origin_intrinsic_metadata (id, origin_url, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings) + select id, origin_url, metadata, indexer_configuration_id, from_revision, metadata_tsvector, mappings from tmp_origin_intrinsic_metadata on conflict(id, indexer_configuration_id) diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py --- a/swh/indexer/storage/__init__.py +++ b/swh/indexer/storage/__init__.py @@ -692,7 +692,10 @@ Yields: list: dictionaries with the following keys: - - **id** (int) + - **id** (int): (legacy) origin identifier + - **origin_url** (str): origin URL + - **from_revision** (bytes): which revision this metadata + was extracted from - **metadata** (str): associated metadata - **tool** (dict): tool used to compute metadata - **mappings** (List[str]): list of mappings used to translate @@ -713,7 +716,8 @@ Args: metadata (iterable): dictionaries with keys: - - **id**: origin identifier + - **id**: legacy origin identifier + - **origin_url**: URL of the origin - **from_revision**: sha1 id of the revision used to generate these metadata. 
- **metadata**: arbitrary dict @@ -731,7 +735,8 @@ db.mktemp_origin_intrinsic_metadata(cur) db.copy_to(metadata, 'tmp_origin_intrinsic_metadata', - ['id', 'metadata', 'indexer_configuration_id', + ['id', 'origin_url', 'metadata', + 'indexer_configuration_id', 'from_revision', 'mappings'], cur) db.origin_intrinsic_metadata_add_from_temp(conflict_update, cur) @@ -763,7 +768,8 @@ Yields: list: dictionaries with the following keys: - - **id** (int) + - **id** (int): legacy origin identifier + - **origin_url** (str) - **metadata** (str): associated metadata - **tool** (dict): tool used to compute metadata - **mappings** (List[str]): list of mappings used to translate @@ -796,7 +802,8 @@ list: list of origin ids (int) if `ids_only=True`, else dictionaries with the following keys: - - **id** (int) + - **id** (int): legacy origin identifier + - **origin_url** (str) - **metadata** (str): associated metadata - **tool** (dict): tool used to compute metadata - **mappings** (List[str]): list of mappings used to translate diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py --- a/swh/indexer/storage/db.py +++ b/swh/indexer/storage/db.py @@ -336,7 +336,7 @@ self.revision_intrinsic_metadata_cols, cur=cur) origin_intrinsic_metadata_cols = [ - 'id', 'metadata', 'from_revision', 'mappings', + 'id', 'origin_url', 'metadata', 'from_revision', 'mappings', 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] origin_intrinsic_metadata_regconfig = 'pg_catalog.simple' diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py --- a/swh/indexer/storage/in_memory.py +++ b/swh/indexer/storage/in_memory.py @@ -628,6 +628,7 @@ metadata (iterable): dictionaries with keys: - **id**: origin identifier + - **origin_url**: URL of the origin - **from_revision**: sha1 id of the revision used to generate these metadata. 
- **metadata**: arbitrary dict diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py --- a/swh/indexer/tests/storage/test_storage.py +++ b/swh/indexer/tests/storage/test_storage.py @@ -1003,6 +1003,7 @@ } metadata_origin = { 'id': self.origin_id_1, + 'origin_url': 'file:///dev/zero', 'metadata': metadata, 'indexer_configuration_id': tool_id, 'mappings': ['mapping1'], @@ -1019,6 +1020,7 @@ expected_metadata = [{ 'id': self.origin_id_1, + 'origin_url': 'file:///dev/zero', 'metadata': metadata, 'tool': self.tools['swh-metadata-detector'], 'from_revision': self.revision_id_2, @@ -1043,6 +1045,7 @@ } metadata_origin = { 'id': self.origin_id_1, + 'origin_url': 'file:///dev/zero', 'metadata': metadata, 'indexer_configuration_id': tool_id, 'mappings': ['mapping1'], @@ -1089,12 +1092,14 @@ } metadata_rev_v1 = { 'id': self.revision_id_1, + 'origin_url': 'file:///dev/zero', 'metadata': metadata_v1.copy(), 'mappings': [], 'indexer_configuration_id': tool_id, } metadata_origin_v1 = { 'id': self.origin_id_1, + 'origin_url': 'file:///dev/zero', 'metadata': metadata_v1.copy(), 'indexer_configuration_id': tool_id, 'mappings': [], @@ -1111,6 +1116,7 @@ expected_metadata_v1 = [{ 'id': self.origin_id_1, + 'origin_url': 'file:///dev/zero', 'metadata': metadata_v1, 'tool': self.tools['swh-metadata-detector'], 'from_revision': self.revision_id_1, @@ -1156,6 +1162,7 @@ } metadata_origin_v1 = { 'id': self.origin_id_1, + 'origin_url': 'file:///dev/zero', 'metadata': metadata_v1.copy(), 'indexer_configuration_id': tool_id, 'mappings': [], @@ -1173,6 +1180,7 @@ # then expected_metadata_v1 = [{ 'id': self.origin_id_1, + 'origin_url': 'file:///dev/zero', 'metadata': metadata_v1, 'tool': self.tools['swh-metadata-detector'], 'from_revision': self.revision_id_2, @@ -1201,6 +1209,7 @@ expected_metadata_v2 = [{ 'id': self.origin_id_1, + 'origin_url': 'file:///dev/zero', 'metadata': metadata_v2, 'tool': self.tools['swh-metadata-detector'], 'from_revision': 
self.revision_id_2, @@ -1214,7 +1223,7 @@ # given tool_id = self.tools['swh-metadata-detector']['id'] - ids = list(range(1000)) + ids = list(range(10)) example_data1 = { 'metadata': { @@ -1244,6 +1253,7 @@ data_v1 = [ { 'id': id_, + 'origin_url': 'file:///tmp/origin%d' % id_, 'from_revision': self.revision_id_2, **example_data1, 'indexer_configuration_id': tool_id, @@ -1253,6 +1263,7 @@ data_v2 = [ { 'id': id_, + 'origin_url': 'file:///tmp/origin%d' % id_, 'from_revision': self.revision_id_2, **example_data2, 'indexer_configuration_id': tool_id, @@ -1275,6 +1286,7 @@ expected_data_v1 = [ { 'id': id_, + 'origin_url': 'file:///tmp/origin%d' % id_, 'from_revision': self.revision_id_2, **example_data1, 'tool': self.tools['swh-metadata-detector'], @@ -1307,6 +1319,7 @@ expected_data_v2 = [ { 'id': id_, + 'origin_url': 'file:///tmp/origin%d' % id_, 'from_revision': self.revision_id_2, **example_data2, 'tool': self.tools['swh-metadata-detector'], @@ -1333,6 +1346,7 @@ } metadata_origin = { 'id': self.origin_id_1, + 'origin_url': 'file:///dev/zero', 'metadata': metadata, 'indexer_configuration_id': tool_id, 'mappings': ['mapping1'], @@ -1361,6 +1375,7 @@ } metadata1_origin = { 'id': self.origin_id_1, + 'origin_url': 'file:///dev/zero', 'metadata': metadata1, 'mappings': [], 'indexer_configuration_id': tool_id, @@ -1371,12 +1386,14 @@ } metadata2_rev = { 'id': self.revision_id_2, + 'origin_url': 'file:///dev/zero', 'metadata': metadata2, 'mappings': [], 'indexer_configuration_id': tool_id, } metadata2_origin = { 'id': self.origin_id_2, + 'origin_url': 'file:///dev/zero', 'metadata': metadata2, 'mappings': [], 'indexer_configuration_id': tool_id, @@ -1427,6 +1444,7 @@ } metadata1_origin = { 'id': self.origin_id_1, + 'origin_url': 'file:///dev/zero', 'metadata': metadata1, 'mappings': [], 'indexer_configuration_id': tool_id, @@ -1446,6 +1464,7 @@ } metadata2_origin = { 'id': self.origin_id_2, + 'origin_url': 'file:///dev/zero', 'metadata': metadata2, 'mappings': [], 
'indexer_configuration_id': tool_id, @@ -1492,6 +1511,7 @@ } metadata1_origin = { 'id': self.origin_id_1, + 'origin_url': 'file:///dev/zero', 'metadata': metadata1, 'mappings': ['npm'], 'indexer_configuration_id': tool1_id, @@ -1509,6 +1529,7 @@ } metadata2_origin = { 'id': self.origin_id_2, + 'origin_url': 'file:///dev/zero', 'metadata': metadata2, 'mappings': ['npm', 'gemspec'], 'indexer_configuration_id': tool2_id, @@ -1525,6 +1546,7 @@ } metadata3_origin = { 'id': self.origin_id_3, + 'origin_url': 'file:///dev/zero', 'metadata': metadata3, 'mappings': ['pkg-info'], 'indexer_configuration_id': tool2_id, @@ -1598,6 +1620,7 @@ # test ids_only=False self.assertEqual(list(endpoint(mappings=['gemspec'])), [{ 'id': self.origin_id_2, + 'origin_url': 'file:///dev/zero', 'metadata': { '@context': 'foo', 'author': 'Jane Doe', diff --git a/swh/indexer/tests/test_cli.py b/swh/indexer/tests/test_cli.py --- a/swh/indexer/tests/test_cli.py +++ b/swh/indexer/tests/test_cli.py @@ -42,6 +42,7 @@ origin_metadata = [ { 'id': origin_id, + 'origin_url': 'file:///dev/zero', 'from_revision': hash_to_bytes('abcd{:0>4}'.format(origin_id)), 'indexer_configuration_id': tools[origin_id % 2]['id'], 'metadata': {'name': 'origin %d' % origin_id}, diff --git a/swh/indexer/tests/test_origin_head.py b/swh/indexer/tests/test_origin_head.py --- a/swh/indexer/tests/test_origin_head.py +++ b/swh/indexer/tests/test_origin_head.py @@ -41,20 +41,37 @@ self.indexer.catch_exceptions = False fill_storage(self.indexer.storage) - def _get_origin_id(self, type_, url): - origin = self.indexer.storage.origin_get({ - 'type': type_, 'url': url}) - return origin['id'] - def test_git(self): self.indexer.run( - ['git+https://github.com/SoftwareHeritage/swh-storage']) - origin_id = self._get_origin_id( - 'git', 'https://github.com/SoftwareHeritage/swh-storage') + ['https://github.com/SoftwareHeritage/swh-storage']) self.assertEqual(self.indexer.results, [{ 'revision_id': b'8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{' 
b'\xd7}\xac\xefrm', - 'origin_id': origin_id}]) + 'origin_url': 'https://github.com/SoftwareHeritage/swh-storage'}]) + + def test_git_partial_snapshot(self): + """Checks partial snapshots are ignored.""" + origin_url = 'https://github.com/SoftwareHeritage/swh-core' + self.indexer.storage.origin_add_one({ + 'type': 'git', + 'url': origin_url, + }) + visit = self.indexer.storage.origin_visit_add( + origin_url, '2019-02-27') + self.indexer.storage.snapshot_add([{ + 'id': b'foo', + 'branches': { + b'foo': None, + b'HEAD': { + 'target_type': 'alias', + 'target': b'foo', + } + } + }]) + self.indexer.storage.origin_visit_update( + origin_url, visit['visit'], status='partial', snapshot=b'foo') + self.indexer.run([origin_url]) + self.assertEqual(self.indexer.results, []) def test_vcs_missing_snapshot(self): self.indexer.storage.origin_add([{ @@ -62,18 +79,19 @@ 'url': 'https://github.com/SoftwareHeritage/swh-indexer', }]) self.indexer.run( - ['git+https://github.com/SoftwareHeritage/swh-indexer']) + ['https://github.com/SoftwareHeritage/swh-indexer']) self.assertEqual(self.indexer.results, []) def test_pypi_missing_branch(self): - origin_id = self.indexer.storage.origin_add_one({ + origin_url = 'https://pypi.org/project/abcdef/' + self.indexer.storage.origin_add_one({ 'type': 'pypi', - 'url': 'https://pypi.org/project/abcdef/', + 'url': origin_url, }) visit = self.indexer.storage.origin_visit_add( - origin_id, '2019-02-27') - self.indexer.storage.snapshot_add(origin_id, visit['visit'], { - 'id': 'foo', + origin_url, '2019-02-27') + self.indexer.storage.snapshot_add([{ + 'id': b'foo', 'branches': { b'foo': None, b'HEAD': { @@ -81,19 +99,19 @@ 'target': b'foo', } } - }) - self.indexer.run(['pypi+https://pypi.org/project/abcdef/']) + }]) + self.indexer.storage.origin_visit_update( + origin_url, visit['visit'], status='full', snapshot=b'foo') + self.indexer.run(['https://pypi.org/project/abcdef/']) self.assertEqual(self.indexer.results, []) def test_ftp(self): self.indexer.run( 
- ['ftp+rsync://ftp.gnu.org/gnu/3dldf']) - origin_id = self._get_origin_id( - 'ftp', 'rsync://ftp.gnu.org/gnu/3dldf') + ['rsync://ftp.gnu.org/gnu/3dldf']) self.assertEqual(self.indexer.results, [{ 'revision_id': b'\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee' b'\xcc\x1a\xb4`\x8c\x8by', - 'origin_id': origin_id}]) + 'origin_url': 'rsync://ftp.gnu.org/gnu/3dldf'}]) def test_ftp_missing_snapshot(self): self.indexer.storage.origin_add([{ @@ -101,19 +119,18 @@ 'url': 'rsync://ftp.gnu.org/gnu/foobar', }]) self.indexer.run( - ['ftp+rsync://ftp.gnu.org/gnu/foobar']) + ['rsync://ftp.gnu.org/gnu/foobar']) self.assertEqual(self.indexer.results, []) def test_deposit(self): self.indexer.run( - ['deposit+https://forge.softwareheritage.org/source/' + ['https://forge.softwareheritage.org/source/' 'jesuisgpl/']) - origin_id = self._get_origin_id( - 'deposit', 'https://forge.softwareheritage.org/source/jesuisgpl/') self.assertEqual(self.indexer.results, [{ 'revision_id': b'\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{' b'\xa6\xe9\x99\xb1\x9e]q\xeb', - 'origin_id': origin_id}]) + 'origin_url': 'https://forge.softwareheritage.org/source/' + 'jesuisgpl/'}]) def test_deposit_missing_snapshot(self): self.indexer.storage.origin_add([{ @@ -121,25 +138,21 @@ 'url': 'https://forge.softwareheritage.org/source/foobar', }]) self.indexer.run( - ['deposit+https://forge.softwareheritage.org/source/foobar']) + ['https://forge.softwareheritage.org/source/foobar']) self.assertEqual(self.indexer.results, []) def test_pypi(self): self.indexer.run( - ['pypi+https://pypi.org/project/limnoria/']) - origin_id = self._get_origin_id( - 'pypi', 'https://pypi.org/project/limnoria/') + ['https://pypi.org/project/limnoria/']) self.assertEqual(self.indexer.results, [{ 'revision_id': b'\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k' b'A\x10\x9d\xc5\xfa2\xf8t', - 'origin_id': origin_id}]) + 'origin_url': 'https://pypi.org/project/limnoria/'}]) def test_svn(self): self.indexer.run( - ['svn+http://0-512-md.googlecode.com/svn/']) - 
origin_id = self._get_origin_id( - 'svn', 'http://0-512-md.googlecode.com/svn/') + ['http://0-512-md.googlecode.com/svn/']) self.assertEqual(self.indexer.results, [{ 'revision_id': b'\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8' b'\xc9\xad#.\x1bw=\x18', - 'origin_id': origin_id}]) + 'origin_url': 'http://0-512-md.googlecode.com/svn/'}]) diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py --- a/swh/indexer/tests/test_origin_metadata.py +++ b/swh/indexer/tests/test_origin_metadata.py @@ -17,10 +17,9 @@ idx_storage, storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) - indexer.run(["git+https://github.com/librariesio/yarn-parser"]) + indexer.run(["https://github.com/librariesio/yarn-parser"]) origin = storage.origin_get({ - 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') @@ -31,6 +30,7 @@ } origin_metadata = { 'id': origin['id'], + 'origin_url': origin['url'], 'from_revision': rev_id, 'metadata': YARN_PARSER_METADATA, 'mappings': ['npm'], @@ -54,12 +54,11 @@ indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) indexer.storage = storage indexer.idx_storage = idx_storage - indexer.run(["git+https://github.com/librariesio/yarn-parser"]) + indexer.run(["https://github.com/librariesio/yarn-parser"]) - indexer.run(["git+https://github.com/librariesio/yarn-parser"]*2) + indexer.run(["https://github.com/librariesio/yarn-parser"]*2) origin = storage.origin_get({ - 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') @@ -81,10 +80,9 @@ }]) indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) - indexer.run(["git+https://example.com"]) + indexer.run(["https://example.com"]) origin = storage.origin_get({ - 'type': 'git', 'url': 'https://example.com'}) results = 
list(indexer.idx_storage.origin_intrinsic_metadata_get([ @@ -101,14 +99,12 @@ }]) indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) - indexer.run(["git+https://example.com", - "git+https://github.com/librariesio/yarn-parser"]) + indexer.run(["https://example.com", + "https://github.com/librariesio/yarn-parser"]) origin1 = storage.origin_get({ - 'type': 'git', 'url': 'https://example.com'}) origin2 = storage.origin_get({ - 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') @@ -119,6 +115,7 @@ } origin_metadata = { 'id': origin2['id'], + 'origin_url': origin2['url'], 'from_revision': rev_id, 'metadata': YARN_PARSER_METADATA, 'mappings': ['npm'], @@ -142,14 +139,12 @@ indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) indexer.storage = storage indexer.idx_storage = idx_storage - indexer.run(["git+https://github.com/librariesio/yarn-parser", - "git+https://github.com/librariesio/yarn-parser.git"]) + indexer.run(["https://github.com/librariesio/yarn-parser", + "https://github.com/librariesio/yarn-parser.git"]) origin1 = storage.origin_get({ - 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) origin2 = storage.origin_get({ - 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser.git'}) assert origin1['id'] != origin2['id'] rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') @@ -169,10 +164,9 @@ indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) with patch('swh.indexer.metadata_dictionary.npm.NpmMapping.filename', b'foo.json'): - indexer.run(["git+https://github.com/librariesio/yarn-parser"]) + indexer.run(["https://github.com/librariesio/yarn-parser"]) origin = storage.origin_get({ - 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') @@ -192,10 +186,9 @@ with patch('swh.indexer.metadata.RevisionMetadataIndexer' 
'.translate_revision_intrinsic_metadata', return_value=(['npm'], {'@context': 'foo'})): - indexer.run(["git+https://github.com/librariesio/yarn-parser"]) + indexer.run(["https://github.com/librariesio/yarn-parser"]) origin = storage.origin_get({ - 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') @@ -215,10 +208,9 @@ with patch('swh.indexer.metadata.RevisionMetadataIndexer' '.translate_revision_intrinsic_metadata', return_value=None): - indexer.run(["git+https://github.com/librariesio/yarn-parser"]) + indexer.run(["https://github.com/librariesio/yarn-parser"]) origin = storage.origin_get({ - 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') @@ -235,10 +227,9 @@ idx_storage, storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) - indexer.run(["git+https://github.com/librariesio/yarn-parser"]) + indexer.run(["https://github.com/librariesio/yarn-parser"]) origin = storage.origin_get({ - 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') @@ -252,7 +243,7 @@ with patch('swh.indexer.metadata_dictionary.npm.NpmMapping.filename', b'foo.json'): - indexer.run(["git+https://github.com/librariesio/yarn-parser"]) + indexer.run(["https://github.com/librariesio/yarn-parser"]) results = list( indexer.idx_storage.revision_intrinsic_metadata_get([rev_id])) diff --git a/swh/indexer/tests/utils.py b/swh/indexer/tests/utils.py --- a/swh/indexer/tests/utils.py +++ b/swh/indexer/tests/utils.py @@ -36,133 +36,133 @@ ORIGINS = [ { - 'id': 52189575, 'lister': None, 'project': None, 'type': 'git', 'url': 'https://github.com/SoftwareHeritage/swh-storage'}, { - 'id': 4423668, 'lister': None, 'project': None, 'type': 'ftp', 'url': 'rsync://ftp.gnu.org/gnu/3dldf'}, { - 'id': 77775770, 'lister': None, 
'project': None, 'type': 'deposit', 'url': 'https://forge.softwareheritage.org/source/jesuisgpl/'}, { - 'id': 85072327, 'lister': None, 'project': None, 'type': 'pypi', 'url': 'https://pypi.org/project/limnoria/'}, { - 'id': 49908349, 'lister': None, 'project': None, 'type': 'svn', 'url': 'http://0-512-md.googlecode.com/svn/'}, { - 'id': 54974445, 'lister': None, 'project': None, 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}, { - 'id': 54974446, 'lister': None, 'project': None, 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser.git'}, ] -SNAPSHOTS = { - 52189575: { - 'branches': { - b'refs/heads/add-revision-origin-cache': { - 'target': b'L[\xce\x1c\x88\x8eF\t\xf1"\x19\x1e\xfb\xc0' - b's\xe7/\xe9l\x1e', - 'target_type': 'revision'}, - b'HEAD': { - 'target': b'8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}' - b'\xac\xefrm', - 'target_type': 'revision'}, - b'refs/tags/v0.0.103': { - 'target': b'\xb6"Im{\xfdLb\xb0\x94N\xea\x96m\x13x\x88+' - b'\x0f\xdd', - 'target_type': 'release'}, - }}, - 4423668: { - 'branches': { - b'3DLDF-1.1.4.tar.gz': { - 'target': b'dJ\xfb\x1c\x91\xf4\x82B%]6\xa2\x90|\xd3\xfc' - b'"G\x99\x11', - 'target_type': 'revision'}, - b'3DLDF-2.0.2.tar.gz': { - 'target': b'\xb6\x0e\xe7\x9e9\xac\xaa\x19\x9e=' - b'\xd1\xc5\x00\\\xc6\xfc\xe0\xa6\xb4V', - 'target_type': 'revision'}, - b'3DLDF-2.0.3-examples.tar.gz': { - 'target': b'!H\x19\xc0\xee\x82-\x12F1\xbd\x97' - b'\xfe\xadZ\x80\x80\xc1\x83\xff', - 'target_type': 'revision'}, - b'3DLDF-2.0.3.tar.gz': { - 'target': b'\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee' - b'\xcc\x1a\xb4`\x8c\x8by', - 'target_type': 'revision'}, - b'3DLDF-2.0.tar.gz': { - 'target': b'F6*\xff(?\x19a\xef\xb6\xc2\x1fv$S\xe3G' - b'\xd3\xd1m', - b'target_type': 'revision'} - }}, - 77775770: { - 'branches': { - b'master': { - 'target': b'\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{' - b'\xa6\xe9\x99\xb1\x9e]q\xeb', - 'target_type': 'revision'} - }, - 'id': b"h\xc0\xd2a\x04\xd4~'\x8d\xd6\xbe\x07\xeda\xfa\xfbV" - 
b"\x1d\r "}, - 85072327: { - 'branches': { - b'HEAD': { - 'target': b'releases/2018.09.09', - 'target_type': 'alias'}, - b'releases/2018.09.01': { - 'target': b'<\xee1(\xe8\x8d_\xc1\xc9\xa6rT\xf1\x1d' - b'\xbb\xdfF\xfdw\xcf', - 'target_type': 'revision'}, - b'releases/2018.09.09': { - 'target': b'\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k' - b'A\x10\x9d\xc5\xfa2\xf8t', - 'target_type': 'revision'}}, - 'id': b'{\xda\x8e\x84\x7fX\xff\x92\x80^\x93V\x18\xa3\xfay' - b'\x12\x9e\xd6\xb3'}, - 49908349: { - 'branches': { - b'master': { - 'target': b'\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8' - b'\xc9\xad#.\x1bw=\x18', - 'target_type': 'revision'}}, - 'id': b'\xa1\xa2\x8c\n\xb3\x87\xa8\xf9\xe0a\x8c\xb7' - b'\x05\xea\xb8\x1f\xc4H\xf4s'}, - 54974445: { - 'branches': { - b'HEAD': { - 'target': hash_to_bytes( - '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), - 'target_type': 'revision'}}}, - 54974446: { - 'branches': { - b'HEAD': { - 'target': hash_to_bytes( - '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), - 'target_type': 'revision'}}}, - } +SNAPSHOTS = [ + { + 'origin': 'https://github.com/SoftwareHeritage/swh-storage', + 'branches': { + b'refs/heads/add-revision-origin-cache': { + 'target': b'L[\xce\x1c\x88\x8eF\t\xf1"\x19\x1e\xfb\xc0' + b's\xe7/\xe9l\x1e', + 'target_type': 'revision'}, + b'HEAD': { + 'target': b'8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}' + b'\xac\xefrm', + 'target_type': 'revision'}, + b'refs/tags/v0.0.103': { + 'target': b'\xb6"Im{\xfdLb\xb0\x94N\xea\x96m\x13x\x88+' + b'\x0f\xdd', + 'target_type': 'release'}, + }}, + { + 'origin': 'rsync://ftp.gnu.org/gnu/3dldf', + 'branches': { + b'3DLDF-1.1.4.tar.gz': { + 'target': b'dJ\xfb\x1c\x91\xf4\x82B%]6\xa2\x90|\xd3\xfc' + b'"G\x99\x11', + 'target_type': 'revision'}, + b'3DLDF-2.0.2.tar.gz': { + 'target': b'\xb6\x0e\xe7\x9e9\xac\xaa\x19\x9e=' + b'\xd1\xc5\x00\\\xc6\xfc\xe0\xa6\xb4V', + 'target_type': 'revision'}, + b'3DLDF-2.0.3-examples.tar.gz': { + 'target': b'!H\x19\xc0\xee\x82-\x12F1\xbd\x97' + 
b'\xfe\xadZ\x80\x80\xc1\x83\xff', + 'target_type': 'revision'}, + b'3DLDF-2.0.3.tar.gz': { + 'target': b'\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee' + b'\xcc\x1a\xb4`\x8c\x8by', + 'target_type': 'revision'}, + b'3DLDF-2.0.tar.gz': { + 'target': b'F6*\xff(?\x19a\xef\xb6\xc2\x1fv$S\xe3G' + b'\xd3\xd1m', + b'target_type': 'revision'} + }}, + { + 'origin': 'https://forge.softwareheritage.org/source/jesuisgpl/', + 'branches': { + b'master': { + 'target': b'\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{' + b'\xa6\xe9\x99\xb1\x9e]q\xeb', + 'target_type': 'revision'} + }, + 'id': b"h\xc0\xd2a\x04\xd4~'\x8d\xd6\xbe\x07\xeda\xfa\xfbV" + b"\x1d\r "}, + { + 'origin': 'https://pypi.org/project/limnoria/', + 'branches': { + b'HEAD': { + 'target': b'releases/2018.09.09', + 'target_type': 'alias'}, + b'releases/2018.09.01': { + 'target': b'<\xee1(\xe8\x8d_\xc1\xc9\xa6rT\xf1\x1d' + b'\xbb\xdfF\xfdw\xcf', + 'target_type': 'revision'}, + b'releases/2018.09.09': { + 'target': b'\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k' + b'A\x10\x9d\xc5\xfa2\xf8t', + 'target_type': 'revision'}}, + 'id': b'{\xda\x8e\x84\x7fX\xff\x92\x80^\x93V\x18\xa3\xfay' + b'\x12\x9e\xd6\xb3'}, + { + 'origin': 'http://0-512-md.googlecode.com/svn/', + 'branches': { + b'master': { + 'target': b'\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8' + b'\xc9\xad#.\x1bw=\x18', + 'target_type': 'revision'}}, + 'id': b'\xa1\xa2\x8c\n\xb3\x87\xa8\xf9\xe0a\x8c\xb7' + b'\x05\xea\xb8\x1f\xc4H\xf4s'}, + { + 'origin': 'https://github.com/librariesio/yarn-parser', + 'branches': { + b'HEAD': { + 'target': hash_to_bytes( + '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), + 'target_type': 'revision'}}}, + { + 'origin': 'https://github.com/librariesio/yarn-parser.git', + 'branches': { + b'HEAD': { + 'target': hash_to_bytes( + '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), + 'target_type': 'revision'}}}, +] REVISIONS = [{ @@ -551,24 +551,18 @@ def fill_storage(storage): for origin in ORIGINS: - origin = origin.copy() - del origin['id'] 
storage.origin_add_one(origin) - for (orig_pseudo_id, snap) in SNAPSHOTS.items(): - for orig in ORIGINS: - if orig_pseudo_id == orig['id']: - origin_id = storage.origin_get( - {'type': orig['type'], 'url': orig['url']})['id'] - break - else: - assert False - visit = storage.origin_visit_add(origin_id, datetime.datetime.now()) + for snap in SNAPSHOTS: + origin_url = snap['origin'] + visit = storage.origin_visit_add(origin_url, datetime.datetime.now()) snap_id = snap.get('id') or \ bytes([random.randint(0, 255) for _ in range(32)]) - storage.snapshot_add(origin_id, visit['visit'], { + storage.snapshot_add([{ 'id': snap_id, 'branches': snap['branches'] - }) + }]) + storage.origin_visit_update( + origin_url, visit['visit'], status='full', snapshot=snap_id) storage.revision_add(REVISIONS) storage.directory_add([{ 'id': DIRECTORY_ID,