diff --git a/requirements-swh.txt b/requirements-swh.txt --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -2,5 +2,5 @@ swh.model >= 0.0.15 swh.objstorage >= 0.0.28 swh.scheduler >= 0.0.47 -swh.storage >= 0.0.123 +swh.storage >= 0.0.141 swh.journal >= 0.0.6 diff --git a/sql/upgrades/125.sql b/sql/upgrades/125.sql new file mode 100644 --- /dev/null +++ b/sql/upgrades/125.sql @@ -0,0 +1,11 @@ +-- SWH Indexer DB schema upgrade +-- from_version: 124 +-- to_version: 125 +-- description: Add 'origin_url' column to origin_intrinsic_metadata. + +insert into dbversion(version, release, description) +values(125, now(), 'Work In Progress'); + +alter table origin_intrinsic_metadata + add column origin_url text; + diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py --- a/swh/indexer/indexer.py +++ b/swh/indexer/indexer.py @@ -4,7 +4,6 @@ # See top-level LICENSE file for more information import abc -import ast import os import logging import shutil @@ -526,37 +525,6 @@ return with_indexed_data -def origin_get_params(id_): - """From any of the two types of origin identifiers (int or - type+url), returns a dict that can be passed to Storage.origin_get. - Also accepts JSON-encoded forms of these (used via the task scheduler). 
- - >>> from pprint import pprint - >>> origin_get_params(123) - {'id': 123} - >>> pprint(origin_get_params(['git', 'https://example.com/foo.git'])) - {'type': 'git', 'url': 'https://example.com/foo.git'} - >>> origin_get_params("123") - {'id': 123} - >>> pprint(origin_get_params('["git", "https://example.com/foo.git"]')) - {'type': 'git', 'url': 'https://example.com/foo.git'} - """ - if isinstance(id_, str): - # Data coming from JSON, which requires string keys, so - # one extra level of deserialization is needed - id_ = ast.literal_eval(id_) - if isinstance(id_, (tuple, list)): - if len(id_) != 2: - raise TypeError('Expected a (type, url) tuple.') - (type_, url) = id_ - params = {'type': type_, 'url': url} - elif isinstance(id_, int): - params = {'id': id_} - else: - raise TypeError('Invalid value in "ids": %r' % id_) - return params - - class OriginIndexer(BaseIndexer): """An object type indexer, inherits from the :class:`BaseIndexer` and implements Origin indexing using the run method @@ -567,7 +535,7 @@ class. 
""" - def run(self, ids, policy_update='update-dups', parse_ids=True, + def run(self, origin_urls, policy_update='update-dups', next_step=None, **kwargs): """Given a list of origin ids: @@ -587,21 +555,7 @@ **kwargs: passed to the `index` method """ - if parse_ids: - ids = [o.split('+', 1) if ':' in o else int(o) # type+url or id - for o in ids] - - origins_filtered = [] - origins = self.storage.origin_get( - [origin_get_params(id_) for id_ in ids]) - for (id_, origin) in zip(ids, origins): - if not origin: - self.log.warning('Origin %s not found in storage' % - id_) - continue - origins_filtered.append(origin) - - results = self.index_list(origins_filtered, **kwargs) + results = self.index_list(origin_urls, **kwargs) self.persist_index_computations(results, policy_update) self.results = results diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -264,12 +264,15 @@ self.origin_head_indexer = OriginHeadIndexer(config=config) self.revision_metadata_indexer = RevisionMetadataIndexer(config=config) - def index_list(self, origins): + def index_list(self, origin_urls): head_rev_ids = [] origins_with_head = [] + origins = self.storage.origin_get( + [{'url': url} for url in origin_urls]) for origin in origins: - head_result = self.origin_head_indexer.index(origin) + head_result = self.origin_head_indexer.index(origin['url']) if head_result: + head_result['origin_id'] = origin['id'] origins_with_head.append(origin) head_rev_ids.append(head_result['revision_id']) @@ -280,13 +283,14 @@ for (origin, rev) in zip(origins_with_head, head_revs): if not rev: self.log.warning('Missing head revision of origin %r', - origin) + origin['url']) continue rev_metadata = self.revision_metadata_indexer.index(rev) orig_metadata = { 'from_revision': rev_metadata['id'], 'id': origin['id'], + 'origin_url': origin['url'], 'metadata': rev_metadata['metadata'], 'mappings': rev_metadata['mappings'], 'indexer_configuration_id': 
diff --git a/swh/indexer/origin_head.py b/swh/indexer/origin_head.py --- a/swh/indexer/origin_head.py +++ b/swh/indexer/origin_head.py @@ -27,22 +27,21 @@ # Dispatch - def index(self, origin): - origin_id = origin['id'] - latest_snapshot = self.storage.snapshot_get_latest(origin_id) + def index(self, origin_url): + latest_snapshot = self.storage.snapshot_get_latest(origin_url) if latest_snapshot is None: return None - method = getattr(self, '_try_get_%s_head' % origin['type'], None) - if method is None: - method = self._try_get_head_generic - rev_id = method(latest_snapshot) - if rev_id is None: - return None - result = { - 'origin_id': origin_id, - 'revision_id': rev_id, - } - return result + for method in (self._try_get_vcs_head, self._try_get_head_generic, + self._try_get_ftp_head): + rev_id = method(latest_snapshot) + if rev_id is not None: + return { + 'origin_url': origin_url, + 'revision_id': rev_id, + } + + # could not find a head revision + return None # VCSs @@ -54,8 +53,6 @@ except KeyError: return None - _try_get_hg_head = _try_get_git_head = _try_get_vcs_head - # Tarballs _archive_filename_re = re.compile( diff --git a/swh/indexer/sql/30-swh-schema.sql b/swh/indexer/sql/30-swh-schema.sql --- a/swh/indexer/sql/30-swh-schema.sql +++ b/swh/indexer/sql/30-swh-schema.sql @@ -14,7 +14,7 @@ ); insert into dbversion(version, release, description) - values(124, now(), 'Work In Progress'); + values(125, now(), 'Work In Progress'); -- Computing metadata on sha1's contents -- a SHA1 checksum (not necessarily originating from Git) @@ -130,6 +130,7 @@ create table origin_intrinsic_metadata( id bigserial not null, + origin_url text, metadata jsonb, indexer_configuration_id bigint not null, from_revision sha1_git not null, diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py --- a/swh/indexer/storage/__init__.py +++ b/swh/indexer/storage/__init__.py @@ -713,7 +713,8 @@ Args: metadata (iterable): dictionaries with keys: - - **id**: origin 
identifier + - **id**: legacy origin identifier + - **origin_url**: URL of the origin - **from_revision**: sha1 id of the revision used to generate these metadata. - **metadata**: arbitrary dict @@ -731,7 +732,8 @@ db.mktemp_origin_intrinsic_metadata(cur) db.copy_to(metadata, 'tmp_origin_intrinsic_metadata', - ['id', 'metadata', 'indexer_configuration_id', + ['id', 'origin_url', 'metadata', + 'indexer_configuration_id', 'from_revision', 'mappings'], cur) db.origin_intrinsic_metadata_add_from_temp(conflict_update, cur) @@ -763,7 +765,8 @@ Yields: list: dictionaries with the following keys: - - **id** (int) + - **id** (int): legacy origin identifier + - **origin_url** (str) - **metadata** (str): associated metadata - **tool** (dict): tool used to compute metadata - **mappings** (List[str]): list of mappings used to translate @@ -796,7 +799,8 @@ list: list of origin ids (int) if `ids_only=True`, else dictionaries with the following keys: - - **id** (int) + - **id** (int): legacy origin identifier + - **origin_url** (str) - **metadata** (str): associated metadata - **tool** (dict): tool used to compute metadata - **mappings** (List[str]): list of mappings used to translate diff --git a/swh/indexer/tests/test_origin_head.py b/swh/indexer/tests/test_origin_head.py --- a/swh/indexer/tests/test_origin_head.py +++ b/swh/indexer/tests/test_origin_head.py @@ -48,13 +48,11 @@ def test_git(self): self.indexer.run( - ['git+https://github.com/SoftwareHeritage/swh-storage']) - origin_id = self._get_origin_id( - 'git', 'https://github.com/SoftwareHeritage/swh-storage') + ['https://github.com/SoftwareHeritage/swh-storage']) self.assertEqual(self.indexer.results, [{ 'revision_id': b'8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{' b'\xd7}\xac\xefrm', - 'origin_id': origin_id}]) + 'origin_url': 'https://github.com/SoftwareHeritage/swh-storage'}]) def test_vcs_missing_snapshot(self): self.indexer.storage.origin_add([{ @@ -62,7 +60,7 @@ 'url': 'https://github.com/SoftwareHeritage/swh-indexer', }]) 
self.indexer.run( - ['git+https://github.com/SoftwareHeritage/swh-indexer']) + ['https://github.com/SoftwareHeritage/swh-indexer']) self.assertEqual(self.indexer.results, []) def test_pypi_missing_branch(self): @@ -82,18 +80,16 @@ } } }) - self.indexer.run(['pypi+https://pypi.org/project/abcdef/']) + self.indexer.run(['https://pypi.org/project/abcdef/']) self.assertEqual(self.indexer.results, []) def test_ftp(self): self.indexer.run( - ['ftp+rsync://ftp.gnu.org/gnu/3dldf']) - origin_id = self._get_origin_id( - 'ftp', 'rsync://ftp.gnu.org/gnu/3dldf') + ['rsync://ftp.gnu.org/gnu/3dldf']) self.assertEqual(self.indexer.results, [{ 'revision_id': b'\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee' b'\xcc\x1a\xb4`\x8c\x8by', - 'origin_id': origin_id}]) + 'origin_url': 'rsync://ftp.gnu.org/gnu/3dldf'}]) def test_ftp_missing_snapshot(self): self.indexer.storage.origin_add([{ @@ -101,19 +97,18 @@ 'url': 'rsync://ftp.gnu.org/gnu/foobar', }]) self.indexer.run( - ['ftp+rsync://ftp.gnu.org/gnu/foobar']) + ['rsync://ftp.gnu.org/gnu/foobar']) self.assertEqual(self.indexer.results, []) def test_deposit(self): self.indexer.run( - ['deposit+https://forge.softwareheritage.org/source/' + ['https://forge.softwareheritage.org/source/' 'jesuisgpl/']) - origin_id = self._get_origin_id( - 'deposit', 'https://forge.softwareheritage.org/source/jesuisgpl/') self.assertEqual(self.indexer.results, [{ 'revision_id': b'\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{' b'\xa6\xe9\x99\xb1\x9e]q\xeb', - 'origin_id': origin_id}]) + 'origin_url': 'https://forge.softwareheritage.org/source/' + 'jesuisgpl/'}]) def test_deposit_missing_snapshot(self): self.indexer.storage.origin_add([{ @@ -121,25 +116,21 @@ 'url': 'https://forge.softwareheritage.org/source/foobar', }]) self.indexer.run( - ['deposit+https://forge.softwareheritage.org/source/foobar']) + ['https://forge.softwareheritage.org/source/foobar']) self.assertEqual(self.indexer.results, []) def test_pypi(self): self.indexer.run( - 
['pypi+https://pypi.org/project/limnoria/']) - origin_id = self._get_origin_id( - 'pypi', 'https://pypi.org/project/limnoria/') + ['https://pypi.org/project/limnoria/']) self.assertEqual(self.indexer.results, [{ 'revision_id': b'\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k' b'A\x10\x9d\xc5\xfa2\xf8t', - 'origin_id': origin_id}]) + 'origin_url': 'https://pypi.org/project/limnoria/'}]) def test_svn(self): self.indexer.run( - ['svn+http://0-512-md.googlecode.com/svn/']) - origin_id = self._get_origin_id( - 'svn', 'http://0-512-md.googlecode.com/svn/') + ['http://0-512-md.googlecode.com/svn/']) self.assertEqual(self.indexer.results, [{ 'revision_id': b'\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8' b'\xc9\xad#.\x1bw=\x18', - 'origin_id': origin_id}]) + 'origin_url': 'http://0-512-md.googlecode.com/svn/'}]) diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py --- a/swh/indexer/tests/test_origin_metadata.py +++ b/swh/indexer/tests/test_origin_metadata.py @@ -17,10 +17,9 @@ idx_storage, storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) - indexer.run(["git+https://github.com/librariesio/yarn-parser"]) + indexer.run(["https://github.com/librariesio/yarn-parser"]) origin = storage.origin_get({ - 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') @@ -31,6 +30,7 @@ } origin_metadata = { 'id': origin['id'], + 'origin_url': origin['url'], 'from_revision': rev_id, 'metadata': YARN_PARSER_METADATA, 'mappings': ['npm'], @@ -54,12 +54,11 @@ indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) indexer.storage = storage indexer.idx_storage = idx_storage - indexer.run(["git+https://github.com/librariesio/yarn-parser"]) + indexer.run(["https://github.com/librariesio/yarn-parser"]) - indexer.run(["git+https://github.com/librariesio/yarn-parser"]*2) + indexer.run(["https://github.com/librariesio/yarn-parser"]*2) origin = 
storage.origin_get({ - 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') @@ -81,10 +80,9 @@ }]) indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) - indexer.run(["git+https://example.com"]) + indexer.run(["https://example.com"]) origin = storage.origin_get({ - 'type': 'git', 'url': 'https://example.com'}) results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ @@ -101,14 +99,12 @@ }]) indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) - indexer.run(["git+https://example.com", - "git+https://github.com/librariesio/yarn-parser"]) + indexer.run(["https://example.com", + "https://github.com/librariesio/yarn-parser"]) origin1 = storage.origin_get({ - 'type': 'git', 'url': 'https://example.com'}) origin2 = storage.origin_get({ - 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') @@ -119,6 +115,7 @@ } origin_metadata = { 'id': origin2['id'], + 'origin_url': origin2['url'], 'from_revision': rev_id, 'metadata': YARN_PARSER_METADATA, 'mappings': ['npm'], @@ -142,14 +139,12 @@ indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) indexer.storage = storage indexer.idx_storage = idx_storage - indexer.run(["git+https://github.com/librariesio/yarn-parser", - "git+https://github.com/librariesio/yarn-parser.git"]) + indexer.run(["https://github.com/librariesio/yarn-parser", + "https://github.com/librariesio/yarn-parser.git"]) origin1 = storage.origin_get({ - 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) origin2 = storage.origin_get({ - 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser.git'}) assert origin1['id'] != origin2['id'] rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') @@ -169,10 +164,9 @@ indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) with 
patch('swh.indexer.metadata_dictionary.npm.NpmMapping.filename', b'foo.json'): - indexer.run(["git+https://github.com/librariesio/yarn-parser"]) + indexer.run(["https://github.com/librariesio/yarn-parser"]) origin = storage.origin_get({ - 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') @@ -192,10 +186,9 @@ with patch('swh.indexer.metadata.RevisionMetadataIndexer' '.translate_revision_intrinsic_metadata', return_value=(['npm'], {'@context': 'foo'})): - indexer.run(["git+https://github.com/librariesio/yarn-parser"]) + indexer.run(["https://github.com/librariesio/yarn-parser"]) origin = storage.origin_get({ - 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') @@ -215,10 +208,9 @@ with patch('swh.indexer.metadata.RevisionMetadataIndexer' '.translate_revision_intrinsic_metadata', return_value=None): - indexer.run(["git+https://github.com/librariesio/yarn-parser"]) + indexer.run(["https://github.com/librariesio/yarn-parser"]) origin = storage.origin_get({ - 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') @@ -235,10 +227,9 @@ idx_storage, storage, obj_storage): indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG) - indexer.run(["git+https://github.com/librariesio/yarn-parser"]) + indexer.run(["https://github.com/librariesio/yarn-parser"]) origin = storage.origin_get({ - 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') @@ -252,7 +243,7 @@ with patch('swh.indexer.metadata_dictionary.npm.NpmMapping.filename', b'foo.json'): - indexer.run(["git+https://github.com/librariesio/yarn-parser"]) + indexer.run(["https://github.com/librariesio/yarn-parser"]) results = list( 
indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))