class OriginIndexer(BaseIndexer):
    """An object type indexer, inherits from the :class:`BaseIndexer` and
    implements Origin indexing using the run method

    Note: the :class:`OriginIndexer` is not an instantiable object.
    To use it in another context one should inherit from this class
    and override the methods mentioned in the :class:`BaseIndexer`
    class.

    """
    def run(self, ids, policy_update, parse_ids=False):
        """Given a list of origin ids:

        - retrieve origins from storage
        - execute the indexing computations
        - store the results (according to policy_update)

        Args:
            ids ([Union[int, Tuple[str, bytes]]]): list of origin ids or
                                                   (type, url) tuples.
            policy_update (str): either 'update-dups' or 'ignore-dups' to
                                 respectively update duplicates or ignore
                                 them
            parse_ids (bool): If `True`, will try to convert `ids`
                              from a human input to the valid type.

        Raises:
            TypeError: if an element of `ids` is neither an int nor a
                       two-element (type, url) tuple/list.

        """
        if parse_ids:
            # 'type+url' strings contain a ':' (from the URL scheme);
            # anything else is expected to be a numeric origin id.
            ids = [
                o.split('+', 1) if ':' in o else int(o)  # type+url or id
                for o in ids]

        results = []

        for id_ in ids:
            if isinstance(id_, (tuple, list)):
                if len(id_) != 2:
                    raise TypeError('Expected a (type, url) tuple.')
                (type_, url) = id_
                params = {'type': type_, 'url': url}
            elif isinstance(id_, int):
                params = {'id': id_}
            else:
                raise TypeError('Invalid value for "ids": %r' % id_)

            origin = self.storage.origin_get(params)
            if not origin:
                # Report the one origin that is missing, not the whole batch.
                self.log.warning('Origin %s not found in storage' % (id_,))
                continue

            try:
                res = self.index(origin)
            except Exception:
                # One failing origin must not abort the rest of the batch.
                self.log.exception(
                    'Problem when processing origin %s' % (id_,))
            else:
                if res:  # If no results, skip it
                    results.append(res)

        self.persist_index_computations(results, policy_update)
# Copyright (C) 2018  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import re
import click
import logging

from swh.indexer.indexer import OriginIndexer


class OriginHeadIndexer(OriginIndexer):
    """Origin-level indexer.

    This indexer is in charge of looking up the revision that acts as the
    "head" of an origin.

    In git, this is usually the commit pointed to by the 'master' branch."""

    ADDITIONAL_CONFIG = {
        'tools': ('dict', {
            'name': 'origin-metadata',
            'version': '0.0.1',
            'configuration': {},
        }),
    }

    def filter(self, ids):
        """Yield every id unchanged (no filtering is performed)."""
        yield from ids

    def persist_index_computations(self, results, policy_update):
        """Do nothing. The indexer's results are not persistent, they
        should only be piped to another indexer via the orchestrator."""
        pass

    # Dispatch

    def index(self, origin):
        """Look up the head revision of `origin`.

        The lookup strategy is selected from the origin's type: a method
        named ``_try_get_<type>_head`` is used when it exists, otherwise
        :meth:`_try_get_head_generic`.

        Args:
            origin (dict): origin with at least the 'id' and 'type' keys.

        Returns:
            dict with keys 'origin_id' and 'revision_id', or None if no
            head revision could be determined.

        """
        origin_id = origin['id']
        latest_snapshot = self.storage.snapshot_get_latest(origin_id)
        method = getattr(self, '_try_get_%s_head' % origin['type'], None)
        if method is None:
            method = self._try_get_head_generic
        rev_id = method(latest_snapshot)
        if rev_id is None:
            return None
        return {
            'origin_id': origin_id,
            'revision_id': rev_id,
        }

    # VCSs

    def _try_get_vcs_head(self, snapshot):
        """Return the target of the b'HEAD' branch if it is a revision,
        None otherwise (including when `snapshot` is not a dict)."""
        if not isinstance(snapshot, dict):
            return None
        try:
            head = snapshot['branches'][b'HEAD']
            if head['target_type'] == 'revision':
                return head['target']
        except KeyError:
            pass
        return None

    _try_get_hg_head = _try_get_git_head = _try_get_vcs_head

    # Tarballs

    # <pkgname>[-_]<version>[<-|+><preversion>]<.ext>+
    # Version components may have several digits (e.g. b'1.10').
    _archive_filename_re = re.compile(
        rb'^'
        rb'(?P<pkgname>.*)[-_]'
        rb'(?P<version>[0-9]+(\.[0-9]+)*)'
        rb'(?P<preversion>[-+][a-zA-Z0-9.~]+?)?'
        rb'(?P<extension>(\.[a-zA-Z0-9]+)+)'
        rb'$')

    @classmethod
    def _parse_version(cls, filename):
        """Extracts the release version from an archive filename,
        to get an ordering whose maximum is likely to be the last
        version of the software

        Args:
            filename (bytes): archive file name.

        Returns:
            tuple usable as a sort key: the numeric version components,
            followed by 0 when there is no pre-version suffix, or by
            (-1, suffix) / (1, suffix) for a '-suffix' / '+suffix'.
            ``(-inf,)`` if the filename cannot be parsed.

        >>> OriginHeadIndexer._parse_version(b'foo')
        (-inf,)
        >>> OriginHeadIndexer._parse_version(b'foo.tar.gz')
        (-inf,)
        >>> OriginHeadIndexer._parse_version(b'gnu-hello-0.0.1.tar.gz')
        (0, 0, 1, 0)
        >>> OriginHeadIndexer._parse_version(b'gnu-hello-0.0.1-beta2.tar.gz')
        (0, 0, 1, -1, 'beta2')
        >>> OriginHeadIndexer._parse_version(b'gnu-hello-0.0.1+foobar.tar.gz')
        (0, 0, 1, 1, 'foobar')
        """
        res = cls._archive_filename_re.match(filename)
        if res is None:
            return (float('-infinity'),)
        version = [int(n) for n in res.group('version').decode().split('.')]
        raw_preversion = res.group('preversion')
        if raw_preversion is None:
            version.append(0)
        else:
            preversion = raw_preversion.decode()
            # The regex guarantees the first character is '-' or '+':
            # '-' sorts before the plain release, '+' after it.
            version.append(-1 if preversion.startswith('-') else 1)
            version.append(preversion[1:])
        return tuple(version)

    def _try_get_ftp_head(self, snapshot):
        """Return the revision targeted by the branch whose archive name
        carries the highest version, or None if there is no usable
        snapshot or branch."""
        if not isinstance(snapshot, dict):
            return None
        branches = snapshot.get('branches')
        if not branches:
            # max() would raise ValueError on an empty collection.
            return None
        max_archive_name = max(branches, key=self._parse_version)
        return self._try_resolve_target(branches, max_archive_name)

    # Generic

    def _try_get_head_generic(self, snapshot):
        """Return the revision targeted by b'HEAD', falling back on
        b'master'; None if neither resolves to a revision."""
        # Works on 'deposit', 'svn', and 'pypi'.
        if not isinstance(snapshot, dict):
            return None
        try:
            branches = snapshot['branches']
        except KeyError:
            return None
        return (
            self._try_resolve_target(branches, b'HEAD') or
            self._try_resolve_target(branches, b'master')
        )

    def _try_resolve_target(self, branches, target_name):
        """Follow aliases starting from branch `target_name` and return
        the revision it ends on.

        Returns:
            the 'target' of the final branch when it is a revision; None
            when the branch (or an aliased branch) is missing, when the
            aliases form a cycle, or when the final target is a content,
            directory or release.

        Raises:
            ValueError: if the final branch has an unknown target type.

        """
        try:
            target = branches[target_name]
            seen = {target_name}
            while target['target_type'] == 'alias':
                alias_name = target['target']
                if alias_name in seen:
                    # Cyclic aliases would otherwise loop forever.
                    return None
                seen.add(alias_name)
                target = branches[alias_name]
            target_type = target['target_type']
        except KeyError:
            return None

        if target_type == 'revision':
            return target['target']
        if target_type in ('content', 'directory', 'release'):
            return None  # TODO
        raise ValueError('Unknown target type: %r' % (target_type,))


@click.command()
@click.option('--origins', '-i',
              help='Origins to lookup, in the "type+url" format',
              multiple=True)
def main(origins):
    """Run the origin head indexer on the origins given on the command
    line."""
    indexer = OriginHeadIndexer()
    indexer.run(origins, 'update-dups', parse_ids=True)


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    main()
# Copyright (C) 2017  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import unittest
import logging
from nose.tools import istest

from swh.indexer.origin_head import OriginHeadIndexer
from swh.indexer.tests.test_utils import MockIndexerStorage

# Origins served by MockStorage.origin_get, one per origin type under test.
ORIGINS = [
        {
            'id': 52189575,
            'lister': None,
            'project': None,
            'type': 'git',
            'url': 'https://github.com/SoftwareHeritage/swh-storage'},
        {
            'id': 4423668,
            'lister': None,
            'project': None,
            'type': 'ftp',
            'url': 'rsync://ftp.gnu.org/gnu/3dldf'},
        {
            'id': 77775770,
            'lister': None,
            'project': None,
            'type': 'deposit',
            'url': 'https://forge.softwareheritage.org/source/jesuisgpl/'},
        {
            'id': 85072327,
            'lister': None,
            'project': None,
            'type': 'pypi',
            'url': 'https://pypi.org/project/limnoria/'},
        {
            'id': 49908349,
            'lister': None,
            'project': None,
            'type': 'svn',
            'url': 'http://0-512-md.googlecode.com/svn/'},
        ]

# Latest snapshot of each origin above, keyed by origin id.
SNAPSHOTS = {
        52189575: {
            'branches': {
                b'refs/heads/add-revision-origin-cache': {
                    'target': b'L[\xce\x1c\x88\x8eF\t\xf1"\x19\x1e\xfb\xc0'
                              b's\xe7/\xe9l\x1e',
                    'target_type': 'revision'},
                b'HEAD': {
                    'target': b'8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}'
                              b'\xac\xefrm',
                    'target_type': 'revision'},
                b'refs/tags/v0.0.103': {
                    'target': b'\xb6"Im{\xfdLb\xb0\x94N\xea\x96m\x13x\x88+'
                              b'\x0f\xdd',
                    'target_type': 'release'},
                }},
        4423668: {
            'branches': {
                b'3DLDF-1.1.4.tar.gz': {
                    'target': b'dJ\xfb\x1c\x91\xf4\x82B%]6\xa2\x90|\xd3\xfc'
                              b'"G\x99\x11',
                    'target_type': 'revision'},
                b'3DLDF-2.0.2.tar.gz': {
                    'target': b'\xb6\x0e\xe7\x9e9\xac\xaa\x19\x9e='
                              b'\xd1\xc5\x00\\\xc6\xfc\xe0\xa6\xb4V',
                    'target_type': 'revision'},
                b'3DLDF-2.0.3-examples.tar.gz': {
                    'target': b'!H\x19\xc0\xee\x82-\x12F1\xbd\x97'
                              b'\xfe\xadZ\x80\x80\xc1\x83\xff',
                    'target_type': 'revision'},
                b'3DLDF-2.0.3.tar.gz': {
                    'target': b'\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee'
                              b'\xcc\x1a\xb4`\x8c\x8by',
                    'target_type': 'revision'},
                b'3DLDF-2.0.tar.gz': {
                    'target': b'F6*\xff(?\x19a\xef\xb6\xc2\x1fv$S\xe3G'
                              b'\xd3\xd1m',
                    # str key, like every other branch (was a bytes key)
                    'target_type': 'revision'}
                }},
        77775770: {
            'branches': {
                b'master': {
                    'target': b'\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{'
                              b'\xa6\xe9\x99\xb1\x9e]q\xeb',
                    'target_type': 'revision'}
            },
            'id': b"h\xc0\xd2a\x04\xd4~'\x8d\xd6\xbe\x07\xeda\xfa\xfbV"
                  b"\x1d\r "},
        85072327: {
            'branches': {
                b'HEAD': {
                    'target': b'releases/2018.09.09',
                    'target_type': 'alias'},
                b'releases/2018.09.01': {
                    'target': b'<\xee1(\xe8\x8d_\xc1\xc9\xa6rT\xf1\x1d'
                              b'\xbb\xdfF\xfdw\xcf',
                    'target_type': 'revision'},
                b'releases/2018.09.09': {
                    'target': b'\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k'
                              b'A\x10\x9d\xc5\xfa2\xf8t',
                    'target_type': 'revision'}},
            'id': b'{\xda\x8e\x84\x7fX\xff\x92\x80^\x93V\x18\xa3\xfay'
                  b'\x12\x9e\xd6\xb3'},
        49908349: {
            'branches': {
                b'master': {
                    'target': b'\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8'
                              b'\xc9\xad#.\x1bw=\x18',
                    'target_type': 'revision'}},
            'id': b'\xa1\xa2\x8c\n\xb3\x87\xa8\xf9\xe0a\x8c\xb7'
                  b'\x05\xea\xb8\x1f\xc4H\xf4s'},
        }


class MockStorage:
    """In-memory stand-in for the storage, backed by ORIGINS and
    SNAPSHOTS."""

    def origin_get(self, id_):
        """Return the origin matching `id_`, which is either
        ``{'id': ...}`` or ``{'type': ..., 'url': ...}``.

        Raises:
            AssertionError: if no known origin matches.
        """
        for origin in ORIGINS:
            if 'id' in id_:
                if origin['id'] == id_['id']:
                    return origin
            elif (origin['type'] == id_['type'] and
                    origin['url'] == id_['url']):
                return origin
        raise AssertionError(id_)

    def snapshot_get_latest(self, origin_id):
        """Return the snapshot registered for `origin_id`.

        Raises:
            AssertionError: if the origin has no registered snapshot.
        """
        if origin_id not in SNAPSHOTS:
            raise AssertionError(origin_id)
        return SNAPSHOTS[origin_id]


class TestOriginHeadIndexer(OriginHeadIndexer):
    """Specific indexer whose configuration is enough to satisfy the
    indexing tests.
    """
    def prepare(self):
        self.config = {
            'tools': {
                'name': 'origin-metadata',
                'version': '0.0.1',
                'configuration': {},
            },
        }
        self.storage = MockStorage()
        self.idx_storage = MockIndexerStorage()
        self.log = logging.getLogger('swh.indexer')
        self.objstorage = None
        self.tools = self.register_tools(self.config['tools'])
        self.tool = self.tools[0]
        self.results = None

    def persist_index_computations(self, results, policy_update):
        # Keep the results around so the tests can inspect them.
        self.results = results


class OriginHead(unittest.TestCase):
    """Check the head revision found for each supported origin type."""

    @istest
    def test_git(self):
        indexer = TestOriginHeadIndexer()
        indexer.run(
                ['git+https://github.com/SoftwareHeritage/swh-storage'],
                'update-dups', parse_ids=True)
        self.assertEqual(indexer.results, [{
            'revision_id': b'8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{'
                           b'\xd7}\xac\xefrm',
            'origin_id': 52189575}])

    @istest
    def test_ftp(self):
        indexer = TestOriginHeadIndexer()
        indexer.run(
                ['ftp+rsync://ftp.gnu.org/gnu/3dldf'],
                'update-dups', parse_ids=True)
        self.assertEqual(indexer.results, [{
            'revision_id': b'\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee'
                           b'\xcc\x1a\xb4`\x8c\x8by',
            'origin_id': 4423668}])

    @istest
    def test_deposit(self):
        indexer = TestOriginHeadIndexer()
        indexer.run(
                ['deposit+https://forge.softwareheritage.org/source/'
                 'jesuisgpl/'],
                'update-dups', parse_ids=True)
        self.assertEqual(indexer.results, [{
            'revision_id': b'\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{'
                           b'\xa6\xe9\x99\xb1\x9e]q\xeb',
            'origin_id': 77775770}])

    @istest
    def test_pypi(self):
        indexer = TestOriginHeadIndexer()
        indexer.run(
                ['pypi+https://pypi.org/project/limnoria/'],
                'update-dups', parse_ids=True)
        self.assertEqual(indexer.results, [{
            'revision_id': b'\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k'
                           b'A\x10\x9d\xc5\xfa2\xf8t',
            'origin_id': 85072327}])

    @istest
    def test_svn(self):
        indexer = TestOriginHeadIndexer()
        indexer.run(
                ['svn+http://0-512-md.googlecode.com/svn/'],
                'update-dups', parse_ids=True)
        self.assertEqual(indexer.results, [{
            'revision_id': b'\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8'
                           b'\xc9\xad#.\x1bw=\x18',
            'origin_id': 49908349}])