diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -79,7 +79,7 @@
     storage.
 
     To implement a new object type indexer, inherit from the
-    BaseIndexer and implement the process of indexation:
+    BaseIndexer and implement indexing:
 
     :func:`run`:
       object_ids are different depending on object. For example: sha1 for
@@ -337,8 +337,7 @@
 
 class ContentIndexer(BaseIndexer):
     """An object type indexer, inherits from the :class:`BaseIndexer` and
-    implements the process of indexation for Contents using the run
-    method
+    implements Content indexing using the run method
 
     Note: the :class:`ContentIndexer` is not an instantiable
     object. To use it in another context, one should inherit from this
@@ -384,10 +383,68 @@
                 self.rescheduling_task.delay(ids, policy_update)
 
 
+class OriginIndexer(BaseIndexer):
+    """An object type indexer, inherits from the :class:`BaseIndexer` and
+    implements Origin indexing using the run method
+
+    Note: the :class:`OriginIndexer` is not an instantiable object.
+    To use it in another context one should inherit from this class
+    and override the methods mentioned in the :class:`BaseIndexer`
+    class.
+
+    """
+    def run(self, ids, policy_update, parse_ids=False):
+        """Given a list of origin ids:
+
+        - retrieve origins from storage
+        - execute the indexing computations
+        - store the results (according to policy_update)
+
+        Args:
+            ids ([Union[int, Tuple[str, bytes]]]): list of origin ids or
+                (type, url) tuples.
+            policy_update ([str]): either 'update-dups' or 'ignore-dups' to
+                respectively update duplicates or ignore
+                them
+            parse_ids ([bool]): if `True`, will try to convert `ids`
+                from human-readable input to the expected types.
+
+        """
+        if parse_ids:
+            ids = [
+                o.split('+', 1) if ':' in o else int(o)  # type+url or id
+                for o in ids]
+
+        results = []
+
+        for id_ in ids:
+            if isinstance(id_, (tuple, list)):
+                if len(id_) != 2:
+                    raise TypeError('Expected a (type, url) tuple.')
+                (type_, url) = id_
+                params = {'type': type_, 'url': url}
+            elif isinstance(id_, int):
+                params = {'id': id_}
+            else:
+                raise TypeError('Invalid value for "ids": %r' % id_)
+            origin = self.storage.origin_get(params)
+            if not origin:
+                self.log.warning('Origin %s not found in storage' %
+                                 (id_,))
+                continue
+            try:
+                res = self.index(origin)
+                if res:  # skip origins with no indexing results
+                    results.append(res)
+            except Exception:
+                self.log.exception(
+                    'Problem when processing origin %s' % (id_,))
+        self.persist_index_computations(results, policy_update)
+
+
 class RevisionIndexer(BaseIndexer):
     """An object type indexer, inherits from the :class:`BaseIndexer` and
-    implements the process of indexation for Revisions using the run
-    method
+    implements Revision indexing using the run method
 
     Note: the :class:`RevisionIndexer` is not an instantiable object.
     To use it in another context one should inherit from this class
diff --git a/swh/indexer/origin_head.py b/swh/indexer/origin_head.py
new file mode 100644
--- /dev/null
+++ b/swh/indexer/origin_head.py
@@ -0,0 +1,170 @@
+# Copyright (C) 2018  The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import re
+import click
+import logging
+
+from swh.indexer.indexer import OriginIndexer
+
+
+class OriginHeadIndexer(OriginIndexer):
+    """Origin-level indexer.
+
+    This indexer is in charge of looking up the revision that acts as the
+    "head" of an origin.
+
+    In git, this is usually the commit pointed to by the 'master' branch."""
+
+    ADDITIONAL_CONFIG = {
+        'storage': ('dict', {
+            'cls': 'local',
+            'args': {
+                'db': 'service=swh-replica',
+                'objstorage': OriginIndexer.DEFAULT_CONFIG['objstorage'][1],
+            }
+        }),
+        'tools': ('dict', {
+            'name': 'swh-head-revision',
+            'version': '0.0.1',
+            'configuration': {},
+        }),
+    }
+
+    def filter(self, ids):
+        yield from ids
+
+    def persist_index_computations(self, results, policy_update):
+        pass  # TODO
+
+    # Dispatch
+
+    def index(self, origin):
+        origin_id = origin['id']
+        latest_snapshot = self.storage.snapshot_get_latest(origin_id)
+        method = getattr(self, '_try_get_%s_head' % origin['type'], None)
+        if method is None:
+            method = self._try_get_head_generic
+        rev_id = method(latest_snapshot)
+        if rev_id is None:
+            return None
+        result = {
+            'origin_id': origin_id,
+            'revision_id': rev_id,
+        }
+        return result
+
+    # VCSs
+
+    def _try_get_git_head(self, snapshot):
+        try:
+            if isinstance(snapshot, dict):
+                branches = snapshot['branches']
+            if branches[b'HEAD']['target_type'] == 'revision':
+                return branches[b'HEAD']['target']
+        except KeyError:
+            return None
+
+    def _try_get_hg_head(self, snapshot):
+        pass  # TODO, see https://forge.softwareheritage.org/T1189
+
+    # Tarballs
+
+    _archive_filename_re = re.compile(
+        rb'^'
+        rb'(?P<pkgname>.*)[-_]'
+        rb'(?P<version>[0-9]+(\.[0-9])*)'
+        rb'(?P<preversion>[-+][a-zA-Z0-9.~]+?)?'
+        rb'(?P<extension>(\.[a-zA-Z0-9]+)+)'
+        rb'$')
+
+    @classmethod
+    def _parse_version(cls, filename):
+        """Extracts the release version from an archive filename,
+        to get an ordering whose maximum is likely to be the last
+        version of the software
+
+        >>> OriginHeadIndexer._parse_version(b'foo')
+        (-inf,)
+        >>> OriginHeadIndexer._parse_version(b'foo.tar.gz')
+        (-inf,)
+        >>> OriginHeadIndexer._parse_version(b'gnu-hello-0.0.1.tar.gz')
+        (0, 0, 1, 0)
+        >>> OriginHeadIndexer._parse_version(b'gnu-hello-0.0.1-beta2.tar.gz')
+        (0, 0, 1, -1, 'beta2')
+        >>> OriginHeadIndexer._parse_version(b'gnu-hello-0.0.1+foobar.tar.gz')
+        (0, 0, 1, 1, 'foobar')
+        """
+        res = cls._archive_filename_re.match(filename)
+        if res is None:
+            return (float('-infinity'),)
+        version = [int(n) for n in res.group('version').decode().split('.')]
+        if res.group('preversion') is None:
+            version.append(0)
+        else:
+            preversion = res.group('preversion').decode()
+            if preversion.startswith('-'):
+                version.append(-1)
+                version.append(preversion[1:])
+            elif preversion.startswith('+'):
+                version.append(1)
+                version.append(preversion[1:])
+            else:
+                assert False, res.group('preversion')
+        return tuple(version)
+
+    def _try_get_ftp_head(self, snapshot):
+        archive_names = list(snapshot['branches'])
+        max_archive_name = max(archive_names, key=self._parse_version)
+        r = self._try_resolve_target(snapshot['branches'], max_archive_name)
+        return r
+
+    # Generic
+
+    def _try_get_head_generic(self, snapshot):
+        # Works on 'deposit', 'svn', and 'pypi'.
+        try:
+            if isinstance(snapshot, dict):
+                branches = snapshot['branches']
+        except KeyError:
+            return None
+        else:
+            return (
+                self._try_resolve_target(branches, b'HEAD') or
+                self._try_resolve_target(branches, b'master')
+            )
+
+    def _try_resolve_target(self, branches, target_name):
+        try:
+            target = branches[target_name]
+            while target['target_type'] == 'alias':
+                target = branches[target['target']]
+            if target['target_type'] == 'revision':
+                return target['target']
+        except KeyError:
+            return None
+
+
+@click.command()
+@click.option('--origins', '-i', default=[
+    'git+https://github.com/SoftwareHeritage/swh-storage',
+    'git+https://github.com/SoftwareHeritage/swh-indexer',
+    'deposit+https://forge.softwareheritage.org/source/jesuisgpl/',
+    'ftp+rsync://ftp.gnu.org/gnu/3dldf',
+    'hg+https://gnuplotutils.googlecode.com/hg/',
+    'pypi+https://pypi.org/project/0-._.-._.-._.-._.-._.-._.-0/',
+    'pypi+https://pypi.org/project/limnoria/',
+    'svn+http://0-512-md.googlecode.com/svn/',
+    ],
+    help='Origins to look up, in the "type+url" format',
+    multiple=True)
+def main(origins):
+    origin_head_indexer = OriginHeadIndexer()
+    origin_head_indexer.run(origins, 'update-dups', parse_ids=True)
+
+
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.INFO)
+    main()
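
Note (not part of the diff): as a quick reviewer aid, the sketch below reproduces the parse_ids=True conversion performed by OriginIndexer.run in swh/indexer/indexer.py. A string containing ':' is taken to be a "type+url" origin and split once on '+'; anything else is read as a numeric origin id. The input values here are made up for the example.

# Standalone illustration of the parse_ids logic in OriginIndexer.run.
ids = ['42', 'git+https://github.com/SoftwareHeritage/swh-indexer']
parsed = [
    o.split('+', 1) if ':' in o else int(o)  # (type, url) pair or integer id
    for o in ids]
print(parsed)
# [42, ['git', 'https://github.com/SoftwareHeritage/swh-indexer']]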
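Note (not part of the diff): a minimal sketch of the tarball heuristic behind _try_get_ftp_head, assuming the new module imports cleanly as swh.indexer.origin_head (its ADDITIONAL_CONFIG is evaluated at import time). Since _parse_version is a classmethod, it can be used directly as a max() key over hypothetical branch names, without a configured indexer or storage.

# Hypothetical FTP snapshot branch names, ordered by parsed version.
from swh.indexer.origin_head import OriginHeadIndexer

branch_names = [
    b'gnu-hello-0.0.1.tar.gz',
    b'gnu-hello-0.0.1-beta2.tar.gz',
    b'gnu-hello-0.0.2.tar.gz',
]
# (0, 0, 2, 0) > (0, 0, 1, 0) > (0, 0, 1, -1, 'beta2'), so the plain
# 0.0.2 release is picked over 0.0.1 and its beta.
print(max(branch_names, key=OriginHeadIndexer._parse_version))
# b'gnu-hello-0.0.2.tar.gz'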