# ===========================================================================
# Part 1 -- swh/indexer/indexer.py
#
# Provenance: this class was carried by a patch hunk against
# swh/indexer/indexer.py (around line 384).  In the patch it is inserted
# right after the statement
# ``self.rescheduling_task.delay(ids, policy_update)`` and right before the
# pre-existing ``class RevisionIndexer(BaseIndexer)`` definition, which is
# untouched and therefore not reproduced here.
# ===========================================================================


class OriginIndexer(BaseIndexer):
    """An object type indexer, inherits from the :class:`BaseIndexer` and
    implements the process of indexation for Origins using the run
    method

    Note: the :class:`OriginIndexer` is not an instantiable object.
    To use it in another context one should inherit from this class
    and override the methods mentioned in the :class:`BaseIndexer`
    class.

    """
    def run(self, ids, policy_update, parse_ids=False):
        """Given a list of origin ids:

        - retrieve origins from storage
        - execute the indexing computations
        - store the results (according to policy_update)

        Args:
            ids ([Union[int, Tuple[str, bytes]]]): list of origin ids or
                (type, url) tuples.
            policy_update ([str]): either 'update-dups' or 'ignore-dups' to
                respectively update duplicates or ignore them
            parse_ids ([bool]): If `True`, will try to convert `ids`
                from a human input to the valid type: a string containing
                ':' is split once on '+' into [type, url], anything else
                is converted with `int()`.

        Raises:
            TypeError: if an id is neither an int nor a 2-element
                tuple/list (this includes a parsed string that contains
                ':' but no '+').
            ValueError: if `parse_ids` is set and a string without ':'
                is not a valid integer.

        """
        if parse_ids:
            ids = [
                o.split('+', 1) if ':' in o else int(o)  # type+url or id
                for o in ids]

        results = []

        for id_ in ids:
            if isinstance(id_, (tuple, list)):
                if len(id_) != 2:
                    raise TypeError('Expected a (type, url) tuple.')
                (type_, url) = id_
                params = {'type': type_, 'url': url}
            elif isinstance(id_, int):
                params = {'id': id_}
            else:
                raise TypeError('Invalid value for "ids": %r' % id_)
            origin = self.storage.origin_get(params)
            if not origin:
                # Report only the id that is actually missing, not the
                # whole batch.
                self.log.warning('Origin %s not found in storage', id_)
                continue
            try:
                res = self.index(origin)
                # ``index`` may return None (nothing to record for this
                # origin); such results must not reach the persistence
                # step.
                if res:
                    results.append(res)
            except Exception:
                # Best effort: one failing origin must not abort the
                # whole batch.
                self.log.exception(
                    'Problem when processing origin')
        self.persist_index_computations(results, policy_update)


# ===========================================================================
# Part 2 -- swh/indexer/origin_head.py (new file in the patch)
# ===========================================================================

# Copyright (C) 2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import click
import logging
from pprint import pprint  # noqa: F401 -- kept; debug prints were removed

from swh.indexer.indexer import OriginIndexer


class OriginHeadIndexer(OriginIndexer):
    """Origin-level indexer.

    This indexer is in charge of looking up the revision that acts as the
    "head" of an origin.

    In git, this is usually the commit pointed to by the 'master' branch."""

    ADDITIONAL_CONFIG = {
        'storage': ('dict', {
            'cls': 'local',
            'args': {
                'db': 'service=swh-replica',
                # Reuse the default value of the parent's 'objstorage'
                # configuration entry.
                'objstorage': OriginIndexer.DEFAULT_CONFIG['objstorage'][1],
            }
        }),
        'tools': ('dict', {
            'name': 'swh-head-revision',
            'version': '0.0.1',
            'configuration': {},
        }),
    }

    def filter(self, ids):
        """Yield every id unchanged (no filtering is performed)."""
        yield from ids

    def index(self, origin):
        """Look up the head revision of an origin.

        The lookup is delegated to ``_try_get_<type>_head`` when such a
        method exists for ``origin['type']``, and to
        :meth:`_try_get_head_generic` otherwise.

        Args:
            origin (dict): origin description; the keys 'id' and 'type'
                are read.

        Returns:
            dict: ``{'origin_id': ..., 'revision_id': ...}``, or None
            when no head revision could be determined.

        """
        origin_id = origin['id']
        latest_snapshot = self.storage.snapshot_get_latest(origin_id)
        # Diagnostics go through the logger rather than stdout so that
        # library users are not flooded with debug output.
        self.log.debug('Origin %r: latest snapshot is %r',
                       origin, latest_snapshot)
        method = getattr(self, '_try_get_%s_head' % origin['type'], None)
        if method is None:
            method = self._try_get_head_generic
        rev_id = method(latest_snapshot)
        self.log.debug('Origin %r: head revision is %r', origin_id, rev_id)
        if rev_id is None:
            return None
        return {
            'origin_id': origin_id,
            'revision_id': rev_id,
        }

    def _try_get_git_head(self, snapshot):
        """Return the revision targeted by the b'HEAD' branch, or None.

        Aliases are followed through :meth:`_try_resolve_target`, so a
        HEAD that points to another branch is resolved to that branch's
        revision instead of being ignored.

        """
        if not isinstance(snapshot, dict):
            return None
        try:
            branches = snapshot['branches']
        except KeyError:
            return None
        if not branches:
            return None
        return self._try_resolve_target(branches, b'HEAD')

    def _try_get_hg_head(self, snapshot):
        """Not implemented yet; always returns None."""
        pass  # TODO, see https://forge.softwareheritage.org/T1189

    def _try_get_ftp_head(self, snapshot):
        """Not implemented yet; always returns None."""
        pass  # TODO, look for branches named like 'pkgname-version.tar.gz'

    def _try_get_head_generic(self, snapshot):
        """Return the revision of b'HEAD', else of b'master', else None.

        Returns None as well when `snapshot` is not a dict or carries no
        branches.

        """
        # Works on 'deposit', 'svn', and 'pypi'.
        if not isinstance(snapshot, dict):
            # E.g. no snapshot available for the origin: without this
            # guard ``branches`` would be unbound below.
            return None
        try:
            branches = snapshot['branches']
        except KeyError:
            return None
        if not branches:
            return None
        return (
            self._try_resolve_target(branches, b'HEAD') or
            self._try_resolve_target(branches, b'master')
        )

    def _try_resolve_target(self, branches, target_name):
        """Follow aliases from `target_name` down to a revision.

        Args:
            branches (dict): mapping from branch name to a dict holding
                'target_type' and 'target'.
            target_name: name of the branch to start from.

        Returns:
            The 'target' of the first non-alias branch reached if its
            'target_type' is 'revision'; None if the branch (or one of
            the aliased branches) is missing, if the chain of aliases
            loops, or if it ends on something that is not a revision.

        """
        visited = {target_name}
        try:
            target = branches[target_name]
            # A branch mapped to None is treated as unresolved.
            # NOTE(review): assumes storage may represent dangling
            # branches this way -- confirm against the storage API.
            while target is not None and target['target_type'] == 'alias':
                alias = target['target']
                if alias in visited:
                    # Alias cycle: give up instead of looping forever.
                    return None
                visited.add(alias)
                target = branches[alias]
            if target is not None and target['target_type'] == 'revision':
                return target['target']
        except KeyError:
            pass
        return None

    def persist_index_computations(self, results, policy_update):
        """Not implemented yet; results are currently discarded."""
        pass  # TODO


@click.command()
@click.option('--origins', '-i', default=[
    'git+https://github.com/SoftwareHeritage/swh-storage',
    'git+https://github.com/SoftwareHeritage/swh-indexer',
    'deposit+https://forge.softwareheritage.org/source/jesuisgpl/',
    'ftp+rsync://ftp.gnu.org/gnu/3dldf',
    'hg+https://gnuplotutils.googlecode.com/hg/',
    'pypi+https://pypi.org/project/0-._.-._.-._.-._.-._.-._.-0/',
    'pypi+https://pypi.org/project/limnoria/',
    'svn+http://0-512-md.googlecode.com/svn/',
    ],
    help='Origins to lookup, in the "type+url" format',
    multiple=True)
def main(origins):
    """Run the origin head indexer on the given origins."""
    indexer = OriginHeadIndexer()
    indexer.run(origins, 'update-dups', parse_ids=True)


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    main()