Page Menu | Home | Software Heritage

D490.id1467.diff
No One | Temporary

D490.id1467.diff

diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -384,6 +384,66 @@
self.rescheduling_task.delay(ids, policy_update)
class OriginIndexer(BaseIndexer):
    """An object type indexer, inherits from the :class:`BaseIndexer` and
    implements the process of indexation for Origins using the run
    method

    Note: the :class:`OriginIndexer` is not an instantiable object.
    To use it in another context one should inherit from this class
    and override the methods mentioned in the :class:`BaseIndexer`
    class.

    """
    def run(self, ids, policy_update, parse_ids=False):
        """Given a list of origin ids:

        - retrieve origins from storage
        - execute the indexing computations
        - store the results (according to policy_update)

        Origins that cannot be found in storage are logged and skipped.
        Exceptions raised while indexing one origin are logged and do not
        prevent the remaining origins from being processed.

        Args:
            ids ([Union[int, Tuple[str, bytes]]]): list of origin ids or
                                                   (type, url) tuples.
            policy_update ([str]): either 'update-dups' or 'ignore-dups' to
                                   respectively update duplicates or ignore
                                   them
            parse_ids ([bool]): If `True`, will try to convert `ids`
                                from a human input to the valid type.

        Raises:
            TypeError: if an element of `ids` is neither an int nor a
                two-element (type, url) tuple/list.
            ValueError: if `parse_ids` is set and an element without ':'
                cannot be converted by `int()`.

        """
        if parse_ids:
            # A ':' marks a URL, hence the 'type+url' form; anything else
            # is taken to be a numeric origin id.
            ids = [
                o.split('+', 1) if ':' in o else int(o)  # type+url or id
                for o in ids]

        results = []

        for id_ in ids:
            if isinstance(id_, (tuple, list)):
                if len(id_) != 2:
                    raise TypeError('Expected a (type, url) tuple.')
                (type_, url) = id_
                params = {'type': type_, 'url': url}
            elif isinstance(id_, int):
                params = {'id': id_}
            else:
                raise TypeError('Invalid value for "ids": %r' % id_)
            origin = self.storage.origin_get(params)
            if not origin:
                # Report only the origin that is actually missing, not the
                # whole input list.
                self.log.warning('Origin %s not found in storage', id_)
                continue
            try:
                res = self.index(origin)
                if res:  # If no results, skip it
                    results.append(res)
            except Exception:
                self.log.exception(
                    'Problem when processing origin %s', id_)
        self.persist_index_computations(results, policy_update)
+
+
class RevisionIndexer(BaseIndexer):
"""An object type indexer, inherits from the :class:`BaseIndexer` and
implements the process of indexation for Revisions using the run
diff --git a/swh/indexer/origin_head.py b/swh/indexer/origin_head.py
new file mode 100644
--- /dev/null
+++ b/swh/indexer/origin_head.py
@@ -0,0 +1,121 @@
+# Copyright (C) 2018 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import click
+import logging
+from pprint import pprint
+
+from swh.indexer.indexer import OriginIndexer
+
+
class OriginHeadIndexer(OriginIndexer):
    """Origin-level indexer.

    This indexer is in charge of looking up the revision that acts as the
    "head" of an origin.

    In git, this is usually the commit pointed to by the 'master' branch."""

    ADDITIONAL_CONFIG = {
        'storage': ('dict', {
            'cls': 'local',
            'args': {
                'db': 'service=swh-replica',
                'objstorage': OriginIndexer.DEFAULT_CONFIG['objstorage'][1],
            }
        }),
        'tools': ('dict', {
            'name': 'swh-head-revision',
            'version': '0.0.1',
            'configuration': {},
        }),
    }

    def filter(self, ids):
        """Yield every id unchanged; no filtering is applied."""
        yield from ids

    def index(self, origin):
        """Look up the head revision of an origin's latest snapshot.

        The lookup is dispatched on ``origin['type']`` to a
        ``_try_get_<type>_head`` method when one exists, and falls back to
        :meth:`_try_get_head_generic` otherwise.

        Args:
            origin (dict): origin with at least the 'id' and 'type' keys.

        Returns:
            dict with the 'origin_id' and 'revision_id' keys, or None when
            no head revision could be determined.

        """
        origin_id = origin['id']
        latest_snapshot = self.storage.snapshot_get_latest(origin_id)
        # Diagnostics go through the logger rather than stdout.
        self.log.debug('origin=%r latest_snapshot=%r',
                       origin, latest_snapshot)
        method = getattr(self, '_try_get_%s_head' % origin['type'], None)
        if method is None:
            method = self._try_get_head_generic
        rev_id = method(latest_snapshot)
        self.log.debug('head revision of origin %s: %r', origin_id, rev_id)
        if rev_id is None:
            return None
        return {
            'origin_id': origin_id,
            'revision_id': rev_id,
        }

    def _try_get_git_head(self, snapshot):
        """Return the target of the b'HEAD' branch if it is a revision.

        Returns None when the snapshot is not a dict, has no usable
        b'HEAD' branch, or b'HEAD' does not point directly to a revision.
        """
        if not isinstance(snapshot, dict):
            return None
        try:
            head = snapshot['branches'][b'HEAD']
            if head['target_type'] == 'revision':
                return head['target']
        except KeyError:
            pass
        return None

    def _try_get_hg_head(self, snapshot):
        pass  # TODO, see https://forge.softwareheritage.org/T1189

    def _try_get_ftp_head(self, snapshot):
        pass  # TODO, look for branches named like 'pkgname-version.tar.gz'

    def _try_get_head_generic(self, snapshot):
        """Resolve b'HEAD', then b'master', to a revision id.

        Returns None when the snapshot is not a dict or has no 'branches'
        key, or when neither branch resolves to a revision.
        """
        # Works on 'deposit', 'svn', and 'pypi'.
        # A non-dict snapshot (e.g. None) has no branches to inspect.
        if not isinstance(snapshot, dict):
            return None
        try:
            branches = snapshot['branches']
        except KeyError:
            return None
        return (
            self._try_resolve_target(branches, b'HEAD') or
            self._try_resolve_target(branches, b'master')
        )

    def _try_resolve_target(self, branches, target_name):
        """Follow aliases from `target_name` until a revision is reached.

        Returns the revision id, or None when a branch or key is missing,
        the chain ends on something other than a revision, or the aliases
        form a cycle.
        """
        try:
            target = branches[target_name]
            seen = {target_name}
            while target['target_type'] == 'alias':
                alias = target['target']
                if alias in seen:
                    # Cyclic alias chain: give up instead of looping forever.
                    return None
                seen.add(alias)
                target = branches[alias]
            if target['target_type'] == 'revision':
                return target['target']
        except KeyError:
            pass
        return None

    def persist_index_computations(self, results, policy_update):
        pass  # TODO
+
+
# Command-line entry point: runs the origin-head indexer over the origins
# given with --origins/-i (repeatable), or over the sample origins below.
# Documented with comments rather than a docstring, because click would
# use a docstring as the command's --help text.
@click.command()
@click.option(
    '--origins', '-i',
    multiple=True,
    help='Origins to lookup, in the "type+url" format',
    default=[
        'git+https://github.com/SoftwareHeritage/swh-storage',
        'git+https://github.com/SoftwareHeritage/swh-indexer',
        'deposit+https://forge.softwareheritage.org/source/jesuisgpl/',
        'ftp+rsync://ftp.gnu.org/gnu/3dldf',
        'hg+https://gnuplotutils.googlecode.com/hg/',
        'pypi+https://pypi.org/project/0-._.-._.-._.-._.-._.-._.-0/',
        'pypi+https://pypi.org/project/limnoria/',
        'svn+http://0-512-md.googlecode.com/svn/',
    ])
def main(origins):
    # parse_ids=True: the origins arrive as human-typed strings.
    OriginHeadIndexer().run(origins, 'update-dups', parse_ids=True)
+
+
# Script entry point: configure INFO-level logging, then hand over to the
# click command.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    main()

File Metadata

Mime Type
text/plain
Expires
Mon, Aug 18, 12:05 AM (1 w, 6 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3223090

Event Timeline