Page MenuHomeSoftware Heritage

D490.diff
No OneTemporary

D490.diff

diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -79,15 +79,15 @@
storage.
To implement a new object type indexer, inherit from the
- BaseIndexer and implement the process of indexation:
+ BaseIndexer and implement indexing:
:func:`run`:
object_ids are different depending on object. For example: sha1 for
content, sha1_git for revision, directory, release, and id for origin
To implement a new concrete indexer, inherit from the object level
- classes: :class:`ContentIndexer`, :class:`RevisionIndexer` (later
- on :class:`OriginIndexer` will also be available)
+ classes: :class:`ContentIndexer`, :class:`RevisionIndexer`,
+ :class:`OriginIndexer`.
Then you need to implement the following functions:
@@ -337,8 +337,7 @@
class ContentIndexer(BaseIndexer):
"""An object type indexer, inherits from the :class:`BaseIndexer` and
- implements the process of indexation for Contents using the run
- method
+ implements Content indexing using the run method
Note: the :class:`ContentIndexer` is not an instantiable
object. To use it in another context, one should inherit from this
@@ -384,10 +383,68 @@
self.rescheduling_task.delay(ids, policy_update)
class OriginIndexer(BaseIndexer):
    """An object type indexer, inherits from the :class:`BaseIndexer` and
    implements Origin indexing using the run method

    Note: the :class:`OriginIndexer` is not an instantiable object.
    To use it in another context one should inherit from this class
    and override the methods mentioned in the :class:`BaseIndexer`
    class.

    """
    def run(self, ids, policy_update, parse_ids=False):
        """Given a list of origin ids:

        - retrieve origins from storage
        - execute the indexing computations
        - store the results (according to policy_update)

        Args:
            ids ([Union[int, Tuple[str, bytes]]]): list of origin ids or
                (type, url) tuples.
            policy_update (str): either 'update-dups' or 'ignore-dups' to
                respectively update duplicates or ignore them.
            parse_ids (bool): if `True`, will try to convert `ids` from a
                human input to the valid type.

        Raises:
            TypeError: if an element of `ids` is neither an int nor a
                (type, url) pair.

        """
        if parse_ids:
            # 'type+url' strings become [type, url] pairs; anything
            # without a ':' is assumed to be a numeric origin id.
            ids = [
                o.split('+', 1) if ':' in o else int(o)  # type+url or id
                for o in ids]

        results = []

        for id_ in ids:
            if isinstance(id_, (tuple, list)):
                if len(id_) != 2:
                    raise TypeError('Expected a (type, url) tuple.')
                (type_, url) = id_
                params = {'type': type_, 'url': url}
            elif isinstance(id_, int):
                params = {'id': id_}
            else:
                raise TypeError('Invalid value for "ids": %r' % id_)
            origin = self.storage.origin_get(params)
            if not origin:
                # Report the single origin that failed to resolve, not
                # the whole input list.
                self.log.warning('Origin %s not found in storage' % (id_,))
                continue
            try:
                res = self.index(origin)
                if res:  # If no results, skip it
                    results.append(res)
            except Exception:
                self.log.exception(
                    'Problem when processing origin %s' % id_)
        self.persist_index_computations(results, policy_update)
+
+
class RevisionIndexer(BaseIndexer):
"""An object type indexer, inherits from the :class:`BaseIndexer` and
- implements the process of indexation for Revisions using the run
- method
+ implements Revision indexing using the run method
Note: the :class:`RevisionIndexer` is not an instantiable object.
To use it in another context one should inherit from this class
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -268,9 +268,6 @@
@click.command()
@click.option('--revs', '-i',
- default=['8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f',
- '026040ea79dec1b49b4e3e7beda9132b6b26b51b',
- '9699072e21eded4be8d45e3b8d543952533fa190'],
help='Default sha1_git to lookup', multiple=True)
def main(revs):
_git_sha1s = list(map(hashutil.hash_to_bytes, revs))
diff --git a/swh/indexer/origin_head.py b/swh/indexer/origin_head.py
new file mode 100644
--- /dev/null
+++ b/swh/indexer/origin_head.py
@@ -0,0 +1,163 @@
+# Copyright (C) 2018 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import re
+import click
+import logging
+
+from swh.indexer.indexer import OriginIndexer
+
+
class OriginHeadIndexer(OriginIndexer):
    """Origin-level indexer.

    This indexer is in charge of looking up the revision that acts as the
    "head" of an origin.

    In git, this is usually the commit pointed to by the 'master' branch."""

    ADDITIONAL_CONFIG = {
        'tools': ('dict', {
            'name': 'origin-metadata',
            'version': '0.0.1',
            'configuration': {},
        }),
    }

    def filter(self, ids):
        yield from ids

    def persist_index_computations(self, results, policy_update):
        """Do nothing. The indexer's results are not persistent, they
        should only be piped to another indexer via the orchestrator."""
        pass

    # Dispatch

    def index(self, origin):
        """Resolve the head revision of a single origin.

        Args:
            origin (dict): an origin, with at least 'id' and 'type' keys.

        Returns:
            dict: {'origin_id': ..., 'revision_id': ...}, or None when no
            head revision could be determined.
        """
        origin_id = origin['id']
        latest_snapshot = self.storage.snapshot_get_latest(origin_id)
        # Use a type-specific resolver when one exists (eg.
        # _try_get_git_head), otherwise fall back to the generic one.
        method = getattr(self, '_try_get_%s_head' % origin['type'], None)
        if method is None:
            method = self._try_get_head_generic
        rev_id = method(latest_snapshot)
        if rev_id is None:
            return None
        result = {
            'origin_id': origin_id,
            'revision_id': rev_id,
        }
        return result

    # VCSs

    def _try_get_vcs_head(self, snapshot):
        """Return the revision targeted by the b'HEAD' branch, or None
        if there is no snapshot, no HEAD, or HEAD is not a revision."""
        try:
            if isinstance(snapshot, dict):
                branches = snapshot['branches']
                if branches[b'HEAD']['target_type'] == 'revision':
                    return branches[b'HEAD']['target']
        except KeyError:
            return None

    _try_get_hg_head = _try_get_git_head = _try_get_vcs_head

    # Tarballs

    # Raw byte strings: '\.' in a non-raw literal is an invalid escape
    # sequence (DeprecationWarning, future SyntaxError).
    _archive_filename_re = re.compile(
        rb'^'
        rb'(?P<pkgname>.*)[-_]'
        rb'(?P<version>[0-9]+(\.[0-9])*)'
        rb'(?P<preversion>[-+][a-zA-Z0-9.~]+?)?'
        rb'(?P<extension>(\.[a-zA-Z0-9]+)+)'
        rb'$')

    @classmethod
    def _parse_version(cls, filename):
        """Extracts the release version from an archive filename,
        to get an ordering whose maximum is likely to be the last
        version of the software

        >>> OriginHeadIndexer._parse_version(b'foo')
        (-inf,)
        >>> OriginHeadIndexer._parse_version(b'foo.tar.gz')
        (-inf,)
        >>> OriginHeadIndexer._parse_version(b'gnu-hello-0.0.1.tar.gz')
        (0, 0, 1, 0)
        >>> OriginHeadIndexer._parse_version(b'gnu-hello-0.0.1-beta2.tar.gz')
        (0, 0, 1, -1, 'beta2')
        >>> OriginHeadIndexer._parse_version(b'gnu-hello-0.0.1+foobar.tar.gz')
        (0, 0, 1, 1, 'foobar')
        """
        res = cls._archive_filename_re.match(filename)
        if res is None:
            # Unparseable names sort below every real version.
            return (float('-infinity'),)
        version = [int(n) for n in res.group('version').decode().split('.')]
        if res.group('preversion') is None:
            version.append(0)
        else:
            preversion = res.group('preversion').decode()
            if preversion.startswith('-'):
                # Pre-release (eg. '-beta2') sorts before the release.
                version.append(-1)
                version.append(preversion[1:])
            elif preversion.startswith('+'):
                # Post-release (eg. '+foobar') sorts after the release.
                version.append(1)
                version.append(preversion[1:])
            else:
                # The regex guarantees preversion starts with '-' or '+'.
                assert False, res.group('preversion')
        return tuple(version)

    def _try_get_ftp_head(self, snapshot):
        """Pick the archive branch with the highest parsed version and
        resolve its target; None when there is no usable snapshot."""
        if not isinstance(snapshot, dict):
            return None
        archive_names = list(snapshot['branches'])
        if not archive_names:
            # max() would raise ValueError on an empty sequence.
            return None
        max_archive_name = max(archive_names, key=self._parse_version)
        r = self._try_resolve_target(snapshot['branches'], max_archive_name)
        return r

    # Generic

    def _try_get_head_generic(self, snapshot):
        # Works on 'deposit', 'svn', and 'pypi'.
        # Guard first: without it, a non-dict snapshot (eg. None) left
        # `branches` unbound and the lookup below raised NameError
        # instead of returning None.
        if not isinstance(snapshot, dict):
            return None
        try:
            branches = snapshot['branches']
        except KeyError:
            return None
        return (
            self._try_resolve_target(branches, b'HEAD') or
            self._try_resolve_target(branches, b'master')
        )

    def _try_resolve_target(self, branches, target_name):
        """Follow alias branches from `target_name` until a concrete
        target is reached; return its id if it is a revision, else None.

        NOTE(review): a cycle of aliases would loop forever here —
        assumed not to occur in practice; confirm with snapshot model.
        """
        try:
            target = branches[target_name]
            while target['target_type'] == 'alias':
                target = branches[target['target']]
            if target['target_type'] == 'revision':
                return target['target']
            elif target['target_type'] == 'content':
                return None  # TODO
            elif target['target_type'] == 'directory':
                return None  # TODO
            elif target['target_type'] == 'release':
                return None  # TODO
            else:
                assert False
        except KeyError:
            return None
+
+
@click.command()
@click.option('--origins', '-i',
              help='Origins to lookup, in the "type+url" format',
              multiple=True)
def main(origins):
    # Resolve the head revision of each requested origin; results are
    # not persisted (see OriginHeadIndexer.persist_index_computations).
    indexer = OriginHeadIndexer()
    indexer.run(origins, 'update-dups', parse_ids=True)


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    main()
diff --git a/swh/indexer/tests/test_origin_head.py b/swh/indexer/tests/test_origin_head.py
new file mode 100644
--- /dev/null
+++ b/swh/indexer/tests/test_origin_head.py
@@ -0,0 +1,213 @@
+# Copyright (C) 2018 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import unittest
+import logging
+from nose.tools import istest
+
+from swh.indexer.origin_head import OriginHeadIndexer
+from swh.indexer.tests.test_utils import MockIndexerStorage
+
# Origin fixtures served by MockStorage.origin_get; each 'id' keys into
# the SNAPSHOTS fixture used by MockStorage.snapshot_get_latest.
ORIGINS = [
    {
        'id': 52189575,
        'lister': None,
        'project': None,
        'type': 'git',
        'url': 'https://github.com/SoftwareHeritage/swh-storage'},
    {
        'id': 4423668,
        'lister': None,
        'project': None,
        'type': 'ftp',
        'url': 'rsync://ftp.gnu.org/gnu/3dldf'},
    {
        'id': 77775770,
        'lister': None,
        'project': None,
        'type': 'deposit',
        'url': 'https://forge.softwareheritage.org/source/jesuisgpl/'},
    {
        'id': 85072327,
        'lister': None,
        'project': None,
        'type': 'pypi',
        'url': 'https://pypi.org/project/limnoria/'},
    {
        'id': 49908349,
        'lister': None,
        'project': None,
        'type': 'svn',
        'url': 'http://0-512-md.googlecode.com/svn/'},
    ]
+
# Snapshot fixtures keyed by origin id (see ORIGINS), served by
# MockStorage.snapshot_get_latest.
SNAPSHOTS = {
    52189575: {
        'branches': {
            b'refs/heads/add-revision-origin-cache': {
                'target': b'L[\xce\x1c\x88\x8eF\t\xf1"\x19\x1e\xfb\xc0'
                          b's\xe7/\xe9l\x1e',
                'target_type': 'revision'},
            b'HEAD': {
                'target': b'8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}'
                          b'\xac\xefrm',
                'target_type': 'revision'},
            b'refs/tags/v0.0.103': {
                'target': b'\xb6"Im{\xfdLb\xb0\x94N\xea\x96m\x13x\x88+'
                          b'\x0f\xdd',
                'target_type': 'release'},
        }},
    4423668: {
        'branches': {
            b'3DLDF-1.1.4.tar.gz': {
                'target': b'dJ\xfb\x1c\x91\xf4\x82B%]6\xa2\x90|\xd3\xfc'
                          b'"G\x99\x11',
                'target_type': 'revision'},
            b'3DLDF-2.0.2.tar.gz': {
                'target': b'\xb6\x0e\xe7\x9e9\xac\xaa\x19\x9e='
                          b'\xd1\xc5\x00\\\xc6\xfc\xe0\xa6\xb4V',
                'target_type': 'revision'},
            b'3DLDF-2.0.3-examples.tar.gz': {
                'target': b'!H\x19\xc0\xee\x82-\x12F1\xbd\x97'
                          b'\xfe\xadZ\x80\x80\xc1\x83\xff',
                'target_type': 'revision'},
            b'3DLDF-2.0.3.tar.gz': {
                'target': b'\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee'
                          b'\xcc\x1a\xb4`\x8c\x8by',
                'target_type': 'revision'},
            b'3DLDF-2.0.tar.gz': {
                'target': b'F6*\xff(?\x19a\xef\xb6\xc2\x1fv$S\xe3G'
                          b'\xd3\xd1m',
                # Fixed: this key was the bytes literal b'target_type',
                # unlike every other branch entry which uses the str key.
                'target_type': 'revision'}
        }},
    77775770: {
        'branches': {
            b'master': {
                'target': b'\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{'
                          b'\xa6\xe9\x99\xb1\x9e]q\xeb',
                'target_type': 'revision'}
        },
        'id': b"h\xc0\xd2a\x04\xd4~'\x8d\xd6\xbe\x07\xeda\xfa\xfbV"
              b"\x1d\r "},
    85072327: {
        'branches': {
            b'HEAD': {
                'target': b'releases/2018.09.09',
                'target_type': 'alias'},
            b'releases/2018.09.01': {
                'target': b'<\xee1(\xe8\x8d_\xc1\xc9\xa6rT\xf1\x1d'
                          b'\xbb\xdfF\xfdw\xcf',
                'target_type': 'revision'},
            b'releases/2018.09.09': {
                'target': b'\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k'
                          b'A\x10\x9d\xc5\xfa2\xf8t',
                'target_type': 'revision'}},
        'id': b'{\xda\x8e\x84\x7fX\xff\x92\x80^\x93V\x18\xa3\xfay'
              b'\x12\x9e\xd6\xb3'},
    49908349: {
        'branches': {
            b'master': {
                'target': b'\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8'
                          b'\xc9\xad#.\x1bw=\x18',
                'target_type': 'revision'}},
        'id': b'\xa1\xa2\x8c\n\xb3\x87\xa8\xf9\xe0a\x8c\xb7'
              b'\x05\xea\xb8\x1f\xc4H\xf4s'},
    }
+
+
class MockStorage:
    """Storage stub backed by the module-level ORIGINS / SNAPSHOTS
    fixtures; fails loudly (assert) on any lookup the fixtures do not
    cover."""
    def origin_get(self, id_):
        # Match on the (type, url) pair; the tests never look up by id.
        wanted = (id_['type'], id_['url'])
        for origin in ORIGINS:
            if (origin['type'], origin['url']) == wanted:
                return origin
        assert False, id_

    def snapshot_get_latest(self, origin_id):
        assert origin_id in SNAPSHOTS, origin_id
        return SNAPSHOTS[origin_id]
+
+
class TestOriginHeadIndexer(OriginHeadIndexer):
    """Specific indexer whose configuration is enough to satisfy the
    indexing tests.
    """
    def prepare(self):
        # Hand-rolled configuration: no config file, mocked storages.
        tool_config = {
            'name': 'origin-metadata',
            'version': '0.0.1',
            'configuration': {},
        }
        self.config = {'tools': tool_config}
        self.storage = MockStorage()
        self.idx_storage = MockIndexerStorage()
        self.log = logging.getLogger('swh.indexer')
        self.objstorage = None
        self.tools = self.register_tools(tool_config)
        self.tool = self.tools[0]
        self.results = None

    def persist_index_computations(self, results, policy_update):
        # Capture results for the assertions; policy_update is ignored.
        self.results = results
+
+
class OriginHead(unittest.TestCase):
    """Run the head indexer on one origin of each type and compare the
    persisted results against known revision ids."""

    def _assert_head(self, origin_str, expected):
        # Shared driver: index a single 'type+url' origin and check the
        # single captured result.
        indexer = TestOriginHeadIndexer()
        indexer.run([origin_str], 'update-dups', parse_ids=True)
        self.assertEqual(indexer.results, [expected])

    @istest
    def test_git(self):
        self._assert_head(
            'git+https://github.com/SoftwareHeritage/swh-storage',
            {'revision_id': b'8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{'
                            b'\xd7}\xac\xefrm',
             'origin_id': 52189575})

    @istest
    def test_ftp(self):
        self._assert_head(
            'ftp+rsync://ftp.gnu.org/gnu/3dldf',
            {'revision_id': b'\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee'
                            b'\xcc\x1a\xb4`\x8c\x8by',
             'origin_id': 4423668})

    @istest
    def test_deposit(self):
        self._assert_head(
            'deposit+https://forge.softwareheritage.org/source/'
            'jesuisgpl/',
            {'revision_id': b'\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{'
                            b'\xa6\xe9\x99\xb1\x9e]q\xeb',
             'origin_id': 77775770})

    @istest
    def test_pypi(self):
        self._assert_head(
            'pypi+https://pypi.org/project/limnoria/',
            {'revision_id': b'\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k'
                            b'A\x10\x9d\xc5\xfa2\xf8t',
             'origin_id': 85072327})

    @istest
    def test_svn(self):
        self._assert_head(
            'svn+http://0-512-md.googlecode.com/svn/',
            {'revision_id': b'\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8'
                            b'\xc9\xad#.\x1bw=\x18',
             'origin_id': 49908349})
diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py
--- a/swh/indexer/tests/test_utils.py
+++ b/swh/indexer/tests/test_utils.py
@@ -142,6 +142,15 @@
'context': 'npm'
},
}]
+ elif tool['tool_name'] == 'origin-metadata':
+ return [{
+ 'id': 8,
+ 'tool_name': 'origin-metadata',
+ 'tool_version': '0.0.1',
+ 'tool_configuration': {},
+ }]
+ else:
+ assert False, 'Unknown tool {tool_name}'.format(**tool)
def content_metadata_missing(self, sha1s):
yield from []

File Metadata

Mime Type
text/plain
Expires
Mon, Apr 14, 3:32 AM (14 h, 40 m ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3218844

Event Timeline