Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7437734
D490.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
18 KB
Subscribers
None
D490.diff
View Options
diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -79,15 +79,15 @@
storage.
To implement a new object type indexer, inherit from the
- BaseIndexer and implement the process of indexation:
+ BaseIndexer and implement indexing:
:func:`run`:
object_ids are different depending on object. For example: sha1 for
content, sha1_git for revision, directory, release, and id for origin
To implement a new concrete indexer, inherit from the object level
- classes: :class:`ContentIndexer`, :class:`RevisionIndexer` (later
- on :class:`OriginIndexer` will also be available)
+ classes: :class:`ContentIndexer`, :class:`RevisionIndexer`,
+ :class:`OriginIndexer`.
Then you need to implement the following functions:
@@ -337,8 +337,7 @@
class ContentIndexer(BaseIndexer):
"""An object type indexer, inherits from the :class:`BaseIndexer` and
- implements the process of indexation for Contents using the run
- method
+ implements Content indexing using the run method
Note: the :class:`ContentIndexer` is not an instantiable
object. To use it in another context, one should inherit from this
@@ -384,10 +383,68 @@
self.rescheduling_task.delay(ids, policy_update)
+class OriginIndexer(BaseIndexer):
+ """An object type indexer, inherits from the :class:`BaseIndexer` and
+ implements Origin indexing using the run method
+
+ Note: the :class:`OriginIndexer` is not an instantiable object.
+ To use it in another context one should inherit from this class
+ and override the methods mentioned in the :class:`BaseIndexer`
+ class.
+
+ """
+ def run(self, ids, policy_update, parse_ids=False):
+ """Given a list of origin ids:
+
+ - retrieve origins from storage
+ - execute the indexing computations
+ - store the results (according to policy_update)
+
+ Args:
+ ids ([Union[int, Tuple[str, bytes]]]): list of origin ids or
+ (type, url) tuples.
+ policy_update ([str]): either 'update-dups' or 'ignore-dups' to
+ respectively update duplicates or ignore
+ them
+            parse_ids ([bool]): If `True`, will try to convert `ids`
+ from a human input to the valid type.
+
+ """
+ if parse_ids:
+ ids = [
+                o.split('+', 1) if '+' in o else int(o)  # type+url or id
+ for o in ids]
+
+ results = []
+
+ for id_ in ids:
+ if isinstance(id_, (tuple, list)):
+ if len(id_) != 2:
+ raise TypeError('Expected a (type, url) tuple.')
+ (type_, url) = id_
+ params = {'type': type_, 'url': url}
+ elif isinstance(id_, int):
+ params = {'id': id_}
+ else:
+ raise TypeError('Invalid value for "ids": %r' % id_)
+ origin = self.storage.origin_get(params)
+ if not origin:
+                self.log.warning('Origin %s not found in storage' %
+                                 (id_,))
+ continue
+ try:
+ res = self.index(origin)
+                if res:  # If no results, skip it
+ results.append(res)
+ except Exception:
+ self.log.exception(
+ 'Problem when processing origin %s' % id_)
+ self.persist_index_computations(results, policy_update)
+
+
class RevisionIndexer(BaseIndexer):
"""An object type indexer, inherits from the :class:`BaseIndexer` and
- implements the process of indexation for Revisions using the run
- method
+ implements Revision indexing using the run method
Note: the :class:`RevisionIndexer` is not an instantiable object.
To use it in another context one should inherit from this class
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -268,9 +268,6 @@
@click.command()
@click.option('--revs', '-i',
- default=['8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f',
- '026040ea79dec1b49b4e3e7beda9132b6b26b51b',
- '9699072e21eded4be8d45e3b8d543952533fa190'],
help='Default sha1_git to lookup', multiple=True)
def main(revs):
_git_sha1s = list(map(hashutil.hash_to_bytes, revs))
diff --git a/swh/indexer/origin_head.py b/swh/indexer/origin_head.py
new file mode 100644
--- /dev/null
+++ b/swh/indexer/origin_head.py
@@ -0,0 +1,163 @@
+# Copyright (C) 2018 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import re
+import click
+import logging
+
+from swh.indexer.indexer import OriginIndexer
+
+
+class OriginHeadIndexer(OriginIndexer):
+ """Origin-level indexer.
+
+ This indexer is in charge of looking up the revision that acts as the
+ "head" of an origin.
+
+ In git, this is usually the commit pointed to by the 'master' branch."""
+
+ ADDITIONAL_CONFIG = {
+ 'tools': ('dict', {
+ 'name': 'origin-metadata',
+ 'version': '0.0.1',
+ 'configuration': {},
+ }),
+ }
+
+ def filter(self, ids):
+ yield from ids
+
+ def persist_index_computations(self, results, policy_update):
+        """Do nothing. The indexer's results are not persistent, they
+ should only be piped to another indexer via the orchestrator."""
+ pass
+
+ # Dispatch
+
+ def index(self, origin):
+ origin_id = origin['id']
+ latest_snapshot = self.storage.snapshot_get_latest(origin_id)
+ method = getattr(self, '_try_get_%s_head' % origin['type'], None)
+ if method is None:
+ method = self._try_get_head_generic
+ rev_id = method(latest_snapshot)
+ if rev_id is None:
+ return None
+ result = {
+ 'origin_id': origin_id,
+ 'revision_id': rev_id,
+ }
+ return result
+
+ # VCSs
+
+ def _try_get_vcs_head(self, snapshot):
+ try:
+ if isinstance(snapshot, dict):
+ branches = snapshot['branches']
+ if branches[b'HEAD']['target_type'] == 'revision':
+ return branches[b'HEAD']['target']
+ except KeyError:
+ return None
+
+ _try_get_hg_head = _try_get_git_head = _try_get_vcs_head
+
+ # Tarballs
+
+ _archive_filename_re = re.compile(
+ b'^'
+ b'(?P<pkgname>.*)[-_]'
+        rb'(?P<version>[0-9]+(\.[0-9])*)'
+ b'(?P<preversion>[-+][a-zA-Z0-9.~]+?)?'
+        rb'(?P<extension>(\.[a-zA-Z0-9]+)+)'
+ b'$')
+
+ @classmethod
+ def _parse_version(cls, filename):
+ """Extracts the release version from an archive filename,
+ to get an ordering whose maximum is likely to be the last
+ version of the software
+
+ >>> OriginHeadIndexer._parse_version(b'foo')
+ (-inf,)
+ >>> OriginHeadIndexer._parse_version(b'foo.tar.gz')
+ (-inf,)
+ >>> OriginHeadIndexer._parse_version(b'gnu-hello-0.0.1.tar.gz')
+ (0, 0, 1, 0)
+ >>> OriginHeadIndexer._parse_version(b'gnu-hello-0.0.1-beta2.tar.gz')
+ (0, 0, 1, -1, 'beta2')
+ >>> OriginHeadIndexer._parse_version(b'gnu-hello-0.0.1+foobar.tar.gz')
+ (0, 0, 1, 1, 'foobar')
+ """
+ res = cls._archive_filename_re.match(filename)
+ if res is None:
+ return (float('-infinity'),)
+ version = [int(n) for n in res.group('version').decode().split('.')]
+ if res.group('preversion') is None:
+ version.append(0)
+ else:
+ preversion = res.group('preversion').decode()
+ if preversion.startswith('-'):
+ version.append(-1)
+ version.append(preversion[1:])
+ elif preversion.startswith('+'):
+ version.append(1)
+ version.append(preversion[1:])
+ else:
+ assert False, res.group('preversion')
+ return tuple(version)
+
+ def _try_get_ftp_head(self, snapshot):
+ archive_names = list(snapshot['branches'])
+ max_archive_name = max(archive_names, key=self._parse_version)
+ r = self._try_resolve_target(snapshot['branches'], max_archive_name)
+ return r
+
+ # Generic
+
+ def _try_get_head_generic(self, snapshot):
+ # Works on 'deposit', 'svn', and 'pypi'.
+        try:
+            # Non-dict snapshots (e.g. None) have no head revision.
+            branches = snapshot['branches']
+        except (KeyError, TypeError):
+            return None
+ else:
+ return (
+ self._try_resolve_target(branches, b'HEAD') or
+ self._try_resolve_target(branches, b'master')
+ )
+
+ def _try_resolve_target(self, branches, target_name):
+ try:
+ target = branches[target_name]
+ while target['target_type'] == 'alias':
+ target = branches[target['target']]
+ if target['target_type'] == 'revision':
+ return target['target']
+ elif target['target_type'] == 'content':
+ return None # TODO
+ elif target['target_type'] == 'directory':
+ return None # TODO
+ elif target['target_type'] == 'release':
+ return None # TODO
+ else:
+ assert False
+ except KeyError:
+ return None
+
+
+@click.command()
+@click.option('--origins', '-i',
+ help='Origins to lookup, in the "type+url" format',
+ multiple=True)
+def main(origins):
+ rev_metadata_indexer = OriginHeadIndexer()
+ rev_metadata_indexer.run(origins, 'update-dups', parse_ids=True)
+
+
+if __name__ == '__main__':
+ logging.basicConfig(level=logging.INFO)
+ main()
diff --git a/swh/indexer/tests/test_origin_head.py b/swh/indexer/tests/test_origin_head.py
new file mode 100644
--- /dev/null
+++ b/swh/indexer/tests/test_origin_head.py
@@ -0,0 +1,213 @@
+# Copyright (C) 2017 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import unittest
+import logging
+from nose.tools import istest
+
+from swh.indexer.origin_head import OriginHeadIndexer
+from swh.indexer.tests.test_utils import MockIndexerStorage
+
+ORIGINS = [
+ {
+ 'id': 52189575,
+ 'lister': None,
+ 'project': None,
+ 'type': 'git',
+ 'url': 'https://github.com/SoftwareHeritage/swh-storage'},
+ {
+ 'id': 4423668,
+ 'lister': None,
+ 'project': None,
+ 'type': 'ftp',
+ 'url': 'rsync://ftp.gnu.org/gnu/3dldf'},
+ {
+ 'id': 77775770,
+ 'lister': None,
+ 'project': None,
+ 'type': 'deposit',
+ 'url': 'https://forge.softwareheritage.org/source/jesuisgpl/'},
+ {
+ 'id': 85072327,
+ 'lister': None,
+ 'project': None,
+ 'type': 'pypi',
+ 'url': 'https://pypi.org/project/limnoria/'},
+ {
+ 'id': 49908349,
+ 'lister': None,
+ 'project': None,
+ 'type': 'svn',
+ 'url': 'http://0-512-md.googlecode.com/svn/'},
+ ]
+
+SNAPSHOTS = {
+ 52189575: {
+ 'branches': {
+ b'refs/heads/add-revision-origin-cache': {
+ 'target': b'L[\xce\x1c\x88\x8eF\t\xf1"\x19\x1e\xfb\xc0'
+ b's\xe7/\xe9l\x1e',
+ 'target_type': 'revision'},
+ b'HEAD': {
+ 'target': b'8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}'
+ b'\xac\xefrm',
+ 'target_type': 'revision'},
+ b'refs/tags/v0.0.103': {
+ 'target': b'\xb6"Im{\xfdLb\xb0\x94N\xea\x96m\x13x\x88+'
+ b'\x0f\xdd',
+ 'target_type': 'release'},
+ }},
+ 4423668: {
+ 'branches': {
+ b'3DLDF-1.1.4.tar.gz': {
+ 'target': b'dJ\xfb\x1c\x91\xf4\x82B%]6\xa2\x90|\xd3\xfc'
+ b'"G\x99\x11',
+ 'target_type': 'revision'},
+ b'3DLDF-2.0.2.tar.gz': {
+ 'target': b'\xb6\x0e\xe7\x9e9\xac\xaa\x19\x9e='
+ b'\xd1\xc5\x00\\\xc6\xfc\xe0\xa6\xb4V',
+ 'target_type': 'revision'},
+ b'3DLDF-2.0.3-examples.tar.gz': {
+ 'target': b'!H\x19\xc0\xee\x82-\x12F1\xbd\x97'
+ b'\xfe\xadZ\x80\x80\xc1\x83\xff',
+ 'target_type': 'revision'},
+ b'3DLDF-2.0.3.tar.gz': {
+ 'target': b'\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee'
+ b'\xcc\x1a\xb4`\x8c\x8by',
+ 'target_type': 'revision'},
+ b'3DLDF-2.0.tar.gz': {
+ 'target': b'F6*\xff(?\x19a\xef\xb6\xc2\x1fv$S\xe3G'
+ b'\xd3\xd1m',
+                    'target_type': 'revision'}
+ }},
+ 77775770: {
+ 'branches': {
+ b'master': {
+ 'target': b'\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{'
+ b'\xa6\xe9\x99\xb1\x9e]q\xeb',
+ 'target_type': 'revision'}
+ },
+ 'id': b"h\xc0\xd2a\x04\xd4~'\x8d\xd6\xbe\x07\xeda\xfa\xfbV"
+ b"\x1d\r "},
+ 85072327: {
+ 'branches': {
+ b'HEAD': {
+ 'target': b'releases/2018.09.09',
+ 'target_type': 'alias'},
+ b'releases/2018.09.01': {
+ 'target': b'<\xee1(\xe8\x8d_\xc1\xc9\xa6rT\xf1\x1d'
+ b'\xbb\xdfF\xfdw\xcf',
+ 'target_type': 'revision'},
+ b'releases/2018.09.09': {
+ 'target': b'\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k'
+ b'A\x10\x9d\xc5\xfa2\xf8t',
+ 'target_type': 'revision'}},
+ 'id': b'{\xda\x8e\x84\x7fX\xff\x92\x80^\x93V\x18\xa3\xfay'
+ b'\x12\x9e\xd6\xb3'},
+ 49908349: {
+ 'branches': {
+ b'master': {
+ 'target': b'\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8'
+ b'\xc9\xad#.\x1bw=\x18',
+ 'target_type': 'revision'}},
+ 'id': b'\xa1\xa2\x8c\n\xb3\x87\xa8\xf9\xe0a\x8c\xb7'
+ b'\x05\xea\xb8\x1f\xc4H\xf4s'},
+ }
+
+
+class MockStorage:
+ def origin_get(self, id_):
+ for origin in ORIGINS:
+ if origin['type'] == id_['type'] and origin['url'] == id_['url']:
+ return origin
+ assert False, id_
+
+ def snapshot_get_latest(self, origin_id):
+ if origin_id in SNAPSHOTS:
+ return SNAPSHOTS[origin_id]
+ else:
+ assert False, origin_id
+
+
+class TestOriginHeadIndexer(OriginHeadIndexer):
+ """Specific indexer whose configuration is enough to satisfy the
+ indexing tests.
+ """
+ def prepare(self):
+ self.config = {
+ 'tools': {
+ 'name': 'origin-metadata',
+ 'version': '0.0.1',
+ 'configuration': {},
+ },
+ }
+ self.storage = MockStorage()
+ self.idx_storage = MockIndexerStorage()
+ self.log = logging.getLogger('swh.indexer')
+ self.objstorage = None
+ self.tools = self.register_tools(self.config['tools'])
+ self.tool = self.tools[0]
+ self.results = None
+
+ def persist_index_computations(self, results, policy_update):
+ self.results = results
+
+
+class OriginHead(unittest.TestCase):
+ @istest
+ def test_git(self):
+ indexer = TestOriginHeadIndexer()
+ indexer.run(
+ ['git+https://github.com/SoftwareHeritage/swh-storage'],
+ 'update-dups', parse_ids=True)
+ self.assertEqual(indexer.results, [{
+ 'revision_id': b'8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{'
+ b'\xd7}\xac\xefrm',
+ 'origin_id': 52189575}])
+
+ @istest
+ def test_ftp(self):
+ indexer = TestOriginHeadIndexer()
+ indexer.run(
+ ['ftp+rsync://ftp.gnu.org/gnu/3dldf'],
+ 'update-dups', parse_ids=True)
+ self.assertEqual(indexer.results, [{
+ 'revision_id': b'\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee'
+ b'\xcc\x1a\xb4`\x8c\x8by',
+ 'origin_id': 4423668}])
+
+ @istest
+ def test_deposit(self):
+ indexer = TestOriginHeadIndexer()
+ indexer.run(
+ ['deposit+https://forge.softwareheritage.org/source/'
+ 'jesuisgpl/'],
+ 'update-dups', parse_ids=True)
+ self.assertEqual(indexer.results, [{
+ 'revision_id': b'\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{'
+ b'\xa6\xe9\x99\xb1\x9e]q\xeb',
+ 'origin_id': 77775770}])
+
+ @istest
+ def test_pypi(self):
+ indexer = TestOriginHeadIndexer()
+ indexer.run(
+ ['pypi+https://pypi.org/project/limnoria/'],
+ 'update-dups', parse_ids=True)
+ self.assertEqual(indexer.results, [{
+ 'revision_id': b'\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k'
+ b'A\x10\x9d\xc5\xfa2\xf8t',
+ 'origin_id': 85072327}])
+
+ @istest
+ def test_svn(self):
+ indexer = TestOriginHeadIndexer()
+ indexer.run(
+ ['svn+http://0-512-md.googlecode.com/svn/'],
+ 'update-dups', parse_ids=True)
+ self.assertEqual(indexer.results, [{
+ 'revision_id': b'\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8'
+ b'\xc9\xad#.\x1bw=\x18',
+ 'origin_id': 49908349}])
diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py
--- a/swh/indexer/tests/test_utils.py
+++ b/swh/indexer/tests/test_utils.py
@@ -142,6 +142,15 @@
'context': 'npm'
},
}]
+ elif tool['tool_name'] == 'origin-metadata':
+ return [{
+ 'id': 8,
+ 'tool_name': 'origin-metadata',
+ 'tool_version': '0.0.1',
+ 'tool_configuration': {},
+ }]
+ else:
+ assert False, 'Unknown tool {tool_name}'.format(**tool)
def content_metadata_missing(self, sha1s):
yield from []
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Mon, Apr 14, 3:32 AM (14 h, 40 m ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3218844
Attached To
D490: Add OriginIndexer + OriginHeadIndexer.
Event Timeline
Log In to Comment