Changeset View
Standalone View
swh/indexer/origin_head.py
- This file was added.
# Copyright (C) 2018 The Software Heritage developers | |||||
# See the AUTHORS file at the top-level directory of this distribution | |||||
# License: GNU General Public License version 3, or any later version | |||||
# See top-level LICENSE file for more information | |||||
import re | |||||
import click | |||||
import logging | |||||
from swh.indexer.indexer import OriginIndexer | |||||
class OriginHeadIndexer(OriginIndexer): | |||||
"""Origin-level indexer. | |||||
This indexer is in charge of looking up the revision that acts as the | |||||
"head" of an origin. | |||||
In git, this is usually the commit pointed to by the 'master' branch.""" | |||||
ADDITIONAL_CONFIG = { | |||||
'storage': ('dict', { | |||||
'cls': 'local', | |||||
zack: As this is going to be persisted in the DB, it's time to define some decent naming for the… | |||||
'args': { | |||||
'db': 'service=swh-replica', | |||||
'objstorage': OriginIndexer.DEFAULT_CONFIG['objstorage'][1], | |||||
} | |||||
Done Inline ActionsThe default configuration should be targeting a remote storage [1], defaulting to a local one as per [2] for example. [1] D372#inline-1828 [2] https://docs.softwareheritage.org/devel/getting-started.html#step-4-ingest-repositories ardumont: The default configuration should be targeting a remote storage [1], defaulting to a local one… | |||||
Done Inline ActionsSorry, I didn't mean to commit that. vlorentz: Sorry, I didn't mean to commit that. | |||||
}), | |||||
'tools': ('dict', { | |||||
'name': 'swh-head-revision', | |||||
'version': '0.0.1', | |||||
'configuration': {}, | |||||
}), | |||||
} | |||||
def filter(self, ids): | |||||
yield from ids | |||||
def persist_index_computations(self, results, policy_update): | |||||
self.idx_storage.origin_head_add( | |||||
results, | |||||
conflict_update=(policy_update == 'update-dups')) | |||||
# Dispatch | |||||
Done Inline Actionsif you are keeping prints for testing, don't mind this comment. moranegg: if you are keeping prints for testing, don't mind this comment.
If not, don't forget erasing… | |||||
def index(self, origin): | |||||
Not Done Inline Actionsi would rename method or comment with an explicit comment saying but i'm being picky.. moranegg: i would rename method or comment with an explicit comment saying
that each origin type requires… | |||||
origin_id = origin['id'] | |||||
latest_snapshot = self.storage.snapshot_get_latest(origin_id) | |||||
method = getattr(self, '_try_get_%s_head' % origin['type'], None) | |||||
if method is None: | |||||
method = self._try_get_head_generic | |||||
rev_id = method(latest_snapshot) | |||||
if rev_id is None: | |||||
return None | |||||
result = { | |||||
'origin_id': origin_id, | |||||
Not Done Inline Actionsas per T1257 this is no longer git-specific, so this should be _try_get_head zack: as per T1257 this is no longer git-specific, so this should be `_try_get_head` | |||||
'revision_id': rev_id, | |||||
Not Done Inline ActionsHere the result is without the metadata and will be kept in the DB as a pointer to the last seen revision. It seems a good compartmentalization. moranegg: Here the result is without the metadata and will be kept in the DB as a pointer to the last… | |||||
Done Inline Actions+1 for using another component vlorentz: +1 for using another component | |||||
Done Inline ActionsNow that I think about it, it may affect the search between the time the OriginHead indexer runs and updates the reference, and the time the revision metadata is copied. We should probably discuss this further, outside a Diff inline comments vlorentz: Now that I think about it, it may affect the search between the time the OriginHead indexer… | |||||
Not Done Inline ActionsI agree. I'm voting to not copy the metadata and keep only the reference (which might change quickly) moranegg: I agree.
I'm voting to not copy the metadata and keep only the reference (which might change… | |||||
} | |||||
return result | |||||
# VCSs | |||||
def _try_get_git_head(self, snapshot): | |||||
try: | |||||
if isinstance(snapshot, dict): | |||||
branches = snapshot['branches'] | |||||
Not Done Inline Actions… for the same reason this should now be removed zack: … for the same reason this should now be removed | |||||
if branches[b'HEAD']['target_type'] == 'revision': | |||||
return branches[b'HEAD']['target'] | |||||
except KeyError: | |||||
return None | |||||
Not Done Inline ActionsI'm pretty sure this regex should not be here. It's a naming convention that, I guess, is also implemented by the tarball loader, right? zack: I'm pretty sure this regex should not be here. It's a naming convention that, I guess, is also… | |||||
Done Inline ActionsAs far as I understand, the tarball loader does not try to parse the file name; it only downloads a tarball from an URL, extracts it, and forwards it to the dir loader (which doesn't seem to parse names either). vlorentz: As far as I understand, the tarball loader does not try to parse the file name; it only… | |||||
Not Done Inline ActionsFair enough. Thanks for checking. zack: Fair enough. Thanks for checking. | |||||
def _try_get_hg_head(self, snapshot): | |||||
pass # TODO, see https://forge.softwareheritage.org/T1189 | |||||
Not Done Inline ActionsFor the snapshot, possible target_type per loader:
ardumont: For the snapshot, possible target_type per loader:
- mercurial: release, revision [1]
- pypi… | |||||
Done Inline ActionsWhat would the name of the "head" "branch" in mercurial be? "tip"? vlorentz: What would the name of the "head" "branch" in mercurial be? "tip"? | |||||
Not Done Inline Actionsdefault is the head branch. ardumont: `default` is the head branch.
`tip` is what they use to define it, i think. | |||||
# Tarballs | |||||
_archive_filename_re = re.compile( | |||||
b'^' | |||||
b'(?P<pkgname>.*)[-_]' | |||||
Not Done Inline Actionssame for this method. (And hence maybe we do not need to expose the regex, but rather a filename parsing method from the tarball loader?) zack: same for this method. (And hence maybe we do not need to expose the regex, but rather a… | |||||
b'(?P<version>[0-9]+(\.[0-9])*)' | |||||
b'(?P<preversion>[-+][a-zA-Z0-9.~]+?)?' | |||||
Not Done Inline ActionsI'm not a real python programmer, so this is not my place to say I think it's only me, so you can completely ignore this comment. moranegg: I'm not a real python programmer, so this is not my place to say
but the else after the except… | |||||
Done Inline ActionsIt's a block that is run if the try: block did not raise an exception. I can drop the else: if you find it less readable. vlorentz: It's a block that is run if the `try:` block did not raise an exception. I can drop the `else:`… | |||||
b'(?P<extension>(\.[a-zA-Z0-9]+)+)' | |||||
b'$') | |||||
@classmethod | |||||
def _parse_version(cls, filename): | |||||
"""Extracts the release version from an archive filename, | |||||
to get an ordering whose maximum is likely to be the last | |||||
version of the software | |||||
>>> OriginHeadIndexer._parse_version(b'foo') | |||||
Not Done Inline Actionsis 'alias' and 'revision' the only target_type possible? moranegg: is 'alias' and 'revision' the only target_type possible?
| |||||
Done Inline Actionsafaik, yes. vlorentz: afaik, yes. | |||||
Done Inline Actionspsql does not agree: softwareheritage=> select distinct target_type from snapshot_branch; target_type ------------- content release revision alias directory I'm not sure what to do with content and directory though. I'll create an issue later. vlorentz: psql does not agree:
```
softwareheritage=> select distinct target_type from snapshot_branch… | |||||
(-inf,) | |||||
>>> OriginHeadIndexer._parse_version(b'foo.tar.gz') | |||||
(-inf,) | |||||
>>> OriginHeadIndexer._parse_version(b'gnu-hello-0.0.1.tar.gz') | |||||
(0, 0, 1, 0) | |||||
>>> OriginHeadIndexer._parse_version(b'gnu-hello-0.0.1-beta2.tar.gz') | |||||
(0, 0, 1, -1, 'beta2') | |||||
>>> OriginHeadIndexer._parse_version(b'gnu-hello-0.0.1+foobar.tar.gz') | |||||
(0, 0, 1, 1, 'foobar') | |||||
""" | |||||
res = cls._archive_filename_re.match(filename) | |||||
if res is None: | |||||
return (float('-infinity'),) | |||||
version = [int(n) for n in res.group('version').decode().split('.')] | |||||
if res.group('preversion') is None: | |||||
version.append(0) | |||||
else: | |||||
preversion = res.group('preversion').decode() | |||||
if preversion.startswith('-'): | |||||
version.append(-1) | |||||
Done Inline ActionsWe shouldn't have a default here, as there is no sane default. If you want the list somewhere for ease of testing please put them in a REAME or use a local script. zack: We shouldn't have a default here, as there is no sane default.
If you want the list somewhere… | |||||
Done Inline ActionsI used a default list with the RevisionMetatadaIndexer to test the component outside the MockStorage moranegg: I used a default list with the RevisionMetatadaIndexer to test the component outside the… | |||||
Done Inline ActionsI'll turn these into tests. vlorentz: I'll turn these into tests. | |||||
version.append(preversion[1:]) | |||||
elif preversion.startswith('+'): | |||||
version.append(1) | |||||
version.append(preversion[1:]) | |||||
else: | |||||
assert False, res.group('preversion') | |||||
return tuple(version) | |||||
def _try_get_ftp_head(self, snapshot): | |||||
archive_names = list(snapshot['branches']) | |||||
max_archive_name = max(archive_names, key=self._parse_version) | |||||
r = self._try_resolve_target(snapshot['branches'], max_archive_name) | |||||
return r | |||||
# Generic | |||||
def _try_get_head_generic(self, snapshot): | |||||
# Works on 'deposit', 'svn', and 'pypi'. | |||||
try: | |||||
if isinstance(snapshot, dict): | |||||
branches = snapshot['branches'] | |||||
except KeyError: | |||||
return None | |||||
else: | |||||
return ( | |||||
self._try_resolve_target(branches, b'HEAD') or | |||||
self._try_resolve_target(branches, b'master') | |||||
) | |||||
def _try_resolve_target(self, branches, target_name): | |||||
try: | |||||
target = branches[target_name] | |||||
while target['target_type'] == 'alias': | |||||
target = branches[target['target']] | |||||
if target['target_type'] == 'revision': | |||||
return target['target'] | |||||
elif target['target_type'] == 'content': | |||||
return None # TODO | |||||
elif target['target_type'] == 'directory': | |||||
return None # TODO | |||||
elif target['target_type'] == 'release': | |||||
return None # TODO | |||||
else: | |||||
assert False | |||||
except KeyError: | |||||
return None | |||||
@click.command() | |||||
@click.option('--origins', '-i', | |||||
Not Done Inline ActionsI don't mind keeping the main method with the defaults here for a few iterations before deleting it. moranegg: I don't mind keeping the main method with the defaults here for a few iterations before… | |||||
help='Origins to lookup, in the "type+url" format', | |||||
multiple=True) | |||||
def main(origins): | |||||
rev_metadata_indexer = OriginHeadIndexer() | |||||
rev_metadata_indexer.run(origins, 'update-dups', parse_ids=True) | |||||
if __name__ == '__main__': | |||||
logging.basicConfig(level=logging.INFO) | |||||
main() |
As this is going to be persisted in the DB, it's time to define some decent naming for the indexer tools.
This is the "metadata indexer" so we should use it somewhere.
Also, we should go hierarchically: metadata → {origin,content,revision,etc.}
The fact we take the "head" revision is an implementation detail, it's just how the "origin metadata indexer" currently works.
"indexer" can remain implicit, as it's what all the tools do.
Bottom line: let's go for "metadata-origin".