Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/origin_head.py
# Copyright (C) 2018 The Software Heritage developers | # Copyright (C) 2018 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import re | import re | ||||
import click | import click | ||||
import logging | import logging | ||||
from swh.scheduler import get_scheduler | |||||
from swh.scheduler.utils import create_task_dict | |||||
from swh.indexer.indexer import OriginIndexer | from swh.indexer.indexer import OriginIndexer | ||||
from swh.model.hashutil import hash_to_hex | |||||
class OriginHeadIndexer(OriginIndexer): | class OriginHeadIndexer(OriginIndexer): | ||||
"""Origin-level indexer. | """Origin-level indexer. | ||||
This indexer is in charge of looking up the revision that acts as the | This indexer is in charge of looking up the revision that acts as the | ||||
"head" of an origin. | "head" of an origin. | ||||
In git, this is usually the commit pointed to by the 'master' branch.""" | In git, this is usually the commit pointed to by the 'master' branch.""" | ||||
ADDITIONAL_CONFIG = { | ADDITIONAL_CONFIG = { | ||||
'tools': ('dict', { | 'tools': ('dict', { | ||||
'name': 'origin-metadata', | 'name': 'origin-metadata', | ||||
'version': '0.0.1', | 'version': '0.0.1', | ||||
'configuration': {}, | 'configuration': {}, | ||||
}), | }), | ||||
'tasks': ('dict', { | |||||
'revision_metadata': 'revision_metadata', | |||||
'origin_intrinsic_metadata': 'origin_metadata', | |||||
}) | |||||
} | } | ||||
CONFIG_BASE_FILENAME = 'indexer/origin_head' | CONFIG_BASE_FILENAME = 'indexer/origin_head' | ||||
def persist_index_computations(self, results, policy_update): | def persist_index_computations(self, results, policy_update): | ||||
"""Do nothing. The indexer's results are not persistent, they | """Do nothing. The indexer's results are not persistent, they | ||||
should only be piped to another indexer.""" | should only be piped to another indexer.""" | ||||
pass | pass | ||||
def next_step(self, results, task): | |||||
"""Once the head is found, call the RevisionMetadataIndexer | |||||
on these revisions, then call the OriginMetadataIndexer with | |||||
both the origin_id and the revision metadata, so it can copy the | |||||
revision metadata to the origin's metadata. | |||||
Args: | |||||
results (Iterable[dict]): Iterable of return values from `index`. | |||||
""" | |||||
super().next_step(results, task) | |||||
revision_metadata_task = self.config['tasks']['revision_metadata'] | |||||
origin_intrinsic_metadata_task = self.config['tasks'][ | |||||
'origin_intrinsic_metadata'] | |||||
if revision_metadata_task is None and \ | |||||
origin_intrinsic_metadata_task is None: | |||||
return | |||||
assert revision_metadata_task is not None | |||||
assert origin_intrinsic_metadata_task is not None | |||||
# Second task to run after this one: copy the revision's metadata | |||||
# to the origin | |||||
sub_task = create_task_dict( | |||||
origin_intrinsic_metadata_task, | |||||
'oneshot', | |||||
origin_head={ | |||||
str(result['origin_id']): | |||||
hash_to_hex(result['revision_id']) | |||||
for result in results}, | |||||
policy_update='update-dups', | |||||
) | |||||
del sub_task['next_run'] # Not json-serializable | |||||
# First task to run after this one: index the metadata of the | |||||
# revision | |||||
task = create_task_dict( | |||||
revision_metadata_task, | |||||
'oneshot', | |||||
ids=[hash_to_hex(res['revision_id']) for res in results], | |||||
policy_update='update-dups', | |||||
next_step=sub_task, | |||||
) | |||||
if getattr(self, 'scheduler', None): | |||||
scheduler = self.scheduler | |||||
else: | |||||
scheduler = get_scheduler(**self.config['scheduler']) | |||||
scheduler.create_tasks([task]) | |||||
# Dispatch | # Dispatch | ||||
def index(self, origin): | def index(self, origin): | ||||
origin_id = origin['id'] | origin_id = origin['id'] | ||||
latest_snapshot = self.storage.snapshot_get_latest(origin_id) | latest_snapshot = self.storage.snapshot_get_latest(origin_id) | ||||
method = getattr(self, '_try_get_%s_head' % origin['type'], None) | method = getattr(self, '_try_get_%s_head' % origin['type'], None) | ||||
if method is None: | if method is None: | ||||
method = self._try_get_head_generic | method = self._try_get_head_generic | ||||
▲ Show 20 Lines • Show All 119 Lines • Show Last 20 Lines |