Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/origin_head.py
# Copyright (C) 2018 The Software Heritage developers | # Copyright (C) 2018 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import re | import re | ||||
import click | import click | ||||
import logging | import logging | ||||
from celery import chain | |||||
from swh.indexer.indexer import OriginIndexer | from swh.indexer.indexer import OriginIndexer | ||||
ardumont: oopsy ;) | |||||
from swh.indexer.tasks import OriginMetadataTask, RevisionMetadataTask | |||||
class OriginHeadIndexer(OriginIndexer): | class OriginHeadIndexer(OriginIndexer): | ||||
"""Origin-level indexer. | """Origin-level indexer. | ||||
This indexer is in charge of looking up the revision that acts as the | This indexer is in charge of looking up the revision that acts as the | ||||
"head" of an origin. | "head" of an origin. | ||||
In git, this is usually the commit pointed to by the 'master' branch.""" | In git, this is usually the commit pointed to by the 'master' branch.""" | ||||
ADDITIONAL_CONFIG = { | ADDITIONAL_CONFIG = { | ||||
'tools': ('dict', { | 'tools': ('dict', { | ||||
'name': 'origin-metadata', | 'name': 'origin-metadata', | ||||
'version': '0.0.1', | 'version': '0.0.1', | ||||
'configuration': {}, | 'configuration': {}, | ||||
}), | }), | ||||
} | } | ||||
CONFIG_BASE_FILENAME = 'indexer/origin_head' | |||||
revision_metadata_task = RevisionMetadataTask() | |||||
Not Done Inline Actionsdoopsy, cf. patch below. diff --git a/swh/indexer/origin_head.py b/swh/indexer/origin_head.py index 7551fde..7ec69ce 100644 --- a/swh/indexer/origin_head.py +++ b/swh/indexer/origin_head.py @@ -10,7 +10,7 @@ import logging from celery import chain from swh.indexer.indexer import OriginIndexer -from swh.indexer.tasks import OriginMetadataTask, RevisionMetadataTask +from swh.indexer.tasks import OriginMetadata, RevisionMetadata class OriginHeadIndexer(OriginIndexer): @@ -31,8 +31,8 @@ class OriginHeadIndexer(OriginIndexer): CONFIG_BASE_FILENAME = 'indexer/origin_head' - revision_metadata_task = RevisionMetadataTask() - origin_intrinsic_metadata_task = OriginMetadataTask() + revision_metadata_task = RevisionMetadata() + origin_intrinsic_metadata_task = OriginMetadata() def filter(self, ids): yield from ids ardumont: doopsy, cf. patch below.
```
diff --git a/swh/indexer/origin_head.py b/swh/indexer/origin_head. | |||||
origin_intrinsic_metadata_task = OriginMetadataTask() | |||||
def filter(self, ids): | def filter(self, ids): | ||||
yield from ids | yield from ids | ||||
def persist_index_computations(self, results, policy_update): | def persist_index_computations(self, results, policy_update): | ||||
"""Do nothing. The indexer's results are not persistant, they | """Do nothing. The indexer's results are not persistent, they | ||||
Done Inline Actionspersistent ardumont: persistent | |||||
should only be piped to another indexer via the orchestrator.""" | should only be piped to another indexer via the orchestrator.""" | ||||
pass | pass | ||||
def next_step(self, results): | |||||
"""Once the head is found, call the RevisionMetadataIndexer | |||||
on these revisions, then call the OriginMetadataIndexer with | |||||
both the origin_id and the revision metadata, so it can copy the | |||||
revision metadata to the origin's metadata. | |||||
Args: | |||||
results (Iterable[dict]): Iterable of return values from `index`. | |||||
""" | |||||
if self.revision_metadata_task is None and \ | |||||
self.origin_intrinsic_metadata_task is None: | |||||
return | |||||
assert self.revision_metadata_task is not None | |||||
Done Inline ActionsIt's redundant, isn't it? You cannot pass here since the method will already have returned if one of them is None. ardumont: It's redundant, isn't it?
You cannot pass here since the method will already have returned if… | |||||
Done Inline ActionsOops, should have been an and in the condition, not an or. vlorentz: Oops, should have been an `and` in the condition, not an `or`. | |||||
assert self.origin_intrinsic_metadata_task is not None | |||||
return chain( | |||||
self.revision_metadata_task.s( | |||||
ids=[res['revision_id'] for res in results], | |||||
policy_update='update-dups'), | |||||
self.origin_intrinsic_metadata_task.s( | |||||
origin_head_pairs=results, | |||||
policy_update='update-dups'), | |||||
)() | |||||
# Dispatch | # Dispatch | ||||
def index(self, origin): | def index(self, origin): | ||||
origin_id = origin['id'] | origin_id = origin['id'] | ||||
latest_snapshot = self.storage.snapshot_get_latest(origin_id) | latest_snapshot = self.storage.snapshot_get_latest(origin_id) | ||||
Not Done Inline ActionsFor information, you may want to update this with the latest swh.storage.algo.snapshot_get_all_branches api olasd opened [1] ardumont: For information, you may want to update this with the latest `swh.storage.algo. | |||||
Not Done Inline ActionsWhat i failed to explain is that currently, the snapshot_get_latest is a paginated result so you won't have all branches if you keep the old api call. I think that's relevant here. ardumont: What i failed to explain is that currently, the `snapshot_get_latest` is a paginated result so… | |||||
Done Inline ActionsWill do in a later diff. vlorentz: Will do in a later diff. | |||||
Not Done Inline Actionssure, the explanation was needed though ;) ardumont: sure, the explanation was needed though ;) | |||||
method = getattr(self, '_try_get_%s_head' % origin['type'], None) | method = getattr(self, '_try_get_%s_head' % origin['type'], None) | ||||
if method is None: | if method is None: | ||||
method = self._try_get_head_generic | method = self._try_get_head_generic | ||||
rev_id = method(latest_snapshot) | rev_id = method(latest_snapshot) | ||||
if rev_id is None: | if rev_id is None: | ||||
return None | return None | ||||
result = { | result = { | ||||
'origin_id': origin_id, | 'origin_id': origin_id, | ||||
▲ Show 20 Lines • Show All 114 Lines • Show Last 20 Lines |
oopsy ;)