Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/indexer.py
Show First 20 Lines • Show All 73 Lines • ▼ Show 20 Lines | class BaseIndexer(SWHConfig, | ||||
Indexers can: | Indexers can: | ||||
- filter out ids whose data has already been indexed. | - filter out ids whose data has already been indexed. | ||||
- retrieve ids data from storage or objstorage | - retrieve ids data from storage or objstorage | ||||
- index this data depending on the object and store the result in | - index this data depending on the object and store the result in | ||||
storage. | storage. | ||||
To implement a new object type indexer, inherit from the | To implement a new object type indexer, inherit from the | ||||
BaseIndexer and implement the process of indexation: | BaseIndexer and implement indexing: | ||||
:func:`run`: | :func:`run`: | ||||
object_ids are different depending on object. For example: sha1 for | object_ids are different depending on object. For example: sha1 for | ||||
content, sha1_git for revision, directory, release, and id for origin | content, sha1_git for revision, directory, release, and id for origin | ||||
To implement a new concrete indexer, inherit from the object level | To implement a new concrete indexer, inherit from the object level | ||||
classes: :class:`ContentIndexer`, :class:`RevisionIndexer` (later | classes: :class:`ContentIndexer`, :class:`RevisionIndexer`, | ||||
ardumont: you can remove the parenthesis mention now. | |||||
on :class:`OriginIndexer` will also be available) | :class:`OriginIndexer`. | ||||
Then you need to implement the following functions: | Then you need to implement the following functions: | ||||
:func:`filter`: | :func:`filter`: | ||||
filter out data already indexed (in storage). This function is used by | filter out data already indexed (in storage). This function is used by | ||||
the orchestrator and not directly by the indexer | the orchestrator and not directly by the indexer | ||||
(cf. swh.indexer.orchestrator.BaseOrchestratorIndexer). | (cf. swh.indexer.orchestrator.BaseOrchestratorIndexer). | ||||
▲ Show 20 Lines • Show All 233 Lines • ▼ Show 20 Lines | def run(self, ids, policy_update): | ||||
respectively update duplicates or ignore them | respectively update duplicates or ignore them | ||||
""" | """ | ||||
pass | pass | ||||
class ContentIndexer(BaseIndexer): | class ContentIndexer(BaseIndexer): | ||||
"""An object type indexer, inherits from the :class:`BaseIndexer` and | """An object type indexer, inherits from the :class:`BaseIndexer` and | ||||
implements the process of indexation for Contents using the run | implements Content indexing using the run method | ||||
method | |||||
Note: the :class:`ContentIndexer` is not an instantiable | Note: the :class:`ContentIndexer` is not an instantiable | ||||
object. To use it in another context, one should inherit from this | object. To use it in another context, one should inherit from this | ||||
class and override the methods mentioned in the | class and override the methods mentioned in the | ||||
:class:`BaseIndexer` class. | :class:`BaseIndexer` class. | ||||
""" | """ | ||||
Show All 29 Lines | def run(self, ids, policy_update): | ||||
except Exception: | except Exception: | ||||
self.log.exception( | self.log.exception( | ||||
'Problem when reading contents metadata.') | 'Problem when reading contents metadata.') | ||||
if self.rescheduling_task: | if self.rescheduling_task: | ||||
self.log.warn('Rescheduling batch') | self.log.warn('Rescheduling batch') | ||||
self.rescheduling_task.delay(ids, policy_update) | self.rescheduling_task.delay(ids, policy_update) | ||||
class OriginIndexer(BaseIndexer):
    """An object type indexer, inherits from the :class:`BaseIndexer` and
    implements Origin indexing using the run method

    Note: the :class:`OriginIndexer` is not an instantiable object.
    To use it in another context one should inherit from this class
    and override the methods mentioned in the :class:`BaseIndexer`
    class.

    """
    def run(self, ids, policy_update, parse_ids=False):
        """Given a list of origin ids:

        - retrieve origins from storage
        - execute the indexing computations
        - store the results (according to policy_update)

        Origins missing from storage are logged and skipped; an exception
        raised while indexing one origin is logged and does not abort the
        rest of the batch.

        Args:
            ids ([Union[int, Tuple[str, bytes]]]): list of origin ids or
                                                   (type, url) tuples.
            policy_update ([str]): either 'update-dups' or 'ignore-dups' to
                                   respectively update duplicates or ignore
                                   them
            parse_ids ([bool]): If `True`, will try to convert `ids`
                                from a human input to the valid type:
                                entries containing ':' are split once on
                                '+' into [type, url], any other entry is
                                converted with `int()`.

        Raises:
            TypeError: if an entry of `ids` is neither an int nor a
                       2-element tuple/list.
            ValueError: if `parse_ids` is set and an entry without ':'
                        is not a valid integer literal.

        """
        if parse_ids:
            # A url always contains ':' whereas a numeric id never does,
            # so ':' is what tells "type+url" apart from a plain id.
            ids = [
                o.split('+', 1) if ':' in o else int(o)  # type+url or id
                for o in ids]

        results = []

        for id_ in ids:
            if isinstance(id_, (tuple, list)):
                if len(id_) != 2:
                    raise TypeError('Expected a (type, url) tuple.')
                (type_, url) = id_
                params = {'type': type_, 'url': url}
            elif isinstance(id_, int):
                params = {'id': id_}
            else:
                # Wrap in a 1-tuple so the value is always formatted as a
                # single argument, whatever its type.
                raise TypeError('Invalid value for "ids": %r' % (id_,))

            origin = self.storage.origin_get(params)
            if not origin:
                # Report the one origin that is missing, not the whole batch.
                self.log.warning('Origin %s not found in storage', id_)
                continue

            try:
                res = self.index(origin)
                if res:  # If no results, skip it
                    results.append(res)
            except Exception:
                # Pass id_ as a logging argument: '%'-formatting it directly
                # would raise TypeError when id_ is a (type, url) tuple.
                self.log.exception(
                    'Problem when processing origin %s', id_)

        self.persist_index_computations(results, policy_update)
class RevisionIndexer(BaseIndexer): | class RevisionIndexer(BaseIndexer): | ||||
"""An object type indexer, inherits from the :class:`BaseIndexer` and | """An object type indexer, inherits from the :class:`BaseIndexer` and | ||||
implements the process of indexation for Revisions using the run | implements Revision indexing using the run method | ||||
method | |||||
Note: the :class:`RevisionIndexer` is not an instantiable object. | Note: the :class:`RevisionIndexer` is not an instantiable object. | ||||
To use it in another context one should inherit from this class | To use it in another context one should inherit from this class | ||||
and override the methods mentioned in the :class:`BaseIndexer` | and override the methods mentioned in the :class:`BaseIndexer` | ||||
class. | class. | ||||
""" | """ | ||||
def run(self, ids, policy_update): | def run(self, ids, policy_update): | ||||
Show All 29 Lines |
you can remove the parenthesis mention now.