Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/indexer.py
Show First 20 Lines • Show All 505 Lines • ▼ Show 20 Lines | def run(self, start, end, skip_existing=True, **kwargs): | ||||
with_indexed_data = True | with_indexed_data = True | ||||
except Exception: | except Exception: | ||||
self.log.exception( | self.log.exception( | ||||
'Problem when computing metadata.') | 'Problem when computing metadata.') | ||||
finally: | finally: | ||||
return with_indexed_data | return with_indexed_data | ||||
def origin_get_params(id_): | |||||
"""From any of the two types of origin identifiers (int or | |||||
type+url), returns a dict that can be passed to Storage.origin_get. | |||||
Also accepts JSON-encoded forms of these (used via the task scheduler). | |||||
>>> from pprint import pprint | |||||
>>> origin_get_params(123) | |||||
{'id': 123} | |||||
>>> pprint(origin_get_params(['git', 'https://example.com/foo.git'])) | |||||
{'type': 'git', 'url': 'https://example.com/foo.git'} | |||||
>>> origin_get_params("123") | |||||
{'id': 123} | |||||
>>> pprint(origin_get_params('["git", "https://example.com/foo.git"]')) | |||||
{'type': 'git', 'url': 'https://example.com/foo.git'} | |||||
""" | |||||
if isinstance(id_, str): | |||||
# Data coming from JSON, which requires string keys, so | |||||
# one extra level of deserialization is needed | |||||
id_ = ast.literal_eval(id_) | |||||
if isinstance(id_, (tuple, list)): | |||||
if len(id_) != 2: | |||||
raise TypeError('Expected a (type, url) tuple.') | |||||
(type_, url) = id_ | |||||
params = {'type': type_, 'url': url} | |||||
elif isinstance(id_, int): | |||||
params = {'id': id_} | |||||
else: | |||||
raise TypeError('Invalid value in "ids": %r' % id_) | |||||
return params | |||||
class OriginIndexer(BaseIndexer): | class OriginIndexer(BaseIndexer): | ||||
"""An object type indexer, inherits from the :class:`BaseIndexer` and | """An object type indexer, inherits from the :class:`BaseIndexer` and | ||||
implements Origin indexing using the run method | implements Origin indexing using the run method | ||||
Note: the :class:`OriginIndexer` is not an instantiable object. | Note: the :class:`OriginIndexer` is not an instantiable object. | ||||
To use it in another context one should inherit from this class | To use it in another context one should inherit from this class | ||||
and override the methods mentioned in the :class:`BaseIndexer` | and override the methods mentioned in the :class:`BaseIndexer` | ||||
class. | class. | ||||
""" | """ | ||||
douardda: self not being used in the body of the method, make it either a simple function or a static… | |||||
def run(self, ids, policy_update='update-dups', parse_ids=True, | def run(self, ids, policy_update='update-dups', parse_ids=True, | ||||
next_step=None, **kwargs): | next_step=None, **kwargs): | ||||
"""Given a list of origin ids: | """Given a list of origin ids: | ||||
- retrieve origins from storage | - retrieve origins from storage | ||||
- execute the indexing computations | - execute the indexing computations | ||||
- store the results (according to policy_update) | - store the results (according to policy_update) | ||||
Args: | Args: | ||||
ids ([Union[int, Tuple[str, bytes]]]): list of origin ids or | ids ([Union[int, Tuple[str, bytes]]]): list of origin ids or | ||||
(type, url) tuples. | (type, url) tuples. | ||||
policy_update (str): either 'update-dups' or 'ignore-dups' to | policy_update (str): either 'update-dups' or 'ignore-dups' to | ||||
respectively update duplicates (default) or ignore them | respectively update duplicates (default) or ignore them | ||||
next_step (dict): a dict in the form expected by | next_step (dict): a dict in the form expected by | ||||
`scheduler.backend.SchedulerBackend.create_tasks` without | `scheduler.backend.SchedulerBackend.create_tasks` without | ||||
`next_run`, plus an optional `result_name` key. | `next_run`, plus an optional `result_name` key. | ||||
parse_ids (bool): Do we need to parse id or not (default) | parse_ids (bool): Do we need to parse id or not (default) | ||||
**kwargs: passed to the `index` method | **kwargs: passed to the `index` method | ||||
""" | """ | ||||
if parse_ids: | if parse_ids: | ||||
ids = [o.split('+', 1) if ':' in o else int(o) # type+url or id | ids = [o.split('+', 1) if ':' in o else int(o) # type+url or id | ||||
for o in ids] | for o in ids] | ||||
results = [] | origins = [] | ||||
for id_ in ids: | for id_ in ids: | ||||
if isinstance(id_, str): | origin = self.storage.origin_get(origin_get_params(id_)) | ||||
# Data coming from JSON, which requires string keys, so | |||||
# one extra level of deserialization is needed | |||||
id_ = ast.literal_eval(id_) | |||||
if isinstance(id_, (tuple, list)): | |||||
if len(id_) != 2: | |||||
raise TypeError('Expected a (type, url) tuple.') | |||||
(type_, url) = id_ | |||||
params = {'type': type_, 'url': url} | |||||
elif isinstance(id_, int): | |||||
params = {'id': id_} | |||||
else: | |||||
raise TypeError('Invalid value in "ids": %r' % id_) | |||||
origin = self.storage.origin_get(params) | |||||
if not origin: | if not origin: | ||||
self.log.warning('Origin %s not found in storage' % | self.log.warning('Origin %s not found in storage' % | ||||
id_) | id_) | ||||
continue | continue | ||||
origins.append(origin) | |||||
results = self.index_list(origins, **kwargs) | |||||
self.persist_index_computations(results, policy_update) | |||||
self.results = results | |||||
return self.next_step(results, task=next_step) | |||||
def index_list(self, origins, **kwargs): | |||||
results = [] | |||||
for origin in origins: | |||||
try: | try: | ||||
res = self.index(origin, **kwargs) | res = self.index(origin, **kwargs) | ||||
if res: # If no results, skip it | if res: # If no results, skip it | ||||
results.append(res) | results.append(res) | ||||
except Exception: | except Exception: | ||||
self.log.exception( | self.log.exception( | ||||
'Problem when processing origin %s' % (id_,)) | 'Problem when processing origin %s', | ||||
self.persist_index_computations(results, policy_update) | origin['id']) | ||||
self.results = results | return results | ||||
return self.next_step(results, task=next_step) | |||||
class RevisionIndexer(BaseIndexer): | class RevisionIndexer(BaseIndexer): | ||||
"""An object type indexer, inherits from the :class:`BaseIndexer` and | """An object type indexer, inherits from the :class:`BaseIndexer` and | ||||
implements Revision indexing using the run method | implements Revision indexing using the run method | ||||
Note: the :class:`RevisionIndexer` is not an instantiable object. | Note: the :class:`RevisionIndexer` is not an instantiable object. | ||||
To use it in another context one should inherit from this class | To use it in another context one should inherit from this class | ||||
Show All 37 Lines |
self not being used in the body of the method, make it either a simple function or a static method. Also extracting a piece of code like this in a dedicated function make it easier to test it. Just sayin'!