Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/indexer.py
# Copyright (C) 2016-2018 The Software Heritage developers | # Copyright (C) 2016-2018 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import abc | import abc | ||||
import ast | |||||
import os | import os | ||||
import logging | import logging | ||||
import shutil | import shutil | ||||
import tempfile | import tempfile | ||||
import datetime | import datetime | ||||
from copy import deepcopy | from copy import deepcopy | ||||
from swh.scheduler import get_scheduler | from swh.scheduler import get_scheduler | ||||
▲ Show 20 Lines • Show All 523 Lines • ▼ Show 20 Lines | def run(self, ids, policy_update='update-dups', parse_ids=True, | ||||
""" | """ | ||||
if parse_ids: | if parse_ids: | ||||
ids = [o.split('+', 1) if ':' in o else int(o) # type+url or id | ids = [o.split('+', 1) if ':' in o else int(o) # type+url or id | ||||
for o in ids] | for o in ids] | ||||
results = [] | results = [] | ||||
for id_ in ids: | for id_ in ids: | ||||
if isinstance(id_, str): | |||||
# Data coming from JSON, which requires string keys, so | |||||
# one extra level of deserialization is needed | |||||
id_ = ast.literal_eval(id_) | |||||
if isinstance(id_, (tuple, list)): | if isinstance(id_, (tuple, list)): | ||||
if len(id_) != 2: | if len(id_) != 2: | ||||
raise TypeError('Expected a (type, url) tuple.') | raise TypeError('Expected a (type, url) tuple.') | ||||
(type_, url) = id_ | (type_, url) = id_ | ||||
params = {'type': type_, 'url': url} | params = {'type': type_, 'url': url} | ||||
elif isinstance(id_, int): | elif isinstance(id_, int): | ||||
params = {'id': id_} | params = {'id': id_} | ||||
else: | else: | ||||
raise TypeError('Invalid value in "ids": %r' % id_) | raise TypeError('Invalid value in "ids": %r' % id_) | ||||
origin = self.storage.origin_get(params) | origin = self.storage.origin_get(params) | ||||
if not origin: | if not origin: | ||||
self.log.warning('Origins %s not found in storage' % | self.log.warning('Origin %s not found in storage' % | ||||
list(ids)) | list(id_)) | ||||
continue | continue | ||||
try: | try: | ||||
res = self.index(origin, **kwargs) | res = self.index(origin, **kwargs) | ||||
if origin: # If no results, skip it | if origin: # If no results, skip it | ||||
results.append(res) | results.append(res) | ||||
except Exception: | except Exception: | ||||
self.log.exception( | self.log.exception( | ||||
'Problem when processing origin %s' % id_) | 'Problem when processing origin %s' % (id_,)) | ||||
self.persist_index_computations(results, policy_update) | self.persist_index_computations(results, policy_update) | ||||
self.results = results | self.results = results | ||||
return self.next_step(results, task=next_step) | return self.next_step(results, task=next_step) | ||||
class RevisionIndexer(BaseIndexer): | class RevisionIndexer(BaseIndexer): | ||||
"""An object type indexer, inherits from the :class:`BaseIndexer` and | """An object type indexer, inherits from the :class:`BaseIndexer` and | ||||
implements Revision indexing using the run method | implements Revision indexing using the run method | ||||
Show All 40 Lines |