Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/indexer.py
Show First 20 Lines • Show All 433 Lines • ▼ Show 20 Lines | def _index_contents(self, start, end, indexed, **kwargs): | ||||
except ObjNotFoundError: | except ObjNotFoundError: | ||||
self.log.warning('Content %s not found in objstorage' % | self.log.warning('Content %s not found in objstorage' % | ||||
hashutil.hash_to_hex(sha1)) | hashutil.hash_to_hex(sha1)) | ||||
continue | continue | ||||
res = self.index(sha1, raw_content, **kwargs) | res = self.index(sha1, raw_content, **kwargs) | ||||
if res: | if res: | ||||
yield res | yield res | ||||
def _index_with_skipping_already_done(self, start, end):
    """Index not already indexed contents in range [start, end].

    The already indexed identifiers are fetched page by page through
    ``self.indexed_contents_in_range``. Each page delimits a sub-range
    which is handed over to ``self._index_contents`` together with the
    identifiers to skip.

    Args:
        **start** (Union[bytes, str]): Starting range identifier
        **end** (Union[bytes, str]): Ending range identifier

    Yields:
        Whatever ``self._index_contents`` yields for the contents of
        the range [start, end] which are not already indexed.

    """
    while start:
        indexed_page = self.indexed_contents_in_range(start, end)
        indexed_ids = indexed_page['ids']
        next_start = indexed_page['next']
        if indexed_ids and next_start:
            # More pages follow: stop this sub-range at the last
            # already indexed id of the current page.
            # NOTE(review): contents lying strictly between that id
            # and 'next' are only covered if 'next' directly follows
            # it -- confirm against the pagination semantics of
            # indexed_contents_in_range.
            page_end = indexed_ids[-1]
        else:
            # Last (or empty) page: go up to the requested end so the
            # not yet indexed contents located after the last indexed
            # id are not silently left out.
            page_end = end
        # A set keeps the "already indexed" membership test cheap.
        yield from self._index_contents(
            start, page_end, set(indexed_ids))
        start = next_start
def run(self, start, end, skip_existing=True, **kwargs): | def run(self, start, end, skip_existing=True, **kwargs): | ||||
"""Given a range of content ids, compute the indexing computations on | """Given a range of content ids, compute the indexing computations on | ||||
the contents within. Either the indexer is incremental | the contents within. Either the indexer is incremental | ||||
(filter out existing computed data) or not (compute | (filter out existing computed data) or not (compute | ||||
everything from scratch). | everything from scratch). | ||||
Args: | Args: | ||||
**start** (Union[bytes, str]): Starting range identifier | **start** (Union[bytes, str]): Starting range identifier | ||||
Show All 9 Lines | def run(self, start, end, skip_existing=True, **kwargs): | ||||
with_indexed_data = False | with_indexed_data = False | ||||
try: | try: | ||||
if isinstance(start, str): | if isinstance(start, str): | ||||
start = hashutil.hash_to_bytes(start) | start = hashutil.hash_to_bytes(start) | ||||
if isinstance(end, str): | if isinstance(end, str): | ||||
end = hashutil.hash_to_bytes(end) | end = hashutil.hash_to_bytes(end) | ||||
if skip_existing: | if skip_existing: | ||||
indexed = set(self.indexed_contents_in_range(start, end)) | gen = self._index_with_skipping_already_done(start, end) | ||||
else: | else: | ||||
indexed = set() | gen = self._index_contents(start, end, indexed=[]) | ||||
index_computations = self._index_contents(start, end, indexed) | for results in utils.grouper(gen, | ||||
for results in utils.grouper(index_computations, | |||||
n=self.config['write_batch_size']): | n=self.config['write_batch_size']): | ||||
self.persist_index_computations( | self.persist_index_computations( | ||||
results, policy_update='update-dups') | results, policy_update='update-dups') | ||||
with_indexed_data = True | with_indexed_data = True | ||||
return with_indexed_data | |||||
except Exception: | except Exception: | ||||
self.log.exception( | self.log.exception( | ||||
'Problem when computing metadata.') | 'Problem when computing metadata.') | ||||
finally: | |||||
return with_indexed_data | |||||
class OriginIndexer(BaseIndexer): | class OriginIndexer(BaseIndexer): | ||||
"""An object type indexer, inherits from the :class:`BaseIndexer` and | """An object type indexer, inherits from the :class:`BaseIndexer` and | ||||
implements Origin indexing using the run method | implements Origin indexing using the run method | ||||
Note: the :class:`OriginIndexer` is not an instantiable object. | Note: the :class:`OriginIndexer` is not an instantiable object. | ||||
To use it in another context one should inherit from this class | To use it in another context one should inherit from this class | ||||
▲ Show 20 Lines • Show All 102 Lines • Show Last 20 Lines |