Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/indexer.py
Show First 20 Lines • Show All 399 Lines • ▼ Show 20 Lines | def _list_contents_to_index(self, start, end, indexed): | ||||
start (bytes): Starting bound from range identifier | start (bytes): Starting bound from range identifier | ||||
end (bytes): End range identifier | end (bytes): End range identifier | ||||
indexed (Set[bytes]): Set of content already indexed. | indexed (Set[bytes]): Set of content already indexed. | ||||
Yields: | Yields: | ||||
bytes: Identifier of contents to index. | bytes: Identifier of contents to index. | ||||
""" | """ | ||||
if not isinstance(start, bytes) or not isinstance(end, bytes): | |||||
raise TypeError('identifiers must be bytes, not %r and %r.' % | |||||
(start, end)) | |||||
while start: | while start: | ||||
result = self.storage.content_get_range(start, end) | result = self.storage.content_get_range(start, end) | ||||
contents = result['contents'] | contents = result['contents'] | ||||
for c in contents: | for c in contents: | ||||
_id = c['sha1'] | _id = hashutil.hash_to_bytes(c['sha1']) | ||||
if _id in indexed: | if _id in indexed: | ||||
continue | continue | ||||
yield _id | yield _id | ||||
start = result['next'] | start = result['next'] | ||||
def _index_contents(self, start, end, indexed, **kwargs): | def _index_contents(self, start, end, indexed, **kwargs): | ||||
"""Index the contents from within range [start, end] | """Index the contents from within range [start, end] | ||||
Show All 9 Lines | def _index_contents(self, start, end, indexed, **kwargs): | ||||
for sha1 in self._list_contents_to_index(start, end, indexed): | for sha1 in self._list_contents_to_index(start, end, indexed): | ||||
try: | try: | ||||
raw_content = self.objstorage.get(sha1) | raw_content = self.objstorage.get(sha1) | ||||
except ObjNotFoundError: | except ObjNotFoundError: | ||||
self.log.warning('Content %s not found in objstorage' % | self.log.warning('Content %s not found in objstorage' % | ||||
hashutil.hash_to_hex(sha1)) | hashutil.hash_to_hex(sha1)) | ||||
continue | continue | ||||
res = self.index(sha1, raw_content, **kwargs) | res = self.index(sha1, raw_content, **kwargs) | ||||
if not isinstance(res['id'], bytes): | |||||
raise TypeError( | |||||
'%r.index should return ids as bytes, not %r' % | |||||
(self.__class__.__name__, res['id'])) | |||||
if res: | if res: | ||||
yield res | yield res | ||||
def _index_with_skipping_already_done(self, start, end): | def _index_with_skipping_already_done(self, start, end): | ||||
"""Index not already indexed contents in range [start, end]. | """Index not already indexed contents in range [start, end]. | ||||
Args: | Args: | ||||
start (Union[bytes, str]): Starting range identifier | start (Union[bytes, str]): Starting range identifier | ||||
▲ Show 20 Lines • Show All 162 Lines • Show Last 20 Lines |