diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -110,6 +110,54 @@
         for obj in db.content_mimetype_missing_from_list(mimetypes, cur):
             yield obj[0]
 
+    @remote_api_endpoint('content_mimetype/range')
+    @db_transaction()
+    def content_mimetype_range(self, start, end, indexer_configuration_id,
+                               limit=1000, db=None, cur=None):
+        """Retrieve mimetype ids within range [start, end] bound by limit.
+
+        Args:
+            **start** (bytes): Starting identifier range (expected smaller
+                               than end)
+            **end** (bytes): Ending identifier range (expected larger
+                             than start)
+            **indexer_configuration_id** (int): The tool used to index data
+            **limit** (int): Limit result (default to 1000)
+
+        Raises:
+            ValueError: if limit is None
+
+        Returns:
+            a dict with keys:
+            - **ids** (list): at most `limit` identifiers found in the
+              range, in the order the database returned them
+            - **next_id**: None when no identifier remains in the range;
+              otherwise the first identifier left out, to pass as
+              `start` to fetch the following page
+
+        """
+        if limit is None:
+            raise ValueError('Development error: limit should not be None')
+
+        ids = []
+        next_id = None
+        # Fetch one row more than requested: if it shows up, the range holds
+        # more than `limit` entries and that row is where the next page starts.
+        for counter, obj in enumerate(db.content_get_range(
+                'mimetype', start, end, indexer_configuration_id,
+                limit=limit+1, cur=cur)):
+            _id = obj[0]
+            if counter >= limit:
+                next_id = _id
+                break
+
+            ids.append(_id)
+
+        return {
+            'ids': ids,
+            'next_id': next_id
+        }
+
     @remote_api_endpoint('content_mimetype/add')
     @db_transaction()
     def content_mimetype_add(self, mimetypes, conflict_update=False, db=None,
diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py
--- a/swh/indexer/storage/db.py
+++ b/swh/indexer/storage/db.py
@@ -113,6 +113,59 @@
             ((_id,) for _id in ids)
         )
 
+    content_tables = {
+        'mimetype': 'content_mimetype',
+        'language': 'content_language',
+    }
+
+    def content_get_range(self, content_type, start, end,
+                          indexer_configuration_id, limit=1000, cur=None):
+        """Retrieve contents with content_type, within range [start, end]
+           bound by limit and associated to the given indexer
+           configuration id.
+
+        Args:
+            content_type (str): a key of `content_tables` selecting the
+                table to query
+            start: lower bound of the range (inclusive)
+            end: upper bound of the range (inclusive)
+            indexer_configuration_id: id of the indexer configuration
+                the rows must be associated to
+            limit (int): maximum number of rows returned (default to 1000)
+            cur: cursor to use, handed to `self._cursor`
+
+        Raises:
+            ValueError: if content_type is not a key of `content_tables`;
+                this being a generator, only once iteration starts
+
+        Yields:
+            the selected rows, as produced by `cursor_to_bytes`
+
+        """
+        cur = self._cursor(cur)
+        table = self.content_tables.get(content_type)
+        if not table:
+            raise ValueError(
+                'Development error: Wrong type. Should be one of [%s]' % (
+                    ','.join(self.content_tables)))
+        # NOTE(review): callers read row[0] as the identifier; confirm
+        # that content_get_metadata_keys lists it first and that its
+        # column names are not ambiguous once joined - TODO confirm
+        query = """select %s
+                   from %s t
+                   inner join indexer_configuration ic
+                   on t.indexer_configuration_id=ic.id
+                   where ic.id=%%s and
+                         %%s <= t.sha1 and t.sha1 <= %%s
+                   order by t.indexer_configuration_id, t.sha1
+                   limit %%s""" % (
+            ', '.join(self.content_get_metadata_keys),  # keys
+            table
+        )
+        # Values are listed in the order of the query's placeholders.
+        cur.execute(query, (indexer_configuration_id, start, end, limit))
+        yield from cursor_to_bytes(cur)
+
     def content_mimetype_get_from_list(self, ids, cur=None):
         yield from self._get_from_list(
             'content_mimetype', ids, self.content_mimetype_cols, cur=cur)