Page MenuHomeSoftware Heritage

D654.diff
No OneTemporary

D654.diff

diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -110,6 +110,48 @@
for obj in db.content_mimetype_missing_from_list(mimetypes, cur):
yield obj[0]
+    @remote_api_endpoint('content_mimetype/range')
+    @db_transaction()
+    def content_mimetype_range(self, start, end, indexer_configuration_id,
+                               limit=1000, db=None, cur=None):
+        """Retrieve the ids of mimetype-indexed contents within range
+        [start, end], bound by limit.
+
+        Args:
+            **start** (bytes): Starting identifier range (expected smaller
+                than end)
+            **end** (bytes): Ending identifier range (expected larger
+                than start)
+            **indexer_configuration_id** (int): The tool used to index
+                the data
+            **limit** (int): Maximum number of ids returned (defaults to
+                1000)
+
+        Raises:
+            ValueError: if limit is None
+
+        Returns:
+            a dict with keys:
+            - **ids**: list holding the first column of each row read
+              from `db.content_get_range`, at most `limit` entries
+            - **next_id**: first column of the row following the last
+              returned one when more than `limit` rows match (the
+              caller can use it as the next `start`), None otherwise
+
+        """
+        if limit is None:
+            raise ValueError('Development error: limit should not be None')
+
+        ids = []
+        next_id = None
+        # Ask for one row more than `limit`: that extra row is not part of
+        # the result, it only tells whether (and where) the range continues.
+        for counter, obj in enumerate(db.content_get_range(
+                'mimetype', start, end, indexer_configuration_id,
+                limit=limit+1, cur=cur)):
+            _id = obj[0]
+            if counter >= limit:
+                next_id = _id
+                break
+
+            ids.append(_id)
+
+        return {
+            'ids': ids,
+            'next_id': next_id
+        }
+
@remote_api_endpoint('content_mimetype/add')
@db_transaction()
def content_mimetype_add(self, mimetypes, conflict_update=False, db=None,
diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py
--- a/swh/indexer/storage/db.py
+++ b/swh/indexer/storage/db.py
@@ -113,6 +113,38 @@
((_id,) for _id in ids)
)
+ # Maps the content_type argument accepted by content_get_range to the
+ # name of the table that method queries.
+ content_tables = {
+ 'mimetype': 'content_mimetype',
+ 'language': 'content_language',
+ }
+
+    def content_get_range(self, content_type, start, end,
+                          indexer_configuration_id, limit=1000, cur=None):
+        """Retrieve contents with content_type, within range [start, end]
+           bound by limit and associated to the given indexer
+           configuration id.
+
+        Args:
+            content_type (str): one of the keys of `content_tables`; it
+                selects the table to query
+            start: lower bound (inclusive) on the table's sha1 column
+            end: upper bound (inclusive) on the table's sha1 column
+            indexer_configuration_id: id of the indexer_configuration
+                row the results must be associated to
+            limit: maximum number of rows to retrieve (defaults to 1000)
+            cur: cursor handed to `self._cursor`
+
+        Raises:
+            ValueError: if content_type is not a key of `content_tables`.
+                As this method is a generator, the error is raised when
+                iteration starts, not when the method is called.
+
+        Yields:
+            the rows of the query, passed through `cursor_to_bytes`
+
+        """
+        cur = self._cursor(cur)
+        table = self.content_tables.get(content_type)
+        if not table:
+            raise ValueError(
+                'Development error: Wrong type. Should be one of [%s]' % (
+                    ','.join(self.content_tables)))
+        # Only the column list and the table name, both taken from class
+        # attributes, are interpolated into the query text. The doubled
+        # %%s survive that interpolation as %s placeholders whose values
+        # are passed separately to cur.execute.
+        # NOTE(review): content_get_metadata_keys is not defined in the
+        # visible part of this class -- confirm it exists and that its
+        # first column is the content id.
+        query = """select %s
+                   from %s t
+                   inner join indexer_configuration ic
+                   on t.indexer_configuration_id=ic.id
+                   where ic.id=%%s and
+                         %%s <= t.sha1 and t.sha1 <= %%s
+                   order by t.indexer_configuration_id, t.sha1
+                   limit %%s""" % (
+            ', '.join(self.content_get_metadata_keys),  # keys
+            table
+        )
+        # Values must follow the order of the placeholders in the query:
+        # configuration id, lower bound, upper bound, limit.
+        cur.execute(query, (indexer_configuration_id, start, end, limit))
+        yield from cursor_to_bytes(cur)
+
def content_mimetype_get_from_list(self, ids, cur=None):
"""Yield whatever self._get_from_list produces for the 'content_mimetype' table, the given ids and self.content_mimetype_cols."""
yield from self._get_from_list(
'content_mimetype', ids, self.content_mimetype_cols, cur=cur)

File Metadata

Mime Type
text/plain
Expires
Thu, Jul 3, 11:37 AM (3 w, 3 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3218725

Event Timeline