Page MenuHomeSoftware Heritage

D677.id2135.diff
No OneTemporary

D677.id2135.diff

diff --git a/swh/indexer/fossology_license.py b/swh/indexer/fossology_license.py
--- a/swh/indexer/fossology_license.py
+++ b/swh/indexer/fossology_license.py
@@ -169,8 +169,6 @@
for _id in contents:
yield _id
start = result['next']
- if start is None:
- break
@click.command(help='Compute license for path using tool')
diff --git a/swh/indexer/mimetype.py b/swh/indexer/mimetype.py
--- a/swh/indexer/mimetype.py
+++ b/swh/indexer/mimetype.py
@@ -162,8 +162,6 @@
for _id in contents:
yield _id
start = result['next']
- if start is None:
- break
@click.command()
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -112,6 +112,7 @@
def _content_get_range(self, content_type, start, end,
indexer_configuration_id, limit=1000,
+ with_textual_data=False,
db=None, cur=None):
"""Retrieve ids of type content_type within range [start, end] bound
by limit.
@@ -124,6 +125,10 @@
than start)
**indexer_configuration_id** (int): The tool used to index data
**limit** (int): Limit result (default to 1000)
+ **with_textual_data** (bool): If True, only consider
+ textual contents; if
+ False (the default),
+ consider all contents
Raises:
ValueError for;
@@ -148,7 +153,7 @@
next_id = None
for counter, obj in enumerate(db.content_get_range(
content_type, start, end, indexer_configuration_id,
- limit=limit+1, cur=cur)):
+ limit=limit+1, with_textual_data=with_textual_data, cur=cur)):
_id = obj[0]
if counter >= limit:
next_id = _id
@@ -483,7 +488,7 @@
"""
return self._content_get_range('fossology_license', start, end,
indexer_configuration_id, limit=limit,
- db=db, cur=cur)
+ with_textual_data=True, db=db, cur=cur)
@remote_api_endpoint('content_metadata/missing')
@db_transaction_generator()
diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py
--- a/swh/indexer/storage/db.py
+++ b/swh/indexer/storage/db.py
@@ -119,22 +119,32 @@
}
def content_get_range(self, content_type, start, end,
- indexer_configuration_id, limit=1000, cur=None):
+ indexer_configuration_id, limit=1000,
+ with_textual_data=False, cur=None):
"""Retrieve contents with content_type, within range [start, end]
bound by limit and associated to the given indexer
configuration id.
+ When asking to work on textual content only, the results are joined
+ with the content_mimetype table and restricted to `text/*` mimetypes.
+
"""
cur = self._cursor(cur)
table = self.content_indexer_names[content_type]
+ if with_textual_data:
+ extra = """inner join content_mimetype cm
+ on (t.id=cm.id and cm.mimetype like 'text/%%')"""
+ else:
+ extra = ""
query = """select t.id
from %s t
inner join indexer_configuration ic
on t.indexer_configuration_id=ic.id
+ %s
where ic.id=%%s and
%%s <= t.id and t.id <= %%s
order by t.indexer_configuration_id, t.id
- limit %%s""" % table
+ limit %%s""" % (table, extra)
cur.execute(query, (indexer_configuration_id, start, end, limit))
yield from cursor_to_bytes(cur)
diff --git a/swh/indexer/tasks.py b/swh/indexer/tasks.py
--- a/swh/indexer/tasks.py
+++ b/swh/indexer/tasks.py
@@ -10,7 +10,9 @@
from .mimetype import ContentMimetypeIndexer, MimetypeRangeIndexer
from .language import ContentLanguageIndexer
from .ctags import CtagsIndexer
-from .fossology_license import ContentFossologyLicenseIndexer
+from .fossology_license import (
+ ContentFossologyLicenseIndexer, FossologyLicenseRangeIndexer
+)
from .rehash import RecomputeChecksums
from .metadata import RevisionMetadataIndexer, OriginMetadataIndexer
from .origin_head import OriginHeadIndexer
@@ -59,7 +61,7 @@
class ContentMimetype(StatusTask):
- """Compute (mimetype, encoding) from the sha1's content.
+ """Compute (mimetype, encoding) on a list of sha1s' content.
"""
task_queue = 'swh_indexer_content_mimetype'
@@ -93,14 +95,21 @@
class ContentFossologyLicense(Task):
- """Task which computes licenses from the sha1's content.
+ """Compute fossology licenses on a list of sha1s' content.
"""
task_queue = 'swh_indexer_content_fossology_license'
-
Indexer = ContentFossologyLicenseIndexer
+class ContentRangeFossologyLicense(StatusTask):
+ """Compute fossology license on a range of sha1s.
+
+ """
+ task_queue = 'swh_indexer_content_fossology_license_range'
+ Indexer = FossologyLicenseRangeIndexer
+
+
class RecomputeChecksums(Task):
"""Task which recomputes hashes and possibly new ones.
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
--- a/swh/indexer/tests/storage/test_storage.py
+++ b/swh/indexer/tests/storage/test_storage.py
@@ -1706,11 +1706,67 @@
self.assertEqual(e.exception.args, (
'Development error: limit should not be None',))
+ def prepare_mimetypes_from(self, fossology_licenses):
+ """Fossology license needs some consistent data in db to run.
+
+ """
+ mimetypes = []
+ for c in fossology_licenses:
+ mimetypes.append({
+ 'id': c['id'],
+ 'mimetype': 'text/plain',
+ 'encoding': 'utf-8',
+ 'indexer_configuration_id': c['indexer_configuration_id'],
+ })
+ return mimetypes
+
@given(gen_content_fossology_licenses(min_size=1, max_size=4))
def test_generate_content_fossology_license_get_range_no_limit(
self, fossology_licenses):
"""license_get_range returns licenses within range provided"""
self.reset_storage_tables()
+ # craft some consistent mimetypes
+ mimetypes = self.prepare_mimetypes_from(fossology_licenses)
+
+ self.storage.content_mimetype_add(mimetypes)
+ # add fossology_licenses to storage
+ self.storage.content_fossology_license_add(fossology_licenses)
+
+ # All ids from the db
+ content_ids = sorted([c['id'] for c in fossology_licenses])
+
+ start = content_ids[0]
+ end = content_ids[-1]
+
+ # retrieve fossology_licenses
+ tool_id = fossology_licenses[0]['indexer_configuration_id']
+ actual_result = self.storage.content_fossology_license_get_range(
+ start, end, indexer_configuration_id=tool_id)
+
+ actual_ids = actual_result['ids']
+ actual_next = actual_result['next']
+
+ self.assertEqual(len(fossology_licenses), len(actual_ids))
+ self.assertIsNone(actual_next)
+ self.assertEqual(content_ids, actual_ids)
+
+ @given(gen_content_fossology_licenses(min_size=1, max_size=4),
+ gen_content_mimetypes(min_size=1, max_size=1))
+ def test_generate_content_fossology_license_get_range_no_limit_with_filter(
+ self, fossology_licenses, mimetypes):
+ """This filters non textual, then returns results within range"""
+ self.reset_storage_tables()
+
+ # craft some consistent mimetypes
+ _mimetypes = self.prepare_mimetypes_from(fossology_licenses)
+ # add binary mimetypes which will get filtered out in results
+ for m in mimetypes:
+ _mimetypes.append({
+ 'mimetype': 'binary',
+ **m,
+ })
+
+ self.storage.content_mimetype_add(_mimetypes)
# add fossology_licenses to storage
self.storage.content_fossology_license_add(fossology_licenses)
@@ -1737,8 +1793,11 @@
self, fossology_licenses):
"""fossology_license_get_range paginates results if limit exceeded"""
self.reset_storage_tables()
+ # craft some consistent mimetypes
+ mimetypes = self.prepare_mimetypes_from(fossology_licenses)
# add fossology_licenses to storage
+ self.storage.content_mimetype_add(mimetypes)
self.storage.content_fossology_license_add(fossology_licenses)
# input the list of sha1s we want from storage

File Metadata

Mime Type
text/plain
Expires
Wed, Jul 2, 10:55 AM (1 w, 5 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3214335

Event Timeline