Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9312506
D677.id2135.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
8 KB
Subscribers
None
D677.id2135.diff
View Options
diff --git a/swh/indexer/fossology_license.py b/swh/indexer/fossology_license.py
--- a/swh/indexer/fossology_license.py
+++ b/swh/indexer/fossology_license.py
@@ -169,8 +169,6 @@
for _id in contents:
yield _id
start = result['next']
- if start is None:
- break
@click.command(help='Compute license for path using tool')
diff --git a/swh/indexer/mimetype.py b/swh/indexer/mimetype.py
--- a/swh/indexer/mimetype.py
+++ b/swh/indexer/mimetype.py
@@ -162,8 +162,6 @@
for _id in contents:
yield _id
start = result['next']
- if start is None:
- break
@click.command()
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -112,6 +112,7 @@
def _content_get_range(self, content_type, start, end,
indexer_configuration_id, limit=1000,
+ with_textual_data=False,
db=None, cur=None):
"""Retrieve ids of type content_type within range [start, end] bound
by limit.
@@ -124,6 +125,10 @@
than start)
**indexer_configuration_id** (int): The tool used to index data
**limit** (int): Limit result (default to 1000)
+ **with_textual_data** (bool): Deal with only textual
+ content (True) or all
+ content (False, the
+ default)
Raises:
ValueError for;
@@ -148,7 +153,7 @@
next_id = None
for counter, obj in enumerate(db.content_get_range(
content_type, start, end, indexer_configuration_id,
- limit=limit+1, cur=cur)):
+ limit=limit+1, with_textual_data=with_textual_data, cur=cur)):
_id = obj[0]
if counter >= limit:
next_id = _id
@@ -483,7 +488,7 @@
"""
return self._content_get_range('fossology_license', start, end,
indexer_configuration_id, limit=limit,
- db=db, cur=cur)
+ with_textual_data=True, db=db, cur=cur)
@remote_api_endpoint('content_metadata/missing')
@db_transaction_generator()
diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py
--- a/swh/indexer/storage/db.py
+++ b/swh/indexer/storage/db.py
@@ -119,22 +119,32 @@
}
def content_get_range(self, content_type, start, end,
- indexer_configuration_id, limit=1000, cur=None):
+ indexer_configuration_id, limit=1000,
+ with_textual_data=False, cur=None):
"""Retrieve contents with content_type, within range [start, end]
bound by limit and associated to the given indexer
configuration id.
+ When asked for textual content only, this joins on the
+ content_mimetype table and keeps mimetypes matching 'text/%'.
+
"""
cur = self._cursor(cur)
table = self.content_indexer_names[content_type]
+ if with_textual_data:
+ extra = """inner join content_mimetype cm
+ on (t.id=cm.id and cm.mimetype like 'text/%%')"""
+ else:
+ extra = ""
query = """select t.id
from %s t
inner join indexer_configuration ic
on t.indexer_configuration_id=ic.id
+ %s
where ic.id=%%s and
%%s <= t.id and t.id <= %%s
order by t.indexer_configuration_id, t.id
- limit %%s""" % table
+ limit %%s""" % (table, extra)
cur.execute(query, (indexer_configuration_id, start, end, limit))
yield from cursor_to_bytes(cur)
diff --git a/swh/indexer/tasks.py b/swh/indexer/tasks.py
--- a/swh/indexer/tasks.py
+++ b/swh/indexer/tasks.py
@@ -10,7 +10,9 @@
from .mimetype import ContentMimetypeIndexer, MimetypeRangeIndexer
from .language import ContentLanguageIndexer
from .ctags import CtagsIndexer
-from .fossology_license import ContentFossologyLicenseIndexer
+from .fossology_license import (
+ ContentFossologyLicenseIndexer, FossologyLicenseRangeIndexer
+)
from .rehash import RecomputeChecksums
from .metadata import RevisionMetadataIndexer, OriginMetadataIndexer
from .origin_head import OriginHeadIndexer
@@ -59,7 +61,7 @@
class ContentMimetype(StatusTask):
- """Compute (mimetype, encoding) from the sha1's content.
+ """Compute (mimetype, encoding) on a list of sha1s' content.
"""
task_queue = 'swh_indexer_content_mimetype'
@@ -93,14 +95,21 @@
class ContentFossologyLicense(Task):
- """Task which computes licenses from the sha1's content.
+ """Compute fossology licenses on a list of sha1s' content.
"""
task_queue = 'swh_indexer_content_fossology_license'
-
Indexer = ContentFossologyLicenseIndexer
+class ContentRangeFossologyLicense(StatusTask):
+ """Compute fossology license on a range of sha1s.
+
+ """
+ task_queue = 'swh_indexer_content_fossology_license_range'
+ Indexer = FossologyLicenseRangeIndexer
+
+
class RecomputeChecksums(Task):
"""Task which recomputes hashes and possibly new ones.
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
--- a/swh/indexer/tests/storage/test_storage.py
+++ b/swh/indexer/tests/storage/test_storage.py
@@ -1706,11 +1706,67 @@
self.assertEqual(e.exception.args, (
'Development error: limit should not be None',))
+ def prepare_mimetypes_from(self, fossology_licenses):
+ """Fossology license needs some consistent data in db to run.
+
+ """
+ mimetypes = []
+ for c in fossology_licenses:
+ mimetypes.append({
+ 'id': c['id'],
+ 'mimetype': 'text/plain',
+ 'encoding': 'utf-8',
+ 'indexer_configuration_id': c['indexer_configuration_id'],
+ })
+ return mimetypes
+
@given(gen_content_fossology_licenses(min_size=1, max_size=4))
def test_generate_content_fossology_license_get_range_no_limit(
self, fossology_licenses):
"""license_get_range returns licenses within range provided"""
self.reset_storage_tables()
+ # craft some consistent mimetypes
+ mimetypes = self.prepare_mimetypes_from(fossology_licenses)
+
+ self.storage.content_mimetype_add(mimetypes)
+ # add fossology_licenses to storage
+ self.storage.content_fossology_license_add(fossology_licenses)
+
+ # All ids from the db
+ content_ids = sorted([c['id'] for c in fossology_licenses])
+
+ start = content_ids[0]
+ end = content_ids[-1]
+
+ # retrieve fossology_licenses
+ tool_id = fossology_licenses[0]['indexer_configuration_id']
+ actual_result = self.storage.content_fossology_license_get_range(
+ start, end, indexer_configuration_id=tool_id)
+
+ actual_ids = actual_result['ids']
+ actual_next = actual_result['next']
+
+ self.assertEqual(len(fossology_licenses), len(actual_ids))
+ self.assertIsNone(actual_next)
+ self.assertEqual(content_ids, actual_ids)
+
+ @given(gen_content_fossology_licenses(min_size=1, max_size=4),
+ gen_content_mimetypes(min_size=1, max_size=1))
+ def test_generate_content_fossology_license_get_range_no_limit_with_filter(
+ self, fossology_licenses, mimetypes):
+ """This filters non textual, then returns results within range"""
+ self.reset_storage_tables()
+
+ # craft some consistent mimetypes
+ _mimetypes = self.prepare_mimetypes_from(fossology_licenses)
+ # add binary mimetypes which will get filtered out in results
+ for m in mimetypes:
+ _mimetypes.append({
+ 'mimetype': 'binary',
+ **m,
+ })
+
+ self.storage.content_mimetype_add(_mimetypes)
# add fossology_licenses to storage
self.storage.content_fossology_license_add(fossology_licenses)
@@ -1737,8 +1793,11 @@
self, fossology_licenses):
"""fossology_license_get_range paginates results if limit exceeded"""
self.reset_storage_tables()
+ # craft some consistent mimetypes
+ mimetypes = self.prepare_mimetypes_from(fossology_licenses)
# add fossology_licenses to storage
+ self.storage.content_mimetype_add(mimetypes)
self.storage.content_fossology_license_add(fossology_licenses)
# input the list of sha1s we want from storage
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Wed, Jul 2, 10:55 AM (1 w, 5 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3214335
Attached To
D677: tasks: Open new fossology license range indexer task
Event Timeline
Log In to Comment