diff --git a/swh/indexer/fossology_license.py b/swh/indexer/fossology_license.py
--- a/swh/indexer/fossology_license.py
+++ b/swh/indexer/fossology_license.py
@@ -169,8 +169,6 @@
             for _id in contents:
                 yield _id
             start = result['next']
-            if start is None:
-                break
 
 
 @click.command(help='Compute license for path using tool')
diff --git a/swh/indexer/mimetype.py b/swh/indexer/mimetype.py
--- a/swh/indexer/mimetype.py
+++ b/swh/indexer/mimetype.py
@@ -162,8 +162,6 @@
             for _id in contents:
                 yield _id
             start = result['next']
-            if start is None:
-                break
 
 
 @click.command()
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -112,6 +112,7 @@
 
     def _content_get_range(self, content_type, start, end,
                            indexer_configuration_id, limit=1000,
+                           with_textual_data=False,
                            db=None, cur=None):
         """Retrieve ids of type content_type within range [start, end]
         bound by limit.
@@ -124,6 +125,10 @@
                              than start)
             **indexer_configuration_id** (int): The tool used to index data
             **limit** (int): Limit result (default to 1000)
+            **with_textual_data** (bool): Deal with only textual
+                                          content (True) or all
+                                          content (False, the
+                                          default)
 
         Raises:
             ValueError for;
@@ -148,7 +153,7 @@
         next_id = None
         for counter, obj in enumerate(db.content_get_range(
                 content_type, start, end, indexer_configuration_id,
-                limit=limit+1, cur=cur)):
+                limit=limit+1, with_textual_data=with_textual_data, cur=cur)):
             _id = obj[0]
             if counter >= limit:
                 next_id = _id
@@ -483,7 +488,7 @@
         """
         return self._content_get_range('fossology_license', start, end,
                                        indexer_configuration_id, limit=limit,
-                                       db=db, cur=cur)
+                                       with_textual_data=True, db=db, cur=cur)
 
     @remote_api_endpoint('content_metadata/missing')
     @db_transaction_generator()
diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py
--- a/swh/indexer/storage/db.py
+++ b/swh/indexer/storage/db.py
@@ -119,22 +119,32 @@
     }
 
     def content_get_range(self, content_type, start, end,
-                          indexer_configuration_id, limit=1000, cur=None):
+                          indexer_configuration_id, limit=1000,
+                          with_textual_data=False, cur=None):
         """Retrieve contents with content_type, within range [start, end]
            bound by limit and associated to the given indexer
            configuration id.
 
+        When asking to work on textual content, that filters on the
+        mimetype table, keeping only mimetypes starting with 'text/'.
+
         """
         cur = self._cursor(cur)
         table = self.content_indexer_names[content_type]
+        if with_textual_data:
+            extra = """inner join content_mimetype cm
+                       on (t.id=cm.id and cm.mimetype like 'text/%%')"""
+        else:
+            extra = ""
         query = """select t.id
                    from %s t
                    inner join indexer_configuration ic
                    on t.indexer_configuration_id=ic.id
+                   %s
                    where ic.id=%%s and
                          %%s <= t.id and t.id <= %%s
                    order by t.indexer_configuration_id, t.id
-                   limit %%s""" % table
+                   limit %%s""" % (table, extra)
         cur.execute(query, (indexer_configuration_id, start, end, limit))
         yield from cursor_to_bytes(cur)
 
diff --git a/swh/indexer/tasks.py b/swh/indexer/tasks.py
--- a/swh/indexer/tasks.py
+++ b/swh/indexer/tasks.py
@@ -10,7 +10,9 @@
 from .mimetype import ContentMimetypeIndexer, MimetypeRangeIndexer
 from .language import ContentLanguageIndexer
 from .ctags import CtagsIndexer
-from .fossology_license import ContentFossologyLicenseIndexer
+from .fossology_license import (
+    ContentFossologyLicenseIndexer, FossologyLicenseRangeIndexer
+)
 from .rehash import RecomputeChecksums
 from .metadata import RevisionMetadataIndexer, OriginMetadataIndexer
 from .origin_head import OriginHeadIndexer
@@ -59,7 +61,7 @@
 
 
 class ContentMimetype(StatusTask):
-    """Compute (mimetype, encoding) from the sha1's content.
+    """Compute (mimetype, encoding) on a list of sha1s' content.
 
     """
     task_queue = 'swh_indexer_content_mimetype'
@@ -93,14 +95,21 @@
 
 
 class ContentFossologyLicense(Task):
-    """Task which computes licenses from the sha1's content.
+    """Compute fossology licenses on a list of sha1s' content.
 
     """
     task_queue = 'swh_indexer_content_fossology_license'
-
     Indexer = ContentFossologyLicenseIndexer
 
 
+class ContentRangeFossologyLicense(StatusTask):
+    """Compute fossology license on a range of sha1s.
+
+    """
+    task_queue = 'swh_indexer_content_fossology_license_range'
+    Indexer = FossologyLicenseRangeIndexer
+
+
 class RecomputeChecksums(Task):
     """Task which recomputes hashes and possibly new ones.
 
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
--- a/swh/indexer/tests/storage/test_storage.py
+++ b/swh/indexer/tests/storage/test_storage.py
@@ -1706,11 +1706,67 @@
         self.assertEqual(e.exception.args, (
             'Development error: limit should not be None',))
 
+    def prepare_mimetypes_from(self, fossology_licenses):
+        """Craft one text/plain mimetype entry per fossology license.
+
+        """
+        mimetypes = []
+        for c in fossology_licenses:
+            mimetypes.append({
+                'id': c['id'],
+                'mimetype': 'text/plain',
+                'encoding': 'utf-8',
+                'indexer_configuration_id': c['indexer_configuration_id'],
+            })
+        return mimetypes
+
     @given(gen_content_fossology_licenses(min_size=1, max_size=4))
     def test_generate_content_fossology_license_get_range_no_limit(
             self, fossology_licenses):
         """license_get_range returns licenses within range provided"""
         self.reset_storage_tables()
+        # craft some consistent mimetypes
+        mimetypes = self.prepare_mimetypes_from(fossology_licenses)
+
+        self.storage.content_mimetype_add(mimetypes)
+        # add fossology_licenses to storage
+        self.storage.content_fossology_license_add(fossology_licenses)
+
+        # All ids from the db
+        content_ids = sorted([c['id'] for c in fossology_licenses])
+
+        start = content_ids[0]
+        end = content_ids[-1]
+
+        # retrieve fossology_licenses
+        tool_id = fossology_licenses[0]['indexer_configuration_id']
+        actual_result = self.storage.content_fossology_license_get_range(
+            start, end, indexer_configuration_id=tool_id)
+
+        actual_ids = actual_result['ids']
+        actual_next = actual_result['next']
+
+        self.assertEqual(len(fossology_licenses), len(actual_ids))
+        self.assertIsNone(actual_next)
+        self.assertEqual(content_ids, actual_ids)
+
+    @given(gen_content_fossology_licenses(min_size=1, max_size=4),
+           gen_content_mimetypes(min_size=1, max_size=1))
+    def test_generate_content_fossology_license_get_range_no_limit_with_filter(
+            self, fossology_licenses, mimetypes):
+        """This filters non textual, then returns results within range"""
+        self.reset_storage_tables()
+
+        # craft some consistent mimetypes
+        _mimetypes = self.prepare_mimetypes_from(fossology_licenses)
+        # add binary mimetypes (forced after **m), filtered out of results
+        for m in mimetypes:
+            _mimetypes.append({
+                **m,
+                'mimetype': 'binary',
+            })
+
+        self.storage.content_mimetype_add(_mimetypes)
 
         # add fossology_licenses to storage
         self.storage.content_fossology_license_add(fossology_licenses)
@@ -1737,8 +1793,11 @@
             self, fossology_licenses):
         """fossology_license_get_range paginates results if limit exceeded"""
         self.reset_storage_tables()
+        # craft some consistent mimetypes
+        mimetypes = self.prepare_mimetypes_from(fossology_licenses)
 
         # add fossology_licenses to storage
+        self.storage.content_mimetype_add(mimetypes)
         self.storage.content_fossology_license_add(fossology_licenses)
 
         # input the list of sha1s we want from storage