Rename the license indexer to fossology_license: module, indexer class,
task class, task queue, configuration file name and storage endpoints.

While moving the module, two runtime errors in it are fixed as well:
  * compute_license split the bytes output of the tool with str separators
    (TypeError on Python 3) and returned None when the output was empty;
  * index_content read the configuration keys 'tool_name'/'tool_version',
    which are not declared, instead of the values loaded in __init__.

diff --git a/swh/indexer/__init__.py b/swh/indexer/__init__.py
index 1d36165..f7cb4fb 100644
--- a/swh/indexer/__init__.py
+++ b/swh/indexer/__init__.py
@@ -1,27 +1,28 @@
 # Copyright (C) 2016 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 
 INDEXER_CLASSES = {
     'mimetype': 'swh.indexer.mimetype.ContentMimetypeIndexer',
     'language': 'swh.indexer.language.ContentLanguageIndexer',
     'ctags': 'swh.indexer.ctags.CtagsIndexer',
-    'license': 'swh.indexer.license.ContentLicenseIndexer',
+    'fossology_license':
+    'swh.indexer.fossology_license.ContentFossologyLicenseIndexer',
 }
 
 
 TASK_NAMES = {
     'orchestrator_all': 'swh.indexer.tasks.SWHOrchestratorAllContentsTask',
     'orchestrator_text': 'swh.indexer.tasks.SWHOrchestratorTextContentsTask',
     'mimetype': 'swh.indexer.tasks.SWHContentMimetypeTask',
     'language': 'swh.indexer.tasks.SWHContentLanguageTask',
     'ctags': 'swh.indexer.tasks.SWHCtagsTask',
-    'license': 'swh.indexer.tasks.SWHContentLicenseTask',
+    'fossology_license': 'swh.indexer.tasks.SWHContentFossologyLicenseTask',
 }
 
 
 __all__ = [
     'INDEXER_CLASSES', 'TASK_NAMES',
 ]
diff --git a/swh/indexer/license.py b/swh/indexer/fossology_license.py
similarity index 92%
rename from swh/indexer/license.py
rename to swh/indexer/fossology_license.py
index 3665277..661e228 100644
--- a/swh/indexer/license.py
+++ b/swh/indexer/fossology_license.py
@@ -1,134 +1,138 @@
 # Copyright (C) 2016 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import click
 import subprocess
 
 from swh.core import hashutil
 
 from .indexer import BaseIndexer, DiskIndexer
 
 
 def compute_license(tool, path):
     """Determine license from file at path.
 
     Args:
+        tool (str): command used to detect the licenses
         path: filepath to determine the license
 
     Returns:
         A dict with the following keys:
-        - licenses ([str]): associated detected licenses to path
+        - licenses ([bytes]): associated detected licenses to path
         - path (bytes): content filepath
-        - tool (str): tool used to compute the output
 
     """
     properties = subprocess.check_output([tool, path],
                                          universal_newlines=False)
-    if properties:
-        res = properties.rstrip().split(' contains license(s) ')
-        licenses = res[1].split(',')
-
-        return {
-            'licenses': licenses,
-            'path': path,
-        }
+    # check_output yields bytes here (universal_newlines=False), so the
+    # separators must be bytes too; str separators raise TypeError.
+    res = properties.rstrip().split(b' contains license(s) ')
+    # Without the separator in the output, no license was reported.
+    licenses = res[1].split(b',') if len(res) > 1 else []
+
+    return {
+        'licenses': licenses,
+        'path': path,
+    }
 
 
-class ContentLicenseIndexer(BaseIndexer, DiskIndexer):
+class ContentFossologyLicenseIndexer(BaseIndexer, DiskIndexer):
     """Indexer in charge of:
     - filtering out content already indexed
     - reading content from objstorage per the content's id (sha1)
-    - computing {license, encoding} from that content
+    - computing the licenses from that content
     - store result in storage
 
     """
     ADDITIONAL_CONFIG = {
-        'workdir': ('str', '/tmp/swh/indexer.license'),
+        'workdir': ('str', '/tmp/swh/indexer.fossology.license'),
         'tool': ('dict', {
             'cli': '/usr/local/bin/nomossa',
             'name': 'nomos',
             'version': '3.1.0rc2-31-ga2cbb8c'
         }),
     }
 
-    CONFIG_BASE_FILENAME = 'indexer/license'
+    CONFIG_BASE_FILENAME = 'indexer/fossology_license'
 
     def __init__(self):
         super().__init__()
         self.working_directory = self.config['workdir']
         self.tool = self.config['tool']['cli']
         self.tool_name = self.config['tool']['name']
         self.tool_version = self.config['tool']['version']
 
     def filter_contents(self, sha1s):
         """Filter out known sha1s and return only missing ones.
 
         """
-        yield from self.storage.content_license_missing(sha1s)
+        yield from self.storage.content_fossology_license_missing(sha1s)
 
     def index_content(self, sha1, content):
         """Index sha1s' content and store result.
 
         Args:
             sha1 (bytes): content's identifier
             content (bytes): raw content in bytes
 
         Returns:
             A dict, representing a content_license, with keys:
               - id (bytes): content's identifier (sha1)
-              - license (bytes): license in bytes
+              - licenses ([bytes]): detected licenses
               - path (bytes): path
+              - tool_name (str): name of the tool used
+              - tool_version (str): version of the tool used
 
         """
         filename = hashutil.hash_to_hex(sha1)
         content_path = self.write_to_temp(
             filename=filename,
             data=content)
 
         properties = compute_license(self.tool, path=content_path)
         properties.update({
             'id': sha1,
-            'tool_name': self.config['tool_name'],
-            'tool_version': self.config['tool_version']
+            'tool_name': self.tool_name,
+            'tool_version': self.tool_version,
         })
 
         self.log.info('Licenses: %s' % properties['licenses'])
 
         self.cleanup(content_path)
         return properties
 
     def persist_index_computations(self, results, policy_update):
         """Persist the results in storage.
 
         Args:
             results ([dict]): list of content_license, dict with the
             following keys:
               - id (bytes): content's identifier (sha1)
               - license (bytes): license in bytes
               - path (bytes): path
             policy_update ([str]): either 'update-dups' or 'ignore-dups' to
             respectively update duplicates or ignore them
 
         """
-        wrong_licenses = self.storage.content_license_add(
+        wrong_licenses = self.storage.content_fossology_license_add(
             results, conflict_update=(policy_update == 'update-dups'))
 
         if wrong_licenses:
             for l in wrong_licenses:
                 self.log.warn('Content %s has some unknown licenses: %s' % (
                     hashutil.hash_to_hex(l['id']),
                     ','.join((name.decode('utf-8') for name in l['licenses'])))
                 )
 
 
 @click.command(help='Compute license for path using tool')
 @click.option('--tool', default='nomossa', help="Path to tool")
 @click.option('--path', required=1, help="Path to execute index on")
 def main(tool, path):
     print(compute_license(tool, path))
 
 
 if __name__ == '__main__':
     main()
diff --git a/swh/indexer/tasks.py b/swh/indexer/tasks.py
index 8fd5876..167ccd6 100644
--- a/swh/indexer/tasks.py
+++ b/swh/indexer/tasks.py
@@ -1,75 +1,75 @@
 # Copyright (C) 2016 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from swh.scheduler.task import Task
 
 from .orchestrator import OrchestratorAllContentsIndexer
 from .orchestrator import OrchestratorTextContentsIndexer
 from .mimetype import ContentMimetypeIndexer
 from .language import ContentLanguageIndexer
 from .ctags import CtagsIndexer
-from .license import ContentLicenseIndexer
+from .fossology_license import ContentFossologyLicenseIndexer
 
 
 class SWHOrchestratorAllContentsTask(Task):
     """Main task in charge of reading batch contents (of any type) and
     broadcasting them back to other tasks.
 
     """
     task_queue = 'swh_indexer_orchestrator_content_all'
 
     def run(self, *args, **kwargs):
         OrchestratorAllContentsIndexer().run(*args, **kwargs)
 
 
 class SWHOrchestratorTextContentsTask(Task):
     """Main task in charge of reading batch contents (of type text) and
     broadcasting them back to other tasks.
 
     """
     task_queue = 'swh_indexer_orchestrator_content_text'
 
     def run(self, *args, **kwargs):
         OrchestratorTextContentsIndexer().run(*args, **kwargs)
 
 
 class SWHContentMimetypeTask(Task):
     """Task which computes the mimetype, encoding from the sha1's content.
 
     """
     task_queue = 'swh_indexer_content_mimetype'
 
     def run(self, *args, **kwargs):
         ContentMimetypeIndexer().run(*args, **kwargs)
 
 
 class SWHContentLanguageTask(Task):
     """Task which computes the language from the sha1's content.
 
     """
     task_queue = 'swh_indexer_content_language'
 
     def run(self, *args, **kwargs):
         ContentLanguageIndexer().run(*args, **kwargs)
 
 
 class SWHCtagsTask(Task):
     """Task which computes ctags from the sha1's content.
 
     """
     task_queue = 'swh_indexer_content_ctags'
 
     def run(self, *args, **kwargs):
         CtagsIndexer().run(*args, **kwargs)
 
 
-class SWHContentLicenseTask(Task):
+class SWHContentFossologyLicenseTask(Task):
     """Task which computes licenses from the sha1's content.
 
     """
-    task_queue = 'swh_indexer_content_license'
+    task_queue = 'swh_indexer_content_fossology_license'
 
     def run(self, *args, **kwargs):
-        ContentLicenseIndexer().run(*args, **kwargs)
+        ContentFossologyLicenseIndexer().run(*args, **kwargs)