# Copyright (C) 2016-2017  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

# NOTE: this is the post-patch state of swh/indexer/fossology_license.py.
# The patch it was taken from also raises the swh.storage requirement:
#   debian/control:        python3-swh.storage (>= 0.0.84~) -> (>= 0.0.85~)
#   requirements-swh.txt:  swh.storage >= 0.0.84 -> swh.storage >= 0.0.85

import os
import subprocess

import click

from swh.model import hashutil

from .indexer import BaseIndexer, DiskIndexer


# Text expected between the scanned file name and the comma-separated
# license names in the tool's output.
_LICENSE_SEPARATOR = ' contains license(s) '


def compute_license(path, log=None, tool='nomossa'):
    """Determine license from file at path.

    The tool is run as ``<tool> <path>`` and its standard output is parsed
    as ``<something> contains license(s) <name>,<name>,...``.

    Args:
        path: filepath to determine the license
        log: optional logger; when given, a non-zero exit status of the
            tool is reported through ``log.exception``
        tool (str): executable to run (defaults to ``nomossa``)

    Returns:
        A dict with the following keys:
        - licenses ([str]): associated detected licenses to path; empty
          when the tool exits with an error, prints nothing, or prints
          something that does not contain the expected separator
        - path: content filepath, exactly as received

    """
    try:
        output = subprocess.check_output([tool, path],
                                         universal_newlines=True)
    except subprocess.CalledProcessError:
        if log:
            log.exception('Problem during license detection for sha1 %s',
                          os.path.basename(path))
        output = ''

    # rpartition: only what follows the *last* separator is the license
    # list, so a separator-like string inside the file name is harmless.
    # When the separator is absent `found` is empty and no license is
    # reported, instead of failing with an IndexError or returning None.
    _, found, names = output.rstrip().rpartition(_LICENSE_SEPARATOR)
    return {
        'licenses': names.split(',') if found else [],
        'path': path,
    }


class ContentFossologyLicenseIndexer(BaseIndexer, DiskIndexer):
    """Indexer in charge of:
    - filtering out content already indexed
    - reading content from objstorage per the content's id (sha1)
    - computing the licenses of that content
    - store result in storage

    """
    ADDITIONAL_CONFIG = {
        'workdir': ('str', '/tmp/swh/indexer.fossology.license'),
        'tools': ('dict', {
            'name': 'nomos',
            'version': '3.1.0rc2-31-ga2cbb8c',
            'configuration': {
                'command_line': 'nomossa ',
            },
        }),
    }

    CONFIG_BASE_FILENAME = 'indexer/fossology_license'

    def __init__(self):
        """Initialize the indexer and read the working directory from the
        'workdir' configuration entry.

        """
        super().__init__()
        self.working_directory = self.config['workdir']

    def filter_contents(self, sha1s):
        """Filter out known sha1s and return only missing ones.

        Args:
            sha1s: iterable of content identifiers

        Yields:
            whatever ``storage.content_fossology_license_missing`` yields
            for the given identifiers and this indexer's tool id

        """
        tools = self.retrieve_tools_information()
        yield from self.storage.content_fossology_license_missing((
            {
                'id': sha1,
                'indexer_configuration_id': tools['id'],
            } for sha1 in sha1s
        ))

    def index_content(self, sha1, raw_content):
        """Compute the licenses of one content.

        The raw content is written to a temporary file named after the
        hexadecimal sha1, scanned, and the temporary file is cleaned up
        whether or not the scan succeeded.

        Args:
            sha1 (bytes): content's identifier
            raw_content (bytes): raw content in bytes

        Returns:
            A dict, representing a content_license, with keys:
            - id (bytes): content's identifier (sha1)
            - licenses ([str]): detected licenses (possibly empty)
            - path: path of the temporary file that was scanned
            - indexer_configuration_id: value of ``self.tools['id']``

        """
        filename = hashutil.hash_to_hex(sha1)
        content_path = self.write_to_temp(
            filename=filename,
            data=raw_content)

        try:
            properties = compute_license(path=content_path, log=self.log)
            properties.update({
                'id': sha1,
                'indexer_configuration_id': self.tools['id'],
            })
        finally:
            self.cleanup(content_path)

        return properties

    def persist_index_computations(self, results, policy_update):
        """Persist the results in storage.

        Args:
            results ([dict]): list of content_license, dict with the
              following keys:
              - id (bytes): content's identifier (sha1)
              - licenses ([str]): detected licenses
              - path: path of the file that was scanned
              - indexer_configuration_id: id of the tool used
            policy_update ([str]): either 'update-dups' or 'ignore-dups' to
              respectively update duplicates or ignore them

        """
        # The return value of the storage call is not used.
        # NOTE(review): presumably unknown licenses no longer need to be
        # reported with swh.storage >= 0.0.85 -- confirm against the
        # storage API.
        self.storage.content_fossology_license_add(
            results, conflict_update=(policy_update == 'update-dups'))


@click.command(help='Compute license for path using tool')
@click.option('--tool', default='nomossa', help="Path to tool")
@click.option('--path', required=True, help="Path to execute index on")
def main(tool, path):
    """Print the license information computed for ``path`` with ``tool``."""
    # `path` must be the first argument of compute_license; the tool is
    # passed by keyword so it is not mistaken for the path or the logger.
    print(compute_license(path, tool=tool))


if __name__ == '__main__':
    main()