Changeset View
Standalone View
swh/indexer/fossology_license.py
# Copyright (C) 2016-2018 The Software Heritage developers | # Copyright (C) 2016-2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import logging | |||||
import subprocess | import subprocess | ||||
from typing import Optional | from typing import Any, Dict, List, Optional | ||||
from swh.model import hashutil | from swh.model import hashutil | ||||
from .indexer import ContentIndexer, ContentRangeIndexer, write_to_temp | from .indexer import ContentIndexer, ContentRangeIndexer, write_to_temp | ||||
def compute_license(path, log=None): | logger = logging.getLogger(__name__) | ||||
def compute_license(path): | |||||
"""Determine license from file at path. | """Determine license from file at path. | ||||
Args: | Args: | ||||
path: filepath to determine the license | path: filepath to determine the license | ||||
Returns: | Returns: | ||||
dict: A dict with the following keys: | dict: A dict with the following keys: | ||||
Show All 10 Lines | try: | ||||
else: | else: | ||||
licenses = [] | licenses = [] | ||||
return { | return { | ||||
'licenses': licenses, | 'licenses': licenses, | ||||
'path': path, | 'path': path, | ||||
} | } | ||||
except subprocess.CalledProcessError: | except subprocess.CalledProcessError: | ||||
if log: | |||||
from os import path as __path | from os import path as __path | ||||
log.exception('Problem during license detection for sha1 %s' % | logger.exception('Problem during license detection for sha1 %s' % | ||||
__path.basename(path)) | __path.basename(path)) | ||||
return { | return { | ||||
'licenses': [], | 'licenses': [], | ||||
'path': path, | 'path': path, | ||||
} | } | ||||
class MixinFossologyLicenseIndexer: | class MixinFossologyLicenseIndexer: | ||||
"""Mixin fossology license indexer. | """Mixin fossology license indexer. | ||||
Show All 10 Lines | ADDITIONAL_CONFIG = { | ||||
'configuration': { | 'configuration': { | ||||
'command_line': 'nomossa <filepath>', | 'command_line': 'nomossa <filepath>', | ||||
}, | }, | ||||
}), | }), | ||||
'write_batch_size': ('int', 1000), | 'write_batch_size': ('int', 1000), | ||||
} | } | ||||
CONFIG_BASE_FILENAME = 'indexer/fossology_license' # type: Optional[str] | CONFIG_BASE_FILENAME = 'indexer/fossology_license' # type: Optional[str] | ||||
tool: Any | |||||
idx_storage: Any | |||||
def prepare(self): | def prepare(self): | ||||
super().prepare() | super().prepare() | ||||
self.working_directory = self.config['workdir'] | self.working_directory = self.config['workdir'] | ||||
def index(self, id, data): | def index(self, id: bytes, data: Optional[bytes] = None, | ||||
**kwargs) -> Dict[str, Any]: | |||||
"""Index sha1s' content and store result. | """Index sha1s' content and store result. | ||||
Args: | Args: | ||||
id (bytes): content's identifier | id (bytes): content's identifier | ||||
raw_content (bytes): associated raw content to content id | raw_content (bytes): associated raw content to content id | ||||
Returns: | Returns: | ||||
dict: A dict, representing a content_license, with keys: | dict: A dict, representing a content_license, with keys: | ||||
- id (bytes): content's identifier (sha1) | - id (bytes): content's identifier (sha1) | ||||
- license (bytes): license in bytes | - license (bytes): license in bytes | ||||
- path (bytes): path | - path (bytes): path | ||||
- indexer_configuration_id (int): tool used to compute the output | - indexer_configuration_id (int): tool used to compute the output | ||||
""" | """ | ||||
assert isinstance(id, bytes) | assert isinstance(id, bytes) | ||||
vlorentz: Why is `data` optional?
[Done] ardumont: because we have a gazillion inconsistent indexers... RevisionIndexer does not pass the data…
assert data is not None | |||||
[Not Done] vlorentz: Why `type: ignore`?
[Done] ardumont: because i don't want to spend my week satisfying more mypy. without it, it says that the class… which is somewhat true, because it's not initialized in the __init__ method but in prepare, for valid reasons which i forgot (most possibly to be able to deal with initialization in tests).
[Done] vlorentz: you could add `tool: Any` in the class declaration
[Done] ardumont: thanks for that hint, i did not realize, i'll check that tomorrow ;)
with write_to_temp( | with write_to_temp( | ||||
filename=hashutil.hash_to_hex(id), # use the id as pathname | filename=hashutil.hash_to_hex(id), # use the id as pathname | ||||
data=data, | data=data, | ||||
working_directory=self.working_directory) as content_path: | working_directory=self.working_directory) as content_path: | ||||
properties = compute_license(path=content_path, log=self.log) | properties = compute_license(path=content_path) | ||||
properties.update({ | properties.update({ | ||||
'id': id, | 'id': id, | ||||
'indexer_configuration_id': self.tool['id'], | 'indexer_configuration_id': self.tool['id'], | ||||
}) | }) | ||||
return properties | return properties | ||||
def persist_index_computations(self, results, policy_update): | def persist_index_computations( | ||||
self, results: List[Dict], policy_update: str) -> Dict: | |||||
"""Persist the results in storage. | """Persist the results in storage. | ||||
Args: | Args: | ||||
results ([dict]): list of content_license, dict with the | results: list of content_license dict with the | ||||
following keys: | following keys: | ||||
- id (bytes): content's identifier (sha1) | - id (bytes): content's identifier (sha1) | ||||
- license (bytes): license in bytes | - license (bytes): license in bytes | ||||
- path (bytes): path | - path (bytes): path | ||||
policy_update ([str]): either 'update-dups' or 'ignore-dups' to | policy_update: either 'update-dups' or 'ignore-dups' to | ||||
respectively update duplicates or ignore them | respectively update duplicates or ignore them | ||||
""" | """ | ||||
self.idx_storage.content_fossology_license_add( | return self.idx_storage.content_fossology_license_add( | ||||
[Done] vlorentz: Why `type: ignore`?
[Done] ardumont: same. Also, note that i won't refactor the indexers any further right now... I want to be able to graph what's happening right now.
results, conflict_update=(policy_update == 'update-dups')) | results, conflict_update=(policy_update == 'update-dups')) | ||||
class FossologyLicenseIndexer( | class FossologyLicenseIndexer( | ||||
MixinFossologyLicenseIndexer, ContentIndexer): | MixinFossologyLicenseIndexer, ContentIndexer): | ||||
"""Indexer in charge of: | """Indexer in charge of: | ||||
- filtering out content already indexed | - filtering out content already indexed | ||||
▲ Show 20 Lines • Show All 46 Lines • Show Last 20 Lines |
vlorentz: Why is `data` optional?