Differential D2761 Diff 9837 swh/indexer/fossology_license.py

Changeset View

Standalone View

swh/indexer/fossology_license.py

# Copyright (C) 2016-2018 The Software Heritage developers		# Copyright (C) 2016-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution		# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version		# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information		# See top-level LICENSE file for more information

		import logging
import subprocess		import subprocess

from typing import Optional		from typing import Any, Dict, List, Optional

from swh.model import hashutil		from swh.model import hashutil
from .indexer import ContentIndexer, ContentRangeIndexer, write_to_temp		from .indexer import ContentIndexer, ContentRangeIndexer, write_to_temp


def compute_license(path, log=None):		logger = logging.getLogger(__name__)


		def compute_license(path):
"""Determine license from file at path.		"""Determine license from file at path.

Args:		Args:
path: filepath to determine the license		path: filepath to determine the license

Returns:		Returns:
dict: A dict with the following keys:		dict: A dict with the following keys:

Show All 10 Lines	try:
else:		else:
licenses = []		licenses = []

return {		return {
'licenses': licenses,		'licenses': licenses,
'path': path,		'path': path,
}		}
except subprocess.CalledProcessError:		except subprocess.CalledProcessError:
if log:
from os import path as __path		from os import path as __path
log.exception('Problem during license detection for sha1 %s' %		logger.exception('Problem during license detection for sha1 %s' %
__path.basename(path))		__path.basename(path))
return {		return {
'licenses': [],		'licenses': [],
'path': path,		'path': path,
}		}


class MixinFossologyLicenseIndexer:		class MixinFossologyLicenseIndexer:
"""Mixin fossology license indexer.		"""Mixin fossology license indexer.
Show All 10 Lines	ADDITIONAL_CONFIG = {
'configuration': {		'configuration': {
'command_line': 'nomossa <filepath>',		'command_line': 'nomossa <filepath>',
},		},
}),		}),
'write_batch_size': ('int', 1000),		'write_batch_size': ('int', 1000),
}		}

CONFIG_BASE_FILENAME = 'indexer/fossology_license' # type: Optional[str]		CONFIG_BASE_FILENAME = 'indexer/fossology_license' # type: Optional[str]
		tool: Any
		idx_storage: Any

def prepare(self):		def prepare(self):
super().prepare()		super().prepare()
self.working_directory = self.config['workdir']		self.working_directory = self.config['workdir']

def index(self, id, data):		def index(self, id: bytes, data: Optional[bytes] = None,
		**kwargs) -> Dict[str, Any]:
"""Index sha1s' content and store result.		"""Index sha1s' content and store result.

Args:		Args:
id (bytes): content's identifier		id (bytes): content's identifier
raw_content (bytes): associated raw content to content id		raw_content (bytes): associated raw content to content id

Returns:		Returns:
dict: A dict, representing a content_license, with keys:		dict: A dict, representing a content_license, with keys:

- id (bytes): content's identifier (sha1)		- id (bytes): content's identifier (sha1)
- license (bytes): license in bytes		- license (bytes): license in bytes
- path (bytes): path		- path (bytes): path
- indexer_configuration_id (int): tool used to compute the output		- indexer_configuration_id (int): tool used to compute the output

"""		"""
assert isinstance(id, bytes)		assert isinstance(id, bytes)
		vlorentzUnsubmitted Not Done Inline Actions Why is `data` optional? vlorentz: Why is `data` optional?
		ardumontAuthorUnsubmitted Done Inline Actions because we have a gazillion inconsistent indexers... RevisionIndexer does not pass the data along... ardumont: because we have a gazillion inconsistent indexers... RevisionIndexer does not pass the data…
		assert data is not None
		vlorentzUnsubmitted Not Done Inline Actions Why `type: ignore`? vlorentz: Why `type: ignore`?
		ardumontAuthorUnsubmitted Done Inline Actions because I don't want to spend my week satisfying mypy any further. Without it, mypy says that the class has no `tool`... which is somewhat true, because it's not initialized in the `__init__` method but in `prepare`, for valid reasons which I forgot (most likely to be able to deal with initialization in tests). ardumont: because I don't want to spend my week satisfying mypy any further. Without it, mypy says that the class…
		vlorentzUnsubmitted Done Inline Actions you could add `tool: Any` in the class declaration vlorentz: you could add `tool: Any` in the class declaration
		ardumontAuthorUnsubmitted Done Inline Actions thanks for that hint, i did not realize, i'll check that tomorrow ;) ardumont: thanks for that hint, i did not realize, i'll check that tomorrow ;)
with write_to_temp(		with write_to_temp(
filename=hashutil.hash_to_hex(id), # use the id as pathname		filename=hashutil.hash_to_hex(id), # use the id as pathname
data=data,		data=data,
working_directory=self.working_directory) as content_path:		working_directory=self.working_directory) as content_path:
properties = compute_license(path=content_path, log=self.log)		properties = compute_license(path=content_path)
properties.update({		properties.update({
'id': id,		'id': id,
'indexer_configuration_id': self.tool['id'],		'indexer_configuration_id': self.tool['id'],
})		})
return properties		return properties

def persist_index_computations(self, results, policy_update):		def persist_index_computations(
		self, results: List[Dict], policy_update: str) -> Dict:
"""Persist the results in storage.		"""Persist the results in storage.

Args:		Args:
results ([dict]): list of content_license, dict with the		results: list of content_license dict with the
following keys:		following keys:

- id (bytes): content's identifier (sha1)		- id (bytes): content's identifier (sha1)
- license (bytes): license in bytes		- license (bytes): license in bytes
- path (bytes): path		- path (bytes): path

policy_update ([str]): either 'update-dups' or 'ignore-dups' to		policy_update: either 'update-dups' or 'ignore-dups' to
respectively update duplicates or ignore them		respectively update duplicates or ignore them

"""		"""
self.idx_storage.content_fossology_license_add(		return self.idx_storage.content_fossology_license_add(
		vlorentzUnsubmitted Done Inline Actions Why `type: ignore`? vlorentz: Why `type: ignore`?
		ardumontAuthorUnsubmitted Done Inline Actions same. Also, note that I won't refactor the indexers any further right now... I want to be able to graph what's happening right now. The modifications I'm making are meant to get there. I'm trying to move as few cogs as possible... ardumont: same. Also, note that I won't refactor the indexers any further right now... I want to be able to…
results, conflict_update=(policy_update == 'update-dups'))		results, conflict_update=(policy_update == 'update-dups'))


class FossologyLicenseIndexer(		class FossologyLicenseIndexer(
MixinFossologyLicenseIndexer, ContentIndexer):		MixinFossologyLicenseIndexer, ContentIndexer):
"""Indexer in charge of:		"""Indexer in charge of:

- filtering out content already indexed		- filtering out content already indexed
▲ Show 20 Lines • Show All 46 Lines • Show Last 20 Lines