Differential D3718 Diff 13117 swh/indexer/fossology_license.py

Changeset View

Standalone View

swh/indexer/fossology_license.py

# Copyright (C) 2016-2020 The Software Heritage developers		# Copyright (C) 2016-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution		# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version		# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information		# See top-level LICENSE file for more information

import logging		import logging
import subprocess		import subprocess

from typing import Any, Dict, List, Optional		from typing import Any, Dict, List, Optional, Union

from swh.model import hashutil		from swh.model import hashutil
from .indexer import ContentIndexer, ContentRangeIndexer, write_to_temp		from .indexer import ContentIndexer, ContentPartitionIndexer, write_to_temp

		from swh.indexer.storage.interface import PagedResult, Sha1

logger = logging.getLogger(__name__)		logger = logging.getLogger(__name__)


def compute_license(path):		def compute_license(path):
"""Determine license from file at path.		"""Determine license from file at path.

Args:		Args:
Show All 29 Lines	except subprocess.CalledProcessError:
"path": path,		"path": path,
}		}


class MixinFossologyLicenseIndexer:		class MixinFossologyLicenseIndexer:
"""Mixin fossology license indexer.		"""Mixin fossology license indexer.

See :class:`FossologyLicenseIndexer` and		See :class:`FossologyLicenseIndexer` and
:class:`FossologyLicenseRangeIndexer`		:class:`FossologyLicensePartitionIndexer`

"""		"""

ADDITIONAL_CONFIG = {		ADDITIONAL_CONFIG = {
"workdir": ("str", "/tmp/swh/indexer.fossology.license"),		"workdir": ("str", "/tmp/swh/indexer.fossology.license"),
"tools": (		"tools": (
"dict",		"dict",
{		{
Show All 9 Lines	class MixinFossologyLicenseIndexer:
tool: Any		tool: Any
idx_storage: Any		idx_storage: Any

def prepare(self):		def prepare(self):
super().prepare()		super().prepare()
self.working_directory = self.config["workdir"]		self.working_directory = self.config["workdir"]

def index(		def index(
self, id: bytes, data: Optional[bytes] = None, **kwargs		self, id: Union[bytes, Dict], data: Optional[bytes] = None, **kwargs
) -> Dict[str, Any]:		) -> Dict[str, Any]:
"""Index sha1s' content and store result.		"""Index sha1s' content and store result.

Args:		Args:
id (bytes): content's identifier		id (bytes): content's identifier
data (bytes): raw content associated with the content id		data (bytes): raw content associated with the content id

Returns:		Returns:
▲ Show 20 Lines • Show All 54 Lines • ▼ Show 20 Lines	def filter(self, ids):
"""Filter out known sha1s and return only missing ones.		"""Filter out known sha1s and return only missing ones.

"""		"""
yield from self.idx_storage.content_fossology_license_missing(		yield from self.idx_storage.content_fossology_license_missing(
({"id": sha1, "indexer_configuration_id": self.tool["id"],} for sha1 in ids)		({"id": sha1, "indexer_configuration_id": self.tool["id"],} for sha1 in ids)
)		)


class FossologyLicenseRangeIndexer(MixinFossologyLicenseIndexer, ContentRangeIndexer):		class FossologyLicensePartitionIndexer(
"""FossologyLicense Range Indexer working on range of content identifiers.		MixinFossologyLicenseIndexer, ContentPartitionIndexer
		):
		"""FossologyLicense Range Indexer working on range/partition of content identifiers.

- filters out the non textual content		- filters out the non textual content
- (optionally) filters out content already indexed (cf		- (optionally) filters out content already indexed (cf
:meth:`.indexed_contents_in_range`)		:meth:`.indexed_contents_in_partition`)
- reads content from objstorage per the content's id (sha1)		- reads content from objstorage per the content's id (sha1)
- computes the license(s) of that content		- computes the license(s) of that content
- stores result in storage		- stores result in storage

"""		"""

def indexed_contents_in_range(self, start, end):		def indexed_contents_in_partition(
"""Retrieve indexed content id within range [start, end].		self, partition_id: int, nb_partitions: int, page_token: Optional[str] = None
		) -> PagedResult[Sha1]:
		"""Retrieve indexed content id within the partition id

Args:		Args:
start (bytes): Starting bound from range identifier		partition_id: Index of the partition to fetch
end (bytes): End range identifier		nb_partitions: Total number of partitions to split into
		page_token: opaque token used for pagination

Returns:		Returns:
dict: a dict with keys:		PagedResult of Sha1. If next_page_token is None, there is no more data
		to fetch
- ids [bytes]: iterable of content ids within the range.
- next (Optional[bytes]): The next range of sha1 starts at
this sha1 if any

"""		"""
return self.idx_storage.content_fossology_license_get_range(		return self.idx_storage.content_fossology_license_get_partition(
start, end, self.tool["id"]		self.tool["id"], partition_id, nb_partitions, page_token=page_token
)		)


		# alias for retrocompatibility
		FossologyLicenseRangeIndexer = FossologyLicensePartitionIndexer