Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/fossology_license.py
# Copyright (C) 2016-2020 The Software Heritage developers | # Copyright (C) 2016-2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import logging | import logging | ||||
import subprocess | import subprocess | ||||
from typing import Any, Dict, List, Optional | from typing import Any, Dict, List, Optional, Union | ||||
from swh.model import hashutil | from swh.model import hashutil | ||||
from .indexer import ContentIndexer, ContentRangeIndexer, write_to_temp | from .indexer import ContentIndexer, ContentPartitionIndexer, write_to_temp | ||||
from swh.indexer.storage.interface import PagedResult, Sha1 | |||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
def compute_license(path): | def compute_license(path): | ||||
"""Determine license from file at path. | """Determine license from file at path. | ||||
Args: | Args: | ||||
Show All 29 Lines | except subprocess.CalledProcessError: | ||||
"path": path, | "path": path, | ||||
} | } | ||||
class MixinFossologyLicenseIndexer: | class MixinFossologyLicenseIndexer: | ||||
"""Mixin fossology license indexer. | """Mixin fossology license indexer. | ||||
See :class:`FossologyLicenseIndexer` and | See :class:`FossologyLicenseIndexer` and | ||||
:class:`FossologyLicenseRangeIndexer` | :class:`FossologyLicensePartitionIndexer` | ||||
""" | """ | ||||
ADDITIONAL_CONFIG = { | ADDITIONAL_CONFIG = { | ||||
"workdir": ("str", "/tmp/swh/indexer.fossology.license"), | "workdir": ("str", "/tmp/swh/indexer.fossology.license"), | ||||
"tools": ( | "tools": ( | ||||
"dict", | "dict", | ||||
{ | { | ||||
Show All 9 Lines | class MixinFossologyLicenseIndexer: | ||||
tool: Any | tool: Any | ||||
idx_storage: Any | idx_storage: Any | ||||
def prepare(self): | def prepare(self): | ||||
super().prepare() | super().prepare() | ||||
self.working_directory = self.config["workdir"] | self.working_directory = self.config["workdir"] | ||||
def index( | def index( | ||||
self, id: bytes, data: Optional[bytes] = None, **kwargs | self, id: Union[bytes, Dict], data: Optional[bytes] = None, **kwargs | ||||
) -> Dict[str, Any]: | ) -> Dict[str, Any]: | ||||
"""Index sha1s' content and store result. | """Index sha1s' content and store result. | ||||
Args: | Args: | ||||
id (bytes): content's identifier | id (bytes): content's identifier | ||||
raw_content (bytes): associated raw content to content id | raw_content (bytes): associated raw content to content id | ||||
Returns: | Returns: | ||||
▲ Show 20 Lines • Show All 54 Lines • ▼ Show 20 Lines | def filter(self, ids): | ||||
"""Filter out known sha1s and return only missing ones. | """Filter out known sha1s and return only missing ones. | ||||
""" | """ | ||||
yield from self.idx_storage.content_fossology_license_missing( | yield from self.idx_storage.content_fossology_license_missing( | ||||
({"id": sha1, "indexer_configuration_id": self.tool["id"],} for sha1 in ids) | ({"id": sha1, "indexer_configuration_id": self.tool["id"],} for sha1 in ids) | ||||
) | ) | ||||
class FossologyLicenseRangeIndexer(MixinFossologyLicenseIndexer, ContentRangeIndexer): | class FossologyLicensePartitionIndexer( | ||||
"""FossologyLicense Range Indexer working on range of content identifiers. | MixinFossologyLicenseIndexer, ContentPartitionIndexer | ||||
): | |||||
"""FossologyLicense Range Indexer working on range/partition of content identifiers. | |||||
- filters out the non textual content | - filters out the non textual content | ||||
- (optionally) filters out content already indexed (cf | - (optionally) filters out content already indexed (cf | ||||
:meth:`.indexed_contents_in_range`) | :meth:`.indexed_contents_in_partition`) | ||||
- reads content from objstorage per the content's id (sha1) | - reads content from objstorage per the content's id (sha1) | ||||
- computes {mimetype, encoding} from that content | - computes {mimetype, encoding} from that content | ||||
- stores result in storage | - stores result in storage | ||||
""" | """ | ||||
def indexed_contents_in_range(self, start, end): | def indexed_contents_in_partition( | ||||
"""Retrieve indexed content id within range [start, end]. | self, partition_id: int, nb_partitions: int, page_token: Optional[str] = None | ||||
) -> PagedResult[Sha1]: | |||||
"""Retrieve indexed content id within the partition id | |||||
Args: | Args: | ||||
start (bytes): Starting bound from range identifier | partition_id: Index of the partition to fetch | ||||
end (bytes): End range identifier | nb_partitions: Total number of partitions to split into | ||||
page_token: opaque token used for pagination | |||||
Returns: | Returns: | ||||
dict: a dict with keys: | PagedResult of Sha1. If next_page_token is None, there is no more data | ||||
to fetch | |||||
- **ids** [bytes]: iterable of content ids within the range. | |||||
- **next** (Optional[bytes]): The next range of sha1 starts at | |||||
this sha1 if any | |||||
""" | """ | ||||
return self.idx_storage.content_fossology_license_get_range( | return self.idx_storage.content_fossology_license_get_partition( | ||||
start, end, self.tool["id"] | self.tool["id"], partition_id, nb_partitions, page_token=page_token | ||||
) | ) | ||||
# alias for retrocompatibility | |||||
FossologyLicenseRangeIndexer = FossologyLicensePartitionIndexer |