Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/fossology_license.py
Show All 11 Lines | |||||
def compute_license(path, log=None): | def compute_license(path, log=None): | ||||
"""Determine license from file at path. | """Determine license from file at path. | ||||
Args: | Args: | ||||
path: filepath to determine the license | path: filepath to determine the license | ||||
Returns: | Returns: | ||||
A dict with the following keys: | dict: A dict with the following keys: | ||||
- licenses ([str]): associated detected licenses to path | - licenses ([str]): associated detected licenses to path | ||||
- path (bytes): content filepath | - path (bytes): content filepath | ||||
""" | """ | ||||
try: | try: | ||||
properties = subprocess.check_output(['nomossa', path], | properties = subprocess.check_output(['nomossa', path], | ||||
universal_newlines=True) | universal_newlines=True) | ||||
if properties: | if properties: | ||||
▲ Show 20 Lines • Show All 45 Lines • ▼ Show 20 Lines | class MixinFossologyLicenseIndexer: | ||||
def compute_license(self, path, log=None):
    """Detect the license(s) of the file located at ``path``.

    This method only delegates to the module-level ``compute_license``
    function, forwarding both arguments unchanged.

    Args:
        path: filepath to determine the license
        log: optional logger, passed through as the ``log`` keyword

    Returns:
        dict: the delegate's result; per its documentation, a dict with
        the following keys:

        - licenses ([str]): associated detected licenses to path
        - path (bytes): content filepath

    """
    detected = compute_license(path, log=log)
    return detected
def index(self, id, data):
    """Compute the license information for a single content.

    The content is written out through ``self.write_to_temp`` under a
    name derived from the hexadecimal form of ``id``, then passed to
    :meth:`compute_license`. ``self.cleanup`` is always called on that
    path, whether or not the license computation succeeded.

    Args:
        id (bytes): content's identifier
        data (bytes): raw content associated with the content id

    Returns:
        dict: the dict returned by :meth:`compute_license` (documented
        keys: ``licenses`` and ``path``), extended with:

        - id (bytes): content's identifier (sha1)
        - indexer_configuration_id (int): tool used to compute the output

    """
    # The hex form of the id doubles as the on-disk file name.
    tmp_path = self.write_to_temp(
        filename=hashutil.hash_to_hex(id),
        data=data)
    try:
        result = self.compute_license(path=tmp_path, log=self.log)
        result.update(
            id=id,
            indexer_configuration_id=self.tool['id'],
        )
    finally:
        # Runs on success and on failure alike.
        self.cleanup(tmp_path)
    return result
def persist_index_computations(self, results, policy_update):
    """Persist the computed license results in the indexer storage.

    Args:
        results ([dict]): list of content_license dicts, forwarded as-is
          to ``self.idx_storage.content_fossology_license_add``
        policy_update (str): ``'update-dups'`` asks the storage to update
          entries that already exist (``conflict_update=True``); any
          other value, e.g. ``'ignore-dups'``, results in
          ``conflict_update=False``

    """
    overwrite_existing = policy_update == 'update-dups'
    self.idx_storage.content_fossology_license_add(
        results, conflict_update=overwrite_existing)
class ContentFossologyLicenseIndexer(
        MixinFossologyLicenseIndexer, DiskIndexer, ContentIndexer):
    """Content indexer computing fossology licenses.

    As described by the original authors, it is in charge of:

    - filtering out content already indexed
    - reading content from objstorage per the content's id (sha1)
    - computing the licenses of that content
    - storing the result in storage

    Only the filtering step is defined in this class; the remaining
    steps come from the mixin and base classes.

    """

    def filter(self, ids):
        """Yield only the sha1s that have no license indexed yet.

        Each id is paired with the current tool's id and the resulting
        stream is handed to the indexer storage, which reports the
        missing ones.

        Args:
            ids: iterable of content identifiers (sha1)

        Yields:
            whatever ``content_fossology_license_missing`` yields for
            the entries it does not know about

        """
        find_missing = self.idx_storage.content_fossology_license_missing
        # Lazy on purpose: entries are built only as the storage consumes
        # them.
        candidates = (
            {'id': sha1, 'indexer_configuration_id': self.tool['id']}
            for sha1 in ids
        )
        yield from find_missing(candidates)
class FossologyLicenseRangeIndexer(
        MixinFossologyLicenseIndexer, DiskIndexer, ContentRangeIndexer):
    """FossologyLicense Range Indexer working on range of content identifiers.

    As described by the original authors, it:

    - filters out the non textual content
    - (optionally) filters out content already indexed (cf
      :meth:`.indexed_contents_in_range`)
    - reads content from objstorage per the content's id (sha1)
    - computes the licenses of that content
    - stores result in storage

    Only the range lookup is defined in this class; the remaining steps
    come from the mixin and base classes.

    """

    def indexed_contents_in_range(self, start, end):
        """Retrieve indexed content ids within the range [start, end].

        The lookup is restricted to results produced by the current tool
        (``self.tool['id']``).

        Args:
            start (bytes): Starting bound from range identifier
            end (bytes): End range identifier

        Returns:
            dict: the storage's answer, documented as a dict with keys:

            - **ids** [bytes]: iterable of content ids within the range.
            - **next** (Optional[bytes]): The next range of sha1 starts at
              this sha1 if any

        """
        get_range = self.idx_storage.content_fossology_license_get_range
        return get_range(start, end, self.tool['id'])