Differential D755 Diff 2372 swh/indexer/fossology_license.py

Changeset View

Standalone View

swh/indexer/fossology_license.py

Show All 11 Lines

def compute_license(path, log=None):		def compute_license(path, log=None):
"""Determine license from file at path.		"""Determine license from file at path.

Args:		Args:
path: filepath to determine the license		path: filepath to determine the license

Returns:		Returns:
A dict with the following keys:		dict: A dict with the following keys:

- licenses ([str]): associated detected licenses to path		- licenses ([str]): associated detected licenses to path
- path (bytes): content filepath		- path (bytes): content filepath

"""		"""
try:		try:
properties = subprocess.check_output(['nomossa', path],		properties = subprocess.check_output(['nomossa', path],
universal_newlines=True)		universal_newlines=True)
if properties:		if properties:
▲ Show 20 Lines • Show All 45 Lines • ▼ Show 20 Lines	class MixinFossologyLicenseIndexer:

def compute_license(self, path, log=None):
    """Detect the licenses of the file stored at ``path``.

    Thin method wrapper that forwards to the module-level
    :func:`compute_license` function.

    Args:
        path: filepath to determine the license
        log: optional logger, passed through unchanged

    Returns:
        dict: whatever the module-level function returns; per its
        documentation, a dict with the following keys:

        - licenses ([str]): associated detected licenses to path
        - path (bytes): content filepath

    """
    detected = compute_license(path, log=log)
    return detected

def index(self, id, data):
    """Compute the license information of one content.

    The raw content is written to a temporary file, the license
    detection is run on that file, and the temporary file is cleaned up
    whether or not the detection succeeded.

    Args:
        id (bytes): content's identifier
        data (bytes): raw content associated to the content id

    Returns:
        dict: the dict returned by :meth:`compute_license`, extended
        with:

        - id (bytes): content's identifier (sha1)
        - indexer_configuration_id (int): tool used to compute the output

    """
    # use the hex form of the id as the temporary file's name
    tmp_path = self.write_to_temp(
        filename=hashutil.hash_to_hex(id), data=data)

    try:
        result = self.compute_license(path=tmp_path, log=self.log)
        extra_fields = {
            'id': id,
            'indexer_configuration_id': self.tool['id'],
        }
        result.update(extra_fields)
    finally:
        # always remove the temporary file, even when detection failed
        self.cleanup(tmp_path)

    return result

def persist_index_computations(self, results, policy_update):
    """Store the computed license results in the indexer storage.

    Args:
        results ([dict]): list of content_license dicts, as produced by
            :meth:`index`
        policy_update (str): 'update-dups' asks the storage to update
            entries that already exist; any other value (e.g.
            'ignore-dups') leaves them untouched

    """
    add_licenses = self.idx_storage.content_fossology_license_add
    overwrite_existing = policy_update == 'update-dups'
    add_licenses(results, conflict_update=overwrite_existing)


class ContentFossologyLicenseIndexer(
        MixinFossologyLicenseIndexer, DiskIndexer, ContentIndexer):
    """Indexer in charge of:

    - filtering out content already indexed
    - reading content from objstorage per the content's id (sha1)
    - computing the licenses of that content
    - storing the result in storage

    """
    def filter(self, ids):
        """Filter out known sha1s and yield only the missing ones.

        Args:
            ids: iterable of content identifiers (sha1) to check

        Yields:
            the entries reported by the indexer storage as not yet
            indexed with the current tool

        """
        find_missing = self.idx_storage.content_fossology_license_missing
        # one lookup key per sha1, built lazily as the storage consumes it
        lookup_keys = (
            {
                'id': sha1,
                'indexer_configuration_id': self.tool['id'],
            }
            for sha1 in ids
        )
        yield from find_missing(lookup_keys)


class FossologyLicenseRangeIndexer(
        MixinFossologyLicenseIndexer, DiskIndexer, ContentRangeIndexer):
    """FossologyLicense Range Indexer working on range of content identifiers.

    It:

    - filters out the non textual content
    - (optionally) filters out content already indexed (cf
      :meth:`.indexed_contents_in_range`)
    - reads content from objstorage per the content's id (sha1)
    - computes the licenses of that content
    - stores result in storage

    """
    def indexed_contents_in_range(self, start, end):
        """Retrieve the indexed content ids within the range [start, end].

        Only entries computed with this indexer's tool are requested.

        Args:
            start (bytes): Starting bound from range identifier
            end (bytes): End range identifier

        Returns:
            dict: the storage's answer, a dict with keys:

            - ids [bytes]: iterable of content ids within the range.
            - next (Optional[bytes]): The next range of sha1 starts at
              this sha1 if any

        """
        fetch_range = self.idx_storage.content_fossology_license_get_range
        return fetch_range(start, end, self.tool['id'])