
diff --git a/swh/indexer/ctags.py b/swh/indexer/ctags.py
index b6f4bb7..0679692 100644
--- a/swh/indexer/ctags.py
+++ b/swh/indexer/ctags.py
@@ -1,167 +1,156 @@
# Copyright (C) 2015-2017 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import click
import subprocess
import json
from swh.model import hashutil
from .language import compute_language
from .indexer import ContentIndexer, DiskIndexer
# Options used to compute tags
__FLAGS = [
'--fields=+lnz', # +l: language
# +n: line number of tag definition
# +z: include the symbol's kind (function, variable, ...)
'--sort=no', # do not sort the output (keep input order)
'--links=no', # do not follow symlinks
'--output-format=json', # outputs in json
]
def run_ctags(path, lang=None, ctags_command='ctags'):
"""Run ctags on file path with optional language.
Args:
path: path to the file
lang: language for that path (optional)
Returns:
ctags' output
"""
optional = []
if lang:
optional = ['--language-force=%s' % lang]
cmd = [ctags_command] + __FLAGS + optional + [path]
output = subprocess.check_output(cmd, universal_newlines=True)
for symbol in output.split('\n'):
if not symbol:
continue
js_symbol = json.loads(symbol)
yield {
'name': js_symbol['name'],
'kind': js_symbol['kind'],
'line': js_symbol['line'],
'lang': js_symbol['language'],
}
class CtagsIndexer(ContentIndexer, DiskIndexer):
CONFIG_BASE_FILENAME = 'indexer/ctags'
ADDITIONAL_CONFIG = {
'workdir': ('str', '/tmp/swh/indexer.ctags'),
'tools': ('dict', {
'name': 'universal-ctags',
'version': '~git7859817b',
'configuration': {
'command_line': '''ctags --fields=+lnz --sort=no --links=no '''
'''--output-format=json <filepath>'''
},
}),
'languages': ('dict', {
'ada': 'Ada',
'adl': None,
'agda': None,
# ...
})
}
def prepare(self):
super().prepare()
self.working_directory = self.config['workdir']
self.language_map = self.config['languages']
self.tool = self.tools[0]
def filter(self, ids):
"""Filter out known sha1s and return only missing ones.
"""
yield from self.idx_storage.content_ctags_missing((
{
'id': sha1,
'indexer_configuration_id': self.tool['id'],
} for sha1 in ids
))
def compute_ctags(self, path, lang):
"""Compute ctags on file at path with language lang.
"""
return run_ctags(path, lang=lang)
def index(self, id, data):
"""Index sha1s' content and store result.
Args:
id (bytes): content's identifier
data (bytes): raw content in bytes
Returns:
A dict, representing a content_ctags, with keys:
- id (bytes): content's identifier (sha1)
- ctags ([dict]): ctags list of symbols
"""
lang = compute_language(data, log=self.log)['lang']
if not lang:
return None
ctags_lang = self.language_map.get(lang)
if not ctags_lang:
return None
ctags = {
'id': id,
}
filename = hashutil.hash_to_hex(id)
content_path = self.write_to_temp(
filename=filename,
data=data)
result = run_ctags(content_path, lang=ctags_lang)
ctags.update({
'ctags': list(result),
'indexer_configuration_id': self.tool['id'],
})
self.cleanup(content_path)
return ctags
def persist_index_computations(self, results, policy_update):
"""Persist the results in storage.
Args:
results ([dict]): list of content_ctags, dict with the
following keys:
- id (bytes): content's identifier (sha1)
- ctags ([dict]): ctags list of symbols
policy_update (str): either 'update-dups' or 'ignore-dups' to
respectively update duplicates or ignore them
"""
self.idx_storage.content_ctags_add(
results, conflict_update=(policy_update == 'update-dups'))
-
-
-@click.command()
-@click.option('--path', help="Path to execute index on")
-def main(path):
- r = list(run_ctags(path))
- print(r)
-
-
-if __name__ == '__main__':
- main()
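
With the click entry point removed above, run_ctags() remains usable directly. A minimal sketch of invoking it, assuming universal-ctags with JSON output support is installed and on PATH; the file path below is a placeholder:

# Minimal usage sketch for run_ctags(); '/tmp/example.py' is a hypothetical path.
from swh.indexer.ctags import run_ctags

for symbol in run_ctags('/tmp/example.py', lang='Python'):
    # Each yielded dict carries the tag name, kind, line number and language.
    print(symbol['name'], symbol['kind'], symbol['line'], symbol['lang'])
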
diff --git a/swh/indexer/fossology_license.py b/swh/indexer/fossology_license.py
index bac3810..37522b9 100644
--- a/swh/indexer/fossology_license.py
+++ b/swh/indexer/fossology_license.py
@@ -1,184 +1,172 @@
# Copyright (C) 2016-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import click
import subprocess
from swh.model import hashutil
from .indexer import ContentIndexer, ContentRangeIndexer, DiskIndexer
class MixinFossologyLicenseIndexer:
"""Mixin fossology license indexer.
See :class:`ContentFossologyLicenseIndexer` and
:class:`FossologyLicenseRangeIndexer`
"""
ADDITIONAL_CONFIG = {
'workdir': ('str', '/tmp/swh/indexer.fossology.license'),
'tools': ('dict', {
'name': 'nomos',
'version': '3.1.0rc2-31-ga2cbb8c',
'configuration': {
'command_line': 'nomossa <filepath>',
},
}),
'write_batch_size': ('int', 1000),
}
CONFIG_BASE_FILENAME = 'indexer/fossology_license'
def prepare(self):
super().prepare()
self.working_directory = self.config['workdir']
self.tool = self.tools[0]
def compute_license(self, path, log=None):
"""Determine license from file at path.
Args:
path: path of the file whose license to detect
Returns:
A dict with the following keys:
- licenses ([str]): licenses detected for the path
- path (bytes): content filepath
"""
try:
properties = subprocess.check_output(['nomossa', path],
universal_newlines=True)
if properties:
res = properties.rstrip().split(' contains license(s) ')
licenses = res[1].split(',')
return {
'licenses': licenses,
'path': path,
}
except subprocess.CalledProcessError:
if log:
from os import path as __path
log.exception('Problem during license detection for sha1 %s' %
__path.basename(path))
return {
'licenses': [],
'path': path,
}
def index(self, id, data):
"""Index sha1s' content and store result.
Args:
id (bytes): content's identifier
data (bytes): raw content in bytes
Returns:
A dict, representing a content_license, with keys:
- id (bytes): content's identifier (sha1)
- licenses ([str]): detected licenses
- path (bytes): path
"""
if isinstance(id, str):
id = hashutil.hash_to_hex(id)
content_path = self.write_to_temp(
filename=id,
data=data)
try:
properties = self.compute_license(path=content_path, log=self.log)
properties.update({
'id': id,
'indexer_configuration_id': self.tool['id'],
})
finally:
self.cleanup(content_path)
return properties
def persist_index_computations(self, results, policy_update):
"""Persist the results in storage.
Args:
results ([dict]): list of content_license, dict with the
following keys:
- id (bytes): content's identifier (sha1)
- licenses ([str]): detected licenses
- path (bytes): path
policy_update (str): either 'update-dups' or 'ignore-dups' to
respectively update duplicates or ignore them
"""
self.idx_storage.content_fossology_license_add(
results, conflict_update=(policy_update == 'update-dups'))
class ContentFossologyLicenseIndexer(
MixinFossologyLicenseIndexer, DiskIndexer, ContentIndexer):
"""Indexer in charge of:
- filtering out content already indexed
- reading content from objstorage per the content's id (sha1)
- computing {license} from that content
- store result in storage
"""
def filter(self, ids):
"""Filter out known sha1s and return only missing ones.
"""
yield from self.idx_storage.content_fossology_license_missing((
{
'id': sha1,
'indexer_configuration_id': self.tool['id'],
} for sha1 in ids
))
class FossologyLicenseRangeIndexer(
MixinFossologyLicenseIndexer, DiskIndexer, ContentRangeIndexer):
"""FossologyLicense Range Indexer working on range of content identifiers.
It:
- filters out the non textual content
- (optionally) filters out content already indexed (cf. :meth:`indexed_contents_in_range`)
- reads content from objstorage per the content's id (sha1)
- computes {license} from that content
- stores result in storage
"""
def indexed_contents_in_range(self, start, end):
"""Retrieve indexed content id within range [start, end].
Args
**start** (bytes): Starting bound from range identifier
**end** (bytes): End range identifier
Yields:
Content identifier (bytes) present in the range [start, end]
"""
while start:
result = self.idx_storage.content_fossology_license_get_range(
start, end, self.tool['id'])
contents = result['ids']
for _id in contents:
yield _id
start = result['next']
-
-
-@click.command(help='Compute license for path using tool')
-@click.option('--tool', default='nomossa', help="Path to tool")
-@click.option('--path', required=1, help="Path to execute index on")
-def main(tool, path):
- indexer = ContentFossologyLicenseIndexer()
- print(indexer.compute_license(tool, path))
-
-
-if __name__ == '__main__':
- main()
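
For reference, a small sketch of the parsing that compute_license() applies to nomos output, assuming the usual "File X contains license(s) A,B" format; the sample line below is made up:

# Hypothetical nomos output line; only the split logic mirrors compute_license().
properties = 'File example.c contains license(s) GPL-2.0,MIT'
res = properties.rstrip().split(' contains license(s) ')
licenses = res[1].split(',')
print(licenses)  # ['GPL-2.0', 'MIT']
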
diff --git a/swh/indexer/mimetype.py b/swh/indexer/mimetype.py
index 4342faa..7dd43af 100644
--- a/swh/indexer/mimetype.py
+++ b/swh/indexer/mimetype.py
@@ -1,169 +1,156 @@
# Copyright (C) 2016-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import click
import magic
from swh.model import hashutil
from .indexer import ContentIndexer, ContentRangeIndexer
def compute_mimetype_encoding(raw_content):
"""Determine mimetype and encoding from the raw content.
Args:
raw_content (bytes): content's raw data
Returns:
A dict with mimetype and encoding key and corresponding values
(as bytes).
"""
r = magic.detect_from_content(raw_content)
return {
'mimetype': r.mime_type.encode('utf-8'),
'encoding': r.encoding.encode('utf-8'),
}
class MixinMimetypeIndexer:
"""Mixin mimetype indexer.
See :class:`ContentMimetypeIndexer` and :class:`MimetypeRangeIndexer`
"""
ADDITIONAL_CONFIG = {
'tools': ('dict', {
'name': 'file',
'version': '1:5.30-1+deb9u1',
'configuration': {
"type": "library",
"debian-package": "python3-magic"
},
}),
'write_batch_size': ('int', 1000),
}
CONFIG_BASE_FILENAME = 'indexer/mimetype'
def prepare(self):
super().prepare()
self.tool = self.tools[0]
def index(self, id, data):
"""Index sha1s' content and store result.
Args:
id (bytes): content's identifier
data (bytes): raw content in bytes
Returns:
A dict, representing a content_mimetype, with keys:
- id (bytes): content's identifier (sha1)
- mimetype (bytes): mimetype in bytes
- encoding (bytes): encoding in bytes
"""
try:
properties = compute_mimetype_encoding(data)
properties.update({
'id': id,
'indexer_configuration_id': self.tool['id'],
})
except TypeError:
self.log.error('Error detecting mimetype for id %s' % (
hashutil.hash_to_hex(id), ))
return None
return properties
def persist_index_computations(self, results, policy_update):
"""Persist the results in storage.
Args:
results ([dict]): list of content_mimetype, dict with the
following keys:
- id (bytes): content's identifier (sha1)
- mimetype (bytes): mimetype in bytes
- encoding (bytes): encoding in bytes
policy_update (str): either 'update-dups' or 'ignore-dups' to
respectively update duplicates or ignore them
"""
self.idx_storage.content_mimetype_add(
results, conflict_update=(policy_update == 'update-dups'))
class ContentMimetypeIndexer(MixinMimetypeIndexer, ContentIndexer):
"""Mimetype Indexer working on list of content identifiers.
It:
- (optionally) filters out content already indexed (cf. :meth:`filter`)
- reads content from objstorage per the content's id (sha1)
- computes {mimetype, encoding} from that content
- stores result in storage
FIXME:
- 1. Rename redundant ContentMimetypeIndexer to MimetypeIndexer
- 2. Do we keep it afterwards? ~> i think this can be used with the journal
"""
def filter(self, ids):
"""Filter out known sha1s and return only missing ones.
"""
yield from self.idx_storage.content_mimetype_missing((
{
'id': sha1,
'indexer_configuration_id': self.tool['id'],
} for sha1 in ids
))
class MimetypeRangeIndexer(MixinMimetypeIndexer, ContentRangeIndexer):
"""Mimetype Range Indexer working on range of content identifiers.
It:
- (optionally) filters out content already indexed (cf. :meth:`indexed_contents_in_range`)
- reads content from objstorage per the content's id (sha1)
- computes {mimetype, encoding} from that content
- stores result in storage
"""
def indexed_contents_in_range(self, start, end):
"""Retrieve indexed content id within range [start, end].
Args
**start** (bytes): Starting bound from range identifier
**end** (bytes): End range identifier
Yields:
Content identifier (bytes) present in the range [start, end]
"""
while start:
result = self.idx_storage.content_mimetype_get_range(
start, end, self.tool['id'])
contents = result['ids']
for _id in contents:
yield _id
start = result['next']
-
-
-@click.command()
-@click.option('--path', help="Path to execute index on")
-def main(path):
- with open(path, 'rb') as f:
- raw_content = f.read()
-
- print(compute_mimetype_encoding(raw_content))
-
-
-if __name__ == '__main__':
- main()
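
A minimal sketch of what the removed mimetype CLI did, calling compute_mimetype_encoding() on a file read as bytes (assumes the libmagic bindings providing magic.detect_from_content are installed; the path and printed result are illustrative only):

from swh.indexer.mimetype import compute_mimetype_encoding

# '/tmp/example.txt' is a placeholder path; the function expects raw bytes.
with open('/tmp/example.txt', 'rb') as f:
    raw_content = f.read()

print(compute_mimetype_encoding(raw_content))
# e.g. {'mimetype': b'text/plain', 'encoding': b'us-ascii'}
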
