metadata.py
# Copyright (C) 2017 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import click
import logging

from swh.indexer.indexer import ContentIndexer, RevisionIndexer, OriginIndexer
from swh.indexer.metadata_dictionary import MAPPINGS
from swh.indexer.metadata_detector import detect_metadata
from swh.indexer.metadata_detector import extract_minimal_metadata_dict
from swh.indexer.storage import INDEXER_CFG_KEY

from swh.model import hashutil
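
# A note on the imported names, as they are used below (not an exhaustive
# contract): MAPPINGS maps a context name such as 'NpmMapping' or
# 'CodemetaMapping' to a translator exposing translate(raw_content);
# detect_metadata and extract_minimal_metadata_dict come from the
# swh-metadata-detector tooling.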
class ContentMetadataIndexer(ContentIndexer):
    """Content-level indexer

    This indexer is in charge of:

    - filtering out content already indexed in content_metadata
    - reading content from objstorage with the content's id sha1
    - computing translated_metadata for a given context
    - using the metadata_dictionary as the 'swh-metadata-translator' tool
    - storing the result in the content_metadata table
    """
    CONFIG_BASE_FILENAME = 'indexer/content_metadata'

    def __init__(self, tool, config):
        # twisted way to use the exact same config as the
        # RevisionMetadataIndexer object that internally uses
        # ContentMetadataIndexer
        self.config = config
        self.config['tools'] = tool
        super().__init__()
    def filter(self, ids):
        """Filter out known sha1s and return only missing ones.
        """
        yield from self.idx_storage.content_metadata_missing((
            {
                'id': sha1,
                'indexer_configuration_id': self.tool['id'],
            } for sha1 in ids
        ))
    def index(self, id, data):
        """Index sha1s' content and store result.

        Args:
            id (bytes): content's identifier
            data (bytes): raw content in bytes

        Returns:
            dict: dictionary representing a content_metadata. If the
            translation wasn't successful, the translated_metadata key
            will be None.
        """
        result = {
            'id': id,
            'indexer_configuration_id': self.tool['id'],
            'translated_metadata': None
        }
        try:
            mapping_name = self.tool['tool_configuration']['context']
            result['translated_metadata'] = MAPPINGS[mapping_name] \
                .translate(data)
            # a twisted way to keep the result on the indexer object
            # for get_results
            self.results.append(result)
        except Exception:
            self.log.exception(
                "Problem during tool retrieval of metadata translation")
        return result
    def persist_index_computations(self, results, policy_update):
        """Persist the results in storage.

        Args:
            results ([dict]): list of content_metadata dicts with the
              following keys:

              - id (bytes): content's identifier (sha1)
              - translated_metadata (jsonb): detected metadata

            policy_update (str): either 'update-dups' or 'ignore-dups' to
              respectively update duplicates or ignore them
        """
        self.idx_storage.content_metadata_add(
            results, conflict_update=(policy_update == 'update-dups'))
    def get_results(self):
        """Can only be called after the run method has completed.

        Returns:
            list: list of content_metadata entries calculated by the
            current indexer
        """
        return self.results
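
# Usage sketch (illustrative, not part of the original control flow): this
# class is normally driven by RevisionMetadataIndexer.translate_revision_metadata
# below, which builds the tool and config dicts like so:
#
#     tool = {'name': 'swh-metadata-translator', 'version': '0.0.2',
#             'configuration': {'type': 'local', 'context': 'NpmMapping'}}
#     config = {INDEXER_CFG_KEY: idx_storage, 'objstorage': objstorage}
#     c_metadata_indexer = ContentMetadataIndexer(tool, config)
#     c_metadata_indexer.run(sha1s, policy_update='ignore-dups')
#     results = c_metadata_indexer.get_results()
#
# idx_storage, objstorage and sha1s are assumed to be provided by the caller.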
class RevisionMetadataIndexer(RevisionIndexer):
    """Revision-level indexer

    This indexer is in charge of:

    - filtering out revisions already indexed in the revision_metadata table
      with the defined computation tool
    - retrieving all entry_files in the root directory
    - using the metadata_detector on file_names containing metadata
    - computing the metadata translation if necessary and possible
      (depends on the tool)
    - sending sha1s to content indexing if possible
    - storing the results for the revision
    """
    CONFIG_BASE_FILENAME = 'indexer/revision_metadata'

    ADDITIONAL_CONFIG = {
        'tools': ('dict', {
            'name': 'swh-metadata-detector',
            'version': '0.0.2',
            'configuration': {
                'type': 'local',
                'context': ['NpmMapping', 'CodemetaMapping']
            },
        }),
    }

    # The content indexer class to instantiate in
    # translate_revision_metadata; exposed as a class attribute so that
    # subclasses can substitute their own.
    ContentMetadataIndexer = ContentMetadataIndexer

    def prepare(self):
        super().prepare()
        self.tool = self.tools[0]
    def filter(self, sha1_gits):
        """Filter out known sha1s and return only missing ones.
        """
        yield from self.idx_storage.revision_metadata_missing((
            {
                'id': sha1_git,
                'indexer_configuration_id': self.tool['id'],
            } for sha1_git in sha1_gits
        ))
    def index(self, rev):
        """Index rev by processing it and organizing the result.

        Uses metadata_detector to iterate on filenames:

        - if one filename is detected -> the file is sent to the content
          indexer
        - if multiple files are detected -> translation is needed at the
          revision level

        Args:
            rev (bytes): revision artifact from storage

        Returns:
            dict: dictionary representing a revision_metadata, with keys:

            - id (str): rev's identifier (sha1_git)
            - indexer_configuration_id (bytes): tool used
            - translated_metadata: dict of retrieved metadata
        """
        result = {
            'id': rev['id'].decode(),
            'indexer_configuration_id': self.tool['id'],
            'translated_metadata': None
        }
        try:
            root_dir = rev['directory']
            dir_ls = self.storage.directory_ls(root_dir, recursive=False)
            files = [entry for entry in dir_ls if entry['type'] == 'file']
            detected_files = detect_metadata(files)
            result['translated_metadata'] = self.translate_revision_metadata(
                detected_files)
        except Exception as e:
            self.log.exception('Problem when indexing rev: %r', e)
        return result
    def persist_index_computations(self, results, policy_update):
        """Persist the results in storage.

        Args:
            results ([dict]): list of revision_metadata dicts with the
              following keys:

              - id (str): revision's identifier (sha1_git)
              - indexer_configuration_id (bytes): tool used
              - translated_metadata (dict): detected metadata

            policy_update (str): either 'update-dups' or 'ignore-dups' to
              respectively update duplicates or ignore them
        """
        # TODO: add functions in storage to keep data in revision_metadata
        self.idx_storage.revision_metadata_add(
            results, conflict_update=(policy_update == 'update-dups'))
    def translate_revision_metadata(self, detected_files):
        """Determine the plan of action to translate metadata, for one or
        multiple detected files.

        Args:
            detected_files (dict): dictionary mapping context names (e.g.,
              'NpmMapping', 'CodemetaMapping') to lists of sha1s

        Returns:
            dict: dict with the translated metadata according to the
            CodeMeta vocabulary
        """
        translated_metadata = []
        tool = {
            'name': 'swh-metadata-translator',
            'version': '0.0.2',
            'configuration': {
                'type': 'local',
                'context': None
            },
        }
        # TODO: iterate on each context, on each file
        # -> get raw_contents
        # -> translate each content
        config = {
            INDEXER_CFG_KEY: self.idx_storage,
            'objstorage': self.objstorage
        }
        for context in detected_files.keys():
            tool['configuration']['context'] = context
            c_metadata_indexer = self.ContentMetadataIndexer(tool, config)
            # sha1s that are already in the content_metadata table
            sha1s_in_storage = []
            metadata_generator = self.idx_storage.content_metadata_get(
                detected_files[context])
            for c in metadata_generator:
                # extracting translated_metadata
                sha1 = c['id']
                sha1s_in_storage.append(sha1)
                local_metadata = c['translated_metadata']
                # local metadata is aggregated
                if local_metadata:
                    translated_metadata.append(local_metadata)

            sha1s_filtered = [item for item in detected_files[context]
                              if item not in sha1s_in_storage]

            if sha1s_filtered:
                # schedule indexation of content
                try:
                    c_metadata_indexer.run(sha1s_filtered,
                                           policy_update='ignore-dups')
                    # on-the-fly retrieval of the freshly computed results
                    results = c_metadata_indexer.get_results()
                    for result in results:
                        local_metadata = result['translated_metadata']
                        translated_metadata.append(local_metadata)
                except Exception as e:
                    self.log.warning('Exception while indexing content: %r',
                                     e)

        # transform translated_metadata into a minimal set with
        # swh-metadata-detector
        min_metadata = extract_minimal_metadata_dict(translated_metadata)
        return min_metadata
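
# Data-flow sketch for translate_revision_metadata (shapes inferred from the
# code above; the sha1 value is a made-up placeholder):
#
#     detected_files = {'NpmMapping': [b'<sha1 of package.json>']}
#
# Sha1s already present in content_metadata are read back as-is; the
# remaining ones are indexed on the fly by ContentMetadataIndexer, and all
# per-file results are reduced by extract_minimal_metadata_dict into a
# single CodeMeta dict.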
class OriginMetadataIndexer(OriginIndexer):
    ADDITIONAL_CONFIG = {
        'tools': ('list', [])
    }

    def check(self, **kwargs):
        kwargs['check_tools'] = False
        super().check(**kwargs)

    def filter(self, ids):
        return ids
    def run(self, origin_head, policy_update):
        """Expected to be called with the result of OriginHeadIndexer as
        first argument, rather than with a list of ids as other indexers
        would be.

        Args:
            origin_head (dict): result of OriginHeadIndexer, mapping
              str(origin_id) to the head revision id as bytes
              ({str(origin_id): rev_id.encode()})
            policy_update (str): either 'ignore-dups' or 'update-dups'
        """
        origin_head_map = {int(origin_id): rev_id
                           for (origin_id, rev_id) in origin_head.items()}

        # Fix up the argument order. revisions_metadata has to be the
        # first argument because of celery.chain; the next line calls
        # run() with the usual order, ie. origin ids first.
        return super().run(ids=list(origin_head_map),
                           policy_update=policy_update,
                           parse_ids=False,
                           origin_head_map=origin_head_map)
    def index(self, origin, *, origin_head_map):
        # Get the last revision of the origin.
        revision_id = origin_head_map[origin['id']]

        revision_metadata = self.idx_storage \
            .revision_metadata_get([revision_id])

        for item in revision_metadata:
            assert item['id'] == revision_id
            # Get the metadata of that revision, and return it
            return {
                'origin_id': origin['id'],
                'metadata': item['translated_metadata'],
                'from_revision': revision_id,
                'indexer_configuration_id':
                    item['indexer_configuration_id'],
            }
    def persist_index_computations(self, results, policy_update):
        self.idx_storage.origin_intrinsic_metadata_add(
            results, conflict_update=(policy_update == 'update-dups'))
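
# Input sketch for OriginMetadataIndexer.run (shape taken from its docstring,
# values illustrative):
#
#     origin_head = {'42': b'<sha1_git of the head revision>'}
#     origin_metadata_indexer.run(origin_head, 'update-dups')
#
# Keys are origin ids as strings (converted to int internally); values are
# the head revision ids computed by OriginHeadIndexer.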
@click.command()
@click.option('--revs', '-i',
              help='Default sha1_git to lookup', multiple=True)
def main(revs):
    _git_sha1s = list(map(hashutil.hash_to_bytes, revs))
    rev_metadata_indexer = RevisionMetadataIndexer()
    rev_metadata_indexer.run(_git_sha1s, 'update-dups')


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    main()