Differential D1079 Diff 3522 swh/indexer/storage/__init__.py

Changeset View

Standalone View

swh/indexer/storage/init.py

Show First 20 Lines • Show All 42 Lines • ▼ Show 20 Lines	elif cls == 'local':
from . import IndexerStorage		from . import IndexerStorage
elif cls == 'memory':		elif cls == 'memory':
from .in_memory import IndexerStorage		from .in_memory import IndexerStorage
else:		else:
raise ValueError('Unknown indexer storage class `%s`' % cls)		raise ValueError('Unknown indexer storage class `%s`' % cls)

return IndexerStorage(**args)		return IndexerStorage(**args)


		def _check_duplicates(data, key):
		"""
		If any two dictionaries in `data` have the same value for the
		key, raises a `ValueError`.

		Values associated to the key must be hashable.

		Args:
		data (List[dict]): List of dictionaries to be inserted
		key (str): Name of the key that acts as id.

		>>> _check_duplicates([
		... {'id': 'foo', 'data': 'spam'},
		... {'id': 'bar', 'data': 'egg'},
		... ], 'id')
		>>> _check_duplicates([
		... {'id': 'foo', 'data': 'spam'},
		... {'id': 'foo', 'data': 'egg'},
		... ], 'id')
		Traceback (most recent call last):
		...
		ValueError: The same id is present more than once.
		"""
		if len({item[key] for item in data}) < len(data):
		raise ValueError(
		'The same {} is present more than once.'.format(key))


		douarddaUnsubmitted Not Done Inline Actions I'm still not quite happy with this, for several reasons: the arg name 'column' makes no sense in the context of checking stuff on dictionaries; the psycopg stuff in the docstring is very specific to the intended (primary) usage of this function, but there is no reason to make it specific to that usage; more importantly, I still do not see a unit test for this function. For example, it's very easy to make it fail in an unexpected way by using a dict as a value in a dict (since dicts are not hashable, the function will raise a TypeError). If we do not bother to be bulletproof, make it explicit in the docstring. douardda: I'm still not quite happy with this, for several reasons: - the arg name 'column' makes no sense…
		vlorentzAuthorUnsubmitted Done Inline Actions > the arg name 'column' makes no sense in the context of checking stuff on dictionaries, — Indeed, fixed. > the psycopg stuff in the docstring is very specific to the intended (primary) usage of this function, but there is no reason to make it specific to that usage, — It's in the module that implements the pg backend and its name is prefixed with `_`, so it's unlikely to be used anywhere else. Fixed anyway. > I still do not see a unit test for this function — There's the doctest. > If we do not bother to be bulletproof, make it explicit in the docstring. — Indeed, fixed. vlorentz: > the arg name 'column' makes no sense in the context of checking stuff on dictionaries…
class IndexerStorage:		class IndexerStorage:
"""SWH Indexer Storage		"""SWH Indexer Storage

"""		"""
def __init__(self, db, min_pool_conns=1, max_pool_conns=10):		def __init__(self, db, min_pool_conns=1, max_pool_conns=10):
"""		"""
Args:		Args:
db_conn: either a libpq connection string, or a psycopg2 connection		db_conn: either a libpq connection string, or a psycopg2 connection
▲ Show 20 Lines • Show All 151 Lines • ▼ Show 20 Lines	def content_mimetype_add(self, mimetypes, conflict_update=False, db=None,
- encoding (bytes): raw content's encoding		- encoding (bytes): raw content's encoding
- indexer_configuration_id (int): tool's id used to		- indexer_configuration_id (int): tool's id used to
compute the results		compute the results
- conflict_update (bool): Flag to determine if we want to		- conflict_update (bool): Flag to determine if we want to
overwrite (``True``) or skip duplicates (``False``, the		overwrite (``True``) or skip duplicates (``False``, the
default)		default)

"""		"""
		_check_duplicates(mimetypes, 'id')
db.mktemp_content_mimetype(cur)		db.mktemp_content_mimetype(cur)
db.copy_to(mimetypes, 'tmp_content_mimetype',		db.copy_to(mimetypes, 'tmp_content_mimetype',
['id', 'mimetype', 'encoding', 'indexer_configuration_id'],		['id', 'mimetype', 'encoding', 'indexer_configuration_id'],
cur)		cur)
db.content_mimetype_add_from_temp(conflict_update, cur)		db.content_mimetype_add_from_temp(conflict_update, cur)

@remote_api_endpoint('content_mimetype')		@remote_api_endpoint('content_mimetype')
@db_transaction_generator()		@db_transaction_generator()
▲ Show 20 Lines • Show All 68 Lines • ▼ Show 20 Lines	def content_language_add(self, languages, conflict_update=False, db=None,
- id (bytes): sha1		- id (bytes): sha1
- lang (bytes): language detected		- lang (bytes): language detected

conflict_update (bool): Flag to determine if we want to		conflict_update (bool): Flag to determine if we want to
overwrite (true) or skip duplicates (false, the		overwrite (true) or skip duplicates (false, the
default)		default)

"""		"""
		_check_duplicates(languages, 'id')
db.mktemp_content_language(cur)		db.mktemp_content_language(cur)
# empty language is mapped to 'unknown'		# empty language is mapped to 'unknown'
db.copy_to(		db.copy_to(
({		({
'id': l['id'],		'id': l['id'],
'lang': 'unknown' if not l['lang'] else l['lang'],		'lang': 'unknown' if not l['lang'] else l['lang'],
'indexer_configuration_id': l['indexer_configuration_id'],		'indexer_configuration_id': l['indexer_configuration_id'],
} for l in languages),		} for l in languages),
▲ Show 20 Lines • Show All 53 Lines • ▼ Show 20 Lines	def content_ctags_add(self, ctags, conflict_update=False, db=None,
Args:		Args:
ctags (iterable): dictionaries with keys:		ctags (iterable): dictionaries with keys:

- id (bytes): sha1		- id (bytes): sha1
- ctags ([list): List of dictionary with keys: name, kind,		- ctags ([list): List of dictionary with keys: name, kind,
line, lang		line, lang

"""		"""
		_check_duplicates(ctags, 'id')

def _convert_ctags(__ctags):		def _convert_ctags(__ctags):
"""Convert ctags dict to list of ctags.		"""Convert ctags dict to list of ctags.

"""		"""
for ctags in __ctags:		for ctags in __ctags:
yield from converters.ctags_to_db(ctags)		yield from converters.ctags_to_db(ctags)

db.mktemp_content_ctags(cur)		db.mktemp_content_ctags(cur)
▲ Show 20 Lines • Show All 64 Lines • ▼ Show 20 Lines	def content_fossology_license_add(self, licenses, conflict_update=False,

conflict_update: Flag to determine if we want to overwrite (true)		conflict_update: Flag to determine if we want to overwrite (true)
or skip duplicates (false, the default)		or skip duplicates (false, the default)

Returns:		Returns:
list: content_license entries which failed due to unknown licenses		list: content_license entries which failed due to unknown licenses

"""		"""
# Then, we add the correct ones		_check_duplicates(licenses, 'id')
db.mktemp_content_fossology_license(cur)		db.mktemp_content_fossology_license(cur)
db.copy_to(		db.copy_to(
({		({
'id': sha1['id'],		'id': sha1['id'],
'indexer_configuration_id': sha1['indexer_configuration_id'],		'indexer_configuration_id': sha1['indexer_configuration_id'],
'license': license,		'license': license,
} for sha1 in licenses		} for sha1 in licenses
for license in sha1['licenses']),		for license in sha1['licenses']),
▲ Show 20 Lines • Show All 81 Lines • ▼ Show 20 Lines	def content_metadata_add(self, metadata, conflict_update=False, db=None,

- id: sha1		- id: sha1
- translated_metadata: arbitrary dict		- translated_metadata: arbitrary dict

conflict_update: Flag to determine if we want to overwrite (true)		conflict_update: Flag to determine if we want to overwrite (true)
or skip duplicates (false, the default)		or skip duplicates (false, the default)

"""		"""
		_check_duplicates(metadata, 'id')

db.mktemp_content_metadata(cur)		db.mktemp_content_metadata(cur)

db.copy_to(metadata, 'tmp_content_metadata',		db.copy_to(metadata, 'tmp_content_metadata',
['id', 'translated_metadata', 'indexer_configuration_id'],		['id', 'translated_metadata', 'indexer_configuration_id'],
cur)		cur)
db.content_metadata_add_from_temp(conflict_update, cur)		db.content_metadata_add_from_temp(conflict_update, cur)

@remote_api_endpoint('revision_metadata/missing')		@remote_api_endpoint('revision_metadata/missing')
▲ Show 20 Lines • Show All 51 Lines • ▼ Show 20 Lines	def revision_metadata_add(self, metadata, conflict_update=False, db=None,
- indexer_configuration_id: tool used to compute metadata		- indexer_configuration_id: tool used to compute metadata
- mappings (List[str]): list of mappings used to translate		- mappings (List[str]): list of mappings used to translate
these metadata		these metadata

conflict_update: Flag to determine if we want to overwrite (true)		conflict_update: Flag to determine if we want to overwrite (true)
or skip duplicates (false, the default)		or skip duplicates (false, the default)

"""		"""
		_check_duplicates(metadata, 'id')

db.mktemp_revision_metadata(cur)		db.mktemp_revision_metadata(cur)

db.copy_to(metadata, 'tmp_revision_metadata',		db.copy_to(metadata, 'tmp_revision_metadata',
['id', 'translated_metadata', 'mappings',		['id', 'translated_metadata', 'mappings',
'indexer_configuration_id'],		'indexer_configuration_id'],
cur)		cur)
db.revision_metadata_add_from_temp(conflict_update, cur)		db.revision_metadata_add_from_temp(conflict_update, cur)

Show All 36 Lines	def origin_intrinsic_metadata_add(self, metadata,
- indexer_configuration_id: tool used to compute metadata		- indexer_configuration_id: tool used to compute metadata
- mappings (List[str]): list of mappings used to translate		- mappings (List[str]): list of mappings used to translate
these metadata		these metadata

conflict_update: Flag to determine if we want to overwrite (true)		conflict_update: Flag to determine if we want to overwrite (true)
or skip duplicates (false, the default)		or skip duplicates (false, the default)

"""		"""
		_check_duplicates(metadata, 'origin_id')

db.mktemp_origin_intrinsic_metadata(cur)		db.mktemp_origin_intrinsic_metadata(cur)

db.copy_to(metadata, 'tmp_origin_intrinsic_metadata',		db.copy_to(metadata, 'tmp_origin_intrinsic_metadata',
['origin_id', 'metadata', 'indexer_configuration_id',		['origin_id', 'metadata', 'indexer_configuration_id',
'from_revision', 'mappings'],		'from_revision', 'mappings'],
cur)		cur)
db.origin_intrinsic_metadata_add_from_temp(conflict_update, cur)		db.origin_intrinsic_metadata_add_from_temp(conflict_update, cur)

▲ Show 20 Lines • Show All 127 Lines • Show Last 20 Lines