D790: Add in-mem storage for intrinsic metadata
Diff: D790.id2491.diff (42 KB)
diff --git a/swh/indexer/fossology_license.py b/swh/indexer/fossology_license.py
--- a/swh/indexer/fossology_license.py
+++ b/swh/indexer/fossology_license.py
@@ -104,6 +104,7 @@
- indexer_configuration_id (int): tool used to compute the output
"""
+ assert isinstance(id, bytes)
content_path = self.write_to_temp(
filename=hashutil.hash_to_hex(id), # use the id as pathname
data=data)
diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -405,11 +405,14 @@
bytes: Identifier of contents to index.
"""
+ if not isinstance(start, bytes) or not isinstance(end, bytes):
+ raise TypeError('identifiers must be bytes, not %r and %r.' %
+ (start, end))
while start:
result = self.storage.content_get_range(start, end)
contents = result['contents']
for c in contents:
- _id = c['sha1']
+ _id = hashutil.hash_to_bytes(c['sha1'])
if _id in indexed:
continue
yield _id
@@ -435,6 +438,10 @@
hashutil.hash_to_hex(sha1))
continue
res = self.index(sha1, raw_content, **kwargs)
+ if res and not isinstance(res['id'], bytes):
+ raise TypeError(
+ '%r.index should return ids as bytes, not %r' %
+ (self.__class__.__name__, res['id']))
if res:
yield res
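
The two hunks above pin down a convention the rest of this diff relies on: content identifiers cross the indexer APIs as raw 20-byte sha1 digests, never as 40-character hex strings. The two forms never compare equal, so mixing them makes dict and set lookups miss silently. A minimal sketch of the distinction (plain stdlib; `hash_to_bytes`/`hash_to_hex` in swh.model.hashutil are the project's helpers for the same conversions):

```python
# Raw-bytes vs hex-string sha1 identifiers: same digest, different
# Python objects, so a lookup with the wrong form silently misses.
hex_id = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
raw_id = bytes.fromhex(hex_id)      # 20-byte digest

indexed = {raw_id}                  # storage keys on raw bytes
assert raw_id in indexed
assert hex_id not in indexed        # the hex form never matches
assert raw_id.hex() == hex_id       # lossless round-trip
```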
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -346,7 +346,7 @@
- **id** (bytes): content's identifier
- **name** (str): symbol's name
- **kind** (str): symbol's kind
- - **language** (str): language for that content
+ - **lang** (str): language for that content
- **tool** (dict): tool used to compute the ctags' info
@@ -365,7 +365,7 @@
- **id** (bytes): sha1
- **ctags** (list): List of dictionaries with keys: name, kind,
- line, language
+ line, lang
"""
def _convert_ctags(__ctags):
@@ -412,9 +412,8 @@
ids (iterable): sha1 checksums
Yields:
- list: dictionaries with the following keys:
+ `{id: facts}` where `facts` is a dict with the following keys:
- - **id** (bytes)
- **licenses** ([str]): associated licenses for that content
- **tool** (dict): Tool used to compute the license
@@ -439,7 +438,7 @@
licenses (iterable): dictionaries with keys:
- **id**: sha1
- - **license** ([bytes]): List of licenses associated to sha1
+ - **licenses** ([bytes]): List of licenses associated to sha1
- **tool** (str): nomossa
conflict_update: Flag to determine if we want to overwrite (true)
@@ -628,7 +627,7 @@
Yields:
list: dictionaries with the following keys:
- - **id** (int)
+ - **origin_id** (int)
- **translated_metadata** (str): associated metadata
- **tool** (dict): tool used to compute metadata
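
The docstring fixes above align the documented keys with what the backends actually read and write: `lang` rather than `language` for ctags, a plural `licenses` list per sha1, and `origin_id` for origin metadata. As an illustration, a license entry as consumed by `content_fossology_license_add` would look like the following (sha1 taken from the tests below; license names and tool id are made up, and the in-memory backend keys entries on `indexer_configuration_id`):

```python
# Illustrative entry for content_fossology_license_add; note the
# plural 'licenses' key holding a list, not a single 'license' value.
license_entry = {
    'id': bytes.fromhex('688a5ef812c53907562fe379d4b3851e69c7cb15'),
    'licenses': ['GPL-3.0+', 'Apache-2.0'],    # hypothetical values
    'indexer_configuration_id': 1,             # hypothetical tool id
}
```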
diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py
--- a/swh/indexer/storage/in_memory.py
+++ b/swh/indexer/storage/in_memory.py
@@ -3,31 +3,45 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-from collections import defaultdict
+import bisect
+from collections import defaultdict, Counter
+import itertools
import json
+import operator
+import math
+import re
+SHA1_DIGEST_SIZE = 20  # sha1 digests are 20 bytes (160 bits) long
-class MetadataStorage:
- """Implements missing/get/add logic for both content_metadata and
- revision_metadata."""
+
+def _replace_key(d, old_key, new_key):
+ """Replaces the old key by a new key in a dict, returning a new dict."""
+ d = d.copy()
+ d[new_key] = d.pop(old_key)
+ return d
+
+
+def _transform_tool(tool):
+ return {
+ 'id': tool['id'],
+ 'name': tool['tool_name'],
+ 'version': tool['tool_version'],
+ 'configuration': tool['tool_configuration'],
+ }
+
+
+class SubStorage:
+ """Implements common missing/get/add logic for each indexer type."""
def __init__(self, tools):
self._tools = tools
- self._metadata = {} # map (id_, tool_id) -> metadata_dict
+ self._sorted_ids = []
+ self._data = {} # map (id_, tool_id) -> metadata_dict
self._tools_per_id = defaultdict(set) # map id_ -> Set[tool_id]
- def _transform_tool(self, tool):
- return {
- 'id': tool['id'],
- 'name': tool['tool_name'],
- 'version': tool['tool_version'],
- 'configuration': tool['tool_configuration'],
- }
-
def missing(self, ids):
- """List metadata missing from storage.
+ """List data missing from storage.
Args:
- metadata (iterable): dictionaries with keys:
+ data (iterable): dictionaries with keys:
- **id** (bytes): sha1 identifier
- **indexer_configuration_id** (int): tool used to compute
@@ -44,7 +58,7 @@
yield id_
def get(self, ids):
- """Retrieve metadata per id.
+ """Retrieve data per id.
Args:
ids (iterable): sha1 checksums
@@ -53,8 +67,8 @@
dict: dictionaries with the following keys:
- **id** (bytes)
- - **translated_metadata** (str): associated metadata
- **tool** (dict): tool used to compute metadata
+ - arbitrary data (as provided to `add`)
"""
for id_ in ids:
@@ -62,36 +76,105 @@
key = (id_, tool_id)
yield {
'id': id_,
- 'tool': self._transform_tool(self._tools[tool_id]),
- 'translated_metadata': self._metadata[key],
+ 'tool': _transform_tool(self._tools[tool_id]),
+ **self._data[key],
}
- def add(self, metadata, conflict_update):
- """Add metadata not present in storage.
+ def get_all(self):
+ """Retrieve data for all stored ids, whatever the tool."""
+ yield from self.get(list(self._tools_per_id))
+
+ def get_range(self, start, end, indexer_configuration_id, limit):
+ """Retrieve data within range [start, end] bound by limit.
Args:
- metadata (iterable): dictionaries with keys:
+ **start** (bytes): Starting identifier range (expected smaller
+ than end)
+ **end** (bytes): Ending identifier range (expected larger
+ than start)
+ **indexer_configuration_id** (int): The tool used to index data
+ **limit** (int): Limit result
+
+ Raises:
+ ValueError: if limit is None
+
+ Returns:
+ a dict with keys:
+ - **ids** [bytes]: iterable of content ids within the range.
+ - **next** (Optional[bytes]): The next range of sha1 starts at
+ this sha1 if any
+
+ """
+ if limit is None:
+ raise ValueError('Development error: limit should not be None')
+ from_index = bisect.bisect_left(self._sorted_ids, start)
+ to_index = bisect.bisect_right(self._sorted_ids, end, lo=from_index)
+ if to_index - from_index > limit:
+ return {
+ 'ids': self._sorted_ids[from_index:from_index+limit],
+ 'next': self._sorted_ids[from_index+limit],
+ }
+ else:
+ return {
+ 'ids': self._sorted_ids[from_index:to_index],
+ 'next': None,
+ }
+
+ def add(self, data, conflict_update):
+ """Add data not present in storage.
+
+ Args:
+ data (iterable): dictionaries with keys:
- **id**: sha1
- - **translated_metadata**: arbitrary dict
- **indexer_configuration_id**: tool used to compute the
results
+ - arbitrary data
conflict_update (bool): Flag to determine if we want to overwrite
(true) or skip duplicates (false)
"""
- for item in metadata:
- tool_id = item['indexer_configuration_id']
- data = item['translated_metadata']
- id_ = item['id']
+ for item in data:
+ item = item.copy()
+ tool_id = item.pop('indexer_configuration_id')
+ id_ = item.pop('id')
if not conflict_update and \
tool_id in self._tools_per_id.get(id_, set()):
# Duplicate, should not be updated
continue
key = (id_, tool_id)
- self._metadata[key] = data
+ self._data[key] = item
self._tools_per_id[id_].add(tool_id)
+ if id_ not in self._sorted_ids:
+ bisect.insort(self._sorted_ids, id_)
+
+ def add_merge(self, new_data, conflict_update, merged_key):
+ """Add data, handling the list under `merged_key`: with
+ conflict_update the new list overwrites any existing one,
+ otherwise both are merged (duplicate subitems skipped)."""
+ for new_item in new_data:
+ id_ = new_item['id']
+ tool_id = new_item['indexer_configuration_id']
+ if conflict_update:
+ all_subitems = []
+ else:
+ existing = list(self.get([id_]))
+ all_subitems = [
+ old_subitem
+ for existing_item in existing
+ if existing_item['tool']['id'] == tool_id
+ for old_subitem in existing_item[merged_key]
+ ]
+ for new_subitem in new_item[merged_key]:
+ if new_subitem not in all_subitems:
+ all_subitems.append(new_subitem)
+ self.add([
+ {
+ 'id': id_,
+ 'indexer_configuration_id': tool_id,
+ merged_key: all_subitems,
+ }
+ ], conflict_update=True)
+ if id_ not in self._sorted_ids:
+ bisect.insort(self._sorted_ids, id_)
class IndexerStorage:
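
`SubStorage.get_range` above implements the pagination contract shared by all the `*_get_range` endpoints that follow: at most `limit` ids come back, together with a `next` cursor to resume from, or `None` once the range is exhausted. A sketch of the calling loop, assuming `storage` is a populated `IndexerStorage` and `tool_id` an existing `indexer_configuration_id`:

```python
# Hedged sketch of paging through [start, end] via get_range.
start = b'\x00' * 20            # smallest possible 20-byte sha1
end = b'\xff' * 20              # largest possible 20-byte sha1
tool_id = 1                     # hypothetical tool id

while start is not None:
    page = storage.content_mimetype_get_range(
        start, end, indexer_configuration_id=tool_id, limit=100)
    for content_id in page['ids']:
        ...                     # each id is a 20-byte sha1 in range
    start = page['next']        # None once everything was returned
```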
@@ -99,8 +182,296 @@
def __init__(self):
self._tools = {}
- self._content_metadata = MetadataStorage(self._tools)
- self._revision_metadata = MetadataStorage(self._tools)
+ self._mimetypes = SubStorage(self._tools)
+ self._languages = SubStorage(self._tools)
+ self._content_ctags = SubStorage(self._tools)
+ self._licenses = SubStorage(self._tools)
+ self._content_metadata = SubStorage(self._tools)
+ self._revision_metadata = SubStorage(self._tools)
+ self._origin_intrinsic_metadata = SubStorage(self._tools)
+
+ def content_mimetype_missing(self, mimetypes):
+ """Generate mimetypes missing from storage.
+
+ Args:
+ mimetypes (iterable): iterable of dict with keys:
+
+ - **id** (bytes): sha1 identifier
+ - **indexer_configuration_id** (int): tool used to compute the
+ results
+
+ Yields:
+ the ids whose (id, indexer_configuration_id) pair is not
+ present in storage
+
+ """
+ yield from self._mimetypes.missing(mimetypes)
+
+ def content_mimetype_get_range(
+ self, start, end, indexer_configuration_id, limit=1000):
+ """Retrieve mimetypes within range [start, end] bound by limit.
+
+ Args:
+ **start** (bytes): Starting identifier range (expected smaller
+ than end)
+ **end** (bytes): Ending identifier range (expected larger
+ than start)
+ **indexer_configuration_id** (int): The tool used to index data
+ **limit** (int): Limit result (default to 1000)
+
+ Raises:
+ ValueError: if limit is None
+
+ Returns:
+ a dict with keys:
+ - **ids** [bytes]: iterable of content ids within the range.
+ - **next** (Optional[bytes]): The next range of sha1 starts at
+ this sha1 if any
+
+ """
+ return self._mimetypes.get_range(
+ start, end, indexer_configuration_id, limit)
+
+ def content_mimetype_add(self, mimetypes, conflict_update=False):
+ """Add mimetypes not present in storage.
+
+ Args:
+ mimetypes (iterable): dictionaries with keys:
+
+ - **id** (bytes): sha1 identifier
+ - **mimetype** (bytes): raw content's mimetype
+ - **encoding** (bytes): raw content's encoding
+ - **indexer_configuration_id** (int): tool's id used to
+ compute the results
+
+ conflict_update (bool): Flag to determine if we want to
+ overwrite (``True``) or skip duplicates (``False``, the
+ default)
+
+ """
+ if not all(isinstance(x['id'], bytes) for x in mimetypes):
+ raise TypeError('identifiers must be bytes.')
+ self._mimetypes.add(mimetypes, conflict_update)
+
+ def content_mimetype_get(self, ids, db=None, cur=None):
+ """Retrieve full content mimetype per ids.
+
+ Args:
+ ids (iterable): sha1 identifier
+
+ Yields:
+ mimetypes (iterable): dictionaries with keys:
+
+ - **id** (bytes): sha1 identifier
+ - **mimetype** (bytes): raw content's mimetype
+ - **encoding** (bytes): raw content's encoding
+ - **tool** (dict): Tool used to compute the mimetype
+
+ """
+ yield from self._mimetypes.get(ids)
+
+ def content_language_missing(self, languages):
+ """List languages missing from storage.
+
+ Args:
+ languages (iterable): dictionaries with keys:
+
+ - **id** (bytes): sha1 identifier
+ - **indexer_configuration_id** (int): tool used to compute
+ the results
+
+ Yields:
+ the ids whose (id, indexer_configuration_id) pair is not
+ present in storage
+
+ """
+ yield from self._languages.missing(languages)
+
+ def content_language_get(self, ids):
+ """Retrieve full content language per ids.
+
+ Args:
+ ids (iterable): sha1 identifier
+
+ Yields:
+ languages (iterable): dictionaries with keys:
+
+ - **id** (bytes): sha1 identifier
+ - **lang** (bytes): raw content's language
+ - **tool** (dict): Tool used to compute the language
+
+ """
+ yield from self._languages.get(ids)
+
+ def content_language_add(self, languages, conflict_update=False):
+ """Add languages not present in storage.
+
+ Args:
+ languages (iterable): dictionaries with keys:
+
+ - **id** (bytes): sha1
+ - **lang** (bytes): language detected
+
+ conflict_update (bool): Flag to determine if we want to
+ overwrite (true) or skip duplicates (false, the
+ default)
+
+ """
+ self._languages.add(languages, conflict_update)
+
+ def content_ctags_missing(self, ctags):
+ """List ctags missing from storage.
+
+ Args:
+ ctags (iterable): dicts with keys:
+
+ - **id** (bytes): sha1 identifier
+ - **indexer_configuration_id** (int): tool used to compute
+ the results
+
+ Yields:
+ the ids whose (id, indexer_configuration_id) pair is not
+ present in storage
+
+ """
+ yield from self._content_ctags.missing(ctags)
+
+ def content_ctags_get(self, ids):
+ """Retrieve ctags per id.
+
+ Args:
+ ids (iterable): sha1 checksums
+
+ Yields:
+ Dictionaries with keys:
+
+ - **id** (bytes): content's identifier
+ - **name** (str): symbol's name
+ - **kind** (str): symbol's kind
+ - **lang** (str): language for that content
+ - **tool** (dict): tool used to compute the ctags' info
+
+ """
+ for item in self._content_ctags.get(ids):
+ for item_ctags_item in item['ctags']:
+ yield {
+ 'id': item['id'],
+ 'tool': item['tool'],
+ **item_ctags_item
+ }
+
+ def content_ctags_add(self, ctags, conflict_update=False):
+ """Add ctags not present in storage
+
+ Args:
+ ctags (iterable): dictionaries with keys:
+
+ - **id** (bytes): sha1
+ - **ctags** (list): List of dictionaries with keys: name, kind,
+ line, lang
+ - **indexer_configuration_id**: tool used to compute the
+ results
+
+ """
+ if not all(isinstance(x['id'], bytes) for x in ctags):
+ raise TypeError('identifiers must be bytes.')
+ self._content_ctags.add_merge(ctags, conflict_update, 'ctags')
+
+ def content_ctags_search(self, expression,
+ limit=10, last_sha1=None, db=None, cur=None):
+ """Search through content's raw ctags symbols.
+
+ Args:
+ expression (str): Expression to search for
+ limit (int): Number of rows to return (default to 10).
+ last_sha1 (bytes): Offset from which to retrieve data
+ (defaults to None, i.e. from the beginning).
+
+ Yields:
+ rows of ctags including id, name, lang, kind, line, etc...
+
+ """
+ nb_matches = 0
+ for ((id_, tool_id), item) in \
+ sorted(self._content_ctags._data.items()):
+ if id_ <= (last_sha1 or bytes(0 for _ in range(SHA1_DIGEST_SIZE))):
+ continue
+ nb_matches += 1
+ for ctags_item in item['ctags']:
+ if ctags_item['name'] != expression:
+ continue
+ yield {
+ 'id': id_,
+ 'tool': _transform_tool(self._tools[tool_id]),
+ **ctags_item
+ }
+ if nb_matches >= limit:
+ return
+
+ def content_fossology_license_get(self, ids):
+ """Retrieve licenses per id.
+
+ Args:
+ ids (iterable): sha1 checksums
+
+ Yields:
+ `{id: facts}` where `facts` is a dict with the following keys:
+
+ - **licenses** ([str]): associated licenses for that content
+ - **tool** (dict): Tool used to compute the license
+
+ """
+ # TODO: remove this reformatting in order to yield items with the
+ # same format as other _get methods.
+ res = {}
+ for d in self._licenses.get(ids):
+ res.setdefault(d.pop('id'), []).append(d)
+ for (id_, facts) in res.items():
+ yield {id_: facts}
+
+ def content_fossology_license_add(self, licenses, conflict_update=False):
+ """Add licenses not present in storage.
+
+ Args:
+ licenses (iterable): dictionaries with keys:
+
+ - **id**: sha1
+ - **licenses** ([bytes]): List of licenses associated to sha1
+ - **tool** (str): nomossa
+
+ conflict_update: Flag to determine if we want to overwrite (true)
+ or skip duplicates (false, the default)
+
+ """
+ if not all(isinstance(x['id'], bytes) for x in licenses):
+ raise TypeError('identifiers must be bytes.')
+ self._licenses.add_merge(licenses, conflict_update, 'licenses')
+
+ def content_fossology_license_get_range(
+ self, start, end, indexer_configuration_id, limit=1000):
+ """Retrieve licenses within range [start, end] bound by limit.
+
+ Args:
+ **start** (bytes): Starting identifier range (expected smaller
+ than end)
+ **end** (bytes): Ending identifier range (expected larger
+ than start)
+ **indexer_configuration_id** (int): The tool used to index data
+ **limit** (int): Limit result (default to 1000)
+
+ Raises:
+ ValueError: if limit is None
+
+ Returns:
+ a dict with keys:
+ - **ids** [bytes]: iterable of content ids within the range.
+ - **next** (Optional[bytes]): The next range of sha1 starts at
+ this sha1 if any
+
+ """
+ return self._licenses.get_range(
+ start, end, indexer_configuration_id, limit)
def content_metadata_missing(self, metadata):
"""List metadata missing from storage.
@@ -149,6 +520,8 @@
or skip duplicates (false, the default)
"""
+ if not all(isinstance(x['id'], bytes) for x in metadata):
+ raise TypeError('identifiers must be bytes.')
self._content_metadata.add(metadata, conflict_update)
def revision_metadata_missing(self, metadata):
@@ -197,8 +570,101 @@
or skip duplicates (false, the default)
"""
+ if not all(isinstance(x['id'], bytes) for x in metadata):
+ raise TypeError('identifiers must be bytes.')
self._revision_metadata.add(metadata, conflict_update)
+ def origin_intrinsic_metadata_get(self, ids):
+ """Retrieve origin metadata per id.
+
+ Args:
+ ids (iterable): origin identifiers
+
+ Yields:
+ list: dictionaries with the following keys:
+
+ - **origin_id** (int)
+ - **translated_metadata** (str): associated metadata
+ - **tool** (dict): tool used to compute metadata
+
+ """
+ for item in self._origin_intrinsic_metadata.get(ids):
+ yield _replace_key(item, 'id', 'origin_id')
+
+ def origin_intrinsic_metadata_add(self, metadata,
+ conflict_update=False):
+ """Add origin metadata not present in storage.
+
+ Args:
+ metadata (iterable): dictionaries with keys:
+
+ - **origin_id**: origin identifier
+ - **from_revision**: sha1 id of the revision used to generate
+ these metadata.
+ - **metadata**: arbitrary dict
+ - **indexer_configuration_id**: tool used to compute metadata
+
+ conflict_update: Flag to determine if we want to overwrite (true)
+ or skip duplicates (false, the default)
+
+ """
+
+ for item in metadata:
+ item = _replace_key(item, 'origin_id', 'id')
+ self._origin_intrinsic_metadata.add([item], conflict_update)
+
+ def origin_intrinsic_metadata_search_fulltext(
+ self, conjunction, limit=100):
+ """Returns the list of origins whose metadata contain all the terms.
+
+ Args:
+ conjunction (List[str]): List of terms to be searched for.
+ limit (int): The maximum number of results to return
+
+ Yields:
+ list: dictionaries with the following keys:
+
+ - **origin_id** (int)
+ - **metadata** (str): associated metadata
+ - **tool** (dict): tool used to compute metadata
+
+ """
+ # A very crude fulltext search implementation, but that's enough
+ # to work on English metadata
+ tokens_re = re.compile('[a-zA-Z0-9]+')
+ search_tokens = list(itertools.chain(
+ *map(tokens_re.findall, conjunction)))
+
+ def rank(data):
+ # Tokenize the metadata
+ text = json.dumps(data['metadata'])
+ text_tokens = tokens_re.findall(text)
+ text_token_occurrences = Counter(text_tokens)
+
+ # Count the number of occurrences of search tokens in the text
+ score = 0
+ for search_token in search_tokens:
+ if text_token_occurrences[search_token] == 0:
+ # Search token is not in the text.
+ return 0
+ score += text_token_occurrences[search_token]
+
+ # Normalize according to the text's length
+ return score / math.log(len(text_tokens))
+
+ results = [(rank(data), data)
+ for data in self._origin_intrinsic_metadata.get_all()]
+ results = [(rank_, data) for (rank_, data) in results if rank_ > 0]
+ results.sort(key=operator.itemgetter(0), # Don't try to order 'data'
+ reverse=True)
+ results = [data for (rank_, data) in results]
+ for result in results[:limit]:
+ yield _replace_key(result, 'id', 'origin_id')
+
def indexer_configuration_add(self, tools):
"""Add new tools to the storage.
diff --git a/swh/indexer/tests/storage/test_in_memory.py b/swh/indexer/tests/storage/test_in_memory.py
--- a/swh/indexer/tests/storage/test_in_memory.py
+++ b/swh/indexer/tests/storage/test_in_memory.py
@@ -13,131 +13,9 @@
}
super().setUp()
- @pytest.mark.xfail
- def test_check_config(self):
- pass
-
- @pytest.mark.xfail
- def test_content_mimetype_missing(self):
- pass
-
- @pytest.mark.xfail
- def test_content_mimetype_add__drop_duplicate(self):
- pass
-
- @pytest.mark.xfail
- def test_content_mimetype_add__update_in_place_duplicate(self):
- pass
-
- @pytest.mark.xfail
- def test_content_mimetype_get(self):
- pass
-
- @pytest.mark.xfail
- def test_content_language_missing(self):
- pass
-
- @pytest.mark.xfail
- def test_content_language_get(self):
- pass
-
- @pytest.mark.xfail
- def test_content_language_add__drop_duplicate(self):
- pass
-
- @pytest.mark.xfail
- def test_content_language_add__update_in_place_duplicate(self):
- pass
-
- @pytest.mark.xfail
- def test_content_ctags_missing(self):
- pass
-
- @pytest.mark.xfail
- def test_content_ctags_get(self):
- pass
-
- @pytest.mark.xfail
- def test_content_ctags_search(self):
- pass
-
- @pytest.mark.xfail
- def test_content_ctags_search_no_result(self):
- pass
-
- @pytest.mark.xfail
- def test_content_ctags_add__add_new_ctags_added(self):
- pass
-
- @pytest.mark.xfail
- def test_content_ctags_add__update_in_place(self):
- pass
-
- @pytest.mark.xfail
- def test_content_fossology_license_get(self):
- pass
-
- @pytest.mark.xfail
- def test_content_fossology_license_add__new_license_added(self):
- pass
-
- @pytest.mark.xfail
- def test_content_fossology_license_add__update_in_place_duplicate(self):
- pass
-
- @pytest.mark.xfail
- def test_origin_intrinsic_metadata_get(self):
- pass
-
- @pytest.mark.xfail
- def test_origin_intrinsic_metadata_add_drop_duplicate(self):
- pass
-
- @pytest.mark.xfail
- def test_origin_intrinsic_metadata_add_update_in_place_duplicate(self):
- pass
-
- @pytest.mark.xfail
- def test_origin_intrinsic_metadata_search_fulltext(self):
- pass
-
- @pytest.mark.xfail
- def test_origin_intrinsic_metadata_search_fulltext_rank(self):
- pass
+ def reset_storage_tables(self):
+ self.storage = self.storage.__class__()
@pytest.mark.xfail
- def test_indexer_configuration_metadata_get_missing_context(self):
- pass
-
- @pytest.mark.xfail
- def test_indexer_configuration_metadata_get(self):
- pass
-
- @pytest.mark.xfail
- def test_generate_content_mimetype_get_range_limit_none(self):
- pass
-
- @pytest.mark.xfail
- def test_generate_content_mimetype_get_range_no_limit(self, mimetypes):
- pass
-
- @pytest.mark.xfail
- def test_generate_content_mimetype_get_range_limit(self, mimetypes):
- pass
-
- @pytest.mark.xfail
- def test_generate_content_fossology_license_get_range_limit_none(self):
- pass
-
- @pytest.mark.xfail
- def test_generate_content_fossology_license_get_range_no_limit(self):
- pass
-
- @pytest.mark.xfail
- def test_generate_content_fossology_license_get_range_no_limit_with_filter(
- self):
- pass
-
- @pytest.mark.xfail
- def test_generate_fossology_license_get_range_limit(self):
+ def test_check_config(self):
pass
diff --git a/swh/indexer/tests/test_ctags.py b/swh/indexer/tests/test_ctags.py
--- a/swh/indexer/tests/test_ctags.py
+++ b/swh/indexer/tests/test_ctags.py
@@ -11,7 +11,7 @@
)
from swh.indexer.tests.test_utils import (
- BasicMockIndexerStorage, MockObjStorage, CommonContentIndexerTest,
+ CommonContentIndexerTest,
CommonIndexerWithErrorsTest, CommonIndexerNoTool,
SHA1_TO_CTAGS, NoDiskIndexer, BASE_TEST_CONFIG
)
@@ -99,12 +99,6 @@
'workdir': '/nowhere',
}
- def prepare(self):
- super().prepare()
- self.idx_storage = BasicMockIndexerStorage()
- self.objstorage = MockObjStorage()
- self.tool_config = self.config['tools']['configuration']
-
class TestCtagsIndexer(CommonContentIndexerTest, unittest.TestCase):
"""Ctags indexer test scenarios:
@@ -113,8 +107,13 @@
- Unknown sha1s in the input list are not indexed
"""
+
+ def get_indexer_results(self, ids):
+ yield from self.idx_storage.content_ctags_get(ids)
+
def setUp(self):
self.indexer = CtagsIndexerTest()
+ self.idx_storage = self.indexer.idx_storage
# Prepare test input
self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
diff --git a/swh/indexer/tests/test_fossology_license.py b/swh/indexer/tests/test_fossology_license.py
--- a/swh/indexer/tests/test_fossology_license.py
+++ b/swh/indexer/tests/test_fossology_license.py
@@ -4,7 +4,6 @@
# See top-level LICENSE file for more information
import unittest
-import logging
from unittest.mock import patch
@@ -14,10 +13,9 @@
)
from swh.indexer.tests.test_utils import (
- MockObjStorage, BasicMockStorage, BasicMockIndexerStorage,
SHA1_TO_LICENSES, CommonContentIndexerTest, CommonContentIndexerRangeTest,
CommonIndexerWithErrorsTest, CommonIndexerNoTool, NoDiskIndexer,
- BASE_TEST_CONFIG
+ BASE_TEST_CONFIG, fill_storage, fill_obj_storage
)
@@ -78,12 +76,6 @@
},
}
- def prepare(self):
- super().prepare()
- self.idx_storage = BasicMockIndexerStorage()
- self.log = logging.getLogger('swh.indexer')
- self.objstorage = MockObjStorage()
-
class TestFossologyLicenseIndexer(CommonContentIndexerTest, unittest.TestCase):
"""Language indexer test scenarios:
@@ -92,8 +84,14 @@
- Unknown sha1s in the input list are not indexed
"""
+
+ def get_indexer_results(self, ids):
+ yield from self.idx_storage.content_fossology_license_get(ids)
+
def setUp(self):
+ super().setUp()
self.indexer = FossologyLicenseTestIndexer()
+ self.idx_storage = self.indexer.idx_storage
self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
self.id1 = '688a5ef812c53907562fe379d4b3851e69c7cb15'
@@ -138,15 +136,6 @@
'write_batch_size': 100,
}
- def prepare(self):
- super().prepare()
- self.idx_storage = BasicMockIndexerStorage()
- self.log = logging.getLogger('swh.indexer')
- # this hardcodes some contents, will use this to setup the storage
- self.objstorage = MockObjStorage()
- contents = [{'sha1': c_id} for c_id in self.objstorage]
- self.storage = BasicMockStorage(contents)
-
class TestFossologyLicenseRangeIndexer(
CommonContentIndexerRangeTest, unittest.TestCase):
@@ -159,12 +148,10 @@
"""
def setUp(self):
+ super().setUp()
self.indexer = FossologyLicenseRangeIndexerTest()
- # will play along with the objstorage's mocked contents for now
- self.contents = sorted(self.indexer.objstorage)
- # FIXME: leverage swh.objstorage.in_memory_storage's
- # InMemoryObjStorage, swh.storage.tests's gen_contents, and
- # hypothesis to generate data to actually run indexer on those
+ fill_storage(self.indexer.storage)
+ fill_obj_storage(self.indexer.objstorage)
self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
self.id1 = '02fb2c89e14f7fab46701478c83779c7beb7b069'
diff --git a/swh/indexer/tests/test_language.py b/swh/indexer/tests/test_language.py
--- a/swh/indexer/tests/test_language.py
+++ b/swh/indexer/tests/test_language.py
@@ -7,8 +7,8 @@
from swh.indexer import language
from swh.indexer.language import LanguageIndexer
from swh.indexer.tests.test_utils import (
- BasicMockIndexerStorage, MockObjStorage, CommonContentIndexerTest,
- CommonIndexerWithErrorsTest, CommonIndexerNoTool, BASE_TEST_CONFIG
+ CommonContentIndexerTest, CommonIndexerWithErrorsTest,
+ CommonIndexerNoTool, BASE_TEST_CONFIG, fill_storage, fill_obj_storage
)
@@ -30,12 +30,6 @@
}
}
- def prepare(self):
- super().prepare()
- self.idx_storage = BasicMockIndexerStorage()
- self.objstorage = MockObjStorage()
- self.tool_config = self.config['tools']['configuration']
-
class Language(unittest.TestCase):
"""Tests pygments tool for language detection
@@ -60,8 +54,14 @@
- Unknown sha1s in the input list are not indexed
"""
+
+ def get_indexer_results(self, ids):
+ yield from self.indexer.idx_storage.content_language_get(ids)
+
def setUp(self):
self.indexer = LanguageTestIndexer()
+ fill_storage(self.indexer.storage)
+ fill_obj_storage(self.indexer.objstorage)
self.id0 = '02fb2c89e14f7fab46701478c83779c7beb7b069'
self.id1 = '103bc087db1d26afc3a0283f38663d081e9b01e6'
diff --git a/swh/indexer/tests/test_mimetype.py b/swh/indexer/tests/test_mimetype.py
--- a/swh/indexer/tests/test_mimetype.py
+++ b/swh/indexer/tests/test_mimetype.py
@@ -4,7 +4,6 @@
# See top-level LICENSE file for more information
import unittest
-import logging
from unittest.mock import patch
@@ -13,10 +12,9 @@
)
from swh.indexer.tests.test_utils import (
- MockObjStorage, BasicMockStorage, BasicMockIndexerStorage,
CommonContentIndexerTest, CommonContentIndexerRangeTest,
CommonIndexerWithErrorsTest, CommonIndexerNoTool,
- BASE_TEST_CONFIG
+ BASE_TEST_CONFIG, fill_storage, fill_obj_storage
)
@@ -61,12 +59,6 @@
},
}
- def prepare(self):
- super().prepare()
- self.idx_storage = BasicMockIndexerStorage()
- self.log = logging.getLogger('swh.indexer')
- self.objstorage = MockObjStorage()
-
class TestMimetypeIndexer(CommonContentIndexerTest, unittest.TestCase):
"""Mimetype indexer test scenarios:
@@ -75,8 +67,13 @@
- Unknown sha1s in the input list are not indexed
"""
+
+ def get_indexer_results(self, ids):
+ yield from self.idx_storage.content_mimetype_get(ids)
+
def setUp(self):
self.indexer = MimetypeTestIndexer()
+ self.idx_storage = self.indexer.idx_storage
self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
self.id1 = '688a5ef812c53907562fe379d4b3851e69c7cb15'
@@ -123,15 +120,6 @@
'write_batch_size': 100,
}
- def prepare(self):
- super().prepare()
- self.idx_storage = BasicMockIndexerStorage()
- # this hardcodes some contents, will use this to setup the storage
- self.objstorage = MockObjStorage()
- # sync objstorage and storage
- contents = [{'sha1': c_id} for c_id in self.objstorage]
- self.storage = BasicMockStorage(contents)
-
class TestMimetypeRangeIndexer(
CommonContentIndexerRangeTest, unittest.TestCase):
@@ -144,12 +132,10 @@
"""
def setUp(self):
+ super().setUp()
self.indexer = MimetypeRangeIndexerTest()
- # will play along with the objstorage's mocked contents for now
- self.contents = sorted(self.indexer.objstorage)
- # FIXME: leverage swh.objstorage.in_memory_storage's
- # InMemoryObjStorage, swh.storage.tests's gen_contents, and
- # hypothesis to generate data to actually run indexer on those
+ fill_storage(self.indexer.storage)
+ fill_obj_storage(self.indexer.objstorage)
self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
self.id1 = '02fb2c89e14f7fab46701478c83779c7beb7b069'
diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py
--- a/swh/indexer/tests/test_utils.py
+++ b/swh/indexer/tests/test_utils.py
@@ -4,10 +4,12 @@
# See top-level LICENSE file for more information
import datetime
+import hashlib
+import random
from swh.objstorage.exc import ObjNotFoundError
from swh.model import hashutil
-from swh.model.hashutil import hash_to_bytes
+from swh.model.hashutil import hash_to_bytes, hash_to_hex
from swh.indexer.storage import INDEXER_CFG_KEY
@@ -488,6 +490,21 @@
'id': DIRECTORY_ID,
'entries': DIRECTORY,
}])
+ for (obj_id, content) in OBJ_STORAGE_DATA.items():
+ if hasattr(hashlib, 'blake2s'):
+ blake2s256 = hashlib.blake2s(content, digest_size=32).digest()
+ else:
+ # fallback for Python <3.6
+ blake2s256 = bytes([random.randint(0, 255) for _ in range(32)])
+ storage.content_add([{
+ 'data': content,
+ 'length': len(content),
+ 'status': 'visible',
+ 'sha1': hash_to_bytes(obj_id),
+ 'sha1_git': hash_to_bytes(obj_id),
+ 'sha256': hashlib.sha256(content).digest(),
+ 'blake2s256': blake2s256
+ }])
class MockStorage():
@@ -659,7 +676,15 @@
class CommonContentIndexerTest:
- def assert_results_ok(self, actual_results, expected_results=None):
+ def get_indexer_results(self, ids):
+ """Override this for indexers that don't have a mock storage."""
+ return self.indexer.idx_storage.state
+
+ def assert_results_ok(self, sha1s, expected_results=None):
+ sha1s = [sha1 if isinstance(sha1, bytes) else hash_to_bytes(sha1)
+ for sha1 in sha1s]
+ actual_results = self.get_indexer_results(sha1s)
+
if expected_results is None:
expected_results = self.expected_results
@@ -678,15 +703,12 @@
# when
self.indexer.run(sha1s, policy_update='update-dups')
- actual_results = self.indexer.idx_storage.state
- self.assertTrue(self.indexer.idx_storage.conflict_update)
- self.assert_results_ok(actual_results)
+ self.assert_results_ok(sha1s)
# 2nd pass
self.indexer.run(sha1s, policy_update='ignore-dups')
- self.assertFalse(self.indexer.idx_storage.conflict_update)
- self.assert_results_ok(actual_results)
+ self.assert_results_ok(sha1s)
def test_index_one_unknown_sha1(self):
"""Unknown sha1 are not indexed"""
@@ -696,29 +718,35 @@
# when
self.indexer.run(sha1s, policy_update='update-dups')
- actual_results = self.indexer.idx_storage.state
# then
expected_results = {
k: v for k, v in self.expected_results.items() if k in sha1s
}
- self.assert_results_ok(actual_results, expected_results)
+ self.assert_results_ok(sha1s, expected_results)
class CommonContentIndexerRangeTest:
"""Allows to factorize tests on range indexer.
"""
+ def setUp(self):
+ self.contents = sorted(OBJ_STORAGE_DATA)
+
def assert_results_ok(self, start, end, actual_results,
expected_results=None):
if expected_results is None:
expected_results = self.expected_results
+ actual_results = list(actual_results)
for indexed_data in actual_results:
_id = indexed_data['id']
- self.assertEqual(indexed_data, expected_results[_id])
- self.assertTrue(start <= _id and _id <= end)
+ assert isinstance(_id, bytes)
+ indexed_data = indexed_data.copy()
+ indexed_data['id'] = hash_to_hex(indexed_data['id'])
+ self.assertEqual(indexed_data, expected_results[hash_to_hex(_id)])
+ self.assertTrue(start <= _id <= end)
_tool_id = indexed_data['indexer_configuration_id']
self.assertEqual(_tool_id, self.indexer.tool['id'])
@@ -726,7 +754,8 @@
"""Indexing contents without existing data results in indexed data
"""
- start, end = [self.contents[0], self.contents[2]] # output hex ids
+ _start, _end = [self.contents[0], self.contents[2]] # output hex ids
+ start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = list(self.indexer._index_contents(
start, end, indexed={}))
@@ -737,12 +766,13 @@
"""Indexing contents with existing data results in less indexed data
"""
- start, end = [self.contents[0], self.contents[2]] # output hex ids
+ _start, _end = [self.contents[0], self.contents[2]] # output hex ids
+ start, end = map(hashutil.hash_to_bytes, (_start, _end))
data_indexed = [self.id0, self.id2]
# given
actual_results = self.indexer._index_contents(
- start, end, indexed=set(data_indexed))
+ start, end, indexed=set(map(hash_to_bytes, data_indexed)))
# craft the expected results
expected_results = self.expected_results.copy()
@@ -756,7 +786,8 @@
"""Optimal indexing should result in indexed data
"""
- start, end = [self.contents[0], self.contents[2]] # output hex ids
+ _start, _end = [self.contents[0], self.contents[2]] # output hex ids
+ start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = self.indexer.run(start, end)
@@ -783,8 +814,9 @@
def test_generate_content_get_no_result(self):
"""No result indexed returns False"""
- start, end = ['0000000000000000000000000000000000000000',
- '0000000000000000000000000000000000000001']
+ _start, _end = ['0000000000000000000000000000000000000000',
+ '0000000000000000000000000000000000000001']
+ start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = self.indexer.run(
start, end, incremental=False)