Page MenuHomeSoftware Heritage

No OneTemporary

diff --git a/swh/indexer/tests/test_fossology_license.py b/swh/indexer/tests/test_fossology_license.py
index 0fed6fa..2e0ec04 100644
--- a/swh/indexer/tests/test_fossology_license.py
+++ b/swh/indexer/tests/test_fossology_license.py
@@ -1,227 +1,172 @@
# Copyright (C) 2017-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import unittest
import logging
from swh.indexer.fossology_license import (
ContentFossologyLicenseIndexer, FossologyLicenseRangeIndexer
)
from swh.indexer.tests.test_utils import (
MockObjStorage, BasicMockStorage, BasicMockIndexerStorage,
- SHA1_TO_LICENSES, IndexerRangeTest
+ SHA1_TO_LICENSES, CommonContentIndexerTest, CommonContentIndexerRangeTest
)
class NoDiskIndexer:
"""Mixin to override the DiskIndexer behavior avoiding side-effects in
tests.
"""
def write_to_temp(self, filename, data): # noop
return filename
def cleanup(self, content_path): # noop
return None
class InjectLicenseIndexer:
"""Override license computations.
"""
def compute_license(self, path, log=None):
"""path is the content identifier
"""
return {
'licenses': SHA1_TO_LICENSES.get(path)
}
class FossologyLicenseTestIndexer(
NoDiskIndexer, InjectLicenseIndexer, ContentFossologyLicenseIndexer):
- """Specific mimetype whose configuration is enough to satisfy the
- indexing tests.
+ """Specific fossology license whose configuration is enough to satisfy
+ the indexing checks.
"""
def prepare(self):
self.config = {
'tools': {
'name': 'nomos',
'version': '3.1.0rc2-31-ga2cbb8c',
'configuration': {
'command_line': 'nomossa <filepath>',
},
},
}
self.idx_storage = BasicMockIndexerStorage()
self.log = logging.getLogger('swh.indexer')
self.objstorage = MockObjStorage()
self.tools = self.register_tools(self.config['tools'])
self.tool = self.tools[0]
class FossologyLicenseIndexerUnknownToolTestStorage(
FossologyLicenseTestIndexer):
"""Specific fossology license indexer whose configuration is not
enough to satisfy the indexing checks
"""
def prepare(self):
super().prepare()
self.tools = None
class TestFossologyLicenseIndexerWithErrors(unittest.TestCase):
def test_wrong_unknown_configuration_tool(self):
"""Indexer with unknown configuration tool should fail the check"""
with self.assertRaisesRegex(ValueError, 'Tools None is unknown'):
FossologyLicenseIndexerUnknownToolTestStorage()
-class TestFossologyLicenseIndexer(unittest.TestCase):
+class TestFossologyLicenseIndexer(CommonContentIndexerTest, unittest.TestCase):
"""Fossology license tests.
"""
def setUp(self):
self.indexer = FossologyLicenseTestIndexer()
- def test_index_no_update(self):
- """Index sha1s results in new computed licenses
-
- """
- id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
- id1 = '688a5ef812c53907562fe379d4b3851e69c7cb15'
- sha1s = [id0, id1]
-
- # when
- self.indexer.run(sha1s, policy_update='ignore-dups')
-
- # then
- expected_results = [{
- 'id': id0,
- 'indexer_configuration_id': 10,
- 'licenses': SHA1_TO_LICENSES[id0],
- }, {
- 'id': id1,
- 'indexer_configuration_id': 10,
- 'licenses': SHA1_TO_LICENSES[id1],
- }]
-
- self.assertFalse(self.indexer.idx_storage.conflict_update)
- self.assertEqual(expected_results, self.indexer.idx_storage.state)
-
- def test_index_update(self):
- """Index sha1s results in new computed licenses
-
- """
- id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
- id1 = '688a5ef812c53907562fe379d4b3851e69c7cb15'
- id2 = 'da39a3ee5e6b4b0d3255bfef95601890afd80709' # empty content
- sha1s = [id0, id1, id2]
-
- # when
- self.indexer.run(sha1s, policy_update='update-dups')
-
- # then
- expected_results = [{
- 'id': id0,
- 'indexer_configuration_id': 10,
- 'licenses': SHA1_TO_LICENSES[id0],
- }, {
- 'id': id1,
- 'indexer_configuration_id': 10,
- 'licenses': SHA1_TO_LICENSES[id1],
- }, {
- 'id': id2,
- 'indexer_configuration_id': 10,
- 'licenses': SHA1_TO_LICENSES[id2],
- }]
-
- self.assertTrue(self.indexer.idx_storage.conflict_update)
- self.assertEqual(expected_results, self.indexer.idx_storage.state)
-
- def test_index_one_unknown_sha1(self):
- """Only existing contents are indexed
-
- """
- # given
- id0 = '688a5ef812c53907562fe379d4b3851e69c7cb15'
- sha1s = [id0,
- '799a5ef812c53907562fe379d4b3851e69c7cb15', # unknown
- '800a5ef812c53907562fe379d4b3851e69c7cb15'] # unknown
-
- # when
- self.indexer.run(sha1s, policy_update='update-dups')
-
+ self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
+ self.id1 = '688a5ef812c53907562fe379d4b3851e69c7cb15'
+ self.id2 = 'da39a3ee5e6b4b0d3255bfef95601890afd80709' # empty content
# then
- expected_results = [{
- 'id': id0,
- 'indexer_configuration_id': 10,
- 'licenses': SHA1_TO_LICENSES[id0],
- }]
-
- self.assertTrue(self.indexer.idx_storage.conflict_update)
- self.assertEqual(expected_results, self.indexer.idx_storage.state)
+ self.expected_results = {
+ self.id0: {
+ 'id': self.id0,
+ 'indexer_configuration_id': 10,
+ 'licenses': SHA1_TO_LICENSES[self.id0],
+ },
+ self.id1: {
+ 'id': self.id1,
+ 'indexer_configuration_id': 10,
+ 'licenses': SHA1_TO_LICENSES[self.id1],
+ },
+ self.id2: {
+ 'id': self.id2,
+ 'indexer_configuration_id': 10,
+ 'licenses': SHA1_TO_LICENSES[self.id2],
+ }
+ }
class FossologyLicenseRangeIndexerTest(
NoDiskIndexer, InjectLicenseIndexer, FossologyLicenseRangeIndexer):
"""Testing the range indexer on fossology license.
"""
def prepare(self):
self.config = {
'tools': {
'name': 'nomos',
'version': '3.1.0rc2-31-ga2cbb8c',
'configuration': {
'command_line': 'nomossa <filepath>',
},
},
'write_batch_size': 100,
}
self.idx_storage = BasicMockIndexerStorage()
self.log = logging.getLogger('swh.indexer')
# this hardcodes some contents, will use this to setup the storage
self.objstorage = MockObjStorage()
# sync objstorage and storage
contents = [{'sha1': c_id} for c_id in self.objstorage]
self.storage = BasicMockStorage(contents)
self.tools = self.register_tools(self.config['tools'])
self.tool = self.tools[0]
-class TestFossologyLicenseRangeIndexer(IndexerRangeTest, unittest.TestCase):
+class TestFossologyLicenseRangeIndexer(
+ CommonContentIndexerRangeTest, unittest.TestCase):
def setUp(self):
self.indexer = FossologyLicenseRangeIndexerTest()
# will play along with the objstorage's mocked contents for now
self.contents = sorted(self.indexer.objstorage)
# FIXME: leverage swh.objstorage.in_memory_storage's
# InMemoryObjStorage, swh.storage.tests's gen_contents, and
# hypothesis to generate data to actually run indexer on those
self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
self.id1 = '02fb2c89e14f7fab46701478c83779c7beb7b069'
self.id2 = '103bc087db1d26afc3a0283f38663d081e9b01e6'
self.expected_results = {
self.id0: {
'id': self.id0,
'indexer_configuration_id': 10,
'licenses': SHA1_TO_LICENSES[self.id0]
},
self.id1: {
'id': self.id1,
'indexer_configuration_id': 10,
'licenses': SHA1_TO_LICENSES[self.id1]
},
self.id2: {
'id': self.id2,
'indexer_configuration_id': 10,
'licenses': SHA1_TO_LICENSES[self.id2]
}
}
diff --git a/swh/indexer/tests/test_mimetype.py b/swh/indexer/tests/test_mimetype.py
index 6206e88..7fe178a 100644
--- a/swh/indexer/tests/test_mimetype.py
+++ b/swh/indexer/tests/test_mimetype.py
@@ -1,198 +1,156 @@
# Copyright (C) 2017-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import unittest
import logging
from swh.indexer.mimetype import (
ContentMimetypeIndexer, MimetypeRangeIndexer
)
from swh.indexer.tests.test_utils import (
- MockObjStorage, BasicMockStorage, BasicMockIndexerStorage, IndexerRangeTest
+ MockObjStorage, BasicMockStorage, BasicMockIndexerStorage,
+ CommonContentIndexerTest, CommonContentIndexerRangeTest
)
class MimetypeTestIndexer(ContentMimetypeIndexer):
- """Specific mimetype whose configuration is enough to satisfy the
- indexing tests.
+ """Specific mimetype indexer instance whose configuration is enough to
+ satisfy the indexing tests.
"""
def prepare(self):
self.config = {
'tools': {
'name': 'file',
'version': '1:5.30-1+deb9u1',
'configuration': {
"type": "library",
"debian-package": "python3-magic"
},
},
}
self.idx_storage = BasicMockIndexerStorage()
self.log = logging.getLogger('swh.indexer')
self.objstorage = MockObjStorage()
self.tools = self.register_tools(self.config['tools'])
self.tool = self.tools[0]
class MimetypeIndexerUnknownToolTestStorage(MimetypeTestIndexer):
"""Specific mimetype whose configuration is not enough to satisfy the
- indexing tests.
+ indexing checks.
"""
def prepare(self):
super().prepare()
self.tools = None
class TestMimetypeIndexerWithErrors(unittest.TestCase):
def test_wrong_unknown_configuration_tool(self):
"""Indexer with unknown configuration tool should fail the check"""
with self.assertRaisesRegex(ValueError, 'Tools None is unknown'):
MimetypeIndexerUnknownToolTestStorage()
-class TestMimetypeIndexer(unittest.TestCase):
+class TestMimetypeIndexer(CommonContentIndexerTest, unittest.TestCase):
+ """Mimetype indexer test scenarios:
+
+ - new data within range are indexed
+ - no data outside a range are indexed
+ - with filtering existing indexed data prior to compute new index
+ - without filtering existing indexed data prior to compute new index
+
+ """
def setUp(self):
self.indexer = MimetypeTestIndexer()
- def test_index_no_update(self):
- # given
- sha1s = [
- '01c9379dfc33803963d07c1ccc748d3fe4c96bb5',
- '688a5ef812c53907562fe379d4b3851e69c7cb15',
- ]
-
- # when
- self.indexer.run(sha1s, policy_update='ignore-dups')
-
- # then
- expected_results = [{
- 'id': '01c9379dfc33803963d07c1ccc748d3fe4c96bb5',
- 'indexer_configuration_id': 10,
- 'mimetype': b'text/plain',
- 'encoding': b'us-ascii',
- }, {
- 'id': '688a5ef812c53907562fe379d4b3851e69c7cb15',
- 'indexer_configuration_id': 10,
- 'mimetype': b'text/plain',
- 'encoding': b'us-ascii',
- }]
-
- self.assertFalse(self.indexer.idx_storage.conflict_update)
- self.assertEqual(expected_results, self.indexer.idx_storage.state)
-
- def test_index_update(self):
- # given
- sha1s = [
- '01c9379dfc33803963d07c1ccc748d3fe4c96bb5',
- '688a5ef812c53907562fe379d4b3851e69c7cb15',
- 'da39a3ee5e6b4b0d3255bfef95601890afd80709', # empty content
- ]
-
- # when
- self.indexer.run(sha1s, policy_update='update-dups')
-
- # then
- expected_results = [{
- 'id': '01c9379dfc33803963d07c1ccc748d3fe4c96bb5',
- 'indexer_configuration_id': 10,
- 'mimetype': b'text/plain',
- 'encoding': b'us-ascii',
- }, {
- 'id': '688a5ef812c53907562fe379d4b3851e69c7cb15',
- 'indexer_configuration_id': 10,
- 'mimetype': b'text/plain',
- 'encoding': b'us-ascii',
- }, {
- 'id': 'da39a3ee5e6b4b0d3255bfef95601890afd80709',
- 'indexer_configuration_id': 10,
- 'mimetype': b'application/x-empty',
- 'encoding': b'binary',
- }]
-
- self.assertTrue(self.indexer.idx_storage.conflict_update)
- self.assertEqual(expected_results, self.indexer.idx_storage.state)
-
- def test_index_one_unknown_sha1(self):
- # given
- sha1s = ['688a5ef812c53907562fe379d4b3851e69c7cb15',
- '799a5ef812c53907562fe379d4b3851e69c7cb15', # unknown
- '800a5ef812c53907562fe379d4b3851e69c7cb15'] # unknown
-
- # when
- self.indexer.run(sha1s, policy_update='update-dups')
-
- # then
- expected_results = [{
- 'id': '688a5ef812c53907562fe379d4b3851e69c7cb15',
- 'indexer_configuration_id': 10,
- 'mimetype': b'text/plain',
- 'encoding': b'us-ascii',
- }]
-
- self.assertTrue(self.indexer.idx_storage.conflict_update)
- self.assertEqual(expected_results, self.indexer.idx_storage.state)
+ self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
+ self.id1 = '688a5ef812c53907562fe379d4b3851e69c7cb15'
+ self.id2 = 'da39a3ee5e6b4b0d3255bfef95601890afd80709'
+ self.expected_results = {
+ self.id0: {
+ 'id': self.id0,
+ 'indexer_configuration_id': 10,
+ 'mimetype': b'text/plain',
+ 'encoding': b'us-ascii',
+ },
+ self.id1: {
+ 'id': self.id1,
+ 'indexer_configuration_id': 10,
+ 'mimetype': b'text/plain',
+ 'encoding': b'us-ascii',
+ },
+ self.id2: {
+ 'id': self.id2,
+ 'indexer_configuration_id': 10,
+ 'mimetype': b'application/x-empty',
+ 'encoding': b'binary',
+ }
+ }
class MimetypeRangeIndexerTest(MimetypeRangeIndexer):
"""Specific mimetype whose configuration is enough to satisfy the
indexing tests.
"""
def prepare(self):
self.config = {
'tools': {
'name': 'file',
'version': '1:5.30-1+deb9u1',
'configuration': {
"type": "library",
"debian-package": "python3-magic"
},
},
'write_batch_size': 100,
}
self.idx_storage = BasicMockIndexerStorage()
self.log = logging.getLogger('swh.indexer')
# this hardcodes some contents, will use this to setup the storage
self.objstorage = MockObjStorage()
# sync objstorage and storage
contents = [{'sha1': c_id} for c_id in self.objstorage]
self.storage = BasicMockStorage(contents)
self.tools = self.register_tools(self.config['tools'])
self.tool = self.tools[0]
-class TestMimetypeRangeIndexer(IndexerRangeTest, unittest.TestCase):
- """Range Mimetype Indexer tests on """
+class TestMimetypeRangeIndexer(
+ CommonContentIndexerRangeTest, unittest.TestCase):
+ """Range Mimetype Indexer tests.
+
+ """
def setUp(self):
self.indexer = MimetypeRangeIndexerTest()
# will play along with the objstorage's mocked contents for now
self.contents = sorted(self.indexer.objstorage)
# FIXME: leverage swh.objstorage.in_memory_storage's
# InMemoryObjStorage, swh.storage.tests's gen_contents, and
# hypothesis to generate data to actually run indexer on those
self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
self.id1 = '02fb2c89e14f7fab46701478c83779c7beb7b069'
self.id2 = '103bc087db1d26afc3a0283f38663d081e9b01e6'
self.expected_results = {
self.id0: {
'encoding': b'us-ascii',
'id': self.id0,
'indexer_configuration_id': 10,
'mimetype': b'text/plain'},
self.id1: {
'encoding': b'us-ascii',
'id': self.id1,
'indexer_configuration_id': 10,
'mimetype': b'text/x-python'},
self.id2: {
'encoding': b'us-ascii',
'id': self.id2,
'indexer_configuration_id': 10,
'mimetype': b'text/plain'}
}
diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py
index 6a4e705..89c7afb 100644
--- a/swh/indexer/tests/test_utils.py
+++ b/swh/indexer/tests/test_utils.py
@@ -1,600 +1,648 @@
# Copyright (C) 2017-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.objstorage.exc import ObjNotFoundError
from swh.model import hashutil
ORIGINS = [
{
'id': 52189575,
'lister': None,
'project': None,
'type': 'git',
'url': 'https://github.com/SoftwareHeritage/swh-storage'},
{
'id': 4423668,
'lister': None,
'project': None,
'type': 'ftp',
'url': 'rsync://ftp.gnu.org/gnu/3dldf'},
{
'id': 77775770,
'lister': None,
'project': None,
'type': 'deposit',
'url': 'https://forge.softwareheritage.org/source/jesuisgpl/'},
{
'id': 85072327,
'lister': None,
'project': None,
'type': 'pypi',
'url': 'https://pypi.org/project/limnoria/'},
{
'id': 49908349,
'lister': None,
'project': None,
'type': 'svn',
'url': 'http://0-512-md.googlecode.com/svn/'},
{
'id': 54974445,
'lister': None,
'project': None,
'type': 'git',
'url': 'https://github.com/librariesio/yarn-parser'},
]
SNAPSHOTS = {
52189575: {
'branches': {
b'refs/heads/add-revision-origin-cache': {
'target': b'L[\xce\x1c\x88\x8eF\t\xf1"\x19\x1e\xfb\xc0'
b's\xe7/\xe9l\x1e',
'target_type': 'revision'},
b'HEAD': {
'target': b'8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}'
b'\xac\xefrm',
'target_type': 'revision'},
b'refs/tags/v0.0.103': {
'target': b'\xb6"Im{\xfdLb\xb0\x94N\xea\x96m\x13x\x88+'
b'\x0f\xdd',
'target_type': 'release'},
}},
4423668: {
'branches': {
b'3DLDF-1.1.4.tar.gz': {
'target': b'dJ\xfb\x1c\x91\xf4\x82B%]6\xa2\x90|\xd3\xfc'
b'"G\x99\x11',
'target_type': 'revision'},
b'3DLDF-2.0.2.tar.gz': {
'target': b'\xb6\x0e\xe7\x9e9\xac\xaa\x19\x9e='
b'\xd1\xc5\x00\\\xc6\xfc\xe0\xa6\xb4V',
'target_type': 'revision'},
b'3DLDF-2.0.3-examples.tar.gz': {
'target': b'!H\x19\xc0\xee\x82-\x12F1\xbd\x97'
b'\xfe\xadZ\x80\x80\xc1\x83\xff',
'target_type': 'revision'},
b'3DLDF-2.0.3.tar.gz': {
'target': b'\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee'
b'\xcc\x1a\xb4`\x8c\x8by',
'target_type': 'revision'},
b'3DLDF-2.0.tar.gz': {
'target': b'F6*\xff(?\x19a\xef\xb6\xc2\x1fv$S\xe3G'
b'\xd3\xd1m',
'target_type': 'revision'}
}},
77775770: {
'branches': {
b'master': {
'target': b'\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{'
b'\xa6\xe9\x99\xb1\x9e]q\xeb',
'target_type': 'revision'}
},
'id': b"h\xc0\xd2a\x04\xd4~'\x8d\xd6\xbe\x07\xeda\xfa\xfbV"
b"\x1d\r "},
85072327: {
'branches': {
b'HEAD': {
'target': b'releases/2018.09.09',
'target_type': 'alias'},
b'releases/2018.09.01': {
'target': b'<\xee1(\xe8\x8d_\xc1\xc9\xa6rT\xf1\x1d'
b'\xbb\xdfF\xfdw\xcf',
'target_type': 'revision'},
b'releases/2018.09.09': {
'target': b'\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k'
b'A\x10\x9d\xc5\xfa2\xf8t',
'target_type': 'revision'}},
'id': b'{\xda\x8e\x84\x7fX\xff\x92\x80^\x93V\x18\xa3\xfay'
b'\x12\x9e\xd6\xb3'},
49908349: {
'branches': {
b'master': {
'target': b'\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8'
b'\xc9\xad#.\x1bw=\x18',
'target_type': 'revision'}},
'id': b'\xa1\xa2\x8c\n\xb3\x87\xa8\xf9\xe0a\x8c\xb7'
b'\x05\xea\xb8\x1f\xc4H\xf4s'},
54974445: {
'branches': {
b'HEAD': {
'target': b'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f',
'target_type': 'revision'}}}
}
SHA1_TO_LICENSES = {
'01c9379dfc33803963d07c1ccc748d3fe4c96bb5': ['GPL'],
'02fb2c89e14f7fab46701478c83779c7beb7b069': ['Apache2.0'],
'103bc087db1d26afc3a0283f38663d081e9b01e6': ['MIT'],
'688a5ef812c53907562fe379d4b3851e69c7cb15': ['AGPL'],
'da39a3ee5e6b4b0d3255bfef95601890afd80709': [],
}
class MockObjStorage:
"""Mock an swh-objstorage objstorage with predefined contents.
"""
data = {}
def __init__(self):
self.data = {
'01c9379dfc33803963d07c1ccc748d3fe4c96bb5': b'this is some text',
'688a5ef812c53907562fe379d4b3851e69c7cb15': b'another text',
'8986af901dd2043044ce8f0d8fc039153641cf17': b'yet another text',
'02fb2c89e14f7fab46701478c83779c7beb7b069': b"""
import unittest
import logging
from swh.indexer.mimetype import ContentMimetypeIndexer
from swh.indexer.tests.test_utils import MockObjStorage
class MockStorage():
def content_mimetype_add(self, mimetypes):
self.state = mimetypes
self.conflict_update = conflict_update
def indexer_configuration_add(self, tools):
return [{
'id': 10,
}]
""",
'103bc087db1d26afc3a0283f38663d081e9b01e6': b"""
#ifndef __AVL__
#define __AVL__
typedef struct _avl_tree avl_tree;
typedef struct _data_t {
int content;
} data_t;
""",
'93666f74f1cf635c8c8ac118879da6ec5623c410': b"""
(should 'pygments (recognize 'lisp 'easily))
""",
'26a9f72a7c87cc9205725cfd879f514ff4f3d8d5': b"""
{
"name": "test_metadata",
"version": "0.0.1",
"description": "Simple package.json test for indexer",
"repository": {
"type": "git",
"url": "https://github.com/moranegg/metadata_test"
}
}
""",
'd4c647f0fc257591cc9ba1722484229780d1c607': b"""
{
"version": "5.0.3",
"name": "npm",
"description": "a package manager for JavaScript",
"keywords": [
"install",
"modules",
"package manager",
"package.json"
],
"preferGlobal": true,
"config": {
"publishtest": false
},
"homepage": "https://docs.npmjs.com/",
"author": "Isaac Z. Schlueter <i@izs.me> (http://blog.izs.me)",
"repository": {
"type": "git",
"url": "https://github.com/npm/npm"
},
"bugs": {
"url": "https://github.com/npm/npm/issues"
},
"dependencies": {
"JSONStream": "~1.3.1",
"abbrev": "~1.1.0",
"ansi-regex": "~2.1.1",
"ansicolors": "~0.3.2",
"ansistyles": "~0.1.3"
},
"devDependencies": {
"tacks": "~1.2.6",
"tap": "~10.3.2"
},
"license": "Artistic-2.0"
}
""",
'a7ab314d8a11d2c93e3dcf528ca294e7b431c449': b"""
""",
'da39a3ee5e6b4b0d3255bfef95601890afd80709': b'',
}
def __iter__(self):
yield from self.data.keys()
def __contains__(self, sha1):
return self.data.get(sha1) is not None
def get(self, sha1):
raw_content = self.data.get(sha1)
if raw_content is None:
raise ObjNotFoundError(sha1)
return raw_content
class MockIndexerStorage():
"""Mock an swh-indexer storage.
"""
added_data = []
def indexer_configuration_add(self, tools):
tool = tools[0]
if tool['tool_name'] == 'swh-metadata-translator':
return [{
'id': 30,
'tool_name': 'swh-metadata-translator',
'tool_version': '0.0.1',
'tool_configuration': {
'type': 'local',
'context': 'NpmMapping'
},
}]
elif tool['tool_name'] == 'swh-metadata-detector':
return [{
'id': 7,
'tool_name': 'swh-metadata-detector',
'tool_version': '0.0.1',
'tool_configuration': {
'type': 'local',
'context': 'NpmMapping'
},
}]
elif tool['tool_name'] == 'origin-metadata':
return [{
'id': 8,
'tool_name': 'origin-metadata',
'tool_version': '0.0.1',
'tool_configuration': {},
}]
else:
assert False, 'Unknown tool {tool_name}'.format(**tool)
def content_metadata_missing(self, sha1s):
yield from []
def content_metadata_add(self, metadata, conflict_update=None):
self.added_data.append(
('content_metadata', conflict_update, metadata))
def revision_metadata_add(self, metadata, conflict_update=None):
self.added_data.append(
('revision_metadata', conflict_update, metadata))
def origin_intrinsic_metadata_add(self, metadata, conflict_update=None):
self.added_data.append(
('origin_intrinsic_metadata', conflict_update, metadata))
def content_metadata_get(self, sha1s):
return [{
'tool': {
'configuration': {
'type': 'local',
'context': 'NpmMapping'
},
'version': '0.0.1',
'id': 6,
'name': 'swh-metadata-translator'
},
'id': b'cde',
'translated_metadata': {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'type': 'SoftwareSourceCode',
'codemeta:issueTracker':
'https://github.com/librariesio/yarn-parser/issues',
'version': '1.0.0',
'name': 'yarn-parser',
'schema:author': 'Andrew Nesbitt',
'url':
'https://github.com/librariesio/yarn-parser#readme',
'processorRequirements': {'node': '7.5'},
'license': 'AGPL-3.0',
'keywords': ['yarn', 'parse', 'lock', 'dependencies'],
'schema:codeRepository':
'git+https://github.com/librariesio/yarn-parser.git',
'description':
'Tiny web service for parsing yarn.lock files',
}
}]
class MockStorage():
"""Mock a real swh-storage storage to simplify reading indexers'
outputs.
"""
def origin_get(self, id_):
for origin in ORIGINS:
for (k, v) in id_.items():
if origin[k] != v:
break
else:
# This block is run iff we didn't break, ie. if all supplied
# parts of the id are set to the expected value.
return origin
assert False, id_
def snapshot_get_latest(self, origin_id):
if origin_id in SNAPSHOTS:
return SNAPSHOTS[origin_id]
else:
assert False, origin_id
def revision_get(self, revisions):
return [{
'id': b'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f',
'committer': {
'id': 26,
'name': b'Andrew Nesbitt',
'fullname': b'Andrew Nesbitt <andrewnez@gmail.com>',
'email': b'andrewnez@gmail.com'
},
'synthetic': False,
'date': {
'negative_utc': False,
'timestamp': {
'seconds': 1487596456,
'microseconds': 0
},
'offset': 0
},
'directory': b'10'
}]
def directory_ls(self, directory, recursive=False, cur=None):
# with directory: b'\x9d',
return [{
'sha1_git': b'abc',
'name': b'index.js',
'target': b'abc',
'length': 897,
'status': 'visible',
'type': 'file',
'perms': 33188,
'dir_id': b'10',
'sha1': b'bcd'
},
{
'sha1_git': b'aab',
'name': b'package.json',
'target': b'aab',
'length': 712,
'status': 'visible',
'type': 'file',
'perms': 33188,
'dir_id': b'10',
'sha1': b'cde'
},
{
'dir_id': b'10',
'target': b'11',
'type': 'dir',
'length': None,
'name': b'.github',
'sha1': None,
'perms': 16384,
'sha1_git': None,
'status': None,
'sha256': None
}]
class BasicMockStorage():
"""In memory implementation to fake the content_get_range api.
FIXME: To remove when the actual in-memory lands.
"""
contents = []
def __init__(self, contents):
self.contents = contents
def content_get_range(self, start, end, limit=1000):
# to make the input test data consistent with the actual runtime;
# the proper way of doing things would be to rewrite all tests
# (that's another task entirely, so not right now)
if isinstance(start, bytes):
start = hashutil.hash_to_hex(start)
if isinstance(end, bytes):
end = hashutil.hash_to_hex(end)
results = []
_next_id = None
counter = 0
for c in self.contents:
_id = c['sha1']
if start <= _id and _id <= end:
results.append(c)
if counter >= limit:
break
counter += 1
return {
'contents': results,
'next': _next_id
}
class BasicMockIndexerStorage():
"""Mock Indexer storage to simplify reading indexers' outputs.
"""
state = []
def _internal_add(self, data, conflict_update=None):
"""All content indexer have the same structure. So reuse `data` as the
same data. It's either mimetype, language,
fossology_license, etc...
"""
self.state = data
self.conflict_update = conflict_update
def content_mimetype_add(self, data, conflict_update=None):
self._internal_add(data, conflict_update=conflict_update)
def content_fossology_license_add(self, data, conflict_update=None):
self._internal_add(data, conflict_update=conflict_update)
def _internal_get_range(self, start, end,
indexer_configuration_id, limit=1000):
"""Same logic as _internal_add, we retrieve indexed data given an
identifier. So the code here does not change even though
the underlying data does.
"""
# to make the input test data consistent with the actual runtime;
# the proper way of doing things would be to rewrite all tests
# (that's another task entirely, so not right now)
if isinstance(start, bytes):
start = hashutil.hash_to_hex(start)
if isinstance(end, bytes):
end = hashutil.hash_to_hex(end)
results = []
_next = None
counter = 0
for m in self.state:
_id = m['id']
_tool_id = m['indexer_configuration_id']
if (start <= _id and _id <= end and
_tool_id == indexer_configuration_id):
results.append(_id)
if counter >= limit:
break
counter += 1
return {
'ids': results,
'next': _next
}
def content_mimetype_get_range(
self, start, end, indexer_configuration_id, limit=1000):
return self._internal_get_range(
start, end, indexer_configuration_id, limit=limit)
def content_fossology_license_get_range(
self, start, end, indexer_configuration_id, limit=1000):
return self._internal_get_range(
start, end, indexer_configuration_id, limit=limit)
def indexer_configuration_add(self, tools):
return [{
'id': 10,
}]
-class IndexerRangeTest:
+class CommonContentIndexerTest:
+ def assert_results_ok(self, actual_results, expected_results=None):
+ if expected_results is None:
+ expected_results = self.expected_results
+
+ for indexed_data in actual_results:
+ _id = indexed_data['id']
+ self.assertEqual(indexed_data, expected_results[_id])
+ _tool_id = indexed_data['indexer_configuration_id']
+ self.assertEqual(_tool_id, self.indexer.tool['id'])
+
+ def test_index(self):
+ """Known sha1 have their data indexed
+
+ """
+ sha1s = [self.id0, self.id1, self.id2]
+
+ # when
+ self.indexer.run(sha1s, policy_update='update-dups')
+
+ actual_results = self.indexer.idx_storage.state
+ self.assertTrue(self.indexer.idx_storage.conflict_update)
+ self.assert_results_ok(actual_results)
+
+ # 2nd pass
+ self.indexer.run(sha1s, policy_update='ignore-dups')
+
+ self.assertFalse(self.indexer.idx_storage.conflict_update)
+ self.assert_results_ok(actual_results)
+
+ def test_index_one_unknown_sha1(self):
+ """Unknown sha1 are not indexed"""
+ sha1s = [self.id1,
+ '799a5ef812c53907562fe379d4b3851e69c7cb15', # unknown
+ '800a5ef812c53907562fe379d4b3851e69c7cb15'] # unknown
+
+ # when
+ self.indexer.run(sha1s, policy_update='update-dups')
+ actual_results = self.indexer.idx_storage.state
+
+ # then
+ expected_results = {
+ k: v for k, v in self.expected_results.items() if k in sha1s
+ }
+
+ self.assert_results_ok(actual_results, expected_results)
+
+
+class CommonContentIndexerRangeTest:
"""Allows to factorize tests on range indexer.
"""
def assert_results_ok(self, start, end, actual_results,
expected_results=None):
if expected_results is None:
expected_results = self.expected_results
for indexed_data in actual_results:
_id = indexed_data['id']
self.assertEqual(indexed_data, expected_results[_id])
self.assertTrue(start <= _id and _id <= end)
_tool_id = indexed_data['indexer_configuration_id']
self.assertEqual(_tool_id, self.indexer.tool['id'])
def test__index_contents(self):
"""Indexing contents without existing data results in indexed data
"""
start, end = [self.contents[0], self.contents[2]] # output hex ids
# given
actual_results = list(self.indexer._index_contents(
start, end, indexed={}))
self.assert_results_ok(start, end, actual_results)
def test__index_contents_with_indexed_data(self):
"""Indexing contents with existing data results in less indexed data
"""
start, end = [self.contents[0], self.contents[2]] # output hex ids
data_indexed = [self.id0, self.id2]
# given
actual_results = self.indexer._index_contents(
start, end, indexed=set(data_indexed))
# craft the expected results
expected_results = self.expected_results.copy()
for already_indexed_key in data_indexed:
expected_results.pop(already_indexed_key)
self.assert_results_ok(
start, end, actual_results, expected_results)
- def test_generate_content_mimetype_get(self):
+ def test_generate_content_get(self):
"""Optimal indexing should result in indexed data
"""
start, end = [self.contents[0], self.contents[2]] # output hex ids
# given
actual_results = self.indexer.run(start, end)
# then
self.assertTrue(actual_results)
- def test_generate_content_mimetype_get_input_as_bytes(self):
+ def test_generate_content_get_input_as_bytes(self):
"""Optimal indexing should result in indexed data
Input are in bytes here.
"""
_start, _end = [self.contents[0], self.contents[2]] # output hex ids
start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = self.indexer.run( # checks the bytes input this time
start, end, skip_existing=False) # no data so same result
# then
self.assertTrue(actual_results)
- def test_generate_content_mimetype_get_no_result(self):
+ def test_generate_content_get_no_result(self):
"""No result indexed returns False"""
start, end = ['0000000000000000000000000000000000000000',
'0000000000000000000000000000000000000001']
# given
actual_results = self.indexer.run(
start, end, incremental=False)
# then
self.assertFalse(actual_results)

File Metadata

Mime Type
text/x-diff
Expires
Jul 4 2025, 9:50 AM (5 w, 1 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3252009

Event Timeline