Differential D215 Diff 716 swh/indexer/tests/test_metadata.py

Changeset View

Standalone View

swh/indexer/tests/test_metadata.py

This file was added.

				# Copyright (C) 2017 The Software Heritage developers
				# See the AUTHORS file at the top-level directory of this distribution
				# License: GNU General Public License version 3, or any later version
				# See top-level LICENSE file for more information

				import unittest
				import logging
				from nose.tools import istest

				from swh.indexer.metadata_dictionary import compute_metadata
				from swh.indexer.metadata import ContentMetadataIndexer
				from swh.indexer.tests.test_utils import MockObjStorage


				def compare_results(expected, captured):
				""" The metadata translation doesn't always keep the
				result in the same order, for this a dedicated function to compare
				results is needed while checking nested lists and dicts
				Args:
				- expected (dict)
				- captured (dict)
				Returns:
				True if results are the same
				"""
				if expected == captured:
				return True

				if isinstance(captured, dict):
				while captured:
				k, v = captured.popitem()
				if not compare_results(expected[k], v):
				return False

				return True
				if isinstance(captured, list):
				while captured:
				elem = captured.pop()
				if elem in expected:
				ardumontUnsubmitted Done Inline Actions you will check only one element here. If not found at the first position, you will always return false. I think you want to remove the else part. And return False after the while clause. cough, this function should be tested as well :) ardumont: you will check only one element here. If not found at the first position, you will always…
				moraneggAuthorUnsubmitted Not Done Inline Actions changing it to if elem not in expected: return False and after the while loop return True moranegg: changing it to ``` if elem not in expected: return False ``` and after the while loop…
				ardumontUnsubmitted Not Done Inline Actions ... and after the while loop return True As a first thought, this is a nice catch up :D On second thought though, is it enough? I mean is checking the absence of unknown element enough? Counter Examples: If `captured` is a subset list of `expected`, the test shall pass even though there is a missing element in `captured`. -> Adding the length check on `captured` and `expected` would make that case appropriately fail. If `captured` is a subset list of `expected` with enough duplicated valid entries. This one could still pass even with the length check (providing the right amount of duplicated valid entries). -> The only way around that seems to check the other way around, from `expected` to `captured`. From an algo's point of view, we are in unit tests so not being optimal is not a problem here. I mean, as long as tests are fast enough, it's not really a problem (you are not writing/reading in a db for example :) Note: This function makes me uneasy. -> Comparing/Serializing nested data structure is not an easy subject -> Maybe there exists some kind of library which provide this functionality... 1'. Also, your compare function seems crafted specifically for the use case covered for the actual tests. What about other kind of nested data structure (list of dict for example). Or this could also suggest a data structure improvement. -> Do we need to enforce order in the current implementation? Adding tests on this function will definitely ease up though. ardumont: > ... > and after the while loop return True As a first thought, this is a nice catch up :D…
				return True
				else:
				return False
				return False


				class MockStorage():
				"""Mock storage to simplify reading indexers' outputs.
				"""
				def content_metadata_add(self, metadata, conflict_update=None):
				self.state = metadata
				self.conflict_update = conflict_update
				ardumontUnsubmitted Done Inline Actions You can remove it. ardumont: You can remove it.
				moraneggAuthorUnsubmitted Not Done Inline Actions ack moranegg: ack

				def indexer_configuration_get(self, tool):
				return {
				'id': 30,
				'name': 'hard_mapping_npm',
				'version': '0.1'
				}


				class TestMetadataIndexer(ContentMetadataIndexer):
				"""Specific Metadata whose configuration is enough to satisfy the
				indexing tests.
				"""
				def prepare(self):
				self.config = {
				'rescheduling_task': None,
				'tools': {
				'name': 'hard_mapping_npm',
				'version': '0.1',
				ardumontUnsubmitted Not Done Inline Actions You can remove this since we saw already the big diff on failure expectation without it :) ardumont: You can remove this since we saw already the big diff on failure expectation without it :)
				moraneggAuthorUnsubmitted Not Done Inline Actions Actually, when the difference is longer than x it doesn't show the whole text. So with the metadata result which is longer than language result, this is useful. moranegg: Actually, when the difference is longer than x it doesn't show the whole text. So with the…
				'configuration': {
				'type': 'local',
				'debian-package': ''
				}
				}
				}
				self.storage = MockStorage()
				self.log = logging.getLogger('swh.indexer')
				self.objstorage = MockObjStorage()
				self.task_destination = None
				self.rescheduling_task = self.config['rescheduling_task']
				self.tools = self.retrieve_tools_information()


				class Metadata(unittest.TestCase):
				"""
				Tests metadata_mock_tool tool for Metadata detection
				"""
				def setUp(self):
				"""
				shows the entire diff in the results
				"""
				self.maxDiff = None

				@istest
				def test_compute_metadata_none(self):
				"""
				testing content empty content is empty
				should return None
				"""
				# given
				content = b""
				tool = "hard_mapping_npm"

				# None if no metadata was found or an error occurred
				declared_metadata = None
				# when
				result = compute_metadata(tool, content)
				# then
				self.assertEqual(declared_metadata, result)

				@istest
				def test_compute_metadata_npm(self):
				"""
				testing only computation of metadata with hard_mapping_npm
				"""
				# given
				content = b"""
				{
				"name": "test_metadata",
				"version": "0.0.1",
				"description": "Simple package.json test for indexer",
				"repository": {
				"type": "git",
				"url": "https://github.com/moranegg/metadata_test"
				}
				}
				"""
				declared_metadata = {
				'name': 'test_metadata',
				'version': '0.0.1',
				'description': 'Simple package.json test for indexer',
				'codeRepository': {
				'type': 'git',
				'url': 'https://github.com/moranegg/metadata_test'
				},
				'other': {}
				}

				# when
				result = compute_metadata("hard_mapping_npm", content)
				# then
				self.assertEqual(declared_metadata, result)
				self.assertTrue(compare_results(declared_metadata, result))

				@istest
				def test_index_content_metadata_npm(self):
				"""
				testing NPM with package.json
				- one sha1 uses a file that can't be translated to metadata and
				should return None in the translated metadata
				"""
				# given
				sha1s = ['26a9f72a7c87cc9205725cfd879f514ff4f3d8d5',
				'd4c647f0fc257591cc9ba1722484229780d1c607',
				'02fb2c89e14f7fab46701478c83779c7beb7b069']
				# this metadata indexer computes only metadata for package.json
				# in npm context with a hard mapping
				metadata_indexer = TestMetadataIndexer()

				# when
				metadata_indexer.run(sha1s, policy_update='ignore-dups')
				results = metadata_indexer.storage.state

				expected_results = [{
				'indexer_configuration_id': 30,
				'translated_metadata': {
				'other': {},
				'codeRepository': {
				'type': 'git',
				'url': 'https://github.com/moranegg/metadata_test'
				},
				'description': 'Simple package.json test for indexer',
				'name': 'test_metadata',
				'version': '0.0.1'
				},
				'id': '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5'
				}, {
				'indexer_configuration_id': 30,
				'translated_metadata': {
				'softwareRequirements': [
				'abbrev',
				'ansi-regex',
				{
				'JSONStream': '~1.3.1',
				'abbrev': '~1.1.0',
				'ansi-regex': '~2.1.1',
				'ansicolors': '~0.3.2',
				'ansistyles': '~0.1.3'
				}
				],
				'issueTracker': {
				'url': 'https://github.com/npm/npm/issues'
				},
				'author':
				'Isaac Z. Schlueter <i@izs.me> (http://blog.izs.me)',
				'codeRepository': {
				'type': 'git',
				'url': 'https://github.com/npm/npm'
				},
				'description': 'a package manager for JavaScript',
				'softwareSuggestions': {
				'tacks': '~1.2.6',
				'tap': '~10.3.2'
				},
				'license': 'Artistic-2.0',
				'version': '5.0.3',
				'other': {
				'preferGlobal': True,
				'config': {
				'publishtest': False
				}
				},
				'name': 'npm',
				'keywords': [
				'install',
				'modules',
				'package manager',
				'package.json'
				],
				'url': 'https://docs.npmjs.com/'
				},
				'id': 'd4c647f0fc257591cc9ba1722484229780d1c607'
				}, {
				'indexer_configuration_id': 30,
				'translated_metadata': None,
				'id': '02fb2c89e14f7fab46701478c83779c7beb7b069'
				}]
				self.assertTrue(compare_results(expected_results, results))
				# The assertion bellow returns False sometimes because of nested lists
				# self.assertEqual(expected_results, results)

				@istest
				def test_compare_method(self):
				"""
				testing compare method to view problems for nested lists and dicts
				"""
				a = {
				'indexer_configuration_id': 30,
				'id': '02fb2c89e14f7fab46701478c83779c7beb7b069',
				'translated_metadata': {
				'other': {
				'preferGlobal': True,
				'config': {
				'publishtest': False
				}
				},
				'name': 'npm',
				'keywords': [
				'install',
				'modules',
				'package manager',
				'package.json'
				],
				}
				}
				b = {
				'id': '02fb2c89e14f7fab46701478c83779c7beb7b069',
				'indexer_configuration_id': 30,
				'translated_metadata':
				{
				'other': {
				'config': {
				'publishtest': False
				},
				'preferGlobal': True
				},
				'keywords': [
				'install',
				'modules',
				'package manager',
				'package.json',

				],
				'name': 'npm',
				}
				}
				self.assertTrue(compare_results(a, b))

				@istest
				def test_index_without_compare_method(self):
				"""
				testing without compare method to check integrity
				"""
				# given
				sha1s = ['26a9f72a7c87cc9205725cfd879f514ff4f3d8d5',
				'02fb2c89e14f7fab46701478c83779c7beb7b069']
				# this metadata indexer computes only metadata for package.json
				# in npm context with a hard mapping
				metadata_indexer = TestMetadataIndexer()

				# when
				metadata_indexer.run(sha1s, policy_update='ignore-dups')
				results = metadata_indexer.storage.state

				expected_results = [{
				'indexer_configuration_id': 30,
				'translated_metadata': {
				'other': {},
				'codeRepository': {
				'type': 'git',
				'url': 'https://github.com/moranegg/metadata_test'
				},
				'description': 'Simple package.json test for indexer',
				'name': 'test_metadata',
				'version': '0.0.1'
				},
				'id': '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5'
				}, {
				'indexer_configuration_id': 30,
				'translated_metadata': None,
				'id': '02fb2c89e14f7fab46701478c83779c7beb7b069'
				}]
				self.assertEqual(expected_results, results)
				self.assertTrue(compare_results(expected_results, results))