Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/tests/test_metadata.py
# Copyright (C) 2017-2018 The Software Heritage developers | # Copyright (C) 2017-2018 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import json | import json | ||||
import unittest | import unittest | ||||
import unittest.mock | |||||
from hypothesis import given, strategies, settings, HealthCheck | from hypothesis import given, strategies, settings, HealthCheck | ||||
from swh.model.hashutil import hash_to_bytes | from swh.model.hashutil import hash_to_bytes | ||||
from swh.indexer.codemeta import CODEMETA_TERMS, CROSSWALK_TABLE | from swh.indexer.codemeta import CODEMETA_TERMS, CROSSWALK_TABLE | ||||
from swh.indexer.codemeta import merge_documents | from swh.indexer.codemeta import merge_documents | ||||
from swh.indexer.metadata_dictionary import MAPPINGS | from swh.indexer.metadata_dictionary import MAPPINGS | ||||
from swh.indexer.metadata_dictionary.base import merge_values | from swh.indexer.metadata_dictionary.base import merge_values | ||||
from swh.indexer.metadata_detector import ( | from swh.indexer.metadata_detector import ( | ||||
detect_metadata | detect_metadata | ||||
) | ) | ||||
from swh.indexer.metadata import ( | from swh.indexer.metadata import RevisionMetadataIndexer | ||||
ContentMetadataIndexer, RevisionMetadataIndexer | |||||
) | |||||
from .utils import ( | from .utils import ( | ||||
BASE_TEST_CONFIG, fill_obj_storage, fill_storage, | BASE_TEST_CONFIG, fill_obj_storage, fill_storage, | ||||
YARN_PARSER_METADATA, json_document_strategy, | YARN_PARSER_METADATA, json_document_strategy, | ||||
xml_document_strategy, | xml_document_strategy, | ||||
) | ) | ||||
TRANSLATOR_TOOL = { | TRANSLATOR_TOOL = { | ||||
'name': 'swh-metadata-translator', | 'name': 'swh-metadata-translator', | ||||
'version': '0.0.2', | 'version': '0.0.2', | ||||
'configuration': { | 'configuration': { | ||||
'type': 'local', | 'type': 'local', | ||||
'context': 'NpmMapping' | 'context': 'NpmMapping' | ||||
} | } | ||||
} | } | ||||
class ContentMetadataTestIndexer(ContentMetadataIndexer): | |||||
"""Specific Metadata whose configuration is enough to satisfy the | |||||
indexing tests. | |||||
""" | |||||
def parse_config_file(self, *args, **kwargs): | |||||
assert False, 'should not be called; the rev indexer configures it.' | |||||
REVISION_METADATA_CONFIG = { | REVISION_METADATA_CONFIG = { | ||||
**BASE_TEST_CONFIG, | **BASE_TEST_CONFIG, | ||||
'tools': TRANSLATOR_TOOL, | 'tools': TRANSLATOR_TOOL, | ||||
} | } | ||||
class Metadata(unittest.TestCase): | class Metadata(unittest.TestCase): | ||||
""" | """ | ||||
▲ Show 20 Lines • Show All 164 Lines • ▼ Show 20 Lines | def test_merge_documents(self): | ||||
"description": 'Simple package.json test for indexer', | "description": 'Simple package.json test for indexer', | ||||
"name": ['test_1', 'test_0_1', 'test_metadata'], | "name": ['test_1', 'test_0_1', 'test_metadata'], | ||||
"author": ['moranegg'], | "author": ['moranegg'], | ||||
"codeRepository": | "codeRepository": | ||||
'git+https://github.com/moranegg/metadata_test', | 'git+https://github.com/moranegg/metadata_test', | ||||
} | } | ||||
self.assertEqual(expected_results, results) | self.assertEqual(expected_results, results) | ||||
def test_index_content_metadata_npm(self): | |||||
""" | |||||
testing NPM with package.json | |||||
- one sha1 uses a file that can't be translated to metadata and | |||||
should return None in the translated metadata | |||||
""" | |||||
# given | |||||
sha1s = [ | |||||
hash_to_bytes('26a9f72a7c87cc9205725cfd879f514ff4f3d8d5'), | |||||
hash_to_bytes('d4c647f0fc257591cc9ba1722484229780d1c607'), | |||||
hash_to_bytes('02fb2c89e14f7fab46701478c83779c7beb7b069'), | |||||
] | |||||
# this metadata indexer computes only metadata for package.json | |||||
# in npm context with a hard mapping | |||||
config = BASE_TEST_CONFIG.copy() | |||||
config['tools'] = [TRANSLATOR_TOOL] | |||||
metadata_indexer = ContentMetadataTestIndexer(config=config) | |||||
fill_obj_storage(metadata_indexer.objstorage) | |||||
fill_storage(metadata_indexer.storage) | |||||
# when | |||||
metadata_indexer.run(sha1s, policy_update='ignore-dups') | |||||
results = list(metadata_indexer.idx_storage.content_metadata_get( | |||||
sha1s)) | |||||
expected_results = [{ | |||||
'metadata': { | |||||
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0', | |||||
'type': 'SoftwareSourceCode', | |||||
'codeRepository': | |||||
'git+https://github.com/moranegg/metadata_test', | |||||
'description': 'Simple package.json test for indexer', | |||||
'name': 'test_metadata', | |||||
'version': '0.0.1' | |||||
}, | |||||
'id': hash_to_bytes('26a9f72a7c87cc9205725cfd879f514ff4f3d8d5'), | |||||
}, { | |||||
'metadata': { | |||||
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0', | |||||
'type': 'SoftwareSourceCode', | |||||
'issueTracker': | |||||
'https://github.com/npm/npm/issues', | |||||
'author': [{ | |||||
'type': 'Person', | |||||
'name': 'Isaac Z. Schlueter', | |||||
'email': 'i@izs.me', | |||||
'url': 'http://blog.izs.me', | |||||
}], | |||||
'codeRepository': | |||||
'git+https://github.com/npm/npm', | |||||
'description': 'a package manager for JavaScript', | |||||
'license': 'https://spdx.org/licenses/Artistic-2.0', | |||||
'version': '5.0.3', | |||||
'name': 'npm', | |||||
'keywords': [ | |||||
'install', | |||||
'modules', | |||||
'package manager', | |||||
'package.json' | |||||
], | |||||
'url': 'https://docs.npmjs.com/' | |||||
}, | |||||
'id': hash_to_bytes('d4c647f0fc257591cc9ba1722484229780d1c607') | |||||
}] | |||||
for result in results: | |||||
del result['tool'] | |||||
# The assertion below returns False sometimes because of nested lists | |||||
self.assertEqual(expected_results, results) | |||||
def test_npm_bugs_normalization(self): | def test_npm_bugs_normalization(self): | ||||
# valid dictionary | # valid dictionary | ||||
package_json = b"""{ | package_json = b"""{ | ||||
"name": "foo", | "name": "foo", | ||||
"bugs": { | "bugs": { | ||||
"url": "https://github.com/owner/project/issues", | "url": "https://github.com/owner/project/issues", | ||||
"email": "foo@example.com" | "email": "foo@example.com" | ||||
} | } | ||||
▲ Show 20 Lines • Show All 807 Lines • ▼ Show 20 Lines | def test_gemspec_adversarial(self, doc): | ||||
self.gemspec_mapping.translate(b''.join(parts)) | self.gemspec_mapping.translate(b''.join(parts)) | ||||
def test_revision_metadata_indexer(self): | def test_revision_metadata_indexer(self): | ||||
metadata_indexer = RevisionMetadataIndexer( | metadata_indexer = RevisionMetadataIndexer( | ||||
config=REVISION_METADATA_CONFIG) | config=REVISION_METADATA_CONFIG) | ||||
fill_obj_storage(metadata_indexer.objstorage) | fill_obj_storage(metadata_indexer.objstorage) | ||||
fill_storage(metadata_indexer.storage) | fill_storage(metadata_indexer.storage) | ||||
tool = metadata_indexer.idx_storage.indexer_configuration_get( | |||||
{'tool_'+k: v for (k, v) in TRANSLATOR_TOOL.items()}) | |||||
assert tool is not None | |||||
metadata_indexer.idx_storage.content_metadata_add([{ | |||||
'indexer_configuration_id': tool['id'], | |||||
'id': b'cde', | |||||
'metadata': YARN_PARSER_METADATA, | |||||
}]) | |||||
sha1_gits = [ | sha1_gits = [ | ||||
hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), | hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), | ||||
] | ] | ||||
with unittest.mock.patch( | |||||
'swh.indexer.metadata.RevisionMetadataIndexer.index_content', | |||||
return_value=YARN_PARSER_METADATA): | |||||
metadata_indexer.run(sha1_gits, 'update-dups') | metadata_indexer.run(sha1_gits, 'update-dups') | ||||
results = list( | results = list( | ||||
metadata_indexer.idx_storage. | metadata_indexer.idx_storage. | ||||
revision_intrinsic_metadata_get(sha1_gits)) | revision_intrinsic_metadata_get(sha1_gits)) | ||||
expected_results = [{ | expected_results = [{ | ||||
'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), | 'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), | ||||
'tool': TRANSLATOR_TOOL, | 'tool': TRANSLATOR_TOOL, | ||||
Show All 28 Lines | def test_revision_metadata_indexer_single_root_dir(self): | ||||
'sha1': None, | 'sha1': None, | ||||
'perms': 16384, | 'perms': 16384, | ||||
'sha1_git': None, | 'sha1_git': None, | ||||
'status': None, | 'status': None, | ||||
'sha256': None | 'sha256': None | ||||
}], | }], | ||||
}]) | }]) | ||||
tool = metadata_indexer.idx_storage.indexer_configuration_get( | |||||
{'tool_'+k: v for (k, v) in TRANSLATOR_TOOL.items()}) | |||||
assert tool is not None | |||||
metadata_indexer.idx_storage.content_metadata_add([{ | |||||
'indexer_configuration_id': tool['id'], | |||||
'id': b'cde', | |||||
'metadata': YARN_PARSER_METADATA, | |||||
}]) | |||||
sha1_gits = [ | sha1_gits = [ | ||||
hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), | hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), | ||||
] | ] | ||||
with unittest.mock.patch( | |||||
'swh.indexer.metadata.RevisionMetadataIndexer.index_content', | |||||
return_value=YARN_PARSER_METADATA): | |||||
metadata_indexer.run(sha1_gits, 'update-dups') | metadata_indexer.run(sha1_gits, 'update-dups') | ||||
results = list( | results = list( | ||||
metadata_indexer.idx_storage. | metadata_indexer.idx_storage. | ||||
revision_intrinsic_metadata_get(sha1_gits)) | revision_intrinsic_metadata_get(sha1_gits)) | ||||
expected_results = [{ | expected_results = [{ | ||||
'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), | 'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), | ||||
'tool': TRANSLATOR_TOOL, | 'tool': TRANSLATOR_TOOL, | ||||
Show All 9 Lines |