diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
index d8f8a18..ee84653 100644
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -1,483 +1,486 @@
# Copyright (C) 2017-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import unittest
-from swh.indexer.metadata_dictionary import CROSSWALK_TABLE, MAPPINGS
-from swh.indexer.metadata_detector import detect_metadata
-from swh.indexer.metadata_detector import extract_minimal_metadata_dict
-from swh.indexer.metadata import ContentMetadataIndexer
-from swh.indexer.metadata import RevisionMetadataIndexer
-from swh.indexer.tests.test_utils import MockObjStorage, MockStorage
-from swh.indexer.tests.test_utils import MockIndexerStorage
-
from swh.model.hashutil import hash_to_bytes
-from .test_utils import BASE_TEST_CONFIG
+from swh.indexer.metadata_dictionary import CROSSWALK_TABLE, MAPPINGS
+from swh.indexer.metadata_detector import (
+ detect_metadata, extract_minimal_metadata_dict
+)
+from swh.indexer.metadata import (
+ ContentMetadataIndexer, RevisionMetadataIndexer
+)
+
+from .test_utils import (
+ MockObjStorage, MockStorage, MockIndexerStorage,
+ BASE_TEST_CONFIG
+)
class ContentMetadataTestIndexer(ContentMetadataIndexer):
"""Specific Metadata whose configuration is enough to satisfy the
indexing tests.
"""
def parse_config_file(self, *args, **kwargs):
assert False, 'should not be called; the rev indexer configures it.'
def prepare(self):
super().prepare()
self.objstorage = MockObjStorage()
self.idx_storage = MockIndexerStorage()
class RevisionMetadataTestIndexer(RevisionMetadataIndexer):
"""Specific indexer whose configuration is enough to satisfy the
indexing tests.
"""
ContentMetadataIndexer = ContentMetadataTestIndexer
def parse_config_file(self, *args, **kwargs):
return {
**BASE_TEST_CONFIG,
'tools': {
'name': 'swh-metadata-detector',
'version': '0.0.2',
'configuration': {
'type': 'local',
'context': 'NpmMapping'
}
}
}
def prepare(self):
super().prepare()
self.storage = MockStorage()
self.idx_storage = MockIndexerStorage()
self.objstorage = MockObjStorage()
self.tools = list(self.register_tools(self.config['tools']))
class Metadata(unittest.TestCase):
"""
Tests metadata_mock_tool tool for Metadata detection
"""
def setUp(self):
"""
shows the entire diff in the results
"""
self.maxDiff = None
self.content_tool = {
'name': 'swh-metadata-translator',
'version': '0.0.2',
'configuration': {
'type': 'local',
'context': 'NpmMapping'
}
}
MockIndexerStorage.added_data = []
def test_crosstable(self):
self.assertEqual(CROSSWALK_TABLE['NodeJS'], {
'repository': 'http://schema.org/codeRepository',
'os': 'http://schema.org/operatingSystem',
'cpu': 'http://schema.org/processorRequirements',
'engines':
'http://schema.org/processorRequirements',
'author': 'http://schema.org/author',
'author.email': 'http://schema.org/email',
'author.name': 'http://schema.org/name',
'contributor': 'http://schema.org/contributor',
'keywords': 'http://schema.org/keywords',
'license': 'http://schema.org/license',
'version': 'http://schema.org/version',
'description': 'http://schema.org/description',
'name': 'http://schema.org/name',
'bugs': 'https://codemeta.github.io/terms/issueTracker',
'homepage': 'http://schema.org/url'
})
def test_compute_metadata_none(self):
"""
testing content empty content is empty
should return None
"""
# given
content = b""
# None if no metadata was found or an error occurred
declared_metadata = None
# when
result = MAPPINGS["NpmMapping"].translate(content)
# then
self.assertEqual(declared_metadata, result)
def test_compute_metadata_npm(self):
"""
testing only computation of metadata with hard_mapping_npm
"""
# given
content = b"""
{
"name": "test_metadata",
"version": "0.0.2",
"description": "Simple package.json test for indexer",
"repository": {
"type": "git",
"url": "https://github.com/moranegg/metadata_test"
},
"author": {
"email": "moranegg@example.com",
"name": "Morane G"
}
}
"""
declared_metadata = {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'type': 'SoftwareSourceCode',
'name': 'test_metadata',
'version': '0.0.2',
'description': 'Simple package.json test for indexer',
'schema:codeRepository':
'git+https://github.com/moranegg/metadata_test',
'schema:author': {
'type': 'Person',
'name': 'Morane G',
'email': 'moranegg@example.com',
},
}
# when
result = MAPPINGS["NpmMapping"].translate(content)
# then
self.assertEqual(declared_metadata, result)
def test_extract_minimal_metadata_dict(self):
"""
Test the creation of a coherent minimal metadata set
"""
# given
metadata_list = [{
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'name': 'test_1',
'version': '0.0.2',
'description': 'Simple package.json test for indexer',
'schema:codeRepository':
'git+https://github.com/moranegg/metadata_test',
}, {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'name': 'test_0_1',
'version': '0.0.2',
'description': 'Simple package.json test for indexer',
'schema:codeRepository':
'git+https://github.com/moranegg/metadata_test'
}, {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'name': 'test_metadata',
'version': '0.0.2',
'schema:author': 'moranegg',
}]
# when
results = extract_minimal_metadata_dict(metadata_list)
# then
expected_results = {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
"version": '0.0.2',
"description": 'Simple package.json test for indexer',
"name": ['test_1', 'test_0_1', 'test_metadata'],
"schema:author": 'moranegg',
"schema:codeRepository":
'git+https://github.com/moranegg/metadata_test',
}
self.assertEqual(expected_results, results)
def test_index_content_metadata_npm(self):
"""
testing NPM with package.json
- one sha1 uses a file that can't be translated to metadata and
should return None in the translated metadata
"""
# given
sha1s = ['26a9f72a7c87cc9205725cfd879f514ff4f3d8d5',
'd4c647f0fc257591cc9ba1722484229780d1c607',
'02fb2c89e14f7fab46701478c83779c7beb7b069']
# this metadata indexer computes only metadata for package.json
# in npm context with a hard mapping
metadata_indexer = ContentMetadataTestIndexer(
tool=self.content_tool, config=BASE_TEST_CONFIG.copy())
# when
metadata_indexer.run(sha1s, policy_update='ignore-dups')
results = metadata_indexer.idx_storage.added_data
expected_results = [('content_metadata', False, [{
'translated_metadata': {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'type': 'SoftwareSourceCode',
'schema:codeRepository':
'git+https://github.com/moranegg/metadata_test',
'description': 'Simple package.json test for indexer',
'name': 'test_metadata',
'version': '0.0.1'
},
'id': '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5'
}, {
'translated_metadata': {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'type': 'SoftwareSourceCode',
'codemeta:issueTracker':
'https://github.com/npm/npm/issues',
'schema:author': {
'type': 'Person',
'name': 'Isaac Z. Schlueter',
'email': 'i@izs.me',
'schema:url': 'http://blog.izs.me',
},
'schema:codeRepository':
'git+https://github.com/npm/npm',
'description': 'a package manager for JavaScript',
'schema:license': 'Artistic-2.0',
'version': '5.0.3',
'name': 'npm',
'keywords': [
'install',
'modules',
'package manager',
'package.json'
],
'schema:url': 'https://docs.npmjs.com/'
},
'id': 'd4c647f0fc257591cc9ba1722484229780d1c607'
}, {
'translated_metadata': None,
'id': '02fb2c89e14f7fab46701478c83779c7beb7b069'
}])]
for result in results:
metadata = result[2]
for item in metadata:
del item['indexer_configuration_id']
# The assertion below returns False sometimes because of nested lists
self.assertEqual(expected_results, results)
def test_detect_metadata_package_json(self):
# given
df = [{
'sha1_git': b'abc',
'name': b'index.js',
'target': b'abc',
'length': 897,
'status': 'visible',
'type': 'file',
'perms': 33188,
'dir_id': b'dir_a',
'sha1': b'bcd'
},
{
'sha1_git': b'aab',
'name': b'package.json',
'target': b'aab',
'length': 712,
'status': 'visible',
'type': 'file',
'perms': 33188,
'dir_id': b'dir_a',
'sha1': b'cde'
}]
# when
results = detect_metadata(df)
expected_results = {
'NpmMapping': [
b'cde'
]
}
# then
self.assertEqual(expected_results, results)
def test_compute_metadata_valid_codemeta(self):
raw_content = (
b"""{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"@type": "SoftwareSourceCode",
"identifier": "CodeMeta",
"description": "CodeMeta is a concept vocabulary that can be used to standardize the exchange of software metadata across repositories and organizations.",
"name": "CodeMeta: Minimal metadata schemas for science software and code, in JSON-LD",
"codeRepository": "https://github.com/codemeta/codemeta",
"issueTracker": "https://github.com/codemeta/codemeta/issues",
"license": "https://spdx.org/licenses/Apache-2.0",
"version": "2.0",
"author": [
{
"@type": "Person",
"givenName": "Carl",
"familyName": "Boettiger",
"email": "cboettig@gmail.com",
"@id": "http://orcid.org/0000-0002-1642-628X"
},
{
"@type": "Person",
"givenName": "Matthew B.",
"familyName": "Jones",
"email": "jones@nceas.ucsb.edu",
"@id": "http://orcid.org/0000-0003-0077-4738"
}
],
"maintainer": {
"@type": "Person",
"givenName": "Carl",
"familyName": "Boettiger",
"email": "cboettig@gmail.com",
"@id": "http://orcid.org/0000-0002-1642-628X"
},
"contIntegration": "https://travis-ci.org/codemeta/codemeta",
"developmentStatus": "active",
"downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip",
"funder": {
"@id": "https://doi.org/10.13039/100000001",
"@type": "Organization",
"name": "National Science Foundation"
},
"funding":"1549758; Codemeta: A Rosetta Stone for Metadata in Scientific Software",
"keywords": [
"metadata",
"software"
],
"version":"2.0",
"dateCreated":"2017-06-05",
"datePublished":"2017-06-05",
"programmingLanguage": "JSON-LD"
}""") # noqa
expected_result = {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"identifier": "CodeMeta",
"description":
"CodeMeta is a concept vocabulary that can "
"be used to standardize the exchange of software metadata "
"across repositories and organizations.",
"name":
"CodeMeta: Minimal metadata schemas for science "
"software and code, in JSON-LD",
"codeRepository": "https://github.com/codemeta/codemeta",
"issueTracker": "https://github.com/codemeta/codemeta/issues",
"license": "https://spdx.org/licenses/Apache-2.0",
"version": "2.0",
"author": [
{
"type": "Person",
"givenName": "Carl",
"familyName": "Boettiger",
"email": "cboettig@gmail.com",
"id": "http://orcid.org/0000-0002-1642-628X"
},
{
"type": "Person",
"givenName": "Matthew B.",
"familyName": "Jones",
"email": "jones@nceas.ucsb.edu",
"id": "http://orcid.org/0000-0003-0077-4738"
}
],
"maintainer": {
"type": "Person",
"givenName": "Carl",
"familyName": "Boettiger",
"email": "cboettig@gmail.com",
"id": "http://orcid.org/0000-0002-1642-628X"
},
"contIntegration": "https://travis-ci.org/codemeta/codemeta",
"developmentStatus": "active",
"downloadUrl":
"https://github.com/codemeta/codemeta/archive/2.0.zip",
"funder": {
"id": "https://doi.org/10.13039/100000001",
"type": "Organization",
"name": "National Science Foundation"
},
"funding": "1549758; Codemeta: A Rosetta Stone for Metadata "
"in Scientific Software",
"keywords": [
"metadata",
"software"
],
"version": "2.0",
"dateCreated": "2017-06-05",
"datePublished": "2017-06-05",
"programmingLanguage": "JSON-LD"
}
result = MAPPINGS["CodemetaMapping"].translate(raw_content)
self.assertEqual(result, expected_result)
def test_compute_metadata_maven(self):
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
central
Maven Repository Switchboard
default
http://repo1.maven.org/maven2
false
"""
result = MAPPINGS["MavenMapping"].translate(raw_content)
self.assertEqual(result, {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'type': 'SoftwareSourceCode',
'name': 'Maven Default Project',
'schema:identifier': 'com.mycompany.app',
'version': '1.2.3',
'schema:codeRepository':
'http://repo1.maven.org/maven2/com/mycompany/app/my-app',
})
def test_revision_metadata_indexer(self):
metadata_indexer = RevisionMetadataTestIndexer()
sha1_gits = [
hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
]
metadata_indexer.run(sha1_gits, 'update-dups')
results = metadata_indexer.idx_storage.added_data
expected_results = [('revision_metadata', True, [{
'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
'translated_metadata': {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'url':
'https://github.com/librariesio/yarn-parser#readme',
'schema:codeRepository':
'git+https://github.com/librariesio/yarn-parser.git',
'schema:author': 'Andrew Nesbitt',
'license': 'AGPL-3.0',
'version': '1.0.0',
'description':
'Tiny web service for parsing yarn.lock files',
'codemeta:issueTracker':
'https://github.com/librariesio/yarn-parser/issues',
'name': 'yarn-parser',
'keywords': ['yarn', 'parse', 'lock', 'dependencies'],
},
}])]
for result in results:
metadata = result[2]
for item in metadata:
del item['indexer_configuration_id']
# then
self.assertEqual(expected_results, results)
diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py
index 1cf307b..b11665f 100644
--- a/swh/indexer/tests/test_origin_metadata.py
+++ b/swh/indexer/tests/test_origin_metadata.py
@@ -1,166 +1,156 @@
# Copyright (C) 2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import time
-import logging
import unittest
-from celery import task
-
-from swh.indexer.metadata import OriginMetadataIndexer, \
- RevisionMetadataIndexer, ContentMetadataIndexer
-from swh.indexer.tests.test_utils import MockObjStorage, MockStorage
-from swh.indexer.tests.test_utils import MockIndexerStorage
-from swh.indexer.tests.test_origin_head import OriginHeadTestIndexer
-
-from swh.scheduler.tests.scheduler_testing import SchedulerTestFixture
+from celery import task
from swh.model.hashutil import hash_to_bytes
-from .test_utils import BASE_TEST_CONFIG
-
+from swh.indexer.metadata import (
+ OriginMetadataIndexer, RevisionMetadataIndexer
+)
-class OriginMetadataTestIndexer(OriginMetadataIndexer):
- def parse_config_file(self, *args, **kwargs):
- return {
- **BASE_TEST_CONFIG,
- 'tools': [],
- }
-
- def prepare(self):
- super().prepare()
- self.storage = MockStorage()
- self.idx_storage = MockIndexerStorage()
- self.objstorage = MockObjStorage()
-
-
-class ContentMetadataTestIndexer(ContentMetadataIndexer):
- """Specific Metadata whose configuration is enough to satisfy the
- indexing tests.
- """
- def prepare(self):
- self.idx_storage = MockIndexerStorage()
- self.log = logging.getLogger('swh.indexer')
- self.objstorage = MockObjStorage()
- self.tools = self.register_tools(self.config['tools'])
- self.tool = self.tools[0]
- self.results = []
+from swh.scheduler.tests.scheduler_testing import SchedulerTestFixture
+from .test_utils import (
+ MockObjStorage, MockStorage, MockIndexerStorage,
+ BASE_TEST_CONFIG
+)
+from .test_origin_head import OriginHeadTestIndexer
+from .test_metadata import ContentMetadataTestIndexer
class RevisionMetadataTestIndexer(RevisionMetadataIndexer):
"""Specific indexer whose configuration is enough to satisfy the
indexing tests.
"""
-
ContentMetadataIndexer = ContentMetadataTestIndexer
- def prepare(self):
- self.config = {
- 'storage': {},
- 'objstorage': {},
- 'indexer_storage': {},
+ def parse_config_file(self, *args, **kwargs):
+ return {
+ **BASE_TEST_CONFIG,
'tools': {
'name': 'swh-metadata-detector',
'version': '0.0.2',
'configuration': {
'type': 'local',
'context': 'NpmMapping'
}
}
}
- self.storage = MockStorage()
+
+ def prepare(self):
+ super().prepare()
self.idx_storage = MockIndexerStorage()
- self.log = logging.getLogger('swh.indexer')
+ self.storage = MockStorage()
self.objstorage = MockObjStorage()
- self.tools = self.register_tools(self.config['tools'])
- self.tool = self.tools[0]
@task
def revision_metadata_test_task(*args, **kwargs):
indexer = RevisionMetadataTestIndexer()
indexer.run(*args, **kwargs)
return indexer.results
+class OriginMetadataTestIndexer(OriginMetadataIndexer):
+ def parse_config_file(self, *args, **kwargs):
+ return {
+ **BASE_TEST_CONFIG,
+ 'tools': []
+ }
+
+ def prepare(self):
+ super().prepare()
+ self.storage = MockStorage()
+ self.objstorage = MockObjStorage()
+ self.idx_storage = MockIndexerStorage()
+
+
@task
def origin_intrinsic_metadata_test_task(*args, **kwargs):
indexer = OriginMetadataTestIndexer()
indexer.run(*args, **kwargs)
return indexer.results
class OriginHeadTestIndexer(OriginHeadTestIndexer):
def prepare(self):
super().prepare()
self.config['tasks'] = {
'revision_metadata': 'revision_metadata_test_task',
'origin_intrinsic_metadata': 'origin_intrinsic_metadata_test_task',
}
class TestOriginMetadata(SchedulerTestFixture, unittest.TestCase):
def setUp(self):
super().setUp()
self.maxDiff = None
# FIXME: Improve mock indexer storage reset behavior
MockIndexerStorage.added_data = []
MockIndexerStorage.revision_metadata = {}
self.add_scheduler_task_type(
'revision_metadata_test_task',
'swh.indexer.tests.test_origin_metadata.'
'revision_metadata_test_task')
self.add_scheduler_task_type(
'origin_intrinsic_metadata_test_task',
'swh.indexer.tests.test_origin_metadata.'
'origin_intrinsic_metadata_test_task')
RevisionMetadataTestIndexer.scheduler = self.scheduler
def tearDown(self):
del RevisionMetadataTestIndexer.scheduler
super().tearDown()
def test_pipeline(self):
indexer = OriginHeadTestIndexer()
indexer.scheduler = self.scheduler
indexer.run(["git+https://github.com/librariesio/yarn-parser"])
self.run_ready_tasks() # Run the first task
time.sleep(0.1) # Give it time to complete and schedule the 2nd one
self.run_ready_tasks() # Run the second task
metadata = {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'url':
'https://github.com/librariesio/yarn-parser#readme',
'schema:codeRepository':
'git+https://github.com/librariesio/yarn-parser.git',
'schema:author': 'Andrew Nesbitt',
'license': 'AGPL-3.0',
'version': '1.0.0',
'description':
'Tiny web service for parsing yarn.lock files',
'codemeta:issueTracker':
'https://github.com/librariesio/yarn-parser/issues',
'name': 'yarn-parser',
'keywords': ['yarn', 'parse', 'lock', 'dependencies'],
}
rev_metadata = {
'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
'translated_metadata': metadata,
- 'indexer_configuration_id': 7,
}
origin_metadata = {
'origin_id': 54974445,
'from_revision': hash_to_bytes(
'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
'metadata': metadata,
- 'indexer_configuration_id': 7,
}
expected_results = [
- ('origin_intrinsic_metadata', True, [origin_metadata]),
- ('revision_metadata', True, [rev_metadata])]
+ ('revision_metadata', True, [rev_metadata]),
+ ('origin_intrinsic_metadata', True, [origin_metadata]),
+ ]
results = list(indexer.idx_storage.added_data)
+ for result in results:
+ metadata = result[2]
+ for item in metadata:
+ # cannot check those (generated ids)
+ del item['indexer_configuration_id']
+
self.assertCountEqual(expected_results, results)
diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py
index 58e81e4..c564ff3 100644
--- a/swh/indexer/tests/test_utils.py
+++ b/swh/indexer/tests/test_utils.py
@@ -1,764 +1,772 @@
# Copyright (C) 2017-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.objstorage.exc import ObjNotFoundError
from swh.model import hashutil
from swh.model.hashutil import hash_to_bytes
from swh.indexer.storage import INDEXER_CFG_KEY
BASE_TEST_CONFIG = {
'storage': {
'cls': 'remote',
'args': {
'url': 'http://nowhere/',
},
},
'objstorage': {
'cls': 'remote',
'args': {
'url': 'http://nowhere2/',
},
},
INDEXER_CFG_KEY: {
'cls': 'memory',
'args': {
},
},
}
ORIGINS = [
{
'id': 52189575,
'lister': None,
'project': None,
'type': 'git',
'url': 'https://github.com/SoftwareHeritage/swh-storage'},
{
'id': 4423668,
'lister': None,
'project': None,
'type': 'ftp',
'url': 'rsync://ftp.gnu.org/gnu/3dldf'},
{
'id': 77775770,
'lister': None,
'project': None,
'type': 'deposit',
'url': 'https://forge.softwareheritage.org/source/jesuisgpl/'},
{
'id': 85072327,
'lister': None,
'project': None,
'type': 'pypi',
'url': 'https://pypi.org/project/limnoria/'},
{
'id': 49908349,
'lister': None,
'project': None,
'type': 'svn',
'url': 'http://0-512-md.googlecode.com/svn/'},
{
'id': 54974445,
'lister': None,
'project': None,
'type': 'git',
'url': 'https://github.com/librariesio/yarn-parser'},
]
SNAPSHOTS = {
52189575: {
'branches': {
b'refs/heads/add-revision-origin-cache': {
'target': b'L[\xce\x1c\x88\x8eF\t\xf1"\x19\x1e\xfb\xc0'
b's\xe7/\xe9l\x1e',
'target_type': 'revision'},
b'HEAD': {
'target': b'8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}'
b'\xac\xefrm',
'target_type': 'revision'},
b'refs/tags/v0.0.103': {
'target': b'\xb6"Im{\xfdLb\xb0\x94N\xea\x96m\x13x\x88+'
b'\x0f\xdd',
'target_type': 'release'},
}},
4423668: {
'branches': {
b'3DLDF-1.1.4.tar.gz': {
'target': b'dJ\xfb\x1c\x91\xf4\x82B%]6\xa2\x90|\xd3\xfc'
b'"G\x99\x11',
'target_type': 'revision'},
b'3DLDF-2.0.2.tar.gz': {
'target': b'\xb6\x0e\xe7\x9e9\xac\xaa\x19\x9e='
b'\xd1\xc5\x00\\\xc6\xfc\xe0\xa6\xb4V',
'target_type': 'revision'},
b'3DLDF-2.0.3-examples.tar.gz': {
'target': b'!H\x19\xc0\xee\x82-\x12F1\xbd\x97'
b'\xfe\xadZ\x80\x80\xc1\x83\xff',
'target_type': 'revision'},
b'3DLDF-2.0.3.tar.gz': {
'target': b'\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee'
b'\xcc\x1a\xb4`\x8c\x8by',
'target_type': 'revision'},
b'3DLDF-2.0.tar.gz': {
'target': b'F6*\xff(?\x19a\xef\xb6\xc2\x1fv$S\xe3G'
b'\xd3\xd1m',
b'target_type': 'revision'}
}},
77775770: {
'branches': {
b'master': {
'target': b'\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{'
b'\xa6\xe9\x99\xb1\x9e]q\xeb',
'target_type': 'revision'}
},
'id': b"h\xc0\xd2a\x04\xd4~'\x8d\xd6\xbe\x07\xeda\xfa\xfbV"
b"\x1d\r "},
85072327: {
'branches': {
b'HEAD': {
'target': b'releases/2018.09.09',
'target_type': 'alias'},
b'releases/2018.09.01': {
'target': b'<\xee1(\xe8\x8d_\xc1\xc9\xa6rT\xf1\x1d'
b'\xbb\xdfF\xfdw\xcf',
'target_type': 'revision'},
b'releases/2018.09.09': {
'target': b'\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k'
b'A\x10\x9d\xc5\xfa2\xf8t',
'target_type': 'revision'}},
'id': b'{\xda\x8e\x84\x7fX\xff\x92\x80^\x93V\x18\xa3\xfay'
b'\x12\x9e\xd6\xb3'},
49908349: {
'branches': {
b'master': {
'target': b'\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8'
b'\xc9\xad#.\x1bw=\x18',
'target_type': 'revision'}},
'id': b'\xa1\xa2\x8c\n\xb3\x87\xa8\xf9\xe0a\x8c\xb7'
b'\x05\xea\xb8\x1f\xc4H\xf4s'},
54974445: {
'branches': {
b'HEAD': {
'target': hash_to_bytes(
'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
'target_type': 'revision'}}}
}
SHA1_TO_LICENSES = {
'01c9379dfc33803963d07c1ccc748d3fe4c96bb5': ['GPL'],
'02fb2c89e14f7fab46701478c83779c7beb7b069': ['Apache2.0'],
'103bc087db1d26afc3a0283f38663d081e9b01e6': ['MIT'],
'688a5ef812c53907562fe379d4b3851e69c7cb15': ['AGPL'],
'da39a3ee5e6b4b0d3255bfef95601890afd80709': [],
}
SHA1_TO_CTAGS = {
'01c9379dfc33803963d07c1ccc748d3fe4c96bb5': [{
'name': 'foo',
'kind': 'str',
'line': 10,
'lang': 'bar',
}],
'd4c647f0fc257591cc9ba1722484229780d1c607': [{
'name': 'let',
'kind': 'int',
'line': 100,
'lang': 'haskell',
}],
'688a5ef812c53907562fe379d4b3851e69c7cb15': [{
'name': 'symbol',
'kind': 'float',
'line': 99,
'lang': 'python',
}],
}
class MockObjStorage:
"""Mock an swh-objstorage objstorage with predefined contents.
"""
data = {}
def __init__(self):
self.data = {
'01c9379dfc33803963d07c1ccc748d3fe4c96bb5': b'this is some text',
'688a5ef812c53907562fe379d4b3851e69c7cb15': b'another text',
'8986af901dd2043044ce8f0d8fc039153641cf17': b'yet another text',
'02fb2c89e14f7fab46701478c83779c7beb7b069': b"""
import unittest
import logging
from swh.indexer.mimetype import ContentMimetypeIndexer
from swh.indexer.tests.test_utils import MockObjStorage
class MockStorage():
def content_mimetype_add(self, mimetypes):
self.state = mimetypes
self.conflict_update = conflict_update
def indexer_configuration_add(self, tools):
return [{
'id': 10,
}]
""",
'103bc087db1d26afc3a0283f38663d081e9b01e6': b"""
#ifndef __AVL__
#define __AVL__
typedef struct _avl_tree avl_tree;
typedef struct _data_t {
int content;
} data_t;
""",
'93666f74f1cf635c8c8ac118879da6ec5623c410': b"""
(should 'pygments (recognize 'lisp 'easily))
""",
'26a9f72a7c87cc9205725cfd879f514ff4f3d8d5': b"""
{
"name": "test_metadata",
"version": "0.0.1",
"description": "Simple package.json test for indexer",
"repository": {
"type": "git",
"url": "https://github.com/moranegg/metadata_test"
}
}
""",
'd4c647f0fc257591cc9ba1722484229780d1c607': b"""
{
"version": "5.0.3",
"name": "npm",
"description": "a package manager for JavaScript",
"keywords": [
"install",
"modules",
"package manager",
"package.json"
],
"preferGlobal": true,
"config": {
"publishtest": false
},
"homepage": "https://docs.npmjs.com/",
"author": "Isaac Z. Schlueter (http://blog.izs.me)",
"repository": {
"type": "git",
"url": "https://github.com/npm/npm"
},
"bugs": {
"url": "https://github.com/npm/npm/issues"
},
"dependencies": {
"JSONStream": "~1.3.1",
"abbrev": "~1.1.0",
"ansi-regex": "~2.1.1",
"ansicolors": "~0.3.2",
"ansistyles": "~0.1.3"
},
"devDependencies": {
"tacks": "~1.2.6",
"tap": "~10.3.2"
},
"license": "Artistic-2.0"
}
""",
'a7ab314d8a11d2c93e3dcf528ca294e7b431c449': b"""
""",
'da39a3ee5e6b4b0d3255bfef95601890afd80709': b'',
}
def __iter__(self):
yield from self.data.keys()
def __contains__(self, sha1):
return self.data.get(sha1) is not None
def get(self, sha1):
raw_content = self.data.get(sha1)
if raw_content is None:
raise ObjNotFoundError(sha1)
return raw_content
class MockIndexerStorage():
"""Mock an swh-indexer storage.
"""
added_data = []
revision_metadata = {}
tools = {}
def indexer_configuration_add(self, tools):
results = []
for tool in tools:
results.append(self._indexer_configuration_add_one(tool))
return results
def _indexer_configuration_add_one(self, tool):
if tool['tool_name'] == 'swh-metadata-translator':
tool2 = {
'id': 30,
'tool_name': 'swh-metadata-translator',
'tool_version': '0.0.1',
'tool_configuration': {
'type': 'local',
'context': 'NpmMapping'
},
}
elif tool['tool_name'] == 'swh-metadata-detector':
tool2 = {
'id': 7,
'tool_name': 'swh-metadata-detector',
'tool_version': '0.0.1',
'tool_configuration': {
'type': 'local',
'context': 'NpmMapping'
},
}
elif tool['tool_name'] == 'origin-metadata':
tool2 = {
'id': 8,
'tool_name': 'origin-metadata',
'tool_version': '0.0.1',
'tool_configuration': {},
}
else:
assert False, 'Unknown tool {tool_name}'.format(**tool)
self.tools[tool2['id']] = tool2
return tool2
def content_metadata_missing(self, sha1s):
yield from []
def content_metadata_add(self, metadata, conflict_update=None):
self.added_data.append(
('content_metadata', conflict_update, metadata))
def revision_metadata_add(self, metadata, conflict_update=None):
assert conflict_update
self.added_data.append(
('revision_metadata', conflict_update, metadata))
for item in metadata:
assert isinstance(item['id'], bytes)
self.revision_metadata.setdefault(item['id'], []).append(item)
def revision_metadata_get(self, ids):
for id_ in ids:
assert isinstance(id_, bytes)
for item in self.revision_metadata.get(id_):
item = item.copy()
tool_id = item.pop('indexer_configuration_id')
- item['tool'] = self.tools[tool_id].copy()
+ if tool_id in self.tools:
+ item['tool'] = self.tools[tool_id].copy()
+ else: # HACK: this needs to be removed altogether
+ item['tool'] = {
+ 'id': tool_id,
+ 'name': tool_id[0],
+ 'version': tool_id[1],
+ 'configuration': tool_id[2],
+ }
yield item
def origin_intrinsic_metadata_add(self, metadata, conflict_update=None):
self.added_data.append(
('origin_intrinsic_metadata', conflict_update, metadata))
def content_metadata_get(self, sha1s):
return [{
'tool': {
'configuration': {
'type': 'local',
'context': 'NpmMapping'
},
'version': '0.0.1',
'id': 6,
'name': 'swh-metadata-translator'
},
'id': b'cde',
'translated_metadata': {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'type': 'SoftwareSourceCode',
'codemeta:issueTracker':
'https://github.com/librariesio/yarn-parser/issues',
'version': '1.0.0',
'name': 'yarn-parser',
'schema:author': 'Andrew Nesbitt',
'url':
'https://github.com/librariesio/yarn-parser#readme',
'processorRequirements': {'node': '7.5'},
'license': 'AGPL-3.0',
'keywords': ['yarn', 'parse', 'lock', 'dependencies'],
'schema:codeRepository':
'git+https://github.com/librariesio/yarn-parser.git',
'description':
'Tiny web service for parsing yarn.lock files',
}
}]
class MockStorage():
"""Mock a real swh-storage storage to simplify reading indexers'
outputs.
"""
def origin_get(self, id_):
for origin in ORIGINS:
for (k, v) in id_.items():
if origin[k] != v:
break
else:
# This block is run iff we didn't break, ie. if all supplied
# parts of the id are set to the expected value.
return origin
assert False, id_
def snapshot_get_latest(self, origin_id):
if origin_id in SNAPSHOTS:
return SNAPSHOTS[origin_id]
else:
assert False, origin_id
def revision_get(self, revisions):
return [{
'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
'committer': {
'id': 26,
'name': b'Andrew Nesbitt',
'fullname': b'Andrew Nesbitt ',
'email': b'andrewnez@gmail.com'
},
'synthetic': False,
'date': {
'negative_utc': False,
'timestamp': {
'seconds': 1487596456,
'microseconds': 0
},
'offset': 0
},
'directory': b'10'
}]
def directory_ls(self, directory, recursive=False, cur=None):
# with directory: b'\x9d',
return [{
'sha1_git': b'abc',
'name': b'index.js',
'target': b'abc',
'length': 897,
'status': 'visible',
'type': 'file',
'perms': 33188,
'dir_id': b'10',
'sha1': b'bcd'
},
{
'sha1_git': b'aab',
'name': b'package.json',
'target': b'aab',
'length': 712,
'status': 'visible',
'type': 'file',
'perms': 33188,
'dir_id': b'10',
'sha1': b'cde'
},
{
'dir_id': b'10',
'target': b'11',
'type': 'dir',
'length': None,
'name': b'.github',
'sha1': None,
'perms': 16384,
'sha1_git': None,
'status': None,
'sha256': None
}]
class BasicMockStorage():
"""In memory implementation to fake the content_get_range api.
FIXME: To remove when the actual in-memory lands.
"""
contents = []
def __init__(self, contents):
self.contents = contents
def content_get_range(self, start, end, limit=1000):
        # To make the input test data consistent with the actual runtime.
        # The proper way of doing things would be to rewrite all the tests,
        # but that's another task entirely, so not right now.
if isinstance(start, bytes):
start = hashutil.hash_to_hex(start)
if isinstance(end, bytes):
end = hashutil.hash_to_hex(end)
results = []
_next_id = None
counter = 0
for c in self.contents:
_id = c['sha1']
if start <= _id and _id <= end:
results.append(c)
if counter >= limit:
break
counter += 1
return {
'contents': results,
'next': _next_id
}
class BasicMockIndexerStorage():
    """Mock Indexer storage to simplify reading indexers' outputs.
    """
    # last batch of data handed to any *_add endpoint
    state = []

    def _internal_add(self, data, conflict_update=None):
        """All content indexer have the same structure. So reuse `data` as the
        same data.  It's either mimetype, language,
        fossology_license, etc...

        """
        self.state = data
        self.conflict_update = conflict_update

    def content_mimetype_add(self, data, conflict_update=None):
        self._internal_add(data, conflict_update=conflict_update)

    def content_fossology_license_add(self, data, conflict_update=None):
        self._internal_add(data, conflict_update=conflict_update)

    def content_language_add(self, data, conflict_update=None):
        self._internal_add(data, conflict_update=conflict_update)

    def content_ctags_add(self, data, conflict_update=None):
        self._internal_add(data, conflict_update=conflict_update)

    def _internal_get_range(self, start, end,
                            indexer_configuration_id, limit=1000):
        """Same logic as _internal_add, we retrieve indexed data given an
        identifier. So the code here does not change even though
        the underlying data does.

        Returns:
            dict: 'ids' holds at most `limit` matching ids; 'next' is
            the id to resume from when the results were truncated,
            None otherwise.

        """
        # to make input test data consilient with actual runtime the
        # other way of doing properly things would be to rewrite all
        # tests (that's another task entirely so not right now)
        if isinstance(start, bytes):
            start = hashutil.hash_to_hex(start)
        if isinstance(end, bytes):
            end = hashutil.hash_to_hex(end)
        results = []
        _next = None
        for m in self.state:
            _id = m['id']
            _tool_id = m['indexer_configuration_id']
            if start <= _id <= end and _tool_id == indexer_configuration_id:
                if len(results) >= limit:
                    # limit reached: remember where to resume, stop here.
                    # (The original appended before checking, returning up
                    # to limit+1 ids and never setting 'next'.)
                    _next = _id
                    break
                results.append(_id)
        return {
            'ids': results,
            'next': _next
        }

    def content_mimetype_get_range(
            self, start, end, indexer_configuration_id, limit=1000):
        return self._internal_get_range(
            start, end, indexer_configuration_id, limit=limit)

    def content_fossology_license_get_range(
            self, start, end, indexer_configuration_id, limit=1000):
        return self._internal_get_range(
            start, end, indexer_configuration_id, limit=limit)

    def indexer_configuration_add(self, tools):
        # every tool registers under the same fixed id in this mock
        return [{
            'id': 10,
        }]
class CommonIndexerNoTool:
    """Mixin to wrongly initialize the content indexer: clears the tool
    configuration after the regular preparation, so that the
    configuration checks exercised below fail with 'Tools None is
    unknown'."""
    def prepare(self):
        # Run the normal setup first, then drop the tools on purpose.
        super().prepare()
        self.tools = None
class CommonIndexerWithErrorsTest:
    """Test indexer configuration checks.

    Concrete subclasses set `Indexer` (and optionally `RangeIndexer`)
    to classes whose instantiation must fail the tool check.
    """
    Indexer = None
    RangeIndexer = None

    def test_wrong_unknown_configuration_tool(self):
        """Indexer with unknown configuration tool fails check"""
        # (removed a leftover debug print of self.Indexer here)
        with self.assertRaisesRegex(ValueError, 'Tools None is unknown'):
            self.Indexer()

    def test_wrong_unknown_configuration_tool_range(self):
        """Range Indexer with unknown configuration tool fails check"""
        if self.RangeIndexer is not None:
            with self.assertRaisesRegex(ValueError, 'Tools None is unknown'):
                self.RangeIndexer()
class CommonContentIndexerTest:
    """Shared assertions for plain (non-range) content indexers."""

    def assert_results_ok(self, actual_results, expected_results=None):
        """Every indexed row must equal the expectation for its id and
        carry the configuration id of the tool under test."""
        expected = expected_results
        if expected is None:
            expected = self.expected_results
        for row in actual_results:
            self.assertEqual(row, expected[row['id']])
            self.assertEqual(row['indexer_configuration_id'],
                             self.indexer.tool['id'])

    def test_index(self):
        """Known sha1 have their data indexed

        """
        known = [self.id0, self.id1, self.id2]
        # first run: duplicates are overwritten
        self.indexer.run(known, policy_update='update-dups')
        results = self.indexer.idx_storage.state
        self.assertTrue(self.indexer.idx_storage.conflict_update)
        self.assert_results_ok(results)
        # second run: duplicates are kept as-is
        self.indexer.run(known, policy_update='ignore-dups')
        self.assertFalse(self.indexer.idx_storage.conflict_update)
        self.assert_results_ok(results)

    def test_index_one_unknown_sha1(self):
        """Unknown sha1 are not indexed"""
        queried = [self.id1,
                   '799a5ef812c53907562fe379d4b3851e69c7cb15',  # unknown
                   '800a5ef812c53907562fe379d4b3851e69c7cb15']  # unknown
        self.indexer.run(queried, policy_update='update-dups')
        # only the ids that were both queried and expected may show up
        expected = {sha1: data
                    for sha1, data in self.expected_results.items()
                    if sha1 in queried}
        self.assert_results_ok(self.indexer.idx_storage.state, expected)
class CommonContentIndexerRangeTest:
    """Allows to factorize tests on range indexer.

    """

    def assert_results_ok(self, start, end, actual_results,
                          expected_results=None):
        """Every indexed row must match its expectation, lie inside
        [start, end] and carry the tool id of the indexer under test."""
        expected = expected_results
        if expected is None:
            expected = self.expected_results
        for row in actual_results:
            rid = row['id']
            self.assertEqual(row, expected[rid])
            self.assertTrue(start <= rid <= end)
            self.assertEqual(row['indexer_configuration_id'],
                             self.indexer.tool['id'])

    def test__index_contents(self):
        """Indexing contents without existing data results in indexed data

        """
        start, end = self.contents[0], self.contents[2]  # output hex ids
        results = list(self.indexer._index_contents(start, end, indexed={}))
        self.assert_results_ok(start, end, results)

    def test__index_contents_with_indexed_data(self):
        """Indexing contents with existing data results in less indexed data

        """
        start, end = self.contents[0], self.contents[2]  # output hex ids
        already_done = [self.id0, self.id2]
        results = self.indexer._index_contents(
            start, end, indexed=set(already_done))
        # everything except the already-indexed ids is expected back
        remaining = dict(self.expected_results)
        for done_key in already_done:
            remaining.pop(done_key)
        self.assert_results_ok(start, end, results, remaining)

    def test_generate_content_get(self):
        """Optimal indexing should result in indexed data

        """
        start, end = self.contents[0], self.contents[2]  # output hex ids
        self.assertTrue(self.indexer.run(start, end))

    def test_generate_content_get_input_as_bytes(self):
        """Optimal indexing should result in indexed data

        Input are in bytes here.

        """
        hex_start, hex_end = self.contents[0], self.contents[2]
        start, end = map(hashutil.hash_to_bytes, (hex_start, hex_end))
        # exercises the bytes input path; with no previously indexed
        # data the outcome matches the previous test
        self.assertTrue(self.indexer.run(start, end, skip_existing=False))

    def test_generate_content_get_no_result(self):
        """No result indexed returns False"""
        start = '0000000000000000000000000000000000000000'
        end = '0000000000000000000000000000000000000001'
        self.assertFalse(self.indexer.run(start, end, incremental=False))
class NoDiskIndexer:
    """Mixin to override the DiskIndexer behavior avoiding side-effects in
    tests.

    """

    def write_to_temp(self, filename, data):
        # pretend the write happened and just echo the filename back
        return filename

    def cleanup(self, content_path):
        # nothing was written, so there is nothing to remove
        return None