diff --git a/swh/indexer/journal_client.py b/swh/indexer/journal_client.py
index e0d8ff2..4db7389 100644
--- a/swh/indexer/journal_client.py
+++ b/swh/indexer/journal_client.py
@@ -1,88 +1,88 @@
# Copyright (C) 2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
from swh.journal.client import JournalClient
from swh.scheduler import get_scheduler
from swh.scheduler.utils import create_task_dict
class IndexerJournalClient(JournalClient):
    """Client in charge of listing new received origins and origin_visits
    in the swh journal.
    """
    CONFIG_BASE_FILENAME = 'indexer/journal_client'

    ADDITIONAL_CONFIG = {
        # Scheduler backend used to create the one-shot indexing tasks.
        'scheduler': ('dict', {
            'cls': 'remote',
            'args': {
                'url': 'http://localhost:5008/',
            }
        }),
        # Task templates scheduled for each completed ('full') origin visit.
        'origin_visit_tasks': ('List[dict]', [
            {
                'type': 'indexer_full_origin_metadata',
                'kwargs': {
                    'policy_update': 'update-dups',
                    'parse_ids': False,
                }
            }
        ]),
    }

    def __init__(self):
        # Subscribe only to origin_visit messages from the journal.
        super().__init__(extra_configuration={
            'object_types': ['origin_visit'],
        })
        self.scheduler = get_scheduler(**self.config['scheduler'])
        logging.info(
            'Starting indexer journal client with config %r',
            self.config)

    def process_objects(self, messages):
        """Dispatch each received origin_visit message.

        Args:
            messages (dict): mapping of object type to list of messages;
                only the 'origin_visit' key is expected (enforced below).
        """
        assert set(messages) == {'origin_visit'}, set(messages)

        for origin_visit in messages['origin_visit']:
            self.process_origin_visit(origin_visit)

    def process_origin_visit(self, origin_visit):
        """Schedule the configured origin_visit_tasks for one visit.

        Only visits whose status is 'full' trigger scheduling; any other
        status is ignored.  Message keys/values are bytes (journal
        deserialization), hence the b'...' lookups.
        """
        task_dicts = []
        logging.debug('processing origin visit %r', origin_visit)
        if origin_visit[b'status'] == b'full':
            for task_config in self.config['origin_visit_tasks']:
                logging.info(
                    'Scheduling %s for visit of origin %d',
                    task_config['type'], origin_visit[b'origin'])
                task_dicts.append(create_task_dict(
                    task_config['type'],
                    'oneshot',
                    [origin_visit[b'origin']],
                    **task_config['kwargs'],
                ))
        else:
            logging.debug('status is not "full", ignoring.')

        if task_dicts:
            self.scheduler.create_tasks(task_dicts)
if __name__ == '__main__':
    # Script entry point: configure logging, then run the journal client
    # under a bare click command.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s %(process)d %(levelname)s %(message)s'
    )
    import click

    def main():
        """Log the new received origin and origin_visits.
        """
        IndexerJournalClient().process()

    # Equivalent to decorating main with @click.command()
    main = click.command()(main)
    main()
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
index 0386550..a2a18b6 100644
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -1,325 +1,373 @@
# Copyright (C) 2017-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import click
import itertools
import logging
from copy import deepcopy
from swh.indexer.indexer import ContentIndexer, RevisionIndexer, OriginIndexer
+from swh.indexer.origin_head import OriginHeadIndexer
from swh.indexer.metadata_dictionary import MAPPINGS
from swh.indexer.metadata_detector import detect_metadata
from swh.indexer.metadata_detector import extract_minimal_metadata_dict
from swh.indexer.storage import INDEXER_CFG_KEY
from swh.model import hashutil
class ContentMetadataIndexer(ContentIndexer):
    """Content-level indexer

    This indexer is in charge of:

    - filtering out content already indexed in content_metadata
    - reading content from objstorage with the content's id sha1
    - computing translated_metadata by given context
    - using the metadata_dictionary as the 'swh-metadata-translator' tool
    - store result in content_metadata table
    """
    # Note: This is used when the content metadata indexer is used alone
    # (not the case for example in the case of the RevisionMetadataIndexer,
    # which builds the configuration itself and passes it in).
    CONFIG_BASE_FILENAME = 'indexer/content_metadata'

    def filter(self, ids):
        """Filter out known sha1s and return only missing ones.
        """
        yield from self.idx_storage.content_metadata_missing((
            {
                'id': sha1,
                'indexer_configuration_id': self.tool['id'],
            } for sha1 in ids
        ))

    def index(self, id, data, log_suffix='unknown revision'):
        """Index sha1s' content and store result.

        Args:
            id (bytes): content's identifier
            data (bytes): raw content in bytes
            log_suffix (str): context string appended to log lines to
                identify the caller (e.g. the revision being indexed)

        Returns:
            dict: dictionary representing a content_metadata. If the
            translation wasn't successful the translated_metadata keys will
            be returned as None
        """
        result = {
            'id': id,
            'indexer_configuration_id': self.tool['id'],
            'translated_metadata': None
        }
        try:
            # The mapping to use (e.g. 'NpmMapping') is selected by the
            # tool configuration's 'context' entry.
            mapping_name = self.tool['tool_configuration']['context']
            log_suffix += ', content_id=%s' % hashutil.hash_to_hex(id)
            result['translated_metadata'] = \
                MAPPINGS[mapping_name](log_suffix).translate(data)
        except Exception:
            # Translation failures are logged, not propagated; the sha1 is
            # simply reported as untranslatable (None result below).
            self.log.exception(
                "Problem during metadata translation "
                "for content %s" % hashutil.hash_to_hex(id))
        if result['translated_metadata'] is None:
            return None
        return result

    def persist_index_computations(self, results, policy_update):
        """Persist the results in storage.

        Args:
            results ([dict]): list of content_metadata, dict with the
                following keys:

                - id (bytes): content's identifier (sha1)
                - translated_metadata (jsonb): detected metadata

            policy_update ([str]): either 'update-dups' or 'ignore-dups' to
                respectively update duplicates or ignore them
        """
        self.idx_storage.content_metadata_add(
            results, conflict_update=(policy_update == 'update-dups'))
class RevisionMetadataIndexer(RevisionIndexer):
    """Revision-level indexer

    This indexer is in charge of:

    - filtering revisions already indexed in revision_metadata table with
      defined computation tool
    - retrieve all entry_files in root directory
    - use metadata_detector for file_names containing metadata
    - compute metadata translation if necessary and possible (depends on tool)
    - send sha1s to content indexing if possible
    - store the results for revision
    """
    CONFIG_BASE_FILENAME = 'indexer/revision_metadata'

    ADDITIONAL_CONFIG = {
        # Tool entry registered in the indexer storage; 'context' lists
        # every known mapping so the detector can dispatch on any of them.
        'tools': ('dict', {
            'name': 'swh-metadata-detector',
            'version': '0.0.2',
            'configuration': {
                'type': 'local',
                'context': list(MAPPINGS),
            },
        }),
    }

    def filter(self, sha1_gits):
        """Filter out known sha1s and return only missing ones.
        """
        yield from self.idx_storage.revision_metadata_missing((
            {
                'id': sha1_git,
                'indexer_configuration_id': self.tool['id'],
            } for sha1_git in sha1_gits
        ))

    def index(self, rev):
        """Index rev by processing it and organizing result.

        use metadata_detector to iterate on filenames

        - if one filename detected -> sends file to content indexer
        - if multiple file detected -> translation needed at revision level

        Args:
            rev (dict): revision artifact from storage

        Returns:
            dict: dictionary representing a revision_metadata, with keys:

            - id (str): rev's identifier (sha1_git)
            - indexer_configuration_id (bytes): tool used
            - translated_metadata: dict of retrieved metadata
        """
        result = {
            'id': rev['id'],
            'indexer_configuration_id': self.tool['id'],
            'translated_metadata': None
        }

        try:
            # Only the root directory (non-recursive) is scanned for
            # metadata files such as package.json.
            root_dir = rev['directory']
            dir_ls = self.storage.directory_ls(root_dir, recursive=False)
            files = [entry for entry in dir_ls if entry['type'] == 'file']
            detected_files = detect_metadata(files)
            result['translated_metadata'] = self.translate_revision_metadata(
                detected_files,
                log_suffix='revision=%s' % hashutil.hash_to_hex(rev['id'])
            )
        except Exception as e:
            # Never propagate: a failed revision still yields a result row
            # with translated_metadata=None.
            self.log.exception(
                'Problem when indexing rev: %r', e)
        return result

    def persist_index_computations(self, results, policy_update):
        """Persist the results in storage.

        Args:
            results ([dict]): list of revision_metadata, dict with the
                following keys:

                - id (bytes): revision's identifier (sha1_git)
                - indexer_configuration_id (bytes): tool used
                - translated_metadata (jsonb): detected metadata

            policy_update ([str]): either 'update-dups' or 'ignore-dups' to
                respectively update duplicates or ignore them
        """
        # TODO: add functions in storage to keep data in revision_metadata
        self.idx_storage.revision_metadata_add(
            results, conflict_update=(policy_update == 'update-dups'))

    def translate_revision_metadata(self, detected_files, log_suffix):
        """
        Determine plan of action to translate metadata when containing
        one or multiple detected files:

        Args:
            detected_files (dict): dictionary mapping context names (e.g.,
                "npm", "authors") to list of sha1
            log_suffix (str): context string forwarded to the content
                indexer for log attribution

        Returns:
            dict: dict with translated metadata according to the CodeMeta
            vocabulary
        """
        translated_metadata = []
        tool = {
            'name': 'swh-metadata-translator',
            'version': '0.0.2',
            'configuration': {
                'type': 'local',
                'context': None
            },
        }
        # TODO: iterate on each context, on each file
        # -> get raw_contents
        # -> translate each content
        config = {
            k: self.config[k]
            for k in [INDEXER_CFG_KEY, 'objstorage', 'storage']
        }
        config['tools'] = [tool]
        for context in detected_files.keys():
            # Each context gets its own copy of the config so the
            # translator tool's 'context' can be set per mapping.
            cfg = deepcopy(config)
            cfg['tools'][0]['configuration']['context'] = context
            c_metadata_indexer = ContentMetadataIndexer(config=cfg)
            # sha1s that are in content_metadata table
            sha1s_in_storage = []
            metadata_generator = self.idx_storage.content_metadata_get(
                detected_files[context])
            for c in metadata_generator:
                # extracting translated_metadata
                sha1 = c['id']
                sha1s_in_storage.append(sha1)
                local_metadata = c['translated_metadata']
                # local metadata is aggregated
                if local_metadata:
                    translated_metadata.append(local_metadata)

            # sha1s not yet in content_metadata: index them now
            sha1s_filtered = [item for item in detected_files[context]
                              if item not in sha1s_in_storage]

            if sha1s_filtered:
                # content indexing
                try:
                    c_metadata_indexer.run(sha1s_filtered,
                                           policy_update='ignore-dups',
                                           log_suffix=log_suffix)
                    # on the fly possibility:
                    for result in c_metadata_indexer.results:
                        local_metadata = result['translated_metadata']
                        translated_metadata.append(local_metadata)

                except Exception:
                    self.log.exception(
                        "Exception while indexing metadata on contents")

        # transform translated_metadata into min set with swh-metadata-detector
        min_metadata = extract_minimal_metadata_dict(translated_metadata)
        return min_metadata
class OriginMetadataIndexer(OriginIndexer):
    # Origin-level indexer: copies the metadata of each origin's head
    # revision (previously computed by RevisionMetadataIndexer) into the
    # origin_intrinsic_metadata table.
    CONFIG_BASE_FILENAME = 'indexer/origin_intrinsic_metadata'

    ADDITIONAL_CONFIG = {
        'tools': ('list', [])
    }

    USE_TOOLS = False

    def run(self, origin_head, policy_update):
        """Expected to be called with the result of RevisionMetadataIndexer
        as first argument; ie. not a list of ids as other indexers would.

        Args:
            origin_head (dict): mapping of origin id (as str) to head
                revision id (hex), as produced by OriginHeadIndexer
            policy_update (str): `'ignore-dups'` or `'update-dups'`
        """
        origin_head_map = {origin_id: hashutil.hash_to_bytes(rev_id)
                           for (origin_id, rev_id) in origin_head.items()}

        # Fix up the argument order. revisions_metadata has to be the
        # first argument because of celery.chain; the next line calls
        # run() with the usual order, ie. origin ids first.
        return super().run(ids=list(origin_head_map),
                           policy_update=policy_update,
                           parse_ids=False,
                           origin_head_map=origin_head_map)

    def index(self, origin, *, origin_head_map):
        """Build origin_intrinsic_metadata row(s) for one origin from the
        stored metadata of its head revision."""
        # Get the last revision of the origin.
        revision_id = origin_head_map[str(origin['id'])]

        revision_metadata = self.idx_storage \
            .revision_metadata_get([revision_id])

        results = []
        for item in revision_metadata:
            assert item['id'] == revision_id
            # Get the metadata of that revision, and return it
            results.append({
                'origin_id': origin['id'],
                'metadata': item['translated_metadata'],
                'from_revision': revision_id,
                'indexer_configuration_id':
                    item['tool']['id'],
            })
        return results

    def persist_index_computations(self, results, policy_update):
        """Store per-origin metadata; `results` is a list of lists (one per
        origin), hence the itertools.chain flattening."""
        self.idx_storage.origin_intrinsic_metadata_add(
            list(itertools.chain(*results)),
            conflict_update=(policy_update == 'update-dups'))
class FullOriginMetadataIndexer(OriginIndexer):
    """Runs the full origin metadata pipeline on each origin: find its
    head revision, index that revision's intrinsic metadata, and record
    both the revision-level and origin-level results.
    """
    CONFIG_BASE_FILENAME = 'indexer/full_origin_intrinsic_metadata'

    ADDITIONAL_CONFIG = {
        'tools': ('list', [])
    }

    USE_TOOLS = False

    def __init__(self):
        super().__init__()
        # Sub-indexers reused for every origin; each loads its own config.
        self.origin_head_indexer = OriginHeadIndexer()
        self.revision_metadata_indexer = RevisionMetadataIndexer()

    def index(self, origin):
        """Index a single origin.

        Args:
            origin (dict): origin artifact from storage (needs 'id')

        Returns:
            tuple: (origin_intrinsic_metadata dict, revision_metadata
            dict), or None when the origin has no head revision or the
            head revision is missing from storage.
        """
        head_result = self.origin_head_indexer.index(origin)
        if not head_result:
            return
        rev_id = head_result['revision_id']

        rev = list(self.storage.revision_get([rev_id]))
        if not rev:
            # Bug fix: this was `self.warning(...)` (no such method on the
            # indexer) and packed both values into a single tuple argument;
            # use the logger with lazy %-style arguments instead.
            self.log.warning('Missing head revision %s of origin %r',
                             hashutil.hash_to_bytes(rev_id), origin)
            return
        assert len(rev) == 1
        rev = rev[0]
        rev_metadata = self.revision_metadata_indexer.index(rev)
        orig_metadata = {
            'from_revision': rev_metadata['id'],
            'origin_id': origin['id'],
            'metadata': rev_metadata['translated_metadata'],
            'indexer_configuration_id':
                rev_metadata['indexer_configuration_id'],
        }
        return (orig_metadata, rev_metadata)

    def persist_index_computations(self, results, policy_update):
        """Store revision-level metadata first, then the origin-level rows
        that reference it.

        Args:
            results: list of (orig_item, rev_item) pairs as returned by
                :meth:`index`
            policy_update (str): `'ignore-dups'` or `'update-dups'`
        """
        conflict_update = (policy_update == 'update-dups')
        self.idx_storage.revision_metadata_add(
            [rev_item for (orig_item, rev_item) in results],
            conflict_update=conflict_update)

        self.idx_storage.origin_intrinsic_metadata_add(
            [orig_item for (orig_item, rev_item) in results],
            conflict_update=conflict_update)
+
@click.command()
@click.option('--revs', '-i',
              help='Default sha1_git to lookup', multiple=True)
def main(revs):
    # Convert hex revision ids to bytes and run the revision metadata
    # indexer on them.  (No docstring on purpose: click would expose it
    # as the command's help text.)
    rev_ids = [hashutil.hash_to_bytes(rev) for rev in revs]
    indexer = RevisionMetadataIndexer()
    indexer.run(rev_ids, 'update-dups')


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    main()
diff --git a/swh/indexer/tasks.py b/swh/indexer/tasks.py
index f11ccc4..ecc6daa 100644
--- a/swh/indexer/tasks.py
+++ b/swh/indexer/tasks.py
@@ -1,77 +1,85 @@
# Copyright (C) 2016-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from celery import current_app as app
from .mimetype import MimetypeIndexer, MimetypeRangeIndexer
from .language import LanguageIndexer
from .ctags import CtagsIndexer
from .fossology_license import (
FossologyLicenseIndexer, FossologyLicenseRangeIndexer
)
from .rehash import RecomputeChecksums
-from .metadata import RevisionMetadataIndexer, OriginMetadataIndexer
+from .metadata import (
+ RevisionMetadataIndexer, OriginMetadataIndexer, FullOriginMetadataIndexer,
+)
from .origin_head import OriginHeadIndexer
@app.task(name=__name__ + '.RevisionMetadata')
def revision_metadata(*args, **kwargs):
    """Celery task: run the revision metadata indexer."""
    outcome = RevisionMetadataIndexer().run(*args, **kwargs)
    # Scheduler-style return objects expose `.results`; otherwise hand
    # back the raw value unchanged.
    return getattr(outcome, 'results', outcome)
@app.task(name=__name__ + '.OriginMetadata')
def origin_metadata(*args, **kwargs):
    """Celery task: run the origin metadata indexer."""
    run_result = OriginMetadataIndexer().run(*args, **kwargs)
    return getattr(run_result, 'results', run_result)
@app.task(name=__name__ + '.FullOriginMetadata')
def full_origin_metadata(*args, **kwargs):
    """Celery task: run the full origin metadata indexer."""
    res = FullOriginMetadataIndexer().run(*args, **kwargs)
    return getattr(res, 'results', res)
@app.task(name=__name__ + '.OriginHead')
def origin_head(*args, **kwargs):
    """Celery task: run the origin head indexer."""
    outcome = OriginHeadIndexer().run(*args, **kwargs)
    return getattr(outcome, 'results', outcome)
@app.task(name=__name__ + '.ContentLanguage')
def content_language(*args, **kwargs):
    """Celery task: run the content language indexer."""
    run_result = LanguageIndexer().run(*args, **kwargs)
    return getattr(run_result, 'results', run_result)
@app.task(name=__name__ + '.Ctags')
def ctags(*args, **kwargs):
    """Celery task: run the ctags indexer."""
    outcome = CtagsIndexer().run(*args, **kwargs)
    return getattr(outcome, 'results', outcome)
@app.task(name=__name__ + '.ContentFossologyLicense')
def fossology_license(*args, **kwargs):
    """Celery task: run the fossology license indexer."""
    run_result = FossologyLicenseIndexer().run(*args, **kwargs)
    return getattr(run_result, 'results', run_result)
@app.task(name=__name__ + '.RecomputeChecksums')
def recompute_checksums(*args, **kwargs):
    """Celery task: recompute content checksums."""
    outcome = RecomputeChecksums().run(*args, **kwargs)
    return getattr(outcome, 'results', outcome)
@app.task(name=__name__ + '.ContentMimetype')
def mimetype(*args, **kwargs):
    """Celery task: run the content mimetype indexer."""
    indexed = MimetypeIndexer().run(*args, **kwargs)
    status = 'eventful' if indexed else 'uneventful'
    return {'status': status}
@app.task(name=__name__ + '.ContentRangeMimetype')
def range_mimetype(*args, **kwargs):
    """Celery task: run the content mimetype range indexer.

    Bug fix: the indexer was instantiated with the task arguments
    (`MimetypeRangeIndexer(*args, **kwargs)`) but `.run()` was never
    called, so the task did no work; call `.run()` like every other
    indexer task in this module.
    """
    results = MimetypeRangeIndexer().run(*args, **kwargs)
    return {'status': 'eventful' if results else 'uneventful'}
@app.task(name=__name__ + '.ContentRangeFossologyLicense')
def range_license(*args, **kwargs):
    """Celery task: run the fossology license range indexer.

    Bug fix: same defect as `range_mimetype` — the indexer was
    constructed with the run arguments but `.run()` was never invoked.
    """
    results = FossologyLicenseRangeIndexer().run(*args, **kwargs)
    return {'status': 'eventful' if results else 'uneventful'}
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
index 2e447ca..8386b25 100644
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -1,936 +1,936 @@
# Copyright (C) 2017-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import unittest
from swh.model.hashutil import hash_to_bytes
from swh.indexer.metadata_dictionary import (
CROSSWALK_TABLE, MAPPINGS, merge_values)
from swh.indexer.metadata_detector import (
detect_metadata, extract_minimal_metadata_dict
)
from swh.indexer.metadata import (
ContentMetadataIndexer, RevisionMetadataIndexer
)
from .utils import (
BASE_TEST_CONFIG, fill_obj_storage, fill_storage
)
# Tool description registering the 'swh-metadata-translator' for these
# tests; 'context' selects the npm (package.json) mapping.
TRANSLATOR_TOOL = {
    'name': 'swh-metadata-translator',
    'version': '0.0.2',
    'configuration': {
        'type': 'local',
        'context': 'NpmMapping'
    }
}
class ContentMetadataTestIndexer(ContentMetadataIndexer):
    """Specific Metadata whose configuration is enough to satisfy the
    indexing tests.
    """
    def parse_config_file(self, *args, **kwargs):
        # Raise explicitly instead of `assert False`: asserts are stripped
        # under `python -O`, which would silently let config parsing run.
        # Same exception type and message as before.
        raise AssertionError(
            'should not be called; the rev indexer configures it.')
# Config handed to RevisionMetadataIndexer in tests: the shared base test
# config plus the translator tool above.
REVISION_METADATA_CONFIG = {
    **BASE_TEST_CONFIG,
    'tools': TRANSLATOR_TOOL,
}
class Metadata(unittest.TestCase):
"""
Tests metadata_mock_tool tool for Metadata detection
"""
def setUp(self):
    """
    shows the entire diff in the results
    """
    self.maxDiff = None
    # One instance of each metadata mapping under test.
    self.npm_mapping = MAPPINGS['NpmMapping']()
    self.codemeta_mapping = MAPPINGS['CodemetaMapping']()
    self.maven_mapping = MAPPINGS['MavenMapping']()
    self.pkginfo_mapping = MAPPINGS['PythonPkginfoMapping']()
    self.gemspec_mapping = MAPPINGS['GemspecMapping']()
def test_crosstable(self):
    """CROSSWALK_TABLE maps NodeJS package.json terms to schema.org /
    CodeMeta property IRIs."""
    self.assertEqual(CROSSWALK_TABLE['NodeJS'], {
        'repository': 'http://schema.org/codeRepository',
        'os': 'http://schema.org/operatingSystem',
        'cpu': 'http://schema.org/processorRequirements',
        'engines':
            'http://schema.org/processorRequirements',
        'author': 'http://schema.org/author',
        'author.email': 'http://schema.org/email',
        'author.name': 'http://schema.org/name',
        'contributor': 'http://schema.org/contributor',
        'keywords': 'http://schema.org/keywords',
        'license': 'http://schema.org/license',
        'version': 'http://schema.org/version',
        'description': 'http://schema.org/description',
        'name': 'http://schema.org/name',
        'bugs': 'https://codemeta.github.io/terms/issueTracker',
        'homepage': 'http://schema.org/url'
    })
def test_merge_values(self):
    """merge_values combines scalars and lists, merges '@list' wrappers
    only with other '@list' wrappers, and treats None as absent."""
    self.assertEqual(
        merge_values('a', 'b'),
        ['a', 'b'])
    self.assertEqual(
        merge_values(['a', 'b'], 'c'),
        ['a', 'b', 'c'])
    self.assertEqual(
        merge_values('a', ['b', 'c']),
        ['a', 'b', 'c'])

    self.assertEqual(
        merge_values({'@list': ['a']}, {'@list': ['b']}),
        {'@list': ['a', 'b']})
    self.assertEqual(
        merge_values({'@list': ['a', 'b']}, {'@list': ['c']}),
        {'@list': ['a', 'b', 'c']})

    # Mixing '@list' values with plain values is an error.
    with self.assertRaises(ValueError):
        merge_values({'@list': ['a']}, 'b')
    with self.assertRaises(ValueError):
        merge_values('a', {'@list': ['b']})
    with self.assertRaises(ValueError):
        merge_values({'@list': ['a']}, ['b'])
    with self.assertRaises(ValueError):
        merge_values(['a'], {'@list': ['b']})

    # None on either side yields the other value unchanged.
    self.assertEqual(
        merge_values('a', None),
        'a')
    self.assertEqual(
        merge_values(['a', 'b'], None),
        ['a', 'b'])
    self.assertEqual(
        merge_values(None, ['b', 'c']),
        ['b', 'c'])
    self.assertEqual(
        merge_values({'@list': ['a']}, None),
        {'@list': ['a']})
    self.assertEqual(
        merge_values(None, {'@list': ['a']}),
        {'@list': ['a']})
def test_compute_metadata_none(self):
    """
    testing content empty content is empty
    should return None
    """
    # given
    content = b""

    # None if no metadata was found or an error occurred
    declared_metadata = None
    # when
    result = self.npm_mapping.translate(content)
    # then
    self.assertEqual(declared_metadata, result)
def test_compute_metadata_npm(self):
    """
    testing only computation of metadata with hard_mapping_npm
    """
    # given
    # NOTE(review): leading whitespace inside this literal may have been
    # lost in extraction — whitespace is irrelevant to the JSON parse.
    content = b"""
{
"name": "test_metadata",
"version": "0.0.2",
"description": "Simple package.json test for indexer",
"repository": {
"type": "git",
"url": "https://github.com/moranegg/metadata_test"
},
"author": {
"email": "moranegg@example.com",
"name": "Morane G"
}
}
"""
    declared_metadata = {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        'type': 'SoftwareSourceCode',
        'name': 'test_metadata',
        'version': '0.0.2',
        'description': 'Simple package.json test for indexer',
        'codeRepository':
            'git+https://github.com/moranegg/metadata_test',
        'author': [{
            'type': 'Person',
            'name': 'Morane G',
            'email': 'moranegg@example.com',
        }],
    }

    # when
    result = self.npm_mapping.translate(content)
    # then
    self.assertEqual(declared_metadata, result)
def test_extract_minimal_metadata_dict(self):
    """
    Test the creation of a coherent minimal metadata set
    """
    # given
    metadata_list = [{
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        'name': 'test_1',
        'version': '0.0.2',
        'description': 'Simple package.json test for indexer',
        'codeRepository':
            'git+https://github.com/moranegg/metadata_test',
    }, {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        'name': 'test_0_1',
        'version': '0.0.2',
        'description': 'Simple package.json test for indexer',
        'codeRepository':
            'git+https://github.com/moranegg/metadata_test'
    }, {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        'name': 'test_metadata',
        'version': '0.0.2',
        'author': 'moranegg',
    }]

    # when
    results = extract_minimal_metadata_dict(metadata_list)

    # then
    # Identical values are deduplicated; differing values are merged
    # into lists.
    expected_results = {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        "version": '0.0.2',
        "description": 'Simple package.json test for indexer',
        "name": ['test_1', 'test_0_1', 'test_metadata'],
        "author": ['moranegg'],
        "codeRepository":
            'git+https://github.com/moranegg/metadata_test',
    }
    self.assertEqual(expected_results, results)
def test_index_content_metadata_npm(self):
    """
    testing NPM with package.json
    - one sha1 uses a file that can't be translated to metadata and
      should return None in the translated metadata
    """
    # given
    sha1s = [
        hash_to_bytes('26a9f72a7c87cc9205725cfd879f514ff4f3d8d5'),
        hash_to_bytes('d4c647f0fc257591cc9ba1722484229780d1c607'),
        hash_to_bytes('02fb2c89e14f7fab46701478c83779c7beb7b069'),
    ]
    # this metadata indexer computes only metadata for package.json
    # in npm context with a hard mapping
    config = BASE_TEST_CONFIG.copy()
    config['tools'] = [TRANSLATOR_TOOL]
    metadata_indexer = ContentMetadataTestIndexer(config=config)
    fill_obj_storage(metadata_indexer.objstorage)
    fill_storage(metadata_indexer.storage)

    # when
    metadata_indexer.run(sha1s, policy_update='ignore-dups')
    results = list(metadata_indexer.idx_storage.content_metadata_get(
        sha1s))

    # Only two rows expected: the third sha1's content is not
    # translatable, so no row is stored for it.
    expected_results = [{
        'translated_metadata': {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
            'codeRepository':
                'git+https://github.com/moranegg/metadata_test',
            'description': 'Simple package.json test for indexer',
            'name': 'test_metadata',
            'version': '0.0.1'
        },
        'id': hash_to_bytes('26a9f72a7c87cc9205725cfd879f514ff4f3d8d5')
    }, {
        'translated_metadata': {
            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
            'type': 'SoftwareSourceCode',
            'issueTracker':
                'https://github.com/npm/npm/issues',
            'author': [{
                'type': 'Person',
                'name': 'Isaac Z. Schlueter',
                'email': 'i@izs.me',
                'url': 'http://blog.izs.me',
            }],
            'codeRepository':
                'git+https://github.com/npm/npm',
            'description': 'a package manager for JavaScript',
            'license': 'https://spdx.org/licenses/Artistic-2.0',
            'version': '5.0.3',
            'name': 'npm',
            'keywords': [
                'install',
                'modules',
                'package manager',
                'package.json'
            ],
            'url': 'https://docs.npmjs.com/'
        },
        'id': hash_to_bytes('d4c647f0fc257591cc9ba1722484229780d1c607')
    }]

    for result in results:
        del result['tool']

    # The assertion below returns False sometimes because of nested lists
    self.assertEqual(expected_results, results)
def test_npm_bugs_normalization(self):
    """npm 'bugs' field: dict with url, dict without url, plain string."""
    # valid dictionary
    package_json = b"""{
"name": "foo",
"bugs": {
"url": "https://github.com/owner/project/issues",
"email": "foo@example.com"
}
}"""
    result = self.npm_mapping.translate(package_json)
    self.assertEqual(result, {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        'name': 'foo',
        'issueTracker': 'https://github.com/owner/project/issues',
        'type': 'SoftwareSourceCode',
    })

    # "invalid" dictionary
    package_json = b"""{
"name": "foo",
"bugs": {
"email": "foo@example.com"
}
}"""
    result = self.npm_mapping.translate(package_json)
    self.assertEqual(result, {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        'name': 'foo',
        'type': 'SoftwareSourceCode',
    })

    # string
    package_json = b"""{
"name": "foo",
"bugs": "https://github.com/owner/project/issues"
}"""
    result = self.npm_mapping.translate(package_json)
    self.assertEqual(result, {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        'name': 'foo',
        'issueTracker': 'https://github.com/owner/project/issues',
        'type': 'SoftwareSourceCode',
    })
def test_npm_repository_normalization(self):
    """npm 'repository' field: full dict, dict without url, and the
    'github:', bare 'owner/repo' and 'gitlab:' shortcut forms."""
    # normal
    package_json = b"""{
"name": "foo",
"repository": {
"type" : "git",
"url" : "https://github.com/npm/cli.git"
}
}"""
    result = self.npm_mapping.translate(package_json)
    self.assertEqual(result, {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        'name': 'foo',
        'codeRepository': 'git+https://github.com/npm/cli.git',
        'type': 'SoftwareSourceCode',
    })

    # missing url
    package_json = b"""{
"name": "foo",
"repository": {
"type" : "git"
}
}"""
    result = self.npm_mapping.translate(package_json)
    self.assertEqual(result, {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        'name': 'foo',
        'type': 'SoftwareSourceCode',
    })

    # github shortcut
    package_json = b"""{
"name": "foo",
"repository": "github:npm/cli"
}"""
    result = self.npm_mapping.translate(package_json)
    expected_result = {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        'name': 'foo',
        'codeRepository': 'git+https://github.com/npm/cli.git',
        'type': 'SoftwareSourceCode',
    }
    self.assertEqual(result, expected_result)

    # github shortshortcut
    package_json = b"""{
"name": "foo",
"repository": "npm/cli"
}"""
    result = self.npm_mapping.translate(package_json)
    self.assertEqual(result, expected_result)

    # gitlab shortcut
    package_json = b"""{
"name": "foo",
"repository": "gitlab:user/repo"
}"""
    result = self.npm_mapping.translate(package_json)
    self.assertEqual(result, {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        'name': 'foo',
        'codeRepository': 'git+https://gitlab.com/user/repo.git',
        'type': 'SoftwareSourceCode',
    })
def test_detect_metadata_package_json(self):
    """Only package.json (not index.js) is detected, keyed by the
    NpmMapping name and listing the file's sha1."""
    # given
    df = [{
        'sha1_git': b'abc',
        'name': b'index.js',
        'target': b'abc',
        'length': 897,
        'status': 'visible',
        'type': 'file',
        'perms': 33188,
        'dir_id': b'dir_a',
        'sha1': b'bcd'
    },
        {
        'sha1_git': b'aab',
        'name': b'package.json',
        'target': b'aab',
        'length': 712,
        'status': 'visible',
        'type': 'file',
        'perms': 33188,
        'dir_id': b'dir_a',
        'sha1': b'cde'
    }]
    # when
    results = detect_metadata(df)

    expected_results = {
        'NpmMapping': [
            b'cde'
        ]
    }
    # then
    self.assertEqual(expected_results, results)
def test_compute_metadata_valid_codemeta(self):
    """A well-formed codemeta.json is translated nearly verbatim:
    '@type'/'@id' keys are compacted to 'type'/'id'."""
    raw_content = (
        b"""{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"@type": "SoftwareSourceCode",
"identifier": "CodeMeta",
"description": "CodeMeta is a concept vocabulary that can be used to standardize the exchange of software metadata across repositories and organizations.",
"name": "CodeMeta: Minimal metadata schemas for science software and code, in JSON-LD",
"codeRepository": "https://github.com/codemeta/codemeta",
"issueTracker": "https://github.com/codemeta/codemeta/issues",
"license": "https://spdx.org/licenses/Apache-2.0",
"version": "2.0",
"author": [
{
"@type": "Person",
"givenName": "Carl",
"familyName": "Boettiger",
"email": "cboettig@gmail.com",
"@id": "http://orcid.org/0000-0002-1642-628X"
},
{
"@type": "Person",
"givenName": "Matthew B.",
"familyName": "Jones",
"email": "jones@nceas.ucsb.edu",
"@id": "http://orcid.org/0000-0003-0077-4738"
}
],
"maintainer": {
"@type": "Person",
"givenName": "Carl",
"familyName": "Boettiger",
"email": "cboettig@gmail.com",
"@id": "http://orcid.org/0000-0002-1642-628X"
},
"contIntegration": "https://travis-ci.org/codemeta/codemeta",
"developmentStatus": "active",
"downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip",
"funder": {
"@id": "https://doi.org/10.13039/100000001",
"@type": "Organization",
"name": "National Science Foundation"
},
"funding":"1549758; Codemeta: A Rosetta Stone for Metadata in Scientific Software",
"keywords": [
"metadata",
"software"
],
"version":"2.0",
"dateCreated":"2017-06-05",
"datePublished":"2017-06-05",
"programmingLanguage": "JSON-LD"
}""")  # noqa
    expected_result = {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
        "identifier": "CodeMeta",
        "description":
            "CodeMeta is a concept vocabulary that can "
            "be used to standardize the exchange of software metadata "
            "across repositories and organizations.",
        "name":
            "CodeMeta: Minimal metadata schemas for science "
            "software and code, in JSON-LD",
        "codeRepository": "https://github.com/codemeta/codemeta",
        "issueTracker": "https://github.com/codemeta/codemeta/issues",
        "license": "https://spdx.org/licenses/Apache-2.0",
        "version": "2.0",
        "author": [
            {
                "type": "Person",
                "givenName": "Carl",
                "familyName": "Boettiger",
                "email": "cboettig@gmail.com",
                "id": "http://orcid.org/0000-0002-1642-628X"
            },
            {
                "type": "Person",
                "givenName": "Matthew B.",
                "familyName": "Jones",
                "email": "jones@nceas.ucsb.edu",
                "id": "http://orcid.org/0000-0003-0077-4738"
            }
        ],
        "maintainer": {
            "type": "Person",
            "givenName": "Carl",
            "familyName": "Boettiger",
            "email": "cboettig@gmail.com",
            "id": "http://orcid.org/0000-0002-1642-628X"
        },
        "contIntegration": "https://travis-ci.org/codemeta/codemeta",
        "developmentStatus": "active",
        "downloadUrl":
            "https://github.com/codemeta/codemeta/archive/2.0.zip",
        "funder": {
            "id": "https://doi.org/10.13039/100000001",
            "type": "Organization",
            "name": "National Science Foundation"
        },
        "funding": "1549758; Codemeta: A Rosetta Stone for Metadata "
                   "in Scientific Software",
        "keywords": [
            "metadata",
            "software"
        ],
        "version": "2.0",
        "dateCreated": "2017-06-05",
        "datePublished": "2017-06-05",
        "programmingLanguage": "JSON-LD"
    }
    result = self.codemeta_mapping.translate(raw_content)
    self.assertEqual(result, expected_result)
def test_compute_metadata_maven(self):
    """Translate a pom.xml with one repository and one license."""
    # NOTE(review): the XML markup of this pom literal appears to have
    # been stripped during extraction — restore from VCS before editing.
    raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
central
Maven Repository Switchboard
default
http://repo1.maven.org/maven2
false
Apache License, Version 2.0
https://www.apache.org/licenses/LICENSE-2.0.txt
repo
A business-friendly OSS license
"""
    result = self.maven_mapping.translate(raw_content)
    self.assertEqual(result, {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        'type': 'SoftwareSourceCode',
        'name': 'Maven Default Project',
        'identifier': 'com.mycompany.app',
        'version': '1.2.3',
        'license': 'https://www.apache.org/licenses/LICENSE-2.0.txt',
        'codeRepository':
            'http://repo1.maven.org/maven2/com/mycompany/app/my-app',
    })
def test_compute_metadata_maven_empty(self):
    """An empty pom yields only the CodeMeta boilerplate keys."""
    # NOTE(review): this literal appears stripped of its XML markup
    # during extraction — verify against VCS.
    raw_content = b"""
"""
    result = self.maven_mapping.translate(raw_content)
    self.assertEqual(result, {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        'type': 'SoftwareSourceCode',
    })
def test_compute_metadata_maven_almost_empty(self):
    """A pom with no usable fields yields only boilerplate keys."""
    # NOTE(review): this literal appears stripped of its XML markup
    # during extraction — verify against VCS.
    raw_content = b"""
"""
    result = self.maven_mapping.translate(raw_content)
    self.assertEqual(result, {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        'type': 'SoftwareSourceCode',
    })
def test_compute_metadata_maven_invalid_xml(self):
    """Malformed XML logs a warning (tagged with the log_suffix 'foo')
    and returns None instead of raising."""
    expected_warning = (
        'WARNING:swh.indexer.metadata_dictionary.MavenMapping:'
        'Error parsing XML from foo')

    # NOTE(review): both literals appear stripped of their (invalid)
    # XML markup during extraction — verify against VCS.
    raw_content = b"""
"""
    with self.assertLogs('swh.indexer.metadata_dictionary',
                         level='WARNING') as cm:
        result = MAPPINGS["MavenMapping"]('foo').translate(raw_content)
    self.assertEqual(cm.output, [expected_warning])
    self.assertEqual(result, None)

    raw_content = b"""
"""
    with self.assertLogs('swh.indexer.metadata_dictionary',
                         level='WARNING') as cm:
        result = MAPPINGS["MavenMapping"]('foo').translate(raw_content)
    self.assertEqual(cm.output, [expected_warning])
    self.assertEqual(result, None)
def test_compute_metadata_maven_minimal(self):
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
"""
result = self.maven_mapping.translate(raw_content)
self.assertEqual(result, {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'type': 'SoftwareSourceCode',
'name': 'Maven Default Project',
'identifier': 'com.mycompany.app',
'version': '1.2.3',
'codeRepository':
'https://repo.maven.apache.org/maven2/com/mycompany/app/my-app',
})
def test_compute_metadata_maven_multiple(self):
'''Tests when there are multiple code repos and licenses.'''
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
central
Maven Repository Switchboard
default
http://repo1.maven.org/maven2
false
example
Example Maven Repo
default
http://example.org/maven2
Apache License, Version 2.0
https://www.apache.org/licenses/LICENSE-2.0.txt
repo
A business-friendly OSS license
MIT license
https://opensource.org/licenses/MIT
"""
result = self.maven_mapping.translate(raw_content)
self.assertEqual(result, {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'type': 'SoftwareSourceCode',
'name': 'Maven Default Project',
'identifier': 'com.mycompany.app',
'version': '1.2.3',
'license': [
'https://www.apache.org/licenses/LICENSE-2.0.txt',
'https://opensource.org/licenses/MIT',
],
'codeRepository': [
'http://repo1.maven.org/maven2/com/mycompany/app/my-app',
'http://example.org/maven2/com/mycompany/app/my-app',
]
})
def test_compute_metadata_pkginfo(self):
raw_content = (b"""\
Metadata-Version: 2.1
Name: swh.core
Version: 0.0.49
Summary: Software Heritage core utilities
Home-page: https://forge.softwareheritage.org/diffusion/DCORE/
Author: Software Heritage developers
Author-email: swh-devel@inria.fr
License: UNKNOWN
Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
Project-URL: Funding, https://www.softwareheritage.org/donate
Project-URL: Source, https://forge.softwareheritage.org/source/swh-core
Description: swh-core
========
\x20
core library for swh's modules:
- config parser
- hash computations
- serialization
- logging mechanism
\x20
Platform: UNKNOWN
Classifier: Programming Language :: Python :: 3
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
Classifier: Operating System :: OS Independent
Classifier: Development Status :: 5 - Production/Stable
Description-Content-Type: text/markdown
Provides-Extra: testing
""") # noqa
result = self.pkginfo_mapping.translate(raw_content)
self.assertCountEqual(result['description'], [
'Software Heritage core utilities', # note the comma here
'swh-core\n'
'========\n'
'\n'
"core library for swh's modules:\n"
'- config parser\n'
'- hash computations\n'
'- serialization\n'
'- logging mechanism\n'
''],
result)
del result['description']
self.assertEqual(result, {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'type': 'SoftwareSourceCode',
'url': 'https://forge.softwareheritage.org/diffusion/DCORE/',
'name': 'swh.core',
'author': [{
'type': 'Person',
'name': 'Software Heritage developers',
'email': 'swh-devel@inria.fr',
}],
'version': '0.0.49',
})
def test_compute_metadata_pkginfo_utf8(self):
raw_content = (b'''\
Metadata-Version: 1.1
Name: snowpyt
Description-Content-Type: UNKNOWN
Description: foo
Hydrology N\xc2\xb083
''') # noqa
- result = MAPPINGS["PythonPkginfoMapping"].translate(raw_content)
+ result = self.pkginfo_mapping.translate(raw_content)
self.assertEqual(result, {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'type': 'SoftwareSourceCode',
'name': 'snowpyt',
'description': 'foo\nHydrology N°83',
})
def test_compute_metadata_pkginfo_license(self):
raw_content = (b"""\
Metadata-Version: 2.1
Name: foo
License: MIT
""") # noqa
result = self.pkginfo_mapping.translate(raw_content)
self.assertEqual(result, {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'type': 'SoftwareSourceCode',
'name': 'foo',
'license': 'MIT',
})
def test_gemspec_base(self):
raw_content = b"""
Gem::Specification.new do |s|
s.name = 'example'
s.version = '0.1.0'
s.licenses = ['MIT']
s.summary = "This is an example!"
s.description = "Much longer explanation of the example!"
s.authors = ["Ruby Coder"]
s.email = 'rubycoder@example.com'
s.files = ["lib/example.rb"]
s.homepage = 'https://rubygems.org/gems/example'
s.metadata = { "source_code_uri" => "https://github.com/example/example" }
end"""
result = self.gemspec_mapping.translate(raw_content)
self.assertCountEqual(result.pop('description'), [
"This is an example!",
"Much longer explanation of the example!"
])
self.assertEqual(result, {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'type': 'SoftwareSourceCode',
'author': ['Ruby Coder'],
'name': 'example',
'license': 'https://spdx.org/licenses/MIT',
'codeRepository': 'https://rubygems.org/gems/example',
'email': 'rubycoder@example.com',
'version': '0.1.0',
})
def test_gemspec_two_author_fields(self):
raw_content = b"""
Gem::Specification.new do |s|
s.authors = ["Ruby Coder1"]
s.author = "Ruby Coder2"
end"""
result = self.gemspec_mapping.translate(raw_content)
self.assertCountEqual(result.pop('author'), [
'Ruby Coder1', 'Ruby Coder2'])
self.assertEqual(result, {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'type': 'SoftwareSourceCode',
})
def test_gemspec_invalid_author(self):
raw_content = b"""
Gem::Specification.new do |s|
s.author = ["Ruby Coder"]
end"""
result = self.gemspec_mapping.translate(raw_content)
self.assertEqual(result, {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'type': 'SoftwareSourceCode',
})
raw_content = b"""
Gem::Specification.new do |s|
s.author = "Ruby Coder1",
end"""
result = self.gemspec_mapping.translate(raw_content)
self.assertEqual(result, {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'type': 'SoftwareSourceCode',
})
raw_content = b"""
Gem::Specification.new do |s|
s.authors = ["Ruby Coder1", ["Ruby Coder2"]]
end"""
result = self.gemspec_mapping.translate(raw_content)
self.assertEqual(result, {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'type': 'SoftwareSourceCode',
'author': ['Ruby Coder1'],
})
def test_revision_metadata_indexer(self):
metadata_indexer = RevisionMetadataIndexer(
config=REVISION_METADATA_CONFIG)
fill_obj_storage(metadata_indexer.objstorage)
fill_storage(metadata_indexer.storage)
tool = metadata_indexer.idx_storage.indexer_configuration_get(
{'tool_'+k: v for (k, v) in TRANSLATOR_TOOL.items()})
assert tool is not None
metadata_indexer.idx_storage.content_metadata_add([{
'indexer_configuration_id': tool['id'],
'id': b'cde',
'translated_metadata': {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'type': 'SoftwareSourceCode',
'issueTracker':
'https://github.com/librariesio/yarn-parser/issues',
'version': '1.0.0',
'name': 'yarn-parser',
'author': ['Andrew Nesbitt'],
'url':
'https://github.com/librariesio/yarn-parser#readme',
'processorRequirements': {'node': '7.5'},
'license': 'AGPL-3.0',
'keywords': ['yarn', 'parse', 'lock', 'dependencies'],
'codeRepository':
'git+https://github.com/librariesio/yarn-parser.git',
'description':
'Tiny web service for parsing yarn.lock files',
}
}])
sha1_gits = [
hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
]
metadata_indexer.run(sha1_gits, 'update-dups')
results = list(metadata_indexer.idx_storage.revision_metadata_get(
sha1_gits))
expected_results = [{
'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'),
'tool': TRANSLATOR_TOOL,
'translated_metadata': {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'url':
'https://github.com/librariesio/yarn-parser#readme',
'codeRepository':
'git+https://github.com/librariesio/yarn-parser.git',
'author': ['Andrew Nesbitt'],
'license': 'AGPL-3.0',
'version': '1.0.0',
'description':
'Tiny web service for parsing yarn.lock files',
'issueTracker':
'https://github.com/librariesio/yarn-parser/issues',
'name': 'yarn-parser',
'keywords': ['yarn', 'parse', 'lock', 'dependencies'],
},
}]
for result in results:
del result['tool']['id']
# then
self.assertEqual(expected_results, results)
diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py
index 8d04327..24b611b 100644
--- a/swh/indexer/tests/test_origin_metadata.py
+++ b/swh/indexer/tests/test_origin_metadata.py
@@ -1,138 +1,217 @@
# Copyright (C) 2018-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import pytest
from celery.result import AsyncResult
from unittest import mock
+from swh.objstorage.objstorage_in_memory import InMemoryObjStorage
from swh.model.hashutil import hash_to_bytes
+from swh.scheduler.celery_backend.runner import run_ready_tasks
from swh.storage.in_memory import Storage
-
-from swh.indexer.storage.in_memory import IndexerStorage
-from swh.objstorage.objstorage_in_memory import InMemoryObjStorage
-
-from swh.scheduler.celery_backend.runner import run_ready_tasks
from swh.indexer.metadata import (
- OriginMetadataIndexer, RevisionMetadataIndexer
+ OriginMetadataIndexer, RevisionMetadataIndexer,
+ FullOriginMetadataIndexer
)
from swh.indexer.origin_head import OriginHeadIndexer
+from swh.indexer.storage.in_memory import IndexerStorage
from .utils import fill_storage, fill_obj_storage, BASE_TEST_CONFIG
from .test_metadata import REVISION_METADATA_CONFIG
ORIGIN_HEAD_CONFIG = {
**BASE_TEST_CONFIG,
'tools': {
'name': 'origin-metadata',
'version': '0.0.1',
'configuration': {},
},
'tasks': {
'revision_metadata': 'revision_metadata',
'origin_intrinsic_metadata': 'origin_intrinsic_metadata',
}
}
@pytest.mark.db
@mock.patch('swh.indexer.metadata.RevisionMetadataIndexer.parse_config_file')
@mock.patch('swh.indexer.origin_head.OriginHeadIndexer.parse_config_file')
@mock.patch('swh.indexer.storage.in_memory.IndexerStorage')
@mock.patch('swh.storage.in_memory.Storage')
def test_pipeline(storage_mock, idx_storage_mock,
origin_head_parse_config, revision_metadata_parse_config,
swh_app, celery_session_worker, indexer_scheduler):
scheduler = indexer_scheduler
# Always returns the same instance of the idx storage, because
# this function is called by each of the three indexers.
objstorage = InMemoryObjStorage()
storage = Storage()
idx_storage = IndexerStorage()
origin_head_parse_config.return_value = ORIGIN_HEAD_CONFIG
revision_metadata_parse_config.return_value = REVISION_METADATA_CONFIG
storage_mock.return_value = storage
idx_storage_mock.return_value = idx_storage
fill_obj_storage(objstorage)
fill_storage(storage)
# TODO: find a better way to share the ContentMetadataIndexer use
# the same objstorage instance.
import swh.objstorage
old_inmem_objstorage = swh.objstorage._STORAGE_CLASSES['memory']
swh.objstorage._STORAGE_CLASSES['memory'] = lambda: objstorage
try:
RevisionMetadataIndexer.scheduler = scheduler
OriginMetadataIndexer.scheduler = scheduler
indexer = OriginHeadIndexer()
indexer.scheduler = scheduler
indexer.run(["git+https://github.com/librariesio/yarn-parser"])
tasks = []
tasks.extend(run_ready_tasks(scheduler, swh_app)) # Run the first task
# Wait for the task to complete and schedule the 2nd one
task = [x for x in tasks if x['task'] == 1]
assert len(task) == 1
promise = AsyncResult(id=task[0]['backend_id'])
promise.wait()
tasks.extend(run_ready_tasks(scheduler, swh_app)) # Run the 2nd task
task = [x for x in tasks if x['task'] == 2]
assert len(task) == 1
promise = AsyncResult(id=task[0]['backend_id'])
promise.wait()
finally:
swh.objstorage._STORAGE_CLASSES['memory'] = old_inmem_objstorage
del RevisionMetadataIndexer.scheduler
del OriginMetadataIndexer.scheduler
origin = storage.origin_get({
'type': 'git',
'url': 'https://github.com/librariesio/yarn-parser'})
rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
metadata = {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'url':
'https://github.com/librariesio/yarn-parser#readme',
'codeRepository':
'git+git+https://github.com/librariesio/yarn-parser.git',
'author': [{
'type': 'Person',
'name': 'Andrew Nesbitt'
}],
'license': 'https://spdx.org/licenses/AGPL-3.0',
'version': '1.0.0',
'description':
'Tiny web service for parsing yarn.lock files',
'issueTracker':
'https://github.com/librariesio/yarn-parser/issues',
'name': 'yarn-parser',
'keywords': ['yarn', 'parse', 'lock', 'dependencies'],
}
rev_metadata = {
'id': rev_id,
'translated_metadata': metadata,
}
origin_metadata = {
'origin_id': origin['id'],
'from_revision': rev_id,
'metadata': metadata,
}
results = list(indexer.idx_storage.revision_metadata_get([rev_id]))
for result in results:
del result['tool']
assert results == [rev_metadata]
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
origin['id']]))
for result in results:
del result['tool']
assert results == [origin_metadata]
+
+
+@mock.patch('swh.indexer.metadata.RevisionMetadataIndexer.parse_config_file')
+@mock.patch('swh.indexer.origin_head.OriginHeadIndexer.parse_config_file')
+@mock.patch('swh.indexer.storage.in_memory.IndexerStorage')
+@mock.patch('swh.storage.in_memory.Storage')
+def test_full_origin_metadata_indexer(
+ storage_mock, idx_storage_mock, origin_head_parse_config,
+ revision_metadata_parse_config):
+ # Always returns the same instance of the idx storage, because
+ # this function is called by each of the three indexers.
+ objstorage = InMemoryObjStorage()
+ storage = Storage()
+ idx_storage = IndexerStorage()
+
+ origin_head_parse_config.return_value = ORIGIN_HEAD_CONFIG
+ revision_metadata_parse_config.return_value = REVISION_METADATA_CONFIG
+ storage_mock.return_value = storage
+ idx_storage_mock.return_value = idx_storage
+
+ fill_obj_storage(objstorage)
+ fill_storage(storage)
+
+ # TODO: find a better way to share the ContentMetadataIndexer use
+ # the same objstorage instance.
+ import swh.objstorage
+ old_inmem_objstorage = swh.objstorage._STORAGE_CLASSES['memory']
+ swh.objstorage._STORAGE_CLASSES['memory'] = lambda: objstorage
+ try:
+ indexer = FullOriginMetadataIndexer()
+ indexer.storage = storage
+ indexer.idx_storage = idx_storage
+ indexer.run(["git+https://github.com/librariesio/yarn-parser"])
+ finally:
+ swh.objstorage._STORAGE_CLASSES['memory'] = old_inmem_objstorage
+
+ origin = storage.origin_get({
+ 'type': 'git',
+ 'url': 'https://github.com/librariesio/yarn-parser'})
+ rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f')
+
+ metadata = {
+ '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+ 'url':
+ 'https://github.com/librariesio/yarn-parser#readme',
+ 'codeRepository':
+ 'git+git+https://github.com/librariesio/yarn-parser.git',
+ 'author': [{
+ 'type': 'Person',
+ 'name': 'Andrew Nesbitt'
+ }],
+ 'license': 'https://spdx.org/licenses/AGPL-3.0',
+ 'version': '1.0.0',
+ 'description':
+ 'Tiny web service for parsing yarn.lock files',
+ 'issueTracker':
+ 'https://github.com/librariesio/yarn-parser/issues',
+ 'name': 'yarn-parser',
+ 'keywords': ['yarn', 'parse', 'lock', 'dependencies'],
+ }
+ rev_metadata = {
+ 'id': rev_id,
+ 'translated_metadata': metadata,
+ }
+ origin_metadata = {
+ 'origin_id': origin['id'],
+ 'from_revision': rev_id,
+ 'metadata': metadata,
+ }
+
+ results = list(indexer.idx_storage.revision_metadata_get([rev_id]))
+ for result in results:
+ del result['tool']
+ assert results == [rev_metadata]
+
+ results = list(indexer.idx_storage.origin_intrinsic_metadata_get([
+ origin['id']]))
+ for result in results:
+ del result['tool']
+ assert results == [origin_metadata]