diff --git a/PKG-INFO b/PKG-INFO index 9ddab14..ff194c6 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,69 +1,69 @@ Metadata-Version: 2.1 Name: swh.indexer -Version: 0.0.126 +Version: 0.0.127 Summary: Software Heritage Content Indexer Home-page: https://forge.softwareheritage.org/diffusion/78/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN +Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest -Project-URL: Funding, https://www.softwareheritage.org/donate Description: swh-indexer ============ Tools to compute multiple indexes on SWH's raw contents: - content: - mimetype - ctags - language - fossology-license - metadata - revision: - metadata An indexer is in charge of: - looking up objects - extracting information from those objects - store those information in the swh-indexer db There are multiple indexers working on different object types: - content indexer: works with content sha1 hashes - revision indexer: works with revision sha1 hashes - origin indexer: works with origin identifiers Indexation procedure: - receive batch of ids - retrieve the associated data depending on object type - compute for that object some index - store the result to swh's storage Current content indexers: - mimetype (queue swh_indexer_content_mimetype): detect the encoding and mimetype - language (queue swh_indexer_content_language): detect the programming language - ctags (queue swh_indexer_content_ctags): compute tags information - fossology-license (queue swh_indexer_fossology_license): compute the license - metadata: translate file into translated_metadata dict Current revision indexers: - metadata: detects files containing metadata and retrieves translated_metadata in content_metadata table in storage or run content indexer to translate files. 
Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh.indexer.egg-info/PKG-INFO b/swh.indexer.egg-info/PKG-INFO index 9ddab14..ff194c6 100644 --- a/swh.indexer.egg-info/PKG-INFO +++ b/swh.indexer.egg-info/PKG-INFO @@ -1,69 +1,69 @@ Metadata-Version: 2.1 Name: swh.indexer -Version: 0.0.126 +Version: 0.0.127 Summary: Software Heritage Content Indexer Home-page: https://forge.softwareheritage.org/diffusion/78/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN +Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest -Project-URL: Funding, https://www.softwareheritage.org/donate Description: swh-indexer ============ Tools to compute multiple indexes on SWH's raw contents: - content: - mimetype - ctags - language - fossology-license - metadata - revision: - metadata An indexer is in charge of: - looking up objects - extracting information from those objects - store those information in the swh-indexer db There are multiple indexers working on different object types: - content indexer: works with content sha1 hashes - revision indexer: works with revision sha1 hashes - origin indexer: works with origin identifiers Indexation procedure: - receive batch of ids - retrieve the associated data depending on object type - compute for that object some index - store the result to swh's storage Current content indexers: - mimetype (queue swh_indexer_content_mimetype): detect the encoding and mimetype - language (queue swh_indexer_content_language): detect the programming language - 
ctags (queue swh_indexer_content_ctags): compute tags information - fossology-license (queue swh_indexer_fossology_license): compute the license - metadata: translate file into translated_metadata dict Current revision indexers: - metadata: detects files containing metadata and retrieves translated_metadata in content_metadata table in storage or run content indexer to translate files. Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py index 0b6fcb1..08dcf08 100644 --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -1,336 +1,339 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import click import itertools import logging from swh.indexer.indexer import ContentIndexer, RevisionIndexer, OriginIndexer from swh.indexer.metadata_dictionary import MAPPINGS from swh.indexer.metadata_detector import detect_metadata from swh.indexer.metadata_detector import extract_minimal_metadata_dict from swh.indexer.storage import INDEXER_CFG_KEY from swh.model import hashutil class ContentMetadataIndexer(ContentIndexer): """Content-level indexer This indexer is in charge of: - filtering out content already indexed in content_metadata - reading content from objstorage with the content's id sha1 - computing translated_metadata by given context - using the metadata_dictionary as the 'swh-metadata-translator' tool - store result in content_metadata table """ # Note: This used when the content metadata indexer is used 
alone # (not the case for example in the case of the RevisionMetadataIndexer) CONFIG_BASE_FILENAME = 'indexer/content_metadata' def __init__(self, tool, config): # FIXME: Simplify this twisted way to use the exact same # config of RevisionMetadataIndexer object that uses # internally ContentMetadataIndexer self.config = config self.config['tools'] = tool self.results = [] super().__init__() self.tool = self.tools[0] # Tool is now registered (cf. prepare call) def filter(self, ids): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.content_metadata_missing(( { 'id': sha1, 'indexer_configuration_id': self.tool['id'], } for sha1 in ids )) def index(self, id, data): """Index sha1s' content and store result. Args: id (bytes): content's identifier data (bytes): raw content in bytes Returns: dict: dictionary representing a content_metadata. If the translation wasn't successful the translated_metadata keys will be returned as None """ result = { 'id': id, 'indexer_configuration_id': self.tool['id'], 'translated_metadata': None } try: mapping_name = self.tool['tool_configuration']['context'] result['translated_metadata'] = MAPPINGS[mapping_name] \ .translate(data) except Exception: self.log.exception( - "Problem during tool retrieval of metadata translation") + "Problem during metadata translation " + "for content %s" % hashutil.hash_to_hex(id)) + if result['translated_metadata'] is None: + return None return result def persist_index_computations(self, results, policy_update): """Persist the results in storage. 
Args: results ([dict]): list of content_metadata, dict with the following keys: - id (bytes): content's identifier (sha1) - translated_metadata (jsonb): detected metadata policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ self.idx_storage.content_metadata_add( results, conflict_update=(policy_update == 'update-dups')) class RevisionMetadataIndexer(RevisionIndexer): """Revision-level indexer This indexer is in charge of: - filtering revisions already indexed in revision_metadata table with defined computation tool - retrieve all entry_files in root directory - use metadata_detector for file_names containing metadata - compute metadata translation if necessary and possible (depends on tool) - send sha1s to content indexing if possible - store the results for revision """ CONFIG_BASE_FILENAME = 'indexer/revision_metadata' ADDITIONAL_CONFIG = { 'tools': ('dict', { 'name': 'swh-metadata-detector', 'version': '0.0.2', 'configuration': { 'type': 'local', 'context': ['NpmMapping', 'CodemetaMapping'] }, }), } ContentMetadataIndexer = ContentMetadataIndexer def prepare(self): super().prepare() self.tool = self.tools[0] def filter(self, sha1_gits): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.revision_metadata_missing(( { 'id': sha1_git, 'indexer_configuration_id': self.tool['id'], } for sha1_git in sha1_gits )) def index(self, rev): """Index rev by processing it and organizing result. 
use metadata_detector to iterate on filenames - if one filename detected -> sends file to content indexer - if multiple file detected -> translation needed at revision level Args: rev (bytes): revision artifact from storage Returns: dict: dictionary representing a revision_metadata, with keys: - id (str): rev's identifier (sha1_git) - indexer_configuration_id (bytes): tool used - translated_metadata: dict of retrieved metadata """ result = { 'id': rev['id'], 'indexer_configuration_id': self.tool['id'], 'translated_metadata': None } try: root_dir = rev['directory'] dir_ls = self.storage.directory_ls(root_dir, recursive=False) files = [entry for entry in dir_ls if entry['type'] == 'file'] detected_files = detect_metadata(files) result['translated_metadata'] = self.translate_revision_metadata( detected_files) except Exception as e: self.log.exception( 'Problem when indexing rev: %r', e) return result def persist_index_computations(self, results, policy_update): """Persist the results in storage. 
Args: results ([dict]): list of content_mimetype, dict with the following keys: - id (bytes): content's identifier (sha1) - mimetype (bytes): mimetype in bytes - encoding (bytes): encoding in bytes policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ # TODO: add functions in storage to keep data in revision_metadata self.idx_storage.revision_metadata_add( results, conflict_update=(policy_update == 'update-dups')) def translate_revision_metadata(self, detected_files): """ Determine plan of action to translate metadata when containing one or multiple detected files: Args: detected_files (dict): dictionary mapping context names (e.g., "npm", "authors") to list of sha1 Returns: dict: dict with translated metadata according to the CodeMeta vocabulary """ translated_metadata = [] tool = { 'name': 'swh-metadata-translator', 'version': '0.0.2', 'configuration': { 'type': 'local', 'context': None }, } # TODO: iterate on each context, on each file # -> get raw_contents # -> translate each content config = { k: self.config[k] for k in [INDEXER_CFG_KEY, 'objstorage', 'storage'] } for context in detected_files.keys(): tool['configuration']['context'] = context c_metadata_indexer = self.ContentMetadataIndexer(tool, config) # sha1s that are in content_metadata table sha1s_in_storage = [] metadata_generator = self.idx_storage.content_metadata_get( detected_files[context]) for c in metadata_generator: # extracting translated_metadata sha1 = c['id'] sha1s_in_storage.append(sha1) local_metadata = c['translated_metadata'] # local metadata is aggregated if local_metadata: translated_metadata.append(local_metadata) sha1s_filtered = [item for item in detected_files[context] if item not in sha1s_in_storage] if sha1s_filtered: # content indexing try: c_metadata_indexer.run(sha1s_filtered, policy_update='ignore-dups') # on the fly possibility: for result in c_metadata_indexer.results: local_metadata = result['translated_metadata'] 
translated_metadata.append(local_metadata) except Exception: self.log.exception( "Exception while indexing metadata on contents") # transform translated_metadata into min set with swh-metadata-detector min_metadata = extract_minimal_metadata_dict(translated_metadata) return min_metadata class OriginMetadataIndexer(OriginIndexer): CONFIG_BASE_FILENAME = 'indexer/origin_intrinsic_metadata' ADDITIONAL_CONFIG = { 'tools': ('list', []) } def check(self, **kwargs): kwargs['check_tools'] = False super().check(**kwargs) def filter(self, ids): return ids def run(self, origin_head, policy_update): """Expected to be called with the result of RevisionMetadataIndexer as first argument; ie. not a list of ids as other indexers would. Args: origin_head (dict): {str(origin_id): rev_id} keys `origin_id` and `revision_id`, which is the result of OriginHeadIndexer. policy_update (str): `'ignore-dups'` or `'update-dups'` """ origin_head_map = {origin_id: hashutil.hash_to_bytes(rev_id) for (origin_id, rev_id) in origin_head.items()} # Fix up the argument order. revisions_metadata has to be the # first argument because of celery.chain; the next line calls # run() with the usual order, ie. origin ids first. return super().run(ids=list(origin_head_map), policy_update=policy_update, parse_ids=False, origin_head_map=origin_head_map) def index(self, origin, *, origin_head_map): # Get the last revision of the origin. 
revision_id = origin_head_map[str(origin['id'])] revision_metadata = self.idx_storage \ .revision_metadata_get([revision_id]) results = [] for item in revision_metadata: assert item['id'] == revision_id # Get the metadata of that revision, and return it results.append({ 'origin_id': origin['id'], 'metadata': item['translated_metadata'], 'from_revision': revision_id, 'indexer_configuration_id': item['tool']['id'], }) return results def persist_index_computations(self, results, policy_update): self.idx_storage.origin_intrinsic_metadata_add( list(itertools.chain(*results)), conflict_update=(policy_update == 'update-dups')) @click.command() @click.option('--revs', '-i', help='Default sha1_git to lookup', multiple=True) def main(revs): _git_sha1s = list(map(hashutil.hash_to_bytes, revs)) rev_metadata_indexer = RevisionMetadataIndexer() rev_metadata_indexer.run(_git_sha1s, 'update-dups') if __name__ == '__main__': logging.basicConfig(level=logging.INFO) main() diff --git a/swh/indexer/metadata_dictionary.py b/swh/indexer/metadata_dictionary.py index 300fa46..c1df17a 100644 --- a/swh/indexer/metadata_dictionary.py +++ b/swh/indexer/metadata_dictionary.py @@ -1,405 +1,421 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import re import abc import json import logging import email.parser import xmltodict from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI from swh.indexer.codemeta import compact, expand MAPPINGS = {} def register_mapping(cls): MAPPINGS[cls.__name__] = cls() return cls class BaseMapping(metaclass=abc.ABCMeta): """Base class for mappings to inherit from To implement a new mapping: - inherit this class - override translate function """ def __init__(self): self.log = logging.getLogger('%s.%s' % ( self.__class__.__module__, 
self.__class__.__name__)) @abc.abstractmethod def detect_metadata_files(self, files): """ Detects files potentially containing metadata Args: file_entries (list): list of files Returns: list: list of sha1 (possibly empty) """ pass @abc.abstractmethod def translate(self, file_content): pass def normalize_translation(self, metadata): return compact(metadata) class SingleFileMapping(BaseMapping): """Base class for all mappings that use a single file as input.""" @property @abc.abstractmethod def filename(self): """The .json file to extract metadata from.""" pass def detect_metadata_files(self, file_entries): for entry in file_entries: if entry['name'] == self.filename: return [entry['sha1']] return [] class DictMapping(BaseMapping): """Base class for mappings that take as input a file that is mostly a key-value store (eg. a shallow JSON dict).""" @property @abc.abstractmethod def mapping(self): """A translation dict to map dict keys into a canonical name.""" pass def translate_dict(self, content_dict, *, normalize=True): """ Translates content by parsing content from a dict object and translating with the appropriate mapping Args: content_dict (dict): content dict to translate Returns: dict: translated metadata in json-friendly form needed for the indexer """ translated_metadata = {'@type': SCHEMA_URI + 'SoftwareSourceCode'} for k, v in content_dict.items(): # First, check if there is a specific translation # method for this key translation_method = getattr( self, 'translate_' + k.replace('-', '_'), None) if translation_method: translation_method(translated_metadata, v) elif k in self.mapping: # if there is no method, but the key is known from the # crosswalk table # if there is a normalization method, use it on the value normalization_method = getattr( self, 'normalize_' + k.replace('-', '_'), None) if normalization_method: v = normalization_method(v) # set the translation metadata with the normalized value translated_metadata[self.mapping[k]] = v if normalize: 
return self.normalize_translation(translated_metadata) else: return translated_metadata class JsonMapping(DictMapping, SingleFileMapping): """Base class for all mappings that use a JSON file as input.""" def translate(self, raw_content): """ Translates content by parsing content from a bytestring containing json data and translating with the appropriate mapping Args: raw_content (bytes): raw content to translate Returns: dict: translated metadata in json-friendly form needed for the indexer """ try: raw_content = raw_content.decode() except UnicodeDecodeError: self.log.warning('Error unidecoding %r', raw_content) return try: content_dict = json.loads(raw_content) except json.JSONDecodeError: self.log.warning('Error unjsoning %r' % raw_content) return return self.translate_dict(content_dict) @register_mapping class NpmMapping(JsonMapping): """ dedicated class for NPM (package.json) mapping and translation """ mapping = CROSSWALK_TABLE['NodeJS'] filename = b'package.json' _schema_shortcuts = { - 'github': 'https://github.com/', - 'gist': 'https://gist.github.com/', - 'bitbucket': 'https://bitbucket.org/', - 'gitlab': 'https://gitlab.com/', + 'github': 'git+https://github.com/%s.git', + 'gist': 'git+https://gist.github.com/%s.git', + 'gitlab': 'git+https://gitlab.com/%s.git', + # Bitbucket supports both hg and git, and the shortcut does not + # tell which one to use. 
+ # 'bitbucket': 'https://bitbucket.org/', } def normalize_repository(self, d): """https://docs.npmjs.com/files/package.json#repository""" - if isinstance(d, dict): + if isinstance(d, dict) and {'type', 'url'} <= set(d): url = '{type}+{url}'.format(**d) elif isinstance(d, str): if '://' in d: url = d elif ':' in d: (schema, rest) = d.split(':', 1) if schema in self._schema_shortcuts: - url = self._schema_shortcuts[schema] + rest + url = self._schema_shortcuts[schema] % rest else: return None else: - url = self._schema_shortcuts['github'] + d + url = self._schema_shortcuts['github'] % d else: return None return {'@id': url} def normalize_bugs(self, d): - return {'@id': '{url}'.format(**d)} + """https://docs.npmjs.com/files/package.json#bugs""" + if isinstance(d, dict) and 'url' in d: + return {'@id': '{url}'.format(**d)} + elif isinstance(d, str): + return {'@id': d} + else: + return None _parse_author = re.compile(r'^ *' r'(?P<name>.*?)' r'( +<(?P<email>.*)>)?' r'( +\((?P<url>.*)\))?' r' *$') def normalize_author(self, d): 'https://docs.npmjs.com/files/package.json' \ '#people-fields-author-contributors' author = {'@type': SCHEMA_URI+'Person'} if isinstance(d, dict): name = d.get('name', None) email = d.get('email', None) url = d.get('url', None) elif isinstance(d, str): match = self._parse_author.match(d) name = match.group('name') email = match.group('email') url = match.group('url') else: return None if name: author[SCHEMA_URI+'name'] = name if email: author[SCHEMA_URI+'email'] = email if url: author[SCHEMA_URI+'url'] = {'@id': url} return {"@list": [author]} def normalize_license(self, s): - return {"@id": "https://spdx.org/licenses/" + s} + if isinstance(s, str): + return {"@id": "https://spdx.org/licenses/" + s} + else: + return None def normalize_homepage(self, s): return {"@id": s} @register_mapping class CodemetaMapping(SingleFileMapping): """ dedicated class for CodeMeta (codemeta.json) mapping and translation """ filename = b'codemeta.json' def translate(self, content): 
return self.normalize_translation(expand(json.loads(content.decode()))) @register_mapping class MavenMapping(DictMapping, SingleFileMapping): """ dedicated class for Maven (pom.xml) mapping and translation """ filename = b'pom.xml' mapping = CROSSWALK_TABLE['Java (Maven)'] def translate(self, content): d = xmltodict.parse(content)['project'] metadata = self.translate_dict(d, normalize=False) metadata[SCHEMA_URI+'codeRepository'] = self.parse_repositories(d) metadata[SCHEMA_URI+'license'] = self.parse_licenses(d) return self.normalize_translation(metadata) _default_repository = {'url': 'https://repo.maven.apache.org/maven2/'} def parse_repositories(self, d): """https://maven.apache.org/pom.html#Repositories""" if 'repositories' not in d: return [self.parse_repository(d, self._default_repository)] else: repositories = d['repositories'].get('repository', []) if not isinstance(repositories, list): repositories = [repositories] results = [] for repo in repositories: res = self.parse_repository(d, repo) if res: results.append(res) return results def parse_repository(self, d, repo): if repo.get('layout', 'default') != 'default': return # TODO ? - url = repo['url'] - if d['groupId']: - url = os.path.join(url, *d['groupId'].split('.')) - if d['artifactId']: - url = os.path.join(url, d['artifactId']) - return {"@id": url} + url = repo.get('url') + group_id = d.get('groupId') + artifact_id = d.get('artifactId') + if isinstance(url, str): + if isinstance(group_id, str): + url = os.path.join(url, *group_id.split('.')) + if isinstance(artifact_id, str): + url = os.path.join(url, artifact_id) + return {"@id": url} def normalize_groupId(self, id_): return {"@id": id_} def parse_licenses(self, d): """https://maven.apache.org/pom.html#Licenses The origin XML has the form: <licenses> <license> <name>Apache License, Version 2.0</name> <url>https://www.apache.org/licenses/LICENSE-2.0.txt</url> </license> </licenses> Which was translated to a dict by xmltodict and is given as `d`: >>> d = { ... # ... ... "licenses": { ... "license": { ... 
"name": "Apache License, Version 2.0", ... "url": ... "https://www.apache.org/licenses/LICENSE-2.0.txt" ... } ... } ... } >>> MavenMapping().parse_licenses(d) [{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'}] or, if there are more than one license: >>> from pprint import pprint >>> d = { ... # ... ... "licenses": { ... "license": [ ... { ... "name": "Apache License, Version 2.0", ... "url": ... "https://www.apache.org/licenses/LICENSE-2.0.txt" ... }, ... { ... "name": "MIT License, ", ... "url": "https://opensource.org/licenses/MIT" ... } ... ] ... } ... } >>> pprint(MavenMapping().parse_licenses(d)) [{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'}, {'@id': 'https://opensource.org/licenses/MIT'}] """ licenses = d.get('licenses', {}).get('license', []) if isinstance(licenses, dict): licenses = [licenses] - return [{"@id": license['url']} for license in licenses] + return [{"@id": license['url']} + for license in licenses + if 'url' in license] _normalize_pkginfo_key = str.lower @register_mapping class PythonPkginfoMapping(DictMapping, SingleFileMapping): """Dedicated class for Python's PKG-INFO mapping and translation. 
https://www.python.org/dev/peps/pep-0314/""" filename = b'PKG-INFO' mapping = {_normalize_pkginfo_key(k): v for (k, v) in CROSSWALK_TABLE['Python PKG-INFO'].items()} _parser = email.parser.BytesHeaderParser() def translate(self, content): msg = self._parser.parsebytes(content) d = {} for (key, value) in msg.items(): key = _normalize_pkginfo_key(key) if value != 'UNKNOWN': d.setdefault(key, []).append(value) metadata = self.translate_dict(d, normalize=False) if SCHEMA_URI+'author' in metadata or SCHEMA_URI+'email' in metadata: metadata[SCHEMA_URI+'author'] = { '@list': [{ '@type': SCHEMA_URI+'Person', SCHEMA_URI+'name': metadata.pop(SCHEMA_URI+'author', [None])[0], SCHEMA_URI+'email': metadata.pop(SCHEMA_URI+'email', [None])[0], }] } return self.normalize_translation(metadata) def translate_summary(self, translated_metadata, v): k = self.mapping['summary'] translated_metadata.setdefault(k, []).append(v) def translate_description(self, translated_metadata, v): k = self.mapping['description'] translated_metadata.setdefault(k, []).append(v) def normalize_home_page(self, urls): return [{'@id': url} for url in urls] def normalize_license(self, licenses): return [{'@id': license} for license in licenses] def main(): raw_content = """{"name": "test_name", "unknown_term": "ut"}""" raw_content1 = b"""{"name": "test_name", "unknown_term": "ut", "prerequisites" :"packageXYZ"}""" result = MAPPINGS["NpmMapping"].translate(raw_content) result1 = MAPPINGS["MavenMapping"].translate(raw_content1) print(result) print(result1) if __name__ == "__main__": main() diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py index 85630d9..dcdc7da 100644 --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -1,657 +1,764 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level 
LICENSE file for more information import unittest from swh.model.hashutil import hash_to_bytes from swh.indexer.metadata_dictionary import CROSSWALK_TABLE, MAPPINGS from swh.indexer.metadata_detector import ( detect_metadata, extract_minimal_metadata_dict ) from swh.indexer.metadata import ( ContentMetadataIndexer, RevisionMetadataIndexer ) from .test_utils import ( BASE_TEST_CONFIG, fill_obj_storage, fill_storage ) TRANSLATOR_TOOL = { 'name': 'swh-metadata-translator', 'version': '0.0.2', 'configuration': { 'type': 'local', 'context': 'NpmMapping' } } class ContentMetadataTestIndexer(ContentMetadataIndexer): """Specific Metadata whose configuration is enough to satisfy the indexing tests. """ def parse_config_file(self, *args, **kwargs): assert False, 'should not be called; the rev indexer configures it.' class RevisionMetadataTestIndexer(RevisionMetadataIndexer): """Specific indexer whose configuration is enough to satisfy the indexing tests. """ ContentMetadataIndexer = ContentMetadataTestIndexer def parse_config_file(self, *args, **kwargs): return { **BASE_TEST_CONFIG, 'tools': TRANSLATOR_TOOL, } class Metadata(unittest.TestCase): """ Tests metadata_mock_tool tool for Metadata detection """ def setUp(self): """ shows the entire diff in the results """ self.maxDiff = None def test_crosstable(self): self.assertEqual(CROSSWALK_TABLE['NodeJS'], { 'repository': 'http://schema.org/codeRepository', 'os': 'http://schema.org/operatingSystem', 'cpu': 'http://schema.org/processorRequirements', 'engines': 'http://schema.org/processorRequirements', 'author': 'http://schema.org/author', 'author.email': 'http://schema.org/email', 'author.name': 'http://schema.org/name', 'contributor': 'http://schema.org/contributor', 'keywords': 'http://schema.org/keywords', 'license': 'http://schema.org/license', 'version': 'http://schema.org/version', 'description': 'http://schema.org/description', 'name': 'http://schema.org/name', 'bugs': 'https://codemeta.github.io/terms/issueTracker', 
'homepage': 'http://schema.org/url' }) def test_compute_metadata_none(self): """ testing content empty content is empty should return None """ # given content = b"" # None if no metadata was found or an error occurred declared_metadata = None # when result = MAPPINGS["NpmMapping"].translate(content) # then self.assertEqual(declared_metadata, result) def test_compute_metadata_npm(self): """ testing only computation of metadata with hard_mapping_npm """ # given content = b""" { "name": "test_metadata", "version": "0.0.2", "description": "Simple package.json test for indexer", "repository": { "type": "git", "url": "https://github.com/moranegg/metadata_test" }, "author": { "email": "moranegg@example.com", "name": "Morane G" } } """ declared_metadata = { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'test_metadata', 'version': '0.0.2', 'description': 'Simple package.json test for indexer', 'codeRepository': 'git+https://github.com/moranegg/metadata_test', 'author': [{ 'type': 'Person', 'name': 'Morane G', 'email': 'moranegg@example.com', }], } # when result = MAPPINGS["NpmMapping"].translate(content) # then self.assertEqual(declared_metadata, result) def test_extract_minimal_metadata_dict(self): """ Test the creation of a coherent minimal metadata set """ # given metadata_list = [{ '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'test_1', 'version': '0.0.2', 'description': 'Simple package.json test for indexer', 'codeRepository': 'git+https://github.com/moranegg/metadata_test', }, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'test_0_1', 'version': '0.0.2', 'description': 'Simple package.json test for indexer', 'codeRepository': 'git+https://github.com/moranegg/metadata_test' }, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'test_metadata', 'version': '0.0.2', 'author': 'moranegg', }] # when results = extract_minimal_metadata_dict(metadata_list) # then 
expected_results = { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', "version": '0.0.2', "description": 'Simple package.json test for indexer', "name": ['test_1', 'test_0_1', 'test_metadata'], "author": ['moranegg'], "codeRepository": 'git+https://github.com/moranegg/metadata_test', } self.assertEqual(expected_results, results) def test_index_content_metadata_npm(self): """ testing NPM with package.json - one sha1 uses a file that can't be translated to metadata and should return None in the translated metadata """ # given sha1s = [ hash_to_bytes('26a9f72a7c87cc9205725cfd879f514ff4f3d8d5'), hash_to_bytes('d4c647f0fc257591cc9ba1722484229780d1c607'), hash_to_bytes('02fb2c89e14f7fab46701478c83779c7beb7b069'), ] # this metadata indexer computes only metadata for package.json # in npm context with a hard mapping metadata_indexer = ContentMetadataTestIndexer( tool=TRANSLATOR_TOOL, config=BASE_TEST_CONFIG.copy()) fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage) # when metadata_indexer.run(sha1s, policy_update='ignore-dups') results = list(metadata_indexer.idx_storage.content_metadata_get( sha1s)) expected_results = [{ 'translated_metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'codeRepository': 'git+https://github.com/moranegg/metadata_test', 'description': 'Simple package.json test for indexer', 'name': 'test_metadata', 'version': '0.0.1' }, 'id': hash_to_bytes('26a9f72a7c87cc9205725cfd879f514ff4f3d8d5') }, { 'translated_metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'issueTracker': 'https://github.com/npm/npm/issues', 'author': [{ 'type': 'Person', 'name': 'Isaac Z. 
Schlueter', 'email': 'i@izs.me', 'url': 'http://blog.izs.me', }], 'codeRepository': 'git+https://github.com/npm/npm', 'description': 'a package manager for JavaScript', 'license': 'https://spdx.org/licenses/Artistic-2.0', 'version': '5.0.3', 'name': 'npm', 'keywords': [ 'install', 'modules', 'package manager', 'package.json' ], 'url': 'https://docs.npmjs.com/' }, 'id': hash_to_bytes('d4c647f0fc257591cc9ba1722484229780d1c607') - }, { - 'translated_metadata': None, - 'id': hash_to_bytes('02fb2c89e14f7fab46701478c83779c7beb7b069') }] for result in results: del result['tool'] # The assertion below returns False sometimes because of nested lists self.assertEqual(expected_results, results) + def test_npm_bugs_normalization(self): + # valid dictionary + package_json = b"""{ + "name": "foo", + "bugs": { + "url": "https://github.com/owner/project/issues", + "email": "foo@example.com" + } + }""" + result = MAPPINGS["NpmMapping"].translate(package_json) + self.assertEqual(result, { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'name': 'foo', + 'issueTracker': 'https://github.com/owner/project/issues', + 'type': 'SoftwareSourceCode', + }) + + # "invalid" dictionary + package_json = b"""{ + "name": "foo", + "bugs": { + "email": "foo@example.com" + } + }""" + result = MAPPINGS["NpmMapping"].translate(package_json) + self.assertEqual(result, { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'name': 'foo', + 'type': 'SoftwareSourceCode', + }) + + # string + package_json = b"""{ + "name": "foo", + "bugs": "https://github.com/owner/project/issues" + }""" + result = MAPPINGS["NpmMapping"].translate(package_json) + self.assertEqual(result, { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'name': 'foo', + 'issueTracker': 'https://github.com/owner/project/issues', + 'type': 'SoftwareSourceCode', + }) + + def test_npm_repository_normalization(self): + # normal + package_json = b"""{ + "name": "foo", + "repository": { + "type" : "git", + 
"url" : "https://github.com/npm/cli.git" + } + }""" + result = MAPPINGS["NpmMapping"].translate(package_json) + self.assertEqual(result, { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'name': 'foo', + 'codeRepository': 'git+https://github.com/npm/cli.git', + 'type': 'SoftwareSourceCode', + }) + + # missing url + package_json = b"""{ + "name": "foo", + "repository": { + "type" : "git" + } + }""" + result = MAPPINGS["NpmMapping"].translate(package_json) + self.assertEqual(result, { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'name': 'foo', + 'type': 'SoftwareSourceCode', + }) + + # github shortcut + package_json = b"""{ + "name": "foo", + "repository": "github:npm/cli" + }""" + result = MAPPINGS["NpmMapping"].translate(package_json) + expected_result = { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'name': 'foo', + 'codeRepository': 'git+https://github.com/npm/cli.git', + 'type': 'SoftwareSourceCode', + } + self.assertEqual(result, expected_result) + + # github shortshortcut + package_json = b"""{ + "name": "foo", + "repository": "npm/cli" + }""" + result = MAPPINGS["NpmMapping"].translate(package_json) + self.assertEqual(result, expected_result) + + # gitlab shortcut + package_json = b"""{ + "name": "foo", + "repository": "gitlab:user/repo" + }""" + result = MAPPINGS["NpmMapping"].translate(package_json) + self.assertEqual(result, { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'name': 'foo', + 'codeRepository': 'git+https://gitlab.com/user/repo.git', + 'type': 'SoftwareSourceCode', + }) + def test_detect_metadata_package_json(self): # given df = [{ 'sha1_git': b'abc', 'name': b'index.js', 'target': b'abc', 'length': 897, 'status': 'visible', 'type': 'file', 'perms': 33188, 'dir_id': b'dir_a', 'sha1': b'bcd' }, { 'sha1_git': b'aab', 'name': b'package.json', 'target': b'aab', 'length': 712, 'status': 'visible', 'type': 'file', 'perms': 33188, 'dir_id': b'dir_a', 'sha1': b'cde' }] # when results = 
detect_metadata(df) expected_results = { 'NpmMapping': [ b'cde' ] } # then self.assertEqual(expected_results, results) def test_compute_metadata_valid_codemeta(self): raw_content = ( b"""{ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "@type": "SoftwareSourceCode", "identifier": "CodeMeta", "description": "CodeMeta is a concept vocabulary that can be used to standardize the exchange of software metadata across repositories and organizations.", "name": "CodeMeta: Minimal metadata schemas for science software and code, in JSON-LD", "codeRepository": "https://github.com/codemeta/codemeta", "issueTracker": "https://github.com/codemeta/codemeta/issues", "license": "https://spdx.org/licenses/Apache-2.0", "version": "2.0", "author": [ { "@type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "@id": "http://orcid.org/0000-0002-1642-628X" }, { "@type": "Person", "givenName": "Matthew B.", "familyName": "Jones", "email": "jones@nceas.ucsb.edu", "@id": "http://orcid.org/0000-0003-0077-4738" } ], "maintainer": { "@type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "@id": "http://orcid.org/0000-0002-1642-628X" }, "contIntegration": "https://travis-ci.org/codemeta/codemeta", "developmentStatus": "active", "downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip", "funder": { "@id": "https://doi.org/10.13039/100000001", "@type": "Organization", "name": "National Science Foundation" }, "funding":"1549758; Codemeta: A Rosetta Stone for Metadata in Scientific Software", "keywords": [ "metadata", "software" ], "version":"2.0", "dateCreated":"2017-06-05", "datePublished":"2017-06-05", "programmingLanguage": "JSON-LD" }""") # noqa expected_result = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "identifier": "CodeMeta", "description": "CodeMeta is a concept vocabulary that can " "be used to standardize the exchange of software 
metadata " "across repositories and organizations.", "name": "CodeMeta: Minimal metadata schemas for science " "software and code, in JSON-LD", "codeRepository": "https://github.com/codemeta/codemeta", "issueTracker": "https://github.com/codemeta/codemeta/issues", "license": "https://spdx.org/licenses/Apache-2.0", "version": "2.0", "author": [ { "type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "id": "http://orcid.org/0000-0002-1642-628X" }, { "type": "Person", "givenName": "Matthew B.", "familyName": "Jones", "email": "jones@nceas.ucsb.edu", "id": "http://orcid.org/0000-0003-0077-4738" } ], "maintainer": { "type": "Person", "givenName": "Carl", "familyName": "Boettiger", "email": "cboettig@gmail.com", "id": "http://orcid.org/0000-0002-1642-628X" }, "contIntegration": "https://travis-ci.org/codemeta/codemeta", "developmentStatus": "active", "downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip", "funder": { "id": "https://doi.org/10.13039/100000001", "type": "Organization", "name": "National Science Foundation" }, "funding": "1549758; Codemeta: A Rosetta Stone for Metadata " "in Scientific Software", "keywords": [ "metadata", "software" ], "version": "2.0", "dateCreated": "2017-06-05", "datePublished": "2017-06-05", "programmingLanguage": "JSON-LD" } result = MAPPINGS["CodemetaMapping"].translate(raw_content) self.assertEqual(result, expected_result) def test_compute_metadata_maven(self): raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 central Maven Repository Switchboard default http://repo1.maven.org/maven2 false Apache License, Version 2.0 https://www.apache.org/licenses/LICENSE-2.0.txt repo A business-friendly OSS license """ result = MAPPINGS["MavenMapping"].translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'Maven Default Project', 'identifier': 'com.mycompany.app', 
'version': '1.2.3', 'license': 'https://www.apache.org/licenses/LICENSE-2.0.txt', 'codeRepository': 'http://repo1.maven.org/maven2/com/mycompany/app/my-app', }) def test_compute_metadata_maven_minimal(self): raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 """ result = MAPPINGS["MavenMapping"].translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'Maven Default Project', 'identifier': 'com.mycompany.app', 'version': '1.2.3', 'codeRepository': 'https://repo.maven.apache.org/maven2/com/mycompany/app/my-app', 'license': [], }) def test_compute_metadata_maven_multiple(self): '''Tests when there are multiple code repos and licenses.''' raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 central Maven Repository Switchboard default http://repo1.maven.org/maven2 false example Example Maven Repo default http://example.org/maven2 Apache License, Version 2.0 https://www.apache.org/licenses/LICENSE-2.0.txt repo A business-friendly OSS license MIT license https://opensource.org/licenses/MIT """ result = MAPPINGS["MavenMapping"].translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'Maven Default Project', 'identifier': 'com.mycompany.app', 'version': '1.2.3', 'license': [ 'https://www.apache.org/licenses/LICENSE-2.0.txt', 'https://opensource.org/licenses/MIT', ], 'codeRepository': [ 'http://repo1.maven.org/maven2/com/mycompany/app/my-app', 'http://example.org/maven2/com/mycompany/app/my-app', ] }) def test_compute_metadata_pkginfo(self): raw_content = (b"""\ Metadata-Version: 2.1 Name: swh.core Version: 0.0.49 Summary: Software Heritage core utilities Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, 
https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-core Description: swh-core ======== core library for swh's modules: - config parser - hash computations - serialization - logging mechanism Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing """) # noqa result = MAPPINGS["PythonPkginfoMapping"].translate(raw_content) self.assertCountEqual(result['description'], [ 'Software Heritage core utilities', # note the comma here 'swh-core\n' ' ========\n' ' \n' " core library for swh's modules:\n" ' - config parser\n' ' - hash computations\n' ' - serialization\n' ' - logging mechanism\n' ' '], result) del result['description'] self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'url': 'https://forge.softwareheritage.org/diffusion/DCORE/', 'name': 'swh.core', 'author': [{ 'type': 'Person', 'name': 'Software Heritage developers', 'email': 'swh-devel@inria.fr', }], 'version': '0.0.49', }) def test_compute_metadata_pkginfo_license(self): raw_content = (b"""\ Metadata-Version: 2.1 Name: foo License: MIT """) # noqa result = MAPPINGS["PythonPkginfoMapping"].translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'name': 'foo', 'license': 'MIT', }) def test_revision_metadata_indexer(self): metadata_indexer = RevisionMetadataTestIndexer() fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage) tool = metadata_indexer.idx_storage.indexer_configuration_get( {'tool_'+k: v for (k, v) in 
TRANSLATOR_TOOL.items()}) assert tool is not None metadata_indexer.idx_storage.content_metadata_add([{ 'indexer_configuration_id': tool['id'], 'id': b'cde', 'translated_metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'issueTracker': 'https://github.com/librariesio/yarn-parser/issues', 'version': '1.0.0', 'name': 'yarn-parser', 'author': ['Andrew Nesbitt'], 'url': 'https://github.com/librariesio/yarn-parser#readme', 'processorRequirements': {'node': '7.5'}, 'license': 'AGPL-3.0', 'keywords': ['yarn', 'parse', 'lock', 'dependencies'], 'codeRepository': 'git+https://github.com/librariesio/yarn-parser.git', 'description': 'Tiny web service for parsing yarn.lock files', } }]) sha1_gits = [ hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), ] metadata_indexer.run(sha1_gits, 'update-dups') results = list(metadata_indexer.idx_storage.revision_metadata_get( sha1_gits)) expected_results = [{ 'id': hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f'), 'tool': TRANSLATOR_TOOL, 'translated_metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'url': 'https://github.com/librariesio/yarn-parser#readme', 'codeRepository': 'git+https://github.com/librariesio/yarn-parser.git', 'author': ['Andrew Nesbitt'], 'license': 'AGPL-3.0', 'version': '1.0.0', 'description': 'Tiny web service for parsing yarn.lock files', 'issueTracker': 'https://github.com/librariesio/yarn-parser/issues', 'name': 'yarn-parser', 'keywords': ['yarn', 'parse', 'lock', 'dependencies'], }, }] for result in results: del result['tool']['id'] # then self.assertEqual(expected_results, results) diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py index 5053bd1..20b756c 100644 --- a/swh/indexer/tests/test_origin_metadata.py +++ b/swh/indexer/tests/test_origin_metadata.py @@ -1,172 +1,173 @@ # Copyright (C) 2018 The Software Heritage developers # See the AUTHORS file at the top-level directory 
of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import time import unittest +import unittest.mock from celery import task from swh.model.hashutil import hash_to_bytes from swh.storage.in_memory import Storage from swh.indexer.metadata import ( OriginMetadataIndexer, RevisionMetadataIndexer ) from swh.indexer.storage.in_memory import IndexerStorage from swh.objstorage.objstorage_in_memory import InMemoryObjStorage from swh.scheduler.tests.scheduler_testing import SchedulerTestFixture from .test_utils import ( BASE_TEST_CONFIG, fill_storage, fill_obj_storage ) from .test_origin_head import OriginHeadTestIndexer from .test_metadata import ContentMetadataTestIndexer class RevisionMetadataTestIndexer(RevisionMetadataIndexer): """Specific indexer whose configuration is enough to satisfy the indexing tests. """ ContentMetadataIndexer = ContentMetadataTestIndexer def parse_config_file(self, *args, **kwargs): return { **BASE_TEST_CONFIG, 'tools': { 'name': 'swh-metadata-detector', 'version': '0.0.2', 'configuration': { 'type': 'local', 'context': 'NpmMapping' } } } @task def revision_metadata_test_task(*args, **kwargs): indexer = RevisionMetadataTestIndexer() indexer.run(*args, **kwargs) return indexer.results class OriginMetadataTestIndexer(OriginMetadataIndexer): def parse_config_file(self, *args, **kwargs): return { **BASE_TEST_CONFIG, 'tools': [] } @task def origin_intrinsic_metadata_test_task(*args, **kwargs): indexer = OriginMetadataTestIndexer() indexer.run(*args, **kwargs) return indexer.results class OriginHeadTestIndexer(OriginHeadTestIndexer): def prepare(self): super().prepare() self.config['tasks'] = { 'revision_metadata': 'revision_metadata_test_task', 'origin_intrinsic_metadata': 'origin_intrinsic_metadata_test_task', } class TestOriginMetadata(SchedulerTestFixture, unittest.TestCase): def setUp(self): super().setUp() self.maxDiff = None self.add_scheduler_task_type( 
'revision_metadata_test_task', 'swh.indexer.tests.test_origin_metadata.' 'revision_metadata_test_task') self.add_scheduler_task_type( 'origin_intrinsic_metadata_test_task', 'swh.indexer.tests.test_origin_metadata.' 'origin_intrinsic_metadata_test_task') RevisionMetadataTestIndexer.scheduler = self.scheduler def tearDown(self): del RevisionMetadataTestIndexer.scheduler super().tearDown() @unittest.mock.patch('swh.indexer.storage.in_memory.IndexerStorage') @unittest.mock.patch('swh.storage.in_memory.Storage') def test_pipeline(self, storage_mock, idx_storage_mock): # Always returns the same instance of the idx storage, because # this function is called by each of the three indexers. objstorage = InMemoryObjStorage() storage = Storage() idx_storage = IndexerStorage() storage_mock.return_value = storage idx_storage_mock.return_value = idx_storage fill_obj_storage(objstorage) fill_storage(storage) # TODO: find a better way to share the ContentMetadataIndexer use # the same objstorage instance. 
import swh.objstorage old_inmem_objstorage = swh.objstorage._STORAGE_CLASSES['memory'] swh.objstorage._STORAGE_CLASSES['memory'] = lambda: objstorage try: indexer = OriginHeadTestIndexer() indexer.scheduler = self.scheduler indexer.run(["git+https://github.com/librariesio/yarn-parser"]) self.run_ready_tasks() # Run the first task # Give it time to complete and schedule the 2nd one time.sleep(0.1) self.run_ready_tasks() # Run the second task finally: swh.objstorage._STORAGE_CLASSES['memory'] = old_inmem_objstorage origin = storage.origin_get({ 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}) rev_id = hash_to_bytes('8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f') metadata = { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'url': 'https://github.com/librariesio/yarn-parser#readme', 'codeRepository': 'git+git+https://github.com/librariesio/yarn-parser.git', 'author': [{ 'type': 'Person', 'name': 'Andrew Nesbitt' }], 'license': 'https://spdx.org/licenses/AGPL-3.0', 'version': '1.0.0', 'description': 'Tiny web service for parsing yarn.lock files', 'issueTracker': 'https://github.com/librariesio/yarn-parser/issues', 'name': 'yarn-parser', 'keywords': ['yarn', 'parse', 'lock', 'dependencies'], } rev_metadata = { 'id': rev_id, 'translated_metadata': metadata, } origin_metadata = { 'origin_id': origin['id'], 'from_revision': rev_id, 'metadata': metadata, } results = list(indexer.idx_storage.revision_metadata_get([rev_id])) for result in results: del result['tool'] self.assertEqual(results, [rev_metadata]) results = list(indexer.idx_storage.origin_intrinsic_metadata_get([ origin['id']])) for result in results: del result['tool'] self.assertEqual(results, [origin_metadata]) diff --git a/version.txt b/version.txt index 60b0617..699aa4e 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.126-0-g3224c26 \ No newline at end of file +v0.0.127-0-g1146b7c \ No newline at end of file