diff --git a/swh/indexer/codemeta.py b/swh/indexer/codemeta.py --- a/swh/indexer/codemeta.py +++ b/swh/indexer/codemeta.py @@ -90,11 +90,11 @@ codemeta_translation[col][local_name.strip()] = \ canonical_name - return codemeta_translation + return (header, codemeta_translation) with open(CROSSWALK_TABLE_PATH) as fd: - CROSSWALK_TABLE = _read_crosstable(fd) + (CODEMETA_KEYS, CROSSWALK_TABLE) = _read_crosstable(fd) def _document_loader(url): diff --git a/swh/indexer/metadata_dictionary.py b/swh/indexer/metadata_dictionary.py --- a/swh/indexer/metadata_dictionary.py +++ b/swh/indexer/metadata_dictionary.py @@ -126,6 +126,10 @@ """Base class for mappings that take as input a file that is mostly a key-value store (eg. a shallow JSON dict).""" + string_fields = [] + '''List of fields that are simple strings, and don't need any + normalization.''' + @property @abc.abstractmethod def mapping(self): @@ -163,6 +167,12 @@ self, 'normalize_' + k.replace('-', '_'), None) if normalization_method: v = normalization_method(v) + elif k in self.string_fields and isinstance(v, str): + pass + elif k in self.string_fields and isinstance(v, list): + v = [x for x in v if isinstance(x, str)] + else: + continue # set the translation metadata with the normalized value if codemeta_key in translated_metadata: @@ -214,6 +224,7 @@ name = 'npm' mapping = CROSSWALK_TABLE['NodeJS'] filename = b'package.json' + string_fields = ['name', 'version', 'homepage', 'description', 'email'] _schema_shortcuts = { 'github': 'git+https://github.com/%s.git', @@ -239,7 +250,8 @@ ... 'foo/bar') {'@id': 'git+https://github.com/foo/bar.git'} """ - if isinstance(d, dict) and {'type', 'url'} <= set(d): + if isinstance(d, dict) and isinstance(d.get('type'), str) \ + and isinstance(d.get('url'), str): url = '{type}+{url}'.format(**d) elif isinstance(d, str): if '://' in d: @@ -270,8 +282,8 @@ ... 
'https://example.org/bugs/') {'@id': 'https://example.org/bugs/'} """ - if isinstance(d, dict) and 'url' in d: - return {'@id': '{url}'.format(**d)} + if isinstance(d, dict) and isinstance(d.get('url'), str): + return {'@id': d['url']} elif isinstance(d, str): return {'@id': d} else: @@ -316,11 +328,11 @@ url = match.group('url') else: return None - if name: + if name and isinstance(name, str): author[SCHEMA_URI+'name'] = name - if email: + if email and isinstance(email, str): author[SCHEMA_URI+'email'] = email - if url: + if url and isinstance(url, str): author[SCHEMA_URI+'url'] = {'@id': url} return {"@list": [author]} @@ -344,6 +356,15 @@ if isinstance(s, str): return {"@id": s} + def normalize_keywords(self, l): + """https://docs.npmjs.com/files/package.json#keywords + + >>> NpmMapping().normalize_keywords(['foo', 'bar']) + ['foo', 'bar'] + """ + if isinstance(l, list): + return [x for x in l if isinstance(x, str)] + @register_mapping class CodemetaMapping(SingleFileMapping): @@ -352,9 +373,14 @@ """ name = 'codemeta' filename = b'codemeta.json' + string_fields = ['name', 'version', 'url', 'description', 'email'] def translate(self, content): - return self.normalize_translation(expand(json.loads(content.decode()))) + try: + return self.normalize_translation(expand( + json.loads(content.decode()))) + except Exception: + return None @register_mapping @@ -365,6 +391,7 @@ name = 'maven' filename = b'pom.xml' mapping = CROSSWALK_TABLE['Java (Maven)'] + string_fields = ['name', 'version', 'description', 'email'] def translate(self, content): try: @@ -407,12 +434,14 @@ repositories = d.get('repositories') if not repositories: results = [self.parse_repository(d, self._default_repository)] - else: + elif isinstance(repositories, dict): repositories = repositories.get('repository') or [] if not isinstance(repositories, list): repositories = [repositories] results = [self.parse_repository(d, repo) for repo in repositories] + else: + results = [] return [res for res in 
results if res] or None def parse_repository(self, d, repo): @@ -432,7 +461,8 @@ >>> MavenMapping().normalize_groupId('org.example') {'@id': 'org.example'} """ - return {"@id": id_} + if isinstance(id_, str): + return {"@id": id_} def parse_licenses(self, d): """https://maven.apache.org/pom.html#Licenses @@ -490,7 +520,8 @@ return return [{"@id": license['url']} for license in licenses - if isinstance(license, dict) and 'url' in license] or None + if isinstance(license, dict) + and isinstance(license.get('url'), str)] or None _normalize_pkginfo_key = str.lower @@ -513,6 +544,8 @@ filename = b'PKG-INFO' mapping = {_normalize_pkginfo_key(k): v for (k, v) in CROSSWALK_TABLE['Python PKG-INFO'].items()} + string_fields = ['name', 'version', 'description', 'summary', + 'author', 'author-email'] _parser = email.parser.BytesHeaderParser( policy=LinebreakPreservingEmailPolicy()) @@ -548,6 +581,7 @@ class GemspecMapping(DictMapping): name = 'gemspec' mapping = CROSSWALK_TABLE['Ruby Gem'] + string_fields = ['name', 'version', 'description', 'summary', 'email'] _re_spec_new = re.compile(r'.*Gem::Specification.new +(do|\{) +\|.*\|.*') _re_spec_entry = re.compile(r'\s*\w+\.(?P\w+)\s*=\s*(?P.*)') diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -3,10 +3,15 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import json import unittest +from hypothesis import given, strategies +import xmltodict + from swh.model.hashutil import hash_to_bytes +from swh.indexer.codemeta import CODEMETA_KEYS from swh.indexer.metadata_dictionary import ( CROSSWALK_TABLE, MAPPINGS, merge_values) from swh.indexer.metadata_detector import ( @@ -17,7 +22,8 @@ ) from .utils import ( - BASE_TEST_CONFIG, fill_obj_storage, fill_storage + BASE_TEST_CONFIG, fill_obj_storage, fill_storage, + json_document_strategy ) @@ 
-782,6 +788,18 @@ 'https://repo.maven.apache.org/maven2/com/mycompany/app/my-app', }) + raw_content = b""" + + + 1.2.3 + """ + result = self.maven_mapping.translate(raw_content) + self.assertEqual(result, { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'type': 'SoftwareSourceCode', + 'version': '1.2.3', + }) + def test_compute_metadata_maven_invalid_licenses(self): raw_content = b""" @@ -1040,6 +1058,42 @@ 'description': 'execute system commands with aliases', }) + @given(json_document_strategy( + keys=list(MAPPINGS['NpmMapping'].mapping))) + def test_npm_adversarial(self, doc): + raw = json.dumps(doc).encode() + self.npm_mapping.translate(raw) + + @given(json_document_strategy(keys=CODEMETA_KEYS)) + def test_codemeta_adversarial(self, doc): + raw = json.dumps(doc).encode() + self.codemeta_mapping.translate(raw) + + @given(json_document_strategy( + keys=list(MAPPINGS['MavenMapping'].mapping))) + def test_maven_adversarial(self, doc): + raw = xmltodict.unparse({'project': doc}, pretty=True) + self.maven_mapping.translate(raw) + + @given(strategies.dictionaries( + # keys + strategies.one_of( + strategies.characters(), + *map(strategies.just, MAPPINGS['GemspecMapping'].mapping) + ), + # values + strategies.recursive( + strategies.characters(), + lambda children: strategies.lists(children, min_size=1) + ) + )) + def test_gemspec_adversarial(self, doc): + parts = ['Gem::Specification.new do |s|\n'] + for (k, v) in doc.items(): + parts.append(' s.{} = {}\n'.format(k, repr(v))) + parts.append('end\n') + self.gemspec_mapping.translate(''.join(parts).encode()) + def test_revision_metadata_indexer(self): metadata_indexer = RevisionMetadataIndexer( config=REVISION_METADATA_CONFIG) diff --git a/swh/indexer/tests/utils.py b/swh/indexer/tests/utils.py --- a/swh/indexer/tests/utils.py +++ b/swh/indexer/tests/utils.py @@ -8,6 +8,8 @@ import hashlib import random +from hypothesis import strategies + from swh.model import hashutil from swh.model.hashutil import hash_to_bytes, 
hash_to_hex @@ -405,6 +407,39 @@ }] +json_dict_keys = strategies.one_of( + strategies.characters(), + *map(strategies.just, ['type', 'url', 'name', 'email', '@id', + '@context', 'repository', 'license', + ]), +) +"""Hypothesis strategy that generates strings, with an emphasis on those +that are often used as dictionary keys in metadata files.""" + + +generic_json_document = strategies.recursive( + strategies.none() | strategies.booleans() | strategies.floats() | + strategies.characters(), + lambda children: ( + strategies.lists(children, min_size=1) | + strategies.dictionaries(json_dict_keys, children, min_size=1) + ) +) +"""Hypothesis strategy that generates the kinds of values found in JSON +metadata files.""" + + +def json_document_strategy(keys=None): + """Returns a Hypothesis strategy that generates metadata files + for a format that uses the given keys.""" + if keys is None: + keys = strategies.characters() + else: + keys = strategies.one_of(map(strategies.just, keys)) + + return strategies.dictionaries(keys, generic_json_document, min_size=2) + + def filter_dict(d, keys): 'return a copy of the dict with keys deleted' if not isinstance(keys, (list, tuple)):