Make NpmMapping's normalization of the package.json "repository" and "bugs"
fields tolerant of incomplete or shorthand values.

* Repository shortcuts (github, gist, gitlab) become '%s' templates that
  expand to full 'git+https://<host>/<path>.git' URLs. The bitbucket shortcut
  is disabled because it does not tell whether the repository is hg or git.
* A "repository" object is only used when it has both 'type' and 'url' keys.
* "bugs" may be an object with a 'url' key or a plain string; any other shape
  yields None.
* Tests are added for both normalizations.

diff --git a/swh/indexer/metadata_dictionary.py b/swh/indexer/metadata_dictionary.py
--- a/swh/indexer/metadata_dictionary.py
+++ b/swh/indexer/metadata_dictionary.py
@@ -161,15 +161,17 @@
     filename = b'package.json'
 
     _schema_shortcuts = {
-        'github': 'https://github.com/',
-        'gist': 'https://gist.github.com/',
-        'bitbucket': 'https://bitbucket.org/',
-        'gitlab': 'https://gitlab.com/',
+        'github': 'git+https://github.com/%s.git',
+        'gist': 'git+https://gist.github.com/%s.git',
+        'gitlab': 'git+https://gitlab.com/%s.git',
+        # Bitbucket supports both hg and git, and the shortcut does not
+        # tell which one to use.
+        # 'bitbucket': 'https://bitbucket.org/',
     }
 
     def normalize_repository(self, d):
         """https://docs.npmjs.com/files/package.json#repository"""
-        if isinstance(d, dict):
+        if isinstance(d, dict) and {'type', 'url'} <= set(d):
             url = '{type}+{url}'.format(**d)
         elif isinstance(d, str):
             if '://' in d:
@@ -177,11 +179,11 @@
             elif ':' in d:
                 (schema, rest) = d.split(':', 1)
                 if schema in self._schema_shortcuts:
-                    url = self._schema_shortcuts[schema] + rest
+                    url = self._schema_shortcuts[schema] % rest
                 else:
                     return None
             else:
-                url = self._schema_shortcuts['github'] + d
+                url = self._schema_shortcuts['github'] % d
         else:
             return None
 
@@ -189,7 +191,13 @@
         return {'@id': url}
 
     def normalize_bugs(self, d):
-        return {'@id': '{url}'.format(**d)}
+        """https://docs.npmjs.com/files/package.json#bugs"""
+        if isinstance(d, dict) and 'url' in d:
+            return {'@id': '{url}'.format(**d)}
+        elif isinstance(d, str):
+            return {'@id': d}
+        else:
+            return None
 
     _parse_author = re.compile(r'^ *'
                                r'(?P<name>.*?)'
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -248,6 +248,116 @@
         # The assertion below returns False sometimes because of nested lists
         self.assertEqual(expected_results, results)
 
+    def test_npm_bugs_normalization(self):
+        # valid dictionary
+        package_json = b"""{
+            "name": "foo",
+            "bugs": {
+                "url": "https://github.com/owner/project/issues",
+                "email": "foo@example.com"
+            }
+        }"""
+        result = MAPPINGS["NpmMapping"].translate(package_json)
+        self.assertEqual(result, {
+            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+            'name': 'foo',
+            'issueTracker': 'https://github.com/owner/project/issues',
+            'type': 'SoftwareSourceCode',
+        })
+
+        # "invalid" dictionary
+        package_json = b"""{
+            "name": "foo",
+            "bugs": {
+                "email": "foo@example.com"
+            }
+        }"""
+        result = MAPPINGS["NpmMapping"].translate(package_json)
+        self.assertEqual(result, {
+            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+            'name': 'foo',
+            'type': 'SoftwareSourceCode',
+        })
+
+        # string
+        package_json = b"""{
+            "name": "foo",
+            "bugs": "https://github.com/owner/project/issues"
+        }"""
+        result = MAPPINGS["NpmMapping"].translate(package_json)
+        self.assertEqual(result, {
+            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+            'name': 'foo',
+            'issueTracker': 'https://github.com/owner/project/issues',
+            'type': 'SoftwareSourceCode',
+        })
+
+    def test_npm_repository_normalization(self):
+        # normal
+        package_json = b"""{
+            "name": "foo",
+            "repository": {
+                "type" : "git",
+                "url" : "https://github.com/npm/cli.git"
+            }
+        }"""
+        result = MAPPINGS["NpmMapping"].translate(package_json)
+        self.assertEqual(result, {
+            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+            'name': 'foo',
+            'codeRepository': 'git+https://github.com/npm/cli.git',
+            'type': 'SoftwareSourceCode',
+        })
+
+        # missing url
+        package_json = b"""{
+            "name": "foo",
+            "repository": {
+                "type" : "git"
+            }
+        }"""
+        result = MAPPINGS["NpmMapping"].translate(package_json)
+        self.assertEqual(result, {
+            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+            'name': 'foo',
+            'type': 'SoftwareSourceCode',
+        })
+
+        # github shortcut
+        package_json = b"""{
+            "name": "foo",
+            "repository": "github:npm/cli"
+        }"""
+        result = MAPPINGS["NpmMapping"].translate(package_json)
+        expected_result = {
+            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+            'name': 'foo',
+            'codeRepository': 'git+https://github.com/npm/cli.git',
+            'type': 'SoftwareSourceCode',
+        }
+        self.assertEqual(result, expected_result)
+
+        # github shortshortcut
+        package_json = b"""{
+            "name": "foo",
+            "repository": "npm/cli"
+        }"""
+        result = MAPPINGS["NpmMapping"].translate(package_json)
+        self.assertEqual(result, expected_result)
+
+        # gitlab shortcut
+        package_json = b"""{
+            "name": "foo",
+            "repository": "gitlab:user/repo"
+        }"""
+        result = MAPPINGS["NpmMapping"].translate(package_json)
+        self.assertEqual(result, {
+            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+            'name': 'foo',
+            'codeRepository': 'git+https://gitlab.com/user/repo.git',
+            'type': 'SoftwareSourceCode',
+        })
+
     def test_detect_metadata_package_json(self):
         # given
         df = [{