Changeset View
Standalone View
swh/indexer/tests/test_metadata.py
Show First 20 Lines • Show All 78 Lines • ▼ Show 20 Lines | def setUp(self): | ||||
'type': 'local', | 'type': 'local', | ||||
'context': 'NpmMapping' | 'context': 'NpmMapping' | ||||
} | } | ||||
} | } | ||||
MockIndexerStorage.added_data = [] | MockIndexerStorage.added_data = [] | ||||
def test_crosstable(self): | def test_crosstable(self): | ||||
self.assertEqual(CROSSWALK_TABLE['NodeJS'], { | self.assertEqual(CROSSWALK_TABLE['NodeJS'], { | ||||
'repository': 'codeRepository', | 'repository': 'http://schema.org/codeRepository', | ||||
'os': 'operatingSystem', | 'os': 'http://schema.org/operatingSystem', | ||||
'cpu': 'processorRequirements', | 'cpu': 'http://schema.org/processorRequirements', | ||||
'engines': 'processorRequirements', | 'engines': | ||||
'dependencies': 'softwareRequirements', | 'http://schema.org/processorRequirements', | ||||
'bundleDependencies': 'softwareRequirements', | 'author': 'http://schema.org/author', | ||||
'bundledDependencies': 'softwareRequirements', | 'author.email': 'http://schema.org/email', | ||||
'peerDependencies': 'softwareRequirements', | 'author.name': 'http://schema.org/name', | ||||
'author': 'creator', | 'contributor': 'http://schema.org/contributor', | ||||
'author.email': 'email', | 'keywords': 'http://schema.org/keywords', | ||||
'author.name': 'name', | 'license': 'http://schema.org/license', | ||||
'contributor': 'contributor', | 'version': 'http://schema.org/version', | ||||
'keywords': 'keywords', | 'description': 'http://schema.org/description', | ||||
'license': 'license', | 'name': 'http://schema.org/name', | ||||
'version': 'version', | 'bugs': 'https://codemeta.github.io/terms/issueTracker', | ||||
'description': 'description', | 'homepage': 'http://schema.org/url' | ||||
'name': 'name', | |||||
'devDependencies': 'softwareSuggestions', | |||||
'optionalDependencies': 'softwareSuggestions', | |||||
'bugs': 'issueTracker', | |||||
'homepage': 'url' | |||||
}) | }) | ||||
def test_compute_metadata_none(self): | def test_compute_metadata_none(self): | ||||
""" | """ | ||||
testing content empty content is empty | testing content empty content is empty | ||||
should return None | should return None | ||||
""" | """ | ||||
# given | # given | ||||
Show All 14 Lines | def test_compute_metadata_npm(self): | ||||
content = b""" | content = b""" | ||||
{ | { | ||||
"name": "test_metadata", | "name": "test_metadata", | ||||
"version": "0.0.2", | "version": "0.0.2", | ||||
"description": "Simple package.json test for indexer", | "description": "Simple package.json test for indexer", | ||||
"repository": { | "repository": { | ||||
"type": "git", | "type": "git", | ||||
"url": "https://github.com/moranegg/metadata_test" | "url": "https://github.com/moranegg/metadata_test" | ||||
}, | |||||
"author": { | |||||
"email": "moranegg@example.com", | |||||
"name": "Morane G" | |||||
} | } | ||||
} | } | ||||
""" | """ | ||||
declared_metadata = { | declared_metadata = { | ||||
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0', | |||||
'type': 'SoftwareSourceCode', | |||||
'name': 'test_metadata', | 'name': 'test_metadata', | ||||
'version': '0.0.2', | 'version': '0.0.2', | ||||
'description': 'Simple package.json test for indexer', | 'description': 'Simple package.json test for indexer', | ||||
'codeRepository': { | 'schema:codeRepository': | ||||
'type': 'git', | 'git+https://github.com/moranegg/metadata_test', | ||||
'url': 'https://github.com/moranegg/metadata_test' | 'schema:author': { | ||||
'type': 'Person', | |||||
'name': 'Morane G', | |||||
'email': 'moranegg@example.com', | |||||
}, | }, | ||||
'other': {} | |||||
} | } | ||||
# when | # when | ||||
result = MAPPINGS["NpmMapping"].translate(content) | result = MAPPINGS["NpmMapping"].translate(content) | ||||
# then | # then | ||||
self.assertEqual(declared_metadata, result) | self.assertEqual(declared_metadata, result) | ||||
def test_extract_minimal_metadata_dict(self): | def test_extract_minimal_metadata_dict(self): | ||||
""" | """ | ||||
Test the creation of a coherent minimal metadata set | Test the creation of a coherent minimal metadata set | ||||
""" | """ | ||||
# given | # given | ||||
metadata_list = [{ | metadata_list = [{ | ||||
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0', | |||||
'name': 'test_1', | 'name': 'test_1', | ||||
'version': '0.0.2', | 'version': '0.0.2', | ||||
'description': 'Simple package.json test for indexer', | 'description': 'Simple package.json test for indexer', | ||||
'codeRepository': { | 'schema:codeRepository': | ||||
'type': 'git', | 'git+https://github.com/moranegg/metadata_test', | ||||
'url': 'https://github.com/moranegg/metadata_test' | |||||
}, | |||||
'other': {} | |||||
}, { | }, { | ||||
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0', | |||||
'name': 'test_0_1', | 'name': 'test_0_1', | ||||
'version': '0.0.2', | 'version': '0.0.2', | ||||
'description': 'Simple package.json test for indexer', | 'description': 'Simple package.json test for indexer', | ||||
'codeRepository': { | 'schema:codeRepository': | ||||
'type': 'git', | 'git+https://github.com/moranegg/metadata_test' | ||||
'url': 'https://github.com/moranegg/metadata_test' | |||||
}, | |||||
'other': {} | |||||
}, { | }, { | ||||
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0', | |||||
'name': 'test_metadata', | 'name': 'test_metadata', | ||||
'version': '0.0.2', | 'version': '0.0.2', | ||||
'author': 'moranegg', | 'schema:author': 'moranegg', | ||||
'other': {} | |||||
}] | }] | ||||
# when | # when | ||||
results = extract_minimal_metadata_dict(metadata_list) | results = extract_minimal_metadata_dict(metadata_list) | ||||
# then | # then | ||||
expected_results = { | expected_results = { | ||||
"developmentStatus": None, | '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', | ||||
"version": ['0.0.2'], | "version": '0.0.2', | ||||
"operatingSystem": None, | "description": 'Simple package.json test for indexer', | ||||
"description": ['Simple package.json test for indexer'], | |||||
"keywords": None, | |||||
"issueTracker": None, | |||||
"name": ['test_1', 'test_0_1', 'test_metadata'], | "name": ['test_1', 'test_0_1', 'test_metadata'], | ||||
"author": ['moranegg'], | "schema:author": 'moranegg', | ||||
"relatedLink": None, | "schema:codeRepository": | ||||
"url": None, | 'git+https://github.com/moranegg/metadata_test', | ||||
"license": None, | |||||
"maintainer": None, | |||||
"email": None, | |||||
"softwareRequirements": None, | |||||
"identifier": None, | |||||
"codeRepository": [{ | |||||
'type': 'git', | |||||
'url': 'https://github.com/moranegg/metadata_test' | |||||
}] | |||||
} | } | ||||
self.assertEqual(expected_results, results) | self.assertEqual(expected_results, results) | ||||
def test_index_content_metadata_npm(self): | def test_index_content_metadata_npm(self): | ||||
""" | """ | ||||
testing NPM with package.json | testing NPM with package.json | ||||
- one sha1 uses a file that can't be translated to metadata and | - one sha1 uses a file that can't be translated to metadata and | ||||
should return None in the translated metadata | should return None in the translated metadata | ||||
Show All 9 Lines | def test_index_content_metadata_npm(self): | ||||
# when | # when | ||||
metadata_indexer.run(sha1s, policy_update='ignore-dups') | metadata_indexer.run(sha1s, policy_update='ignore-dups') | ||||
results = metadata_indexer.idx_storage.added_data | results = metadata_indexer.idx_storage.added_data | ||||
expected_results = [('content_metadata', False, [{ | expected_results = [('content_metadata', False, [{ | ||||
'indexer_configuration_id': 30, | 'indexer_configuration_id': 30, | ||||
'translated_metadata': { | 'translated_metadata': { | ||||
'other': {}, | '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', | ||||
'codeRepository': { | 'type': 'SoftwareSourceCode', | ||||
'type': 'git', | 'schema:codeRepository': | ||||
'url': 'https://github.com/moranegg/metadata_test' | 'git+https://github.com/moranegg/metadata_test', | ||||
}, | |||||
'description': 'Simple package.json test for indexer', | 'description': 'Simple package.json test for indexer', | ||||
'name': 'test_metadata', | 'name': 'test_metadata', | ||||
'version': '0.0.1' | 'version': '0.0.1' | ||||
}, | }, | ||||
'id': '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5' | 'id': '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5' | ||||
}, { | }, { | ||||
'indexer_configuration_id': 30, | 'indexer_configuration_id': 30, | ||||
'translated_metadata': { | 'translated_metadata': { | ||||
'softwareRequirements': { | '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', | ||||
'JSONStream': '~1.3.1', | 'type': 'SoftwareSourceCode', | ||||
'abbrev': '~1.1.0', | 'codemeta:issueTracker': | ||||
'ansi-regex': '~2.1.1', | 'https://github.com/npm/npm/issues', | ||||
'ansicolors': '~0.3.2', | 'schema:author': { | ||||
'ansistyles': '~0.1.3' | 'type': 'Person', | ||||
}, | 'name': 'Isaac Z. Schlueter', | ||||
'issueTracker': { | 'email': 'i@izs.me', | ||||
'url': 'https://github.com/npm/npm/issues' | 'schema:url': 'http://blog.izs.me', | ||||
}, | |||||
'creator': | |||||
'Isaac Z. Schlueter <i@izs.me> (http://blog.izs.me)', | |||||
'codeRepository': { | |||||
'type': 'git', | |||||
'url': 'https://github.com/npm/npm' | |||||
}, | }, | ||||
'schema:codeRepository': | |||||
'git+https://github.com/npm/npm', | |||||
'description': 'a package manager for JavaScript', | 'description': 'a package manager for JavaScript', | ||||
'softwareSuggestions': { | 'schema:license': 'Artistic-2.0', | ||||
'tacks': '~1.2.6', | |||||
'tap': '~10.3.2' | |||||
}, | |||||
'license': 'Artistic-2.0', | |||||
'version': '5.0.3', | 'version': '5.0.3', | ||||
'other': { | |||||
moranegg: I kept a property called `other` in the content_metadata to regroup all metadata I wasn't able… | |||||
Done Inline Actions It's just that `other` is not defined by CodeMeta's schema definition, so `jsonld.compact` drops it. I could add a new property with an absolute URI, though. vlorentz: It's just that it's not defined by codemeta's schema definition, so `jsonld.compact` drops it. | |||||
'preferGlobal': True, | |||||
'config': { | |||||
'publishtest': False | |||||
} | |||||
}, | |||||
'name': 'npm', | 'name': 'npm', | ||||
'keywords': [ | 'keywords': [ | ||||
'install', | 'install', | ||||
'modules', | 'modules', | ||||
'package manager', | 'package manager', | ||||
'package.json' | 'package.json' | ||||
], | ], | ||||
'url': 'https://docs.npmjs.com/' | 'schema:url': 'https://docs.npmjs.com/' | ||||
}, | }, | ||||
'id': 'd4c647f0fc257591cc9ba1722484229780d1c607' | 'id': 'd4c647f0fc257591cc9ba1722484229780d1c607' | ||||
}, { | }, { | ||||
'indexer_configuration_id': 30, | 'indexer_configuration_id': 30, | ||||
'translated_metadata': None, | 'translated_metadata': None, | ||||
'id': '02fb2c89e14f7fab46701478c83779c7beb7b069' | 'id': '02fb2c89e14f7fab46701478c83779c7beb7b069' | ||||
}])] | }])] | ||||
Show All 30 Lines | def test_detect_metadata_package_json(self): | ||||
expected_results = { | expected_results = { | ||||
'NpmMapping': [ | 'NpmMapping': [ | ||||
b'cde' | b'cde' | ||||
] | ] | ||||
} | } | ||||
# then | # then | ||||
self.assertEqual(expected_results, results) | self.assertEqual(expected_results, results) | ||||
def test_compute_metadata_valid_codemeta(self): | |||||
raw_content = ( | |||||
b"""{ | |||||
Not Done Inline Actions When running the indexation of a metadata file with a wrong @context (even the same URL, but with 1.0 at the end), the following exception is raised:

Traceback (most recent call last):
  File "/usr/lib/python3/dist-packages/pyld/jsonld.py", line 4308, in _retrieve_context_urls
    remote_doc = load_document(url)
  File "/home/morane/Documents/code/swh-environment/swh-indexer/swh/indexer/codemeta.py", line 108, in _document_loader
    raise Exception(url)
Exception: https://doi.org/10.5063/schema/codemeta-1.0

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3/dist-packages/pyld/jsonld.py", line 800, in expand
    input_, {}, options['documentLoader'], options['base'])
  File "/usr/lib/python3/dist-packages/pyld/jsonld.py", line 4315, in _retrieve_context_urls
    code='loading remote context failed', cause=cause)
pyld.jsonld.JsonLdError: <exception str() failed>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "metadata_dictionary.py", line 345, in <module>
    main()
  File "metadata_dictionary.py", line 335, in main
    result = MAPPINGS["CodemetaMapping"].translate(raw_content)
  File "metadata_dictionary.py", line 226, in translate
    return self.normalize_translation(expand(json.loads(content.decode())))
  File "/home/morane/Documents/code/swh-environment/swh-indexer/swh/indexer/codemeta.py", line 120, in expand
    options={'documentLoader': _document_loader})
  File "/usr/lib/python3/dist-packages/pyld/jsonld.py", line 171, in expand
    return JsonLdProcessor().expand(input_, options)
  File "/usr/lib/python3/dist-packages/pyld/jsonld.py", line 804, in expand
    'jsonld.ExpandError', cause=cause)
pyld.jsonld.JsonLdError: <exception str() failed>

This is an observation; I'm not saying we should fix it, but the DOI URL might not be the @context used by all the codemeta.json files we find. moranegg: When running the indexation of a metadata file with a wrong @context, even with the same url… | |||||
Done Inline ActionsUnfortunately, it's either that or pulling untrusted schemas from the internet :/ vlorentz: Unfortunately, it's either that or pulling untrusted schemas from the internet :/ | |||||
"@context": "https://doi.org/10.5063/schema/codemeta-2.0", | |||||
"@type": "SoftwareSourceCode", | |||||
"identifier": "CodeMeta", | |||||
"description": "CodeMeta is a concept vocabulary that can be used to standardize the exchange of software metadata across repositories and organizations.", | |||||
"name": "CodeMeta: Minimal metadata schemas for science software and code, in JSON-LD", | |||||
"codeRepository": "https://github.com/codemeta/codemeta", | |||||
"issueTracker": "https://github.com/codemeta/codemeta/issues", | |||||
"license": "https://spdx.org/licenses/Apache-2.0", | |||||
"version": "2.0", | |||||
"author": [ | |||||
{ | |||||
"@type": "Person", | |||||
"givenName": "Carl", | |||||
"familyName": "Boettiger", | |||||
"email": "cboettig@gmail.com", | |||||
"@id": "http://orcid.org/0000-0002-1642-628X" | |||||
}, | |||||
{ | |||||
"@type": "Person", | |||||
"givenName": "Matthew B.", | |||||
"familyName": "Jones", | |||||
"email": "jones@nceas.ucsb.edu", | |||||
"@id": "http://orcid.org/0000-0003-0077-4738" | |||||
} | |||||
], | |||||
"maintainer": { | |||||
"@type": "Person", | |||||
"givenName": "Carl", | |||||
"familyName": "Boettiger", | |||||
"email": "cboettig@gmail.com", | |||||
"@id": "http://orcid.org/0000-0002-1642-628X" | |||||
}, | |||||
"contIntegration": "https://travis-ci.org/codemeta/codemeta", | |||||
"developmentStatus": "active", | |||||
"downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip", | |||||
"funder": { | |||||
"@id": "https://doi.org/10.13039/100000001", | |||||
"@type": "Organization", | |||||
"name": "National Science Foundation" | |||||
}, | |||||
"funding":"1549758; Codemeta: A Rosetta Stone for Metadata in Scientific Software", | |||||
"keywords": [ | |||||
"metadata", | |||||
"software" | |||||
], | |||||
"version":"2.0", | |||||
"dateCreated":"2017-06-05", | |||||
"datePublished":"2017-06-05", | |||||
"programmingLanguage": "JSON-LD" | |||||
}""") # noqa | |||||
expected_result = { | |||||
"@context": "https://doi.org/10.5063/schema/codemeta-2.0", | |||||
"type": "SoftwareSourceCode", | |||||
"identifier": "CodeMeta", | |||||
"description": | |||||
"CodeMeta is a concept vocabulary that can " | |||||
"be used to standardize the exchange of software metadata " | |||||
"across repositories and organizations.", | |||||
"name": | |||||
"CodeMeta: Minimal metadata schemas for science " | |||||
"software and code, in JSON-LD", | |||||
"codeRepository": "https://github.com/codemeta/codemeta", | |||||
"issueTracker": "https://github.com/codemeta/codemeta/issues", | |||||
"license": "https://spdx.org/licenses/Apache-2.0", | |||||
"version": "2.0", | |||||
"author": [ | |||||
{ | |||||
"type": "Person", | |||||
"givenName": "Carl", | |||||
"familyName": "Boettiger", | |||||
"email": "cboettig@gmail.com", | |||||
"id": "http://orcid.org/0000-0002-1642-628X" | |||||
}, | |||||
{ | |||||
"type": "Person", | |||||
"givenName": "Matthew B.", | |||||
"familyName": "Jones", | |||||
"email": "jones@nceas.ucsb.edu", | |||||
"id": "http://orcid.org/0000-0003-0077-4738" | |||||
} | |||||
], | |||||
"maintainer": { | |||||
"type": "Person", | |||||
"givenName": "Carl", | |||||
"familyName": "Boettiger", | |||||
"email": "cboettig@gmail.com", | |||||
"id": "http://orcid.org/0000-0002-1642-628X" | |||||
}, | |||||
"contIntegration": "https://travis-ci.org/codemeta/codemeta", | |||||
"developmentStatus": "active", | |||||
"downloadUrl": | |||||
"https://github.com/codemeta/codemeta/archive/2.0.zip", | |||||
"funder": { | |||||
"id": "https://doi.org/10.13039/100000001", | |||||
"type": "Organization", | |||||
"name": "National Science Foundation" | |||||
}, | |||||
"funding": "1549758; Codemeta: A Rosetta Stone for Metadata " | |||||
"in Scientific Software", | |||||
"keywords": [ | |||||
"metadata", | |||||
"software" | |||||
], | |||||
"version": "2.0", | |||||
"dateCreated": "2017-06-05", | |||||
"datePublished": "2017-06-05", | |||||
"programmingLanguage": "JSON-LD" | |||||
} | |||||
result = MAPPINGS["CodemetaMapping"].translate(raw_content) | |||||
self.assertEqual(result, expected_result) | |||||
def test_compute_metadata_maven(self): | |||||
raw_content = b""" | |||||
<project> | |||||
<name>Maven Default Project</name> | |||||
<modelVersion>4.0.0</modelVersion> | |||||
<groupId>com.mycompany.app</groupId> | |||||
<artifactId>my-app</artifactId> | |||||
<version>1.2.3</version> | |||||
<repositories> | |||||
<repository> | |||||
<id>central</id> | |||||
<name>Maven Repository Switchboard</name> | |||||
<layout>default</layout> | |||||
<url>http://repo1.maven.org/maven2</url> | |||||
<snapshots> | |||||
<enabled>false</enabled> | |||||
</snapshots> | |||||
</repository> | |||||
</repositories> | |||||
</project>""" | |||||
result = MAPPINGS["MavenMapping"].translate(raw_content) | |||||
self.assertEqual(result, { | |||||
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0', | |||||
'type': 'SoftwareSourceCode', | |||||
'name': 'Maven Default Project', | |||||
'schema:identifier': 'com.mycompany.app', | |||||
'version': '1.2.3', | |||||
'schema:codeRepository': | |||||
'http://repo1.maven.org/maven2/com/mycompany/app/my-app', | |||||
}) | |||||
def test_revision_metadata_indexer(self): | def test_revision_metadata_indexer(self): | ||||
metadata_indexer = TestRevisionMetadataIndexer() | metadata_indexer = TestRevisionMetadataIndexer() | ||||
sha1_gits = [ | sha1_gits = [ | ||||
b'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f', | b'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f', | ||||
] | ] | ||||
metadata_indexer.run(sha1_gits, 'update-dups') | metadata_indexer.run(sha1_gits, 'update-dups') | ||||
results = metadata_indexer.idx_storage.added_data | results = metadata_indexer.idx_storage.added_data | ||||
expected_results = [('revision_metadata', True, [{ | expected_results = [('revision_metadata', True, [{ | ||||
'id': '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f', | 'id': '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f', | ||||
'translated_metadata': { | 'translated_metadata': { | ||||
'identifier': None, | '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', | ||||
Not Done Inline ActionsI'm not sure where to write this comment, but when you have multiple detected files, what happens with the translated_metadata output? moranegg: I'm not sure where to write this comment, but when you have multiple detected files, what… | |||||
Done Inline Actions
> does each property become a list?
Yes. Actually, they are all lists at the beginning, but JSON-LD compaction reduces each single-element list to its only element. vlorentz: > does each property become a list?
Yes. Actually, they are all lists at the beginning, but… | |||||
'maintainer': None, | 'url': | ||||
'url': [ | 'https://github.com/librariesio/yarn-parser#readme', | ||||
'https://github.com/librariesio/yarn-parser#readme' | 'schema:codeRepository': | ||||
], | 'git+https://github.com/librariesio/yarn-parser.git', | ||||
'codeRepository': [{ | 'schema:author': 'Andrew Nesbitt', | ||||
'type': 'git', | 'license': 'AGPL-3.0', | ||||
'url': 'git+https://github.com/librariesio/yarn-parser.git' | 'version': '1.0.0', | ||||
}], | 'description': | ||||
'author': ['Andrew Nesbitt'], | 'Tiny web service for parsing yarn.lock files', | ||||
'license': ['AGPL-3.0'], | 'codemeta:issueTracker': | ||||
'version': ['1.0.0'], | 'https://github.com/librariesio/yarn-parser/issues', | ||||
'description': [ | 'name': 'yarn-parser', | ||||
'Tiny web service for parsing yarn.lock files' | 'keywords': ['yarn', 'parse', 'lock', 'dependencies'], | ||||
], | |||||
'relatedLink': None, | |||||
'developmentStatus': None, | |||||
'operatingSystem': None, | |||||
'issueTracker': [{ | |||||
'url': 'https://github.com/librariesio/yarn-parser/issues' | |||||
}], | |||||
'softwareRequirements': [{ | |||||
'express': '^4.14.0', | |||||
'yarn': '^0.21.0', | |||||
'body-parser': '^1.15.2' | |||||
}], | |||||
'name': ['yarn-parser'], | |||||
'keywords': [['yarn', 'parse', 'lock', 'dependencies']], | |||||
'email': None | |||||
}, | }, | ||||
'indexer_configuration_id': 7 | 'indexer_configuration_id': 7 | ||||
}])] | }])] | ||||
# then | # then | ||||
self.assertEqual(expected_results, results) | self.assertEqual(expected_results, results) |
I kept a property called `other` in the content_metadata to group all the metadata I wasn't able to translate to CodeMeta.
I see that you deleted this property. Was it a problem for the type of output, i.e. for keeping the output a codemeta.json document?