Changeset View
Standalone View
swh/indexer/tests/test_metadata.py
Show First 20 Lines • Show All 78 Lines • ▼ Show 20 Lines | def setUp(self): | ||||
'type': 'local', | 'type': 'local', | ||||
'context': 'NpmMapping' | 'context': 'NpmMapping' | ||||
} | } | ||||
} | } | ||||
MockIndexerStorage.added_data = [] | MockIndexerStorage.added_data = [] | ||||
def test_crosstable(self): | def test_crosstable(self): | ||||
self.assertEqual(CROSSWALK_TABLE['NodeJS'], { | self.assertEqual(CROSSWALK_TABLE['NodeJS'], { | ||||
'repository': 'https://codemeta.github.io/terms/codeRepository', | 'repository': 'http://schema.org/codeRepository', | ||||
'os': 'https://codemeta.github.io/terms/operatingSystem', | 'os': 'http://schema.org/operatingSystem', | ||||
'cpu': 'https://codemeta.github.io/terms/processorRequirements', | 'cpu': 'http://schema.org/processorRequirements', | ||||
'engines': | 'engines': | ||||
'https://codemeta.github.io/terms/processorRequirements', | 'http://schema.org/processorRequirements', | ||||
'author': 'https://codemeta.github.io/terms/author', | 'author': 'http://schema.org/author', | ||||
'author.email': 'https://codemeta.github.io/terms/email', | 'author.email': 'http://schema.org/email', | ||||
'author.name': 'https://codemeta.github.io/terms/name', | 'author.name': 'http://schema.org/name', | ||||
'contributor': 'https://codemeta.github.io/terms/contributor', | 'contributor': 'http://schema.org/contributor', | ||||
'keywords': 'https://codemeta.github.io/terms/keywords', | 'keywords': 'http://schema.org/keywords', | ||||
'license': 'https://codemeta.github.io/terms/license', | 'license': 'http://schema.org/license', | ||||
'version': 'https://codemeta.github.io/terms/version', | 'version': 'http://schema.org/version', | ||||
'description': 'https://codemeta.github.io/terms/description', | 'description': 'http://schema.org/description', | ||||
'name': 'https://codemeta.github.io/terms/name', | 'name': 'http://schema.org/name', | ||||
'bugs': 'https://codemeta.github.io/terms/issueTracker', | 'bugs': 'https://codemeta.github.io/terms/issueTracker', | ||||
'homepage': 'https://codemeta.github.io/terms/url' | 'homepage': 'http://schema.org/url' | ||||
}) | }) | ||||
def test_compute_metadata_none(self): | def test_compute_metadata_none(self): | ||||
""" | """ | ||||
testing content empty content is empty | testing content empty content is empty | ||||
should return None | should return None | ||||
""" | """ | ||||
# given | # given | ||||
Show All 23 Lines | def test_compute_metadata_npm(self): | ||||
"author": { | "author": { | ||||
"email": "moranegg@example.com", | "email": "moranegg@example.com", | ||||
"name": "Morane G" | "name": "Morane G" | ||||
} | } | ||||
} | } | ||||
""" | """ | ||||
declared_metadata = { | declared_metadata = { | ||||
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0', | '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', | ||||
'codemeta:name': 'test_metadata', | 'type': 'SoftwareSourceCode', | ||||
'codemeta:version': '0.0.2', | 'name': 'test_metadata', | ||||
'codemeta:description': 'Simple package.json test for indexer', | 'version': '0.0.2', | ||||
'codemeta:codeRepository': | 'description': 'Simple package.json test for indexer', | ||||
'schema:codeRepository': | |||||
'git+https://github.com/moranegg/metadata_test', | 'git+https://github.com/moranegg/metadata_test', | ||||
'codemeta:author': { | 'schema:author': { | ||||
'type': 'codemeta:Person', | 'type': 'Person', | ||||
'codemeta:name': 'Morane G', | 'name': 'Morane G', | ||||
'codemeta:email': 'moranegg@example.com', | 'email': 'moranegg@example.com', | ||||
}, | }, | ||||
} | } | ||||
# when | # when | ||||
result = MAPPINGS["NpmMapping"].translate(content) | result = MAPPINGS["NpmMapping"].translate(content) | ||||
# then | # then | ||||
self.assertEqual(declared_metadata, result) | self.assertEqual(declared_metadata, result) | ||||
def test_extract_minimal_metadata_dict(self): | def test_extract_minimal_metadata_dict(self): | ||||
""" | """ | ||||
Test the creation of a coherent minimal metadata set | Test the creation of a coherent minimal metadata set | ||||
""" | """ | ||||
# given | # given | ||||
metadata_list = [{ | metadata_list = [{ | ||||
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0', | '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', | ||||
'codemeta:name': 'test_1', | 'name': 'test_1', | ||||
'codemeta:version': '0.0.2', | 'version': '0.0.2', | ||||
'codemeta:description': 'Simple package.json test for indexer', | 'description': 'Simple package.json test for indexer', | ||||
'codemeta:codeRepository': | 'schema:codeRepository': | ||||
'git+https://github.com/moranegg/metadata_test', | 'git+https://github.com/moranegg/metadata_test', | ||||
}, { | }, { | ||||
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0', | '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', | ||||
'codemeta:name': 'test_0_1', | 'name': 'test_0_1', | ||||
'codemeta:version': '0.0.2', | 'version': '0.0.2', | ||||
'codemeta:description': 'Simple package.json test for indexer', | 'description': 'Simple package.json test for indexer', | ||||
'codemeta:codeRepository': | 'schema:codeRepository': | ||||
'git+https://github.com/moranegg/metadata_test' | 'git+https://github.com/moranegg/metadata_test' | ||||
}, { | }, { | ||||
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0', | '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', | ||||
'codemeta:name': 'test_metadata', | 'name': 'test_metadata', | ||||
'codemeta:version': '0.0.2', | 'version': '0.0.2', | ||||
'codemeta:author': 'moranegg', | 'schema:author': 'moranegg', | ||||
}] | }] | ||||
# when | # when | ||||
results = extract_minimal_metadata_dict(metadata_list) | results = extract_minimal_metadata_dict(metadata_list) | ||||
# then | # then | ||||
expected_results = { | expected_results = { | ||||
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0', | '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', | ||||
"codemeta:version": '0.0.2', | "version": '0.0.2', | ||||
"codemeta:description": 'Simple package.json test for indexer', | "description": 'Simple package.json test for indexer', | ||||
"codemeta:name": ['test_1', 'test_0_1', 'test_metadata'], | "name": ['test_1', 'test_0_1', 'test_metadata'], | ||||
"codemeta:author": 'moranegg', | "schema:author": 'moranegg', | ||||
"codemeta:codeRepository": | "schema:codeRepository": | ||||
'git+https://github.com/moranegg/metadata_test', | 'git+https://github.com/moranegg/metadata_test', | ||||
} | } | ||||
self.assertEqual(expected_results, results) | self.assertEqual(expected_results, results) | ||||
def test_index_content_metadata_npm(self): | def test_index_content_metadata_npm(self): | ||||
""" | """ | ||||
testing NPM with package.json | testing NPM with package.json | ||||
- one sha1 uses a file that can't be translated to metadata and | - one sha1 uses a file that can't be translated to metadata and | ||||
Show All 11 Lines | def test_index_content_metadata_npm(self): | ||||
# when | # when | ||||
metadata_indexer.run(sha1s, policy_update='ignore-dups') | metadata_indexer.run(sha1s, policy_update='ignore-dups') | ||||
results = metadata_indexer.idx_storage.added_data | results = metadata_indexer.idx_storage.added_data | ||||
expected_results = [('content_metadata', False, [{ | expected_results = [('content_metadata', False, [{ | ||||
'indexer_configuration_id': 30, | 'indexer_configuration_id': 30, | ||||
'translated_metadata': { | 'translated_metadata': { | ||||
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0', | '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', | ||||
'codemeta:codeRepository': | 'type': 'SoftwareSourceCode', | ||||
'schema:codeRepository': | |||||
'git+https://github.com/moranegg/metadata_test', | 'git+https://github.com/moranegg/metadata_test', | ||||
'codemeta:description': 'Simple package.json test for indexer', | 'description': 'Simple package.json test for indexer', | ||||
'codemeta:name': 'test_metadata', | 'name': 'test_metadata', | ||||
'codemeta:version': '0.0.1' | 'version': '0.0.1' | ||||
}, | }, | ||||
'id': '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5' | 'id': '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5' | ||||
}, { | }, { | ||||
'indexer_configuration_id': 30, | 'indexer_configuration_id': 30, | ||||
'translated_metadata': { | 'translated_metadata': { | ||||
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0', | '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', | ||||
'type': 'SoftwareSourceCode', | |||||
'codemeta:issueTracker': | 'codemeta:issueTracker': | ||||
'https://github.com/npm/npm/issues', | 'https://github.com/npm/npm/issues', | ||||
'codemeta:author': { | 'schema:author': { | ||||
'type': 'codemeta:Person', | 'type': 'Person', | ||||
'codemeta:name': 'Isaac Z. Schlueter', | 'name': 'Isaac Z. Schlueter', | ||||
'codemeta:email': 'i@izs.me', | 'email': 'i@izs.me', | ||||
'codemeta:url': 'http://blog.izs.me', | 'schema:url': 'http://blog.izs.me', | ||||
}, | }, | ||||
'codemeta:codeRepository': | 'schema:codeRepository': | ||||
moranegg: I kept a property called `other` in the content_metadata to regroup all metadata I wasn't able… | |||||
Done Inline Actions: It's just that it's not defined by codemeta's schema definition, so `jsonld.compact` drops it. I could add a new property with an absolute URI, though. vlorentz: It's just that it's not defined by codemeta's schema definition, so `jsonld.compact` drops it. | |||||
'git+https://github.com/npm/npm', | 'git+https://github.com/npm/npm', | ||||
'codemeta:description': 'a package manager for JavaScript', | 'description': 'a package manager for JavaScript', | ||||
'codemeta:license': 'Artistic-2.0', | 'schema:license': 'Artistic-2.0', | ||||
'codemeta:version': '5.0.3', | 'version': '5.0.3', | ||||
'codemeta:name': 'npm', | 'name': 'npm', | ||||
'codemeta:keywords': [ | 'keywords': [ | ||||
'install', | 'install', | ||||
'modules', | 'modules', | ||||
'package manager', | 'package manager', | ||||
'package.json' | 'package.json' | ||||
], | ], | ||||
'codemeta:url': 'https://docs.npmjs.com/' | 'schema:url': 'https://docs.npmjs.com/' | ||||
}, | }, | ||||
'id': 'd4c647f0fc257591cc9ba1722484229780d1c607' | 'id': 'd4c647f0fc257591cc9ba1722484229780d1c607' | ||||
}, { | }, { | ||||
'indexer_configuration_id': 30, | 'indexer_configuration_id': 30, | ||||
'translated_metadata': None, | 'translated_metadata': None, | ||||
'id': '02fb2c89e14f7fab46701478c83779c7beb7b069' | 'id': '02fb2c89e14f7fab46701478c83779c7beb7b069' | ||||
}])] | }])] | ||||
Show All 30 Lines | def test_detect_metadata_package_json(self): | ||||
expected_results = { | expected_results = { | ||||
'NpmMapping': [ | 'NpmMapping': [ | ||||
b'cde' | b'cde' | ||||
] | ] | ||||
} | } | ||||
# then | # then | ||||
self.assertEqual(expected_results, results) | self.assertEqual(expected_results, results) | ||||
def test_compute_metadata_valid_codemeta(self): | |||||
raw_content = ( | |||||
b"""{ | |||||
Not Done Inline Actions: When running the indexation of a metadata file with a wrong @context (even the same URL, but with 1.0 at the end), the translation fails with the following traceback:

Traceback (most recent call last):
  File "/usr/lib/python3/dist-packages/pyld/jsonld.py", line 4308, in _retrieve_context_urls
    remote_doc = load_document(url)
  File "/home/morane/Documents/code/swh-environment/swh-indexer/swh/indexer/codemeta.py", line 108, in _document_loader
    raise Exception(url)
Exception: https://doi.org/10.5063/schema/codemeta-1.0

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3/dist-packages/pyld/jsonld.py", line 800, in expand
    input_, {}, options['documentLoader'], options['base'])
  File "/usr/lib/python3/dist-packages/pyld/jsonld.py", line 4315, in _retrieve_context_urls
    code='loading remote context failed', cause=cause)
pyld.jsonld.JsonLdError: <exception str() failed>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "metadata_dictionary.py", line 345, in <module>
    main()
  File "metadata_dictionary.py", line 335, in main
    result = MAPPINGS["CodemetaMapping"].translate(raw_content)
  File "metadata_dictionary.py", line 226, in translate
    return self.normalize_translation(expand(json.loads(content.decode())))
  File "/home/morane/Documents/code/swh-environment/swh-indexer/swh/indexer/codemeta.py", line 120, in expand
    options={'documentLoader': _document_loader})
  File "/usr/lib/python3/dist-packages/pyld/jsonld.py", line 171, in expand
    return JsonLdProcessor().expand(input_, options)
  File "/usr/lib/python3/dist-packages/pyld/jsonld.py", line 804, in expand
    'jsonld.ExpandError', cause=cause)
pyld.jsonld.JsonLdError: <exception str() failed>

This is an observation, I'm not saying we should fix it, but the DOI URL might not be used as the @context in all the codemeta.json files we find. moranegg: When running the indexation of a metadata file with a wrong @context, even with the same url… | |||||
Done Inline Actions: Unfortunately, it's either that or pulling untrusted schemas from the internet :/ vlorentz: Unfortunately, it's either that or pulling untrusted schemas from the internet :/ | |||||
"@context": "https://doi.org/10.5063/schema/codemeta-2.0", | |||||
"@type": "SoftwareSourceCode", | |||||
"identifier": "CodeMeta", | |||||
"description": "CodeMeta is a concept vocabulary that can be used to standardize the exchange of software metadata across repositories and organizations.", | |||||
"name": "CodeMeta: Minimal metadata schemas for science software and code, in JSON-LD", | |||||
"codeRepository": "https://github.com/codemeta/codemeta", | |||||
"issueTracker": "https://github.com/codemeta/codemeta/issues", | |||||
"license": "https://spdx.org/licenses/Apache-2.0", | |||||
"version": "2.0", | |||||
"author": [ | |||||
{ | |||||
"@type": "Person", | |||||
"givenName": "Carl", | |||||
"familyName": "Boettiger", | |||||
"email": "cboettig@gmail.com", | |||||
"@id": "http://orcid.org/0000-0002-1642-628X" | |||||
}, | |||||
{ | |||||
"@type": "Person", | |||||
"givenName": "Matthew B.", | |||||
"familyName": "Jones", | |||||
"email": "jones@nceas.ucsb.edu", | |||||
"@id": "http://orcid.org/0000-0003-0077-4738" | |||||
} | |||||
], | |||||
"maintainer": { | |||||
"@type": "Person", | |||||
"givenName": "Carl", | |||||
"familyName": "Boettiger", | |||||
"email": "cboettig@gmail.com", | |||||
"@id": "http://orcid.org/0000-0002-1642-628X" | |||||
}, | |||||
"contIntegration": "https://travis-ci.org/codemeta/codemeta", | |||||
"developmentStatus": "active", | |||||
"downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip", | |||||
"funder": { | |||||
"@id": "https://doi.org/10.13039/100000001", | |||||
"@type": "Organization", | |||||
"name": "National Science Foundation" | |||||
}, | |||||
"funding":"1549758; Codemeta: A Rosetta Stone for Metadata in Scientific Software", | |||||
"keywords": [ | |||||
"metadata", | |||||
"software" | |||||
], | |||||
"version":"2.0", | |||||
"dateCreated":"2017-06-05", | |||||
"datePublished":"2017-06-05", | |||||
"programmingLanguage": "JSON-LD" | |||||
}""") # noqa | |||||
expected_result = { | |||||
"@context": "https://doi.org/10.5063/schema/codemeta-2.0", | |||||
"type": "SoftwareSourceCode", | |||||
"identifier": "CodeMeta", | |||||
"description": | |||||
"CodeMeta is a concept vocabulary that can " | |||||
"be used to standardize the exchange of software metadata " | |||||
"across repositories and organizations.", | |||||
"name": | |||||
"CodeMeta: Minimal metadata schemas for science " | |||||
"software and code, in JSON-LD", | |||||
"codeRepository": "https://github.com/codemeta/codemeta", | |||||
"issueTracker": "https://github.com/codemeta/codemeta/issues", | |||||
"license": "https://spdx.org/licenses/Apache-2.0", | |||||
"version": "2.0", | |||||
"author": [ | |||||
{ | |||||
"type": "Person", | |||||
"givenName": "Carl", | |||||
"familyName": "Boettiger", | |||||
"email": "cboettig@gmail.com", | |||||
"id": "http://orcid.org/0000-0002-1642-628X" | |||||
}, | |||||
{ | |||||
"type": "Person", | |||||
"givenName": "Matthew B.", | |||||
"familyName": "Jones", | |||||
"email": "jones@nceas.ucsb.edu", | |||||
"id": "http://orcid.org/0000-0003-0077-4738" | |||||
} | |||||
], | |||||
"maintainer": { | |||||
"type": "Person", | |||||
"givenName": "Carl", | |||||
"familyName": "Boettiger", | |||||
"email": "cboettig@gmail.com", | |||||
"id": "http://orcid.org/0000-0002-1642-628X" | |||||
}, | |||||
"contIntegration": "https://travis-ci.org/codemeta/codemeta", | |||||
"developmentStatus": "active", | |||||
"downloadUrl": | |||||
"https://github.com/codemeta/codemeta/archive/2.0.zip", | |||||
"funder": { | |||||
"id": "https://doi.org/10.13039/100000001", | |||||
"type": "Organization", | |||||
"name": "National Science Foundation" | |||||
}, | |||||
"funding": "1549758; Codemeta: A Rosetta Stone for Metadata " | |||||
"in Scientific Software", | |||||
"keywords": [ | |||||
"metadata", | |||||
"software" | |||||
], | |||||
"version": "2.0", | |||||
"dateCreated": "2017-06-05", | |||||
"datePublished": "2017-06-05", | |||||
"programmingLanguage": "JSON-LD" | |||||
} | |||||
result = MAPPINGS["CodemetaMapping"].translate(raw_content) | |||||
self.assertEqual(result, expected_result) | |||||
def test_compute_metadata_maven(self): | |||||
raw_content = b""" | |||||
<project> | |||||
<name>Maven Default Project</name> | |||||
<modelVersion>4.0.0</modelVersion> | |||||
<groupId>com.mycompany.app</groupId> | |||||
<artifactId>my-app</artifactId> | |||||
<version>1.2.3</version> | |||||
<repositories> | |||||
<repository> | |||||
<id>central</id> | |||||
<name>Maven Repository Switchboard</name> | |||||
<layout>default</layout> | |||||
<url>http://repo1.maven.org/maven2</url> | |||||
<snapshots> | |||||
<enabled>false</enabled> | |||||
</snapshots> | |||||
</repository> | |||||
</repositories> | |||||
</project>""" | |||||
result = MAPPINGS["MavenMapping"].translate(raw_content) | |||||
self.assertEqual(result, { | |||||
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0', | |||||
'type': 'SoftwareSourceCode', | |||||
'name': 'Maven Default Project', | |||||
'schema:identifier': 'com.mycompany.app', | |||||
'version': '1.2.3', | |||||
'schema:codeRepository': | |||||
'http://repo1.maven.org/maven2/com/mycompany/app/my-app', | |||||
}) | |||||
def test_revision_metadata_indexer(self): | def test_revision_metadata_indexer(self): | ||||
metadata_indexer = RevisionMetadataTestIndexer() | metadata_indexer = RevisionMetadataTestIndexer() | ||||
sha1_gits = [ | sha1_gits = [ | ||||
b'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f', | b'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f', | ||||
] | ] | ||||
metadata_indexer.run(sha1_gits, 'update-dups') | metadata_indexer.run(sha1_gits, 'update-dups') | ||||
results = metadata_indexer.idx_storage.added_data | results = metadata_indexer.idx_storage.added_data | ||||
expected_results = [('revision_metadata', True, [{ | expected_results = [('revision_metadata', True, [{ | ||||
'id': '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f', | 'id': '8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f', | ||||
'translated_metadata': { | 'translated_metadata': { | ||||
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0', | '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', | ||||
Not Done Inline Actions: I'm not sure where to write this comment, but when you have multiple detected files, what happens with the translated_metadata output? moranegg: I'm not sure where to write this comment, but when you have multiple detected files, what… | |||||
Done Inline Actions:
> does each property become a list?
Yes. Actually, they are all lists at the beginning, but JSON-LD compaction reduces each single-element list to its sole element. vlorentz: > does each property become a list?
Yes. Actually, they are all lists at the beginning, but… | |||||
'codemeta:url': | 'url': | ||||
'https://github.com/librariesio/yarn-parser#readme', | 'https://github.com/librariesio/yarn-parser#readme', | ||||
'codemeta:codeRepository': | 'schema:codeRepository': | ||||
'git+https://github.com/librariesio/yarn-parser.git', | 'git+https://github.com/librariesio/yarn-parser.git', | ||||
'codemeta:author': 'Andrew Nesbitt', | 'schema:author': 'Andrew Nesbitt', | ||||
'codemeta:license': 'AGPL-3.0', | 'license': 'AGPL-3.0', | ||||
'codemeta:version': '1.0.0', | 'version': '1.0.0', | ||||
'codemeta:description': | 'description': | ||||
'Tiny web service for parsing yarn.lock files', | 'Tiny web service for parsing yarn.lock files', | ||||
'codemeta:issueTracker': | 'codemeta:issueTracker': | ||||
'https://github.com/librariesio/yarn-parser/issues', | 'https://github.com/librariesio/yarn-parser/issues', | ||||
'codemeta:name': 'yarn-parser', | 'name': 'yarn-parser', | ||||
'codemeta:keywords': ['yarn', 'parse', 'lock', 'dependencies'], | 'keywords': ['yarn', 'parse', 'lock', 'dependencies'], | ||||
}, | }, | ||||
'indexer_configuration_id': 7 | 'indexer_configuration_id': 7 | ||||
}])] | }])] | ||||
# then | # then | ||||
self.assertEqual(expected_results, results) | self.assertEqual(expected_results, results) |
I kept a property called `other` in the content_metadata to group together all the metadata I wasn't able to translate to CodeMeta.
I see that you deleted this property; was it a problem for keeping the output in the codemeta.json format?