diff --git a/swh/indexer/metadata_dictionary/cff.py b/swh/indexer/metadata_dictionary/cff.py
index 0e66f21..c5aa5a0 100644
--- a/swh/indexer/metadata_dictionary/cff.py
+++ b/swh/indexer/metadata_dictionary/cff.py
@@ -1,71 +1,76 @@
from typing import Dict, List, Optional, Union
import yaml
from swh.indexer.codemeta import CODEMETA_CONTEXT_URL, CROSSWALK_TABLE, SCHEMA_URI
from .base import DictMapping, SingleFileMapping
class SafeLoader(yaml.SafeLoader):
yaml_implicit_resolvers = {
k: [r for r in v if r[0] != "tag:yaml.org,2002:timestamp"]
for k, v in yaml.SafeLoader.yaml_implicit_resolvers.items()
}
class CffMapping(DictMapping, SingleFileMapping):
"""Dedicated class for Citation (CITATION.cff) mapping and translation"""
name = "cff"
filename = b"CITATION.cff"
mapping = CROSSWALK_TABLE["Citation File Format Core (CFF-Core) 1.0.2"]
string_fields = ["keywords", "license", "abstract", "version", "doi"]
- def translate(self, raw_content: bytes) -> Dict[str, str]:
+ def translate(self, raw_content: bytes) -> Optional[Dict[str, str]]:
raw_content_string: str = raw_content.decode()
- content_dict = yaml.load(raw_content_string, Loader=SafeLoader)
- metadata = self._translate_dict(content_dict)
+ try:
+ content_dict = yaml.load(raw_content_string, Loader=SafeLoader)
+ except yaml.scanner.ScannerError:
+ return None
- metadata["@context"] = CODEMETA_CONTEXT_URL
+ if isinstance(content_dict, dict):
+ metadata = self._translate_dict(content_dict)
+ metadata["@context"] = CODEMETA_CONTEXT_URL
+ return metadata
- return metadata
+ return None
def normalize_authors(self, d: List[dict]) -> Dict[str, list]:
result = []
for author in d:
author_data: Dict[str, Optional[Union[str, Dict]]] = {
"@type": SCHEMA_URI + "Person"
}
if "orcid" in author:
author_data["@id"] = author["orcid"]
if "affiliation" in author:
author_data[SCHEMA_URI + "affiliation"] = {
"@type": SCHEMA_URI + "Organization",
SCHEMA_URI + "name": author["affiliation"],
}
if "family-names" in author:
author_data[SCHEMA_URI + "familyName"] = author["family-names"]
if "given-names" in author:
author_data[SCHEMA_URI + "givenName"] = author["given-names"]
result.append(author_data)
result_final = {"@list": result}
return result_final
def normalize_doi(self, s: str) -> Dict[str, str]:
if isinstance(s, str):
return {"@id": "https://doi.org/" + s}
def normalize_license(self, s: str) -> Dict[str, str]:
if isinstance(s, str):
return {"@id": "https://spdx.org/licenses/" + s}
def normalize_repository_code(self, s: str) -> Dict[str, str]:
if isinstance(s, str):
return {"@id": s}
def normalize_date_released(self, s: str) -> Dict[str, str]:
if isinstance(s, str):
return {"@value": s, "@type": SCHEMA_URI + "Date"}
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
index 3fb726e..9cc2aae 100644
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -1,1257 +1,1310 @@
# Copyright (C) 2017-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import json
import logging
from hypothesis import HealthCheck, given, settings, strategies
import pytest
from swh.indexer.codemeta import CODEMETA_TERMS
from swh.indexer.metadata import ContentMetadataIndexer, DirectoryMetadataIndexer
from swh.indexer.metadata_detector import detect_metadata
from swh.indexer.metadata_dictionary import MAPPINGS
from swh.indexer.metadata_dictionary.maven import MavenMapping
from swh.indexer.metadata_dictionary.npm import NpmMapping
from swh.indexer.metadata_dictionary.ruby import GemspecMapping
from swh.indexer.storage.model import ContentMetadataRow, DirectoryIntrinsicMetadataRow
from swh.indexer.tests.utils import DIRECTORY2
from swh.model.hashutil import hash_to_bytes
from swh.model.model import Directory, DirectoryEntry
from .utils import (
BASE_TEST_CONFIG,
YARN_PARSER_METADATA,
fill_obj_storage,
fill_storage,
json_document_strategy,
xml_document_strategy,
)
TRANSLATOR_TOOL = {
"name": "swh-metadata-translator",
"version": "0.0.2",
"configuration": {"type": "local", "context": "NpmMapping"},
}
class ContentMetadataTestIndexer(ContentMetadataIndexer):
"""Specific Metadata whose configuration is enough to satisfy the
indexing tests.
"""
def parse_config_file(self, *args, **kwargs):
assert False, "should not be called; the dir indexer configures it."
DIRECTORY_METADATA_CONFIG = {
**BASE_TEST_CONFIG,
"tools": TRANSLATOR_TOOL,
}
class TestMetadata:
"""
Tests metadata_mock_tool tool for Metadata detection
"""
def setup_method(self):
self.npm_mapping = MAPPINGS["NpmMapping"]()
self.codemeta_mapping = MAPPINGS["CodemetaMapping"]()
self.maven_mapping = MAPPINGS["MavenMapping"]()
self.pkginfo_mapping = MAPPINGS["PythonPkginfoMapping"]()
self.gemspec_mapping = MAPPINGS["GemspecMapping"]()
self.cff_mapping = MAPPINGS["CffMapping"]()
def test_compute_metadata_none(self):
"""
testing content empty content is empty
should return None
"""
# given
content = b""
# None if no metadata was found or an error occurred
declared_metadata = None
# when
result = self.npm_mapping.translate(content)
# then
assert declared_metadata == result
def test_compute_metadata_cff(self):
"""
testing CITATION.cff translation
"""
# given
content = """# YAML 1.2
---
abstract: "Command line program to convert from Citation File \
Format to various other formats such as BibTeX, EndNote, RIS, \
schema.org, CodeMeta, and .zenodo.json."
authors:
-
affiliation: "Netherlands eScience Center"
family-names: Klaver
given-names: Tom
-
affiliation: "Humboldt-Universität zu Berlin"
family-names: Druskat
given-names: Stephan
orcid: https://orcid.org/0000-0003-4925-7248
cff-version: "1.0.3"
date-released: 2019-11-12
doi: 10.5281/zenodo.1162057
keywords:
- "citation"
- "bibliography"
- "cff"
- "CITATION.cff"
license: Apache-2.0
message: "If you use this software, please cite it using these metadata."
repository-code: "https://github.com/citation-file-format/cff-converter-python"
title: cffconvert
version: "1.4.0-alpha0"
""".encode(
"utf-8"
)
expected = {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"author": [
{
"type": "Person",
"affiliation": {
"type": "Organization",
"name": "Netherlands eScience Center",
},
"familyName": "Klaver",
"givenName": "Tom",
},
{
"id": "https://orcid.org/0000-0003-4925-7248",
"type": "Person",
"affiliation": {
"type": "Organization",
"name": "Humboldt-Universität zu Berlin",
},
"familyName": "Druskat",
"givenName": "Stephan",
},
],
"codeRepository": (
"https://github.com/citation-file-format/cff-converter-python"
),
"datePublished": "2019-11-12",
"description": """Command line program to convert from \
Citation File Format to various other formats such as BibTeX, EndNote, \
RIS, schema.org, CodeMeta, and .zenodo.json.""",
"identifier": "https://doi.org/10.5281/zenodo.1162057",
"keywords": ["citation", "bibliography", "cff", "CITATION.cff"],
"license": "https://spdx.org/licenses/Apache-2.0",
"version": "1.4.0-alpha0",
}
# when
result = self.cff_mapping.translate(content)
# then
assert expected == result
+ def test_compute_metadata_cff_invalid_yaml(self):
+ """
+ test yaml translation for invalid yaml file
+ """
+ # given
+ content = """cff-version: 1.0.3
+message: To cite the SigMF specification, please include the following:
+authors:
+ - name: The GNU Radio Foundation, Inc.
+ """.encode(
+ "utf-8"
+ )
+
+ expected = None
+
+ result = self.cff_mapping.translate(content)
+ # then
+ assert expected == result
+
+ def test_compute_metadata_cff_empty(self):
+ """
+ test yaml translation for empty yaml file
+ """
+ # given
+ content = """
+ """.encode(
+ "utf-8"
+ )
+
+ expected = None
+
+ result = self.cff_mapping.translate(content)
+ # then
+ assert expected == result
+
+ def test_compute_metadata_cff_list(self):
+ """
+ test yaml translation for empty yaml file
+ """
+ # given
+ content = """
+- Foo
+- Bar
+ """.encode(
+ "utf-8"
+ )
+
+ expected = None
+
+ result = self.cff_mapping.translate(content)
+ # then
+ assert expected == result
+
def test_compute_metadata_npm(self):
"""
testing only computation of metadata with hard_mapping_npm
"""
# given
content = b"""
{
"name": "test_metadata",
"version": "0.0.2",
"description": "Simple package.json test for indexer",
"repository": {
"type": "git",
"url": "https://github.com/moranegg/metadata_test"
},
"author": {
"email": "moranegg@example.com",
"name": "Morane G"
}
}
"""
declared_metadata = {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "test_metadata",
"version": "0.0.2",
"description": "Simple package.json test for indexer",
"codeRepository": "git+https://github.com/moranegg/metadata_test",
"author": [
{
"type": "Person",
"name": "Morane G",
"email": "moranegg@example.com",
}
],
}
# when
result = self.npm_mapping.translate(content)
# then
assert declared_metadata == result
def test_index_content_metadata_npm(self):
"""
testing NPM with package.json
- one sha1 uses a file that can't be translated to metadata and
should return None in the translated metadata
"""
# given
sha1s = [
hash_to_bytes("26a9f72a7c87cc9205725cfd879f514ff4f3d8d5"),
hash_to_bytes("d4c647f0fc257591cc9ba1722484229780d1c607"),
hash_to_bytes("02fb2c89e14f7fab46701478c83779c7beb7b069"),
]
# this metadata indexer computes only metadata for package.json
# in npm context with a hard mapping
config = BASE_TEST_CONFIG.copy()
config["tools"] = [TRANSLATOR_TOOL]
metadata_indexer = ContentMetadataTestIndexer(config=config)
fill_obj_storage(metadata_indexer.objstorage)
fill_storage(metadata_indexer.storage)
# when
metadata_indexer.run(sha1s)
results = list(metadata_indexer.idx_storage.content_metadata_get(sha1s))
expected_results = [
ContentMetadataRow(
id=hash_to_bytes("26a9f72a7c87cc9205725cfd879f514ff4f3d8d5"),
tool=TRANSLATOR_TOOL,
metadata={
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"codeRepository": "git+https://github.com/moranegg/metadata_test",
"description": "Simple package.json test for indexer",
"name": "test_metadata",
"version": "0.0.1",
},
),
ContentMetadataRow(
id=hash_to_bytes("d4c647f0fc257591cc9ba1722484229780d1c607"),
tool=TRANSLATOR_TOOL,
metadata={
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"issueTracker": "https://github.com/npm/npm/issues",
"author": [
{
"type": "Person",
"name": "Isaac Z. Schlueter",
"email": "i@izs.me",
"url": "http://blog.izs.me",
}
],
"codeRepository": "git+https://github.com/npm/npm",
"description": "a package manager for JavaScript",
"license": "https://spdx.org/licenses/Artistic-2.0",
"version": "5.0.3",
"name": "npm",
"keywords": [
"install",
"modules",
"package manager",
"package.json",
],
"url": "https://docs.npmjs.com/",
},
),
]
for result in results:
del result.tool["id"]
# The assertion below returns False sometimes because of nested lists
assert expected_results == results
def test_npm_bugs_normalization(self):
# valid dictionary
package_json = b"""{
"name": "foo",
"bugs": {
"url": "https://github.com/owner/project/issues",
"email": "foo@example.com"
}
}"""
result = self.npm_mapping.translate(package_json)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"name": "foo",
"issueTracker": "https://github.com/owner/project/issues",
"type": "SoftwareSourceCode",
}
# "invalid" dictionary
package_json = b"""{
"name": "foo",
"bugs": {
"email": "foo@example.com"
}
}"""
result = self.npm_mapping.translate(package_json)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"name": "foo",
"type": "SoftwareSourceCode",
}
# string
package_json = b"""{
"name": "foo",
"bugs": "https://github.com/owner/project/issues"
}"""
result = self.npm_mapping.translate(package_json)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"name": "foo",
"issueTracker": "https://github.com/owner/project/issues",
"type": "SoftwareSourceCode",
}
def test_npm_repository_normalization(self):
# normal
package_json = b"""{
"name": "foo",
"repository": {
"type" : "git",
"url" : "https://github.com/npm/cli.git"
}
}"""
result = self.npm_mapping.translate(package_json)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"name": "foo",
"codeRepository": "git+https://github.com/npm/cli.git",
"type": "SoftwareSourceCode",
}
# missing url
package_json = b"""{
"name": "foo",
"repository": {
"type" : "git"
}
}"""
result = self.npm_mapping.translate(package_json)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"name": "foo",
"type": "SoftwareSourceCode",
}
# github shortcut
package_json = b"""{
"name": "foo",
"repository": "github:npm/cli"
}"""
result = self.npm_mapping.translate(package_json)
expected_result = {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"name": "foo",
"codeRepository": "git+https://github.com/npm/cli.git",
"type": "SoftwareSourceCode",
}
assert result == expected_result
# github shortshortcut
package_json = b"""{
"name": "foo",
"repository": "npm/cli"
}"""
result = self.npm_mapping.translate(package_json)
assert result == expected_result
# gitlab shortcut
package_json = b"""{
"name": "foo",
"repository": "gitlab:user/repo"
}"""
result = self.npm_mapping.translate(package_json)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"name": "foo",
"codeRepository": "git+https://gitlab.com/user/repo.git",
"type": "SoftwareSourceCode",
}
@pytest.mark.parametrize(
"filename", [b"package.json", b"Package.json", b"PACKAGE.json", b"PACKAGE.JSON"]
)
def test_detect_metadata_package_json(self, filename):
# given
df = [
{
"sha1_git": b"abc",
"name": b"index.js",
"target": b"abc",
"length": 897,
"status": "visible",
"type": "file",
"perms": 33188,
"dir_id": b"dir_a",
"sha1": b"bcd",
},
{
"sha1_git": b"aab",
"name": filename,
"target": b"aab",
"length": 712,
"status": "visible",
"type": "file",
"perms": 33188,
"dir_id": b"dir_a",
"sha1": b"cde",
},
]
# when
results = detect_metadata(df)
expected_results = {"NpmMapping": [b"cde"]}
# then
assert expected_results == results
def test_detect_metadata_codemeta_json_uppercase(self):
# given
df = [
{
"sha1_git": b"abc",
"name": b"index.html",
"target": b"abc",
"length": 897,
"status": "visible",
"type": "file",
"perms": 33188,
"dir_id": b"dir_a",
"sha1": b"bcd",
},
{
"sha1_git": b"aab",
"name": b"CODEMETA.json",
"target": b"aab",
"length": 712,
"status": "visible",
"type": "file",
"perms": 33188,
"dir_id": b"dir_a",
"sha1": b"bcd",
},
]
# when
results = detect_metadata(df)
expected_results = {"CodemetaMapping": [b"bcd"]}
# then
assert expected_results == results
def test_compute_metadata_valid_codemeta(self):
raw_content = b"""{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"@type": "SoftwareSourceCode",
"identifier": "CodeMeta",
"description": "CodeMeta is a concept vocabulary that can be used to standardize the exchange of software metadata across repositories and organizations.",
"name": "CodeMeta: Minimal metadata schemas for science software and code, in JSON-LD",
"codeRepository": "https://github.com/codemeta/codemeta",
"issueTracker": "https://github.com/codemeta/codemeta/issues",
"license": "https://spdx.org/licenses/Apache-2.0",
"version": "2.0",
"author": [
{
"@type": "Person",
"givenName": "Carl",
"familyName": "Boettiger",
"email": "cboettig@gmail.com",
"@id": "http://orcid.org/0000-0002-1642-628X"
},
{
"@type": "Person",
"givenName": "Matthew B.",
"familyName": "Jones",
"email": "jones@nceas.ucsb.edu",
"@id": "http://orcid.org/0000-0003-0077-4738"
}
],
"maintainer": {
"@type": "Person",
"givenName": "Carl",
"familyName": "Boettiger",
"email": "cboettig@gmail.com",
"@id": "http://orcid.org/0000-0002-1642-628X"
},
"contIntegration": "https://travis-ci.org/codemeta/codemeta",
"developmentStatus": "active",
"downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip",
"funder": {
"@id": "https://doi.org/10.13039/100000001",
"@type": "Organization",
"name": "National Science Foundation"
},
"funding":"1549758; Codemeta: A Rosetta Stone for Metadata in Scientific Software",
"keywords": [
"metadata",
"software"
],
"version":"2.0",
"dateCreated":"2017-06-05",
"datePublished":"2017-06-05",
"programmingLanguage": "JSON-LD"
}""" # noqa
expected_result = {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"identifier": "CodeMeta",
"description": "CodeMeta is a concept vocabulary that can "
"be used to standardize the exchange of software metadata "
"across repositories and organizations.",
"name": "CodeMeta: Minimal metadata schemas for science "
"software and code, in JSON-LD",
"codeRepository": "https://github.com/codemeta/codemeta",
"issueTracker": "https://github.com/codemeta/codemeta/issues",
"license": "https://spdx.org/licenses/Apache-2.0",
"version": "2.0",
"author": [
{
"type": "Person",
"givenName": "Carl",
"familyName": "Boettiger",
"email": "cboettig@gmail.com",
"id": "http://orcid.org/0000-0002-1642-628X",
},
{
"type": "Person",
"givenName": "Matthew B.",
"familyName": "Jones",
"email": "jones@nceas.ucsb.edu",
"id": "http://orcid.org/0000-0003-0077-4738",
},
],
"maintainer": {
"type": "Person",
"givenName": "Carl",
"familyName": "Boettiger",
"email": "cboettig@gmail.com",
"id": "http://orcid.org/0000-0002-1642-628X",
},
"contIntegration": "https://travis-ci.org/codemeta/codemeta",
"developmentStatus": "active",
"downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip",
"funder": {
"id": "https://doi.org/10.13039/100000001",
"type": "Organization",
"name": "National Science Foundation",
},
"funding": "1549758; Codemeta: A Rosetta Stone for Metadata "
"in Scientific Software",
"keywords": ["metadata", "software"],
"version": "2.0",
"dateCreated": "2017-06-05",
"datePublished": "2017-06-05",
"programmingLanguage": "JSON-LD",
}
result = self.codemeta_mapping.translate(raw_content)
assert result == expected_result
def test_compute_metadata_codemeta_alternate_context(self):
raw_content = b"""{
"@context": "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld",
"@type": "SoftwareSourceCode",
"identifier": "CodeMeta"
}""" # noqa
expected_result = {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"identifier": "CodeMeta",
}
result = self.codemeta_mapping.translate(raw_content)
assert result == expected_result
def test_compute_metadata_maven(self):
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
central
Maven Repository Switchboard
default
http://repo1.maven.org/maven2
false
Apache License, Version 2.0
https://www.apache.org/licenses/LICENSE-2.0.txt
repo
A business-friendly OSS license
"""
result = self.maven_mapping.translate(raw_content)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"identifier": "com.mycompany.app",
"version": "1.2.3",
"license": "https://www.apache.org/licenses/LICENSE-2.0.txt",
"codeRepository": (
"http://repo1.maven.org/maven2/com/mycompany/app/my-app"
),
}
def test_compute_metadata_maven_empty(self):
raw_content = b"""
"""
result = self.maven_mapping.translate(raw_content)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
}
def test_compute_metadata_maven_almost_empty(self):
raw_content = b"""
"""
result = self.maven_mapping.translate(raw_content)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
}
def test_compute_metadata_maven_invalid_xml(self, caplog):
expected_warning = (
"swh.indexer.metadata_dictionary.maven.MavenMapping",
logging.WARNING,
"Error parsing XML from foo",
)
caplog.at_level(logging.WARNING, logger="swh.indexer.metadata_dictionary")
raw_content = b"""
"""
caplog.clear()
result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
assert caplog.record_tuples == [expected_warning]
assert result is None
raw_content = b"""
"""
caplog.clear()
result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
assert caplog.record_tuples == [expected_warning]
assert result is None
def test_compute_metadata_maven_unknown_encoding(self, caplog):
expected_warning = (
"swh.indexer.metadata_dictionary.maven.MavenMapping",
logging.WARNING,
"Error detecting XML encoding from foo",
)
caplog.at_level(logging.WARNING, logger="swh.indexer.metadata_dictionary")
raw_content = b"""
"""
caplog.clear()
result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
assert caplog.record_tuples == [expected_warning]
assert result is None
raw_content = b"""
"""
caplog.clear()
result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
assert caplog.record_tuples == [expected_warning]
assert result is None
def test_compute_metadata_maven_invalid_encoding(self, caplog):
expected_warning = [
# libexpat1 <= 2.2.10-2+deb11u1
[
(
"swh.indexer.metadata_dictionary.maven.MavenMapping",
logging.WARNING,
"Error unidecoding XML from foo",
)
],
# libexpat1 >= 2.2.10-2+deb11u2
[
(
"swh.indexer.metadata_dictionary.maven.MavenMapping",
logging.WARNING,
"Error parsing XML from foo",
)
],
]
caplog.at_level(logging.WARNING, logger="swh.indexer.metadata_dictionary")
raw_content = b"""
"""
caplog.clear()
result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
assert caplog.record_tuples in expected_warning
assert result is None
def test_compute_metadata_maven_minimal(self):
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
"""
result = self.maven_mapping.translate(raw_content)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"identifier": "com.mycompany.app",
"version": "1.2.3",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
),
}
def test_compute_metadata_maven_empty_nodes(self):
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
"""
result = self.maven_mapping.translate(raw_content)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"identifier": "com.mycompany.app",
"version": "1.2.3",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
),
}
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
"""
result = self.maven_mapping.translate(raw_content)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"identifier": "com.mycompany.app",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
),
}
raw_content = b"""
4.0.0
com.mycompany.app
my-app
1.2.3
"""
result = self.maven_mapping.translate(raw_content)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"identifier": "com.mycompany.app",
"version": "1.2.3",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
),
}
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
"""
result = self.maven_mapping.translate(raw_content)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"identifier": "com.mycompany.app",
"version": "1.2.3",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
),
}
raw_content = b"""
1.2.3
"""
result = self.maven_mapping.translate(raw_content)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"version": "1.2.3",
}
def test_compute_metadata_maven_invalid_licenses(self):
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
foo
"""
result = self.maven_mapping.translate(raw_content)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"identifier": "com.mycompany.app",
"version": "1.2.3",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
),
}
def test_compute_metadata_maven_multiple(self):
"""Tests when there are multiple code repos and licenses."""
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
central
Maven Repository Switchboard
default
http://repo1.maven.org/maven2
false
example
Example Maven Repo
default
http://example.org/maven2
Apache License, Version 2.0
https://www.apache.org/licenses/LICENSE-2.0.txt
repo
A business-friendly OSS license
MIT license
https://opensource.org/licenses/MIT
"""
result = self.maven_mapping.translate(raw_content)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"identifier": "com.mycompany.app",
"version": "1.2.3",
"license": [
"https://www.apache.org/licenses/LICENSE-2.0.txt",
"https://opensource.org/licenses/MIT",
],
"codeRepository": [
"http://repo1.maven.org/maven2/com/mycompany/app/my-app",
"http://example.org/maven2/com/mycompany/app/my-app",
],
}
def test_compute_metadata_pkginfo(self):
raw_content = b"""\
Metadata-Version: 2.1
Name: swh.core
Version: 0.0.49
Summary: Software Heritage core utilities
Home-page: https://forge.softwareheritage.org/diffusion/DCORE/
Author: Software Heritage developers
Author-email: swh-devel@inria.fr
License: UNKNOWN
Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
Project-URL: Funding, https://www.softwareheritage.org/donate
Project-URL: Source, https://forge.softwareheritage.org/source/swh-core
Description: swh-core
========
\x20
core library for swh's modules:
- config parser
- hash computations
- serialization
- logging mechanism
\x20
Platform: UNKNOWN
Classifier: Programming Language :: Python :: 3
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
Classifier: Operating System :: OS Independent
Classifier: Development Status :: 5 - Production/Stable
Description-Content-Type: text/markdown
Provides-Extra: testing
""" # noqa
result = self.pkginfo_mapping.translate(raw_content)
assert result["description"] == [
"Software Heritage core utilities", # note the comma here
"swh-core\n"
"========\n"
"\n"
"core library for swh's modules:\n"
"- config parser\n"
"- hash computations\n"
"- serialization\n"
"- logging mechanism\n"
"",
], result
del result["description"]
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"url": "https://forge.softwareheritage.org/diffusion/DCORE/",
"name": "swh.core",
"author": [
{
"type": "Person",
"name": "Software Heritage developers",
"email": "swh-devel@inria.fr",
}
],
"version": "0.0.49",
}
def test_compute_metadata_pkginfo_utf8(self):
raw_content = b"""\
Metadata-Version: 1.1
Name: snowpyt
Description-Content-Type: UNKNOWN
Description: foo
Hydrology N\xc2\xb083
""" # noqa
result = self.pkginfo_mapping.translate(raw_content)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "snowpyt",
"description": "foo\nHydrology N°83",
}
def test_compute_metadata_pkginfo_keywords(self):
raw_content = b"""\
Metadata-Version: 2.1
Name: foo
Keywords: foo bar baz
""" # noqa
result = self.pkginfo_mapping.translate(raw_content)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "foo",
"keywords": ["foo", "bar", "baz"],
}
def test_compute_metadata_pkginfo_license(self):
raw_content = b"""\
Metadata-Version: 2.1
Name: foo
License: MIT
""" # noqa
result = self.pkginfo_mapping.translate(raw_content)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "foo",
"license": "MIT",
}
def test_gemspec_base(self):
raw_content = b"""
Gem::Specification.new do |s|
s.name = 'example'
s.version = '0.1.0'
s.licenses = ['MIT']
s.summary = "This is an example!"
s.description = "Much longer explanation of the example!"
s.authors = ["Ruby Coder"]
s.email = 'rubycoder@example.com'
s.files = ["lib/example.rb"]
s.homepage = 'https://rubygems.org/gems/example'
s.metadata = { "source_code_uri" => "https://github.com/example/example" }
end"""
result = self.gemspec_mapping.translate(raw_content)
assert set(result.pop("description")) == {
"This is an example!",
"Much longer explanation of the example!",
}
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"author": [{"type": "Person", "name": "Ruby Coder"}],
"name": "example",
"license": "https://spdx.org/licenses/MIT",
"codeRepository": "https://rubygems.org/gems/example",
"email": "rubycoder@example.com",
"version": "0.1.0",
}
def test_gemspec_two_author_fields(self):
raw_content = b"""
Gem::Specification.new do |s|
s.authors = ["Ruby Coder1"]
s.author = "Ruby Coder2"
end"""
result = self.gemspec_mapping.translate(raw_content)
assert result.pop("author") in (
[
{"type": "Person", "name": "Ruby Coder1"},
{"type": "Person", "name": "Ruby Coder2"},
],
[
{"type": "Person", "name": "Ruby Coder2"},
{"type": "Person", "name": "Ruby Coder1"},
],
)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
}
def test_gemspec_invalid_author(self):
raw_content = b"""
Gem::Specification.new do |s|
s.author = ["Ruby Coder"]
end"""
result = self.gemspec_mapping.translate(raw_content)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
}
raw_content = b"""
Gem::Specification.new do |s|
s.author = "Ruby Coder1",
end"""
result = self.gemspec_mapping.translate(raw_content)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
}
raw_content = b"""
Gem::Specification.new do |s|
s.authors = ["Ruby Coder1", ["Ruby Coder2"]]
end"""
result = self.gemspec_mapping.translate(raw_content)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"author": [{"type": "Person", "name": "Ruby Coder1"}],
}
def test_gemspec_alternative_header(self):
raw_content = b"""
require './lib/version'
Gem::Specification.new { |s|
s.name = 'rb-system-with-aliases'
s.summary = 'execute system commands with aliases'
}
"""
result = self.gemspec_mapping.translate(raw_content)
assert result == {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "rb-system-with-aliases",
"description": "execute system commands with aliases",
}
@settings(suppress_health_check=[HealthCheck.too_slow])
@given(json_document_strategy(keys=list(NpmMapping.mapping)))
def test_npm_adversarial(self, doc):
raw = json.dumps(doc).encode()
self.npm_mapping.translate(raw)
@settings(suppress_health_check=[HealthCheck.too_slow])
@given(json_document_strategy(keys=CODEMETA_TERMS))
def test_codemeta_adversarial(self, doc):
raw = json.dumps(doc).encode()
self.codemeta_mapping.translate(raw)
@settings(suppress_health_check=[HealthCheck.too_slow])
@given(
xml_document_strategy(
keys=list(MavenMapping.mapping),
root="project",
xmlns="http://maven.apache.org/POM/4.0.0",
)
)
def test_maven_adversarial(self, doc):
self.maven_mapping.translate(doc)
@settings(suppress_health_check=[HealthCheck.too_slow])
@given(
strategies.dictionaries(
# keys
strategies.one_of(
strategies.text(), *map(strategies.just, GemspecMapping.mapping)
),
# values
strategies.recursive(
strategies.characters(),
lambda children: strategies.lists(children, min_size=1),
),
)
)
def test_gemspec_adversarial(self, doc):
parts = [b"Gem::Specification.new do |s|\n"]
for (k, v) in doc.items():
parts.append(" s.{} = {}\n".format(k, repr(v)).encode())
parts.append(b"end\n")
self.gemspec_mapping.translate(b"".join(parts))
def test_directory_metadata_indexer(self):
metadata_indexer = DirectoryMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
fill_obj_storage(metadata_indexer.objstorage)
fill_storage(metadata_indexer.storage)
tool = metadata_indexer.idx_storage.indexer_configuration_get(
{f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()}
)
assert tool is not None
dir_ = DIRECTORY2
metadata_indexer.idx_storage.content_metadata_add(
[
ContentMetadataRow(
id=DIRECTORY2.entries[0].target,
indexer_configuration_id=tool["id"],
metadata=YARN_PARSER_METADATA,
)
]
)
metadata_indexer.run([dir_.id])
results = list(
metadata_indexer.idx_storage.directory_intrinsic_metadata_get(
[DIRECTORY2.id]
)
)
expected_results = [
DirectoryIntrinsicMetadataRow(
id=dir_.id,
tool=TRANSLATOR_TOOL,
metadata=YARN_PARSER_METADATA,
mappings=["npm"],
)
]
for result in results:
del result.tool["id"]
# then
assert results == expected_results
def test_directory_metadata_indexer_single_root_dir(self):
metadata_indexer = DirectoryMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
fill_obj_storage(metadata_indexer.objstorage)
fill_storage(metadata_indexer.storage)
# Add a parent directory, that is the only directory at the root
# of the directory
dir_ = DIRECTORY2
new_dir = Directory(
entries=(
DirectoryEntry(
name=b"foobar-1.0.0",
type="dir",
target=dir_.id,
perms=16384,
),
),
)
assert new_dir.id is not None
metadata_indexer.storage.directory_add([new_dir])
tool = metadata_indexer.idx_storage.indexer_configuration_get(
{f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()}
)
assert tool is not None
metadata_indexer.idx_storage.content_metadata_add(
[
ContentMetadataRow(
id=DIRECTORY2.entries[0].target,
indexer_configuration_id=tool["id"],
metadata=YARN_PARSER_METADATA,
)
]
)
metadata_indexer.run([new_dir.id])
results = list(
metadata_indexer.idx_storage.directory_intrinsic_metadata_get([new_dir.id])
)
expected_results = [
DirectoryIntrinsicMetadataRow(
id=new_dir.id,
tool=TRANSLATOR_TOOL,
metadata=YARN_PARSER_METADATA,
mappings=["npm"],
)
]
for result in results:
del result.tool["id"]
# then
assert results == expected_results