diff --git a/swh/indexer/tests/conftest.py b/swh/indexer/tests/conftest.py
index fa85d75..1ba1528 100644
--- a/swh/indexer/tests/conftest.py
+++ b/swh/indexer/tests/conftest.py
@@ -1,77 +1,74 @@
# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from datetime import timedelta
from unittest.mock import patch
import pytest
from swh.objstorage import get_objstorage
from swh.storage import get_storage
from swh.indexer.storage import get_indexer_storage
from .utils import fill_storage, fill_obj_storage
TASK_NAMES = ["revision_intrinsic_metadata", "origin_intrinsic_metadata"]
-storage_config = {"cls": "pipeline", "steps": [{"cls": "validate"}, {"cls": "memory"},]}
-
-
@pytest.fixture
def indexer_scheduler(swh_scheduler):
for taskname in TASK_NAMES:
swh_scheduler.create_task_type(
{
"type": taskname,
"description": "The {} indexer testing task".format(taskname),
"backend_name": "swh.indexer.tests.tasks.{}".format(taskname),
"default_interval": timedelta(days=1),
"min_interval": timedelta(hours=6),
"max_interval": timedelta(days=12),
"num_retries": 3,
}
)
return swh_scheduler
@pytest.fixture
def idx_storage():
"""An instance of in-memory indexer storage that gets injected into all
indexers classes.
"""
idx_storage = get_indexer_storage("memory", {})
with patch("swh.indexer.storage.in_memory.IndexerStorage") as idx_storage_mock:
idx_storage_mock.return_value = idx_storage
yield idx_storage
@pytest.fixture
def storage():
"""An instance of in-memory storage that gets injected into all indexers
classes.
"""
- storage = get_storage(**storage_config)
+ storage = get_storage(cls="memory")
fill_storage(storage)
with patch("swh.storage.in_memory.InMemoryStorage") as storage_mock:
storage_mock.return_value = storage
yield storage
@pytest.fixture
def obj_storage():
"""An instance of in-memory objstorage that gets injected into all indexers
classes.
"""
objstorage = get_objstorage("memory", {})
fill_obj_storage(objstorage)
with patch.dict(
"swh.objstorage.factory._STORAGE_CLASSES", {"memory": lambda: objstorage}
):
yield objstorage
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
index c3ef250..7abb4ed 100644
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -1,1210 +1,1205 @@
-# Copyright (C) 2017-2018 The Software Heritage developers
+# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import json
import unittest
-import attr
-
from hypothesis import given, strategies, settings, HealthCheck
from swh.model.hashutil import hash_to_bytes
+from swh.model.model import Directory, DirectoryEntry, Revision
from swh.indexer.codemeta import CODEMETA_TERMS
from swh.indexer.metadata_dictionary import MAPPINGS
from swh.indexer.metadata_dictionary.maven import MavenMapping
from swh.indexer.metadata_dictionary.npm import NpmMapping
from swh.indexer.metadata_dictionary.ruby import GemspecMapping
from swh.indexer.metadata_detector import detect_metadata
from swh.indexer.metadata import ContentMetadataIndexer, RevisionMetadataIndexer
+from swh.indexer.tests.utils import REVISION, DIRECTORY2
+
from .utils import (
BASE_TEST_CONFIG,
fill_obj_storage,
fill_storage,
YARN_PARSER_METADATA,
json_document_strategy,
xml_document_strategy,
)
TRANSLATOR_TOOL = {
"name": "swh-metadata-translator",
"version": "0.0.2",
"configuration": {"type": "local", "context": "NpmMapping"},
}
class ContentMetadataTestIndexer(ContentMetadataIndexer):
"""Specific Metadata whose configuration is enough to satisfy the
indexing tests.
"""
def parse_config_file(self, *args, **kwargs):
assert False, "should not be called; the rev indexer configures it."
REVISION_METADATA_CONFIG = {
**BASE_TEST_CONFIG,
"tools": TRANSLATOR_TOOL,
}
class Metadata(unittest.TestCase):
"""
Tests metadata_mock_tool tool for Metadata detection
"""
def setUp(self):
"""
shows the entire diff in the results
"""
self.maxDiff = None
self.npm_mapping = MAPPINGS["NpmMapping"]()
self.codemeta_mapping = MAPPINGS["CodemetaMapping"]()
self.maven_mapping = MAPPINGS["MavenMapping"]()
self.pkginfo_mapping = MAPPINGS["PythonPkginfoMapping"]()
self.gemspec_mapping = MAPPINGS["GemspecMapping"]()
def test_compute_metadata_none(self):
"""
testing content empty content is empty
should return None
"""
# given
content = b""
# None if no metadata was found or an error occurred
declared_metadata = None
# when
result = self.npm_mapping.translate(content)
# then
self.assertEqual(declared_metadata, result)
def test_compute_metadata_npm(self):
"""
testing only computation of metadata with hard_mapping_npm
"""
# given
content = b"""
{
"name": "test_metadata",
"version": "0.0.2",
"description": "Simple package.json test for indexer",
"repository": {
"type": "git",
"url": "https://github.com/moranegg/metadata_test"
},
"author": {
"email": "moranegg@example.com",
"name": "Morane G"
}
}
"""
declared_metadata = {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "test_metadata",
"version": "0.0.2",
"description": "Simple package.json test for indexer",
"codeRepository": "git+https://github.com/moranegg/metadata_test",
"author": [
{"type": "Person", "name": "Morane G", "email": "moranegg@example.com",}
],
}
# when
result = self.npm_mapping.translate(content)
# then
self.assertEqual(declared_metadata, result)
def test_index_content_metadata_npm(self):
"""
testing NPM with package.json
- one sha1 uses a file that can't be translated to metadata and
should return None in the translated metadata
"""
# given
sha1s = [
hash_to_bytes("26a9f72a7c87cc9205725cfd879f514ff4f3d8d5"),
hash_to_bytes("d4c647f0fc257591cc9ba1722484229780d1c607"),
hash_to_bytes("02fb2c89e14f7fab46701478c83779c7beb7b069"),
]
# this metadata indexer computes only metadata for package.json
# in npm context with a hard mapping
config = BASE_TEST_CONFIG.copy()
config["tools"] = [TRANSLATOR_TOOL]
metadata_indexer = ContentMetadataTestIndexer(config=config)
fill_obj_storage(metadata_indexer.objstorage)
fill_storage(metadata_indexer.storage)
# when
metadata_indexer.run(sha1s, policy_update="ignore-dups")
results = list(metadata_indexer.idx_storage.content_metadata_get(sha1s))
expected_results = [
{
"metadata": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"codeRepository": "git+https://github.com/moranegg/metadata_test",
"description": "Simple package.json test for indexer",
"name": "test_metadata",
"version": "0.0.1",
},
"id": hash_to_bytes("26a9f72a7c87cc9205725cfd879f514ff4f3d8d5"),
},
{
"metadata": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"issueTracker": "https://github.com/npm/npm/issues",
"author": [
{
"type": "Person",
"name": "Isaac Z. Schlueter",
"email": "i@izs.me",
"url": "http://blog.izs.me",
}
],
"codeRepository": "git+https://github.com/npm/npm",
"description": "a package manager for JavaScript",
"license": "https://spdx.org/licenses/Artistic-2.0",
"version": "5.0.3",
"name": "npm",
"keywords": [
"install",
"modules",
"package manager",
"package.json",
],
"url": "https://docs.npmjs.com/",
},
"id": hash_to_bytes("d4c647f0fc257591cc9ba1722484229780d1c607"),
},
]
for result in results:
del result["tool"]
# The assertion below returns False sometimes because of nested lists
self.assertEqual(expected_results, results)
def test_npm_bugs_normalization(self):
# valid dictionary
package_json = b"""{
"name": "foo",
"bugs": {
"url": "https://github.com/owner/project/issues",
"email": "foo@example.com"
}
}"""
result = self.npm_mapping.translate(package_json)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"name": "foo",
"issueTracker": "https://github.com/owner/project/issues",
"type": "SoftwareSourceCode",
},
)
# "invalid" dictionary
package_json = b"""{
"name": "foo",
"bugs": {
"email": "foo@example.com"
}
}"""
result = self.npm_mapping.translate(package_json)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"name": "foo",
"type": "SoftwareSourceCode",
},
)
# string
package_json = b"""{
"name": "foo",
"bugs": "https://github.com/owner/project/issues"
}"""
result = self.npm_mapping.translate(package_json)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"name": "foo",
"issueTracker": "https://github.com/owner/project/issues",
"type": "SoftwareSourceCode",
},
)
def test_npm_repository_normalization(self):
# normal
package_json = b"""{
"name": "foo",
"repository": {
"type" : "git",
"url" : "https://github.com/npm/cli.git"
}
}"""
result = self.npm_mapping.translate(package_json)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"name": "foo",
"codeRepository": "git+https://github.com/npm/cli.git",
"type": "SoftwareSourceCode",
},
)
# missing url
package_json = b"""{
"name": "foo",
"repository": {
"type" : "git"
}
}"""
result = self.npm_mapping.translate(package_json)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"name": "foo",
"type": "SoftwareSourceCode",
},
)
# github shortcut
package_json = b"""{
"name": "foo",
"repository": "github:npm/cli"
}"""
result = self.npm_mapping.translate(package_json)
expected_result = {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"name": "foo",
"codeRepository": "git+https://github.com/npm/cli.git",
"type": "SoftwareSourceCode",
}
self.assertEqual(result, expected_result)
# github shortshortcut
package_json = b"""{
"name": "foo",
"repository": "npm/cli"
}"""
result = self.npm_mapping.translate(package_json)
self.assertEqual(result, expected_result)
# gitlab shortcut
package_json = b"""{
"name": "foo",
"repository": "gitlab:user/repo"
}"""
result = self.npm_mapping.translate(package_json)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"name": "foo",
"codeRepository": "git+https://gitlab.com/user/repo.git",
"type": "SoftwareSourceCode",
},
)
def test_detect_metadata_package_json(self):
# given
df = [
{
"sha1_git": b"abc",
"name": b"index.js",
"target": b"abc",
"length": 897,
"status": "visible",
"type": "file",
"perms": 33188,
"dir_id": b"dir_a",
"sha1": b"bcd",
},
{
"sha1_git": b"aab",
"name": b"package.json",
"target": b"aab",
"length": 712,
"status": "visible",
"type": "file",
"perms": 33188,
"dir_id": b"dir_a",
"sha1": b"cde",
},
]
# when
results = detect_metadata(df)
expected_results = {"NpmMapping": [b"cde"]}
# then
self.assertEqual(expected_results, results)
def test_compute_metadata_valid_codemeta(self):
raw_content = b"""{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"@type": "SoftwareSourceCode",
"identifier": "CodeMeta",
"description": "CodeMeta is a concept vocabulary that can be used to standardize the exchange of software metadata across repositories and organizations.",
"name": "CodeMeta: Minimal metadata schemas for science software and code, in JSON-LD",
"codeRepository": "https://github.com/codemeta/codemeta",
"issueTracker": "https://github.com/codemeta/codemeta/issues",
"license": "https://spdx.org/licenses/Apache-2.0",
"version": "2.0",
"author": [
{
"@type": "Person",
"givenName": "Carl",
"familyName": "Boettiger",
"email": "cboettig@gmail.com",
"@id": "http://orcid.org/0000-0002-1642-628X"
},
{
"@type": "Person",
"givenName": "Matthew B.",
"familyName": "Jones",
"email": "jones@nceas.ucsb.edu",
"@id": "http://orcid.org/0000-0003-0077-4738"
}
],
"maintainer": {
"@type": "Person",
"givenName": "Carl",
"familyName": "Boettiger",
"email": "cboettig@gmail.com",
"@id": "http://orcid.org/0000-0002-1642-628X"
},
"contIntegration": "https://travis-ci.org/codemeta/codemeta",
"developmentStatus": "active",
"downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip",
"funder": {
"@id": "https://doi.org/10.13039/100000001",
"@type": "Organization",
"name": "National Science Foundation"
},
"funding":"1549758; Codemeta: A Rosetta Stone for Metadata in Scientific Software",
"keywords": [
"metadata",
"software"
],
"version":"2.0",
"dateCreated":"2017-06-05",
"datePublished":"2017-06-05",
"programmingLanguage": "JSON-LD"
}""" # noqa
expected_result = {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"identifier": "CodeMeta",
"description": "CodeMeta is a concept vocabulary that can "
"be used to standardize the exchange of software metadata "
"across repositories and organizations.",
"name": "CodeMeta: Minimal metadata schemas for science "
"software and code, in JSON-LD",
"codeRepository": "https://github.com/codemeta/codemeta",
"issueTracker": "https://github.com/codemeta/codemeta/issues",
"license": "https://spdx.org/licenses/Apache-2.0",
"version": "2.0",
"author": [
{
"type": "Person",
"givenName": "Carl",
"familyName": "Boettiger",
"email": "cboettig@gmail.com",
"id": "http://orcid.org/0000-0002-1642-628X",
},
{
"type": "Person",
"givenName": "Matthew B.",
"familyName": "Jones",
"email": "jones@nceas.ucsb.edu",
"id": "http://orcid.org/0000-0003-0077-4738",
},
],
"maintainer": {
"type": "Person",
"givenName": "Carl",
"familyName": "Boettiger",
"email": "cboettig@gmail.com",
"id": "http://orcid.org/0000-0002-1642-628X",
},
"contIntegration": "https://travis-ci.org/codemeta/codemeta",
"developmentStatus": "active",
"downloadUrl": "https://github.com/codemeta/codemeta/archive/2.0.zip",
"funder": {
"id": "https://doi.org/10.13039/100000001",
"type": "Organization",
"name": "National Science Foundation",
},
"funding": "1549758; Codemeta: A Rosetta Stone for Metadata "
"in Scientific Software",
"keywords": ["metadata", "software"],
"version": "2.0",
"dateCreated": "2017-06-05",
"datePublished": "2017-06-05",
"programmingLanguage": "JSON-LD",
}
result = self.codemeta_mapping.translate(raw_content)
self.assertEqual(result, expected_result)
def test_compute_metadata_codemeta_alternate_context(self):
raw_content = b"""{
"@context": "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld",
"@type": "SoftwareSourceCode",
"identifier": "CodeMeta"
}""" # noqa
expected_result = {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"identifier": "CodeMeta",
}
result = self.codemeta_mapping.translate(raw_content)
self.assertEqual(result, expected_result)
def test_compute_metadata_maven(self):
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
central
Maven Repository Switchboard
default
http://repo1.maven.org/maven2
false
Apache License, Version 2.0
https://www.apache.org/licenses/LICENSE-2.0.txt
repo
A business-friendly OSS license
"""
result = self.maven_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"identifier": "com.mycompany.app",
"version": "1.2.3",
"license": "https://www.apache.org/licenses/LICENSE-2.0.txt",
"codeRepository": (
"http://repo1.maven.org/maven2/com/mycompany/app/my-app"
),
},
)
def test_compute_metadata_maven_empty(self):
raw_content = b"""
"""
result = self.maven_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
},
)
def test_compute_metadata_maven_almost_empty(self):
raw_content = b"""
"""
result = self.maven_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
},
)
def test_compute_metadata_maven_invalid_xml(self):
expected_warning = (
"WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:"
"Error parsing XML from foo"
)
raw_content = b"""
"""
with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm:
result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
self.assertEqual(cm.output, [expected_warning])
self.assertEqual(result, None)
raw_content = b"""
"""
with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm:
result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
self.assertEqual(cm.output, [expected_warning])
self.assertEqual(result, None)
def test_compute_metadata_maven_unknown_encoding(self):
expected_warning = (
"WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:"
"Error detecting XML encoding from foo"
)
raw_content = b"""
"""
with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm:
result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
self.assertEqual(cm.output, [expected_warning])
self.assertEqual(result, None)
raw_content = b"""
"""
with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm:
result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
self.assertEqual(cm.output, [expected_warning])
self.assertEqual(result, None)
def test_compute_metadata_maven_invalid_encoding(self):
expected_warning = (
"WARNING:swh.indexer.metadata_dictionary.maven.MavenMapping:"
"Error unidecoding XML from foo"
)
raw_content = b"""
"""
with self.assertLogs("swh.indexer.metadata_dictionary", level="WARNING") as cm:
result = MAPPINGS["MavenMapping"]("foo").translate(raw_content)
self.assertEqual(cm.output, [expected_warning])
self.assertEqual(result, None)
def test_compute_metadata_maven_minimal(self):
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
"""
result = self.maven_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"identifier": "com.mycompany.app",
"version": "1.2.3",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
),
},
)
def test_compute_metadata_maven_empty_nodes(self):
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
"""
result = self.maven_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"identifier": "com.mycompany.app",
"version": "1.2.3",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
),
},
)
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
"""
result = self.maven_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"identifier": "com.mycompany.app",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
),
},
)
raw_content = b"""
4.0.0
com.mycompany.app
my-app
1.2.3
"""
result = self.maven_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"identifier": "com.mycompany.app",
"version": "1.2.3",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
),
},
)
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
"""
result = self.maven_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"identifier": "com.mycompany.app",
"version": "1.2.3",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
),
},
)
raw_content = b"""
1.2.3
"""
result = self.maven_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"version": "1.2.3",
},
)
def test_compute_metadata_maven_invalid_licenses(self):
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
foo
"""
result = self.maven_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"identifier": "com.mycompany.app",
"version": "1.2.3",
"codeRepository": (
"https://repo.maven.apache.org/maven2/com/mycompany/app/my-app"
),
},
)
def test_compute_metadata_maven_multiple(self):
"""Tests when there are multiple code repos and licenses."""
raw_content = b"""
Maven Default Project
4.0.0
com.mycompany.app
my-app
1.2.3
central
Maven Repository Switchboard
default
http://repo1.maven.org/maven2
false
example
Example Maven Repo
default
http://example.org/maven2
Apache License, Version 2.0
https://www.apache.org/licenses/LICENSE-2.0.txt
repo
A business-friendly OSS license
MIT license
https://opensource.org/licenses/MIT
"""
result = self.maven_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "Maven Default Project",
"identifier": "com.mycompany.app",
"version": "1.2.3",
"license": [
"https://www.apache.org/licenses/LICENSE-2.0.txt",
"https://opensource.org/licenses/MIT",
],
"codeRepository": [
"http://repo1.maven.org/maven2/com/mycompany/app/my-app",
"http://example.org/maven2/com/mycompany/app/my-app",
],
},
)
def test_compute_metadata_pkginfo(self):
raw_content = b"""\
Metadata-Version: 2.1
Name: swh.core
Version: 0.0.49
Summary: Software Heritage core utilities
Home-page: https://forge.softwareheritage.org/diffusion/DCORE/
Author: Software Heritage developers
Author-email: swh-devel@inria.fr
License: UNKNOWN
Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
Project-URL: Funding, https://www.softwareheritage.org/donate
Project-URL: Source, https://forge.softwareheritage.org/source/swh-core
Description: swh-core
========
\x20
core library for swh's modules:
- config parser
- hash computations
- serialization
- logging mechanism
\x20
Platform: UNKNOWN
Classifier: Programming Language :: Python :: 3
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
Classifier: Operating System :: OS Independent
Classifier: Development Status :: 5 - Production/Stable
Description-Content-Type: text/markdown
Provides-Extra: testing
""" # noqa
result = self.pkginfo_mapping.translate(raw_content)
self.assertCountEqual(
result["description"],
[
"Software Heritage core utilities", # note the comma here
"swh-core\n"
"========\n"
"\n"
"core library for swh's modules:\n"
"- config parser\n"
"- hash computations\n"
"- serialization\n"
"- logging mechanism\n"
"",
],
result,
)
del result["description"]
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"url": "https://forge.softwareheritage.org/diffusion/DCORE/",
"name": "swh.core",
"author": [
{
"type": "Person",
"name": "Software Heritage developers",
"email": "swh-devel@inria.fr",
}
],
"version": "0.0.49",
},
)
def test_compute_metadata_pkginfo_utf8(self):
raw_content = b"""\
Metadata-Version: 1.1
Name: snowpyt
Description-Content-Type: UNKNOWN
Description: foo
Hydrology N\xc2\xb083
""" # noqa
result = self.pkginfo_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "snowpyt",
"description": "foo\nHydrology N°83",
},
)
def test_compute_metadata_pkginfo_keywords(self):
raw_content = b"""\
Metadata-Version: 2.1
Name: foo
Keywords: foo bar baz
""" # noqa
result = self.pkginfo_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "foo",
"keywords": ["foo", "bar", "baz"],
},
)
def test_compute_metadata_pkginfo_license(self):
raw_content = b"""\
Metadata-Version: 2.1
Name: foo
License: MIT
""" # noqa
result = self.pkginfo_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "foo",
"license": "MIT",
},
)
def test_gemspec_base(self):
raw_content = b"""
Gem::Specification.new do |s|
s.name = 'example'
s.version = '0.1.0'
s.licenses = ['MIT']
s.summary = "This is an example!"
s.description = "Much longer explanation of the example!"
s.authors = ["Ruby Coder"]
s.email = 'rubycoder@example.com'
s.files = ["lib/example.rb"]
s.homepage = 'https://rubygems.org/gems/example'
s.metadata = { "source_code_uri" => "https://github.com/example/example" }
end"""
result = self.gemspec_mapping.translate(raw_content)
self.assertCountEqual(
result.pop("description"),
["This is an example!", "Much longer explanation of the example!"],
)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"author": [{"type": "Person", "name": "Ruby Coder"}],
"name": "example",
"license": "https://spdx.org/licenses/MIT",
"codeRepository": "https://rubygems.org/gems/example",
"email": "rubycoder@example.com",
"version": "0.1.0",
},
)
def test_gemspec_two_author_fields(self):
raw_content = b"""
Gem::Specification.new do |s|
s.authors = ["Ruby Coder1"]
s.author = "Ruby Coder2"
end"""
result = self.gemspec_mapping.translate(raw_content)
self.assertCountEqual(
result.pop("author"),
[
{"type": "Person", "name": "Ruby Coder1"},
{"type": "Person", "name": "Ruby Coder2"},
],
)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
},
)
def test_gemspec_invalid_author(self):
raw_content = b"""
Gem::Specification.new do |s|
s.author = ["Ruby Coder"]
end"""
result = self.gemspec_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
},
)
raw_content = b"""
Gem::Specification.new do |s|
s.author = "Ruby Coder1",
end"""
result = self.gemspec_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
},
)
raw_content = b"""
Gem::Specification.new do |s|
s.authors = ["Ruby Coder1", ["Ruby Coder2"]]
end"""
result = self.gemspec_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"author": [{"type": "Person", "name": "Ruby Coder1"}],
},
)
def test_gemspec_alternative_header(self):
raw_content = b"""
require './lib/version'
Gem::Specification.new { |s|
s.name = 'rb-system-with-aliases'
s.summary = 'execute system commands with aliases'
}
"""
result = self.gemspec_mapping.translate(raw_content)
self.assertEqual(
result,
{
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"name": "rb-system-with-aliases",
"description": "execute system commands with aliases",
},
)
@settings(suppress_health_check=[HealthCheck.too_slow])
@given(json_document_strategy(keys=list(NpmMapping.mapping)))
def test_npm_adversarial(self, doc):
raw = json.dumps(doc).encode()
self.npm_mapping.translate(raw)
@settings(suppress_health_check=[HealthCheck.too_slow])
@given(json_document_strategy(keys=CODEMETA_TERMS))
def test_codemeta_adversarial(self, doc):
raw = json.dumps(doc).encode()
self.codemeta_mapping.translate(raw)
@settings(suppress_health_check=[HealthCheck.too_slow])
@given(
xml_document_strategy(
keys=list(MavenMapping.mapping),
root="project",
xmlns="http://maven.apache.org/POM/4.0.0",
)
)
def test_maven_adversarial(self, doc):
self.maven_mapping.translate(doc)
@settings(suppress_health_check=[HealthCheck.too_slow])
@given(
strategies.dictionaries(
# keys
strategies.one_of(
strategies.text(), *map(strategies.just, GemspecMapping.mapping)
),
# values
strategies.recursive(
strategies.characters(),
lambda children: strategies.lists(children, min_size=1),
),
)
)
def test_gemspec_adversarial(self, doc):
parts = [b"Gem::Specification.new do |s|\n"]
for (k, v) in doc.items():
parts.append(" s.{} = {}\n".format(k, repr(v)).encode())
parts.append(b"end\n")
self.gemspec_mapping.translate(b"".join(parts))
def test_revision_metadata_indexer(self):
metadata_indexer = RevisionMetadataIndexer(config=REVISION_METADATA_CONFIG)
fill_obj_storage(metadata_indexer.objstorage)
fill_storage(metadata_indexer.storage)
tool = metadata_indexer.idx_storage.indexer_configuration_get(
- {"tool_" + k: v for (k, v) in TRANSLATOR_TOOL.items()}
+ {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()}
)
assert tool is not None
+ rev = REVISION
+ assert rev.directory == DIRECTORY2.id
metadata_indexer.idx_storage.content_metadata_add(
[
{
"indexer_configuration_id": tool["id"],
- "id": b"cde",
+ "id": DIRECTORY2.entries[0].target,
"metadata": YARN_PARSER_METADATA,
}
]
)
- sha1_gits = [
- hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f"),
- ]
- metadata_indexer.run(sha1_gits, "update-dups")
+ metadata_indexer.run([rev.id], "update-dups")
results = list(
- metadata_indexer.idx_storage.revision_intrinsic_metadata_get(sha1_gits)
+ metadata_indexer.idx_storage.revision_intrinsic_metadata_get([REVISION.id])
)
expected_results = [
{
- "id": hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f"),
+ "id": rev.id,
"tool": TRANSLATOR_TOOL,
"metadata": YARN_PARSER_METADATA,
"mappings": ["npm"],
}
]
for result in results:
del result["tool"]["id"]
# then
- self.assertEqual(expected_results, results)
+ self.assertEqual(results, expected_results)
def test_revision_metadata_indexer_single_root_dir(self):
metadata_indexer = RevisionMetadataIndexer(config=REVISION_METADATA_CONFIG)
fill_obj_storage(metadata_indexer.objstorage)
fill_storage(metadata_indexer.storage)
# Add a parent directory, that is the only directory at the root
# of the revision
- rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f")
- rev = metadata_indexer.storage._revisions[rev_id]
- subdir_id = rev.directory
- rev = attr.evolve(rev, directory=b"123456")
- metadata_indexer.storage.directory_add(
- [
- {
- "id": b"123456",
- "entries": [
- {
- "name": b"foobar-1.0.0",
- "type": "dir",
- "target": subdir_id,
- "perms": 16384,
- }
- ],
- }
- ]
+ rev = REVISION
+ assert rev.directory == DIRECTORY2.id
+
+ directory = Directory(
+ entries=(
+ DirectoryEntry(
+ name=b"foobar-1.0.0", type="dir", target=rev.directory, perms=16384,
+ ),
+ ),
)
+ assert directory.id is not None
+ metadata_indexer.storage.directory_add([directory])
+
+ new_rev_dict = {**rev.to_dict(), "directory": directory.id}
+ new_rev_dict.pop("id")
+ new_rev = Revision.from_dict(new_rev_dict)
+ metadata_indexer.storage.revision_add([new_rev])
tool = metadata_indexer.idx_storage.indexer_configuration_get(
- {"tool_" + k: v for (k, v) in TRANSLATOR_TOOL.items()}
+ {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()}
)
assert tool is not None
metadata_indexer.idx_storage.content_metadata_add(
[
{
"indexer_configuration_id": tool["id"],
- "id": b"cde",
+ "id": DIRECTORY2.entries[0].target,
"metadata": YARN_PARSER_METADATA,
}
]
)
- sha1_gits = [
- hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f"),
- ]
- metadata_indexer.run(sha1_gits, "update-dups")
+ metadata_indexer.run([new_rev.id], "update-dups")
results = list(
- metadata_indexer.idx_storage.revision_intrinsic_metadata_get(sha1_gits)
+ metadata_indexer.idx_storage.revision_intrinsic_metadata_get([new_rev.id])
)
expected_results = [
{
- "id": hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f"),
+ "id": new_rev.id,
"tool": TRANSLATOR_TOOL,
"metadata": YARN_PARSER_METADATA,
"mappings": ["npm"],
}
]
for result in results:
del result["tool"]["id"]
# then
- self.assertEqual(expected_results, results)
+ self.assertEqual(results, expected_results)
diff --git a/swh/indexer/tests/test_origin_head.py b/swh/indexer/tests/test_origin_head.py
index f87cf81..c137dd0 100644
--- a/swh/indexer/tests/test_origin_head.py
+++ b/swh/indexer/tests/test_origin_head.py
@@ -1,191 +1,170 @@
# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import unittest
from datetime import datetime, timezone
from swh.model.model import OriginVisit, OriginVisitStatus
from swh.indexer.origin_head import OriginHeadIndexer
from swh.indexer.tests.utils import BASE_TEST_CONFIG, fill_storage
from swh.storage.utils import now
+from swh.model.model import Origin, Snapshot, SnapshotBranch, TargetType
+
ORIGIN_HEAD_CONFIG = {
**BASE_TEST_CONFIG,
"tools": {"name": "origin-metadata", "version": "0.0.1", "configuration": {},},
"tasks": {"revision_intrinsic_metadata": None, "origin_intrinsic_metadata": None,},
}
class OriginHeadTestIndexer(OriginHeadIndexer):
"""Specific indexer whose configuration is enough to satisfy the
indexing tests.
"""
def parse_config_file(self, *args, **kwargs):
return ORIGIN_HEAD_CONFIG
def persist_index_computations(self, results, policy_update):
self.results = results
class OriginHead(unittest.TestCase):
def setUp(self):
self.indexer = OriginHeadTestIndexer()
self.indexer.catch_exceptions = False
fill_storage(self.indexer.storage)
def test_git(self):
- self.indexer.run(["https://github.com/SoftwareHeritage/swh-storage"])
+ origin_url = "https://github.com/SoftwareHeritage/swh-storage"
+ self.indexer.run([origin_url])
+ rev_id = b"8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}\xac\xefrm"
self.assertEqual(
- self.indexer.results,
- [
- {
- "revision_id": b"8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{"
- b"\xd7}\xac\xefrm",
- "origin_url": "https://github.com/SoftwareHeritage/swh-storage",
- }
- ],
+ self.indexer.results, [{"revision_id": rev_id, "origin_url": origin_url,}],
)
def test_git_partial_snapshot(self):
"""Checks partial snapshots are ignored."""
origin_url = "https://github.com/SoftwareHeritage/swh-core"
- self.indexer.storage.origin_add([{"url": origin_url,}])
+ self.indexer.storage.origin_add([Origin(url=origin_url)])
visit = self.indexer.storage.origin_visit_add(
[
OriginVisit(
origin=origin_url,
date=datetime(2019, 2, 27, tzinfo=timezone.utc),
type="git",
)
]
)[0]
self.indexer.storage.snapshot_add(
[
- {
- "id": b"foo",
- "branches": {
+ Snapshot(
+ branches={
b"foo": None,
- b"HEAD": {"target_type": "alias", "target": b"foo",},
+ b"HEAD": SnapshotBranch(
+ target_type=TargetType.ALIAS, target=b"foo",
+ ),
},
- }
+ ),
]
)
visit_status = OriginVisitStatus(
origin=origin_url,
visit=visit.visit,
date=now(),
status="partial",
snapshot=b"foo",
)
self.indexer.storage.origin_visit_status_add([visit_status])
self.indexer.run([origin_url])
self.assertEqual(self.indexer.results, [])
def test_vcs_missing_snapshot(self):
- self.indexer.storage.origin_add(
- [{"url": "https://github.com/SoftwareHeritage/swh-indexer",}]
- )
- self.indexer.run(["https://github.com/SoftwareHeritage/swh-indexer"])
+ origin_url = "https://github.com/SoftwareHeritage/swh-indexer"
+ self.indexer.storage.origin_add([Origin(url=origin_url)])
+ self.indexer.run([origin_url])
self.assertEqual(self.indexer.results, [])
def test_pypi_missing_branch(self):
origin_url = "https://pypi.org/project/abcdef/"
- self.indexer.storage.origin_add([{"url": origin_url,}])
+ self.indexer.storage.origin_add([Origin(url=origin_url,)])
visit = self.indexer.storage.origin_visit_add(
[
OriginVisit(
origin=origin_url,
date=datetime(2019, 2, 27, tzinfo=timezone.utc),
type="pypi",
)
]
)[0]
self.indexer.storage.snapshot_add(
[
- {
- "id": b"foo",
- "branches": {
+ Snapshot(
+ branches={
b"foo": None,
- b"HEAD": {"target_type": "alias", "target": b"foo",},
+ b"HEAD": SnapshotBranch(
+ target_type=TargetType.ALIAS, target=b"foo",
+ ),
},
- }
+ )
]
)
visit_status = OriginVisitStatus(
origin=origin_url,
visit=visit.visit,
date=now(),
status="full",
snapshot=b"foo",
)
self.indexer.storage.origin_visit_status_add([visit_status])
self.indexer.run(["https://pypi.org/project/abcdef/"])
self.assertEqual(self.indexer.results, [])
def test_ftp(self):
- self.indexer.run(["rsync://ftp.gnu.org/gnu/3dldf"])
+ origin_url = "rsync://ftp.gnu.org/gnu/3dldf"
+ self.indexer.run([origin_url])
+ rev_id = b"\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee\xcc\x1a\xb4`\x8c\x8by"
self.assertEqual(
- self.indexer.results,
- [
- {
- "revision_id": b"\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee"
- b"\xcc\x1a\xb4`\x8c\x8by",
- "origin_url": "rsync://ftp.gnu.org/gnu/3dldf",
- }
- ],
+ self.indexer.results, [{"revision_id": rev_id, "origin_url": origin_url,}],
)
def test_ftp_missing_snapshot(self):
- self.indexer.storage.origin_add([{"url": "rsync://ftp.gnu.org/gnu/foobar",}])
- self.indexer.run(["rsync://ftp.gnu.org/gnu/foobar"])
+ origin_url = "rsync://ftp.gnu.org/gnu/foobar"
+ self.indexer.storage.origin_add([Origin(url=origin_url)])
+ self.indexer.run([origin_url])
self.assertEqual(self.indexer.results, [])
def test_deposit(self):
- self.indexer.run(["https://forge.softwareheritage.org/source/jesuisgpl/"])
+ origin_url = "https://forge.softwareheritage.org/source/jesuisgpl/"
+ self.indexer.storage.origin_add([Origin(url=origin_url)])
+ self.indexer.run([origin_url])
+ rev_id = b"\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{\xa6\xe9\x99\xb1\x9e]q\xeb"
self.assertEqual(
- self.indexer.results,
- [
- {
- "revision_id": b"\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{"
- b"\xa6\xe9\x99\xb1\x9e]q\xeb",
- "origin_url": "https://forge.softwareheritage.org/source/"
- "jesuisgpl/",
- }
- ],
+ self.indexer.results, [{"revision_id": rev_id, "origin_url": origin_url,}],
)
def test_deposit_missing_snapshot(self):
- self.indexer.storage.origin_add(
- [{"url": "https://forge.softwareheritage.org/source/foobar",}]
- )
- self.indexer.run(["https://forge.softwareheritage.org/source/foobar"])
+ origin_url = "https://forge.softwareheritage.org/source/foobar"
+ self.indexer.storage.origin_add([Origin(url=origin_url,)])
+ self.indexer.run([origin_url])
self.assertEqual(self.indexer.results, [])
def test_pypi(self):
- self.indexer.run(["https://pypi.org/project/limnoria/"])
+ origin_url = "https://pypi.org/project/limnoria/"
+ self.indexer.run([origin_url])
+
+ rev_id = b"\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8kA\x10\x9d\xc5\xfa2\xf8t"
self.assertEqual(
- self.indexer.results,
- [
- {
- "revision_id": b"\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k"
- b"A\x10\x9d\xc5\xfa2\xf8t",
- "origin_url": "https://pypi.org/project/limnoria/",
- }
- ],
+ self.indexer.results, [{"revision_id": rev_id, "origin_url": origin_url}],
)
def test_svn(self):
- self.indexer.run(["http://0-512-md.googlecode.com/svn/"])
+ origin_url = "http://0-512-md.googlecode.com/svn/"
+ self.indexer.run([origin_url])
+ rev_id = b"\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8\xc9\xad#.\x1bw=\x18"
self.assertEqual(
- self.indexer.results,
- [
- {
- "revision_id": b"\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8"
- b"\xc9\xad#.\x1bw=\x18",
- "origin_url": "http://0-512-md.googlecode.com/svn/",
- }
- ],
+ self.indexer.results, [{"revision_id": rev_id, "origin_url": origin_url,}],
)
diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py
index 79e8de3..2533981 100644
--- a/swh/indexer/tests/test_origin_metadata.py
+++ b/swh/indexer/tests/test_origin_metadata.py
@@ -1,224 +1,212 @@
# Copyright (C) 2018-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from unittest.mock import patch
-from swh.model.hashutil import hash_to_bytes
-
from swh.indexer.metadata import OriginMetadataIndexer
-from .utils import YARN_PARSER_METADATA
+from swh.model.model import Origin
+
+from .utils import YARN_PARSER_METADATA, REVISION
from .test_metadata import REVISION_METADATA_CONFIG
def test_origin_metadata_indexer(idx_storage, storage, obj_storage):
indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
- indexer.run(["https://github.com/librariesio/yarn-parser"])
-
origin = "https://github.com/librariesio/yarn-parser"
- rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f")
+ indexer.run([origin])
+ rev_id = REVISION.id
rev_metadata = {
"id": rev_id,
"metadata": YARN_PARSER_METADATA,
"mappings": ["npm"],
}
origin_metadata = {
"id": origin,
"from_revision": rev_id,
"metadata": YARN_PARSER_METADATA,
"mappings": ["npm"],
}
results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
for result in results:
del result["tool"]
assert results == [rev_metadata]
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
for result in results:
del result["tool"]
assert results == [origin_metadata]
def test_origin_metadata_indexer_duplicate_origin(idx_storage, storage, obj_storage):
indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
indexer.storage = storage
indexer.idx_storage = idx_storage
indexer.run(["https://github.com/librariesio/yarn-parser"])
-
indexer.run(["https://github.com/librariesio/yarn-parser"] * 2)
origin = "https://github.com/librariesio/yarn-parser"
- rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f")
+ rev_id = REVISION.id
results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
assert len(results) == 1
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
assert len(results) == 1
def test_origin_metadata_indexer_missing_head(idx_storage, storage, obj_storage):
-
- storage.origin_add([{"url": "https://example.com"}])
+ storage.origin_add([Origin(url="https://example.com")])
indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
indexer.run(["https://example.com"])
origin = "https://example.com"
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
assert results == []
def test_origin_metadata_indexer_partial_missing_head(
idx_storage, storage, obj_storage
):
- storage.origin_add([{"url": "https://example.com"}])
-
- indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
- indexer.run(["https://example.com", "https://github.com/librariesio/yarn-parser"])
-
origin1 = "https://example.com"
origin2 = "https://github.com/librariesio/yarn-parser"
- rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f")
+ storage.origin_add([Origin(url=origin1)])
+ indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
+ indexer.run([origin1, origin2])
- rev_metadata = {
- "id": rev_id,
- "metadata": YARN_PARSER_METADATA,
- "mappings": ["npm"],
- }
- origin_metadata = {
- "id": origin2,
- "from_revision": rev_id,
- "metadata": YARN_PARSER_METADATA,
- "mappings": ["npm"],
- }
+ rev_id = REVISION.id
results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
for result in results:
del result["tool"]
- assert results == [rev_metadata]
+ assert results == [
+ {"id": rev_id, "metadata": YARN_PARSER_METADATA, "mappings": ["npm"],}
+ ]
results = list(
indexer.idx_storage.origin_intrinsic_metadata_get([origin1, origin2])
)
for result in results:
del result["tool"]
- assert results == [origin_metadata]
+ assert results == [
+ {
+ "id": origin2,
+ "from_revision": rev_id,
+ "metadata": YARN_PARSER_METADATA,
+ "mappings": ["npm"],
+ }
+ ]
def test_origin_metadata_indexer_duplicate_revision(idx_storage, storage, obj_storage):
indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
indexer.storage = storage
indexer.idx_storage = idx_storage
- indexer.run(
- [
- "https://github.com/librariesio/yarn-parser",
- "https://github.com/librariesio/yarn-parser.git",
- ]
- )
-
origin1 = "https://github.com/librariesio/yarn-parser"
origin2 = "https://github.com/librariesio/yarn-parser.git"
- rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f")
+ indexer.run([origin1, origin2])
+
+ rev_id = REVISION.id
results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
assert len(results) == 1
results = list(
indexer.idx_storage.origin_intrinsic_metadata_get([origin1, origin2])
)
assert len(results) == 2
def test_origin_metadata_indexer_no_metadata_file(idx_storage, storage, obj_storage):
indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
+ origin = "https://github.com/librariesio/yarn-parser"
with patch("swh.indexer.metadata_dictionary.npm.NpmMapping.filename", b"foo.json"):
- indexer.run(["https://github.com/librariesio/yarn-parser"])
+ indexer.run([origin])
- origin = "https://github.com/librariesio/yarn-parser"
- rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f")
+ rev_id = REVISION.id
results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
assert results == []
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
assert results == []
def test_origin_metadata_indexer_no_metadata(idx_storage, storage, obj_storage):
indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
+ origin = "https://github.com/librariesio/yarn-parser"
with patch(
"swh.indexer.metadata.RevisionMetadataIndexer"
".translate_revision_intrinsic_metadata",
return_value=(["npm"], {"@context": "foo"}),
):
- indexer.run(["https://github.com/librariesio/yarn-parser"])
+ indexer.run([origin])
- origin = "https://github.com/librariesio/yarn-parser"
- rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f")
+ rev_id = REVISION.id
results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
assert results == []
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
assert results == []
def test_origin_metadata_indexer_error(idx_storage, storage, obj_storage):
indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
+ origin = "https://github.com/librariesio/yarn-parser"
with patch(
"swh.indexer.metadata.RevisionMetadataIndexer"
".translate_revision_intrinsic_metadata",
return_value=None,
):
- indexer.run(["https://github.com/librariesio/yarn-parser"])
+ indexer.run([origin])
- origin = "https://github.com/librariesio/yarn-parser"
- rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f")
+ rev_id = REVISION.id
results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
assert results == []
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
assert results == []
def test_origin_metadata_indexer_delete_metadata(idx_storage, storage, obj_storage):
indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
- indexer.run(["https://github.com/librariesio/yarn-parser"])
-
origin = "https://github.com/librariesio/yarn-parser"
- rev_id = hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f")
+ indexer.run([origin])
+
+ rev_id = REVISION.id
results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
assert results != []
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
assert results != []
with patch("swh.indexer.metadata_dictionary.npm.NpmMapping.filename", b"foo.json"):
- indexer.run(["https://github.com/librariesio/yarn-parser"])
+ indexer.run([origin])
results = list(indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
assert results == []
results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
assert results == []
def test_origin_metadata_indexer_unknown_origin(idx_storage, storage, obj_storage):
indexer = OriginMetadataIndexer(config=REVISION_METADATA_CONFIG)
result = indexer.index_list(["https://unknown.org/foo"])
assert not result
diff --git a/swh/indexer/tests/utils.py b/swh/indexer/tests/utils.py
index a22211e..b3f0612 100644
--- a/swh/indexer/tests/utils.py
+++ b/swh/indexer/tests/utils.py
@@ -1,732 +1,774 @@
# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import abc
import functools
-import random
from typing import Dict, Any
import unittest
from hypothesis import strategies
from swh.model import hashutil
from swh.model.hashutil import hash_to_bytes, hash_to_hex
-from swh.model.model import OriginVisit, OriginVisitStatus
+from swh.model.model import (
+ Content,
+ Directory,
+ DirectoryEntry,
+ Origin,
+ OriginVisit,
+ OriginVisitStatus,
+ Person,
+ Revision,
+ RevisionType,
+ Snapshot,
+ SnapshotBranch,
+ TargetType,
+ Timestamp,
+ TimestampWithTimezone,
+)
from swh.storage.utils import now
from swh.indexer.storage import INDEXER_CFG_KEY
BASE_TEST_CONFIG: Dict[str, Dict[str, Any]] = {
- "storage": {"cls": "pipeline", "steps": [{"cls": "validate"}, {"cls": "memory"},]},
+ "storage": {"cls": "memory"},
"objstorage": {"cls": "memory", "args": {},},
INDEXER_CFG_KEY: {"cls": "memory", "args": {},},
}
+
+ORIGINS = [
+ Origin(url="https://github.com/SoftwareHeritage/swh-storage"),
+ Origin(url="rsync://ftp.gnu.org/gnu/3dldf"),
+ Origin(url="https://forge.softwareheritage.org/source/jesuisgpl/"),
+ Origin(url="https://pypi.org/project/limnoria/"),
+ Origin(url="http://0-512-md.googlecode.com/svn/"),
+ Origin(url="https://github.com/librariesio/yarn-parser"),
+ Origin(url="https://github.com/librariesio/yarn-parser.git"),
+]
+
+
ORIGIN_VISITS = [
- {"type": "git", "url": "https://github.com/SoftwareHeritage/swh-storage"},
- {"type": "ftp", "url": "rsync://ftp.gnu.org/gnu/3dldf"},
- {"type": "deposit", "url": "https://forge.softwareheritage.org/source/jesuisgpl/"},
- {"type": "pypi", "url": "https://pypi.org/project/limnoria/"},
- {"type": "svn", "url": "http://0-512-md.googlecode.com/svn/"},
- {"type": "git", "url": "https://github.com/librariesio/yarn-parser"},
- {"type": "git", "url": "https://github.com/librariesio/yarn-parser.git"},
+ {"type": "git", "origin": ORIGINS[0].url},
+ {"type": "ftp", "origin": ORIGINS[1].url},
+ {"type": "deposit", "origin": ORIGINS[2].url},
+ {"type": "pypi", "origin": ORIGINS[3].url},
+ {"type": "svn", "origin": ORIGINS[4].url},
+ {"type": "git", "origin": ORIGINS[5].url},
+ {"type": "git", "origin": ORIGINS[6].url},
]
+
+DIRECTORY = Directory(
+ id=hash_to_bytes("34f335a750111ca0a8b64d8034faec9eedc396be"),
+ entries=(
+ DirectoryEntry(
+ name=b"index.js",
+ type="file",
+ target=hash_to_bytes("01c9379dfc33803963d07c1ccc748d3fe4c96bb5"),
+ perms=0o100644,
+ ),
+ DirectoryEntry(
+ name=b"package.json",
+ type="file",
+ target=hash_to_bytes("26a9f72a7c87cc9205725cfd879f514ff4f3d8d5"),
+ perms=0o100644,
+ ),
+ DirectoryEntry(
+ name=b".github",
+ type="dir",
+ target=Directory(entries=()).id,
+ perms=0o040000,
+ ),
+ ),
+)
+
+DIRECTORY2 = Directory(
+ id=b"\xf8zz\xa1\x12`<1$\xfav\xf9\x01\xfd5\x85F`\xf2\xb6",
+ entries=(
+ DirectoryEntry(
+ name=b"package.json",
+ type="file",
+ target=hash_to_bytes("f5305243b3ce7ef8dc864ebc73794da304025beb"),
+ perms=0o100644,
+ ),
+ ),
+)
+
+REVISION = Revision(
+ id=hash_to_bytes("c6201cb1b9b9df9a7542f9665c3b5dfab85e9775"),
+ message=b"Improve search functionality",
+ author=Person(
+ name=b"Andrew Nesbitt",
+ fullname=b"Andrew Nesbitt ",
+ email=b"andrewnez@gmail.com",
+ ),
+ committer=Person(
+ name=b"Andrew Nesbitt",
+ fullname=b"Andrew Nesbitt ",
+ email=b"andrewnez@gmail.com",
+ ),
+ committer_date=TimestampWithTimezone(
+ timestamp=Timestamp(seconds=1380883849, microseconds=0,),
+ offset=120,
+ negative_utc=False,
+ ),
+ type=RevisionType.GIT,
+ synthetic=False,
+ date=TimestampWithTimezone(
+ timestamp=Timestamp(seconds=1487596456, microseconds=0,),
+ offset=0,
+ negative_utc=False,
+ ),
+ directory=DIRECTORY2.id,
+ parents=(),
+)
+
+REVISIONS = [REVISION]
+
SNAPSHOTS = [
- {
- "origin": "https://github.com/SoftwareHeritage/swh-storage",
- "branches": {
- b"refs/heads/add-revision-origin-cache": {
- "target": b'L[\xce\x1c\x88\x8eF\t\xf1"\x19\x1e\xfb\xc0'
- b"s\xe7/\xe9l\x1e",
- "target_type": "revision",
- },
- b"refs/head/master": {
- "target": b"8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}" b"\xac\xefrm",
- "target_type": "revision",
- },
- b"HEAD": {"target": b"refs/head/master", "target_type": "alias"},
- b"refs/tags/v0.0.103": {
- "target": b'\xb6"Im{\xfdLb\xb0\x94N\xea\x96m\x13x\x88+' b"\x0f\xdd",
- "target_type": "release",
- },
- },
- },
- {
- "origin": "rsync://ftp.gnu.org/gnu/3dldf",
- "branches": {
- b"3DLDF-1.1.4.tar.gz": {
- "target": b"dJ\xfb\x1c\x91\xf4\x82B%]6\xa2\x90|\xd3\xfc" b'"G\x99\x11',
- "target_type": "revision",
- },
- b"3DLDF-2.0.2.tar.gz": {
- "target": b"\xb6\x0e\xe7\x9e9\xac\xaa\x19\x9e="
- b"\xd1\xc5\x00\\\xc6\xfc\xe0\xa6\xb4V",
- "target_type": "revision",
- },
- b"3DLDF-2.0.3-examples.tar.gz": {
- "target": b"!H\x19\xc0\xee\x82-\x12F1\xbd\x97"
- b"\xfe\xadZ\x80\x80\xc1\x83\xff",
- "target_type": "revision",
- },
- b"3DLDF-2.0.3.tar.gz": {
- "target": b"\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee"
- b"\xcc\x1a\xb4`\x8c\x8by",
- "target_type": "revision",
- },
- b"3DLDF-2.0.tar.gz": {
- "target": b"F6*\xff(?\x19a\xef\xb6\xc2\x1fv$S\xe3G" b"\xd3\xd1m",
- "target_type": "revision",
- },
- },
- },
- {
- "origin": "https://forge.softwareheritage.org/source/jesuisgpl/",
- "branches": {
- b"master": {
- "target": b"\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{"
- b"\xa6\xe9\x99\xb1\x9e]q\xeb",
- "target_type": "revision",
- }
- },
- "id": b"h\xc0\xd2a\x04\xd4~'\x8d\xd6\xbe\x07\xeda\xfa\xfbV" b"\x1d\r ",
- },
- {
- "origin": "https://pypi.org/project/limnoria/",
- "branches": {
- b"HEAD": {"target": b"releases/2018.09.09", "target_type": "alias"},
- b"releases/2018.09.01": {
- "target": b"<\xee1(\xe8\x8d_\xc1\xc9\xa6rT\xf1\x1d"
- b"\xbb\xdfF\xfdw\xcf",
- "target_type": "revision",
- },
- b"releases/2018.09.09": {
- "target": b"\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k"
- b"A\x10\x9d\xc5\xfa2\xf8t",
- "target_type": "revision",
- },
- },
- "id": b"{\xda\x8e\x84\x7fX\xff\x92\x80^\x93V\x18\xa3\xfay" b"\x12\x9e\xd6\xb3",
- },
- {
- "origin": "http://0-512-md.googlecode.com/svn/",
- "branches": {
- b"master": {
- "target": b"\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8"
- b"\xc9\xad#.\x1bw=\x18",
- "target_type": "revision",
- }
+ Snapshot(
+ id=hash_to_bytes("a50fde72265343b7d28cecf6db20d98a81d21965"),
+ branches={
+ b"refs/heads/add-revision-origin-cache": SnapshotBranch(
+ target=b'L[\xce\x1c\x88\x8eF\t\xf1"\x19\x1e\xfb\xc0s\xe7/\xe9l\x1e',
+ target_type=TargetType.REVISION,
+ ),
+ b"refs/head/master": SnapshotBranch(
+ target=b"8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}\xac\xefrm",
+ target_type=TargetType.REVISION,
+ ),
+ b"HEAD": SnapshotBranch(
+ target=b"refs/head/master", target_type=TargetType.ALIAS
+ ),
+ b"refs/tags/v0.0.103": SnapshotBranch(
+ target=b'\xb6"Im{\xfdLb\xb0\x94N\xea\x96m\x13x\x88+\x0f\xdd',
+ target_type=TargetType.RELEASE,
+ ),
},
- "id": b"\xa1\xa2\x8c\n\xb3\x87\xa8\xf9\xe0a\x8c\xb7"
- b"\x05\xea\xb8\x1f\xc4H\xf4s",
- },
- {
- "origin": "https://github.com/librariesio/yarn-parser",
- "branches": {
- b"HEAD": {
- "target": hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f"),
- "target_type": "revision",
- }
+ ),
+ Snapshot(
+ id=hash_to_bytes("2c67f69a416bca4e1f3fcd848c588fab88ad0642"),
+ branches={
+ b"3DLDF-1.1.4.tar.gz": SnapshotBranch(
+ target=b'dJ\xfb\x1c\x91\xf4\x82B%]6\xa2\x90|\xd3\xfc"G\x99\x11',
+ target_type=TargetType.REVISION,
+ ),
+ b"3DLDF-2.0.2.tar.gz": SnapshotBranch(
+ target=b"\xb6\x0e\xe7\x9e9\xac\xaa\x19\x9e=\xd1\xc5\x00\\\xc6\xfc\xe0\xa6\xb4V", # noqa
+ target_type=TargetType.REVISION,
+ ),
+ b"3DLDF-2.0.3-examples.tar.gz": SnapshotBranch(
+ target=b"!H\x19\xc0\xee\x82-\x12F1\xbd\x97\xfe\xadZ\x80\x80\xc1\x83\xff", # noqa
+ target_type=TargetType.REVISION,
+ ),
+ b"3DLDF-2.0.3.tar.gz": SnapshotBranch(
+ target=b"\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee\xcc\x1a\xb4`\x8c\x8by", # noqa
+ target_type=TargetType.REVISION,
+ ),
+ b"3DLDF-2.0.tar.gz": SnapshotBranch(
+ target=b"F6*\xff(?\x19a\xef\xb6\xc2\x1fv$S\xe3G\xd3\xd1m",
+ target_type=TargetType.REVISION,
+ ),
},
- },
- {
- "origin": "https://github.com/librariesio/yarn-parser.git",
- "branches": {
- b"HEAD": {
- "target": hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f"),
- "target_type": "revision",
- }
+ ),
+ Snapshot(
+ id=hash_to_bytes("68c0d26104d47e278dd6be07ed61fafb561d0d20"),
+ branches={
+ b"master": SnapshotBranch(
+ target=b"\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{\xa6\xe9\x99\xb1\x9e]q\xeb", # noqa
+ target_type=TargetType.REVISION,
+ )
},
- },
-]
-
-
-REVISIONS = [
- {
- "id": hash_to_bytes("8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f"),
- "message": b"Improve search functionality",
- "author": {
- "name": b"Andrew Nesbitt",
- "fullname": b"Andrew Nesbitt ",
- "email": b"andrewnez@gmail.com",
+ ),
+ Snapshot(
+ id=hash_to_bytes("f255245269e15fc99d284affd79f766668de0b67"),
+ branches={
+ b"HEAD": SnapshotBranch(
+ target=b"releases/2018.09.09", target_type=TargetType.ALIAS
+ ),
+ b"releases/2018.09.01": SnapshotBranch(
+ target=b"<\xee1(\xe8\x8d_\xc1\xc9\xa6rT\xf1\x1d\xbb\xdfF\xfdw\xcf",
+ target_type=TargetType.REVISION,
+ ),
+ b"releases/2018.09.09": SnapshotBranch(
+ target=b"\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8kA\x10\x9d\xc5\xfa2\xf8t", # noqa
+ target_type=TargetType.REVISION,
+ ),
},
- "committer": {
- "name": b"Andrew Nesbitt",
- "fullname": b"Andrew Nesbitt ",
- "email": b"andrewnez@gmail.com",
+ ),
+ Snapshot(
+ id=hash_to_bytes("a1a28c0ab387a8f9e0618cb705eab81fc448f473"),
+ branches={
+ b"master": SnapshotBranch(
+ target=b"\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8\xc9\xad#.\x1bw=\x18",
+ target_type=TargetType.REVISION,
+ )
},
- "committer_date": {
- "negative_utc": False,
- "offset": 120,
- "timestamp": {"microseconds": 0, "seconds": 1380883849,},
+ ),
+ Snapshot(
+ id=hash_to_bytes("bb4fd3a836930ce629d912864319637040ff3040"),
+ branches={
+ b"HEAD": SnapshotBranch(
+ target=REVISION.id, target_type=TargetType.REVISION,
+ )
},
- "type": "git",
- "synthetic": False,
- "date": {
- "negative_utc": False,
- "timestamp": {"seconds": 1487596456, "microseconds": 0,},
- "offset": 0,
+ ),
+ Snapshot(
+ id=hash_to_bytes("bb4fd3a836930ce629d912864319637040ff3040"),
+ branches={
+ b"HEAD": SnapshotBranch(
+ target=REVISION.id, target_type=TargetType.REVISION,
+ )
},
- "directory": b"10",
- "parents": (),
- }
+ ),
]
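# SNAPSHOTS is kept aligned index-by-index with ORIGIN_VISITS: fill_storage()
# below zips the two lists together, which is why the two yarn-parser origins
# at the end each get their own (identical) snapshot entry.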
-DIRECTORY_ID = b"10"
-
-DIRECTORY_ENTRIES = [
- {"name": b"index.js", "type": "file", "target": b"abc", "perms": 33188,},
- {"name": b"package.json", "type": "file", "target": b"cde", "perms": 33188,},
- {"name": b".github", "type": "dir", "target": b"11", "perms": 16384,},
-]
SHA1_TO_LICENSES = {
"01c9379dfc33803963d07c1ccc748d3fe4c96bb5": ["GPL"],
"02fb2c89e14f7fab46701478c83779c7beb7b069": ["Apache2.0"],
"103bc087db1d26afc3a0283f38663d081e9b01e6": ["MIT"],
"688a5ef812c53907562fe379d4b3851e69c7cb15": ["AGPL"],
"da39a3ee5e6b4b0d3255bfef95601890afd80709": [],
}
SHA1_TO_CTAGS = {
"01c9379dfc33803963d07c1ccc748d3fe4c96bb5": [
{"name": "foo", "kind": "str", "line": 10, "lang": "bar",}
],
"d4c647f0fc257591cc9ba1722484229780d1c607": [
{"name": "let", "kind": "int", "line": 100, "lang": "haskell",}
],
"688a5ef812c53907562fe379d4b3851e69c7cb15": [
{"name": "symbol", "kind": "float", "line": 99, "lang": "python",}
],
}
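# Canned tool output, keyed by content sha1 (hex): these let the license- and
# ctags-based indexer tests run without invoking the real external tools.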
OBJ_STORAGE_DATA = {
"01c9379dfc33803963d07c1ccc748d3fe4c96bb5": b"this is some text",
"688a5ef812c53907562fe379d4b3851e69c7cb15": b"another text",
"8986af901dd2043044ce8f0d8fc039153641cf17": b"yet another text",
"02fb2c89e14f7fab46701478c83779c7beb7b069": b"""
import unittest
import logging
from swh.indexer.mimetype import MimetypeIndexer
from swh.indexer.tests.test_utils import MockObjStorage
class MockStorage():
def content_mimetype_add(self, mimetypes):
self.state = mimetypes
self.conflict_update = conflict_update
def indexer_configuration_add(self, tools):
return [{
'id': 10,
}]
""",
"103bc087db1d26afc3a0283f38663d081e9b01e6": b"""
#ifndef __AVL__
#define __AVL__
typedef struct _avl_tree avl_tree;
typedef struct _data_t {
int content;
} data_t;
""",
"93666f74f1cf635c8c8ac118879da6ec5623c410": b"""
(should 'pygments (recognize 'lisp 'easily))
""",
"26a9f72a7c87cc9205725cfd879f514ff4f3d8d5": b"""
{
"name": "test_metadata",
"version": "0.0.1",
"description": "Simple package.json test for indexer",
"repository": {
"type": "git",
"url": "https://github.com/moranegg/metadata_test"
}
}
""",
"d4c647f0fc257591cc9ba1722484229780d1c607": b"""
{
"version": "5.0.3",
"name": "npm",
"description": "a package manager for JavaScript",
"keywords": [
"install",
"modules",
"package manager",
"package.json"
],
"preferGlobal": true,
"config": {
"publishtest": false
},
"homepage": "https://docs.npmjs.com/",
"author": "Isaac Z. Schlueter (http://blog.izs.me)",
"repository": {
"type": "git",
"url": "https://github.com/npm/npm"
},
"bugs": {
"url": "https://github.com/npm/npm/issues"
},
"dependencies": {
"JSONStream": "~1.3.1",
"abbrev": "~1.1.0",
"ansi-regex": "~2.1.1",
"ansicolors": "~0.3.2",
"ansistyles": "~0.1.3"
},
"devDependencies": {
"tacks": "~1.2.6",
"tap": "~10.3.2"
},
"license": "Artistic-2.0"
}
""",
"a7ab314d8a11d2c93e3dcf528ca294e7b431c449": b"""
""",
"da39a3ee5e6b4b0d3255bfef95601890afd80709": b"",
- # 626364
- hash_to_hex(b"bcd"): b"unimportant content for bcd",
- # 636465
- hash_to_hex(
- b"cde"
- ): b"""
+ # was 626364 / b'bcd'
+ "e3e40fee6ff8a52f06c3b428bfe7c0ed2ef56e92": b"unimportant content for bcd",
+ # was 636465 / b'cde'; now the yarn-parser package.json
+ "f5305243b3ce7ef8dc864ebc73794da304025beb": b"""
{
"name": "yarn-parser",
"version": "1.0.0",
"description": "Tiny web service for parsing yarn.lock files",
"main": "index.js",
"scripts": {
"start": "node index.js",
"test": "mocha"
},
"engines": {
"node": "9.8.0"
},
"repository": {
"type": "git",
"url": "git+https://github.com/librariesio/yarn-parser.git"
},
"keywords": [
"yarn",
"parse",
"lock",
"dependencies"
],
"author": "Andrew Nesbitt",
"license": "AGPL-3.0",
"bugs": {
"url": "https://github.com/librariesio/yarn-parser/issues"
},
"homepage": "https://github.com/librariesio/yarn-parser#readme",
"dependencies": {
"@yarnpkg/lockfile": "^1.0.0",
"body-parser": "^1.15.2",
"express": "^4.14.0"
},
"devDependencies": {
"chai": "^4.1.2",
"mocha": "^5.2.0",
"request": "^2.87.0",
"test": "^0.6.0"
}
}
""",
}
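# Each OBJ_STORAGE_DATA key is expected to be the hex sha1 of its value (the
# blob contents themselves are mostly arbitrary). A quick sanity check for the
# empty blob:
#
#     import hashlib
#     assert (hashlib.sha1(b"").hexdigest()
#             == "da39a3ee5e6b4b0d3255bfef95601890afd80709")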
+
YARN_PARSER_METADATA = {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"url": "https://github.com/librariesio/yarn-parser#readme",
"codeRepository": "git+git+https://github.com/librariesio/yarn-parser.git",
"author": [{"type": "Person", "name": "Andrew Nesbitt"}],
"license": "https://spdx.org/licenses/AGPL-3.0",
"version": "1.0.0",
"description": "Tiny web service for parsing yarn.lock files",
"issueTracker": "https://github.com/librariesio/yarn-parser/issues",
"name": "yarn-parser",
"keywords": ["yarn", "parse", "lock", "dependencies"],
"type": "SoftwareSourceCode",
}
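# YARN_PARSER_METADATA is the CodeMeta document the npm mapping is expected to
# derive from the yarn-parser package.json blob above
# (f5305243b3ce7ef8dc864ebc73794da304025beb).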
json_dict_keys = strategies.one_of(
strategies.characters(),
strategies.just("type"),
strategies.just("url"),
strategies.just("name"),
strategies.just("email"),
strategies.just("@id"),
strategies.just("@context"),
strategies.just("repository"),
strategies.just("license"),
strategies.just("repositories"),
strategies.just("licenses"),
)
"""Hypothesis strategy that generates strings, with an emphasis on those
that are often used as dictionary keys in metadata files."""
generic_json_document = strategies.recursive(
strategies.none()
| strategies.booleans()
| strategies.floats()
| strategies.characters(),
lambda children: (
strategies.lists(children, min_size=1)
| strategies.dictionaries(json_dict_keys, children, min_size=1)
),
)
"""Hypothesis strategy that generates possible values for values of JSON
metadata files."""
def json_document_strategy(keys=None):
"""Generates an hypothesis strategy that generates metadata files
for a JSON-based format that uses the given keys."""
if keys is None:
keys = strategies.characters()
else:
keys = strategies.one_of(map(strategies.just, keys))
return strategies.dictionaries(keys, generic_json_document, min_size=1)
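# Hypothetical usage sketch, in the style of the adversarial mapping tests
# (names assumed for illustration):
#
#     @given(json_document_strategy(keys=list(NpmMapping.mapping)))
#     def test_npm_adversarial(self, doc):
#         raw = json.dumps(doc).encode()
#         MAPPINGS["NpmMapping"]().translate(raw)  # must not raise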
def _tree_to_xml(root, xmlns, data):
def encode(s):
"Skips unpaired surrogates generated by json_document_strategy"
return s.encode("utf8", "replace")
def to_xml(data, indent=b" "):
if data is None:
return b""
elif isinstance(data, (bool, str, int, float)):
return indent + encode(str(data))
elif isinstance(data, list):
return b"\n".join(to_xml(v, indent=indent) for v in data)
elif isinstance(data, dict):
lines = []
for (key, value) in data.items():
lines.append(indent + encode("<{}>".format(key)))
lines.append(to_xml(value, indent=indent + b" "))
lines.append(indent + encode("{}>".format(key)))
return b"\n".join(lines)
else:
raise TypeError(data)
return b"\n".join(
[
'<{} xmlns="{}">'.format(root, xmlns).encode(),
to_xml(data),
"{}>".format(root).encode(),
]
)
class TreeToXmlTest(unittest.TestCase):
def test_leaves(self):
self.assertEqual(
_tree_to_xml("root", "http://example.com", None),
b'<root xmlns="http://example.com">\n\n</root>',
)
self.assertEqual(
_tree_to_xml("root", "http://example.com", True),
b'<root xmlns="http://example.com">\n True\n</root>',
)
self.assertEqual(
_tree_to_xml("root", "http://example.com", "abc"),
b'<root xmlns="http://example.com">\n abc\n</root>',
)
self.assertEqual(
_tree_to_xml("root", "http://example.com", 42),
b'<root xmlns="http://example.com">\n 42\n</root>',
)
self.assertEqual(
_tree_to_xml("root", "http://example.com", 3.14),
b'<root xmlns="http://example.com">\n 3.14\n</root>',
)
def test_dict(self):
self.assertIn(
_tree_to_xml("root", "http://example.com", {"foo": "bar", "baz": "qux"}),
[
b'<root xmlns="http://example.com">\n'
b" <foo>\n  bar\n </foo>\n"
b" <baz>\n  qux\n </baz>\n"
b"</root>",
b'<root xmlns="http://example.com">\n'
b" <baz>\n  qux\n </baz>\n"
b" <foo>\n  bar\n </foo>\n"
b"</root>",
],
)
def test_list(self):
self.assertEqual(
_tree_to_xml(
"root", "http://example.com", [{"foo": "bar"}, {"foo": "baz"},]
),
b'<root xmlns="http://example.com">\n'
b" <foo>\n  bar\n </foo>\n"
b" <foo>\n  baz\n </foo>\n"
b"</root>",
)
def xml_document_strategy(keys, root, xmlns):
"""Generates an hypothesis strategy that generates metadata files
for an XML format that uses the given keys."""
return strategies.builds(
functools.partial(_tree_to_xml, root, xmlns), json_document_strategy(keys)
)
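# Hypothetical usage sketch for an XML-based mapping (names assumed; the
# xmlns below is the standard Maven POM namespace):
#
#     @given(xml_document_strategy(
#         keys=list(MavenMapping.mapping),
#         root="project",
#         xmlns="http://maven.apache.org/POM/4.0.0",
#     ))
#     def test_maven_adversarial(self, doc):
#         MAPPINGS["MavenMapping"]().translate(doc)  # doc is bytes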
def filter_dict(d, keys):
"return a copy of the dict with keys deleted"
if not isinstance(keys, (list, tuple)):
keys = (keys,)
return dict((k, v) for (k, v) in d.items() if k not in keys)
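# For example (a single key or an iterable of keys both work):
#
#     assert filter_dict({"a": 1, "b": 2, "c": 3}, "b") == {"a": 1, "c": 3}
#     assert filter_dict({"a": 1, "b": 2, "c": 3}, ("b", "c")) == {"a": 1}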
def fill_obj_storage(obj_storage):
"""Add some content in an object storage."""
for (obj_id, content) in OBJ_STORAGE_DATA.items():
obj_storage.add(content, obj_id=hash_to_bytes(obj_id))
def fill_storage(storage):
- visit_types = {}
- for visit in ORIGIN_VISITS:
- storage.origin_add([{"url": visit["url"]}])
- visit_types[visit["url"]] = visit["type"]
- for snap in SNAPSHOTS:
- origin_url = snap["origin"]
+ storage.origin_add(ORIGINS)
+ storage.directory_add([DIRECTORY, DIRECTORY2])
+ storage.revision_add(REVISIONS)
+ storage.snapshot_add(SNAPSHOTS)
+
+ for visit, snapshot in zip(ORIGIN_VISITS, SNAPSHOTS):
+ assert snapshot.id is not None
+
visit = storage.origin_visit_add(
- [OriginVisit(origin=origin_url, date=now(), type=visit_types[origin_url],)]
+ [OriginVisit(origin=visit["origin"], date=now(), type=visit["type"])]
)[0]
- snap_id = snap.get("id") or bytes([random.randint(0, 255) for _ in range(32)])
- storage.snapshot_add([{"id": snap_id, "branches": snap["branches"]}])
visit_status = OriginVisitStatus(
- origin=origin_url,
+ origin=visit.origin,
visit=visit.visit,
date=now(),
status="full",
- snapshot=snap_id,
+ snapshot=snapshot.id,
)
storage.origin_visit_status_add([visit_status])
- storage.revision_add(REVISIONS)
contents = []
for (obj_id, content) in OBJ_STORAGE_DATA.items():
content_hashes = hashutil.MultiHash.from_data(content).digest()
contents.append(
- {
- "data": content,
- "length": len(content),
- "status": "visible",
- "sha1": hash_to_bytes(obj_id),
- "sha1_git": hash_to_bytes(obj_id),
- "sha256": content_hashes["sha256"],
- "blake2s256": content_hashes["blake2s256"],
- }
+ Content(
+ data=content,
+ length=len(content),
+ status="visible",
+ sha1=hash_to_bytes(obj_id),
+ sha1_git=hash_to_bytes(obj_id),
+ sha256=content_hashes["sha256"],
+ blake2s256=content_hashes["blake2s256"],
+ )
)
storage.content_add(contents)
- storage.directory_add([{"id": DIRECTORY_ID, "entries": DIRECTORY_ENTRIES,}])
class CommonContentIndexerTest(metaclass=abc.ABCMeta):
legacy_get_format = False
"""True if and only if the tested indexer uses the legacy format.
see: https://forge.softwareheritage.org/T1433
"""
def get_indexer_results(self, ids):
"""Override this for indexers that don't have a mock storage."""
return self.indexer.idx_storage.state
def assert_legacy_results_ok(self, sha1s, expected_results=None):
# XXX old format, remove this when all endpoints are
# updated to the new one
# see: https://forge.softwareheritage.org/T1433
sha1s = [
sha1 if isinstance(sha1, bytes) else hash_to_bytes(sha1) for sha1 in sha1s
]
actual_results = list(self.get_indexer_results(sha1s))
if expected_results is None:
expected_results = self.expected_results
self.assertEqual(
len(expected_results),
len(actual_results),
(expected_results, actual_results),
)
for indexed_data in actual_results:
_id = indexed_data["id"]
expected_data = expected_results[hashutil.hash_to_hex(_id)].copy()
expected_data["id"] = _id
self.assertEqual(indexed_data, expected_data)
def assert_results_ok(self, sha1s, expected_results=None):
if self.legacy_get_format:
self.assert_legacy_results_ok(sha1s, expected_results)
return
sha1s = [
sha1 if isinstance(sha1, bytes) else hash_to_bytes(sha1) for sha1 in sha1s
]
actual_results = list(self.get_indexer_results(sha1s))
if expected_results is None:
expected_results = self.expected_results
self.assertEqual(
len(expected_results),
len(actual_results),
(expected_results, actual_results),
)
for indexed_data in actual_results:
(_id, indexed_data) = list(indexed_data.items())[0]
expected_data = expected_results[hashutil.hash_to_hex(_id)].copy()
expected_data = [expected_data]
self.assertEqual(indexed_data, expected_data)
def test_index(self):
"""Known sha1 have their data indexed
"""
sha1s = [self.id0, self.id1, self.id2]
# when
self.indexer.run(sha1s, policy_update="update-dups")
self.assert_results_ok(sha1s)
# 2nd pass
self.indexer.run(sha1s, policy_update="ignore-dups")
self.assert_results_ok(sha1s)
def test_index_one_unknown_sha1(self):
"""Unknown sha1 are not indexed"""
sha1s = [
self.id1,
"799a5ef812c53907562fe379d4b3851e69c7cb15", # unknown
"800a5ef812c53907562fe379d4b3851e69c7cb15",
] # unknown
# when
self.indexer.run(sha1s, policy_update="update-dups")
# then
expected_results = {
k: v for k, v in self.expected_results.items() if k in sha1s
}
self.assert_results_ok(sha1s, expected_results)
class CommonContentIndexerRangeTest:
"""Allows to factorize tests on range indexer.
"""
def setUp(self):
self.contents = sorted(OBJ_STORAGE_DATA)
def assert_results_ok(self, start, end, actual_results, expected_results=None):
if expected_results is None:
expected_results = self.expected_results
actual_results = list(actual_results)
for indexed_data in actual_results:
_id = indexed_data["id"]
assert isinstance(_id, bytes)
indexed_data = indexed_data.copy()
indexed_data["id"] = hash_to_hex(indexed_data["id"])
self.assertEqual(indexed_data, expected_results[hash_to_hex(_id)])
self.assertTrue(start <= _id <= end)
_tool_id = indexed_data["indexer_configuration_id"]
self.assertEqual(_tool_id, self.indexer.tool["id"])
def test__index_contents(self):
"""Indexing contents without existing data results in indexed data
"""
_start, _end = [self.contents[0], self.contents[2]] # output hex ids
start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = list(self.indexer._index_contents(start, end, indexed={}))
self.assert_results_ok(start, end, actual_results)
def test__index_contents_with_indexed_data(self):
"""Indexing contents with existing data results in less indexed data
"""
_start, _end = [self.contents[0], self.contents[2]] # output hex ids
start, end = map(hashutil.hash_to_bytes, (_start, _end))
data_indexed = [self.id0, self.id2]
# given
actual_results = self.indexer._index_contents(
start, end, indexed=set(map(hash_to_bytes, data_indexed))
)
# craft the expected results
expected_results = self.expected_results.copy()
for already_indexed_key in data_indexed:
expected_results.pop(already_indexed_key)
self.assert_results_ok(start, end, actual_results, expected_results)
def test_generate_content_get(self):
"""Optimal indexing should result in indexed data
"""
_start, _end = [self.contents[0], self.contents[2]] # output hex ids
start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = self.indexer.run(start, end)
# then
self.assertEqual(actual_results, {"status": "uneventful"})
def test_generate_content_get_input_as_bytes(self):
"""Optimal indexing should result in indexed data
Inputs are in bytes here.
"""
_start, _end = [self.contents[0], self.contents[2]] # output hex ids
start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = self.indexer.run(start, end, skip_existing=False)
# no already indexed data so same result as prior test
# then
self.assertEqual(actual_results, {"status": "uneventful"})
def test_generate_content_get_no_result(self):
"""No result indexed returns False"""
_start, _end = [
"0000000000000000000000000000000000000000",
"0000000000000000000000000000000000000001",
]
start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = self.indexer.run(start, end, incremental=False)
# then
self.assertEqual(actual_results, {"status": "uneventful"})