diff --git a/swh/loader/package/nixguix/loader.py b/swh/loader/package/nixguix/loader.py --- a/swh/loader/package/nixguix/loader.py +++ b/swh/loader/package/nixguix/loader.py @@ -4,6 +4,7 @@ # See top-level LICENSE file for more information import json +import logging import requests from typing import Dict, Optional, Any, Mapping @@ -19,6 +20,9 @@ from swh.loader.package.loader import PackageLoader +logger = logging.getLogger(__name__) + + def retrieve_sources(url: str) -> Dict[str, Any]: response = requests.get(url, allow_redirects=True) @@ -29,6 +33,59 @@ return json.loads(response.content.decode('utf-8')) +def clean_sources(sources: Dict[str, Any]) -> Dict[str, Any]: + """Validate and clean the sources structure. First, it ensures all top + level keys are presents. Then, it walks on the sources list + and removes sources that don't contain required keys. + + Raises: + ValueError: if a top level key is missing + + """ + # Required top level keys + required_keys = ['version', 'revision', 'sources'] + missing_keys = [] + for required_key in required_keys: + if required_key not in sources: + missing_keys.append(required_key) + + if missing_keys != []: + raise ValueError("sources structure invalid, missing: %s", + ",".join(missing_keys)) + + # Only the version 1 is currently supported + if sources['version'] != 1: + raise ValueError("The sources structure version '%d' is not supported", + sources['version']) + + # If a source doesn't contain required attributes, this source is + # skipped but others could still be archived. + verified_sources = [] + for source in sources['sources']: + valid = True + required_keys = ['urls', 'integrity', 'type'] + for required_key in required_keys: + if required_key not in source: + logger.info("Skip source '%s' because key '%s' is missing", + source, required_key) + valid = False + if source['type'] != 'url': + logger.info( + "Skip source '%s' because the type %s is not supported", + source, source['type']) + valid = False + if not isinstance(source['urls'], list): + logger.info( + "Skip source '%s' because the urls attribute is not a list", + source) + valid = False + if valid: + verified_sources.append(source) + + sources['sources'] = verified_sources + return sources + + class NixGuixLoader(PackageLoader): """Load sources from a sources.json file. This loader is used to load sources used by functional package manager (eg. Nix and Guix). @@ -38,8 +95,9 @@ def __init__(self, url): super().__init__(url=url) - s = retrieve_sources(url) - self.sources = s['sources'] + raw = retrieve_sources(url) + clean = clean_sources(raw) + self.sources = clean['sources'] self.provider_url = url self._integrityByUrl = {s['urls'][0]: s['integrity'] @@ -48,7 +106,7 @@ # The revision used to create the sources.json file. For Nix, # this revision belongs to the github.com/nixos/nixpkgs # repository - self.revision = s['revision'] + self.revision = clean['revision'] # Note: this could be renamed get_artifacts in the PackageLoader # base class. diff --git a/swh/loader/package/nixguix/tests/test_nixguix.py b/swh/loader/package/nixguix/tests/test_nixguix.py --- a/swh/loader/package/nixguix/tests/test_nixguix.py +++ b/swh/loader/package/nixguix/tests/test_nixguix.py @@ -7,7 +7,7 @@ from json.decoder import JSONDecodeError from swh.loader.package.nixguix.loader import ( - NixGuixLoader, retrieve_sources + NixGuixLoader, retrieve_sources, clean_sources ) from swh.loader.package.tests.common import ( @@ -33,6 +33,60 @@ NixGuixLoader('https://example.com/file.txt') +def test_clean_sources_invalid_schema(swh_config, requests_mock_datadir): + sources = {} + with pytest.raises(ValueError, + match="sources structure invalid, missing: .*"): + clean_sources(sources) + + +def test_clean_sources_invalid_version(swh_config, requests_mock_datadir): + sources = { + 'version': 2, + 'sources': [], + 'revision': 'my-revision' + } + + with pytest.raises(ValueError, + match="sources structure version .* is not supported"): + clean_sources(sources) + + +def test_clean_sources_invalid_sources(swh_config, requests_mock_datadir): + sources = { + 'version': 1, + 'sources': [ + # Valid source + { + 'type': 'url', + 'urls': ['my-url'], + 'integrity': 'my-integrity' + }, + # integrity is missing + { + 'type': 'url', + 'urls': ['my-url'], + }, + # urls is not a list + { + 'type': 'url', + 'urls': 'my-url', + 'integrity': 'my-integrity' + }, + # type is not url + { + 'type': 'git', + 'urls': ['my-url'], + 'integrity': 'my-integrity' + } + ], + 'revision': 'my-revision' + } + clean = clean_sources(sources) + + assert len(clean['sources']) == 1 + + def test_loader_one_visit(swh_config, requests_mock_datadir): loader = NixGuixLoader(sources_url) res = loader.load() diff --git a/swh/loader/package/nixguix/tests/test_tasks.py b/swh/loader/package/nixguix/tests/test_tasks.py --- a/swh/loader/package/nixguix/tests/test_tasks.py +++ b/swh/loader/package/nixguix/tests/test_tasks.py @@ -12,6 +12,7 @@ mock_retrieve_sources = mocker.patch( 'swh.loader.package.nixguix.loader.retrieve_sources') mock_retrieve_sources.return_value = { + 'version': 1, 'sources': [], 'revision': 'some-revision' }