diff --git a/swh/loader/package/nixguix/loader.py b/swh/loader/package/nixguix/loader.py
--- a/swh/loader/package/nixguix/loader.py
+++ b/swh/loader/package/nixguix/loader.py
@@ -3,11 +3,14 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+import attr
+import copy
 import json
 import logging
-from typing import Any, Dict, Iterator, Mapping, Optional, Tuple
+import re
+
+from typing import Any, Dict, List, Iterator, Mapping, Optional, Tuple
 
-import attr
 
 from swh.model import hashutil
 from swh.model.model import (
@@ -211,6 +214,12 @@
     return json.loads(raw_sources.decode("utf-8"))
 
 
+# Known unsupported archive formats so far
+PATTERN_KNOWN_UNSUPPORTED_ARCHIVE = re.compile(
+    r".*\.(iso|whl|gem|pom|msi|pod|png|rock|ttf|jar|c|rpm|diff|patch)$", re.DOTALL
+)
+
+
 def clean_sources(sources: Dict[str, Any]) -> Dict[str, Any]:
     """Validate and clean the sources structure. First, ensure all top level keys are
     present. Then, walk the sources list and remove sources that do not contain required
@@ -258,23 +267,42 @@
         for required_key in required_keys:
             if required_key not in source:
                 logger.info(
-                    "Skip source '%s' because key '%s' is missing", source, required_key
+                    f"Skip source '{source}' because key '{required_key}' is missing",
                 )
                 valid = False
+
         if valid and source["type"] != "url":
             logger.info(
-                "Skip source '%s' because the type %s is not supported",
-                source,
-                source["type"],
+                f"Skip source '{source}' because the type {source['type']} "
+                "is not supported",
             )
             valid = False
+
         if valid and not isinstance(source["urls"], list):
             logger.info(
-                "Skip source '%s' because the urls attribute is not a list", source
+                f"Skip source {source} because the urls attribute is not a list"
            )
             valid = False
-        if valid:
-            verified_sources.append(source)
+
+        if valid and len(source["urls"]) > 0:  # Filter out unsupported archives
+            supported_sources: List[str] = []
+            for source_url in source["urls"]:
+                if PATTERN_KNOWN_UNSUPPORTED_ARCHIVE.match(source_url):
+                    logger.info(f"Skip unsupported artifact url {source_url}")
+                    continue
+                supported_sources.append(source_url)
+
+            if len(supported_sources) == 0:
+                logger.info(
+                    f"Skip source {source} because urls only reference "
+                    "unsupported artifacts. Unsupported "
+                    f"artifacts so far: {PATTERN_KNOWN_UNSUPPORTED_ARCHIVE}"
+                )
+                continue
+
+            new_source = copy.deepcopy(source)
+            new_source["urls"] = supported_sources
+            verified_sources.append(new_source)
 
     sources["sources"] = verified_sources
     return sources
diff --git a/swh/loader/package/nixguix/tests/test_nixguix.py b/swh/loader/package/nixguix/tests/test_nixguix.py
--- a/swh/loader/package/nixguix/tests/test_nixguix.py
+++ b/swh/loader/package/nixguix/tests/test_nixguix.py
@@ -142,17 +142,20 @@
 
 
 def test_clean_sources_invalid_sources(swh_config, requests_mock_datadir):
+    valid_sources = [
+        # 1 valid source
+        {"type": "url", "urls": ["my-url.tar.gz"], "integrity": "my-integrity"},
+    ]
     sources = {
         "version": 1,
-        "sources": [
-            # Valid source
-            {"type": "url", "urls": ["my-url"], "integrity": "my-integrity"},
+        "sources": valid_sources
+        + [
             # integrity is missing
-            {"type": "url", "urls": ["my-url"],},
+            {"type": "url", "urls": ["my-url.tgz"],},
             # urls is not a list
-            {"type": "url", "urls": "my-url", "integrity": "my-integrity"},
+            {"type": "url", "urls": "my-url.zip", "integrity": "my-integrity"},
             # type is not url
-            {"type": "git", "urls": ["my-url"], "integrity": "my-integrity"},
+            {"type": "git", "urls": ["my-url.zip"], "integrity": "my-integrity"},
             # missing fields which got double-checked nonetheless...
             {"integrity": "my-integrity"},
         ],
@@ -160,7 +163,65 @@
     }
 
     clean = clean_sources(sources)
-    assert len(clean["sources"]) == 1
+    assert len(clean["sources"]) == len(valid_sources)
+
+
+def test_clean_sources_unsupported_artifacts(swh_config, requests_mock_datadir):
+    supported_sources = [
+        {
+            "type": "url",
+            "urls": [f"https://server.org/my-url.{ext}"],
+            "integrity": "my-integrity",
+        }
+        for ext in [
+            "known-unknown-but-ok",  # this is fine as well with the current approach
+            "zip",
+            "tar.gz",
+            "tgz",
+            "tar.bz2",
+            "tbz",
+            "tbz2",
+            "tar.xz",
+            "tar",
+            "zip",
+            "7z",
+            "Z",
+        ]
+    ]
+
+    unsupported_sources = [
+        {
+            "type": "url",
+            "urls": [f"https://server.org/my-url.{ext}"],
+            "integrity": "my-integrity",
+        }
+        for ext in [
+            "iso",
+            "whl",
+            "gem",
+            "pom",
+            "msi",
+            "pod",
+            "png",
+            "rock",
+            "ttf",
+            "jar",
+            "c",
+            "rpm",
+            "diff",
+            "patch",
+        ]
+    ]
+
+    sources = {
+        "version": 1,
+        "sources": supported_sources + unsupported_sources,
+        "revision": "my-revision",
+    }
+
+    clean = clean_sources(sources)
+
+    assert len(clean["sources"]) == len(supported_sources)
 
 
 def test_loader_one_visit(swh_config, requests_mock_datadir, raw_sources):
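Note: the snippet below is a reviewer's sketch, not part of the patch. It walks through the new filtering behaviour of clean_sources; only clean_sources, PATTERN_KNOWN_UNSUPPORTED_ARCHIVE, and the required top-level keys (version, sources, revision) come from the change above, while the URLs and integrity values are made up for illustration.

from swh.loader.package.nixguix.loader import clean_sources

# Two made-up sources: one mixing a supported tarball with an unsupported
# .iso image, one referencing only an unsupported .ttf artifact.
sources = {
    "version": 1,
    "revision": "my-revision",
    "sources": [
        {
            "type": "url",
            "urls": [
                "https://example.org/hello-2.10.tar.gz",  # kept
                "https://example.org/hello-2.10.iso",  # filtered out by the pattern
            ],
            "integrity": "my-integrity",
        },
        {
            "type": "url",
            "urls": ["https://example.org/font.ttf"],  # only unsupported urls
            "integrity": "my-integrity",
        },
    ],
}

clean = clean_sources(sources)

# The first source survives with its unsupported url dropped; the second
# source is skipped entirely because none of its urls are supported.
assert len(clean["sources"]) == 1
assert clean["sources"][0]["urls"] == ["https://example.org/hello-2.10.tar.gz"]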