diff --git a/swh/loader/package/nixguix/loader.py b/swh/loader/package/nixguix/loader.py
--- a/swh/loader/package/nixguix/loader.py
+++ b/swh/loader/package/nixguix/loader.py
@@ -3,11 +3,14 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+import attr
+import copy
 import json
 import logging
+import re
+
 from typing import Any, Dict, Iterator, Mapping, Optional, Tuple
 
-import attr
 
 from swh.model import hashutil
 from swh.model.model import (
@@ -211,6 +214,13 @@
     return json.loads(raw_sources.decode("utf-8"))
 
 
+# Extensions must follow a literal dot, so names that merely end in "zip" or "tar"
+# (e.g. ".../gzip") are not mistaken for supported archives.
+PATTERN_SUPPORTED_ARCHIVE = re.compile(
+    r".*\.(zip|tar\.gz|tar\.bz2|tar\.xz|tbz|tgz|tar)$", re.DOTALL
+)
+
+
 def clean_sources(sources: Dict[str, Any]) -> Dict[str, Any]:
     """Validate and clean the sources structure. First, ensure all top level keys are
     present. Then, walk the sources list and remove sources that do not contain required
@@ -258,23 +268,40 @@
         for required_key in required_keys:
             if required_key not in source:
                 logger.info(
-                    "Skip source '%s' because key '%s' is missing", source, required_key
+                    f"Skip source '{source}' because key '{required_key}' is missing",
                 )
                 valid = False
+
         if valid and source["type"] != "url":
             logger.info(
-                "Skip source '%s' because the type %s is not supported",
-                source,
-                source["type"],
+                f"Skip source '{source}' because the type {source['type']} "
+                "is not supported",
             )
             valid = False
+
         if valid and not isinstance(source["urls"], list):
             logger.info(
-                "Skip source '%s' because the urls attribute is not a list", source
+                f"Skip source {source} because the urls attribute is not a list"
             )
             valid = False
-        if valid:
-            verified_sources.append(source)
+
+        if valid and len(source["urls"]) > 0:  # Filter out unsupported archives
+            supported_sources = []
+            for source_url in source["urls"]:
+                if PATTERN_SUPPORTED_ARCHIVE.match(source_url):
+                    supported_sources.append(source_url)
+
+            if len(supported_sources) == 0:
+                logger.info(
+                    f"Skip source {source} because urls only reference"
+                    " unsupported artifacts. Supported"
+                    f" artifacts so far: {PATTERN_SUPPORTED_ARCHIVE}"
+                )
+                continue
+
+            new_source = copy.deepcopy(source)
+            new_source["urls"] = supported_sources
+            verified_sources.append(new_source)
 
     sources["sources"] = verified_sources
     return sources
diff --git a/swh/loader/package/nixguix/tests/test_nixguix.py b/swh/loader/package/nixguix/tests/test_nixguix.py
--- a/swh/loader/package/nixguix/tests/test_nixguix.py
+++ b/swh/loader/package/nixguix/tests/test_nixguix.py
@@ -142,17 +142,20 @@
 
 
 def test_clean_sources_invalid_sources(swh_config, requests_mock_datadir):
+    valid_sources = [
+        # 1 valid source
+        {"type": "url", "urls": ["my-url.tar.gz"], "integrity": "my-integrity"},
+    ]
     sources = {
         "version": 1,
-        "sources": [
-            # Valid source
-            {"type": "url", "urls": ["my-url"], "integrity": "my-integrity"},
+        "sources": valid_sources
+        + [
             # integrity is missing
-            {"type": "url", "urls": ["my-url"],},
+            {"type": "url", "urls": ["my-url.tgz"],},
             # urls is not a list
-            {"type": "url", "urls": "my-url", "integrity": "my-integrity"},
+            {"type": "url", "urls": "my-url.zip", "integrity": "my-integrity"},
             # type is not url
-            {"type": "git", "urls": ["my-url"], "integrity": "my-integrity"},
+            {"type": "git", "urls": ["my-url.zip"], "integrity": "my-integrity"},
             # missing fields which got double-checked nonetheless...
             {"integrity": "my-integrity"},
         ],
@@ -160,7 +163,38 @@
     }
     clean = clean_sources(sources)
 
-    assert len(clean["sources"]) == 1
+    assert len(clean["sources"]) == len(valid_sources)
+
+
+def test_clean_sources_unsupported_artifacts(swh_config, requests_mock_datadir):
+    valid_sources = [
+        {
+            "type": "url",
+            "urls": [f"https://server.org/my-url.{ext}"],
+            "integrity": "my-integrity",
+        }
+        for ext in ["zip", "tar.gz", "tar.bz2", "tbz", "tar.xz", "tgz", "tar", "zip"]
+    ]
+
+    sources = {
+        "version": 1,
+        "sources": valid_sources
+        + [
+            # Mostly Valid but file are not supported so filtered out
+            {
+                "type": "url",
+                "urls": ["https://server.org/my-url.patch"],
+                "integrity": "my-integrity",
+            },
+            # Same gem not supported
+            {"type": "url", "urls": ["http://my-url.gem"], "integrity": "integrity"},
+        ],
+        "revision": "my-revision",
+    }
+
+    clean = clean_sources(sources)
+
+    assert len(clean["sources"]) == len(valid_sources)
 
 
 def test_loader_one_visit(swh_config, requests_mock_datadir, raw_sources):