diff --git a/swh/loader/package/nixguix/loader.py b/swh/loader/package/nixguix/loader.py
--- a/swh/loader/package/nixguix/loader.py
+++ b/swh/loader/package/nixguix/loader.py
@@ -63,8 +63,11 @@
 
     def __init__(self, url):
         super().__init__(url=url)
+        unsupported_file_extensions = self.config.get("unsupported_file_extensions", [])
         self.raw_sources = retrieve_sources(url)
-        clean = clean_sources(parse_sources(self.raw_sources))
+        clean = clean_sources(
+            parse_sources(self.raw_sources), unsupported_file_extensions
+        )
         self.sources = clean["sources"]
         self.provider_url = url
 
@@ -214,13 +217,30 @@
     return json.loads(raw_sources.decode("utf-8"))
 
 
-# Known unsupported archive so far
-PATTERN_KNOWN_UNSUPPORTED_ARCHIVE = re.compile(
-    r".*\.(iso|whl|gem|pom|msi|pod|png|rock|ttf|jar|c|rpm|diff|patch)$", re.DOTALL
-)
+def make_pattern_unsupported_file_extension(unsupported_file_extensions: List[str]):
+    """Make a regexp pattern matching urls whose file extension is unsupported.
+
+    Args:
+        unsupported_file_extensions: File extensions (without the leading dot)
+            the loader cannot ingest. They are escaped so they match literally.
+
+    Returns:
+        Compiled pattern matching any string which ends with a dot followed by
+        one of the extensions. It matches nothing when no extension is provided.
+
+    """
+    if not unsupported_file_extensions:
+        # An empty alternation would match any url ending with a dot. Nothing is
+        # unsupported here, so return a pattern which can never match.
+        return re.compile(r"(?!)")
+
+    extensions = "|".join(map(re.escape, unsupported_file_extensions))
+    return re.compile(rf".*\.({extensions})$", re.DOTALL)
 
 
-def clean_sources(sources: Dict[str, Any]) -> Dict[str, Any]:
+def clean_sources(
+    sources: Dict[str, Any], unsupported_file_extensions=()
+) -> Dict[str, Any]:
     """Validate and clean the sources structure. First, ensure all top level keys are
     present. Then, walk the sources list and remove sources that do not contain required
     keys.
@@ -229,6 +249,7 @@
         - required keys are missing
         - source type is not supported
         - urls attribute type is not a list
+        - extension is known not to be supported by the loader
 
     Raises:
         ValueError if:
@@ -236,9 +257,12 @@
         - top-level version is not 1
 
     Returns:
-        Dict sources
+        source Dict cleaned up
 
     """
+    pattern_unsupported_file = make_pattern_unsupported_file_extension(
+        unsupported_file_extensions
+    )
     # Required top level keys
     required_keys = ["version", "revision", "sources"]
     missing_keys = []
@@ -287,7 +311,7 @@
         if valid and len(source["urls"]) > 0:  # Filter out unsupported archives
             supported_sources: List[str] = []
             for source_url in source["urls"]:
-                if PATTERN_KNOWN_UNSUPPORTED_ARCHIVE.match(source_url):
+                if pattern_unsupported_file.match(source_url):
                     logger.info(f"Skip unsupported artifact url {source_url}")
                     continue
                 supported_sources.append(source_url)
@@ -296,7 +320,7 @@
                 logger.info(
                     f"Skip source {source} because urls only reference "
                     "unsupported artifacts. Unsupported "
-                    f"artifacts so far: {PATTERN_KNOWN_UNSUPPORTED_ARCHIVE}"
+                    f"artifacts so far: {pattern_unsupported_file}"
                 )
                 continue
 
diff --git a/swh/loader/package/nixguix/tests/conftest.py b/swh/loader/package/nixguix/tests/conftest.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/nixguix/tests/conftest.py
@@ -0,0 +1,33 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from typing import Any, Dict
+
+import pytest
+
+
+@pytest.fixture
+def swh_loader_config(swh_storage_backend_config) -> Dict[str, Any]:
+    # nixguix loader needs a pg-storage backend because some tests share data
+    return {
+        "storage": swh_storage_backend_config,
+        "unsupported_file_extensions": [
+            "patch",
+            "iso",
+            "whl",
+            "gem",
+            "pom",
+            "msi",
+            "pod",
+            "png",
+            "rock",
+            "ttf",
+            "jar",
+            "c",
+            "el",
+            "rpm",
+            "diff",
+        ],
+    }
diff --git a/swh/loader/package/nixguix/tests/test_nixguix.py b/swh/loader/package/nixguix/tests/test_nixguix.py
--- a/swh/loader/package/nixguix/tests/test_nixguix.py
+++ b/swh/loader/package/nixguix/tests/test_nixguix.py
@@ -33,6 +33,7 @@
     parse_sources,
     retrieve_sources,
     clean_sources,
+    make_pattern_unsupported_file_extension,
 )
 from swh.loader.package.utils import download
 
@@ -166,7 +167,52 @@
     assert len(clean["sources"]) == len(valid_sources)
 
 
+def test_make_pattern_unsupported_file_extension():
+    unsupported_extensions = ["el", "c", "txt"]
+    supported_extensions = ["Z", "7z"]  # for test
+
+    actual_unsupported_pattern = make_pattern_unsupported_file_extension(
+        unsupported_extensions
+    )
+
+    for supported_ext in supported_extensions:
+        assert supported_ext not in unsupported_extensions
+
+        supported_filepath = f"anything.{supported_ext}"
+        actual_match = actual_unsupported_pattern.match(supported_filepath)
+        assert not actual_match
+
+    for unsupported_ext in unsupported_extensions:
+        unsupported_filepath = f"something.{unsupported_ext}"
+        actual_match = actual_unsupported_pattern.match(unsupported_filepath)
+        assert actual_match
+
+
+def test_make_pattern_unsupported_file_extension_no_extension():
+    actual_pattern = make_pattern_unsupported_file_extension([])
+
+    for filepath in ["anything.tar.gz", "trailing-dot.", "no-extension"]:
+        assert not actual_pattern.match(filepath)
+
+
 def test_clean_sources_unsupported_artifacts(swh_config, requests_mock_datadir):
+    unsupported_file_extensions = [
+        "iso",
+        "whl",
+        "gem",
+        "pom",
+        "msi",
+        "pod",
+        "png",
+        "rock",
+        "ttf",
+        "jar",
+        "c",
+        "el",
+        "rpm",
+        "diff",
+        "patch",
+    ]
     supported_sources = [
         {
             "type": "url",
@@ -195,22 +241,7 @@
             "urls": [f"https://server.org/my-url.{ext}"],
             "integrity": "my-integrity",
         }
-        for ext in [
-            "iso",
-            "whl",
-            "gem",
-            "pom",
-            "msi",
-            "pod",
-            "png",
-            "rock",
-            "ttf",
-            "jar",
-            "c",
-            "rpm",
-            "diff",
-            "patch",
-        ]
+        for ext in unsupported_file_extensions
     ]
 
     sources = {
@@ -219,7 +250,7 @@
         "revision": "my-revision",
     }
 
-    clean = clean_sources(sources)
+    clean = clean_sources(sources, unsupported_file_extensions)
 
     assert len(clean["sources"]) == len(supported_sources)
 