Page Menu | Home | Software Heritage

D3742.id13185.diff
No One | Temporary

D3742.id13185.diff

diff --git a/swh/loader/package/nixguix/loader.py b/swh/loader/package/nixguix/loader.py
--- a/swh/loader/package/nixguix/loader.py
+++ b/swh/loader/package/nixguix/loader.py
@@ -3,11 +3,14 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+import attr
+import copy
import json
import logging
-from typing import Any, Dict, Iterator, Mapping, Optional, Tuple
+import re
+
+from typing import Any, Dict, List, Iterator, Mapping, Optional, Tuple
-import attr
from swh.model import hashutil
from swh.model.model import (
@@ -211,6 +214,12 @@
return json.loads(raw_sources.decode("utf-8"))
+# Known unsupported archive so far
+PATTERN_KNOWN_UNSUPPORTED_ARCHIVE = re.compile(
+ r".*(iso|whl|gem|pom|msi|pod|png|rock|ttf|jar|c|rpm|diff|patch)$", re.DOTALL
+)
+
+
def clean_sources(sources: Dict[str, Any]) -> Dict[str, Any]:
"""Validate and clean the sources structure. First, ensure all top level keys are
present. Then, walk the sources list and remove sources that do not contain required
@@ -258,23 +267,42 @@
for required_key in required_keys:
if required_key not in source:
logger.info(
- "Skip source '%s' because key '%s' is missing", source, required_key
+ f"Skip source '{source}' because key '{required_key}' is missing",
)
valid = False
+
if valid and source["type"] != "url":
logger.info(
- "Skip source '%s' because the type %s is not supported",
- source,
- source["type"],
+ f"Skip source '{source}' because the type {source['type']} "
+ "is not supported",
)
valid = False
+
if valid and not isinstance(source["urls"], list):
logger.info(
- "Skip source '%s' because the urls attribute is not a list", source
+ f"Skip source {source} because the urls attribute is not a list"
)
valid = False
- if valid:
- verified_sources.append(source)
+
+ if valid and len(source["urls"]) > 0: # Filter out unsupported archives
+ supported_sources: List[str] = []
+ for source_url in source["urls"]:
+ if PATTERN_KNOWN_UNSUPPORTED_ARCHIVE.match(source_url):
+ logger.info(f"Skip unsupported artifact url {source_url}")
+ continue
+ supported_sources.append(source_url)
+
+ if len(supported_sources) == 0:
+ logger.info(
+ f"Skip source {source} because urls only reference "
+ "unsupported artifacts. Unsupported "
+ f"artifacts so far: {PATTERN_KNOWN_UNSUPPORTED_ARCHIVE}"
+ )
+ continue
+
+ new_source = copy.deepcopy(source)
+ new_source["urls"] = supported_sources
+ verified_sources.append(new_source)
sources["sources"] = verified_sources
return sources
diff --git a/swh/loader/package/nixguix/tests/test_nixguix.py b/swh/loader/package/nixguix/tests/test_nixguix.py
--- a/swh/loader/package/nixguix/tests/test_nixguix.py
+++ b/swh/loader/package/nixguix/tests/test_nixguix.py
@@ -142,17 +142,20 @@
def test_clean_sources_invalid_sources(swh_config, requests_mock_datadir):
+ valid_sources = [
+ # 1 valid source
+ {"type": "url", "urls": ["my-url.tar.gz"], "integrity": "my-integrity"},
+ ]
sources = {
"version": 1,
- "sources": [
- # Valid source
- {"type": "url", "urls": ["my-url"], "integrity": "my-integrity"},
+ "sources": valid_sources
+ + [
# integrity is missing
- {"type": "url", "urls": ["my-url"],},
+ {"type": "url", "urls": ["my-url.tgz"],},
# urls is not a list
- {"type": "url", "urls": "my-url", "integrity": "my-integrity"},
+ {"type": "url", "urls": "my-url.zip", "integrity": "my-integrity"},
# type is not url
- {"type": "git", "urls": ["my-url"], "integrity": "my-integrity"},
+ {"type": "git", "urls": ["my-url.zip"], "integrity": "my-integrity"},
# missing fields which got double-checked nonetheless...
{"integrity": "my-integrity"},
],
@@ -160,7 +163,65 @@
}
clean = clean_sources(sources)
- assert len(clean["sources"]) == 1
+ assert len(clean["sources"]) == len(valid_sources)
+
+
+def test_clean_sources_unsupported_artifacts(swh_config, requests_mock_datadir):
+ supported_sources = [
+ {
+ "type": "url",
+ "urls": [f"https://server.org/my-url.{ext}"],
+ "integrity": "my-integrity",
+ }
+ for ext in [
+ "known-unknown-but-ok", # this is fine as well with the current approach
+ "zip",
+ "tar.gz",
+ "tgz",
+ "tar.bz2",
+ "tbz",
+ "tbz2",
+ "tar.xz",
+ "tar",
+ "zip",
+ "7z",
+ "Z",
+ ]
+ ]
+
+ unsupported_sources = [
+ {
+ "type": "url",
+ "urls": [f"https://server.org/my-url.{ext}"],
+ "integrity": "my-integrity",
+ }
+ for ext in [
+ "iso",
+ "whl",
+ "gem",
+ "pom",
+ "msi",
+ "pod",
+ "png",
+ "rock",
+ "ttf",
+ "jar",
+ "c",
+ "rpm",
+ "diff",
+ "patch",
+ ]
+ ]
+
+ sources = {
+ "version": 1,
+ "sources": supported_sources + unsupported_sources,
+ "revision": "my-revision",
+ }
+
+ clean = clean_sources(sources)
+
+ assert len(clean["sources"]) == len(supported_sources)
def test_loader_one_visit(swh_config, requests_mock_datadir, raw_sources):

File Metadata

Mime Type
text/plain
Expires
Jul 3 2025, 9:54 AM (5 w, 1 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3234008

Event Timeline