Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7066621
D8606.id.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
10 KB
Subscribers
None
D8606.id.diff
View Options
diff --git a/swh/lister/nixguix/lister.py b/swh/lister/nixguix/lister.py
--- a/swh/lister/nixguix/lister.py
+++ b/swh/lister/nixguix/lister.py
@@ -41,6 +41,13 @@
pass
+class ArtifactNatureMistyped(ValueError):
+ """Raised when a remote artifact's neither a tarball nor a file. It's probably a
+ misconfiguration in the manifest that badly typed a vcs repository."""
+
+ pass
+
+
@dataclass
class OriginUpstream:
"""Upstream origin (e.g. NixOS/nixpkgs, Guix/Guix)."""
@@ -73,10 +80,10 @@
origin: str
"""Origin url of the vcs"""
- ref: Optional[str]
- """Reference either a svn commit id, a git commit, ..."""
type: str
"""Type of (d)vcs, e.g. svn, git, hg, ..."""
+ ref: Optional[str] = None
+ """Reference either a svn commit id, a git commit, ..."""
class ArtifactType(Enum):
@@ -109,6 +116,8 @@
Raises:
ArtifactNatureUndetected when the artifact's nature cannot be detected out
of its url
+ ArtifactNatureMistyped when the artifact is not a tarball nor a file. It's up to
+ the caller to do what's right with it.
Returns: A tuple (bool, url). The boolean represents whether the url is an archive
or not. The second parameter is the actual url once the head request is issued
@@ -123,27 +132,37 @@
IndexError in case no extension is available
"""
- return Path(urlparse(url).path).suffixes[-1].lstrip(".") in TARBALL_EXTENSIONS
+ urlparsed = urlparse(url)
+ if urlparsed.scheme not in ("http", "https", "ftp"):
+ raise ArtifactNatureMistyped(f"Mistyped artifact '{url}'")
+ return Path(urlparsed.path).suffixes[-1].lstrip(".") in TARBALL_EXTENSIONS
index = random.randrange(len(urls))
url = urls[index]
+
try:
is_tar = _is_tarball(url)
return is_tar, urls[0]
except IndexError:
if request is None:
raise ArtifactNatureUndetected(
- "Cannot determine artifact type from url %s", url
+ f"Cannot determine artifact type from url <{url}>"
)
logger.warning(
- "Cannot detect extension for '%s'. Fallback to http head query",
+ "Cannot detect extension for <%s>. Fallback to http head query",
url,
)
- response = request.head(url)
+
+ try:
+ response = request.head(url)
+ except requests.exceptions.InvalidSchema:
+ raise ArtifactNatureUndetected(
+ f"Cannot determine artifact type from url <{url}>"
+ )
if not response.ok or response.status_code == 404:
raise ArtifactNatureUndetected(
- "Cannot determine artifact type from url %s", url
+ f"Cannot determine artifact type from url <{url}>"
)
location = response.headers.get("Location")
if location: # It's not always present
@@ -154,7 +173,7 @@
return _is_tarball(location), location
except IndexError:
logger.warning(
- "Still cannot detect extension through location '%s'...",
+ "Still cannot detect extension through location <%s>...",
url,
)
@@ -166,7 +185,7 @@
return content_type in POSSIBLE_TARBALL_MIMETYPES, urls[0]
raise ArtifactNatureUndetected(
- "Cannot determine artifact type from url %s", url
+ f"Cannot determine artifact type from url <{url}>"
)
@@ -287,23 +306,48 @@
urls = artifact.get("urls")
if not urls:
# Nothing to fetch
- logger.warning("Skipping url '%s': empty artifact", artifact)
+ logger.warning("Skipping url <%s>: empty artifact", artifact)
continue
assert urls is not None
+
+ # Deal with misplaced origins
+
# FIXME: T3294: Fix missing scheme in urls
origin, *fallback_urls = urls
integrity = artifact.get("integrity")
if integrity is None:
- logger.warning("Skipping url '%s': missing integrity field", origin)
+ logger.warning("Skipping url <%s>: missing integrity field", origin)
continue
try:
is_tar, origin = is_tarball(urls, self.session)
+ except ArtifactNatureMistyped:
+ logger.warning(
+ "Mistyped url <%s>: trying to deal with it properly", origin
+ )
+ urlparsed = urlparse(origin)
+ artifact_type = urlparsed.scheme
+ if artifact_type in VCS_SUPPORTED:
+ artifact_url = (
+ self.github_session.get_canonical_url(origin)
+ if self.github_session
+ else origin
+ )
+ if not artifact_url:
+ continue
+ yield ArtifactType.VCS, VCS(
+ origin=artifact_url, type=artifact_type
+ )
+ else:
+ logger.warning(
+ "Skipping url <%s>: undetected remote artifact type", origin
+ )
+ continue
except ArtifactNatureUndetected:
logger.warning(
- "Skipping url '%s': undetected remote artifact type", origin
+ "Skipping url <%s>: undetected remote artifact type", origin
)
continue
@@ -325,7 +369,7 @@
)
else:
logger.warning(
- "Skipping artifact '%s': unsupported type %s",
+ "Skipping artifact <%s>: unsupported type %s",
artifact,
artifact_type,
)
diff --git a/swh/lister/nixguix/tests/data/guix-swh_sources.json b/swh/lister/nixguix/tests/data/guix-swh_sources.json
--- a/swh/lister/nixguix/tests/data/guix-swh_sources.json
+++ b/swh/lister/nixguix/tests/data/guix-swh_sources.json
@@ -12,6 +12,18 @@
{
"type": "url",
"urls": [ "https://example.org/another-file-no-integrity-so-skipped.txt" ]
+ },
+ {
+ "type": "url",
+ "urls": [
+ "ftp://ftp.ourproject.org/file-with-no-extension"
+ ],
+ "integrity": "sha256-bss09x9yOnuW+Q5BHHjf8nNcCNxCKMdl9/2/jKSFcrQ="
+ },
+ {
+ "type": "url",
+ "urls": [ "unknown://example.org/wrong-scheme-so-skipped.txt" ],
+ "integrity": "sha256-wAEswtkl3ulAw3zq4perrGS6Wlww5XXnQYsEAoYT9fI="
}
],
"version":"1",
diff --git a/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json b/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json
--- a/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json
+++ b/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json
@@ -22,6 +22,13 @@
],
"integrity": "sha256-1w3NdfRzp9XIFDLD2SYJJr+Nnf9c1UF5YWlJfRxSLt0="
},
+ {
+ "type": "url",
+ "urls": [
+ "ftp://ftp.ourproject.org/pub/ytalk/ytalk-3.3.0.tar.gz"
+ ],
+ "integrity": "sha256-bss09x9yOnuW+Q5BHHjf8nNcCNxCKMdl9/2/jKSFcrQ="
+ },
{
"type": "url",
"urls": [
@@ -45,6 +52,11 @@
"type": "svn",
"svn_url": "https://code.call-cc.org/svn/chicken-eggs/release/5/iset/tags/2.2",
"svn_revision": 39057
+ },
+ {
+ "type": "url",
+ "urls": ["svn://svn.code.sf.net/p/acme-crossass/code-0/trunk"],
+ "integrity": "sha256-VifIQ+UEVMKJ+cNS+Xxusazinr5Cgu1lmGuhqj/5Mpk="
}
],
"version": "1",
diff --git a/swh/lister/nixguix/tests/test_lister.py b/swh/lister/nixguix/tests/test_lister.py
--- a/swh/lister/nixguix/tests/test_lister.py
+++ b/swh/lister/nixguix/tests/test_lister.py
@@ -15,6 +15,7 @@
from swh.lister import TARBALL_EXTENSIONS
from swh.lister.nixguix.lister import (
POSSIBLE_TARBALL_MIMETYPES,
+ ArtifactNatureMistyped,
ArtifactNatureUndetected,
NixGuixLister,
is_tarball,
@@ -31,19 +32,20 @@
@pytest.mark.parametrize(
- "urls",
+ "tarballs",
[[f"one.{ext}", f"two.{ext}"] for ext in TARBALL_EXTENSIONS]
+ [[f"one.{ext}?foo=bar"] for ext in TARBALL_EXTENSIONS],
)
-def test_is_tarball_simple(urls):
+def test_is_tarball_simple(tarballs):
"""Simple check on tarball should discriminate betwenn tarball and file"""
+ urls = [f"https://example.org/{tarball}" for tarball in tarballs]
is_tar, origin = is_tarball(urls)
assert is_tar is True
assert origin == urls[0]
@pytest.mark.parametrize(
- "urls",
+ "files",
[
["abc.lisp"],
["one.abc", "two.bcd"],
@@ -52,8 +54,9 @@
["config.nix", "flakes.nix"],
],
)
-def test_is_tarball_simple_not_tarball(urls):
+def test_is_tarball_simple_not_tarball(files):
"""Simple check on tarball should discriminate betwenn tarball and file"""
+ urls = [f"http://example.org/{file}" for file in files]
is_tar, origin = is_tarball(urls)
assert is_tar is False
assert origin == urls[0]
@@ -65,7 +68,7 @@
url = "https://example.org/crates/package/download"
urls = [url]
with pytest.raises(ArtifactNatureUndetected):
- is_tarball(url) # no request parameter, this cannot fallback, raises
+ is_tarball(urls) # no request parameter, this cannot fallback, raises
with pytest.raises(ArtifactNatureUndetected):
requests_mock.head(
@@ -87,6 +90,16 @@
)
is_tarball(urls, requests)
+ with pytest.raises(ArtifactNatureMistyped):
+ is_tarball(["foo://example.org/unsupported-scheme"])
+
+ with pytest.raises(ArtifactNatureMistyped):
+ fallback_url = "foo://example.org/unsupported-scheme"
+ requests_mock.head(
+ url, headers={"location": fallback_url} # still no extension, cannot detect
+ )
+ is_tarball(urls, requests)
+
@pytest.mark.parametrize(
"fallback_url, expected_result",
@@ -162,6 +175,8 @@
url = artifact["urls"][0]
if url.endswith(".c") or url.endswith(".txt"):
expected_visit_types["content"] += 1
+ elif url.startswith("svn"): # mistyped artifact rendered as vcs nonetheless
+ expected_visit_types["svn"] += 1
else:
expected_visit_types["directory"] += 1
@@ -214,6 +229,12 @@
"https://crates.io/api/v1/0.1.5/no-extension-and-head-404-so-skipped",
status_code=404,
)
+ # Will raise for that origin, this will get ignored as we cannot determine anything
+ # from its name
+ requests_mock.head(
+ "ftp://ftp.ourproject.org/file-with-no-extension",
+ exc=requests.exceptions.InvalidSchema,
+ )
listed_result = lister.run()
# only the origin upstream is listed, every other entries are unsupported or incomplete
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Nov 5 2024, 4:10 PM (11 w, 19 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3219542
Attached To
D8606: nixguix: Deal with mistyped origins
Event Timeline
Log In to Comment