Page MenuHomeSoftware Heritage

D8606.id.diff
No OneTemporary

D8606.id.diff

diff --git a/swh/lister/nixguix/lister.py b/swh/lister/nixguix/lister.py
--- a/swh/lister/nixguix/lister.py
+++ b/swh/lister/nixguix/lister.py
@@ -41,6 +41,13 @@
pass
+class ArtifactNatureMistyped(ValueError):
+ """Raised when a remote artifact's neither a tarball nor a file. It's probably a
+ misconfiguration in the manifest that badly typed a vcs repository."""
+
+ pass
+
+
@dataclass
class OriginUpstream:
"""Upstream origin (e.g. NixOS/nixpkgs, Guix/Guix)."""
@@ -73,10 +80,10 @@
origin: str
"""Origin url of the vcs"""
- ref: Optional[str]
- """Reference either a svn commit id, a git commit, ..."""
type: str
"""Type of (d)vcs, e.g. svn, git, hg, ..."""
+ ref: Optional[str] = None
+ """Reference either a svn commit id, a git commit, ..."""
class ArtifactType(Enum):
@@ -109,6 +116,8 @@
Raises:
ArtifactNatureUndetected when the artifact's nature cannot be detected out
of its url
+ ArtifactNatureMistyped when the artifact is not a tarball nor a file. It's up to
+ the caller to do what's right with it.
Returns: A tuple (bool, url). The boolean represents whether the url is an archive
or not. The second parameter is the actual url once the head request is issued
@@ -123,27 +132,37 @@
IndexError in case no extension is available
"""
- return Path(urlparse(url).path).suffixes[-1].lstrip(".") in TARBALL_EXTENSIONS
+ urlparsed = urlparse(url)
+ if urlparsed.scheme not in ("http", "https", "ftp"):
+ raise ArtifactNatureMistyped(f"Mistyped artifact '{url}'")
+ return Path(urlparsed.path).suffixes[-1].lstrip(".") in TARBALL_EXTENSIONS
index = random.randrange(len(urls))
url = urls[index]
+
try:
is_tar = _is_tarball(url)
return is_tar, urls[0]
except IndexError:
if request is None:
raise ArtifactNatureUndetected(
- "Cannot determine artifact type from url %s", url
+ f"Cannot determine artifact type from url <{url}>"
)
logger.warning(
- "Cannot detect extension for '%s'. Fallback to http head query",
+ "Cannot detect extension for <%s>. Fallback to http head query",
url,
)
- response = request.head(url)
+
+ try:
+ response = request.head(url)
+ except requests.exceptions.InvalidSchema:
+ raise ArtifactNatureUndetected(
+ f"Cannot determine artifact type from url <{url}>"
+ )
if not response.ok or response.status_code == 404:
raise ArtifactNatureUndetected(
- "Cannot determine artifact type from url %s", url
+ f"Cannot determine artifact type from url <{url}>"
)
location = response.headers.get("Location")
if location: # It's not always present
@@ -154,7 +173,7 @@
return _is_tarball(location), location
except IndexError:
logger.warning(
- "Still cannot detect extension through location '%s'...",
+ "Still cannot detect extension through location <%s>...",
url,
)
@@ -166,7 +185,7 @@
return content_type in POSSIBLE_TARBALL_MIMETYPES, urls[0]
raise ArtifactNatureUndetected(
- "Cannot determine artifact type from url %s", url
+ f"Cannot determine artifact type from url <{url}>"
)
@@ -287,23 +306,48 @@
urls = artifact.get("urls")
if not urls:
# Nothing to fetch
- logger.warning("Skipping url '%s': empty artifact", artifact)
+ logger.warning("Skipping url <%s>: empty artifact", artifact)
continue
assert urls is not None
+
+ # Deal with misplaced origins
+
# FIXME: T3294: Fix missing scheme in urls
origin, *fallback_urls = urls
integrity = artifact.get("integrity")
if integrity is None:
- logger.warning("Skipping url '%s': missing integrity field", origin)
+ logger.warning("Skipping url <%s>: missing integrity field", origin)
continue
try:
is_tar, origin = is_tarball(urls, self.session)
+ except ArtifactNatureMistyped:
+ logger.warning(
+ "Mistyped url <%s>: trying to deal with it properly", origin
+ )
+ urlparsed = urlparse(origin)
+ artifact_type = urlparsed.scheme
+ if artifact_type in VCS_SUPPORTED:
+ artifact_url = (
+ self.github_session.get_canonical_url(origin)
+ if self.github_session
+ else origin
+ )
+ if not artifact_url:
+ continue
+ yield ArtifactType.VCS, VCS(
+ origin=artifact_url, type=artifact_type
+ )
+ else:
+ logger.warning(
+ "Skipping url <%s>: undetected remote artifact type", origin
+ )
+ continue
except ArtifactNatureUndetected:
logger.warning(
- "Skipping url '%s': undetected remote artifact type", origin
+ "Skipping url <%s>: undetected remote artifact type", origin
)
continue
@@ -325,7 +369,7 @@
)
else:
logger.warning(
- "Skipping artifact '%s': unsupported type %s",
+ "Skipping artifact <%s>: unsupported type %s",
artifact,
artifact_type,
)
diff --git a/swh/lister/nixguix/tests/data/guix-swh_sources.json b/swh/lister/nixguix/tests/data/guix-swh_sources.json
--- a/swh/lister/nixguix/tests/data/guix-swh_sources.json
+++ b/swh/lister/nixguix/tests/data/guix-swh_sources.json
@@ -12,6 +12,18 @@
{
"type": "url",
"urls": [ "https://example.org/another-file-no-integrity-so-skipped.txt" ]
+ },
+ {
+ "type": "url",
+ "urls": [
+ "ftp://ftp.ourproject.org/file-with-no-extension"
+ ],
+ "integrity": "sha256-bss09x9yOnuW+Q5BHHjf8nNcCNxCKMdl9/2/jKSFcrQ="
+ },
+ {
+ "type": "url",
+ "urls": [ "unknown://example.org/wrong-scheme-so-skipped.txt" ],
+ "integrity": "sha256-wAEswtkl3ulAw3zq4perrGS6Wlww5XXnQYsEAoYT9fI="
}
],
"version":"1",
diff --git a/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json b/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json
--- a/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json
+++ b/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json
@@ -22,6 +22,13 @@
],
"integrity": "sha256-1w3NdfRzp9XIFDLD2SYJJr+Nnf9c1UF5YWlJfRxSLt0="
},
+ {
+ "type": "url",
+ "urls": [
+ "ftp://ftp.ourproject.org/pub/ytalk/ytalk-3.3.0.tar.gz"
+ ],
+ "integrity": "sha256-bss09x9yOnuW+Q5BHHjf8nNcCNxCKMdl9/2/jKSFcrQ="
+ },
{
"type": "url",
"urls": [
@@ -45,6 +52,11 @@
"type": "svn",
"svn_url": "https://code.call-cc.org/svn/chicken-eggs/release/5/iset/tags/2.2",
"svn_revision": 39057
+ },
+ {
+ "type": "url",
+ "urls": ["svn://svn.code.sf.net/p/acme-crossass/code-0/trunk"],
+ "integrity": "sha256-VifIQ+UEVMKJ+cNS+Xxusazinr5Cgu1lmGuhqj/5Mpk="
}
],
"version": "1",
diff --git a/swh/lister/nixguix/tests/test_lister.py b/swh/lister/nixguix/tests/test_lister.py
--- a/swh/lister/nixguix/tests/test_lister.py
+++ b/swh/lister/nixguix/tests/test_lister.py
@@ -15,6 +15,7 @@
from swh.lister import TARBALL_EXTENSIONS
from swh.lister.nixguix.lister import (
POSSIBLE_TARBALL_MIMETYPES,
+ ArtifactNatureMistyped,
ArtifactNatureUndetected,
NixGuixLister,
is_tarball,
@@ -31,19 +32,20 @@
@pytest.mark.parametrize(
- "urls",
+ "tarballs",
[[f"one.{ext}", f"two.{ext}"] for ext in TARBALL_EXTENSIONS]
+ [[f"one.{ext}?foo=bar"] for ext in TARBALL_EXTENSIONS],
)
-def test_is_tarball_simple(urls):
+def test_is_tarball_simple(tarballs):
"""Simple check on tarball should discriminate betwenn tarball and file"""
+ urls = [f"https://example.org/{tarball}" for tarball in tarballs]
is_tar, origin = is_tarball(urls)
assert is_tar is True
assert origin == urls[0]
@pytest.mark.parametrize(
- "urls",
+ "files",
[
["abc.lisp"],
["one.abc", "two.bcd"],
@@ -52,8 +54,9 @@
["config.nix", "flakes.nix"],
],
)
-def test_is_tarball_simple_not_tarball(urls):
+def test_is_tarball_simple_not_tarball(files):
"""Simple check on tarball should discriminate betwenn tarball and file"""
+ urls = [f"http://example.org/{file}" for file in files]
is_tar, origin = is_tarball(urls)
assert is_tar is False
assert origin == urls[0]
@@ -65,7 +68,7 @@
url = "https://example.org/crates/package/download"
urls = [url]
with pytest.raises(ArtifactNatureUndetected):
- is_tarball(url) # no request parameter, this cannot fallback, raises
+ is_tarball(urls) # no request parameter, this cannot fallback, raises
with pytest.raises(ArtifactNatureUndetected):
requests_mock.head(
@@ -87,6 +90,16 @@
)
is_tarball(urls, requests)
+ with pytest.raises(ArtifactNatureMistyped):
+ is_tarball(["foo://example.org/unsupported-scheme"])
+
+ with pytest.raises(ArtifactNatureMistyped):
+ fallback_url = "foo://example.org/unsupported-scheme"
+ requests_mock.head(
+ url, headers={"location": fallback_url} # still no extension, cannot detect
+ )
+ is_tarball(urls, requests)
+
@pytest.mark.parametrize(
"fallback_url, expected_result",
@@ -162,6 +175,8 @@
url = artifact["urls"][0]
if url.endswith(".c") or url.endswith(".txt"):
expected_visit_types["content"] += 1
+ elif url.startswith("svn"): # mistyped artifact rendered as vcs nonetheless
+ expected_visit_types["svn"] += 1
else:
expected_visit_types["directory"] += 1
@@ -214,6 +229,12 @@
"https://crates.io/api/v1/0.1.5/no-extension-and-head-404-so-skipped",
status_code=404,
)
+ # Will raise for that origin, this will get ignored as we cannot determine anything
+ # from its name
+ requests_mock.head(
+ "ftp://ftp.ourproject.org/file-with-no-extension",
+ exc=requests.exceptions.InvalidSchema,
+ )
listed_result = lister.run()
# only the origin upstream is listed, every other entries are unsupported or incomplete

File Metadata

Mime Type
text/plain
Expires
Nov 5 2024, 4:10 PM (11 w, 19 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3219542

Event Timeline