diff --git a/swh/lister/nixguix/lister.py b/swh/lister/nixguix/lister.py --- a/swh/lister/nixguix/lister.py +++ b/swh/lister/nixguix/lister.py @@ -41,6 +41,13 @@ pass +class ArtifactNatureMistyped(ValueError): + """Raised when a remote artifact's neither a tarball nor a file. It's probably a + misconfiguration in the manifest that badly typed a vcs repository.""" + + pass + + @dataclass class OriginUpstream: """Upstream origin (e.g. NixOS/nixpkgs, Guix/Guix).""" @@ -73,10 +80,10 @@ origin: str """Origin url of the vcs""" - ref: Optional[str] - """Reference either a svn commit id, a git commit, ...""" type: str """Type of (d)vcs, e.g. svn, git, hg, ...""" + ref: Optional[str] = None + """Reference either a svn commit id, a git commit, ...""" class ArtifactType(Enum): @@ -109,6 +116,8 @@ Raises: ArtifactNatureUndetected when the artifact's nature cannot be detected out of its url + ArtifactNatureMistyped when the artifact is not a tarball nor a file. It's up to + the caller to do what's right with it. Returns: A tuple (bool, url). The boolean represents whether the url is an archive or not. The second parameter is the actual url once the head request is issued @@ -123,27 +132,37 @@ IndexError in case no extension is available """ - return Path(urlparse(url).path).suffixes[-1].lstrip(".") in TARBALL_EXTENSIONS + urlparsed = urlparse(url) + if urlparsed.scheme not in ("http", "https", "ftp"): + raise ArtifactNatureMistyped(f"Mistyped artifact '{url}'") + return Path(urlparsed.path).suffixes[-1].lstrip(".") in TARBALL_EXTENSIONS index = random.randrange(len(urls)) url = urls[index] + try: is_tar = _is_tarball(url) return is_tar, urls[0] except IndexError: if request is None: raise ArtifactNatureUndetected( - "Cannot determine artifact type from url %s", url + f"Cannot determine artifact type from url <{url}>" ) logger.warning( - "Cannot detect extension for '%s'. Fallback to http head query", + "Cannot detect extension for <%s>. Fallback to http head query", url, ) - response = request.head(url) + + try: + response = request.head(url) + except requests.exceptions.InvalidSchema: + raise ArtifactNatureUndetected( + f"Cannot determine artifact type from url <{url}>" + ) if not response.ok or response.status_code == 404: raise ArtifactNatureUndetected( - "Cannot determine artifact type from url %s", url + f"Cannot determine artifact type from url <{url}>" ) location = response.headers.get("Location") if location: # It's not always present @@ -154,7 +173,7 @@ return _is_tarball(location), location except IndexError: logger.warning( - "Still cannot detect extension through location '%s'...", + "Still cannot detect extension through location <%s>...", url, ) @@ -166,7 +185,7 @@ return content_type in POSSIBLE_TARBALL_MIMETYPES, urls[0] raise ArtifactNatureUndetected( - "Cannot determine artifact type from url %s", url + f"Cannot determine artifact type from url <{url}>" ) @@ -287,23 +306,48 @@ urls = artifact.get("urls") if not urls: # Nothing to fetch - logger.warning("Skipping url '%s': empty artifact", artifact) + logger.warning("Skipping url <%s>: empty artifact", artifact) continue assert urls is not None + + # Deal with misplaced origins + # FIXME: T3294: Fix missing scheme in urls origin, *fallback_urls = urls integrity = artifact.get("integrity") if integrity is None: - logger.warning("Skipping url '%s': missing integrity field", origin) + logger.warning("Skipping url <%s>: missing integrity field", origin) continue try: is_tar, origin = is_tarball(urls, self.session) + except ArtifactNatureMistyped: + logger.warning( + "Mistyped url <%s>: trying to deal with it properly", origin + ) + urlparsed = urlparse(origin) + artifact_type = urlparsed.scheme + if artifact_type in VCS_SUPPORTED: + artifact_url = ( + self.github_session.get_canonical_url(origin) + if self.github_session + else origin + ) + if not artifact_url: + continue + yield ArtifactType.VCS, VCS( + origin=artifact_url, type=artifact_type + ) + else: + logger.warning( + "Skipping url <%s>: undetected remote artifact type", origin + ) + continue except ArtifactNatureUndetected: logger.warning( - "Skipping url '%s': undetected remote artifact type", origin + "Skipping url <%s>: undetected remote artifact type", origin ) continue @@ -325,7 +369,7 @@ ) else: logger.warning( - "Skipping artifact '%s': unsupported type %s", + "Skipping artifact <%s>: unsupported type %s", artifact, artifact_type, ) diff --git a/swh/lister/nixguix/tests/data/guix-swh_sources.json b/swh/lister/nixguix/tests/data/guix-swh_sources.json --- a/swh/lister/nixguix/tests/data/guix-swh_sources.json +++ b/swh/lister/nixguix/tests/data/guix-swh_sources.json @@ -12,6 +12,18 @@ { "type": "url", "urls": [ "https://example.org/another-file-no-integrity-so-skipped.txt" ] + }, + { + "type": "url", + "urls": [ + "ftp://ftp.ourproject.org/file-with-no-extension" + ], + "integrity": "sha256-bss09x9yOnuW+Q5BHHjf8nNcCNxCKMdl9/2/jKSFcrQ=" + }, + { + "type": "url", + "urls": [ "unknown://example.org/wrong-scheme-so-skipped.txt" ], + "integrity": "sha256-wAEswtkl3ulAw3zq4perrGS6Wlww5XXnQYsEAoYT9fI=" } ], "version":"1", diff --git a/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json b/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json --- a/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json +++ b/swh/lister/nixguix/tests/data/nixpkgs-swh_sources.json @@ -22,6 +22,13 @@ ], "integrity": "sha256-1w3NdfRzp9XIFDLD2SYJJr+Nnf9c1UF5YWlJfRxSLt0=" }, + { + "type": "url", + "urls": [ + "ftp://ftp.ourproject.org/pub/ytalk/ytalk-3.3.0.tar.gz" + ], + "integrity": "sha256-bss09x9yOnuW+Q5BHHjf8nNcCNxCKMdl9/2/jKSFcrQ=" + }, { "type": "url", "urls": [ @@ -45,6 +52,11 @@ "type": "svn", "svn_url": "https://code.call-cc.org/svn/chicken-eggs/release/5/iset/tags/2.2", "svn_revision": 39057 + }, + { + "type": "url", + "urls": ["svn://svn.code.sf.net/p/acme-crossass/code-0/trunk"], + "integrity": "sha256-VifIQ+UEVMKJ+cNS+Xxusazinr5Cgu1lmGuhqj/5Mpk=" } ], "version": "1", diff --git a/swh/lister/nixguix/tests/test_lister.py b/swh/lister/nixguix/tests/test_lister.py --- a/swh/lister/nixguix/tests/test_lister.py +++ b/swh/lister/nixguix/tests/test_lister.py @@ -15,6 +15,7 @@ from swh.lister import TARBALL_EXTENSIONS from swh.lister.nixguix.lister import ( POSSIBLE_TARBALL_MIMETYPES, + ArtifactNatureMistyped, ArtifactNatureUndetected, NixGuixLister, is_tarball, @@ -31,19 +32,20 @@ @pytest.mark.parametrize( - "urls", + "tarballs", [[f"one.{ext}", f"two.{ext}"] for ext in TARBALL_EXTENSIONS] + [[f"one.{ext}?foo=bar"] for ext in TARBALL_EXTENSIONS], ) -def test_is_tarball_simple(urls): +def test_is_tarball_simple(tarballs): """Simple check on tarball should discriminate betwenn tarball and file""" + urls = [f"https://example.org/{tarball}" for tarball in tarballs] is_tar, origin = is_tarball(urls) assert is_tar is True assert origin == urls[0] @pytest.mark.parametrize( - "urls", + "files", [ ["abc.lisp"], ["one.abc", "two.bcd"], @@ -52,8 +54,9 @@ ["config.nix", "flakes.nix"], ], ) -def test_is_tarball_simple_not_tarball(urls): +def test_is_tarball_simple_not_tarball(files): """Simple check on tarball should discriminate betwenn tarball and file""" + urls = [f"http://example.org/{file}" for file in files] is_tar, origin = is_tarball(urls) assert is_tar is False assert origin == urls[0] @@ -65,7 +68,7 @@ url = "https://example.org/crates/package/download" urls = [url] with pytest.raises(ArtifactNatureUndetected): - is_tarball(url) # no request parameter, this cannot fallback, raises + is_tarball(urls) # no request parameter, this cannot fallback, raises with pytest.raises(ArtifactNatureUndetected): requests_mock.head( @@ -87,6 +90,16 @@ ) is_tarball(urls, requests) + with pytest.raises(ArtifactNatureMistyped): + is_tarball(["foo://example.org/unsupported-scheme"]) + + with pytest.raises(ArtifactNatureMistyped): + fallback_url = "foo://example.org/unsupported-scheme" + requests_mock.head( + url, headers={"location": fallback_url} # still no extension, cannot detect + ) + is_tarball(urls, requests) + @pytest.mark.parametrize( "fallback_url, expected_result", @@ -162,6 +175,8 @@ url = artifact["urls"][0] if url.endswith(".c") or url.endswith(".txt"): expected_visit_types["content"] += 1 + elif url.startswith("svn"): # mistyped artifact rendered as vcs nonetheless + expected_visit_types["svn"] += 1 else: expected_visit_types["directory"] += 1 @@ -214,6 +229,12 @@ "https://crates.io/api/v1/0.1.5/no-extension-and-head-404-so-skipped", status_code=404, ) + # Will raise for that origin, this will get ignored as we cannot determine anything + # from its name + requests_mock.head( + "ftp://ftp.ourproject.org/file-with-no-extension", + exc=requests.exceptions.InvalidSchema, + ) listed_result = lister.run() # only the origin upstream is listed, every other entries are unsupported or incomplete