Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/nixguix/lister.py
Show All 16 Lines | |||||
import base64 | import base64 | ||||
from dataclasses import dataclass | from dataclasses import dataclass | ||||
from enum import Enum | from enum import Enum | ||||
import logging | import logging | ||||
from pathlib import Path | from pathlib import Path | ||||
import random | import random | ||||
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union | from typing import Any, Dict, Iterator, List, Optional, Tuple, Union | ||||
from urllib.parse import urlparse | from urllib.parse import parse_qsl, urlparse | ||||
import requests | import requests | ||||
from requests.exceptions import ConnectionError, InvalidSchema, SSLError | from requests.exceptions import ConnectionError, InvalidSchema, SSLError | ||||
from swh.core.github.utils import GitHubSession | from swh.core.github.utils import GitHubSession | ||||
from swh.core.tarball import MIMETYPE_TO_ARCHIVE_FORMAT | from swh.core.tarball import MIMETYPE_TO_ARCHIVE_FORMAT | ||||
from swh.lister import TARBALL_EXTENSIONS | from swh.lister import TARBALL_EXTENSIONS | ||||
from swh.lister.pattern import CredentialsType, StatelessLister | from swh.lister.pattern import CredentialsType, StatelessLister | ||||
from swh.scheduler.model import ListedOrigin | from swh.scheduler.model import ListedOrigin | ||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
class ArtifactNatureUndetected(ValueError): | class ArtifactNatureUndetected(ValueError): | ||||
"""Raised when a remote artifact's nature (tarball, file) cannot be detected.""" | """Raised when a remote artifact's nature (tarball, file) cannot be detected.""" | ||||
pass | pass | ||||
class ArtifactNatureMistyped(ValueError): | class ArtifactNatureMistyped(ValueError): | ||||
"""Raised when a remote artifact's neither a tarball nor a file. | """Raised when a remote artifact is neither a tarball nor a file. | ||||
Errors of this type are probably a misconfiguration in the manifest generation that | Errors of this type are probably a misconfiguration in the manifest generation that | ||||
badly typed a vcs repository. | badly typed a vcs repository. | ||||
""" | """ | ||||
pass | pass | ||||
class ArtifactWithoutExtension(ValueError): | |||||
"""Raised when an artifact nature cannot be determined by its name. | |||||
This exception is solely for internal use of the :meth:`is_tarball` method. | |||||
""" | |||||
pass | |||||
class ChecksumsComputation(Enum): | class ChecksumsComputation(Enum): | ||||
"""The possible artifact types listed out of the manifest.""" | """The possible artifact types listed out of the manifest.""" | ||||
STANDARD = "standard" | STANDARD = "standard" | ||||
"""Standard checksums (e.g. sha1, sha256, ...) on the tarball or file.""" | """Standard checksums (e.g. sha1, sha256, ...) on the tarball or file.""" | ||||
NAR = "nar" | NAR = "nar" | ||||
"""The hash is computed over the NAR archive dump of the output (e.g. uncompressed | """The hash is computed over the NAR archive dump of the output (e.g. uncompressed | ||||
directory.)""" | directory.)""" | ||||
▲ Show 20 Lines • Show All 71 Lines • ▼ Show 20 Lines | Returns: A tuple (bool, url). The boolean represents whether the url is an archive | ||||
as a fallback of not finding out whether the urls are tarballs or not. | as a fallback of not finding out whether the urls are tarballs or not. | ||||
""" | """ | ||||
def _is_tarball(url): | def _is_tarball(url): | ||||
"""Determine out of an extension whether url is a tarball. | """Determine out of an extension whether url is a tarball. | ||||
Raises: | Raises: | ||||
IndexError in case no extension is available | ArtifactWithoutExtension in case no extension is available | ||||
""" | """ | ||||
urlparsed = urlparse(url) | urlparsed = urlparse(url) | ||||
if urlparsed.scheme not in ("http", "https", "ftp"): | if urlparsed.scheme not in ("http", "https", "ftp"): | ||||
raise ArtifactNatureMistyped(f"Mistyped artifact '{url}'") | raise ArtifactNatureMistyped(f"Mistyped artifact '{url}'") | ||||
return Path(urlparsed.path).suffixes[-1].lstrip(".") in TARBALL_EXTENSIONS | |||||
errors = [] | |||||
query_params = dict(parse_qsl(urlparsed.query)) | |||||
for path in [query_params.get(key) for key in ["f", "file", "url", "name"]] + [ | |||||
urlparsed.path | |||||
]: | |||||
if not path: | |||||
continue | |||||
try: | |||||
file_ = Path(path).suffixes[-1] | |||||
break | |||||
except IndexError as e: | |||||
errors.append(ArtifactWithoutExtension(e)) | |||||
if errors: | |||||
raise errors[-1] | |||||
anlambert: I think you should check the values of all query parameters, as you cannot guess their names.
You… | |||||
Done Inline Actions: ok, looks like a better approach. I'm gonna land as is for now. And do another diff to simplify all this (I have another diff already stacked on this and another incoming diff and I'd like to avoid having to spend my time rebasing stuff ;) ardumont: ok, looks like a better approach.
I'm gonna land as is for now.
And do another diff to… | |||||
return file_.lstrip(".") in TARBALL_EXTENSIONS | |||||
index = random.randrange(len(urls)) | index = random.randrange(len(urls)) | ||||
url = urls[index] | url = urls[index] | ||||
try: | try: | ||||
return _is_tarball(url), urls[0] | return _is_tarball(url), urls[0] | ||||
except IndexError: | except ArtifactWithoutExtension: | ||||
if request is None: | if request is None: | ||||
raise ArtifactNatureUndetected( | raise ArtifactNatureUndetected( | ||||
f"Cannot determine artifact type from url <{url}>" | f"Cannot determine artifact type from url <{url}>" | ||||
) | ) | ||||
logger.warning( | logger.warning( | ||||
"Cannot detect extension for <%s>. Fallback to http head query", | "Cannot detect extension for <%s>. Fallback to http head query", | ||||
url, | url, | ||||
) | ) | ||||
Show All 11 Lines | except ArtifactWithoutExtension: | ||||
) | ) | ||||
location = response.headers.get("Location") | location = response.headers.get("Location") | ||||
if location: # It's not always present | if location: # It's not always present | ||||
logger.debug("Location: %s", location) | logger.debug("Location: %s", location) | ||||
try: | try: | ||||
# FIXME: location is also returned as it's considered the true origin, | # FIXME: location is also returned as it's considered the true origin, | ||||
# true enough? | # true enough? | ||||
return _is_tarball(location), location | return _is_tarball(location), location | ||||
except IndexError: | except ArtifactWithoutExtension: | ||||
logger.warning( | logger.warning( | ||||
"Still cannot detect extension through location <%s>...", | "Still cannot detect extension through location <%s>...", | ||||
url, | url, | ||||
) | ) | ||||
content_type = response.headers.get("Content-Type") | content_type = response.headers.get("Content-Type") | ||||
if content_type: | if content_type: | ||||
logger.debug("Content-Type: %s", content_type) | logger.debug("Content-Type: %s", content_type) | ||||
▲ Show 20 Lines • Show All 239 Lines • Show Last 20 Lines |
I think you should check the values of all query parameters, as you cannot guess their names.
You can also make that code shorter the following way: