Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/nixguix/lister.py
Show All 35 Lines | |||||||||||
class ArtifactNatureUndetected(ValueError): | class ArtifactNatureUndetected(ValueError): | ||||||||||
"""Raised when a remote artifact's nature (tarball, file) cannot be detected.""" | """Raised when a remote artifact's nature (tarball, file) cannot be detected.""" | ||||||||||
pass | pass | ||||||||||
class ArtifactNatureMistyped(ValueError): | |||||||||||
"""Raised when a remote artifact's neither a tarball nor a file. It's probably a | |||||||||||
misconfiguration in the manifest that badly typed a vcs repository.""" | |||||||||||
pass | |||||||||||
@dataclass | @dataclass | ||||||||||
class OriginUpstream: | class OriginUpstream: | ||||||||||
"""Upstream origin (e.g. NixOS/nixpkgs, Guix/Guix).""" | """Upstream origin (e.g. NixOS/nixpkgs, Guix/Guix).""" | ||||||||||
origin: str | origin: str | ||||||||||
"""Canonical url of the repository""" | """Canonical url of the repository""" | ||||||||||
version: int | version: int | ||||||||||
"""Version of the repository (dismissed?)""" | """Version of the repository (dismissed?)""" | ||||||||||
Show All 16 Lines | |||||||||||
@dataclass | @dataclass | ||||||||||
class VCS: | class VCS: | ||||||||||
"""Metadata information on VCS.""" | """Metadata information on VCS.""" | ||||||||||
origin: str | origin: str | ||||||||||
"""Origin url of the vcs""" | """Origin url of the vcs""" | ||||||||||
ref: Optional[str] | |||||||||||
"""Reference either a svn commit id, a git commit, ...""" | |||||||||||
type: str | type: str | ||||||||||
"""Type of (d)vcs, e.g. svn, git, hg, ...""" | """Type of (d)vcs, e.g. svn, git, hg, ...""" | ||||||||||
ref: Optional[str] = None | |||||||||||
"""Reference either a svn commit id, a git commit, ...""" | |||||||||||
class ArtifactType(Enum): | class ArtifactType(Enum): | ||||||||||
"""The possible artifact types listed out of the manifest.""" | """The possible artifact types listed out of the manifest.""" | ||||||||||
ARTIFACT = "artifact" | ARTIFACT = "artifact" | ||||||||||
ORIGIN = "origin" | ORIGIN = "origin" | ||||||||||
VCS = "vcs" | VCS = "vcs" | ||||||||||
Show All 16 Lines | def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, str]: | ||||||||||
not provided, this raises an ArtifactNatureUndetected exception. | not provided, this raises an ArtifactNatureUndetected exception. | ||||||||||
Args: | Args: | ||||||||||
urls: name of the remote files for which the extension needs to be checked. | urls: name of the remote files for which the extension needs to be checked. | ||||||||||
Raises: | Raises: | ||||||||||
ArtifactNatureUndetected when the artifact's nature cannot be detected out | ArtifactNatureUndetected when the artifact's nature cannot be detected out | ||||||||||
of its url | of its url | ||||||||||
ArtifactNatureMistyped when the artifact is not a tarball nor a file. It's up to | |||||||||||
the caller to do what's right with it. | |||||||||||
Returns: A tuple (bool, url). The boolean represents whether the url is an archive | Returns: A tuple (bool, url). The boolean represents whether the url is an archive | ||||||||||
or not. The second parameter is the actual url once the head request is issued | or not. The second parameter is the actual url once the head request is issued | ||||||||||
as a fallback of not finding out whether the urls are tarballs or not. | as a fallback of not finding out whether the urls are tarballs or not. | ||||||||||
""" | """ | ||||||||||
def _is_tarball(url): | def _is_tarball(url): | ||||||||||
"""Determine out of an extension whether url is a tarball. | """Determine out of an extension whether url is a tarball. | ||||||||||
Raises: | Raises: | ||||||||||
IndexError in case no extension is available | IndexError in case no extension is available | ||||||||||
""" | """ | ||||||||||
return Path(urlparse(url).path).suffixes[-1].lstrip(".") in TARBALL_EXTENSIONS | urlparsed = urlparse(url) | ||||||||||
anlambert: There are also tarball URLs with the `ftp` scheme, for instance ftp://ftp.ourproject.org/pub/ytalk/ytalk-3.3.0.tar.gz. | |||||||||||
if urlparsed.scheme not in ("http", "https", "ftp"): | |||||||||||
Done Inline Actions
More standard vlorentz: More standard | |||||||||||
raise ArtifactNatureMistyped(f"Mistyped artifact '{url}'") | |||||||||||
return Path(urlparsed.path).suffixes[-1].lstrip(".") in TARBALL_EXTENSIONS | |||||||||||
index = random.randrange(len(urls)) | index = random.randrange(len(urls)) | ||||||||||
url = urls[index] | url = urls[index] | ||||||||||
try: | try: | ||||||||||
is_tar = _is_tarball(url) | is_tar = _is_tarball(url) | ||||||||||
return is_tar, urls[0] | return is_tar, urls[0] | ||||||||||
except IndexError: | except IndexError: | ||||||||||
if request is None: | if request is None: | ||||||||||
raise ArtifactNatureUndetected( | raise ArtifactNatureUndetected( | ||||||||||
"Cannot determine artifact type from url %s", url | f"Cannot determine artifact type from url <{url}>" | ||||||||||
) | ) | ||||||||||
logger.warning( | logger.warning( | ||||||||||
"Cannot detect extension for '%s'. Fallback to http head query", | "Cannot detect extension for <%s>. Fallback to http head query", | ||||||||||
url, | url, | ||||||||||
) | ) | ||||||||||
try: | |||||||||||
response = request.head(url) | response = request.head(url) | ||||||||||
except requests.exceptions.InvalidSchema: | |||||||||||
raise ArtifactNatureUndetected( | |||||||||||
f"Cannot determine artifact type from url <{url}>" | |||||||||||
) | |||||||||||
if not response.ok or response.status_code == 404: | if not response.ok or response.status_code == 404: | ||||||||||
raise ArtifactNatureUndetected( | raise ArtifactNatureUndetected( | ||||||||||
"Cannot determine artifact type from url %s", url | f"Cannot determine artifact type from url <{url}>" | ||||||||||
) | ) | ||||||||||
location = response.headers.get("Location") | location = response.headers.get("Location") | ||||||||||
if location: # It's not always present | if location: # It's not always present | ||||||||||
logger.debug("Location: %s", location) | logger.debug("Location: %s", location) | ||||||||||
try: | try: | ||||||||||
# FIXME: location is also returned as it's considered the true origin, | # FIXME: location is also returned as it's considered the true origin, | ||||||||||
# true enough? | # true enough? | ||||||||||
return _is_tarball(location), location | return _is_tarball(location), location | ||||||||||
except IndexError: | except IndexError: | ||||||||||
logger.warning( | logger.warning( | ||||||||||
"Still cannot detect extension through location '%s'...", | "Still cannot detect extension through location <%s>...", | ||||||||||
url, | url, | ||||||||||
) | ) | ||||||||||
content_type = response.headers.get("Content-Type") | content_type = response.headers.get("Content-Type") | ||||||||||
if content_type: | if content_type: | ||||||||||
logger.debug("Content-Type: %s", content_type) | logger.debug("Content-Type: %s", content_type) | ||||||||||
if content_type == "application/json": | if content_type == "application/json": | ||||||||||
return False, urls[0] | return False, urls[0] | ||||||||||
return content_type in POSSIBLE_TARBALL_MIMETYPES, urls[0] | return content_type in POSSIBLE_TARBALL_MIMETYPES, urls[0] | ||||||||||
raise ArtifactNatureUndetected( | raise ArtifactNatureUndetected( | ||||||||||
"Cannot determine artifact type from url %s", url | f"Cannot determine artifact type from url <{url}>" | ||||||||||
) | ) | ||||||||||
VCS_KEYS_MAPPING = { | VCS_KEYS_MAPPING = { | ||||||||||
"git": { | "git": { | ||||||||||
"ref": "git_ref", | "ref": "git_ref", | ||||||||||
"url": "git_url", | "url": "git_url", | ||||||||||
}, | }, | ||||||||||
▲ Show 20 Lines • Show All 104 Lines • ▼ Show 20 Lines | def get_pages(self) -> Iterator[PageResult]: | ||||||||||
yield ArtifactType.VCS, VCS( | yield ArtifactType.VCS, VCS( | ||||||||||
origin=artifact_url, type=artifact_type, ref=plain_ref | origin=artifact_url, type=artifact_type, ref=plain_ref | ||||||||||
) | ) | ||||||||||
elif artifact_type == "url": | elif artifact_type == "url": | ||||||||||
# It's either a tarball or a file | # It's either a tarball or a file | ||||||||||
urls = artifact.get("urls") | urls = artifact.get("urls") | ||||||||||
if not urls: | if not urls: | ||||||||||
# Nothing to fetch | # Nothing to fetch | ||||||||||
logger.warning("Skipping url '%s': empty artifact", artifact) | logger.warning("Skipping url <%s>: empty artifact", artifact) | ||||||||||
continue | continue | ||||||||||
assert urls is not None | assert urls is not None | ||||||||||
# Deal with misplaced origins | |||||||||||
# FIXME: T3294: Fix missing scheme in urls | # FIXME: T3294: Fix missing scheme in urls | ||||||||||
origin, *fallback_urls = urls | origin, *fallback_urls = urls | ||||||||||
integrity = artifact.get("integrity") | integrity = artifact.get("integrity") | ||||||||||
if integrity is None: | if integrity is None: | ||||||||||
logger.warning("Skipping url '%s': missing integrity field", origin) | logger.warning("Skipping url <%s>: missing integrity field", origin) | ||||||||||
continue | continue | ||||||||||
try: | try: | ||||||||||
is_tar, origin = is_tarball(urls, self.session) | is_tar, origin = is_tarball(urls, self.session) | ||||||||||
except ArtifactNatureMistyped: | |||||||||||
logger.warning( | |||||||||||
"Mistyped url <%s>: trying to deal with it properly", origin | |||||||||||
Done Inline Actions
ditto vlorentz: ditto | |||||||||||
) | |||||||||||
urlparsed = urlparse(origin) | |||||||||||
artifact_type = urlparsed.scheme | |||||||||||
if artifact_type in VCS_SUPPORTED: | |||||||||||
artifact_url = ( | |||||||||||
self.github_session.get_canonical_url(origin) | |||||||||||
if self.github_session | |||||||||||
else origin | |||||||||||
) | |||||||||||
if not artifact_url: | |||||||||||
continue | |||||||||||
yield ArtifactType.VCS, VCS( | |||||||||||
origin=artifact_url, type=artifact_type | |||||||||||
) | |||||||||||
else: | |||||||||||
logger.warning( | |||||||||||
"Skipping url <%s>: undetected remote artifact type", origin | |||||||||||
) | |||||||||||
continue | |||||||||||
except ArtifactNatureUndetected: | except ArtifactNatureUndetected: | ||||||||||
logger.warning( | logger.warning( | ||||||||||
"Skipping url '%s': undetected remote artifact type", origin | "Skipping url <%s>: undetected remote artifact type", origin | ||||||||||
) | ) | ||||||||||
continue | continue | ||||||||||
# Determine the content checksum stored in the integrity field and | # Determine the content checksum stored in the integrity field and | ||||||||||
# convert into a dict of checksums. This only parses the | # convert into a dict of checksums. This only parses the | ||||||||||
# `hash-expression` (hash-<b64-encoded-checksum>) as defined in | # `hash-expression` (hash-<b64-encoded-checksum>) as defined in | ||||||||||
# https://w3c.github.io/webappsec-subresource-integrity/#the-integrity-attribute | # https://w3c.github.io/webappsec-subresource-integrity/#the-integrity-attribute | ||||||||||
chksum_algo, chksum_b64 = integrity.split("-") | chksum_algo, chksum_b64 = integrity.split("-") | ||||||||||
checksums: Dict[str, str] = { | checksums: Dict[str, str] = { | ||||||||||
chksum_algo: base64.decodebytes(chksum_b64.encode()).hex() | chksum_algo: base64.decodebytes(chksum_b64.encode()).hex() | ||||||||||
} | } | ||||||||||
logger.debug("%s: %s", "dir" if is_tar else "cnt", origin) | logger.debug("%s: %s", "dir" if is_tar else "cnt", origin) | ||||||||||
yield ArtifactType.ARTIFACT, Artifact( | yield ArtifactType.ARTIFACT, Artifact( | ||||||||||
origin=origin, | origin=origin, | ||||||||||
fallback_urls=fallback_urls, | fallback_urls=fallback_urls, | ||||||||||
checksums=checksums, | checksums=checksums, | ||||||||||
visit_type="directory" if is_tar else "content", | visit_type="directory" if is_tar else "content", | ||||||||||
) | ) | ||||||||||
else: | else: | ||||||||||
logger.warning( | logger.warning( | ||||||||||
"Skipping artifact '%s': unsupported type %s", | "Skipping artifact <%s>: unsupported type %s", | ||||||||||
artifact, | artifact, | ||||||||||
artifact_type, | artifact_type, | ||||||||||
) | ) | ||||||||||
def vcs_to_listed_origin(self, artifact: VCS) -> Iterator[ListedOrigin]: | def vcs_to_listed_origin(self, artifact: VCS) -> Iterator[ListedOrigin]: | ||||||||||
"""Given a vcs repository, yield a ListedOrigin.""" | """Given a vcs repository, yield a ListedOrigin.""" | ||||||||||
assert self.lister_obj.id is not None | assert self.lister_obj.id is not None | ||||||||||
# FIXME: What to do with the "ref" (e.g. git/hg/svn commit, ...) | # FIXME: What to do with the "ref" (e.g. git/hg/svn commit, ...) | ||||||||||
Show All 37 Lines |
There are also tarball URLs with the ftp scheme, for instance ftp://ftp.ourproject.org/pub/ytalk/ytalk-3.3.0.tar.gz in the nixguix listing.