swh/lister/nixguix/lister.py
Show All 30 Lines
from swh.core.github.utils import GitHubSession
from swh.core.tarball import MIMETYPE_TO_ARCHIVE_FORMAT
from swh.lister import TARBALL_EXTENSIONS
from swh.lister.pattern import CredentialsType, StatelessLister
from swh.scheduler.model import ListedOrigin

logger = logging.getLogger(__name__)
anlambert (inline comment): you could merge your comment into a single one, something like: `By default, ignore binary files and archives containing binaries`
# By default, ignore binary files and archives containing binaries
DEFAULT_EXTENSIONS_TO_IGNORE = [
    "AppImage",
    "bin",
    "exe",
    "iso",
    "linux64",
    "msi",
    "png",
    "dic",
    "deb",
    "rpm",
]
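# Illustrative sketch (made-up url, not from any manifest): the list above is
# matched against an origin's path suffix by the `url_endswith` helper defined
# further below, e.g.:
#
#   url_endswith(urlparse("https://example.org/tool-1.0-x86_64.AppImage"),
#                DEFAULT_EXTENSIONS_TO_IGNORE)   # -> True, the origin is skipped
#
# while an archive url ending in ".tar.gz" is not matched by this list and is
# kept for listing.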

class ArtifactNatureUndetected(ValueError):
    """Raised when a remote artifact's nature (tarball, file) cannot be detected."""

    pass

class ArtifactNatureMistyped(ValueError):
    """Raised when a remote artifact is neither a tarball nor a file.

    Errors of this type are probably due to a misconfiguration in the manifest
    generation that badly typed a vcs repository.
    """

    pass

class ArtifactWithoutExtension(ValueError):
    """Raised when an artifact nature cannot be determined by its name."""

    pass

class ChecksumsComputation(Enum):
    """The possible ways to compute checksums for the artifacts listed out of the
    manifest."""

    STANDARD = "standard"
Show All 49 Lines
VCS_SUPPORTED = ("git", "svn", "hg")

# Rough approximation of the mimetypes we may find for tarballs "out there"
POSSIBLE_TARBALL_MIMETYPES = tuple(MIMETYPE_TO_ARCHIVE_FORMAT.keys())

def url_endswith(
    urlparsed, extensions: List[str], raise_when_no_extension: bool = True
) -> bool:
    """Determine whether urlparsed ends with one of the extensions.

    Raises:
        ArtifactWithoutExtension in case no extension is available and
        raise_when_no_extension is True (the default)

    """
    paths = [Path(p) for (_, p) in [("_", urlparsed.path)] + parse_qsl(urlparsed.query)]
    if raise_when_no_extension and not any(path.suffix != "" for path in paths):
        raise ArtifactWithoutExtension
    return any(path.suffix.endswith(tuple(extensions)) for path in paths)
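# A minimal usage sketch of `url_endswith` (illustrative values only: the urls are
# made up, and it assumes "gz" is among TARBALL_EXTENSIONS):
#
#   >>> url_endswith(urlparse("https://example.org/pkg-1.2.tar.gz"), TARBALL_EXTENSIONS)
#   True
#   >>> # extensions are also looked up in query parameters
#   >>> url_endswith(urlparse("https://example.org/dl?file=installer.msi"),
#   ...              DEFAULT_EXTENSIONS_TO_IGNORE)
#   True
#   >>> # without an extension, raising can be disabled
#   >>> url_endswith(urlparse("https://example.org/README"), TARBALL_EXTENSIONS,
#   ...              raise_when_no_extension=False)
#   False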

def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, str]:
    """Determine whether a list of files actually are tarballs or simple files.

    When this cannot be answered simply out of the url and `request` is provided, this
    executes an HTTP `HEAD` query on the url to determine the information. If `request`
    is not provided, this raises an ArtifactNatureUndetected exception.

    Args:
Show All 16 Lines (in def _is_tarball(url):)
        Raises:
            ArtifactWithoutExtension in case no extension is available

        """
        urlparsed = urlparse(url)
        if urlparsed.scheme not in ("http", "https", "ftp"):
            raise ArtifactNatureMistyped(f"Mistyped artifact '{url}'")
        return url_endswith(urlparsed, TARBALL_EXTENSIONS)

    index = random.randrange(len(urls))
    url = urls[index]
    try:
        return _is_tarball(url), urls[0]
    except ArtifactWithoutExtension:
        if request is None:
Show All 67 Lines (in class NixGuixLister(StatelessLister[PageResult]):)
    Note that no `last_update` is available in either manifest.

    For `url` types artifacts, this tries to determine the artifact's nature, tarball or
    file. It first tries to compute out of the "url" extension. In case of no extension,
    it falls back to querying (HEAD) the url to retrieve the origin out of the `Location`
    response header, and then checks the extension again.

    Optionally, when the `extensions_to_ignore` parameter is provided, it extends the
    default extensions to ignore (`DEFAULT_EXTENSIONS_TO_IGNORE`) with those passed.
    This can be used to drop further binary files detected in the wild.

    """
    LISTER_NAME = "nixguix"

    def __init__(
        self,
        scheduler,
        url: str,
        origin_upstream: str,
        instance: Optional[str] = None,
        credentials: Optional[CredentialsType] = None,
        # canonicalize urls, can be turned off during docker runs
        canonicalize: bool = True,
        extensions_to_ignore: List[str] = [],
        **kwargs: Any,
    ):
        super().__init__(
            scheduler=scheduler,
            url=url.rstrip("/"),
            instance=instance,
            credentials=credentials,
        )
        # either full fqdn NixOS/nixpkgs or guix repository urls
        # maybe add an assert on those specific urls?
        self.origin_upstream = origin_upstream
        self.extensions_to_ignore = DEFAULT_EXTENSIONS_TO_IGNORE + extensions_to_ignore
anlambert (inline comment): why a copy here?
ardumont (inline comment): ¯\_(ツ)_/¯
        self.session = requests.Session()
        # for testing purposes, we may want to skip this step (e.g. docker run and rate
        # limit)
        self.github_session = (
            GitHubSession(
                credentials=self.credentials,
                user_agent=str(self.session.headers["User-Agent"]),
Show All 148 Lines (in def get_pages(self) -> Iterator[PageResult]:)
            # the output can be anything, including a directory tree.
            outputHashMode = artifact.get("outputHashMode", "flat")
            if not is_tar and outputHashMode == "recursive":
                # T4608: Cannot deal with those properly yet as some can be missing
                # 'critical' information about how to recompute the hash (e.g. fs
                # layout, executable bit, ...)
                logger.warning(
                    "Skipping artifact <%s>: 'file' artifact of type <%s> is"
                    " missing information to properly check its integrity",
                    artifact,
                    artifact_type,
                )
                continue

            # At this point plenty of heuristics happened and we should have found
            # the right origin and its nature.
            # Let's check and filter it out if it is to be ignored (if possible).
            # Some origin urls may not have an extension at this point (e.g
            # http://git.marmaro.de/?p=mmh;a=snp;h=<id>;sf=tgz), let them through.
            if url_endswith(
                urlparse(origin),
                self.extensions_to_ignore,
                raise_when_no_extension=False,
            ):
                logger.warning(
                    "Skipping artifact <%s>: 'file' artifact of type <%s> is"
                    " ignored due to lister configuration. It should ignore"
                    " origins with extension [%s]",
                    origin,
                    artifact_type,
                    ",".join(self.extensions_to_ignore),
                )
                continue

            logger.debug("%s: %s", "dir" if is_tar else "cnt", origin)
            yield ArtifactType.ARTIFACT, Artifact(
                origin=origin,
                fallback_urls=fallback_urls,
                checksums=checksums,
                checksums_computation=MAPPING_CHECKSUMS_COMPUTATION[outputHashMode],
                visit_type="directory" if is_tar else "content",
            )
Show All 38 Lines