Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/nixguix/lister.py
| Show All 30 Lines | |||||
| from swh.core.github.utils import GitHubSession | from swh.core.github.utils import GitHubSession | ||||
| from swh.core.tarball import MIMETYPE_TO_ARCHIVE_FORMAT | from swh.core.tarball import MIMETYPE_TO_ARCHIVE_FORMAT | ||||
| from swh.lister import TARBALL_EXTENSIONS | from swh.lister import TARBALL_EXTENSIONS | ||||
| from swh.lister.pattern import CredentialsType, StatelessLister | from swh.lister.pattern import CredentialsType, StatelessLister | ||||
| from swh.scheduler.model import ListedOrigin | from swh.scheduler.model import ListedOrigin | ||||
| logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
anlambert: you could merge your comment into a single one, something like: `By default, ignore binary files and archives containing binaries`
# By default, ignore binary files and archives containing binaries.
# Can be extended at lister instantiation time via `extensions_to_ignore`.
DEFAULT_EXTENSIONS_TO_IGNORE = [
    "AppImage",
    "bin",
    "exe",
    "iso",
    "linux64",
    "msi",
    "png",
    "dic",
    "deb",
    "rpm",
]
class ArtifactNatureUndetected(ValueError):
    """Raised when a remote artifact's nature (tarball, file) cannot be detected."""
class ArtifactNatureMistyped(ValueError):
    """Raised when a remote artifact is neither a tarball nor a file.

    Errors of this type are probably a misconfiguration in the manifest
    generation that badly typed a vcs repository.

    """
class ArtifactWithoutExtension(ValueError):
    """Raised when an artifact nature cannot be determined by its name."""
| class ChecksumsComputation(Enum): | class ChecksumsComputation(Enum): | ||||
| """The possible artifact types listed out of the manifest.""" | """The possible artifact types listed out of the manifest.""" | ||||
| STANDARD = "standard" | STANDARD = "standard" | ||||
| ▲ Show 20 Lines • Show All 49 Lines • ▼ Show 20 Lines | |||||
| VCS_SUPPORTED = ("git", "svn", "hg") | VCS_SUPPORTED = ("git", "svn", "hg") | ||||
| # Rough approximation of what we can find of mimetypes for tarballs "out there" | # Rough approximation of what we can find of mimetypes for tarballs "out there" | ||||
| POSSIBLE_TARBALL_MIMETYPES = tuple(MIMETYPE_TO_ARCHIVE_FORMAT.keys()) | POSSIBLE_TARBALL_MIMETYPES = tuple(MIMETYPE_TO_ARCHIVE_FORMAT.keys()) | ||||
def url_endswith(
    urlparsed, extensions: List[str], raise_when_no_extension: bool = True
) -> bool:
    """Determine whether the parsed url ends with one of the extensions.

    Both the url path and every query parameter value are considered, so
    urls of the form ``...?file=foo.tar`` are also detected.

    Args:
        urlparsed: result of :func:`urllib.parse.urlparse` on the url
        extensions: file extensions to match, without their leading dot
        raise_when_no_extension: when True (the default), raise if no
            candidate path carries any extension at all

    Returns:
        True if any candidate path's extension is one of ``extensions``.

    Raises:
        ArtifactWithoutExtension: no extension is available and
            raise_when_no_extension is True (the default)

    """
    # Candidate paths: the url path itself plus every query parameter value.
    paths = [Path(p) for (_, p) in [("_", urlparsed.path)] + parse_qsl(urlparsed.query)]
    if raise_when_no_extension and not any(path.suffix != "" for path in paths):
        raise ArtifactWithoutExtension
    # Compare the suffix (minus its leading ".") exactly against the
    # extensions. The previous `path.suffix.endswith(tuple(extensions))`
    # matched partial suffixes too, e.g. ".cabin" matched extension "bin",
    # wrongly flagging unrelated urls.
    return any(path.suffix[1:] in extensions for path in paths)
| def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, str]: | def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, str]: | ||||
| """Determine whether a list of files actually are tarballs or simple files. | """Determine whether a list of files actually are tarballs or simple files. | ||||
| When this cannot be answered simply out of the url, when request is provided, this | When this cannot be answered simply out of the url, when request is provided, this | ||||
| executes a HTTP `HEAD` query on the url to determine the information. If request is | executes a HTTP `HEAD` query on the url to determine the information. If request is | ||||
| not provided, this raises an ArtifactNatureUndetected exception. | not provided, this raises an ArtifactNatureUndetected exception. | ||||
| Args: | Args: | ||||
| Show All 16 Lines | def _is_tarball(url): | ||||
| Raises: | Raises: | ||||
| ArtifactWithoutExtension in case no extension is available | ArtifactWithoutExtension in case no extension is available | ||||
| """ | """ | ||||
| urlparsed = urlparse(url) | urlparsed = urlparse(url) | ||||
| if urlparsed.scheme not in ("http", "https", "ftp"): | if urlparsed.scheme not in ("http", "https", "ftp"): | ||||
| raise ArtifactNatureMistyped(f"Mistyped artifact '{url}'") | raise ArtifactNatureMistyped(f"Mistyped artifact '{url}'") | ||||
| return url_endswith(urlparsed, TARBALL_EXTENSIONS) | |||||
| paths = [ | |||||
| Path(p) for (_, p) in [("_", urlparsed.path)] + parse_qsl(urlparsed.query) | |||||
| ] | |||||
| if not any(path.suffix != "" for path in paths): | |||||
| raise ArtifactWithoutExtension | |||||
| return any(path.suffix.endswith(tuple(TARBALL_EXTENSIONS)) for path in paths) | |||||
| index = random.randrange(len(urls)) | index = random.randrange(len(urls)) | ||||
| url = urls[index] | url = urls[index] | ||||
| try: | try: | ||||
| return _is_tarball(url), urls[0] | return _is_tarball(url), urls[0] | ||||
| except ArtifactWithoutExtension: | except ArtifactWithoutExtension: | ||||
| if request is None: | if request is None: | ||||
| ▲ Show 20 Lines • Show All 67 Lines • ▼ Show 20 Lines | class NixGuixLister(StatelessLister[PageResult]): | ||||
| Note that no `last_update` is available in either manifest. | Note that no `last_update` is available in either manifest. | ||||
| For `url` types artifacts, this tries to determine the artifact's nature, tarball or | For `url` types artifacts, this tries to determine the artifact's nature, tarball or | ||||
| file. It first tries to compute out of the "url" extension. In case of no extension, | file. It first tries to compute out of the "url" extension. In case of no extension, | ||||
| it fallbacks to query (HEAD) the url to retrieve the origin out of the `Location` | it fallbacks to query (HEAD) the url to retrieve the origin out of the `Location` | ||||
| response header, and then checks the extension again. | response header, and then checks the extension again. | ||||
| Optionally, when the `extension_to_ignore` parameter is provided, it extends the | |||||
| default extensions to ignore (`DEFAULT_EXTENSIONS_TO_IGNORE`) with those passed. | |||||
| This can be used to drop further binary files detected in the wild. | |||||
| """ | """ | ||||
| LISTER_NAME = "nixguix" | LISTER_NAME = "nixguix" | ||||
| def __init__( | def __init__( | ||||
| self, | self, | ||||
| scheduler, | scheduler, | ||||
| url: str, | url: str, | ||||
| origin_upstream: str, | origin_upstream: str, | ||||
| instance: Optional[str] = None, | instance: Optional[str] = None, | ||||
| credentials: Optional[CredentialsType] = None, | credentials: Optional[CredentialsType] = None, | ||||
| # canonicalize urls, can be turned off during docker runs | # canonicalize urls, can be turned off during docker runs | ||||
| canonicalize: bool = True, | canonicalize: bool = True, | ||||
| extensions_to_ignore: List[str] = [], | |||||
| **kwargs: Any, | **kwargs: Any, | ||||
| ): | ): | ||||
| super().__init__( | super().__init__( | ||||
| scheduler=scheduler, | scheduler=scheduler, | ||||
| url=url.rstrip("/"), | url=url.rstrip("/"), | ||||
| instance=instance, | instance=instance, | ||||
| credentials=credentials, | credentials=credentials, | ||||
| ) | ) | ||||
| # either full fqdn NixOS/nixpkgs or guix repository urls | # either full fqdn NixOS/nixpkgs or guix repository urls | ||||
| # maybe add an assert on those specific urls? | # maybe add an assert on those specific urls? | ||||
| self.origin_upstream = origin_upstream | self.origin_upstream = origin_upstream | ||||
| self.extensions_to_ignore = DEFAULT_EXTENSIONS_TO_IGNORE + extensions_to_ignore | |||||
Done Inline Actionswhy a copy here ? anlambert: why a copy here ? | |||||
Done Inline Actions¯\_(ツ)_/¯ ardumont: ¯\_(ツ)_/¯ | |||||
| self.session = requests.Session() | self.session = requests.Session() | ||||
| # for testing purposes, we may want to skip this step (e.g. docker run and rate | # for testing purposes, we may want to skip this step (e.g. docker run and rate | ||||
| # limit) | # limit) | ||||
| self.github_session = ( | self.github_session = ( | ||||
| GitHubSession( | GitHubSession( | ||||
| credentials=self.credentials, | credentials=self.credentials, | ||||
| user_agent=str(self.session.headers["User-Agent"]), | user_agent=str(self.session.headers["User-Agent"]), | ||||
| ▲ Show 20 Lines • Show All 148 Lines • ▼ Show 20 Lines | def get_pages(self) -> Iterator[PageResult]: | ||||
| # the output can be anything, including a directory tree. | # the output can be anything, including a directory tree. | ||||
| outputHashMode = artifact.get("outputHashMode", "flat") | outputHashMode = artifact.get("outputHashMode", "flat") | ||||
| if not is_tar and outputHashMode == "recursive": | if not is_tar and outputHashMode == "recursive": | ||||
| # T4608: Cannot deal with those properly yet as some can be missing | # T4608: Cannot deal with those properly yet as some can be missing | ||||
| # 'critical' information about how to recompute the hash (e.g. fs | # 'critical' information about how to recompute the hash (e.g. fs | ||||
| # layout, executable bit, ...) | # layout, executable bit, ...) | ||||
| logger.warning( | logger.warning( | ||||
| "Skipping artifact <%s>: 'file' artifact of type <%s> is " | "Skipping artifact <%s>: 'file' artifact of type <%s> is" | ||||
| " missing information to properly check its integrity", | " missing information to properly check its integrity", | ||||
| artifact, | artifact, | ||||
| artifact_type, | artifact_type, | ||||
| ) | ) | ||||
| continue | continue | ||||
| # At this point plenty of heuristics happened and we should have found | |||||
| # the right origin and its nature. | |||||
| # Let's check and filter it out if it is to be ignored (if possible). | |||||
| # Some origin urls may not have extension at this point (e.g | |||||
| # http://git.marmaro.de/?p=mmh;a=snp;h=<id>;sf=tgz), let them through. | |||||
| if url_endswith( | |||||
| urlparse(origin), | |||||
| self.extensions_to_ignore, | |||||
| raise_when_no_extension=False, | |||||
| ): | |||||
| logger.warning( | |||||
| "Skipping artifact <%s>: 'file' artifact of type <%s> is" | |||||
| " ignored due to lister configuration. It should ignore" | |||||
| " origins with extension [%s]", | |||||
| origin, | |||||
| artifact_type, | |||||
| ",".join(self.extensions_to_ignore), | |||||
| ) | |||||
| continue | |||||
| logger.debug("%s: %s", "dir" if is_tar else "cnt", origin) | logger.debug("%s: %s", "dir" if is_tar else "cnt", origin) | ||||
| yield ArtifactType.ARTIFACT, Artifact( | yield ArtifactType.ARTIFACT, Artifact( | ||||
| origin=origin, | origin=origin, | ||||
| fallback_urls=fallback_urls, | fallback_urls=fallback_urls, | ||||
| checksums=checksums, | checksums=checksums, | ||||
| checksums_computation=MAPPING_CHECKSUMS_COMPUTATION[outputHashMode], | checksums_computation=MAPPING_CHECKSUMS_COMPUTATION[outputHashMode], | ||||
| visit_type="directory" if is_tar else "content", | visit_type="directory" if is_tar else "content", | ||||
| ) | ) | ||||
| Show All 38 Lines | |||||
you could merge your comment into a single one, something like: By default, ignore binary files and archives containing binaries