diff --git a/PKG-INFO b/PKG-INFO index 35d33b3..6bbaf3d 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,125 +1,125 @@ Metadata-Version: 2.1 Name: swh.lister -Version: 3.0.0 +Version: 3.0.1 Summary: Software Heritage lister Home-page: https://forge.softwareheritage.org/diffusion/DLSGH/ Author: Software Heritage developers Author-email: swh-devel@inria.fr Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-lister Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-lister/ Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE swh-lister ========== This component from the Software Heritage stack aims to produce listings of software origins and their URLs hosted on various public developer platforms or package managers. As these operations are quite similar, it provides a set of Python modules abstracting common software origins listing behaviors. It also provides several lister implementations, contained in the following Python modules: - `swh.lister.bitbucket` - `swh.lister.cgit` - `swh.lister.cran` - `swh.lister.debian` - `swh.lister.gitea` - `swh.lister.github` - `swh.lister.gitlab` - `swh.lister.gnu` - `swh.lister.golang` - `swh.lister.launchpad` - `swh.lister.maven` - `swh.lister.npm` - `swh.lister.packagist` - `swh.lister.phabricator` - `swh.lister.pypi` - `swh.lister.tuleap` - `swh.lister.gogs` Dependencies ------------ All required dependencies can be found in the `requirements*.txt` files located at the root of the repository. Local deployment ---------------- ## lister configuration Each lister implemented so far by Software Heritage (`bitbucket`, `cgit`, `cran`, `debian`, `gitea`, `github`, `gitlab`, `gnu`, `golang`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`, `maven`) must be configured by following the instructions below (please note that you have to replace `<lister_name>` by one of the lister names introduced above). ### Preparation steps 1. `mkdir ~/.config/swh/` 2. 
create configuration file `~/.config/swh/listers.yml` ### Configuration file sample Minimal configuration, shared by all listers, to add to the file `~/.config/swh/listers.yml`: ```lang=yml scheduler: cls: 'remote' args: url: 'http://localhost:5008/' credentials: {} ``` Note: This expects the scheduler service (port 5008) to run locally ## Executing a lister Once configured, a lister can be executed by using the `swh` CLI tool with the following options and commands: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister <lister_name> [lister_parameters] ``` Examples: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister bitbucket $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister cran $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitea url=https://codeberg.org/api/v1/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitlab url=https://salsa.debian.org/api/v4/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister npm $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister pypi ``` Licensing --------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. diff --git a/swh.lister.egg-info/PKG-INFO b/swh.lister.egg-info/PKG-INFO index 35d33b3..6bbaf3d 100644 --- a/swh.lister.egg-info/PKG-INFO +++ b/swh.lister.egg-info/PKG-INFO @@ -1,125 +1,125 @@ Metadata-Version: 2.1 Name: swh.lister -Version: 3.0.0 +Version: 3.0.1 Summary: Software Heritage lister Home-page: https://forge.softwareheritage.org/diffusion/DLSGH/ Author: Software Heritage developers Author-email: swh-devel@inria.fr Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-lister Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-lister/ Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE swh-lister ========== This component from the Software Heritage stack aims to produce listings of software origins and their URLs hosted on various public developer platforms or package managers. As these operations are quite similar, it provides a set of Python modules abstracting common software origins listing behaviors. 
It also provides several lister implementations, contained in the following Python modules: - `swh.lister.bitbucket` - `swh.lister.cgit` - `swh.lister.cran` - `swh.lister.debian` - `swh.lister.gitea` - `swh.lister.github` - `swh.lister.gitlab` - `swh.lister.gnu` - `swh.lister.golang` - `swh.lister.launchpad` - `swh.lister.maven` - `swh.lister.npm` - `swh.lister.packagist` - `swh.lister.phabricator` - `swh.lister.pypi` - `swh.lister.tuleap` - `swh.lister.gogs` Dependencies ------------ All required dependencies can be found in the `requirements*.txt` files located at the root of the repository. Local deployment ---------------- ## lister configuration Each lister implemented so far by Software Heritage (`bitbucket`, `cgit`, `cran`, `debian`, `gitea`, `github`, `gitlab`, `gnu`, `golang`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`, `maven`) must be configured by following the instructions below (please note that you have to replace `<lister_name>` by one of the lister names introduced above). ### Preparation steps 1. `mkdir ~/.config/swh/` 2. create configuration file `~/.config/swh/listers.yml` ### Configuration file sample Minimal configuration, shared by all listers, to add to the file `~/.config/swh/listers.yml`: ```lang=yml scheduler: cls: 'remote' args: url: 'http://localhost:5008/' credentials: {} ``` Note: This expects the scheduler service (port 5008) to run locally ## Executing a lister Once configured, a lister can be executed by using the `swh` CLI tool with the following options and commands: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister <lister_name> [lister_parameters] ``` Examples: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister bitbucket $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister cran $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitea url=https://codeberg.org/api/v1/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitlab url=https://salsa.debian.org/api/v4/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister npm $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister pypi ``` Licensing --------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. 
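The arch lister change below replaces the fixed `/tmp/archlinux_archive` destination with a per-page `tempfile.TemporaryDirectory()`. A minimal sketch of that pattern, with a hypothetical `fetch_and_extract` helper standing in for the lister's download-and-extract step (not part of the patch): a managed temporary directory avoids collisions between concurrent lister runs and guarantees cleanup even on error.

```python
import tarfile
import tempfile
from pathlib import Path


def fetch_and_extract(content: bytes, filename: str) -> list:
    # Everything written under tmpdir is deleted automatically when the
    # block exits, even on error, and concurrent runs get distinct
    # directories instead of sharing a fixed /tmp path.
    with tempfile.TemporaryDirectory() as tmpdir:
        destination = Path(tmpdir, filename)
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_bytes(content)
        extract_to = Path(tmpdir, "extracted")
        with tarfile.open(destination) as tar:
            tar.extractall(path=extract_to)
        # Process the 'desc' files while the directory still exists.
        return [p.read_text() for p in extract_to.glob("**/desc")]
```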
diff --git a/swh/lister/arch/lister.py b/swh/lister/arch/lister.py index 58e5371..af3a3d8 100644 --- a/swh/lister/arch/lister.py +++ b/swh/lister/arch/lister.py @@ -1,500 +1,501 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information + import datetime import logging from pathlib import Path import re import tarfile +import tempfile from typing import Any, Dict, Iterator, List, Optional from urllib.parse import unquote, urljoin from bs4 import BeautifulSoup import requests from tenacity.before_sleep import before_sleep_log from swh.lister.utils import throttling_retry from swh.model.hashutil import hash_to_hex from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from .. import USER_AGENT from ..pattern import CredentialsType, StatelessLister logger = logging.getLogger(__name__) # Aliasing the page results returned by `get_pages` method from the lister. ArchListerPage = List[Dict[str, Any]] def size_to_bytes(size: str) -> int: """Convert a human readable file size to bytes. The resulting value is an approximation, as the input value is in most cases rounded. Args: size: A string representing a human readable file size (e.g. '500K') Returns: A decimal representation of file size Examples:: >>> size_to_bytes("500") 500 >>> size_to_bytes("1K") 1000 """ units = { "K": 1000, "M": 1000**2, "G": 1000**3, "T": 1000**4, "P": 1000**5, "E": 1000**6, "Z": 1000**7, "Y": 1000**8, } if size.endswith(tuple(units)): v, u = (size[:-1], size[-1]) return int(v) * units[u] else: return int(size) class ArchLister(StatelessLister[ArchListerPage]): """List Arch Linux origins from the 'core', 'extra', and 'community' repositories For 'official' Arch Linux it downloads core.tar.gz, extra.tar.gz and community.tar.gz from https://archive.archlinux.org/repos/last/, extracts them to a temporary directory and then walks through each 'desc' file. Each 'desc' file describes the latest released version of a package and helps build an origin url from which to scrape artifact metadata. For 'arm' Arch Linux it follows the same discovery process, parsing 'desc' files. The main difference is that we can't get existing versions of an arm package because https://archlinuxarm.org does not have an 'archive' website or API. 
""" LISTER_NAME = "arch" VISIT_TYPE = "arch" INSTANCE = "arch" - DESTINATION_PATH = Path("/tmp/archlinux_archive") - ARCH_PACKAGE_URL_PATTERN = "{base_url}/packages/{repo}/{arch}/{pkgname}" ARCH_PACKAGE_VERSIONS_URL_PATTERN = "{base_url}/packages/{pkgname[0]}/{pkgname}" ARCH_PACKAGE_DOWNLOAD_URL_PATTERN = ( "{base_url}/packages/{pkgname[0]}/{pkgname}/{filename}" ) ARCH_API_URL_PATTERN = "{base_url}/packages/{repo}/{arch}/{pkgname}/json" ARM_PACKAGE_URL_PATTERN = "{base_url}/packages/{arch}/{pkgname}" ARM_PACKAGE_DOWNLOAD_URL_PATTERN = "{base_url}/{arch}/{repo}/{filename}" def __init__( self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, flavours: Dict[str, Any] = { "official": { "archs": ["x86_64"], "repos": ["core", "extra", "community"], "base_info_url": "https://archlinux.org", "base_archive_url": "https://archive.archlinux.org", "base_mirror_url": "", "base_api_url": "https://archlinux.org", }, "arm": { "archs": ["armv7h", "aarch64"], "repos": ["core", "extra", "community"], "base_info_url": "https://archlinuxarm.org", "base_archive_url": "", "base_mirror_url": "https://uk.mirror.archlinuxarm.org", "base_api_url": "", }, }, ): super().__init__( scheduler=scheduler, credentials=credentials, url=flavours["official"]["base_info_url"], instance=self.INSTANCE, ) self.flavours = flavours self.session = requests.Session() self.session.headers.update( { "User-Agent": USER_AGENT, } ) @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) def request_get(self, url: str, params: Dict[str, Any]) -> requests.Response: - logger.info("Fetching URL %s with params %s", url, params) + logger.debug("Fetching URL %s with params %s", url, params) response = self.session.get(url, params=params) if response.status_code != 200: logger.warning( "Unexpected HTTP status code %s on %s: %s", response.status_code, response.url, response.content, ) response.raise_for_status() return response def scrap_package_versions( self, name: str, repo: str, base_url: str ) -> List[Dict[str, Any]]: """Given a package 'name' and 'repo', make an http call to origin url and parse its content to get package versions artifacts data. That method is suitable only for 'official' Arch Linux, not 'arm'. 
Args: name: Package name repo: The repository the package belongs to (one of self.repos) Returns: A list of version dicts Example:: [ {"url": "https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz", # noqa: B950 "arch": "x86_64", "repo": "core", "name": "dialog", "version": "1:1.3_20190211-1", "length": 180000, "filename": "dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz", "last_modified": "2019-02-13T08:36:00"}, ] """ url = self.ARCH_PACKAGE_VERSIONS_URL_PATTERN.format( pkgname=name, base_url=base_url ) response = self.request_get(url=url, params={}) soup = BeautifulSoup(response.text, "html.parser") links = soup.find_all("a", href=True) # drop the first link (used to go up one directory) if links[0].attrs["href"] == "../": links.pop(0) versions = [] for link in links: # filename displayed can be cropped if name is too long, get it from href instead filename = unquote(link.attrs["href"]) if filename.endswith((".tar.xz", ".tar.zst")): # Extract arch from filename arch_rex = re.compile( rf"^{re.escape(name)}-(?P<version>.*)-(?P<arch>any|i686|x86_64)" rf"(.pkg.tar.(?:zst|xz))$" ) m = arch_rex.match(filename) if m is None: logger.error( "Can not find a match for architecture in %(filename)s", dict(filename=filename), ) else: arch = m.group("arch") version = m.group("version") # Extract last_modified and an approximate file size raw_text = link.next_sibling raw_text_rex = re.compile( r"^(?P<last_modified>\d+-\w+-\d+ \d\d:\d\d)\s+(?P<size>\w+)$" ) s = raw_text_rex.search(raw_text.strip()) if s is None: logger.error( "Can not find a match for 'last_modified' and/or " "'size' in '%(raw_text)s'", dict(raw_text=raw_text), ) else: assert s.groups() assert len(s.groups()) == 2 last_modified_str, size = s.groups() # format as expected last_modified = datetime.datetime.strptime( last_modified_str, "%d-%b-%Y %H:%M" ).isoformat() length = size_to_bytes(size) # we want bytes # link url is relative, format a canonical one url = self.ARCH_PACKAGE_DOWNLOAD_URL_PATTERN.format( base_url=base_url, pkgname=name, filename=filename ) versions.append( dict( name=name, version=version, repo=repo, arch=arch, filename=filename, url=url, last_modified=last_modified, length=length, ) ) return versions def get_repo_archive(self, url: str, destination_path: Path) -> Path: """Given a url and a destination path, retrieve and extract a .tar.gz archive which contains a 'desc' file for each package. Each .tar.gz archive corresponds to an Arch Linux repo ('core', 'extra', 'community'). Args: url: url of the .tar.gz archive to download destination_path: the path on disk where to extract the archive Returns: a directory Path where the archive has been extracted to. """ res = self.request_get(url=url, params={}) destination_path.parent.mkdir(parents=True, exist_ok=True) destination_path.write_bytes(res.content) extract_to = Path(str(destination_path).split(".tar.gz")[0]) tar = tarfile.open(destination_path) tar.extractall(path=extract_to) tar.close() return extract_to def parse_desc_file( self, path: Path, repo: str, base_url: str, dl_url_fmt: str, ) -> Dict[str, Any]: """Extract package information from a 'desc' file. 
There are subtle differences between parsing 'official' and 'arm' 'desc' files. Args: path: A path to a 'desc' file on disk repo: The repo the package belongs to Returns: A dict of metadata Example:: {'api_url': 'https://archlinux.org/packages/core/x86_64/dialog/json', 'arch': 'x86_64', 'base': 'dialog', 'builddate': '1650081535', 'csize': '203028', 'desc': 'A tool to display dialog boxes from shell scripts', 'filename': 'dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst', 'isize': '483988', 'license': 'LGPL2.1', 'md5sum': '06407c0cb11c50d7bf83d600f2e8107c', 'name': 'dialog', 'packager': 'Evangelos Foutras ', 'pgpsig': 'pgpsig content xxx', 'project_url': 'https://invisible-island.net/dialog/', 'provides': 'libdialog.so=15-64', 'repo': 'core', 'sha256sum': 'ef8c8971f591de7db0f455970ef5d81d5aced1ddf139f963f16f6730b1851fa7', 'url': 'https://archive.archlinux.org/packages/.all/dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst', # noqa: B950 'version': '1:1.3_20220414-1'} """ rex = re.compile(r"^\%(?P<k>\w+)\%\n(?P<v>.*)\n$", re.M) with path.open("rb") as content: parsed = rex.findall(content.read().decode()) data = {entry[0].lower(): entry[1] for entry in parsed} if "url" in data.keys(): data["project_url"] = data["url"] assert data["name"] assert data["filename"] assert data["arch"] data["repo"] = repo data["url"] = urljoin( base_url, dl_url_fmt.format( base_url=base_url, pkgname=data["name"], filename=data["filename"], arch=data["arch"], repo=repo, ), ) assert data["md5sum"] assert data["sha256sum"] data["checksums"] = { "md5sum": hash_to_hex(data["md5sum"]), "sha256sum": hash_to_hex(data["sha256sum"]), } return data def get_pages(self) -> Iterator[ArchListerPage]: """Yield pages of packages, sorted by name in ascending order. Each page is a list of packages belonging to a flavour ('official', 'arm') and a repo ('core', 'extra', 'community') """ for name, flavour in self.flavours.items(): for arch in flavour["archs"]: for repo in flavour["repos"]: yield self._get_repo_page(name, flavour, arch, repo) def _get_repo_page( self, name: str, flavour: Dict[str, Any], arch: str, repo: str ) -> ArchListerPage: - page = [] - if name == "official": - prefix = urljoin(flavour["base_archive_url"], "/repos/last/") - filename = f"{repo}.files.tar.gz" - archive_url = urljoin(prefix, f"{repo}/os/{arch}/{filename}") - destination_path = Path(self.DESTINATION_PATH, arch, filename) - base_url = flavour["base_archive_url"] - dl_url_fmt = self.ARCH_PACKAGE_DOWNLOAD_URL_PATTERN - base_info_url = flavour["base_info_url"] - info_url_fmt = self.ARCH_PACKAGE_URL_PATTERN - elif name == "arm": - filename = f"{repo}.files.tar.gz" - archive_url = urljoin( - flavour["base_mirror_url"], f"{arch}/{repo}/{filename}" + with tempfile.TemporaryDirectory() as tmpdir: + page = [] + if name == "official": + prefix = urljoin(flavour["base_archive_url"], "/repos/last/") + filename = f"{repo}.files.tar.gz" + archive_url = urljoin(prefix, f"{repo}/os/{arch}/{filename}") + destination_path = Path(tmpdir, arch, filename) + base_url = flavour["base_archive_url"] + dl_url_fmt = self.ARCH_PACKAGE_DOWNLOAD_URL_PATTERN + base_info_url = flavour["base_info_url"] + info_url_fmt = self.ARCH_PACKAGE_URL_PATTERN + elif name == "arm": + filename = f"{repo}.files.tar.gz" + archive_url = urljoin( + flavour["base_mirror_url"], f"{arch}/{repo}/{filename}" + ) + destination_path = Path(tmpdir, arch, filename) + base_url = flavour["base_mirror_url"] + dl_url_fmt = self.ARM_PACKAGE_DOWNLOAD_URL_PATTERN + base_info_url = flavour["base_info_url"] + info_url_fmt = 
self.ARM_PACKAGE_URL_PATTERN + + archive = self.get_repo_archive( + url=archive_url, destination_path=destination_path ) - destination_path = Path(self.DESTINATION_PATH, arch, filename) - base_url = flavour["base_mirror_url"] - dl_url_fmt = self.ARM_PACKAGE_DOWNLOAD_URL_PATTERN - base_info_url = flavour["base_info_url"] - info_url_fmt = self.ARM_PACKAGE_URL_PATTERN - - archive = self.get_repo_archive( - url=archive_url, destination_path=destination_path - ) - - assert archive - - packages_desc = list(archive.glob("**/desc")) - logger.debug( - "Processing %(instance)s source packages info from " - "%(flavour)s %(arch)s %(repo)s repository, " - "(%(qty)s packages).", - dict( - instance=self.instance, - flavour=name, - arch=arch, - repo=repo, - qty=len(packages_desc), - ), - ) - for package_desc in packages_desc: - data = self.parse_desc_file( - path=package_desc, - repo=repo, - base_url=base_url, - dl_url_fmt=dl_url_fmt, + assert archive + + packages_desc = list(archive.glob("**/desc")) + logger.debug( + "Processing %(instance)s source packages info from " + "%(flavour)s %(arch)s %(repo)s repository, " + "(%(qty)s packages).", + dict( + instance=self.instance, + flavour=name, + arch=arch, + repo=repo, + qty=len(packages_desc), + ), ) - assert data["builddate"] - last_modified = datetime.datetime.fromtimestamp( - float(data["builddate"]), tz=datetime.timezone.utc - ) + for package_desc in packages_desc: + data = self.parse_desc_file( + path=package_desc, + repo=repo, + base_url=base_url, + dl_url_fmt=dl_url_fmt, + ) - assert data["name"] - assert data["filename"] - assert data["arch"] - url = info_url_fmt.format( - base_url=base_info_url, - pkgname=data["name"], - filename=data["filename"], - repo=repo, - arch=data["arch"], - ) + assert data["builddate"] + last_modified = datetime.datetime.fromtimestamp( + float(data["builddate"]), tz=datetime.timezone.utc + ) - assert data["version"] - if name == "official": - # find all versions of a package scrapping archive - versions = self.scrap_package_versions( - name=data["name"], repo=repo, base_url=base_url + assert data["name"] + assert data["filename"] + assert data["arch"] + url = info_url_fmt.format( + base_url=base_info_url, + pkgname=data["name"], + filename=data["filename"], + repo=repo, + arch=data["arch"], ) - elif name == "arm": - # There is no way to get related versions of a package, - # but 'data' represents the latest released version, - # use it in this case - assert data["builddate"] - assert data["csize"] - assert data["url"] - versions = [ - dict( - name=data["name"], - version=data["version"], - repo=repo, - arch=data["arch"], - filename=data["filename"], - url=data["url"], - last_modified=last_modified.replace(tzinfo=None).isoformat( - timespec="seconds" - ), - length=int(data["csize"]), - ) - ] - package = { - "name": data["name"], - "version": data["version"], - "last_modified": last_modified, - "url": url, - "versions": versions, - "data": data, - } - page.append(package) - return page + assert data["version"] + if name == "official": + # find all versions of a package scraping the archive + versions = self.scrap_package_versions( + name=data["name"], repo=repo, base_url=base_url + ) + elif name == "arm": + # There is no way to get related versions of a package, + # but 'data' represents the latest released version, + # use it in this case + assert data["builddate"] + assert data["csize"] + assert data["url"] + versions = [ + dict( + name=data["name"], + version=data["version"], + repo=repo, + arch=data["arch"], + 
filename=data["filename"], + url=data["url"], + last_modified=last_modified.replace(tzinfo=None).isoformat( + timespec="seconds" + ), + length=int(data["csize"]), + ) + ] + + package = { + "name": data["name"], + "version": data["version"], + "last_modified": last_modified, + "url": url, + "versions": versions, + "data": data, + } + page.append(package) + return page def get_origins_from_page(self, page: ArchListerPage) -> Iterator[ListedOrigin]: """Iterate on all arch pages and yield ListedOrigin instances.""" assert self.lister_obj.id is not None for origin in page: artifacts = [] arch_metadata = [] for version in origin["versions"]: artifacts.append( { "version": version["version"], "filename": version["filename"], "url": version["url"], "length": version["length"], } ) arch_metadata.append( { "version": version["version"], "name": version["name"], "arch": version["arch"], "repo": version["repo"], "last_modified": version["last_modified"], } ) yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=self.VISIT_TYPE, url=origin["url"], last_update=origin["last_modified"], extra_loader_arguments={ "artifacts": artifacts, "arch_metadata": arch_metadata, }, ) diff --git a/swh/lister/golang/lister.py b/swh/lister/golang/lister.py index 0d6b2b9..0a2f141 100644 --- a/swh/lister/golang/lister.py +++ b/swh/lister/golang/lister.py @@ -1,188 +1,188 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import dataclass from datetime import datetime import json import logging from typing import Any, Dict, Iterator, List, Optional, Tuple import iso8601 import requests from tenacity import before_sleep_log from swh.lister.utils import retry_policy_generic, throttling_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from .. import USER_AGENT from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) @dataclass class GolangStateType: last_seen: Optional[datetime] = None """Last timestamp of a package version we have saved. Used as a starting point for an incremental listing.""" GolangPageType = List[Dict[str, Any]] class GolangLister(Lister[GolangStateType, GolangPageType]): """ List all Golang modules and send associated origins to the scheduler. The lister queries the Golang module index, whose documentation can be found at https://index.golang.org """ GOLANG_MODULES_INDEX_URL = "https://index.golang.org/index" # `limit` seems to be... limited to 2000. 
GOLANG_MODULES_INDEX_LIMIT = 2000 - LISTER_NAME = "Golang" + LISTER_NAME = "golang" def __init__( self, scheduler: SchedulerInterface, incremental: bool = False, credentials: CredentialsType = None, ): super().__init__( scheduler=scheduler, url=self.GOLANG_MODULES_INDEX_URL, - instance="Golang", + instance=self.LISTER_NAME, credentials=credentials, ) self.session = requests.Session() self.session.headers.update( {"Accept": "application/json", "User-Agent": USER_AGENT} ) self.incremental = incremental def state_from_dict(self, d: Dict[str, Any]) -> GolangStateType: as_string = d.get("last_seen") last_seen = iso8601.parse_date(as_string) if as_string is not None else None return GolangStateType(last_seen=last_seen) def state_to_dict(self, state: GolangStateType) -> Dict[str, Any]: return { "last_seen": state.last_seen.isoformat() if state.last_seen is not None else None } def finalize(self): if self.incremental and self.state.last_seen is not None: scheduler_state = self.get_state_from_scheduler() if ( scheduler_state.last_seen is None or self.state.last_seen > scheduler_state.last_seen ): self.updated = True @throttling_retry( retry=retry_policy_generic, before_sleep=before_sleep_log(logger, logging.WARNING), ) def api_request(self, url: str) -> List[str]: logger.debug("Fetching URL %s", url) response = self.session.get(url) if response.status_code not in (200, 304): # Log response content to ease debugging logger.warning( "Unexpected HTTP status code %s for URL %s", response.status_code, response.url, ) response.raise_for_status() return response.text.split() def get_single_page( self, since: Optional[datetime] = None ) -> Tuple[GolangPageType, Optional[datetime]]: """Return a page from the API and the timestamp of its last entry. Since all entries are sorted in chronological order, the timestamp is useful both for pagination and later for incremental runs.""" url = f"{self.url}?limit={self.GOLANG_MODULES_INDEX_LIMIT}" if since is not None: # The Golang index does not understand `+00:00` for some reason # and expects the "timezone zero" notation instead. This works # because all times are UTC. utc_offset = since.utcoffset() assert ( utc_offset is not None and utc_offset.total_seconds() == 0 ), "Non-UTC datetime" as_date = since.isoformat().replace("+00:00", "Z") url = f"{url}&since={as_date}" entries = self.api_request(url) page: GolangPageType = [] if not entries: return page, since for as_json in entries: entry = json.loads(as_json) timestamp = iso8601.parse_date(entry["Timestamp"]) # We've already parsed it and we'll need the datetime later, save it entry["Timestamp"] = timestamp page.append(entry) # The index is guaranteed to be sorted in chronological order since = timestamp return page, since def get_pages(self) -> Iterator[GolangPageType]: since = None if self.incremental: since = self.state.last_seen page, since = self.get_single_page(since=since) if since == self.state.last_seen: # The index returns packages whose timestamps are greater than or # equal to the date provided as a parameter, which will create # an infinite loop if not stopped here. return [], since if since is not None: self.state.last_seen = since while page: yield page page, since = self.get_single_page(since=since) if since == self.state.last_seen: return [], since if since is not None: self.state.last_seen = since def get_origins_from_page(self, page: GolangPageType) -> Iterator[ListedOrigin]: """ Iterate on all Golang projects and yield ListedOrigin instances. 
""" assert self.lister_obj.id is not None for module in page: path = module["Path"] # The loader will be expected to use the golang proxy to do the # actual downloading. We're using `pkg.go.dev` so that the URL points # to somewhere useful for a human instead of an (incomplete) API path. origin_url = f"https://pkg.go.dev/{path}" # Since the Go index lists versions and not just packages, there will # be duplicates. Fortunately, `ListedOrigins` are "upserted" server-side, # so only the last timestamp will be used, with no duplicates. # Performance should not be an issue as they are sent to the db in bulk. yield ListedOrigin( lister_id=self.lister_obj.id, url=origin_url, visit_type="golang", last_update=module["Timestamp"], ) diff --git a/swh/lister/pubdev/lister.py b/swh/lister/pubdev/lister.py index 8abb582..a17ad0e 100644 --- a/swh/lister/pubdev/lister.py +++ b/swh/lister/pubdev/lister.py @@ -1,125 +1,125 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging from typing import Any, Dict, Iterator, List, Optional import iso8601 import requests from requests.exceptions import HTTPError from tenacity.before_sleep import before_sleep_log from swh.lister.utils import throttling_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from .. import __version__ from ..pattern import CredentialsType, StatelessLister # https://github.com/dart-lang/pub/blob/master/doc/repository-spec-v2.md#metadata-headers USER_AGENT = ( f"Software Heritage PubDev Lister v{__version__} " "(+https://www.softwareheritage.org/contact)" ) logger = logging.getLogger(__name__) # Aliasing the page results returned by `get_pages` method from the lister. PubDevListerPage = List[str] class PubDevLister(StatelessLister[PubDevListerPage]): """List pub.dev (Dart, Flutter) origins.""" LISTER_NAME = "pubdev" VISIT_TYPE = "pubdev" INSTANCE = "pubdev" BASE_URL = "https://pub.dev/" PACKAGE_NAMES_URL_PATTERN = "{base_url}api/package-names" PACKAGE_INFO_URL_PATTERN = "{base_url}api/packages/{pkgname}" ORIGIN_URL_PATTERN = "{base_url}packages/{pkgname}" def __init__( self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, ): super().__init__( scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, url=self.BASE_URL, ) self.session = requests.Session() self.session.headers.update( { "Accept": "application/json", "User-Agent": USER_AGENT, } ) @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response: - logger.info("Fetching URL %s with params %s", url, params) + logger.debug("Fetching URL %s with params %s", url, params) response = self.session.get(url, params=params) if response.status_code != 200: logger.warning( "Unexpected HTTP status code %s on %s: %s", response.status_code, response.url, response.content, ) response.raise_for_status() return response def get_pages(self) -> Iterator[PubDevListerPage]: """Yield an iterator which returns 'page' It uses the api provided by https://pub.dev/api/ to find Dart and Flutter package origins. The http api call get "{base_url}package-names" to retrieve a sorted list of all package names. 
There is only one page, which lists all origin URLs based on "{base_url}packages/{pkgname}" """ response = self.page_request( url=self.PACKAGE_NAMES_URL_PATTERN.format(base_url=self.url), params={} ) yield response.json()["packages"] def get_origins_from_page(self, page: PubDevListerPage) -> Iterator[ListedOrigin]: """Iterate on all pages and yield ListedOrigin instances.""" assert self.lister_obj.id is not None for pkgname in page: package_info_url = self.PACKAGE_INFO_URL_PATTERN.format( base_url=self.url, pkgname=pkgname ) try: response = self.page_request(url=package_info_url, params={}) except HTTPError: logger.warning( "Failed to fetch metadata for package %s, skipping it from listing.", pkgname, ) continue package_metadata = response.json() package_versions = package_metadata["versions"] last_published = max( package_version["published"] for package_version in package_versions ) origin_url = self.ORIGIN_URL_PATTERN.format( base_url=self.url, pkgname=pkgname ) yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=self.VISIT_TYPE, url=origin_url, last_update=iso8601.parse_date(last_published), )
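For reference, a short sketch of how the pubdev lister's `last_update` is derived above (the sample metadata below is made up): ISO-8601 UTC timestamps in a uniform format compare chronologically as plain strings, so `max()` over the raw `published` values selects the latest release before any parsing happens.

```python
import iso8601

# Made-up excerpt of a pub.dev package metadata response.
package_metadata = {
    "versions": [
        {"version": "1.0.0", "published": "2021-03-01T10:00:00.000000Z"},
        {"version": "1.1.0", "published": "2022-05-17T08:30:00.000000Z"},
    ]
}

# Lexicographic max equals chronological max for uniformly formatted
# UTC timestamps, so parsing is only needed once, at the very end.
last_published = max(
    version["published"] for version in package_metadata["versions"]
)
print(iso8601.parse_date(last_published))  # 2022-05-17 08:30:00+00:00
```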