diff --git a/CONTRIBUTORS b/CONTRIBUTORS index e7f5577..e10aa6b 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -1,8 +1,10 @@ Archit Agrawal Avi Kelman (fiendish) Léni Gauffier Yann Gautier Sushant Sushant Hezekiah Maina Boris Baldassari Léo Andrès +Franck Bret +Kumar Shivendu diff --git a/PKG-INFO b/PKG-INFO index 4c7c008..35d33b3 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,123 +1,125 @@ Metadata-Version: 2.1 Name: swh.lister -Version: 2.9.3 +Version: 3.0.0 Summary: Software Heritage lister Home-page: https://forge.softwareheritage.org/diffusion/DLSGH/ Author: Software Heritage developers Author-email: swh-devel@inria.fr Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-lister Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-lister/ Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE swh-lister ========== This component from the Software Heritage stack aims to produce listings of software origins and their urls hosted on various public developer platforms or package managers. As these operations are quite similar, it provides a set of Python modules abstracting common software origins listing behaviors. 
It also provides several lister implementations, contained in the following Python modules: - `swh.lister.bitbucket` - `swh.lister.cgit` - `swh.lister.cran` - `swh.lister.debian` - `swh.lister.gitea` - `swh.lister.github` - `swh.lister.gitlab` - `swh.lister.gnu` +- `swh.lister.golang` - `swh.lister.launchpad` - `swh.lister.maven` - `swh.lister.npm` - `swh.lister.packagist` - `swh.lister.phabricator` - `swh.lister.pypi` - `swh.lister.tuleap` +- `swh.lister.gogs` Dependencies ------------ All required dependencies can be found in the `requirements*.txt` files located at the root of the repository. Local deployment ---------------- ## lister configuration Each lister implemented so far by Software Heritage (`bitbucket`, `cgit`, `cran`, `debian`, -`gitea`, `github`, `gitlab`, `gnu`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`, `maven`) +`gitea`, `github`, `gitlab`, `gnu`, `golang`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`, `maven`) must be configured by following the instructions below (please note that you have to replace `` by one of the lister name introduced above). ### Preparation steps 1. `mkdir ~/.config/swh/` 2. 
create configuration file `~/.config/swh/listers.yml` ### Configuration file sample Minimalistic configuration shared by all listers to add in file `~/.config/swh/listers.yml`: ```lang=yml scheduler: cls: 'remote' args: url: 'http://localhost:5008/' credentials: {} ``` Note: This expects scheduler (5008) service to run locally ## Executing a lister Once configured, a lister can be executed by using the `swh` CLI tool with the following options and commands: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister [lister_parameters] ``` Examples: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister bitbucket $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister cran $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitea url=https://codeberg.org/api/v1/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitlab url=https://salsa.debian.org/api/v4/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister npm $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister pypi ``` Licensing --------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. 
diff --git a/README.md b/README.md index ae43250..f54483f 100644 --- a/README.md +++ b/README.md @@ -1,102 +1,104 @@ swh-lister ========== This component from the Software Heritage stack aims to produce listings of software origins and their urls hosted on various public developer platforms or package managers. As these operations are quite similar, it provides a set of Python modules abstracting common software origins listing behaviors. It also provides several lister implementations, contained in the following Python modules: - `swh.lister.bitbucket` - `swh.lister.cgit` - `swh.lister.cran` - `swh.lister.debian` - `swh.lister.gitea` - `swh.lister.github` - `swh.lister.gitlab` - `swh.lister.gnu` +- `swh.lister.golang` - `swh.lister.launchpad` - `swh.lister.maven` - `swh.lister.npm` - `swh.lister.packagist` - `swh.lister.phabricator` - `swh.lister.pypi` - `swh.lister.tuleap` +- `swh.lister.gogs` Dependencies ------------ All required dependencies can be found in the `requirements*.txt` files located at the root of the repository. Local deployment ---------------- ## lister configuration Each lister implemented so far by Software Heritage (`bitbucket`, `cgit`, `cran`, `debian`, -`gitea`, `github`, `gitlab`, `gnu`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`, `maven`) +`gitea`, `github`, `gitlab`, `gnu`, `golang`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`, `maven`) must be configured by following the instructions below (please note that you have to replace `` by one of the lister name introduced above). ### Preparation steps 1. `mkdir ~/.config/swh/` 2. 
create configuration file `~/.config/swh/listers.yml` ### Configuration file sample Minimalistic configuration shared by all listers to add in file `~/.config/swh/listers.yml`: ```lang=yml scheduler: cls: 'remote' args: url: 'http://localhost:5008/' credentials: {} ``` Note: This expects scheduler (5008) service to run locally ## Executing a lister Once configured, a lister can be executed by using the `swh` CLI tool with the following options and commands: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister [lister_parameters] ``` Examples: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister bitbucket $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister cran $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitea url=https://codeberg.org/api/v1/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitlab url=https://salsa.debian.org/api/v4/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister npm $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister pypi ``` Licensing --------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. 
diff --git a/mypy.ini b/mypy.ini index eb2343b..42c58d8 100644 --- a/mypy.ini +++ b/mypy.ini @@ -1,45 +1,44 @@ [mypy] namespace_packages = True warn_unused_ignores = True # 3rd party libraries without stubs (yet) [mypy-bs4.*] ignore_missing_imports = True [mypy-celery.*] ignore_missing_imports = True [mypy-debian.*] ignore_missing_imports = True [mypy-iso8601.*] ignore_missing_imports = True [mypy-launchpadlib.*] ignore_missing_imports = True [mypy-lazr.*] ignore_missing_imports = True [mypy-lxml.*] ignore_missing_imports = True [mypy-pkg_resources.*] ignore_missing_imports = True [mypy-pytest.*] ignore_missing_imports = True [mypy-pytest_postgresql.*] ignore_missing_imports = True [mypy-requests_mock.*] ignore_missing_imports = True [mypy-urllib3.util.*] ignore_missing_imports = True -[mypy-xmltodict.*] +[mypy-dulwich.*] ignore_missing_imports = True - diff --git a/requirements.txt b/requirements.txt index ea5ee0f..a909c6d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,9 @@ python_debian requests setuptools iso8601 beautifulsoup4 launchpadlib tenacity >= 6.2 -xmltodict lxml +dulwich diff --git a/setup.py b/setup.py index 6a374fd..8d3d7dd 100755 --- a/setup.py +++ b/setup.py @@ -1,90 +1,96 @@ #!/usr/bin/env python3 # Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from io import open from os import path from setuptools import find_packages, setup here = path.abspath(path.dirname(__file__)) # Get the long description from the README file with open(path.join(here, "README.md"), encoding="utf-8") as f: long_description = f.read() def parse_requirements(name=None): if name: reqf = "requirements-%s.txt" % name else: reqf = "requirements.txt" requirements = [] if not path.exists(reqf): return requirements with open(reqf) as f: for line in f.readlines(): line 
= line.strip() if not line or line.startswith("#"): continue requirements.append(line) return requirements setup( name="swh.lister", description="Software Heritage lister", long_description=long_description, long_description_content_type="text/markdown", python_requires=">=3.7", author="Software Heritage developers", author_email="swh-devel@inria.fr", url="https://forge.softwareheritage.org/diffusion/DLSGH/", packages=find_packages(), install_requires=parse_requirements() + parse_requirements("swh"), tests_require=parse_requirements("test"), setup_requires=["setuptools-scm"], extras_require={"testing": parse_requirements("test")}, use_scm_version=True, include_package_data=True, entry_points=""" [swh.cli.subcommands] lister=swh.lister.cli [swh.workers] + lister.arch=swh.lister.arch:register + lister.aur=swh.lister.aur:register lister.bitbucket=swh.lister.bitbucket:register + lister.bower=swh.lister.bower:register lister.cgit=swh.lister.cgit:register lister.cran=swh.lister.cran:register lister.crates=swh.lister.crates:register lister.debian=swh.lister.debian:register lister.gitea=swh.lister.gitea:register lister.github=swh.lister.github:register lister.gitlab=swh.lister.gitlab:register lister.gnu=swh.lister.gnu:register + lister.golang=swh.lister.golang:register lister.launchpad=swh.lister.launchpad:register lister.npm=swh.lister.npm:register lister.opam=swh.lister.opam:register lister.packagist=swh.lister.packagist:register lister.phabricator=swh.lister.phabricator:register + lister.pubdev=swh.lister.pubdev:register lister.pypi=swh.lister.pypi:register lister.sourceforge=swh.lister.sourceforge:register lister.tuleap=swh.lister.tuleap:register lister.maven=swh.lister.maven:register + lister.gogs=swh.lister.gogs:register """, classifiers=[ "Programming Language :: Python :: 3", "Intended Audience :: Developers", "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", "Operating System :: OS Independent", "Development Status :: 5 - Production/Stable", ], 
project_urls={ "Bug Reports": "https://forge.softwareheritage.org/maniphest", "Funding": "https://www.softwareheritage.org/donate", "Source": "https://forge.softwareheritage.org/source/swh-lister", "Documentation": "https://docs.softwareheritage.org/devel/swh-lister/", }, ) diff --git a/swh.lister.egg-info/PKG-INFO b/swh.lister.egg-info/PKG-INFO index 4c7c008..35d33b3 100644 --- a/swh.lister.egg-info/PKG-INFO +++ b/swh.lister.egg-info/PKG-INFO @@ -1,123 +1,125 @@ Metadata-Version: 2.1 Name: swh.lister -Version: 2.9.3 +Version: 3.0.0 Summary: Software Heritage lister Home-page: https://forge.softwareheritage.org/diffusion/DLSGH/ Author: Software Heritage developers Author-email: swh-devel@inria.fr Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-lister Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-lister/ Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE swh-lister ========== This component from the Software Heritage stack aims to produce listings of software origins and their urls hosted on various public developer platforms or package managers. As these operations are quite similar, it provides a set of Python modules abstracting common software origins listing behaviors. 
It also provides several lister implementations, contained in the following Python modules: - `swh.lister.bitbucket` - `swh.lister.cgit` - `swh.lister.cran` - `swh.lister.debian` - `swh.lister.gitea` - `swh.lister.github` - `swh.lister.gitlab` - `swh.lister.gnu` +- `swh.lister.golang` - `swh.lister.launchpad` - `swh.lister.maven` - `swh.lister.npm` - `swh.lister.packagist` - `swh.lister.phabricator` - `swh.lister.pypi` - `swh.lister.tuleap` +- `swh.lister.gogs` Dependencies ------------ All required dependencies can be found in the `requirements*.txt` files located at the root of the repository. Local deployment ---------------- ## lister configuration Each lister implemented so far by Software Heritage (`bitbucket`, `cgit`, `cran`, `debian`, -`gitea`, `github`, `gitlab`, `gnu`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`, `maven`) +`gitea`, `github`, `gitlab`, `gnu`, `golang`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`, `maven`) must be configured by following the instructions below (please note that you have to replace `` by one of the lister name introduced above). ### Preparation steps 1. `mkdir ~/.config/swh/` 2. 
create configuration file `~/.config/swh/listers.yml` ### Configuration file sample Minimalistic configuration shared by all listers to add in file `~/.config/swh/listers.yml`: ```lang=yml scheduler: cls: 'remote' args: url: 'http://localhost:5008/' credentials: {} ``` Note: This expects scheduler (5008) service to run locally ## Executing a lister Once configured, a lister can be executed by using the `swh` CLI tool with the following options and commands: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister [lister_parameters] ``` Examples: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister bitbucket $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister cran $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitea url=https://codeberg.org/api/v1/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitlab url=https://salsa.debian.org/api/v4/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister npm $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister pypi ``` Licensing --------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. 
diff --git a/swh.lister.egg-info/SOURCES.txt b/swh.lister.egg-info/SOURCES.txt index 77b24f0..dad21b0 100644 --- a/swh.lister.egg-info/SOURCES.txt +++ b/swh.lister.egg-info/SOURCES.txt @@ -1,265 +1,331 @@ .git-blame-ignore-revs .gitignore .pre-commit-config.yaml ACKNOWLEDGEMENTS CODE_OF_CONDUCT.md CONTRIBUTORS LICENSE MANIFEST.in Makefile README.md conftest.py mypy.ini pyproject.toml pytest.ini requirements-swh.txt requirements-test.txt requirements.txt setup.cfg setup.py tox.ini docs/.gitignore docs/Makefile docs/cli.rst docs/conf.py docs/index.rst docs/new_lister_template.py docs/run_a_new_lister.rst docs/save_forge.rst docs/tutorial.rst docs/_static/.placeholder docs/_templates/.placeholder docs/images/new_base.png docs/images/new_bitbucket_lister.png docs/images/new_github_lister.png docs/images/old_github_lister.png sql/crawler.sql sql/pimp_db.sql swh/__init__.py swh.lister.egg-info/PKG-INFO swh.lister.egg-info/SOURCES.txt swh.lister.egg-info/dependency_links.txt swh.lister.egg-info/entry_points.txt swh.lister.egg-info/requires.txt swh.lister.egg-info/top_level.txt swh/lister/__init__.py swh/lister/cli.py swh/lister/pattern.py swh/lister/py.typed swh/lister/utils.py +swh/lister/arch/__init__.py +swh/lister/arch/lister.py +swh/lister/arch/tasks.py +swh/lister/arch/tests/__init__.py +swh/lister/arch/tests/test_lister.py +swh/lister/arch/tests/test_tasks.py +swh/lister/arch/tests/data/fake_archlinux_archives_init.sh +swh/lister/arch/tests/data/https_archive.archlinux.org/packages_d_dialog +swh/lister/arch/tests/data/https_archive.archlinux.org/packages_g_gnome-code-assistance +swh/lister/arch/tests/data/https_archive.archlinux.org/packages_g_gzip +swh/lister/arch/tests/data/https_archive.archlinux.org/packages_l_libasyncns +swh/lister/arch/tests/data/https_archive.archlinux.org/packages_m_mercurial +swh/lister/arch/tests/data/https_archive.archlinux.org/packages_p_python-hglib 
+swh/lister/arch/tests/data/https_archive.archlinux.org/repos_last_community_os_x86_64_community.files.tar.gz +swh/lister/arch/tests/data/https_archive.archlinux.org/repos_last_core_os_x86_64_core.files.tar.gz +swh/lister/arch/tests/data/https_archive.archlinux.org/repos_last_extra_os_x86_64_extra.files.tar.gz +swh/lister/arch/tests/data/https_uk.mirror.archlinuxarm.org/aarch64_community_community.files.tar.gz +swh/lister/arch/tests/data/https_uk.mirror.archlinuxarm.org/aarch64_core_core.files.tar.gz +swh/lister/arch/tests/data/https_uk.mirror.archlinuxarm.org/aarch64_extra_extra.files.tar.gz +swh/lister/arch/tests/data/https_uk.mirror.archlinuxarm.org/armv7h_community_community.files.tar.gz +swh/lister/arch/tests/data/https_uk.mirror.archlinuxarm.org/armv7h_core_core.files.tar.gz +swh/lister/arch/tests/data/https_uk.mirror.archlinuxarm.org/armv7h_extra_extra.files.tar.gz +swh/lister/aur/__init__.py +swh/lister/aur/lister.py +swh/lister/aur/tasks.py +swh/lister/aur/tests/__init__.py +swh/lister/aur/tests/test_lister.py +swh/lister/aur/tests/test_tasks.py +swh/lister/aur/tests/data/fake_aur_packages.sh +swh/lister/aur/tests/data/packages-meta-v1.json.gz swh/lister/bitbucket/__init__.py swh/lister/bitbucket/lister.py swh/lister/bitbucket/tasks.py swh/lister/bitbucket/tests/__init__.py swh/lister/bitbucket/tests/test_lister.py swh/lister/bitbucket/tests/test_tasks.py swh/lister/bitbucket/tests/data/bb_api_repositories_page1.json swh/lister/bitbucket/tests/data/bb_api_repositories_page2.json +swh/lister/bower/__init__.py +swh/lister/bower/lister.py +swh/lister/bower/tasks.py +swh/lister/bower/tests/__init__.py +swh/lister/bower/tests/test_lister.py +swh/lister/bower/tests/test_tasks.py +swh/lister/bower/tests/data/https_registry.bower.io/packages swh/lister/cgit/__init__.py swh/lister/cgit/lister.py swh/lister/cgit/tasks.py swh/lister/cgit/tests/__init__.py swh/lister/cgit/tests/repo_list.txt swh/lister/cgit/tests/test_lister.py swh/lister/cgit/tests/test_tasks.py 
swh/lister/cgit/tests/data/https_git.baserock.org/cgit swh/lister/cgit/tests/data/https_git.eclipse.org/c swh/lister/cgit/tests/data/https_git.savannah.gnu.org/README swh/lister/cgit/tests/data/https_git.savannah.gnu.org/cgit swh/lister/cgit/tests/data/https_git.savannah.gnu.org/cgit_elisp-es.git swh/lister/cgit/tests/data/https_git.tizen/README swh/lister/cgit/tests/data/https_git.tizen/cgit swh/lister/cgit/tests/data/https_git.tizen/cgit,ofs=100 swh/lister/cgit/tests/data/https_git.tizen/cgit,ofs=50 swh/lister/cgit/tests/data/https_git.tizen/cgit_All-Projects swh/lister/cgit/tests/data/https_git.tizen/cgit_All-Users swh/lister/cgit/tests/data/https_git.tizen/cgit_Lock-Projects swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_alsa-scenario-scn-data-0-base swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_alsa-scenario-scn-data-0-mc1n2 swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_ap_samsung_audio-hal-e3250 swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_ap_samsung_audio-hal-e4x12 swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_devices_nfc-plugin-nxp swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_intel_mfld_bootstub-mfld-blackbay swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_mtdev swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_opengl-es-virtual-drv swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_panda_libdrm swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_panda_libnl swh/lister/cgit/tests/data/https_git.tizen/cgit_adaptation_xorg_driver_xserver-xorg-misc swh/lister/cgit/tests/data/https_git.tizen/cgit_apps_core_preloaded_ug-setting-gallery-efl swh/lister/cgit/tests/data/https_git.tizen/cgit_apps_core_preloaded_ug-setting-homescreen-efl swh/lister/cgit/tests/data/https_jff.email/cgit swh/lister/cran/__init__.py swh/lister/cran/list_all_packages.R swh/lister/cran/lister.py swh/lister/cran/tasks.py swh/lister/cran/tests/__init__.py 
swh/lister/cran/tests/test_lister.py swh/lister/cran/tests/test_tasks.py swh/lister/cran/tests/data/list-r-packages.json swh/lister/crates/__init__.py swh/lister/crates/lister.py swh/lister/crates/tasks.py swh/lister/crates/tests/__init__.py swh/lister/crates/tests/test_lister.py swh/lister/crates/tests/test_tasks.py swh/lister/crates/tests/data/fake-crates-repository.tar.gz swh/lister/crates/tests/data/fake_crates_repository_init.sh swh/lister/debian/__init__.py swh/lister/debian/lister.py swh/lister/debian/tasks.py swh/lister/debian/tests/__init__.py swh/lister/debian/tests/test_lister.py swh/lister/debian/tests/test_tasks.py swh/lister/debian/tests/data/Sources_bullseye swh/lister/debian/tests/data/Sources_buster swh/lister/debian/tests/data/Sources_stretch swh/lister/gitea/__init__.py swh/lister/gitea/lister.py swh/lister/gitea/tasks.py swh/lister/gitea/tests/__init__.py swh/lister/gitea/tests/test_lister.py swh/lister/gitea/tests/test_tasks.py swh/lister/gitea/tests/data/https_try.gitea.io/repos_page1 swh/lister/gitea/tests/data/https_try.gitea.io/repos_page2 swh/lister/github/__init__.py swh/lister/github/lister.py swh/lister/github/tasks.py swh/lister/github/utils.py swh/lister/github/tests/__init__.py swh/lister/github/tests/test_lister.py swh/lister/github/tests/test_tasks.py swh/lister/gitlab/__init__.py swh/lister/gitlab/lister.py swh/lister/gitlab/tasks.py swh/lister/gitlab/tests/__init__.py swh/lister/gitlab/tests/test_lister.py swh/lister/gitlab/tests/test_tasks.py swh/lister/gitlab/tests/data/https_foss.heptapod.net/api_response_page1.json swh/lister/gitlab/tests/data/https_gite.lirmm.fr/api_response_page1.json swh/lister/gitlab/tests/data/https_gite.lirmm.fr/api_response_page2.json swh/lister/gitlab/tests/data/https_gite.lirmm.fr/api_response_page3.json swh/lister/gitlab/tests/data/https_gitlab.com/api_response_page1.json swh/lister/gnu/__init__.py swh/lister/gnu/lister.py swh/lister/gnu/tasks.py swh/lister/gnu/tree.py 
swh/lister/gnu/tests/__init__.py swh/lister/gnu/tests/test_lister.py swh/lister/gnu/tests/test_tasks.py swh/lister/gnu/tests/test_tree.py swh/lister/gnu/tests/data/tree.json swh/lister/gnu/tests/data/tree.min.json swh/lister/gnu/tests/data/https_ftp.gnu.org/tree.json.gz +swh/lister/gogs/__init__.py +swh/lister/gogs/lister.py +swh/lister/gogs/tasks.py +swh/lister/gogs/tests/__init__.py +swh/lister/gogs/tests/test_lister.py +swh/lister/gogs/tests/test_tasks.py +swh/lister/gogs/tests/data/https_try.gogs.io/repos_page1 +swh/lister/gogs/tests/data/https_try.gogs.io/repos_page2 +swh/lister/gogs/tests/data/https_try.gogs.io/repos_page3 +swh/lister/gogs/tests/data/https_try.gogs.io/repos_page4 +swh/lister/golang/__init__.py +swh/lister/golang/lister.py +swh/lister/golang/tasks.py +swh/lister/golang/tests/__init__.py +swh/lister/golang/tests/test_lister.py +swh/lister/golang/tests/test_tasks.py +swh/lister/golang/tests/data/page-1.txt +swh/lister/golang/tests/data/page-2.txt +swh/lister/golang/tests/data/page-3.txt swh/lister/launchpad/__init__.py swh/lister/launchpad/lister.py swh/lister/launchpad/tasks.py swh/lister/launchpad/tests/__init__.py swh/lister/launchpad/tests/conftest.py swh/lister/launchpad/tests/test_lister.py swh/lister/launchpad/tests/test_tasks.py swh/lister/launchpad/tests/data/launchpad_bzr_response.json swh/lister/launchpad/tests/data/launchpad_response1.json swh/lister/launchpad/tests/data/launchpad_response2.json swh/lister/maven/README.md swh/lister/maven/__init__.py swh/lister/maven/lister.py swh/lister/maven/tasks.py swh/lister/maven/tests/__init__.py swh/lister/maven/tests/test_lister.py swh/lister/maven/tests/test_tasks.py swh/lister/maven/tests/data/http_indexes/export_full.fld swh/lister/maven/tests/data/http_indexes/export_incr_first.fld swh/lister/maven/tests/data/http_indexes/export_null_mtime.fld swh/lister/maven/tests/data/https_maven.org/arangodb-graphql-1.2.pom +swh/lister/maven/tests/data/https_maven.org/citrus-parent-3.0.7.pom 
swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.malformed.pom swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.0.pom swh/lister/maven/tests/data/https_maven.org/sprova4j-0.1.1.pom swh/lister/npm/__init__.py swh/lister/npm/lister.py swh/lister/npm/tasks.py swh/lister/npm/tests/test_lister.py swh/lister/npm/tests/test_tasks.py swh/lister/npm/tests/data/npm_full_page1.json swh/lister/npm/tests/data/npm_full_page2.json swh/lister/npm/tests/data/npm_incremental_page1.json swh/lister/npm/tests/data/npm_incremental_page2.json swh/lister/opam/__init__.py swh/lister/opam/lister.py swh/lister/opam/tasks.py swh/lister/opam/tests/__init__.py swh/lister/opam/tests/test_lister.py swh/lister/opam/tests/test_tasks.py swh/lister/opam/tests/data/fake_opam_repo/repo swh/lister/opam/tests/data/fake_opam_repo/version swh/lister/opam/tests/data/fake_opam_repo/packages/agrid/agrid.0.1/opam swh/lister/opam/tests/data/fake_opam_repo/packages/calculon/calculon.0.1/opam swh/lister/opam/tests/data/fake_opam_repo/packages/calculon/calculon.0.2/opam swh/lister/opam/tests/data/fake_opam_repo/packages/calculon/calculon.0.3/opam swh/lister/opam/tests/data/fake_opam_repo/packages/calculon/calculon.0.4/opam swh/lister/opam/tests/data/fake_opam_repo/packages/calculon/calculon.0.5/opam swh/lister/opam/tests/data/fake_opam_repo/packages/calculon/calculon.0.6/opam swh/lister/opam/tests/data/fake_opam_repo/packages/directories/directories.0.1/opam swh/lister/opam/tests/data/fake_opam_repo/packages/directories/directories.0.2/opam swh/lister/opam/tests/data/fake_opam_repo/packages/directories/directories.0.3/opam swh/lister/opam/tests/data/fake_opam_repo/packages/ocb/ocb.0.1/opam swh/lister/packagist/__init__.py swh/lister/packagist/lister.py swh/lister/packagist/tasks.py swh/lister/packagist/tests/__init__.py swh/lister/packagist/tests/test_lister.py swh/lister/packagist/tests/test_tasks.py swh/lister/packagist/tests/data/den1n_contextmenu.json 
swh/lister/packagist/tests/data/ljjackson_linnworks.json swh/lister/packagist/tests/data/lky_wx_article.json swh/lister/packagist/tests/data/spryker-eco_computop-api.json swh/lister/phabricator/__init__.py swh/lister/phabricator/lister.py swh/lister/phabricator/tasks.py swh/lister/phabricator/tests/__init__.py swh/lister/phabricator/tests/test_lister.py swh/lister/phabricator/tests/test_tasks.py swh/lister/phabricator/tests/data/__init__.py swh/lister/phabricator/tests/data/phabricator_api_repositories_page1.json swh/lister/phabricator/tests/data/phabricator_api_repositories_page2.json +swh/lister/pubdev/__init__.py +swh/lister/pubdev/lister.py +swh/lister/pubdev/tasks.py +swh/lister/pubdev/tests/__init__.py +swh/lister/pubdev/tests/test_lister.py +swh/lister/pubdev/tests/test_tasks.py +swh/lister/pubdev/tests/data/https_pub.dev/api_package-names +swh/lister/pubdev/tests/data/https_pub.dev/api_packages_Autolinker +swh/lister/pubdev/tests/data/https_pub.dev/api_packages_Babylon swh/lister/pypi/__init__.py swh/lister/pypi/lister.py swh/lister/pypi/tasks.py swh/lister/pypi/tests/__init__.py swh/lister/pypi/tests/test_lister.py swh/lister/pypi/tests/test_tasks.py swh/lister/sourceforge/__init__.py swh/lister/sourceforge/lister.py swh/lister/sourceforge/tasks.py swh/lister/sourceforge/tests/__init__.py swh/lister/sourceforge/tests/test_lister.py swh/lister/sourceforge/tests/test_tasks.py swh/lister/sourceforge/tests/data/aaron.html swh/lister/sourceforge/tests/data/aaron.json swh/lister/sourceforge/tests/data/adobexmp.json swh/lister/sourceforge/tests/data/backapps-website.json swh/lister/sourceforge/tests/data/backapps.json swh/lister/sourceforge/tests/data/main-sitemap.xml swh/lister/sourceforge/tests/data/mojunk.json swh/lister/sourceforge/tests/data/mramm.json swh/lister/sourceforge/tests/data/ocaml-lpd.html swh/lister/sourceforge/tests/data/ocaml-lpd.json swh/lister/sourceforge/tests/data/os3dmodels.json swh/lister/sourceforge/tests/data/random-mercurial.json 
swh/lister/sourceforge/tests/data/subsitemap-0.xml swh/lister/sourceforge/tests/data/subsitemap-1.xml swh/lister/sourceforge/tests/data/t12eksandbox.html swh/lister/sourceforge/tests/data/t12eksandbox.json swh/lister/tests/__init__.py swh/lister/tests/test_cli.py swh/lister/tests/test_pattern.py swh/lister/tests/test_utils.py swh/lister/tuleap/__init__.py swh/lister/tuleap/lister.py swh/lister/tuleap/tasks.py swh/lister/tuleap/tests/__init__.py swh/lister/tuleap/tests/test_lister.py swh/lister/tuleap/tests/test_tasks.py swh/lister/tuleap/tests/data/https_tuleap.net/projects swh/lister/tuleap/tests/data/https_tuleap.net/repo_1 swh/lister/tuleap/tests/data/https_tuleap.net/repo_2 swh/lister/tuleap/tests/data/https_tuleap.net/repo_3 \ No newline at end of file diff --git a/swh.lister.egg-info/entry_points.txt b/swh.lister.egg-info/entry_points.txt index 5db8730..38fe44f 100644 --- a/swh.lister.egg-info/entry_points.txt +++ b/swh.lister.egg-info/entry_points.txt @@ -1,22 +1,28 @@ [swh.cli.subcommands] lister = swh.lister.cli [swh.workers] +lister.arch = swh.lister.arch:register +lister.aur = swh.lister.aur:register lister.bitbucket = swh.lister.bitbucket:register +lister.bower = swh.lister.bower:register lister.cgit = swh.lister.cgit:register lister.cran = swh.lister.cran:register lister.crates = swh.lister.crates:register lister.debian = swh.lister.debian:register lister.gitea = swh.lister.gitea:register lister.github = swh.lister.github:register lister.gitlab = swh.lister.gitlab:register lister.gnu = swh.lister.gnu:register +lister.gogs = swh.lister.gogs:register +lister.golang = swh.lister.golang:register lister.launchpad = swh.lister.launchpad:register lister.maven = swh.lister.maven:register lister.npm = swh.lister.npm:register lister.opam = swh.lister.opam:register lister.packagist = swh.lister.packagist:register lister.phabricator = swh.lister.phabricator:register +lister.pubdev = swh.lister.pubdev:register lister.pypi = swh.lister.pypi:register 
lister.sourceforge = swh.lister.sourceforge:register lister.tuleap = swh.lister.tuleap:register diff --git a/swh.lister.egg-info/requires.txt b/swh.lister.egg-info/requires.txt index 5e69dc1..5c598f4 100644 --- a/swh.lister.egg-info/requires.txt +++ b/swh.lister.egg-info/requires.txt @@ -1,19 +1,19 @@ python_debian requests setuptools iso8601 beautifulsoup4 launchpadlib tenacity>=6.2 -xmltodict lxml +dulwich swh.core[db,github]>=2.8 swh.scheduler>=0.8 [testing] pytest pytest-mock requests_mock types-click types-pyyaml types-requests diff --git a/swh/lister/arch/__init__.py b/swh/lister/arch/__init__.py new file mode 100644 index 0000000..276e4d2 --- /dev/null +++ b/swh/lister/arch/__init__.py @@ -0,0 +1,226 @@ +# Copyright (C) 2022 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +""" +Arch Linux lister +================= + +The Arch lister list origins from `archlinux.org`_, the official Arch Linux packages, +and from `archlinuxarm.org`_, the Arch Linux ARM packages, an unofficial port for arm. + +Packages are put in three different repositories, `core`, `extra` and `community`. + +To manage listing those origins, this lister must be instantiated with a `flavours` dict. + +`flavours` default values:: + + "official": { + "archs": ["x86_64"], + "repos": ["core", "extra", "community"], + "base_info_url": "https://archlinux.org", + "base_archive_url": "https://archive.archlinux.org", + "base_mirror_url": "", + "base_api_url": "https://archlinux.org", + }, + "arm": { + "archs": ["armv7h", "aarch64"], + "repos": ["core", "extra", "community"], + "base_info_url": "https://archlinuxarm.org", + "base_archive_url": "", + "base_mirror_url": "https://uk.mirror.archlinuxarm.org", + "base_api_url": "", + } + +From official Arch Linux repositories we can list all packages and all released versions. +They provide an api and archives. 
+ +From Arch Linux ARM repositories we can list all packages at their latest versions; they +do not provide an api or archives. + +As of August 2022 `archlinux.org`_ lists 12592 packages and `archlinuxarm.org`_ 24044 packages. +Please note that those amounts are the total of `regular`_ and `split`_ packages. + +Origins retrieving strategy +--------------------------- + +Download repository archives as tar.gz files from https://archive.archlinux.org/repos/last/, +extract them to a temp directory and then walk through each 'desc' file. +Repository archive index url example for Arch Linux `core repository`_ and Arch +Linux ARM `extra repository`_. + +Each 'desc' file describes the latest released version of a package and helps +to build an origin url and a `package versions url`_ from which to scrape artifacts metadata +and get a list of versions. + +For Arch Linux ARM it follows the same discovery process, parsing 'desc' files. +The main difference is that we can't get existing versions of an arm package +because https://archlinuxarm.org does not have an 'archive' website or api. + +Page listing +------------ + +Each page is a list of packages belonging to a flavour ('official', 'arm'), and a +repo ('core', 'extra', 'community'). + +Each line of a page represents an origin url for a package name with related metadata and versions. 
+ +Origin url examples: + +* **Arch Linux**: https://archlinux.org/packages/extra/x86_64/mercurial +* **Arch Linux ARM**: https://archlinuxarm.org/packages/armv7h/mercurial + +The data schema for each line is: + +* **name**: Package name +* **version**: Last released package version +* **last_modified**: Iso8601 last modified date from timestamp +* **url**: Origin url +* **data**: Package metadata dict +* **versions**: A list of dicts with artifacts metadata for each version + +The data schema for `versions` within a line: + +* **name**: Package name +* **version**: Package version +* **repo**: One of core, extra, community +* **arch**: Processor architecture targeted +* **filename**: Filename of the archive to download +* **url**: Package download url +* **last_modified**: Iso8601 last modified date from timestamp, used as publication date + for this version +* **length**: Length of the archive to download + +Origins from page +----------------- + +The origin url corresponds to: + +* **Arch Linux**: https://archlinux.org/packages/{repo}/{arch}/{name} +* **Arch Linux ARM**: https://archlinuxarm.org/packages/{arch}/{name} + +Additionally we add some data to "extra_loader_arguments": + +* **artifacts**: Represents data about the Arch Linux package archive to download, + following :ref:`original-artifacts-json specification <original-artifacts-json>` +* **arch_metadata**: To store all other interesting attributes that do not belong to artifacts. 
+ +Origin data example Arch Linux official:: + + { + "url": "https://archlinux.org/packages/extra/x86_64/mercurial", + "visit_type": "arch", + "extra_loader_arguments": { + "artifacts": [ + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-4.8.2-1-x86_64.pkg.tar.xz", # noqa: B950 + "version": "4.8.2-1", + "length": 4000000, + "filename": "mercurial-4.8.2-1-x86_64.pkg.tar.xz", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-4.9-1-x86_64.pkg.tar.xz", # noqa: B950 + "version": "4.9-1", + "length": 4000000, + "filename": "mercurial-4.9-1-x86_64.pkg.tar.xz", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-4.9.1-1-x86_64.pkg.tar.xz", # noqa: B950 + "version": "4.9.1-1", + "length": 4000000, + "filename": "mercurial-4.9.1-1-x86_64.pkg.tar.xz", + }, + ... + ], + "arch_metadata": [ + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "4.8.2-1", + "last_modified": "2019-01-15T20:31:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "4.9-1", + "last_modified": "2019-02-12T06:15:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "4.9.1-1", + "last_modified": "2019-03-30T17:40:00", + }, + ], + }, + }, + +Origin data example Arch Linux ARM:: + + { + "url": "https://archlinuxarm.org/packages/armv7h/mercurial", + "visit_type": "arch", + "extra_loader_arguments": { + "artifacts": [ + { + "url": "https://uk.mirror.archlinuxarm.org/armv7h/extra/mercurial-6.1.3-1-armv7h.pkg.tar.xz", # noqa: B950 + "length": 4897816, + "version": "6.1.3-1", + "filename": "mercurial-6.1.3-1-armv7h.pkg.tar.xz", + } + ], + "arch_metadata": [ + { + "arch": "armv7h", + "name": "mercurial", + "repo": "extra", + "version": "6.1.3-1", + "last_modified": "2022-06-02T22:13:08", + } + ], + }, + }, + +Running tests +------------- + +Activate the virtualenv and run from within swh-lister directory:: + + pytest -s -vv 
def register():
    """Return the plugin description of the Arch Linux lister.

    The package exposes this function through the ``lister.arch`` entry of the
    ``swh.workers`` entry-point group.

    Returns:
        A dict with the lister class under ``"lister"`` and, under
        ``"task_modules"``, a one-element list naming this package's ``tasks``
        module.
    """
    # Imported inside the function so that importing the package alone does
    # not load the lister module.
    from .lister import ArchLister

    task_modules = ["%s.tasks" % __name__]
    return {"lister": ArchLister, "task_modules": task_modules}
def size_to_bytes(size: str) -> int:
    """Convert a human readable file size to a number of bytes.

    The result is an approximation, as the input value is in most cases
    already rounded.

    Args:
        size: A string representing a human readable file size (eg: '500K').
            A trailing unit letter among K, M, G, T, P, E, Z, Y is read as a
            power of 1000; without such a letter the whole string is parsed
            as an integer.

    Returns:
        The file size in bytes, as an integer

    Raises:
        ValueError: if the numeric part is not a valid integer literal

    Examples::

        >>> size_to_bytes("500")
        500
        >>> size_to_bytes("1K")
        1000
    """
    # Decimal (SI) multipliers: the n-th letter stands for 1000 ** n.
    multipliers = {
        letter: 1000**power for power, letter in enumerate("KMGTPEZY", start=1)
    }
    suffix = size[-1:]
    if suffix in multipliers:
        return int(size[:-1]) * multipliers[suffix]
    return int(size)
class ArchLister(StatelessLister[ArchListerPage]):
    """List Arch Linux origins from the 'core', 'extra' and 'community' repositories.

    For every configured flavour, architecture and repository, the lister
    downloads the ``{repo}.files.tar.gz`` index, extracts it below
    ``DESTINATION_PATH`` and walks through each 'desc' file it contains.

    Each 'desc' file describes the latest released version of a package and is
    used to build the origin url. For the 'official' flavour the package archive
    directory is additionally scraped to list every released version. For the
    'arm' flavour only the version described by the 'desc' file is listed.
    """

    LISTER_NAME = "arch"
    VISIT_TYPE = "arch"
    INSTANCE = "arch"

    # Directory where repository indexes are downloaded and extracted.
    DESTINATION_PATH = Path("/tmp/archlinux_archive")

    ARCH_PACKAGE_URL_PATTERN = "{base_url}/packages/{repo}/{arch}/{pkgname}"
    ARCH_PACKAGE_VERSIONS_URL_PATTERN = "{base_url}/packages/{pkgname[0]}/{pkgname}"
    ARCH_PACKAGE_DOWNLOAD_URL_PATTERN = (
        "{base_url}/packages/{pkgname[0]}/{pkgname}/{filename}"
    )
    ARCH_API_URL_PATTERN = "{base_url}/packages/{repo}/{arch}/{pkgname}/json"

    ARM_PACKAGE_URL_PATTERN = "{base_url}/packages/{arch}/{pkgname}"
    ARM_PACKAGE_DOWNLOAD_URL_PATTERN = "{base_url}/{arch}/{repo}/{filename}"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        credentials: Optional[CredentialsType] = None,
        flavours: Optional[Dict[str, Any]] = None,
    ):
        """Set up the lister and its HTTP session.

        Args:
            scheduler: scheduler instance forwarded to the base lister
            credentials: optional credentials forwarded to the base lister
            flavours: mapping of flavour name ('official', 'arm') to its
                configuration (keys ``archs``, ``repos``, ``base_info_url``,
                ``base_archive_url``, ``base_mirror_url``, ``base_api_url``).
                When omitted, the official x86_64 and the ARM (armv7h,
                aarch64) flavours are used. An 'official' entry is required,
                as its ``base_info_url`` is used as the lister url.
        """
        if flavours is None:
            # Built on every call rather than used as a default argument value,
            # so that instances never share (and cannot corrupt) one dict.
            flavours = {
                "official": {
                    "archs": ["x86_64"],
                    "repos": ["core", "extra", "community"],
                    "base_info_url": "https://archlinux.org",
                    "base_archive_url": "https://archive.archlinux.org",
                    "base_mirror_url": "",
                    "base_api_url": "https://archlinux.org",
                },
                "arm": {
                    "archs": ["armv7h", "aarch64"],
                    "repos": ["core", "extra", "community"],
                    "base_info_url": "https://archlinuxarm.org",
                    "base_archive_url": "",
                    "base_mirror_url": "https://uk.mirror.archlinuxarm.org",
                    "base_api_url": "",
                },
            }

        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            url=flavours["official"]["base_info_url"],
            instance=self.INSTANCE,
        )

        self.flavours = flavours
        self.session = requests.Session()
        self.session.headers.update(
            {
                "User-Agent": USER_AGENT,
            }
        )

    @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
    def request_get(self, url: str, params: Dict[str, Any]) -> requests.Response:
        """Issue a GET request through the lister session.

        Args:
            url: url to fetch
            params: query parameters to send along

        Returns:
            The HTTP response

        Raises:
            requests.HTTPError: when the response carries an error status
        """
        logger.info("Fetching URL %s with params %s", url, params)

        response = self.session.get(url, params=params)
        if response.status_code != 200:
            logger.warning(
                "Unexpected HTTP status code %s on %s: %s",
                response.status_code,
                response.url,
                response.content,
            )
        # No-op for successful responses, raises for 4xx/5xx ones.
        response.raise_for_status()

        return response

    def scrap_package_versions(
        self, name: str, repo: str, base_url: str
    ) -> List[Dict[str, Any]]:
        """Given a package 'name' and 'repo', fetch the package archive directory
        listing and parse it to get artifacts data for each package version.

        That method is suitable only for 'official' Arch Linux, not 'arm'.

        Args:
            name: Package name
            repo: The repository the package belongs to
            base_url: Base url of the archive website

        Returns:
            A list of dicts, one per version found. Entries whose filename or
            listing text cannot be parsed are logged and skipped.

        Example::

            [
                {"url": "https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz",  # noqa: B950
                 "arch": "x86_64",
                 "repo": "core",
                 "name": "dialog",
                 "version": "1:1.3_20190211-1",
                 "length": 180000,
                 "filename": "dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz",
                 "last_modified": "2019-02-13T08:36:00"},
            ]
        """
        url = self.ARCH_PACKAGE_VERSIONS_URL_PATTERN.format(
            pkgname=name, base_url=base_url
        )
        response = self.request_get(url=url, params={})
        soup = BeautifulSoup(response.text, "html.parser")
        links = soup.find_all("a", href=True)

        # drop the first link (used to go to the parent directory)
        if links and links[0].attrs["href"] == "../":
            links.pop(0)

        # Both patterns are the same for every link: compile them only once.
        arch_rex = re.compile(
            rf"^{re.escape(name)}-(?P<version>.*)-(?P<arch>any|i686|x86_64)"
            rf"(\.pkg\.tar\.(?:zst|xz))$"
        )
        raw_text_rex = re.compile(
            r"^(?P<last_modified>\d+-\w+-\d+ \d\d:\d\d)\s+(?P<size>\w+)$"
        )

        versions = []

        for link in links:
            # filename displayed can be cropped if name is too long, get it from href instead
            filename = unquote(link.attrs["href"])

            if not filename.endswith((".tar.xz", ".tar.zst")):
                continue

            # Extract version and arch from filename
            m = arch_rex.match(filename)
            if m is None:
                logger.error(
                    "Can not find a match for architecture in %(filename)s",
                    dict(filename=filename),
                )
                # Without version and arch there is nothing usable to record.
                continue
            arch = m.group("arch")
            version = m.group("version")

            # Extract last_modified and an approximate file size from the text
            # that follows the link in the listing
            raw_text = link.next_sibling
            s = (
                raw_text_rex.search(raw_text.strip())
                if isinstance(raw_text, str)
                else None
            )
            if s is None:
                logger.error(
                    "Can not find a match for 'last_modified' and/or "
                    "'size' in '%(raw_text)s'",
                    dict(raw_text=raw_text),
                )
                continue

            # format as expected
            last_modified = datetime.datetime.strptime(
                s.group("last_modified"), "%d-%b-%Y %H:%M"
            ).isoformat()

            length = size_to_bytes(s.group("size"))  # we want bytes

            # link url is relative, format a canonical one
            url = self.ARCH_PACKAGE_DOWNLOAD_URL_PATTERN.format(
                base_url=base_url, pkgname=name, filename=filename
            )
            versions.append(
                dict(
                    name=name,
                    version=version,
                    repo=repo,
                    arch=arch,
                    filename=filename,
                    url=url,
                    last_modified=last_modified,
                    length=length,
                )
            )
        return versions

    def get_repo_archive(self, url: str, destination_path: Path) -> Path:
        """Given an url and a destination path, retrieve and extract the .tar.gz
        archive which contains a 'desc' file for each package.

        Each .tar.gz archive corresponds to an Arch Linux repo ('core', 'extra',
        'community').

        Args:
            url: url of the .tar.gz archive to download
            destination_path: the path on disk where to write the archive

        Returns:
            a directory Path where the archive has been extracted to (the
            destination path stripped of its '.tar.gz' suffix).
        """
        res = self.request_get(url=url, params={})
        destination_path.parent.mkdir(parents=True, exist_ok=True)
        destination_path.write_bytes(res.content)

        extract_to = Path(str(destination_path).split(".tar.gz")[0])
        # The context manager closes the archive even if extraction fails.
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; confirm the configured mirrors are trusted.
        with tarfile.open(destination_path) as tar:
            tar.extractall(path=extract_to)

        return extract_to

    def parse_desc_file(
        self,
        path: Path,
        repo: str,
        base_url: str,
        dl_url_fmt: str,
    ) -> Dict[str, Any]:
        """Extract package information from a 'desc' file.

        A 'desc' file is a sequence of ``%KEY%`` header lines, each followed by
        its value. Keys are lowercased and, for multi-line values, only the
        first line is kept. The upstream ``%URL%`` entry is exposed as
        ``project_url`` because ``url`` is overwritten with the package
        download url.

        Args:
            path: A path to a 'desc' file on disk
            repo: The repo the package belongs to
            base_url: Base url used to build the package download url
            dl_url_fmt: Format string of the download url; it is given
                ``base_url``, ``pkgname``, ``filename``, ``arch`` and ``repo``

        Returns:
            A dict of metadata

        Example::

            {'arch': 'x86_64',
             'base': 'dialog',
             'builddate': '1650081535',
             'csize': '203028',
             'desc': 'A tool to display dialog boxes from shell scripts',
             'filename': 'dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst',
             'isize': '483988',
             'license': 'LGPL2.1',
             'md5sum': '06407c0cb11c50d7bf83d600f2e8107c',
             'name': 'dialog',
             'project_url': 'https://invisible-island.net/dialog/',
             'provides': 'libdialog.so=15-64',
             'repo': 'core',
             'sha256sum': 'ef8c8971f591de7db0f455970ef5d81d5aced1ddf139f963f16f6730b1851fa7',
             'url': 'https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst',  # noqa: B950
             'version': '1:1.3_20220414-1'}
        """
        rex = re.compile(r"^\%(?P<k>\w+)\%\n(?P<v>.*)\n$", re.M)
        parsed = rex.findall(path.read_bytes().decode())
        data = {key.lower(): value for key, value in parsed}

        if "url" in data:
            data["project_url"] = data["url"]

        assert data["name"]
        assert data["filename"]
        assert data["arch"]

        data["repo"] = repo
        data["url"] = urljoin(
            base_url,
            dl_url_fmt.format(
                base_url=base_url,
                pkgname=data["name"],
                filename=data["filename"],
                arch=data["arch"],
                repo=repo,
            ),
        )

        assert data["md5sum"]
        assert data["sha256sum"]
        data["checksums"] = {
            "md5sum": hash_to_hex(data["md5sum"]),
            "sha256sum": hash_to_hex(data["sha256sum"]),
        }
        return data

    def get_pages(self) -> Iterator[ArchListerPage]:
        """Yield one page per (flavour, arch, repo) combination.

        Pages are produced in the iteration order of ``self.flavours`` and of
        the ``archs`` and ``repos`` lists of each flavour.

        Each page is a list of packages belonging to a flavour ('official',
        'arm'), and a repo ('core', 'extra', 'community')
        """

        for name, flavour in self.flavours.items():
            for arch in flavour["archs"]:
                for repo in flavour["repos"]:
                    yield self._get_repo_page(name, flavour, arch, repo)

    def _get_repo_page(
        self, name: str, flavour: Dict[str, Any], arch: str, repo: str
    ) -> ArchListerPage:
        """Build the page of packages of one repository.

        Args:
            name: flavour name, either 'official' or 'arm'
            flavour: configuration of that flavour
            arch: processor architecture to list
            repo: repository to list

        Returns:
            A list of dicts with keys ``name``, ``version``, ``last_modified``,
            ``url``, ``versions`` and ``data``, one per 'desc' file found.

        Raises:
            ValueError: when the flavour name is not supported
        """
        filename = f"{repo}.files.tar.gz"
        if name == "official":
            prefix = urljoin(flavour["base_archive_url"], "/repos/last/")
            archive_url = urljoin(prefix, f"{repo}/os/{arch}/{filename}")
            base_url = flavour["base_archive_url"]
            dl_url_fmt = self.ARCH_PACKAGE_DOWNLOAD_URL_PATTERN
            info_url_fmt = self.ARCH_PACKAGE_URL_PATTERN
        elif name == "arm":
            archive_url = urljoin(
                flavour["base_mirror_url"], f"{arch}/{repo}/{filename}"
            )
            base_url = flavour["base_mirror_url"]
            dl_url_fmt = self.ARM_PACKAGE_DOWNLOAD_URL_PATTERN
            info_url_fmt = self.ARM_PACKAGE_URL_PATTERN
        else:
            # Fail with an explicit message instead of an UnboundLocalError
            # further down.
            raise ValueError(
                f"Unsupported flavour {name!r}, expected 'official' or 'arm'"
            )
        destination_path = Path(self.DESTINATION_PATH, arch, filename)
        base_info_url = flavour["base_info_url"]

        archive = self.get_repo_archive(
            url=archive_url, destination_path=destination_path
        )

        assert archive

        packages_desc = list(archive.glob("**/desc"))
        logger.debug(
            "Processing %(instance)s source packages info from "
            "%(flavour)s %(arch)s %(repo)s repository, "
            "(%(qty)s packages).",
            dict(
                instance=self.instance,
                flavour=name,
                arch=arch,
                repo=repo,
                qty=len(packages_desc),
            ),
        )

        page = []
        for package_desc in packages_desc:
            data = self.parse_desc_file(
                path=package_desc,
                repo=repo,
                base_url=base_url,
                dl_url_fmt=dl_url_fmt,
            )

            assert data["builddate"]
            last_modified = datetime.datetime.fromtimestamp(
                float(data["builddate"]), tz=datetime.timezone.utc
            )

            url = info_url_fmt.format(
                base_url=base_info_url,
                pkgname=data["name"],
                filename=data["filename"],
                repo=repo,
                arch=data["arch"],
            )

            assert data["version"]
            if name == "official":
                # find all versions of a package by scraping the archive
                versions = self.scrap_package_versions(
                    name=data["name"], repo=repo, base_url=base_url
                )
            else:
                # 'arm': there is no way to get related versions of a package,
                # but 'data' represents the latest released version,
                # use it in this case
                assert data["csize"]
                assert data["url"]
                versions = [
                    dict(
                        name=data["name"],
                        version=data["version"],
                        repo=repo,
                        arch=data["arch"],
                        filename=data["filename"],
                        url=data["url"],
                        last_modified=last_modified.replace(tzinfo=None).isoformat(
                            timespec="seconds"
                        ),
                        length=int(data["csize"]),
                    )
                ]

            page.append(
                {
                    "name": data["name"],
                    "version": data["version"],
                    "last_modified": last_modified,
                    "url": url,
                    "versions": versions,
                    "data": data,
                }
            )
        return page

    def get_origins_from_page(self, page: ArchListerPage) -> Iterator[ListedOrigin]:
        """Yield a ListedOrigin instance for each package of an arch page.

        The versions of a package are split in two lists passed as
        "extra_loader_arguments": ``artifacts`` (version, filename, url, length)
        and ``arch_metadata`` (version, name, arch, repo, last_modified).
        """
        assert self.lister_obj.id is not None
        for origin in page:
            artifacts = [
                {
                    "version": version["version"],
                    "filename": version["filename"],
                    "url": version["url"],
                    "length": version["length"],
                }
                for version in origin["versions"]
            ]
            arch_metadata = [
                {
                    "version": version["version"],
                    "name": version["name"],
                    "arch": version["arch"],
                    "repo": version["repo"],
                    "last_modified": version["last_modified"],
                }
                for version in origin["versions"]
            ]
            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                visit_type=self.VISIT_TYPE,
                url=origin["url"],
                last_update=origin["last_modified"],
                extra_loader_arguments={
                    "artifacts": artifacts,
                    "arch_metadata": arch_metadata,
                },
            )
def prepare_repository_from_archive(
    archive_path: str,
    filename: Optional[str] = None,
    tmp_path: Union[PosixPath, str] = "/tmp",
) -> str:
    """Uncompress an existing archive and return a matching ``file://`` url.

    The archive is extracted with the ``tar`` command into ``tmp_path``. The
    case where the archive passed along does not exist is not dealt with here.

    Args:
        archive_path: path to the archive to uncompress
        filename: last component of the returned url; defaults to the base
            name of ``archive_path``
        tmp_path: directory to extract the archive into

    Returns:
        A file repo url (``file://{tmp_path}/{filename}``) which can be used as
        origin url.
    """
    # The command line and the url both need a plain string
    tmp_path = tmp_path if isinstance(tmp_path, str) else str(tmp_path)
    # uncompress folder/repositories/dump for the loader to ingest
    subprocess.check_output(["tar", "xf", archive_path, "-C", tmp_path])
    # build the origin url (or some derivative form)
    _fname = filename or os.path.basename(archive_path)
    return f"file://{tmp_path}/{_fname}"
+iQIzBAABCgAdFiEE4kC1fixGMLp2ji8m/BtUfI2BcsgFAmJPIMEACgkQ/BtUfI2BcsjDTw//Zzu/G+1B2qKIwqy7s/3WieNflLj8PdroF2V+5/W9O70zY4P3edkzjJVCjp9j8esIwfacDfgJvqpdQE5oBJKrwtp3FHEKSRXYUwkOWeGcxO9F8scRclqPYIybfeD3zp0hL2iXE3x4NOg46znlYXqr19Nnovb0Pf0XQ3x8B8qwk997aUvmJz40iQ31EOuQ/PaxboOdiGPkAfflBYdcoDS2XprT5Po9bNoHen5qdN55eF3mipOVZMiynZoHVwgWT/lVwEuAUMxPMLW/QAHn7UEyWIii+ysUZCECf7sVUHOtdro4Y3bUl85JlyFx113dvJDy7QVX4qh89YFLHb0E3ml64wa+I5/q8Y2l7FRPr07n6yhb+MDQcA9hteDOzYzhT7gThrtJAEVJSHxxlGoC/GHgPTWwc7RD80OUcAiGJBUjxYUOKy/CBJ7H4zaRCa28CWrh7IEqDUu6hrCZQHAzmAYbF8X8BnIAAg7jkH3tlH8zwtw3eFxATWgSWfPFsl7jPdGwSUjmff1YvOCjh8r4YFuqqejZQSsUWnO9vE37DCVod7qSBPLzJOfCyPpSoouDC3p+vxhJ5Da5vkqUJk017QYDcGxMMyS1joAPBzkkesca7Ej+eHovfEA5mMLmRHR7lULPBnMjz9IW2i0MvRPt4m8wlFucUAUsrMiTn0WM+V2k2qI= + +%URL% +https://www.gnu.org/software/gzip/ + +%LICENSE% +GPL3 + +%ARCH% +x86_64 + +%BUILDDATE% +1649352820 + +%PACKAGER% +Levente Polyak + +%DEPENDS% +glibc +bash +less +""" > core.files/gzip-1.12-1/desc + +echo -e """%FILENAME% +dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst + +%NAME% +dialog + +%BASE% +dialog + +%VERSION% +1:1.3_20220414-1 + +%DESC% +A tool to display dialog boxes from shell scripts + +%CSIZE% +203028 + +%ISIZE% +483988 + +%MD5SUM% +06407c0cb11c50d7bf83d600f2e8107c + +%SHA256SUM% +ef8c8971f591de7db0f455970ef5d81d5aced1ddf139f963f16f6730b1851fa7 + +%PGPSIG% +iQEzBAABCAAdFiEEhs/8qRjPOvRxR1iAUeixSKmZnDQFAmJaPyAACgkQUeixSKmZnDQvZAf/X3qO7Wg6f+tnQ4qciRcRpegsExNRfKo6S1XhA9C0BC0LJDcTRHG1C7+NVB6dSSL5SdOSVTEACjDc2APppNuPDOxFtzl9doYMHqYTSud2yEUPpE8K+3mhcyHpeOxJC6ZIsQWOjug9FTBwUCUo6s5nHjkmRpsK0zYgK9ezmOSZXlS3QSNRaGbKzn1JM8BOUv5Y29f8nTCXNn1m6RW0yAlyz4rdHDVWfaBFvYL7IC/6uwA+92LB3egyEzYx6uuRvvlBR75Rh+IATBxfwLo1bNNEmFWA/W6vooICjF2E23zk4yaKw08f+V2fjRDn9Hs/i2B4bNNNWeOf5q7j7y5EnBbmeQ== + +%URL% +https://invisible-island.net/dialog/ + +%LICENSE% +LGPL2.1 + +%ARCH% +x86_64 + +%BUILDDATE% +1650081535 + +%PACKAGER% +Evangelos Foutras + +%PROVIDES% +libdialog.so=15-64 + +%DEPENDS% +sh +ncurses +""" > 
core.files/dialog-1:1.3_20220414-1/desc + +echo -e """%FILENAME% +mercurial-6.1.2-1-x86_64.pkg.tar.zst + +%NAME% +mercurial + +%BASE% +mercurial + +%VERSION% +6.1.2-1 + +%DESC% +A scalable distributed SCM tool + +%CSIZE% +5034047 + +%ISIZE% +26912816 + +%MD5SUM% +037ff48bf6127e9d37ad7da7026a6dc0 + +%SHA256SUM% +be33e7bf800d1e84714cd40029d103873e65f5a72dea19d6ad935f3439512cf8 + +%PGPSIG% +iQEzBAABCAAdFiEEFRnVq6Zb9vwrc8dWek52CV2KUuQFAmJ2UgAACgkQek52CV2KUuRecggAo3nP9o1hmew82njxj8i0Nab8Ih2wXfutxDSNjOr5UFH5ei8wD60EU2iyZK0YhXI+cozoRlDI6lIjcvWiDH3s9m09xoCX/HAnPfaWkCo9h8DEQX/qxHKc8o87UPVebkLNqKGSu/xXd+n3A5gVl1pI3+7HpaXwOFuTtSFpb+hQ46kW2of+q1NaMpAsLX68uQ0rfaurvIkLZFZDK4zBnRxHXrPMlnj6KbCy/U3/H/ySQTSdfa3YiFe5KzL5dcbPlryCGC4N+xhEn/PYc7OL5I/1iEY9F4sggZQOUh4wXUkv6hc0Xp6Htp7kMKuJUoPJt8kaeUZnIWkB1CCBP5IxnET7Ag== + +%URL% +https://www.mercurial-scm.org/ + +%LICENSE% +GPL + +%ARCH% +x86_64 + +%BUILDDATE% +1651921313 + +%PACKAGER% +Antonio Rojas + +%DEPENDS% +python + +%OPTDEPENDS% +tk: for the hgk GUI + +%MAKEDEPENDS% +python-docutils +""" > extra.files/mercurial-6.1.2-1/desc + +echo -e """%FILENAME% +libasyncns-0.8+3+g68cd5af-3-x86_64.pkg.tar.zst + +%NAME% +libasyncns + +%BASE% +libasyncns + +%VERSION% +0.8+3+g68cd5af-3 + +%DESC% +A C library for executing name service queries asynchronously + +%CSIZE% +17036 + +%ISIZE% +48763 + +%MD5SUM% +0aad62f00eab3d0ec7798cb5b4a6eddd + +%SHA256SUM% +a0262e191dd3b00343e79e3521159c963e26b7a438d4cc44137c64cf0da90516 + +%PGPSIG% 
+iQIzBAABCAAdFiEEtZcfLFwQqaCMYAMPeGxj8zDXy5IFAl7DmIIACgkQeGxj8zDXy5IE8w/7BRDCV4cdGG/2DK0ezqicrMTmpRjiN0Zh14s39V/wHt4VmU786y0fjR/2UfvxncnDqBTmiPbe6Ilv5vQ/4aHbRObqeVFD76iMKAPmBWLOvI8rGLlZjK9zLOKTHwKb7EBk4D4HrF/zd/c1Mz5rGkF/RAgchDT2G4NTozG3SUA1cL3TWgvPn4CIEeq2XTO01GCqXMiavdSuyAEIzKjc9zNPQ/2N1qQ2zPuzjbkEEk4Tk2ewKUQbKcVYpp+iwxm5sUFqd/mpnT4scve7bsHm0VduZbz5mqz2fg57/RU8qQ6GjLZjEHQGE2B3YUWzZlxN2x4+skXN7CRNmfAVyoe7C6hsED8cyKx+J8V+hk67xeIfEm0KCOhegpL/TM/O7xR9w5y3WFmN0VS96l5do9hZkkzNn7o64hvjtEypo/sCF/46KqHwJNezukbWENIWJcHYu8EqLaZsTFD+vQ8iXk7xy2ocQQTIfFlThNwPG+oGK8holQjOhdn8C+m8mG5QFQmUqhcPch4qRsUq1uY3CFooIX4pRghkIyrFwmwhxiao2HLegKS9v5RqMCGxJ3jPXT0tB7s56fpC3D2moCJtaN+GUsU3YW/a0gHgIhzCC7HJYZ+T+MkG5LW3Lb5swMXa4Qe5GcSzc1w+cpMurQKANGNk382TO5CRmo3e+dO4CLaXmUGOjGw= + +%URL% +http://0pointer.de/lennart/projects/libasyncns + +%LICENSE% +LGPL + +%ARCH% +x86_64 + +%BUILDDATE% +1589876807 + +%PACKAGER% +Felix Yan + +%DEPENDS% +glibc + +%MAKEDEPENDS% +git +lynx +""" > extra.files/libasyncns-0.8+3+g68cd5af-3/desc + +echo -e """%FILENAME% +python-hglib-2.6.2-4-any.pkg.tar.zst + +%NAME% +python-hglib + +%BASE% +python-hglib + +%VERSION% +2.6.2-4 + +%DESC% +A library with a fast, convenient interface to Mercurial. It uses Mercurial's command server for communication with hg. 
+ +%CSIZE% +44083 + +%ISIZE% +242821 + +%MD5SUM% +ecc6598834dc216efd938466a2425eae + +%SHA256SUM% +fd273811023e8c58090d65118d27f5c10ad10ea5d1fbdbcf88c730327cea0952 + +%PGPSIG% +iQEzBAABCAAdFiEEhs/8qRjPOvRxR1iAUeixSKmZnDQFAmGpaF0ACgkQUeixSKmZnDSHMwf/bCyNUXK2BoZfdNe0hTZJ54M9FgMZC81QPINAugjxpwOYd5zK43PB/n1t5rNpC2jy8G8J5Yuq8eJr5aFV9GB/yeDDlf3gqtOHQteYZjl+oGcfqtVF4i6/e4rXd1mvRH7fFxI18rLThL3Pei+cblh6iZ0NVVqbrd2opURuUvAPwYLN+/YNurFNdS5E1K+TDpMaunA9flatLFV6Cqn3kkyWh0aMT4hN0bv2kvS0AnD3iKh7YTeaHvx1y4o33zcVRDjepcV4ywE6wozteM+Xcelu3XUlZC6luNX05XsQ7x3fKJTFmXrz3y7vYwhq427nuyVEE/yujZLOhBIqLl2VGRUBfQ== + +%URL% +https://pypi.python.org/pypi/python-hglib + +%LICENSE% +MIT + +%ARCH% +any + +%BUILDDATE% +1638492205 + +%PACKAGER% +Evangelos Foutras + +%DEPENDS% +python +mercurial + +%CHECKDEPENDS% +python-nose +""" > community.files/python-hglib-2.6.2-4/desc + +echo -e """%FILENAME% +gnome-code-assistance-2:3.16.1+14+gaad6437-2-x86_64.pkg.tar.zst + +%NAME% +gnome-code-assistance + +%BASE% +gnome-code-assistance + +%VERSION% +2:3.16.1+14+gaad6437-2 + +%DESC% +Code assistance services for GNOME + +%GROUPS% +gnome-extra + +%CSIZE% +1854253 + +%ISIZE% +6795615 + +%MD5SUM% +eadcf1a6bb70a3e564f260b7fc58135a + +%SHA256SUM% +6fd0c80b63d205a1edf5c39c7a62d16499e802566f2451c2b85cd28c9bc30ec7 + +%PGPSIG% 
+iQIzBAABCAAdFiEEtZcfLFwQqaCMYAMPeGxj8zDXy5IFAmGpWGMACgkQeGxj8zDXy5L5FA/8CXB1h17mEitVHfHvtUbQy/5eZ+REzHQzmtk8SJ5oMk9ojxTuQh95M4gEQrp55g/BWxuXSbnCXu8N0SRpaKgX67kqJn3vnoHGnjobr80L7TXqSEtXj15/153VuoFg5atmbsOgIdgkCzhAJJKxIt0nGfPlegLxHIZ7Ig06dzI9dc2W+cKotnWW6QuRn1CYD28ZKvBhMhBmjcDu6Rj1muz5NvO80HABP7+AVRsqd2eGJdoX/BmBBqjEGnPtXE1wY/uCuG+XWLy2MeV5ps4f8LYubNOa0KIutyEe6IX+29aQGhawI2G4d04azoTBZpy6xtocJzyW+P+vTxcv/4jhj5E6v7izJy34LTShnHd5J/UWiXl50HWKjJbVPN4o0rWX3EptDHX0gyj+1lvS5Za12Lyy8oGeID10T7N8mcEVREM8XylKz7O7wSaKKbVOXQVAWZ/mQwk7GuWOgGH/nPtVgyNNdHSh+3urPzhuvMSoytJmRo4FbOyRju1Zb3RbbIDWA04Dh7DLH1CvxZ53JkNt0wHZFVt792hmZ4o/wFMVXoNrUnHuI9G1sT8TcYjSmiXlZ7l5cyLo6AsvsFUY2ZuBNXz+3M3CzGyGoAV0Hi2SYl5FaZHuKoFh+P5Xk5ngm42kyoAiQKfrled3ff5fWXqU0jbGDUte+QuLcsKYKZ20YhjP2nc= + +%URL% +https://wiki.gnome.org/Projects/CodeAssistance + +%LICENSE% +GPL3 + +%ARCH% +x86_64 + +%BUILDDATE% +1638488044 + +%PACKAGER% +Felix Yan + +%DEPENDS% +libgee +python-dbus +python-gobject +python-pylint +python-pyflakes +python-pycodestyle +python-lxml +python-simplejson +ruby-dbus +ruby-sass + +%OPTDEPENDS% +clang: Assistance for C and C++ +gjs: Assistance for JavaScript +go: Assistance for Go + +%MAKEDEPENDS% +intltool +gobject-introspection +llvm +clang +gjs +go +gnome-common +git +""" > community.files/gnome-code-assistance-3:3.16.1+r14+gaad6437-1/desc + +# Tar archives +tar -czf ../../https_archive.archlinux.org/repos_last_core_os_x86_64_core.files.tar.gz core.files/* +tar -czf ../../https_archive.archlinux.org/repos_last_extra_os_x86_64_extra.files.tar.gz extra.files/* +tar -czf ../../https_archive.archlinux.org/repos_last_community_os_x86_64_community.files.tar.gz community.files/* + + +# Fixtures for archlinuxarm.org + +mkdir -p arm/aarch64/core.files/gzip-1.12-1 +mkdir -p arm/armv7h/core.files/gzip-1.12-1 + +mkdir -p arm/aarch64/extra.files/mercurial-6.1.2-1 +mkdir -p arm/armv7h/extra.files/mercurial-6.1.2-1 + +mkdir -p arm/aarch64/community.files/python-hglib-2.6.2-4 +mkdir -p 
arm/armv7h/community.files/python-hglib-2.6.2-4 + +echo -e """%FILENAME% +gzip-1.12-1-aarch64.pkg.tar.xz + +%NAME% +gzip + +%BASE% +gzip + +%VERSION% +1.12-1 + +%DESC% +GNU compression utility + +%GROUPS% +base-devel + +%CSIZE% +79640 + +%ISIZE% +162688 + +%MD5SUM% +97d1e76302213f0499f45aa4a4d329cc + +%SHA256SUM% +9065fdaf21dfcac231b0e5977599b37596a0d964f48ec0a6bff628084d636d4c + +%PGPSIG% +iQIzBAABCAAdFiEEaLNTfzmjE7PldNBndxk/FSvb5qYFAmJPUuQACgkQdxk/FSvb5qZTvhAAxa3rIyWh/hXRePyAkPKl14YhopF4FDoyoCA9DJBz8bJ0qCe7IE/lCFgIH3CFPOVQxDttxo2q6KHt/Di2P5TYMyXrkoDdB9dwuku0DPIsYzhAp1PVFUTUe599c8rNVGTn/k62WvcK7jxD0p8niHjveVRjwmJ+uZf3a9AGoedNsQN94I/dnWu2ggFUBXF6c77ak78ED7k2xTlBv2fSK9Jkzkcjxtc4kZKjxzF4NVTnNVJkz6UgFUyESausSfE/ub247pdmk0zTHTPodPKtwuECA8ZwsRrETf5if0WjX81E9ox7AtZ7mcBNuZKdeuBaU4WW3sqH60G3t8c3ZmpxzYWJCOdsiUwYkAu6bw7yvEREdm2J6ZZx4CE59b+1hepO8/BKg+Gxe9jNrSaEqug6SaueXO68Gk3uPqVRbgcXNg6TyHEKPcEhBQS8mzpvTomUOt6A9XCiwVVbuyCfltYKxLwQRh1BeWv2Y3KCjkT4oudxWVXW3FxYjFriw+g6RA41MZ9f+jVMr2cE+QidqN/GmuR8RrFQZhJZ9iZx7S4DrPyxLToPPnTIzRuBJKlDXBgEfIB8cbSSDpZU5SfILZYYHfr9j5hYED765t3959pqBqJ9wPJ0qmydGWInM1M2o1lRyEisWtwTHaAv9LSMklSaMCRPIydqnsdjkHPLAUYlE8yTuRE= + +%URL% +https://www.gnu.org/software/gzip/ + +%LICENSE% +GPL3 + +%ARCH% +aarch64 + +%BUILDDATE% +1649365694 + +%PACKAGER% +Arch Linux ARM Build System """ > arm/aarch64/core.files/gzip-1.12-1/desc + +echo -e """%FILENAME% +gzip-1.12-1-armv7h.pkg.tar.xz + +%NAME% +gzip + +%BASE% +gzip + +%VERSION% +1.12-1 + +%DESC% +GNU compression utility + +%GROUPS% +base-devel + +%CSIZE% +78468 + +%ISIZE% +153864 + +%MD5SUM% +490c9e28db91740f1adcea64cb6ec1aa + +%SHA256SUM% +4ffc8bbede3bbdd9dd6ad6f85bb689b3f4b985655e56285691db2a1346eaf0e7 + +%PGPSIG% 
+iQIzBAABCAAdFiEEaLNTfzmjE7PldNBndxk/FSvb5qYFAmJPUxgACgkQdxk/FSvb5qZZkBAAwllACKZT9wnFxCcPvZGl/fkHzMs0nyWEsP+JbMaQQaKnSmh8DfklBi+V2rBCRAJiDwBhLjxSS+maW3uxbfaMgNGTl3lSlwvfIz9pUl+OxwS4WB3uMZLNvebVuqO9FQIAB+MdT8ZnWRFRlnj1WPGuDndkZDLlmOqNLWNOgkNS2FAXC0s1nKVGOM8Wd2llYlQkqCglVgOcj4PCmkSBX/BtFJ5gUeelATJiaKQSxN8xFaFbYStlzUe6HhE5Ou2wLHE+XYCEFIgvkoTgZ3eZQbQrV7z/hFW1iv+h9RBbEUcFAZGPbemC3C/PDRMJQySucNEsxCn3huI2KYx0RJunKVJ83QSGJr6xYzSZvCckC9LjHL8DnOOgn+bKJGNc+hBA5EH5/otc17Sr1H+mhx54duc5rH/kUxNg8RwsUEMCgeIw3YQnxeN8GVDbHfsshzk2S+dzOsOZwH+Y0BOknXfQYdssKLKHdktfS2G6t3izZqaflOFLXc5429KAAHldJ+NpYpsKPhCMWYEtdD9Cb21FrdePrlA20BTK02v897gw6qu01vDn7S9fKyQDOjTwO9UZB/S3w99srxwZ3MD6EQH4eLyvD7FSNPYlwiB+WNh+J3+9acHHE9iZ4OCyuutBYf9Pjvwiu9dY1PurqNl3Wd++B/MBYoAX2G6hJr8y8bOF1WBvIhQ= + +%URL% +https://www.gnu.org/software/gzip/ + +%LICENSE% +GPL3 + +%ARCH% +armv7h + +%BUILDDATE% +1649365715 + +%PACKAGER% +Arch Linux ARM Build System """ > arm/armv7h/core.files/gzip-1.12-1/desc + +echo -e """%FILENAME% +mercurial-6.1.3-1-aarch64.pkg.tar.xz + +%NAME% +mercurial + +%BASE% +mercurial + +%VERSION% +6.1.3-1 + +%DESC% +A scalable distributed SCM tool + +%CSIZE% +4931228 + +%ISIZE% +26959193 + +%MD5SUM% +0464390744f42faba80c323ee7c72406 + +%SHA256SUM% +635edb47117e7bda0b821d86e61906c802bd880d4a30a64185d9feec1bd25db6 + +%PGPSIG% 
+iQIzBAABCAAdFiEEaLNTfzmjE7PldNBndxk/FSvb5qYFAmKZNusACgkQdxk/FSvb5qaIkg/9FuZemlogqBd7AJA2hi9o/jtcX2nj6m12w76PeZXXgZ9//lV+BVb/fjOThz+ndfmGU34vuyfIDrbolajjWcSUtfwlhIohETbrwHfFTNp2GzA4TmKxl1Mw40ibHP+NptgB0i5z+FRUt5RJyfUBokQqSLzcUr5g1XhSmpEBCDC2tR2nZiq4miW4tJIRwM3HBvAJAtdfRGxGi5rs+Qd2hblTRGITfUA1QJxgq6WJjTbuRPb+BN0ohXHMk9GXVQXh0Df8u9WjleQiPT310W/gXNCd9THfYQr2iC1rbd12/oQsgvEelZuN9ZEtzUmFW5KyCjot4uSxj6jV0fa+nxA4Iyqmma2JzUvF9daObxPWbpD3d1Y+i68J/60ekAnN/7cI+YMBjGtCNzJkOW09Hk+gqHe6/ePwejvkvxqENXwLTMBp57Jjg/+RDJ2gvlNfGPskknLqxc6gz59J9fK/ytC9IwTIF54EDPbtAfLcukxG0HKeGZ54bHsE5397UrdqB8auSVsqkZzlauhJs9QaLnbtBBYaFYRgnmRBj3TMYbJRP+1qgxEOQHUOFLwnDGyfOInozxY0pip3GBbICoxFxss39YzeeR4PVqobLWJQq/uNsJhBmVG6dbYGoHsgWHhl2uTalu//mQhMZgcRBRgZ6FqjniSdme/feXMvacHY7n/OrPWiNPY= + +%URL% +https://www.mercurial-scm.org/ + +%LICENSE% +GPL + +%ARCH% +aarch64 + +%BUILDDATE% +1654208118 + +%PACKAGER% +Arch Linux ARM Build System """ > arm/aarch64/extra.files/mercurial-6.1.2-1/desc + +echo -e """%FILENAME% +mercurial-6.1.3-1-armv7h.pkg.tar.xz + +%NAME% +mercurial + +%BASE% +mercurial + +%VERSION% +6.1.3-1 + +%DESC% +A scalable distributed SCM tool + +%CSIZE% +4897816 + +%ISIZE% +26853841 + +%MD5SUM% +453effa55e32be3ef9de5a58f322b9c4 + +%SHA256SUM% +c1321de5890a6f53d41c1a5e339733be145221828703f13bccf3e7fc22612396 + +%PGPSIG% 
+iQIzBAABCAAdFiEEaLNTfzmjE7PldNBndxk/FSvb5qYFAmKZNpkACgkQdxk/FSvb5qabaw//Z/NRzDzAlQdEYE3sBB6eJSum9HQrUQDHX7c0fl+wyc0sc+thzfUVueQzFi9EkoTZb9zyuTYGt5KPdQ4cAfEj0ikwxDrS1RFzGyre30OgyQfbGMGnC1BQG4TOLWwS+mFn/tMoeriuMtgHoljsbjn+bSI2JONW6U/kf0s726/HDvmKFLyhHsF6ZGlOQC+ASBR84CY496Yc1SJTnQmGaWzDmF2zfK7OxMkVvVJw7Zi0OgF1L+WEIHHgS0T+bYk6rLX3xxgwQ37XczN9+SSFTM77bF1LfJIlLbLspaE6m8EJnpsTnX8nCvGWfbdPhDqGLdVw6hnNMLPFIXxXuY3KgfwGUX1UKxfvHpbjR0uYvW32Xs85lqsHZShtmaWYTJMDjiLht/6d8uAQLPAOjdDneyaCf0XEMHor8yAd9zcVmSgd/s+TJQYtWK9fsl+QVk8WS484iSSRPZFtVzJpqg1TYulaWha6DZCCidVkryStHnoGi+3vti/9FtUs7jn086PzDfugj9DoV7ixJ6edxIgp7r3TYgzzVTHuyhXBOaE0dp+IX3ekcMF7C37qrfS9uVIVVtMYvnQRICULYlB0LLHvrK1+m4z4ETpqNrjNevcUChns24rnJmmdkOEv/pzmAR7oYmX8rFda8wgiYfciBQzi71XcmP/SyIQud3UJUbvZjiTBRGk= + +%URL% +https://www.mercurial-scm.org/ + +%LICENSE% +GPL + +%ARCH% +armv7h + +%BUILDDATE% +1654207988 + +%PACKAGER% +Arch Linux ARM Build System """ > arm/armv7h/extra.files/mercurial-6.1.2-1/desc + +echo -e """%FILENAME% +python-hglib-2.6.2-4-any.pkg.tar.xz + +%NAME% +python-hglib + +%BASE% +python-hglib + +%VERSION% +2.6.2-4 + +%DESC% +A library with a fast, convenient interface to Mercurial. It uses Mercurial's command server for communication with hg. 
+ +%CSIZE% +41432 + +%ISIZE% +242769 + +%MD5SUM% +0f763d5e85c4ffe728153f2836838674 + +%SHA256SUM% +7a873e20d1822403c8ecf0c790de02439368000e9b1b74881788a9faea8c81b6 + +%PGPSIG% +iQIzBAABCAAdFiEEaLNTfzmjE7PldNBndxk/FSvb5qYFAmG4xMQACgkQdxk/FSvb5qboEQ/+PMN4p7cUqEuArNug8UW0h8sG8vXJXyjQo3HxdhIswuNItuBiCaTzFRH+M5Dnoh+Jy+9wLvbzqLnPXkOTgFTBakjyZ8Bxkt1lTYUOUmqCaR3s1nqajOqIRKAAjUuh1oIiM8Hyyfsgrd244jPtRFlL3y6RPgjfd8M9euV9WCxIRVR0ztnvLURlE+yyGVjv6g4rfcwcIPEjV3XUKRd8kLyWkBwDMUgM8rbeVLZjKxdAa1N3XTAikgUi7IJDafpC83IfTzWhBQFaIJ0yQG2FE5FSbY6GlpcpAIktXwxCTBEXYVRtl+tQwDVoLqgExVBMvCza9Nsstav1WwgKnqMIc5HwfNhSPjKoLPKERYhGVpKwY0doal0rfr1gFn4ZOE/WBwCrAFscB9MWhZ/WFXQiWrXfl72YCh6fCdZN5S5xdculOebehgmXP409AE9N0VVM0iCOIG1P9YTv9OWr5VZnUpezKMVXztUHmlAXTkN1dCgPiA34OJ4ExlceNMqbb/ltie0dkRiFRnbat0wIt1KXmhcei4qw0IaFbQo/dvFvigUJ21BUTqzC6ktFICdmL8dJeRjfC7ysu1u/uU+Rq60J/vI6DK1R06oJ5wqVumMdGY4NliZDTooV3+s6M3m/hDkx9IKVn6h+bSTkqiEkgDf/AD3xrKWtO15c1xpA6YeFTkEWL/w= + +%URL% +https://pypi.python.org/pypi/python-hglib + +%LICENSE% +MIT + +%ARCH% +any + +%BUILDDATE% +1639498940 + +%PACKAGER% +Arch Linux ARM Build System """ > arm/aarch64/community.files/python-hglib-2.6.2-4/desc + +echo -e """%FILENAME% +python-hglib-2.6.2-4-any.pkg.tar.xz + +%NAME% +python-hglib + +%BASE% +python-hglib + +%VERSION% +2.6.2-4 + +%DESC% +A library with a fast, convenient interface to Mercurial. It uses Mercurial's command server for communication with hg. 
+ +%CSIZE% +41408 + +%ISIZE% +242769 + +%MD5SUM% +198ef7d6dd40d778a0bf585cce30b1c8 + +%SHA256SUM% +4b0f51e57f22ddc0dbe308244fc1db85b9e9f721396dbcfbcab38bcb4fe16e10 + +%PGPSIG% +iQIzBAABCAAdFiEEaLNTfzmjE7PldNBndxk/FSvb5qYFAmG5GNYACgkQdxk/FSvb5qZa6g/7BcAPCxD5zwY2cBVe5XuwzsTU4cDGoPJ8zVmj9NEpKnoheF29Lfs+dibguAzfox110DhJebVVmS6HeFpQ6QUQtNaO4cser5XBGgF5PTVa4y+gKXiHCOuzmp+iEKmt6u5Gp1lDoMRJb+EvkRRMO51DXCODMvZbj32fyfiVNZsZR5nffQbQ0AWiJ+xeYZVD/7i+mj+wDLtG3+r9KoFFV6C+ZU5g3NDKoHLgLVQLStfiQSDVtIjemPp+CwWQ6AUpC7vzxiPg5X5JdWj5hw/9AgVJHMWqa2q2YgDOPQzLBGgFCRRED96IYoc1ID7ZzI4tXZIQ8L9N4NkIVMyNdZzc1G9XiwOpg360nKNqbfp3igN12Lg8wqXeYdYVdh1xoo3mVIiJ0oo7fygQpRk5RU/UHcahaxcCQgeMvivaW3Xjb3BM4iKcKJb8GcSPdndTKzJlKOAUk+lD5rGICO5tLSKzkoQzB+ULDitEBU9E2VJ1KAczd7d6xZ3IqjO9GUhHIVRvlK31hQcBUA9g0bwZciZlv8M1ZqSeoeZ8SXvk57a92tGvqbfrjMqK9j7DXsi5w6CIGunV1ceHoVIxzzCboBYZU0cLkUpL9jzU0YXPN7Y2F2Lkn0/uVa0xd9HwSqVTyR88k4aqoL618hcbVoHh9EziU/Oc+ME4YB1VYH7kj66Ob/9Y9gI= + +%URL% +https://pypi.python.org/pypi/python-hglib + +%LICENSE% +MIT + +%ARCH% +any + +%BUILDDATE% +1639520456 + +%PACKAGER% +Arch Linux ARM Build System +""" > arm/armv7h/community.files/python-hglib-2.6.2-4/desc + +# Tar arm indexes to convenient path and filename +tar -czf ../../https_uk.mirror.archlinuxarm.org/aarch64_core_core.files.tar.gz arm/aarch64/core.files/* +tar -czf ../../https_uk.mirror.archlinuxarm.org/aarch64_extra_extra.files.tar.gz arm/aarch64/extra.files/* +tar -czf ../../https_uk.mirror.archlinuxarm.org/aarch64_community_community.files.tar.gz arm/aarch64/community.files/* + +tar -czf ../../https_uk.mirror.archlinuxarm.org/armv7h_core_core.files.tar.gz arm/armv7h/core.files/* +tar -czf ../../https_uk.mirror.archlinuxarm.org/armv7h_extra_extra.files.tar.gz arm/armv7h/extra.files/* +tar -czf ../../https_uk.mirror.archlinuxarm.org/armv7h_community_community.files.tar.gz arm/armv7h/community.files/* + +# archive.archlinux.org directory listing html responses (to get packages related versions listing) + +cd ../../ + +echo """ +Index 
of /packages/g/gzip/ + +

Index of /packages/g/gzip/


../
+gzip-1.10-1-x86_64.pkg.tar.xz                      30-Dec-2018 18:38     78K
+gzip-1.10-1-x86_64.pkg.tar.xz.sig                  30-Dec-2018 18:38     558
+gzip-1.10-2-x86_64.pkg.tar.xz                      06-Oct-2019 16:02     78K
+gzip-1.10-2-x86_64.pkg.tar.xz.sig                  06-Oct-2019 16:02     558
+gzip-1.10-3-x86_64.pkg.tar.xz                      13-Nov-2019 15:55     78K
+gzip-1.10-3-x86_64.pkg.tar.xz.sig                  13-Nov-2019 15:55     566
+gzip-1.11-1-x86_64.pkg.tar.zst                     04-Sep-2021 02:02     82K
+gzip-1.11-1-x86_64.pkg.tar.zst.sig                 04-Sep-2021 02:02     558
+gzip-1.12-1-x86_64.pkg.tar.zst                     07-Apr-2022 17:35     80K
+gzip-1.12-1-x86_64.pkg.tar.zst.sig                 07-Apr-2022 17:35     566
+

+""" > https_archive.archlinux.org/packages_g_gzip + +echo -e """ +Index of /packages/d/dialog/ + +

Index of /packages/d/dialog/


../
+dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz          13-Feb-2019 08:36    180K
+dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz.sig      13-Feb-2019 08:36     310
+dialog-1:1.3_20190724-1-x86_64.pkg.tar.xz          26-Jul-2019 21:39    180K
+dialog-1:1.3_20190724-1-x86_64.pkg.tar.xz.sig      26-Jul-2019 21:43     310
+dialog-1:1.3_20190728-1-x86_64.pkg.tar.xz          29-Jul-2019 12:10    180K
+dialog-1:1.3_20190728-1-x86_64.pkg.tar.xz.sig      29-Jul-2019 12:10     310
+dialog-1:1.3_20190806-1-x86_64.pkg.tar.xz          07-Aug-2019 04:19    182K
+dialog-1:1.3_20190806-1-x86_64.pkg.tar.xz.sig      07-Aug-2019 04:19     310
+dialog-1:1.3_20190808-1-x86_64.pkg.tar.xz          09-Aug-2019 22:49    182K
+dialog-1:1.3_20190808-1-x86_64.pkg.tar.xz.sig      09-Aug-2019 22:50     310
+dialog-1:1.3_20191110-1-x86_64.pkg.tar.xz          11-Nov-2019 11:15    183K
+dialog-1:1.3_20191110-1-x86_64.pkg.tar.xz.sig      11-Nov-2019 11:17     310
+dialog-1:1.3_20191110-2-x86_64.pkg.tar.xz          13-Nov-2019 17:40    183K
+dialog-1:1.3_20191110-2-x86_64.pkg.tar.xz.sig      13-Nov-2019 17:41     310
+dialog-1:1.3_20191209-1-x86_64.pkg.tar.xz          10-Dec-2019 09:56    183K
+dialog-1:1.3_20191209-1-x86_64.pkg.tar.xz.sig      10-Dec-2019 09:57     310
+dialog-1:1.3_20191210-1-x86_64.pkg.tar.xz          12-Dec-2019 15:55    184K
+dialog-1:1.3_20191210-1-x86_64.pkg.tar.xz.sig      12-Dec-2019 15:56     310
+dialog-1:1.3_20200228-1-x86_64.pkg.tar.zst         06-Mar-2020 02:21    196K
+dialog-1:1.3_20200228-1-x86_64.pkg.tar.zst.sig     06-Mar-2020 02:22     310
+dialog-1:1.3_20200327-1-x86_64.pkg.tar.zst         29-Mar-2020 17:08    196K
+dialog-1:1.3_20200327-1-x86_64.pkg.tar.zst.sig     29-Mar-2020 17:09     310
+dialog-1:1.3_20201126-1-x86_64.pkg.tar.zst         27-Nov-2020 12:19    199K
+dialog-1:1.3_20201126-1-x86_64.pkg.tar.zst.sig     27-Nov-2020 12:20     310
+dialog-1:1.3_20210117-1-x86_64.pkg.tar.zst         18-Jan-2021 18:05    200K
+dialog-1:1.3_20210117-1-x86_64.pkg.tar.zst.sig     18-Jan-2021 18:05     310
+dialog-1:1.3_20210306-1-x86_64.pkg.tar.zst         07-Mar-2021 11:40    201K
+dialog-1:1.3_20210306-1-x86_64.pkg.tar.zst.sig     07-Mar-2021 11:41     310
+dialog-1:1.3_20210319-1-x86_64.pkg.tar.zst         20-Mar-2021 00:12    201K
+dialog-1:1.3_20210319-1-x86_64.pkg.tar.zst.sig     20-Mar-2021 00:13     310
+dialog-1:1.3_20210324-1-x86_64.pkg.tar.zst         26-Mar-2021 17:53    201K
+dialog-1:1.3_20210324-1-x86_64.pkg.tar.zst.sig     26-Mar-2021 17:53     310
+dialog-1:1.3_20210509-1-x86_64.pkg.tar.zst         16-May-2021 02:04    198K
+dialog-1:1.3_20210509-1-x86_64.pkg.tar.zst.sig     16-May-2021 02:04     310
+dialog-1:1.3_20210530-1-x86_64.pkg.tar.zst         31-May-2021 14:59    198K
+dialog-1:1.3_20210530-1-x86_64.pkg.tar.zst.sig     31-May-2021 15:00     310
+dialog-1:1.3_20210621-1-x86_64.pkg.tar.zst         23-Jun-2021 02:59    199K
+dialog-1:1.3_20210621-1-x86_64.pkg.tar.zst.sig     23-Jun-2021 03:00     310
+dialog-1:1.3_20211107-1-x86_64.pkg.tar.zst         09-Nov-2021 14:06    197K
+dialog-1:1.3_20211107-1-x86_64.pkg.tar.zst.sig     09-Nov-2021 14:13     310
+dialog-1:1.3_20211214-1-x86_64.pkg.tar.zst         14-Dec-2021 09:26    197K
+dialog-1:1.3_20211214-1-x86_64.pkg.tar.zst.sig     14-Dec-2021 09:27     310
+dialog-1:1.3_20220117-1-x86_64.pkg.tar.zst         19-Jan-2022 09:56    199K
+dialog-1:1.3_20220117-1-x86_64.pkg.tar.zst.sig     19-Jan-2022 09:56     310
+dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst         16-Apr-2022 03:59    198K
+dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst.sig     16-Apr-2022 03:59     310
+

+""" > https_archive.archlinux.org/packages_d_dialog + +echo -e """ + +Index of /packages/m/mercurial/ + +

Index of /packages/m/mercurial/


../
+mercurial-4.8.2-1-x86_64.pkg.tar.xz                15-Jan-2019 20:31      4M
+mercurial-4.8.2-1-x86_64.pkg.tar.xz.sig            15-Jan-2019 20:31     310
+mercurial-4.9-1-x86_64.pkg.tar.xz                  12-Feb-2019 06:15      4M
+mercurial-4.9-1-x86_64.pkg.tar.xz.sig              12-Feb-2019 06:15     310
+mercurial-4.9.1-1-x86_64.pkg.tar.xz                30-Mar-2019 17:40      4M
+mercurial-4.9.1-1-x86_64.pkg.tar.xz.sig            30-Mar-2019 17:40     310
+mercurial-5.0-1-x86_64.pkg.tar.xz                  10-May-2019 08:44      4M
+mercurial-5.0-1-x86_64.pkg.tar.xz.sig              10-May-2019 08:44     310
+mercurial-5.0.1-1-x86_64.pkg.tar.xz                10-Jun-2019 18:05      4M
+mercurial-5.0.1-1-x86_64.pkg.tar.xz.sig            10-Jun-2019 18:05     310
+mercurial-5.0.2-1-x86_64.pkg.tar.xz                10-Jul-2019 04:58      4M
+mercurial-5.0.2-1-x86_64.pkg.tar.xz.sig            10-Jul-2019 04:58     310
+mercurial-5.1-1-x86_64.pkg.tar.xz                  17-Aug-2019 19:58      4M
+mercurial-5.1-1-x86_64.pkg.tar.xz.sig              17-Aug-2019 19:58     310
+mercurial-5.1.2-1-x86_64.pkg.tar.xz                08-Oct-2019 08:38      4M
+mercurial-5.1.2-1-x86_64.pkg.tar.xz.sig            08-Oct-2019 08:38     310
+mercurial-5.2-1-x86_64.pkg.tar.xz                  28-Nov-2019 06:41      4M
+mercurial-5.2-1-x86_64.pkg.tar.xz.sig              28-Nov-2019 06:41     310
+mercurial-5.2.1-1-x86_64.pkg.tar.zst               06-Jan-2020 12:35      4M
+mercurial-5.2.1-1-x86_64.pkg.tar.zst.sig           06-Jan-2020 12:35     310
+mercurial-5.2.2-1-x86_64.pkg.tar.zst               15-Jan-2020 14:07      5M
+mercurial-5.2.2-1-x86_64.pkg.tar.zst.sig           15-Jan-2020 14:07     310
+mercurial-5.2.2-2-x86_64.pkg.tar.zst               30-Jan-2020 20:05      4M
+mercurial-5.2.2-2-x86_64.pkg.tar.zst.sig           30-Jan-2020 20:05     310
+mercurial-5.3-1-x86_64.pkg.tar.zst                 13-Feb-2020 21:40      5M
+mercurial-5.3-1-x86_64.pkg.tar.zst.sig             13-Feb-2020 21:40     566
+mercurial-5.3.1-1-x86_64.pkg.tar.zst               07-Mar-2020 23:58      4M
+mercurial-5.3.1-1-x86_64.pkg.tar.zst.sig           07-Mar-2020 23:58     310
+mercurial-5.3.2-1-x86_64.pkg.tar.zst               05-Apr-2020 17:48      4M
+mercurial-5.3.2-1-x86_64.pkg.tar.zst.sig           05-Apr-2020 17:48     310
+mercurial-5.4-1-x86_64.pkg.tar.zst                 10-May-2020 17:19      5M
+mercurial-5.4-1-x86_64.pkg.tar.zst.sig             10-May-2020 17:19     310
+mercurial-5.4-2-x86_64.pkg.tar.zst                 04-Jun-2020 13:38      5M
+mercurial-5.4-2-x86_64.pkg.tar.zst.sig             04-Jun-2020 13:38     310
+mercurial-5.4.1-1-x86_64.pkg.tar.zst               06-Jun-2020 12:28      5M
+mercurial-5.4.1-1-x86_64.pkg.tar.zst.sig           06-Jun-2020 12:28     310
+mercurial-5.4.2-1-x86_64.pkg.tar.zst               02-Jul-2020 21:35      5M
+mercurial-5.4.2-1-x86_64.pkg.tar.zst.sig           02-Jul-2020 21:35     566
+mercurial-5.5-1-x86_64.pkg.tar.zst                 05-Aug-2020 10:39      5M
+mercurial-5.5-1-x86_64.pkg.tar.zst.sig             05-Aug-2020 10:39     310
+mercurial-5.5.1-1-x86_64.pkg.tar.zst               03-Sep-2020 19:05      5M
+mercurial-5.5.1-1-x86_64.pkg.tar.zst.sig           03-Sep-2020 19:05     310
+mercurial-5.5.2-1-x86_64.pkg.tar.zst               07-Oct-2020 20:05      5M
+mercurial-5.5.2-1-x86_64.pkg.tar.zst.sig           07-Oct-2020 20:05     310
+mercurial-5.6-1-x86_64.pkg.tar.zst                 03-Nov-2020 17:26      5M
+mercurial-5.6-1-x86_64.pkg.tar.zst.sig             03-Nov-2020 17:26     310
+mercurial-5.6-2-x86_64.pkg.tar.zst                 09-Nov-2020 16:54      5M
+mercurial-5.6-2-x86_64.pkg.tar.zst.sig             09-Nov-2020 16:54     310
+mercurial-5.6-3-x86_64.pkg.tar.zst                 11-Nov-2020 15:20      5M
+mercurial-5.6-3-x86_64.pkg.tar.zst.sig             11-Nov-2020 15:20     310
+mercurial-5.6.1-1-x86_64.pkg.tar.zst               05-Dec-2020 12:29      5M
+mercurial-5.6.1-1-x86_64.pkg.tar.zst.sig           05-Dec-2020 12:29     310
+mercurial-5.7-1-x86_64.pkg.tar.zst                 04-Feb-2021 08:41      5M
+mercurial-5.7-1-x86_64.pkg.tar.zst.sig             04-Feb-2021 08:41     310
+mercurial-5.7.1-1-x86_64.pkg.tar.zst               11-Mar-2021 07:51      5M
+mercurial-5.7.1-1-x86_64.pkg.tar.zst.sig           11-Mar-2021 07:51     310
+mercurial-5.8-1-x86_64.pkg.tar.zst                 04-May-2021 17:55      5M
+mercurial-5.8-1-x86_64.pkg.tar.zst.sig             04-May-2021 17:55     310
+mercurial-5.8-2-x86_64.pkg.tar.zst                 08-May-2021 22:08      5M
+mercurial-5.8-2-x86_64.pkg.tar.zst.sig             08-May-2021 22:08     310
+mercurial-5.8.1-1-x86_64.pkg.tar.zst               13-Jul-2021 07:04      5M
+mercurial-5.8.1-1-x86_64.pkg.tar.zst.sig           13-Jul-2021 07:04     310
+mercurial-5.9.1-1-x86_64.pkg.tar.zst               01-Sep-2021 12:48      5M
+mercurial-5.9.1-1-x86_64.pkg.tar.zst.sig           01-Sep-2021 12:48     310
+mercurial-5.9.1-2-x86_64.pkg.tar.zst               24-Sep-2021 17:39      5M
+mercurial-5.9.1-2-x86_64.pkg.tar.zst.sig           24-Sep-2021 17:39     310
+mercurial-5.9.2-1-x86_64.pkg.tar.zst               07-Oct-2021 21:52      5M
+mercurial-5.9.2-1-x86_64.pkg.tar.zst.sig           07-Oct-2021 21:52     310
+mercurial-5.9.3-1-x86_64.pkg.tar.zst               27-Oct-2021 07:20      5M
+mercurial-5.9.3-1-x86_64.pkg.tar.zst.sig           27-Oct-2021 07:20     310
+mercurial-6.0-1-x86_64.pkg.tar.zst                 25-Nov-2021 17:10      5M
+mercurial-6.0-1-x86_64.pkg.tar.zst.sig             25-Nov-2021 17:10     310
+mercurial-6.0-2-x86_64.pkg.tar.zst                 30-Nov-2021 20:53      5M
+mercurial-6.0-2-x86_64.pkg.tar.zst.sig             30-Nov-2021 20:53     310
+mercurial-6.0-3-x86_64.pkg.tar.zst                 02-Dec-2021 12:06      5M
+mercurial-6.0-3-x86_64.pkg.tar.zst.sig             02-Dec-2021 12:06     310
+mercurial-6.0.1-1-x86_64.pkg.tar.zst               08-Jan-2022 10:07      5M
+mercurial-6.0.1-1-x86_64.pkg.tar.zst.sig           08-Jan-2022 10:07     310
+mercurial-6.0.2-1-x86_64.pkg.tar.zst               03-Feb-2022 13:28      5M
+mercurial-6.0.2-1-x86_64.pkg.tar.zst.sig           03-Feb-2022 13:28     310
+mercurial-6.0.3-1-x86_64.pkg.tar.zst               23-Feb-2022 20:50      5M
+mercurial-6.0.3-1-x86_64.pkg.tar.zst.sig           23-Feb-2022 20:50     310
+mercurial-6.1-1-x86_64.pkg.tar.zst                 03-Mar-2022 18:06      5M
+mercurial-6.1-1-x86_64.pkg.tar.zst.sig             03-Mar-2022 18:06     310
+mercurial-6.1-2-x86_64.pkg.tar.zst                 04-Mar-2022 08:37      5M
+mercurial-6.1-2-x86_64.pkg.tar.zst.sig             04-Mar-2022 08:37     310
+mercurial-6.1.1-1-x86_64.pkg.tar.zst               07-Apr-2022 18:26      5M
+mercurial-6.1.1-1-x86_64.pkg.tar.zst.sig           07-Apr-2022 18:26     310
+mercurial-6.1.2-1-x86_64.pkg.tar.zst               07-May-2022 11:03      5M
+mercurial-6.1.2-1-x86_64.pkg.tar.zst.sig           07-May-2022 11:03     310
+

+""" > https_archive.archlinux.org/packages_m_mercurial + +echo -e """ +Index of /packages/l/libasyncns/ + +

Index of /packages/l/libasyncns/


../
+libasyncns-0.8+3+g68cd5af-2-x86_64.pkg.tar.xz      09-Nov-2018 23:39     16K
+libasyncns-0.8+3+g68cd5af-2-x86_64.pkg.tar.xz.sig  09-Nov-2018 23:39     310
+libasyncns-0.8+3+g68cd5af-3-x86_64.pkg.tar.zst     19-May-2020 08:28     17K
+libasyncns-0.8+3+g68cd5af-3-x86_64.pkg.tar.zst.sig 19-May-2020 08:28     566
+libasyncns-1:0.8+r3+g68cd5af-1-x86_64.pkg.tar.zst  18-May-2022 17:23     17K
+libasyncns-1:0.8+r3+g68cd5af-1-x86_64.pkg.tar.z..> 18-May-2022 17:23     141
+

+""" > https_archive.archlinux.org/packages_l_libasyncns + +echo -e """ +Index of /packages/p/python-hglib/ + +

Index of /packages/p/python-hglib/


../
+python-hglib-2.6.1-3-any.pkg.tar.xz                06-Nov-2019 14:08     40K
+python-hglib-2.6.1-3-any.pkg.tar.xz.sig            06-Nov-2019 14:08     566
+python-hglib-2.6.2-1-any.pkg.tar.zst               19-Nov-2020 22:29     43K
+python-hglib-2.6.2-1-any.pkg.tar.zst.sig           19-Nov-2020 22:29     566
+python-hglib-2.6.2-2-any.pkg.tar.zst               19-Nov-2020 22:31     43K
+python-hglib-2.6.2-2-any.pkg.tar.zst.sig           19-Nov-2020 22:31     566
+python-hglib-2.6.2-3-any.pkg.tar.zst               19-Nov-2020 22:35     43K
+python-hglib-2.6.2-3-any.pkg.tar.zst.sig           19-Nov-2020 22:35     566
+python-hglib-2.6.2-4-any.pkg.tar.zst               03-Dec-2021 00:44     43K
+python-hglib-2.6.2-4-any.pkg.tar.zst.sig           03-Dec-2021 00:44     310
+

+""" > https_archive.archlinux.org/packages_p_python-hglib + +echo -e """ +Index of /packages/g/gnome-code-assistance/ + +

Index of /packages/g/gnome-code-assistance/


../
+gnome-code-assistance-1:3.16.1+15+g0fd8b5f-1-x8..> 10-Nov-2019 20:55      2M
+gnome-code-assistance-1:3.16.1+15+g0fd8b5f-1-x8..> 10-Nov-2019 20:56     310
+gnome-code-assistance-1:3.16.1+15+g0fd8b5f-2-x8..> 28-Mar-2020 15:58      2M
+gnome-code-assistance-1:3.16.1+15+g0fd8b5f-2-x8..> 28-Mar-2020 15:58     310
+gnome-code-assistance-1:3.16.1+15+g0fd8b5f-3-x8..> 05-Jul-2020 15:28      2M
+gnome-code-assistance-1:3.16.1+15+g0fd8b5f-3-x8..> 05-Jul-2020 15:28     590
+gnome-code-assistance-1:3.16.1+15+g0fd8b5f-4-x8..> 12-Nov-2020 17:28      2M
+gnome-code-assistance-1:3.16.1+15+g0fd8b5f-4-x8..> 12-Nov-2020 17:29     310
+gnome-code-assistance-2:3.16.1+14+gaad6437-1-x8..> 24-Feb-2021 16:30      2M
+gnome-code-assistance-2:3.16.1+14+gaad6437-1-x8..> 24-Feb-2021 16:30     141
+gnome-code-assistance-2:3.16.1+14+gaad6437-2-x8..> 02-Dec-2021 23:36      2M
+gnome-code-assistance-2:3.16.1+14+gaad6437-2-x8..> 02-Dec-2021 23:36     566
+gnome-code-assistance-3.16.1+14+gaad6437-1-x86_..> 15-Mar-2019 19:23      2M
+gnome-code-assistance-3.16.1+14+gaad6437-1-x86_..> 15-Mar-2019 19:23     310
+gnome-code-assistance-3.16.1+14+gaad6437-2-x86_..> 24-Aug-2019 20:05      2M
+gnome-code-assistance-3.16.1+14+gaad6437-2-x86_..> 24-Aug-2019 20:05     310
+gnome-code-assistance-3.16.1+15+gb9ffc4d-1-x86_..> 25-Aug-2019 20:55      2M
+gnome-code-assistance-3.16.1+15+gb9ffc4d-1-x86_..> 25-Aug-2019 20:55     310
+gnome-code-assistance-3:3.16.1+r14+gaad6437-1-x..> 18-May-2022 17:23      2M
+gnome-code-assistance-3:3.16.1+r14+gaad6437-1-x..> 18-May-2022 17:23     141
+

+""" > https_archive.archlinux.org/packages_g_gnome-code-assistance + +# Clean up removing tmp_dir +rm -rf tmp_dir/ diff --git a/swh/lister/arch/tests/data/https_archive.archlinux.org/packages_d_dialog b/swh/lister/arch/tests/data/https_archive.archlinux.org/packages_d_dialog new file mode 100644 index 0000000..4b2984c --- /dev/null +++ b/swh/lister/arch/tests/data/https_archive.archlinux.org/packages_d_dialog @@ -0,0 +1,52 @@ + +Index of /packages/d/dialog/ + +

Index of /packages/d/dialog/


../
+dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz          13-Feb-2019 08:36    180K
+dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz.sig      13-Feb-2019 08:36     310
+dialog-1:1.3_20190724-1-x86_64.pkg.tar.xz          26-Jul-2019 21:39    180K
+dialog-1:1.3_20190724-1-x86_64.pkg.tar.xz.sig      26-Jul-2019 21:43     310
+dialog-1:1.3_20190728-1-x86_64.pkg.tar.xz          29-Jul-2019 12:10    180K
+dialog-1:1.3_20190728-1-x86_64.pkg.tar.xz.sig      29-Jul-2019 12:10     310
+dialog-1:1.3_20190806-1-x86_64.pkg.tar.xz          07-Aug-2019 04:19    182K
+dialog-1:1.3_20190806-1-x86_64.pkg.tar.xz.sig      07-Aug-2019 04:19     310
+dialog-1:1.3_20190808-1-x86_64.pkg.tar.xz          09-Aug-2019 22:49    182K
+dialog-1:1.3_20190808-1-x86_64.pkg.tar.xz.sig      09-Aug-2019 22:50     310
+dialog-1:1.3_20191110-1-x86_64.pkg.tar.xz          11-Nov-2019 11:15    183K
+dialog-1:1.3_20191110-1-x86_64.pkg.tar.xz.sig      11-Nov-2019 11:17     310
+dialog-1:1.3_20191110-2-x86_64.pkg.tar.xz          13-Nov-2019 17:40    183K
+dialog-1:1.3_20191110-2-x86_64.pkg.tar.xz.sig      13-Nov-2019 17:41     310
+dialog-1:1.3_20191209-1-x86_64.pkg.tar.xz          10-Dec-2019 09:56    183K
+dialog-1:1.3_20191209-1-x86_64.pkg.tar.xz.sig      10-Dec-2019 09:57     310
+dialog-1:1.3_20191210-1-x86_64.pkg.tar.xz          12-Dec-2019 15:55    184K
+dialog-1:1.3_20191210-1-x86_64.pkg.tar.xz.sig      12-Dec-2019 15:56     310
+dialog-1:1.3_20200228-1-x86_64.pkg.tar.zst         06-Mar-2020 02:21    196K
+dialog-1:1.3_20200228-1-x86_64.pkg.tar.zst.sig     06-Mar-2020 02:22     310
+dialog-1:1.3_20200327-1-x86_64.pkg.tar.zst         29-Mar-2020 17:08    196K
+dialog-1:1.3_20200327-1-x86_64.pkg.tar.zst.sig     29-Mar-2020 17:09     310
+dialog-1:1.3_20201126-1-x86_64.pkg.tar.zst         27-Nov-2020 12:19    199K
+dialog-1:1.3_20201126-1-x86_64.pkg.tar.zst.sig     27-Nov-2020 12:20     310
+dialog-1:1.3_20210117-1-x86_64.pkg.tar.zst         18-Jan-2021 18:05    200K
+dialog-1:1.3_20210117-1-x86_64.pkg.tar.zst.sig     18-Jan-2021 18:05     310
+dialog-1:1.3_20210306-1-x86_64.pkg.tar.zst         07-Mar-2021 11:40    201K
+dialog-1:1.3_20210306-1-x86_64.pkg.tar.zst.sig     07-Mar-2021 11:41     310
+dialog-1:1.3_20210319-1-x86_64.pkg.tar.zst         20-Mar-2021 00:12    201K
+dialog-1:1.3_20210319-1-x86_64.pkg.tar.zst.sig     20-Mar-2021 00:13     310
+dialog-1:1.3_20210324-1-x86_64.pkg.tar.zst         26-Mar-2021 17:53    201K
+dialog-1:1.3_20210324-1-x86_64.pkg.tar.zst.sig     26-Mar-2021 17:53     310
+dialog-1:1.3_20210509-1-x86_64.pkg.tar.zst         16-May-2021 02:04    198K
+dialog-1:1.3_20210509-1-x86_64.pkg.tar.zst.sig     16-May-2021 02:04     310
+dialog-1:1.3_20210530-1-x86_64.pkg.tar.zst         31-May-2021 14:59    198K
+dialog-1:1.3_20210530-1-x86_64.pkg.tar.zst.sig     31-May-2021 15:00     310
+dialog-1:1.3_20210621-1-x86_64.pkg.tar.zst         23-Jun-2021 02:59    199K
+dialog-1:1.3_20210621-1-x86_64.pkg.tar.zst.sig     23-Jun-2021 03:00     310
+dialog-1:1.3_20211107-1-x86_64.pkg.tar.zst         09-Nov-2021 14:06    197K
+dialog-1:1.3_20211107-1-x86_64.pkg.tar.zst.sig     09-Nov-2021 14:13     310
+dialog-1:1.3_20211214-1-x86_64.pkg.tar.zst         14-Dec-2021 09:26    197K
+dialog-1:1.3_20211214-1-x86_64.pkg.tar.zst.sig     14-Dec-2021 09:27     310
+dialog-1:1.3_20220117-1-x86_64.pkg.tar.zst         19-Jan-2022 09:56    199K
+dialog-1:1.3_20220117-1-x86_64.pkg.tar.zst.sig     19-Jan-2022 09:56     310
+dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst         16-Apr-2022 03:59    198K
+dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst.sig     16-Apr-2022 03:59     310
+

+ diff --git a/swh/lister/arch/tests/data/https_archive.archlinux.org/packages_g_gnome-code-assistance b/swh/lister/arch/tests/data/https_archive.archlinux.org/packages_g_gnome-code-assistance new file mode 100644 index 0000000..4a78ff3 --- /dev/null +++ b/swh/lister/arch/tests/data/https_archive.archlinux.org/packages_g_gnome-code-assistance @@ -0,0 +1,26 @@ + +Index of /packages/g/gnome-code-assistance/ + +

Index of /packages/g/gnome-code-assistance/


../
+gnome-code-assistance-1:3.16.1+15+g0fd8b5f-1-x8..> 10-Nov-2019 20:55      2M
+gnome-code-assistance-1:3.16.1+15+g0fd8b5f-1-x8..> 10-Nov-2019 20:56     310
+gnome-code-assistance-1:3.16.1+15+g0fd8b5f-2-x8..> 28-Mar-2020 15:58      2M
+gnome-code-assistance-1:3.16.1+15+g0fd8b5f-2-x8..> 28-Mar-2020 15:58     310
+gnome-code-assistance-1:3.16.1+15+g0fd8b5f-3-x8..> 05-Jul-2020 15:28      2M
+gnome-code-assistance-1:3.16.1+15+g0fd8b5f-3-x8..> 05-Jul-2020 15:28     590
+gnome-code-assistance-1:3.16.1+15+g0fd8b5f-4-x8..> 12-Nov-2020 17:28      2M
+gnome-code-assistance-1:3.16.1+15+g0fd8b5f-4-x8..> 12-Nov-2020 17:29     310
+gnome-code-assistance-2:3.16.1+14+gaad6437-1-x8..> 24-Feb-2021 16:30      2M
+gnome-code-assistance-2:3.16.1+14+gaad6437-1-x8..> 24-Feb-2021 16:30     141
+gnome-code-assistance-2:3.16.1+14+gaad6437-2-x8..> 02-Dec-2021 23:36      2M
+gnome-code-assistance-2:3.16.1+14+gaad6437-2-x8..> 02-Dec-2021 23:36     566
+gnome-code-assistance-3.16.1+14+gaad6437-1-x86_..> 15-Mar-2019 19:23      2M
+gnome-code-assistance-3.16.1+14+gaad6437-1-x86_..> 15-Mar-2019 19:23     310
+gnome-code-assistance-3.16.1+14+gaad6437-2-x86_..> 24-Aug-2019 20:05      2M
+gnome-code-assistance-3.16.1+14+gaad6437-2-x86_..> 24-Aug-2019 20:05     310
+gnome-code-assistance-3.16.1+15+gb9ffc4d-1-x86_..> 25-Aug-2019 20:55      2M
+gnome-code-assistance-3.16.1+15+gb9ffc4d-1-x86_..> 25-Aug-2019 20:55     310
+gnome-code-assistance-3:3.16.1+r14+gaad6437-1-x..> 18-May-2022 17:23      2M
+gnome-code-assistance-3:3.16.1+r14+gaad6437-1-x..> 18-May-2022 17:23     141
+

+ diff --git a/swh/lister/arch/tests/data/https_archive.archlinux.org/packages_g_gzip b/swh/lister/arch/tests/data/https_archive.archlinux.org/packages_g_gzip new file mode 100644 index 0000000..b28010c --- /dev/null +++ b/swh/lister/arch/tests/data/https_archive.archlinux.org/packages_g_gzip @@ -0,0 +1,16 @@ + +Index of /packages/g/gzip/ + +

Index of /packages/g/gzip/


../
+gzip-1.10-1-x86_64.pkg.tar.xz                      30-Dec-2018 18:38     78K
+gzip-1.10-1-x86_64.pkg.tar.xz.sig                  30-Dec-2018 18:38     558
+gzip-1.10-2-x86_64.pkg.tar.xz                      06-Oct-2019 16:02     78K
+gzip-1.10-2-x86_64.pkg.tar.xz.sig                  06-Oct-2019 16:02     558
+gzip-1.10-3-x86_64.pkg.tar.xz                      13-Nov-2019 15:55     78K
+gzip-1.10-3-x86_64.pkg.tar.xz.sig                  13-Nov-2019 15:55     566
+gzip-1.11-1-x86_64.pkg.tar.zst                     04-Sep-2021 02:02     82K
+gzip-1.11-1-x86_64.pkg.tar.zst.sig                 04-Sep-2021 02:02     558
+gzip-1.12-1-x86_64.pkg.tar.zst                     07-Apr-2022 17:35     80K
+gzip-1.12-1-x86_64.pkg.tar.zst.sig                 07-Apr-2022 17:35     566
+

+ diff --git a/swh/lister/arch/tests/data/https_archive.archlinux.org/packages_l_libasyncns b/swh/lister/arch/tests/data/https_archive.archlinux.org/packages_l_libasyncns new file mode 100644 index 0000000..a72be63 --- /dev/null +++ b/swh/lister/arch/tests/data/https_archive.archlinux.org/packages_l_libasyncns @@ -0,0 +1,12 @@ + +Index of /packages/l/libasyncns/ + +

Index of /packages/l/libasyncns/


../
+libasyncns-0.8+3+g68cd5af-2-x86_64.pkg.tar.xz      09-Nov-2018 23:39     16K
+libasyncns-0.8+3+g68cd5af-2-x86_64.pkg.tar.xz.sig  09-Nov-2018 23:39     310
+libasyncns-0.8+3+g68cd5af-3-x86_64.pkg.tar.zst     19-May-2020 08:28     17K
+libasyncns-0.8+3+g68cd5af-3-x86_64.pkg.tar.zst.sig 19-May-2020 08:28     566
+libasyncns-1:0.8+r3+g68cd5af-1-x86_64.pkg.tar.zst  18-May-2022 17:23     17K
+libasyncns-1:0.8+r3+g68cd5af-1-x86_64.pkg.tar.z..> 18-May-2022 17:23     141
+

+ diff --git a/swh/lister/arch/tests/data/https_archive.archlinux.org/packages_m_mercurial b/swh/lister/arch/tests/data/https_archive.archlinux.org/packages_m_mercurial new file mode 100644 index 0000000..33b421d --- /dev/null +++ b/swh/lister/arch/tests/data/https_archive.archlinux.org/packages_m_mercurial @@ -0,0 +1,97 @@ + + +Index of /packages/m/mercurial/ + +

Index of /packages/m/mercurial/


../
+mercurial-4.8.2-1-x86_64.pkg.tar.xz                15-Jan-2019 20:31      4M
+mercurial-4.8.2-1-x86_64.pkg.tar.xz.sig            15-Jan-2019 20:31     310
+mercurial-4.9-1-x86_64.pkg.tar.xz                  12-Feb-2019 06:15      4M
+mercurial-4.9-1-x86_64.pkg.tar.xz.sig              12-Feb-2019 06:15     310
+mercurial-4.9.1-1-x86_64.pkg.tar.xz                30-Mar-2019 17:40      4M
+mercurial-4.9.1-1-x86_64.pkg.tar.xz.sig            30-Mar-2019 17:40     310
+mercurial-5.0-1-x86_64.pkg.tar.xz                  10-May-2019 08:44      4M
+mercurial-5.0-1-x86_64.pkg.tar.xz.sig              10-May-2019 08:44     310
+mercurial-5.0.1-1-x86_64.pkg.tar.xz                10-Jun-2019 18:05      4M
+mercurial-5.0.1-1-x86_64.pkg.tar.xz.sig            10-Jun-2019 18:05     310
+mercurial-5.0.2-1-x86_64.pkg.tar.xz                10-Jul-2019 04:58      4M
+mercurial-5.0.2-1-x86_64.pkg.tar.xz.sig            10-Jul-2019 04:58     310
+mercurial-5.1-1-x86_64.pkg.tar.xz                  17-Aug-2019 19:58      4M
+mercurial-5.1-1-x86_64.pkg.tar.xz.sig              17-Aug-2019 19:58     310
+mercurial-5.1.2-1-x86_64.pkg.tar.xz                08-Oct-2019 08:38      4M
+mercurial-5.1.2-1-x86_64.pkg.tar.xz.sig            08-Oct-2019 08:38     310
+mercurial-5.2-1-x86_64.pkg.tar.xz                  28-Nov-2019 06:41      4M
+mercurial-5.2-1-x86_64.pkg.tar.xz.sig              28-Nov-2019 06:41     310
+mercurial-5.2.1-1-x86_64.pkg.tar.zst               06-Jan-2020 12:35      4M
+mercurial-5.2.1-1-x86_64.pkg.tar.zst.sig           06-Jan-2020 12:35     310
+mercurial-5.2.2-1-x86_64.pkg.tar.zst               15-Jan-2020 14:07      5M
+mercurial-5.2.2-1-x86_64.pkg.tar.zst.sig           15-Jan-2020 14:07     310
+mercurial-5.2.2-2-x86_64.pkg.tar.zst               30-Jan-2020 20:05      4M
+mercurial-5.2.2-2-x86_64.pkg.tar.zst.sig           30-Jan-2020 20:05     310
+mercurial-5.3-1-x86_64.pkg.tar.zst                 13-Feb-2020 21:40      5M
+mercurial-5.3-1-x86_64.pkg.tar.zst.sig             13-Feb-2020 21:40     566
+mercurial-5.3.1-1-x86_64.pkg.tar.zst               07-Mar-2020 23:58      4M
+mercurial-5.3.1-1-x86_64.pkg.tar.zst.sig           07-Mar-2020 23:58     310
+mercurial-5.3.2-1-x86_64.pkg.tar.zst               05-Apr-2020 17:48      4M
+mercurial-5.3.2-1-x86_64.pkg.tar.zst.sig           05-Apr-2020 17:48     310
+mercurial-5.4-1-x86_64.pkg.tar.zst                 10-May-2020 17:19      5M
+mercurial-5.4-1-x86_64.pkg.tar.zst.sig             10-May-2020 17:19     310
+mercurial-5.4-2-x86_64.pkg.tar.zst                 04-Jun-2020 13:38      5M
+mercurial-5.4-2-x86_64.pkg.tar.zst.sig             04-Jun-2020 13:38     310
+mercurial-5.4.1-1-x86_64.pkg.tar.zst               06-Jun-2020 12:28      5M
+mercurial-5.4.1-1-x86_64.pkg.tar.zst.sig           06-Jun-2020 12:28     310
+mercurial-5.4.2-1-x86_64.pkg.tar.zst               02-Jul-2020 21:35      5M
+mercurial-5.4.2-1-x86_64.pkg.tar.zst.sig           02-Jul-2020 21:35     566
+mercurial-5.5-1-x86_64.pkg.tar.zst                 05-Aug-2020 10:39      5M
+mercurial-5.5-1-x86_64.pkg.tar.zst.sig             05-Aug-2020 10:39     310
+mercurial-5.5.1-1-x86_64.pkg.tar.zst               03-Sep-2020 19:05      5M
+mercurial-5.5.1-1-x86_64.pkg.tar.zst.sig           03-Sep-2020 19:05     310
+mercurial-5.5.2-1-x86_64.pkg.tar.zst               07-Oct-2020 20:05      5M
+mercurial-5.5.2-1-x86_64.pkg.tar.zst.sig           07-Oct-2020 20:05     310
+mercurial-5.6-1-x86_64.pkg.tar.zst                 03-Nov-2020 17:26      5M
+mercurial-5.6-1-x86_64.pkg.tar.zst.sig             03-Nov-2020 17:26     310
+mercurial-5.6-2-x86_64.pkg.tar.zst                 09-Nov-2020 16:54      5M
+mercurial-5.6-2-x86_64.pkg.tar.zst.sig             09-Nov-2020 16:54     310
+mercurial-5.6-3-x86_64.pkg.tar.zst                 11-Nov-2020 15:20      5M
+mercurial-5.6-3-x86_64.pkg.tar.zst.sig             11-Nov-2020 15:20     310
+mercurial-5.6.1-1-x86_64.pkg.tar.zst               05-Dec-2020 12:29      5M
+mercurial-5.6.1-1-x86_64.pkg.tar.zst.sig           05-Dec-2020 12:29     310
+mercurial-5.7-1-x86_64.pkg.tar.zst                 04-Feb-2021 08:41      5M
+mercurial-5.7-1-x86_64.pkg.tar.zst.sig             04-Feb-2021 08:41     310
+mercurial-5.7.1-1-x86_64.pkg.tar.zst               11-Mar-2021 07:51      5M
+mercurial-5.7.1-1-x86_64.pkg.tar.zst.sig           11-Mar-2021 07:51     310
+mercurial-5.8-1-x86_64.pkg.tar.zst                 04-May-2021 17:55      5M
+mercurial-5.8-1-x86_64.pkg.tar.zst.sig             04-May-2021 17:55     310
+mercurial-5.8-2-x86_64.pkg.tar.zst                 08-May-2021 22:08      5M
+mercurial-5.8-2-x86_64.pkg.tar.zst.sig             08-May-2021 22:08     310
+mercurial-5.8.1-1-x86_64.pkg.tar.zst               13-Jul-2021 07:04      5M
+mercurial-5.8.1-1-x86_64.pkg.tar.zst.sig           13-Jul-2021 07:04     310
+mercurial-5.9.1-1-x86_64.pkg.tar.zst               01-Sep-2021 12:48      5M
+mercurial-5.9.1-1-x86_64.pkg.tar.zst.sig           01-Sep-2021 12:48     310
+mercurial-5.9.1-2-x86_64.pkg.tar.zst               24-Sep-2021 17:39      5M
+mercurial-5.9.1-2-x86_64.pkg.tar.zst.sig           24-Sep-2021 17:39     310
+mercurial-5.9.2-1-x86_64.pkg.tar.zst               07-Oct-2021 21:52      5M
+mercurial-5.9.2-1-x86_64.pkg.tar.zst.sig           07-Oct-2021 21:52     310
+mercurial-5.9.3-1-x86_64.pkg.tar.zst               27-Oct-2021 07:20      5M
+mercurial-5.9.3-1-x86_64.pkg.tar.zst.sig           27-Oct-2021 07:20     310
+mercurial-6.0-1-x86_64.pkg.tar.zst                 25-Nov-2021 17:10      5M
+mercurial-6.0-1-x86_64.pkg.tar.zst.sig             25-Nov-2021 17:10     310
+mercurial-6.0-2-x86_64.pkg.tar.zst                 30-Nov-2021 20:53      5M
+mercurial-6.0-2-x86_64.pkg.tar.zst.sig             30-Nov-2021 20:53     310
+mercurial-6.0-3-x86_64.pkg.tar.zst                 02-Dec-2021 12:06      5M
+mercurial-6.0-3-x86_64.pkg.tar.zst.sig             02-Dec-2021 12:06     310
+mercurial-6.0.1-1-x86_64.pkg.tar.zst               08-Jan-2022 10:07      5M
+mercurial-6.0.1-1-x86_64.pkg.tar.zst.sig           08-Jan-2022 10:07     310
+mercurial-6.0.2-1-x86_64.pkg.tar.zst               03-Feb-2022 13:28      5M
+mercurial-6.0.2-1-x86_64.pkg.tar.zst.sig           03-Feb-2022 13:28     310
+mercurial-6.0.3-1-x86_64.pkg.tar.zst               23-Feb-2022 20:50      5M
+mercurial-6.0.3-1-x86_64.pkg.tar.zst.sig           23-Feb-2022 20:50     310
+mercurial-6.1-1-x86_64.pkg.tar.zst                 03-Mar-2022 18:06      5M
+mercurial-6.1-1-x86_64.pkg.tar.zst.sig             03-Mar-2022 18:06     310
+mercurial-6.1-2-x86_64.pkg.tar.zst                 04-Mar-2022 08:37      5M
+mercurial-6.1-2-x86_64.pkg.tar.zst.sig             04-Mar-2022 08:37     310
+mercurial-6.1.1-1-x86_64.pkg.tar.zst               07-Apr-2022 18:26      5M
+mercurial-6.1.1-1-x86_64.pkg.tar.zst.sig           07-Apr-2022 18:26     310
+mercurial-6.1.2-1-x86_64.pkg.tar.zst               07-May-2022 11:03      5M
+mercurial-6.1.2-1-x86_64.pkg.tar.zst.sig           07-May-2022 11:03     310
+

+ diff --git a/swh/lister/arch/tests/data/https_archive.archlinux.org/packages_p_python-hglib b/swh/lister/arch/tests/data/https_archive.archlinux.org/packages_p_python-hglib new file mode 100644 index 0000000..14ec9b3 --- /dev/null +++ b/swh/lister/arch/tests/data/https_archive.archlinux.org/packages_p_python-hglib @@ -0,0 +1,16 @@ + +Index of /packages/p/python-hglib/ + +

Index of /packages/p/python-hglib/


../
+python-hglib-2.6.1-3-any.pkg.tar.xz                06-Nov-2019 14:08     40K
+python-hglib-2.6.1-3-any.pkg.tar.xz.sig            06-Nov-2019 14:08     566
+python-hglib-2.6.2-1-any.pkg.tar.zst               19-Nov-2020 22:29     43K
+python-hglib-2.6.2-1-any.pkg.tar.zst.sig           19-Nov-2020 22:29     566
+python-hglib-2.6.2-2-any.pkg.tar.zst               19-Nov-2020 22:31     43K
+python-hglib-2.6.2-2-any.pkg.tar.zst.sig           19-Nov-2020 22:31     566
+python-hglib-2.6.2-3-any.pkg.tar.zst               19-Nov-2020 22:35     43K
+python-hglib-2.6.2-3-any.pkg.tar.zst.sig           19-Nov-2020 22:35     566
+python-hglib-2.6.2-4-any.pkg.tar.zst               03-Dec-2021 00:44     43K
+python-hglib-2.6.2-4-any.pkg.tar.zst.sig           03-Dec-2021 00:44     310
+

+ diff --git a/swh/lister/arch/tests/data/https_archive.archlinux.org/repos_last_community_os_x86_64_community.files.tar.gz b/swh/lister/arch/tests/data/https_archive.archlinux.org/repos_last_community_os_x86_64_community.files.tar.gz new file mode 100644 index 0000000..b123d56 Binary files /dev/null and b/swh/lister/arch/tests/data/https_archive.archlinux.org/repos_last_community_os_x86_64_community.files.tar.gz differ diff --git a/swh/lister/arch/tests/data/https_archive.archlinux.org/repos_last_core_os_x86_64_core.files.tar.gz b/swh/lister/arch/tests/data/https_archive.archlinux.org/repos_last_core_os_x86_64_core.files.tar.gz new file mode 100644 index 0000000..7fb8d4e Binary files /dev/null and b/swh/lister/arch/tests/data/https_archive.archlinux.org/repos_last_core_os_x86_64_core.files.tar.gz differ diff --git a/swh/lister/arch/tests/data/https_archive.archlinux.org/repos_last_extra_os_x86_64_extra.files.tar.gz b/swh/lister/arch/tests/data/https_archive.archlinux.org/repos_last_extra_os_x86_64_extra.files.tar.gz new file mode 100644 index 0000000..bf7a652 Binary files /dev/null and b/swh/lister/arch/tests/data/https_archive.archlinux.org/repos_last_extra_os_x86_64_extra.files.tar.gz differ diff --git a/swh/lister/arch/tests/data/https_uk.mirror.archlinuxarm.org/aarch64_community_community.files.tar.gz b/swh/lister/arch/tests/data/https_uk.mirror.archlinuxarm.org/aarch64_community_community.files.tar.gz new file mode 100644 index 0000000..5fb27a6 Binary files /dev/null and b/swh/lister/arch/tests/data/https_uk.mirror.archlinuxarm.org/aarch64_community_community.files.tar.gz differ diff --git a/swh/lister/arch/tests/data/https_uk.mirror.archlinuxarm.org/aarch64_core_core.files.tar.gz b/swh/lister/arch/tests/data/https_uk.mirror.archlinuxarm.org/aarch64_core_core.files.tar.gz new file mode 100644 index 0000000..c3d8959 Binary files /dev/null and b/swh/lister/arch/tests/data/https_uk.mirror.archlinuxarm.org/aarch64_core_core.files.tar.gz differ diff --git 
a/swh/lister/arch/tests/data/https_uk.mirror.archlinuxarm.org/aarch64_extra_extra.files.tar.gz b/swh/lister/arch/tests/data/https_uk.mirror.archlinuxarm.org/aarch64_extra_extra.files.tar.gz new file mode 100644 index 0000000..58a798f Binary files /dev/null and b/swh/lister/arch/tests/data/https_uk.mirror.archlinuxarm.org/aarch64_extra_extra.files.tar.gz differ diff --git a/swh/lister/arch/tests/data/https_uk.mirror.archlinuxarm.org/armv7h_community_community.files.tar.gz b/swh/lister/arch/tests/data/https_uk.mirror.archlinuxarm.org/armv7h_community_community.files.tar.gz new file mode 100644 index 0000000..e2943c3 Binary files /dev/null and b/swh/lister/arch/tests/data/https_uk.mirror.archlinuxarm.org/armv7h_community_community.files.tar.gz differ diff --git a/swh/lister/arch/tests/data/https_uk.mirror.archlinuxarm.org/armv7h_core_core.files.tar.gz b/swh/lister/arch/tests/data/https_uk.mirror.archlinuxarm.org/armv7h_core_core.files.tar.gz new file mode 100644 index 0000000..6c58ee9 Binary files /dev/null and b/swh/lister/arch/tests/data/https_uk.mirror.archlinuxarm.org/armv7h_core_core.files.tar.gz differ diff --git a/swh/lister/arch/tests/data/https_uk.mirror.archlinuxarm.org/armv7h_extra_extra.files.tar.gz b/swh/lister/arch/tests/data/https_uk.mirror.archlinuxarm.org/armv7h_extra_extra.files.tar.gz new file mode 100644 index 0000000..633fe1c Binary files /dev/null and b/swh/lister/arch/tests/data/https_uk.mirror.archlinuxarm.org/armv7h_extra_extra.files.tar.gz differ diff --git a/swh/lister/arch/tests/test_lister.py b/swh/lister/arch/tests/test_lister.py new file mode 100644 index 0000000..daa8712 --- /dev/null +++ b/swh/lister/arch/tests/test_lister.py @@ -0,0 +1,1394 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information +from swh.lister.arch.lister import ArchLister 
+ +expected_origins = [ + { + "url": "https://archlinux.org/packages/core/x86_64/dialog", + "visit_type": "arch", + "extra_loader_arguments": { + "artifacts": [ + { + "url": "https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz", # noqa: B950 + "version": "1:1.3_20190211-1", + "length": 180000, + "filename": "dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz", + }, + { + "url": "https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20190724-1-x86_64.pkg.tar.xz", # noqa: B950 + "version": "1:1.3_20190724-1", + "length": 180000, + "filename": "dialog-1:1.3_20190724-1-x86_64.pkg.tar.xz", + }, + { + "url": "https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20190728-1-x86_64.pkg.tar.xz", # noqa: B950 + "version": "1:1.3_20190728-1", + "length": 180000, + "filename": "dialog-1:1.3_20190728-1-x86_64.pkg.tar.xz", + }, + { + "url": "https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20190806-1-x86_64.pkg.tar.xz", # noqa: B950 + "version": "1:1.3_20190806-1", + "length": 182000, + "filename": "dialog-1:1.3_20190806-1-x86_64.pkg.tar.xz", + }, + { + "url": "https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20190808-1-x86_64.pkg.tar.xz", # noqa: B950 + "version": "1:1.3_20190808-1", + "length": 182000, + "filename": "dialog-1:1.3_20190808-1-x86_64.pkg.tar.xz", + }, + { + "url": "https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20191110-1-x86_64.pkg.tar.xz", # noqa: B950 + "version": "1:1.3_20191110-1", + "length": 183000, + "filename": "dialog-1:1.3_20191110-1-x86_64.pkg.tar.xz", + }, + { + "url": "https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20191110-2-x86_64.pkg.tar.xz", # noqa: B950 + "version": "1:1.3_20191110-2", + "length": 183000, + "filename": "dialog-1:1.3_20191110-2-x86_64.pkg.tar.xz", + }, + { + "url": "https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20191209-1-x86_64.pkg.tar.xz", # noqa: B950 + "version": "1:1.3_20191209-1", + "length": 183000, + "filename": 
"dialog-1:1.3_20191209-1-x86_64.pkg.tar.xz", + }, + { + "url": "https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20191210-1-x86_64.pkg.tar.xz", # noqa: B950 + "version": "1:1.3_20191210-1", + "length": 184000, + "filename": "dialog-1:1.3_20191210-1-x86_64.pkg.tar.xz", + }, + { + "url": "https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20200228-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "1:1.3_20200228-1", + "length": 196000, + "filename": "dialog-1:1.3_20200228-1-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20200327-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "1:1.3_20200327-1", + "length": 196000, + "filename": "dialog-1:1.3_20200327-1-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20201126-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "1:1.3_20201126-1", + "length": 199000, + "filename": "dialog-1:1.3_20201126-1-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20210117-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "1:1.3_20210117-1", + "length": 200000, + "filename": "dialog-1:1.3_20210117-1-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20210306-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "1:1.3_20210306-1", + "length": 201000, + "filename": "dialog-1:1.3_20210306-1-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20210319-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "1:1.3_20210319-1", + "length": 201000, + "filename": "dialog-1:1.3_20210319-1-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20210324-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "1:1.3_20210324-1", + "length": 201000, + "filename": "dialog-1:1.3_20210324-1-x86_64.pkg.tar.zst", + }, + { + "url": 
"https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20210509-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "1:1.3_20210509-1", + "length": 198000, + "filename": "dialog-1:1.3_20210509-1-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20210530-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "1:1.3_20210530-1", + "length": 198000, + "filename": "dialog-1:1.3_20210530-1-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20210621-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "1:1.3_20210621-1", + "length": 199000, + "filename": "dialog-1:1.3_20210621-1-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20211107-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "1:1.3_20211107-1", + "length": 197000, + "filename": "dialog-1:1.3_20211107-1-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20211214-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "1:1.3_20211214-1", + "length": 197000, + "filename": "dialog-1:1.3_20211214-1-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20220117-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "1:1.3_20220117-1", + "length": 199000, + "filename": "dialog-1:1.3_20220117-1-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "1:1.3_20220414-1", + "length": 198000, + "filename": "dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst", + }, + ], + "arch_metadata": [ + { + "arch": "x86_64", + "repo": "core", + "name": "dialog", + "version": "1:1.3_20190211-1", + "last_modified": "2019-02-13T08:36:00", + }, + { + "arch": "x86_64", + "repo": "core", + "name": "dialog", + "version": "1:1.3_20190724-1", + "last_modified": "2019-07-26T21:39:00", + }, + { + "arch": "x86_64", + "repo": 
"core", + "name": "dialog", + "version": "1:1.3_20190728-1", + "last_modified": "2019-07-29T12:10:00", + }, + { + "arch": "x86_64", + "repo": "core", + "name": "dialog", + "version": "1:1.3_20190806-1", + "last_modified": "2019-08-07T04:19:00", + }, + { + "arch": "x86_64", + "repo": "core", + "name": "dialog", + "version": "1:1.3_20190808-1", + "last_modified": "2019-08-09T22:49:00", + }, + { + "arch": "x86_64", + "repo": "core", + "name": "dialog", + "version": "1:1.3_20191110-1", + "last_modified": "2019-11-11T11:15:00", + }, + { + "arch": "x86_64", + "repo": "core", + "name": "dialog", + "version": "1:1.3_20191110-2", + "last_modified": "2019-11-13T17:40:00", + }, + { + "arch": "x86_64", + "repo": "core", + "name": "dialog", + "version": "1:1.3_20191209-1", + "last_modified": "2019-12-10T09:56:00", + }, + { + "arch": "x86_64", + "repo": "core", + "name": "dialog", + "version": "1:1.3_20191210-1", + "last_modified": "2019-12-12T15:55:00", + }, + { + "arch": "x86_64", + "repo": "core", + "name": "dialog", + "version": "1:1.3_20200228-1", + "last_modified": "2020-03-06T02:21:00", + }, + { + "arch": "x86_64", + "repo": "core", + "name": "dialog", + "version": "1:1.3_20200327-1", + "last_modified": "2020-03-29T17:08:00", + }, + { + "arch": "x86_64", + "repo": "core", + "name": "dialog", + "version": "1:1.3_20201126-1", + "last_modified": "2020-11-27T12:19:00", + }, + { + "arch": "x86_64", + "repo": "core", + "name": "dialog", + "version": "1:1.3_20210117-1", + "last_modified": "2021-01-18T18:05:00", + }, + { + "arch": "x86_64", + "repo": "core", + "name": "dialog", + "version": "1:1.3_20210306-1", + "last_modified": "2021-03-07T11:40:00", + }, + { + "arch": "x86_64", + "repo": "core", + "name": "dialog", + "version": "1:1.3_20210319-1", + "last_modified": "2021-03-20T00:12:00", + }, + { + "arch": "x86_64", + "repo": "core", + "name": "dialog", + "version": "1:1.3_20210324-1", + "last_modified": "2021-03-26T17:53:00", + }, + { + "arch": "x86_64", + "repo": "core", + 
"name": "dialog", + "version": "1:1.3_20210509-1", + "last_modified": "2021-05-16T02:04:00", + }, + { + "arch": "x86_64", + "repo": "core", + "name": "dialog", + "version": "1:1.3_20210530-1", + "last_modified": "2021-05-31T14:59:00", + }, + { + "arch": "x86_64", + "repo": "core", + "name": "dialog", + "version": "1:1.3_20210621-1", + "last_modified": "2021-06-23T02:59:00", + }, + { + "arch": "x86_64", + "repo": "core", + "name": "dialog", + "version": "1:1.3_20211107-1", + "last_modified": "2021-11-09T14:06:00", + }, + { + "arch": "x86_64", + "repo": "core", + "name": "dialog", + "version": "1:1.3_20211214-1", + "last_modified": "2021-12-14T09:26:00", + }, + { + "arch": "x86_64", + "repo": "core", + "name": "dialog", + "version": "1:1.3_20220117-1", + "last_modified": "2022-01-19T09:56:00", + }, + { + "arch": "x86_64", + "repo": "core", + "name": "dialog", + "version": "1:1.3_20220414-1", + "last_modified": "2022-04-16T03:59:00", + }, + ], + }, + }, + { + "url": "https://archlinux.org/packages/community/x86_64/gnome-code-assistance", + "visit_type": "arch", + "extra_loader_arguments": { + "artifacts": [ + { + "url": "https://archive.archlinux.org/packages/g/gnome-code-assistance/gnome-code-assistance-1:3.16.1+15+g0fd8b5f-1-x86_64.pkg.tar.xz", # noqa: B950 + "version": "1:3.16.1+15+g0fd8b5f-1", + "length": 2000000, + "filename": "gnome-code-assistance-1:3.16.1+15+g0fd8b5f-1-x86_64.pkg.tar.xz", # noqa: B950 + }, + { + "url": "https://archive.archlinux.org/packages/g/gnome-code-assistance/gnome-code-assistance-1:3.16.1+15+g0fd8b5f-2-x86_64.pkg.tar.zst", # noqa: B950 + "version": "1:3.16.1+15+g0fd8b5f-2", + "length": 2000000, + "filename": "gnome-code-assistance-1:3.16.1+15+g0fd8b5f-2-x86_64.pkg.tar.zst", # noqa: B950 + }, + { + "url": "https://archive.archlinux.org/packages/g/gnome-code-assistance/gnome-code-assistance-1:3.16.1+15+g0fd8b5f-3-x86_64.pkg.tar.zst", # noqa: B950 + "version": "1:3.16.1+15+g0fd8b5f-3", + "length": 2000000, + "filename": 
"gnome-code-assistance-1:3.16.1+15+g0fd8b5f-3-x86_64.pkg.tar.zst", # noqa: B950 + }, + { + "url": "https://archive.archlinux.org/packages/g/gnome-code-assistance/gnome-code-assistance-1:3.16.1+15+g0fd8b5f-4-x86_64.pkg.tar.zst", # noqa: B950 + "version": "1:3.16.1+15+g0fd8b5f-4", + "length": 2000000, + "filename": "gnome-code-assistance-1:3.16.1+15+g0fd8b5f-4-x86_64.pkg.tar.zst", # noqa: B950 + }, + { + "url": "https://archive.archlinux.org/packages/g/gnome-code-assistance/gnome-code-assistance-2:3.16.1+14+gaad6437-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "2:3.16.1+14+gaad6437-1", + "length": 2000000, + "filename": "gnome-code-assistance-2:3.16.1+14+gaad6437-1-x86_64.pkg.tar.zst", # noqa: B950 + }, + { + "url": "https://archive.archlinux.org/packages/g/gnome-code-assistance/gnome-code-assistance-2:3.16.1+14+gaad6437-2-x86_64.pkg.tar.zst", # noqa: B950 + "version": "2:3.16.1+14+gaad6437-2", + "length": 2000000, + "filename": "gnome-code-assistance-2:3.16.1+14+gaad6437-2-x86_64.pkg.tar.zst", # noqa: B950 + }, + { + "url": "https://archive.archlinux.org/packages/g/gnome-code-assistance/gnome-code-assistance-3.16.1+14+gaad6437-1-x86_64.pkg.tar.xz", # noqa: B950 + "version": "3.16.1+14+gaad6437-1", + "length": 2000000, + "filename": "gnome-code-assistance-3.16.1+14+gaad6437-1-x86_64.pkg.tar.xz", # noqa: B950 + }, + { + "url": "https://archive.archlinux.org/packages/g/gnome-code-assistance/gnome-code-assistance-3.16.1+14+gaad6437-2-x86_64.pkg.tar.xz", # noqa: B950 + "version": "3.16.1+14+gaad6437-2", + "length": 2000000, + "filename": "gnome-code-assistance-3.16.1+14+gaad6437-2-x86_64.pkg.tar.xz", # noqa: B950 + }, + { + "url": "https://archive.archlinux.org/packages/g/gnome-code-assistance/gnome-code-assistance-3.16.1+15+gb9ffc4d-1-x86_64.pkg.tar.xz", # noqa: B950 + "version": "3.16.1+15+gb9ffc4d-1", + "length": 2000000, + "filename": "gnome-code-assistance-3.16.1+15+gb9ffc4d-1-x86_64.pkg.tar.xz", # noqa: B950 + }, + { + "url": 
"https://archive.archlinux.org/packages/g/gnome-code-assistance/gnome-code-assistance-3:3.16.1+r14+gaad6437-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "3:3.16.1+r14+gaad6437-1", + "length": 2000000, + "filename": "gnome-code-assistance-3:3.16.1+r14+gaad6437-1-x86_64.pkg.tar.zst", # noqa: B950 + }, + ], + "arch_metadata": [ + { + "arch": "x86_64", + "repo": "community", + "name": "gnome-code-assistance", + "version": "1:3.16.1+15+g0fd8b5f-1", + "last_modified": "2019-11-10T20:55:00", + }, + { + "arch": "x86_64", + "repo": "community", + "name": "gnome-code-assistance", + "version": "1:3.16.1+15+g0fd8b5f-2", + "last_modified": "2020-03-28T15:58:00", + }, + { + "arch": "x86_64", + "repo": "community", + "name": "gnome-code-assistance", + "version": "1:3.16.1+15+g0fd8b5f-3", + "last_modified": "2020-07-05T15:28:00", + }, + { + "arch": "x86_64", + "repo": "community", + "name": "gnome-code-assistance", + "version": "1:3.16.1+15+g0fd8b5f-4", + "last_modified": "2020-11-12T17:28:00", + }, + { + "arch": "x86_64", + "repo": "community", + "name": "gnome-code-assistance", + "version": "2:3.16.1+14+gaad6437-1", + "last_modified": "2021-02-24T16:30:00", + }, + { + "arch": "x86_64", + "repo": "community", + "name": "gnome-code-assistance", + "version": "2:3.16.1+14+gaad6437-2", + "last_modified": "2021-12-02T23:36:00", + }, + { + "arch": "x86_64", + "repo": "community", + "name": "gnome-code-assistance", + "version": "3.16.1+14+gaad6437-1", + "last_modified": "2019-03-15T19:23:00", + }, + { + "arch": "x86_64", + "repo": "community", + "name": "gnome-code-assistance", + "version": "3.16.1+14+gaad6437-2", + "last_modified": "2019-08-24T20:05:00", + }, + { + "arch": "x86_64", + "repo": "community", + "name": "gnome-code-assistance", + "version": "3.16.1+15+gb9ffc4d-1", + "last_modified": "2019-08-25T20:55:00", + }, + { + "arch": "x86_64", + "repo": "community", + "name": "gnome-code-assistance", + "version": "3:3.16.1+r14+gaad6437-1", + "last_modified": 
"2022-05-18T17:23:00", + }, + ], + }, + }, + { + "url": "https://archlinux.org/packages/core/x86_64/gzip", + "visit_type": "arch", + "extra_loader_arguments": { + "artifacts": [ + { + "url": "https://archive.archlinux.org/packages/g/gzip/gzip-1.10-1-x86_64.pkg.tar.xz", # noqa: B950 + "version": "1.10-1", + "length": 78000, + "filename": "gzip-1.10-1-x86_64.pkg.tar.xz", + }, + { + "url": "https://archive.archlinux.org/packages/g/gzip/gzip-1.10-2-x86_64.pkg.tar.xz", # noqa: B950 + "version": "1.10-2", + "length": 78000, + "filename": "gzip-1.10-2-x86_64.pkg.tar.xz", + }, + { + "url": "https://archive.archlinux.org/packages/g/gzip/gzip-1.10-3-x86_64.pkg.tar.xz", # noqa: B950 + "version": "1.10-3", + "length": 78000, + "filename": "gzip-1.10-3-x86_64.pkg.tar.xz", + }, + { + "url": "https://archive.archlinux.org/packages/g/gzip/gzip-1.11-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "1.11-1", + "length": 82000, + "filename": "gzip-1.11-1-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/g/gzip/gzip-1.12-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "1.12-1", + "length": 80000, + "filename": "gzip-1.12-1-x86_64.pkg.tar.zst", + }, + ], + "arch_metadata": [ + { + "arch": "x86_64", + "repo": "core", + "name": "gzip", + "version": "1.10-1", + "last_modified": "2018-12-30T18:38:00", + }, + { + "arch": "x86_64", + "repo": "core", + "name": "gzip", + "version": "1.10-2", + "last_modified": "2019-10-06T16:02:00", + }, + { + "arch": "x86_64", + "repo": "core", + "name": "gzip", + "version": "1.10-3", + "last_modified": "2019-11-13T15:55:00", + }, + { + "arch": "x86_64", + "repo": "core", + "name": "gzip", + "version": "1.11-1", + "last_modified": "2021-09-04T02:02:00", + }, + { + "arch": "x86_64", + "repo": "core", + "name": "gzip", + "version": "1.12-1", + "last_modified": "2022-04-07T17:35:00", + }, + ], + }, + }, + { + "url": "https://archlinux.org/packages/extra/x86_64/libasyncns", + "visit_type": "arch", + "extra_loader_arguments": { + 
"artifacts": [ + { + "url": "https://archive.archlinux.org/packages/l/libasyncns/libasyncns-0.8+3+g68cd5af-2-x86_64.pkg.tar.xz", # noqa: B950 + "version": "0.8+3+g68cd5af-2", + "length": 16000, + "filename": "libasyncns-0.8+3+g68cd5af-2-x86_64.pkg.tar.xz", + }, + { + "url": "https://archive.archlinux.org/packages/l/libasyncns/libasyncns-0.8+3+g68cd5af-3-x86_64.pkg.tar.zst", # noqa: B950 + "version": "0.8+3+g68cd5af-3", + "length": 17000, + "filename": "libasyncns-0.8+3+g68cd5af-3-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/l/libasyncns/libasyncns-1:0.8+r3+g68cd5af-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "1:0.8+r3+g68cd5af-1", + "length": 17000, + "filename": "libasyncns-1:0.8+r3+g68cd5af-1-x86_64.pkg.tar.zst", # noqa: B950 + }, + ], + "arch_metadata": [ + { + "arch": "x86_64", + "repo": "extra", + "name": "libasyncns", + "version": "0.8+3+g68cd5af-2", + "last_modified": "2018-11-09T23:39:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "libasyncns", + "version": "0.8+3+g68cd5af-3", + "last_modified": "2020-05-19T08:28:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "libasyncns", + "version": "1:0.8+r3+g68cd5af-1", + "last_modified": "2022-05-18T17:23:00", + }, + ], + }, + }, + { + "url": "https://archlinux.org/packages/extra/x86_64/mercurial", + "visit_type": "arch", + "extra_loader_arguments": { + "artifacts": [ + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-4.8.2-1-x86_64.pkg.tar.xz", # noqa: B950 + "version": "4.8.2-1", + "length": 4000000, + "filename": "mercurial-4.8.2-1-x86_64.pkg.tar.xz", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-4.9-1-x86_64.pkg.tar.xz", # noqa: B950 + "version": "4.9-1", + "length": 4000000, + "filename": "mercurial-4.9-1-x86_64.pkg.tar.xz", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-4.9.1-1-x86_64.pkg.tar.xz", # noqa: B950 + "version": "4.9.1-1", + "length": 
4000000, + "filename": "mercurial-4.9.1-1-x86_64.pkg.tar.xz", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-5.0-1-x86_64.pkg.tar.xz", # noqa: B950 + "version": "5.0-1", + "length": 4000000, + "filename": "mercurial-5.0-1-x86_64.pkg.tar.xz", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-5.0.1-1-x86_64.pkg.tar.xz", # noqa: B950 + "version": "5.0.1-1", + "length": 4000000, + "filename": "mercurial-5.0.1-1-x86_64.pkg.tar.xz", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-5.0.2-1-x86_64.pkg.tar.xz", # noqa: B950 + "version": "5.0.2-1", + "length": 4000000, + "filename": "mercurial-5.0.2-1-x86_64.pkg.tar.xz", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-5.1-1-x86_64.pkg.tar.xz", # noqa: B950 + "version": "5.1-1", + "length": 4000000, + "filename": "mercurial-5.1-1-x86_64.pkg.tar.xz", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-5.1.2-1-x86_64.pkg.tar.xz", # noqa: B950 + "version": "5.1.2-1", + "length": 4000000, + "filename": "mercurial-5.1.2-1-x86_64.pkg.tar.xz", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-5.2-1-x86_64.pkg.tar.xz", # noqa: B950 + "version": "5.2-1", + "length": 4000000, + "filename": "mercurial-5.2-1-x86_64.pkg.tar.xz", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-5.2.1-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "5.2.1-1", + "length": 4000000, + "filename": "mercurial-5.2.1-1-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-5.2.2-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "5.2.2-1", + "length": 5000000, + "filename": "mercurial-5.2.2-1-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-5.2.2-2-x86_64.pkg.tar.zst", # noqa: B950 + "version": "5.2.2-2", + "length": 4000000, + "filename": 
"mercurial-5.2.2-2-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-5.3-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "5.3-1", + "length": 5000000, + "filename": "mercurial-5.3-1-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-5.3.1-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "5.3.1-1", + "length": 4000000, + "filename": "mercurial-5.3.1-1-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-5.3.2-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "5.3.2-1", + "length": 4000000, + "filename": "mercurial-5.3.2-1-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-5.4-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "5.4-1", + "length": 5000000, + "filename": "mercurial-5.4-1-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-5.4-2-x86_64.pkg.tar.zst", # noqa: B950 + "version": "5.4-2", + "length": 5000000, + "filename": "mercurial-5.4-2-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-5.4.1-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "5.4.1-1", + "length": 5000000, + "filename": "mercurial-5.4.1-1-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-5.4.2-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "5.4.2-1", + "length": 5000000, + "filename": "mercurial-5.4.2-1-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-5.5-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "5.5-1", + "length": 5000000, + "filename": "mercurial-5.5-1-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-5.5.1-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "5.5.1-1", + "length": 5000000, + "filename": 
"mercurial-5.5.1-1-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-5.5.2-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "5.5.2-1", + "length": 5000000, + "filename": "mercurial-5.5.2-1-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-5.6-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "5.6-1", + "length": 5000000, + "filename": "mercurial-5.6-1-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-5.6-2-x86_64.pkg.tar.zst", # noqa: B950 + "version": "5.6-2", + "length": 5000000, + "filename": "mercurial-5.6-2-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-5.6-3-x86_64.pkg.tar.zst", # noqa: B950 + "version": "5.6-3", + "length": 5000000, + "filename": "mercurial-5.6-3-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-5.6.1-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "5.6.1-1", + "length": 5000000, + "filename": "mercurial-5.6.1-1-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-5.7-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "5.7-1", + "length": 5000000, + "filename": "mercurial-5.7-1-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-5.7.1-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "5.7.1-1", + "length": 5000000, + "filename": "mercurial-5.7.1-1-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-5.8-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "5.8-1", + "length": 5000000, + "filename": "mercurial-5.8-1-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-5.8-2-x86_64.pkg.tar.zst", # noqa: B950 + "version": "5.8-2", + "length": 5000000, + "filename": "mercurial-5.8-2-x86_64.pkg.tar.zst", + }, + 
{ + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-5.8.1-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "5.8.1-1", + "length": 5000000, + "filename": "mercurial-5.8.1-1-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-5.9.1-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "5.9.1-1", + "length": 5000000, + "filename": "mercurial-5.9.1-1-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-5.9.1-2-x86_64.pkg.tar.zst", # noqa: B950 + "version": "5.9.1-2", + "length": 5000000, + "filename": "mercurial-5.9.1-2-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-5.9.2-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "5.9.2-1", + "length": 5000000, + "filename": "mercurial-5.9.2-1-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-5.9.3-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "5.9.3-1", + "length": 5000000, + "filename": "mercurial-5.9.3-1-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-6.0-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "6.0-1", + "length": 5000000, + "filename": "mercurial-6.0-1-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-6.0-2-x86_64.pkg.tar.zst", # noqa: B950 + "version": "6.0-2", + "length": 5000000, + "filename": "mercurial-6.0-2-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-6.0-3-x86_64.pkg.tar.zst", # noqa: B950 + "version": "6.0-3", + "length": 5000000, + "filename": "mercurial-6.0-3-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-6.0.1-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "6.0.1-1", + "length": 5000000, + "filename": "mercurial-6.0.1-1-x86_64.pkg.tar.zst", + }, + { + "url": 
"https://archive.archlinux.org/packages/m/mercurial/mercurial-6.0.2-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "6.0.2-1", + "length": 5000000, + "filename": "mercurial-6.0.2-1-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-6.0.3-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "6.0.3-1", + "length": 5000000, + "filename": "mercurial-6.0.3-1-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-6.1-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "6.1-1", + "length": 5000000, + "filename": "mercurial-6.1-1-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-6.1-2-x86_64.pkg.tar.zst", # noqa: B950 + "version": "6.1-2", + "length": 5000000, + "filename": "mercurial-6.1-2-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-6.1.1-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "6.1.1-1", + "length": 5000000, + "filename": "mercurial-6.1.1-1-x86_64.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/m/mercurial/mercurial-6.1.2-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "6.1.2-1", + "length": 5000000, + "filename": "mercurial-6.1.2-1-x86_64.pkg.tar.zst", + }, + ], + "arch_metadata": [ + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "4.8.2-1", + "last_modified": "2019-01-15T20:31:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "4.9-1", + "last_modified": "2019-02-12T06:15:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "4.9.1-1", + "last_modified": "2019-03-30T17:40:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "5.0-1", + "last_modified": "2019-05-10T08:44:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "5.0.1-1", + "last_modified": 
"2019-06-10T18:05:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "5.0.2-1", + "last_modified": "2019-07-10T04:58:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "5.1-1", + "last_modified": "2019-08-17T19:58:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "5.1.2-1", + "last_modified": "2019-10-08T08:38:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "5.2-1", + "last_modified": "2019-11-28T06:41:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "5.2.1-1", + "last_modified": "2020-01-06T12:35:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "5.2.2-1", + "last_modified": "2020-01-15T14:07:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "5.2.2-2", + "last_modified": "2020-01-30T20:05:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "5.3-1", + "last_modified": "2020-02-13T21:40:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "5.3.1-1", + "last_modified": "2020-03-07T23:58:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "5.3.2-1", + "last_modified": "2020-04-05T17:48:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "5.4-1", + "last_modified": "2020-05-10T17:19:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "5.4-2", + "last_modified": "2020-06-04T13:38:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "5.4.1-1", + "last_modified": "2020-06-06T12:28:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "5.4.2-1", + "last_modified": "2020-07-02T21:35:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": 
"mercurial", + "version": "5.5-1", + "last_modified": "2020-08-05T10:39:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "5.5.1-1", + "last_modified": "2020-09-03T19:05:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "5.5.2-1", + "last_modified": "2020-10-07T20:05:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "5.6-1", + "last_modified": "2020-11-03T17:26:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "5.6-2", + "last_modified": "2020-11-09T16:54:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "5.6-3", + "last_modified": "2020-11-11T15:20:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "5.6.1-1", + "last_modified": "2020-12-05T12:29:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "5.7-1", + "last_modified": "2021-02-04T08:41:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "5.7.1-1", + "last_modified": "2021-03-11T07:51:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "5.8-1", + "last_modified": "2021-05-04T17:55:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "5.8-2", + "last_modified": "2021-05-08T22:08:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "5.8.1-1", + "last_modified": "2021-07-13T07:04:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "5.9.1-1", + "last_modified": "2021-09-01T12:48:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "5.9.1-2", + "last_modified": "2021-09-24T17:39:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "5.9.2-1", + "last_modified": "2021-10-07T21:52:00", + }, + { + 
"arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "5.9.3-1", + "last_modified": "2021-10-27T07:20:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "6.0-1", + "last_modified": "2021-11-25T17:10:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "6.0-2", + "last_modified": "2021-11-30T20:53:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "6.0-3", + "last_modified": "2021-12-02T12:06:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "6.0.1-1", + "last_modified": "2022-01-08T10:07:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "6.0.2-1", + "last_modified": "2022-02-03T13:28:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "6.0.3-1", + "last_modified": "2022-02-23T20:50:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "6.1-1", + "last_modified": "2022-03-03T18:06:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "6.1-2", + "last_modified": "2022-03-04T08:37:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "6.1.1-1", + "last_modified": "2022-04-07T18:26:00", + }, + { + "arch": "x86_64", + "repo": "extra", + "name": "mercurial", + "version": "6.1.2-1", + "last_modified": "2022-05-07T11:03:00", + }, + ], + }, + }, + { + "url": "https://archlinux.org/packages/community/any/python-hglib", + "visit_type": "arch", + "extra_loader_arguments": { + "artifacts": [ + { + "url": "https://archive.archlinux.org/packages/p/python-hglib/python-hglib-2.6.1-3-any.pkg.tar.xz", # noqa: B950 + "version": "2.6.1-3", + "length": 40000, + "filename": "python-hglib-2.6.1-3-any.pkg.tar.xz", + }, + { + "url": "https://archive.archlinux.org/packages/p/python-hglib/python-hglib-2.6.2-1-any.pkg.tar.zst", # noqa: B950 + 
"version": "2.6.2-1", + "length": 43000, + "filename": "python-hglib-2.6.2-1-any.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/p/python-hglib/python-hglib-2.6.2-2-any.pkg.tar.zst", # noqa: B950 + "version": "2.6.2-2", + "length": 43000, + "filename": "python-hglib-2.6.2-2-any.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/p/python-hglib/python-hglib-2.6.2-3-any.pkg.tar.zst", # noqa: B950 + "version": "2.6.2-3", + "length": 43000, + "filename": "python-hglib-2.6.2-3-any.pkg.tar.zst", + }, + { + "url": "https://archive.archlinux.org/packages/p/python-hglib/python-hglib-2.6.2-4-any.pkg.tar.zst", # noqa: B950 + "version": "2.6.2-4", + "length": 43000, + "filename": "python-hglib-2.6.2-4-any.pkg.tar.zst", + }, + ], + "arch_metadata": [ + { + "arch": "any", + "repo": "community", + "name": "python-hglib", + "version": "2.6.1-3", + "last_modified": "2019-11-06T14:08:00", + }, + { + "arch": "any", + "repo": "community", + "name": "python-hglib", + "version": "2.6.2-1", + "last_modified": "2020-11-19T22:29:00", + }, + { + "arch": "any", + "repo": "community", + "name": "python-hglib", + "version": "2.6.2-2", + "last_modified": "2020-11-19T22:31:00", + }, + { + "arch": "any", + "repo": "community", + "name": "python-hglib", + "version": "2.6.2-3", + "last_modified": "2020-11-19T22:35:00", + }, + { + "arch": "any", + "repo": "community", + "name": "python-hglib", + "version": "2.6.2-4", + "last_modified": "2021-12-03T00:44:00", + }, + ], + }, + }, + { + "url": "https://archlinuxarm.org/packages/aarch64/gzip", + "visit_type": "arch", + "extra_loader_arguments": { + "artifacts": [ + { + "url": "https://uk.mirror.archlinuxarm.org/aarch64/core/gzip-1.12-1-aarch64.pkg.tar.xz", # noqa: B950 + "length": 79640, + "version": "1.12-1", + "filename": "gzip-1.12-1-aarch64.pkg.tar.xz", + } + ], + "arch_metadata": [ + { + "arch": "aarch64", + "name": "gzip", + "repo": "core", + "version": "1.12-1", + "last_modified": "2022-04-07T21:08:14", 
+ } + ], + }, + }, + { + "url": "https://archlinuxarm.org/packages/aarch64/mercurial", + "visit_type": "arch", + "extra_loader_arguments": { + "artifacts": [ + { + "url": "https://uk.mirror.archlinuxarm.org/aarch64/extra/mercurial-6.1.3-1-aarch64.pkg.tar.xz", # noqa: B950 + "length": 4931228, + "version": "6.1.3-1", + "filename": "mercurial-6.1.3-1-aarch64.pkg.tar.xz", + } + ], + "arch_metadata": [ + { + "arch": "aarch64", + "name": "mercurial", + "repo": "extra", + "version": "6.1.3-1", + "last_modified": "2022-06-02T22:15:18", + } + ], + }, + }, + { + "url": "https://archlinuxarm.org/packages/any/python-hglib", + "visit_type": "arch", + "extra_loader_arguments": { + "artifacts": [ + { + "url": "https://uk.mirror.archlinuxarm.org/any/community/python-hglib-2.6.2-4-any.pkg.tar.xz", # noqa: B950 + "length": 41432, + "version": "2.6.2-4", + "filename": "python-hglib-2.6.2-4-any.pkg.tar.xz", + } + ], + "arch_metadata": [ + { + "arch": "any", + "name": "python-hglib", + "repo": "community", + "version": "2.6.2-4", + "last_modified": "2021-12-14T16:22:20", + } + ], + }, + }, + { + "url": "https://archlinuxarm.org/packages/armv7h/gzip", + "visit_type": "arch", + "extra_loader_arguments": { + "artifacts": [ + { + "url": "https://uk.mirror.archlinuxarm.org/armv7h/core/gzip-1.12-1-armv7h.pkg.tar.xz", # noqa: B950 + "length": 78468, + "version": "1.12-1", + "filename": "gzip-1.12-1-armv7h.pkg.tar.xz", + } + ], + "arch_metadata": [ + { + "arch": "armv7h", + "name": "gzip", + "repo": "core", + "version": "1.12-1", + "last_modified": "2022-04-07T21:08:35", + } + ], + }, + }, + { + "url": "https://archlinuxarm.org/packages/armv7h/mercurial", + "visit_type": "arch", + "extra_loader_arguments": { + "artifacts": [ + { + "url": "https://uk.mirror.archlinuxarm.org/armv7h/extra/mercurial-6.1.3-1-armv7h.pkg.tar.xz", # noqa: B950 + "length": 4897816, + "version": "6.1.3-1", + "filename": "mercurial-6.1.3-1-armv7h.pkg.tar.xz", + } + ], + "arch_metadata": [ + { + "arch": "armv7h", + 
def test_arch_lister(datadir, requests_mock_datadir, swh_scheduler):
    """Run ``ArchLister`` once and compare what it recorded with ``expected_origins``.

    Checks the page and origin counters returned by ``run()``, then checks that
    the origins stored in the scheduler (ordered by url) carry the expected
    visit type, url, ``artifacts`` and ``arch_metadata``.
    """
    arch_lister = ArchLister(scheduler=swh_scheduler)
    stats = arch_lister.run()

    assert stats.pages == 9
    assert stats.origins == 12

    listed = swh_scheduler.get_listed_origins(arch_lister.lister_obj.id).results

    # Rows actually recorded by the lister, ordered by origin url.
    actual_rows = []
    for origin in sorted(listed, key=lambda item: item.url):
        loader_args = origin.extra_loader_arguments
        actual_rows.append(
            (
                origin.visit_type,
                origin.url,
                loader_args["artifacts"],
                loader_args["arch_metadata"],
            )
        )

    # Rows derived from the fixture data, ordered the same way.
    expected_rows = []
    for entry in sorted(expected_origins, key=lambda item: item["url"]):
        loader_args = entry["extra_loader_arguments"]
        expected_rows.append(
            (
                "arch",
                entry["url"],
                loader_args["artifacts"],
                loader_args["arch_metadata"],
            )
        )

    assert actual_rows == expected_rows
swh_scheduler_celery_app.send_task("swh.lister.arch.tasks.ArchListerTask") + assert res + res.wait() + assert res.successful() + assert res.result == stats.dict() + + lister.from_configfile.assert_called_once_with() + lister.run.assert_called_once_with() diff --git a/swh/lister/aur/__init__.py b/swh/lister/aur/__init__.py new file mode 100644 index 0000000..833c72b --- /dev/null +++ b/swh/lister/aur/__init__.py @@ -0,0 +1,135 @@ +# Copyright (C) 2022 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +""" +AUR (Arch User Repository) lister +================================= + +The AUR lister lists origins from `aur.archlinux.org`_, the Arch User Repository. +For each package, there is a git repository; we use the git url as origin and the +snapshot url as the artifact for the loader to download. + +Each git repository consists of a directory (whose name corresponds to the package name), +and at least two files, .SRCINFO and PKGBUILD, which are recipes for building the package. + +Each package has a version, the latest one. There are no archives of previous versions, +so the lister will always list one version per package. + +As of August 2022 `aur.archlinux.org`_ lists 84438 packages. Please note that this amount +is the total of `regular`_ and `split`_ packages. +We will archive `regular` and `split` packages but only their `pkgbase` because that is +the only one that actually has source code. +The number of packages is 78554 after removing the split ones. + +Origins retrieving strategy +--------------------------- + +An rpc api exists but it is recommended to save bandwidth so it's not used. See +`New AUR Metadata Archives`_ for more on this topic. + +To get an index of all AUR existing packages we download a `packages-meta-v1.json.gz`_ +which contains a json file listing all existing packages definitions.
+ +Each entry describes the latest released version of a package. The origin url +for a package is built using `pkgbase` and corresponds to a git repository. + +Note that we list only standard packages (when pkgbase equals pkgname), not the ones +belonging to split packages. + +It takes only a couple of minutes to download the 7 MB index archive and parse its +content. + +Page listing +------------ + +Each page is related to one package. As it's not possible to get all previous +versions, it will always return one line. + +Each page corresponds to a package with a `version`, an `url` for a Git +repository, a `project_url` which represents the upstream project url and +a canonical `snapshot_url` from which a tar.gz archive of the package can +be downloaded. + +The data schema for each line is: + +* **pkgname**: Package name +* **version**: Package version +* **url**: Git repository url for a package +* **snapshot_url**: Package download url +* **project_url**: Upstream project url if any +* **last_modified**: ISO 8601 last update date + +Origins from page +----------------- + +The lister yields one origin per page. +The origin url corresponds to the git url of a package, for example ``https://aur.archlinux.org/{package}.git``. + +Additionally we add some data set to "extra_loader_arguments": + +* **artifacts**: Represents data about the Aur package snapshot to download, + following :ref:`original-artifacts-json specification <extrinsic-metadata-original-artifacts-json>` +* **aur_metadata**: To store all other interesting attributes that do not belong to artifacts.
+ +Origin data example:: + + { + "visit_type": "aur", + "url": "https://aur.archlinux.org/hg-evolve.git", + "extra_loader_arguments": { + "artifacts": [ + { + "filename": "hg-evolve.tar.gz", + "url": "https://aur.archlinux.org/cgit/aur.git/snapshot/hg-evolve.tar.gz", # noqa: B950 + "version": "10.5.1-1", + } + ], + "aur_metadata": [ + { + "version": "10.5.1-1", + "project_url": "https://www.mercurial-scm.org/doc/evolution/", + "last_update": "2022-04-27T20:02:56+00:00", + "pkgname": "hg-evolve", + } + ], + }, + } + +Running tests +------------- + +Activate the virtualenv and run from within the swh-lister directory:: + + pytest -s -vv --log-cli-level=DEBUG swh/lister/aur/tests + +Testing with Docker +------------------- + +Change directory to swh/docker then launch the docker environment:: + + docker-compose up -d + +Then connect to the lister:: + + docker exec -it docker_swh-lister_1 bash + +And run the lister (The output of this listing results in “oneshot” tasks in the scheduler):: + + swh lister run -l aur + +.. _aur.archlinux.org: https://aur.archlinux.org +.. _New AUR Metadata Archives: https://lists.archlinux.org/pipermail/aur-general/2021-November/036659.html +.. _packages-meta-v1.json.gz: https://aur.archlinux.org/packages-meta-v1.json.gz +.. _regular: https://wiki.archlinux.org/title/PKGBUILD#Package_name +..
class AurLister(StatelessLister[AurListerPage]):
    """List Arch User Repository (AUR) origins.

    Given an url (used as a base url, default is 'https://aur.archlinux.org'),
    download 'packages-meta-v1.json.gz', a json index listing all existing
    package definitions.

    Each entry describes the latest released version of a package. Entries
    whose 'Name' differs from their 'PackageBase' (split packages) are skipped.
    For every remaining package two origins are listed:

    - an 'aur' origin whose url is the package page
      ('{base_url}/packages/{pkgname}') and whose 'extra_loader_arguments'
      describe the snapshot archive to download;
    - a 'git' origin whose url is the package git repository
      ('{base_url}/{pkgname}.git').

    An rpc api exists but it is recommended to save bandwidth so it's not used. See
    https://lists.archlinux.org/pipermail/aur-general/2021-November/036659.html
    for more on this.
    """

    LISTER_NAME = "aur"
    VISIT_TYPE = "aur"
    INSTANCE = "aur"

    BASE_URL = "https://aur.archlinux.org"
    DEFAULT_PACKAGES_INDEX_URL = "{base_url}/packages-meta-v1.json.gz"
    PACKAGE_VCS_URL_PATTERN = "{base_url}/{pkgname}.git"
    PACKAGE_SNAPSHOT_URL_PATTERN = "{base_url}/cgit/aur.git/snapshot/{pkgname}.tar.gz"
    ORIGIN_URL_PATTERN = "{base_url}/packages/{pkgname}"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        credentials: Optional[CredentialsType] = None,
    ):
        """Set up the lister for the single AUR instance at ``BASE_URL``.

        Args:
            scheduler: scheduler interface forwarded to the base lister
            credentials: optional credentials forwarded to the base lister
        """
        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            instance=self.INSTANCE,
            url=self.BASE_URL,
        )

    def download_packages_index(self) -> List[Dict[str, Any]]:
        """Download and decode the AUR packages index.

        The index url is built from the ``DEFAULT_PACKAGES_INDEX_URL`` format
        string and ``self.url``.

        Returns:
            the decoded json index, a list holding one dict per package entry.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        url = self.DEFAULT_PACKAGES_INDEX_URL.format(base_url=self.url)
        response = requests.get(url)
        # Fail early with an explicit HTTP error rather than trying to decode
        # the body of an error response as if it were the packages index.
        response.raise_for_status()
        return response.json()

    def get_pages(self) -> Iterator[AurListerPage]:
        """Yield one 'page' per regular (non split) AUR package.

        Each page corresponds to a package with a 'version', an 'url' for its
        package page, a 'git_url' for its Git repository, a 'project_url' which
        represents the upstream project url, a canonical 'snapshot_url' from
        which a tar.gz archive of the package can be downloaded, and a
        'last_modified' date formatted as an ISO 8601 string (UTC).
        """
        packages = self.download_packages_index()

        logger.debug("Found %s AUR packages in aur_index", len(packages))

        for package in packages:
            # Exclude lines where Name differs from PackageBase as they represent
            # split packages and they don't have resolvable snapshot urls
            if package["Name"] != package["PackageBase"]:
                continue

            logger.debug("Processing AUR package %s", package["Name"])
            pkgname = package["PackageBase"]
            version = package["Version"]
            project_url = package["URL"]
            # 'LastModified' is converted from a timestamp to an aware UTC date
            last_modified = datetime.datetime.fromtimestamp(
                float(package["LastModified"]), tz=datetime.timezone.utc
            ).isoformat()
            yield {
                "pkgname": pkgname,
                "version": version,
                "url": self.ORIGIN_URL_PATTERN.format(
                    base_url=self.BASE_URL, pkgname=pkgname
                ),
                "git_url": self.PACKAGE_VCS_URL_PATTERN.format(
                    base_url=self.BASE_URL, pkgname=pkgname
                ),
                "snapshot_url": self.PACKAGE_SNAPSHOT_URL_PATTERN.format(
                    base_url=self.BASE_URL, pkgname=pkgname
                ),
                "project_url": project_url,
                "last_modified": last_modified,
            }

    def get_origins_from_page(self, origin: AurListerPage) -> Iterator[ListedOrigin]:
        """Yield the two ListedOrigin instances derived from one page.

        The first origin has the 'aur' visit type, uses the package page url and
        adds `artifacts` and `aur_metadata` entries to 'extra_loader_arguments'.
        `artifacts` describes the snapshot file to download and `aur_metadata`
        stores some metadata that can be useful for the loader.

        The second origin has the 'git' visit type and uses the vcs (Git) url of
        the package.

        Args:
            origin: a page as yielded by `get_pages`
        """
        assert self.lister_obj.id is not None

        last_update = datetime.datetime.fromisoformat(origin["last_modified"])
        # The artifact filename is the last path component of the snapshot url
        filename = origin["snapshot_url"].rsplit("/", 1)[-1]

        artifacts = [
            {
                "filename": filename,
                "url": origin["snapshot_url"],
                "version": origin["version"],
            }
        ]
        aur_metadata = [
            {
                "version": origin["version"],
                "project_url": origin["project_url"],
                "last_update": origin["last_modified"],
                "pkgname": origin["pkgname"],
            }
        ]

        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            visit_type=self.VISIT_TYPE,
            url=origin["url"],
            last_update=last_update,
            extra_loader_arguments={
                "artifacts": artifacts,
                "aur_metadata": aur_metadata,
            },
        )

        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            visit_type="git",
            url=origin["git_url"],
            last_update=last_update,
        )
packages-meta-v1.json.gz +# files and fake http responses for https_aur.archlinux.org +# For tests purposes only + +set -euo pipefail + +# files and directories +mkdir https_aur.archlinux.org + +mkdir -p tmp_dir/archives/ +cd tmp_dir/archives/ + +echo -e '''[ +{"ID":787300,"Name":"tealdeer-git","PackageBaseID":110159,"PackageBase":"tealdeer-git","Version":"r255.30b7c5f-1","Description":"A fast tldr client in Rust.","URL":"https://github.com/dbrgn/tealdeer","NumVotes":11,"Popularity":0.009683,"OutOfDate":null,"Maintainer":"dbrgn","FirstSubmitted":1460795753,"LastModified":1599251812,"URLPath":"/cgit/aur.git/snapshot/tealdeer-git.tar.gz"}, +{"ID":860370,"Name":"ibus-git","PackageBaseID":163059,"PackageBase":"ibus-git","Version":"1.5.23+12+gef4c5c7e-1","Description":"Next Generation Input Bus for Linux","URL":"https://github.com/ibus/ibus/wiki","NumVotes":1,"Popularity":0.989573,"OutOfDate":null,"Maintainer":"tallero","FirstSubmitted":1612764731,"LastModified":1612764731,"URLPath":"/cgit/aur.git/snapshot/ibus-git.tar.gz"}, +{"ID":1043337,"Name":"libervia-web-hg","PackageBaseID":170485,"PackageBase":"libervia-web-hg","Version":"0.9.0.r1492.3a34d78f2717-1","Description":"Salut à Toi, multi-frontends multi-purposes XMPP client (Web interface)","URL":"http://salut-a-toi.org/","NumVotes":0,"Popularity":0.0,"OutOfDate":null,"Maintainer":"jnanar","FirstSubmitted":1630224837,"LastModified":1645889458,"URLPath":"/cgit/aur.git/snapshot/libervia-web-hg.tar.gz"}, +{"ID":1072642,"Name":"hg-evolve","PackageBaseID":135047,"PackageBase":"hg-evolve","Version":"10.5.1-1","Description":"Flexible evolution of Mercurial history","URL":"https://www.mercurial-scm.org/doc/evolution/","NumVotes":6,"Popularity":0.003887,"OutOfDate":null,"Maintainer":"damien-43","FirstSubmitted":1534190432,"LastModified":1651089776,"URLPath":"/cgit/aur.git/snapshot/hg-evolve.tar.gz"} +]''' > packages-meta-v1.json + +# Gzip archive +gzip -c packages-meta-v1.json > 
../../https_aur.archlinux.org/packages-meta-v1.json.gz + +# Clean up removing tmp_dir +cd ../../ +rm -rf tmp_dir/ diff --git a/swh/lister/aur/tests/data/packages-meta-v1.json.gz b/swh/lister/aur/tests/data/packages-meta-v1.json.gz new file mode 100644 index 0000000..56b8241 Binary files /dev/null and b/swh/lister/aur/tests/data/packages-meta-v1.json.gz differ diff --git a/swh/lister/aur/tests/test_lister.py b/swh/lister/aur/tests/test_lister.py new file mode 100644 index 0000000..7b67d4a --- /dev/null +++ b/swh/lister/aur/tests/test_lister.py @@ -0,0 +1,143 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import gzip +import json +import os + +from swh.lister.aur.lister import AurLister + +expected_origins = [ + { + "visit_type": "aur", + "url": "https://aur.archlinux.org/packages/hg-evolve", + "git_url": "https://aur.archlinux.org/hg-evolve.git", + "extra_loader_arguments": { + "artifacts": [ + { + "filename": "hg-evolve.tar.gz", + "url": "https://aur.archlinux.org/cgit/aur.git/snapshot/hg-evolve.tar.gz", # noqa: B950 + "version": "10.5.1-1", + } + ], + "aur_metadata": [ + { + "version": "10.5.1-1", + "project_url": "https://www.mercurial-scm.org/doc/evolution/", + "last_update": "2022-04-27T20:02:56+00:00", + "pkgname": "hg-evolve", + } + ], + }, + }, + { + "visit_type": "aur", + "url": "https://aur.archlinux.org/packages/ibus-git", + "git_url": "https://aur.archlinux.org/ibus-git.git", + "extra_loader_arguments": { + "artifacts": [ + { + "filename": "ibus-git.tar.gz", + "url": "https://aur.archlinux.org/cgit/aur.git/snapshot/ibus-git.tar.gz", # noqa: B950 + "version": "1.5.23+12+gef4c5c7e-1", + } + ], + "aur_metadata": [ + { + "version": "1.5.23+12+gef4c5c7e-1", + "project_url": "https://github.com/ibus/ibus/wiki", + "last_update": "2021-02-08T06:12:11+00:00", 
+ "pkgname": "ibus-git", + } + ], + }, + }, + { + "visit_type": "aur", + "url": "https://aur.archlinux.org/packages/libervia-web-hg", + "git_url": "https://aur.archlinux.org/libervia-web-hg.git", + "extra_loader_arguments": { + "artifacts": [ + { + "filename": "libervia-web-hg.tar.gz", + "url": "https://aur.archlinux.org/cgit/aur.git/snapshot/libervia-web-hg.tar.gz", # noqa: B950 + "version": "0.9.0.r1492.3a34d78f2717-1", + } + ], + "aur_metadata": [ + { + "version": "0.9.0.r1492.3a34d78f2717-1", + "project_url": "http://salut-a-toi.org/", + "last_update": "2022-02-26T15:30:58+00:00", + "pkgname": "libervia-web-hg", + } + ], + }, + }, + { + "visit_type": "aur", + "url": "https://aur.archlinux.org/packages/tealdeer-git", + "git_url": "https://aur.archlinux.org/tealdeer-git.git", + "extra_loader_arguments": { + "artifacts": [ + { + "filename": "tealdeer-git.tar.gz", + "url": "https://aur.archlinux.org/cgit/aur.git/snapshot/tealdeer-git.tar.gz", # noqa: B950 + "version": "r255.30b7c5f-1", + } + ], + "aur_metadata": [ + { + "version": "r255.30b7c5f-1", + "project_url": "https://github.com/dbrgn/tealdeer", + "last_update": "2020-09-04T20:36:52+00:00", + "pkgname": "tealdeer-git", + } + ], + }, + }, +] + + +def test_aur_lister(datadir, swh_scheduler, requests_mock): + + lister = AurLister(scheduler=swh_scheduler) + + packages_index_filename = "packages-meta-v1.json.gz" + + # simulate requests behavior: gzip and deflate transfer-encodings are automatically decoded + with gzip.open(os.path.join(datadir, packages_index_filename), "rb") as f: + requests_mock.get( + f"{lister.BASE_URL}/{packages_index_filename}", json=json.loads(f.read()) + ) + + res = lister.run() + + assert res.pages == 4 + assert res.origins == 8 + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + + aur_origins = [origin for origin in scheduler_origins if origin.visit_type == "aur"] + git_origins = [origin for origin in scheduler_origins if origin.visit_type == "git"] + + 
assert [ + ( + scheduled.visit_type, + scheduled.url, + scheduled.extra_loader_arguments["artifacts"], + ) + for scheduled in sorted(aur_origins, key=lambda scheduled: scheduled.url) + ] == [ + ( + "aur", + expected["url"], + expected["extra_loader_arguments"]["artifacts"], + ) + for expected in sorted(expected_origins, key=lambda expected: expected["url"]) + ] + + assert {origin.url for origin in git_origins} == { + origin["git_url"] for origin in expected_origins + } diff --git a/swh/lister/aur/tests/test_tasks.py b/swh/lister/aur/tests/test_tasks.py new file mode 100644 index 0000000..44e72d1 --- /dev/null +++ b/swh/lister/aur/tests/test_tasks.py @@ -0,0 +1,31 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from swh.lister.pattern import ListerStats + + +def test_aur_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker): + res = swh_scheduler_celery_app.send_task("swh.lister.aur.tasks.ping") + assert res + res.wait() + assert res.successful() + assert res.result == "OK" + + +def test_aur_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker): + # setup the mocked AurLister + lister = mocker.patch("swh.lister.aur.tasks.AurLister") + lister.from_configfile.return_value = lister + stats = ListerStats(pages=42, origins=42) + lister.run.return_value = stats + + res = swh_scheduler_celery_app.send_task("swh.lister.aur.tasks.AurListerTask") + assert res + res.wait() + assert res.successful() + assert res.result == stats.dict() + + lister.from_configfile.assert_called_once_with() + lister.run.assert_called_once_with() diff --git a/swh/lister/bitbucket/tests/test_lister.py b/swh/lister/bitbucket/tests/test_lister.py index c568dbf..e624e8e 100644 --- a/swh/lister/bitbucket/tests/test_lister.py +++ b/swh/lister/bitbucket/tests/test_lister.py @@ 
-1,184 +1,178 @@ # Copyright (C) 2017-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime import json import os import pytest from swh.lister.bitbucket.lister import BitbucketLister @pytest.fixture def bb_api_repositories_page1(datadir): data_file_path = os.path.join(datadir, "bb_api_repositories_page1.json") with open(data_file_path, "r") as data_file: return json.load(data_file) @pytest.fixture def bb_api_repositories_page2(datadir): data_file_path = os.path.join(datadir, "bb_api_repositories_page2.json") with open(data_file_path, "r") as data_file: return json.load(data_file) def _check_listed_origins(lister_origins, scheduler_origins): """Asserts that the two collections have the same origins from the point of view of the lister""" - - sorted_lister_origins = list(sorted(lister_origins)) - sorted_scheduler_origins = list(sorted(scheduler_origins)) - - assert len(sorted_lister_origins) == len(sorted_scheduler_origins) - - for lo, so in zip(sorted_lister_origins, sorted_scheduler_origins): - assert lo.url == so.url - assert lo.last_update == so.last_update + assert {(lo.url, lo.last_update) for lo in lister_origins} == { + (so.url, so.last_update) for so in scheduler_origins + } def test_bitbucket_incremental_lister( swh_scheduler, requests_mock, mocker, bb_api_repositories_page1, bb_api_repositories_page2, ): """Simple Bitbucket listing with two pages containing 10 origins""" requests_mock.get( BitbucketLister.API_URL, [ {"json": bb_api_repositories_page1}, {"json": bb_api_repositories_page2}, ], ) lister = BitbucketLister(scheduler=swh_scheduler, page_size=10) # First listing stats = lister.run() scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert stats.pages == 2 assert stats.origins == 20 assert len(scheduler_origins) 
== 20 assert lister.updated lister_state = lister.get_state_from_scheduler() last_repo_cdate = lister_state.last_repo_cdate.isoformat() assert hasattr(lister_state, "last_repo_cdate") assert last_repo_cdate == bb_api_repositories_page2["values"][-1]["created_on"] # Second listing, restarting from last state lister.session.get = mocker.spy(lister.session, "get") lister.run() url_params = lister.url_params url_params["after"] = last_repo_cdate lister.session.get.assert_called_once_with(lister.API_URL, params=url_params) all_origins = ( bb_api_repositories_page1["values"] + bb_api_repositories_page2["values"] ) _check_listed_origins(lister.get_origins_from_page(all_origins), scheduler_origins) def test_bitbucket_lister_rate_limit_hit( swh_scheduler, requests_mock, mocker, bb_api_repositories_page1, bb_api_repositories_page2, ): """Simple Bitbucket listing with two pages containing 10 origins""" requests_mock.get( BitbucketLister.API_URL, [ {"json": bb_api_repositories_page1, "status_code": 200}, {"json": None, "status_code": 429}, {"json": None, "status_code": 429}, {"json": bb_api_repositories_page2, "status_code": 200}, ], ) lister = BitbucketLister(scheduler=swh_scheduler, page_size=10) mocker.patch.object(lister.page_request.retry, "sleep") stats = lister.run() assert stats.pages == 2 assert stats.origins == 20 assert len(swh_scheduler.get_listed_origins(lister.lister_obj.id).results) == 20 def test_bitbucket_full_lister( swh_scheduler, requests_mock, mocker, bb_api_repositories_page1, bb_api_repositories_page2, ): """Simple Bitbucket listing with two pages containing 10 origins""" requests_mock.get( BitbucketLister.API_URL, [ {"json": bb_api_repositories_page1}, {"json": bb_api_repositories_page2}, {"json": bb_api_repositories_page1}, {"json": bb_api_repositories_page2}, ], ) credentials = {"bitbucket": {"bitbucket": [{"username": "u", "password": "p"}]}} lister = BitbucketLister( scheduler=swh_scheduler, page_size=10, incremental=True, credentials=credentials ) 
assert lister.session.auth is not None # First do a incremental run to have an initial lister state stats = lister.run() last_lister_state = lister.get_state_from_scheduler() assert stats.origins == 20 # Then do the full run and verify lister state did not change # Modify last listed repo modification date to check it will be not saved # to lister state after its execution last_page2_repo = bb_api_repositories_page2["values"][-1] last_page2_repo["created_on"] = datetime.now().isoformat() last_page2_repo["updated_on"] = datetime.now().isoformat() lister = BitbucketLister(scheduler=swh_scheduler, page_size=10, incremental=False) assert lister.session.auth is None stats = lister.run() assert stats.pages == 2 assert stats.origins == 20 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results # 20 because scheduler upserts based on (id, type, url) assert len(scheduler_origins) == 20 # Modification on created_on SHOULD NOT impact lister state assert lister.get_state_from_scheduler() == last_lister_state # Modification on updated_on SHOULD impact lister state all_origins = ( bb_api_repositories_page1["values"] + bb_api_repositories_page2["values"] ) _check_listed_origins(lister.get_origins_from_page(all_origins), scheduler_origins) diff --git a/swh/lister/bower/__init__.py b/swh/lister/bower/__init__.py new file mode 100644 index 0000000..1f1c017 --- /dev/null +++ b/swh/lister/bower/__init__.py @@ -0,0 +1,76 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +""" +Bower lister +============ + +The `Bower`_ lister list origins from its packages registry `registry.bower.io`_. + +Bower is a tool to manage Javascript packages. + +The registry provide an `http api`_ from where the lister retrieve package names +and url. 
+ +As of August 2022 `registry.bower.io`_ lists 71028 package names. + +Note that even if the project is still maintained (security fixes, no new features), it is +recommended to not use it anymore and prefer Yarn as a replacement since 2018. + +Origins retrieving strategy +--------------------------- + +To get a list of all package names we call the `https://registry.bower.io/packages` endpoint. +There is no other way for discovery (no archive index, no database dump, no dvcs repository). + +Page listing +------------ + +There is only one page that lists all origin urls. + +Origins from page +----------------- + +The lister yields all origin urls from one page. It is a list of package names and urls. +Origin urls correspond to Git repository urls. +Bower is supposed to support Svn repositories too but on +/- 71000 urls I have only found 35 +urls that may not be Git repositories. + +Running tests +------------- + +Activate the virtualenv and run from within swh-lister directory:: + + pytest -s -vv --log-cli-level=DEBUG swh/lister/bower/tests + +Testing with Docker +------------------- + +Change directory to swh/docker then launch the docker environment:: + + docker-compose up -d + +Then connect to the lister:: + + docker exec -it docker_swh-lister_1 bash + +And run the lister (The output of this listing results in “oneshot” tasks in the scheduler):: + + swh lister run -l bower + +.. _Bower: https://bower.io +.. _registry.bower.io: https://registry.bower.io +.. 
_http api: https://registry.bower.io/packages +""" + + +def register(): + from .lister import BowerLister + + return { + "lister": BowerLister, + "task_modules": ["%s.tasks" % __name__], + } diff --git a/swh/lister/bower/lister.py b/swh/lister/bower/lister.py new file mode 100644 index 0000000..f516b2b --- /dev/null +++ b/swh/lister/bower/lister.py @@ -0,0 +1,91 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information +import logging +from typing import Any, Dict, Iterator, List, Optional + +import requests +from tenacity.before_sleep import before_sleep_log + +from swh.lister.utils import throttling_retry +from swh.scheduler.interface import SchedulerInterface +from swh.scheduler.model import ListedOrigin + +from .. import USER_AGENT +from ..pattern import CredentialsType, StatelessLister + +logger = logging.getLogger(__name__) + +# Aliasing the page results returned by `get_pages` method from the lister. 
+BowerListerPage = List[Dict[str, str]] + + +class BowerLister(StatelessLister[BowerListerPage]): + """List Bower (Javascript package manager) origins.""" + + LISTER_NAME = "bower" + VISIT_TYPE = "git" # Bower origins url are Git repositories + INSTANCE = "bower" + + API_URL = "https://registry.bower.io/packages" + + def __init__( + self, + scheduler: SchedulerInterface, + credentials: Optional[CredentialsType] = None, + ): + super().__init__( + scheduler=scheduler, + credentials=credentials, + instance=self.INSTANCE, + url=self.API_URL, + ) + self.session = requests.Session() + self.session.headers.update( + { + "Accept": "application/json", + "User-Agent": USER_AGENT, + } + ) + + @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) + def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response: + + logger.info("Fetching URL %s with params %s", url, params) + + response = self.session.get(url, params=params) + if response.status_code != 200: + logger.warning( + "Unexpected HTTP status code %s on %s: %s", + response.status_code, + response.url, + response.content, + ) + response.raise_for_status() + + return response + + def get_pages(self) -> Iterator[BowerListerPage]: + """Yield an iterator which returns 'page' + + It uses the api endpoint provided by `https://registry.bower.io/packages` + to get a list of package names with an origin url that corresponds to Git + repository. + + There is only one page that list all origins urls. 
+ """ + response = self.page_request(url=self.url, params={}) + yield response.json() + + def get_origins_from_page(self, page: BowerListerPage) -> Iterator[ListedOrigin]: + """Iterate on all pages and yield ListedOrigin instances.""" + assert self.lister_obj.id is not None + + for entry in page: + yield ListedOrigin( + lister_id=self.lister_obj.id, + visit_type=self.VISIT_TYPE, + url=entry["url"], + last_update=None, + ) diff --git a/swh/lister/bower/tasks.py b/swh/lister/bower/tasks.py new file mode 100644 index 0000000..c35c039 --- /dev/null +++ b/swh/lister/bower/tasks.py @@ -0,0 +1,19 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from celery import shared_task + +from swh.lister.bower.lister import BowerLister + + +@shared_task(name=__name__ + ".BowerListerTask") +def list_bower(**lister_args): + """Lister task for Bower (Javascript package manager) registry""" + return BowerLister.from_configfile(**lister_args).run().dict() + + +@shared_task(name=__name__ + ".ping") +def _ping(): + return "OK" diff --git a/swh/lister/bower/tests/__init__.py b/swh/lister/bower/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/swh/lister/bower/tests/data/https_registry.bower.io/packages b/swh/lister/bower/tests/data/https_registry.bower.io/packages new file mode 100644 index 0000000..ce58af5 --- /dev/null +++ b/swh/lister/bower/tests/data/https_registry.bower.io/packages @@ -0,0 +1,14 @@ +[ + { + "name": "font-awesome", + "url": "https://github.com/FortAwesome/Font-Awesome.git" + }, + { + "name": "redux", + "url": "https://github.com/reactjs/redux.git" + }, + { + "name": "vue", + "url": "https://github.com/vuejs/vue.git" + } +] diff --git a/swh/lister/bower/tests/test_lister.py b/swh/lister/bower/tests/test_lister.py new file mode 100644 index 
0000000..6a0be4b --- /dev/null +++ b/swh/lister/bower/tests/test_lister.py @@ -0,0 +1,37 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information +from swh.lister.bower.lister import BowerLister + +expected_origins = [ + {"name": "font-awesome", "url": "https://github.com/FortAwesome/Font-Awesome.git"}, + {"name": "redux", "url": "https://github.com/reactjs/redux.git"}, + {"name": "vue", "url": "https://github.com/vuejs/vue.git"}, +] + + +def test_bower_lister(datadir, requests_mock_datadir, swh_scheduler): + lister = BowerLister(scheduler=swh_scheduler) + res = lister.run() + + assert res.pages == 1 + assert res.origins == 1 + 1 + 1 + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + + assert len(scheduler_origins) == len(expected_origins) + + assert { + ( + scheduled.visit_type, + scheduled.url, + ) + for scheduled in scheduler_origins + } == { + ( + "git", + expected["url"], + ) + for expected in expected_origins + } diff --git a/swh/lister/bower/tests/test_tasks.py b/swh/lister/bower/tests/test_tasks.py new file mode 100644 index 0000000..c09d88f --- /dev/null +++ b/swh/lister/bower/tests/test_tasks.py @@ -0,0 +1,31 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from swh.lister.pattern import ListerStats + + +def test_bower_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker): + res = swh_scheduler_celery_app.send_task("swh.lister.bower.tasks.ping") + assert res + res.wait() + assert res.successful() + assert res.result == "OK" + + +def test_bower_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker): + # 
setup the mocked BowerLister + lister = mocker.patch("swh.lister.bower.tasks.BowerLister") + lister.from_configfile.return_value = lister + stats = ListerStats(pages=42, origins=42) + lister.run.return_value = stats + + res = swh_scheduler_celery_app.send_task("swh.lister.bower.tasks.BowerListerTask") + assert res + res.wait() + assert res.successful() + assert res.result == stats.dict() + + lister.from_configfile.assert_called_once_with() + lister.run.assert_called_once_with() diff --git a/swh/lister/crates/__init__.py b/swh/lister/crates/__init__.py index 2b31785..c4ca72c 100644 --- a/swh/lister/crates/__init__.py +++ b/swh/lister/crates/__init__.py @@ -1,12 +1,142 @@ # Copyright (C) 2022 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +""" +Crates lister +============= + +The Crates lister lists origins from `Crates.io`_, the Rust community’s crate registry. + +Origins are `packages`_ for the `Rust language`_ ecosystem. +Packages follow a `layout specifications`_ to be usable with the `Cargo`_ package manager +and have a `Cargo.toml`_ file manifest which consists of metadata to describe and build +a specific package version. + +As of August 2022 `Crates.io`_ lists 89013 package names for a total of 588215 released +versions. + +Origins retrieving strategy +--------------------------- + +A json http api exists to list packages from crates.io, but we chose a `different strategy`_ +in order to reduce to its bare minimum the amount of http calls and bandwidth. +We clone a git repository which contains a tree of directories whose last child folder +name corresponds to the package name and contains a Cargo.toml file with some json data +to describe all existing versions of the package. +It takes a few seconds to clone the repository and browse it to build a full index of +existing packages and related versions. 
+The lister is incremental, so the first time it clones and browses the repository as +previously described then stores the last seen commit id. +Next time, it retrieves the list of new and changed files since last commit id and +returns new or changed packages with all of their related versions. + +Note that all Git related operations are done with `Dulwich`_, a Python +implementation of the Git file formats and protocols. + +Page listing +------------ + +Each page is related to one package. +Each line of a page corresponds to a different version of this package. + +The data schema for each line is: + +* **name**: Package name +* **version**: Package version +* **crate_file**: Package download url +* **checksum**: Package download checksum +* **yanked**: Whether the package is yanked or not +* **last_update**: Iso8601 last update date computed upon git commit date of the + related Cargo.toml file + +Origins from page +----------------- + +The lister yields one origin per page. +The origin url corresponds to the http api url for a package, for example +"https://crates.io/api/v1/crates/{package}". + +Additionally we add some data set to "extra_loader_arguments": + +* **artifacts**: Represent data about the Crates to download, following + :ref:`original-artifacts-json specification ` +* **crates_metadata**: To store all other interesting attributes that do not belong + to artifacts. For now it mainly indicates when a version is `yanked`_. 
+ +Origin data example:: + + { + "url": "https://crates.io/api/v1/crates/rand", + "artifacts": [ + { + "checksums": { + "sha256": "48a45b46c2a8c38348adb1205b13c3c5eb0174e0c0fec52cc88e9fb1de14c54d", # noqa: B950 + }, + "filename": "rand-0.1.1.crate", + "url": "https://static.crates.io/crates/rand/rand-0.1.1.crate", + "version": "0.1.1", + }, + { + "checksums": { + "sha256": "6e229ed392842fa93c1d76018d197b7e1b74250532bafb37b0e1d121a92d4cf7", # noqa: B950 + }, + "filename": "rand-0.1.2.crate", + "url": "https://static.crates.io/crates/rand/rand-0.1.2.crate", + "version": "0.1.2", + }, + ], + "crates_metadata": [ + { + "version": "0.1.1", + "yanked": False, + }, + { + "version": "0.1.2", + "yanked": False, + }, + ], + } + +Running tests +------------- + +Activate the virtualenv and run from within swh-lister directory: + + pytest -s -vv --log-cli-level=DEBUG swh/lister/crates/tests + +Testing with Docker +------------------- + +Change directory to swh/docker then launch the docker environment: + + docker-compose up -d + +Then connect to the lister: + + docker exec -it docker_swh-lister_1 bash + +And run the lister (The output of this listing results in “oneshot” tasks in the scheduler): + + swh lister run -l crates + +.. _Crates.io: https://crates.io +.. _packages: https://doc.rust-lang.org/book/ch07-01-packages-and-crates.html +.. _Rust language: https://www.rust-lang.org/ +.. _layout specifications: https://doc.rust-lang.org/cargo/guide/project-layout.html +.. _Cargo: https://doc.rust-lang.org/cargo/guide/why-cargo-exists.html#enter-cargo +.. _Cargo.toml: https://doc.rust-lang.org/cargo/reference/manifest.html +.. _different strategy: https://crates.io/data-access +.. _Dulwich: https://www.dulwich.io/ +.. 
_yanked: https://doc.rust-lang.org/cargo/reference/publishing.html#cargo-yank +""" + + def register(): from .lister import CratesLister return { "lister": CratesLister, "task_modules": ["%s.tasks" % __name__], } diff --git a/swh/lister/crates/lister.py b/swh/lister/crates/lister.py index 63604a1..fbe3003 100644 --- a/swh/lister/crates/lister.py +++ b/swh/lister/crates/lister.py @@ -1,162 +1,259 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information - +from dataclasses import asdict, dataclass +import datetime +import io import json import logging from pathlib import Path -import subprocess -from typing import Any, Dict, Iterator, List +import shutil +from typing import Any, Dict, Iterator, List, Optional from urllib.parse import urlparse -import iso8601 +from dulwich import porcelain +from dulwich.patch import write_tree_diff +from dulwich.repo import Repo from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin -from ..pattern import CredentialsType, StatelessLister +from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) # Aliasing the page results returned by `get_pages` method from the lister. CratesListerPage = List[Dict[str, Any]] -class CratesLister(StatelessLister[CratesListerPage]): +@dataclass +class CratesListerState: + """Store lister state for incremental mode operations. + 'last_commit' represents a git commit hash + """ + + last_commit: str = "" + + +class CratesLister(Lister[CratesListerState, CratesListerPage]): """List origins from the "crates.io" forge. It basically fetches https://github.com/rust-lang/crates.io-index.git to a - temp directory and then walks through each file to get the crate's info. 
+ temp directory and then walks through each file to get the crate's info on + the first run. + + In incremental mode, it relies on the same Git repository but instead of reading + each file of the repo, it get the differences through ``git log last_commit..HEAD``. + Resulting output string is parsed to build page entries. """ # Part of the lister API, that identifies this lister LISTER_NAME = "crates" # (Optional) CVS type of the origins listed by this lister, if constant VISIT_TYPE = "crates" INSTANCE = "crates" INDEX_REPOSITORY_URL = "https://github.com/rust-lang/crates.io-index.git" DESTINATION_PATH = Path("/tmp/crates.io-index") CRATE_FILE_URL_PATTERN = ( "https://static.crates.io/crates/{crate}/{crate}-{version}.crate" ) CRATE_API_URL_PATTERN = "https://crates.io/api/v1/crates/{crate}" def __init__( self, scheduler: SchedulerInterface, credentials: CredentialsType = None, ): super().__init__( scheduler=scheduler, credentials=credentials, url=self.INDEX_REPOSITORY_URL, instance=self.INSTANCE, ) + def state_from_dict(self, d: Dict[str, Any]) -> CratesListerState: + if "last_commit" not in d: + d["last_commit"] = "" + return CratesListerState(**d) + + def state_to_dict(self, state: CratesListerState) -> Dict[str, Any]: + return asdict(state) + def get_index_repository(self) -> None: """Get crates.io-index repository up to date running git command.""" - - subprocess.check_call( - [ - "git", - "clone", - self.INDEX_REPOSITORY_URL, - self.DESTINATION_PATH, - ] - ) + if self.DESTINATION_PATH.exists(): + porcelain.pull( + self.DESTINATION_PATH, remote_location=self.INDEX_REPOSITORY_URL + ) + else: + porcelain.clone( + source=self.INDEX_REPOSITORY_URL, target=self.DESTINATION_PATH + ) def get_crates_index(self) -> List[Path]: """Build a sorted list of file paths excluding dotted directories and dotted files. Each file path corresponds to a crate that lists all available versions. 
""" - crates_index = sorted( path for path in self.DESTINATION_PATH.rglob("*") if not any(part.startswith(".") for part in path.parts) and path.is_file() and path != self.DESTINATION_PATH / "config.json" ) return crates_index + def get_last_commit_hash(self, repository_path: Path) -> str: + """Returns the last commit hash of a git repository""" + assert repository_path.exists() + + repo = Repo(str(repository_path)) + head = repo.head() + last_commit = repo[head] + + return last_commit.id.decode() + + def get_last_update_by_file(self, filepath: Path) -> Optional[datetime.datetime]: + """Given a file path within a Git repository, returns its last commit + date as iso8601 + """ + repo = Repo(str(self.DESTINATION_PATH)) + # compute relative path otherwise it fails + relative_path = filepath.relative_to(self.DESTINATION_PATH) + walker = repo.get_walker(paths=[bytes(relative_path)], max_entries=1) + try: + commit = next(iter(walker)).commit + except StopIteration: + logger.error( + "Can not find %s related commits in repository %s", relative_path, repo + ) + return None + else: + last_update = datetime.datetime.fromtimestamp( + commit.author_time, datetime.timezone.utc + ) + return last_update + + def page_entry_dict(self, entry: Dict[str, Any]) -> Dict[str, Any]: + """Transform package version definition dict to a suitable + page entry dict + """ + return dict( + name=entry["name"], + version=entry["vers"], + checksum=entry["cksum"], + yanked=entry["yanked"], + crate_file=self.CRATE_FILE_URL_PATTERN.format( + crate=entry["name"], version=entry["vers"] + ), + ) + def get_pages(self) -> Iterator[CratesListerPage]: """Yield an iterator sorted by name in ascending order of pages. 
Each page is a list of crate versions with: - name: Name of the crate - version: Version - checksum: Checksum - crate_file: Url of the crate file - last_update: Date of the last commit of the corresponding index file """ # Fetch crates.io index repository self.get_index_repository() - # Get a list of all crates files from the index repository - crates_index = self.get_crates_index() - logger.debug("found %s crates in crates_index", len(crates_index)) + if not self.state.last_commit: + # First discovery + # List all crates files from the index repository + crates_index = self.get_crates_index() + else: + # Incremental case + # Get new package version by parsing a range of commits from index repository + repo = Repo(str(self.DESTINATION_PATH)) + head = repo[repo.head()] + last = repo[self.state.last_commit.encode()] + outstream = io.BytesIO() + write_tree_diff(outstream, repo.object_store, last.tree, head.tree) + raw_diff = outstream.getvalue() + crates_index = [] + for line in raw_diff.splitlines(): + if line.startswith(b"+++ b/"): + filepath = line.split(b"+++ b/", 1)[1] + crates_index.append(self.DESTINATION_PATH / filepath.decode()) + crates_index = sorted(crates_index) + + logger.debug("Found %s crates in crates_index", len(crates_index)) + + # Each line of a crate file is a json entry describing released versions + # for a package for crate in crates_index: page = [] - # %cI is for strict iso8601 date formatting - last_update_str = subprocess.check_output( - ["git", "log", "-1", "--pretty=format:%cI", str(crate)], - cwd=self.DESTINATION_PATH, - ) - last_update = iso8601.parse_date(last_update_str.decode().strip()) + last_update = self.get_last_update_by_file(crate) with crate.open("rb") as current_file: for line in current_file: data = json.loads(line) - # pick only the data we need - page.append( - dict( - name=data["name"], - version=data["vers"], - checksum=data["cksum"], - crate_file=self.CRATE_FILE_URL_PATTERN.format( - crate=data["name"], 
version=data["vers"] - ), - last_update=last_update, - ) - ) + entry = self.page_entry_dict(data) + entry["last_update"] = last_update + page.append(entry) yield page def get_origins_from_page(self, page: CratesListerPage) -> Iterator[ListedOrigin]: """Iterate on all crate pages and yield ListedOrigin instances.""" assert self.lister_obj.id is not None url = self.CRATE_API_URL_PATTERN.format(crate=page[0]["name"]) last_update = page[0]["last_update"] artifacts = [] + crates_metadata = [] for version in page: filename = urlparse(version["crate_file"]).path.split("/")[-1] # Build an artifact entry following original-artifacts-json specification # https://docs.softwareheritage.org/devel/swh-storage/extrinsic-metadata-specification.html#original-artifacts-json # noqa: B950 artifact = { "filename": f"{filename}", "checksums": { "sha256": f"{version['checksum']}", }, "url": version["crate_file"], "version": version["version"], } artifacts.append(artifact) + data = {f"{version['version']}": {"yanked": f"{version['yanked']}"}} + crates_metadata.append(data) yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=self.VISIT_TYPE, url=url, last_update=last_update, extra_loader_arguments={ "artifacts": artifacts, + "crates_metadata": crates_metadata, }, ) + + def finalize(self) -> None: + last = self.get_last_commit_hash(repository_path=self.DESTINATION_PATH) + if self.state.last_commit == last: + self.updated = False + else: + self.state.last_commit = last + self.updated = True + + logger.debug("Listing crates origin completed with last commit id %s", last) + + # Cleanup by removing the repository directory + if self.DESTINATION_PATH.exists(): + shutil.rmtree(self.DESTINATION_PATH) + logger.debug( + "Successfully removed %s directory", str(self.DESTINATION_PATH) + ) diff --git a/swh/lister/crates/tests/data/fake-crates-repository.tar.gz b/swh/lister/crates/tests/data/fake-crates-repository.tar.gz index 8b384b4..498b105 100644 Binary files 
a/swh/lister/crates/tests/data/fake-crates-repository.tar.gz and b/swh/lister/crates/tests/data/fake-crates-repository.tar.gz differ diff --git a/swh/lister/crates/tests/data/fake_crates_repository_init.sh b/swh/lister/crates/tests/data/fake_crates_repository_init.sh index 60680d6..6368601 100755 --- a/swh/lister/crates/tests/data/fake_crates_repository_init.sh +++ b/swh/lister/crates/tests/data/fake_crates_repository_init.sh @@ -1,37 +1,64 @@ #!/usr/bin/env bash # Script to generate fake-crates-repository.tar.gz # Creates a git repository like https://github.com/rust-lang/crates.io-index # for tests purposes set -euo pipefail # files and directories mkdir -p tmp_dir/crates.io-index/ cd tmp_dir/crates.io-index/ mkdir -p .dot-dir touch .dot-dir/empty mkdir -p ra/nd mkdir -p re/ge touch .dot-file touch config.json +# Init as a git repository +git init +git add . +git commit -m "Init fake crates.io-index repository for tests purpose" + echo '{"name":"rand","vers":"0.1.1","deps":[],"cksum":"48a45b46c2a8c38348adb1205b13c3c5eb0174e0c0fec52cc88e9fb1de14c54d","features":{},"yanked":false}' > ra/nd/rand +git add . +git commit -m " Updating crate rand#0.1.1" + echo '{"name":"rand","vers":"0.1.2","deps":[{"name":"libc","req":"^0.1.1","features":[""],"optional":false,"default_features":true,"target":null,"kind":"normal"},{"name":"log","req":"^0.2.1","features":[""],"optional":false,"default_features":true,"target":null,"kind":"normal"}],"cksum":"6e229ed392842fa93c1d76018d197b7e1b74250532bafb37b0e1d121a92d4cf7","features":{},"yanked":false}' >> ra/nd/rand +git add . +git commit -m " Updating crate rand#0.1.2" echo '{"name":"regex","vers":"0.1.0","deps":[],"cksum":"f0ff1ca641d3c9a2c30464dac30183a8b91cdcc959d616961be020cdea6255c5","features":{},"yanked":false}' > re/ge/regex +git add . 
+git commit -m " Updating crate regex#0.1.0" + echo '{"name":"regex","vers":"0.1.1","deps":[{"name":"regex_macros","req":"^0.1.0","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"a07bef996bd38a73c21a8e345d2c16848b41aa7ec949e2fedffe9edf74cdfb36","features":{},"yanked":false}' >> re/ge/regex +git add . +git commit -m " Updating crate regex#0.1.1" + echo '{"name":"regex","vers":"0.1.2","deps":[{"name":"regex_macros","req":"^0.1.0","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"343bd0171ee23346506db6f4c64525de6d72f0e8cc533f83aea97f3e7488cbf9","features":{},"yanked":false}' >> re/ge/regex +git add . +git commit -m " Updating crate regex#0.1.2" + echo '{"name":"regex","vers":"0.1.3","deps":[{"name":"regex_macros","req":"^0.1.0","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"defb220c4054ca1b95fe8b0c9a6e782dda684c1bdf8694df291733ae8a3748e3","features":{},"yanked":false}' >> re/ge/regex +git add . +git commit -m " Updating crate regex#0.1.3" echo '{"name":"regex-syntax","vers":"0.1.0","deps":[{"name":"rand","req":"^0.3","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"},{"name":"quickcheck","req":"^0.2","features":[""],"optional":false,"default_features":true,"target":null,"kind":"dev"}],"cksum":"398952a2f6cd1d22bc1774fd663808e32cf36add0280dee5cdd84a8fff2db944","features":{},"yanked":false}' > re/ge/regex-syntax - -# Init as a git repository -git init git add . 
-git commit -m "Init fake crates.io-index repository for tests purpose" +git commit -m " Updating crate regex-syntax#0.1.0" # Save some space rm .git/hooks/*.sample + +# Compress git directory as a tar.gz archive +cd ../ +tar -cvzf fake-crates-repository.tar.gz crates.io-index +mv fake-crates-repository.tar.gz ../ + +# Clean up tmp_dir +cd ../ +rm -rf tmp_dir diff --git a/swh/lister/crates/tests/test_lister.py b/swh/lister/crates/tests/test_lister.py index bbb1c7d..8b26379 100644 --- a/swh/lister/crates/tests/test_lister.py +++ b/swh/lister/crates/tests/test_lister.py @@ -1,114 +1,238 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from pathlib import Path -from swh.lister.crates.lister import CratesLister +from dulwich.repo import Repo + +from swh.lister.crates.lister import CratesLister, CratesListerState from swh.lister.crates.tests import prepare_repository_from_archive expected_origins = [ { "url": "https://crates.io/api/v1/crates/rand", "artifacts": [ { "checksums": { "sha256": "48a45b46c2a8c38348adb1205b13c3c5eb0174e0c0fec52cc88e9fb1de14c54d", # noqa: B950 }, "filename": "rand-0.1.1.crate", "url": "https://static.crates.io/crates/rand/rand-0.1.1.crate", "version": "0.1.1", }, { "checksums": { "sha256": "6e229ed392842fa93c1d76018d197b7e1b74250532bafb37b0e1d121a92d4cf7", # noqa: B950 }, "filename": "rand-0.1.2.crate", "url": "https://static.crates.io/crates/rand/rand-0.1.2.crate", "version": "0.1.2", }, ], + "metadata": [ + { + "version": "0.1.1", + "yanked": False, + }, + { + "version": "0.1.2", + "yanked": False, + }, + ], }, { "url": "https://crates.io/api/v1/crates/regex", "artifacts": [ { "checksums": { "sha256": "f0ff1ca641d3c9a2c30464dac30183a8b91cdcc959d616961be020cdea6255c5", # noqa: B950 }, "filename": "regex-0.1.0.crate", "url": 
"https://static.crates.io/crates/regex/regex-0.1.0.crate", "version": "0.1.0", }, { "checksums": { "sha256": "a07bef996bd38a73c21a8e345d2c16848b41aa7ec949e2fedffe9edf74cdfb36", # noqa: B950 }, "filename": "regex-0.1.1.crate", "url": "https://static.crates.io/crates/regex/regex-0.1.1.crate", "version": "0.1.1", }, { "checksums": { "sha256": "343bd0171ee23346506db6f4c64525de6d72f0e8cc533f83aea97f3e7488cbf9", # noqa: B950 }, "filename": "regex-0.1.2.crate", "url": "https://static.crates.io/crates/regex/regex-0.1.2.crate", "version": "0.1.2", }, { "checksums": { "sha256": "defb220c4054ca1b95fe8b0c9a6e782dda684c1bdf8694df291733ae8a3748e3", # noqa: B950 }, "filename": "regex-0.1.3.crate", "url": "https://static.crates.io/crates/regex/regex-0.1.3.crate", "version": "0.1.3", }, ], + "metadata": [ + { + "version": "0.1.0", + "yanked": False, + }, + { + "version": "0.1.1", + "yanked": False, + }, + { + "version": "0.1.2", + "yanked": False, + }, + { + "version": "0.1.3", + "yanked": False, + }, + ], }, { "url": "https://crates.io/api/v1/crates/regex-syntax", "artifacts": [ { "checksums": { "sha256": "398952a2f6cd1d22bc1774fd663808e32cf36add0280dee5cdd84a8fff2db944", # noqa: B950 }, "filename": "regex-syntax-0.1.0.crate", "url": "https://static.crates.io/crates/regex-syntax/regex-syntax-0.1.0.crate", "version": "0.1.0", }, ], + "metadata": [ + { + "version": "0.1.0", + "yanked": False, + }, + ], }, ] +expected_origins_incremental = [expected_origins[1], expected_origins[2]] + + def test_crates_lister(datadir, tmp_path, swh_scheduler): archive_path = Path(datadir, "fake-crates-repository.tar.gz") repo_url = prepare_repository_from_archive( archive_path, "crates.io-index", tmp_path ) lister = CratesLister(scheduler=swh_scheduler) lister.INDEX_REPOSITORY_URL = repo_url lister.DESTINATION_PATH = tmp_path.parent / "crates.io-index-tests" res = lister.run() assert res.pages == 3 assert res.origins == 3 - expected_origins_sorted = sorted(expected_origins, key=lambda x: x.get("url")) 
- scheduler_origins_sorted = sorted( - swh_scheduler.get_listed_origins(lister.lister_obj.id).results, - key=lambda x: x.url, + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + assert [ + ( + scheduled.visit_type, + scheduled.url, + scheduled.extra_loader_arguments["artifacts"], + ) + for scheduled in sorted(scheduler_origins, key=lambda scheduled: scheduled.url) + ] == [ + ( + "crates", + expected["url"], + expected["artifacts"], + ) + for expected in sorted(expected_origins, key=lambda expected: expected["url"]) + ] + + +def test_crates_lister_incremental(datadir, tmp_path, swh_scheduler): + archive_path = Path(datadir, "fake-crates-repository.tar.gz") + repo_url = prepare_repository_from_archive( + archive_path, "crates.io-index", tmp_path ) - for scheduled, expected in zip(scheduler_origins_sorted, expected_origins_sorted): - assert scheduled.visit_type == "crates" - assert scheduled.url == expected.get("url") - assert scheduled.extra_loader_arguments.get("artifacts") == expected.get( - "artifacts" + lister = CratesLister(scheduler=swh_scheduler) + lister.INDEX_REPOSITORY_URL = repo_url + lister.DESTINATION_PATH = tmp_path.parent / "crates.io-index-tests" + # The lister has not run yet, get the index repository + lister.get_index_repository() + # Set a CratesListerState with a last commit value to force incremental case + repo = Repo(lister.DESTINATION_PATH) + # Let's set this last commit to the third one from head + step = list(repo.get_walker(max_entries=3))[-1] + last_commit_state = CratesListerState(last_commit=step.commit.id.decode()) + lister.state = last_commit_state + + res = lister.run() + + assert res.pages == 2 + assert res.origins == 2 + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + assert [ + ( + scheduled.visit_type, + scheduled.url, + scheduled.extra_loader_arguments["artifacts"], + ) + for scheduled in sorted(scheduler_origins, key=lambda scheduled: scheduled.url) + ] == [ + 
( + "crates", + expected["url"], + expected["artifacts"], + ) + for expected in sorted( + expected_origins_incremental, key=lambda expected: expected["url"] ) + ] + + +def test_crates_lister_incremental_nothing_new(datadir, tmp_path, swh_scheduler): + """Ensure incremental mode runs fine when the repository last commit is the same + as lister.state.last_commit""" + archive_path = Path(datadir, "fake-crates-repository.tar.gz") + repo_url = prepare_repository_from_archive( + archive_path, "crates.io-index", tmp_path + ) + + lister = CratesLister(scheduler=swh_scheduler) + lister.INDEX_REPOSITORY_URL = repo_url + lister.DESTINATION_PATH = tmp_path.parent / "crates.io-index-tests" + lister.get_index_repository() + + repo = Repo(lister.DESTINATION_PATH) + + # Set a CratesListerState with a last commit value to force incremental case + last_commit_state = CratesListerState(last_commit=repo.head().decode()) + lister.state = last_commit_state + + res = lister.run() + + assert res.pages == 0 + assert res.origins == 0 + + +def test_crates_lister_repository_cleanup(datadir, tmp_path, swh_scheduler): + archive_path = Path(datadir, "fake-crates-repository.tar.gz") + repo_url = prepare_repository_from_archive( + archive_path, "crates.io-index", tmp_path + ) + + lister = CratesLister(scheduler=swh_scheduler) + lister.INDEX_REPOSITORY_URL = repo_url + lister.DESTINATION_PATH = tmp_path.parent / "crates.io-index-tests" - assert len(scheduler_origins_sorted) == len(expected_origins_sorted) + lister.run() + # Repository directory should not exist after the lister runs + assert not lister.DESTINATION_PATH.exists() diff --git a/swh/lister/gitea/lister.py b/swh/lister/gitea/lister.py index 25bea4e..51084b6 100644 --- a/swh/lister/gitea/lister.py +++ b/swh/lister/gitea/lister.py @@ -1,142 +1,27 @@ # Copyright (C) 2018-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any 
later version # See top-level LICENSE file for more information import logging -import random -from typing import Any, Dict, Iterator, List, Optional -from urllib.parse import urljoin -import iso8601 -import requests -from tenacity.before_sleep import before_sleep_log - -from swh.lister.utils import throttling_retry -from swh.scheduler.interface import SchedulerInterface -from swh.scheduler.model import ListedOrigin - -from .. import USER_AGENT -from ..pattern import CredentialsType, StatelessLister +from ..gogs.lister import GogsLister logger = logging.getLogger(__name__) -RepoListPage = List[Dict[str, Any]] - -class GiteaLister(StatelessLister[RepoListPage]): +class GiteaLister(GogsLister): """List origins from Gitea. Gitea API documentation: https://try.gitea.io/api/swagger The API does pagination and provides navigation URLs through the 'Link' header. The default value for page size is the maximum value observed on the instances accessible at https://try.gitea.io/api/v1/ and https://codeberg.org/api/v1/.""" LISTER_NAME = "gitea" - REPO_LIST_PATH = "repos/search" - - def __init__( - self, - scheduler: SchedulerInterface, - url: str, - instance: Optional[str] = None, - api_token: Optional[str] = None, - page_size: int = 50, - credentials: CredentialsType = None, - ): - super().__init__( - scheduler=scheduler, - credentials=credentials, - url=url, - instance=instance, - ) - - self.query_params = { - "sort": "id", - "order": "asc", - "limit": page_size, - "page": 1, - } - - self.session = requests.Session() - self.session.headers.update( - { - "Accept": "application/json", - "User-Agent": USER_AGENT, - } + def on_anonymous_mode(self): + logger.warning( + "No authentication token set in configuration, using anonymous mode" ) - - if api_token is None: - if len(self.credentials) > 0: - cred = random.choice(self.credentials) - username = cred.get("username") - api_token = cred["password"] - logger.warning( - "Using authentication token from user %s", username or "???" 
- ) - else: - logger.warning( - "No authentication token set in configuration, using anonymous mode" - ) - - if api_token: - self.session.headers["Authorization"] = "Token %s" % api_token - - @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) - def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response: - - logger.info("Fetching URL %s with params %s", url, params) - - response = self.session.get(url, params=params) - - if response.status_code != 200: - logger.warning( - "Unexpected HTTP status code %s on %s: %s", - response.status_code, - response.url, - response.content, - ) - response.raise_for_status() - - return response - - @classmethod - def results_simplified(cls, body: Dict[str, RepoListPage]) -> RepoListPage: - fields_filter = ["id", "clone_url", "updated_at"] - return [{k: r[k] for k in fields_filter} for r in body["data"]] - - def get_pages(self) -> Iterator[RepoListPage]: - # base with trailing slash, path without leading slash for urljoin - url: str = urljoin(self.url, self.REPO_LIST_PATH) - - response = self.page_request(url, self.query_params) - - while True: - page_results = self.results_simplified(response.json()) - - yield page_results - - assert len(response.links) > 0, "API changed: no Link header found" - if "next" in response.links: - url = response.links["next"]["url"] - else: - # last page - break - - response = self.page_request(url, {}) - - def get_origins_from_page(self, page: RepoListPage) -> Iterator[ListedOrigin]: - """Convert a page of Gitea repositories into a list of ListedOrigins.""" - assert self.lister_obj.id is not None - - for repo in page: - last_update = iso8601.parse_date(repo["updated_at"]) - - yield ListedOrigin( - lister_id=self.lister_obj.id, - url=repo["clone_url"], - visit_type="git", - last_update=last_update, - ) diff --git a/swh/lister/gitea/tests/test_lister.py b/swh/lister/gitea/tests/test_lister.py index 860124e..90ec624 100644 --- a/swh/lister/gitea/tests/test_lister.py 
+++ b/swh/lister/gitea/tests/test_lister.py @@ -1,151 +1,153 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json from pathlib import Path from typing import Dict, List, Tuple import pytest import requests -from swh.lister.gitea.lister import GiteaLister, RepoListPage +from swh.lister.gitea.lister import GiteaLister +from swh.lister.gogs.lister import GogsListerPage from swh.scheduler.model import ListedOrigin TRYGITEA_URL = "https://try.gitea.io/api/v1/" -TRYGITEA_P1_URL = TRYGITEA_URL + "repos/search?sort=id&order=asc&limit=3&page=1" -TRYGITEA_P2_URL = TRYGITEA_URL + "repos/search?sort=id&order=asc&limit=3&page=2" +TRYGITEA_P1_URL = TRYGITEA_URL + "repos/search?limit=3&page=1" +TRYGITEA_P2_URL = TRYGITEA_URL + "repos/search?limit=3&page=2" @pytest.fixture -def trygitea_p1(datadir) -> Tuple[str, Dict[str, str], RepoListPage, List[str]]: +def trygitea_p1(datadir) -> Tuple[str, Dict[str, str], GogsListerPage, List[str]]: text = Path(datadir, "https_try.gitea.io", "repos_page1").read_text() headers = { "Link": '<{p2}>; rel="next",<{p2}>; rel="last"'.format(p2=TRYGITEA_P2_URL) } - page_result = GiteaLister.results_simplified(json.loads(text)) - origin_urls = [r["clone_url"] for r in page_result] + page_data = json.loads(text) + page_result = GogsListerPage( + repos=GiteaLister.extract_repos(page_data), next_link=TRYGITEA_P2_URL + ) + origin_urls = [r["clone_url"] for r in page_data["data"]] return text, headers, page_result, origin_urls @pytest.fixture -def trygitea_p2(datadir) -> Tuple[str, Dict[str, str], RepoListPage, List[str]]: +def trygitea_p2(datadir) -> Tuple[str, Dict[str, str], GogsListerPage, List[str]]: text = Path(datadir, "https_try.gitea.io", "repos_page2").read_text() headers = { "Link": '<{p1}>; rel="prev",<{p1}>; 
rel="first"'.format(p1=TRYGITEA_P1_URL) } - page_result = GiteaLister.results_simplified(json.loads(text)) - origin_urls = [r["clone_url"] for r in page_result] + page_data = json.loads(text) + page_result = GogsListerPage( + repos=GiteaLister.extract_repos(page_data), next_link=None + ) + origin_urls = [r["clone_url"] for r in page_data["data"]] return text, headers, page_result, origin_urls def check_listed_origins(lister_urls: List[str], scheduler_origins: List[ListedOrigin]): """Asserts that the two collections have the same origin URLs. Does not test last_update.""" - - sorted_lister_urls = list(sorted(lister_urls)) - sorted_scheduler_origins = list(sorted(scheduler_origins)) - - assert len(sorted_lister_urls) == len(sorted_scheduler_origins) - - for l_url, s_origin in zip(sorted_lister_urls, sorted_scheduler_origins): - assert l_url == s_origin.url + assert set(lister_urls) == {origin.url for origin in scheduler_origins} def test_gitea_full_listing( swh_scheduler, requests_mock, mocker, trygitea_p1, trygitea_p2 ): """Covers full listing of multiple pages, rate-limit, page size (required for test), checking page results and listed origins, statelessness.""" kwargs = dict(url=TRYGITEA_URL, instance="try_gitea", page_size=3) lister = GiteaLister(scheduler=swh_scheduler, **kwargs) lister.get_origins_from_page = mocker.spy(lister, "get_origins_from_page") p1_text, p1_headers, p1_result, p1_origin_urls = trygitea_p1 p2_text, p2_headers, p2_result, p2_origin_urls = trygitea_p2 requests_mock.get(TRYGITEA_P1_URL, text=p1_text, headers=p1_headers) requests_mock.get( TRYGITEA_P2_URL, [ {"status_code": requests.codes.too_many_requests}, {"text": p2_text, "headers": p2_headers}, ], ) # end test setup stats = lister.run() # start test checks assert stats.pages == 2 assert stats.origins == 6 calls = [mocker.call(p1_result), mocker.call(p2_result)] lister.get_origins_from_page.assert_has_calls(calls) scheduler_origins = 
swh_scheduler.get_listed_origins(lister.lister_obj.id).results check_listed_origins(p1_origin_urls + p2_origin_urls, scheduler_origins) - assert lister.get_state_from_scheduler() is None + lister_state = lister.get_state_from_scheduler() + assert lister_state.last_seen_next_link == TRYGITEA_P2_URL + assert lister_state.last_seen_repo_id == p2_result.repos[-1]["id"] def test_gitea_auth_instance(swh_scheduler, requests_mock, trygitea_p1): """Covers token authentication, token from credentials, instance inference from URL.""" api_token = "teapot" instance = "try.gitea.io" creds = {"gitea": {instance: [{"username": "u", "password": api_token}]}} kwargs1 = dict(url=TRYGITEA_URL, api_token=api_token) lister = GiteaLister(scheduler=swh_scheduler, **kwargs1) # test API token assert "Authorization" in lister.session.headers assert lister.session.headers["Authorization"].lower() == "token %s" % api_token kwargs2 = dict(url=TRYGITEA_URL, credentials=creds) lister = GiteaLister(scheduler=swh_scheduler, **kwargs2) # test API token from credentials assert "Authorization" in lister.session.headers assert lister.session.headers["Authorization"].lower() == "token %s" % api_token # test instance inference from URL assert lister.instance assert "gitea" in lister.instance # infer something related to that # setup requests mocking p1_text, p1_headers, _, _ = trygitea_p1 p1_headers["Link"] = p1_headers["Link"].replace("next", "") # only 1 page base_url = TRYGITEA_URL + lister.REPO_LIST_PATH requests_mock.get(base_url, text=p1_text, headers=p1_headers) # now check the lister runs without error stats = lister.run() assert stats.pages == 1 @pytest.mark.parametrize("http_code", [400, 500, 502]) def test_gitea_list_http_error(swh_scheduler, requests_mock, http_code): """Test handling of some HTTP errors commonly encountered""" lister = GiteaLister(scheduler=swh_scheduler, url=TRYGITEA_URL, page_size=3) base_url = TRYGITEA_URL + lister.REPO_LIST_PATH requests_mock.get(base_url, 
status_code=http_code) with pytest.raises(requests.HTTPError): lister.run() scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert len(scheduler_origins) == 0 diff --git a/swh/lister/gogs/__init__.py b/swh/lister/gogs/__init__.py new file mode 100644 index 0000000..d5ae381 --- /dev/null +++ b/swh/lister/gogs/__init__.py @@ -0,0 +1,13 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def register(): + from .lister import GogsLister + + return { + "lister": GogsLister, + "task_modules": [f"{__name__}.tasks"], + } diff --git a/swh/lister/gogs/lister.py b/swh/lister/gogs/lister.py new file mode 100644 index 0000000..8c5a72d --- /dev/null +++ b/swh/lister/gogs/lister.py @@ -0,0 +1,207 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from dataclasses import asdict, dataclass +import logging +import random +from typing import Any, Dict, Iterator, List, Optional +from urllib.parse import parse_qs, urljoin, urlparse + +import iso8601 +import requests +from tenacity.before_sleep import before_sleep_log + +from swh.lister.utils import throttling_retry +from swh.scheduler.interface import SchedulerInterface +from swh.scheduler.model import ListedOrigin + +from .. 
import USER_AGENT +from ..pattern import CredentialsType, Lister + +logger = logging.getLogger(__name__) + +Repo = Dict[str, Any] + + +@dataclass +class GogsListerPage: + repos: Optional[List[Repo]] = None + next_link: Optional[str] = None + + +@dataclass +class GogsListerState: + last_seen_next_link: Optional[str] = None + """Last link header (could be already visited) during an incremental pass.""" + last_seen_repo_id: Optional[int] = None + """Last repo id seen during an incremental pass.""" + + +def _parse_page_id(url: Optional[str]) -> int: + """Parse the page id from a Gogs page url.""" + if url is None: + return 0 + + return int(parse_qs(urlparse(url).query)["page"][0]) + + +class GogsLister(Lister[GogsListerState, GogsListerPage]): + + """List origins from the Gogs + + Gogs API documentation: https://github.com/gogs/docs-api + + The API is protected behind authentication so credentials/API tokens + are mandatory. It supports pagination and provides next page URL + through the 'next' value of the 'Link' header. The default value for + page size ('limit') is 10 but the maximum allowed value is 50. 
+ """ + + LISTER_NAME = "gogs" + + VISIT_TYPE = "git" + + REPO_LIST_PATH = "repos/search" + + def __init__( + self, + scheduler: SchedulerInterface, + url: str, + instance: Optional[str] = None, + api_token: Optional[str] = None, + page_size: int = 50, + credentials: CredentialsType = None, + ): + super().__init__( + scheduler=scheduler, + credentials=credentials, + url=url, + instance=instance, + ) + + self.query_params = { + "limit": page_size, + } + + self.api_token = api_token + if self.api_token is None: + + if len(self.credentials) > 0: + cred = random.choice(self.credentials) + username = cred.get("username") + self.api_token = cred["password"] + logger.info("Using authentication credentials from user %s", username) + else: + # Raises an error on Gogs, or a warning on Gitea + self.on_anonymous_mode() + + self.max_page_limit = 2 + + self.session = requests.Session() + self.session.headers.update( + { + "Accept": "application/json", + "User-Agent": USER_AGENT, + } + ) + + if self.api_token: + self.session.headers["Authorization"] = f"token {self.api_token}" + + def on_anonymous_mode(self): + raise ValueError("No credentials or API token provided") + + def state_from_dict(self, d: Dict[str, Any]) -> GogsListerState: + return GogsListerState(**d) + + def state_to_dict(self, state: GogsListerState) -> Dict[str, Any]: + return asdict(state) + + @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) + def page_request(self, url, params) -> requests.Response: + + logger.debug("Fetching URL %s with params %s", url, params) + + response = self.session.get(url, params=params) + + if response.status_code != 200: + logger.warning( + "Unexpected HTTP status code %s on %s: %s", + response.status_code, + response.url, + response.content, + ) + response.raise_for_status() + + return response + + @classmethod + def extract_repos(cls, body: Dict[str, Any]) -> List[Repo]: + fields_filter = ["id", "clone_url", "updated_at"] + return [{k: r[k] for k in 
fields_filter} for r in body["data"]] + + def get_pages(self) -> Iterator[GogsListerPage]: + page_id = 1 + if self.state.last_seen_next_link is not None: + page_id = _parse_page_id(self.state.last_seen_next_link) + + # base with trailing slash, path without leading slash for urljoin + next_link: Optional[str] = urljoin(self.url, self.REPO_LIST_PATH) + response = self.page_request(next_link, {**self.query_params, "page": page_id}) + + while next_link is not None: + repos = self.extract_repos(response.json()) + + assert len(response.links) > 0, "API changed: no Link header found" + if "next" in response.links: + next_link = response.links["next"]["url"] + else: + next_link = None # Happens for the last page + + yield GogsListerPage(repos=repos, next_link=next_link) + + if next_link is not None: + response = self.page_request(next_link, {}) + + def get_origins_from_page(self, page: GogsListerPage) -> Iterator[ListedOrigin]: + """Convert a page of Gogs repositories into a list of ListedOrigins""" + assert self.lister_obj.id is not None + assert page.repos is not None + + for r in page.repos: + last_update = iso8601.parse_date(r["updated_at"]) + + yield ListedOrigin( + lister_id=self.lister_obj.id, + visit_type=self.VISIT_TYPE, + url=r["clone_url"], + last_update=last_update, + ) + + def commit_page(self, page: GogsListerPage) -> None: + last_seen_next_link = page.next_link + + page_id = _parse_page_id(last_seen_next_link) + state_page_id = _parse_page_id(self.state.last_seen_next_link) + + if page_id > state_page_id: + self.state.last_seen_next_link = last_seen_next_link + + if (page.repos is not None) and len(page.repos) > 0: + self.state.last_seen_repo_id = page.repos[-1]["id"] + + def finalize(self) -> None: + scheduler_state = self.get_state_from_scheduler() + + state_page_id = _parse_page_id(self.state.last_seen_next_link) + scheduler_page_id = _parse_page_id(scheduler_state.last_seen_next_link) + + state_last_repo_id = self.state.last_seen_repo_id or 0 + 
scheduler_last_repo_id = scheduler_state.last_seen_repo_id or 0 + + if (state_page_id >= scheduler_page_id) and ( + state_last_repo_id > scheduler_last_repo_id + ): + self.updated = True # Marked updated only if it finds new repos diff --git a/swh/lister/gogs/tasks.py b/swh/lister/gogs/tasks.py new file mode 100644 index 0000000..81287fa --- /dev/null +++ b/swh/lister/gogs/tasks.py @@ -0,0 +1,28 @@ +# Copyright (C) 2022 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from typing import Dict, Optional + +from celery import shared_task + +from .lister import GogsLister + + +@shared_task(name=__name__ + ".FullGogsRelister") +def list_gogs_full( + url: str, + instance: Optional[str] = None, + api_token: Optional[str] = None, + page_size: Optional[int] = None, +) -> Dict[str, int]: + """Full update of a Gogs instance""" + lister = GogsLister.from_configfile( + url=url, instance=instance, api_token=api_token, page_size=page_size + ) + return lister.run().dict() + + +@shared_task(name=__name__ + ".ping") +def _ping() -> str: + return "OK" diff --git a/swh/lister/gogs/tests/__init__.py b/swh/lister/gogs/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/swh/lister/gogs/tests/data/https_try.gogs.io/repos_page1 b/swh/lister/gogs/tests/data/https_try.gogs.io/repos_page1 new file mode 100644 index 0000000..861092d --- /dev/null +++ b/swh/lister/gogs/tests/data/https_try.gogs.io/repos_page1 @@ -0,0 +1,98 @@ +{ + "data": [ + { + "id": 190, + "owner": { + "id": 338, + "username": "carwyn", + "login": "carwyn", + "full_name": "", + "email": "carwyn@carwyn.com", + "avatar_url": "https://secure.gravatar.com/avatar/65a98c538bcc360e9e9739d2af7908b0?d=identicon" + }, + "name": "test-repo", + "full_name": "carwyn/test-repo", + "description": "An example.", + "private": false, + "fork": false, + "parent": null, + "empty": false, + "mirror": false, + 
"size": 1024, + "html_url": "https://try.gogs.io/carwyn/test-repo", + "ssh_url": "git@try.gogs.io:carwyn/test-repo.git", + "clone_url": "https://try.gogs.io/carwyn/test-repo.git", + "website": "", + "stars_count": 0, + "forks_count": 0, + "watchers_count": 1, + "open_issues_count": 0, + "default_branch": "master", + "created_at": "2015-02-17T21:11:54Z", + "updated_at": "2022-03-26T07:28:38Z" + }, + { + "id": 258, + "owner": { + "id": 462, + "username": "juquinha", + "login": "juquinha", + "full_name": "", + "email": "juquinha123@mailinator.com", + "avatar_url": "https://secure.gravatar.com/avatar/40cdc8c32069ac441ff7f5c9bfe0f9ef?d=identicon" + }, + "name": "zicarepo", + "full_name": "juquinha/zicarepo", + "description": "Foo test.", + "private": false, + "fork": false, + "parent": null, + "empty": false, + "mirror": false, + "size": 8192, + "html_url": "https://try.gogs.io/juquinha/zicarepo", + "ssh_url": "git@try.gogs.io:juquinha/zicarepo.git", + "clone_url": "https://try.gogs.io/juquinha/zicarepo.git", + "website": "", + "stars_count": 0, + "forks_count": 0, + "watchers_count": 1, + "open_issues_count": 1, + "default_branch": "master", + "created_at": "2015-02-24T12:13:57Z", + "updated_at": "2022-03-26T07:28:38Z" + }, + { + "id": 334, + "owner": { + "id": 582, + "username": "ivilata", + "login": "ivilata", + "full_name": "", + "email": "ivan@pangea.org", + "avatar_url": "https://secure.gravatar.com/avatar/ed21e55837a9080c57181f624aefa905?d=identicon" + }, + "name": "footest", + "full_name": "ivilata/footest", + "description": "Dummy repo for testing issue handling mainly.", + "private": false, + "fork": false, + "parent": null, + "empty": false, + "mirror": false, + "size": 3072, + "html_url": "https://try.gogs.io/ivilata/footest", + "ssh_url": "git@try.gogs.io:ivilata/footest.git", + "clone_url": "https://try.gogs.io/ivilata/footest.git", + "website": "", + "stars_count": 0, + "forks_count": 0, + "watchers_count": 1, + "open_issues_count": 1, + "default_branch": 
"master", + "created_at": "2015-03-03T17:03:45Z", + "updated_at": "2022-03-26T07:28:38Z" + } + ], + "ok": true +} diff --git a/swh/lister/gogs/tests/data/https_try.gogs.io/repos_page2 b/swh/lister/gogs/tests/data/https_try.gogs.io/repos_page2 new file mode 100644 index 0000000..0f967f1 --- /dev/null +++ b/swh/lister/gogs/tests/data/https_try.gogs.io/repos_page2 @@ -0,0 +1,98 @@ +{ + "data": [ + { + "id": 337, + "owner": { + "id": 585, + "username": "zork", + "login": "zork", + "full_name": "", + "email": "f905334@trbvm.com", + "avatar_url": "https://secure.gravatar.com/avatar/ebcb8e171a1a47fde8ded46b2618f135?d=identicon" + }, + "name": "zork-repo", + "full_name": "zork/zork-repo", + "description": "This is a test thing.", + "private": false, + "fork": false, + "parent": null, + "empty": false, + "mirror": false, + "size": 13312, + "html_url": "https://try.gogs.io/zork/zork-repo", + "ssh_url": "git@try.gogs.io:zork/zork-repo.git", + "clone_url": "https://try.gogs.io/zork/zork-repo.git", + "website": "", + "stars_count": 0, + "forks_count": 0, + "watchers_count": 1, + "open_issues_count": 0, + "default_branch": "master", + "created_at": "2015-03-03T22:31:53Z", + "updated_at": "2022-03-26T07:28:38Z" + }, + { + "id": 338, + "owner": { + "id": 585, + "username": "zork", + "login": "zork", + "full_name": "", + "email": "f905334@trbvm.com", + "avatar_url": "https://secure.gravatar.com/avatar/ebcb8e171a1a47fde8ded46b2618f135?d=identicon" + }, + "name": "supernova", + "full_name": "zork/supernova", + "description": "This is a description. 
Blah blah blah.", + "private": false, + "fork": false, + "parent": null, + "empty": false, + "mirror": false, + "size": 1471488, + "html_url": "https://try.gogs.io/zork/supernova", + "ssh_url": "git@try.gogs.io:zork/supernova.git", + "clone_url": "https://try.gogs.io/zork/supernova.git", + "website": "", + "stars_count": 0, + "forks_count": 0, + "watchers_count": 1, + "open_issues_count": 0, + "default_branch": "master", + "created_at": "2015-03-03T22:44:20Z", + "updated_at": "2022-03-26T07:28:38Z" + }, + { + "id": 339, + "owner": { + "id": 585, + "username": "zork", + "login": "zork", + "full_name": "", + "email": "f905334@trbvm.com", + "avatar_url": "https://secure.gravatar.com/avatar/ebcb8e171a1a47fde8ded46b2618f135?d=identicon" + }, + "name": "digits", + "full_name": "zork/digits", + "description": "Distantly related to the game Mastermind, you are given clues to help determine a random number combination. The object of the game is to guess the solution in as few tries as possible.", + "private": false, + "fork": false, + "parent": null, + "empty": false, + "mirror": false, + "size": 18432, + "html_url": "https://try.gogs.io/zork/digits", + "ssh_url": "git@try.gogs.io:zork/digits.git", + "clone_url": "https://try.gogs.io/zork/digits.git", + "website": "", + "stars_count": 0, + "forks_count": 1, + "watchers_count": 1, + "open_issues_count": 0, + "default_branch": "master", + "created_at": "2015-03-03T22:47:56Z", + "updated_at": "2022-03-26T07:28:38Z" + } + ], + "ok": true +} diff --git a/swh/lister/gogs/tests/data/https_try.gogs.io/repos_page3 b/swh/lister/gogs/tests/data/https_try.gogs.io/repos_page3 new file mode 100644 index 0000000..5a1ac12 --- /dev/null +++ b/swh/lister/gogs/tests/data/https_try.gogs.io/repos_page3 @@ -0,0 +1,168 @@ +{ + "data": [ + { + "id": 340, + "owner": { + "id": 585, + "username": "zork", + "login": "zork", + "full_name": "", + "email": "f905334@trbvm.com", + "avatar_url": 
"https://secure.gravatar.com/avatar/ebcb8e171a1a47fde8ded46b2618f135?d=identicon" + }, + "name": "beyond-the-titanic", + "full_name": "zork/beyond-the-titanic", + "description": "Adventure awaits you onboard the RMS Titanic. Can you survive the sinking and make it home to San Francisco?", + "private": false, + "fork": false, + "parent": null, + "empty": false, + "mirror": false, + "size": 1436672, + "html_url": "https://try.gogs.io/zork/beyond-the-titanic", + "ssh_url": "git@try.gogs.io:zork/beyond-the-titanic.git", + "clone_url": "https://try.gogs.io/zork/beyond-the-titanic.git", + "website": "", + "stars_count": 0, + "forks_count": 1, + "watchers_count": 1, + "open_issues_count": 0, + "default_branch": "master", + "created_at": "2015-03-03T22:51:12Z", + "updated_at": "2022-03-26T07:28:38Z" + }, + { + "id": 350, + "owner": { + "id": 599, + "username": "perekre", + "login": "perekre", + "full_name": "", + "email": "perekre@nincsmail.com", + "avatar_url": "https://secure.gravatar.com/avatar/0e2666adf16f8a958a56141a2d94565c?d=identicon" + }, + "name": "beyond-the-titanic", + "full_name": "perekre/beyond-the-titanic", + "description": "Adventure awaits you onboard the RMS Titanic. Can you survive the sinking and make it home to San Francisco?", + "private": false, + "fork": true, + "parent": { + "id": 340, + "owner": { + "id": 585, + "username": "zork", + "login": "zork", + "full_name": "", + "email": "f905334@trbvm.com", + "avatar_url": "https://secure.gravatar.com/avatar/ebcb8e171a1a47fde8ded46b2618f135?d=identicon" + }, + "name": "beyond-the-titanic", + "full_name": "zork/beyond-the-titanic", + "description": "Adventure awaits you onboard the RMS Titanic. 
Can you survive the sinking and make it home to San Francisco?", + "private": false, + "fork": false, + "parent": null, + "empty": false, + "mirror": false, + "size": 1436672, + "html_url": "https://try.gogs.io/zork/beyond-the-titanic", + "ssh_url": "git@try.gogs.io:zork/beyond-the-titanic.git", + "clone_url": "https://try.gogs.io/zork/beyond-the-titanic.git", + "website": "", + "stars_count": 0, + "forks_count": 1, + "watchers_count": 1, + "open_issues_count": 0, + "default_branch": "master", + "created_at": "2015-03-03T22:51:12Z", + "updated_at": "2022-03-26T07:28:38Z", + "permissions": { + "admin": false, + "push": false, + "pull": true + } + }, + "empty": false, + "mirror": false, + "size": 1437696, + "html_url": "https://try.gogs.io/perekre/beyond-the-titanic", + "ssh_url": "git@try.gogs.io:perekre/beyond-the-titanic.git", + "clone_url": "https://try.gogs.io/perekre/beyond-the-titanic.git", + "website": "", + "stars_count": 0, + "forks_count": 0, + "watchers_count": 1, + "open_issues_count": 0, + "default_branch": "master", + "created_at": "2015-03-04T10:40:46Z", + "updated_at": "2022-03-26T07:28:38Z" + }, + { + "id": 369, + "owner": { + "id": 108, + "username": "yinheli", + "login": "yinheli", + "full_name": "", + "email": "me@yinheli.com", + "avatar_url": "https://secure.gravatar.com/avatar/dedb067ecae8155b87428ac7920dd0ae?d=identicon" + }, + "name": "digits", + "full_name": "yinheli/digits", + "description": "Distantly related to the game Mastermind, you are given clues to help determine a random number combination. 
The object of the game is to guess the solution in as few tries as possible.", + "private": false, + "fork": true, + "parent": { + "id": 339, + "owner": { + "id": 585, + "username": "zork", + "login": "zork", + "full_name": "", + "email": "f905334@trbvm.com", + "avatar_url": "https://secure.gravatar.com/avatar/ebcb8e171a1a47fde8ded46b2618f135?d=identicon" + }, + "name": "digits", + "full_name": "zork/digits", + "description": "Distantly related to the game Mastermind, you are given clues to help determine a random number combination. The object of the game is to guess the solution in as few tries as possible.", + "private": false, + "fork": false, + "parent": null, + "empty": false, + "mirror": false, + "size": 18432, + "html_url": "https://try.gogs.io/zork/digits", + "ssh_url": "git@try.gogs.io:zork/digits.git", + "clone_url": "https://try.gogs.io/zork/digits.git", + "website": "", + "stars_count": 0, + "forks_count": 1, + "watchers_count": 1, + "open_issues_count": 0, + "default_branch": "master", + "created_at": "2015-03-03T22:47:56Z", + "updated_at": "2022-03-26T07:28:38Z", + "permissions": { + "admin": false, + "push": false, + "pull": true + } + }, + "empty": false, + "mirror": false, + "size": 18432, + "html_url": "https://try.gogs.io/yinheli/digits", + "ssh_url": "git@try.gogs.io:yinheli/digits.git", + "clone_url": "https://try.gogs.io/yinheli/digits.git", + "website": "", + "stars_count": 0, + "forks_count": 0, + "watchers_count": 1, + "open_issues_count": 0, + "default_branch": "master", + "created_at": "2015-03-06T01:31:17Z", + "updated_at": "2022-03-26T07:28:38Z" + } + ], + "ok": true +} diff --git a/swh/lister/gogs/tests/data/https_try.gogs.io/repos_page4 b/swh/lister/gogs/tests/data/https_try.gogs.io/repos_page4 new file mode 100644 index 0000000..ad2c034 --- /dev/null +++ b/swh/lister/gogs/tests/data/https_try.gogs.io/repos_page4 @@ -0,0 +1 @@ 
+{"data":[{"id":380,"owner":{"id":653,"username":"gdr","login":"gdr","full_name":"","email":"gdr@gdr.name","avatar_url":"https://secure.gravatar.com/avatar/237e2bf0a3687301ed4ef3c65e56c672?d=identicon"},"name":"ttrss-af_nofacebook","full_name":"gdr/ttrss-af_nofacebook","description":"Tiny Tiny RSS plugin for removing Facebook's l.php links","private":false,"fork":false,"parent":null,"empty":false,"mirror":true,"size":4096,"html_url":"https://try.gogs.io/gdr/ttrss-af_nofacebook","ssh_url":"git@try.gogs.io:gdr/ttrss-af_nofacebook.git","clone_url":"https://try.gogs.io/gdr/ttrss-af_nofacebook.git","website":"","stars_count":0,"forks_count":0,"watchers_count":1,"open_issues_count":0,"default_branch":"master","created_at":"2015-03-06T11:39:30Z","updated_at":"2022-03-26T07:28:38Z"},{"id":399,"owner":{"id":683,"username":"brejoc","login":"brejoc","full_name":"","email":"brejoc@gmail.com","avatar_url":"https://secure.gravatar.com/avatar/80674ca691e4a325d8bff1977a1d881d?d=identicon"},"name":"gosgp","full_name":"brejoc/gosgp","description":"Command line SuperGenPass password generator written in go.","private":false,"fork":false,"parent":null,"empty":false,"mirror":false,"size":35840,"html_url":"https://try.gogs.io/brejoc/gosgp","ssh_url":"git@try.gogs.io:brejoc/gosgp.git","clone_url":"https://try.gogs.io/brejoc/gosgp.git","website":"","stars_count":0,"forks_count":2,"watchers_count":1,"open_issues_count":1,"default_branch":"master","created_at":"2015-03-09T12:15:24Z","updated_at":"2022-03-26T07:28:38Z"},{"id":403,"owner":{"id":687,"username":"mirdhyn","login":"mirdhyn","full_name":"","email":"mirdhyn@gmail.com","avatar_url":"https://secure.gravatar.com/avatar/047818f3fffe0df833958ea40a25fd5c?d=identicon"},"name":"gosgp","full_name":"mirdhyn/gosgp","description":"Command line SuperGenPass password generator written in 
go.","private":false,"fork":true,"parent":{"id":399,"owner":{"id":683,"username":"brejoc","login":"brejoc","full_name":"","email":"brejoc@gmail.com","avatar_url":"https://secure.gravatar.com/avatar/80674ca691e4a325d8bff1977a1d881d?d=identicon"},"name":"gosgp","full_name":"brejoc/gosgp","description":"Command line SuperGenPass password generator written in go.","private":false,"fork":false,"parent":null,"empty":false,"mirror":false,"size":35840,"html_url":"https://try.gogs.io/brejoc/gosgp","ssh_url":"git@try.gogs.io:brejoc/gosgp.git","clone_url":"https://try.gogs.io/brejoc/gosgp.git","website":"","stars_count":0,"forks_count":2,"watchers_count":1,"open_issues_count":1,"default_branch":"master","created_at":"2015-03-09T12:15:24Z","updated_at":"2022-03-26T07:28:38Z","permissions":{"admin":false,"push":false,"pull":true}},"empty":false,"mirror":false,"size":48128,"html_url":"https://try.gogs.io/mirdhyn/gosgp","ssh_url":"git@try.gogs.io:mirdhyn/gosgp.git","clone_url":"https://try.gogs.io/mirdhyn/gosgp.git","website":"","stars_count":0,"forks_count":0,"watchers_count":1,"open_issues_count":0,"default_branch":"master","created_at":"2015-03-09T21:34:21Z","updated_at":"2022-03-26T07:28:38Z"}],"ok":true} \ No newline at end of file diff --git a/swh/lister/gogs/tests/test_lister.py b/swh/lister/gogs/tests/test_lister.py new file mode 100644 index 0000000..5c9b651 --- /dev/null +++ b/swh/lister/gogs/tests/test_lister.py @@ -0,0 +1,322 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import json +from pathlib import Path +from typing import List +from unittest.mock import Mock + +import pytest +from requests import HTTPError + +from swh.lister.gogs.lister import GogsLister, GogsListerPage, _parse_page_id +from swh.scheduler.model import ListedOrigin + +TRY_GOGS_URL = 
"https://try.gogs.io/api/v1/" + + +def try_gogs_page(n: int): + return TRY_GOGS_URL + GogsLister.REPO_LIST_PATH + f"?page={n}&limit=3" + + +P1 = try_gogs_page(1) +P2 = try_gogs_page(2) +P3 = try_gogs_page(3) +P4 = try_gogs_page(4) + + +@pytest.fixture +def trygogs_p1(datadir): + text = Path(datadir, "https_try.gogs.io", "repos_page1").read_text() + headers = {"Link": f'<{P2}>; rel="next"'} + page_result = GogsListerPage( + repos=GogsLister.extract_repos(json.loads(text)), next_link=P2 + ) + origin_urls = [r["clone_url"] for r in page_result.repos] + return text, headers, page_result, origin_urls + + +@pytest.fixture +def trygogs_p2(datadir): + text = Path(datadir, "https_try.gogs.io", "repos_page2").read_text() + headers = {"Link": f'<{P3}>; rel="next",<{P1}>; rel="prev"'} + page_result = GogsListerPage( + repos=GogsLister.extract_repos(json.loads(text)), next_link=P3 + ) + origin_urls = [r["clone_url"] for r in page_result.repos] + return text, headers, page_result, origin_urls + + +@pytest.fixture +def trygogs_p3(datadir): + text = Path(datadir, "https_try.gogs.io", "repos_page3").read_text() + headers = {"Link": f'<{P4}>; rel="next",<{P2}>; rel="prev"'} + page_result = GogsListerPage( + repos=GogsLister.extract_repos(json.loads(text)), next_link=P3 + ) + origin_urls = [r["clone_url"] for r in page_result.repos] + return text, headers, page_result, origin_urls + + +@pytest.fixture +def trygogs_p4(datadir): + text = Path(datadir, "https_try.gogs.io", "repos_page4").read_text() + headers = {"Link": f'<{P3}>; rel="prev"'} + page_result = GogsListerPage( + repos=GogsLister.extract_repos(json.loads(text)), next_link=P3 + ) + origin_urls = [r["clone_url"] for r in page_result.repos] + return text, headers, page_result, origin_urls + + +@pytest.fixture +def trygogs_p3_last(datadir): + text = Path(datadir, "https_try.gogs.io", "repos_page3").read_text() + headers = {"Link": f'<{P2}>; rel="prev",<{P1}>; rel="first"'} + page_result = GogsListerPage( + 
repos=GogsLister.extract_repos(json.loads(text)), next_link=None + ) + origin_urls = [r["clone_url"] for r in page_result.repos] + return text, headers, page_result, origin_urls + + +@pytest.fixture +def trygogs_p3_empty(): + origins_urls = [] + body = {"data": [], "ok": True} + headers = {"Link": f'<{P2}>; rel="prev",<{P1}>; rel="first"'} + page_result = GogsListerPage(repos=GogsLister.extract_repos(body), next_link=None) + text = json.dumps(body) + return text, headers, page_result, origins_urls + + +def check_listed_origins(lister_urls: List[str], scheduler_origins: List[ListedOrigin]): + """Asserts that the two collections have the same origin URLs. + + Does not test last_update.""" + assert set(lister_urls) == {origin.url for origin in scheduler_origins} + + +def test_gogs_full_listing( + swh_scheduler, requests_mock, mocker, trygogs_p1, trygogs_p2, trygogs_p3_last +): + kwargs = dict( + url=TRY_GOGS_URL, instance="try_gogs", page_size=3, api_token="secret" + ) + lister = GogsLister(scheduler=swh_scheduler, **kwargs) + + lister.get_origins_from_page: Mock = mocker.spy(lister, "get_origins_from_page") + + p1_text, p1_headers, p1_result, p1_origin_urls = trygogs_p1 + p2_text, p2_headers, p2_result, p2_origin_urls = trygogs_p2 + p3_text, p3_headers, p3_result, p3_origin_urls = trygogs_p3_last + + requests_mock.get(P1, text=p1_text, headers=p1_headers) + requests_mock.get(P2, text=p2_text, headers=p2_headers) + requests_mock.get(P3, text=p3_text, headers=p3_headers) + + stats = lister.run() + + assert stats.pages == 3 + assert stats.origins == 9 + + calls = map(mocker.call, [p1_result, p2_result, p3_result]) + lister.get_origins_from_page.assert_has_calls(list(calls)) + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + + check_listed_origins( + p1_origin_urls + p2_origin_urls + p3_origin_urls, scheduler_origins + ) + + assert ( + lister.get_state_from_scheduler().last_seen_next_link == P3 + ) # P3 didn't provide any next link 
so it remains the last_seen_next_link + + +def test_gogs_auth_instance( + swh_scheduler, requests_mock, trygogs_p1, trygogs_p2, trygogs_p3_empty +): + """Covers token authentication, token from credentials, + instance inference from URL.""" + + api_token = "secret" + instance = "try_gogs" + + # Test lister initialization without api_token or credentials: + with pytest.raises(ValueError, match="No credentials or API token provided"): + kwargs1 = dict(url=TRY_GOGS_URL, instance=instance) + GogsLister(scheduler=swh_scheduler, **kwargs1) + + # Test lister initialization using api_token: + kwargs2 = dict(url=TRY_GOGS_URL, api_token=api_token, instance=instance) + lister = GogsLister(scheduler=swh_scheduler, **kwargs2) + assert lister.session.headers["Authorization"].lower() == "token %s" % api_token + + # Test lister initialization with credentials and run it: + creds = {"gogs": {instance: [{"username": "u", "password": api_token}]}} + kwargs3 = dict(url=TRY_GOGS_URL, credentials=creds, instance=instance, page_size=3) + lister = GogsLister(scheduler=swh_scheduler, **kwargs3) + assert lister.session.headers["Authorization"].lower() == "token %s" % api_token + assert lister.instance == "try_gogs" + + # setup requests mocking + p1_text, p1_headers, _, _ = trygogs_p1 + p2_text, p2_headers, _, _ = trygogs_p2 + p3_text, p3_headers, _, _ = trygogs_p3_empty + + requests_mock.get(P1, text=p1_text, headers=p1_headers) + requests_mock.get(P2, text=p2_text, headers=p2_headers) + requests_mock.get(P3, text=p3_text, headers=p3_headers) + + # lister should run without any error and extract the origins + stats = lister.run() + assert stats.pages == 3 + assert stats.origins == 6 + + +@pytest.mark.parametrize("http_code", [400, 500, 502]) +def test_gogs_list_http_error( + swh_scheduler, requests_mock, http_code, trygogs_p1, trygogs_p3_last +): + """Test handling of some HTTP errors commonly encountered""" + + lister = GogsLister(scheduler=swh_scheduler, url=TRY_GOGS_URL, 
api_token="secret") + + p1_text, p1_headers, _, p1_origin_urls = trygogs_p1 + p3_text, p3_headers, _, _ = trygogs_p3_last + + base_url = TRY_GOGS_URL + lister.REPO_LIST_PATH + requests_mock.get( + base_url, + [ + {"text": p1_text, "headers": p1_headers, "status_code": 200}, + {"status_code": http_code}, + {"text": p3_text, "headers": p3_headers, "status_code": 200}, + ], + ) + + with pytest.raises(HTTPError): + lister.run() + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + check_listed_origins( + p1_origin_urls, scheduler_origins + ) # Only the first page is listed + + +def test_gogs_incremental_lister( + swh_scheduler, + requests_mock, + mocker, + trygogs_p1, + trygogs_p2, + trygogs_p3, + trygogs_p3_last, + trygogs_p3_empty, + trygogs_p4, +): + kwargs = dict( + url=TRY_GOGS_URL, instance="try_gogs", page_size=3, api_token="secret" + ) + lister = GogsLister(scheduler=swh_scheduler, **kwargs) + + lister.get_origins_from_page: Mock = mocker.spy(lister, "get_origins_from_page") + + # First listing attempt: P1 and P2 return 3 origins each + # while P3 (current last page) is empty. 
+ + p1_text, p1_headers, p1_result, p1_origin_urls = trygogs_p1 + p2_text, p2_headers, p2_result, p2_origin_urls = trygogs_p2 + p3_text, p3_headers, p3_result, p3_origin_urls = trygogs_p3_empty + + requests_mock.get(P1, text=p1_text, headers=p1_headers) + requests_mock.get(P2, text=p2_text, headers=p2_headers) + requests_mock.get(P3, text=p3_text, headers=p3_headers) + + attempt1_stats = lister.run() + assert attempt1_stats.pages == 3 + assert attempt1_stats.origins == 6 + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + + lister_state = lister.get_state_from_scheduler() + assert lister_state.last_seen_next_link == P3 + assert lister_state.last_seen_repo_id == p2_result.repos[-1]["id"] + assert lister.updated + + check_listed_origins(p1_origin_urls + p2_origin_urls, scheduler_origins) + + lister.updated = False # Reset the flag + + # Second listing attempt: P3 isn't empty anymore. + # The lister should restart from last state and hence revisit P3. + p3_text, p3_headers, p3_result, p3_origin_urls = trygogs_p3_last + requests_mock.get(P3, text=p3_text, headers=p3_headers) + + lister.session.get = mocker.spy(lister.session, "get") + + attempt2_stats = lister.run() + + assert attempt2_stats.pages == 1 + assert attempt2_stats.origins == 3 + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + + page_id = _parse_page_id(lister_state.last_seen_next_link) + query_params = lister.query_params + query_params["page"] = page_id + + lister.session.get.assert_called_once_with( + TRY_GOGS_URL + lister.REPO_LIST_PATH, params=query_params + ) + + # All the 9 origins (3 pages) should be passed on to the scheduler: + check_listed_origins( + p1_origin_urls + p2_origin_urls + p3_origin_urls, scheduler_origins + ) + lister_state = lister.get_state_from_scheduler() + assert lister_state.last_seen_next_link == P3 + assert lister_state.last_seen_repo_id == p3_result.repos[-1]["id"] + assert lister.updated + + 
lister.updated = False # Reset the flag + + # Third listing attempt: No new origins + # The lister should revisit last seen page (P3) + attempt3_stats = lister.run() + + assert attempt3_stats.pages == 1 + assert attempt3_stats.origins == 3 + + lister_state = lister.get_state_from_scheduler() + assert lister_state.last_seen_next_link == P3 + assert lister_state.last_seen_repo_id == p3_result.repos[-1]["id"] + assert lister.updated is False # No new origins so state isn't updated. + + # Fourth listing attempt: Page 4 is introduced and returns 3 new origins + # The lister should revisit last seen page (P3) as well as P4. + p3_text, p3_headers, p3_result, p3_origin_urls = trygogs_p3 # new P3 points to P4 + p4_text, p4_headers, p4_result, p4_origin_urls = trygogs_p4 + + requests_mock.get(P3, text=p3_text, headers=p3_headers) + requests_mock.get(P4, text=p4_text, headers=p4_headers) + + attempt4_stats = lister.run() + + assert attempt4_stats.pages == 2 + assert attempt4_stats.origins == 6 + + lister_state = lister.get_state_from_scheduler() + assert lister_state.last_seen_next_link == P4 + assert lister_state.last_seen_repo_id == p4_result.repos[-1]["id"] + assert lister.updated + + # All the 12 origins (4 pages) should be passed on to the scheduler: + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + check_listed_origins( + p1_origin_urls + p2_origin_urls + p3_origin_urls + p4_origin_urls, + scheduler_origins, + ) diff --git a/swh/lister/gogs/tests/test_tasks.py b/swh/lister/gogs/tests/test_tasks.py new file mode 100644 index 0000000..2f38341 --- /dev/null +++ b/swh/lister/gogs/tests/test_tasks.py @@ -0,0 +1,61 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from unittest.mock import patch + +from swh.lister.pattern import 
ListerStats + + +def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker): + res = swh_scheduler_celery_app.send_task("swh.lister.gogs.tasks.ping") + assert res + res.wait() + assert res.successful() + assert res.result == "OK" + + +@patch("swh.lister.gogs.tasks.GogsLister") +def test_full_listing(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker): + lister.from_configfile.return_value = lister + lister.run.return_value = ListerStats(pages=10, origins=500) + + kwargs = dict(url="https://try.gogs.io/api/v1/") + res = swh_scheduler_celery_app.send_task( + "swh.lister.gogs.tasks.FullGogsRelister", + kwargs=kwargs, + ) + assert res + res.wait() + assert res.successful() + + actual_kwargs = dict(**kwargs, instance=None, api_token=None, page_size=None) + + lister.from_configfile.assert_called_once_with(**actual_kwargs) + lister.run.assert_called_once_with() + + +@patch("swh.lister.gogs.tasks.GogsLister") +def test_full_listing_params( + lister, swh_scheduler_celery_app, swh_scheduler_celery_worker +): + lister.from_configfile.return_value = lister + lister.run.return_value = ListerStats(pages=10, origins=500) + + kwargs = dict( + url="https://gogs-host.com/api/v1/", + instance="foo", + api_token="test", + page_size=50, + ) + res = swh_scheduler_celery_app.send_task( + "swh.lister.gogs.tasks.FullGogsRelister", + kwargs=kwargs, + ) + assert res + res.wait() + assert res.successful() + + lister.from_configfile.assert_called_once_with(**kwargs) + lister.run.assert_called_once_with() diff --git a/swh/lister/crates/__init__.py b/swh/lister/golang/__init__.py similarity index 79% copy from swh/lister/crates/__init__.py copy to swh/lister/golang/__init__.py index 2b31785..fe20282 100644 --- a/swh/lister/crates/__init__.py +++ b/swh/lister/golang/__init__.py @@ -1,12 +1,12 @@ # Copyright (C) 2022 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information def 
register(): - from .lister import CratesLister + from .lister import GolangLister return { - "lister": CratesLister, + "lister": GolangLister, "task_modules": ["%s.tasks" % __name__], } diff --git a/swh/lister/golang/lister.py b/swh/lister/golang/lister.py new file mode 100644 index 0000000..0d6b2b9 --- /dev/null +++ b/swh/lister/golang/lister.py @@ -0,0 +1,188 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from dataclasses import dataclass +from datetime import datetime +import json +import logging +from typing import Any, Dict, Iterator, List, Optional, Tuple + +import iso8601 +import requests +from tenacity import before_sleep_log + +from swh.lister.utils import retry_policy_generic, throttling_retry +from swh.scheduler.interface import SchedulerInterface +from swh.scheduler.model import ListedOrigin + +from .. import USER_AGENT +from ..pattern import CredentialsType, Lister + +logger = logging.getLogger(__name__) + + +@dataclass +class GolangStateType: + last_seen: Optional[datetime] = None + """Last timestamp of a package version we have saved. + Used as a starting point for an incremental listing.""" + + +GolangPageType = List[Dict[str, Any]] + + +class GolangLister(Lister[GolangStateType, GolangPageType]): + """ + List all Golang modules and send associated origins to scheduler. + + The lister queries the Golang module index, whose documentation can be found + at https://index.golang.org + """ + + GOLANG_MODULES_INDEX_URL = "https://index.golang.org/index" + # `limit` seems to be... limited to 2000. 
+ GOLANG_MODULES_INDEX_LIMIT = 2000 + LISTER_NAME = "Golang" + + def __init__( + self, + scheduler: SchedulerInterface, + incremental: bool = False, + credentials: CredentialsType = None, + ): + super().__init__( + scheduler=scheduler, + url=self.GOLANG_MODULES_INDEX_URL, + instance="Golang", + credentials=credentials, + ) + + self.session = requests.Session() + self.session.headers.update( + {"Accept": "application/json", "User-Agent": USER_AGENT} + ) + self.incremental = incremental + + def state_from_dict(self, d: Dict[str, Any]) -> GolangStateType: + as_string = d.get("last_seen") + last_seen = iso8601.parse_date(as_string) if as_string is not None else None + return GolangStateType(last_seen=last_seen) + + def state_to_dict(self, state: GolangStateType) -> Dict[str, Any]: + return { + "last_seen": state.last_seen.isoformat() + if state.last_seen is not None + else None + } + + def finalize(self): + if self.incremental and self.state.last_seen is not None: + scheduler_state = self.get_state_from_scheduler() + + if ( + scheduler_state.last_seen is None + or self.state.last_seen > scheduler_state.last_seen + ): + self.updated = True + + @throttling_retry( + retry=retry_policy_generic, + before_sleep=before_sleep_log(logger, logging.WARNING), + ) + def api_request(self, url: str) -> List[str]: + logger.debug("Fetching URL %s", url) + + response = self.session.get(url) + + if response.status_code not in (200, 304): + # Log response content to ease debugging + logger.warning( + "Unexpected HTTP status code %s for URL %s", + response.status_code, + response.url, + ) + + response.raise_for_status() + + return response.text.split() + + def get_single_page( + self, since: Optional[datetime] = None + ) -> Tuple[GolangPageType, Optional[datetime]]: + """Return a page from the API and the timestamp of its last entry. 
+ Since all entries are sorted by chronological order, the timestamp is useful + both for pagination and later for incremental runs.""" + url = f"{self.url}?limit={self.GOLANG_MODULES_INDEX_LIMIT}" + if since is not None: + # The Golang index does not understand `+00:00` for some reason + # and expects the "timezone zero" notation instead. This works + # because all times are UTC. + utc_offset = since.utcoffset() + assert ( + utc_offset is not None and utc_offset.total_seconds() == 0 + ), "Non-UTC datetime" + as_date = since.isoformat().replace("+00:00", "Z") + url = f"{url}&since={as_date}" + + entries = self.api_request(url) + page: GolangPageType = [] + if not entries: + return page, since + + for as_json in entries: + entry = json.loads(as_json) + timestamp = iso8601.parse_date(entry["Timestamp"]) + # We've already parsed it and we'll need the datetime later, save it + entry["Timestamp"] = timestamp + page.append(entry) + # The index is guaranteed to be sorted in chronological order + since = timestamp + + return page, since + + def get_pages(self) -> Iterator[GolangPageType]: + since = None + if self.incremental: + since = self.state.last_seen + page, since = self.get_single_page(since=since) + if since == self.state.last_seen: + # The index returns packages whose timestamp are greater or + # equal to the date provided as parameter, which will create + # an infinite loop if not stopped here. + return [], since + if since is not None: + self.state.last_seen = since + + while page: + yield page + page, since = self.get_single_page(since=since) + if since == self.state.last_seen: + return [], since + if since is not None: + self.state.last_seen = since + + def get_origins_from_page(self, page: GolangPageType) -> Iterator[ListedOrigin]: + """ + Iterate on all Golang projects and yield ListedOrigin instances. 
+ """ + assert self.lister_obj.id is not None + + for module in page: + path = module["Path"] + # The loader will be expected to use the golang proxy to do the + # actual downloading. We're using `pkg.go.dev` so that the URL points + # to somewhere useful for a human instead of an (incomplete) API path. + origin_url = f"https://pkg.go.dev/{path}" + + # Since the Go index lists versions and not just packages, there will + # be duplicates. Fortunately, `ListedOrigins` are "upserted" server-side, + # so only the last timestamp will be used, with no duplicates. + # Performance should not be an issue as they are sent to the db in bulk. + yield ListedOrigin( + lister_id=self.lister_obj.id, + url=origin_url, + visit_type="golang", + last_update=module["Timestamp"], + ) diff --git a/swh/lister/golang/tasks.py b/swh/lister/golang/tasks.py new file mode 100644 index 0000000..3bbba0d --- /dev/null +++ b/swh/lister/golang/tasks.py @@ -0,0 +1,25 @@ +# Copyright (C) 2022 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from celery import shared_task + +from .lister import GolangLister + + +@shared_task(name=__name__ + ".FullGolangLister") +def list_golang(**lister_args): + "List the Golang module registry" + return GolangLister.from_configfile(**lister_args).run().dict() + + +@shared_task(name=__name__ + ".IncrementalGolangLister") +def list_golang_incremental(**lister_args): + """Incremental update of Golang packages""" + lister = GolangLister.from_configfile(incremental=True, **lister_args) + return lister.run().dict() + + +@shared_task(name=__name__ + ".ping") +def _ping(): + return "OK" diff --git a/swh/lister/golang/tests/__init__.py b/swh/lister/golang/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/swh/lister/golang/tests/data/page-1.txt b/swh/lister/golang/tests/data/page-1.txt new file mode 100644 index 0000000..b699d0b --- /dev/null 
+++ b/swh/lister/golang/tests/data/page-1.txt @@ -0,0 +1,5 @@ +{"Path":"golang.org/x/text","Version":"v0.3.0","Timestamp":"2019-04-10T19:08:52.997264Z"} +{"Path":"github.com/oklog/ulid","Version":"v1.3.1","Timestamp":"2019-04-11T18:47:23.234198Z"} +{"Path":"collectd.org","Version":"v0.3.0","Timestamp":"2019-04-11T18:47:25.450546Z"} +{"Path":"github.com/nats-io/nuid","Version":"v1.0.1","Timestamp":"2019-04-11T18:47:28.102348Z"} +{"Path":"github.com/bmizerany/pat","Version":"v0.0.0-20170815010413-6226ea591a40","Timestamp":"2019-04-11T18:47:29.390564Z"} \ No newline at end of file diff --git a/swh/lister/golang/tests/data/page-2.txt b/swh/lister/golang/tests/data/page-2.txt new file mode 100644 index 0000000..badc2fe --- /dev/null +++ b/swh/lister/golang/tests/data/page-2.txt @@ -0,0 +1,4 @@ +{"Path":"github.com/djherbis/buffer","Version":"v1.0.0","Timestamp":"2019-04-11T18:47:29.974874Z"} +{"Path":"github.com/djherbis/nio","Version":"v2.0.3+incompatible","Timestamp":"2019-04-11T18:47:32.283312Z"} +{"Path":"github.com/gobuffalo/buffalo-plugins","Version":"v1.13.0","Timestamp":"2019-04-15T13:54:34.222985Z"} +{"Path":"github.com/markbates/refresh","Version":"v1.7.1","Timestamp":"2019-04-15T13:54:35.250835Z"} \ No newline at end of file diff --git a/swh/lister/golang/tests/data/page-3.txt b/swh/lister/golang/tests/data/page-3.txt new file mode 100644 index 0000000..37e9d96 --- /dev/null +++ b/swh/lister/golang/tests/data/page-3.txt @@ -0,0 +1,10 @@ +{"Path":"github.com/mitchellh/go-homedir","Version":"v1.1.0","Timestamp":"2019-04-15T13:54:35.678214Z"} +{"Path":"github.com/gobuffalo/packr","Version":"v1.22.0","Timestamp":"2019-04-15T13:54:35.6889Z"} +{"Path":"golang.org/x/sys","Version":"v0.0.0-20190220154126-629670e5acc5","Timestamp":"2019-04-15T13:54:37.555525Z"} +{"Path":"github.com/gobuffalo/genny","Version":"v0.0.0-20190104222617-a71664fc38e7","Timestamp":"2019-04-15T13:54:37.841547Z"} 
+{"Path":"github.com/blang/semver","Version":"v3.5.1+incompatible","Timestamp":"2019-04-15T13:54:39.107258Z"} +{"Path":"github.com/gobuffalo/buffalo-pop","Version":"v1.3.0","Timestamp":"2019-04-15T13:54:39.135792Z"} +{"Path":"golang.org/x/tools","Version":"v0.0.0-20190131142011-8dbcc66f33bb","Timestamp":"2019-04-15T13:54:39.250757Z"} +{"Path":"github.com/gobuffalo/clara","Version":"v0.4.1","Timestamp":"2019-04-15T13:54:40.651916Z"} +{"Path":"golang.org/x/tools","Version":"v0.0.0-20181213190329-bbccd8cae4a9","Timestamp":"2019-04-15T13:54:41.905064Z"} +{"Path":"github.com/pkg/errors","Version":"v0.0.0-20161002052512-839d9e913e06","Timestamp":"2019-04-18T02:07:41.336899Z"} \ No newline at end of file diff --git a/swh/lister/golang/tests/test_lister.py b/swh/lister/golang/tests/test_lister.py new file mode 100644 index 0000000..3cc9c64 --- /dev/null +++ b/swh/lister/golang/tests/test_lister.py @@ -0,0 +1,241 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import datetime +from pathlib import Path + +import iso8601 + +from swh.lister.golang.lister import GolangLister, GolangStateType +from swh.lister.tests.test_utils import assert_sleep_calls +from swh.lister.utils import WAIT_EXP_BASE + +# https://pkg.go.dev prefix omitted +expected_listed = [ + ("collectd.org", "2019-04-11T18:47:25.450546+00:00"), + ( + "github.com/blang/semver", + "2019-04-15T13:54:39.107258+00:00", + ), + ( + "github.com/bmizerany/pat", + "2019-04-11T18:47:29.390564+00:00", + ), + ( + "github.com/djherbis/buffer", + "2019-04-11T18:47:29.974874+00:00", + ), + ( + "github.com/djherbis/nio", + "2019-04-11T18:47:32.283312+00:00", + ), + ( + "github.com/gobuffalo/buffalo-plugins", + "2019-04-15T13:54:34.222985+00:00", + ), + ( + "github.com/gobuffalo/buffalo-pop", + "2019-04-15T13:54:39.135792+00:00", 
+ ), + ( + "github.com/gobuffalo/clara", + "2019-04-15T13:54:40.651916+00:00", + ), + ( + "github.com/gobuffalo/genny", + "2019-04-15T13:54:37.841547+00:00", + ), + ( + "github.com/gobuffalo/packr", + "2019-04-15T13:54:35.688900+00:00", + ), + ( + "github.com/markbates/refresh", + "2019-04-15T13:54:35.250835+00:00", + ), + ( + "github.com/mitchellh/go-homedir", + "2019-04-15T13:54:35.678214+00:00", + ), + ( + "github.com/nats-io/nuid", + "2019-04-11T18:47:28.102348+00:00", + ), + ( + "github.com/oklog/ulid", + "2019-04-11T18:47:23.234198+00:00", + ), + ( + "github.com/pkg/errors", + "2019-04-18T02:07:41.336899+00:00", + ), + ( + "golang.org/x/sys", + "2019-04-15T13:54:37.555525+00:00", + ), + ("golang.org/x/text", "2019-04-10T19:08:52.997264+00:00"), + # only one x/tools listed even though there are two version, and only the + # latest one's timestamp is used. + ( + "golang.org/x/tools", + "2019-04-15T13:54:41.905064+00:00", + ), +] + + +def _generate_responses(datadir, requests_mock): + responses = [] + for file in Path(datadir).glob("page-*.txt"): + # Test that throttling and server errors are retries + responses.append({"text": "", "status_code": 429}) + responses.append({"text": "", "status_code": 500}) + # Also test that the lister appropriately gets out of the infinite loop + responses.append({"text": file.read_text(), "status_code": 200}) + + requests_mock.get(GolangLister.GOLANG_MODULES_INDEX_URL, responses) + + +def test_golang_lister(swh_scheduler, mocker, requests_mock, datadir): + # first listing, should return one origin per package + lister = GolangLister(scheduler=swh_scheduler) + + # Exponential retries take a long time, so stub time.sleep + mocked_sleep = mocker.patch.object(lister.api_request.retry, "sleep") + + _generate_responses(datadir, requests_mock) + + stats = lister.run() + + assert stats.pages == 3 + # The two `golang.org/x/tools` versions are *not* listed as separate origins + assert stats.origins == 18 + + scheduler_origins = sorted( + 
swh_scheduler.get_listed_origins(lister.lister_obj.id).results, + key=lambda x: x.url, + ) + + for scheduled, (url, timestamp) in zip(scheduler_origins, expected_listed): + assert scheduled.url == f"https://pkg.go.dev/{url}" + assert scheduled.last_update == iso8601.parse_date(timestamp) + assert scheduled.visit_type == "golang" + + assert len(scheduler_origins) == len(expected_listed) + + # Test `time.sleep` is called with exponential retries + assert_sleep_calls( + mocker, mocked_sleep, [1, WAIT_EXP_BASE, 1, WAIT_EXP_BASE, 1, WAIT_EXP_BASE] + ) + + # doing it all again (without incremental) should give us the same result + lister = GolangLister(scheduler=swh_scheduler) + mocked_sleep = mocker.patch.object(lister.api_request.retry, "sleep") + _generate_responses(datadir, requests_mock) + stats = lister.run() + + assert stats.pages == 3 + assert stats.origins == 18 + + +def test_golang_lister_incremental(swh_scheduler, requests_mock, datadir, mocker): + # first listing, should return one origin per package + lister = GolangLister(scheduler=swh_scheduler, incremental=True) + mock = mocker.spy(lister, "get_single_page") + + responses = [ + {"text": Path(datadir, "page-1.txt").read_text(), "status_code": 200}, + ] + requests_mock.get(GolangLister.GOLANG_MODULES_INDEX_URL, responses) + + stats = lister.run() + + page1_last_timestamp = datetime.datetime( + 2019, 4, 11, 18, 47, 29, 390564, tzinfo=datetime.timezone.utc + ) + page2_last_timestamp = datetime.datetime( + 2019, 4, 15, 13, 54, 35, 250835, tzinfo=datetime.timezone.utc + ) + page3_last_timestamp = datetime.datetime( + 2019, 4, 18, 2, 7, 41, 336899, tzinfo=datetime.timezone.utc + ) + mock.assert_has_calls( + [ + # First call has no state + mocker.call(since=None), + # Second call is the last timestamp in the listed page + mocker.call(since=page1_last_timestamp), + ] + ) + + assert lister.get_state_from_scheduler() == GolangStateType( + last_seen=page1_last_timestamp + ) + + assert stats.pages == 1 + assert 
stats.origins == 5 + + # Incremental should list nothing + lister = GolangLister(scheduler=swh_scheduler, incremental=True) + mock = mocker.spy(lister, "get_single_page") + stats = lister.run() + mock.assert_has_calls([mocker.call(since=page1_last_timestamp)]) + assert stats.pages == 0 + assert stats.origins == 0 + + # Add more responses + responses = [ + {"text": Path(datadir, "page-2.txt").read_text(), "status_code": 200}, + ] + + requests_mock.get(GolangLister.GOLANG_MODULES_INDEX_URL, responses) + + # Incremental should list new page + lister = GolangLister(scheduler=swh_scheduler, incremental=True) + mock = mocker.spy(lister, "get_single_page") + stats = lister.run() + mock.assert_has_calls( + [ + mocker.call(since=page1_last_timestamp), + mocker.call(since=page2_last_timestamp), + ] + ) + assert stats.pages == 1 + assert stats.origins == 4 + + # Incremental should list nothing again + lister = GolangLister(scheduler=swh_scheduler, incremental=True) + mock = mocker.spy(lister, "get_single_page") + stats = lister.run() + assert stats.pages == 0 + assert stats.origins == 0 + mock.assert_has_calls([mocker.call(since=page2_last_timestamp)]) + + # Add yet more responses + responses = [ + {"text": Path(datadir, "page-3.txt").read_text(), "status_code": 200}, + ] + + requests_mock.get(GolangLister.GOLANG_MODULES_INDEX_URL, responses) + + # Incremental should list new page again + lister = GolangLister(scheduler=swh_scheduler, incremental=True) + mock = mocker.spy(lister, "get_single_page") + stats = lister.run() + assert stats.pages == 1 + assert stats.origins == 9 + mock.assert_has_calls( + [ + mocker.call(since=page2_last_timestamp), + mocker.call(since=page3_last_timestamp), + ] + ) + + # Incremental should list nothing one last time + lister = GolangLister(scheduler=swh_scheduler, incremental=True) + mock = mocker.spy(lister, "get_single_page") + stats = lister.run() + assert stats.pages == 0 + assert stats.origins == 0 + 
mock.assert_has_calls([mocker.call(since=page3_last_timestamp)]) diff --git a/swh/lister/golang/tests/test_tasks.py b/swh/lister/golang/tests/test_tasks.py new file mode 100644 index 0000000..92458cc --- /dev/null +++ b/swh/lister/golang/tests/test_tasks.py @@ -0,0 +1,52 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from swh.lister.pattern import ListerStats + + +def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker): + res = swh_scheduler_celery_app.send_task("swh.lister.golang.tasks.ping") + assert res + res.wait() + assert res.successful() + assert res.result == "OK" + + +def test_golang_full_listing_task( + swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker +): + lister = mocker.patch("swh.lister.golang.tasks.GolangLister") + lister.from_configfile.return_value = lister + stats = ListerStats(pages=1, origins=28000) + lister.run.return_value = stats + + res = swh_scheduler_celery_app.send_task("swh.lister.golang.tasks.FullGolangLister") + assert res + res.wait() + assert res.successful() + assert res.result == stats.dict() + + lister.from_configfile.assert_called_once_with() + lister.run.assert_called_once_with() + + +def test_golang_incremental_listing_task( + swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker +): + lister = mocker.patch("swh.lister.golang.tasks.GolangLister") + lister.from_configfile.return_value = lister + stats = ListerStats(pages=1, origins=28000) + lister.run.return_value = stats + + res = swh_scheduler_celery_app.send_task( + "swh.lister.golang.tasks.IncrementalGolangLister" + ) + assert res + res.wait() + assert res.successful() + assert res.result == stats.dict() + + lister.from_configfile.assert_called_once_with(incremental=True) + lister.run.assert_called_once_with() diff --git 
a/swh/lister/maven/lister.py b/swh/lister/maven/lister.py index 2dc6cc5..2560feb 100644 --- a/swh/lister/maven/lister.py +++ b/swh/lister/maven/lister.py @@ -1,425 +1,428 @@ # Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import asdict, dataclass from datetime import datetime, timezone import logging import re from typing import Any, Dict, Iterator, Optional from urllib.parse import urljoin +from bs4 import BeautifulSoup +import lxml import requests from tenacity.before_sleep import before_sleep_log -import xmltodict from swh.core.github.utils import GitHubSession from swh.lister.utils import throttling_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from .. import USER_AGENT from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) RepoPage = Dict[str, Any] SUPPORTED_SCM_TYPES = ("git", "svn", "hg", "cvs", "bzr") @dataclass class MavenListerState: """State of the MavenLister""" last_seen_doc: int = -1 """Last doc ID ingested during an incremental pass """ last_seen_pom: int = -1 """Last doc ID related to a pom and ingested during an incremental pass """ class MavenLister(Lister[MavenListerState, RepoPage]): """List origins from a Maven repository. Maven Central provides artifacts for Java builds. It includes POM files and source archives, which we download to get the source code of artifacts and links to their scm repository. 
This lister yields origins of types: git/svn/hg or whatever the Artifacts use as repository type, plus maven types for the maven loader (tgz, jar).""" LISTER_NAME = "maven" def __init__( self, scheduler: SchedulerInterface, url: str, index_url: str = None, instance: Optional[str] = None, credentials: CredentialsType = None, incremental: bool = True, ): """Lister class for Maven repositories. Args: url: main URL of the Maven repository, i.e. url of the base index used to fetch maven artifacts. For Maven central use https://repo1.maven.org/maven2/ index_url: the URL to download the exported text indexes from. Would typically be a local host running the export docker image. See README.md in this directory for more information. instance: Name of maven instance. Defaults to url's network location if unset. incremental: bool, defaults to True. Defines if incremental listing is activated or not. """ self.BASE_URL = url self.INDEX_URL = index_url self.incremental = incremental super().__init__( scheduler=scheduler, credentials=credentials, url=url, instance=instance, ) self.session = requests.Session() self.session.headers.update( { "Accept": "application/json", "User-Agent": USER_AGENT, } ) self.jar_origins: Dict[str, ListedOrigin] = {} self.github_session = GitHubSession( credentials=self.credentials, user_agent=USER_AGENT ) def state_from_dict(self, d: Dict[str, Any]) -> MavenListerState: return MavenListerState(**d) def state_to_dict(self, state: MavenListerState) -> Dict[str, Any]: return asdict(state) @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response: logger.info("Fetching URL %s with params %s", url, params) response = self.session.get(url, params=params) if response.status_code != 200: logger.warning( "Unexpected HTTP status code %s on %s: %s", response.status_code, response.url, response.content, ) response.raise_for_status() return response def get_pages(self) -> 
Iterator[RepoPage]: """Retrieve and parse exported maven indexes to identify all pom files and src archives. """ # Example of returned RepoPage's: # [ # { # "type": "maven", # "url": "https://maven.xwiki.org/..-5.4.2-sources.jar", # "time": 1626109619335, # "gid": "org.xwiki.platform", # "aid": "xwiki-platform-wikistream-events-xwiki", # "version": "5.4.2" # }, # { # "type": "scm", # "url": "scm:git:git://github.com/openengsb/openengsb-framework.git", # "project": "openengsb-framework", # }, # ... # ] # Download the main text index file. logger.info("Downloading computed index from %s.", self.INDEX_URL) assert self.INDEX_URL is not None response = requests.get(self.INDEX_URL, stream=True) if response.status_code != 200: logger.error("Index %s not found, stopping", self.INDEX_URL) response.raise_for_status() # Prepare regexes to parse index exports. # Parse doc id. # Example line: "doc 13" re_doc = re.compile(r"^doc (?P\d+)$") # Parse gid, aid, version, classifier, extension. # Example line: " value al.aldi|sprova4j|0.1.0|sources|jar" re_val = re.compile( r"^\s{4}value (?P[^|]+)\|(?P[^|]+)\|(?P[^|]+)\|" + r"(?P[^|]+)\|(?P[^|]+)$" ) # Parse last modification time. # Example line: " value jar|1626109619335|14316|2|2|0|jar" re_time = re.compile( r"^\s{4}value ([^|]+)\|(?P[^|]+)\|([^|]+)\|([^|]+)\|([^|]+)" + r"\|([^|]+)\|([^|]+)$" ) # Read file line by line and process it out_pom: Dict = {} jar_src: Dict = {} doc_id: int = 0 jar_src["doc"] = None url_src = None iterator = response.iter_lines(chunk_size=1024) for line_bytes in iterator: # Read the index text export and get URLs and SCMs. line = line_bytes.decode(errors="ignore") m_doc = re_doc.match(line) if m_doc is not None: doc_id = int(m_doc.group("doc")) # jar_src["doc"] contains the id of the current document, whatever # its type (scm or jar). 
jar_src["doc"] = doc_id else: m_val = re_val.match(line) if m_val is not None: (gid, aid, version, classifier, ext) = m_val.groups() ext = ext.strip() path = "/".join(gid.split(".")) if classifier == "NA" and ext.lower() == "pom": # If incremental mode, we don't record any line that is # before our last recorded doc id. if ( self.incremental and self.state and self.state.last_seen_pom and self.state.last_seen_pom >= doc_id ): continue url_path = f"{path}/{aid}/{version}/{aid}-{version}.{ext}" url_pom = urljoin( self.BASE_URL, url_path, ) out_pom[url_pom] = doc_id elif ( classifier.lower() == "sources" or ("src" in classifier) ) and ext.lower() in ("zip", "jar"): url_path = ( f"{path}/{aid}/{version}/{aid}-{version}-{classifier}.{ext}" ) url_src = urljoin(self.BASE_URL, url_path) jar_src["gid"] = gid jar_src["aid"] = aid jar_src["version"] = version else: m_time = re_time.match(line) if m_time is not None and url_src is not None: time = m_time.group("mtime") jar_src["time"] = int(time) artifact_metadata_d = { "type": "maven", "url": url_src, **jar_src, } logger.debug( "* Yielding jar %s: %s", url_src, artifact_metadata_d ) yield artifact_metadata_d url_src = None logger.info("Found %s poms.", len(out_pom)) # Now fetch pom files and scan them for scm info. 
logger.info("Fetching poms..") for pom in out_pom: try: response = self.page_request(pom, {}) - project = xmltodict.parse(response.content) - project_d = project.get("project", {}) - scm_d = project_d.get("scm") - if scm_d is not None: - connection = scm_d.get("connection") + parsed_pom = BeautifulSoup(response.content, "xml") + project = parsed_pom.find("project") + if project is None: + continue + scm = project.find("scm") + if scm is not None: + connection = scm.find("connection") if connection is not None: artifact_metadata_d = { "type": "scm", "doc": out_pom[pom], - "url": connection, + "url": connection.text, } logger.debug("* Yielding pom %s: %s", pom, artifact_metadata_d) yield artifact_metadata_d else: logger.debug("No scm.connection in pom %s", pom) else: logger.debug("No scm in pom %s", pom) except requests.HTTPError: logger.warning( "POM info page could not be fetched, skipping project '%s'", pom, ) - except xmltodict.expat.ExpatError as error: - logger.info("Could not parse POM %s XML: %s. Next.", pom, error) + except lxml.etree.Error as error: + logger.info("Could not parse POM %s XML: %s.", pom, error) def get_scm(self, page: RepoPage) -> Optional[ListedOrigin]: """Retrieve scm origin out of the page information. Only called when type of the page is scm. Try and detect an scm/vcs repository. Note that official format is in the form: scm:{type}:git://example.org/{user}/{repo}.git but some projects directly put the repo url (without the "scm:type"), so we have to check against the content to extract the type and url properly. Raises AssertionError when the type of the page is not 'scm' Returns ListedOrigin with proper canonical scm url (for github) if any is found, None otherwise. 
""" assert page["type"] == "scm" visit_type: Optional[str] = None url: Optional[str] = None m_scm = re.match(r"^scm:(?P[^:]+):(?P.*)$", page["url"]) if m_scm is None: return None scm_type = m_scm.group("type") if scm_type and scm_type in SUPPORTED_SCM_TYPES: url = m_scm.group("url") visit_type = scm_type elif page["url"].endswith(".git"): url = page["url"].lstrip("scm:") visit_type = "git" else: return None if url and visit_type == "git": # Non-github urls will be returned as is, github ones will be canonical ones url = self.github_session.get_canonical_url(url) if not url: return None assert visit_type is not None assert self.lister_obj.id is not None return ListedOrigin( lister_id=self.lister_obj.id, url=url, visit_type=visit_type, ) def get_origins_from_page(self, page: RepoPage) -> Iterator[ListedOrigin]: """Convert a page of Maven repositories into a list of ListedOrigins.""" if page["type"] == "scm": listed_origin = self.get_scm(page) if listed_origin: yield listed_origin else: # Origin is gathering source archives: last_update_dt = None last_update_iso = "" try: last_update_seconds = str(page["time"])[:-3] last_update_dt = datetime.fromtimestamp(int(last_update_seconds)) last_update_dt = last_update_dt.astimezone(timezone.utc) except (OverflowError, ValueError): logger.warning("- Failed to convert datetime %s.", last_update_seconds) if last_update_dt: last_update_iso = last_update_dt.isoformat() # Origin URL will target page holding sources for all versions of # an artifactId (package name) inside a groupId (namespace) path = "/".join(page["gid"].split(".")) origin_url = urljoin(self.BASE_URL, f"{path}/{page['aid']}") artifact = { **{k: v for k, v in page.items() if k != "doc"}, "time": last_update_iso, "base_url": self.BASE_URL, } if origin_url not in self.jar_origins: # Create ListedOrigin instance if we did not see that origin yet assert self.lister_obj.id is not None jar_origin = ListedOrigin( lister_id=self.lister_obj.id, url=origin_url, 
visit_type=page["type"], last_update=last_update_dt, extra_loader_arguments={"artifacts": [artifact]}, ) self.jar_origins[origin_url] = jar_origin else: # Update list of source artifacts for that origin otherwise jar_origin = self.jar_origins[origin_url] artifacts = jar_origin.extra_loader_arguments["artifacts"] if artifact not in artifacts: artifacts.append(artifact) if ( jar_origin.last_update and last_update_dt and last_update_dt > jar_origin.last_update ): jar_origin.last_update = last_update_dt if not self.incremental or ( self.state and page["doc"] > self.state.last_seen_doc ): # Yield origin with updated source artifacts, multiple instances of # ListedOrigin for the same origin URL but with different artifacts # list will be sent to the scheduler but it will deduplicate them and # take the latest one to upsert in database yield jar_origin def commit_page(self, page: RepoPage) -> None: """Update currently stored state using the latest listed doc. Note: this is a noop for full listing mode """ if self.incremental and self.state: # We need to differentiate the two state counters according # to the type of origin. if page["type"] == "maven" and page["doc"] > self.state.last_seen_doc: self.state.last_seen_doc = page["doc"] elif page["type"] == "scm" and page["doc"] > self.state.last_seen_pom: self.state.last_seen_doc = page["doc"] self.state.last_seen_pom = page["doc"] def finalize(self) -> None: """Finalize the lister state, set update if any progress has been made. 
Note: this is a noop for full listing mode """ if self.incremental and self.state: last_seen_doc = self.state.last_seen_doc last_seen_pom = self.state.last_seen_pom scheduler_state = self.get_state_from_scheduler() if last_seen_doc and last_seen_pom: if (scheduler_state.last_seen_doc < last_seen_doc) or ( scheduler_state.last_seen_pom < last_seen_pom ): self.updated = True diff --git a/swh/lister/maven/tests/data/https_maven.org/citrus-parent-3.0.7.pom b/swh/lister/maven/tests/data/https_maven.org/citrus-parent-3.0.7.pom new file mode 100644 index 0000000..fa9648b --- /dev/null +++ b/swh/lister/maven/tests/data/https_maven.org/citrus-parent-3.0.7.pom @@ -0,0 +1,769 @@ + + + + org.sonatype.oss + oss-parent + 7 + + 4.0.0 + com.alibaba.citrus + citrus-parent + pom + Citrus Parent Project + 3.0.7 + Another Java-based WEB Framework + http://www.openwebx.org/ + + + The Apache Software License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + repo + + + + scm:git:https://github.com/webx/citrus + scm:git:git@github.com:webx/citrus.git + http://github.com/webx/citrus + + + + baobao + Michael Zhou + yizhi@taobao.com + + architect + developer + + + http://i54.tinypic.com/2jewmjr.jpg + + -6 + + + + 1.5 + GBK + 3.0.7 + 2.5.6.SEC03 + 1.0 + 1.0 + 6.1.22 + ${java.encoding} + + + + spring3 + + 3.0.6.RELEASE + + + + + dist/webx + dist/test + common/expr + common/logconfig + common/util + common/springext + common/generictype + common/asm + common/hessian + common/codegen + test/util + test/webx + service/base + service/dataresolver + service/form + service/resource + service/upload + service/requestcontext + service/pipeline + service/configuration + service/mappingrule + service/moduleloader + service/pull + service/template + service/jsp + service/velocity + service/freemarker + service/uribroker + service/mail + webx/framework + webx/turbine + webx/dev + + + + org.slf4j + slf4j-api + + + org.slf4j + jcl-over-slf4j + + + + + + + + + ${project.groupId} + 
citrus-webx-all + ${webx-version} + + + ${project.groupId} + citrus-test-all + ${webx-version} + test + + + ${project.groupId} + citrus-common-logconfig + ${webx-version} + + + ${project.groupId} + citrus-common-expr + ${webx-version} + + + ${project.groupId} + citrus-common-util + ${webx-version} + + + ${project.groupId} + citrus-common-springext + ${webx-version} + + + ${project.groupId} + citrus-common-generictype + ${webx-version} + + + ${project.groupId} + citrus-common-asm + ${webx-version} + + + ${project.groupId} + citrus-common-hessian + ${webx-version} + + + ${project.groupId} + citrus-common-codegen + ${webx-version} + + + ${project.groupId} + citrus-test-util + ${webx-version} + test + + + ${project.groupId} + citrus-test-webx + ${webx-version} + test + + + ${project.groupId} + citrus-service-base + ${webx-version} + + + ${project.groupId} + citrus-service-dataresolver + ${webx-version} + + + ${project.groupId} + citrus-service-form + ${webx-version} + + + ${project.groupId} + citrus-service-resource + ${webx-version} + + + ${project.groupId} + citrus-service-upload + ${webx-version} + + + ${project.groupId} + citrus-service-requestcontext + ${webx-version} + + + ${project.groupId} + citrus-service-pipeline + ${webx-version} + + + ${project.groupId} + citrus-service-configuration + ${webx-version} + + + ${project.groupId} + citrus-service-mappingrule + ${webx-version} + + + ${project.groupId} + citrus-service-moduleloader + ${webx-version} + + + ${project.groupId} + citrus-service-pull + ${webx-version} + + + ${project.groupId} + citrus-service-template + ${webx-version} + + + ${project.groupId} + citrus-service-jsp + ${webx-version} + + + ${project.groupId} + citrus-service-velocity + ${webx-version} + + + ${project.groupId} + citrus-service-freemarker + ${webx-version} + + + ${project.groupId} + citrus-service-uribroker + ${webx-version} + + + ${project.groupId} + citrus-service-mail + ${webx-version} + + + ${project.groupId} + citrus-webx-framework + 
${webx-version} + + + ${project.groupId} + citrus-webx-turbine + ${webx-version} + + + ${project.groupId} + citrus-webx-dev + ${webx-version} + + + + + + org.slf4j + slf4j-api + 1.6.1 + + + + org.slf4j + jcl-over-slf4j + 1.6.1 + + + + commons-logging + commons-logging + 1.1.1 + provided + + + + ch.qos.logback + logback-classic + 0.9.24 + runtime + + + + org.slf4j + slf4j-log4j12 + 1.6.1 + runtime + + + log4j + log4j + 1.2.16 + runtime + + + + + + junit + junit + 4.8.2 + test + + + org.hamcrest + hamcrest-library + 1.1 + test + + + httpunit + httpunit + 1.7 + test + + + jtidy + jtidy + + + + + rhino + js + 1.7R1 + test + + + nekohtml + nekohtml + 1.9.6 + test + + + xerces + xercesImpl + 2.9.1 + test + + + xml-apis + xml-apis + + + + + xalan + xalan + 2.7.1 + test + + + xml-apis + xml-apis + + + + + org.easymock + easymockclassextension + 3.0 + test + + + org.apache.tomcat + jasper + 6.0.33 + test + + + org.jvnet.mock-javamail + mock-javamail + 1.7 + test + + + oro + oro + 2.0.8 + + + + + + ecs + ecs + 1.4.2 + + + org.apache.commons + commons-jexl + 2.0.1 + + + org.apache.velocity + velocity + 1.6.4 + + + org.freemarker + freemarker + 2.3.16 + + + commons-fileupload + commons-fileupload + 1.2.1 + + + commons-io + commons-io + 1.4 + + + commons-codec + commons-codec + 1.3 + + + org.codehaus.groovy + groovy-all + 1.6.3 + runtime + + + org.apache.ant + ant + + + org.apache.ant + ant-launcher + + + jline + jline + + + + + dom4j + dom4j + 1.6.1 + + + xml-apis + xml-apis + + + + + cglib + cglib-nodep + 2.2 + + + javax.servlet + servlet-api + 2.5 + provided + + + javax.mail + mail + 1.4.1 + provided + + + javax.activation + activation + 1.1 + provided + + + janino + janino + 2.5.10 + test + + + xml-apis + xml-apis + 1.3.04 + + + + + + org.springframework + spring-core + ${spring-version} + + + org.springframework + spring-beans + ${spring-version} + + + org.springframework + spring-aop + ${spring-version} + + + org.springframework + spring-context + ${spring-version} + + + 
org.springframework + spring-context-support + ${spring-version} + + + org.springframework + spring-tx + ${spring-version} + + + org.springframework + spring-jdbc + ${spring-version} + + + org.springframework + spring-orm + ${spring-version} + + + org.springframework + spring-web + ${spring-version} + + + org.springframework + spring-webmvc + ${spring-version} + + + org.springframework + spring-test + ${spring-version} + test + + + + + + + maven-deploy-plugin + false + + false + + + + maven-compiler-plugin + + ${java.version} + ${java.version} + + + + maven-jar-plugin + + + + true + true + + + + + + maven-antrun-plugin + + + compile + + + + + + + + + + + run + + + + + + maven-surefire-plugin + + + **/*Tests.java + + -Xmx256m + + + + maven-eclipse-plugin + + true + + org.eclipse.jdt.launching.JRE_CONTAINER + + + + + maven-source-plugin + + + attach-sources + + jar-no-fork + + + + + + org.mortbay.jetty + maven-jetty-plugin + + citrus + 9999 + + + productionMode + false + + + + + + com.alibaba.citrus.tool + maven-springext-plugin + + + maven-gpg-plugin + + + sign-artifacts + verify + + sign + + + + + + + + + maven-antrun-plugin + 1.6 + + + maven-compiler-plugin + + 2.3.2 + + + maven-jar-plugin + 2.3.2 + + + maven-deploy-plugin + 2.7 + + true + + + + maven-eclipse-plugin + 2.8 + + + maven-shade-plugin + 1.4 + + + maven-source-plugin + 2.1.2 + + + maven-javadoc-plugin + 2.8 + + + maven-surefire-plugin + 2.10 + + + org.mortbay.jetty + maven-jetty-plugin + ${jetty-version} + + + com.alibaba.citrus.tool + maven-springext-plugin + ${springext-plugin-version} + + + maven-gpg-plugin + 1.4 + + + + org.eclipse.m2e + lifecycle-mapping + 1.0.0 + + + + + + org.apache.maven.plugins + maven-antrun-plugin + [1.0,) + + run + + + + + + + + + org.apache.maven.plugins + maven-enforcer-plugin + [1.0,) + + enforce + + + + + + + + + + + + + + \ No newline at end of file diff --git a/swh/lister/maven/tests/test_lister.py b/swh/lister/maven/tests/test_lister.py index b2a88f9..6a75a99 100644 
--- a/swh/lister/maven/tests/test_lister.py +++ b/swh/lister/maven/tests/test_lister.py @@ -1,353 +1,379 @@ # Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from pathlib import Path import iso8601 import pytest import requests from swh.lister.maven.lister import MavenLister MVN_URL = "https://repo1.maven.org/maven2/" # main maven repo url INDEX_URL = "http://indexes/export.fld" # index directory url URL_POM_1 = MVN_URL + "al/aldi/sprova4j/0.1.0/sprova4j-0.1.0.pom" URL_POM_2 = MVN_URL + "al/aldi/sprova4j/0.1.1/sprova4j-0.1.1.pom" URL_POM_3 = MVN_URL + "com/arangodb/arangodb-graphql/1.2/arangodb-graphql-1.2.pom" USER_REPO0 = "aldialimucaj/sprova4j" GIT_REPO_URL0_HTTPS = f"https://github.com/{USER_REPO0}" GIT_REPO_URL0_API = f"https://api.github.com/repos/{USER_REPO0}" LIST_GIT = (GIT_REPO_URL0_HTTPS,) USER_REPO1 = "ArangoDB-Community/arangodb-graphql-java" GIT_REPO_URL1_HTTPS = f"https://github.com/{USER_REPO1}" GIT_REPO_URL1_GIT = f"git://github.com/{USER_REPO1}.git" GIT_REPO_URL1_API = f"https://api.github.com/repos/{USER_REPO1}" LIST_GIT_INCR = (GIT_REPO_URL1_HTTPS,) +USER_REPO2 = "webx/citrus" +GIT_REPO_URL2_HTTPS = f"https://github.com/{USER_REPO2}" +GIT_REPO_URL2_API = f"https://api.github.com/repos/{USER_REPO2}" + LIST_SRC = (MVN_URL + "al/aldi/sprova4j",) LIST_SRC_DATA = ( { "type": "maven", "url": "https://repo1.maven.org/maven2/al/aldi/sprova4j" + "/0.1.0/sprova4j-0.1.0-sources.jar", "time": "2021-07-12T17:06:59+00:00", "gid": "al.aldi", "aid": "sprova4j", "version": "0.1.0", "base_url": MVN_URL, }, { "type": "maven", "url": "https://repo1.maven.org/maven2/al/aldi/sprova4j" + "/0.1.1/sprova4j-0.1.1-sources.jar", "time": "2021-07-12T17:37:05+00:00", "gid": "al.aldi", "aid": "sprova4j", "version": "0.1.1", "base_url": MVN_URL, }, ) @pytest.fixture def 
maven_index_full(datadir) -> bytes: return Path(datadir, "http_indexes", "export_full.fld").read_bytes() @pytest.fixture def maven_index_incr_first(datadir) -> bytes: return Path(datadir, "http_indexes", "export_incr_first.fld").read_bytes() @pytest.fixture def maven_pom_1(datadir) -> bytes: return Path(datadir, "https_maven.org", "sprova4j-0.1.0.pom").read_bytes() @pytest.fixture def maven_index_null_mtime(datadir) -> bytes: return Path(datadir, "http_indexes", "export_null_mtime.fld").read_bytes() @pytest.fixture def maven_pom_1_malformed(datadir) -> bytes: return Path(datadir, "https_maven.org", "sprova4j-0.1.0.malformed.pom").read_bytes() @pytest.fixture def maven_pom_2(datadir) -> bytes: return Path(datadir, "https_maven.org", "sprova4j-0.1.1.pom").read_bytes() @pytest.fixture def maven_pom_3(datadir) -> bytes: return Path(datadir, "https_maven.org", "arangodb-graphql-1.2.pom").read_bytes() +@pytest.fixture +def maven_pom_multi_byte_encoding(datadir) -> bytes: + return Path(datadir, "https_maven.org", "citrus-parent-3.0.7.pom").read_bytes() + + @pytest.fixture def requests_mock(requests_mock): """If github api calls for the configured scm repository, returns its canonical url.""" for url_api, url_html in [ (GIT_REPO_URL0_API, GIT_REPO_URL0_HTTPS), (GIT_REPO_URL1_API, GIT_REPO_URL1_HTTPS), + (GIT_REPO_URL2_API, GIT_REPO_URL2_HTTPS), ]: requests_mock.get( url_api, json={"html_url": url_html}, ) yield requests_mock @pytest.fixture(autouse=True) def network_requests_mock( requests_mock, maven_index_full, maven_pom_1, maven_pom_2, maven_pom_3 ): requests_mock.get(INDEX_URL, content=maven_index_full) requests_mock.get(URL_POM_1, content=maven_pom_1) requests_mock.get(URL_POM_2, content=maven_pom_2) requests_mock.get(URL_POM_3, content=maven_pom_3) def test_maven_full_listing(swh_scheduler): """Covers full listing of multiple pages, checking page results and listed origins, statelessness.""" # Run the lister. 
lister = MavenLister( scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL, incremental=False, ) stats = lister.run() # Start test checks. assert stats.pages == 5 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results origin_urls = [origin.url for origin in scheduler_origins] # 3 git origins + 1 maven origin with 2 releases (one per jar) assert len(origin_urls) == 3 assert sorted(origin_urls) == sorted(LIST_GIT + LIST_GIT_INCR + LIST_SRC) for origin in scheduler_origins: if origin.visit_type == "maven": for src in LIST_SRC_DATA: last_update_src = iso8601.parse_date(src["time"]) assert last_update_src <= origin.last_update assert origin.extra_loader_arguments["artifacts"] == list(LIST_SRC_DATA) scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None assert scheduler_state.last_seen_doc == -1 assert scheduler_state.last_seen_pom == -1 def test_maven_full_listing_malformed( swh_scheduler, requests_mock, maven_pom_1_malformed, ): """Covers full listing of multiple pages, checking page results with a malformed scm entry in pom.""" lister = MavenLister( scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL, incremental=False, ) # Set up test. requests_mock.get(URL_POM_1, content=maven_pom_1_malformed) # Then run the lister. stats = lister.run() # Start test checks. 
assert stats.pages == 5 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results origin_urls = [origin.url for origin in scheduler_origins] # 2 git origins + 1 maven origin with 2 releases (one per jar) assert len(origin_urls) == 3 assert sorted(origin_urls) == sorted(LIST_GIT + LIST_GIT_INCR + LIST_SRC) for origin in scheduler_origins: if origin.visit_type == "maven": for src in LIST_SRC_DATA: last_update_src = iso8601.parse_date(src["time"]) assert last_update_src <= origin.last_update assert origin.extra_loader_arguments["artifacts"] == list(LIST_SRC_DATA) scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None assert scheduler_state.last_seen_doc == -1 assert scheduler_state.last_seen_pom == -1 def test_maven_incremental_listing( swh_scheduler, requests_mock, maven_index_full, maven_index_incr_first, ): """Covers full listing of multiple pages, checking page results and listed origins, with a second updated run for statefulness.""" lister = MavenLister( scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL, incremental=True, ) # Set up test. requests_mock.get(INDEX_URL, content=maven_index_incr_first) # Then run the lister. stats = lister.run() # Start test checks. 
assert lister.incremental assert lister.updated assert stats.pages == 2 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results origin_urls = [origin.url for origin in scheduler_origins] # 1 git origins + 1 maven origin with 1 release (one per jar) assert len(origin_urls) == 2 assert sorted(origin_urls) == sorted(LIST_GIT + LIST_SRC) for origin in scheduler_origins: if origin.visit_type == "maven": last_update_src = iso8601.parse_date(LIST_SRC_DATA[0]["time"]) assert last_update_src == origin.last_update assert origin.extra_loader_arguments["artifacts"] == [LIST_SRC_DATA[0]] # Second execution of the lister, incremental mode lister = MavenLister( scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL, incremental=True, ) scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None assert scheduler_state.last_seen_doc == 1 assert scheduler_state.last_seen_pom == 1 # Set up test. requests_mock.get(INDEX_URL, content=maven_index_full) # Then run the lister. stats = lister.run() # Start test checks. 
assert lister.incremental assert lister.updated assert stats.pages == 4 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results origin_urls = [origin.url for origin in scheduler_origins] assert sorted(origin_urls) == sorted(LIST_SRC + LIST_GIT + LIST_GIT_INCR) for origin in scheduler_origins: if origin.visit_type == "maven": for src in LIST_SRC_DATA: last_update_src = iso8601.parse_date(src["time"]) assert last_update_src <= origin.last_update assert origin.extra_loader_arguments["artifacts"] == list(LIST_SRC_DATA) scheduler_state = lister.get_state_from_scheduler() assert scheduler_state is not None assert scheduler_state.last_seen_doc == 4 assert scheduler_state.last_seen_pom == 4 @pytest.mark.parametrize("http_code", [400, 404, 500, 502]) def test_maven_list_http_error_on_index_read(swh_scheduler, requests_mock, http_code): """should stop listing if the lister fails to retrieve the main index url.""" lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL) requests_mock.get(INDEX_URL, status_code=http_code) with pytest.raises(requests.HTTPError): # listing cannot continues so stop lister.run() scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert len(scheduler_origins) == 0 @pytest.mark.parametrize("http_code", [400, 404, 500, 502]) def test_maven_list_http_error_artifacts( swh_scheduler, requests_mock, http_code, ): """should continue listing when failing to retrieve artifacts.""" # Test failure of artefacts retrieval. requests_mock.get(URL_POM_1, status_code=http_code) lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL) # on artifacts though, that raises but continue listing lister.run() # If the maven_index_full step succeeded but not the get_pom step, # then we get only one maven-jar origin and one git origin. 
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert len(scheduler_origins) == 2 def test_maven_lister_null_mtime(swh_scheduler, requests_mock, maven_index_null_mtime): requests_mock.get(INDEX_URL, content=maven_index_null_mtime) # Run the lister. lister = MavenLister( scheduler=swh_scheduler, url=MVN_URL, instance="maven.org", index_url=INDEX_URL, incremental=False, ) stats = lister.run() # Start test checks. assert stats.pages == 1 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert len(scheduler_origins) == 1 assert scheduler_origins[0].last_update is None def test_maven_list_pom_bad_encoding(swh_scheduler, requests_mock, maven_pom_1): """should continue listing when failing to decode pom file.""" # Test failure of pom parsing by reencoding a UTF-8 pom file to a not expected one requests_mock.get(URL_POM_1, content=maven_pom_1.decode("utf-8").encode("utf-32")) lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL) lister.run() # If the maven_index_full step succeeded but not the pom parsing step, # then we get only one maven-jar origin and one git origin. 
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert len(scheduler_origins) == 2 + + +def test_maven_list_pom_multi_byte_encoding( + swh_scheduler, requests_mock, maven_pom_multi_byte_encoding +): + """should parse POM file with multi-byte encoding.""" + + # replace pom file with a multi-byte encoding one + requests_mock.get(URL_POM_1, content=maven_pom_multi_byte_encoding) + + lister = MavenLister(scheduler=swh_scheduler, url=MVN_URL, index_url=INDEX_URL) + + lister.run() + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + assert len(scheduler_origins) == 3 diff --git a/swh/lister/pubdev/__init__.py b/swh/lister/pubdev/__init__.py new file mode 100644 index 0000000..63bde65 --- /dev/null +++ b/swh/lister/pubdev/__init__.py @@ -0,0 +1,71 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +""" +Pub.dev lister +============== + +The Pubdev lister lists origins from `pub.dev`_, the `Dart`_ and `Flutter`_ package registry. + +The registry provides an `http api`_ from which the lister retrieves package names. + +As of August 2022 `pub.dev`_ lists 33535 package names. + +Origins retrieving strategy +--------------------------- + +To get a list of all package names we call the `https://pub.dev/api/package-names` endpoint. +There is no other way for discovery (no archive index, no database dump, no dvcs repository). + +Page listing +------------ + +There is only one page, which lists all origin URLs based +on `https://pub.dev/packages/{pkgname}`. +The `https://pub.dev/api/packages/{pkgname}` http api endpoint returns complete information +about the package versions (name, version, author, description, release date). + +Origins from page +----------------- + +The lister yields all origin URLs from that single page.
+ +Running tests +------------- + +Activate the virtualenv and run from within swh-lister directory:: + + pytest -s -vv --log-cli-level=DEBUG swh/lister/pubdev/tests + +Testing with Docker +------------------- + +Change directory to swh/docker then launch the docker environment:: + + docker-compose up -d + +Then connect to the lister:: + + docker exec -it docker_swh-lister_1 bash + +And run the lister (The output of this listing results in “oneshot” tasks in the scheduler):: + + swh lister run -l pubdev + +.. _pub.dev: https://pub.dev +.. _Dart: https://dart.dev +.. _Flutter: https://flutter.dev +.. _http api: https://pub.dev/help/api +""" + + +def register(): + from .lister import PubDevLister + + return { + "lister": PubDevLister, + "task_modules": ["%s.tasks" % __name__], + } diff --git a/swh/lister/pubdev/lister.py b/swh/lister/pubdev/lister.py new file mode 100644 index 0000000..8abb582 --- /dev/null +++ b/swh/lister/pubdev/lister.py @@ -0,0 +1,125 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information +import logging +from typing import Any, Dict, Iterator, List, Optional + +import iso8601 +import requests +from requests.exceptions import HTTPError +from tenacity.before_sleep import before_sleep_log + +from swh.lister.utils import throttling_retry +from swh.scheduler.interface import SchedulerInterface +from swh.scheduler.model import ListedOrigin + +from .. import __version__ +from ..pattern import CredentialsType, StatelessLister + +# https://github.com/dart-lang/pub/blob/master/doc/repository-spec-v2.md#metadata-headers +USER_AGENT = ( + f"Software Heritage PubDev Lister v{__version__} " + "(+https://www.softwareheritage.org/contact)" +) + +logger = logging.getLogger(__name__) + +# Aliasing the page results returned by `get_pages` method from the lister. 
+PubDevListerPage = List[str] + + +class PubDevLister(StatelessLister[PubDevListerPage]): + """List pub.dev (Dart, Flutter) origins.""" + + LISTER_NAME = "pubdev" + VISIT_TYPE = "pubdev" + INSTANCE = "pubdev" + + BASE_URL = "https://pub.dev/" + PACKAGE_NAMES_URL_PATTERN = "{base_url}api/package-names" + PACKAGE_INFO_URL_PATTERN = "{base_url}api/packages/{pkgname}" + ORIGIN_URL_PATTERN = "{base_url}packages/{pkgname}" + + def __init__( + self, + scheduler: SchedulerInterface, + credentials: Optional[CredentialsType] = None, + ): + super().__init__( + scheduler=scheduler, + credentials=credentials, + instance=self.INSTANCE, + url=self.BASE_URL, + ) + self.session = requests.Session() + self.session.headers.update( + { + "Accept": "application/json", + "User-Agent": USER_AGENT, + } + ) + + @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) + def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response: + + logger.info("Fetching URL %s with params %s", url, params) + + response = self.session.get(url, params=params) + if response.status_code != 200: + logger.warning( + "Unexpected HTTP status code %s on %s: %s", + response.status_code, + response.url, + response.content, + ) + response.raise_for_status() + + return response + + def get_pages(self) -> Iterator[PubDevListerPage]: + """Yield the single page listing all package names. + + It uses the api provided by https://pub.dev/api/ to find Dart and Flutter package + origins. + + It calls the "{base_url}api/package-names" endpoint to retrieve a sorted list + of all package names.
+ + There is only one page, which lists all origin URLs based on "{base_url}packages/{pkgname}" + """ + response = self.page_request( + url=self.PACKAGE_NAMES_URL_PATTERN.format(base_url=self.url), params={} + ) + yield response.json()["packages"] + + def get_origins_from_page(self, page: PubDevListerPage) -> Iterator[ListedOrigin]: + """Iterate on all package names of a page and yield ListedOrigin instances.""" + assert self.lister_obj.id is not None + + for pkgname in page: + package_info_url = self.PACKAGE_INFO_URL_PATTERN.format( + base_url=self.url, pkgname=pkgname + ) + try: + response = self.page_request(url=package_info_url, params={}) + except HTTPError: + logger.warning( + "Failed to fetch metadata for package %s, skipping it from listing.", + pkgname, + ) + continue + package_metadata = response.json() + package_versions = package_metadata["versions"] + last_published = max( + package_version["published"] for package_version in package_versions + ) + origin_url = self.ORIGIN_URL_PATTERN.format( + base_url=self.url, pkgname=pkgname + ) + yield ListedOrigin( + lister_id=self.lister_obj.id, + visit_type=self.VISIT_TYPE, + url=origin_url, + last_update=iso8601.parse_date(last_published), + ) diff --git a/swh/lister/pubdev/tasks.py b/swh/lister/pubdev/tasks.py new file mode 100644 index 0000000..d0f23ab --- /dev/null +++ b/swh/lister/pubdev/tasks.py @@ -0,0 +1,19 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from celery import shared_task + +from swh.lister.pubdev.lister import PubDevLister + + +@shared_task(name=__name__ + ".PubDevListerTask") +def list_pubdev(**lister_args): + """Lister task for pub.dev (Dart, Flutter) registry""" + return PubDevLister.from_configfile(**lister_args).run().dict() + + +@shared_task(name=__name__ + ".ping") +def _ping(): + return "OK" diff
--git a/swh/lister/pubdev/tests/__init__.py b/swh/lister/pubdev/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/swh/lister/pubdev/tests/data/https_pub.dev/api_package-names b/swh/lister/pubdev/tests/data/https_pub.dev/api_package-names new file mode 100644 index 0000000..f16fc53 --- /dev/null +++ b/swh/lister/pubdev/tests/data/https_pub.dev/api_package-names @@ -0,0 +1,7 @@ +{ + "packages": [ + "Autolinker", + "Babylon" + ], + "nextUrl": null +} \ No newline at end of file diff --git a/swh/lister/pubdev/tests/data/https_pub.dev/api_packages_Autolinker b/swh/lister/pubdev/tests/data/https_pub.dev/api_packages_Autolinker new file mode 100644 index 0000000..5d19592 --- /dev/null +++ b/swh/lister/pubdev/tests/data/https_pub.dev/api_packages_Autolinker @@ -0,0 +1,44 @@ +{ + "name": "Autolinker", + "latest": { + "version": "0.1.1", + "pubspec": { + "version": "0.1.1", + "homepage": "https://github.com/hackcave", + "description": "Port of Autolinker.js to dart", + "name": "Autolinker", + "author": "hackcave " + }, + "archive_url": "https://pub.dartlang.org/packages/Autolinker/versions/0.1.1.tar.gz", + "archive_sha256": "0a5209a2d5a292a26fc65d7edb430163f209a7c7c24ba4f301676f1afd79fa3f", + "published": "2014-12-24T22:34:02.534090Z" + }, + "versions": [ + { + "version": "0.1.0", + "pubspec": { + "version": "0.1.0", + "homepage": "https://github.com/hackcave", + "description": "Port of Autolinker.js to dart", + "name": "Autolinker", + "author": "hackcave " + }, + "archive_url": "https://pub.dartlang.org/packages/Autolinker/versions/0.1.0.tar.gz", + "archive_sha256": "717b30e27311c775293d4795ce33d15cedb5e5d21fa140f2cb46b30f3e969041", + "published": "2014-12-24T21:16:03.118270Z" + }, + { + "version": "0.1.1", + "pubspec": { + "version": "0.1.1", + "homepage": "https://github.com/hackcave", + "description": "Port of Autolinker.js to dart", + "name": "Autolinker", + "author": "hackcave " + }, + "archive_url": 
"https://pub.dartlang.org/packages/Autolinker/versions/0.1.1.tar.gz", + "archive_sha256": "0a5209a2d5a292a26fc65d7edb430163f209a7c7c24ba4f301676f1afd79fa3f", + "published": "2014-12-24T22:34:02.534090Z" + } + ] +} \ No newline at end of file diff --git a/swh/lister/pubdev/tests/data/https_pub.dev/api_packages_Babylon b/swh/lister/pubdev/tests/data/https_pub.dev/api_packages_Babylon new file mode 100644 index 0000000..770d7ee --- /dev/null +++ b/swh/lister/pubdev/tests/data/https_pub.dev/api_packages_Babylon @@ -0,0 +1,51 @@ +{ + "name": "Babylon", + "latest": { + "version": "0.0.3", + "pubspec": { + "version": "0.0.3", + "name": "Babylon", + "dependencies": { + "js": ">=0.6.0", + "browser": ">=0.10.0+2" + }, + "author": "Cedric Krause ", + "description": "A starting point for Dart libraries or applications.", + "homepage": "https://www.cedware.com", + "environment": { + "sdk": ">=1.0.0 <2.0.0" + }, + "dev_dependencies": { + "test": ">=0.12.0 <0.13.0" + } + }, + "archive_url": "https://pub.dartlang.org/packages/Babylon/versions/0.0.3.tar.gz", + "archive_sha256": "a18166c8082d795f22c38270b7fed0c306d5cb59fe390ce3a34c300770c4a8b3", + "published": "2016-06-01T19:15:38.052Z" + }, + "versions": [ + { + "version": "0.0.3", + "pubspec": { + "version": "0.0.3", + "name": "Babylon", + "dependencies": { + "js": ">=0.6.0", + "browser": ">=0.10.0+2" + }, + "author": "Cedric Krause ", + "description": "A starting point for Dart libraries or applications.", + "homepage": "https://www.cedware.com", + "environment": { + "sdk": ">=1.0.0 <2.0.0" + }, + "dev_dependencies": { + "test": ">=0.12.0 <0.13.0" + } + }, + "archive_url": "https://pub.dartlang.org/packages/Babylon/versions/0.0.3.tar.gz", + "archive_sha256": "a18166c8082d795f22c38270b7fed0c306d5cb59fe390ce3a34c300770c4a8b3", + "published": "2016-06-01T19:15:38.052Z" + } + ] +} \ No newline at end of file diff --git a/swh/lister/pubdev/tests/test_lister.py b/swh/lister/pubdev/tests/test_lister.py new file mode 100644 index 
0000000..ac2be14 --- /dev/null +++ b/swh/lister/pubdev/tests/test_lister.py @@ -0,0 +1,49 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from swh.lister.pubdev.lister import USER_AGENT, PubDevLister + +expected_origins = { + "https://pub.dev/packages/Autolinker", + "https://pub.dev/packages/Babylon", +} + + +def test_pubdev_lister(datadir, requests_mock_datadir, swh_scheduler): + lister = PubDevLister(scheduler=swh_scheduler) + res = lister.run() + + assert res.pages == 1 + assert res.origins == 2 + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + + assert len(scheduler_origins) == len(expected_origins) + + for origin in scheduler_origins: + assert origin.visit_type == "pubdev" + assert origin.url in expected_origins + assert origin.last_update is not None + + +def _match_request(request): + return request.headers.get("User-Agent") == USER_AGENT + + +def test_pubdev_lister_skip_package( + datadir, requests_mock_datadir, swh_scheduler, requests_mock +): + + requests_mock.get( + "https://pub.dev/api/packages/Autolinker", + status_code=404, + additional_matcher=_match_request, + ) + + lister = PubDevLister(scheduler=swh_scheduler) + res = lister.run() + + assert res.pages == 1 + assert res.origins == 1 diff --git a/swh/lister/pubdev/tests/test_tasks.py b/swh/lister/pubdev/tests/test_tasks.py new file mode 100644 index 0000000..d9ab3f8 --- /dev/null +++ b/swh/lister/pubdev/tests/test_tasks.py @@ -0,0 +1,31 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from swh.lister.pattern import ListerStats + + +def 
test_pubdev_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker): + res = swh_scheduler_celery_app.send_task("swh.lister.pubdev.tasks.ping") + assert res + res.wait() + assert res.successful() + assert res.result == "OK" + + +def test_pubdev_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker): + # setup the mocked PubDevLister + lister = mocker.patch("swh.lister.pubdev.tasks.PubDevLister") + lister.from_configfile.return_value = lister + stats = ListerStats(pages=42, origins=42) + lister.run.return_value = stats + + res = swh_scheduler_celery_app.send_task("swh.lister.pubdev.tasks.PubDevListerTask") + assert res + res.wait() + assert res.successful() + assert res.result == stats.dict() + + lister.from_configfile.assert_called_once_with() + lister.run.assert_called_once_with() diff --git a/swh/lister/pypi/tests/test_lister.py b/swh/lister/pypi/tests/test_lister.py index fefb01f..a6dac88 100644 --- a/swh/lister/pypi/tests/test_lister.py +++ b/swh/lister/pypi/tests/test_lister.py @@ -1,259 +1,252 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import defaultdict from datetime import datetime, timezone from typing import List import pytest from swh.lister.pypi.lister import ChangelogEntry, PyPILister, pypi_url from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin def check_listed_origins(lister_urls: List[str], scheduler_origins: List[ListedOrigin]): """Asserts that the two collections have the same origin URLs""" - - sorted_lister_urls = list(sorted(lister_urls)) - sorted_scheduler_origins = list(sorted(scheduler_origins)) - - assert len(sorted_lister_urls) == len(sorted_scheduler_origins) - - for l_url, s_origin in zip(sorted_lister_urls, sorted_scheduler_origins): - assert l_url 
== s_origin.url + assert set(lister_urls) == {origin.url for origin in scheduler_origins} @pytest.mark.parametrize( "credentials, expected_credentials", [ (None, []), ({"key": "value"}, []), ( {"pypi": {"pypi": [{"username": "user", "password": "pass"}]}}, [{"username": "user", "password": "pass"}], ), ], ) def test_lister_pypi_instantiation_with_credentials( credentials, expected_credentials, swh_scheduler ): lister = PyPILister(swh_scheduler, credentials=credentials) # Credentials are allowed in constructor assert lister.credentials == expected_credentials def test_lister_pypi_from_configfile(swh_scheduler_config, mocker): load_from_envvar = mocker.patch("swh.lister.pattern.load_from_envvar") load_from_envvar.return_value = { "scheduler": {"cls": "local", **swh_scheduler_config}, "credentials": {}, } lister = PyPILister.from_configfile() assert lister.scheduler is not None assert lister.credentials is not None def to_serial(changelog_entry: ChangelogEntry) -> int: """Helper utility to read the serial entry in the tuple Args: changelog_entry: Changelog entry to read data from Returns: The serial from the entry """ return changelog_entry[4] def configure_scheduler_state( scheduler: SchedulerInterface, data: List[ChangelogEntry] ): """Allows to pre configure a last serial state for the lister consistent with the test data set (the last_serial will be something inferior than the most minimal serial in the data set). 
Args: scheduler: The actual scheduler instance used during test data: The actual dataset used during test """ # Compute the lowest serial to make it a minimum state to store in the scheduler lowest_serial = min(map(to_serial, data)) # We'll need to configure the scheduler's state lister_obj = scheduler.get_or_create_lister( name=PyPILister.LISTER_NAME, instance_name=PyPILister.INSTANCE ) lister_obj.current_state = {"last_serial": lowest_serial - 10} scheduler.update_lister(lister_obj) @pytest.fixture def mock_pypi_xmlrpc(mocker, swh_scheduler): """This setups a lister so it can actually fake the call to the rpc service executed during an incremental listing. To retrieve or update the faked data, open a python3 toplevel and execute the following: .. code:: python from datetime import timezone, datetime, timedelta from xmlrpc.client import ServerProxy from swh.scheduler.utils import utcnow RPC_URL = "https://pypi.org/pypi" cli = ServerProxy(RPC_URL) last_serial = cli.changelog_last_serial() # 10854808 last_state_serial = 2168587 results = cli.changelog_since_serial(last_state_serial) Returns: the following Tuple[serial, List[PackageUpdate], MagicMock, MagicMock] type. 
""" data = [ ["wordsmith", None, 1465998124, "add Owner DoublePlusAwks", 2168628], ["wordsmith", "0.1", 1465998123, "new release", 2168629], ["wordsmith", "0.1", 1465998131, "update classifiers", 2168630], [ "UFx", "1.0", 1465998207, "update author_email, home_page, summary, description", 2168631, ], ["UFx", "1.0", 1465998236, "remove file UFx-1.0.tar.gz", 2168632], ["wordsmith", "0.1", 1465998309, "update classifiers", 2168633], [ "wordsmith", "0.1", 1465998406, "update summary, description, classifiers", 2168634, ], ["property-manager", "2.0", 1465998436, "new release", 2168635], [ "property-manager", "2.0", 1465998439, "add source file property-manager-2.0.tar.gz", 2168636, ], ["numtest", "2.0.0", 1465998446, "new release", 2168637], ["property-manager", "2.1", 1465998468, "new release", 2168638], [ "property-manager", "2.1", 1465998472, "add source file property-manager-2.1.tar.gz", 2168639, ], ["kafka-utils", "0.2.0", 1465998477, "new release", 2168640], [ "kafka-utils", "0.2.0", 1465998480, "add source file kafka-utils-0.2.0.tar.gz", 2168641, ], ["numtest", "2.0.1", 1465998520, "new release", 2168642], ["coala-bears", "0.3.0.dev20160615134909", 1465998552, "new release", 2168643], [ "coala-bears", "0.3.0.dev20160615134909", 1465998556, "add py3 file coala_bears-0.3.0.dev20160615134909-py3-none-any.whl", 2168644, ], ["django_sphinxsearch", "0.4.0", 1465998571, "new release", 2168645], [ "django_sphinxsearch", "0.4.0", 1465998573, "add source file django_sphinxsearch-0.4.0.tar.gz", 2168646, ], [ "coala-bears", "0.3.0.dev20160615134909", 1465998589, "add source file coala-bears-0.3.0.dev20160615134909.tar.gz", 2168647, ], ] highest_serial = min(map(to_serial, data)) def sleep(seconds): pass mocker.patch("swh.lister.pypi.lister.sleep").return_value = sleep class FakeServerProxy: """Fake Server Proxy""" def changelog_last_serial(self): return highest_serial def changelog_since_serial(self, serial): return data mock_serverproxy = 
mocker.patch("swh.lister.pypi.lister.ServerProxy") mock_serverproxy.return_value = FakeServerProxy() return highest_serial, data, mock_serverproxy @pytest.mark.parametrize("configure_state", [True, False]) def test_lister_pypi_run(mock_pypi_xmlrpc, swh_scheduler, configure_state): highest_serial, data, mock_serverproxy = mock_pypi_xmlrpc if configure_state: configure_scheduler_state(swh_scheduler, data) updated_packages = defaultdict(list) for [package, _, release_date, _, _] in data: updated_packages[package].append(release_date) assert len(updated_packages) > 0 expected_last_updates = { pypi_url(package): datetime.fromtimestamp(max(releases)).replace( tzinfo=timezone.utc ) for package, releases in updated_packages.items() } expected_pypi_urls = [pypi_url(package_name) for package_name in updated_packages] lister = PyPILister(scheduler=swh_scheduler) stats = lister.run() assert mock_serverproxy.called assert stats.pages == 1 assert stats.origins == len(updated_packages) scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert len(scheduler_origins) == stats.origins check_listed_origins(expected_pypi_urls, scheduler_origins) actual_scheduler_state = lister.get_state_from_scheduler() # This new visit updated the state to the new one assert actual_scheduler_state.last_serial == highest_serial for listed_origin in scheduler_origins: assert listed_origin.last_update is not None assert listed_origin.last_update == expected_last_updates[listed_origin.url] def test__if_rate_limited(): # TODO pass diff --git a/swh/lister/tests/test_cli.py b/swh/lister/tests/test_cli.py index dfaf72b..ed0f34e 100644 --- a/swh/lister/tests/test_cli.py +++ b/swh/lister/tests/test_cli.py @@ -1,55 +1,59 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import 
pytest from swh.lister.cli import SUPPORTED_LISTERS, get_lister lister_args = { "cgit": { "url": "https://git.eclipse.org/c/", }, "phabricator": { "instance": "softwareheritage", "url": "https://forge.softwareheritage.org/api/diffusion.repository.search", "api_token": "bogus", }, "gitea": { "url": "https://try.gitea.io/api/v1/", }, "tuleap": { "url": "https://tuleap.net", }, "gitlab": { "url": "https://gitlab.ow2.org/api/v4", "instance": "ow2", }, "opam": {"url": "https://opam.ocaml.org", "instance": "opam"}, "maven": { "url": "https://repo1.maven.org/maven2/", "index_url": "http://indexes/export.fld", }, + "gogs": { + "url": "https://try.gogs.io/", + "api_token": "secret", + }, } def test_get_lister_wrong_input(): """Unsupported lister should raise""" with pytest.raises(ValueError) as e: get_lister("unknown", "db-url") assert "Invalid lister" in str(e.value) def test_get_lister(swh_scheduler_config): """Instantiating a supported lister should be ok""" # Drop launchpad lister from the lister to check, its test setup is more involved # than the other listers and it's not currently done here for lister_name in SUPPORTED_LISTERS: lst = get_lister( lister_name, scheduler={"cls": "local", **swh_scheduler_config}, **lister_args.get(lister_name, {}), ) assert hasattr(lst, "run") diff --git a/swh/lister/tuleap/tests/test_lister.py b/swh/lister/tuleap/tests/test_lister.py index 5e74d35..16d0c7a 100644 --- a/swh/lister/tuleap/tests/test_lister.py +++ b/swh/lister/tuleap/tests/test_lister.py @@ -1,171 +1,165 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json from pathlib import Path from typing import Dict, List, Tuple import pytest import requests from swh.lister.tuleap.lister import RepoPage, TuleapLister from swh.scheduler.model import ListedOrigin TULEAP_URL = 
"https://tuleap.net/" TULEAP_PROJECTS_URL = TULEAP_URL + "api/projects/" TULEAP_REPO_1_URL = TULEAP_URL + "api/projects/685/git" # manjaromemodoc TULEAP_REPO_2_URL = TULEAP_URL + "api/projects/309/git" # myaurora TULEAP_REPO_3_URL = TULEAP_URL + "api/projects/1080/git" # tuleap cleanup module GIT_REPOS = ( "https://tuleap.net/plugins/git/manjaromemodoc/manjaro-memo-documentation.git", "https://tuleap.net/plugins/git/myaurora/myaurora.git", ) @pytest.fixture def tuleap_projects(datadir) -> Tuple[str, Dict[str, str], List[str]]: text = Path(datadir, "https_tuleap.net", "projects").read_text() headers = { "X-PAGINATION-LIMIT-MAX": "50", "X-PAGINATION-LIMIT": "10", "X-PAGINATION-SIZE": "2", } repo_json = json.loads(text) projects = [p["shortname"] for p in repo_json] return text, headers, projects @pytest.fixture def tuleap_repo_1(datadir) -> Tuple[str, Dict[str, str], List[RepoPage], List[str]]: text = Path(datadir, "https_tuleap.net", "repo_1").read_text() headers = { "X-PAGINATION-LIMIT-MAX": "50", "X-PAGINATION-LIMIT": "10", "X-PAGINATION-SIZE": "1", } reps = json.loads(text) page_results = [] for r in reps["repositories"]: page_results.append( TuleapLister.results_simplified(url=TULEAP_URL, repo_type="git", repo=r) ) origin_urls = [r["uri"] for r in page_results] return text, headers, page_results, origin_urls @pytest.fixture def tuleap_repo_2(datadir) -> Tuple[str, Dict[str, str], List[RepoPage], List[str]]: text = Path(datadir, "https_tuleap.net", "repo_2").read_text() headers = { "X-PAGINATION-LIMIT-MAX": "50", "X-PAGINATION-LIMIT": "10", "X-PAGINATION-SIZE": "1", } reps = json.loads(text) page_results = [] for r in reps["repositories"]: page_results.append( TuleapLister.results_simplified(url=TULEAP_URL, repo_type="git", repo=r) ) origin_urls = [r["uri"] for r in page_results] return text, headers, page_results, origin_urls @pytest.fixture def tuleap_repo_3(datadir) -> Tuple[str, Dict[str, str], List[RepoPage], List[str]]: text = Path(datadir, 
"https_tuleap.net", "repo_3").read_text() headers = { "X-PAGINATION-LIMIT-MAX": "50", "X-PAGINATION-LIMIT": "10", "X-PAGINATION-SIZE": "0", } reps = json.loads(text) page_results = [] for r in reps["repositories"]: page_results.append( TuleapLister.results_simplified(url=TULEAP_URL, repo_type="git", repo=r) ) origin_urls = [r["uri"] for r in page_results] return text, headers, page_results, origin_urls def check_listed_origins(lister_urls: List[str], scheduler_origins: List[ListedOrigin]): """Asserts that the two collections have the same origin URLs. Does not test last_update.""" - sorted_lister_urls = list(sorted(lister_urls)) - sorted_scheduler_origins = list(sorted(scheduler_origins)) - - assert len(sorted_lister_urls) == len(sorted_scheduler_origins) - - for l_url, s_origin in zip(sorted_lister_urls, sorted_scheduler_origins): - assert l_url == s_origin.url + assert set(lister_urls) == {origin.url for origin in scheduler_origins} def test_tuleap_full_listing( swh_scheduler, requests_mock, mocker, tuleap_projects, tuleap_repo_1, tuleap_repo_2, tuleap_repo_3, ): """Covers full listing of multiple pages, rate-limit, page size (required for test), checking page results and listed origins, statelessness.""" lister = TuleapLister( scheduler=swh_scheduler, url=TULEAP_URL, instance="tuleap.net" ) p_text, p_headers, p_projects = tuleap_projects r1_text, r1_headers, r1_result, r1_origin_urls = tuleap_repo_1 r2_text, r2_headers, r2_result, r2_origin_urls = tuleap_repo_2 r3_text, r3_headers, r3_result, r3_origin_urls = tuleap_repo_3 requests_mock.get(TULEAP_PROJECTS_URL, text=p_text, headers=p_headers) requests_mock.get(TULEAP_REPO_1_URL, text=r1_text, headers=r1_headers) requests_mock.get( TULEAP_REPO_2_URL, [ {"status_code": requests.codes.too_many_requests}, {"text": r2_text, "headers": r2_headers}, ], ) requests_mock.get(TULEAP_REPO_3_URL, text=r3_text, headers=r3_headers) # end test setup stats = lister.run() # start test checks assert stats.pages == 2 assert 
stats.origins == 2 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results check_listed_origins( r1_origin_urls + r2_origin_urls + r3_origin_urls, scheduler_origins ) check_listed_origins(GIT_REPOS, scheduler_origins) assert lister.get_state_from_scheduler() is None @pytest.mark.parametrize("http_code", [400, 500, 502]) def test_tuleap_list_http_error(swh_scheduler, requests_mock, http_code): """Test handling of some HTTP errors commonly encountered""" lister = TuleapLister(scheduler=swh_scheduler, url=TULEAP_URL) requests_mock.get(TULEAP_PROJECTS_URL, status_code=http_code) with pytest.raises(requests.HTTPError): lister.run() scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert len(scheduler_origins) == 0