diff --git a/PKG-INFO b/PKG-INFO index 1462b4d..06e2a38 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,52 +1,52 @@ Metadata-Version: 2.1 Name: swh.loader.core -Version: 4.1.0 +Version: 4.2.0 Summary: Software Heritage Base Loader Home-page: https://forge.softwareheritage.org/diffusion/DLDBASE Author: Software Heritage developers Author-email: swh-devel@inria.fr Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-loader-core Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-loader-core/ Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE License-File: AUTHORS Software Heritage - Loader foundations ====================================== The Software Heritage Loader Core provides low-level loading utilities and helpers used by :term:`loaders <loader>`. The main entry points are the classes: - :class:`swh.loader.core.loader.BaseLoader` for loaders (e.g. svn) - :class:`swh.loader.core.loader.DVCSLoader` for DVCS loaders (e.g. hg, git, ...) - :class:`swh.loader.package.loader.PackageLoader` for Package loaders (e.g. PyPI, Npm, ...) Package loaders --------------- This package also implements many package loaders directly, out of convenience, as they usually are quite similar and each fits in a single file. They all roughly follow these steps, explained in the :py:meth:`swh.loader.package.loader.PackageLoader.load` documentation. See the :ref:`package-loader-tutorial` for details. VCS loaders ----------- Unlike package loaders, VCS loaders remain in separate packages, as they often need more advanced conversions and very VCS-specific operations.
This usually involves getting the branches of a repository and recursively loading revisions in the history (and directory trees in these revisions), until a known revision is found. diff --git a/swh.loader.core.egg-info/PKG-INFO b/swh.loader.core.egg-info/PKG-INFO index 1462b4d..06e2a38 100644 --- a/swh.loader.core.egg-info/PKG-INFO +++ b/swh.loader.core.egg-info/PKG-INFO @@ -1,52 +1,52 @@ Metadata-Version: 2.1 Name: swh.loader.core -Version: 4.1.0 +Version: 4.2.0 Summary: Software Heritage Base Loader Home-page: https://forge.softwareheritage.org/diffusion/DLDBASE Author: Software Heritage developers Author-email: swh-devel@inria.fr Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-loader-core Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-loader-core/ Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE License-File: AUTHORS Software Heritage - Loader foundations ====================================== The Software Heritage Loader Core provides low-level loading utilities and helpers used by :term:`loaders <loader>`. The main entry points are the classes: - :class:`swh.loader.core.loader.BaseLoader` for loaders (e.g. svn) - :class:`swh.loader.core.loader.DVCSLoader` for DVCS loaders (e.g. hg, git, ...) - :class:`swh.loader.package.loader.PackageLoader` for Package loaders (e.g. PyPI, Npm, ...) Package loaders --------------- This package also implements many package loaders directly, out of convenience, as they usually are quite similar and each fits in a single file. They all roughly follow these steps, explained in the :py:meth:`swh.loader.package.loader.PackageLoader.load` documentation. See the :ref:`package-loader-tutorial` for details. VCS loaders ----------- Unlike package loaders, VCS loaders remain in separate packages, as they often need more advanced conversions and very VCS-specific operations.
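To make the package-loader hooks mentioned above concrete, here is a minimal, hypothetical subclass. This is an editor's sketch rather than code from this release: the class name, visit type and origin URL are made up (the URL merely mirrors the new ``https_example.org`` test fixtures added in this diff), and only the four hooks shown are required.

.. code-block:: python

    from typing import Iterator, Optional, Sequence, Tuple

    from swh.loader.package.loader import BasePackageInfo, PackageLoader
    from swh.loader.package.utils import EMPTY_AUTHOR
    from swh.model.model import ObjectType, Release, Sha1Git

    class ExampleLoader(PackageLoader[BasePackageInfo]):
        visit_type = "example"  # hypothetical visit type

        def get_versions(self) -> Sequence[str]:
            # Normally fetched from the forge/registry HTTP API.
            return ["1.0", "2.0"]

        def get_default_version(self) -> str:
            return self.get_versions()[-1]

        def get_package_info(
            self, version: str
        ) -> Iterator[Tuple[str, BasePackageInfo]]:
            filename = f"example-v{version}.tar.gz"
            url = f"https://example.org/package/example/{filename}"
            yield f"releases/{version}", BasePackageInfo(
                url=url, filename=filename, version=version
            )

        def build_release(
            self, p_info: BasePackageInfo, uncompressed_path: str, directory: Sha1Git
        ) -> Optional[Release]:
            # Mirrors the synthetic releases built by the loaders in this diff.
            return Release(
                name=p_info.version.encode(),
                message=f"Synthetic release for version {p_info.version}\n".encode(),
                author=EMPTY_AUTHOR,
                date=None,
                target=directory,
                target_type=ObjectType.DIRECTORY,
                synthetic=True,
            )

Instantiated as ``ExampleLoader(storage, "https://example.org/package/example")``, the inherited :py:meth:`PackageLoader.load` then takes care of downloading, unpacking, ExtID-based deduplication and snapshot creation.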
This usually involves getting the branches of a repository and recursively loading revisions in the history (and directory trees in these revisions), until a known revision is found diff --git a/swh.loader.core.egg-info/SOURCES.txt b/swh.loader.core.egg-info/SOURCES.txt index eac17d5..cbd0425 100644 --- a/swh.loader.core.egg-info/SOURCES.txt +++ b/swh.loader.core.egg-info/SOURCES.txt @@ -1,288 +1,300 @@ .git-blame-ignore-revs .gitignore .pre-commit-config.yaml AUTHORS CODE_OF_CONDUCT.md CONTRIBUTORS LICENSE MANIFEST.in Makefile README.rst conftest.py mypy.ini pyproject.toml pytest.ini requirements-swh.txt requirements-test.txt requirements.txt setup.cfg setup.py tox.ini docs/.gitignore docs/Makefile docs/README.rst docs/cli.rst docs/conf.py docs/index.rst docs/package-loader-specifications.rst docs/package-loader-tutorial.rst docs/vcs-loader-overview.rst docs/_static/.placeholder docs/_templates/.placeholder swh/__init__.py swh.loader.core.egg-info/PKG-INFO swh.loader.core.egg-info/SOURCES.txt swh.loader.core.egg-info/dependency_links.txt swh.loader.core.egg-info/entry_points.txt swh.loader.core.egg-info/requires.txt swh.loader.core.egg-info/top_level.txt swh/loader/__init__.py swh/loader/cli.py swh/loader/exception.py swh/loader/pytest_plugin.py swh/loader/core/__init__.py swh/loader/core/converters.py swh/loader/core/loader.py swh/loader/core/metadata_fetchers.py swh/loader/core/py.typed swh/loader/core/utils.py swh/loader/core/tests/__init__.py swh/loader/core/tests/test_converters.py swh/loader/core/tests/test_loader.py swh/loader/core/tests/test_utils.py swh/loader/package/__init__.py swh/loader/package/loader.py swh/loader/package/py.typed swh/loader/package/utils.py swh/loader/package/arch/__init__.py swh/loader/package/arch/loader.py swh/loader/package/arch/tasks.py swh/loader/package/arch/tests/__init__.py swh/loader/package/arch/tests/test_arch.py swh/loader/package/arch/tests/test_tasks.py swh/loader/package/arch/tests/data/fake_arch.sh swh/loader/package/arch/tests/data/https_archive.archlinux.org/packages_d_dialog_dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz swh/loader/package/arch/tests/data/https_archive.archlinux.org/packages_d_dialog_dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst swh/loader/package/arch/tests/data/https_uk.mirror.archlinuxarm.org/aarch64_core_gzip-1.12-1-aarch64.pkg.tar.xz swh/loader/package/archive/__init__.py swh/loader/package/archive/loader.py swh/loader/package/archive/tasks.py swh/loader/package/archive/tests/__init__.py swh/loader/package/archive/tests/test_archive.py swh/loader/package/archive/tests/test_tasks.py swh/loader/package/archive/tests/data/not_gzipped_tarball.tar.gz swh/loader/package/archive/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz swh/loader/package/archive/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit1 swh/loader/package/archive/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit2 swh/loader/package/archive/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.2.0.tar.gz swh/loader/package/aur/__init__.py swh/loader/package/aur/loader.py swh/loader/package/aur/tasks.py swh/loader/package/aur/tests/__init__.py swh/loader/package/aur/tests/test_aur.py swh/loader/package/aur/tests/test_tasks.py swh/loader/package/aur/tests/data/fake_aur.sh swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_a-fake-one.tar.gz swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_hg-evolve.tar.gz 
swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_ibus-git.tar.gz swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_libervia-web-hg.tar.gz swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_tealdeer-git.tar.gz swh/loader/package/cran/__init__.py swh/loader/package/cran/loader.py swh/loader/package/cran/tasks.py swh/loader/package/cran/tests/__init__.py swh/loader/package/cran/tests/test_cran.py swh/loader/package/cran/tests/test_tasks.py swh/loader/package/cran/tests/data/description/KnownBR swh/loader/package/cran/tests/data/description/acepack swh/loader/package/cran/tests/data/https_cran.r-project.org/src_contrib_1.4.0_Recommended_KernSmooth_2.22-6.tar.gz swh/loader/package/crates/__init__.py swh/loader/package/crates/loader.py swh/loader/package/crates/tasks.py swh/loader/package/crates/tests/__init__.py swh/loader/package/crates/tests/test_crates.py swh/loader/package/crates/tests/test_tasks.py swh/loader/package/crates/tests/data/fake_crates.sh swh/loader/package/crates/tests/data/https_crates.io/api_v1_crates_hg-core swh/loader/package/crates/tests/data/https_crates.io/api_v1_crates_micro-timer swh/loader/package/crates/tests/data/https_static.crates.io/crates_hg-core_hg-core-0.0.1.crate swh/loader/package/crates/tests/data/https_static.crates.io/crates_micro-timer_micro-timer-0.1.0.crate swh/loader/package/crates/tests/data/https_static.crates.io/crates_micro-timer_micro-timer-0.1.1.crate swh/loader/package/crates/tests/data/https_static.crates.io/crates_micro-timer_micro-timer-0.1.2.crate swh/loader/package/crates/tests/data/https_static.crates.io/crates_micro-timer_micro-timer-0.2.0.crate swh/loader/package/crates/tests/data/https_static.crates.io/crates_micro-timer_micro-timer-0.2.1.crate swh/loader/package/crates/tests/data/https_static.crates.io/crates_micro-timer_micro-timer-0.3.0.crate swh/loader/package/crates/tests/data/https_static.crates.io/crates_micro-timer_micro-timer-0.3.1.crate swh/loader/package/crates/tests/data/https_static.crates.io/crates_micro-timer_micro-timer-0.4.0.crate swh/loader/package/debian/__init__.py swh/loader/package/debian/loader.py swh/loader/package/debian/tasks.py swh/loader/package/debian/tests/__init__.py swh/loader/package/debian/tests/test_debian.py swh/loader/package/debian/tests/test_tasks.py swh/loader/package/debian/tests/data/http_deb.debian.org/debian_pool_contrib_c_cicero_cicero_0.7.2-3.diff.gz swh/loader/package/debian/tests/data/http_deb.debian.org/debian_pool_contrib_c_cicero_cicero_0.7.2-3.dsc swh/loader/package/debian/tests/data/http_deb.debian.org/debian_pool_contrib_c_cicero_cicero_0.7.2-4.diff.gz swh/loader/package/debian/tests/data/http_deb.debian.org/debian_pool_contrib_c_cicero_cicero_0.7.2-4.dsc swh/loader/package/debian/tests/data/http_deb.debian.org/debian_pool_contrib_c_cicero_cicero_0.7.2.orig.tar.gz swh/loader/package/debian/tests/data/http_deb.debian.org/onefile.txt swh/loader/package/deposit/__init__.py swh/loader/package/deposit/loader.py swh/loader/package/deposit/tasks.py swh/loader/package/deposit/tests/__init__.py swh/loader/package/deposit/tests/conftest.py swh/loader/package/deposit/tests/test_deposit.py swh/loader/package/deposit/tests/test_tasks.py swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_666_meta swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_666_raw swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_777_meta 
swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_777_raw swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_888_meta swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_888_raw swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_999_meta swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_999_raw swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello-2.10.zip swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello-2.12.tar.gz swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.10.json swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.11.json swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.12.json swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.13.json swh/loader/package/golang/__init__.py swh/loader/package/golang/loader.py swh/loader/package/golang/tasks.py swh/loader/package/golang/tests/__init__.py swh/loader/package/golang/tests/test_golang.py swh/loader/package/golang/tests/test_tasks.py swh/loader/package/golang/tests/data/https_proxy.golang.org/example.com_basic-go-module_@latest swh/loader/package/golang/tests/data/https_proxy.golang.org/example.com_basic-go-module_@v_list swh/loader/package/golang/tests/data/https_proxy.golang.org/example.com_basic-go-module_@v_v0.1.3.info swh/loader/package/golang/tests/data/https_proxy.golang.org/example.com_basic-go-module_@v_v0.1.3.zip +swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_adam-hanna_array!operations_@latest +swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_adam-hanna_array!operations_@v_list +swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_adam-hanna_array!operations_@v_v1.0.1.info +swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_adam-hanna_array!operations_@v_v1.0.1.zip +swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_xgdapg_daemon_@latest +swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_xgdapg_daemon_@v_list +swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_xgdapg_daemon_@v_v0.0.0-20131225113241-85981e2038bf.info +swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_xgdapg_daemon_@v_v0.0.0-20131225113241-85981e2038bf.zip swh/loader/package/maven/__init__.py swh/loader/package/maven/loader.py swh/loader/package/maven/tasks.py swh/loader/package/maven/tests/__init__.py swh/loader/package/maven/tests/test_maven.py swh/loader/package/maven/tests/test_tasks.py swh/loader/package/maven/tests/data/https_maven.org/sprova4j-0.1.0-sources.jar swh/loader/package/maven/tests/data/https_maven.org/sprova4j-0.1.0.pom swh/loader/package/maven/tests/data/https_maven.org/sprova4j-0.1.1-sources.jar swh/loader/package/maven/tests/data/https_maven.org/sprova4j-0.1.1.pom swh/loader/package/nixguix/__init__.py swh/loader/package/nixguix/loader.py swh/loader/package/nixguix/tasks.py swh/loader/package/nixguix/tests/__init__.py swh/loader/package/nixguix/tests/conftest.py swh/loader/package/nixguix/tests/test_nixguix.py swh/loader/package/nixguix/tests/test_tasks.py swh/loader/package/nixguix/tests/data/https_example.com/file.txt swh/loader/package/nixguix/tests/data/https_fail.com/truncated-archive.tgz 
swh/loader/package/nixguix/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz swh/loader/package/nixguix/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit1 swh/loader/package/nixguix/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit2 swh/loader/package/nixguix/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.2.0.tar.gz swh/loader/package/nixguix/tests/data/https_github.com/owner-1_repository-1_revision-1.tgz swh/loader/package/nixguix/tests/data/https_github.com/owner-2_repository-1_revision-1.tgz swh/loader/package/nixguix/tests/data/https_github.com/owner-3_repository-1_revision-1.tgz swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources-EOFError.json swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources.json swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources.json_visit1 swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources_special.json swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources_special.json_visit1 swh/loader/package/npm/__init__.py swh/loader/package/npm/loader.py swh/loader/package/npm/tasks.py swh/loader/package/npm/tests/__init__.py swh/loader/package/npm/tests/test_npm.py swh/loader/package/npm/tests/test_tasks.py swh/loader/package/npm/tests/data/https_registry.npmjs.org/@aller_shared_-_shared-0.1.0.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/@aller_shared_-_shared-0.1.1-alpha.14.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/jammit-express_-_jammit-express-0.0.1.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/nativescript-telerik-analytics_-_nativescript-telerik-analytics-1.0.0.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.0.2.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.0.3-beta.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.0.3.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.0.4.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.0.5.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.1.0.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.2.0.tgz swh/loader/package/npm/tests/data/https_replicate.npmjs.com/@aller_shared swh/loader/package/npm/tests/data/https_replicate.npmjs.com/catify swh/loader/package/npm/tests/data/https_replicate.npmjs.com/jammit-express swh/loader/package/npm/tests/data/https_replicate.npmjs.com/jammit-no-time swh/loader/package/npm/tests/data/https_replicate.npmjs.com/nativescript-telerik-analytics swh/loader/package/npm/tests/data/https_replicate.npmjs.com/org swh/loader/package/npm/tests/data/https_replicate.npmjs.com/org_version_mismatch swh/loader/package/npm/tests/data/https_replicate.npmjs.com/org_visit1 swh/loader/package/opam/__init__.py swh/loader/package/opam/loader.py swh/loader/package/opam/tasks.py swh/loader/package/opam/tests/__init__.py swh/loader/package/opam/tests/test_opam.py swh/loader/package/opam/tests/test_tasks.py swh/loader/package/opam/tests/data/fake_opam_repo/_repo swh/loader/package/opam/tests/data/fake_opam_repo/version swh/loader/package/opam/tests/data/fake_opam_repo/repo/loadertest/lock swh/loader/package/opam/tests/data/fake_opam_repo/repo/loadertest/repos-config swh/loader/package/opam/tests/data/fake_opam_repo/repo/loadertest/packages/agrid/agrid.0.1/opam 
swh/loader/package/opam/tests/data/fake_opam_repo/repo/loadertest/packages/directories/directories.0.1/opam swh/loader/package/opam/tests/data/fake_opam_repo/repo/loadertest/packages/directories/directories.0.2/opam swh/loader/package/opam/tests/data/fake_opam_repo/repo/loadertest/packages/directories/directories.0.3/opam swh/loader/package/opam/tests/data/fake_opam_repo/repo/loadertest/packages/ocb/ocb.0.1/opam swh/loader/package/opam/tests/data/https_github.com/OCamlPro_agrid_archive_0.1.tar.gz swh/loader/package/opam/tests/data/https_github.com/OCamlPro_directories_archive_0.1.tar.gz swh/loader/package/opam/tests/data/https_github.com/OCamlPro_directories_archive_0.2.tar.gz swh/loader/package/opam/tests/data/https_github.com/OCamlPro_directories_archive_0.3.tar.gz swh/loader/package/opam/tests/data/https_github.com/OCamlPro_ocb_archive_0.1.tar.gz swh/loader/package/pubdev/__init__.py swh/loader/package/pubdev/loader.py swh/loader/package/pubdev/tasks.py swh/loader/package/pubdev/tests/__init__.py swh/loader/package/pubdev/tests/test_pubdev.py swh/loader/package/pubdev/tests/test_tasks.py swh/loader/package/pubdev/tests/data/fake_pubdev.sh swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_Autolinker_versions_0.1.1.tar.gz swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_authentication_versions_0.0.1.tar.gz swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_bezier_versions_1.1.5.tar.gz swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_pdf_versions_1.0.0.tar.gz swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_pdf_versions_3.8.2.tar.gz swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_Autolinker swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_abstract_io swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_audio_manager swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_authentication swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_bezier swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_pdf swh/loader/package/pypi/__init__.py swh/loader/package/pypi/loader.py swh/loader/package/pypi/tasks.py swh/loader/package/pypi/tests/__init__.py swh/loader/package/pypi/tests/test_pypi.py swh/loader/package/pypi/tests/test_tasks.py swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/0805nexter-1.1.0.tar.gz swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/0805nexter-1.1.0.zip swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/0805nexter-1.2.0.zip swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/0805nexter-1.3.0.zip swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/0805nexter-1.4.0.zip swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/nexter-1.1.0.tar.gz swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/nexter-1.1.0.zip swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/packages_70_97_c49fb8ec24a7aaab54c3dbfbb5a6ca1431419d9ee0f6c363d9ad01d2b8b1_0805nexter-1.3.0.zip swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/packages_86_10_c9555ec63106153aaaad753a281ff47f4ac79e980ff7f5d740d6649cd56a_upymenu-0.0.1.tar.gz swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/packages_c4_a0_4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4_0805nexter-1.2.0.zip 
swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/packages_c4_a0_4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4_0805nexter-1.2.0.zip_visit1 swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/packages_ec_65_c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d_0805nexter-1.1.0.zip swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/packages_ec_65_c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d_0805nexter-1.1.0.zip_visit1 swh/loader/package/pypi/tests/data/https_pypi.org/pypi_0805nexter_json swh/loader/package/pypi/tests/data/https_pypi.org/pypi_0805nexter_json_visit1 swh/loader/package/pypi/tests/data/https_pypi.org/pypi_nexter_json swh/loader/package/pypi/tests/data/https_pypi.org/pypi_upymenu_json swh/loader/package/tests/__init__.py swh/loader/package/tests/common.py swh/loader/package/tests/test_conftest.py swh/loader/package/tests/test_loader.py swh/loader/package/tests/test_loader_metadata.py swh/loader/package/tests/test_utils.py +swh/loader/package/tests/data/https_example.org/package_example_example-v1.0.tar.gz +swh/loader/package/tests/data/https_example.org/package_example_example-v2.0.tar.gz +swh/loader/package/tests/data/https_example.org/package_example_example-v3.0.tar.gz +swh/loader/package/tests/data/https_example.org/package_example_example-v4.0.tar.gz swh/loader/tests/__init__.py swh/loader/tests/conftest.py swh/loader/tests/py.typed swh/loader/tests/test_cli.py swh/loader/tests/test_init.py swh/loader/tests/data/0805nexter-1.1.0.tar.gz \ No newline at end of file diff --git a/swh/loader/package/crates/loader.py b/swh/loader/package/crates/loader.py index a2ebc2b..2943ae9 100644 --- a/swh/loader/package/crates/loader.py +++ b/swh/loader/package/crates/loader.py @@ -1,354 +1,354 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from distutils.version import StrictVersion import json from pathlib import Path from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple from urllib.parse import urlparse import attr import toml from typing_extensions import TypedDict from swh.loader.package.loader import BasePackageInfo, PackageLoader -from swh.loader.package.utils import api_info, cached_method, release_name +from swh.loader.package.utils import cached_method, get_url_body, release_name from swh.model.model import ObjectType, Person, Release, Sha1Git, TimestampWithTimezone from swh.storage.interface import StorageInterface class ExtrinsicPackageMetadata(TypedDict): """Data structure for package extrinsic metadata pulled from http api endpoint. We set only the keys we need according to what is available when querying https://crates.io/api/v1/crates/, where `name` is the name of the crate package (see JSON response example at https://crates.io/api/v1/crates/hg-core). Usage example: .. code-block:: python e_metadata = ExtrinsicPackageMetadata(**self.info()) """ # noqa categories: List[Dict[Any, Any]] """Related categories""" crate: Dict[Any, Any] """Crate project information""" keywords: List[Any] """Keywords""" versions: List[Dict[Any, Any]] """A list of released versions for a crate""" class ExtrinsicVersionPackageMetadata(TypedDict): """Data structure for specific package version extrinsic metadata, pulled from http api endpoint. 
Similar to `ExtrinsicPackageMetadata` in its usage, but we flatten the data related to a specific version. """ crate: str """The package name""" crate_size: int """The package size""" created_at: str """First released at""" downloads: str """Number of downloads""" license: str """Package license""" num: str """Package version""" published_by: Dict[Any, Any] """Publishers information""" updated_at: str """Last update""" yanked: bool """Is that version yanked? (yanked means release-level deprecation)""" class IntrinsicPackageMetadata(TypedDict): """Data structure for specific package version intrinsic metadata. Data is extracted from the crate package's .toml file. Then the data of the 'package' entry is flattened. Cargo.toml file content example: .. code-block:: toml [package] name = "hg-core" version = "0.0.1" authors = ["Georges Racinet "] description = "Mercurial pure Rust core library, with no assumption on Python bindings (FFI)" homepage = "https://mercurial-scm.org" license = "GPL-2.0-or-later" repository = "https://www.mercurial-scm.org/repo/hg" [lib] name = "hg" [dev-dependencies.rand] version = "~0.6" [dev-dependencies.rand_pcg] version = "~0.1" :param toml: toml object """ name: str """The package name""" version: str """Package version""" authors: List[str] """Authors""" description: str """Package and release description""" homepage: str """Homepage of the project""" license: str """Package license""" repository: str """Source code repository""" @attr.s class CratesPackageInfo(BasePackageInfo): name = attr.ib(type=str) """Name of the package""" version = attr.ib(type=str) """Current version""" e_metadata: Dict[str, Any] = attr.ib(factory=ExtrinsicPackageMetadata) """Extrinsic package metadata, common to all versions""" e_metadata_version: Dict[str, Any] = attr.ib( factory=ExtrinsicVersionPackageMetadata ) """Extrinsic package metadata specific to a version""" i_metadata: Dict[str, Any] = attr.ib(factory=IntrinsicPackageMetadata) """Intrinsic metadata of the current package version""" def extract_intrinsic_metadata(dir_path: Path) -> Dict[str, Any]: """Extract intrinsic metadata from Cargo.toml file at dir_path. Each crate archive has a Cargo.toml at the root of the archive. Args: dir_path: A directory on disk where a Cargo.toml must be present Returns: A dict mapping from the toml parser """ return toml.load(dir_path / "Cargo.toml") def extract_author(p_info: CratesPackageInfo) -> Person: """Extract package author from intrinsic metadata and return it as a `Person` model. Args: p_info: CratesPackageInfo that should contain i_metadata entries Returns: Only one author (Person) of the package. Currently limited by internal detail of the swh stack (see T3887). """ authors = p_info.i_metadata["authors"] fullname = authors[0] # TODO: here we have a list of authors, see T3887 return Person.from_fullname(fullname.encode()) def extract_description(p_info: CratesPackageInfo) -> str: """Extract package description from intrinsic metadata and return it as a string. Args: p_info: CratesPackageInfo that should contain i_metadata entries Returns: Package description from metadata. """ return p_info.i_metadata["description"] class CratesLoader(PackageLoader[CratesPackageInfo]): """Load Crates package origins into swh archive.""" visit_type = "crates" def __init__( self, storage: StorageInterface, url: str, artifacts: List[Dict[str, Any]], **kwargs, ): """Constructor Args: url: Origin url (e.g.
https://crates.io/api/v1/crates/) artifacts: A list of dict listing all existing released versions for a package (Usually set with crates lister `extra_loader_arguments`). Each line is a dict that should have an `url` (where to download package specific version) and a `version` entry. Example:: [ { "version": , "url": "https://static.crates.io/crates//-.crate", } ] """ # noqa super().__init__(storage=storage, url=url, **kwargs) self.url = url self.artifacts: Dict[str, Dict] = { artifact["version"]: artifact for artifact in artifacts } @cached_method def _raw_info(self) -> bytes: """Get crate metadata (fetched from http api endpoint set as self.url) Returns: Content response as bytes. Content response is a json document. """ - return api_info(self.url) + return get_url_body(self.url) @cached_method def info(self) -> Dict: """Parse http api json response and return the crate metadata information as a Dict.""" return json.loads(self._raw_info()) def get_versions(self) -> Sequence[str]: """Get all released versions of a crate Returns: A sequence of versions Example:: ["0.1.1", "0.10.2"] """ versions = list(self.artifacts.keys()) versions.sort(key=StrictVersion) return versions def get_default_version(self) -> str: """Get the newest release version of a crate Returns: A string representing a version Example:: "0.1.2" """ return self.get_versions()[-1] def get_package_info(self, version: str) -> Iterator[Tuple[str, CratesPackageInfo]]: """Get release name and package information from version Args: version: crate version (e.g: "0.1.0") Returns: Iterator of tuple (release_name, p_info) """ artifact = self.artifacts[version] filename = artifact["filename"] package_name = urlparse(self.url).path.split("/")[-1] url = artifact["url"] # Get extrinsic metadata from http api e_metadata = ExtrinsicPackageMetadata(**self.info()) # type: ignore[misc] # Extract crate info for current version (One .crate file for a given version) (crate_version,) = [ crate for crate in e_metadata["versions"] if crate["num"] == version ] e_metadata_version = ExtrinsicVersionPackageMetadata( # type: ignore[misc] **crate_version ) p_info = CratesPackageInfo( name=package_name, filename=filename, url=url, version=version, e_metadata=e_metadata, e_metadata_version=e_metadata_version, ) yield release_name(version, filename), p_info def build_release( self, p_info: CratesPackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Release]: # Extract intrinsic metadata from dir_path/Cargo.toml name = p_info.name version = p_info.version dir_path = Path(uncompressed_path, f"{name}-{version}") i_metadata_raw = extract_intrinsic_metadata(dir_path) # Get only corresponding key of IntrinsicPackageMetadata i_metadata_keys = [k for k in IntrinsicPackageMetadata.__annotations__.keys()] # We use data only from "package" entry i_metadata = { k: v for k, v in i_metadata_raw["package"].items() if k in i_metadata_keys } p_info.i_metadata = IntrinsicPackageMetadata(**i_metadata) # type: ignore[misc] author = extract_author(p_info) description = extract_description(p_info) message = ( f"Synthetic release for Crate source package {p_info.name} " f"version {p_info.version}\n\n" f"{description}\n" ) # The only way to get a value for updated_at is through extrinsic metadata updated_at = p_info.e_metadata_version.get("updated_at") return Release( name=version.encode(), author=author, date=TimestampWithTimezone.from_iso8601(updated_at), message=message.encode(), target_type=ObjectType.DIRECTORY, target=directory, synthetic=True, ) diff --git 
a/swh/loader/package/golang/loader.py b/swh/loader/package/golang/loader.py index 9caff6a..0bc68a4 100644 --- a/swh/loader/package/golang/loader.py +++ b/swh/loader/package/golang/loader.py @@ -1,91 +1,109 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import logging +import re from typing import Iterator, Optional, Sequence, Tuple import attr from swh.loader.package.loader import BasePackageInfo, PackageLoader -from swh.loader.package.utils import EMPTY_AUTHOR, api_info, release_name +from swh.loader.package.utils import ( + EMPTY_AUTHOR, + get_url_body, + release_name, + cached_method, +) from swh.model.model import ObjectType, Release, Sha1Git, TimestampWithTimezone from swh.storage.interface import StorageInterface logger = logging.getLogger(__name__) +def _uppercase_encode(url: str) -> str: + return re.sub("([A-Z]{1})", r"!\1", url).lower() + + @attr.s class GolangPackageInfo(BasePackageInfo): name = attr.ib(type=str) timestamp = attr.ib(type=Optional[TimestampWithTimezone]) class GolangLoader(PackageLoader[GolangPackageInfo]): """Load Golang module zip file into SWH archive.""" visit_type = "golang" GOLANG_PKG_DEV_URL = "https://pkg.go.dev" GOLANG_PROXY_URL = "https://proxy.golang.org" def __init__( self, storage: StorageInterface, url: str, max_content_size: Optional[int] = None, **kwargs, ): super().__init__(storage, url, max_content_size=max_content_size, **kwargs) # The lister saves human-usable URLs, so we translate them to proxy URLs # for use in the loader. # This URL format is detailed in https://go.dev/ref/mod#goproxy-protocol assert url.startswith( self.GOLANG_PKG_DEV_URL ), "Go package URL (%s) not from %s" % (url, self.GOLANG_PKG_DEV_URL) self.name = url[len(self.GOLANG_PKG_DEV_URL) + 1 :] self.url = url.replace(self.GOLANG_PKG_DEV_URL, self.GOLANG_PROXY_URL) + self.url = _uppercase_encode(self.url) def get_versions(self) -> Sequence[str]: - return api_info(f"{self.url}/@v/list").decode().splitlines() - + versions = get_url_body(f"{self.url}/@v/list").decode().splitlines() + # some go packages only have a development version not listed by the endpoint above, + # so ensure to return it or it will be missed by the golang loader + default_version = self.get_default_version() + if default_version not in versions: + versions.append(default_version) + return versions + + @cached_method def get_default_version(self) -> str: - latest = api_info(f"{self.url}/@latest") + latest = get_url_body(f"{self.url}/@latest") return json.loads(latest)["Version"] def _raw_info(self, version: str) -> dict: - url = f"{self.url}/@v/{version}.info" - return json.loads(api_info(url)) + url = f"{self.url}/@v/{_uppercase_encode(version)}.info" + return json.loads(get_url_body(url)) def get_package_info(self, version: str) -> Iterator[Tuple[str, GolangPackageInfo]]: # Encode the name because creating nested folders can become problematic encoded_name = self.name.replace("/", "__") filename = f"{encoded_name}-{version}.zip" timestamp = TimestampWithTimezone.from_iso8601(self._raw_info(version)["Time"]) p_info = GolangPackageInfo( url=f"{self.url}/@v/{version}.zip", filename=filename, version=version, timestamp=timestamp, name=self.name, ) yield release_name(version), p_info def build_release( self, p_info: GolangPackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Release]: 
msg = ( f"Synthetic release for Golang source package {p_info.name} " f"version {p_info.version}\n" ) return Release( name=p_info.version.encode(), message=msg.encode(), date=p_info.timestamp, author=EMPTY_AUTHOR, # Go modules offer very little metadata target_type=ObjectType.DIRECTORY, target=directory, synthetic=True, ) diff --git a/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_adam-hanna_array!operations_@latest b/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_adam-hanna_array!operations_@latest new file mode 100644 index 0000000..cc87e10 --- /dev/null +++ b/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_adam-hanna_array!operations_@latest @@ -0,0 +1 @@ +{"Version":"v1.0.1","Time":"2022-03-23T18:02:43Z"} \ No newline at end of file diff --git a/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_adam-hanna_array!operations_@v_list b/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_adam-hanna_array!operations_@v_list new file mode 100644 index 0000000..b18d465 --- /dev/null +++ b/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_adam-hanna_array!operations_@v_list @@ -0,0 +1 @@ +v1.0.1 diff --git a/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_adam-hanna_array!operations_@v_v1.0.1.info b/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_adam-hanna_array!operations_@v_v1.0.1.info new file mode 100644 index 0000000..cc87e10 --- /dev/null +++ b/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_adam-hanna_array!operations_@v_v1.0.1.info @@ -0,0 +1 @@ +{"Version":"v1.0.1","Time":"2022-03-23T18:02:43Z"} \ No newline at end of file diff --git a/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_adam-hanna_array!operations_@v_v1.0.1.zip b/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_adam-hanna_array!operations_@v_v1.0.1.zip new file mode 100644 index 0000000..8fe5583 Binary files /dev/null and b/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_adam-hanna_array!operations_@v_v1.0.1.zip differ diff --git a/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_xgdapg_daemon_@latest b/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_xgdapg_daemon_@latest new file mode 100644 index 0000000..bcf634e --- /dev/null +++ b/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_xgdapg_daemon_@latest @@ -0,0 +1 @@ +{"Version":"v0.0.0-20131225113241-85981e2038bf","Time":"2013-12-25T11:32:41Z"} \ No newline at end of file diff --git a/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_xgdapg_daemon_@v_list b/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_xgdapg_daemon_@v_list new file mode 100644 index 0000000..e69de29 diff --git a/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_xgdapg_daemon_@v_v0.0.0-20131225113241-85981e2038bf.info b/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_xgdapg_daemon_@v_v0.0.0-20131225113241-85981e2038bf.info new file mode 100644 index 0000000..bcf634e --- /dev/null +++ b/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_xgdapg_daemon_@v_v0.0.0-20131225113241-85981e2038bf.info @@ -0,0 +1 @@ +{"Version":"v0.0.0-20131225113241-85981e2038bf","Time":"2013-12-25T11:32:41Z"} \ No newline at end of file diff --git 
a/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_xgdapg_daemon_@v_v0.0.0-20131225113241-85981e2038bf.zip b/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_xgdapg_daemon_@v_v0.0.0-20131225113241-85981e2038bf.zip new file mode 100644 index 0000000..f37b383 Binary files /dev/null and b/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_xgdapg_daemon_@v_v0.0.0-20131225113241-85981e2038bf.zip differ diff --git a/swh/loader/package/golang/tests/test_golang.py b/swh/loader/package/golang/tests/test_golang.py index 63bde1b..5888d9b 100644 --- a/swh/loader/package/golang/tests/test_golang.py +++ b/swh/loader/package/golang/tests/test_golang.py @@ -1,13 +1,31 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.loader.package.golang.loader import GolangLoader def test_golang_loader_first_visit(swh_storage, requests_mock_datadir): url = "https://pkg.go.dev/example.com/basic-go-module" loader = GolangLoader(swh_storage, url) assert loader.load()["status"] == "eventful" + + +def test_golang_loader_package_name_with_uppercase_characters( + swh_storage, requests_mock_datadir +): + url = "https://pkg.go.dev/github.com/adam-hanna/arrayOperations" + loader = GolangLoader(swh_storage, url) + + assert loader.load()["status"] == "eventful" + + +def test_golang_loader_package_with_dev_version_only( + swh_storage, requests_mock_datadir +): + url = "https://pkg.go.dev/github.com/xgdapg/daemon" + loader = GolangLoader(swh_storage, url) + + assert loader.load()["status"] == "eventful" diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py index 96ff69e..4cc6430 100644 --- a/swh/loader/package/loader.py +++ b/swh/loader/package/loader.py @@ -1,1088 +1,1111 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import hashlib from itertools import islice import json import logging import os import string import sys import tempfile from typing import ( Any, Dict, Generic, Iterator, List, Mapping, Optional, Sequence, Set, Tuple, TypeVar, ) import attr from requests.exceptions import ContentDecodingError import sentry_sdk from swh.core.tarball import uncompress from swh.loader.core.loader import BaseLoader from swh.loader.exception import NotFound from swh.loader.package.utils import download from swh.model import from_disk from swh.model.hashutil import hash_to_hex from swh.model.model import ( ExtID, MetadataAuthority, MetadataAuthorityType, MetadataFetcher, ) from swh.model.model import ( Origin, OriginVisit, OriginVisitStatus, RawExtrinsicMetadata, Release, Revision, Sha1Git, Snapshot, ) from swh.model.model import ObjectType as ModelObjectType from swh.model.swhids import CoreSWHID, ExtendedObjectType, ExtendedSWHID, ObjectType from swh.storage.algos.snapshot import snapshot_get_latest from swh.storage.interface import StorageInterface from swh.storage.utils import now logger = logging.getLogger(__name__) SWH_METADATA_AUTHORITY = MetadataAuthority( type=MetadataAuthorityType.REGISTRY, url="https://softwareheritage.org/", metadata={}, ) """Metadata authority for extrinsic metadata generated by Software Heritage. 
Used for metadata on "original artifacts", i.e. length, filename, and checksums of downloaded archive files.""" PartialExtID = Tuple[str, int, bytes] """The ``extid_type`` and ``extid`` fields of an :class:`ExtID` object.""" @attr.s class RawExtrinsicMetadataCore: """Contains the core of the metadata extracted by a loader, that will be used to build a full RawExtrinsicMetadata object by adding object identifier, context, and provenance information.""" format = attr.ib(type=str) metadata = attr.ib(type=bytes) discovery_date = attr.ib(type=Optional[datetime.datetime], default=None) """Defaults to the visit date.""" @attr.s class BasePackageInfo: """Information about one version of a package: the URL and filename of the artifact to download, the version name/number, and the hooks used to compute an ExtID for deduplication. """ url = attr.ib(type=str) filename = attr.ib(type=Optional[str]) version = attr.ib(type=str) """Version name/number.""" MANIFEST_FORMAT: Optional[string.Template] = None """If not None, used by the default extid() implementation to format a manifest, before hashing it to produce an ExtID.""" EXTID_TYPE: str = "package-manifest-sha256" EXTID_VERSION: int = 0 # The following attribute has kw_only=True in order to allow subclasses # to add attributes. Without kw_only, attributes without default values cannot # go after attributes with default values. # See directory_extrinsic_metadata = attr.ib( type=List[RawExtrinsicMetadataCore], default=[], kw_only=True, ) """:term:`extrinsic metadata` collected by the loader, that will be attached to the loaded directory and added to the Metadata storage.""" # TODO: add support for metadata for releases and contents def extid(self) -> Optional[PartialExtID]: """Returns a unique intrinsic identifier of this package info, or None if this package info is not 'deduplicatable' (meaning that we will always load it, instead of checking the ExtID storage to see if we already did)""" if self.MANIFEST_FORMAT is None: return None else: manifest = self.MANIFEST_FORMAT.substitute( {k: str(v) for (k, v) in attr.asdict(self).items()} ) return ( self.EXTID_TYPE, self.EXTID_VERSION, hashlib.sha256(manifest.encode()).digest(), ) TPackageInfo = TypeVar("TPackageInfo", bound=BasePackageInfo) class PackageLoader(BaseLoader, Generic[TPackageInfo]): def __init__(self, storage: StorageInterface, url: str, **kwargs: Any): """Loader's constructor. This raises an exception if the minimal required configuration is missing (cf. fn:`check` method). Args: storage: Storage instance url: Origin url to load data from """ super().__init__(storage=storage, origin_url=url, **kwargs) + self.status_load = "" + self.status_visit = "" + + def load_status(self) -> Dict[str, str]: + """Detailed loading status.""" + return { + "status": self.status_load, + } + + def visit_status(self) -> str: + """Detailed visit status.""" + return self.status_visit def get_versions(self) -> Sequence[str]: """Return the list of all published package versions. Raises: class:`swh.loader.exception.NotFound` error when failing to read the published package versions. Returns: Sequence of published versions """ return [] def get_package_info(self, version: str) -> Iterator[Tuple[str, TPackageInfo]]: """Given a release version of a package, retrieve the associated package information for that version.
Args: version: Package version Returns: (branch name, package metadata) """ yield from {} def build_release( self, p_info: TPackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Release]: """Build the release from the archive metadata (extrinsic artifact metadata) and the intrinsic metadata. Args: p_info: Package information uncompressed_path: Artifact uncompressed path on disk """ raise NotImplementedError("build_release") def get_default_version(self) -> str: """Retrieve the latest release version if any. Returns: Latest version """ return "" def last_snapshot(self) -> Optional[Snapshot]: """Retrieve the last snapshot out of the last visit.""" return snapshot_get_latest(self.storage, self.origin.url) def new_packageinfo_to_extid(self, p_info: TPackageInfo) -> Optional[PartialExtID]: return p_info.extid() def _get_known_extids( self, packages_info: List[TPackageInfo] ) -> Dict[PartialExtID, List[CoreSWHID]]: """Compute the ExtIDs from new PackageInfo objects, searches which are already loaded in the archive, and returns them if any.""" # Compute the ExtIDs of all the new packages, grouped by extid type new_extids: Dict[Tuple[str, int], List[bytes]] = {} for p_info in packages_info: res = p_info.extid() if res is not None: (extid_type, extid_version, extid_extid) = res new_extids.setdefault((extid_type, extid_version), []).append( extid_extid ) # For each extid type, call extid_get_from_extid() with all the extids of # that type, and store them in the '(type, extid) -> target' map. known_extids: Dict[PartialExtID, List[CoreSWHID]] = {} for ((extid_type, extid_version), extids) in new_extids.items(): for extid in self.storage.extid_get_from_extid( extid_type, extids, version=extid_version ): if extid is not None: key = (extid.extid_type, extid_version, extid.extid) known_extids.setdefault(key, []).append(extid.target) return known_extids def resolve_object_from_extids( self, known_extids: Dict[PartialExtID, List[CoreSWHID]], p_info: TPackageInfo, whitelist: Set[Sha1Git], ) -> Optional[CoreSWHID]: """Resolve the revision/release from known ExtIDs and a package info object. If the artifact has already been downloaded, this will return the existing release (or revision) targeting that uncompressed artifact directory. Otherwise, this returns None. Args: known_extids: Dict built from a list of ExtID, with the target as value p_info: Package information whitelist: Any ExtID with target not in this set is filtered out Returns: None or release/revision SWHID """ new_extid = p_info.extid() if new_extid is None: return None extid_targets = set() for extid_target in known_extids.get(new_extid, []): if extid_target.object_id not in whitelist: # There is a known ExtID for this package, but its target is not # in the snapshot. # This can happen for three reasons: # # 1. a loader crashed after writing the ExtID, but before writing # the snapshot # 2. some other loader loaded the same artifact, but produced # a different revision, causing an additional ExtID object # to be written. We will probably find this loader's ExtID # in a future iteration of this loop. # Note that for now, this is impossible, as each loader has a # completely different extid_type, but this is an implementation # detail of each loader. # 3. we took a snapshot, then the package disappeared, # then we took another snapshot, and the package reappeared # # In case of 1, we must actually load the package now, # so let's do it. 
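The resolution logic above matches new packages against previously written ExtIDs. How such a partial ExtID is derived is easiest to see on a toy subclass; the following is an editor's sketch (the template, EXTID_TYPE value and artifact values are made up), with only the sha256-of-manifest mechanism taken from ``BasePackageInfo.extid()`` above.

.. code-block:: python

    import hashlib
    import string

    from swh.loader.package.loader import BasePackageInfo

    class ExamplePackageInfo(BasePackageInfo):
        # Any attribute of the class may appear in the manifest template.
        MANIFEST_FORMAT = string.Template("$url $version")
        EXTID_TYPE = "example-manifest-sha256"  # illustrative value

    p_info = ExamplePackageInfo(
        url="https://example.org/package/example/example-v1.0.tar.gz",
        filename="example-v1.0.tar.gz",
        version="1.0",
    )
    extid_type, extid_version, extid = p_info.extid()
    # The ExtID is the sha256 of the substituted manifest "url version":
    assert extid == hashlib.sha256(
        b"https://example.org/package/example/example-v1.0.tar.gz 1.0"
    ).digest()
    assert extid_version == 0  # EXTID_VERSION default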
# TODO: detect when we are in case 3 using release_missing or revision_missing instead of the snapshot. continue elif extid_target.object_type in (ObjectType.RELEASE, ObjectType.REVISION): extid_targets.add(extid_target) else: # Note that this case should never be reached unless there is a # collision between a revision hash and some non-revision object's # hash, but better safe than sorry. logger.warning( "%s is in the whitelist, but is not a revision/release.", hash_to_hex(extid_target.object_id), ) if extid_targets: # This is a known package version, as we have an extid to reference it. # Let's return one of them. # If there is a release extid, return it. release_extid_targets = { extid_target for extid_target in extid_targets if extid_target.object_type == ObjectType.RELEASE } # Exclude missing targets missing_releases = { CoreSWHID(object_type=ObjectType.RELEASE, object_id=id_) for id_ in self.storage.release_missing( [swhid.object_id for swhid in release_extid_targets] ) } if missing_releases: err_message = "Found ExtIDs pointing to missing releases" logger.error(err_message + ": %s", missing_releases) with sentry_sdk.push_scope() as scope: scope.set_extra( "missing_releases", [str(x) for x in missing_releases] ) sentry_sdk.capture_message(err_message, "error") release_extid_targets -= missing_releases extid_target2 = self.select_extid_target(p_info, release_extid_targets) if extid_target2: return extid_target2 # If there is no release extid (i.e. if the package was only loaded with # older versions of this loader, which produced revision objects instead # of releases), return a revision extid when possible. revision_extid_targets = { extid_target for extid_target in extid_targets if extid_target.object_type == ObjectType.REVISION } if revision_extid_targets: assert len(extid_targets) == 1, extid_targets extid_target = list(extid_targets)[0] return extid_target # No target found (this is probably a new package version) return None def select_extid_target( self, p_info: TPackageInfo, extid_targets: Set[CoreSWHID] ) -> Optional[CoreSWHID]: """Given a list of release extid targets, chooses one appropriate for the given package info. Package loaders should implement this if their ExtIDs may map to multiple releases, so they can fetch releases from the storage and inspect their fields to select the right one for this ``p_info``. """ if extid_targets: # The base package loader does not have the domain-specific knowledge # to select the right release -> crash if there is more than one. assert len(extid_targets) == 1, extid_targets return list(extid_targets)[0] return None def download_package( self, p_info: TPackageInfo, tmpdir: str ) -> List[Tuple[str, Mapping]]: """Download artifacts for a specific package. All downloads happen in the tmpdir folder. The default implementation expects the package info to describe a single artifact. Note that most implementations have one artifact per package, but some have multiple artifacts per package (debian), and some have none because the package itself is the artifact (gnu). Args: artifacts_package_info: Information on the package artifacts to download (url, filename, etc...)
tmpdir: Location to retrieve such artifacts Returns: List of (path, computed hashes) """ try: return [download(p_info.url, dest=tmpdir, filename=p_info.filename)] except ContentDecodingError: # package might be erroneously marked as gzip compressed while it is not, # try to download its raw bytes again without attempting to uncompress # the input stream return [ download( p_info.url, dest=tmpdir, filename=p_info.filename, extra_request_headers={"Accept-Encoding": "identity"}, ) ] def uncompress( self, dl_artifacts: List[Tuple[str, Mapping[str, Any]]], dest: str ) -> str: """Uncompress the artifact(s) in the destination folder dest. Optionally, this may need the p_info dict for more information (debian). """ uncompressed_path = os.path.join(dest, "src") for a_path, _ in dl_artifacts: uncompress(a_path, dest=uncompressed_path) return uncompressed_path def extra_branches(self) -> Dict[bytes, Mapping[str, Any]]: """Return an extra dict of branches that are used to update the set of branches. """ return {} def finalize_visit( self, *, snapshot: Optional[Snapshot], visit: OriginVisit, status_visit: str, status_load: str, failed_branches: List[str], errors: Optional[List[str]] = None, ) -> Dict[str, Any]: """Finalize the visit: - flush any unflushed data to storage - update origin visit's status - return the task's status """ + self.status_load = status_load + self.status_visit = status_visit self.storage.flush() snapshot_id: Optional[bytes] = None if snapshot and snapshot.id: # to prevent snapshot.id from being b"" snapshot_id = snapshot.id assert visit.visit visit_status = OriginVisitStatus( origin=self.origin.url, visit=visit.visit, type=self.visit_type, date=now(), status=status_visit, snapshot=snapshot_id, ) self.storage.origin_visit_status_add([visit_status]) result: Dict[str, Any] = { "status": status_load, } if snapshot_id: result["snapshot_id"] = hash_to_hex(snapshot_id) if failed_branches: logger.warning("%d failed branches", len(failed_branches)) for i, urls in enumerate(islice(failed_branches, 50)): prefix_url = "Failed branches: " if i == 0 else "" logger.warning("%s%s", prefix_url, urls) return result def load(self) -> Dict: """Load the contents associated with a specific origin. 1. Get the list of versions in an origin. 2. Get the snapshot from the previous run of the loader, and filter out versions that were already loaded, if their :term:`extids <extid>` match. Then, for each remaining version in the origin: 3. Fetch the files for one package version By default, this can be implemented as a simple HTTP request. Loaders with more specific requirements can override this, e.g.: the PyPI loader checks the integrity of the downloaded files; the Debian loader has to download and check several files for one package version. 4. Extract the downloaded files. By default, this would be a universal archive/tarball extraction. Loaders for specific formats can override this method (for instance, the Debian loader uses dpkg-source -x). 5. Convert the extracted directory to a set of Software Heritage objects using swh.model.from_disk. 6. Extract the metadata from the unpacked directories This would only be applicable for "smart" loaders like npm (parsing the package.json), PyPI (parsing the PKG-INFO file) or Debian (parsing debian/changelog and debian/control). On "minimal-metadata" sources such as the GNU archive, the lister should provide the minimal set of metadata needed to populate the revision/release objects (authors, dates) as an argument to the task. 7.
Generate the revision/release objects for the given version. From the data generated at steps 3 and 4. end for each 8. Generate and load the snapshot for the visit Using the revisions/releases collected at step 7., and the branch information from step 2., generate a snapshot and load it into the Software Heritage archive """ - status_load = "uneventful" # either: eventful, uneventful, failed - status_visit = "full" # see swh.model.model.OriginVisitStatus + self.status_load = "uneventful" # either: eventful, uneventful, failed + self.status_visit = "full" # see swh.model.model.OriginVisitStatus snapshot = None failed_branches: List[str] = [] # Prepare origin and origin_visit origin = Origin(url=self.origin.url) try: self.storage.origin_add([origin]) visit = list( self.storage.origin_visit_add( [ OriginVisit( origin=self.origin.url, date=self.visit_date, type=self.visit_type, ) ] ) )[0] except Exception as e: logger.exception( "Failed to initialize origin_visit for %s", self.origin.url ) sentry_sdk.capture_exception(e) + self.status_load = self.status_visit = "failed" return {"status": "failed"} # Get the previous snapshot for this origin. It is then used to see which # of the package's versions are already loaded in the archive. try: last_snapshot = self.last_snapshot() logger.debug("last snapshot: %s", last_snapshot) except Exception as e: logger.exception("Failed to get previous state for %s", self.origin.url) sentry_sdk.capture_exception(e) return self.finalize_visit( snapshot=snapshot, visit=visit, failed_branches=failed_branches, status_visit="failed", status_load="failed", errors=[str(e)], ) load_exceptions: List[Exception] = [] # Get the list of all version names try: versions = self.get_versions() except NotFound as e: return self.finalize_visit( snapshot=snapshot, visit=visit, failed_branches=failed_branches, status_visit="not_found", status_load="failed", errors=[str(e)], ) except Exception as e: logger.exception("Failed to get list of versions for %s", self.origin.url) sentry_sdk.capture_exception(e) return self.finalize_visit( snapshot=snapshot, visit=visit, failed_branches=failed_branches, status_visit="failed", status_load="failed", errors=[str(e)], ) + errors = [] + # Get the metadata of each version's package - packages_info: List[Tuple[str, TPackageInfo]] = [ - (branch_name, p_info) - for version in versions - for (branch_name, p_info) in self.get_package_info(version) - ] + packages_info: List[Tuple[str, TPackageInfo]] = [] + for version in versions: + try: + for branch_name, p_info in self.get_package_info(version): + packages_info.append((branch_name, p_info)) + except Exception as e: + load_exceptions.append(e) + sentry_sdk.capture_exception(e) + error = f"Failed to get package info for version {version} of {self.origin.url}" + logger.exception(error) + errors.append(f"{error}: {e}") # Compute the ExtID of each of these packages known_extids = self._get_known_extids([p_info for (_, p_info) in packages_info]) if last_snapshot is None: last_snapshot_targets: Set[Sha1Git] = set() else: last_snapshot_targets = { branch.target for branch in last_snapshot.branches.values() } new_extids: Set[ExtID] = set() tmp_releases: Dict[str, List[Tuple[str, Sha1Git]]] = { version: [] for version in versions } - errors = [] + for (branch_name, p_info) in packages_info: logger.debug("package_info: %s", p_info) # Check if the package was already loaded, using its ExtID swhid = self.resolve_object_from_extids( known_extids, p_info, last_snapshot_targets ) if swhid is not None and 
swhid.object_type == ObjectType.REVISION: # This package was already loaded, but by an older version # of this loader, which produced revisions instead of releases. # Let's fetch the revision's data, and "upgrade" it into a release. (rev,) = self.storage.revision_get([swhid.object_id]) if not rev: logger.error( "Failed to upgrade branch %s from revision to " "release, %s is missing from the storage. " "Falling back to re-loading from the origin.", branch_name, swhid, ) else: rev = None if swhid is None or (swhid.object_type == ObjectType.REVISION and not rev): # No matching revision or release found in the last snapshot, load it. release_id = None try: res = self._load_release(p_info, origin) if res: (release_id, directory_id) = res assert release_id assert directory_id self._load_extrinsic_directory_metadata( p_info, release_id, directory_id ) self.storage.flush() - status_load = "eventful" + self.status_load = "eventful" except Exception as e: self.storage.clear_buffers() load_exceptions.append(e) sentry_sdk.capture_exception(e) error = f"Failed to load branch {branch_name} for {self.origin.url}" logger.exception(error) failed_branches.append(branch_name) errors.append(f"{error}: {e}") continue if release_id is None: continue add_extid = True elif swhid.object_type == ObjectType.REVISION: # If 'rev' was None, the previous block would have run. assert rev is not None rel = rev2rel(rev, p_info.version) self.storage.release_add([rel]) logger.debug("Upgraded %s to %s", swhid, rel.swhid()) release_id = rel.id # Create a new extid for this package, so the next run of this loader # will be able to find the new release, and use it (instead of the # old revision) add_extid = True elif swhid.object_type == ObjectType.RELEASE: # This package was already loaded, nothing to do. 
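# The target release is already in the archive and was referenced by the # previous snapshot, so it can simply be referenced again from the new # snapshot; no download, unpacking or storage write is needed for this version.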
release_id = swhid.object_id add_extid = False else: assert False, f"Unexpected object type: {swhid}" assert release_id is not None if add_extid: partial_extid = p_info.extid() if partial_extid is not None: (extid_type, extid_version, extid) = partial_extid release_swhid = CoreSWHID( object_type=ObjectType.RELEASE, object_id=release_id ) new_extids.add( ExtID( extid_type=extid_type, extid_version=extid_version, extid=extid, target=release_swhid, ) ) tmp_releases[p_info.version].append((branch_name, release_id)) if load_exceptions: - status_visit = "partial" + self.status_visit = "partial" if not tmp_releases: # We could not load any releases; fail completely logger.error("Failed to load any release for %s", self.origin.url) return self.finalize_visit( snapshot=snapshot, visit=visit, failed_branches=failed_branches, status_visit="failed", status_load="failed", errors=errors, ) try: # Retrieve the default release version (the "latest" one) default_version = self.get_default_version() logger.debug("default version: %s", default_version) # Retrieve extra branches extra_branches = self.extra_branches() logger.debug("extra branches: %s", extra_branches) snapshot = self._load_snapshot( default_version, tmp_releases, extra_branches ) self.storage.flush() except Exception as e: error = f"Failed to build snapshot for origin {self.origin.url}" logger.exception(error) errors.append(f"{error}: {e}") sentry_sdk.capture_exception(e) - status_visit = "failed" - status_load = "failed" + self.status_visit = "failed" + self.status_load = "failed" if snapshot: try: metadata_objects = self.build_extrinsic_snapshot_metadata(snapshot.id) self.load_metadata_objects(metadata_objects) except Exception as e: error = ( f"Failed to load extrinsic snapshot metadata for {self.origin.url}" ) logger.exception(error) errors.append(f"{error}: {e}") sentry_sdk.capture_exception(e) - status_visit = "partial" - status_load = "failed" + self.status_visit = "partial" + self.status_load = "failed" try: metadata_objects = self.build_extrinsic_origin_metadata() self.load_metadata_objects(metadata_objects) except Exception as e: error = f"Failed to load extrinsic origin metadata for {self.origin.url}" logger.exception(error) errors.append(f"{error}: {e}") sentry_sdk.capture_exception(e) - status_visit = "partial" - status_load = "failed" + self.status_visit = "partial" + self.status_load = "failed" - if status_load != "failed": + if self.status_load != "failed": self._load_extids(new_extids) return self.finalize_visit( snapshot=snapshot, visit=visit, failed_branches=failed_branches, - status_visit=status_visit, - status_load=status_load, + status_visit=self.status_visit, + status_load=self.status_load, errors=errors, ) def _load_directory( self, dl_artifacts: List[Tuple[str, Mapping[str, Any]]], tmpdir: str ) -> Tuple[str, from_disk.Directory]: uncompressed_path = self.uncompress(dl_artifacts, dest=tmpdir) logger.debug("uncompressed_path: %s", uncompressed_path) directory = from_disk.Directory.from_disk( path=uncompressed_path.encode("utf-8"), max_content_length=self.max_content_size, ) contents, skipped_contents, directories = from_disk.iter_directory(directory) logger.debug("Number of skipped contents: %s", len(skipped_contents)) self.storage.skipped_content_add(skipped_contents) logger.debug("Number of contents: %s", len(contents)) self.storage.content_add(contents) logger.debug("Number of directories: %s", len(directories)) self.storage.directory_add(directories) return (uncompressed_path, directory) def _load_release( self, 
p_info: TPackageInfo, origin ) -> Optional[Tuple[Sha1Git, Sha1Git]]: """Does all the loading of a release itself: * downloads a package and uncompresses it * loads it from disk * adds contents, directories, and release to self.storage * returns (release_id, directory_id) Raises exception when unable to download or uncompress artifacts """ with tempfile.TemporaryDirectory() as tmpdir: dl_artifacts = self.download_package(p_info, tmpdir) (uncompressed_path, directory) = self._load_directory(dl_artifacts, tmpdir) # FIXME: This should be release. cf. D409 release = self.build_release( p_info, uncompressed_path, directory=directory.hash ) if not release: # Some artifacts are missing intrinsic metadata # skipping those return None metadata = [metadata for (filepath, metadata) in dl_artifacts] assert release.target is not None, release assert release.target_type == ModelObjectType.DIRECTORY, release metadata_target = ExtendedSWHID( object_type=ExtendedObjectType.DIRECTORY, object_id=release.target ) original_artifact_metadata = RawExtrinsicMetadata( target=metadata_target, discovery_date=self.visit_date, authority=SWH_METADATA_AUTHORITY, fetcher=self.get_metadata_fetcher(), format="original-artifacts-json", metadata=json.dumps(metadata).encode(), origin=self.origin.url, release=release.swhid(), ) self.load_metadata_objects([original_artifact_metadata]) logger.debug("Release: %s", release) self.storage.release_add([release]) assert directory.hash return (release.id, directory.hash) def _load_snapshot( self, default_version: str, releases: Dict[str, List[Tuple[str, bytes]]], extra_branches: Dict[bytes, Mapping[str, Any]], ) -> Optional[Snapshot]: """Build snapshot out of the current releases stored and extra branches. Then load it in the storage. """ logger.debug("releases: %s", releases) # Build and load the snapshot branches = {} # type: Dict[bytes, Mapping[str, Any]] for version, branch_name_releases in releases.items(): if version == default_version and len(branch_name_releases) == 1: # only 1 branch (no ambiguity), we can create an alias # branch 'HEAD' branch_name, _ = branch_name_releases[0] # except for some corner case (deposit) if branch_name != "HEAD": branches[b"HEAD"] = { "target_type": "alias", "target": branch_name.encode("utf-8"), } for branch_name, target in branch_name_releases: branches[branch_name.encode("utf-8")] = { "target_type": "release", "target": target, } # Deal with extra-branches for name, branch_target in extra_branches.items(): if name in branches: error_message = f"Extra branch '{name!r}' has been ignored" logger.error(error_message) sentry_sdk.capture_message(error_message, "error") else: branches[name] = branch_target snapshot_data = {"branches": branches} logger.debug("snapshot: %s", snapshot_data) snapshot = Snapshot.from_dict(snapshot_data) logger.debug("snapshot: %s", snapshot) self.storage.snapshot_add([snapshot]) return snapshot def get_loader_name(self) -> str: """Returns a fully qualified name of this loader.""" return f"{self.__class__.__module__}.{self.__class__.__name__}" def get_loader_version(self) -> str: """Returns the version of the current loader.""" module_name = self.__class__.__module__ or "" module_name_parts = module_name.split(".") # Iterate rootward through the package hierarchy until we find a parent of this # loader's module with a __version__ attribute. 
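# For example, a loader class defined in swh.loader.package.pypi.loader is # looked up under swh.loader.package.pypi.loader, swh.loader.package.pypi, # swh.loader.package, swh.loader, then swh, in that order.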
for prefix_size in range(len(module_name_parts), 0, -1): package_name = ".".join(module_name_parts[0:prefix_size]) module = sys.modules[package_name] if hasattr(module, "__version__"): return module.__version__ # If this loader's class has no parent package with a __version__, # it should implement it itself. raise NotImplementedError( f"Could not dynamically find the version of {self.get_loader_name()}." ) def get_metadata_fetcher(self) -> MetadataFetcher: """Returns a MetadataFetcher instance representing this package loader, which is used for adding provenance information to extracted extrinsic metadata, if any.""" return MetadataFetcher( name=self.get_loader_name(), version=self.get_loader_version(), metadata={}, ) def get_metadata_authority(self) -> MetadataAuthority: """For package loaders that get extrinsic metadata, returns the authority the metadata are coming from. """ raise NotImplementedError("get_metadata_authority") def get_extrinsic_origin_metadata(self) -> List[RawExtrinsicMetadataCore]: """Returns metadata items, used by build_extrinsic_origin_metadata.""" return [] def build_extrinsic_origin_metadata(self) -> List[RawExtrinsicMetadata]: """Builds a list of full RawExtrinsicMetadata objects, using metadata returned by get_extrinsic_origin_metadata.""" metadata_items = self.get_extrinsic_origin_metadata() if not metadata_items: # If this package loader doesn't write metadata, no need to require # an implementation for get_metadata_authority. return [] authority = self.get_metadata_authority() fetcher = self.get_metadata_fetcher() metadata_objects = [] for item in metadata_items: metadata_objects.append( RawExtrinsicMetadata( target=self.origin.swhid(), discovery_date=item.discovery_date or self.visit_date, authority=authority, fetcher=fetcher, format=item.format, metadata=item.metadata, ) ) return metadata_objects def get_extrinsic_snapshot_metadata(self) -> List[RawExtrinsicMetadataCore]: """Returns metadata items, used by build_extrinsic_snapshot_metadata.""" return [] def build_extrinsic_snapshot_metadata( self, snapshot_id: Sha1Git ) -> List[RawExtrinsicMetadata]: """Builds a list of full RawExtrinsicMetadata objects, using metadata returned by get_extrinsic_snapshot_metadata.""" metadata_items = self.get_extrinsic_snapshot_metadata() if not metadata_items: # If this package loader doesn't write metadata, no need to require # an implementation for get_metadata_authority. return [] authority = self.get_metadata_authority() fetcher = self.get_metadata_fetcher() metadata_objects = [] for item in metadata_items: metadata_objects.append( RawExtrinsicMetadata( target=ExtendedSWHID( object_type=ExtendedObjectType.SNAPSHOT, object_id=snapshot_id ), discovery_date=item.discovery_date or self.visit_date, authority=authority, fetcher=fetcher, format=item.format, metadata=item.metadata, origin=self.origin.url, ) ) return metadata_objects def build_extrinsic_directory_metadata( self, p_info: TPackageInfo, release_id: Sha1Git, directory_id: Sha1Git, ) -> List[RawExtrinsicMetadata]: if not p_info.directory_extrinsic_metadata: # If this package loader doesn't write metadata, no need to require # an implementation for get_metadata_authority.
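# (directory_extrinsic_metadata is filled by loaders such as npm and PyPI # below, which record the raw registry metadata received for each artifact.)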
return [] authority = self.get_metadata_authority() fetcher = self.get_metadata_fetcher() metadata_objects = [] for item in p_info.directory_extrinsic_metadata: metadata_objects.append( RawExtrinsicMetadata( target=ExtendedSWHID( object_type=ExtendedObjectType.DIRECTORY, object_id=directory_id ), discovery_date=item.discovery_date or self.visit_date, authority=authority, fetcher=fetcher, format=item.format, metadata=item.metadata, origin=self.origin.url, release=CoreSWHID( object_type=ObjectType.RELEASE, object_id=release_id ), ) ) return metadata_objects def _load_extrinsic_directory_metadata( self, p_info: TPackageInfo, release_id: Sha1Git, directory_id: Sha1Git, ) -> None: metadata_objects = self.build_extrinsic_directory_metadata( p_info, release_id, directory_id ) self.load_metadata_objects(metadata_objects) def _load_extids(self, extids: Set[ExtID]) -> None: if not extids: return try: self.storage.extid_add(list(extids)) except Exception as e: logger.exception("Failed to load new ExtIDs for %s", self.origin.url) sentry_sdk.capture_exception(e) # No big deal, it just means the next visit will load the same versions # again. def rev2rel(rev: Revision, version: str) -> Release: """Converts a revision to a release.""" message = rev.message if message and not message.endswith(b"\n"): message += b"\n" return Release( name=version.encode(), message=message, target=rev.directory, target_type=ModelObjectType.DIRECTORY, synthetic=rev.synthetic, author=rev.author, date=rev.date, ) diff --git a/swh/loader/package/nixguix/loader.py b/swh/loader/package/nixguix/loader.py index 9648790..46eeaf0 100644 --- a/swh/loader/package/nixguix/loader.py +++ b/swh/loader/package/nixguix/loader.py @@ -1,308 +1,308 @@ # Copyright (C) 2020-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import copy import json import logging import re from typing import Any, Dict, Iterator, List, Mapping, Optional, Set, Tuple import attr from swh.loader.package.loader import ( BasePackageInfo, PackageLoader, PartialExtID, RawExtrinsicMetadataCore, ) -from swh.loader.package.utils import EMPTY_AUTHOR, api_info, cached_method +from swh.loader.package.utils import EMPTY_AUTHOR, cached_method, get_url_body from swh.model import hashutil from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, ObjectType, Release, Sha1Git, ) from swh.model.swhids import CoreSWHID from swh.storage.interface import StorageInterface logger = logging.getLogger(__name__) EXTID_TYPE = "subresource-integrity" """The ExtID is an ASCII string, as defined by https://w3c.github.io/webappsec-subresource-integrity/""" EXTID_VERSION = 0 @attr.s class NixGuixPackageInfo(BasePackageInfo): raw_info = attr.ib(type=Dict[str, Any]) integrity = attr.ib(type=str) """Hash of the archive, formatted as in the Subresource Integrity specification.""" @classmethod def from_metadata( cls, metadata: Dict[str, Any], version: str ) -> "NixGuixPackageInfo": return cls( url=metadata["url"], filename=None, version=version, integrity=metadata["integrity"], raw_info=metadata, ) def extid(self) -> PartialExtID: return (EXTID_TYPE, EXTID_VERSION, self.integrity.encode("ascii")) class NixGuixLoader(PackageLoader[NixGuixPackageInfo]): """Load sources from a sources.json file. This loader is used to load sources used by functional package managers (e.g. Nix and Guix).
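A parsed sources.json structure roughly has the following shape; this is a minimal, hypothetical example (see clean_sources below for the keys that are actually required): {"version": 1, "revision": "<some nixpkgs revision id>", "sources": [{"type": "url", "urls": ["https://example.org/pkg-1.0.tar.gz"], "integrity": "sha256-<base64 digest>"}]}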
""" visit_type = "nixguix" def __init__( self, storage: StorageInterface, url: str, unsupported_file_extensions: List[str] = [], **kwargs: Any, ): super().__init__(storage=storage, url=url, **kwargs) self.provider_url = url self.unsupported_file_extensions = unsupported_file_extensions # Note: this could be renamed get_artifacts in the PackageLoader # base class. @cached_method def raw_sources(self): return retrieve_sources(self.origin.url) @cached_method def supported_sources(self): raw_sources = self.raw_sources() return clean_sources( parse_sources(raw_sources), self.unsupported_file_extensions ) @cached_method def integrity_by_url(self) -> Dict[str, str]: sources = self.supported_sources() return {s["urls"][0]: s["integrity"] for s in sources["sources"]} def get_versions(self) -> List[str]: """The first mirror of the mirror list is used as branch name in the snapshot. """ return list(self.integrity_by_url().keys()) def get_metadata_authority(self): return MetadataAuthority( type=MetadataAuthorityType.FORGE, url=self.origin.url, metadata={}, ) def get_extrinsic_snapshot_metadata(self): return [ RawExtrinsicMetadataCore( format="nixguix-sources-json", metadata=self.raw_sources(), ), ] # Note: this could be renamed get_artifact_info in the PackageLoader # base class. def get_package_info(self, url) -> Iterator[Tuple[str, NixGuixPackageInfo]]: # TODO: try all mirrors and not only the first one. A source # can be fetched from several urls, called mirrors. We # currently only use the first one, but if the first one # fails, we should try the second one and so on. integrity = self.integrity_by_url()[url] p_info = NixGuixPackageInfo.from_metadata( {"url": url, "integrity": integrity}, version=url ) yield url, p_info def select_extid_target( self, p_info: NixGuixPackageInfo, extid_targets: Set[CoreSWHID] ) -> Optional[CoreSWHID]: if extid_targets: # The archive URL is part of the release name. As that URL is not # intrinsic metadata, it means different releases may be created for # the same SRI so they have the same extid. # Therefore, we need to pick the one with the right URL. releases = self.storage.release_get( [target.object_id for target in extid_targets] ) extid_targets = { release.swhid() for release in releases if release is not None and release.name == p_info.version.encode() } return super().select_extid_target(p_info, extid_targets) def extra_branches(self) -> Dict[bytes, Mapping[str, Any]]: """We add a branch to the snapshot called 'evaluation' pointing to the revision used to generate the sources.json file. This revision is specified in the sources.json file itself. For the nixpkgs origin, this revision is coming from the github.com/nixos/nixpkgs repository. Note this repository is not loaded explicitly. So, this pointer can target a nonexistent revision for a time. However, the github and gnu loaders are supposed to load this revision and should create the revision pointed by this branch. This branch can be used to identify the snapshot associated to a Nix/Guix evaluation. """ # The revision used to create the sources.json file. 
For Nix, # this revision belongs to the github.com/nixos/nixpkgs # repository revision = self.supported_sources()["revision"] return { b"evaluation": { "target_type": "revision", "target": hashutil.hash_to_bytes(revision), } } def build_release( self, p_info: NixGuixPackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Release]: return Release( name=p_info.version.encode(), message=None, author=EMPTY_AUTHOR, date=None, target=directory, target_type=ObjectType.DIRECTORY, synthetic=True, ) def retrieve_sources(url: str) -> bytes: """Retrieve sources. Potentially raise NotFound error.""" - return api_info(url, allow_redirects=True) + return get_url_body(url, allow_redirects=True) def parse_sources(raw_sources: bytes) -> Dict[str, Any]: return json.loads(raw_sources.decode("utf-8")) def make_pattern_unsupported_file_extension( unsupported_file_extensions: List[str], ): """Make a regexp pattern matching unsupported file extensions, out of a list of unsupported archive extensions. """ return re.compile( rf".*\.({'|'.join(map(re.escape, unsupported_file_extensions))})$", re.DOTALL ) def clean_sources( sources: Dict[str, Any], unsupported_file_extensions=[] ) -> Dict[str, Any]: """Validate and clean the sources structure. First, ensure all top level keys are present. Then, walk the sources list and remove sources that do not contain required keys. Filter out source entries whose: - required keys are missing - source type is not supported - urls attribute type is not a list - extension is known not to be supported by the loader Raises: ValueError if: - a required top level key is missing - top-level version is not 1 Returns: the cleaned up source Dict """ pattern_unsupported_file = make_pattern_unsupported_file_extension( unsupported_file_extensions ) # Required top level keys required_keys = ["version", "revision", "sources"] missing_keys = [] for required_key in required_keys: if required_key not in sources: missing_keys.append(required_key) if missing_keys != []: raise ValueError( f"sources structure invalid, missing: {','.join(missing_keys)}" ) # Only version 1 is currently supported version = int(sources["version"]) if version != 1: raise ValueError( f"The sources structure version '{sources['version']}' is not supported" ) # If a source doesn't contain required attributes, this source is # skipped but others could still be archived. verified_sources = [] for source in sources["sources"]: valid = True required_keys = ["urls", "integrity", "type"] for required_key in required_keys: if required_key not in source: logger.info( f"Skip source '{source}' because key '{required_key}' is missing", ) valid = False if valid and source["type"] != "url": logger.info( f"Skip source '{source}' because the type {source['type']} " "is not supported", ) valid = False if valid and not isinstance(source["urls"], list): logger.info( f"Skip source {source} because the urls attribute is not a list" ) valid = False if valid and len(source["urls"]) > 0: # Filter out unsupported archives supported_sources: List[str] = [] for source_url in source["urls"]: if pattern_unsupported_file.match(source_url): logger.info(f"Skip unsupported artifact url {source_url}") continue supported_sources.append(source_url) if len(supported_sources) == 0: logger.info( f"Skip source {source} because urls only reference " "unsupported artifacts.
Unsupported " f"artifacts so far: {pattern_unsupported_file}" ) continue new_source = copy.deepcopy(source) new_source["urls"] = supported_sources verified_sources.append(new_source) sources["sources"] = verified_sources return sources diff --git a/swh/loader/package/npm/loader.py b/swh/loader/package/npm/loader.py index 91081c6..a44e22d 100644 --- a/swh/loader/package/npm/loader.py +++ b/swh/loader/package/npm/loader.py @@ -1,300 +1,300 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from codecs import BOM_UTF8 import json import logging import os import string from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple, Union from urllib.parse import quote import attr import chardet from swh.loader.package.loader import ( BasePackageInfo, PackageLoader, RawExtrinsicMetadataCore, ) -from swh.loader.package.utils import api_info, cached_method, release_name +from swh.loader.package.utils import cached_method, get_url_body, release_name from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, ObjectType, Person, Release, Sha1Git, TimestampWithTimezone, ) from swh.storage.interface import StorageInterface logger = logging.getLogger(__name__) EMPTY_PERSON = Person.from_fullname(b"") @attr.s class NpmPackageInfo(BasePackageInfo): raw_info = attr.ib(type=Dict[str, Any]) package_name = attr.ib(type=str) date = attr.ib(type=Optional[str]) shasum = attr.ib(type=str) """sha1 checksum""" # we cannot rely only on $shasum, as it is technically possible for two versions # of the same package to have the exact same tarball. # But the release data (message and date) are extrinsic to the content of the # package, so they differ between versions. # So we need every attribute used to build the release object to be part of the # manifest. MANIFEST_FORMAT = string.Template( "date $date\nname $package_name\nshasum $shasum\nurl $url\nversion $version" ) EXTID_TYPE = "npm-manifest-sha256" EXTID_VERSION = 0 @classmethod def from_metadata( cls, project_metadata: Dict[str, Any], version: str ) -> "NpmPackageInfo": package_metadata = project_metadata["versions"][version] url = package_metadata["dist"]["tarball"] assert package_metadata["name"] == project_metadata["name"] # No date available in intrinsic metadata: retrieve it from the API # metadata, using the version number that the API claims this package # has. extrinsic_version = package_metadata["version"] if "time" in project_metadata: date = project_metadata["time"][extrinsic_version] elif "mtime" in package_metadata: date = package_metadata["mtime"] else: date = None return cls( package_name=package_metadata["name"], url=url, filename=os.path.basename(url), date=date, shasum=package_metadata["dist"]["shasum"], version=extrinsic_version, raw_info=package_metadata, directory_extrinsic_metadata=[ RawExtrinsicMetadataCore( format="replicate-npm-package-json", metadata=json.dumps(package_metadata).encode(), ) ], ) class NpmLoader(PackageLoader[NpmPackageInfo]): """Load npm origin's artifact releases into swh archive.""" visit_type = "npm" def __init__(self, storage: StorageInterface, url: str, **kwargs: Any): """Constructor Args str: origin url (e.g. 
https://www.npmjs.com/package/<package-name>) """ super().__init__(storage=storage, url=url, **kwargs) self.package_name = url.split("https://www.npmjs.com/package/")[1] safe_name = quote(self.package_name, safe="") self.provider_url = f"https://replicate.npmjs.com/{safe_name}/" self._info: Dict[str, Any] = {} self._versions = None @cached_method def _raw_info(self) -> bytes: - return api_info(self.provider_url) + return get_url_body(self.provider_url) @cached_method def info(self) -> Dict: """Return the project metadata information (fetched from npm registry)""" return json.loads(self._raw_info()) def get_versions(self) -> Sequence[str]: return sorted(list(self.info()["versions"].keys())) def get_default_version(self) -> str: return self.info()["dist-tags"].get("latest", "") def get_metadata_authority(self): return MetadataAuthority( type=MetadataAuthorityType.FORGE, url="https://npmjs.com/", metadata={}, ) def get_package_info(self, version: str) -> Iterator[Tuple[str, NpmPackageInfo]]: p_info = NpmPackageInfo.from_metadata( project_metadata=self.info(), version=version ) yield release_name(version), p_info def build_release( self, p_info: NpmPackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Release]: # Metadata from NPM is not intrinsic to tarballs. # This means two package versions can have the same tarball, but different # metadata. To avoid mixing up releases, every field used to build the # release object must be part of NpmPackageInfo.MANIFEST_FORMAT. i_metadata = extract_intrinsic_metadata(uncompressed_path) if not i_metadata: return None author = extract_npm_package_author(i_metadata) assert self.package_name == p_info.package_name msg = ( f"Synthetic release for NPM source package {p_info.package_name} " f"version {p_info.version}\n" ) if p_info.date is None: url = p_info.url artifact_name = os.path.basename(url) raise ValueError( "Origin %s: Cannot determine upload time for artifact %s." % (p_info.url, artifact_name) ) date = TimestampWithTimezone.from_iso8601(p_info.date) # FIXME: this is to remain bug-compatible with earlier versions: date = attr.evolve(date, timestamp=attr.evolve(date.timestamp, microseconds=0)) r = Release( name=p_info.version.encode(), message=msg.encode(), author=author, date=date, target=directory, target_type=ObjectType.DIRECTORY, synthetic=True, ) return r def _author_str(author_data: Union[Dict, List, str]) -> str: """Parse author from package.json author fields""" if isinstance(author_data, dict): author_str = "" name = author_data.get("name") if name is not None: if isinstance(name, str): author_str += name elif isinstance(name, list): author_str += _author_str(name[0]) if len(name) > 0 else "" email = author_data.get("email") if email is not None: author_str += f" <{email}>" result = author_str elif isinstance(author_data, list): result = _author_str(author_data[0]) if len(author_data) > 0 else "" else: result = author_data return result def extract_npm_package_author(package_json: Dict[str, Any]) -> Person: """ Extract package author from a ``package.json`` file content and return it in swh format.
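For instance, a hypothetical ``{"author": {"name": "Jane", "email": "jane@example.com"}}`` yields a Person with fullname ``b"Jane <jane@example.com>"``.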
Args: package_json: Dict holding the content of parsed ``package.json`` file Returns: Person """ for author_key in ("author", "authors"): if author_key in package_json: author_data = package_json[author_key] if author_data is None: return EMPTY_PERSON author_str = _author_str(author_data) return Person.from_fullname(author_str.encode()) return EMPTY_PERSON def _lstrip_bom(s, bom=BOM_UTF8): if s.startswith(bom): return s[len(bom) :] else: return s def load_json(json_bytes): """ Try to load JSON from bytes and return a dictionary. First try to decode from utf-8. If the decoding fails, try to detect the encoding and decode again with replace error handling. If JSON is malformed, an empty dictionary will be returned. Args: json_bytes (bytes): binary content of a JSON file Returns: dict: JSON data loaded in a dictionary """ json_data = {} try: json_str = _lstrip_bom(json_bytes).decode("utf-8") except UnicodeDecodeError: encoding = chardet.detect(json_bytes)["encoding"] if encoding: json_str = json_bytes.decode(encoding, "replace") try: json_data = json.loads(json_str) except json.decoder.JSONDecodeError: pass return json_data def extract_intrinsic_metadata(dir_path: str) -> Dict: """Given an uncompressed path holding the ``package.json`` file, returns the parsed package metadata as a dict. The release artifact contains a single folder at its root. For example: $ tar tvf zprint-0.0.6.tar.gz drwxr-xr-x root/root 0 2018-08-22 11:01 zprint-0.0.6/ ... Args: dir_path (str): Path to the uncompressed directory representing a release artifact from npm. Returns: the parsed package metadata as a dict, or an empty dict if none was present. """ # Retrieve the root folder of the archive if not os.path.exists(dir_path): return {} lst = os.listdir(dir_path) if len(lst) == 0: return {} project_dirname = lst[0] package_json_path = os.path.join(dir_path, project_dirname, "package.json") if not os.path.exists(package_json_path): return {} with open(package_json_path, "rb") as package_json_file: package_json_bytes = package_json_file.read() return load_json(package_json_bytes) diff --git a/swh/loader/package/pubdev/loader.py b/swh/loader/package/pubdev/loader.py index bcce138..608457a 100644 --- a/swh/loader/package/pubdev/loader.py +++ b/swh/loader/package/pubdev/loader.py @@ -1,194 +1,194 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json from pathlib import Path from typing import Any, Dict, Iterator, Optional, Sequence, Tuple import attr from packaging.version import parse as parse_version import yaml from swh.loader.package.loader import BasePackageInfo, PackageLoader from swh.loader.package.utils import ( EMPTY_AUTHOR, Person, - api_info, cached_method, + get_url_body, release_name, ) from swh.model.model import ObjectType, Release, Sha1Git, TimestampWithTimezone from swh.storage.interface import StorageInterface @attr.s class PubDevPackageInfo(BasePackageInfo): name = attr.ib(type=str) """Name of the package""" version = attr.ib(type=str) """Current version""" last_modified = attr.ib(type=str) """Last modified date as release date""" author = attr.ib(type=Person) """Author""" description = attr.ib(type=str) """Description""" def extract_intrinsic_metadata(dir_path: Path) -> Dict[str, Any]: """Extract intrinsic metadata from the pubspec.yaml file at dir_path.
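(A minimal, hypothetical pubspec.yaml declares at least name, version and description, e.g. ``name: foo``, ``version: 1.0.0``, ``description: A sample package``.)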
Each pub.dev package version has a pubspec.yaml file at the root of the archive. See https://dart.dev/tools/pub/pubspec for pubspec specifications. Args: dir_path: A directory on disk where a pubspec.yaml must be present Returns: A dict of the pubspec.yaml content, as returned by the YAML parser """ pubspec_path = dir_path / "pubspec.yaml" return yaml.safe_load(pubspec_path.read_text()) class PubDevLoader(PackageLoader[PubDevPackageInfo]): visit_type = "pubdev" PUBDEV_BASE_URL = "https://pub.dev/" def __init__( self, storage: StorageInterface, url: str, **kwargs, ): super().__init__(storage=storage, url=url, **kwargs) self.url = url assert url.startswith(self.PUBDEV_BASE_URL) self.package_info_url = url.replace( self.PUBDEV_BASE_URL, f"{self.PUBDEV_BASE_URL}api/" ) def _raw_info(self) -> bytes: - return api_info(self.package_info_url) + return get_url_body(self.package_info_url) @cached_method def info(self) -> Dict: """Return the project metadata information (fetched from pub.dev registry)""" # Use strict=False in order to correctly manage the case where \n is present in a string info = json.loads(self._raw_info(), strict=False) # Arrange versions list as a new dict with `version` as key versions = {v["version"]: v for v in info["versions"]} info["versions"] = versions return info def get_versions(self) -> Sequence[str]: """Get all released versions of a PubDev package Returns: A sequence of versions Example:: ["0.1.1", "0.10.2"] """ versions = list(self.info()["versions"].keys()) versions.sort(key=parse_version) return versions def get_default_version(self) -> str: """Get the newest release version of a PubDev package Returns: A string representing a version Example:: "0.1.2" """ latest = self.info()["latest"] return latest["version"] def get_package_info(self, version: str) -> Iterator[Tuple[str, PubDevPackageInfo]]: """Get release name and package information from version Package info comes from extrinsic metadata (from self.info()) Args: version: Package version (e.g. "0.1.0") Returns: Iterator of tuple (release_name, p_info) """ v = self.info()["versions"][version] assert v["version"] == version url = v["archive_url"] name = v["pubspec"]["name"] filename = f"{name}-{version}.tar.gz" last_modified = v["published"] if "authors" in v["pubspec"]: # TODO: here we have a list of authors, see T3887 author = Person.from_fullname(v["pubspec"]["authors"][0].encode()) elif "author" in v["pubspec"] and v["pubspec"]["author"] is not None: author = Person.from_fullname(v["pubspec"]["author"].encode()) else: author = EMPTY_AUTHOR description = v["pubspec"]["description"] p_info = PubDevPackageInfo( name=name, filename=filename, url=url, version=version, last_modified=last_modified, author=author, description=description, ) yield release_name(version), p_info def build_release( self, p_info: PubDevPackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Release]: # Extract intrinsic metadata from uncompressed_path/pubspec.yaml intrinsic_metadata = extract_intrinsic_metadata(Path(uncompressed_path)) name: str = intrinsic_metadata["name"] version: str = intrinsic_metadata["version"] assert version == p_info.version # author from intrinsic_metadata should not take precedence over the one # returned by the api, see https://dart.dev/tools/pub/pubspec#authorauthors author: Person = p_info.author if "description" in intrinsic_metadata and intrinsic_metadata["description"]: description = intrinsic_metadata["description"] else: description = p_info.description message = ( f"Synthetic release for pub.dev source package {name} "
f"version {version}\n\n" f"{description}\n" ) return Release( name=version.encode(), author=author, date=TimestampWithTimezone.from_iso8601(p_info.last_modified), message=message.encode(), target_type=ObjectType.DIRECTORY, target=directory, synthetic=True, ) diff --git a/swh/loader/package/pypi/loader.py b/swh/loader/package/pypi/loader.py index cb427a9..fe814f7 100644 --- a/swh/loader/package/pypi/loader.py +++ b/swh/loader/package/pypi/loader.py @@ -1,243 +1,248 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import logging import os from typing import Any, Dict, Iterator, Optional, Sequence, Tuple from urllib.parse import urlparse import attr from pkginfo import UnpackedSDist from swh.loader.package.loader import ( BasePackageInfo, PackageLoader, PartialExtID, RawExtrinsicMetadataCore, ) -from swh.loader.package.utils import EMPTY_AUTHOR, api_info, cached_method, release_name +from swh.loader.package.utils import ( + EMPTY_AUTHOR, + cached_method, + get_url_body, + release_name, +) from swh.model.hashutil import hash_to_bytes from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, ObjectType, Person, Release, Sha1Git, TimestampWithTimezone, ) from swh.storage.interface import StorageInterface logger = logging.getLogger(__name__) EXTID_TYPE = "pypi-archive-sha256" EXTID_VERSION = 0 @attr.s class PyPIPackageInfo(BasePackageInfo): raw_info = attr.ib(type=Dict[str, Any]) name = attr.ib(type=str) comment_text = attr.ib(type=Optional[str]) sha256 = attr.ib(type=str) upload_time = attr.ib(type=str) @classmethod def from_metadata( cls, metadata: Dict[str, Any], name: str, version: str ) -> "PyPIPackageInfo": return cls( url=metadata["url"], filename=metadata["filename"], version=version, raw_info=metadata, name=name, comment_text=metadata.get("comment_text"), sha256=metadata["digests"]["sha256"], upload_time=metadata["upload_time"], directory_extrinsic_metadata=[ RawExtrinsicMetadataCore( format="pypi-project-json", metadata=json.dumps(metadata).encode(), ) ], ) def extid(self) -> PartialExtID: return (EXTID_TYPE, EXTID_VERSION, hash_to_bytes(self.sha256)) class PyPILoader(PackageLoader[PyPIPackageInfo]): """Load pypi origin's artifact releases into swh archive.""" visit_type = "pypi" def __init__(self, storage: StorageInterface, url: str, **kwargs): super().__init__(storage=storage, url=url, **kwargs) self.provider_url = pypi_api_url(self.origin.url) @cached_method def _raw_info(self) -> bytes: - return api_info(self.provider_url) + return get_url_body(self.provider_url) @cached_method def info(self) -> Dict: """Return the project metadata information (fetched from pypi registry)""" return json.loads(self._raw_info()) def get_versions(self) -> Sequence[str]: return self.info()["releases"].keys() def get_default_version(self) -> str: return self.info()["info"]["version"] def get_metadata_authority(self): p_url = urlparse(self.origin.url) return MetadataAuthority( type=MetadataAuthorityType.FORGE, url=f"{p_url.scheme}://{p_url.netloc}/", metadata={}, ) def get_package_info(self, version: str) -> Iterator[Tuple[str, PyPIPackageInfo]]: res = [] for meta in self.info()["releases"][version]: # process only standard sdist archives if meta["packagetype"] != "sdist" or meta["filename"].lower().endswith( (".deb", ".egg", ".rpm", ".whl") ): continue p_info = 
PyPIPackageInfo.from_metadata( meta, name=self.info()["info"]["name"], version=version ) res.append((version, p_info)) if len(res) == 1: version, p_info = res[0] yield release_name(version), p_info else: for version, p_info in res: yield release_name(version, p_info.filename), p_info def build_release( self, p_info: PyPIPackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Release]: i_metadata = extract_intrinsic_metadata(uncompressed_path) if not i_metadata: return None # from intrinsic metadata version_ = i_metadata.get("version", p_info.version) author_ = author(i_metadata) if p_info.comment_text: msg = p_info.comment_text else: msg = ( f"Synthetic release for PyPI source package {p_info.name} " f"version {version_}\n" ) date = TimestampWithTimezone.from_iso8601(p_info.upload_time) return Release( name=p_info.version.encode(), message=msg.encode(), author=author_, date=date, target=directory, target_type=ObjectType.DIRECTORY, synthetic=True, ) def pypi_api_url(url: str) -> str: """Compute the api url from a project url Args: url (str): PyPI instance's url (e.g. https://pypi.org/project/requests) This deals with correctly transforming the project's url into its api url (e.g. https://pypi.org/pypi/requests/json) Returns: api url """ p_url = urlparse(url) project_name = p_url.path.rstrip("/").split("/")[-1] url = "%s://%s/pypi/%s/json" % (p_url.scheme, p_url.netloc, project_name) return url def extract_intrinsic_metadata(dir_path: str) -> Dict: """Given an uncompressed path holding the pkginfo file, returns a pkginfo parsed structure as a dict. The release artifact contains a single folder at its root. For example: $ tar tvf zprint-0.0.6.tar.gz drwxr-xr-x root/root 0 2018-08-22 11:01 zprint-0.0.6/ ... Args: dir_path (str): Path to the uncompressed directory representing a release artifact from pypi. Returns: the pkginfo parsed structure as a dict, or an empty dict if none was present. """ # Retrieve the root folder of the archive if not os.path.exists(dir_path): return {} lst = os.listdir(dir_path) if len(lst) != 1: return {} project_dirname = lst[0] pkginfo_path = os.path.join(dir_path, project_dirname, "PKG-INFO") if not os.path.exists(pkginfo_path): return {} pkginfo = UnpackedSDist(pkginfo_path) raw = pkginfo.__dict__ raw.pop("filename") # this gets added with the ondisk location return raw def author(data: Dict) -> Person: """Given a dict of project/release artifact information (coming from PyPI), returns an author subset. Args: data (dict): Representing either artifact information or release information. Returns: a swh-model Person object.
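Example: the hypothetical input ``{"author": "Jane Doe", "author_email": "jdoe@example.org"}`` yields a Person with fullname ``b"Jane Doe <jdoe@example.org>"``.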
""" name = data.get("author") email = data.get("author_email") fullname = None # type: Optional[str] if email: fullname = "%s <%s>" % (name, email) else: fullname = name if not fullname: return EMPTY_AUTHOR if name is not None: name = name.encode("utf-8") if email is not None: email = email.encode("utf-8") return Person(fullname=fullname.encode("utf-8"), name=name, email=email) diff --git a/swh/loader/package/tests/data/https_example.org/package_example_example-v1.0.tar.gz b/swh/loader/package/tests/data/https_example.org/package_example_example-v1.0.tar.gz new file mode 100644 index 0000000..dba14f3 Binary files /dev/null and b/swh/loader/package/tests/data/https_example.org/package_example_example-v1.0.tar.gz differ diff --git a/swh/loader/package/tests/data/https_example.org/package_example_example-v2.0.tar.gz b/swh/loader/package/tests/data/https_example.org/package_example_example-v2.0.tar.gz new file mode 100644 index 0000000..05f2f02 Binary files /dev/null and b/swh/loader/package/tests/data/https_example.org/package_example_example-v2.0.tar.gz differ diff --git a/swh/loader/package/tests/data/https_example.org/package_example_example-v3.0.tar.gz b/swh/loader/package/tests/data/https_example.org/package_example_example-v3.0.tar.gz new file mode 100644 index 0000000..a20afe9 Binary files /dev/null and b/swh/loader/package/tests/data/https_example.org/package_example_example-v3.0.tar.gz differ diff --git a/swh/loader/package/tests/data/https_example.org/package_example_example-v4.0.tar.gz b/swh/loader/package/tests/data/https_example.org/package_example_example-v4.0.tar.gz new file mode 100644 index 0000000..9a68b23 Binary files /dev/null and b/swh/loader/package/tests/data/https_example.org/package_example_example-v4.0.tar.gz differ diff --git a/swh/loader/package/tests/test_loader.py b/swh/loader/package/tests/test_loader.py index 1ea43dc..9cb719d 100644 --- a/swh/loader/package/tests/test_loader.py +++ b/swh/loader/package/tests/test_loader.py @@ -1,541 +1,637 @@ -# Copyright (C) 2019-2021 The Software Heritage developers +# Copyright (C) 2019-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import hashlib import logging import string +from typing import Optional from unittest.mock import Mock, call, patch import attr import pytest from swh.loader.core.loader import ( SENTRY_ORIGIN_URL_TAG_NAME, SENTRY_VISIT_TYPE_TAG_NAME, ) from swh.loader.package.loader import BasePackageInfo, PackageLoader +from swh.loader.package.utils import EMPTY_AUTHOR from swh.model.model import ( Origin, OriginVisit, OriginVisitStatus, Person, Release, Revision, RevisionType, + Sha1Git, Snapshot, SnapshotBranch, TargetType, TimestampWithTimezone, ) from swh.model.model import ExtID from swh.model.model import ObjectType as ModelObjectType from swh.model.swhids import CoreSWHID, ObjectType from swh.storage import get_storage from swh.storage.algos.snapshot import snapshot_get_latest class FakeStorage: def origin_add(self, origins): raise ValueError("We refuse to add an origin") def origin_visit_get_latest(self, origin): return None class FakeStorage2(FakeStorage): def origin_add(self, origins): pass def origin_visit_add(self, visits): raise ValueError("We refuse to add an origin visit") class StubPackageInfo(BasePackageInfo): pass +ORIGIN_URL = "https://example.org/package/example" + + class 
StubPackageLoader(PackageLoader[StubPackageInfo]): visit_type = "stub" def get_versions(self): return ["v1.0", "v2.0", "v3.0", "v4.0"] def get_package_info(self, version): + filename = f"example-{version}.tar.gz" p_info = StubPackageInfo( - "http://example.org", f"example-{version}.tar", version=version + f"{ORIGIN_URL}/{filename}", + filename, + version=version, ) extid_type = "extid-type1" if version in ("v1.0", "v2.0") else "extid-type2" # Versions 1.0 and 2.0 have an extid of a given type, v3.0 has an extid # of a different type patch.object( p_info, "extid", return_value=(extid_type, 0, f"extid-of-{version}".encode()), autospec=True, ).start() yield (f"branch-{version}", p_info) - def _load_release(self, p_info, origin): - return None + def build_release( + self, p_info: StubPackageInfo, uncompressed_path: str, directory: Sha1Git + ) -> Optional[Release]: + msg = ( + f"Synthetic release for source package {p_info.url} " + f"version {p_info.version}\n" + ) + + return Release( + name=p_info.version.encode(), + message=msg.encode(), + date=None, + author=EMPTY_AUTHOR, + target_type=ModelObjectType.DIRECTORY, + target=directory, + synthetic=True, + ) + + +def test_loader_origin_visit_success(swh_storage, requests_mock_datadir): + + loader = StubPackageLoader(swh_storage, ORIGIN_URL) + + assert loader.load() == { + "snapshot_id": "dcb9ecef64af73f2cdac7f5463cb6dece6b1db61", + "status": "eventful", + } + + assert loader.load_status() == {"status": "eventful"} + assert loader.visit_status() == "full" + + assert set(loader.last_snapshot().branches.keys()) == { + f"branch-{version}".encode() for version in loader.get_versions() + } def test_loader_origin_visit_failure(swh_storage): """Failure to add origin or origin visit should fail immediately""" loader = StubPackageLoader(swh_storage, "some-url") loader.storage = FakeStorage() actual_load_status = loader.load() assert actual_load_status == {"status": "failed"} + assert loader.load_status() == {"status": "failed"} + assert loader.visit_status() == "failed" + loader.storage = FakeStorage2() actual_load_status2 = loader.load() assert actual_load_status2 == {"status": "failed"} + assert loader.load_status() == {"status": "failed"} + assert loader.visit_status() == "failed" + def test_resolve_object_from_extids() -> None: storage = get_storage("memory") target = b"\x01" * 20 rel1 = Release( name=b"aaaa", message=b"aaaa", target=target, target_type=ModelObjectType.DIRECTORY, synthetic=False, ) rel2 = Release( name=b"bbbb", message=b"bbbb", target=target, target_type=ModelObjectType.DIRECTORY, synthetic=False, ) storage.release_add([rel1, rel2]) - loader = StubPackageLoader(storage, "http://example.org/") + loader = StubPackageLoader(storage, ORIGIN_URL) p_info = Mock(wraps=BasePackageInfo(None, None, None)) # type: ignore # The PackageInfo does not support extids p_info.extid.return_value = None known_extids = {("extid-type", 0, b"extid-of-aaaa"): [rel1.swhid()]} whitelist = {b"unused"} assert loader.resolve_object_from_extids(known_extids, p_info, whitelist) is None # Some known extid, and the PackageInfo is not one of them (ie. cache miss) p_info.extid.return_value = ("extid-type", 0, b"extid-of-cccc") assert loader.resolve_object_from_extids(known_extids, p_info, whitelist) is None # Some known extid, and the PackageInfo is one of them (ie.
cache hit), # but the target release was not in the previous snapshot p_info.extid.return_value = ("extid-type", 0, b"extid-of-aaaa") assert loader.resolve_object_from_extids(known_extids, p_info, whitelist) is None # Some known extid, and the PackageInfo is one of them (ie. cache hit), # and the target release was in the previous snapshot whitelist = {rel1.id} assert ( loader.resolve_object_from_extids(known_extids, p_info, whitelist) == rel1.swhid() ) # Same as before, but there is more than one extid, and only one is an allowed # release whitelist = {rel1.id} known_extids = {("extid-type", 0, b"extid-of-aaaa"): [rel2.swhid(), rel1.swhid()]} assert ( loader.resolve_object_from_extids(known_extids, p_info, whitelist) == rel1.swhid() ) def test_resolve_object_from_extids_missing_target() -> None: storage = get_storage("memory") target = b"\x01" * 20 rel = Release( name=b"aaaa", message=b"aaaa", target=target, target_type=ModelObjectType.DIRECTORY, synthetic=False, ) - loader = StubPackageLoader(storage, "http://example.org/") + loader = StubPackageLoader(storage, ORIGIN_URL) p_info = Mock(wraps=BasePackageInfo(None, None, None)) # type: ignore known_extids = {("extid-type", 0, b"extid-of-aaaa"): [rel.swhid()]} p_info.extid.return_value = ("extid-type", 0, b"extid-of-aaaa") whitelist = {rel.id} # Targeted release is missing from the storage assert loader.resolve_object_from_extids(known_extids, p_info, whitelist) is None storage.release_add([rel]) # Targeted release now exists assert ( loader.resolve_object_from_extids(known_extids, p_info, whitelist) == rel.swhid() ) def test_load_get_known_extids() -> None: """Checks PackageLoader.load() fetches known extids efficiently""" storage = Mock(wraps=get_storage("memory")) - loader = StubPackageLoader(storage, "http://example.org") + loader = StubPackageLoader(storage, ORIGIN_URL) loader.load() # Calls should be grouped by extid type storage.extid_get_from_extid.assert_has_calls( [ call("extid-type1", [b"extid-of-v1.0", b"extid-of-v2.0"], version=0), call("extid-type2", [b"extid-of-v3.0", b"extid-of-v4.0"], version=0), ], any_order=True, ) def test_load_extids() -> None: """Checks PackageLoader.load() skips iff it should, and writes (only) the new ExtIDs""" storage = get_storage("memory") dir_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=b"e" * 20) rels = [ Release( name=f"v{i}.0".encode(), message=b"blah\n", target=dir_swhid.object_id, target_type=ModelObjectType.DIRECTORY, synthetic=True, ) for i in (1, 2, 3, 4) ] storage.release_add(rels[0:3]) - origin = "http://example.org" + origin = ORIGIN_URL rel1_swhid = rels[0].swhid() rel2_swhid = rels[1].swhid() rel3_swhid = rels[2].swhid() rel4_swhid = rels[3].swhid() # Results of a previous load storage.extid_add( [ ExtID("extid-type1", b"extid-of-v1.0", rel1_swhid), ExtID("extid-type2", b"extid-of-v2.0", rel2_swhid), ] ) last_snapshot = Snapshot( branches={ b"v1.0": SnapshotBranch( target_type=TargetType.RELEASE, target=rel1_swhid.object_id ), b"v2.0": SnapshotBranch( target_type=TargetType.RELEASE, target=rel2_swhid.object_id ), b"v3.0": SnapshotBranch( target_type=TargetType.RELEASE, target=rel3_swhid.object_id ), } ) storage.snapshot_add([last_snapshot]) date = datetime.datetime.now(tz=datetime.timezone.utc) storage.origin_add([Origin(url=origin)]) storage.origin_visit_add( - [OriginVisit(origin="http://example.org", visit=1, date=date, type="tar")] + [OriginVisit(origin=origin, visit=1, date=date, type="tar")] ) storage.origin_visit_status_add( [ OriginVisitStatus( 
origin=origin, visit=1, status="full", date=date, snapshot=last_snapshot.id, ) ] ) - loader = StubPackageLoader(storage, "http://example.org") + loader = StubPackageLoader(storage, origin) patch.object( loader, "_load_release", return_value=(rel4_swhid.object_id, dir_swhid.object_id), autospec=True, ).start() loader.load() + assert loader.load_status() == {"status": "eventful"} + assert loader.visit_status() == "full" + assert loader._load_release.mock_calls == [ # type: ignore # v1.0: not loaded because there is already its (extid_type, extid, rel) # in the storage. # v2.0: loaded, because there is already a similar extid, but different type call( - StubPackageInfo(origin, "example-v2.0.tar", "v2.0"), + StubPackageInfo( + f"{origin}/example-v2.0.tar.gz", "example-v2.0.tar.gz", "v2.0" + ), Origin(url=origin), ), # v3.0: loaded despite having an (extid_type, extid) in storage, because # the target of the extid is not in the previous snapshot call( - StubPackageInfo(origin, "example-v3.0.tar", "v3.0"), + StubPackageInfo( + f"{origin}/example-v3.0.tar.gz", "example-v3.0.tar.gz", "v3.0" + ), Origin(url=origin), ), # v4.0: loaded, because there isn't its extid call( - StubPackageInfo(origin, "example-v4.0.tar", "v4.0"), + StubPackageInfo( + f"{origin}/example-v4.0.tar.gz", "example-v4.0.tar.gz", "v4.0" + ), Origin(url=origin), ), ] # then check the snapshot has all the branches. # versions 2.0 to 4.0 all point to rel4_swhid (instead of the value of the last # snapshot), because they had to be loaded (mismatched extid), and the mocked # _load_release always returns rel4_swhid. snapshot = Snapshot( branches={ b"branch-v1.0": SnapshotBranch( target_type=TargetType.RELEASE, target=rel1_swhid.object_id ), b"branch-v2.0": SnapshotBranch( target_type=TargetType.RELEASE, target=rel4_swhid.object_id ), b"branch-v3.0": SnapshotBranch( target_type=TargetType.RELEASE, target=rel4_swhid.object_id ), b"branch-v4.0": SnapshotBranch( target_type=TargetType.RELEASE, target=rel4_swhid.object_id ), } ) assert snapshot_get_latest(storage, origin) == snapshot extids = storage.extid_get_from_target( ObjectType.RELEASE, [ rel1_swhid.object_id, rel2_swhid.object_id, rel3_swhid.object_id, rel4_swhid.object_id, ], ) assert set(extids) == { # What we inserted at the beginning of the test: ExtID("extid-type1", b"extid-of-v1.0", rel1_swhid), ExtID("extid-type2", b"extid-of-v2.0", rel2_swhid), # Added by the loader: ExtID("extid-type1", b"extid-of-v2.0", rel4_swhid), ExtID("extid-type2", b"extid-of-v3.0", rel4_swhid), ExtID("extid-type2", b"extid-of-v4.0", rel4_swhid), } def test_load_upgrade_from_revision_extids(caplog): """Tests that, when loading incrementally based on a snapshot made by an old version of the loader, the loader will convert revisions to releases and add them to the storage. 
Also checks that, if an extid exists pointing to a non-existent revision (which should never happen, but you never know...), the release is loaded from scratch.""" storage = get_storage("memory") - origin = "http://example.org" + origin = ORIGIN_URL dir1_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=b"d" * 20) dir2_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=b"e" * 20) date = TimestampWithTimezone.from_datetime( datetime.datetime.now(tz=datetime.timezone.utc) ) person = Person.from_fullname(b"Jane Doe ") rev1 = Revision( message=b"blah", author=person, date=date, committer=person, committer_date=date, directory=dir1_swhid.object_id, type=RevisionType.TAR, synthetic=True, ) rel1 = Release( name=b"v1.0", message=b"blah\n", author=person, date=date, target=dir1_swhid.object_id, target_type=ModelObjectType.DIRECTORY, synthetic=True, ) rev1_swhid = rev1.swhid() rel1_swhid = rel1.swhid() rev2_swhid = CoreSWHID(object_type=ObjectType.REVISION, object_id=b"b" * 20) rel2_swhid = CoreSWHID(object_type=ObjectType.RELEASE, object_id=b"c" * 20) # Results of a previous load storage.extid_add( [ ExtID("extid-type1", b"extid-of-v1.0", rev1_swhid, 0), ExtID("extid-type1", b"extid-of-v2.0", rev2_swhid, 0), ] ) storage.revision_add([rev1]) last_snapshot = Snapshot( branches={ b"v1.0": SnapshotBranch( target_type=TargetType.REVISION, target=rev1_swhid.object_id ), b"v2.0": SnapshotBranch( target_type=TargetType.REVISION, target=rev2_swhid.object_id ), } ) storage.snapshot_add([last_snapshot]) date = datetime.datetime.now(tz=datetime.timezone.utc) storage.origin_add([Origin(url=origin)]) storage.origin_visit_add( - [OriginVisit(origin="http://example.org", visit=1, date=date, type="tar")] + [OriginVisit(origin=origin, visit=1, date=date, type="tar")] ) storage.origin_visit_status_add( [ OriginVisitStatus( origin=origin, visit=1, status="full", date=date, snapshot=last_snapshot.id, ) ] ) - loader = StubPackageLoader(storage, "http://example.org") + loader = StubPackageLoader(storage, origin) patch.object( loader, "_load_release", return_value=(rel2_swhid.object_id, dir2_swhid.object_id), autospec=True, ).start() patch.object( loader, "get_versions", return_value=["v1.0", "v2.0", "v3.0"], autospec=True, ).start() caplog.set_level(logging.ERROR) loader.load() + assert loader.load_status() == {"status": "eventful"} + assert loader.visit_status() == "full" + assert len(caplog.records) == 1 (record,) = caplog.records assert record.levelname == "ERROR" assert "Failed to upgrade branch branch-v2.0" in record.message assert loader._load_release.mock_calls == [ # v1.0: not loaded because there is already a revision matching it # v2.0: loaded, as the revision is missing from the storage even though there # is an extid - call(StubPackageInfo(origin, "example-v2.0.tar", "v2.0"), Origin(url=origin)), + call( + StubPackageInfo( + f"{origin}/example-v2.0.tar.gz", "example-v2.0.tar.gz", "v2.0" + ), + Origin(url=origin), + ), # v3.0: loaded (did not exist yet) - call(StubPackageInfo(origin, "example-v3.0.tar", "v3.0"), Origin(url=origin)), + call( + StubPackageInfo( + f"{origin}/example-v3.0.tar.gz", "example-v3.0.tar.gz", "v3.0" + ), + Origin(url=origin), + ), ] snapshot = Snapshot( branches={ b"branch-v1.0": SnapshotBranch( target_type=TargetType.RELEASE, target=rel1_swhid.object_id ), b"branch-v2.0": SnapshotBranch( target_type=TargetType.RELEASE, target=rel2_swhid.object_id ), b"branch-v3.0": SnapshotBranch( target_type=TargetType.RELEASE, target=rel2_swhid.object_id ), } ) assert 
 def test_manifest_extid():
     """Computing the primary key should return the right identity"""

     @attr.s
     class TestPackageInfo(BasePackageInfo):
         a = attr.ib()
         b = attr.ib()
         length = attr.ib()
         filename = attr.ib()

         MANIFEST_FORMAT = string.Template("$a $b")

     p_info = TestPackageInfo(
         url="http://example.org/",
         a=1,
         b=2,
         length=221837,
         filename="8sync-0.1.0.tar.gz",
         version="0.1.0",
     )

     actual_id = p_info.extid()
     assert actual_id == ("package-manifest-sha256", 0, hashlib.sha256(b"1 2").digest())


 def test_no_env_swh_config_filename_raise(monkeypatch):
     """No SWH_CONFIG_FILENAME environment variable makes package loader init raise"""

     class DummyPackageLoader(PackageLoader):
         """A dummy package loader for test purpose"""

         pass

     monkeypatch.delenv("SWH_CONFIG_FILENAME", raising=False)

     with pytest.raises(
         AssertionError, match="SWH_CONFIG_FILENAME environment variable is undefined"
     ):
         DummyPackageLoader.from_configfile(url="some-url")


 class StubPackageLoaderWithError(StubPackageLoader):
     def get_versions(self, *args, **kwargs):
         raise Exception("error")


 def test_loader_sentry_tags_on_error(swh_storage, sentry_events):
-    origin_url = "http://example.org/package/name"
+    origin_url = ORIGIN_URL
     loader = StubPackageLoaderWithError(swh_storage, origin_url)
     loader.load()
+    assert loader.load_status() == {"status": "failed"}
+    assert loader.visit_status() == "failed"
     sentry_tags = sentry_events[0]["tags"]
     assert sentry_tags.get(SENTRY_ORIGIN_URL_TAG_NAME) == origin_url
     assert (
         sentry_tags.get(SENTRY_VISIT_TYPE_TAG_NAME)
         == StubPackageLoaderWithError.visit_type
     )
+
+
+class StubPackageLoaderWithPackageInfoFailure(StubPackageLoader):
+    def get_package_info(self, version):
+        if version == "v2.0":
+            raise Exception("Error when getting package info")
+        else:
+            return super().get_package_info(version)
+
+
+def test_loader_origin_with_package_info_failure(swh_storage, requests_mock_datadir):
+    loader = StubPackageLoaderWithPackageInfoFailure(swh_storage, ORIGIN_URL)
+
+    assert loader.load() == {
+        "snapshot_id": "b4cce7081d661fb7f4d7a1db96e8044b752eb0b0",
+        "status": "eventful",
+    }
+
+    assert loader.load_status() == {"status": "eventful"}
+    assert loader.visit_status() == "partial"
+
+    assert set(loader.last_snapshot().branches.keys()) == {
+        f"branch-v{i}.0".encode() for i in (1, 3, 4)
+    }
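# The "partial" visit in the test above comes from the loader catching
# per-version failures instead of aborting the whole visit. A minimal sketch of
# that pattern, with hypothetical names (`loader`, `load_one`), not the actual
# PackageLoader code:
import logging

logger = logging.getLogger(__name__)


def load_all_versions(loader, versions, load_one):
    """Load each version independently; downgrade the visit on any failure."""
    visit_status = "full"
    for version in versions:
        try:
            p_info = loader.get_package_info(version)
        except Exception:
            logger.exception("Failed to get package info for version %s", version)
            visit_status = "partial"  # skip this version, keep the others
            continue
        load_one(p_info)  # hypothetical per-version loading step
    return visit_status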
diff --git a/swh/loader/package/tests/test_utils.py b/swh/loader/package/tests/test_utils.py
index bf1f4da..acff6af 100644
--- a/swh/loader/package/tests/test_utils.py
+++ b/swh/loader/package/tests/test_utils.py
@@ -1,273 +1,308 @@
 # Copyright (C) 2019-2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import json
 import os
 from unittest.mock import MagicMock
 from urllib.error import URLError
 from urllib.parse import quote

 import pytest
 from requests.exceptions import HTTPError

-from swh.loader.exception import NotFound
 import swh.loader.package
-from swh.loader.package.utils import api_info, download, release_name
+from swh.loader.package.utils import download, get_url_body, release_name


 def test_version_generation():
     assert (
         swh.loader.package.__version__ != "devel"
     ), "Make sure swh.loader.core is installed (e.g. pip install -e .)"


 @pytest.mark.fs
 def test_download_fail_to_download(tmp_path, requests_mock):
     url = "https://pypi.org/pypi/arrow/json"
     status_code = 404
     requests_mock.get(url, status_code=status_code)

     with pytest.raises(
         HTTPError, match=f"{status_code} Client Error: None for url: {url}"
     ):
         download(url, tmp_path)


 _filename = "requests-0.0.1.tar.gz"
 _data = "this is something"


 def _check_download_ok(url, dest, filename=_filename, hashes={}):
     actual_filepath, actual_hashes = download(url, dest, hashes=hashes)
     actual_filename = os.path.basename(actual_filepath)
     assert actual_filename == filename
     assert actual_hashes["length"] == len(_data)
     assert (
         actual_hashes["checksums"]["sha1"] == "fdd1ce606a904b08c816ba84f3125f2af44d92b2"
     )
     assert (
         actual_hashes["checksums"]["sha256"]
         == "1d9224378d77925d612c9f926eb9fb92850e6551def8328011b6a972323298d5"
     )


 @pytest.mark.fs
 def test_download_ok(tmp_path, requests_mock):
     """Download without issue should provide filename and hashes"""
     url = f"https://pypi.org/pypi/requests/{_filename}"
     requests_mock.get(url, text=_data, headers={"content-length": str(len(_data))})
     _check_download_ok(url, dest=str(tmp_path))


 @pytest.mark.fs
 def test_download_ok_no_header(tmp_path, requests_mock):
     """Download without a content-length header should still provide filename
     and hashes"""
     url = f"https://pypi.org/pypi/requests/{_filename}"
     requests_mock.get(url, text=_data)  # no header information
     _check_download_ok(url, dest=str(tmp_path))


 @pytest.mark.fs
 def test_download_ok_with_hashes(tmp_path, requests_mock):
     """Download without issue should provide filename and hashes"""
     url = f"https://pypi.org/pypi/requests/{_filename}"
     requests_mock.get(url, text=_data, headers={"content-length": str(len(_data))})

     # good hashes for such file
     good = {
         "sha1": "fdd1ce606a904b08c816ba84f3125f2af44d92b2",
         "sha256": "1d9224378d77925d612c9f926eb9fb92850e6551def8328011b6a972323298d5",  # noqa
     }

     _check_download_ok(url, dest=str(tmp_path), hashes=good)


 @pytest.mark.fs
 def test_download_fail_hashes_mismatch(tmp_path, requests_mock):
     """Mismatched hash after download should raise"""
     url = f"https://pypi.org/pypi/requests/{_filename}"
     requests_mock.get(url, text=_data, headers={"content-length": str(len(_data))})

     # good hashes for such file
     good = {
         "sha1": "fdd1ce606a904b08c816ba84f3125f2af44d92b2",
         "sha256": "1d9224378d77925d612c9f926eb9fb92850e6551def8328011b6a972323298d5",  # noqa
     }

     for hash_algo in good.keys():
         wrong_hash = good[hash_algo].replace("1", "0")
         expected_hashes = good.copy()
         expected_hashes[hash_algo] = wrong_hash  # set the wrong hash

         expected_msg = "Failure when fetching %s. " "Checksum mismatched: %s != %s" % (
             url,
             wrong_hash,
             good[hash_algo],
         )

         with pytest.raises(ValueError, match=expected_msg):
             download(url, dest=str(tmp_path), hashes=expected_hashes)
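# The checksum expectations above are computed with swh.model's MultiHash,
# which hashes a stream chunk by chunk; a small self-contained sketch of the
# same mechanism:
from swh.model.hashutil import MultiHash


def hash_stream(chunks):
    """Return hex digests (and total length) for an iterable of byte chunks."""
    h = MultiHash(hash_names={"sha1", "sha256", "length"})
    for chunk in chunks:
        h.update(chunk)
    return h.hexdigest()


# e.g. hash_stream([b"this is ", b"something"])["sha1"]
# == "fdd1ce606a904b08c816ba84f3125f2af44d92b2"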
" "Checksum mismatched: %s != %s" % ( url, wrong_hash, good[hash_algo], ) with pytest.raises(ValueError, match=expected_msg): download(url, dest=str(tmp_path), hashes=expected_hashes) @pytest.mark.fs def test_ftp_download_ok(tmp_path, mocker): """Download without issue should provide filename and hashes""" url = f"ftp://pypi.org/pypi/requests/{_filename}" cm = MagicMock() cm.getstatus.return_value = 200 cm.read.side_effect = [_data.encode(), b""] cm.__enter__.return_value = cm mocker.patch("swh.loader.package.utils.urlopen").return_value = cm _check_download_ok(url, dest=str(tmp_path)) @pytest.mark.fs def test_ftp_download_ko(tmp_path, mocker): """Download without issue should provide filename and hashes""" filename = "requests-0.0.1.tar.gz" url = "ftp://pypi.org/pypi/requests/%s" % filename mocker.patch("swh.loader.package.utils.urlopen").side_effect = URLError("FTP error") with pytest.raises(URLError): download(url, dest=str(tmp_path)) @pytest.mark.fs def test_download_with_redirection(tmp_path, requests_mock): """Download with redirection should use the targeted URL to extract filename""" url = "https://example.org/project/requests/download" redirection_url = f"https://example.org/project/requests/files/{_filename}" requests_mock.get(url, status_code=302, headers={"location": redirection_url}) requests_mock.get( redirection_url, text=_data, headers={"content-length": str(len(_data))} ) _check_download_ok(url, dest=str(tmp_path)) def test_download_extracting_filename_from_url(tmp_path, requests_mock): """Extracting filename from url must sanitize the filename first""" url = "https://example.org/project/requests-0.0.1.tar.gz?a=b&c=d&foo=bar" requests_mock.get( url, status_code=200, text=_data, headers={"content-length": str(len(_data))} ) _check_download_ok(url, dest=str(tmp_path)) @pytest.mark.fs @pytest.mark.parametrize( "filename", [f'"{_filename}"', _filename, '"filename with spaces.tar.gz"'] ) def test_download_filename_from_content_disposition(tmp_path, requests_mock, filename): """Filename should be extracted from content-disposition request header when available.""" url = "https://example.org/download/requests/tar.gz/v0.0.1" requests_mock.get( url, text=_data, headers={ "content-length": str(len(_data)), "content-disposition": f"attachment; filename={filename}", }, ) _check_download_ok(url, dest=str(tmp_path), filename=filename.strip('"')) @pytest.mark.fs @pytest.mark.parametrize("filename", ['"archive école.tar.gz"', "archive_école.tgz"]) def test_download_utf8_filename_from_content_disposition( tmp_path, requests_mock, filename ): """Filename should be extracted from content-disposition request header when available.""" url = "https://example.org/download/requests/tar.gz/v0.0.1" data = "this is something" requests_mock.get( url, text=data, headers={ "content-length": str(len(data)), "content-disposition": f"attachment; filename*=utf-8''{quote(filename)}", }, ) _check_download_ok(url, dest=str(tmp_path), filename=filename.strip('"')) def test_api_info_failure(requests_mock): """Failure to fetch info/release information should raise""" url = "https://pypi.org/pypi/requests/json" status_code = 400 requests_mock.get(url, status_code=status_code) - with pytest.raises(NotFound) as e0: - api_info(url) - - assert e0.value.args[0] == "Fail to query '%s'. 
Reason: %s" % (url, status_code) + with pytest.raises( + HTTPError, match=f"{status_code} Client Error: None for url: {url}" + ): + get_url_body(url) def test_api_info(requests_mock): """Fetching json info from pypi project should be ok""" url = "https://pypi.org/pypi/requests/json" requests_mock.get(url, text='{"version": "0.0.1"}') - actual_info = json.loads(api_info(url)) + actual_info = json.loads(get_url_body(url)) assert actual_info == { "version": "0.0.1", } def test_release_name(): for version, filename, expected_release in [ ("0.0.1", None, "releases/0.0.1"), ("0.0.2", "something", "releases/0.0.2/something"), ]: assert release_name(version, filename) == expected_release @pytest.fixture(autouse=True) def mock_download_retry_sleep(mocker): mocker.patch.object(download.retry, "sleep") def test_download_retry(mocker, requests_mock, tmp_path): url = f"https://example.org/project/requests/files/{_filename}" requests_mock.get( url, [ {"status_code": 429}, {"status_code": 429}, { "text": _data, "headers": {"content-length": str(len(_data))}, "status_code": 200, }, ], ) _check_download_ok(url, dest=str(tmp_path)) def test_download_retry_reraise(mocker, requests_mock, tmp_path): url = f"https://example.org/project/requests/files/{_filename}" requests_mock.get( url, [{"status_code": 429}] * 5, ) with pytest.raises(HTTPError): _check_download_ok(url, dest=str(tmp_path)) + + +@pytest.fixture(autouse=True) +def mock_api_info_retry_sleep(mocker): + mocker.patch.object(get_url_body.retry, "sleep") + + +def test_api_info_retry(mocker, requests_mock, tmp_path): + url = "https://example.org/api/endpoint" + json_data = {"foo": "bar"} + + requests_mock.get( + url, + [ + {"status_code": 429}, + {"status_code": 429}, + { + "json": json_data, + "status_code": 200, + }, + ], + ) + + assert json.loads(get_url_body(url)) == json_data + + +def test_api_info_retry_reraise(mocker, requests_mock, tmp_path): + url = "https://example.org/api/endpoint" + + requests_mock.get( + url, + [{"status_code": 429}] * 5, + ) + + with pytest.raises(HTTPError, match=f"429 Client Error: None for url: {url}"): + get_url_body(url) diff --git a/swh/loader/package/utils.py b/swh/loader/package/utils.py index df3127c..adf882b 100644 --- a/swh/loader/package/utils.py +++ b/swh/loader/package/utils.py @@ -1,207 +1,213 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import copy import functools import itertools import logging import os import re from typing import Callable, Dict, Optional, Tuple, TypeVar from urllib.parse import unquote, urlsplit from urllib.request import urlopen import requests from requests.exceptions import HTTPError from tenacity import retry from tenacity.before_sleep import before_sleep_log from tenacity.stop import stop_after_attempt from tenacity.wait import wait_exponential from swh.loader.exception import NotFound from swh.loader.package import DEFAULT_PARAMS from swh.model.hashutil import HASH_BLOCK_SIZE, MultiHash from swh.model.model import Person logger = logging.getLogger(__name__) DOWNLOAD_HASHES = set(["sha1", "sha256", "length"]) EMPTY_AUTHOR = Person.from_fullname(b"") -def api_info(url: str, **extra_params) -> bytes: - """Basic api client to retrieve information on project. This deals with - fetching json metadata about pypi projects. - - Args: - url (str): The api url (e.g PyPI, npm, etc...) 
diff --git a/swh/loader/package/utils.py b/swh/loader/package/utils.py
index df3127c..adf882b 100644
--- a/swh/loader/package/utils.py
+++ b/swh/loader/package/utils.py
@@ -1,207 +1,213 @@
 # Copyright (C) 2019-2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import copy
 import functools
 import itertools
 import logging
 import os
 import re
 from typing import Callable, Dict, Optional, Tuple, TypeVar
 from urllib.parse import unquote, urlsplit
 from urllib.request import urlopen

 import requests
 from requests.exceptions import HTTPError
 from tenacity import retry
 from tenacity.before_sleep import before_sleep_log
 from tenacity.stop import stop_after_attempt
 from tenacity.wait import wait_exponential

 from swh.loader.exception import NotFound
 from swh.loader.package import DEFAULT_PARAMS
 from swh.model.hashutil import HASH_BLOCK_SIZE, MultiHash
 from swh.model.model import Person

 logger = logging.getLogger(__name__)

 DOWNLOAD_HASHES = set(["sha1", "sha256", "length"])

 EMPTY_AUTHOR = Person.from_fullname(b"")


-def api_info(url: str, **extra_params) -> bytes:
-    """Basic api client to retrieve information on project. This deals with
-    fetching json metadata about pypi projects.
-
-    Args:
-        url (str): The api url (e.g PyPI, npm, etc...)
-
-    Raises:
-        NotFound in case of query failures (for some reasons: 404, ...)
-
-    Returns:
-        The associated response's information
-
-    """
-    response = requests.get(url, **{**DEFAULT_PARAMS, **extra_params})
-    if response.status_code != 200:
-        raise NotFound(f"Fail to query '{url}'. Reason: {response.status_code}")
-    return response.content
-
-
 def _content_disposition_filename(header: str) -> Optional[str]:
     fname = None
     fnames = re.findall(r"filename[\*]?=([^;]+)", header)
     if fnames and "utf-8''" in fnames[0].lower():
         # RFC 5987
         fname = re.sub("utf-8''", "", fnames[0], flags=re.IGNORECASE)
         fname = unquote(fname)
     elif fnames:
         fname = fnames[0]
     if fname:
         fname = os.path.basename(fname.strip().strip('"'))
     return fname


 def _retry_if_throttling(retry_state) -> bool:
     """Custom tenacity retry predicate for handling HTTP responses with
     status code 429 (too many requests).
     """
     attempt = retry_state.outcome
     if attempt.failed:
         exception = attempt.exception()
         return (
             isinstance(exception, HTTPError)
             and exception.response.status_code == 429
         )
     return False


-@retry(
+throttling_retry = retry(
     retry=_retry_if_throttling,
     wait=wait_exponential(exp_base=10),
     stop=stop_after_attempt(max_attempt_number=5),
     before_sleep=before_sleep_log(logger, logging.WARNING),
     reraise=True,
 )
+
+
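# Once the retry policy is a named object (throttling_retry), it can decorate
# any fetch helper. A minimal sketch of such reuse (hypothetical function, not
# part of this patch):
@throttling_retry
def head_url(url: str) -> int:
    """Return the status code for url, retrying transparently on HTTP 429."""
    response = requests.head(url, **DEFAULT_PARAMS)
    response.raise_for_status()
    return response.status_code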
+@throttling_retry
 def download(
     url: str,
     dest: str,
     hashes: Dict = {},
     filename: Optional[str] = None,
     auth: Optional[Tuple[str, str]] = None,
     extra_request_headers: Optional[Dict[str, str]] = None,
 ) -> Tuple[str, Dict]:
     """Download a remote file from url and compute swh hashes on it.

     Args:
         url: Artifact uri to fetch and hash
         dest: Directory to write the archive to
         hashes: Dict of expected hashes (key is the hash algo) for the artifact
             to download (those hashes are expected to be hex string)
         auth: Optional tuple of login/password (for http authentication
             service, e.g. deposit)

     Raises:
         ValueError in case of any error when fetching/computing (length,
         checksums mismatched...)

     Returns:
         Tuple of local (filepath, hashes of filepath)

     """
     params = copy.deepcopy(DEFAULT_PARAMS)
     if auth is not None:
         params["auth"] = auth
     if extra_request_headers is not None:
         params["headers"].update(extra_request_headers)
     # so the connection does not hang indefinitely (read/connection timeout)
     timeout = params.get("timeout", 60)

     if url.startswith("ftp://"):
         response = urlopen(url, timeout=timeout)
         chunks = (response.read(HASH_BLOCK_SIZE) for _ in itertools.count())
         response_data = itertools.takewhile(bool, chunks)
     else:
         response = requests.get(url, **params, timeout=timeout, stream=True)
         response.raise_for_status()
         # update URL to response one as requests follow redirection by default
         # on GET requests
         url = response.url
         # try to extract filename from content-disposition header if available
         if filename is None and "content-disposition" in response.headers:
             filename = _content_disposition_filename(
                 response.headers["content-disposition"]
             )
         response_data = response.iter_content(chunk_size=HASH_BLOCK_SIZE)

     filename = filename if filename else os.path.basename(urlsplit(url).path)

     logger.debug("filename: %s", filename)
     filepath = os.path.join(dest, filename)
     logger.debug("filepath: %s", filepath)

     h = MultiHash(hash_names=DOWNLOAD_HASHES | set(hashes.keys()))
     with open(filepath, "wb") as f:
         for chunk in response_data:
             h.update(chunk)
             f.write(chunk)

     response.close()

     # Also check the expected hashes if provided
     if hashes:
         actual_hashes = h.hexdigest()
         for algo_hash in hashes.keys():
             actual_digest = actual_hashes[algo_hash]
             expected_digest = hashes[algo_hash]
             if actual_digest != expected_digest:
                 raise ValueError(
                     "Failure when fetching %s. "
                     "Checksum mismatched: %s != %s"
                     % (url, expected_digest, actual_digest)
                 )

     computed_hashes = h.hexdigest()
     length = computed_hashes.pop("length")
     extrinsic_metadata = {
         "length": length,
         "filename": filename,
         "checksums": computed_hashes,
         "url": url,
     }

     logger.debug("extrinsic_metadata: %s", extrinsic_metadata)

     return filepath, extrinsic_metadata


+@throttling_retry
+def get_url_body(url: str, **extra_params) -> bytes:
+    """Basic HTTP client to retrieve information on a software package,
+    typically JSON metadata from a REST API.
+
+    Args:
+        url (str): An HTTP URL
+
+    Raises:
+        NotFound if the server replies with a 404 status code;
+        HTTPError for other HTTP errors (after retrying throttled requests)
+
+    Returns:
+        The body of the associated response
+
+    """
+    logger.debug("Fetching %s", url)
+    response = requests.get(url, **{**DEFAULT_PARAMS, **extra_params})
+    if response.status_code == 404:
+        raise NotFound(f"Fail to query '{url}'. Reason: {response.status_code}")
+    response.raise_for_status()
+    return response.content
+
+
 def release_name(version: str, filename: Optional[str] = None) -> str:
     if filename:
         return "releases/%s/%s" % (version, filename)
     return "releases/%s" % version


 TReturn = TypeVar("TReturn")
 TSelf = TypeVar("TSelf")

 _UNDEFINED = object()


 def cached_method(f: Callable[[TSelf], TReturn]) -> Callable[[TSelf], TReturn]:
     """Memoize a no-argument method's result on the instance."""
     cache_name = f"_cached_{f.__name__}"

     @functools.wraps(f)
     def newf(self):
         value = getattr(self, cache_name, _UNDEFINED)

         if value is _UNDEFINED:
             value = f(self)
             setattr(self, cache_name, value)

         return value

     return newf
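# A small usage sketch of cached_method (hypothetical class): the first call
# computes the value and stores it on the instance as `_cached_info`; later
# calls return the stored value without refetching.
class ProjectApi:
    def __init__(self, url: str):
        self.url = url

    @cached_method
    def info(self) -> bytes:
        return get_url_body(self.url)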