diff --git a/PKG-INFO b/PKG-INFO index 1462b4d..06e2a38 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,52 +1,52 @@ Metadata-Version: 2.1 Name: swh.loader.core -Version: 4.1.0 +Version: 4.2.0 Summary: Software Heritage Base Loader Home-page: https://forge.softwareheritage.org/diffusion/DLDBASE Author: Software Heritage developers Author-email: swh-devel@inria.fr Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-loader-core Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-loader-core/ Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE License-File: AUTHORS Software Heritage - Loader foundations ====================================== The Software Heritage Loader Core is a set of low-level loading utilities and helpers used by :term:`loaders <loader>`. The main entry points are classes: - :class:`swh.loader.core.loader.BaseLoader` for loaders (e.g. svn) - :class:`swh.loader.core.loader.DVCSLoader` for DVCS loaders (e.g. hg, git, ...) - :class:`swh.loader.package.loader.PackageLoader` for Package loaders (e.g. PyPI, Npm, ...) Package loaders --------------- This package also implements many package loaders directly, out of convenience, as they usually are quite similar and each fits in a single file. They all roughly follow these steps, explained in the :py:meth:`swh.loader.package.loader.PackageLoader.load` documentation. See the :ref:`package-loader-tutorial` for details. 
VCS loaders ----------- Unlike package loaders, VCS loaders remain in separate packages, as they often need more advanced conversions and very VCS-specific operations. This usually involves getting the branches of a repository and recursively loading revisions in the history (and directory trees in these revisions), until a known revision is found diff --git a/debian/changelog b/debian/changelog index 98d3d96..339cfd5 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,1817 +1,1827 @@ -swh-loader-core (4.1.0-1~swh1~bpo10+1) buster-swh; urgency=medium +swh-loader-core (4.2.0-1~swh1) unstable-swh; urgency=medium - * Rebuild for buster-swh - - -- Software Heritage autobuilder (on jenkins-debian1) Wed, 14 Sep 2022 07:43:50 +0000 + * New upstream release 4.2.0 - (tagged by Antoine R. Dumont + (@ardumont) on 2022-09-21 22:09:47 + +0200) + * Upstream changes: - v4.2.0 - package/utils: rename api_info + function to get_url_body - package/utils: Add debug log and + throttling retry to api_info - package/loader: Handle errors + when retrieving package version info - package/loader: Implement + load_status and visit_status - package/tests: Add a visit + success test for StubPackageLoader - golang: Ensure to include + dev version when no releases exist - golang: Ensure to case- + encode URLs for retrieving package info + + -- Software Heritage autobuilder (on jenkins-debian1) Wed, 21 Sep 2022 20:14:57 +0000 swh-loader-core (4.1.0-1~swh1) unstable-swh; urgency=medium * New upstream release 4.1.0 - (tagged by Antoine R. Dumont (@ardumont) on 2022-09-14 09:36:57 +0200) * Upstream changes: - v4.1.0 - Pubdev: Ensure we correctly parse and sort release versions - loader: Add origin URL and visit type as sentry tags -- Software Heritage autobuilder (on jenkins-debian1) Wed, 14 Sep 2022 07:41:42 +0000 swh-loader-core (4.0.0-1~swh2) unstable-swh; urgency=medium * Update build dependencies and bump new release -- Antoine R. 
Dumont (@ardumont) Fri, 09 Sep 2022 11:47:53 +0200 swh-loader-core (4.0.0-1~swh1) unstable-swh; urgency=medium * New upstream release 4.0.0 - (tagged by Antoine R. Dumont (@ardumont) on 2022-09-09 09:03:40 +0200) * Upstream changes: - v4.0.0 - New package loader Golang - New package loader pubdev - New package loader Arch Linux - New package loader Arch Linux User - New package loader Crates - docs: Mention caveats of using archive checksums as ExtID. - package/utils: Add retry policy to download in case of throttling - package/archive: Handle tarball artifact with null time - Initialize 'status' before try block - Always log an error when setting 'failed' status - Add method process_data(), run between fetch_data() and store_data() -- Software Heritage autobuilder (on jenkins-debian1) Fri, 09 Sep 2022 07:11:14 +0000 swh-loader-core (3.5.0-1~swh1) unstable-swh; urgency=medium * New upstream release 3.5.0 - (tagged by Valentin Lorentz on 2022-05-20 14:19:02 +0200) * Upstream changes: - v3.5.0 - * BaseLoader.flush: Return the output of storage.flush -- Software Heritage autobuilder (on jenkins-debian1) Fri, 20 May 2022 12:24:23 +0000 swh-loader-core (3.4.1-1~swh1) unstable-swh; urgency=medium * New upstream release 3.4.1 - (tagged by Antoine R. 
Dumont (@ardumont) on 2022-05-13 10:22:15 +0200) * Upstream changes: - v3.4.1 - Initialize the success boolean early to avoid unbound exception -- Software Heritage autobuilder (on jenkins-debian1) Fri, 13 May 2022 08:26:59 +0000 swh-loader-core (3.4.0-1~swh1) unstable-swh; urgency=medium * New upstream release 3.4.0 - (tagged by Valentin Lorentz on 2022-05-06 10:36:18 +0200) * Upstream changes: - v3.4.0 - * crates: Do not literalinclude JSON file in ExtrinsicPackageMetadata doc - * Add Sentry Captures - * maven: Use most recent release of a package as default version - * loader.core: Add statsd timing and metadata metrics -- Software Heritage autobuilder (on jenkins-debian1) Fri, 06 May 2022 08:42:14 +0000 swh-loader-core (3.3.0-1~swh1) unstable-swh; urgency=medium * New upstream release 3.3.0 - (tagged by Antoine R. Dumont (@ardumont) on 2022-04-29 14:46:13 +0200) * Upstream changes: - v3.3.0 - Rust lang, Crates loader - package/maven: Fix jar archive download after changes in lister -- Software Heritage autobuilder (on jenkins-debian1) Fri, 29 Apr 2022 12:51:12 +0000 swh-loader-core (3.2.0-1~swh1) unstable-swh; urgency=medium * New upstream release 3.2.0 - (tagged by Valentin Lorentz on 2022-04-27 16:26:36 +0200) * Upstream changes: - v3.2.0 - * Store the result of MetadataFetcher.get_parent_origins - * cli: Pass metadata_fetcher_credentials from the config to the loader -- Software Heritage autobuilder (on jenkins-debian1) Wed, 27 Apr 2022 14:31:04 +0000 swh-loader-core (3.1.0-1~swh1) unstable-swh; urgency=medium * New upstream release 3.1.0 - (tagged by Valentin Lorentz on 2022-04-26 11:36:30 +0200) * Upstream changes: - v3.1.0 - * package loaders: Simplify initialization - * BaseLoader: Add hook to call metadata fetchers before loading an origin - * pre-commit maintenance - * debian: Fix loading when md5sum is missing in dsc file -- Software Heritage autobuilder (on jenkins-debian1) Tue, 26 Apr 2022 09:41:57 +0000 swh-loader-core (3.0.0-1~swh1) unstable-swh; 
urgency=medium * New upstream release 3.0.0 - (tagged by Valentin Lorentz on 2022-04-21 10:27:07 +0200) * Upstream changes: - v3.0.0 - * Remove unused function BaseLoader.store_metadata. - * Remove unused BaseLoader.origin_metadata attribute - * Replace self.url with self.origin.url in package loaders - * BaseLoader: Add 'origin_url' argument and remove 'prepare_origin_visit' method -- Software Heritage autobuilder (on jenkins-debian1) Thu, 21 Apr 2022 08:31:52 +0000 swh-loader-core (2.6.2-1~swh1) unstable-swh; urgency=medium * New upstream release 2.6.2 - (tagged by Antoine R. Dumont (@ardumont) on 2022-04-14 11:46:06 +0200) * Upstream changes: - v2.6.2 - maven: Consistently read lister input to ingest a mvn origin -- Software Heritage autobuilder (on jenkins-debian1) Thu, 14 Apr 2022 09:53:26 +0000 swh-loader-core (2.6.1-1~swh1) unstable-swh; urgency=medium * New upstream release 2.6.1 - (tagged by Antoine R. Dumont (@ardumont) on 2022-04-08 11:03:06 +0200) * Upstream changes: - v2.6.1 - Rename metadata key in data received from the deposit server - origin/master npm: Add all fields we use to the ExtID manifest - npm: Include package version id in ExtID manifest -- Software Heritage autobuilder (on jenkins-debian1) Fri, 08 Apr 2022 09:13:17 +0000 swh-loader-core (2.6.0-1~swh1) unstable-swh; urgency=medium * New upstream release 2.6.0 - (tagged by Valentin Lorentz on 2022-03-02 13:54:45 +0100) * Upstream changes: - v2.6.0 - * Update for the new output format of the Deposit's API. -- Software Heritage autobuilder (on jenkins-debian1) Wed, 02 Mar 2022 12:58:43 +0000 swh-loader-core (2.5.4-1~swh2) unstable-swh; urgency=medium * Bump new release with opam tests deactivated -- Antoine R. Dumont (@ardumont) Fri, 25 Feb 2022 12:40:40 +0100 swh-loader-core (2.5.4-1~swh1) unstable-swh; urgency=medium * New upstream release 2.5.4 - (tagged by Antoine R. 
Dumont (@ardumont) on 2022-02-25 10:23:51 +0100) * Upstream changes: - v2.5.4 - loader/opam/tests: Do not run actual opam init command call -- Software Heritage autobuilder (on jenkins-debian1) Fri, 25 Feb 2022 09:28:10 +0000 swh-loader-core (2.5.3-1~swh1) unstable-swh; urgency=medium * New upstream release 2.5.3 - (tagged by Antoine R. Dumont (@ardumont) on 2022-02-24 16:02:53 +0100) * Upstream changes: - v2.5.3 - opam: Allow build to run the opam init completely -- Software Heritage autobuilder (on jenkins-debian1) Thu, 24 Feb 2022 15:07:20 +0000 swh-loader-core (2.5.2-1~swh1) unstable-swh; urgency=medium * New upstream release 2.5.2 - (tagged by Valentin Lorentz on 2022-02-24 09:52:26 +0100) * Upstream changes: - v2.5.2 - * deposit: Remove unused raw_info -- Software Heritage autobuilder (on jenkins-debian1) Thu, 24 Feb 2022 08:57:52 +0000 swh-loader-core (2.5.1-1~swh1) unstable-swh; urgency=medium * New upstream release 2.5.1 - (tagged by Antoine R. Dumont (@ardumont) on 2022-02-16 15:27:02 +0100) * Upstream changes: - v2.5.1 - Add URL and directory to CLI loader status echo - Fix load_maven scheduling task name - docs: Fix typo detected with codespell - pre-commit: Bump hooks and add new one to check commit message spelling -- Software Heritage autobuilder (on jenkins-debian1) Wed, 16 Feb 2022 14:30:47 +0000 swh-loader-core (2.5.0-1~swh1) unstable-swh; urgency=medium * New upstream release 2.5.0 - (tagged by Antoine R. 
Dumont (@ardumont) on 2022-02-08 10:46:14 +0100) * Upstream changes: - v2.5.0 - Move visit date helper from hg loader to core -- Software Heritage autobuilder (on jenkins-debian1) Tue, 08 Feb 2022 09:49:53 +0000 swh-loader-core (2.4.1-1~swh1) unstable-swh; urgency=medium * New upstream release 2.4.1 - (tagged by Nicolas Dandrimont on 2022-02-03 14:12:05 +0100) * Upstream changes: - Release swh.loader.core 2.4.1 - fix Person mangling -- Software Heritage autobuilder (on jenkins-debian1) Thu, 03 Feb 2022 13:17:35 +0000 swh-loader-core (2.3.0-1~swh1) unstable-swh; urgency=medium * New upstream release 2.3.0 - (tagged by Nicolas Dandrimont on 2022-01-24 11:18:43 +0100) * Upstream changes: - Release swh.loader.core - Stop using the deprecated 'TimestampWithTimezone.offset' attribute - Include clone_with_timeout utility from swh.loader.mercurial -- Software Heritage autobuilder (on jenkins-debian1) Mon, 24 Jan 2022 10:22:35 +0000 swh-loader-core (2.2.0-1~swh1) unstable-swh; urgency=medium * New upstream release 2.2.0 - (tagged by Antoine R. Dumont (@ardumont) on 2022-01-18 14:33:08 +0100) * Upstream changes: - v2.2.0 - tests: Replace 'offset' and 'negative_utc' with 'offset_bytes' - deposit: Remove 'negative_utc' from test data - tests: Use TimestampWithTimezone.from_datetime() instead of the constructor - Add releases notes (from user-provided Atom document) to release messages. 
- deposit: Strip 'offset_bytes' from date dicts to support swh-model 4.0.0 - Pin mypy and drop type annotations which makes mypy unhappy -- Software Heritage autobuilder (on jenkins-debian1) Tue, 18 Jan 2022 15:52:53 +0000 swh-loader-core (2.1.1-1~swh1) unstable-swh; urgency=medium * New upstream release 2.1.1 - (tagged by Valentin Lorentz on 2021-12-09 17:14:12 +0100) * Upstream changes: - v2.1.1 - * nixguix: Fix crash when filtering extids on archives that were already loaded, but only from different URLs -- Software Heritage autobuilder (on jenkins-debian1) Thu, 09 Dec 2021 16:17:54 +0000 swh-loader-core (2.1.0-1~swh1) unstable-swh; urgency=medium * New upstream release 2.1.0 - (tagged by Valentin Lorentz on 2021-12-09 16:34:51 +0100) * Upstream changes: - v2.1.0 - * maven: various refactorings - * nixguix: Filter out releases with URLs different from the expected one -- Software Heritage autobuilder (on jenkins-debian1) Thu, 09 Dec 2021 15:38:14 +0000 swh-loader-core (2.0.0-1~swh1) unstable-swh; urgency=medium * New upstream release 2.0.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-12-07 15:53:23 +0100) * Upstream changes: - v2.0.0 - package-loaders: Add support for extid versions, and bump it for Debian - debian: Remove the extrinsic version from release names - debian: Fix confusion between the two versions -- Software Heritage autobuilder (on jenkins-debian1) Tue, 07 Dec 2021 14:57:19 +0000 swh-loader-core (1.3.0-1~swh1) unstable-swh; urgency=medium * New upstream release 1.3.0 - (tagged by Antoine Lambert on 2021-12-07 10:54:49 +0100) * Upstream changes: - version 1.3.0 -- Software Heritage autobuilder (on jenkins-debian1) Tue, 07 Dec 2021 09:58:53 +0000 swh-loader-core (1.2.1-1~swh1) unstable-swh; urgency=medium * New upstream release 1.2.1 - (tagged by Antoine R. 
Dumont (@ardumont) on 2021-12-03 16:15:32 +0100) * Upstream changes: - v1.2.1 - package.loader: Deduplicate extid target -- Software Heritage autobuilder (on jenkins-debian1) Fri, 03 Dec 2021 15:19:13 +0000 swh-loader-core (1.2.0-1~swh1) unstable-swh; urgency=medium * New upstream release 1.2.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-12-03 12:16:04 +0100) * Upstream changes: - v1.2.0 - debian: Rename loading task function to fix scheduling - debian: Handle extra sha1 sum in source package metadata - debian: Remove unused date parameter of DebianLoader - package.loader: Deduplicate target SWHIDs - package-loader-tutorial: Update to mention releases instead of revisions - package-loader-tutorial: Add a checklist - package-loader-tutorial: Highlight the recommendation to submit the loader early. -- Software Heritage autobuilder (on jenkins-debian1) Fri, 03 Dec 2021 11:19:52 +0000 swh-loader-core (1.1.0-1~swh1) unstable-swh; urgency=medium * New upstream release 1.1.0 - (tagged by Valentin Lorentz on 2021-11-22 11:58:11 +0100) * Upstream changes: - v1.1.0 - * Package loader: Uniformize author and message -- Software Heritage autobuilder (on jenkins-debian1) Mon, 22 Nov 2021 11:01:45 +0000 swh-loader-core (1.0.1-1~swh1) unstable-swh; urgency=medium * New upstream release 1.0.1 - (tagged by Valentin Lorentz on 2021-11-10 14:47:52 +0100) * Upstream changes: - v1.0.1 - * utils: Add types and let log instruction do the formatting - * Fix tests when run by gbp on Sid. -- Software Heritage autobuilder (on jenkins-debian1) Wed, 10 Nov 2021 13:53:43 +0000 swh-loader-core (1.0.0-1~swh1) unstable-swh; urgency=medium * New upstream release 1.0.0 - (tagged by Valentin Lorentz on 2021-11-10 14:25:24 +0100) * Upstream changes: - v1.0.0 - Main change: this makes package loaders write releases instead of revisions - Other more-or-less related changes: - * Add missing documentation for `get_metadata_authority`. 
- * opam: Write package definitions to the extrinsic metadata storage - * deposit: Remove 'parent' deposit - * cleanup tests and unused code - * Document how each package loader populates fields. - * Refactor package loaders to make the version part of BasePackageInfo -- Software Heritage autobuilder (on jenkins-debian1) Wed, 10 Nov 2021 13:38:43 +0000 swh-loader-core (0.25.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.25.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-09-29 09:19:10 +0200) * Upstream changes: - v0.25.0 - Allow opam loader to actually use multi-instance opam root - opam: Define a initialize_opam_root parameter for opam loader -- Software Heritage autobuilder (on jenkins-debian1) Wed, 29 Sep 2021 07:26:12 +0000 swh-loader-core (0.23.5-1~swh1) unstable-swh; urgency=medium * New upstream release 0.23.5 - (tagged by Antoine R. Dumont (@ardumont) on 2021-09-24 17:31:22 +0200) * Upstream changes: - v0.23.5 - opam: Initialize opam root directory outside the constructor -- Software Heritage autobuilder (on jenkins-debian1) Fri, 24 Sep 2021 15:34:52 +0000 swh-loader-core (0.23.4-1~swh1) unstable-swh; urgency=medium * New upstream release 0.23.4 - (tagged by Antoine R. 
Dumont (@ardumont) on 2021-09-20 11:53:11 +0200) * Upstream changes: - v0.23.4 - Ensure that filename fallback out of an url is properly sanitized -- Software Heritage autobuilder (on jenkins-debian1) Mon, 20 Sep 2021 09:56:31 +0000 swh-loader-core (0.23.3-1~swh1) unstable-swh; urgency=medium * New upstream release 0.23.3 - (tagged by Antoine Lambert on 2021-09-16 10:47:40 +0200) * Upstream changes: - version 0.23.3 -- Software Heritage autobuilder (on jenkins-debian1) Thu, 16 Sep 2021 08:51:47 +0000 swh-loader-core (0.23.2-1~swh1) unstable-swh; urgency=medium * New upstream release 0.23.2 - (tagged by Valentin Lorentz on 2021-08-12 12:22:44 +0200) * Upstream changes: - v0.23.2 - * deposit: Update status_detail on loader failure -- Software Heritage autobuilder (on jenkins-debian1) Thu, 12 Aug 2021 10:25:44 +0000 swh-loader-core (0.23.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.23.1 - (tagged by Antoine R. Dumont (@ardumont) on 2021-08-05 16:11:02 +0200) * Upstream changes: - v0.23.1 - Fix pypi upload issue. 
-- Software Heritage autobuilder (on jenkins-debian1) Thu, 05 Aug 2021 14:20:37 +0000 swh-loader-core (0.22.3-1~swh1) unstable-swh; urgency=medium * New upstream release 0.22.3 - (tagged by Valentin Lorentz on 2021-06-25 14:50:40 +0200) * Upstream changes: - v0.22.3 - * Use the postgresql class to instantiate storage in tests - * package-loader-tutorial: Add anchor so it can be referenced from swh-docs -- Software Heritage autobuilder (on jenkins-debian1) Fri, 25 Jun 2021 12:57:33 +0000 swh-loader-core (0.22.2-1~swh1) unstable-swh; urgency=medium * New upstream release 0.22.2 - (tagged by Antoine Lambert on 2021-06-10 16:11:30 +0200) * Upstream changes: - version 0.22.2 -- Software Heritage autobuilder (on jenkins-debian1) Thu, 10 Jun 2021 14:19:06 +0000 swh-loader-core (0.22.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.22.1 - (tagged by Antoine Lambert on 2021-05-27 14:02:35 +0200) * Upstream changes: - version 0.22.1 -- Software Heritage autobuilder (on jenkins-debian1) Thu, 27 May 2021 12:20:04 +0000 swh-loader-core (0.22.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.22.0 - (tagged by Valentin Lorentz on 2021-04-15 15:13:56 +0200) * Upstream changes: - v0.22.0 - Documentation: - * Document the big picture view of VCS and package loaders - * Add a package loader tutorial. - * Write an overview of how to write VCS loaders. 
- * Fix various Sphinx warnings - Package loaders: - * Add sha512 as a valid field in dsc metadata - * package loaders: Stop reading/writing Revision.metadata -- Software Heritage autobuilder (on jenkins-debian1) Thu, 15 Apr 2021 13:18:13 +0000 swh-loader-core (0.21.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.21.0 - (tagged by Valentin Lorentz on 2021-03-30 17:19:13 +0200) * Upstream changes: - v0.21.0 - * tests: recompute ids when evolving RawExtrinsicMetadata objects, to support swh-model 2.0.0 - * deposit.loader: Make archive.tar the default_filename - * debian: Make resolve_revision_from use the sha256 of the .dsc - * package.loader.*: unify package "cache"/deduplication using ExtIDs - * package.loader: Lookup packages from the ExtID storage - * package.loader: Write to the ExtID storage -- Software Heritage autobuilder (on jenkins-debian1) Tue, 30 Mar 2021 15:26:35 +0000 swh-loader-core (0.20.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.20.0 - (tagged by Valentin Lorentz on 2021-03-02 10:52:18 +0100) * Upstream changes: - v0.20.0 - * RawExtrinsicMetadata: update to use the API in swh-model 1.0.0 -- Software Heritage autobuilder (on jenkins-debian1) Tue, 02 Mar 2021 09:57:21 +0000 swh-loader-core (0.19.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.19.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-02-25 15:52:12 +0100) * Upstream changes: - v0.19.0 - deposit: Make deposit loader deal with tarball as well - deposit: Update deposit status when the load status is 'partial' - Make finalize_visit a method instead of nested function. -- Software Heritage autobuilder (on jenkins-debian1) Thu, 25 Feb 2021 14:55:54 +0000 swh-loader-core (0.18.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.18.1 - (tagged by Antoine R. 
Dumont (@ardumont) on 2021-02-19 18:02:58 +0100) * Upstream changes: - v0.18.1 - nixguix: Fix missing max_content_size constructor parameter -- Software Heritage autobuilder (on jenkins-debian1) Fri, 19 Feb 2021 17:06:33 +0000 swh-loader-core (0.18.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.18.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-02-17 13:13:24 +0100) * Upstream changes: - v0.18.0 - core.loader: Merge Loader into BaseLoader - Unify loader instantiation - nixguix: Ensure interaction with the origin url for edge case tests -- Software Heritage autobuilder (on jenkins-debian1) Wed, 17 Feb 2021 12:16:47 +0000 swh-loader-core (0.17.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.17.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-02-11 11:20:55 +0100) * Upstream changes: - v0.17.0 - package: Mark visit as not_found when relevant - package: Mark visit status as failed when relevant - core: Allow vcs loaders to deal with not_found status - core: Mark visit status as failed when relevant - loader: Make loader write the origin_visit_status' type -- Software Heritage autobuilder (on jenkins-debian1) Thu, 11 Feb 2021 10:23:42 +0000 swh-loader-core (0.16.0-1~swh2) unstable-swh; urgency=medium * Bump dependencies -- Antoine R. Dumont (@ardumont) Wed, 03 Feb 2021 14:25:26 +0100 swh-loader-core (0.16.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.16.0 - (tagged by Antoine R. 
Dumont (@ardumont) on 2021-02-03 14:14:01 +0100) * Upstream changes: - v0.16.0 - Adapt origin_get_latest_visit_status according to latest api change - Add a cli section in the doc - tox.ini: Add swh.core[testing] requirement - Small docstring improvements in the deposit loader code -- Software Heritage autobuilder (on jenkins-debian1) Wed, 03 Feb 2021 13:17:30 +0000 swh-loader-core (0.15.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.15.0 - (tagged by Nicolas Dandrimont on 2020-11-03 17:21:21 +0100) * Upstream changes: - Release swh-loader-core v0.15.0 - Attach raw extrinsic metadata to directories, not revisions - Handle a bunch of deprecation warnings: - explicit args in swh.objstorage get_objstorage - id -> target for raw extrinsic metadata objects - positional arguments for storage.raw_extrinsic_metadata_get -- Software Heritage autobuilder (on jenkins-debian1) Tue, 03 Nov 2020 16:26:20 +0000 swh-loader-core (0.14.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.14.0 - (tagged by Valentin Lorentz on 2020-10-16 18:23:28 +0200) * Upstream changes: - v0.14.0 - * npm: write metadata on revisions instead of snapshots. - * pypi: write metadata on revisions instead of snapshots. - * deposit.loader: Avoid unnecessary metadata json transformation -- Software Heritage autobuilder (on jenkins-debian1) Fri, 16 Oct 2020 16:26:14 +0000 swh-loader-core (0.13.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.13.1 - (tagged by Antoine R. Dumont (@ardumont) on 2020-10-02 16:54:05 +0200) * Upstream changes: - v0.13.1 - core.loader: Allow config parameter passing through constructor - tox.ini: pin black to the pre-commit version (19.10b0) to avoid flip-flops -- Software Heritage autobuilder (on jenkins-debian1) Fri, 02 Oct 2020 14:55:59 +0000 swh-loader-core (0.13.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.13.0 - (tagged by Antoine R. 
Dumont (@ardumont) on 2020-10-02 13:18:55 +0200) * Upstream changes: - v0.13.0 - package.loader: Migrate away from SWHConfig mixin - core.loader: Migrate away from SWHConfig mixin - Expose deposit configuration only within the deposit tests -- Software Heritage autobuilder (on jenkins-debian1) Fri, 02 Oct 2020 11:21:55 +0000 swh-loader-core (0.12.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.12.0 - (tagged by Antoine R. Dumont (@ardumont) on 2020-10-01 16:03:45 +0200) * Upstream changes: - v0.12.0 - deposit: Adapt loader to send extrinsic raw metadata to the metadata storage - core.loader: Log information about origin currently being ingested - Adapt cli declaration entrypoint to swh.core 0.3 -- Software Heritage autobuilder (on jenkins-debian1) Thu, 01 Oct 2020 14:04:59 +0000 swh-loader-core (0.11.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.11.0 - (tagged by Antoine R. Dumont (@ardumont) on 2020-09-18 10:19:56 +0200) * Upstream changes: - v0.11.0 - loader: Stop materializing full lists of objects to be stored - tests.get_stats: Don't return a 'person' count - python: Reorder imports with isort - pre-commit: Add isort hook and configuration - pre-commit: Update flake8 hook configuration - cli: speedup the `swh` cli command startup time -- Software Heritage autobuilder (on jenkins-debian1) Fri, 18 Sep 2020 09:12:18 +0000 swh-loader-core (0.10.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.10.0 - (tagged by Antoine R. Dumont (@ardumont) on 2020-09-04 13:19:29 +0200) * Upstream changes: - v0.10.0 - loader: Adapt to latest storage revision_get change - origin/master Rename metadata format 'original-artifact-json' to 'original-artifacts-json'. - Tell pytest not to recurse in dotdirs. - package loader: Add the 'url' to the 'original_artifact' extrinsic metadata. - Write 'original_artifact' metadata to the extrinsic metadata storage. - Move parts of _load_revision to a new _load_directory method. 
- tests: Don't use naive datetimes. - package.loader: Split the warning message into multiple chunks - Replace calls to snapshot_get with snapshot_get_all_branches. -- Software Heritage autobuilder (on jenkins-debian1) Fri, 04 Sep 2020 11:28:09 +0000 swh-loader-core (0.9.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.9.1 - (tagged by Antoine R. Dumont (@ardumont) on 2020-08-08 14:47:52 +0200) * Upstream changes: - v0.9.1 - nixguix: Make the unsupported artifact extensions configurable - package.loader: Log a failure summary report at the end of the task -- Software Heritage autobuilder (on jenkins-debian1) Sat, 08 Aug 2020 12:51:33 +0000 swh-loader-core (0.9.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.9.0 - (tagged by Antoine R. Dumont (@ardumont) on 2020-08-07 22:57:14 +0200) * Upstream changes: - v0.9.0 - nixguix: Filter out unsupported artifact extensions - swh.loader.tests: Use snapshot_get_all_branches in check_snapshot - test_npm: Adapt content_get_metadata call to content_get - npm: Fix assertion to use the correct storage api -- Software Heritage autobuilder (on jenkins-debian1) Fri, 07 Aug 2020 21:00:40 +0000 swh-loader-core (0.8.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.8.1 - (tagged by Antoine R. Dumont (@ardumont) on 2020-08-06 16:48:38 +0200) * Upstream changes: - v0.8.1 - Adapt code according to storage signature -- Software Heritage autobuilder (on jenkins-debian1) Thu, 06 Aug 2020 14:50:39 +0000 swh-loader-core (0.8.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.8.0 - (tagged by Antoine R. 
Dumont (@ardumont) on 2020-08-05 10:16:36 +0200) * Upstream changes: - v0.8.0 - archive: fix docstring - nixguix: Fix docstring - nixguix: Align error message formatting using f-string - nixguix: Fix format issue in error message - Convert the 'metadata' and 'info' cached-properties/lazy-attributes into methods - cran: fix call to logger.warning - pypi: Load the content of the API's response as extrinsic snapshot metadata - Add a default value for RawExtrinsicMetadataCore.discovery_date - npm: Load the content of the API's response as extrinsic snapshot metadata - Make retrieve_sources use generic api_info instead of duplicating its code - nixguix: Load the content of sources.json as extrinsic snapshot metadata - Update tests to accept PagedResult from storage.raw_extrinsic_metadata_get -- Software Heritage autobuilder (on jenkins-debian1) Wed, 05 Aug 2020 08:19:20 +0000 swh-loader-core (0.7.3-1~swh1) unstable-swh; urgency=medium * New upstream release 0.7.3 - (tagged by Valentin Lorentz on 2020-07-30 19:16:21 +0200) * Upstream changes: - v0.7.3 - core.loader: Fix Iterable/List typing issues - package.loader: Fix type warning -- Software Heritage autobuilder (on jenkins-debian1) Thu, 30 Jul 2020 17:23:57 +0000 swh-loader-core (0.7.2-1~swh1) unstable-swh; urgency=medium * New upstream release 0.7.2 - (tagged by Valentin Lorentz on 2020-07-29 11:41:39 +0200) * Upstream changes: - v0.7.2 - * Fix typo in message logged on extrinsic metadata loading errors. - * Don't pass non-sequence iterables to the storage API. -- Software Heritage autobuilder (on jenkins-debian1) Wed, 29 Jul 2020 09:45:52 +0000 swh-loader-core (0.7.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.7.1 - (tagged by Antoine R. Dumont (@ardumont) on 2020-07-28 12:14:02 +0200) * Upstream changes: - v0.7.1 - Apply rename of object_metadata to raw_extrinsic_metadata. 
-- Software Heritage autobuilder (on jenkins-debian1) Tue, 28 Jul 2020 10:16:56 +0000 swh-loader-core (0.6.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.6.1 - (tagged by Antoine R. Dumont (@ardumont) on 2020-07-23 11:12:29 +0200) * Upstream changes: - v0.6.1 - npm.loader: Fix null author parsing corner case - npm.loader: Fix author parsing corner case - npm.loader: Extract _author_str function + add types, tests - core.loader: docs: Update origin_add reference -- Software Heritage autobuilder (on jenkins-debian1) Thu, 23 Jul 2020 09:15:41 +0000 swh-loader-core (0.6.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.6.0 - (tagged by Valentin Lorentz on 2020-07-20 13:23:22 +0200) * Upstream changes: - v0.6.0 - * Use the new object_metadata_add endpoint instead of origin_metadata_add. - * Apply renaming of MetadataAuthorityType.DEPOSIT to MetadataAuthorityType.DEPOSIT_CLIENT. -- Software Heritage autobuilder (on jenkins-debian1) Mon, 20 Jul 2020 11:27:53 +0000 swh-loader-core (0.5.10-1~swh1) unstable-swh; urgency=medium * New upstream release 0.5.10 - (tagged by Antoine R. Dumont (@ardumont) on 2020-07-17 15:10:42 +0200) * Upstream changes: - v0.5.10 - test_init: Decrease assertion checks so debian package builds fine - test_nixguix: Simplify the nixguix specific check_snapshot function -- Software Heritage autobuilder (on jenkins-debian1) Fri, 17 Jul 2020 13:13:19 +0000 swh-loader-core (0.5.9-1~swh1) unstable-swh; urgency=medium * New upstream release 0.5.9 - (tagged by Antoine R. Dumont (@ardumont) on 2020-07-17 11:52:38 +0200) * Upstream changes: - v0.5.9 - test.check_snapshot: Drop accepting using dict for snapshot comparison - test: Check against snapshot model object -- Software Heritage autobuilder (on jenkins-debian1) Fri, 17 Jul 2020 09:55:12 +0000 swh-loader-core (0.5.8-1~swh1) unstable-swh; urgency=medium * New upstream release 0.5.8 - (tagged by Antoine R. 
Dumont (@ardumont) on 2020-07-16 17:18:17 +0200) * Upstream changes: - v0.5.8 - test_init: Use snapshot object -- Software Heritage autobuilder (on jenkins-debian1) Thu, 16 Jul 2020 15:20:49 +0000 swh-loader-core (0.5.7-1~swh1) unstable-swh; urgency=medium * New upstream release 0.5.7 - (tagged by Antoine R. Dumont (@ardumont) on 2020-07-16 16:10:57 +0200) * Upstream changes: - v0.5.7 - test_init: Fix tests using the latest swh-storage fixture -- Software Heritage autobuilder (on jenkins-debian1) Thu, 16 Jul 2020 14:14:59 +0000 swh-loader-core (0.5.5-1~swh1) unstable-swh; urgency=medium * New upstream release 0.5.5 - (tagged by Antoine R. Dumont (@ardumont) on 2020-07-15 12:34:09 +0200) * Upstream changes: - v0.5.5 - check_snapshot: Check existence down to contents - Expose a pytest_plugin module so other loaders can reuse for tests - pytest: Remove no longer needed pytest setup - Fix branches types in tests - Small code improvement in package/loader.py -- Software Heritage autobuilder (on jenkins-debian1) Wed, 15 Jul 2020 10:37:11 +0000 swh-loader-core (0.5.4-1~swh1) unstable-swh; urgency=medium * New upstream release 0.5.4 - (tagged by Antoine R. Dumont (@ardumont) on 2020-07-10 09:52:21 +0200) * Upstream changes: - v0.5.4 - Clean up the swh.scheduler / swh.storage pytest plugin imports -- Software Heritage autobuilder (on jenkins-debian1) Fri, 10 Jul 2020 07:54:56 +0000 swh-loader-core (0.5.3-1~swh1) unstable-swh; urgency=medium * New upstream release 0.5.3 - (tagged by Antoine R. Dumont (@ardumont) on 2020-07-09 09:46:21 +0200) * Upstream changes: - v0.5.3 - Update the revision metadata field as an immutable dict - tests: Use dedicated storage and scheduler fixtures - loaders.tests: Simplify and add coverage to check_snapshot -- Software Heritage autobuilder (on jenkins-debian1) Thu, 09 Jul 2020 07:48:33 +0000 swh-loader-core (0.5.2-1~swh1) unstable-swh; urgency=medium * New upstream release 0.5.2 - (tagged by Antoine R. 
Dumont (@ardumont) on 2020-07-07 12:29:17 +0200) * Upstream changes: - v0.5.2 - nixguix/loader: Check further the source entry only if it's valid - nixguix/loader: Allow version both as string or integer - Move remaining common test utility functions to top-level arborescence - Move common test utility function to the top-level arborescence - Define common test helper function - Reuse swh.model.from_disk.iter_directory function -- Software Heritage autobuilder (on jenkins-debian1) Tue, 07 Jul 2020 10:31:36 +0000 swh-loader-core (0.5.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.5.1 - (tagged by Antoine R. Dumont (@ardumont) on 2020-07-01 12:32:54 +0200) * Upstream changes: - v0.5.1 - Use origin_add instead of deprecated origin_add_one endpoint - Migrate to use object's "object_type" field when computing objects -- Software Heritage autobuilder (on jenkins-debian1) Wed, 01 Jul 2020 10:34:59 +0000 swh-loader-core (0.5.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.5.0 - (tagged by Antoine R. Dumont (@ardumont) on 2020-06-29 13:18:41 +0200) * Upstream changes: - v0.5.0 - loader*: Drop obsolete origin visit fields -- Software Heritage autobuilder (on jenkins-debian1) Mon, 29 Jun 2020 11:20:59 +0000 swh-loader-core (0.4.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.4.0 - (tagged by Antoine R. Dumont (@ardumont) on 2020-06-23 15:02:20 +0200) * Upstream changes: - v0.4.0 - loader: Retrieve latest snapshot with snapshot-get-latest function -- Software Heritage autobuilder (on jenkins-debian1) Tue, 23 Jun 2020 13:14:09 +0000 swh-loader-core (0.3.2-1~swh1) unstable-swh; urgency=medium * New upstream release 0.3.2 - (tagged by Antoine R. 
Dumont (@ardumont) on 2020-06-22 15:13:05 +0200) * Upstream changes: - v0.3.2 - Add helper function to ensure loader visit are as expected -- Software Heritage autobuilder (on jenkins-debian1) Mon, 22 Jun 2020 13:15:41 +0000 swh-loader-core (0.3.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.3.1 - (tagged by Antoine Lambert on 2020-06-12 16:43:18 +0200) * Upstream changes: - version 0.3.1 -- Software Heritage autobuilder (on jenkins-debian1) Fri, 12 Jun 2020 14:47:42 +0000 swh-loader-core (0.3.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.3.0 - (tagged by Antoine R. Dumont (@ardumont) on 2020-06-12 11:05:41 +0200) * Upstream changes: - v0.3.0 - Migrate to new storage.origin_visit_add endpoint - loader: Migrate to origin visit status - test_deposits: Fix origin_metadata_get which is a paginated endpoint - Fix a potential UnboundLocalError in clean_dangling_folders() -- Software Heritage autobuilder (on jenkins-debian1) Fri, 12 Jun 2020 09:08:17 +0000 swh-loader-core (0.2.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.2.0 - (tagged by David Douard on 2020-06-04 14:20:08 +0200) * Upstream changes: - v0.2.0 -- Software Heritage autobuilder (on jenkins-debian1) Thu, 04 Jun 2020 12:25:57 +0000 swh-loader-core (0.1.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.1.0 - (tagged by Nicolas Dandrimont on 2020-05-29 16:01:11 +0200) * Upstream changes: - Release swh.loader.core v0.1.0 - Make sure partial visits don't reference unloaded snapshots - Ensure proper behavior when loading into partial archives (e.g. staging) - Improve test coverage -- Software Heritage autobuilder (on jenkins-debian1) Fri, 29 May 2020 14:05:36 +0000 swh-loader-core (0.0.97-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.97 - (tagged by Antoine R. 
Dumont (@ardumont) on 2020-05-26 14:22:51 +0200) * Upstream changes: - v0.0.97 - nixguix: catch and log artifact resolution failures - nixguix: Override known_artifacts to filter out "evaluation" branch - nixguix.tests: Add missing __init__ file -- Software Heritage autobuilder (on jenkins-debian1) Tue, 26 May 2020 12:25:35 +0000 swh-loader-core (0.0.96-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.96 - (tagged by Valentin Lorentz on 2020-05-19 18:42:23 +0200) * Upstream changes: - v0.0.96 - * Pass bytes instead a dict to origin_metadata_add. -- Software Heritage autobuilder (on jenkins-debian1) Tue, 19 May 2020 16:45:03 +0000 swh-loader-core (0.0.95-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.95 - (tagged by Valentin Lorentz on 2020-05-19 14:44:01 +0200) * Upstream changes: - v0.0.95 - * Use the new swh-storage API for storing metadata. -- Software Heritage autobuilder (on jenkins-debian1) Tue, 19 May 2020 12:47:48 +0000 swh-loader-core (0.0.94-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.94 - (tagged by Antoine R. Dumont (@ardumont) on 2020-05-15 12:49:22 +0200) * Upstream changes: - v0.0.94 - deposit: Adapt loader to use the latest deposit update api - tests: Use proper date initialization - setup.py: add documentation link -- Software Heritage autobuilder (on jenkins-debian1) Fri, 15 May 2020 10:52:16 +0000 swh-loader-core (0.0.93-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.93 - (tagged by Antoine R. Dumont (@ardumont) on 2020-04-23 16:43:16 +0200) * Upstream changes: - v0.0.93 - deposit.loader: Build revision out of the deposit api read metadata -- Software Heritage autobuilder (on jenkins-debian1) Thu, 23 Apr 2020 14:46:48 +0000 swh-loader-core (0.0.92-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.92 - (tagged by Antoine R. 
Dumont (@ardumont) on 2020-04-23 11:49:30 +0200) * Upstream changes: - v0.0.92 - deposit.loader: Fix revision metadata redundancy in deposit metadata - loader.deposit: Clarify FIXME intent - test_nixguix: Remove the incorrect fixme - test_nixguix: Add a fixme note on test_loader_two_visits - package.nixguix: Ensure the revisions are structurally sound -- Software Heritage autobuilder (on jenkins-debian1) Thu, 23 Apr 2020 09:52:18 +0000 swh-loader-core (0.0.91-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.91 - (tagged by Antoine R. Dumont (@ardumont) on 2020-04-21 15:59:55 +0200) * Upstream changes: - v0.0.91 - deposit.loader: Fix committer date appropriately - tests_deposit: Define specific requests_mock_datadir fixture - nixguix: Move helper function below the class definition - setup: Update the minimum required runtime python3 version -- Software Heritage autobuilder (on jenkins-debian1) Tue, 21 Apr 2020 14:02:51 +0000 swh-loader-core (0.0.90-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.90 - (tagged by Antoine R. Dumont (@ardumont) on 2020-04-15 14:27:01 +0200) * Upstream changes: - v0.0.90 - Improve exception handling -- Software Heritage autobuilder (on jenkins-debian1) Wed, 15 Apr 2020 12:30:07 +0000 swh-loader-core (0.0.89-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.89 - (tagged by Antoine R. Dumont (@ardumont) on 2020-04-14 15:48:15 +0200) * Upstream changes: - v0.0.89 - package.utils: Define a timeout on download connections - package.loader: Clear proxy buffer state when failing to load revision - Fix a couple of storage args deprecation warnings - cli: Sort loaders list and fix some tests - Add a pyproject.toml file to target py37 for black - Enable black -- Software Heritage autobuilder (on jenkins-debian1) Tue, 14 Apr 2020 15:30:08 +0000 swh-loader-core (0.0.88-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.88 - (tagged by Antoine R. 
Dumont (@ardumont) on 2020-04-03 15:52:07 +0200) * Upstream changes: - v0.0.88 - v0.0.88 nixguix: validate and clean sources.json structure -- Software Heritage autobuilder (on jenkins-debian1) Fri, 03 Apr 2020 13:54:24 +0000 swh-loader-core (0.0.87-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.87 - (tagged by Antoine R. Dumont (@ardumont) on 2020-04-02 14:37:37 +0200) * Upstream changes: - v0.0.87 - nixguix: rename the `url` source attribute to `urls` - nixguix: rename the test file - nixguix: add the integrity attribute in release metadata -- Software Heritage autobuilder (on jenkins-debian1) Thu, 02 Apr 2020 12:39:58 +0000 swh-loader-core (0.0.86-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.86 - (tagged by Antoine R. Dumont (@ardumont) on 2020-03-26 16:15:24 +0100) * Upstream changes: - v0.0.86 - core.loader: Remove origin_visit_update call from DVCSLoader class -- Software Heritage autobuilder (on jenkins-debian1) Thu, 26 Mar 2020 15:19:29 +0000 swh-loader-core (0.0.85-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.85 - (tagged by Antoine R. Dumont (@ardumont) on 2020-03-26 15:36:58 +0100) * Upstream changes: - v0.0.85 - core.loader: Allow core loader to update origin_visit in one call - Rename the functional loader to nixguix loader -- Software Heritage autobuilder (on jenkins-debian1) Thu, 26 Mar 2020 14:43:17 +0000 swh-loader-core (0.0.84-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.84 - (tagged by Antoine R. Dumont (@ardumont) on 2020-03-24 11:29:49 +0100) * Upstream changes: - v0.0.84 - test: Use storage endpoint to check latest origin visit status - package.loader: Fix status visit to 'partial' - package.loader: add a test to reproduce EOFError error -- Software Heritage autobuilder (on jenkins-debian1) Tue, 24 Mar 2020 10:32:55 +0000 swh-loader-core (0.0.83-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.83 - (tagged by Antoine R. 
Dumont (@ardumont) on 2020-03-23 15:16:14 +0100) * Upstream changes: - v0.0.83 - Make the swh.loader.package exception handling more granular - package.loader: Reference a snapshot on partial visit - package.loader: Extract a _load_snapshot method - functional: create a branch named evaluation pointing to the evaluation commit - package.loader: add extra_branches method -- Software Heritage autobuilder (on jenkins-debian1) Mon, 23 Mar 2020 14:19:43 +0000 swh-loader-core (0.0.82-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.82 - (tagged by Antoine R. Dumont (@ardumont) on 2020-03-18 11:55:48 +0100) * Upstream changes: - v0.0.82 - functional.loader: Add loader - package.loader: ignore non tarball source -- Software Heritage autobuilder (on jenkins-debian1) Wed, 18 Mar 2020 10:59:38 +0000 swh-loader-core (0.0.81-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.81 - (tagged by Antoine R. Dumont (@ardumont) on 2020-03-16 13:14:33 +0100) * Upstream changes: - v0.0.81 - Migrate to latest storage.origin_visit_add api change - Move Person parsing to swh- model. -- Software Heritage autobuilder (on jenkins-debian1) Mon, 16 Mar 2020 12:17:43 +0000 swh-loader-core (0.0.80-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.80 - (tagged by Valentin Lorentz on 2020-02-28 17:05:14 +0100) * Upstream changes: - v0.0.80 - * use swh-model objects instead of dicts. -- Software Heritage autobuilder (on jenkins-debian1) Fri, 28 Feb 2020 16:10:06 +0000 swh-loader-core (0.0.79-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.79 - (tagged by Antoine R. Dumont (@ardumont) on 2020-02-25 11:40:05 +0100) * Upstream changes: - v0.0.79 - Move revision loading logic to its own function. - Use swh-storage validation proxy earlier in the pipeline. - Use swh-storage validation proxy. - Add missing __init__.py and fix tests. 
-- Software Heritage autobuilder (on jenkins-debian1) Tue, 25 Feb 2020 10:48:07 +0000 swh-loader-core (0.0.78-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.78 - (tagged by Antoine R. Dumont (@ardumont) on 2020-02-06 15:28:11 +0100) * Upstream changes: - v0.0.78 - tests: Use new get_storage signature - loader.core.converters: Prefer the with open pattern to read file - test_converters: Add coverage on prepare_contents method - test_converters: Migrate to pytest - loader.core/package: Call storage's (skipped_)content_add endpoints -- Software Heritage autobuilder (on jenkins-debian1) Thu, 06 Feb 2020 15:09:05 +0000 swh-loader-core (0.0.77-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.77 - (tagged by Antoine R. Dumont (@ardumont) on 2020-01-30 10:32:08 +0100) * Upstream changes: - v0.0.77 - loader.npm: If no upload time provided, use artifact's mtime if provided - loader.npm: Fail ingestion if at least 1 artifact has no upload time -- Software Heritage autobuilder (on jenkins-debian1) Thu, 30 Jan 2020 09:37:58 +0000 swh-loader-core (0.0.76-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.76 - (tagged by Antoine R. Dumont (@ardumont) on 2020-01-28 13:07:30 +0100) * Upstream changes: - v0.0.76 - npm.loader: Skip artifacts with no intrinsic metadata - pypi.loader: Skip artifacts with no intrinsic metadata - package.loader: Fix edge case when some listing returns no content - core.loader: Drop retro- compatibility class names - loader.tests: Add filter and buffer proxy storage - docs: Fix sphinx warnings - README: Update class names -- Software Heritage autobuilder (on jenkins-debian1) Tue, 28 Jan 2020 12:11:07 +0000 swh-loader-core (0.0.75-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.75 - (tagged by Antoine R. 
Dumont (@ardumont) on 2020-01-16 14:14:29 +0100) * Upstream changes: - v0.0.75 - cran.loader: Align cran loader with other package loaders -- Software Heritage autobuilder (on jenkins-debian1) Thu, 16 Jan 2020 13:17:30 +0000 swh-loader-core (0.0.74-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.74 - (tagged by Antoine R. Dumont (@ardumont) on 2020-01-15 15:30:13 +0100) * Upstream changes: - v0.0.74 - Drop no longer used retrying dependency - core.loader: Clean up indirection and retry behavior - tests: Use retry proxy storage in loaders - core.loader: Drop dead code - cran.loader: Fix parsing description file error -- Software Heritage autobuilder (on jenkins-debian1) Wed, 15 Jan 2020 14:33:57 +0000 swh-loader-core (0.0.73-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.73 - (tagged by Antoine R. Dumont (@ardumont) on 2020-01-09 10:00:21 +0100) * Upstream changes: - v0.0.73 - package.cran: Name CRAN task appropriately -- Software Heritage autobuilder (on jenkins-debian1) Thu, 09 Jan 2020 09:05:07 +0000 swh-loader-core (0.0.72-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.72 - (tagged by Antoine R. Dumont (@ardumont) on 2020-01-06 16:37:58 +0100) * Upstream changes: - v0.0.72 - package.loader: Fail fast when unable to create origin/origin_visit - cran.loader: Add implementation -- Software Heritage autobuilder (on jenkins-debian1) Mon, 06 Jan 2020 15:50:08 +0000 swh-loader-core (0.0.71-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.71 - (tagged by Antoine R. Dumont (@ardumont) on 2019-12-20 14:22:31 +0100) * Upstream changes: - v0.0.71 - package.utils: Drop unneeded hashes from download computation -- Software Heritage autobuilder (on jenkins-debian1) Fri, 20 Dec 2019 13:26:09 +0000 swh-loader-core (0.0.70-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.70 - (tagged by Antoine R. 
Dumont (@ardumont) on 2019-12-20 11:32:09 +0100) * Upstream changes: - v0.0.70 - debian.loader: Improve and fix revision resolution's corner cases -- Software Heritage autobuilder (on jenkins-debian1) Fri, 20 Dec 2019 10:39:34 +0000 swh-loader-core (0.0.69-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.69 - (tagged by Antoine R. Dumont (@ardumont) on 2019-12-12 16:21:59 +0100) * Upstream changes: - v0.0.69 - loader.core: Fix correctly loader initialization -- Software Heritage autobuilder (on jenkins-debian1) Thu, 12 Dec 2019 15:26:13 +0000 swh-loader-core (0.0.68-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.68 - (tagged by Antoine R. Dumont (@ardumont) on 2019-12-12 15:45:21 +0100) * Upstream changes: - v0.0.68 - loader.core: Fix initialization issue in dvcs loaders -- Software Heritage autobuilder (on jenkins-debian1) Thu, 12 Dec 2019 14:49:12 +0000 swh-loader-core (0.0.67-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.67 - (tagged by Antoine R. Dumont (@ardumont) on 2019-12-12 14:02:47 +0100) * Upstream changes: - v0.0.67 - loader.core: Type methods - loader.core: Transform data input into list - loader.core: Add missing conversion step on content -- Software Heritage autobuilder (on jenkins-debian1) Thu, 12 Dec 2019 13:07:47 +0000 swh-loader-core (0.0.66-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.66 - (tagged by Antoine R. Dumont (@ardumont) on 2019-12-12 12:01:14 +0100) * Upstream changes: - v0.0.66 - Drop deprecated behavior -- Software Heritage autobuilder (on jenkins-debian1) Thu, 12 Dec 2019 11:05:17 +0000 swh-loader-core (0.0.65-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.65 - (tagged by Antoine R. 
Dumont (@ardumont) on 2019-12-12 11:42:46 +0100) * Upstream changes: - v0.0.65 - loader.cli: Improve current implementation - tasks: Enforce kwargs use in task message -- Software Heritage autobuilder (on jenkins-debian1) Thu, 12 Dec 2019 10:51:02 +0000 swh-loader-core (0.0.64-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.64 - (tagged by Antoine R. Dumont (@ardumont) on 2019-12-10 09:49:06 +0100) * Upstream changes: - v0.0.64 - requirements-test: Add missing test dependency - tests: Refactor using pytest-mock's mocker fixture - loader.cli: Add tests around cli - package.npm: Align loader instantiation - loader.cli: Reference new loader cli -- Software Heritage autobuilder (on jenkins-debian1) Tue, 10 Dec 2019 08:56:02 +0000 swh-loader-core (0.0.63-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.63 - (tagged by Antoine R. Dumont (@ardumont) on 2019-12-05 16:01:49 +0100) * Upstream changes: - v0.0.63 - Add missing inclusion instruction -- Software Heritage autobuilder (on jenkins-debian1) Thu, 05 Dec 2019 15:05:39 +0000 swh-loader-core (0.0.62-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.62 - (tagged by Antoine R. Dumont (@ardumont) on 2019-12-05 15:46:46 +0100) * Upstream changes: - v0.0.62 - Move package loaders to their own namespace -- Software Heritage autobuilder (on jenkins-debian1) Thu, 05 Dec 2019 14:50:19 +0000 swh-loader-core (0.0.61-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.61 - (tagged by Antoine R. Dumont (@ardumont) on 2019-11-28 17:25:49 +0100) * Upstream changes: - v0.0.61 - pypi: metadata -> revision: Deal with previous metadata format - npm: metadata -> revision: Deal with previous metadata format -- Software Heritage autobuilder (on jenkins-debian1) Thu, 28 Nov 2019 16:29:47 +0000 swh-loader-core (0.0.60-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.60 - (tagged by Antoine R. 
Dumont (@ardumont) on 2019-11-26 12:09:28 +0100) * Upstream changes: - v0.0.60 - package.deposit: Fix revision- get inconsistency - package.deposit: Provide parents in any case - package.deposit: Fix url computation issue - utils: Work around header issue during download -- Software Heritage autobuilder (on jenkins-debian1) Tue, 26 Nov 2019 11:18:41 +0000 swh-loader-core (0.0.59-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.59 - (tagged by Antoine R. Dumont (@ardumont) on 2019-11-22 18:11:33 +0100) * Upstream changes: - v0.0.59 - npm: Explicitly retrieve the revision date from extrinsic metadata -- Software Heritage autobuilder (on jenkins-debian1) Fri, 22 Nov 2019 17:15:34 +0000 swh-loader-core (0.0.58-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.58 - (tagged by Antoine R. Dumont (@ardumont) on 2019-11-22 12:08:10 +0100) * Upstream changes: - v0.0.58 - package.pypi: Filter out non- sdist package type -- Software Heritage autobuilder (on jenkins-debian1) Fri, 22 Nov 2019 11:11:56 +0000 swh-loader-core (0.0.57-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.57 - (tagged by Antoine R. Dumont (@ardumont) on 2019-11-22 11:26:11 +0100) * Upstream changes: - v0.0.57 - package.pypi: Fix project url computation edge case - Use pkg_resources to get the package version instead of vcversioner -- Software Heritage autobuilder (on jenkins-debian1) Fri, 22 Nov 2019 10:31:11 +0000 swh-loader-core (0.0.56-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.56 - (tagged by Antoine R. Dumont (@ardumont) on 2019-11-21 16:12:46 +0100) * Upstream changes: - v0.0.56 - package.tasks: Rename appropriately load_deb_package task type name - Fix typos reported by codespell - Add a pre-commit config file -- Software Heritage autobuilder (on jenkins-debian1) Thu, 21 Nov 2019 15:16:23 +0000 swh-loader-core (0.0.55-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.55 - (tagged by Antoine R. 
Dumont (@ardumont) on 2019-11-21 13:51:03 +0100) * Upstream changes: - v0.0.55 - package.tasks: Rename load_archive into load_archive_files - Migrate tox.ini to extras = xxx instead of deps = .[testing] - Merge tox test environments -- Software Heritage autobuilder (on jenkins-debian1) Thu, 21 Nov 2019 12:56:07 +0000 swh-loader-core (0.0.54-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.54 - (tagged by Antoine R. Dumont (@ardumont) on 2019-11-21 11:29:20 +0100) * Upstream changes: - v0.0.54 - loader.package.deposit: Drop swh.deposit.client requirement - Include all requirements in MANIFEST.in -- Software Heritage autobuilder (on jenkins-debian1) Thu, 21 Nov 2019 10:32:23 +0000 swh-loader-core (0.0.53-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.53 - (tagged by Antoine R. Dumont (@ardumont) on 2019-11-20 14:26:36 +0100) * Upstream changes: - v0.0.53 - loader.package.tasks: Document tasks - Define correctly the setup.py's entry_points -- Software Heritage autobuilder (on jenkins-debian1) Wed, 20 Nov 2019 13:30:10 +0000 swh-loader-core (0.0.52-1~swh3) unstable-swh; urgency=medium * Update dh-python version constraint -- Antoine R. Dumont (@ardumont) Wed, 20 Nov 2019 12:03:00 +0100 swh-loader-core (0.0.52-1~swh2) unstable-swh; urgency=medium * Add egg-info to pybuild.testfiles. -- Antoine R. Dumont (@ardumont) Wed, 20 Nov 2019 11:42:42 +0100 swh-loader-core (0.0.52-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.52 - (tagged by Antoine R. 
Dumont (@ardumont) on 2019-11-19 15:15:40 +0100) * Upstream changes: - v0.0.52 - Ensure BufferedLoader and UnbufferedLoader do flush their storage - loader.package: Register loader package tasks - package.tasks: Rename debian task to load_deb -- Software Heritage autobuilder (on jenkins-debian1) Tue, 19 Nov 2019 14:18:41 +0000 swh-loader-core (0.0.51-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.51 - (tagged by David Douard on 2019-11-18 17:05:17 +0100) * Upstream changes: - v0.0.51 -- Software Heritage autobuilder (on jenkins-debian1) Mon, 18 Nov 2019 16:09:44 +0000 swh-loader-core (0.0.50-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.50 - (tagged by Antoine R. Dumont (@ardumont) on 2019-11-13 15:56:55 +0100) * Upstream changes: - v0.0.50 - package.loader: Check snapshot_id is set as returned value - package.loader: Ensure the origin visit type is set appropriately - package.loader: Fix serialization issue - package.debian: Align origin_visit type to 'deb' as in production -- Software Heritage autobuilder (on jenkins-debian1) Wed, 13 Nov 2019 15:04:37 +0000 swh-loader-core (0.0.49-1~swh2) unstable-swh; urgency=medium * Update dependencies -- Antoine R. Dumont Fri, 08 Nov 2019 14:07:20 +0100 swh-loader-core (0.0.49-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.49 - (tagged by Antoine R. 
Dumont (@ardumont) on 2019-11-08 13:21:56 +0100) * Upstream changes: - v0.0.49 - New package loader implementations: archive, pypi, npm, deposit, debian -- Software Heritage autobuilder (on jenkins-debian1) Fri, 08 Nov 2019 12:29:47 +0000 swh-loader-core (0.0.48-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.48 - (tagged by Stefano Zacchiroli on 2019-10-01 16:49:39 +0200) * Upstream changes: - v0.0.48 - * typing: minimal changes to make a no-op mypy run pass -- Software Heritage autobuilder (on jenkins-debian1) Tue, 01 Oct 2019 14:52:59 +0000 swh-loader-core (0.0.47-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.47 - (tagged by Antoine Lambert on 2019-10-01 11:32:50 +0200) * Upstream changes: - version 0.0.47: Workaround HashCollision errors -- Software Heritage autobuilder (on jenkins-debian1) Tue, 01 Oct 2019 09:35:38 +0000 swh-loader-core (0.0.46-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.46 - (tagged by Antoine R. Dumont (@ardumont) on 2019-09-06 18:30:42 +0200) * Upstream changes: - v0.0.46 - pytest.ini: Remove warnings about our custom markers - pep8: Fix log.warning calls - core/loader: Fix get_save_data_path implementation - Fix validation errors in test. -- Software Heritage autobuilder (on jenkins-debian1) Fri, 06 Sep 2019 16:33:13 +0000 swh-loader-core (0.0.45-1~swh2) unstable-swh; urgency=medium * Fix missing build dependency -- Antoine R. Dumont (@ardumont) Tue, 03 Sep 2019 14:12:13 +0200 swh-loader-core (0.0.45-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.45 - (tagged by Antoine R. 
Dumont (@ardumont) on 2019-09-03 10:38:36 +0200) * Upstream changes: - v0.0.45 - loader: Provide visit type when calling origin_visit_add - loader: Drop keys 'perms' and 'path' from content before sending to the - storage - swh.loader.package: Implement GNU loader - docs: add code of conduct document -- Software Heritage autobuilder (on jenkins-debian1) Tue, 03 Sep 2019 08:41:49 +0000 swh-loader-core (0.0.44-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.44 - (tagged by Valentin Lorentz on 2019-06-25 12:18:27 +0200) * Upstream changes: - Drop use of deprecated methods fetch_history_* -- Software Heritage autobuilder (on jenkins-debian1) Wed, 26 Jun 2019 09:40:59 +0000 swh-loader-core (0.0.43-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.43 - (tagged by Valentin Lorentz on 2019-06-18 16:21:58 +0200) * Upstream changes: - Use origin urls instead of origin ids. -- Software Heritage autobuilder (on jenkins-debian1) Wed, 19 Jun 2019 09:33:53 +0000 swh-loader-core (0.0.42-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.42 - (tagged by David Douard on 2019-05-20 11:28:49 +0200) * Upstream changes: - v0.0.42 - update/fix requirements -- Software Heritage autobuilder (on jenkins-debian1) Mon, 20 May 2019 09:33:47 +0000 swh-loader-core (0.0.41-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.41 - (tagged by Antoine R. 
Dumont (@ardumont) on 2019-04-11 11:46:00 +0200) * Upstream changes: - v0.0.41 - core.loader: Migrate to latest snapshot_add, origin_visit_update api - core.loader: Count only the effectively new objects ingested - test_utils: Add coverage on utils module -- Software Heritage autobuilder (on jenkins-debian1) Thu, 11 Apr 2019 09:52:55 +0000 swh-loader-core (0.0.40-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.40 - (tagged by Antoine Lambert on 2019-03-29 10:57:14 +0100) * Upstream changes: - version 0.0.40 -- Software Heritage autobuilder (on jenkins-debian1) Fri, 29 Mar 2019 10:02:37 +0000 swh-loader-core (0.0.39-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.39 - (tagged by Antoine R. Dumont (@ardumont) on 2019-01-30 11:10:39 +0100) * Upstream changes: - v0.0.39 -- Software Heritage autobuilder (on jenkins-debian1) Wed, 30 Jan 2019 10:13:56 +0000 swh-loader-core (0.0.35-1~swh1) unstable-swh; urgency=medium * v0.0.35 * tests: Initialize tox.ini use * tests, debian/*: Migrate to pytest -- Antoine R. Dumont (@ardumont) Tue, 23 Oct 2018 15:47:22 +0200 swh-loader-core (0.0.34-1~swh1) unstable-swh; urgency=medium * v0.0.34 * setup: prepare for PyPI upload * README.md: Simplify module description * core.tests: Install tests fixture for derivative loaders to use -- Antoine R. Dumont (@ardumont) Tue, 09 Oct 2018 14:11:29 +0200 swh-loader-core (0.0.33-1~swh1) unstable-swh; urgency=medium * v0.0.33 * loader/utils: Add clean_dangling_folders function to ease clean up * loader/core: Add optional pre_cleanup for dangling files cleaning -- Antoine R. Dumont (@ardumont) Fri, 09 Mar 2018 14:41:17 +0100 swh-loader-core (0.0.32-1~swh1) unstable-swh; urgency=medium * v0.0.32 * Improve origin_visit initialization step * Properly sandbox the prepare statement so that if it breaks, we can * update appropriately the visit with the correct status -- Antoine R. 
Dumont (@ardumont) Wed, 07 Mar 2018 11:06:27 +0100 swh-loader-core (0.0.31-1~swh1) unstable-swh; urgency=medium * Release swh.loader.core v0.0.31 * Remove backwards-compatibility when sending snapshots -- Nicolas Dandrimont Tue, 13 Feb 2018 18:52:20 +0100 swh-loader-core (0.0.30-1~swh1) unstable-swh; urgency=medium * Release swh.loader.core v0.0.30 * Update Debian metadata for snapshot-related breakage -- Nicolas Dandrimont Tue, 06 Feb 2018 14:22:53 +0100 swh-loader-core (0.0.29-1~swh1) unstable-swh; urgency=medium * Release swh.loader.core v0.0.29 * Replace occurrences with snapshots * Enhance logging on error cases -- Nicolas Dandrimont Tue, 06 Feb 2018 14:13:11 +0100 swh-loader-core (0.0.28-1~swh1) unstable-swh; urgency=medium * v0.0.28 * Add stateless loader base class * Remove bare exception handlers -- Antoine R. Dumont (@ardumont) Tue, 19 Dec 2017 17:48:09 +0100 swh-loader-core (0.0.27-1~swh1) unstable-swh; urgency=medium * v0.0.27 * Migrate from indexer's indexer_configuration to storage's tool notion. -- Antoine R. Dumont (@ardumont) Thu, 07 Dec 2017 10:36:23 +0100 swh-loader-core (0.0.26-1~swh1) unstable-swh; urgency=medium * v0.0.26 * Fix send_provider method -- Antoine R. Dumont (@ardumont) Tue, 05 Dec 2017 15:40:57 +0100 swh-loader-core (0.0.25-1~swh1) unstable-swh; urgency=medium * v0.0.25 * swh.loader.core: Fix to retrieve the provider_id as an actual id * swh.loader.core: Fix log format error * swh.loader.core: Align log message according to conventions -- Antoine R. Dumont (@ardumont) Wed, 29 Nov 2017 12:55:45 +0100 swh-loader-core (0.0.24-1~swh1) unstable-swh; urgency=medium * v0.0.24 * Added metadata injection possible from loader core -- Antoine R. Dumont (@ardumont) Fri, 24 Nov 2017 11:35:40 +0100 swh-loader-core (0.0.23-1~swh1) unstable-swh; urgency=medium * v0.0.23 * loader: Fix dangling data flush -- Antoine R. 
Dumont (@ardumont) Tue, 07 Nov 2017 16:25:20 +0100 swh-loader-core (0.0.22-1~swh1) unstable-swh; urgency=medium * v0.0.22 * core.loader: Use the global setup set in swh.core.config * core.loader: Properly batch object insertions for big requests -- Antoine R. Dumont (@ardumont) Mon, 30 Oct 2017 18:50:00 +0100 swh-loader-core (0.0.21-1~swh1) unstable-swh; urgency=medium * v0.0.21 * swh.loader.core: Only send origin if not already sent before -- Antoine R. Dumont (@ardumont) Tue, 24 Oct 2017 16:30:53 +0200 swh-loader-core (0.0.20-1~swh1) unstable-swh; urgency=medium * v0.0.20 * Permit to add 'post_load' actions in loaders -- Antoine R. Dumont (@ardumont) Fri, 13 Oct 2017 14:30:37 +0200 swh-loader-core (0.0.19-1~swh1) unstable-swh; urgency=medium * v0.0.19 * Permit to add 'post_load' actions in loaders -- Antoine R. Dumont (@ardumont) Fri, 13 Oct 2017 14:14:14 +0200 swh-loader-core (0.0.18-1~swh1) unstable-swh; urgency=medium * Release swh.loader.core version 0.0.18 * Update packaging runes -- Nicolas Dandrimont Thu, 12 Oct 2017 18:07:53 +0200 swh-loader-core (0.0.17-1~swh1) unstable-swh; urgency=medium * Release swh.loader.core v0.0.17 * Allow iterating when fetching and storing data * Allow overriding the status of the loaded visit * Allow overriding the status of the load itself -- Nicolas Dandrimont Wed, 11 Oct 2017 16:38:29 +0200 swh-loader-core (0.0.16-1~swh1) unstable-swh; urgency=medium * Release swh.loader.core v0.0.16 * Migrate from swh.model.git to swh.model.from_disk -- Nicolas Dandrimont Fri, 06 Oct 2017 14:46:41 +0200 swh-loader-core (0.0.15-1~swh1) unstable-swh; urgency=medium * v0.0.15 * docs: Add sphinx apidoc generation skeleton * docs: Add a simple README.md explaining the module's goal * swh.loader.core.loader: Unify origin_visit add/update function call -- Antoine R. Dumont (@ardumont) Fri, 29 Sep 2017 11:47:37 +0200 swh-loader-core (0.0.14-1~swh1) unstable-swh; urgency=medium * v0.0.14 * Add the blake2s256 hash computation -- Antoine R. 
Dumont (@ardumont) Sat, 25 Mar 2017 18:20:52 +0100 swh-loader-core (0.0.13-1~swh1) unstable-swh; urgency=medium * v0.0.13 * Improve core loader's interface api -- Antoine R. Dumont (@ardumont) Wed, 22 Feb 2017 13:43:54 +0100 swh-loader-core (0.0.12-1~swh1) unstable-swh; urgency=medium * v0.0.12 * Update storage configuration reading -- Antoine R. Dumont (@ardumont) Thu, 15 Dec 2016 18:34:41 +0100 swh-loader-core (0.0.11-1~swh1) unstable-swh; urgency=medium * v0.0.11 * d/control: Bump dependency to latest storage * Fix: Objects can be injected even though global loading failed * Populate the counters in fetch_history * Open open/close fetch_history function in the core loader -- Antoine R. Dumont (@ardumont) Wed, 24 Aug 2016 14:38:55 +0200 swh-loader-core (0.0.10-1~swh1) unstable-swh; urgency=medium * v0.0.10 * d/control: Update dependency -- Antoine R. Dumont (@ardumont) Sat, 11 Jun 2016 02:26:50 +0200 swh-loader-core (0.0.9-1~swh1) unstable-swh; urgency=medium * v0.0.9 * Improve default task that initialize storage as well -- Antoine R. Dumont (@ardumont) Fri, 10 Jun 2016 15:12:14 +0200 swh-loader-core (0.0.8-1~swh1) unstable-swh; urgency=medium * v0.0.8 * Migrate specific converter to the right module * Fix dangling parameter -- Antoine R. Dumont (@ardumont) Wed, 08 Jun 2016 18:09:23 +0200 swh-loader-core (0.0.7-1~swh1) unstable-swh; urgency=medium * v0.0.7 * Fix on revision conversion -- Antoine R. Dumont (@ardumont) Wed, 08 Jun 2016 16:19:02 +0200 swh-loader-core (0.0.6-1~swh1) unstable-swh; urgency=medium * v0.0.6 * d/control: Bump dependency on swh-model * d/control: Add missing description * Keep the abstraction for all entities * Align parameter definition order * Fix missing option in DEFAULT ones * Decrease verbosity * Fix missing origin_id assignment * d/rules: Add target to run tests during packaging -- Antoine R. Dumont (@ardumont) Wed, 08 Jun 2016 16:00:40 +0200 swh-loader-core (0.0.5-1~swh1) unstable-swh; urgency=medium * v0.0.5 -- Antoine R. 
Dumont (@ardumont) Wed, 25 May 2016 12:17:06 +0200 swh-loader-core (0.0.4-1~swh1) unstable-swh; urgency=medium * v0.0.4 * Rename package from python3-swh.loader to python3-swh.loader.core -- Antoine R. Dumont (@ardumont) Wed, 25 May 2016 11:44:48 +0200 swh-loader-core (0.0.3-1~swh1) unstable-swh; urgency=medium * v0.0.3 * Improve default configuration * Rename package from swh-loader-vcs to swh-loader -- Antoine R. Dumont (@ardumont) Wed, 25 May 2016 11:23:06 +0200 swh-loader-core (0.0.2-1~swh1) unstable-swh; urgency=medium * v0.0.2 * Fix: Flush data even when no data is sent to swh-storage -- Antoine R. Dumont (@ardumont) Tue, 24 May 2016 16:41:49 +0200 swh-loader-core (0.0.1-1~swh1) unstable-swh; urgency=medium * Initial release * v0.0.1 -- Antoine R. Dumont (@ardumont) Wed, 13 Apr 2016 16:54:47 +0200 diff --git a/swh.loader.core.egg-info/PKG-INFO b/swh.loader.core.egg-info/PKG-INFO index 1462b4d..06e2a38 100644 --- a/swh.loader.core.egg-info/PKG-INFO +++ b/swh.loader.core.egg-info/PKG-INFO @@ -1,52 +1,52 @@ Metadata-Version: 2.1 Name: swh.loader.core -Version: 4.1.0 +Version: 4.2.0 Summary: Software Heritage Base Loader Home-page: https://forge.softwareheritage.org/diffusion/DLDBASE Author: Software Heritage developers Author-email: swh-devel@inria.fr Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-loader-core Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-loader-core/ Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE License-File: AUTHORS Software Heritage - 
Loader foundations ====================================== The Software Heritage Loader Core is a low-level loading utilities and helpers used by :term:`loaders `. The main entry points are classes: - :class:`swh.loader.core.loader.BaseLoader` for loaders (e.g. svn) - :class:`swh.loader.core.loader.DVCSLoader` for DVCS loaders (e.g. hg, git, ...) - :class:`swh.loader.package.loader.PackageLoader` for Package loaders (e.g. PyPI, Npm, ...) Package loaders --------------- This package also implements many package loaders directly, out of convenience, as they usually are quite similar and each fits in a single file. They all roughly follow these steps, explained in the :py:meth:`swh.loader.package.loader.PackageLoader.load` documentation. See the :ref:`package-loader-tutorial` for details. VCS loaders ----------- Unlike package loaders, VCS loaders remain in separate packages, as they often need more advanced conversions and very VCS-specific operations. This usually involves getting the branches of a repository and recursively loading revisions in the history (and directory trees in these revisions), until a known revision is found diff --git a/swh.loader.core.egg-info/SOURCES.txt b/swh.loader.core.egg-info/SOURCES.txt index eac17d5..cbd0425 100644 --- a/swh.loader.core.egg-info/SOURCES.txt +++ b/swh.loader.core.egg-info/SOURCES.txt @@ -1,288 +1,300 @@ .git-blame-ignore-revs .gitignore .pre-commit-config.yaml AUTHORS CODE_OF_CONDUCT.md CONTRIBUTORS LICENSE MANIFEST.in Makefile README.rst conftest.py mypy.ini pyproject.toml pytest.ini requirements-swh.txt requirements-test.txt requirements.txt setup.cfg setup.py tox.ini docs/.gitignore docs/Makefile docs/README.rst docs/cli.rst docs/conf.py docs/index.rst docs/package-loader-specifications.rst docs/package-loader-tutorial.rst docs/vcs-loader-overview.rst docs/_static/.placeholder docs/_templates/.placeholder swh/__init__.py swh.loader.core.egg-info/PKG-INFO swh.loader.core.egg-info/SOURCES.txt 
swh.loader.core.egg-info/dependency_links.txt swh.loader.core.egg-info/entry_points.txt swh.loader.core.egg-info/requires.txt swh.loader.core.egg-info/top_level.txt swh/loader/__init__.py swh/loader/cli.py swh/loader/exception.py swh/loader/pytest_plugin.py swh/loader/core/__init__.py swh/loader/core/converters.py swh/loader/core/loader.py swh/loader/core/metadata_fetchers.py swh/loader/core/py.typed swh/loader/core/utils.py swh/loader/core/tests/__init__.py swh/loader/core/tests/test_converters.py swh/loader/core/tests/test_loader.py swh/loader/core/tests/test_utils.py swh/loader/package/__init__.py swh/loader/package/loader.py swh/loader/package/py.typed swh/loader/package/utils.py swh/loader/package/arch/__init__.py swh/loader/package/arch/loader.py swh/loader/package/arch/tasks.py swh/loader/package/arch/tests/__init__.py swh/loader/package/arch/tests/test_arch.py swh/loader/package/arch/tests/test_tasks.py swh/loader/package/arch/tests/data/fake_arch.sh swh/loader/package/arch/tests/data/https_archive.archlinux.org/packages_d_dialog_dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz swh/loader/package/arch/tests/data/https_archive.archlinux.org/packages_d_dialog_dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst swh/loader/package/arch/tests/data/https_uk.mirror.archlinuxarm.org/aarch64_core_gzip-1.12-1-aarch64.pkg.tar.xz swh/loader/package/archive/__init__.py swh/loader/package/archive/loader.py swh/loader/package/archive/tasks.py swh/loader/package/archive/tests/__init__.py swh/loader/package/archive/tests/test_archive.py swh/loader/package/archive/tests/test_tasks.py swh/loader/package/archive/tests/data/not_gzipped_tarball.tar.gz swh/loader/package/archive/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz swh/loader/package/archive/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit1 swh/loader/package/archive/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit2 
swh/loader/package/archive/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.2.0.tar.gz swh/loader/package/aur/__init__.py swh/loader/package/aur/loader.py swh/loader/package/aur/tasks.py swh/loader/package/aur/tests/__init__.py swh/loader/package/aur/tests/test_aur.py swh/loader/package/aur/tests/test_tasks.py swh/loader/package/aur/tests/data/fake_aur.sh swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_a-fake-one.tar.gz swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_hg-evolve.tar.gz swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_ibus-git.tar.gz swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_libervia-web-hg.tar.gz swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_tealdeer-git.tar.gz swh/loader/package/cran/__init__.py swh/loader/package/cran/loader.py swh/loader/package/cran/tasks.py swh/loader/package/cran/tests/__init__.py swh/loader/package/cran/tests/test_cran.py swh/loader/package/cran/tests/test_tasks.py swh/loader/package/cran/tests/data/description/KnownBR swh/loader/package/cran/tests/data/description/acepack swh/loader/package/cran/tests/data/https_cran.r-project.org/src_contrib_1.4.0_Recommended_KernSmooth_2.22-6.tar.gz swh/loader/package/crates/__init__.py swh/loader/package/crates/loader.py swh/loader/package/crates/tasks.py swh/loader/package/crates/tests/__init__.py swh/loader/package/crates/tests/test_crates.py swh/loader/package/crates/tests/test_tasks.py swh/loader/package/crates/tests/data/fake_crates.sh swh/loader/package/crates/tests/data/https_crates.io/api_v1_crates_hg-core swh/loader/package/crates/tests/data/https_crates.io/api_v1_crates_micro-timer swh/loader/package/crates/tests/data/https_static.crates.io/crates_hg-core_hg-core-0.0.1.crate swh/loader/package/crates/tests/data/https_static.crates.io/crates_micro-timer_micro-timer-0.1.0.crate 
swh/loader/package/crates/tests/data/https_static.crates.io/crates_micro-timer_micro-timer-0.1.1.crate swh/loader/package/crates/tests/data/https_static.crates.io/crates_micro-timer_micro-timer-0.1.2.crate swh/loader/package/crates/tests/data/https_static.crates.io/crates_micro-timer_micro-timer-0.2.0.crate swh/loader/package/crates/tests/data/https_static.crates.io/crates_micro-timer_micro-timer-0.2.1.crate swh/loader/package/crates/tests/data/https_static.crates.io/crates_micro-timer_micro-timer-0.3.0.crate swh/loader/package/crates/tests/data/https_static.crates.io/crates_micro-timer_micro-timer-0.3.1.crate swh/loader/package/crates/tests/data/https_static.crates.io/crates_micro-timer_micro-timer-0.4.0.crate swh/loader/package/debian/__init__.py swh/loader/package/debian/loader.py swh/loader/package/debian/tasks.py swh/loader/package/debian/tests/__init__.py swh/loader/package/debian/tests/test_debian.py swh/loader/package/debian/tests/test_tasks.py swh/loader/package/debian/tests/data/http_deb.debian.org/debian_pool_contrib_c_cicero_cicero_0.7.2-3.diff.gz swh/loader/package/debian/tests/data/http_deb.debian.org/debian_pool_contrib_c_cicero_cicero_0.7.2-3.dsc swh/loader/package/debian/tests/data/http_deb.debian.org/debian_pool_contrib_c_cicero_cicero_0.7.2-4.diff.gz swh/loader/package/debian/tests/data/http_deb.debian.org/debian_pool_contrib_c_cicero_cicero_0.7.2-4.dsc swh/loader/package/debian/tests/data/http_deb.debian.org/debian_pool_contrib_c_cicero_cicero_0.7.2.orig.tar.gz swh/loader/package/debian/tests/data/http_deb.debian.org/onefile.txt swh/loader/package/deposit/__init__.py swh/loader/package/deposit/loader.py swh/loader/package/deposit/tasks.py swh/loader/package/deposit/tests/__init__.py swh/loader/package/deposit/tests/conftest.py swh/loader/package/deposit/tests/test_deposit.py swh/loader/package/deposit/tests/test_tasks.py swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_666_meta 
swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_666_raw swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_777_meta swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_777_raw swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_888_meta swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_888_raw swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_999_meta swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_999_raw swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello-2.10.zip swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello-2.12.tar.gz swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.10.json swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.11.json swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.12.json swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.13.json swh/loader/package/golang/__init__.py swh/loader/package/golang/loader.py swh/loader/package/golang/tasks.py swh/loader/package/golang/tests/__init__.py swh/loader/package/golang/tests/test_golang.py swh/loader/package/golang/tests/test_tasks.py swh/loader/package/golang/tests/data/https_proxy.golang.org/example.com_basic-go-module_@latest swh/loader/package/golang/tests/data/https_proxy.golang.org/example.com_basic-go-module_@v_list swh/loader/package/golang/tests/data/https_proxy.golang.org/example.com_basic-go-module_@v_v0.1.3.info swh/loader/package/golang/tests/data/https_proxy.golang.org/example.com_basic-go-module_@v_v0.1.3.zip +swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_adam-hanna_array!operations_@latest 
+swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_adam-hanna_array!operations_@v_list +swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_adam-hanna_array!operations_@v_v1.0.1.info +swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_adam-hanna_array!operations_@v_v1.0.1.zip +swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_xgdapg_daemon_@latest +swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_xgdapg_daemon_@v_list +swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_xgdapg_daemon_@v_v0.0.0-20131225113241-85981e2038bf.info +swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_xgdapg_daemon_@v_v0.0.0-20131225113241-85981e2038bf.zip swh/loader/package/maven/__init__.py swh/loader/package/maven/loader.py swh/loader/package/maven/tasks.py swh/loader/package/maven/tests/__init__.py swh/loader/package/maven/tests/test_maven.py swh/loader/package/maven/tests/test_tasks.py swh/loader/package/maven/tests/data/https_maven.org/sprova4j-0.1.0-sources.jar swh/loader/package/maven/tests/data/https_maven.org/sprova4j-0.1.0.pom swh/loader/package/maven/tests/data/https_maven.org/sprova4j-0.1.1-sources.jar swh/loader/package/maven/tests/data/https_maven.org/sprova4j-0.1.1.pom swh/loader/package/nixguix/__init__.py swh/loader/package/nixguix/loader.py swh/loader/package/nixguix/tasks.py swh/loader/package/nixguix/tests/__init__.py swh/loader/package/nixguix/tests/conftest.py swh/loader/package/nixguix/tests/test_nixguix.py swh/loader/package/nixguix/tests/test_tasks.py swh/loader/package/nixguix/tests/data/https_example.com/file.txt swh/loader/package/nixguix/tests/data/https_fail.com/truncated-archive.tgz swh/loader/package/nixguix/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz swh/loader/package/nixguix/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit1 
swh/loader/package/nixguix/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit2 swh/loader/package/nixguix/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.2.0.tar.gz swh/loader/package/nixguix/tests/data/https_github.com/owner-1_repository-1_revision-1.tgz swh/loader/package/nixguix/tests/data/https_github.com/owner-2_repository-1_revision-1.tgz swh/loader/package/nixguix/tests/data/https_github.com/owner-3_repository-1_revision-1.tgz swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources-EOFError.json swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources.json swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources.json_visit1 swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources_special.json swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources_special.json_visit1 swh/loader/package/npm/__init__.py swh/loader/package/npm/loader.py swh/loader/package/npm/tasks.py swh/loader/package/npm/tests/__init__.py swh/loader/package/npm/tests/test_npm.py swh/loader/package/npm/tests/test_tasks.py swh/loader/package/npm/tests/data/https_registry.npmjs.org/@aller_shared_-_shared-0.1.0.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/@aller_shared_-_shared-0.1.1-alpha.14.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/jammit-express_-_jammit-express-0.0.1.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/nativescript-telerik-analytics_-_nativescript-telerik-analytics-1.0.0.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.0.2.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.0.3-beta.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.0.3.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.0.4.tgz 
swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.0.5.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.1.0.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.2.0.tgz swh/loader/package/npm/tests/data/https_replicate.npmjs.com/@aller_shared swh/loader/package/npm/tests/data/https_replicate.npmjs.com/catify swh/loader/package/npm/tests/data/https_replicate.npmjs.com/jammit-express swh/loader/package/npm/tests/data/https_replicate.npmjs.com/jammit-no-time swh/loader/package/npm/tests/data/https_replicate.npmjs.com/nativescript-telerik-analytics swh/loader/package/npm/tests/data/https_replicate.npmjs.com/org swh/loader/package/npm/tests/data/https_replicate.npmjs.com/org_version_mismatch swh/loader/package/npm/tests/data/https_replicate.npmjs.com/org_visit1 swh/loader/package/opam/__init__.py swh/loader/package/opam/loader.py swh/loader/package/opam/tasks.py swh/loader/package/opam/tests/__init__.py swh/loader/package/opam/tests/test_opam.py swh/loader/package/opam/tests/test_tasks.py swh/loader/package/opam/tests/data/fake_opam_repo/_repo swh/loader/package/opam/tests/data/fake_opam_repo/version swh/loader/package/opam/tests/data/fake_opam_repo/repo/loadertest/lock swh/loader/package/opam/tests/data/fake_opam_repo/repo/loadertest/repos-config swh/loader/package/opam/tests/data/fake_opam_repo/repo/loadertest/packages/agrid/agrid.0.1/opam swh/loader/package/opam/tests/data/fake_opam_repo/repo/loadertest/packages/directories/directories.0.1/opam swh/loader/package/opam/tests/data/fake_opam_repo/repo/loadertest/packages/directories/directories.0.2/opam swh/loader/package/opam/tests/data/fake_opam_repo/repo/loadertest/packages/directories/directories.0.3/opam swh/loader/package/opam/tests/data/fake_opam_repo/repo/loadertest/packages/ocb/ocb.0.1/opam swh/loader/package/opam/tests/data/https_github.com/OCamlPro_agrid_archive_0.1.tar.gz 
swh/loader/package/opam/tests/data/https_github.com/OCamlPro_directories_archive_0.1.tar.gz swh/loader/package/opam/tests/data/https_github.com/OCamlPro_directories_archive_0.2.tar.gz swh/loader/package/opam/tests/data/https_github.com/OCamlPro_directories_archive_0.3.tar.gz swh/loader/package/opam/tests/data/https_github.com/OCamlPro_ocb_archive_0.1.tar.gz swh/loader/package/pubdev/__init__.py swh/loader/package/pubdev/loader.py swh/loader/package/pubdev/tasks.py swh/loader/package/pubdev/tests/__init__.py swh/loader/package/pubdev/tests/test_pubdev.py swh/loader/package/pubdev/tests/test_tasks.py swh/loader/package/pubdev/tests/data/fake_pubdev.sh swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_Autolinker_versions_0.1.1.tar.gz swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_authentication_versions_0.0.1.tar.gz swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_bezier_versions_1.1.5.tar.gz swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_pdf_versions_1.0.0.tar.gz swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_pdf_versions_3.8.2.tar.gz swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_Autolinker swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_abstract_io swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_audio_manager swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_authentication swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_bezier swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_pdf swh/loader/package/pypi/__init__.py swh/loader/package/pypi/loader.py swh/loader/package/pypi/tasks.py swh/loader/package/pypi/tests/__init__.py swh/loader/package/pypi/tests/test_pypi.py swh/loader/package/pypi/tests/test_tasks.py swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/0805nexter-1.1.0.tar.gz 
swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/0805nexter-1.1.0.zip swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/0805nexter-1.2.0.zip swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/0805nexter-1.3.0.zip swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/0805nexter-1.4.0.zip swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/nexter-1.1.0.tar.gz swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/nexter-1.1.0.zip swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/packages_70_97_c49fb8ec24a7aaab54c3dbfbb5a6ca1431419d9ee0f6c363d9ad01d2b8b1_0805nexter-1.3.0.zip swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/packages_86_10_c9555ec63106153aaaad753a281ff47f4ac79e980ff7f5d740d6649cd56a_upymenu-0.0.1.tar.gz swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/packages_c4_a0_4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4_0805nexter-1.2.0.zip swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/packages_c4_a0_4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4_0805nexter-1.2.0.zip_visit1 swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/packages_ec_65_c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d_0805nexter-1.1.0.zip swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/packages_ec_65_c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d_0805nexter-1.1.0.zip_visit1 swh/loader/package/pypi/tests/data/https_pypi.org/pypi_0805nexter_json swh/loader/package/pypi/tests/data/https_pypi.org/pypi_0805nexter_json_visit1 swh/loader/package/pypi/tests/data/https_pypi.org/pypi_nexter_json swh/loader/package/pypi/tests/data/https_pypi.org/pypi_upymenu_json swh/loader/package/tests/__init__.py swh/loader/package/tests/common.py swh/loader/package/tests/test_conftest.py swh/loader/package/tests/test_loader.py swh/loader/package/tests/test_loader_metadata.py 
swh/loader/package/tests/test_utils.py +swh/loader/package/tests/data/https_example.org/package_example_example-v1.0.tar.gz +swh/loader/package/tests/data/https_example.org/package_example_example-v2.0.tar.gz +swh/loader/package/tests/data/https_example.org/package_example_example-v3.0.tar.gz +swh/loader/package/tests/data/https_example.org/package_example_example-v4.0.tar.gz swh/loader/tests/__init__.py swh/loader/tests/conftest.py swh/loader/tests/py.typed swh/loader/tests/test_cli.py swh/loader/tests/test_init.py swh/loader/tests/data/0805nexter-1.1.0.tar.gz \ No newline at end of file diff --git a/swh/loader/package/crates/loader.py b/swh/loader/package/crates/loader.py index a2ebc2b..2943ae9 100644 --- a/swh/loader/package/crates/loader.py +++ b/swh/loader/package/crates/loader.py @@ -1,354 +1,354 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from distutils.version import StrictVersion import json from pathlib import Path from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple from urllib.parse import urlparse import attr import toml from typing_extensions import TypedDict from swh.loader.package.loader import BasePackageInfo, PackageLoader -from swh.loader.package.utils import api_info, cached_method, release_name +from swh.loader.package.utils import cached_method, get_url_body, release_name from swh.model.model import ObjectType, Person, Release, Sha1Git, TimestampWithTimezone from swh.storage.interface import StorageInterface class ExtrinsicPackageMetadata(TypedDict): """Data structure for package extrinsic metadata pulled from http api endpoint. 
We set only the keys we need according to what is available when querying https://crates.io/api/v1/crates/, where `name` is the name of the crate package (see JSON response example at https://crates.io/api/v1/crates/hg-core). Usage example: .. code-block:: python e_metadata = ExtrinsicPackageMetadata(**self.info()) """ # noqa categories: List[Dict[Any, Any]] """Related categories""" crate: Dict[Any, Any] """Crate project information""" keywords: List[Any] """Keywords""" versions: List[Dict[Any, Any]] """A list of released versions for a crate""" class ExtrinsicVersionPackageMetadata(TypedDict): """Data structure for specific package version extrinsic metadata, pulled from http api endpoint. Similar to `ExtrinsicPackageMetadata` in its usage, but we flatten the data related to a specific version. """ crate: str """The package name""" crate_size: int """The package size""" created_at: str """First released at""" downloads: str """Number of downloads""" license: str """Package license""" num: str """Package version""" published_by: Dict[Any, Any] """Publishers information""" updated_at: str """Last update""" yanked: bool """Is that version yanked? (yanked means release-level deprecation)""" class IntrinsicPackageMetadata(TypedDict): """Data structure for specific package version intrinsic metadata. Data is extracted from the crate package's .toml file. Then the data of the 'package' entry is flattened. Cargo.toml file content example: .. 
code-block:: toml [package] name = "hg-core" version = "0.0.1" authors = ["Georges Racinet "] description = "Mercurial pure Rust core library, with no assumption on Python bindings (FFI)" homepage = "https://mercurial-scm.org" license = "GPL-2.0-or-later" repository = "https://www.mercurial-scm.org/repo/hg" [lib] name = "hg" [dev-dependencies.rand] version = "~0.6" [dev-dependencies.rand_pcg] version = "~0.1" :param toml: toml object """ name: str """The package name""" version: str """Package version""" authors: List[str] """Authors""" description: str """Package and release description""" homepage: str """Homepage of the project""" license: str """Package license""" repository: str """Source code repository""" @attr.s class CratesPackageInfo(BasePackageInfo): name = attr.ib(type=str) """Name of the package""" version = attr.ib(type=str) """Current version""" e_metadata: Dict[str, Any] = attr.ib(factory=ExtrinsicPackageMetadata) """Extrinsic package metadata, common to all versions""" e_metadata_version: Dict[str, Any] = attr.ib( factory=ExtrinsicVersionPackageMetadata ) """Extrinsic package metadata specific to a version""" i_metadata: Dict[str, Any] = attr.ib(factory=IntrinsicPackageMetadata) """Intrinsic metadata of the current package version""" def extract_intrinsic_metadata(dir_path: Path) -> Dict[str, Any]: """Extract intrinsic metadata from Cargo.toml file at dir_path. Each crate archive has a Cargo.toml at the root of the archive. Args: dir_path: A directory on disk where a Cargo.toml must be present Returns: A dict mapping from toml parser """ return toml.load(dir_path / "Cargo.toml") def extract_author(p_info: CratesPackageInfo) -> Person: """Extract package author from intrinsic metadata and return it as a `Person` model. Args: p_info: CratesPackageInfo that should contains i_metadata entries Returns: Only one author (Person) of the package. Currently limited by internal detail of the swh stack (see T3887). 
""" authors = p_info.i_metadata["authors"] fullname = authors[0] # TODO: here we have a list of author, see T3887 return Person.from_fullname(fullname.encode()) def extract_description(p_info: CratesPackageInfo) -> str: """Extract package description from intrinsic metadata and return it as a string. Args: p_info: CratesPackageInfo that should contains i_metadata and entries Returns: Package description from metadata. """ return p_info.i_metadata["description"] class CratesLoader(PackageLoader[CratesPackageInfo]): """Load Crates package origins into swh archive.""" visit_type = "crates" def __init__( self, storage: StorageInterface, url: str, artifacts: List[Dict[str, Any]], **kwargs, ): """Constructor Args: url: Origin url, (e.g. https://crates.io/api/v1/crates/) artifacts: A list of dict listing all existing released versions for a package (Usually set with crates lister `extra_loader_arguments`). Each line is a dict that should have an `url` (where to download package specific version) and a `version` entry. Example:: [ { "version": , "url": "https://static.crates.io/crates//-.crate", } ] """ # noqa super().__init__(storage=storage, url=url, **kwargs) self.url = url self.artifacts: Dict[str, Dict] = { artifact["version"]: artifact for artifact in artifacts } @cached_method def _raw_info(self) -> bytes: """Get crate metadata (fetched from http api endpoint set as self.url) Returns: Content response as bytes. Content response is a json document. 
""" - return api_info(self.url) + return get_url_body(self.url) @cached_method def info(self) -> Dict: """Parse http api json response and return the crate metadata information as a Dict.""" return json.loads(self._raw_info()) def get_versions(self) -> Sequence[str]: """Get all released versions of a crate Returns: A sequence of versions Example:: ["0.1.1", "0.10.2"] """ versions = list(self.artifacts.keys()) versions.sort(key=StrictVersion) return versions def get_default_version(self) -> str: """Get the newest release version of a crate Returns: A string representing a version Example:: "0.1.2" """ return self.get_versions()[-1] def get_package_info(self, version: str) -> Iterator[Tuple[str, CratesPackageInfo]]: """Get release name and package information from version Args: version: crate version (e.g: "0.1.0") Returns: Iterator of tuple (release_name, p_info) """ artifact = self.artifacts[version] filename = artifact["filename"] package_name = urlparse(self.url).path.split("/")[-1] url = artifact["url"] # Get extrinsic metadata from http api e_metadata = ExtrinsicPackageMetadata(**self.info()) # type: ignore[misc] # Extract crate info for current version (One .crate file for a given version) (crate_version,) = [ crate for crate in e_metadata["versions"] if crate["num"] == version ] e_metadata_version = ExtrinsicVersionPackageMetadata( # type: ignore[misc] **crate_version ) p_info = CratesPackageInfo( name=package_name, filename=filename, url=url, version=version, e_metadata=e_metadata, e_metadata_version=e_metadata_version, ) yield release_name(version, filename), p_info def build_release( self, p_info: CratesPackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Release]: # Extract intrinsic metadata from dir_path/Cargo.toml name = p_info.name version = p_info.version dir_path = Path(uncompressed_path, f"{name}-{version}") i_metadata_raw = extract_intrinsic_metadata(dir_path) # Get only corresponding key of IntrinsicPackageMetadata 
i_metadata_keys = [k for k in IntrinsicPackageMetadata.__annotations__.keys()] # We use data only from "package" entry i_metadata = { k: v for k, v in i_metadata_raw["package"].items() if k in i_metadata_keys } p_info.i_metadata = IntrinsicPackageMetadata(**i_metadata) # type: ignore[misc] author = extract_author(p_info) description = extract_description(p_info) message = ( f"Synthetic release for Crate source package {p_info.name} " f"version {p_info.version}\n\n" f"{description}\n" ) # The only way to get a value for updated_at is through extrinsic metadata updated_at = p_info.e_metadata_version.get("updated_at") return Release( name=version.encode(), author=author, date=TimestampWithTimezone.from_iso8601(updated_at), message=message.encode(), target_type=ObjectType.DIRECTORY, target=directory, synthetic=True, ) diff --git a/swh/loader/package/golang/loader.py b/swh/loader/package/golang/loader.py index 9caff6a..0bc68a4 100644 --- a/swh/loader/package/golang/loader.py +++ b/swh/loader/package/golang/loader.py @@ -1,91 +1,109 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import logging +import re from typing import Iterator, Optional, Sequence, Tuple import attr from swh.loader.package.loader import BasePackageInfo, PackageLoader -from swh.loader.package.utils import EMPTY_AUTHOR, api_info, release_name +from swh.loader.package.utils import ( + EMPTY_AUTHOR, + get_url_body, + release_name, + cached_method, +) from swh.model.model import ObjectType, Release, Sha1Git, TimestampWithTimezone from swh.storage.interface import StorageInterface logger = logging.getLogger(__name__) +def _uppercase_encode(url: str) -> str: + return re.sub("([A-Z]{1})", r"!\1", url).lower() + + @attr.s class GolangPackageInfo(BasePackageInfo): name = attr.ib(type=str) timestamp = 
attr.ib(type=Optional[TimestampWithTimezone]) class GolangLoader(PackageLoader[GolangPackageInfo]): """Load Golang module zip file into SWH archive.""" visit_type = "golang" GOLANG_PKG_DEV_URL = "https://pkg.go.dev" GOLANG_PROXY_URL = "https://proxy.golang.org" def __init__( self, storage: StorageInterface, url: str, max_content_size: Optional[int] = None, **kwargs, ): super().__init__(storage, url, max_content_size=max_content_size, **kwargs) # The lister saves human-usable URLs, so we translate them to proxy URLs # for use in the loader. # This URL format is detailed in https://go.dev/ref/mod#goproxy-protocol assert url.startswith( self.GOLANG_PKG_DEV_URL ), "Go package URL (%s) not from %s" % (url, self.GOLANG_PKG_DEV_URL) self.name = url[len(self.GOLANG_PKG_DEV_URL) + 1 :] self.url = url.replace(self.GOLANG_PKG_DEV_URL, self.GOLANG_PROXY_URL) + self.url = _uppercase_encode(self.url) def get_versions(self) -> Sequence[str]: - return api_info(f"{self.url}/@v/list").decode().splitlines() - + versions = get_url_body(f"{self.url}/@v/list").decode().splitlines() + # some go packages only have a development version not listed by the endpoint above, + # so ensure to return it or it will be missed by the golang loader + default_version = self.get_default_version() + if default_version not in versions: + versions.append(default_version) + return versions + + @cached_method def get_default_version(self) -> str: - latest = api_info(f"{self.url}/@latest") + latest = get_url_body(f"{self.url}/@latest") return json.loads(latest)["Version"] def _raw_info(self, version: str) -> dict: - url = f"{self.url}/@v/{version}.info" - return json.loads(api_info(url)) + url = f"{self.url}/@v/{_uppercase_encode(version)}.info" + return json.loads(get_url_body(url)) def get_package_info(self, version: str) -> Iterator[Tuple[str, GolangPackageInfo]]: # Encode the name because creating nested folders can become problematic encoded_name = self.name.replace("/", "__") filename = 
f"{encoded_name}-{version}.zip" timestamp = TimestampWithTimezone.from_iso8601(self._raw_info(version)["Time"]) p_info = GolangPackageInfo( url=f"{self.url}/@v/{version}.zip", filename=filename, version=version, timestamp=timestamp, name=self.name, ) yield release_name(version), p_info def build_release( self, p_info: GolangPackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Release]: msg = ( f"Synthetic release for Golang source package {p_info.name} " f"version {p_info.version}\n" ) return Release( name=p_info.version.encode(), message=msg.encode(), date=p_info.timestamp, author=EMPTY_AUTHOR, # Go modules offer very little metadata target_type=ObjectType.DIRECTORY, target=directory, synthetic=True, ) diff --git a/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_adam-hanna_array!operations_@latest b/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_adam-hanna_array!operations_@latest new file mode 100644 index 0000000..cc87e10 --- /dev/null +++ b/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_adam-hanna_array!operations_@latest @@ -0,0 +1 @@ +{"Version":"v1.0.1","Time":"2022-03-23T18:02:43Z"} \ No newline at end of file diff --git a/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_adam-hanna_array!operations_@v_list b/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_adam-hanna_array!operations_@v_list new file mode 100644 index 0000000..b18d465 --- /dev/null +++ b/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_adam-hanna_array!operations_@v_list @@ -0,0 +1 @@ +v1.0.1 diff --git a/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_adam-hanna_array!operations_@v_v1.0.1.info b/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_adam-hanna_array!operations_@v_v1.0.1.info new file mode 100644 index 0000000..cc87e10 --- /dev/null +++ 
b/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_adam-hanna_array!operations_@v_v1.0.1.info @@ -0,0 +1 @@ +{"Version":"v1.0.1","Time":"2022-03-23T18:02:43Z"} \ No newline at end of file diff --git a/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_adam-hanna_array!operations_@v_v1.0.1.zip b/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_adam-hanna_array!operations_@v_v1.0.1.zip new file mode 100644 index 0000000..8fe5583 Binary files /dev/null and b/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_adam-hanna_array!operations_@v_v1.0.1.zip differ diff --git a/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_xgdapg_daemon_@latest b/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_xgdapg_daemon_@latest new file mode 100644 index 0000000..bcf634e --- /dev/null +++ b/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_xgdapg_daemon_@latest @@ -0,0 +1 @@ +{"Version":"v0.0.0-20131225113241-85981e2038bf","Time":"2013-12-25T11:32:41Z"} \ No newline at end of file diff --git a/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_xgdapg_daemon_@v_list b/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_xgdapg_daemon_@v_list new file mode 100644 index 0000000..e69de29 diff --git a/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_xgdapg_daemon_@v_v0.0.0-20131225113241-85981e2038bf.info b/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_xgdapg_daemon_@v_v0.0.0-20131225113241-85981e2038bf.info new file mode 100644 index 0000000..bcf634e --- /dev/null +++ b/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_xgdapg_daemon_@v_v0.0.0-20131225113241-85981e2038bf.info @@ -0,0 +1 @@ +{"Version":"v0.0.0-20131225113241-85981e2038bf","Time":"2013-12-25T11:32:41Z"} \ No newline at end of file diff --git 
a/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_xgdapg_daemon_@v_v0.0.0-20131225113241-85981e2038bf.zip b/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_xgdapg_daemon_@v_v0.0.0-20131225113241-85981e2038bf.zip new file mode 100644 index 0000000..f37b383 Binary files /dev/null and b/swh/loader/package/golang/tests/data/https_proxy.golang.org/github.com_xgdapg_daemon_@v_v0.0.0-20131225113241-85981e2038bf.zip differ diff --git a/swh/loader/package/golang/tests/test_golang.py b/swh/loader/package/golang/tests/test_golang.py index 63bde1b..5888d9b 100644 --- a/swh/loader/package/golang/tests/test_golang.py +++ b/swh/loader/package/golang/tests/test_golang.py @@ -1,13 +1,31 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.loader.package.golang.loader import GolangLoader def test_golang_loader_first_visit(swh_storage, requests_mock_datadir): url = "https://pkg.go.dev/example.com/basic-go-module" loader = GolangLoader(swh_storage, url) assert loader.load()["status"] == "eventful" + + +def test_golang_loader_package_name_with_uppercase_characters( + swh_storage, requests_mock_datadir +): + url = "https://pkg.go.dev/github.com/adam-hanna/arrayOperations" + loader = GolangLoader(swh_storage, url) + + assert loader.load()["status"] == "eventful" + + +def test_golang_loader_package_with_dev_version_only( + swh_storage, requests_mock_datadir +): + url = "https://pkg.go.dev/github.com/xgdapg/daemon" + loader = GolangLoader(swh_storage, url) + + assert loader.load()["status"] == "eventful" diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py index 96ff69e..4cc6430 100644 --- a/swh/loader/package/loader.py +++ b/swh/loader/package/loader.py @@ -1,1088 +1,1111 @@ # Copyright (C) 2019-2021 The Software Heritage 
developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import hashlib from itertools import islice import json import logging import os import string import sys import tempfile from typing import ( Any, Dict, Generic, Iterator, List, Mapping, Optional, Sequence, Set, Tuple, TypeVar, ) import attr from requests.exceptions import ContentDecodingError import sentry_sdk from swh.core.tarball import uncompress from swh.loader.core.loader import BaseLoader from swh.loader.exception import NotFound from swh.loader.package.utils import download from swh.model import from_disk from swh.model.hashutil import hash_to_hex from swh.model.model import ( ExtID, MetadataAuthority, MetadataAuthorityType, MetadataFetcher, ) from swh.model.model import ( Origin, OriginVisit, OriginVisitStatus, RawExtrinsicMetadata, Release, Revision, Sha1Git, Snapshot, ) from swh.model.model import ObjectType as ModelObjectType from swh.model.swhids import CoreSWHID, ExtendedObjectType, ExtendedSWHID, ObjectType from swh.storage.algos.snapshot import snapshot_get_latest from swh.storage.interface import StorageInterface from swh.storage.utils import now logger = logging.getLogger(__name__) SWH_METADATA_AUTHORITY = MetadataAuthority( type=MetadataAuthorityType.REGISTRY, url="https://softwareheritage.org/", metadata={}, ) """Metadata authority for extrinsic metadata generated by Software Heritage. Used for metadata on "original artifacts", ie. 
length, filename, and checksums of downloaded archive files.""" PartialExtID = Tuple[str, int, bytes] """The ``extid_type`` and ``extid`` fields of an :class:`ExtID` object.""" @attr.s class RawExtrinsicMetadataCore: """Contains the core of the metadata extracted by a loader, that will be used to build a full RawExtrinsicMetadata object by adding object identifier, context, and provenance information.""" format = attr.ib(type=str) metadata = attr.ib(type=bytes) discovery_date = attr.ib(type=Optional[datetime.datetime], default=None) """Defaults to the visit date.""" @attr.s class BasePackageInfo: """Compute the primary key for a dict using the id_keys as primary key composite. Args: d: A dict entry to compute the primary key on id_keys: Sequence of keys to use as primary key Returns: The identity for that dict entry """ url = attr.ib(type=str) filename = attr.ib(type=Optional[str]) version = attr.ib(type=str) """Version name/number.""" MANIFEST_FORMAT: Optional[string.Template] = None """If not None, used by the default extid() implementation to format a manifest, before hashing it to produce an ExtID.""" EXTID_TYPE: str = "package-manifest-sha256" EXTID_VERSION: int = 0 # The following attribute has kw_only=True in order to allow subclasses # to add attributes. Without kw_only, attributes without default values cannot # go after attributes with default values. 
# See directory_extrinsic_metadata = attr.ib( type=List[RawExtrinsicMetadataCore], default=[], kw_only=True, ) """:term:`extrinsic metadata` collected by the loader, that will be attached to the loaded directory and added to the Metadata storage.""" # TODO: add support for metadata for releases and contents def extid(self) -> Optional[PartialExtID]: """Returns a unique intrinsic identifier of this package info, or None if this package info is not 'deduplicatable' (meaning that we will always load it, instead of checking the ExtID storage to see if we already did)""" if self.MANIFEST_FORMAT is None: return None else: manifest = self.MANIFEST_FORMAT.substitute( {k: str(v) for (k, v) in attr.asdict(self).items()} ) return ( self.EXTID_TYPE, self.EXTID_VERSION, hashlib.sha256(manifest.encode()).digest(), ) TPackageInfo = TypeVar("TPackageInfo", bound=BasePackageInfo) class PackageLoader(BaseLoader, Generic[TPackageInfo]): def __init__(self, storage: StorageInterface, url: str, **kwargs: Any): """Loader's constructor. This raises exception if the minimal required configuration is missing (cf. fn:`check` method). Args: storage: Storage instance url: Origin url to load data from """ super().__init__(storage=storage, origin_url=url, **kwargs) + self.status_load = "" + self.status_visit = "" + + def load_status(self) -> Dict[str, str]: + """Detailed loading status.""" + return { + "status": self.status_load, + } + + def visit_status(self) -> str: + """Detailed visit status.""" + return self.status_visit def get_versions(self) -> Sequence[str]: """Return the list of all published package versions. Raises: class:`swh.loader.exception.NotFound` error when failing to read the published package versions. Returns: Sequence of published versions """ return [] def get_package_info(self, version: str) -> Iterator[Tuple[str, TPackageInfo]]: """Given a release version of a package, retrieve the associated package information for such version. 
Args: version: Package version Returns: (branch name, package metadata) """ yield from {} def build_release( self, p_info: TPackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Release]: """Build the release from the archive metadata (extrinsic artifact metadata) and the intrinsic metadata. Args: p_info: Package information uncompressed_path: Artifact uncompressed path on disk """ raise NotImplementedError("build_release") def get_default_version(self) -> str: """Retrieve the latest release version if any. Returns: Latest version """ return "" def last_snapshot(self) -> Optional[Snapshot]: """Retrieve the last snapshot out of the last visit.""" return snapshot_get_latest(self.storage, self.origin.url) def new_packageinfo_to_extid(self, p_info: TPackageInfo) -> Optional[PartialExtID]: return p_info.extid() def _get_known_extids( self, packages_info: List[TPackageInfo] ) -> Dict[PartialExtID, List[CoreSWHID]]: """Compute the ExtIDs from new PackageInfo objects, searches which are already loaded in the archive, and returns them if any.""" # Compute the ExtIDs of all the new packages, grouped by extid type new_extids: Dict[Tuple[str, int], List[bytes]] = {} for p_info in packages_info: res = p_info.extid() if res is not None: (extid_type, extid_version, extid_extid) = res new_extids.setdefault((extid_type, extid_version), []).append( extid_extid ) # For each extid type, call extid_get_from_extid() with all the extids of # that type, and store them in the '(type, extid) -> target' map. 
known_extids: Dict[PartialExtID, List[CoreSWHID]] = {} for ((extid_type, extid_version), extids) in new_extids.items(): for extid in self.storage.extid_get_from_extid( extid_type, extids, version=extid_version ): if extid is not None: key = (extid.extid_type, extid_version, extid.extid) known_extids.setdefault(key, []).append(extid.target) return known_extids def resolve_object_from_extids( self, known_extids: Dict[PartialExtID, List[CoreSWHID]], p_info: TPackageInfo, whitelist: Set[Sha1Git], ) -> Optional[CoreSWHID]: """Resolve the revision/release from known ExtIDs and a package info object. If the artifact has already been downloaded, this will return the existing release (or revision) targeting that uncompressed artifact directory. Otherwise, this returns None. Args: known_extids: Dict built from a list of ExtID, with the target as value p_info: Package information whitelist: Any ExtID with target not in this set is filtered out Returns: None or release/revision SWHID """ new_extid = p_info.extid() if new_extid is None: return None extid_targets = set() for extid_target in known_extids.get(new_extid, []): if extid_target.object_id not in whitelist: # There is a known ExtID for this package, but its target is not # in the snapshot. # This can happen for three reasons: # # 1. a loader crashed after writing the ExtID, but before writing # the snapshot # 2. some other loader loaded the same artifact, but produced # a different revision, causing an additional ExtID object # to be written. We will probably find this loader's ExtID # in a future iteration of this loop. # Note that for now, this is impossible, as each loader has a # completely different extid_type, but this is an implementation # detail of each loader. # 3. we took a snapshot, then the package disappeared, # then we took another snapshot, and the package reappeared # # In case of 1, we must actually load the package now, # so let's do it. 
# TODO: detect when we are in case 3 using release_missing # or revision_missing instead of the snapshot. continue elif extid_target.object_type in (ObjectType.RELEASE, ObjectType.REVISION): extid_targets.add(extid_target) else: # Note that this case should never be reached unless there is a # collision between a revision hash and some non-revision object's # hash, but better safe than sorry. logger.warning( "%s is in the whitelist, but is not a revision/release.", hash_to_hex(extid_target.object_type), ) if extid_targets: # This is a known package version, as we have an extid to reference it. # Let's return one of them. # If there is a release extid, return it. release_extid_targets = { extid_target for extid_target in extid_targets if extid_target.object_type == ObjectType.RELEASE } # Exclude missing targets missing_releases = { CoreSWHID(object_type=ObjectType.RELEASE, object_id=id_) for id_ in self.storage.release_missing( [swhid.object_id for swhid in release_extid_targets] ) } if missing_releases: err_message = "Found ExtIDs pointing to missing releases" logger.error(err_message + ": %s", missing_releases) with sentry_sdk.push_scope() as scope: scope.set_extra( "missing_releases", [str(x) for x in missing_releases] ) sentry_sdk.capture_message(err_message, "error") release_extid_targets -= missing_releases extid_target2 = self.select_extid_target(p_info, release_extid_targets) if extid_target2: return extid_target2 # If there is no release extid (ie. if the package was only loaded with # older versions of this loader, which produced revision objects instead # of releases), return a revision extid when possible. 
revision_extid_targets = { extid_target for extid_target in extid_targets if extid_target.object_type == ObjectType.REVISION } if revision_extid_targets: assert len(extid_targets) == 1, extid_targets extid_target = list(extid_targets)[0] return extid_target # No target found (this is probably a new package version) return None def select_extid_target( self, p_info: TPackageInfo, extid_targets: Set[CoreSWHID] ) -> Optional[CoreSWHID]: """Given a list of release extid targets, choses one appropriate for the given package info. Package loaders shyould implement this if their ExtIDs may map to multiple releases, so they can fetch releases from the storage and inspect their fields to select the right one for this ``p_info``. """ if extid_targets: # The base package loader does not have the domain-specific knowledge # to select the right release -> crash if there is more than one. assert len(extid_targets) == 1, extid_targets return list(extid_targets)[0] return None def download_package( self, p_info: TPackageInfo, tmpdir: str ) -> List[Tuple[str, Mapping]]: """Download artifacts for a specific package. All downloads happen in in the tmpdir folder. Default implementation expects the artifacts package info to be about one artifact per package. Note that most implementation have 1 artifact per package. But some implementation have multiple artifacts per package (debian), some have none, the package is the artifact (gnu). Args: artifacts_package_info: Information on the package artifacts to download (url, filename, etc...) 
tmpdir: Location to retrieve such artifacts Returns: List of (path, computed hashes) """ try: return [download(p_info.url, dest=tmpdir, filename=p_info.filename)] except ContentDecodingError: # package might be erroneously marked as gzip compressed while is is not, # try to download its raw bytes again without attempting to uncompress # the input stream return [ download( p_info.url, dest=tmpdir, filename=p_info.filename, extra_request_headers={"Accept-Encoding": "identity"}, ) ] def uncompress( self, dl_artifacts: List[Tuple[str, Mapping[str, Any]]], dest: str ) -> str: """Uncompress the artifact(s) in the destination folder dest. Optionally, this could need to use the p_info dict for some more information (debian). """ uncompressed_path = os.path.join(dest, "src") for a_path, _ in dl_artifacts: uncompress(a_path, dest=uncompressed_path) return uncompressed_path def extra_branches(self) -> Dict[bytes, Mapping[str, Any]]: """Return an extra dict of branches that are used to update the set of branches. 
""" return {} def finalize_visit( self, *, snapshot: Optional[Snapshot], visit: OriginVisit, status_visit: str, status_load: str, failed_branches: List[str], errors: Optional[List[str]] = None, ) -> Dict[str, Any]: """Finalize the visit: - flush eventual unflushed data to storage - update origin visit's status - return the task's status """ + self.status_load = status_load + self.status_visit = status_visit self.storage.flush() snapshot_id: Optional[bytes] = None if snapshot and snapshot.id: # to prevent the snapshot.id to b"" snapshot_id = snapshot.id assert visit.visit visit_status = OriginVisitStatus( origin=self.origin.url, visit=visit.visit, type=self.visit_type, date=now(), status=status_visit, snapshot=snapshot_id, ) self.storage.origin_visit_status_add([visit_status]) result: Dict[str, Any] = { "status": status_load, } if snapshot_id: result["snapshot_id"] = hash_to_hex(snapshot_id) if failed_branches: logger.warning("%d failed branches", len(failed_branches)) for i, urls in enumerate(islice(failed_branches, 50)): prefix_url = "Failed branches: " if i == 0 else "" logger.warning("%s%s", prefix_url, urls) return result def load(self) -> Dict: """Load for a specific origin the associated contents. 1. Get the list of versions in an origin. 2. Get the snapshot from the previous run of the loader, and filter out versions that were already loaded, if their :term:`extids ` match Then, for each remaining version in the origin 3. Fetch the files for one package version By default, this can be implemented as a simple HTTP request. Loaders with more specific requirements can override this, e.g.: the PyPI loader checks the integrity of the downloaded files; the Debian loader has to download and check several files for one package version. 4. Extract the downloaded files. By default, this would be a universal archive/tarball extraction. Loaders for specific formats can override this method (for instance, the Debian loader uses dpkg-source -x). 5. 
Convert the extracted directory to a set of Software Heritage objects Using swh.model.from_disk. 6. Extract the metadata from the unpacked directories This would only be applicable for "smart" loaders like npm (parsing the package.json), PyPI (parsing the PKG-INFO file) or Debian (parsing debian/changelog and debian/control). On "minimal-metadata" sources such as the GNU archive, the lister should provide the minimal set of metadata needed to populate the revision/release objects (authors, dates) as an argument to the task. 7. Generate the revision/release objects for the given version. From the data generated at steps 3 and 4. end for each 8. Generate and load the snapshot for the visit Using the revisions/releases collected at step 7., and the branch information from step 2., generate a snapshot and load it into the Software Heritage archive """ - status_load = "uneventful" # either: eventful, uneventful, failed - status_visit = "full" # see swh.model.model.OriginVisitStatus + self.status_load = "uneventful" # either: eventful, uneventful, failed + self.status_visit = "full" # see swh.model.model.OriginVisitStatus snapshot = None failed_branches: List[str] = [] # Prepare origin and origin_visit origin = Origin(url=self.origin.url) try: self.storage.origin_add([origin]) visit = list( self.storage.origin_visit_add( [ OriginVisit( origin=self.origin.url, date=self.visit_date, type=self.visit_type, ) ] ) )[0] except Exception as e: logger.exception( "Failed to initialize origin_visit for %s", self.origin.url ) sentry_sdk.capture_exception(e) + self.status_load = self.status_visit = "failed" return {"status": "failed"} # Get the previous snapshot for this origin. It is then used to see which # of the package's versions are already loaded in the archive. 
try: last_snapshot = self.last_snapshot() logger.debug("last snapshot: %s", last_snapshot) except Exception as e: logger.exception("Failed to get previous state for %s", self.origin.url) sentry_sdk.capture_exception(e) return self.finalize_visit( snapshot=snapshot, visit=visit, failed_branches=failed_branches, status_visit="failed", status_load="failed", errors=[str(e)], ) load_exceptions: List[Exception] = [] # Get the list of all version names try: versions = self.get_versions() except NotFound as e: return self.finalize_visit( snapshot=snapshot, visit=visit, failed_branches=failed_branches, status_visit="not_found", status_load="failed", errors=[str(e)], ) except Exception as e: logger.exception("Failed to get list of versions for %s", self.origin.url) sentry_sdk.capture_exception(e) return self.finalize_visit( snapshot=snapshot, visit=visit, failed_branches=failed_branches, status_visit="failed", status_load="failed", errors=[str(e)], ) + errors = [] + # Get the metadata of each version's package - packages_info: List[Tuple[str, TPackageInfo]] = [ - (branch_name, p_info) - for version in versions - for (branch_name, p_info) in self.get_package_info(version) - ] + packages_info: List[Tuple[str, TPackageInfo]] = [] + for version in versions: + try: + for branch_name, p_info in self.get_package_info(version): + packages_info.append((branch_name, p_info)) + except Exception as e: + load_exceptions.append(e) + sentry_sdk.capture_exception(e) + error = f"Failed to get package info for version {version} of {self.origin.url}" + logger.exception(error) + errors.append(f"{error}: {e}") # Compute the ExtID of each of these packages known_extids = self._get_known_extids([p_info for (_, p_info) in packages_info]) if last_snapshot is None: last_snapshot_targets: Set[Sha1Git] = set() else: last_snapshot_targets = { branch.target for branch in last_snapshot.branches.values() } new_extids: Set[ExtID] = set() tmp_releases: Dict[str, List[Tuple[str, Sha1Git]]] = { version: [] for 
version in versions } - errors = [] + for (branch_name, p_info) in packages_info: logger.debug("package_info: %s", p_info) # Check if the package was already loaded, using its ExtID swhid = self.resolve_object_from_extids( known_extids, p_info, last_snapshot_targets ) if swhid is not None and swhid.object_type == ObjectType.REVISION: # This package was already loaded, but by an older version # of this loader, which produced revisions instead of releases. # Let's fetch the revision's data, and "upgrade" it into a release. (rev,) = self.storage.revision_get([swhid.object_id]) if not rev: logger.error( "Failed to upgrade branch %s from revision to " "release, %s is missing from the storage. " "Falling back to re-loading from the origin.", branch_name, swhid, ) else: rev = None if swhid is None or (swhid.object_type == ObjectType.REVISION and not rev): # No matching revision or release found in the last snapshot, load it. release_id = None try: res = self._load_release(p_info, origin) if res: (release_id, directory_id) = res assert release_id assert directory_id self._load_extrinsic_directory_metadata( p_info, release_id, directory_id ) self.storage.flush() - status_load = "eventful" + self.status_load = "eventful" except Exception as e: self.storage.clear_buffers() load_exceptions.append(e) sentry_sdk.capture_exception(e) error = f"Failed to load branch {branch_name} for {self.origin.url}" logger.exception(error) failed_branches.append(branch_name) errors.append(f"{error}: {e}") continue if release_id is None: continue add_extid = True elif swhid.object_type == ObjectType.REVISION: # If 'rev' was None, the previous block would have run. 
assert rev is not None rel = rev2rel(rev, p_info.version) self.storage.release_add([rel]) logger.debug("Upgraded %s to %s", swhid, rel.swhid()) release_id = rel.id # Create a new extid for this package, so the next run of this loader # will be able to find the new release, and use it (instead of the # old revision) add_extid = True elif swhid.object_type == ObjectType.RELEASE: # This package was already loaded, nothing to do. release_id = swhid.object_id add_extid = False else: assert False, f"Unexpected object type: {swhid}" assert release_id is not None if add_extid: partial_extid = p_info.extid() if partial_extid is not None: (extid_type, extid_version, extid) = partial_extid release_swhid = CoreSWHID( object_type=ObjectType.RELEASE, object_id=release_id ) new_extids.add( ExtID( extid_type=extid_type, extid_version=extid_version, extid=extid, target=release_swhid, ) ) tmp_releases[p_info.version].append((branch_name, release_id)) if load_exceptions: - status_visit = "partial" + self.status_visit = "partial" if not tmp_releases: # We could not load any releases; fail completely logger.error("Failed to load any release for %s", self.origin.url) return self.finalize_visit( snapshot=snapshot, visit=visit, failed_branches=failed_branches, status_visit="failed", status_load="failed", errors=errors, ) try: # Retrieve the default release version (the "latest" one) default_version = self.get_default_version() logger.debug("default version: %s", default_version) # Retrieve extra branches extra_branches = self.extra_branches() logger.debug("extra branches: %s", extra_branches) snapshot = self._load_snapshot( default_version, tmp_releases, extra_branches ) self.storage.flush() except Exception as e: error = f"Failed to build snapshot for origin {self.origin.url}" logger.exception(error) errors.append(f"{error}: {e}") sentry_sdk.capture_exception(e) - status_visit = "failed" - status_load = "failed" + self.status_visit = "failed" + self.status_load = "failed" if snapshot: 
try: metadata_objects = self.build_extrinsic_snapshot_metadata(snapshot.id) self.load_metadata_objects(metadata_objects) except Exception as e: error = ( f"Failed to load extrinsic snapshot metadata for {self.origin.url}" ) logger.exception(error) errors.append(f"{error}: {e}") sentry_sdk.capture_exception(e) - status_visit = "partial" - status_load = "failed" + self.status_visit = "partial" + self.status_load = "failed" try: metadata_objects = self.build_extrinsic_origin_metadata() self.load_metadata_objects(metadata_objects) except Exception as e: error = f"Failed to load extrinsic origin metadata for {self.origin.url}" logger.exception(error) errors.append(f"{error}: {e}") sentry_sdk.capture_exception(e) - status_visit = "partial" - status_load = "failed" + self.status_visit = "partial" + self.status_load = "failed" - if status_load != "failed": + if self.status_load != "failed": self._load_extids(new_extids) return self.finalize_visit( snapshot=snapshot, visit=visit, failed_branches=failed_branches, - status_visit=status_visit, - status_load=status_load, + status_visit=self.status_visit, + status_load=self.status_load, errors=errors, ) def _load_directory( self, dl_artifacts: List[Tuple[str, Mapping[str, Any]]], tmpdir: str ) -> Tuple[str, from_disk.Directory]: uncompressed_path = self.uncompress(dl_artifacts, dest=tmpdir) logger.debug("uncompressed_path: %s", uncompressed_path) directory = from_disk.Directory.from_disk( path=uncompressed_path.encode("utf-8"), max_content_length=self.max_content_size, ) contents, skipped_contents, directories = from_disk.iter_directory(directory) logger.debug("Number of skipped contents: %s", len(skipped_contents)) self.storage.skipped_content_add(skipped_contents) logger.debug("Number of contents: %s", len(contents)) self.storage.content_add(contents) logger.debug("Number of directories: %s", len(directories)) self.storage.directory_add(directories) return (uncompressed_path, directory) def _load_release( self, p_info: 
TPackageInfo, origin ) -> Optional[Tuple[Sha1Git, Sha1Git]]: """Does all the loading of a release itself: * downloads a package and uncompresses it * loads it from disk * adds contents, directories, and release to self.storage * returns (release_id, directory_id) Raises exception when unable to download or uncompress artifacts """ with tempfile.TemporaryDirectory() as tmpdir: dl_artifacts = self.download_package(p_info, tmpdir) (uncompressed_path, directory) = self._load_directory(dl_artifacts, tmpdir) # FIXME: This should be release. cf. D409 release = self.build_release( p_info, uncompressed_path, directory=directory.hash ) if not release: # Some artifacts are missing intrinsic metadata # skipping those return None metadata = [metadata for (filepath, metadata) in dl_artifacts] assert release.target is not None, release assert release.target_type == ModelObjectType.DIRECTORY, release metadata_target = ExtendedSWHID( object_type=ExtendedObjectType.DIRECTORY, object_id=release.target ) original_artifact_metadata = RawExtrinsicMetadata( target=metadata_target, discovery_date=self.visit_date, authority=SWH_METADATA_AUTHORITY, fetcher=self.get_metadata_fetcher(), format="original-artifacts-json", metadata=json.dumps(metadata).encode(), origin=self.origin.url, release=release.swhid(), ) self.load_metadata_objects([original_artifact_metadata]) logger.debug("Release: %s", release) self.storage.release_add([release]) assert directory.hash return (release.id, directory.hash) def _load_snapshot( self, default_version: str, releases: Dict[str, List[Tuple[str, bytes]]], extra_branches: Dict[bytes, Mapping[str, Any]], ) -> Optional[Snapshot]: """Build snapshot out of the current releases stored and extra branches. Then load it in the storage. 
""" logger.debug("releases: %s", releases) # Build and load the snapshot branches = {} # type: Dict[bytes, Mapping[str, Any]] for version, branch_name_releases in releases.items(): if version == default_version and len(branch_name_releases) == 1: # only 1 branch (no ambiguity), we can create an alias # branch 'HEAD' branch_name, _ = branch_name_releases[0] # except for some corner case (deposit) if branch_name != "HEAD": branches[b"HEAD"] = { "target_type": "alias", "target": branch_name.encode("utf-8"), } for branch_name, target in branch_name_releases: branches[branch_name.encode("utf-8")] = { "target_type": "release", "target": target, } # Deal with extra-branches for name, branch_target in extra_branches.items(): if name in branches: error_message = f"Extra branch '{name!r}' has been ignored" logger.error(error_message) sentry_sdk.capture_message(error_message, "error") else: branches[name] = branch_target snapshot_data = {"branches": branches} logger.debug("snapshot: %s", snapshot_data) snapshot = Snapshot.from_dict(snapshot_data) logger.debug("snapshot: %s", snapshot) self.storage.snapshot_add([snapshot]) return snapshot def get_loader_name(self) -> str: """Returns a fully qualified name of this loader.""" return f"{self.__class__.__module__}.{self.__class__.__name__}" def get_loader_version(self) -> str: """Returns the version of the current loader.""" module_name = self.__class__.__module__ or "" module_name_parts = module_name.split(".") # Iterate rootward through the package hierarchy until we find a parent of this # loader's module with a __version__ attribute. for prefix_size in range(len(module_name_parts), 0, -1): package_name = ".".join(module_name_parts[0:prefix_size]) module = sys.modules[package_name] if hasattr(module, "__version__"): return module.__version__ # If this loader's class has no parent package with a __version__, # it should implement it itself. 
raise NotImplementedError( f"Could not dynamically find the version of {self.get_loader_name()}." ) def get_metadata_fetcher(self) -> MetadataFetcher: """Returns a MetadataFetcher instance representing this package loader; which is used to for adding provenance information to extracted extrinsic metadata, if any.""" return MetadataFetcher( name=self.get_loader_name(), version=self.get_loader_version(), metadata={}, ) def get_metadata_authority(self) -> MetadataAuthority: """For package loaders that get extrinsic metadata, returns the authority the metadata are coming from. """ raise NotImplementedError("get_metadata_authority") def get_extrinsic_origin_metadata(self) -> List[RawExtrinsicMetadataCore]: """Returns metadata items, used by build_extrinsic_origin_metadata.""" return [] def build_extrinsic_origin_metadata(self) -> List[RawExtrinsicMetadata]: """Builds a list of full RawExtrinsicMetadata objects, using metadata returned by get_extrinsic_origin_metadata.""" metadata_items = self.get_extrinsic_origin_metadata() if not metadata_items: # If this package loader doesn't write metadata, no need to require # an implementation for get_metadata_authority. 
return [] authority = self.get_metadata_authority() fetcher = self.get_metadata_fetcher() metadata_objects = [] for item in metadata_items: metadata_objects.append( RawExtrinsicMetadata( target=self.origin.swhid(), discovery_date=item.discovery_date or self.visit_date, authority=authority, fetcher=fetcher, format=item.format, metadata=item.metadata, ) ) return metadata_objects def get_extrinsic_snapshot_metadata(self) -> List[RawExtrinsicMetadataCore]: """Returns metadata items, used by build_extrinsic_snapshot_metadata.""" return [] def build_extrinsic_snapshot_metadata( self, snapshot_id: Sha1Git ) -> List[RawExtrinsicMetadata]: """Builds a list of full RawExtrinsicMetadata objects, using metadata returned by get_extrinsic_snapshot_metadata.""" metadata_items = self.get_extrinsic_snapshot_metadata() if not metadata_items: # If this package loader doesn't write metadata, no need to require # an implementation for get_metadata_authority. return [] authority = self.get_metadata_authority() fetcher = self.get_metadata_fetcher() metadata_objects = [] for item in metadata_items: metadata_objects.append( RawExtrinsicMetadata( target=ExtendedSWHID( object_type=ExtendedObjectType.SNAPSHOT, object_id=snapshot_id ), discovery_date=item.discovery_date or self.visit_date, authority=authority, fetcher=fetcher, format=item.format, metadata=item.metadata, origin=self.origin.url, ) ) return metadata_objects def build_extrinsic_directory_metadata( self, p_info: TPackageInfo, release_id: Sha1Git, directory_id: Sha1Git, ) -> List[RawExtrinsicMetadata]: if not p_info.directory_extrinsic_metadata: # If this package loader doesn't write metadata, no need to require # an implementation for get_metadata_authority. 
return [] authority = self.get_metadata_authority() fetcher = self.get_metadata_fetcher() metadata_objects = [] for item in p_info.directory_extrinsic_metadata: metadata_objects.append( RawExtrinsicMetadata( target=ExtendedSWHID( object_type=ExtendedObjectType.DIRECTORY, object_id=directory_id ), discovery_date=item.discovery_date or self.visit_date, authority=authority, fetcher=fetcher, format=item.format, metadata=item.metadata, origin=self.origin.url, release=CoreSWHID( object_type=ObjectType.RELEASE, object_id=release_id ), ) ) return metadata_objects def _load_extrinsic_directory_metadata( self, p_info: TPackageInfo, release_id: Sha1Git, directory_id: Sha1Git, ) -> None: metadata_objects = self.build_extrinsic_directory_metadata( p_info, release_id, directory_id ) self.load_metadata_objects(metadata_objects) def _load_extids(self, extids: Set[ExtID]) -> None: if not extids: return try: self.storage.extid_add(list(extids)) except Exception as e: logger.exception("Failed to load new ExtIDs for %s", self.origin.url) sentry_sdk.capture_exception(e) # No big deal, it just means the next visit will load the same versions # again. 
def rev2rel(rev: Revision, version: str) -> Release: """Converts a revision to a release.""" message = rev.message if message and not message.endswith(b"\n"): message += b"\n" return Release( name=version.encode(), message=message, target=rev.directory, target_type=ModelObjectType.DIRECTORY, synthetic=rev.synthetic, author=rev.author, date=rev.date, ) diff --git a/swh/loader/package/nixguix/loader.py b/swh/loader/package/nixguix/loader.py index 9648790..46eeaf0 100644 --- a/swh/loader/package/nixguix/loader.py +++ b/swh/loader/package/nixguix/loader.py @@ -1,308 +1,308 @@ # Copyright (C) 2020-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import copy import json import logging import re from typing import Any, Dict, Iterator, List, Mapping, Optional, Set, Tuple import attr from swh.loader.package.loader import ( BasePackageInfo, PackageLoader, PartialExtID, RawExtrinsicMetadataCore, ) -from swh.loader.package.utils import EMPTY_AUTHOR, api_info, cached_method +from swh.loader.package.utils import EMPTY_AUTHOR, cached_method, get_url_body from swh.model import hashutil from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, ObjectType, Release, Sha1Git, ) from swh.model.swhids import CoreSWHID from swh.storage.interface import StorageInterface logger = logging.getLogger(__name__) EXTID_TYPE = "subresource-integrity" """The ExtID is an ASCII string, as defined by https://w3c.github.io/webappsec-subresource-integrity/""" EXTID_VERSION = 0 @attr.s class NixGuixPackageInfo(BasePackageInfo): raw_info = attr.ib(type=Dict[str, Any]) integrity = attr.ib(type=str) """Hash of the archive, formatted as in the Subresource Integrity specification.""" @classmethod def from_metadata( cls, metadata: Dict[str, Any], version: str ) -> "NixGuixPackageInfo": return cls( url=metadata["url"], 
filename=None, version=version, integrity=metadata["integrity"], raw_info=metadata, ) def extid(self) -> PartialExtID: return (EXTID_TYPE, EXTID_VERSION, self.integrity.encode("ascii")) class NixGuixLoader(PackageLoader[NixGuixPackageInfo]): """Load sources from a sources.json file. This loader is used to load sources used by functional package manager (eg. Nix and Guix). """ visit_type = "nixguix" def __init__( self, storage: StorageInterface, url: str, unsupported_file_extensions: List[str] = [], **kwargs: Any, ): super().__init__(storage=storage, url=url, **kwargs) self.provider_url = url self.unsupported_file_extensions = unsupported_file_extensions # Note: this could be renamed get_artifacts in the PackageLoader # base class. @cached_method def raw_sources(self): return retrieve_sources(self.origin.url) @cached_method def supported_sources(self): raw_sources = self.raw_sources() return clean_sources( parse_sources(raw_sources), self.unsupported_file_extensions ) @cached_method def integrity_by_url(self) -> Dict[str, str]: sources = self.supported_sources() return {s["urls"][0]: s["integrity"] for s in sources["sources"]} def get_versions(self) -> List[str]: """The first mirror of the mirror list is used as branch name in the snapshot. """ return list(self.integrity_by_url().keys()) def get_metadata_authority(self): return MetadataAuthority( type=MetadataAuthorityType.FORGE, url=self.origin.url, metadata={}, ) def get_extrinsic_snapshot_metadata(self): return [ RawExtrinsicMetadataCore( format="nixguix-sources-json", metadata=self.raw_sources(), ), ] # Note: this could be renamed get_artifact_info in the PackageLoader # base class. def get_package_info(self, url) -> Iterator[Tuple[str, NixGuixPackageInfo]]: # TODO: try all mirrors and not only the first one. A source # can be fetched from several urls, called mirrors. We # currently only use the first one, but if the first one # fails, we should try the second one and so on. 
integrity = self.integrity_by_url()[url] p_info = NixGuixPackageInfo.from_metadata( {"url": url, "integrity": integrity}, version=url ) yield url, p_info def select_extid_target( self, p_info: NixGuixPackageInfo, extid_targets: Set[CoreSWHID] ) -> Optional[CoreSWHID]: if extid_targets: # The archive URL is part of the release name. As that URL is not # intrinsic metadata, it means different releases may be created for # the same SRI so they have the same extid. # Therefore, we need to pick the one with the right URL. releases = self.storage.release_get( [target.object_id for target in extid_targets] ) extid_targets = { release.swhid() for release in releases if release is not None and release.name == p_info.version.encode() } return super().select_extid_target(p_info, extid_targets) def extra_branches(self) -> Dict[bytes, Mapping[str, Any]]: """We add a branch to the snapshot called 'evaluation' pointing to the revision used to generate the sources.json file. This revision is specified in the sources.json file itself. For the nixpkgs origin, this revision is coming from the github.com/nixos/nixpkgs repository. Note this repository is not loaded explicitly. So, this pointer can target a nonexistent revision for a time. However, the github and gnu loaders are supposed to load this revision and should create the revision pointed by this branch. This branch can be used to identify the snapshot associated to a Nix/Guix evaluation. """ # The revision used to create the sources.json file. 
For Nix, # this revision belongs to the github.com/nixos/nixpkgs # repository revision = self.supported_sources()["revision"] return { b"evaluation": { "target_type": "revision", "target": hashutil.hash_to_bytes(revision), } } def build_release( self, p_info: NixGuixPackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Release]: return Release( name=p_info.version.encode(), message=None, author=EMPTY_AUTHOR, date=None, target=directory, target_type=ObjectType.DIRECTORY, synthetic=True, ) def retrieve_sources(url: str) -> bytes: """Retrieve sources. Potentially raise NotFound error.""" - return api_info(url, allow_redirects=True) + return get_url_body(url, allow_redirects=True) def parse_sources(raw_sources: bytes) -> Dict[str, Any]: return json.loads(raw_sources.decode("utf-8")) def make_pattern_unsupported_file_extension( unsupported_file_extensions: List[str], ): """Make a regexp pattern for unsupported file extension out of a list of unsupported archive extension list. """ return re.compile( rf".*\.({'|'.join(map(re.escape, unsupported_file_extensions))})$", re.DOTALL ) def clean_sources( sources: Dict[str, Any], unsupported_file_extensions=[] ) -> Dict[str, Any]: """Validate and clean the sources structure. First, ensure all top level keys are present. Then, walk the sources list and remove sources that do not contain required keys. 
Filter out source entries whose: - required keys are missing - source type is not supported - urls attribute type is not a list - extension is known not to be supported by the loader Raises: ValueError if: - a required top level key is missing - top-level version is not 1 Returns: source Dict cleaned up """ pattern_unsupported_file = make_pattern_unsupported_file_extension( unsupported_file_extensions ) # Required top level keys required_keys = ["version", "revision", "sources"] missing_keys = [] for required_key in required_keys: if required_key not in sources: missing_keys.append(required_key) if missing_keys != []: raise ValueError( f"sources structure invalid, missing: {','.join(missing_keys)}" ) # Only the version 1 is currently supported version = int(sources["version"]) if version != 1: raise ValueError( f"The sources structure version '{sources['version']}' is not supported" ) # If a source doesn't contain required attributes, this source is # skipped but others could still be archived. verified_sources = [] for source in sources["sources"]: valid = True required_keys = ["urls", "integrity", "type"] for required_key in required_keys: if required_key not in source: logger.info( f"Skip source '{source}' because key '{required_key}' is missing", ) valid = False if valid and source["type"] != "url": logger.info( f"Skip source '{source}' because the type {source['type']} " "is not supported", ) valid = False if valid and not isinstance(source["urls"], list): logger.info( f"Skip source {source} because the urls attribute is not a list" ) valid = False if valid and len(source["urls"]) > 0: # Filter out unsupported archives supported_sources: List[str] = [] for source_url in source["urls"]: if pattern_unsupported_file.match(source_url): logger.info(f"Skip unsupported artifact url {source_url}") continue supported_sources.append(source_url) if len(supported_sources) == 0: logger.info( f"Skip source {source} because urls only reference " "unsupported artifacts. 
Unsupported " f"artifacts so far: {pattern_unsupported_file}" ) continue new_source = copy.deepcopy(source) new_source["urls"] = supported_sources verified_sources.append(new_source) sources["sources"] = verified_sources return sources diff --git a/swh/loader/package/npm/loader.py b/swh/loader/package/npm/loader.py index 91081c6..a44e22d 100644 --- a/swh/loader/package/npm/loader.py +++ b/swh/loader/package/npm/loader.py @@ -1,300 +1,300 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from codecs import BOM_UTF8 import json import logging import os import string from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple, Union from urllib.parse import quote import attr import chardet from swh.loader.package.loader import ( BasePackageInfo, PackageLoader, RawExtrinsicMetadataCore, ) -from swh.loader.package.utils import api_info, cached_method, release_name +from swh.loader.package.utils import cached_method, get_url_body, release_name from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, ObjectType, Person, Release, Sha1Git, TimestampWithTimezone, ) from swh.storage.interface import StorageInterface logger = logging.getLogger(__name__) EMPTY_PERSON = Person.from_fullname(b"") @attr.s class NpmPackageInfo(BasePackageInfo): raw_info = attr.ib(type=Dict[str, Any]) package_name = attr.ib(type=str) date = attr.ib(type=Optional[str]) shasum = attr.ib(type=str) """sha1 checksum""" # we cannot rely only on $shasum, as it is technically possible for two versions # of the same package to have the exact same tarball. # But the release data (message and date) are extrinsic to the content of the # package, so they differ between versions. # So we need every attribute used to build the release object to be part of the # manifest. 
MANIFEST_FORMAT = string.Template( "date $date\nname $package_name\nshasum $shasum\nurl $url\nversion $version" ) EXTID_TYPE = "npm-manifest-sha256" EXTID_VERSION = 0 @classmethod def from_metadata( cls, project_metadata: Dict[str, Any], version: str ) -> "NpmPackageInfo": package_metadata = project_metadata["versions"][version] url = package_metadata["dist"]["tarball"] assert package_metadata["name"] == project_metadata["name"] # No date available in intrinsic metadata: retrieve it from the API # metadata, using the version number that the API claims this package # has. extrinsic_version = package_metadata["version"] if "time" in project_metadata: date = project_metadata["time"][extrinsic_version] elif "mtime" in package_metadata: date = package_metadata["mtime"] else: date = None return cls( package_name=package_metadata["name"], url=url, filename=os.path.basename(url), date=date, shasum=package_metadata["dist"]["shasum"], version=extrinsic_version, raw_info=package_metadata, directory_extrinsic_metadata=[ RawExtrinsicMetadataCore( format="replicate-npm-package-json", metadata=json.dumps(package_metadata).encode(), ) ], ) class NpmLoader(PackageLoader[NpmPackageInfo]): """Load npm origin's artifact releases into swh archive.""" visit_type = "npm" def __init__(self, storage: StorageInterface, url: str, **kwargs: Any): """Constructor Args str: origin url (e.g. 
https://www.npmjs.com/package/) """ super().__init__(storage=storage, url=url, **kwargs) self.package_name = url.split("https://www.npmjs.com/package/")[1] safe_name = quote(self.package_name, safe="") self.provider_url = f"https://replicate.npmjs.com/{safe_name}/" self._info: Dict[str, Any] = {} self._versions = None @cached_method def _raw_info(self) -> bytes: - return api_info(self.provider_url) + return get_url_body(self.provider_url) @cached_method def info(self) -> Dict: """Return the project metadata information (fetched from npm registry)""" return json.loads(self._raw_info()) def get_versions(self) -> Sequence[str]: return sorted(list(self.info()["versions"].keys())) def get_default_version(self) -> str: return self.info()["dist-tags"].get("latest", "") def get_metadata_authority(self): return MetadataAuthority( type=MetadataAuthorityType.FORGE, url="https://npmjs.com/", metadata={}, ) def get_package_info(self, version: str) -> Iterator[Tuple[str, NpmPackageInfo]]: p_info = NpmPackageInfo.from_metadata( project_metadata=self.info(), version=version ) yield release_name(version), p_info def build_release( self, p_info: NpmPackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Release]: # Metadata from NPM is not intrinsic to tarballs. # This means two package versions can have the same tarball, but different # metadata. To avoid mixing up releases, every field used to build the # release object must be part of NpmPackageInfo.MANIFEST_FORMAT. i_metadata = extract_intrinsic_metadata(uncompressed_path) if not i_metadata: return None author = extract_npm_package_author(i_metadata) assert self.package_name == p_info.package_name msg = ( f"Synthetic release for NPM source package {p_info.package_name} " f"version {p_info.version}\n" ) if p_info.date is None: url = p_info.url artifact_name = os.path.basename(url) raise ValueError( "Origin %s: Cannot determine upload time for artifact %s." 
% (p_info.url, artifact_name) ) date = TimestampWithTimezone.from_iso8601(p_info.date) # FIXME: this is to remain bug-compatible with earlier versions: date = attr.evolve(date, timestamp=attr.evolve(date.timestamp, microseconds=0)) r = Release( name=p_info.version.encode(), message=msg.encode(), author=author, date=date, target=directory, target_type=ObjectType.DIRECTORY, synthetic=True, ) return r def _author_str(author_data: Union[Dict, List, str]) -> str: """Parse author from package.json author fields""" if isinstance(author_data, dict): author_str = "" name = author_data.get("name") if name is not None: if isinstance(name, str): author_str += name elif isinstance(name, list): author_str += _author_str(name[0]) if len(name) > 0 else "" email = author_data.get("email") if email is not None: author_str += f" <{email}>" result = author_str elif isinstance(author_data, list): result = _author_str(author_data[0]) if len(author_data) > 0 else "" else: result = author_data return result def extract_npm_package_author(package_json: Dict[str, Any]) -> Person: """ Extract package author from a ``package.json`` file content and return it in swh format. Args: package_json: Dict holding the content of parsed ``package.json`` file Returns: Person """ for author_key in ("author", "authors"): if author_key in package_json: author_data = package_json[author_key] if author_data is None: return EMPTY_PERSON author_str = _author_str(author_data) return Person.from_fullname(author_str.encode()) return EMPTY_PERSON def _lstrip_bom(s, bom=BOM_UTF8): if s.startswith(bom): return s[len(bom) :] else: return s def load_json(json_bytes): """ Try to load JSON from bytes and return a dictionary. First try to decode from utf-8. If the decoding failed, try to detect the encoding and decode again with replace error handling. If JSON is malformed, an empty dictionary will be returned. 
Args: json_bytes (bytes): binary content of a JSON file Returns: dict: JSON data loaded in a dictionary """ json_data = {} try: json_str = _lstrip_bom(json_bytes).decode("utf-8") except UnicodeDecodeError: encoding = chardet.detect(json_bytes)["encoding"] if encoding: json_str = json_bytes.decode(encoding, "replace") try: json_data = json.loads(json_str) except json.decoder.JSONDecodeError: pass return json_data def extract_intrinsic_metadata(dir_path: str) -> Dict: """Given an uncompressed path holding the pkginfo file, returns a pkginfo parsed structure as a dict. The release artifact contains at their root one folder. For example: $ tar tvf zprint-0.0.6.tar.gz drwxr-xr-x root/root 0 2018-08-22 11:01 zprint-0.0.6/ ... Args: dir_path (str): Path to the uncompressed directory representing a release artifact from npm. Returns: the pkginfo parsed structure as a dict if any or None if none was present. """ # Retrieve the root folder of the archive if not os.path.exists(dir_path): return {} lst = os.listdir(dir_path) if len(lst) == 0: return {} project_dirname = lst[0] package_json_path = os.path.join(dir_path, project_dirname, "package.json") if not os.path.exists(package_json_path): return {} with open(package_json_path, "rb") as package_json_file: package_json_bytes = package_json_file.read() return load_json(package_json_bytes) diff --git a/swh/loader/package/pubdev/loader.py b/swh/loader/package/pubdev/loader.py index bcce138..608457a 100644 --- a/swh/loader/package/pubdev/loader.py +++ b/swh/loader/package/pubdev/loader.py @@ -1,194 +1,194 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json from pathlib import Path from typing import Any, Dict, Iterator, Optional, Sequence, Tuple import attr from packaging.version import parse as parse_version import yaml 
from swh.loader.package.loader import BasePackageInfo, PackageLoader from swh.loader.package.utils import ( EMPTY_AUTHOR, Person, - api_info, cached_method, + get_url_body, release_name, ) from swh.model.model import ObjectType, Release, Sha1Git, TimestampWithTimezone from swh.storage.interface import StorageInterface @attr.s class PubDevPackageInfo(BasePackageInfo): name = attr.ib(type=str) """Name of the package""" version = attr.ib(type=str) """Current version""" last_modified = attr.ib(type=str) """Last modified date as release date""" author = attr.ib(type=Person) """Author""" description = attr.ib(type=str) """Description""" def extract_intrinsic_metadata(dir_path: Path) -> Dict[str, Any]: """Extract intrinsic metadata from pubspec.yaml file at dir_path. Each pub.dev package version has a pubspec.yaml file at the root of the archive. See https://dart.dev/tools/pub/pubspec for pubspec specifications. Args: dir_path: A directory on disk where a pubspec.yaml must be present Returns: A dict mapping from yaml parser """ pubspec_path = dir_path / "pubspec.yaml" return yaml.safe_load(pubspec_path.read_text()) class PubDevLoader(PackageLoader[PubDevPackageInfo]): visit_type = "pubdev" PUBDEV_BASE_URL = "https://pub.dev/" def __init__( self, storage: StorageInterface, url: str, **kwargs, ): super().__init__(storage=storage, url=url, **kwargs) self.url = url assert url.startswith(self.PUBDEV_BASE_URL) self.package_info_url = url.replace( self.PUBDEV_BASE_URL, f"{self.PUBDEV_BASE_URL}api/" ) def _raw_info(self) -> bytes: - return api_info(self.package_info_url) + return get_url_body(self.package_info_url) @cached_method def info(self) -> Dict: """Return the project metadata information (fetched from pub.dev registry)""" # Use strict=False in order to correctly manage case where \n is present in a string info = json.loads(self._raw_info(), strict=False) # Arrange versions list as a new dict with `version` as key versions = {v["version"]: v for v in info["versions"]} 
info["versions"] = versions return info def get_versions(self) -> Sequence[str]: """Get all released versions of a PubDev package Returns: A sequence of versions Example:: ["0.1.1", "0.10.2"] """ versions = list(self.info()["versions"].keys()) versions.sort(key=parse_version) return versions def get_default_version(self) -> str: """Get the newest release version of a PubDev package Returns: A string representing a version Example:: "0.1.2" """ latest = self.info()["latest"] return latest["version"] def get_package_info(self, version: str) -> Iterator[Tuple[str, PubDevPackageInfo]]: """Get release name and package information from version Package info comes from extrinsic metadata (from self.info()) Args: version: Package version (e.g: "0.1.0") Returns: Iterator of tuple (release_name, p_info) """ v = self.info()["versions"][version] assert v["version"] == version url = v["archive_url"] name = v["pubspec"]["name"] filename = f"{name}-{version}.tar.gz" last_modified = v["published"] if "authors" in v["pubspec"]: # TODO: here we have a list of author, see T3887 author = Person.from_fullname(v["pubspec"]["authors"][0].encode()) elif "author" in v["pubspec"] and v["pubspec"]["author"] is not None: author = Person.from_fullname(v["pubspec"]["author"].encode()) else: author = EMPTY_AUTHOR description = v["pubspec"]["description"] p_info = PubDevPackageInfo( name=name, filename=filename, url=url, version=version, last_modified=last_modified, author=author, description=description, ) yield release_name(version), p_info def build_release( self, p_info: PubDevPackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Release]: # Extract intrinsic metadata from uncompressed_path/pubspec.yaml intrinsic_metadata = extract_intrinsic_metadata(Path(uncompressed_path)) name: str = intrinsic_metadata["name"] version: str = intrinsic_metadata["version"] assert version == p_info.version # author from intrinsic_metadata should not take precedence over the one # returned by 
the api, see https://dart.dev/tools/pub/pubspec#authorauthors author: Person = p_info.author if "description" in intrinsic_metadata and intrinsic_metadata["description"]: description = intrinsic_metadata["description"] else: description = p_info.description message = ( f"Synthetic release for pub.dev source package {name} " f"version {version}\n\n" f"{description}\n" ) return Release( name=version.encode(), author=author, date=TimestampWithTimezone.from_iso8601(p_info.last_modified), message=message.encode(), target_type=ObjectType.DIRECTORY, target=directory, synthetic=True, ) diff --git a/swh/loader/package/pypi/loader.py b/swh/loader/package/pypi/loader.py index cb427a9..fe814f7 100644 --- a/swh/loader/package/pypi/loader.py +++ b/swh/loader/package/pypi/loader.py @@ -1,243 +1,248 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import logging import os from typing import Any, Dict, Iterator, Optional, Sequence, Tuple from urllib.parse import urlparse import attr from pkginfo import UnpackedSDist from swh.loader.package.loader import ( BasePackageInfo, PackageLoader, PartialExtID, RawExtrinsicMetadataCore, ) -from swh.loader.package.utils import EMPTY_AUTHOR, api_info, cached_method, release_name +from swh.loader.package.utils import ( + EMPTY_AUTHOR, + cached_method, + get_url_body, + release_name, +) from swh.model.hashutil import hash_to_bytes from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, ObjectType, Person, Release, Sha1Git, TimestampWithTimezone, ) from swh.storage.interface import StorageInterface logger = logging.getLogger(__name__) EXTID_TYPE = "pypi-archive-sha256" EXTID_VERSION = 0 @attr.s class PyPIPackageInfo(BasePackageInfo): raw_info = attr.ib(type=Dict[str, Any]) name = attr.ib(type=str) comment_text = 
attr.ib(type=Optional[str]) sha256 = attr.ib(type=str) upload_time = attr.ib(type=str) @classmethod def from_metadata( cls, metadata: Dict[str, Any], name: str, version: str ) -> "PyPIPackageInfo": return cls( url=metadata["url"], filename=metadata["filename"], version=version, raw_info=metadata, name=name, comment_text=metadata.get("comment_text"), sha256=metadata["digests"]["sha256"], upload_time=metadata["upload_time"], directory_extrinsic_metadata=[ RawExtrinsicMetadataCore( format="pypi-project-json", metadata=json.dumps(metadata).encode(), ) ], ) def extid(self) -> PartialExtID: return (EXTID_TYPE, EXTID_VERSION, hash_to_bytes(self.sha256)) class PyPILoader(PackageLoader[PyPIPackageInfo]): """Load pypi origin's artifact releases into swh archive.""" visit_type = "pypi" def __init__(self, storage: StorageInterface, url: str, **kwargs): super().__init__(storage=storage, url=url, **kwargs) self.provider_url = pypi_api_url(self.origin.url) @cached_method def _raw_info(self) -> bytes: - return api_info(self.provider_url) + return get_url_body(self.provider_url) @cached_method def info(self) -> Dict: """Return the project metadata information (fetched from pypi registry)""" return json.loads(self._raw_info()) def get_versions(self) -> Sequence[str]: return self.info()["releases"].keys() def get_default_version(self) -> str: return self.info()["info"]["version"] def get_metadata_authority(self): p_url = urlparse(self.origin.url) return MetadataAuthority( type=MetadataAuthorityType.FORGE, url=f"{p_url.scheme}://{p_url.netloc}/", metadata={}, ) def get_package_info(self, version: str) -> Iterator[Tuple[str, PyPIPackageInfo]]: res = [] for meta in self.info()["releases"][version]: # process only standard sdist archives if meta["packagetype"] != "sdist" or meta["filename"].lower().endswith( (".deb", ".egg", ".rpm", ".whl") ): continue p_info = PyPIPackageInfo.from_metadata( meta, name=self.info()["info"]["name"], version=version ) res.append((version, p_info)) if 
len(res) == 1: version, p_info = res[0] yield release_name(version), p_info else: for version, p_info in res: yield release_name(version, p_info.filename), p_info def build_release( self, p_info: PyPIPackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Release]: i_metadata = extract_intrinsic_metadata(uncompressed_path) if not i_metadata: return None # from intrinsic metadata version_ = i_metadata.get("version", p_info.version) author_ = author(i_metadata) if p_info.comment_text: msg = p_info.comment_text else: msg = ( f"Synthetic release for PyPI source package {p_info.name} " f"version {version_}\n" ) date = TimestampWithTimezone.from_iso8601(p_info.upload_time) return Release( name=p_info.version.encode(), message=msg.encode(), author=author_, date=date, target=directory, target_type=ObjectType.DIRECTORY, synthetic=True, ) def pypi_api_url(url: str) -> str: """Compute api url from a project url Args: url (str): PyPI instance's url (e.g: https://pypi.org/project/requests) This deals with correctly transforming the project's api url (e.g https://pypi.org/pypi/requests/json) Returns: api url """ p_url = urlparse(url) project_name = p_url.path.rstrip("/").split("/")[-1] url = "%s://%s/pypi/%s/json" % (p_url.scheme, p_url.netloc, project_name) return url def extract_intrinsic_metadata(dir_path: str) -> Dict: """Given an uncompressed path holding the pkginfo file, returns a pkginfo parsed structure as a dict. The release artifact contains at their root one folder. For example: $ tar tvf zprint-0.0.6.tar.gz drwxr-xr-x root/root 0 2018-08-22 11:01 zprint-0.0.6/ ... Args: dir_path (str): Path to the uncompressed directory representing a release artifact from pypi. Returns: the pkginfo parsed structure as a dict if any or None if none was present. 
""" # Retrieve the root folder of the archive if not os.path.exists(dir_path): return {} lst = os.listdir(dir_path) if len(lst) != 1: return {} project_dirname = lst[0] pkginfo_path = os.path.join(dir_path, project_dirname, "PKG-INFO") if not os.path.exists(pkginfo_path): return {} pkginfo = UnpackedSDist(pkginfo_path) raw = pkginfo.__dict__ raw.pop("filename") # this gets added with the ondisk location return raw def author(data: Dict) -> Person: """Given a dict of project/release artifact information (coming from PyPI), returns an author subset. Args: data (dict): Representing either artifact information or release information. Returns: swh-model dict representing a person. """ name = data.get("author") email = data.get("author_email") fullname = None # type: Optional[str] if email: fullname = "%s <%s>" % (name, email) else: fullname = name if not fullname: return EMPTY_AUTHOR if name is not None: name = name.encode("utf-8") if email is not None: email = email.encode("utf-8") return Person(fullname=fullname.encode("utf-8"), name=name, email=email) diff --git a/swh/loader/package/tests/data/https_example.org/package_example_example-v1.0.tar.gz b/swh/loader/package/tests/data/https_example.org/package_example_example-v1.0.tar.gz new file mode 100644 index 0000000..dba14f3 Binary files /dev/null and b/swh/loader/package/tests/data/https_example.org/package_example_example-v1.0.tar.gz differ diff --git a/swh/loader/package/tests/data/https_example.org/package_example_example-v2.0.tar.gz b/swh/loader/package/tests/data/https_example.org/package_example_example-v2.0.tar.gz new file mode 100644 index 0000000..05f2f02 Binary files /dev/null and b/swh/loader/package/tests/data/https_example.org/package_example_example-v2.0.tar.gz differ diff --git a/swh/loader/package/tests/data/https_example.org/package_example_example-v3.0.tar.gz b/swh/loader/package/tests/data/https_example.org/package_example_example-v3.0.tar.gz new file mode 100644 index 0000000..a20afe9 Binary files 
/dev/null and b/swh/loader/package/tests/data/https_example.org/package_example_example-v3.0.tar.gz differ diff --git a/swh/loader/package/tests/data/https_example.org/package_example_example-v4.0.tar.gz b/swh/loader/package/tests/data/https_example.org/package_example_example-v4.0.tar.gz new file mode 100644 index 0000000..9a68b23 Binary files /dev/null and b/swh/loader/package/tests/data/https_example.org/package_example_example-v4.0.tar.gz differ diff --git a/swh/loader/package/tests/test_loader.py b/swh/loader/package/tests/test_loader.py index 1ea43dc..9cb719d 100644 --- a/swh/loader/package/tests/test_loader.py +++ b/swh/loader/package/tests/test_loader.py @@ -1,541 +1,637 @@ -# Copyright (C) 2019-2021 The Software Heritage developers +# Copyright (C) 2019-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import hashlib import logging import string +from typing import Optional from unittest.mock import Mock, call, patch import attr import pytest from swh.loader.core.loader import ( SENTRY_ORIGIN_URL_TAG_NAME, SENTRY_VISIT_TYPE_TAG_NAME, ) from swh.loader.package.loader import BasePackageInfo, PackageLoader +from swh.loader.package.utils import EMPTY_AUTHOR from swh.model.model import ( Origin, OriginVisit, OriginVisitStatus, Person, Release, Revision, RevisionType, + Sha1Git, Snapshot, SnapshotBranch, TargetType, TimestampWithTimezone, ) from swh.model.model import ExtID from swh.model.model import ObjectType as ModelObjectType from swh.model.swhids import CoreSWHID, ObjectType from swh.storage import get_storage from swh.storage.algos.snapshot import snapshot_get_latest class FakeStorage: def origin_add(self, origins): raise ValueError("We refuse to add an origin") def origin_visit_get_latest(self, origin): return None class FakeStorage2(FakeStorage): def 
origin_add(self, origins): pass def origin_visit_add(self, visits): raise ValueError("We refuse to add an origin visit") class StubPackageInfo(BasePackageInfo): pass +ORIGIN_URL = "https://example.org/package/example" + + class StubPackageLoader(PackageLoader[StubPackageInfo]): visit_type = "stub" def get_versions(self): return ["v1.0", "v2.0", "v3.0", "v4.0"] def get_package_info(self, version): + filename = f"example-{version}.tar.gz" p_info = StubPackageInfo( - "http://example.org", f"example-{version}.tar", version=version + f"{ORIGIN_URL}/{filename}", + filename, + version=version, ) extid_type = "extid-type1" if version in ("v1.0", "v2.0") else "extid-type2" # Versions 1.0 and 2.0 have an extid of a given type, v3.0 has an extid # of a different type patch.object( p_info, "extid", return_value=(extid_type, 0, f"extid-of-{version}".encode()), autospec=True, ).start() yield (f"branch-{version}", p_info) - def _load_release(self, p_info, origin): - return None + def build_release( + self, p_info: StubPackageInfo, uncompressed_path: str, directory: Sha1Git + ) -> Optional[Release]: + msg = ( + f"Synthetic release for source package {p_info.url} " + f"version {p_info.version}\n" + ) + + return Release( + name=p_info.version.encode(), + message=msg.encode(), + date=None, + author=EMPTY_AUTHOR, + target_type=ModelObjectType.DIRECTORY, + target=directory, + synthetic=True, + ) + + +def test_loader_origin_visit_success(swh_storage, requests_mock_datadir): + + loader = StubPackageLoader(swh_storage, ORIGIN_URL) + + assert loader.load() == { + "snapshot_id": "dcb9ecef64af73f2cdac7f5463cb6dece6b1db61", + "status": "eventful", + } + + assert loader.load_status() == {"status": "eventful"} + assert loader.visit_status() == "full" + + assert set(loader.last_snapshot().branches.keys()) == { + f"branch-{version}".encode() for version in loader.get_versions() + } def test_loader_origin_visit_failure(swh_storage): """Failure to add origin or origin visit should failed 
immediately""" loader = StubPackageLoader(swh_storage, "some-url") loader.storage = FakeStorage() actual_load_status = loader.load() assert actual_load_status == {"status": "failed"} + assert loader.load_status() == {"status": "failed"} + assert loader.visit_status() == "failed" + loader.storage = FakeStorage2() actual_load_status2 = loader.load() assert actual_load_status2 == {"status": "failed"} + assert loader.load_status() == {"status": "failed"} + assert loader.visit_status() == "failed" + def test_resolve_object_from_extids() -> None: storage = get_storage("memory") target = b"\x01" * 20 rel1 = Release( name=b"aaaa", message=b"aaaa", target=target, target_type=ModelObjectType.DIRECTORY, synthetic=False, ) rel2 = Release( name=b"bbbb", message=b"bbbb", target=target, target_type=ModelObjectType.DIRECTORY, synthetic=False, ) storage.release_add([rel1, rel2]) - loader = StubPackageLoader(storage, "http://example.org/") + loader = StubPackageLoader(storage, ORIGIN_URL) p_info = Mock(wraps=BasePackageInfo(None, None, None)) # type: ignore # The PackageInfo does not support extids p_info.extid.return_value = None known_extids = {("extid-type", 0, b"extid-of-aaaa"): [rel1.swhid()]} whitelist = {b"unused"} assert loader.resolve_object_from_extids(known_extids, p_info, whitelist) is None # Some known extid, and the PackageInfo is not one of them (ie. cache miss) p_info.extid.return_value = ("extid-type", 0, b"extid-of-cccc") assert loader.resolve_object_from_extids(known_extids, p_info, whitelist) is None # Some known extid, and the PackageInfo is one of them (ie. cache hit), # but the target release was not in the previous snapshot p_info.extid.return_value = ("extid-type", 0, b"extid-of-aaaa") assert loader.resolve_object_from_extids(known_extids, p_info, whitelist) is None # Some known extid, and the PackageInfo is one of them (ie. 
cache hit), # and the target release was in the previous snapshot whitelist = {rel1.id} assert ( loader.resolve_object_from_extids(known_extids, p_info, whitelist) == rel1.swhid() ) # Same as before, but there is more than one extid, and only one is an allowed # release whitelist = {rel1.id} known_extids = {("extid-type", 0, b"extid-of-aaaa"): [rel2.swhid(), rel1.swhid()]} assert ( loader.resolve_object_from_extids(known_extids, p_info, whitelist) == rel1.swhid() ) def test_resolve_object_from_extids_missing_target() -> None: storage = get_storage("memory") target = b"\x01" * 20 rel = Release( name=b"aaaa", message=b"aaaa", target=target, target_type=ModelObjectType.DIRECTORY, synthetic=False, ) - loader = StubPackageLoader(storage, "http://example.org/") + loader = StubPackageLoader(storage, ORIGIN_URL) p_info = Mock(wraps=BasePackageInfo(None, None, None)) # type: ignore known_extids = {("extid-type", 0, b"extid-of-aaaa"): [rel.swhid()]} p_info.extid.return_value = ("extid-type", 0, b"extid-of-aaaa") whitelist = {rel.id} # Targeted release is missing from the storage assert loader.resolve_object_from_extids(known_extids, p_info, whitelist) is None storage.release_add([rel]) # Targeted release now exists assert ( loader.resolve_object_from_extids(known_extids, p_info, whitelist) == rel.swhid() ) def test_load_get_known_extids() -> None: """Checks PackageLoader.load() fetches known extids efficiently""" storage = Mock(wraps=get_storage("memory")) - loader = StubPackageLoader(storage, "http://example.org") + loader = StubPackageLoader(storage, ORIGIN_URL) loader.load() # Calls should be grouped by extid type storage.extid_get_from_extid.assert_has_calls( [ call("extid-type1", [b"extid-of-v1.0", b"extid-of-v2.0"], version=0), call("extid-type2", [b"extid-of-v3.0", b"extid-of-v4.0"], version=0), ], any_order=True, ) def test_load_extids() -> None: """Checks PackageLoader.load() skips iff it should, and writes (only) the new ExtIDs""" storage = get_storage("memory") 
dir_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=b"e" * 20) rels = [ Release( name=f"v{i}.0".encode(), message=b"blah\n", target=dir_swhid.object_id, target_type=ModelObjectType.DIRECTORY, synthetic=True, ) for i in (1, 2, 3, 4) ] storage.release_add(rels[0:3]) - origin = "http://example.org" + origin = ORIGIN_URL rel1_swhid = rels[0].swhid() rel2_swhid = rels[1].swhid() rel3_swhid = rels[2].swhid() rel4_swhid = rels[3].swhid() # Results of a previous load storage.extid_add( [ ExtID("extid-type1", b"extid-of-v1.0", rel1_swhid), ExtID("extid-type2", b"extid-of-v2.0", rel2_swhid), ] ) last_snapshot = Snapshot( branches={ b"v1.0": SnapshotBranch( target_type=TargetType.RELEASE, target=rel1_swhid.object_id ), b"v2.0": SnapshotBranch( target_type=TargetType.RELEASE, target=rel2_swhid.object_id ), b"v3.0": SnapshotBranch( target_type=TargetType.RELEASE, target=rel3_swhid.object_id ), } ) storage.snapshot_add([last_snapshot]) date = datetime.datetime.now(tz=datetime.timezone.utc) storage.origin_add([Origin(url=origin)]) storage.origin_visit_add( - [OriginVisit(origin="http://example.org", visit=1, date=date, type="tar")] + [OriginVisit(origin=origin, visit=1, date=date, type="tar")] ) storage.origin_visit_status_add( [ OriginVisitStatus( origin=origin, visit=1, status="full", date=date, snapshot=last_snapshot.id, ) ] ) - loader = StubPackageLoader(storage, "http://example.org") + loader = StubPackageLoader(storage, origin) patch.object( loader, "_load_release", return_value=(rel4_swhid.object_id, dir_swhid.object_id), autospec=True, ).start() loader.load() + assert loader.load_status() == {"status": "eventful"} + assert loader.visit_status() == "full" + assert loader._load_release.mock_calls == [ # type: ignore # v1.0: not loaded because there is already its (extid_type, extid, rel) # in the storage. 
# v2.0: loaded, because there is already a similar extid, but different type call( - StubPackageInfo(origin, "example-v2.0.tar", "v2.0"), + StubPackageInfo( + f"{origin}/example-v2.0.tar.gz", "example-v2.0.tar.gz", "v2.0" + ), Origin(url=origin), ), # v3.0: loaded despite having an (extid_type, extid) in storage, because # the target of the extid is not in the previous snapshot call( - StubPackageInfo(origin, "example-v3.0.tar", "v3.0"), + StubPackageInfo( + f"{origin}/example-v3.0.tar.gz", "example-v3.0.tar.gz", "v3.0" + ), Origin(url=origin), ), # v4.0: loaded, because there isn't its extid call( - StubPackageInfo(origin, "example-v4.0.tar", "v4.0"), + StubPackageInfo( + f"{origin}/example-v4.0.tar.gz", "example-v4.0.tar.gz", "v4.0" + ), Origin(url=origin), ), ] # then check the snapshot has all the branches. # versions 2.0 to 4.0 all point to rel4_swhid (instead of the value of the last # snapshot), because they had to be loaded (mismatched extid), and the mocked # _load_release always returns rel4_swhid. 
snapshot = Snapshot( branches={ b"branch-v1.0": SnapshotBranch( target_type=TargetType.RELEASE, target=rel1_swhid.object_id ), b"branch-v2.0": SnapshotBranch( target_type=TargetType.RELEASE, target=rel4_swhid.object_id ), b"branch-v3.0": SnapshotBranch( target_type=TargetType.RELEASE, target=rel4_swhid.object_id ), b"branch-v4.0": SnapshotBranch( target_type=TargetType.RELEASE, target=rel4_swhid.object_id ), } ) assert snapshot_get_latest(storage, origin) == snapshot extids = storage.extid_get_from_target( ObjectType.RELEASE, [ rel1_swhid.object_id, rel2_swhid.object_id, rel3_swhid.object_id, rel4_swhid.object_id, ], ) assert set(extids) == { # What we inserted at the beginning of the test: ExtID("extid-type1", b"extid-of-v1.0", rel1_swhid), ExtID("extid-type2", b"extid-of-v2.0", rel2_swhid), # Added by the loader: ExtID("extid-type1", b"extid-of-v2.0", rel4_swhid), ExtID("extid-type2", b"extid-of-v3.0", rel4_swhid), ExtID("extid-type2", b"extid-of-v4.0", rel4_swhid), } def test_load_upgrade_from_revision_extids(caplog): """Tests that, when loading incrementally based on a snapshot made by an old version of the loader, the loader will convert revisions to releases and add them to the storage. 
Also checks that, if an extid exists pointing to a non-existent revision (which should never happen, but you never know...), the release is loaded from scratch.""" storage = get_storage("memory") - origin = "http://example.org" + origin = ORIGIN_URL dir1_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=b"d" * 20) dir2_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=b"e" * 20) date = TimestampWithTimezone.from_datetime( datetime.datetime.now(tz=datetime.timezone.utc) ) person = Person.from_fullname(b"Jane Doe ") rev1 = Revision( message=b"blah", author=person, date=date, committer=person, committer_date=date, directory=dir1_swhid.object_id, type=RevisionType.TAR, synthetic=True, ) rel1 = Release( name=b"v1.0", message=b"blah\n", author=person, date=date, target=dir1_swhid.object_id, target_type=ModelObjectType.DIRECTORY, synthetic=True, ) rev1_swhid = rev1.swhid() rel1_swhid = rel1.swhid() rev2_swhid = CoreSWHID(object_type=ObjectType.REVISION, object_id=b"b" * 20) rel2_swhid = CoreSWHID(object_type=ObjectType.RELEASE, object_id=b"c" * 20) # Results of a previous load storage.extid_add( [ ExtID("extid-type1", b"extid-of-v1.0", rev1_swhid, 0), ExtID("extid-type1", b"extid-of-v2.0", rev2_swhid, 0), ] ) storage.revision_add([rev1]) last_snapshot = Snapshot( branches={ b"v1.0": SnapshotBranch( target_type=TargetType.REVISION, target=rev1_swhid.object_id ), b"v2.0": SnapshotBranch( target_type=TargetType.REVISION, target=rev2_swhid.object_id ), } ) storage.snapshot_add([last_snapshot]) date = datetime.datetime.now(tz=datetime.timezone.utc) storage.origin_add([Origin(url=origin)]) storage.origin_visit_add( - [OriginVisit(origin="http://example.org", visit=1, date=date, type="tar")] + [OriginVisit(origin=origin, visit=1, date=date, type="tar")] ) storage.origin_visit_status_add( [ OriginVisitStatus( origin=origin, visit=1, status="full", date=date, snapshot=last_snapshot.id, ) ] ) - loader = StubPackageLoader(storage, "http://example.org") + 
loader = StubPackageLoader(storage, origin) patch.object( loader, "_load_release", return_value=(rel2_swhid.object_id, dir2_swhid.object_id), autospec=True, ).start() patch.object( loader, "get_versions", return_value=["v1.0", "v2.0", "v3.0"], autospec=True, ).start() caplog.set_level(logging.ERROR) loader.load() + assert loader.load_status() == {"status": "eventful"} + assert loader.visit_status() == "full" + assert len(caplog.records) == 1 (record,) = caplog.records assert record.levelname == "ERROR" assert "Failed to upgrade branch branch-v2.0" in record.message assert loader._load_release.mock_calls == [ # v1.0: not loaded because there is already a revision matching it # v2.0: loaded, as the revision is missing from the storage even though there # is an extid - call(StubPackageInfo(origin, "example-v2.0.tar", "v2.0"), Origin(url=origin)), + call( + StubPackageInfo( + f"{origin}/example-v2.0.tar.gz", "example-v2.0.tar.gz", "v2.0" + ), + Origin(url=origin), + ), # v3.0: loaded (did not exist yet) - call(StubPackageInfo(origin, "example-v3.0.tar", "v3.0"), Origin(url=origin)), + call( + StubPackageInfo( + f"{origin}/example-v3.0.tar.gz", "example-v3.0.tar.gz", "v3.0" + ), + Origin(url=origin), + ), ] snapshot = Snapshot( branches={ b"branch-v1.0": SnapshotBranch( target_type=TargetType.RELEASE, target=rel1_swhid.object_id ), b"branch-v2.0": SnapshotBranch( target_type=TargetType.RELEASE, target=rel2_swhid.object_id ), b"branch-v3.0": SnapshotBranch( target_type=TargetType.RELEASE, target=rel2_swhid.object_id ), } ) assert snapshot_get_latest(storage, origin) == snapshot extids = storage.extid_get_from_target( ObjectType.RELEASE, [ rel1_swhid.object_id, rel2_swhid.object_id, ], ) assert set(extids) == { ExtID("extid-type1", b"extid-of-v1.0", rel1_swhid), ExtID("extid-type1", b"extid-of-v2.0", rel2_swhid), ExtID("extid-type2", b"extid-of-v3.0", rel2_swhid), } def test_manifest_extid(): """Compute primary key should return the right identity""" @attr.s class 
TestPackageInfo(BasePackageInfo): a = attr.ib() b = attr.ib() length = attr.ib() filename = attr.ib() MANIFEST_FORMAT = string.Template("$a $b") p_info = TestPackageInfo( url="http://example.org/", a=1, b=2, length=221837, filename="8sync-0.1.0.tar.gz", version="0.1.0", ) actual_id = p_info.extid() assert actual_id == ("package-manifest-sha256", 0, hashlib.sha256(b"1 2").digest()) def test_no_env_swh_config_filename_raise(monkeypatch): """No SWH_CONFIG_FILENAME environment variable makes package loader init raise""" class DummyPackageLoader(PackageLoader): """A dummy package loader for test purpose""" pass monkeypatch.delenv("SWH_CONFIG_FILENAME", raising=False) with pytest.raises( AssertionError, match="SWH_CONFIG_FILENAME environment variable is undefined" ): DummyPackageLoader.from_configfile(url="some-url") class StubPackageLoaderWithError(StubPackageLoader): def get_versions(self, *args, **kwargs): raise Exception("error") def test_loader_sentry_tags_on_error(swh_storage, sentry_events): - origin_url = "http://example.org/package/name" + origin_url = ORIGIN_URL loader = StubPackageLoaderWithError(swh_storage, origin_url) loader.load() + assert loader.load_status() == {"status": "failed"} + assert loader.visit_status() == "failed" sentry_tags = sentry_events[0]["tags"] assert sentry_tags.get(SENTRY_ORIGIN_URL_TAG_NAME) == origin_url assert ( sentry_tags.get(SENTRY_VISIT_TYPE_TAG_NAME) == StubPackageLoaderWithError.visit_type ) + + +class StubPackageLoaderWithPackageInfoFailure(StubPackageLoader): + def get_package_info(self, version): + if version == "v2.0": + raise Exception("Error when getting package info") + else: + return super().get_package_info(version) + + +def test_loader_origin_with_package_info_failure(swh_storage, requests_mock_datadir): + + loader = StubPackageLoaderWithPackageInfoFailure(swh_storage, ORIGIN_URL) + + assert loader.load() == { + "snapshot_id": "b4cce7081d661fb7f4d7a1db96e8044b752eb0b0", + "status": "eventful", + } + + assert 
loader.load_status() == {"status": "eventful"} + assert loader.visit_status() == "partial" + + assert set(loader.last_snapshot().branches.keys()) == { + f"branch-v{i}.0".encode() for i in (1, 3, 4) + } diff --git a/swh/loader/package/tests/test_utils.py b/swh/loader/package/tests/test_utils.py index bf1f4da..acff6af 100644 --- a/swh/loader/package/tests/test_utils.py +++ b/swh/loader/package/tests/test_utils.py @@ -1,273 +1,308 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import os from unittest.mock import MagicMock from urllib.error import URLError from urllib.parse import quote import pytest from requests.exceptions import HTTPError -from swh.loader.exception import NotFound import swh.loader.package -from swh.loader.package.utils import api_info, download, release_name +from swh.loader.package.utils import download, get_url_body, release_name def test_version_generation(): assert ( swh.loader.package.__version__ != "devel" ), "Make sure swh.loader.core is installed (e.g. 
pip install -e .)" @pytest.mark.fs def test_download_fail_to_download(tmp_path, requests_mock): url = "https://pypi.org/pypi/arrow/json" status_code = 404 requests_mock.get(url, status_code=status_code) with pytest.raises( HTTPError, match=f"{status_code} Client Error: None for url: {url}" ): download(url, tmp_path) _filename = "requests-0.0.1.tar.gz" _data = "this is something" def _check_download_ok(url, dest, filename=_filename, hashes={}): actual_filepath, actual_hashes = download(url, dest, hashes=hashes) actual_filename = os.path.basename(actual_filepath) assert actual_filename == filename assert actual_hashes["length"] == len(_data) assert ( actual_hashes["checksums"]["sha1"] == "fdd1ce606a904b08c816ba84f3125f2af44d92b2" ) assert ( actual_hashes["checksums"]["sha256"] == "1d9224378d77925d612c9f926eb9fb92850e6551def8328011b6a972323298d5" ) @pytest.mark.fs def test_download_ok(tmp_path, requests_mock): """Download without issue should provide filename and hashes""" url = f"https://pypi.org/pypi/requests/{_filename}" requests_mock.get(url, text=_data, headers={"content-length": str(len(_data))}) _check_download_ok(url, dest=str(tmp_path)) @pytest.mark.fs def test_download_ok_no_header(tmp_path, requests_mock): """Download without issue should provide filename and hashes""" url = f"https://pypi.org/pypi/requests/{_filename}" requests_mock.get(url, text=_data) # no header information _check_download_ok(url, dest=str(tmp_path)) @pytest.mark.fs def test_download_ok_with_hashes(tmp_path, requests_mock): """Download without issue should provide filename and hashes""" url = f"https://pypi.org/pypi/requests/{_filename}" requests_mock.get(url, text=_data, headers={"content-length": str(len(_data))}) # good hashes for such file good = { "sha1": "fdd1ce606a904b08c816ba84f3125f2af44d92b2", "sha256": "1d9224378d77925d612c9f926eb9fb92850e6551def8328011b6a972323298d5", # noqa } _check_download_ok(url, dest=str(tmp_path), hashes=good) @pytest.mark.fs def 
test_download_fail_hashes_mismatch(tmp_path, requests_mock): """Mismatch hash after download should raise""" url = f"https://pypi.org/pypi/requests/{_filename}" requests_mock.get(url, text=_data, headers={"content-length": str(len(_data))}) # good hashes for such file good = { "sha1": "fdd1ce606a904b08c816ba84f3125f2af44d92b2", "sha256": "1d9224378d77925d612c9f926eb9fb92850e6551def8328011b6a972323298d5", # noqa } for hash_algo in good.keys(): wrong_hash = good[hash_algo].replace("1", "0") expected_hashes = good.copy() expected_hashes[hash_algo] = wrong_hash # set the wrong hash expected_msg = "Failure when fetching %s. " "Checksum mismatched: %s != %s" % ( url, wrong_hash, good[hash_algo], ) with pytest.raises(ValueError, match=expected_msg): download(url, dest=str(tmp_path), hashes=expected_hashes) @pytest.mark.fs def test_ftp_download_ok(tmp_path, mocker): """Download without issue should provide filename and hashes""" url = f"ftp://pypi.org/pypi/requests/{_filename}" cm = MagicMock() cm.getstatus.return_value = 200 cm.read.side_effect = [_data.encode(), b""] cm.__enter__.return_value = cm mocker.patch("swh.loader.package.utils.urlopen").return_value = cm _check_download_ok(url, dest=str(tmp_path)) @pytest.mark.fs def test_ftp_download_ko(tmp_path, mocker): """Download without issue should provide filename and hashes""" filename = "requests-0.0.1.tar.gz" url = "ftp://pypi.org/pypi/requests/%s" % filename mocker.patch("swh.loader.package.utils.urlopen").side_effect = URLError("FTP error") with pytest.raises(URLError): download(url, dest=str(tmp_path)) @pytest.mark.fs def test_download_with_redirection(tmp_path, requests_mock): """Download with redirection should use the targeted URL to extract filename""" url = "https://example.org/project/requests/download" redirection_url = f"https://example.org/project/requests/files/{_filename}" requests_mock.get(url, status_code=302, headers={"location": redirection_url}) requests_mock.get( redirection_url, text=_data, 
headers={"content-length": str(len(_data))} ) _check_download_ok(url, dest=str(tmp_path)) def test_download_extracting_filename_from_url(tmp_path, requests_mock): """Extracting filename from url must sanitize the filename first""" url = "https://example.org/project/requests-0.0.1.tar.gz?a=b&c=d&foo=bar" requests_mock.get( url, status_code=200, text=_data, headers={"content-length": str(len(_data))} ) _check_download_ok(url, dest=str(tmp_path)) @pytest.mark.fs @pytest.mark.parametrize( "filename", [f'"{_filename}"', _filename, '"filename with spaces.tar.gz"'] ) def test_download_filename_from_content_disposition(tmp_path, requests_mock, filename): """Filename should be extracted from content-disposition request header when available.""" url = "https://example.org/download/requests/tar.gz/v0.0.1" requests_mock.get( url, text=_data, headers={ "content-length": str(len(_data)), "content-disposition": f"attachment; filename={filename}", }, ) _check_download_ok(url, dest=str(tmp_path), filename=filename.strip('"')) @pytest.mark.fs @pytest.mark.parametrize("filename", ['"archive école.tar.gz"', "archive_école.tgz"]) def test_download_utf8_filename_from_content_disposition( tmp_path, requests_mock, filename ): """Filename should be extracted from content-disposition request header when available.""" url = "https://example.org/download/requests/tar.gz/v0.0.1" data = "this is something" requests_mock.get( url, text=data, headers={ "content-length": str(len(data)), "content-disposition": f"attachment; filename*=utf-8''{quote(filename)}", }, ) _check_download_ok(url, dest=str(tmp_path), filename=filename.strip('"')) def test_api_info_failure(requests_mock): """Failure to fetch info/release information should raise""" url = "https://pypi.org/pypi/requests/json" status_code = 400 requests_mock.get(url, status_code=status_code) - with pytest.raises(NotFound) as e0: - api_info(url) - - assert e0.value.args[0] == "Fail to query '%s'. 
Reason: %s" % (url, status_code) + with pytest.raises( + HTTPError, match=f"{status_code} Client Error: None for url: {url}" + ): + get_url_body(url) def test_api_info(requests_mock): """Fetching json info from pypi project should be ok""" url = "https://pypi.org/pypi/requests/json" requests_mock.get(url, text='{"version": "0.0.1"}') - actual_info = json.loads(api_info(url)) + actual_info = json.loads(get_url_body(url)) assert actual_info == { "version": "0.0.1", } def test_release_name(): for version, filename, expected_release in [ ("0.0.1", None, "releases/0.0.1"), ("0.0.2", "something", "releases/0.0.2/something"), ]: assert release_name(version, filename) == expected_release @pytest.fixture(autouse=True) def mock_download_retry_sleep(mocker): mocker.patch.object(download.retry, "sleep") def test_download_retry(mocker, requests_mock, tmp_path): url = f"https://example.org/project/requests/files/{_filename}" requests_mock.get( url, [ {"status_code": 429}, {"status_code": 429}, { "text": _data, "headers": {"content-length": str(len(_data))}, "status_code": 200, }, ], ) _check_download_ok(url, dest=str(tmp_path)) def test_download_retry_reraise(mocker, requests_mock, tmp_path): url = f"https://example.org/project/requests/files/{_filename}" requests_mock.get( url, [{"status_code": 429}] * 5, ) with pytest.raises(HTTPError): _check_download_ok(url, dest=str(tmp_path)) + + +@pytest.fixture(autouse=True) +def mock_api_info_retry_sleep(mocker): + mocker.patch.object(get_url_body.retry, "sleep") + + +def test_api_info_retry(mocker, requests_mock, tmp_path): + url = "https://example.org/api/endpoint" + json_data = {"foo": "bar"} + + requests_mock.get( + url, + [ + {"status_code": 429}, + {"status_code": 429}, + { + "json": json_data, + "status_code": 200, + }, + ], + ) + + assert json.loads(get_url_body(url)) == json_data + + +def test_api_info_retry_reraise(mocker, requests_mock, tmp_path): + url = "https://example.org/api/endpoint" + + requests_mock.get( + url, + 
[{"status_code": 429}] * 5, + ) + + with pytest.raises(HTTPError, match=f"429 Client Error: None for url: {url}"): + get_url_body(url) diff --git a/swh/loader/package/utils.py b/swh/loader/package/utils.py index df3127c..adf882b 100644 --- a/swh/loader/package/utils.py +++ b/swh/loader/package/utils.py @@ -1,207 +1,213 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import copy import functools import itertools import logging import os import re from typing import Callable, Dict, Optional, Tuple, TypeVar from urllib.parse import unquote, urlsplit from urllib.request import urlopen import requests from requests.exceptions import HTTPError from tenacity import retry from tenacity.before_sleep import before_sleep_log from tenacity.stop import stop_after_attempt from tenacity.wait import wait_exponential from swh.loader.exception import NotFound from swh.loader.package import DEFAULT_PARAMS from swh.model.hashutil import HASH_BLOCK_SIZE, MultiHash from swh.model.model import Person logger = logging.getLogger(__name__) DOWNLOAD_HASHES = set(["sha1", "sha256", "length"]) EMPTY_AUTHOR = Person.from_fullname(b"") -def api_info(url: str, **extra_params) -> bytes: - """Basic api client to retrieve information on project. This deals with - fetching json metadata about pypi projects. - - Args: - url (str): The api url (e.g PyPI, npm, etc...) - - Raises: - NotFound in case of query failures (for some reasons: 404, ...) - - Returns: - The associated response's information - - """ - response = requests.get(url, **{**DEFAULT_PARAMS, **extra_params}) - if response.status_code != 200: - raise NotFound(f"Fail to query '{url}'. 
Reason: {response.status_code}") - return response.content - - def _content_disposition_filename(header: str) -> Optional[str]: fname = None fnames = re.findall(r"filename[\*]?=([^;]+)", header) if fnames and "utf-8''" in fnames[0].lower(): # RFC 5987 fname = re.sub("utf-8''", "", fnames[0], flags=re.IGNORECASE) fname = unquote(fname) elif fnames: fname = fnames[0] if fname: fname = os.path.basename(fname.strip().strip('"')) return fname def _retry_if_throttling(retry_state) -> bool: """Custom tenacity retry predicate for handling HTTP responses with status code 429 (too many requests). """ attempt = retry_state.outcome if attempt.failed: exception = attempt.exception() return ( isinstance(exception, HTTPError) and exception.response.status_code == 429 ) return False -@retry( +throttling_retry = retry( retry=_retry_if_throttling, wait=wait_exponential(exp_base=10), stop=stop_after_attempt(max_attempt_number=5), before_sleep=before_sleep_log(logger, logging.WARNING), reraise=True, ) + + +@throttling_retry def download( url: str, dest: str, hashes: Dict = {}, filename: Optional[str] = None, auth: Optional[Tuple[str, str]] = None, extra_request_headers: Optional[Dict[str, str]] = None, ) -> Tuple[str, Dict]: """Download a remote tarball from url, uncompresses and computes swh hashes on it. Args: url: Artifact uri to fetch, uncompress and hash dest: Directory to write the archive to hashes: Dict of expected hashes (key is the hash algo) for the artifact to download (those hashes are expected to be hex string) auth: Optional tuple of login/password (for http authentication service, e.g. deposit) Raises: ValueError in case of any error when fetching/computing (length, checksums mismatched...) 
Returns: Tuple of local (filepath, hashes of filepath) """ params = copy.deepcopy(DEFAULT_PARAMS) if auth is not None: params["auth"] = auth if extra_request_headers is not None: params["headers"].update(extra_request_headers) # so the connection does not hang indefinitely (read/connection timeout) timeout = params.get("timeout", 60) if url.startswith("ftp://"): response = urlopen(url, timeout=timeout) chunks = (response.read(HASH_BLOCK_SIZE) for _ in itertools.count()) response_data = itertools.takewhile(bool, chunks) else: response = requests.get(url, **params, timeout=timeout, stream=True) response.raise_for_status() # update URL to response one as requests follow redirection by default # on GET requests url = response.url # try to extract filename from content-disposition header if available if filename is None and "content-disposition" in response.headers: filename = _content_disposition_filename( response.headers["content-disposition"] ) response_data = response.iter_content(chunk_size=HASH_BLOCK_SIZE) filename = filename if filename else os.path.basename(urlsplit(url).path) logger.debug("filename: %s", filename) filepath = os.path.join(dest, filename) logger.debug("filepath: %s", filepath) h = MultiHash(hash_names=DOWNLOAD_HASHES | set(hashes.keys())) with open(filepath, "wb") as f: for chunk in response_data: h.update(chunk) f.write(chunk) response.close() # Also check the expected hashes if provided if hashes: actual_hashes = h.hexdigest() for algo_hash in hashes.keys(): actual_digest = actual_hashes[algo_hash] expected_digest = hashes[algo_hash] if actual_digest != expected_digest: raise ValueError( "Failure when fetching %s. 
" "Checksum mismatched: %s != %s" % (url, expected_digest, actual_digest) ) computed_hashes = h.hexdigest() length = computed_hashes.pop("length") extrinsic_metadata = { "length": length, "filename": filename, "checksums": computed_hashes, "url": url, } logger.debug("extrinsic_metadata", extrinsic_metadata) return filepath, extrinsic_metadata +@throttling_retry +def get_url_body(url: str, **extra_params) -> bytes: + """Basic HTTP client to retrieve information on software package, + typically JSON metadata from a REST API. + + Args: + url (str): An HTTP URL + + Raises: + NotFound in case of query failures (for some reasons: 404, ...) + + Returns: + The associated response's information + + """ + logger.debug("Fetching %s", url) + response = requests.get(url, **{**DEFAULT_PARAMS, **extra_params}) + if response.status_code == 404: + raise NotFound(f"Fail to query '{url}'. Reason: {response.status_code}") + response.raise_for_status() + return response.content + + def release_name(version: str, filename: Optional[str] = None) -> str: if filename: return "releases/%s/%s" % (version, filename) return "releases/%s" % version TReturn = TypeVar("TReturn") TSelf = TypeVar("TSelf") _UNDEFINED = object() def cached_method(f: Callable[[TSelf], TReturn]) -> Callable[[TSelf], TReturn]: cache_name = f"_cached_{f.__name__}" @functools.wraps(f) def newf(self): value = getattr(self, cache_name, _UNDEFINED) if value is _UNDEFINED: value = f(self) setattr(self, cache_name, value) return value return newf