diff --git a/PKG-INFO b/PKG-INFO
index c872963..0de803a 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,52 +1,52 @@
Metadata-Version: 2.1
Name: swh.loader.core
-Version: 3.5.0
+Version: 4.0.0
Summary: Software Heritage Base Loader
Home-page: https://forge.softwareheritage.org/diffusion/DLDBASE
Author: Software Heritage developers
Author-email: swh-devel@inria.fr
Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
Project-URL: Funding, https://www.softwareheritage.org/donate
Project-URL: Source, https://forge.softwareheritage.org/source/swh-loader-core
Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-loader-core/
Classifier: Programming Language :: Python :: 3
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
Classifier: Operating System :: OS Independent
Classifier: Development Status :: 5 - Production/Stable
Requires-Python: >=3.7
Description-Content-Type: text/markdown
Provides-Extra: testing
License-File: LICENSE
License-File: AUTHORS

Software Heritage - Loader foundations
======================================

The Software Heritage Loader Core provides low-level loading utilities and helpers used by :term:`loaders <loader>`.

The main entry points are the classes:

- :class:`swh.loader.core.loader.BaseLoader` for loaders (e.g. svn)
- :class:`swh.loader.core.loader.DVCSLoader` for DVCS loaders (e.g. hg, git, ...)
- :class:`swh.loader.package.loader.PackageLoader` for package loaders (e.g. PyPI, Npm, ...)

Package loaders
---------------

This package also implements many package loaders directly, out of convenience, as they usually are quite similar and each fits in a single file.

They all roughly follow the same steps, explained in the :py:meth:`swh.loader.package.loader.PackageLoader.load` documentation. See the :ref:`package-loader-tutorial` for details.

VCS loaders
-----------

Unlike package loaders, VCS loaders remain in separate packages, as they often need more advanced conversions and very VCS-specific operations.

This usually involves getting the branches of a repository and recursively loading the revisions in their history (and the directory trees in those revisions), until a known revision is found.
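To make the package-loader steps just described concrete, here is a minimal sketch of a new loader written against the `swh.loader.package` API as of this release. The "MyForge" name, its index URL layout, and its JSON fields are hypothetical; only the overridden hooks (`get_versions`, `get_package_info`, `build_release`) mirror the real `PackageLoader` extension points, and the authoritative step-by-step description remains the `PackageLoader.load` docstring.

```python
# Sketch of a hypothetical "myforge" package loader; not a real loader.
from typing import Iterator, Optional, Sequence, Tuple

import requests

from swh.loader.package.loader import BasePackageInfo, PackageLoader
from swh.model.model import ObjectType, Person, Release, Sha1Git


class MyForgeLoader(PackageLoader[BasePackageInfo]):
    visit_type = "myforge"  # hypothetical visit type

    def get_versions(self) -> Sequence[str]:
        # Step 1: list the versions available upstream for this origin
        # (the index.json layout is an assumption for this sketch).
        index = requests.get(f"{self.origin.url}/index.json", timeout=60).json()
        return list(index["versions"])

    def get_package_info(self, version: str) -> Iterator[Tuple[str, BasePackageInfo]]:
        # Step 2: describe the artifact(s) to download for one version;
        # the first element of the yielded pair becomes a snapshot branch name.
        p_info = BasePackageInfo(
            url=f"{self.origin.url}/archive/{version}.tar.gz",
            filename=f"{version}.tar.gz",
            version=version,
        )
        yield f"releases/{version}", p_info

    def build_release(
        self, p_info: BasePackageInfo, uncompressed_path: str, directory: Sha1Git
    ) -> Optional[Release]:
        # Step 3: wrap the unpacked directory tree in a synthetic Release.
        return Release(
            name=p_info.version.encode(),
            message=f"Synthetic release for version {p_info.version}\n".encode(),
            author=Person.from_fullname(b"MyForge loader <robot@example.com>"),
            date=None,  # this sketch has no reliable upstream timestamp
            target=directory,
            target_type=ObjectType.DIRECTORY,
            synthetic=True,
        )
```

An instance is then driven entirely by `load()`, e.g. `MyForgeLoader(storage, "https://myforge.example.com/p/foo").load()` (URL hypothetical), which handles the visit, snapshot, and status bookkeeping around these hooks.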
diff --git a/debian/changelog b/debian/changelog
index 0e88cab..7fefd5c 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,1783 +1,1800 @@
-swh-loader-core (3.5.0-1~swh1~bpo10+1) buster-swh; urgency=medium
+swh-loader-core (4.0.0-1~swh2) unstable-swh; urgency=medium
-  * Rebuild for buster-swh
+  * Update build dependencies and bump new release
- -- Software Heritage autobuilder (on jenkins-debian1)  Fri, 20 May 2022 12:27:20 +0000
+ -- Antoine R. Dumont (@ardumont)  Fri, 09 Sep 2022 11:47:53 +0200
+
+swh-loader-core (4.0.0-1~swh1) unstable-swh; urgency=medium
+  * New upstream release 4.0.0 - (tagged by Antoine R. Dumont (@ardumont) on 2022-09-09 09:03:40 +0200)
+  * Upstream changes:
+    - v4.0.0
+    - New package loader Golang
+    - New package loader pubdev
+    - New package loader Arch Linux
+    - New package loader Arch Linux User
+    - New package loader Crates
+    - docs: Mention caveats of using archive checksums as ExtID.
+    - package/utils: Add retry policy to download in case of throttling
+    - package/archive: Handle tarball artifact with null time
+    - Initialize 'status' before try block
+    - Always log an error when setting 'failed' status
+    - Add method process_data(), run between fetch_data() and store_data()
+ -- Software Heritage autobuilder (on jenkins-debian1)  Fri, 09 Sep 2022 07:11:14 +0000

swh-loader-core (3.5.0-1~swh1) unstable-swh; urgency=medium
  * New upstream release 3.5.0 - (tagged by Valentin Lorentz on 2022-05-20 14:19:02 +0200)
  * Upstream changes:
    - v3.5.0
    - BaseLoader.flush: Return the output of storage.flush
 -- Software Heritage autobuilder (on jenkins-debian1)  Fri, 20 May 2022 12:24:23 +0000

swh-loader-core (3.4.1-1~swh1) unstable-swh; urgency=medium
  * New upstream release 3.4.1 - (tagged by Antoine R. Dumont (@ardumont) on 2022-05-13 10:22:15 +0200)
  * Upstream changes:
    - v3.4.1
    - Initialize the success boolean early to avoid unbound exception
 -- Software Heritage autobuilder (on jenkins-debian1)  Fri, 13 May 2022 08:26:59 +0000

swh-loader-core (3.4.0-1~swh1) unstable-swh; urgency=medium
  * New upstream release 3.4.0 - (tagged by Valentin Lorentz on 2022-05-06 10:36:18 +0200)
  * Upstream changes:
    - v3.4.0
    - crates: Do not literalinclude JSON file in ExtrinsicPackageMetadata doc
    - Add Sentry Captures
    - maven: Use most recent release of a package as default version
    - loader.core: Add statsd timing and metadata metrics
 -- Software Heritage autobuilder (on jenkins-debian1)  Fri, 06 May 2022 08:42:14 +0000

swh-loader-core (3.3.0-1~swh1) unstable-swh; urgency=medium
  * New upstream release 3.3.0 - (tagged by Antoine R. Dumont (@ardumont) on 2022-04-29 14:46:13 +0200)
  * Upstream changes:
    - v3.3.0
    - Rust lang, Crates loader
    - package/maven: Fix jar archive download after changes in lister
 -- Software Heritage autobuilder (on jenkins-debian1)  Fri, 29 Apr 2022 12:51:12 +0000

swh-loader-core (3.2.0-1~swh1) unstable-swh; urgency=medium
  * New upstream release 3.2.0 - (tagged by Valentin Lorentz on 2022-04-27 16:26:36 +0200)
  * Upstream changes:
    - v3.2.0
    - Store the result of MetadataFetcher.get_parent_origins
    - cli: Pass metadata_fetcher_credentials from the config to the loader
 -- Software Heritage autobuilder (on jenkins-debian1)  Wed, 27 Apr 2022 14:31:04 +0000

swh-loader-core (3.1.0-1~swh1) unstable-swh; urgency=medium
  * New upstream release 3.1.0 - (tagged by Valentin Lorentz on 2022-04-26 11:36:30 +0200)
  * Upstream changes:
    - v3.1.0
    - package loaders: Simplify initialization
    - BaseLoader: Add hook to call metadata fetchers before loading an origin
    - pre-commit maintenance
    - debian: Fix loading when md5sum is missing in dsc file
 -- Software Heritage autobuilder (on jenkins-debian1)  Tue, 26 Apr 2022 09:41:57 +0000

swh-loader-core (3.0.0-1~swh1) unstable-swh; urgency=medium
  * New upstream release 3.0.0 - (tagged by Valentin Lorentz on 2022-04-21 10:27:07 +0200)
  * Upstream changes:
    - v3.0.0
    - Remove unused function BaseLoader.store_metadata.
    - Remove unused BaseLoader.origin_metadata attribute
    - Replace self.url with self.origin.url in package loaders
    - BaseLoader: Add 'origin_url' argument and remove 'prepare_origin_visit' method
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 21 Apr 2022 08:31:52 +0000
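Two of the entries above change the core loading API: 3.0.0 made the origin URL a constructor argument (dropping `prepare_origin_visit`), and 4.0.0 added a `process_data()` hook run between `fetch_data()` and `store_data()`. Below is a minimal sketch of what a `BaseLoader` subclass looks like under that lifecycle; the loader, its visit type, and the stored snapshot are hypothetical placeholders, and the hook semantics in comments paraphrase the changelog rather than the authoritative docstrings.

```python
# Sketch of the BaseLoader lifecycle after 3.0.0/4.0.0; hypothetical loader.
from swh.loader.core.loader import BaseLoader
from swh.model.model import Snapshot


class SingleShotLoader(BaseLoader):
    visit_type = "single-shot"  # hypothetical visit type

    def prepare(self) -> None:
        # set up whatever state the fetch loop needs
        self.data: bytes = b""

    def fetch_data(self) -> bool:
        # fetch everything in one pass; returning False signals that there
        # is no more data to fetch, ending the fetch/process/store loop
        self.data = b"payload fetched from self.origin.url"
        return False

    def process_data(self) -> bool:
        # new in 4.0.0: massage fetched data before it is stored
        self.data = self.data.upper()
        return True

    def store_data(self) -> None:
        # write the resulting model objects; an empty snapshot stands in
        # for real branches in this sketch, and feeds the visit status
        snapshot = Snapshot(branches={})
        self.storage.snapshot_add([snapshot])
        self.loaded_snapshot_id = snapshot.id
```

As with package loaders, `SingleShotLoader(storage, origin_url="https://example.com/repo").load()` (URL hypothetical) drives prepare/fetch/process/store and the origin-visit bookkeeping.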
swh-loader-core (2.6.2-1~swh1) unstable-swh; urgency=medium
  * New upstream release 2.6.2 - (tagged by Antoine R. Dumont (@ardumont) on 2022-04-14 11:46:06 +0200)
  * Upstream changes:
    - v2.6.2
    - maven: Consistently read lister input to ingest a mvn origin
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 14 Apr 2022 09:53:26 +0000

swh-loader-core (2.6.1-1~swh1) unstable-swh; urgency=medium
  * New upstream release 2.6.1 - (tagged by Antoine R. Dumont (@ardumont) on 2022-04-08 11:03:06 +0200)
  * Upstream changes:
    - v2.6.1
    - Rename metadata key in data received from the deposit server
    - npm: Add all fields we use to the ExtID manifest
    - npm: Include package version id in ExtID manifest
 -- Software Heritage autobuilder (on jenkins-debian1)  Fri, 08 Apr 2022 09:13:17 +0000

swh-loader-core (2.6.0-1~swh1) unstable-swh; urgency=medium
  * New upstream release 2.6.0 - (tagged by Valentin Lorentz on 2022-03-02 13:54:45 +0100)
  * Upstream changes:
    - v2.6.0
    - Update for the new output format of the Deposit's API.
 -- Software Heritage autobuilder (on jenkins-debian1)  Wed, 02 Mar 2022 12:58:43 +0000

swh-loader-core (2.5.4-1~swh2) unstable-swh; urgency=medium
  * Bump new release with opam tests deactivated
 -- Antoine R. Dumont (@ardumont)  Fri, 25 Feb 2022 12:40:40 +0100

swh-loader-core (2.5.4-1~swh1) unstable-swh; urgency=medium
  * New upstream release 2.5.4 - (tagged by Antoine R. Dumont (@ardumont) on 2022-02-25 10:23:51 +0100)
  * Upstream changes:
    - v2.5.4
    - loader/opam/tests: Do not run actual opam init command call
 -- Software Heritage autobuilder (on jenkins-debian1)  Fri, 25 Feb 2022 09:28:10 +0000

swh-loader-core (2.5.3-1~swh1) unstable-swh; urgency=medium
  * New upstream release 2.5.3 - (tagged by Antoine R. Dumont (@ardumont) on 2022-02-24 16:02:53 +0100)
  * Upstream changes:
    - v2.5.3
    - opam: Allow build to run the opam init completely
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 24 Feb 2022 15:07:20 +0000

swh-loader-core (2.5.2-1~swh1) unstable-swh; urgency=medium
  * New upstream release 2.5.2 - (tagged by Valentin Lorentz on 2022-02-24 09:52:26 +0100)
  * Upstream changes:
    - v2.5.2
    - deposit: Remove unused raw_info
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 24 Feb 2022 08:57:52 +0000

swh-loader-core (2.5.1-1~swh1) unstable-swh; urgency=medium
  * New upstream release 2.5.1 - (tagged by Antoine R. Dumont (@ardumont) on 2022-02-16 15:27:02 +0100)
  * Upstream changes:
    - v2.5.1
    - Add URL and directory to CLI loader status echo
    - Fix load_maven scheduling task name
    - docs: Fix typo detected with codespell
    - pre-commit: Bump hooks and add new one to check commit message spelling
 -- Software Heritage autobuilder (on jenkins-debian1)  Wed, 16 Feb 2022 14:30:47 +0000
swh-loader-core (2.5.0-1~swh1) unstable-swh; urgency=medium
  * New upstream release 2.5.0 - (tagged by Antoine R. Dumont (@ardumont) on 2022-02-08 10:46:14 +0100)
  * Upstream changes:
    - v2.5.0
    - Move visit date helper from hg loader to core
 -- Software Heritage autobuilder (on jenkins-debian1)  Tue, 08 Feb 2022 09:49:53 +0000

swh-loader-core (2.4.1-1~swh1) unstable-swh; urgency=medium
  * New upstream release 2.4.1 - (tagged by Nicolas Dandrimont on 2022-02-03 14:12:05 +0100)
  * Upstream changes:
    - Release swh.loader.core 2.4.1
    - fix Person mangling
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 03 Feb 2022 13:17:35 +0000

swh-loader-core (2.3.0-1~swh1) unstable-swh; urgency=medium
  * New upstream release 2.3.0 - (tagged by Nicolas Dandrimont on 2022-01-24 11:18:43 +0100)
  * Upstream changes:
    - Release swh.loader.core
    - Stop using the deprecated 'TimestampWithTimezone.offset' attribute
    - Include clone_with_timeout utility from swh.loader.mercurial
 -- Software Heritage autobuilder (on jenkins-debian1)  Mon, 24 Jan 2022 10:22:35 +0000

swh-loader-core (2.2.0-1~swh1) unstable-swh; urgency=medium
  * New upstream release 2.2.0 - (tagged by Antoine R. Dumont (@ardumont) on 2022-01-18 14:33:08 +0100)
  * Upstream changes:
    - v2.2.0
    - tests: Replace 'offset' and 'negative_utc' with 'offset_bytes'
    - deposit: Remove 'negative_utc' from test data
    - tests: Use TimestampWithTimezone.from_datetime() instead of the constructor
    - Add release notes (from user-provided Atom document) to release messages.
    - deposit: Strip 'offset_bytes' from date dicts to support swh-model 4.0.0
    - Pin mypy and drop type annotations which make mypy unhappy
 -- Software Heritage autobuilder (on jenkins-debian1)  Tue, 18 Jan 2022 15:52:53 +0000

swh-loader-core (2.1.1-1~swh1) unstable-swh; urgency=medium
  * New upstream release 2.1.1 - (tagged by Valentin Lorentz on 2021-12-09 17:14:12 +0100)
  * Upstream changes:
    - v2.1.1
    - nixguix: Fix crash when filtering extids on archives that were already loaded, but only from different URLs
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 09 Dec 2021 16:17:54 +0000

swh-loader-core (2.1.0-1~swh1) unstable-swh; urgency=medium
  * New upstream release 2.1.0 - (tagged by Valentin Lorentz on 2021-12-09 16:34:51 +0100)
  * Upstream changes:
    - v2.1.0
    - maven: various refactorings
    - nixguix: Filter out releases with URLs different from the expected one
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 09 Dec 2021 15:38:14 +0000

swh-loader-core (2.0.0-1~swh1) unstable-swh; urgency=medium
  * New upstream release 2.0.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-12-07 15:53:23 +0100)
  * Upstream changes:
    - v2.0.0
    - package-loaders: Add support for extid versions, and bump it for Debian
    - debian: Remove the extrinsic version from release names
    - debian: Fix confusion between the two versions
 -- Software Heritage autobuilder (on jenkins-debian1)  Tue, 07 Dec 2021 14:57:19 +0000

swh-loader-core (1.3.0-1~swh1) unstable-swh; urgency=medium
  * New upstream release 1.3.0 - (tagged by Antoine Lambert on 2021-12-07 10:54:49 +0100)
  * Upstream changes:
    - version 1.3.0
 -- Software Heritage autobuilder (on jenkins-debian1)  Tue, 07 Dec 2021 09:58:53 +0000

swh-loader-core (1.2.1-1~swh1) unstable-swh; urgency=medium
  * New upstream release 1.2.1 - (tagged by Antoine R. Dumont (@ardumont) on 2021-12-03 16:15:32 +0100)
  * Upstream changes:
    - v1.2.1
    - package.loader: Deduplicate extid target
 -- Software Heritage autobuilder (on jenkins-debian1)  Fri, 03 Dec 2021 15:19:13 +0000
swh-loader-core (1.2.0-1~swh1) unstable-swh; urgency=medium
  * New upstream release 1.2.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-12-03 12:16:04 +0100)
  * Upstream changes:
    - v1.2.0
    - debian: Rename loading task function to fix scheduling
    - debian: Handle extra sha1 sum in source package metadata
    - debian: Remove unused date parameter of DebianLoader
    - package.loader: Deduplicate target SWHIDs
    - package-loader-tutorial: Update to mention releases instead of revisions
    - package-loader-tutorial: Add a checklist
    - package-loader-tutorial: Highlight the recommendation to submit the loader early.
 -- Software Heritage autobuilder (on jenkins-debian1)  Fri, 03 Dec 2021 11:19:52 +0000

swh-loader-core (1.1.0-1~swh1) unstable-swh; urgency=medium
  * New upstream release 1.1.0 - (tagged by Valentin Lorentz on 2021-11-22 11:58:11 +0100)
  * Upstream changes:
    - v1.1.0
    - Package loader: Uniformize author and message
 -- Software Heritage autobuilder (on jenkins-debian1)  Mon, 22 Nov 2021 11:01:45 +0000

swh-loader-core (1.0.1-1~swh1) unstable-swh; urgency=medium
  * New upstream release 1.0.1 - (tagged by Valentin Lorentz on 2021-11-10 14:47:52 +0100)
  * Upstream changes:
    - v1.0.1
    - utils: Add types and let log instruction do the formatting
    - Fix tests when run by gbp on Sid.
 -- Software Heritage autobuilder (on jenkins-debian1)  Wed, 10 Nov 2021 13:53:43 +0000

swh-loader-core (1.0.0-1~swh1) unstable-swh; urgency=medium
  * New upstream release 1.0.0 - (tagged by Valentin Lorentz on 2021-11-10 14:25:24 +0100)
  * Upstream changes:
    - v1.0.0
    - Main change: this makes package loaders write releases instead of revisions
    - Other more-or-less related changes:
      * Add missing documentation for `get_metadata_authority`.
      * opam: Write package definitions to the extrinsic metadata storage
      * deposit: Remove 'parent' deposit
      * cleanup tests and unused code
      * Document how each package loader populates fields.
      * Refactor package loaders to make the version part of BasePackageInfo
 -- Software Heritage autobuilder (on jenkins-debian1)  Wed, 10 Nov 2021 13:38:43 +0000

swh-loader-core (0.25.0-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.25.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-09-29 09:19:10 +0200)
  * Upstream changes:
    - v0.25.0
    - Allow opam loader to actually use multi-instance opam root
    - opam: Define an initialize_opam_root parameter for opam loader
 -- Software Heritage autobuilder (on jenkins-debian1)  Wed, 29 Sep 2021 07:26:12 +0000

swh-loader-core (0.23.5-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.23.5 - (tagged by Antoine R. Dumont (@ardumont) on 2021-09-24 17:31:22 +0200)
  * Upstream changes:
    - v0.23.5
    - opam: Initialize opam root directory outside the constructor
 -- Software Heritage autobuilder (on jenkins-debian1)  Fri, 24 Sep 2021 15:34:52 +0000
swh-loader-core (0.23.4-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.23.4 - (tagged by Antoine R. Dumont (@ardumont) on 2021-09-20 11:53:11 +0200)
  * Upstream changes:
    - v0.23.4
    - Ensure that filename fallback out of a URL is properly sanitized
 -- Software Heritage autobuilder (on jenkins-debian1)  Mon, 20 Sep 2021 09:56:31 +0000

swh-loader-core (0.23.3-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.23.3 - (tagged by Antoine Lambert on 2021-09-16 10:47:40 +0200)
  * Upstream changes:
    - version 0.23.3
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 16 Sep 2021 08:51:47 +0000

swh-loader-core (0.23.2-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.23.2 - (tagged by Valentin Lorentz on 2021-08-12 12:22:44 +0200)
  * Upstream changes:
    - v0.23.2
    - deposit: Update status_detail on loader failure
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 12 Aug 2021 10:25:44 +0000

swh-loader-core (0.23.1-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.23.1 - (tagged by Antoine R. Dumont (@ardumont) on 2021-08-05 16:11:02 +0200)
  * Upstream changes:
    - v0.23.1
    - Fix pypi upload issue.
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 05 Aug 2021 14:20:37 +0000

swh-loader-core (0.22.3-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.22.3 - (tagged by Valentin Lorentz on 2021-06-25 14:50:40 +0200)
  * Upstream changes:
    - v0.22.3
    - Use the postgresql class to instantiate storage in tests
    - package-loader-tutorial: Add anchor so it can be referenced from swh-docs
 -- Software Heritage autobuilder (on jenkins-debian1)  Fri, 25 Jun 2021 12:57:33 +0000

swh-loader-core (0.22.2-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.22.2 - (tagged by Antoine Lambert on 2021-06-10 16:11:30 +0200)
  * Upstream changes:
    - version 0.22.2
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 10 Jun 2021 14:19:06 +0000

swh-loader-core (0.22.1-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.22.1 - (tagged by Antoine Lambert on 2021-05-27 14:02:35 +0200)
  * Upstream changes:
    - version 0.22.1
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 27 May 2021 12:20:04 +0000
swh-loader-core (0.22.0-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.22.0 - (tagged by Valentin Lorentz on 2021-04-15 15:13:56 +0200)
  * Upstream changes:
    - v0.22.0
    - Documentation:
      * Document the big picture view of VCS and package loaders
      * Add a package loader tutorial.
      * Write an overview of how to write VCS loaders.
      * Fix various Sphinx warnings
    - Package loaders:
      * Add sha512 as a valid field in dsc metadata
      * package loaders: Stop reading/writing Revision.metadata
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 15 Apr 2021 13:18:13 +0000

swh-loader-core (0.21.0-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.21.0 - (tagged by Valentin Lorentz on 2021-03-30 17:19:13 +0200)
  * Upstream changes:
    - v0.21.0
    - tests: recompute ids when evolving RawExtrinsicMetadata objects, to support swh-model 2.0.0
    - deposit.loader: Make archive.tar the default_filename
    - debian: Make resolve_revision_from use the sha256 of the .dsc
    - package.loader.*: unify package "cache"/deduplication using ExtIDs
    - package.loader: Lookup packages from the ExtID storage
    - package.loader: Write to the ExtID storage
 -- Software Heritage autobuilder (on jenkins-debian1)  Tue, 30 Mar 2021 15:26:35 +0000

swh-loader-core (0.20.0-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.20.0 - (tagged by Valentin Lorentz on 2021-03-02 10:52:18 +0100)
  * Upstream changes:
    - v0.20.0
    - RawExtrinsicMetadata: update to use the API in swh-model 1.0.0
 -- Software Heritage autobuilder (on jenkins-debian1)  Tue, 02 Mar 2021 09:57:21 +0000

swh-loader-core (0.19.0-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.19.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-02-25 15:52:12 +0100)
  * Upstream changes:
    - v0.19.0
    - deposit: Make deposit loader deal with tarball as well
    - deposit: Update deposit status when the load status is 'partial'
    - Make finalize_visit a method instead of nested function.
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 25 Feb 2021 14:55:54 +0000

swh-loader-core (0.18.1-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.18.1 - (tagged by Antoine R. Dumont (@ardumont) on 2021-02-19 18:02:58 +0100)
  * Upstream changes:
    - v0.18.1
    - nixguix: Fix missing max_content_size constructor parameter
 -- Software Heritage autobuilder (on jenkins-debian1)  Fri, 19 Feb 2021 17:06:33 +0000

swh-loader-core (0.18.0-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.18.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-02-17 13:13:24 +0100)
  * Upstream changes:
    - v0.18.0
    - core.loader: Merge Loader into BaseLoader
    - Unify loader instantiation
    - nixguix: Ensure interaction with the origin url for edge case tests
 -- Software Heritage autobuilder (on jenkins-debian1)  Wed, 17 Feb 2021 12:16:47 +0000

swh-loader-core (0.17.0-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.17.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-02-11 11:20:55 +0100)
  * Upstream changes:
    - v0.17.0
    - package: Mark visit as not_found when relevant
    - package: Mark visit status as failed when relevant
    - core: Allow vcs loaders to deal with not_found status
    - core: Mark visit status as failed when relevant
    - loader: Make loader write the origin_visit_status' type
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 11 Feb 2021 10:23:42 +0000

swh-loader-core (0.16.0-1~swh2) unstable-swh; urgency=medium
  * Bump dependencies
 -- Antoine R. Dumont (@ardumont)  Wed, 03 Feb 2021 14:25:26 +0100
swh-loader-core (0.16.0-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.16.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-02-03 14:14:01 +0100)
  * Upstream changes:
    - v0.16.0
    - Adapt origin_get_latest_visit_status according to latest api change
    - Add a cli section in the doc
    - tox.ini: Add swh.core[testing] requirement
    - Small docstring improvements in the deposit loader code
 -- Software Heritage autobuilder (on jenkins-debian1)  Wed, 03 Feb 2021 13:17:30 +0000

swh-loader-core (0.15.0-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.15.0 - (tagged by Nicolas Dandrimont on 2020-11-03 17:21:21 +0100)
  * Upstream changes:
    - Release swh-loader-core v0.15.0
    - Attach raw extrinsic metadata to directories, not revisions
    - Handle a bunch of deprecation warnings:
      * explicit args in swh.objstorage get_objstorage
      * id -> target for raw extrinsic metadata objects
      * positional arguments for storage.raw_extrinsic_metadata_get
 -- Software Heritage autobuilder (on jenkins-debian1)  Tue, 03 Nov 2020 16:26:20 +0000

swh-loader-core (0.14.0-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.14.0 - (tagged by Valentin Lorentz on 2020-10-16 18:23:28 +0200)
  * Upstream changes:
    - v0.14.0
    - npm: write metadata on revisions instead of snapshots.
    - pypi: write metadata on revisions instead of snapshots.
    - deposit.loader: Avoid unnecessary metadata json transformation
 -- Software Heritage autobuilder (on jenkins-debian1)  Fri, 16 Oct 2020 16:26:14 +0000

swh-loader-core (0.13.1-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.13.1 - (tagged by Antoine R. Dumont (@ardumont) on 2020-10-02 16:54:05 +0200)
  * Upstream changes:
    - v0.13.1
    - core.loader: Allow config parameter passing through constructor
    - tox.ini: pin black to the pre-commit version (19.10b0) to avoid flip-flops
 -- Software Heritage autobuilder (on jenkins-debian1)  Fri, 02 Oct 2020 14:55:59 +0000

swh-loader-core (0.13.0-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.13.0 - (tagged by Antoine R. Dumont (@ardumont) on 2020-10-02 13:18:55 +0200)
  * Upstream changes:
    - v0.13.0
    - package.loader: Migrate away from SWHConfig mixin
    - core.loader: Migrate away from SWHConfig mixin
    - Expose deposit configuration only within the deposit tests
 -- Software Heritage autobuilder (on jenkins-debian1)  Fri, 02 Oct 2020 11:21:55 +0000

swh-loader-core (0.12.0-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.12.0 - (tagged by Antoine R. Dumont (@ardumont) on 2020-10-01 16:03:45 +0200)
  * Upstream changes:
    - v0.12.0
    - deposit: Adapt loader to send extrinsic raw metadata to the metadata storage
    - core.loader: Log information about origin currently being ingested
    - Adapt cli declaration entrypoint to swh.core 0.3
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 01 Oct 2020 14:04:59 +0000

swh-loader-core (0.11.0-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.11.0 - (tagged by Antoine R. Dumont (@ardumont) on 2020-09-18 10:19:56 +0200)
  * Upstream changes:
    - v0.11.0
    - loader: Stop materializing full lists of objects to be stored
    - tests.get_stats: Don't return a 'person' count
    - python: Reorder imports with isort
    - pre-commit: Add isort hook and configuration
    - pre-commit: Update flake8 hook configuration
    - cli: speedup the `swh` cli command startup time
 -- Software Heritage autobuilder (on jenkins-debian1)  Fri, 18 Sep 2020 09:12:18 +0000
swh-loader-core (0.10.0-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.10.0 - (tagged by Antoine R. Dumont (@ardumont) on 2020-09-04 13:19:29 +0200)
  * Upstream changes:
    - v0.10.0
    - loader: Adapt to latest storage revision_get change
    - Rename metadata format 'original-artifact-json' to 'original-artifacts-json'.
    - Tell pytest not to recurse in dotdirs.
    - package loader: Add the 'url' to the 'original_artifact' extrinsic metadata.
    - Write 'original_artifact' metadata to the extrinsic metadata storage.
    - Move parts of _load_revision to a new _load_directory method.
    - tests: Don't use naive datetimes.
    - package.loader: Split the warning message into multiple chunks
    - Replace calls to snapshot_get with snapshot_get_all_branches.
 -- Software Heritage autobuilder (on jenkins-debian1)  Fri, 04 Sep 2020 11:28:09 +0000

swh-loader-core (0.9.1-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.9.1 - (tagged by Antoine R. Dumont (@ardumont) on 2020-08-08 14:47:52 +0200)
  * Upstream changes:
    - v0.9.1
    - nixguix: Make the unsupported artifact extensions configurable
    - package.loader: Log a failure summary report at the end of the task
 -- Software Heritage autobuilder (on jenkins-debian1)  Sat, 08 Aug 2020 12:51:33 +0000

swh-loader-core (0.9.0-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.9.0 - (tagged by Antoine R. Dumont (@ardumont) on 2020-08-07 22:57:14 +0200)
  * Upstream changes:
    - v0.9.0
    - nixguix: Filter out unsupported artifact extensions
    - swh.loader.tests: Use snapshot_get_all_branches in check_snapshot
    - test_npm: Adapt content_get_metadata call to content_get
    - npm: Fix assertion to use the correct storage api
 -- Software Heritage autobuilder (on jenkins-debian1)  Fri, 07 Aug 2020 21:00:40 +0000

swh-loader-core (0.8.1-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.8.1 - (tagged by Antoine R. Dumont (@ardumont) on 2020-08-06 16:48:38 +0200)
  * Upstream changes:
    - v0.8.1
    - Adapt code according to storage signature
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 06 Aug 2020 14:50:39 +0000
swh-loader-core (0.8.0-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.8.0 - (tagged by Antoine R. Dumont (@ardumont) on 2020-08-05 10:16:36 +0200)
  * Upstream changes:
    - v0.8.0
    - archive: fix docstring
    - nixguix: Fix docstring
    - nixguix: Align error message formatting using f-string
    - nixguix: Fix format issue in error message
    - Convert the 'metadata' and 'info' cached-properties/lazy-attributes into methods
    - cran: fix call to logger.warning
    - pypi: Load the content of the API's response as extrinsic snapshot metadata
    - Add a default value for RawExtrinsicMetadataCore.discovery_date
    - npm: Load the content of the API's response as extrinsic snapshot metadata
    - Make retrieve_sources use generic api_info instead of duplicating its code
    - nixguix: Load the content of sources.json as extrinsic snapshot metadata
    - Update tests to accept PagedResult from storage.raw_extrinsic_metadata_get
 -- Software Heritage autobuilder (on jenkins-debian1)  Wed, 05 Aug 2020 08:19:20 +0000

swh-loader-core (0.7.3-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.7.3 - (tagged by Valentin Lorentz on 2020-07-30 19:16:21 +0200)
  * Upstream changes:
    - v0.7.3
    - core.loader: Fix Iterable/List typing issues
    - package.loader: Fix type warning
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 30 Jul 2020 17:23:57 +0000

swh-loader-core (0.7.2-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.7.2 - (tagged by Valentin Lorentz on 2020-07-29 11:41:39 +0200)
  * Upstream changes:
    - v0.7.2
    - Fix typo in message logged on extrinsic metadata loading errors.
    - Don't pass non-sequence iterables to the storage API.
 -- Software Heritage autobuilder (on jenkins-debian1)  Wed, 29 Jul 2020 09:45:52 +0000

swh-loader-core (0.7.1-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.7.1 - (tagged by Antoine R. Dumont (@ardumont) on 2020-07-28 12:14:02 +0200)
  * Upstream changes:
    - v0.7.1
    - Apply rename of object_metadata to raw_extrinsic_metadata.
 -- Software Heritage autobuilder (on jenkins-debian1)  Tue, 28 Jul 2020 10:16:56 +0000

swh-loader-core (0.6.1-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.6.1 - (tagged by Antoine R. Dumont (@ardumont) on 2020-07-23 11:12:29 +0200)
  * Upstream changes:
    - v0.6.1
    - npm.loader: Fix null author parsing corner case
    - npm.loader: Fix author parsing corner case
    - npm.loader: Extract _author_str function + add types, tests
    - core.loader: docs: Update origin_add reference
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 23 Jul 2020 09:15:41 +0000

swh-loader-core (0.6.0-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.6.0 - (tagged by Valentin Lorentz on 2020-07-20 13:23:22 +0200)
  * Upstream changes:
    - v0.6.0
    - Use the new object_metadata_add endpoint instead of origin_metadata_add.
    - Apply renaming of MetadataAuthorityType.DEPOSIT to MetadataAuthorityType.DEPOSIT_CLIENT.
 -- Software Heritage autobuilder (on jenkins-debian1)  Mon, 20 Jul 2020 11:27:53 +0000

swh-loader-core (0.5.10-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.5.10 - (tagged by Antoine R. Dumont (@ardumont) on 2020-07-17 15:10:42 +0200)
  * Upstream changes:
    - v0.5.10
    - test_init: Decrease assertion checks so debian package builds fine
    - test_nixguix: Simplify the nixguix specific check_snapshot function
 -- Software Heritage autobuilder (on jenkins-debian1)  Fri, 17 Jul 2020 13:13:19 +0000
swh-loader-core (0.5.9-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.5.9 - (tagged by Antoine R. Dumont (@ardumont) on 2020-07-17 11:52:38 +0200)
  * Upstream changes:
    - v0.5.9
    - test.check_snapshot: Drop accepting using dict for snapshot comparison
    - test: Check against snapshot model object
 -- Software Heritage autobuilder (on jenkins-debian1)  Fri, 17 Jul 2020 09:55:12 +0000

swh-loader-core (0.5.8-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.5.8 - (tagged by Antoine R. Dumont (@ardumont) on 2020-07-16 17:18:17 +0200)
  * Upstream changes:
    - v0.5.8
    - test_init: Use snapshot object
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 16 Jul 2020 15:20:49 +0000

swh-loader-core (0.5.7-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.5.7 - (tagged by Antoine R. Dumont (@ardumont) on 2020-07-16 16:10:57 +0200)
  * Upstream changes:
    - v0.5.7
    - test_init: Fix tests using the latest swh-storage fixture
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 16 Jul 2020 14:14:59 +0000

swh-loader-core (0.5.5-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.5.5 - (tagged by Antoine R. Dumont (@ardumont) on 2020-07-15 12:34:09 +0200)
  * Upstream changes:
    - v0.5.5
    - check_snapshot: Check existence down to contents
    - Expose a pytest_plugin module so other loaders can reuse for tests
    - pytest: Remove no longer needed pytest setup
    - Fix branches types in tests
    - Small code improvement in package/loader.py
 -- Software Heritage autobuilder (on jenkins-debian1)  Wed, 15 Jul 2020 10:37:11 +0000

swh-loader-core (0.5.4-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.5.4 - (tagged by Antoine R. Dumont (@ardumont) on 2020-07-10 09:52:21 +0200)
  * Upstream changes:
    - v0.5.4
    - Clean up the swh.scheduler / swh.storage pytest plugin imports
 -- Software Heritage autobuilder (on jenkins-debian1)  Fri, 10 Jul 2020 07:54:56 +0000

swh-loader-core (0.5.3-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.5.3 - (tagged by Antoine R. Dumont (@ardumont) on 2020-07-09 09:46:21 +0200)
  * Upstream changes:
    - v0.5.3
    - Update the revision metadata field as an immutable dict
    - tests: Use dedicated storage and scheduler fixtures
    - loaders.tests: Simplify and add coverage to check_snapshot
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 09 Jul 2020 07:48:33 +0000

swh-loader-core (0.5.2-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.5.2 - (tagged by Antoine R. Dumont (@ardumont) on 2020-07-07 12:29:17 +0200)
  * Upstream changes:
    - v0.5.2
    - nixguix/loader: Check further the source entry only if it's valid
    - nixguix/loader: Allow version both as string or integer
    - Move remaining common test utility functions to top-level arborescence
    - Move common test utility function to the top-level arborescence
    - Define common test helper function
    - Reuse swh.model.from_disk.iter_directory function
 -- Software Heritage autobuilder (on jenkins-debian1)  Tue, 07 Jul 2020 10:31:36 +0000

swh-loader-core (0.5.1-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.5.1 - (tagged by Antoine R. Dumont (@ardumont) on 2020-07-01 12:32:54 +0200)
  * Upstream changes:
    - v0.5.1
    - Use origin_add instead of deprecated origin_add_one endpoint
    - Migrate to use object's "object_type" field when computing objects
 -- Software Heritage autobuilder (on jenkins-debian1)  Wed, 01 Jul 2020 10:34:59 +0000
swh-loader-core (0.5.0-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.5.0 - (tagged by Antoine R. Dumont (@ardumont) on 2020-06-29 13:18:41 +0200)
  * Upstream changes:
    - v0.5.0
    - loader*: Drop obsolete origin visit fields
 -- Software Heritage autobuilder (on jenkins-debian1)  Mon, 29 Jun 2020 11:20:59 +0000

swh-loader-core (0.4.0-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.4.0 - (tagged by Antoine R. Dumont (@ardumont) on 2020-06-23 15:02:20 +0200)
  * Upstream changes:
    - v0.4.0
    - loader: Retrieve latest snapshot with snapshot-get-latest function
 -- Software Heritage autobuilder (on jenkins-debian1)  Tue, 23 Jun 2020 13:14:09 +0000

swh-loader-core (0.3.2-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.3.2 - (tagged by Antoine R. Dumont (@ardumont) on 2020-06-22 15:13:05 +0200)
  * Upstream changes:
    - v0.3.2
    - Add helper function to ensure loader visits are as expected
 -- Software Heritage autobuilder (on jenkins-debian1)  Mon, 22 Jun 2020 13:15:41 +0000

swh-loader-core (0.3.1-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.3.1 - (tagged by Antoine Lambert on 2020-06-12 16:43:18 +0200)
  * Upstream changes:
    - version 0.3.1
 -- Software Heritage autobuilder (on jenkins-debian1)  Fri, 12 Jun 2020 14:47:42 +0000

swh-loader-core (0.3.0-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.3.0 - (tagged by Antoine R. Dumont (@ardumont) on 2020-06-12 11:05:41 +0200)
  * Upstream changes:
    - v0.3.0
    - Migrate to new storage.origin_visit_add endpoint
    - loader: Migrate to origin visit status
    - test_deposits: Fix origin_metadata_get which is a paginated endpoint
    - Fix a potential UnboundLocalError in clean_dangling_folders()
 -- Software Heritage autobuilder (on jenkins-debian1)  Fri, 12 Jun 2020 09:08:17 +0000

swh-loader-core (0.2.0-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.2.0 - (tagged by David Douard on 2020-06-04 14:20:08 +0200)
  * Upstream changes:
    - v0.2.0
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 04 Jun 2020 12:25:57 +0000

swh-loader-core (0.1.0-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.1.0 - (tagged by Nicolas Dandrimont on 2020-05-29 16:01:11 +0200)
  * Upstream changes:
    - Release swh.loader.core v0.1.0
    - Make sure partial visits don't reference unloaded snapshots
    - Ensure proper behavior when loading into partial archives (e.g. staging)
    - Improve test coverage
 -- Software Heritage autobuilder (on jenkins-debian1)  Fri, 29 May 2020 14:05:36 +0000

swh-loader-core (0.0.97-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.97 - (tagged by Antoine R. Dumont (@ardumont) on 2020-05-26 14:22:51 +0200)
  * Upstream changes:
    - v0.0.97
    - nixguix: catch and log artifact resolution failures
    - nixguix: Override known_artifacts to filter out "evaluation" branch
    - nixguix.tests: Add missing __init__ file
 -- Software Heritage autobuilder (on jenkins-debian1)  Tue, 26 May 2020 12:25:35 +0000

swh-loader-core (0.0.96-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.96 - (tagged by Valentin Lorentz on 2020-05-19 18:42:23 +0200)
  * Upstream changes:
    - v0.0.96
    - Pass bytes instead of a dict to origin_metadata_add.
 -- Software Heritage autobuilder (on jenkins-debian1)  Tue, 19 May 2020 16:45:03 +0000
swh-loader-core (0.0.95-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.95 - (tagged by Valentin Lorentz on 2020-05-19 14:44:01 +0200)
  * Upstream changes:
    - v0.0.95
    - Use the new swh-storage API for storing metadata.
 -- Software Heritage autobuilder (on jenkins-debian1)  Tue, 19 May 2020 12:47:48 +0000

swh-loader-core (0.0.94-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.94 - (tagged by Antoine R. Dumont (@ardumont) on 2020-05-15 12:49:22 +0200)
  * Upstream changes:
    - v0.0.94
    - deposit: Adapt loader to use the latest deposit update api
    - tests: Use proper date initialization
    - setup.py: add documentation link
 -- Software Heritage autobuilder (on jenkins-debian1)  Fri, 15 May 2020 10:52:16 +0000

swh-loader-core (0.0.93-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.93 - (tagged by Antoine R. Dumont (@ardumont) on 2020-04-23 16:43:16 +0200)
  * Upstream changes:
    - v0.0.93
    - deposit.loader: Build revision out of the deposit api read metadata
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 23 Apr 2020 14:46:48 +0000

swh-loader-core (0.0.92-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.92 - (tagged by Antoine R. Dumont (@ardumont) on 2020-04-23 11:49:30 +0200)
  * Upstream changes:
    - v0.0.92
    - deposit.loader: Fix revision metadata redundancy in deposit metadata
    - loader.deposit: Clarify FIXME intent
    - test_nixguix: Remove the incorrect fixme
    - test_nixguix: Add a fixme note on test_loader_two_visits
    - package.nixguix: Ensure the revisions are structurally sound
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 23 Apr 2020 09:52:18 +0000

swh-loader-core (0.0.91-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.91 - (tagged by Antoine R. Dumont (@ardumont) on 2020-04-21 15:59:55 +0200)
  * Upstream changes:
    - v0.0.91
    - deposit.loader: Fix committer date appropriately
    - tests_deposit: Define specific requests_mock_datadir fixture
    - nixguix: Move helper function below the class definition
    - setup: Update the minimum required runtime python3 version
 -- Software Heritage autobuilder (on jenkins-debian1)  Tue, 21 Apr 2020 14:02:51 +0000

swh-loader-core (0.0.90-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.90 - (tagged by Antoine R. Dumont (@ardumont) on 2020-04-15 14:27:01 +0200)
  * Upstream changes:
    - v0.0.90
    - Improve exception handling
 -- Software Heritage autobuilder (on jenkins-debian1)  Wed, 15 Apr 2020 12:30:07 +0000

swh-loader-core (0.0.89-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.89 - (tagged by Antoine R. Dumont (@ardumont) on 2020-04-14 15:48:15 +0200)
  * Upstream changes:
    - v0.0.89
    - package.utils: Define a timeout on download connections
    - package.loader: Clear proxy buffer state when failing to load revision
    - Fix a couple of storage args deprecation warnings
    - cli: Sort loaders list and fix some tests
    - Add a pyproject.toml file to target py37 for black
    - Enable black
 -- Software Heritage autobuilder (on jenkins-debian1)  Tue, 14 Apr 2020 15:30:08 +0000

swh-loader-core (0.0.88-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.88 - (tagged by Antoine R. Dumont (@ardumont) on 2020-04-03 15:52:07 +0200)
  * Upstream changes:
    - v0.0.88
    - nixguix: validate and clean sources.json structure
 -- Software Heritage autobuilder (on jenkins-debian1)  Fri, 03 Apr 2020 13:54:24 +0000
swh-loader-core (0.0.87-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.87 - (tagged by Antoine R. Dumont (@ardumont) on 2020-04-02 14:37:37 +0200)
  * Upstream changes:
    - v0.0.87
    - nixguix: rename the `url` source attribute to `urls`
    - nixguix: rename the test file
    - nixguix: add the integrity attribute in release metadata
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 02 Apr 2020 12:39:58 +0000

swh-loader-core (0.0.86-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.86 - (tagged by Antoine R. Dumont (@ardumont) on 2020-03-26 16:15:24 +0100)
  * Upstream changes:
    - v0.0.86
    - core.loader: Remove origin_visit_update call from DVCSLoader class
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 26 Mar 2020 15:19:29 +0000

swh-loader-core (0.0.85-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.85 - (tagged by Antoine R. Dumont (@ardumont) on 2020-03-26 15:36:58 +0100)
  * Upstream changes:
    - v0.0.85
    - core.loader: Allow core loader to update origin_visit in one call
    - Rename the functional loader to nixguix loader
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 26 Mar 2020 14:43:17 +0000

swh-loader-core (0.0.84-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.84 - (tagged by Antoine R. Dumont (@ardumont) on 2020-03-24 11:29:49 +0100)
  * Upstream changes:
    - v0.0.84
    - test: Use storage endpoint to check latest origin visit status
    - package.loader: Fix status visit to 'partial'
    - package.loader: add a test to reproduce EOFError error
 -- Software Heritage autobuilder (on jenkins-debian1)  Tue, 24 Mar 2020 10:32:55 +0000

swh-loader-core (0.0.83-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.83 - (tagged by Antoine R. Dumont (@ardumont) on 2020-03-23 15:16:14 +0100)
  * Upstream changes:
    - v0.0.83
    - Make the swh.loader.package exception handling more granular
    - package.loader: Reference a snapshot on partial visit
    - package.loader: Extract a _load_snapshot method
    - functional: create a branch named evaluation pointing to the evaluation commit
    - package.loader: add extra_branches method
 -- Software Heritage autobuilder (on jenkins-debian1)  Mon, 23 Mar 2020 14:19:43 +0000

swh-loader-core (0.0.82-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.82 - (tagged by Antoine R. Dumont (@ardumont) on 2020-03-18 11:55:48 +0100)
  * Upstream changes:
    - v0.0.82
    - functional.loader: Add loader
    - package.loader: ignore non tarball source
 -- Software Heritage autobuilder (on jenkins-debian1)  Wed, 18 Mar 2020 10:59:38 +0000

swh-loader-core (0.0.81-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.81 - (tagged by Antoine R. Dumont (@ardumont) on 2020-03-16 13:14:33 +0100)
  * Upstream changes:
    - v0.0.81
    - Migrate to latest storage.origin_visit_add api change
    - Move Person parsing to swh-model.
 -- Software Heritage autobuilder (on jenkins-debian1)  Mon, 16 Mar 2020 12:17:43 +0000

swh-loader-core (0.0.80-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.80 - (tagged by Valentin Lorentz on 2020-02-28 17:05:14 +0100)
  * Upstream changes:
    - v0.0.80
    - use swh-model objects instead of dicts.
 -- Software Heritage autobuilder (on jenkins-debian1)  Fri, 28 Feb 2020 16:10:06 +0000
swh-loader-core (0.0.79-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.79 - (tagged by Antoine R. Dumont (@ardumont) on 2020-02-25 11:40:05 +0100)
  * Upstream changes:
    - v0.0.79
    - Move revision loading logic to its own function.
    - Use swh-storage validation proxy earlier in the pipeline.
    - Use swh-storage validation proxy.
    - Add missing __init__.py and fix tests.
 -- Software Heritage autobuilder (on jenkins-debian1)  Tue, 25 Feb 2020 10:48:07 +0000

swh-loader-core (0.0.78-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.78 - (tagged by Antoine R. Dumont (@ardumont) on 2020-02-06 15:28:11 +0100)
  * Upstream changes:
    - v0.0.78
    - tests: Use new get_storage signature
    - loader.core.converters: Prefer the with open pattern to read file
    - test_converters: Add coverage on prepare_contents method
    - test_converters: Migrate to pytest
    - loader.core/package: Call storage's (skipped_)content_add endpoints
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 06 Feb 2020 15:09:05 +0000

swh-loader-core (0.0.77-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.77 - (tagged by Antoine R. Dumont (@ardumont) on 2020-01-30 10:32:08 +0100)
  * Upstream changes:
    - v0.0.77
    - loader.npm: If no upload time provided, use artifact's mtime if provided
    - loader.npm: Fail ingestion if at least 1 artifact has no upload time
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 30 Jan 2020 09:37:58 +0000

swh-loader-core (0.0.76-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.76 - (tagged by Antoine R. Dumont (@ardumont) on 2020-01-28 13:07:30 +0100)
  * Upstream changes:
    - v0.0.76
    - npm.loader: Skip artifacts with no intrinsic metadata
    - pypi.loader: Skip artifacts with no intrinsic metadata
    - package.loader: Fix edge case when some listing returns no content
    - core.loader: Drop retro-compatibility class names
    - loader.tests: Add filter and buffer proxy storage
    - docs: Fix sphinx warnings
    - README: Update class names
 -- Software Heritage autobuilder (on jenkins-debian1)  Tue, 28 Jan 2020 12:11:07 +0000

swh-loader-core (0.0.75-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.75 - (tagged by Antoine R. Dumont (@ardumont) on 2020-01-16 14:14:29 +0100)
  * Upstream changes:
    - v0.0.75
    - cran.loader: Align cran loader with other package loaders
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 16 Jan 2020 13:17:30 +0000

swh-loader-core (0.0.74-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.74 - (tagged by Antoine R. Dumont (@ardumont) on 2020-01-15 15:30:13 +0100)
  * Upstream changes:
    - v0.0.74
    - Drop no longer used retrying dependency
    - core.loader: Clean up indirection and retry behavior
    - tests: Use retry proxy storage in loaders
    - core.loader: Drop dead code
    - cran.loader: Fix parsing description file error
 -- Software Heritage autobuilder (on jenkins-debian1)  Wed, 15 Jan 2020 14:33:57 +0000

swh-loader-core (0.0.73-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.73 - (tagged by Antoine R. Dumont (@ardumont) on 2020-01-09 10:00:21 +0100)
  * Upstream changes:
    - v0.0.73
    - package.cran: Name CRAN task appropriately
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 09 Jan 2020 09:05:07 +0000

swh-loader-core (0.0.72-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.72 - (tagged by Antoine R. Dumont (@ardumont) on 2020-01-06 16:37:58 +0100)
  * Upstream changes:
    - v0.0.72
    - package.loader: Fail fast when unable to create origin/origin_visit
    - cran.loader: Add implementation
 -- Software Heritage autobuilder (on jenkins-debian1)  Mon, 06 Jan 2020 15:50:08 +0000
swh-loader-core (0.0.71-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.71 - (tagged by Antoine R. Dumont (@ardumont) on 2019-12-20 14:22:31 +0100)
  * Upstream changes:
    - v0.0.71
    - package.utils: Drop unneeded hashes from download computation
 -- Software Heritage autobuilder (on jenkins-debian1)  Fri, 20 Dec 2019 13:26:09 +0000

swh-loader-core (0.0.70-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.70 - (tagged by Antoine R. Dumont (@ardumont) on 2019-12-20 11:32:09 +0100)
  * Upstream changes:
    - v0.0.70
    - debian.loader: Improve and fix revision resolution's corner cases
 -- Software Heritage autobuilder (on jenkins-debian1)  Fri, 20 Dec 2019 10:39:34 +0000

swh-loader-core (0.0.69-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.69 - (tagged by Antoine R. Dumont (@ardumont) on 2019-12-12 16:21:59 +0100)
  * Upstream changes:
    - v0.0.69
    - loader.core: Fix correctly loader initialization
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 12 Dec 2019 15:26:13 +0000

swh-loader-core (0.0.68-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.68 - (tagged by Antoine R. Dumont (@ardumont) on 2019-12-12 15:45:21 +0100)
  * Upstream changes:
    - v0.0.68
    - loader.core: Fix initialization issue in dvcs loaders
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 12 Dec 2019 14:49:12 +0000

swh-loader-core (0.0.67-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.67 - (tagged by Antoine R. Dumont (@ardumont) on 2019-12-12 14:02:47 +0100)
  * Upstream changes:
    - v0.0.67
    - loader.core: Type methods
    - loader.core: Transform data input into list
    - loader.core: Add missing conversion step on content
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 12 Dec 2019 13:07:47 +0000

swh-loader-core (0.0.66-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.66 - (tagged by Antoine R. Dumont (@ardumont) on 2019-12-12 12:01:14 +0100)
  * Upstream changes:
    - v0.0.66
    - Drop deprecated behavior
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 12 Dec 2019 11:05:17 +0000

swh-loader-core (0.0.65-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.65 - (tagged by Antoine R. Dumont (@ardumont) on 2019-12-12 11:42:46 +0100)
  * Upstream changes:
    - v0.0.65
    - loader.cli: Improve current implementation
    - tasks: Enforce kwargs use in task message
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 12 Dec 2019 10:51:02 +0000

swh-loader-core (0.0.64-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.64 - (tagged by Antoine R. Dumont (@ardumont) on 2019-12-10 09:49:06 +0100)
  * Upstream changes:
    - v0.0.64
    - requirements-test: Add missing test dependency
    - tests: Refactor using pytest-mock's mocker fixture
    - loader.cli: Add tests around cli
    - package.npm: Align loader instantiation
    - loader.cli: Reference new loader cli
 -- Software Heritage autobuilder (on jenkins-debian1)  Tue, 10 Dec 2019 08:56:02 +0000

swh-loader-core (0.0.63-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.63 - (tagged by Antoine R. Dumont (@ardumont) on 2019-12-05 16:01:49 +0100)
  * Upstream changes:
    - v0.0.63
    - Add missing inclusion instruction
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 05 Dec 2019 15:05:39 +0000
swh-loader-core (0.0.62-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.62 - (tagged by Antoine R. Dumont (@ardumont) on 2019-12-05 15:46:46 +0100)
  * Upstream changes:
    - v0.0.62
    - Move package loaders to their own namespace
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 05 Dec 2019 14:50:19 +0000

swh-loader-core (0.0.61-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.61 - (tagged by Antoine R. Dumont (@ardumont) on 2019-11-28 17:25:49 +0100)
  * Upstream changes:
    - v0.0.61
    - pypi: metadata -> revision: Deal with previous metadata format
    - npm: metadata -> revision: Deal with previous metadata format
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 28 Nov 2019 16:29:47 +0000

swh-loader-core (0.0.60-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.60 - (tagged by Antoine R. Dumont (@ardumont) on 2019-11-26 12:09:28 +0100)
  * Upstream changes:
    - v0.0.60
    - package.deposit: Fix revision-get inconsistency
    - package.deposit: Provide parents in any case
    - package.deposit: Fix url computation issue
    - utils: Work around header issue during download
 -- Software Heritage autobuilder (on jenkins-debian1)  Tue, 26 Nov 2019 11:18:41 +0000

swh-loader-core (0.0.59-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.59 - (tagged by Antoine R. Dumont (@ardumont) on 2019-11-22 18:11:33 +0100)
  * Upstream changes:
    - v0.0.59
    - npm: Explicitly retrieve the revision date from extrinsic metadata
 -- Software Heritage autobuilder (on jenkins-debian1)  Fri, 22 Nov 2019 17:15:34 +0000

swh-loader-core (0.0.58-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.58 - (tagged by Antoine R. Dumont (@ardumont) on 2019-11-22 12:08:10 +0100)
  * Upstream changes:
    - v0.0.58
    - package.pypi: Filter out non-sdist package type
 -- Software Heritage autobuilder (on jenkins-debian1)  Fri, 22 Nov 2019 11:11:56 +0000

swh-loader-core (0.0.57-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.57 - (tagged by Antoine R. Dumont (@ardumont) on 2019-11-22 11:26:11 +0100)
  * Upstream changes:
    - v0.0.57
    - package.pypi: Fix project url computation edge case
    - Use pkg_resources to get the package version instead of vcversioner
 -- Software Heritage autobuilder (on jenkins-debian1)  Fri, 22 Nov 2019 10:31:11 +0000

swh-loader-core (0.0.56-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.56 - (tagged by Antoine R. Dumont (@ardumont) on 2019-11-21 16:12:46 +0100)
  * Upstream changes:
    - v0.0.56
    - package.tasks: Rename appropriately load_deb_package task type name
    - Fix typos reported by codespell
    - Add a pre-commit config file
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 21 Nov 2019 15:16:23 +0000

swh-loader-core (0.0.55-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.55 - (tagged by Antoine R. Dumont (@ardumont) on 2019-11-21 13:51:03 +0100)
  * Upstream changes:
    - v0.0.55
    - package.tasks: Rename load_archive into load_archive_files
    - Migrate tox.ini to extras = xxx instead of deps = .[testing]
    - Merge tox test environments
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 21 Nov 2019 12:56:07 +0000

swh-loader-core (0.0.54-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.54 - (tagged by Antoine R. Dumont (@ardumont) on 2019-11-21 11:29:20 +0100)
  * Upstream changes:
    - v0.0.54
    - loader.package.deposit: Drop swh.deposit.client requirement
    - Include all requirements in MANIFEST.in
 -- Software Heritage autobuilder (on jenkins-debian1)  Thu, 21 Nov 2019 10:32:23 +0000
swh-loader-core (0.0.53-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.53 - (tagged by Antoine R. Dumont (@ardumont) on 2019-11-20 14:26:36 +0100)
  * Upstream changes:
    - v0.0.53
    - loader.package.tasks: Document tasks
    - Define correctly the setup.py's entry_points
 -- Software Heritage autobuilder (on jenkins-debian1)  Wed, 20 Nov 2019 13:30:10 +0000

swh-loader-core (0.0.52-1~swh3) unstable-swh; urgency=medium
  * Update dh-python version constraint
 -- Antoine R. Dumont (@ardumont)  Wed, 20 Nov 2019 12:03:00 +0100

swh-loader-core (0.0.52-1~swh2) unstable-swh; urgency=medium
  * Add egg-info to pybuild.testfiles.
 -- Antoine R. Dumont (@ardumont)  Wed, 20 Nov 2019 11:42:42 +0100

swh-loader-core (0.0.52-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.52 - (tagged by Antoine R. Dumont (@ardumont) on 2019-11-19 15:15:40 +0100)
  * Upstream changes:
    - v0.0.52
    - Ensure BufferedLoader and UnbufferedLoader do flush their storage
    - loader.package: Register loader package tasks
    - package.tasks: Rename debian task to load_deb
 -- Software Heritage autobuilder (on jenkins-debian1)  Tue, 19 Nov 2019 14:18:41 +0000

swh-loader-core (0.0.51-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.51 - (tagged by David Douard on 2019-11-18 17:05:17 +0100)
  * Upstream changes:
    - v0.0.51
 -- Software Heritage autobuilder (on jenkins-debian1)  Mon, 18 Nov 2019 16:09:44 +0000

swh-loader-core (0.0.50-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.50 - (tagged by Antoine R. Dumont (@ardumont) on 2019-11-13 15:56:55 +0100)
  * Upstream changes:
    - v0.0.50
    - package.loader: Check snapshot_id is set as returned value
    - package.loader: Ensure the origin visit type is set appropriately
    - package.loader: Fix serialization issue
    - package.debian: Align origin_visit type to 'deb' as in production
 -- Software Heritage autobuilder (on jenkins-debian1)  Wed, 13 Nov 2019 15:04:37 +0000

swh-loader-core (0.0.49-1~swh2) unstable-swh; urgency=medium
  * Update dependencies
 -- Antoine R. Dumont  Fri, 08 Nov 2019 14:07:20 +0100

swh-loader-core (0.0.49-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.49 - (tagged by Antoine R. Dumont (@ardumont) on 2019-11-08 13:21:56 +0100)
  * Upstream changes:
    - v0.0.49
    - New package loader implementations: archive, pypi, npm, deposit, debian
 -- Software Heritage autobuilder (on jenkins-debian1)  Fri, 08 Nov 2019 12:29:47 +0000

swh-loader-core (0.0.48-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.48 - (tagged by Stefano Zacchiroli on 2019-10-01 16:49:39 +0200)
  * Upstream changes:
    - v0.0.48
    - typing: minimal changes to make a no-op mypy run pass
 -- Software Heritage autobuilder (on jenkins-debian1)  Tue, 01 Oct 2019 14:52:59 +0000

swh-loader-core (0.0.47-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.47 - (tagged by Antoine Lambert on 2019-10-01 11:32:50 +0200)
  * Upstream changes:
    - version 0.0.47: Workaround HashCollision errors
 -- Software Heritage autobuilder (on jenkins-debian1)  Tue, 01 Oct 2019 09:35:38 +0000

swh-loader-core (0.0.46-1~swh1) unstable-swh; urgency=medium
  * New upstream release 0.0.46 - (tagged by Antoine R. Dumont (@ardumont) on 2019-09-06 18:30:42 +0200)
  * Upstream changes:
    - v0.0.46
    - pytest.ini: Remove warnings about our custom markers
    - pep8: Fix log.warning calls
    - core/loader: Fix get_save_data_path implementation
    - Fix validation errors in test.
 -- Software Heritage autobuilder (on jenkins-debian1)  Fri, 06 Sep 2019 16:33:13 +0000
Dumont (@ardumont) Tue, 03 Sep 2019 14:12:13 +0200 swh-loader-core (0.0.45-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.45 - (tagged by Antoine R. Dumont (@ardumont) on 2019-09-03 10:38:36 +0200) * Upstream changes: - v0.0.45 - loader: Provide visit type when calling origin_visit_add - loader: Drop keys 'perms' and 'path' from content before sending to the - storage - swh.loader.package: Implement GNU loader - docs: add code of conduct document -- Software Heritage autobuilder (on jenkins-debian1) Tue, 03 Sep 2019 08:41:49 +0000 swh-loader-core (0.0.44-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.44 - (tagged by Valentin Lorentz on 2019-06-25 12:18:27 +0200) * Upstream changes: - Drop use of deprecated methods fetch_history_* -- Software Heritage autobuilder (on jenkins-debian1) Wed, 26 Jun 2019 09:40:59 +0000 swh-loader-core (0.0.43-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.43 - (tagged by Valentin Lorentz on 2019-06-18 16:21:58 +0200) * Upstream changes: - Use origin urls instead of origin ids. -- Software Heritage autobuilder (on jenkins-debian1) Wed, 19 Jun 2019 09:33:53 +0000 swh-loader-core (0.0.42-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.42 - (tagged by David Douard on 2019-05-20 11:28:49 +0200) * Upstream changes: - v0.0.42 - update/fix requirements -- Software Heritage autobuilder (on jenkins-debian1) Mon, 20 May 2019 09:33:47 +0000 swh-loader-core (0.0.41-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.41 - (tagged by Antoine R. Dumont (@ardumont) on 2019-04-11 11:46:00 +0200) * Upstream changes: - v0.0.41 - core.loader: Migrate to latest snapshot_add, origin_visit_update api - core.loader: Count only the effectively new objects ingested - test_utils: Add coverage on utils module -- Software Heritage autobuilder (on jenkins-debian1) Thu, 11 Apr 2019 09:52:55 +0000 swh-loader-core (0.0.40-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.40 - (tagged by Antoine Lambert on 2019-03-29 10:57:14 +0100) * Upstream changes: - version 0.0.40 -- Software Heritage autobuilder (on jenkins-debian1) Fri, 29 Mar 2019 10:02:37 +0000 swh-loader-core (0.0.39-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.39 - (tagged by Antoine R. Dumont (@ardumont) on 2019-01-30 11:10:39 +0100) * Upstream changes: - v0.0.39 -- Software Heritage autobuilder (on jenkins-debian1) Wed, 30 Jan 2019 10:13:56 +0000 swh-loader-core (0.0.35-1~swh1) unstable-swh; urgency=medium * v0.0.35 * tests: Initialize tox.ini use * tests, debian/*: Migrate to pytest -- Antoine R. Dumont (@ardumont) Tue, 23 Oct 2018 15:47:22 +0200 swh-loader-core (0.0.34-1~swh1) unstable-swh; urgency=medium * v0.0.34 * setup: prepare for PyPI upload * README.md: Simplify module description * core.tests: Install tests fixture for derivative loaders to use -- Antoine R. Dumont (@ardumont) Tue, 09 Oct 2018 14:11:29 +0200 swh-loader-core (0.0.33-1~swh1) unstable-swh; urgency=medium * v0.0.33 * loader/utils: Add clean_dangling_folders function to ease clean up * loader/core: Add optional pre_cleanup for dangling files cleaning -- Antoine R. Dumont (@ardumont) Fri, 09 Mar 2018 14:41:17 +0100 swh-loader-core (0.0.32-1~swh1) unstable-swh; urgency=medium * v0.0.32 * Improve origin_visit initialization step * Properly sandbox the prepare statement so that if it breaks, we can * update appropriately the visit with the correct status -- Antoine R. 
Dumont (@ardumont) Wed, 07 Mar 2018 11:06:27 +0100 swh-loader-core (0.0.31-1~swh1) unstable-swh; urgency=medium * Release swh.loader.core v0.0.31 * Remove backwards-compatibility when sending snapshots -- Nicolas Dandrimont Tue, 13 Feb 2018 18:52:20 +0100 swh-loader-core (0.0.30-1~swh1) unstable-swh; urgency=medium * Release swh.loader.core v0.0.30 * Update Debian metadata for snapshot-related breakage -- Nicolas Dandrimont Tue, 06 Feb 2018 14:22:53 +0100 swh-loader-core (0.0.29-1~swh1) unstable-swh; urgency=medium * Release swh.loader.core v0.0.29 * Replace occurrences with snapshots * Enhance logging on error cases -- Nicolas Dandrimont Tue, 06 Feb 2018 14:13:11 +0100 swh-loader-core (0.0.28-1~swh1) unstable-swh; urgency=medium * v0.0.28 * Add stateless loader base class * Remove bare exception handlers -- Antoine R. Dumont (@ardumont) Tue, 19 Dec 2017 17:48:09 +0100 swh-loader-core (0.0.27-1~swh1) unstable-swh; urgency=medium * v0.0.27 * Migrate from indexer's indexer_configuration to storage's tool notion. -- Antoine R. Dumont (@ardumont) Thu, 07 Dec 2017 10:36:23 +0100 swh-loader-core (0.0.26-1~swh1) unstable-swh; urgency=medium * v0.0.26 * Fix send_provider method -- Antoine R. Dumont (@ardumont) Tue, 05 Dec 2017 15:40:57 +0100 swh-loader-core (0.0.25-1~swh1) unstable-swh; urgency=medium * v0.0.25 * swh.loader.core: Fix to retrieve the provider_id as an actual id * swh.loader.core: Fix log format error * swh.loader.core: Align log message according to conventions -- Antoine R. Dumont (@ardumont) Wed, 29 Nov 2017 12:55:45 +0100 swh-loader-core (0.0.24-1~swh1) unstable-swh; urgency=medium * v0.0.24 * Added metadata injection possible from loader core -- Antoine R. Dumont (@ardumont) Fri, 24 Nov 2017 11:35:40 +0100 swh-loader-core (0.0.23-1~swh1) unstable-swh; urgency=medium * v0.0.23 * loader: Fix dangling data flush -- Antoine R. Dumont (@ardumont) Tue, 07 Nov 2017 16:25:20 +0100 swh-loader-core (0.0.22-1~swh1) unstable-swh; urgency=medium * v0.0.22 * core.loader: Use the global setup set in swh.core.config * core.loader: Properly batch object insertions for big requests -- Antoine R. Dumont (@ardumont) Mon, 30 Oct 2017 18:50:00 +0100 swh-loader-core (0.0.21-1~swh1) unstable-swh; urgency=medium * v0.0.21 * swh.loader.core: Only send origin if not already sent before -- Antoine R. Dumont (@ardumont) Tue, 24 Oct 2017 16:30:53 +0200 swh-loader-core (0.0.20-1~swh1) unstable-swh; urgency=medium * v0.0.20 * Permit to add 'post_load' actions in loaders -- Antoine R. Dumont (@ardumont) Fri, 13 Oct 2017 14:30:37 +0200 swh-loader-core (0.0.19-1~swh1) unstable-swh; urgency=medium * v0.0.19 * Permit to add 'post_load' actions in loaders -- Antoine R. 
Dumont (@ardumont) Fri, 13 Oct 2017 14:14:14 +0200 swh-loader-core (0.0.18-1~swh1) unstable-swh; urgency=medium * Release swh.loader.core version 0.0.18 * Update packaging runes -- Nicolas Dandrimont Thu, 12 Oct 2017 18:07:53 +0200 swh-loader-core (0.0.17-1~swh1) unstable-swh; urgency=medium * Release swh.loader.core v0.0.17 * Allow iterating when fetching and storing data * Allow overriding the status of the loaded visit * Allow overriding the status of the load itself -- Nicolas Dandrimont Wed, 11 Oct 2017 16:38:29 +0200 swh-loader-core (0.0.16-1~swh1) unstable-swh; urgency=medium * Release swh.loader.core v0.0.16 * Migrate from swh.model.git to swh.model.from_disk -- Nicolas Dandrimont Fri, 06 Oct 2017 14:46:41 +0200 swh-loader-core (0.0.15-1~swh1) unstable-swh; urgency=medium * v0.0.15 * docs: Add sphinx apidoc generation skeleton * docs: Add a simple README.md explaining the module's goal * swh.loader.core.loader: Unify origin_visit add/update function call -- Antoine R. Dumont (@ardumont) Fri, 29 Sep 2017 11:47:37 +0200 swh-loader-core (0.0.14-1~swh1) unstable-swh; urgency=medium * v0.0.14 * Add the blake2s256 hash computation -- Antoine R. Dumont (@ardumont) Sat, 25 Mar 2017 18:20:52 +0100 swh-loader-core (0.0.13-1~swh1) unstable-swh; urgency=medium * v0.0.13 * Improve core loader's interface api -- Antoine R. Dumont (@ardumont) Wed, 22 Feb 2017 13:43:54 +0100 swh-loader-core (0.0.12-1~swh1) unstable-swh; urgency=medium * v0.0.12 * Update storage configuration reading -- Antoine R. Dumont (@ardumont) Thu, 15 Dec 2016 18:34:41 +0100 swh-loader-core (0.0.11-1~swh1) unstable-swh; urgency=medium * v0.0.11 * d/control: Bump dependency to latest storage * Fix: Objects can be injected even though global loading failed * Populate the counters in fetch_history * Open open/close fetch_history function in the core loader -- Antoine R. Dumont (@ardumont) Wed, 24 Aug 2016 14:38:55 +0200 swh-loader-core (0.0.10-1~swh1) unstable-swh; urgency=medium * v0.0.10 * d/control: Update dependency -- Antoine R. Dumont (@ardumont) Sat, 11 Jun 2016 02:26:50 +0200 swh-loader-core (0.0.9-1~swh1) unstable-swh; urgency=medium * v0.0.9 * Improve default task that initialize storage as well -- Antoine R. Dumont (@ardumont) Fri, 10 Jun 2016 15:12:14 +0200 swh-loader-core (0.0.8-1~swh1) unstable-swh; urgency=medium * v0.0.8 * Migrate specific converter to the right module * Fix dangling parameter -- Antoine R. Dumont (@ardumont) Wed, 08 Jun 2016 18:09:23 +0200 swh-loader-core (0.0.7-1~swh1) unstable-swh; urgency=medium * v0.0.7 * Fix on revision conversion -- Antoine R. Dumont (@ardumont) Wed, 08 Jun 2016 16:19:02 +0200 swh-loader-core (0.0.6-1~swh1) unstable-swh; urgency=medium * v0.0.6 * d/control: Bump dependency on swh-model * d/control: Add missing description * Keep the abstraction for all entities * Align parameter definition order * Fix missing option in DEFAULT ones * Decrease verbosity * Fix missing origin_id assignment * d/rules: Add target to run tests during packaging -- Antoine R. Dumont (@ardumont) Wed, 08 Jun 2016 16:00:40 +0200 swh-loader-core (0.0.5-1~swh1) unstable-swh; urgency=medium * v0.0.5 -- Antoine R. Dumont (@ardumont) Wed, 25 May 2016 12:17:06 +0200 swh-loader-core (0.0.4-1~swh1) unstable-swh; urgency=medium * v0.0.4 * Rename package from python3-swh.loader to python3-swh.loader.core -- Antoine R. 
Dumont (@ardumont) Wed, 25 May 2016 11:44:48 +0200 swh-loader-core (0.0.3-1~swh1) unstable-swh; urgency=medium * v0.0.3 * Improve default configuration * Rename package from swh-loader-vcs to swh-loader -- Antoine R. Dumont (@ardumont) Wed, 25 May 2016 11:23:06 +0200 swh-loader-core (0.0.2-1~swh1) unstable-swh; urgency=medium * v0.0.2 * Fix: Flush data even when no data is sent to swh-storage -- Antoine R. Dumont (@ardumont) Tue, 24 May 2016 16:41:49 +0200 swh-loader-core (0.0.1-1~swh1) unstable-swh; urgency=medium * Initial release * v0.0.1 -- Antoine R. Dumont (@ardumont) Wed, 13 Apr 2016 16:54:47 +0200 diff --git a/debian/control b/debian/control index 0dee749..12017ff 100644 --- a/debian/control +++ b/debian/control @@ -1,43 +1,46 @@ Source: swh-loader-core Maintainer: Software Heritage developers Section: python Priority: optional Build-Depends: debhelper (>= 9), dh-python (>= 3), python3-all, python3-dateutil, python3-debian, python3-iso8601, python3-pkginfo, python3-pytest, python3-pytest-mock, python3-pytest-postgresql, python3-psutil, python3-requests-mock, python3-setuptools, python3-setuptools-scm, python3-swh.core, python3-swh.core.db.pytestplugin, python3-swh.model (>= 3.1.0~), python3-swh.storage (>= 0.22.0~), python3-swh.scheduler (>= 0.4.0~), + python3-tenacity, + python3-toml, opam, + zstd, Standards-Version: 3.9.6 Homepage: https://forge.softwareheritage.org/diffusion/60/ Package: python3-swh.loader.core Architecture: all Depends: python3-swh.core, python3-swh.model (>= 3.1.0~), python3-swh.storage (>= 0.22.0~), python3-swh.scheduler (>= 0.4.0~), ${misc:Depends}, ${python3:Depends} Breaks: python3-swh.deposit.loader (<< 0.0.48~), python3-swh.loader.debian (<< 0.0.9~), python3-swh.loader.dir (<< 0.0.31~), python3-swh.loader.git (<< 0.0.36~), python3-swh.loader.mercurial (<< 0.0.3~), python3-swh.loader.svn (<< 0.0.35~), python3-swh.loader.tar (<< 0.0.33~) Description: Software Heritage Loader Core diff --git a/docs/package-loader-specifications.rst b/docs/package-loader-specifications.rst index 1d734b8..ce808d0 100644 --- a/docs/package-loader-specifications.rst +++ b/docs/package-loader-specifications.rst @@ -1,133 +1,169 @@ .. _package-loader-specifications: Package loader specifications ============================= Release fields -------------- Here is an overview of the fields (+ internal version name + branch name) used by each package loader, after D6616: .. 
list-table:: Fields used by each package loader :header-rows: 1 * - Loader - internal version - branch name - name - message - synthetic - author - date - Notes + * - arch + - ``p_info.​version`` + - ``release_name(​version, filename)`` + - =version + - Synthetic release for Arch Linux source package {p_info.name} version {p_info.version} {description} + - true + - from intrinsic metadata + - from extra_loader_arguments['arch_metadata'] + - Intrinsic metadata extracted from .PKGINFO file of the package * - archive - passed as arg - ``release_name(​version)`` - =version - "Synthetic release for archive at {p_info.url}\n" - true - "" - passed as arg - + * - aur + - ``p_info.​version`` + - ``release_name(​version, filename)`` + - =version + - Synthetic release for Aur source package {p_info.name} version {p_info.version} {description} + - true + - "" + - from extra_loader_arguments['aur_metadata'] + - Intrinsic metadata extracted from .SRCINFO file of the package * - cran - ``metadata.get(​"Version", passed as arg)`` - ``release_name(​version)`` - =version - standard message - true - ``metadata.get(​"Maintainer", "")`` - ``metadata.get(​"Date")`` - metadata is intrinsic * - crates - ``p_info.​version`` - - ``release_name(​version, filename)`` + - ``release_name(​version, filename) + "\n\n" + i_metadata.description + "\n"`` - =version - Synthetic release for Crate source package {p_info.name} version {p_info.version} {description} - true - - from intrinsic metadata - - from extrinsic metadata + - from int metadata + - from ext metadata - ``i_metadata`` for intrinsic metadata, ``e_metadata`` for extrinsic metadata * - debian - =``version`` - ``release_name(​version)`` - =``i_version`` - standard message (using ``i_version``) - true - ``metadata​.changelog​.person`` - ``metadata​.changelog​.date`` - metadata is intrinsic. Old revisions have ``dsc`` as type ``i_version`` is the intrinsic version (eg. ``0.7.2-3``) while ``version`` contains the debian suite name (eg. ``stretch/contrib/0.7.2-3``) and is passed as arg + * - golang + - ``p_info.​version`` + - ``release_name(version)`` + - =version + - Synthetic release for Golang source package {p_info.name} version {p_info.version} + - true + - "" + - from ext metadata + - Golang offers basically no metadata outside of version and timestamp * - deposit - HEAD - only HEAD - HEAD - "{client}: Deposit {id} in collection {collection}\n" - true - original author - ```` from SWORD XML - revisions had parents * - maven-loader - passed as arg - HEAD - ``release_name(version)`` - "Synthetic release for archive at {p_info.url}\n" - true - "" - passed as arg - Only one artefact per url (jar/zip src) * - nixguix - URL - URL - URL - None - true - "" - None - it's the URL of the artifact referenced by the derivation * - npm - ``metadata​["version"]`` - ``release_name(​version)`` - =version - standard message - true - from int metadata or "" - from ext metadata or None - * - opam - as given by opam - "{opam_package}​.{version}" - =version - standard message - true - from metadata - None - "{self.opam_package}​.{version}" matches the version names used by opam's backend. 
metadata is extrinsic + * - pubdev + - ``p_info.version`` + - ``release_name(version)`` + - =version + - Synthetic release for pub.dev source package {name} version {version} {description} + - true + - from extrinsic metadata + - from extrinsic metadata + - name, version and description from intrinsic metadata * - pypi - ``metadata["version"]`` - ``release_name(version)`` or ``release_name(version, filename)`` - =version - ``metadata['comment_text']`` or standard message - true - from int metadata or "" - from ext metadata or None - metadata is intrinsic

using this function::

    def release_name(version: str, filename: Optional[str] = None) -> str:
        if filename:
            return "releases/%s/%s" % (version, filename)
        return "releases/%s" % version

and "standard message" being::

    msg = (
        f"Synthetic release for {PACKAGE_MANAGER} source package {name} "
        f"version {version}\n"
    )

The ``target_type`` field is always ``dir``, and the target is the id of a directory loaded by unpacking a tarball/zip file/...

diff --git a/docs/package-loader-tutorial.rst b/docs/package-loader-tutorial.rst
index 304936c..6543bd2 100644
--- a/docs/package-loader-tutorial.rst
+++ b/docs/package-loader-tutorial.rst
@@ -1,699 +1,712 @@
.. _package-loader-tutorial:

Package Loader Tutorial
=======================

In this tutorial, we will see how to write a loader for |swh| that loads packages from a package manager, such as PyPI or Debian's.

First, you should be familiar with Python, unit-testing, |swh|'s :ref:`data-model` and :ref:`architecture`, and go through the :ref:`developer-setup`.

Creating the files hierarchy
----------------------------

Once this is done, you should create a new directory (ie. a (sub)package from Python's point of view) for your loader. It can be either a subdirectory of ``swh-loader-core/swh/loader/package/`` like the other package loaders, or it can be in its own package. If you choose the latter, you should also create the base files of any Python package (such as ``setup.py``); you can import them from the `swh-py-template`_ repository.

In the rest of this tutorial, we will assume you chose the former and your loader is named "New Loader", so your package loader is in ``swh-loader-core/swh/loader/package/newloader/``.

Next, you should create the boilerplate files needed for SWH loaders: ``__init__.py``, ``tasks.py``, ``tests/__init__.py``, and ``tests/test_tasks.py``; copy them from an existing package, such as ``swh-loader-core/swh/loader/package/pypi/``, and replace the names in those with your loader's.

Finally, create an `entrypoint`_ in :file:`setup.py`, so your loader can be discovered by the SWH Celery workers::

    entry_points="""
        [swh.workers]
        loader.newloader=swh.loader.package.newloader:register
    """,

.. _swh-py-template: https://forge.softwareheritage.org/source/swh-py-template/
.. _entrypoint: https://setuptools.readthedocs.io/en/latest/userguide/entry_point.html
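For orientation, those boilerplate files are small: ``__init__.py`` exposes the ``register()`` function referenced by the entrypoint above, and ``tasks.py`` defines the Celery task wrapping the loader. A rough sketch of what they usually boil down to follows; the names are illustrative, ``NewLoader`` is the loader class you will only write in the next section, and the real contents should be copied from an existing loader rather than from here::

    # swh/loader/package/newloader/__init__.py
    from typing import Any, Mapping

    def register() -> Mapping[str, Any]:
        # Referenced by the setup.py entrypoint above; tells the SWH Celery
        # workers which task module and loader class this package provides.
        from swh.loader.package.newloader.loader import NewLoader

        return {
            "task_modules": [f"{__name__}.tasks"],
            "loader": NewLoader,
        }

    # swh/loader/package/newloader/tasks.py
    from celery import shared_task

    from swh.loader.package.newloader.loader import NewLoader

    @shared_task(name=__name__ + ".LoadNewLoader")
    def load_newloader(**kwargs):
        # Instantiate the loader from the worker configuration and run it
        return NewLoader.from_configfile(**kwargs).load()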
Writing a minimal loader
------------------------

It is now time for the interesting part: writing the code to load packages from a package manager into the |swh| archive.

Create a file named :file:`loader.py` in your package's directory, with two empty classes (replace the names with what you think is relevant)::

    from typing import Optional

    import attr

    from swh.loader.package.loader import BasePackageInfo, PackageLoader
    from swh.model.model import ObjectType, Person, Release, Sha1Git, TimestampWithTimezone


    @attr.s
    class NewPackageInfo(BasePackageInfo):
        pass


    class NewLoader(PackageLoader[NewPackageInfo]):
        visit_type = "newloader"

We now have to fill some of the methods declared by :class:`swh.loader.package.PackageLoader` in your new ``NewLoader`` class.

Listing versions
++++++++++++++++

``get_versions`` should return the list of names of all versions of the origin defined at ``self.url`` by the default constructor; and ``get_default_version`` should return the name of the default version (usually the latest stable release).

They are both implemented with an API call to the package repository. For example, for PyPI origin https://pypi.org/project/requests, this is done with a request to https://pypi.org/pypi/requests/json.
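To make this concrete, here is a minimal sketch of these two methods for a hypothetical package repository whose API serves one JSON document per project. The endpoint and the ``versions``/``latest`` field names are invented for this example, and both methods go on your ``NewLoader`` class::

    from typing import Sequence

    import requests

    def get_versions(self) -> Sequence[str]:
        # Hypothetical API: <self.url>/json returns something like
        # {"versions": {"1.0.0": {...}, "2.0.0": {...}}, "latest": "2.0.0"}
        info = requests.get(f"{self.url}/json").json()
        return list(info["versions"])

    def get_default_version(self) -> str:
        info = requests.get(f"{self.url}/json").json()
        return info["latest"]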
Getting package information
+++++++++++++++++++++++++++

Next, ``get_package_info`` takes as argument a version name (as returned by ``get_versions``) and yields ``(branch_name, p_info)`` tuples, where ``branch_name`` is a string and ``p_info`` is an instance of the ``NewPackageInfo`` class we defined earlier.

Each of these tuples should match a single file the loader will download from the origin. Usually, there is only one file per version, but this is not true for all package repositories (eg. CRAN and PyPI allow multiple artifacts per version).

As ``NewPackageInfo`` derives from :py:class:`swh.loader.package.BasePackageInfo`, it can be created like this::

    return NewPackageInfo(url="https://...", filename="...-versionX.Y.tar.gz")

The ``url`` must be the URL to download the archive from. ``filename`` is optional, but it is nice to fill it when possible/relevant.

The base ``PackageLoader`` will then take care of calling ``get_versions()`` to get all the versions, then call ``get_package_info()`` to get the list of archives to download, download them, and load all the directories in the archive. This means you do not need to manage downloads yourself, and we are now done with interactions with the package repository.
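Continuing the hypothetical JSON API from the previous sketch, ``get_package_info`` could look like the following; the ``tarball_url`` and ``name`` fields are again invented, and the branch name simply follows the usual ``releases/{version}`` convention::

    from typing import Iterator, Tuple

    import requests

    def get_package_info(self, version: str) -> Iterator[Tuple[str, NewPackageInfo]]:
        info = requests.get(f"{self.url}/json").json()
        artifact = info["versions"][version]  # hypothetical per-version entry
        p_info = NewPackageInfo(
            url=artifact["tarball_url"],  # where the base class will download from
            filename=f"{info['name']}-{version}.tar.gz",
        )
        # A single artifact per version in this hypothetical repository;
        # yield one tuple per file if your repository has several.
        yield f"releases/{version}", p_info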
Building a release
+++++++++++++++++++

The final step for your minimal loader to work is to implement ``build_release``. This is a very important part, as it will create a release object that will be inserted in |swh|, as a link between origins and the directories.

This function takes three important arguments:

* ``p_info`` is an object returned by ``get_package_info()``
* ``uncompressed_path`` is the location on the disk where the base ``PackageLoader`` extracted the archive, so you can access files from the archive.
* ``directory`` is an :term:`intrinsic identifier` of the directory that was loaded from the archive

The way to implement it depends very much on how the package manager works, but here is a rough idea::

    def build_release(
        self, p_info: NewPackageInfo, uncompressed_path: str, directory: Sha1Git
    ) -> Optional[Release]:
        author = Person(name="Jane Doe", email="jdoe@example.org")
        date = TimestampWithTimezone.from_iso8601("2021-04-01T11:55:20Z")

        return Release(
            name="v2.0.0",
            message="This is a new release of the project",
            author=author,
            date=date,
            target=directory,
            target_type=ObjectType.DIRECTORY,
            synthetic=True,
        )

The strings here are placeholders, and you should extract them from either the extracted archive (using ``uncompressed_path``), or from the package repository's API; see the :ref:`existing specifications <package-loader-specifications>` for examples of values to use. The various classes used in this example are :py:class:`swh.model.model.Person`, :py:class:`swh.model.model.TimestampWithTimezone`, and :py:class:`swh.model.model.Release`.

Note that you have access to the ``NewPackageInfo`` object created by ``get_package_info()``, so you can extend the ``NewPackageInfo`` class to pass data between these two functions.

A few caveats:

* Make sure the timezone matches the source's
* ``Person`` can also be built with just a ``fullname``, if there aren't distinct fields for name and email. When in doubt, it's better to just write the ``fullname`` than try to parse it
* ``author`` and ``committer`` (resp. ``date`` and ``committer_date``) may be different if the release was written and published by different people (resp. dates). This is only relevant when loading from VCS, so you can usually ignore it in your package loader.

Running your loader
+++++++++++++++++++

.. _docker-run-loader-cli:

With Docker
^^^^^^^^^^^

We recommend you use our `Docker environment`_ to test your loader.

In short, install Docker, ``cd`` to ``swh-environment/docker/``, then `edit docker-compose.override.yml`_ to insert your new loader in the Docker environment, something like this will do::

    version: '2'

    services:
      swh-loader-core:
        volumes:
          - "$HOME/swh-environment/swh-loader-core:/src/swh-loader-core"

Then start the Docker environment::

    docker-compose start

Then, you can run your loader::

    docker-compose exec swh-loader swh loader run newloader "https://example.org/~jdoe/project/"

where ``newloader`` is the name you registered as an entrypoint in ``setup.py`` and ``https://example.org/~jdoe/project/`` is the origin URL, that will be set as the ``self.url`` attribute of your loader.

For example, to run the PyPI loader, the command would be::

    docker-compose exec swh-loader swh loader run pypi "https://pypi.org/project/requests/"

If you get this error, make sure you properly configured ``docker-compose.override.yml``::

    Error: Invalid value for '[...]': invalid choice: newloader

Without Docker
^^^^^^^^^^^^^^

If you do not want to use the Docker environment, you will need to start an :ref:`swh-storage` instance yourself, and create a config file that references it::

    storage:
      cls: remote
      url: http://localhost:5002/

Or alternatively, this more efficient configuration::

    storage:
      cls: pipeline
      steps:
      - cls: buffer
        min_batch_size:
          content: 10000
          content_bytes: 104857600
          directory: 1000
          release: 1000
      - cls: filter
      - cls: remote
        url: http://localhost:5002/

And run your loader with::

    swh loader -C loader.yml run newloader "https://example.org/~jdoe/project/"

where ``newloader`` is the name you registered as an entrypoint in ``setup.py`` and ``https://example.org/~jdoe/project/`` is the origin URL, that will be set as the ``self.url`` attribute of your loader.

For example, with PyPI::

    swh loader -C loader.yml run pypi "https://pypi.org/project/requests/"

.. _Docker environment: https://forge.softwareheritage.org/source/swh-environment/browse/master/docker/
.. _edit docker-compose.override.yml: https://forge.softwareheritage.org/source/swh-environment/browse/master/docker/#install-a-swh-package-from

Testing your loader
+++++++++++++++++++

You must write tests for your loader.

First, of course, unit tests for the internal functions of your loader, if any (eg. the functions used to extract metadata); but this is not covered in this tutorial.

Most importantly, you should write integration tests for your loader, which will simulate an origin, run the loader, and check everything is loaded in the storage as it should be.

As we do not want tests to directly query an origin (it makes tests flaky, hard to reproduce, and puts unnecessary load on the origin), we usually mock it using the :py:func:`swh.core.pytest_plugin.requests_mock_datadir` fixture. It works by creating a ``data/`` folder in your tests (such as ``swh/loader/package/newloader/tests/data/``) and downloading results from API calls there, in the structure documented in :py:func:`swh.core.pytest_plugin.requests_mock_datadir_factory`.

The files in the ``datadir/`` will then be served whenever the loader tries to access a URL. This is very dependent on the kind of repositories your loader will read from, so here is an example with the PyPI loader.

The files ``swh/loader/package/pypi/tests/data/https_pypi.org/pypi_nexter_json`` and ``swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/nexter-*`` are used in this test::

    from swh.loader.package.pypi.loader import PyPILoader
    from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats
    from swh.model.hashutil import hash_to_bytes
    from swh.model.model import Snapshot, SnapshotBranch, TargetType

    def test_pypi_visit_1_release_with_2_artifacts(swh_storage, requests_mock_datadir):
        # Initialize the loader
        url = "https://pypi.org/project/nexter"
        loader = PyPILoader(swh_storage, url)

        # Run the loader, with a swh-storage instance, on the given URL.
        # HTTP calls will be mocked by the requests_mock_datadir fixture
        actual_load_status = loader.load()

        # Check the loader loaded exactly the snapshot we expected
        # (when writing your tests for the first time, you cannot know the
        # snapshot id without running your loader; so let it error and write
        # down the result here)
        expected_snapshot_id = hash_to_bytes("1394b2e59351a944cc763bd9d26d90ce8e8121a8")
        assert actual_load_status == {
            "status": "eventful",
            "snapshot_id": expected_snapshot_id.hex(),
        }

        # Check the content of the snapshot. (ditto)
        expected_snapshot = Snapshot(
            id=expected_snapshot_id,
            branches={
                b"releases/1.1.0/nexter-1.1.0.zip": SnapshotBranch(
                    target=hash_to_bytes("f7d43faeb65b64d3faa67e4f46559db57d26b9a4"),
                    target_type=TargetType.RELEASE,
                ),
                b"releases/1.1.0/nexter-1.1.0.tar.gz": SnapshotBranch(
                    target=hash_to_bytes("732bb9dc087e6015884daaebb8b82559be729b5a"),
                    target_type=TargetType.RELEASE,
                ),
            },
        )
        check_snapshot(expected_snapshot, swh_storage)

        # Check the visit was properly created with the right type
        assert_last_visit_matches(
            swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot.id
        )

        # Then you could check the directory structure:
        directory_id = swh_storage.release_get(
            [hash_to_bytes("f7d43faeb65b64d3faa67e4f46559db57d26b9a4")]
        )[0].target
        entries = list(swh_storage.directory_ls(directory_id, recursive=True))
        assert entries == [
            ...
        ]

Here are some scenarios you should test, when relevant:

* No versions
* One version
* Two or more versions
* More than one package per version, if relevant
* Corrupt packages (missing metadata, ...), if relevant
* API errors
* etc.

Making your loader incremental
------------------------------

.. important::

   In the previous sections, you wrote a fully functional loader for a new type of package repository. This is great! Please tell us about it, and :ref:`submit it for review ` so we can give you some feedback early.

Now, we will see a key optimization for any package loader: skipping packages it already downloaded, using :term:`extids `.

The rough idea is to find some way to uniquely identify packages before downloading them and encode it in a short string, the ExtID.

Using checksums
+++++++++++++++

Ideally, this short string is a checksum of the archive, provided by the API before downloading the archive itself. This is ideal, because this ensures that we detect changes in the package's content even if it keeps the same name and version number.

+However, this is only usable when all fields used to generate release objects
+(message, authors, ...) are extracted from the archive.
+
+.. important::
+
+   If release objects are generated from extrinsic fields (ie. not extracted from
+   the archive, such as authorship information added by the package repository),
+   two different package versions with the same tarball would end up with the
+   same release number, causing the loader to create incorrect snapshots.

If this is not the case for the repository you want to load from, skip to the next subsection.

This is used for example by the PyPI loader (with a sha256sum) and the NPM loader (with a sha1sum). The Debian loader uses a similar scheme: as a single package is assembled from a set of tarballs, it only uses the hash of the ``.dsc`` file, which itself contains a hash of all the tarballs.
This is implemented by overriding the ``extid`` method of your ``NewPackageInfo`` class, which returns the type of the ExtID (see below) and the ExtID itself::

    from swh.loader.package.loader import PartialExtID
    from swh.model.hashutil import hash_to_bytes

    EXTID_TYPE: str = "pypi-archive-sha256"

    @attr.s
    class NewPackageInfo(BasePackageInfo):
        sha256: str

        def extid(self) -> PartialExtID:
            return (EXTID_TYPE, hash_to_bytes(self.sha256))

and the loader's ``get_package_info`` method sets the right value in the ``sha256`` attribute.

Using a custom manifest
+++++++++++++++++++++++

Unfortunately, this does not work for all packages, as some package repositories do not provide a checksum of the archives via their API. If this is the case for the repository you want to load from, you need to find a way around it.

It highly depends on the repository, so this tutorial cannot cover how to do it. We do however provide an easy option that should work in most cases: creating a "manifest" of the archive with some metadata in it, and hashing it.

For example, when loading from the GNU FTP servers, we have access to some metadata that is somewhat good enough to deduplicate. We write them all in a string and hash that string.

It is done like this::

    import string

    @attr.s
    class ArchivePackageInfo(BasePackageInfo):
        length = attr.ib(type=int)
        """Size of the archive file"""

        time = attr.ib(type=Union[str, datetime.datetime])
        """Timestamp of the archive file on the server"""

        version = attr.ib(type=str)

        EXTID_FORMAT = "package-manifest-sha256"

        MANIFEST_FORMAT = string.Template("$time $length $version $url")

The default implementation of :py:func:`swh.loader.package.loader.BasePackageInfo.extid` will read this template, substitute the variables based on the object's attributes, compute the hash of the result, and return it.

Note that, as mentioned before, this is not perfect because a tarball may be replaced with a different tarball of exactly the same length and modification time, and we won't detect it. But this is extremely unlikely, so we consider it to be good enough.

+.. important::
+
+   The manifest must cover all fields used to generate Release objects.
+

Alternatively, if this is not good enough for your loader, you can simply not implement ExtIDs, and your loader will always load all tarballs. This can be bandwidth-heavy for both |swh| and the origin you are loading from, so this decision should not be taken lightly.
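To recap the manifest mechanism: stripped of the loader machinery, what the default ``extid()`` computes for the ``ArchivePackageInfo`` above is, in essence, the following (assuming, as the ``"package-manifest-sha256"`` name suggests, a SHA256 of the substituted template)::

    import hashlib
    import string

    MANIFEST_FORMAT = string.Template("$time $length $version $url")

    def manifest_extid(time: str, length: int, version: str, url: str) -> bytes:
        # Substitute the package's attributes into the manifest template...
        manifest = MANIFEST_FORMAT.substitute(
            time=time, length=length, version=version, url=url
        )
        # ...and hash the resulting string; two packages with the same
        # metadata thus get the same ExtID and are not downloaded twice.
        return hashlib.sha256(manifest.encode()).digest()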
Choosing the ExtID type
+++++++++++++++++++++++

The type of your ExtID should be a short ASCII string that is both unique to your loader and descriptive of how it was computed.

Why unique to the loader? Because different loaders may load the same archive differently. For example, if I were to create an archive with both a ``PKG-INFO`` and a ``package.json`` file, and submit it to both NPM and PyPI, both package repositories would have exactly the same tarball. But the NPM loader would create the release based on authorship info in ``package.json``, and the PyPI loader based on ``PKG-INFO``. But we do not want the PyPI loader to assume it already created a release itself, while the release was created by the NPM loader!

And why descriptive? This is simply for future-proofing; in case your loader changes the format of the ExtID (eg. by using a different hash algorithm).

Testing your incremental loading
++++++++++++++++++++++++++++++++

If you followed the steps above, your loader is now able to detect what packages it already downloaded and skip them. This is what we call an incremental loader.

It is now time to write tests to make sure your loader fulfills this promise. This time, we want to use ``requests_mock_datadir_visits`` instead of ``requests_mock_datadir``, because we want to mock the repository's API to emulate its results changing over time (eg. because a new version was published between two runs of the loader). See the documentation of :py:func:`swh.core.pytest_plugin.requests_mock_datadir_factory` for a description of the file layout to use.

Let's take, once again, a look at ``swh/loader/package/pypi/tests/test_pypi.py``, to use as an example::

    def test_pypi_incremental_visit(swh_storage, requests_mock_datadir_visits):
        """With prior visit, 2nd load will result in a different snapshot"""
        # Initialize the loader
        url = "https://pypi.org/project/0805nexter"
        loader = PyPILoader(swh_storage, url)

        # First visit
        visit1_actual_load_status = loader.load()
        visit1_stats = get_stats(swh_storage)

        # Make sure everything is in order
        expected_snapshot_id = hash_to_bytes("ba6e158ada75d0b3cfb209ffdf6daa4ed34a227a")
        assert visit1_actual_load_status == {
            "status": "eventful",
            "snapshot_id": expected_snapshot_id.hex(),
        }

        assert_last_visit_matches(
            swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot_id
        )

        assert {
            "content": 6,
            "directory": 4,
            "origin": 1,
            "origin_visit": 1,
            "release": 2,
            "revision": 0,
            "skipped_content": 0,
            "snapshot": 1,
        } == visit1_stats

        # Reset internal state
        del loader._cached__raw_info
        del loader._cached_info

        # Second visit
        visit2_actual_load_status = loader.load()
        visit2_stats = get_stats(swh_storage)

        # Check the result of the visit
        assert visit2_actual_load_status["status"] == "eventful", visit2_actual_load_status
        expected_snapshot_id2 = hash_to_bytes("2e5149a7b0725d18231a37b342e9b7c4e121f283")
        assert visit2_actual_load_status == {
            "status": "eventful",
            "snapshot_id": expected_snapshot_id2.hex(),
        }

        assert_last_visit_matches(
            swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot_id2
        )

        assert {
            "content": 6 + 1,  # 1 more content
            "directory": 4 + 2,  # 2 more directories
            "origin": 1,
            "origin_visit": 1 + 1,
            "release": 2 + 1,  # 1 more release
            "revision": 0,
            "skipped_content": 0,
            "snapshot": 1 + 1,  # 1 more snapshot
        } == visit2_stats

        # Check all content objects were loaded
        expected_contents = map(
            hash_to_bytes,
            [
                "a61e24cdfdab3bb7817f6be85d37a3e666b34566",
                "938c33483285fd8ad57f15497f538320df82aeb8",
                "a27576d60e08c94a05006d2e6d540c0fdb5f38c8",
                "405859113963cb7a797642b45f171d6360425d16",
                "e5686aa568fdb1d19d7f1329267082fe40482d31",
                "83ecf6ec1114fd260ca7a833a2d165e71258c338",
                "92689fa2b7fb4d4fc6fb195bf73a50c87c030639",
            ],
        )
        assert list(swh_storage.content_missing_per_sha1(expected_contents)) == []

        # Check all directory objects were loaded
        expected_dirs = map(
            hash_to_bytes,
            [
                "05219ba38bc542d4345d5638af1ed56c7d43ca7d",
                "cf019eb456cf6f78d8c4674596f1c9a97ece8f44",
                "b178b66bd22383d5f16f4f5c923d39ca798861b4",
                "c3a58f8b57433a4b56caaa5033ae2e0931405338",
                "e226e7e4ad03b4fc1403d69a18ebdd6f2edd2b3a",
                "52604d46843b898f5a43208045d09fcf8731631b",
            ],
        )
        assert list(swh_storage.directory_missing(expected_dirs)) == []

        # etc.

Loading metadata
----------------

Finally, an optional step: collecting and loading :term:`extrinsic metadata`. This is metadata that your loader may collect while loading an origin. For example, the PyPI loader collects some parts of the API response (eg. https://pypi.org/pypi/requests/json).

They are stored as a raw bytestring, along with a format (an ASCII string) and a date of discovery (usually the time your loader ran).
This is done by adding them to the ``directory_extrinsic_metadata`` attribute of your ``NewPackageInfo`` object when creating it in ``get_package_info`` as :class:`swh.loader.package.loader.RawExtrinsicMetadataCore` objects::

    NewPackageInfo(
        ...,
        directory_extrinsic_metadata=[
            RawExtrinsicMetadataCore(
                format="new-format",
                metadata=b"foo bar baz",
                discovery_date=datetime.datetime(...),
            )
        ]
    )

``format`` should be a human-readable ASCII string that unambiguously describes the format. Readers of the metadata object will have a built-in list of formats they understand, and will check if your metadata object's format is among them. You should use one of the :ref:`known metadata formats ` if possible, or add yours to this list.

``metadata`` is the metadata object itself. When possible, it should be copied verbatim from the source object you got, and should not be created by the loader. If this is not possible, for example because it is extracted from a larger JSON or XML document, make sure you do as few modifications as possible to reduce the risks of corruption.

``discovery_date`` is optional, and defaults to the time your loader started working.

In theory, you can write extrinsic metadata on any kind of object, eg. by implementing :py:meth:`swh.loader.package.loader.PackageLoader.get_extrinsic_origin_metadata`, :py:meth:`swh.loader.package.loader.PackageLoader.get_extrinsic_snapshot_metadata`; but this is rarely relevant in practice. Be sure to check if your loader can find any potentially interesting metadata, though!

You also need to implement a new method on your loader class, to return information on where the metadata is coming from, called a metadata authority. This authority is identified by a URI, such as ``https://github.com/`` for GitHub, ``https://pypi.org/`` for PyPI, etc.

For example::

    from swh.model.model import MetadataAuthority, MetadataAuthorityType

    def get_metadata_authority(self):
        return MetadataAuthority(
            type=MetadataAuthorityType.FORGE,
            url="https://pypi.org/",
        )

If your loader supports loading from different instances (like GitLab), you can define the authority dynamically based on the URL of the origin::

    def get_metadata_authority(self):
        p_url = urlparse(self.url)
        return MetadataAuthority(
            type=MetadataAuthorityType.FORGE,
            url=f"{p_url.scheme}://{p_url.netloc}/",
        )

Checklist
---------

Before the final addition of a new loader, here is a list of things to check for. Most of them are a reminder of other sections above.

* There is (or will be) a lister to trigger it
* Tested with pytest, from scratch and incrementally (if relevant)
* Tested in Docker, from scratch and incrementally (if relevant)
* Release fields are consistent with the :ref:`existing specifications <package-loader-specifications>`, and you updated the specifications to add your loader. They must be explicitly tested.
* Relevant metadata are loaded with as little processing as possible (ie. keep the original format unchanged, instead of converting it to a JSON/msgpack/... format) and :ref:`their format is documented `. They must be tested as well.
* There is no risk of extid clashes, even across instances (if relevant), even in presence of malicious actors (as far as reasonably possible)

Final words
-----------

Congratulations, you made it to the end.

If you have not already, please `contact us`_ to tell us about your new loader, and :ref:`submit your loader for review ` on our forge so we can merge it and run it along our other loaders to archive more repositories.
And if you have any change in mind to improve this tutorial for future readers, please submit them too. Thank you for your contributions! .. _contact us: https://www.softwareheritage.org/community/developers/ diff --git a/requirements-swh.txt b/requirements-swh.txt index c89f10b..30b3fcc 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,5 +1,5 @@ -swh.core >= 0.3 +swh.core >= 2.12 swh.model >= 4.4.0 swh.objstorage >= 0.2.2 swh.scheduler >= 0.4.0 swh.storage >= 0.29.0 diff --git a/setup.py b/setup.py index a4f4d95..421f131 100755 --- a/setup.py +++ b/setup.py @@ -1,82 +1,86 @@ #!/usr/bin/env python3 # Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from io import open from os import path from setuptools import find_packages, setup here = path.abspath(path.dirname(__file__)) # Get the long description from the README file with open(path.join(here, "README.rst"), encoding="utf-8") as f: long_description = f.read() def parse_requirements(name=None): if name: reqf = "requirements-%s.txt" % name else: reqf = "requirements.txt" requirements = [] if not path.exists(reqf): return requirements with open(reqf) as f: for line in f.readlines(): line = line.strip() if not line or line.startswith("#"): continue requirements.append(line) return requirements setup( name="swh.loader.core", description="Software Heritage Base Loader", long_description=long_description, long_description_content_type="text/markdown", python_requires=">=3.7", author="Software Heritage developers", author_email="swh-devel@inria.fr", url="https://forge.softwareheritage.org/diffusion/DLDBASE", packages=find_packages(), # packages's modules scripts=[], # scripts to package install_requires=parse_requirements() + parse_requirements("swh"), setup_requires=["setuptools-scm"], use_scm_version=True, extras_require={"testing": parse_requirements("test")}, include_package_data=True, entry_points=""" [swh.cli.subcommands] loader=swh.loader.cli [swh.workers] + loader.arch=swh.loader.package.arch:register loader.archive=swh.loader.package.archive:register + loader.aur=swh.loader.package.aur:register loader.cran=swh.loader.package.cran:register loader.crates=swh.loader.package.crates:register loader.debian=swh.loader.package.debian:register loader.deposit=swh.loader.package.deposit:register + loader.golang=swh.loader.package.golang:register loader.nixguix=swh.loader.package.nixguix:register loader.npm=swh.loader.package.npm:register loader.opam=swh.loader.package.opam:register + loader.pubdev=swh.loader.package.pubdev:register loader.pypi=swh.loader.package.pypi:register loader.maven=swh.loader.package.maven:register """, classifiers=[ "Programming Language :: Python :: 3", "Intended Audience :: Developers", "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", "Operating System :: OS Independent", "Development Status :: 5 - Production/Stable", ], project_urls={ "Bug Reports": "https://forge.softwareheritage.org/maniphest", "Funding": "https://www.softwareheritage.org/donate", "Source": "https://forge.softwareheritage.org/source/swh-loader-core", "Documentation": "https://docs.softwareheritage.org/devel/swh-loader-core/", }, ) diff --git a/swh.loader.core.egg-info/PKG-INFO b/swh.loader.core.egg-info/PKG-INFO index c872963..0de803a 100644 --- a/swh.loader.core.egg-info/PKG-INFO +++ b/swh.loader.core.egg-info/PKG-INFO 
@@ -1,52 +1,52 @@ Metadata-Version: 2.1 Name: swh.loader.core -Version: 3.5.0 +Version: 4.0.0 Summary: Software Heritage Base Loader Home-page: https://forge.softwareheritage.org/diffusion/DLDBASE Author: Software Heritage developers Author-email: swh-devel@inria.fr Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-loader-core Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-loader-core/ Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE License-File: AUTHORS Software Heritage - Loader foundations ====================================== The Software Heritage Loader Core is a low-level loading utilities and helpers used by :term:`loaders `. The main entry points are classes: - :class:`swh.loader.core.loader.BaseLoader` for loaders (e.g. svn) - :class:`swh.loader.core.loader.DVCSLoader` for DVCS loaders (e.g. hg, git, ...) - :class:`swh.loader.package.loader.PackageLoader` for Package loaders (e.g. PyPI, Npm, ...) Package loaders --------------- This package also implements many package loaders directly, out of convenience, as they usually are quite similar and each fits in a single file. They all roughly follow these steps, explained in the :py:meth:`swh.loader.package.loader.PackageLoader.load` documentation. See the :ref:`package-loader-tutorial` for details. VCS loaders ----------- Unlike package loaders, VCS loaders remain in separate packages, as they often need more advanced conversions and very VCS-specific operations. 
This usually involves getting the branches of a repository and recursively loading revisions in the history (and directory trees in these revisions), until a known revision is found diff --git a/swh.loader.core.egg-info/SOURCES.txt b/swh.loader.core.egg-info/SOURCES.txt index c2042ba..0bbd56c 100644 --- a/swh.loader.core.egg-info/SOURCES.txt +++ b/swh.loader.core.egg-info/SOURCES.txt @@ -1,238 +1,287 @@ .git-blame-ignore-revs .gitignore .pre-commit-config.yaml AUTHORS CODE_OF_CONDUCT.md CONTRIBUTORS LICENSE MANIFEST.in Makefile README.rst conftest.py mypy.ini pyproject.toml pytest.ini requirements-swh.txt requirements-test.txt requirements.txt setup.cfg setup.py tox.ini docs/.gitignore docs/Makefile docs/README.rst docs/cli.rst docs/conf.py docs/index.rst docs/package-loader-specifications.rst docs/package-loader-tutorial.rst docs/vcs-loader-overview.rst docs/_static/.placeholder docs/_templates/.placeholder swh/__init__.py swh.loader.core.egg-info/PKG-INFO swh.loader.core.egg-info/SOURCES.txt swh.loader.core.egg-info/dependency_links.txt swh.loader.core.egg-info/entry_points.txt swh.loader.core.egg-info/requires.txt swh.loader.core.egg-info/top_level.txt swh/loader/__init__.py swh/loader/cli.py swh/loader/exception.py swh/loader/pytest_plugin.py swh/loader/core/__init__.py swh/loader/core/converters.py swh/loader/core/loader.py swh/loader/core/metadata_fetchers.py swh/loader/core/py.typed swh/loader/core/utils.py swh/loader/core/tests/__init__.py swh/loader/core/tests/test_converters.py swh/loader/core/tests/test_loader.py swh/loader/core/tests/test_utils.py swh/loader/package/__init__.py swh/loader/package/loader.py swh/loader/package/py.typed swh/loader/package/utils.py +swh/loader/package/arch/__init__.py +swh/loader/package/arch/loader.py +swh/loader/package/arch/tasks.py +swh/loader/package/arch/tests/__init__.py +swh/loader/package/arch/tests/test_arch.py +swh/loader/package/arch/tests/test_tasks.py +swh/loader/package/arch/tests/data/fake_arch.sh +swh/loader/package/arch/tests/data/https_archive.archlinux.org/packages_d_dialog_dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz +swh/loader/package/arch/tests/data/https_archive.archlinux.org/packages_d_dialog_dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst +swh/loader/package/arch/tests/data/https_uk.mirror.archlinuxarm.org/aarch64_core_gzip-1.12-1-aarch64.pkg.tar.xz swh/loader/package/archive/__init__.py swh/loader/package/archive/loader.py swh/loader/package/archive/tasks.py swh/loader/package/archive/tests/__init__.py swh/loader/package/archive/tests/test_archive.py swh/loader/package/archive/tests/test_tasks.py swh/loader/package/archive/tests/data/not_gzipped_tarball.tar.gz swh/loader/package/archive/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz swh/loader/package/archive/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit1 swh/loader/package/archive/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit2 swh/loader/package/archive/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.2.0.tar.gz +swh/loader/package/aur/__init__.py +swh/loader/package/aur/loader.py +swh/loader/package/aur/tasks.py +swh/loader/package/aur/tests/__init__.py +swh/loader/package/aur/tests/test_aur.py +swh/loader/package/aur/tests/test_tasks.py +swh/loader/package/aur/tests/data/fake_aur.sh +swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_a-fake-one.tar.gz +swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_hg-evolve.tar.gz 
+swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_ibus-git.tar.gz +swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_libervia-web-hg.tar.gz +swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_tealdeer-git.tar.gz swh/loader/package/cran/__init__.py swh/loader/package/cran/loader.py swh/loader/package/cran/tasks.py swh/loader/package/cran/tests/__init__.py swh/loader/package/cran/tests/test_cran.py swh/loader/package/cran/tests/test_tasks.py swh/loader/package/cran/tests/data/description/KnownBR swh/loader/package/cran/tests/data/description/acepack swh/loader/package/cran/tests/data/https_cran.r-project.org/src_contrib_1.4.0_Recommended_KernSmooth_2.22-6.tar.gz swh/loader/package/crates/__init__.py swh/loader/package/crates/loader.py swh/loader/package/crates/tasks.py swh/loader/package/crates/tests/__init__.py swh/loader/package/crates/tests/test_crates.py swh/loader/package/crates/tests/test_tasks.py swh/loader/package/crates/tests/data/fake_crates.sh swh/loader/package/crates/tests/data/https_crates.io/api_v1_crates_hg-core swh/loader/package/crates/tests/data/https_crates.io/api_v1_crates_micro-timer swh/loader/package/crates/tests/data/https_static.crates.io/crates_hg-core_hg-core-0.0.1.crate swh/loader/package/crates/tests/data/https_static.crates.io/crates_micro-timer_micro-timer-0.1.0.crate swh/loader/package/crates/tests/data/https_static.crates.io/crates_micro-timer_micro-timer-0.1.1.crate swh/loader/package/crates/tests/data/https_static.crates.io/crates_micro-timer_micro-timer-0.1.2.crate swh/loader/package/crates/tests/data/https_static.crates.io/crates_micro-timer_micro-timer-0.2.0.crate swh/loader/package/crates/tests/data/https_static.crates.io/crates_micro-timer_micro-timer-0.2.1.crate swh/loader/package/crates/tests/data/https_static.crates.io/crates_micro-timer_micro-timer-0.3.0.crate swh/loader/package/crates/tests/data/https_static.crates.io/crates_micro-timer_micro-timer-0.3.1.crate swh/loader/package/crates/tests/data/https_static.crates.io/crates_micro-timer_micro-timer-0.4.0.crate swh/loader/package/debian/__init__.py swh/loader/package/debian/loader.py swh/loader/package/debian/tasks.py swh/loader/package/debian/tests/__init__.py swh/loader/package/debian/tests/test_debian.py swh/loader/package/debian/tests/test_tasks.py swh/loader/package/debian/tests/data/http_deb.debian.org/debian_pool_contrib_c_cicero_cicero_0.7.2-3.diff.gz swh/loader/package/debian/tests/data/http_deb.debian.org/debian_pool_contrib_c_cicero_cicero_0.7.2-3.dsc swh/loader/package/debian/tests/data/http_deb.debian.org/debian_pool_contrib_c_cicero_cicero_0.7.2-4.diff.gz swh/loader/package/debian/tests/data/http_deb.debian.org/debian_pool_contrib_c_cicero_cicero_0.7.2-4.dsc swh/loader/package/debian/tests/data/http_deb.debian.org/debian_pool_contrib_c_cicero_cicero_0.7.2.orig.tar.gz swh/loader/package/debian/tests/data/http_deb.debian.org/onefile.txt swh/loader/package/deposit/__init__.py swh/loader/package/deposit/loader.py swh/loader/package/deposit/tasks.py swh/loader/package/deposit/tests/__init__.py swh/loader/package/deposit/tests/conftest.py swh/loader/package/deposit/tests/test_deposit.py swh/loader/package/deposit/tests/test_tasks.py swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_666_meta swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_666_raw 
swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_777_meta swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_777_raw swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_888_meta swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_888_raw swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_999_meta swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_999_raw swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello-2.10.zip swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello-2.12.tar.gz swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.10.json swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.11.json swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.12.json swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.13.json +swh/loader/package/golang/__init__.py +swh/loader/package/golang/loader.py +swh/loader/package/golang/tasks.py +swh/loader/package/golang/tests/__init__.py +swh/loader/package/golang/tests/test_golang.py +swh/loader/package/golang/tests/test_tasks.py +swh/loader/package/golang/tests/data/https_proxy.golang.org/example.com_basic-go-module_@latest +swh/loader/package/golang/tests/data/https_proxy.golang.org/example.com_basic-go-module_@v_list +swh/loader/package/golang/tests/data/https_proxy.golang.org/example.com_basic-go-module_@v_v0.1.3.info +swh/loader/package/golang/tests/data/https_proxy.golang.org/example.com_basic-go-module_@v_v0.1.3.zip swh/loader/package/maven/__init__.py swh/loader/package/maven/loader.py swh/loader/package/maven/tasks.py swh/loader/package/maven/tests/__init__.py swh/loader/package/maven/tests/test_maven.py swh/loader/package/maven/tests/test_tasks.py swh/loader/package/maven/tests/data/https_maven.org/sprova4j-0.1.0-sources.jar swh/loader/package/maven/tests/data/https_maven.org/sprova4j-0.1.0.pom swh/loader/package/maven/tests/data/https_maven.org/sprova4j-0.1.1-sources.jar swh/loader/package/maven/tests/data/https_maven.org/sprova4j-0.1.1.pom swh/loader/package/nixguix/__init__.py swh/loader/package/nixguix/loader.py swh/loader/package/nixguix/tasks.py swh/loader/package/nixguix/tests/__init__.py swh/loader/package/nixguix/tests/conftest.py swh/loader/package/nixguix/tests/test_nixguix.py swh/loader/package/nixguix/tests/test_tasks.py swh/loader/package/nixguix/tests/data/https_example.com/file.txt swh/loader/package/nixguix/tests/data/https_fail.com/truncated-archive.tgz swh/loader/package/nixguix/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz swh/loader/package/nixguix/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit1 swh/loader/package/nixguix/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit2 swh/loader/package/nixguix/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.2.0.tar.gz swh/loader/package/nixguix/tests/data/https_github.com/owner-1_repository-1_revision-1.tgz swh/loader/package/nixguix/tests/data/https_github.com/owner-2_repository-1_revision-1.tgz swh/loader/package/nixguix/tests/data/https_github.com/owner-3_repository-1_revision-1.tgz swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources-EOFError.json swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources.json 
swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources.json_visit1 swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources_special.json swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources_special.json_visit1 swh/loader/package/npm/__init__.py swh/loader/package/npm/loader.py swh/loader/package/npm/tasks.py swh/loader/package/npm/tests/__init__.py swh/loader/package/npm/tests/test_npm.py swh/loader/package/npm/tests/test_tasks.py swh/loader/package/npm/tests/data/https_registry.npmjs.org/@aller_shared_-_shared-0.1.0.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/@aller_shared_-_shared-0.1.1-alpha.14.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/jammit-express_-_jammit-express-0.0.1.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/nativescript-telerik-analytics_-_nativescript-telerik-analytics-1.0.0.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.0.2.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.0.3-beta.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.0.3.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.0.4.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.0.5.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.1.0.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.2.0.tgz swh/loader/package/npm/tests/data/https_replicate.npmjs.com/@aller_shared swh/loader/package/npm/tests/data/https_replicate.npmjs.com/catify swh/loader/package/npm/tests/data/https_replicate.npmjs.com/jammit-express swh/loader/package/npm/tests/data/https_replicate.npmjs.com/jammit-no-time swh/loader/package/npm/tests/data/https_replicate.npmjs.com/nativescript-telerik-analytics swh/loader/package/npm/tests/data/https_replicate.npmjs.com/org swh/loader/package/npm/tests/data/https_replicate.npmjs.com/org_version_mismatch swh/loader/package/npm/tests/data/https_replicate.npmjs.com/org_visit1 swh/loader/package/opam/__init__.py swh/loader/package/opam/loader.py swh/loader/package/opam/tasks.py swh/loader/package/opam/tests/__init__.py swh/loader/package/opam/tests/test_opam.py swh/loader/package/opam/tests/test_tasks.py swh/loader/package/opam/tests/data/fake_opam_repo/_repo swh/loader/package/opam/tests/data/fake_opam_repo/version swh/loader/package/opam/tests/data/fake_opam_repo/repo/loadertest/lock swh/loader/package/opam/tests/data/fake_opam_repo/repo/loadertest/repos-config swh/loader/package/opam/tests/data/fake_opam_repo/repo/loadertest/packages/agrid/agrid.0.1/opam swh/loader/package/opam/tests/data/fake_opam_repo/repo/loadertest/packages/directories/directories.0.1/opam swh/loader/package/opam/tests/data/fake_opam_repo/repo/loadertest/packages/directories/directories.0.2/opam swh/loader/package/opam/tests/data/fake_opam_repo/repo/loadertest/packages/directories/directories.0.3/opam swh/loader/package/opam/tests/data/fake_opam_repo/repo/loadertest/packages/ocb/ocb.0.1/opam swh/loader/package/opam/tests/data/https_github.com/OCamlPro_agrid_archive_0.1.tar.gz swh/loader/package/opam/tests/data/https_github.com/OCamlPro_directories_archive_0.1.tar.gz swh/loader/package/opam/tests/data/https_github.com/OCamlPro_directories_archive_0.2.tar.gz swh/loader/package/opam/tests/data/https_github.com/OCamlPro_directories_archive_0.3.tar.gz 
swh/loader/package/opam/tests/data/https_github.com/OCamlPro_ocb_archive_0.1.tar.gz +swh/loader/package/pubdev/__init__.py +swh/loader/package/pubdev/loader.py +swh/loader/package/pubdev/tasks.py +swh/loader/package/pubdev/tests/__init__.py +swh/loader/package/pubdev/tests/test_pubdev.py +swh/loader/package/pubdev/tests/test_tasks.py +swh/loader/package/pubdev/tests/data/fake_pubdev.sh +swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_Autolinker_versions_0.1.1.tar.gz +swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_authentication_versions_0.0.1.tar.gz +swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_bezier_versions_1.1.5.tar.gz +swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_pdf_versions_1.0.0.tar.gz +swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_pdf_versions_3.8.2.tar.gz +swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_Autolinker +swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_abstract_io +swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_authentication +swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_bezier +swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_pdf swh/loader/package/pypi/__init__.py swh/loader/package/pypi/loader.py swh/loader/package/pypi/tasks.py swh/loader/package/pypi/tests/__init__.py swh/loader/package/pypi/tests/test_pypi.py swh/loader/package/pypi/tests/test_tasks.py swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/0805nexter-1.1.0.tar.gz swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/0805nexter-1.1.0.zip swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/0805nexter-1.2.0.zip swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/0805nexter-1.3.0.zip swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/0805nexter-1.4.0.zip swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/nexter-1.1.0.tar.gz swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/nexter-1.1.0.zip swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/packages_70_97_c49fb8ec24a7aaab54c3dbfbb5a6ca1431419d9ee0f6c363d9ad01d2b8b1_0805nexter-1.3.0.zip swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/packages_86_10_c9555ec63106153aaaad753a281ff47f4ac79e980ff7f5d740d6649cd56a_upymenu-0.0.1.tar.gz swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/packages_c4_a0_4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4_0805nexter-1.2.0.zip swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/packages_c4_a0_4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4_0805nexter-1.2.0.zip_visit1 swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/packages_ec_65_c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d_0805nexter-1.1.0.zip swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/packages_ec_65_c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d_0805nexter-1.1.0.zip_visit1 swh/loader/package/pypi/tests/data/https_pypi.org/pypi_0805nexter_json swh/loader/package/pypi/tests/data/https_pypi.org/pypi_0805nexter_json_visit1 swh/loader/package/pypi/tests/data/https_pypi.org/pypi_nexter_json swh/loader/package/pypi/tests/data/https_pypi.org/pypi_upymenu_json swh/loader/package/tests/__init__.py swh/loader/package/tests/common.py swh/loader/package/tests/test_conftest.py swh/loader/package/tests/test_loader.py 
swh/loader/package/tests/test_loader_metadata.py swh/loader/package/tests/test_utils.py swh/loader/tests/__init__.py swh/loader/tests/conftest.py swh/loader/tests/py.typed swh/loader/tests/test_cli.py swh/loader/tests/test_init.py swh/loader/tests/data/0805nexter-1.1.0.tar.gz \ No newline at end of file diff --git a/swh.loader.core.egg-info/entry_points.txt b/swh.loader.core.egg-info/entry_points.txt index 2c2ca76..018531c 100644 --- a/swh.loader.core.egg-info/entry_points.txt +++ b/swh.loader.core.egg-info/entry_points.txt @@ -1,14 +1,18 @@ [swh.cli.subcommands] loader = swh.loader.cli [swh.workers] +loader.arch = swh.loader.package.arch:register loader.archive = swh.loader.package.archive:register +loader.aur = swh.loader.package.aur:register loader.cran = swh.loader.package.cran:register loader.crates = swh.loader.package.crates:register loader.debian = swh.loader.package.debian:register loader.deposit = swh.loader.package.deposit:register +loader.golang = swh.loader.package.golang:register loader.maven = swh.loader.package.maven:register loader.nixguix = swh.loader.package.nixguix:register loader.npm = swh.loader.package.npm:register loader.opam = swh.loader.package.opam:register +loader.pubdev = swh.loader.package.pubdev:register loader.pypi = swh.loader.package.pypi:register diff --git a/swh.loader.core.egg-info/requires.txt b/swh.loader.core.egg-info/requires.txt index 3aebd1f..fe872fe 100644 --- a/swh.loader.core.egg-info/requires.txt +++ b/swh.loader.core.egg-info/requires.txt @@ -1,25 +1,25 @@ psutil requests iso8601 pkginfo python-debian python-dateutil typing-extensions toml -swh.core>=0.3 +swh.core>=2.12 swh.model>=4.4.0 swh.objstorage>=0.2.2 swh.scheduler>=0.4.0 swh.storage>=0.29.0 [testing] pytest pytest-mock requests_mock swh-core[testing] swh-scheduler[testing]>=0.5.0 swh-storage[testing]>=0.10.6 types-click types-python-dateutil types-pyyaml types-requests diff --git a/swh/loader/core/loader.py b/swh/loader/core/loader.py index dcb74bc..30e30f6 100644 --- a/swh/loader/core/loader.py +++ b/swh/loader/core/loader.py @@ -1,615 +1,636 @@ # Copyright (C) 2015-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import hashlib import logging import os import time from typing import Any, ContextManager, Dict, Iterable, List, Optional, Union import sentry_sdk from swh.core.config import load_from_envvar from swh.core.statsd import Statsd from swh.loader.core.metadata_fetchers import CredentialsType, get_fetchers_for_lister from swh.loader.exception import NotFound from swh.model.model import ( BaseContent, Content, Directory, Origin, OriginVisit, OriginVisitStatus, RawExtrinsicMetadata, Release, Revision, Sha1Git, SkippedContent, Snapshot, ) from swh.storage import get_storage from swh.storage.interface import StorageInterface from swh.storage.utils import now DEFAULT_CONFIG: Dict[str, Any] = { "max_content_size": 100 * 1024 * 1024, } class BaseLoader: """Base class for (D)VCS loaders (e.g Svn, Git, Mercurial, ...) or PackageLoader (e.g PyPI, Npm, CRAN, ...) A loader retrieves origin information (git/mercurial/svn repositories, pypi/npm/... package artifacts), ingests the contents/directories/revisions/releases/snapshot read from those artifacts and send them to the archive through the storage backend. The main entry point for the loader is the :func:`load` function. 
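As an illustration, here is a minimal sketch of a subclass wiring these pieces together (``OneShotLoader``, its visit type and payload are hypothetical, not part of this module)::

    from swh.loader.core.loader import BaseLoader

    class OneShotLoader(BaseLoader):
        # Fetch a single payload, post-process it, then store it.

        visit_type = "one-shot"  # hypothetical visit type

        def prepare(self) -> None:
            self.payload = b""

        def fetch_data(self) -> bool:
            # a real loader would download from self.origin.url here
            self.payload = b"raw bytes"
            return False  # nothing left to fetch

        def process_data(self) -> bool:
            # optional hook, run between fetch_data() and store_data()
            self.payload = self.payload.upper()
            return True

        def store_data(self) -> None:
            # a real loader would send Content/Directory/Snapshot
            # objects to self.storage here
            pass

        def cleanup(self) -> None:
            pass

    # loader = OneShotLoader(storage, origin_url="https://example.org/proj")
    # loader.load()  # -> {"status": "eventful"}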
Two class methods (:func:`from_config`, :func:`from_configfile`) centralize and ease loader instantiation from either a configuration dict or a configuration file. Some class examples: - :class:`SvnLoader` - :class:`GitLoader` - :class:`PyPILoader` - :class:`NpmLoader` Args: lister_name: Name of the lister which triggered this load. If provided, the loader will try to use the forge's API to retrieve extrinsic metadata lister_instance_name: Name of the lister instance which triggered this load. Must be None iff lister_name is, but it may be the empty string for listers with a single instance. """ visit_type: str origin: Origin loaded_snapshot_id: Optional[Sha1Git] parent_origins: Optional[List[Origin]] """If the given origin is a "forge fork" (i.e. created with the "Fork" button of GitHub-like forges), :meth:`build_extrinsic_origin_metadata` sets this to a list of origins it was forked from; closest parent first.""" def __init__( self, storage: StorageInterface, origin_url: str, logging_class: Optional[str] = None, save_data_path: Optional[str] = None, max_content_size: Optional[int] = None, lister_name: Optional[str] = None, lister_instance_name: Optional[str] = None, metadata_fetcher_credentials: CredentialsType = None, ): if lister_name == "": raise ValueError("lister_name must not be the empty string") if lister_name is None and lister_instance_name is not None: raise ValueError( f"lister_name is None but lister_instance_name is {lister_instance_name!r}" ) if lister_name is not None and lister_instance_name is None: raise ValueError( f"lister_instance_name is None but lister_name is {lister_name!r}" ) self.storage = storage self.origin = Origin(url=origin_url) self.max_content_size = int(max_content_size) if max_content_size else None self.lister_name = lister_name self.lister_instance_name = lister_instance_name self.metadata_fetcher_credentials = metadata_fetcher_credentials or {} if logging_class is None: logging_class = "%s.%s" % ( self.__class__.__module__, self.__class__.__name__, ) self.log = logging.getLogger(logging_class) _log = logging.getLogger("requests.packages.urllib3.connectionpool") _log.setLevel(logging.WARN) # possibly overridden in self.prepare method self.visit_date = datetime.datetime.now(tz=datetime.timezone.utc) self.loaded_snapshot_id = None if save_data_path: path = save_data_path os.stat(path) if not os.access(path, os.R_OK | os.W_OK): raise PermissionError("Permission denied: %r" % path) self.save_data_path = save_data_path self.parent_origins = None self.statsd = Statsd( namespace="swh_loader", constant_tags={"visit_type": self.visit_type} ) @classmethod def from_config(cls, storage: Dict[str, Any], **config: Any): """Instantiate a loader from a configuration dict. This is basically a backwards-compatibility shim for the CLI. Args: storage: instantiation config for the storage config: the configuration dict for the loader, with the following keys: - credentials (optional): credentials list for the scheduler - any other kwargs passed to the loader. Returns: the instantiated loader """ # Drop the legacy config keys which aren't used for this generation of loader. for legacy_key in ("storage", "celery"): config.pop(legacy_key, None) # Instantiate the storage storage_instance = get_storage(**storage) return cls(storage=storage_instance, **config) @classmethod def from_configfile(cls, **kwargs: Any): """Instantiate a loader from the configuration loaded from the SWH_CONFIG_FILENAME envvar, with potential extra keyword arguments if their value is not None.
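For example, a sketch equivalent to what the CLI does (``{"cls": "memory"}`` selects swh-storage's in-memory backend, suitable only for tests; ``OneShotLoader`` is the hypothetical subclass sketched above)::

    loader = OneShotLoader.from_config(
        storage={"cls": "memory"},  # passed through to swh.storage.get_storage()
        origin_url="https://example.org/proj",
    )
    result = loader.load()

:func:`from_configfile` builds the same keyword arguments from the YAML file named by the ``SWH_CONFIG_FILENAME`` environment variable, then delegates to :func:`from_config`.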
Args: kwargs: kwargs passed to the loader instantiation """ config = dict(load_from_envvar(DEFAULT_CONFIG)) config.update({k: v for k, v in kwargs.items() if v is not None}) return cls.from_config(**config) def save_data(self) -> None: """Save the data associated with the current load""" raise NotImplementedError def get_save_data_path(self) -> str: """The path to which we archive the loader's raw data""" if not hasattr(self, "__save_data_path"): year = str(self.visit_date.year) assert self.origin url = self.origin.url.encode("utf-8") origin_url_hash = hashlib.sha1(url).hexdigest() path = "%s/sha1:%s/%s/%s" % ( self.save_data_path, origin_url_hash[0:2], origin_url_hash, year, ) os.makedirs(path, exist_ok=True) self.__save_data_path = path return self.__save_data_path def flush(self) -> Dict[str, int]: """Flush any potential buffered data not sent to swh-storage. Returns the same value as :meth:`swh.storage.interface.StorageInterface.flush`. """ return self.storage.flush() def cleanup(self) -> None: """Last step executed by the loader.""" raise NotImplementedError def _store_origin_visit(self) -> None: """Store origin and visit references. Sets the self.visit references.""" assert self.origin self.storage.origin_add([self.origin]) assert isinstance(self.visit_type, str) self.visit = list( self.storage.origin_visit_add( [ OriginVisit( origin=self.origin.url, date=self.visit_date, type=self.visit_type, ) ] ) )[0] def prepare(self) -> None: """Second step executed by the loader to prepare some state needed by the loader. Raises NotFound exception if the origin to ingest is not found. """ raise NotImplementedError def get_origin(self) -> Origin: """Get the origin that is currently being loaded. self.origin is set by the constructor from the ``origin_url`` argument. Returns: Origin: the origin ready to be sent to storage by :func:`origin_add`. """ assert self.origin return self.origin def fetch_data(self) -> bool: """Fetch the data from the source the loader is currently loading (ex: git/hg/svn/... repository). Returns: a value that is interpreted as a boolean. If True, fetch_data needs to be called again to complete loading. """ raise NotImplementedError + def process_data(self) -> bool: + """Run any additional processing between fetching and storing the data. + + Returns: + a value that is interpreted as a boolean. If True, fetch_data needs + to be called again to complete loading. + Ignored if ``fetch_data`` already returned :const:`False`. + """ + return True + def store_data(self): """Store fetched data in the database. Should call the :func:`maybe_load_xyz` methods, which handle the bundles sent to storage, rather than sending them directly. """ raise NotImplementedError def load_status(self) -> Dict[str, str]: """Detailed loading status. Defaults to logging an eventful load. Returns: a dictionary that is eventually passed back as the task's result to the scheduler, allowing tuning of the task recurrence mechanism. """ return { "status": "eventful", } def post_load(self, success: bool = True) -> None: """Permit the loader to perform additional actions after the loading is done, according to its status. The ``success`` flag indicates the loading's status. Defaults to doing nothing. It is up to the implementer of this method to make sure this does not break. Args: success (bool): the success status of the loading """ pass def visit_status(self) -> str: """Detailed visit status. Defaults to logging a full visit. """ return "full" def pre_cleanup(self) -> None: """As a first step, will try to check for dangling data to clean up.
This should do its best to avoid raising issues. """ pass def load(self) -> Dict[str, str]: r"""Loading logic for the loader to follow: - Store the actual ``origin_visit`` to storage - Call :meth:`prepare` to prepare any eventual state - Call :meth:`get_origin` to get the origin we work with and store - while True: - Call :meth:`fetch_data` to fetch the data to store + - Call :meth:`process_data` to optionally run processing between + :meth:`fetch_data` and :meth:`store_data` - Call :meth:`store_data` to store the data - Call :meth:`cleanup` to clean up any eventual state put in place in :meth:`prepare` method. """ try: with self.statsd_timed("pre_cleanup"): self.pre_cleanup() except Exception: msg = "Cleaning up dangling data failed! Continue loading." self.log.warning(msg) sentry_sdk.capture_exception() self._store_origin_visit() assert ( self.visit.visit ), "The method `_store_origin_visit` should set the visit (OriginVisit)" self.log.info( "Load origin '%s' with type '%s'", self.origin.url, self.visit.type ) try: with self.statsd_timed("build_extrinsic_origin_metadata"): metadata = self.build_extrinsic_origin_metadata() self.load_metadata_objects(metadata) except Exception as e: sentry_sdk.capture_exception(e) # Do not fail the whole task if this is the only failure self.log.exception( "Failure while loading extrinsic origin metadata.", extra={ "swh_task_args": [], "swh_task_kwargs": { "origin": self.origin.url, "lister_name": self.lister_name, "lister_instance_name": self.lister_instance_name, }, }, ) total_time_fetch_data = 0.0 + total_time_process_data = 0.0 total_time_store_data = 0.0 + # Initially not a success, will be True when actually one + status = "failed" + success = False + try: - # Initially not a success, will be True when actually one - success = False with self.statsd_timed("prepare"): self.prepare() while True: t1 = time.monotonic() more_data_to_fetch = self.fetch_data() t2 = time.monotonic() total_time_fetch_data += t2 - t1 - self.store_data() + + more_data_to_fetch = self.process_data() and more_data_to_fetch t3 = time.monotonic() - total_time_store_data += t3 - t2 + total_time_process_data += t3 - t2 + + self.store_data() + t4 = time.monotonic() + total_time_store_data += t4 - t3 if not more_data_to_fetch: break self.statsd_timing("fetch_data", total_time_fetch_data * 1000.0) + self.statsd_timing("process_data", total_time_process_data * 1000.0) self.statsd_timing("store_data", total_time_store_data * 1000.0) status = self.visit_status() visit_status = OriginVisitStatus( origin=self.origin.url, visit=self.visit.visit, type=self.visit_type, date=now(), status=status, snapshot=self.loaded_snapshot_id, ) self.storage.origin_visit_status_add([visit_status]) success = True with self.statsd_timed( "post_load", tags={"success": success, "status": status} ): self.post_load() except BaseException as e: success = False if isinstance(e, NotFound): status = "not_found" task_status = "uneventful" else: status = "partial" if self.loaded_snapshot_id else "failed" task_status = "failed" self.log.exception( "Loading failure, updating to `%s` status", status, extra={ "swh_task_args": [], "swh_task_kwargs": { "origin": self.origin.url, "lister_name": self.lister_name, "lister_instance_name": self.lister_instance_name, }, }, ) if not isinstance(e, (SystemExit, KeyboardInterrupt)): sentry_sdk.capture_exception() visit_status = OriginVisitStatus( origin=self.origin.url, visit=self.visit.visit, type=self.visit_type, date=now(), status=status, snapshot=self.loaded_snapshot_id, ) 
self.storage.origin_visit_status_add([visit_status]) with self.statsd_timed( "post_load", tags={"success": success, "status": status} ): self.post_load(success=success) if not isinstance(e, Exception): # e derives from BaseException but not Exception; this is most likely # SystemExit or KeyboardInterrupt, so we should re-raise it. raise return {"status": task_status} finally: with self.statsd_timed( "flush", tags={"success": success, "status": status} ): self.flush() with self.statsd_timed( "cleanup", tags={"success": success, "status": status} ): self.cleanup() return self.load_status() def load_metadata_objects( self, metadata_objects: List[RawExtrinsicMetadata] ) -> None: if not metadata_objects: return authorities = {mo.authority for mo in metadata_objects} self.storage.metadata_authority_add(list(authorities)) fetchers = {mo.fetcher for mo in metadata_objects} self.storage.metadata_fetcher_add(list(fetchers)) self.storage.raw_extrinsic_metadata_add(metadata_objects) def build_extrinsic_origin_metadata(self) -> List[RawExtrinsicMetadata]: """Builds a list of full RawExtrinsicMetadata objects, using the metadata fetchers returned by :func:`get_fetchers_for_lister`.""" if self.lister_name is None: self.log.debug("lister_name not provided, skipping extrinsic origin metadata") return [] assert ( self.lister_instance_name is not None ), "lister_instance_name is None, but lister_name is not" metadata = [] fetcher_classes = get_fetchers_for_lister(self.lister_name) self.statsd_average("metadata_fetchers", len(fetcher_classes)) for cls in fetcher_classes: metadata_fetcher = cls( origin=self.origin, lister_name=self.lister_name, lister_instance_name=self.lister_instance_name, credentials=self.metadata_fetcher_credentials, ) with self.statsd_timed( "fetch_one_metadata", tags={"fetcher": cls.FETCHER_NAME} ): metadata.extend(metadata_fetcher.get_origin_metadata()) if self.parent_origins is None: self.parent_origins = metadata_fetcher.get_parent_origins() self.statsd_average( "metadata_parent_origins", len(self.parent_origins), tags={"fetcher": cls.FETCHER_NAME}, ) self.statsd_average("metadata_objects", len(metadata)) return metadata def statsd_timed(self, name: str, tags: Dict[str, Any] = {}) -> ContextManager: """ Wrapper for :meth:`swh.core.statsd.Statsd.timed`, which uses the standard metric name and tags for loaders. """ return self.statsd.timed( "operation_duration_seconds", tags={"operation": name, **tags} ) def statsd_timing(self, name: str, value: float, tags: Dict[str, Any] = {}) -> None: """ Wrapper for :meth:`swh.core.statsd.Statsd.timing`, which uses the standard metric name and tags for loaders. """ self.statsd.timing( "operation_duration_seconds", value, tags={"operation": name, **tags} ) def statsd_average( self, name: str, value: Union[int, float], tags: Dict[str, Any] = {} ) -> None: """Increments both ``{name}_sum`` (by the ``value``) and ``{name}_count`` (by ``1``), allowing Prometheus to compute the average ``value`` over time.""" self.statsd.increment(f"{name}_sum", value, tags=tags) self.statsd.increment(f"{name}_count", tags=tags) class DVCSLoader(BaseLoader): """This base class is a pattern for DVCS loaders (e.g. git, mercurial) that are able to load all the data in one go, such as the :class:`BulkUpdater` loader defined in swh-loader-git. Other, stateful loaders (e.g. :class:`SWHSvnLoader`) should inherit directly from :class:`BaseLoader`.
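A sketch of the pattern (hypothetical names; compare ``DummyDVCSLoader`` in the tests below)::

    class TrivialDVCSLoader(DVCSLoader):
        visit_type = "trivial"  # hypothetical

        def prepare(self) -> None:
            pass  # no state to set up

        def fetch_data(self) -> bool:
            # a real DVCS loader would fetch the whole repository here
            return False

        def get_contents(self):
            return []

        def get_directories(self):
            return []

        def get_revisions(self):
            return []

        def get_releases(self):
            return []

        def get_snapshot(self) -> Snapshot:
            return Snapshot(branches={})

The inherited :meth:`store_data` then sends these objects and the snapshot to storage in one go.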
""" def cleanup(self) -> None: """Clean up an eventual state installed for computations.""" pass def has_contents(self) -> bool: """Checks whether we need to load contents""" return True def get_contents(self) -> Iterable[BaseContent]: """Get the contents that need to be loaded""" raise NotImplementedError def has_directories(self) -> bool: """Checks whether we need to load directories""" return True def get_directories(self) -> Iterable[Directory]: """Get the directories that need to be loaded""" raise NotImplementedError def has_revisions(self) -> bool: """Checks whether we need to load revisions""" return True def get_revisions(self) -> Iterable[Revision]: """Get the revisions that need to be loaded""" raise NotImplementedError def has_releases(self) -> bool: """Checks whether we need to load releases""" return True def get_releases(self) -> Iterable[Release]: """Get the releases that need to be loaded""" raise NotImplementedError def get_snapshot(self) -> Snapshot: """Get the snapshot that needs to be loaded""" raise NotImplementedError def eventful(self) -> bool: """Whether the load was eventful""" raise NotImplementedError def store_data(self) -> None: assert self.origin if self.save_data_path: self.save_data() if self.has_contents(): for obj in self.get_contents(): if isinstance(obj, Content): self.storage.content_add([obj]) elif isinstance(obj, SkippedContent): self.storage.skipped_content_add([obj]) else: raise TypeError(f"Unexpected content type: {obj}") if self.has_directories(): for directory in self.get_directories(): self.storage.directory_add([directory]) if self.has_revisions(): for revision in self.get_revisions(): self.storage.revision_add([revision]) if self.has_releases(): for release in self.get_releases(): self.storage.release_add([release]) snapshot = self.get_snapshot() self.storage.snapshot_add([snapshot]) self.flush() self.loaded_snapshot_id = snapshot.id diff --git a/swh/loader/core/tests/test_loader.py b/swh/loader/core/tests/test_loader.py index 98cff64..dacec8b 100644 --- a/swh/loader/core/tests/test_loader.py +++ b/swh/loader/core/tests/test_loader.py @@ -1,480 +1,481 @@ # Copyright (C) 2018-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import hashlib import logging import time from unittest.mock import MagicMock, call import pytest from swh.loader.core.loader import BaseLoader, DVCSLoader from swh.loader.core.metadata_fetchers import MetadataFetcherProtocol from swh.loader.exception import NotFound from swh.loader.tests import assert_last_visit_matches from swh.model.hashutil import hash_to_bytes from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, MetadataFetcher, Origin, RawExtrinsicMetadata, Snapshot, ) import swh.storage.exc ORIGIN = Origin(url="some-url") PARENT_ORIGIN = Origin(url="base-origin-url") METADATA_AUTHORITY = MetadataAuthority( type=MetadataAuthorityType.FORGE, url="http://example.org/" ) REMD = RawExtrinsicMetadata( target=ORIGIN.swhid(), discovery_date=datetime.datetime.now(tz=datetime.timezone.utc), authority=METADATA_AUTHORITY, fetcher=MetadataFetcher( name="test fetcher", version="0.0.1", ), format="test-format", metadata=b'{"foo": "bar"}', ) class DummyLoader: """Base Loader to overload and simplify the base class (technical: to avoid repetition in other *Loader classes)""" visit_type = "git" def __init__(self, storage, 
*args, **kwargs): super().__init__(storage, ORIGIN.url, *args, **kwargs) def cleanup(self): pass def prepare(self, *args, **kwargs): pass def fetch_data(self): pass def get_snapshot_id(self): return None class DummyDVCSLoader(DummyLoader, DVCSLoader): """DVCS Loader that does nothing in regards to DAG objects.""" def get_contents(self): return [] def get_directories(self): return [] def get_revisions(self): return [] def get_releases(self): return [] def get_snapshot(self): return Snapshot(branches={}) def eventful(self): return False class DummyBaseLoader(DummyLoader, BaseLoader): """Buffered loader will send new data when threshold is reached""" def store_data(self): pass class DummyMetadataFetcher: SUPPORTED_LISTERS = {"fake-forge"} FETCHER_NAME = "fake-forge" def __init__(self, origin, credentials, lister_name, lister_instance_name): pass def get_origin_metadata(self): return [REMD] def get_parent_origins(self): return [] class DummyMetadataFetcherWithFork: SUPPORTED_LISTERS = {"fake-forge"} FETCHER_NAME = "fake-forge" def __init__(self, origin, credentials, lister_name, lister_instance_name): pass def get_origin_metadata(self): return [REMD] def get_parent_origins(self): return [PARENT_ORIGIN] def test_types(): assert isinstance( DummyMetadataFetcher(None, None, None, None), MetadataFetcherProtocol ) assert isinstance( DummyMetadataFetcherWithFork(None, None, None, None), MetadataFetcherProtocol ) def test_base_loader(swh_storage): loader = DummyBaseLoader(swh_storage) result = loader.load() assert result == {"status": "eventful"} def test_base_loader_with_config(swh_storage): loader = DummyBaseLoader(swh_storage, "logger-name") result = loader.load() assert result == {"status": "eventful"} def test_base_loader_with_known_lister_name(swh_storage, mocker): fetcher_cls = MagicMock(wraps=DummyMetadataFetcher) fetcher_cls.SUPPORTED_LISTERS = DummyMetadataFetcher.SUPPORTED_LISTERS fetcher_cls.FETCHER_NAME = "fake-forge" mocker.patch( "swh.loader.core.metadata_fetchers._fetchers", return_value=[fetcher_cls] ) loader = DummyBaseLoader( swh_storage, lister_name="fake-forge", lister_instance_name="" ) statsd_report = mocker.patch.object(loader.statsd, "_report") result = loader.load() assert result == {"status": "eventful"} fetcher_cls.assert_called_once() fetcher_cls.assert_called_once_with( origin=ORIGIN, credentials={}, lister_name="fake-forge", lister_instance_name="", ) assert swh_storage.raw_extrinsic_metadata_get( ORIGIN.swhid(), METADATA_AUTHORITY ).results == [REMD] assert loader.parent_origins == [] assert [ call("metadata_fetchers_sum", "c", 1, {}, 1), call("metadata_fetchers_count", "c", 1, {}, 1), call("metadata_parent_origins_sum", "c", 0, {"fetcher": "fake-forge"}, 1), call("metadata_parent_origins_count", "c", 1, {"fetcher": "fake-forge"}, 1), call("metadata_objects_sum", "c", 1, {}, 1), call("metadata_objects_count", "c", 1, {}, 1), ] == [c for c in statsd_report.mock_calls if "metadata_" in c[1][0]] assert loader.statsd.namespace == "swh_loader" assert loader.statsd.constant_tags == {"visit_type": "git"} def test_base_loader_with_unknown_lister_name(swh_storage, mocker): fetcher_cls = MagicMock(wraps=DummyMetadataFetcher) fetcher_cls.SUPPORTED_LISTERS = DummyMetadataFetcher.SUPPORTED_LISTERS mocker.patch( "swh.loader.core.metadata_fetchers._fetchers", return_value=[fetcher_cls] ) loader = DummyBaseLoader( swh_storage, lister_name="other-lister", lister_instance_name="" ) result = loader.load() assert result == {"status": "eventful"} fetcher_cls.assert_not_called() with 
pytest.raises(swh.storage.exc.StorageArgumentException): swh_storage.raw_extrinsic_metadata_get(ORIGIN.swhid(), METADATA_AUTHORITY) def test_base_loader_forked_origin(swh_storage, mocker): fetcher_cls = MagicMock(wraps=DummyMetadataFetcherWithFork) fetcher_cls.SUPPORTED_LISTERS = DummyMetadataFetcherWithFork.SUPPORTED_LISTERS fetcher_cls.FETCHER_NAME = "fake-forge" mocker.patch( "swh.loader.core.metadata_fetchers._fetchers", return_value=[fetcher_cls] ) loader = DummyBaseLoader( swh_storage, lister_name="fake-forge", lister_instance_name="" ) statsd_report = mocker.patch.object(loader.statsd, "_report") result = loader.load() assert result == {"status": "eventful"} fetcher_cls.assert_called_once() fetcher_cls.assert_called_once_with( origin=ORIGIN, credentials={}, lister_name="fake-forge", lister_instance_name="", ) assert swh_storage.raw_extrinsic_metadata_get( ORIGIN.swhid(), METADATA_AUTHORITY ).results == [REMD] assert loader.parent_origins == [PARENT_ORIGIN] assert [ call("metadata_fetchers_sum", "c", 1, {}, 1), call("metadata_fetchers_count", "c", 1, {}, 1), call("metadata_parent_origins_sum", "c", 1, {"fetcher": "fake-forge"}, 1), call("metadata_parent_origins_count", "c", 1, {"fetcher": "fake-forge"}, 1), call("metadata_objects_sum", "c", 1, {}, 1), call("metadata_objects_count", "c", 1, {}, 1), ] == [c for c in statsd_report.mock_calls if "metadata_" in c[1][0]] assert loader.statsd.namespace == "swh_loader" assert loader.statsd.constant_tags == {"visit_type": "git"} def test_base_loader_post_load_raise(swh_storage, mocker): loader = DummyBaseLoader(swh_storage) post_load = mocker.patch.object(loader, "post_load") # raise exception in post_load when success is True def post_load_method(*args, success=True): if success: raise Exception("Error in post_load") post_load.side_effect = post_load_method result = loader.load() assert result == {"status": "failed"} # ensure post_load has been called twice, once with success to True and # once with success to False as the first post_load call raised exception assert post_load.call_args_list == [mocker.call(), mocker.call(success=False)] def test_dvcs_loader(swh_storage): loader = DummyDVCSLoader(swh_storage) result = loader.load() assert result == {"status": "eventful"} def test_dvcs_loader_with_config(swh_storage): loader = DummyDVCSLoader(swh_storage, "another-logger") result = loader.load() assert result == {"status": "eventful"} def test_loader_logger_default_name(swh_storage): loader = DummyBaseLoader(swh_storage) assert isinstance(loader.log, logging.Logger) assert loader.log.name == "swh.loader.core.tests.test_loader.DummyBaseLoader" loader = DummyDVCSLoader(swh_storage) assert isinstance(loader.log, logging.Logger) assert loader.log.name == "swh.loader.core.tests.test_loader.DummyDVCSLoader" def test_loader_logger_with_name(swh_storage): loader = DummyBaseLoader(swh_storage, "some.logger.name") assert isinstance(loader.log, logging.Logger) assert loader.log.name == "some.logger.name" def test_loader_save_data_path(swh_storage, tmp_path): loader = DummyBaseLoader(swh_storage, "some.logger.name.1", save_data_path=tmp_path) url = "http://bitbucket.org/something" loader.origin = Origin(url=url) loader.visit_date = datetime.datetime(year=2019, month=10, day=1) hash_url = hashlib.sha1(url.encode("utf-8")).hexdigest() expected_save_path = "%s/sha1:%s/%s/2019" % (str(tmp_path), hash_url[0:2], hash_url) save_path = loader.get_save_data_path() assert save_path == expected_save_path def _check_load_failure(caplog, loader, exc_class, exc_text, 
status="partial"): """Check whether a failed load properly logged its exception, and that the snapshot didn't get referenced in storage""" assert isinstance(loader, DVCSLoader) # was implicit so far for record in caplog.records: if record.levelname != "ERROR": continue assert "Loading failure" in record.message assert record.exc_info exc = record.exc_info[1] assert isinstance(exc, exc_class) assert exc_text in exc.args[0] # Check that the get_snapshot operation would have succeeded assert loader.get_snapshot() is not None # And confirm that the visit doesn't reference a snapshot visit = assert_last_visit_matches(loader.storage, ORIGIN.url, status) if status != "partial": assert visit.snapshot is None # But that the snapshot didn't get loaded assert loader.loaded_snapshot_id is None @pytest.mark.parametrize("success", [True, False]) def test_loader_timings(swh_storage, mocker, success): current_time = time.time() mocker.patch("time.monotonic", side_effect=lambda: current_time) mocker.patch("swh.core.statsd.monotonic", side_effect=lambda: current_time) runtimes = { "pre_cleanup": 2.0, "build_extrinsic_origin_metadata": 3.0, "prepare": 5.0, "fetch_data": 7.0, - "store_data": 11.0, - "post_load": 13.0, - "flush": 17.0, - "cleanup": 23.0, + "process_data": 11.0, + "store_data": 13.0, + "post_load": 17.0, + "flush": 23.0, + "cleanup": 27.0, } class TimedLoader(BaseLoader): visit_type = "my-visit-type" def __getattribute__(self, method_name): if method_name == "visit_status" and not success: def crashy(): raise Exception("oh no") return crashy if method_name not in runtimes: return super().__getattribute__(method_name) def meth(*args, **kwargs): nonlocal current_time current_time += runtimes[method_name] return meth loader = TimedLoader(swh_storage, origin_url="http://example.org/hello.git") statsd_report = mocker.patch.object(loader.statsd, "_report") loader.load() if success: expected_tags = { "post_load": {"success": True, "status": "full"}, "flush": {"success": True, "status": "full"}, "cleanup": {"success": True, "status": "full"}, } else: expected_tags = { "post_load": {"success": False, "status": "failed"}, "flush": {"success": False, "status": "failed"}, "cleanup": {"success": False, "status": "failed"}, } # note that this is a list equality, so order of entries in 'runtimes' matters. # This is not perfect, but call() objects are not hashable so it's simpler this way, # even if not perfect. 
assert statsd_report.mock_calls == [ call( "operation_duration_seconds", "ms", value * 1000, {"operation": key, **expected_tags.get(key, {})}, 1, ) for (key, value) in runtimes.items() ] assert loader.statsd.namespace == "swh_loader" assert loader.statsd.constant_tags == {"visit_type": "my-visit-type"} class DummyDVCSLoaderExc(DummyDVCSLoader): """A loader which raises an exception when loading some contents""" def get_contents(self): raise RuntimeError("Failed to get contents!") def test_dvcs_loader_exc_partial_visit(swh_storage, caplog): logger_name = "dvcsloaderexc" caplog.set_level(logging.ERROR, logger=logger_name) loader = DummyDVCSLoaderExc(swh_storage, logging_class=logger_name) # fake the loading ending up in a snapshot loader.loaded_snapshot_id = hash_to_bytes( "9e4dd2b40d1b46b70917c0949aa2195c823a648e" ) result = loader.load() # loading failed assert result == {"status": "failed"} # still resulted in a partial visit with a snapshot (somehow) _check_load_failure( caplog, loader, RuntimeError, "Failed to get contents!", ) class BrokenStorageProxy: def __init__(self, storage): self.storage = storage def __getattr__(self, attr): return getattr(self.storage, attr) def snapshot_add(self, snapshots): raise RuntimeError("Failed to add snapshot!") class DummyDVCSLoaderStorageExc(DummyDVCSLoader): """A loader which raises an exception when loading some contents""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.storage = BrokenStorageProxy(self.storage) def test_dvcs_loader_storage_exc_failed_visit(swh_storage, caplog): logger_name = "dvcsloaderexc" caplog.set_level(logging.ERROR, logger=logger_name) loader = DummyDVCSLoaderStorageExc(swh_storage, logging_class=logger_name) result = loader.load() assert result == {"status": "failed"} _check_load_failure( caplog, loader, RuntimeError, "Failed to add snapshot!", status="failed" ) class DummyDVCSLoaderNotFound(DummyDVCSLoader, BaseLoader): """A loader which raises a not_found exception during the prepare method call""" def prepare(*args, **kwargs): raise NotFound("Unknown origin!") def load_status(self): return { "status": "uneventful", } def test_loader_not_found(swh_storage, caplog): loader = DummyDVCSLoaderNotFound(swh_storage) result = loader.load() assert result == {"status": "uneventful"} _check_load_failure(caplog, loader, NotFound, "Unknown origin!", status="not_found") diff --git a/swh/loader/package/arch/__init__.py b/swh/loader/package/arch/__init__.py new file mode 100644 index 0000000..ef34674 --- /dev/null +++ b/swh/loader/package/arch/__init__.py @@ -0,0 +1,17 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +from typing import Any, Mapping + + +def register() -> Mapping[str, Any]: + """Register the current worker module's definition""" + from .loader import ArchLoader + + return { + "task_modules": [f"{__name__}.tasks"], + "loader": ArchLoader, + } diff --git a/swh/loader/package/arch/loader.py b/swh/loader/package/arch/loader.py new file mode 100644 index 0000000..7ab9fc2 --- /dev/null +++ b/swh/loader/package/arch/loader.py @@ -0,0 +1,141 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information +from 
distutils.version import LooseVersion +from pathlib import Path +import re +from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple + +import attr + +from swh.loader.package.loader import BasePackageInfo, PackageLoader +from swh.loader.package.utils import release_name +from swh.model.model import ObjectType, Person, Release, Sha1Git, TimestampWithTimezone +from swh.storage.interface import StorageInterface + + +@attr.s +class ArchPackageInfo(BasePackageInfo): + + name = attr.ib(type=str) + """Name of the package""" + + version = attr.ib(type=str) + """Current version""" + + last_modified = attr.ib(type=str) + """File last modified date as release date""" + + +def extract_intrinsic_metadata(dir_path: Path) -> Dict[str, Any]: + """Extract intrinsic metadata from the .PKGINFO file at dir_path. + + Each Arch Linux package has a .PKGINFO file at the root of the archive. + + Args: + dir_path: A directory on disk where a package has been extracted + + Returns: + A dict mapping each lowercased .PKGINFO field name to its value, + with "url" also copied to "project_url" + """ + pkginfo_path = Path(dir_path, ".PKGINFO") + rex = re.compile(r"^(\w+)\s=\s(.*)$", re.M) + with pkginfo_path.open("rb") as content: + parsed = rex.findall(content.read().decode()) + data = {entry[0].lower(): entry[1] for entry in parsed} + if "url" in data.keys(): + data["project_url"] = data["url"] + return data + + +class ArchLoader(PackageLoader[ArchPackageInfo]): + visit_type = "arch" + + def __init__( + self, + storage: StorageInterface, + url: str, + artifacts: List[Dict[str, Any]], + arch_metadata: List[Dict[str, Any]], + **kwargs, + ): + + super().__init__(storage=storage, url=url, **kwargs) + self.url = url + self.artifacts: Dict[str, Dict] = { + artifact["version"]: artifact for artifact in artifacts + } + self.arch_metadata: Dict[str, Dict] = { + metadata["version"]: metadata for metadata in arch_metadata + } + + def get_versions(self) -> Sequence[str]: + """Get all released versions of an Arch Linux package + + Returns: + A sequence of versions + + Example:: + + ["0.1.1", "0.10.2"] + """ + versions = list(self.artifacts.keys()) + versions.sort(key=LooseVersion) + return versions + + def get_default_version(self) -> str: + """Get the newest release version of an Arch Linux package + + Returns: + A string representing a version + + Example:: + + "0.1.2" + """ + return self.get_versions()[-1] + + def get_package_info(self, version: str) -> Iterator[Tuple[str, ArchPackageInfo]]: + """Get release name and package information from version + + Args: + version: arch version (e.g. "0.1.0") + + Returns: + Iterator of tuple (release_name, p_info) + """ + artifact = self.artifacts[version] + metadata = self.arch_metadata[version] + assert version == artifact["version"] == metadata["version"] + + p_info = ArchPackageInfo( + name=metadata["name"], + filename=artifact["filename"], + url=artifact["url"], + version=version, + last_modified=metadata["last_modified"], + ) + yield release_name(version, artifact["filename"]), p_info + + def build_release( + self, p_info: ArchPackageInfo, uncompressed_path: str, directory: Sha1Git + ) -> Optional[Release]: + intrinsic_metadata = extract_intrinsic_metadata(Path(uncompressed_path)) + author = Person.from_fullname(intrinsic_metadata["packager"].encode()) + description = intrinsic_metadata["pkgdesc"] + + message = ( + f"Synthetic release for Arch Linux source package {p_info.name} " + f"version {p_info.version}\n\n" + f"{description}\n" + ) + return Release( + name=p_info.version.encode(), + author=author,
date=TimestampWithTimezone.from_iso8601(p_info.last_modified), + message=message.encode(), + target_type=ObjectType.DIRECTORY, + target=directory, + synthetic=True, + ) diff --git a/swh/loader/package/crates/tasks.py b/swh/loader/package/arch/tasks.py similarity index 51% copy from swh/loader/package/crates/tasks.py copy to swh/loader/package/arch/tasks.py index 9385263..0e6ded9 100644 --- a/swh/loader/package/crates/tasks.py +++ b/swh/loader/package/arch/tasks.py @@ -1,14 +1,14 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from celery import shared_task -from swh.loader.package.crates.loader import CratesLoader +from swh.loader.package.arch.loader import ArchLoader -@shared_task(name=__name__ + ".LoadCrates") -def load_crates(*, url=None, artifacts: list): - """Load Rust crate package""" - return CratesLoader.from_configfile(url=url, artifacts=artifacts).load() +@shared_task(name=__name__ + ".LoadArch") +def load_arch(**kwargs): + """Load Arch Linux packages""" + return ArchLoader.from_configfile(**kwargs).load() diff --git a/swh/loader/package/arch/tests/__init__.py b/swh/loader/package/arch/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/swh/loader/package/arch/tests/data/fake_arch.sh b/swh/loader/package/arch/tests/data/fake_arch.sh new file mode 100755 index 0000000..471d207 --- /dev/null +++ b/swh/loader/package/arch/tests/data/fake_arch.sh @@ -0,0 +1,86 @@ +#!/usr/bin/env bash + +# Script to generate fake Arch Linux package files and fake HTTP responses. + +set -euo pipefail + +# Please note that you need to install the Zstandard compression tool (zstd) to compress +# to a .zst archive and XZ Utils (xz) to compress to a .xz archive. +command -v zstd || echo "you should install 'zstd' to run this script" +command -v xz || echo "you should install 'xz' to run this script" + +# files and directories +mkdir https_archive.archlinux.org +mkdir https_uk.mirror.archlinuxarm.org +mkdir -p tmp_dir/arch/ + +cd tmp_dir/arch/ + +mkdir 'packages_d_dialog_dialog-1:1.3_20190211-1-x86_64' +mkdir 'packages_d_dialog_dialog-1:1.3_20220414-1-x86_64' + +echo -e '''pkgname = dialog +pkgbase = dialog +pkgver = 1:1.3_20190211-1 +pkgdesc = A tool to display dialog boxes from shell scripts +url = https://invisible-island.net/dialog/ +builddate = 1550046926 +packager = Evangelos Foutras +size = 455680 +arch = x86_64 +license = LGPL2.1 +provides = libdialog.so=15-64 +depend = sh +''' > packages_d_dialog_dialog-1:1.3_20190211-1-x86_64/.PKGINFO + +echo -e '''pkgname = dialog +pkgbase = dialog +pkgver = 1:1.3_20220414-1 +pkgdesc = A tool to display dialog boxes from shell scripts +url = https://invisible-island.net/dialog/ +builddate = 1650081535 +packager = Evangelos Foutras +size = 483988 +arch = x86_64 +license = LGPL2.1 +provides = libdialog.so=15-64 +depend = sh +depend = ncurses +''' > packages_d_dialog_dialog-1:1.3_20220414-1-x86_64/.PKGINFO + +# Compress package folders to .tar.xz and .tar.zst archives + +tar --force-local -acf 'packages_d_dialog_dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz' -C 'packages_d_dialog_dialog-1:1.3_20190211-1-x86_64' . +tar --force-local -acf 'packages_d_dialog_dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst' -C 'packages_d_dialog_dialog-1:1.3_20220414-1-x86_64' .
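+# Note: `tar -a` selects the compressor from the archive suffix, hence the +# .tar.xz and .tar.zst names above. The loader's extract_intrinsic_metadata() +# reads the .PKGINFO at the root of each archive, lowercases every +# "key = value" pair into a dict, and copies "url" to "project_url".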
+ +mv *.xz ../../https_archive.archlinux.org +mv *.zst ../../https_archive.archlinux.org + +# uk.mirror.archlinuxarm.org +mkdir 'aarch64_core_gzip-1.12-1-aarch64' + +echo -e '''# Generated by makepkg 6.0.1 +# using fakeroot version 1.28 +pkgname = gzip +pkgbase = gzip +pkgver = 1.12-1 +pkgdesc = GNU compression utility +url = https://www.gnu.org/software/gzip/ +builddate = 1649365694 +packager = Arch Linux ARM Build System +size = 162688 +arch = aarch64 +license = GPL3 +group = base-devel +depend = glibc +depend = bash +depend = less +''' > aarch64_core_gzip-1.12-1-aarch64/.PKGINFO + +tar --force-local -acf 'aarch64_core_gzip-1.12-1-aarch64.pkg.tar.xz' -C 'aarch64_core_gzip-1.12-1-aarch64' . + +mv *.xz ../../https_uk.mirror.archlinuxarm.org + +# Clean up removing tmp_dir +cd ../../ +rm -r tmp_dir/ diff --git a/swh/loader/package/arch/tests/data/https_archive.archlinux.org/packages_d_dialog_dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz b/swh/loader/package/arch/tests/data/https_archive.archlinux.org/packages_d_dialog_dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz new file mode 100644 index 0000000..92e2f00 Binary files /dev/null and b/swh/loader/package/arch/tests/data/https_archive.archlinux.org/packages_d_dialog_dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz differ diff --git a/swh/loader/package/arch/tests/data/https_archive.archlinux.org/packages_d_dialog_dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst b/swh/loader/package/arch/tests/data/https_archive.archlinux.org/packages_d_dialog_dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst new file mode 100644 index 0000000..a0ebd62 Binary files /dev/null and b/swh/loader/package/arch/tests/data/https_archive.archlinux.org/packages_d_dialog_dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst differ diff --git a/swh/loader/package/arch/tests/data/https_uk.mirror.archlinuxarm.org/aarch64_core_gzip-1.12-1-aarch64.pkg.tar.xz b/swh/loader/package/arch/tests/data/https_uk.mirror.archlinuxarm.org/aarch64_core_gzip-1.12-1-aarch64.pkg.tar.xz new file mode 100644 index 0000000..c7578fa Binary files /dev/null and b/swh/loader/package/arch/tests/data/https_uk.mirror.archlinuxarm.org/aarch64_core_gzip-1.12-1-aarch64.pkg.tar.xz differ diff --git a/swh/loader/package/arch/tests/test_arch.py b/swh/loader/package/arch/tests/test_arch.py new file mode 100644 index 0000000..3180f9d --- /dev/null +++ b/swh/loader/package/arch/tests/test_arch.py @@ -0,0 +1,253 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information +import pytest + +from swh.loader.package.arch.loader import ArchLoader +from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats +from swh.model.hashutil import hash_to_bytes +from swh.model.model import ( + ObjectType, + Person, + Release, + Snapshot, + SnapshotBranch, + TargetType, + TimestampWithTimezone, +) + +EXPECTED_PACKAGES = [ + { + "url": "https://archive.archlinux.org/packages/d/dialog/", + "artifacts": [ + { + "url": "https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz", # noqa: B950 + "version": "1:1.3_20190211-1", + "length": 180000, + "filename": "dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz", + }, + { + "url": "https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "1:1.3_20220414-1", + "length": 198000, + "filename": 
"dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst", + }, + ], + "arch_metadata": [ + { + "arch": "x86_64", + "repo": "core", + "name": "dialog", + "version": "1:1.3_20190211-1", + "last_modified": "2019-02-13T08:36:00", + }, + { + "arch": "x86_64", + "repo": "core", + "name": "dialog", + "version": "1:1.3_20220414-1", + "last_modified": "2022-04-16T03:59:00", + }, + ], + }, + { + "url": "https://archlinuxarm.org/packages/aarch64/gzip", + "artifacts": [ + { + "url": "https://uk.mirror.archlinuxarm.org/aarch64/core/gzip-1.12-1-aarch64.pkg.tar.xz", # noqa: B950 + "length": 79640, + "version": "1.12-1", + "filename": "gzip-1.12-1-aarch64.pkg.tar.xz", + } + ], + "arch_metadata": [ + { + "arch": "aarch64", + "name": "gzip", + "repo": "core", + "version": "1.12-1", + "last_modified": "2022-04-07T21:08:14", + } + ], + }, +] + + +def test_get_versions(swh_storage): + loader = ArchLoader( + swh_storage, + url=EXPECTED_PACKAGES[0]["url"], + artifacts=EXPECTED_PACKAGES[0]["artifacts"], + arch_metadata=EXPECTED_PACKAGES[0]["arch_metadata"], + ) + + assert loader.get_versions() == [ + "1:1.3_20190211-1", + "1:1.3_20220414-1", + ] + + +def test_get_default_version(requests_mock_datadir, swh_storage): + loader = ArchLoader( + swh_storage, + url=EXPECTED_PACKAGES[0]["url"], + artifacts=EXPECTED_PACKAGES[0]["artifacts"], + arch_metadata=EXPECTED_PACKAGES[0]["arch_metadata"], + ) + assert loader.get_default_version() == "1:1.3_20220414-1" + + +def test_arch_loader_load_one_version(datadir, requests_mock_datadir, swh_storage): + loader = ArchLoader( + swh_storage, + url=EXPECTED_PACKAGES[1]["url"], + artifacts=EXPECTED_PACKAGES[1]["artifacts"], + arch_metadata=EXPECTED_PACKAGES[1]["arch_metadata"], + ) + actual_load_status = loader.load() + assert actual_load_status["status"] == "eventful" + assert actual_load_status["snapshot_id"] is not None + + expected_snapshot_id = "4020d0a278027550e336b5481a4159a913c91aa4" + expected_release_id = "7681098c9e381f9cc8bd1724d57eeee2182982dc" + + assert expected_snapshot_id == actual_load_status["snapshot_id"] + + expected_snapshot = Snapshot( + id=hash_to_bytes(actual_load_status["snapshot_id"]), + branches={ + b"releases/1.12-1/gzip-1.12-1-aarch64.pkg.tar.xz": SnapshotBranch( + target=hash_to_bytes(expected_release_id), + target_type=TargetType.RELEASE, + ), + b"HEAD": SnapshotBranch( + target=b"releases/1.12-1/gzip-1.12-1-aarch64.pkg.tar.xz", + target_type=TargetType.ALIAS, + ), + }, + ) + check_snapshot(expected_snapshot, swh_storage) + + stats = get_stats(swh_storage) + assert { + "content": 1, + "directory": 1, + "origin": 1, + "origin_visit": 1, + "release": 1, + "revision": 0, + "skipped_content": 0, + "snapshot": 1, + } == stats + + assert swh_storage.release_get([hash_to_bytes(expected_release_id)])[0] == Release( + name=b"1.12-1", + message=b"Synthetic release for Arch Linux source package gzip version " + b"1.12-1\n\nGNU compression utility\n", + target=hash_to_bytes("bd742aaf422953a1f7a5e084ec4a7477491d63fb"), + target_type=ObjectType.DIRECTORY, + synthetic=True, + author=Person.from_fullname( + b"Arch Linux ARM Build System " + ), + date=TimestampWithTimezone.from_iso8601("2022-04-07T21:08:14+00:00"), + id=hash_to_bytes(expected_release_id), + ) + + assert_last_visit_matches( + swh_storage, + url=EXPECTED_PACKAGES[1]["url"], + status="full", + type="arch", + snapshot=expected_snapshot.id, + ) + + +def test_arch_loader_load_n_versions(datadir, requests_mock_datadir, swh_storage): + + loader = ArchLoader( + swh_storage, + url=EXPECTED_PACKAGES[0]["url"], + 
artifacts=EXPECTED_PACKAGES[0]["artifacts"], + arch_metadata=EXPECTED_PACKAGES[0]["arch_metadata"], + ) + actual_load_status = loader.load() + assert actual_load_status["status"] == "eventful" + assert actual_load_status["snapshot_id"] is not None + + expected_snapshot_id = "832139d69a91edffcc3a96cca11deaf9255041c3" + + assert expected_snapshot_id == actual_load_status["snapshot_id"] + + expected_snapshot = Snapshot( + id=hash_to_bytes(actual_load_status["snapshot_id"]), + branches={ + b"releases/1:1.3_20190211-1/" + b"dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz": SnapshotBranch( + target=hash_to_bytes("37efb727ff8bb8fbf92518aa8fe5fff2ad427d06"), + target_type=TargetType.RELEASE, + ), + b"releases/1:1.3_20220414-1/" + b"dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst": SnapshotBranch( + target=hash_to_bytes("020d3f5627df7474f257fd04f1ede4415296e265"), + target_type=TargetType.RELEASE, + ), + b"HEAD": SnapshotBranch( + target=b"releases/1:1.3_20220414-1/dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst", + target_type=TargetType.ALIAS, + ), + }, + ) + + check_snapshot(expected_snapshot, swh_storage) + + stats = get_stats(swh_storage) + assert { + "content": 2, + "directory": 2, + "origin": 1, + "origin_visit": 1, + "release": 2, + "revision": 0, + "skipped_content": 0, + "snapshot": 1, + } == stats + + assert_last_visit_matches( + swh_storage, + url=EXPECTED_PACKAGES[0]["url"], + status="full", + type="arch", + snapshot=expected_snapshot.id, + ) + + +def test_arch_invalid_origin_archive_not_found(swh_storage, requests_mock_datadir): + url = "https://nowhere/packages/42" + loader = ArchLoader( + swh_storage, + url, + artifacts=[ + { + "filename": "42-0.0.1.pkg.xz", + "url": "https://mirror2.nowhere/pkg/42-0.0.1.pkg.xz", + "version": "0.0.1", + "length": 42, + }, + ], + arch_metadata=[ + { + "version": "0.0.1", + "arch": "aarch64", + "name": "42", + "repo": "community", + "last_modified": "2022-04-07T21:08:14", + }, + ], + ) + with pytest.raises(Exception): + assert loader.load() == {"status": "failed"} + assert_last_visit_matches( + swh_storage, url, status="not_found", type="arch", snapshot=None + ) diff --git a/swh/loader/package/arch/tests/test_tasks.py b/swh/loader/package/arch/tests/test_tasks.py new file mode 100644 index 0000000..b5178ac --- /dev/null +++ b/swh/loader/package/arch/tests/test_tasks.py @@ -0,0 +1,40 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def test_tasks_arch_loader( + mocker, swh_scheduler_celery_app, swh_scheduler_celery_worker, swh_config +): + mock_load = mocker.patch("swh.loader.package.arch.loader.ArchLoader.load") + mock_load.return_value = {"status": "eventful"} + + res = swh_scheduler_celery_app.send_task( + "swh.loader.package.arch.tasks.LoadArch", + kwargs=dict( + url="some-url/packages/s/some-package", + artifacts=[ + { + "version": "0.0.1", + "url": "https://somewhere/some-package-0.0.1.pkg.xz", + "filename": "some-package-0.0.1.pkg.xz", + "length": 42, + } + ], + arch_metadata=[ + { + "version": "0.0.1", + "arch": "aarch64", + "name": "some-package", + "repo": "community", + "last_modified": "1970-01-01T21:08:14", + } + ], + ), + ) + assert res + res.wait() + assert res.successful() + assert mock_load.called + assert res.result == {"status": "eventful"} diff --git a/swh/loader/package/archive/loader.py b/swh/loader/package/archive/loader.py index 
853a8c4..b96cad6 100644 --- a/swh/loader/package/archive/loader.py +++ b/swh/loader/package/archive/loader.py @@ -1,164 +1,168 @@ -# Copyright (C) 2019-2021 The Software Heritage developers +# Copyright (C) 2019-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import hashlib import logging from os import path import string from typing import Any, Dict, Iterator, Mapping, Optional, Sequence, Tuple, Union import attr import iso8601 from swh.loader.package.loader import BasePackageInfo, PackageLoader, PartialExtID from swh.loader.package.utils import EMPTY_AUTHOR, release_name from swh.model.model import ObjectType, Release, Sha1Git, TimestampWithTimezone from swh.storage.interface import StorageInterface logger = logging.getLogger(__name__) @attr.s class ArchivePackageInfo(BasePackageInfo): raw_info = attr.ib(type=Dict[str, Any]) length = attr.ib(type=int) """Size of the archive file""" time = attr.ib(type=Union[str, datetime.datetime]) """Timestamp of the archive file on the server""" # default format for gnu MANIFEST_FORMAT = string.Template("$time $length $version $url") def extid(self, manifest_format: Optional[string.Template] = None) -> PartialExtID: """Returns a unique intrinsic identifier of this package info ``manifest_format`` allows overriding the class' default MANIFEST_FORMAT""" manifest_format = manifest_format or self.MANIFEST_FORMAT # TODO: use parsed attributes instead of self.raw_info manifest = manifest_format.substitute( {k: str(v) for (k, v) in self.raw_info.items()} ) return ( self.EXTID_TYPE, self.EXTID_VERSION, hashlib.sha256(manifest.encode()).digest(), ) @classmethod def from_metadata(cls, a_metadata: Dict[str, Any]) -> "ArchivePackageInfo": url = a_metadata["url"] filename = a_metadata.get("filename") return cls( url=url, filename=filename if filename else path.split(url)[-1], raw_info=a_metadata, length=a_metadata["length"], time=a_metadata["time"], version=a_metadata["version"], ) class ArchiveLoader(PackageLoader[ArchivePackageInfo]): """Load archive origin's artifact files into swh archive""" visit_type = "tar" def __init__( self, storage: StorageInterface, url: str, artifacts: Sequence[Dict[str, Any]], extid_manifest_format: Optional[str] = None, snapshot_append: bool = False, **kwargs: Any, ): f"""Loader constructor. For now, this is the lister's task output. Args: url: Origin url artifacts: List of artifact information with keys: - **time**: last modification time as either isoformat date string or timestamp - **url**: the artifact url to retrieve filename - **filename**: optionally, the file's name - **version**: artifact's version - **length**: artifact's length extid_manifest_format: template string used to format a manifest, which is hashed to get the extid of a package. 
Defaults to {ArchivePackageInfo.MANIFEST_FORMAT!r} snapshot_append: if :const:`True`, append latest snapshot content to the new snapshot created by the loader """ super().__init__(storage=storage, url=url, **kwargs) self.artifacts = artifacts # assume order is enforced in the lister self.extid_manifest_format = ( None if extid_manifest_format is None else string.Template(extid_manifest_format) ) self.snapshot_append = snapshot_append def get_versions(self) -> Sequence[str]: versions = [] for archive in self.artifacts: v = archive.get("version") if v: versions.append(v) return versions def get_default_version(self) -> str: # It's the most recent, so for this loader, it's the last one return self.artifacts[-1]["version"] def get_package_info( self, version: str ) -> Iterator[Tuple[str, ArchivePackageInfo]]: for a_metadata in self.artifacts: p_info = ArchivePackageInfo.from_metadata(a_metadata) if version == p_info.version: # FIXME: this code assumes we have only 1 artifact per # versioned package yield release_name(version), p_info def new_packageinfo_to_extid( self, p_info: ArchivePackageInfo ) -> Optional[PartialExtID]: return p_info.extid(manifest_format=self.extid_manifest_format) def build_release( self, p_info: ArchivePackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Release]: time = p_info.time # assume it's a timestamp if isinstance(time, str): # otherwise, assume it's a parsable date parsed_time = iso8601.parse_date(time) else: parsed_time = time - normalized_time = TimestampWithTimezone.from_datetime(parsed_time) + normalized_time = ( + TimestampWithTimezone.from_datetime(parsed_time) + if parsed_time is not None + else None + ) msg = f"Synthetic release for archive at {p_info.url}\n" return Release( name=p_info.version.encode(), message=msg.encode(), date=normalized_time, author=EMPTY_AUTHOR, target=directory, target_type=ObjectType.DIRECTORY, synthetic=True, ) def extra_branches(self) -> Dict[bytes, Mapping[str, Any]]: if not self.snapshot_append: return {} last_snapshot = self.last_snapshot() return last_snapshot.to_dict()["branches"] if last_snapshot else {} diff --git a/swh/loader/package/archive/tests/test_archive.py b/swh/loader/package/archive/tests/test_archive.py index a590c1d..7a32b2c 100644 --- a/swh/loader/package/archive/tests/test_archive.py +++ b/swh/loader/package/archive/tests/test_archive.py @@ -1,488 +1,502 @@ -# Copyright (C) 2019-2021 The Software Heritage developers +# Copyright (C) 2019-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import copy import datetime import hashlib from io import BytesIO from pathlib import Path import string import attr import pytest from requests.exceptions import ContentDecodingError from swh.loader.package.archive.loader import ArchiveLoader, ArchivePackageInfo from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats from swh.model.hashutil import hash_to_bytes, hash_to_hex from swh.model.model import ( ObjectType, Person, Release, Snapshot, SnapshotBranch, TargetType, TimestampWithTimezone, ) URL = "https://ftp.gnu.org/gnu/8sync/" GNU_ARTIFACTS = [ { "time": 944729610, "url": "https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz", "length": 221837, "filename": "8sync-0.1.0.tar.gz", "version": "0.1.0", }, { "time": 1480991830, "url": "https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz", "length": 238466, 
"filename": "8sync-0.2.0.tar.gz", "version": "0.2.0", }, ] _expected_new_contents_first_visit = [ "e9258d81faf5881a2f96a77ba609396f82cb97ad", "1170cf105b04b7e2822a0e09d2acf71da7b9a130", "fbd27c3f41f2668624ffc80b7ba5db9b92ff27ac", "0057bec9b5422aff9256af240b177ac0e3ac2608", "2b8d0d0b43a1078fc708930c8ddc2956a86c566e", "27de3b3bc6545d2a797aeeb4657c0e215a0c2e55", "2e6db43f5cd764e677f416ff0d0c78c7a82ef19b", "ae9be03bd2a06ed8f4f118d3fe76330bb1d77f62", "edeb33282b2bffa0e608e9d2fd960fd08093c0ea", "d64e64d4c73679323f8d4cde2643331ba6c20af9", "7a756602914be889c0a2d3952c710144b3e64cb0", "84fb589b554fcb7f32b806951dcf19518d67b08f", "8624bcdae55baeef00cd11d5dfcfa60f68710a02", "e08441aeab02704cfbd435d6445f7c072f8f524e", "f67935bc3a83a67259cda4b2d43373bd56703844", "809788434b433eb2e3cfabd5d591c9a659d5e3d8", "7d7c6c8c5ebaeff879f61f37083a3854184f6c41", "b99fec102eb24bffd53ab61fc30d59e810f116a2", "7d149b28eaa228b3871c91f0d5a95a2fa7cb0c68", "f0c97052e567948adf03e641301e9983c478ccff", "7fb724242e2b62b85ca64190c31dcae5303e19b3", "4f9709e64a9134fe8aefb36fd827b84d8b617ab5", "7350628ccf194c2c3afba4ac588c33e3f3ac778d", "0bb892d9391aa706dc2c3b1906567df43cbe06a2", "49d4c0ce1a16601f1e265d446b6c5ea6b512f27c", "6b5cc594ac466351450f7f64a0b79fdaf4435ad3", "3046e5d1f70297e2a507b98224b6222c9688d610", "1572607d456d7f633bc6065a2b3048496d679a31", ] _expected_new_directories_first_visit = [ "daabc65ec75d487b1335ffc101c0ac11c803f8fc", "263be23b4a8101d3ad0d9831319a3e0f2b065f36", "7f6e63ba6eb3e2236f65892cd822041f1a01dd5c", "4db0a3ecbc976083e2dac01a62f93729698429a3", "dfef1c80e1098dd5deda664bb44a9ab1f738af13", "eca971d346ea54d95a6e19d5051f900237fafdaa", "3aebc29ed1fccc4a6f2f2010fb8e57882406b528", ] _expected_new_releases_first_visit = { "c92b2ad9e70ef1dce455e8fe1d8e41b92512cc08": ( "3aebc29ed1fccc4a6f2f2010fb8e57882406b528" ) } def test_archive_visit_with_no_artifact_found(swh_storage, requests_mock_datadir): url = URL unknown_artifact_url = "https://ftp.g.o/unknown/8sync-0.1.0.tar.gz" loader = ArchiveLoader( swh_storage, url, artifacts=[ { "time": 944729610, "url": unknown_artifact_url, # unknown artifact "length": 221837, "filename": "8sync-0.1.0.tar.gz", "version": "0.1.0", } ], ) actual_load_status = loader.load() assert actual_load_status["status"] == "uneventful" assert actual_load_status["snapshot_id"] is not None stats = get_stats(swh_storage) assert { "content": 0, "directory": 0, "origin": 1, "origin_visit": 1, "release": 0, "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats assert_last_visit_matches(swh_storage, url, status="partial", type="tar") def test_archive_visit_with_release_artifact_no_prior_visit( swh_storage, requests_mock_datadir ): """With no prior visit, load a gnu project ends up with 1 snapshot""" loader = ArchiveLoader(swh_storage, URL, artifacts=GNU_ARTIFACTS[:1]) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" expected_snapshot_first_visit_id = hash_to_bytes( "9efecc835e8f99254934f256b5301b94f348fd17" ) assert actual_load_status["snapshot_id"] == hash_to_hex( expected_snapshot_first_visit_id ) assert_last_visit_matches(swh_storage, URL, status="full", type="tar") stats = get_stats(swh_storage) assert { "content": len(_expected_new_contents_first_visit), "directory": len(_expected_new_directories_first_visit), "origin": 1, "origin_visit": 1, "release": len(_expected_new_releases_first_visit), "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats release_id = hash_to_bytes(list(_expected_new_releases_first_visit)[0]) expected_snapshot = Snapshot( 
id=expected_snapshot_first_visit_id, branches={ b"HEAD": SnapshotBranch( target_type=TargetType.ALIAS, target=b"releases/0.1.0", ), b"releases/0.1.0": SnapshotBranch( target_type=TargetType.RELEASE, target=release_id, ), }, ) check_snapshot(expected_snapshot, swh_storage) assert swh_storage.release_get([release_id])[0] == Release( id=release_id, name=b"0.1.0", message=( b"Synthetic release for archive at " b"https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz\n" ), target=hash_to_bytes("3aebc29ed1fccc4a6f2f2010fb8e57882406b528"), target_type=ObjectType.DIRECTORY, synthetic=True, author=Person.from_fullname(b""), date=TimestampWithTimezone.from_datetime( datetime.datetime(1999, 12, 9, 8, 53, 30, tzinfo=datetime.timezone.utc) ), ) expected_contents = map(hash_to_bytes, _expected_new_contents_first_visit) assert list(swh_storage.content_missing_per_sha1(expected_contents)) == [] expected_dirs = map(hash_to_bytes, _expected_new_directories_first_visit) assert list(swh_storage.directory_missing(expected_dirs)) == [] expected_rels = map(hash_to_bytes, _expected_new_releases_first_visit) assert list(swh_storage.release_missing(expected_rels)) == [] def test_archive_2_visits_without_change(swh_storage, requests_mock_datadir): """With no prior visit, load a gnu project ends up with 1 snapshot""" url = URL loader = ArchiveLoader(swh_storage, url, artifacts=GNU_ARTIFACTS[:1]) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None assert_last_visit_matches(swh_storage, url, status="full", type="tar") actual_load_status2 = loader.load() assert actual_load_status2["status"] == "uneventful" assert actual_load_status2["snapshot_id"] is not None assert actual_load_status["snapshot_id"] == actual_load_status2["snapshot_id"] assert_last_visit_matches(swh_storage, url, status="full", type="tar") urls = [ m.url for m in requests_mock_datadir.request_history if m.url.startswith("https://ftp.gnu.org") ] assert len(urls) == 1 def test_archive_2_visits_with_new_artifact(swh_storage, requests_mock_datadir): """With no prior visit, load a gnu project ends up with 1 snapshot""" url = URL artifact1 = GNU_ARTIFACTS[0] loader = ArchiveLoader(swh_storage, url, [artifact1]) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None assert_last_visit_matches(swh_storage, url, status="full", type="tar") stats = get_stats(swh_storage) assert { "content": len(_expected_new_contents_first_visit), "directory": len(_expected_new_directories_first_visit), "origin": 1, "origin_visit": 1, "release": len(_expected_new_releases_first_visit), "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats urls = [ m.url for m in requests_mock_datadir.request_history if m.url.startswith("https://ftp.gnu.org") ] assert len(urls) == 1 artifact2 = GNU_ARTIFACTS[1] loader2 = ArchiveLoader(swh_storage, url, [artifact1, artifact2]) stats2 = get_stats(swh_storage) assert stats == stats2 # ensure we share the storage actual_load_status2 = loader2.load() assert actual_load_status2["status"] == "eventful" assert actual_load_status2["snapshot_id"] is not None stats2 = get_stats(swh_storage) assert { "content": len(_expected_new_contents_first_visit) + 14, "directory": len(_expected_new_directories_first_visit) + 8, "origin": 1, "origin_visit": 1 + 1, "release": len(_expected_new_releases_first_visit) + 1, "revision": 0, "skipped_content": 0, "snapshot": 1 + 1, } == stats2 
assert_last_visit_matches(swh_storage, url, status="full", type="tar") urls = [ m.url for m in requests_mock_datadir.request_history if m.url.startswith("https://ftp.gnu.org") ] # 1 artifact (2nd time no modification) + 1 new artifact assert len(urls) == 2 def test_archive_2_visits_without_change_not_gnu(swh_storage, requests_mock_datadir): """Loading a project archive (not gnu) ends up with 1 snapshot""" url = "https://something.else.org/8sync/" artifacts = [ # this is not a gnu artifact { "time": "1999-12-09T09:53:30+00:00", # it's also not a timestamp "sha256": "d5d1051e59b2be6f065a9fc6aedd3a391e44d0274b78b9bb4e2b57a09134dbe4", # noqa # keep a gnu artifact reference to avoid adding other test files "url": "https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz", "length": 238466, "filename": "8sync-0.2.0.tar.gz", "version": "0.2.0", } ] # Here the loader defines the id_keys to use for existence checks in the snapshot # (unlike the default archive loader's manifest format) loader = ArchiveLoader( swh_storage, url, artifacts=artifacts, extid_manifest_format="$sha256 $length $url", ) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None assert_last_visit_matches(swh_storage, url, status="full", type="tar") actual_load_status2 = loader.load() assert actual_load_status2["status"] == "uneventful" assert actual_load_status2["snapshot_id"] == actual_load_status["snapshot_id"] assert_last_visit_matches(swh_storage, url, status="full", type="tar") urls = [ m.url for m in requests_mock_datadir.request_history if m.url.startswith("https://ftp.gnu.org") ] assert len(urls) == 1 def test_archive_extid(): """Computing the primary key should return the right identity""" @attr.s class TestPackageInfo(ArchivePackageInfo): a = attr.ib() b = attr.ib() metadata = GNU_ARTIFACTS[0] p_info = TestPackageInfo( raw_info={**metadata, "a": 1, "b": 2}, a=1, b=2, **metadata, ) for manifest_format, expected_manifest in [ (string.Template("$a $b"), b"1 2"), (string.Template(""), b""), (None, "{time} {length} {version} {url}".format(**metadata).encode()), ]: actual_id = p_info.extid(manifest_format=manifest_format) assert actual_id == ( "package-manifest-sha256", 0, hashlib.sha256(expected_manifest).digest(), ) with pytest.raises(KeyError): p_info.extid(manifest_format=string.Template("$a $unknown_key")) def test_archive_snapshot_append(swh_storage, requests_mock_datadir): # first loading with a first artifact artifact1 = GNU_ARTIFACTS[0] loader = ArchiveLoader(swh_storage, URL, [artifact1], snapshot_append=True) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None assert_last_visit_matches(swh_storage, URL, status="full", type="tar") # check expected snapshot snapshot = loader.last_snapshot() assert len(snapshot.branches) == 2 branch_artifact1_name = f"releases/{artifact1['version']}".encode() assert b"HEAD" in snapshot.branches assert branch_artifact1_name in snapshot.branches assert snapshot.branches[b"HEAD"].target == branch_artifact1_name # second loading with a second artifact artifact2 = GNU_ARTIFACTS[1] loader = ArchiveLoader(swh_storage, URL, [artifact2], snapshot_append=True) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None assert_last_visit_matches(swh_storage, URL, status="full", type="tar") # check expected snapshot, should contain a new branch and the # branch for the first artifact
snapshot = loader.last_snapshot() assert len(snapshot.branches) == 3 branch_artifact2_name = f"releases/{artifact2['version']}".encode() assert b"HEAD" in snapshot.branches assert branch_artifact2_name in snapshot.branches assert branch_artifact1_name in snapshot.branches assert snapshot.branches[b"HEAD"].target == branch_artifact2_name def test_archive_snapshot_append_branch_override(swh_storage, requests_mock_datadir): # first loading for a first artifact artifact1 = GNU_ARTIFACTS[0] loader = ArchiveLoader(swh_storage, URL, [artifact1], snapshot_append=True) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None assert_last_visit_matches(swh_storage, URL, status="full", type="tar") # check expected snapshot snapshot = loader.last_snapshot() assert len(snapshot.branches) == 2 branch_artifact1_name = f"releases/{artifact1['version']}".encode() assert branch_artifact1_name in snapshot.branches branch_target_first_visit = snapshot.branches[branch_artifact1_name].target # second loading for a second artifact with same version as the first one # but with different tarball content artifact2 = dict(GNU_ARTIFACTS[0]) artifact2["url"] = GNU_ARTIFACTS[1]["url"] artifact2["time"] = GNU_ARTIFACTS[1]["time"] artifact2["length"] = GNU_ARTIFACTS[1]["length"] loader = ArchiveLoader(swh_storage, URL, [artifact2], snapshot_append=True) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None assert_last_visit_matches(swh_storage, URL, status="full", type="tar") # check expected snapshot, should contain the same branch as previously # but with different target snapshot = loader.last_snapshot() assert len(snapshot.branches) == 2 assert branch_artifact1_name in snapshot.branches branch_target_second_visit = snapshot.branches[branch_artifact1_name].target assert branch_target_first_visit != branch_target_second_visit @pytest.fixture def not_gzipped_tarball_bytes(datadir): return Path(datadir, "not_gzipped_tarball.tar.gz").read_bytes() def test_archive_not_gzipped_tarball( swh_storage, requests_mock, not_gzipped_tarball_bytes ): """Check that a tarball erroneously marked as gzip compressed can still be downloaded and processed. 
""" filename = "not_gzipped_tarball.tar.gz" url = f"https://example.org/ftp/{filename}" requests_mock.get( url, [ { "exc": ContentDecodingError, }, { "body": BytesIO(not_gzipped_tarball_bytes), }, ], ) loader = ArchiveLoader( swh_storage, url, artifacts=[ { "time": 944729610, "url": url, "length": 221837, "filename": filename, "version": "0.1.0", } ], ) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None snapshot = loader.last_snapshot() assert len(snapshot.branches) == 2 assert b"releases/0.1.0" in snapshot.branches + + +def test_archive_visit_no_time_for_tarball(swh_storage, requests_mock_datadir): + artifacts = copy.deepcopy(GNU_ARTIFACTS) + for artifact in artifacts: + artifact["time"] = None + + loader = ArchiveLoader(swh_storage, URL, artifacts=artifacts) + + actual_load_status = loader.load() + assert actual_load_status["status"] == "eventful" + + assert_last_visit_matches(swh_storage, URL, status="full", type="tar") diff --git a/swh/loader/package/aur/__init__.py b/swh/loader/package/aur/__init__.py new file mode 100644 index 0000000..1682976 --- /dev/null +++ b/swh/loader/package/aur/__init__.py @@ -0,0 +1,17 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +from typing import Any, Mapping + + +def register() -> Mapping[str, Any]: + """Register the current worker module's definition""" + from .loader import AurLoader + + return { + "task_modules": [f"{__name__}.tasks"], + "loader": AurLoader, + } diff --git a/swh/loader/package/aur/loader.py b/swh/loader/package/aur/loader.py new file mode 100644 index 0000000..24577be --- /dev/null +++ b/swh/loader/package/aur/loader.py @@ -0,0 +1,160 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information +from distutils.version import LooseVersion +from pathlib import Path +import re +from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple + +import attr + +from swh.loader.package.loader import BasePackageInfo, PackageLoader +from swh.loader.package.utils import EMPTY_AUTHOR, release_name +from swh.model.model import ObjectType, Release, Sha1Git, TimestampWithTimezone +from swh.storage.interface import StorageInterface + + +@attr.s +class AurPackageInfo(BasePackageInfo): + + name = attr.ib(type=str) + """Name of the package""" + + version = attr.ib(type=str) + """Current version""" + + last_modified = attr.ib(type=str) + """File last modified date as release date""" + + +def extract_intrinsic_metadata(dir_path: Path) -> Dict[str, Any]: + """Extract intrinsic metadata from .SRCINFO file at dir_path. + + Each Aur package has a .SRCINFO file at the root of the archive. 
+ + Args: + dir_path: A directory on disk where a package has been extracted + + Returns: + A dict mapping .SRCINFO field names to values (a list when a field is repeated) + """ + assert dir_path.exists() + # top directory from extracted archive is always named with the package name + (pkgname,) = [elt.name for elt in dir_path.iterdir() if elt.is_dir()] + srcinfo_path = Path(dir_path, pkgname, ".SRCINFO") + rex = re.compile(r"^(\w+)\s=\s(.*)$", re.M) + with srcinfo_path.open("r") as content: + # Except for the first and last lines, lines may start with a tab; remove them + srcinfo = content.read().replace("\t", "") + parsed = rex.findall(srcinfo) + data: Dict[str, Any] = {} + for (k, v) in parsed: + if k in data: + if type(data[k]) is not list: + data[k] = [data[k]] + data[k].append(v) + else: + data[k] = v + return data + + +class AurLoader(PackageLoader[AurPackageInfo]): + visit_type = "aur" + + def __init__( + self, + storage: StorageInterface, + url: str, + artifacts: List[Dict[str, Any]], + aur_metadata: List[Dict[str, Any]], + **kwargs, + ): + + super().__init__(storage=storage, url=url, **kwargs) + self.url = url + self.artifacts: Dict[str, Dict] = { + artifact["version"]: artifact for artifact in artifacts + } + self.aur_metadata: Dict[str, Dict] = { + meta["version"]: meta for meta in aur_metadata + } + + def get_versions(self) -> Sequence[str]: + """Get all released versions of an Aur package + + Returns: + A sequence of versions + + Example:: + + ["0.1.1", "0.10.2"] + """ + versions = list(self.artifacts) + versions.sort(key=LooseVersion) + return versions + + def get_default_version(self) -> str: + """Get the newest release version of an Aur package + + Returns: + A string representing a version + + Example:: + + "0.1.2" + """ + return self.get_versions()[-1] + + def get_package_info(self, version: str) -> Iterator[Tuple[str, AurPackageInfo]]: + """Get release name and package information from version + + Args: + version: aur version (e.g. "0.1.0") + + Returns: + Iterator of tuple (release_name, p_info) + """ + artifact = self.artifacts[version] + assert version == artifact["version"] + data = self.aur_metadata[version] + + url = artifact["url"] + filename = artifact["filename"] + + p_info = AurPackageInfo( + name=data["pkgname"], + filename=filename, + url=url, + version=version, + last_modified=data["last_update"], + ) + yield release_name(version, filename), p_info + + def build_release( + self, p_info: AurPackageInfo, uncompressed_path: str, directory: Sha1Git + ) -> Optional[Release]: + intrinsic_metadata = extract_intrinsic_metadata(Path(uncompressed_path)) + author = EMPTY_AUTHOR + description: str = "" + assert intrinsic_metadata["pkgdesc"] + + if type(intrinsic_metadata["pkgdesc"]) is list: + description = "\n".join(intrinsic_metadata["pkgdesc"]) + else: + description = intrinsic_metadata["pkgdesc"] + + message = ( + f"Synthetic release for Aur source package {p_info.name} " + f"version {p_info.version}\n\n" + f"{description}\n" + ) + return Release( + name=p_info.version.encode(), + author=author, + date=TimestampWithTimezone.from_iso8601(p_info.last_modified), + message=message.encode(), + target_type=ObjectType.DIRECTORY, + target=directory, + synthetic=True, + ) diff --git a/swh/loader/package/crates/tasks.py b/swh/loader/package/aur/tasks.py similarity index 51% copy from swh/loader/package/crates/tasks.py copy to swh/loader/package/aur/tasks.py index 9385263..9cfb24b 100644 --- a/swh/loader/package/crates/tasks.py +++ b/swh/loader/package/aur/tasks.py @@ -1,14 +1,14 @@ # Copyright (C) 2022 The Software Heritage developers # See the
AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from celery import shared_task -from swh.loader.package.crates.loader import CratesLoader +from swh.loader.package.aur.loader import AurLoader -@shared_task(name=__name__ + ".LoadCrates") -def load_crates(*, url=None, artifacts: list): - """Load Rust crate package""" - return CratesLoader.from_configfile(url=url, artifacts=artifacts).load() +@shared_task(name=__name__ + ".LoadAur") +def load_aur(**kwargs): + """Load Arch User Repository packages""" + return AurLoader.from_configfile(**kwargs).load() diff --git a/swh/loader/package/aur/tests/__init__.py b/swh/loader/package/aur/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/swh/loader/package/aur/tests/data/fake_aur.sh b/swh/loader/package/aur/tests/data/fake_aur.sh new file mode 100755 index 0000000..70e6844 --- /dev/null +++ b/swh/loader/package/aur/tests/data/fake_aur.sh @@ -0,0 +1,187 @@ +#!/usr/bin/env bash + +# Script to generate fake Aur package files and servable directories. + +set -euo pipefail + +# Create directories +readonly TMP=tmp_dir/aur +readonly BASE_URL=https_aur.archlinux.org +readonly SNAPSHOT_PREFIX=cgit_aur.git_snapshot + +mkdir -p $TMP +mkdir -p $BASE_URL + +cd $TMP + +mkdir 'hg-evolve' +echo -e '''pkgbase = hg-evolve + pkgdesc = Flexible evolution of Mercurial history + pkgver = 10.5.2 + pkgrel = 1 + url = https://www.mercurial-scm.org/doc/evolution/ + arch = any + license = GPL2 + makedepends = python-build + makedepends = python-installer + makedepends = python-wheel + depends = mercurial + source = https://files.pythonhosted.org/packages/source/h/hg-evolve/hg-evolve-10.5.2.tar.gz + sha512sums = 81a1cc1202ffaf364fde70c6a36e32330e93aa69c9b9f7e11fbc11f988f7fb302d8b79414c644d274fedb7f0a67e10c4344c0206a1424f2bb97ae2cb11a51315 + +pkgname = hg-evolve +''' > hg-evolve/.SRCINFO + +mkdir 'ibus-git' +echo -e '''pkgbase = ibus-git + pkgdesc = Next Generation Input Bus for Linux + pkgver = 1.5.23+12+gef4c5c7e + pkgrel = 1 + url = https://github.com/ibus/ibus/wiki + arch = x86_64 + license = LGPL + makedepends = gobject-introspection + makedepends = vala + makedepends = intltool + makedepends = gnome-common + makedepends = gtk-doc + makedepends = gtk2 + makedepends = qt5-base + makedepends = unicode-cldr + makedepends = unicode-character-database + makedepends = unicode-emoji + makedepends = git + depends = dconf + depends = gtk3 + depends = hicolor-icon-theme + depends = libnotify + depends = python-dbus + depends = python-gobject + depends = iso-codes + depends = librsvg + options = !emptydirs + source = ibus::git+https://github.com/ibus/ibus + sha512sums = SKIP + +pkgname = ibus-git + depends = dconf + depends = gtk3 + depends = hicolor-icon-theme + depends = libnotify + depends = python-dbus + depends = python-gobject + depends = iso-codes + depends = librsvg + depends = libibus-git=1.5.23+12+gef4c5c7e + provides = ibus + conflicts = ibus + +pkgname = libibus-git + pkgdesc = IBus support library + depends = libglib-2.0.so + depends = libgobject-2.0.so + depends = libgio-2.0.so + provides = libibus + provides = libibus-1.0.so + conflicts = libibus +''' > ibus-git/.SRCINFO + +mkdir 'libervia-web-hg' +echo -e '''pkgbase = libervia-web-hg + pkgdesc = Salut à Toi, multi-frontends multi-purposes XMPP client (Web interface) + pkgver = 0.9.0.r1492.3a34d78f2717 + pkgrel = 1 + url = http://salut-a-toi.org/ + install =
libervia-web-hg.install + arch = any + license = AGPL3 + makedepends = python-setuptools + makedepends = mercurial + depends = python + depends = python-jinja + depends = python-shortuuid-git + depends = libervia-media-hg + depends = libervia-backend-hg + depends = libervia-templates-hg + depends = python-zope-interface + depends = python-pyopenssl + depends = python-autobahn + depends = dbus + depends = python-brython + provides = libervia-web + options = !strip + source = hg+https://repos.goffi.org/libervia + md5sums = SKIP + +pkgname = libervia-web-hg +''' > libervia-web-hg/.SRCINFO + +mkdir 'tealdeer-git' +echo -e '''# Generated by mksrcinfo v8 +# Fri Sep 4 20:36:25 UTC 2020 +pkgbase = tealdeer-git + pkgdesc = A fast tldr client in Rust. + pkgver = r255.30b7c5f + pkgrel = 1 + url = https://github.com/dbrgn/tealdeer + arch = x86_64 + arch = armv6h + arch = armv7h + arch = aarch64 + license = MIT + license = Apache + makedepends = git + makedepends = rust + makedepends = cargo + depends = openssl + provides = tldr + conflicts = tldr + options = !emptydirs + source = git+https://github.com/dbrgn/tealdeer + sha256sums = SKIP + +pkgname = tealdeer-git +''' > tealdeer-git/.SRCINFO + +mkdir 'a-fake-one' +echo -e '''# This one does not exist +# For test purposes, in particular the multi-key, multi-line edge case +pkgbase = a-fake-one + pkgdesc = A first line of description. + pkgdesc = A second line for more information. + pkgver = 0.0.1 + pkgrel = 1 + url = https://nowhere/a-fake-one + url = https://mirror/a-fake-one + arch = x86_64 + arch = armv6h + arch = armv7h + arch = aarch64 + license = MIT + license = Apache + makedepends = git + makedepends = rust + makedepends = cargo + depends = openssl + provides = a-fake-one + conflicts = a-fake-one + options = !emptydirs + source = git+https://nowhere/a-fake-one + sha256sums = SKIP + +pkgname = a-fake-one +''' > a-fake-one/.SRCINFO + +# Compress package folders into .tar.gz archives +tar -czf ${SNAPSHOT_PREFIX}_hg-evolve.tar.gz hg-evolve +tar -czf ${SNAPSHOT_PREFIX}_ibus-git.tar.gz ibus-git +tar -czf ${SNAPSHOT_PREFIX}_libervia-web-hg.tar.gz libervia-web-hg +tar -czf ${SNAPSHOT_PREFIX}_tealdeer-git.tar.gz tealdeer-git +tar -czf ${SNAPSHOT_PREFIX}_a-fake-one.tar.gz a-fake-one + +# Move .tar.gz archives to a servable directory +mv *.tar.gz ../../$BASE_URL + +# Clean up by removing tmp_dir +cd ../../ +rm -r tmp_dir/ diff --git a/swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_a-fake-one.tar.gz b/swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_a-fake-one.tar.gz new file mode 100644 index 0000000..f193cc6 Binary files /dev/null and b/swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_a-fake-one.tar.gz differ diff --git a/swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_hg-evolve.tar.gz b/swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_hg-evolve.tar.gz new file mode 100644 index 0000000..d95aa4e Binary files /dev/null and b/swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_hg-evolve.tar.gz differ diff --git a/swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_ibus-git.tar.gz b/swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_ibus-git.tar.gz new file mode 100644 index 0000000..99cb97b Binary files /dev/null and b/swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_ibus-git.tar.gz differ
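The `a-fake-one` fixture above exists precisely to exercise the repeated-key handling in extract_intrinsic_metadata(): a .SRCINFO field such as pkgdesc or arch that appears several times must be folded into a list. The following is a minimal, self-contained sketch of that parsing step, reusing the loader's regex on an abridged copy of the a-fake-one fixture:

import re
from typing import Any, Dict

SRCINFO_SAMPLE = """\
pkgbase = a-fake-one
\tpkgdesc = A first line of description.
\tpkgdesc = A second line for more information.
\tpkgver = 0.0.1
pkgname = a-fake-one
"""

def parse_srcinfo(srcinfo: str) -> Dict[str, Any]:
    # .SRCINFO lines may start with a tab; strip tabs so every line
    # matches the "key = value" pattern (same regex as the loader)
    srcinfo = srcinfo.replace("\t", "")
    data: Dict[str, Any] = {}
    for key, value in re.findall(r"^(\w+)\s=\s(.*)$", srcinfo, re.M):
        if key in data:
            # a repeated key (pkgdesc, arch, license, ...) becomes a list
            if not isinstance(data[key], list):
                data[key] = [data[key]]
            data[key].append(value)
        else:
            data[key] = value
    return data

assert parse_srcinfo(SRCINFO_SAMPLE)["pkgdesc"] == [
    "A first line of description.",
    "A second line for more information.",
]

AurLoader.build_release() then joins such a list with newlines, which is exactly what test_aur_parse_srcinfo further below checks in the expected release message.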
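A design note on version ordering, since both new loaders sort their artifact versions: AurLoader.get_versions() sorts with distutils' LooseVersion, while the crates loader further below uses StrictVersion. AUR version strings (pkgver-pkgrel values such as "1.5.23+12+gef4c5c7e-1" or "r255.30b7c5f-1") do not parse as strict versions, so only a loose comparison works there. A short illustrative sketch (the version list is made up):

from distutils.version import LooseVersion, StrictVersion

versions = ["0.1.1", "0.10.2", "0.2.0"]

# A plain string sort is lexicographic and misplaces "0.10.2":
assert sorted(versions) == ["0.1.1", "0.10.2", "0.2.0"]

# LooseVersion compares numeric components, giving the intended order:
assert sorted(versions, key=LooseVersion) == ["0.1.1", "0.2.0", "0.10.2"]

# LooseVersion tolerates AUR-style strings; StrictVersion rejects them:
LooseVersion("1.5.23+12+gef4c5c7e-1")  # parses without error
try:
    StrictVersion("1.5.23+12+gef4c5c7e-1")
except ValueError:
    pass  # "invalid version number", hence LooseVersion in AurLoader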
diff --git a/swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_libervia-web-hg.tar.gz b/swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_libervia-web-hg.tar.gz new file mode 100644 index 0000000..a02d15d Binary files /dev/null and b/swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_libervia-web-hg.tar.gz differ diff --git a/swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_tealdeer-git.tar.gz b/swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_tealdeer-git.tar.gz new file mode 100644 index 0000000..4ff29c4 Binary files /dev/null and b/swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_tealdeer-git.tar.gz differ diff --git a/swh/loader/package/aur/tests/test_aur.py b/swh/loader/package/aur/tests/test_aur.py new file mode 100644 index 0000000..86e69ee --- /dev/null +++ b/swh/loader/package/aur/tests/test_aur.py @@ -0,0 +1,276 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information +from swh.loader.package.aur.loader import AurLoader +from swh.loader.package.utils import EMPTY_AUTHOR +from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats +from swh.model.hashutil import hash_to_bytes +from swh.model.model import ( + ObjectType, + Release, + Snapshot, + SnapshotBranch, + TargetType, + TimestampWithTimezone, +) + +EXPECTED_PACKAGES = [ + { + "url": "https://aur.archlinux.org/hg-evolve.git", + "artifacts": [ + { + "filename": "hg-evolve.tar.gz", + "url": "https://aur.archlinux.org/cgit/aur.git/snapshot/hg-evolve.tar.gz", # noqa: B950 + "version": "10.5.2-1", + } + ], + "aur_metadata": [ + { + "version": "10.5.2-1", + "project_url": "https://www.mercurial-scm.org/doc/evolution/", + "last_update": "2022-07-16T00:08:41+00:00", + "pkgname": "hg-evolve", + } + ], + }, + { + "url": "https://aur.archlinux.org/ibus-git.git", + "artifacts": [ + { + "filename": "ibus-git.tar.gz", + "url": "https://aur.archlinux.org/cgit/aur.git/snapshot/ibus-git.tar.gz", # noqa: B950 + "version": "1.5.23+12+gef4c5c7e-1", + } + ], + "aur_metadata": [ + { + "version": "1.5.23+12+gef4c5c7e-1", + "project_url": "https://github.com/ibus/ibus/wiki", + "last_update": "2021-02-08T06:12:11+00:00", + "pkgname": "ibus-git", + } + ], + }, + { + "url": "https://aur.archlinux.org/libervia-web-hg.git", + "artifacts": [ + { + "filename": "libervia-web-hg.tar.gz", + "url": "https://aur.archlinux.org/cgit/aur.git/snapshot/libervia-web-hg.tar.gz", # noqa: B950 + "version": "0.9.0.r1492.3a34d78f2717-1", + } + ], + "aur_metadata": [ + { + "version": "0.9.0.r1492.3a34d78f2717-1", + "project_url": "http://salut-a-toi.org/", + "last_update": "2022-02-26T15:30:58+00:00", + "pkgname": "libervia-web-hg", + } + ], + }, + { + "url": "https://aur.archlinux.org/tealdeer-git.git", + "artifacts": [ + { + "filename": "tealdeer-git.tar.gz", + "url": "https://aur.archlinux.org/cgit/aur.git/snapshot/tealdeer-git.tar.gz", # noqa: B950 + "version": "r255.30b7c5f-1", + } + ], + "aur_metadata": [ + { + "version": "r255.30b7c5f-1", + "project_url": "https://github.com/dbrgn/tealdeer", + "last_update": "2020-09-04T20:36:52+00:00", + "pkgname": "tealdeer-git", + } + ], + }, + { + "url": "https://aur.archlinux.org/a-fake-one.git", + "artifacts": [ + { + "filename": 
"a-fake-one.tar.gz", + "url": "https://aur.archlinux.org/cgit/aur.git/snapshot/a-fake-one.tar.gz", # noqa: B950 + "version": "0.0.1", + }, + ], + "aur_metadata": [ + { + "version": "0.0.1", + "project_url": "https://nowhere/a-fake-one", + "last_update": "2022-02-02T22:22:22+00:00", + "pkgname": "a-fake-one", + } + ], + }, +] + + +def test_get_versions(swh_storage): + loader = AurLoader( + swh_storage, + url=EXPECTED_PACKAGES[0]["url"], + artifacts=EXPECTED_PACKAGES[0]["artifacts"], + aur_metadata=EXPECTED_PACKAGES[0]["aur_metadata"], + ) + + assert loader.get_versions() == [ + "10.5.2-1", + ] + + +def test_get_default_version(requests_mock_datadir, swh_storage): + loader = AurLoader( + swh_storage, + url=EXPECTED_PACKAGES[0]["url"], + artifacts=EXPECTED_PACKAGES[0]["artifacts"], + aur_metadata=EXPECTED_PACKAGES[0]["aur_metadata"], + ) + assert loader.get_default_version() == "10.5.2-1" + + +def test_aur_loader_load_one_version(datadir, requests_mock_datadir, swh_storage): + loader = AurLoader( + swh_storage, + url=EXPECTED_PACKAGES[0]["url"], + artifacts=EXPECTED_PACKAGES[0]["artifacts"], + aur_metadata=EXPECTED_PACKAGES[0]["aur_metadata"], + ) + actual_load_status = loader.load() + assert actual_load_status["status"] == "eventful" + assert actual_load_status["snapshot_id"] is not None + + expected_snapshot_id = "fb9ff853036ea48c94f5e5366a9e49d7610d98ed" + expected_release_id = "35ddfe3106bb47f259a9316898de5cab5bf15864" + + assert expected_snapshot_id == actual_load_status["snapshot_id"] + + expected_snapshot = Snapshot( + id=hash_to_bytes(actual_load_status["snapshot_id"]), + branches={ + b"releases/10.5.2-1/hg-evolve.tar.gz": SnapshotBranch( + target=hash_to_bytes(expected_release_id), + target_type=TargetType.RELEASE, + ), + b"HEAD": SnapshotBranch( + target=b"releases/10.5.2-1/hg-evolve.tar.gz", + target_type=TargetType.ALIAS, + ), + }, + ) + + check_snapshot(expected_snapshot, swh_storage) + + stats = get_stats(swh_storage) + assert { + "content": 1, + "directory": 2, + "origin": 1, + "origin_visit": 1, + "release": 1, + "revision": 0, + "skipped_content": 0, + "snapshot": 1, + } == stats + + assert swh_storage.release_get([hash_to_bytes(expected_release_id)])[0] == Release( + name=b"10.5.2-1", + message=b"Synthetic release for Aur source package hg-evolve version " + b"10.5.2-1\n\nFlexible evolution of Mercurial history\n", + target=hash_to_bytes("cc4079be57e7cc0dbf2ecc76c81f5d84782ba632"), + target_type=ObjectType.DIRECTORY, + synthetic=True, + author=EMPTY_AUTHOR, + date=TimestampWithTimezone.from_iso8601("2022-07-16T00:08:41+00:00"), + id=hash_to_bytes(expected_release_id), + ) + + assert_last_visit_matches( + swh_storage, + url=EXPECTED_PACKAGES[0]["url"], + status="full", + type="aur", + snapshot=expected_snapshot.id, + ) + + +def test_aur_loader_load_expected_packages(datadir, requests_mock_datadir, swh_storage): + # Exclude the last 'fake' package from EXPECTED_PACKAGES + for package in EXPECTED_PACKAGES[:-1]: + loader = AurLoader( + swh_storage, + url=package["url"], + artifacts=package["artifacts"], + aur_metadata=package["aur_metadata"], + ) + actual_load_status = loader.load() + assert actual_load_status["status"] == "eventful" + assert actual_load_status["snapshot_id"] is not None + + stats = get_stats(swh_storage) + assert { + "content": 1 + 1 + 1 + 1, + "directory": 2 + 2 + 2 + 2, + "origin": 1 + 1 + 1 + 1, + "origin_visit": 1 + 1 + 1 + 1, + "release": 1 + 1 + 1 + 1, + "revision": 0, + "skipped_content": 0, + "snapshot": 1 + 1 + 1 + 1, + } == stats + + +def 
test_aur_invalid_origin_not_found(swh_storage, requests_mock_datadir): + url = "http://nowhere/packages/42.git" + loader = AurLoader( + swh_storage, + url, + artifacts=[ + { + "version": "0.0.1", + "url": "https://myforge.nowhere/42/42.tar.gz", + "filename": "42.tar.gz", + }, + ], + aur_metadata=[ + { + "pkgname": "42", + "version": "0.0.1", + "project_url": "https://myforge.nowhere/42", + "last_update": "2022-04-07T21:08:14", + }, + ], + ) + + load_status = loader.load() + assert load_status["status"] == "uneventful" + + +def test_aur_parse_srcinfo(swh_storage, requests_mock_datadir): + """Ensure that multiple `pkgdesc` lines in .SRCINFO result in a multi-line + `description` string""" + + loader = AurLoader( + swh_storage, + url=EXPECTED_PACKAGES[-1]["url"], + artifacts=EXPECTED_PACKAGES[-1]["artifacts"], + aur_metadata=EXPECTED_PACKAGES[-1]["aur_metadata"], + ) + loader.load() + + expected_release_id = "2af50761854fee5589b75ff0ecd6886d1185377e" + + assert swh_storage.release_get([hash_to_bytes(expected_release_id)])[0] == Release( + name=b"0.0.1", + message=b"Synthetic release for Aur source package a-fake-one version 0.0.1\n\n" + b"A first line of description.\nA second line for more information.\n", + target=hash_to_bytes("82c770b7d8b1aa573e57b13864831e141d40fe26"), + target_type=ObjectType.DIRECTORY, + synthetic=True, + author=EMPTY_AUTHOR, + date=TimestampWithTimezone.from_iso8601("2022-02-02T22:22:22+00:00"), + id=hash_to_bytes(expected_release_id), + ) diff --git a/swh/loader/package/aur/tests/test_tasks.py b/swh/loader/package/aur/tests/test_tasks.py new file mode 100644 index 0000000..b3ebafa --- /dev/null +++ b/swh/loader/package/aur/tests/test_tasks.py @@ -0,0 +1,38 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def test_tasks_aur_loader( + mocker, swh_scheduler_celery_app, swh_scheduler_celery_worker, swh_config +): + mock_load = mocker.patch("swh.loader.package.aur.loader.AurLoader.load") + mock_load.return_value = {"status": "eventful"} + + res = swh_scheduler_celery_app.send_task( + "swh.loader.package.aur.tasks.LoadAur", + kwargs=dict( + url="https://somewhere/some-package.git", + artifacts=[ + { + "filename": "some-package.tar.gz", + "url": "https://somewhere/some-package.tar.gz", + "version": "0.0.1", + } + ], + aur_metadata=[ + { + "version": "0.0.1", + "project_url": "https://somewhere/some-package", + "last_update": "1970-01-01T21:08:14", + "pkgname": "some-package", + } + ], + ), + ) + assert res + res.wait() + assert res.successful() + assert mock_load.called + assert res.result == {"status": "eventful"} diff --git a/swh/loader/package/crates/loader.py b/swh/loader/package/crates/loader.py index c61d126..a2ebc2b 100644 --- a/swh/loader/package/crates/loader.py +++ b/swh/loader/package/crates/loader.py @@ -1,354 +1,354 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from distutils.version import StrictVersion import json from pathlib import Path from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple from urllib.parse import urlparse import attr import toml from typing_extensions import TypedDict from swh.loader.package.loader import BasePackageInfo,
PackageLoader from swh.loader.package.utils import api_info, cached_method, release_name from swh.model.model import ObjectType, Person, Release, Sha1Git, TimestampWithTimezone from swh.storage.interface import StorageInterface class ExtrinsicPackageMetadata(TypedDict): """Data structure for package extrinsic metadata pulled from http api endpoint. We set only the keys we need according to what is available when querying https://crates.io/api/v1/crates/<name>, where `name` is the name of the crate package (see JSON response example at https://crates.io/api/v1/crates/hg-core). Usage example: .. code-block:: python e_metadata = ExtrinsicPackageMetadata(**self.info()) """ # noqa categories: List[Dict[Any, Any]] """Related categories""" crate: Dict[Any, Any] """Crate project information""" keywords: List[Any] """Keywords""" versions: List[Dict[Any, Any]] """A list of released versions for a crate""" class ExtrinsicVersionPackageMetadata(TypedDict): """Data structure for specific package version extrinsic metadata, pulled from http api endpoint. Similar to `ExtrinsicPackageMetadata` in its usage, but we flatten the data related to a specific version. """ crate: str """The package name""" crate_size: int """The package size""" created_at: str """First released at""" downloads: str """Number of downloads""" license: str """Package license""" num: str """Package version""" published_by: Dict[Any, Any] """Publishers information""" updated_at: str """Last update""" yanked: bool """Is that version yanked? (yanked means release-level deprecation)""" class IntrinsicPackageMetadata(TypedDict): """Data structure for specific package version intrinsic metadata. Data is extracted from the crate package's .toml file. Then the data of the 'package' entry is flattened. Cargo.toml file content example: .. code-block:: toml [package] name = "hg-core" version = "0.0.1" authors = ["Georges Racinet "] description = "Mercurial pure Rust core library, with no assumption on Python bindings (FFI)" homepage = "https://mercurial-scm.org" license = "GPL-2.0-or-later" repository = "https://www.mercurial-scm.org/repo/hg" [lib] name = "hg" [dev-dependencies.rand] version = "~0.6" [dev-dependencies.rand_pcg] version = "~0.1" :param toml: toml object """ name: str """The package name""" version: str """Package version""" authors: List[str] """Authors""" description: str """Package and release description""" homepage: str """Homepage of the project""" license: str """Package license""" repository: str """Source code repository""" @attr.s class CratesPackageInfo(BasePackageInfo): name = attr.ib(type=str) """Name of the package""" version = attr.ib(type=str) """Current version""" e_metadata: Dict[str, Any] = attr.ib(factory=ExtrinsicPackageMetadata) """Extrinsic package metadata, common to all versions""" e_metadata_version: Dict[str, Any] = attr.ib( factory=ExtrinsicVersionPackageMetadata ) """Extrinsic package metadata specific to a version""" i_metadata: Dict[str, Any] = attr.ib(factory=IntrinsicPackageMetadata) """Intrinsic metadata of the current package version""" def extract_intrinsic_metadata(dir_path: Path) -> Dict[str, Any]: """Extract intrinsic metadata from Cargo.toml file at dir_path. Each crate archive has a Cargo.toml at the root of the archive.
Args: dir_path: A directory on disk where a Cargo.toml must be present Returns: A dict as returned by the toml parser """ return toml.load(dir_path / "Cargo.toml") def extract_author(p_info: CratesPackageInfo) -> Person: """Extract package author from intrinsic metadata and return it as a `Person` model. Args: p_info: CratesPackageInfo that should contain i_metadata entries Returns: Only one author (Person) of the package. Currently limited by internal detail of the swh stack (see T3887). """ authors = p_info.i_metadata["authors"] fullname = authors[0] # TODO: here we have a list of authors, see T3887 return Person.from_fullname(fullname.encode()) def extract_description(p_info: CratesPackageInfo) -> str: """Extract package description from intrinsic metadata and return it as a string. Args: p_info: CratesPackageInfo that should contain i_metadata entries Returns: Package description from metadata. """ return p_info.i_metadata["description"] class CratesLoader(PackageLoader[CratesPackageInfo]): """Load Crates package origins into swh archive.""" visit_type = "crates" def __init__( self, storage: StorageInterface, url: str, artifacts: List[Dict[str, Any]], - max_content_size: Optional[int] = None, + **kwargs, ): """Constructor Args: url: Origin url (e.g. https://crates.io/api/v1/crates/) artifacts: A list of dict listing all existing released versions for a package (Usually set with crates lister `extra_loader_arguments`). Each entry is a dict that should have an `url` (where to download package specific version) and a `version` entry. Example:: [ { "version": <version>, "url": "https://static.crates.io/crates/<name>/<name>-<version>.crate", } ] """ # noqa - super().__init__(storage=storage, url=url, max_content_size=max_content_size) + super().__init__(storage=storage, url=url, **kwargs) self.url = url self.artifacts: Dict[str, Dict] = { artifact["version"]: artifact for artifact in artifacts } @cached_method def _raw_info(self) -> bytes: """Get crate metadata (fetched from http api endpoint set as self.url) Returns: Content response as bytes. Content response is a json document.
""" return api_info(self.url) @cached_method def info(self) -> Dict: """Parse http api json response and return the crate metadata information as a Dict.""" return json.loads(self._raw_info()) def get_versions(self) -> Sequence[str]: """Get all released versions of a crate Returns: A sequence of versions Example:: ["0.1.1", "0.10.2"] """ versions = list(self.artifacts.keys()) versions.sort(key=StrictVersion) return versions def get_default_version(self) -> str: """Get the newest release version of a crate Returns: A string representing a version Example:: "0.1.2" """ return self.get_versions()[-1] def get_package_info(self, version: str) -> Iterator[Tuple[str, CratesPackageInfo]]: """Get release name and package information from version Args: version: crate version (e.g: "0.1.0") Returns: Iterator of tuple (release_name, p_info) """ artifact = self.artifacts[version] filename = artifact["filename"] package_name = urlparse(self.url).path.split("/")[-1] url = artifact["url"] # Get extrinsic metadata from http api e_metadata = ExtrinsicPackageMetadata(**self.info()) # type: ignore[misc] # Extract crate info for current version (One .crate file for a given version) (crate_version,) = [ crate for crate in e_metadata["versions"] if crate["num"] == version ] e_metadata_version = ExtrinsicVersionPackageMetadata( # type: ignore[misc] **crate_version ) p_info = CratesPackageInfo( name=package_name, filename=filename, url=url, version=version, e_metadata=e_metadata, e_metadata_version=e_metadata_version, ) yield release_name(version, filename), p_info def build_release( self, p_info: CratesPackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Release]: # Extract intrinsic metadata from dir_path/Cargo.toml name = p_info.name version = p_info.version dir_path = Path(uncompressed_path, f"{name}-{version}") i_metadata_raw = extract_intrinsic_metadata(dir_path) # Get only corresponding key of IntrinsicPackageMetadata i_metadata_keys = [k for k in IntrinsicPackageMetadata.__annotations__.keys()] # We use data only from "package" entry i_metadata = { k: v for k, v in i_metadata_raw["package"].items() if k in i_metadata_keys } p_info.i_metadata = IntrinsicPackageMetadata(**i_metadata) # type: ignore[misc] author = extract_author(p_info) description = extract_description(p_info) message = ( f"Synthetic release for Crate source package {p_info.name} " - f"version {p_info.version}\n" + f"version {p_info.version}\n\n" f"{description}\n" ) # The only way to get a value for updated_at is through extrinsic metadata updated_at = p_info.e_metadata_version.get("updated_at") return Release( name=version.encode(), author=author, date=TimestampWithTimezone.from_iso8601(updated_at), message=message.encode(), target_type=ObjectType.DIRECTORY, target=directory, synthetic=True, ) diff --git a/swh/loader/package/crates/tasks.py b/swh/loader/package/crates/tasks.py index 9385263..0b7e24c 100644 --- a/swh/loader/package/crates/tasks.py +++ b/swh/loader/package/crates/tasks.py @@ -1,14 +1,14 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from celery import shared_task from swh.loader.package.crates.loader import CratesLoader @shared_task(name=__name__ + ".LoadCrates") -def load_crates(*, url=None, artifacts: list): +def load_crates(**kwargs): """Load Rust crate package""" - return 
CratesLoader.from_configfile(url=url, artifacts=artifacts).load() + return CratesLoader.from_configfile(**kwargs).load() diff --git a/swh/loader/package/crates/tests/test_crates.py b/swh/loader/package/crates/tests/test_crates.py index 8d7f750..1ff76f7 100644 --- a/swh/loader/package/crates/tests/test_crates.py +++ b/swh/loader/package/crates/tests/test_crates.py @@ -1,287 +1,287 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import pytest from swh.loader.package.crates.loader import CratesLoader from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats from swh.model.hashutil import hash_to_bytes from swh.model.model import ( ObjectType, Person, Release, Snapshot, SnapshotBranch, TargetType, TimestampWithTimezone, ) CRATES_EXTRA = [ { "url": "https://crates.io/api/v1/crates/hg-core", "artifacts": [ { "checksums": { "sha256": "48a45b46c2a8c38348adb1205b13c3c5eb0174e0c0fec52cc88e9fb1de14c54d", # noqa: B950 }, "filename": "hg-core-0.0.1.crate", "url": "https://static.crates.io/crates/hg-core/hg-core-0.0.1.crate", "version": "0.0.1", }, ], }, { "url": "https://crates.io/api/v1/crates/micro-timer", "artifacts": [ { "checksums": { "sha256": "69ad8fd116f8af0298ae4e83e587b1600af12709022471e25581c3aeb1da77ce", # noqa: B950 }, "filename": "micro-timer-0.1.0.crate", "url": "https://static.crates.io/crates/micro-timer/micro-timer-0.1.0.crate", "version": "0.1.0", }, { "checksums": { "sha256": "7b3f65fe0e109daad8d47e1938c9b5f9353efacd86bbe7ff013f84ae7ca758bf", # noqa: B950 }, "filename": "micro-timer-0.1.1.crate", "url": "https://static.crates.io/crates/micro-timer/micro-timer-0.1.1.crate", "version": "0.1.1", }, { "checksums": { "sha256": "16439fea388f712c1df7737ceb8f784d407844624b4796faf1e1bf8bbaa97445", # noqa: B950 }, "filename": "micro-timer-0.1.2.crate", "url": "https://static.crates.io/crates/micro-timer/micro-timer-0.1.2.crate", "version": "0.1.2", }, { "checksums": { "sha256": "336b4c0f071d16674747faa4643d742cc096fec2bf8cf01bb1a98d984bedcaf1", # noqa: B950 }, "filename": "micro-timer-0.2.0.crate", "url": "https://static.crates.io/crates/micro-timer/micro-timer-0.2.0.crate", "version": "0.2.0", }, { "checksums": { "sha256": "987429cd6162a80ed5ff44fc790f5090b1c6d617ac73a2e272965ed91201d79b", # noqa: B950 }, "filename": "micro-timer-0.2.1.crate", "url": "https://static.crates.io/crates/micro-timer/micro-timer-0.2.1.crate", "version": "0.2.1", }, { "checksums": { "sha256": "25b31d6cb9112984323d05d7a353f272ae5d7a307074f9ab9b25c00121b8c947", # noqa: B950 }, "filename": "micro-timer-0.3.0.crate", "url": "https://static.crates.io/crates/micro-timer/micro-timer-0.3.0.crate", "version": "0.3.0", }, { "checksums": { "sha256": "2620153e1d903d26b72b89f0e9c48d8c4756cba941c185461dddc234980c298c", # noqa: B950 }, "filename": "micro-timer-0.3.1.crate", "url": "https://static.crates.io/crates/micro-timer/micro-timer-0.3.1.crate", "version": "0.3.1", }, { "checksums": { "sha256": "5de32cb59a062672560d6f0842c4aa7714727457b9fe2daf8987d995a176a405", # noqa: B950 }, "filename": "micro-timer-0.4.0.crate", "url": "https://static.crates.io/crates/micro-timer/micro-timer-0.4.0.crate", "version": "0.4.0", }, ], }, ] def test_get_versions(requests_mock_datadir, swh_storage): loader = CratesLoader( swh_storage, url=CRATES_EXTRA[1]["url"], artifacts=CRATES_EXTRA[1]["artifacts"], ) assert 
loader.get_versions() == [ "0.1.0", "0.1.1", "0.1.2", "0.2.0", "0.2.1", "0.3.0", "0.3.1", "0.4.0", ] def test_get_default_version(requests_mock_datadir, swh_storage): loader = CratesLoader( swh_storage, url=CRATES_EXTRA[1]["url"], artifacts=CRATES_EXTRA[1]["artifacts"], ) assert loader.get_default_version() == "0.4.0" def test_crate_invalid_origin_archive_not_found(swh_storage, requests_mock_datadir): url = "https://nowhere-to-run/nowhere-to-hide" loader = CratesLoader( swh_storage, url, artifacts=[ { "filename": "nowhere-to-hide-0.0.1.crate", "url": "https://nowhere-to-run/nowhere-to-hide-0.0.1.crate", "version": "0.0.1", }, ], ) with pytest.raises(Exception): assert loader.load() == {"status": "failed"} assert_last_visit_matches( swh_storage, url, status="not_found", type="crates", snapshot=None ) def test_crates_loader_load_one_version(datadir, requests_mock_datadir, swh_storage): loader = CratesLoader( swh_storage, url=CRATES_EXTRA[0]["url"], artifacts=CRATES_EXTRA[0]["artifacts"], ) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None - expected_snapshot_id = "353cd6858c88ee8210432ea1098993c2e9966561" - expected_release_id = "d578833534017430f1b93eb741620899620c2505" + expected_snapshot_id = "b3affb4949eb89b244f0e1d1fe235fc1d26bde76" + expected_release_id = "237c4cdd44a90e620795e5a07ebcc72bc82487f7" assert expected_snapshot_id == actual_load_status["snapshot_id"] expected_snapshot = Snapshot( id=hash_to_bytes(actual_load_status["snapshot_id"]), branches={ b"releases/0.0.1/hg-core-0.0.1.crate": SnapshotBranch( target=hash_to_bytes(expected_release_id), target_type=TargetType.RELEASE, ), b"HEAD": SnapshotBranch( target=b"releases/0.0.1/hg-core-0.0.1.crate", target_type=TargetType.ALIAS, ), }, ) check_snapshot(expected_snapshot, swh_storage) stats = get_stats(swh_storage) assert { "content": 1, "directory": 2, "origin": 1, "origin_visit": 1, "release": 1, "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats assert swh_storage.release_get([hash_to_bytes(expected_release_id)])[0] == Release( name=b"0.0.1", - message=b"Synthetic release for Crate source package hg-core version " - b"0.0.1\nMercurial pure Rust core library, with no assumption " + message=b"Synthetic release for Crate source package hg-core version 0.0.1\n\n" + b"Mercurial pure Rust core library, with no assumption " b"on Python bindings (FFI)\n", target=hash_to_bytes("674c3b0b54628d55b93a79dc7adf304efc01b371"), target_type=ObjectType.DIRECTORY, synthetic=True, author=Person.from_fullname(b"Georges Racinet "), date=TimestampWithTimezone.from_iso8601("2019-04-16T18:48:11.404457+00:00"), id=hash_to_bytes(expected_release_id), ) def test_crates_loader_load_n_versions(datadir, requests_mock_datadir, swh_storage): url = CRATES_EXTRA[1]["url"] loader = CratesLoader( swh_storage, url=CRATES_EXTRA[1]["url"], artifacts=CRATES_EXTRA[1]["artifacts"], ) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None - expected_snapshot_id = "016cbbe3bb78424c35b898015a2d80d79359e2ad" + expected_snapshot_id = "3f8ca5908a570fa32270b07a0946bcffa88babd5" assert expected_snapshot_id == actual_load_status["snapshot_id"] expected_snapshot = Snapshot( id=hash_to_bytes(expected_snapshot_id), branches={ b"releases/0.4.0/micro-timer-0.4.0.crate": SnapshotBranch( - target=hash_to_bytes("3237c1174c4ccfa8e934d1bfd8d80b3a89760e39"), + 
target=hash_to_bytes("b038a927244c852fb3794aecbebdc70f68ddf067"), target_type=TargetType.RELEASE, ), b"releases/0.3.1/micro-timer-0.3.1.crate": SnapshotBranch( - target=hash_to_bytes("8b727a280051cdb90468ede2746e176e6fdf355f"), + target=hash_to_bytes("ea331a2ce755e6f0cd9d05c9be52accde68536c4"), target_type=TargetType.RELEASE, ), b"releases/0.3.0/micro-timer-0.3.0.crate": SnapshotBranch( - target=hash_to_bytes("f45ec236ae50fb37e924a3d2cc093e72b6cbf1cd"), + target=hash_to_bytes("7ea45f915ace083ed361bb12593625bf4cf1f5f2"), target_type=TargetType.RELEASE, ), b"releases/0.2.1/micro-timer-0.2.1.crate": SnapshotBranch( - target=hash_to_bytes("50a60a2c3696df7cd1b623bd7dbea2c89b994e42"), + target=hash_to_bytes("074f27605be8b759e5d7c638f026aac3709f58e5"), target_type=TargetType.RELEASE, ), b"releases/0.2.0/micro-timer-0.2.0.crate": SnapshotBranch( - target=hash_to_bytes("f0592dc0ae05399d872017d0260c45b875cb590e"), + target=hash_to_bytes("a1d642aaa54c5361f67e57adbd86e01f3a3276f8"), target_type=TargetType.RELEASE, ), b"releases/0.1.2/micro-timer-0.1.2.crate": SnapshotBranch( - target=hash_to_bytes("9220d7823fc40ab44e3ae3227522e7de672fad3e"), + target=hash_to_bytes("60f18ae067ce235bc60243bf5cdaaae474b11978"), target_type=TargetType.RELEASE, ), b"releases/0.1.1/micro-timer-0.1.1.crate": SnapshotBranch( - target=hash_to_bytes("38529b7e355f79fdce31a3ba891e146174e10237"), + target=hash_to_bytes("fd6c55dfd016d58647a2d44b29a3fd4e3afa7671"), target_type=TargetType.RELEASE, ), b"releases/0.1.0/micro-timer-0.1.0.crate": SnapshotBranch( - target=hash_to_bytes("5e5e6120af55b65c577e09331df54e70fad5e8b0"), + target=hash_to_bytes("3e07559a4b366a397b1ca154e72753ce27223ca1"), target_type=TargetType.RELEASE, ), b"HEAD": SnapshotBranch( target=b"releases/0.4.0/micro-timer-0.4.0.crate", target_type=TargetType.ALIAS, ), }, ) check_snapshot(expected_snapshot, swh_storage) stats = get_stats(swh_storage) assert { "content": 8, "directory": 16, "origin": 1, "origin_visit": 1, "release": 8, "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats assert_last_visit_matches( swh_storage, url, status="full", type="crates", snapshot=expected_snapshot.id, ) diff --git a/swh/loader/package/deposit/tests/test_deposit.py b/swh/loader/package/deposit/tests/test_deposit.py index 181415c..ed540f9 100644 --- a/swh/loader/package/deposit/tests/test_deposit.py +++ b/swh/loader/package/deposit/tests/test_deposit.py @@ -1,565 +1,565 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import json import re import pytest from swh.core.pytest_plugin import requests_mock_datadir_factory from swh.loader.package.deposit.loader import ApiClient, DepositLoader from swh.loader.package.loader import now from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats from swh.model.hashutil import hash_to_bytes, hash_to_hex from swh.model.model import ( Origin, Person, RawExtrinsicMetadata, Release, Snapshot, SnapshotBranch, TargetType, TimestampWithTimezone, ) from swh.model.model import MetadataAuthority, MetadataAuthorityType, MetadataFetcher from swh.model.model import ObjectType as ModelObjectType from swh.model.swhids import CoreSWHID, ExtendedObjectType, ExtendedSWHID, ObjectType DEPOSIT_URL = "https://deposit.softwareheritage.org/1/private" @pytest.fixture def requests_mock_datadir(requests_mock_datadir): 
"""Enhance default mock data to mock put requests as the loader does some internal update queries there. """ requests_mock_datadir.put(re.compile("https")) return requests_mock_datadir def test_deposit_init_ok(swh_storage, deposit_client, swh_loader_config): url = "some-url" deposit_id = 999 loader = DepositLoader( swh_storage, url, deposit_id, deposit_client, default_filename="archive.zip" ) # Something that does not exist assert loader.origin.url == url assert loader.client is not None assert loader.client.base_url == swh_loader_config["deposit"]["url"] def test_deposit_from_configfile(swh_config): """Ensure the deposit instantiation is ok""" loader = DepositLoader.from_configfile( url="some-url", deposit_id="666", default_filename="archive.zip" ) assert isinstance(loader.client, ApiClient) def test_deposit_loading_unknown_deposit( swh_storage, deposit_client, requests_mock_datadir ): """Loading an unknown deposit should fail no origin, no visit, no snapshot """ # private api url form: 'https://deposit.s.o/1/private/hal/666/raw/' url = "some-url" unknown_deposit_id = 667 loader = DepositLoader( swh_storage, url, unknown_deposit_id, deposit_client, default_filename="archive.zip", ) # does not exist actual_load_status = loader.load() assert actual_load_status == {"status": "failed"} stats = get_stats(loader.storage) assert { "content": 0, "directory": 0, "origin": 0, "origin_visit": 0, "release": 0, "revision": 0, "skipped_content": 0, "snapshot": 0, } == stats requests_mock_datadir_missing_one = requests_mock_datadir_factory( ignore_urls=[ f"{DEPOSIT_URL}/666/raw/", ] ) def test_deposit_loading_failure_to_retrieve_1_artifact( swh_storage, deposit_client, requests_mock_datadir_missing_one ): """Deposit with missing artifact ends up with an uneventful/partial visit""" # private api url form: 'https://deposit.s.o/1/private/hal/666/raw/' url = "some-url-2" deposit_id = 666 requests_mock_datadir_missing_one.put(re.compile("https")) loader = DepositLoader( swh_storage, url, deposit_id, deposit_client, default_filename="archive.zip" ) actual_load_status = loader.load() assert actual_load_status["status"] == "uneventful" assert actual_load_status["snapshot_id"] is not None assert_last_visit_matches(loader.storage, url, status="partial", type="deposit") stats = get_stats(loader.storage) assert { "content": 0, "directory": 0, "origin": 1, "origin_visit": 1, "release": 0, "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats # Retrieve the information for deposit status update query to the deposit urls = [ m for m in requests_mock_datadir_missing_one.request_history if m.url == f"{DEPOSIT_URL}/{deposit_id}/update/" ] assert len(urls) == 1 update_query = urls[0] body = update_query.json() expected_body = { "status": "failed", "status_detail": { "loading": [ - "Failed to load branch HEAD for some-url-2: Fail to query " - "'https://deposit.softwareheritage.org/1/private/666/raw/'. 
Reason: 404" + "Failed to load branch HEAD for some-url-2: 404 Client Error: None " + "for url: https://deposit.softwareheritage.org/1/private/666/raw/" ] }, } assert body == expected_body def test_deposit_loading_ok(swh_storage, deposit_client, requests_mock_datadir): url = "https://hal-test.archives-ouvertes.fr/some-external-id" deposit_id = 666 loader = DepositLoader( swh_storage, url, deposit_id, deposit_client, default_filename="archive.zip" ) actual_load_status = loader.load() expected_snapshot_id = "338b45d87e02fb5cbf324694bc4a898623d6a30f" assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id, } assert_last_visit_matches( loader.storage, url, status="full", type="deposit", snapshot=hash_to_bytes(expected_snapshot_id), ) release_id_hex = "2566a64a27bc00362e265be9666d7606750530a1" release_id = hash_to_bytes(release_id_hex) expected_snapshot = Snapshot( id=hash_to_bytes(expected_snapshot_id), branches={ b"HEAD": SnapshotBranch( target=release_id, target_type=TargetType.RELEASE, ), }, ) check_snapshot(expected_snapshot, storage=loader.storage) release = loader.storage.release_get([release_id])[0] date = TimestampWithTimezone.from_datetime( datetime.datetime(2017, 10, 7, 15, 17, 8, tzinfo=datetime.timezone.utc) ) person = Person( fullname=b"Software Heritage", name=b"Software Heritage", email=b"robot@softwareheritage.org", ) assert release == Release( id=release_id, name=b"HEAD", message=b"hal: Deposit 666 in collection hal\n", author=person, date=date, target_type=ModelObjectType.DIRECTORY, target=b"\xfd-\xf1-\xc5SL\x1d\xa1\xe9\x18\x0b\x91Q\x02\xfbo`\x1d\x19", synthetic=True, metadata=None, ) # check metadata fetcher = MetadataFetcher( name="swh-deposit", version="0.0.1", ) authority = MetadataAuthority( type=MetadataAuthorityType.DEPOSIT_CLIENT, url="https://hal-test.archives-ouvertes.fr/", ) # Check origin metadata orig_meta = loader.storage.raw_extrinsic_metadata_get( Origin(url).swhid(), authority ) assert orig_meta.next_page_token is None raw_meta = loader.client.metadata_get(deposit_id) raw_metadata: str = raw_meta["raw_metadata"] # 2 raw metadata xml + 1 json dict assert len(orig_meta.results) == 2 orig_meta0 = orig_meta.results[0] assert orig_meta0.authority == authority assert orig_meta0.fetcher == fetcher # Check directory metadata assert release.target_type == ModelObjectType.DIRECTORY directory_swhid = CoreSWHID( object_type=ObjectType.DIRECTORY, object_id=release.target ) actual_dir_meta = loader.storage.raw_extrinsic_metadata_get( directory_swhid, authority ) assert actual_dir_meta.next_page_token is None assert len(actual_dir_meta.results) == 1 dir_meta = actual_dir_meta.results[0] assert dir_meta.authority == authority assert dir_meta.fetcher == fetcher assert dir_meta.metadata.decode() == raw_metadata # Retrieve the information for deposit status update query to the deposit urls = [ m for m in requests_mock_datadir.request_history if m.url == f"{DEPOSIT_URL}/{deposit_id}/update/" ] assert len(urls) == 1 update_query = urls[0] body = update_query.json() expected_body = { "status": "done", "release_id": release_id_hex, "directory_id": hash_to_hex(release.target), "snapshot_id": expected_snapshot_id, "origin_url": url, } assert body == expected_body stats = get_stats(loader.storage) assert { "content": 303, "directory": 12, "origin": 1, "origin_visit": 1, "release": 1, "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats def test_deposit_loading_ok_2(swh_storage, deposit_client, requests_mock_datadir): """Field dates should be 
set appropriately""" external_id = "some-external-id" url = f"https://hal-test.archives-ouvertes.fr/{external_id}" deposit_id = 777 loader = DepositLoader( swh_storage, url, deposit_id, deposit_client, default_filename="archive.zip" ) actual_load_status = loader.load() expected_snapshot_id = "3449b8ff31abeacefd33cca60e3074c1649dc3a1" assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id, } assert_last_visit_matches( loader.storage, url, status="full", type="deposit", snapshot=hash_to_bytes(expected_snapshot_id), ) release_id = "ba6c9a59ae3256e765d32b211cc183dc2380aed7" expected_snapshot = Snapshot( id=hash_to_bytes(expected_snapshot_id), branches={ b"HEAD": SnapshotBranch( target=hash_to_bytes(release_id), target_type=TargetType.RELEASE ) }, ) check_snapshot(expected_snapshot, storage=loader.storage) raw_meta = loader.client.metadata_get(deposit_id) # Ensure the date fields are set appropriately in the release # Retrieve the release release = loader.storage.release_get([hash_to_bytes(release_id)])[0] assert release # swh-deposit uses the numeric 'offset_minutes' instead of the bytes offset # attribute, because its dates are always well-formed, and it can only send # JSON-serializable data. release_date_dict = { "timestamp": release.date.timestamp.to_dict(), "offset": release.date.offset_minutes(), } assert release_date_dict == raw_meta["deposit"]["author_date"] assert not release.metadata provider = { "provider_name": "hal", "provider_type": "deposit_client", "provider_url": "https://hal-test.archives-ouvertes.fr/", "metadata": None, } tool = { "name": "swh-deposit", "version": "0.0.1", "configuration": {"sword_version": "2"}, } fetcher = MetadataFetcher( name="swh-deposit", version="0.0.1", ) authority = MetadataAuthority( type=MetadataAuthorityType.DEPOSIT_CLIENT, url="https://hal-test.archives-ouvertes.fr/", ) # Check the origin metadata swh side origin_extrinsic_metadata = loader.storage.raw_extrinsic_metadata_get( Origin(url).swhid(), authority ) assert origin_extrinsic_metadata.next_page_token is None raw_metadata: str = raw_meta["raw_metadata"] # 1 raw metadata xml + 1 json dict assert len(origin_extrinsic_metadata.results) == 2 origin_swhid = Origin(url).swhid() expected_metadata = [] origin_meta = origin_extrinsic_metadata.results[0] expected_metadata.append( RawExtrinsicMetadata( target=origin_swhid, discovery_date=origin_meta.discovery_date, metadata=raw_metadata.encode(), format="sword-v2-atom-codemeta-v2", authority=authority, fetcher=fetcher, ) ) origin_metadata = { "metadata": [raw_metadata], "provider": provider, "tool": tool, } expected_metadata.append( RawExtrinsicMetadata( target=origin_swhid, discovery_date=origin_extrinsic_metadata.results[-1].discovery_date, metadata=json.dumps(origin_metadata).encode(), format="original-artifacts-json", authority=authority, fetcher=fetcher, ) ) assert sorted(origin_extrinsic_metadata.results) == sorted(expected_metadata) # Check the release metadata swh side assert release.target_type == ModelObjectType.DIRECTORY directory_swhid = ExtendedSWHID( object_type=ExtendedObjectType.DIRECTORY, object_id=release.target ) actual_directory_metadata = loader.storage.raw_extrinsic_metadata_get( directory_swhid, authority ) assert actual_directory_metadata.next_page_token is None assert len(actual_directory_metadata.results) == 1 release_swhid = CoreSWHID( object_type=ObjectType.RELEASE, object_id=hash_to_bytes(release_id) ) dir_metadata_template = RawExtrinsicMetadata( target=directory_swhid,
format="sword-v2-atom-codemeta-v2", authority=authority, fetcher=fetcher, origin=url, release=release_swhid, # to satisfy the constructor discovery_date=now(), metadata=b"", ) expected_directory_metadata = [] dir_metadata = actual_directory_metadata.results[0] expected_directory_metadata.append( RawExtrinsicMetadata.from_dict( { **{ k: v for (k, v) in dir_metadata_template.to_dict().items() if k != "id" }, "discovery_date": dir_metadata.discovery_date, "metadata": raw_metadata.encode(), } ) ) assert sorted(actual_directory_metadata.results) == sorted( expected_directory_metadata ) # Retrieve the information for deposit status update query to the deposit urls = [ m for m in requests_mock_datadir.request_history if m.url == f"{DEPOSIT_URL}/{deposit_id}/update/" ] assert len(urls) == 1 update_query = urls[0] body = update_query.json() expected_body = { "status": "done", "release_id": release_id, "directory_id": hash_to_hex(release.target), "snapshot_id": expected_snapshot_id, "origin_url": url, } assert body == expected_body def test_deposit_loading_ok_3(swh_storage, deposit_client, requests_mock_datadir): """Deposit loading can happen on tarball artifacts as well The latest deposit changes introduce the internal change. """ external_id = "hal-123456" url = f"https://hal-test.archives-ouvertes.fr/{external_id}" deposit_id = 888 loader = DepositLoader(swh_storage, url, deposit_id, deposit_client) actual_load_status = loader.load() expected_snapshot_id = "4677843de89e398f1d6bfedc9ca9b89c451c55c8" assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id, } assert_last_visit_matches( loader.storage, url, status="full", type="deposit", snapshot=hash_to_bytes(expected_snapshot_id), ) def test_deposit_loading_ok_release_notes( swh_storage, deposit_client, requests_mock_datadir ): url = "https://hal-test.archives-ouvertes.fr/some-external-id" deposit_id = 999 loader = DepositLoader( swh_storage, url, deposit_id, deposit_client, default_filename="archive.zip" ) actual_load_status = loader.load() expected_snapshot_id = "a307acffb7c29bebb3daf1bcb680bb3f452890a8" assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id, } assert_last_visit_matches( loader.storage, url, status="full", type="deposit", snapshot=hash_to_bytes(expected_snapshot_id), ) release_id_hex = "f5e8ec02ede57edbe061afa7fc2a07bb7d14a700" release_id = hash_to_bytes(release_id_hex) expected_snapshot = Snapshot( id=hash_to_bytes(expected_snapshot_id), branches={ b"HEAD": SnapshotBranch( target=release_id, target_type=TargetType.RELEASE, ), }, ) check_snapshot(expected_snapshot, storage=loader.storage) release = loader.storage.release_get([release_id])[0] date = TimestampWithTimezone.from_datetime( datetime.datetime(2017, 10, 7, 15, 17, 8, tzinfo=datetime.timezone.utc) ) person = Person( fullname=b"Software Heritage", name=b"Software Heritage", email=b"robot@softwareheritage.org", ) assert release == Release( id=release_id, name=b"HEAD", message=( b"hal: Deposit 999 in collection hal\n\nThis release adds this and that.\n" ), author=person, date=date, target_type=ModelObjectType.DIRECTORY, target=b"\xfd-\xf1-\xc5SL\x1d\xa1\xe9\x18\x0b\x91Q\x02\xfbo`\x1d\x19", synthetic=True, metadata=None, ) diff --git a/swh/loader/package/golang/__init__.py b/swh/loader/package/golang/__init__.py new file mode 100644 index 0000000..e36c6ce --- /dev/null +++ b/swh/loader/package/golang/__init__.py @@ -0,0 +1,17 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at 
the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +from typing import Any, Mapping + + +def register() -> Mapping[str, Any]: + """Register the current worker module's definition""" + from .loader import GolangLoader + + return { + "task_modules": [f"{__name__}.tasks"], + "loader": GolangLoader, + } diff --git a/swh/loader/package/golang/loader.py b/swh/loader/package/golang/loader.py new file mode 100644 index 0000000..9caff6a --- /dev/null +++ b/swh/loader/package/golang/loader.py @@ -0,0 +1,91 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import json +import logging +from typing import Iterator, Optional, Sequence, Tuple + +import attr + +from swh.loader.package.loader import BasePackageInfo, PackageLoader +from swh.loader.package.utils import EMPTY_AUTHOR, api_info, release_name +from swh.model.model import ObjectType, Release, Sha1Git, TimestampWithTimezone +from swh.storage.interface import StorageInterface + +logger = logging.getLogger(__name__) + + +@attr.s +class GolangPackageInfo(BasePackageInfo): + name = attr.ib(type=str) + timestamp = attr.ib(type=Optional[TimestampWithTimezone]) + + +class GolangLoader(PackageLoader[GolangPackageInfo]): + """Load Golang module zip file into SWH archive.""" + + visit_type = "golang" + GOLANG_PKG_DEV_URL = "https://pkg.go.dev" + GOLANG_PROXY_URL = "https://proxy.golang.org" + + def __init__( + self, + storage: StorageInterface, + url: str, + max_content_size: Optional[int] = None, + **kwargs, + ): + super().__init__(storage, url, max_content_size=max_content_size, **kwargs) + # The lister saves human-usable URLs, so we translate them to proxy URLs + # for use in the loader. 
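+        # For example (an illustrative module path, not part of this diff):
+        # the lister URL https://pkg.go.dev/golang.org/x/text is rewritten to
+        # https://proxy.golang.org/golang.org/x/text, from which the loader
+        # then fetches .../@v/list, .../@v/<version>.info and
+        # .../@v/<version>.zip.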
+ # This URL format is detailed in https://go.dev/ref/mod#goproxy-protocol + assert url.startswith( + self.GOLANG_PKG_DEV_URL + ), "Go package URL (%s) not from %s" % (url, self.GOLANG_PKG_DEV_URL) + self.name = url[len(self.GOLANG_PKG_DEV_URL) + 1 :] + self.url = url.replace(self.GOLANG_PKG_DEV_URL, self.GOLANG_PROXY_URL) + + def get_versions(self) -> Sequence[str]: + return api_info(f"{self.url}/@v/list").decode().splitlines() + + def get_default_version(self) -> str: + latest = api_info(f"{self.url}/@latest") + return json.loads(latest)["Version"] + + def _raw_info(self, version: str) -> dict: + url = f"{self.url}/@v/{version}.info" + return json.loads(api_info(url)) + + def get_package_info(self, version: str) -> Iterator[Tuple[str, GolangPackageInfo]]: + # Encode the name because creating nested folders can become problematic + encoded_name = self.name.replace("/", "__") + filename = f"{encoded_name}-{version}.zip" + timestamp = TimestampWithTimezone.from_iso8601(self._raw_info(version)["Time"]) + p_info = GolangPackageInfo( + url=f"{self.url}/@v/{version}.zip", + filename=filename, + version=version, + timestamp=timestamp, + name=self.name, + ) + yield release_name(version), p_info + + def build_release( + self, p_info: GolangPackageInfo, uncompressed_path: str, directory: Sha1Git + ) -> Optional[Release]: + msg = ( + f"Synthetic release for Golang source package {p_info.name} " + f"version {p_info.version}\n" + ) + + return Release( + name=p_info.version.encode(), + message=msg.encode(), + date=p_info.timestamp, + author=EMPTY_AUTHOR, # Go modules offer very little metadata + target_type=ObjectType.DIRECTORY, + target=directory, + synthetic=True, + ) diff --git a/swh/loader/package/crates/tasks.py b/swh/loader/package/golang/tasks.py similarity index 51% copy from swh/loader/package/crates/tasks.py copy to swh/loader/package/golang/tasks.py index 9385263..167a437 100644 --- a/swh/loader/package/crates/tasks.py +++ b/swh/loader/package/golang/tasks.py @@ -1,14 +1,15 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from celery import shared_task -from swh.loader.package.crates.loader import CratesLoader +from swh.loader.package.golang.loader import GolangLoader -@shared_task(name=__name__ + ".LoadCrates") -def load_crates(*, url=None, artifacts: list): - """Load Rust crate package""" - return CratesLoader.from_configfile(url=url, artifacts=artifacts).load() +@shared_task(name=__name__ + ".LoadGolang") +def load_golang(**kwargs): + """Load Golang module""" + loader = GolangLoader.from_configfile(**kwargs) + return loader.load() diff --git a/swh/loader/package/golang/tests/__init__.py b/swh/loader/package/golang/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/swh/loader/package/golang/tests/data/https_proxy.golang.org/example.com_basic-go-module_@latest b/swh/loader/package/golang/tests/data/https_proxy.golang.org/example.com_basic-go-module_@latest new file mode 100644 index 0000000..ac70dcd --- /dev/null +++ b/swh/loader/package/golang/tests/data/https_proxy.golang.org/example.com_basic-go-module_@latest @@ -0,0 +1 @@ +{"Version":"v0.1.3","Time":"2022-03-15T13:54:34Z"} diff --git a/swh/loader/package/golang/tests/data/https_proxy.golang.org/example.com_basic-go-module_@v_list 
b/swh/loader/package/golang/tests/data/https_proxy.golang.org/example.com_basic-go-module_@v_list new file mode 100644 index 0000000..04e1946 --- /dev/null +++ b/swh/loader/package/golang/tests/data/https_proxy.golang.org/example.com_basic-go-module_@v_list @@ -0,0 +1 @@ +v0.1.3 diff --git a/swh/loader/package/golang/tests/data/https_proxy.golang.org/example.com_basic-go-module_@v_v0.1.3.info b/swh/loader/package/golang/tests/data/https_proxy.golang.org/example.com_basic-go-module_@v_v0.1.3.info new file mode 100644 index 0000000..52a74e2 --- /dev/null +++ b/swh/loader/package/golang/tests/data/https_proxy.golang.org/example.com_basic-go-module_@v_v0.1.3.info @@ -0,0 +1 @@ +{"Version":"v0.1.3","Time":"2022-03-17T15:42:55Z"} diff --git a/swh/loader/package/golang/tests/data/https_proxy.golang.org/example.com_basic-go-module_@v_v0.1.3.zip b/swh/loader/package/golang/tests/data/https_proxy.golang.org/example.com_basic-go-module_@v_v0.1.3.zip new file mode 100644 index 0000000..4e023fb Binary files /dev/null and b/swh/loader/package/golang/tests/data/https_proxy.golang.org/example.com_basic-go-module_@v_v0.1.3.zip differ diff --git a/swh/loader/package/golang/tests/test_golang.py b/swh/loader/package/golang/tests/test_golang.py new file mode 100644 index 0000000..63bde1b --- /dev/null +++ b/swh/loader/package/golang/tests/test_golang.py @@ -0,0 +1,13 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from swh.loader.package.golang.loader import GolangLoader + + +def test_golang_loader_first_visit(swh_storage, requests_mock_datadir): + url = "https://pkg.go.dev/example.com/basic-go-module" + loader = GolangLoader(swh_storage, url) + + assert loader.load()["status"] == "eventful" diff --git a/swh/loader/package/golang/tests/test_tasks.py b/swh/loader/package/golang/tests/test_tasks.py new file mode 100644 index 0000000..18819b9 --- /dev/null +++ b/swh/loader/package/golang/tests/test_tasks.py @@ -0,0 +1,21 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def test_tasks_golang_loader( + mocker, swh_scheduler_celery_app, swh_scheduler_celery_worker, swh_config +): + mock_load = mocker.patch("swh.loader.package.golang.loader.GolangLoader.load") + mock_load.return_value = {"status": "eventful"} + + res = swh_scheduler_celery_app.send_task( + "swh.loader.package.golang.tasks.LoadGolang", + kwargs={"url": "https://pkg.go.dev/golang.org/whatever/package"}, + ) + assert res + res.wait() + assert res.successful() + assert mock_load.called + assert res.result == {"status": "eventful"} diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py index 000ad98..96ff69e 100644 --- a/swh/loader/package/loader.py +++ b/swh/loader/package/loader.py @@ -1,1086 +1,1088 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import hashlib from itertools import islice import json import logging import os import string import sys import tempfile from typing import ( Any, Dict, Generic, 
Iterator, List, Mapping, Optional, Sequence, Set, Tuple, TypeVar, ) import attr from requests.exceptions import ContentDecodingError import sentry_sdk from swh.core.tarball import uncompress from swh.loader.core.loader import BaseLoader from swh.loader.exception import NotFound from swh.loader.package.utils import download from swh.model import from_disk from swh.model.hashutil import hash_to_hex from swh.model.model import ( ExtID, MetadataAuthority, MetadataAuthorityType, MetadataFetcher, ) from swh.model.model import ( Origin, OriginVisit, OriginVisitStatus, RawExtrinsicMetadata, Release, Revision, Sha1Git, Snapshot, ) from swh.model.model import ObjectType as ModelObjectType from swh.model.swhids import CoreSWHID, ExtendedObjectType, ExtendedSWHID, ObjectType from swh.storage.algos.snapshot import snapshot_get_latest from swh.storage.interface import StorageInterface from swh.storage.utils import now logger = logging.getLogger(__name__) SWH_METADATA_AUTHORITY = MetadataAuthority( type=MetadataAuthorityType.REGISTRY, url="https://softwareheritage.org/", metadata={}, ) """Metadata authority for extrinsic metadata generated by Software Heritage. Used for metadata on "original artifacts", ie. length, filename, and checksums of downloaded archive files.""" PartialExtID = Tuple[str, int, bytes] """The ``extid_type``, ``extid_version`` and ``extid`` fields of an :class:`ExtID` object.""" @attr.s class BasePackageInfo: """Information about a given version of a package, used by package loaders to download the corresponding artifact(s) and build a release. Loaders subclass it to add their own fields. """ url = attr.ib(type=str) filename = attr.ib(type=Optional[str]) version = attr.ib(type=str) """Version name/number.""" MANIFEST_FORMAT: Optional[string.Template] = None """If not None, used by the default extid() implementation to format a manifest, before hashing it to produce an ExtID.""" EXTID_TYPE: str = "package-manifest-sha256" EXTID_VERSION: int = 0 # The following attribute has kw_only=True in order to allow subclasses # to add attributes. Without kw_only, attributes without default values cannot # go after attributes with default values.
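    # Concretely: GolangPackageInfo (added in this very diff) declares the
    # mandatory attribute `name = attr.ib(type=str)`; if the defaulted
    # attribute below were not kw_only, attrs would raise a ValueError at
    # class definition time (mandatory attribute after a defaulted one).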
# See directory_extrinsic_metadata = attr.ib( type=List[RawExtrinsicMetadataCore], default=[], kw_only=True, ) """:term:`extrinsic metadata` collected by the loader, that will be attached to the loaded directory and added to the Metadata storage.""" # TODO: add support for metadata for releases and contents def extid(self) -> Optional[PartialExtID]: """Returns a unique intrinsic identifier of this package info, or None if this package info is not 'deduplicatable' (meaning that we will always load it, instead of checking the ExtID storage to see if we already did)""" if self.MANIFEST_FORMAT is None: return None else: manifest = self.MANIFEST_FORMAT.substitute( {k: str(v) for (k, v) in attr.asdict(self).items()} ) return ( self.EXTID_TYPE, self.EXTID_VERSION, hashlib.sha256(manifest.encode()).digest(), ) TPackageInfo = TypeVar("TPackageInfo", bound=BasePackageInfo) class PackageLoader(BaseLoader, Generic[TPackageInfo]): def __init__(self, storage: StorageInterface, url: str, **kwargs: Any): """Loader's constructor. This raises exception if the minimal required configuration is missing (cf. fn:`check` method). Args: storage: Storage instance url: Origin url to load data from """ super().__init__(storage=storage, origin_url=url, **kwargs) def get_versions(self) -> Sequence[str]: """Return the list of all published package versions. Raises: class:`swh.loader.exception.NotFound` error when failing to read the published package versions. Returns: Sequence of published versions """ return [] def get_package_info(self, version: str) -> Iterator[Tuple[str, TPackageInfo]]: """Given a release version of a package, retrieve the associated package information for such version. Args: version: Package version Returns: (branch name, package metadata) """ yield from {} def build_release( self, p_info: TPackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Release]: """Build the release from the archive metadata (extrinsic artifact metadata) and the intrinsic metadata. Args: p_info: Package information uncompressed_path: Artifact uncompressed path on disk """ raise NotImplementedError("build_release") def get_default_version(self) -> str: """Retrieve the latest release version if any. Returns: Latest version """ return "" def last_snapshot(self) -> Optional[Snapshot]: """Retrieve the last snapshot out of the last visit.""" return snapshot_get_latest(self.storage, self.origin.url) def new_packageinfo_to_extid(self, p_info: TPackageInfo) -> Optional[PartialExtID]: return p_info.extid() def _get_known_extids( self, packages_info: List[TPackageInfo] ) -> Dict[PartialExtID, List[CoreSWHID]]: """Compute the ExtIDs from new PackageInfo objects, searches which are already loaded in the archive, and returns them if any.""" # Compute the ExtIDs of all the new packages, grouped by extid type new_extids: Dict[Tuple[str, int], List[bytes]] = {} for p_info in packages_info: res = p_info.extid() if res is not None: (extid_type, extid_version, extid_extid) = res new_extids.setdefault((extid_type, extid_version), []).append( extid_extid ) # For each extid type, call extid_get_from_extid() with all the extids of # that type, and store them in the '(type, extid) -> target' map. 
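        # For illustration (hypothetical values), the resulting map has the
        # shape:
        #   {("package-manifest-sha256", 0, b"<sha256 digest>"):
        #       [CoreSWHID.from_string("swh:1:rel:...")]}
        # i.e. a single PartialExtID key may resolve to several previously
        # loaded targets.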
known_extids: Dict[PartialExtID, List[CoreSWHID]] = {} for ((extid_type, extid_version), extids) in new_extids.items(): for extid in self.storage.extid_get_from_extid( extid_type, extids, version=extid_version ): if extid is not None: key = (extid.extid_type, extid_version, extid.extid) known_extids.setdefault(key, []).append(extid.target) return known_extids def resolve_object_from_extids( self, known_extids: Dict[PartialExtID, List[CoreSWHID]], p_info: TPackageInfo, whitelist: Set[Sha1Git], ) -> Optional[CoreSWHID]: """Resolve the revision/release from known ExtIDs and a package info object. If the artifact has already been downloaded, this will return the existing release (or revision) targeting that uncompressed artifact directory. Otherwise, this returns None. Args: known_extids: Dict built from a list of ExtID, with the target as value p_info: Package information whitelist: Any ExtID with target not in this set is filtered out Returns: None or release/revision SWHID """ new_extid = p_info.extid() if new_extid is None: return None extid_targets = set() for extid_target in known_extids.get(new_extid, []): if extid_target.object_id not in whitelist: # There is a known ExtID for this package, but its target is not # in the snapshot. # This can happen for three reasons: # # 1. a loader crashed after writing the ExtID, but before writing # the snapshot # 2. some other loader loaded the same artifact, but produced # a different revision, causing an additional ExtID object # to be written. We will probably find this loader's ExtID # in a future iteration of this loop. # Note that for now, this is impossible, as each loader has a # completely different extid_type, but this is an implementation # detail of each loader. # 3. we took a snapshot, then the package disappeared, # then we took another snapshot, and the package reappeared # # In case of 1, we must actually load the package now, # so let's do it. # TODO: detect when we are in case 3 using release_missing # or revision_missing instead of the snapshot. continue elif extid_target.object_type in (ObjectType.RELEASE, ObjectType.REVISION): extid_targets.add(extid_target) else: # Note that this case should never be reached unless there is a # collision between a revision hash and some non-revision object's # hash, but better safe than sorry. logger.warning( "%s is in the whitelist, but is not a revision/release.", hash_to_hex(extid_target.object_id), ) if extid_targets: # This is a known package version, as we have an extid to reference it. # Let's return one of them. # If there is a release extid, return it. release_extid_targets = { extid_target for extid_target in extid_targets if extid_target.object_type == ObjectType.RELEASE } # Exclude missing targets missing_releases = { CoreSWHID(object_type=ObjectType.RELEASE, object_id=id_) for id_ in self.storage.release_missing( [swhid.object_id for swhid in release_extid_targets] ) } if missing_releases: err_message = "Found ExtIDs pointing to missing releases" logger.error(err_message + ": %s", missing_releases) with sentry_sdk.push_scope() as scope: scope.set_extra( "missing_releases", [str(x) for x in missing_releases] ) sentry_sdk.capture_message(err_message, "error") release_extid_targets -= missing_releases extid_target2 = self.select_extid_target(p_info, release_extid_targets) if extid_target2: return extid_target2 # If there is no release extid (ie.
if the package was only loaded with # older versions of this loader, which produced revision objects instead # of releases), return a revision extid when possible. revision_extid_targets = { extid_target for extid_target in extid_targets if extid_target.object_type == ObjectType.REVISION } if revision_extid_targets: assert len(extid_targets) == 1, extid_targets extid_target = list(extid_targets)[0] return extid_target # No target found (this is probably a new package version) return None def select_extid_target( self, p_info: TPackageInfo, extid_targets: Set[CoreSWHID] ) -> Optional[CoreSWHID]: """Given a list of release extid targets, chooses one appropriate for the given package info. Package loaders should implement this if their ExtIDs may map to multiple releases, so they can fetch releases from the storage and inspect their fields to select the right one for this ``p_info``. """ if extid_targets: # The base package loader does not have the domain-specific knowledge # to select the right release -> crash if there is more than one. assert len(extid_targets) == 1, extid_targets return list(extid_targets)[0] return None def download_package( self, p_info: TPackageInfo, tmpdir: str ) -> List[Tuple[str, Mapping]]: """Download artifacts for a specific package. All downloads happen in the tmpdir folder. Default implementation expects the artifacts package info to be about one artifact per package. Note that most implementations have 1 artifact per package. But some implementations have multiple artifacts per package (debian), and some have none: the package is the artifact (gnu). Args: p_info: Information on the package artifacts to download (url, filename, etc...) tmpdir: Location to retrieve such artifacts Returns: List of (path, computed hashes) """ try: return [download(p_info.url, dest=tmpdir, filename=p_info.filename)] except ContentDecodingError: # package might be erroneously marked as gzip compressed while it is not, # try to download its raw bytes again without attempting to uncompress # the input stream return [ download( p_info.url, dest=tmpdir, filename=p_info.filename, extra_request_headers={"Accept-Encoding": "identity"}, ) ] def uncompress( self, dl_artifacts: List[Tuple[str, Mapping[str, Any]]], dest: str ) -> str: """Uncompress the artifact(s) in the destination folder dest. Optionally, this could need to use the p_info dict for some more information (debian). """ uncompressed_path = os.path.join(dest, "src") for a_path, _ in dl_artifacts: uncompress(a_path, dest=uncompressed_path) return uncompressed_path def extra_branches(self) -> Dict[bytes, Mapping[str, Any]]: """Return an extra dict of branches that are used to update the set of branches.
""" return {} def finalize_visit( self, *, snapshot: Optional[Snapshot], visit: OriginVisit, status_visit: str, status_load: str, failed_branches: List[str], errors: Optional[List[str]] = None, ) -> Dict[str, Any]: """Finalize the visit: - flush eventual unflushed data to storage - update origin visit's status - return the task's status """ self.storage.flush() snapshot_id: Optional[bytes] = None if snapshot and snapshot.id: # to prevent the snapshot.id to b"" snapshot_id = snapshot.id assert visit.visit visit_status = OriginVisitStatus( origin=self.origin.url, visit=visit.visit, type=self.visit_type, date=now(), status=status_visit, snapshot=snapshot_id, ) self.storage.origin_visit_status_add([visit_status]) result: Dict[str, Any] = { "status": status_load, } if snapshot_id: result["snapshot_id"] = hash_to_hex(snapshot_id) if failed_branches: logger.warning("%d failed branches", len(failed_branches)) for i, urls in enumerate(islice(failed_branches, 50)): prefix_url = "Failed branches: " if i == 0 else "" logger.warning("%s%s", prefix_url, urls) return result def load(self) -> Dict: """Load for a specific origin the associated contents. 1. Get the list of versions in an origin. 2. Get the snapshot from the previous run of the loader, and filter out versions that were already loaded, if their :term:`extids ` match Then, for each remaining version in the origin 3. Fetch the files for one package version By default, this can be implemented as a simple HTTP request. Loaders with more specific requirements can override this, e.g.: the PyPI loader checks the integrity of the downloaded files; the Debian loader has to download and check several files for one package version. 4. Extract the downloaded files. By default, this would be a universal archive/tarball extraction. Loaders for specific formats can override this method (for instance, the Debian loader uses dpkg-source -x). 5. Convert the extracted directory to a set of Software Heritage objects Using swh.model.from_disk. 6. Extract the metadata from the unpacked directories This would only be applicable for "smart" loaders like npm (parsing the package.json), PyPI (parsing the PKG-INFO file) or Debian (parsing debian/changelog and debian/control). On "minimal-metadata" sources such as the GNU archive, the lister should provide the minimal set of metadata needed to populate the revision/release objects (authors, dates) as an argument to the task. 7. Generate the revision/release objects for the given version. From the data generated at steps 3 and 4. end for each 8. Generate and load the snapshot for the visit Using the revisions/releases collected at step 7., and the branch information from step 2., generate a snapshot and load it into the Software Heritage archive """ status_load = "uneventful" # either: eventful, uneventful, failed status_visit = "full" # see swh.model.model.OriginVisitStatus snapshot = None failed_branches: List[str] = [] # Prepare origin and origin_visit origin = Origin(url=self.origin.url) try: self.storage.origin_add([origin]) visit = list( self.storage.origin_visit_add( [ OriginVisit( origin=self.origin.url, date=self.visit_date, type=self.visit_type, ) ] ) )[0] except Exception as e: logger.exception( "Failed to initialize origin_visit for %s", self.origin.url ) sentry_sdk.capture_exception(e) return {"status": "failed"} # Get the previous snapshot for this origin. It is then used to see which # of the package's versions are already loaded in the archive. 
try: last_snapshot = self.last_snapshot() logger.debug("last snapshot: %s", last_snapshot) except Exception as e: logger.exception("Failed to get previous state for %s", self.origin.url) sentry_sdk.capture_exception(e) return self.finalize_visit( snapshot=snapshot, visit=visit, failed_branches=failed_branches, status_visit="failed", status_load="failed", errors=[str(e)], ) load_exceptions: List[Exception] = [] # Get the list of all version names try: versions = self.get_versions() except NotFound as e: return self.finalize_visit( snapshot=snapshot, visit=visit, failed_branches=failed_branches, status_visit="not_found", status_load="failed", errors=[str(e)], ) except Exception as e: + logger.exception("Failed to get list of versions for %s", self.origin.url) sentry_sdk.capture_exception(e) return self.finalize_visit( snapshot=snapshot, visit=visit, failed_branches=failed_branches, status_visit="failed", status_load="failed", errors=[str(e)], ) # Get the metadata of each version's package packages_info: List[Tuple[str, TPackageInfo]] = [ (branch_name, p_info) for version in versions for (branch_name, p_info) in self.get_package_info(version) ] # Compute the ExtID of each of these packages known_extids = self._get_known_extids([p_info for (_, p_info) in packages_info]) if last_snapshot is None: last_snapshot_targets: Set[Sha1Git] = set() else: last_snapshot_targets = { branch.target for branch in last_snapshot.branches.values() } new_extids: Set[ExtID] = set() tmp_releases: Dict[str, List[Tuple[str, Sha1Git]]] = { version: [] for version in versions } errors = [] for (branch_name, p_info) in packages_info: logger.debug("package_info: %s", p_info) # Check if the package was already loaded, using its ExtID swhid = self.resolve_object_from_extids( known_extids, p_info, last_snapshot_targets ) if swhid is not None and swhid.object_type == ObjectType.REVISION: # This package was already loaded, but by an older version # of this loader, which produced revisions instead of releases. # Let's fetch the revision's data, and "upgrade" it into a release. (rev,) = self.storage.revision_get([swhid.object_id]) if not rev: logger.error( "Failed to upgrade branch %s from revision to " "release, %s is missing from the storage. " "Falling back to re-loading from the origin.", branch_name, swhid, ) else: rev = None if swhid is None or (swhid.object_type == ObjectType.REVISION and not rev): # No matching revision or release found in the last snapshot, load it. release_id = None try: res = self._load_release(p_info, origin) if res: (release_id, directory_id) = res assert release_id assert directory_id self._load_extrinsic_directory_metadata( p_info, release_id, directory_id ) self.storage.flush() status_load = "eventful" except Exception as e: self.storage.clear_buffers() load_exceptions.append(e) sentry_sdk.capture_exception(e) error = f"Failed to load branch {branch_name} for {self.origin.url}" logger.exception(error) failed_branches.append(branch_name) errors.append(f"{error}: {e}") continue if release_id is None: continue add_extid = True elif swhid.object_type == ObjectType.REVISION: # If 'rev' was None, the previous block would have run. 
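                # The conversion is done by rev2rel() (defined at the end of
                # this module): the synthetic Release is named after the
                # package version and reuses the revision's author, date,
                # message and directory target.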
assert rev is not None rel = rev2rel(rev, p_info.version) self.storage.release_add([rel]) logger.debug("Upgraded %s to %s", swhid, rel.swhid()) release_id = rel.id # Create a new extid for this package, so the next run of this loader # will be able to find the new release, and use it (instead of the # old revision) add_extid = True elif swhid.object_type == ObjectType.RELEASE: # This package was already loaded, nothing to do. release_id = swhid.object_id add_extid = False else: assert False, f"Unexpected object type: {swhid}" assert release_id is not None if add_extid: partial_extid = p_info.extid() if partial_extid is not None: (extid_type, extid_version, extid) = partial_extid release_swhid = CoreSWHID( object_type=ObjectType.RELEASE, object_id=release_id ) new_extids.add( ExtID( extid_type=extid_type, extid_version=extid_version, extid=extid, target=release_swhid, ) ) tmp_releases[p_info.version].append((branch_name, release_id)) if load_exceptions: status_visit = "partial" if not tmp_releases: # We could not load any releases; fail completely + logger.error("Failed to load any release for %s", self.origin.url) return self.finalize_visit( snapshot=snapshot, visit=visit, failed_branches=failed_branches, status_visit="failed", status_load="failed", errors=errors, ) try: # Retrieve the default release version (the "latest" one) default_version = self.get_default_version() logger.debug("default version: %s", default_version) # Retrieve extra branches extra_branches = self.extra_branches() logger.debug("extra branches: %s", extra_branches) snapshot = self._load_snapshot( default_version, tmp_releases, extra_branches ) self.storage.flush() except Exception as e: error = f"Failed to build snapshot for origin {self.origin.url}" logger.exception(error) errors.append(f"{error}: {e}") sentry_sdk.capture_exception(e) status_visit = "failed" status_load = "failed" if snapshot: try: metadata_objects = self.build_extrinsic_snapshot_metadata(snapshot.id) self.load_metadata_objects(metadata_objects) except Exception as e: error = ( f"Failed to load extrinsic snapshot metadata for {self.origin.url}" ) logger.exception(error) errors.append(f"{error}: {e}") sentry_sdk.capture_exception(e) status_visit = "partial" status_load = "failed" try: metadata_objects = self.build_extrinsic_origin_metadata() self.load_metadata_objects(metadata_objects) except Exception as e: error = f"Failed to load extrinsic origin metadata for {self.origin.url}" logger.exception(error) errors.append(f"{error}: {e}") sentry_sdk.capture_exception(e) status_visit = "partial" status_load = "failed" if status_load != "failed": self._load_extids(new_extids) return self.finalize_visit( snapshot=snapshot, visit=visit, failed_branches=failed_branches, status_visit=status_visit, status_load=status_load, errors=errors, ) def _load_directory( self, dl_artifacts: List[Tuple[str, Mapping[str, Any]]], tmpdir: str ) -> Tuple[str, from_disk.Directory]: uncompressed_path = self.uncompress(dl_artifacts, dest=tmpdir) logger.debug("uncompressed_path: %s", uncompressed_path) directory = from_disk.Directory.from_disk( path=uncompressed_path.encode("utf-8"), max_content_length=self.max_content_size, ) contents, skipped_contents, directories = from_disk.iter_directory(directory) logger.debug("Number of skipped contents: %s", len(skipped_contents)) self.storage.skipped_content_add(skipped_contents) logger.debug("Number of contents: %s", len(contents)) self.storage.content_add(contents) logger.debug("Number of directories: %s", len(directories)) 
self.storage.directory_add(directories) return (uncompressed_path, directory) def _load_release( self, p_info: TPackageInfo, origin ) -> Optional[Tuple[Sha1Git, Sha1Git]]: """Does all the loading of a release itself: * downloads a package and uncompresses it * loads it from disk * adds contents, directories, and release to self.storage * returns (release_id, directory_id) Raises exception when unable to download or uncompress artifacts """ with tempfile.TemporaryDirectory() as tmpdir: dl_artifacts = self.download_package(p_info, tmpdir) (uncompressed_path, directory) = self._load_directory(dl_artifacts, tmpdir) # FIXME: This should be release. cf. D409 release = self.build_release( p_info, uncompressed_path, directory=directory.hash ) if not release: # Some artifacts are missing intrinsic metadata # skipping those return None metadata = [metadata for (filepath, metadata) in dl_artifacts] assert release.target is not None, release assert release.target_type == ModelObjectType.DIRECTORY, release metadata_target = ExtendedSWHID( object_type=ExtendedObjectType.DIRECTORY, object_id=release.target ) original_artifact_metadata = RawExtrinsicMetadata( target=metadata_target, discovery_date=self.visit_date, authority=SWH_METADATA_AUTHORITY, fetcher=self.get_metadata_fetcher(), format="original-artifacts-json", metadata=json.dumps(metadata).encode(), origin=self.origin.url, release=release.swhid(), ) self.load_metadata_objects([original_artifact_metadata]) logger.debug("Release: %s", release) self.storage.release_add([release]) assert directory.hash return (release.id, directory.hash) def _load_snapshot( self, default_version: str, releases: Dict[str, List[Tuple[str, bytes]]], extra_branches: Dict[bytes, Mapping[str, Any]], ) -> Optional[Snapshot]: """Build snapshot out of the current releases stored and extra branches. Then load it in the storage. """ logger.debug("releases: %s", releases) # Build and load the snapshot branches = {} # type: Dict[bytes, Mapping[str, Any]] for version, branch_name_releases in releases.items(): if version == default_version and len(branch_name_releases) == 1: # only 1 branch (no ambiguity), we can create an alias # branch 'HEAD' branch_name, _ = branch_name_releases[0] # except for some corner case (deposit) if branch_name != "HEAD": branches[b"HEAD"] = { "target_type": "alias", "target": branch_name.encode("utf-8"), } for branch_name, target in branch_name_releases: branches[branch_name.encode("utf-8")] = { "target_type": "release", "target": target, } # Deal with extra-branches for name, branch_target in extra_branches.items(): if name in branches: error_message = f"Extra branch '{name!r}' has been ignored" logger.error(error_message) sentry_sdk.capture_message(error_message, "error") else: branches[name] = branch_target snapshot_data = {"branches": branches} logger.debug("snapshot: %s", snapshot_data) snapshot = Snapshot.from_dict(snapshot_data) logger.debug("snapshot: %s", snapshot) self.storage.snapshot_add([snapshot]) return snapshot def get_loader_name(self) -> str: """Returns a fully qualified name of this loader.""" return f"{self.__class__.__module__}.{self.__class__.__name__}" def get_loader_version(self) -> str: """Returns the version of the current loader.""" module_name = self.__class__.__module__ or "" module_name_parts = module_name.split(".") # Iterate rootward through the package hierarchy until we find a parent of this # loader's module with a __version__ attribute. 
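        # For example, for a loader defined in swh.loader.package.golang.loader,
        # the candidate packages are tried rootward in this order:
        #   swh.loader.package.golang.loader, swh.loader.package.golang,
        #   swh.loader.package, swh.loader, swh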
for prefix_size in range(len(module_name_parts), 0, -1): package_name = ".".join(module_name_parts[0:prefix_size]) module = sys.modules[package_name] if hasattr(module, "__version__"): return module.__version__ # If this loader's class has no parent package with a __version__, # it should implement it itself. raise NotImplementedError( f"Could not dynamically find the version of {self.get_loader_name()}." ) def get_metadata_fetcher(self) -> MetadataFetcher: """Returns a MetadataFetcher instance representing this package loader, which is used for adding provenance information to extracted extrinsic metadata, if any.""" return MetadataFetcher( name=self.get_loader_name(), version=self.get_loader_version(), metadata={}, ) def get_metadata_authority(self) -> MetadataAuthority: """For package loaders that get extrinsic metadata, returns the authority the metadata are coming from. """ raise NotImplementedError("get_metadata_authority") def get_extrinsic_origin_metadata(self) -> List[RawExtrinsicMetadataCore]: """Returns metadata items, used by build_extrinsic_origin_metadata.""" return [] def build_extrinsic_origin_metadata(self) -> List[RawExtrinsicMetadata]: """Builds a list of full RawExtrinsicMetadata objects, using metadata returned by get_extrinsic_origin_metadata.""" metadata_items = self.get_extrinsic_origin_metadata() if not metadata_items: # If this package loader doesn't write metadata, no need to require # an implementation for get_metadata_authority. return [] authority = self.get_metadata_authority() fetcher = self.get_metadata_fetcher() metadata_objects = [] for item in metadata_items: metadata_objects.append( RawExtrinsicMetadata( target=self.origin.swhid(), discovery_date=item.discovery_date or self.visit_date, authority=authority, fetcher=fetcher, format=item.format, metadata=item.metadata, ) ) return metadata_objects def get_extrinsic_snapshot_metadata(self) -> List[RawExtrinsicMetadataCore]: """Returns metadata items, used by build_extrinsic_snapshot_metadata.""" return [] def build_extrinsic_snapshot_metadata( self, snapshot_id: Sha1Git ) -> List[RawExtrinsicMetadata]: """Builds a list of full RawExtrinsicMetadata objects, using metadata returned by get_extrinsic_snapshot_metadata.""" metadata_items = self.get_extrinsic_snapshot_metadata() if not metadata_items: # If this package loader doesn't write metadata, no need to require # an implementation for get_metadata_authority. return [] authority = self.get_metadata_authority() fetcher = self.get_metadata_fetcher() metadata_objects = [] for item in metadata_items: metadata_objects.append( RawExtrinsicMetadata( target=ExtendedSWHID( object_type=ExtendedObjectType.SNAPSHOT, object_id=snapshot_id ), discovery_date=item.discovery_date or self.visit_date, authority=authority, fetcher=fetcher, format=item.format, metadata=item.metadata, origin=self.origin.url, ) ) return metadata_objects def build_extrinsic_directory_metadata( self, p_info: TPackageInfo, release_id: Sha1Git, directory_id: Sha1Git, ) -> List[RawExtrinsicMetadata]: if not p_info.directory_extrinsic_metadata: # If this package loader doesn't write metadata, no need to require # an implementation for get_metadata_authority.
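        # Note that each RawExtrinsicMetadataCore item only carries format,
        # metadata bytes and an optional discovery date; the loop below wraps
        # it into a full RawExtrinsicMetadata object by adding the directory
        # target, authority, fetcher, and the origin/release context.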
return [] authority = self.get_metadata_authority() fetcher = self.get_metadata_fetcher() metadata_objects = [] for item in p_info.directory_extrinsic_metadata: metadata_objects.append( RawExtrinsicMetadata( target=ExtendedSWHID( object_type=ExtendedObjectType.DIRECTORY, object_id=directory_id ), discovery_date=item.discovery_date or self.visit_date, authority=authority, fetcher=fetcher, format=item.format, metadata=item.metadata, origin=self.origin.url, release=CoreSWHID( object_type=ObjectType.RELEASE, object_id=release_id ), ) ) return metadata_objects def _load_extrinsic_directory_metadata( self, p_info: TPackageInfo, release_id: Sha1Git, directory_id: Sha1Git, ) -> None: metadata_objects = self.build_extrinsic_directory_metadata( p_info, release_id, directory_id ) self.load_metadata_objects(metadata_objects) def _load_extids(self, extids: Set[ExtID]) -> None: if not extids: return try: self.storage.extid_add(list(extids)) except Exception as e: logger.exception("Failed to load new ExtIDs for %s", self.origin.url) sentry_sdk.capture_exception(e) # No big deal, it just means the next visit will load the same versions # again. def rev2rel(rev: Revision, version: str) -> Release: """Converts a revision to a release.""" message = rev.message if message and not message.endswith(b"\n"): message += b"\n" return Release( name=version.encode(), message=message, target=rev.directory, target_type=ModelObjectType.DIRECTORY, synthetic=rev.synthetic, author=rev.author, date=rev.date, ) diff --git a/swh/loader/package/pubdev/__init__.py b/swh/loader/package/pubdev/__init__.py new file mode 100644 index 0000000..0ae96b2 --- /dev/null +++ b/swh/loader/package/pubdev/__init__.py @@ -0,0 +1,17 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +from typing import Any, Mapping + + +def register() -> Mapping[str, Any]: + """Register the current worker module's definition""" + from .loader import PubDevLoader + + return { + "task_modules": [f"{__name__}.tasks"], + "loader": PubDevLoader, + } diff --git a/swh/loader/package/pubdev/loader.py b/swh/loader/package/pubdev/loader.py new file mode 100644 index 0000000..2a0a944 --- /dev/null +++ b/swh/loader/package/pubdev/loader.py @@ -0,0 +1,194 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information +from distutils.version import LooseVersion +import json +from pathlib import Path +from typing import Any, Dict, Iterator, Optional, Sequence, Tuple + +import attr +import yaml + +from swh.loader.package.loader import BasePackageInfo, PackageLoader +from swh.loader.package.utils import ( + EMPTY_AUTHOR, + Person, + api_info, + cached_method, + release_name, +) +from swh.model.model import ObjectType, Release, Sha1Git, TimestampWithTimezone +from swh.storage.interface import StorageInterface + + +@attr.s +class PubDevPackageInfo(BasePackageInfo): + + name = attr.ib(type=str) + """Name of the package""" + + version = attr.ib(type=str) + """Current version""" + + last_modified = attr.ib(type=str) + """Last modified date as release date""" + + author = attr.ib(type=Person) + """Author""" + + description = attr.ib(type=str) + """Description""" + + +def 
extract_intrinsic_metadata(dir_path: Path) -> Dict[str, Any]: + """Extract intrinsic metadata from pubspec.yaml file at dir_path. + + Each pub.dev package version has a pubspec.yaml file at the root of the archive. + + See https://dart.dev/tools/pub/pubspec for pubspec specifications. + + Args: + dir_path: A directory on disk where a pubspec.yaml must be present + + Returns: + A dict, as returned by the yaml parser + """ + pubspec_path = dir_path / "pubspec.yaml" + return yaml.safe_load(pubspec_path.read_text()) + + +class PubDevLoader(PackageLoader[PubDevPackageInfo]): + visit_type = "pubdev" + + PUBDEV_BASE_URL = "https://pub.dev/" + + def __init__( + self, + storage: StorageInterface, + url: str, + **kwargs, + ): + + super().__init__(storage=storage, url=url, **kwargs) + self.url = url + assert url.startswith(self.PUBDEV_BASE_URL) + self.package_info_url = url.replace( + self.PUBDEV_BASE_URL, f"{self.PUBDEV_BASE_URL}api/" + ) + + def _raw_info(self) -> bytes: + return api_info(self.package_info_url) + + @cached_method + def info(self) -> Dict: + """Return the project metadata information (fetched from pub.dev registry)""" + # Use strict=False in order to correctly manage the case where \n is present in a string + info = json.loads(self._raw_info(), strict=False) + # Arrange versions list as a new dict with `version` as key + versions = {v["version"]: v for v in info["versions"]} + info["versions"] = versions + return info + + def get_versions(self) -> Sequence[str]: + """Get all released versions of a PubDev package + + Returns: + A sequence of versions + + Example:: + + ["0.1.1", "0.10.2"] + """ + versions = list(self.info()["versions"].keys()) + versions.sort(key=LooseVersion) + return versions + + def get_default_version(self) -> str: + """Get the newest release version of a PubDev package + + Returns: + A string representing a version + + Example:: + + "0.1.2" + """ + latest = self.info()["latest"] + return latest["version"] + + def get_package_info(self, version: str) -> Iterator[Tuple[str, PubDevPackageInfo]]: + """Get release name and package information from version + + Package info comes from extrinsic metadata (from self.info()) + + Args: + version: Package version (e.g.: "0.1.0") + + Returns: + Iterator of tuple (release_name, p_info) + """ + v = self.info()["versions"][version] + assert v["version"] == version + + url = v["archive_url"] + name = v["pubspec"]["name"] + filename = f"{name}-{version}.tar.gz" + last_modified = v["published"] + + if "authors" in v["pubspec"]: + # TODO: here we have a list of authors, see T3887 + author = Person.from_fullname(v["pubspec"]["authors"][0].encode()) + elif "author" in v["pubspec"] and v["pubspec"]["author"] is not None: + author = Person.from_fullname(v["pubspec"]["author"].encode()) + else: + author = EMPTY_AUTHOR + + description = v["pubspec"]["description"] + + p_info = PubDevPackageInfo( + name=name, + filename=filename, + url=url, + version=version, + last_modified=last_modified, + author=author, + description=description, + ) + yield release_name(version), p_info + + def build_release( + self, p_info: PubDevPackageInfo, uncompressed_path: str, directory: Sha1Git + ) -> Optional[Release]: + + # Extract intrinsic metadata from uncompressed_path/pubspec.yaml + intrinsic_metadata = extract_intrinsic_metadata(Path(uncompressed_path)) + + name: str = intrinsic_metadata["name"] + version: str = intrinsic_metadata["version"] + assert version == p_info.version + + # author from intrinsic_metadata should not take precedence over the one + # returned
by the api, see https://dart.dev/tools/pub/pubspec#authorauthors + author: Person = p_info.author + + if "description" in intrinsic_metadata and intrinsic_metadata["description"]: + description = intrinsic_metadata["description"] + else: + description = p_info.description + + message = ( + f"Synthetic release for pub.dev source package {name} " + f"version {version}\n\n" + f"{description}\n" + ) + + return Release( + name=version.encode(), + author=author, + date=TimestampWithTimezone.from_iso8601(p_info.last_modified), + message=message.encode(), + target_type=ObjectType.DIRECTORY, + target=directory, + synthetic=True, + ) diff --git a/swh/loader/package/crates/tasks.py b/swh/loader/package/pubdev/tasks.py similarity index 51% copy from swh/loader/package/crates/tasks.py copy to swh/loader/package/pubdev/tasks.py index 9385263..f6a2927 100644 --- a/swh/loader/package/crates/tasks.py +++ b/swh/loader/package/pubdev/tasks.py @@ -1,14 +1,14 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from celery import shared_task -from swh.loader.package.crates.loader import CratesLoader +from swh.loader.package.pubdev.loader import PubDevLoader -@shared_task(name=__name__ + ".LoadCrates") -def load_crates(*, url=None, artifacts: list): - """Load Rust crate package""" - return CratesLoader.from_configfile(url=url, artifacts=artifacts).load() +@shared_task(name=__name__ + ".LoadPubDev") +def load_pubdev(**kwargs): + """Load packages from pub.dev (Dart, Flutter)""" + return PubDevLoader.from_configfile(**kwargs).load() diff --git a/swh/loader/package/pubdev/tests/__init__.py b/swh/loader/package/pubdev/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/swh/loader/package/pubdev/tests/data/fake_pubdev.sh b/swh/loader/package/pubdev/tests/data/fake_pubdev.sh new file mode 100644 index 0000000..c4c33dd --- /dev/null +++ b/swh/loader/package/pubdev/tests/data/fake_pubdev.sh @@ -0,0 +1,194 @@ +#!/usr/bin/env bash + +# Script to generate fake pub.dev HTTP API responses and fake Dart or Flutter package archives as .tar.gz.
+ +set -euo pipefail + +# Create directories +readonly TMP=tmp_dir/pubdev +readonly BASE_API=https_pub.dev +readonly BASE_ARCHIVES=https_pub.dartlang.org + +mkdir -p $TMP +mkdir -p $BASE_API +mkdir -p $BASE_ARCHIVES + +# http api response as json +echo -e '''{"name":"authentication","latest":{"version":"0.0.1","pubspec":{"name":"authentication","description":"Persistent user authentication for Flutter with optional backend API integration.","version":"0.0.1","author":null,"homepage":null,"environment":{"sdk":">=2.7.0 <3.0.0","flutter":">=1.17.0 <2.0.0"},"dependencies":{"flutter":{"sdk":"flutter"}},"dev_dependencies":{"flutter_test":{"sdk":"flutter"}},"flutter":{"plugin":{"platforms":{"some_platform":{"pluginClass":"somePluginClass"}}}}},"archive_url":"https://pub.dartlang.org/packages/authentication/versions/0.0.1.tar.gz","archive_sha256":"0179334b346cb67e4e6e3c905e5cc5c8e488a45ebd99fd2be3a7e0476d620d99","published":"2020-08-13T04:53:34.134687Z"},"versions":[{"version":"0.0.1","pubspec":{"name":"authentication","description":"Persistent user authentication for Flutter with optional backend API integration.","version":"0.0.1","author":null,"homepage":null,"environment":{"sdk":">=2.7.0 <3.0.0","flutter":">=1.17.0 <2.0.0"},"dependencies":{"flutter":{"sdk":"flutter"}},"dev_dependencies":{"flutter_test":{"sdk":"flutter"}},"flutter":{"plugin":{"platforms":{"some_platform":{"pluginClass":"somePluginClass"}}}}},"archive_url":"https://pub.dartlang.org/packages/authentication/versions/0.0.1.tar.gz","archive_sha256":"0179334b346cb67e4e6e3c905e5cc5c8e488a45ebd99fd2be3a7e0476d620d99","published":"2020-08-13T04:53:34.134687Z"}]} +''' > $BASE_API/api_packages_authentication + +echo -e '''{"name":"Autolinker","latest":{"version":"0.1.1","pubspec":{"version":"0.1.1","homepage":"https://github.com/hackcave","description":"Port of Autolinker.js to dart","name":"Autolinker","author":"hackcave "},"archive_url":"https://pub.dartlang.org/packages/Autolinker/versions/0.1.1.tar.gz","published":"2014-12-24T22:34:02.534090Z"},"versions":[{"version":"0.1.1","pubspec":{"version":"0.1.1","homepage":"https://github.com/hackcave","description":"Port of Autolinker.js to dart","name":"Autolinker","author":"hackcave "},"archive_url":"https://pub.dartlang.org/packages/Autolinker/versions/0.1.1.tar.gz","published":"2014-12-24T22:34:02.534090Z"}]} +''' > ${BASE_API}/api_packages_Autolinker + +echo -e '''{"name":"bezier","latest":{"version":"1.1.5","pubspec":{"name":"bezier","version":"1.1.5","authors":["Aaron Barrett ","Isaac Barrett "],"description":"A 2D Bézier curve math library. Based heavily on the work of @TheRealPomax .\nLive examples can be found at .","homepage":"https://github.com/aab29/bezier.dart","environment":{"sdk":">=2.0.0 <3.0.0"},"dependencies":{"vector_math":"^2.0.0"},"dev_dependencies":{"test":"^1.0.0"}},"archive_url":"https://pub.dartlang.org/packages/bezier/versions/1.1.5.tar.gz","archive_sha256":"cc5da2fa927b5d347550f78d456cd984b7df78a7f0405119cdab12111e2f9ee8","published":"2019-12-22T03:17:30.805225Z"},"versions":[{"version":"1.1.5","pubspec":{"name":"bezier","version":"1.1.5","authors":["Aaron Barrett ","Isaac Barrett "],"description":"A 2D Bézier curve math library. 
Based heavily on the work of @TheRealPomax .\nLive examples can be found at .","homepage":"https://github.com/aab29/bezier.dart","environment":{"sdk":">=2.0.0 <3.0.0"},"dependencies":{"vector_math":"^2.0.0"},"dev_dependencies":{"test":"^1.0.0"}},"archive_url":"https://pub.dartlang.org/packages/bezier/versions/1.1.5.tar.gz","archive_sha256":"cc5da2fa927b5d347550f78d456cd984b7df78a7f0405119cdab12111e2f9ee8","published":"2019-12-22T03:17:30.805225Z"}]} +''' > ${BASE_API}/api_packages_bezier + +echo -e '''{"name":"pdf","latest":{"version":"3.8.2","pubspec":{"name":"pdf","description":"A pdf producer for Dart. It can create pdf files for both web or flutter.","homepage":"https://github.com/DavBfr/dart_pdf/tree/master/pdf","repository":"https://github.com/DavBfr/dart_pdf","issue_tracker":"https://github.com/DavBfr/dart_pdf/issues","version":"3.8.2","environment":{"sdk":">=2.12.0 <3.0.0"},"dependencies":{"archive":"^3.1.0","barcode":">=2.2.0 <3.0.0","crypto":"^3.0.0","image":">=3.0.1 <4.0.0","meta":">=1.3.0 <2.0.0","path_parsing":">=0.2.0 <2.0.0","vector_math":"^2.1.0","xml":">=5.1.0 <7.0.0"},"dev_dependencies":{"flutter_lints":"^1.0.4","test":">=1.16.0 <2.0.0"}},"archive_url":"https://pub.dartlang.org/packages/pdf/versions/3.8.2.tar.gz","published":"2022-07-25T11:38:25.983876Z"},"versions":[{"version":"1.0.0","pubspec":{"version":"1.0.0","name":"pdf","dependencies":{"ttf_parser":"^1.0.0","vector_math":"^2.0.7","meta":"^1.1.5"},"author":"David PHAM-VAN ","description":"A pdf producer for Dart","homepage":"https://github.com/davbfr/dart_pdf","environment":{"sdk":">=1.8.0 <2.0.0"},"dev_dependencies":{"test":"any"}},"archive_url":"https://pub.dartlang.org/packages/pdf/versions/1.0.0.tar.gz","published":"2018-07-16T21:12:28.894137Z"},{"version":"3.8.2","pubspec":{"name":"pdf","description":"A pdf producer for Dart. 
It can create pdf files for both web or flutter.","homepage":"https://github.com/DavBfr/dart_pdf/tree/master/pdf","repository":"https://github.com/DavBfr/dart_pdf","issue_tracker":"https://github.com/DavBfr/dart_pdf/issues","version":"3.8.2","environment":{"sdk":">=2.12.0 <3.0.0"},"dependencies":{"archive":"^3.1.0","barcode":">=2.2.0 <3.0.0","crypto":"^3.0.0","image":">=3.0.1 <4.0.0","meta":">=1.3.0 <2.0.0","path_parsing":">=0.2.0 <2.0.0","vector_math":"^2.1.0","xml":">=5.1.0 <7.0.0"},"dev_dependencies":{"flutter_lints":"^1.0.4","test":">=1.16.0 <2.0.0"}},"archive_url":"https://pub.dartlang.org/packages/pdf/versions/3.8.2.tar.gz","published":"2022-07-25T11:38:25.983876Z"}]} +''' > ${BASE_API}/api_packages_pdf + +echo -e '''{"name":"abstract_io","latest":{"version":"0.1.2+6","pubspec":{"name":"abstract_io","description":"Abstract IO is designed to simplify and generalize saving data both localy and externaly","version":"0.1.2+6","author":"Anders Groeschel","repository":"https://github.com/AndersGroeschel/abstract_io","homepage":"https://github.com/AndersGroeschel/abstract_io","environment":{"sdk":">=2.7.0 <3.0.0"},"dependencies":{"flutter":{"sdk":"flutter"}}},"archive_url":"https://pub.dartlang.org/packages/abstract_io/versions/0.1.2%2B6.tar.gz","archive_sha256":"9557fd384730d92a046cfccdff9625f2d646657219d5a0e447cb7eb0fdf90f18","published":"2020-08-03T21:31:05.764846Z"},"versions":[{"version":"0.1.2+4","pubspec":{"name":"abstract_io","description":"Abstract IO is designed to simplify and generalize saving data both localy and externaly","version":"0.1.2+4","author":"Anders Groeschel","repository":"https://github.com/AndersGroeschel/abstract_io","homepage":"https://github.com/AndersGroeschel/abstract_io","environment":{"sdk":">=2.7.0 <3.0.0"},"dependencies":{"flutter":{"sdk":"flutter"}}},"archive_url":"https://pub.dartlang.org/packages/abstract_io/versions/0.1.2%2B4.tar.gz","archive_sha256":"df687ff2a92774db04a28167ccddbfe9c2fc1ea63c6ae05c3236552fe350bb68","published":"2020-08-03T20:14:38.116237Z"},{"version":"0.1.2+5","pubspec":{"name":"abstract_io","description":"Abstract IO is designed to simplify and generalize saving data both localy and externaly","version":"0.1.2+5","author":"Anders Groeschel","repository":"https://github.com/AndersGroeschel/abstract_io","homepage":"https://github.com/AndersGroeschel/abstract_io","environment":{"sdk":">=2.7.0 <3.0.0"},"dependencies":{"flutter":{"sdk":"flutter"}}},"archive_url":"https://pub.dartlang.org/packages/abstract_io/versions/0.1.2%2B5.tar.gz","archive_sha256":"fc9199c2f9879d3c0d140c05a2f8c537561af256d98d209b4ee102e8107ec2b9","published":"2020-08-03T21:09:20.329418Z"},{"version":"0.1.2+6","pubspec":{"name":"abstract_io","description":"Abstract IO is designed to simplify and generalize saving data both localy and externaly","version":"0.1.2+6","author":"Anders Groeschel","repository":"https://github.com/AndersGroeschel/abstract_io","homepage":"https://github.com/AndersGroeschel/abstract_io","environment":{"sdk":">=2.7.0 <3.0.0"},"dependencies":{"flutter":{"sdk":"flutter"}}},"archive_url":"https://pub.dartlang.org/packages/abstract_io/versions/0.1.2%2B6.tar.gz","archive_sha256":"9557fd384730d92a046cfccdff9625f2d646657219d5a0e447cb7eb0fdf90f18","published":"2020-08-03T21:31:05.764846Z"}]} +''' > ${BASE_API}/api_packages_abstract_io + +# Dart packages have a pubspec.yaml file at their root. Generate some of them.
+ +mkdir -p ${TMP}/packages_authentication_versions_0.0.1 +echo -e '''name: authentication +description: Persistent user authentication for Flutter with optional backend API integration. +version: 0.0.1 +author: +homepage: + +environment: + sdk: ">=2.7.0 <3.0.0" + flutter: ">=1.17.0 <2.0.0" + +dependencies: + flutter: + sdk: flutter + +dev_dependencies: + flutter_test: + sdk: flutter + +# For information on the generic Dart part of this file, see the +# following page: https://dart.dev/tools/pub/pubspec + +# The following section is specific to Flutter. +flutter: + # This section identifies this Flutter project as a plugin project. + # The 'pluginClass' and Android 'package' identifiers should not ordinarily + # be modified. They are used by the tooling to maintain consistency when + # adding or updating assets for this project. + plugin: + platforms: + # This plugin project was generated without specifying any + # platforms with the `--platform` argument. If you see the `fake_platform` map below, remove it and + # then add platforms following the instruction here: + # https://flutter.dev/docs/development/packages-and-plugins/developing-packages#plugin-platforms + # ------------------- + some_platform: + pluginClass: somePluginClass + # ------------------- + + # To add assets to your plugin package, add an assets section, like this: + # assets: + # - images/a_dot_burr.jpeg + # - images/a_dot_ham.jpeg + # + # For details regarding assets in packages, see + # https://flutter.dev/assets-and-images/#from-packages + # + # An image asset can refer to one or more resolution-specific "variants", see + # https://flutter.dev/assets-and-images/#resolution-aware. + + # To add custom fonts to your plugin package, add a fonts section here, + # in this "flutter" section. Each entry in this list should have a + # "family" key with the font family name, and a "fonts" key with a + # list giving the asset and other descriptors for the font. For + # example: + # fonts: + # - family: Schyler + # fonts: + # - asset: fonts/Schyler-Regular.ttf + # - asset: fonts/Schyler-Italic.ttf + # style: italic + # - family: Trajan Pro + # fonts: + # - asset: fonts/TrajanPro.ttf + # - asset: fonts/TrajanPro_Bold.ttf + # weight: 700 + # + # For details regarding fonts in packages, see + # https://flutter.dev/custom-fonts/#from-packages +''' > ${TMP}/packages_authentication_versions_0.0.1/pubspec.yaml + + +mkdir -p ${TMP}/packages_autolinker_versions_0.1.1 +echo -e '''name: Autolinker +version: 0.1.1 +author: hackcave +homepage: https://github.com/hackcave +description: + Port of Autolinker.js to dart +''' > ${TMP}/packages_autolinker_versions_0.1.1/pubspec.yaml + +mkdir -p ${TMP}/packages_bezier_versions_1.1.5 +echo -e '''name: bezier +version: 1.1.5 +authors: + - Aaron Barrett + - Isaac Barrett +description: >- + A 2D Bézier curve math library. Based heavily on the work of @TheRealPomax + . + + Live examples can be found at . 
+homepage: https://github.com/aab29/bezier.dart +environment: + sdk: ">=2.0.0 <3.0.0" +dependencies: + vector_math: ^2.0.0 +dev_dependencies: + test: ^1.0.0 +''' > ${TMP}/packages_bezier_versions_1.1.5/pubspec.yaml + +mkdir -p ${TMP}/packages_pdf_versions_1.0.0 +echo -e '''name: pdf +author: David PHAM-VAN +description: A pdf producer for Dart +homepage: https://github.com/davbfr/dart_pdf +version: 1.0.0 + +environment: + sdk: ">=1.8.0 <2.0.0" + +dependencies: + meta: "^1.1.5" + ttf_parser: "^1.0.0" + vector_math: "^2.0.7" + +dev_dependencies: + test: any +''' > ${TMP}/packages_pdf_versions_1.0.0/pubspec.yaml + +mkdir -p ${TMP}/packages_pdf_versions_3.8.2 +echo -e '''name: pdf +description: A pdf producer for Dart. It can create pdf files for both web or flutter. +homepage: https://github.com/DavBfr/dart_pdf/tree/master/pdf +repository: https://github.com/DavBfr/dart_pdf +issue_tracker: https://github.com/DavBfr/dart_pdf/issues +version: 3.8.2 + +environment: + sdk: ">=2.12.0 <3.0.0" + +dependencies: + archive: ^3.1.0 + barcode: ">=2.2.0 <3.0.0" + crypto: ^3.0.0 + image: ">=3.0.1 <4.0.0" + meta: ">=1.3.0 <2.0.0" + path_parsing: ">=0.2.0 <2.0.0" + vector_math: ^2.1.0 + xml: ">=5.1.0 <7.0.0" + +dev_dependencies: + flutter_lints: ^1.0.4 + test: ">=1.16.0 <2.0.0" +''' > ${TMP}/packages_pdf_versions_3.8.2/pubspec.yaml + +cd $TMP + +tar -czf packages_authentication_versions_0.0.1.tar.gz -C packages_authentication_versions_0.0.1 . +tar -czf packages_Autolinker_versions_0.1.1.tar.gz -C packages_autolinker_versions_0.1.1 . +tar -czf packages_bezier_versions_1.1.5.tar.gz -C packages_bezier_versions_1.1.5 . +tar -czf packages_pdf_versions_1.0.0.tar.gz -C packages_pdf_versions_1.0.0 . +tar -czf packages_pdf_versions_3.8.2.tar.gz -C packages_pdf_versions_3.8.2 . 
+ + +# Move .tar.gz archives to a servable directory +mv *.tar.gz ../../$BASE_ARCHIVES + +# Clean up by removing tmp_dir +cd ../../ +rm -r tmp_dir/ diff --git a/swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_Autolinker_versions_0.1.1.tar.gz b/swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_Autolinker_versions_0.1.1.tar.gz new file mode 100644 index 0000000..5cdf2dd Binary files /dev/null and b/swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_Autolinker_versions_0.1.1.tar.gz differ diff --git a/swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_authentication_versions_0.0.1.tar.gz b/swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_authentication_versions_0.0.1.tar.gz new file mode 100644 index 0000000..4338014 Binary files /dev/null and b/swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_authentication_versions_0.0.1.tar.gz differ diff --git a/swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_bezier_versions_1.1.5.tar.gz b/swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_bezier_versions_1.1.5.tar.gz new file mode 100644 index 0000000..5a92354 Binary files /dev/null and b/swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_bezier_versions_1.1.5.tar.gz differ diff --git a/swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_pdf_versions_1.0.0.tar.gz b/swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_pdf_versions_1.0.0.tar.gz new file mode 100644 index 0000000..d30d19f Binary files /dev/null and b/swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_pdf_versions_1.0.0.tar.gz differ diff --git a/swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_pdf_versions_3.8.2.tar.gz b/swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_pdf_versions_3.8.2.tar.gz new file mode 100644 index 0000000..45e37e8 Binary files /dev/null and b/swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_pdf_versions_3.8.2.tar.gz differ diff --git a/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_Autolinker b/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_Autolinker new file mode 100644 index 0000000..b60f1d8 --- /dev/null +++ b/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_Autolinker @@ -0,0 +1,29 @@ +{ + "name": "Autolinker", + "latest": { + "version": "0.1.1", + "pubspec": { + "version": "0.1.1", + "homepage": "https://github.com/hackcave", + "description": "Port of Autolinker.js to dart", + "name": "Autolinker", + "author": "hackcave " + }, + "archive_url": "https://pub.dartlang.org/packages/Autolinker/versions/0.1.1.tar.gz", + "published": "2014-12-24T22:34:02.534090Z" + }, + "versions": [ + { + "version": "0.1.1", + "pubspec": { + "version": "0.1.1", + "homepage": "https://github.com/hackcave", + "description": "Port of Autolinker.js to dart", + "name": "Autolinker", + "author": "hackcave " + }, + "archive_url": "https://pub.dartlang.org/packages/Autolinker/versions/0.1.1.tar.gz", + "published": "2014-12-24T22:34:02.534090Z" + } + ] +} \ No newline at end of file diff --git a/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_abstract_io b/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_abstract_io new file mode 100644 index 0000000..1d00f61 --- /dev/null +++ b/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_abstract_io @@ -0,0 +1,93 @@ +{ + "name":
"abstract_io", + "latest": { + "version": "0.1.2+6", + "pubspec": { + "name": "abstract_io", + "description": "Abstract IO is designed to simplify and generalize saving data both localy and externaly", + "version": "0.1.2+6", + "author": "Anders Groeschel", + "repository": "https://github.com/AndersGroeschel/abstract_io", + "homepage": "https://github.com/AndersGroeschel/abstract_io", + "environment": { + "sdk": ">=2.7.0 <3.0.0" + }, + "dependencies": { + "flutter": { + "sdk": "flutter" + } + } + }, + "archive_url": "https://pub.dartlang.org/packages/abstract_io/versions/0.1.2%2B6.tar.gz", + "archive_sha256": "9557fd384730d92a046cfccdff9625f2d646657219d5a0e447cb7eb0fdf90f18", + "published": "2020-08-03T21:31:05.764846Z" + }, + "versions": [ + { + "version": "0.1.2+4", + "pubspec": { + "name": "abstract_io", + "description": "Abstract IO is designed to simplify and generalize saving data both localy and externaly", + "version": "0.1.2+4", + "author": "Anders Groeschel", + "repository": "https://github.com/AndersGroeschel/abstract_io", + "homepage": "https://github.com/AndersGroeschel/abstract_io", + "environment": { + "sdk": ">=2.7.0 <3.0.0" + }, + "dependencies": { + "flutter": { + "sdk": "flutter" + } + } + }, + "archive_url": "https://pub.dartlang.org/packages/abstract_io/versions/0.1.2%2B4.tar.gz", + "archive_sha256": "df687ff2a92774db04a28167ccddbfe9c2fc1ea63c6ae05c3236552fe350bb68", + "published": "2020-08-03T20:14:38.116237Z" + }, + { + "version": "0.1.2+5", + "pubspec": { + "name": "abstract_io", + "description": "Abstract IO is designed to simplify and generalize saving data both localy and externaly", + "version": "0.1.2+5", + "author": "Anders Groeschel", + "repository": "https://github.com/AndersGroeschel/abstract_io", + "homepage": "https://github.com/AndersGroeschel/abstract_io", + "environment": { + "sdk": ">=2.7.0 <3.0.0" + }, + "dependencies": { + "flutter": { + "sdk": "flutter" + } + } + }, + "archive_url": "https://pub.dartlang.org/packages/abstract_io/versions/0.1.2%2B5.tar.gz", + "archive_sha256": "fc9199c2f9879d3c0d140c05a2f8c537561af256d98d209b4ee102e8107ec2b9", + "published": "2020-08-03T21:09:20.329418Z" + }, + { + "version": "0.1.2+6", + "pubspec": { + "name": "abstract_io", + "description": "Abstract IO is designed to simplify and generalize saving data both localy and externaly", + "version": "0.1.2+6", + "author": "Anders Groeschel", + "repository": "https://github.com/AndersGroeschel/abstract_io", + "homepage": "https://github.com/AndersGroeschel/abstract_io", + "environment": { + "sdk": ">=2.7.0 <3.0.0" + }, + "dependencies": { + "flutter": { + "sdk": "flutter" + } + } + }, + "archive_url": "https://pub.dartlang.org/packages/abstract_io/versions/0.1.2%2B6.tar.gz", + "archive_sha256": "9557fd384730d92a046cfccdff9625f2d646657219d5a0e447cb7eb0fdf90f18", + "published": "2020-08-03T21:31:05.764846Z" + } + ] +} \ No newline at end of file diff --git a/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_authentication b/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_authentication new file mode 100644 index 0000000..b4b312a --- /dev/null +++ b/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_authentication @@ -0,0 +1,77 @@ +{ + "name": "authentication", + "latest": { + "version": "0.0.1", + "pubspec": { + "name": "authentication", + "description": "Persistent user authentication for Flutter with optional backend API integration.", + "version": "0.0.1", + "author": null, + "homepage": null, + "environment": { + "sdk": 
">=2.7.0 <3.0.0", + "flutter": ">=1.17.0 <2.0.0" + }, + "dependencies": { + "flutter": { + "sdk": "flutter" + } + }, + "dev_dependencies": { + "flutter_test": { + "sdk": "flutter" + } + }, + "flutter": { + "plugin": { + "platforms": { + "some_platform": { + "pluginClass": "somePluginClass" + } + } + } + } + }, + "archive_url": "https://pub.dartlang.org/packages/authentication/versions/0.0.1.tar.gz", + "archive_sha256": "0179334b346cb67e4e6e3c905e5cc5c8e488a45ebd99fd2be3a7e0476d620d99", + "published": "2020-08-13T04:53:34.134687Z" + }, + "versions": [ + { + "version": "0.0.1", + "pubspec": { + "name": "authentication", + "description": "Persistent user authentication for Flutter with optional backend API integration.", + "version": "0.0.1", + "author": null, + "homepage": null, + "environment": { + "sdk": ">=2.7.0 <3.0.0", + "flutter": ">=1.17.0 <2.0.0" + }, + "dependencies": { + "flutter": { + "sdk": "flutter" + } + }, + "dev_dependencies": { + "flutter_test": { + "sdk": "flutter" + } + }, + "flutter": { + "plugin": { + "platforms": { + "some_platform": { + "pluginClass": "somePluginClass" + } + } + } + } + }, + "archive_url": "https://pub.dartlang.org/packages/authentication/versions/0.0.1.tar.gz", + "archive_sha256": "0179334b346cb67e4e6e3c905e5cc5c8e488a45ebd99fd2be3a7e0476d620d99", + "published": "2020-08-13T04:53:34.134687Z" + } + ] +} \ No newline at end of file diff --git a/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_bezier b/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_bezier new file mode 100644 index 0000000..efd84a4 --- /dev/null +++ b/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_bezier @@ -0,0 +1,55 @@ +{ + "name": "bezier", + "latest": { + "version": "1.1.5", + "pubspec": { + "name": "bezier", + "version": "1.1.5", + "authors": [ + "Aaron Barrett ", + "Isaac Barrett " + ], + "description": "A 2D Bézier curve math library. Based heavily on the work of @TheRealPomax .\nLive examples can be found at .", + "homepage": "https://github.com/aab29/bezier.dart", + "environment": { + "sdk": ">=2.0.0 <3.0.0" + }, + "dependencies": { + "vector_math": "^2.0.0" + }, + "dev_dependencies": { + "test": "^1.0.0" + } + }, + "archive_url": "https://pub.dartlang.org/packages/bezier/versions/1.1.5.tar.gz", + "archive_sha256": "cc5da2fa927b5d347550f78d456cd984b7df78a7f0405119cdab12111e2f9ee8", + "published": "2019-12-22T03:17:30.805225Z" + }, + "versions": [ + { + "version": "1.1.5", + "pubspec": { + "name": "bezier", + "version": "1.1.5", + "authors": [ + "Aaron Barrett ", + "Isaac Barrett " + ], + "description": "A 2D Bézier curve math library. 
Based heavily on the work of @TheRealPomax .\nLive examples can be found at .", + "homepage": "https://github.com/aab29/bezier.dart", + "environment": { + "sdk": ">=2.0.0 <3.0.0" + }, + "dependencies": { + "vector_math": "^2.0.0" + }, + "dev_dependencies": { + "test": "^1.0.0" + } + }, + "archive_url": "https://pub.dartlang.org/packages/bezier/versions/1.1.5.tar.gz", + "archive_sha256": "cc5da2fa927b5d347550f78d456cd984b7df78a7f0405119cdab12111e2f9ee8", + "published": "2019-12-22T03:17:30.805225Z" + } + ] +} \ No newline at end of file diff --git a/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_pdf b/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_pdf new file mode 100644 index 0000000..c015890 --- /dev/null +++ b/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_pdf @@ -0,0 +1,88 @@ +{ + "name": "pdf", + "latest": { + "version": "3.8.2", + "pubspec": { + "name": "pdf", + "description": "A pdf producer for Dart. It can create pdf files for both web or flutter.", + "homepage": "https://github.com/DavBfr/dart_pdf/tree/master/pdf", + "repository": "https://github.com/DavBfr/dart_pdf", + "issue_tracker": "https://github.com/DavBfr/dart_pdf/issues", + "version": "3.8.2", + "environment": { + "sdk": ">=2.12.0 <3.0.0" + }, + "dependencies": { + "archive": "^3.1.0", + "barcode": ">=2.2.0 <3.0.0", + "crypto": "^3.0.0", + "image": ">=3.0.1 <4.0.0", + "meta": ">=1.3.0 <2.0.0", + "path_parsing": ">=0.2.0 <2.0.0", + "vector_math": "^2.1.0", + "xml": ">=5.1.0 <7.0.0" + }, + "dev_dependencies": { + "flutter_lints": "^1.0.4", + "test": ">=1.16.0 <2.0.0" + } + }, + "archive_url": "https://pub.dartlang.org/packages/pdf/versions/3.8.2.tar.gz", + "published": "2022-07-25T11:38:25.983876Z" + }, + "versions": [ + { + "version": "1.0.0", + "pubspec": { + "version": "1.0.0", + "name": "pdf", + "dependencies": { + "ttf_parser": "^1.0.0", + "vector_math": "^2.0.7", + "meta": "^1.1.5" + }, + "author": "David PHAM-VAN ", + "description": "A pdf producer for Dart", + "homepage": "https://github.com/davbfr/dart_pdf", + "environment": { + "sdk": ">=1.8.0 <2.0.0" + }, + "dev_dependencies": { + "test": "any" + } + }, + "archive_url": "https://pub.dartlang.org/packages/pdf/versions/1.0.0.tar.gz", + "published": "2018-07-16T21:12:28.894137Z" + }, + { + "version": "3.8.2", + "pubspec": { + "name": "pdf", + "description": "A pdf producer for Dart. 
It can create pdf files for both web or flutter.", + "homepage": "https://github.com/DavBfr/dart_pdf/tree/master/pdf", + "repository": "https://github.com/DavBfr/dart_pdf", + "issue_tracker": "https://github.com/DavBfr/dart_pdf/issues", + "version": "3.8.2", + "environment": { + "sdk": ">=2.12.0 <3.0.0" + }, + "dependencies": { + "archive": "^3.1.0", + "barcode": ">=2.2.0 <3.0.0", + "crypto": "^3.0.0", + "image": ">=3.0.1 <4.0.0", + "meta": ">=1.3.0 <2.0.0", + "path_parsing": ">=0.2.0 <2.0.0", + "vector_math": "^2.1.0", + "xml": ">=5.1.0 <7.0.0" + }, + "dev_dependencies": { + "flutter_lints": "^1.0.4", + "test": ">=1.16.0 <2.0.0" + } + }, + "archive_url": "https://pub.dartlang.org/packages/pdf/versions/3.8.2.tar.gz", + "published": "2022-07-25T11:38:25.983876Z" + } + ] +} \ No newline at end of file diff --git a/swh/loader/package/pubdev/tests/test_pubdev.py b/swh/loader/package/pubdev/tests/test_pubdev.py new file mode 100644 index 0000000..0979dfd --- /dev/null +++ b/swh/loader/package/pubdev/tests/test_pubdev.py @@ -0,0 +1,272 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import pytest + +from swh.loader.package.pubdev.loader import PubDevLoader +from swh.loader.package.utils import EMPTY_AUTHOR +from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats +from swh.model.hashutil import hash_to_bytes +from swh.model.model import ( + ObjectType, + Person, + Release, + Snapshot, + SnapshotBranch, + TargetType, + TimestampWithTimezone, +) + +EXPECTED_PACKAGES = [ + { + "url": "https://pub.dev/packages/Autolinker", # one version + }, + { + "url": "https://pub.dev/packages/pdf", # multiple versions + }, + { + "url": "https://pub.dev/packages/bezier", # multiple authors + }, + { + "url": "https://pub.dev/packages/authentication", # empty author + }, + { + "url": "https://pub.dev/packages/abstract_io", # loose version names + }, +] + + +def test_get_versions(requests_mock_datadir, swh_storage): + loader = PubDevLoader( + swh_storage, + url=EXPECTED_PACKAGES[1]["url"], + ) + assert loader.get_versions() == [ + "1.0.0", + "3.8.2", + ] + + +def test_get_loose_versions(requests_mock_datadir, swh_storage): + """Sometimes version names do not follow semver""" + loader = PubDevLoader( + swh_storage, + url=EXPECTED_PACKAGES[4]["url"], + ) + assert loader.get_versions() == ["0.1.2+4", "0.1.2+5", "0.1.2+6"] + + +def test_get_default_version(requests_mock_datadir, swh_storage): + loader = PubDevLoader( + swh_storage, + url=EXPECTED_PACKAGES[1]["url"], + ) + assert loader.get_default_version() == "3.8.2" + + +def test_pubdev_loader_load_one_version(datadir, requests_mock_datadir, swh_storage): + loader = PubDevLoader( + swh_storage, + url=EXPECTED_PACKAGES[0]["url"], + ) + load_status = loader.load() + assert load_status["status"] == "eventful" + assert load_status["snapshot_id"] is not None + + expected_snapshot_id = "245092931ba809e6c54ebda8f865fb5a969a4134" + expected_release_id = "919f267ea050539606344d49d14bf594c4386e5a" + + assert expected_snapshot_id == load_status["snapshot_id"] + + expected_snapshot = Snapshot( + id=hash_to_bytes(load_status["snapshot_id"]), + branches={ + b"releases/0.1.1": SnapshotBranch( + target=hash_to_bytes(expected_release_id), + target_type=TargetType.RELEASE, + ), + b"HEAD": SnapshotBranch( + target=b"releases/0.1.1",
target_type=TargetType.ALIAS, + ), + }, + ) + + check_snapshot(expected_snapshot, swh_storage) + + stats = get_stats(swh_storage) + assert { + "content": 1, + "directory": 1, + "origin": 1, + "origin_visit": 1, + "release": 1, + "revision": 0, + "skipped_content": 0, + "snapshot": 1, + } == stats + + assert swh_storage.release_get([hash_to_bytes(expected_release_id)])[0] == Release( + name=b"0.1.1", + message=b"Synthetic release for pub.dev source package Autolinker version" + b" 0.1.1\n\nPort of Autolinker.js to dart\n", + target=hash_to_bytes("3fb6d4f2c0334d1604357ae92b2dd38a55a78194"), + target_type=ObjectType.DIRECTORY, + synthetic=True, + author=Person( + fullname=b"hackcave ", + name=b"hackcave", + email=b"hackers@hackcave.org", + ), + date=TimestampWithTimezone.from_iso8601("2014-12-24T22:34:02.534090+00:00"), + id=hash_to_bytes(expected_release_id), + ) + + assert_last_visit_matches( + swh_storage, + url=EXPECTED_PACKAGES[0]["url"], + status="full", + type="pubdev", + snapshot=expected_snapshot.id, + ) + + +def test_pubdev_loader_load_multiple_versions( + datadir, requests_mock_datadir, swh_storage +): + loader = PubDevLoader( + swh_storage, + url=EXPECTED_PACKAGES[1]["url"], + ) + load_status = loader.load() + + assert load_status["status"] == "eventful" + assert load_status["snapshot_id"] is not None + + expected_snapshot_id = "43d5b68a9fa973aa95e56916aaef70841ccbc2a0" + + assert expected_snapshot_id == load_status["snapshot_id"] + + expected_snapshot = Snapshot( + id=hash_to_bytes(load_status["snapshot_id"]), + branches={ + b"releases/1.0.0": SnapshotBranch( + target=hash_to_bytes("fbf8e40af675096681954553d737861e10b57216"), + target_type=TargetType.RELEASE, + ), + b"releases/3.8.2": SnapshotBranch( + target=hash_to_bytes("627a5d586e3fb4e7319b17f1aee268fe2fb8e01c"), + target_type=TargetType.RELEASE, + ), + b"HEAD": SnapshotBranch( + target=b"releases/3.8.2", + target_type=TargetType.ALIAS, + ), + }, + ) + + check_snapshot(expected_snapshot, swh_storage) + + stats = get_stats(swh_storage) + assert { + "content": 1 + 1, + "directory": 1 + 1, + "origin": 1, + "origin_visit": 1, + "release": 1 + 1, + "revision": 0, + "skipped_content": 0, + "snapshot": 1, + } == stats + + assert_last_visit_matches( + swh_storage, + url=EXPECTED_PACKAGES[1]["url"], + status="full", + type="pubdev", + snapshot=expected_snapshot.id, + ) + + +def test_pubdev_loader_multiple_authors(datadir, requests_mock_datadir, swh_storage): + loader = PubDevLoader( + swh_storage, + url=EXPECTED_PACKAGES[2]["url"], + ) + load_status = loader.load() + assert load_status["status"] == "eventful" + assert load_status["snapshot_id"] is not None + + expected_snapshot_id = "4fa9f19d1d6ccc70921c8c50b278f510db63aa36" + expected_release_id = "538c98fd69a42d8d0561a7ca95b354de2143a3ab" + + assert expected_snapshot_id == load_status["snapshot_id"] + + expected_snapshot = Snapshot( + id=hash_to_bytes(load_status["snapshot_id"]), + branches={ + b"releases/1.1.5": SnapshotBranch( + target=hash_to_bytes(expected_release_id), + target_type=TargetType.RELEASE, + ), + b"HEAD": SnapshotBranch( + target=b"releases/1.1.5", + target_type=TargetType.ALIAS, + ), + }, + ) + + check_snapshot(expected_snapshot, swh_storage) + + release = swh_storage.release_get([hash_to_bytes(expected_release_id)])[0] + assert release.author == Person( + fullname=b"Aaron Barrett ", + name=b"Aaron Barrett", + email=b"aaron@aaronbarrett.com", + ) + + +def test_pubdev_loader_empty_author(datadir, requests_mock_datadir, swh_storage): + loader = PubDevLoader( + 
swh_storage, + url=EXPECTED_PACKAGES[3]["url"], + ) + + load_status = loader.load() + assert load_status["status"] == "eventful" + assert load_status["snapshot_id"] is not None + + expected_snapshot_id = "0c7fa6b9fced23c648d2093ad5597622683f8aed" + expected_release_id = "7d8c05181069aa1049a3f0bc1d13bedc34625d47" + + assert expected_snapshot_id == load_status["snapshot_id"] + + expected_snapshot = Snapshot( + id=hash_to_bytes(load_status["snapshot_id"]), + branches={ + b"releases/0.0.1": SnapshotBranch( + target=hash_to_bytes(expected_release_id), + target_type=TargetType.RELEASE, + ), + b"HEAD": SnapshotBranch( + target=b"releases/0.0.1", + target_type=TargetType.ALIAS, + ), + }, + ) + + check_snapshot(expected_snapshot, swh_storage) + + release = swh_storage.release_get([hash_to_bytes(expected_release_id)])[0] + assert release.author == EMPTY_AUTHOR + + +def test_pubdev_invalid_origin(swh_storage): + + with pytest.raises(AssertionError): + PubDevLoader( + swh_storage, + "http://nowhere/api/packages/42", + ) diff --git a/swh/loader/package/pubdev/tests/test_tasks.py b/swh/loader/package/pubdev/tests/test_tasks.py new file mode 100644 index 0000000..c5b2ce7 --- /dev/null +++ b/swh/loader/package/pubdev/tests/test_tasks.py @@ -0,0 +1,23 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def test_tasks_pubdev_loader( + mocker, swh_scheduler_celery_app, swh_scheduler_celery_worker, swh_config +): + mock_load = mocker.patch("swh.loader.package.pubdev.loader.PubDevLoader.load") + mock_load.return_value = {"status": "eventful"} + + res = swh_scheduler_celery_app.send_task( + "swh.loader.package.pubdev.tasks.LoadPubDev", + kwargs=dict( + url="https://pub.dev/packages/some-package", + ), + ) + assert res + res.wait() + assert res.successful() + assert mock_load.called + assert res.result == {"status": "eventful"} diff --git a/swh/loader/package/tests/test_utils.py b/swh/loader/package/tests/test_utils.py index 75373e7..bf1f4da 100644 --- a/swh/loader/package/tests/test_utils.py +++ b/swh/loader/package/tests/test_utils.py @@ -1,236 +1,273 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import os from unittest.mock import MagicMock from urllib.error import URLError from urllib.parse import quote import pytest +from requests.exceptions import HTTPError from swh.loader.exception import NotFound import swh.loader.package from swh.loader.package.utils import api_info, download, release_name def test_version_generation(): assert ( swh.loader.package.__version__ != "devel" ), "Make sure swh.loader.core is installed (e.g. pip install -e .)" @pytest.mark.fs def test_download_fail_to_download(tmp_path, requests_mock): url = "https://pypi.org/pypi/arrow/json" status_code = 404 requests_mock.get(url, status_code=status_code) - with pytest.raises(ValueError) as e: + with pytest.raises( + HTTPError, match=f"{status_code} Client Error: None for url: {url}" + ): download(url, tmp_path) - assert e.value.args[0] == "Fail to query '%s'. 
Reason: %s" % (url, status_code) - _filename = "requests-0.0.1.tar.gz" _data = "this is something" def _check_download_ok(url, dest, filename=_filename, hashes={}): actual_filepath, actual_hashes = download(url, dest, hashes=hashes) actual_filename = os.path.basename(actual_filepath) assert actual_filename == filename assert actual_hashes["length"] == len(_data) assert ( actual_hashes["checksums"]["sha1"] == "fdd1ce606a904b08c816ba84f3125f2af44d92b2" ) assert ( actual_hashes["checksums"]["sha256"] == "1d9224378d77925d612c9f926eb9fb92850e6551def8328011b6a972323298d5" ) @pytest.mark.fs def test_download_ok(tmp_path, requests_mock): """Download without issue should provide filename and hashes""" url = f"https://pypi.org/pypi/requests/{_filename}" requests_mock.get(url, text=_data, headers={"content-length": str(len(_data))}) _check_download_ok(url, dest=str(tmp_path)) @pytest.mark.fs def test_download_ok_no_header(tmp_path, requests_mock): """Download without issue should provide filename and hashes""" url = f"https://pypi.org/pypi/requests/{_filename}" requests_mock.get(url, text=_data) # no header information _check_download_ok(url, dest=str(tmp_path)) @pytest.mark.fs def test_download_ok_with_hashes(tmp_path, requests_mock): """Download without issue should provide filename and hashes""" url = f"https://pypi.org/pypi/requests/{_filename}" requests_mock.get(url, text=_data, headers={"content-length": str(len(_data))}) # good hashes for such file good = { "sha1": "fdd1ce606a904b08c816ba84f3125f2af44d92b2", "sha256": "1d9224378d77925d612c9f926eb9fb92850e6551def8328011b6a972323298d5", # noqa } _check_download_ok(url, dest=str(tmp_path), hashes=good) @pytest.mark.fs def test_download_fail_hashes_mismatch(tmp_path, requests_mock): """Mismatch hash after download should raise""" url = f"https://pypi.org/pypi/requests/{_filename}" requests_mock.get(url, text=_data, headers={"content-length": str(len(_data))}) # good hashes for such file good = { "sha1": "fdd1ce606a904b08c816ba84f3125f2af44d92b2", "sha256": "1d9224378d77925d612c9f926eb9fb92850e6551def8328011b6a972323298d5", # noqa } for hash_algo in good.keys(): wrong_hash = good[hash_algo].replace("1", "0") expected_hashes = good.copy() expected_hashes[hash_algo] = wrong_hash # set the wrong hash expected_msg = "Failure when fetching %s. 
" "Checksum mismatched: %s != %s" % ( url, wrong_hash, good[hash_algo], ) with pytest.raises(ValueError, match=expected_msg): download(url, dest=str(tmp_path), hashes=expected_hashes) @pytest.mark.fs def test_ftp_download_ok(tmp_path, mocker): """Download without issue should provide filename and hashes""" url = f"ftp://pypi.org/pypi/requests/{_filename}" cm = MagicMock() cm.getstatus.return_value = 200 cm.read.side_effect = [_data.encode(), b""] cm.__enter__.return_value = cm mocker.patch("swh.loader.package.utils.urlopen").return_value = cm _check_download_ok(url, dest=str(tmp_path)) @pytest.mark.fs def test_ftp_download_ko(tmp_path, mocker): """Download without issue should provide filename and hashes""" filename = "requests-0.0.1.tar.gz" url = "ftp://pypi.org/pypi/requests/%s" % filename mocker.patch("swh.loader.package.utils.urlopen").side_effect = URLError("FTP error") with pytest.raises(URLError): download(url, dest=str(tmp_path)) @pytest.mark.fs def test_download_with_redirection(tmp_path, requests_mock): """Download with redirection should use the targeted URL to extract filename""" url = "https://example.org/project/requests/download" redirection_url = f"https://example.org/project/requests/files/{_filename}" requests_mock.get(url, status_code=302, headers={"location": redirection_url}) requests_mock.get( redirection_url, text=_data, headers={"content-length": str(len(_data))} ) _check_download_ok(url, dest=str(tmp_path)) def test_download_extracting_filename_from_url(tmp_path, requests_mock): """Extracting filename from url must sanitize the filename first""" url = "https://example.org/project/requests-0.0.1.tar.gz?a=b&c=d&foo=bar" requests_mock.get( url, status_code=200, text=_data, headers={"content-length": str(len(_data))} ) _check_download_ok(url, dest=str(tmp_path)) @pytest.mark.fs @pytest.mark.parametrize( "filename", [f'"{_filename}"', _filename, '"filename with spaces.tar.gz"'] ) def test_download_filename_from_content_disposition(tmp_path, requests_mock, filename): """Filename should be extracted from content-disposition request header when available.""" url = "https://example.org/download/requests/tar.gz/v0.0.1" requests_mock.get( url, text=_data, headers={ "content-length": str(len(_data)), "content-disposition": f"attachment; filename={filename}", }, ) _check_download_ok(url, dest=str(tmp_path), filename=filename.strip('"')) @pytest.mark.fs @pytest.mark.parametrize("filename", ['"archive école.tar.gz"', "archive_école.tgz"]) def test_download_utf8_filename_from_content_disposition( tmp_path, requests_mock, filename ): """Filename should be extracted from content-disposition request header when available.""" url = "https://example.org/download/requests/tar.gz/v0.0.1" data = "this is something" requests_mock.get( url, text=data, headers={ "content-length": str(len(data)), "content-disposition": f"attachment; filename*=utf-8''{quote(filename)}", }, ) _check_download_ok(url, dest=str(tmp_path), filename=filename.strip('"')) def test_api_info_failure(requests_mock): """Failure to fetch info/release information should raise""" url = "https://pypi.org/pypi/requests/json" status_code = 400 requests_mock.get(url, status_code=status_code) with pytest.raises(NotFound) as e0: api_info(url) assert e0.value.args[0] == "Fail to query '%s'. 
Reason: %s" % (url, status_code) def test_api_info(requests_mock): """Fetching json info from pypi project should be ok""" url = "https://pypi.org/pypi/requests/json" requests_mock.get(url, text='{"version": "0.0.1"}') actual_info = json.loads(api_info(url)) assert actual_info == { "version": "0.0.1", } def test_release_name(): for version, filename, expected_release in [ ("0.0.1", None, "releases/0.0.1"), ("0.0.2", "something", "releases/0.0.2/something"), ]: assert release_name(version, filename) == expected_release + + +@pytest.fixture(autouse=True) +def mock_download_retry_sleep(mocker): + mocker.patch.object(download.retry, "sleep") + + +def test_download_retry(mocker, requests_mock, tmp_path): + url = f"https://example.org/project/requests/files/{_filename}" + + requests_mock.get( + url, + [ + {"status_code": 429}, + {"status_code": 429}, + { + "text": _data, + "headers": {"content-length": str(len(_data))}, + "status_code": 200, + }, + ], + ) + + _check_download_ok(url, dest=str(tmp_path)) + + +def test_download_retry_reraise(mocker, requests_mock, tmp_path): + url = f"https://example.org/project/requests/files/{_filename}" + + requests_mock.get( + url, + [{"status_code": 429}] * 5, + ) + + with pytest.raises(HTTPError): + _check_download_ok(url, dest=str(tmp_path)) diff --git a/swh/loader/package/utils.py b/swh/loader/package/utils.py index 0656eca..df3127c 100644 --- a/swh/loader/package/utils.py +++ b/swh/loader/package/utils.py @@ -1,185 +1,207 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import copy import functools import itertools import logging import os import re from typing import Callable, Dict, Optional, Tuple, TypeVar from urllib.parse import unquote, urlsplit from urllib.request import urlopen import requests +from requests.exceptions import HTTPError +from tenacity import retry +from tenacity.before_sleep import before_sleep_log +from tenacity.stop import stop_after_attempt +from tenacity.wait import wait_exponential from swh.loader.exception import NotFound from swh.loader.package import DEFAULT_PARAMS from swh.model.hashutil import HASH_BLOCK_SIZE, MultiHash from swh.model.model import Person logger = logging.getLogger(__name__) DOWNLOAD_HASHES = set(["sha1", "sha256", "length"]) EMPTY_AUTHOR = Person.from_fullname(b"") def api_info(url: str, **extra_params) -> bytes: """Basic api client to retrieve information on project. This deals with fetching json metadata about pypi projects. Args: url (str): The api url (e.g PyPI, npm, etc...) Raises: NotFound in case of query failures (for some reasons: 404, ...) Returns: The associated response's information """ response = requests.get(url, **{**DEFAULT_PARAMS, **extra_params}) if response.status_code != 200: raise NotFound(f"Fail to query '{url}'. 
Reason: {response.status_code}") return response.content def _content_disposition_filename(header: str) -> Optional[str]: fname = None fnames = re.findall(r"filename[\*]?=([^;]+)", header) if fnames and "utf-8''" in fnames[0].lower(): # RFC 5987 fname = re.sub("utf-8''", "", fnames[0], flags=re.IGNORECASE) fname = unquote(fname) elif fnames: fname = fnames[0] if fname: fname = os.path.basename(fname.strip().strip('"')) return fname +def _retry_if_throttling(retry_state) -> bool: + """Custom tenacity retry predicate for handling HTTP responses with + status code 429 (too many requests). + """ + attempt = retry_state.outcome + if attempt.failed: + exception = attempt.exception() + return ( + isinstance(exception, HTTPError) and exception.response.status_code == 429 + ) + return False + + +@retry( + retry=_retry_if_throttling, + wait=wait_exponential(exp_base=10), + stop=stop_after_attempt(max_attempt_number=5), + before_sleep=before_sleep_log(logger, logging.WARNING), + reraise=True, +) def download( url: str, dest: str, hashes: Dict = {}, filename: Optional[str] = None, auth: Optional[Tuple[str, str]] = None, extra_request_headers: Optional[Dict[str, str]] = None, ) -> Tuple[str, Dict]: """Download a remote tarball from url, uncompresses and computes swh hashes on it. Args: url: Artifact uri to fetch, uncompress and hash dest: Directory to write the archive to hashes: Dict of expected hashes (key is the hash algo) for the artifact to download (those hashes are expected to be hex string) auth: Optional tuple of login/password (for http authentication service, e.g. deposit) Raises: ValueError in case of any error when fetching/computing (length, checksums mismatched...) Returns: Tuple of local (filepath, hashes of filepath) """ params = copy.deepcopy(DEFAULT_PARAMS) if auth is not None: params["auth"] = auth if extra_request_headers is not None: params["headers"].update(extra_request_headers) # so the connection does not hang indefinitely (read/connection timeout) timeout = params.get("timeout", 60) if url.startswith("ftp://"): response = urlopen(url, timeout=timeout) chunks = (response.read(HASH_BLOCK_SIZE) for _ in itertools.count()) response_data = itertools.takewhile(bool, chunks) else: response = requests.get(url, **params, timeout=timeout, stream=True) - if response.status_code != 200: - raise ValueError( - "Fail to query '%s'. Reason: %s" % (url, response.status_code) - ) + response.raise_for_status() # update URL to response one as requests follow redirection by default # on GET requests url = response.url # try to extract filename from content-disposition header if available if filename is None and "content-disposition" in response.headers: filename = _content_disposition_filename( response.headers["content-disposition"] ) response_data = response.iter_content(chunk_size=HASH_BLOCK_SIZE) filename = filename if filename else os.path.basename(urlsplit(url).path) logger.debug("filename: %s", filename) filepath = os.path.join(dest, filename) logger.debug("filepath: %s", filepath) h = MultiHash(hash_names=DOWNLOAD_HASHES | set(hashes.keys())) with open(filepath, "wb") as f: for chunk in response_data: h.update(chunk) f.write(chunk) response.close() # Also check the expected hashes if provided if hashes: actual_hashes = h.hexdigest() for algo_hash in hashes.keys(): actual_digest = actual_hashes[algo_hash] expected_digest = hashes[algo_hash] if actual_digest != expected_digest: raise ValueError( "Failure when fetching %s. 
" "Checksum mismatched: %s != %s" % (url, expected_digest, actual_digest) ) computed_hashes = h.hexdigest() length = computed_hashes.pop("length") extrinsic_metadata = { "length": length, "filename": filename, "checksums": computed_hashes, "url": url, } logger.debug("extrinsic_metadata", extrinsic_metadata) return filepath, extrinsic_metadata def release_name(version: str, filename: Optional[str] = None) -> str: if filename: return "releases/%s/%s" % (version, filename) return "releases/%s" % version TReturn = TypeVar("TReturn") TSelf = TypeVar("TSelf") _UNDEFINED = object() def cached_method(f: Callable[[TSelf], TReturn]) -> Callable[[TSelf], TReturn]: cache_name = f"_cached_{f.__name__}" @functools.wraps(f) def newf(self): value = getattr(self, cache_name, _UNDEFINED) if value is _UNDEFINED: value = f(self) setattr(self, cache_name, value) return value return newf