diff --git a/PKG-INFO b/PKG-INFO index c872963..0de803a 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,52 +1,52 @@ Metadata-Version: 2.1 Name: swh.loader.core -Version: 3.5.0 +Version: 4.0.0 Summary: Software Heritage Base Loader Home-page: https://forge.softwareheritage.org/diffusion/DLDBASE Author: Software Heritage developers Author-email: swh-devel@inria.fr Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-loader-core Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-loader-core/ Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE License-File: AUTHORS Software Heritage - Loader foundations ====================================== The Software Heritage Loader Core provides low-level loading utilities and helpers used by :term:`loaders <loader>`. The main entry points are classes: - :class:`swh.loader.core.loader.BaseLoader` for loaders (e.g. svn) - :class:`swh.loader.core.loader.DVCSLoader` for DVCS loaders (e.g. hg, git, ...) - :class:`swh.loader.package.loader.PackageLoader` for Package loaders (e.g. PyPI, Npm, ...) Package loaders --------------- This package also implements many package loaders directly, out of convenience, as they usually are quite similar and each fits in a single file. They all roughly follow these steps, explained in the :py:meth:`swh.loader.package.loader.PackageLoader.load` documentation. See the :ref:`package-loader-tutorial` for details. VCS loaders ----------- Unlike package loaders, VCS loaders remain in separate packages, as they often need more advanced conversions and very VCS-specific operations. This usually involves getting the branches of a repository and recursively loading revisions in the history (and directory trees in these revisions), until a known revision is found. diff --git a/docs/package-loader-specifications.rst b/docs/package-loader-specifications.rst index 1d734b8..ce808d0 100644 --- a/docs/package-loader-specifications.rst +++ b/docs/package-loader-specifications.rst @@ -1,133 +1,169 @@ .. _package-loader-specifications: Package loader specifications ============================= Release fields -------------- Here is an overview of the fields (+ internal version name + branch name) used by each package loader, after D6616: ..
list-table:: Fields used by each package loader :header-rows: 1 * - Loader - internal version - branch name - name - message - synthetic - author - date - Notes + * - arch + - ``p_info.​version`` + - ``release_name(​version, filename)`` + - =version + - Synthetic release for Arch Linux source package {p_info.name} version {p_info.version} {description} + - true + - from intrinsic metadata + - from extra_loader_arguments['arch_metadata'] + - Intrinsic metadata extracted from .PKGINFO file of the package * - archive - passed as arg - ``release_name(​version)`` - =version - "Synthetic release for archive at {p_info.url}\n" - true - "" - passed as arg - + * - aur + - ``p_info.​version`` + - ``release_name(​version, filename)`` + - =version + - Synthetic release for Aur source package {p_info.name} version {p_info.version} {description} + - true + - "" + - from extra_loader_arguments['aur_metadata'] + - Intrinsic metadata extracted from .SRCINFO file of the package * - cran - ``metadata.get(​"Version", passed as arg)`` - ``release_name(​version)`` - =version - standard message - true - ``metadata.get(​"Maintainer", "")`` - ``metadata.get(​"Date")`` - metadata is intrinsic * - crates - ``p_info.​version`` - - ``release_name(​version, filename)`` + - ``release_name(​version, filename) + "\n\n" + i_metadata.description + "\n"`` - =version - Synthetic release for Crate source package {p_info.name} version {p_info.version} {description} - true - - from intrinsic metadata - - from extrinsic metadata + - from int metadata + - from ext metadata - ``i_metadata`` for intrinsic metadata, ``e_metadata`` for extrinsic metadata * - debian - =``version`` - ``release_name(​version)`` - =``i_version`` - standard message (using ``i_version``) - true - ``metadata​.changelog​.person`` - ``metadata​.changelog​.date`` - metadata is intrinsic. Old revisions have ``dsc`` as type ``i_version`` is the intrinsic version (eg. ``0.7.2-3``) while ``version`` contains the debian suite name (eg. ``stretch/contrib/0.7.2-3``) and is passed as arg + * - golang + - ``p_info.​version`` + - ``release_name(version)`` + - =version + - Synthetic release for Golang source package {p_info.name} version {p_info.version} + - true + - "" + - from ext metadata + - Golang offers basically no metadata outside of version and timestamp * - deposit - HEAD - only HEAD - HEAD - "{client}: Deposit {id} in collection {collection}\n" - true - original author - ```` from SWORD XML - revisions had parents * - maven-loader - passed as arg - HEAD - ``release_name(version)`` - "Synthetic release for archive at {p_info.url}\n" - true - "" - passed as arg - Only one artefact per url (jar/zip src) * - nixguix - URL - URL - URL - None - true - "" - None - it's the URL of the artifact referenced by the derivation * - npm - ``metadata​["version"]`` - ``release_name(​version)`` - =version - standard message - true - from int metadata or "" - from ext metadata or None - * - opam - as given by opam - "{opam_package}​.{version}" - =version - standard message - true - from metadata - None - "{self.opam_package}​.{version}" matches the version names used by opam's backend. 
metadata is extrinsic + * - pubdev + - ``p_info.version`` + - ``release_name(version)`` + - =version + - Synthetic release for pub.dev source package {name} version {version} {description} + - true + - from extrinsic metadata + - from extrinsic metadata + - name, version and description from intrinsic metadata * - pypi - ``metadata["version"]`` - ``release_name(version)`` or ``release_name(version, filename)`` - =version - ``metadata['comment_text']`` or standard message - true - from int metadata or "" - from ext metadata or None - metadata is intrinsic using this function:: def release_name(version: str, filename: Optional[str] = None) -> str: if filename: return "releases/%s/%s" % (version, filename) return "releases/%s" % version and "standard message" being:: msg = ( f"Synthetic release for {PACKAGE_MANAGER} source package {name} " f"version {version}\n" ) The ``target_type`` field is always ``dir``, and the target is the id of a directory loaded by unpacking a tarball/zip file/... diff --git a/docs/package-loader-tutorial.rst b/docs/package-loader-tutorial.rst index 304936c..6543bd2 100644 --- a/docs/package-loader-tutorial.rst +++ b/docs/package-loader-tutorial.rst @@ -1,699 +1,712 @@ .. _package-loader-tutorial: Package Loader Tutorial ======================= In this tutorial, we will see how to write a loader for |swh| that loads packages from a package manager, such as PyPI or Debian's. First, you should be familiar with Python, unit-testing, |swh|'s :ref:`data-model` and :ref:`architecture`, and go through the :ref:`developer-setup`. Creating the files hierarchy ---------------------------- Once this is done, you should create a new directory (ie. a (sub)package from Python's point of view) for your loader. It can be either a subdirectory of ``swh-loader-core/swh/loader/package/`` like the other package loaders, or it can be in its own package. If you choose the latter, you should also create the base files of any Python package (such as ``setup.py``); you can import them from the `swh-py-template`_ repository. In the rest of this tutorial, we will assume you chose the former and your loader is named "New Loader", so your package loader is in ``swh-loader-core/swh/loader/package/newloader/``. Next, you should create the boilerplate files needed for SWH loaders: ``__init__.py``, ``tasks.py``, ``tests/__init__.py``, and ``tests/test_tasks.py``; copy them from an existing package, such as ``swh-loader-core/swh/loader/package/pypi/``, and replace the names in those with your loader's. Finally, create an `entrypoint`_ in :file:`setup.py`, so your loader can be discovered by the SWH Celery workers:: entry_points=""" [swh.workers] loader.newloader=swh.loader.package.newloader:register """, .. _swh-py-template: https://forge.softwareheritage.org/source/swh-py-template/ .. _entrypoint: https://setuptools.readthedocs.io/en/latest/userguide/entry_point.html Writing a minimal loader ------------------------ It is now time for the interesting part: writing the code to load packages from a package manager into the |swh| archive.
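Before diving into :file:`loader.py`, it may help to know what the ``register`` function referenced by the entrypoint above typically contains. Here is a minimal sketch; the ``__init__.py`` you copied from the PyPI loader is authoritative, and ``NewLoader`` is the class we are about to write::

    from typing import Any, Dict


    def register() -> Dict[str, Any]:
        # Import lazily, so resolving the entrypoint does not pull in the
        # loader's dependencies until a worker actually needs them.
        from swh.loader.package.newloader.loader import NewLoader

        return {
            "task_modules": [f"{__name__}.tasks"],
            "loader": NewLoader,
        }

This is what the SWH Celery workers call to find the task module and loader class behind the ``loader.newloader`` entrypoint.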
Create a file named :file:`loader.py` in your package's directory, with two empty classes (replace the names with what you think is relevant):: from typing import Optional import attr from swh.loader.package.loader import BasePackageInfo, PackageLoader from swh.model.model import ObjectType, Person, Release, Sha1Git, TimestampWithTimezone @attr.s class NewPackageInfo(BasePackageInfo): pass class NewLoader(PackageLoader[NewPackageInfo]): visit_type = "newloader" We now have to fill some of the methods declared by :class:`swh.loader.package.PackageLoader` in your new ``NewLoader`` class. Listing versions ++++++++++++++++ ``get_versions`` should return the list of names of all versions of the origin defined at ``self.url`` by the default constructor; and ``get_default_version`` should return the name of the default version (usually the latest stable release). They are both implemented with an API call to the package repository. For example, for PyPI origin https://pypi.org/project/requests, this is done with a request to https://pypi.org/pypi/requests/json. Getting package information +++++++++++++++++++++++++++ Next, ``get_package_info`` takes as argument a version name (as returned by ``get_versions``) and yields ``(branch_name, p_info)`` tuples, where ``branch_name`` is a string and ``p_info`` is an instance of the ``NewPackageInfo`` class we defined earlier. Each of these tuples should match a single file the loader will download from the origin. Usually, there is only one file per version, but this is not true for all package repositories (eg. CRAN and PyPI allow multiple artifacts per version). As ``NewPackageInfo`` derives from :py:class:`swh.loader.package.BasePackageInfo`, it can be created like this:: return NewPackageInfo(url="https://...", filename="...-versionX.Y.tar.gz") The ``url`` must be a URL the archive can be downloaded from. ``filename`` is optional, but it is nice to fill it when possible/relevant. The base ``PackageLoader`` will then take care of calling ``get_versions()`` to get all the versions, then call ``get_package_info()`` to get the list of archives to download, download them, and load all the directories in the archive. This means you do not need to manage downloads yourself, and we are now done with interactions with the package repository. Building a release +++++++++++++++++++ The final step for your minimal loader to work is to implement ``build_release``. This is a very important part, as it will create a release object that will be inserted in |swh|, as a link between origins and the directories. This function takes three important arguments: * ``p_info`` is an object returned by ``get_package_info()`` * ``uncompressed_path`` is the location on the disk where the base ``PackageLoader`` extracted the archive, so you can access files from the archive.
* ``directory`` is an :term:`intrinsic identifier` of the directory that was loaded from the archive The way to implement it depends very much on how the package manager works, but here is a rough idea:: def build_release( self, p_info: NewPackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Release]: author = Person( fullname=b"Jane Doe <jdoe@example.org>", name=b"Jane Doe", email=b"jdoe@example.org" ) date = TimestampWithTimezone.from_iso8601("2021-04-01T11:55:20Z") return Release( name=b"v2.0.0", message=b"This is a new release of the project", author=author, date=date, target=directory, target_type=ObjectType.DIRECTORY, synthetic=True, ) The strings here are placeholders, and you should extract them from either the extracted archive (using ``uncompressed_path``), or from the package repository's API; see the :ref:`existing specifications <package-loader-specifications>` for examples of values to use. The various classes used in this example are :py:class:`swh.model.model.Person`, :py:class:`swh.model.model.TimestampWithTimezone`, and :py:class:`swh.model.model.Release`. Note that you have access to the ``NewPackageInfo`` object created by ``get_package_info()``, so you can extend the ``NewPackageInfo`` class to pass data between these two functions. A few caveats: * Make sure the timezone matches the source's * ``Person`` can also be built with just a ``fullname``, if there aren't distinct fields for name and email. When in doubt, it's better to just write the ``fullname`` than try to parse it * ``author`` and ``committer`` (resp. ``date`` and ``committer_date``) may be different if the release was written and published by different people (resp. dates). This is only relevant when loading from a VCS, so you can usually ignore it in your package loader. Running your loader +++++++++++++++++++ .. _docker-run-loader-cli: With Docker ^^^^^^^^^^^ We recommend you use our `Docker environment`_ to test your loader. In short, install Docker, ``cd`` to ``swh-environment/docker/``, then `edit docker-compose.override.yml`_ to insert your new loader in the Docker environment; something like this will do:: version: '2' services: swh-loader-core: volumes: - "$HOME/swh-environment/swh-loader-core:/src/swh-loader-core" Then start the Docker environment:: docker-compose start Then, you can run your loader:: docker-compose exec swh-loader swh loader run newloader "https://example.org/~jdoe/project/" where ``newloader`` is the name you registered as an entrypoint in ``setup.py`` and ``https://example.org/~jdoe/project/`` is the origin URL, which will be set as the ``self.url`` attribute of your loader.
For example, to run the PyPI loader, the command would be:: docker-compose exec swh-loader swh loader run pypi "https://pypi.org/project/requests/" If you get this error, make sure you properly configured ``docker-compose.override.yml``:: Error: Invalid value for '[...]': invalid choice: newloader Without Docker ^^^^^^^^^^^^^^ If you do not want to use the Docker environment, you will need to start an :ref:`swh-storage` instance yourself, and create a config file that references it:: storage: cls: remote url: http://localhost:5002/ Or alternatively, this more efficient configuration:: storage: cls: pipeline steps: - cls: buffer min_batch_size: content: 10000 content_bytes: 104857600 directory: 1000 release: 1000 - cls: filter - cls: remote url: http://localhost:5002/ And run your loader with:: swh loader -C loader.yml run newloader "https://example.org/~jdoe/project/" where ``newloader`` is the name you registered as an entrypoint in ``setup.py`` and ``https://example.org/~jdoe/project/`` is the origin URL, which will be set as the ``self.url`` attribute of your loader. For example, with PyPI:: swh loader -C loader.yml run pypi "https://pypi.org/project/requests/" .. _Docker environment: https://forge.softwareheritage.org/source/swh-environment/browse/master/docker/ .. _edit docker-compose.override.yml: https://forge.softwareheritage.org/source/swh-environment/browse/master/docker/#install-a-swh-package-from Testing your loader +++++++++++++++++++ You must write tests for your loader. First, of course, unit tests for the internal functions of your loader, if any (eg. the functions used to extract metadata); but this is not covered in this tutorial. Most importantly, you should write integration tests for your loader, which will simulate an origin, run the loader, and check everything is loaded in the storage as it should be. As we do not want tests to directly query an origin (it makes tests flaky, hard to reproduce, and puts unnecessary load on the origin), we usually mock it using the :py:func:`swh.core.pytest_plugin.requests_mock_datadir` fixture. It works by creating a ``data/`` folder in your tests (such as ``swh/loader/package/newloader/tests/data/``) and downloading results from API calls there, in the structure documented in :py:func:`swh.core.pytest_plugin.requests_mock_datadir_factory`. The files in this ``data/`` folder will then be served whenever the loader tries to access a URL. This is very dependent on the kind of repositories your loader will read from, so here is an example with the PyPI loader. The files ``swh/loader/package/pypi/tests/data/https_pypi.org/pypi_nexter_json`` and ``swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/nexter-*`` are used in this test:: from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats def test_pypi_visit_1_release_with_2_artifacts(swh_storage, requests_mock_datadir): # Initialize the loader url = "https://pypi.org/project/nexter" loader = PyPILoader(swh_storage, url) # Run the loader, with a swh-storage instance, on the given URL.
# HTTP calls will be mocked by the requests_mock_datadir fixture actual_load_status = loader.load() # Check the loader loaded exactly the snapshot we expected # (when writing your tests for the first time, you cannot know the # snapshot id without running your loader; so let it error and write # down the result here) expected_snapshot_id = hash_to_bytes("1394b2e59351a944cc763bd9d26d90ce8e8121a8") assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id.hex(), } # Check the content of the snapshot. (ditto) expected_snapshot = Snapshot( id=expected_snapshot_id, branches={ b"releases/1.1.0/nexter-1.1.0.zip": SnapshotBranch( target=hash_to_bytes("f7d43faeb65b64d3faa67e4f46559db57d26b9a4"), target_type=TargetType.RELEASE, ), b"releases/1.1.0/nexter-1.1.0.tar.gz": SnapshotBranch( target=hash_to_bytes("732bb9dc087e6015884daaebb8b82559be729b5a"), target_type=TargetType.RELEASE, ), }, ) check_snapshot(expected_snapshot, swh_storage) # Check the visit was properly created with the right type assert_last_visit_matches( swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot.id ) # Then you could check the directory structure: directory_id = swh_storage.release_get( [hash_to_bytes("f7d43faeb65b64d3faa67e4f46559db57d26b9a4")] )[0].target entries = list(swh_storage.directory_ls(directory_id, recursive=True)) assert entries == [ ... ] Here are some scenarios you should test, when relevant: * No versions * One version * Two or more versions * More than one package per version, if relevant * Corrupt packages (missing metadata, ...), if relevant * API errors * etc. Making your loader incremental ------------------------------ .. important:: In the previous sections, you wrote a fully functional loader for a new type of package repository. This is great! Please tell us about it, and :ref:`submit it for review ` so we can give you some feedback early. Now, we will see a key optimization for any package loader: skipping packages it already downloaded, using :term:`extids <extid>`. The rough idea is to find some way to uniquely identify packages before downloading them and encode it in a short string, the ExtID. Using checksums +++++++++++++++ Ideally, this short string is a checksum of the archive, provided by the API before downloading the archive itself. This is ideal, because this ensures that we detect changes in the package's content even if it keeps the same name and version number. +However, this is only usable when all fields used to generate release objects +(message, authors, ...) are extracted from the archive. + +.. important:: + + If release objects are generated from extrinsic fields (ie. not extracted from + the archive, such as authorship information added by the package repository), + two different package versions with the same tarball would end up with the + same release, causing the loader to create incorrect snapshots. If this is not the case for the repository you want to load from, skip to the next subsection. This is used for example by the PyPI loader (with a sha256sum) and the NPM loader (with a sha1sum). The Debian loader uses a similar scheme: as a single package is assembled from a set of tarballs, it only uses the hash of the ``.dsc`` file, which itself contains a hash of all the tarballs.
This is implemented by overriding the ``extid`` method of your ``NewPackageInfo`` class, which returns the type of the ExtID (see below) and the ExtID itself:: from swh.loader.package.loader import PartialExtID from swh.model.hashutil import hash_to_bytes EXTID_TYPE: str = "pypi-archive-sha256" @attr.s class NewPackageInfo(BasePackageInfo): sha256 = attr.ib(type=str) def extid(self) -> PartialExtID: return (EXTID_TYPE, hash_to_bytes(self.sha256)) and the loader's ``get_package_info`` method sets the right value in the ``sha256`` attribute. Using a custom manifest +++++++++++++++++++++++ Unfortunately, this does not work for all packages, as some package repositories do not provide a checksum of the archives via their API. If this is the case for the repository you want to load from, you need to find a way around it. It highly depends on the repository, so this tutorial cannot cover how to do it. We do however provide an easy option that should work in most cases: creating a "manifest" of the archive with some metadata in it, and hashing it. For example, when loading from the GNU FTP servers, we have access to some metadata, that is somewhat good enough to deduplicate. We write them all in a string and hash that string. It is done like this:: import string @attr.s class ArchivePackageInfo(BasePackageInfo): length = attr.ib(type=int) """Size of the archive file""" time = attr.ib(type=Union[str, datetime.datetime]) """Timestamp of the archive file on the server""" version = attr.ib(type=str) EXTID_FORMAT = "package-manifest-sha256" MANIFEST_FORMAT = string.Template("$time $length $version $url") The default implementation of :py:func:`swh.loader.package.loader.BasePackageInfo.extid` will read this template, substitute the variables based on the object's attributes, compute the hash of the result, and return it. Note that, as mentioned before, this is not perfect because a tarball may be replaced with a different tarball of exactly the same length and modification time, and we won't detect it. But this is extremely unlikely, so we consider it to be good enough. +.. important:: + + The manifest must cover all fields used to generate Release objects. + Alternatively, if this is not good enough for your loader, you can simply not implement ExtIDs, and your loader will always load all tarballs. This can be bandwidth-heavy for both |swh| and the origin you are loading from, so this decision should not be taken lightly. Choosing the ExtID type +++++++++++++++++++++++ The type of your ExtID should be a short ASCII string, that is both unique to your loader and descriptive of how it was computed. Why unique to the loader? Because different loaders may load the same archive differently. For example, if I were to create an archive with both a ``PKG-INFO`` and a ``package.json`` file, and submit it to both NPM and PyPI, both package repositories would have exactly the same tarball. But the NPM loader would create the release based on authorship info in ``package.json``, and the PyPI loader based on ``PKG-INFO``. But we do not want the PyPI loader to assume it already created a release itself, while the release was created by the NPM loader! And why descriptive? This is simply for future-proofing; in case your loader changes the format of the ExtID (eg. by using a different hash algorithm). Testing your incremental loading ++++++++++++++++++++++++++++++++ If you followed the steps above, your loader is now able to detect what packages it already downloaded and skip them. This is what we call an incremental loader.
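Concretely, the skipping you are about to test relies on the ExtID computation described above; for manifest-based loaders it boils down to something like the following sketch (a simplification of the default :py:func:`swh.loader.package.loader.BasePackageInfo.extid`, reusing the fields from the ``ArchivePackageInfo`` example; the real implementation is authoritative)::

    import hashlib
    import string

    EXTID_FORMAT = "package-manifest-sha256"
    MANIFEST_FORMAT = string.Template("$time $length $version $url")


    def manifest_extid(time, length, version, url):
        # Substitute the artifact's metadata into the manifest template...
        manifest = MANIFEST_FORMAT.substitute(
            time=str(time), length=str(length), version=version, url=url
        )
        # ...and hash the result: an unchanged manifest yields the same
        # ExtID, so the corresponding archive is not downloaded again.
        return (EXTID_FORMAT, hashlib.sha256(manifest.encode()).digest())

If two runs compute the same ExtID for an artifact, the second run reuses the objects created by the first instead of downloading and rebuilding them.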
It is now time to write tests to make sure your loader fulfills this promise. This time, we want to use ``requests_mock_datadir_visits`` instead of ``requests_mock_datadir``, because we want to mock the repository's API to emulate its results changing over time (eg. because a new version was published between two runs of the loader). See the documentation of :py:func:`swh.core.pytest_plugin.requests_mock_datadir_factory` for a description of the file layout to use. Let's take, once again, a look at ``swh/loader/package/pypi/tests/test_pypi.py``, to use as an example:: def test_pypi_incremental_visit(swh_storage, requests_mock_datadir_visits): """With prior visit, 2nd load will result in a different snapshot """ # Initialize the loader url = "https://pypi.org/project/0805nexter" loader = PyPILoader(swh_storage, url) # First visit visit1_actual_load_status = loader.load() visit1_stats = get_stats(swh_storage) # Make sure everything is in order expected_snapshot_id = hash_to_bytes("ba6e158ada75d0b3cfb209ffdf6daa4ed34a227a") assert visit1_actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id.hex(), } assert_last_visit_matches( swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot_id ) assert { "content": 6, "directory": 4, "origin": 1, "origin_visit": 1, "release": 2, "revision": 0, "skipped_content": 0, "snapshot": 1, } == visit1_stats # Reset internal state del loader._cached__raw_info del loader._cached_info # Second visit visit2_actual_load_status = loader.load() visit2_stats = get_stats(swh_storage) # Check the result of the visit assert visit2_actual_load_status["status"] == "eventful", visit2_actual_load_status expected_snapshot_id2 = hash_to_bytes("2e5149a7b0725d18231a37b342e9b7c4e121f283") assert visit2_actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id2.hex(), } assert_last_visit_matches( swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot_id2 ) assert { "content": 6 + 1, # 1 more content "directory": 4 + 2, # 2 more directories "origin": 1, "origin_visit": 1 + 1, "release": 2 + 1, # 1 more release "revision": 0, "skipped_content": 0, "snapshot": 1 + 1, # 1 more snapshot } == visit2_stats # Check all content objects were loaded expected_contents = map( hash_to_bytes, [ "a61e24cdfdab3bb7817f6be85d37a3e666b34566", "938c33483285fd8ad57f15497f538320df82aeb8", "a27576d60e08c94a05006d2e6d540c0fdb5f38c8", "405859113963cb7a797642b45f171d6360425d16", "e5686aa568fdb1d19d7f1329267082fe40482d31", "83ecf6ec1114fd260ca7a833a2d165e71258c338", "92689fa2b7fb4d4fc6fb195bf73a50c87c030639", ], ) assert list(swh_storage.content_missing_per_sha1(expected_contents)) == [] # Check all directory objects were loaded expected_dirs = map( hash_to_bytes, [ "05219ba38bc542d4345d5638af1ed56c7d43ca7d", "cf019eb456cf6f78d8c4674596f1c9a97ece8f44", "b178b66bd22383d5f16f4f5c923d39ca798861b4", "c3a58f8b57433a4b56caaa5033ae2e0931405338", "e226e7e4ad03b4fc1403d69a18ebdd6f2edd2b3a", "52604d46843b898f5a43208045d09fcf8731631b", ], ) assert list(swh_storage.directory_missing(expected_dirs)) == [] # etc. Loading metadata ---------------- Finally, an optional step: collecting and loading :term:`extrinsic metadata`. This is metadata that your loader may collect while loading an origin. For example, the PyPI loader collects some parts of the API response (eg. https://pypi.org/pypi/requests/json). They are stored as raw bytestrings, along with a format (an ASCII string) and a date of discovery (usually the time your loader ran).
This is done by adding them to the ``directory_extrinsic_metadata`` attribute of your ``NewPackageInfo`` object when creating it in ``get_package_info`` as :class:`swh.loader.package.loader.RawExtrinsicMetadataCore` objects:: NewPackageInfo( ..., directory_extrinsic_metadata=[ RawExtrinsicMetadataCore( format="new-format", metadata=b"foo bar baz", discovery_date=datetime.datetime(...), ) ] ) ``format`` should be a human-readable ASCII string that unambiguously describes the format. Readers of the metadata object will have a built-in list of formats they understand, and will check if your metadata object is among them. You should use one of the :ref:`known metadata formats ` if possible, or add yours to this list. ``metadata`` is the metadata object itself. When possible, it should be copied verbatim from the source object you got, and should not be created by the loader. If this is not possible, for example because it is extracted from a larger JSON or XML document, make sure you do as few modifications as possible to reduce the risks of corruption. ``discovery_date`` is optional, and defaults to the time your loader started working. In theory, you can write extrinsic metadata on any kind of object, eg. by implementing :py:meth:`swh.loader.package.loader.PackageLoader.get_extrinsic_origin_metadata`, :py:meth:`swh.loader.package.loader.PackageLoader.get_extrinsic_snapshot_metadata`; but this is rarely relevant in practice. Be sure to check whether your loader can find any potentially interesting metadata, though! You also need to implement a new method on your loader class, to return information on where the metadata is coming from: a metadata authority. This authority is identified by a URI, such as ``https://github.com/`` for GitHub, ``https://pypi.org/`` for PyPI, etc. For example:: from swh.model.model import MetadataAuthority, MetadataAuthorityType def get_metadata_authority(self): return MetadataAuthority( type=MetadataAuthorityType.FORGE, url="https://pypi.org/", ) If your loader supports loading from different instances (like GitLab), you can define the authority dynamically based on the URL of the origin:: def get_metadata_authority(self): p_url = urlparse(self.url) return MetadataAuthority( type=MetadataAuthorityType.FORGE, url=f"{p_url.scheme}://{p_url.netloc}/", ) Checklist --------- Before the final addition of a new loader, here is a list of things to check for. Most of them are a reminder of other sections above. * There is (or will be) a lister to trigger it * Tested with pytest, from scratch and incrementally (if relevant) * Tested in Docker, from scratch and incrementally (if relevant) * Release fields are consistent with the :ref:`existing specifications <package-loader-specifications>`, and you updated the specifications to add your loader. They must be explicitly tested. * Relevant metadata are loaded with as little processing as possible (ie. keep the original format unchanged, instead of converting it to a JSON/msgpack/... format) and :ref:`their format is documented `. They must be tested as well. * There is no risk of extid clashes, even across instances (if relevant), even in presence of malicious actors (as far as reasonably possible) Final words ----------- Congratulations, you made it to the end. If you have not already, please `contact us`_ to tell us about your new loader, and :ref:`submit your loader for review ` on our forge so we can merge it and run it along our other loaders to archive more repositories.
And if you have any change in mind to improve this tutorial for future readers, please submit them too. Thank you for your contributions! .. _contact us: https://www.softwareheritage.org/community/developers/ diff --git a/requirements-swh.txt b/requirements-swh.txt index c89f10b..30b3fcc 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,5 +1,5 @@ -swh.core >= 0.3 +swh.core >= 2.12 swh.model >= 4.4.0 swh.objstorage >= 0.2.2 swh.scheduler >= 0.4.0 swh.storage >= 0.29.0 diff --git a/setup.py b/setup.py index a4f4d95..421f131 100755 --- a/setup.py +++ b/setup.py @@ -1,82 +1,86 @@ #!/usr/bin/env python3 # Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from io import open from os import path from setuptools import find_packages, setup here = path.abspath(path.dirname(__file__)) # Get the long description from the README file with open(path.join(here, "README.rst"), encoding="utf-8") as f: long_description = f.read() def parse_requirements(name=None): if name: reqf = "requirements-%s.txt" % name else: reqf = "requirements.txt" requirements = [] if not path.exists(reqf): return requirements with open(reqf) as f: for line in f.readlines(): line = line.strip() if not line or line.startswith("#"): continue requirements.append(line) return requirements setup( name="swh.loader.core", description="Software Heritage Base Loader", long_description=long_description, long_description_content_type="text/markdown", python_requires=">=3.7", author="Software Heritage developers", author_email="swh-devel@inria.fr", url="https://forge.softwareheritage.org/diffusion/DLDBASE", packages=find_packages(), # packages's modules scripts=[], # scripts to package install_requires=parse_requirements() + parse_requirements("swh"), setup_requires=["setuptools-scm"], use_scm_version=True, extras_require={"testing": parse_requirements("test")}, include_package_data=True, entry_points=""" [swh.cli.subcommands] loader=swh.loader.cli [swh.workers] + loader.arch=swh.loader.package.arch:register loader.archive=swh.loader.package.archive:register + loader.aur=swh.loader.package.aur:register loader.cran=swh.loader.package.cran:register loader.crates=swh.loader.package.crates:register loader.debian=swh.loader.package.debian:register loader.deposit=swh.loader.package.deposit:register + loader.golang=swh.loader.package.golang:register loader.nixguix=swh.loader.package.nixguix:register loader.npm=swh.loader.package.npm:register loader.opam=swh.loader.package.opam:register + loader.pubdev=swh.loader.package.pubdev:register loader.pypi=swh.loader.package.pypi:register loader.maven=swh.loader.package.maven:register """, classifiers=[ "Programming Language :: Python :: 3", "Intended Audience :: Developers", "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", "Operating System :: OS Independent", "Development Status :: 5 - Production/Stable", ], project_urls={ "Bug Reports": "https://forge.softwareheritage.org/maniphest", "Funding": "https://www.softwareheritage.org/donate", "Source": "https://forge.softwareheritage.org/source/swh-loader-core", "Documentation": "https://docs.softwareheritage.org/devel/swh-loader-core/", }, ) diff --git a/swh.loader.core.egg-info/PKG-INFO b/swh.loader.core.egg-info/PKG-INFO index c872963..0de803a 100644 --- a/swh.loader.core.egg-info/PKG-INFO +++ b/swh.loader.core.egg-info/PKG-INFO 
@@ -1,52 +1,52 @@ Metadata-Version: 2.1 Name: swh.loader.core -Version: 3.5.0 +Version: 4.0.0 Summary: Software Heritage Base Loader Home-page: https://forge.softwareheritage.org/diffusion/DLDBASE Author: Software Heritage developers Author-email: swh-devel@inria.fr Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-loader-core Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-loader-core/ Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE License-File: AUTHORS Software Heritage - Loader foundations ====================================== The Software Heritage Loader Core provides low-level loading utilities and helpers used by :term:`loaders <loader>`. The main entry points are classes: - :class:`swh.loader.core.loader.BaseLoader` for loaders (e.g. svn) - :class:`swh.loader.core.loader.DVCSLoader` for DVCS loaders (e.g. hg, git, ...) - :class:`swh.loader.package.loader.PackageLoader` for Package loaders (e.g. PyPI, Npm, ...) Package loaders --------------- This package also implements many package loaders directly, out of convenience, as they usually are quite similar and each fits in a single file. They all roughly follow these steps, explained in the :py:meth:`swh.loader.package.loader.PackageLoader.load` documentation. See the :ref:`package-loader-tutorial` for details. VCS loaders ----------- Unlike package loaders, VCS loaders remain in separate packages, as they often need more advanced conversions and very VCS-specific operations.
This usually involves getting the branches of a repository and recursively loading revisions in the history (and directory trees in these revisions), until a known revision is found diff --git a/swh.loader.core.egg-info/SOURCES.txt b/swh.loader.core.egg-info/SOURCES.txt index c2042ba..0bbd56c 100644 --- a/swh.loader.core.egg-info/SOURCES.txt +++ b/swh.loader.core.egg-info/SOURCES.txt @@ -1,238 +1,287 @@ .git-blame-ignore-revs .gitignore .pre-commit-config.yaml AUTHORS CODE_OF_CONDUCT.md CONTRIBUTORS LICENSE MANIFEST.in Makefile README.rst conftest.py mypy.ini pyproject.toml pytest.ini requirements-swh.txt requirements-test.txt requirements.txt setup.cfg setup.py tox.ini docs/.gitignore docs/Makefile docs/README.rst docs/cli.rst docs/conf.py docs/index.rst docs/package-loader-specifications.rst docs/package-loader-tutorial.rst docs/vcs-loader-overview.rst docs/_static/.placeholder docs/_templates/.placeholder swh/__init__.py swh.loader.core.egg-info/PKG-INFO swh.loader.core.egg-info/SOURCES.txt swh.loader.core.egg-info/dependency_links.txt swh.loader.core.egg-info/entry_points.txt swh.loader.core.egg-info/requires.txt swh.loader.core.egg-info/top_level.txt swh/loader/__init__.py swh/loader/cli.py swh/loader/exception.py swh/loader/pytest_plugin.py swh/loader/core/__init__.py swh/loader/core/converters.py swh/loader/core/loader.py swh/loader/core/metadata_fetchers.py swh/loader/core/py.typed swh/loader/core/utils.py swh/loader/core/tests/__init__.py swh/loader/core/tests/test_converters.py swh/loader/core/tests/test_loader.py swh/loader/core/tests/test_utils.py swh/loader/package/__init__.py swh/loader/package/loader.py swh/loader/package/py.typed swh/loader/package/utils.py +swh/loader/package/arch/__init__.py +swh/loader/package/arch/loader.py +swh/loader/package/arch/tasks.py +swh/loader/package/arch/tests/__init__.py +swh/loader/package/arch/tests/test_arch.py +swh/loader/package/arch/tests/test_tasks.py +swh/loader/package/arch/tests/data/fake_arch.sh +swh/loader/package/arch/tests/data/https_archive.archlinux.org/packages_d_dialog_dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz +swh/loader/package/arch/tests/data/https_archive.archlinux.org/packages_d_dialog_dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst +swh/loader/package/arch/tests/data/https_uk.mirror.archlinuxarm.org/aarch64_core_gzip-1.12-1-aarch64.pkg.tar.xz swh/loader/package/archive/__init__.py swh/loader/package/archive/loader.py swh/loader/package/archive/tasks.py swh/loader/package/archive/tests/__init__.py swh/loader/package/archive/tests/test_archive.py swh/loader/package/archive/tests/test_tasks.py swh/loader/package/archive/tests/data/not_gzipped_tarball.tar.gz swh/loader/package/archive/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz swh/loader/package/archive/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit1 swh/loader/package/archive/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit2 swh/loader/package/archive/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.2.0.tar.gz +swh/loader/package/aur/__init__.py +swh/loader/package/aur/loader.py +swh/loader/package/aur/tasks.py +swh/loader/package/aur/tests/__init__.py +swh/loader/package/aur/tests/test_aur.py +swh/loader/package/aur/tests/test_tasks.py +swh/loader/package/aur/tests/data/fake_aur.sh +swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_a-fake-one.tar.gz +swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_hg-evolve.tar.gz 
+swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_ibus-git.tar.gz +swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_libervia-web-hg.tar.gz +swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_tealdeer-git.tar.gz swh/loader/package/cran/__init__.py swh/loader/package/cran/loader.py swh/loader/package/cran/tasks.py swh/loader/package/cran/tests/__init__.py swh/loader/package/cran/tests/test_cran.py swh/loader/package/cran/tests/test_tasks.py swh/loader/package/cran/tests/data/description/KnownBR swh/loader/package/cran/tests/data/description/acepack swh/loader/package/cran/tests/data/https_cran.r-project.org/src_contrib_1.4.0_Recommended_KernSmooth_2.22-6.tar.gz swh/loader/package/crates/__init__.py swh/loader/package/crates/loader.py swh/loader/package/crates/tasks.py swh/loader/package/crates/tests/__init__.py swh/loader/package/crates/tests/test_crates.py swh/loader/package/crates/tests/test_tasks.py swh/loader/package/crates/tests/data/fake_crates.sh swh/loader/package/crates/tests/data/https_crates.io/api_v1_crates_hg-core swh/loader/package/crates/tests/data/https_crates.io/api_v1_crates_micro-timer swh/loader/package/crates/tests/data/https_static.crates.io/crates_hg-core_hg-core-0.0.1.crate swh/loader/package/crates/tests/data/https_static.crates.io/crates_micro-timer_micro-timer-0.1.0.crate swh/loader/package/crates/tests/data/https_static.crates.io/crates_micro-timer_micro-timer-0.1.1.crate swh/loader/package/crates/tests/data/https_static.crates.io/crates_micro-timer_micro-timer-0.1.2.crate swh/loader/package/crates/tests/data/https_static.crates.io/crates_micro-timer_micro-timer-0.2.0.crate swh/loader/package/crates/tests/data/https_static.crates.io/crates_micro-timer_micro-timer-0.2.1.crate swh/loader/package/crates/tests/data/https_static.crates.io/crates_micro-timer_micro-timer-0.3.0.crate swh/loader/package/crates/tests/data/https_static.crates.io/crates_micro-timer_micro-timer-0.3.1.crate swh/loader/package/crates/tests/data/https_static.crates.io/crates_micro-timer_micro-timer-0.4.0.crate swh/loader/package/debian/__init__.py swh/loader/package/debian/loader.py swh/loader/package/debian/tasks.py swh/loader/package/debian/tests/__init__.py swh/loader/package/debian/tests/test_debian.py swh/loader/package/debian/tests/test_tasks.py swh/loader/package/debian/tests/data/http_deb.debian.org/debian_pool_contrib_c_cicero_cicero_0.7.2-3.diff.gz swh/loader/package/debian/tests/data/http_deb.debian.org/debian_pool_contrib_c_cicero_cicero_0.7.2-3.dsc swh/loader/package/debian/tests/data/http_deb.debian.org/debian_pool_contrib_c_cicero_cicero_0.7.2-4.diff.gz swh/loader/package/debian/tests/data/http_deb.debian.org/debian_pool_contrib_c_cicero_cicero_0.7.2-4.dsc swh/loader/package/debian/tests/data/http_deb.debian.org/debian_pool_contrib_c_cicero_cicero_0.7.2.orig.tar.gz swh/loader/package/debian/tests/data/http_deb.debian.org/onefile.txt swh/loader/package/deposit/__init__.py swh/loader/package/deposit/loader.py swh/loader/package/deposit/tasks.py swh/loader/package/deposit/tests/__init__.py swh/loader/package/deposit/tests/conftest.py swh/loader/package/deposit/tests/test_deposit.py swh/loader/package/deposit/tests/test_tasks.py swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_666_meta swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_666_raw 
swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_777_meta swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_777_raw swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_888_meta swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_888_raw swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_999_meta swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/1_private_999_raw swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello-2.10.zip swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello-2.12.tar.gz swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.10.json swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.11.json swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.12.json swh/loader/package/deposit/tests/data/https_deposit.softwareheritage.org/hello_2.13.json +swh/loader/package/golang/__init__.py +swh/loader/package/golang/loader.py +swh/loader/package/golang/tasks.py +swh/loader/package/golang/tests/__init__.py +swh/loader/package/golang/tests/test_golang.py +swh/loader/package/golang/tests/test_tasks.py +swh/loader/package/golang/tests/data/https_proxy.golang.org/example.com_basic-go-module_@latest +swh/loader/package/golang/tests/data/https_proxy.golang.org/example.com_basic-go-module_@v_list +swh/loader/package/golang/tests/data/https_proxy.golang.org/example.com_basic-go-module_@v_v0.1.3.info +swh/loader/package/golang/tests/data/https_proxy.golang.org/example.com_basic-go-module_@v_v0.1.3.zip swh/loader/package/maven/__init__.py swh/loader/package/maven/loader.py swh/loader/package/maven/tasks.py swh/loader/package/maven/tests/__init__.py swh/loader/package/maven/tests/test_maven.py swh/loader/package/maven/tests/test_tasks.py swh/loader/package/maven/tests/data/https_maven.org/sprova4j-0.1.0-sources.jar swh/loader/package/maven/tests/data/https_maven.org/sprova4j-0.1.0.pom swh/loader/package/maven/tests/data/https_maven.org/sprova4j-0.1.1-sources.jar swh/loader/package/maven/tests/data/https_maven.org/sprova4j-0.1.1.pom swh/loader/package/nixguix/__init__.py swh/loader/package/nixguix/loader.py swh/loader/package/nixguix/tasks.py swh/loader/package/nixguix/tests/__init__.py swh/loader/package/nixguix/tests/conftest.py swh/loader/package/nixguix/tests/test_nixguix.py swh/loader/package/nixguix/tests/test_tasks.py swh/loader/package/nixguix/tests/data/https_example.com/file.txt swh/loader/package/nixguix/tests/data/https_fail.com/truncated-archive.tgz swh/loader/package/nixguix/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz swh/loader/package/nixguix/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit1 swh/loader/package/nixguix/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.1.0.tar.gz_visit2 swh/loader/package/nixguix/tests/data/https_ftp.gnu.org/gnu_8sync_8sync-0.2.0.tar.gz swh/loader/package/nixguix/tests/data/https_github.com/owner-1_repository-1_revision-1.tgz swh/loader/package/nixguix/tests/data/https_github.com/owner-2_repository-1_revision-1.tgz swh/loader/package/nixguix/tests/data/https_github.com/owner-3_repository-1_revision-1.tgz swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources-EOFError.json swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources.json 
swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources.json_visit1 swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources_special.json swh/loader/package/nixguix/tests/data/https_nix-community.github.io/nixpkgs-swh_sources_special.json_visit1 swh/loader/package/npm/__init__.py swh/loader/package/npm/loader.py swh/loader/package/npm/tasks.py swh/loader/package/npm/tests/__init__.py swh/loader/package/npm/tests/test_npm.py swh/loader/package/npm/tests/test_tasks.py swh/loader/package/npm/tests/data/https_registry.npmjs.org/@aller_shared_-_shared-0.1.0.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/@aller_shared_-_shared-0.1.1-alpha.14.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/jammit-express_-_jammit-express-0.0.1.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/nativescript-telerik-analytics_-_nativescript-telerik-analytics-1.0.0.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.0.2.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.0.3-beta.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.0.3.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.0.4.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.0.5.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.1.0.tgz swh/loader/package/npm/tests/data/https_registry.npmjs.org/org_-_org-0.2.0.tgz swh/loader/package/npm/tests/data/https_replicate.npmjs.com/@aller_shared swh/loader/package/npm/tests/data/https_replicate.npmjs.com/catify swh/loader/package/npm/tests/data/https_replicate.npmjs.com/jammit-express swh/loader/package/npm/tests/data/https_replicate.npmjs.com/jammit-no-time swh/loader/package/npm/tests/data/https_replicate.npmjs.com/nativescript-telerik-analytics swh/loader/package/npm/tests/data/https_replicate.npmjs.com/org swh/loader/package/npm/tests/data/https_replicate.npmjs.com/org_version_mismatch swh/loader/package/npm/tests/data/https_replicate.npmjs.com/org_visit1 swh/loader/package/opam/__init__.py swh/loader/package/opam/loader.py swh/loader/package/opam/tasks.py swh/loader/package/opam/tests/__init__.py swh/loader/package/opam/tests/test_opam.py swh/loader/package/opam/tests/test_tasks.py swh/loader/package/opam/tests/data/fake_opam_repo/_repo swh/loader/package/opam/tests/data/fake_opam_repo/version swh/loader/package/opam/tests/data/fake_opam_repo/repo/loadertest/lock swh/loader/package/opam/tests/data/fake_opam_repo/repo/loadertest/repos-config swh/loader/package/opam/tests/data/fake_opam_repo/repo/loadertest/packages/agrid/agrid.0.1/opam swh/loader/package/opam/tests/data/fake_opam_repo/repo/loadertest/packages/directories/directories.0.1/opam swh/loader/package/opam/tests/data/fake_opam_repo/repo/loadertest/packages/directories/directories.0.2/opam swh/loader/package/opam/tests/data/fake_opam_repo/repo/loadertest/packages/directories/directories.0.3/opam swh/loader/package/opam/tests/data/fake_opam_repo/repo/loadertest/packages/ocb/ocb.0.1/opam swh/loader/package/opam/tests/data/https_github.com/OCamlPro_agrid_archive_0.1.tar.gz swh/loader/package/opam/tests/data/https_github.com/OCamlPro_directories_archive_0.1.tar.gz swh/loader/package/opam/tests/data/https_github.com/OCamlPro_directories_archive_0.2.tar.gz swh/loader/package/opam/tests/data/https_github.com/OCamlPro_directories_archive_0.3.tar.gz 
swh/loader/package/opam/tests/data/https_github.com/OCamlPro_ocb_archive_0.1.tar.gz +swh/loader/package/pubdev/__init__.py +swh/loader/package/pubdev/loader.py +swh/loader/package/pubdev/tasks.py +swh/loader/package/pubdev/tests/__init__.py +swh/loader/package/pubdev/tests/test_pubdev.py +swh/loader/package/pubdev/tests/test_tasks.py +swh/loader/package/pubdev/tests/data/fake_pubdev.sh +swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_Autolinker_versions_0.1.1.tar.gz +swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_authentication_versions_0.0.1.tar.gz +swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_bezier_versions_1.1.5.tar.gz +swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_pdf_versions_1.0.0.tar.gz +swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_pdf_versions_3.8.2.tar.gz +swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_Autolinker +swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_abstract_io +swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_authentication +swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_bezier +swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_pdf swh/loader/package/pypi/__init__.py swh/loader/package/pypi/loader.py swh/loader/package/pypi/tasks.py swh/loader/package/pypi/tests/__init__.py swh/loader/package/pypi/tests/test_pypi.py swh/loader/package/pypi/tests/test_tasks.py swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/0805nexter-1.1.0.tar.gz swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/0805nexter-1.1.0.zip swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/0805nexter-1.2.0.zip swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/0805nexter-1.3.0.zip swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/0805nexter-1.4.0.zip swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/nexter-1.1.0.tar.gz swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/nexter-1.1.0.zip swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/packages_70_97_c49fb8ec24a7aaab54c3dbfbb5a6ca1431419d9ee0f6c363d9ad01d2b8b1_0805nexter-1.3.0.zip swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/packages_86_10_c9555ec63106153aaaad753a281ff47f4ac79e980ff7f5d740d6649cd56a_upymenu-0.0.1.tar.gz swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/packages_c4_a0_4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4_0805nexter-1.2.0.zip swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/packages_c4_a0_4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4_0805nexter-1.2.0.zip_visit1 swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/packages_ec_65_c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d_0805nexter-1.1.0.zip swh/loader/package/pypi/tests/data/https_files.pythonhosted.org/packages_ec_65_c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d_0805nexter-1.1.0.zip_visit1 swh/loader/package/pypi/tests/data/https_pypi.org/pypi_0805nexter_json swh/loader/package/pypi/tests/data/https_pypi.org/pypi_0805nexter_json_visit1 swh/loader/package/pypi/tests/data/https_pypi.org/pypi_nexter_json swh/loader/package/pypi/tests/data/https_pypi.org/pypi_upymenu_json swh/loader/package/tests/__init__.py swh/loader/package/tests/common.py swh/loader/package/tests/test_conftest.py swh/loader/package/tests/test_loader.py 
swh/loader/package/tests/test_loader_metadata.py swh/loader/package/tests/test_utils.py swh/loader/tests/__init__.py swh/loader/tests/conftest.py swh/loader/tests/py.typed swh/loader/tests/test_cli.py swh/loader/tests/test_init.py swh/loader/tests/data/0805nexter-1.1.0.tar.gz \ No newline at end of file diff --git a/swh.loader.core.egg-info/entry_points.txt b/swh.loader.core.egg-info/entry_points.txt index 2c2ca76..018531c 100644 --- a/swh.loader.core.egg-info/entry_points.txt +++ b/swh.loader.core.egg-info/entry_points.txt @@ -1,14 +1,18 @@ [swh.cli.subcommands] loader = swh.loader.cli [swh.workers] +loader.arch = swh.loader.package.arch:register loader.archive = swh.loader.package.archive:register +loader.aur = swh.loader.package.aur:register loader.cran = swh.loader.package.cran:register loader.crates = swh.loader.package.crates:register loader.debian = swh.loader.package.debian:register loader.deposit = swh.loader.package.deposit:register +loader.golang = swh.loader.package.golang:register loader.maven = swh.loader.package.maven:register loader.nixguix = swh.loader.package.nixguix:register loader.npm = swh.loader.package.npm:register loader.opam = swh.loader.package.opam:register +loader.pubdev = swh.loader.package.pubdev:register loader.pypi = swh.loader.package.pypi:register diff --git a/swh.loader.core.egg-info/requires.txt b/swh.loader.core.egg-info/requires.txt index 3aebd1f..fe872fe 100644 --- a/swh.loader.core.egg-info/requires.txt +++ b/swh.loader.core.egg-info/requires.txt @@ -1,25 +1,25 @@ psutil requests iso8601 pkginfo python-debian python-dateutil typing-extensions toml -swh.core>=0.3 +swh.core>=2.12 swh.model>=4.4.0 swh.objstorage>=0.2.2 swh.scheduler>=0.4.0 swh.storage>=0.29.0 [testing] pytest pytest-mock requests_mock swh-core[testing] swh-scheduler[testing]>=0.5.0 swh-storage[testing]>=0.10.6 types-click types-python-dateutil types-pyyaml types-requests diff --git a/swh/loader/core/loader.py b/swh/loader/core/loader.py index dcb74bc..30e30f6 100644 --- a/swh/loader/core/loader.py +++ b/swh/loader/core/loader.py @@ -1,615 +1,636 @@ # Copyright (C) 2015-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import hashlib import logging import os import time from typing import Any, ContextManager, Dict, Iterable, List, Optional, Union import sentry_sdk from swh.core.config import load_from_envvar from swh.core.statsd import Statsd from swh.loader.core.metadata_fetchers import CredentialsType, get_fetchers_for_lister from swh.loader.exception import NotFound from swh.model.model import ( BaseContent, Content, Directory, Origin, OriginVisit, OriginVisitStatus, RawExtrinsicMetadata, Release, Revision, Sha1Git, SkippedContent, Snapshot, ) from swh.storage import get_storage from swh.storage.interface import StorageInterface from swh.storage.utils import now DEFAULT_CONFIG: Dict[str, Any] = { "max_content_size": 100 * 1024 * 1024, } class BaseLoader: """Base class for (D)VCS loaders (e.g Svn, Git, Mercurial, ...) or PackageLoader (e.g PyPI, Npm, CRAN, ...) A loader retrieves origin information (git/mercurial/svn repositories, pypi/npm/... package artifacts), ingests the contents/directories/revisions/releases/snapshot read from those artifacts and send them to the archive through the storage backend. The main entry point for the loader is the :func:`load` function. 
Two class methods (:func:`from_config`, :func:`from_configfile`) centralize and ease loader instantiation from either a configuration dict or a configuration file. Some class examples: - :class:`SvnLoader` - :class:`GitLoader` - :class:`PyPILoader` - :class:`NpmLoader` Args: lister_name: Name of the lister which triggered this load. If provided, the loader will try to use the forge's API to retrieve extrinsic metadata lister_instance_name: Name of the lister instance which triggered this load. Must be None iff lister_name is, but it may be the empty string for listers with a single instance. """ visit_type: str origin: Origin loaded_snapshot_id: Optional[Sha1Git] parent_origins: Optional[List[Origin]] """If the given origin is a "forge fork" (i.e. created with the "Fork" button of GitHub-like forges), :meth:`build_extrinsic_origin_metadata` sets this to a list of origins it was forked from; closest parent first.""" def __init__( self, storage: StorageInterface, origin_url: str, logging_class: Optional[str] = None, save_data_path: Optional[str] = None, max_content_size: Optional[int] = None, lister_name: Optional[str] = None, lister_instance_name: Optional[str] = None, metadata_fetcher_credentials: CredentialsType = None, ): if lister_name == "": raise ValueError("lister_name must not be the empty string") if lister_name is None and lister_instance_name is not None: raise ValueError( f"lister_name is None but lister_instance_name is {lister_instance_name!r}" ) if lister_name is not None and lister_instance_name is None: raise ValueError( f"lister_instance_name is None but lister_name is {lister_name!r}" ) self.storage = storage self.origin = Origin(url=origin_url) self.max_content_size = int(max_content_size) if max_content_size else None self.lister_name = lister_name self.lister_instance_name = lister_instance_name self.metadata_fetcher_credentials = metadata_fetcher_credentials or {} if logging_class is None: logging_class = "%s.%s" % ( self.__class__.__module__, self.__class__.__name__, ) self.log = logging.getLogger(logging_class) _log = logging.getLogger("requests.packages.urllib3.connectionpool") _log.setLevel(logging.WARN) # possibly overridden in self.prepare method self.visit_date = datetime.datetime.now(tz=datetime.timezone.utc) self.loaded_snapshot_id = None if save_data_path: path = save_data_path os.stat(path) if not os.access(path, os.R_OK | os.W_OK): raise PermissionError("Permission denied: %r" % path) self.save_data_path = save_data_path self.parent_origins = None self.statsd = Statsd( namespace="swh_loader", constant_tags={"visit_type": self.visit_type} ) @classmethod def from_config(cls, storage: Dict[str, Any], **config: Any): """Instantiate a loader from a configuration dict. This is basically a backwards-compatibility shim for the CLI. Args: storage: instantiation config for the storage config: the configuration dict for the loader, with the following keys: - credentials (optional): credentials list for the scheduler - any other kwargs passed to the loader. Returns: the instantiated loader """ # Drop the legacy config keys which aren't used for this generation of loader. for legacy_key in ("storage", "celery"): config.pop(legacy_key, None) # Instantiate the storage storage_instance = get_storage(**storage) return cls(storage=storage_instance, **config) @classmethod def from_configfile(cls, **kwargs: Any): """Instantiate a loader from the configuration loaded from the SWH_CONFIG_FILENAME envvar, with potential extra keyword arguments if their value is not None.
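        A sketch of the expected setup (the file path and keys are
        illustrative; ``storage`` is the one key :meth:`from_config`
        strictly requires)::

            # SWH_CONFIG_FILENAME points at a YAML file such as:
            #
            #   storage:
            #     cls: memory
            #
            loader = MyLoader.from_configfile(origin_url="https://example.org/repo")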
Args: kwargs: kwargs passed to the loader instantiation """ config = dict(load_from_envvar(DEFAULT_CONFIG)) config.update({k: v for k, v in kwargs.items() if v is not None}) return cls.from_config(**config) def save_data(self) -> None: """Save the data associated with the current load""" raise NotImplementedError def get_save_data_path(self) -> str: """The path to which we archive the loader's raw data""" if not hasattr(self, "__save_data_path"): year = str(self.visit_date.year) assert self.origin url = self.origin.url.encode("utf-8") origin_url_hash = hashlib.sha1(url).hexdigest() path = "%s/sha1:%s/%s/%s" % ( self.save_data_path, origin_url_hash[0:2], origin_url_hash, year, ) os.makedirs(path, exist_ok=True) self.__save_data_path = path return self.__save_data_path def flush(self) -> Dict[str, int]: """Flush any potential buffered data not sent to swh-storage. Returns the same value as :meth:`swh.storage.interface.StorageInterface.flush`. """ return self.storage.flush() def cleanup(self) -> None: """Last step executed by the loader.""" raise NotImplementedError def _store_origin_visit(self) -> None: """Store origin and visit references. Sets the self.visit references.""" assert self.origin self.storage.origin_add([self.origin]) assert isinstance(self.visit_type, str) self.visit = list( self.storage.origin_visit_add( [ OriginVisit( origin=self.origin.url, date=self.visit_date, type=self.visit_type, ) ] ) )[0] def prepare(self) -> None: """Second step executed by the loader to prepare some state needed by the loader. Raises NotFound exception if the origin to ingest is not found. """ raise NotImplementedError def get_origin(self) -> Origin: """Get the origin that is currently being loaded. ``self.origin`` is set by the loader constructor. Returns: the origin ready to be sent to storage by :func:`origin_add`. """ assert self.origin return self.origin def fetch_data(self) -> bool: """Fetch the data from the source the loader is currently loading (e.g. a git/hg/svn repository). Returns: a value that is interpreted as a boolean. If True, fetch_data needs to be called again to complete loading. """ raise NotImplementedError + def process_data(self) -> bool: + """Run any additional processing between fetching and storing the data. + + Returns: + a value that is interpreted as a boolean. If True, fetch_data needs + to be called again to complete loading. + Ignored if ``fetch_data`` already returned :const:`False`. + """ + return True + def store_data(self): """Store fetched data in the database. Should call the :func:`maybe_load_xyz` methods, which handle the bundles sent to storage, rather than send directly. """ raise NotImplementedError def load_status(self) -> Dict[str, str]: """Detailed loading status. Defaults to logging an eventful load. Returns: a dictionary that is eventually passed back as the task's result to the scheduler, allowing tuning of the task recurrence mechanism. """ return { "status": "eventful", } def post_load(self, success: bool = True) -> None: """Permit the loader to do some additional actions according to status after the loading is done. The flag success indicates the loading's status. Defaults to doing nothing. It is up to the implementer of this method to make sure this does not break. Args: success (bool): the success status of the loading """ pass def visit_status(self) -> str: """Detailed visit status. Defaults to logging a full visit. """ return "full" def pre_cleanup(self) -> None: """As a first step, will try to check for dangling data to clean up.
This should do its best to avoid raising issues. """ pass def load(self) -> Dict[str, str]: r"""Loading logic for the loader to follow: - Store the actual ``origin_visit`` to storage - Call :meth:`prepare` to prepare any eventual state - Call :meth:`get_origin` to get the origin we work with and store - while True: - Call :meth:`fetch_data` to fetch the data to store + - Call :meth:`process_data` to optionally run processing between + :meth:`fetch_data` and :meth:`store_data` - Call :meth:`store_data` to store the data - Call :meth:`cleanup` to clean up any eventual state put in place in :meth:`prepare` method. """ try: with self.statsd_timed("pre_cleanup"): self.pre_cleanup() except Exception: msg = "Cleaning up dangling data failed! Continue loading." self.log.warning(msg) sentry_sdk.capture_exception() self._store_origin_visit() assert ( self.visit.visit ), "The method `_store_origin_visit` should set the visit (OriginVisit)" self.log.info( "Load origin '%s' with type '%s'", self.origin.url, self.visit.type ) try: with self.statsd_timed("build_extrinsic_origin_metadata"): metadata = self.build_extrinsic_origin_metadata() self.load_metadata_objects(metadata) except Exception as e: sentry_sdk.capture_exception(e) # Do not fail the whole task if this is the only failure self.log.exception( "Failure while loading extrinsic origin metadata.", extra={ "swh_task_args": [], "swh_task_kwargs": { "origin": self.origin.url, "lister_name": self.lister_name, "lister_instance_name": self.lister_instance_name, }, }, ) total_time_fetch_data = 0.0 + total_time_process_data = 0.0 total_time_store_data = 0.0 + # Initially not a success, will be True when actually one + status = "failed" + success = False + try: - # Initially not a success, will be True when actually one - success = False with self.statsd_timed("prepare"): self.prepare() while True: t1 = time.monotonic() more_data_to_fetch = self.fetch_data() t2 = time.monotonic() total_time_fetch_data += t2 - t1 - self.store_data() + + more_data_to_fetch = self.process_data() and more_data_to_fetch t3 = time.monotonic() - total_time_store_data += t3 - t2 + total_time_process_data += t3 - t2 + + self.store_data() + t4 = time.monotonic() + total_time_store_data += t4 - t3 if not more_data_to_fetch: break self.statsd_timing("fetch_data", total_time_fetch_data * 1000.0) + self.statsd_timing("process_data", total_time_process_data * 1000.0) self.statsd_timing("store_data", total_time_store_data * 1000.0) status = self.visit_status() visit_status = OriginVisitStatus( origin=self.origin.url, visit=self.visit.visit, type=self.visit_type, date=now(), status=status, snapshot=self.loaded_snapshot_id, ) self.storage.origin_visit_status_add([visit_status]) success = True with self.statsd_timed( "post_load", tags={"success": success, "status": status} ): self.post_load() except BaseException as e: success = False if isinstance(e, NotFound): status = "not_found" task_status = "uneventful" else: status = "partial" if self.loaded_snapshot_id else "failed" task_status = "failed" self.log.exception( "Loading failure, updating to `%s` status", status, extra={ "swh_task_args": [], "swh_task_kwargs": { "origin": self.origin.url, "lister_name": self.lister_name, "lister_instance_name": self.lister_instance_name, }, }, ) if not isinstance(e, (SystemExit, KeyboardInterrupt)): sentry_sdk.capture_exception() visit_status = OriginVisitStatus( origin=self.origin.url, visit=self.visit.visit, type=self.visit_type, date=now(), status=status, snapshot=self.loaded_snapshot_id, ) 
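            # record the unsuccessful (not_found/partial/failed) visit status
            # before running the post_load hook and the eventual re-raise below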
self.storage.origin_visit_status_add([visit_status]) with self.statsd_timed( "post_load", tags={"success": success, "status": status} ): self.post_load(success=success) if not isinstance(e, Exception): # e derives from BaseException but not Exception; this is most likely # SystemExit or KeyboardInterrupt, so we should re-raise it. raise return {"status": task_status} finally: with self.statsd_timed( "flush", tags={"success": success, "status": status} ): self.flush() with self.statsd_timed( "cleanup", tags={"success": success, "status": status} ): self.cleanup() return self.load_status() def load_metadata_objects( self, metadata_objects: List[RawExtrinsicMetadata] ) -> None: if not metadata_objects: return authorities = {mo.authority for mo in metadata_objects} self.storage.metadata_authority_add(list(authorities)) fetchers = {mo.fetcher for mo in metadata_objects} self.storage.metadata_fetcher_add(list(fetchers)) self.storage.raw_extrinsic_metadata_add(metadata_objects) def build_extrinsic_origin_metadata(self) -> List[RawExtrinsicMetadata]: """Builds a list of full RawExtrinsicMetadata objects, using the metadata fetchers returned by :func:`get_fetchers_for_lister`.""" if self.lister_name is None: self.log.debug("lister_name not provided, skipping extrinsic origin metadata") return [] assert ( self.lister_instance_name is not None ), "lister_instance_name is None, but lister_name is not" metadata = [] fetcher_classes = get_fetchers_for_lister(self.lister_name) self.statsd_average("metadata_fetchers", len(fetcher_classes)) for cls in fetcher_classes: metadata_fetcher = cls( origin=self.origin, lister_name=self.lister_name, lister_instance_name=self.lister_instance_name, credentials=self.metadata_fetcher_credentials, ) with self.statsd_timed( "fetch_one_metadata", tags={"fetcher": cls.FETCHER_NAME} ): metadata.extend(metadata_fetcher.get_origin_metadata()) if self.parent_origins is None: self.parent_origins = metadata_fetcher.get_parent_origins() self.statsd_average( "metadata_parent_origins", len(self.parent_origins), tags={"fetcher": cls.FETCHER_NAME}, ) self.statsd_average("metadata_objects", len(metadata)) return metadata def statsd_timed(self, name: str, tags: Dict[str, Any] = {}) -> ContextManager: """ Wrapper for :meth:`swh.core.statsd.Statsd.timed`, which uses the standard metric name and tags for loaders. """ return self.statsd.timed( "operation_duration_seconds", tags={"operation": name, **tags} ) def statsd_timing(self, name: str, value: float, tags: Dict[str, Any] = {}) -> None: """ Wrapper for :meth:`swh.core.statsd.Statsd.timing`, which uses the standard metric name and tags for loaders. """ self.statsd.timing( "operation_duration_seconds", value, tags={"operation": name, **tags} ) def statsd_average( self, name: str, value: Union[int, float], tags: Dict[str, Any] = {} ) -> None: """Increments both ``{name}_sum`` (by the ``value``) and ``{name}_count`` (by ``1``), allowing Prometheus to compute the average ``value`` over time.""" self.statsd.increment(f"{name}_sum", value, tags=tags) self.statsd.increment(f"{name}_count", tags=tags) class DVCSLoader(BaseLoader): """This base class is a pattern for dvcs loaders (e.g. git, mercurial). Those loaders are able to load all the data in one go. For example, the :class:`BulkUpdater` loader defined in swh-loader-git. Other loaders (stateful ones, e.g. :class:`SWHSvnLoader`) should inherit directly from :class:`BaseLoader`.
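    A minimal subclass sketch (hypothetical; it mirrors the hooks exercised by
    the test suite's ``DummyDVCSLoader``)::

        from swh.model.model import Snapshot

        class MyDVCSLoader(DVCSLoader):
            visit_type = "my-dvcs"

            def prepare(self) -> None:
                pass  # e.g. clone the repository and compute what is missing

            def fetch_data(self) -> bool:
                return False  # everything was fetched in one go

            def get_contents(self):
                return []  # iterable of Content/SkippedContent objects

            def get_directories(self):
                return []

            def get_revisions(self):
                return []

            def get_releases(self):
                return []

            def get_snapshot(self) -> Snapshot:
                return Snapshot(branches={})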
""" def cleanup(self) -> None: """Clean up an eventual state installed for computations.""" pass def has_contents(self) -> bool: """Checks whether we need to load contents""" return True def get_contents(self) -> Iterable[BaseContent]: """Get the contents that need to be loaded""" raise NotImplementedError def has_directories(self) -> bool: """Checks whether we need to load directories""" return True def get_directories(self) -> Iterable[Directory]: """Get the directories that need to be loaded""" raise NotImplementedError def has_revisions(self) -> bool: """Checks whether we need to load revisions""" return True def get_revisions(self) -> Iterable[Revision]: """Get the revisions that need to be loaded""" raise NotImplementedError def has_releases(self) -> bool: """Checks whether we need to load releases""" return True def get_releases(self) -> Iterable[Release]: """Get the releases that need to be loaded""" raise NotImplementedError def get_snapshot(self) -> Snapshot: """Get the snapshot that needs to be loaded""" raise NotImplementedError def eventful(self) -> bool: """Whether the load was eventful""" raise NotImplementedError def store_data(self) -> None: assert self.origin if self.save_data_path: self.save_data() if self.has_contents(): for obj in self.get_contents(): if isinstance(obj, Content): self.storage.content_add([obj]) elif isinstance(obj, SkippedContent): self.storage.skipped_content_add([obj]) else: raise TypeError(f"Unexpected content type: {obj}") if self.has_directories(): for directory in self.get_directories(): self.storage.directory_add([directory]) if self.has_revisions(): for revision in self.get_revisions(): self.storage.revision_add([revision]) if self.has_releases(): for release in self.get_releases(): self.storage.release_add([release]) snapshot = self.get_snapshot() self.storage.snapshot_add([snapshot]) self.flush() self.loaded_snapshot_id = snapshot.id diff --git a/swh/loader/core/tests/test_loader.py b/swh/loader/core/tests/test_loader.py index 98cff64..dacec8b 100644 --- a/swh/loader/core/tests/test_loader.py +++ b/swh/loader/core/tests/test_loader.py @@ -1,480 +1,481 @@ # Copyright (C) 2018-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import hashlib import logging import time from unittest.mock import MagicMock, call import pytest from swh.loader.core.loader import BaseLoader, DVCSLoader from swh.loader.core.metadata_fetchers import MetadataFetcherProtocol from swh.loader.exception import NotFound from swh.loader.tests import assert_last_visit_matches from swh.model.hashutil import hash_to_bytes from swh.model.model import ( MetadataAuthority, MetadataAuthorityType, MetadataFetcher, Origin, RawExtrinsicMetadata, Snapshot, ) import swh.storage.exc ORIGIN = Origin(url="some-url") PARENT_ORIGIN = Origin(url="base-origin-url") METADATA_AUTHORITY = MetadataAuthority( type=MetadataAuthorityType.FORGE, url="http://example.org/" ) REMD = RawExtrinsicMetadata( target=ORIGIN.swhid(), discovery_date=datetime.datetime.now(tz=datetime.timezone.utc), authority=METADATA_AUTHORITY, fetcher=MetadataFetcher( name="test fetcher", version="0.0.1", ), format="test-format", metadata=b'{"foo": "bar"}', ) class DummyLoader: """Base Loader to overload and simplify the base class (technical: to avoid repetition in other *Loader classes)""" visit_type = "git" def __init__(self, storage, 
*args, **kwargs): super().__init__(storage, ORIGIN.url, *args, **kwargs) def cleanup(self): pass def prepare(self, *args, **kwargs): pass def fetch_data(self): pass def get_snapshot_id(self): return None class DummyDVCSLoader(DummyLoader, DVCSLoader): """DVCS Loader that does nothing in regards to DAG objects.""" def get_contents(self): return [] def get_directories(self): return [] def get_revisions(self): return [] def get_releases(self): return [] def get_snapshot(self): return Snapshot(branches={}) def eventful(self): return False class DummyBaseLoader(DummyLoader, BaseLoader): """Buffered loader will send new data when threshold is reached""" def store_data(self): pass class DummyMetadataFetcher: SUPPORTED_LISTERS = {"fake-forge"} FETCHER_NAME = "fake-forge" def __init__(self, origin, credentials, lister_name, lister_instance_name): pass def get_origin_metadata(self): return [REMD] def get_parent_origins(self): return [] class DummyMetadataFetcherWithFork: SUPPORTED_LISTERS = {"fake-forge"} FETCHER_NAME = "fake-forge" def __init__(self, origin, credentials, lister_name, lister_instance_name): pass def get_origin_metadata(self): return [REMD] def get_parent_origins(self): return [PARENT_ORIGIN] def test_types(): assert isinstance( DummyMetadataFetcher(None, None, None, None), MetadataFetcherProtocol ) assert isinstance( DummyMetadataFetcherWithFork(None, None, None, None), MetadataFetcherProtocol ) def test_base_loader(swh_storage): loader = DummyBaseLoader(swh_storage) result = loader.load() assert result == {"status": "eventful"} def test_base_loader_with_config(swh_storage): loader = DummyBaseLoader(swh_storage, "logger-name") result = loader.load() assert result == {"status": "eventful"} def test_base_loader_with_known_lister_name(swh_storage, mocker): fetcher_cls = MagicMock(wraps=DummyMetadataFetcher) fetcher_cls.SUPPORTED_LISTERS = DummyMetadataFetcher.SUPPORTED_LISTERS fetcher_cls.FETCHER_NAME = "fake-forge" mocker.patch( "swh.loader.core.metadata_fetchers._fetchers", return_value=[fetcher_cls] ) loader = DummyBaseLoader( swh_storage, lister_name="fake-forge", lister_instance_name="" ) statsd_report = mocker.patch.object(loader.statsd, "_report") result = loader.load() assert result == {"status": "eventful"} fetcher_cls.assert_called_once() fetcher_cls.assert_called_once_with( origin=ORIGIN, credentials={}, lister_name="fake-forge", lister_instance_name="", ) assert swh_storage.raw_extrinsic_metadata_get( ORIGIN.swhid(), METADATA_AUTHORITY ).results == [REMD] assert loader.parent_origins == [] assert [ call("metadata_fetchers_sum", "c", 1, {}, 1), call("metadata_fetchers_count", "c", 1, {}, 1), call("metadata_parent_origins_sum", "c", 0, {"fetcher": "fake-forge"}, 1), call("metadata_parent_origins_count", "c", 1, {"fetcher": "fake-forge"}, 1), call("metadata_objects_sum", "c", 1, {}, 1), call("metadata_objects_count", "c", 1, {}, 1), ] == [c for c in statsd_report.mock_calls if "metadata_" in c[1][0]] assert loader.statsd.namespace == "swh_loader" assert loader.statsd.constant_tags == {"visit_type": "git"} def test_base_loader_with_unknown_lister_name(swh_storage, mocker): fetcher_cls = MagicMock(wraps=DummyMetadataFetcher) fetcher_cls.SUPPORTED_LISTERS = DummyMetadataFetcher.SUPPORTED_LISTERS mocker.patch( "swh.loader.core.metadata_fetchers._fetchers", return_value=[fetcher_cls] ) loader = DummyBaseLoader( swh_storage, lister_name="other-lister", lister_instance_name="" ) result = loader.load() assert result == {"status": "eventful"} fetcher_cls.assert_not_called() with 
pytest.raises(swh.storage.exc.StorageArgumentException): swh_storage.raw_extrinsic_metadata_get(ORIGIN.swhid(), METADATA_AUTHORITY) def test_base_loader_forked_origin(swh_storage, mocker): fetcher_cls = MagicMock(wraps=DummyMetadataFetcherWithFork) fetcher_cls.SUPPORTED_LISTERS = DummyMetadataFetcherWithFork.SUPPORTED_LISTERS fetcher_cls.FETCHER_NAME = "fake-forge" mocker.patch( "swh.loader.core.metadata_fetchers._fetchers", return_value=[fetcher_cls] ) loader = DummyBaseLoader( swh_storage, lister_name="fake-forge", lister_instance_name="" ) statsd_report = mocker.patch.object(loader.statsd, "_report") result = loader.load() assert result == {"status": "eventful"} fetcher_cls.assert_called_once() fetcher_cls.assert_called_once_with( origin=ORIGIN, credentials={}, lister_name="fake-forge", lister_instance_name="", ) assert swh_storage.raw_extrinsic_metadata_get( ORIGIN.swhid(), METADATA_AUTHORITY ).results == [REMD] assert loader.parent_origins == [PARENT_ORIGIN] assert [ call("metadata_fetchers_sum", "c", 1, {}, 1), call("metadata_fetchers_count", "c", 1, {}, 1), call("metadata_parent_origins_sum", "c", 1, {"fetcher": "fake-forge"}, 1), call("metadata_parent_origins_count", "c", 1, {"fetcher": "fake-forge"}, 1), call("metadata_objects_sum", "c", 1, {}, 1), call("metadata_objects_count", "c", 1, {}, 1), ] == [c for c in statsd_report.mock_calls if "metadata_" in c[1][0]] assert loader.statsd.namespace == "swh_loader" assert loader.statsd.constant_tags == {"visit_type": "git"} def test_base_loader_post_load_raise(swh_storage, mocker): loader = DummyBaseLoader(swh_storage) post_load = mocker.patch.object(loader, "post_load") # raise exception in post_load when success is True def post_load_method(*args, success=True): if success: raise Exception("Error in post_load") post_load.side_effect = post_load_method result = loader.load() assert result == {"status": "failed"} # ensure post_load has been called twice, once with success to True and # once with success to False as the first post_load call raised exception assert post_load.call_args_list == [mocker.call(), mocker.call(success=False)] def test_dvcs_loader(swh_storage): loader = DummyDVCSLoader(swh_storage) result = loader.load() assert result == {"status": "eventful"} def test_dvcs_loader_with_config(swh_storage): loader = DummyDVCSLoader(swh_storage, "another-logger") result = loader.load() assert result == {"status": "eventful"} def test_loader_logger_default_name(swh_storage): loader = DummyBaseLoader(swh_storage) assert isinstance(loader.log, logging.Logger) assert loader.log.name == "swh.loader.core.tests.test_loader.DummyBaseLoader" loader = DummyDVCSLoader(swh_storage) assert isinstance(loader.log, logging.Logger) assert loader.log.name == "swh.loader.core.tests.test_loader.DummyDVCSLoader" def test_loader_logger_with_name(swh_storage): loader = DummyBaseLoader(swh_storage, "some.logger.name") assert isinstance(loader.log, logging.Logger) assert loader.log.name == "some.logger.name" def test_loader_save_data_path(swh_storage, tmp_path): loader = DummyBaseLoader(swh_storage, "some.logger.name.1", save_data_path=tmp_path) url = "http://bitbucket.org/something" loader.origin = Origin(url=url) loader.visit_date = datetime.datetime(year=2019, month=10, day=1) hash_url = hashlib.sha1(url.encode("utf-8")).hexdigest() expected_save_path = "%s/sha1:%s/%s/2019" % (str(tmp_path), hash_url[0:2], hash_url) save_path = loader.get_save_data_path() assert save_path == expected_save_path def _check_load_failure(caplog, loader, exc_class, exc_text, 
status="partial"): """Check whether a failed load properly logged its exception, and that the snapshot didn't get referenced in storage""" assert isinstance(loader, DVCSLoader) # was implicit so far for record in caplog.records: if record.levelname != "ERROR": continue assert "Loading failure" in record.message assert record.exc_info exc = record.exc_info[1] assert isinstance(exc, exc_class) assert exc_text in exc.args[0] # Check that the get_snapshot operation would have succeeded assert loader.get_snapshot() is not None # And confirm that the visit doesn't reference a snapshot visit = assert_last_visit_matches(loader.storage, ORIGIN.url, status) if status != "partial": assert visit.snapshot is None # But that the snapshot didn't get loaded assert loader.loaded_snapshot_id is None @pytest.mark.parametrize("success", [True, False]) def test_loader_timings(swh_storage, mocker, success): current_time = time.time() mocker.patch("time.monotonic", side_effect=lambda: current_time) mocker.patch("swh.core.statsd.monotonic", side_effect=lambda: current_time) runtimes = { "pre_cleanup": 2.0, "build_extrinsic_origin_metadata": 3.0, "prepare": 5.0, "fetch_data": 7.0, - "store_data": 11.0, - "post_load": 13.0, - "flush": 17.0, - "cleanup": 23.0, + "process_data": 11.0, + "store_data": 13.0, + "post_load": 17.0, + "flush": 23.0, + "cleanup": 27.0, } class TimedLoader(BaseLoader): visit_type = "my-visit-type" def __getattribute__(self, method_name): if method_name == "visit_status" and not success: def crashy(): raise Exception("oh no") return crashy if method_name not in runtimes: return super().__getattribute__(method_name) def meth(*args, **kwargs): nonlocal current_time current_time += runtimes[method_name] return meth loader = TimedLoader(swh_storage, origin_url="http://example.org/hello.git") statsd_report = mocker.patch.object(loader.statsd, "_report") loader.load() if success: expected_tags = { "post_load": {"success": True, "status": "full"}, "flush": {"success": True, "status": "full"}, "cleanup": {"success": True, "status": "full"}, } else: expected_tags = { "post_load": {"success": False, "status": "failed"}, "flush": {"success": False, "status": "failed"}, "cleanup": {"success": False, "status": "failed"}, } # note that this is a list equality, so order of entries in 'runtimes' matters. # This is not perfect, but call() objects are not hashable so it's simpler this way, # even if not perfect. 
assert statsd_report.mock_calls == [ call( "operation_duration_seconds", "ms", value * 1000, {"operation": key, **expected_tags.get(key, {})}, 1, ) for (key, value) in runtimes.items() ] assert loader.statsd.namespace == "swh_loader" assert loader.statsd.constant_tags == {"visit_type": "my-visit-type"} class DummyDVCSLoaderExc(DummyDVCSLoader): """A loader which raises an exception when loading some contents""" def get_contents(self): raise RuntimeError("Failed to get contents!") def test_dvcs_loader_exc_partial_visit(swh_storage, caplog): logger_name = "dvcsloaderexc" caplog.set_level(logging.ERROR, logger=logger_name) loader = DummyDVCSLoaderExc(swh_storage, logging_class=logger_name) # fake the loading ending up in a snapshot loader.loaded_snapshot_id = hash_to_bytes( "9e4dd2b40d1b46b70917c0949aa2195c823a648e" ) result = loader.load() # loading failed assert result == {"status": "failed"} # still resulted in a partial visit with a snapshot (somehow) _check_load_failure( caplog, loader, RuntimeError, "Failed to get contents!", ) class BrokenStorageProxy: def __init__(self, storage): self.storage = storage def __getattr__(self, attr): return getattr(self.storage, attr) def snapshot_add(self, snapshots): raise RuntimeError("Failed to add snapshot!") class DummyDVCSLoaderStorageExc(DummyDVCSLoader): """A loader which raises an exception when loading some contents""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.storage = BrokenStorageProxy(self.storage) def test_dvcs_loader_storage_exc_failed_visit(swh_storage, caplog): logger_name = "dvcsloaderexc" caplog.set_level(logging.ERROR, logger=logger_name) loader = DummyDVCSLoaderStorageExc(swh_storage, logging_class=logger_name) result = loader.load() assert result == {"status": "failed"} _check_load_failure( caplog, loader, RuntimeError, "Failed to add snapshot!", status="failed" ) class DummyDVCSLoaderNotFound(DummyDVCSLoader, BaseLoader): """A loader which raises a not_found exception during the prepare method call""" def prepare(*args, **kwargs): raise NotFound("Unknown origin!") def load_status(self): return { "status": "uneventful", } def test_loader_not_found(swh_storage, caplog): loader = DummyDVCSLoaderNotFound(swh_storage) result = loader.load() assert result == {"status": "uneventful"} _check_load_failure(caplog, loader, NotFound, "Unknown origin!", status="not_found") diff --git a/swh/loader/package/arch/__init__.py b/swh/loader/package/arch/__init__.py new file mode 100644 index 0000000..ef34674 --- /dev/null +++ b/swh/loader/package/arch/__init__.py @@ -0,0 +1,17 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +from typing import Any, Mapping + + +def register() -> Mapping[str, Any]: + """Register the current worker module's definition""" + from .loader import ArchLoader + + return { + "task_modules": [f"{__name__}.tasks"], + "loader": ArchLoader, + } diff --git a/swh/loader/package/arch/loader.py b/swh/loader/package/arch/loader.py new file mode 100644 index 0000000..7ab9fc2 --- /dev/null +++ b/swh/loader/package/arch/loader.py @@ -0,0 +1,141 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information +from 
distutils.version import LooseVersion +from pathlib import Path +import re +from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple + +import attr + +from swh.loader.package.loader import BasePackageInfo, PackageLoader +from swh.loader.package.utils import release_name +from swh.model.model import ObjectType, Person, Release, Sha1Git, TimestampWithTimezone +from swh.storage.interface import StorageInterface + + +@attr.s +class ArchPackageInfo(BasePackageInfo): + + name = attr.ib(type=str) + """Name of the package""" + + version = attr.ib(type=str) + """Current version""" + + last_modified = attr.ib(type=str) + """File last modified date as release date""" + + +def extract_intrinsic_metadata(dir_path: Path) -> Dict[str, Any]: + """Extract intrinsic metadata from .PKGINFO file at dir_path. + + Each Arch Linux package has a .PKGINFO file at the root of the archive. + + Args: + dir_path: A directory on disk where a package has been extracted + + Returns: + A dict mapping the lowercased .PKGINFO field names to their values; + when a ``url`` field is present it is also duplicated as ``project_url`` + """ + pkginfo_path = Path(dir_path, ".PKGINFO") + rex = re.compile(r"^(\w+)\s=\s(.*)$", re.M) + with pkginfo_path.open("rb") as content: + parsed = rex.findall(content.read().decode()) + data = {entry[0].lower(): entry[1] for entry in parsed} + if "url" in data.keys(): + data["project_url"] = data["url"] + return data + + +class ArchLoader(PackageLoader[ArchPackageInfo]): + visit_type = "arch" + + def __init__( + self, + storage: StorageInterface, + url: str, + artifacts: List[Dict[str, Any]], + arch_metadata: List[Dict[str, Any]], + **kwargs, + ): + + super().__init__(storage=storage, url=url, **kwargs) + self.url = url + self.artifacts: Dict[str, Dict] = { + artifact["version"]: artifact for artifact in artifacts + } + self.arch_metadata: Dict[str, Dict] = { + metadata["version"]: metadata for metadata in arch_metadata + } + + def get_versions(self) -> Sequence[str]: + """Get all released versions of an Arch Linux package + + Returns: + A sequence of versions + + Example:: + + ["0.1.1", "0.10.2"] + """ + versions = list(self.artifacts.keys()) + versions.sort(key=LooseVersion) + return versions + + def get_default_version(self) -> str: + """Get the newest release version of an Arch Linux package + + Returns: + A string representing a version + + Example:: + + "0.1.2" + """ + return self.get_versions()[-1] + + def get_package_info(self, version: str) -> Iterator[Tuple[str, ArchPackageInfo]]: + """Get release name and package information from version + + Args: + version: arch version (e.g. "0.1.0") + + Returns: + Iterator of tuple (release_name, p_info) + """ + artifact = self.artifacts[version] + metadata = self.arch_metadata[version] + assert version == artifact["version"] == metadata["version"] + + p_info = ArchPackageInfo( + name=metadata["name"], + filename=artifact["filename"], + url=artifact["url"], + version=version, + last_modified=metadata["last_modified"], + ) + yield release_name(version, artifact["filename"]), p_info + + def build_release( + self, p_info: ArchPackageInfo, uncompressed_path: str, directory: Sha1Git + ) -> Optional[Release]: + intrinsic_metadata = extract_intrinsic_metadata(Path(uncompressed_path)) + author = Person.from_fullname(intrinsic_metadata["packager"].encode()) + description = intrinsic_metadata["pkgdesc"] + + message = ( + f"Synthetic release for Arch Linux source package {p_info.name} " + f"version {p_info.version}\n\n" + f"{description}\n" + ) + return Release( + name=p_info.version.encode(), + author=author,
date=TimestampWithTimezone.from_iso8601(p_info.last_modified), + message=message.encode(), + target_type=ObjectType.DIRECTORY, + target=directory, + synthetic=True, + ) diff --git a/swh/loader/package/crates/tasks.py b/swh/loader/package/arch/tasks.py similarity index 51% copy from swh/loader/package/crates/tasks.py copy to swh/loader/package/arch/tasks.py index 9385263..0e6ded9 100644 --- a/swh/loader/package/crates/tasks.py +++ b/swh/loader/package/arch/tasks.py @@ -1,14 +1,14 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from celery import shared_task -from swh.loader.package.crates.loader import CratesLoader +from swh.loader.package.arch.loader import ArchLoader -@shared_task(name=__name__ + ".LoadCrates") -def load_crates(*, url=None, artifacts: list): - """Load Rust crate package""" - return CratesLoader.from_configfile(url=url, artifacts=artifacts).load() +@shared_task(name=__name__ + ".LoadArch") +def load_arch(**kwargs): + """Load Arch Linux packages""" + return ArchLoader.from_configfile(**kwargs).load() diff --git a/swh/loader/package/arch/tests/__init__.py b/swh/loader/package/arch/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/swh/loader/package/arch/tests/data/fake_arch.sh b/swh/loader/package/arch/tests/data/fake_arch.sh new file mode 100755 index 0000000..471d207 --- /dev/null +++ b/swh/loader/package/arch/tests/data/fake_arch.sh @@ -0,0 +1,86 @@ +#!/usr/bin/env bash + +# Script to generate fake Arch Linux package files and fake HTTP responses. + +set -euo pipefail + +# Please note that you need to install the Zstandard compression tool (zstd) to compress +# to .zst archives and Xz utils (xz) to compress to .xz archives. +command -v zstd || echo "you should install 'zstd' to run this script" +command -v xz || echo "you should install 'xz' to run this script" + +# files and directories +mkdir https_archive.archlinux.org +mkdir https_uk.mirror.archlinuxarm.org +mkdir -p tmp_dir/arch/ + +cd tmp_dir/arch/ + +mkdir 'packages_d_dialog_dialog-1:1.3_20190211-1-x86_64' +mkdir 'packages_d_dialog_dialog-1:1.3_20220414-1-x86_64' + +echo -e '''pkgname = dialog +pkgbase = dialog +pkgver = 1:1.3_20190211-1 +pkgdesc = A tool to display dialog boxes from shell scripts +url = https://invisible-island.net/dialog/ +builddate = 1550046926 +packager = Evangelos Foutras +size = 455680 +arch = x86_64 +license = LGPL2.1 +provides = libdialog.so=15-64 +depend = sh +''' > packages_d_dialog_dialog-1:1.3_20190211-1-x86_64/.PKGINFO + +echo -e '''pkgname = dialog +pkgbase = dialog +pkgver = 1:1.3_20220414-1 +pkgdesc = A tool to display dialog boxes from shell scripts +url = https://invisible-island.net/dialog/ +builddate = 1650081535 +packager = Evangelos Foutras +size = 483988 +arch = x86_64 +license = LGPL2.1 +provides = libdialog.so=15-64 +depend = sh +depend = ncurses +''' > packages_d_dialog_dialog-1:1.3_20220414-1-x86_64/.PKGINFO + +# Compress package folders to .tar.xz / .tar.zst archives + +tar --force-local -acf 'packages_d_dialog_dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz' -C 'packages_d_dialog_dialog-1:1.3_20190211-1-x86_64' . +tar --force-local -acf 'packages_d_dialog_dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst' -C 'packages_d_dialog_dialog-1:1.3_20220414-1-x86_64' .
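+# Move the generated archives under the fake mirror directory trees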
+ +mv *.xz ../../https_archive.archlinux.org +mv *.zst ../../https_archive.archlinux.org + +# uk.mirror.archlinuxarm.org +mkdir 'aarch64_core_gzip-1.12-1-aarch64' + +echo -e '''# Generated by makepkg 6.0.1 +# using fakeroot version 1.28 +pkgname = gzip +pkgbase = gzip +pkgver = 1.12-1 +pkgdesc = GNU compression utility +url = https://www.gnu.org/software/gzip/ +builddate = 1649365694 +packager = Arch Linux ARM Build System +size = 162688 +arch = aarch64 +license = GPL3 +group = base-devel +depend = glibc +depend = bash +depend = less +''' > aarch64_core_gzip-1.12-1-aarch64/.PKGINFO + +tar --force-local -acf 'aarch64_core_gzip-1.12-1-aarch64.pkg.tar.xz' -C 'aarch64_core_gzip-1.12-1-aarch64' . + +mv *.xz ../../https_uk.mirror.archlinuxarm.org + +# Clean up removing tmp_dir +cd ../../ +rm -r tmp_dir/ diff --git a/swh/loader/package/arch/tests/data/https_archive.archlinux.org/packages_d_dialog_dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz b/swh/loader/package/arch/tests/data/https_archive.archlinux.org/packages_d_dialog_dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz new file mode 100644 index 0000000..92e2f00 Binary files /dev/null and b/swh/loader/package/arch/tests/data/https_archive.archlinux.org/packages_d_dialog_dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz differ diff --git a/swh/loader/package/arch/tests/data/https_archive.archlinux.org/packages_d_dialog_dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst b/swh/loader/package/arch/tests/data/https_archive.archlinux.org/packages_d_dialog_dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst new file mode 100644 index 0000000..a0ebd62 Binary files /dev/null and b/swh/loader/package/arch/tests/data/https_archive.archlinux.org/packages_d_dialog_dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst differ diff --git a/swh/loader/package/arch/tests/data/https_uk.mirror.archlinuxarm.org/aarch64_core_gzip-1.12-1-aarch64.pkg.tar.xz b/swh/loader/package/arch/tests/data/https_uk.mirror.archlinuxarm.org/aarch64_core_gzip-1.12-1-aarch64.pkg.tar.xz new file mode 100644 index 0000000..c7578fa Binary files /dev/null and b/swh/loader/package/arch/tests/data/https_uk.mirror.archlinuxarm.org/aarch64_core_gzip-1.12-1-aarch64.pkg.tar.xz differ diff --git a/swh/loader/package/arch/tests/test_arch.py b/swh/loader/package/arch/tests/test_arch.py new file mode 100644 index 0000000..3180f9d --- /dev/null +++ b/swh/loader/package/arch/tests/test_arch.py @@ -0,0 +1,253 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information +import pytest + +from swh.loader.package.arch.loader import ArchLoader +from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats +from swh.model.hashutil import hash_to_bytes +from swh.model.model import ( + ObjectType, + Person, + Release, + Snapshot, + SnapshotBranch, + TargetType, + TimestampWithTimezone, +) + +EXPECTED_PACKAGES = [ + { + "url": "https://archive.archlinux.org/packages/d/dialog/", + "artifacts": [ + { + "url": "https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz", # noqa: B950 + "version": "1:1.3_20190211-1", + "length": 180000, + "filename": "dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz", + }, + { + "url": "https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "1:1.3_20220414-1", + "length": 198000, + "filename": 
"dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst", + }, + ], + "arch_metadata": [ + { + "arch": "x86_64", + "repo": "core", + "name": "dialog", + "version": "1:1.3_20190211-1", + "last_modified": "2019-02-13T08:36:00", + }, + { + "arch": "x86_64", + "repo": "core", + "name": "dialog", + "version": "1:1.3_20220414-1", + "last_modified": "2022-04-16T03:59:00", + }, + ], + }, + { + "url": "https://archlinuxarm.org/packages/aarch64/gzip", + "artifacts": [ + { + "url": "https://uk.mirror.archlinuxarm.org/aarch64/core/gzip-1.12-1-aarch64.pkg.tar.xz", # noqa: B950 + "length": 79640, + "version": "1.12-1", + "filename": "gzip-1.12-1-aarch64.pkg.tar.xz", + } + ], + "arch_metadata": [ + { + "arch": "aarch64", + "name": "gzip", + "repo": "core", + "version": "1.12-1", + "last_modified": "2022-04-07T21:08:14", + } + ], + }, +] + + +def test_get_versions(swh_storage): + loader = ArchLoader( + swh_storage, + url=EXPECTED_PACKAGES[0]["url"], + artifacts=EXPECTED_PACKAGES[0]["artifacts"], + arch_metadata=EXPECTED_PACKAGES[0]["arch_metadata"], + ) + + assert loader.get_versions() == [ + "1:1.3_20190211-1", + "1:1.3_20220414-1", + ] + + +def test_get_default_version(requests_mock_datadir, swh_storage): + loader = ArchLoader( + swh_storage, + url=EXPECTED_PACKAGES[0]["url"], + artifacts=EXPECTED_PACKAGES[0]["artifacts"], + arch_metadata=EXPECTED_PACKAGES[0]["arch_metadata"], + ) + assert loader.get_default_version() == "1:1.3_20220414-1" + + +def test_arch_loader_load_one_version(datadir, requests_mock_datadir, swh_storage): + loader = ArchLoader( + swh_storage, + url=EXPECTED_PACKAGES[1]["url"], + artifacts=EXPECTED_PACKAGES[1]["artifacts"], + arch_metadata=EXPECTED_PACKAGES[1]["arch_metadata"], + ) + actual_load_status = loader.load() + assert actual_load_status["status"] == "eventful" + assert actual_load_status["snapshot_id"] is not None + + expected_snapshot_id = "4020d0a278027550e336b5481a4159a913c91aa4" + expected_release_id = "7681098c9e381f9cc8bd1724d57eeee2182982dc" + + assert expected_snapshot_id == actual_load_status["snapshot_id"] + + expected_snapshot = Snapshot( + id=hash_to_bytes(actual_load_status["snapshot_id"]), + branches={ + b"releases/1.12-1/gzip-1.12-1-aarch64.pkg.tar.xz": SnapshotBranch( + target=hash_to_bytes(expected_release_id), + target_type=TargetType.RELEASE, + ), + b"HEAD": SnapshotBranch( + target=b"releases/1.12-1/gzip-1.12-1-aarch64.pkg.tar.xz", + target_type=TargetType.ALIAS, + ), + }, + ) + check_snapshot(expected_snapshot, swh_storage) + + stats = get_stats(swh_storage) + assert { + "content": 1, + "directory": 1, + "origin": 1, + "origin_visit": 1, + "release": 1, + "revision": 0, + "skipped_content": 0, + "snapshot": 1, + } == stats + + assert swh_storage.release_get([hash_to_bytes(expected_release_id)])[0] == Release( + name=b"1.12-1", + message=b"Synthetic release for Arch Linux source package gzip version " + b"1.12-1\n\nGNU compression utility\n", + target=hash_to_bytes("bd742aaf422953a1f7a5e084ec4a7477491d63fb"), + target_type=ObjectType.DIRECTORY, + synthetic=True, + author=Person.from_fullname( + b"Arch Linux ARM Build System " + ), + date=TimestampWithTimezone.from_iso8601("2022-04-07T21:08:14+00:00"), + id=hash_to_bytes(expected_release_id), + ) + + assert_last_visit_matches( + swh_storage, + url=EXPECTED_PACKAGES[1]["url"], + status="full", + type="arch", + snapshot=expected_snapshot.id, + ) + + +def test_arch_loader_load_n_versions(datadir, requests_mock_datadir, swh_storage): + + loader = ArchLoader( + swh_storage, + url=EXPECTED_PACKAGES[0]["url"], + 
artifacts=EXPECTED_PACKAGES[0]["artifacts"], + arch_metadata=EXPECTED_PACKAGES[0]["arch_metadata"], + ) + actual_load_status = loader.load() + assert actual_load_status["status"] == "eventful" + assert actual_load_status["snapshot_id"] is not None + + expected_snapshot_id = "832139d69a91edffcc3a96cca11deaf9255041c3" + + assert expected_snapshot_id == actual_load_status["snapshot_id"] + + expected_snapshot = Snapshot( + id=hash_to_bytes(actual_load_status["snapshot_id"]), + branches={ + b"releases/1:1.3_20190211-1/" + b"dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz": SnapshotBranch( + target=hash_to_bytes("37efb727ff8bb8fbf92518aa8fe5fff2ad427d06"), + target_type=TargetType.RELEASE, + ), + b"releases/1:1.3_20220414-1/" + b"dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst": SnapshotBranch( + target=hash_to_bytes("020d3f5627df7474f257fd04f1ede4415296e265"), + target_type=TargetType.RELEASE, + ), + b"HEAD": SnapshotBranch( + target=b"releases/1:1.3_20220414-1/dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst", + target_type=TargetType.ALIAS, + ), + }, + ) + + check_snapshot(expected_snapshot, swh_storage) + + stats = get_stats(swh_storage) + assert { + "content": 2, + "directory": 2, + "origin": 1, + "origin_visit": 1, + "release": 2, + "revision": 0, + "skipped_content": 0, + "snapshot": 1, + } == stats + + assert_last_visit_matches( + swh_storage, + url=EXPECTED_PACKAGES[0]["url"], + status="full", + type="arch", + snapshot=expected_snapshot.id, + ) + + +def test_arch_invalid_origin_archive_not_found(swh_storage, requests_mock_datadir): + url = "https://nowhere/packages/42" + loader = ArchLoader( + swh_storage, + url, + artifacts=[ + { + "filename": "42-0.0.1.pkg.xz", + "url": "https://mirror2.nowhere/pkg/42-0.0.1.pkg.xz", + "version": "0.0.1", + "length": 42, + }, + ], + arch_metadata=[ + { + "version": "0.0.1", + "arch": "aarch64", + "name": "42", + "repo": "community", + "last_modified": "2022-04-07T21:08:14", + }, + ], + ) + with pytest.raises(Exception): + assert loader.load() == {"status": "failed"} + assert_last_visit_matches( + swh_storage, url, status="not_found", type="arch", snapshot=None + ) diff --git a/swh/loader/package/arch/tests/test_tasks.py b/swh/loader/package/arch/tests/test_tasks.py new file mode 100644 index 0000000..b5178ac --- /dev/null +++ b/swh/loader/package/arch/tests/test_tasks.py @@ -0,0 +1,40 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def test_tasks_arch_loader( + mocker, swh_scheduler_celery_app, swh_scheduler_celery_worker, swh_config +): + mock_load = mocker.patch("swh.loader.package.arch.loader.ArchLoader.load") + mock_load.return_value = {"status": "eventful"} + + res = swh_scheduler_celery_app.send_task( + "swh.loader.package.arch.tasks.LoadArch", + kwargs=dict( + url="some-url/packages/s/some-package", + artifacts=[ + { + "version": "0.0.1", + "url": "https://somewhere/some-package-0.0.1.pkg.xz", + "filename": "some-package-0.0.1.pkg.xz", + "length": 42, + } + ], + arch_metadata=[ + { + "version": "0.0.1", + "arch": "aarch64", + "name": "some-package", + "repo": "community", + "last_modified": "1970-01-01T21:08:14", + } + ], + ), + ) + assert res + res.wait() + assert res.successful() + assert mock_load.called + assert res.result == {"status": "eventful"} diff --git a/swh/loader/package/archive/loader.py b/swh/loader/package/archive/loader.py index 
853a8c4..b96cad6 100644 --- a/swh/loader/package/archive/loader.py +++ b/swh/loader/package/archive/loader.py @@ -1,164 +1,168 @@ -# Copyright (C) 2019-2021 The Software Heritage developers +# Copyright (C) 2019-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import hashlib import logging from os import path import string from typing import Any, Dict, Iterator, Mapping, Optional, Sequence, Tuple, Union import attr import iso8601 from swh.loader.package.loader import BasePackageInfo, PackageLoader, PartialExtID from swh.loader.package.utils import EMPTY_AUTHOR, release_name from swh.model.model import ObjectType, Release, Sha1Git, TimestampWithTimezone from swh.storage.interface import StorageInterface logger = logging.getLogger(__name__) @attr.s class ArchivePackageInfo(BasePackageInfo): raw_info = attr.ib(type=Dict[str, Any]) length = attr.ib(type=int) """Size of the archive file""" time = attr.ib(type=Union[str, datetime.datetime]) """Timestamp of the archive file on the server""" # default format for gnu MANIFEST_FORMAT = string.Template("$time $length $version $url") def extid(self, manifest_format: Optional[string.Template] = None) -> PartialExtID: """Returns a unique intrinsic identifier of this package info ``manifest_format`` allows overriding the class' default MANIFEST_FORMAT""" manifest_format = manifest_format or self.MANIFEST_FORMAT # TODO: use parsed attributes instead of self.raw_info manifest = manifest_format.substitute( {k: str(v) for (k, v) in self.raw_info.items()} ) return ( self.EXTID_TYPE, self.EXTID_VERSION, hashlib.sha256(manifest.encode()).digest(), ) @classmethod def from_metadata(cls, a_metadata: Dict[str, Any]) -> "ArchivePackageInfo": url = a_metadata["url"] filename = a_metadata.get("filename") return cls( url=url, filename=filename if filename else path.split(url)[-1], raw_info=a_metadata, length=a_metadata["length"], time=a_metadata["time"], version=a_metadata["version"], ) class ArchiveLoader(PackageLoader[ArchivePackageInfo]): """Load archive origin's artifact files into swh archive""" visit_type = "tar" def __init__( self, storage: StorageInterface, url: str, artifacts: Sequence[Dict[str, Any]], extid_manifest_format: Optional[str] = None, snapshot_append: bool = False, **kwargs: Any, ): f"""Loader constructor. For now, this is the lister's task output. Args: url: Origin url artifacts: List of artifact information with keys: - **time**: last modification time as either isoformat date string or timestamp - **url**: the artifact url to retrieve filename - **filename**: optionally, the file's name - **version**: artifact's version - **length**: artifact's length extid_manifest_format: template string used to format a manifest, which is hashed to get the extid of a package. 
Defaults to {ArchivePackageInfo.MANIFEST_FORMAT!r} snapshot_append: if :const:`True`, append latest snapshot content to the new snapshot created by the loader """ super().__init__(storage=storage, url=url, **kwargs) self.artifacts = artifacts # assume order is enforced in the lister self.extid_manifest_format = ( None if extid_manifest_format is None else string.Template(extid_manifest_format) ) self.snapshot_append = snapshot_append def get_versions(self) -> Sequence[str]: versions = [] for archive in self.artifacts: v = archive.get("version") if v: versions.append(v) return versions def get_default_version(self) -> str: # It's the most recent, so for this loader, it's the last one return self.artifacts[-1]["version"] def get_package_info( self, version: str ) -> Iterator[Tuple[str, ArchivePackageInfo]]: for a_metadata in self.artifacts: p_info = ArchivePackageInfo.from_metadata(a_metadata) if version == p_info.version: # FIXME: this code assumes we have only 1 artifact per # versioned package yield release_name(version), p_info def new_packageinfo_to_extid( self, p_info: ArchivePackageInfo ) -> Optional[PartialExtID]: return p_info.extid(manifest_format=self.extid_manifest_format) def build_release( self, p_info: ArchivePackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Release]: time = p_info.time # assume it's a timestamp if isinstance(time, str): # otherwise, assume it's a parsable date parsed_time = iso8601.parse_date(time) else: parsed_time = time - normalized_time = TimestampWithTimezone.from_datetime(parsed_time) + normalized_time = ( + TimestampWithTimezone.from_datetime(parsed_time) + if parsed_time is not None + else None + ) msg = f"Synthetic release for archive at {p_info.url}\n" return Release( name=p_info.version.encode(), message=msg.encode(), date=normalized_time, author=EMPTY_AUTHOR, target=directory, target_type=ObjectType.DIRECTORY, synthetic=True, ) def extra_branches(self) -> Dict[bytes, Mapping[str, Any]]: if not self.snapshot_append: return {} last_snapshot = self.last_snapshot() return last_snapshot.to_dict()["branches"] if last_snapshot else {} diff --git a/swh/loader/package/archive/tests/test_archive.py b/swh/loader/package/archive/tests/test_archive.py index a590c1d..7a32b2c 100644 --- a/swh/loader/package/archive/tests/test_archive.py +++ b/swh/loader/package/archive/tests/test_archive.py @@ -1,488 +1,502 @@ -# Copyright (C) 2019-2021 The Software Heritage developers +# Copyright (C) 2019-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import copy import datetime import hashlib from io import BytesIO from pathlib import Path import string import attr import pytest from requests.exceptions import ContentDecodingError from swh.loader.package.archive.loader import ArchiveLoader, ArchivePackageInfo from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats from swh.model.hashutil import hash_to_bytes, hash_to_hex from swh.model.model import ( ObjectType, Person, Release, Snapshot, SnapshotBranch, TargetType, TimestampWithTimezone, ) URL = "https://ftp.gnu.org/gnu/8sync/" GNU_ARTIFACTS = [ { "time": 944729610, "url": "https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz", "length": 221837, "filename": "8sync-0.1.0.tar.gz", "version": "0.1.0", }, { "time": 1480991830, "url": "https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz", "length": 238466, 
"filename": "8sync-0.2.0.tar.gz", "version": "0.2.0", }, ] _expected_new_contents_first_visit = [ "e9258d81faf5881a2f96a77ba609396f82cb97ad", "1170cf105b04b7e2822a0e09d2acf71da7b9a130", "fbd27c3f41f2668624ffc80b7ba5db9b92ff27ac", "0057bec9b5422aff9256af240b177ac0e3ac2608", "2b8d0d0b43a1078fc708930c8ddc2956a86c566e", "27de3b3bc6545d2a797aeeb4657c0e215a0c2e55", "2e6db43f5cd764e677f416ff0d0c78c7a82ef19b", "ae9be03bd2a06ed8f4f118d3fe76330bb1d77f62", "edeb33282b2bffa0e608e9d2fd960fd08093c0ea", "d64e64d4c73679323f8d4cde2643331ba6c20af9", "7a756602914be889c0a2d3952c710144b3e64cb0", "84fb589b554fcb7f32b806951dcf19518d67b08f", "8624bcdae55baeef00cd11d5dfcfa60f68710a02", "e08441aeab02704cfbd435d6445f7c072f8f524e", "f67935bc3a83a67259cda4b2d43373bd56703844", "809788434b433eb2e3cfabd5d591c9a659d5e3d8", "7d7c6c8c5ebaeff879f61f37083a3854184f6c41", "b99fec102eb24bffd53ab61fc30d59e810f116a2", "7d149b28eaa228b3871c91f0d5a95a2fa7cb0c68", "f0c97052e567948adf03e641301e9983c478ccff", "7fb724242e2b62b85ca64190c31dcae5303e19b3", "4f9709e64a9134fe8aefb36fd827b84d8b617ab5", "7350628ccf194c2c3afba4ac588c33e3f3ac778d", "0bb892d9391aa706dc2c3b1906567df43cbe06a2", "49d4c0ce1a16601f1e265d446b6c5ea6b512f27c", "6b5cc594ac466351450f7f64a0b79fdaf4435ad3", "3046e5d1f70297e2a507b98224b6222c9688d610", "1572607d456d7f633bc6065a2b3048496d679a31", ] _expected_new_directories_first_visit = [ "daabc65ec75d487b1335ffc101c0ac11c803f8fc", "263be23b4a8101d3ad0d9831319a3e0f2b065f36", "7f6e63ba6eb3e2236f65892cd822041f1a01dd5c", "4db0a3ecbc976083e2dac01a62f93729698429a3", "dfef1c80e1098dd5deda664bb44a9ab1f738af13", "eca971d346ea54d95a6e19d5051f900237fafdaa", "3aebc29ed1fccc4a6f2f2010fb8e57882406b528", ] _expected_new_releases_first_visit = { "c92b2ad9e70ef1dce455e8fe1d8e41b92512cc08": ( "3aebc29ed1fccc4a6f2f2010fb8e57882406b528" ) } def test_archive_visit_with_no_artifact_found(swh_storage, requests_mock_datadir): url = URL unknown_artifact_url = "https://ftp.g.o/unknown/8sync-0.1.0.tar.gz" loader = ArchiveLoader( swh_storage, url, artifacts=[ { "time": 944729610, "url": unknown_artifact_url, # unknown artifact "length": 221837, "filename": "8sync-0.1.0.tar.gz", "version": "0.1.0", } ], ) actual_load_status = loader.load() assert actual_load_status["status"] == "uneventful" assert actual_load_status["snapshot_id"] is not None stats = get_stats(swh_storage) assert { "content": 0, "directory": 0, "origin": 1, "origin_visit": 1, "release": 0, "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats assert_last_visit_matches(swh_storage, url, status="partial", type="tar") def test_archive_visit_with_release_artifact_no_prior_visit( swh_storage, requests_mock_datadir ): """With no prior visit, load a gnu project ends up with 1 snapshot""" loader = ArchiveLoader(swh_storage, URL, artifacts=GNU_ARTIFACTS[:1]) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" expected_snapshot_first_visit_id = hash_to_bytes( "9efecc835e8f99254934f256b5301b94f348fd17" ) assert actual_load_status["snapshot_id"] == hash_to_hex( expected_snapshot_first_visit_id ) assert_last_visit_matches(swh_storage, URL, status="full", type="tar") stats = get_stats(swh_storage) assert { "content": len(_expected_new_contents_first_visit), "directory": len(_expected_new_directories_first_visit), "origin": 1, "origin_visit": 1, "release": len(_expected_new_releases_first_visit), "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats release_id = hash_to_bytes(list(_expected_new_releases_first_visit)[0]) expected_snapshot = Snapshot( 
id=expected_snapshot_first_visit_id, branches={ b"HEAD": SnapshotBranch( target_type=TargetType.ALIAS, target=b"releases/0.1.0", ), b"releases/0.1.0": SnapshotBranch( target_type=TargetType.RELEASE, target=release_id, ), }, ) check_snapshot(expected_snapshot, swh_storage) assert swh_storage.release_get([release_id])[0] == Release( id=release_id, name=b"0.1.0", message=( b"Synthetic release for archive at " b"https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz\n" ), target=hash_to_bytes("3aebc29ed1fccc4a6f2f2010fb8e57882406b528"), target_type=ObjectType.DIRECTORY, synthetic=True, author=Person.from_fullname(b""), date=TimestampWithTimezone.from_datetime( datetime.datetime(1999, 12, 9, 8, 53, 30, tzinfo=datetime.timezone.utc) ), ) expected_contents = map(hash_to_bytes, _expected_new_contents_first_visit) assert list(swh_storage.content_missing_per_sha1(expected_contents)) == [] expected_dirs = map(hash_to_bytes, _expected_new_directories_first_visit) assert list(swh_storage.directory_missing(expected_dirs)) == [] expected_rels = map(hash_to_bytes, _expected_new_releases_first_visit) assert list(swh_storage.release_missing(expected_rels)) == [] def test_archive_2_visits_without_change(swh_storage, requests_mock_datadir): """Visiting a gnu project twice with no new artifact ends up with the same snapshot""" url = URL loader = ArchiveLoader(swh_storage, url, artifacts=GNU_ARTIFACTS[:1]) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None assert_last_visit_matches(swh_storage, url, status="full", type="tar") actual_load_status2 = loader.load() assert actual_load_status2["status"] == "uneventful" assert actual_load_status2["snapshot_id"] is not None assert actual_load_status["snapshot_id"] == actual_load_status2["snapshot_id"] assert_last_visit_matches(swh_storage, url, status="full", type="tar") urls = [ m.url for m in requests_mock_datadir.request_history if m.url.startswith("https://ftp.gnu.org") ] assert len(urls) == 1 def test_archive_2_visits_with_new_artifact(swh_storage, requests_mock_datadir): """A second visit with a new artifact ends up with a new snapshot""" url = URL artifact1 = GNU_ARTIFACTS[0] loader = ArchiveLoader(swh_storage, url, [artifact1]) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None assert_last_visit_matches(swh_storage, url, status="full", type="tar") stats = get_stats(swh_storage) assert { "content": len(_expected_new_contents_first_visit), "directory": len(_expected_new_directories_first_visit), "origin": 1, "origin_visit": 1, "release": len(_expected_new_releases_first_visit), "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats urls = [ m.url for m in requests_mock_datadir.request_history if m.url.startswith("https://ftp.gnu.org") ] assert len(urls) == 1 artifact2 = GNU_ARTIFACTS[1] loader2 = ArchiveLoader(swh_storage, url, [artifact1, artifact2]) stats2 = get_stats(swh_storage) assert stats == stats2 # ensure we share the storage actual_load_status2 = loader2.load() assert actual_load_status2["status"] == "eventful" assert actual_load_status2["snapshot_id"] is not None stats2 = get_stats(swh_storage) assert { "content": len(_expected_new_contents_first_visit) + 14, "directory": len(_expected_new_directories_first_visit) + 8, "origin": 1, "origin_visit": 1 + 1, "release": len(_expected_new_releases_first_visit) + 1, "revision": 0, "skipped_content": 0, "snapshot": 1 + 1, } == stats2
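# The second visit only adds the 0.2.0 artifact's objects on top of the first visit's: +14 contents, +8 directories, +1 release and +1 snapshot; the origin count stays at 1 since both visits target the same URL.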
assert_last_visit_matches(swh_storage, url, status="full", type="tar") urls = [ m.url for m in requests_mock_datadir.request_history if m.url.startswith("https://ftp.gnu.org") ] # 1 artifact (2nd time no modification) + 1 new artifact assert len(urls) == 2 def test_archive_2_visits_without_change_not_gnu(swh_storage, requests_mock_datadir): """Loading a project archive (not gnu) ends up with 1 snapshot""" url = "https://something.else.org/8sync/" artifacts = [ # this is not a gnu artifact { "time": "1999-12-09T09:53:30+00:00", # it's also not a timestamp "sha256": "d5d1051e59b2be6f065a9fc6aedd3a391e44d0274b78b9bb4e2b57a09134dbe4", # noqa # keep a gnu artifact reference to avoid adding other test files "url": "https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz", "length": 238466, "filename": "8sync-0.2.0.tar.gz", "version": "0.2.0", } ] # Here the loader defines the id_keys to use for existence checks in the snapshot; # it overrides the default archive loader manifest, which is built from the # time, length, version and url fields loader = ArchiveLoader( swh_storage, url, artifacts=artifacts, extid_manifest_format="$sha256 $length $url", ) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None assert_last_visit_matches(swh_storage, url, status="full", type="tar") actual_load_status2 = loader.load() assert actual_load_status2["status"] == "uneventful" assert actual_load_status2["snapshot_id"] == actual_load_status["snapshot_id"] assert_last_visit_matches(swh_storage, url, status="full", type="tar") urls = [ m.url for m in requests_mock_datadir.request_history if m.url.startswith("https://ftp.gnu.org") ] assert len(urls) == 1 def test_archive_extid(): """Computing the primary key should return the right identity""" @attr.s class TestPackageInfo(ArchivePackageInfo): a = attr.ib() b = attr.ib() metadata = GNU_ARTIFACTS[0] p_info = TestPackageInfo( raw_info={**metadata, "a": 1, "b": 2}, a=1, b=2, **metadata, ) for manifest_format, expected_manifest in [ (string.Template("$a $b"), b"1 2"), (string.Template(""), b""), (None, "{time} {length} {version} {url}".format(**metadata).encode()), ]: actual_id = p_info.extid(manifest_format=manifest_format) assert actual_id == ( "package-manifest-sha256", 0, hashlib.sha256(expected_manifest).digest(), ) with pytest.raises(KeyError): p_info.extid(manifest_format=string.Template("$a $unknown_key")) def test_archive_snapshot_append(swh_storage, requests_mock_datadir): # first loading with a first artifact artifact1 = GNU_ARTIFACTS[0] loader = ArchiveLoader(swh_storage, URL, [artifact1], snapshot_append=True) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None assert_last_visit_matches(swh_storage, URL, status="full", type="tar") # check expected snapshot snapshot = loader.last_snapshot() assert len(snapshot.branches) == 2 branch_artifact1_name = f"releases/{artifact1['version']}".encode() assert b"HEAD" in snapshot.branches assert branch_artifact1_name in snapshot.branches assert snapshot.branches[b"HEAD"].target == branch_artifact1_name # second loading with a second artifact artifact2 = GNU_ARTIFACTS[1] loader = ArchiveLoader(swh_storage, URL, [artifact2], snapshot_append=True) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None assert_last_visit_matches(swh_storage, URL, status="full", type="tar") # check expected snapshot, should contain a new branch and the # branch for the first artifact
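# (snapshot_append=True makes extra_branches() fold the previous snapshot's branches into the new one, so releases/0.1.0 survives this second visit)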
snapshot = loader.last_snapshot() assert len(snapshot.branches) == 3 branch_artifact2_name = f"releases/{artifact2['version']}".encode() assert b"HEAD" in snapshot.branches assert branch_artifact2_name in snapshot.branches assert branch_artifact1_name in snapshot.branches assert snapshot.branches[b"HEAD"].target == branch_artifact2_name def test_archive_snapshot_append_branch_override(swh_storage, requests_mock_datadir): # first loading for a first artifact artifact1 = GNU_ARTIFACTS[0] loader = ArchiveLoader(swh_storage, URL, [artifact1], snapshot_append=True) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None assert_last_visit_matches(swh_storage, URL, status="full", type="tar") # check expected snapshot snapshot = loader.last_snapshot() assert len(snapshot.branches) == 2 branch_artifact1_name = f"releases/{artifact1['version']}".encode() assert branch_artifact1_name in snapshot.branches branch_target_first_visit = snapshot.branches[branch_artifact1_name].target # second loading for a second artifact with same version as the first one # but with different tarball content artifact2 = dict(GNU_ARTIFACTS[0]) artifact2["url"] = GNU_ARTIFACTS[1]["url"] artifact2["time"] = GNU_ARTIFACTS[1]["time"] artifact2["length"] = GNU_ARTIFACTS[1]["length"] loader = ArchiveLoader(swh_storage, URL, [artifact2], snapshot_append=True) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None assert_last_visit_matches(swh_storage, URL, status="full", type="tar") # check expected snapshot, should contain the same branch as previously # but with different target snapshot = loader.last_snapshot() assert len(snapshot.branches) == 2 assert branch_artifact1_name in snapshot.branches branch_target_second_visit = snapshot.branches[branch_artifact1_name].target assert branch_target_first_visit != branch_target_second_visit @pytest.fixture def not_gzipped_tarball_bytes(datadir): return Path(datadir, "not_gzipped_tarball.tar.gz").read_bytes() def test_archive_not_gzipped_tarball( swh_storage, requests_mock, not_gzipped_tarball_bytes ): """Check that a tarball erroneously marked as gzip compressed can still be downloaded and processed. 
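The first mocked response raises ContentDecodingError; the loader is expected to retry and fetch the raw, undecoded bytes.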
""" filename = "not_gzipped_tarball.tar.gz" url = f"https://example.org/ftp/{filename}" requests_mock.get( url, [ { "exc": ContentDecodingError, }, { "body": BytesIO(not_gzipped_tarball_bytes), }, ], ) loader = ArchiveLoader( swh_storage, url, artifacts=[ { "time": 944729610, "url": url, "length": 221837, "filename": filename, "version": "0.1.0", } ], ) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None snapshot = loader.last_snapshot() assert len(snapshot.branches) == 2 assert b"releases/0.1.0" in snapshot.branches + + +def test_archive_visit_no_time_for_tarball(swh_storage, requests_mock_datadir): + artifacts = copy.deepcopy(GNU_ARTIFACTS) + for artifact in artifacts: + artifact["time"] = None + + loader = ArchiveLoader(swh_storage, URL, artifacts=artifacts) + + actual_load_status = loader.load() + assert actual_load_status["status"] == "eventful" + + assert_last_visit_matches(swh_storage, URL, status="full", type="tar") diff --git a/swh/loader/package/aur/__init__.py b/swh/loader/package/aur/__init__.py new file mode 100644 index 0000000..1682976 --- /dev/null +++ b/swh/loader/package/aur/__init__.py @@ -0,0 +1,17 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +from typing import Any, Mapping + + +def register() -> Mapping[str, Any]: + """Register the current worker module's definition""" + from .loader import AurLoader + + return { + "task_modules": [f"{__name__}.tasks"], + "loader": AurLoader, + } diff --git a/swh/loader/package/aur/loader.py b/swh/loader/package/aur/loader.py new file mode 100644 index 0000000..24577be --- /dev/null +++ b/swh/loader/package/aur/loader.py @@ -0,0 +1,160 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information +from distutils.version import LooseVersion +from pathlib import Path +import re +from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple + +import attr + +from swh.loader.package.loader import BasePackageInfo, PackageLoader +from swh.loader.package.utils import EMPTY_AUTHOR, release_name +from swh.model.model import ObjectType, Release, Sha1Git, TimestampWithTimezone +from swh.storage.interface import StorageInterface + + +@attr.s +class AurPackageInfo(BasePackageInfo): + + name = attr.ib(type=str) + """Name of the package""" + + version = attr.ib(type=str) + """Current version""" + + last_modified = attr.ib(type=str) + """File last modified date as release date""" + + +def extract_intrinsic_metadata(dir_path: Path) -> Dict[str, Any]: + """Extract intrinsic metadata from .SRCINFO file at dir_path. + + Each Aur package has a .SRCINFO file at the root of the archive. 
+ + Args: + dir_path: A directory on disk where a package has been extracted + + Returns: + A dict mapping .SRCINFO keys to values + """ + assert dir_path.exists() + # top directory from extracted archive is always named with the package name + (pkgname,) = [elt.name for elt in dir_path.iterdir() if elt.is_dir()] + srcinfo_path = Path(dir_path, pkgname, ".SRCINFO") + rex = re.compile(r"^(\w+)\s=\s(.*)$", re.M) + with srcinfo_path.open("r") as content: + # Except for the first and last lines, lines may start with a tab; remove them + srcinfo = content.read().replace("\t", "") + parsed = rex.findall(srcinfo) + data: Dict[str, Any] = {} + for (k, v) in parsed: + if k in data: + if type(data[k]) is not list: + data[k] = [data[k]] + data[k].append(v) + else: + data[k] = v + return data + + +class AurLoader(PackageLoader[AurPackageInfo]): + visit_type = "aur" + + def __init__( + self, + storage: StorageInterface, + url: str, + artifacts: List[Dict[str, Any]], + aur_metadata: List[Dict[str, Any]], + **kwargs, + ): + + super().__init__(storage=storage, url=url, **kwargs) + self.url = url + self.artifacts: Dict[str, Dict] = { + artifact["version"]: artifact for artifact in artifacts + } + self.aur_metadata: Dict[str, Dict] = { + meta["version"]: meta for meta in aur_metadata + } + + def get_versions(self) -> Sequence[str]: + """Get all released versions of an Aur package + + Returns: + A sequence of versions + + Example:: + + ["0.1.1", "0.10.2"] + """ + versions = list(self.artifacts) + versions.sort(key=LooseVersion) + return versions + + def get_default_version(self) -> str: + """Get the newest release version of an Aur package + + Returns: + A string representing a version + + Example:: + + "0.1.2" + """ + return self.get_versions()[-1] + + def get_package_info(self, version: str) -> Iterator[Tuple[str, AurPackageInfo]]: + """Get release name and package information from version + + Args: + version: aur version (e.g: "0.1.0") + + Returns: + Iterator of tuple (release_name, p_info) + """ + artifact = self.artifacts[version] + assert version == artifact["version"] + data = self.aur_metadata[version] + + url = artifact["url"] + filename = artifact["filename"] + + p_info = AurPackageInfo( + name=data["pkgname"], + filename=filename, + url=url, + version=version, + last_modified=data["last_update"], + ) + yield release_name(version, filename), p_info + + def build_release( + self, p_info: AurPackageInfo, uncompressed_path: str, directory: Sha1Git + ) -> Optional[Release]: + intrinsic_metadata = extract_intrinsic_metadata(Path(uncompressed_path)) + author = EMPTY_AUTHOR + description: str = "" + assert intrinsic_metadata["pkgdesc"] + + if type(intrinsic_metadata["pkgdesc"]) is list: + description = "\n".join(intrinsic_metadata["pkgdesc"]) + else: + description = intrinsic_metadata["pkgdesc"] + + message = ( + f"Synthetic release for Aur source package {p_info.name} " + f"version {p_info.version}\n\n" + f"{description}\n" + ) + return Release( + name=p_info.version.encode(), + author=author, + date=TimestampWithTimezone.from_iso8601(p_info.last_modified), + message=message.encode(), + target_type=ObjectType.DIRECTORY, + target=directory, + synthetic=True, + ) diff --git a/swh/loader/package/crates/tasks.py b/swh/loader/package/aur/tasks.py similarity index 51% copy from swh/loader/package/crates/tasks.py copy to swh/loader/package/aur/tasks.py index 9385263..9cfb24b 100644 --- a/swh/loader/package/crates/tasks.py +++ b/swh/loader/package/aur/tasks.py @@ -1,14 +1,14 @@ # Copyright (C) 2022 The Software Heritage developers # See the
AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from celery import shared_task -from swh.loader.package.crates.loader import CratesLoader +from swh.loader.package.aur.loader import AurLoader -@shared_task(name=__name__ + ".LoadCrates") -def load_crates(*, url=None, artifacts: list): - """Load Rust crate package""" - return CratesLoader.from_configfile(url=url, artifacts=artifacts).load() +@shared_task(name=__name__ + ".LoadAur") +def load_aur(**kwargs): + """Load Arch User Repository packages""" + return AurLoader.from_configfile(**kwargs).load() diff --git a/swh/loader/package/aur/tests/__init__.py b/swh/loader/package/aur/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/swh/loader/package/aur/tests/data/fake_aur.sh b/swh/loader/package/aur/tests/data/fake_aur.sh new file mode 100755 index 0000000..70e6844 --- /dev/null +++ b/swh/loader/package/aur/tests/data/fake_aur.sh @@ -0,0 +1,187 @@ +#!/usr/bin/env bash + +# Script to generate fake Aur packages files and servable directories. + +set -euo pipefail + +# Create directories +readonly TMP=tmp_dir/aur +readonly BASE_URL=https_aur.archlinux.org +readonly SNAPSHOT_PREFIX=cgit_aur.git_snapshot + +mkdir -p $TMP +mkdir -p $BASE_URL + +cd $TMP + +mkdir 'hg-evolve' +echo -e '''pkgbase = hg-evolve + pkgdesc = Flexible evolution of Mercurial history + pkgver = 10.5.2 + pkgrel = 1 + url = https://www.mercurial-scm.org/doc/evolution/ + arch = any + license = GPL2 + makedepends = python-build + makedepends = python-installer + makedepends = python-wheel + depends = mercurial + source = https://files.pythonhosted.org/packages/source/h/hg-evolve/hg-evolve-10.5.2.tar.gz + sha512sums = 81a1cc1202ffaf364fde70c6a36e32330e93aa69c9b9f7e11fbc11f988f7fb302d8b79414c644d274fedb7f0a67e10c4344c0206a1424f2bb97ae2cb11a51315 + +pkgname = hg-evolve +''' > hg-evolve/.SRCINFO + +mkdir 'ibus-git' +echo -e '''pkgbase = ibus-git + pkgdesc = Next Generation Input Bus for Linux + pkgver = 1.5.23+12+gef4c5c7e + pkgrel = 1 + url = https://github.com/ibus/ibus/wiki + arch = x86_64 + license = LGPL + makedepends = gobject-introspection + makedepends = vala + makedepends = intltool + makedepends = gnome-common + makedepends = gtk-doc + makedepends = gtk2 + makedepends = qt5-base + makedepends = unicode-cldr + makedepends = unicode-character-database + makedepends = unicode-emoji + makedepends = git + depends = dconf + depends = gtk3 + depends = hicolor-icon-theme + depends = libnotify + depends = python-dbus + depends = python-gobject + depends = iso-codes + depends = librsvg + options = !emptydirs + source = ibus::git+https://github.com/ibus/ibus + sha512sums = SKIP + +pkgname = ibus-git + depends = dconf + depends = gtk3 + depends = hicolor-icon-theme + depends = libnotify + depends = python-dbus + depends = python-gobject + depends = iso-codes + depends = librsvg + depends = libibus-git=1.5.23+12+gef4c5c7e + provides = ibus + conflicts = ibus + +pkgname = libibus-git + pkgdesc = IBus support library + depends = libglib-2.0.so + depends = libgobject-2.0.so + depends = libgio-2.0.so + provides = libibus + provides = libibus-1.0.so + conflicts = libibus +''' > ibus-git/.SRCINFO + +mkdir 'libervia-web-hg' +echo -e '''pkgbase = libervia-web-hg + pkgdesc = Salut à Toi, multi-frontends multi-purposes XMPP client (Web interface) + pkgver = 0.9.0.r1492.3a34d78f2717 + pkgrel = 1 + url = http://salut-a-toi.org/ + install = 
libervia-web-hg.install + arch = any + license = AGPL3 + makedepends = python-setuptools + makedepends = mercurial + depends = python + depends = python-jinja + depends = python-shortuuid-git + depends = libervia-media-hg + depends = libervia-backend-hg + depends = libervia-templates-hg + depends = python-zope-interface + depends = python-pyopenssl + depends = python-autobahn + depends = dbus + depends = python-brython + provides = libervia-web + options = !strip + source = hg+https://repos.goffi.org/libervia + md5sums = SKIP + +pkgname = libervia-web-hg +''' > libervia-web-hg/.SRCINFO + +mkdir 'tealdeer-git' +echo -e '''# Generated by mksrcinfo v8 +# Fri Sep 4 20:36:25 UTC 2020 +pkgbase = tealdeer-git + pkgdesc = A fast tldr client in Rust. + pkgver = r255.30b7c5f + pkgrel = 1 + url = https://github.com/dbrgn/tealdeer + arch = x86_64 + arch = armv6h + arch = armv7h + arch = aarch64 + license = MIT + license = Apache + makedepends = git + makedepends = rust + makedepends = cargo + depends = openssl + provides = tldr + conflicts = tldr + options = !emptydirs + source = git+https://github.com/dbrgn/tealdeer + sha256sums = SKIP + +pkgname = tealdeer-git +''' > tealdeer-git/.SRCINFO + +mkdir 'a-fake-one' +echo -e '''# This one does not exists +# For test purpose, in particular for multi keys, multi lines edge case +pkgbase = a-fake-one + pkgdesc = A first line of description. + pkgdesc = A second line for more information. + pkgver = 0.0.1 + pkgrel = 1 + url = https://nowhere/a-fake-one + url = https://mirror/a-fake-one + arch = x86_64 + arch = armv6h + arch = armv7h + arch = aarch64 + license = MIT + license = Apache + makedepends = git + makedepends = rust + makedepends = cargo + depends = openssl + provides = a-fake-one + conflicts = a-fake-one + options = !emptydirs + source = git+https://nowhere/a-fake-one + sha256sums = SKIP + +pkgname = a-fake-one +''' > a-fake-one/.SRCINFO + +# Compress packages folders to .tar.gz archives +tar -czf ${SNAPSHOT_PREFIX}_hg-evolve.tar.gz hg-evolve +tar -czf ${SNAPSHOT_PREFIX}_ibus-git.tar.gz ibus-git +tar -czf ${SNAPSHOT_PREFIX}_libervia-web-hg.tar.gz libervia-web-hg +tar -czf ${SNAPSHOT_PREFIX}_tealdeer-git.tar.gz tealdeer-git +tar -czf ${SNAPSHOT_PREFIX}_a-fake-one.tar.gz a-fake-one + +# Move .tar.gz archives to a servable directory +mv *.tar.gz ../../$BASE_URL + +# Clean up removing tmp_dir +cd ../../ +rm -r tmp_dir/ diff --git a/swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_a-fake-one.tar.gz b/swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_a-fake-one.tar.gz new file mode 100644 index 0000000..f193cc6 Binary files /dev/null and b/swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_a-fake-one.tar.gz differ diff --git a/swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_hg-evolve.tar.gz b/swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_hg-evolve.tar.gz new file mode 100644 index 0000000..d95aa4e Binary files /dev/null and b/swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_hg-evolve.tar.gz differ diff --git a/swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_ibus-git.tar.gz b/swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_ibus-git.tar.gz new file mode 100644 index 0000000..99cb97b Binary files /dev/null and b/swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_ibus-git.tar.gz differ 
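For reference, a minimal standalone sketch (not part of the diff; variable names are illustrative) of the multi-value folding that extract_intrinsic_metadata applies to a .SRCINFO file such as the a-fake-one fixture generated above:

import re

# tab-indented continuation lines, as emitted by fake_aur.sh
srcinfo = (
    "pkgbase = a-fake-one\n"
    "\tpkgdesc = A first line of description.\n"
    "\tpkgdesc = A second line for more information.\n"
    "\tpkgver = 0.0.1\n"
)

# same regex as the loader; tabs are stripped before matching
rex = re.compile(r"^(\w+)\s=\s(.*)$", re.M)
data = {}
for k, v in rex.findall(srcinfo.replace("\t", "")):
    if k in data:
        if not isinstance(data[k], list):
            data[k] = [data[k]]  # promote to a list on the second occurrence
        data[k].append(v)
    else:
        data[k] = v

# repeated keys fold into lists, single keys stay scalar
assert data["pkgdesc"] == [
    "A first line of description.",
    "A second line for more information.",
]
assert data["pkgver"] == "0.0.1"

This is why build_release has to join list-valued pkgdesc entries with newlines before embedding them in the release message.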
diff --git a/swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_libervia-web-hg.tar.gz b/swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_libervia-web-hg.tar.gz new file mode 100644 index 0000000..a02d15d Binary files /dev/null and b/swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_libervia-web-hg.tar.gz differ diff --git a/swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_tealdeer-git.tar.gz b/swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_tealdeer-git.tar.gz new file mode 100644 index 0000000..4ff29c4 Binary files /dev/null and b/swh/loader/package/aur/tests/data/https_aur.archlinux.org/cgit_aur.git_snapshot_tealdeer-git.tar.gz differ diff --git a/swh/loader/package/aur/tests/test_aur.py b/swh/loader/package/aur/tests/test_aur.py new file mode 100644 index 0000000..86e69ee --- /dev/null +++ b/swh/loader/package/aur/tests/test_aur.py @@ -0,0 +1,276 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information +from swh.loader.package.aur.loader import AurLoader +from swh.loader.package.utils import EMPTY_AUTHOR +from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats +from swh.model.hashutil import hash_to_bytes +from swh.model.model import ( + ObjectType, + Release, + Snapshot, + SnapshotBranch, + TargetType, + TimestampWithTimezone, +) + +EXPECTED_PACKAGES = [ + { + "url": "https://aur.archlinux.org/hg-evolve.git", + "artifacts": [ + { + "filename": "hg-evolve.tar.gz", + "url": "https://aur.archlinux.org/cgit/aur.git/snapshot/hg-evolve.tar.gz", # noqa: B950 + "version": "10.5.2-1", + } + ], + "aur_metadata": [ + { + "version": "10.5.2-1", + "project_url": "https://www.mercurial-scm.org/doc/evolution/", + "last_update": "2022-07-16T00:08:41+00:00", + "pkgname": "hg-evolve", + } + ], + }, + { + "url": "https://aur.archlinux.org/ibus-git.git", + "artifacts": [ + { + "filename": "ibus-git.tar.gz", + "url": "https://aur.archlinux.org/cgit/aur.git/snapshot/ibus-git.tar.gz", # noqa: B950 + "version": "1.5.23+12+gef4c5c7e-1", + } + ], + "aur_metadata": [ + { + "version": "1.5.23+12+gef4c5c7e-1", + "project_url": "https://github.com/ibus/ibus/wiki", + "last_update": "2021-02-08T06:12:11+00:00", + "pkgname": "ibus-git", + } + ], + }, + { + "url": "https://aur.archlinux.org/libervia-web-hg.git", + "artifacts": [ + { + "filename": "libervia-web-hg.tar.gz", + "url": "https://aur.archlinux.org/cgit/aur.git/snapshot/libervia-web-hg.tar.gz", # noqa: B950 + "version": "0.9.0.r1492.3a34d78f2717-1", + } + ], + "aur_metadata": [ + { + "version": "0.9.0.r1492.3a34d78f2717-1", + "project_url": "http://salut-a-toi.org/", + "last_update": "2022-02-26T15:30:58+00:00", + "pkgname": "libervia-web-hg", + } + ], + }, + { + "url": "https://aur.archlinux.org/tealdeer-git.git", + "artifacts": [ + { + "filename": "tealdeer-git.tar.gz", + "url": "https://aur.archlinux.org/cgit/aur.git/snapshot/tealdeer-git.tar.gz", # noqa: B950 + "version": "r255.30b7c5f-1", + } + ], + "aur_metadata": [ + { + "version": "r255.30b7c5f-1", + "project_url": "https://github.com/dbrgn/tealdeer", + "last_update": "2020-09-04T20:36:52+00:00", + "pkgname": "tealdeer-git", + } + ], + }, + { + "url": "https://aur.archlinux.org/a-fake-one.git", + "artifacts": [ + { + "filename": 
"a-fake-one.tar.gz", + "url": "https://aur.archlinux.org/cgit/aur.git/snapshot/a-fake-one.tar.gz", # noqa: B950 + "version": "0.0.1", + }, + ], + "aur_metadata": [ + { + "version": "0.0.1", + "project_url": "https://nowhere/a-fake-one", + "last_update": "2022-02-02T22:22:22+00:00", + "pkgname": "a-fake-one", + } + ], + }, +] + + +def test_get_versions(swh_storage): + loader = AurLoader( + swh_storage, + url=EXPECTED_PACKAGES[0]["url"], + artifacts=EXPECTED_PACKAGES[0]["artifacts"], + aur_metadata=EXPECTED_PACKAGES[0]["aur_metadata"], + ) + + assert loader.get_versions() == [ + "10.5.2-1", + ] + + +def test_get_default_version(requests_mock_datadir, swh_storage): + loader = AurLoader( + swh_storage, + url=EXPECTED_PACKAGES[0]["url"], + artifacts=EXPECTED_PACKAGES[0]["artifacts"], + aur_metadata=EXPECTED_PACKAGES[0]["aur_metadata"], + ) + assert loader.get_default_version() == "10.5.2-1" + + +def test_aur_loader_load_one_version(datadir, requests_mock_datadir, swh_storage): + loader = AurLoader( + swh_storage, + url=EXPECTED_PACKAGES[0]["url"], + artifacts=EXPECTED_PACKAGES[0]["artifacts"], + aur_metadata=EXPECTED_PACKAGES[0]["aur_metadata"], + ) + actual_load_status = loader.load() + assert actual_load_status["status"] == "eventful" + assert actual_load_status["snapshot_id"] is not None + + expected_snapshot_id = "fb9ff853036ea48c94f5e5366a9e49d7610d98ed" + expected_release_id = "35ddfe3106bb47f259a9316898de5cab5bf15864" + + assert expected_snapshot_id == actual_load_status["snapshot_id"] + + expected_snapshot = Snapshot( + id=hash_to_bytes(actual_load_status["snapshot_id"]), + branches={ + b"releases/10.5.2-1/hg-evolve.tar.gz": SnapshotBranch( + target=hash_to_bytes(expected_release_id), + target_type=TargetType.RELEASE, + ), + b"HEAD": SnapshotBranch( + target=b"releases/10.5.2-1/hg-evolve.tar.gz", + target_type=TargetType.ALIAS, + ), + }, + ) + + check_snapshot(expected_snapshot, swh_storage) + + stats = get_stats(swh_storage) + assert { + "content": 1, + "directory": 2, + "origin": 1, + "origin_visit": 1, + "release": 1, + "revision": 0, + "skipped_content": 0, + "snapshot": 1, + } == stats + + assert swh_storage.release_get([hash_to_bytes(expected_release_id)])[0] == Release( + name=b"10.5.2-1", + message=b"Synthetic release for Aur source package hg-evolve version " + b"10.5.2-1\n\nFlexible evolution of Mercurial history\n", + target=hash_to_bytes("cc4079be57e7cc0dbf2ecc76c81f5d84782ba632"), + target_type=ObjectType.DIRECTORY, + synthetic=True, + author=EMPTY_AUTHOR, + date=TimestampWithTimezone.from_iso8601("2022-07-16T00:08:41+00:00"), + id=hash_to_bytes(expected_release_id), + ) + + assert_last_visit_matches( + swh_storage, + url=EXPECTED_PACKAGES[0]["url"], + status="full", + type="aur", + snapshot=expected_snapshot.id, + ) + + +def test_aur_loader_load_expected_packages(datadir, requests_mock_datadir, swh_storage): + # Exclude the last 'fake' package from EXPECTED_PACKAGES + for package in EXPECTED_PACKAGES[:-1]: + loader = AurLoader( + swh_storage, + url=package["url"], + artifacts=package["artifacts"], + aur_metadata=package["aur_metadata"], + ) + actual_load_status = loader.load() + assert actual_load_status["status"] == "eventful" + assert actual_load_status["snapshot_id"] is not None + + stats = get_stats(swh_storage) + assert { + "content": 1 + 1 + 1 + 1, + "directory": 2 + 2 + 2 + 2, + "origin": 1 + 1 + 1 + 1, + "origin_visit": 1 + 1 + 1 + 1, + "release": 1 + 1 + 1 + 1, + "revision": 0, + "skipped_content": 0, + "snapshot": 1 + 1 + 1 + 1, + } == stats + + +def 
test_aur_invalid_origin_not_found(swh_storage, requests_mock_datadir): + url = "http://nowhere/packages/42.git" + loader = AurLoader( + swh_storage, + url, + artifacts=[ + { + "version": "0.0.1", + "url": "https://myforge.nowhere/42/42.tar.gz", + "filename": "42.tar.gz", + }, + ], + aur_metadata=[ + { + "pkgname": "42", + "version": "0.0.1", + "project_url": "https://myforge.nowhere/42", + "last_update": "2022-04-07T21:08:14", + }, + ], + ) + + load_status = loader.load() + assert load_status["status"] == "uneventful" + + +def test_aur_parse_srcinfo(swh_storage, requests_mock_datadir): + """Ensure that multiple `pkgdesc` lines in .SRCINFO result in a multi-line + `description` string""" + + loader = AurLoader( + swh_storage, + url=EXPECTED_PACKAGES[-1]["url"], + artifacts=EXPECTED_PACKAGES[-1]["artifacts"], + aur_metadata=EXPECTED_PACKAGES[-1]["aur_metadata"], + ) + loader.load() + + expected_release_id = "2af50761854fee5589b75ff0ecd6886d1185377e" + + assert swh_storage.release_get([hash_to_bytes(expected_release_id)])[0] == Release( + name=b"0.0.1", + message=b"Synthetic release for Aur source package a-fake-one version 0.0.1\n\n" + b"A first line of description.\nA second line for more information.\n", + target=hash_to_bytes("82c770b7d8b1aa573e57b13864831e141d40fe26"), + target_type=ObjectType.DIRECTORY, + synthetic=True, + author=EMPTY_AUTHOR, + date=TimestampWithTimezone.from_iso8601("2022-02-02T22:22:22+00:00"), + id=hash_to_bytes(expected_release_id), + ) diff --git a/swh/loader/package/aur/tests/test_tasks.py b/swh/loader/package/aur/tests/test_tasks.py new file mode 100644 index 0000000..b3ebafa --- /dev/null +++ b/swh/loader/package/aur/tests/test_tasks.py @@ -0,0 +1,38 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def test_tasks_aur_loader( + mocker, swh_scheduler_celery_app, swh_scheduler_celery_worker, swh_config +): + mock_load = mocker.patch("swh.loader.package.aur.loader.AurLoader.load") + mock_load.return_value = {"status": "eventful"} + + res = swh_scheduler_celery_app.send_task( + "swh.loader.package.aur.tasks.LoadAur", + kwargs=dict( + url="https://somewhere/some-package.git", + artifacts=[ + { + "filename": "some-package.tar.gz", + "url": "https://somewhere/some-package.tar.gz", + "version": "0.0.1", + } + ], + aur_metadata=[ + { + "version": "0.0.1", + "project_url": "https://somewhere/some-package", + "last_update": "1970-01-01T21:08:14", + "pkgname": "some-package", + } + ], + ), + ) + assert res + res.wait() + assert res.successful() + assert mock_load.called + assert res.result == {"status": "eventful"} diff --git a/swh/loader/package/crates/loader.py b/swh/loader/package/crates/loader.py index c61d126..a2ebc2b 100644 --- a/swh/loader/package/crates/loader.py +++ b/swh/loader/package/crates/loader.py @@ -1,354 +1,354 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from distutils.version import StrictVersion import json from pathlib import Path from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple from urllib.parse import urlparse import attr import toml from typing_extensions import TypedDict from swh.loader.package.loader import BasePackageInfo,
PackageLoader from swh.loader.package.utils import api_info, cached_method, release_name from swh.model.model import ObjectType, Person, Release, Sha1Git, TimestampWithTimezone from swh.storage.interface import StorageInterface class ExtrinsicPackageMetadata(TypedDict): """Data structure for package extrinsic metadata pulled from http api endpoint. We set only the keys we need according to what is available when querying https://crates.io/api/v1/crates/, where `name` is the name of the crate package (see JSON response example at https://crates.io/api/v1/crates/hg-core). Usage example: .. code-block:: python e_metadata = ExtrinsicPackageMetadata(**self.info()) """ # noqa categories: List[Dict[Any, Any]] """Related categories""" crate: Dict[Any, Any] """Crate project information""" keywords: List[Any] """Keywords""" versions: List[Dict[Any, Any]] """A list of released versions for a crate""" class ExtrinsicVersionPackageMetadata(TypedDict): """Data structure for specific package version extrinsic metadata, pulled from http api endpoint. Similar to `ExtrinsicPackageMetadata` in its usage, but we flatten the data related to a specific version. """ crate: str """The package name""" crate_size: int """The package size""" created_at: str """First released at""" downloads: str """Number of downloads""" license: str """Package license""" num: str """Package version""" published_by: Dict[Any, Any] """Publishers information""" updated_at: str """Last update""" yanked: bool """Is that version yanked? (yanked means release-level deprecation)""" class IntrinsicPackageMetadata(TypedDict): """Data structure for specific package version intrinsic metadata. Data is extracted from the crate package's .toml file. Then the data of the 'package' entry is flattened. Cargo.toml file content example: .. code-block:: toml [package] name = "hg-core" version = "0.0.1" authors = ["Georges Racinet "] description = "Mercurial pure Rust core library, with no assumption on Python bindings (FFI)" homepage = "https://mercurial-scm.org" license = "GPL-2.0-or-later" repository = "https://www.mercurial-scm.org/repo/hg" [lib] name = "hg" [dev-dependencies.rand] version = "~0.6" [dev-dependencies.rand_pcg] version = "~0.1" :param toml: toml object """ name: str """The package name""" version: str """Package version""" authors: List[str] """Authors""" description: str """Package and release description""" homepage: str """Homepage of the project""" license: str """Package license""" repository: str """Source code repository""" @attr.s class CratesPackageInfo(BasePackageInfo): name = attr.ib(type=str) """Name of the package""" version = attr.ib(type=str) """Current version""" e_metadata: Dict[str, Any] = attr.ib(factory=ExtrinsicPackageMetadata) """Extrinsic package metadata, common to all versions""" e_metadata_version: Dict[str, Any] = attr.ib( factory=ExtrinsicVersionPackageMetadata ) """Extrinsic package metadata specific to a version""" i_metadata: Dict[str, Any] = attr.ib(factory=IntrinsicPackageMetadata) """Intrinsic metadata of the current package version""" def extract_intrinsic_metadata(dir_path: Path) -> Dict[str, Any]: """Extract intrinsic metadata from Cargo.toml file at dir_path. Each crate archive has a Cargo.toml at the root of the archive. 
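Note that the whole parsed file is returned; `build_release` below keeps only the `[package]` table and filters it down to the `IntrinsicPackageMetadata` keys.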
Args: dir_path: A directory on disk where a Cargo.toml must be present Returns: A dict mapping, as returned by the toml parser """ return toml.load(dir_path / "Cargo.toml") def extract_author(p_info: CratesPackageInfo) -> Person: """Extract package author from intrinsic metadata and return it as a `Person` model. Args: p_info: CratesPackageInfo that should contain i_metadata entries Returns: Only one author (Person) of the package. Currently limited by an internal detail of the swh stack (see T3887). """ authors = p_info.i_metadata["authors"] fullname = authors[0] # TODO: here we have a list of authors, see T3887 return Person.from_fullname(fullname.encode()) def extract_description(p_info: CratesPackageInfo) -> str: """Extract package description from intrinsic metadata and return it as a string. Args: p_info: CratesPackageInfo that should contain i_metadata entries Returns: Package description from metadata. """ return p_info.i_metadata["description"] class CratesLoader(PackageLoader[CratesPackageInfo]): """Load Crates package origins into swh archive.""" visit_type = "crates" def __init__( self, storage: StorageInterface, url: str, artifacts: List[Dict[str, Any]], - max_content_size: Optional[int] = None, + **kwargs, ): """Constructor Args: url: Origin url (e.g. https://crates.io/api/v1/crates/) artifacts: A list of dict listing all existing released versions for a package (Usually set with crates lister `extra_loader_arguments`). Each entry is a dict that should have a `url` (where to download a specific package version) and a `version` entry. Example:: [ { "version": <version>, "url": "https://static.crates.io/crates/<package>/<package>-<version>.crate", } ] """ # noqa - super().__init__(storage=storage, url=url, max_content_size=max_content_size) + super().__init__(storage=storage, url=url, **kwargs) self.url = url self.artifacts: Dict[str, Dict] = { artifact["version"]: artifact for artifact in artifacts } @cached_method def _raw_info(self) -> bytes: """Get crate metadata (fetched from http api endpoint set as self.url) Returns: Content response as bytes. Content response is a json document.
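(The same bytes are parsed by `info()` below with `json.loads`.)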
""" return api_info(self.url) @cached_method def info(self) -> Dict: """Parse http api json response and return the crate metadata information as a Dict.""" return json.loads(self._raw_info()) def get_versions(self) -> Sequence[str]: """Get all released versions of a crate Returns: A sequence of versions Example:: ["0.1.1", "0.10.2"] """ versions = list(self.artifacts.keys()) versions.sort(key=StrictVersion) return versions def get_default_version(self) -> str: """Get the newest release version of a crate Returns: A string representing a version Example:: "0.1.2" """ return self.get_versions()[-1] def get_package_info(self, version: str) -> Iterator[Tuple[str, CratesPackageInfo]]: """Get release name and package information from version Args: version: crate version (e.g: "0.1.0") Returns: Iterator of tuple (release_name, p_info) """ artifact = self.artifacts[version] filename = artifact["filename"] package_name = urlparse(self.url).path.split("/")[-1] url = artifact["url"] # Get extrinsic metadata from http api e_metadata = ExtrinsicPackageMetadata(**self.info()) # type: ignore[misc] # Extract crate info for current version (One .crate file for a given version) (crate_version,) = [ crate for crate in e_metadata["versions"] if crate["num"] == version ] e_metadata_version = ExtrinsicVersionPackageMetadata( # type: ignore[misc] **crate_version ) p_info = CratesPackageInfo( name=package_name, filename=filename, url=url, version=version, e_metadata=e_metadata, e_metadata_version=e_metadata_version, ) yield release_name(version, filename), p_info def build_release( self, p_info: CratesPackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Release]: # Extract intrinsic metadata from dir_path/Cargo.toml name = p_info.name version = p_info.version dir_path = Path(uncompressed_path, f"{name}-{version}") i_metadata_raw = extract_intrinsic_metadata(dir_path) # Get only corresponding key of IntrinsicPackageMetadata i_metadata_keys = [k for k in IntrinsicPackageMetadata.__annotations__.keys()] # We use data only from "package" entry i_metadata = { k: v for k, v in i_metadata_raw["package"].items() if k in i_metadata_keys } p_info.i_metadata = IntrinsicPackageMetadata(**i_metadata) # type: ignore[misc] author = extract_author(p_info) description = extract_description(p_info) message = ( f"Synthetic release for Crate source package {p_info.name} " - f"version {p_info.version}\n" + f"version {p_info.version}\n\n" f"{description}\n" ) # The only way to get a value for updated_at is through extrinsic metadata updated_at = p_info.e_metadata_version.get("updated_at") return Release( name=version.encode(), author=author, date=TimestampWithTimezone.from_iso8601(updated_at), message=message.encode(), target_type=ObjectType.DIRECTORY, target=directory, synthetic=True, ) diff --git a/swh/loader/package/crates/tasks.py b/swh/loader/package/crates/tasks.py index 9385263..0b7e24c 100644 --- a/swh/loader/package/crates/tasks.py +++ b/swh/loader/package/crates/tasks.py @@ -1,14 +1,14 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from celery import shared_task from swh.loader.package.crates.loader import CratesLoader @shared_task(name=__name__ + ".LoadCrates") -def load_crates(*, url=None, artifacts: list): +def load_crates(**kwargs): """Load Rust crate package""" - return 
CratesLoader.from_configfile(url=url, artifacts=artifacts).load() + return CratesLoader.from_configfile(**kwargs).load() diff --git a/swh/loader/package/crates/tests/test_crates.py b/swh/loader/package/crates/tests/test_crates.py index 8d7f750..1ff76f7 100644 --- a/swh/loader/package/crates/tests/test_crates.py +++ b/swh/loader/package/crates/tests/test_crates.py @@ -1,287 +1,287 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import pytest from swh.loader.package.crates.loader import CratesLoader from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats from swh.model.hashutil import hash_to_bytes from swh.model.model import ( ObjectType, Person, Release, Snapshot, SnapshotBranch, TargetType, TimestampWithTimezone, ) CRATES_EXTRA = [ { "url": "https://crates.io/api/v1/crates/hg-core", "artifacts": [ { "checksums": { "sha256": "48a45b46c2a8c38348adb1205b13c3c5eb0174e0c0fec52cc88e9fb1de14c54d", # noqa: B950 }, "filename": "hg-core-0.0.1.crate", "url": "https://static.crates.io/crates/hg-core/hg-core-0.0.1.crate", "version": "0.0.1", }, ], }, { "url": "https://crates.io/api/v1/crates/micro-timer", "artifacts": [ { "checksums": { "sha256": "69ad8fd116f8af0298ae4e83e587b1600af12709022471e25581c3aeb1da77ce", # noqa: B950 }, "filename": "micro-timer-0.1.0.crate", "url": "https://static.crates.io/crates/micro-timer/micro-timer-0.1.0.crate", "version": "0.1.0", }, { "checksums": { "sha256": "7b3f65fe0e109daad8d47e1938c9b5f9353efacd86bbe7ff013f84ae7ca758bf", # noqa: B950 }, "filename": "micro-timer-0.1.1.crate", "url": "https://static.crates.io/crates/micro-timer/micro-timer-0.1.1.crate", "version": "0.1.1", }, { "checksums": { "sha256": "16439fea388f712c1df7737ceb8f784d407844624b4796faf1e1bf8bbaa97445", # noqa: B950 }, "filename": "micro-timer-0.1.2.crate", "url": "https://static.crates.io/crates/micro-timer/micro-timer-0.1.2.crate", "version": "0.1.2", }, { "checksums": { "sha256": "336b4c0f071d16674747faa4643d742cc096fec2bf8cf01bb1a98d984bedcaf1", # noqa: B950 }, "filename": "micro-timer-0.2.0.crate", "url": "https://static.crates.io/crates/micro-timer/micro-timer-0.2.0.crate", "version": "0.2.0", }, { "checksums": { "sha256": "987429cd6162a80ed5ff44fc790f5090b1c6d617ac73a2e272965ed91201d79b", # noqa: B950 }, "filename": "micro-timer-0.2.1.crate", "url": "https://static.crates.io/crates/micro-timer/micro-timer-0.2.1.crate", "version": "0.2.1", }, { "checksums": { "sha256": "25b31d6cb9112984323d05d7a353f272ae5d7a307074f9ab9b25c00121b8c947", # noqa: B950 }, "filename": "micro-timer-0.3.0.crate", "url": "https://static.crates.io/crates/micro-timer/micro-timer-0.3.0.crate", "version": "0.3.0", }, { "checksums": { "sha256": "2620153e1d903d26b72b89f0e9c48d8c4756cba941c185461dddc234980c298c", # noqa: B950 }, "filename": "micro-timer-0.3.1.crate", "url": "https://static.crates.io/crates/micro-timer/micro-timer-0.3.1.crate", "version": "0.3.1", }, { "checksums": { "sha256": "5de32cb59a062672560d6f0842c4aa7714727457b9fe2daf8987d995a176a405", # noqa: B950 }, "filename": "micro-timer-0.4.0.crate", "url": "https://static.crates.io/crates/micro-timer/micro-timer-0.4.0.crate", "version": "0.4.0", }, ], }, ] def test_get_versions(requests_mock_datadir, swh_storage): loader = CratesLoader( swh_storage, url=CRATES_EXTRA[1]["url"], artifacts=CRATES_EXTRA[1]["artifacts"], ) assert 
loader.get_versions() == [ "0.1.0", "0.1.1", "0.1.2", "0.2.0", "0.2.1", "0.3.0", "0.3.1", "0.4.0", ] def test_get_default_version(requests_mock_datadir, swh_storage): loader = CratesLoader( swh_storage, url=CRATES_EXTRA[1]["url"], artifacts=CRATES_EXTRA[1]["artifacts"], ) assert loader.get_default_version() == "0.4.0" def test_crate_invalid_origin_archive_not_found(swh_storage, requests_mock_datadir): url = "https://nowhere-to-run/nowhere-to-hide" loader = CratesLoader( swh_storage, url, artifacts=[ { "filename": "nowhere-to-hide-0.0.1.crate", "url": "https://nowhere-to-run/nowhere-to-hide-0.0.1.crate", "version": "0.0.1", }, ], ) with pytest.raises(Exception): assert loader.load() == {"status": "failed"} assert_last_visit_matches( swh_storage, url, status="not_found", type="crates", snapshot=None ) def test_crates_loader_load_one_version(datadir, requests_mock_datadir, swh_storage): loader = CratesLoader( swh_storage, url=CRATES_EXTRA[0]["url"], artifacts=CRATES_EXTRA[0]["artifacts"], ) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None - expected_snapshot_id = "353cd6858c88ee8210432ea1098993c2e9966561" - expected_release_id = "d578833534017430f1b93eb741620899620c2505" + expected_snapshot_id = "b3affb4949eb89b244f0e1d1fe235fc1d26bde76" + expected_release_id = "237c4cdd44a90e620795e5a07ebcc72bc82487f7" assert expected_snapshot_id == actual_load_status["snapshot_id"] expected_snapshot = Snapshot( id=hash_to_bytes(actual_load_status["snapshot_id"]), branches={ b"releases/0.0.1/hg-core-0.0.1.crate": SnapshotBranch( target=hash_to_bytes(expected_release_id), target_type=TargetType.RELEASE, ), b"HEAD": SnapshotBranch( target=b"releases/0.0.1/hg-core-0.0.1.crate", target_type=TargetType.ALIAS, ), }, ) check_snapshot(expected_snapshot, swh_storage) stats = get_stats(swh_storage) assert { "content": 1, "directory": 2, "origin": 1, "origin_visit": 1, "release": 1, "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats assert swh_storage.release_get([hash_to_bytes(expected_release_id)])[0] == Release( name=b"0.0.1", - message=b"Synthetic release for Crate source package hg-core version " - b"0.0.1\nMercurial pure Rust core library, with no assumption " + message=b"Synthetic release for Crate source package hg-core version 0.0.1\n\n" + b"Mercurial pure Rust core library, with no assumption " b"on Python bindings (FFI)\n", target=hash_to_bytes("674c3b0b54628d55b93a79dc7adf304efc01b371"), target_type=ObjectType.DIRECTORY, synthetic=True, author=Person.from_fullname(b"Georges Racinet "), date=TimestampWithTimezone.from_iso8601("2019-04-16T18:48:11.404457+00:00"), id=hash_to_bytes(expected_release_id), ) def test_crates_loader_load_n_versions(datadir, requests_mock_datadir, swh_storage): url = CRATES_EXTRA[1]["url"] loader = CratesLoader( swh_storage, url=CRATES_EXTRA[1]["url"], artifacts=CRATES_EXTRA[1]["artifacts"], ) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None - expected_snapshot_id = "016cbbe3bb78424c35b898015a2d80d79359e2ad" + expected_snapshot_id = "3f8ca5908a570fa32270b07a0946bcffa88babd5" assert expected_snapshot_id == actual_load_status["snapshot_id"] expected_snapshot = Snapshot( id=hash_to_bytes(expected_snapshot_id), branches={ b"releases/0.4.0/micro-timer-0.4.0.crate": SnapshotBranch( - target=hash_to_bytes("3237c1174c4ccfa8e934d1bfd8d80b3a89760e39"), + 
target=hash_to_bytes("b038a927244c852fb3794aecbebdc70f68ddf067"), target_type=TargetType.RELEASE, ), b"releases/0.3.1/micro-timer-0.3.1.crate": SnapshotBranch( - target=hash_to_bytes("8b727a280051cdb90468ede2746e176e6fdf355f"), + target=hash_to_bytes("ea331a2ce755e6f0cd9d05c9be52accde68536c4"), target_type=TargetType.RELEASE, ), b"releases/0.3.0/micro-timer-0.3.0.crate": SnapshotBranch( - target=hash_to_bytes("f45ec236ae50fb37e924a3d2cc093e72b6cbf1cd"), + target=hash_to_bytes("7ea45f915ace083ed361bb12593625bf4cf1f5f2"), target_type=TargetType.RELEASE, ), b"releases/0.2.1/micro-timer-0.2.1.crate": SnapshotBranch( - target=hash_to_bytes("50a60a2c3696df7cd1b623bd7dbea2c89b994e42"), + target=hash_to_bytes("074f27605be8b759e5d7c638f026aac3709f58e5"), target_type=TargetType.RELEASE, ), b"releases/0.2.0/micro-timer-0.2.0.crate": SnapshotBranch( - target=hash_to_bytes("f0592dc0ae05399d872017d0260c45b875cb590e"), + target=hash_to_bytes("a1d642aaa54c5361f67e57adbd86e01f3a3276f8"), target_type=TargetType.RELEASE, ), b"releases/0.1.2/micro-timer-0.1.2.crate": SnapshotBranch( - target=hash_to_bytes("9220d7823fc40ab44e3ae3227522e7de672fad3e"), + target=hash_to_bytes("60f18ae067ce235bc60243bf5cdaaae474b11978"), target_type=TargetType.RELEASE, ), b"releases/0.1.1/micro-timer-0.1.1.crate": SnapshotBranch( - target=hash_to_bytes("38529b7e355f79fdce31a3ba891e146174e10237"), + target=hash_to_bytes("fd6c55dfd016d58647a2d44b29a3fd4e3afa7671"), target_type=TargetType.RELEASE, ), b"releases/0.1.0/micro-timer-0.1.0.crate": SnapshotBranch( - target=hash_to_bytes("5e5e6120af55b65c577e09331df54e70fad5e8b0"), + target=hash_to_bytes("3e07559a4b366a397b1ca154e72753ce27223ca1"), target_type=TargetType.RELEASE, ), b"HEAD": SnapshotBranch( target=b"releases/0.4.0/micro-timer-0.4.0.crate", target_type=TargetType.ALIAS, ), }, ) check_snapshot(expected_snapshot, swh_storage) stats = get_stats(swh_storage) assert { "content": 8, "directory": 16, "origin": 1, "origin_visit": 1, "release": 8, "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats assert_last_visit_matches( swh_storage, url, status="full", type="crates", snapshot=expected_snapshot.id, ) diff --git a/swh/loader/package/deposit/tests/test_deposit.py b/swh/loader/package/deposit/tests/test_deposit.py index 181415c..ed540f9 100644 --- a/swh/loader/package/deposit/tests/test_deposit.py +++ b/swh/loader/package/deposit/tests/test_deposit.py @@ -1,565 +1,565 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import json import re import pytest from swh.core.pytest_plugin import requests_mock_datadir_factory from swh.loader.package.deposit.loader import ApiClient, DepositLoader from swh.loader.package.loader import now from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats from swh.model.hashutil import hash_to_bytes, hash_to_hex from swh.model.model import ( Origin, Person, RawExtrinsicMetadata, Release, Snapshot, SnapshotBranch, TargetType, TimestampWithTimezone, ) from swh.model.model import MetadataAuthority, MetadataAuthorityType, MetadataFetcher from swh.model.model import ObjectType as ModelObjectType from swh.model.swhids import CoreSWHID, ExtendedObjectType, ExtendedSWHID, ObjectType DEPOSIT_URL = "https://deposit.softwareheritage.org/1/private" @pytest.fixture def requests_mock_datadir(requests_mock_datadir): 
"""Enhance default mock data to mock put requests as the loader does some internal update queries there. """ requests_mock_datadir.put(re.compile("https")) return requests_mock_datadir def test_deposit_init_ok(swh_storage, deposit_client, swh_loader_config): url = "some-url" deposit_id = 999 loader = DepositLoader( swh_storage, url, deposit_id, deposit_client, default_filename="archive.zip" ) # Something that does not exist assert loader.origin.url == url assert loader.client is not None assert loader.client.base_url == swh_loader_config["deposit"]["url"] def test_deposit_from_configfile(swh_config): """Ensure the deposit instantiation is ok""" loader = DepositLoader.from_configfile( url="some-url", deposit_id="666", default_filename="archive.zip" ) assert isinstance(loader.client, ApiClient) def test_deposit_loading_unknown_deposit( swh_storage, deposit_client, requests_mock_datadir ): """Loading an unknown deposit should fail no origin, no visit, no snapshot """ # private api url form: 'https://deposit.s.o/1/private/hal/666/raw/' url = "some-url" unknown_deposit_id = 667 loader = DepositLoader( swh_storage, url, unknown_deposit_id, deposit_client, default_filename="archive.zip", ) # does not exist actual_load_status = loader.load() assert actual_load_status == {"status": "failed"} stats = get_stats(loader.storage) assert { "content": 0, "directory": 0, "origin": 0, "origin_visit": 0, "release": 0, "revision": 0, "skipped_content": 0, "snapshot": 0, } == stats requests_mock_datadir_missing_one = requests_mock_datadir_factory( ignore_urls=[ f"{DEPOSIT_URL}/666/raw/", ] ) def test_deposit_loading_failure_to_retrieve_1_artifact( swh_storage, deposit_client, requests_mock_datadir_missing_one ): """Deposit with missing artifact ends up with an uneventful/partial visit""" # private api url form: 'https://deposit.s.o/1/private/hal/666/raw/' url = "some-url-2" deposit_id = 666 requests_mock_datadir_missing_one.put(re.compile("https")) loader = DepositLoader( swh_storage, url, deposit_id, deposit_client, default_filename="archive.zip" ) actual_load_status = loader.load() assert actual_load_status["status"] == "uneventful" assert actual_load_status["snapshot_id"] is not None assert_last_visit_matches(loader.storage, url, status="partial", type="deposit") stats = get_stats(loader.storage) assert { "content": 0, "directory": 0, "origin": 1, "origin_visit": 1, "release": 0, "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats # Retrieve the information for deposit status update query to the deposit urls = [ m for m in requests_mock_datadir_missing_one.request_history if m.url == f"{DEPOSIT_URL}/{deposit_id}/update/" ] assert len(urls) == 1 update_query = urls[0] body = update_query.json() expected_body = { "status": "failed", "status_detail": { "loading": [ - "Failed to load branch HEAD for some-url-2: Fail to query " - "'https://deposit.softwareheritage.org/1/private/666/raw/'. 
Reason: 404" + "Failed to load branch HEAD for some-url-2: 404 Client Error: None " + "for url: https://deposit.softwareheritage.org/1/private/666/raw/" ] }, } assert body == expected_body def test_deposit_loading_ok(swh_storage, deposit_client, requests_mock_datadir): url = "https://hal-test.archives-ouvertes.fr/some-external-id" deposit_id = 666 loader = DepositLoader( swh_storage, url, deposit_id, deposit_client, default_filename="archive.zip" ) actual_load_status = loader.load() expected_snapshot_id = "338b45d87e02fb5cbf324694bc4a898623d6a30f" assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id, } assert_last_visit_matches( loader.storage, url, status="full", type="deposit", snapshot=hash_to_bytes(expected_snapshot_id), ) release_id_hex = "2566a64a27bc00362e265be9666d7606750530a1" release_id = hash_to_bytes(release_id_hex) expected_snapshot = Snapshot( id=hash_to_bytes(expected_snapshot_id), branches={ b"HEAD": SnapshotBranch( target=release_id, target_type=TargetType.RELEASE, ), }, ) check_snapshot(expected_snapshot, storage=loader.storage) release = loader.storage.release_get([release_id])[0] date = TimestampWithTimezone.from_datetime( datetime.datetime(2017, 10, 7, 15, 17, 8, tzinfo=datetime.timezone.utc) ) person = Person( fullname=b"Software Heritage", name=b"Software Heritage", email=b"robot@softwareheritage.org", ) assert release == Release( id=release_id, name=b"HEAD", message=b"hal: Deposit 666 in collection hal\n", author=person, date=date, target_type=ModelObjectType.DIRECTORY, target=b"\xfd-\xf1-\xc5SL\x1d\xa1\xe9\x18\x0b\x91Q\x02\xfbo`\x1d\x19", synthetic=True, metadata=None, ) # check metadata fetcher = MetadataFetcher( name="swh-deposit", version="0.0.1", ) authority = MetadataAuthority( type=MetadataAuthorityType.DEPOSIT_CLIENT, url="https://hal-test.archives-ouvertes.fr/", ) # Check origin metadata orig_meta = loader.storage.raw_extrinsic_metadata_get( Origin(url).swhid(), authority ) assert orig_meta.next_page_token is None raw_meta = loader.client.metadata_get(deposit_id) raw_metadata: str = raw_meta["raw_metadata"] # 2 raw metadata xml + 1 json dict assert len(orig_meta.results) == 2 orig_meta0 = orig_meta.results[0] assert orig_meta0.authority == authority assert orig_meta0.fetcher == fetcher # Check directory metadata assert release.target_type == ModelObjectType.DIRECTORY directory_swhid = CoreSWHID( object_type=ObjectType.DIRECTORY, object_id=release.target ) actual_dir_meta = loader.storage.raw_extrinsic_metadata_get( directory_swhid, authority ) assert actual_dir_meta.next_page_token is None assert len(actual_dir_meta.results) == 1 dir_meta = actual_dir_meta.results[0] assert dir_meta.authority == authority assert dir_meta.fetcher == fetcher assert dir_meta.metadata.decode() == raw_metadata # Retrieve the information for deposit status update query to the deposit urls = [ m for m in requests_mock_datadir.request_history if m.url == f"{DEPOSIT_URL}/{deposit_id}/update/" ] assert len(urls) == 1 update_query = urls[0] body = update_query.json() expected_body = { "status": "done", "release_id": release_id_hex, "directory_id": hash_to_hex(release.target), "snapshot_id": expected_snapshot_id, "origin_url": url, } assert body == expected_body stats = get_stats(loader.storage) assert { "content": 303, "directory": 12, "origin": 1, "origin_visit": 1, "release": 1, "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats def test_deposit_loading_ok_2(swh_storage, deposit_client, requests_mock_datadir): """Field dates should be 
se appropriately""" external_id = "some-external-id" url = f"https://hal-test.archives-ouvertes.fr/{external_id}" deposit_id = 777 loader = DepositLoader( swh_storage, url, deposit_id, deposit_client, default_filename="archive.zip" ) actual_load_status = loader.load() expected_snapshot_id = "3449b8ff31abeacefd33cca60e3074c1649dc3a1" assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id, } assert_last_visit_matches( loader.storage, url, status="full", type="deposit", snapshot=hash_to_bytes(expected_snapshot_id), ) release_id = "ba6c9a59ae3256e765d32b211cc183dc2380aed7" expected_snapshot = Snapshot( id=hash_to_bytes(expected_snapshot_id), branches={ b"HEAD": SnapshotBranch( target=hash_to_bytes(release_id), target_type=TargetType.RELEASE ) }, ) check_snapshot(expected_snapshot, storage=loader.storage) raw_meta = loader.client.metadata_get(deposit_id) # Ensure the date fields are set appropriately in the release # Retrieve the release release = loader.storage.release_get([hash_to_bytes(release_id)])[0] assert release # swh-deposit uses the numeric 'offset_minutes' instead of the bytes offset # attribute, because its dates are always well-formed, and it can only send # JSON-serializable data. release_date_dict = { "timestamp": release.date.timestamp.to_dict(), "offset": release.date.offset_minutes(), } assert release_date_dict == raw_meta["deposit"]["author_date"] assert not release.metadata provider = { "provider_name": "hal", "provider_type": "deposit_client", "provider_url": "https://hal-test.archives-ouvertes.fr/", "metadata": None, } tool = { "name": "swh-deposit", "version": "0.0.1", "configuration": {"sword_version": "2"}, } fetcher = MetadataFetcher( name="swh-deposit", version="0.0.1", ) authority = MetadataAuthority( type=MetadataAuthorityType.DEPOSIT_CLIENT, url="https://hal-test.archives-ouvertes.fr/", ) # Check the origin metadata swh side origin_extrinsic_metadata = loader.storage.raw_extrinsic_metadata_get( Origin(url).swhid(), authority ) assert origin_extrinsic_metadata.next_page_token is None raw_metadata: str = raw_meta["raw_metadata"] # 1 raw metadata xml + 1 json dict assert len(origin_extrinsic_metadata.results) == 2 origin_swhid = Origin(url).swhid() expected_metadata = [] origin_meta = origin_extrinsic_metadata.results[0] expected_metadata.append( RawExtrinsicMetadata( target=origin_swhid, discovery_date=origin_meta.discovery_date, metadata=raw_metadata.encode(), format="sword-v2-atom-codemeta-v2", authority=authority, fetcher=fetcher, ) ) origin_metadata = { "metadata": [raw_metadata], "provider": provider, "tool": tool, } expected_metadata.append( RawExtrinsicMetadata( target=origin_swhid, discovery_date=origin_extrinsic_metadata.results[-1].discovery_date, metadata=json.dumps(origin_metadata).encode(), format="original-artifacts-json", authority=authority, fetcher=fetcher, ) ) assert sorted(origin_extrinsic_metadata.results) == sorted(expected_metadata) # Check the release metadata swh side assert release.target_type == ModelObjectType.DIRECTORY directory_swhid = ExtendedSWHID( object_type=ExtendedObjectType.DIRECTORY, object_id=release.target ) actual_directory_metadata = loader.storage.raw_extrinsic_metadata_get( directory_swhid, authority ) assert actual_directory_metadata.next_page_token is None assert len(actual_directory_metadata.results) == 1 release_swhid = CoreSWHID( object_type=ObjectType.RELEASE, object_id=hash_to_bytes(release_id) ) dir_metadata_template = RawExtrinsicMetadata( target=directory_swhid, 
format="sword-v2-atom-codemeta-v2", authority=authority, fetcher=fetcher, origin=url, release=release_swhid, # to satisfy the constructor discovery_date=now(), metadata=b"", ) expected_directory_metadata = [] dir_metadata = actual_directory_metadata.results[0] expected_directory_metadata.append( RawExtrinsicMetadata.from_dict( { **{ k: v for (k, v) in dir_metadata_template.to_dict().items() if k != "id" }, "discovery_date": dir_metadata.discovery_date, "metadata": raw_metadata.encode(), } ) ) assert sorted(actual_directory_metadata.results) == sorted( expected_directory_metadata ) # Retrieve the information for deposit status update query to the deposit urls = [ m for m in requests_mock_datadir.request_history if m.url == f"{DEPOSIT_URL}/{deposit_id}/update/" ] assert len(urls) == 1 update_query = urls[0] body = update_query.json() expected_body = { "status": "done", "release_id": release_id, "directory_id": hash_to_hex(release.target), "snapshot_id": expected_snapshot_id, "origin_url": url, } assert body == expected_body def test_deposit_loading_ok_3(swh_storage, deposit_client, requests_mock_datadir): """Deposit loading can happen on tarball artifacts as well The latest deposit changes introduce the internal change. """ external_id = "hal-123456" url = f"https://hal-test.archives-ouvertes.fr/{external_id}" deposit_id = 888 loader = DepositLoader(swh_storage, url, deposit_id, deposit_client) actual_load_status = loader.load() expected_snapshot_id = "4677843de89e398f1d6bfedc9ca9b89c451c55c8" assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id, } assert_last_visit_matches( loader.storage, url, status="full", type="deposit", snapshot=hash_to_bytes(expected_snapshot_id), ) def test_deposit_loading_ok_release_notes( swh_storage, deposit_client, requests_mock_datadir ): url = "https://hal-test.archives-ouvertes.fr/some-external-id" deposit_id = 999 loader = DepositLoader( swh_storage, url, deposit_id, deposit_client, default_filename="archive.zip" ) actual_load_status = loader.load() expected_snapshot_id = "a307acffb7c29bebb3daf1bcb680bb3f452890a8" assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id, } assert_last_visit_matches( loader.storage, url, status="full", type="deposit", snapshot=hash_to_bytes(expected_snapshot_id), ) release_id_hex = "f5e8ec02ede57edbe061afa7fc2a07bb7d14a700" release_id = hash_to_bytes(release_id_hex) expected_snapshot = Snapshot( id=hash_to_bytes(expected_snapshot_id), branches={ b"HEAD": SnapshotBranch( target=release_id, target_type=TargetType.RELEASE, ), }, ) check_snapshot(expected_snapshot, storage=loader.storage) release = loader.storage.release_get([release_id])[0] date = TimestampWithTimezone.from_datetime( datetime.datetime(2017, 10, 7, 15, 17, 8, tzinfo=datetime.timezone.utc) ) person = Person( fullname=b"Software Heritage", name=b"Software Heritage", email=b"robot@softwareheritage.org", ) assert release == Release( id=release_id, name=b"HEAD", message=( b"hal: Deposit 999 in collection hal\n\nThis release adds this and that.\n" ), author=person, date=date, target_type=ModelObjectType.DIRECTORY, target=b"\xfd-\xf1-\xc5SL\x1d\xa1\xe9\x18\x0b\x91Q\x02\xfbo`\x1d\x19", synthetic=True, metadata=None, ) diff --git a/swh/loader/package/golang/__init__.py b/swh/loader/package/golang/__init__.py new file mode 100644 index 0000000..e36c6ce --- /dev/null +++ b/swh/loader/package/golang/__init__.py @@ -0,0 +1,17 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at 
the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +from typing import Any, Mapping + + +def register() -> Mapping[str, Any]: + """Register the current worker module's definition""" + from .loader import GolangLoader + + return { + "task_modules": [f"{__name__}.tasks"], + "loader": GolangLoader, + } diff --git a/swh/loader/package/golang/loader.py b/swh/loader/package/golang/loader.py new file mode 100644 index 0000000..9caff6a --- /dev/null +++ b/swh/loader/package/golang/loader.py @@ -0,0 +1,91 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import json +import logging +from typing import Iterator, Optional, Sequence, Tuple + +import attr + +from swh.loader.package.loader import BasePackageInfo, PackageLoader +from swh.loader.package.utils import EMPTY_AUTHOR, api_info, release_name +from swh.model.model import ObjectType, Release, Sha1Git, TimestampWithTimezone +from swh.storage.interface import StorageInterface + +logger = logging.getLogger(__name__) + + +@attr.s +class GolangPackageInfo(BasePackageInfo): + name = attr.ib(type=str) + timestamp = attr.ib(type=Optional[TimestampWithTimezone]) + + +class GolangLoader(PackageLoader[GolangPackageInfo]): + """Load Golang module zip file into SWH archive.""" + + visit_type = "golang" + GOLANG_PKG_DEV_URL = "https://pkg.go.dev" + GOLANG_PROXY_URL = "https://proxy.golang.org" + + def __init__( + self, + storage: StorageInterface, + url: str, + max_content_size: Optional[int] = None, + **kwargs, + ): + super().__init__(storage, url, max_content_size=max_content_size, **kwargs) + # The lister saves human-usable URLs, so we translate them to proxy URLs + # for use in the loader. 
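+ # For instance (hypothetical module), a lister URL such as
+ # https://pkg.go.dev/golang.org/x/text
+ # becomes the proxy URL
+ # https://proxy.golang.org/golang.org/x/text
+ # and self.name is set to "golang.org/x/text".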
+ # This URL format is detailed in https://go.dev/ref/mod#goproxy-protocol + assert url.startswith( + self.GOLANG_PKG_DEV_URL + ), "Go package URL (%s) not from %s" % (url, self.GOLANG_PKG_DEV_URL) + self.name = url[len(self.GOLANG_PKG_DEV_URL) + 1 :] + self.url = url.replace(self.GOLANG_PKG_DEV_URL, self.GOLANG_PROXY_URL) + + def get_versions(self) -> Sequence[str]: + return api_info(f"{self.url}/@v/list").decode().splitlines() + + def get_default_version(self) -> str: + latest = api_info(f"{self.url}/@latest") + return json.loads(latest)["Version"] + + def _raw_info(self, version: str) -> dict: + url = f"{self.url}/@v/{version}.info" + return json.loads(api_info(url)) + + def get_package_info(self, version: str) -> Iterator[Tuple[str, GolangPackageInfo]]: + # Encode the name because creating nested folders can become problematic + encoded_name = self.name.replace("/", "__") + filename = f"{encoded_name}-{version}.zip" + timestamp = TimestampWithTimezone.from_iso8601(self._raw_info(version)["Time"]) + p_info = GolangPackageInfo( + url=f"{self.url}/@v/{version}.zip", + filename=filename, + version=version, + timestamp=timestamp, + name=self.name, + ) + yield release_name(version), p_info + + def build_release( + self, p_info: GolangPackageInfo, uncompressed_path: str, directory: Sha1Git + ) -> Optional[Release]: + msg = ( + f"Synthetic release for Golang source package {p_info.name} " + f"version {p_info.version}\n" + ) + + return Release( + name=p_info.version.encode(), + message=msg.encode(), + date=p_info.timestamp, + author=EMPTY_AUTHOR, # Go modules offer very little metadata + target_type=ObjectType.DIRECTORY, + target=directory, + synthetic=True, + ) diff --git a/swh/loader/package/crates/tasks.py b/swh/loader/package/golang/tasks.py similarity index 51% copy from swh/loader/package/crates/tasks.py copy to swh/loader/package/golang/tasks.py index 9385263..167a437 100644 --- a/swh/loader/package/crates/tasks.py +++ b/swh/loader/package/golang/tasks.py @@ -1,14 +1,15 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from celery import shared_task -from swh.loader.package.crates.loader import CratesLoader +from swh.loader.package.golang.loader import GolangLoader -@shared_task(name=__name__ + ".LoadCrates") -def load_crates(*, url=None, artifacts: list): - """Load Rust crate package""" - return CratesLoader.from_configfile(url=url, artifacts=artifacts).load() +@shared_task(name=__name__ + ".LoadGolang") +def load_golang(**kwargs): + """Load Golang module""" + loader = GolangLoader.from_configfile(**kwargs) + return loader.load() diff --git a/swh/loader/package/golang/tests/__init__.py b/swh/loader/package/golang/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/swh/loader/package/golang/tests/data/https_proxy.golang.org/example.com_basic-go-module_@latest b/swh/loader/package/golang/tests/data/https_proxy.golang.org/example.com_basic-go-module_@latest new file mode 100644 index 0000000..ac70dcd --- /dev/null +++ b/swh/loader/package/golang/tests/data/https_proxy.golang.org/example.com_basic-go-module_@latest @@ -0,0 +1 @@ +{"Version":"v0.1.3","Time":"2022-03-15T13:54:34Z"} diff --git a/swh/loader/package/golang/tests/data/https_proxy.golang.org/example.com_basic-go-module_@v_list 
b/swh/loader/package/golang/tests/data/https_proxy.golang.org/example.com_basic-go-module_@v_list new file mode 100644 index 0000000..04e1946 --- /dev/null +++ b/swh/loader/package/golang/tests/data/https_proxy.golang.org/example.com_basic-go-module_@v_list @@ -0,0 +1 @@ +v0.1.3 diff --git a/swh/loader/package/golang/tests/data/https_proxy.golang.org/example.com_basic-go-module_@v_v0.1.3.info b/swh/loader/package/golang/tests/data/https_proxy.golang.org/example.com_basic-go-module_@v_v0.1.3.info new file mode 100644 index 0000000..52a74e2 --- /dev/null +++ b/swh/loader/package/golang/tests/data/https_proxy.golang.org/example.com_basic-go-module_@v_v0.1.3.info @@ -0,0 +1 @@ +{"Version":"v0.1.3","Time":"2022-03-17T15:42:55Z"} diff --git a/swh/loader/package/golang/tests/data/https_proxy.golang.org/example.com_basic-go-module_@v_v0.1.3.zip b/swh/loader/package/golang/tests/data/https_proxy.golang.org/example.com_basic-go-module_@v_v0.1.3.zip new file mode 100644 index 0000000..4e023fb Binary files /dev/null and b/swh/loader/package/golang/tests/data/https_proxy.golang.org/example.com_basic-go-module_@v_v0.1.3.zip differ diff --git a/swh/loader/package/golang/tests/test_golang.py b/swh/loader/package/golang/tests/test_golang.py new file mode 100644 index 0000000..63bde1b --- /dev/null +++ b/swh/loader/package/golang/tests/test_golang.py @@ -0,0 +1,13 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from swh.loader.package.golang.loader import GolangLoader + + +def test_golang_loader_first_visit(swh_storage, requests_mock_datadir): + url = "https://pkg.go.dev/example.com/basic-go-module" + loader = GolangLoader(swh_storage, url) + + assert loader.load()["status"] == "eventful" diff --git a/swh/loader/package/golang/tests/test_tasks.py b/swh/loader/package/golang/tests/test_tasks.py new file mode 100644 index 0000000..18819b9 --- /dev/null +++ b/swh/loader/package/golang/tests/test_tasks.py @@ -0,0 +1,21 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def test_tasks_golang_loader( + mocker, swh_scheduler_celery_app, swh_scheduler_celery_worker, swh_config +): + mock_load = mocker.patch("swh.loader.package.golang.loader.GolangLoader.load") + mock_load.return_value = {"status": "eventful"} + + res = swh_scheduler_celery_app.send_task( + "swh.loader.package.golang.tasks.LoadGolang", + kwargs={"url": "https://pkg.go.dev/golang.org/whatever/package"}, + ) + assert res + res.wait() + assert res.successful() + assert mock_load.called + assert res.result == {"status": "eventful"} diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py index 000ad98..96ff69e 100644 --- a/swh/loader/package/loader.py +++ b/swh/loader/package/loader.py @@ -1,1086 +1,1088 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import hashlib from itertools import islice import json import logging import os import string import sys import tempfile from typing import ( Any, Dict, Generic, 
Iterator, List, Mapping, Optional, Sequence, Set, Tuple, TypeVar, ) import attr from requests.exceptions import ContentDecodingError import sentry_sdk from swh.core.tarball import uncompress from swh.loader.core.loader import BaseLoader from swh.loader.exception import NotFound from swh.loader.package.utils import download from swh.model import from_disk from swh.model.hashutil import hash_to_hex from swh.model.model import ( ExtID, MetadataAuthority, MetadataAuthorityType, MetadataFetcher, ) from swh.model.model import ( Origin, OriginVisit, OriginVisitStatus, RawExtrinsicMetadata, Release, Revision, Sha1Git, Snapshot, ) from swh.model.model import ObjectType as ModelObjectType from swh.model.swhids import CoreSWHID, ExtendedObjectType, ExtendedSWHID, ObjectType from swh.storage.algos.snapshot import snapshot_get_latest from swh.storage.interface import StorageInterface from swh.storage.utils import now logger = logging.getLogger(__name__) SWH_METADATA_AUTHORITY = MetadataAuthority( type=MetadataAuthorityType.REGISTRY, url="https://softwareheritage.org/", metadata={}, ) """Metadata authority for extrinsic metadata generated by Software Heritage. Used for metadata on "original artifacts", ie. length, filename, and checksums of downloaded archive files.""" PartialExtID = Tuple[str, int, bytes] """The ``extid_type``, ``extid_version`` and ``extid`` fields of an :class:`ExtID` object.""" @attr.s class RawExtrinsicMetadataCore: """Contains the core of the metadata extracted by a loader, that will be used to build a full RawExtrinsicMetadata object by adding object identifier, context, and provenance information.""" format = attr.ib(type=str) metadata = attr.ib(type=bytes) discovery_date = attr.ib(type=Optional[datetime.datetime], default=None) """Defaults to the visit date.""" @attr.s class BasePackageInfo: """Base class for the information a package loader collects about one version of a package: enough (URL, file name, version, ...) to download and load its artifact(s). Loader-specific subclasses add their own fields.""" url = attr.ib(type=str) filename = attr.ib(type=Optional[str]) version = attr.ib(type=str) """Version name/number.""" MANIFEST_FORMAT: Optional[string.Template] = None """If not None, used by the default extid() implementation to format a manifest, before hashing it to produce an ExtID.""" EXTID_TYPE: str = "package-manifest-sha256" EXTID_VERSION: int = 0 # The following attribute has kw_only=True in order to allow subclasses # to add attributes. Without kw_only, attributes without default values cannot # go after attributes with default values.
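# A minimal sketch of why kw_only is needed (hypothetical subclass, standard
# attrs semantics; GolangPackageInfo above is a real instance of this pattern):
#
#     @attr.s
#     class MyPackageInfo(BasePackageInfo):
#         name = attr.ib(type=str)  # mandatory, no default
#
# Without kw_only on the defaulted attribute below, attrs would reject this
# subclass: a mandatory attribute may not follow an attribute with a default.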
# See directory_extrinsic_metadata = attr.ib( type=List[RawExtrinsicMetadataCore], default=[], kw_only=True, ) """:term:`extrinsic metadata` collected by the loader, that will be attached to the loaded directory and added to the Metadata storage.""" # TODO: add support for metadata for releases and contents def extid(self) -> Optional[PartialExtID]: """Returns a unique intrinsic identifier of this package info, or None if this package info is not 'deduplicatable' (meaning that we will always load it, instead of checking the ExtID storage to see if we already did)""" if self.MANIFEST_FORMAT is None: return None else: manifest = self.MANIFEST_FORMAT.substitute( {k: str(v) for (k, v) in attr.asdict(self).items()} ) return ( self.EXTID_TYPE, self.EXTID_VERSION, hashlib.sha256(manifest.encode()).digest(), ) TPackageInfo = TypeVar("TPackageInfo", bound=BasePackageInfo) class PackageLoader(BaseLoader, Generic[TPackageInfo]): def __init__(self, storage: StorageInterface, url: str, **kwargs: Any): """Loader's constructor. This raises exception if the minimal required configuration is missing (cf. fn:`check` method). Args: storage: Storage instance url: Origin url to load data from """ super().__init__(storage=storage, origin_url=url, **kwargs) def get_versions(self) -> Sequence[str]: """Return the list of all published package versions. Raises: class:`swh.loader.exception.NotFound` error when failing to read the published package versions. Returns: Sequence of published versions """ return [] def get_package_info(self, version: str) -> Iterator[Tuple[str, TPackageInfo]]: """Given a release version of a package, retrieve the associated package information for such version. Args: version: Package version Returns: (branch name, package metadata) """ yield from {} def build_release( self, p_info: TPackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Release]: """Build the release from the archive metadata (extrinsic artifact metadata) and the intrinsic metadata. Args: p_info: Package information uncompressed_path: Artifact uncompressed path on disk """ raise NotImplementedError("build_release") def get_default_version(self) -> str: """Retrieve the latest release version if any. Returns: Latest version """ return "" def last_snapshot(self) -> Optional[Snapshot]: """Retrieve the last snapshot out of the last visit.""" return snapshot_get_latest(self.storage, self.origin.url) def new_packageinfo_to_extid(self, p_info: TPackageInfo) -> Optional[PartialExtID]: return p_info.extid() def _get_known_extids( self, packages_info: List[TPackageInfo] ) -> Dict[PartialExtID, List[CoreSWHID]]: """Compute the ExtIDs from new PackageInfo objects, searches which are already loaded in the archive, and returns them if any.""" # Compute the ExtIDs of all the new packages, grouped by extid type new_extids: Dict[Tuple[str, int], List[bytes]] = {} for p_info in packages_info: res = p_info.extid() if res is not None: (extid_type, extid_version, extid_extid) = res new_extids.setdefault((extid_type, extid_version), []).append( extid_extid ) # For each extid type, call extid_get_from_extid() with all the extids of # that type, and store them in the '(type, extid) -> target' map. 
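# The resulting map has the shape (hypothetical digest and target):
#   {("package-manifest-sha256", 0, <sha256 digest>):
#        [<CoreSWHID of a release or revision>, ...]}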
known_extids: Dict[PartialExtID, List[CoreSWHID]] = {} for ((extid_type, extid_version), extids) in new_extids.items(): for extid in self.storage.extid_get_from_extid( extid_type, extids, version=extid_version ): if extid is not None: key = (extid.extid_type, extid_version, extid.extid) known_extids.setdefault(key, []).append(extid.target) return known_extids def resolve_object_from_extids( self, known_extids: Dict[PartialExtID, List[CoreSWHID]], p_info: TPackageInfo, whitelist: Set[Sha1Git], ) -> Optional[CoreSWHID]: """Resolve the revision/release from known ExtIDs and a package info object. If the artifact has already been downloaded, this will return the existing release (or revision) targeting that uncompressed artifact directory. Otherwise, this returns None. Args: known_extids: Dict built from a list of ExtID, with the target as value p_info: Package information whitelist: Any ExtID with target not in this set is filtered out Returns: None or release/revision SWHID """ new_extid = p_info.extid() if new_extid is None: return None extid_targets = set() for extid_target in known_extids.get(new_extid, []): if extid_target.object_id not in whitelist: # There is a known ExtID for this package, but its target is not # in the snapshot. # This can happen for three reasons: # # 1. a loader crashed after writing the ExtID, but before writing # the snapshot # 2. some other loader loaded the same artifact, but produced # a different revision, causing an additional ExtID object # to be written. We will probably find this loader's ExtID # in a future iteration of this loop. # Note that for now, this is impossible, as each loader has a # completely different extid_type, but this is an implementation # detail of each loader. # 3. we took a snapshot, then the package disappeared, # then we took another snapshot, and the package reappeared # # In case of 1, we must actually load the package now, # so let's do it. # TODO: detect when we are in case 3 using release_missing # or revision_missing instead of the snapshot. continue elif extid_target.object_type in (ObjectType.RELEASE, ObjectType.REVISION): extid_targets.add(extid_target) else: # Note that this case should never be reached unless there is a # collision between a revision hash and some non-revision object's # hash, but better safe than sorry. logger.warning( "%s is in the whitelist, but is not a revision/release.", hash_to_hex(extid_target.object_id), ) if extid_targets: # This is a known package version, as we have an extid to reference it. # Let's return one of them. # If there is a release extid, return it. release_extid_targets = { extid_target for extid_target in extid_targets if extid_target.object_type == ObjectType.RELEASE } # Exclude missing targets missing_releases = { CoreSWHID(object_type=ObjectType.RELEASE, object_id=id_) for id_ in self.storage.release_missing( [swhid.object_id for swhid in release_extid_targets] ) } if missing_releases: err_message = "Found ExtIDs pointing to missing releases" logger.error(err_message + ": %s", missing_releases) with sentry_sdk.push_scope() as scope: scope.set_extra( "missing_releases", [str(x) for x in missing_releases] ) sentry_sdk.capture_message(err_message, "error") release_extid_targets -= missing_releases extid_target2 = self.select_extid_target(p_info, release_extid_targets) if extid_target2: return extid_target2 # If there is no release extid (ie.
if the package was only loaded with # older versions of this loader, which produced revision objects instead # of releases), return a revision extid when possible. revision_extid_targets = { extid_target for extid_target in extid_targets if extid_target.object_type == ObjectType.REVISION } if revision_extid_targets: assert len(extid_targets) == 1, extid_targets extid_target = list(extid_targets)[0] return extid_target # No target found (this is probably a new package version) return None def select_extid_target( self, p_info: TPackageInfo, extid_targets: Set[CoreSWHID] ) -> Optional[CoreSWHID]: """Given a set of release extid targets, chooses one appropriate for the given package info. Package loaders should implement this if their ExtIDs may map to multiple releases, so they can fetch releases from the storage and inspect their fields to select the right one for this ``p_info``. """ if extid_targets: # The base package loader does not have the domain-specific knowledge # to select the right release -> crash if there is more than one. assert len(extid_targets) == 1, extid_targets return list(extid_targets)[0] return None def download_package( self, p_info: TPackageInfo, tmpdir: str ) -> List[Tuple[str, Mapping]]: """Download artifacts for a specific package. All downloads happen in the tmpdir folder. Default implementation expects the artifacts package info to be about one artifact per package. Note that most implementations have 1 artifact per package. But some implementations have multiple artifacts per package (debian), some have none, the package is the artifact (gnu). Args: artifacts_package_info: Information on the package artifacts to download (url, filename, etc...) tmpdir: Location to retrieve such artifacts Returns: List of (path, computed hashes) """ try: return [download(p_info.url, dest=tmpdir, filename=p_info.filename)] except ContentDecodingError: # package might be erroneously marked as gzip compressed while it is not, # try to download its raw bytes again without attempting to uncompress # the input stream return [ download( p_info.url, dest=tmpdir, filename=p_info.filename, extra_request_headers={"Accept-Encoding": "identity"}, ) ] def uncompress( self, dl_artifacts: List[Tuple[str, Mapping[str, Any]]], dest: str ) -> str: """Uncompress the artifact(s) in the destination folder dest. Optionally, this could need to use the p_info dict for some more information (debian). """ uncompressed_path = os.path.join(dest, "src") for a_path, _ in dl_artifacts: uncompress(a_path, dest=uncompressed_path) return uncompressed_path def extra_branches(self) -> Dict[bytes, Mapping[str, Any]]: """Return an extra dict of branches that are used to update the set of branches.
""" return {} def finalize_visit( self, *, snapshot: Optional[Snapshot], visit: OriginVisit, status_visit: str, status_load: str, failed_branches: List[str], errors: Optional[List[str]] = None, ) -> Dict[str, Any]: """Finalize the visit: - flush eventual unflushed data to storage - update origin visit's status - return the task's status """ self.storage.flush() snapshot_id: Optional[bytes] = None if snapshot and snapshot.id: # to prevent the snapshot.id to b"" snapshot_id = snapshot.id assert visit.visit visit_status = OriginVisitStatus( origin=self.origin.url, visit=visit.visit, type=self.visit_type, date=now(), status=status_visit, snapshot=snapshot_id, ) self.storage.origin_visit_status_add([visit_status]) result: Dict[str, Any] = { "status": status_load, } if snapshot_id: result["snapshot_id"] = hash_to_hex(snapshot_id) if failed_branches: logger.warning("%d failed branches", len(failed_branches)) for i, urls in enumerate(islice(failed_branches, 50)): prefix_url = "Failed branches: " if i == 0 else "" logger.warning("%s%s", prefix_url, urls) return result def load(self) -> Dict: """Load for a specific origin the associated contents. 1. Get the list of versions in an origin. 2. Get the snapshot from the previous run of the loader, and filter out versions that were already loaded, if their :term:`extids ` match Then, for each remaining version in the origin 3. Fetch the files for one package version By default, this can be implemented as a simple HTTP request. Loaders with more specific requirements can override this, e.g.: the PyPI loader checks the integrity of the downloaded files; the Debian loader has to download and check several files for one package version. 4. Extract the downloaded files. By default, this would be a universal archive/tarball extraction. Loaders for specific formats can override this method (for instance, the Debian loader uses dpkg-source -x). 5. Convert the extracted directory to a set of Software Heritage objects Using swh.model.from_disk. 6. Extract the metadata from the unpacked directories This would only be applicable for "smart" loaders like npm (parsing the package.json), PyPI (parsing the PKG-INFO file) or Debian (parsing debian/changelog and debian/control). On "minimal-metadata" sources such as the GNU archive, the lister should provide the minimal set of metadata needed to populate the revision/release objects (authors, dates) as an argument to the task. 7. Generate the revision/release objects for the given version. From the data generated at steps 3 and 4. end for each 8. Generate and load the snapshot for the visit Using the revisions/releases collected at step 7., and the branch information from step 2., generate a snapshot and load it into the Software Heritage archive """ status_load = "uneventful" # either: eventful, uneventful, failed status_visit = "full" # see swh.model.model.OriginVisitStatus snapshot = None failed_branches: List[str] = [] # Prepare origin and origin_visit origin = Origin(url=self.origin.url) try: self.storage.origin_add([origin]) visit = list( self.storage.origin_visit_add( [ OriginVisit( origin=self.origin.url, date=self.visit_date, type=self.visit_type, ) ] ) )[0] except Exception as e: logger.exception( "Failed to initialize origin_visit for %s", self.origin.url ) sentry_sdk.capture_exception(e) return {"status": "failed"} # Get the previous snapshot for this origin. It is then used to see which # of the package's versions are already loaded in the archive. 
try: last_snapshot = self.last_snapshot() logger.debug("last snapshot: %s", last_snapshot) except Exception as e: logger.exception("Failed to get previous state for %s", self.origin.url) sentry_sdk.capture_exception(e) return self.finalize_visit( snapshot=snapshot, visit=visit, failed_branches=failed_branches, status_visit="failed", status_load="failed", errors=[str(e)], ) load_exceptions: List[Exception] = [] # Get the list of all version names try: versions = self.get_versions() except NotFound as e: return self.finalize_visit( snapshot=snapshot, visit=visit, failed_branches=failed_branches, status_visit="not_found", status_load="failed", errors=[str(e)], ) except Exception as e: + logger.exception("Failed to get list of versions for %s", self.origin.url) sentry_sdk.capture_exception(e) return self.finalize_visit( snapshot=snapshot, visit=visit, failed_branches=failed_branches, status_visit="failed", status_load="failed", errors=[str(e)], ) # Get the metadata of each version's package packages_info: List[Tuple[str, TPackageInfo]] = [ (branch_name, p_info) for version in versions for (branch_name, p_info) in self.get_package_info(version) ] # Compute the ExtID of each of these packages known_extids = self._get_known_extids([p_info for (_, p_info) in packages_info]) if last_snapshot is None: last_snapshot_targets: Set[Sha1Git] = set() else: last_snapshot_targets = { branch.target for branch in last_snapshot.branches.values() } new_extids: Set[ExtID] = set() tmp_releases: Dict[str, List[Tuple[str, Sha1Git]]] = { version: [] for version in versions } errors = [] for (branch_name, p_info) in packages_info: logger.debug("package_info: %s", p_info) # Check if the package was already loaded, using its ExtID swhid = self.resolve_object_from_extids( known_extids, p_info, last_snapshot_targets ) if swhid is not None and swhid.object_type == ObjectType.REVISION: # This package was already loaded, but by an older version # of this loader, which produced revisions instead of releases. # Let's fetch the revision's data, and "upgrade" it into a release. (rev,) = self.storage.revision_get([swhid.object_id]) if not rev: logger.error( "Failed to upgrade branch %s from revision to " "release, %s is missing from the storage. " "Falling back to re-loading from the origin.", branch_name, swhid, ) else: rev = None if swhid is None or (swhid.object_type == ObjectType.REVISION and not rev): # No matching revision or release found in the last snapshot, load it. release_id = None try: res = self._load_release(p_info, origin) if res: (release_id, directory_id) = res assert release_id assert directory_id self._load_extrinsic_directory_metadata( p_info, release_id, directory_id ) self.storage.flush() status_load = "eventful" except Exception as e: self.storage.clear_buffers() load_exceptions.append(e) sentry_sdk.capture_exception(e) error = f"Failed to load branch {branch_name} for {self.origin.url}" logger.exception(error) failed_branches.append(branch_name) errors.append(f"{error}: {e}") continue if release_id is None: continue add_extid = True elif swhid.object_type == ObjectType.REVISION: # If 'rev' was None, the previous block would have run. 
assert rev is not None rel = rev2rel(rev, p_info.version) self.storage.release_add([rel]) logger.debug("Upgraded %s to %s", swhid, rel.swhid()) release_id = rel.id # Create a new extid for this package, so the next run of this loader # will be able to find the new release, and use it (instead of the # old revision) add_extid = True elif swhid.object_type == ObjectType.RELEASE: # This package was already loaded, nothing to do. release_id = swhid.object_id add_extid = False else: assert False, f"Unexpected object type: {swhid}" assert release_id is not None if add_extid: partial_extid = p_info.extid() if partial_extid is not None: (extid_type, extid_version, extid) = partial_extid release_swhid = CoreSWHID( object_type=ObjectType.RELEASE, object_id=release_id ) new_extids.add( ExtID( extid_type=extid_type, extid_version=extid_version, extid=extid, target=release_swhid, ) ) tmp_releases[p_info.version].append((branch_name, release_id)) if load_exceptions: status_visit = "partial" if not tmp_releases: # We could not load any releases; fail completely + logger.error("Failed to load any release for %s", self.origin.url) return self.finalize_visit( snapshot=snapshot, visit=visit, failed_branches=failed_branches, status_visit="failed", status_load="failed", errors=errors, ) try: # Retrieve the default release version (the "latest" one) default_version = self.get_default_version() logger.debug("default version: %s", default_version) # Retrieve extra branches extra_branches = self.extra_branches() logger.debug("extra branches: %s", extra_branches) snapshot = self._load_snapshot( default_version, tmp_releases, extra_branches ) self.storage.flush() except Exception as e: error = f"Failed to build snapshot for origin {self.origin.url}" logger.exception(error) errors.append(f"{error}: {e}") sentry_sdk.capture_exception(e) status_visit = "failed" status_load = "failed" if snapshot: try: metadata_objects = self.build_extrinsic_snapshot_metadata(snapshot.id) self.load_metadata_objects(metadata_objects) except Exception as e: error = ( f"Failed to load extrinsic snapshot metadata for {self.origin.url}" ) logger.exception(error) errors.append(f"{error}: {e}") sentry_sdk.capture_exception(e) status_visit = "partial" status_load = "failed" try: metadata_objects = self.build_extrinsic_origin_metadata() self.load_metadata_objects(metadata_objects) except Exception as e: error = f"Failed to load extrinsic origin metadata for {self.origin.url}" logger.exception(error) errors.append(f"{error}: {e}") sentry_sdk.capture_exception(e) status_visit = "partial" status_load = "failed" if status_load != "failed": self._load_extids(new_extids) return self.finalize_visit( snapshot=snapshot, visit=visit, failed_branches=failed_branches, status_visit=status_visit, status_load=status_load, errors=errors, ) def _load_directory( self, dl_artifacts: List[Tuple[str, Mapping[str, Any]]], tmpdir: str ) -> Tuple[str, from_disk.Directory]: uncompressed_path = self.uncompress(dl_artifacts, dest=tmpdir) logger.debug("uncompressed_path: %s", uncompressed_path) directory = from_disk.Directory.from_disk( path=uncompressed_path.encode("utf-8"), max_content_length=self.max_content_size, ) contents, skipped_contents, directories = from_disk.iter_directory(directory) logger.debug("Number of skipped contents: %s", len(skipped_contents)) self.storage.skipped_content_add(skipped_contents) logger.debug("Number of contents: %s", len(contents)) self.storage.content_add(contents) logger.debug("Number of directories: %s", len(directories)) 
self.storage.directory_add(directories) return (uncompressed_path, directory) def _load_release( self, p_info: TPackageInfo, origin ) -> Optional[Tuple[Sha1Git, Sha1Git]]: """Does all the loading of a release itself: * downloads a package and uncompresses it * loads it from disk * adds contents, directories, and release to self.storage * returns (release_id, directory_id) Raises exception when unable to download or uncompress artifacts """ with tempfile.TemporaryDirectory() as tmpdir: dl_artifacts = self.download_package(p_info, tmpdir) (uncompressed_path, directory) = self._load_directory(dl_artifacts, tmpdir) # FIXME: This should be release. cf. D409 release = self.build_release( p_info, uncompressed_path, directory=directory.hash ) if not release: # Some artifacts are missing intrinsic metadata # skipping those return None metadata = [metadata for (filepath, metadata) in dl_artifacts] assert release.target is not None, release assert release.target_type == ModelObjectType.DIRECTORY, release metadata_target = ExtendedSWHID( object_type=ExtendedObjectType.DIRECTORY, object_id=release.target ) original_artifact_metadata = RawExtrinsicMetadata( target=metadata_target, discovery_date=self.visit_date, authority=SWH_METADATA_AUTHORITY, fetcher=self.get_metadata_fetcher(), format="original-artifacts-json", metadata=json.dumps(metadata).encode(), origin=self.origin.url, release=release.swhid(), ) self.load_metadata_objects([original_artifact_metadata]) logger.debug("Release: %s", release) self.storage.release_add([release]) assert directory.hash return (release.id, directory.hash) def _load_snapshot( self, default_version: str, releases: Dict[str, List[Tuple[str, bytes]]], extra_branches: Dict[bytes, Mapping[str, Any]], ) -> Optional[Snapshot]: """Build snapshot out of the current releases stored and extra branches. Then load it in the storage. """ logger.debug("releases: %s", releases) # Build and load the snapshot branches = {} # type: Dict[bytes, Mapping[str, Any]] for version, branch_name_releases in releases.items(): if version == default_version and len(branch_name_releases) == 1: # only 1 branch (no ambiguity), we can create an alias # branch 'HEAD' branch_name, _ = branch_name_releases[0] # except for some corner case (deposit) if branch_name != "HEAD": branches[b"HEAD"] = { "target_type": "alias", "target": branch_name.encode("utf-8"), } for branch_name, target in branch_name_releases: branches[branch_name.encode("utf-8")] = { "target_type": "release", "target": target, } # Deal with extra-branches for name, branch_target in extra_branches.items(): if name in branches: error_message = f"Extra branch '{name!r}' has been ignored" logger.error(error_message) sentry_sdk.capture_message(error_message, "error") else: branches[name] = branch_target snapshot_data = {"branches": branches} logger.debug("snapshot: %s", snapshot_data) snapshot = Snapshot.from_dict(snapshot_data) logger.debug("snapshot: %s", snapshot) self.storage.snapshot_add([snapshot]) return snapshot def get_loader_name(self) -> str: """Returns a fully qualified name of this loader.""" return f"{self.__class__.__module__}.{self.__class__.__name__}" def get_loader_version(self) -> str: """Returns the version of the current loader.""" module_name = self.__class__.__module__ or "" module_name_parts = module_name.split(".") # Iterate rootward through the package hierarchy until we find a parent of this # loader's module with a __version__ attribute. 
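# For instance, for a loader class living in "swh.loader.package.golang.loader",
# the candidates are tried longest prefix first:
#   swh.loader.package.golang.loader, swh.loader.package.golang,
#   swh.loader.package, swh.loader, swh
# and the first module exposing __version__ provides the version.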
for prefix_size in range(len(module_name_parts), 0, -1): package_name = ".".join(module_name_parts[0:prefix_size]) module = sys.modules[package_name] if hasattr(module, "__version__"): return module.__version__ # If this loader's class has no parent package with a __version__, # it should implement it itself. raise NotImplementedError( f"Could not dynamically find the version of {self.get_loader_name()}." ) def get_metadata_fetcher(self) -> MetadataFetcher: """Returns a MetadataFetcher instance representing this package loader, which is used for adding provenance information to extracted extrinsic metadata, if any.""" return MetadataFetcher( name=self.get_loader_name(), version=self.get_loader_version(), metadata={}, ) def get_metadata_authority(self) -> MetadataAuthority: """For package loaders that get extrinsic metadata, returns the authority the metadata are coming from. """ raise NotImplementedError("get_metadata_authority") def get_extrinsic_origin_metadata(self) -> List[RawExtrinsicMetadataCore]: """Returns metadata items, used by build_extrinsic_origin_metadata.""" return [] def build_extrinsic_origin_metadata(self) -> List[RawExtrinsicMetadata]: """Builds a list of full RawExtrinsicMetadata objects, using metadata returned by get_extrinsic_origin_metadata.""" metadata_items = self.get_extrinsic_origin_metadata() if not metadata_items: # If this package loader doesn't write metadata, no need to require # an implementation for get_metadata_authority. return [] authority = self.get_metadata_authority() fetcher = self.get_metadata_fetcher() metadata_objects = [] for item in metadata_items: metadata_objects.append( RawExtrinsicMetadata( target=self.origin.swhid(), discovery_date=item.discovery_date or self.visit_date, authority=authority, fetcher=fetcher, format=item.format, metadata=item.metadata, ) ) return metadata_objects def get_extrinsic_snapshot_metadata(self) -> List[RawExtrinsicMetadataCore]: """Returns metadata items, used by build_extrinsic_snapshot_metadata.""" return [] def build_extrinsic_snapshot_metadata( self, snapshot_id: Sha1Git ) -> List[RawExtrinsicMetadata]: """Builds a list of full RawExtrinsicMetadata objects, using metadata returned by get_extrinsic_snapshot_metadata.""" metadata_items = self.get_extrinsic_snapshot_metadata() if not metadata_items: # If this package loader doesn't write metadata, no need to require # an implementation for get_metadata_authority. return [] authority = self.get_metadata_authority() fetcher = self.get_metadata_fetcher() metadata_objects = [] for item in metadata_items: metadata_objects.append( RawExtrinsicMetadata( target=ExtendedSWHID( object_type=ExtendedObjectType.SNAPSHOT, object_id=snapshot_id ), discovery_date=item.discovery_date or self.visit_date, authority=authority, fetcher=fetcher, format=item.format, metadata=item.metadata, origin=self.origin.url, ) ) return metadata_objects def build_extrinsic_directory_metadata( self, p_info: TPackageInfo, release_id: Sha1Git, directory_id: Sha1Git, ) -> List[RawExtrinsicMetadata]: if not p_info.directory_extrinsic_metadata: # If this package loader doesn't write metadata, no need to require # an implementation for get_metadata_authority.
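# (Loaders that do write such metadata return a concrete authority instead;
# e.g. the deposit loader exercised in the tests above uses
# MetadataAuthority(type=MetadataAuthorityType.DEPOSIT_CLIENT,
# url="https://hal-test.archives-ouvertes.fr/").)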
return [] authority = self.get_metadata_authority() fetcher = self.get_metadata_fetcher() metadata_objects = [] for item in p_info.directory_extrinsic_metadata: metadata_objects.append( RawExtrinsicMetadata( target=ExtendedSWHID( object_type=ExtendedObjectType.DIRECTORY, object_id=directory_id ), discovery_date=item.discovery_date or self.visit_date, authority=authority, fetcher=fetcher, format=item.format, metadata=item.metadata, origin=self.origin.url, release=CoreSWHID( object_type=ObjectType.RELEASE, object_id=release_id ), ) ) return metadata_objects def _load_extrinsic_directory_metadata( self, p_info: TPackageInfo, release_id: Sha1Git, directory_id: Sha1Git, ) -> None: metadata_objects = self.build_extrinsic_directory_metadata( p_info, release_id, directory_id ) self.load_metadata_objects(metadata_objects) def _load_extids(self, extids: Set[ExtID]) -> None: if not extids: return try: self.storage.extid_add(list(extids)) except Exception as e: logger.exception("Failed to load new ExtIDs for %s", self.origin.url) sentry_sdk.capture_exception(e) # No big deal, it just means the next visit will load the same versions # again. def rev2rel(rev: Revision, version: str) -> Release: """Converts a revision to a release.""" message = rev.message if message and not message.endswith(b"\n"): message += b"\n" return Release( name=version.encode(), message=message, target=rev.directory, target_type=ModelObjectType.DIRECTORY, synthetic=rev.synthetic, author=rev.author, date=rev.date, ) diff --git a/swh/loader/package/pubdev/__init__.py b/swh/loader/package/pubdev/__init__.py new file mode 100644 index 0000000..0ae96b2 --- /dev/null +++ b/swh/loader/package/pubdev/__init__.py @@ -0,0 +1,17 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +from typing import Any, Mapping + + +def register() -> Mapping[str, Any]: + """Register the current worker module's definition""" + from .loader import PubDevLoader + + return { + "task_modules": [f"{__name__}.tasks"], + "loader": PubDevLoader, + } diff --git a/swh/loader/package/pubdev/loader.py b/swh/loader/package/pubdev/loader.py new file mode 100644 index 0000000..2a0a944 --- /dev/null +++ b/swh/loader/package/pubdev/loader.py @@ -0,0 +1,194 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information +from distutils.version import LooseVersion +import json +from pathlib import Path +from typing import Any, Dict, Iterator, Optional, Sequence, Tuple + +import attr +import yaml + +from swh.loader.package.loader import BasePackageInfo, PackageLoader +from swh.loader.package.utils import ( + EMPTY_AUTHOR, + Person, + api_info, + cached_method, + release_name, +) +from swh.model.model import ObjectType, Release, Sha1Git, TimestampWithTimezone +from swh.storage.interface import StorageInterface + + +@attr.s +class PubDevPackageInfo(BasePackageInfo): + + name = attr.ib(type=str) + """Name of the package""" + + version = attr.ib(type=str) + """Current version""" + + last_modified = attr.ib(type=str) + """Last modified date as release date""" + + author = attr.ib(type=Person) + """Author""" + + description = attr.ib(type=str) + """Description""" + + +def 
extract_intrinsic_metadata(dir_path: Path) -> Dict[str, Any]: + """Extract intrinsic metadata from pubspec.yaml file at dir_path. + + Each pub.dev package version has a pubspec.yaml file at the root of the archive. + + See https://dart.dev/tools/pub/pubspec for pubspec specifications. + + Args: + dir_path: A directory on disk where a pubspec.yaml must be present + + Returns: + A dict mapping from yaml parser + """ + pubspec_path = dir_path / "pubspec.yaml" + return yaml.safe_load(pubspec_path.read_text()) + + +class PubDevLoader(PackageLoader[PubDevPackageInfo]): + visit_type = "pubdev" + + PUBDEV_BASE_URL = "https://pub.dev/" + + def __init__( + self, + storage: StorageInterface, + url: str, + **kwargs, + ): + + super().__init__(storage=storage, url=url, **kwargs) + self.url = url + assert url.startswith(self.PUBDEV_BASE_URL) + self.package_info_url = url.replace( + self.PUBDEV_BASE_URL, f"{self.PUBDEV_BASE_URL}api/" + ) + + def _raw_info(self) -> bytes: + return api_info(self.package_info_url) + + @cached_method + def info(self) -> Dict: + """Return the project metadata information (fetched from pub.dev registry)""" + # Use strict=False in order to correctly manage case where \n is present in a string + info = json.loads(self._raw_info(), strict=False) + # Arrange versions list as a new dict with `version` as key + versions = {v["version"]: v for v in info["versions"]} + info["versions"] = versions + return info + + def get_versions(self) -> Sequence[str]: + """Get all released versions of a PubDev package + + Returns: + A sequence of versions + + Example:: + + ["0.1.1", "0.10.2"] + """ + versions = list(self.info()["versions"].keys()) + versions.sort(key=LooseVersion) + return versions + + def get_default_version(self) -> str: + """Get the newest release version of a PubDev package + + Returns: + A string representing a version + + Example:: + + "0.1.2" + """ + latest = self.info()["latest"] + return latest["version"] + + def get_package_info(self, version: str) -> Iterator[Tuple[str, PubDevPackageInfo]]: + """Get release name and package information from version + + Package info comes from extrinsic metadata (from self.info()) + + Args: + version: Package version (e.g: "0.1.0") + + Returns: + Iterator of tuple (release_name, p_info) + """ + v = self.info()["versions"][version] + assert v["version"] == version + + url = v["archive_url"] + name = v["pubspec"]["name"] + filename = f"{name}-{version}.tar.gz" + last_modified = v["published"] + + if "authors" in v["pubspec"]: + # TODO: here we have a list of author, see T3887 + author = Person.from_fullname(v["pubspec"]["authors"][0].encode()) + elif "author" in v["pubspec"] and v["pubspec"]["author"] is not None: + author = Person.from_fullname(v["pubspec"]["author"].encode()) + else: + author = EMPTY_AUTHOR + + description = v["pubspec"]["description"] + + p_info = PubDevPackageInfo( + name=name, + filename=filename, + url=url, + version=version, + last_modified=last_modified, + author=author, + description=description, + ) + yield release_name(version), p_info + + def build_release( + self, p_info: PubDevPackageInfo, uncompressed_path: str, directory: Sha1Git + ) -> Optional[Release]: + + # Extract intrinsic metadata from uncompressed_path/pubspec.yaml + intrinsic_metadata = extract_intrinsic_metadata(Path(uncompressed_path)) + + name: str = intrinsic_metadata["name"] + version: str = intrinsic_metadata["version"] + assert version == p_info.version + + # author from intrinsic_metadata should not take precedence over the one + # returned 
by the api, see https://dart.dev/tools/pub/pubspec#authorauthors + author: Person = p_info.author + + if "description" in intrinsic_metadata and intrinsic_metadata["description"]: + description = intrinsic_metadata["description"] + else: + description = p_info.description + + message = ( + f"Synthetic release for pub.dev source package {name} " + f"version {version}\n\n" + f"{description}\n" + ) + + return Release( + name=version.encode(), + author=author, + date=TimestampWithTimezone.from_iso8601(p_info.last_modified), + message=message.encode(), + target_type=ObjectType.DIRECTORY, + target=directory, + synthetic=True, + ) diff --git a/swh/loader/package/crates/tasks.py b/swh/loader/package/pubdev/tasks.py similarity index 51% copy from swh/loader/package/crates/tasks.py copy to swh/loader/package/pubdev/tasks.py index 9385263..f6a2927 100644 --- a/swh/loader/package/crates/tasks.py +++ b/swh/loader/package/pubdev/tasks.py @@ -1,14 +1,14 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from celery import shared_task -from swh.loader.package.crates.loader import CratesLoader +from swh.loader.package.pubdev.loader import PubDevLoader -@shared_task(name=__name__ + ".LoadCrates") -def load_crates(*, url=None, artifacts: list): - """Load Rust crate package""" - return CratesLoader.from_configfile(url=url, artifacts=artifacts).load() +@shared_task(name=__name__ + ".LoadPubDev") +def load_pubdev(**kwargs): + """Load packages from pub.dev (Dart, Flutter)""" + return PubDevLoader.from_configfile(**kwargs).load() diff --git a/swh/loader/package/pubdev/tests/__init__.py b/swh/loader/package/pubdev/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/swh/loader/package/pubdev/tests/data/fake_pubdev.sh b/swh/loader/package/pubdev/tests/data/fake_pubdev.sh new file mode 100644 index 0000000..c4c33dd --- /dev/null +++ b/swh/loader/package/pubdev/tests/data/fake_pubdev.sh @@ -0,0 +1,194 @@ +#!/usr/bin/env bash + +# Script to generate fake pub.dev http api response and fake Dart or FLutter packages archives as .tar.gz. 
+ +set -euo pipefail + +# Create directories +readonly TMP=tmp_dir/pubdev +readonly BASE_API=https_pub.dev +readonly BASE_ARCHIVES=https_pub.dartlang.org + +mkdir -p $TMP +mkdir -p $BASE_API +mkdir -p $BASE_ARCHIVES + +# http api response as json +echo -e '''{"name":"authentication","latest":{"version":"0.0.1","pubspec":{"name":"authentication","description":"Persistent user authentication for Flutter with optional backend API integration.","version":"0.0.1","author":null,"homepage":null,"environment":{"sdk":">=2.7.0 <3.0.0","flutter":">=1.17.0 <2.0.0"},"dependencies":{"flutter":{"sdk":"flutter"}},"dev_dependencies":{"flutter_test":{"sdk":"flutter"}},"flutter":{"plugin":{"platforms":{"some_platform":{"pluginClass":"somePluginClass"}}}}},"archive_url":"https://pub.dartlang.org/packages/authentication/versions/0.0.1.tar.gz","archive_sha256":"0179334b346cb67e4e6e3c905e5cc5c8e488a45ebd99fd2be3a7e0476d620d99","published":"2020-08-13T04:53:34.134687Z"},"versions":[{"version":"0.0.1","pubspec":{"name":"authentication","description":"Persistent user authentication for Flutter with optional backend API integration.","version":"0.0.1","author":null,"homepage":null,"environment":{"sdk":">=2.7.0 <3.0.0","flutter":">=1.17.0 <2.0.0"},"dependencies":{"flutter":{"sdk":"flutter"}},"dev_dependencies":{"flutter_test":{"sdk":"flutter"}},"flutter":{"plugin":{"platforms":{"some_platform":{"pluginClass":"somePluginClass"}}}}},"archive_url":"https://pub.dartlang.org/packages/authentication/versions/0.0.1.tar.gz","archive_sha256":"0179334b346cb67e4e6e3c905e5cc5c8e488a45ebd99fd2be3a7e0476d620d99","published":"2020-08-13T04:53:34.134687Z"}]} +''' > $BASE_API/api_packages_authentication + +echo -e '''{"name":"Autolinker","latest":{"version":"0.1.1","pubspec":{"version":"0.1.1","homepage":"https://github.com/hackcave","description":"Port of Autolinker.js to dart","name":"Autolinker","author":"hackcave "},"archive_url":"https://pub.dartlang.org/packages/Autolinker/versions/0.1.1.tar.gz","published":"2014-12-24T22:34:02.534090Z"},"versions":[{"version":"0.1.1","pubspec":{"version":"0.1.1","homepage":"https://github.com/hackcave","description":"Port of Autolinker.js to dart","name":"Autolinker","author":"hackcave "},"archive_url":"https://pub.dartlang.org/packages/Autolinker/versions/0.1.1.tar.gz","published":"2014-12-24T22:34:02.534090Z"}]} +''' > ${BASE_API}/api_packages_Autolinker + +echo -e '''{"name":"bezier","latest":{"version":"1.1.5","pubspec":{"name":"bezier","version":"1.1.5","authors":["Aaron Barrett ","Isaac Barrett "],"description":"A 2D Bézier curve math library. Based heavily on the work of @TheRealPomax .\nLive examples can be found at .","homepage":"https://github.com/aab29/bezier.dart","environment":{"sdk":">=2.0.0 <3.0.0"},"dependencies":{"vector_math":"^2.0.0"},"dev_dependencies":{"test":"^1.0.0"}},"archive_url":"https://pub.dartlang.org/packages/bezier/versions/1.1.5.tar.gz","archive_sha256":"cc5da2fa927b5d347550f78d456cd984b7df78a7f0405119cdab12111e2f9ee8","published":"2019-12-22T03:17:30.805225Z"},"versions":[{"version":"1.1.5","pubspec":{"name":"bezier","version":"1.1.5","authors":["Aaron Barrett ","Isaac Barrett "],"description":"A 2D Bézier curve math library. 
Based heavily on the work of @TheRealPomax .\nLive examples can be found at .","homepage":"https://github.com/aab29/bezier.dart","environment":{"sdk":">=2.0.0 <3.0.0"},"dependencies":{"vector_math":"^2.0.0"},"dev_dependencies":{"test":"^1.0.0"}},"archive_url":"https://pub.dartlang.org/packages/bezier/versions/1.1.5.tar.gz","archive_sha256":"cc5da2fa927b5d347550f78d456cd984b7df78a7f0405119cdab12111e2f9ee8","published":"2019-12-22T03:17:30.805225Z"}]} +''' > ${BASE_API}/api_packages_bezier + +echo -e '''{"name":"pdf","latest":{"version":"3.8.2","pubspec":{"name":"pdf","description":"A pdf producer for Dart. It can create pdf files for both web or flutter.","homepage":"https://github.com/DavBfr/dart_pdf/tree/master/pdf","repository":"https://github.com/DavBfr/dart_pdf","issue_tracker":"https://github.com/DavBfr/dart_pdf/issues","version":"3.8.2","environment":{"sdk":">=2.12.0 <3.0.0"},"dependencies":{"archive":"^3.1.0","barcode":">=2.2.0 <3.0.0","crypto":"^3.0.0","image":">=3.0.1 <4.0.0","meta":">=1.3.0 <2.0.0","path_parsing":">=0.2.0 <2.0.0","vector_math":"^2.1.0","xml":">=5.1.0 <7.0.0"},"dev_dependencies":{"flutter_lints":"^1.0.4","test":">=1.16.0 <2.0.0"}},"archive_url":"https://pub.dartlang.org/packages/pdf/versions/3.8.2.tar.gz","published":"2022-07-25T11:38:25.983876Z"},"versions":[{"version":"1.0.0","pubspec":{"version":"1.0.0","name":"pdf","dependencies":{"ttf_parser":"^1.0.0","vector_math":"^2.0.7","meta":"^1.1.5"},"author":"David PHAM-VAN ","description":"A pdf producer for Dart","homepage":"https://github.com/davbfr/dart_pdf","environment":{"sdk":">=1.8.0 <2.0.0"},"dev_dependencies":{"test":"any"}},"archive_url":"https://pub.dartlang.org/packages/pdf/versions/1.0.0.tar.gz","published":"2018-07-16T21:12:28.894137Z"},{"version":"3.8.2","pubspec":{"name":"pdf","description":"A pdf producer for Dart. 
It can create pdf files for both web or flutter.","homepage":"https://github.com/DavBfr/dart_pdf/tree/master/pdf","repository":"https://github.com/DavBfr/dart_pdf","issue_tracker":"https://github.com/DavBfr/dart_pdf/issues","version":"3.8.2","environment":{"sdk":">=2.12.0 <3.0.0"},"dependencies":{"archive":"^3.1.0","barcode":">=2.2.0 <3.0.0","crypto":"^3.0.0","image":">=3.0.1 <4.0.0","meta":">=1.3.0 <2.0.0","path_parsing":">=0.2.0 <2.0.0","vector_math":"^2.1.0","xml":">=5.1.0 <7.0.0"},"dev_dependencies":{"flutter_lints":"^1.0.4","test":">=1.16.0 <2.0.0"}},"archive_url":"https://pub.dartlang.org/packages/pdf/versions/3.8.2.tar.gz","published":"2022-07-25T11:38:25.983876Z"}]} +''' > ${BASE_API}/api_packages_pdf + +echo -e '''{"name":"abstract_io","latest":{"version":"0.1.2+6","pubspec":{"name":"abstract_io","description":"Abstract IO is designed to simplify and generalize saving data both localy and externaly","version":"0.1.2+6","author":"Anders Groeschel","repository":"https://github.com/AndersGroeschel/abstract_io","homepage":"https://github.com/AndersGroeschel/abstract_io","environment":{"sdk":">=2.7.0 <3.0.0"},"dependencies":{"flutter":{"sdk":"flutter"}}},"archive_url":"https://pub.dartlang.org/packages/abstract_io/versions/0.1.2%2B6.tar.gz","archive_sha256":"9557fd384730d92a046cfccdff9625f2d646657219d5a0e447cb7eb0fdf90f18","published":"2020-08-03T21:31:05.764846Z"},"versions":[{"version":"0.1.2+4","pubspec":{"name":"abstract_io","description":"Abstract IO is designed to simplify and generalize saving data both localy and externaly","version":"0.1.2+4","author":"Anders Groeschel","repository":"https://github.com/AndersGroeschel/abstract_io","homepage":"https://github.com/AndersGroeschel/abstract_io","environment":{"sdk":">=2.7.0 <3.0.0"},"dependencies":{"flutter":{"sdk":"flutter"}}},"archive_url":"https://pub.dartlang.org/packages/abstract_io/versions/0.1.2%2B4.tar.gz","archive_sha256":"df687ff2a92774db04a28167ccddbfe9c2fc1ea63c6ae05c3236552fe350bb68","published":"2020-08-03T20:14:38.116237Z"},{"version":"0.1.2+5","pubspec":{"name":"abstract_io","description":"Abstract IO is designed to simplify and generalize saving data both localy and externaly","version":"0.1.2+5","author":"Anders Groeschel","repository":"https://github.com/AndersGroeschel/abstract_io","homepage":"https://github.com/AndersGroeschel/abstract_io","environment":{"sdk":">=2.7.0 <3.0.0"},"dependencies":{"flutter":{"sdk":"flutter"}}},"archive_url":"https://pub.dartlang.org/packages/abstract_io/versions/0.1.2%2B5.tar.gz","archive_sha256":"fc9199c2f9879d3c0d140c05a2f8c537561af256d98d209b4ee102e8107ec2b9","published":"2020-08-03T21:09:20.329418Z"},{"version":"0.1.2+6","pubspec":{"name":"abstract_io","description":"Abstract IO is designed to simplify and generalize saving data both localy and externaly","version":"0.1.2+6","author":"Anders Groeschel","repository":"https://github.com/AndersGroeschel/abstract_io","homepage":"https://github.com/AndersGroeschel/abstract_io","environment":{"sdk":">=2.7.0 <3.0.0"},"dependencies":{"flutter":{"sdk":"flutter"}}},"archive_url":"https://pub.dartlang.org/packages/abstract_io/versions/0.1.2%2B6.tar.gz","archive_sha256":"9557fd384730d92a046cfccdff9625f2d646657219d5a0e447cb7eb0fdf90f18","published":"2020-08-03T21:31:05.764846Z"}]} +''' > ${BASE_API}/api_packages_abstract_io + +# Dart packages have a pubspec.yaml file at their root. Generate some of them.
+ +mkdir -p ${TMP}/packages_authentication_versions_0.0.1 +echo -e '''name: authentication +description: Persistent user authentication for Flutter with optional backend API integration. +version: 0.0.1 +author: +homepage: + +environment: + sdk: ">=2.7.0 <3.0.0" + flutter: ">=1.17.0 <2.0.0" + +dependencies: + flutter: + sdk: flutter + +dev_dependencies: + flutter_test: + sdk: flutter + +# For information on the generic Dart part of this file, see the +# following page: https://dart.dev/tools/pub/pubspec + +# The following section is specific to Flutter. +flutter: + # This section identifies this Flutter project as a plugin project. + # The 'pluginClass' and Android 'package' identifiers should not ordinarily + # be modified. They are used by the tooling to maintain consistency when + # adding or updating assets for this project. + plugin: + platforms: + # This plugin project was generated without specifying any + # platforms with the `--platform` argument. If you see the `fake_platform` map below, remove it and + # then add platforms following the instruction here: + # https://flutter.dev/docs/development/packages-and-plugins/developing-packages#plugin-platforms + # ------------------- + some_platform: + pluginClass: somePluginClass + # ------------------- + + # To add assets to your plugin package, add an assets section, like this: + # assets: + # - images/a_dot_burr.jpeg + # - images/a_dot_ham.jpeg + # + # For details regarding assets in packages, see + # https://flutter.dev/assets-and-images/#from-packages + # + # An image asset can refer to one or more resolution-specific "variants", see + # https://flutter.dev/assets-and-images/#resolution-aware. + + # To add custom fonts to your plugin package, add a fonts section here, + # in this "flutter" section. Each entry in this list should have a + # "family" key with the font family name, and a "fonts" key with a + # list giving the asset and other descriptors for the font. For + # example: + # fonts: + # - family: Schyler + # fonts: + # - asset: fonts/Schyler-Regular.ttf + # - asset: fonts/Schyler-Italic.ttf + # style: italic + # - family: Trajan Pro + # fonts: + # - asset: fonts/TrajanPro.ttf + # - asset: fonts/TrajanPro_Bold.ttf + # weight: 700 + # + # For details regarding fonts in packages, see + # https://flutter.dev/custom-fonts/#from-packages +''' > ${TMP}/packages_authentication_versions_0.0.1/pubspec.yaml + + +mkdir -p ${TMP}/packages_autolinker_versions_0.1.1 +echo -e '''name: Autolinker +version: 0.1.1 +author: hackcave +homepage: https://github.com/hackcave +description: + Port of Autolinker.js to dart +''' > ${TMP}/packages_autolinker_versions_0.1.1/pubspec.yaml + +mkdir -p ${TMP}/packages_bezier_versions_1.1.5 +echo -e '''name: bezier +version: 1.1.5 +authors: + - Aaron Barrett + - Isaac Barrett +description: >- + A 2D Bézier curve math library. Based heavily on the work of @TheRealPomax + . + + Live examples can be found at . 
+homepage: https://github.com/aab29/bezier.dart +environment: + sdk: ">=2.0.0 <3.0.0" +dependencies: + vector_math: ^2.0.0 +dev_dependencies: + test: ^1.0.0 +''' > ${TMP}/packages_bezier_versions_1.1.5/pubspec.yaml + +mkdir -p ${TMP}/packages_pdf_versions_1.0.0 +echo -e '''name: pdf +author: David PHAM-VAN +description: A pdf producer for Dart +homepage: https://github.com/davbfr/dart_pdf +version: 1.0.0 + +environment: + sdk: ">=1.8.0 <2.0.0" + +dependencies: + meta: "^1.1.5" + ttf_parser: "^1.0.0" + vector_math: "^2.0.7" + +dev_dependencies: + test: any +''' > ${TMP}/packages_pdf_versions_1.0.0/pubspec.yaml + +mkdir -p ${TMP}/packages_pdf_versions_3.8.2 +echo -e '''name: pdf +description: A pdf producer for Dart. It can create pdf files for both web or flutter. +homepage: https://github.com/DavBfr/dart_pdf/tree/master/pdf +repository: https://github.com/DavBfr/dart_pdf +issue_tracker: https://github.com/DavBfr/dart_pdf/issues +version: 3.8.2 + +environment: + sdk: ">=2.12.0 <3.0.0" + +dependencies: + archive: ^3.1.0 + barcode: ">=2.2.0 <3.0.0" + crypto: ^3.0.0 + image: ">=3.0.1 <4.0.0" + meta: ">=1.3.0 <2.0.0" + path_parsing: ">=0.2.0 <2.0.0" + vector_math: ^2.1.0 + xml: ">=5.1.0 <7.0.0" + +dev_dependencies: + flutter_lints: ^1.0.4 + test: ">=1.16.0 <2.0.0" +''' > ${TMP}/packages_pdf_versions_3.8.2/pubspec.yaml + +cd $TMP + +tar -czf packages_authentication_versions_0.0.1.tar.gz -C packages_authentication_versions_0.0.1 . +tar -czf packages_Autolinker_versions_0.1.1.tar.gz -C packages_autolinker_versions_0.1.1 . +tar -czf packages_bezier_versions_1.1.5.tar.gz -C packages_bezier_versions_1.1.5 . +tar -czf packages_pdf_versions_1.0.0.tar.gz -C packages_pdf_versions_1.0.0 . +tar -czf packages_pdf_versions_3.8.2.tar.gz -C packages_pdf_versions_3.8.2 . 
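+
+# Note: `tar -C <dir> .` stores the package files at the archive root
+# (pubspec.yaml without a leading directory component), matching the layout
+# of real pub.dev package tarballs.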
+ + +# Move .tar.gz archives to a servable directory +mv *.tar.gz ../../$BASE_ARCHIVES + +# Clean up removing tmp_dir +cd ../../ +rm -r tmp_dir/ diff --git a/swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_Autolinker_versions_0.1.1.tar.gz b/swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_Autolinker_versions_0.1.1.tar.gz new file mode 100644 index 0000000..5cdf2dd Binary files /dev/null and b/swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_Autolinker_versions_0.1.1.tar.gz differ diff --git a/swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_authentication_versions_0.0.1.tar.gz b/swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_authentication_versions_0.0.1.tar.gz new file mode 100644 index 0000000..4338014 Binary files /dev/null and b/swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_authentication_versions_0.0.1.tar.gz differ diff --git a/swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_bezier_versions_1.1.5.tar.gz b/swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_bezier_versions_1.1.5.tar.gz new file mode 100644 index 0000000..5a92354 Binary files /dev/null and b/swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_bezier_versions_1.1.5.tar.gz differ diff --git a/swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_pdf_versions_1.0.0.tar.gz b/swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_pdf_versions_1.0.0.tar.gz new file mode 100644 index 0000000..d30d19f Binary files /dev/null and b/swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_pdf_versions_1.0.0.tar.gz differ diff --git a/swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_pdf_versions_3.8.2.tar.gz b/swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_pdf_versions_3.8.2.tar.gz new file mode 100644 index 0000000..45e37e8 Binary files /dev/null and b/swh/loader/package/pubdev/tests/data/https_pub.dartlang.org/packages_pdf_versions_3.8.2.tar.gz differ diff --git a/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_Autolinker b/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_Autolinker new file mode 100644 index 0000000..b60f1d8 --- /dev/null +++ b/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_Autolinker @@ -0,0 +1,29 @@ +{ + "name": "Autolinker", + "latest": { + "version": "0.1.1", + "pubspec": { + "version": "0.1.1", + "homepage": "https://github.com/hackcave", + "description": "Port of Autolinker.js to dart", + "name": "Autolinker", + "author": "hackcave " + }, + "archive_url": "https://pub.dartlang.org/packages/Autolinker/versions/0.1.1.tar.gz", + "published": "2014-12-24T22:34:02.534090Z" + }, + "versions": [ + { + "version": "0.1.1", + "pubspec": { + "version": "0.1.1", + "homepage": "https://github.com/hackcave", + "description": "Port of Autolinker.js to dart", + "name": "Autolinker", + "author": "hackcave " + }, + "archive_url": "https://pub.dartlang.org/packages/Autolinker/versions/0.1.1.tar.gz", + "published": "2014-12-24T22:34:02.534090Z" + } + ] +} \ No newline at end of file diff --git a/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_abstract_io b/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_abstract_io new file mode 100644 index 0000000..1d00f61 --- /dev/null +++ b/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_abstract_io @@ -0,0 +1,93 @@ +{ + "name": 
"abstract_io", + "latest": { + "version": "0.1.2+6", + "pubspec": { + "name": "abstract_io", + "description": "Abstract IO is designed to simplify and generalize saving data both localy and externaly", + "version": "0.1.2+6", + "author": "Anders Groeschel", + "repository": "https://github.com/AndersGroeschel/abstract_io", + "homepage": "https://github.com/AndersGroeschel/abstract_io", + "environment": { + "sdk": ">=2.7.0 <3.0.0" + }, + "dependencies": { + "flutter": { + "sdk": "flutter" + } + } + }, + "archive_url": "https://pub.dartlang.org/packages/abstract_io/versions/0.1.2%2B6.tar.gz", + "archive_sha256": "9557fd384730d92a046cfccdff9625f2d646657219d5a0e447cb7eb0fdf90f18", + "published": "2020-08-03T21:31:05.764846Z" + }, + "versions": [ + { + "version": "0.1.2+4", + "pubspec": { + "name": "abstract_io", + "description": "Abstract IO is designed to simplify and generalize saving data both localy and externaly", + "version": "0.1.2+4", + "author": "Anders Groeschel", + "repository": "https://github.com/AndersGroeschel/abstract_io", + "homepage": "https://github.com/AndersGroeschel/abstract_io", + "environment": { + "sdk": ">=2.7.0 <3.0.0" + }, + "dependencies": { + "flutter": { + "sdk": "flutter" + } + } + }, + "archive_url": "https://pub.dartlang.org/packages/abstract_io/versions/0.1.2%2B4.tar.gz", + "archive_sha256": "df687ff2a92774db04a28167ccddbfe9c2fc1ea63c6ae05c3236552fe350bb68", + "published": "2020-08-03T20:14:38.116237Z" + }, + { + "version": "0.1.2+5", + "pubspec": { + "name": "abstract_io", + "description": "Abstract IO is designed to simplify and generalize saving data both localy and externaly", + "version": "0.1.2+5", + "author": "Anders Groeschel", + "repository": "https://github.com/AndersGroeschel/abstract_io", + "homepage": "https://github.com/AndersGroeschel/abstract_io", + "environment": { + "sdk": ">=2.7.0 <3.0.0" + }, + "dependencies": { + "flutter": { + "sdk": "flutter" + } + } + }, + "archive_url": "https://pub.dartlang.org/packages/abstract_io/versions/0.1.2%2B5.tar.gz", + "archive_sha256": "fc9199c2f9879d3c0d140c05a2f8c537561af256d98d209b4ee102e8107ec2b9", + "published": "2020-08-03T21:09:20.329418Z" + }, + { + "version": "0.1.2+6", + "pubspec": { + "name": "abstract_io", + "description": "Abstract IO is designed to simplify and generalize saving data both localy and externaly", + "version": "0.1.2+6", + "author": "Anders Groeschel", + "repository": "https://github.com/AndersGroeschel/abstract_io", + "homepage": "https://github.com/AndersGroeschel/abstract_io", + "environment": { + "sdk": ">=2.7.0 <3.0.0" + }, + "dependencies": { + "flutter": { + "sdk": "flutter" + } + } + }, + "archive_url": "https://pub.dartlang.org/packages/abstract_io/versions/0.1.2%2B6.tar.gz", + "archive_sha256": "9557fd384730d92a046cfccdff9625f2d646657219d5a0e447cb7eb0fdf90f18", + "published": "2020-08-03T21:31:05.764846Z" + } + ] +} \ No newline at end of file diff --git a/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_authentication b/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_authentication new file mode 100644 index 0000000..b4b312a --- /dev/null +++ b/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_authentication @@ -0,0 +1,77 @@ +{ + "name": "authentication", + "latest": { + "version": "0.0.1", + "pubspec": { + "name": "authentication", + "description": "Persistent user authentication for Flutter with optional backend API integration.", + "version": "0.0.1", + "author": null, + "homepage": null, + "environment": { + "sdk": 
">=2.7.0 <3.0.0", + "flutter": ">=1.17.0 <2.0.0" + }, + "dependencies": { + "flutter": { + "sdk": "flutter" + } + }, + "dev_dependencies": { + "flutter_test": { + "sdk": "flutter" + } + }, + "flutter": { + "plugin": { + "platforms": { + "some_platform": { + "pluginClass": "somePluginClass" + } + } + } + } + }, + "archive_url": "https://pub.dartlang.org/packages/authentication/versions/0.0.1.tar.gz", + "archive_sha256": "0179334b346cb67e4e6e3c905e5cc5c8e488a45ebd99fd2be3a7e0476d620d99", + "published": "2020-08-13T04:53:34.134687Z" + }, + "versions": [ + { + "version": "0.0.1", + "pubspec": { + "name": "authentication", + "description": "Persistent user authentication for Flutter with optional backend API integration.", + "version": "0.0.1", + "author": null, + "homepage": null, + "environment": { + "sdk": ">=2.7.0 <3.0.0", + "flutter": ">=1.17.0 <2.0.0" + }, + "dependencies": { + "flutter": { + "sdk": "flutter" + } + }, + "dev_dependencies": { + "flutter_test": { + "sdk": "flutter" + } + }, + "flutter": { + "plugin": { + "platforms": { + "some_platform": { + "pluginClass": "somePluginClass" + } + } + } + } + }, + "archive_url": "https://pub.dartlang.org/packages/authentication/versions/0.0.1.tar.gz", + "archive_sha256": "0179334b346cb67e4e6e3c905e5cc5c8e488a45ebd99fd2be3a7e0476d620d99", + "published": "2020-08-13T04:53:34.134687Z" + } + ] +} \ No newline at end of file diff --git a/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_bezier b/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_bezier new file mode 100644 index 0000000..efd84a4 --- /dev/null +++ b/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_bezier @@ -0,0 +1,55 @@ +{ + "name": "bezier", + "latest": { + "version": "1.1.5", + "pubspec": { + "name": "bezier", + "version": "1.1.5", + "authors": [ + "Aaron Barrett ", + "Isaac Barrett " + ], + "description": "A 2D Bézier curve math library. Based heavily on the work of @TheRealPomax .\nLive examples can be found at .", + "homepage": "https://github.com/aab29/bezier.dart", + "environment": { + "sdk": ">=2.0.0 <3.0.0" + }, + "dependencies": { + "vector_math": "^2.0.0" + }, + "dev_dependencies": { + "test": "^1.0.0" + } + }, + "archive_url": "https://pub.dartlang.org/packages/bezier/versions/1.1.5.tar.gz", + "archive_sha256": "cc5da2fa927b5d347550f78d456cd984b7df78a7f0405119cdab12111e2f9ee8", + "published": "2019-12-22T03:17:30.805225Z" + }, + "versions": [ + { + "version": "1.1.5", + "pubspec": { + "name": "bezier", + "version": "1.1.5", + "authors": [ + "Aaron Barrett ", + "Isaac Barrett " + ], + "description": "A 2D Bézier curve math library. 
Based heavily on the work of @TheRealPomax .\nLive examples can be found at .", + "homepage": "https://github.com/aab29/bezier.dart", + "environment": { + "sdk": ">=2.0.0 <3.0.0" + }, + "dependencies": { + "vector_math": "^2.0.0" + }, + "dev_dependencies": { + "test": "^1.0.0" + } + }, + "archive_url": "https://pub.dartlang.org/packages/bezier/versions/1.1.5.tar.gz", + "archive_sha256": "cc5da2fa927b5d347550f78d456cd984b7df78a7f0405119cdab12111e2f9ee8", + "published": "2019-12-22T03:17:30.805225Z" + } + ] +} \ No newline at end of file diff --git a/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_pdf b/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_pdf new file mode 100644 index 0000000..c015890 --- /dev/null +++ b/swh/loader/package/pubdev/tests/data/https_pub.dev/api_packages_pdf @@ -0,0 +1,88 @@ +{ + "name": "pdf", + "latest": { + "version": "3.8.2", + "pubspec": { + "name": "pdf", + "description": "A pdf producer for Dart. It can create pdf files for both web or flutter.", + "homepage": "https://github.com/DavBfr/dart_pdf/tree/master/pdf", + "repository": "https://github.com/DavBfr/dart_pdf", + "issue_tracker": "https://github.com/DavBfr/dart_pdf/issues", + "version": "3.8.2", + "environment": { + "sdk": ">=2.12.0 <3.0.0" + }, + "dependencies": { + "archive": "^3.1.0", + "barcode": ">=2.2.0 <3.0.0", + "crypto": "^3.0.0", + "image": ">=3.0.1 <4.0.0", + "meta": ">=1.3.0 <2.0.0", + "path_parsing": ">=0.2.0 <2.0.0", + "vector_math": "^2.1.0", + "xml": ">=5.1.0 <7.0.0" + }, + "dev_dependencies": { + "flutter_lints": "^1.0.4", + "test": ">=1.16.0 <2.0.0" + } + }, + "archive_url": "https://pub.dartlang.org/packages/pdf/versions/3.8.2.tar.gz", + "published": "2022-07-25T11:38:25.983876Z" + }, + "versions": [ + { + "version": "1.0.0", + "pubspec": { + "version": "1.0.0", + "name": "pdf", + "dependencies": { + "ttf_parser": "^1.0.0", + "vector_math": "^2.0.7", + "meta": "^1.1.5" + }, + "author": "David PHAM-VAN ", + "description": "A pdf producer for Dart", + "homepage": "https://github.com/davbfr/dart_pdf", + "environment": { + "sdk": ">=1.8.0 <2.0.0" + }, + "dev_dependencies": { + "test": "any" + } + }, + "archive_url": "https://pub.dartlang.org/packages/pdf/versions/1.0.0.tar.gz", + "published": "2018-07-16T21:12:28.894137Z" + }, + { + "version": "3.8.2", + "pubspec": { + "name": "pdf", + "description": "A pdf producer for Dart. 
It can create pdf files for both web or flutter.", + "homepage": "https://github.com/DavBfr/dart_pdf/tree/master/pdf", + "repository": "https://github.com/DavBfr/dart_pdf", + "issue_tracker": "https://github.com/DavBfr/dart_pdf/issues", + "version": "3.8.2", + "environment": { + "sdk": ">=2.12.0 <3.0.0" + }, + "dependencies": { + "archive": "^3.1.0", + "barcode": ">=2.2.0 <3.0.0", + "crypto": "^3.0.0", + "image": ">=3.0.1 <4.0.0", + "meta": ">=1.3.0 <2.0.0", + "path_parsing": ">=0.2.0 <2.0.0", + "vector_math": "^2.1.0", + "xml": ">=5.1.0 <7.0.0" + }, + "dev_dependencies": { + "flutter_lints": "^1.0.4", + "test": ">=1.16.0 <2.0.0" + } + }, + "archive_url": "https://pub.dartlang.org/packages/pdf/versions/3.8.2.tar.gz", + "published": "2022-07-25T11:38:25.983876Z" + } + ] +} \ No newline at end of file diff --git a/swh/loader/package/pubdev/tests/test_pubdev.py b/swh/loader/package/pubdev/tests/test_pubdev.py new file mode 100644 index 0000000..0979dfd --- /dev/null +++ b/swh/loader/package/pubdev/tests/test_pubdev.py @@ -0,0 +1,272 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import pytest + +from swh.loader.package.pubdev.loader import PubDevLoader +from swh.loader.package.utils import EMPTY_AUTHOR +from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats +from swh.model.hashutil import hash_to_bytes +from swh.model.model import ( + ObjectType, + Person, + Release, + Snapshot, + SnapshotBranch, + TargetType, + TimestampWithTimezone, +) + +EXPECTED_PACKAGES = [ + { + "url": "https://pub.dev/packages/Autolinker", # one version + }, + { + "url": "https://pub.dev/packages/pdf", # multiple versions + }, + { + "url": "https://pub.dev/packages/bezier", # multiple authors + }, + { + "url": "https://pub.dev/packages/authentication", # empty author + }, + { + "url": "https://pub.dev/packages/abstract_io", # loose version names + }, +] + + +def test_get_versions(requests_mock_datadir, swh_storage): + loader = PubDevLoader( + swh_storage, + url=EXPECTED_PACKAGES[1]["url"], + ) + assert loader.get_versions() == [ + "1.0.0", + "3.8.2", + ] + + +def test_get_loose_versions(requests_mock_datadir, swh_storage): + """Sometimes version names do not follow semver""" + loader = PubDevLoader( + swh_storage, + url=EXPECTED_PACKAGES[4]["url"], + ) + assert loader.get_versions() == ["0.1.2+4", "0.1.2+5", "0.1.2+6"] + + +def test_get_default_version(requests_mock_datadir, swh_storage): + loader = PubDevLoader( + swh_storage, + url=EXPECTED_PACKAGES[1]["url"], + ) + assert loader.get_default_version() == "3.8.2" + + +def test_pubdev_loader_load_one_version(datadir, requests_mock_datadir, swh_storage): + loader = PubDevLoader( + swh_storage, + url=EXPECTED_PACKAGES[0]["url"], + ) + load_status = loader.load() + assert load_status["status"] == "eventful" + assert load_status["snapshot_id"] is not None + + expected_snapshot_id = "245092931ba809e6c54ebda8f865fb5a969a4134" + expected_release_id = "919f267ea050539606344d49d14bf594c4386e5a" + + assert expected_snapshot_id == load_status["snapshot_id"] + + expected_snapshot = Snapshot( + id=hash_to_bytes(load_status["snapshot_id"]), + branches={ + b"releases/0.1.1": SnapshotBranch( + target=hash_to_bytes(expected_release_id), + target_type=TargetType.RELEASE, + ), + b"HEAD": SnapshotBranch( + target=b"releases/0.1.1",
target_type=TargetType.ALIAS, + ), + }, + ) + + check_snapshot(expected_snapshot, swh_storage) + + stats = get_stats(swh_storage) + assert { + "content": 1, + "directory": 1, + "origin": 1, + "origin_visit": 1, + "release": 1, + "revision": 0, + "skipped_content": 0, + "snapshot": 1, + } == stats + + assert swh_storage.release_get([hash_to_bytes(expected_release_id)])[0] == Release( + name=b"0.1.1", + message=b"Synthetic release for pub.dev source package Autolinker version" + b" 0.1.1\n\nPort of Autolinker.js to dart\n", + target=hash_to_bytes("3fb6d4f2c0334d1604357ae92b2dd38a55a78194"), + target_type=ObjectType.DIRECTORY, + synthetic=True, + author=Person( + fullname=b"hackcave ", + name=b"hackcave", + email=b"hackers@hackcave.org", + ), + date=TimestampWithTimezone.from_iso8601("2014-12-24T22:34:02.534090+00:00"), + id=hash_to_bytes(expected_release_id), + ) + + assert_last_visit_matches( + swh_storage, + url=EXPECTED_PACKAGES[0]["url"], + status="full", + type="pubdev", + snapshot=expected_snapshot.id, + ) + + +def test_pubdev_loader_load_multiple_versions( + datadir, requests_mock_datadir, swh_storage +): + loader = PubDevLoader( + swh_storage, + url=EXPECTED_PACKAGES[1]["url"], + ) + load_status = loader.load() + + assert load_status["status"] == "eventful" + assert load_status["snapshot_id"] is not None + + expected_snapshot_id = "43d5b68a9fa973aa95e56916aaef70841ccbc2a0" + + assert expected_snapshot_id == load_status["snapshot_id"] + + expected_snapshot = Snapshot( + id=hash_to_bytes(load_status["snapshot_id"]), + branches={ + b"releases/1.0.0": SnapshotBranch( + target=hash_to_bytes("fbf8e40af675096681954553d737861e10b57216"), + target_type=TargetType.RELEASE, + ), + b"releases/3.8.2": SnapshotBranch( + target=hash_to_bytes("627a5d586e3fb4e7319b17f1aee268fe2fb8e01c"), + target_type=TargetType.RELEASE, + ), + b"HEAD": SnapshotBranch( + target=b"releases/3.8.2", + target_type=TargetType.ALIAS, + ), + }, + ) + + check_snapshot(expected_snapshot, swh_storage) + + stats = get_stats(swh_storage) + assert { + "content": 1 + 1, + "directory": 1 + 1, + "origin": 1, + "origin_visit": 1, + "release": 1 + 1, + "revision": 0, + "skipped_content": 0, + "snapshot": 1, + } == stats + + assert_last_visit_matches( + swh_storage, + url=EXPECTED_PACKAGES[1]["url"], + status="full", + type="pubdev", + snapshot=expected_snapshot.id, + ) + + +def test_pubdev_loader_multiple_authors(datadir, requests_mock_datadir, swh_storage): + loader = PubDevLoader( + swh_storage, + url=EXPECTED_PACKAGES[2]["url"], + ) + load_status = loader.load() + assert load_status["status"] == "eventful" + assert load_status["snapshot_id"] is not None + + expected_snapshot_id = "4fa9f19d1d6ccc70921c8c50b278f510db63aa36" + expected_release_id = "538c98fd69a42d8d0561a7ca95b354de2143a3ab" + + assert expected_snapshot_id == load_status["snapshot_id"] + + expected_snapshot = Snapshot( + id=hash_to_bytes(load_status["snapshot_id"]), + branches={ + b"releases/1.1.5": SnapshotBranch( + target=hash_to_bytes(expected_release_id), + target_type=TargetType.RELEASE, + ), + b"HEAD": SnapshotBranch( + target=b"releases/1.1.5", + target_type=TargetType.ALIAS, + ), + }, + ) + + check_snapshot(expected_snapshot, swh_storage) + + release = swh_storage.release_get([hash_to_bytes(expected_release_id)])[0] + assert release.author == Person( + fullname=b"Aaron Barrett ", + name=b"Aaron Barrett", + email=b"aaron@aaronbarrett.com", + ) + + +def test_pubdev_loader_empty_author(datadir, requests_mock_datadir, swh_storage): + loader = PubDevLoader( + 
swh_storage, + url=EXPECTED_PACKAGES[3]["url"], + ) + + load_status = loader.load() + assert load_status["status"] == "eventful" + assert load_status["snapshot_id"] is not None + + expected_snapshot_id = "0c7fa6b9fced23c648d2093ad5597622683f8aed" + expected_release_id = "7d8c05181069aa1049a3f0bc1d13bedc34625d47" + + assert expected_snapshot_id == load_status["snapshot_id"] + + expected_snapshot = Snapshot( + id=hash_to_bytes(load_status["snapshot_id"]), + branches={ + b"releases/0.0.1": SnapshotBranch( + target=hash_to_bytes(expected_release_id), + target_type=TargetType.RELEASE, + ), + b"HEAD": SnapshotBranch( + target=b"releases/0.0.1", + target_type=TargetType.ALIAS, + ), + }, + ) + + check_snapshot(expected_snapshot, swh_storage) + + release = swh_storage.release_get([hash_to_bytes(expected_release_id)])[0] + assert release.author == EMPTY_AUTHOR + + +def test_pubdev_invalid_origin(swh_storage): + + with pytest.raises(AssertionError): + PubDevLoader( + swh_storage, + "http://nowhere/api/packages/42", + ) diff --git a/swh/loader/package/pubdev/tests/test_tasks.py b/swh/loader/package/pubdev/tests/test_tasks.py new file mode 100644 index 0000000..c5b2ce7 --- /dev/null +++ b/swh/loader/package/pubdev/tests/test_tasks.py @@ -0,0 +1,23 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def test_tasks_pubdev_loader( + mocker, swh_scheduler_celery_app, swh_scheduler_celery_worker, swh_config +): + mock_load = mocker.patch("swh.loader.package.pubdev.loader.PubDevLoader.load") + mock_load.return_value = {"status": "eventful"} + + res = swh_scheduler_celery_app.send_task( + "swh.loader.package.pubdev.tasks.LoadPubDev", + kwargs=dict( + url="https://pub.dev/packages/some-package", + ), + ) + assert res + res.wait() + assert res.successful() + assert mock_load.called + assert res.result == {"status": "eventful"} diff --git a/swh/loader/package/tests/test_utils.py b/swh/loader/package/tests/test_utils.py index 75373e7..bf1f4da 100644 --- a/swh/loader/package/tests/test_utils.py +++ b/swh/loader/package/tests/test_utils.py @@ -1,236 +1,273 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import os from unittest.mock import MagicMock from urllib.error import URLError from urllib.parse import quote import pytest +from requests.exceptions import HTTPError from swh.loader.exception import NotFound import swh.loader.package from swh.loader.package.utils import api_info, download, release_name def test_version_generation(): assert ( swh.loader.package.__version__ != "devel" ), "Make sure swh.loader.core is installed (e.g. pip install -e .)" @pytest.mark.fs def test_download_fail_to_download(tmp_path, requests_mock): url = "https://pypi.org/pypi/arrow/json" status_code = 404 requests_mock.get(url, status_code=status_code) - with pytest.raises(ValueError) as e: + with pytest.raises( + HTTPError, match=f"{status_code} Client Error: None for url: {url}" + ): download(url, tmp_path) - assert e.value.args[0] == "Fail to query '%s'. 
Reason: %s" % (url, status_code) - _filename = "requests-0.0.1.tar.gz" _data = "this is something" def _check_download_ok(url, dest, filename=_filename, hashes={}): actual_filepath, actual_hashes = download(url, dest, hashes=hashes) actual_filename = os.path.basename(actual_filepath) assert actual_filename == filename assert actual_hashes["length"] == len(_data) assert ( actual_hashes["checksums"]["sha1"] == "fdd1ce606a904b08c816ba84f3125f2af44d92b2" ) assert ( actual_hashes["checksums"]["sha256"] == "1d9224378d77925d612c9f926eb9fb92850e6551def8328011b6a972323298d5" ) @pytest.mark.fs def test_download_ok(tmp_path, requests_mock): """Download without issue should provide filename and hashes""" url = f"https://pypi.org/pypi/requests/{_filename}" requests_mock.get(url, text=_data, headers={"content-length": str(len(_data))}) _check_download_ok(url, dest=str(tmp_path)) @pytest.mark.fs def test_download_ok_no_header(tmp_path, requests_mock): """Download without issue should provide filename and hashes""" url = f"https://pypi.org/pypi/requests/{_filename}" requests_mock.get(url, text=_data) # no header information _check_download_ok(url, dest=str(tmp_path)) @pytest.mark.fs def test_download_ok_with_hashes(tmp_path, requests_mock): """Download without issue should provide filename and hashes""" url = f"https://pypi.org/pypi/requests/{_filename}" requests_mock.get(url, text=_data, headers={"content-length": str(len(_data))}) # good hashes for such file good = { "sha1": "fdd1ce606a904b08c816ba84f3125f2af44d92b2", "sha256": "1d9224378d77925d612c9f926eb9fb92850e6551def8328011b6a972323298d5", # noqa } _check_download_ok(url, dest=str(tmp_path), hashes=good) @pytest.mark.fs def test_download_fail_hashes_mismatch(tmp_path, requests_mock): """Mismatch hash after download should raise""" url = f"https://pypi.org/pypi/requests/{_filename}" requests_mock.get(url, text=_data, headers={"content-length": str(len(_data))}) # good hashes for such file good = { "sha1": "fdd1ce606a904b08c816ba84f3125f2af44d92b2", "sha256": "1d9224378d77925d612c9f926eb9fb92850e6551def8328011b6a972323298d5", # noqa } for hash_algo in good.keys(): wrong_hash = good[hash_algo].replace("1", "0") expected_hashes = good.copy() expected_hashes[hash_algo] = wrong_hash # set the wrong hash expected_msg = "Failure when fetching %s. 
" "Checksum mismatched: %s != %s" % ( url, wrong_hash, good[hash_algo], ) with pytest.raises(ValueError, match=expected_msg): download(url, dest=str(tmp_path), hashes=expected_hashes) @pytest.mark.fs def test_ftp_download_ok(tmp_path, mocker): """Download without issue should provide filename and hashes""" url = f"ftp://pypi.org/pypi/requests/{_filename}" cm = MagicMock() cm.getstatus.return_value = 200 cm.read.side_effect = [_data.encode(), b""] cm.__enter__.return_value = cm mocker.patch("swh.loader.package.utils.urlopen").return_value = cm _check_download_ok(url, dest=str(tmp_path)) @pytest.mark.fs def test_ftp_download_ko(tmp_path, mocker): """Download without issue should provide filename and hashes""" filename = "requests-0.0.1.tar.gz" url = "ftp://pypi.org/pypi/requests/%s" % filename mocker.patch("swh.loader.package.utils.urlopen").side_effect = URLError("FTP error") with pytest.raises(URLError): download(url, dest=str(tmp_path)) @pytest.mark.fs def test_download_with_redirection(tmp_path, requests_mock): """Download with redirection should use the targeted URL to extract filename""" url = "https://example.org/project/requests/download" redirection_url = f"https://example.org/project/requests/files/{_filename}" requests_mock.get(url, status_code=302, headers={"location": redirection_url}) requests_mock.get( redirection_url, text=_data, headers={"content-length": str(len(_data))} ) _check_download_ok(url, dest=str(tmp_path)) def test_download_extracting_filename_from_url(tmp_path, requests_mock): """Extracting filename from url must sanitize the filename first""" url = "https://example.org/project/requests-0.0.1.tar.gz?a=b&c=d&foo=bar" requests_mock.get( url, status_code=200, text=_data, headers={"content-length": str(len(_data))} ) _check_download_ok(url, dest=str(tmp_path)) @pytest.mark.fs @pytest.mark.parametrize( "filename", [f'"{_filename}"', _filename, '"filename with spaces.tar.gz"'] ) def test_download_filename_from_content_disposition(tmp_path, requests_mock, filename): """Filename should be extracted from content-disposition request header when available.""" url = "https://example.org/download/requests/tar.gz/v0.0.1" requests_mock.get( url, text=_data, headers={ "content-length": str(len(_data)), "content-disposition": f"attachment; filename={filename}", }, ) _check_download_ok(url, dest=str(tmp_path), filename=filename.strip('"')) @pytest.mark.fs @pytest.mark.parametrize("filename", ['"archive école.tar.gz"', "archive_école.tgz"]) def test_download_utf8_filename_from_content_disposition( tmp_path, requests_mock, filename ): """Filename should be extracted from content-disposition request header when available.""" url = "https://example.org/download/requests/tar.gz/v0.0.1" data = "this is something" requests_mock.get( url, text=data, headers={ "content-length": str(len(data)), "content-disposition": f"attachment; filename*=utf-8''{quote(filename)}", }, ) _check_download_ok(url, dest=str(tmp_path), filename=filename.strip('"')) def test_api_info_failure(requests_mock): """Failure to fetch info/release information should raise""" url = "https://pypi.org/pypi/requests/json" status_code = 400 requests_mock.get(url, status_code=status_code) with pytest.raises(NotFound) as e0: api_info(url) assert e0.value.args[0] == "Fail to query '%s'. 
Reason: %s" % (url, status_code) def test_api_info(requests_mock): """Fetching json info from pypi project should be ok""" url = "https://pypi.org/pypi/requests/json" requests_mock.get(url, text='{"version": "0.0.1"}') actual_info = json.loads(api_info(url)) assert actual_info == { "version": "0.0.1", } def test_release_name(): for version, filename, expected_release in [ ("0.0.1", None, "releases/0.0.1"), ("0.0.2", "something", "releases/0.0.2/something"), ]: assert release_name(version, filename) == expected_release + + +@pytest.fixture(autouse=True) +def mock_download_retry_sleep(mocker): + mocker.patch.object(download.retry, "sleep") + + +def test_download_retry(mocker, requests_mock, tmp_path): + url = f"https://example.org/project/requests/files/{_filename}" + + requests_mock.get( + url, + [ + {"status_code": 429}, + {"status_code": 429}, + { + "text": _data, + "headers": {"content-length": str(len(_data))}, + "status_code": 200, + }, + ], + ) + + _check_download_ok(url, dest=str(tmp_path)) + + +def test_download_retry_reraise(mocker, requests_mock, tmp_path): + url = f"https://example.org/project/requests/files/{_filename}" + + requests_mock.get( + url, + [{"status_code": 429}] * 5, + ) + + with pytest.raises(HTTPError): + _check_download_ok(url, dest=str(tmp_path)) diff --git a/swh/loader/package/utils.py b/swh/loader/package/utils.py index 0656eca..df3127c 100644 --- a/swh/loader/package/utils.py +++ b/swh/loader/package/utils.py @@ -1,185 +1,207 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import copy import functools import itertools import logging import os import re from typing import Callable, Dict, Optional, Tuple, TypeVar from urllib.parse import unquote, urlsplit from urllib.request import urlopen import requests +from requests.exceptions import HTTPError +from tenacity import retry +from tenacity.before_sleep import before_sleep_log +from tenacity.stop import stop_after_attempt +from tenacity.wait import wait_exponential from swh.loader.exception import NotFound from swh.loader.package import DEFAULT_PARAMS from swh.model.hashutil import HASH_BLOCK_SIZE, MultiHash from swh.model.model import Person logger = logging.getLogger(__name__) DOWNLOAD_HASHES = set(["sha1", "sha256", "length"]) EMPTY_AUTHOR = Person.from_fullname(b"") def api_info(url: str, **extra_params) -> bytes: """Basic api client to retrieve information on project. This deals with fetching json metadata about pypi projects. Args: url (str): The api url (e.g PyPI, npm, etc...) Raises: NotFound in case of query failures (for some reasons: 404, ...) Returns: The associated response's information """ response = requests.get(url, **{**DEFAULT_PARAMS, **extra_params}) if response.status_code != 200: raise NotFound(f"Fail to query '{url}'. 
Reason: {response.status_code}") return response.content def _content_disposition_filename(header: str) -> Optional[str]: fname = None fnames = re.findall(r"filename[\*]?=([^;]+)", header) if fnames and "utf-8''" in fnames[0].lower(): # RFC 5987 fname = re.sub("utf-8''", "", fnames[0], flags=re.IGNORECASE) fname = unquote(fname) elif fnames: fname = fnames[0] if fname: fname = os.path.basename(fname.strip().strip('"')) return fname +def _retry_if_throttling(retry_state) -> bool: + """Custom tenacity retry predicate for handling HTTP responses with + status code 429 (too many requests). + """ + attempt = retry_state.outcome + if attempt.failed: + exception = attempt.exception() + return ( + isinstance(exception, HTTPError) and exception.response.status_code == 429 + ) + return False + + +@retry( + retry=_retry_if_throttling, + wait=wait_exponential(exp_base=10), + stop=stop_after_attempt(max_attempt_number=5), + before_sleep=before_sleep_log(logger, logging.WARNING), + reraise=True, +) def download( url: str, dest: str, hashes: Dict = {}, filename: Optional[str] = None, auth: Optional[Tuple[str, str]] = None, extra_request_headers: Optional[Dict[str, str]] = None, ) -> Tuple[str, Dict]: """Download a remote tarball from url, uncompresses and computes swh hashes on it. Args: url: Artifact uri to fetch, uncompress and hash dest: Directory to write the archive to hashes: Dict of expected hashes (key is the hash algo) for the artifact to download (those hashes are expected to be hex string) auth: Optional tuple of login/password (for http authentication service, e.g. deposit) Raises: ValueError in case of any error when fetching/computing (length, checksums mismatched...) Returns: Tuple of local (filepath, hashes of filepath) """ params = copy.deepcopy(DEFAULT_PARAMS) if auth is not None: params["auth"] = auth if extra_request_headers is not None: params["headers"].update(extra_request_headers) # so the connection does not hang indefinitely (read/connection timeout) timeout = params.get("timeout", 60) if url.startswith("ftp://"): response = urlopen(url, timeout=timeout) chunks = (response.read(HASH_BLOCK_SIZE) for _ in itertools.count()) response_data = itertools.takewhile(bool, chunks) else: response = requests.get(url, **params, timeout=timeout, stream=True) - if response.status_code != 200: - raise ValueError( - "Fail to query '%s'. Reason: %s" % (url, response.status_code) - ) + response.raise_for_status() # update URL to response one as requests follow redirection by default # on GET requests url = response.url # try to extract filename from content-disposition header if available if filename is None and "content-disposition" in response.headers: filename = _content_disposition_filename( response.headers["content-disposition"] ) response_data = response.iter_content(chunk_size=HASH_BLOCK_SIZE) filename = filename if filename else os.path.basename(urlsplit(url).path) logger.debug("filename: %s", filename) filepath = os.path.join(dest, filename) logger.debug("filepath: %s", filepath) h = MultiHash(hash_names=DOWNLOAD_HASHES | set(hashes.keys())) with open(filepath, "wb") as f: for chunk in response_data: h.update(chunk) f.write(chunk) response.close() # Also check the expected hashes if provided if hashes: actual_hashes = h.hexdigest() for algo_hash in hashes.keys(): actual_digest = actual_hashes[algo_hash] expected_digest = hashes[algo_hash] if actual_digest != expected_digest: raise ValueError( "Failure when fetching %s. 
" "Checksum mismatched: %s != %s" % (url, expected_digest, actual_digest) ) computed_hashes = h.hexdigest() length = computed_hashes.pop("length") extrinsic_metadata = { "length": length, "filename": filename, "checksums": computed_hashes, "url": url, } logger.debug("extrinsic_metadata", extrinsic_metadata) return filepath, extrinsic_metadata def release_name(version: str, filename: Optional[str] = None) -> str: if filename: return "releases/%s/%s" % (version, filename) return "releases/%s" % version TReturn = TypeVar("TReturn") TSelf = TypeVar("TSelf") _UNDEFINED = object() def cached_method(f: Callable[[TSelf], TReturn]) -> Callable[[TSelf], TReturn]: cache_name = f"_cached_{f.__name__}" @functools.wraps(f) def newf(self): value = getattr(self, cache_name, _UNDEFINED) if value is _UNDEFINED: value = f(self) setattr(self, cache_name, value) return value return newf