diff --git a/PKG-INFO b/PKG-INFO index 03820e1..ba61e00 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,56 +1,56 @@ Metadata-Version: 2.1 Name: swh.loader.core -Version: 0.23.5 +Version: 0.25.0 Summary: Software Heritage Base Loader Home-page: https://forge.softwareheritage.org/diffusion/DLDBASE Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-loader-core Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-loader-core/ Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE License-File: AUTHORS Software Heritage - Loader foundations ====================================== The Software Heritage Loader Core is a low-level loading utilities and helpers used by :term:`loaders `. The main entry points are classes: - :class:`swh.loader.core.loader.BaseLoader` for loaders (e.g. svn) - :class:`swh.loader.core.loader.DVCSLoader` for DVCS loaders (e.g. hg, git, ...) - :class:`swh.loader.package.loader.PackageLoader` for Package loaders (e.g. PyPI, Npm, ...) Package loaders --------------- This package also implements many package loaders directly, out of convenience, as they usually are quite similar and each fits in a single file. They all roughly follow these steps, explained in the :py:meth:`swh.loader.package.loader.PackageLoader.load` documentation. See the :ref:`package-loader-tutorial` for details. 
VCS loaders ----------- Unlike package loaders, VCS loaders remain in separate packages, as they often need more advanced conversions and very VCS-specific operations. This usually involves getting the branches of a repository and recursively loading revisions in the history (and directory trees in these revisions), until a known revision is found diff --git a/swh.loader.core.egg-info/PKG-INFO b/swh.loader.core.egg-info/PKG-INFO index 03820e1..ba61e00 100644 --- a/swh.loader.core.egg-info/PKG-INFO +++ b/swh.loader.core.egg-info/PKG-INFO @@ -1,56 +1,56 @@ Metadata-Version: 2.1 Name: swh.loader.core -Version: 0.23.5 +Version: 0.25.0 Summary: Software Heritage Base Loader Home-page: https://forge.softwareheritage.org/diffusion/DLDBASE Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-loader-core Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-loader-core/ Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE License-File: AUTHORS Software Heritage - Loader foundations ====================================== The Software Heritage Loader Core is a low-level loading utilities and helpers used by :term:`loaders `. The main entry points are classes: - :class:`swh.loader.core.loader.BaseLoader` for loaders (e.g. svn) - :class:`swh.loader.core.loader.DVCSLoader` for DVCS loaders (e.g. hg, git, ...) - :class:`swh.loader.package.loader.PackageLoader` for Package loaders (e.g. 
PyPI, Npm, ...) Package loaders --------------- This package also implements many package loaders directly, out of convenience, as they usually are quite similar and each fits in a single file. They all roughly follow these steps, explained in the :py:meth:`swh.loader.package.loader.PackageLoader.load` documentation. See the :ref:`package-loader-tutorial` for details. VCS loaders ----------- Unlike package loaders, VCS loaders remain in separate packages, as they often need more advanced conversions and very VCS-specific operations. This usually involves getting the branches of a repository and recursively loading revisions in the history (and directory trees in these revisions), until a known revision is found diff --git a/swh/loader/package/opam/loader.py b/swh/loader/package/opam/loader.py index 69e1158..8a9b29c 100644 --- a/swh/loader/package/opam/loader.py +++ b/swh/loader/package/opam/loader.py @@ -1,216 +1,231 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import io import os from subprocess import PIPE, Popen, call from typing import Iterator, List, Optional, Tuple import attr from swh.loader.package.loader import BasePackageInfo, PackageLoader +from swh.loader.package.utils import cached_method from swh.model.model import Person, Revision, RevisionType, Sha1Git from swh.storage.interface import StorageInterface @attr.s class OpamPackageInfo(BasePackageInfo): author = attr.ib(type=Person) committer = attr.ib(type=Person) version = attr.ib(type=str) def opam_read( cmd: List[str], init_error_msg_if_any: Optional[str] = None ) -> Optional[str]: """This executes an opam command and returns the first line of the output. 
Args: cmd: Opam command to execute as a list of string init_error_msg_if_any: Error message to raise in case a problem occurs during initialization Raises: ValueError with the init_error_msg_if_any content in case stdout is not consumable and the variable is provided with non empty value. Returns: the first line of the executed command output """ with Popen(cmd, stdout=PIPE) as proc: if proc.stdout is not None: for line in io.TextIOWrapper(proc.stdout): # care only for the first line output result (mostly blank separated # values, callers will deal with the parsing of the line) return line elif init_error_msg_if_any: raise ValueError(init_error_msg_if_any) return None class OpamLoader(PackageLoader[OpamPackageInfo]): - """ - Load all versions of a given package in a given opam repository. + """Load all versions of a given package in a given opam repository. + + The state of the opam repository is stored in a directory called an opam root. This + folder is a requisite for the opam binary to actually list information on package. + + When initialize_opam_root is False (the default for production workers), the opam + root must already have been configured outside of the loading process. If not an + error is raised, thus failing the loading. - The state of the opam repository is stored in a directory called an - opam root. Either the opam root has been created by the loader and we - simply re-use it, either it doesn't exist yet and we create it on the - first package we try to load (next packages will be able to re-use it). + For standalone workers, initialize_opam_root must be set to True, so the ingestion + can take care of installing the required opam root properly. + + The remaining ingestion uses the opam binary to give the versions of the given + package. Then, for each version, the loader uses the opam binary to list the tarball + url to fetch and ingest. - Then we just ask the opam binary to give us the list of all versions of - the given package. 
For each version, we ask the opam binary to give us - the url to the tarball to archive. """ visit_type = "opam" def __init__( self, storage: StorageInterface, url: str, opam_root: str, opam_instance: str, opam_url: str, opam_package: str, max_content_size: Optional[int] = None, + initialize_opam_root: bool = False, ): super().__init__(storage=storage, url=url, max_content_size=max_content_size) self.opam_root = opam_root self.opam_instance = opam_instance self.opam_url = opam_url self.opam_package = opam_package + self.initialize_opam_root = initialize_opam_root - def get_versions(self) -> List[str]: - """First initialize the opam root directory if needed the start listing the package -versions. + def get_package_dir(self) -> str: + return ( + f"{self.opam_root}/repo/{self.opam_instance}/packages/{self.opam_package}" + ) + + def get_package_name(self, version: str) -> str: + return f"{self.opam_package}.{version}" + + def get_package_file(self, version: str) -> str: + return f"{self.get_package_dir()}/{self.get_package_name(version)}/opam" + + @cached_method + def _compute_versions(self) -> List[str]: + """Compute the versions using opam internals + + Raises: + ValueError in case the lister is not able to determine the list of versions + + Returns: + The list of versions for the package """ - if not os.path.isdir(self.opam_root): - if os.path.isfile(self.opam_root): - raise ValueError("invalid opam root") - else: - call( - [ - "opam", - "init", - "--reinit", - "--bare", - "--no-setup", - "--root", - self.opam_root, - self.opam_instance, - self.opam_url, - ] - ) - elif not os.path.isfile(os.path.join(self.opam_root, "config")): - raise ValueError("invalid opam root") - - versions = opam_read( - [ - "opam", - "show", - "--color", - "never", - "--normalise", - "--root", - self.opam_root, - "-f", - "all-versions", - self.opam_package, - ], - init_error_msg_if_any=( + # TODO: use `opam show` instead of this workaround when it supports the `--repo` + # flag + package_dir 
= self.get_package_dir() + if not os.path.exists(package_dir): + raise ValueError( f"can't get versions for package {self.opam_package} " - f"(at url {self.url}) from `opam show`" - ), - ) - return versions.split() if versions else [] + f"(at url {self.url})." + ) + versions = [ + ".".join(version.split(".")[1:]) for version in os.listdir(package_dir) + ] + if not versions: + raise ValueError( + f"can't get versions for package {self.opam_package} " + f"(at url {self.url})" + ) + versions.sort() + return versions - def get_default_version(self) -> str: + def get_versions(self) -> List[str]: + """First initialize the opam root directory if needed then start listing the + package versions. - init_error_msg = f"can't get default version for package {self.opam_package} \ - (at url {self.url}) from `opam show`" - # we only care about the first element of the first line - # which is the initial version - versions_ = opam_read( - [ - "opam", - "show", - "--color", - "never", - "--normalise", - "--root", - self.opam_root, - "-f", - "version", - self.opam_package, - ], - init_error_msg_if_any=init_error_msg, - ) - if not versions_: - raise ValueError(init_error_msg) - versions = versions_.split() - if len(versions) != 1: - raise ValueError(init_error_msg) - return versions[0] + Raises: + ValueError in case the lister is not able to determine the list of + versions or if the opam root directory is invalid. + + """ + if self.initialize_opam_root: + # for standalone loader (e.g docker), loader must initialize the opam root + # folder + call( + [ + "opam", + "init", + "--reinit", + "--bare", + "--no-setup", + "--root", + self.opam_root, + self.opam_instance, + self.opam_url, + ] + ) + else: + # for standard/production loaders, no need to initialize the opam root + # folder. 
It must be present though so check for it, if not present, raise + if not os.path.isfile(os.path.join(self.opam_root, "config")): + # so if not correctly setup, raise immediately + raise ValueError("Invalid opam root") + + return self._compute_versions() + + def get_default_version(self) -> str: + """Return the most recent version of the package as default.""" + return self._compute_versions()[-1] def get_enclosed_single_line_field(self, field, version) -> Optional[str]: + package_file = self.get_package_file(version) result = opam_read( [ "opam", "show", "--color", "never", + "--safe", "--normalise", "--root", self.opam_root, - "-f", + "--file", + package_file, + "--field", field, - f"{self.opam_package}.{version}", ] ) - # this needs to be cleaned up a bit (remove enclosing " and the trailing \n) - return result[1:-2] if result else None + # Sanitize the result if any (remove trailing \n and enclosing ") + return result.strip().strip('"') if result else None def get_package_info(self, version: str) -> Iterator[Tuple[str, OpamPackageInfo]]: - branch_name = f"{self.opam_package}.{version}" url = self.get_enclosed_single_line_field("url.src:", version) - if url is None: raise ValueError( f"can't get field url.src: for version {version} of package {self.opam_package} \ (at url {self.url}) from `opam show`" ) authors_field = self.get_enclosed_single_line_field("authors:", version) fullname = b"" if authors_field is None else str.encode(authors_field) author = Person(fullname=fullname, name=None, email=None) maintainer_field = self.get_enclosed_single_line_field("maintainer:", version) fullname = b"" if maintainer_field is None else str.encode(maintainer_field) committer = Person(fullname=fullname, name=None, email=None) - yield branch_name, OpamPackageInfo( + yield self.get_package_name(version), OpamPackageInfo( url=url, filename=None, author=author, committer=committer, version=version ) def build_revision( self, p_info: OpamPackageInfo, uncompressed_path: str, 
directory: Sha1Git ) -> Optional[Revision]: return Revision( type=RevisionType.TAR, author=p_info.author, committer=p_info.committer, message=str.encode(p_info.version), date=None, committer_date=None, parents=(), directory=directory, synthetic=True, ) diff --git a/swh/loader/package/opam/tests/test_opam.py b/swh/loader/package/opam/tests/test_opam.py index 9ffce58..7840466 100644 --- a/swh/loader/package/opam/tests/test_opam.py +++ b/swh/loader/package/opam/tests/test_opam.py @@ -1,176 +1,204 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from shutil import rmtree - from swh.loader.package.opam.loader import OpamLoader, OpamPackageInfo from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats from swh.model.hashutil import hash_to_bytes from swh.model.model import Person, Snapshot, SnapshotBranch, TargetType +def test_opam_loader_no_opam_repository_fails(swh_storage, tmpdir, datadir): + """Running opam loader without a prepared opam repository fails""" + opam_url = f"file://{datadir}/fake_opam_repo" + opam_root = tmpdir + opam_instance = "loadertest" + opam_package = "agrid" + url = f"opam+{opam_url}/packages/{opam_package}" + + loader = OpamLoader( + swh_storage, + url, + opam_root, + opam_instance, + opam_url, + opam_package, + initialize_opam_root=False, # The opam directory must be present + ) + + # No opam root directory initialization by the loader. So, as the opam root does + # not exist, the loading fails. That's the expected use for the production workers + # (whose opam_root maintenance will be externally managed). 
+ actual_load_status = loader.load() + + assert actual_load_status == {"status": "failed"} + + def test_opam_loader_one_version(tmpdir, requests_mock_datadir, datadir, swh_storage): opam_url = f"file://{datadir}/fake_opam_repo" - opam_root = tmpdir - # the directory should NOT exist, we just need an unique name, so we delete it - rmtree(tmpdir) - opam_instance = "loadertest" - opam_package = "agrid" url = f"opam+{opam_url}/packages/{opam_package}" loader = OpamLoader( - swh_storage, url, opam_root, opam_instance, opam_url, opam_package + swh_storage, + url, + opam_root, + opam_instance, + opam_url, + opam_package, + initialize_opam_root=True, ) actual_load_status = loader.load() expected_snapshot_id = hash_to_bytes("4e4bf977312460329d7f769b0be89937c9827efc") assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id.hex(), } target = b"S\x8c\x8aq\xdcy\xa4/0\xa0\xb2j\xeb\xc1\x16\xad\xce\x06\xeaV" expected_snapshot = Snapshot( id=expected_snapshot_id, branches={ b"HEAD": SnapshotBranch(target=b"agrid.0.1", target_type=TargetType.ALIAS,), b"agrid.0.1": SnapshotBranch( target=target, target_type=TargetType.REVISION, ), }, ) check_snapshot(expected_snapshot, swh_storage) assert_last_visit_matches( swh_storage, url, status="full", type="opam", snapshot=expected_snapshot_id ) stats = get_stats(swh_storage) assert { "content": 18, "directory": 8, "origin": 1, "origin_visit": 1, "release": 0, "revision": 1, "skipped_content": 0, "snapshot": 1, } == stats def test_opam_loader_many_version(tmpdir, requests_mock_datadir, datadir, swh_storage): opam_url = f"file://{datadir}/fake_opam_repo" - opam_root = tmpdir - # the directory should NOT exist, we just need an unique name, so we delete it - rmtree(tmpdir) - opam_instance = "loadertest" - opam_package = "directories" url = f"opam+{opam_url}/packages/{opam_package}" loader = OpamLoader( - swh_storage, url, opam_root, opam_instance, opam_url, opam_package + swh_storage, + url, + opam_root, + 
opam_instance, + opam_url, + opam_package, + initialize_opam_root=True, ) actual_load_status = loader.load() expected_snapshot_id = hash_to_bytes("1b49be175dcf17c0f568bcd7aac3d4faadc41249") assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id.hex(), } expected_snapshot = Snapshot( id=expected_snapshot_id, branches={ b"HEAD": SnapshotBranch( target=b"directories.0.3", target_type=TargetType.ALIAS, ), b"directories.0.1": SnapshotBranch( target=b"N\x92jA\xb2\x892\xeb\xcc\x9c\xa9\xb3\xea\xa7kz\xb08\xa6V", target_type=TargetType.REVISION, ), b"directories.0.2": SnapshotBranch( target=b"yj\xc9\x1a\x8f\xe0\xaa\xff[\x88\xffz" b"\x91C\xcc\x96\xb7\xd4\xf65", target_type=TargetType.REVISION, ), b"directories.0.3": SnapshotBranch( target=b"hA \xc4\xb5\x18A8\xb8C\x12\xa3\xa5T\xb7/v\x85X\xcb", target_type=TargetType.REVISION, ), }, ) check_snapshot(expected_snapshot, swh_storage) assert_last_visit_matches( swh_storage, url, status="full", type="opam", snapshot=expected_snapshot_id ) def test_opam_revision(tmpdir, requests_mock_datadir, swh_storage, datadir): opam_url = f"file://{datadir}/fake_opam_repo" - opam_root = tmpdir - # the directory should NOT exist, we just need an unique name, so we delete it - rmtree(tmpdir) - opam_instance = "loadertest" opam_package = "ocb" url = f"opam+{opam_url}/packages/{opam_package}" loader = OpamLoader( - swh_storage, url, opam_root, opam_instance, opam_url, opam_package + swh_storage, + url, + opam_root, + opam_instance, + opam_url, + opam_package, + initialize_opam_root=True, ) actual_load_status = loader.load() expected_snapshot_id = hash_to_bytes("398df115b9feb2f463efd21941d69b7d59cd9025") assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id.hex(), } info_iter = loader.get_package_info("0.1") branch_name, package_info = next(info_iter) expected_branch_name = "ocb.0.1" expected_package_info = OpamPackageInfo( url="https://github.com/OCamlPro/ocb/archive/0.1.tar.gz", 
filename=None, directory_extrinsic_metadata=[], author=Person( fullname=b"OCamlPro ", name=None, email=None ), committer=Person( fullname=b"OCamlPro ", name=None, email=None ), version="0.1", ) assert branch_name == expected_branch_name assert package_info == expected_package_info revision_id = b"o\xad\x7f=\x07\xbb\xaah\xdbI(\xb0'\x10z\xfc\xff\x06x\x1b" revision = swh_storage.revision_get([revision_id])[0] assert revision is not None assert revision.author == expected_package_info.author assert revision.committer == expected_package_info.committer