diff --git a/swh/loader/package/opam/loader.py b/swh/loader/package/opam/loader.py index 3fbb661..0b73b8a 100644 --- a/swh/loader/package/opam/loader.py +++ b/swh/loader/package/opam/loader.py @@ -1,222 +1,232 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import io import os from subprocess import PIPE, Popen, call from typing import Iterator, List, Optional, Tuple import attr from swh.loader.package.loader import BasePackageInfo, PackageLoader from swh.loader.package.utils import cached_method from swh.model.model import Person, Revision, RevisionType, Sha1Git from swh.storage.interface import StorageInterface @attr.s class OpamPackageInfo(BasePackageInfo): author = attr.ib(type=Person) committer = attr.ib(type=Person) version = attr.ib(type=str) def opam_read( cmd: List[str], init_error_msg_if_any: Optional[str] = None ) -> Optional[str]: """This executes an opam command and returns the first line of the output. Args: cmd: Opam command to execute as a list of string init_error_msg_if_any: Error message to raise in case a problem occurs during initialization Raises: ValueError with the init_error_msg_if_any content in case stdout is not consumable and the variable is provided with non empty value. Returns: the first line of the executed command output """ with Popen(cmd, stdout=PIPE) as proc: if proc.stdout is not None: for line in io.TextIOWrapper(proc.stdout): # care only for the first line output result (mostly blank separated # values, callers will deal with the parsing of the line) return line elif init_error_msg_if_any: raise ValueError(init_error_msg_if_any) return None class OpamLoader(PackageLoader[OpamPackageInfo]): - """ - Load all versions of a given package in a given opam repository. + """Load all versions of a given package in a given opam repository. 
+ + The state of the opam repository is stored in a directory called an opam root. This + folder is a requisite for the opam binary to actually list information on packages. + + When initialize_opam_root is False (the default for production workers), the opam + root must already have been configured outside of the loading process. If not, an + error is raised, thus failing the loading. - The state of the opam repository is stored in a directory called an - opam root. Either the opam root has been created by the loader and we - simply re-use it, either it doesn't exist yet and we create it on the - first package we try to load (next packages will be able to re-use it). + For standalone workers, initialize_opam_root must be set to True, so the ingestion + can take care of installing the required opam root properly. + + The rest of the ingestion uses the opam binary to list the versions of the given + package. Then, for each version, the loader uses the opam binary to list the tarball + url to fetch and ingest. - Then we just ask the opam binary to give us the list of all versions of - the given package. For each version, we ask the opam binary to give us - the url to the tarball to archive. 
""" visit_type = "opam" def __init__( self, storage: StorageInterface, url: str, opam_root: str, opam_instance: str, opam_url: str, opam_package: str, max_content_size: Optional[int] = None, + initialize_opam_root: bool = False, ): super().__init__(storage=storage, url=url, max_content_size=max_content_size) self.opam_root = opam_root self.opam_instance = opam_instance self.opam_url = opam_url self.opam_package = opam_package + self.initialize_opam_root = initialize_opam_root def get_package_dir(self) -> str: return ( f"{self.opam_root}/repo/{self.opam_instance}/packages/{self.opam_package}" ) def get_package_name(self, version: str) -> str: return f"{self.opam_package}.{version}" def get_package_file(self, version: str) -> str: return f"{self.get_package_dir()}/{self.get_package_name(version)}/opam" @cached_method def _compute_versions(self) -> List[str]: """Compute the versions using opam internals Raises: ValueError in case the lister is not able to determine the list of versions Returns: The list of versions for the package """ # HACK using opam internals (opam < 2.1) to list current package versions. We # need `opam show` to support the --repo flag but it does not currently so we # work around it. package_dir = self.get_package_dir() if not os.path.exists(package_dir): raise ValueError( f"can't get versions for package {self.opam_package} " f"(at url {self.url})." ) versions = [ ".".join(version.split(".")[1:]) for version in os.listdir(package_dir) ] if not versions: raise ValueError( f"can't get versions for package {self.opam_package} " f"(at url {self.url})" ) versions.sort() return versions def get_versions(self) -> List[str]: - """First initialize the opam root directory if needed then start listing the package - versions. + """First initialize the opam root directory if needed then start listing the + package versions. Raises: ValueError in case the lister is not able to determine the list of versions or if the opam root directory is invalid. 
""" - if not os.path.isdir(self.opam_root): - if os.path.isfile(self.opam_root): - raise ValueError("invalid opam root") - else: - call( - [ - "opam", - "init", - "--reinit", - "--bare", - "--no-setup", - "--root", - self.opam_root, - self.opam_instance, - self.opam_url, - ] - ) - elif not os.path.isfile(os.path.join(self.opam_root, "config")): - raise ValueError("invalid opam root") + if self.initialize_opam_root: + # for standalone loader (e.g docker), loader must initialize the opam root + # folder + call( + [ + "opam", + "init", + "--reinit", + "--bare", + "--no-setup", + "--root", + self.opam_root, + self.opam_instance, + self.opam_url, + ] + ) + else: + # for standard/production loaders, no need to initialize the opam root + # folder. It must be present though so check for it, if not present, raise + if not os.path.isfile(os.path.join(self.opam_root, "config")): + # so if not correctly setup, raise immediately + raise ValueError("Invalid opam root") return self._compute_versions() def get_default_version(self) -> str: """Return the most recent version of the package as default.""" return self._compute_versions()[-1] def get_enclosed_single_line_field(self, field, version) -> Optional[str]: package_file = self.get_package_file(version) result = opam_read( [ "opam", "show", "--color", "never", "--safe", "--normalise", "--root", self.opam_root, "--file", package_file, "--field", field, ] ) # Sanitize the result if any (remove trailing \n and enclosing ") return result.strip().strip('"') if result else None def get_package_info(self, version: str) -> Iterator[Tuple[str, OpamPackageInfo]]: url = self.get_enclosed_single_line_field("url.src:", version) if url is None: raise ValueError( f"can't get field url.src: for version {version} of package {self.opam_package} \ (at url {self.url}) from `opam show`" ) authors_field = self.get_enclosed_single_line_field("authors:", version) fullname = b"" if authors_field is None else str.encode(authors_field) author = 
Person(fullname=fullname, name=None, email=None) maintainer_field = self.get_enclosed_single_line_field("maintainer:", version) fullname = b"" if maintainer_field is None else str.encode(maintainer_field) committer = Person(fullname=fullname, name=None, email=None) yield self.get_package_name(version), OpamPackageInfo( url=url, filename=None, author=author, committer=committer, version=version ) def build_revision( self, p_info: OpamPackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Revision]: return Revision( type=RevisionType.TAR, author=p_info.author, committer=p_info.committer, message=str.encode(p_info.version), date=None, committer_date=None, parents=(), directory=directory, synthetic=True, ) diff --git a/swh/loader/package/opam/tests/test_opam.py b/swh/loader/package/opam/tests/test_opam.py index 9ffce58..163c89c 100644 --- a/swh/loader/package/opam/tests/test_opam.py +++ b/swh/loader/package/opam/tests/test_opam.py @@ -1,176 +1,220 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from shutil import rmtree from swh.loader.package.opam.loader import OpamLoader, OpamPackageInfo from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats from swh.model.hashutil import hash_to_bytes from swh.model.model import Person, Snapshot, SnapshotBranch, TargetType +def test_opam_loader_no_opam_repository_fails(swh_storage, tmpdir, datadir): + """Running opam loader without a prepared opam repository fails""" + opam_url = f"file://{datadir}/fake_opam_repo" + opam_root = tmpdir + opam_instance = "loadertest" + opam_package = "agrid" + url = f"opam+{opam_url}/packages/{opam_package}" + + loader = OpamLoader( + swh_storage, + url, + opam_root, + opam_instance, + opam_url, + opam_package, + initialize_opam_root=False, # The opam 
directory must be present + ) + + # No opam root initialization from the loader. So, as the opam root is not + # configured, the loading fails. That's the expected use for the production workers + # (whose opam_root maintenance will be externally managed). + actual_load_status = loader.load() + + assert actual_load_status == {"status": "failed"} + + def test_opam_loader_one_version(tmpdir, requests_mock_datadir, datadir, swh_storage): opam_url = f"file://{datadir}/fake_opam_repo" opam_root = tmpdir # the directory should NOT exist, we just need an unique name, so we delete it rmtree(tmpdir) opam_instance = "loadertest" opam_package = "agrid" url = f"opam+{opam_url}/packages/{opam_package}" loader = OpamLoader( - swh_storage, url, opam_root, opam_instance, opam_url, opam_package + swh_storage, + url, + opam_root, + opam_instance, + opam_url, + opam_package, + initialize_opam_root=True, ) actual_load_status = loader.load() expected_snapshot_id = hash_to_bytes("4e4bf977312460329d7f769b0be89937c9827efc") assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id.hex(), } target = b"S\x8c\x8aq\xdcy\xa4/0\xa0\xb2j\xeb\xc1\x16\xad\xce\x06\xeaV" expected_snapshot = Snapshot( id=expected_snapshot_id, branches={ b"HEAD": SnapshotBranch(target=b"agrid.0.1", target_type=TargetType.ALIAS,), b"agrid.0.1": SnapshotBranch( target=target, target_type=TargetType.REVISION, ), }, ) check_snapshot(expected_snapshot, swh_storage) assert_last_visit_matches( swh_storage, url, status="full", type="opam", snapshot=expected_snapshot_id ) stats = get_stats(swh_storage) assert { "content": 18, "directory": 8, "origin": 1, "origin_visit": 1, "release": 0, "revision": 1, "skipped_content": 0, "snapshot": 1, } == stats def test_opam_loader_many_version(tmpdir, requests_mock_datadir, datadir, swh_storage): opam_url = f"file://{datadir}/fake_opam_repo" opam_root = tmpdir # the directory should NOT exist, we just need an unique name, so we delete it rmtree(tmpdir) 
opam_instance = "loadertest" opam_package = "directories" url = f"opam+{opam_url}/packages/{opam_package}" loader = OpamLoader( - swh_storage, url, opam_root, opam_instance, opam_url, opam_package + swh_storage, + url, + opam_root, + opam_instance, + opam_url, + opam_package, + initialize_opam_root=True, ) actual_load_status = loader.load() expected_snapshot_id = hash_to_bytes("1b49be175dcf17c0f568bcd7aac3d4faadc41249") assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id.hex(), } expected_snapshot = Snapshot( id=expected_snapshot_id, branches={ b"HEAD": SnapshotBranch( target=b"directories.0.3", target_type=TargetType.ALIAS, ), b"directories.0.1": SnapshotBranch( target=b"N\x92jA\xb2\x892\xeb\xcc\x9c\xa9\xb3\xea\xa7kz\xb08\xa6V", target_type=TargetType.REVISION, ), b"directories.0.2": SnapshotBranch( target=b"yj\xc9\x1a\x8f\xe0\xaa\xff[\x88\xffz" b"\x91C\xcc\x96\xb7\xd4\xf65", target_type=TargetType.REVISION, ), b"directories.0.3": SnapshotBranch( target=b"hA \xc4\xb5\x18A8\xb8C\x12\xa3\xa5T\xb7/v\x85X\xcb", target_type=TargetType.REVISION, ), }, ) check_snapshot(expected_snapshot, swh_storage) assert_last_visit_matches( swh_storage, url, status="full", type="opam", snapshot=expected_snapshot_id ) def test_opam_revision(tmpdir, requests_mock_datadir, swh_storage, datadir): opam_url = f"file://{datadir}/fake_opam_repo" opam_root = tmpdir # the directory should NOT exist, we just need an unique name, so we delete it rmtree(tmpdir) opam_instance = "loadertest" opam_package = "ocb" url = f"opam+{opam_url}/packages/{opam_package}" loader = OpamLoader( - swh_storage, url, opam_root, opam_instance, opam_url, opam_package + swh_storage, + url, + opam_root, + opam_instance, + opam_url, + opam_package, + initialize_opam_root=True, ) actual_load_status = loader.load() expected_snapshot_id = hash_to_bytes("398df115b9feb2f463efd21941d69b7d59cd9025") assert actual_load_status == { "status": "eventful", "snapshot_id": 
expected_snapshot_id.hex(), } info_iter = loader.get_package_info("0.1") branch_name, package_info = next(info_iter) expected_branch_name = "ocb.0.1" expected_package_info = OpamPackageInfo( url="https://github.com/OCamlPro/ocb/archive/0.1.tar.gz", filename=None, directory_extrinsic_metadata=[], author=Person( fullname=b"OCamlPro ", name=None, email=None ), committer=Person( fullname=b"OCamlPro ", name=None, email=None ), version="0.1", ) assert branch_name == expected_branch_name assert package_info == expected_package_info revision_id = b"o\xad\x7f=\x07\xbb\xaah\xdbI(\xb0'\x10z\xfc\xff\x06x\x1b" revision = swh_storage.revision_get([revision_id])[0] assert revision is not None assert revision.author == expected_package_info.author assert revision.committer == expected_package_info.committer