diff --git a/README.md b/README.md --- a/README.md +++ b/README.md @@ -6,10 +6,10 @@ before. The main entry points are: -- :class:`swh.loader.mercurial.from_disk.HgLoaderFromDisk` which reads and loads a local +- :class:`swh.loader.mercurial.loader.HgLoader` which reads and loads a local repository into an SWH archive. -- :class:`swh.loader.mercurial.from_disk.HgArchiveLoaderFromDisk` which reads and loads +- :class:`swh.loader.mercurial.loader.HgArchiveLoader` which reads and loads a local repository wrapped within a tarball # CLI run @@ -27,5 +27,5 @@ ## Basic use ``` bash -swh loader --C /tmp/mercurial.yml run mercurial_from_disk https://www.mercurial-scm.org/repo/hello +swh loader --C /tmp/mercurial.yml run mercurial https://www.mercurial-scm.org/repo/hello ``` diff --git a/conftest.py b/conftest.py --- a/conftest.py +++ b/conftest.py @@ -1,4 +1,4 @@ -# Copyright (C) 2020 The Software Heritage developers +# Copyright (C) 2020-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -15,5 +15,5 @@ @pytest.fixture(scope="session") def swh_scheduler_celery_includes(swh_scheduler_celery_includes): return swh_scheduler_celery_includes + [ - "swh.loader.mercurial.tasks_from_disk", + "swh.loader.mercurial.tasks", ] diff --git a/setup.py b/setup.py --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ include_package_data=True, entry_points=""" [swh.workers] - loader.mercurial_from_disk=swh.loader.mercurial:register_from_disk + loader.mercurial=swh.loader.mercurial:register [console_scripts] swh-hg-identify=swh.loader.mercurial.identify:main """, diff --git a/swh/loader/mercurial/__init__.py b/swh/loader/mercurial/__init__.py --- a/swh/loader/mercurial/__init__.py +++ b/swh/loader/mercurial/__init__.py @@ -7,11 +7,11 @@ from typing import Any, Mapping -def register_from_disk() -> Mapping[str, Any]: +def 
register() -> Mapping[str, Any]: """Register the current worker module's definition""" - from .from_disk import HgLoaderFromDisk + from .loader import HgLoader return { - "task_modules": [f"{__name__}.tasks_from_disk"], - "loader": HgLoaderFromDisk, + "task_modules": [f"{__name__}.tasks"], + "loader": HgLoader, } diff --git a/swh/loader/mercurial/from_disk.py b/swh/loader/mercurial/loader.py rename from swh/loader/mercurial/from_disk.py rename to swh/loader/mercurial/loader.py --- a/swh/loader/mercurial/from_disk.py +++ b/swh/loader/mercurial/loader.py @@ -3,6 +3,12 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +"""Loaders for ingesting Mercurial repositories either local from disk, or remote, see +:class:`swh.loader.mercurial.loader.HgLoader` or from an archive, see +:class:`swh.loader.mercurial.loader.HgArchiveLoader`. + +""" + from collections import deque from datetime import datetime import os @@ -13,7 +19,7 @@ from swh.core.utils import grouper from swh.loader.core.loader import BaseLoader from swh.loader.core.utils import clean_dangling_folders -from swh.loader.mercurial.utils import get_minimum_env, parse_visit_date +from swh.loader.mercurial.utils import get_minimum_env from swh.model import identifiers from swh.model.from_disk import Content, DentryPerms, Directory from swh.model.hashutil import hash_to_bytehex @@ -45,7 +51,7 @@ b"": DentryPerms.content, } # type: Dict[bytes, DentryPerms] -TEMPORARY_DIR_PREFIX_PATTERN = "swh.loader.mercurial.from_disk" +TEMPORARY_DIR_PREFIX_PATTERN = "swh.loader.mercurial.loader" EXTID_TYPE = "hg-nodeid" EXTID_VERSION: int = 1 @@ -101,7 +107,7 @@ return default -class HgLoaderFromDisk(BaseLoader): +class HgLoader(BaseLoader): """Load a mercurial repository from a local repository. 
Mercurial's branching model is more complete than Git's; it allows for multiple @@ -139,7 +145,7 @@ storage: StorageInterface, url: str, directory: Optional[str] = None, - logging_class: str = "swh.loader.mercurial.LoaderFromDisk", + logging_class: str = "swh.loader.mercurial.loader.HgLoader", visit_date: Optional[datetime] = None, temp_directory: str = "/tmp", clone_timeout_seconds: int = 7200, @@ -747,7 +753,7 @@ return self._last_root.hash -class HgArchiveLoaderFromDisk(HgLoaderFromDisk): +class HgArchiveLoader(HgLoader): """Mercurial loader for repository wrapped within tarballs.""" def __init__( @@ -763,7 +769,7 @@ storage=storage, url=url, visit_date=visit_date, - logging_class="swh.loader.mercurial.ArchiveLoaderFromDisk", + logging_class="swh.loader.mercurial.loader.ArchiveLoader", temp_directory=temp_directory, max_content_size=max_content_size, ) @@ -784,32 +790,3 @@ repo_name = os.listdir(self.temp_dir)[0] self.directory = os.path.join(self.archive_extract_temp_dir, repo_name) super().prepare() - - -# Allow direct usage of the loader from the command line with -# `python -m swh.loader.mercurial.from_disk $ORIGIN_URL` -if __name__ == "__main__": - import logging - - import click - - logging.basicConfig( - level=logging.DEBUG, format="%(asctime)s %(process)d %(message)s" - ) - - @click.command() - @click.option("--origin-url", help="origin url") - @click.option("--hg-directory", help="Path to mercurial repository to load") - @click.option("--visit-date", default=None, help="Visit date") - def main(origin_url, hg_directory, visit_date): - from swh.storage import get_storage - - storage = get_storage(cls="memory") - return HgLoaderFromDisk( - storage, - origin_url, - directory=hg_directory, - visit_date=parse_visit_date(visit_date), - ).load() - - main() diff --git a/swh/loader/mercurial/tasks_from_disk.py b/swh/loader/mercurial/tasks.py rename from swh/loader/mercurial/tasks_from_disk.py rename to swh/loader/mercurial/tasks.py --- 
a/swh/loader/mercurial/tasks_from_disk.py +++ b/swh/loader/mercurial/tasks.py @@ -9,10 +9,10 @@ from swh.loader.mercurial.utils import parse_visit_date -from .from_disk import HgArchiveLoaderFromDisk, HgLoaderFromDisk +from .loader import HgArchiveLoader, HgLoader -@shared_task(name=__name__ + ".LoadMercurialFromDisk") +@shared_task(name=__name__ + ".LoadMercurial") def load_hg( *, url: str, directory: Optional[str] = None, visit_date: Optional[str] = None ): @@ -20,24 +20,24 @@ Import a mercurial tarball into swh. - Args: see :func:`HgLoaderFromDisk` constructor. + Args: see :func:`HgLoader` constructor. """ - loader = HgLoaderFromDisk.from_configfile( + loader = HgLoader.from_configfile( url=url, directory=directory, visit_date=parse_visit_date(visit_date) ) return loader.load() -@shared_task(name=__name__ + ".LoadArchiveMercurialFromDisk") +@shared_task(name=__name__ + ".LoadArchiveMercurial") def load_hg_from_archive( *, url: str, archive_path: Optional[str] = None, visit_date: Optional[str] = None ): """Import a mercurial tarball into swh. - Args: see :func:`HgArchiveLoaderFromDisk` constructor. + Args: see :func:`HgArchiveLoader` constructor. 
""" - loader = HgArchiveLoaderFromDisk.from_configfile( + loader = HgArchiveLoader.from_configfile( url=url, archive_path=archive_path, visit_date=parse_visit_date(visit_date) ) return loader.load() diff --git a/swh/loader/mercurial/tests/test_from_disk.py b/swh/loader/mercurial/tests/test_loader.py rename from swh/loader/mercurial/tests/test_from_disk.py rename to swh/loader/mercurial/tests/test_loader.py --- a/swh/loader/mercurial/tests/test_from_disk.py +++ b/swh/loader/mercurial/tests/test_loader.py @@ -26,7 +26,7 @@ from swh.storage import get_storage from swh.storage.algos.snapshot import snapshot_get_latest -from ..from_disk import EXTID_VERSION, HgDirectory, HgLoaderFromDisk +from ..loader import EXTID_VERSION, HgDirectory, HgLoader from .loader_checker import ExpectedSwhids, LoaderChecker VISIT_DATE = parse_visit_date("2016-05-03 15:16:32+00") @@ -93,13 +93,12 @@ repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) LoaderChecker( - loader=HgLoaderFromDisk(swh_storage, repo_url), - expected=ExpectedSwhids.load(json_path), + loader=HgLoader(swh_storage, repo_url), expected=ExpectedSwhids.load(json_path), ).check() # This test has as been adapted from the historical `HgBundle20Loader` tests -# to ensure compatibility of `HgLoaderFromDisk`. +# to ensure compatibility of `HgLoader`. # Hashes as been produced by copy pasting the result of the implementation # to prevent regressions. 
def test_loader_hg_new_visit_no_release(swh_storage, datadir, tmp_path): @@ -108,7 +107,7 @@ archive_path = os.path.join(datadir, f"{archive_name}.tgz") repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) - loader = HgLoaderFromDisk(swh_storage, url=repo_url) + loader = HgLoader(swh_storage, url=repo_url) assert loader.load() == {"status": "eventful"} @@ -174,7 +173,7 @@ "snapshot": 1, } assert stats == expected_stats - loader2 = HgLoaderFromDisk(swh_storage, url=repo_url) + loader2 = HgLoader(swh_storage, url=repo_url) assert loader2.load() == {"status": "uneventful"} # nothing new happened @@ -192,7 +191,7 @@ # This test has as been adapted from the historical `HgBundle20Loader` tests -# to ensure compatibility of `HgLoaderFromDisk`. +# to ensure compatibility of `HgLoader`. # Hashes as been produced by copy pasting the result of the implementation # to prevent regressions. def test_loader_hg_new_visit_with_release(swh_storage, datadir, tmp_path): @@ -202,7 +201,7 @@ archive_path = os.path.join(datadir, f"{archive_name}.tgz") repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) - loader = HgLoaderFromDisk(swh_storage, url=repo_url, visit_date=VISIT_DATE,) + loader = HgLoader(swh_storage, url=repo_url, visit_date=VISIT_DATE,) actual_load_status = loader.load() assert actual_load_status == {"status": "eventful"} @@ -255,7 +254,7 @@ # This test has as been adapted from the historical `HgBundle20Loader` tests -# to ensure compatibility of `HgLoaderFromDisk`. +# to ensure compatibility of `HgLoader`. # Hashes as been produced by copy pasting the result of the implementation # to prevent regressions. 
def test_visit_repository_with_transplant_operations(swh_storage, datadir, tmp_path): @@ -268,7 +267,7 @@ archive_path = os.path.join(datadir, f"{archive_name}.tgz") repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) - loader = HgLoaderFromDisk(swh_storage, url=repo_url, visit_date=VISIT_DATE,) + loader = HgLoader(swh_storage, url=repo_url, visit_date=VISIT_DATE,) # load hg repository actual_load_status = loader.load() @@ -351,7 +350,7 @@ repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) repo_path = repo_url.replace("file://", "") - loader = HgLoaderFromDisk(swh_storage, repo_path) + loader = HgLoader(swh_storage, repo_path) assert loader.load() == {"status": "eventful"} assert get_stats(loader.storage) == { @@ -371,7 +370,7 @@ # Create a new loader (to start with a clean slate, eg. remove the caches), # with the new, partial, storage - loader2 = HgLoaderFromDisk(swh_storage, repo_path) + loader2 = HgLoader(swh_storage, repo_path) assert loader2.load() == {"status": "uneventful"} # Should have all the objects @@ -398,7 +397,7 @@ repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) repo_path = repo_url.replace("file://", "") - loader = HgLoaderFromDisk(swh_storage, repo_path) + loader = HgLoader(swh_storage, repo_path) # Test 3 loads: full, and two incremental. assert loader.load() == {"status": "eventful"} @@ -427,7 +426,7 @@ repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) repo_path = repo_url.replace("file://", "") - loader = HgLoaderFromDisk(swh_storage, repo_path) + loader = HgLoader(swh_storage, repo_path) assert loader.load() == {"status": "eventful"} assert get_stats(loader.storage) == { @@ -451,7 +450,7 @@ # Create a new loader (to start with a clean slate, eg. 
remove the caches), # with the new, partial, storage - loader = HgLoaderFromDisk(new_storage, repo_path) + loader = HgLoader(new_storage, repo_path) assert get_stats(loader.storage) == { "content": 0, @@ -484,7 +483,7 @@ repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) directory = repo_url.replace("file://", "") - loader = HgLoaderFromDisk( + loader = HgLoader( storage=swh_storage, url=repo_url, directory=directory, # specify directory to avoid clone @@ -502,7 +501,7 @@ archive_path = os.path.join(datadir, f"{archive_name}.tgz") repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) - loader = HgLoaderFromDisk(storage=swh_storage, url=repo_url,) + loader = HgLoader(storage=swh_storage, url=repo_url,) actual_load_status = loader.load() assert actual_load_status == {"status": "eventful"} @@ -519,7 +518,7 @@ assert sorted(snapshot.branches.keys()) == expected_branches # Check that we don't load anything the second time - loader = HgLoaderFromDisk(storage=swh_storage, url=repo_url,) + loader = HgLoader(storage=swh_storage, url=repo_url,) actual_load_status = loader.load() @@ -543,7 +542,7 @@ # first load with missing commits hg_strip(repo_url.replace("file://", ""), "tip") - loader = HgLoaderFromDisk(swh_storage, repo_url) + loader = HgLoader(swh_storage, repo_url) assert loader.load() == {"status": "eventful"} assert get_stats(loader.storage) == { "content": 2, @@ -558,7 +557,7 @@ # second load with all commits repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) - loader = HgLoaderFromDisk(swh_storage, repo_url) + loader = HgLoader(swh_storage, repo_url) checker = LoaderChecker(loader=loader, expected=ExpectedSwhids.load(json_path),) checker.check() @@ -582,7 +581,7 @@ repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) hg_strip(repo_url.replace("file://", ""), "tip") - loader = HgLoaderFromDisk(swh_storage, repo_url) + loader = HgLoader(swh_storage, 
repo_url) assert loader.load() == {"status": "eventful"} # Ensure we write ExtIDs to a specific version. @@ -613,21 +612,21 @@ repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) repo_path = repo_url.replace("file://", "") - with unittest.mock.patch("swh.loader.mercurial.from_disk.EXTID_VERSION", 0): - loader = HgLoaderFromDisk(swh_storage, repo_path) + with unittest.mock.patch("swh.loader.mercurial.loader.EXTID_VERSION", 0): + loader = HgLoader(swh_storage, repo_path) assert loader.load() == {"status": "eventful"} - loader = HgLoaderFromDisk(swh_storage, repo_path) + loader = HgLoader(swh_storage, repo_path) assert loader.load() == {"status": "eventful"} - loader = HgLoaderFromDisk(swh_storage, repo_path) + loader = HgLoader(swh_storage, repo_path) assert loader.load() == {"status": "uneventful"} - with unittest.mock.patch("swh.loader.mercurial.from_disk.EXTID_VERSION", 10000): - loader = HgLoaderFromDisk(swh_storage, repo_path) + with unittest.mock.patch("swh.loader.mercurial.loader.EXTID_VERSION", 10000): + loader = HgLoader(swh_storage, repo_path) assert loader.load() == {"status": "eventful"} - loader = HgLoaderFromDisk(swh_storage, repo_path) + loader = HgLoader(swh_storage, repo_path) assert loader.load() == {"status": "uneventful"} @@ -639,7 +638,7 @@ archive_path = os.path.join(datadir, f"{archive_name}.tgz") repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) - loader = HgLoaderFromDisk(swh_storage, url=repo_url) + loader = HgLoader(swh_storage, url=repo_url) assert loader.load() == {"status": "eventful"} stats = get_stats(loader.storage) @@ -663,7 +662,7 @@ fork_url = prepare_repository_from_archive( archive_path, "the-sandbox-reloaded", tmp_path ) - loader2 = HgLoaderFromDisk( + loader2 = HgLoader( swh_storage, url=fork_url, directory=str(tmp_path / archive_name) ) @@ -691,6 +690,6 @@ archive_path = os.path.join(datadir, f"{archive_name}.tgz") repo_url = 
prepare_repository_from_archive(archive_path, archive_name, tmp_path) - loader = HgLoaderFromDisk(swh_storage, url=repo_url) + loader = HgLoader(swh_storage, url=repo_url) assert loader.load() == {"status": "eventful"} diff --git a/swh/loader/mercurial/tests/test_tasks_from_disk.py b/swh/loader/mercurial/tests/test_tasks.py rename from swh/loader/mercurial/tests/test_tasks_from_disk.py rename to swh/loader/mercurial/tests/test_tasks.py --- a/swh/loader/mercurial/tests/test_tasks_from_disk.py +++ b/swh/loader/mercurial/tests/test_tasks.py @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2020 The Software Heritage developers +# Copyright (C) 2018-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -7,11 +7,11 @@ def test_loader( mocker, swh_config, swh_scheduler_celery_app, swh_scheduler_celery_worker ): - mock_loader = mocker.patch("swh.loader.mercurial.from_disk.HgLoaderFromDisk.load") + mock_loader = mocker.patch("swh.loader.mercurial.loader.HgLoader.load") mock_loader.return_value = {"status": "eventful"} res = swh_scheduler_celery_app.send_task( - "swh.loader.mercurial.tasks_from_disk.LoadMercurialFromDisk", + "swh.loader.mercurial.tasks.LoadMercurial", kwargs={"url": "origin_url", "directory": "/some/repo", "visit_date": "now",}, ) @@ -26,13 +26,11 @@ def test_archive_loader( mocker, swh_config, swh_scheduler_celery_app, swh_scheduler_celery_worker ): - mock_loader = mocker.patch( - "swh.loader.mercurial.from_disk.HgArchiveLoaderFromDisk.load" - ) + mock_loader = mocker.patch("swh.loader.mercurial.loader.HgArchiveLoader.load") mock_loader.return_value = {"status": "uneventful"} res = swh_scheduler_celery_app.send_task( - "swh.loader.mercurial.tasks_from_disk.LoadArchiveMercurialFromDisk", + "swh.loader.mercurial.tasks.LoadArchiveMercurial", kwargs={ "url": "another_url", "archive_path": 
"/some/tar.tgz",