diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,4 +1,4 @@
 swh.model >= 0.4.0
 swh.storage >= 0.22.0
 swh.scheduler >= 0.0.39
-swh.loader.core >= 0.17.0
+swh.loader.core >= 0.18.0
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,3 +9,4 @@
 retrying
 sqlitedict
 mercurial
+iso8601
diff --git a/swh/loader/mercurial/cli.py b/swh/loader/mercurial/cli.py
--- a/swh/loader/mercurial/cli.py
+++ b/swh/loader/mercurial/cli.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2018 The Software Heritage developers
+# Copyright (C) 2018-2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -34,6 +34,7 @@
 def main(
     origin_url, hg_directory=None, hg_archive=None, visit_date=None, log_level=None
 ):
+    from swh.storage import get_storage

     logging.basicConfig(
         level=(log_level or "DEBUG").upper(),
@@ -52,7 +53,8 @@
         kwargs["directory"] = hg_directory

-    return HgLoader().load(**kwargs)
+    storage = get_storage(cls="memory")
+    return HgLoader(storage, **kwargs).load()


 if __name__ == "__main__":
diff --git a/swh/loader/mercurial/from_disk.py b/swh/loader/mercurial/from_disk.py
--- a/swh/loader/mercurial/from_disk.py
+++ b/swh/loader/mercurial/from_disk.py
@@ -1,23 +1,20 @@
-# Copyright (C) 2020 The Software Heritage developers
+# Copyright (C) 2020-2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

-import os
 from collections import deque
-from datetime import datetime, timezone
+from datetime import datetime
+import os
 from shutil import rmtree
 from tempfile import mkdtemp
-from typing import Any, Deque, Dict, Optional, Tuple, TypeVar, Union
-
-import dateutil
+from typing import Deque, Dict, Optional, Tuple, TypeVar, Union

-from swh.core.config import merge_configs
 from swh.loader.core.loader import BaseLoader
 from swh.loader.core.utils import clean_dangling_folders
+from swh.loader.mercurial.utils import parse_visit_date
 from swh.model.from_disk import Content, DentryPerms, Directory
 from swh.model.hashutil import MultiHash, hash_to_bytehex
-from swh.model.model import Content as ModelContent
 from swh.model.model import (
     ObjectType,
     Origin,
@@ -31,6 +28,8 @@
     TargetType,
     TimestampWithTimezone,
 )
+from swh.model.model import Content as ModelContent
+from swh.storage.interface import StorageInterface

 from . import hgutil
 from .archive_extract import tmp_extract
@@ -41,38 +40,13 @@
     b"x": DentryPerms.executable_content,
     b"": DentryPerms.content,
 }  # type: Dict[bytes, DentryPerms]

-DEFAULT_CONFIG: Dict[str, Any] = {
-    "temp_directory": "/tmp",
-    "clone_timeout_seconds": 7200,
-    "content_cache_size": 10_000,
-}
+
 TEMPORARY_DIR_PREFIX_PATTERN = "swh.loader.mercurial.from_disk"

 T = TypeVar("T")


-def parse_visit_date(visit_date: Optional[Union[datetime, str]]) -> Optional[datetime]:
-    """Convert visit date from Optional[Union[str, datetime]] to Optional[datetime].
-
-    `HgLoaderFromDisk` accepts `str` and `datetime` as visit date
-    while `BaseLoader` only deals with `datetime`.
-
-    """
-    if visit_date is None:
-        return None
-
-    if isinstance(visit_date, datetime):
-        return visit_date
-
-    if visit_date == "now":
-        return datetime.now(tz=timezone.utc)
-
-    if isinstance(visit_date, str):
-        return dateutil.parser.parse(visit_date)
-
-    return ValueError(f"invalid visit date {visit_date!r}")
-
-
 class HgDirectory(Directory):
     """A more practical directory.

@@ -122,11 +96,15 @@
     def __init__(
         self,
+        storage: StorageInterface,
         url: str,
         directory: Optional[str] = None,
         logging_class: str = "swh.loader.mercurial.LoaderFromDisk",
-        visit_date: Optional[Union[datetime, str]] = None,
-        config: Optional[Dict[str, Any]] = None,
+        visit_date: Optional[datetime] = None,
+        temp_directory: str = "/tmp",
+        clone_timeout_seconds: int = 7200,
+        content_cache_size: int = 10_000,
+        max_content_size: Optional[int] = None,
     ):
         """Initialize the loader.

@@ -137,14 +115,17 @@
             visit_date: visit date of the repository
             config: loader configuration
         """
-        super().__init__(logging_class=logging_class, config=config or {})
+        super().__init__(
+            storage=storage,
+            logging_class=logging_class,
+            max_content_size=max_content_size,
+        )

-        self.config = merge_configs(DEFAULT_CONFIG, self.config)
-        self._temp_directory = self.config["temp_directory"]
-        self._clone_timeout = self.config["clone_timeout_seconds"]
+        self._temp_directory = temp_directory
+        self._clone_timeout = clone_timeout_seconds

         self.origin_url = url
-        self.visit_date = parse_visit_date(visit_date)
+        self.visit_date = visit_date
         self.directory = directory

         self._repo: Optional[hgutil.Repository] = None
@@ -162,7 +143,7 @@

         # Cache the content hash across revisions to avoid recalculation.
         self._content_hash_cache: hgutil.LRUCacheDict = hgutil.LRUCacheDict(
-            self.config["content_cache_size"],
+            content_cache_size,
         )

     def pre_cleanup(self) -> None:
@@ -182,7 +163,7 @@
             self.log.debug(f"Cleanup up repository {self._repo_directory}")
             rmtree(self._repo_directory)

-    def prepare_origin_visit(self, *args, **kwargs) -> None:
+    def prepare_origin_visit(self) -> None:
         """First step executed by the loader to prepare origin and visit
         references. Set/update self.origin, and optionally self.origin_url,
         self.visit_date.

@@ -190,7 +171,7 @@
         """
         self.origin = Origin(url=self.origin_url)

-    def prepare(self, *args, **kwargs) -> None:
+    def prepare(self) -> None:
         """Second step executed by the loader to prepare some state needed by
         the loader.
@@ -478,19 +459,28 @@
     """Mercurial loader for repository wrapped within tarballs."""

     def __init__(
-        self, url: str, visit_date: Optional[datetime] = None, archive_path: str = None
+        self,
+        storage: StorageInterface,
+        url: str,
+        visit_date: Optional[datetime] = None,
+        archive_path: str = None,
+        temp_directory: str = "/tmp",
+        max_content_size: Optional[int] = None,
     ):
         super().__init__(
-            url,
+            storage=storage,
+            url=url,
             visit_date=visit_date,
             logging_class="swh.loader.mercurial.ArchiveLoaderFromDisk",
+            temp_directory=temp_directory,
+            max_content_size=max_content_size,
         )
-        self.temp_dir = None
+        self.archive_extract_temp_dir = None
         self.archive_path = archive_path

-    def prepare(self, *args, **kwargs):
+    def prepare(self):
         """Extract the archive instead of cloning."""
-        self._temp_directory = tmp_extract(
+        self.archive_extract_temp_dir = tmp_extract(
             archive=self.archive_path,
             dir=self._temp_directory,
             prefix=TEMPORARY_DIR_PREFIX_PATTERN,
@@ -500,14 +490,8 @@
         )

         repo_name = os.listdir(self.temp_dir)[0]
-        self.directory = os.path.join(self.temp_dir, repo_name)
-        super().prepare(*args, **kwargs)
-
-    def cleanup(self) -> None:
-        """Remove the extracted archive instead of the cloned repository."""
-        if self.temp_dir and os.path.exists(self.temp_dir):
-            rmtree(self.temp_dir)
-        super().cleanup()
+        self.directory = os.path.join(self.archive_extract_temp_dir, repo_name)
+        super().prepare()


 # Allow direct usage of the loader from the command line with
@@ -526,8 +510,14 @@
     @click.option("--hg-directory", help="Path to mercurial repository to load")
     @click.option("--visit-date", default=None, help="Visit date")
     def main(origin_url, hg_directory, visit_date):
+        from swh.storage import get_storage
+
+        storage = get_storage(cls="memory")
         return HgLoaderFromDisk(
-            origin_url, directory=hg_directory, visit_date=visit_date
+            storage,
+            origin_url,
+            directory=hg_directory,
+            visit_date=parse_visit_date(visit_date),
         ).load()

     main()
diff --git a/swh/loader/mercurial/loader.py b/swh/loader/mercurial/loader.py
--- a/swh/loader/mercurial/loader.py
+++ b/swh/loader/mercurial/loader.py
@@ -28,11 +28,9 @@
 from typing import Any, Dict, Iterable, List, Optional

 import billiard
-from dateutil import parser
 import hglib
 from hglib.error import CommandError

-from swh.core.config import merge_configs
 from swh.loader.core.loader import DVCSLoader
 from swh.loader.core.utils import clean_dangling_folders
 from swh.loader.exception import NotFound
@@ -62,6 +60,7 @@
     TimestampWithTimezone,
 )
 from swh.storage.algos.origin import origin_get_latest_visit_status
+from swh.storage.interface import StorageInterface

 from . import converters
 from .archive_extract import tmp_extract
@@ -93,16 +92,6 @@
     pass


-DEFAULT_CONFIG: Dict[str, Any] = {
-    "bundle_filename": "HG20_none_bundle",
-    "reduce_effort": False,
-    "temp_directory": "/tmp",
-    "cache1_size": 800 * 1024 * 1024,
-    "cache2_size": 800 * 1024 * 1024,
-    "clone_timeout_seconds": 7200,
-}
-
-
 class HgBundle20Loader(DVCSLoader):
     """Mercurial loader able to deal with remote or local repository.
@@ -112,27 +101,40 @@
     def __init__(
         self,
-        url,
-        visit_date=None,
-        directory=None,
+        storage: StorageInterface,
+        url: str,
+        visit_date: Optional[datetime.datetime] = None,
+        directory: Optional[str] = None,
         logging_class="swh.loader.mercurial.Bundle20Loader",
+        bundle_filename: Optional[str] = "HG20_none_bundle",
+        reduce_effort: bool = False,
+        temp_directory: str = "/tmp",
+        cache1_size: int = 800 * 1024 * 1024,
+        cache2_size: int = 800 * 1024 * 1024,
+        clone_timeout_seconds: int = 7200,
+        save_data_path: Optional[str] = None,
+        max_content_size: Optional[int] = None,
     ):
-        super().__init__(logging_class=logging_class)
-        self.config = merge_configs(DEFAULT_CONFIG, self.config)
+        super().__init__(
+            storage=storage,
+            logging_class=logging_class,
+            save_data_path=save_data_path,
+            max_content_size=max_content_size,
+        )
         self.origin_url = url
         self.visit_date = visit_date
         self.directory = directory
-        self.bundle_filename = self.config["bundle_filename"]
-        self.reduce_effort_flag = self.config["reduce_effort"]
+        self.bundle_filename = bundle_filename
+        self.reduce_effort_flag = reduce_effort
         self.empty_repository = None
-        self.temp_directory = self.config["temp_directory"]
-        self.cache1_size = self.config["cache1_size"]
-        self.cache2_size = self.config["cache2_size"]
-        self.clone_timeout = self.config["clone_timeout_seconds"]
+        self.temp_directory = temp_directory
+        self.cache1_size = cache1_size
+        self.cache2_size = cache2_size
+        self.clone_timeout = clone_timeout_seconds
         self.working_directory = None
         self.bundle_path = None
-        self.heads = {}
-        self.releases = {}
+        self.heads: Dict[bytes, Any] = {}
+        self.releases: Dict[bytes, Any] = {}
         self.last_snapshot_id: Optional[bytes] = None

     def pre_cleanup(self):
@@ -179,12 +181,8 @@

         return b

-    def prepare_origin_visit(self, *args, **kwargs) -> None:
+    def prepare_origin_visit(self) -> None:
         self.origin = Origin(url=self.origin_url)
-        visit_date = self.visit_date
-        if isinstance(visit_date, str):  # visit_date can be string or datetime
-            visit_date = parser.parse(visit_date)
-        self.visit_date = visit_date
         visit_status = origin_get_latest_visit_status(
             self.storage, self.origin_url, require_snapshot=True
         )
@@ -231,7 +229,7 @@

         return result

-    def prepare(self, *args, **kwargs):
+    def prepare(self):
         """Prepare the necessary steps to load an actual remote or local
         repository.
@@ -638,17 +636,28 @@

     """

-    def __init__(self, url, visit_date=None, archive_path=None):
+    def __init__(
+        self,
+        storage: StorageInterface,
+        url: str,
+        visit_date: Optional[datetime.datetime] = None,
+        archive_path=None,
+        temp_directory: str = "/tmp",
+        max_content_size: Optional[int] = None,
+    ):
         super().__init__(
-            url,
+            storage=storage,
+            url=url,
             visit_date=visit_date,
             logging_class="swh.loader.mercurial.HgArchiveBundle20Loader",
+            temp_directory=temp_directory,
+            max_content_size=max_content_size,
         )
-        self.temp_dir = None
+        self.archive_extract_temp_dir = None
         self.archive_path = archive_path

-    def prepare(self, *args, **kwargs):
-        self.temp_dir = tmp_extract(
+    def prepare(self):
+        self.archive_extract_temp_dir = tmp_extract(
             archive=self.archive_path,
             dir=self.temp_directory,
             prefix=TEMPORARY_DIR_PREFIX_PATTERN,
@@ -657,11 +666,6 @@
             source=self.origin_url,
         )

-        repo_name = os.listdir(self.temp_dir)[0]
-        self.directory = os.path.join(self.temp_dir, repo_name)
-        super().prepare(*args, **kwargs)
-
-    def cleanup(self):
-        if self.temp_dir and os.path.exists(self.temp_dir):
-            rmtree(self.temp_dir)
-        super().cleanup()
+        repo_name = os.listdir(self.archive_extract_temp_dir)[0]
+        self.directory = os.path.join(self.archive_extract_temp_dir, repo_name)
+        super().prepare()
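Below, the scheduler task modules stop instantiating loaders directly and go through from_configfile, which builds the loader from the configuration file named by the SWH_CONFIG_FILENAME environment variable and is expected to turn the "storage" entry into a storage instance before handing the remaining keys to the constructor. A minimal sketch of that configuration, assuming a remote storage backend and example values (the key layout mirrors the reworked swh_loader_config test fixture in conftest.py further down):

# Illustrative only -- an approximation of the mapping that
# HgBundle20Loader.from_configfile() would load from the file referenced by
# SWH_CONFIG_FILENAME; the storage backend and all values here are assumptions.
loader_config = {
    "storage": {"cls": "remote", "url": "http://localhost:5002/"},
    "max_content_size": 104857600,  # bytes; same value as the test fixture
    "temp_directory": "/tmp/swh.loader.mercurial",
}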
diff --git a/swh/loader/mercurial/tasks.py b/swh/loader/mercurial/tasks.py
--- a/swh/loader/mercurial/tasks.py
+++ b/swh/loader/mercurial/tasks.py
@@ -1,10 +1,12 @@
-# Copyright (C) 2017-2019 The Software Heritage developers
+# Copyright (C) 2017-2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 from celery import shared_task

+from swh.loader.mercurial.utils import parse_visit_date
+
 from .loader import HgArchiveBundle20Loader, HgBundle20Loader


@@ -14,10 +16,13 @@
     Import a mercurial tarball into swh.

-    Args: see :func:`DepositLoader.load`.
+    Args: see :func:`HgBundle20Loader.load`.
     """
-    loader = HgBundle20Loader(url, directory=directory, visit_date=visit_date)
+
+    loader = HgBundle20Loader.from_configfile(
+        url=url, directory=directory, visit_date=parse_visit_date(visit_date)
+    )
     return loader.load()


@@ -25,9 +30,9 @@
 def load_hg_from_archive(*, url, archive_path=None, visit_date=None):
     """Import a mercurial tarball into swh.

-    Args: see :func:`DepositLoader.load`.
+    Args: see :func:`HgArchiveBundle20Loader.load`.
     """
-    loader = HgArchiveBundle20Loader(
-        url, archive_path=archive_path, visit_date=visit_date
+    loader = HgArchiveBundle20Loader.from_configfile(
+        url=url, archive_path=archive_path, visit_date=parse_visit_date(visit_date)
     )
     return loader.load()
diff --git a/swh/loader/mercurial/tasks_from_disk.py b/swh/loader/mercurial/tasks_from_disk.py
--- a/swh/loader/mercurial/tasks_from_disk.py
+++ b/swh/loader/mercurial/tasks_from_disk.py
@@ -1,10 +1,12 @@
-# Copyright (C) 2020 The Software Heritage developers
+# Copyright (C) 2020-2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 from celery import shared_task

+from swh.loader.mercurial.utils import parse_visit_date
+
 from .from_disk import HgArchiveLoaderFromDisk, HgLoaderFromDisk


@@ -17,7 +19,9 @@

     Args: see :func:`DepositLoader.load`.
     """
-    loader = HgLoaderFromDisk(url, directory=directory, visit_date=visit_date)
+    loader = HgLoaderFromDisk.from_configfile(
+        url=url, directory=directory, visit_date=parse_visit_date(visit_date)
+    )
     return loader.load()


@@ -27,7 +31,7 @@

     Args: see :func:`DepositLoader.load`.
     """
-    loader = HgArchiveLoaderFromDisk(
-        url, archive_path=archive_path, visit_date=visit_date
+    loader = HgArchiveLoaderFromDisk.from_configfile(
+        url=url, archive_path=archive_path, visit_date=parse_visit_date(visit_date)
     )
     return loader.load()
diff --git a/swh/loader/mercurial/tests/conftest.py b/swh/loader/mercurial/tests/conftest.py
--- a/swh/loader/mercurial/tests/conftest.py
+++ b/swh/loader/mercurial/tests/conftest.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 The Software Heritage developers
+# Copyright (C) 2019-2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -9,33 +9,31 @@
 @pytest.fixture
-def swh_loader_config(swh_storage_backend_config, tmp_path) -> Dict[str, Any]:
-    swh_storage_backend_config["journal_writer"] = {}
+def swh_storage_backend_config(swh_storage_backend_config):
+    """Basic pg storage configuration with no journal collaborator
+    (to avoid pulling optional dependency on clients of this fixture)
+
+    """
     return {
+        "cls": "filter",
         "storage": {
-            "cls": "pipeline",
-            "steps": [
-                {"cls": "filter"},
-                {
-                    "cls": "buffer",
-                    "min_batch_size": {
-                        "content": 10000,
-                        "content_bytes": 1073741824,
-                        "directory": 2500,
-                        "revision": 10,
-                        "release": 100,
-                    },
-                },
-                swh_storage_backend_config,
-            ],
+            "cls": "buffer",
+            "min_batch_size": {
+                "content": 10,
+                "content_bytes": 100 * 1024 * 1024,
+                "directory": 10,
+                "revision": 10,
+                "release": 10,
+            },
+            "storage": swh_storage_backend_config,
         },
-        "bundle_filename": "HG20_none_bundle",
-        "cache1_size": 838860800,
-        "cache2_size": 838860800,
-        "clone_timeout_seconds": 2 * 3600,
-        "reduce_effort": False,
-        "save_data": False,
-        "save_data_path": "",
+    }
+
+
+@pytest.fixture
+def swh_loader_config(swh_storage_backend_config, tmp_path) -> Dict[str, Any]:
+    return {
+        "storage": swh_storage_backend_config,
         "max_content_size": 104857600,
         "temp_directory": str(tmp_path),
     }
diff --git a/swh/loader/mercurial/tests/test_from_disk.py b/swh/loader/mercurial/tests/test_from_disk.py
--- a/swh/loader/mercurial/tests/test_from_disk.py
+++ b/swh/loader/mercurial/tests/test_from_disk.py
@@ -1,12 +1,13 @@
-# Copyright (C) 2020 The Software Heritage developers
+# Copyright (C) 2020-2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

-import os
 from datetime import datetime
 from hashlib import sha1
+import os

+from swh.loader.mercurial.utils import parse_visit_date
 from swh.loader.tests import (
     assert_last_visit_matches,
     check_snapshot,
@@ -21,6 +22,9 @@
 from ..from_disk import HgDirectory, HgLoaderFromDisk
 from .loader_checker import ExpectedSwhids, LoaderChecker

+VISIT_DATE = parse_visit_date("2016-05-03 15:16:32+00")
+assert VISIT_DATE is not None
+

 def random_content() -> Content:
     """Create minimal content object."""
@@ -73,14 +77,15 @@
 #
 # With more work it should event be possible to know which part
 # of an object is faulty.
-def test_examples(swh_config, datadir, tmp_path):
+def test_examples(swh_storage, datadir, tmp_path):
     for archive_name in ("hello", "transplant", "the-sandbox", "example"):
         archive_path = os.path.join(datadir, f"{archive_name}.tgz")
         json_path = os.path.join(datadir, f"{archive_name}.json")
         repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)

         LoaderChecker(
-            loader=HgLoaderFromDisk(repo_url), expected=ExpectedSwhids.load(json_path),
+            loader=HgLoaderFromDisk(swh_storage, repo_url),
+            expected=ExpectedSwhids.load(json_path),
         ).check()


@@ -88,13 +93,13 @@
 # to ensure compatibility of `HgLoaderFromDisk`.
 # Hashes as been produced by copy pasting the result of the implementation
 # to prevent regressions.
-def test_loader_hg_new_visit_no_release(swh_config, datadir, tmp_path):
+def test_loader_hg_new_visit_no_release(swh_storage, datadir, tmp_path):
     """Eventful visit should yield 1 snapshot"""
     archive_name = "the-sandbox"
     archive_path = os.path.join(datadir, f"{archive_name}.tgz")
     repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)

-    loader = HgLoaderFromDisk(url=repo_url)
+    loader = HgLoaderFromDisk(swh_storage, url=repo_url)

     assert loader.load() == {"status": "eventful"}

@@ -141,14 +146,14 @@
 # to ensure compatibility of `HgLoaderFromDisk`.
 # Hashes as been produced by copy pasting the result of the implementation
 # to prevent regressions.
-def test_loader_hg_new_visit_with_release(swh_config, datadir, tmp_path):
+def test_loader_hg_new_visit_with_release(swh_storage, datadir, tmp_path):
     """Eventful visit with release should yield 1 snapshot"""
     archive_name = "hello"
     archive_path = os.path.join(datadir, f"{archive_name}.tgz")
     repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)

-    loader = HgLoaderFromDisk(url=repo_url, visit_date="2016-05-03 15:16:32+00")
+    loader = HgLoaderFromDisk(swh_storage, url=repo_url, visit_date=VISIT_DATE,)

     actual_load_status = loader.load()
     assert actual_load_status == {"status": "eventful"}

@@ -200,7 +205,7 @@
 # to ensure compatibility of `HgLoaderFromDisk`.
 # Hashes as been produced by copy pasting the result of the implementation
 # to prevent regressions.
-def test_visit_repository_with_transplant_operations(swh_config, datadir, tmp_path):
+def test_visit_repository_with_transplant_operations(swh_storage, datadir, tmp_path):
     """Visit a mercurial repository visit transplant operations within should yield a
     snapshot as well.
@@ -210,7 +215,7 @@
     archive_path = os.path.join(datadir, f"{archive_name}.tgz")
     repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)

-    loader = HgLoaderFromDisk(url=repo_url, visit_date="2016-05-03 15:16:32+00")
+    loader = HgLoaderFromDisk(swh_storage, url=repo_url, visit_date=VISIT_DATE,)

     # load hg repository
     actual_load_status = loader.load()
diff --git a/swh/loader/mercurial/tests/test_loader.py b/swh/loader/mercurial/tests/test_loader.py
--- a/swh/loader/mercurial/tests/test_loader.py
+++ b/swh/loader/mercurial/tests/test_loader.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2020 The Software Heritage developers
+# Copyright (C) 2018-2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -12,6 +12,7 @@
 from hglib.error import CommandError
 import pytest

+from swh.loader.mercurial.utils import parse_visit_date
 from swh.loader.tests import (
     assert_last_visit_matches,
     check_snapshot,
@@ -24,14 +25,17 @@

 from ..loader import CloneTimeoutError, HgArchiveBundle20Loader, HgBundle20Loader

+VISIT_DATE = parse_visit_date("2016-05-03 15:16:32+00")
+assert VISIT_DATE is not None

-def test_loader_hg_new_visit_no_release(swh_config, datadir, tmp_path):
+
+def test_loader_hg_new_visit_no_release(swh_storage, datadir, tmp_path):
     """Eventful visit should yield 1 snapshot"""
     archive_name = "the-sandbox"
     archive_path = os.path.join(datadir, f"{archive_name}.tgz")
     repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)

-    loader = HgBundle20Loader(repo_url)
+    loader = HgBundle20Loader(swh_storage, repo_url)

     assert loader.load() == {"status": "eventful"}

@@ -53,15 +57,15 @@
     )

     assert_last_visit_matches(
-        loader.storage,
+        swh_storage,
         repo_url,
         status="full",
         type="hg",
         snapshot=expected_snapshot.id,
     )
-    check_snapshot(expected_snapshot, loader.storage)
+    check_snapshot(expected_snapshot, swh_storage)

-    stats = get_stats(loader.storage)
+    stats = get_stats(swh_storage)
     assert stats == {
         "content": 2,
         "directory": 3,
@@ -75,9 +79,7 @@

     # Ensure archive loader yields the same snapshot
     loader2 = HgArchiveBundle20Loader(
-        url=archive_path,
-        archive_path=archive_path,
-        visit_date="2016-05-03 15:16:32+00",
+        swh_storage, url=archive_path, archive_path=archive_path, visit_date=VISIT_DATE,
     )
     actual_load_status = loader2.load()

@@ -99,19 +101,19 @@
     )


-def test_loader_hg_new_visit_with_release(swh_config, datadir, tmp_path):
+def test_loader_hg_new_visit_with_release(swh_storage, datadir, tmp_path):
     """Eventful visit with release should yield 1 snapshot"""
     archive_name = "hello"
     archive_path = os.path.join(datadir, f"{archive_name}.tgz")
     repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)

-    loader = HgBundle20Loader(url=repo_url, visit_date="2016-05-03 15:16:32+00",)
+    loader = HgBundle20Loader(swh_storage, url=repo_url, visit_date=VISIT_DATE,)

     actual_load_status = loader.load()
     assert actual_load_status == {"status": "eventful"}

     # then
-    stats = get_stats(loader.storage)
+    stats = get_stats(swh_storage)
     assert stats == {
         "content": 3,
         "directory": 3,
@@ -125,11 +127,11 @@

     # cf. test_loader.org for explaining from where those hashes
     tip_release = hash_to_bytes("515c4d72e089404356d0f4b39d60f948b8999140")
-    release = loader.storage.release_get([tip_release])[0]
+    release = swh_storage.release_get([tip_release])[0]
     assert release is not None

     tip_revision_default = hash_to_bytes("c3dbe4fbeaaa98dd961834e4007edb3efb0e2a27")
-    revision = loader.storage.revision_get([tip_revision_default])[0]
+    revision = swh_storage.revision_get([tip_revision_default])[0]
     assert revision is not None

     expected_snapshot = Snapshot(
@@ -143,9 +145,9 @@
         },
     )

-    check_snapshot(expected_snapshot, loader.storage)
+    check_snapshot(expected_snapshot, swh_storage)
     assert_last_visit_matches(
-        loader.storage,
+        swh_storage,
         repo_url,
         type=RevisionType.MERCURIAL.value,
         status="full",
@@ -154,9 +156,7 @@

     # Ensure archive loader yields the same snapshot
     loader2 = HgArchiveBundle20Loader(
-        url=archive_path,
-        archive_path=archive_path,
-        visit_date="2016-05-03 15:16:32+00",
+        swh_storage, url=archive_path, archive_path=archive_path, visit_date=VISIT_DATE,
     )
     actual_load_status = loader2.load()

@@ -178,7 +178,7 @@
     )


-def test_visit_with_archive_decompression_failure(swh_config, mocker, datadir):
+def test_visit_with_archive_decompression_failure(swh_storage, mocker, datadir):
     """Failure to decompress should fail early, no data is ingested"""
     mock_patoo = mocker.patch("swh.loader.mercurial.archive_extract.patoolib")
     mock_patoo.side_effect = ValueError
@@ -187,13 +187,13 @@
     archive_name = "hello"
     archive_path = os.path.join(datadir, f"{archive_name}.tgz")

     loader = HgArchiveBundle20Loader(
-        url=archive_path, visit_date="2016-05-03 15:16:32+00",
+        swh_storage, url=archive_path, visit_date=VISIT_DATE,
     )

     actual_load_status = loader.load()
     assert actual_load_status == {"status": "failed"}

-    stats = get_stats(loader.storage)
+    stats = get_stats(swh_storage)
     assert stats == {
         "content": 0,
         "directory": 0,
@@ -206,11 +206,11 @@
     }
     # That visit yields the same snapshot
     assert_last_visit_matches(
-        loader.storage, archive_path, status="failed", type="hg", snapshot=None
+        swh_storage, archive_path, status="failed", type="hg", snapshot=None
     )


-def test_visit_error_with_snapshot_partial(swh_config, datadir, tmp_path, mocker):
+def test_visit_error_with_snapshot_partial(swh_storage, datadir, tmp_path, mocker):
     """Incomplete ingestion leads to a 'partial' ingestion status"""
     mock = mocker.patch("swh.loader.mercurial.loader.HgBundle20Loader.store_metadata")
     mock.side_effect = ValueError
@@ -219,12 +219,12 @@
     archive_path = os.path.join(datadir, f"{archive_name}.tgz")
     repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)

-    loader = HgBundle20Loader(repo_url)
+    loader = HgBundle20Loader(swh_storage, repo_url)

     assert loader.load() == {"status": "failed"}

     assert_last_visit_matches(
-        loader.storage,
+        swh_storage,
         repo_url,
         status="partial",
         type="hg",
@@ -242,7 +242,7 @@
     ],
 )
 def test_visit_error_with_status_not_found(
-    swh_config, datadir, tmp_path, mocker, error_msg
+    swh_storage, datadir, tmp_path, mocker, error_msg
 ):
     """Not reaching the repo leads to a 'not_found' ingestion status"""
     mock = mocker.patch("hglib.clone")
@@ -252,16 +252,16 @@
     archive_path = os.path.join(datadir, f"{archive_name}.tgz")
     repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)

-    loader = HgBundle20Loader(repo_url)
+    loader = HgBundle20Loader(swh_storage, repo_url)

     assert loader.load() == {"status": "uneventful"}

     assert_last_visit_matches(
-        loader.storage, repo_url, status="not_found", type="hg", snapshot=None,
+        swh_storage, repo_url, status="not_found", type="hg", snapshot=None,
     )


-def test_visit_error_with_clone_error(swh_config, datadir, tmp_path, mocker):
+def test_visit_error_with_clone_error(swh_storage, datadir, tmp_path, mocker):
     """Testing failures other than 'not_found'"""

     mock = mocker.patch("hglib.clone")
@@ -271,16 +271,16 @@
     archive_path = os.path.join(datadir, f"{archive_name}.tgz")
     repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)

-    loader = HgBundle20Loader(repo_url)
+    loader = HgBundle20Loader(swh_storage, repo_url)

     assert loader.load() == {"status": "failed"}

     assert_last_visit_matches(
-        loader.storage, repo_url, status="failed", type="hg", snapshot=None,
+        swh_storage, repo_url, status="failed", type="hg", snapshot=None,
     )


-def test_visit_repository_with_transplant_operations(swh_config, datadir, tmp_path):
+def test_visit_repository_with_transplant_operations(swh_storage, datadir, tmp_path):
     """Visit a mercurial repository visit transplant operations within should yield a
     snapshot as well.

@@ -289,7 +289,7 @@
     archive_name = "transplant"
     archive_path = os.path.join(datadir, f"{archive_name}.tgz")
     repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
-    loader = HgBundle20Loader(url=repo_url, visit_date="2019-05-23 12:06:00+00",)
+    loader = HgBundle20Loader(swh_storage, url=repo_url, visit_date=VISIT_DATE,)

     # load hg repository
     actual_load_status = loader.load()
@@ -297,11 +297,11 @@

     # collect swh revisions
     assert_last_visit_matches(
-        loader.storage, repo_url, type=RevisionType.MERCURIAL.value, status="full"
+        swh_storage, repo_url, type=RevisionType.MERCURIAL.value, status="full"
     )

     revisions = []
-    snapshot = snapshot_get_latest(loader.storage, repo_url)
+    snapshot = snapshot_get_latest(swh_storage, repo_url)
     for branch in snapshot.branches.values():
         if branch.target_type.value != "revision":
             continue
@@ -310,7 +310,7 @@
     # extract original changesets info and the transplant sources
     hg_changesets = set()
     transplant_sources = set()
-    for rev in loader.storage.revision_log(revisions):
+    for rev in swh_storage.revision_log(revisions):
         hg_changesets.add(rev["metadata"]["node"])
         for k, v in rev["extra_headers"]:
             if k == b"transplant_source":
diff --git a/swh/loader/mercurial/utils.py b/swh/loader/mercurial/utils.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/mercurial/utils.py
@@ -0,0 +1,29 @@
+# Copyright (C) 2020-2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from datetime import datetime, timezone
+from typing import Optional, Union
+
+import dateutil
+
+
+def parse_visit_date(visit_date: Optional[Union[datetime, str]]) -> Optional[datetime]:
+    """Convert visit date from either None, a string or a datetime to either None or
+    datetime.
+
+    """
+    if visit_date is None:
+        return None
+
+    if isinstance(visit_date, datetime):
+        return visit_date
+
+    if visit_date == "now":
+        return datetime.now(tz=timezone.utc)
+
+    if isinstance(visit_date, str):
+        return dateutil.parser.parse(visit_date)
+
+    return ValueError(f"invalid visit date {visit_date!r}")
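Taken together, the change makes a loader's dependencies explicit at the call site: the caller supplies a StorageInterface instance and plain constructor arguments, and visit dates are parsed up front with the parse_visit_date helper above. A minimal end-to-end sketch, assuming an in-memory storage and a hypothetical origin URL and local checkout path:

# Illustrative only -- mirrors the pattern used in cli.py and the tests above.
from swh.loader.mercurial.from_disk import HgLoaderFromDisk
from swh.loader.mercurial.utils import parse_visit_date
from swh.storage import get_storage

storage = get_storage(cls="memory")  # any StorageInterface implementation works

loader = HgLoaderFromDisk(
    storage,                                # storage is now an explicit dependency
    url="https://hg.example.org/repo",      # hypothetical origin URL
    directory="/tmp/checkout-of-repo",      # hypothetical local clone to load from
    visit_date=parse_visit_date("2016-05-03 15:16:32+00"),
    temp_directory="/tmp",                  # replaces DEFAULT_CONFIG["temp_directory"]
    clone_timeout_seconds=7200,             # replaces DEFAULT_CONFIG["clone_timeout_seconds"]
)
result = loader.load()                      # e.g. {"status": "eventful"}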