diff --git a/swh/loader/core/tests/test_utils.py b/swh/loader/core/tests/test_utils.py
index d1954ae..628243b 100644
--- a/swh/loader/core/tests/test_utils.py
+++ b/swh/loader/core/tests/test_utils.py
@@ -1,147 +1,171 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+from datetime import datetime
 import os
 import signal
 from time import sleep
 from unittest.mock import patch
 
 import pytest
 
 from swh.loader.core.utils import (
     CloneFailure,
     CloneTimeout,
     clean_dangling_folders,
     clone_with_timeout,
+    parse_visit_date,
 )
 
 
 def prepare_arborescence_from(tmpdir, folder_names):
     """Prepare arborescence tree with folders
 
     Args:
         tmpdir (Either[LocalPath, str]): Root temporary directory
         folder_names (List[str]): List of folder names
 
     Returns:
         List of folders
     """
     dangling_folders = []
     for dname in folder_names:
         d = str(tmpdir / dname)
         os.mkdir(d)
         dangling_folders.append(d)
     return str(tmpdir), dangling_folders
 
 
 def assert_dirs(actual_dirs, expected_dirs):
     """Assert that the directory actual and expected match
 
     """
     for d in actual_dirs:
         assert d in expected_dirs
     assert len(actual_dirs) == len(expected_dirs)
 
 
 def test_clean_dangling_folders_0(tmpdir):
     """Folder does not exist, do nothing"""
     r = clean_dangling_folders("/path/does/not/exist", "unused-pattern")
     assert r is None
 
 
 @patch("swh.loader.core.utils.psutil.pid_exists", return_value=False)
 def test_clean_dangling_folders_1(mock_pid_exists, tmpdir):
     """Folder which matches pattern with dead pid are cleaned up
 
     """
     rootpath, dangling = prepare_arborescence_from(
         tmpdir, ["something", "swh.loader.svn-4321.noisynoise",]
     )
 
     clean_dangling_folders(rootpath, "swh.loader.svn")
 
     actual_dirs = os.listdir(rootpath)
     mock_pid_exists.assert_called_once_with(4321)
     assert_dirs(actual_dirs, ["something"])
 
 
 @patch("swh.loader.core.utils.psutil.pid_exists", return_value=True)
 def test_clean_dangling_folders_2(mock_pid_exists, tmpdir):
     """Folder which matches pattern with live pid are skipped
 
     """
     rootpath, dangling = prepare_arborescence_from(
         tmpdir, ["something", "swh.loader.hg-1234.noisynoise",]
     )
 
     clean_dangling_folders(rootpath, "swh.loader.hg")
 
     actual_dirs = os.listdir(rootpath)
     mock_pid_exists.assert_called_once_with(1234)
     assert_dirs(actual_dirs, ["something", "swh.loader.hg-1234.noisynoise",])
 
 
 @patch("swh.loader.core.utils.psutil.pid_exists", return_value=False)
 @patch(
     "swh.loader.core.utils.shutil.rmtree",
     side_effect=ValueError("Could not remove for reasons"),
 )
 def test_clean_dangling_folders_3(mock_rmtree, mock_pid_exists, tmpdir):
     """Error in trying to clean dangling folders are skipped
 
     """
     path1 = "thingy"
     path2 = "swh.loader.git-1468.noisy"
     rootpath, dangling = prepare_arborescence_from(tmpdir, [path1, path2,])
 
     clean_dangling_folders(rootpath, "swh.loader.git")
 
     actual_dirs = os.listdir(rootpath)
     mock_pid_exists.assert_called_once_with(1468)
     mock_rmtree.assert_called_once_with(os.path.join(rootpath, path2))
     assert_dirs(actual_dirs, [path2, path1])
 
 
 def test_clone_with_timeout_no_error_no_timeout():
     def succeed():
         """This does nothing to simulate a successful clone"""
 
     clone_with_timeout("foo", "bar", succeed, timeout=0.5)
 
 
 def test_clone_with_timeout_no_error_timeout():
     def slow():
         """This lasts for more than the timeout"""
         sleep(1)
 
     with pytest.raises(CloneTimeout):
         clone_with_timeout("foo", "bar", slow, timeout=0.5)
 
 
 def test_clone_with_timeout_error():
     def raise_something():
         raise RuntimeError("panic!")
 
     with pytest.raises(CloneFailure):
         clone_with_timeout("foo", "bar", raise_something, timeout=0.5)
 
 
 def test_clone_with_timeout_sigkill():
     """This also tests that the traceback is useful"""
     src = "https://www.mercurial-scm.org/repo/hello"
     dest = "/dev/null"
     timeout = 0.5
     sleepy_time = 100 * timeout
     assert sleepy_time > timeout
 
     def ignores_sigterm(*args, **kwargs):
         # ignore SIGTERM to force sigkill
         signal.signal(signal.SIGTERM, lambda signum, frame: None)
         sleep(sleepy_time)  # we make sure we exceed the timeout
 
     with pytest.raises(CloneTimeout) as e:
         clone_with_timeout(src, dest, ignores_sigterm, timeout)
     killed = True
     assert e.value.args == (src, timeout, killed)
+
+
+VISIT_DATE_STR = "2021-02-17 15:50:04.518963"
+VISIT_DATE = datetime(2021, 2, 17, 15, 50, 4, 518963)
+
+
+@pytest.mark.parametrize(
+    "input_visit_date,expected_date",
+    [(None, None), (VISIT_DATE, VISIT_DATE), (VISIT_DATE_STR, VISIT_DATE),],
+)
+def test_utils_parse_visit_date(input_visit_date, expected_date):
+    assert parse_visit_date(input_visit_date) == expected_date
+
+
+def test_utils_parse_visit_date_now():
+    actual_date = parse_visit_date("now")
+    assert isinstance(actual_date, datetime)
+
+
+def test_utils_parse_visit_date_fails():
+    with pytest.raises(ValueError, match="invalid"):
+        parse_visit_date(10)  # neither a str nor a datetime: must be rejected
diff --git a/swh/loader/core/utils.py b/swh/loader/core/utils.py
index 632bef3..84be8ff 100644
--- a/swh/loader/core/utils.py
+++ b/swh/loader/core/utils.py
@@ -1,105 +1,127 @@
 # Copyright (C) 2018-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 
+from datetime import datetime, timezone
 import io
 import os
 import shutil
 import signal
 import time
 import traceback
-from typing import Callable
+from typing import Callable, Optional, Union
 
 from billiard import Process, Queue  # type: ignore
+from dateutil.parser import parse
 import psutil
 
 
 def clean_dangling_folders(dirpath: str, pattern_check: str, log=None) -> None:
     """Clean up potential dangling temporary working folder rooted at `dirpath`. Those
     folders must match a dedicated pattern and not belonging to a live pid.
 
     Args:
         dirpath: Path to check for dangling files
         pattern_check: A dedicated pattern to check on first level directory (e.g
             `swh.loader.mercurial.`, `swh.loader.svn.`)
         log (Logger): Optional logger
 
     """
     if not os.path.exists(dirpath):
         return
     for filename in os.listdir(dirpath):
         path_to_cleanup = os.path.join(dirpath, filename)
         try:
             # pattern: `swh.loader.{loader-type}-pid.{noise}`
             if (
                 pattern_check not in filename or "-" not in filename
             ):  # silently ignore unknown patterns
                 continue
             _, pid_ = filename.split("-")
             pid = int(pid_.split(".")[0])
             if psutil.pid_exists(pid):
                 if log:
                     log.debug("PID %s is live, skipping", pid)
                 continue
             # could be removed concurrently, so check before removal
             if os.path.exists(path_to_cleanup):
                 shutil.rmtree(path_to_cleanup)
         except Exception as e:
             if log:
                 log.warn("Fail to clean dangling path %s: %s", path_to_cleanup, e)
 
 
 class CloneTimeout(Exception):
     pass
 
 
 class CloneFailure(Exception):
     pass
 
 
 def _clone_task(clone_func: Callable[[], None], errors: Queue) -> None:
     try:
         clone_func()
     except Exception as e:
         exc_buffer = io.StringIO()
         traceback.print_exc(file=exc_buffer)
         errors.put_nowait(exc_buffer.getvalue())
         raise e
 
 
 def clone_with_timeout(
     src: str, dest: str, clone_func: Callable[[], None], timeout: float
 ) -> None:
     """Clone a repository with timeout.
 
     Args:
         src: clone source
         dest: clone destination
         clone_func: callable that does the actual cloning
         timeout: timeout in seconds
     """
     errors: Queue = Queue()
     process = Process(target=_clone_task, args=(clone_func, errors))
     process.start()
     process.join(timeout)
 
     if process.is_alive():
         process.terminate()
         # Give it literally a second (in successive steps of 0.1 second),
         # then kill it.
         # Can't use `process.join(1)` here, billiard appears to be bugged
         # https://github.com/celery/billiard/issues/270
         killed = False
         for _ in range(10):
             time.sleep(0.1)
             if not process.is_alive():
                 break
         else:
             killed = True
             os.kill(process.pid, signal.SIGKILL)
         raise CloneTimeout(src, timeout, killed)
 
     if not errors.empty():
         raise CloneFailure(src, dest, errors.get())
+
+
+def parse_visit_date(visit_date: Optional[Union[datetime, str]]) -> Optional[datetime]:
+    """Normalize a visit date: None and datetime pass through, "now" yields the
+    current UTC time, other strings go to dateutil; else ValueError is raised.
+
+    """
+    if visit_date is None:
+        return None
+
+    if isinstance(visit_date, datetime):
+        return visit_date
+
+    if visit_date == "now":
+        return datetime.now(tz=timezone.utc)
+
+    if isinstance(visit_date, str):
+        return parse(visit_date)
+
+    raise ValueError(f"invalid visit date {visit_date!r}")