# File: swh/loader/svn/converters.py
# Copyright (C) 2015-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import datetime
from typing import Dict, Optional, Sequence, Tuple

# Import the submodule explicitly: a bare ``import dateutil`` does not
# guarantee that ``dateutil.parser`` is loaded (it only works if another
# module imported it first, or on releases with lazy submodule loading).
import dateutil.parser

from swh.model.model import Person, Revision, RevisionType, TimestampWithTimezone


def svn_date_to_swh_date(strdate: Optional[str]) -> TimestampWithTimezone:
    """Convert a string date to an swh one.

    Args:
        strdate: A string representing a date with format like
          'YYYY-mm-DDTHH:MM:SS.800722Z'. ``None`` or an empty string is
          mapped to the epoch (1970-01-01T00:00:00 UTC).

    Returns:
        An swh date format

    Raises:
        ValueError: if the parsed date carries no timezone information.
          Errors raised by the dateutil parser on unparseable input are
          propagated unchanged.

    """
    if not strdate:  # either None or empty string
        dt = datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc)
    else:
        # TODO: Migrate to iso8601 if possible
        dt = dateutil.parser.parse(strdate)
        # Explicit check rather than ``assert``: the date comes from outside
        # this module and assertions are stripped when running with -O.
        if dt.tzinfo is None:
            raise ValueError(f"svn date without timezone information: {strdate!r}")
    return TimestampWithTimezone.from_datetime(dt)


def svn_author_to_swh_person(author: Optional[bytes]) -> Person:
    """Convert an svn author to an swh person.
    Default policy: No information is added.

    Args:
        author: the svn author (in bytes); ``None`` is treated as ``b""``

    Returns:
        a Person built from the author bytes used as full name

    """
    return Person.from_fullname(author or b"")


def build_swh_revision(
    rev: int, commit: Dict, repo_uuid: bytes, dir_id: bytes, parents: Sequence[bytes]
) -> Revision:
    """Given a svn revision, build a swh revision.

    The repository's uuid and the svn revision number are recorded in the
    revision's ``extra_headers`` (``svn_repo_uuid`` and ``svn_revision``).

    Args:
        rev: the svn revision number
        commit: the commit data; the keys read here are ``author_name``,
          ``author_date`` and ``message`` (values are passed to ``Revision``
          as is, so they are presumably already swh model objects -- verify
          against the caller)
        repo_uuid: The repository's uuid
        dir_id: the tree's hash identifier
        parents: the revision's parents identifier

    Returns:
        The swh ``Revision`` object (of type subversion, flagged synthetic).
        Author and committer, as well as date and committer date, are set to
        the same values.

    """
    author = commit["author_name"]
    msg = commit["message"]
    date = commit["author_date"]

    extra_headers: Tuple[Tuple[bytes, bytes], ...] = (
        (b"svn_repo_uuid", repo_uuid),
        (b"svn_revision", str(rev).encode()),
    )

    return Revision(
        type=RevisionType.SUBVERSION,
        date=date,
        committer_date=date,
        directory=dir_id,
        message=msg,
        author=author,
        committer=author,
        synthetic=True,
        extra_headers=extra_headers,
        parents=tuple(parents),
    )


# File: swh/loader/svn/tests/test_utils.py
# Copyright (C) 2016-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import logging
import os
import pty
import shutil
from subprocess import Popen

from swh.loader.svn import utils


def test_outputstream():
    """Lines written by a subprocess to a pty are read back one by one."""
    stdout_r, stdout_w = pty.openpty()
    echo = Popen(["echo", "-e", "foo\nbar\nbaz"], stdout=stdout_w)
    # Close our copy of the write end so the reader can reach end of stream
    # once the child process is done.
    os.close(stdout_w)
    stdout_stream = utils.OutputStream(stdout_r)
    lines = []
    while True:
        current_lines, readable = stdout_stream.read_lines()
        lines += current_lines
        if not readable:
            break
    echo.wait()
    os.close(stdout_r)
    assert lines == ["foo", "bar", "baz"]
def test_init_svn_repo_from_dump(datadir, tmp_path):
    """A gzipped dump is mounted and, with cleanup disabled, left in place."""
    source_dump = os.path.join(datadir, "penguinsdbtools2018.dump.gz")

    _, mounted_repo = utils.init_svn_repo_from_dump(
        source_dump, root_dir=tmp_path, gzip=True, cleanup_dump=False
    )

    assert os.path.exists(source_dump), "Dump path should still exists"
    assert os.path.exists(mounted_repo), "Repository should exists"


def test_init_svn_repo_from_dump_and_cleanup(datadir, tmp_path):
    """With default cleanup, the loaded dump copy is removed afterwards."""
    filename = "penguinsdbtools2018.dump.gz"
    pristine_dump = os.path.join(datadir, filename)
    working_dump = os.path.join(tmp_path, filename)

    # Work on a copy so the fixture data survives the cleanup.
    shutil.copyfile(pristine_dump, working_dump)

    assert os.path.exists(working_dump)
    assert os.path.exists(pristine_dump)

    _, mounted_repo = utils.init_svn_repo_from_dump(
        working_dump, root_dir=tmp_path, gzip=True
    )

    assert not os.path.exists(working_dump), "Dump path should no longer exists"
    assert os.path.exists(mounted_repo), "Repository should exists"
    assert os.path.exists(pristine_dump), "Original dump path should still exists"


def test_init_svn_repo_from_dump_and_cleanup_already_done(
    datadir, tmp_path, mocker, caplog
):
    """A failing dump removal is logged once and does not break the mount."""
    caplog.set_level(logging.INFO, "swh.loader.svn.utils")

    filename = "penguinsdbtools2018.dump.gz"
    pristine_dump = os.path.join(datadir, filename)

    # Simulate a dump which vanished before the cleanup step.
    remove_mock = mocker.patch("os.remove", side_effect=FileNotFoundError)

    working_dump = os.path.join(tmp_path, filename)
    shutil.copyfile(pristine_dump, working_dump)

    assert os.path.exists(working_dump)
    assert os.path.exists(pristine_dump)

    _, mounted_repo = utils.init_svn_repo_from_dump(
        working_dump, root_dir=tmp_path, gzip=True
    )

    assert os.path.exists(mounted_repo), "Repository should exists"
    assert os.path.exists(pristine_dump), "Original dump path should still exists"

    logged = caplog.record_tuples
    assert len(logged) == 1
    assert "Failure to remove" in logged[0][2]
    assert remove_mock.called


def test_init_svn_repo_from_archive_dump(datadir, tmp_path):
    """An archive dump is mounted and, with cleanup disabled, left in place."""
    source_dump = os.path.join(datadir, "penguinsdbtools2018.dump.gz")

    _, mounted_repo = utils.init_svn_repo_from_archive_dump(
        source_dump, root_dir=tmp_path, cleanup_dump=False
    )

    assert os.path.exists(source_dump), "Dump path should still exists"
    assert os.path.exists(mounted_repo), "Repository should exists"


def test_init_svn_repo_from_archive_dump_and_cleanup(datadir, tmp_path):
    """With default cleanup, the loaded archive dump copy is removed."""
    filename = "penguinsdbtools2018.dump.gz"
    pristine_dump = os.path.join(datadir, filename)
    working_dump = os.path.join(tmp_path, filename)

    # Work on a copy so the fixture data survives the cleanup.
    shutil.copyfile(pristine_dump, working_dump)

    assert os.path.exists(working_dump)
    assert os.path.exists(pristine_dump)

    _, mounted_repo = utils.init_svn_repo_from_archive_dump(
        working_dump, root_dir=tmp_path
    )

    assert not os.path.exists(working_dump), "Dump path should no longer exists"
    assert os.path.exists(mounted_repo), "Repository should exists"
    assert os.path.exists(pristine_dump), "Original dump path should still exists"
# File: swh/loader/svn/utils.py
# Copyright (C) 2016-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import codecs
import errno
import logging
import os
import shutil
from subprocess import PIPE, Popen, call
import tempfile
from typing import List, Optional, Tuple

logger = logging.getLogger(__name__)


class OutputStream:
    """Helper class to read lines from a program output while it is running

    Args:
        fileno (int): File descriptor of a program output stream
            opened in text mode
    """

    def __init__(self, fileno: int) -> None:
        self._fileno = fileno
        # Text of the last, not yet terminated, line read so far.
        self._buffer = ""
        # Incremental decoder so that a multi-byte UTF-8 sequence split
        # between two reads is decoded correctly instead of raising.
        self._decoder = codecs.getincrementaldecoder("utf-8")()

    def read_lines(self) -> Tuple[List[str], bool]:
        """
        Read available lines from the output stream and return them.

        Line endings "\\r\\n" are normalized to "\\n", including when the two
        characters arrive in separate reads.

        Returns:
            Tuple[List[str], bool]: A tuple whose first member is the read
            lines and second member a boolean indicating if there are
            still some other lines available to read.

        Raises:
            OSError: any read error other than EIO.
            UnicodeDecodeError: if the stream is not valid UTF-8.
        """
        try:
            data = os.read(self._fileno, 1000)
        except OSError as e:
            # EIO is treated as end of stream (presumably what reading a pty
            # yields once the writing side is closed); anything else is real.
            if e.errno != errno.EIO:
                raise
            data = b""

        # Detect end of stream on the raw bytes: a chunk holding only part of
        # a multi-byte character decodes to "" but is not the end.
        eof = not data
        text = self._buffer + self._decoder.decode(data, final=eof)
        # Normalize after prepending the buffer so that a "\r" ending one
        # chunk and the "\n" starting the next one are merged too.
        lines = text.replace("\r\n", "\n").split("\n")

        if eof:
            self._buffer = ""
            if lines == [""]:
                lines = []
            return (lines, False)

        # The last element is an incomplete line: keep it for the next call.
        self._buffer = lines.pop()
        return (lines, True)


def init_svn_repo_from_dump(
    dump_path: str,
    prefix: Optional[str] = None,
    suffix: Optional[str] = None,
    root_dir: str = "/tmp",
    gzip: bool = False,
    cleanup_dump: bool = True,
) -> Tuple[str, str]:
    """Given a path to a svn dump, initialize an svn repository with the content of said
    dump.

    Args:
        dump_path: The path to the dump
        prefix: optional prefix file name for the working directory
        suffix: optional suffix file name for the working directory
        root_dir: the root directory where the working directory is created
        gzip: Boolean to determine whether we treat the dump as compressed or not.
        cleanup_dump: Whether we want this function call to clean up the dump at the end
            of the repository initialization.

    Raises:
        ValueError in case of failure to run the command to uncompress and load the
        dump. Any other exception (e.g. failure to spawn a command) is propagated
        as is. In both cases the temporary folder is removed first.

    Returns:
        A tuple:
            - temporary folder: containing the mounted repository
            - repo_path: path to the mounted repository inside the temporary folder

    """
    # The repository is named after the directory holding the dump.
    project_name = os.path.basename(os.path.dirname(dump_path))
    temp_dir = tempfile.mkdtemp(prefix=prefix, suffix=suffix, dir=root_dir)

    try:
        repo_path = os.path.join(temp_dir, project_name)

        # create the repository that will be loaded with the dump
        cmd = ["svnadmin", "create", repo_path]
        r = call(cmd)
        if r != 0:
            raise ValueError(
                "Failed to initialize empty svn repo for %s" % project_name
            )

        read_dump_cmd = ["cat", dump_path]
        if gzip:
            read_dump_cmd = ["gzip", "-dc", dump_path]

        with Popen(read_dump_cmd, stdout=PIPE) as dump:
            # load dump and bypass properties validation as Unicode decoding errors
            # are already handled in loader implementation (see _ra_codecs_error_handler
            # in ra.py)
            cmd = ["svnadmin", "load", "-q", "--bypass-prop-validation", repo_path]
            r = call(cmd, stdin=dump.stdout)
            if r != 0:
                raise ValueError(
                    "Failed to mount the svn dump for project %s" % project_name
                )
        return temp_dir, repo_path
    except Exception:
        shutil.rmtree(temp_dir)
        # Bare raise keeps the original exception and traceback untouched.
        raise
    finally:
        if cleanup_dump:
            try:
                # At this time, the temporary svn repository is mounted from the dump or
                # the svn repository failed to mount. Either way, we can drop the dump.
                os.remove(dump_path)
                assert not os.path.exists(dump_path)
            except OSError as e:
                # Best effort: a dump that cannot be removed must not fail the load.
                logger.warning("Failure to remove the dump %s: %s", dump_path, e)


def init_svn_repo_from_archive_dump(
    archive_path: str,
    prefix: Optional[str] = None,
    suffix: Optional[str] = None,
    root_dir: str = "/tmp",
    cleanup_dump: bool = True,
) -> Tuple[str, str]:
    """Given a path to an archive containing an svn dump, initializes an svn repository
    with the content of the uncompressed dump.

    This delegates to init_svn_repo_from_dump, always treating the archive as
    gzip-compressed.

    Args:
        archive_path: The archive svn dump path
        prefix: optional prefix file name for the working directory
        suffix: optional suffix file name for the working directory
        root_dir: the root directory where the working directory is created
        cleanup_dump: Whether we want this function call to clean up the dump at the end
            of the repository initialization.

    Raises:
        ValueError in case of failure to run the command to uncompress and load the
        dump.

    Returns:
        A tuple:
            - temporary folder: containing the mounted repository
            - repo_path: path to the mounted repository inside the temporary folder

    """
    return init_svn_repo_from_dump(
        archive_path,
        prefix=prefix,
        suffix=suffix,
        root_dir=root_dir,
        gzip=True,
        cleanup_dump=cleanup_dump,
    )