diff --git a/swh/loader/svn/loader.py b/swh/loader/svn/loader.py --- a/swh/loader/svn/loader.py +++ b/swh/loader/svn/loader.py @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2021 The Software Heritage developers +# Copyright (C) 2015-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -8,12 +8,11 @@ """ from datetime import datetime -from mmap import ACCESS_WRITE, mmap import os import pty import re import shutil -from subprocess import Popen +from subprocess import PIPE, Popen import tempfile from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple @@ -657,17 +656,19 @@ pass return svn_revision - def dump_svn_revisions(self, svn_url: str, last_loaded_svn_rev: int = -1) -> str: - """Generate a subversion dump file using the svnrdump tool. If the svnrdump - command failed somehow, the produced dump file is analyzed to determine if a - partial loading is still feasible. + def dump_svn_revisions( + self, svn_url: str, last_loaded_svn_rev: int = -1 + ) -> Tuple[str, int]: + """Generate a compressed subversion dump file using the svnrdump tool and gzip. + If the svnrdump command failed somehow, the produced dump file is analyzed to + determine if a partial loading is still feasible. Raises: NotFound when the repository is no longer found at url Returns: - The dump_path of the repository mounted - + The dump_path of the repository mounted and the max dumped revision number + (-1 if all revisions were dumped) """ # Build the svnrdump command line svnrdump_cmd = ["svnrdump", "dump", svn_url] @@ -684,12 +685,13 @@ # successfully dumped revision numbers are printed to it dump_temp_dir = tempfile.mkdtemp(dir=self.temp_dir) dump_name = "".join(c for c in svn_url if c.isalnum()) - dump_path = "%s/%s.svndump" % (dump_temp_dir, dump_name) + dump_path = "%s/%s.svndump.gz" % (dump_temp_dir, dump_name) stderr_lines = [] self.log.debug("Executing %s", " ".join(svnrdump_cmd)) with open(dump_path, "wb") as dump_file: + gzip = Popen(["gzip"], stdin=PIPE, stdout=dump_file) stderr_r, stderr_w = pty.openpty() - svnrdump = Popen(svnrdump_cmd, stdout=dump_file, stderr=stderr_w) + svnrdump = Popen(svnrdump_cmd, stdout=gzip.stdin, stderr=stderr_w) os.close(stderr_w) stderr_stream = OutputStream(stderr_r) readable = True @@ -706,9 +708,12 @@ error_messages.append(line) svnrdump.wait() os.close(stderr_r) + # denote end of read file + gzip.stdin.close() + gzip.wait() if svnrdump.returncode == 0: - return dump_path + return dump_path, -1 # There was an error but it does not mean that no revisions # can be loaded. @@ -736,27 +741,8 @@ last_loaded_svn_rev + 1, last_dumped_rev, ) - # Truncate the dump file after the last successfully dumped - # revision to avoid the loading of corrupted data - self.log.debug( - ( - "Truncating dump file after the last " - "successfully dumped revision (%s) to avoid " - "the loading of corrupted data" - ), - last_dumped_rev, - ) - - with open(dump_path, "r+b") as f: - with mmap(f.fileno(), 0, access=ACCESS_WRITE) as s: - pattern = ( - "Revision-number: %s" % (last_dumped_rev + 1) - ).encode() - n = s.rfind(pattern) - if n != -1: - s.resize(n) self.truncated_dump = True - return dump_path + return dump_path, last_dumped_rev elif last_dumped_rev != -1 and last_dumped_rev < last_loaded_svn_rev: raise Exception( ( @@ -809,7 +795,7 @@ # Then try to generate a dump file containing relevant svn revisions # to load, an exception will be thrown if something wrong happened - dump_path = self.dump_svn_revisions(self.svn_url, last_loaded_svn_rev) + dump_path, max_rev = self.dump_svn_revisions(self.svn_url, last_loaded_svn_rev) # Finally, mount the dump and load the repository self.log.debug('Mounting dump file with "svnadmin load".') @@ -818,6 +804,8 @@ prefix=TEMPORARY_DIR_PREFIX_PATTERN, suffix="-%s" % os.getpid(), root_dir=self.temp_dir, + gzip=True, + max_rev=max_rev, ) self.svn_url = "file://%s" % self.repo_path super().prepare() diff --git a/swh/loader/svn/tests/test_loader.py b/swh/loader/svn/tests/test_loader.py --- a/swh/loader/svn/tests/test_loader.py +++ b/swh/loader/svn/tests/test_loader.py @@ -2092,7 +2092,7 @@ # init remote dump loader and mock some methods loader = SvnLoaderFromRemoteDump(swh_storage, repo_url, temp_directory=tmp_path) - loader.dump_svn_revisions = mocker.MagicMock() + loader.dump_svn_revisions = mocker.MagicMock(return_value=("", -1)) loader.start_from = mocker.MagicMock(return_value=(0, 0)) # prepare loading diff --git a/swh/loader/svn/tests/test_utils.py b/swh/loader/svn/tests/test_utils.py --- a/swh/loader/svn/tests/test_utils.py +++ b/swh/loader/svn/tests/test_utils.py @@ -7,12 +7,14 @@ import os from pathlib import Path import pty +import re import shutil -from subprocess import Popen +from subprocess import Popen, run import pytest from swh.loader.svn import utils +from swh.loader.tests import prepare_repository_from_archive def test_outputstream(): @@ -106,6 +108,44 @@ assert mock_remove.called +def test_init_svn_repo_from_truncated_dump(datadir, tmp_path): + """Mounting partial svn repository from a truncated dump should work""" + + # prepare a repository + archive_name = "pkg-gourmet" + archive_path = os.path.join(datadir, f"{archive_name}.tgz") + repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) + + # dump it to file + dump_path = str(tmp_path / f"{archive_name}.dump") + truncated_dump_path = str(tmp_path / f"{archive_name}_truncated.dump") + svnrdump_cmd = ["svnrdump", "dump", repo_url] + with open(dump_path, "wb") as dump: + run(svnrdump_cmd, stdout=dump) + + # create a truncated dump file that will generate a "svnadmin load" error + with open(dump_path, "rb") as dump, open( + truncated_dump_path, "wb" + ) as truncated_dump: + dump_lines = dump.readlines() + assert len(dump_lines) > 150 + truncated_dump_content = b"".join(dump_lines[:150]) + truncated_dump.write(truncated_dump_content) + + # compute max revision number with non truncated data + revs = re.findall(rb"Revision-number: ([0-9]+)", truncated_dump_content) + max_rev = int(revs[-1]) - 1 + + # prepare repository from truncated dump + _, repo_path = utils.init_svn_repo_from_dump( + truncated_dump_path, gzip=False, root_dir=tmp_path, max_rev=max_rev + ) + + # check expected number of revisions have been loaded + svnadmin_info = run(["svnadmin", "info", repo_path], capture_output=True, text=True) + assert f"Revisions: {max_rev}\n" in svnadmin_info.stdout + + def test_init_svn_repo_from_archive_dump(datadir, tmp_path): """Mounting svn repository out of an archive dump is ok""" dump_name = "penguinsdbtools2018.dump.gz" diff --git a/swh/loader/svn/utils.py b/swh/loader/svn/utils.py --- a/swh/loader/svn/utils.py +++ b/swh/loader/svn/utils.py @@ -65,6 +65,7 @@ root_dir: str = "/tmp", gzip: bool = False, cleanup_dump: bool = True, + max_rev: int = -1, ) -> Tuple[str, str]: """Given a path to a svn dump, initialize an svn repository with the content of said dump. @@ -110,14 +111,24 @@ # load dump and bypass properties validation as Unicode decoding errors # are already handled in loader implementation (see _ra_codecs_error_handler # in ra.py) - cmd = ["svnadmin", "load", "-q", "--bypass-prop-validation", repo_path] - completed_process = run( - cmd, stdin=dump.stdout, capture_output=True, text=True - ) - if completed_process.returncode != 0: + cmd = ["svnadmin", "load", "-q", "--bypass-prop-validation"] + if max_rev > 0: + cmd.append(f"-r1:{max_rev}") + cmd.append(repo_path) + svnadmin_load = run(cmd, stdin=dump.stdout, capture_output=True, text=True) + if svnadmin_load.returncode != 0: + if max_rev > 0: + # if max_rev is specified, we might have a truncated dump due to + # an error when executing svnrdump, check if max_rev have been + # loaded and continue loading process if it is the case + svnadmin_info = run( + ["svnadmin", "info", repo_path], capture_output=True, text=True + ) + if f"Revisions: {max_rev}\n" in svnadmin_info.stdout: + return temp_dir, repo_path raise ValueError( f"Failed to mount the svn dump for project {project_name}\n" - + completed_process.stderr + + svnadmin_load.stderr ) return temp_dir, repo_path except Exception as e: