Page MenuHomeSoftware Heritage

D8787.id.diff
No OneTemporary

D8787.id.diff

diff --git a/swh/loader/svn/loader.py b/swh/loader/svn/loader.py
--- a/swh/loader/svn/loader.py
+++ b/swh/loader/svn/loader.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-2021 The Software Heritage developers
+# Copyright (C) 2015-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -8,12 +8,11 @@
"""
from datetime import datetime
-from mmap import ACCESS_WRITE, mmap
import os
import pty
import re
import shutil
-from subprocess import Popen
+from subprocess import PIPE, Popen
import tempfile
from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple
@@ -657,17 +656,19 @@
pass
return svn_revision
- def dump_svn_revisions(self, svn_url: str, last_loaded_svn_rev: int = -1) -> str:
- """Generate a subversion dump file using the svnrdump tool. If the svnrdump
- command failed somehow, the produced dump file is analyzed to determine if a
- partial loading is still feasible.
+ def dump_svn_revisions(
+ self, svn_url: str, last_loaded_svn_rev: int = -1
+ ) -> Tuple[str, int]:
+ """Generate a compressed subversion dump file using the svnrdump tool and gzip.
+ If the svnrdump command failed somehow, the produced dump file is analyzed to
+ determine if a partial loading is still feasible.
Raises:
NotFound when the repository is no longer found at url
Returns:
- The dump_path of the repository mounted
-
+ A tuple made of the path to the compressed dump file and the max dumped
+ revision number (-1 if all revisions were dumped)
"""
# Build the svnrdump command line
svnrdump_cmd = ["svnrdump", "dump", svn_url]
@@ -684,12 +685,13 @@
# successfully dumped revision numbers are printed to it
dump_temp_dir = tempfile.mkdtemp(dir=self.temp_dir)
dump_name = "".join(c for c in svn_url if c.isalnum())
- dump_path = "%s/%s.svndump" % (dump_temp_dir, dump_name)
+ dump_path = "%s/%s.svndump.gz" % (dump_temp_dir, dump_name)
stderr_lines = []
self.log.debug("Executing %s", " ".join(svnrdump_cmd))
with open(dump_path, "wb") as dump_file:
+ gzip = Popen(["gzip"], stdin=PIPE, stdout=dump_file)
stderr_r, stderr_w = pty.openpty()
- svnrdump = Popen(svnrdump_cmd, stdout=dump_file, stderr=stderr_w)
+ svnrdump = Popen(svnrdump_cmd, stdout=gzip.stdin, stderr=stderr_w)
os.close(stderr_w)
stderr_stream = OutputStream(stderr_r)
readable = True
@@ -706,9 +708,12 @@
error_messages.append(line)
svnrdump.wait()
os.close(stderr_r)
+ # close gzip's stdin to signal end of input, then wait for it to finish writing
+ gzip.stdin.close()
+ gzip.wait()
if svnrdump.returncode == 0:
- return dump_path
+ return dump_path, -1
# There was an error but it does not mean that no revisions
# can be loaded.
@@ -736,27 +741,8 @@
last_loaded_svn_rev + 1,
last_dumped_rev,
)
- # Truncate the dump file after the last successfully dumped
- # revision to avoid the loading of corrupted data
- self.log.debug(
- (
- "Truncating dump file after the last "
- "successfully dumped revision (%s) to avoid "
- "the loading of corrupted data"
- ),
- last_dumped_rev,
- )
-
- with open(dump_path, "r+b") as f:
- with mmap(f.fileno(), 0, access=ACCESS_WRITE) as s:
- pattern = (
- "Revision-number: %s" % (last_dumped_rev + 1)
- ).encode()
- n = s.rfind(pattern)
- if n != -1:
- s.resize(n)
self.truncated_dump = True
- return dump_path
+ return dump_path, last_dumped_rev
elif last_dumped_rev != -1 and last_dumped_rev < last_loaded_svn_rev:
raise Exception(
(
@@ -809,7 +795,7 @@
# Then try to generate a dump file containing relevant svn revisions
# to load, an exception will be thrown if something wrong happened
- dump_path = self.dump_svn_revisions(self.svn_url, last_loaded_svn_rev)
+ dump_path, max_rev = self.dump_svn_revisions(self.svn_url, last_loaded_svn_rev)
# Finally, mount the dump and load the repository
self.log.debug('Mounting dump file with "svnadmin load".')
@@ -818,6 +804,8 @@
prefix=TEMPORARY_DIR_PREFIX_PATTERN,
suffix="-%s" % os.getpid(),
root_dir=self.temp_dir,
+ gzip=True,
+ max_rev=max_rev,
)
self.svn_url = "file://%s" % self.repo_path
super().prepare()
diff --git a/swh/loader/svn/tests/test_loader.py b/swh/loader/svn/tests/test_loader.py
--- a/swh/loader/svn/tests/test_loader.py
+++ b/swh/loader/svn/tests/test_loader.py
@@ -2092,7 +2092,7 @@
# init remote dump loader and mock some methods
loader = SvnLoaderFromRemoteDump(swh_storage, repo_url, temp_directory=tmp_path)
- loader.dump_svn_revisions = mocker.MagicMock()
+ loader.dump_svn_revisions = mocker.MagicMock(return_value=("", -1))
loader.start_from = mocker.MagicMock(return_value=(0, 0))
# prepare loading
diff --git a/swh/loader/svn/tests/test_utils.py b/swh/loader/svn/tests/test_utils.py
--- a/swh/loader/svn/tests/test_utils.py
+++ b/swh/loader/svn/tests/test_utils.py
@@ -7,12 +7,14 @@
import os
from pathlib import Path
import pty
+import re
import shutil
-from subprocess import Popen
+from subprocess import Popen, run
import pytest
from swh.loader.svn import utils
+from swh.loader.tests import prepare_repository_from_archive
def test_outputstream():
@@ -106,6 +108,44 @@
assert mock_remove.called
+def test_init_svn_repo_from_truncated_dump(datadir, tmp_path):
+ """Mounting a partial svn repository from a truncated dump should work"""
+
+ # prepare a repository
+ archive_name = "pkg-gourmet"
+ archive_path = os.path.join(datadir, f"{archive_name}.tgz")
+ repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
+
+ # dump it to file
+ dump_path = str(tmp_path / f"{archive_name}.dump")
+ truncated_dump_path = str(tmp_path / f"{archive_name}_truncated.dump")
+ svnrdump_cmd = ["svnrdump", "dump", repo_url]
+ with open(dump_path, "wb") as dump:
+ run(svnrdump_cmd, stdout=dump)
+
+ # create a truncated dump file that will generate a "svnadmin load" error
+ with open(dump_path, "rb") as dump, open(
+ truncated_dump_path, "wb"
+ ) as truncated_dump:
+ dump_lines = dump.readlines()
+ assert len(dump_lines) > 150
+ truncated_dump_content = b"".join(dump_lines[:150])
+ truncated_dump.write(truncated_dump_content)
+
+ # compute max revision number with non-truncated data: the last revision
+ # found in the truncated content may be incomplete, so use the previous one
+ revs = re.findall(rb"Revision-number: ([0-9]+)", truncated_dump_content)
+ max_rev = int(revs[-1]) - 1
+
+ # prepare repository from truncated dump
+ _, repo_path = utils.init_svn_repo_from_dump(
+ truncated_dump_path, gzip=False, root_dir=tmp_path, max_rev=max_rev
+ )
+
+ # check expected number of revisions have been loaded
+ svnadmin_info = run(["svnadmin", "info", repo_path], capture_output=True, text=True)
+ assert f"Revisions: {max_rev}\n" in svnadmin_info.stdout
+
+
def test_init_svn_repo_from_archive_dump(datadir, tmp_path):
"""Mounting svn repository out of an archive dump is ok"""
dump_name = "penguinsdbtools2018.dump.gz"
diff --git a/swh/loader/svn/utils.py b/swh/loader/svn/utils.py
--- a/swh/loader/svn/utils.py
+++ b/swh/loader/svn/utils.py
@@ -65,6 +65,7 @@
root_dir: str = "/tmp",
gzip: bool = False,
cleanup_dump: bool = True,
+ max_rev: int = -1,
) -> Tuple[str, str]:
"""Given a path to a svn dump, initialize an svn repository with the content of said
dump.
@@ -110,14 +111,24 @@
# load dump and bypass properties validation as Unicode decoding errors
# are already handled in loader implementation (see _ra_codecs_error_handler
# in ra.py)
- cmd = ["svnadmin", "load", "-q", "--bypass-prop-validation", repo_path]
- completed_process = run(
- cmd, stdin=dump.stdout, capture_output=True, text=True
- )
- if completed_process.returncode != 0:
+ cmd = ["svnadmin", "load", "-q", "--bypass-prop-validation"]
+ if max_rev > 0:
+ cmd.append(f"-r1:{max_rev}")
+ cmd.append(repo_path)
+ svnadmin_load = run(cmd, stdin=dump.stdout, capture_output=True, text=True)
+ if svnadmin_load.returncode != 0:
+ if max_rev > 0:
+ # if max_rev is specified, we might have a truncated dump due to
+ # an error when executing svnrdump; check whether revision max_rev has
+ # been loaded and continue the loading process if that is the case
+ svnadmin_info = run(
+ ["svnadmin", "info", repo_path], capture_output=True, text=True
+ )
+ if f"Revisions: {max_rev}\n" in svnadmin_info.stdout:
+ return temp_dir, repo_path
raise ValueError(
f"Failed to mount the svn dump for project {project_name}\n"
- + completed_process.stderr
+ + svnadmin_load.stderr
)
return temp_dir, repo_path
except Exception as e:

File Metadata

Mime Type
text/plain
Expires
Nov 5 2024, 6:13 PM (11 w, 17 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3224987

Event Timeline