Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7066680
D8787.id.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
9 KB
Subscribers
None
D8787.id.diff
View Options
diff --git a/swh/loader/svn/loader.py b/swh/loader/svn/loader.py
--- a/swh/loader/svn/loader.py
+++ b/swh/loader/svn/loader.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-2021 The Software Heritage developers
+# Copyright (C) 2015-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -8,12 +8,11 @@
"""
from datetime import datetime
-from mmap import ACCESS_WRITE, mmap
import os
import pty
import re
import shutil
-from subprocess import Popen
+from subprocess import PIPE, Popen
import tempfile
from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple
@@ -657,17 +656,19 @@
pass
return svn_revision
- def dump_svn_revisions(self, svn_url: str, last_loaded_svn_rev: int = -1) -> str:
- """Generate a subversion dump file using the svnrdump tool. If the svnrdump
- command failed somehow, the produced dump file is analyzed to determine if a
- partial loading is still feasible.
+ def dump_svn_revisions(
+ self, svn_url: str, last_loaded_svn_rev: int = -1
+ ) -> Tuple[str, int]:
+ """Generate a compressed subversion dump file using the svnrdump tool and gzip.
+ If the svnrdump command failed somehow, the produced dump file is analyzed to
+ determine if a partial loading is still feasible.
Raises:
NotFound when the repository is no longer found at url
Returns:
- The dump_path of the repository mounted
-
+ The path of the compressed dump file and the max dumped revision number
+ (-1 if all revisions were dumped)
"""
# Build the svnrdump command line
svnrdump_cmd = ["svnrdump", "dump", svn_url]
@@ -684,12 +685,13 @@
# successfully dumped revision numbers are printed to it
dump_temp_dir = tempfile.mkdtemp(dir=self.temp_dir)
dump_name = "".join(c for c in svn_url if c.isalnum())
- dump_path = "%s/%s.svndump" % (dump_temp_dir, dump_name)
+ dump_path = "%s/%s.svndump.gz" % (dump_temp_dir, dump_name)
stderr_lines = []
self.log.debug("Executing %s", " ".join(svnrdump_cmd))
with open(dump_path, "wb") as dump_file:
+ gzip = Popen(["gzip"], stdin=PIPE, stdout=dump_file)
stderr_r, stderr_w = pty.openpty()
- svnrdump = Popen(svnrdump_cmd, stdout=dump_file, stderr=stderr_w)
+ svnrdump = Popen(svnrdump_cmd, stdout=gzip.stdin, stderr=stderr_w)
os.close(stderr_w)
stderr_stream = OutputStream(stderr_r)
readable = True
@@ -706,9 +708,12 @@
error_messages.append(line)
svnrdump.wait()
os.close(stderr_r)
+ # close gzip's stdin so it sees end of input and can finish writing the dump
+ gzip.stdin.close()
+ gzip.wait()
if svnrdump.returncode == 0:
- return dump_path
+ return dump_path, -1
# There was an error but it does not mean that no revisions
# can be loaded.
@@ -736,27 +741,8 @@
last_loaded_svn_rev + 1,
last_dumped_rev,
)
- # Truncate the dump file after the last successfully dumped
- # revision to avoid the loading of corrupted data
- self.log.debug(
- (
- "Truncating dump file after the last "
- "successfully dumped revision (%s) to avoid "
- "the loading of corrupted data"
- ),
- last_dumped_rev,
- )
-
- with open(dump_path, "r+b") as f:
- with mmap(f.fileno(), 0, access=ACCESS_WRITE) as s:
- pattern = (
- "Revision-number: %s" % (last_dumped_rev + 1)
- ).encode()
- n = s.rfind(pattern)
- if n != -1:
- s.resize(n)
self.truncated_dump = True
- return dump_path
+ return dump_path, last_dumped_rev
elif last_dumped_rev != -1 and last_dumped_rev < last_loaded_svn_rev:
raise Exception(
(
@@ -809,7 +795,7 @@
# Then try to generate a dump file containing relevant svn revisions
# to load, an exception will be thrown if something wrong happened
- dump_path = self.dump_svn_revisions(self.svn_url, last_loaded_svn_rev)
+ dump_path, max_rev = self.dump_svn_revisions(self.svn_url, last_loaded_svn_rev)
# Finally, mount the dump and load the repository
self.log.debug('Mounting dump file with "svnadmin load".')
@@ -818,6 +804,8 @@
prefix=TEMPORARY_DIR_PREFIX_PATTERN,
suffix="-%s" % os.getpid(),
root_dir=self.temp_dir,
+ gzip=True,
+ max_rev=max_rev,
)
self.svn_url = "file://%s" % self.repo_path
super().prepare()
diff --git a/swh/loader/svn/tests/test_loader.py b/swh/loader/svn/tests/test_loader.py
--- a/swh/loader/svn/tests/test_loader.py
+++ b/swh/loader/svn/tests/test_loader.py
@@ -2092,7 +2092,7 @@
# init remote dump loader and mock some methods
loader = SvnLoaderFromRemoteDump(swh_storage, repo_url, temp_directory=tmp_path)
- loader.dump_svn_revisions = mocker.MagicMock()
+ loader.dump_svn_revisions = mocker.MagicMock(return_value=("", -1))
loader.start_from = mocker.MagicMock(return_value=(0, 0))
# prepare loading
diff --git a/swh/loader/svn/tests/test_utils.py b/swh/loader/svn/tests/test_utils.py
--- a/swh/loader/svn/tests/test_utils.py
+++ b/swh/loader/svn/tests/test_utils.py
@@ -7,12 +7,14 @@
import os
from pathlib import Path
import pty
+import re
import shutil
-from subprocess import Popen
+from subprocess import Popen, run
import pytest
from swh.loader.svn import utils
+from swh.loader.tests import prepare_repository_from_archive
def test_outputstream():
@@ -106,6 +108,44 @@
assert mock_remove.called
+def test_init_svn_repo_from_truncated_dump(datadir, tmp_path):
+ """Mounting a partial svn repository from a truncated dump should work"""
+
+ # prepare a repository
+ archive_name = "pkg-gourmet"
+ archive_path = os.path.join(datadir, f"{archive_name}.tgz")
+ repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
+
+ # dump it to file
+ dump_path = str(tmp_path / f"{archive_name}.dump")
+ truncated_dump_path = str(tmp_path / f"{archive_name}_truncated.dump")
+ svnrdump_cmd = ["svnrdump", "dump", repo_url]
+ with open(dump_path, "wb") as dump:
+ run(svnrdump_cmd, stdout=dump)
+
+ # create a truncated dump file that will generate a "svnadmin load" error
+ with open(dump_path, "rb") as dump, open(
+ truncated_dump_path, "wb"
+ ) as truncated_dump:
+ dump_lines = dump.readlines()
+ assert len(dump_lines) > 150
+ truncated_dump_content = b"".join(dump_lines[:150])
+ truncated_dump.write(truncated_dump_content)
+
+ # compute the highest revision number whose data is not truncated
+ revs = re.findall(rb"Revision-number: ([0-9]+)", truncated_dump_content)
+ max_rev = int(revs[-1]) - 1
+
+ # prepare repository from truncated dump
+ _, repo_path = utils.init_svn_repo_from_dump(
+ truncated_dump_path, gzip=False, root_dir=tmp_path, max_rev=max_rev
+ )
+
+ # check expected number of revisions have been loaded
+ svnadmin_info = run(["svnadmin", "info", repo_path], capture_output=True, text=True)
+ assert f"Revisions: {max_rev}\n" in svnadmin_info.stdout
+
+
def test_init_svn_repo_from_archive_dump(datadir, tmp_path):
"""Mounting svn repository out of an archive dump is ok"""
dump_name = "penguinsdbtools2018.dump.gz"
diff --git a/swh/loader/svn/utils.py b/swh/loader/svn/utils.py
--- a/swh/loader/svn/utils.py
+++ b/swh/loader/svn/utils.py
@@ -65,6 +65,7 @@
root_dir: str = "/tmp",
gzip: bool = False,
cleanup_dump: bool = True,
+ max_rev: int = -1,
) -> Tuple[str, str]:
"""Given a path to a svn dump, initialize an svn repository with the content of said
dump.
@@ -110,14 +111,24 @@
# load dump and bypass properties validation as Unicode decoding errors
# are already handled in loader implementation (see _ra_codecs_error_handler
# in ra.py)
- cmd = ["svnadmin", "load", "-q", "--bypass-prop-validation", repo_path]
- completed_process = run(
- cmd, stdin=dump.stdout, capture_output=True, text=True
- )
- if completed_process.returncode != 0:
+ cmd = ["svnadmin", "load", "-q", "--bypass-prop-validation"]
+ if max_rev > 0:
+ cmd.append(f"-r1:{max_rev}")
+ cmd.append(repo_path)
+ svnadmin_load = run(cmd, stdin=dump.stdout, capture_output=True, text=True)
+ if svnadmin_load.returncode != 0:
+ if max_rev > 0:
+ # if max_rev is specified, we might have a truncated dump due to
+ # an error when executing svnrdump; check whether revision max_rev
+ # has been loaded and, if so, continue the loading process
+ svnadmin_info = run(
+ ["svnadmin", "info", repo_path], capture_output=True, text=True
+ )
+ if f"Revisions: {max_rev}\n" in svnadmin_info.stdout:
+ return temp_dir, repo_path
raise ValueError(
f"Failed to mount the svn dump for project {project_name}\n"
- + completed_process.stderr
+ + svnadmin_load.stderr
)
return temp_dir, repo_path
except Exception as e:
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Nov 5 2024, 6:13 PM (11 w, 17 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3224987
Attached To
D8787: loader: Compress dump file and rework truncated dump handling
Event Timeline
Log In to Comment