Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7066299
D439.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
12 KB
Subscribers
None
D439.diff
View Options
diff --git a/swh/loader/svn/loader.py b/swh/loader/svn/loader.py
--- a/swh/loader/svn/loader.py
+++ b/swh/loader/svn/loader.py
@@ -9,9 +9,13 @@
"""
import os
+import re
import shutil
import tempfile
+from mmap import mmap, ACCESS_WRITE
+from subprocess import run, PIPE
+
from swh.model import hashutil
from swh.model.from_disk import Directory
from swh.model.identifiers import identifier_to_bytes, revision_identifier
@@ -20,7 +24,9 @@
from swh.loader.core.utils import clean_dangling_folders
from . import svn, converters
-from .utils import init_svn_repo_from_archive_dump
+from .utils import (
+ init_svn_repo_from_dump, init_svn_repo_from_archive_dump
+)
from .exception import SvnLoaderUneventful
from .exception import SvnLoaderHistoryAltered
@@ -597,3 +603,135 @@
self.temp_dir, os.path.basename(self.repo_path))
self.log.debug(msg)
shutil.rmtree(self.temp_dir)
+
+
+class SvnLoaderFromRemoteDump(SvnLoader):
+ """
+ Create a subversion repository dump using the svnrdump utility,
+ mount it locally and load the repository from it.
+ """
+ def __init__(self):
+ super().__init__()
+ self.temp_dir = tempfile.mkdtemp(dir=self.temp_directory)
+ self.repo_path = None
+ self.truncated_dump = False
+
+ def get_last_loaded_svn_rev(self, svn_url):
+ """
+ Check if the svn repository has already been visited
+ and return the last loaded svn revision number or -1
+ otherwise.
+ """
+ last_loaded_svn_rev = -1
+ try:
+ origin = \
+ self.storage.origin_get({'type': 'svn', 'url': svn_url})
+ last_swh_rev = \
+ self.swh_latest_snapshot_revision(origin['id'])['revision']
+ last_swh_rev_headers = \
+ dict(last_swh_rev['metadata']['extra_headers'])
+ last_loaded_svn_rev = int(last_swh_rev_headers['svn_revision'])
+ except Exception:
+ pass
+ return last_loaded_svn_rev
+
+ def dump_svn_revisions(self, svn_url, last_loaded_svn_rev=-1):
+ """
+ Generate a subversion dump file using the svnrdump tool.
+ If the svnrdump command fails, its stderr output is analyzed to
+ determine whether the revisions dumped so far can still be
+ loaded (partial loading).
+ """
+ # Build the svnrdump command line
+ svnrdump_cmd = ['svnrdump', 'dump', svn_url]
+
+ # Launch the svnrdump command while capturing stderr as
+ # successfully dumped revision numbers are printed to it
+ dump_temp_dir = tempfile.mkdtemp(dir=self.temp_dir)
+ dump_name = ''.join(c for c in svn_url if c.isalnum())
+ dump_path = '%s/%s.svndump' % (dump_temp_dir, dump_name)
+ self.log.debug('Executing %s' % ' '.join(svnrdump_cmd))
+ with open(dump_path, 'wb') as dump_file:
+ svnrdump = run(svnrdump_cmd, stdout=dump_file, stderr=PIPE)
+
+ if svnrdump.returncode == 0:
+ return dump_path
+
+ # There was an error but it does not mean that no revisions
+ # can be loaded.
+
+ # Get the stderr line with latest dumped revision
+ stderr_lines = svnrdump.stderr.split(b'\n')
+ last_dumped_rev = None
+ if len(stderr_lines) > 1:
+ last_dumped_rev = stderr_lines[-2]
+
+ if last_dumped_rev:
+ # Get the latest dumped revision number
+ matched_rev = re.search(b'.*revision ([0-9]+)', last_dumped_rev)
+ last_dumped_rev = int(matched_rev.group(1)) if matched_rev else -1
+ # Check if revisions inside the dump file can be loaded anyway
+ if last_dumped_rev > last_loaded_svn_rev:
+ self.log.debug(('svnrdump did not dump all expected revisions '
+ 'but revisions range %s:%s are available in '
+ 'the generated dump file and will be loaded '
+ 'into the archive.') % (last_loaded_svn_rev+1,
+ last_dumped_rev))
+ # Truncate the dump file after the last successfully dumped
+ # revision to avoid the loading of corrupted data
+ self.log.debug(('Truncating dump file after the last '
+ 'successfully dumped revision (%s) to avoid '
+ 'the loading of corrupted data')
+ % last_dumped_rev)
+
+ with open(dump_path, 'r+b') as f:
+ with mmap(f.fileno(), 0, access=ACCESS_WRITE) as s:
+ pattern = ('Revision-number: %s' %
+ (last_dumped_rev+1)).encode()
+ n = s.rfind(pattern)
+ if n != -1:
+ s.resize(n)
+ self.truncated_dump = True
+ return dump_path
+ elif last_dumped_rev != -1:
+ raise Exception(('Last dumped subversion revision (%s) is '
+ 'lesser than the last one loaded into the '
+ 'archive (%s).') % (last_dumped_rev,
+ last_loaded_svn_rev))
+
+ raise Exception('An error occured when running svnrdump and '
+ 'no exploitable dump file has been generated.')
+
+ def prepare(self, *, svn_url, destination_path=None,
+ swh_revision=None, start_from_scratch=False, **kwargs):
+ # First, check if previous revisions have been loaded for the
+ # subversion origin and get the number of the last one
+ last_loaded_svn_rev = self.get_last_loaded_svn_rev(svn_url)
+
+ # Then try to generate a dump file containing relevant svn revisions
+ # to load; an exception is raised if something goes wrong
+ dump_path = self.dump_svn_revisions(svn_url, last_loaded_svn_rev)
+
+ # Finally, mount the dump and load the repository
+ self.log.debug('Mounting dump file with "svnadmin load".')
+ _, self.repo_path = \
+ init_svn_repo_from_dump(dump_path,
+ prefix=TEMPORARY_DIR_PREFIX_PATTERN,
+ suffix='-%s' % os.getpid(),
+ root_dir=self.temp_dir)
+ super().prepare(svn_url='file://%s' % self.repo_path,
+ destination_path=destination_path,
+ swh_revision=swh_revision,
+ start_from_scratch=start_from_scratch,
+ **kwargs)
+
+ def cleanup(self):
+ super().cleanup()
+ if self.temp_dir and os.path.exists(self.temp_dir):
+ shutil.rmtree(self.temp_dir)
+
+ def visit_status(self):
+ if self.truncated_dump:
+ return 'partial'
+ else:
+ return super().visit_status()
diff --git a/swh/loader/svn/tasks.py b/swh/loader/svn/tasks.py
--- a/swh/loader/svn/tasks.py
+++ b/swh/loader/svn/tasks.py
@@ -3,9 +3,12 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+
from swh.scheduler.task import Task
-from .loader import SvnLoader, SvnLoaderFromDumpArchive
+from .loader import (
+ SvnLoader, SvnLoaderFromDumpArchive, SvnLoaderFromRemoteDump
+)
class LoadSvnRepository(Task):
@@ -67,3 +70,26 @@
visit_date=visit_date,
archive_path=archive_path,
start_from_scratch=start_from_scratch)
+
+
+class DumpMountAndLoadSvnRepository(Task):
+ """
+ Create a dump of a remote repository through the svnrdump
+ tool, mount it locally then load the repository into the
+ Software Heritage archive.
+ """
+ task_queue = 'swh_loader_svn_dump_mount_and_load'
+
+ def run_task(self, *, svn_url, origin_url=None, visit_date=None,
+ start_from_scratch=False):
+ """1. Dump the remote svn repository and mount it locally.
+ 2. Load it through the svn loader.
+ 3. Clean up the dump file and the mounted svn repository.
+
+ """
+ loader = SvnLoaderFromRemoteDump()
+ loader.log = self.log
+ return loader.load(svn_url=svn_url,
+ origin_url=origin_url,
+ visit_date=visit_date,
+ start_from_scratch=start_from_scratch)
diff --git a/swh/loader/svn/tests/test_loader.py b/swh/loader/svn/tests/test_loader.py
--- a/swh/loader/svn/tests/test_loader.py
+++ b/swh/loader/svn/tests/test_loader.py
@@ -10,7 +10,7 @@
from swh.model import hashutil
from swh.loader.svn.loader import build_swh_snapshot, DEFAULT_BRANCH
-from swh.loader.svn.loader import SvnLoader
+from swh.loader.svn.loader import SvnLoader, SvnLoaderFromRemoteDump
class TestSnapshot(TestCase):
@@ -928,3 +928,48 @@
# FIXME: Check the snapshot's state
self.assertEqual(self.loader.load_status(), {'status': 'eventful'})
self.assertEqual(self.loader.visit_status(), 'full')
+
+
+class SvnLoaderFromRemoteDumpNoStorage(LoaderNoStorage, LoaderWithState,
+ SvnLoaderFromRemoteDump):
+ """A SvnLoaderFromRemoteDump with no persistence.
+
+ Context:
+ Load a remote svn repository from a generated dump file.
+
+ """
+
+ def swh_latest_snapshot_revision(self, origin_id, prev_swh_revision=None):
+ """We do not know this repository so no revision.
+
+ """
+ return {}
+
+
+class SvnLoaderFromRemoteDump(BaseSvnLoaderTest):
+ """
+ Check that the results obtained with the remote svn dump loader
+ and the base svn loader are the same.
+ """
+ def setUp(self):
+ super().setUp(archive_name='pkg-gourmet.tgz')
+
+ @istest
+ def load(self):
+ """
+ Compare results of remote dump loader and base loader
+ """
+ dump_loader = SvnLoaderFromRemoteDumpNoStorage()
+ dump_loader.load(svn_url=self.svn_mirror_url)
+
+ base_loader = SvnLoaderNoStorage()
+ base_loader.load(svn_url=self.svn_mirror_url)
+
+ self.assertEqual(dump_loader.all_contents,
+ base_loader.all_contents)
+ self.assertEqual(dump_loader.all_directories,
+ base_loader.all_directories)
+ self.assertEqual(dump_loader.all_revisions,
+ base_loader.all_revisions)
+ self.assertEqual(dump_loader.all_snapshots,
+ base_loader.all_snapshots)
diff --git a/swh/loader/svn/utils.py b/swh/loader/svn/utils.py
--- a/swh/loader/svn/utils.py
+++ b/swh/loader/svn/utils.py
@@ -33,9 +33,9 @@
return ts
-def init_svn_repo_from_archive_dump(archive_path, prefix=None, suffix=None,
- root_dir='/tmp'):
- """Given a path to an archive containing an svn dump.
+def init_svn_repo_from_dump(dump_path, prefix=None, suffix=None,
+ root_dir='/tmp', gzip=False):
+ """Given a path to an svn dump (gzip-compressed if ``gzip`` is True).
Initialize an svn repository with the content of said dump.
Returns:
@@ -49,7 +49,7 @@
and load the dump.
"""
- project_name = os.path.basename(os.path.dirname(archive_path))
+ project_name = os.path.basename(os.path.dirname(dump_path))
temp_dir = tempfile.mkdtemp(prefix=prefix, suffix=suffix, dir=root_dir)
try:
@@ -63,7 +63,11 @@
'Failed to initialize empty svn repo for %s' %
project_name)
- with Popen(['gzip', '-dc', archive_path], stdout=PIPE) as dump:
+ read_dump_cmd = ['cat', dump_path]
+ if gzip:
+ read_dump_cmd = ['gzip', '-dc', dump_path]
+
+ with Popen(read_dump_cmd, stdout=PIPE) as dump:
cmd = ['svnadmin', 'load', '-q', repo_path]
r = call(cmd, stdin=dump.stdout)
if r != 0:
@@ -74,3 +78,23 @@
except Exception as e:
shutil.rmtree(temp_dir)
raise e
+
+
+def init_svn_repo_from_archive_dump(archive_path, prefix=None, suffix=None,
+ root_dir='/tmp'):
+ """Given a path to an archive containing an svn dump.
+ Initialize an svn repository with the content of said dump.
+
+ Returns:
+ A tuple:
+ - temporary folder (str): containing the mounted repository
+ - repo_path (str): path to the mounted repository inside the
+ temporary folder
+
+ Raises:
+ ValueError in case of failure to run the command to uncompress
+ and load the dump.
+
+ """
+ return init_svn_repo_from_dump(archive_path, prefix=prefix, suffix=suffix,
+ root_dir=root_dir, gzip=True)
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Nov 5 2024, 5:30 AM (8 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3228986
Attached To
D439: SVN loader: Add efficient loader based on remote dumps
Event Timeline
Log In to Comment