Page Menu | Home | Software Heritage

D439.diff
No OneTemporary

D439.diff

diff --git a/swh/loader/svn/loader.py b/swh/loader/svn/loader.py
--- a/swh/loader/svn/loader.py
+++ b/swh/loader/svn/loader.py
@@ -9,9 +9,13 @@
"""
import os
+import re
import shutil
import tempfile
+from mmap import mmap, ACCESS_WRITE
+from subprocess import run, PIPE
+
from swh.model import hashutil
from swh.model.from_disk import Directory
from swh.model.identifiers import identifier_to_bytes, revision_identifier
@@ -20,7 +24,9 @@
from swh.loader.core.utils import clean_dangling_folders
from . import svn, converters
-from .utils import init_svn_repo_from_archive_dump
+from .utils import (
+ init_svn_repo_from_dump, init_svn_repo_from_archive_dump
+)
from .exception import SvnLoaderUneventful
from .exception import SvnLoaderHistoryAltered
@@ -597,3 +603,135 @@
self.temp_dir, os.path.basename(self.repo_path))
self.log.debug(msg)
shutil.rmtree(self.temp_dir)
+
+
+class SvnLoaderFromRemoteDump(SvnLoader):
+ """
+ Create a subversion repository dump using the svnrdump utility,
+ mount it locally and load the repository from it.
+ """
+ def __init__(self):
+ super().__init__()
+ self.temp_dir = tempfile.mkdtemp(dir=self.temp_directory)
+ self.repo_path = None
+ self.truncated_dump = False
+
+ def get_last_loaded_svn_rev(self, svn_url):
+ """
+ Check if the svn repository has already been visited
+ and return the last loaded svn revision number or -1
+ otherwise.
+ """
+ last_loaded_svn_rev = -1
+ try:
+ origin = \
+ self.storage.origin_get({'type': 'svn', 'url': svn_url})
+ last_swh_rev = \
+ self.swh_latest_snapshot_revision(origin['id'])['revision']
+ last_swh_rev_headers = \
+ dict(last_swh_rev['metadata']['extra_headers'])
+ last_loaded_svn_rev = int(last_swh_rev_headers['svn_revision'])
+ except Exception:
+ pass
+ return last_loaded_svn_rev
+
+ def dump_svn_revisions(self, svn_url, last_loaded_svn_rev=-1):
+ """
+ Generate a subversion dump file using the svnrdump tool.
+ If the svnrdump command failed somehow,
+ the produced dump file is analyzed to determine if a partial
+ loading is still feasible.
+ """
+ # Build the svnrdump command line
+ svnrdump_cmd = ['svnrdump', 'dump', svn_url]
+
+ # Launch the svnrdump command while capturing stderr as
+ # successfully dumped revision numbers are printed to it
+ dump_temp_dir = tempfile.mkdtemp(dir=self.temp_dir)
+ dump_name = ''.join(c for c in svn_url if c.isalnum())
+ dump_path = '%s/%s.svndump' % (dump_temp_dir, dump_name)
+ self.log.debug('Executing %s' % ' '.join(svnrdump_cmd))
+ with open(dump_path, 'wb') as dump_file:
+ svnrdump = run(svnrdump_cmd, stdout=dump_file, stderr=PIPE)
+
+ if svnrdump.returncode == 0:
+ return dump_path
+
+ # There was an error but it does not mean that no revisions
+ # can be loaded.
+
+ # Get the stderr line with the latest dumped revision
+ stderr_lines = svnrdump.stderr.split(b'\n')
+ last_dumped_rev = None
+ if len(stderr_lines) > 1:
+ last_dumped_rev = stderr_lines[-2]
+
+ if last_dumped_rev:
+ # Get the latest dumped revision number
+ matched_rev = re.search(b'.*revision ([0-9]+)', last_dumped_rev)
+ last_dumped_rev = int(matched_rev.group(1)) if matched_rev else -1
+ # Check if revisions inside the dump file can be loaded anyway
+ if last_dumped_rev > last_loaded_svn_rev:
+ self.log.debug(('svnrdump did not dump all expected revisions '
+ 'but revisions range %s:%s are available in '
+ 'the generated dump file and will be loaded '
+ 'into the archive.') % (last_loaded_svn_rev+1,
+ last_dumped_rev))
+ # Truncate the dump file after the last successfully dumped
+ # revision to avoid the loading of corrupted data
+ self.log.debug(('Truncating dump file after the last '
+ 'successfully dumped revision (%s) to avoid '
+ 'the loading of corrupted data')
+ % last_dumped_rev)
+
+ with open(dump_path, 'r+b') as f:
+ with mmap(f.fileno(), 0, access=ACCESS_WRITE) as s:
+ pattern = ('Revision-number: %s' %
+ (last_dumped_rev+1)).encode()
+ n = s.rfind(pattern)
+ if n != -1:
+ s.resize(n)
+ self.truncated_dump = True
+ return dump_path
+ elif last_dumped_rev != -1:
+ raise Exception(('Last dumped subversion revision (%s) is '
+ 'lesser than the last one loaded into the '
+ 'archive (%s).') % (last_dumped_rev,
+ last_loaded_svn_rev))
+
+ raise Exception('An error occured when running svnrdump and '
+ 'no exploitable dump file has been generated.')
+
+ def prepare(self, *, svn_url, destination_path=None,
+ swh_revision=None, start_from_scratch=False, **kwargs):
+ # First, check if previous revisions have been loaded for the
+ # subversion origin and get the number of the last one
+ last_loaded_svn_rev = self.get_last_loaded_svn_rev(svn_url)
+
+ # Then try to generate a dump file containing relevant svn revisions
+ # to load; an exception is raised if something goes wrong
+ dump_path = self.dump_svn_revisions(svn_url, last_loaded_svn_rev)
+
+ # Finally, mount the dump and load the repository
+ self.log.debug('Mounting dump file with "svnadmin load".')
+ _, self.repo_path = \
+ init_svn_repo_from_dump(dump_path,
+ prefix=TEMPORARY_DIR_PREFIX_PATTERN,
+ suffix='-%s' % os.getpid(),
+ root_dir=self.temp_dir)
+ super().prepare(svn_url='file://%s' % self.repo_path,
+ destination_path=destination_path,
+ swh_revision=swh_revision,
+ start_from_scratch=start_from_scratch,
+ **kwargs)
+
+ def cleanup(self):
+ super().cleanup()
+ if self.temp_dir and os.path.exists(self.temp_dir):
+ shutil.rmtree(self.temp_dir)
+
+ def visit_status(self):
+ if self.truncated_dump:
+ return 'partial'
+ else:
+ return super().visit_status()
diff --git a/swh/loader/svn/tasks.py b/swh/loader/svn/tasks.py
--- a/swh/loader/svn/tasks.py
+++ b/swh/loader/svn/tasks.py
@@ -3,9 +3,12 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+
from swh.scheduler.task import Task
-from .loader import SvnLoader, SvnLoaderFromDumpArchive
+from .loader import (
+ SvnLoader, SvnLoaderFromDumpArchive, SvnLoaderFromRemoteDump
+)
class LoadSvnRepository(Task):
@@ -67,3 +70,26 @@
visit_date=visit_date,
archive_path=archive_path,
start_from_scratch=start_from_scratch)
+
+
+class DumpMountAndLoadSvnRepository(Task):
+ """
+ Create a dump of a remote repository through the svnrdump
+ tool, mount it locally then load the repository into the
+ Software Heritage archive.
+ """
+ task_queue = 'swh_loader_svn_dump_mount_and_load'
+
+ def run_task(self, *, svn_url, origin_url=None, visit_date=None,
+ start_from_scratch=False):
+ """1. Dump the remote svn repository to a local file with svnrdump.
+ 2. Mount the dump locally and load it through the svn loader.
+ 3. Clean up the temporary dump file and mounted svn repository.
+
+ """
+ loader = SvnLoaderFromRemoteDump()
+ loader.log = self.log
+ return loader.load(svn_url=svn_url,
+ origin_url=origin_url,
+ visit_date=visit_date,
+ start_from_scratch=start_from_scratch)
diff --git a/swh/loader/svn/tests/test_loader.py b/swh/loader/svn/tests/test_loader.py
--- a/swh/loader/svn/tests/test_loader.py
+++ b/swh/loader/svn/tests/test_loader.py
@@ -10,7 +10,7 @@
from swh.model import hashutil
from swh.loader.svn.loader import build_swh_snapshot, DEFAULT_BRANCH
-from swh.loader.svn.loader import SvnLoader
+from swh.loader.svn.loader import SvnLoader, SvnLoaderFromRemoteDump
class TestSnapshot(TestCase):
@@ -928,3 +928,48 @@
# FIXME: Check the snapshot's state
self.assertEqual(self.loader.load_status(), {'status': 'eventful'})
self.assertEqual(self.loader.visit_status(), 'full')
+
+
+class SvnLoaderFromRemoteDumpNoStorage(LoaderNoStorage, LoaderWithState,
+ SvnLoaderFromRemoteDump):
+ """A SvnLoaderFromRemoteDump with no persistence.
+
+ Context:
+ Load a remote svn repository from a generated dump file.
+
+ """
+
+ def swh_latest_snapshot_revision(self, origin_id, prev_swh_revision=None):
+ """We do not know this repository so no revision.
+
+ """
+ return {}
+
+
+class SvnLoaderFromRemoteDump(BaseSvnLoaderTest):
+ """
+ Check that the results obtained with the remote svn dump loader
+ and the base svn loader are the same.
+ """
+ def setUp(self):
+ super().setUp(archive_name='pkg-gourmet.tgz')
+
+ @istest
+ def load(self):
+ """
+ Compare results of remote dump loader and base loader
+ """
+ dump_loader = SvnLoaderFromRemoteDumpNoStorage()
+ dump_loader.load(svn_url=self.svn_mirror_url)
+
+ base_loader = SvnLoaderNoStorage()
+ base_loader.load(svn_url=self.svn_mirror_url)
+
+ self.assertEqual(dump_loader.all_contents,
+ base_loader.all_contents)
+ self.assertEqual(dump_loader.all_directories,
+ base_loader.all_directories)
+ self.assertEqual(dump_loader.all_revisions,
+ base_loader.all_revisions)
+ self.assertEqual(dump_loader.all_snapshots,
+ base_loader.all_snapshots)
diff --git a/swh/loader/svn/utils.py b/swh/loader/svn/utils.py
--- a/swh/loader/svn/utils.py
+++ b/swh/loader/svn/utils.py
@@ -33,9 +33,9 @@
return ts
-def init_svn_repo_from_archive_dump(archive_path, prefix=None, suffix=None,
- root_dir='/tmp'):
- """Given a path to an archive containing an svn dump.
+def init_svn_repo_from_dump(dump_path, prefix=None, suffix=None,
+ root_dir='/tmp', gzip=False):
+ """Given a path to an svn dump (gzip-compressed if gzip is True).
Initialize an svn repository with the content of said dump.
Returns:
@@ -49,7 +49,7 @@
and load the dump.
"""
- project_name = os.path.basename(os.path.dirname(archive_path))
+ project_name = os.path.basename(os.path.dirname(dump_path))
temp_dir = tempfile.mkdtemp(prefix=prefix, suffix=suffix, dir=root_dir)
try:
@@ -63,7 +63,11 @@
'Failed to initialize empty svn repo for %s' %
project_name)
- with Popen(['gzip', '-dc', archive_path], stdout=PIPE) as dump:
+ read_dump_cmd = ['cat', dump_path]
+ if gzip:
+ read_dump_cmd = ['gzip', '-dc', dump_path]
+
+ with Popen(read_dump_cmd, stdout=PIPE) as dump:
cmd = ['svnadmin', 'load', '-q', repo_path]
r = call(cmd, stdin=dump.stdout)
if r != 0:
@@ -74,3 +78,23 @@
except Exception as e:
shutil.rmtree(temp_dir)
raise e
+
+
+def init_svn_repo_from_archive_dump(archive_path, prefix=None, suffix=None,
+ root_dir='/tmp'):
+ """Given a path to an archive containing an svn dump.
+ Initialize an svn repository with the content of said dump.
+
+ Returns:
+ A tuple:
+ - temporary folder (str): containing the mounted repository
+ - repo_path (str): path to the mounted repository inside the
+ temporary folder
+
+ Raises:
+ ValueError in case of failure to run the command to uncompress
+ and load the dump.
+
+ """
+ return init_svn_repo_from_dump(archive_path, prefix=prefix, suffix=suffix,
+ root_dir=root_dir, gzip=True)

File Metadata

Mime Type
text/plain
Expires
Nov 5 2024, 5:30 AM (8 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3228986

Event Timeline