Page MenuHomeSoftware Heritage

D433.id1348.diff
No OneTemporary

D433.id1348.diff

diff --git a/debian/control b/debian/control
--- a/debian/control
+++ b/debian/control
@@ -29,6 +29,7 @@
python3-swh.scheduler (>= 0.0.14~),
python3-swh.storage (>= 0.0.83~),
subversion,
+ rsvndump,
${misc:Depends},
${python3:Depends}
Description: Software Heritage Loader Svn
diff --git a/swh/loader/svn/loader.py b/swh/loader/svn/loader.py
--- a/swh/loader/svn/loader.py
+++ b/swh/loader/svn/loader.py
@@ -9,9 +9,13 @@
"""
import os
+import re
import shutil
import tempfile
+from mmap import mmap, ACCESS_WRITE
+from subprocess import run, PIPE
+
from swh.model import hashutil
from swh.model.from_disk import Directory
from swh.model.identifiers import identifier_to_bytes, revision_identifier
@@ -20,7 +24,9 @@
from swh.loader.core.utils import clean_dangling_folders
from . import svn, converters
-from .utils import init_svn_repo_from_archive_dump
+from .utils import (
+ init_svn_repo_from_dump, init_svn_repo_from_archive_dump
+)
from .exception import SvnLoaderUneventful
from .exception import SvnLoaderHistoryAltered
@@ -571,3 +577,148 @@
self.temp_dir, os.path.basename(self.repo_path))
self.log.debug(msg)
shutil.rmtree(self.temp_dir)
+
+
+class SvnLoaderFromRemoteDump(SvnLoader):
+    """Load a remote subversion repository from a local dump of it.
+
+    The dump is created with the rsvndump utility, mounted locally,
+    then the repository is loaded from that local mount.
+    """
+    def __init__(self, svn_url):
+        super().__init__()
+        self.svn_url = svn_url
+        self.temp_dir = tempfile.mkdtemp(dir=self.temp_directory)
+        self.repo_path = None
+        # Set when rsvndump failed and only a partial dump is loaded
+        self.truncated_dump = False
+
+    def get_last_loaded_svn_rev(self, svn_url):
+        """Return the last svn revision number loaded for svn_url.
+
+        Returns -1 when it cannot be determined, e.g. because the
+        repository has not been visited yet.
+        """
+        try:
+            origin = self.storage.origin_get({'type': 'svn', 'url': svn_url})
+            rev = self.swh_latest_snapshot_revision(origin['id'])['revision']
+            headers = dict(rev['metadata']['extra_headers'])
+            return int(headers['svn_revision'])
+        except Exception as exc:
+            # Best-effort lookup: any failure means nothing usable was
+            # found, but keep a trace of the reason for debugging
+            self.log.debug('No loaded svn revision found for %s: %s',
+                           svn_url, exc)
+            return -1
+
+    def dump_svn_revisions(self, svn_url, last_loaded_svn_rev=-1):
+        """Generate a subversion dump file using the rsvndump tool.
+
+        If previous revisions have already been loaded, the dump file
+        only contains the new ones (empty revisions are created in the
+        dump file to pad them and keep the same revision numbering).
+        If rsvndump fails, the produced dump file is analyzed and
+        truncated when a partial loading is still feasible.
+
+        Returns:
+            str: path to the generated dump file
+
+        Raises:
+            Exception: if rsvndump failed and nothing new can be loaded
+        """
+        # Build the rsvndump command line
+        rsvndump_cmd = ['rsvndump']
+        if last_loaded_svn_rev >= 0:
+            # Only dump the revisions of interest; already loaded ones are
+            # padded to keep the original revision numbers in the dump file
+            rsvndump_cmd += ['-r', '%s:HEAD' % last_loaded_svn_rev,
+                             '--keep-revnums']
+        # Use deltas to get a smaller dump file
+        rsvndump_cmd += ['--deltas', svn_url]
+
+        # Launch the rsvndump command while capturing stderr as
+        # successfully dumped revision numbers are printed to it
+        dump_temp_dir = tempfile.mkdtemp(dir=self.temp_dir)
+        dump_name = ''.join(c for c in svn_url if c.isalnum())
+        dump_path = '%s/%s.svndump' % (dump_temp_dir, dump_name)
+        self.log.debug('Executing %s' % ' '.join(rsvndump_cmd))
+        with open(dump_path, 'wb') as dump_file:
+            rsvndump = run(rsvndump_cmd, stdout=dump_file, stderr=PIPE)
+
+        if rsvndump.returncode == 0:
+            return dump_path
+
+        # There was an error but it does not mean that no revisions can
+        # be loaded: get the stderr line with the latest dumped revision
+        stderr_lines = rsvndump.stderr.split(b'\n')
+        last_line = stderr_lines[-2] if len(stderr_lines) > 1 else None
+
+        if last_line:
+            # Get the latest dumped revision number
+            matched_rev = re.search(b'.*revision ([0-9]+)', last_line)
+            last_dumped_rev = int(matched_rev.group(1)) if matched_rev else -1
+            # Check if revisions inside the dump file can be loaded anyway
+            if last_dumped_rev > last_loaded_svn_rev:
+                self.log.debug(('rsvndump did not dump all expected revisions '
+                                'but revisions range %s:%s are available in '
+                                'the generated dump file and will be loaded '
+                                'into the archive.') % (last_loaded_svn_rev+1,
+                                                        last_dumped_rev))
+                self.log.debug(('Truncating dump file after the last '
+                                'successfully dumped revision (%s) to avoid '
+                                'the loading of corrupted data')
+                               % last_dumped_rev)
+                # Cut the file right before the header of the first
+                # revision that has not been successfully dumped
+                with open(dump_path, 'r+b') as f:
+                    with mmap(f.fileno(), 0, access=ACCESS_WRITE) as s:
+                        pattern = ('Revision-number: %s' %
+                                   (last_dumped_rev+1)).encode()
+                        n = s.rfind(pattern)
+                        if n != -1:
+                            s.resize(n)
+                self.truncated_dump = True
+                return dump_path
+            elif last_dumped_rev != -1:
+                raise Exception(('Last dumped subversion revision (%s) is '
+                                 'lesser than the last one loaded into the '
+                                 'archive (%s).') % (last_dumped_rev,
+                                                     last_loaded_svn_rev))
+
+        raise Exception('An error occured when running rsvndump and '
+                        'no exploitable dump file has been generated.')
+
+    def prepare(self, *, svn_url, destination_path=None,
+                swh_revision=None, start_from_scratch=False, **kwargs):
+        """Dump and mount the remote repository before loading it.
+
+        The repository at self.svn_url is dumped; svn_url, when not
+        set, is replaced by the url of the locally mounted repository.
+        """
+        # First, check if previous revisions have been loaded for the
+        # subversion origin and get the number of the last one
+        last_loaded_svn_rev = self.get_last_loaded_svn_rev(self.svn_url)
+        # Then generate a dump file of the svn revisions to load (may raise)
+        dump_path = self.dump_svn_revisions(self.svn_url, last_loaded_svn_rev)
+        # Finally, mount the dump and load the repository
+        _, self.repo_path = init_svn_repo_from_dump(
+            dump_path, prefix=TEMPORARY_DIR_PREFIX_PATTERN,
+            suffix='-%s' % os.getpid(), root_dir=self.temp_dir)
+        if not svn_url:
+            svn_url = 'file://%s' % self.repo_path
+        super().prepare(
+            svn_url=svn_url, destination_path=destination_path,
+            swh_revision=swh_revision,
+            start_from_scratch=start_from_scratch, **kwargs)
+
+    def cleanup(self):
+        """Run the parent cleanup, then remove the temporary directory."""
+        super().cleanup()
+        if self.temp_dir and os.path.exists(self.temp_dir):
+            shutil.rmtree(self.temp_dir)
+
+    def visit_status(self):
+        """Report a partial visit when a truncated dump was loaded."""
+        if self.truncated_dump:
+            return 'partial'
+        return super().visit_status()
diff --git a/swh/loader/svn/tasks.py b/swh/loader/svn/tasks.py
--- a/swh/loader/svn/tasks.py
+++ b/swh/loader/svn/tasks.py
@@ -3,9 +3,12 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+
from swh.scheduler.task import Task
-from .loader import SvnLoader, SvnLoaderFromDumpArchive
+from .loader import (
+ SvnLoader, SvnLoaderFromDumpArchive, SvnLoaderFromRemoteDump
+)
class LoadSvnRepository(Task):
@@ -67,3 +70,26 @@
visit_date=visit_date,
archive_path=archive_path,
start_from_scratch=start_from_scratch)
+
+
+class DumpMountAndLoadSvnRepository(Task):
+    """
+    Dump a remote repository with the rsvndump tool, mount it
+    locally then load it into the Software Heritage archive.
+    """
+    task_queue = 'swh_loader_svn_dump_mount_and_load'
+
+    def run_task(self, *, svn_url, origin_url=None, visit_date=None,
+                 start_from_scratch=False):
+        """Dump the remote svn repository, mount the dump and load it.
+
+        The remote svn_url is handed to the loader constructor while
+        load() is called with svn_url=None. NOTE(review): confirm that
+        origin_url does not need to default to svn_url here.
+        """
+        loader = SvnLoaderFromRemoteDump(svn_url)
+        loader.log = self.log
+        return loader.load(svn_url=None,
+                           origin_url=origin_url,
+                           visit_date=visit_date,
+                           start_from_scratch=start_from_scratch)
diff --git a/swh/loader/svn/utils.py b/swh/loader/svn/utils.py
--- a/swh/loader/svn/utils.py
+++ b/swh/loader/svn/utils.py
@@ -33,9 +33,9 @@
return ts
-def init_svn_repo_from_archive_dump(archive_path, prefix=None, suffix=None,
- root_dir='/tmp'):
- """Given a path to an archive containing an svn dump.
+def init_svn_repo_from_dump(dump_path, prefix=None, suffix=None,
+ root_dir='/tmp', gzip=False):
+ """Given a path to a svn dump.
Initialize an svn repository with the content of said dump.
Returns:
@@ -49,7 +49,7 @@
and load the dump.
"""
- project_name = os.path.basename(os.path.dirname(archive_path))
+ project_name = os.path.basename(os.path.dirname(dump_path))
temp_dir = tempfile.mkdtemp(prefix=prefix, suffix=suffix, dir=root_dir)
try:
@@ -63,7 +63,11 @@
'Failed to initialize empty svn repo for %s' %
project_name)
- with Popen(['gzip', '-dc', archive_path], stdout=PIPE) as dump:
+ read_dump_cmd = ['cat', dump_path]
+ if gzip:
+ read_dump_cmd = ['gzip', '-dc', dump_path]
+
+ with Popen(read_dump_cmd, stdout=PIPE) as dump:
cmd = ['svnadmin', 'load', '-q', repo_path]
r = call(cmd, stdin=dump.stdout)
if r != 0:
@@ -74,3 +78,23 @@
except Exception as e:
shutil.rmtree(temp_dir)
raise e
+
+
+def init_svn_repo_from_archive_dump(archive_path, prefix=None, suffix=None,
+                                    root_dir='/tmp'):
+    """Mount the svn dump stored in a gzip-compressed archive.
+
+    Thin wrapper around init_svn_repo_from_dump with gzip enabled.
+
+    Returns:
+        A tuple:
+            - temporary folder (str): containing the mounted repository
+            - repo_path (str): path to the mounted repository inside the
+              temporary folder
+
+    Raises:
+        ValueError in case of failure to run the command to uncompress
+        and load the dump.
+    """
+    return init_svn_repo_from_dump(archive_path, prefix, suffix, root_dir,
+                                   gzip=True)

File Metadata

Mime Type
text/plain
Expires
Sun, Aug 17, 9:56 PM (3 h, 39 m ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3214521

Event Timeline