diff --git a/debian/control b/debian/control
--- a/debian/control
+++ b/debian/control
@@ -29,6 +29,7 @@
          python3-swh.scheduler (>= 0.0.14~),
          python3-swh.storage (>= 0.0.83~),
          subversion,
+         rsvndump,
          ${misc:Depends},
          ${python3:Depends}
 Description: Software Heritage Loader Svn
diff --git a/swh/loader/svn/loader.py b/swh/loader/svn/loader.py
--- a/swh/loader/svn/loader.py
+++ b/swh/loader/svn/loader.py
@@ -9,9 +9,13 @@
 """
 
 import os
+import re
 import shutil
 import tempfile
 
+from mmap import mmap, ACCESS_WRITE
+from subprocess import run, PIPE
+
 from swh.model import hashutil
 from swh.model.from_disk import Directory
 from swh.model.identifiers import identifier_to_bytes, revision_identifier
@@ -20,7 +24,9 @@
 from swh.loader.core.utils import clean_dangling_folders
 
 from . import svn, converters
-from .utils import init_svn_repo_from_archive_dump
+from .utils import (
+    init_svn_repo_from_dump, init_svn_repo_from_archive_dump
+)
 from .exception import SvnLoaderUneventful
 from .exception import SvnLoaderHistoryAltered
 
@@ -571,3 +577,155 @@
             self.temp_dir, os.path.basename(self.repo_path))
         self.log.debug(msg)
         shutil.rmtree(self.temp_dir)
+
+
+class SvnLoaderFromRemoteDump(SvnLoader):
+    """
+    Create a subversion repository dump using the rsvndump utility,
+    mount it locally and load the repository from it.
+    """
+    def __init__(self, svn_url):
+        super().__init__()
+        self.svn_url = svn_url
+        self.temp_dir = tempfile.mkdtemp(dir=self.temp_directory)
+        self.repo_path = None
+        self.truncated_dump = False
+
+    def get_last_loaded_svn_rev(self, svn_url):
+        """
+        Check if the svn repository has already been visited
+        and return the last loaded svn revision number or -1
+        otherwise.
+        """
+        last_loaded_svn_rev = -1
+        try:
+            origin = \
+                self.storage.origin_get({'type': 'svn', 'url': svn_url})
+            last_swh_rev = \
+                self.swh_latest_snapshot_revision(origin['id'])['revision']
+            last_swh_rev_headers = \
+                dict(last_swh_rev['metadata']['extra_headers'])
+            last_loaded_svn_rev = int(last_swh_rev_headers['svn_revision'])
+        except Exception:
+            pass
+        return last_loaded_svn_rev
+
+    def dump_svn_revisions(self, svn_url, last_loaded_svn_rev=-1):
+        """
+        Generate a subversion dump file using the rsvndump tool.
+        If previous revisions have already been loaded, the dump
+        file will only contain the new ones (empty revisions will
+        be created in the dump file to pad them and keep the same
+        revision numbering). If the rsvndump command failed somehow,
+        the produced dump file is analyzed to determine if a partial
+        loading is still feasible.
+        """
+        # Build the rsvndump command line
+        rsvndump_cmd = ['rsvndump']
+        if last_loaded_svn_rev >= 0:
+            # Previous revisions have already been loaded, so dump
+            # only those we are interested in while padding already loaded
+            # ones to keep the original revision numbers in the dump file
+            rsvndump_cmd += ['-r', '%s:HEAD' % last_loaded_svn_rev,
+                             '--keep-revnums']
+        # Use deltas to get a smaller dump file
+        rsvndump_cmd += ['--deltas', svn_url]
+
+        # Launch the rsvndump command while capturing stderr as
+        # successfully dumped revision numbers are printed to it
+        dump_temp_dir = tempfile.mkdtemp(dir=self.temp_dir)
+        dump_name = ''.join(c for c in svn_url if c.isalnum())
+        dump_path = '%s/%s.svndump' % (dump_temp_dir, dump_name)
+        self.log.debug('Executing %s' % ' '.join(rsvndump_cmd))
+        with open(dump_path, 'wb') as dump_file:
+            rsvndump = run(rsvndump_cmd, stdout=dump_file, stderr=PIPE)
+
+        if rsvndump.returncode == 0:
+            return dump_path
+
+        # There was an error but it does not mean that no revisions
+        # can be loaded.
+
+        # Get the stderr line with latest dumped revision
+        stderr_lines = rsvndump.stderr.split(b'\n')
+        last_dumped_rev = None
+        if len(stderr_lines) > 1:
+            last_dumped_rev = stderr_lines[-2]
+
+        if last_dumped_rev:
+            # Get the latest dumped revision number
+            matched_rev = re.search(b'.*revision ([0-9]+)', last_dumped_rev)
+            last_dumped_rev = int(matched_rev.group(1)) if matched_rev else -1
+            # Check if revisions inside the dump file can be loaded anyway
+            if last_dumped_rev > last_loaded_svn_rev:
+                self.log.debug(('rsvndump did not dump all expected revisions '
+                                'but revisions range %s:%s are available in '
+                                'the generated dump file and will be loaded '
+                                'into the archive.') % (last_loaded_svn_rev+1,
+                                                        last_dumped_rev))
+                # Truncate the dump file after the last successfully dumped
+                # revision to avoid the loading of corrupted data
+                self.log.debug(('Truncating dump file after the last '
+                                'successfully dumped revision (%s) to avoid '
+                                'the loading of corrupted data')
+                               % last_dumped_rev)
+
+                with open(dump_path, 'r+b') as f:
+                    with mmap(f.fileno(), 0, access=ACCESS_WRITE) as s:
+                        pattern = ('Revision-number: %s' %
+                                   (last_dumped_rev+1)).encode()
+                        n = s.rfind(pattern)
+                        if n != -1:
+                            s.resize(n)
+                self.truncated_dump = True
+                return dump_path
+            elif last_dumped_rev != -1:
+                raise Exception(('Last dumped subversion revision (%s) is '
+                                 'lesser than the last one loaded into the '
+                                 'archive (%s).') % (last_dumped_rev,
+                                                     last_loaded_svn_rev))
+
+        raise Exception('An error occured when running rsvndump and '
+                        'no exploitable dump file has been generated.')
+
+    def prepare(self, *, svn_url, destination_path=None,
+                swh_revision=None, start_from_scratch=False, **kwargs):
+        """
+        Dump the remote repository with rsvndump, mount the dump as a
+        local repository, then delegate to ``SvnLoader.prepare``. When
+        ``svn_url`` is empty, the mounted repository url is used instead.
+        """
+        # First, check if previous revisions have been loaded for the
+        # subversion origin and get the number of the last one
+        last_loaded_svn_rev = self.get_last_loaded_svn_rev(self.svn_url)
+
+        # Then try to generate a dump file containing relevant svn revisions
+        # to load, an exception will be thrown if something wrong happened
+        dump_path = self.dump_svn_revisions(self.svn_url, last_loaded_svn_rev)
+
+        # Finally, mount the dump and load the repository
+        _, self.repo_path = \
+            init_svn_repo_from_dump(dump_path,
+                                    prefix=TEMPORARY_DIR_PREFIX_PATTERN,
+                                    suffix='-%s' % os.getpid(),
+                                    root_dir=self.temp_dir)
+        if not svn_url:
+            svn_url = 'file://%s' % self.repo_path
+        super().prepare(svn_url=svn_url,
+                        destination_path=destination_path,
+                        swh_revision=swh_revision,
+                        start_from_scratch=start_from_scratch,
+                        **kwargs)
+
+    def cleanup(self):
+        """Run the parent cleanup then remove the temporary directory."""
+        super().cleanup()
+        if self.temp_dir and os.path.exists(self.temp_dir):
+            shutil.rmtree(self.temp_dir)
+
+    def visit_status(self):
+        """Return 'partial' if the dump file had to be truncated."""
+        if self.truncated_dump:
+            return 'partial'
+        else:
+            return super().visit_status()
diff --git a/swh/loader/svn/tasks.py b/swh/loader/svn/tasks.py
--- a/swh/loader/svn/tasks.py
+++ b/swh/loader/svn/tasks.py
@@ -3,9 +3,12 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+
 from swh.scheduler.task import Task
 
-from .loader import SvnLoader, SvnLoaderFromDumpArchive
+from .loader import (
+    SvnLoader, SvnLoaderFromDumpArchive, SvnLoaderFromRemoteDump
+)
 
 
 class LoadSvnRepository(Task):
@@ -67,3 +70,26 @@
             visit_date=visit_date,
             archive_path=archive_path,
             start_from_scratch=start_from_scratch)
+
+
+class DumpMountAndLoadSvnRepository(Task):
+    """
+    Create a dump of a remote repository through the rsvndump
+    tool, mount it locally then load the repository into the
+    Software Heritage archive.
+    """
+    task_queue = 'swh_loader_svn_dump_mount_and_load'
+
+    def run_task(self, *, svn_url, origin_url=None, visit_date=None,
+                 start_from_scratch=False):
+        """1. Dump the remote svn repository with rsvndump and mount it.
+        2. Load it through the svn loader.
+        3. Clean up the mounted svn repository.
+
+        """
+        loader = SvnLoaderFromRemoteDump(svn_url)
+        loader.log = self.log
+        return loader.load(svn_url=None,
+                           origin_url=origin_url,
+                           visit_date=visit_date,
+                           start_from_scratch=start_from_scratch)
diff --git a/swh/loader/svn/utils.py b/swh/loader/svn/utils.py
--- a/swh/loader/svn/utils.py
+++ b/swh/loader/svn/utils.py
@@ -33,9 +33,9 @@
     return ts
 
 
-def init_svn_repo_from_archive_dump(archive_path, prefix=None, suffix=None,
-                                    root_dir='/tmp'):
-    """Given a path to an archive containing an svn dump.
+def init_svn_repo_from_dump(dump_path, prefix=None, suffix=None,
+                            root_dir='/tmp', gzip=False):
+    """Given a path to a svn dump.
     Initialize an svn repository with the content of said dump.
 
     Returns:
@@ -49,7 +49,7 @@
         and load the dump.
 
     """
-    project_name = os.path.basename(os.path.dirname(archive_path))
+    project_name = os.path.basename(os.path.dirname(dump_path))
     temp_dir = tempfile.mkdtemp(prefix=prefix, suffix=suffix, dir=root_dir)
 
     try:
@@ -63,7 +63,11 @@
                 'Failed to initialize empty svn repo for %s' %
                 project_name)
 
-        with Popen(['gzip', '-dc', archive_path], stdout=PIPE) as dump:
+        read_dump_cmd = ['cat', dump_path]
+        if gzip:
+            read_dump_cmd = ['gzip', '-dc', dump_path]
+
+        with Popen(read_dump_cmd, stdout=PIPE) as dump:
             cmd = ['svnadmin', 'load', '-q', repo_path]
             r = call(cmd, stdin=dump.stdout)
             if r != 0:
@@ -74,3 +78,23 @@
     except Exception as e:
         shutil.rmtree(temp_dir)
         raise e
+
+
+def init_svn_repo_from_archive_dump(archive_path, prefix=None, suffix=None,
+                                    root_dir='/tmp'):
+    """Given a path to an archive containing an svn dump.
+    Initialize an svn repository with the content of said dump.
+
+    Returns:
+        A tuple:
+        - temporary folder (str): containing the mounted repository
+        - repo_path (str): path to the mounted repository inside the
+          temporary folder
+
+    Raises:
+        ValueError in case of failure to run the command to uncompress
+        and load the dump.
+
+    """
+    return init_svn_repo_from_dump(archive_path, prefix=prefix, suffix=suffix,
+                                   root_dir=root_dir, gzip=True)