class SvnLoaderFromRemoteDump(SvnLoader):
    """Load a remote subversion repository through a local dump.

    A dump of the remote repository is created with the ``svnrdump``
    utility, mounted locally with ``svnadmin load`` and the resulting
    local repository is then loaded by the base svn loader.
    """
    def __init__(self):
        super().__init__()
        # Working directory holding both the dump file and the mounted
        # repository; it is removed as a whole in cleanup()
        self.temp_dir = tempfile.mkdtemp(dir=self.temp_directory)
        self.repo_path = None
        # Set to True when svnrdump failed but a usable part of the
        # history was dumped; drives the 'partial' visit status
        self.truncated_dump = False

    def get_last_loaded_svn_rev(self, svn_url):
        """Return the number of the last svn revision already loaded.

        Args:
            svn_url (str): url of the subversion origin

        Returns:
            int: the svn revision number recorded in the latest snapshot
            revision of the origin, or -1 if it cannot be determined
            (e.g. the origin has never been visited).
        """
        try:
            origin = \
                self.storage.origin_get({'type': 'svn', 'url': svn_url})
            last_swh_rev = \
                self.swh_latest_snapshot_revision(origin['id'])['revision']
            last_swh_rev_headers = \
                dict(last_swh_rev['metadata']['extra_headers'])
            return int(last_swh_rev_headers['svn_revision'])
        except Exception as e:
            # Deliberately best-effort: any failure means "nothing loaded
            # yet", but keep a trace of the reason for debugging purposes
            self.log.debug('Unable to get the last loaded svn revision '
                           'for %s: %r', svn_url, e)
            return -1

    @staticmethod
    def _last_dumped_revision(stderr):
        """Return the last revision number found in the svnrdump stderr
        output (bytes), scanning from its end, or -1 if there is none."""
        for line in reversed((stderr or b'').split(b'\n')):
            matched_rev = re.search(b'.*revision ([0-9]+)', line)
            if matched_rev:
                return int(matched_rev.group(1))
        return -1

    def dump_svn_revisions(self, svn_url, last_loaded_svn_rev=-1):
        """Generate a subversion dump file using the svnrdump tool.

        If the svnrdump command failed somehow, the produced dump file
        is analyzed to determine if a partial loading is still feasible.

        Args:
            svn_url (str): url of the remote repository to dump
            last_loaded_svn_rev (int): last svn revision already loaded
                into the archive, -1 if none

        Returns:
            str: path to the generated dump file

        Raises:
            Exception: if svnrdump failed and the dump file contains no
                revision newer than ``last_loaded_svn_rev``.
        """
        # Build the svnrdump command line
        svnrdump_cmd = ['svnrdump', 'dump', svn_url]

        # Launch the svnrdump command while capturing stderr as
        # successfully dumped revision numbers are printed to it
        dump_temp_dir = tempfile.mkdtemp(dir=self.temp_dir)
        dump_name = ''.join(c for c in svn_url if c.isalnum())
        dump_path = os.path.join(dump_temp_dir, '%s.svndump' % dump_name)
        self.log.debug('Executing %s', ' '.join(svnrdump_cmd))
        with open(dump_path, 'wb') as dump_file:
            svnrdump = run(svnrdump_cmd, stdout=dump_file, stderr=PIPE)

        if svnrdump.returncode == 0:
            return dump_path

        # There was an error but it does not mean that no revisions
        # can be loaded.

        # Look for the latest dumped revision number. The stderr output is
        # scanned backwards because error messages may follow the last
        # line reporting a dumped revision.
        last_dumped_rev = self._last_dumped_revision(svnrdump.stderr)

        # Check if revisions inside the dump file can be loaded anyway
        if last_dumped_rev > last_loaded_svn_rev:
            self.log.debug('svnrdump did not dump all expected revisions '
                           'but revisions range %s:%s are available in '
                           'the generated dump file and will be loaded '
                           'into the archive.',
                           last_loaded_svn_rev+1, last_dumped_rev)
            # Truncate the dump file after the last successfully dumped
            # revision to avoid the loading of corrupted data
            self.log.debug('Truncating dump file after the last '
                           'successfully dumped revision (%s) to avoid '
                           'the loading of corrupted data',
                           last_dumped_rev)

            pattern = ('Revision-number: %s' %
                       (last_dumped_rev+1)).encode()
            with open(dump_path, 'r+b') as f:
                # The memory map is only used to search the (possibly
                # huge) file; the truncation is done on the file object
                # once the map is closed as mmap.resize is not available
                # on every platform
                with mmap(f.fileno(), 0, access=ACCESS_WRITE) as s:
                    n = s.rfind(pattern)
                if n != -1:
                    f.truncate(n)
            self.truncated_dump = True
            return dump_path

        if last_dumped_rev != -1:
            raise Exception(('Last dumped subversion revision (%s) is '
                             'lesser than the last one loaded into the '
                             'archive (%s).') % (last_dumped_rev,
                                                 last_loaded_svn_rev))

        raise Exception('An error occured when running svnrdump and '
                        'no exploitable dump file has been generated.')

    def prepare(self, *, svn_url, destination_path=None,
                swh_revision=None, start_from_scratch=False, **kwargs):
        """Dump the remote repository, mount the dump locally and prepare
        the base loader to work on the mounted repository.

        Raises:
            Exception: if no exploitable dump file could be generated.
        """
        # First, check if previous revisions have been loaded for the
        # subversion origin and get the number of the last one
        last_loaded_svn_rev = self.get_last_loaded_svn_rev(svn_url)

        # Then try to generate a dump file containing relevant svn revisions
        # to load, an exception will be thrown if something wrong happened
        dump_path = self.dump_svn_revisions(svn_url, last_loaded_svn_rev)

        # Finally, mount the dump and load the repository
        self.log.debug('Mounting dump file with "svnadmin load".')
        _, self.repo_path = \
            init_svn_repo_from_dump(dump_path,
                                    prefix=TEMPORARY_DIR_PREFIX_PATTERN,
                                    suffix='-%s' % os.getpid(),
                                    root_dir=self.temp_dir)
        super().prepare(svn_url='file://%s' % self.repo_path,
                        destination_path=destination_path,
                        swh_revision=swh_revision,
                        start_from_scratch=start_from_scratch,
                        **kwargs)

    def cleanup(self):
        """Run the base loader cleanup then remove the working directory
        holding the dump file and the mounted repository."""
        try:
            super().cleanup()
        finally:
            # Remove the working directory even if the base cleanup failed
            if self.temp_dir and os.path.exists(self.temp_dir):
                shutil.rmtree(self.temp_dir)

    def visit_status(self):
        """Return 'partial' when only a truncated dump could be loaded,
        the status computed by the base loader otherwise."""
        if self.truncated_dump:
            return 'partial'
        else:
            return super().visit_status()
class DumpMountAndLoadSvnRepository(Task):
    """Task loading a remote subversion repository into the Software
    Heritage archive by means of the SvnLoaderFromRemoteDump loader
    (dump created with svnrdump, mounted locally, then loaded).

    """
    task_queue = 'swh_loader_svn_dump_mount_and_load'

    def run_task(self, *, svn_url, origin_url=None, visit_date=None,
                 start_from_scratch=False):
        """Instantiate a SvnLoaderFromRemoteDump sharing the logger of
        this task and return the result of its load method called with
        the provided keyword arguments.

        """
        remote_dump_loader = SvnLoaderFromRemoteDump()
        remote_dump_loader.log = self.log
        load_kwargs = {
            'svn_url': svn_url,
            'origin_url': origin_url,
            'visit_date': visit_date,
            'start_from_scratch': start_from_scratch,
        }
        return remote_dump_loader.load(**load_kwargs)
# NOTE(review): this test class reuses the name of the loader class
# imported from swh.loader.svn.loader, so it rebinds that name in this
# module once defined; consider renaming it -- confirm nothing below
# relies on the imported class.
class SvnLoaderFromRemoteDump(BaseSvnLoaderTest):
    """
    Ensure the remote svn dump loader produces the same objects
    as the base svn loader for the same repository.
    """
    def setUp(self):
        super().setUp(archive_name='pkg-gourmet.tgz')

    @istest
    def load(self):
        """
        Load the same repository with both loaders and compare
        the contents, directories, revisions and snapshots they produced
        """
        remote_dump_loader = SvnLoaderFromRemoteDumpNoStorage()
        remote_dump_loader.load(svn_url=self.svn_mirror_url)

        reference_loader = SvnLoaderNoStorage()
        reference_loader.load(svn_url=self.svn_mirror_url)

        # Same comparison order as the objects are produced
        compared_attributes = ('all_contents', 'all_directories',
                               'all_revisions', 'all_snapshots')
        for attribute in compared_attributes:
            self.assertEqual(getattr(remote_dump_loader, attribute),
                             getattr(reference_loader, attribute))
def init_svn_repo_from_archive_dump(archive_path, prefix=None, suffix=None,
                                    root_dir='/tmp'):
    """Initialize an svn repository from a gzip-compressed svn dump.

    Thin wrapper delegating to init_svn_repo_from_dump with gzip=True,
    so that the dump is decompressed before being loaded.

    Args:
        archive_path (str): path to the archive containing the svn dump
        prefix (str): forwarded as is to init_svn_repo_from_dump
        suffix (str): forwarded as is to init_svn_repo_from_dump
        root_dir (str): forwarded as is to init_svn_repo_from_dump

    Returns:
        The value returned by init_svn_repo_from_dump, documented as
        a tuple:
            - temporary folder (str): containing the mounted repository
            - repo_path (str): path to the mounted repository inside the
                               temporary folder

    Raises:
        ValueError in case of failure to run the command to uncompress
        and load the dump.

    """
    mount_options = {
        'prefix': prefix,
        'suffix': suffix,
        'root_dir': root_dir,
        'gzip': True,
    }
    return init_svn_repo_from_dump(archive_path, **mount_options)