Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/svn/loader.py
# Copyright (C) 2015-2018 The Software Heritage developers | # Copyright (C) 2015-2018 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
"""Loader in charge of injecting either new or existing svn mirrors to | """Loader in charge of injecting either new or existing svn mirrors to | ||||
swh-storage. | swh-storage. | ||||
""" | """ | ||||
import mmap | |||||
import os | import os | ||||
import re | |||||
import shutil | import shutil | ||||
import tempfile | import tempfile | ||||
from subprocess import run, PIPE | |||||
from swh.core import utils | from swh.core import utils | ||||
from swh.model import hashutil | from swh.model import hashutil | ||||
from swh.model.from_disk import Directory | from swh.model.from_disk import Directory | ||||
from swh.model.identifiers import identifier_to_bytes, revision_identifier | from swh.model.identifiers import identifier_to_bytes, revision_identifier | ||||
from swh.model.identifiers import snapshot_identifier | from swh.model.identifiers import snapshot_identifier | ||||
from swh.loader.core.loader import SWHLoader | from swh.loader.core.loader import SWHLoader | ||||
from swh.loader.core.utils import clean_dangling_folders | from swh.loader.core.utils import clean_dangling_folders | ||||
from . import svn, converters | from . import svn, converters | ||||
from .utils import init_svn_repo_from_archive_dump | from .utils import ( | ||||
init_svn_repo_from_dump, init_svn_repo_from_archive_dump | |||||
) | |||||
from .exception import SvnLoaderEventful, SvnLoaderUneventful | from .exception import SvnLoaderEventful, SvnLoaderUneventful | ||||
from .exception import SvnLoaderHistoryAltered | from .exception import SvnLoaderHistoryAltered | ||||
DEFAULT_BRANCH = b'master' | DEFAULT_BRANCH = b'master' | ||||
def _revision_id(revision): | def _revision_id(revision): | ||||
Show All 40 Lines | class SWHSvnLoader(SWHLoader): | ||||
def __init__(self): | def __init__(self): | ||||
super().__init__(logging_class='swh.loader.svn.SvnLoader') | super().__init__(logging_class='swh.loader.svn.SvnLoader') | ||||
self.check_revision = self.config['check_revision'] | self.check_revision = self.config['check_revision'] | ||||
self.origin_id = None | self.origin_id = None | ||||
self.debug = self.config['debug'] | self.debug = self.config['debug'] | ||||
self.last_seen_revision = None | self.last_seen_revision = None | ||||
self.temp_directory = self.config['temp_directory'] | self.temp_directory = self.config['temp_directory'] | ||||
self.svnrepo = None | |||||
def pre_cleanup(self): | def pre_cleanup(self): | ||||
"""Cleanup potential dangling files from prior runs (e.g. OOM killed | """Cleanup potential dangling files from prior runs (e.g. OOM killed | ||||
tasks) | tasks) | ||||
""" | """ | ||||
clean_dangling_folders(self.temp_directory, | clean_dangling_folders(self.temp_directory, | ||||
pattern_check=TEMPORARY_DIR_PREFIX_PATTERN, | pattern_check=TEMPORARY_DIR_PREFIX_PATTERN, | ||||
log=self.log) | log=self.log) | ||||
def cleanup(self): | def cleanup(self): | ||||
"""Clean up the svn repository's working representation on disk. | """Clean up the svn repository's working representation on disk. | ||||
""" | """ | ||||
if not self.svnrepo: | |||||
return | |||||
if self.debug: | if self.debug: | ||||
self.log.error('''NOT FOR PRODUCTION - debug flag activated | self.log.error('''NOT FOR PRODUCTION - debug flag activated | ||||
Local repository not cleaned up for investigation: %s''' % ( | Local repository not cleaned up for investigation: %s''' % ( | ||||
self.svnrepo.local_url.decode('utf-8'), )) | self.svnrepo.local_url.decode('utf-8'), )) | ||||
return | return | ||||
self.svnrepo.clean_fs() | self.svnrepo.clean_fs() | ||||
def swh_revision_hash_tree_at_svn_revision(self, revision): | def swh_revision_hash_tree_at_svn_revision(self, revision): | ||||
▲ Show 20 Lines • Show All 433 Lines • ▼ Show 20 Lines | class SWHSvnLoaderFromDumpArchive(SWHSvnLoader): | ||||
def cleanup(self): | def cleanup(self): | ||||
super().cleanup() | super().cleanup() | ||||
if self.temp_dir and os.path.exists(self.temp_dir): | if self.temp_dir and os.path.exists(self.temp_dir): | ||||
msg = 'Clean up temporary directory dump %s for project %s' % ( | msg = 'Clean up temporary directory dump %s for project %s' % ( | ||||
self.temp_dir, os.path.basename(self.repo_path)) | self.temp_dir, os.path.basename(self.repo_path)) | ||||
self.log.debug(msg) | self.log.debug(msg) | ||||
shutil.rmtree(self.temp_dir) | shutil.rmtree(self.temp_dir) | ||||
class SWHSvnLoaderFromRemoteDump(SWHSvnLoaderFromDumpArchive): | |||||
""" | |||||
Create a subversion repository dump using the rsvndump utility, | |||||
mount it locally and load the repository from it. | |||||
""" | |||||
def __init__(self, origin_url): | |||||
super(SWHSvnLoaderFromDumpArchive, self).__init__() | |||||
self.dump_temp_dir = None | |||||
self.temp_dir = None | |||||
self.repo_path = None | |||||
self.dump_error = None | |||||
self.partial_dump = False | |||||
# First, check if previous revisions have been loaded for the | |||||
# subversion origin and get the number of the last one | |||||
last_loaded_svn_rev = -1 | |||||
try: | |||||
origin = \ | |||||
self.storage.origin_get({'type': 'svn', 'url': origin_url}) | |||||
last_swh_rev = \ | |||||
self.swh_latest_snapshot_revision(origin['id'])['revision'] | |||||
last_swh_rev_headers = \ | |||||
dict(last_swh_rev['metadata']['extra_headers']) | |||||
last_loaded_svn_rev = int(last_swh_rev_headers['svn_revision']) | |||||
except Exception: | |||||
pass | |||||
# Build the rsvndump command line | |||||
rsvndump_cmd = ['rsvndump'] | |||||
if last_loaded_svn_rev >= 0: | |||||
# Previous revisions have already been loaded, so dump | |||||
# only those we are interested in while padding already loaded | |||||
# ones to keep the original revision numbers in the dump file | |||||
rsvndump_cmd += ['-r', '%s:HEAD' % last_loaded_svn_rev, | |||||
'--keep-revnums'] | |||||
# Use deltas to get a smaller dump file | |||||
rsvndump_cmd += ['--deltas', origin_url] | |||||
# Launch the rsvndump command while capturing stderr as | |||||
# successfully dumped revision numbers are printed to it | |||||
self.dump_temp_dir = tempfile.mkdtemp() | |||||
dump_name = ''.join(c for c in origin_url if c.isalnum()) | |||||
dump_path = '%s/%s.svndump.gz' % (self.dump_temp_dir, dump_name) | |||||
self.log.debug('Executing %s' % ' '.join(rsvndump_cmd)) | |||||
with open(dump_path, 'wb') as dump_file: | |||||
rsvndump = run(rsvndump_cmd, stdout=dump_file, stderr=PIPE) | |||||
# Get the stderr line with latest dumped revision | |||||
stderr_lines = rsvndump.stderr.split(b'\n') | |||||
last_dumped_rev = None | |||||
if len(stderr_lines) > 1: | |||||
last_dumped_rev = stderr_lines[-2] | |||||
can_load_dump = False | |||||
# No errors, dump file can be mounted and loaded immediately | |||||
if rsvndump.returncode == 0: | |||||
can_load_dump = True | |||||
# There was an error but it does not mean that no revisions | |||||
# can be loaded. | |||||
elif last_dumped_rev: | |||||
# Get the latest dumped revision number | |||||
last_dumped_rev = int(re.search(b'.*revision ([0-9]+)', | |||||
last_dumped_rev).group(1)) | |||||
# Check if revisions inside the dump file can be loaded anyway | |||||
if last_dumped_rev > last_loaded_svn_rev: | |||||
self.log.debug(('rsvndump did not dump all expected revisions ' | |||||
'but revisions range %s:%s are available in ' | |||||
'the generated dump file and will be loaded ' | |||||
'into the archive.') % (last_loaded_svn_rev+1, | |||||
last_dumped_rev)) | |||||
# Truncate the dump file after the last successfully dumped | |||||
# revision to avoid the loading of corrupted data | |||||
f = open(dump_path, 'r+b') | |||||
s = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_WRITE) | |||||
pattern = ('Revision-number: %s' % | |||||
(last_dumped_rev+1)).encode() | |||||
n = s.find(pattern) | |||||
if n != -1: | |||||
s.resize(n) | |||||
s.close() | |||||
f.close() | |||||
can_load_dump = True | |||||
self.partial_dump = True | |||||
else: | |||||
self.dump_error = ('Last dumped subversion revision (%s) is ' | |||||
'lesser than the last one loaded into the ' | |||||
'archive (%s).') % (last_dumped_rev, | |||||
last_loaded_svn_rev) | |||||
else: | |||||
self.dump_error = ('An error occured when running rsvndump and ' | |||||
'no exploitable dump file has been generated.') | |||||
if can_load_dump: | |||||
self.temp_dir, self.repo_path = \ | |||||
init_svn_repo_from_dump(dump_path, | |||||
prefix=TEMPORARY_DIR_PREFIX_PATTERN, | |||||
suffix='-%s' % os.getpid(), | |||||
root_dir=self.temp_directory) | |||||
def prepare(self, *args, **kwargs): | |||||
if self.dump_error: | |||||
raise Exception(self.dump_error) | |||||
else: | |||||
super().prepare(*args, **kwargs) | |||||
def cleanup(self): | |||||
super().cleanup() | |||||
if self.dump_temp_dir and os.path.exists(self.dump_temp_dir): | |||||
shutil.rmtree(self.dump_temp_dir) | |||||
def visit_status(self): | |||||
if self.partial_dump: | |||||
return 'partial' | |||||
else: | |||||
return super().visit_status() |