Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/svn/loader.py
# Copyright (C) 2015-2018 The Software Heritage developers | # Copyright (C) 2015-2019 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
"""Loader in charge of injecting either new or existing svn mirrors to | """Loader in charge of injecting either new or existing svn mirrors to | ||||
swh-storage. | swh-storage. | ||||
""" | """ | ||||
▲ Show 20 Lines • Show All 63 Lines • ▼ Show 20 Lines | ADDITIONAL_CONFIG = { | ||||
'check_revision': ('dict', { | 'check_revision': ('dict', { | ||||
'status': False, # do we check the revision? | 'status': False, # do we check the revision? | ||||
'limit': 1000, # at which pace do we check it? | 'limit': 1000, # at which pace do we check it? | ||||
}), | }), | ||||
} | } | ||||
visit_type = 'svn' | visit_type = 'svn' | ||||
def __init__(self, url, origin_url=None, visit_date=None,
             destination_path=None, swh_revision=None,
             start_from_scratch=False):
    """Initialise the loader state for one svn repository.

    Args:
        url: technical svn uri used to act on the svn repository.
        origin_url: unique identifier of the origin in the swh
            archive; when falsy, ``url`` is used instead.
        visit_date: kept as-is on ``self.visit_date``.
        destination_path: kept as-is on ``self.destination_path``.
        swh_revision: kept as-is on ``self.swh_revision``.
        start_from_scratch: kept as-is on ``self.start_from_scratch``.

    """
    super().__init__(logging_class='swh.loader.svn.SvnLoader')

    # Where to read from (svn_url) versus how the origin is identified
    # in the archive (origin_url).
    self.svn_url = url
    self.origin_url = origin_url or url

    # Values read from the loader configuration.
    self.debug = self.config['debug']
    self.last_seen_revision = None
    self.temp_directory = self.config['temp_directory']
    self.done = False
    self.svnrepo = None

    # The revision check is optional: keep the configured pace when it
    # is enabled, None otherwise.
    revision_check = self.config['check_revision']
    self.check_revision = (
        revision_check['limit'] if revision_check['status'] else None)

    # Internal state used to store swh objects.
    self._contents, self._directories, self._revisions = [], [], []
    self._snapshot = None
    self._last_revision = None
    self._visit_status = 'full'
    self._load_status = 'uneventful'

    # Remaining caller-provided parameters.
    self.visit_date = visit_date
    self.destination_path = destination_path
    self.start_from_scratch = start_from_scratch
    self.swh_revision = swh_revision
def pre_cleanup(self): | def pre_cleanup(self): | ||||
"""Cleanup potential dangling files from prior runs (e.g. OOM killed | """Cleanup potential dangling files from prior runs (e.g. OOM killed | ||||
tasks) | tasks) | ||||
""" | """ | ||||
clean_dangling_folders(self.temp_directory, | clean_dangling_folders(self.temp_directory, | ||||
pattern_check=TEMPORARY_DIR_PREFIX_PATTERN, | pattern_check=TEMPORARY_DIR_PREFIX_PATTERN, | ||||
▲ Show 20 Lines • Show All 328 Lines • ▼ Show 20 Lines | def process_svn_revisions(self, svnrepo, revision_start, revision_end, | ||||
if self.check_revision: | if self.check_revision: | ||||
self._check_revision_divergence(count, rev, dir_id) | self._check_revision_divergence(count, rev, dir_id) | ||||
if nextrev: | if nextrev: | ||||
revision_parents[nextrev] = [swh_revision['id']] | revision_parents[nextrev] = [swh_revision['id']] | ||||
yield _contents, _directories, swh_revision | yield _contents, _directories, swh_revision | ||||
def prepare_origin_visit(self, *args, **kwargs):
    """Build ``self.origin`` for the visit about to start.

    The origin url is ``self.origin_url`` when it is set and
    ``self.svn_url`` otherwise.  Positional and keyword arguments are
    accepted but not used.

    """
    url = self.origin_url or self.svn_url
    self.origin = {'url': url}
def prepare(self, *, svn_url, destination_path=None, | def prepare(self, *args, **kwargs): | ||||
swh_revision=None, start_from_scratch=False, **kwargs): | if self.swh_revision: | ||||
self.start_from_scratch = start_from_scratch | self.last_known_swh_revision = self.swh_revision | ||||
if swh_revision: | |||||
self.last_known_swh_revision = swh_revision | |||||
else: | else: | ||||
self.last_known_swh_revision = None | self.last_known_swh_revision = None | ||||
self.latest_snapshot = self.swh_latest_snapshot_revision( | self.latest_snapshot = self.swh_latest_snapshot_revision( | ||||
self.origin_url, self.last_known_swh_revision) | self.origin_url, self.last_known_swh_revision) | ||||
if destination_path: | if self.destination_path: | ||||
local_dirname = destination_path | local_dirname = self.destination_path | ||||
else: | else: | ||||
local_dirname = tempfile.mkdtemp( | local_dirname = tempfile.mkdtemp( | ||||
suffix='-%s' % os.getpid(), | suffix='-%s' % os.getpid(), | ||||
prefix=TEMPORARY_DIR_PREFIX_PATTERN, | prefix=TEMPORARY_DIR_PREFIX_PATTERN, | ||||
dir=self.temp_directory) | dir=self.temp_directory) | ||||
self.svnrepo = self.get_svn_repo( | self.svnrepo = self.get_svn_repo( | ||||
svn_url, local_dirname, self.origin_url) | self.svn_url, local_dirname, self.origin_url) | ||||
try: | try: | ||||
revision_start, revision_end, revision_parents = self.start_from( | revision_start, revision_end, revision_parents = self.start_from( | ||||
self.last_known_swh_revision, self.start_from_scratch) | self.last_known_swh_revision, self.start_from_scratch) | ||||
self.swh_revision_gen = self.process_svn_revisions( | self.swh_revision_gen = self.process_svn_revisions( | ||||
self.svnrepo, revision_start, revision_end, revision_parents) | self.svnrepo, revision_start, revision_end, revision_parents) | ||||
except SvnLoaderUneventful as e: | except SvnLoaderUneventful as e: | ||||
self.log.warning(e) | self.log.warning(e) | ||||
if self.latest_snapshot and 'snapshot' in self.latest_snapshot: | if self.latest_snapshot and 'snapshot' in self.latest_snapshot: | ||||
▲ Show 20 Lines • Show All 95 Lines • ▼ Show 20 Lines | def visit_status(self): | ||||
return self._visit_status | return self._visit_status | ||||
class SvnLoaderFromDumpArchive(SvnLoader):
    """Uncompress an archive containing an svn dump, mount the svn dump as
    an svn repository and load said repository.

    """
    def __init__(self, url, archive_path,
                 origin_url=None, destination_path=None,
                 swh_revision=None, start_from_scratch=None,
                 visit_date=None):
        """Initialise the loader.

        Args:
            url: forwarded to the parent constructor as the svn url.
            archive_path: path of the archive holding the svn dump.
            origin_url: forwarded to the parent constructor.
            destination_path: forwarded to the parent constructor.
            swh_revision: forwarded to the parent constructor.
            start_from_scratch: forwarded to the parent constructor
                after being normalised to a boolean.
            visit_date: forwarded to the parent constructor.

        """
        # The sibling loaders default this flag to False; normalise the
        # None default here so the parent always receives a boolean.
        super().__init__(url,
                         origin_url=origin_url,
                         destination_path=destination_path,
                         swh_revision=swh_revision,
                         start_from_scratch=bool(start_from_scratch),
                         visit_date=visit_date)
        self.archive_path = archive_path
        # Both are filled in by prepare() once the archive is mounted.
        self.temp_dir = None
        self.repo_path = None

    def prepare(self, *args, **kwargs):
        """Mount the archive dump as a local repository, then delegate to
        the parent ``prepare``.

        """
        self.log.info('Archive to mount and load %s', self.archive_path)
        self.temp_dir, self.repo_path = init_svn_repo_from_archive_dump(
            self.archive_path,
            prefix=TEMPORARY_DIR_PREFIX_PATTERN,
            suffix='-%s' % os.getpid(),
            root_dir=self.temp_directory)
        # Point the loader at the repository that was just mounted;
        # without this the parent would act on the url given to the
        # constructor and the mounted dump would never be read.
        self.svn_url = 'file://%s' % self.repo_path
        super().prepare(*args, **kwargs)

    def cleanup(self):
        """Run the parent cleanup, then remove the temporary directory
        created for the dump if it still exists.

        """
        super().cleanup()
        if self.temp_dir and os.path.exists(self.temp_dir):
            self.log.debug(
                'Clean up temporary directory dump %s for project %s',
                self.temp_dir, os.path.basename(self.repo_path))
            shutil.rmtree(self.temp_dir)
class SvnLoaderFromRemoteDump(SvnLoader): | class SvnLoaderFromRemoteDump(SvnLoader): | ||||
""" | """ | ||||
Create a subversion repository dump using the svnrdump utility, | Create a subversion repository dump using the svnrdump utility, | ||||
mount it locally and load the repository from it. | mount it locally and load the repository from it. | ||||
""" | """ | ||||
def __init__(self, url, origin_url=None, destination_path=None,
             swh_revision=None, start_from_scratch=False, visit_date=None):
    """Initialise the loader and create its working directory.

    All arguments are forwarded unchanged to the parent constructor.  A
    fresh temporary directory is created under ``self.temp_directory``
    and kept on ``self.temp_dir``.

    """
    parent_options = {
        'origin_url': origin_url,
        'destination_path': destination_path,
        'swh_revision': swh_revision,
        'start_from_scratch': start_from_scratch,
        'visit_date': visit_date,
    }
    super().__init__(url, **parent_options)
    self.temp_dir = tempfile.mkdtemp(dir=self.temp_directory)
    # Set by prepare() once the dump has been mounted.
    self.repo_path = None
    # Read back by visit_status().
    self.truncated_dump = False
def get_last_loaded_svn_rev(self, svn_url):
    """Check if the svn repository has already been visited and return
    the last loaded svn revision number.

    Args:
        svn_url: url used to look the origin up in the storage.

    Returns:
        int: value of the ``svn_revision`` extra header of the revision
        found in the latest snapshot, or -1 when it cannot be
        determined (lookup failure of any kind).

    """
    try:
        origin = self.storage.origin_get({'url': svn_url})
        last_swh_rev = \
            self.swh_latest_snapshot_revision(origin['url'])['revision']
        last_swh_rev_headers = \
            dict(last_swh_rev['metadata']['extra_headers'])
        return int(last_swh_rev_headers['svn_revision'])
    except Exception:
        # Deliberately best effort: any failure means "nothing usable
        # loaded yet".  Record the cause instead of dropping it so an
        # unexpected full reload can be diagnosed.
        self.log.debug(
            'Unable to find the last loaded svn revision for %s',
            svn_url, exc_info=True)
        return -1
▲ Show 20 Lines • Show All 71 Lines • ▼ Show 20 Lines | def dump_svn_revisions(self, svn_url, last_loaded_svn_rev=-1): | ||||
raise Exception(('Last dumped subversion revision (%s) is ' | raise Exception(('Last dumped subversion revision (%s) is ' | ||||
'lesser than the last one loaded into the ' | 'lesser than the last one loaded into the ' | ||||
'archive (%s).') % (last_dumped_rev, | 'archive (%s).') % (last_dumped_rev, | ||||
last_loaded_svn_rev)) | last_loaded_svn_rev)) | ||||
raise Exception('An error occurred when running svnrdump and ' | raise Exception('An error occurred when running svnrdump and ' | ||||
'no exploitable dump file has been generated.') | 'no exploitable dump file has been generated.') | ||||
def prepare(self, *args, **kwargs):
    """Dump the remote repository, mount the dump locally and delegate
    to the parent ``prepare`` on the mounted repository.

    ``self.svn_url`` is replaced by the ``file://`` url of the mounted
    repository before delegating.

    """
    # First, check if previous revisions have been loaded for the
    # subversion origin and get the number of the last one.  The origin
    # is recorded under its origin url, which may differ from the
    # technical svn url, so the lookup has to use the former.
    last_loaded_svn_rev = self.get_last_loaded_svn_rev(
        self.origin_url or self.svn_url)

    # Then try to generate a dump file containing relevant svn revisions
    # to load, an exception will be thrown if something wrong happened
    dump_path = self.dump_svn_revisions(self.svn_url, last_loaded_svn_rev)

    # Finally, mount the dump and load the repository
    self.log.debug('Mounting dump file with "svnadmin load".')
    _, self.repo_path = \
        init_svn_repo_from_dump(dump_path,
                                prefix=TEMPORARY_DIR_PREFIX_PATTERN,
                                suffix='-%s' % os.getpid(),
                                root_dir=self.temp_dir)
    self.svn_url = 'file://%s' % self.repo_path
    super().prepare(*args, **kwargs)
def cleanup(self):
    """Run the parent cleanup, then delete the working directory held
    in ``self.temp_dir`` when it is set and still present on disk.

    """
    super().cleanup()
    dump_dir = self.temp_dir
    if not dump_dir or not os.path.exists(dump_dir):
        return
    shutil.rmtree(dump_dir)
def visit_status(self):
    """Return ``'partial'`` when ``self.truncated_dump`` is set, and the
    parent's visit status otherwise.

    """
    if not self.truncated_dump:
        return super().visit_status()
    return 'partial'