# Copyright (C) 2015 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os
import shutil
import subprocess
import tempfile
from contextlib import contextmanager

import pysvn
from pysvn import Revision, opt_revision_kind
from retrying import retry

from swh.model import git

@contextmanager
def cwd(path):
    """Contextmanager to temporarily change the current working directory to
    `path`, then restore the previous working directory on exit.

    """
    prev_cwd = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(prev_cwd)
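

# Illustrative sketch (not used by the loader): how the cwd contextmanager is
# meant to be used. The path below is a placeholder.
def _example_cwd_usage():
    with cwd('/tmp'):
        # Inside the block the current working directory is /tmp; the
        # previous working directory is restored when the block exits.
        return os.getcwd()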

def init_repo(remote_repo_url, destination_path=None):
    """Initialize a repository representation without any svn action on
    disk. As a side effect, a temporary folder may be created on disk if
    destination_path is not provided.

    Args:
        remote_repo_url: the remote svn url
        destination_path: optional local parent folder to checkout the
            repository to

    Returns:
        Dictionary with the following keys:
            - client: client instance to manipulate the repository
            - remote_url: remote url (same as input)
            - local_url: local url which has been computed

    """
name = os.path.basename(remote_repo_url)
if destination_path:
os.makedirs(destination_path, exist_ok=True)
local_dirname = destination_path
else:
local_dirname = tempfile.mkdtemp(suffix='.swh.loader',
prefix='tmp.',
dir='/tmp')
local_repo_url = os.path.join(local_dirname, name)
client = pysvn.Client()
return {'client': client,
'remote_url': remote_repo_url,
'local_url': local_repo_url}
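

# Illustrative sketch (not used by the loader): init_repo only prepares a
# pysvn client and computes the local checkout path; nothing is fetched from
# the remote repository yet. The url below is a placeholder.
def _example_init_repo():
    repo = init_repo('http://svn.example.org/repos/project')
    # repo['client'] is a pysvn.Client instance and repo['local_url'] points
    # inside a freshly created temporary directory.
    return repo['client'], repo['local_url']
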
class SvnRepo():
"""Swh representation of a svn repository.
"""
def __init__(self, remote_url, origin_id, storage, local_url=None):
self.remote_url = remote_url
self.storage = storage
self.origin_id = origin_id
r = init_repo(remote_url, local_url)
self.client = r['client']
self.local_url = r['local_url']
self.uuid = None
def __str__(self):
return str({'remote_url': self.remote_url,
'local_url': self.local_url,
'uuid': self.uuid,
'swh-origin': self.origin_id})
    def read_uuid(self):
        """Read the repository's uuid from the local working copy.

        """
        with cwd(self.local_url):
            # Keep only the value of the 'Repository UUID:' line printed by
            # `svn info`.
            cmd = 'svn info | grep UUID | cut -f2 -d:'
            uuid = subprocess.check_output(cmd, shell=True)
            return uuid.strip().decode('utf-8')
    @retry(stop_max_attempt_number=3)
    def checkout(self, revision):
        """Checkout the repository's working copy at the given revision.

        Args:
            revision: the revision number to checkout the repo to.

        """
        self.client.checkout(
            self.remote_url,
            self.local_url,
            revision=Revision(opt_revision_kind.number, revision))
    def fork(self, svn_revision=None):
        """Checkout the remote repository to a local working copy (at
        revision 1 if no svn revision is specified).

        This also updates the repository's uuid.

        """
self.checkout(1 if not svn_revision else svn_revision)
self.uuid = self.read_uuid()
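
    # Illustrative sketch (not executed): a typical sequence is to fork the
    # working copy before reading any revision information, e.g.:
    #
    #   repo = SvnRepo('http://svn.example.org/repos/project', origin_id,
    #                  storage)
    #   repo.fork()  # checkout at revision 1 and read the uuid
    #
    # The url is a placeholder; origin_id and storage are assumed to be
    # provided by the loader driving this class.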
def head_revision(self):
"""Retrieve current revision of the repository's working copy.
"""
head_rev = Revision(opt_revision_kind.head)
info = self.client.info2(self.local_url,
revision=head_rev,
recurse=False)
return info[0][1]['rev'].number
    def initial_revision(self):
        """Retrieve the initial revision at which the remote url appeared.

        Note: This should always be 1 since we do not deal with urls
        pointing below the repository root.

        """
return self.client.log(self.remote_url)[-1].data.get(
'revision').number
    def logs(self, revision_start, revision_end, block_size=100):
        """Stream svn logs between revision_start and revision_end in chunks
        of block_size logs.

        Yields the revision number and the associated log information for
        each revision between revision_start and revision_end.

        Args:
            revision_start: the svn revision starting bound
            revision_end: the svn revision ending bound
            block_size: number of revisions fetched per chunk

        Yields:
            tuple (rev, log) where rev is the revision number and log is a
            dictionary with the following keys:
                - author_date: date of the commit
                - author_name: name of the author
                - message: commit message
                - changed_paths: paths changed by the revision, with their
                  action (A(dd), M(odified), D(eleted))

        """
r1 = revision_start
r2 = r1 + block_size - 1
done = False
if r2 >= revision_end:
r2 = revision_end
done = True
rev_start = Revision(opt_revision_kind.number, r1)
rev_end = Revision(opt_revision_kind.number, r2)
for log_entry in self.client.log(url_or_path=self.local_url,
revision_start=rev_start,
revision_end=rev_end,
discover_changed_paths=True):
author_date = log_entry.date
author = log_entry.author
message = log_entry.message
rev = log_entry.revision.number
# Determine the changed paths
changed_paths = []
for paths in log_entry.changed_paths:
path = os.path.join(self.local_url, paths.path.lstrip('/'))
changed_paths.append({
'path': path.encode('utf-8'),
'action': paths.action # A(dd), M(odified), D(eleted)
})
# # determine the full diff between (rev - 1) and rev
# diff = self.client.diff(url_or_path=self.local_url,
# tmp_path='/tmp',
# url_or_path2=self.local_url,
# revision1=Revision(
# opt_revision_kind.number, rev-1),
# revision2=Revision(
# opt_revision_kind.number, rev),
# ignore_content_type=True)
yield rev, {
'author_date': author_date if author_date else '',
'author_name': author if author else '',
'message': message if message else '',
'changed_paths': changed_paths,
# 'diff': diff
}
if not done:
yield from self.logs(r2 + 1, revision_end, block_size)
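
    # Illustrative sketch (not executed): consuming the logs generator, for
    # instance to collect the commit messages between two revisions:
    #
    #   messages = {}
    #   for rev, log in self.logs(1, head):
    #       messages[rev] = log['message']
    #
    # where `head` would typically come from self.head_revision().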
    def swh_previous_revision(self):
        """Look for a possible previous swh revision for this origin.

        Returns:
            The previous swh revision if found, None otherwise.

        """
storage = self.storage
occ = storage.occurrence_get(self.origin_id)
if occ:
revision_id = occ[0]['target']
revisions = storage.revision_get([revision_id])
if revisions:
return revisions[0]
    def swh_hash_data_per_revision(self, start_revision, end_revision):
        """Compute swh hash data for each revision between start_revision and
        end_revision.

        Args:
            start_revision: starting revision
            end_revision: ending revision

        Yields:
            tuple (rev, nextrev, commit, objects_per_path)
            - rev: current revision
            - nextrev: next revision, or None if rev is end_revision
            - commit: commit data (author, date, message) for that revision
            - objects_per_path: dictionary of path, swh hash data with type

        """
        def ignore_svn_folder(dirpath):
            # Directory filter: keep only directories whose path does not
            # contain the administrative '.svn' folder.
            return b'.svn' not in dirpath
local_url = self.local_url.encode('utf-8')
for rev, commit in self.logs(start_revision, end_revision):
# checkout to the revision rev
self.checkout(revision=rev)
if rev == start_revision: # first time we walk the complete tree
# compute git commit
objects_per_path = git.walk_and_compute_sha1_from_directory(
local_url, dir_ok_fn=ignore_svn_folder)
else: # then we update only what needs to be
objects_per_path = git.update_checksums_from(
commit['changed_paths'],
objects_per_path,
dir_ok_fn=ignore_svn_folder)
if rev == end_revision:
nextrev = None
else:
nextrev = rev + 1
yield rev, nextrev, commit, objects_per_path
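
    # Illustrative sketch (not executed): each yielded tuple can be turned
    # into a swh revision by the loader, e.g.:
    #
    #   for rev, nextrev, commit, objects in \
    #           self.swh_hash_data_per_revision(1, head):
    #       ...  # build the swh revision for rev from commit and objects
    #
    # nextrev is None when rev is the last requested revision.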
def cleanup(self):
"""Clean up the local url checkout.
"""
shutil.rmtree(self.local_url)
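

# Illustrative end-to-end sketch (not used by the loader): fetch per-revision
# hash data for a repository, then clean up the working copy. The url is a
# placeholder; origin_id and storage are assumed to come from the swh loader.
def _example_load(origin_id, storage):
    repo = SvnRepo('http://svn.example.org/repos/project', origin_id, storage)
    repo.fork()  # checkout the working copy at revision 1, read the uuid
    head = repo.head_revision()
    try:
        for rev, nextrev, commit, objects in \
                repo.swh_hash_data_per_revision(1, head):
            pass  # the loader would build and store a swh revision here
    finally:
        repo.cleanup()  # remove the local working copy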
