def svn_author_to_swh_person(author):
    """Convert an svn author to an swh person.

    Default policy: no information is added.

    Args:
        author (str or bytes): the svn author. ``str`` values are encoded
            as UTF-8, ``bytes`` values are used as is. May be empty or
            None.

    Returns:
        dict: a dictionary with keys:

        - fullname (bytes): the author as provided, ``b''`` when there is
          no author
        - name (bytes or None): the name parsed out of a
          ``Name <email>`` author, the whole author otherwise, None when
          there is no author
        - email (bytes or None): the email parsed out of a
          ``Name <email>`` author, None otherwise

    """
    if not author:
        return {'fullname': b'', 'name': None, 'email': None}

    # The author may already be bytes, in which case there is nothing to
    # encode (bytes has no .encode method).
    if isinstance(author, str):
        author = author.encode('utf-8')

    if b'<' in author and b'>' in author:
        name, email = utils.parseaddr(author.decode('utf-8'))
        return {
            'fullname': author,
            'name': name.encode('utf-8'),
            'email': email.encode('utf-8')
        }

    return {'fullname': author, 'email': None, 'name': author}


def svn_author_to_gitsvn_person(author, repo_uuid):
    """Convert an svn author to a person suitable for insertion.

    Default policy: if no email is found, the email is created using the
    author and the repo_uuid.

    Args:
        author (str or bytes): the svn author. ``str`` values are encoded
            as UTF-8, ``bytes`` values are used as is. When empty or
            None, ``'(no author)'`` is used instead.
        repo_uuid (bytes): the repository's uuid

    Returns:
        dict: a dictionary with keys:

        - fullname (bytes): the author itself when it already has the
          ``Name <email>`` form, ``author <author@repo_uuid>`` otherwise
        - name (bytes): the parsed name, or the whole author
        - email (bytes): the parsed email, or ``author@repo_uuid``

    """
    if not author:
        author = '(no author)'

    # Same guard as svn_author_to_swh_person: only str needs encoding,
    # calling .encode on bytes would raise AttributeError.
    if isinstance(author, str):
        author = author.encode('utf-8')

    if b'<' in author and b'>' in author:
        name, email = utils.parseaddr(author.decode('utf-8'))
        return {
            'fullname': author,
            'name': name.encode('utf-8'),
            'email': email.encode('utf-8')
        }

    # we'll construct the author's fullname the same way git svn does
    # 'user <user@repo_uuid>'
    email = b'@'.join([author, repo_uuid])
    return {
        'fullname': b''.join([author, b' ', b'<', email, b'>']),
        'name': author,
        'email': email,
    }
def build_swh_occurrence(revision_id, origin_id, visit):
    """Build a swh occurrence from the revision id, origin id and visit.

    Args:
        revision_id: identifier of the revision, stored as the
            occurrence's 'target'
        origin_id: identifier of the origin, stored as 'origin'
        visit: value stored under the occurrence's 'visit' key

    Returns:
        dict: the occurrence, whose 'branch' is always 'master' and whose
        'target_type' is always 'revision'.

    """
    return {
        'branch': 'master',
        'target': revision_id,
        'target_type': 'revision',
        'origin': origin_id,
        'visit': visit,
    }
def __init__(self, remote_url, origin_id, storage,
             destination_path=None):
    """Open the connections to the remote repository and prepare the
    local temporary working directory.

    Args:
        remote_url (str): url of the svn repository; trailing '/' are
            stripped
        origin_id: identifier of the swh origin, kept as is
        storage: swh storage instance, kept as is
        destination_path (str): optional root directory under which the
            temporary working directory is created. It is created when
            missing. Defaults to the platform's temporary directory.

    """
    self.remote_url = remote_url.rstrip('/')
    self.storage = storage
    self.origin_id = origin_id

    if destination_path:
        os.makedirs(destination_path, exist_ok=True)
        self.root_dir = destination_path
    else:
        # Ask the standard library rather than hard-coding '/tmp', so
        # that TMPDIR and non-POSIX platforms are honoured.
        self.root_dir = tempfile.gettempdir()

    auth = Auth([get_username_provider()])
    # one connection for log iteration
    self.conn_log = RemoteAccess(self.remote_url, auth=auth)
    # another for replay
    self.conn = RemoteAccess(self.remote_url, auth=auth)
    # one client for update operation
    self.client = client.Client(auth=auth)

    # Query the remote before touching the disk: if the call fails, no
    # temporary directory is left behind.
    self.uuid = self.conn.get_uuid().encode('utf-8')

    self.local_dirname = tempfile.mkdtemp(
        suffix='.swh.loader',
        prefix='tmp.',
        dir=self.root_dir)

    local_name = os.path.basename(self.remote_url)
    self.local_url = os.path.join(
        self.local_dirname, local_name).encode('utf-8')
def logs(self, revision_start, revision_end):
    """Stream svn log entries between revision_start and revision_end.

    The entries are read lazily from the log connection
    (``self.conn_log``) without discovering the changed paths, and each
    one is converted with ``__to_entry`` before being yielded.

    Args:
        revision_start: the svn revision starting bound
        revision_end: the svn revision ending bound

    Yields:
        dict: one converted log entry at a time, with the following
        keys:

        - rev: the revision number
        - author_date: date of the commit
        - author_name: name of the author
        - message: commit message

    """
    log_entries = self.conn_log.iter_log(
        paths=None,
        start=revision_start,
        end=revision_end,
        discover_changed_paths=False)

    for log_entry in log_entries:
        yield self.__to_entry(log_entry)
def swh_hash_data_per_revision(self, start_revision, end_revision):
    """Compute swh hash data per each revision between start_revision
    and end_revision.

    Args:
        start_revision: starting revision
        end_revision: ending revision

    Yields:
        tuple (rev, nextrev, commit, hashes)

        - rev: current revision, read from the commit's 'rev' key
        - nextrev: rev + 1, or None when rev is end_revision
        - commit: the log entry yielded by ``self.logs`` for that
          revision
        - hashes: what ``self.swhreplay.compute_hashes(rev)`` returned
          (presumably the swh hash data per path -- confirm against
          the replay implementation)

    """
    for commit in self.logs(start_revision, end_revision):
        rev = commit['rev']
        hashes = self.swhreplay.compute_hashes(rev)

        # The last revision of the range has no successor.
        nextrev = None if rev == end_revision else rev + 1

        yield rev, nextrev, commit, hashes
class SWHSvnRepo(BaseSvnRepo):
    """Same as :class:`BaseSvnRepo` except for:

    - the commit message, which is encoded to bytes when needed
    - the commit author, which is converted into an swh person
    - the commit date, which is converted into an swh date

    """
    def __init__(self, remote_url, origin_id, storage,
                 destination_path=None):
        """Initialize the base repository, then the replay collaborator
        working on the replay connection and the local url.

        """
        super().__init__(remote_url, origin_id, storage,
                         destination_path=destination_path)
        self.swhreplay = ra.SWHReplay(
            conn=self.conn,
            rootpath=self.local_url)

    def convert_commit_message(self, msg):
        """Simply encode the commit message.

        Args:
            msg (str or bytes): the commit message to convert.

        Returns:
            bytes: the message unchanged when it is already bytes, its
            UTF-8 encoding otherwise.

        """
        if isinstance(msg, bytes):
            return msg
        return msg.encode('utf-8')

    def convert_commit_date(self, date):
        """Convert the message commit date into a timestamp in swh format.

        The conversion is delegated to
        ``converters.svn_date_to_swh_date``.

        Args:
            date (str): the commit date to convert.

        Returns:
            The transformed date.

        """
        return converters.svn_date_to_swh_date(date)

    def convert_commit_author(self, author):
        """Convert the commit author into an swh person.

        The conversion is delegated to
        ``converters.svn_author_to_swh_person``, which returns a
        dictionary with the keys ``fullname``, ``name`` and ``email``.

        Args:
            author (str or bytes): the commit author to convert.

        Returns:
            The transformed author as dict.

        """
        return converters.svn_author_to_swh_person(author)