diff --git a/requirements-swh.txt b/requirements-swh.txt index 8b084ee..3aa90d1 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,4 +1,4 @@ swh.storage >= 0.0.163 -swh.model >= 0.0.59 +swh.model >= 0.0.60 swh.scheduler >= 0.0.39 swh.loader.core >= 0.0.80 diff --git a/swh/loader/svn/converters.py b/swh/loader/svn/converters.py index 32069ee..f94e75a 100644 --- a/swh/loader/svn/converters.py +++ b/swh/loader/svn/converters.py @@ -1,107 +1,89 @@ # Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from typing import Any, Dict, List, Optional, Union - -from email import utils +from typing import Any, Dict, List, Optional from swh.model.model import ( Person, Revision, RevisionType, TimestampWithTimezone ) -from swh.loader.package.utils import EMPTY_AUTHOR from .utils import strdate_to_timestamp def svn_date_to_swh_date( strdate: Optional[str]) -> TimestampWithTimezone: """Convert a string date to an swh one. Args: strdate: A string formatted for .utils.strdate_to_timestamp to do its jobs Returns: An swh date format """ return TimestampWithTimezone( timestamp=strdate_to_timestamp(strdate), offset=0, negative_utc=False, ) -def svn_author_to_swh_person(author: Union[str, bytes]) -> Person: +def svn_author_to_swh_person(author: Optional[bytes]) -> Person: """Convert an svn author to an swh person. Default policy: No information is added. Args: - author (string): the svn author (in bytes) + author: the svn author (in bytes) Returns: a Person """ - # TODO: Align this function and move it up as library helper function - if not author: - return EMPTY_AUTHOR - - if isinstance(author, str): - author = author.encode('utf-8') - - if b'<' in author and b'>' in author: - name, email = utils.parseaddr(author.decode('utf-8')) - return Person( - fullname=author, - name=name.encode('utf-8'), - email=email.encode('utf-8') - ) - - return Person(fullname=author, name=author, email=None) + return Person.from_fullname(author or b'') def build_swh_revision( rev: int, commit: Dict, repo_uuid: str, dir_id: bytes, parents: List[bytes]) -> Revision: """Given a svn revision, build a swh revision. This adds an ['metadata']['extra-headers'] entry with the repository's uuid and the svn revision. Args: - rev: the svn revision number - commit: the commit metadata - repo_uuid: The repository's uuid - dir_id: the tree's hash identifier - parents: the revision's parents identifier Returns: The swh revision dictionary. """ author = commit['author_name'] msg = commit['message'] date = commit['author_date'] metadata: Dict[str, Any] = { 'extra_headers': [ ['svn_repo_uuid', repo_uuid], ['svn_revision', str(rev).encode('utf-8')] ] } return Revision( type=RevisionType.SUBVERSION, date=date, committer_date=date, directory=dir_id, message=msg, author=author, committer=author, synthetic=True, metadata=metadata, parents=parents, ) diff --git a/swh/loader/svn/svn.py b/swh/loader/svn/svn.py index 60877e7..7d0d16a 100644 --- a/swh/loader/svn/svn.py +++ b/swh/loader/svn/svn.py @@ -1,269 +1,261 @@ # Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """SVN client in charge of iterating over svn logs and yield commit representations including the hash tree/content computations per svn commit. """ import logging import os import tempfile import shutil from subvertpy.ra import RemoteAccess, Auth, get_username_provider from subvertpy import client, properties from swh.model.from_disk import Directory from . import ra, converters # When log message contains empty data DEFAULT_AUTHOR_MESSAGE = '' class SvnRepo: """Svn repository representation. Args: remote_url (str): origin_url (str): Associated origin identifier local_dirname (str): Path to write intermediary svn action results """ def __init__(self, remote_url, origin_url, local_dirname, max_content_length): self.remote_url = remote_url.rstrip('/') self.origin_url = origin_url auth = Auth([get_username_provider()]) # one connection for log iteration self.conn_log = RemoteAccess(self.remote_url, auth=auth) # another for replay self.conn = RemoteAccess(self.remote_url, auth=auth) # one client for update operation self.client = client.Client(auth=auth) self.local_dirname = local_dirname local_name = os.path.basename(self.remote_url) self.local_url = os.path.join(self.local_dirname, local_name).encode( 'utf-8') self.uuid = self.conn.get_uuid().encode('utf-8') self.swhreplay = ra.Replay(conn=self.conn, rootpath=self.local_url) self.max_content_length = max_content_length def __str__(self): return str({ 'swh-origin': self.origin_url, 'remote_url': self.remote_url, 'local_url': self.local_url, 'uuid': self.uuid, }) def head_revision(self): """Retrieve current head revision. """ return self.conn.get_latest_revnum() def initial_revision(self): """Retrieve the initial revision from which the remote url appeared. """ return 1 def convert_commit_message(self, msg): """Simply encode the commit message. Args: msg (str): the commit message to convert. Returns: The transformed message as bytes. """ if isinstance(msg, bytes): return msg return msg.encode('utf-8') def convert_commit_date(self, date): """Convert the message commit date into a timestamp in swh format. The precision is kept. Args: date (str): the commit date to convert. Returns: The transformed date. """ return converters.svn_date_to_swh_date(date) def convert_commit_author(self, author): """Convert the commit author into an swh person. - The user becomes a dictionary of the form:: - - { - name: author, - email: '', - fullname: author - } - Args: author (str): the commit author to convert. Returns: - The transformed author as dict. + Person: a model object """ return converters.svn_author_to_swh_person(author) def __to_entry(self, log_entry): changed_paths, rev, revprops, has_children = log_entry author_date = self.convert_commit_date( revprops.get(properties.PROP_REVISION_DATE)) author = self.convert_commit_author( revprops.get(properties.PROP_REVISION_AUTHOR)) message = self.convert_commit_message( revprops.get(properties.PROP_REVISION_LOG, DEFAULT_AUTHOR_MESSAGE)) return { 'rev': rev, 'author_date': author_date, 'author_name': author, 'message': message, } def logs(self, revision_start, revision_end): """Stream svn logs between revision_start and revision_end by chunks of block_size logs. Yields revision and associated revision information between the revision start and revision_end. Args: revision_start: the svn revision starting bound revision_end: the svn revision ending bound Yields: tuple: tuple of revisions and logs: - revisions: list of revisions in order - logs: Dictionary with key revision number and value the log entry. The log entry is a dictionary with the following keys: - author_date: date of the commit - author_name: name of the author - message: commit message """ for log_entry in self.conn_log.iter_log(paths=None, start=revision_start, end=revision_end, discover_changed_paths=False): yield self.__to_entry(log_entry) def export(self, revision): """Export the repository to a given version. """ self.client.export(self.remote_url, to=self.local_url.decode('utf-8'), rev=revision, ignore_keywords=True) def export_temporary(self, revision): """Export the repository to a given revision in a temporary location. This is up to the caller of this function to clean up the temporary location when done (cf. self.clean_fs method) Args: revision: Revision to export at Returns: The tuple local_dirname the temporary location root folder, local_url where the repository was exported. """ local_dirname = tempfile.mkdtemp( prefix='check-revision-%s.' % revision, dir=self.local_dirname) local_name = os.path.basename(self.remote_url) local_url = os.path.join(local_dirname, local_name) self.client.export( self.remote_url, to=local_url, rev=revision, ignore_keywords=True) return local_dirname, os.fsencode(local_url) def swh_hash_data_per_revision(self, start_revision, end_revision): """Compute swh hash data per each revision between start_revision and end_revision. Args: start_revision: starting revision end_revision: ending revision Yields: tuple (rev, nextrev, commit, objects_per_path) - rev: current revision - nextrev: next revision - commit: commit data (author, date, message) for such revision - objects_per_path: dictionary of path, swh hash data with type """ for commit in self.logs(start_revision, end_revision): rev = commit['rev'] objects = self.swhreplay.compute_objects(rev) if rev == end_revision: nextrev = None else: nextrev = rev + 1 yield rev, nextrev, commit, objects, self.swhreplay.directory def swh_hash_data_at_revision(self, revision): """Compute the hash data at revision. Expected to be used for update only. """ # Update the disk at revision self.export(revision) # Compute the current hashes on disk directory = Directory.from_disk( path=os.fsencode(self.local_url), max_content_length=self.max_content_length) # Update the replay collaborator with the right state self.swhreplay = ra.Replay( conn=self.conn, rootpath=self.local_url, directory=directory) # Retrieve the commit information for revision commit = list(self.logs(revision, revision))[0] yield revision, revision + 1, commit, {}, directory def clean_fs(self, local_dirname=None): """Clean up the local working copy. Args: local_dirname (str): Path to remove recursively if provided. Otherwise, remove the temporary upper root tree used for svn repository loading. """ dirname = local_dirname if local_dirname else self.local_dirname if os.path.exists(dirname): logging.debug('cleanup %s' % dirname) shutil.rmtree(dirname) diff --git a/swh/loader/svn/tests/test_converters.py b/swh/loader/svn/tests/test_converters.py index 8ee82e8..bd55b24 100644 --- a/swh/loader/svn/tests/test_converters.py +++ b/swh/loader/svn/tests/test_converters.py @@ -1,137 +1,137 @@ # Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.model.hashutil import hash_to_bytes from swh.model.model import ( Person, Revision, Timestamp, TimestampWithTimezone ) from swh.loader.svn import converters def test_svn_author_to_swh_person(): """The author should have name, email and fullname filled. """ actual_person = converters.svn_author_to_swh_person( - 'tony ') + b'tony ') assert actual_person == Person.from_dict({ 'fullname': b'tony ', 'name': b'tony', 'email': b'ynot@dagobah', }) def test_svn_author_to_swh_person_no_email(): """The author and fullname should be the same as the input (author). """ - actual_person = converters.svn_author_to_swh_person('tony') + actual_person = converters.svn_author_to_swh_person(b'tony') assert actual_person == Person.from_dict({ 'fullname': b'tony', 'name': b'tony', 'email': None, }) def test_svn_author_to_swh_person_empty_person(): """Empty person has only its fullname filled with the empty byte-string. """ - actual_person = converters.svn_author_to_swh_person('') + actual_person = converters.svn_author_to_swh_person(b'') assert actual_person == Person.from_dict({ 'fullname': b'', 'name': None, 'email': None, }) def test_build_swh_revision_default(): """This should build the swh revision with the swh revision's extra headers about the repository. """ dir_id = hash_to_bytes('d6e08e19159f77983242877c373c75222d5ae9dd') date = TimestampWithTimezone( timestamp=Timestamp(seconds=1088108379, microseconds=0), offset=0, negative_utc=False, ) actual_rev = converters.build_swh_revision( repo_uuid=b'uuid', dir_id=dir_id, commit={ 'author_name': Person( name=b'theo', email=b'theo@uuid', fullname=b'theo ' ), 'message': b'commit message', 'author_date': date, }, rev=10, parents=[]) expected_rev = Revision.from_dict({ 'date': date.to_dict(), 'committer_date': date.to_dict(), 'type': 'svn', 'directory': dir_id, 'message': b'commit message', 'author': { 'name': b'theo', 'email': b'theo@uuid', 'fullname': b'theo ' }, 'committer': { 'name': b'theo', 'email': b'theo@uuid', 'fullname': b'theo ' }, 'synthetic': True, 'metadata': { 'extra_headers': [ ['svn_repo_uuid', b'uuid'], ['svn_revision', b'10'], ] }, 'parents': [], }) assert actual_rev == expected_rev def test_svn_date_to_swh_date(): """The timestamp should not be tampered with and include the decimals. """ assert converters.svn_date_to_swh_date('2011-05-31T06:04:39.500900Z') == \ TimestampWithTimezone( timestamp=Timestamp(seconds=1306821879, microseconds=500900), offset=0, negative_utc=False, ) assert converters.svn_date_to_swh_date('2011-05-31T06:04:39.800722Z') == \ TimestampWithTimezone( timestamp=Timestamp(seconds=1306821879, microseconds=800722), offset=0, negative_utc=False, ) def test_svn_date_to_swh_date_epoch(): """Empty date should be EPOCH (timestamp and offset at 0).""" # It should return 0, epoch default_tstz = TimestampWithTimezone( timestamp=Timestamp(seconds=0, microseconds=0), offset=0, negative_utc=False, ) assert converters.svn_date_to_swh_date('') == default_tstz assert converters.svn_date_to_swh_date(None) == default_tstz