def svn_date_to_swh_date(strdate: Optional[bytes]) -> TimestampWithTimezone:
    """Turn a raw svn date into an swh timestamp.

    Args:
        strdate: ASCII-encoded date formatted like
            ``b'YYYY-mm-DDTHH:MM:SS.800722Z'``; may be ``None`` or empty.

    Returns:
        The swh timestamp for that date. A missing or empty date maps to
        1970-01-01T00:00:00 UTC.

    """
    if strdate:
        parsed = iso8601.parse_date(strdate.decode("ascii"))
    else:
        # nothing to parse (None or empty value): fall back to the epoch
        parsed = datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc)
    # sanity check: the swh timestamp needs a timezone-aware datetime
    assert parsed.tzinfo is not None, strdate
    return TimestampWithTimezone.from_datetime(parsed)


def svn_author_to_swh_person(author: Optional[bytes]) -> Person:
    """Build the swh person matching an svn author.

    Default policy: the author is used as-is for the person's fullname, no
    information is added. A missing author is handled as the empty byte
    string.

    Args:
        author: the svn author (in bytes), possibly ``None``

    Returns:
        a Person

    """
    fullname = author if author else b""
    return Person.from_fullname(fullname)


def build_swh_revision(
    rev: int, commit: Dict, repo_uuid: bytes, dir_id: bytes, parents: Sequence[bytes]
) -> Revision:
    """Build the swh revision matching one svn revision.

    The repository's uuid and the svn revision number are recorded in the
    revision's ``extra_headers``. The commit's author and date are reused for
    the committer fields.

    Args:
        rev: the svn revision number
        commit: the commit data; the keys ``author_name``, ``message`` and
            ``author_date`` are read
        repo_uuid: The repository's uuid
        dir_id: the tree's hash identifier
        parents: the revision's parents identifier

    Returns:
        The swh revision (a ``Revision`` model object).

    """
    who, message, when = (
        commit[key] for key in ("author_name", "message", "author_date")
    )
    svn_headers: Tuple[Tuple[bytes, bytes], ...] = (
        (b"svn_repo_uuid", repo_uuid),
        (b"svn_revision", str(rev).encode()),
    )
    return Revision(
        type=RevisionType.SUBVERSION,
        date=when,
        committer_date=when,
        directory=dir_id,
        message=message,
        author=who,
        committer=who,
        synthetic=True,
        extra_headers=svn_headers,
        parents=tuple(parents),
    )
# When log message contains empty data
DEFAULT_AUTHOR_MESSAGE = ""

logger = logging.getLogger(__name__)


class SvnRepo:
    """Svn repository representation.

    Args:
        remote_url: Remote svn repository url
        origin_url: Associated origin identifier
        local_dirname: Path to write intermediary svn action results
        max_content_length: Size limit passed to
            ``DirectoryFromDisk.from_disk`` when hashing an exported tree

    """

    def __init__(
        self,
        remote_url: str,
        origin_url: str,
        local_dirname: str,
        max_content_length: int,
    ):
        self.remote_url = remote_url.rstrip("/")
        self.origin_url = origin_url

        auth = Auth([get_username_provider()])
        # one connection for log iteration
        self.conn_log = RemoteAccess(self.remote_url, auth=auth)
        # another for replay
        self.conn = RemoteAccess(self.remote_url, auth=auth)
        # one client for update operation
        self.client = client.Client(auth=auth)

        self.local_dirname = local_dirname
        local_name = os.path.basename(self.remote_url)
        self.local_url = os.path.join(self.local_dirname, local_name).encode("utf-8")

        self.uuid = self.conn.get_uuid().encode("utf-8")
        self.swhreplay = ra.Replay(
            conn=self.conn, rootpath=self.local_url, svnrepo=self
        )
        self.max_content_length = max_content_length

    def __str__(self):
        """Render the origin, remote url, local url and uuid as a dict string."""
        return str(
            {
                "swh-origin": self.origin_url,
                "remote_url": self.remote_url,
                "local_url": self.local_url,
                "uuid": self.uuid,
            }
        )

    def head_revision(self) -> int:
        """Retrieve current head revision."""
        return self.conn.get_latest_revnum()

    def initial_revision(self) -> int:
        """Retrieve the initial revision from which the remote url appeared."""
        return 1

    def convert_commit_message(self, msg: Union[str, bytes]) -> bytes:
        """Simply encode the commit message.

        Args:
            msg: the commit message to convert.

        Returns:
            The transformed message as bytes: bytes are returned untouched,
            str is encoded as utf-8.

        """
        if isinstance(msg, bytes):
            return msg
        return msg.encode("utf-8")

    def convert_commit_date(self, date: Optional[bytes]) -> TimestampWithTimezone:
        """Convert the message commit date into a timestamp in swh format.
        The precision is kept.

        Args:
            date: the commit date to convert. ``None`` is accepted because
                the revision properties may carry no date at all.

        Returns:
            The transformed date.

        """
        return converters.svn_date_to_swh_date(date)

    def convert_commit_author(self, author: Optional[bytes]) -> Person:
        """Convert the commit author into an swh person.

        Args:
            author: the commit author to convert.

        Returns:
            Person as model object

        """
        return converters.svn_author_to_swh_person(author)

    def __to_entry(self, log_entry: Tuple) -> Dict:
        """Convert one raw log entry into the commit dictionary.

        Args:
            log_entry: 4-tuple (changed_paths, rev, revprops, has_children);
                only ``rev`` and ``revprops`` are used.

        Returns:
            Dictionary with the keys ``rev``, ``author_date``,
            ``author_name`` and ``message``.

        """
        _changed_paths, rev, revprops, _has_children = log_entry

        author_date = self.convert_commit_date(
            revprops.get(properties.PROP_REVISION_DATE)
        )

        author = self.convert_commit_author(
            revprops.get(properties.PROP_REVISION_AUTHOR)
        )

        message = self.convert_commit_message(
            revprops.get(properties.PROP_REVISION_LOG, DEFAULT_AUTHOR_MESSAGE)
        )

        return {
            "rev": rev,
            "author_date": author_date,
            "author_name": author,
            "message": message,
        }

    def logs(self, revision_start: int, revision_end: int) -> Iterator[Dict]:
        """Stream svn logs between revision_start and revision_end.

        Args:
            revision_start: the svn revision starting bound
            revision_end: the svn revision ending bound

        Yields:
            One dictionary per log entry, with the following keys:

            - rev: revision number
            - author_date: date of the commit
            - author_name: name of the author
            - message: commit message

        """
        for log_entry in self.conn_log.iter_log(
            paths=None,
            start=revision_start,
            end=revision_end,
            discover_changed_paths=False,
        ):
            yield self.__to_entry(log_entry)

    def export_temporary(self, revision: int) -> Tuple[str, bytes]:
        """Export the repository to a given revision in a temporary location. This is
        up to the caller of this function to clean up the temporary location when done
        (cf. self.clean_fs method)

        Args:
            revision: Revision to export at

        Returns:
            The tuple local_dirname the temporary location root
            folder, local_url where the repository was exported.

        """
        local_dirname = tempfile.mkdtemp(
            dir=self.local_dirname, prefix=f"check-revision-{revision}."
        )
        local_name = os.path.basename(self.remote_url)
        local_url = os.path.join(local_dirname, local_name)
        self.client.export(
            self.remote_url, to=local_url, rev=revision, ignore_keywords=True
        )
        return local_dirname, os.fsencode(local_url)

    def swh_hash_data_per_revision(
        self, start_revision: int, end_revision: int
    ) -> Iterator[
        Tuple[
            int,
            Optional[int],
            Dict,
            Tuple[List[Content], List[SkippedContent], List[Directory]],
            DirectoryFromDisk,
        ],
    ]:
        """Compute swh hash data per each revision between start_revision and
        end_revision.

        Args:
            start_revision: starting revision
            end_revision: ending revision

        Yields:
            Tuple (rev, nextrev, commit, objects_per_path, directory):

            - rev: current revision
            - nextrev: next revision or None if we reached end_revision.
            - commit: commit data (author, date, message) for such revision
            - objects_per_path: Tuple of list of objects between start_revision and
              end_revision
            - directory: complete Directory representation

        """
        # even in incremental loading mode, we need to replay the whole set of
        # path modifications from first revision to restore possible file states induced
        # by setting svn properties on those files (end of line style for instance)
        first_revision = 1 if start_revision else 0  # handle empty repository edge case
        for commit in self.logs(first_revision, end_revision):
            rev = commit["rev"]
            objects = self.swhreplay.compute_objects(rev)

            if rev == end_revision:
                nextrev = None
            else:
                nextrev = rev + 1

            if rev >= start_revision:
                # start yielding new data to archive once we reached the revision to
                # resume the loading from
                yield rev, nextrev, commit, objects, self.swhreplay.directory

    def swh_hash_data_at_revision(
        self, revision: int
    ) -> Tuple[Dict, DirectoryFromDisk]:
        """Compute the information at a given svn revision. This is expected to be used
        for checks only.

        Returns:
            The tuple (commit dictionary, targeted directory object).

        """
        # Update disk representation of the repository at revision id
        local_dirname, local_url = self.export_temporary(revision)
        try:
            # Compute the current hashes on disk
            directory = DirectoryFromDisk.from_disk(
                path=local_url, max_content_length=self.max_content_length
            )

            # Retrieve the commit information for revision
            commit = list(self.logs(revision, revision))[0]
        finally:
            # Clean export directory, even when hashing or log retrieval failed
            self.clean_fs(local_dirname)

        return commit, directory

    def clean_fs(self, local_dirname: Optional[str] = None) -> None:
        """Clean up the local working copy.

        Args:
            local_dirname: Path to remove recursively if provided. Otherwise, remove the
                temporary upper root tree used for svn repository loading.

        """
        dirname = local_dirname or self.local_dirname
        if os.path.exists(dirname):
            logger.debug("cleanup %s", dirname)
            shutil.rmtree(dirname)
def test_svn_date_to_swh_date():
    """The timestamp should not be tampered with and include the decimals."""
    assert converters.svn_date_to_swh_date(
        b"2011-05-31T06:04:39.500900Z"
    ) == TimestampWithTimezone(
        timestamp=Timestamp(seconds=1306821879, microseconds=500900),
        offset=0,
        negative_utc=False,
    )

    assert converters.svn_date_to_swh_date(
        b"2011-05-31T06:04:39.800722Z"
    ) == TimestampWithTimezone(
        timestamp=Timestamp(seconds=1306821879, microseconds=800722),
        offset=0,
        negative_utc=False,
    )


def test_svn_date_to_swh_date_epoch():
    """Empty date should be EPOCH (timestamp and offset at 0)."""
    # It should return 0, epoch
    default_tstz = TimestampWithTimezone(
        timestamp=Timestamp(seconds=0, microseconds=0), offset=0, negative_utc=False,
    )

    # the converter takes Optional[bytes], so the empty case is the empty
    # byte string rather than an empty str
    assert converters.svn_date_to_swh_date(b"") == default_tstz
    assert converters.svn_date_to_swh_date(None) == default_tstz