Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/svn/svn.py
# Copyright (C) 2015-2021 The Software Heritage developers | # Copyright (C) 2015-2021 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
"""SVN client in charge of iterating over svn logs and yield commit | """SVN client in charge of iterating over svn logs and yield commit | ||||
representations including the hash tree/content computations per svn | representations including the hash tree/content computations per svn | ||||
commit. | commit. | ||||
""" | """ | ||||
import logging | import logging | ||||
import os | import os | ||||
import shutil | import shutil | ||||
import tempfile | import tempfile | ||||
from typing import Dict, Iterator, List, Optional, Tuple, Union | |||||
from subvertpy import client, properties | from subvertpy import client, properties | ||||
from subvertpy.ra import Auth, RemoteAccess, get_username_provider | from subvertpy.ra import Auth, RemoteAccess, get_username_provider | ||||
from swh.model.from_disk import Directory | from swh.model.from_disk import Directory as DirectoryFromDisk | ||||
from swh.model.model import ( | |||||
Content, | |||||
Directory, | |||||
Person, | |||||
SkippedContent, | |||||
TimestampWithTimezone, | |||||
) | |||||
from . import converters, ra | from . import converters, ra | ||||
# When log message contains empty data | # When log message contains empty data | ||||
DEFAULT_AUTHOR_MESSAGE = "" | DEFAULT_AUTHOR_MESSAGE = "" | ||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
Show All 40 Lines | def __str__(self): | ||||
{ | { | ||||
"swh-origin": self.origin_url, | "swh-origin": self.origin_url, | ||||
"remote_url": self.remote_url, | "remote_url": self.remote_url, | ||||
"local_url": self.local_url, | "local_url": self.local_url, | ||||
"uuid": self.uuid, | "uuid": self.uuid, | ||||
} | } | ||||
) | ) | ||||
def head_revision(self): | def head_revision(self) -> int: | ||||
"""Retrieve current head revision. | """Retrieve current head revision. | ||||
""" | """ | ||||
return self.conn.get_latest_revnum() | return self.conn.get_latest_revnum() | ||||
def initial_revision(self): | def initial_revision(self) -> int: | ||||
"""Retrieve the initial revision from which the remote url appeared. | """Retrieve the initial revision from which the remote url appeared. | ||||
""" | """ | ||||
return 1 | return 1 | ||||
def convert_commit_message(self, msg): | def convert_commit_message(self, msg: Union[str, bytes]) -> bytes: | ||||
"""Simply encode the commit message. | """Simply encode the commit message. | ||||
Args: | Args: | ||||
msg (str): the commit message to convert. | msg: the commit message to convert. | ||||
Returns: | Returns: | ||||
The transformed message as bytes. | The transformed message as bytes. | ||||
""" | """ | ||||
if isinstance(msg, bytes): | if isinstance(msg, bytes): | ||||
return msg | return msg | ||||
return msg.encode("utf-8") | return msg.encode("utf-8") | ||||
def convert_commit_date(self, date): | def convert_commit_date(self, date: str) -> TimestampWithTimezone: | ||||
"""Convert the message commit date into a timestamp in swh format. | """Convert the message commit date into a timestamp in swh format. | ||||
The precision is kept. | The precision is kept. | ||||
Args: | Args: | ||||
date (str): the commit date to convert. | date: the commit date to convert. | ||||
Returns: | Returns: | ||||
The transformed date. | The transformed date. | ||||
""" | """ | ||||
return converters.svn_date_to_swh_date(date) | return converters.svn_date_to_swh_date(date) | ||||
def convert_commit_author(self, author): | def convert_commit_author(self, author: Optional[bytes]) -> Person: | ||||
"""Convert the commit author into an swh person. | """Convert the commit author into an swh person. | ||||
Args: | Args: | ||||
author (str): the commit author to convert. | author: the commit author to convert. | ||||
Returns: | Returns: | ||||
Person: a model object | Person as model object | ||||
""" | """ | ||||
return converters.svn_author_to_swh_person(author) | return converters.svn_author_to_swh_person(author) | ||||
def __to_entry(self, log_entry): | def __to_entry(self, log_entry: Tuple) -> Dict: | ||||
changed_paths, rev, revprops, has_children = log_entry | changed_paths, rev, revprops, has_children = log_entry | ||||
author_date = self.convert_commit_date( | author_date = self.convert_commit_date( | ||||
revprops.get(properties.PROP_REVISION_DATE) | revprops.get(properties.PROP_REVISION_DATE) | ||||
) | ) | ||||
author = self.convert_commit_author( | author = self.convert_commit_author( | ||||
revprops.get(properties.PROP_REVISION_AUTHOR) | revprops.get(properties.PROP_REVISION_AUTHOR) | ||||
) | ) | ||||
message = self.convert_commit_message( | message = self.convert_commit_message( | ||||
revprops.get(properties.PROP_REVISION_LOG, DEFAULT_AUTHOR_MESSAGE) | revprops.get(properties.PROP_REVISION_LOG, DEFAULT_AUTHOR_MESSAGE) | ||||
) | ) | ||||
return { | return { | ||||
"rev": rev, | "rev": rev, | ||||
"author_date": author_date, | "author_date": author_date, | ||||
"author_name": author, | "author_name": author, | ||||
"message": message, | "message": message, | ||||
} | } | ||||
def logs(self, revision_start, revision_end): | def logs(self, revision_start: int, revision_end: int) -> Iterator[Dict]: | ||||
"""Stream svn logs between revision_start and revision_end by chunks of | """Stream svn logs between revision_start and revision_end by chunks of | ||||
block_size logs. | block_size logs. | ||||
Yields revision and associated revision information between the | Yields revision and associated revision information between the | ||||
revision start and revision_end. | revision start and revision_end. | ||||
Args: | Args: | ||||
revision_start: the svn revision starting bound | revision_start: the svn revision starting bound | ||||
revision_end: the svn revision ending bound | revision_end: the svn revision ending bound | ||||
Yields: | Yields: | ||||
tuple: tuple of revisions and logs: | tuple: tuple of revisions and logs: | ||||
vlorentz: need to update this | |||||
- revisions: list of revisions in order | - revisions: list of revisions in order | ||||
- logs: Dictionary with key revision number and value the log | - logs: Dictionary with key revision number and value the log | ||||
entry. The log entry is a dictionary with the following keys: | entry. The log entry is a dictionary with the following keys: | ||||
- author_date: date of the commit | - author_date: date of the commit | ||||
- author_name: name of the author | - author_name: name of the author | ||||
- message: commit message | - message: commit message | ||||
""" | """ | ||||
for log_entry in self.conn_log.iter_log( | for log_entry in self.conn_log.iter_log( | ||||
paths=None, | paths=None, | ||||
start=revision_start, | start=revision_start, | ||||
end=revision_end, | end=revision_end, | ||||
discover_changed_paths=False, | discover_changed_paths=False, | ||||
): | ): | ||||
yield self.__to_entry(log_entry) | yield self.__to_entry(log_entry) | ||||
def export(self, revision): | def export(self, revision: int) -> None: | ||||
"""Export the repository to a given version. | """Export the repository to a given version. | ||||
""" | """ | ||||
self.client.export( | self.client.export( | ||||
self.remote_url, | self.remote_url, | ||||
to=self.local_url.decode("utf-8"), | to=self.local_url.decode("utf-8"), | ||||
rev=revision, | rev=revision, | ||||
ignore_keywords=True, | ignore_keywords=True, | ||||
) | ) | ||||
def export_temporary(self, revision): | def export_temporary(self, revision: int) -> Tuple[str, bytes]: | ||||
"""Export the repository to a given revision in a temporary location. | """Export the repository to a given revision in a temporary location. This is up | ||||
This is up to the caller of this function to clean up the | to the caller of this function to clean up the temporary location when done (cf. | ||||
temporary location when done (cf. self.clean_fs method) | self.clean_fs method) | ||||
Args: | Args: | ||||
revision: Revision to export at | revision: Revision to export at | ||||
Returns: | Returns: | ||||
The tuple local_dirname the temporary location root | The tuple local_dirname the temporary location root | ||||
folder, local_url where the repository was exported. | folder, local_url where the repository was exported. | ||||
""" | """ | ||||
local_dirname = tempfile.mkdtemp( | local_dirname = tempfile.mkdtemp( | ||||
prefix="check-revision-%s." % revision, dir=self.local_dirname | dir=self.local_dirname, prefix=f"check-revision-{revision}." | ||||
) | ) | ||||
local_name = os.path.basename(self.remote_url) | local_name = os.path.basename(self.remote_url) | ||||
local_url = os.path.join(local_dirname, local_name) | local_url = os.path.join(local_dirname, local_name) | ||||
self.client.export( | self.client.export( | ||||
self.remote_url, to=local_url, rev=revision, ignore_keywords=True | self.remote_url, to=local_url, rev=revision, ignore_keywords=True | ||||
) | ) | ||||
return local_dirname, os.fsencode(local_url) | return local_dirname, os.fsencode(local_url) | ||||
def swh_hash_data_per_revision(self, start_revision, end_revision): | def swh_hash_data_per_revision( | ||||
self, start_revision: int, end_revision: int | |||||
) -> Iterator[ | |||||
Tuple[ | |||||
int, | |||||
Optional[int], | |||||
Dict, | |||||
Tuple[List[Content], List[SkippedContent], List[Directory]], | |||||
DirectoryFromDisk, | |||||
Not Done Inline Actions: would make sense to add a data structure for this, at least a namedtuple. vlorentz: would make sense to add a data structure for this, at least a namedtuple. | |||||
Done Inline Actions: yes, in another diff though, if you don't mind (I'm lazy right now). ardumont: yes, in another diff though, if you don't mind (I'm lazy right now). | |||||
], | |||||
]: | |||||
"""Compute swh hash data per each revision between start_revision and | """Compute swh hash data per each revision between start_revision and | ||||
end_revision. | end_revision. | ||||
Args: | Args: | ||||
start_revision: starting revision | start_revision: starting revision | ||||
end_revision: ending revision | end_revision: ending revision | ||||
Yields: | Yields: | ||||
tuple (rev, nextrev, commit, objects_per_path) | Tuple (rev, nextrev, commit, objects_per_path): | ||||
- rev: current revision | - rev: current revision | ||||
- nextrev: next revision | - nextrev: next revision or None if we reached end_revision. | ||||
- commit: commit data (author, date, message) for such revision | - commit: commit data (author, date, message) for such revision | ||||
- objects_per_path: dictionary of path, swh hash data with type | - objects_per_path: Tuple of list of objects between start_revision and | ||||
end_revision | |||||
- complete Directory representation | |||||
""" | """ | ||||
for commit in self.logs(start_revision, end_revision): | for commit in self.logs(start_revision, end_revision): | ||||
rev = commit["rev"] | rev = commit["rev"] | ||||
objects = self.swhreplay.compute_objects(rev) | objects = self.swhreplay.compute_objects(rev) | ||||
if rev == end_revision: | if rev == end_revision: | ||||
nextrev = None | nextrev = None | ||||
else: | else: | ||||
nextrev = rev + 1 | nextrev = rev + 1 | ||||
yield rev, nextrev, commit, objects, self.swhreplay.directory | yield rev, nextrev, commit, objects, self.swhreplay.directory | ||||
def swh_hash_data_at_revision(self, revision): | def swh_hash_data_at_revision( | ||||
"""Compute the hash data at revision. | self, revision: int | ||||
) -> Iterator[Tuple[Dict, DirectoryFromDisk]]: | |||||
"""Compute the information at a given svn revision. This is expected to be used | |||||
for update only. | |||||
Expected to be used for update only. | Yields: | ||||
The tuple (commit dictionary, targeted directory object). | |||||
""" | """ | ||||
# Update the disk at revision | # Update disk representation of the repository at revision id | ||||
self.export(revision) | self.export(revision) | ||||
# Compute the current hashes on disk | # Compute the current hashes on disk | ||||
directory = Directory.from_disk( | directory = DirectoryFromDisk.from_disk( | ||||
path=os.fsencode(self.local_url), max_content_length=self.max_content_length | path=os.fsencode(self.local_url), max_content_length=self.max_content_length | ||||
) | ) | ||||
# Update the replay collaborator with the right state | # Update the replay collaborator with the right state | ||||
self.swhreplay = ra.Replay( | self.swhreplay = ra.Replay( | ||||
conn=self.conn, rootpath=self.local_url, directory=directory | conn=self.conn, rootpath=self.local_url, directory=directory | ||||
) | ) | ||||
# Retrieve the commit information for revision | # Retrieve the commit information for revision | ||||
commit = list(self.logs(revision, revision))[0] | commit = list(self.logs(revision, revision))[0] | ||||
yield revision, revision + 1, commit, {}, directory | yield commit, directory | ||||
def clean_fs(self, local_dirname=None): | def clean_fs(self, local_dirname: Optional[str] = None) -> None: | ||||
"""Clean up the local working copy. | """Clean up the local working copy. | ||||
Args: | Args: | ||||
local_dirname (str): Path to remove recursively if | local_dirname: Path to remove recursively if provided. Otherwise, remove the | ||||
provided. Otherwise, remove the temporary upper root tree | temporary upper root tree used for svn repository loading. | ||||
used for svn repository loading. | |||||
""" | """ | ||||
dirname = local_dirname if local_dirname else self.local_dirname | dirname = local_dirname or self.local_dirname | ||||
if os.path.exists(dirname): | if os.path.exists(dirname): | ||||
logger.debug("cleanup %s", dirname) | logger.debug("cleanup %s", dirname) | ||||
shutil.rmtree(dirname) | shutil.rmtree(dirname) |
need to update this