Page MenuHomeSoftware Heritage

No OneTemporary

diff --git a/swh/loader/svn/svn.py b/swh/loader/svn/svn.py
index d10f6bd..b8f5543 100644
--- a/swh/loader/svn/svn.py
+++ b/swh/loader/svn/svn.py
@@ -1,365 +1,376 @@
# Copyright (C) 2015-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""SVN client in charge of iterating over svn logs and yield commit
representations including the hash tree/content computations per svn
commit.
"""
import logging
import os
import shutil
import tempfile
from typing import Dict, Iterator, List, Optional, Tuple, Union
-from subvertpy import client, properties
+from subvertpy import SubversionException, client, properties
from subvertpy.ra import Auth, RemoteAccess, get_username_provider
from swh.model.from_disk import Directory as DirectoryFromDisk
from swh.model.model import (
Content,
Directory,
Person,
SkippedContent,
TimestampWithTimezone,
)
from . import converters, replay
from .utils import is_recursive_external, parse_external_definition
# Fallback commit message used when a revision carries no log message property
DEFAULT_AUTHOR_MESSAGE = ""
# Module-level logger, named after this module
logger = logging.getLogger(__name__)
class SvnRepo:
    """Svn repository representation.

    Args:
        remote_url: Remote svn repository url
        origin_url: Associated origin identifier
        local_dirname: Path to write intermediary svn action results
        max_content_length: Passed as ``max_content_length`` when building the
            on-disk directory representation of an exported revision

    """

    def __init__(
        self,
        remote_url: str,
        origin_url: str,
        local_dirname: str,
        max_content_length: int,
    ):
        self.remote_url = remote_url.rstrip("/")
        self.origin_url = origin_url

        auth = Auth([get_username_provider()])
        # one connection for log iteration
        self.conn_log = RemoteAccess(self.remote_url, auth=auth)
        # another for replay
        self.conn = RemoteAccess(self.remote_url, auth=auth)
        # one client for update operation
        self.client = client.Client(auth=auth)

        self.local_dirname = local_dirname
        local_name = os.path.basename(self.remote_url)
        self.local_url = os.path.join(self.local_dirname, local_name).encode("utf-8")

        self.uuid = self.conn.get_uuid().encode("utf-8")
        self.swhreplay = replay.Replay(
            conn=self.conn,
            rootpath=self.local_url,
            svnrepo=self,
            temp_dir=local_dirname,
        )
        self.max_content_length = max_content_length
        # Flags updated while inspecting svn:externals definitions
        self.has_relative_externals = False
        self.has_recursive_externals = False
        # Set to True once swh_hash_data_per_revision starts iterating
        self.replay_started = False

    def __str__(self):
        return str(
            {
                "swh-origin": self.origin_url,
                "remote_url": self.remote_url,
                "local_url": self.local_url,
                "uuid": self.uuid,
            }
        )

    def head_revision(self) -> int:
        """Retrieve current head revision."""
        return self.conn.get_latest_revnum()

    def initial_revision(self) -> int:
        """Retrieve the initial revision from which the remote url appeared."""
        return 1

    def convert_commit_message(self, msg: Union[str, bytes]) -> bytes:
        """Simply encode the commit message.

        Args:
            msg: the commit message to convert.

        Returns:
            The transformed message as bytes (bytes input is returned as is,
            str input is encoded as utf-8).

        """
        if isinstance(msg, bytes):
            return msg
        return msg.encode("utf-8")

    def convert_commit_date(self, date: bytes) -> TimestampWithTimezone:
        """Convert the message commit date into a timestamp in swh format.
        The precision is kept.

        Args:
            date: the commit date to convert.

        Returns:
            The transformed date.

        """
        return converters.svn_date_to_swh_date(date)

    def convert_commit_author(self, author: Optional[bytes]) -> Person:
        """Convert the commit author into an swh person.

        Args:
            author: the commit author to convert.

        Returns:
            Person as model object

        """
        return converters.svn_author_to_swh_person(author)

    def __to_entry(self, log_entry: Tuple) -> Dict:
        """Convert a raw svn log entry into a commit dictionary.

        Args:
            log_entry: 4-tuple (changed_paths, rev, revprops, has_children);
                only the revision number and revision properties are used.

        Returns:
            Dictionary with keys rev, author_date, author_name and message.

        """
        _, rev, revprops, _ = log_entry

        author_date = self.convert_commit_date(
            revprops.get(properties.PROP_REVISION_DATE)
        )
        author = self.convert_commit_author(
            revprops.get(properties.PROP_REVISION_AUTHOR)
        )
        # fall back to the default message when the log property is missing
        message = self.convert_commit_message(
            revprops.get(properties.PROP_REVISION_LOG, DEFAULT_AUTHOR_MESSAGE)
        )

        return {
            "rev": rev,
            "author_date": author_date,
            "author_name": author,
            "message": message,
        }

    def logs(self, revision_start: int, revision_end: int) -> Iterator[Dict]:
        """Stream svn logs between revision_start and revision_end.

        Args:
            revision_start: the svn revision starting bound
            revision_end: the svn revision ending bound

        Yields:
            One dictionary per revision, with the following keys:

            - rev: revision number
            - author_date: date of the commit
            - author_name: author of the commit
            - message: commit message

        """
        for log_entry in self.conn_log.iter_log(
            paths=None,
            start=revision_start,
            end=revision_end,
            discover_changed_paths=False,
        ):
            yield self.__to_entry(log_entry)

    def export_temporary(self, revision: int) -> Tuple[str, bytes]:
        """Export the repository to a given revision in a temporary location. This is up
        to the caller of this function to clean up the temporary location when done (cf.
        self.clean_fs method)

        Args:
            revision: Revision to export at

        Returns:
            The tuple local_dirname the temporary location root
            folder, local_url where the repository was exported.

        Raises:
            SubversionException: when the export fails for a reason other than
                an unparsable svn:externals definition.

        """
        local_dirname = tempfile.mkdtemp(
            dir=self.local_dirname, prefix=f"check-revision-{revision}."
        )
        local_name = os.path.basename(self.remote_url)
        local_url = os.path.join(local_dirname, local_name)

        url = self.remote_url
        # if some paths have external URLs relative to the repository URL but targeting
        # paths outside it, we need to export from the origin URL as the remote URL can
        # target a dump mounted on the local filesystem
        if self.replay_started and self.has_relative_externals:
            # externals detected while replaying revisions
            url = self.origin_url
        elif not self.replay_started:
            # revisions replay has not started, we need to check if svn:externals
            # properties are set from a checkout of the revision and if some
            # external URLs are relative to pick the right export URL,
            # recursive externals are also checked
            with tempfile.TemporaryDirectory(
                dir=self.local_dirname, prefix=f"checkout-revision-{revision}."
            ) as co_dirname:
                self.client.checkout(
                    self.remote_url, co_dirname, revision, ignore_externals=True
                )
                # get all svn:externals properties recursively
                externals = self.client.propget(
                    "svn:externals", co_dirname, None, revision, True
                )
                self.has_relative_externals = False
                self.has_recursive_externals = False
                for path, external_defs in externals.items():
                    # stop scanning as soon as one flag got set by a definition
                    if self.has_relative_externals or self.has_recursive_externals:
                        break
                    path = path.replace(self.remote_url.rstrip("/") + "/", "")
                    for external_def in os.fsdecode(external_defs).split("\n"):
                        # skip empty line or comment
                        if not external_def or external_def.startswith("#"):
                            continue
                        (
                            external_path,
                            external_url,
                            _,
                            relative_url,
                        ) = parse_external_definition(
                            external_def.rstrip("\r"), path, self.origin_url
                        )

                        if is_recursive_external(
                            self.origin_url, path, external_path, external_url,
                        ):
                            self.has_recursive_externals = True
                            url = self.remote_url
                            break

                        if relative_url:
                            self.has_relative_externals = True
                            url = self.origin_url
                            break

        try:
            self.client.export(
                url.rstrip("/"),
                to=local_url,
                rev=revision,
                ignore_keywords=True,
                ignore_externals=self.has_recursive_externals,
            )
        except SubversionException as se:
            # The first exception argument is matched as the error message;
            # only unparsable externals definitions are tolerated.
            if not se.args[0].startswith(
                (
                    "Error parsing svn:externals property",
                    "Unrecognized format for the relative external URL",
                )
            ):
                raise
            logger.debug(
                "Ignoring externals error while exporting %s at revision %s: %s",
                url,
                revision,
                se,
            )

        return local_dirname, os.fsencode(local_url)

    def swh_hash_data_per_revision(
        self, start_revision: int, end_revision: int
    ) -> Iterator[
        Tuple[
            int,
            Optional[int],
            Dict,
            Tuple[List[Content], List[SkippedContent], List[Directory]],
            DirectoryFromDisk,
        ],
    ]:
        """Compute swh hash data per each revision between start_revision and
        end_revision.

        Args:
            start_revision: starting revision
            end_revision: ending revision

        Yields:
            Tuple (rev, nextrev, commit, objects, directory):

            - rev: current revision
            - nextrev: next revision or None if we reached end_revision.
            - commit: commit data (author, date, message) for such revision
            - objects: Tuple of lists of objects (contents, skipped contents,
              directories) computed by the replay for that revision
            - directory: complete Directory representation

        """
        # even in incremental loading mode, we need to replay the whole set of
        # path modifications from first revision to restore possible file states induced
        # by setting svn properties on those files (end of line style for instance)
        self.replay_started = True
        first_revision = 1 if start_revision else 0  # handle empty repository edge case
        for commit in self.logs(first_revision, end_revision):
            rev = commit["rev"]
            objects = self.swhreplay.compute_objects(rev)

            nextrev = None if rev == end_revision else rev + 1

            if rev >= start_revision:
                # start yielding new data to archive once we reached the revision to
                # resume the loading from
                yield rev, nextrev, commit, objects, self.swhreplay.directory

    def swh_hash_data_at_revision(
        self, revision: int
    ) -> Tuple[Dict, DirectoryFromDisk]:
        """Compute the information at a given svn revision. This is expected to be used
        for checks only.

        Args:
            revision: Revision to export and hash

        Returns:
            The tuple (commit dictionary, targeted directory object).

        """
        # Update disk representation of the repository at revision id
        local_dirname, local_url = self.export_temporary(revision)
        try:
            # Compute the current hashes on disk
            directory = DirectoryFromDisk.from_disk(
                path=local_url, max_content_length=self.max_content_length
            )

            # Retrieve the commit information for revision
            commit = list(self.logs(revision, revision))[0]
        finally:
            # Clean export directory, even when hashing or log retrieval failed
            self.clean_fs(local_dirname)

        return commit, directory

    def clean_fs(self, local_dirname: Optional[str] = None) -> None:
        """Clean up the local working copy.

        Args:
            local_dirname: Path to remove recursively if provided. Otherwise, remove the
                temporary upper root tree used for svn repository loading.

        """
        dirname = local_dirname or self.local_dirname
        if os.path.exists(dirname):
            logger.debug("cleanup %s", dirname)
            shutil.rmtree(dirname)

File Metadata

Mime Type
text/x-diff
Expires
Thu, Jul 3, 12:02 PM (5 d, 6 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3247212

Event Timeline