Page MenuHomeSoftware Heritage

No OneTemporary

diff --git a/requirements.txt b/requirements.txt
index 5730ae2..f03579d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,7 @@
# Add here external Python modules dependencies, one per line. Module names
# should match https://pypi.python.org/pypi names. For the full spec or
# dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html
click
iso8601
-python-dateutil
subvertpy >= 0.9.4
typing-extensions
diff --git a/swh/loader/svn/converters.py b/swh/loader/svn/converters.py
index 65878b4..cdef38b 100644
--- a/swh/loader/svn/converters.py
+++ b/swh/loader/svn/converters.py
@@ -1,87 +1,86 @@
# Copyright (C) 2015-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import datetime
from typing import Dict, Optional, Sequence, Tuple
-import dateutil
+import iso8601
from swh.model.model import Person, Revision, RevisionType, TimestampWithTimezone
-def svn_date_to_swh_date(strdate: Optional[str]) -> TimestampWithTimezone:
+def svn_date_to_swh_date(strdate: Optional[bytes]) -> TimestampWithTimezone:
"""Convert an svn revision date to an swh one.
A missing or empty date is mapped to 1970-01-01 00:00:00 UTC.
Args:
strdate: A string representing a date with format like
- 'YYYY-mm-DDTHH:MM:SS.800722Z'
+ ``b'YYYY-mm-DDTHH:MM:SS.800722Z'``
Returns:
An swh date format
"""
if not strdate: # either None or empty (str before this change, bytes after)
dt = datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc)
else:
- # TODO: Migrate to iso8601 if possible
- dt = dateutil.parser.parse(strdate)
+ dt = iso8601.parse_date(strdate.decode("ascii"))
assert dt.tzinfo is not None, strdate
return TimestampWithTimezone.from_datetime(dt)
def svn_author_to_swh_person(author: Optional[bytes]) -> Person:
    """Build an swh person out of a raw svn author.

    The author bytes are handed as-is to ``Person.from_fullname``; a missing
    author (``None`` or empty) is replaced by the empty byte string.

    Args:
        author: the svn author (in bytes), possibly None

    Returns:
        a Person
    """
    fullname = author if author else b""
    return Person.from_fullname(fullname)
def build_swh_revision(
    rev: int, commit: Dict, repo_uuid: bytes, dir_id: bytes, parents: Sequence[bytes]
) -> Revision:
    """Build the swh revision matching a given svn revision.

    The repository's uuid and the svn revision number are recorded in the
    revision's extra headers. The commit's author and date are reused for
    the committer and committer date.

    Args:
        rev: the svn revision number
        commit: the commit data; the keys ``author_name``, ``message`` and
            ``author_date`` are read
        repo_uuid: the repository's uuid
        dir_id: the tree's hash identifier
        parents: the revision's parents identifiers

    Returns:
        The swh Revision object.
    """
    person = commit["author_name"]
    log_message = commit["message"]
    timestamp = commit["author_date"]

    svn_headers: Tuple[Tuple[bytes, bytes], ...] = (
        (b"svn_repo_uuid", repo_uuid),
        (b"svn_revision", str(rev).encode()),
    )

    return Revision(
        type=RevisionType.SUBVERSION,
        date=timestamp,
        committer_date=timestamp,
        directory=dir_id,
        message=log_message,
        author=person,
        committer=person,
        synthetic=True,
        extra_headers=svn_headers,
        parents=tuple(parents),
    )
diff --git a/swh/loader/svn/svn.py b/swh/loader/svn/svn.py
index b6680e9..308b33e 100644
--- a/swh/loader/svn/svn.py
+++ b/swh/loader/svn/svn.py
@@ -1,298 +1,298 @@
# Copyright (C) 2015-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""SVN client in charge of iterating over svn logs and yield commit
representations including the hash tree/content computations per svn
commit.
"""
import logging
import os
import shutil
import tempfile
from typing import Dict, Iterator, List, Optional, Tuple, Union
from subvertpy import client, properties
from subvertpy.ra import Auth, RemoteAccess, get_username_provider
from swh.model.from_disk import Directory as DirectoryFromDisk
from swh.model.model import (
Content,
Directory,
Person,
SkippedContent,
TimestampWithTimezone,
)
from . import converters, ra
# Fallback commit message used when a revision has no log property
DEFAULT_AUTHOR_MESSAGE = ""
# Module-level logger, named after this module
logger = logging.getLogger(__name__)
class SvnRepo:
"""Svn repository representation.
Args:
remote_url: Remote svn repository url
origin_url: Associated origin identifier
local_dirname: Path to write intermediary svn action results
"""
def __init__(
    self,
    remote_url: str,
    origin_url: str,
    local_dirname: str,
    max_content_length: int,
):
    """Open the connections to the remote repository and set up the replay.

    Args:
        remote_url: remote svn repository url (trailing slashes are dropped)
        origin_url: associated origin identifier
        local_dirname: path to write intermediary svn action results
        max_content_length: kept on the instance as ``max_content_length``
    """
    self.remote_url = remote_url.rstrip("/")
    self.origin_url = origin_url

    credentials = Auth([get_username_provider()])
    # Log iteration and replay each use their own connection
    self.conn_log = RemoteAccess(self.remote_url, auth=credentials)
    self.conn = RemoteAccess(self.remote_url, auth=credentials)
    # Client for the update operation
    self.client = client.Client(auth=credentials)

    self.local_dirname = local_dirname
    checkout_path = os.path.join(
        self.local_dirname, os.path.basename(self.remote_url)
    )
    self.local_url = checkout_path.encode("utf-8")

    self.uuid = self.conn.get_uuid().encode("utf-8")
    self.swhreplay = ra.Replay(
        conn=self.conn, rootpath=self.local_url, svnrepo=self
    )
    self.max_content_length = max_content_length
def __str__(self):
return str(
{
"swh-origin": self.origin_url,
"remote_url": self.remote_url,
"local_url": self.local_url,
"uuid": self.uuid,
}
)
def head_revision(self) -> int:
    """Return the latest revision number reported by ``self.conn``.
    """
    latest = self.conn.get_latest_revnum()
    return latest
def initial_revision(self) -> int:
    """Return the initial revision from which the remote url appeared.

    This is currently the constant 1.
    """
    first_revision = 1
    return first_revision
def convert_commit_message(self, msg: Union[str, bytes]) -> bytes:
    """Return the commit message as bytes.

    Args:
        msg: the commit message to convert; bytes are returned untouched,
            text is encoded as UTF-8.

    Returns:
        The message as bytes.
    """
    return msg if isinstance(msg, bytes) else msg.encode("utf-8")
- def convert_commit_date(self, date: str) -> TimestampWithTimezone:
+ def convert_commit_date(self, date: bytes) -> TimestampWithTimezone:
"""Convert the message commit date into a timestamp in swh format.
The precision is kept.
The conversion is delegated to ``converters.svn_date_to_swh_date``.
Args:
date: the commit date to convert. NOTE(review): ``__to_entry`` passes
the result of ``revprops.get(...)``, so this may also be None.
Returns:
The transformed date.
"""
return converters.svn_date_to_swh_date(date)
def convert_commit_author(self, author: Optional[bytes]) -> Person:
    """Turn the commit author into an swh person.

    The conversion is delegated to ``converters.svn_author_to_swh_person``.

    Args:
        author: the commit author to convert.

    Returns:
        Person as model object
    """
    person = converters.svn_author_to_swh_person(author)
    return person
def __to_entry(self, log_entry: Tuple) -> Dict:
    """Turn one raw log entry into a commit dictionary.

    Args:
        log_entry: 4-tuple (changed paths, revision, revision properties,
            has-children flag); only the revision and its properties are used.

    Returns:
        Dictionary with keys ``rev``, ``author_date``, ``author_name`` and
        ``message``.
    """
    _changed_paths, rev, revprops, _has_children = log_entry

    raw_date = revprops.get(properties.PROP_REVISION_DATE)
    author_date = self.convert_commit_date(raw_date)

    raw_author = revprops.get(properties.PROP_REVISION_AUTHOR)
    author = self.convert_commit_author(raw_author)

    # A revision without a log property falls back to the default message
    raw_message = revprops.get(
        properties.PROP_REVISION_LOG, DEFAULT_AUTHOR_MESSAGE
    )
    message = self.convert_commit_message(raw_message)

    return dict(
        rev=rev, author_date=author_date, author_name=author, message=message,
    )
def logs(self, revision_start: int, revision_end: int) -> Iterator[Dict]:
    """Stream svn log entries between revision_start and revision_end.

    The entries are read from the log connection (``self.conn_log``) without
    discovering changed paths.

    Args:
        revision_start: the svn revision starting bound
        revision_end: the svn revision ending bound

    Yields:
        One dictionary per log entry, with the following keys:

        - rev: revision number
        - author_date: date of the commit
        - author_name: name of the author
        - message: commit message
    """
    raw_entries = self.conn_log.iter_log(
        paths=None,
        start=revision_start,
        end=revision_end,
        discover_changed_paths=False,
    )
    for raw_entry in raw_entries:
        yield self.__to_entry(raw_entry)
def export_temporary(self, revision: int) -> Tuple[str, bytes]:
    """Export the repository at a given revision into a fresh temporary folder.

    The temporary folder is created under ``self.local_dirname``. Cleaning it
    up is left to the caller (cf. self.clean_fs method).

    Args:
        revision: Revision to export at

    Returns:
        The tuple (local_dirname, local_url): the temporary root folder, and
        the path (as bytes) where the repository was exported.
    """
    export_root = tempfile.mkdtemp(
        prefix=f"check-revision-{revision}.", dir=self.local_dirname
    )
    export_path = os.path.join(export_root, os.path.basename(self.remote_url))
    self.client.export(
        self.remote_url, to=export_path, rev=revision, ignore_keywords=True
    )
    return export_root, os.fsencode(export_path)
def swh_hash_data_per_revision(
    self, start_revision: int, end_revision: int
) -> Iterator[
    Tuple[
        int,
        Optional[int],
        Dict,
        Tuple[List[Content], List[SkippedContent], List[Directory]],
        DirectoryFromDisk,
    ],
]:
    """Compute the swh hash data of each revision from start_revision to
    end_revision.

    Args:
        start_revision: first revision to yield data for
        end_revision: last revision to process

    Yields:
        Tuple (rev, nextrev, commit, objects, directory):

        - rev: current revision
        - nextrev: next revision, or None once end_revision is reached
        - commit: commit data (author, date, message) for that revision
        - objects: what ``self.swhreplay.compute_objects(rev)`` returned
        - directory: ``self.swhreplay.directory`` after replaying rev
    """
    # Even when loading incrementally, every path modification is replayed from
    # the first revision, to restore file states induced by svn properties set
    # on those files (end of line style for instance). A start_revision of 0
    # covers the empty repository edge case.
    replay_from = 1 if start_revision else 0

    for commit in self.logs(replay_from, end_revision):
        rev = commit["rev"]
        objects = self.swhreplay.compute_objects(rev)

        if rev < start_revision:
            # Replayed only to rebuild state; nothing new to archive yet
            continue

        nextrev = None if rev == end_revision else rev + 1
        yield rev, nextrev, commit, objects, self.swhreplay.directory
def swh_hash_data_at_revision(
    self, revision: int
) -> Tuple[Dict, DirectoryFromDisk]:
    """Compute the information at a given svn revision. This is expected to be
    used for checks only.

    The repository is exported to a temporary folder, which is removed again
    before returning, including when hashing or log retrieval fails.

    Args:
        revision: the svn revision to export and hash

    Returns:
        The tuple (commit dictionary, targeted directory object).
    """
    # Update disk representation of the repository at revision id
    local_dirname, local_url = self.export_temporary(revision)
    try:
        # Compute the current hashes on disk
        directory = DirectoryFromDisk.from_disk(
            path=local_url, max_content_length=self.max_content_length
        )
        # Retrieve the commit information for revision
        commit = list(self.logs(revision, revision))[0]
    finally:
        # Clean the export directory even on failure, so it does not leak
        self.clean_fs(local_dirname)
    return commit, directory
def clean_fs(self, local_dirname: Optional[str] = None) -> None:
    """Remove a local working directory, if it exists.

    Args:
        local_dirname: Path to remove recursively if provided. Otherwise,
            ``self.local_dirname`` is removed.
    """
    target = local_dirname if local_dirname else self.local_dirname
    if not os.path.exists(target):
        return
    logger.debug("cleanup %s", target)
    shutil.rmtree(target)
diff --git a/swh/loader/svn/tests/test_converters.py b/swh/loader/svn/tests/test_converters.py
index 7dd7696..58c7238 100644
--- a/swh/loader/svn/tests/test_converters.py
+++ b/swh/loader/svn/tests/test_converters.py
@@ -1,124 +1,124 @@
# Copyright (C) 2015-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.loader.svn import converters
from swh.model.hashutil import hash_to_bytes
from swh.model.model import Person, Revision, Timestamp, TimestampWithTimezone
def test_svn_author_to_swh_person():
    """An author with an email gets its name, email and fullname filled.
    """
    actual_person = converters.svn_author_to_swh_person(b"tony <ynot@dagobah>")
    expected_person = Person.from_dict(
        dict(fullname=b"tony <ynot@dagobah>", name=b"tony", email=b"ynot@dagobah")
    )
    assert actual_person == expected_person
def test_svn_author_to_swh_person_no_email():
    """Without an email, name and fullname both equal the input author.
    """
    actual_person = converters.svn_author_to_swh_person(b"tony")
    expected_person = Person.from_dict(
        dict(fullname=b"tony", name=b"tony", email=None)
    )
    assert actual_person == expected_person
def test_svn_author_to_swh_person_empty_person():
    """An empty author only has its fullname set, to the empty byte-string.
    """
    actual_person = converters.svn_author_to_swh_person(b"")
    expected_person = Person.from_dict(dict(fullname=b"", name=None, email=None))
    assert actual_person == expected_person
def test_build_swh_revision_default():
    """The built revision carries the repository uuid and the svn revision
    number in its extra headers.
    """
    dir_id = hash_to_bytes("d6e08e19159f77983242877c373c75222d5ae9dd")
    date = TimestampWithTimezone(
        timestamp=Timestamp(seconds=1088108379, microseconds=0),
        offset=0,
        negative_utc=False,
    )
    # Same person is expected as both author and committer
    theo = {
        "name": b"theo",
        "email": b"theo@uuid",
        "fullname": b"theo <theo@uuid>",
    }
    commit = {
        "author_name": Person(**theo),
        "message": b"commit message",
        "author_date": date,
    }

    actual_rev = converters.build_swh_revision(
        repo_uuid=b"uuid", dir_id=dir_id, commit=commit, rev=10, parents=(),
    )

    expected_rev = Revision.from_dict(
        {
            "date": date.to_dict(),
            "committer_date": date.to_dict(),
            "type": "svn",
            "directory": dir_id,
            "message": b"commit message",
            "author": dict(theo),
            "committer": dict(theo),
            "synthetic": True,
            "extra_headers": (
                (b"svn_repo_uuid", b"uuid"),
                (b"svn_revision", b"10"),
            ),
            "parents": (),
        }
    )
    assert actual_rev == expected_rev
def test_svn_date_to_swh_date():
"""The timestamp should be converted without alteration, keeping its
microseconds, with a zero offset.
"""
assert converters.svn_date_to_swh_date(
- "2011-05-31T06:04:39.500900Z"
+ b"2011-05-31T06:04:39.500900Z"
) == TimestampWithTimezone(
timestamp=Timestamp(seconds=1306821879, microseconds=500900),
offset=0,
negative_utc=False,
)
assert converters.svn_date_to_swh_date(
- "2011-05-31T06:04:39.800722Z"
+ b"2011-05-31T06:04:39.800722Z"
) == TimestampWithTimezone(
timestamp=Timestamp(seconds=1306821879, microseconds=800722),
offset=0,
negative_utc=False,
)
def test_svn_date_to_swh_date_epoch():
    """A missing date (empty bytes or None) should be EPOCH (timestamp and
    offset at 0)."""
    default_tstz = TimestampWithTimezone(
        timestamp=Timestamp(seconds=0, microseconds=0), offset=0, negative_utc=False,
    )
    # The converter is annotated Optional[bytes]: pass empty bytes, not a str
    assert converters.svn_date_to_swh_date(b"") == default_tstz
    assert converters.svn_date_to_swh_date(None) == default_tstz

File Metadata

Mime Type
text/x-diff
Expires
Jul 4 2025, 8:11 AM (10 w, 4 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3236833

Event Timeline