Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9342023
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
16 KB
Subscribers
None
View Options
diff --git a/requirements-swh.txt b/requirements-swh.txt
index 38ea505..44d1ffe 100644
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,5 +1,5 @@
swh.core >= 0.0.7
swh.loader.core >= 0.0.78
-swh.model >= 0.0.60
+swh.model >= 0.3.0
swh.scheduler >= 0.0.39
swh.storage >= 0.0.108
diff --git a/swh/loader/git/converters.py b/swh/loader/git/converters.py
index 0cfb46e..be3abac 100644
--- a/swh/loader/git/converters.py
+++ b/swh/loader/git/converters.py
@@ -1,189 +1,189 @@
# Copyright (C) 2015-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""Convert dulwich objects to dictionaries suitable for swh.storage"""
from typing import Any, Dict, Optional
from swh.model.hashutil import DEFAULT_ALGORITHMS, hash_to_bytes, MultiHash
from swh.model.model import (
BaseContent,
Content,
Directory,
DirectoryEntry,
ObjectType,
Person,
Release,
Revision,
RevisionType,
SkippedContent,
TargetType,
Timestamp,
TimestampWithTimezone,
)
# Algorithms passed to MultiHash in dulwich_blob_to_content_id. "sha1_git" is
# left out because that digest is read directly from the blob (blob.sha()).
HASH_ALGORITHMS = DEFAULT_ALGORITHMS - {"sha1_git"}
def dulwich_blob_to_content_id(blob) -> Dict[str, Any]:
    """Compute the Software Heritage content identifiers of a dulwich blob.

    Args:
        blob: dulwich object whose ``type_name`` must be ``b"blob"``.

    Returns:
        The digests computed by ``MultiHash`` over ``HASH_ALGORITHMS``,
        plus the blob's own ``sha1_git`` digest and its ``length``.

    Raises:
        ValueError: if ``blob.type_name`` is not ``b"blob"``.
    """
    if blob.type_name != b"blob":
        raise ValueError("Argument is not a blob.")

    length = blob.raw_length()
    raw = blob.as_raw_string()

    content_id = MultiHash.from_data(raw, HASH_ALGORITHMS).digest()
    # sha1_git is taken from the blob itself instead of being recomputed.
    content_id.update({"sha1_git": blob.sha().digest(), "length": length})
    return content_id
def dulwich_blob_to_content(blob, max_content_size=None) -> BaseContent:
    """Turn a dulwich blob into a Software Heritage content object.

    Args:
        blob: dulwich object whose ``type_name`` must be ``b"blob"``.
        max_content_size: optional size limit; a blob whose length is greater
            than or equal to it is returned without its data.

    Returns:
        A ``SkippedContent`` (status ``"absent"``) when the size limit is
        reached, otherwise a ``Content`` (status ``"visible"``) carrying the
        raw blob data.

    Raises:
        ValueError: if ``blob.type_name`` is not ``b"blob"``.
    """
    if blob.type_name != b"blob":
        raise ValueError("Argument is not a blob.")

    content_id = dulwich_blob_to_content_id(blob)

    too_large = (
        max_content_size is not None and content_id["length"] >= max_content_size
    )
    if not too_large:
        return Content(data=blob.as_raw_string(), status="visible", **content_id)
    return SkippedContent(status="absent", reason="Content too large", **content_id)
def dulwich_tree_to_directory(tree, log=None) -> Directory:
"""Format a tree as a directory"""
# Raises ValueError when `tree` is not a dulwich tree object.
# NOTE: the `log` parameter is not used by this function.
if tree.type_name != b"tree":
raise ValueError("Argument is not a tree.")
entries = []
# Entry mode -> entry type. Modes missing from this table fall back to
# "file" through the .get() default used below.
entry_mode_map = {
0o040000: "dir",
0o160000: "rev",
0o100644: "file",
0o100755: "file",
0o120000: "file",
}
# One DirectoryEntry per tree item; entry.sha is a hex digest (bytes) that is
# decoded and converted to its binary form.
for entry in tree.iteritems():
entries.append(
DirectoryEntry(
type=entry_mode_map.get(entry.mode, "file"),
perms=entry.mode,
name=entry.path,
target=hash_to_bytes(entry.sha.decode("ascii")),
)
)
# Patch lines: the "-" line is the previous code, the "+" line replaces it so
# that `entries` is passed as a tuple instead of a list.
- return Directory(id=tree.sha().digest(), entries=entries,)
+ return Directory(id=tree.sha().digest(), entries=tuple(entries),)
def parse_author(name_email: bytes) -> Person:
    """Build a ``Person`` from a raw author line.

    Args:
        name_email: the full author line as bytes, e.g. ``b"a <b@c.com>"``.

    Returns:
        The result of ``Person.from_fullname`` for that line.
    """
    person = Person.from_fullname(name_email)
    return person
def dulwich_tsinfo_to_timestamp(
    timestamp, timezone, timezone_neg_utc
) -> TimestampWithTimezone:
    """Wrap dulwich timestamp information in a ``TimestampWithTimezone``.

    Args:
        timestamp: point in time; truncated to whole seconds with ``int()``.
        timezone: dulwich timezone value; stored as ``timezone // 60``.
        timezone_neg_utc: flag forwarded as ``negative_utc``, but only when
            ``timezone`` is zero.

    Returns:
        A ``TimestampWithTimezone`` with zero microseconds.
    """
    swh_timestamp = Timestamp(seconds=int(timestamp), microseconds=0)
    offset = timezone // 60

    # The negative-UTC marker is only kept for a zero offset.
    if timezone == 0:
        negative_utc = timezone_neg_utc
    else:
        negative_utc = False

    return TimestampWithTimezone(
        timestamp=swh_timestamp, offset=offset, negative_utc=negative_utc,
    )
def dulwich_commit_to_revision(commit, log=None) -> Revision:
# Convert a dulwich commit into a Revision of type RevisionType.GIT.
# Raises ValueError when `commit` is not a commit object.
# NOTE: the `log` parameter is not used by this function.
if commit.type_name != b"commit":
raise ValueError("Argument is not a commit.")
# Optional git headers are collected as [name, value] pairs in this order:
# encoding, mergetag(s), extra headers, gpgsig.
git_metadata = []
if commit.encoding is not None:
git_metadata.append(["encoding", commit.encoding])
if commit.mergetag:
for mergetag in commit.mergetag:
raw_string = mergetag.as_raw_string()
assert raw_string.endswith(b"\n")
# Store the serialized mergetag without its trailing newline (asserted above).
git_metadata.append(["mergetag", raw_string[:-1]])
if commit.extra:
# Extra header names are decoded to str; their values are kept as-is.
git_metadata.extend([k.decode("utf-8"), v] for k, v in commit.extra)
if commit.gpgsig:
git_metadata.append(["gpgsig", commit.gpgsig])
# Metadata is only attached when at least one header was collected;
# otherwise it stays None.
if git_metadata:
metadata: Optional[Dict[str, Any]] = {
"extra_headers": git_metadata,
}
else:
metadata = None
# The neg_utc flags are read from underscore-prefixed attributes of the
# dulwich commit object.
return Revision(
id=commit.sha().digest(),
author=parse_author(commit.author),
date=dulwich_tsinfo_to_timestamp(
commit.author_time, commit.author_timezone, commit._author_timezone_neg_utc,
),
committer=parse_author(commit.committer),
committer_date=dulwich_tsinfo_to_timestamp(
commit.commit_time, commit.commit_timezone, commit._commit_timezone_neg_utc,
),
type=RevisionType.GIT,
# commit.tree and commit.parents hold hex digests as bytes; they are
# converted to their binary form.
directory=bytes.fromhex(commit.tree.decode()),
message=commit.message,
metadata=metadata,
synthetic=False,
# Patch lines: the "-" line is the previous code, the "+" line replaces it so
# that `parents` is passed as a tuple instead of a list.
- parents=[bytes.fromhex(p.decode()) for p in commit.parents],
+ parents=tuple(bytes.fromhex(p.decode()) for p in commit.parents),
)
# dulwich type name (bytes) -> TargetType member.
DULWICH_TARGET_TYPES = {
b"blob": TargetType.CONTENT,
b"tree": TargetType.DIRECTORY,
b"commit": TargetType.REVISION,
b"tag": TargetType.RELEASE,
}
# dulwich type name (bytes) -> ObjectType member; used by
# dulwich_tag_to_release to fill in Release.target_type.
DULWICH_OBJECT_TYPES = {
b"blob": ObjectType.CONTENT,
b"tree": ObjectType.DIRECTORY,
b"commit": ObjectType.REVISION,
b"tag": ObjectType.RELEASE,
}
def dulwich_tag_to_release(tag, log=None) -> Release:
    """Convert a dulwich tag into a ``Release``.

    Args:
        tag: dulwich object whose ``type_name`` must be ``b"tag"``.
        log: unused.

    Returns:
        A non-synthetic ``Release`` without metadata. ``author`` is only set
        when the tag has a tagger, and ``date`` only when it additionally has
        a truthy ``tag_time``.

    Raises:
        ValueError: if ``tag.type_name`` is not ``b"tag"``.
    """
    if tag.type_name != b"tag":
        raise ValueError("Argument is not a tag.")

    target_type, target = tag.object

    author: Optional[Person] = None
    date = None
    if tag.tagger:
        author = parse_author(tag.tagger)
        # A tagger without a tag time leaves the release undated.
        if tag.tag_time:
            date = dulwich_tsinfo_to_timestamp(
                tag.tag_time, tag.tag_timezone, tag._tag_timezone_neg_utc,
            )

    return Release(
        id=tag.sha().digest(),
        author=author,
        date=date,
        name=tag.name,
        target=bytes.fromhex(target.decode()),
        target_type=DULWICH_OBJECT_TYPES[target_type.type_name],
        message=tag._message,
        metadata=None,
        synthetic=False,
    )
diff --git a/swh/loader/git/tests/test_converters.py b/swh/loader/git/tests/test_converters.py
index 8b71a80..849de2a 100644
--- a/swh/loader/git/tests/test_converters.py
+++ b/swh/loader/git/tests/test_converters.py
@@ -1,319 +1,319 @@
# Copyright (C) 2015-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os
import pytest
import shutil
import subprocess
import tempfile
import unittest
import dulwich.repo
from swh.model.hashutil import bytehex_to_hash, hash_to_bytes
from swh.model.model import (
Content,
Person,
Release,
Revision,
RevisionType,
ObjectType,
Timestamp,
TimestampWithTimezone,
)
import swh.loader.git.converters as converters
# "data" directory located next to this test module; setUpClass reads the
# fast-export fixture from it.
TEST_DATA = os.path.join(os.path.dirname(__file__), "data")
class SWHObjectType:
    """Minimal stand-in for a dulwich object type.

    Only exposes the ``type_name`` attribute given at construction.
    """

    def __init__(self, type_name):
        # Stored untouched so that it can be looked up by the converters.
        self.type_name = type_name
class SWHTag:
    """Minimal stand-in for a dulwich tag.

    Carries just the attributes read by ``dulwich_tag_to_release``:
    identification (``name``, ``type_name``), the ``object`` pair
    (target type wrapper, target id), tagger information and the message.
    """

    def __init__(
        self,
        name,
        type_name,
        target,
        target_type,
        tagger,
        tag_time,
        tag_timezone,
        message,
    ):
        # Identification of the tag and of what it points to.
        self.name = name
        self.type_name = type_name
        self.object = SWHObjectType(target_type), target

        # Tagger, message and date information.
        self.tagger = tagger
        self._message = message
        self.tag_time = tag_time
        self.tag_timezone = tag_timezone
        # The fake tag never reports a negative-UTC timezone.
        self._tag_timezone_neg_utc = False

    def sha(self):
        """Return a fresh SHA-1 hash object with no data fed into it."""
        import hashlib

        return hashlib.sha1()
@pytest.mark.fs
class TestConverters(unittest.TestCase):
@classmethod
def setUpClass(cls):
    """Create a bare repository and populate it from the test fixture.

    The xz-compressed fast-export stream stored under ``TEST_DATA`` is
    decompressed by ``xzcat`` and piped into ``git fast-import`` running
    inside a temporary bare repository (``cls.repo_path`` / ``cls.repo``).
    """
    super().setUpClass()
    cls.repo_path = tempfile.mkdtemp()
    cls.repo = dulwich.repo.Repo.init_bare(cls.repo_path)

    fast_export = os.path.join(
        TEST_DATA, "git-repos", "example-submodule.fast-export.xz"
    )

    # The context manager makes sure the fixture file handle is closed once
    # the import is over.
    with open(fast_export, "rb") as compressed:
        xz = subprocess.Popen(
            ["xzcat"], stdin=compressed, stdout=subprocess.PIPE,
        )
        git = subprocess.Popen(
            ["git", "fast-import", "--quiet"], stdin=xz.stdout, cwd=cls.repo_path,
        )

        # Close our copy of the pipe's read end: git holds its own, and xzcat
        # must be able to receive SIGPIPE if git exits early.
        xz.stdout.close()
        git.communicate()
        # Reap the decompressor so that no child process is left behind.
        xz.wait()
@classmethod
def tearDownClass(cls):
    """Remove the temporary repository created by ``setUpClass``."""
    super().tearDownClass()
    shutil.rmtree(cls.repo_path)
def test_blob_to_content(self):
    """A blob from the fixture repository converts to the expected Content."""
    blob_id = b"28c6f4023d65f74e3b59a2dea3c4277ed9ee07b0"

    sha1 = hash_to_bytes("4850a3420a2262ff061cb296fb915430fa92301c")
    sha256 = hash_to_bytes(
        "fee7c8a485a10321ad94b64135073cb5" "5f22cb9f57fa2417d2adfb09d310adef"
    )
    blake2s256 = hash_to_bytes(
        "5d71873f42a137f6d89286e43677721e574" "1fa05ce4cd5e3c7ea7c44d4c2d10b"
    )
    raw_data = (
        b'[submodule "example-dependency"]\n'
        b"\tpath = example-dependency\n"
        b"\turl = https://github.com/githubtraining/"
        b"example-dependency.git\n"
    )

    expected_content = Content(
        sha1_git=bytehex_to_hash(blob_id),
        sha1=sha1,
        sha256=sha256,
        blake2s256=blake2s256,
        data=raw_data,
        length=124,
        status="visible",
    )

    content = converters.dulwich_blob_to_content(self.repo[blob_id])

    self.assertEqual(content, expected_content)
def test_convertion_wrong_input(self):
    """Every converter rejects an object whose git type is not the one it handles."""

    class Something:
        type_name = b"something-not-the-right-type"

    # One entry per converter. The commit converter must be listed
    # explicitly; listing the tree converter in its place would leave
    # dulwich_commit_to_revision untested.
    wrong_input_converters = (
        converters.dulwich_blob_to_content,
        converters.dulwich_blob_to_content_id,
        converters.dulwich_tree_to_directory,
        converters.dulwich_commit_to_revision,
        converters.dulwich_tag_to_release,
    )

    for convert in wrong_input_converters:
        # subTest reports which converter failed instead of stopping at the
        # first failure.
        with self.subTest(converter=convert.__name__):
            with self.assertRaises(ValueError):
                convert(Something())
def test_commit_to_revision(self):
# Convert one commit of the fixture repository and compare the result with
# a hand-written Revision.
sha1 = b"9768d0b576dbaaecd80abedad6dfd0d72f1476da"
revision = converters.dulwich_commit_to_revision(self.repo[sha1])
expected_revision = Revision(
id=hash_to_bytes("9768d0b576dbaaecd80abedad6dfd0d72f1476da"),
directory=b"\xf0i\\./\xa7\xce\x9dW@#\xc3A7a\xa4s\xe5\x00\xca",
type=RevisionType.GIT,
committer=Person(
name=b"Stefano Zacchiroli",
fullname=b"Stefano Zacchiroli <zack@upsilon.cc>",
email=b"zack@upsilon.cc",
),
author=Person(
name=b"Stefano Zacchiroli",
fullname=b"Stefano Zacchiroli <zack@upsilon.cc>",
email=b"zack@upsilon.cc",
),
committer_date=TimestampWithTimezone(
timestamp=Timestamp(seconds=1443083765, microseconds=0,),
negative_utc=False,
offset=120,
),
message=b"add submodule dependency\n",
# No extra git headers are expected for this commit, hence no metadata.
metadata=None,
date=TimestampWithTimezone(
timestamp=Timestamp(seconds=1443083765, microseconds=0,),
negative_utc=False,
offset=120,
),
# Patch lines: the "-" line is the previous expectation, the "+" line replaces
# it so that the expected parents are a 1-tuple instead of a list.
- parents=[b"\xc3\xc5\x88q23`\x9f[\xbb\xb2\xd9\xe7\xf3\xfbJf\x0f?r"],
+ parents=(b"\xc3\xc5\x88q23`\x9f[\xbb\xb2\xd9\xe7\xf3\xfbJf\x0f?r",),
synthetic=False,
)
self.assertEqual(revision, expected_revision)
def test_author_line_to_author(self):
    """``parse_author`` splits author lines into name, email and fullname."""
    # edge case out of the way
    with self.assertRaises(TypeError):
        converters.parse_author(None)

    # Raw author line -> Person expected from parsing it.
    expected_by_line = {
        b"a <b@c.com>": Person(
            name=b"a", email=b"b@c.com", fullname=b"a <b@c.com>",
        ),
        b"<foo@bar.com>": Person(
            name=None, email=b"foo@bar.com", fullname=b"<foo@bar.com>",
        ),
        b"malformed <email": Person(
            name=b"malformed", email=b"email", fullname=b"malformed <email"
        ),
        b"trailing <sp@c.e> ": Person(
            name=b"trailing", email=b"sp@c.e", fullname=b"trailing <sp@c.e> ",
        ),
        b"no<sp@c.e>": Person(name=b"no", email=b"sp@c.e", fullname=b"no<sp@c.e>",),
        b" <>": Person(name=None, email=None, fullname=b" <>",),
        b"something": Person(name=b"something", email=None, fullname=b"something"),
    }

    # Sorted iteration keeps the order of the checks deterministic.
    for line in sorted(expected_by_line):
        expected_person = expected_by_line[line]
        self.assertEqual(expected_person, converters.parse_author(line))
def test_dulwich_tag_to_release_no_author_no_date(self):
    """A tag without tagger yields a release with neither author nor date."""
    target = b"641fb6e08ddb2e4fd096dcf18e80b894bf"
    message = b"some release message"

    # given
    tag = SWHTag(
        name=b"blah",
        type_name=b"tag",
        target=target,
        target_type=b"commit",
        tagger=None,
        tag_time=None,
        tag_timezone=None,
        message=message,
    )
    expected_release = Release(
        # SWHTag.sha() hashes no data, so the id is the SHA-1 of empty input.
        id=b"\xda9\xa3\xee^kK\r2U\xbf\xef\x95`\x18\x90\xaf\xd8\x07\t",
        name=b"blah",
        author=None,
        date=None,
        message=message,
        metadata=None,
        synthetic=False,
        target=hash_to_bytes(target.decode()),
        target_type=ObjectType.REVISION,
    )

    # when
    actual_release = converters.dulwich_tag_to_release(tag)

    # then
    self.assertEqual(actual_release, expected_release)
def test_dulwich_tag_to_release_author_and_date(self):
    """A tag with tagger and tag time yields a release with author and date."""
    import datetime

    tagger = b"hey dude <hello@mail.org>"
    target = b"641fb6e08ddb2e4fd096dcf18e80b894bf"
    message = b"some release message"

    midnight_utc = datetime.datetime(2007, 12, 5, tzinfo=datetime.timezone.utc)
    date = midnight_utc.timestamp()

    # given
    tag = SWHTag(
        name=b"blah",
        type_name=b"tag",
        target=target,
        target_type=b"commit",
        tagger=tagger,
        tag_time=date,
        tag_timezone=0,
        message=message,
    )
    expected_author = Person(
        name=b"hey dude",
        email=b"hello@mail.org",
        fullname=b"hey dude <hello@mail.org>",
    )
    expected_date = TimestampWithTimezone(
        timestamp=Timestamp(seconds=1196812800, microseconds=0,),
        offset=0,
        negative_utc=False,
    )
    expected_release = Release(
        # SWHTag.sha() hashes no data, so the id is the SHA-1 of empty input.
        id=b"\xda9\xa3\xee^kK\r2U\xbf\xef\x95`\x18\x90\xaf\xd8\x07\t",
        name=b"blah",
        author=expected_author,
        date=expected_date,
        message=message,
        metadata=None,
        synthetic=False,
        target=hash_to_bytes(target.decode()),
        target_type=ObjectType.REVISION,
    )

    # when
    actual_release = converters.dulwich_tag_to_release(tag)

    # then
    self.assertEqual(actual_release, expected_release)
def test_dulwich_tag_to_release_author_no_date(self):
    """A tag with a tagger but no tag time yields an undated release."""
    # to reproduce bug T815 (fixed)
    tagger = b"hey dude <hello@mail.org>"
    target = b"641fb6e08ddb2e4fd096dcf18e80b894bf"
    message = b"some release message"

    # given
    tag = SWHTag(
        name=b"blah",
        type_name=b"tag",
        target=target,
        target_type=b"commit",
        tagger=tagger,
        tag_time=None,
        tag_timezone=None,
        message=message,
    )
    expected_author = Person(
        name=b"hey dude",
        email=b"hello@mail.org",
        fullname=b"hey dude <hello@mail.org>",
    )
    expected_release = Release(
        # SWHTag.sha() hashes no data, so the id is the SHA-1 of empty input.
        id=b"\xda9\xa3\xee^kK\r2U\xbf\xef\x95`\x18\x90\xaf\xd8\x07\t",
        name=b"blah",
        author=expected_author,
        date=None,
        message=message,
        metadata=None,
        synthetic=False,
        target=hash_to_bytes(target.decode()),
        target_type=ObjectType.REVISION,
    )

    # when
    actual_release = converters.dulwich_tag_to_release(tag)

    # then
    self.assertEqual(actual_release, expected_release)
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Fri, Jul 4, 12:26 PM (2 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3250907
Attached To
rDLDG Git loader
Event Timeline
Log In to Comment