Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7123760
D6281.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
10 KB
Subscribers
None
D6281.diff
View Options
diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,5 +1,5 @@
swh.core >= 0.0.7
swh.loader.core >= 0.18.0
-swh.model >= 0.4.0
+swh.model >= 2.9.0
swh.scheduler >= 0.0.39
swh.storage >= 0.22.0
diff --git a/swh/loader/git/converters.py b/swh/loader/git/converters.py
--- a/swh/loader/git/converters.py
+++ b/swh/loader/git/converters.py
@@ -15,6 +15,7 @@
Content,
Directory,
DirectoryEntry,
+ HashableObject,
ObjectType,
Person,
Release,
@@ -26,7 +27,18 @@
TimestampWithTimezone,
)
-HASH_ALGORITHMS = DEFAULT_ALGORITHMS - {"sha1_git"}
+
+class HashMismatch(Exception):
+ pass
+
+
+def check_id(obj: HashableObject) -> None:
+ real_id = obj.compute_hash()
+ if obj.id != real_id:
+ raise HashMismatch(
+ f"Expected {type(obj).__name__} hash to be {obj.id.hex()}, "
+ f"got {real_id.hex()}"
+ )
def dulwich_blob_to_content_id(obj: ShaFile) -> Dict[str, Any]:
@@ -37,8 +49,12 @@
size = blob.raw_length()
data = blob.as_raw_string()
- hashes = MultiHash.from_data(data, HASH_ALGORITHMS).digest()
- hashes["sha1_git"] = blob.sha().digest()
+ hashes = MultiHash.from_data(data, DEFAULT_ALGORITHMS).digest()
+ if hashes["sha1_git"] != blob.sha().digest():
+ raise HashMismatch(
+ f"Expected Content hash to be {blob.sha().digest().hex()}, "
+ f"got {hashes['sha1_git'].hex()}"
+ )
hashes["length"] = size
return hashes
@@ -84,7 +100,9 @@
)
)
- return Directory(id=tree.sha().digest(), entries=tuple(entries),)
+ dir_ = Directory(id=tree.sha().digest(), entries=tuple(entries),)
+ check_id(dir_)
+ return dir_
def parse_author(name_email: bytes) -> Person:
@@ -124,7 +142,7 @@
if commit.gpgsig:
extra_headers.append((b"gpgsig", commit.gpgsig))
- return Revision(
+ rev = Revision(
id=commit.sha().digest(),
author=parse_author(commit.author),
date=dulwich_tsinfo_to_timestamp(
@@ -142,6 +160,8 @@
synthetic=False,
parents=tuple(bytes.fromhex(p.decode()) for p in commit.parents),
)
+ check_id(rev)
+ return rev
DULWICH_TARGET_TYPES = {
@@ -181,7 +201,7 @@
if tag.signature:
message += tag.signature
- return Release(
+ rel = Release(
id=tag.sha().digest(),
author=author,
date=date,
@@ -192,3 +212,5 @@
metadata=None,
synthetic=False,
)
+ check_id(rel)
+ return rel
diff --git a/swh/loader/git/tests/test_converters.py b/swh/loader/git/tests/test_converters.py
--- a/swh/loader/git/tests/test_converters.py
+++ b/swh/loader/git/tests/test_converters.py
@@ -3,6 +3,7 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+import copy
import os
import shutil
import subprocess
@@ -12,7 +13,7 @@
import pytest
import swh.loader.git.converters as converters
-from swh.model.hashutil import bytehex_to_hash, hash_to_bytes
+from swh.model.hashutil import bytehex_to_hash, hash_to_bytehex, hash_to_bytes
from swh.model.model import (
Content,
ObjectType,
@@ -89,6 +90,7 @@
def __init__(
self,
+ sha,
name,
type_name,
target,
@@ -99,6 +101,7 @@
message,
signature,
):
+ self._sha = sha
self.name = name
self.type_name = type_name
self.object = SWHObjectType(target_type), target
@@ -110,9 +113,11 @@
self._tag_timezone_neg_utc = False
def sha(self):
- from hashlib import sha1
+ class hasher:
+ def digest():
+ return self._sha
- return sha1()
+ return hasher
@pytest.mark.fs
@@ -161,13 +166,34 @@
)
assert content == expected_content
+ def test_corrupt_blob(self, mocker):
+ # id of a blob looked up in the test repository
+ sha1 = hash_to_bytes("28c6f4023d65f74e3b59a2dea3c4277ed9ee07b0")
+
+ blob = copy.deepcopy(self.repo[hash_to_bytehex(sha1)])
+
+ class hasher:
+ def digest():
+ return sha1
+
+ blob._sha = hasher
+
+ converters.dulwich_blob_to_content(blob)
+ converters.dulwich_blob_to_content_id(blob)
+
+ sha1 = hash_to_bytes("1234" * 10)
+
+ with pytest.raises(converters.HashMismatch):
+ converters.dulwich_blob_to_content(blob)
+ with pytest.raises(converters.HashMismatch):
+ converters.dulwich_blob_to_content_id(blob)
+
def test_convertion_wrong_input(self):
class Something:
type_name = b"something-not-the-right-type"
m = {
"blob": converters.dulwich_blob_to_content,
- "blob2": converters.dulwich_blob_to_content_id,
"tree": converters.dulwich_tree_to_directory,
"commit": converters.dulwich_tree_to_directory,
"tag": converters.dulwich_tag_to_release,
@@ -177,6 +203,17 @@
with pytest.raises(ValueError):
_callable(Something())
+ def test_corrupt_tree(self):
+ # id of a tree looked up in the test repository
+ sha1 = b"f0695c2e2fa7ce9d574023c3413761a473e500ca"
+ tree = copy.deepcopy(self.repo[sha1])
+ converters.dulwich_tree_to_directory(tree)
+
+ del tree._entries[next(iter(tree._entries))]
+
+ with pytest.raises(converters.HashMismatch):
+ converters.dulwich_tree_to_directory(tree)
+
def test_commit_to_revision(self):
sha1 = b"9768d0b576dbaaecd80abedad6dfd0d72f1476da"
@@ -251,6 +288,23 @@
assert revision == expected_revision
+ @pytest.mark.parametrize(
+ "attribute", ["_message", "_encoding", "_author", "_gpgsig"]
+ )
+ def test_corrupt_commit(self, attribute):
+ # has a signature
+ sha1 = b"322f5bc915e50fc25e85226b5a182bded0e98e4b"
+ commit = copy.deepcopy(self.repo[sha1])
+ converters.dulwich_commit_to_revision(commit)
+ setattr(commit, attribute, b"abcde")
+ with pytest.raises(converters.HashMismatch):
+ converters.dulwich_commit_to_revision(commit)
+
+ if attribute == "_gpgsig":
+ setattr(commit, attribute, None)
+ with pytest.raises(converters.HashMismatch):
+ converters.dulwich_commit_to_revision(commit)
+
def test_commit_to_revision_with_extra_headers_mergetag(self):
sha1 = b"3ab3da4bf0f81407be16969df09cd1c8af9ac703"
@@ -319,9 +373,11 @@
assert parsed_author == converters.parse_author(author)
def test_dulwich_tag_to_release_no_author_no_date(self):
- target = b"641fb6e08ddb2e4fd096dcf18e80b894bf"
+ sha = hash_to_bytes("f6e367357b446bd1315276de5e88ba3d0d99e136")
+ target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"
message = b"some release message"
tag = SWHTag(
+ sha=sha,
name=b"blah",
type_name=b"tag",
target=target,
@@ -340,7 +396,7 @@
expected_release = Release(
author=None,
date=None,
- id=b"\xda9\xa3\xee^kK\r2U\xbf\xef\x95`\x18\x90\xaf\xd8\x07\t",
+ id=sha,
message=message,
metadata=None,
name=b"blah",
@@ -352,8 +408,9 @@
assert actual_release == expected_release
def test_dulwich_tag_to_release_author_and_date(self):
+ sha = hash_to_bytes("fc1e6a4f1e37e93e28e78560e73efd0b12f616ef")
tagger = b"hey dude <hello@mail.org>"
- target = b"641fb6e08ddb2e4fd096dcf18e80b894bf"
+ target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"
message = b"some release message"
import datetime
@@ -361,6 +418,7 @@
date = datetime.datetime(2007, 12, 5, tzinfo=datetime.timezone.utc).timestamp()
tag = SWHTag(
+ sha=sha,
name=b"blah",
type_name=b"tag",
target=target,
@@ -387,7 +445,7 @@
offset=0,
timestamp=Timestamp(seconds=1196812800, microseconds=0,),
),
- id=b"\xda9\xa3\xee^kK\r2U\xbf\xef\x95`\x18\x90\xaf\xd8\x07\t",
+ id=sha,
message=message,
metadata=None,
name=b"blah",
@@ -400,10 +458,12 @@
def test_dulwich_tag_to_release_author_no_date(self):
# to reproduce bug T815 (fixed)
+ sha = hash_to_bytes("41076e970975122dc6b2a878aa9797960bc4781d")
tagger = b"hey dude <hello@mail.org>"
- target = b"641fb6e08ddb2e4fd096dcf18e80b894bf"
+ target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"
message = b"some release message"
tag = SWHTag(
+ sha=sha,
name=b"blah",
type_name=b"tag",
target=target,
@@ -426,7 +486,7 @@
name=b"hey dude",
),
date=None,
- id=b"\xda9\xa3\xee^kK\r2U\xbf\xef\x95`\x18\x90\xaf\xd8\x07\t",
+ id=sha,
message=message,
metadata=None,
name=b"blah",
@@ -438,9 +498,11 @@
assert actual_release == expected_release
def test_dulwich_tag_to_release_signature(self):
- target = b"641fb6e08ddb2e4fd096dcf18e80b894bf"
+ target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"
message = b"some release message"
+ sha = hash_to_bytes("46fff489610ed733d2cc904e363070dadee05c71")
tag = SWHTag(
+ sha=sha,
name=b"blah",
type_name=b"tag",
target=target,
@@ -459,7 +521,7 @@
expected_release = Release(
author=None,
date=None,
- id=b"\xda9\xa3\xee^kK\r2U\xbf\xef\x95`\x18\x90\xaf\xd8\x07\t",
+ id=sha,
message=message + GPGSIG,
metadata=None,
name=b"blah",
@@ -469,3 +531,32 @@
)
assert actual_release == expected_release
+
+ @pytest.mark.parametrize("attribute", ["name", "message", "signature"])
+ def test_corrupt_tag(self, attribute):
+ # has a signature
+ sha = hash_to_bytes("46fff489610ed733d2cc904e363070dadee05c71")
+ target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"
+ message = b"some release message"
+ tag = SWHTag(
+ sha=sha,
+ name=b"blah",
+ type_name=b"tag",
+ target=target,
+ target_type=b"commit",
+ message=message,
+ signature=GPGSIG,
+ tagger=None,
+ tag_time=None,
+ tag_timezone=None,
+ )
+ converters.dulwich_tag_to_release(tag)
+
+ setattr(tag, attribute, b"abcde")
+ with pytest.raises(converters.HashMismatch):
+ converters.dulwich_tag_to_release(tag)
+
+ if attribute == "signature":
+ setattr(tag, attribute, None)
+ with pytest.raises(converters.HashMismatch):
+ converters.dulwich_tag_to_release(tag)
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Dec 20 2024, 12:05 AM (11 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3216720
Attached To
D6281: converters: Recompute hashes and check they match the originals
Event Timeline
Log In to Comment