Page MenuHomeSoftware Heritage

D3171.id11270.diff
No OneTemporary

D3171.id11270.diff

diff --git a/swh/model/hypothesis_strategies.py b/swh/model/hypothesis_strategies.py
--- a/swh/model/hypothesis_strategies.py
+++ b/swh/model/hypothesis_strategies.py
@@ -93,10 +93,13 @@
return "%s://%s" % (protocol, domain)
-def persons_d():
- return builds(
- dict, fullname=binary(), email=optional(binary()), name=optional(binary()),
- )
+@composite
+def persons_d(draw):
+ fullname = draw(binary())
+ email = draw(optional(binary()))
+ name = draw(optional(binary()))
+ assume(not (len(fullname) == 32 and email is None and name is None))
+ return dict(fullname=fullname, name=name, email=email)
def persons():
diff --git a/swh/model/model.py b/swh/model/model.py
--- a/swh/model/model.py
+++ b/swh/model/model.py
@@ -7,7 +7,8 @@
from abc import ABCMeta, abstractmethod
from enum import Enum
-from typing import Dict, List, Optional, Union
+from hashlib import sha256
+from typing import Dict, List, Optional, TypeVar, Union
import attr
from attrs_strict import type_validator
@@ -51,6 +52,9 @@
return value
+ModelType = TypeVar("ModelType", bound="BaseModel")
+
+
class BaseModel:
"""Base class for SWH model classes.
@@ -68,6 +72,13 @@
recursively builds the corresponding objects."""
return cls(**d)
+ def anonymize(self: ModelType) -> Optional[ModelType]:
+ """Returns an anonymized version of the object, if needed.
+
+ If the object model does not need/support anonymization, returns None.
+ """
+ return None
+
class HashableObject(metaclass=ABCMeta):
"""Mixin to automatically compute object identifier hash when
@@ -129,6 +140,14 @@
return Person(name=name or None, email=email or None, fullname=fullname,)
+ def anonymize(self) -> "Person":
+ """Returns an anonymized version of the Person object.
+
+ Anonymization yields a Person whose fullname is the SHA-256 digest of the
+ original fullname, with name and email both unset.
+ """
+ return Person(fullname=sha256(self.fullname).digest(), name=None, email=None,)
+
@attr.s(frozen=True)
class Timestamp(BaseModel):
@@ -369,6 +388,14 @@
d["date"] = TimestampWithTimezone.from_dict(d["date"])
return cls(target_type=ObjectType(d.pop("target_type")), **d)
+ def anonymize(self) -> "Release":
+ """Returns an anonymized version of the Release object.
+
+ Anonymization consists in replacing the author with an anonymized Person object.
+ """
+ author = self.author and self.author.anonymize()
+ return attr.evolve(self, author=author)
+
class RevisionType(Enum):
GIT = "git"
@@ -422,6 +449,16 @@
**d,
)
+ def anonymize(self) -> "Revision":
+ """Returns an anonymized version of the Revision object.
+
+ Anonymization consists in replacing the author and committer with an anonymized
+ Person object.
+ """
+ return attr.evolve(
+ self, author=self.author.anonymize(), committer=self.committer.anonymize()
+ )
+
@attr.s(frozen=True)
class DirectoryEntry(BaseModel):
diff --git a/swh/model/tests/test_hypothesis_strategies.py b/swh/model/tests/test_hypothesis_strategies.py
--- a/swh/model/tests/test_hypothesis_strategies.py
+++ b/swh/model/tests/test_hypothesis_strategies.py
@@ -18,6 +18,7 @@
skipped_contents,
snapshots,
origin_visits,
+ persons,
)
from swh.model.model import TargetType
@@ -196,3 +197,10 @@
@given(origin_visits())
def test_origin_visit_aware_datetime(visit):
assert visit.date.tzinfo is not None
+
+
+@given(persons())
+def test_person_do_not_look_like_anonimized(person):
+ assert not (
+ len(person.fullname) == 32 and person.name is None and person.email is None
+ )
diff --git a/swh/model/tests/test_model.py b/swh/model/tests/test_model.py
--- a/swh/model/tests/test_model.py
+++ b/swh/model/tests/test_model.py
@@ -61,6 +61,37 @@
assert obj_as_dict == type(obj).from_dict(obj_as_dict).to_dict()
+# Anonymization
+
+
+@given(strategies.objects())
+def test_anonymization(objtype_and_obj):
+ (obj_type, obj) = objtype_and_obj
+
+ def check_person(p):
+ if p is not None:
+ assert p.name is None
+ assert p.email is None
+ assert len(p.fullname) == 32
+
+ anon_obj = obj.anonymize()
+ if obj_type == "person":
+ assert anon_obj is not None
+ check_person(anon_obj)
+ elif obj_type == "release":
+ assert anon_obj is not None
+ check_person(anon_obj.author)
+ elif obj_type == "revision":
+ assert anon_obj is not None
+ check_person(anon_obj.author)
+ check_person(anon_obj.committer)
+ else:
+ assert anon_obj is None
+
+
+# Origin, OriginVisit
+
+
@given(strategies.origins())
def test_todict_origins(origin):
obj = origin.to_dict()

File Metadata

Mime Type
text/plain
Expires
Thu, Jan 30, 2:51 PM (7 h, 41 m ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3215668

Event Timeline