diff --git a/swh/model/hypothesis_strategies.py b/swh/model/hypothesis_strategies.py --- a/swh/model/hypothesis_strategies.py +++ b/swh/model/hypothesis_strategies.py @@ -93,10 +93,13 @@ return "%s://%s" % (protocol, domain) -def persons_d(): - return builds( - dict, fullname=binary(), email=optional(binary()), name=optional(binary()), - ) +@composite +def persons_d(draw): + fullname = draw(binary()) + email = draw(optional(binary())) + name = draw(optional(binary())) + assume(not (len(fullname) == 32 and email is None and name is None)) + return dict(fullname=fullname, name=name, email=email) def persons(): diff --git a/swh/model/model.py b/swh/model/model.py --- a/swh/model/model.py +++ b/swh/model/model.py @@ -7,6 +7,7 @@ from abc import ABCMeta, abstractmethod from enum import Enum +from hashlib import sha256 from typing import Dict, List, Optional, Union import attr @@ -68,6 +69,13 @@ recursively builds the corresponding objects.""" return cls(**d) + def anonymize(self) -> Optional["BaseModel"]: + """Returns an anonymized version of the object, if needed. + + If the object model does not need/support anonymization, returns None. + """ + return None + class HashableObject(metaclass=ABCMeta): """Mixin to automatically compute object identifier hash when @@ -129,6 +137,14 @@ return Person(name=name or None, email=email or None, fullname=fullname,) + def anonymize(self) -> "Person": + """Returns an anonymized version of the Person object. + + Anonymization is simply a Person whose fullname is hashed (SHA-256), with name + and email unset. + """ + return Person(fullname=sha256(self.fullname).digest(), name=None, email=None,) + @attr.s(frozen=True) class Timestamp(BaseModel): @@ -369,6 +385,14 @@ d["date"] = TimestampWithTimezone.from_dict(d["date"]) return cls(target_type=ObjectType(d.pop("target_type")), **d) + def anonymize(self) -> "Release": + """Returns an anonymized version of the Release object. 
+ + Anonymization consists in replacing the author with an anonymized Person object. + """ + author = self.author and self.author.anonymize() + return attr.evolve(self, author=author) + class RevisionType(Enum): GIT = "git" @@ -422,6 +446,16 @@ **d, ) + def anonymize(self) -> "Revision": + """Returns an anonymized version of the Revision object. + + Anonymization consists in replacing the author and committer with an anonymized + Person object. + """ + return attr.evolve( + self, author=self.author.anonymize(), committer=self.committer.anonymize() + ) + @attr.s(frozen=True) class DirectoryEntry(BaseModel): diff --git a/swh/model/tests/test_hypothesis_strategies.py b/swh/model/tests/test_hypothesis_strategies.py --- a/swh/model/tests/test_hypothesis_strategies.py +++ b/swh/model/tests/test_hypothesis_strategies.py @@ -18,6 +18,7 @@ skipped_contents, snapshots, origin_visits, + persons, ) from swh.model.model import TargetType @@ -196,3 +197,10 @@ @given(origin_visits()) def test_origin_visit_aware_datetime(visit): assert visit.date.tzinfo is not None + + +@given(persons()) +def test_person_do_not_look_like_anonimized(person): + assert not ( + len(person.fullname) == 32 and person.name is None and person.email is None + ) diff --git a/swh/model/tests/test_model.py b/swh/model/tests/test_model.py --- a/swh/model/tests/test_model.py +++ b/swh/model/tests/test_model.py @@ -61,6 +61,37 @@ assert obj_as_dict == type(obj).from_dict(obj_as_dict).to_dict() +# Anonymization + + +@given(strategies.objects()) +def test_anonymization(objtype_and_obj): + (obj_type, obj) = objtype_and_obj + + def check_person(p): + if p is not None: + assert p.name is None + assert p.email is None + assert len(p.fullname) == 32 + + anon_obj = obj.anonymize() + if obj_type == "person": + assert anon_obj is not None + check_person(anon_obj) + elif obj_type == "release": + assert anon_obj is not None + check_person(anon_obj.author) + elif obj_type == "revision": + assert anon_obj is not None + 
check_person(anon_obj.author) + check_person(anon_obj.committer) + else: + assert anon_obj is None + + +# Origin, OriginVisit + + @given(strategies.origins()) def test_todict_origins(origin): obj = origin.to_dict()