diff --git a/swh/model/model.py b/swh/model/model.py --- a/swh/model/model.py +++ b/swh/model/model.py @@ -7,6 +7,7 @@ from abc import ABCMeta, abstractmethod from enum import Enum +from hashlib import sha256 from typing import Dict, List, Optional, Union import attr @@ -68,6 +69,13 @@ recursively builds the corresponding objects.""" return cls(**d) + def anonymize(self) -> Optional["BaseModel"]: + """Returns an anonymized version of the object, if needed. + + If the object model does not need/support anonymization, returns None. + """ + return None + class HashableObject(metaclass=ABCMeta): """Mixin to automatically compute object identifier hash when @@ -129,6 +137,15 @@ return Person(name=name or None, email=email or None, fullname=fullname,) + def anonymize(self) -> "Person": + """Returns an anonymized version of the Person object. + + Anonymization is simply a Person whose fullname is the hash of + fullname+name+email. + """ + tohash = self.fullname + (self.name or b"") + (self.email or b"") + return Person(fullname=sha256(tohash).digest(), name=None, email=None,) + @attr.s(frozen=True) class Timestamp(BaseModel): @@ -369,6 +386,14 @@ d["date"] = TimestampWithTimezone.from_dict(d["date"]) return cls(target_type=ObjectType(d.pop("target_type")), **d) + def anonymize(self) -> "Release": + """Returns an anonymized version of the Release object. + + Anonymization consists in replacing the author with an anonymized Person object. + """ + author = self.author and self.author.anonymize() + return attr.evolve(self, author=author) + class RevisionType(Enum): GIT = "git" @@ -422,6 +447,16 @@ **d, ) + def anonymize(self) -> "Revision": + """Returns an anonymized version of the Revision object. + + Anonymization consists in replacing the author and committer with an anonymized + Person object.
+ """ + return attr.evolve( + self, author=self.author.anonymize(), committer=self.committer.anonymize() + ) + @attr.s(frozen=True) class DirectoryEntry(BaseModel): diff --git a/swh/model/tests/test_model.py b/swh/model/tests/test_model.py --- a/swh/model/tests/test_model.py +++ b/swh/model/tests/test_model.py @@ -61,6 +61,37 @@ assert obj_as_dict == type(obj).from_dict(obj_as_dict).to_dict() +# Anonymization + + +@given(strategies.objects()) +def test_anonymization(objtype_and_obj): + (obj_type, obj) = objtype_and_obj + + def check_person(p): + if p is not None: + assert p.name is None + assert p.email is None + assert len(p.fullname) == 32 + + anon_obj = obj.anonymize() + if obj_type == "person": + assert anon_obj is not None + check_person(anon_obj) + elif obj_type == "release": + assert anon_obj is not None + check_person(anon_obj.author) + elif obj_type == "revision": + assert anon_obj is not None + check_person(anon_obj.author) + check_person(anon_obj.committer) + else: + assert anon_obj is None + + +# Origin, OriginVisit + + @given(strategies.origins()) def test_todict_origins(origin): obj = origin.to_dict()