From 29312dff6d96ac1c9bc18bf98de1d2e27a76c334 Mon Sep 17 00:00:00 2001 From: David Douard <david.douard@sdfa3.org> Date: Tue, 19 May 2020 16:04:30 +0200 Subject: [PATCH] Add support for model object anonymization Simply add a BaseModel.anonymize() method. Default implementation returns None, meaning the object is not anonymizable. For Person, the method returns a Person with hashed fullname (and unset name and email). For Revision and Release, the method returns an anonymized version of the object, i.e. with instances of Person replaced by anonymized ones. --- swh/model/hypothesis_strategies.py | 11 ++++-- swh/model/model.py | 39 ++++++++++++++++++- swh/model/tests/test_hypothesis_strategies.py | 8 ++++ swh/model/tests/test_model.py | 31 +++++++++++++++ 4 files changed, 84 insertions(+), 5 deletions(-) diff --git a/swh/model/hypothesis_strategies.py b/swh/model/hypothesis_strategies.py index 448487a8..bc9d58c4 100644 --- a/swh/model/hypothesis_strategies.py +++ b/swh/model/hypothesis_strategies.py @@ -93,10 +93,13 @@ def urls(draw): return "%s://%s" % (protocol, domain) -def persons_d(): - return builds( - dict, fullname=binary(), email=optional(binary()), name=optional(binary()), - ) +@composite +def persons_d(draw): + fullname = draw(binary()) + email = draw(optional(binary())) + name = draw(optional(binary())) + assume(not (len(fullname) == 32 and email is None and name is None)) + return dict(fullname=fullname, name=name, email=email) def persons(): diff --git a/swh/model/model.py b/swh/model/model.py index 74702cf4..7db255cc 100644 --- a/swh/model/model.py +++ b/swh/model/model.py @@ -7,7 +7,8 @@ import datetime from abc import ABCMeta, abstractmethod from enum import Enum -from typing import Dict, List, Optional, Union +from hashlib import sha256 +from typing import Dict, List, Optional, TypeVar, Union import attr from attrs_strict import type_validator @@ -51,6 +52,9 @@ def dictify(value): return value +ModelType = TypeVar("ModelType", bound="BaseModel") + + 
class BaseModel: """Base class for SWH model classes. @@ -68,6 +72,13 @@ class BaseModel: recursively builds the corresponding objects.""" return cls(**d) + def anonymize(self: ModelType) -> Optional[ModelType]: + """Returns an anonymized version of the object, if needed. + + If the object model does not need/support anonymization, returns None. + """ + return None + class HashableObject(metaclass=ABCMeta): """Mixin to automatically compute object identifier hash when @@ -129,6 +140,14 @@ class Person(BaseModel): return Person(name=name or None, email=email or None, fullname=fullname,) + def anonymize(self) -> "Person": + """Returns an anonymized version of the Person object. + + Anonymization is simply a Person which fullname is the hashed, with unset name + or email. + """ + return Person(fullname=sha256(self.fullname).digest(), name=None, email=None,) + @attr.s(frozen=True) class Timestamp(BaseModel): @@ -369,6 +388,14 @@ class Release(BaseModel, HashableObject): d["date"] = TimestampWithTimezone.from_dict(d["date"]) return cls(target_type=ObjectType(d.pop("target_type")), **d) + def anonymize(self) -> "Release": + """Returns an anonymized version of the Release object. + + Anonymization consists in replacing the author with an anonymized Person object. + """ + author = self.author and self.author.anonymize() + return attr.evolve(self, author=author) + class RevisionType(Enum): GIT = "git" @@ -422,6 +449,16 @@ class Revision(BaseModel, HashableObject): **d, ) + def anonymize(self) -> "Revision": + """Returns an anonymized version of the Revision object. + + Anonymization consists in replacing the author and committer with an anonymized + Person object. 
+ """ + return attr.evolve( + self, author=self.author.anonymize(), committer=self.committer.anonymize() + ) + @attr.s(frozen=True) class DirectoryEntry(BaseModel): diff --git a/swh/model/tests/test_hypothesis_strategies.py b/swh/model/tests/test_hypothesis_strategies.py index 1622b3c7..2be35a05 100644 --- a/swh/model/tests/test_hypothesis_strategies.py +++ b/swh/model/tests/test_hypothesis_strategies.py @@ -18,6 +18,7 @@ from swh.model.hypothesis_strategies import ( skipped_contents, snapshots, origin_visits, + persons, ) from swh.model.model import TargetType @@ -196,3 +197,10 @@ def test_snapshots_strategy_fixed_size(snapshot): @given(origin_visits()) def test_origin_visit_aware_datetime(visit): assert visit.date.tzinfo is not None + + +@given(persons()) +def test_person_do_not_look_like_anonimized(person): + assert not ( + len(person.fullname) == 32 and person.name is None and person.email is None + ) diff --git a/swh/model/tests/test_model.py b/swh/model/tests/test_model.py index 6027cc27..e126ca57 100644 --- a/swh/model/tests/test_model.py +++ b/swh/model/tests/test_model.py @@ -61,6 +61,37 @@ def test_todict_inverse_fromdict(objtype_and_obj): assert obj_as_dict == type(obj).from_dict(obj_as_dict).to_dict() +# Anonymization + + +@given(strategies.objects()) +def test_anonymization(objtype_and_obj): + (obj_type, obj) = objtype_and_obj + + def check_person(p): + if p is not None: + assert p.name is None + assert p.email is None + assert len(p.fullname) == 32 + + anon_obj = obj.anonymize() + if obj_type == "person": + assert anon_obj is not None + check_person(anon_obj) + elif obj_type == "release": + assert anon_obj is not None + check_person(anon_obj.author) + elif obj_type == "revision": + assert anon_obj is not None + check_person(anon_obj.author) + check_person(anon_obj.committer) + else: + assert anon_obj is None + + +# Origin, OriginVisit + + @given(strategies.origins()) def test_todict_origins(origin): obj = origin.to_dict() -- GitLab