From a251df2e5b31a5d59d7e69e51a441bb22b1a7b0b Mon Sep 17 00:00:00 2001
From: Valentin Lorentz <vlorentz@softwareheritage.org>
Date: Tue, 29 Sep 2020 14:08:08 +0200
Subject: [PATCH] Add a 'unique_key' method on model objects

that returns a value suitable for uniqueness constraints.

Motivation:

* this is somewhat more of a model concern than a journal/Kafka
  concern IMO
* this is one step toward adding support for non-model objects in
  KafkaJournalWriter

Implementation of the unique_key methods comes from
`swh.journal.serializers.object_key`.
---
 swh/model/model.py            | 52 ++++++++++++++++++++++++++++++++---
 swh/model/tests/test_model.py | 30 ++++++++++++++++++++
 2 files changed, 78 insertions(+), 4 deletions(-)

diff --git a/swh/model/model.py b/swh/model/model.py
index ca3a8c70..e962dbad 100644
--- a/swh/model/model.py
+++ b/swh/model/model.py
@@ -35,6 +35,10 @@ class MissingData(Exception):
     pass
 
 
+KeyType = Union[Dict[str, str], Dict[str, bytes], bytes]
+"""The type returned by BaseModel.unique_key()."""
+
+
 SHA1_SIZE = 20
 
 # TODO: Limit this to 20 bytes
@@ -98,6 +102,11 @@ class BaseModel:
         """
         return None
 
+    def unique_key(self) -> KeyType:
+        """Return a key that uniquely identifies this object, usable for
+        deduplication; subclasses must override this method."""
+        raise NotImplementedError(f"unique_key for {self}")
+
 
 class HashableObject(metaclass=ABCMeta):
     """Mixin to automatically compute object identifier hash when
@@ -115,6 +124,9 @@ class HashableObject(metaclass=ABCMeta):
             obj_id = hash_to_bytes(self.compute_hash(self.to_dict()))
             object.__setattr__(self, "id", obj_id)
 
+    def unique_key(self) -> KeyType:
+        return self.id  # type: ignore
+
 
 @attr.s(frozen=True)
 class Person(BaseModel):
@@ -252,6 +264,9 @@ class Origin(BaseModel):
 
     url = attr.ib(type=str, validator=type_validator())
 
+    def unique_key(self) -> KeyType:
+        return {"url": self.url}
+
 
 @attr.s(frozen=True)
 class OriginVisit(BaseModel):
@@ -280,6 +295,9 @@ class OriginVisit(BaseModel):
             del ov["visit"]
         return ov
 
+    def unique_key(self) -> KeyType:
+        return {"origin": self.origin, "date": str(self.date)}
+
 
 @attr.s(frozen=True)
 class OriginVisitStatus(BaseModel):
@@ -311,6 +329,9 @@ class OriginVisitStatus(BaseModel):
         if value is not None and value.tzinfo is None:
             raise ValueError("date must be a timezone-aware datetime.")
 
+    def unique_key(self) -> KeyType:
+        return {"origin": self.origin, "visit": str(self.visit), "date": str(self.date)}
+
 
 class TargetType(Enum):
     """The type of content pointed to by a snapshot branch. Usually a
@@ -357,7 +378,7 @@ class SnapshotBranch(BaseModel):
 
 
 @attr.s(frozen=True)
-class Snapshot(BaseModel, HashableObject):
+class Snapshot(HashableObject, BaseModel):
     """Represents the full state of an origin at a given point in time."""
 
     object_type: Final = "snapshot"
@@ -386,7 +407,7 @@ class Snapshot(BaseModel, HashableObject):
 
 
 @attr.s(frozen=True)
-class Release(BaseModel, HashableObject):
+class Release(HashableObject, BaseModel):
     object_type: Final = "release"
 
     name = attr.ib(type=bytes, validator=type_validator())
@@ -453,7 +474,7 @@ def tuplify_extra_headers(value: Iterable):
 
 
 @attr.s(frozen=True)
-class Revision(BaseModel, HashableObject):
+class Revision(HashableObject, BaseModel):
     object_type: Final = "revision"
 
     message = attr.ib(type=Optional[bytes], validator=type_validator())
@@ -543,7 +564,7 @@ class DirectoryEntry(BaseModel):
 
 
 @attr.s(frozen=True)
-class Directory(BaseModel, HashableObject):
+class Directory(HashableObject, BaseModel):
     object_type: Final = "directory"
 
     entries = attr.ib(type=Tuple[DirectoryEntry, ...], validator=type_validator())
@@ -675,6 +696,9 @@ class Content(BaseContent):
             raise MissingData("Content data is None.")
         return self
 
+    def unique_key(self) -> KeyType:
+        return self.sha1  # TODO: use a dict of hashes
+
 
 @attr.s(frozen=True)
 class SkippedContent(BaseContent):
@@ -752,6 +776,9 @@ class SkippedContent(BaseContent):
             raise ValueError('SkippedContent has no "data" attribute %r' % d)
         return super().from_dict(d2, use_subclass=False)
 
+    def unique_key(self) -> KeyType:
+        return self.hashes()
+
 
 class MetadataAuthorityType(Enum):
     DEPOSIT_CLIENT = "deposit_client"
@@ -786,6 +813,9 @@ class MetadataAuthority(BaseModel):
         d["type"] = MetadataAuthorityType(d["type"])
         return super().from_dict(d)
 
+    def unique_key(self) -> KeyType:
+        return {"type": self.type.value, "url": self.url}
+
 
 @attr.s(frozen=True)
 class MetadataFetcher(BaseModel):
@@ -809,6 +839,9 @@ class MetadataFetcher(BaseModel):
             del d["metadata"]
         return d
 
+    def unique_key(self) -> KeyType:
+        return {"name": self.name, "version": self.version}
+
 
 class MetadataTargetType(Enum):
     """The type of object extrinsic metadata refer to."""
@@ -1024,3 +1057,14 @@ class RawExtrinsicMetadata(BaseModel):
                 d[swhid_key] = parse_swhid(d[swhid_key])
 
         return super().from_dict(d)
+
+    def unique_key(self) -> KeyType:
+        return {
+            "type": self.type.value,
+            "id": str(self.id),
+            "authority_type": self.authority.type.value,
+            "authority_url": self.authority.url,
+            "discovery_date": str(self.discovery_date),
+            "fetcher_name": self.fetcher.name,
+            "fetcher_version": self.fetcher.version,
+        }
diff --git a/swh/model/tests/test_model.py b/swh/model/tests/test_model.py
index fdd5e044..990a1707 100644
--- a/swh/model/tests/test_model.py
+++ b/swh/model/tests/test_model.py
@@ -71,6 +71,36 @@ def test_todict_inverse_fromdict(objtype_and_obj):
     assert obj_as_dict == type(obj).from_dict(obj_as_dict).to_dict()
 
 
+def test_unique_key():
+    url = "http://example.org/"
+    date = datetime.datetime.now(tz=datetime.timezone.utc)
+    id_ = b"42" * 10
+    assert Origin(url=url).unique_key() == {"url": url}
+    assert OriginVisit(origin=url, date=date, type="git").unique_key() == {
+        "origin": url,
+        "date": str(date),
+    }
+    assert OriginVisitStatus(
+        origin=url, visit=42, date=date, status="created", snapshot=None
+    ).unique_key() == {"origin": url, "visit": "42", "date": str(date),}
+    # HashableObject subclasses use their "id" attribute as the unique key.
+    assert Snapshot.from_dict({**snapshot_example, "id": id_}).unique_key() == id_
+    assert Release.from_dict({**release_example, "id": id_}).unique_key() == id_
+    assert Revision.from_dict({**revision_example, "id": id_}).unique_key() == id_
+    assert Directory.from_dict({**directory_example, "id": id_}).unique_key() == id_
+    # Content is keyed by its sha1 only; the expected hex is the sha1 of b"foo".
+    cont = Content.from_data(b"foo")
+    assert cont.unique_key().hex() == "0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33"
+    # SkippedContent is keyed by hashes(); build one from the Content's fields.
+    kwargs = {
+        **cont.to_dict(),
+        "reason": "foo",
+        "status": "absent",
+    }
+    del kwargs["data"]
+    assert SkippedContent(**kwargs).unique_key() == cont.hashes()
+
+
 # Anonymization
 
 
-- 
GitLab