From f9641d2865cbf5d4d809e6ee5ffbf1b1a4086ca2 Mon Sep 17 00:00:00 2001
From: Valentin Lorentz <vlorentz@softwareheritage.org>
Date: Mon, 8 Apr 2019 21:46:28 +0200
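Subject: [PATCH] Tune the model generation to work with the pgsql storage.

Constrain the hypothesis strategies so that generated objects satisfy
the model validators and can be stored in PostgreSQL:

- timestamps are bounded by datetime.min/datetime.max, and timezone
  offsets by +/- 14 hours (expressed in minutes);
- releases have either both a date and an author or neither, and
  always have a target;
- revision parents and directory are sha1_git values, and metadata is
  a text-to-text dict free of NUL characters and surrogates;
- contents carry all hashes and a data/reason pair consistent with
  their status (new 'reason' attribute, required iff status is
  'absent').

The model gains the ObjectType and RevisionType enums, plus validators
for timezone offsets, release author/date and content reason.
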
---
 swh/model/hypothesis_strategies.py            | 70 +++++++++++++------
 swh/model/model.py                            | 52 ++++++++++++--
 swh/model/tests/test_hypothesis_strategies.py |  7 +-
 3 files changed, 103 insertions(+), 26 deletions(-)

diff --git a/swh/model/hypothesis_strategies.py b/swh/model/hypothesis_strategies.py
index 3b046ca2..26d4a817 100644
--- a/swh/model/hypothesis_strategies.py
+++ b/swh/model/hypothesis_strategies.py
@@ -3,9 +3,11 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+import datetime
+
 from hypothesis.strategies import (
-    lists, one_of, composite, builds, integers, sampled_from, binary,
-    dictionaries, none, from_regex, just
+    binary, builds, characters, composite, dictionaries, from_regex,
+    integers, just, lists, none, one_of, sampled_from, text, tuples,
 )
 
 
@@ -22,6 +24,10 @@ def sha1_git():
     return binary(min_size=20, max_size=20)
 
 
+def sha1():
+    return binary(min_size=20, max_size=20)
+
+
 @composite
 def urls(draw):
     protocol = draw(sampled_from(['git', 'http', 'https', 'deb']))
@@ -35,9 +41,11 @@ def persons():
 
 
 def timestamps():
+    bounds = [d.replace(tzinfo=datetime.timezone.utc).timestamp()
+              for d in (datetime.datetime.min, datetime.datetime.max)]
     return builds(
         Timestamp,
-        seconds=integers(-2**63, 2**63-1),
+        seconds=integers(*bounds),  # computed in UTC, not local time
         microseconds=integers(0, 1000000))
 
 
@@ -45,7 +53,7 @@ def timestamps_with_timezone():
     return builds(
         TimestampWithTimezone,
         timestamp=timestamps(),
-        offset=integers(-2**16, 2**16-1))
+        offset=integers(min_value=-14*60, max_value=14*60))
 
 
 def origins():
@@ -62,13 +70,27 @@ def origin_visits():
         origin=origins())
 
 
-def releases():
-    return builds(
+@composite
+def releases(draw):
+    (date, author) = draw(one_of(
+        tuples(none(), none()),
+        tuples(timestamps_with_timezone(), persons())))
+    rel = draw(builds(
         Release,
         id=sha1_git(),
-        date=timestamps_with_timezone(),
-        author=one_of(none(), persons()),
-        target=one_of(none(), sha1_git()))
+        author=none(),
+        date=none(),
+        target=sha1_git()))
+    rel.date = date
+    rel.author = author
+    return rel
+
+
+def revision_metadata():
+    alphabet = characters(
+        blacklist_categories=('Cs', ),
+        blacklist_characters=['\u0000'])  # postgresql does not like these
+    return dictionaries(text(alphabet=alphabet), text(alphabet=alphabet))
 
 
 def revisions():
@@ -77,9 +99,10 @@ def revisions():
         id=sha1_git(),
         date=timestamps_with_timezone(),
         committer_date=timestamps_with_timezone(),
-        parents=lists(binary()),
-        directory=binary(),
-        metadata=one_of(none(), dictionaries(binary(), binary())))
+        parents=lists(sha1_git()),
+        directory=sha1_git(),
+        metadata=one_of(none(), revision_metadata()))
+    # TODO: metadata['extra_headers'] can have binary keys and values
 
 
 def directory_entries():
@@ -96,18 +119,25 @@ def directories():
         entries=lists(directory_entries()))
 
 
-def contents():
-    def filter_data(content):
-        if content.status != 'visible':
-            content.data = None
-        return content
+@composite
+def contents(draw):
+    (status, data, reason) = draw(one_of(
+        tuples(just('visible'), binary(), none()),
+        tuples(just('absent'), none(), text()),
+        tuples(just('hidden'), none(), none()),
+    ))
 
-    return builds(
+    return draw(builds(
         Content,
         length=integers(0),
-        data=binary(),
+        sha1=sha1(),
         sha1_git=sha1_git(),
-    ).map(filter_data)
+        sha256=binary(min_size=32, max_size=32),
+        blake2s256=binary(min_size=32, max_size=32),
+        status=just(status),
+        data=just(data),
+        reason=just(reason),
+    ))
 
 
 def branch_names():
diff --git a/swh/model/model.py b/swh/model/model.py
index 890d1330..036879de 100644
--- a/swh/model/model.py
+++ b/swh/model/model.py
@@ -48,6 +48,13 @@ class TimestampWithTimezone:
     def to_dict(self):
         return attr.asdict(self)
 
+    @offset.validator
+    def check_offset(self, attribute, value):
+        if not (-2**15 <= value < 2**15):
+            # max 14 hours offset in theory, but you never know what
+            # you'll find in the wild...
+            raise ValueError('offset too large: %d minutes' % value)
+
 
 @attr.s
 class Origin:
@@ -83,6 +90,14 @@ class TargetType(Enum):
     ALIAS = 'alias'
 
 
+class ObjectType(Enum):
+    CONTENT = 'content'
+    DIRECTORY = 'directory'
+    REVISION = 'revision'
+    RELEASE = 'release'
+    SNAPSHOT = 'snapshot'
+
+
 @attr.s
 class SnapshotBranch:
     target = attr.ib(type=bytes)
@@ -121,18 +136,31 @@ class Release:
     id = attr.ib(type=Sha1Git)
     name = attr.ib(type=bytes)
     message = attr.ib(type=bytes)
-    date = attr.ib(type=TimestampWithTimezone)
+    date = attr.ib(type=Optional[TimestampWithTimezone])
     author = attr.ib(type=Optional[Person])
     target = attr.ib(type=Optional[Sha1Git])
-    target_type = attr.ib(type=TargetType)
+    target_type = attr.ib(type=ObjectType)
     synthetic = attr.ib(type=bool)
 
     def to_dict(self):
         rel = attr.asdict(self)
-        rel['date'] = self.date.to_dict()
+        rel['date'] = self.date.to_dict() if self.date is not None else None
         rel['target_type'] = rel['target_type'].value
         return rel
 
+    @author.validator
+    def check_author(self, attribute, value):
+        if self.author is None and self.date is not None:
+            raise ValueError('release date must be None if author is None.')
+
+
+class RevisionType(Enum):
+    GIT = 'git'
+    TAR = 'tar'
+    DSC = 'dsc'
+    SUBVERSION = 'svn'
+    MERCURIAL = 'hg'
+
 
 @attr.s
 class Revision:
@@ -143,15 +171,16 @@ class Revision:
     date = attr.ib(type=TimestampWithTimezone)
     committer_date = attr.ib(type=TimestampWithTimezone)
     parents = attr.ib(type=List[Sha1Git])
-    type = attr.ib(type=str)
+    type = attr.ib(type=RevisionType)
     directory = attr.ib(type=Sha1Git)
-    metadata = attr.ib(type=Optional[dict])
+    metadata = attr.ib(type=Optional[Dict[str, object]])
     synthetic = attr.ib(type=bool)
 
     def to_dict(self):
         rev = attr.asdict(self)
         rev['date'] = self.date.to_dict()
         rev['committer_date'] = self.committer_date.to_dict()
+        rev['type'] = rev['type'].value
         return rev
 
 
@@ -191,6 +220,7 @@ class Content:
     status = attr.ib(
         type=str,
         validator=attr.validators.in_(['visible', 'absent', 'hidden']))
+    reason = attr.ib(type=Optional[str])
 
     @length.validator
     def check_length(self, attribute, value):
@@ -198,8 +228,20 @@ class Content:
         if value < 0:
             raise ValueError('Length must be positive.')
 
+    @reason.validator
+    def check_reason(self, attribute, value):
+        """Checks the reason is set iff status == 'absent'."""
+        assert self.reason == value
+        if self.status == 'absent' and value is None:
+            raise ValueError('Must provide a reason if content is absent.')
+        elif self.status != 'absent' and value is not None:
+            raise ValueError(
+                'Must not provide a reason if content is not absent.')
+
     def to_dict(self):
         content = attr.asdict(self)
         if content['data'] is None:
             del content['data']
+        if content['reason'] is None:
+            del content['reason']
         return content
diff --git a/swh/model/tests/test_hypothesis_strategies.py b/swh/model/tests/test_hypothesis_strategies.py
index 7e63d847..3e69ab9b 100644
--- a/swh/model/tests/test_hypothesis_strategies.py
+++ b/swh/model/tests/test_hypothesis_strategies.py
@@ -44,9 +44,14 @@ def test_dicts_generation(obj_type_and_obj):
         if object_['status'] == 'visible':
             assert set(object_) == \
                 set(DEFAULT_ALGORITHMS) | {'length', 'status', 'data'}
-        else:
+        elif object_['status'] == 'absent':
+            assert set(object_) == \
+                set(DEFAULT_ALGORITHMS) | {'length', 'status', 'reason'}
+        elif object_['status'] == 'hidden':
             assert set(object_) == \
                 set(DEFAULT_ALGORITHMS) | {'length', 'status'}
+        else:
+            assert False, object_
     elif obj_type == 'release':
         assert object_['target_type'] in target_types
     elif obj_type == 'snapshot':
-- 
GitLab