Compare revisions
Showing with 6843 additions and 1730 deletions
File added
@@ -18,12 +18,12 @@ class ValidateCompound(unittest.TestCase):
def validate_never(model):
return False
self.test_model = 'test model'
self.test_model = "test model"
self.test_schema = {
'int': (True, simple.validate_int),
'str': (True, simple.validate_str),
'str2': (True, simple.validate_str),
'datetime': (False, simple.validate_datetime),
"int": (True, simple.validate_int),
"str": (True, simple.validate_str),
"str2": (True, simple.validate_str),
"datetime": (False, simple.validate_datetime),
NON_FIELD_ERRORS: validate_always,
}
@@ -31,43 +31,48 @@ class ValidateCompound(unittest.TestCase):
self.test_schema_shortcut[NON_FIELD_ERRORS] = validate_never
self.test_schema_field_failed = self.test_schema.copy()
self.test_schema_field_failed['int'] = (True, [simple.validate_int,
validate_never])
self.test_schema_field_failed["int"] = (
True,
[simple.validate_int, validate_never],
)
self.test_value = {
'str': 'value1',
'str2': 'value2',
'int': 42,
'datetime': datetime.datetime(1990, 1, 1, 12, 0, 0,
tzinfo=datetime.timezone.utc),
"str": "value1",
"str2": "value2",
"int": 42,
"datetime": datetime.datetime(
1990, 1, 1, 12, 0, 0, tzinfo=datetime.timezone.utc
),
}
self.test_value_missing = {
'str': 'value1',
"str": "value1",
}
self.test_value_str_error = {
'str': 1984,
'str2': 'value2',
'int': 42,
'datetime': datetime.datetime(1990, 1, 1, 12, 0, 0,
tzinfo=datetime.timezone.utc),
"str": 1984,
"str2": "value2",
"int": 42,
"datetime": datetime.datetime(
1990, 1, 1, 12, 0, 0, tzinfo=datetime.timezone.utc
),
}
self.test_value_missing_keys = {'int'}
self.test_value_missing_keys = {"int"}
self.test_value_wrong_type = 42
self.present_keys = set(self.test_value)
self.missing_keys = {'missingkey1', 'missingkey2'}
self.missing_keys = {"missingkey1", "missingkey2"}
def test_validate_any_key(self):
self.assertTrue(
compound.validate_any_key(self.test_value, self.present_keys))
self.assertTrue(compound.validate_any_key(self.test_value, self.present_keys))
self.assertTrue(
compound.validate_any_key(self.test_value,
self.present_keys | self.missing_keys))
compound.validate_any_key(
self.test_value, self.present_keys | self.missing_keys
)
)
def test_validate_any_key_missing(self):
with self.assertRaises(ValidationError) as cm:
@@ -75,13 +80,13 @@ class ValidateCompound(unittest.TestCase):
exc = cm.exception
self.assertIsInstance(str(exc), str)
self.assertEqual(exc.code, 'missing-alternative-field')
self.assertEqual(exc.params['missing_fields'],
', '.join(sorted(self.missing_keys)))
self.assertEqual(exc.code, "missing-alternative-field")
self.assertEqual(
exc.params["missing_fields"], ", ".join(sorted(self.missing_keys))
)
def test_validate_all_keys(self):
self.assertTrue(
compound.validate_all_keys(self.test_value, self.present_keys))
self.assertTrue(compound.validate_all_keys(self.test_value, self.present_keys))
def test_validate_all_keys_missing(self):
with self.assertRaises(ValidationError) as cm:
@@ -89,41 +94,49 @@ class ValidateCompound(unittest.TestCase):
exc = cm.exception
self.assertIsInstance(str(exc), str)
self.assertEqual(exc.code, 'missing-mandatory-field')
self.assertEqual(exc.params['missing_fields'],
', '.join(sorted(self.missing_keys)))
self.assertEqual(exc.code, "missing-mandatory-field")
self.assertEqual(
exc.params["missing_fields"], ", ".join(sorted(self.missing_keys))
)
with self.assertRaises(ValidationError) as cm:
compound.validate_all_keys(self.test_value,
self.present_keys | self.missing_keys)
compound.validate_all_keys(
self.test_value, self.present_keys | self.missing_keys
)
exc = cm.exception
self.assertIsInstance(str(exc), str)
self.assertEqual(exc.code, 'missing-mandatory-field')
self.assertEqual(exc.params['missing_fields'],
', '.join(sorted(self.missing_keys)))
self.assertEqual(exc.code, "missing-mandatory-field")
self.assertEqual(
exc.params["missing_fields"], ", ".join(sorted(self.missing_keys))
)
def test_validate_against_schema(self):
self.assertTrue(
compound.validate_against_schema(self.test_model, self.test_schema,
self.test_value))
compound.validate_against_schema(
self.test_model, self.test_schema, self.test_value
)
)
def test_validate_against_schema_wrong_type(self):
with self.assertRaises(ValidationError) as cm:
compound.validate_against_schema(self.test_model, self.test_schema,
self.test_value_wrong_type)
compound.validate_against_schema(
self.test_model, self.test_schema, self.test_value_wrong_type
)
exc = cm.exception
self.assertIsInstance(str(exc), str)
self.assertEqual(exc.code, 'model-unexpected-type')
self.assertEqual(exc.params['model'], self.test_model)
self.assertEqual(exc.params['type'],
self.test_value_wrong_type.__class__.__name__)
self.assertEqual(exc.code, "model-unexpected-type")
self.assertEqual(exc.params["model"], self.test_model)
self.assertEqual(
exc.params["type"], self.test_value_wrong_type.__class__.__name__
)
def test_validate_against_schema_mandatory_keys(self):
with self.assertRaises(ValidationError) as cm:
compound.validate_against_schema(self.test_model, self.test_schema,
self.test_value_missing)
compound.validate_against_schema(
self.test_model, self.test_schema, self.test_value_missing
)
# The exception should be of the form:
# ValidationError({
@@ -138,8 +151,8 @@ class ValidateCompound(unittest.TestCase):
self.assertEqual(len(nested_key), 1)
nested = nested_key[0]
self.assertIsInstance(nested, ValidationError)
self.assertEqual(nested.code, 'model-field-mandatory')
self.assertEqual(nested.params['field'], key)
self.assertEqual(nested.code, "model-field-mandatory")
self.assertEqual(nested.params["field"], key)
def test_validate_whole_schema_shortcut_previous_error(self):
with self.assertRaises(ValidationError) as cm:
@@ -176,14 +189,15 @@ class ValidateCompound(unittest.TestCase):
nested = non_field_errors[0]
self.assertIsInstance(nested, ValidationError)
self.assertEqual(nested.code, 'model-validation-failed')
self.assertEqual(nested.params['model'], self.test_model)
self.assertEqual(nested.params['validator'], 'validate_never')
self.assertEqual(nested.code, "model-validation-failed")
self.assertEqual(nested.params["model"], self.test_model)
self.assertEqual(nested.params["validator"], "validate_never")
def test_validate_against_schema_field_error(self):
with self.assertRaises(ValidationError) as cm:
compound.validate_against_schema(self.test_model, self.test_schema,
self.test_value_str_error)
compound.validate_against_schema(
self.test_model, self.test_schema, self.test_value_str_error
)
# The exception should be of the form:
# ValidationError({
@@ -192,21 +206,21 @@ class ValidateCompound(unittest.TestCase):
exc = cm.exception
self.assertIsInstance(str(exc), str)
self.assertEqual(set(exc.error_dict.keys()), {'str'})
self.assertEqual(set(exc.error_dict.keys()), {"str"})
str_errors = exc.error_dict['str']
str_errors = exc.error_dict["str"]
self.assertIsInstance(str_errors, list)
self.assertEqual(len(str_errors), 1)
nested = str_errors[0]
self.assertIsInstance(nested, ValidationError)
self.assertEqual(nested.code, 'unexpected-type')
self.assertEqual(nested.code, "unexpected-type")
def test_validate_against_schema_field_failed(self):
with self.assertRaises(ValidationError) as cm:
compound.validate_against_schema(self.test_model,
self.test_schema_field_failed,
self.test_value)
compound.validate_against_schema(
self.test_model, self.test_schema_field_failed, self.test_value
)
# The exception should be of the form:
# ValidationError({
@@ -215,14 +229,14 @@ class ValidateCompound(unittest.TestCase):
exc = cm.exception
self.assertIsInstance(str(exc), str)
self.assertEqual(set(exc.error_dict.keys()), {'int'})
self.assertEqual(set(exc.error_dict.keys()), {"int"})
int_errors = exc.error_dict['int']
int_errors = exc.error_dict["int"]
self.assertIsInstance(int_errors, list)
self.assertEqual(len(int_errors), 1)
nested = int_errors[0]
self.assertIsInstance(nested, ValidationError)
self.assertEqual(nested.code, 'field-validation-failed')
self.assertEqual(nested.params['validator'], 'validate_never')
self.assertEqual(nested.params['field'], 'int')
self.assertEqual(nested.code, "field-validation-failed")
self.assertEqual(nested.params["validator"], "validate_never")
self.assertEqual(nested.params["field"], "int")
@@ -12,20 +12,20 @@ from swh.model.fields import hashes
class ValidateHashes(unittest.TestCase):
def setUp(self):
self.valid_byte_hashes = {
'sha1': b'\xf1\xd2\xd2\xf9\x24\xe9\x86\xac\x86\xfd\xf7\xb3\x6c\x94'
b'\xbc\xdf\x32\xbe\xec\x15',
'sha1_git': b'\x25\x7c\xc5\x64\x2c\xb1\xa0\x54\xf0\x8c\xc8\x3f\x2d'
b'\x94\x3e\x56\xfd\x3e\xbe\x99',
'sha256': b'\xb5\xbb\x9d\x80\x14\xa0\xf9\xb1\xd6\x1e\x21\xe7\x96'
b'\xd7\x8d\xcc\xdf\x13\x52\xf2\x3c\xd3\x28\x12\xf4\x85'
b'\x0b\x87\x8a\xe4\x94\x4c',
"sha1": b"\xf1\xd2\xd2\xf9\x24\xe9\x86\xac\x86\xfd\xf7\xb3\x6c\x94"
b"\xbc\xdf\x32\xbe\xec\x15",
"sha1_git": b"\x25\x7c\xc5\x64\x2c\xb1\xa0\x54\xf0\x8c\xc8\x3f\x2d"
b"\x94\x3e\x56\xfd\x3e\xbe\x99",
"sha256": b"\xb5\xbb\x9d\x80\x14\xa0\xf9\xb1\xd6\x1e\x21\xe7\x96"
b"\xd7\x8d\xcc\xdf\x13\x52\xf2\x3c\xd3\x28\x12\xf4\x85"
b"\x0b\x87\x8a\xe4\x94\x4c",
}
self.valid_str_hashes = {
'sha1': 'f1d2d2f924e986ac86fdf7b36c94bcdf32beec15',
'sha1_git': '257cc5642cb1a054f08cc83f2d943e56fd3ebe99',
'sha256': 'b5bb9d8014a0f9b1d61e21e796d78dccdf1352f23cd32812f485'
'0b878ae4944c',
"sha1": "f1d2d2f924e986ac86fdf7b36c94bcdf32beec15",
"sha1_git": "257cc5642cb1a054f08cc83f2d943e56fd3ebe99",
"sha256": "b5bb9d8014a0f9b1d61e21e796d78dccdf1352f23cd32812f485"
"0b878ae4944c",
}
self.bad_hash = object()
@@ -39,112 +39,108 @@ class ValidateHashes(unittest.TestCase):
self.assertTrue(hashes.validate_hash(value, hash_type))
def test_invalid_hash_type(self):
hash_type = 'unknown_hash_type'
hash_type = "unknown_hash_type"
with self.assertRaises(ValidationError) as cm:
hashes.validate_hash(self.valid_str_hashes['sha1'], hash_type)
hashes.validate_hash(self.valid_str_hashes["sha1"], hash_type)
exc = cm.exception
self.assertIsInstance(str(exc), str)
self.assertEqual(exc.code, 'unexpected-hash-type')
self.assertEqual(exc.params['hash_type'], hash_type)
self.assertEqual(exc.code, "unexpected-hash-type")
self.assertEqual(exc.params["hash_type"], hash_type)
self.assertIn('Unexpected hash type', str(exc))
self.assertIn("Unexpected hash type", str(exc))
self.assertIn(hash_type, str(exc))
def test_invalid_bytes_len(self):
for hash_type, value in self.valid_byte_hashes.items():
value = value + b'\x00\x01'
value = value + b"\x00\x01"
with self.assertRaises(ValidationError) as cm:
hashes.validate_hash(value, hash_type)
exc = cm.exception
self.assertIsInstance(str(exc), str)
self.assertEqual(exc.code, 'unexpected-hash-length')
self.assertEqual(exc.params['hash_type'], hash_type)
self.assertEqual(exc.params['length'], len(value))
self.assertEqual(exc.code, "unexpected-hash-length")
self.assertEqual(exc.params["hash_type"], hash_type)
self.assertEqual(exc.params["length"], len(value))
self.assertIn('Unexpected length', str(exc))
self.assertIn("Unexpected length", str(exc))
self.assertIn(str(len(value)), str(exc))
def test_invalid_str_len(self):
for hash_type, value in self.valid_str_hashes.items():
value = value + '0001'
value = value + "0001"
with self.assertRaises(ValidationError) as cm:
hashes.validate_hash(value, hash_type)
exc = cm.exception
self.assertIsInstance(str(exc), str)
self.assertEqual(exc.code, 'unexpected-hash-length')
self.assertEqual(exc.params['hash_type'], hash_type)
self.assertEqual(exc.params['length'], len(value))
self.assertEqual(exc.code, "unexpected-hash-length")
self.assertEqual(exc.params["hash_type"], hash_type)
self.assertEqual(exc.params["length"], len(value))
self.assertIn('Unexpected length', str(exc))
self.assertIn("Unexpected length", str(exc))
self.assertIn(str(len(value)), str(exc))
def test_invalid_str_contents(self):
for hash_type, value in self.valid_str_hashes.items():
value = '\xa2' + value[1:-1] + '\xc3'
value = "\xa2" + value[1:-1] + "\xc3"
with self.assertRaises(ValidationError) as cm:
hashes.validate_hash(value, hash_type)
exc = cm.exception
self.assertIsInstance(str(exc), str)
self.assertEqual(exc.code, 'unexpected-hash-contents')
self.assertEqual(exc.params['hash_type'], hash_type)
self.assertEqual(exc.params['unexpected_chars'], '\xa2, \xc3')
self.assertEqual(exc.code, "unexpected-hash-contents")
self.assertEqual(exc.params["hash_type"], hash_type)
self.assertEqual(exc.params["unexpected_chars"], "\xa2, \xc3")
self.assertIn('Unexpected characters', str(exc))
self.assertIn('\xc3', str(exc))
self.assertIn('\xa2', str(exc))
self.assertIn("Unexpected characters", str(exc))
self.assertIn("\xc3", str(exc))
self.assertIn("\xa2", str(exc))
def test_invalid_value_type(self):
with self.assertRaises(ValidationError) as cm:
hashes.validate_hash(self.bad_hash, 'sha1')
hashes.validate_hash(self.bad_hash, "sha1")
exc = cm.exception
self.assertIsInstance(str(exc), str)
self.assertEqual(exc.code, 'unexpected-hash-value-type')
self.assertEqual(exc.params['type'], self.bad_hash.__class__.__name__)
self.assertEqual(exc.code, "unexpected-hash-value-type")
self.assertEqual(exc.params["type"], self.bad_hash.__class__.__name__)
self.assertIn('Unexpected type', str(exc))
self.assertIn("Unexpected type", str(exc))
self.assertIn(self.bad_hash.__class__.__name__, str(exc))
def test_validate_sha1(self):
self.assertTrue(hashes.validate_sha1(self.valid_byte_hashes['sha1']))
self.assertTrue(hashes.validate_sha1(self.valid_str_hashes['sha1']))
self.assertTrue(hashes.validate_sha1(self.valid_byte_hashes["sha1"]))
self.assertTrue(hashes.validate_sha1(self.valid_str_hashes["sha1"]))
with self.assertRaises(ValidationError) as cm:
hashes.validate_sha1(self.bad_hash)
exc = cm.exception
self.assertIsInstance(str(exc), str)
self.assertEqual(exc.code, 'unexpected-hash-value-type')
self.assertEqual(exc.params['type'], self.bad_hash.__class__.__name__)
self.assertEqual(exc.code, "unexpected-hash-value-type")
self.assertEqual(exc.params["type"], self.bad_hash.__class__.__name__)
def test_validate_sha1_git(self):
self.assertTrue(
hashes.validate_sha1_git(self.valid_byte_hashes['sha1_git']))
self.assertTrue(
hashes.validate_sha1_git(self.valid_str_hashes['sha1_git']))
self.assertTrue(hashes.validate_sha1_git(self.valid_byte_hashes["sha1_git"]))
self.assertTrue(hashes.validate_sha1_git(self.valid_str_hashes["sha1_git"]))
with self.assertRaises(ValidationError) as cm:
hashes.validate_sha1_git(self.bad_hash)
exc = cm.exception
self.assertIsInstance(str(exc), str)
self.assertEqual(exc.code, 'unexpected-hash-value-type')
self.assertEqual(exc.params['type'], self.bad_hash.__class__.__name__)
self.assertEqual(exc.code, "unexpected-hash-value-type")
self.assertEqual(exc.params["type"], self.bad_hash.__class__.__name__)
def test_validate_sha256(self):
self.assertTrue(
hashes.validate_sha256(self.valid_byte_hashes['sha256']))
self.assertTrue(
hashes.validate_sha256(self.valid_str_hashes['sha256']))
self.assertTrue(hashes.validate_sha256(self.valid_byte_hashes["sha256"]))
self.assertTrue(hashes.validate_sha256(self.valid_str_hashes["sha256"]))
with self.assertRaises(ValidationError) as cm:
hashes.validate_sha256(self.bad_hash)
exc = cm.exception
self.assertIsInstance(str(exc), str)
self.assertEqual(exc.code, 'unexpected-hash-value-type')
self.assertEqual(exc.params['type'], self.bad_hash.__class__.__name__)
self.assertEqual(exc.code, "unexpected-hash-value-type")
self.assertEqual(exc.params["type"], self.bad_hash.__class__.__name__)
@@ -12,19 +12,20 @@ from swh.model.fields import simple
class ValidateSimple(unittest.TestCase):
def setUp(self):
self.valid_str = 'I am a valid string'
self.valid_str = "I am a valid string"
self.valid_bytes = b'I am a valid bytes object'
self.valid_bytes = b"I am a valid bytes object"
self.enum_values = {'an enum value', 'other', 'and another'}
self.invalid_enum_value = 'invalid enum value'
self.enum_values = {"an enum value", "other", "and another"}
self.invalid_enum_value = "invalid enum value"
self.valid_int = 42
self.valid_real = 42.42
self.valid_datetime = datetime.datetime(1999, 1, 1, 12, 0, 0,
tzinfo=datetime.timezone.utc)
self.valid_datetime = datetime.datetime(
1999, 1, 1, 12, 0, 0, tzinfo=datetime.timezone.utc
)
self.invalid_datetime_notz = datetime.datetime(1999, 1, 1, 12, 0, 0)
def test_validate_int(self):
@@ -36,9 +37,9 @@ class ValidateSimple(unittest.TestCase):
exc = cm.exception
self.assertIsInstance(str(exc), str)
self.assertEqual(exc.code, 'unexpected-type')
self.assertEqual(exc.params['expected_type'], 'Integral')
self.assertEqual(exc.params['type'], 'str')
self.assertEqual(exc.code, "unexpected-type")
self.assertEqual(exc.params["expected_type"], "Integral")
self.assertEqual(exc.params["type"], "str")
def test_validate_str(self):
self.assertTrue(simple.validate_str(self.valid_str))
@@ -49,18 +50,18 @@ class ValidateSimple(unittest.TestCase):
exc = cm.exception
self.assertIsInstance(str(exc), str)
self.assertEqual(exc.code, 'unexpected-type')
self.assertEqual(exc.params['expected_type'], 'str')
self.assertEqual(exc.params['type'], 'int')
self.assertEqual(exc.code, "unexpected-type")
self.assertEqual(exc.params["expected_type"], "str")
self.assertEqual(exc.params["type"], "int")
with self.assertRaises(ValidationError) as cm:
simple.validate_str(self.valid_bytes)
exc = cm.exception
self.assertIsInstance(str(exc), str)
self.assertEqual(exc.code, 'unexpected-type')
self.assertEqual(exc.params['expected_type'], 'str')
self.assertEqual(exc.params['type'], 'bytes')
self.assertEqual(exc.code, "unexpected-type")
self.assertEqual(exc.params["expected_type"], "str")
self.assertEqual(exc.params["type"], "bytes")
def test_validate_bytes(self):
self.assertTrue(simple.validate_bytes(self.valid_bytes))
@@ -71,18 +72,18 @@ class ValidateSimple(unittest.TestCase):
exc = cm.exception
self.assertIsInstance(str(exc), str)
self.assertEqual(exc.code, 'unexpected-type')
self.assertEqual(exc.params['expected_type'], 'bytes')
self.assertEqual(exc.params['type'], 'int')
self.assertEqual(exc.code, "unexpected-type")
self.assertEqual(exc.params["expected_type"], "bytes")
self.assertEqual(exc.params["type"], "int")
with self.assertRaises(ValidationError) as cm:
simple.validate_bytes(self.valid_str)
exc = cm.exception
self.assertIsInstance(str(exc), str)
self.assertEqual(exc.code, 'unexpected-type')
self.assertEqual(exc.params['expected_type'], 'bytes')
self.assertEqual(exc.params['type'], 'str')
self.assertEqual(exc.code, "unexpected-type")
self.assertEqual(exc.params["expected_type"], "bytes")
self.assertEqual(exc.params["type"], "str")
def test_validate_datetime(self):
self.assertTrue(simple.validate_datetime(self.valid_datetime))
@@ -95,9 +96,9 @@ class ValidateSimple(unittest.TestCase):
exc = cm.exception
self.assertIsInstance(str(exc), str)
self.assertEqual(exc.code, 'unexpected-type')
self.assertEqual(exc.params['expected_type'], 'one of datetime, Real')
self.assertEqual(exc.params['type'], 'str')
self.assertEqual(exc.code, "unexpected-type")
self.assertEqual(exc.params["expected_type"], "one of datetime, Real")
self.assertEqual(exc.params["type"], "str")
def test_validate_datetime_invalide_tz(self):
with self.assertRaises(ValidationError) as cm:
@@ -105,7 +106,7 @@ class ValidateSimple(unittest.TestCase):
exc = cm.exception
self.assertIsInstance(str(exc), str)
self.assertEqual(exc.code, 'datetime-without-tzinfo')
self.assertEqual(exc.code, "datetime-without-tzinfo")
def test_validate_enum(self):
for value in self.enum_values:
@@ -117,7 +118,8 @@ class ValidateSimple(unittest.TestCase):
exc = cm.exception
self.assertIsInstance(str(exc), str)
self.assertEqual(exc.code, 'unexpected-value')
self.assertEqual(exc.params['value'], self.invalid_enum_value)
self.assertEqual(exc.params['expected_values'],
', '.join(sorted(self.enum_values)))
self.assertEqual(exc.code, "unexpected-value")
self.assertEqual(exc.params["value"], self.invalid_enum_value)
self.assertEqual(
exc.params["expected_values"], ", ".join(sorted(self.enum_values))
)
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from datetime import datetime
from random import choice, randint, random, shuffle
from typing import Dict, List
from pytz import all_timezones, timezone
from swh.model.hashutil import MultiHash
PROTOCOLS = ["git", "http", "https", "deb", "svn", "mock"]
DOMAINS = ["example.com", "some.long.host.name", "xn--n28h.tld"]
PATHS = [
"",
"/",
"/stuff",
"/stuff/",
"/path/to/resource",
"/path/with/anchor#id=42",
"/path/with/qargs?q=1&b",
]
CONTENT_STATUS = ["visible", "hidden", "absent"]
MAX_DATE = 3e9 # around 2065
def gen_all_origins():
for protocol in PROTOCOLS:
for domain in DOMAINS:
for urlpath in PATHS:
yield {"url": "%s://%s%s" % (protocol, domain, urlpath)}
ORIGINS = list(gen_all_origins())
def gen_origins(n: int = 100) -> List:
"""Returns a list of n randomly generated origins suitable for using as
Storage.add_origin() argument.
"""
origins = ORIGINS[:]
shuffle(origins)
return origins[:n]
def gen_content():
size = randint(1, 10 * 1024)
data = bytes(randint(0, 255) for i in range(size))
status = choice(CONTENT_STATUS)
h = MultiHash.from_data(data)
ctime = datetime.fromtimestamp(random() * MAX_DATE, timezone(choice(all_timezones)))
content = {
"data": data,
"status": status,
"length": size,
"ctime": ctime,
**h.digest(),
}
if status == "absent":
content["reason"] = "why not"
content["data"] = None
return content
def gen_contents(n=20) -> List[Dict]:
"""Returns a list of n randomly generated content objects (as dict) suitable
for using as Storage.content_add() argument.
"""
return [gen_content() for i in range(n)]
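These generators are intended to feed storage tests, as the docstrings above suggest. A minimal usage sketch follows; the storage object and its origin_add/content_add methods are assumptions made for illustration and are not part of this change:

def populate_storage(storage):
    # Hypothetical helper, illustration only: push random test data into a
    # storage-like object. The method names below are assumed, not shown here.
    origins = gen_origins(n=10)    # 10 shuffled origin dicts
    contents = gen_contents(n=5)   # 5 random content dicts (some may be "absent")
    storage.origin_add(origins)
    storage.content_add(contents)
    return origins, contents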
@@ -7,86 +7,86 @@ from operator import itemgetter
import os
import sys
from swh.model.from_disk import Directory, DentryPerms
from swh.model.from_disk import DentryPerms, Directory
from swh.model.hashutil import ALGORITHMS, hash_to_hex
def generate_from_directory(varname, directory, indent=0):
"""Generate test data from a given directory"""
def get_data(member, path):
yield (path, member.get_data())
if isinstance(member, Directory):
for name, child in member.items():
yield from get_data(child, os.path.join(path, name))
data = dict(get_data(directory, b''))
data = dict(get_data(directory, b""))
out = []
def format_hash(h, indent=0):
spindent = ' ' * indent
spindent = " " * indent
if len(h) > 20:
cutoff = len(h)//2
cutoff = len(h) // 2
parts = h[:cutoff], h[cutoff:]
else:
parts = [h]
out.append('hash_to_bytes(\n')
out.append("hash_to_bytes(\n")
for part in parts:
out.append(spindent + ' %s\n' % repr(hash_to_hex(part)))
out.append(spindent + ')')
out.append(spindent + " %s\n" % repr(hash_to_hex(part)))
out.append(spindent + ")")
def format_dict_items(d, indent=0):
spindent = ' ' * indent
spindent = " " * indent
for key, value in sorted(d.items()):
if isinstance(key, bytes):
out.append(spindent + repr(key) + ': {\n')
out.append(spindent + repr(key) + ": {\n")
format_dict_items(value, indent=indent + 4)
out.append(spindent + '}')
out.append(spindent + "}")
else:
out.append(spindent + repr(key) + ': ')
if key == 'entries':
out.append(spindent + repr(key) + ": ")
if key == "entries":
if not value:
out.append('[]')
out.append("[]")
else:
out.append('[')
out.append("[")
last_index = len(value) - 1
for i, entry in enumerate(
sorted(value, key=itemgetter('name'))):
sorted(value, key=itemgetter("name"))
):
if i:
out.append(' ')
out.append('{\n')
out.append(" ")
out.append("{\n")
format_dict_items(entry, indent=indent + 4)
if i != last_index:
out.append(spindent + '},')
out.append(spindent + '}]')
elif key in ALGORITHMS | {'id', 'target'}:
out.append(spindent + "},")
out.append(spindent + "}]")
elif key in ALGORITHMS | {"id", "target"}:
format_hash(value, indent=indent)
elif isinstance(value, DentryPerms):
out.append(str(value))
else:
out.append(repr(value))
out.append(',\n')
out.append(",\n")
spindent = ' ' * indent
out.append(spindent + '%s = {\n' % varname)
spindent = " " * indent
out.append(spindent + "%s = {\n" % varname)
format_dict_items(data, indent=4 + indent)
out.append(spindent + '}')
out.append(spindent + "}")
return ''.join(out)
return "".join(out)
if __name__ == '__main__':
if __name__ == "__main__":
if not sys.argv[1:]:
print("Usage: %s dir1 dir2" % sys.argv[0], file=sys.stderr)
exit(2)
for dirname in sys.argv[1:]:
basename = os.path.basename(dirname)
varname = 'expected_%s' % basename
varname = "expected_%s" % basename
testdata = generate_from_directory(
varname,
Directory.from_disk(path=os.fsencode(dirname)),
indent=8
varname, Directory.from_disk(path=os.fsencode(dirname)), indent=8
)
print(testdata)
print()
# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import datetime
from typing import Dict, List, Sequence, cast
import attr
from swh.model.hashutil import MultiHash, hash_to_bytes
from swh.model.model import (
BaseModel,
Content,
Directory,
DirectoryEntry,
ExtID,
MetadataAuthority,
MetadataAuthorityType,
MetadataFetcher,
ModelObjectType,
ObjectType,
Origin,
OriginVisit,
OriginVisitStatus,
Person,
RawExtrinsicMetadata,
Release,
Revision,
RevisionType,
SkippedContent,
Snapshot,
SnapshotBranch,
SnapshotTargetType,
Timestamp,
TimestampWithTimezone,
)
from swh.model.swhids import ExtendedSWHID
UTC = datetime.timezone.utc
CONTENTS: List[Content] = [
Content(
length=4,
data=f"foo{i}".encode(),
status="visible",
**MultiHash.from_data(f"foo{i}".encode()).digest(),
)
for i in range(10)
] + [
Content(
length=14,
data=f"forbidden foo{i}".encode(),
status="hidden",
**MultiHash.from_data(f"forbidden foo{i}".encode()).digest(),
)
for i in range(10)
]
SKIPPED_CONTENTS: List[SkippedContent] = [
SkippedContent(
length=4,
status="absent",
reason=f"because chr({i}) != '*'",
**MultiHash.from_data(f"bar{i}".encode()).digest(),
)
for i in range(2)
]
duplicate_content1 = Content(
length=4,
sha1=hash_to_bytes("44973274ccef6ab4dfaaf86599792fa9c3fe4689"),
sha1_git=b"another-foo",
blake2s256=b"another-bar",
sha256=b"another-baz",
status="visible",
)
# Craft a sha1 collision
sha1_array = bytearray(duplicate_content1.sha1_git)
sha1_array[0] += 1
duplicate_content2 = attr.evolve(duplicate_content1, sha1_git=bytes(sha1_array))
DUPLICATE_CONTENTS = [duplicate_content1, duplicate_content2]
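The two contents above collide on sha1 while remaining distinct objects, since attr.evolve only changes sha1_git. A tiny illustrative check, not part of the module, makes that intent explicit:

# Illustration only: DUPLICATE_CONTENTS share their sha1 but differ on sha1_git.
assert duplicate_content1.sha1 == duplicate_content2.sha1
assert duplicate_content1.sha1_git != duplicate_content2.sha1_git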
COMMITTERS: List[Person] = [
Person(fullname=b"foo", name=b"foo", email=b""),
Person(fullname=b"bar", name=b"bar", email=b""),
]
DATES: List[TimestampWithTimezone] = [
TimestampWithTimezone(
timestamp=Timestamp(
seconds=1234567891,
microseconds=0,
),
offset_bytes=b"+0200",
),
TimestampWithTimezone(
timestamp=Timestamp(
seconds=1234567892,
microseconds=0,
),
offset_bytes=b"+0200",
),
]
REVISIONS: List[Revision] = [
Revision(
id=hash_to_bytes("66c7c1cd9673275037140f2abff7b7b11fc9439c"),
message=b"hello",
date=DATES[0],
committer=COMMITTERS[0],
author=COMMITTERS[0],
committer_date=DATES[0],
type=RevisionType.GIT,
directory=b"\x01" * 20,
synthetic=False,
metadata=None,
parents=(
hash_to_bytes("9b918dd063cec85c2bc63cc7f167e29f5894dcbc"),
hash_to_bytes("757f38bdcd8473aaa12df55357f5e2f1a318e672"),
),
),
Revision(
id=hash_to_bytes("c7f96242d73c267adc77c2908e64e0c1cb6a4431"),
message=b"hello again",
date=DATES[1],
committer=COMMITTERS[1],
author=COMMITTERS[1],
committer_date=DATES[1],
type=RevisionType.MERCURIAL,
directory=b"\x02" * 20,
synthetic=False,
metadata=None,
parents=(),
extra_headers=((b"foo", b"bar"),),
),
Revision(
id=hash_to_bytes("51580d63b8dcc0ec73e74994e66896858542840a"),
message=b"hello",
date=DATES[0],
committer=COMMITTERS[0],
author=COMMITTERS[0],
committer_date=DATES[0],
type=RevisionType.GIT,
directory=b"\x01" * 20,
synthetic=False,
metadata=None,
parents=(hash_to_bytes("9b918dd063cec85c2bc63cc7f167e29f5894dcbc"),),
raw_manifest=(
b"commit 207\x00"
b"tree 0101010101010101010101010101010101010101\n"
b"parent 9B918DD063CEC85C2BC63CC7F167E29F5894DCBC" # upper-cased
b"nauthor foo 1234567891 +0200\n"
b"committer foo 1234567891 +0200"
b"\n\nhello"
),
),
]
RELEASES: List[Release] = [
Release(
id=hash_to_bytes("8059dc4e17fcd0e51ca3bcd6b80f4577d281fd08"),
name=b"v0.0.1",
date=TimestampWithTimezone(
timestamp=Timestamp(
seconds=1234567890,
microseconds=0,
),
offset_bytes=b"+0200",
),
author=COMMITTERS[0],
target_type=ObjectType.REVISION,
target=b"\x04" * 20,
message=b"foo",
synthetic=False,
),
Release(
id=hash_to_bytes("ee4d20e80af850cc0f417d25dc5073792c5010d2"),
name=b"this-is-a/tag/1.0",
date=None,
author=None,
target_type=ObjectType.DIRECTORY,
target=b"\x05" * 20,
message=b"bar",
synthetic=False,
),
Release(
id=hash_to_bytes("1cdd1e87234b6f066d0855a3b5b567638a55d583"),
name=b"v0.0.1",
date=TimestampWithTimezone(
timestamp=Timestamp(
seconds=1234567890,
microseconds=0,
),
offset_bytes=b"+0200",
),
author=COMMITTERS[0],
target_type=ObjectType.REVISION,
target=b"\x04" * 20,
message=b"foo",
synthetic=False,
raw_manifest=(
b"tag 102\x00"
b"object 0404040404040404040404040404040404040404\n"
b"type commit\n"
b"tag v0.0.1\n"
b"tagger foo 1234567890 +200" # missing leading 0 for timezone
b"\n\nfoo"
),
),
]
ORIGINS: List[Origin] = [
Origin(
url="https://somewhere.org/den/fox",
),
Origin(
url="https://overtherainbow.org/fox/den",
),
]
ORIGIN_VISITS: List[OriginVisit] = [
OriginVisit(
origin=ORIGINS[0].url,
date=datetime.datetime(2013, 5, 7, 4, 20, 39, 369271, tzinfo=UTC),
visit=1,
type="git",
),
OriginVisit(
origin=ORIGINS[1].url,
date=datetime.datetime(2014, 11, 27, 17, 20, 39, tzinfo=UTC),
visit=1,
type="hg",
),
OriginVisit(
origin=ORIGINS[0].url,
date=datetime.datetime(2018, 11, 27, 17, 20, 39, tzinfo=UTC),
visit=2,
type="git",
),
OriginVisit(
origin=ORIGINS[0].url,
date=datetime.datetime(2018, 11, 27, 17, 20, 39, tzinfo=UTC),
visit=3,
type="git",
),
OriginVisit(
origin=ORIGINS[1].url,
date=datetime.datetime(2015, 11, 27, 17, 20, 39, tzinfo=UTC),
visit=2,
type="hg",
),
]
# The origin-visit-status dates need to be shifted slightly into the future relative
# to their visit date counterparts. Otherwise, on the storage side, we hit the
# "on conflict ignore" policy (because origin-visit-add creates an origin-visit-status
# with the same {origin, visit, date} parameters as the origin-visit).
ORIGIN_VISIT_STATUSES: List[OriginVisitStatus] = [
OriginVisitStatus(
origin=ORIGINS[0].url,
date=datetime.datetime(2013, 5, 7, 4, 20, 39, 432222, tzinfo=UTC),
visit=1,
type="git",
status="ongoing",
snapshot=None,
metadata=None,
),
OriginVisitStatus(
origin=ORIGINS[1].url,
date=datetime.datetime(2014, 11, 27, 17, 21, 12, tzinfo=UTC),
visit=1,
type="hg",
status="ongoing",
snapshot=None,
metadata=None,
),
OriginVisitStatus(
origin=ORIGINS[0].url,
date=datetime.datetime(2018, 11, 27, 17, 20, 59, tzinfo=UTC),
visit=2,
type="git",
status="ongoing",
snapshot=None,
metadata=None,
),
OriginVisitStatus(
origin=ORIGINS[0].url,
date=datetime.datetime(2018, 11, 27, 17, 20, 49, tzinfo=UTC),
visit=3,
type="git",
status="full",
snapshot=hash_to_bytes("9e78d7105c5e0f886487511e2a92377b4ee4c32a"),
metadata=None,
),
OriginVisitStatus(
origin=ORIGINS[1].url,
date=datetime.datetime(2015, 11, 27, 17, 22, 18, tzinfo=UTC),
visit=2,
type="hg",
status="partial",
snapshot=hash_to_bytes("0e7f84ede9a254f2cd55649ad5240783f557e65f"),
metadata=None,
),
]
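To make the date-shifting constraint described before the list concrete, here is an illustrative sanity check (a sketch, not part of the module) pairing each status with its visit:

def _check_status_dates():
    # Sketch only: each origin-visit-status above is dated strictly after the
    # matching origin-visit, which avoids the "on conflict ignore" behaviour.
    visit_dates = {(v.origin, v.visit): v.date for v in ORIGIN_VISITS}
    for status in ORIGIN_VISIT_STATUSES:
        assert status.date > visit_dates[(status.origin, status.visit)]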
DIRECTORIES: List[Directory] = [
Directory(id=hash_to_bytes("4b825dc642cb6eb9a060e54bf8d69288fbee4904"), entries=()),
Directory(
id=hash_to_bytes("87b339104f7dc2a8163dec988445e3987995545f"),
entries=(
DirectoryEntry(
name=b"file1.ext",
perms=0o644,
type="file",
target=CONTENTS[0].sha1_git,
),
DirectoryEntry(
name=b"dir1",
perms=0o755,
type="dir",
target=hash_to_bytes("4b825dc642cb6eb9a060e54bf8d69288fbee4904"),
),
DirectoryEntry(
name=b"subprepo1",
perms=0o160000,
type="rev",
target=REVISIONS[1].id,
),
),
),
Directory(
id=hash_to_bytes("d135a91ac82a754e7f4bdeff8d56ef06d921eb7d"),
entries=(
DirectoryEntry(
name=b"file1.ext",
perms=0o644,
type="file",
target=b"\x11" * 20,
),
),
raw_manifest=(
b"tree 34\x00"
+ b"00644 file1.ext\x00" # added two leading zeros
+ b"\x11" * 20
),
),
]
SNAPSHOTS: List[Snapshot] = [
Snapshot(
id=hash_to_bytes("9e78d7105c5e0f886487511e2a92377b4ee4c32a"),
branches={
b"master": SnapshotBranch(
target_type=SnapshotTargetType.REVISION, target=REVISIONS[0].id
)
},
),
Snapshot(
id=hash_to_bytes("0e7f84ede9a254f2cd55649ad5240783f557e65f"),
branches={
b"target/revision": SnapshotBranch(
target_type=SnapshotTargetType.REVISION,
target=REVISIONS[0].id,
),
b"target/alias": SnapshotBranch(
target_type=SnapshotTargetType.ALIAS, target=b"target/revision"
),
b"target/directory": SnapshotBranch(
target_type=SnapshotTargetType.DIRECTORY,
target=DIRECTORIES[0].id,
),
b"target/release": SnapshotBranch(
target_type=SnapshotTargetType.RELEASE, target=RELEASES[0].id
),
b"target/snapshot": SnapshotBranch(
target_type=SnapshotTargetType.SNAPSHOT,
target=hash_to_bytes("9e78d7105c5e0f886487511e2a92377b4ee4c32a"),
),
},
),
]
METADATA_AUTHORITIES: List[MetadataAuthority] = [
MetadataAuthority(
type=MetadataAuthorityType.FORGE,
url="http://example.org/",
metadata={},
),
]
METADATA_FETCHERS: List[MetadataFetcher] = [
MetadataFetcher(
name="test-fetcher",
version="1.0.0",
metadata={},
)
]
RAW_EXTRINSIC_METADATA: List[RawExtrinsicMetadata] = [
RawExtrinsicMetadata(
target=Origin("http://example.org/foo.git").swhid(),
discovery_date=datetime.datetime(2020, 7, 30, 17, 8, 20, tzinfo=UTC),
authority=attr.evolve(METADATA_AUTHORITIES[0], metadata=None),
fetcher=attr.evolve(METADATA_FETCHERS[0], metadata=None),
format="json",
metadata=b'{"foo": "bar"}',
),
RawExtrinsicMetadata(
target=ExtendedSWHID.from_string(str(CONTENTS[0].swhid())),
discovery_date=datetime.datetime(2020, 7, 30, 17, 8, 20, tzinfo=UTC),
authority=attr.evolve(METADATA_AUTHORITIES[0], metadata=None),
fetcher=attr.evolve(METADATA_FETCHERS[0], metadata=None),
format="json",
metadata=b'{"foo": "bar"}',
),
]
EXTIDS: List[ExtID] = [
ExtID(
extid_type="git256",
extid=b"\x03" * 32,
target=REVISIONS[0].swhid(),
),
ExtID(
extid_type="hg",
extid=b"\x04" * 20,
target=REVISIONS[1].swhid(),
),
ExtID(
extid_type="hg-nodeid",
extid=b"\x05" * 20,
target=REVISIONS[1].swhid(),
extid_version=1,
),
ExtID(
extid_type="tarball-sha256",
extid=b"\x03" * 32,
target=DIRECTORIES[0].swhid(),
payload_type="disarchive",
payload=CONTENTS[0].sha1_git,
),
]
TEST_OBJECTS: Dict[ModelObjectType, Sequence[BaseModel]] = {}
# Generate this mapping with code to avoid copy-paste errors
for objects in [
CONTENTS,
DIRECTORIES,
EXTIDS,
METADATA_AUTHORITIES,
METADATA_FETCHERS,
ORIGINS,
ORIGIN_VISITS,
ORIGIN_VISIT_STATUSES,
RAW_EXTRINSIC_METADATA,
RELEASES,
REVISIONS,
SNAPSHOTS,
SKIPPED_CONTENTS,
]:
objects = cast(List[BaseModel], objects)
object_type = objects[0].object_type
assert all(object_type == o.object_type for o in objects)
assert object_type not in TEST_OBJECTS
TEST_OBJECTS[object_type] = objects
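TEST_OBJECTS maps each model object type to its sample objects, so consumers can look samples up by type. For example (an illustrative sketch, assuming the object_type class attribute exposed by swh.model model classes):

# Sketch only: the sample revisions registered above are retrievable by type.
assert TEST_OBJECTS[Revision.object_type] == REVISIONS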
SAMPLE_FOLDER_SWHIDS = [
"swh:1:dir:e8b0f1466af8608c8a3fb9879db172b887e80759",
"swh:1:cnt:7d5c08111e21c8a9f71540939998551683375fad",
"swh:1:cnt:68769579c3eaadbe555379b9c3538e6628bae1eb",
"swh:1:cnt:e86b45e538d9b6888c969c89fbd22a85aa0e0366",
"swh:1:dir:3c1f578394f4623f74a0ba7fe761729f59fc6ec4",
"swh:1:dir:c3020f6bf135a38c6df3afeb5fb38232c5e07087",
"swh:1:cnt:133693b125bad2b4ac318535b84901ebb1f6b638",
"swh:1:dir:4b825dc642cb6eb9a060e54bf8d69288fbee4904",
"swh:1:cnt:19102815663d23f8b75a47e7a01965dcdc96468c",
"swh:1:dir:2b41c40f0d1fbffcba12497db71fba83fcca96e5",
"swh:1:cnt:8185dfb2c0c2c597d16f75a8a0c37668567c3d7e",
"swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a",
"swh:1:cnt:acac326ddd63b0bc70840659d4ac43619484e69f",
]
# Copyright (C) 2018 The Software Heritage developers
# Copyright (C) 2018-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os
import sys
import tarfile
import tempfile
import unittest
import unittest.mock
from click.testing import CliRunner
import pytest
from swh.model import cli
from swh.model.hashutil import hash_to_hex
from swh.model.tests.swh_model_data import SAMPLE_FOLDER_SWHIDS
from swh.model.tests.test_from_disk import DataMixin
@pytest.mark.fs
class TestIdentify(DataMixin, unittest.TestCase):
def setUp(self):
super().setUp()
self.runner = CliRunner()
def assertPidOK(self, result, pid): # noqa: N802
self.assertEqual(result.exit_code, 0)
self.assertEqual(result.output.split()[0], pid)
def assertSWHID(self, result, swhid):
self.assertEqual(result.exit_code, 0, result.output)
self.assertEqual(result.output.split()[0], swhid)
def test_no_args(self):
result = self.runner.invoke(cli.identify)
self.assertNotEqual(result.exit_code, 0)
def test_content_id(self):
"""identify file content"""
self.make_contents(self.tmpdir_name)
for filename, content in self.contents.items():
path = os.path.join(self.tmpdir_name, filename)
result = self.runner.invoke(cli.identify,
['--type', 'content', path])
self.assertPidOK(result,
'swh:1:cnt:' + hash_to_hex(content['sha1_git']))
result = self.runner.invoke(cli.identify, ["--type", "content", path])
self.assertSWHID(result, "swh:1:cnt:" + hash_to_hex(content["sha1_git"]))
def test_content_id_from_stdin(self):
"""identify file content"""
self.make_contents(self.tmpdir_name)
for _, content in self.contents.items():
result = self.runner.invoke(cli.identify, ["-"], input=content["data"])
self.assertSWHID(result, "swh:1:cnt:" + hash_to_hex(content["sha1_git"]))
def test_directory_id(self):
"""identify an entire directory"""
self.make_from_tarball(self.tmpdir_name)
path = os.path.join(self.tmpdir_name, b'sample-folder')
result = self.runner.invoke(cli.identify,
['--type', 'directory', path])
self.assertPidOK(result,
'swh:1:dir:e8b0f1466af8608c8a3fb9879db172b887e80759')
path = os.path.join(self.tmpdir_name, b"sample-folder")
result = self.runner.invoke(cli.identify, ["--type", "directory", path])
self.assertSWHID(result, "swh:1:dir:e8b0f1466af8608c8a3fb9879db172b887e80759")
@pytest.mark.requires_optional_deps
def test_snapshot_id(self):
"""identify a snapshot"""
tarball = os.path.join(
os.path.dirname(__file__), "data", "repos", "sample-repo.tgz"
)
with tempfile.TemporaryDirectory(prefix="swh.model.cli") as d:
with tarfile.open(tarball, "r:gz") as t:
t.extractall(d)
repo_dir = os.path.join(d, "sample-repo")
result = self.runner.invoke(
cli.identify, ["--type", "snapshot", repo_dir]
)
self.assertSWHID(
result, "swh:1:snp:abc888898124270905a0ef3c67e872ce08e7e0c1"
)
def test_snapshot_without_dulwich(self):
"""checks swh-identify returns a 'nice' message instead of a traceback
when dulwich is not installed"""
with unittest.mock.patch.dict(sys.modules, {"dulwich": None}):
with tempfile.TemporaryDirectory(prefix="swh.model.cli") as d:
result = self.runner.invoke(
cli.identify,
["--type", "snapshot", d],
catch_exceptions=False,
)
assert result.exit_code == 1
assert "'swh.model[cli]'" in result.output
def test_origin_id(self):
"""identify an origin URL"""
url = "https://github.com/torvalds/linux"
result = self.runner.invoke(cli.identify, ["--type", "origin", url])
self.assertSWHID(result, "swh:1:ori:b63a575fe3faab7692c9f38fb09d4bb45651bb0f")
def test_symlink(self):
"""identify symlink --- both itself and target"""
regular = os.path.join(self.tmpdir_name, b'foo.txt')
link = os.path.join(self.tmpdir_name, b'bar.txt')
open(regular, 'w').write('foo\n')
regular = os.path.join(self.tmpdir_name, b"foo.txt")
link = os.path.join(self.tmpdir_name, b"bar.txt")
with open(regular, "w") as f:
f.write("foo\n")
os.symlink(os.path.basename(regular), link)
result = self.runner.invoke(cli.identify, [link])
self.assertPidOK(result,
'swh:1:cnt:257cc5642cb1a054f08cc83f2d943e56fd3ebe99')
self.assertSWHID(result, "swh:1:cnt:257cc5642cb1a054f08cc83f2d943e56fd3ebe99")
result = self.runner.invoke(cli.identify, ['--no-dereference', link])
self.assertPidOK(result,
'swh:1:cnt:996f1789ff67c0e3f69ef5933a55d54c5d0e9954')
result = self.runner.invoke(cli.identify, ["--no-dereference", link])
self.assertSWHID(result, "swh:1:cnt:996f1789ff67c0e3f69ef5933a55d54c5d0e9954")
def test_show_filename(self):
"""filename is shown by default"""
self.make_contents(self.tmpdir_name)
for filename, content in self.contents.items():
path = os.path.join(self.tmpdir_name, filename)
result = self.runner.invoke(cli.identify,
['--type', 'content', path])
result = self.runner.invoke(cli.identify, ["--type", "content", path])
self.assertEqual(result.exit_code, 0)
self.assertEqual(result.output.rstrip(),
'swh:1:cnt:%s\t%s' %
(hash_to_hex(content['sha1_git']), path.decode()))
self.assertEqual(
result.output.rstrip(),
"swh:1:cnt:%s\t%s" % (hash_to_hex(content["sha1_git"]), path.decode()),
)
def test_hide_filename(self):
"""filename is hidden upon request"""
self.make_contents(self.tmpdir_name)
for filename, content in self.contents.items():
path = os.path.join(self.tmpdir_name, filename)
result = self.runner.invoke(cli.identify,
['--type', 'content', '--no-filename',
path])
self.assertPidOK(result,
'swh:1:cnt:' + hash_to_hex(content['sha1_git']))
def test_auto_id(self):
"""automatic object type: file or directory, depending on argument"""
with tempfile.NamedTemporaryFile(prefix='swh.model.cli') as f:
result = self.runner.invoke(
cli.identify, ["--type", "content", "--no-filename", path]
)
self.assertSWHID(result, "swh:1:cnt:" + hash_to_hex(content["sha1_git"]))
def test_auto_content(self):
"""automatic object type detection: content"""
with tempfile.NamedTemporaryFile(prefix="swh.model.cli") as f:
result = self.runner.invoke(cli.identify, [f.name])
self.assertEqual(result.exit_code, 0)
self.assertRegex(result.output, r'^swh:\d+:cnt:')
self.assertRegex(result.output, r"^swh:\d+:cnt:")
with tempfile.TemporaryDirectory(prefix='swh.model.cli') as dirname:
def test_auto_directory(self):
"""automatic object type detection: directory"""
with tempfile.TemporaryDirectory(prefix="swh.model.cli") as dirname:
result = self.runner.invoke(cli.identify, [dirname])
self.assertEqual(result.exit_code, 0)
self.assertRegex(result.output, r'^swh:\d+:dir:')
self.assertRegex(result.output, r"^swh:\d+:dir:")
def test_auto_origin(self):
"""automatic object type detection: origin"""
result = self.runner.invoke(cli.identify, ["https://github.com/torvalds/linux"])
self.assertEqual(result.exit_code, 0, result.output)
self.assertRegex(result.output, r"^swh:\d+:ori:")
def test_verify_content(self):
"""identifier verification"""
self.make_contents(self.tmpdir_name)
for filename, content in self.contents.items():
expected_id = 'swh:1:cnt:' + hash_to_hex(content['sha1_git'])
expected_id = "swh:1:cnt:" + hash_to_hex(content["sha1_git"])
# match
path = os.path.join(self.tmpdir_name, filename)
result = self.runner.invoke(cli.identify,
['--verify', expected_id, path])
self.assertEqual(result.exit_code, 0)
result = self.runner.invoke(cli.identify, ["--verify", expected_id, path])
self.assertEqual(result.exit_code, 0, result.output)
# mismatch
with open(path, 'a') as f:
f.write('trailing garbage to make verification fail')
result = self.runner.invoke(cli.identify,
['--verify', expected_id, path])
with open(path, "a") as f:
f.write("trailing garbage to make verification fail")
result = self.runner.invoke(cli.identify, ["--verify", expected_id, path])
self.assertEqual(result.exit_code, 1)
def test_exclude(self):
"""exclude patterns"""
self.make_from_tarball(self.tmpdir_name)
path = os.path.join(self.tmpdir_name, b"sample-folder")
excluded_dir = os.path.join(path, b"excluded_dir\x96")
os.mkdir(excluded_dir)
with open(os.path.join(excluded_dir, b"some_file"), "w") as f:
f.write("content")
result = self.runner.invoke(
cli.identify, ["--type", "directory", "--exclude", "excluded_*", path]
)
self.assertSWHID(result, "swh:1:dir:e8b0f1466af8608c8a3fb9879db172b887e80759")
def test_recursive_directory(self):
self.make_from_tarball(self.tmpdir_name)
path = os.path.join(self.tmpdir_name, b"sample-folder")
result = self.runner.invoke(cli.identify, ["--recursive", path])
self.assertEqual(result.exit_code, 0, result.output)
result = result.output.split()
result_swhids = []
# get all SWHIDs from the result (output tokens alternate SWHID, filename)
for i in range(0, len(result)):
if i % 2 == 0:
result_swhids.append(result[i])
assert len(result_swhids) == len(SAMPLE_FOLDER_SWHIDS)
for swhid in SAMPLE_FOLDER_SWHIDS:
assert swhid in result_swhids
def test_recursive_directory_no_filename(self):
self.make_from_tarball(self.tmpdir_name)
path = os.path.join(self.tmpdir_name, b"sample-folder")
result = self.runner.invoke(
cli.identify, ["--recursive", "--no-filename", path]
)
self.assertEqual(result.exit_code, 0, result.output)
result_swhids = result.output.split()
assert len(result_swhids) == len(SAMPLE_FOLDER_SWHIDS)
for swhid in SAMPLE_FOLDER_SWHIDS:
assert swhid in result_swhids
# Copyright (C) 2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import pytest
from swh.model.collections import ImmutableDict
def test_immutabledict_empty():
d = ImmutableDict()
assert d == {}
assert d != {"foo": "bar"}
assert list(d) == []
assert list(d.items()) == []
def test_immutabledict_one_item():
d = ImmutableDict({"foo": "bar"})
assert d == {"foo": "bar"}
assert d != {}
assert d["foo"] == "bar"
with pytest.raises(KeyError, match="bar"):
d["bar"]
assert list(d) == ["foo"]
assert list(d.items()) == [("foo", "bar")]
def test_immutabledict_from_iterable():
d1 = ImmutableDict()
d2 = ImmutableDict({"foo": "bar"})
assert ImmutableDict([]) == d1
assert ImmutableDict([("foo", "bar")]) == d2
def test_immutabledict_from_immutabledict():
d1 = ImmutableDict()
d2 = ImmutableDict({"foo": "bar"})
assert ImmutableDict(d1) == d1
assert ImmutableDict(d2) == d2
def test_immutabledict_immutable():
d = ImmutableDict({"foo": "bar"})
with pytest.raises(TypeError, match="item assignment"):
d["bar"] = "baz"
with pytest.raises(TypeError, match="item deletion"):
del d["foo"]
def test_immutabledict_copy_pop():
d = ImmutableDict({"foo": "bar", "baz": "qux"})
assert d.copy_pop("foo") == ("bar", ImmutableDict({"baz": "qux"}))
assert d.copy_pop("not a key") == (None, d)
def test_hash():
assert hash(ImmutableDict()) == hash(ImmutableDict({}))
assert hash(ImmutableDict({"foo": "bar"})) == hash(ImmutableDict({"foo": "bar"}))
assert hash(ImmutableDict({"foo": "bar", "baz": "qux"})) == hash(
ImmutableDict({"foo": "bar", "baz": "qux"})
)
assert hash(ImmutableDict({"foo": "bar", "baz": "qux"})) == hash(
ImmutableDict({"baz": "qux", "foo": "bar"})
)
def test_equality_order():
assert ImmutableDict({"foo": "bar", "baz": "qux"}) == ImmutableDict(
{"foo": "bar", "baz": "qux"}
)
assert ImmutableDict({"foo": "bar", "baz": "qux"}) == ImmutableDict(
{"baz": "qux", "foo": "bar"}
)
# Copyright (C) 2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from dataclasses import dataclass
from typing import Iterable, List
from swh.model import discovery, model
from swh.model.hashutil import hash_to_bytes
from swh.model.model import Sha1Git
from swh.model.tests.test_identifiers import directory_example
pytest_plugins = ["aiohttp.pytest_plugin"]
UNKNOWN_HASH = hash_to_bytes("17140cb6109f1e3296dc52e2b2cd29bcb40e86be")
KNOWN_CONTENT_HASH = hash_to_bytes("e8e4106de42e2d5d5efab6a9422b9a8677c993c8")
KNOWN_DIRECTORY_HASH = hash_to_bytes("d7ed3d2c31d608823be58b1cbe57605310615231")
KNOWN_DIRECTORY_HASH_2 = hash_to_bytes("c76724e9a0be4b60f4bf0cb48b261df8eda94b1d")
@dataclass
class FakeArchive:
contents: List[model.Content]
skipped_contents: List[model.SkippedContent]
directories: List[model.Directory]
def content_missing(self, contents: List[Sha1Git]) -> Iterable[Sha1Git]:
return []
def skipped_content_missing(
self, skipped_contents: List[Sha1Git]
) -> Iterable[Sha1Git]:
"""List skipped content missing from the archive by sha1"""
return []
def directory_missing(self, directories: List[Sha1Git]) -> Iterable[Sha1Git]:
"""List directories missing from the archive by sha1"""
return []
def test_filter_known_objects(monkeypatch):
# Test with smaller sample sizes to actually trigger the random sampling
monkeypatch.setattr(discovery, "SAMPLE_SIZE", 1)
base_directory = model.Directory.from_dict(directory_example)
# Hardcoding another hash is enough since it's all that's being checked
directory_data = directory_example.copy()
directory_data["id"] = KNOWN_DIRECTORY_HASH_2
other_directory = model.Directory.from_dict(directory_data)
archive = FakeArchive(
contents=[model.Content.from_data(b"blabla")],
skipped_contents=[model.SkippedContent.from_data(b"blabla2", reason="reason")],
directories=[
base_directory,
other_directory,
],
)
assert archive.contents[0].sha1_git == KNOWN_CONTENT_HASH
assert archive.directories[0].id == KNOWN_DIRECTORY_HASH
assert archive.directories[1].id == KNOWN_DIRECTORY_HASH_2
(contents, skipped_contents, directories) = discovery.filter_known_objects(archive)
assert len(contents) == 0
assert len(skipped_contents) == 0
assert len(directories) == 0
# Copyright (C) 2017 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from collections import defaultdict
from functools import partial
import os
import tarfile
import tempfile
from typing import ClassVar, Optional
import unittest
import pytest
from swh.model import from_disk
from swh.model.from_disk import Content, DentryPerms, Directory
from swh.model import from_disk, model
from swh.model.from_disk import (
Content,
DentryPerms,
Directory,
DiskBackedData,
FromDiskType,
)
from swh.model.hashutil import DEFAULT_ALGORITHMS, hash_to_bytes, hash_to_hex
TEST_DATA = os.path.join(os.path.dirname(__file__), 'data')
TEST_DATA = os.path.join(os.path.dirname(__file__), "data")
def mk_tree(root: bytes, tree_desc: bytes):
"""Create a directory tree under `root` with content generated from `tree_desc`
tree_desc is a simple textual representation of the tree structure; each
line is one element of the directory tree: a trailing '/' defines a
directory, otherwise it is an (empty) file; a symlink is specified with a
' -> path' suffix in the description. If the link target starts with a slash
('/') it is considered absolute, i.e. relative to the 'root' directory; e.g.
foo/bar/baz.txt
foo/baz/
foo/bar/toto -> baz.txt
foo/abstoto -> /foo/bar/baz.txt
will generate a directory structure like:
.
└── foo
├── abstoto -> bar/baz.txt
├── bar
│ ├── baz.txt
│ └── toto -> baz.txt
└── baz
The root directory must already exist.
"""
if not os.path.isdir(root):
raise EnvironmentError("The root directory must exists and be writable")
symlinks = []
for entry in tree_desc.splitlines():
entry = entry.strip()
if not entry or entry.startswith(b"#"):
continue
entry = entry.strip().lstrip(b"/")
if b".." in entry:
raise ValueError(".. in path descr is forbidden...")
if b"->" in entry:
dst, src = entry.split(b"->")
symlinks.append((src.strip(), dst.strip()))
continue
path = os.path.join(root, entry)
if entry.endswith(b"/"):
os.makedirs(path, exist_ok=True)
else:
dirname = os.path.dirname(path)
os.makedirs(dirname, exist_ok=True)
open(path, "a")
# now create symlinks
while symlinks:
src, dst = symlinks.pop(0)
fp_dst = os.path.join(root, dst)
if src.startswith(b"/"):
rp_src = src.lstrip(b"/")
else:
rp_src = os.path.join(os.path.dirname(dst), src)
fp_src = os.path.join(root, rp_src)
if not os.path.exists(fp_src):
symlinks.append((src, dst))
continue
# create the parent directory of the dst, if need be
dirname = os.path.dirname(fp_dst)
os.makedirs(dirname, exist_ok=True)
rp_src = os.path.relpath(fp_src, os.path.dirname(fp_dst))
os.symlink(rp_src, fp_dst)
def test_mk_tree(tmpdir):
desc = b"""
foo/bar/baz.txt
foo/baz/
foo/bar/toto -> baz.txt
foo/abstoto -> /foo/bar/baz.txt
baz/baz/baz/
# prefix / is ignored
/bar/a_file.txt
# symlink to a not yet defined target is ok
bar/baz/lnk -> /foo/bar/later.txt
foo/bar/later.txt
# symlink to another symlink is ok
bar/baz/lnk2 -> /foo/bar/toto
# even if the src of the symlink is defined after the dst
bar/baz/lnk3 -> /foo/bar/toto2
foo/bar/toto2 -> later.txt
"""
from os.path import isdir, isfile, islink, realpath
join = partial(os.path.join, tmpdir)
mk_tree(os.fsencode(tmpdir), desc)
assert isfile(join("foo/bar/baz.txt"))
assert isfile(join("foo/bar/later.txt"))
assert isfile(join("bar/a_file.txt"))
assert isdir(join("baz/baz/baz"))
assert islink(join("foo/bar/toto"))
assert realpath(join("foo/bar/toto")) == join("foo/bar/baz.txt")
assert islink(join("foo/bar/toto2"))
assert realpath(join("foo/bar/toto2")) == join("foo/bar/later.txt")
assert islink(join("foo/abstoto"))
assert realpath(join("foo/abstoto")) == join("foo/bar/baz.txt")
assert islink(join("bar/baz/lnk"))
assert realpath(join("bar/baz/lnk")) == join("foo/bar/later.txt")
assert islink(join("bar/baz/lnk2"))
assert realpath(join("bar/baz/lnk2")) == join("foo/bar/baz.txt")
assert islink(join("bar/baz/lnk3"))
assert realpath(join("bar/baz/lnk3")) == join("foo/bar/later.txt")
class ModeToPerms(unittest.TestCase):
@@ -47,353 +170,399 @@ class ModeToPerms(unittest.TestCase):
self.assertEqual(perm, from_disk.mode_to_perms(fmode))
class TestDiskBackedContent(unittest.TestCase):
def test_with_data(self):
expected_content = model.Content(
length=42,
status="visible",
data=b"foo bar",
sha1=b"foo",
sha1_git=b"bar",
sha256=b"baz",
blake2s256=b"qux",
)
with tempfile.NamedTemporaryFile(mode="w+b") as fd:
content = model.Content(
length=42,
status="visible",
get_data=DiskBackedData(path=fd.name),
sha1=b"foo",
sha1_git=b"bar",
sha256=b"baz",
blake2s256=b"qux",
)
fd.write(b"foo bar")
fd.seek(0)
content_with_data = content.with_data()
assert content.to_dict() == content_with_data.to_dict()
assert expected_content == content_with_data
assert expected_content.to_dict() == content_with_data.to_dict()
def test_lazy_data(self):
with tempfile.NamedTemporaryFile(mode="w+b") as fd:
fd.write(b"foo")
fd.seek(0)
content = model.Content(
length=42,
status="visible",
get_data=DiskBackedData(path=fd.name),
sha1=b"foo",
sha1_git=b"bar",
sha256=b"baz",
blake2s256=b"qux",
)
fd.write(b"bar")
fd.seek(0)
content_with_data = content.with_data()
fd.write(b"baz")
fd.seek(0)
assert content_with_data.data == b"bar"
def test_with_data_cannot_read(self):
with tempfile.NamedTemporaryFile(mode="w+b") as fd:
content = model.Content(
length=42,
status="visible",
get_data=DiskBackedData(path=fd.name),
sha1=b"foo",
sha1_git=b"bar",
sha256=b"baz",
blake2s256=b"qux",
)
with pytest.raises(OSError):
content.with_data()
def test_missing_path(self):
with pytest.raises(model.MissingData):
c = model.Content(
length=42,
status="visible",
sha1=b"foo",
sha1_git=b"bar",
sha256=b"baz",
blake2s256=b"qux",
)
c.with_data()
with pytest.raises(model.MissingData):
c = model.Content(
length=42,
status="visible",
get_data=lambda: None,
sha1=b"foo",
sha1_git=b"bar",
sha256=b"baz",
blake2s256=b"qux",
)
c.with_data()
class DataMixin:
maxDiff = None
maxDiff: ClassVar[Optional[int]] = None
def setUp(self):
self.tmpdir = tempfile.TemporaryDirectory(
prefix='swh.model.from_disk'
)
self.tmpdir = tempfile.TemporaryDirectory(prefix="swh.model.from_disk")
self.tmpdir_name = os.fsencode(self.tmpdir.name)
self.contents = {
b'file': {
'data': b'42\n',
'sha1': hash_to_bytes(
'34973274ccef6ab4dfaaf86599792fa9c3fe4689'
),
'sha256': hash_to_bytes(
'084c799cd551dd1d8d5c5f9a5d593b2e'
'931f5e36122ee5c793c1d08a19839cc0'
b"file": {
"data": b"42\n",
"sha1": hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4689"),
"sha256": hash_to_bytes(
"084c799cd551dd1d8d5c5f9a5d593b2e"
"931f5e36122ee5c793c1d08a19839cc0"
),
'sha1_git': hash_to_bytes(
'd81cc0710eb6cf9efd5b920a8453e1e07157b6cd'),
'blake2s256': hash_to_bytes(
'd5fe1939576527e42cfd76a9455a2432'
'fe7f56669564577dd93c4280e76d661d'
"sha1_git": hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
"blake2s256": hash_to_bytes(
"d5fe1939576527e42cfd76a9455a2432"
"fe7f56669564577dd93c4280e76d661d"
),
'length': 3,
'mode': 0o100644
"length": 3,
"mode": 0o100644,
},
}
self.symlinks = {
b'symlink': {
'data': b'target',
'blake2s256': hash_to_bytes(
'595d221b30fdd8e10e2fdf18376e688e'
'9f18d56fd9b6d1eb6a822f8c146c6da6'
),
'sha1': hash_to_bytes(
'0e8a3ad980ec179856012b7eecf4327e99cd44cd'
),
'sha1_git': hash_to_bytes(
'1de565933b05f74c75ff9a6520af5f9f8a5a2f1d'
b"symlink": {
"data": b"target",
"blake2s256": hash_to_bytes(
"595d221b30fdd8e10e2fdf18376e688e"
"9f18d56fd9b6d1eb6a822f8c146c6da6"
),
'sha256': hash_to_bytes(
'34a04005bcaf206eec990bd9637d9fdb'
'6725e0a0c0d4aebf003f17f4c956eb5c'
"sha1": hash_to_bytes("0e8a3ad980ec179856012b7eecf4327e99cd44cd"),
"sha1_git": hash_to_bytes("1de565933b05f74c75ff9a6520af5f9f8a5a2f1d"),
"sha256": hash_to_bytes(
"34a04005bcaf206eec990bd9637d9fdb"
"6725e0a0c0d4aebf003f17f4c956eb5c"
),
'length': 6,
'perms': DentryPerms.symlink,
"length": 6,
"perms": DentryPerms.symlink,
}
}
self.specials = {
b'fifo': os.mkfifo,
b'devnull': lambda path: os.mknod(path, device=os.makedev(1, 3)),
b"fifo": os.mkfifo,
}
self.empty_content = {
'data': b'',
'length': 0,
'blake2s256': hash_to_bytes(
'69217a3079908094e11121d042354a7c'
'1f55b6482ca1a51e1b250dfd1ed0eef9'
"data": b"",
"length": 0,
"blake2s256": hash_to_bytes(
"69217a3079908094e11121d042354a7c" "1f55b6482ca1a51e1b250dfd1ed0eef9"
),
'sha1': hash_to_bytes(
'da39a3ee5e6b4b0d3255bfef95601890afd80709'
"sha1": hash_to_bytes("da39a3ee5e6b4b0d3255bfef95601890afd80709"),
"sha1_git": hash_to_bytes("e69de29bb2d1d6434b8b29ae775ad8c2e48c5391"),
"sha256": hash_to_bytes(
"e3b0c44298fc1c149afbf4c8996fb924" "27ae41e4649b934ca495991b7852b855"
),
'sha1_git': hash_to_bytes(
'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391'
),
'sha256': hash_to_bytes(
'e3b0c44298fc1c149afbf4c8996fb924'
'27ae41e4649b934ca495991b7852b855'
),
'perms': DentryPerms.content,
"perms": DentryPerms.content,
}
self.empty_directory = {
'id': hash_to_bytes(
'4b825dc642cb6eb9a060e54bf8d69288fbee4904'
),
'entries': [],
"id": hash_to_bytes("4b825dc642cb6eb9a060e54bf8d69288fbee4904"),
"entries": [],
}
# Generated with generate_testdata_from_disk
self.tarball_contents = {
b'': {
'entries': [{
'name': b'bar',
'perms': DentryPerms.directory,
'target': hash_to_bytes(
'3c1f578394f4623f74a0ba7fe761729f59fc6ec4'
),
'type': 'dir',
}, {
'name': b'empty-folder',
'perms': DentryPerms.directory,
'target': hash_to_bytes(
'4b825dc642cb6eb9a060e54bf8d69288fbee4904'
),
'type': 'dir',
}, {
'name': b'foo',
'perms': DentryPerms.directory,
'target': hash_to_bytes(
'2b41c40f0d1fbffcba12497db71fba83fcca96e5'
),
'type': 'dir',
}, {
'name': b'link-to-another-quote',
'perms': DentryPerms.symlink,
'target': hash_to_bytes(
'7d5c08111e21c8a9f71540939998551683375fad'
),
'type': 'file',
}, {
'name': b'link-to-binary',
'perms': DentryPerms.symlink,
'target': hash_to_bytes(
'e86b45e538d9b6888c969c89fbd22a85aa0e0366'
),
'type': 'file',
}, {
'name': b'link-to-foo',
'perms': DentryPerms.symlink,
'target': hash_to_bytes(
'19102815663d23f8b75a47e7a01965dcdc96468c'
),
'type': 'file',
}, {
'name': b'some-binary',
'perms': DentryPerms.executable_content,
'target': hash_to_bytes(
'68769579c3eaadbe555379b9c3538e6628bae1eb'
),
'type': 'file',
}],
'id': hash_to_bytes(
'e8b0f1466af8608c8a3fb9879db172b887e80759'
),
b"": {
"entries": [
{
"name": b"bar",
"perms": DentryPerms.directory,
"target": hash_to_bytes(
"3c1f578394f4623f74a0ba7fe761729f59fc6ec4"
),
"type": "dir",
},
{
"name": b"empty-folder",
"perms": DentryPerms.directory,
"target": hash_to_bytes(
"4b825dc642cb6eb9a060e54bf8d69288fbee4904"
),
"type": "dir",
},
{
"name": b"foo",
"perms": DentryPerms.directory,
"target": hash_to_bytes(
"2b41c40f0d1fbffcba12497db71fba83fcca96e5"
),
"type": "dir",
},
{
"name": b"link-to-another-quote",
"perms": DentryPerms.symlink,
"target": hash_to_bytes(
"7d5c08111e21c8a9f71540939998551683375fad"
),
"type": "file",
},
{
"name": b"link-to-binary",
"perms": DentryPerms.symlink,
"target": hash_to_bytes(
"e86b45e538d9b6888c969c89fbd22a85aa0e0366"
),
"type": "file",
},
{
"name": b"link-to-foo",
"perms": DentryPerms.symlink,
"target": hash_to_bytes(
"19102815663d23f8b75a47e7a01965dcdc96468c"
),
"type": "file",
},
{
"name": b"some-binary",
"perms": DentryPerms.executable_content,
"target": hash_to_bytes(
"68769579c3eaadbe555379b9c3538e6628bae1eb"
),
"type": "file",
},
],
"id": hash_to_bytes("e8b0f1466af8608c8a3fb9879db172b887e80759"),
},
b'bar': {
'entries': [{
'name': b'barfoo',
'perms': DentryPerms.directory,
'target': hash_to_bytes(
'c3020f6bf135a38c6df3afeb5fb38232c5e07087'
),
'type': 'dir',
}],
'id': hash_to_bytes(
'3c1f578394f4623f74a0ba7fe761729f59fc6ec4'
),
b"bar": {
"entries": [
{
"name": b"barfoo",
"perms": DentryPerms.directory,
"target": hash_to_bytes(
"c3020f6bf135a38c6df3afeb5fb38232c5e07087"
),
"type": "dir",
}
],
"id": hash_to_bytes("3c1f578394f4623f74a0ba7fe761729f59fc6ec4"),
},
b'bar/barfoo': {
'entries': [{
'name': b'another-quote.org',
'perms': DentryPerms.content,
'target': hash_to_bytes(
'133693b125bad2b4ac318535b84901ebb1f6b638'
),
'type': 'file',
}],
'id': hash_to_bytes(
'c3020f6bf135a38c6df3afeb5fb38232c5e07087'
),
b"bar/barfoo": {
"entries": [
{
"name": b"another-quote.org",
"perms": DentryPerms.content,
"target": hash_to_bytes(
"133693b125bad2b4ac318535b84901ebb1f6b638"
),
"type": "file",
}
],
"id": hash_to_bytes("c3020f6bf135a38c6df3afeb5fb38232c5e07087"),
},
b'bar/barfoo/another-quote.org': {
'blake2s256': hash_to_bytes(
'd26c1cad82d43df0bffa5e7be11a60e3'
'4adb85a218b433cbce5278b10b954fe8'
),
'length': 72,
'perms': DentryPerms.content,
'sha1': hash_to_bytes(
'90a6138ba59915261e179948386aa1cc2aa9220a'
),
'sha1_git': hash_to_bytes(
'133693b125bad2b4ac318535b84901ebb1f6b638'
b"bar/barfoo/another-quote.org": {
"blake2s256": hash_to_bytes(
"d26c1cad82d43df0bffa5e7be11a60e3"
"4adb85a218b433cbce5278b10b954fe8"
),
'sha256': hash_to_bytes(
'3db5ae168055bcd93a4d08285dc99ffe'
'e2883303b23fac5eab850273a8ea5546'
"length": 72,
"perms": DentryPerms.content,
"sha1": hash_to_bytes("90a6138ba59915261e179948386aa1cc2aa9220a"),
"sha1_git": hash_to_bytes("133693b125bad2b4ac318535b84901ebb1f6b638"),
"sha256": hash_to_bytes(
"3db5ae168055bcd93a4d08285dc99ffe"
"e2883303b23fac5eab850273a8ea5546"
),
},
b'empty-folder': {
'entries': [],
'id': hash_to_bytes(
'4b825dc642cb6eb9a060e54bf8d69288fbee4904'
),
b"empty-folder": {
"entries": [],
"id": hash_to_bytes("4b825dc642cb6eb9a060e54bf8d69288fbee4904"),
},
b'foo': {
'entries': [{
'name': b'barfoo',
'perms': DentryPerms.symlink,
'target': hash_to_bytes(
'8185dfb2c0c2c597d16f75a8a0c37668567c3d7e'
),
'type': 'file',
}, {
'name': b'quotes.md',
'perms': DentryPerms.content,
'target': hash_to_bytes(
'7c4c57ba9ff496ad179b8f65b1d286edbda34c9a'
),
'type': 'file',
}, {
'name': b'rel-link-to-barfoo',
'perms': DentryPerms.symlink,
'target': hash_to_bytes(
'acac326ddd63b0bc70840659d4ac43619484e69f'
),
'type': 'file',
}],
'id': hash_to_bytes(
'2b41c40f0d1fbffcba12497db71fba83fcca96e5'
),
b"foo": {
"entries": [
{
"name": b"barfoo",
"perms": DentryPerms.symlink,
"target": hash_to_bytes(
"8185dfb2c0c2c597d16f75a8a0c37668567c3d7e"
),
"type": "file",
},
{
"name": b"quotes.md",
"perms": DentryPerms.content,
"target": hash_to_bytes(
"7c4c57ba9ff496ad179b8f65b1d286edbda34c9a"
),
"type": "file",
},
{
"name": b"rel-link-to-barfoo",
"perms": DentryPerms.symlink,
"target": hash_to_bytes(
"acac326ddd63b0bc70840659d4ac43619484e69f"
),
"type": "file",
},
],
"id": hash_to_bytes("2b41c40f0d1fbffcba12497db71fba83fcca96e5"),
},
b'foo/barfoo': {
'blake2s256': hash_to_bytes(
'e1252f2caa4a72653c4efd9af871b62b'
'f2abb7bb2f1b0e95969204bd8a70d4cd'
),
'data': b'bar/barfoo',
'length': 10,
'perms': DentryPerms.symlink,
'sha1': hash_to_bytes(
'9057ee6d0162506e01c4d9d5459a7add1fedac37'
),
'sha1_git': hash_to_bytes(
'8185dfb2c0c2c597d16f75a8a0c37668567c3d7e'
b"foo/barfoo": {
"blake2s256": hash_to_bytes(
"e1252f2caa4a72653c4efd9af871b62b"
"f2abb7bb2f1b0e95969204bd8a70d4cd"
),
'sha256': hash_to_bytes(
'29ad3f5725321b940332c78e403601af'
'ff61daea85e9c80b4a7063b6887ead68'
"data": b"bar/barfoo",
"length": 10,
"perms": DentryPerms.symlink,
"sha1": hash_to_bytes("9057ee6d0162506e01c4d9d5459a7add1fedac37"),
"sha1_git": hash_to_bytes("8185dfb2c0c2c597d16f75a8a0c37668567c3d7e"),
"sha256": hash_to_bytes(
"29ad3f5725321b940332c78e403601af"
"ff61daea85e9c80b4a7063b6887ead68"
),
},
b'foo/quotes.md': {
'blake2s256': hash_to_bytes(
'bf7ce4fe304378651ee6348d3e9336ed'
'5ad603d33e83c83ba4e14b46f9b8a80b'
b"foo/quotes.md": {
"blake2s256": hash_to_bytes(
"bf7ce4fe304378651ee6348d3e9336ed"
"5ad603d33e83c83ba4e14b46f9b8a80b"
),
'length': 66,
'perms': DentryPerms.content,
'sha1': hash_to_bytes(
'1bf0bb721ac92c18a19b13c0eb3d741cbfadebfc'
),
'sha1_git': hash_to_bytes(
'7c4c57ba9ff496ad179b8f65b1d286edbda34c9a'
),
'sha256': hash_to_bytes(
'caca942aeda7b308859eb56f909ec96d'
'07a499491690c453f73b9800a93b1659'
"length": 66,
"perms": DentryPerms.content,
"sha1": hash_to_bytes("1bf0bb721ac92c18a19b13c0eb3d741cbfadebfc"),
"sha1_git": hash_to_bytes("7c4c57ba9ff496ad179b8f65b1d286edbda34c9a"),
"sha256": hash_to_bytes(
"caca942aeda7b308859eb56f909ec96d"
"07a499491690c453f73b9800a93b1659"
),
},
b'foo/rel-link-to-barfoo': {
'blake2s256': hash_to_bytes(
'd9c327421588a1cf61f316615005a2e9'
'c13ac3a4e96d43a24138d718fa0e30db'
),
'data': b'../bar/barfoo',
'length': 13,
'perms': DentryPerms.symlink,
'sha1': hash_to_bytes(
'dc51221d308f3aeb2754db48391b85687c2869f4'
),
'sha1_git': hash_to_bytes(
'acac326ddd63b0bc70840659d4ac43619484e69f'
b"foo/rel-link-to-barfoo": {
"blake2s256": hash_to_bytes(
"d9c327421588a1cf61f316615005a2e9"
"c13ac3a4e96d43a24138d718fa0e30db"
),
'sha256': hash_to_bytes(
'8007d20db2af40435f42ddef4b8ad76b'
'80adbec26b249fdf0473353f8d99df08'
"data": b"../bar/barfoo",
"length": 13,
"perms": DentryPerms.symlink,
"sha1": hash_to_bytes("dc51221d308f3aeb2754db48391b85687c2869f4"),
"sha1_git": hash_to_bytes("acac326ddd63b0bc70840659d4ac43619484e69f"),
"sha256": hash_to_bytes(
"8007d20db2af40435f42ddef4b8ad76b"
"80adbec26b249fdf0473353f8d99df08"
),
},
b'link-to-another-quote': {
'blake2s256': hash_to_bytes(
'2d0e73cea01ba949c1022dc10c8a43e6'
'6180639662e5dc2737b843382f7b1910'
b"link-to-another-quote": {
"blake2s256": hash_to_bytes(
"2d0e73cea01ba949c1022dc10c8a43e6"
"6180639662e5dc2737b843382f7b1910"
),
'data': b'bar/barfoo/another-quote.org',
'length': 28,
'perms': DentryPerms.symlink,
'sha1': hash_to_bytes(
'cbeed15e79599c90de7383f420fed7acb48ea171'
),
'sha1_git': hash_to_bytes(
'7d5c08111e21c8a9f71540939998551683375fad'
),
'sha256': hash_to_bytes(
'e6e17d0793aa750a0440eb9ad5b80b25'
'8076637ef0fb68f3ac2e59e4b9ac3ba6'
"data": b"bar/barfoo/another-quote.org",
"length": 28,
"perms": DentryPerms.symlink,
"sha1": hash_to_bytes("cbeed15e79599c90de7383f420fed7acb48ea171"),
"sha1_git": hash_to_bytes("7d5c08111e21c8a9f71540939998551683375fad"),
"sha256": hash_to_bytes(
"e6e17d0793aa750a0440eb9ad5b80b25"
"8076637ef0fb68f3ac2e59e4b9ac3ba6"
),
},
b'link-to-binary': {
'blake2s256': hash_to_bytes(
'9ce18b1adecb33f891ca36664da676e1'
'2c772cc193778aac9a137b8dc5834b9b'
),
'data': b'some-binary',
'length': 11,
'perms': DentryPerms.symlink,
'sha1': hash_to_bytes(
'd0248714948b3a48a25438232a6f99f0318f59f1'
),
'sha1_git': hash_to_bytes(
'e86b45e538d9b6888c969c89fbd22a85aa0e0366'
b"link-to-binary": {
"blake2s256": hash_to_bytes(
"9ce18b1adecb33f891ca36664da676e1"
"2c772cc193778aac9a137b8dc5834b9b"
),
'sha256': hash_to_bytes(
'14126e97d83f7d261c5a6889cee73619'
'770ff09e40c5498685aba745be882eff'
"data": b"some-binary",
"length": 11,
"perms": DentryPerms.symlink,
"sha1": hash_to_bytes("d0248714948b3a48a25438232a6f99f0318f59f1"),
"sha1_git": hash_to_bytes("e86b45e538d9b6888c969c89fbd22a85aa0e0366"),
"sha256": hash_to_bytes(
"14126e97d83f7d261c5a6889cee73619"
"770ff09e40c5498685aba745be882eff"
),
},
b'link-to-foo': {
'blake2s256': hash_to_bytes(
'08d6cad88075de8f192db097573d0e82'
'9411cd91eb6ec65e8fc16c017edfdb74'
b"link-to-foo": {
"blake2s256": hash_to_bytes(
"08d6cad88075de8f192db097573d0e82"
"9411cd91eb6ec65e8fc16c017edfdb74"
),
'data': b'foo',
'length': 3,
'perms': DentryPerms.symlink,
'sha1': hash_to_bytes(
'0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33'
),
'sha1_git': hash_to_bytes(
'19102815663d23f8b75a47e7a01965dcdc96468c'
),
'sha256': hash_to_bytes(
'2c26b46b68ffc68ff99b453c1d304134'
'13422d706483bfa0f98a5e886266e7ae'
"data": b"foo",
"length": 3,
"perms": DentryPerms.symlink,
"sha1": hash_to_bytes("0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33"),
"sha1_git": hash_to_bytes("19102815663d23f8b75a47e7a01965dcdc96468c"),
"sha256": hash_to_bytes(
"2c26b46b68ffc68ff99b453c1d304134"
"13422d706483bfa0f98a5e886266e7ae"
),
},
b'some-binary': {
'blake2s256': hash_to_bytes(
'922e0f7015035212495b090c27577357'
'a740ddd77b0b9e0cd23b5480c07a18c6'
),
'length': 5,
'perms': DentryPerms.executable_content,
'sha1': hash_to_bytes(
'0bbc12d7f4a2a15b143da84617d95cb223c9b23c'
),
'sha1_git': hash_to_bytes(
'68769579c3eaadbe555379b9c3538e6628bae1eb'
b"some-binary": {
"blake2s256": hash_to_bytes(
"922e0f7015035212495b090c27577357"
"a740ddd77b0b9e0cd23b5480c07a18c6"
),
'sha256': hash_to_bytes(
'bac650d34a7638bb0aeb5342646d24e3'
'b9ad6b44c9b383621faa482b990a367d'
"length": 5,
"perms": DentryPerms.executable_content,
"sha1": hash_to_bytes("0bbc12d7f4a2a15b143da84617d95cb223c9b23c"),
"sha1_git": hash_to_bytes("68769579c3eaadbe555379b9c3538e6628bae1eb"),
"sha256": hash_to_bytes(
"bac650d34a7638bb0aeb5342646d24e3"
"b9ad6b44c9b383621faa482b990a367d"
),
},
}
@@ -401,28 +570,27 @@ class DataMixin:
def tearDown(self):
self.tmpdir.cleanup()
def assertContentEqual(self, left, right, *, check_data=False, # noqa
check_path=False):
def assertContentEqual(self, left, right, *, check_path=False): # noqa
if not isinstance(left, Content):
raise ValueError('%s is not a Content' % left)
raise ValueError("%s is not a Content" % left)
if isinstance(right, Content):
right = right.get_data()
# Compare dictionaries
keys = DEFAULT_ALGORITHMS | {
'length',
'perms',
"length",
"perms",
}
if check_data:
keys |= {'data'}
if check_path:
keys |= {'path'}
keys |= {"path"}
failed = []
for key in keys:
try:
lvalue = left.data[key]
if key == 'perms' and 'perms' not in right:
rvalue = from_disk.mode_to_perms(right['mode'])
if key == "perms" and "perms" not in right:
rvalue = from_disk.mode_to_perms(right["mode"])
else:
rvalue = right[key]
except KeyError:
@@ -434,33 +602,35 @@ class DataMixin:
if failed:
raise self.failureException(
'Content mismatched:\n' +
'\n'.join(
'content[%s] = %r != %r' % (
key, left.data.get(key), right.get(key))
"Content mismatched:\n"
+ "\n".join(
"content[%s] = %r != %r" % (key, left.data.get(key), right.get(key))
for key in failed
)
)
def assertDirectoryEqual(self, left, right): # NoQA
if not isinstance(left, Directory):
raise ValueError('%s is not a Directory' % left)
raise ValueError("%s is not a Directory" % left)
if isinstance(right, Directory):
right = right.get_data()
return self.assertCountEqual(left.entries, right['entries'])
assert left.entries == right["entries"]
assert left.hash == right["id"]
assert left.to_model() == model.Directory.from_dict(right)
def make_contents(self, directory):
for filename, content in self.contents.items():
path = os.path.join(directory, filename)
with open(path, 'wb') as f:
f.write(content['data'])
os.chmod(path, content['mode'])
with open(path, "wb") as f:
f.write(content["data"])
os.chmod(path, content["mode"])
def make_symlinks(self, directory):
for filename, symlink in self.symlinks.items():
path = os.path.join(directory, filename)
os.symlink(symlink['data'], path)
os.symlink(symlink["data"], path)
def make_specials(self, directory):
for filename, fn in self.specials.items():
@@ -468,9 +638,9 @@ class DataMixin:
fn(path)
def make_from_tarball(self, directory):
tarball = os.path.join(TEST_DATA, 'dir-folders', 'sample-folder.tgz')
tarball = os.path.join(TEST_DATA, "dir-folders", "sample-folder.tgz")
with tarfile.open(tarball, 'r:gz') as f:
with tarfile.open(tarball, "r:gz") as f:
f.extractall(os.fsdecode(directory))
@@ -480,11 +650,28 @@ class TestContent(DataMixin, unittest.TestCase):
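# Content.from_bytes() computes the default checksums for the given bytes;
# swhid() then builds the "swh:1:cnt:<sha1_git>" identifier from the sha1_git
# checksum, which is what the next two tests verify.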
def test_data_to_content(self):
for filename, content in self.contents.items():
conv_content = Content.from_bytes(mode=content['mode'],
data=content['data'])
conv_content = Content.from_bytes(
mode=content["mode"], data=content["data"]
)
self.assertContentEqual(conv_content, content)
self.assertIn(hash_to_hex(conv_content.hash), repr(conv_content))
def test_content_swhid(self):
for _, content in self.contents.items():
content_res = Content.from_bytes(mode=content["mode"], data=content["data"])
content_swhid = "swh:1:cnt:" + hash_to_hex(content["sha1_git"])
assert str(content_res.swhid()) == content_swhid
class TestDirectory(DataMixin, unittest.TestCase):
def setUp(self):
super().setUp()
def test_directory_swhid(self):
directory_swhid = "swh:1:dir:" + hash_to_hex(self.empty_directory["id"])
directory = Directory.from_disk(path=self.tmpdir_name)
assert str(directory.swhid()) == directory_swhid
class SymlinkToContent(DataMixin, unittest.TestCase):
def setUp(self):
@@ -496,7 +683,21 @@ class SymlinkToContent(DataMixin, unittest.TestCase):
path = os.path.join(self.tmpdir_name, filename)
perms = 0o120000
conv_content = Content.from_symlink(path=path, mode=perms)
self.assertContentEqual(conv_content, symlink)
symlink_copy = symlink.copy()
symlink_copy["path"] = path
self.assertContentEqual(conv_content, symlink_copy, check_path=True)
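# to_model() converts a from_disk Content into a swh.model.model.Content:
# the disk-only keys (perms, path, mode) are dropped and the status defaults
# to "visible", as the comparison below shows.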
def test_symlink_to_base_model(self):
for filename, symlink in self.symlinks.items():
path = os.path.join(self.tmpdir_name, filename)
perms = 0o120000
model_content = Content.from_symlink(path=path, mode=perms).to_model()
right = symlink.copy()
for key in ("perms", "path", "mode"):
right.pop(key, None)
right["status"] = "visible"
assert model_content == model.Content.from_dict(right)
class FileToContent(DataMixin, unittest.TestCase):
@@ -506,186 +707,410 @@ class FileToContent(DataMixin, unittest.TestCase):
self.make_symlinks(self.tmpdir_name)
self.make_specials(self.tmpdir_name)
def test_symlink_to_content(self):
for filename, symlink in self.symlinks.items():
path = os.path.join(self.tmpdir_name, filename)
conv_content = Content.from_file(path=path)
self.assertContentEqual(conv_content, symlink)
def test_file_to_content(self):
# Check whether loading the data works
for data in [True, False]:
for filename, content in self.contents.items():
path = os.path.join(self.tmpdir_name, filename)
conv_content = Content.from_file(path=path)
self.assertContentEqual(conv_content, content)
def test_special_to_content(self):
for filename in self.specials:
path = os.path.join(self.tmpdir_name, filename)
conv_content = Content.from_file(path=path)
self.assertContentEqual(conv_content, self.empty_content)
for path in ["/dev/null", "/dev/zero"]:
path = os.path.join(self.tmpdir_name, filename)
conv_content = Content.from_file(path=path)
self.assertContentEqual(conv_content, self.empty_content)
def test_symlink_to_content_model(self):
for filename, symlink in self.symlinks.items():
path = os.path.join(self.tmpdir_name, filename)
model_content = Content.from_file(path=path).to_model()
right = symlink.copy()
for key in ("perms", "path", "mode"):
right.pop(key, None)
right["status"] = "visible"
assert model_content == model.Content.from_dict(right)
def test_file_to_content_model(self):
for filename, content in self.contents.items():
path = os.path.join(self.tmpdir_name, filename)
model_content = Content.from_file(path=path).to_model()
right = content.copy()
for key in ("perms", "mode"):
right.pop(key, None)
assert model_content.with_data() == model.Content.from_dict(right)
right["get_data"] = DiskBackedData(path=path)
del right["data"]
assert model_content == model.Content.from_dict(right)
def test_special_to_content_model(self):
for filename in self.specials:
path = os.path.join(self.tmpdir_name, filename)
model_content = Content.from_file(path=path).to_model()
right = self.empty_content.copy()
for key in ("perms", "path", "mode"):
right.pop(key, None)
right["status"] = "visible"
assert model_content == model.Content.from_dict(right)
for path in ["/dev/null", "/dev/zero"]:
model_content = Content.from_file(path=path).to_model()
right = self.empty_content.copy()
for key in ("perms", "path", "mode"):
right.pop(key, None)
right["status"] = "visible"
assert model_content == model.Content.from_dict(right)
def test_symlink_max_length(self):
for max_content_length in [4, 10]:
for filename, symlink in self.symlinks.items():
path = os.path.join(self.tmpdir_name, filename)
conv_content = Content.from_file(path=path, data=data)
self.assertContentEqual(conv_content, symlink, check_data=data)
content = Content.from_file(path=path)
if content.data["length"] > max_content_length:
with pytest.raises(Exception, match="too large"):
Content.from_file(
path=path, max_content_length=max_content_length
)
else:
limited_content = Content.from_file(
path=path, max_content_length=max_content_length
)
assert content == limited_content
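# Regular files behave differently from symlinks when max_content_length is
# exceeded: instead of raising, the content is kept with status "absent" and
# reason "Content too large", while its length is still recorded (next test).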
def test_file_max_length(self):
for max_content_length in [2, 4]:
for filename, content in self.contents.items():
path = os.path.join(self.tmpdir_name, filename)
conv_content = Content.from_file(path=path, data=data)
self.assertContentEqual(conv_content, content, check_data=data)
content = Content.from_file(path=path)
limited_content = Content.from_file(
path=path, max_content_length=max_content_length
)
assert content.data["length"] == limited_content.data["length"]
assert content.data["status"] == "visible"
if content.data["length"] > max_content_length:
assert limited_content.data["status"] == "absent"
assert limited_content.data["reason"] == "Content too large"
else:
assert limited_content.data["status"] == "visible"
def test_special_file_max_length(self):
for max_content_length in [None, 0, 1]:
for filename in self.specials:
path = os.path.join(self.tmpdir_name, filename)
conv_content = Content.from_file(path=path, data=data)
self.assertContentEqual(conv_content, self.empty_content)
content = Content.from_file(path=path)
limited_content = Content.from_file(
path=path, max_content_length=max_content_length
)
assert limited_content == content
def test_file_to_content_with_path(self):
for filename, content in self.contents.items():
content_w_path = content.copy()
path = os.path.join(self.tmpdir_name, filename)
content_w_path['path'] = path
conv_content = Content.from_file(path=path, save_path=True)
self.assertContentEqual(conv_content, content_w_path,
check_path=True)
content_w_path["path"] = path
conv_content = Content.from_file(path=path)
self.assertContentEqual(conv_content, content_w_path, check_path=True)
class DirectoryToObjects(DataMixin, unittest.TestCase):
def setUp(self):
super().setUp()
contents = os.path.join(self.tmpdir_name, b'contents')
contents = os.path.join(self.tmpdir_name, b"contents")
os.mkdir(contents)
self.make_contents(contents)
symlinks = os.path.join(self.tmpdir_name, b'symlinks')
symlinks = os.path.join(self.tmpdir_name, b"symlinks")
os.mkdir(symlinks)
self.make_symlinks(symlinks)
specials = os.path.join(self.tmpdir_name, b'specials')
specials = os.path.join(self.tmpdir_name, b"specials")
os.mkdir(specials)
self.make_specials(specials)
empties = os.path.join(self.tmpdir_name, b'empty1', b'empty2')
empties = os.path.join(self.tmpdir_name, b"empty1", b"empty2")
os.makedirs(empties)
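# The fixture laid out above contains contents/, symlinks/, specials/ and the
# nested empty directories empty1/empty2; check_collect() counts the Content
# and Directory objects that Directory.collect() yields for it.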
def check_collect(
self, directory, expected_directory_count, expected_content_count
):
objs = directory.collect()
contents = []
directories = []
for obj in objs:
if isinstance(obj, Content):
contents.append(obj)
elif isinstance(obj, Directory):
directories.append(obj)
self.assertEqual(len(directories), expected_directory_count)
self.assertEqual(len(contents), expected_content_count)
def test_directory_to_objects(self):
directory = Directory.from_disk(path=self.tmpdir_name)
for name, value in self.contents.items():
self.assertContentEqual(directory[b'contents/' + name], value)
self.assertContentEqual(directory[b"contents/" + name], value)
for name, value in self.symlinks.items():
self.assertContentEqual(directory[b'symlinks/' + name], value)
self.assertContentEqual(directory[b"symlinks/" + name], value)
for name in self.specials:
self.assertContentEqual(
directory[b'specials/' + name],
directory[b"specials/" + name],
self.empty_content,
)
self.assertEqual(
directory[b'empty1/empty2'].get_data(),
directory[b"empty1/empty2"].get_data(),
self.empty_directory,
)
# Raise on a non-existent file
with self.assertRaisesRegex(KeyError, "b'nonexistent'"):
directory[b'empty1/nonexistent']
directory[b"empty1/nonexistent"]
# Raise on a non-existent directory
with self.assertRaisesRegex(KeyError, "b'nonexistentdir'"):
directory[b'nonexistentdir/file']
directory[b"nonexistentdir/file"]
objs = directory.collect()
self.assertCountEqual(['content', 'directory'], objs)
self.assertEqual(len(objs['directory']), 6)
self.assertEqual(len(objs['content']),
len(self.contents)
+ len(self.symlinks)
+ 1)
self.check_collect(
directory,
expected_directory_count=6,
expected_content_count=len(self.contents) + len(self.symlinks) + 1,
)
def test_directory_to_objects_ignore_empty(self):
directory = Directory.from_disk(
path=self.tmpdir_name,
dir_filter=from_disk.ignore_empty_directories
path=self.tmpdir_name, path_filter=from_disk.ignore_empty_directories
)
for name, value in self.contents.items():
self.assertContentEqual(directory[b'contents/' + name], value)
self.assertContentEqual(directory[b"contents/" + name], value)
for name, value in self.symlinks.items():
self.assertContentEqual(directory[b'symlinks/' + name], value)
self.assertContentEqual(directory[b"symlinks/" + name], value)
for name in self.specials:
self.assertContentEqual(
directory[b'specials/' + name],
directory[b"specials/" + name],
self.empty_content,
)
# empty directories have been ignored recursively
with self.assertRaisesRegex(KeyError, "b'empty1'"):
directory[b'empty1']
directory[b"empty1"]
with self.assertRaisesRegex(KeyError, "b'empty1'"):
directory[b'empty1/empty2']
objs = directory.collect()
self.assertCountEqual(['content', 'directory'], objs)
directory[b"empty1/empty2"]
self.assertEqual(len(objs['directory']), 4)
self.assertEqual(len(objs['content']),
len(self.contents)
+ len(self.symlinks)
+ 1)
self.check_collect(
directory,
expected_directory_count=4,
expected_content_count=len(self.contents) + len(self.symlinks) + 1,
)
def test_directory_to_objects_ignore_name(self):
pfilter = from_disk.ignore_named_directories([b"symlinks"])
directory = Directory.from_disk(
path=self.tmpdir_name,
dir_filter=from_disk.ignore_named_directories([b'symlinks'])
path_filter=pfilter,
)
for name, value in self.contents.items():
self.assertContentEqual(directory[b'contents/' + name], value)
self.assertContentEqual(directory[b"contents/" + name], value)
for name in self.specials:
self.assertContentEqual(
directory[b'specials/' + name],
directory[b"specials/" + name],
self.empty_content,
)
self.assertEqual(
directory[b'empty1/empty2'].get_data(),
directory[b"empty1/empty2"].get_data(),
self.empty_directory,
)
with self.assertRaisesRegex(KeyError, "b'symlinks'"):
directory[b'symlinks']
objs = directory.collect()
directory[b"symlinks"]
self.assertCountEqual(['content', 'directory'], objs)
self.check_collect(
directory,
expected_directory_count=5,
expected_content_count=len(self.contents) + 1,
)
self.assertEqual(len(objs['directory']), 5)
self.assertEqual(len(objs['content']),
len(self.contents)
+ 1)
def test_directory_to_objects_ignore_name_with_slash(self):
self.tmpdir_name = self.tmpdir_name + b"/"
self.test_directory_to_objects_ignore_name()
def test_directory_to_objects_ignore_name_case(self):
directory = Directory.from_disk(
path=self.tmpdir_name,
dir_filter=from_disk.ignore_named_directories([b'symLiNks'],
case_sensitive=False)
path_filter=from_disk.ignore_named_directories(
[b"symLiNks"], case_sensitive=False
),
)
for name, value in self.contents.items():
self.assertContentEqual(directory[b'contents/' + name], value)
self.assertContentEqual(directory[b"contents/" + name], value)
for name in self.specials:
self.assertContentEqual(
directory[b'specials/' + name],
directory[b"specials/" + name],
self.empty_content,
)
self.assertEqual(
directory[b'empty1/empty2'].get_data(),
directory[b"empty1/empty2"].get_data(),
self.empty_directory,
)
with self.assertRaisesRegex(KeyError, "b'symlinks'"):
directory[b'symlinks']
directory[b"symlinks"]
objs = directory.collect()
self.check_collect(
directory,
expected_directory_count=5,
expected_content_count=len(self.contents) + 1,
)
def test_directory_entry_order(self):
with tempfile.TemporaryDirectory() as dirname:
dirname = os.fsencode(dirname)
mk_tree(
dirname,
b"""
/foo.
/foo0
/foo/
""",
)
directory = Directory.from_disk(path=dirname)
assert [entry["name"] for entry in directory.entries] == [
b"foo.",
b"foo",
b"foo0",
]
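# The expected order matches git tree sorting, where a directory name is
# compared as if it ended with "/": b"foo." < b"foo/" < b"foo0".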
def test_directory_path_filter(self):
def filter_func(path, name, entries):
return name.startswith(b"foo")
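# As the assertions below show, path_filter only prunes directories: b"baz"
# is dropped because its name does not start with b"foo", while regular files
# (b"file", b"foofile") are always kept.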
with tempfile.TemporaryDirectory() as dirname:
dirname = os.fsencode(dirname)
mk_tree(
dirname,
b"""
/foofile
/file
/foo/foo/
/baz/
""",
)
# No filters
directory = Directory.from_disk(path=dirname)
assert [entry["name"] for entry in directory.entries] == [
b"baz",
b"file",
b"foo",
b"foofile",
]
# Filter paths
directory = Directory.from_disk(path=dirname, path_filter=filter_func)
assert [entry["name"] for entry in directory.entries] == [
b"foo",
b"foofile",
]
def test_directory_progress_callback(self):
total = []
def update_info(arg):
assert type(arg) is int
total.append(arg)
Directory.from_disk(path=self.tmpdir_name, progress_callback=update_info)
# Corresponds to the deeper files and directories plus the four top-level ones
assert total == [4, 1, 1, 1, 1]
def test_exclude_trailing(self):
self.test_exclude(trailing_slash=True)
def test_exclude(self, trailing_slash=False):
"""exclude patterns"""
with tempfile.TemporaryDirectory() as dirname:
dirname = os.fsencode(dirname)
mk_tree(
dirname,
b"""
/foofile
/file
/foo/foo/
/baz/
/excluded_dir/file
/excluded_dir\x96/file
/excluded_dir2/
/excluded_dir2\x96/
/foo/excluded_dir/
/foo/excluded_dir2\x96/
""",
)
self.assertCountEqual(['content', 'directory'], objs)
# no filter
dir_path = dirname
if trailing_slash:
dir_path += b"/"
directory = Directory.from_disk(path=dir_path)
assert set(directory.keys()) == {
b"baz",
b"foo",
b"excluded_dir2\x96",
b"excluded_dir",
b"excluded_dir\x96",
b"excluded_dir2",
b"foofile",
b"file",
}
assert set(directory[b"foo"].keys()) == {
b"foo",
b"excluded_dir2\x96",
b"excluded_dir",
}
assert (
str(directory.swhid())
== "swh:1:dir:cd4dfab9b3e160a683f036841e03855929a07286"
)
self.assertEqual(len(objs['directory']), 5)
self.assertEqual(len(objs['content']),
len(self.contents)
+ 1)
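# ignore_directories_patterns() turns the exclude patterns into a path_filter;
# top-level directories matching b"excluded_*" are then dropped from the
# resulting Directory, which also changes its swhid (checked below).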
from swh.model.from_disk import ignore_directories_patterns
exclude_patterns = [b"excluded_*"]
path_filter = ignore_directories_patterns(dirname, exclude_patterns)
directory_f = Directory.from_disk(path=dir_path, path_filter=path_filter)
assert set(directory_f.keys()) == {b"baz", b"foo", b"foofile", b"file"}
# XXX should foo/excluded_dir and foo/excluded_dir2 be excluded as
# well? Currently they are not
assert set(directory_f[b"foo"].keys()) == {
b"foo",
b"excluded_dir2\x96",
b"excluded_dir",
}
assert (
str(directory_f.swhid())
== "swh:1:dir:adaeb949e1f09d28d334b7e360691ef9df934703"
)
@pytest.mark.fs
class TarballTest(DataMixin, unittest.TestCase):
def setUp(self):
super().setUp()
@@ -693,57 +1118,83 @@ class TarballTest(DataMixin, unittest.TestCase):
def test_contents_match(self):
directory = Directory.from_disk(
path=os.path.join(self.tmpdir_name, b'sample-folder')
path=os.path.join(self.tmpdir_name, b"sample-folder")
)
for name, data in self.tarball_contents.items():
for name, expected in self.tarball_contents.items():
obj = directory[name]
if isinstance(obj, Content):
self.assertContentEqual(obj, data)
self.assertContentEqual(obj, expected)
elif isinstance(obj, Directory):
self.assertDirectoryEqual(obj, data)
self.assertDirectoryEqual(obj, expected)
else:
raise self.failureException('Unknown type for %s' % obj)
raise self.failureException("Unknown type for %s" % obj)
class TarballIterDirectory(DataMixin, unittest.TestCase):
def setUp(self):
super().setUp()
self.make_from_tarball(self.tmpdir_name)
def test_iter_directory(self):
"""Iter from_disk.directory should yield the full arborescence tree"""
directory = Directory.from_disk(
path=os.path.join(self.tmpdir_name, b"sample-folder")
)
contents, skipped_contents, directories = from_disk.iter_directory(directory)
expected_nb = defaultdict(int)
for name in self.tarball_contents.keys():
obj = directory[name]
expected_nb[obj.object_type] += 1
assert len(contents) == expected_nb[FromDiskType.CONTENT] and len(contents) > 0
assert len(skipped_contents) == 0
assert (
len(directories) == expected_nb[FromDiskType.DIRECTORY]
and len(directories) > 0
)
class DirectoryManipulation(DataMixin, unittest.TestCase):
def test_directory_access_nested(self):
d = Directory()
d[b'a'] = Directory()
d[b'a/b'] = Directory()
d[b"a"] = Directory()
d[b"a/b"] = Directory()
self.assertEqual(d[b'a/b'].get_data(), self.empty_directory)
self.assertEqual(d[b"a/b"].get_data(), self.empty_directory)
def test_directory_del_nested(self):
d = Directory()
d[b'a'] = Directory()
d[b'a/b'] = Directory()
d[b"a"] = Directory()
d[b"a/b"] = Directory()
with self.assertRaisesRegex(KeyError, "b'c'"):
del d[b'a/b/c']
del d[b"a/b/c"]
with self.assertRaisesRegex(KeyError, "b'level2'"):
del d[b'a/level2/c']
del d[b"a/level2/c"]
del d[b'a/b']
del d[b"a/b"]
self.assertEqual(d[b'a'].get_data(), self.empty_directory)
self.assertEqual(d[b"a"].get_data(), self.empty_directory)
def test_directory_access_self(self):
d = Directory()
self.assertIs(d, d[b''])
self.assertIs(d, d[b'/'])
self.assertIs(d, d[b'//'])
self.assertIs(d, d[b""])
self.assertIs(d, d[b"/"])
self.assertIs(d, d[b"//"])
def test_directory_access_wrong_type(self):
d = Directory()
with self.assertRaisesRegex(ValueError, 'bytes from Directory'):
d['foo']
with self.assertRaisesRegex(ValueError, 'bytes from Directory'):
with self.assertRaisesRegex(ValueError, "bytes from Directory"):
d["foo"]
with self.assertRaisesRegex(ValueError, "bytes from Directory"):
d[42]
def test_directory_repr(self):
entries = [b'a', b'b', b'c']
entries = [b"a", b"b", b"c"]
d = Directory()
for entry in entries:
d[entry] = Directory()
@@ -756,32 +1207,48 @@ class DirectoryManipulation(DataMixin, unittest.TestCase):
def test_directory_set_wrong_type_name(self):
d = Directory()
with self.assertRaisesRegex(ValueError, 'bytes Directory entry'):
d['foo'] = Directory()
with self.assertRaisesRegex(ValueError, 'bytes Directory entry'):
with self.assertRaisesRegex(ValueError, "bytes Directory entry"):
d["foo"] = Directory()
with self.assertRaisesRegex(ValueError, "bytes Directory entry"):
d[42] = Directory()
def test_directory_set_nul_in_name(self):
d = Directory()
with self.assertRaisesRegex(ValueError, 'nul bytes'):
d[b'\x00\x01'] = Directory()
with self.assertRaisesRegex(ValueError, "nul bytes"):
d[b"\x00\x01"] = Directory()
def test_directory_set_empty_name(self):
d = Directory()
with self.assertRaisesRegex(ValueError, 'must have a name'):
d[b''] = Directory()
with self.assertRaisesRegex(ValueError, 'must have a name'):
d[b'/'] = Directory()
with self.assertRaisesRegex(ValueError, "must have a name"):
d[b""] = Directory()
with self.assertRaisesRegex(ValueError, "must have a name"):
d[b"/"] = Directory()
def test_directory_set_wrong_type(self):
d = Directory()
with self.assertRaisesRegex(ValueError, 'Content or Directory'):
d[b'entry'] = object()
with self.assertRaisesRegex(ValueError, "Content or Directory"):
d[b"entry"] = object()
def test_directory_del_wrong_type(self):
d = Directory()
with self.assertRaisesRegex(ValueError, 'bytes Directory entry'):
del d['foo']
with self.assertRaisesRegex(ValueError, 'bytes Directory entry'):
with self.assertRaisesRegex(ValueError, "bytes Directory entry"):
del d["foo"]
with self.assertRaisesRegex(ValueError, "bytes Directory entry"):
del d[42]
def test_directory_contains(self):
d = Directory()
d[b"a"] = Directory()
d[b"a/b"] = Directory()
d[b"a/b/c"] = Directory()
d[b"a/b/c/d"] = Content()
self.assertIn(b"a", d)
self.assertIn(b"a/b", d)
self.assertIn(b"a/b/c", d)
self.assertIn(b"a/b/c/d", d)
self.assertNotIn(b"b", d)
self.assertNotIn(b"b/c", d)
self.assertNotIn(b"b/c/d", d)
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.model.model import BaseContent, Origin
from .generate_testdata import ORIGINS, gen_contents, gen_origins
def test_gen_origins_empty():
origins = gen_origins(0)
assert not origins
def test_gen_origins_one():
origins = gen_origins(1)
assert len(origins) == 1
assert [Origin.from_dict(d) for d in origins]
def test_gen_origins_default():
origins = gen_origins()
assert len(origins) == 100
models = [Origin.from_dict(d).url for d in origins]
assert len(origins) == len(set(models))
def test_gen_origins_max():
nmax = len(ORIGINS)
origins = gen_origins(nmax + 1)
assert len(origins) == nmax
models = {Origin.from_dict(d).url for d in origins}
# ensure we did not generate the same origin twice
assert len(origins) == len(models)
def test_gen_contents_empty():
contents = gen_contents(0)
assert not contents
def test_gen_contents_one():
contents = gen_contents(1)
assert len(contents) == 1
assert [BaseContent.from_dict(d) for d in contents]
def test_gen_contents_default():
contents = gen_contents()
assert len(contents) == 20
models = {BaseContent.from_dict(d) for d in contents}
# ensure we did not generate the same content twice
assert len(contents) == len(models)
# Copyright (C) 2015-2018 The Software Heritage developers
# Copyright (C) 2015-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import contextlib
import hashlib
import io
import os
import tempfile
import unittest
from unittest.mock import patch
import pytest
from swh.model import hashutil
from swh.model.hashutil import MultiHash
class BaseHashutil(unittest.TestCase):
def setUp(self):
# Reset function cache
hashutil._blake2_hash_cache = {}
self.data = b'1984\n'
self.hex_checksums = {
'sha1': '62be35bf00ff0c624f4a621e2ea5595a049e0731',
'sha1_git': '568aaf43d83b2c3df8067f3bedbb97d83260be6d',
'sha256': '26602113b4b9afd9d55466b08580d3c2'
'4a9b50ee5b5866c0d91fab0e65907311',
'blake2s256': '63cfb259e1fdb485bc5c55749697a6b21ef31fb7445f6c78a'
'c9422f9f2dc8906',
from swh.model.hashutil import DEFAULT_ALGORITHMS, MultiHash, hash_to_bytehex
@contextlib.contextmanager
def patch_blake2(function_name):
try:
with patch(function_name) as mock:
yield mock
finally:
# mocking blake2 inserts mock objects in the cache; we need
# to clean it before the next test runs
hashutil._blake2_hash_cache.clear()
@pytest.fixture(autouse=True)
def blake2_hash_cache_reset():
# Reset function cache
hashutil._blake2_hash_cache = {}
@pytest.fixture
def hash_test_data():
class HashTestData:
data = b"1984\n"
hex_checksums = {
"sha1": "62be35bf00ff0c624f4a621e2ea5595a049e0731",
"sha1_git": "568aaf43d83b2c3df8067f3bedbb97d83260be6d",
"sha256": "26602113b4b9afd9d55466b08580d3c2"
"4a9b50ee5b5866c0d91fab0e65907311",
"blake2s256": "63cfb259e1fdb485bc5c55749697a6b21ef31fb7445f6c78a"
"c9422f9f2dc8906",
}
self.checksums = {
type: bytes.fromhex(cksum)
for type, cksum in self.hex_checksums.items()
checksums = {
type: bytes.fromhex(cksum) for type, cksum in hex_checksums.items()
}
self.bytehex_checksums = {
type: hashutil.hash_to_bytehex(cksum)
for type, cksum in self.checksums.items()
bytehex_checksums = {
type: hashutil.hash_to_bytehex(cksum) for type, cksum in checksums.items()
}
self.git_hex_checksums = {
'blob': self.hex_checksums['sha1_git'],
'tree': '5b2e883aa33d2efab98442693ea4dd5f1b8871b0',
'commit': '79e4093542e72f0fcb7cbd75cb7d270f9254aa8f',
'tag': 'd6bf62466f287b4d986c545890716ce058bddf67',
git_hex_checksums = {
"blob": hex_checksums["sha1_git"],
"tree": "5b2e883aa33d2efab98442693ea4dd5f1b8871b0",
"commit": "79e4093542e72f0fcb7cbd75cb7d270f9254aa8f",
"tag": "d6bf62466f287b4d986c545890716ce058bddf67",
}
self.git_checksums = {
type: bytes.fromhex(cksum)
for type, cksum in self.git_hex_checksums.items()
git_checksums = {
type: bytes.fromhex(cksum) for type, cksum in git_hex_checksums.items()
}
return HashTestData
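# MultiHash computes several checksums over the same input; the tests below
# feed it bytes (from_data), a file object (from_file) and a file path
# (from_path) and compare digest(), hexdigest() and bytehexdigest() against
# the precomputed values above.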
class MultiHashTest(BaseHashutil):
def test_multi_hash_data(self):
checksums = MultiHash.from_data(self.data).digest()
self.assertEqual(checksums, self.checksums)
self.assertFalse('length' in checksums)
def test_multi_hash_data_with_length(self):
expected_checksums = self.checksums.copy()
expected_checksums['length'] = len(self.data)
def test_multi_hash_data(hash_test_data):
checksums = MultiHash.from_data(hash_test_data.data).digest()
assert checksums == hash_test_data.checksums
assert "length" not in checksums
algos = set(['length']).union(hashutil.DEFAULT_ALGORITHMS)
checksums = MultiHash.from_data(self.data, hash_names=algos).digest()
self.assertEqual(checksums, expected_checksums)
self.assertTrue('length' in checksums)
def test_multi_hash_data_with_length(hash_test_data):
expected_checksums = hash_test_data.checksums.copy()
expected_checksums["length"] = len(hash_test_data.data)
def test_multi_hash_data_unknown_hash(self):
with self.assertRaises(ValueError) as cm:
MultiHash.from_data(self.data, ['unknown-hash'])
algos = set(["length"]).union(hashutil.DEFAULT_ALGORITHMS)
checksums = MultiHash.from_data(hash_test_data.data, hash_names=algos).digest()
self.assertIn('Unexpected hashing algorithm', cm.exception.args[0])
self.assertIn('unknown-hash', cm.exception.args[0])
assert checksums == expected_checksums
assert "length" in checksums
def test_multi_hash_file(self):
fobj = io.BytesIO(self.data)
checksums = MultiHash.from_file(fobj, length=len(self.data)).digest()
self.assertEqual(checksums, self.checksums)
def test_multi_hash_data_unknown_hash(hash_test_data):
with pytest.raises(ValueError, match="Unexpected hashing algorithm.*unknown-hash"):
MultiHash.from_data(hash_test_data.data, ["unknown-hash"])
def test_multi_hash_file_hexdigest(self):
fobj = io.BytesIO(self.data)
length = len(self.data)
checksums = MultiHash.from_file(fobj, length=length).hexdigest()
self.assertEqual(checksums, self.hex_checksums)
def test_multi_hash_file_bytehexdigest(self):
fobj = io.BytesIO(self.data)
length = len(self.data)
checksums = MultiHash.from_file(fobj, length=length).bytehexdigest()
self.assertEqual(checksums, self.bytehex_checksums)
def test_multi_hash_file(hash_test_data):
fobj = io.BytesIO(hash_test_data.data)
def test_multi_hash_file_missing_length(self):
fobj = io.BytesIO(self.data)
with self.assertRaises(ValueError) as cm:
MultiHash.from_file(fobj, hash_names=['sha1_git'])
checksums = MultiHash.from_file(fobj, length=len(hash_test_data.data)).digest()
assert checksums == hash_test_data.checksums
self.assertIn('Missing length', cm.exception.args[0])
def test_multi_hash_path(self):
with tempfile.NamedTemporaryFile(delete=False) as f:
f.write(self.data)
def test_multi_hash_file_hexdigest(hash_test_data):
fobj = io.BytesIO(hash_test_data.data)
length = len(hash_test_data.data)
checksums = MultiHash.from_file(fobj, length=length).hexdigest()
assert checksums == hash_test_data.hex_checksums
hashes = MultiHash.from_path(f.name).digest()
os.remove(f.name)
self.assertEqual(self.checksums, hashes)
def test_multi_hash_file_bytehexdigest(hash_test_data):
fobj = io.BytesIO(hash_test_data.data)
length = len(hash_test_data.data)
checksums = MultiHash.from_file(fobj, length=length).bytehexdigest()
assert checksums == hash_test_data.bytehex_checksums
class Hashutil(BaseHashutil):
EXTRA_HASH_ALGOS = ["md5", "sha512"]
def test_hash_git_data(self):
checksums = {
git_type: hashutil.hash_git_data(self.data, git_type)
for git_type in self.git_checksums
}
self.assertEqual(checksums, self.git_checksums)
def test_hash_git_data_unknown_git_type(self):
with self.assertRaises(ValueError) as cm:
hashutil.hash_git_data(self.data, 'unknown-git-type')
self.assertIn('Unexpected git object type', cm.exception.args[0])
self.assertIn('unknown-git-type', cm.exception.args[0])
def test_hash_to_hex(self):
for type in self.checksums:
hex = self.hex_checksums[type]
hash = self.checksums[type]
self.assertEqual(hashutil.hash_to_hex(hex), hex)
self.assertEqual(hashutil.hash_to_hex(hash), hex)
def test_hash_to_bytes(self):
for type in self.checksums:
hex = self.hex_checksums[type]
hash = self.checksums[type]
self.assertEqual(hashutil.hash_to_bytes(hex), hash)
self.assertEqual(hashutil.hash_to_bytes(hash), hash)
def test_hash_to_bytehex(self):
for algo in self.checksums:
self.assertEqual(self.hex_checksums[algo].encode('ascii'),
hashutil.hash_to_bytehex(self.checksums[algo]))
def test_bytehex_to_hash(self):
for algo in self.checksums:
self.assertEqual(self.checksums[algo],
hashutil.bytehex_to_hash(
self.hex_checksums[algo].encode()))
def test_new_hash_unsupported_hashing_algorithm(self):
try:
hashutil._new_hash('blake2:10')
except ValueError as e:
self.assertEqual(str(e),
'Unexpected hashing algorithm blake2:10, '
'expected one of blake2b512, blake2s256, '
'sha1, sha1_git, sha256')
@pytest.mark.parametrize("hash_algo", EXTRA_HASH_ALGOS)
def test_multi_hash_file_with_extra_hash_algo(hash_test_data, hash_algo):
fobj = io.BytesIO(hash_test_data.data)
checksums = MultiHash.from_file(
fobj,
hash_names=DEFAULT_ALGORITHMS | {hash_algo},
length=len(hash_test_data.data),
).digest()
checksum = {hash_algo: hashlib.new(hash_algo, hash_test_data.data).digest()}
assert checksums == {**hash_test_data.checksums, **checksum}
@pytest.mark.parametrize("hash_algo", EXTRA_HASH_ALGOS)
def test_multi_hash_file_hexdigest_with_extra_hash_algo(hash_test_data, hash_algo):
fobj = io.BytesIO(hash_test_data.data)
length = len(hash_test_data.data)
checksums = MultiHash.from_file(
fobj, hash_names=DEFAULT_ALGORITHMS | {hash_algo}, length=length
).hexdigest()
checksum = {hash_algo: hashlib.new(hash_algo, hash_test_data.data).hexdigest()}
assert checksums == {**hash_test_data.hex_checksums, **checksum}
@pytest.mark.parametrize("hash_algo", EXTRA_HASH_ALGOS)
def test_multi_hash_file_bytehexdigest_with_extra_algo(hash_test_data, hash_algo):
fobj = io.BytesIO(hash_test_data.data)
length = len(hash_test_data.data)
checksums = MultiHash.from_file(
fobj, hash_names=DEFAULT_ALGORITHMS | {hash_algo}, length=length
).bytehexdigest()
checksum = {
hash_algo: hash_to_bytehex(hashlib.new(hash_algo, hash_test_data.data).digest())
}
assert checksums == {**hash_test_data.bytehex_checksums, **checksum}
def test_multi_hash_file_missing_length(hash_test_data):
fobj = io.BytesIO(hash_test_data.data)
with pytest.raises(ValueError, match="Missing length"):
MultiHash.from_file(fobj, hash_names=["sha1_git"])
def test_multi_hash_path(hash_test_data):
with tempfile.NamedTemporaryFile(delete=False) as f:
f.write(hash_test_data.data)
hashes = MultiHash.from_path(f.name).digest()
os.remove(f.name)
assert hash_test_data.checksums == hashes
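# hash_git_data() computes git-style object identifiers (the data is hashed
# together with a "<type> <length>\0" header), which is why the "blob"
# checksum in the fixture equals the sha1_git of the same bytes.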
def test_hash_git_data(hash_test_data):
checksums = {
git_type: hashutil.hash_git_data(hash_test_data.data, git_type)
for git_type in hash_test_data.git_checksums
}
assert checksums == hash_test_data.git_checksums
def test_hash_git_data_unknown_git_type(hash_test_data):
with pytest.raises(
ValueError, match="Unexpected git object type.*unknown-git-type"
):
hashutil.hash_git_data(hash_test_data.data, "unknown-git-type")
@patch('hashlib.new')
def test_new_hash_blake2b_blake2b512_builtin(self, mock_hashlib_new):
if 'blake2b512' not in hashlib.algorithms_available:
self.skipTest('blake2b512 not built-in')
mock_hashlib_new.return_value = sentinel = object()
h = hashutil._new_hash('blake2b512')
self.assertIs(h, sentinel)
mock_hashlib_new.assert_called_with('blake2b512')
@patch('hashlib.new')
def test_new_hash_blake2s_blake2s256_builtin(self, mock_hashlib_new):
if 'blake2s256' not in hashlib.algorithms_available:
self.skipTest('blake2s256 not built-in')
mock_hashlib_new.return_value = sentinel = object()
h = hashutil._new_hash('blake2s256')
self.assertIs(h, sentinel)
mock_hashlib_new.assert_called_with('blake2s256')
def test_new_hash_blake2b_builtin(self):
removed_hash = False
try:
if 'blake2b512' in hashlib.algorithms_available:
removed_hash = True
hashlib.algorithms_available.remove('blake2b512')
if 'blake2b' not in hashlib.algorithms_available:
self.skipTest('blake2b not built in')
with patch('hashlib.blake2b') as mock_blake2b:
mock_blake2b.return_value = sentinel = object()
h = hashutil._new_hash('blake2b512')
self.assertIs(h, sentinel)
mock_blake2b.assert_called_with(digest_size=512//8)
finally:
if removed_hash:
hashlib.algorithms_available.add('blake2b512')
def test_new_hash_blake2s_builtin(self):
removed_hash = False
try:
if 'blake2s256' in hashlib.algorithms_available:
removed_hash = True
hashlib.algorithms_available.remove('blake2s256')
if 'blake2s' not in hashlib.algorithms_available:
self.skipTest('blake2s not built in')
with patch('hashlib.blake2s') as mock_blake2s:
mock_blake2s.return_value = sentinel = object()
h = hashutil._new_hash('blake2s256')
self.assertIs(h, sentinel)
mock_blake2s.assert_called_with(digest_size=256//8)
finally:
if removed_hash:
hashlib.algorithms_available.add('blake2s256')
def test_new_hash_blake2b_pyblake2(self):
if 'blake2b512' in hashlib.algorithms_available:
self.skipTest('blake2b512 built in')
if 'blake2b' in hashlib.algorithms_available:
self.skipTest('blake2b built in')
with patch('pyblake2.blake2b') as mock_blake2b:
mock_blake2b.return_value = sentinel = object()
h = hashutil._new_hash('blake2b512')
self.assertIs(h, sentinel)
mock_blake2b.assert_called_with(digest_size=512//8)
def test_new_hash_blake2s_pyblake2(self):
if 'blake2s256' in hashlib.algorithms_available:
self.skipTest('blake2s256 built in')
if 'blake2s' in hashlib.algorithms_available:
self.skipTest('blake2s built in')
def test_hash_to_hex(hash_test_data):
for type in hash_test_data.checksums:
hex = hash_test_data.hex_checksums[type]
hash = hash_test_data.checksums[type]
assert hashutil.hash_to_hex(hex) == hex
assert hashutil.hash_to_hex(hash) == hex
with patch('pyblake2.blake2s') as mock_blake2s:
mock_blake2s.return_value = sentinel = object()
h = hashutil._new_hash('blake2s256')
def test_hash_to_bytes(hash_test_data):
for type in hash_test_data.checksums:
hex = hash_test_data.hex_checksums[type]
hash = hash_test_data.checksums[type]
assert hashutil.hash_to_bytes(hex) == hash
assert hashutil.hash_to_bytes(hash) == hash
self.assertIs(h, sentinel)
mock_blake2s.assert_called_with(digest_size=256//8)
def test_hash_to_bytehex(hash_test_data):
for algo in hash_test_data.checksums:
hex_checksum = hash_test_data.hex_checksums[algo].encode("ascii")
assert hex_checksum == hashutil.hash_to_bytehex(hash_test_data.checksums[algo])
class HashlibGit(unittest.TestCase):
def setUp(self):
self.blob_data = b'42\n'
def test_bytehex_to_hash(hash_test_data):
for algo in hash_test_data.checksums:
assert hash_test_data.checksums[algo] == hashutil.bytehex_to_hash(
hash_test_data.hex_checksums[algo].encode()
)
self.tree_data = b''.join([b'40000 barfoo\0',
bytes.fromhex('c3020f6bf135a38c6df'
'3afeb5fb38232c5e07087'),
b'100644 blah\0',
bytes.fromhex('63756ef0df5e4f10b6efa'
'33cfe5c758749615f20'),
b'100644 hello\0',
bytes.fromhex('907b308167f0880fb2a'
'5c0e1614bb0c7620f9dc3')])
self.commit_data = """tree 1c61f7259dcb770f46b194d941df4f08ff0a3970
def test_new_hash_unsupported_hashing_algorithm():
expected_message = (
"Unexpected hashing algorithm blake2:10, "
"expected one of blake2b512, blake2s256, "
"md5, sha1, sha1_git, sha256"
)
with pytest.raises(ValueError, match=expected_message):
hashutil._new_hash("blake2:10")
def test_new_hash_blake2b_builtin():
with patch_blake2("hashlib.blake2b") as mock_blake2b:
mock_blake2b.return_value = sentinel = object()
h = hashutil._new_hash("blake2b512")
assert h is sentinel
mock_blake2b.assert_called_with(digest_size=512 // 8)
def test_new_hash_blake2s_builtin():
with patch_blake2("hashlib.blake2s") as mock_blake2s:
mock_blake2s.return_value = sentinel = object()
h = hashutil._new_hash("blake2s256")
assert h is sentinel
mock_blake2s.assert_called_with(digest_size=256 // 8)
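# The fixture below carries raw git object payloads (blob, tree, commit, tag)
# along with their expected sha1_git identifiers, used by the hash_git_data()
# tests that follow.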
@pytest.fixture
def hashgit_test_data():
class HashGitTestData:
blob_data = b"42\n"
tree_data = b"".join(
[
b"40000 barfoo\0",
bytes.fromhex("c3020f6bf135a38c6df" "3afeb5fb38232c5e07087"),
b"100644 blah\0",
bytes.fromhex("63756ef0df5e4f10b6efa" "33cfe5c758749615f20"),
b"100644 hello\0",
bytes.fromhex("907b308167f0880fb2a" "5c0e1614bb0c7620f9dc3"),
]
)
commit_data = b"""\
tree 1c61f7259dcb770f46b194d941df4f08ff0a3970
author Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com> 1444054085 +0200
committer Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com> 1444054085 +0200
initial
""".encode('utf-8') # NOQA
self.tag_data = """object 24d012aaec0bc5a4d2f62c56399053d6cc72a241
""" # noqa
tag_data = """object 24d012aaec0bc5a4d2f62c56399053d6cc72a241
type commit
tag 0.0.1
tagger Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com> 1444225145 +0200
blah
""".encode('utf-8') # NOQA
self.checksums = {
'blob_sha1_git': bytes.fromhex('d81cc0710eb6cf9efd5b920a8453e1'
'e07157b6cd'),
'tree_sha1_git': bytes.fromhex('ac212302c45eada382b27bfda795db'
'121dacdb1c'),
'commit_sha1_git': bytes.fromhex('e960570b2e6e2798fa4cfb9af2c399'
'd629189653'),
'tag_sha1_git': bytes.fromhex('bc2b99ba469987bcf1272c189ed534'
'e9e959f120'),
""".encode(
"utf-8"
) # NOQA
checksums = {
"blob_sha1_git": bytes.fromhex(
"d81cc0710eb6cf9efd5b920a8453e1" "e07157b6cd"
),
"tree_sha1_git": bytes.fromhex(
"ac212302c45eada382b27bfda795db" "121dacdb1c"
),
"commit_sha1_git": bytes.fromhex(
"e960570b2e6e2798fa4cfb9af2c399" "d629189653"
),
"tag_sha1_git": bytes.fromhex(
"bc2b99ba469987bcf1272c189ed534" "e9e959f120"
),
}
def test_unknown_header_type(self):
with self.assertRaises(ValueError) as cm:
hashutil.hash_git_data(b'any-data', 'some-unknown-type')
return HashGitTestData
def test_unknown_header_type():
with pytest.raises(ValueError, match="Unexpected git object type"):
hashutil.hash_git_data(b"any-data", "some-unknown-type")
def test_hashdata_content(hashgit_test_data):
# when
actual_hash = hashutil.hash_git_data(hashgit_test_data.blob_data, git_type="blob")
# then
assert actual_hash == hashgit_test_data.checksums["blob_sha1_git"]
self.assertIn('Unexpected git object type', cm.exception.args[0])
def test_hashdata_content(self):
# when
actual_hash = hashutil.hash_git_data(self.blob_data, git_type='blob')
def test_hashdata_tree(hashgit_test_data):
# when
actual_hash = hashutil.hash_git_data(hashgit_test_data.tree_data, git_type="tree")
# then
self.assertEqual(actual_hash,
self.checksums['blob_sha1_git'])
# then
assert actual_hash == hashgit_test_data.checksums["tree_sha1_git"]
def test_hashdata_tree(self):
# when
actual_hash = hashutil.hash_git_data(self.tree_data, git_type='tree')
# then
self.assertEqual(actual_hash,
self.checksums['tree_sha1_git'])
def test_hashdata_revision(hashgit_test_data):
# when
actual_hash = hashutil.hash_git_data(
hashgit_test_data.commit_data, git_type="commit"
)
def test_hashdata_revision(self):
# when
actual_hash = hashutil.hash_git_data(self.commit_data,
git_type='commit')
# then
assert actual_hash == hashgit_test_data.checksums["commit_sha1_git"]
# then
self.assertEqual(actual_hash,
self.checksums['commit_sha1_git'])
def test_hashdata_tag(self):
# when
actual_hash = hashutil.hash_git_data(self.tag_data, git_type='tag')
def test_hashdata_tag(hashgit_test_data):
# when
actual_hash = hashutil.hash_git_data(hashgit_test_data.tag_data, git_type="tag")
# then
self.assertEqual(actual_hash,
self.checksums['tag_sha1_git'])
# then
assert actual_hash == hashgit_test_data.checksums["tag_sha1_git"]
# Copyright (C) 2019 The Software Heritage developers
# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import datetime
import attr
from hypothesis import given
from hypothesis import given, settings
import iso8601
from swh.model.hashutil import DEFAULT_ALGORITHMS
from swh.model.hypothesis_strategies import objects, object_dicts
from swh.model.hypothesis_strategies import (
aware_datetimes,
contents,
object_dicts,
objects,
origin_visits,
persons,
skipped_contents,
snapshots,
)
from swh.model.model import ModelObjectType, SnapshotTargetType
target_types = (
'content', 'directory', 'revision', 'release', 'snapshot', 'alias')
target_types = ("content", "directory", "revision", "release", "snapshot", "alias")
all_but_skipped_content = {
o_t for o_t in ModelObjectType if o_t != ModelObjectType.SKIPPED_CONTENT
}
@given(objects())
@given(objects(blacklist_types=()))
def test_generation(obj_type_and_obj):
(obj_type, object_) = obj_type_and_obj
attr.validate(object_)
@given(objects(split_content=False))
def test_generation_merged_content(obj_type_and_obj):
# we should never generate a "skipped_content" here
assert obj_type_and_obj[0] != ModelObjectType.SKIPPED_CONTENT
@given(objects(split_content=True, blacklist_types=all_but_skipped_content))
def test_generation_split_content(obj_type_and_obj):
# we should only generate "skipped_content"
assert obj_type_and_obj[0] == ModelObjectType.SKIPPED_CONTENT
@given(
objects(
blacklist_types={
ModelObjectType.DIRECTORY,
ModelObjectType.ORIGIN_VISIT,
}
)
)
def test_generation_blacklist(obj_type_and_obj):
assert obj_type_and_obj[0] not in {
ModelObjectType.DIRECTORY,
ModelObjectType.ORIGIN_VISIT,
}
def assert_nested_dict(obj):
"""Tests the object is a nested dict and contains no more class
from swh.model.model."""
if isinstance(obj, dict):
for (key, value) in obj.items():
for key, value in obj.items():
assert isinstance(key, (str, bytes)), key
assert_nested_dict(value)
elif isinstance(obj, list):
elif isinstance(obj, tuple):
for value in obj:
assert_nested_dict(value)
elif isinstance(obj, (int, float, str, bytes, bool, type(None))):
elif isinstance(obj, (int, float, str, bytes, bool, type(None), datetime.datetime)):
pass
else:
assert False, obj
@given(object_dicts())
@given(object_dicts(blacklist_types=()))
def test_dicts_generation(obj_type_and_obj):
(obj_type, object_) = obj_type_and_obj
assert_nested_dict(object_)
if obj_type == 'content':
if object_['status'] == 'visible':
assert set(object_) == \
set(DEFAULT_ALGORITHMS) | {'length', 'status', 'data'}
elif object_['status'] == 'absent':
assert set(object_) == \
set(DEFAULT_ALGORITHMS) | {'length', 'status', 'reason'}
elif object_['status'] == 'hidden':
assert set(object_) == \
set(DEFAULT_ALGORITHMS) | {'length', 'status', 'data'}
if obj_type == ModelObjectType.CONTENT:
COMMON_KEYS = set(DEFAULT_ALGORITHMS) | {"length", "status", "ctime"}
if object_["status"] == "visible":
assert set(object_) <= COMMON_KEYS | {"data"}
elif object_["status"] == "absent":
assert set(object_) == COMMON_KEYS | {"reason"}
elif object_["status"] == "hidden":
assert set(object_) <= COMMON_KEYS | {"data"}
else:
assert False, object_
elif obj_type == 'release':
assert object_['target_type'] in target_types
elif obj_type == 'snapshot':
for branch in object_['branches'].values():
assert branch['target_type'] in target_types
elif obj_type == ModelObjectType.RELEASE:
assert object_["target_type"] in target_types
elif obj_type == ModelObjectType.SNAPSHOT:
for branch in object_["branches"].values():
assert branch is None or branch["target_type"] in target_types
@given(aware_datetimes())
def test_datetimes(dt):
# Checks this doesn't raise an error, eg. about seconds in the TZ offset
iso8601.parse_date(dt.isoformat())
assert dt.tzinfo is not None
@given(object_dicts(split_content=False))
def test_dicts_generation_merged_content(obj_type_and_obj):
# we should never generate a "skipped_content" here
assert obj_type_and_obj[0] != ModelObjectType.SKIPPED_CONTENT
@given(object_dicts(split_content=True, blacklist_types=all_but_skipped_content))
def test_dicts_generation_split_content(obj_type_and_obj):
# we should only generate "skipped_content"
assert obj_type_and_obj[0] == ModelObjectType.SKIPPED_CONTENT
@given(
object_dicts(
blacklist_types={
ModelObjectType.CONTENT,
ModelObjectType.RELEASE,
}
)
)
def test_dicts_generation_blacklist(obj_type_and_obj):
assert obj_type_and_obj[0] not in {
ModelObjectType.CONTENT,
ModelObjectType.RELEASE,
}
@given(objects())
def test_model_to_dicts(obj_type_and_obj):
_, object_ = obj_type_and_obj
object_type = object_.object_type
obj_dict = object_.to_dict()
assert_nested_dict(obj_dict)
if object_type in {ModelObjectType.CONTENT, ModelObjectType.SKIPPED_CONTENT}:
COMMON_KEYS = set(DEFAULT_ALGORITHMS) | {"length", "status"}
if object_.ctime is not None:
COMMON_KEYS |= {"ctime"}
if obj_dict["status"] == "visible":
assert set(obj_dict) == COMMON_KEYS | {"data"}
elif obj_dict["status"] == "absent":
assert set(obj_dict) == COMMON_KEYS | {"reason"}
elif obj_dict["status"] == "hidden":
assert set(obj_dict) == COMMON_KEYS | {"data"}
else:
assert False, obj_dict
elif object_type == ModelObjectType.RELEASE:
assert obj_dict["target_type"] in target_types
elif object_type == ModelObjectType.SNAPSHOT:
for branch in obj_dict["branches"].values():
assert branch is None or branch["target_type"] in target_types
@given(contents())
def test_content_aware_datetime(cont):
assert cont.ctime is None or cont.ctime.tzinfo is not None
@given(skipped_contents())
def test_skipped_content_aware_datetime(cont):
assert cont.ctime is None or cont.ctime.tzinfo is not None
_min_snp_size = 10
_max_snp_size = 100
@given(snapshots(min_size=_min_snp_size, max_size=_max_snp_size))
@settings(max_examples=1)
def test_snapshots_strategy(snapshot):
branches = snapshot.branches
assert len(branches) >= _min_snp_size
assert len(branches) <= _max_snp_size
aliases = []
# check snapshot integrity
for name, branch in branches.items():
assert branch is None or branch.target_type.value in target_types
if branch is not None and branch.target_type == SnapshotTargetType.ALIAS:
aliases.append(name)
assert branch.target in branches
# check no cycles between aliases
for alias in aliases:
processed_alias = set()
current_alias = alias
while (
branches[current_alias] is not None
and branches[current_alias].target_type == SnapshotTargetType.ALIAS
):
assert branches[current_alias].target not in processed_alias
processed_alias.add(current_alias)
current_alias = branches[current_alias].target
@given(snapshots(min_size=_min_snp_size, max_size=_min_snp_size))
@settings(max_examples=1)
def test_snapshots_strategy_fixed_size(snapshot):
assert len(snapshot.branches) == _min_snp_size
@given(origin_visits())
def test_origin_visit_aware_datetime(visit):
assert visit.date.tzinfo is not None
@given(persons())
def test_person_do_not_look_like_anonimized(person):
assert not (
len(person.fullname) == 32 and person.name is None and person.email is None
)
# Copyright (C) 2015-2018 The Software Heritage developers
# Copyright (C) 2015-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import binascii
import datetime
import hashlib
from typing import Dict
import unittest
from swh.model import hashutil, identifiers
from swh.model.exceptions import ValidationError
from swh.model.identifiers import (CONTENT, DIRECTORY,
PERSISTENT_IDENTIFIER_TYPES, RELEASE,
REVISION, SNAPSHOT, PersistentId)
import pytest
from swh.model import git_objects, hashutil
from swh.model.hashutil import hash_to_bytes as _x
from swh.model.model import (
Content,
Directory,
ExtID,
Origin,
RawExtrinsicMetadata,
Release,
Revision,
Snapshot,
TimestampWithTimezone,
)
class UtilityFunctionsIdentifier(unittest.TestCase):
def setUp(self):
self.str_id = 'c2e41aae41ac17bd4a650770d6ee77f62e52235b'
self.bytes_id = binascii.unhexlify(self.str_id)
self.bad_type_id = object()
def test_identifier_to_bytes(self):
for id in [self.str_id, self.bytes_id]:
self.assertEqual(identifiers.identifier_to_bytes(id),
self.bytes_id)
# wrong length
with self.assertRaises(ValueError) as cm:
identifiers.identifier_to_bytes(id[:-2])
self.assertIn('length', str(cm.exception))
with self.assertRaises(ValueError) as cm:
identifiers.identifier_to_bytes(self.bad_type_id)
self.assertIn('type', str(cm.exception))
def test_identifier_to_str(self):
for id in [self.str_id, self.bytes_id]:
self.assertEqual(identifiers.identifier_to_str(id),
self.str_id)
# wrong length
with self.assertRaises(ValueError) as cm:
identifiers.identifier_to_str(id[:-2])
self.assertIn('length', str(cm.exception))
with self.assertRaises(ValueError) as cm:
identifiers.identifier_to_str(self.bad_type_id)
self.assertIn('type', str(cm.exception))
def remove_id(d: Dict) -> Dict:
"""Returns a (shallow) copy of a dict with the 'id' key removed."""
d = d.copy()
if "id" in d:
del d["id"]
return d
class UtilityFunctionsDateOffset(unittest.TestCase):
def setUp(self):
self.dates = {
b'1448210036': {
'seconds': 1448210036,
'microseconds': 0,
b"1448210036": {
"seconds": 1448210036,
"microseconds": 0,
},
b'1448210036.002342': {
'seconds': 1448210036,
'microseconds': 2342,
b"1448210036.002342": {
"seconds": 1448210036,
"microseconds": 2342,
},
b"1448210036.12": {
"seconds": 1448210036,
"microseconds": 120000,
},
b'1448210036.12': {
'seconds': 1448210036,
'microseconds': 120000,
}
}
self.broken_dates = [
1448210036.12,
]
self.offsets = {
0: b'+0000',
-630: b'-1030',
800: b'+1320',
}
def test_format_date(self):
for date_repr, date in self.dates.items():
self.assertEqual(identifiers.format_date(date), date_repr)
self.assertEqual(git_objects.format_date(date), date_repr)
def test_format_date_fail(self):
for date in self.broken_dates:
with self.assertRaises(ValueError):
identifiers.format_date(date)
def test_format_offset(self):
for offset, res in self.offsets.items():
self.assertEqual(identifiers.format_offset(offset), res)
content_example = {
"status": "visible",
"length": 5,
"data": b"1984\n",
"ctime": datetime.datetime(2015, 11, 22, 16, 33, 56, tzinfo=datetime.timezone.utc),
}
class ContentIdentifier(unittest.TestCase):
def setUp(self):
self.content = {
'status': 'visible',
'length': 5,
'data': b'1984\n',
'ctime': datetime.datetime(2015, 11, 22, 16, 33, 56,
tzinfo=datetime.timezone.utc),
}
self.content_id = hashutil.MultiHash.from_data(
self.content['data']).digest()
self.content_id = hashutil.MultiHash.from_data(content_example["data"]).digest()
def test_content_identifier(self):
self.assertEqual(identifiers.content_identifier(self.content),
self.content_id)
self.assertEqual(
Content.from_data(content_example["data"]).hashes(), self.content_id
)
directory_example = {
"id": _x("d7ed3d2c31d608823be58b1cbe57605310615231"),
"entries": [
{
"type": "file",
"perms": 33188,
"name": b"README",
"target": _x("37ec8ea2110c0b7a32fbb0e872f6e7debbf95e21"),
},
{
"type": "file",
"perms": 33188,
"name": b"Rakefile",
"target": _x("3bb0e8592a41ae3185ee32266c860714980dbed7"),
},
{
"type": "dir",
"perms": 16384,
"name": b"app",
"target": _x("61e6e867f5d7ba3b40540869bc050b0c4fed9e95"),
},
{
"type": "file",
"perms": 33188,
"name": b"1.megabyte",
"target": _x("7c2b2fbdd57d6765cdc9d84c2d7d333f11be7fb3"),
},
{
"type": "dir",
"perms": 16384,
"name": b"config",
"target": _x("591dfe784a2e9ccc63aaba1cb68a765734310d98"),
},
{
"type": "dir",
"perms": 16384,
"name": b"public",
"target": _x("9588bf4522c2b4648bfd1c61d175d1f88c1ad4a5"),
},
{
"type": "file",
"perms": 33188,
"name": b"development.sqlite3",
"target": _x("e69de29bb2d1d6434b8b29ae775ad8c2e48c5391"),
},
{
"type": "dir",
"perms": 16384,
"name": b"doc",
"target": _x("154705c6aa1c8ead8c99c7915373e3c44012057f"),
},
{
"type": "dir",
"perms": 16384,
"name": b"db",
"target": _x("85f157bdc39356b7bc7de9d0099b4ced8b3b382c"),
},
{
"type": "dir",
"perms": 16384,
"name": b"log",
"target": _x("5e3d3941c51cce73352dff89c805a304ba96fffe"),
},
{
"type": "dir",
"perms": 16384,
"name": b"script",
"target": _x("1b278423caf176da3f3533592012502aa10f566c"),
},
{
"type": "dir",
"perms": 16384,
"name": b"test",
"target": _x("035f0437c080bfd8711670b3e8677e686c69c763"),
},
{
"type": "dir",
"perms": 16384,
"name": b"vendor",
"target": _x("7c0dc9ad978c1af3f9a4ce061e50f5918bd27138"),
},
{
"type": "rev",
"perms": 57344,
"name": b"will_paginate",
"target": _x("3d531e169db92a16a9a8974f0ae6edf52e52659e"),
},
# in git order, the dir named "order" should be between the files
# named "order." and "order0"
{
"type": "dir",
"perms": 16384,
"name": b"order",
"target": _x("62cdb7020ff920e5aa642c3d4066950dd1f01f4d"),
},
{
"type": "file",
"perms": 16384,
"name": b"order.",
"target": _x("0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33"),
},
{
"type": "file",
"perms": 16384,
"name": b"order0",
"target": _x("bbe960a25ea311d21d40669e93df2003ba9b90a2"),
},
],
}
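The last three entries above exercise git's tree-entry ordering: entries are sorted bytewise by name, except that a directory compares as if its name ended with "/". Because "." (0x2e) sorts before "/" (0x2f), which sorts before "0" (0x30), the directory b"order" ends up between the files b"order." and b"order0". The following is a minimal illustrative sketch of that rule; the helper name is hypothetical and not taken from the swh.model implementation:

def git_tree_sort_key(entry):
    # a directory compares as if its name ended with a slash
    name = entry["name"]
    return name + b"/" if entry["type"] == "dir" else name

_entries = [
    {"type": "file", "name": b"order."},
    {"type": "dir", "name": b"order"},
    {"type": "file", "name": b"order0"},
]
assert [e["name"] for e in sorted(_entries, key=git_tree_sort_key)] == [
    b"order.",
    b"order",
    b"order0",
]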
class DirectoryIdentifier(unittest.TestCase):
def setUp(self):
self.directory = {
'id': 'c2e41aae41ac17bd4a650770d6ee77f62e52235b',
'entries': [
{
'type': 'file',
'perms': 33188,
'name': b'README',
'target': '37ec8ea2110c0b7a32fbb0e872f6e7debbf95e21'
},
{
'type': 'file',
'perms': 33188,
'name': b'Rakefile',
'target': '3bb0e8592a41ae3185ee32266c860714980dbed7'
},
{
'type': 'dir',
'perms': 16384,
'name': b'app',
'target': '61e6e867f5d7ba3b40540869bc050b0c4fed9e95'
},
{
'type': 'file',
'perms': 33188,
'name': b'1.megabyte',
'target': '7c2b2fbdd57d6765cdc9d84c2d7d333f11be7fb3'
},
{
'type': 'dir',
'perms': 16384,
'name': b'config',
'target': '591dfe784a2e9ccc63aaba1cb68a765734310d98'
},
{
'type': 'dir',
'perms': 16384,
'name': b'public',
'target': '9588bf4522c2b4648bfd1c61d175d1f88c1ad4a5'
},
{
'type': 'file',
'perms': 33188,
'name': b'development.sqlite3',
'target': 'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391'
},
{
'type': 'dir',
'perms': 16384,
'name': b'doc',
'target': '154705c6aa1c8ead8c99c7915373e3c44012057f'
},
{
'type': 'dir',
'perms': 16384,
'name': b'db',
'target': '85f157bdc39356b7bc7de9d0099b4ced8b3b382c'
},
{
'type': 'dir',
'perms': 16384,
'name': b'log',
'target': '5e3d3941c51cce73352dff89c805a304ba96fffe'
},
{
'type': 'dir',
'perms': 16384,
'name': b'script',
'target': '1b278423caf176da3f3533592012502aa10f566c'
},
{
'type': 'dir',
'perms': 16384,
'name': b'test',
'target': '035f0437c080bfd8711670b3e8677e686c69c763'
},
{
'type': 'dir',
'perms': 16384,
'name': b'vendor',
'target': '7c0dc9ad978c1af3f9a4ce061e50f5918bd27138'
},
{
'type': 'rev',
'perms': 57344,
'name': b'will_paginate',
'target': '3d531e169db92a16a9a8974f0ae6edf52e52659e'
}
],
}
self.directory = directory_example
self.empty_directory = {
'id': '4b825dc642cb6eb9a060e54bf8d69288fbee4904',
'entries': [],
"id": "4b825dc642cb6eb9a060e54bf8d69288fbee4904",
"entries": [],
}
def test_dir_identifier(self):
self.assertEqual(Directory.from_dict(self.directory).id, self.directory["id"])
self.assertEqual(
Directory.from_dict(remove_id(self.directory)).id,
self.directory["id"],
)
def test_dir_identifier_entry_order(self):
# Reverse order of entries, check the id is still the same.
directory = {"entries": reversed(self.directory["entries"])}
self.assertEqual(
identifiers.directory_identifier(self.directory),
self.directory['id'])
Directory.from_dict(remove_id(directory)).id,
self.directory["id"],
)
def test_dir_identifier_empty_directory(self):
self.assertEqual(
identifiers.directory_identifier(self.empty_directory),
self.empty_directory['id'])
Directory.from_dict(remove_id(self.empty_directory)).id,
_x(self.empty_directory["id"]),
)
class RevisionIdentifier(unittest.TestCase):
def setUp(self):
linus_tz = datetime.timezone(datetime.timedelta(minutes=-420))
revision_example = {
"id": _x("bc0195aad0daa2ad5b0d76cce22b167bc3435590"),
"directory": _x("85a74718d377195e1efd0843ba4f3260bad4fe07"),
"parents": [_x("01e2d0627a9a6edb24c37db45db5ecb31e9de808")],
"author": {
"name": b"Linus Torvalds",
"email": b"torvalds@linux-foundation.org",
"fullname": b"Linus Torvalds <torvalds@linux-foundation.org>",
},
"date": datetime.datetime(2015, 7, 12, 15, 10, 30, tzinfo=linus_tz),
"committer": {
"name": b"Linus Torvalds",
"email": b"torvalds@linux-foundation.org",
"fullname": b"Linus Torvalds <torvalds@linux-foundation.org>",
},
"committer_date": datetime.datetime(2015, 7, 12, 15, 10, 30, tzinfo=linus_tz),
"message": b"Linux 4.2-rc2\n",
"type": "git",
"synthetic": False,
}
linus_tz = datetime.timezone(datetime.timedelta(minutes=-420))
gpgsig = b'''\
class RevisionIdentifier(unittest.TestCase):
def setUp(self):
gpgsig = b"""\
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.13 (Darwin)
@@ -241,290 +258,274 @@ jdTswYL6+MUdL8sB9pZ82D+BP/YAdHe69CyTu1lk9RT2pYtI/kkfjHubXBCYEJSG
lf1Qb5GDsQrZWgD+jtWTywOYHtCBwyCKSAXxSARMbNPeak9WPlcW/Jmu+fUcMe2x
dg1KdHOa34shrKDaOVzW
=od6m
-----END PGP SIGNATURE-----'''
self.revision = {
'id': 'bc0195aad0daa2ad5b0d76cce22b167bc3435590',
'directory': '85a74718d377195e1efd0843ba4f3260bad4fe07',
'parents': ['01e2d0627a9a6edb24c37db45db5ecb31e9de808'],
'author': {
'name': b'Linus Torvalds',
'email': b'torvalds@linux-foundation.org',
},
'date': datetime.datetime(2015, 7, 12, 15, 10, 30,
tzinfo=linus_tz),
'committer': {
'name': b'Linus Torvalds',
'email': b'torvalds@linux-foundation.org',
},
'committer_date': datetime.datetime(2015, 7, 12, 15, 10, 30,
tzinfo=linus_tz),
'message': b'Linux 4.2-rc2\n',
}
-----END PGP SIGNATURE-----"""
self.revision = revision_example
self.revision_none_metadata = {
'id': 'bc0195aad0daa2ad5b0d76cce22b167bc3435590',
'directory': '85a74718d377195e1efd0843ba4f3260bad4fe07',
'parents': ['01e2d0627a9a6edb24c37db45db5ecb31e9de808'],
'author': {
'name': b'Linus Torvalds',
'email': b'torvalds@linux-foundation.org',
"id": _x("bc0195aad0daa2ad5b0d76cce22b167bc3435590"),
"directory": _x("85a74718d377195e1efd0843ba4f3260bad4fe07"),
"parents": [_x("01e2d0627a9a6edb24c37db45db5ecb31e9de808")],
"author": {
"name": b"Linus Torvalds",
"email": b"torvalds@linux-foundation.org",
},
'date': datetime.datetime(2015, 7, 12, 15, 10, 30,
tzinfo=linus_tz),
'committer': {
'name': b'Linus Torvalds',
'email': b'torvalds@linux-foundation.org',
"date": datetime.datetime(2015, 7, 12, 15, 10, 30, tzinfo=linus_tz),
"committer": {
"name": b"Linus Torvalds",
"email": b"torvalds@linux-foundation.org",
},
'committer_date': datetime.datetime(2015, 7, 12, 15, 10, 30,
tzinfo=linus_tz),
'message': b'Linux 4.2-rc2\n',
'metadata': None,
"committer_date": datetime.datetime(
2015, 7, 12, 15, 10, 30, tzinfo=linus_tz
),
"message": b"Linux 4.2-rc2\n",
"type": "git",
"synthetic": False,
"metadata": None,
}
self.synthetic_revision = {
'id': b'\xb2\xa7\xe1&\x04\x92\xe3D\xfa\xb3\xcb\xf9\x1b\xc1<\x91'
b'\xe0T&\xfd',
'author': {
'name': b'Software Heritage',
'email': b'robot@softwareheritage.org',
"id": _x("b2a7e1260492e344fab3cbf91bc13c91e05426fd"),
"author": {
"name": b"Software Heritage",
"email": b"robot@softwareheritage.org",
},
'date': {
'timestamp': {'seconds': 1437047495},
'offset': 0,
'negative_utc': False,
"date": {
"timestamp": {"seconds": 1437047495},
"offset_bytes": b"+0000",
},
'type': 'tar',
'committer': {
'name': b'Software Heritage',
'email': b'robot@softwareheritage.org',
"type": "tar",
"committer": {
"name": b"Software Heritage",
"email": b"robot@softwareheritage.org",
},
"committer_date": 1437047495,
"synthetic": True,
"parents": [],
"message": b"synthetic revision message\n",
"directory": _x("d11f00a6a0fea6055341d25584b5a96516c0d2b8"),
"metadata": {
"original_artifact": [
{
"archive_type": "tar",
"name": "gcc-5.2.0.tar.bz2",
"sha1_git": "39d281aff934d44b439730057e55b055e206a586",
"sha1": "fe3f5390949d47054b613edc36c557eb1d51c18e",
"sha256": "5f835b04b5f7dd4f4d2dc96190ec1621b8d89f"
"2dc6f638f9f8bc1b1014ba8cad",
}
]
},
'committer_date': 1437047495,
'synthetic': True,
'parents': [None],
'message': b'synthetic revision message\n',
'directory': b'\xd1\x1f\x00\xa6\xa0\xfe\xa6\x05SA\xd2U\x84\xb5\xa9'
b'e\x16\xc0\xd2\xb8',
'metadata': {'original_artifact': [
{'archive_type': 'tar',
'name': 'gcc-5.2.0.tar.bz2',
'sha1_git': '39d281aff934d44b439730057e55b055e206a586',
'sha1': 'fe3f5390949d47054b613edc36c557eb1d51c18e',
'sha256': '5f835b04b5f7dd4f4d2dc96190ec1621b8d89f'
'2dc6f638f9f8bc1b1014ba8cad'}]},
}
# cat commit.txt | git hash-object -t commit --stdin
self.revision_with_extra_headers = {
'id': '010d34f384fa99d047cdd5e2f41e56e5c2feee45',
'directory': '85a74718d377195e1efd0843ba4f3260bad4fe07',
'parents': ['01e2d0627a9a6edb24c37db45db5ecb31e9de808'],
'author': {
'name': b'Linus Torvalds',
'email': b'torvalds@linux-foundation.org',
'fullname': b'Linus Torvalds <torvalds@linux-foundation.org>',
"id": _x("010d34f384fa99d047cdd5e2f41e56e5c2feee45"),
"directory": _x("85a74718d377195e1efd0843ba4f3260bad4fe07"),
"parents": [_x("01e2d0627a9a6edb24c37db45db5ecb31e9de808")],
"author": {
"name": b"Linus Torvalds",
"email": b"torvalds@linux-foundation.org",
"fullname": b"Linus Torvalds <torvalds@linux-foundation.org>",
},
'date': datetime.datetime(2015, 7, 12, 15, 10, 30,
tzinfo=linus_tz),
'committer': {
'name': b'Linus Torvalds',
'email': b'torvalds@linux-foundation.org',
'fullname': b'Linus Torvalds <torvalds@linux-foundation.org>',
"date": datetime.datetime(2015, 7, 12, 15, 10, 30, tzinfo=linus_tz),
"committer": {
"name": b"Linus Torvalds",
"email": b"torvalds@linux-foundation.org",
"fullname": b"Linus Torvalds <torvalds@linux-foundation.org>",
},
'committer_date': datetime.datetime(2015, 7, 12, 15, 10, 30,
tzinfo=linus_tz),
'message': b'Linux 4.2-rc2\n',
'metadata': {
'extra_headers': [
['svn-repo-uuid', '046f1af7-66c2-d61b-5410-ce57b7db7bff'],
['svn-revision', 10],
]
}
"committer_date": datetime.datetime(
2015, 7, 12, 15, 10, 30, tzinfo=linus_tz
),
"message": b"Linux 4.2-rc2\n",
"type": "git",
"synthetic": False,
"extra_headers": (
(b"svn-repo-uuid", b"046f1af7-66c2-d61b-5410-ce57b7db7bff"),
(b"svn-revision", b"10"),
),
}
self.revision_with_gpgsig = {
'id': '44cc742a8ca17b9c279be4cc195a93a6ef7a320e',
'directory': 'b134f9b7dc434f593c0bab696345548b37de0558',
'parents': ['689664ae944b4692724f13b709a4e4de28b54e57',
'c888305e1efbaa252d01b4e5e6b778f865a97514'],
'author': {
'name': b'Jiang Xin',
'email': b'worldhello.net@gmail.com',
'fullname': b'Jiang Xin <worldhello.net@gmail.com>',
},
'date': {
'timestamp': 1428538899,
'offset': 480,
"id": _x("44cc742a8ca17b9c279be4cc195a93a6ef7a320e"),
"directory": _x("b134f9b7dc434f593c0bab696345548b37de0558"),
"parents": [
_x("689664ae944b4692724f13b709a4e4de28b54e57"),
_x("c888305e1efbaa252d01b4e5e6b778f865a97514"),
],
"author": {
"name": b"Jiang Xin",
"email": b"worldhello.net@gmail.com",
"fullname": b"Jiang Xin <worldhello.net@gmail.com>",
},
'committer': {
'name': b'Jiang Xin',
'email': b'worldhello.net@gmail.com',
"date": {
"timestamp": 1428538899,
"offset": 480,
},
'committer_date': {
'timestamp': 1428538899,
'offset': 480,
"committer": {
"name": b"Jiang Xin",
"email": b"worldhello.net@gmail.com",
},
'metadata': {
'extra_headers': [
['gpgsig', gpgsig],
],
"committer_date": {
"timestamp": 1428538899,
"offset": 480,
},
'message': b'''Merge branch 'master' of git://github.com/alexhenrie/git-po
"extra_headers": ((b"gpgsig", gpgsig),),
"message": b"""Merge branch 'master' of git://github.com/alexhenrie/git-po
* 'master' of git://github.com/alexhenrie/git-po:
l10n: ca.po: update translation
'''
""",
"type": "git",
"synthetic": False,
}
self.revision_no_message = {
'id': '4cfc623c9238fa92c832beed000ce2d003fd8333',
'directory': 'b134f9b7dc434f593c0bab696345548b37de0558',
'parents': ['689664ae944b4692724f13b709a4e4de28b54e57',
'c888305e1efbaa252d01b4e5e6b778f865a97514'],
'author': {
'name': b'Jiang Xin',
'email': b'worldhello.net@gmail.com',
'fullname': b'Jiang Xin <worldhello.net@gmail.com>',
"id": _x("4cfc623c9238fa92c832beed000ce2d003fd8333"),
"directory": _x("b134f9b7dc434f593c0bab696345548b37de0558"),
"parents": [
_x("689664ae944b4692724f13b709a4e4de28b54e57"),
_x("c888305e1efbaa252d01b4e5e6b778f865a97514"),
],
"author": {
"name": b"Jiang Xin",
"email": b"worldhello.net@gmail.com",
"fullname": b"Jiang Xin <worldhello.net@gmail.com>",
},
'date': {
'timestamp': 1428538899,
'offset': 480,
"date": {
"timestamp": 1428538899,
"offset": 480,
},
'committer': {
'name': b'Jiang Xin',
'email': b'worldhello.net@gmail.com',
"committer": {
"name": b"Jiang Xin",
"email": b"worldhello.net@gmail.com",
},
'committer_date': {
'timestamp': 1428538899,
'offset': 480,
"committer_date": {
"timestamp": 1428538899,
"offset": 480,
},
'message': None,
"message": None,
"type": "git",
"synthetic": False,
}
self.revision_empty_message = {
'id': '7442cd78bd3b4966921d6a7f7447417b7acb15eb',
'directory': 'b134f9b7dc434f593c0bab696345548b37de0558',
'parents': ['689664ae944b4692724f13b709a4e4de28b54e57',
'c888305e1efbaa252d01b4e5e6b778f865a97514'],
'author': {
'name': b'Jiang Xin',
'email': b'worldhello.net@gmail.com',
'fullname': b'Jiang Xin <worldhello.net@gmail.com>',
"id": _x("7442cd78bd3b4966921d6a7f7447417b7acb15eb"),
"directory": _x("b134f9b7dc434f593c0bab696345548b37de0558"),
"parents": [
_x("689664ae944b4692724f13b709a4e4de28b54e57"),
_x("c888305e1efbaa252d01b4e5e6b778f865a97514"),
],
"author": {
"name": b"Jiang Xin",
"email": b"worldhello.net@gmail.com",
"fullname": b"Jiang Xin <worldhello.net@gmail.com>",
},
'date': {
'timestamp': 1428538899,
'offset': 480,
"date": {
"timestamp": 1428538899,
"offset": 480,
},
'committer': {
'name': b'Jiang Xin',
'email': b'worldhello.net@gmail.com',
"committer": {
"name": b"Jiang Xin",
"email": b"worldhello.net@gmail.com",
},
'committer_date': {
'timestamp': 1428538899,
'offset': 480,
"committer_date": {
"timestamp": 1428538899,
"offset": 480,
},
'message': b'',
"message": b"",
"type": "git",
"synthetic": False,
}
self.revision_only_fullname = {
'id': '010d34f384fa99d047cdd5e2f41e56e5c2feee45',
'directory': '85a74718d377195e1efd0843ba4f3260bad4fe07',
'parents': ['01e2d0627a9a6edb24c37db45db5ecb31e9de808'],
'author': {
'fullname': b'Linus Torvalds <torvalds@linux-foundation.org>',
"id": _x("010d34f384fa99d047cdd5e2f41e56e5c2feee45"),
"directory": _x("85a74718d377195e1efd0843ba4f3260bad4fe07"),
"parents": [_x("01e2d0627a9a6edb24c37db45db5ecb31e9de808")],
"author": {
"fullname": b"Linus Torvalds <torvalds@linux-foundation.org>",
},
'date': datetime.datetime(2015, 7, 12, 15, 10, 30,
tzinfo=linus_tz),
'committer': {
'fullname': b'Linus Torvalds <torvalds@linux-foundation.org>',
"date": datetime.datetime(2015, 7, 12, 15, 10, 30, tzinfo=linus_tz),
"committer": {
"fullname": b"Linus Torvalds <torvalds@linux-foundation.org>",
},
'committer_date': datetime.datetime(2015, 7, 12, 15, 10, 30,
tzinfo=linus_tz),
'message': b'Linux 4.2-rc2\n',
'metadata': {
'extra_headers': [
['svn-repo-uuid', '046f1af7-66c2-d61b-5410-ce57b7db7bff'],
['svn-revision', 10],
]
}
"committer_date": datetime.datetime(
2015, 7, 12, 15, 10, 30, tzinfo=linus_tz
),
"message": b"Linux 4.2-rc2\n",
"type": "git",
"synthetic": False,
"extra_headers": (
(b"svn-repo-uuid", b"046f1af7-66c2-d61b-5410-ce57b7db7bff"),
(b"svn-revision", b"10"),
),
}
def test_revision_identifier(self):
self.assertEqual(
identifiers.revision_identifier(self.revision),
identifiers.identifier_to_str(self.revision['id']),
Revision.from_dict(self.revision).id,
self.revision["id"],
)
self.assertEqual(
Revision.from_dict(remove_id(self.revision)).id,
self.revision["id"],
)
def test_revision_identifier_none_metadata(self):
self.assertEqual(
identifiers.revision_identifier(self.revision_none_metadata),
identifiers.identifier_to_str(self.revision_none_metadata['id']),
Revision.from_dict(remove_id(self.revision_none_metadata)).id,
self.revision_none_metadata["id"],
)
def test_revision_identifier_synthetic(self):
self.assertEqual(
identifiers.revision_identifier(self.synthetic_revision),
identifiers.identifier_to_str(self.synthetic_revision['id']),
Revision.from_dict(remove_id(self.synthetic_revision)).id,
self.synthetic_revision["id"],
)
def test_revision_identifier_with_extra_headers(self):
self.assertEqual(
identifiers.revision_identifier(
self.revision_with_extra_headers),
identifiers.identifier_to_str(
self.revision_with_extra_headers['id']),
Revision.from_dict(remove_id(self.revision_with_extra_headers)).id,
self.revision_with_extra_headers["id"],
)
def test_revision_identifier_with_gpgsig(self):
self.assertEqual(
identifiers.revision_identifier(
self.revision_with_gpgsig),
identifiers.identifier_to_str(
self.revision_with_gpgsig['id']),
Revision.from_dict(remove_id(self.revision_with_gpgsig)).id,
self.revision_with_gpgsig["id"],
)
def test_revision_identifier_no_message(self):
self.assertEqual(
identifiers.revision_identifier(
self.revision_no_message),
identifiers.identifier_to_str(
self.revision_no_message['id']),
Revision.from_dict(remove_id(self.revision_no_message)).id,
self.revision_no_message["id"],
)
def test_revision_identifier_empty_message(self):
self.assertEqual(
identifiers.revision_identifier(
self.revision_empty_message),
identifiers.identifier_to_str(
self.revision_empty_message['id']),
Revision.from_dict(remove_id(self.revision_empty_message)).id,
self.revision_empty_message["id"],
)
def test_revision_identifier_only_fullname(self):
self.assertEqual(
identifiers.revision_identifier(
self.revision_only_fullname),
identifiers.identifier_to_str(
self.revision_only_fullname['id']),
Revision.from_dict(remove_id(self.revision_only_fullname)).id,
self.revision_only_fullname["id"],
)
class ReleaseIdentifier(unittest.TestCase):
def setUp(self):
linus_tz = datetime.timezone(datetime.timedelta(minutes=-420))
self.release = {
'id': '2b10839e32c4c476e9d94492756bb1a3e1ec4aa8',
'target': b't\x1b"R\xa5\xe1Ml`\xa9\x13\xc7z`\x99\xab\xe7:\x85J',
'target_type': 'revision',
'name': b'v2.6.14',
'author': {
'name': b'Linus Torvalds',
'email': b'torvalds@g5.osdl.org',
},
'date': datetime.datetime(2005, 10, 27, 17, 2, 33,
tzinfo=linus_tz),
'message': b'''\
release_example = {
"id": _x("2b10839e32c4c476e9d94492756bb1a3e1ec4aa8"),
"target": _x("741b2252a5e14d6c60a913c77a6099abe73a854a"),
"target_type": "revision",
"name": b"v2.6.14",
"author": {
"name": b"Linus Torvalds",
"email": b"torvalds@g5.osdl.org",
"fullname": b"Linus Torvalds <torvalds@g5.osdl.org>",
},
"date": datetime.datetime(2005, 10, 27, 17, 2, 33, tzinfo=linus_tz),
"message": b"""\
Linux 2.6.14 release
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.1 (GNU/Linux)
@@ -533,16 +534,23 @@ iD8DBQBDYWq6F3YsRnbiHLsRAmaeAJ9RCez0y8rOBbhSv344h86l/VVcugCeIhO1
wdLOnvj91G4wxYqrvThthbE=
=7VeT
-----END PGP SIGNATURE-----
''',
'synthetic': False,
}
""",
"synthetic": False,
}
class ReleaseIdentifier(unittest.TestCase):
def setUp(self):
linus_tz = datetime.timezone(datetime.timedelta(minutes=-420))
self.release = release_example
self.release_no_author = {
'id': b'&y\x1a\x8b\xcf\x0em3\xf4:\xefv\x82\xbd\xb5U#mV\xde',
'target': '9ee1c939d1cb936b1f98e8d81aeffab57bae46ab',
'target_type': 'revision',
'name': b'v2.6.12',
'message': b'''\
"id": _x("26791a8bcf0e6d33f43aef7682bdb555236d56de"),
"target": _x("9ee1c939d1cb936b1f98e8d81aeffab57bae46ab"),
"target_type": "revision",
"name": b"v2.6.12",
"message": b"""\
This is the final 2.6.12 release
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.2.4 (GNU/Linux)
@@ -551,356 +559,805 @@ iD8DBQBCsykyF3YsRnbiHLsRAvPNAJ482tCZwuxp/bJRz7Q98MHlN83TpACdHr37
o6X/3T+vm8K3bf3driRr34c=
=sBHn
-----END PGP SIGNATURE-----
''',
'synthetic': False,
""",
"synthetic": False,
}
self.release_no_message = {
'id': 'b6f4f446715f7d9543ef54e41b62982f0db40045',
'target': '9ee1c939d1cb936b1f98e8d81aeffab57bae46ab',
'target_type': 'revision',
'name': b'v2.6.12',
'author': {
'name': b'Linus Torvalds',
'email': b'torvalds@g5.osdl.org',
"id": _x("b6f4f446715f7d9543ef54e41b62982f0db40045"),
"target": _x("9ee1c939d1cb936b1f98e8d81aeffab57bae46ab"),
"target_type": "revision",
"name": b"v2.6.12",
"author": {
"name": b"Linus Torvalds",
"email": b"torvalds@g5.osdl.org",
},
'date': datetime.datetime(2005, 10, 27, 17, 2, 33,
tzinfo=linus_tz),
'message': None,
"date": datetime.datetime(2005, 10, 27, 17, 2, 33, tzinfo=linus_tz),
"message": None,
"synthetic": False,
}
self.release_empty_message = {
'id': '71a0aea72444d396575dc25ac37fec87ee3c6492',
'target': '9ee1c939d1cb936b1f98e8d81aeffab57bae46ab',
'target_type': 'revision',
'name': b'v2.6.12',
'author': {
'name': b'Linus Torvalds',
'email': b'torvalds@g5.osdl.org',
"id": _x("71a0aea72444d396575dc25ac37fec87ee3c6492"),
"target": _x("9ee1c939d1cb936b1f98e8d81aeffab57bae46ab"),
"target_type": "revision",
"name": b"v2.6.12",
"author": {
"name": b"Linus Torvalds",
"email": b"torvalds@g5.osdl.org",
},
'date': datetime.datetime(2005, 10, 27, 17, 2, 33,
tzinfo=linus_tz),
'message': b'',
"date": datetime.datetime(2005, 10, 27, 17, 2, 33, tzinfo=linus_tz),
"message": b"",
"synthetic": False,
}
self.release_negative_utc = {
'id': '97c8d2573a001f88e72d75f596cf86b12b82fd01',
'name': b'20081029',
'target': '54e9abca4c77421e2921f5f156c9fe4a9f7441c7',
'target_type': 'revision',
'date': {
'timestamp': {'seconds': 1225281976},
'offset': 0,
'negative_utc': True,
"id": _x("97c8d2573a001f88e72d75f596cf86b12b82fd01"),
"name": b"20081029",
"target": _x("54e9abca4c77421e2921f5f156c9fe4a9f7441c7"),
"target_type": "revision",
"date": {
"timestamp": {"seconds": 1225281976},
"offset_bytes": b"-0000",
},
'author': {
'name': b'Otavio Salvador',
'email': b'otavio@debian.org',
'id': 17640,
"author": {
"name": b"Otavio Salvador",
"email": b"otavio@debian.org",
},
'synthetic': False,
'message': b'tagging version 20081029\n\nr56558\n',
"synthetic": False,
"message": b"tagging version 20081029\n\nr56558\n",
}
self.release_newline_in_author = {
'author': {
'email': b'esycat@gmail.com',
'fullname': b'Eugene Janusov\n<esycat@gmail.com>',
'name': b'Eugene Janusov\n',
"author": {
"email": b"esycat@gmail.com",
"fullname": b"Eugene Janusov\n<esycat@gmail.com>",
"name": b"Eugene Janusov\n",
},
'date': {
'negative_utc': None,
'offset': 600,
'timestamp': {
'microseconds': 0,
'seconds': 1377480558,
"date": {
"offset_bytes": b"+1000",
"timestamp": {
"microseconds": 0,
"seconds": 1377480558,
},
},
'id': b'\\\x98\xf5Y\xd04\x16-\xe2->\xbe\xb9T3\xe6\xf8\x88R1',
'message': b'Release of v0.3.2.',
'name': b'0.3.2',
'synthetic': False,
'target': (b'\xc0j\xa3\xd9;x\xa2\x86\\I5\x17'
b'\x000\xf8\xc2\xd79o\xd3'),
'target_type': 'revision',
"id": _x("5c98f559d034162de22d3ebeb95433e6f8885231"),
"message": b"Release of v0.3.2.",
"name": b"0.3.2",
"synthetic": False,
"target": _x("c06aa3d93b78a2865c4935170030f8c2d7396fd3"),
"target_type": "revision",
}
self.release_snapshot_target = dict(self.release)
self.release_snapshot_target["target_type"] = "snapshot"
self.release_snapshot_target["id"] = _x(
"c29c3ddcc6769a04e54dd69d63a6fdcbc566f850"
)
def test_release_identifier(self):
self.assertEqual(
identifiers.release_identifier(self.release),
identifiers.identifier_to_str(self.release['id'])
Release.from_dict(self.release).id,
self.release["id"],
)
self.assertEqual(
Release.from_dict(remove_id(self.release)).id,
self.release["id"],
)
def test_release_identifier_no_author(self):
self.assertEqual(
identifiers.release_identifier(self.release_no_author),
identifiers.identifier_to_str(self.release_no_author['id'])
Release.from_dict(remove_id(self.release_no_author)).id,
self.release_no_author["id"],
)
def test_release_identifier_no_message(self):
self.assertEqual(
identifiers.release_identifier(self.release_no_message),
identifiers.identifier_to_str(self.release_no_message['id'])
Release.from_dict(remove_id(self.release_no_message)).id,
self.release_no_message["id"],
)
def test_release_identifier_empty_message(self):
self.assertEqual(
identifiers.release_identifier(self.release_empty_message),
identifiers.identifier_to_str(self.release_empty_message['id'])
Release.from_dict(remove_id(self.release_empty_message)).id,
self.release_empty_message["id"],
)
def test_release_identifier_negative_utc(self):
self.assertEqual(
identifiers.release_identifier(self.release_negative_utc),
identifiers.identifier_to_str(self.release_negative_utc['id'])
Release.from_dict(remove_id(self.release_negative_utc)).id,
self.release_negative_utc["id"],
)
def test_release_identifier_newline_in_author(self):
self.assertEqual(
identifiers.release_identifier(self.release_newline_in_author),
identifiers.identifier_to_str(self.release_newline_in_author['id'])
Release.from_dict(remove_id(self.release_newline_in_author)).id,
self.release_newline_in_author["id"],
)
def test_release_identifier_snapshot_target(self):
self.assertEqual(
Release.from_dict(self.release_snapshot_target).id,
self.release_snapshot_target["id"],
)
snapshot_example = {
"id": _x("6e65b86363953b780d92b0a928f3e8fcdd10db36"),
"branches": {
b"directory": {
"target": _x("1bd0e65f7d2ff14ae994de17a1e7fe65111dcad8"),
"target_type": "directory",
},
b"content": {
"target": _x("fe95a46679d128ff167b7c55df5d02356c5a1ae1"),
"target_type": "content",
},
b"alias": {
"target": b"revision",
"target_type": "alias",
},
b"revision": {
"target": _x("aafb16d69fd30ff58afdd69036a26047f3aebdc6"),
"target_type": "revision",
},
b"release": {
"target": _x("7045404f3d1c54e6473c71bbb716529fbad4be24"),
"target_type": "release",
},
b"snapshot": {
"target": _x("1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"),
"target_type": "snapshot",
},
b"dangling": None,
},
}
class SnapshotIdentifier(unittest.TestCase):
def setUp(self):
super().setUp()
self.empty = {
'id': '1a8893e6a86f444e8be8e7bda6cb34fb1735a00e',
'branches': {},
}
self.dangling_branch = {
'id': 'c84502e821eb21ed84e9fd3ec40973abc8b32353',
'branches': {
b'HEAD': None,
},
}
self.empty = Snapshot.from_dict(
{
"id": _x("1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"),
"branches": {},
}
)
self.unresolved = {
'id': '84b4548ea486e4b0a7933fa541ff1503a0afe1e0',
'branches': {
b'foo': {
'target': b'bar',
'target_type': 'alias',
self.dangling_branch = Snapshot.from_dict(
{
"id": _x("c84502e821eb21ed84e9fd3ec40973abc8b32353"),
"branches": {
b"HEAD": None,
},
},
}
}
)
self.all_types = {
'id': '6e65b86363953b780d92b0a928f3e8fcdd10db36',
'branches': {
b'directory': {
'target': '1bd0e65f7d2ff14ae994de17a1e7fe65111dcad8',
'target_type': 'directory',
},
b'content': {
'target': 'fe95a46679d128ff167b7c55df5d02356c5a1ae1',
'target_type': 'content',
},
b'alias': {
'target': b'revision',
'target_type': 'alias',
},
b'revision': {
'target': 'aafb16d69fd30ff58afdd69036a26047f3aebdc6',
'target_type': 'revision',
self.unresolved = Snapshot.from_dict(
{
"id": _x("84b4548ea486e4b0a7933fa541ff1503a0afe1e0"),
"branches": {
b"foo": {
"target": b"bar",
"target_type": "alias",
},
},
b'release': {
'target': '7045404f3d1c54e6473c71bbb716529fbad4be24',
'target_type': 'release',
},
b'snapshot': {
'target': '1a8893e6a86f444e8be8e7bda6cb34fb1735a00e',
'target_type': 'snapshot',
},
b'dangling': None,
}
}
)
self.all_types = snapshot_example
def test_empty_snapshot(self):
self.assertEqual(
identifiers.snapshot_identifier(self.empty),
identifiers.identifier_to_str(self.empty['id']),
Snapshot.from_dict(remove_id(self.empty.to_dict())).id,
self.empty.id,
)
def test_dangling_branch(self):
self.assertEqual(
identifiers.snapshot_identifier(self.dangling_branch),
identifiers.identifier_to_str(self.dangling_branch['id']),
Snapshot.from_dict(remove_id(self.dangling_branch.to_dict())).id,
self.dangling_branch.id,
)
def test_unresolved(self):
self.assertEqual(
Snapshot.from_dict(remove_id(self.unresolved.to_dict())).id,
self.unresolved.id,
)
def test_git_object_unresolved(self):
with self.assertRaisesRegex(ValueError, "b'foo' -> b'bar'"):
identifiers.snapshot_identifier(self.unresolved)
git_objects.snapshot_git_object(self.unresolved)
git_objects.snapshot_git_object(self.unresolved, ignore_unresolved=True)
def test_unresolved_force(self):
def test_all_types(self):
self.assertEqual(
identifiers.snapshot_identifier(
self.unresolved,
ignore_unresolved=True,
Snapshot.from_dict(remove_id(self.all_types)).id,
self.all_types["id"],
)
authority_example = {
"type": "forge",
"url": "https://forge.softwareheritage.org/",
}
fetcher_example = {
"name": "swh-phabricator-metadata-fetcher",
"version": "0.0.1",
}
metadata_example = {
"target": "swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d",
"discovery_date": datetime.datetime(
2021, 1, 25, 11, 27, 51, tzinfo=datetime.timezone.utc
),
"authority": authority_example,
"fetcher": fetcher_example,
"format": "json",
"metadata": b'{"foo": "bar"}',
}
class RawExtrinsicMetadataIdentifier(unittest.TestCase):
def setUp(self):
super().setUp()
self.minimal = metadata_example
self.maximal = {
**self.minimal,
"origin": "https://forge.softwareheritage.org/source/swh-model/",
"visit": 42,
"snapshot": "swh:1:snp:" + "00" * 20,
"release": "swh:1:rel:" + "01" * 20,
"revision": "swh:1:rev:" + "02" * 20,
"path": b"/abc/def",
"directory": "swh:1:dir:" + "03" * 20,
}
def test_minimal(self):
git_object = (
b"raw_extrinsic_metadata 210\0"
b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n"
b"discovery_date 1611574071\n"
b"authority forge https://forge.softwareheritage.org/\n"
b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n"
b"format json\n"
b"\n"
b'{"foo": "bar"}'
)
self.assertEqual(
git_objects.raw_extrinsic_metadata_git_object(
RawExtrinsicMetadata.from_dict(self.minimal)
),
identifiers.identifier_to_str(self.unresolved['id']),
git_object,
)
self.assertEqual(
RawExtrinsicMetadata.from_dict(self.minimal).id,
hashlib.sha1(git_object).digest(),
)
self.assertEqual(
RawExtrinsicMetadata.from_dict(self.minimal).id,
_x("5c13f20ba336e44549baf3d7b9305b027ec9f43d"),
)
def test_maximal(self):
git_object = (
b"raw_extrinsic_metadata 533\0"
b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n"
b"discovery_date 1611574071\n"
b"authority forge https://forge.softwareheritage.org/\n"
b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n"
b"format json\n"
b"origin https://forge.softwareheritage.org/source/swh-model/\n"
b"visit 42\n"
b"snapshot swh:1:snp:0000000000000000000000000000000000000000\n"
b"release swh:1:rel:0101010101010101010101010101010101010101\n"
b"revision swh:1:rev:0202020202020202020202020202020202020202\n"
b"path /abc/def\n"
b"directory swh:1:dir:0303030303030303030303030303030303030303\n"
b"\n"
b'{"foo": "bar"}'
)
def test_all_types(self):
self.assertEqual(
identifiers.snapshot_identifier(self.all_types),
identifiers.identifier_to_str(self.all_types['id']),
)
def test_persistent_identifier(self):
_snapshot_id = hashutil.hash_to_bytes(
'c7c108084bc0bf3d81436bf980b46e98bd338453')
_release_id = '22ece559cc7cc2364edc5e5593d63ae8bd229f9f'
_revision_id = '309cf2674ee7a0749978cf8265ab91a60aea0f7d'
_directory_id = 'd198bc9d7a6bcf6db04f476d29314f157507d505'
_content_id = '94a9ed024d3859793618152ea559a168bbcbb5e2'
_snapshot = {'id': _snapshot_id}
_release = {'id': _release_id}
_revision = {'id': _revision_id}
_directory = {'id': _directory_id}
_content = {'sha1_git': _content_id}
for full_type, _hash, expected_persistent_id, version, _meta in [
(SNAPSHOT, _snapshot_id,
'swh:1:snp:c7c108084bc0bf3d81436bf980b46e98bd338453',
None, {}),
(RELEASE, _release_id,
'swh:2:rel:22ece559cc7cc2364edc5e5593d63ae8bd229f9f',
2, {}),
(REVISION, _revision_id,
'swh:1:rev:309cf2674ee7a0749978cf8265ab91a60aea0f7d',
None, {}),
(DIRECTORY, _directory_id,
'swh:1:dir:d198bc9d7a6bcf6db04f476d29314f157507d505',
None, {}),
(CONTENT, _content_id,
'swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2',
1, {}),
(SNAPSHOT, _snapshot,
'swh:1:snp:c7c108084bc0bf3d81436bf980b46e98bd338453',
None, {}),
(RELEASE, _release,
'swh:2:rel:22ece559cc7cc2364edc5e5593d63ae8bd229f9f',
2, {}),
(REVISION, _revision,
'swh:1:rev:309cf2674ee7a0749978cf8265ab91a60aea0f7d',
None, {}),
(DIRECTORY, _directory,
'swh:1:dir:d198bc9d7a6bcf6db04f476d29314f157507d505',
None, {}),
(CONTENT, _content,
'swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2',
1, {}),
(CONTENT, _content,
'swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2;origin=1',
1, {'origin': '1'}),
]:
if version:
actual_value = identifiers.persistent_identifier(
full_type, _hash, version, metadata=_meta)
else:
actual_value = identifiers.persistent_identifier(
full_type, _hash, metadata=_meta)
self.assertEqual(actual_value, expected_persistent_id)
def test_persistent_identifier_wrong_input(self):
_snapshot_id = 'notahash4bc0bf3d81436bf980b46e98bd338453'
_snapshot = {'id': _snapshot_id}
for _type, _hash, _error in [
(SNAPSHOT, _snapshot_id, 'Unexpected characters'),
(SNAPSHOT, _snapshot, 'Unexpected characters'),
('foo', '', 'Wrong input: Supported types are'),
]:
with self.assertRaisesRegex(ValidationError, _error):
identifiers.persistent_identifier(_type, _hash)
def test_parse_persistent_identifier(self):
for pid, _type, _version, _hash in [
('swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2',
CONTENT, 1, '94a9ed024d3859793618152ea559a168bbcbb5e2'),
('swh:1:dir:d198bc9d7a6bcf6db04f476d29314f157507d505',
DIRECTORY, 1, 'd198bc9d7a6bcf6db04f476d29314f157507d505'),
('swh:1:rev:309cf2674ee7a0749978cf8265ab91a60aea0f7d',
REVISION, 1, '309cf2674ee7a0749978cf8265ab91a60aea0f7d'),
('swh:1:rel:22ece559cc7cc2364edc5e5593d63ae8bd229f9f',
RELEASE, 1, '22ece559cc7cc2364edc5e5593d63ae8bd229f9f'),
('swh:1:snp:c7c108084bc0bf3d81436bf980b46e98bd338453',
SNAPSHOT, 1, 'c7c108084bc0bf3d81436bf980b46e98bd338453'),
]:
expected_result = PersistentId(
namespace='swh',
scheme_version=_version,
object_type=_type,
object_id=_hash,
metadata={}
)
actual_result = identifiers.parse_persistent_identifier(pid)
self.assertEqual(actual_result, expected_result)
for pid, _type, _version, _hash, _metadata in [
('swh:1:cnt:9c95815d9e9d91b8dae8e05d8bbc696fe19f796b;lines=1-18;origin=https://github.com/python/cpython', # noqa
CONTENT, 1, '9c95815d9e9d91b8dae8e05d8bbc696fe19f796b',
{
'lines': '1-18',
'origin': 'https://github.com/python/cpython'
}),
('swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;origin=deb://Debian/packages/linuxdoc-tools', # noqa
DIRECTORY, 1, '0b6959356d30f1a4e9b7f6bca59b9a336464c03d',
{
'origin': 'deb://Debian/packages/linuxdoc-tools'
})
]:
expected_result = PersistentId(
namespace='swh',
scheme_version=_version,
object_type=_type,
object_id=_hash,
metadata=_metadata
)
actual_result = identifiers.parse_persistent_identifier(pid)
self.assertEqual(actual_result, expected_result)
def test_parse_persistent_identifier_parsing_error(self):
for pid, _error in [
('swh:1:cnt',
'Wrong format: There should be 4 mandatory values'),
('swh:1:',
'Wrong format: There should be 4 mandatory values'),
('swh:',
'Wrong format: There should be 4 mandatory values'),
('swh:1:cnt:',
'Wrong format: Identifier should be present'),
('foo:1:cnt:abc8bc9d7a6bcf6db04f476d29314f157507d505',
'Wrong format: Supported namespace is \'swh\''),
('swh:2:dir:def8bc9d7a6bcf6db04f476d29314f157507d505',
'Wrong format: Supported version is 1'),
('swh:1:foo:fed8bc9d7a6bcf6db04f476d29314f157507d505',
'Wrong format: Supported types are %s' % (
', '.join(PERSISTENT_IDENTIFIER_TYPES))),
('swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;invalid;'
'malformed',
'Contextual data is badly formatted, form key=val expected'),
('swh:1:snp:gh6959356d30f1a4e9b7f6bca59b9a336464c03d',
'Wrong format: Identifier should be a valid hash'),
('swh:1:snp:foo',
'Wrong format: Identifier should be a valid hash')
]:
with self.assertRaisesRegex(
ValidationError, _error):
identifiers.parse_persistent_identifier(pid)
git_objects.raw_extrinsic_metadata_git_object(
RawExtrinsicMetadata.from_dict(self.maximal)
),
git_object,
)
self.assertEqual(
RawExtrinsicMetadata.from_dict(self.maximal).id,
hashlib.sha1(git_object).digest(),
)
self.assertEqual(
RawExtrinsicMetadata.from_dict(self.maximal).id,
_x("f96966e1093d15236a31fde07e47d5b1c9428049"),
)
def test_nonascii_path(self):
metadata = {
**self.minimal,
"path": b"/ab\nc/d\xf0\x9f\xa4\xb7e\x00f",
}
git_object = (
b"raw_extrinsic_metadata 231\0"
b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n"
b"discovery_date 1611574071\n"
b"authority forge https://forge.softwareheritage.org/\n"
b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n"
b"format json\n"
b"path /ab\n"
b" c/d\xf0\x9f\xa4\xb7e\x00f\n"
b"\n"
b'{"foo": "bar"}'
)
class OriginIdentifier(unittest.TestCase):
def setUp(self):
self.origin = {
'url': 'https://github.com/torvalds/linux',
self.assertEqual(
git_objects.raw_extrinsic_metadata_git_object(
RawExtrinsicMetadata.from_dict(metadata)
),
git_object,
)
self.assertEqual(
RawExtrinsicMetadata.from_dict(metadata).id,
hashlib.sha1(git_object).digest(),
)
self.assertEqual(
RawExtrinsicMetadata.from_dict(metadata).id,
_x("7cc83fd1912176510c083f5df43f01b09af4b333"),
)
def test_timezone_insensitive(self):
"""Checks the timezone of the datetime.datetime does not affect the
hashed git_object."""
utc_plus_one = datetime.timezone(datetime.timedelta(hours=1))
metadata = {
**self.minimal,
"discovery_date": datetime.datetime(
2021,
1,
25,
12,
27,
51,
tzinfo=utc_plus_one,
),
}
self.assertEqual(
git_objects.raw_extrinsic_metadata_git_object(
RawExtrinsicMetadata.from_dict(self.minimal)
),
git_objects.raw_extrinsic_metadata_git_object(
RawExtrinsicMetadata.from_dict(metadata)
),
)
self.assertEqual(
RawExtrinsicMetadata.from_dict(self.minimal).id,
RawExtrinsicMetadata.from_dict(metadata).id,
)
self.assertEqual(
RawExtrinsicMetadata.from_dict(metadata).id,
_x("5c13f20ba336e44549baf3d7b9305b027ec9f43d"),
)
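The discovery_date used here (12:27:51 at UTC+1) denotes the same instant as the minimal example's 11:27:51 UTC, and the git object only serializes the POSIX timestamp, so the identifiers match. A small stdlib-only sketch of that equivalence; the constant 1611574071 is the value serialized as discovery_date in test_minimal above:

import datetime

_utc = datetime.timezone.utc
_utc_plus_one = datetime.timezone(datetime.timedelta(hours=1))
_at_utc = datetime.datetime(2021, 1, 25, 11, 27, 51, tzinfo=_utc)
_at_plus_one = datetime.datetime(2021, 1, 25, 12, 27, 51, tzinfo=_utc_plus_one)
assert _at_utc == _at_plus_one                    # same instant
assert int(_at_utc.timestamp()) == 1611574071     # value written as discovery_date
assert int(_at_plus_one.timestamp()) == 1611574071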
def test_microsecond_insensitive(self):
"""Checks the microseconds of the datetime.datetime does not affect the
hashed manifest."""
metadata = {
**self.minimal,
"discovery_date": datetime.datetime(
2021,
1,
25,
11,
27,
51,
123456,
tzinfo=datetime.timezone.utc,
),
}
self.assertEqual(
git_objects.raw_extrinsic_metadata_git_object(
RawExtrinsicMetadata.from_dict(self.minimal)
),
git_objects.raw_extrinsic_metadata_git_object(
RawExtrinsicMetadata.from_dict(metadata)
),
)
self.assertEqual(
RawExtrinsicMetadata.from_dict(self.minimal).id,
RawExtrinsicMetadata.from_dict(metadata).id,
)
self.assertEqual(
RawExtrinsicMetadata.from_dict(metadata).id,
_x("5c13f20ba336e44549baf3d7b9305b027ec9f43d"),
)
def test_noninteger_timezone(self):
"""Checks the discovery_date is translated to UTC before truncating
microseconds"""
tz = datetime.timezone(datetime.timedelta(microseconds=-42))
metadata = {
**self.minimal,
"discovery_date": datetime.datetime(
2021,
1,
25,
11,
27,
50,
1_000_000 - 42,
tzinfo=tz,
),
}
self.assertEqual(
git_objects.raw_extrinsic_metadata_git_object(
RawExtrinsicMetadata.from_dict(self.minimal)
),
git_objects.raw_extrinsic_metadata_git_object(
RawExtrinsicMetadata.from_dict(metadata)
),
)
self.assertEqual(
RawExtrinsicMetadata.from_dict(self.minimal).id,
RawExtrinsicMetadata.from_dict(metadata).id,
)
self.assertEqual(
RawExtrinsicMetadata.from_dict(metadata).id,
_x("5c13f20ba336e44549baf3d7b9305b027ec9f43d"),
)
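With an offset of -42 microseconds, the local time 11:27:50.999958 is exactly 11:27:51.000000 UTC, so converting to UTC before dropping microseconds reproduces the minimal example's timestamp, whereas truncating first would yield 11:27:50. A stdlib-only sketch of that arithmetic; the expected epoch value is the one from test_minimal:

import datetime

_tz = datetime.timezone(datetime.timedelta(microseconds=-42))
_local = datetime.datetime(2021, 1, 25, 11, 27, 50, 1_000_000 - 42, tzinfo=_tz)
_as_utc = _local.astimezone(datetime.timezone.utc)
assert _as_utc == datetime.datetime(
    2021, 1, 25, 11, 27, 51, 0, tzinfo=datetime.timezone.utc
)
assert int(_as_utc.timestamp()) == 1611574071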
def test_negative_timestamp(self):
metadata = {
**self.minimal,
"discovery_date": datetime.datetime(
1960,
1,
25,
11,
27,
51,
tzinfo=datetime.timezone.utc,
),
}
git_object = (
b"raw_extrinsic_metadata 210\0"
b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n"
b"discovery_date -313504329\n"
b"authority forge https://forge.softwareheritage.org/\n"
b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n"
b"format json\n"
b"\n"
b'{"foo": "bar"}'
)
self.assertEqual(
git_objects.raw_extrinsic_metadata_git_object(
RawExtrinsicMetadata.from_dict(metadata)
),
git_object,
)
self.assertEqual(
RawExtrinsicMetadata.from_dict(metadata).id,
hashlib.sha1(git_object).digest(),
)
self.assertEqual(
RawExtrinsicMetadata.from_dict(metadata).id,
_x("895d0821a2991dd376ddc303424aceb7c68280f9"),
)
def test_epoch(self):
metadata = {
**self.minimal,
"discovery_date": datetime.datetime(
1970,
1,
1,
0,
0,
0,
tzinfo=datetime.timezone.utc,
),
}
git_object = (
b"raw_extrinsic_metadata 201\0"
b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n"
b"discovery_date 0\n"
b"authority forge https://forge.softwareheritage.org/\n"
b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n"
b"format json\n"
b"\n"
b'{"foo": "bar"}'
)
self.assertEqual(
git_objects.raw_extrinsic_metadata_git_object(
RawExtrinsicMetadata.from_dict(metadata)
),
git_object,
)
self.assertEqual(
RawExtrinsicMetadata.from_dict(metadata).id,
hashlib.sha1(git_object).digest(),
)
self.assertEqual(
RawExtrinsicMetadata.from_dict(metadata).id,
_x("27a53df54ace35ebd910493cdc70b334d6b7cb88"),
)
def test_negative_epoch(self):
metadata = {
**self.minimal,
"discovery_date": datetime.datetime(
1969,
12,
31,
23,
59,
59,
1,
tzinfo=datetime.timezone.utc,
),
}
git_object = (
b"raw_extrinsic_metadata 202\0"
b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n"
b"discovery_date -1\n"
b"authority forge https://forge.softwareheritage.org/\n"
b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n"
b"format json\n"
b"\n"
b'{"foo": "bar"}'
)
self.assertEqual(
git_objects.raw_extrinsic_metadata_git_object(
RawExtrinsicMetadata.from_dict(metadata)
),
git_object,
)
self.assertEqual(
RawExtrinsicMetadata.from_dict(metadata).id,
hashlib.sha1(git_object).digest(),
)
self.assertEqual(
RawExtrinsicMetadata.from_dict(metadata).id,
_x("be7154a8fd49d87f81547ea634d1e2152907d089"),
)
origin_example = {
"url": "https://github.com/torvalds/linux",
}
class OriginIdentifier(unittest.TestCase):
def test_content_identifier(self):
self.assertEqual(identifiers.origin_identifier(self.origin),
'b63a575fe3faab7692c9f38fb09d4bb45651bb0f')
self.assertEqual(
Origin.from_dict(origin_example).id,
_x("b63a575fe3faab7692c9f38fb09d4bb45651bb0f"),
)
# Format: [
# (
# input1,
# expected_output1,
# ),
# (
# input2,
# expected_output2,
# ),
# ...
# ]
TS_DICTS = [
# with current input dict format (offset_bytes)
(
{"timestamp": 12345, "offset_bytes": b"+0000"},
{
"timestamp": {"seconds": 12345, "microseconds": 0},
"offset_bytes": b"+0000",
},
),
(
{"timestamp": 12345, "offset_bytes": b"-0000"},
{
"timestamp": {"seconds": 12345, "microseconds": 0},
"offset_bytes": b"-0000",
},
),
(
{"timestamp": 12345, "offset_bytes": b"+0200"},
{
"timestamp": {"seconds": 12345, "microseconds": 0},
"offset_bytes": b"+0200",
},
),
(
{"timestamp": 12345, "offset_bytes": b"-0200"},
{
"timestamp": {"seconds": 12345, "microseconds": 0},
"offset_bytes": b"-0200",
},
),
(
{"timestamp": 12345, "offset_bytes": b"--700"},
{
"timestamp": {"seconds": 12345, "microseconds": 0},
"offset_bytes": b"--700",
},
),
(
{"timestamp": 12345, "offset_bytes": b"1234567"},
{
"timestamp": {"seconds": 12345, "microseconds": 0},
"offset_bytes": b"1234567",
},
),
# with old-style input dicts (numeric offset + optional negative_utc):
(
{"timestamp": 12345, "offset": 0},
{
"timestamp": {"seconds": 12345, "microseconds": 0},
"offset_bytes": b"+0000",
},
),
(
{"timestamp": 12345, "offset": 0, "negative_utc": False},
{
"timestamp": {"seconds": 12345, "microseconds": 0},
"offset_bytes": b"+0000",
},
),
(
{"timestamp": 12345, "offset": 0, "negative_utc": False},
{
"timestamp": {"seconds": 12345, "microseconds": 0},
"offset_bytes": b"+0000",
},
),
(
{"timestamp": 12345, "offset": 0, "negative_utc": None},
{
"timestamp": {"seconds": 12345, "microseconds": 0},
"offset_bytes": b"+0000",
},
),
(
{"timestamp": {"seconds": 12345}, "offset": 0, "negative_utc": None},
{
"timestamp": {"seconds": 12345, "microseconds": 0},
"offset_bytes": b"+0000",
},
),
(
{
"timestamp": {"seconds": 12345, "microseconds": 0},
"offset": 0,
"negative_utc": None,
},
{
"timestamp": {"seconds": 12345, "microseconds": 0},
"offset_bytes": b"+0000",
},
),
(
{
"timestamp": {"seconds": 12345, "microseconds": 100},
"offset": 0,
"negative_utc": None,
},
{
"timestamp": {"seconds": 12345, "microseconds": 100},
"offset_bytes": b"+0000",
},
),
(
{"timestamp": 12345, "offset": 0, "negative_utc": True},
{
"timestamp": {"seconds": 12345, "microseconds": 0},
"offset_bytes": b"-0000",
},
),
(
{"timestamp": 12345, "offset": 0, "negative_utc": None},
{
"timestamp": {"seconds": 12345, "microseconds": 0},
"offset_bytes": b"+0000",
},
),
]
@pytest.mark.parametrize("dict_input,expected", TS_DICTS)
def test_normalize_timestamp_dict(dict_input, expected):
assert TimestampWithTimezone.from_dict(dict_input).to_dict() == expected
TS_DICTS_INVALID_TIMESTAMP = [
{"timestamp": 1.2, "offset": 0},
{"timestamp": "1", "offset": 0},
# these below should really also trigger a ValueError...
# {"timestamp": {"seconds": "1"}, "offset": 0},
# {"timestamp": {"seconds": 1.2}, "offset": 0},
# {"timestamp": {"seconds": 1.2}, "offset": 0},
]
@pytest.mark.parametrize("dict_input", TS_DICTS_INVALID_TIMESTAMP)
def test_normalize_timestamp_dict_invalid_timestamp(dict_input):
with pytest.raises(ValueError, match="non-integer timestamp"):
TimestampWithTimezone.from_dict(dict_input)
UTC = datetime.timezone.utc
TS_TIMEZONES = [
datetime.timezone.min,
datetime.timezone(datetime.timedelta(hours=-1)),
UTC,
datetime.timezone(datetime.timedelta(minutes=+60)),
datetime.timezone.max,
]
TS_TZ_EXPECTED = [-1439, -60, 0, 60, 1439]
TS_TZ_BYTES_EXPECTED = [b"-2359", b"-0100", b"+0000", b"+0100", b"+2359"]
TS_DATETIMES = [
datetime.datetime(2020, 2, 27, 14, 39, 19, tzinfo=UTC),
datetime.datetime(2120, 12, 31, 23, 59, 59, tzinfo=UTC),
datetime.datetime(1610, 5, 14, 15, 43, 0, tzinfo=UTC),
]
TS_DT_EXPECTED = [1582814359, 4765132799, -11348929020]
@pytest.mark.parametrize("date, seconds", zip(TS_DATETIMES, TS_DT_EXPECTED))
@pytest.mark.parametrize(
"tz, offset, offset_bytes", zip(TS_TIMEZONES, TS_TZ_EXPECTED, TS_TZ_BYTES_EXPECTED)
)
@pytest.mark.parametrize("microsecond", [0, 1, 10, 100, 1000, 999999])
def test_normalize_timestamp_datetime(
date, seconds, tz, offset, offset_bytes, microsecond
):
date = date.astimezone(tz).replace(microsecond=microsecond)
assert TimestampWithTimezone.from_dict(date).to_dict() == {
"timestamp": {"seconds": seconds, "microseconds": microsecond},
"offset_bytes": offset_bytes,
}
def test_extid_identifier_bwcompat():
extid_dict = {
"extid_type": "test-type",
"extid": b"extid",
"target": "swh:1:dir:" + "00" * 20,
}
assert ExtID.from_dict(extid_dict).id == _x(
"b9295e1931c31e40a7e3e1e967decd1c89426455"
)
assert (
ExtID.from_dict({**extid_dict, "extid_version": 0}).id
== ExtID.from_dict(extid_dict).id
)
assert (
ExtID.from_dict({**extid_dict, "extid_version": 1}).id
!= ExtID.from_dict(extid_dict).id
)
assert (
ExtID.from_dict(
{
**extid_dict,
"payload_type": "test",
"payload": bytes.fromhex("257cc5642cb1a054f08cc83f2d943e56fd3ebe99"),
}
).id
!= ExtID.from_dict(extid_dict).id
)
# Copyright (C) 2017 The Software Heritage developers
# Copyright (C) 2017-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -9,28 +9,20 @@ from swh.model import merkle
class MerkleTestNode(merkle.MerkleNode):
type = 'tested_merkle_node_type'
object_type = "tested_merkle_node_type"
def __init__(self, data):
super().__init__(data)
self.compute_hash_called = 0
def compute_hash(self):
def compute_hash(self) -> bytes:
self.compute_hash_called += 1
child_data = [
child + b'=' + self[child].hash
for child in sorted(self)
]
return (
b'hash('
+ b', '.join([self.data['value']] + child_data)
+ b')'
)
child_data = [child + b"=" + self[child].hash for child in sorted(self)]
return b"hash(" + b", ".join([self.data.get("value", b"")] + child_data) + b")"
class MerkleTestLeaf(merkle.MerkleLeaf):
type = 'tested_merkle_leaf_type'
object_type = "tested_merkle_leaf_type"
def __init__(self, data):
super().__init__(data)
@@ -38,14 +30,22 @@ class MerkleTestLeaf(merkle.MerkleLeaf):
def compute_hash(self):
self.compute_hash_called += 1
return b'hash(' + self.data['value'] + b')'
return b"hash(" + self.data.get("value", b"") + b")"
class TestMerkleLeaf(unittest.TestCase):
def setUp(self):
self.data = {'value': b'value'}
self.data = {"value": b"value"}
self.instance = MerkleTestLeaf(self.data)
def test_equality(self):
leaf1 = MerkleTestLeaf(self.data)
leaf2 = MerkleTestLeaf(self.data)
leaf3 = MerkleTestLeaf({})
self.assertEqual(leaf1, leaf2)
self.assertNotEqual(leaf1, leaf3)
def test_hash(self):
self.assertEqual(self.instance.compute_hash_called, 0)
instance_hash = self.instance.hash
@@ -60,29 +60,26 @@ class TestMerkleLeaf(unittest.TestCase):
def test_collect(self):
collected = self.instance.collect()
self.assertEqual(
collected, {
self.instance.type: {
self.instance.hash: self.instance.get_data(),
},
},
collected,
{self.instance},
)
collected2 = self.instance.collect()
self.assertEqual(collected2, {})
self.assertEqual(collected2, set())
self.instance.reset_collect()
collected3 = self.instance.collect()
self.assertEqual(collected, collected3)
def test_leaf(self):
with self.assertRaisesRegex(ValueError, 'is a leaf'):
self.instance[b'key1'] = 'Test'
with self.assertRaisesRegex(ValueError, "is a leaf"):
self.instance[b"key1"] = "Test"
with self.assertRaisesRegex(ValueError, 'is a leaf'):
del self.instance[b'key1']
with self.assertRaisesRegex(ValueError, "is a leaf"):
del self.instance[b"key1"]
with self.assertRaisesRegex(ValueError, 'is a leaf'):
self.instance[b'key1']
with self.assertRaisesRegex(ValueError, "is a leaf"):
self.instance[b"key1"]
with self.assertRaisesRegex(ValueError, 'is a leaf'):
with self.assertRaisesRegex(ValueError, "is a leaf"):
self.instance.update(self.data)
@@ -90,30 +87,50 @@ class TestMerkleNode(unittest.TestCase):
maxDiff = None
def setUp(self):
self.root = MerkleTestNode({'value': b'root'})
self.nodes = {b'root': self.root}
for i in (b'a', b'b', b'c'):
value = b'root/' + i
node = MerkleTestNode({
'value': value,
})
self.root = MerkleTestNode({"value": b"root"})
self.nodes = {b"root": self.root}
for i in (b"a", b"b", b"c"):
value = b"root/" + i
node = MerkleTestNode(
{
"value": value,
}
)
self.root[i] = node
self.nodes[value] = node
for j in (b'a', b'b', b'c'):
value2 = value + b'/' + j
node2 = MerkleTestNode({
'value': value2,
})
for j in (b"a", b"b", b"c"):
value2 = value + b"/" + j
node2 = MerkleTestNode(
{
"value": value2,
}
)
node[j] = node2
self.nodes[value2] = node2
for k in (b'a', b'b', b'c'):
value3 = value2 + b'/' + j
node3 = MerkleTestNode({
'value': value3,
})
for k in (b"a", b"b", b"c"):
value3 = value2 + b"/" + j
node3 = MerkleTestNode(
{
"value": value3,
}
)
node2[j] = node3
self.nodes[value3] = node3
def test_equality(self):
node1 = MerkleTestNode({"value": b"bar"})
node2 = MerkleTestNode({"value": b"bar"})
node3 = MerkleTestNode({})
self.assertEqual(node1, node2)
self.assertNotEqual(node1, node3, node1 == node3)
node1[b"a"] = node3
self.assertNotEqual(node1, node2)
node2[b"a"] = node3
self.assertEqual(node1, node2)
def test_hash(self):
for node in self.nodes.values():
self.assertEqual(node.compute_hash_called, 0)
@@ -122,7 +139,7 @@ class TestMerkleNode(unittest.TestCase):
hash = self.root.hash
for node in self.nodes.values():
self.assertEqual(node.compute_hash_called, 1)
self.assertIn(node.data['value'], hash)
self.assertIn(node.data["value"], hash)
# Should use the cached value
hash2 = self.root.hash
@@ -137,10 +154,10 @@ class TestMerkleNode(unittest.TestCase):
self.assertEqual(node.compute_hash_called, 1)
# Force update of the cached value for a deeply nested node
self.root[b'a'][b'b'].update_hash(force=True)
self.root[b"a"][b"b"].update_hash(force=True)
for key, node in self.nodes.items():
# update_hash rehashes all children
if key.startswith(b'root/a/b'):
if key.startswith(b"root/a/b"):
self.assertEqual(node.compute_hash_called, 2)
else:
self.assertEqual(node.compute_hash_called, 1)
@@ -149,81 +166,97 @@ class TestMerkleNode(unittest.TestCase):
self.assertEqual(hash, hash4)
for key, node in self.nodes.items():
# update_hash also invalidates all parents
if key in (b'root', b'root/a') or key.startswith(b'root/a/b'):
if key in (b"root", b"root/a") or key.startswith(b"root/a/b"):
self.assertEqual(node.compute_hash_called, 2)
else:
self.assertEqual(node.compute_hash_called, 1)
def test_collect(self):
collected = self.root.collect()
self.assertEqual(len(collected[self.root.type]), len(self.nodes))
self.assertEqual(collected, set(self.nodes.values()))
for node in self.nodes.values():
self.assertTrue(node.collected)
collected2 = self.root.collect()
self.assertEqual(collected2, {})
self.assertEqual(collected2, set())
def test_iter_tree_with_deduplication(self):
nodes = list(self.root.iter_tree())
self.assertCountEqual(nodes, self.nodes.values())
def test_iter_tree_without_deduplication(self):
# duplicate existing hash in merkle tree
self.root[b"d"] = MerkleTestNode({"value": b"root/c/c/c"})
nodes_dedup = list(self.root.iter_tree())
nodes = list(self.root.iter_tree(dedup=False))
assert nodes != nodes_dedup
assert len(nodes) == len(nodes_dedup) + 1
def test_get(self):
for key in (b'a', b'b', b'c'):
self.assertEqual(self.root[key], self.nodes[b'root/' + key])
for key in (b"a", b"b", b"c"):
self.assertEqual(self.root[key], self.nodes[b"root/" + key])
with self.assertRaisesRegex(KeyError, "b'nonexistent'"):
self.root[b'nonexistent']
self.root[b"nonexistent"]
def test_del(self):
hash_root = self.root.hash
hash_a = self.nodes[b'root/a'].hash
del self.root[b'a'][b'c']
hash_a = self.nodes[b"root/a"].hash
del self.root[b"a"][b"c"]
hash_root2 = self.root.hash
hash_a2 = self.nodes[b'root/a'].hash
hash_a2 = self.nodes[b"root/a"].hash
self.assertNotEqual(hash_root, hash_root2)
self.assertNotEqual(hash_a, hash_a2)
self.assertEqual(self.nodes[b'root/a/c'].parents, [])
self.assertEqual(self.nodes[b"root/a/c"].parents, [])
with self.assertRaisesRegex(KeyError, "b'nonexistent'"):
del self.root[b'nonexistent']
del self.root[b"nonexistent"]
def test_update(self):
hash_root = self.root.hash
hash_b = self.root[b'b'].hash
hash_b = self.root[b"b"].hash
new_children = {
b'c': MerkleTestNode({'value': b'root/b/new_c'}),
b'd': MerkleTestNode({'value': b'root/b/d'}),
b"c": MerkleTestNode({"value": b"root/b/new_c"}),
b"d": MerkleTestNode({"value": b"root/b/d"}),
}
# collect all nodes
self.root.collect()
self.root[b'b'].update(new_children)
self.root[b"b"].update(new_children)
# Ensure everyone got reparented
self.assertEqual(new_children[b'c'].parents, [self.root[b'b']])
self.assertEqual(new_children[b'd'].parents, [self.root[b'b']])
self.assertEqual(self.nodes[b'root/b/c'].parents, [])
self.assertEqual(new_children[b"c"].parents, [self.root[b"b"]])
self.assertEqual(new_children[b"d"].parents, [self.root[b"b"]])
self.assertEqual(self.nodes[b"root/b/c"].parents, [])
hash_root2 = self.root.hash
self.assertNotEqual(hash_root, hash_root2)
self.assertIn(b'root/b/new_c', hash_root2)
self.assertIn(b'root/b/d', hash_root2)
self.assertIn(b"root/b/new_c", hash_root2)
self.assertIn(b"root/b/d", hash_root2)
hash_b2 = self.root[b'b'].hash
hash_b2 = self.root[b"b"].hash
self.assertNotEqual(hash_b, hash_b2)
for key, node in self.nodes.items():
if key in (b'root', b'root/b'):
if key in (b"root", b"root/b"):
self.assertEqual(node.compute_hash_called, 2)
else:
self.assertEqual(node.compute_hash_called, 1)
# Ensure we collected root, root/b, and both new children
collected_after_update = self.root.collect()
self.assertCountEqual(
collected_after_update[MerkleTestNode.type],
[self.nodes[b'root'].hash, self.nodes[b'root/b'].hash,
new_children[b'c'].hash, new_children[b'd'].hash],
self.assertEqual(
collected_after_update,
{
self.nodes[b"root"],
self.nodes[b"root/b"],
new_children[b"c"],
new_children[b"d"],
},
)
# test that noop updates don't invalidate anything
self.root[b'a'][b'b'].update({})
self.assertEqual(self.root.collect(), {})
self.root[b"a"][b"b"].update({})
self.assertEqual(self.root.collect(), set())
# Copyright (C) 2019 The Software Heritage developers
# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import collections
import copy
import datetime
import hashlib
import re
from typing import Any, List, Optional, Tuple, Union
import attr
from attrs_strict import AttributeTypeError
import dateutil
from hypothesis import given
from hypothesis.strategies import binary, none
import pytest
from swh.model.hypothesis_strategies import objects
from swh.model.collections import ImmutableDict
from swh.model.from_disk import DentryPerms
import swh.model.git_objects
from swh.model.hashutil import MultiHash, hash_to_bytes
import swh.model.hypothesis_strategies as strategies
import swh.model.model
from swh.model.model import (
BaseModel,
Content,
Directory,
DirectoryEntry,
MetadataAuthority,
MetadataAuthorityType,
MetadataFetcher,
MissingData,
ModelObjectType,
Origin,
OriginVisit,
OriginVisitStatus,
Person,
RawExtrinsicMetadata,
Release,
Revision,
SkippedContent,
Snapshot,
SnapshotBranch,
SnapshotTargetType,
Timestamp,
TimestampOverflowException,
TimestampWithTimezone,
optimized_validator,
)
import swh.model.swhids
from swh.model.swhids import CoreSWHID, ExtendedSWHID, ObjectType
from swh.model.tests.swh_model_data import TEST_OBJECTS
from swh.model.tests.test_identifiers import (
TS_DATETIMES,
TS_TIMEZONES,
directory_example,
metadata_example,
release_example,
revision_example,
snapshot_example,
)
EXAMPLE_HASH = hash_to_bytes("94a9ed024d3859793618152ea559a168bbcbb5e2")
@given(objects())
@given(
strategies.objects(
blacklist_types={
ModelObjectType.ORIGIN,
ModelObjectType.ORIGIN_VISIT,
ModelObjectType.ORIGIN_VISIT_STATUS,
}
)
)
def test_todict_inverse_fromdict(objtype_and_obj):
(obj_type, obj) = objtype_and_obj
obj_as_dict = obj.to_dict()
obj_as_dict_copy = copy.deepcopy(obj_as_dict)
@@ -24,3 +88,2184 @@ def test_todict_inverse_fromdict(objtype_and_obj):
# Check the composition of from_dict and to_dict is the identity
assert obj_as_dict == type(obj).from_dict(obj_as_dict).to_dict()
# In some cases, python-dateutil builds a `tzfile` object from the content of
# a tarball. In such cases the `tzfile._filename` attribute refers to the
# filepath within the tarball, making the __repr__ unusable. We work around
# this by replacing the tzfile with a gettz call, as the filename matches the
# timezone identifier.
#
# We detect the bogus tzfile __repr__ by checking whether the path is
# absolute: if it is not, we are in the tarball case.
RE_FIX_TZ_FILE = re.compile(r"tzfile\('([^/][^']*)'\)")
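# Example of the fix-up (illustrative): a repr containing tzfile('Europe/Paris')
# (a relative path, i.e. the tarball case) would be rewritten to
# gettz('Europe/Paris'), while tzfile('/usr/share/zoneinfo/Europe/Paris') is
# left untouched because its path is absolute.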
@given(strategies.objects())
def test_repr(objtype_and_obj):
"""Checks every model object has a working repr(), and that it can be eval()uated
(so that printed objects can be copy-pasted to write test cases.)"""
(obj_type, obj) = objtype_and_obj
r = repr(obj)
env = {
"tzutc": lambda: datetime.timezone.utc,
"tzfile": dateutil.tz.tzfile,
"gettz": dateutil.tz.gettz,
"hash_to_bytes": hash_to_bytes,
**swh.model.swhids.__dict__,
**swh.model.model.__dict__,
}
# replace bogus tzfile __repr__ on the fly
r = RE_FIX_TZ_FILE.sub(r"gettz('\1')", r)
assert eval(r, env) == obj
@attr.s
class Cls1:
pass
@attr.s
class Cls2(Cls1):
pass
_custom_namedtuple = collections.namedtuple("_custom_namedtuple", "a b")
class _custom_tuple(tuple):
pass
# List of (type, valid_values, invalid_values)
_TYPE_VALIDATOR_PARAMETERS: List[Tuple[Any, List[Any], List[Any]]] = [
# base types:
(
bool,
[True, False],
[-1, 0, 1, 42, 1000, None, "123", 0.0, (), ("foo",), ImmutableDict()],
),
(
int,
[-1, 0, 1, 42, 1000, DentryPerms.directory, True, False],
[None, "123", 0.0, (), ImmutableDict()],
),
(
float,
[-1.0, 0.0, 1.0, float("infinity"), float("NaN")],
[True, False, None, 1, "1.2", (), ImmutableDict()],
),
(
bytes,
[b"", b"123"],
[None, bytearray(b"\x12\x34"), "123", 0, 123, (), (1, 2, 3), ImmutableDict()],
),
(str, ["", "123"], [None, b"123", b"", 0, (), (1, 2, 3), ImmutableDict()]),
(None, [None], [b"", b"123", "", "foo", 0, 123, ImmutableDict(), float("NaN")]),
# unions:
(
Optional[int],
[None, -1, 0, 1, 42, 1000, DentryPerms.directory],
["123", 0.0, (), ImmutableDict()],
),
(
Optional[bytes],
[None, b"", b"123"],
["123", "", 0, (), (1, 2, 3), ImmutableDict()],
),
(
Union[str, bytes],
["", "123", b"123", b""],
[None, 0, (), (1, 2, 3), ImmutableDict()],
),
(
Union[str, bytes, None],
["", "123", b"123", b"", None],
[0, (), (1, 2, 3), ImmutableDict()],
),
# tuples
(
Tuple[str, str],
[("foo", "bar"), ("", ""), _custom_namedtuple("", ""), _custom_tuple(("", ""))],
[("foo",), ("foo", "bar", "baz"), ("foo", 42), (42, "foo")],
),
(
Tuple[bytes, bytes],
[
(b"foo", b"bar"),
(b"", b""),
_custom_namedtuple(b"", b""),
_custom_tuple((b"", b"")),
],
[(b"foo",), (b"foo", b"bar", b"baz"), (b"foo", 42), (42, b"foo")],
),
(
Tuple[str, ...],
[
("foo",),
("foo", "bar"),
("", ""),
("foo", "bar", "baz"),
_custom_namedtuple("", ""),
_custom_tuple(("", "")),
],
[("foo", 42), (42, "foo")],
),
# composite generic:
(
Tuple[Union[str, int], Union[str, int]],
[("foo", "foo"), ("foo", 42), (42, "foo"), (42, 42)],
[("foo", b"bar"), (b"bar", "foo")],
),
(
Union[Tuple[str, str], Tuple[int, int]],
[("foo", "foo"), (42, 42)],
[("foo", b"bar"), (b"bar", "foo"), ("foo", 42), (42, "foo")],
),
(
Tuple[Tuple[bytes, bytes], ...],
[(), ((b"foo", b"bar"),), ((b"foo", b"bar"), (b"baz", b"qux"))],
[((b"foo", "bar"),), ((b"foo", b"bar"), ("baz", b"qux"))],
),
# standard types:
(
datetime.datetime,
[
datetime.datetime(2021, 12, 15, 12, 59, 27),
datetime.datetime(2021, 12, 15, 12, 59, 27, tzinfo=datetime.timezone.utc),
],
[None, 123],
),
# ImmutableDict
(
ImmutableDict[str, int],
[
ImmutableDict(),
ImmutableDict({"foo": 42}),
ImmutableDict({"foo": 42, "bar": 123}),
],
[ImmutableDict({"foo": "bar"}), ImmutableDict({42: 123})],
),
# Any:
(
object,
[-1, 0, 1, 42, 1000, None, "123", 0.0, (), ImmutableDict()],
[],
),
(
Any,
[-1, 0, 1, 42, 1000, None, "123", 0.0, (), ImmutableDict()],
[],
),
(
ImmutableDict[Any, int],
[
ImmutableDict(),
ImmutableDict({"foo": 42}),
ImmutableDict({"foo": 42, "bar": 123}),
ImmutableDict({42: 123}),
],
[ImmutableDict({"foo": "bar"})],
),
(
ImmutableDict[str, Any],
[
ImmutableDict(),
ImmutableDict({"foo": 42}),
ImmutableDict({"foo": "bar"}),
ImmutableDict({"foo": 42, "bar": 123}),
],
[ImmutableDict({42: 123})],
),
# attr objects:
(
Timestamp,
[
Timestamp(seconds=123, microseconds=0),
],
[None, "2021-09-28T11:27:59", 123],
),
(
Cls1,
[Cls1(), Cls2()],
[None, b"abcd"],
),
# enums:
(
SnapshotTargetType,
[SnapshotTargetType.CONTENT, SnapshotTargetType.ALIAS],
["content", "alias", 123, None],
),
]
@pytest.mark.parametrize(
"type_,value",
[
pytest.param(type_, value, id=f"type={type_}, value={value}")
for (type_, values, _) in _TYPE_VALIDATOR_PARAMETERS
for value in values
],
)
def test_optimized_type_validator_valid(type_, value):
validator = optimized_validator(type_)
validator(None, attr.ib(type=type_), value)
@pytest.mark.parametrize(
"type_,value",
[
pytest.param(type_, value, id=f"type={type_}, value={value}")
for (type_, _, values) in _TYPE_VALIDATOR_PARAMETERS
for value in values
],
)
def test_optimized_type_validator_invalid(type_, value):
validator = optimized_validator(type_)
with pytest.raises(AttributeTypeError):
validator(None, attr.ib(type=type_), value)
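# Minimal usage sketch mirroring the two tests above (illustrative): the
# validator produced by optimized_validator() is called with the attrs
# (instance, attribute, value) signature and raises AttributeTypeError on a
# type mismatch, e.g.:
#
#   validator = optimized_validator(Optional[int])
#   validator(None, attr.ib(type=Optional[int]), 42)     # accepted
#   validator(None, attr.ib(type=Optional[int]), "42")   # raises AttributeTypeError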
@pytest.mark.parametrize("object_type, objects", TEST_OBJECTS.items())
def test_swh_model_todict_fromdict(object_type, objects):
"""checks model objects in swh_model_data are in correct shape"""
assert objects
for obj in objects:
# Check the composition of from_dict and to_dict is the identity
obj_as_dict = obj.to_dict()
assert obj == type(obj).from_dict(obj_as_dict)
assert obj_as_dict == type(obj).from_dict(obj_as_dict).to_dict()
def test_unique_key():
url = "http://example.org/"
date = datetime.datetime.now(tz=datetime.timezone.utc)
id_ = b"42" * 10
assert Origin(url=url).unique_key() == {"url": url}
assert OriginVisit(origin=url, date=date, type="git").unique_key() == {
"origin": url,
"date": str(date),
}
assert OriginVisitStatus(
origin=url, visit=42, date=date, status="created", snapshot=None
).unique_key() == {
"origin": url,
"visit": "42",
"date": str(date),
}
assert Snapshot.from_dict({**snapshot_example, "id": id_}).unique_key() == id_
assert Release.from_dict({**release_example, "id": id_}).unique_key() == id_
assert Revision.from_dict({**revision_example, "id": id_}).unique_key() == id_
assert Directory.from_dict({**directory_example, "id": id_}).unique_key() == id_
assert (
RawExtrinsicMetadata.from_dict({**metadata_example, "id": id_}).unique_key()
== id_
)
cont = Content.from_data(b"foo")
assert cont.unique_key().hex() == "0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33"
kwargs = {
**cont.to_dict(),
"reason": "foo",
"status": "absent",
}
del kwargs["data"]
assert SkippedContent(**kwargs).unique_key() == cont.hashes()
# Anonymization
@given(strategies.objects())
def test_anonymization(objtype_and_obj):
(obj_type, obj) = objtype_and_obj
def check_person(p):
if p is not None:
assert p.name is None
assert p.email is None
assert len(p.fullname) == 32
anon_obj = obj.anonymize()
if obj_type == ModelObjectType.PERSON:
assert anon_obj is not None
check_person(anon_obj)
elif obj_type == ModelObjectType.RELEASE:
assert anon_obj is not None
check_person(anon_obj.author)
elif obj_type == ModelObjectType.REVISION:
assert anon_obj is not None
check_person(anon_obj.author)
check_person(anon_obj.committer)
else:
assert anon_obj is None
# Origin, OriginVisit, OriginVisitStatus
@given(strategies.origins())
def test_todict_origins(origin):
obj = origin.to_dict()
assert "type" not in obj
assert type(origin)(url=origin.url) == type(origin).from_dict(obj)
def test_origin_long_url():
with pytest.raises(ValueError, match="Origin URL is too long"):
Origin(url="https://" + "a" * 3000)
with pytest.raises(ValueError, match="Origin URL is too long"):
Origin(url="https://example.org/" + "a" * 3050)
@given(strategies.origin_visits())
def test_todict_origin_visits(origin_visit):
obj = origin_visit.to_dict()
assert origin_visit == type(origin_visit).from_dict(obj)
def test_origin_visit_naive_datetime():
with pytest.raises(ValueError, match="must be a timezone-aware datetime"):
OriginVisit(
origin="http://foo/",
date=datetime.datetime.now(),
type="git",
)
@given(strategies.origin_visit_statuses())
def test_todict_origin_visit_statuses(origin_visit_status):
obj = origin_visit_status.to_dict()
assert origin_visit_status == type(origin_visit_status).from_dict(obj)
def test_origin_visit_status_naive_datetime():
with pytest.raises(ValueError, match="must be a timezone-aware datetime"):
OriginVisitStatus(
origin="http://foo/",
visit=42,
date=datetime.datetime.now(),
status="ongoing",
snapshot=None,
)
@pytest.fixture
def origin_visit_status_example():
tz = datetime.timezone(datetime.timedelta(minutes=+60))
return OriginVisitStatus(
origin="http://foo/",
visit=42,
date=datetime.datetime.now(tz=tz),
status="full",
snapshot=hash_to_bytes("6e65b86363953b780d92b0a928f3e8fcdd10db36"),
)
def test_origin_visit_status_snapshot_swhid(origin_visit_status_example):
assert origin_visit_status_example.snapshot_swhid() == CoreSWHID.from_string(
"swh:1:snp:6e65b86363953b780d92b0a928f3e8fcdd10db36"
)
def test_origin_visit_status_origin_swhid(origin_visit_status_example):
assert origin_visit_status_example.origin_swhid() == ExtendedSWHID.from_string(
"swh:1:ori:e0cee4b024ab93b037a1c182865942f5430c6fa4"
)
# Timestamp
@given(strategies.timestamps())
def test_timestamps_strategy(timestamp):
attr.validate(timestamp)
def test_timestamp_seconds():
attr.validate(Timestamp(seconds=0, microseconds=0))
with pytest.raises(AttributeTypeError):
Timestamp(seconds="0", microseconds=0)
attr.validate(
Timestamp(
seconds=Timestamp.MAX_SECONDS, microseconds=Timestamp.MAX_MICROSECONDS
)
)
attr.validate(
Timestamp(
seconds=Timestamp.MIN_SECONDS, microseconds=Timestamp.MIN_MICROSECONDS
)
)
with pytest.raises(TimestampOverflowException):
attr.validate(
Timestamp(
seconds=Timestamp.MAX_SECONDS + 1,
microseconds=Timestamp.MAX_MICROSECONDS,
)
)
with pytest.raises(TimestampOverflowException):
attr.validate(
Timestamp(
seconds=Timestamp.MIN_SECONDS - 1,
microseconds=Timestamp.MIN_MICROSECONDS,
)
)
with pytest.raises(TimestampOverflowException):
attr.validate(Timestamp(seconds=2**63 - 1, microseconds=0))
with pytest.raises(ValueError):
Timestamp(seconds=2**63, microseconds=0)
with pytest.raises(TimestampOverflowException):
attr.validate(Timestamp(seconds=-(2**63), microseconds=0))
with pytest.raises(TimestampOverflowException):
Timestamp(seconds=-(2**63) - 1, microseconds=0)
def test_timestamp_microseconds():
attr.validate(Timestamp(seconds=0, microseconds=0))
with pytest.raises(AttributeTypeError):
Timestamp(seconds=0, microseconds="0")
with pytest.raises(ValueError):
attr.validate(
Timestamp(
seconds=Timestamp.MAX_SECONDS,
microseconds=Timestamp.MAX_MICROSECONDS + 1,
)
)
with pytest.raises(ValueError):
attr.validate(Timestamp(seconds=0, microseconds=Timestamp.MAX_MICROSECONDS + 1))
with pytest.raises(ValueError):
attr.validate(Timestamp(seconds=0, microseconds=Timestamp.MIN_MICROSECONDS - 1))
with pytest.raises(ValueError):
attr.validate(
Timestamp(
seconds=Timestamp.MIN_SECONDS,
microseconds=Timestamp.MIN_MICROSECONDS - 1,
)
)
def test_timestamp_from_dict():
assert Timestamp.from_dict({"seconds": 10, "microseconds": 5})
with pytest.raises(AttributeTypeError):
Timestamp.from_dict({"seconds": "10", "microseconds": 5})
with pytest.raises(AttributeTypeError):
Timestamp.from_dict({"seconds": 10, "microseconds": "5"})
with pytest.raises(ValueError):
Timestamp.from_dict({"seconds": 0, "microseconds": -1})
Timestamp.from_dict({"seconds": 0, "microseconds": 10**6 - 1})
with pytest.raises(ValueError):
Timestamp.from_dict({"seconds": 0, "microseconds": 10**6})
# TimestampWithTimezone
def test_timestampwithtimezone():
ts = Timestamp(seconds=0, microseconds=0)
tstz = TimestampWithTimezone(timestamp=ts, offset_bytes=b"+0000")
attr.validate(tstz)
assert tstz.offset_minutes() == 0
assert tstz.offset_bytes == b"+0000"
tstz = TimestampWithTimezone(timestamp=ts, offset_bytes=b"+0010")
attr.validate(tstz)
assert tstz.offset_minutes() == 10
assert tstz.offset_bytes == b"+0010"
tstz = TimestampWithTimezone(timestamp=ts, offset_bytes=b"-0010")
attr.validate(tstz)
assert tstz.offset_minutes() == -10
assert tstz.offset_bytes == b"-0010"
tstz = TimestampWithTimezone(timestamp=ts, offset_bytes=b"-0000")
attr.validate(tstz)
assert tstz.offset_minutes() == 0
assert tstz.offset_bytes == b"-0000"
tstz = TimestampWithTimezone(timestamp=ts, offset_bytes=b"-1030")
attr.validate(tstz)
assert tstz.offset_minutes() == -630
assert tstz.offset_bytes == b"-1030"
tstz = TimestampWithTimezone(timestamp=ts, offset_bytes=b"+1320")
attr.validate(tstz)
assert tstz.offset_minutes() == 800
assert tstz.offset_bytes == b"+1320"
tstz = TimestampWithTimezone(timestamp=ts, offset_bytes=b"+200")
attr.validate(tstz)
assert tstz.offset_minutes() == 120
assert tstz.offset_bytes == b"+200"
tstz = TimestampWithTimezone(timestamp=ts, offset_bytes=b"+02")
attr.validate(tstz)
assert tstz.offset_minutes() == 120
assert tstz.offset_bytes == b"+02"
tstz = TimestampWithTimezone(timestamp=ts, offset_bytes=b"+2000000000")
attr.validate(tstz)
assert tstz.offset_minutes() == 0
assert tstz.offset_bytes == b"+2000000000"
with pytest.raises(AttributeTypeError):
TimestampWithTimezone(timestamp=datetime.datetime.now(), offset_bytes=b"+0000")
with pytest.raises((AttributeTypeError, TypeError)):
TimestampWithTimezone(timestamp=ts, offset_bytes=0)
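# Illustrative summary of the offset_bytes parsing exercised above (an
# assumption drawn from these assertions, not from a spec): canonical b"+HHMM"
# / b"-HHMM" offsets split into hours and minutes, shorter forms such as
# b"+200" and b"+02" still read as +2 hours, and values that do not fit the
# scheme, like b"+2000000000", are kept verbatim in offset_bytes but count as
# an offset of 0 minutes.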
def test_timestampwithtimezone_from_datetime():
# Typical case
tz = datetime.timezone(datetime.timedelta(minutes=+60))
date = datetime.datetime(2020, 2, 27, 14, 39, 19, tzinfo=tz)
tstz = TimestampWithTimezone.from_datetime(date)
assert tstz == TimestampWithTimezone(
timestamp=Timestamp(
seconds=1582810759,
microseconds=0,
),
offset_bytes=b"+0100",
)
# Typical case (close to epoch)
tz = datetime.timezone(datetime.timedelta(minutes=+60))
date = datetime.datetime(1970, 1, 1, 1, 0, 5, tzinfo=tz)
tstz = TimestampWithTimezone.from_datetime(date)
assert tstz == TimestampWithTimezone(
timestamp=Timestamp(
seconds=5,
microseconds=0,
),
offset_bytes=b"+0100",
)
# non-integer number of seconds before UNIX epoch
date = datetime.datetime(
1969, 12, 31, 23, 59, 59, 100000, tzinfo=datetime.timezone.utc
)
tstz = TimestampWithTimezone.from_datetime(date)
assert tstz == TimestampWithTimezone(
timestamp=Timestamp(
seconds=-1,
microseconds=100000,
),
offset_bytes=b"+0000",
)
# non-integer number of seconds in both the timestamp and the offset
tz = datetime.timezone(datetime.timedelta(microseconds=-600000))
date = datetime.datetime(1969, 12, 31, 23, 59, 59, 600000, tzinfo=tz)
tstz = TimestampWithTimezone.from_datetime(date)
assert tstz == TimestampWithTimezone(
timestamp=Timestamp(
seconds=0,
microseconds=200000,
),
offset_bytes=b"+0000",
)
    # timezone offset with non-integer number of seconds, for dates before epoch
    # we round down to the previous second, so it should be the same as
    # 1969-12-31T23:59:59.100000Z
tz = datetime.timezone(datetime.timedelta(microseconds=900000))
date = datetime.datetime(1970, 1, 1, 0, 0, 0, tzinfo=tz)
tstz = TimestampWithTimezone.from_datetime(date)
assert tstz == TimestampWithTimezone(
timestamp=Timestamp(
seconds=-1,
microseconds=100000,
),
offset_bytes=b"+0000",
)
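# Worked arithmetic for the last case above (illustrative): 1970-01-01T00:00:00
# at a +0.9 s offset is 1969-12-31T23:59:59.100000 UTC, i.e.
# Timestamp(seconds=-1, microseconds=100000), and the sub-minute offset itself
# collapses to b"+0000".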
def test_timestampwithtimezone_from_naive_datetime():
date = datetime.datetime(2020, 2, 27, 14, 39, 19)
with pytest.raises(ValueError, match="datetime without timezone"):
TimestampWithTimezone.from_datetime(date)
def test_timestampwithtimezone_from_iso8601():
date = "2020-02-27 14:39:19.123456+0100"
tstz = TimestampWithTimezone.from_iso8601(date)
assert tstz == TimestampWithTimezone(
timestamp=Timestamp(
seconds=1582810759,
microseconds=123456,
),
offset_bytes=b"+0100",
)
def test_timestampwithtimezone_from_iso8601_negative_utc():
date = "2020-02-27 13:39:19-0000"
tstz = TimestampWithTimezone.from_iso8601(date)
assert tstz == TimestampWithTimezone(
timestamp=Timestamp(
seconds=1582810759,
microseconds=0,
),
offset_bytes=b"-0000",
)
@pytest.mark.parametrize("date", TS_DATETIMES)
@pytest.mark.parametrize("tz", TS_TIMEZONES)
@pytest.mark.parametrize("microsecond", [0, 1, 10, 100, 1000, 999999])
def test_timestampwithtimezone_to_datetime(date, tz, microsecond):
date = date.replace(tzinfo=tz, microsecond=microsecond)
tstz = TimestampWithTimezone.from_datetime(date)
assert tstz.to_datetime() == date
assert tstz.to_datetime().utcoffset() == date.utcoffset()
def test_timestampwithtimezone_to_datetime__tz_overflow():
ts = 1582810759
date = datetime.datetime.fromtimestamp(ts, datetime.timezone.utc)
tstz = TimestampWithTimezone(
timestamp=Timestamp(seconds=ts, microseconds=0), offset_bytes=b"+9959"
)
assert tstz.to_datetime() == date
assert tstz.to_datetime().utcoffset() == date.utcoffset()
assert int(tstz.to_datetime().timestamp()) == ts
def test_person_from_fullname():
"""The author should have name, email and fullname filled."""
actual_person = Person.from_fullname(b"tony <ynot@dagobah>")
assert actual_person == Person(
fullname=b"tony <ynot@dagobah>",
name=b"tony",
email=b"ynot@dagobah",
)
def test_person_from_fullname_no_email():
"""The author and fullname should be the same as the input (author)."""
actual_person = Person.from_fullname(b"tony")
assert actual_person == Person(
fullname=b"tony",
name=b"tony",
email=None,
)
def test_person_from_fullname_empty_person():
"""Empty person has only its fullname filled with the empty
byte-string.
"""
actual_person = Person.from_fullname(b"")
assert actual_person == Person(
fullname=b"",
name=None,
email=None,
)
def test_git_author_line_to_author():
# edge case out of the way
with pytest.raises(TypeError):
Person.from_fullname(None)
tests = {
b"a <b@c.com>": Person(
name=b"a",
email=b"b@c.com",
fullname=b"a <b@c.com>",
),
b"<foo@bar.com>": Person(
name=None,
email=b"foo@bar.com",
fullname=b"<foo@bar.com>",
),
b"malformed <email": Person(
name=b"malformed", email=b"email", fullname=b"malformed <email"
),
b'malformed <"<br"@ckets>': Person(
name=b"malformed",
email=b'"<br"@ckets',
fullname=b'malformed <"<br"@ckets>',
),
b"trailing <sp@c.e> ": Person(
name=b"trailing",
email=b"sp@c.e",
fullname=b"trailing <sp@c.e> ",
),
b"no<sp@c.e>": Person(
name=b"no",
email=b"sp@c.e",
fullname=b"no<sp@c.e>",
),
b" more <sp@c.es>": Person(
name=b"more",
email=b"sp@c.es",
fullname=b" more <sp@c.es>",
),
b" <>": Person(
name=None,
email=None,
fullname=b" <>",
),
}
for person in sorted(tests):
expected_person = tests[person]
assert expected_person == Person.from_fullname(person)
def test_person_comparison():
"""Check only the fullname attribute is used to compare Person objects"""
person = Person(fullname=b"p1", name=None, email=None)
assert attr.evolve(person, name=b"toto") == person
assert attr.evolve(person, email=b"toto@example.com") == person
person = Person(fullname=b"", name=b"toto", email=b"toto@example.com")
assert attr.evolve(person, fullname=b"dude") != person
# Content
def test_content_get_hash():
hashes = dict(sha1=b"foo", sha1_git=b"bar", sha256=b"baz", blake2s256=b"qux")
c = Content(length=42, status="visible", **hashes)
for hash_name, hash_ in hashes.items():
assert c.get_hash(hash_name) == hash_
def test_content_hashes():
hashes = dict(sha1=b"foo", sha1_git=b"bar", sha256=b"baz", blake2s256=b"qux")
c = Content(length=42, status="visible", **hashes)
assert c.hashes() == hashes
def test_content_data():
c = Content(
length=42,
status="visible",
data=b"foo",
sha1=b"foo",
sha1_git=b"bar",
sha256=b"baz",
blake2s256=b"qux",
)
assert c.with_data() == c
assert c.to_dict() == {
"sha1": b"foo",
"sha1_git": b"bar",
"sha256": b"baz",
"blake2s256": b"qux",
"length": 42,
"status": "visible",
"data": b"foo",
}
def test_content_data_missing():
c = Content(
length=42,
status="visible",
sha1=b"foo",
sha1_git=b"bar",
sha256=b"baz",
blake2s256=b"qux",
)
with pytest.raises(MissingData):
c.with_data()
assert c.to_dict() == {
"sha1": b"foo",
"sha1_git": b"bar",
"sha256": b"baz",
"blake2s256": b"qux",
"length": 42,
"status": "visible",
}
@given(strategies.present_contents_d())
def test_content_from_dict(content_d):
c = Content.from_data(**content_d)
assert c
assert c.ctime == content_d["ctime"]
content_d2 = c.to_dict()
c2 = Content.from_dict(content_d2)
assert c2.ctime == c.ctime
def test_content_from_dict_str_ctime():
# test with ctime as a string
n = datetime.datetime(2020, 5, 6, 12, 34, tzinfo=datetime.timezone.utc)
content_d = {
"ctime": n.isoformat(),
"data": b"",
"length": 0,
"sha1": b"\x00",
"sha256": b"\x00",
"sha1_git": b"\x00",
"blake2s256": b"\x00",
}
c = Content.from_dict(content_d)
assert c.ctime == n
def test_content_from_dict_str_naive_ctime():
# test with ctime as a string
n = datetime.datetime(2020, 5, 6, 12, 34)
content_d = {
"ctime": n.isoformat(),
"data": b"",
"length": 0,
"sha1": b"\x00",
"sha256": b"\x00",
"sha1_git": b"\x00",
"blake2s256": b"\x00",
}
with pytest.raises(ValueError, match="must be a timezone-aware datetime."):
Content.from_dict(content_d)
@given(binary(max_size=4096))
def test_content_from_data(data):
c = Content.from_data(data)
assert c.data == data
assert c.length == len(data)
assert c.status == "visible"
for key, value in MultiHash.from_data(data).digest().items():
assert getattr(c, key) == value
@given(binary(max_size=4096))
def test_hidden_content_from_data(data):
c = Content.from_data(data, status="hidden")
assert c.data == data
assert c.length == len(data)
assert c.status == "hidden"
for key, value in MultiHash.from_data(data).digest().items():
assert getattr(c, key) == value
def test_content_naive_datetime():
c = Content.from_data(b"foo")
with pytest.raises(ValueError, match="must be a timezone-aware datetime"):
Content(
**c.to_dict(),
ctime=datetime.datetime.now(),
)
@given(strategies.present_contents())
def test_content_git_roundtrip(content):
assert content.data is not None
raw = swh.model.git_objects.content_git_object(content)
sha1_git = hashlib.new("sha1", raw).digest()
assert content.sha1_git == sha1_git
@given(strategies.present_contents())
def test_content_evolve(content):
content.check()
assert attr.evolve(content, sha1=b"\x00" * 20) == content.evolve(sha1=b"\x00" * 20)
assert attr.evolve(content, data=b"foo") == content.evolve(data=b"foo")
assert attr.evolve(content, data=None) == content.evolve(data=None)
# SkippedContent
@given(binary(max_size=4096))
def test_skipped_content_from_data(data):
c = SkippedContent.from_data(data, reason="reason")
assert c.reason == "reason"
assert c.length == len(data)
assert c.status == "absent"
for key, value in MultiHash.from_data(data).digest().items():
assert getattr(c, key) == value
@given(strategies.skipped_contents_d())
def test_skipped_content_origin_is_str(skipped_content_d):
assert SkippedContent.from_dict(skipped_content_d)
skipped_content_d["origin"] = "http://path/to/origin"
assert SkippedContent.from_dict(skipped_content_d)
skipped_content_d["origin"] = Origin(url="http://path/to/origin")
with pytest.raises(ValueError, match="origin"):
SkippedContent.from_dict(skipped_content_d)
def test_skipped_content_naive_datetime():
c = SkippedContent.from_data(b"foo", reason="reason")
with pytest.raises(ValueError, match="must be a timezone-aware datetime"):
SkippedContent(
**c.to_dict(),
ctime=datetime.datetime.now(),
)
def test_skipped_content_swhid():
skipped_content = SkippedContent.from_data(b"foo", reason="reason")
assert skipped_content.swhid() == CoreSWHID.from_string(
"swh:1:cnt:19102815663d23f8b75a47e7a01965dcdc96468c"
)
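# Illustrative note (assumption about the underlying hashing, not asserted
# here): the cnt SWHID above is derived from the sha1_git digest of b"foo",
# i.e. presumably sha1 over the git blob serialization b"blob 3\x00foo",
# giving 19102815663d23f8b75a47e7a01965dcdc96468c.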
@given(strategies.skipped_contents())
def test_skipped_content_evolve(content):
content.check()
assert attr.evolve(content, sha1=b"\x00" * 20) == content.evolve(sha1=b"\x00" * 20)
assert attr.evolve(content, sha1=None) == content.evolve(sha1=None)
# Directory
@given(strategies.directories(raw_manifest=none()))
def test_directory_check(directory):
directory.check()
directory2 = attr.evolve(directory, id=b"\x00" * 20)
with pytest.raises(ValueError, match="does not match recomputed hash"):
directory2.check()
directory2 = attr.evolve(
directory, raw_manifest=swh.model.git_objects.directory_git_object(directory)
)
with pytest.raises(
ValueError, match="non-none raw_manifest attribute, but does not need it."
):
directory2.check()
@given(strategies.directories(raw_manifest=none()))
def test_directory_raw_manifest(directory):
assert "raw_manifest" not in directory.to_dict()
raw_manifest = b"foo"
id_ = hashlib.new("sha1", raw_manifest).digest()
# Forgot to update the id -> error
directory2 = attr.evolve(directory, raw_manifest=raw_manifest)
assert directory2.to_dict()["raw_manifest"] == raw_manifest
with pytest.raises(ValueError, match="does not match recomputed hash"):
directory2.check()
# id set to the right value -> ok
directory2 = attr.evolve(directory, raw_manifest=raw_manifest, id=id_)
assert directory2.id is not None
assert directory2.id == id_ != directory.id
assert directory2.to_dict()["raw_manifest"] == raw_manifest
directory2.check()
# id implicitly set to the right value -> ok
directory3 = directory.evolve(raw_manifest=raw_manifest)
assert directory3.id is not None
assert directory3.id == id_ != directory.id
assert directory3.to_dict()["raw_manifest"] == raw_manifest
directory3.check()
@given(strategies.directories(raw_manifest=none()))
def test_directory_evolve(directory):
directory.check()
# Add an entry (while making sure it is not a duplicate)
longest_entry_name = max(
(entry.name for entry in directory.entries), key=len, default=b""
)
entries = (
*directory.entries,
DirectoryEntry(
name=longest_entry_name + b"x",
type="file",
target=b"\x00" * 20,
perms=0,
),
)
directory2 = directory.evolve(entries=entries)
assert directory2.entries == entries
assert directory2.id != directory.id, "directory.evolve() did not update the id"
directory2.check()
with pytest.raises(TypeError, match="use attr.evolve"):
directory.evolve(id=b"\x00" * 20)
with pytest.raises(TypeError, match="unexpected keyword argument"):
directory.evolve(foo=b"")
@given(strategies.directories(raw_manifest=none()))
def test_directory_evolve_raw_manifest(directory):
directory2 = directory.evolve(raw_manifest=b"123")
assert directory2 == attr.evolve(directory, id=directory2.id, raw_manifest=b"123")
directory3 = directory2.evolve(entries=())
assert directory3.raw_manifest == directory2.raw_manifest
assert (
directory3.id == directory2.id
), ".evolve() change the id despite raw_manifest being set"
assert directory3 == attr.evolve(
directory, id=directory2.id, entries=(), raw_manifest=b"123"
)
def test_directory_entry_name_validation():
with pytest.raises(ValueError, match="valid directory entry name."):
DirectoryEntry(name=b"foo/", type="dir", target=b"\x00" * 20, perms=0),
def test_directory_duplicate_entry_name():
entries = (
DirectoryEntry(name=b"foo", type="file", target=b"\x00" * 20, perms=0),
DirectoryEntry(name=b"foo", type="dir", target=b"\x01" * 20, perms=1),
)
with pytest.raises(ValueError, match="duplicated entry name"):
Directory(entries=entries)
entries = (
DirectoryEntry(name=b"foo", type="file", target=b"\x00" * 20, perms=0),
DirectoryEntry(name=b"foo", type="file", target=b"\x00" * 20, perms=0),
)
with pytest.raises(ValueError, match="duplicated entry name"):
Directory(entries=entries)
@given(strategies.directories())
def test_directory_from_possibly_duplicated_entries__no_duplicates(directory):
"""
Directory.from_possibly_duplicated_entries should return the directory
unchanged if it has no duplicated entry name.
"""
assert (False, directory) == Directory.from_possibly_duplicated_entries(
id=directory.id, entries=directory.entries, raw_manifest=directory.raw_manifest
)
assert (False, directory) == Directory.from_possibly_duplicated_entries(
entries=directory.entries, raw_manifest=directory.raw_manifest
)
@pytest.mark.parametrize("rev_first", [True, False])
def test_directory_from_possibly_duplicated_entries__rev_and_dir(rev_first):
entries = (
DirectoryEntry(name=b"foo", type="dir", target=b"\x01" * 20, perms=1),
DirectoryEntry(name=b"foo", type="rev", target=b"\x00" * 20, perms=0),
)
if rev_first:
entries = tuple(reversed(entries))
(is_corrupt, dir_) = Directory.from_possibly_duplicated_entries(entries=entries)
assert is_corrupt
assert dir_.entries == (
DirectoryEntry(name=b"foo", type="rev", target=b"\x00" * 20, perms=0),
DirectoryEntry(
name=b"foo_0101010101", type="dir", target=b"\x01" * 20, perms=1
),
)
# order is independent of 'rev_first' because it is always sorted in git order
assert dir_.raw_manifest == (
# fmt: off
b"tree 52\x00"
+ b"0 foo\x00" + b"\x00" * 20
+ b"1 foo\x00" + b"\x01" * 20
# fmt: on
)
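# Worked check of the manifest length above (illustrative): each entry is
# serialized as the perms digit, a space, the name b"foo", a NUL byte and a
# 20-byte target, i.e. 1 + 1 + 3 + 1 + 20 = 26 bytes, so the two entries
# account for the 52 announced in the b"tree 52\x00" header.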
@pytest.mark.parametrize("file_first", [True, False])
def test_directory_from_possibly_duplicated_entries__file_and_dir(file_first):
entries = (
DirectoryEntry(name=b"foo", type="dir", target=b"\x01" * 20, perms=1),
DirectoryEntry(name=b"foo", type="file", target=b"\x00" * 20, perms=0),
)
if file_first:
entries = tuple(reversed(entries))
(is_corrupt, dir_) = Directory.from_possibly_duplicated_entries(entries=entries)
assert is_corrupt
assert dir_.entries == (
DirectoryEntry(name=b"foo", type="dir", target=b"\x01" * 20, perms=1),
DirectoryEntry(
name=b"foo_0000000000", type="file", target=b"\x00" * 20, perms=0
),
)
# order is independent of 'file_first' because it is always sorted in git order
assert dir_.raw_manifest == (
# fmt: off
b"tree 52\x00"
+ b"0 foo\x00" + b"\x00" * 20
+ b"1 foo\x00" + b"\x01" * 20
# fmt: on
)
def test_directory_from_possibly_duplicated_entries__two_files1():
entries = (
DirectoryEntry(name=b"foo", type="file", target=b"\x01" * 20, perms=1),
DirectoryEntry(name=b"foo", type="file", target=b"\x00" * 20, perms=0),
)
(is_corrupt, dir_) = Directory.from_possibly_duplicated_entries(entries=entries)
assert is_corrupt
assert dir_.entries == (
DirectoryEntry(name=b"foo", type="file", target=b"\x01" * 20, perms=1),
DirectoryEntry(
name=b"foo_0000000000", type="file", target=b"\x00" * 20, perms=0
),
)
assert dir_.raw_manifest == (
# fmt: off
b"tree 52\x00"
+ b"1 foo\x00" + b"\x01" * 20
+ b"0 foo\x00" + b"\x00" * 20
# fmt: on
)
def test_directory_from_possibly_duplicated_entries__two_files2():
"""
Same as above, but entries are in a different order (and order matters
to break the tie)
"""
entries = (
DirectoryEntry(name=b"foo", type="file", target=b"\x00" * 20, perms=0),
DirectoryEntry(name=b"foo", type="file", target=b"\x01" * 20, perms=1),
)
(is_corrupt, dir_) = Directory.from_possibly_duplicated_entries(entries=entries)
assert is_corrupt
assert dir_.entries == (
DirectoryEntry(name=b"foo", type="file", target=b"\x00" * 20, perms=0),
DirectoryEntry(
name=b"foo_0101010101", type="file", target=b"\x01" * 20, perms=1
),
)
assert dir_.raw_manifest == (
# fmt: off
b"tree 52\x00"
+ b"0 foo\x00" + b"\x00" * 20
+ b"1 foo\x00" + b"\x01" * 20
# fmt: on
)
def test_directory_from_possibly_duplicated_entries__preserve_manifest():
entries = (
DirectoryEntry(name=b"foo", type="dir", target=b"\x01" * 20, perms=1),
DirectoryEntry(name=b"foo", type="rev", target=b"\x00" * 20, perms=0),
)
(is_corrupt, dir_) = Directory.from_possibly_duplicated_entries(
entries=entries, raw_manifest=b"blah"
)
assert is_corrupt
assert dir_.entries == (
DirectoryEntry(name=b"foo", type="rev", target=b"\x00" * 20, perms=0),
DirectoryEntry(
name=b"foo_0101010101", type="dir", target=b"\x01" * 20, perms=1
),
)
assert dir_.raw_manifest == b"blah"
@pytest.fixture
def directory_with_every_possible_type():
return Directory.from_dict(
{
"entries": [
{
"type": "file",
"perms": 33188,
"name": b"README",
"target": hash_to_bytes("37ec8ea2110c0b7a32fbb0e872f6e7debbf95e21"),
},
{
"type": "dir",
"perms": 16384,
"name": b"src",
"target": hash_to_bytes("61e6e867f5d7ba3b40540869bc050b0c4fed9e95"),
},
{
"type": "rev",
"perms": 57344,
"name": b"submodule",
"target": hash_to_bytes("3d531e169db92a16a9a8974f0ae6edf52e52659e"),
},
],
}
)
def test_directory_entry_swhids(directory_with_every_possible_type):
assert [entry.swhid() for entry in directory_with_every_possible_type.entries] == [
CoreSWHID.from_string("swh:1:cnt:37ec8ea2110c0b7a32fbb0e872f6e7debbf95e21"),
CoreSWHID.from_string("swh:1:dir:61e6e867f5d7ba3b40540869bc050b0c4fed9e95"),
CoreSWHID.from_string("swh:1:rev:3d531e169db92a16a9a8974f0ae6edf52e52659e"),
]
# Release
@given(strategies.releases(raw_manifest=none()))
def test_release_check(release):
release.check()
release2 = attr.evolve(release, id=b"\x00" * 20)
with pytest.raises(ValueError, match="does not match recomputed hash"):
release2.check()
release2 = attr.evolve(
release, raw_manifest=swh.model.git_objects.release_git_object(release)
)
with pytest.raises(
ValueError, match="non-none raw_manifest attribute, but does not need it."
):
release2.check()
@given(strategies.releases(raw_manifest=none()))
def test_release_raw_manifest(release):
raw_manifest = b"foo"
id_ = hashlib.new("sha1", raw_manifest).digest()
release2 = attr.evolve(release, raw_manifest=raw_manifest)
assert release2.to_dict()["raw_manifest"] == raw_manifest
with pytest.raises(ValueError, match="does not match recomputed hash"):
release2.check()
release2 = attr.evolve(release, raw_manifest=raw_manifest, id=id_)
assert release2.id is not None
assert release2.id == id_ != release.id
assert release2.to_dict()["raw_manifest"] == raw_manifest
release2.check()
def test_release_target_swhid():
release = Release.from_dict(release_example)
assert release.target_swhid() == CoreSWHID.from_string(
"swh:1:rev:741b2252a5e14d6c60a913c77a6099abe73a854a"
)
@given(strategies.releases(raw_manifest=none()))
def test_release_evolve(release):
release.check()
message = (release.message or b"abc") + b"\n"
release2 = release.evolve(message=message)
assert release2.message == message
assert release2.id != release.id, "release.evolve() did not update the id"
release2.check()
release2 = release.evolve(message=None)
assert release2.message is None
if release.message is None:
assert release2.id == release.id, "no-op release.evolve() updated the id"
else:
assert release2.id != release.id, "release.evolve() did not update the id"
release2.check()
with pytest.raises(TypeError, match="use attr.evolve"):
release.evolve(id=b"\x00" * 20)
with pytest.raises(TypeError, match="unexpected keyword argument"):
release.evolve(foo=b"")
@given(strategies.releases(raw_manifest=none()))
def test_release_evolve_raw_manifest(release):
release2 = release.evolve(raw_manifest=b"123")
assert release2 == attr.evolve(release, id=release2.id, raw_manifest=b"123")
release3 = release2.evolve(message=None)
assert release3.raw_manifest == release2.raw_manifest
assert (
release3.id == release2.id
), ".evolve() change the id despite raw_manifest being set"
assert release3 == attr.evolve(
release, id=release2.id, message=None, raw_manifest=b"123"
)
# Revision
@given(strategies.revisions(raw_manifest=none()))
def test_revision_check(revision):
revision.check()
revision2 = attr.evolve(revision, id=b"\x00" * 20)
with pytest.raises(ValueError, match="does not match recomputed hash"):
revision2.check()
revision2 = attr.evolve(
revision, raw_manifest=swh.model.git_objects.revision_git_object(revision)
)
with pytest.raises(
ValueError, match="non-none raw_manifest attribute, but does not need it."
):
revision2.check()
@given(strategies.revisions(raw_manifest=none()))
def test_revision_raw_manifest(revision):
raw_manifest = b"foo"
id_ = hashlib.new("sha1", raw_manifest).digest()
revision2 = attr.evolve(revision, raw_manifest=raw_manifest)
assert revision2.to_dict()["raw_manifest"] == raw_manifest
with pytest.raises(ValueError, match="does not match recomputed hash"):
revision2.check()
revision2 = attr.evolve(revision, raw_manifest=raw_manifest, id=id_)
assert revision2.id is not None
assert revision2.id == id_ != revision.id
assert revision2.to_dict()["raw_manifest"] == raw_manifest
revision2.check()
def test_revision_extra_headers_no_headers():
rev_dict = revision_example.copy()
rev_dict.pop("id")
rev = Revision.from_dict(rev_dict)
rev_dict = attr.asdict(rev, recurse=False)
rev_model = Revision(**rev_dict)
assert rev_model.metadata is None
assert rev_model.extra_headers == ()
rev_dict["metadata"] = {
"something": "somewhere",
"some other thing": "stranger",
}
rev_model = Revision(**rev_dict)
assert rev_model.metadata == rev_dict["metadata"]
assert rev_model.extra_headers == ()
def test_revision_extra_headers_with_headers():
rev_dict = revision_example.copy()
rev_dict.pop("id")
rev = Revision.from_dict(rev_dict)
rev_dict = attr.asdict(rev, recurse=False)
rev_dict["metadata"] = {
"something": "somewhere",
"some other thing": "stranger",
}
extra_headers = (
(b"header1", b"value1"),
(b"header2", b"42"),
(b"header3", b"should I?\x00"),
(b"header1", b"again"),
)
rev_dict["extra_headers"] = extra_headers
rev_model = Revision(**rev_dict)
assert "extra_headers" not in rev_model.metadata
assert rev_model.extra_headers == extra_headers
def test_revision_extra_headers_in_metadata():
rev_dict = revision_example.copy()
rev_dict.pop("id")
rev = Revision.from_dict(rev_dict)
rev_dict = attr.asdict(rev, recurse=False)
rev_dict["metadata"] = {
"something": "somewhere",
"some other thing": "stranger",
}
extra_headers = (
(b"header1", b"value1"),
(b"header2", b"42"),
(b"header3", b"should I?\x00"),
(b"header1", b"again"),
)
# check the bw-compat init hook does the job
# i.e. extra_headers are given in the metadata field
rev_dict["metadata"]["extra_headers"] = extra_headers
rev_model = Revision(**rev_dict)
assert "extra_headers" not in rev_model.metadata
assert rev_model.extra_headers == extra_headers
def test_revision_extra_headers_as_lists():
rev_dict = revision_example.copy()
rev_dict.pop("id")
rev = Revision.from_dict(rev_dict)
rev_dict = attr.asdict(rev, recurse=False)
rev_dict["metadata"] = {}
extra_headers = (
(b"header1", b"value1"),
(b"header2", b"42"),
(b"header3", b"should I?\x00"),
(b"header1", b"again"),
)
# check Revision.extra_headers tuplify does the job
rev_dict["extra_headers"] = [list(x) for x in extra_headers]
rev_model = Revision(**rev_dict)
assert "extra_headers" not in rev_model.metadata
assert rev_model.extra_headers == extra_headers
def test_revision_extra_headers_type_error():
rev_dict = revision_example.copy()
rev_dict.pop("id")
rev = Revision.from_dict(rev_dict)
orig_rev_dict = attr.asdict(rev, recurse=False)
orig_rev_dict["metadata"] = {
"something": "somewhere",
"some other thing": "stranger",
}
extra_headers = (
("header1", b"value1"),
(b"header2", 42),
("header1", "again"),
)
# check headers one at a time
# if given as extra_header
for extra_header in extra_headers:
rev_dict = copy.deepcopy(orig_rev_dict)
rev_dict["extra_headers"] = (extra_header,)
with pytest.raises(AttributeTypeError):
Revision(**rev_dict)
# if given as metadata
for extra_header in extra_headers:
rev_dict = copy.deepcopy(orig_rev_dict)
rev_dict["metadata"]["extra_headers"] = (extra_header,)
with pytest.raises(AttributeTypeError):
Revision(**rev_dict)
def test_revision_extra_headers_from_dict():
rev_dict = revision_example.copy()
rev_dict.pop("id")
rev_model = Revision.from_dict(rev_dict)
assert rev_model.metadata is None
assert rev_model.extra_headers == ()
rev_dict["metadata"] = {
"something": "somewhere",
"some other thing": "stranger",
}
rev_model = Revision.from_dict(rev_dict)
assert rev_model.metadata == rev_dict["metadata"]
assert rev_model.extra_headers == ()
extra_headers = (
(b"header1", b"value1"),
(b"header2", b"42"),
(b"header3", b"should I?\nmaybe\x00\xff"),
(b"header1", b"again"),
)
rev_dict["extra_headers"] = extra_headers
rev_model = Revision.from_dict(rev_dict)
assert "extra_headers" not in rev_model.metadata
assert rev_model.extra_headers == extra_headers
def test_revision_extra_headers_in_metadata_from_dict():
rev_dict = revision_example.copy()
rev_dict.pop("id")
rev_dict["metadata"] = {
"something": "somewhere",
"some other thing": "stranger",
}
extra_headers = (
(b"header1", b"value1"),
(b"header2", b"42"),
(b"header3", b"should I?\nmaybe\x00\xff"),
(b"header1", b"again"),
)
# check the bw-compat init hook does the job
rev_dict["metadata"]["extra_headers"] = extra_headers
rev_model = Revision.from_dict(rev_dict)
assert "extra_headers" not in rev_model.metadata
assert rev_model.extra_headers == extra_headers
def test_revision_extra_headers_as_lists_from_dict():
rev_dict = revision_example.copy()
rev_dict.pop("id")
rev_model = Revision.from_dict(rev_dict)
rev_dict["metadata"] = {
"something": "somewhere",
"some other thing": "stranger",
}
extra_headers = (
(b"header1", b"value1"),
(b"header2", b"42"),
(b"header3", b"should I?\nmaybe\x00\xff"),
(b"header1", b"again"),
)
# check Revision.extra_headers converter does the job
rev_dict["extra_headers"] = [list(x) for x in extra_headers]
rev_model = Revision.from_dict(rev_dict)
assert "extra_headers" not in rev_model.metadata
assert rev_model.extra_headers == extra_headers
def test_revision_no_author_or_committer_from_dict():
rev_dict = revision_example.copy()
rev_dict["author"] = rev_dict["date"] = None
rev_dict["committer"] = rev_dict["committer_date"] = None
rev_model = Revision.from_dict(rev_dict)
assert rev_model.to_dict() == {
**rev_dict,
"parents": tuple(rev_dict["parents"]),
"extra_headers": (),
"metadata": None,
}
def test_revision_none_author_or_committer():
rev_dict = revision_example.copy()
rev_dict["author"] = None
with pytest.raises(ValueError, match=".*date must be None if author is None.*"):
Revision.from_dict(rev_dict)
rev_dict = revision_example.copy()
rev_dict["committer"] = None
with pytest.raises(
ValueError, match=".*committer_date must be None if committer is None.*"
):
Revision.from_dict(rev_dict)
def test_revision_directory_swhid():
revision = Revision.from_dict(revision_example)
assert revision.directory_swhid() == CoreSWHID.from_string(
"swh:1:dir:85a74718d377195e1efd0843ba4f3260bad4fe07"
)
def test_revision_parent_swhids():
revision_d = copy.deepcopy(revision_example)
revision_d["parents"].append(
hash_to_bytes("b2a7e1260492e344fab3cbf91bc13c91e05426fd")
)
revision = Revision.from_dict(revision_d)
assert revision.parent_swhids() == [
CoreSWHID.from_string("swh:1:rev:01e2d0627a9a6edb24c37db45db5ecb31e9de808"),
CoreSWHID.from_string("swh:1:rev:b2a7e1260492e344fab3cbf91bc13c91e05426fd"),
]
@pytest.fixture
def snapshot_with_all_types():
return Snapshot.from_dict(snapshot_example)
def test_snapshot_branch_swhids(snapshot_with_all_types):
assert {
name: branch and branch.swhid()
for (name, branch) in snapshot_with_all_types.branches.items()
} == {
b"directory": CoreSWHID.from_string(
"swh:1:dir:1bd0e65f7d2ff14ae994de17a1e7fe65111dcad8"
),
b"content": CoreSWHID.from_string(
"swh:1:cnt:fe95a46679d128ff167b7c55df5d02356c5a1ae1"
),
b"alias": None,
b"revision": CoreSWHID.from_string(
"swh:1:rev:aafb16d69fd30ff58afdd69036a26047f3aebdc6"
),
b"release": CoreSWHID.from_string(
"swh:1:rel:7045404f3d1c54e6473c71bbb716529fbad4be24"
),
b"snapshot": CoreSWHID.from_string(
"swh:1:snp:1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"
),
b"dangling": None,
}
@given(strategies.snapshots())
def test_snapshot_evolve(snapshot):
snapshot.check()
# Add an entry (while making sure it is not a duplicate)
longest_branch_name = max(snapshot.branches, key=len, default=b"")
branches = {
**snapshot.branches,
longest_branch_name
+ b"x": SnapshotBranch(
target_type=SnapshotTargetType.RELEASE,
target=b"\x00" * 20,
),
}
snapshot2 = snapshot.evolve(branches=branches)
assert snapshot2.branches == branches
assert snapshot2.id != snapshot.id, "snapshot.evolve() did not update the id"
snapshot2.check()
with pytest.raises(TypeError, match="use attr.evolve"):
snapshot.evolve(id=b"\x00" * 20)
with pytest.raises(TypeError, match="unexpected keyword argument"):
snapshot.evolve(foo=b"")
@given(strategies.revisions(raw_manifest=none()))
def test_revision_evolve(revision):
revision.check()
message = (revision.message or b"abc") + b"\n"
revision2 = revision.evolve(message=message)
assert revision2.message == message
assert revision2.id != revision.id, "revision.evolve() did not update the id"
revision2.check()
revision2 = revision.evolve(message=None)
assert revision2.message is None
if revision.message is None:
assert revision2.id == revision.id, "no-op revision.evolve() updated the id"
else:
assert revision2.id != revision.id, "revision.evolve() did not update the id"
revision2.check()
with pytest.raises(TypeError, match="use attr.evolve"):
revision.evolve(id=b"\x00" * 20)
with pytest.raises(TypeError, match="unexpected keyword argument"):
revision.evolve(foo=b"")
@given(strategies.revisions(raw_manifest=none()))
def test_revision_evolve_raw_manifest(revision):
revision2 = revision.evolve(raw_manifest=b"123")
assert revision2 == attr.evolve(revision, id=revision2.id, raw_manifest=b"123")
revision3 = revision2.evolve(message=None)
assert revision3.raw_manifest == revision2.raw_manifest
assert (
revision3.id == revision2.id
), ".evolve() change the id despite raw_manifest being set"
assert revision3 == attr.evolve(
revision, id=revision2.id, message=None, raw_manifest=b"123"
)
@given(strategies.objects(split_content=True))
def test_object_type(objtype_and_obj):
obj_type, obj = objtype_and_obj
assert obj_type == obj.object_type
def test_object_type_is_final():
checked_classes = set()
object_types = set()
def check_final(cls):
if cls in checked_classes:
return
checked_classes.add(cls)
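        # `object_type` may be declared as an abstract method on intermediate base
        # classes; treat it as missing so that only concrete model classes are
        # required to be final below.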
obj_type = sentinel = object()
obj_type = getattr(cls, "object_type", sentinel)
if getattr(obj_type, "__isabstractmethod__", False):
obj_type = sentinel
if obj_type is sentinel:
assert cls.__subclasses__()
else:
assert not cls.__subclasses__()
assert cls.object_type not in object_types
object_types.add(cls.object_type)
for subcls in cls.__subclasses__():
check_final(subcls)
check_final(BaseModel)
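# Shared fixtures for the RawExtrinsicMetadata tests below: a dummy authority and
# fetcher, example targets, and the constructor fields common to every test.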
_metadata_authority = MetadataAuthority(
type=MetadataAuthorityType.FORGE,
url="https://forge.softwareheritage.org",
)
_metadata_fetcher = MetadataFetcher(
name="test-fetcher",
version="0.0.1",
)
_content_swhid = ExtendedSWHID.from_string(
"swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2"
)
_origin_url = "https://forge.softwareheritage.org/source/swh-model.git"
_origin_swhid = ExtendedSWHID.from_string(
"swh:1:ori:433b4f5612f0720ed51fa7aeaf43a3625870057b"
)
_dummy_qualifiers = {"origin": "https://example.com", "lines": "42"}
_common_metadata_fields = dict(
discovery_date=datetime.datetime(
2021, 1, 29, 13, 57, 9, tzinfo=datetime.timezone.utc
),
authority=_metadata_authority,
fetcher=_metadata_fetcher,
format="json",
metadata=b'{"origin": "https://example.com", "lines": "42"}',
)
def test_metadata_valid():
"""Checks valid RawExtrinsicMetadata objects don't raise an error."""
# Simplest case
RawExtrinsicMetadata(target=_origin_swhid, **_common_metadata_fields)
# Object with an SWHID
RawExtrinsicMetadata(
target=_content_swhid,
**_common_metadata_fields,
)
def test_metadata_from_old_dict():
common_fields = {
"authority": {"type": "forge", "url": "https://forge.softwareheritage.org"},
"fetcher": {
"name": "test-fetcher",
"version": "0.0.1",
},
"discovery_date": _common_metadata_fields["discovery_date"],
"format": "json",
"metadata": b'{"origin": "https://example.com", "lines": "42"}',
}
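    # The legacy dict format uses a "type" key and, for origins, the bare URL as
    # the target; from_dict() must still parse it into the same object.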
m = RawExtrinsicMetadata(
target=_origin_swhid,
**_common_metadata_fields,
)
assert (
RawExtrinsicMetadata.from_dict(
{"id": m.id, "target": _origin_url, "type": "origin", **common_fields}
)
== m
)
m = RawExtrinsicMetadata(
target=_content_swhid,
**_common_metadata_fields,
)
assert (
RawExtrinsicMetadata.from_dict(
{"target": str(_content_swhid), "type": "content", **common_fields}
)
== m
)
def test_metadata_to_dict():
"""Checks valid RawExtrinsicMetadata objects don't raise an error."""
common_fields = {
"authority": {"type": "forge", "url": "https://forge.softwareheritage.org"},
"fetcher": {
"name": "test-fetcher",
"version": "0.0.1",
},
"discovery_date": _common_metadata_fields["discovery_date"],
"format": "json",
"metadata": b'{"origin": "https://example.com", "lines": "42"}',
}
m = RawExtrinsicMetadata(
target=_origin_swhid,
**_common_metadata_fields,
)
assert m.to_dict() == {
"target": str(_origin_swhid),
"id": b"\xa3)q\x0f\xf7p\xc7\xb0\\O\xe8\x84\x83Z\xb0]\x81\xe9\x95\x13",
**common_fields,
}
assert RawExtrinsicMetadata.from_dict(m.to_dict()) == m
m = RawExtrinsicMetadata(
target=_content_swhid,
**_common_metadata_fields,
)
assert m.to_dict() == {
"target": "swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2",
"id": b"\xbc\xa3U\xddf\x19U\xc5\xd2\xd7\xdfK\xd7c\x1f\xa8\xfeh\x992",
**common_fields,
}
assert RawExtrinsicMetadata.from_dict(m.to_dict()) == m
hash_hex = "6162" * 10
hash_bin = b"ab" * 10
m = RawExtrinsicMetadata(
target=_content_swhid,
**_common_metadata_fields,
origin="https://example.org/",
snapshot=CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=hash_bin),
release=CoreSWHID(object_type=ObjectType.RELEASE, object_id=hash_bin),
revision=CoreSWHID(object_type=ObjectType.REVISION, object_id=hash_bin),
path=b"/foo/bar",
directory=CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=hash_bin),
)
assert m.to_dict() == {
"target": "swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2",
"id": b"\x14l\xb0\x1f\xb9\xc0{)\xc7\x0f\xbd\xc0*,YZ\xf5C\xab\xfc",
**common_fields,
"origin": "https://example.org/",
"snapshot": f"swh:1:snp:{hash_hex}",
"release": f"swh:1:rel:{hash_hex}",
"revision": f"swh:1:rev:{hash_hex}",
"path": b"/foo/bar",
"directory": f"swh:1:dir:{hash_hex}",
}
assert RawExtrinsicMetadata.from_dict(m.to_dict()) == m
def test_metadata_invalid_target():
"""Checks various invalid values for the 'target' field."""
# SWHID passed as string instead of SWHID
with pytest.raises(ValueError, match="target must be.*ExtendedSWHID"):
RawExtrinsicMetadata(
target="swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2",
**_common_metadata_fields,
)
def test_metadata_naive_datetime():
with pytest.raises(ValueError, match="must be a timezone-aware datetime"):
RawExtrinsicMetadata(
target=_origin_swhid,
**{**_common_metadata_fields, "discovery_date": datetime.datetime.now()},
)
def test_metadata_validate_context_origin():
"""Checks validation of RawExtrinsicMetadata.origin."""
# Origins can't have an 'origin' context
with pytest.raises(
ValueError, match="Unexpected 'origin' context for origin object"
):
RawExtrinsicMetadata(
target=_origin_swhid,
origin=_origin_url,
**_common_metadata_fields,
)
# but all other types can
RawExtrinsicMetadata(
target=_content_swhid,
origin=_origin_url,
**_common_metadata_fields,
)
# SWHIDs aren't valid origin URLs
with pytest.raises(ValueError, match="SWHID used as context origin URL"):
RawExtrinsicMetadata(
target=_content_swhid,
origin="swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2",
**_common_metadata_fields,
)
def test_metadata_validate_context_visit():
"""Checks validation of RawExtrinsicMetadata.visit."""
# Origins can't have a 'visit' context
with pytest.raises(
ValueError, match="Unexpected 'visit' context for origin object"
):
RawExtrinsicMetadata(
target=_origin_swhid,
visit=42,
**_common_metadata_fields,
)
# but all other types can
RawExtrinsicMetadata(
target=_content_swhid,
origin=_origin_url,
visit=42,
**_common_metadata_fields,
)
# Missing 'origin'
with pytest.raises(ValueError, match="'origin' context must be set if 'visit' is"):
RawExtrinsicMetadata(
target=_content_swhid,
visit=42,
**_common_metadata_fields,
)
# visit id must be positive
with pytest.raises(ValueError, match="Nonpositive visit id"):
RawExtrinsicMetadata(
target=_content_swhid,
origin=_origin_url,
visit=-42,
**_common_metadata_fields,
)
def test_metadata_validate_context_snapshot():
"""Checks validation of RawExtrinsicMetadata.snapshot."""
# Origins can't have a 'snapshot' context
with pytest.raises(
ValueError, match="Unexpected 'snapshot' context for origin object"
):
RawExtrinsicMetadata(
target=_origin_swhid,
snapshot=CoreSWHID(
object_type=ObjectType.SNAPSHOT,
object_id=EXAMPLE_HASH,
),
**_common_metadata_fields,
)
# but content can
RawExtrinsicMetadata(
target=_content_swhid,
snapshot=CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=EXAMPLE_HASH),
**_common_metadata_fields,
)
# SWHID type doesn't match the expected type of this context key
with pytest.raises(
ValueError, match="Expected SWHID type 'snapshot', got 'content'"
):
RawExtrinsicMetadata(
target=_content_swhid,
snapshot=CoreSWHID(
object_type=ObjectType.CONTENT,
object_id=EXAMPLE_HASH,
),
**_common_metadata_fields,
)
def test_metadata_validate_context_release():
"""Checks validation of RawExtrinsicMetadata.release."""
# Origins can't have a 'release' context
with pytest.raises(
ValueError, match="Unexpected 'release' context for origin object"
):
RawExtrinsicMetadata(
target=_origin_swhid,
release=CoreSWHID(
object_type=ObjectType.RELEASE,
object_id=EXAMPLE_HASH,
),
**_common_metadata_fields,
)
# but content can
RawExtrinsicMetadata(
target=_content_swhid,
release=CoreSWHID(object_type=ObjectType.RELEASE, object_id=EXAMPLE_HASH),
**_common_metadata_fields,
)
# SWHID type doesn't match the expected type of this context key
with pytest.raises(
ValueError, match="Expected SWHID type 'release', got 'content'"
):
RawExtrinsicMetadata(
target=_content_swhid,
release=CoreSWHID(
object_type=ObjectType.CONTENT,
object_id=EXAMPLE_HASH,
),
**_common_metadata_fields,
)
def test_metadata_validate_context_revision():
"""Checks validation of RawExtrinsicMetadata.revision."""
# Origins can't have a 'revision' context
with pytest.raises(
ValueError, match="Unexpected 'revision' context for origin object"
):
RawExtrinsicMetadata(
target=_origin_swhid,
revision=CoreSWHID(
object_type=ObjectType.REVISION,
object_id=EXAMPLE_HASH,
),
**_common_metadata_fields,
)
# but content can
RawExtrinsicMetadata(
target=_content_swhid,
revision=CoreSWHID(object_type=ObjectType.REVISION, object_id=EXAMPLE_HASH),
**_common_metadata_fields,
)
# SWHID type doesn't match the expected type of this context key
with pytest.raises(
ValueError, match="Expected SWHID type 'revision', got 'content'"
):
RawExtrinsicMetadata(
target=_content_swhid,
revision=CoreSWHID(
object_type=ObjectType.CONTENT,
object_id=EXAMPLE_HASH,
),
**_common_metadata_fields,
)
def test_metadata_validate_context_path():
"""Checks validation of RawExtrinsicMetadata.path."""
# Origins can't have a 'path' context
with pytest.raises(ValueError, match="Unexpected 'path' context for origin object"):
RawExtrinsicMetadata(
target=_origin_swhid,
path=b"/foo/bar",
**_common_metadata_fields,
)
# but content can
RawExtrinsicMetadata(
target=_content_swhid,
path=b"/foo/bar",
**_common_metadata_fields,
)
def test_metadata_validate_context_directory():
"""Checks validation of RawExtrinsicMetadata.directory."""
# Origins can't have a 'directory' context
with pytest.raises(
ValueError, match="Unexpected 'directory' context for origin object"
):
RawExtrinsicMetadata(
target=_origin_swhid,
directory=CoreSWHID(
object_type=ObjectType.DIRECTORY,
object_id=EXAMPLE_HASH,
),
**_common_metadata_fields,
)
# but content can
RawExtrinsicMetadata(
target=_content_swhid,
directory=CoreSWHID(
object_type=ObjectType.DIRECTORY,
object_id=EXAMPLE_HASH,
),
**_common_metadata_fields,
)
# SWHID type doesn't match the expected type of this context key
with pytest.raises(
ValueError, match="Expected SWHID type 'directory', got 'content'"
):
RawExtrinsicMetadata(
target=_content_swhid,
directory=CoreSWHID(
object_type=ObjectType.CONTENT,
object_id=EXAMPLE_HASH,
),
**_common_metadata_fields,
)
def test_metadata_normalize_discovery_date():
fields_copy = {**_common_metadata_fields}
truncated_date = fields_copy.pop("discovery_date")
assert truncated_date.microsecond == 0
    # Check that a wrongly-typed discovery_date raises a TypeError, even though
    # attrs_strict's type_validator was removed
with pytest.raises(TypeError):
RawExtrinsicMetadata(
target=_content_swhid, discovery_date="not a datetime", **fields_copy
)
# Check for truncation to integral second
date_with_us = truncated_date.replace(microsecond=42)
md = RawExtrinsicMetadata(
target=_content_swhid,
discovery_date=date_with_us,
**fields_copy,
)
assert md.discovery_date == truncated_date
assert md.discovery_date.tzinfo == datetime.timezone.utc
# Check that the timezone gets normalized. Timezones can be offset by a
# non-integral number of seconds, so we need to handle that.
timezone = datetime.timezone(offset=datetime.timedelta(hours=2))
date_with_tz = truncated_date.astimezone(timezone)
assert date_with_tz.tzinfo != datetime.timezone.utc
md = RawExtrinsicMetadata(
target=_content_swhid,
discovery_date=date_with_tz,
**fields_copy,
)
assert md.discovery_date == truncated_date
assert md.discovery_date.tzinfo == datetime.timezone.utc
def test_revision_repr():
from swh.model.model import RevisionType # noqa
revision = Revision.from_dict(revision_example)
rev_repr = repr(revision)
assert rev_repr == (
"Revision(message=b'Linux 4.2-rc2\\n', "
"author=Person(fullname=b'Linus Torvalds <torvalds@linux-foundation.org>', "
"name=b'Linus Torvalds', email=b'torvalds@linux-foundation.org'), "
"committer=Person(fullname=b'Linus Torvalds <torvalds@linux-foundation.org>', "
"name=b'Linus Torvalds', email=b'torvalds@linux-foundation.org'), "
"date=TimestampWithTimezone(timestamp=Timestamp(seconds=1436739030, microseconds=0), "
"offset_bytes=b'-0700'), "
"committer_date=TimestampWithTimezone(timestamp=Timestamp(seconds=1436739030, "
"microseconds=0), offset_bytes=b'-0700'), "
"type=RevisionType.GIT, "
"directory=hash_to_bytes('85a74718d377195e1efd0843ba4f3260bad4fe07'), "
"synthetic=False, metadata=None, "
"parents=(hash_to_bytes('01e2d0627a9a6edb24c37db45db5ecb31e9de808'),), "
"id=hash_to_bytes('bc0195aad0daa2ad5b0d76cce22b167bc3435590'), "
"extra_headers=(), raw_manifest=None)"
)
assert eval(rev_repr) == revision
# Copyright (C) 2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import attr
import pytest
from swh.model.model import ModelObjectType
from swh.model.tests.swh_model_data import TEST_OBJECTS
@pytest.mark.parametrize("object_type, objects", TEST_OBJECTS.items())
def test_swh_model_data(object_type, objects):
"""checks model objects in swh_model_data are in correct shape"""
assert objects
for obj in objects:
assert obj.object_type == object_type
attr.validate(obj)
@pytest.mark.parametrize(
"object_type",
(
ModelObjectType.DIRECTORY,
ModelObjectType.REVISION,
ModelObjectType.RELEASE,
ModelObjectType.SNAPSHOT,
),
)
def test_swh_model_data_hash(object_type):
for obj in TEST_OBJECTS[object_type]:
assert (
obj.compute_hash() == obj.id
), f"{obj.compute_hash().hex()} != {obj.id.hex()}"
def test_ensure_visit_status_date_consistency():
"""ensure origin-visit-status dates are more recent than their visit counterpart
The origin-visit-status dates needs to be shifted slightly in the future from their
visit dates counterpart. Otherwise, we are hitting storage-wise the "on conflict"
ignore policy (because origin-visit-add creates an origin-visit-status with the same
parameters from the origin-visit {origin, visit, date}...
"""
visits = TEST_OBJECTS[ModelObjectType.ORIGIN_VISIT]
visit_statuses = TEST_OBJECTS[ModelObjectType.ORIGIN_VISIT_STATUS]
for visit, visit_status in zip(visits, visit_statuses):
assert visit.origin == visit_status.origin
assert visit.visit == visit_status.visit
assert visit.date < visit_status.date
def test_ensure_visit_status_snapshot_consistency():
"""ensure origin-visit-status snapshots exist in the test dataset"""
snapshots = [snp.id for snp in TEST_OBJECTS[ModelObjectType.SNAPSHOT]]
for visit_status in TEST_OBJECTS[ModelObjectType.ORIGIN_VISIT_STATUS]:
if visit_status.snapshot:
assert visit_status.snapshot in snapshots
# Copyright (C) 2015-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import itertools
import attr
import pytest
from swh.model.exceptions import ValidationError
from swh.model.hashutil import hash_to_bytes as _x
from swh.model.swhids import (
SWHID_QUALIFIERS,
CoreSWHID,
ExtendedObjectType,
ExtendedSWHID,
ObjectType,
QualifiedSWHID,
)
dummy_qualifiers = {"origin": "https://example.com", "lines": "42"}
# SWHIDs that are outright invalid, no matter the context
INVALID_SWHIDS = [
"swh:1:cnt",
"swh:1:",
"swh:",
"swh:1:cnt:",
"foo:1:cnt:abc8bc9d7a6bcf6db04f476d29314f157507d505",
"swh:2:dir:def8bc9d7a6bcf6db04f476d29314f157507d505",
"swh:1:foo:fed8bc9d7a6bcf6db04f476d29314f157507d505",
"swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;invalid;malformed",
"swh:1:snp:gh6959356d30f1a4e9b7f6bca59b9a336464c03d",
"swh:1:snp:foo",
# wrong qualifier: ori should be origin
"swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;ori=something;anchor=1;visit=1;path=/", # noqa
# wrong qualifier: anc should be anchor
"swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;origin=something;anc=1;visit=1;path=/", # noqa
# wrong qualifier: vis should be visit
"swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;origin=something;anchor=1;vis=1;path=/", # noqa
# wrong qualifier: pa should be path
"swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;origin=something;anchor=1;visit=1;pa=/", # noqa
# wrong qualifier: line should be lines
"swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;line=10;origin=something;anchor=1;visit=1;path=/", # noqa
    # wrong qualifier value: it contains a space before or after
"swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;origin= https://some-url", # noqa
"swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;origin=something;anchor=some-anchor ", # noqa
"swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;origin=something;anchor=some-anchor ;visit=1", # noqa
# invalid swhid: whitespaces
"swh :1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;ori=something;anchor=1;visit=1;path=/", # noqa
"swh: 1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;ori=something;anchor=1;visit=1;path=/", # noqa
"swh: 1: dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;ori=something;anchor=1;visit=1;path=/", # noqa
"swh:1: dir: 0b6959356d30f1a4e9b7f6bca59b9a336464c03d",
"swh:1: dir: 0b6959356d30f1a4e9b7f6bca59b9a336464c03d; origin=blah",
"swh:1: dir: 0b6959356d30f1a4e9b7f6bca59b9a336464c03d;lines=12",
# other whitespaces
"swh\t:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;lines=12",
"swh:1\n:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;lines=12",
"swh:1:\rdir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;lines=12",
"swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d\f;lines=12",
"swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;lines=12\v",
]
SWHID_CLASSES = [CoreSWHID, QualifiedSWHID, ExtendedSWHID]
@pytest.mark.parametrize(
"invalid_swhid,swhid_class", itertools.product(INVALID_SWHIDS, SWHID_CLASSES)
)
def test_swhid_parsing_error(invalid_swhid, swhid_class):
"""Tests SWHID strings that are invalid for all SWHID classes do raise
a ValidationError"""
with pytest.raises(ValidationError):
swhid_class.from_string(invalid_swhid)
# string SWHIDs, and how they should be parsed by each of the classes,
# or None if the class does not support it
HASH = "94a9ed024d3859793618152ea559a168bbcbb5e2"
VALID_SWHIDS = [
(
f"swh:1:cnt:{HASH}",
CoreSWHID(
object_type=ObjectType.CONTENT,
object_id=_x(HASH),
),
QualifiedSWHID(
object_type=ObjectType.CONTENT,
object_id=_x(HASH),
),
ExtendedSWHID(
object_type=ExtendedObjectType.CONTENT,
object_id=_x(HASH),
),
),
(
f"swh:1:dir:{HASH}",
CoreSWHID(
object_type=ObjectType.DIRECTORY,
object_id=_x(HASH),
),
QualifiedSWHID(
object_type=ObjectType.DIRECTORY,
object_id=_x(HASH),
),
ExtendedSWHID(
object_type=ExtendedObjectType.DIRECTORY,
object_id=_x(HASH),
),
),
(
f"swh:1:rev:{HASH}",
CoreSWHID(
object_type=ObjectType.REVISION,
object_id=_x(HASH),
),
QualifiedSWHID(
object_type=ObjectType.REVISION,
object_id=_x(HASH),
),
ExtendedSWHID(
object_type=ExtendedObjectType.REVISION,
object_id=_x(HASH),
),
),
(
f"swh:1:rel:{HASH}",
CoreSWHID(
object_type=ObjectType.RELEASE,
object_id=_x(HASH),
),
QualifiedSWHID(
object_type=ObjectType.RELEASE,
object_id=_x(HASH),
),
ExtendedSWHID(
object_type=ExtendedObjectType.RELEASE,
object_id=_x(HASH),
),
),
(
f"swh:1:snp:{HASH}",
CoreSWHID(
object_type=ObjectType.SNAPSHOT,
object_id=_x(HASH),
),
QualifiedSWHID(
object_type=ObjectType.SNAPSHOT,
object_id=_x(HASH),
),
ExtendedSWHID(
object_type=ExtendedObjectType.SNAPSHOT,
object_id=_x(HASH),
),
),
(
f"swh:1:cnt:{HASH};origin=https://github.com/python/cpython;lines=1-18",
None, # CoreSWHID does not allow qualifiers
QualifiedSWHID(
object_type=ObjectType.CONTENT,
object_id=_x(HASH),
origin="https://github.com/python/cpython",
lines=(1, 18),
),
None, # Neither does ExtendedSWHID
),
(
f"swh:1:cnt:{HASH};origin=https://github.com/python/cpython;lines=1-18/",
None, # likewise
None,
None, # likewise
),
(
f"swh:1:cnt:{HASH};origin=https://github.com/python/cpython;lines=18",
None, # likewise
QualifiedSWHID(
object_type=ObjectType.CONTENT,
object_id=_x(HASH),
origin="https://github.com/python/cpython",
lines=(18, None),
),
None, # likewise
),
(
f"swh:1:dir:{HASH};origin=deb://Debian/packages/linuxdoc-tools",
None, # likewise
QualifiedSWHID(
object_type=ObjectType.DIRECTORY,
object_id=_x(HASH),
origin="deb://Debian/packages/linuxdoc-tools",
),
None, # likewise
),
(
f"swh:1:ori:{HASH}",
None, # CoreSWHID does not allow origin pseudo-SWHIDs
None, # Neither does QualifiedSWHID
ExtendedSWHID(
object_type=ExtendedObjectType.ORIGIN,
object_id=_x(HASH),
),
),
(
f"swh:1:emd:{HASH}",
None, # likewise for metadata pseudo-SWHIDs
None, # Neither does QualifiedSWHID
ExtendedSWHID(
object_type=ExtendedObjectType.RAW_EXTRINSIC_METADATA,
object_id=_x(HASH),
),
),
(
f"swh:1:emd:{HASH};origin=https://github.com/python/cpython",
None, # CoreSWHID does not allow metadata pseudo-SWHIDs or qualifiers
None, # QualifiedSWHID does not allow metadata pseudo-SWHIDs
None, # ExtendedSWHID does not allow qualifiers
),
]
@pytest.mark.parametrize(
"string,core,qualified,extended",
[
pytest.param(string, core, qualified, extended, id=string)
for (string, core, qualified, extended) in VALID_SWHIDS
],
)
def test_parse_unparse_swhids(string, core, qualified, extended):
"""Tests parsing and serializing valid SWHIDs with the various SWHID classes."""
classes = [CoreSWHID, QualifiedSWHID, ExtendedSWHID]
for cls, parsed_swhid in zip(classes, [core, qualified, extended]):
if parsed_swhid is None:
# This class should not accept this SWHID
with pytest.raises(ValidationError) as excinfo:
cls.from_string(string)
# Check string serialization for exception
assert str(excinfo.value) is not None
else:
            # This class should accept this SWHID
assert cls.from_string(string) == parsed_swhid
# Also check serialization
assert string == str(parsed_swhid)
@pytest.mark.parametrize(
"core,extended",
[
pytest.param(core, extended, id=string)
for (string, core, qualified, extended) in VALID_SWHIDS
if core is not None
],
)
def test_core_to_extended(core, extended):
assert core.to_extended() == extended
@pytest.mark.parametrize(
"core,qualified",
[
pytest.param(core, qualified, id=string)
for (string, core, qualified, extended) in VALID_SWHIDS
if core is not None
],
)
def test_core_to_qualified(core, qualified):
assert core.to_qualified() == qualified
@pytest.mark.parametrize(
"ns,version,type,id,qualifiers",
[
("foo", 1, ObjectType.CONTENT, "abc8bc9d7a6bcf6db04f476d29314f157507d505", {}),
("swh", 2, ObjectType.CONTENT, "def8bc9d7a6bcf6db04f476d29314f157507d505", {}),
("swh", 1, ObjectType.DIRECTORY, "aaaa", {}),
],
)
def test_QualifiedSWHID_validation_error(ns, version, type, id, qualifiers):
with pytest.raises(ValidationError):
QualifiedSWHID(
namespace=ns,
scheme_version=version,
object_type=type,
object_id=_x(id),
**qualifiers,
)
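# Triples of (object type, qualifier kwargs, expected result); the expected result
# is either the serialized SWHID string or the exception type the constructor
# should raise.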
QSWHID_EXPECTED = [
# No qualifier:
(ObjectType.CONTENT, {}, f"swh:1:cnt:{HASH}"),
# origin:
(ObjectType.CONTENT, {"origin": None}, f"swh:1:cnt:{HASH}"),
(ObjectType.CONTENT, {"origin": 42}, ValueError),
# visit:
(
ObjectType.CONTENT,
{"visit": f"swh:1:snp:{HASH}"},
f"swh:1:cnt:{HASH};visit=swh:1:snp:{HASH}",
),
(
ObjectType.CONTENT,
{"visit": CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=_x(HASH))},
f"swh:1:cnt:{HASH};visit=swh:1:snp:{HASH}",
),
(ObjectType.CONTENT, {"visit": 42}, TypeError),
(
ObjectType.CONTENT,
{"visit": f"swh:1:rel:{HASH}"},
ValidationError,
),
(
ObjectType.CONTENT,
{"visit": CoreSWHID(object_type=ObjectType.RELEASE, object_id=_x(HASH))},
ValidationError,
),
# anchor:
(
ObjectType.CONTENT,
{"anchor": f"swh:1:snp:{HASH}"},
f"swh:1:cnt:{HASH};anchor=swh:1:snp:{HASH}",
),
(
ObjectType.CONTENT,
{"anchor": CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=_x(HASH))},
f"swh:1:cnt:{HASH};anchor=swh:1:snp:{HASH}",
),
(
ObjectType.CONTENT,
{"anchor": f"swh:1:dir:{HASH}"},
f"swh:1:cnt:{HASH};anchor=swh:1:dir:{HASH}",
),
(
ObjectType.CONTENT,
{"anchor": CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=_x(HASH))},
f"swh:1:cnt:{HASH};anchor=swh:1:dir:{HASH}",
),
(ObjectType.CONTENT, {"anchor": 42}, TypeError),
(
ObjectType.CONTENT,
{"anchor": f"swh:1:cnt:{HASH}"},
ValidationError,
),
(
ObjectType.CONTENT,
{"anchor": CoreSWHID(object_type=ObjectType.CONTENT, object_id=_x(HASH))},
ValidationError,
),
# path:
(
ObjectType.CONTENT,
{"path": b"/foo"},
f"swh:1:cnt:{HASH};path=/foo",
),
(
ObjectType.CONTENT,
{"path": b"/foo;bar"},
f"swh:1:cnt:{HASH};path=/foo%3Bbar",
),
(
ObjectType.CONTENT,
{"path": "/foo"},
f"swh:1:cnt:{HASH};path=/foo",
),
(
ObjectType.CONTENT,
{"path": "/foo;bar"},
f"swh:1:cnt:{HASH};path=/foo%3Bbar",
),
(ObjectType.CONTENT, {"path": 42}, Exception),
# lines:
(
ObjectType.CONTENT,
{"lines": (42, None)},
f"swh:1:cnt:{HASH};lines=42",
),
(
ObjectType.CONTENT,
{"lines": (21, 42)},
f"swh:1:cnt:{HASH};lines=21-42",
),
(
ObjectType.CONTENT,
{"lines": 42},
TypeError,
),
(
ObjectType.CONTENT,
{"lines": (None, 42)},
ValueError,
),
(
ObjectType.CONTENT,
{"lines": ("42", None)},
ValueError,
),
]
@pytest.mark.parametrize("object_type,qualifiers,expected", QSWHID_EXPECTED)
def test_QualifiedSWHID_init(object_type, qualifiers, expected):
"""Tests validation and converters of qualifiers"""
if isinstance(expected, type):
assert issubclass(expected, Exception)
with pytest.raises(expected):
QualifiedSWHID(object_type=object_type, object_id=_x(HASH), **qualifiers)
else:
assert isinstance(expected, str)
swhid = QualifiedSWHID(
object_type=object_type, object_id=_x(HASH), **qualifiers
)
        # Check the built object has the right serialization
assert expected == str(swhid)
# Check the internal state of the object is the same as if parsed from a string
assert QualifiedSWHID.from_string(expected) == swhid
@pytest.mark.parametrize(
"object_type,qualifiers",
[
(type_, dict_)
for (type_, dict_, str_or_exc) in QSWHID_EXPECTED
if isinstance(str_or_exc, str)
],
)
def test_QualifiedSWHID_to_dict(object_type, qualifiers):
qswhid = QualifiedSWHID(object_type=object_type, object_id=_x(HASH), **qualifiers)
d = qswhid.to_dict()
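    # to_dict() flattens the object into a "swhid" entry (the core SWHID string)
    # plus one entry per set qualifier; rebuilding from those parts must give back
    # an equal object.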
swhid = CoreSWHID.from_string(d.pop("swhid"))
other = QualifiedSWHID(
object_type=swhid.object_type, object_id=swhid.object_id, **d
)
assert qswhid == other
def test_QualifiedSWHID_hash():
object_id = _x("94a9ed024d3859793618152ea559a168bbcbb5e2")
assert hash(
QualifiedSWHID(object_type=ObjectType.DIRECTORY, object_id=object_id)
) == hash(QualifiedSWHID(object_type=ObjectType.DIRECTORY, object_id=object_id))
assert hash(
QualifiedSWHID(
object_type=ObjectType.DIRECTORY,
object_id=object_id,
**dummy_qualifiers,
)
) == hash(
QualifiedSWHID(
object_type=ObjectType.DIRECTORY,
object_id=object_id,
**dummy_qualifiers,
)
)
    # Passing the qualifiers in a different keyword order must not change the hash.
assert hash(
QualifiedSWHID(
object_type=ObjectType.DIRECTORY,
object_id=object_id,
origin="https://example.com",
lines=(42, None),
)
) == hash(
QualifiedSWHID(
object_type=ObjectType.DIRECTORY,
object_id=object_id,
lines=(42, None),
origin="https://example.com",
)
)
def test_QualifiedSWHID_eq():
object_id = _x("94a9ed024d3859793618152ea559a168bbcbb5e2")
assert QualifiedSWHID(
object_type=ObjectType.DIRECTORY, object_id=object_id
) == QualifiedSWHID(object_type=ObjectType.DIRECTORY, object_id=object_id)
assert QualifiedSWHID(
object_type=ObjectType.DIRECTORY,
object_id=object_id,
**dummy_qualifiers,
) == QualifiedSWHID(
object_type=ObjectType.DIRECTORY,
object_id=object_id,
**dummy_qualifiers,
)
assert QualifiedSWHID(
object_type=ObjectType.DIRECTORY,
object_id=object_id,
**dummy_qualifiers,
) == QualifiedSWHID(
object_type=ObjectType.DIRECTORY,
object_id=object_id,
**dummy_qualifiers,
)
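# Pairs of (SWHID string, expected QualifiedSWHID), with None when parsing the
# string should raise a ValidationError.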
QUALIFIED_SWHIDS = [
# origin:
(
f"swh:1:cnt:{HASH};origin=https://github.com/python/cpython",
QualifiedSWHID(
object_type=ObjectType.CONTENT,
object_id=_x(HASH),
origin="https://github.com/python/cpython",
),
),
(
f"swh:1:cnt:{HASH};origin=https://example.org/foo%3Bbar%25baz",
QualifiedSWHID(
object_type=ObjectType.CONTENT,
object_id=_x(HASH),
origin="https://example.org/foo;bar%baz",
),
),
(
f"swh:1:cnt:{HASH};origin=https://example.org?project=test",
QualifiedSWHID(
object_type=ObjectType.CONTENT,
object_id=_x(HASH),
origin="https://example.org?project=test",
),
),
# visit:
(
f"swh:1:cnt:{HASH};visit=swh:1:snp:{HASH}",
QualifiedSWHID(
object_type=ObjectType.CONTENT,
object_id=_x(HASH),
visit=CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=_x(HASH)),
),
),
(
f"swh:1:cnt:{HASH};visit=swh:1:rel:{HASH}",
None,
),
# anchor:
(
f"swh:1:cnt:{HASH};anchor=swh:1:dir:{HASH}",
QualifiedSWHID(
object_type=ObjectType.CONTENT,
object_id=_x(HASH),
anchor=CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=_x(HASH)),
),
),
(
f"swh:1:cnt:{HASH};anchor=swh:1:rev:{HASH}",
QualifiedSWHID(
object_type=ObjectType.CONTENT,
object_id=_x(HASH),
anchor=CoreSWHID(object_type=ObjectType.REVISION, object_id=_x(HASH)),
),
),
(
f"swh:1:cnt:{HASH};anchor=swh:1:cnt:{HASH}",
None, # 'cnt' is not valid in anchor
),
(
f"swh:1:cnt:{HASH};anchor=swh:1:ori:{HASH}",
None, # 'ori' is not valid in a CoreSWHID
),
# path:
(
f"swh:1:cnt:{HASH};path=/foo",
QualifiedSWHID(
object_type=ObjectType.CONTENT, object_id=_x(HASH), path=b"/foo"
),
),
(
f"swh:1:cnt:{HASH};path=/foo%3Bbar",
QualifiedSWHID(
object_type=ObjectType.CONTENT, object_id=_x(HASH), path=b"/foo;bar"
),
),
(
f"swh:1:cnt:{HASH};path=/foo%25bar",
QualifiedSWHID(
object_type=ObjectType.CONTENT, object_id=_x(HASH), path=b"/foo%bar"
),
),
(
f"swh:1:cnt:{HASH};path=/foo/bar%3Dbaz",
QualifiedSWHID(
object_type=ObjectType.CONTENT, object_id=_x(HASH), path=b"/foo/bar=baz"
),
),
# lines
(
f"swh:1:cnt:{HASH};lines=1-18",
QualifiedSWHID(
object_type=ObjectType.CONTENT,
object_id=_x(HASH),
lines=(1, 18),
),
),
(
f"swh:1:cnt:{HASH};lines=18",
QualifiedSWHID(
object_type=ObjectType.CONTENT,
object_id=_x(HASH),
lines=(18, None),
),
),
(
f"swh:1:cnt:{HASH};lines=",
None,
),
(
f"swh:1:cnt:{HASH};lines=aa",
None,
),
(
f"swh:1:cnt:{HASH};lines=18-aa",
None,
),
]
@pytest.mark.parametrize("string,parsed", QUALIFIED_SWHIDS)
def test_QualifiedSWHID_parse_serialize_qualifiers(string, parsed):
"""Tests parsing and serializing valid SWHIDs with the various SWHID classes."""
if parsed is None:
with pytest.raises(ValidationError):
print(repr(QualifiedSWHID.from_string(string)))
else:
assert QualifiedSWHID.from_string(string) == parsed
assert str(parsed) == string
def test_QualifiedSWHID_deserialize_origin_extra_escapes():
"""Checks that semicolon in origins are escaped."""
string = f"swh:1:cnt:{HASH};origin=https://example.org/foo%3Bbar%25baz"
swhid = QualifiedSWHID(
object_type=ObjectType.CONTENT,
object_id=_x(HASH),
origin="https://example.org/foo;bar%baz",
)
assert QualifiedSWHID.from_string(string) == swhid
def test_QualifiedSWHID_attributes():
"""Checks the set of QualifiedSWHID attributes match the SWHID_QUALIFIERS
constant."""
assert set(attr.fields_dict(QualifiedSWHID)) == {
"namespace",
"scheme_version",
"object_type",
"object_id",
*SWHID_QUALIFIERS,
}
@pytest.mark.parametrize(
"ns,version,type,id",
[
("foo", 1, ObjectType.CONTENT, "abc8bc9d7a6bcf6db04f476d29314f157507d505"),
("swh", 2, ObjectType.CONTENT, "def8bc9d7a6bcf6db04f476d29314f157507d505"),
("swh", 1, ObjectType.DIRECTORY, "aaaa"),
],
)
def test_CoreSWHID_validation_error(ns, version, type, id):
with pytest.raises(ValidationError):
CoreSWHID(
namespace=ns,
scheme_version=version,
object_type=type,
object_id=_x(id),
)
def test_CoreSWHID_hash():
object_id = _x("94a9ed024d3859793618152ea559a168bbcbb5e2")
assert hash(
CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=object_id)
) == hash(CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=object_id))
assert hash(
CoreSWHID(
object_type=ObjectType.DIRECTORY,
object_id=object_id,
)
) == hash(
CoreSWHID(
object_type=ObjectType.DIRECTORY,
object_id=object_id,
)
)
    # Two separately-constructed identical SWHIDs must hash equal.
assert hash(
CoreSWHID(
object_type=ObjectType.DIRECTORY,
object_id=object_id,
)
) == hash(
CoreSWHID(
object_type=ObjectType.DIRECTORY,
object_id=object_id,
)
)
def test_CoreSWHID_eq():
object_id = _x("94a9ed024d3859793618152ea559a168bbcbb5e2")
assert CoreSWHID(
object_type=ObjectType.DIRECTORY, object_id=object_id
) == CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=object_id)
assert CoreSWHID(
object_type=ObjectType.DIRECTORY,
object_id=object_id,
) == CoreSWHID(
object_type=ObjectType.DIRECTORY,
object_id=object_id,
)
assert CoreSWHID(
object_type=ObjectType.DIRECTORY,
object_id=object_id,
) == CoreSWHID(
object_type=ObjectType.DIRECTORY,
object_id=object_id,
)
@pytest.mark.parametrize(
"ns,version,type,id",
[
(
"foo",
1,
ExtendedObjectType.CONTENT,
"abc8bc9d7a6bcf6db04f476d29314f157507d505",
),
(
"swh",
2,
ExtendedObjectType.CONTENT,
"def8bc9d7a6bcf6db04f476d29314f157507d505",
),
("swh", 1, ExtendedObjectType.DIRECTORY, "aaaa"),
],
)
def test_ExtendedSWHID_validation_error(ns, version, type, id):
with pytest.raises(ValidationError):
ExtendedSWHID(
namespace=ns,
scheme_version=version,
object_type=type,
object_id=_x(id),
)
def test_ExtendedSWHID_hash():
object_id = _x("94a9ed024d3859793618152ea559a168bbcbb5e2")
assert hash(
ExtendedSWHID(object_type=ExtendedObjectType.DIRECTORY, object_id=object_id)
) == hash(
ExtendedSWHID(object_type=ExtendedObjectType.DIRECTORY, object_id=object_id)
)
assert hash(
ExtendedSWHID(
object_type=ExtendedObjectType.DIRECTORY,
object_id=object_id,
)
) == hash(
ExtendedSWHID(
object_type=ExtendedObjectType.DIRECTORY,
object_id=object_id,
)
)
    # Two separately-constructed identical SWHIDs must hash equal.
assert hash(
ExtendedSWHID(
object_type=ExtendedObjectType.DIRECTORY,
object_id=object_id,
)
) == hash(
ExtendedSWHID(
object_type=ExtendedObjectType.DIRECTORY,
object_id=object_id,
)
)
def test_ExtendedSWHID_eq():
object_id = _x("94a9ed024d3859793618152ea559a168bbcbb5e2")
assert ExtendedSWHID(
object_type=ExtendedObjectType.DIRECTORY, object_id=object_id
) == ExtendedSWHID(object_type=ExtendedObjectType.DIRECTORY, object_id=object_id)
assert ExtendedSWHID(
object_type=ExtendedObjectType.DIRECTORY,
object_id=object_id,
) == ExtendedSWHID(
object_type=ExtendedObjectType.DIRECTORY,
object_id=object_id,
)
assert ExtendedSWHID(
object_type=ExtendedObjectType.DIRECTORY,
object_id=object_id,
) == ExtendedSWHID(
object_type=ExtendedObjectType.DIRECTORY,
object_id=object_id,
)
def test_object_types():
"""Checks ExtendedObjectType is a superset of ObjectType"""
for member in ObjectType:
assert getattr(ExtendedObjectType, member.name).value == member.value
Returns:
True if the revision log is topologically sorted.
"""
rev_by_id = {r["id"]: r for r in revision_log}
def all_parents(revision):
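        # Yield every ancestor of `revision`, following parents transitively.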
for parent in revision["parents"]:
yield parent
yield from all_parents(rev_by_id[parent])
visited = set()
for rev in revision_log:
visited.add(rev["id"])
if not all(parent in visited for parent in all_parents(rev)):
return False
return True
class TestToposort(unittest.TestCase):
def generate_log(self, graph):
for node_id, parents in graph.items():
yield {"id": node_id, "parents": tuple(parents)}
def unordered_log(self, log):
return {(d["id"], tuple(d["parents"])) for d in log}
def check(self, graph):
log = list(self.generate_log(graph))
self.assertTrue(is_toposorted_slow(toposort(log)))
def test_linked_list(self):
self.check({3: [2], 2: [1], 1: []})
def test_fork(self):
self.check({7: [6], 6: [4], 5: [3], 4: [2], 3: [2], 2: [1], 1: []})
def test_fork_merge(self):
self.check({8: [7, 5], 7: [6], 6: [4], 5: [3], 4: [2], 3: [2], 2: [1], 1: []})
def test_two_origins(self):
self.check({9: [8], 8: [7, 5], 7: [6], 6: [4], 5: [3], 4: [], 3: []})
def test_three_way(self):
self.check(
{
9: [8, 4, 2],
8: [7, 5],
7: [6],
6: [4],
5: [3],
4: [2],
3: [2],
2: [1],
1: [],
}
)