From 8c904dc6af617d595c1295b80c3976ea1861b92f Mon Sep 17 00:00:00 2001
From: Valentin Lorentz <vlorentz@softwareheritage.org>
Date: Fri, 30 Apr 2021 12:24:00 +0200
Subject: [PATCH] identifiers: Expose git_object instead of manifest

The git_object is what will be actually useful to the vault.

It's also easier to test, because test_identifier.py has the
entire git_object in its test data.
---
 swh/model/hashutil.py               | 55 +++++++++-----------
 swh/model/identifiers.py            | 81 +++++++++++++++++------------
 swh/model/tests/test_identifiers.py | 56 +++++++++++++++-----
 3 files changed, 117 insertions(+), 75 deletions(-)

diff --git a/swh/model/hashutil.py b/swh/model/hashutil.py
index e332c23b..eaacb230 100644
--- a/swh/model/hashutil.py
+++ b/swh/model/hashutil.py
@@ -56,7 +56,7 @@ import functools
 import hashlib
 from io import BytesIO
 import os
-from typing import Callable, Dict
+from typing import Callable, Dict, Optional
 
 ALGORITHMS = set(["sha1", "sha256", "sha1_git", "blake2s256", "blake2b512"])
 """Hashing algorithms supported by this module"""
@@ -212,12 +212,10 @@ def _new_hashlib_hash(algo):
         return hashlib.new(algo)
 
 
-def _new_git_hash(base_algo, git_type, length):
-    """Initialize a digest object (as returned by python's hashlib) for the
-    requested algorithm, and feed it with the header for a git object of the
-    given type and length.
+def git_object_header(git_type: str, length: int) -> bytes:
+    """Returns the header for a git object of the given type and length.
 
-    The header for hashing a git object consists of:
+    The header of a git object consists of:
      - The type of the object (encoded in ASCII)
      - One ASCII space (\x20)
      - The length of the object (decimal encoded in ASCII)
@@ -232,15 +230,26 @@ def _new_git_hash(base_algo, git_type, length):
     Returns:
         a hashutil.hash object
     """
+    git_object_types = {
+        "blob",
+        "tree",
+        "commit",
+        "tag",
+        "snapshot",
+        "raw_extrinsic_metadata",
+        "extid",
+    }
 
-    h = _new_hashlib_hash(base_algo)
-    git_header = "%s %d\0" % (git_type, length)
-    h.update(git_header.encode("ascii"))
+    if git_type not in git_object_types:
+        raise ValueError(
+            "Unexpected git object type %s, expected one of %s"
+            % (git_type, ", ".join(sorted(git_object_types)))
+        )
 
-    return h
+    return ("%s %d\0" % (git_type, length)).encode("ascii")
 
 
-def _new_hash(algo, length=None):
+def _new_hash(algo: str, length: Optional[int] = None):
     """Initialize a digest object (as returned by python's hashlib) for
     the requested algorithm. See the constant ALGORITHMS for the list
     of supported algorithms. If a git-specific hashing algorithm is
@@ -270,7 +279,9 @@ def _new_hash(algo, length=None):
         if length is None:
             raise ValueError("Missing length for git hashing algorithm")
         base_algo = algo[:-4]
-        return _new_git_hash(base_algo, "blob", length)
+        h = _new_hashlib_hash(base_algo)
+        h.update(git_object_header("blob", length))
+        return h
 
     return _new_hashlib_hash(algo)
 
@@ -288,24 +299,8 @@ def hash_git_data(data, git_type, base_algo="sha1"):
     Raises:
         ValueError if the git_type is unexpected.
     """
-
-    git_object_types = {
-        "blob",
-        "tree",
-        "commit",
-        "tag",
-        "snapshot",
-        "raw_extrinsic_metadata",
-        "extid",
-    }
-
-    if git_type not in git_object_types:
-        raise ValueError(
-            "Unexpected git object type %s, expected one of %s"
-            % (git_type, ", ".join(sorted(git_object_types)))
-        )
-
-    h = _new_git_hash(base_algo, git_type, len(data))
+    h = _new_hashlib_hash(base_algo)
+    h.update(git_object_header(git_type, len(data)))
     h.update(data)
 
     return h.digest()
diff --git a/swh/model/identifiers.py b/swh/model/identifiers.py
index 01a9ecc2..d542178f 100644
--- a/swh/model/identifiers.py
+++ b/swh/model/identifiers.py
@@ -29,7 +29,7 @@ import attr
 from attrs_strict import type_validator
 
 from .exceptions import ValidationError
-from .hashutil import MultiHash, hash_git_data, hash_to_bytes, hash_to_hex
+from .hashutil import MultiHash, git_object_header, hash_to_bytes, hash_to_hex
 
 
 class ObjectType(enum.Enum):
@@ -241,11 +241,11 @@ def directory_identifier(directory: Dict[str, Any]) -> str:
       (Note that there is no separator between entries)
 
     """
-    manifest = directory_manifest(directory)
-    return identifier_to_str(hash_git_data(manifest, "tree"))
+    git_object = directory_git_object(directory)
+    return hashlib.new("sha1", git_object).hexdigest()
 
 
-def directory_manifest(directory: Dict[str, Any]) -> bytes:
+def directory_git_object(directory: Dict[str, Any]) -> bytes:
     components = []
 
     for entry in sorted(directory["entries"], key=directory_entry_sort_key):
@@ -259,7 +259,7 @@ def directory_manifest(directory: Dict[str, Any]) -> bytes:
             ]
         )
 
-    return b"".join(components)
+    return format_git_object_from_parts("tree", components)
 
 
 def format_date(date):
@@ -416,12 +416,15 @@ def format_author(author):
     return b" ".join(ret)
 
 
-def format_manifest(
-    headers: Iterable[Tuple[bytes, bytes]], message: Optional[bytes] = None,
+def format_git_object_from_headers(
+    git_type: str,
+    headers: Iterable[Tuple[bytes, bytes]],
+    message: Optional[bytes] = None,
 ) -> bytes:
-    """Format a manifest comprised of a sequence of `headers` and an optional `message`.
+    """Format a git_object comprised of a git header and a manifest,
+    which is itself a sequence of `headers`, and an optional `message`.
 
-    The manifest format, compatible with the git format for tag and commit
+    The git_object format, compatible with the git format for tag and commit
     objects, is as follows:
 
       - for each `key`, `value` in `headers`, emit:
@@ -441,7 +444,7 @@ def format_manifest(
       message: an optional message used to trail the manifest.
 
     Returns:
-      the formatted manifest as bytes
+      the formatted git_object as bytes
     """
     entries: List[bytes] = []
 
@@ -451,7 +454,19 @@ def format_manifest(
     if message is not None:
         entries.extend((b"\n", message))
 
-    return b"".join(entries)
+    concatenated_entries = b"".join(entries)
+
+    header = git_object_header(git_type, len(concatenated_entries))
+    return header + concatenated_entries
+
+
+def format_git_object_from_parts(git_type: str, parts: Iterable[bytes]) -> bytes:
+    """Similar to :func:`format_git_object_from_headers`, but for manifests made of
+    a flat list of entries, instead of key-value + message, ie. trees and snapshots."""
+    concatenated_parts = b"".join(parts)
+
+    header = git_object_header(git_type, len(concatenated_parts))
+    return header + concatenated_parts
 
 
 def format_author_data(author, date_offset) -> bytes:
@@ -550,12 +565,12 @@ def revision_identifier(revision: Dict[str, Any]) -> str:
     type.
 
     """
-    manifest = revision_manifest(revision)
-    return identifier_to_str(hash_git_data(manifest, "commit"))
+    git_object = revision_git_object(revision)
+    return hashlib.new("sha1", git_object).hexdigest()
 
 
-def revision_manifest(revision: Dict[str, Any]) -> bytes:
-    """Formats the manifest of a revision. See :func:`revision_identifier` for details
+def revision_git_object(revision: Dict[str, Any]) -> bytes:
+    """Formats the git_object of a revision. See :func:`revision_identifier` for details
     on the format."""
     headers = [(b"tree", identifier_to_str(revision["directory"]).encode())]
     for parent in revision["parents"]:
@@ -580,7 +595,7 @@ def revision_manifest(revision: Dict[str, Any]) -> bytes:
 
     headers.extend(extra_headers)
 
-    return format_manifest(headers, revision["message"])
+    return format_git_object_from_headers("commit", headers, revision["message"])
 
 
 def target_type_to_git(target_type: str) -> bytes:
@@ -596,11 +611,11 @@ def target_type_to_git(target_type: str) -> bytes:
 
 def release_identifier(release: Dict[str, Any]) -> str:
     """Return the intrinsic identifier for a release."""
-    manifest = release_manifest(release)
-    return identifier_to_str(hash_git_data(manifest, "tag"))
+    git_object = release_git_object(release)
+    return hashlib.new("sha1", git_object).hexdigest()
 
 
-def release_manifest(release: Dict[str, Any]) -> bytes:
+def release_git_object(release: Dict[str, Any]) -> bytes:
     headers = [
         (b"object", identifier_to_str(release["target"]).encode()),
         (b"type", target_type_to_git(release["target_type"])),
@@ -612,7 +627,7 @@ def release_manifest(release: Dict[str, Any]) -> bytes:
             (b"tagger", format_author_data(release["author"], release["date"]))
         )
 
-    return format_manifest(headers, release["message"])
+    return format_git_object_from_headers("tag", headers, release["message"])
 
 
 def snapshot_identifier(
@@ -672,14 +687,14 @@ def snapshot_identifier(
       str: the intrinsic identifier for `snapshot`
 
     """
-    manifest = snapshot_manifest(snapshot, ignore_unresolved=ignore_unresolved)
-    return identifier_to_str(hash_git_data(manifest, "snapshot"))
+    git_object = snapshot_git_object(snapshot, ignore_unresolved=ignore_unresolved)
+    return hashlib.new("sha1", git_object).hexdigest()
 
 
-def snapshot_manifest(
+def snapshot_git_object(
     snapshot: Dict[str, Any], *, ignore_unresolved: bool = False
 ) -> bytes:
-    """Formats the manifest of a revision. See :func:`snapshot_identifier` for details
+    """Formats the git_object of a revision. See :func:`snapshot_identifier` for details
     on the format."""
     unresolved = []
     lines = []
@@ -715,7 +730,7 @@ def snapshot_manifest(
             unresolved,
         )
 
-    return b"".join(lines)
+    return format_git_object_from_parts("snapshot", lines)
 
 
 def origin_identifier(origin):
@@ -773,12 +788,12 @@ def raw_extrinsic_metadata_identifier(metadata: Dict[str, Any]) -> str:
       str: the intrinsic identifier for ``metadata``
 
     """
-    manifest = raw_extrinsic_metadata_manifest(metadata)
-    return identifier_to_str(hash_git_data(manifest, "raw_extrinsic_metadata"))
+    git_object = raw_extrinsic_metadata_git_object(metadata)
+    return hashlib.new("sha1", git_object).hexdigest()
 
 
-def raw_extrinsic_metadata_manifest(metadata: Dict[str, Any]) -> bytes:
-    """Formats the manifest of a raw_extrinsic_metadata object.
+def raw_extrinsic_metadata_git_object(metadata: Dict[str, Any]) -> bytes:
+    """Formats the git_object of a raw_extrinsic_metadata object.
     See :func:`raw_extrinsic_metadata_identifier` for details
     on the format."""
     # equivalent to using math.floor(dt.timestamp()) to round down,
@@ -827,7 +842,9 @@ def raw_extrinsic_metadata_manifest(metadata: Dict[str, Any]) -> bytes:
 
             headers.append((key.encode("ascii"), value))
 
-    return format_manifest(headers, metadata["metadata"])
+    return format_git_object_from_headers(
+        "raw_extrinsic_metadata", headers, metadata["metadata"]
+    )
 
 
 def extid_identifier(extid: Dict[str, Any]) -> str:
@@ -858,8 +875,8 @@ def extid_identifier(extid: Dict[str, Any]) -> str:
         (b"target", str(extid["target"]).encode("ascii")),
     ]
 
-    manifest = format_manifest(headers)
-    return identifier_to_str(hash_git_data(manifest, "extid"))
+    git_object = format_git_object_from_headers("extid", headers)
+    return hashlib.new("sha1", git_object).hexdigest()
 
 
 # type of the "object_type" attribute of the SWHID class; either
diff --git a/swh/model/tests/test_identifiers.py b/swh/model/tests/test_identifiers.py
index 42934ba8..682a3ef2 100644
--- a/swh/model/tests/test_identifiers.py
+++ b/swh/model/tests/test_identifiers.py
@@ -805,7 +805,7 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase):
         }
 
     def test_minimal(self):
-        manifest = (
+        git_object = (
             b"raw_extrinsic_metadata 210\0"
             b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n"
             b"discovery_date 1611574071\n"
@@ -816,9 +816,12 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase):
             b'{"foo": "bar"}'
         )
 
+        self.assertEqual(
+            identifiers.raw_extrinsic_metadata_git_object(self.minimal), git_object,
+        )
         self.assertEqual(
             identifiers.raw_extrinsic_metadata_identifier(self.minimal),
-            hashlib.sha1(manifest).hexdigest(),
+            hashlib.sha1(git_object).hexdigest(),
         )
         self.assertEqual(
             identifiers.raw_extrinsic_metadata_identifier(self.minimal),
@@ -826,7 +829,7 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase):
         )
 
     def test_maximal(self):
-        manifest = (
+        git_object = (
             b"raw_extrinsic_metadata 533\0"
             b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n"
             b"discovery_date 1611574071\n"
@@ -844,9 +847,12 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase):
             b'{"foo": "bar"}'
         )
 
+        self.assertEqual(
+            identifiers.raw_extrinsic_metadata_git_object(self.maximal), git_object,
+        )
         self.assertEqual(
             identifiers.raw_extrinsic_metadata_identifier(self.maximal),
-            hashlib.sha1(manifest).hexdigest(),
+            hashlib.sha1(git_object).hexdigest(),
         )
         self.assertEqual(
             identifiers.raw_extrinsic_metadata_identifier(self.maximal),
@@ -858,7 +864,7 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase):
             **self.minimal,
             "path": b"/ab\nc/d\xf0\x9f\xa4\xb7e\x00f",
         }
-        manifest = (
+        git_object = (
             b"raw_extrinsic_metadata 231\0"
             b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n"
             b"discovery_date 1611574071\n"
@@ -871,9 +877,12 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase):
             b'{"foo": "bar"}'
         )
 
+        self.assertEqual(
+            identifiers.raw_extrinsic_metadata_git_object(metadata), git_object,
+        )
         self.assertEqual(
             identifiers.raw_extrinsic_metadata_identifier(metadata),
-            hashlib.sha1(manifest).hexdigest(),
+            hashlib.sha1(git_object).hexdigest(),
         )
         self.assertEqual(
             identifiers.raw_extrinsic_metadata_identifier(metadata),
@@ -882,7 +891,7 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase):
 
     def test_timezone_insensitive(self):
         """Checks the timezone of the datetime.datetime does not affect the
-        hashed manifest."""
+        hashed git_object."""
         utc_plus_one = datetime.timezone(datetime.timedelta(hours=1))
         metadata = {
             **self.minimal,
@@ -891,6 +900,10 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase):
             ),
         }
 
+        self.assertEqual(
+            identifiers.raw_extrinsic_metadata_git_object(self.minimal),
+            identifiers.raw_extrinsic_metadata_git_object(metadata),
+        )
         self.assertEqual(
             identifiers.raw_extrinsic_metadata_identifier(self.minimal),
             identifiers.raw_extrinsic_metadata_identifier(metadata),
@@ -910,6 +923,10 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase):
             ),
         }
 
+        self.assertEqual(
+            identifiers.raw_extrinsic_metadata_git_object(self.minimal),
+            identifiers.raw_extrinsic_metadata_git_object(metadata),
+        )
         self.assertEqual(
             identifiers.raw_extrinsic_metadata_identifier(self.minimal),
             identifiers.raw_extrinsic_metadata_identifier(metadata),
@@ -930,6 +947,10 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase):
             ),
         }
 
+        self.assertEqual(
+            identifiers.raw_extrinsic_metadata_git_object(self.minimal),
+            identifiers.raw_extrinsic_metadata_git_object(metadata),
+        )
         self.assertEqual(
             identifiers.raw_extrinsic_metadata_identifier(self.minimal),
             identifiers.raw_extrinsic_metadata_identifier(metadata),
@@ -947,7 +968,7 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase):
             ),
         }
 
-        manifest = (
+        git_object = (
             b"raw_extrinsic_metadata 210\0"
             b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n"
             b"discovery_date -313504329\n"
@@ -958,9 +979,12 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase):
             b'{"foo": "bar"}'
         )
 
+        self.assertEqual(
+            identifiers.raw_extrinsic_metadata_git_object(metadata), git_object,
+        )
         self.assertEqual(
             identifiers.raw_extrinsic_metadata_identifier(metadata),
-            hashlib.sha1(manifest).hexdigest(),
+            hashlib.sha1(git_object).hexdigest(),
         )
         self.assertEqual(
             identifiers.raw_extrinsic_metadata_identifier(metadata),
@@ -975,7 +999,7 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase):
             ),
         }
 
-        manifest = (
+        git_object = (
             b"raw_extrinsic_metadata 201\0"
             b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n"
             b"discovery_date 0\n"
@@ -986,9 +1010,12 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase):
             b'{"foo": "bar"}'
         )
 
+        self.assertEqual(
+            identifiers.raw_extrinsic_metadata_git_object(metadata), git_object,
+        )
         self.assertEqual(
             identifiers.raw_extrinsic_metadata_identifier(metadata),
-            hashlib.sha1(manifest).hexdigest(),
+            hashlib.sha1(git_object).hexdigest(),
         )
         self.assertEqual(
             identifiers.raw_extrinsic_metadata_identifier(metadata),
@@ -1003,7 +1030,7 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase):
             ),
         }
 
-        manifest = (
+        git_object = (
             b"raw_extrinsic_metadata 202\0"
             b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n"
             b"discovery_date -1\n"
@@ -1014,9 +1041,12 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase):
             b'{"foo": "bar"}'
         )
 
+        self.assertEqual(
+            identifiers.raw_extrinsic_metadata_git_object(metadata), git_object,
+        )
         self.assertEqual(
             identifiers.raw_extrinsic_metadata_identifier(metadata),
-            hashlib.sha1(manifest).hexdigest(),
+            hashlib.sha1(git_object).hexdigest(),
         )
         self.assertEqual(
             identifiers.raw_extrinsic_metadata_identifier(metadata),
-- 
GitLab