From ac006d3c2c8c953c6112edbfd6102e6d4505b35f Mon Sep 17 00:00:00 2001
From: Antoine Lambert <anlambert@softwareheritage.org>
Date: Thu, 19 Dec 2024 16:15:10 +0100
Subject: [PATCH] Use dulwich git object name constants instead of raw bytes
 literals

---
 swh/loader/git/converters.py            | 28 ++++++-------
 swh/loader/git/dumb.py                  | 15 +++----
 swh/loader/git/from_disk.py             | 28 ++++++++-----
 swh/loader/git/loader.py                |  8 ++--
 swh/loader/git/tests/test_converters.py | 54 ++++++++++++-------------
 5 files changed, 71 insertions(+), 62 deletions(-)

diff --git a/swh/loader/git/converters.py b/swh/loader/git/converters.py
index f96bc1bd..7588d6ab 100644
--- a/swh/loader/git/converters.py
+++ b/swh/loader/git/converters.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-2022  The Software Heritage developers
+# Copyright (C) 2015-2024  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -63,7 +63,7 @@ def check_id(obj: HashableObject) -> None:
 
 def dulwich_blob_to_content_id(obj: ShaFile) -> Dict[str, Any]:
     """Convert a dulwich blob to a Software Heritage content id"""
-    if obj.type_name != b"blob":
+    if obj.type_name != Blob.type_name:
         raise ValueError("Argument is not a blob.")
     blob = cast(Blob, obj)
 
@@ -81,7 +81,7 @@ def dulwich_blob_to_content_id(obj: ShaFile) -> Dict[str, Any]:
 
 def dulwich_blob_to_content(obj: ShaFile, max_content_size=None) -> BaseContent:
     """Convert a dulwich blob to a Software Heritage content"""
-    if obj.type_name != b"blob":
+    if obj.type_name != Blob.type_name:
         raise ValueError("Argument is not a blob.")
     blob = cast(Blob, obj)
 
@@ -102,7 +102,7 @@ def dulwich_blob_to_content(obj: ShaFile, max_content_size=None) -> BaseContent:
 
 def dulwich_tree_to_directory(obj: ShaFile) -> Directory:
     """Format a tree as a directory"""
-    if obj.type_name != b"tree":
+    if obj.type_name != Tree.type_name:
         raise ValueError("Argument is not a tree.")
     tree = cast(Tree, obj)
 
@@ -179,7 +179,7 @@ def dulwich_tsinfo_to_timestamp(
 
 
 def dulwich_commit_to_revision(obj: ShaFile) -> Revision:
-    if obj.type_name != b"commit":
+    if obj.type_name != Commit.type_name:
         raise ValueError("Argument is not a commit.")
     commit = cast(Commit, obj)
 
@@ -256,23 +256,23 @@ def dulwich_commit_to_revision(obj: ShaFile) -> Revision:
 
 
 DULWICH_TARGET_TYPES = {
-    b"blob": SnapshotTargetType.CONTENT,
-    b"tree": SnapshotTargetType.DIRECTORY,
-    b"commit": SnapshotTargetType.REVISION,
-    b"tag": SnapshotTargetType.RELEASE,
+    Blob.type_name: SnapshotTargetType.CONTENT,
+    Tree.type_name: SnapshotTargetType.DIRECTORY,
+    Commit.type_name: SnapshotTargetType.REVISION,
+    Tag.type_name: SnapshotTargetType.RELEASE,
 }
 
 
 DULWICH_OBJECT_TYPES = {
-    b"blob": ObjectType.CONTENT,
-    b"tree": ObjectType.DIRECTORY,
-    b"commit": ObjectType.REVISION,
-    b"tag": ObjectType.RELEASE,
+    Blob.type_name: ObjectType.CONTENT,
+    Tree.type_name: ObjectType.DIRECTORY,
+    Commit.type_name: ObjectType.REVISION,
+    Tag.type_name: ObjectType.RELEASE,
 }
 
 
 def dulwich_tag_to_release(obj: ShaFile) -> Release:
-    if obj.type_name != b"tag":
+    if obj.type_name != Tag.type_name:
         raise ValueError("Argument is not a tag.")
     tag = cast(Tag, obj)
 
diff --git a/swh/loader/git/dumb.py b/swh/loader/git/dumb.py
index 1e94f4a1..7b9f5acc 100644
--- a/swh/loader/git/dumb.py
+++ b/swh/loader/git/dumb.py
@@ -25,7 +25,7 @@ from typing import (
 import urllib.parse
 
 from dulwich.errors import NotGitRepository
-from dulwich.objects import S_IFGITLINK, Commit, ShaFile, Tree
+from dulwich.objects import S_IFGITLINK, Blob, Commit, ShaFile, Tree
 from dulwich.pack import Pack, PackData, PackIndex, load_pack_index_file
 import requests
 from tenacity.before_sleep import before_sleep_log
@@ -140,18 +140,19 @@ class GitObjectsFetcher:
             for parent in commit.parents:
                 if (
                     # commit not already seen in the current load
-                    parent not in self.objects[b"commit"]
+                    parent not in self.objects[Commit.type_name]
                     # commit not already archived by a previous load
                     and parent not in self.base_repo.local_heads
                 ):
                     commit_objects.append(cast(Commit, self._get_git_object(parent)))
-                    self.objects[b"commit"].add(parent)
+                    self.objects[Commit.type_name].add(parent)
 
     def iter_objects(self, object_type: bytes) -> Iterable[ShaFile]:
         """Returns a generator on fetched git objects per type.
 
         Args:
-            object_type: Git object type, either b"blob", b"commit", b"tag" or b"tree"
+            object_type: Git object type, either Blob.type_name, Commit.type_name,
+                Tag.type_name or Tree.type_name
 
         Returns:
             A generator fetching git objects on the fly.
@@ -256,9 +257,9 @@ class GitObjectsFetcher:
         return ShaFile.from_file(self._http_get(object_path))
 
     def _fetch_tree_objects(self, sha: bytes) -> None:
-        if sha not in self.objects[b"tree"]:
+        if sha not in self.objects[Tree.type_name]:
             tree = cast(Tree, self._get_git_object(sha))
-            self.objects[b"tree"].add(sha)
+            self.objects[Tree.type_name].add(sha)
             for item in tree.items():
                 if item.mode == S_IFGITLINK:
                     # skip submodules as objects are not stored in repository
@@ -266,4 +267,4 @@ class GitObjectsFetcher:
                 if item.mode & stat.S_IFDIR:
                     self._fetch_tree_objects(item.sha)
                 else:
-                    self.objects[b"blob"].add(item.sha)
+                    self.objects[Blob.type_name].add(item.sha)
diff --git a/swh/loader/git/from_disk.py b/swh/loader/git/from_disk.py
index cad245e7..60ffeaa5 100644
--- a/swh/loader/git/from_disk.py
+++ b/swh/loader/git/from_disk.py
@@ -1,7 +1,8 @@
-# Copyright (C) 2015-2023  The Software Heritage developers
+# Copyright (C) 2015-2024  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
+
 from collections import defaultdict
 from datetime import datetime
 import logging
@@ -12,7 +13,7 @@ from typing import Dict, Optional
 from deprecated import deprecated
 from dulwich.errors import ObjectFormatException
 import dulwich.objects
-from dulwich.objects import EmptyFileException
+from dulwich.objects import Blob, Commit, EmptyFileException, Tag, Tree
 import dulwich.repo
 
 from swh.loader.git.utils import raise_not_found_repository
@@ -221,11 +222,11 @@ class GitLoaderFromDisk(BaseGitLoader):
 
     def has_contents(self):
         """Checks whether we need to load contents"""
-        return bool(self.type_to_ids[b"blob"])
+        return bool(self.type_to_ids[Blob.type_name])
 
     def get_content_ids(self):
         """Get the content identifiers from the git repository"""
-        for oid in self.type_to_ids[b"blob"]:
+        for oid in self.type_to_ids[Blob.type_name]:
             yield converters.dulwich_blob_to_content_id(self.repo[oid])
 
     def get_contents(self):
@@ -241,11 +242,14 @@ class GitLoaderFromDisk(BaseGitLoader):
 
     def has_directories(self):
         """Checks whether we need to load directories"""
-        return bool(self.type_to_ids[b"tree"])
+        return bool(self.type_to_ids[Tree.type_name])
 
     def get_directory_ids(self):
         """Get the directory identifiers from the git repository"""
-        return (hashutil.hash_to_bytes(id.decode()) for id in self.type_to_ids[b"tree"])
+        return (
+            hashutil.hash_to_bytes(id.decode())
+            for id in self.type_to_ids[Tree.type_name]
+        )
 
     def get_directories(self):
         """Get the directories that need to be loaded"""
@@ -260,12 +264,13 @@ class GitLoaderFromDisk(BaseGitLoader):
 
     def has_revisions(self):
         """Checks whether we need to load revisions"""
-        return bool(self.type_to_ids[b"commit"])
+        return bool(self.type_to_ids[Commit.type_name])
 
     def get_revision_ids(self):
         """Get the revision identifiers from the git repository"""
         return (
-            hashutil.hash_to_bytes(id.decode()) for id in self.type_to_ids[b"commit"]
+            hashutil.hash_to_bytes(id.decode())
+            for id in self.type_to_ids[Commit.type_name]
         )
 
     def get_revisions(self):
@@ -281,11 +286,14 @@ class GitLoaderFromDisk(BaseGitLoader):
 
     def has_releases(self):
         """Checks whether we need to load releases"""
-        return bool(self.type_to_ids[b"tag"])
+        return bool(self.type_to_ids[Tag.type_name])
 
     def get_release_ids(self):
         """Get the release identifiers from the git repository"""
-        return (hashutil.hash_to_bytes(id.decode()) for id in self.type_to_ids[b"tag"])
+        return (
+            hashutil.hash_to_bytes(id.decode())
+            for id in self.type_to_ids[Tag.type_name]
+        )
 
     def get_releases(self):
         """Get the releases that need to be loaded"""
diff --git a/swh/loader/git/loader.py b/swh/loader/git/loader.py
index ec17c8b4..c58e56f5 100644
--- a/swh/loader/git/loader.py
+++ b/swh/loader/git/loader.py
@@ -611,7 +611,7 @@ class GitLoader(BaseGitLoader):
 
     def get_contents(self) -> Iterable[BaseContent]:
         """Format the blobs from the git repository as swh contents"""
-        for raw_obj in self.iter_objects(b"blob"):
+        for raw_obj in self.iter_objects(Blob.type_name):
             if raw_obj.id in self.ref_object_types:
                 self.ref_object_types[raw_obj.id] = SnapshotTargetType.CONTENT
 
@@ -621,7 +621,7 @@ class GitLoader(BaseGitLoader):
 
     def get_directories(self) -> Iterable[Directory]:
         """Format the trees as swh directories"""
-        for raw_obj in self.iter_objects(b"tree"):
+        for raw_obj in self.iter_objects(Tree.type_name):
             if raw_obj.id in self.ref_object_types:
                 self.ref_object_types[raw_obj.id] = SnapshotTargetType.DIRECTORY
 
@@ -629,7 +629,7 @@ class GitLoader(BaseGitLoader):
 
     def get_revisions(self) -> Iterable[Revision]:
         """Format commits as swh revisions"""
-        for raw_obj in self.iter_objects(b"commit"):
+        for raw_obj in self.iter_objects(Commit.type_name):
             if raw_obj.id in self.ref_object_types:
                 self.ref_object_types[raw_obj.id] = SnapshotTargetType.REVISION
 
@@ -637,7 +637,7 @@ class GitLoader(BaseGitLoader):
 
     def get_releases(self) -> Iterable[Release]:
         """Retrieve all the release objects from the git repository"""
-        for raw_obj in self.iter_objects(b"tag"):
+        for raw_obj in self.iter_objects(Tag.type_name):
             if raw_obj.id in self.ref_object_types:
                 self.ref_object_types[raw_obj.id] = SnapshotTargetType.RELEASE
 
diff --git a/swh/loader/git/tests/test_converters.py b/swh/loader/git/tests/test_converters.py
index 9bff770a..67cd1f87 100644
--- a/swh/loader/git/tests/test_converters.py
+++ b/swh/loader/git/tests/test_converters.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-2022  The Software Heritage developers
+# Copyright (C) 2015-2024  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -10,7 +10,7 @@ import shutil
 import subprocess
 import tempfile
 
-import dulwich.objects
+from dulwich.objects import Commit, Tag, Tree
 import dulwich.repo
 import pytest
 
@@ -170,7 +170,7 @@ class TestConverters:
     def test_corrupt_tree(self):
         sha1 = b"a9b41fc6347d778f16c4380b598d8083e9b4c1fb"
         target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"
-        tree = dulwich.objects.Tree()
+        tree = Tree()
         tree.add(b"file1", 0o644, target)
         assert tree.sha().hexdigest() == sha1.decode()
         converters.dulwich_tree_to_directory(tree)
@@ -195,7 +195,7 @@ class TestConverters:
             b"d\x1f\xb6\xe0\x8d\xdb.O\xd0\x96\xdc\xf1\x8e\x80\xb8\x94\xbf~%\xce"
         )
 
-        tree = dulwich.objects.Tree.from_raw_string(b"tree", raw_string)
+        tree = Tree.from_raw_string(Tree.type_name, raw_string)
 
         assert converters.dulwich_tree_to_directory(tree) == Directory(
             entries=(
@@ -232,7 +232,7 @@ class TestConverters:
             (b"tree_normal", 0o040000, "dir"),
         ]
 
-        tree = dulwich.objects.Tree()
+        tree = Tree()
         for name, mode, _ in entries:
             tree.add(name, mode, b"00" * 20)
 
@@ -257,7 +257,7 @@ class TestConverters:
             b"\x1d\xd3\xec\x83\x94+\xbc\x04\xde\xee\x7f\xc6\xbe\x8b\x9cnp=W\xf9"
         )
 
-        tree = dulwich.objects.Tree.from_raw_string(b"tree", raw_string)
+        tree = Tree.from_raw_string(Tree.type_name, raw_string)
 
         dir_ = Directory(
             entries=(
@@ -386,7 +386,7 @@ class TestConverters:
         author = Person(
             fullname=b"Foo <foo@example.org>", name=b"Foo", email=b"foo@example.org"
         )
-        commit = dulwich.objects.Commit()
+        commit = Commit()
         commit.tree = target
         commit.message = message
         commit.author = commit.committer = b"Foo <foo@example.org>"
@@ -416,7 +416,7 @@ class TestConverters:
         sha = hash_to_bytes("3f0ac5a6d15d89cf928209a57334e3b77c5651b9")
         target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"
         message = b"some commit message"
-        commit = dulwich.objects.Commit()
+        commit = Commit()
         commit.tree = target
         commit.message = message
         commit.gpgsig = GPGSIG
@@ -496,7 +496,7 @@ class TestConverters:
             b"committer Foo <foo@example.org> 1640191028 +0200\n\n"
             b"some commit message"
         )
-        commit = dulwich.objects.Commit.from_raw_string(b"commit", raw_string)
+        commit = Commit.from_raw_string(Commit.type_name, raw_string)
         date = TimestampWithTimezone(
             timestamp=Timestamp(seconds=1640191028, microseconds=0),
             offset_bytes=b"+0200",
@@ -519,7 +519,7 @@ class TestConverters:
 
         # Mess with the offset
         raw_string2 = raw_string.replace(b"+0200", b"+200")
-        commit = dulwich.objects.Commit.from_raw_string(b"commit", raw_string2)
+        commit = Commit.from_raw_string(Commit.type_name, raw_string2)
         date = TimestampWithTimezone(
             timestamp=Timestamp(seconds=1640191028, microseconds=0),
             offset_bytes=b"+200",
@@ -545,7 +545,7 @@ class TestConverters:
             b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce",
             b"641FB6E08DDB2E4FD096DCF18E80B894BF7E25CE",
         )
-        commit = dulwich.objects.Commit.from_raw_string(b"commit", raw_string2)
+        commit = Commit.from_raw_string(Commit.type_name, raw_string2)
         date = TimestampWithTimezone(
             timestamp=Timestamp(seconds=1640191028, microseconds=0),
             offset_bytes=b"+0200",
@@ -611,9 +611,9 @@ class TestConverters:
         sha = hash_to_bytes("f6e367357b446bd1315276de5e88ba3d0d99e136")
         target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"
         message = b"some release message"
-        tag = dulwich.objects.Tag()
+        tag = Tag()
         tag.name = b"blah"
-        tag.object = (dulwich.objects.Commit, target)
+        tag.object = (Commit, target)
         tag.message = message
         tag.signature = None
         tag.tagger = None
@@ -649,9 +649,9 @@ class TestConverters:
             datetime.datetime(2007, 12, 5, tzinfo=datetime.timezone.utc).timestamp()
         )
 
-        tag = dulwich.objects.Tag()
+        tag = Tag()
         tag.name = b"blah"
-        tag.object = (dulwich.objects.Commit, target)
+        tag.object = (Commit, target)
         tag.message = message
         tag.signature = None
         tag.tagger = tagger
@@ -693,9 +693,9 @@ class TestConverters:
         tagger = b"hey dude <hello@mail.org>"
         target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"
         message = b"some release message"
-        tag = dulwich.objects.Tag()
+        tag = Tag()
         tag.name = b"blah"
-        tag.object = (dulwich.objects.Commit, target)
+        tag.object = (Commit, target)
         tag.message = message
         tag.signature = None
         tag.tagger = tagger
@@ -734,9 +734,9 @@ class TestConverters:
         date = int(
             datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc).timestamp()
         )
-        tag = dulwich.objects.Tag()
+        tag = Tag()
         tag.name = b"blah"
-        tag.object = (dulwich.objects.Commit, target)
+        tag.object = (Commit, target)
         tag.message = message
         tag.signature = None
         tag.tagger = tagger
@@ -776,9 +776,9 @@ class TestConverters:
         target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"
         message = b"some release message"
         sha = hash_to_bytes("46fff489610ed733d2cc904e363070dadee05c71")
-        tag = dulwich.objects.Tag()
+        tag = Tag()
         tag.name = b"blah"
-        tag.object = (dulwich.objects.Commit, target)
+        tag.object = (Commit, target)
         tag.message = message
         tag.signature = GPGSIG
         tag.tagger = None
@@ -809,9 +809,9 @@ class TestConverters:
         sha = hash_to_bytes("46fff489610ed733d2cc904e363070dadee05c71")
         target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"
         message = b"some release message"
-        tag = dulwich.objects.Tag()
+        tag = Tag()
         tag.name = b"blah"
-        tag.object = (dulwich.objects.Commit, target)
+        tag.object = (Commit, target)
         tag.message = message
         tag.signature = GPGSIG
         tag.tagger = None
@@ -846,7 +846,7 @@ class TestConverters:
             b"tagger Foo <foo@example.org> 1640191027 +0200\n\n"
             b"some release message"
         )
-        tag = dulwich.objects.Tag.from_raw_string(b"tag", raw_string)
+        tag = Tag.from_raw_string(Tag.type_name, raw_string)
         assert converters.dulwich_tag_to_release(tag) == Release(
             name=b"blah",
             message=b"some release message",
@@ -865,7 +865,7 @@ class TestConverters:
 
         # Mess with the offset (negative UTC)
         raw_string2 = raw_string.replace(b"+0200", b"-0000")
-        tag = dulwich.objects.Tag.from_raw_string(b"tag", raw_string2)
+        tag = Tag.from_raw_string(Tag.type_name, raw_string2)
         assert converters.dulwich_tag_to_release(tag) == Release(
             name=b"blah",
             message=b"some release message",
@@ -883,7 +883,7 @@ class TestConverters:
 
         # Mess with the offset (other)
         raw_string2 = raw_string.replace(b"+0200", b"+200")
-        tag = dulwich.objects.Tag.from_raw_string(b"tag", raw_string2)
+        tag = Tag.from_raw_string(Tag.type_name, raw_string2)
         assert converters.dulwich_tag_to_release(tag) == Release(
             name=b"blah",
             message=b"some release message",
@@ -904,7 +904,7 @@ class TestConverters:
             b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce",
             b"641FB6E08DDB2E4FD096DCF18E80B894BF7E25CE",
         )
-        tag = dulwich.objects.Tag.from_raw_string(b"tag", raw_string2)
+        tag = Tag.from_raw_string(Tag.type_name, raw_string2)
         assert converters.dulwich_tag_to_release(tag) == Release(
             name=b"blah",
             message=b"some release message",
-- 
GitLab