From ac006d3c2c8c953c6112edbfd6102e6d4505b35f Mon Sep 17 00:00:00 2001 From: Antoine Lambert <anlambert@softwareheritage.org> Date: Thu, 19 Dec 2024 16:15:10 +0100 Subject: [PATCH] Use dulwich git object name constants instead of raw bytes literals --- swh/loader/git/converters.py | 28 ++++++------- swh/loader/git/dumb.py | 15 +++---- swh/loader/git/from_disk.py | 28 ++++++++----- swh/loader/git/loader.py | 8 ++-- swh/loader/git/tests/test_converters.py | 54 ++++++++++++------------- 5 files changed, 71 insertions(+), 62 deletions(-) diff --git a/swh/loader/git/converters.py b/swh/loader/git/converters.py index f96bc1bd..7588d6ab 100644 --- a/swh/loader/git/converters.py +++ b/swh/loader/git/converters.py @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2022 The Software Heritage developers +# Copyright (C) 2015-2024 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -63,7 +63,7 @@ def check_id(obj: HashableObject) -> None: def dulwich_blob_to_content_id(obj: ShaFile) -> Dict[str, Any]: """Convert a dulwich blob to a Software Heritage content id""" - if obj.type_name != b"blob": + if obj.type_name != Blob.type_name: raise ValueError("Argument is not a blob.") blob = cast(Blob, obj) @@ -81,7 +81,7 @@ def dulwich_blob_to_content_id(obj: ShaFile) -> Dict[str, Any]: def dulwich_blob_to_content(obj: ShaFile, max_content_size=None) -> BaseContent: """Convert a dulwich blob to a Software Heritage content""" - if obj.type_name != b"blob": + if obj.type_name != Blob.type_name: raise ValueError("Argument is not a blob.") blob = cast(Blob, obj) @@ -102,7 +102,7 @@ def dulwich_blob_to_content(obj: ShaFile, max_content_size=None) -> BaseContent: def dulwich_tree_to_directory(obj: ShaFile) -> Directory: """Format a tree as a directory""" - if obj.type_name != b"tree": + if obj.type_name != Tree.type_name: raise ValueError("Argument is not a tree.") tree = cast(Tree, obj) @@ -179,7 +179,7 @@ def dulwich_tsinfo_to_timestamp( def dulwich_commit_to_revision(obj: ShaFile) -> Revision: - if obj.type_name != b"commit": + if obj.type_name != Commit.type_name: raise ValueError("Argument is not a commit.") commit = cast(Commit, obj) @@ -256,23 +256,23 @@ def dulwich_commit_to_revision(obj: ShaFile) -> Revision: DULWICH_TARGET_TYPES = { - b"blob": SnapshotTargetType.CONTENT, - b"tree": SnapshotTargetType.DIRECTORY, - b"commit": SnapshotTargetType.REVISION, - b"tag": SnapshotTargetType.RELEASE, + Blob.type_name: SnapshotTargetType.CONTENT, + Tree.type_name: SnapshotTargetType.DIRECTORY, + Commit.type_name: SnapshotTargetType.REVISION, + Tag.type_name: SnapshotTargetType.RELEASE, } DULWICH_OBJECT_TYPES = { - b"blob": ObjectType.CONTENT, - b"tree": ObjectType.DIRECTORY, - b"commit": ObjectType.REVISION, - b"tag": ObjectType.RELEASE, + Blob.type_name: ObjectType.CONTENT, + Tree.type_name: ObjectType.DIRECTORY, + Commit.type_name: ObjectType.REVISION, + Tag.type_name: ObjectType.RELEASE, } def dulwich_tag_to_release(obj: ShaFile) -> Release: - if obj.type_name != b"tag": + if obj.type_name != Tag.type_name: raise ValueError("Argument is not a tag.") tag = cast(Tag, obj) diff --git a/swh/loader/git/dumb.py b/swh/loader/git/dumb.py index 1e94f4a1..7b9f5acc 100644 --- a/swh/loader/git/dumb.py +++ b/swh/loader/git/dumb.py @@ -25,7 +25,7 @@ from typing import ( import urllib.parse from dulwich.errors import NotGitRepository -from dulwich.objects import S_IFGITLINK, Commit, ShaFile, Tree +from dulwich.objects import S_IFGITLINK, Blob, Commit, ShaFile, Tree from dulwich.pack import Pack, PackData, PackIndex, load_pack_index_file import requests from tenacity.before_sleep import before_sleep_log @@ -140,18 +140,19 @@ class GitObjectsFetcher: for parent in commit.parents: if ( # commit not already seen in the current load - parent not in self.objects[b"commit"] + parent not in self.objects[Commit.type_name] # commit not already archived by a previous load and parent not in self.base_repo.local_heads ): commit_objects.append(cast(Commit, self._get_git_object(parent))) - self.objects[b"commit"].add(parent) + self.objects[Commit.type_name].add(parent) def iter_objects(self, object_type: bytes) -> Iterable[ShaFile]: """Returns a generator on fetched git objects per type. Args: - object_type: Git object type, either b"blob", b"commit", b"tag" or b"tree" + object_type: Git object type, either Blob.type_name, Commit.type_name, + Tag.type_name or Tree.type_name Returns: A generator fetching git objects on the fly. @@ -256,9 +257,9 @@ class GitObjectsFetcher: return ShaFile.from_file(self._http_get(object_path)) def _fetch_tree_objects(self, sha: bytes) -> None: - if sha not in self.objects[b"tree"]: + if sha not in self.objects[Tree.type_name]: tree = cast(Tree, self._get_git_object(sha)) - self.objects[b"tree"].add(sha) + self.objects[Tree.type_name].add(sha) for item in tree.items(): if item.mode == S_IFGITLINK: # skip submodules as objects are not stored in repository @@ -266,4 +267,4 @@ class GitObjectsFetcher: if item.mode & stat.S_IFDIR: self._fetch_tree_objects(item.sha) else: - self.objects[b"blob"].add(item.sha) + self.objects[Blob.type_name].add(item.sha) diff --git a/swh/loader/git/from_disk.py b/swh/loader/git/from_disk.py index cad245e7..60ffeaa5 100644 --- a/swh/loader/git/from_disk.py +++ b/swh/loader/git/from_disk.py @@ -1,7 +1,8 @@ -# Copyright (C) 2015-2023 The Software Heritage developers +# Copyright (C) 2015-2024 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information + from collections import defaultdict from datetime import datetime import logging @@ -12,7 +13,7 @@ from typing import Dict, Optional from deprecated import deprecated from dulwich.errors import ObjectFormatException import dulwich.objects -from dulwich.objects import EmptyFileException +from dulwich.objects import Blob, Commit, EmptyFileException, Tag, Tree import dulwich.repo from swh.loader.git.utils import raise_not_found_repository @@ -221,11 +222,11 @@ class GitLoaderFromDisk(BaseGitLoader): def has_contents(self): """Checks whether we need to load contents""" - return bool(self.type_to_ids[b"blob"]) + return bool(self.type_to_ids[Blob.type_name]) def get_content_ids(self): """Get the content identifiers from the git repository""" - for oid in self.type_to_ids[b"blob"]: + for oid in self.type_to_ids[Blob.type_name]: yield converters.dulwich_blob_to_content_id(self.repo[oid]) def get_contents(self): @@ -241,11 +242,14 @@ class GitLoaderFromDisk(BaseGitLoader): def has_directories(self): """Checks whether we need to load directories""" - return bool(self.type_to_ids[b"tree"]) + return bool(self.type_to_ids[Tree.type_name]) def get_directory_ids(self): """Get the directory identifiers from the git repository""" - return (hashutil.hash_to_bytes(id.decode()) for id in self.type_to_ids[b"tree"]) + return ( + hashutil.hash_to_bytes(id.decode()) + for id in self.type_to_ids[Tree.type_name] + ) def get_directories(self): """Get the directories that need to be loaded""" @@ -260,12 +264,13 @@ class GitLoaderFromDisk(BaseGitLoader): def has_revisions(self): """Checks whether we need to load revisions""" - return bool(self.type_to_ids[b"commit"]) + return bool(self.type_to_ids[Commit.type_name]) def get_revision_ids(self): """Get the revision identifiers from the git repository""" return ( - hashutil.hash_to_bytes(id.decode()) for id in self.type_to_ids[b"commit"] + hashutil.hash_to_bytes(id.decode()) + for id in self.type_to_ids[Commit.type_name] ) def get_revisions(self): @@ -281,11 +286,14 @@ class GitLoaderFromDisk(BaseGitLoader): def has_releases(self): """Checks whether we need to load releases""" - return bool(self.type_to_ids[b"tag"]) + return bool(self.type_to_ids[Tag.type_name]) def get_release_ids(self): """Get the release identifiers from the git repository""" - return (hashutil.hash_to_bytes(id.decode()) for id in self.type_to_ids[b"tag"]) + return ( + hashutil.hash_to_bytes(id.decode()) + for id in self.type_to_ids[Tag.type_name] + ) def get_releases(self): """Get the releases that need to be loaded""" diff --git a/swh/loader/git/loader.py b/swh/loader/git/loader.py index ec17c8b4..c58e56f5 100644 --- a/swh/loader/git/loader.py +++ b/swh/loader/git/loader.py @@ -611,7 +611,7 @@ class GitLoader(BaseGitLoader): def get_contents(self) -> Iterable[BaseContent]: """Format the blobs from the git repository as swh contents""" - for raw_obj in self.iter_objects(b"blob"): + for raw_obj in self.iter_objects(Blob.type_name): if raw_obj.id in self.ref_object_types: self.ref_object_types[raw_obj.id] = SnapshotTargetType.CONTENT @@ -621,7 +621,7 @@ class GitLoader(BaseGitLoader): def get_directories(self) -> Iterable[Directory]: """Format the trees as swh directories""" - for raw_obj in self.iter_objects(b"tree"): + for raw_obj in self.iter_objects(Tree.type_name): if raw_obj.id in self.ref_object_types: self.ref_object_types[raw_obj.id] = SnapshotTargetType.DIRECTORY @@ -629,7 +629,7 @@ class GitLoader(BaseGitLoader): def get_revisions(self) -> Iterable[Revision]: """Format commits as swh revisions""" - for raw_obj in self.iter_objects(b"commit"): + for raw_obj in self.iter_objects(Commit.type_name): if raw_obj.id in self.ref_object_types: self.ref_object_types[raw_obj.id] = SnapshotTargetType.REVISION @@ -637,7 +637,7 @@ class GitLoader(BaseGitLoader): def get_releases(self) -> Iterable[Release]: """Retrieve all the release objects from the git repository""" - for raw_obj in self.iter_objects(b"tag"): + for raw_obj in self.iter_objects(Tag.type_name): if raw_obj.id in self.ref_object_types: self.ref_object_types[raw_obj.id] = SnapshotTargetType.RELEASE diff --git a/swh/loader/git/tests/test_converters.py b/swh/loader/git/tests/test_converters.py index 9bff770a..67cd1f87 100644 --- a/swh/loader/git/tests/test_converters.py +++ b/swh/loader/git/tests/test_converters.py @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2022 The Software Heritage developers +# Copyright (C) 2015-2024 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -10,7 +10,7 @@ import shutil import subprocess import tempfile -import dulwich.objects +from dulwich.objects import Commit, Tag, Tree import dulwich.repo import pytest @@ -170,7 +170,7 @@ class TestConverters: def test_corrupt_tree(self): sha1 = b"a9b41fc6347d778f16c4380b598d8083e9b4c1fb" target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce" - tree = dulwich.objects.Tree() + tree = Tree() tree.add(b"file1", 0o644, target) assert tree.sha().hexdigest() == sha1.decode() converters.dulwich_tree_to_directory(tree) @@ -195,7 +195,7 @@ class TestConverters: b"d\x1f\xb6\xe0\x8d\xdb.O\xd0\x96\xdc\xf1\x8e\x80\xb8\x94\xbf~%\xce" ) - tree = dulwich.objects.Tree.from_raw_string(b"tree", raw_string) + tree = Tree.from_raw_string(Tree.type_name, raw_string) assert converters.dulwich_tree_to_directory(tree) == Directory( entries=( @@ -232,7 +232,7 @@ class TestConverters: (b"tree_normal", 0o040000, "dir"), ] - tree = dulwich.objects.Tree() + tree = Tree() for name, mode, _ in entries: tree.add(name, mode, b"00" * 20) @@ -257,7 +257,7 @@ class TestConverters: b"\x1d\xd3\xec\x83\x94+\xbc\x04\xde\xee\x7f\xc6\xbe\x8b\x9cnp=W\xf9" ) - tree = dulwich.objects.Tree.from_raw_string(b"tree", raw_string) + tree = Tree.from_raw_string(Tree.type_name, raw_string) dir_ = Directory( entries=( @@ -386,7 +386,7 @@ class TestConverters: author = Person( fullname=b"Foo <foo@example.org>", name=b"Foo", email=b"foo@example.org" ) - commit = dulwich.objects.Commit() + commit = Commit() commit.tree = target commit.message = message commit.author = commit.committer = b"Foo <foo@example.org>" @@ -416,7 +416,7 @@ class TestConverters: sha = hash_to_bytes("3f0ac5a6d15d89cf928209a57334e3b77c5651b9") target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce" message = b"some commit message" - commit = dulwich.objects.Commit() + commit = Commit() commit.tree = target commit.message = message commit.gpgsig = GPGSIG @@ -496,7 +496,7 @@ class TestConverters: b"committer Foo <foo@example.org> 1640191028 +0200\n\n" b"some commit message" ) - commit = dulwich.objects.Commit.from_raw_string(b"commit", raw_string) + commit = Commit.from_raw_string(Commit.type_name, raw_string) date = TimestampWithTimezone( timestamp=Timestamp(seconds=1640191028, microseconds=0), offset_bytes=b"+0200", @@ -519,7 +519,7 @@ class TestConverters: # Mess with the offset raw_string2 = raw_string.replace(b"+0200", b"+200") - commit = dulwich.objects.Commit.from_raw_string(b"commit", raw_string2) + commit = Commit.from_raw_string(Commit.type_name, raw_string2) date = TimestampWithTimezone( timestamp=Timestamp(seconds=1640191028, microseconds=0), offset_bytes=b"+200", @@ -545,7 +545,7 @@ class TestConverters: b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce", b"641FB6E08DDB2E4FD096DCF18E80B894BF7E25CE", ) - commit = dulwich.objects.Commit.from_raw_string(b"commit", raw_string2) + commit = Commit.from_raw_string(Commit.type_name, raw_string2) date = TimestampWithTimezone( timestamp=Timestamp(seconds=1640191028, microseconds=0), offset_bytes=b"+0200", @@ -611,9 +611,9 @@ class TestConverters: sha = hash_to_bytes("f6e367357b446bd1315276de5e88ba3d0d99e136") target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce" message = b"some release message" - tag = dulwich.objects.Tag() + tag = Tag() tag.name = b"blah" - tag.object = (dulwich.objects.Commit, target) + tag.object = (Commit, target) tag.message = message tag.signature = None tag.tagger = None @@ -649,9 +649,9 @@ class TestConverters: datetime.datetime(2007, 12, 5, tzinfo=datetime.timezone.utc).timestamp() ) - tag = dulwich.objects.Tag() + tag = Tag() tag.name = b"blah" - tag.object = (dulwich.objects.Commit, target) + tag.object = (Commit, target) tag.message = message tag.signature = None tag.tagger = tagger @@ -693,9 +693,9 @@ class TestConverters: tagger = b"hey dude <hello@mail.org>" target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce" message = b"some release message" - tag = dulwich.objects.Tag() + tag = Tag() tag.name = b"blah" - tag.object = (dulwich.objects.Commit, target) + tag.object = (Commit, target) tag.message = message tag.signature = None tag.tagger = tagger @@ -734,9 +734,9 @@ class TestConverters: date = int( datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc).timestamp() ) - tag = dulwich.objects.Tag() + tag = Tag() tag.name = b"blah" - tag.object = (dulwich.objects.Commit, target) + tag.object = (Commit, target) tag.message = message tag.signature = None tag.tagger = tagger @@ -776,9 +776,9 @@ class TestConverters: target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce" message = b"some release message" sha = hash_to_bytes("46fff489610ed733d2cc904e363070dadee05c71") - tag = dulwich.objects.Tag() + tag = Tag() tag.name = b"blah" - tag.object = (dulwich.objects.Commit, target) + tag.object = (Commit, target) tag.message = message tag.signature = GPGSIG tag.tagger = None @@ -809,9 +809,9 @@ class TestConverters: sha = hash_to_bytes("46fff489610ed733d2cc904e363070dadee05c71") target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce" message = b"some release message" - tag = dulwich.objects.Tag() + tag = Tag() tag.name = b"blah" - tag.object = (dulwich.objects.Commit, target) + tag.object = (Commit, target) tag.message = message tag.signature = GPGSIG tag.tagger = None @@ -846,7 +846,7 @@ class TestConverters: b"tagger Foo <foo@example.org> 1640191027 +0200\n\n" b"some release message" ) - tag = dulwich.objects.Tag.from_raw_string(b"tag", raw_string) + tag = Tag.from_raw_string(Tag.type_name, raw_string) assert converters.dulwich_tag_to_release(tag) == Release( name=b"blah", message=b"some release message", @@ -865,7 +865,7 @@ class TestConverters: # Mess with the offset (negative UTC) raw_string2 = raw_string.replace(b"+0200", b"-0000") - tag = dulwich.objects.Tag.from_raw_string(b"tag", raw_string2) + tag = Tag.from_raw_string(Tag.type_name, raw_string2) assert converters.dulwich_tag_to_release(tag) == Release( name=b"blah", message=b"some release message", @@ -883,7 +883,7 @@ class TestConverters: # Mess with the offset (other) raw_string2 = raw_string.replace(b"+0200", b"+200") - tag = dulwich.objects.Tag.from_raw_string(b"tag", raw_string2) + tag = Tag.from_raw_string(Tag.type_name, raw_string2) assert converters.dulwich_tag_to_release(tag) == Release( name=b"blah", message=b"some release message", @@ -904,7 +904,7 @@ class TestConverters: b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce", b"641FB6E08DDB2E4FD096DCF18E80B894BF7E25CE", ) - tag = dulwich.objects.Tag.from_raw_string(b"tag", raw_string2) + tag = Tag.from_raw_string(Tag.type_name, raw_string2) assert converters.dulwich_tag_to_release(tag) == Release( name=b"blah", message=b"some release message", -- GitLab