From 37364c24b50911f107cfde7b3a283dd52e1a76c9 Mon Sep 17 00:00:00 2001
From: Antoine Lambert <anlambert@softwareheritage.org>
Date: Mon, 6 Dec 2021 17:38:59 +0100
Subject: [PATCH] hashutil: Add support for md5 sum

Enable computing md5 sums through the hashutil.MultiHash class.

Nevertheless, md5 is not added to the DEFAULT_ALGORITHMS set and must
be explicitly requested by client code.

Related to T2400
---
 swh/model/hashutil.py            |  2 +-
 swh/model/tests/test_hashutil.py | 34 ++++++++++++++++++++++++++++++--
 2 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/swh/model/hashutil.py b/swh/model/hashutil.py
index eaacb230..86ecc6f0 100644
--- a/swh/model/hashutil.py
+++ b/swh/model/hashutil.py
@@ -58,7 +58,7 @@ from io import BytesIO
 import os
 from typing import Callable, Dict, Optional
 
-ALGORITHMS = set(["sha1", "sha256", "sha1_git", "blake2s256", "blake2b512"])
+ALGORITHMS = set(["sha1", "sha256", "sha1_git", "blake2s256", "blake2b512", "md5"])
 """Hashing algorithms supported by this module"""
 
 DEFAULT_ALGORITHMS = set(["sha1", "sha256", "sha1_git", "blake2s256"])
diff --git a/swh/model/tests/test_hashutil.py b/swh/model/tests/test_hashutil.py
index ec540d2f..c864bd8f 100644
--- a/swh/model/tests/test_hashutil.py
+++ b/swh/model/tests/test_hashutil.py
@@ -13,7 +13,7 @@ from unittest.mock import patch
 import pytest
 
 from swh.model import hashutil
-from swh.model.hashutil import MultiHash
+from swh.model.hashutil import DEFAULT_ALGORITHMS, MultiHash, hash_to_bytehex
 
 
 @contextlib.contextmanager
@@ -112,6 +112,36 @@ def test_multi_hash_file_bytehexdigest(hash_test_data):
     assert checksums == hash_test_data.bytehex_checksums
 
 
+def test_multi_hash_file_with_md5(hash_test_data):
+    fobj = io.BytesIO(hash_test_data.data)
+
+    checksums = MultiHash.from_file(
+        fobj, hash_names=DEFAULT_ALGORITHMS | {"md5"}, length=len(hash_test_data.data)
+    ).digest()
+    md5sum = {"md5": hashlib.md5(hash_test_data.data).digest()}
+    assert checksums == {**hash_test_data.checksums, **md5sum}
+
+
+def test_multi_hash_file_hexdigest_with_md5(hash_test_data):
+    fobj = io.BytesIO(hash_test_data.data)
+    length = len(hash_test_data.data)
+    checksums = MultiHash.from_file(
+        fobj, hash_names=DEFAULT_ALGORITHMS | {"md5"}, length=length
+    ).hexdigest()
+    md5sum = {"md5": hashlib.md5(hash_test_data.data).hexdigest()}
+    assert checksums == {**hash_test_data.hex_checksums, **md5sum}
+
+
+def test_multi_hash_file_bytehexdigest_with_md5(hash_test_data):
+    fobj = io.BytesIO(hash_test_data.data)
+    length = len(hash_test_data.data)
+    checksums = MultiHash.from_file(
+        fobj, hash_names=DEFAULT_ALGORITHMS | {"md5"}, length=length
+    ).bytehexdigest()
+    md5sum = {"md5": hash_to_bytehex(hashlib.md5(hash_test_data.data).digest())}
+    assert checksums == {**hash_test_data.bytehex_checksums, **md5sum}
+
+
 def test_multi_hash_file_missing_length(hash_test_data):
     fobj = io.BytesIO(hash_test_data.data)
     with pytest.raises(ValueError, match="Missing length"):
@@ -177,7 +207,7 @@ def test_new_hash_unsupported_hashing_algorithm():
     expected_message = (
         "Unexpected hashing algorithm blake2:10, "
         "expected one of blake2b512, blake2s256, "
-        "sha1, sha1_git, sha256"
+        "md5, sha1, sha1_git, sha256"
     )
     with pytest.raises(ValueError, match=expected_message):
         hashutil._new_hash("blake2:10")
-- 
GitLab