From 98a7c011c05c0151bfcac145a457f1ef6ff37690 Mon Sep 17 00:00:00 2001
From: Valentin Lorentz <vlorentz@softwareheritage.org>
Date: Wed, 19 Mar 2025 08:21:32 +0100
Subject: [PATCH 1/2] Add support for loading commits whose timestamp cannot be
 represented

When a commit's timestamp overflows Timestamp, it is replaced with the
epoch, and the original value is kept in the raw_manifest instead.
---
 requirements-swh.txt                    |  2 +-
 swh/loader/git/converters.py            | 13 ++++--
 swh/loader/git/tests/test_converters.py | 62 +++++++++++++++++++++++++
 3 files changed, 71 insertions(+), 6 deletions(-)

diff --git a/requirements-swh.txt b/requirements-swh.txt
index 27ee6f53..7a6d16a6 100644
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,5 +1,5 @@
 swh.core >= 2.22.0
 swh.loader.core >= 5.18.3
-swh.model >= 6.13.0
+swh.model >= 7.1.0
 swh.scheduler >= 0.0.39
 swh.storage >= 2.4.1
diff --git a/swh/loader/git/converters.py b/swh/loader/git/converters.py
index 7588d6ab..16f95b03 100644
--- a/swh/loader/git/converters.py
+++ b/swh/loader/git/converters.py
@@ -33,6 +33,7 @@ from swh.model.model import (
     SkippedContent,
     SnapshotTargetType,
     Timestamp,
+    TimestampOverflowException,
     TimestampWithTimezone,
 )
 
@@ -161,11 +162,13 @@ def dulwich_tsinfo_to_timestamp(
     timezone_bytes: Optional[bytes],
 ) -> TimestampWithTimezone:
     """Convert the dulwich timestamp information to a structure compatible with
-    Software Heritage."""
-    ts = Timestamp(
-        seconds=int(timestamp),
-        microseconds=0,
-    )
+    Software Heritage.
+
+    Returns epoch if the timestamp overflows :class:`Timestamp`."""
+    try:
+        ts = Timestamp(seconds=int(timestamp), microseconds=0)
+    except TimestampOverflowException:
+        ts = Timestamp(seconds=0, microseconds=0)
     if timezone_bytes is None:
         # Failed to parse from the raw manifest, fallback to what Dulwich managed to
         # parse.
diff --git a/swh/loader/git/tests/test_converters.py b/swh/loader/git/tests/test_converters.py
index 67cd1f87..472e8345 100644
--- a/swh/loader/git/tests/test_converters.py
+++ b/swh/loader/git/tests/test_converters.py
@@ -566,6 +566,68 @@ class TestConverters:
             raw_manifest=b"commit 161\x00" + raw_string2,
         )
 
+    def test_commit_timestamp_overflow(self):
+        """Checks raw_manifest is set when the commit cannot fit the data model"""
+
+        # Well-formed manifest
+        raw_string = (
+            b"tree 641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce\n"
+            b"author Foo <foo@example.org> 99999999999999999 +0200\n"
+            b"committer Foo <foo@example.org> 99999999999999999 +0200\n\n"
+            b"some commit message"
+        )
+        commit = Commit.from_raw_string(Commit.type_name, raw_string)
+        date = TimestampWithTimezone(
+            timestamp=Timestamp(seconds=0, microseconds=0),
+            offset_bytes=b"+0200",
+        )
+        assert converters.dulwich_commit_to_revision(commit) == Revision(
+            message=b"some commit message",
+            directory=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"),
+            synthetic=False,
+            author=Person.from_fullname(
+                b"Foo <foo@example.org>",
+            ),
+            committer=Person.from_fullname(
+                b"Foo <foo@example.org>",
+            ),
+            date=date,
+            committer_date=date,
+            type=RevisionType.GIT,
+            raw_manifest=b"commit 175\x00" + raw_string,
+        )
+
+    def test_commit_timestamp_large_offset(self):
+        """Checks raw_manifest is set when the commit cannot fit the data model"""
+
+        # Well-formed manifest
+        raw_string = (
+            b"tree 641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce\n"
+            b"author Foo <foo@example.org> 1640191028 +99999999\n"
+            b"committer Foo <foo@example.org> 1640191028 +99999999\n\n"
+            b"some commit message"
+        )
+        commit = Commit.from_raw_string(Commit.type_name, raw_string)
+        date = TimestampWithTimezone(
+            timestamp=Timestamp(seconds=1640191028, microseconds=0),
+            offset_bytes=b"+99999999",
+        )
+        assert converters.dulwich_commit_to_revision(commit) == Revision(
+            message=b"some commit message",
+            directory=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"),
+            synthetic=False,
+            author=Person.from_fullname(
+                b"Foo <foo@example.org>",
+            ),
+            committer=Person.from_fullname(
+                b"Foo <foo@example.org>",
+            ),
+            date=date,
+            committer_date=date,
+            type=RevisionType.GIT,
+            raw_manifest=None,
+        )
+
     def test_author_line_to_author(self):
         # edge case out of the way
         with pytest.raises(TypeError):
-- 
GitLab


From 621a6d0460392c7c194b86d5c4b59d422993ae6f Mon Sep 17 00:00:00 2001
From: Valentin Lorentz <vlorentz@softwareheritage.org>
Date: Wed, 19 Mar 2025 09:02:59 +0100
Subject: [PATCH 2/2] Fix test_commit_timestamp_large_offset docstring

---
 swh/loader/git/tests/test_converters.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/swh/loader/git/tests/test_converters.py b/swh/loader/git/tests/test_converters.py
index 472e8345..cfd9459c 100644
--- a/swh/loader/git/tests/test_converters.py
+++ b/swh/loader/git/tests/test_converters.py
@@ -598,7 +598,8 @@ class TestConverters:
         )
 
     def test_commit_timestamp_large_offset(self):
-        """Checks raw_manifest is set when the commit cannot fit the data model"""
+        """Checks commits with an offset too large to fit in :class:`datetime` can
+        still be parsed."""
 
         # Well-formed manifest
         raw_string = (
-- 
GitLab