From 2c4114770987039999a28b01092510ab6ef449d5 Mon Sep 17 00:00:00 2001
From: vlorentz <vlorentz@softwareheritage.org>
Date: Wed, 19 Mar 2025 08:13:13 +0000
Subject: [PATCH] Add support for loading commits whose timestamp cannot be
 represented

Such timestamps are replaced with the epoch; the original value is
preserved in the raw_manifest instead.
---
 requirements-swh.txt                    |  2 +-
 swh/loader/git/converters.py            | 13 +++--
 swh/loader/git/tests/test_converters.py | 63 +++++++++++++++++++++++++
 3 files changed, 72 insertions(+), 6 deletions(-)

diff --git a/requirements-swh.txt b/requirements-swh.txt
index 27ee6f53..7a6d16a6 100644
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,5 +1,5 @@
 swh.core >= 2.22.0
 swh.loader.core >= 5.18.3
-swh.model >= 6.13.0
+swh.model >= 7.1.0
 swh.scheduler >= 0.0.39
 swh.storage >= 2.4.1
diff --git a/swh/loader/git/converters.py b/swh/loader/git/converters.py
index 7588d6ab..16f95b03 100644
--- a/swh/loader/git/converters.py
+++ b/swh/loader/git/converters.py
@@ -33,6 +33,7 @@ from swh.model.model import (
     SkippedContent,
     SnapshotTargetType,
     Timestamp,
+    TimestampOverflowException,
     TimestampWithTimezone,
 )
 
@@ -161,11 +162,13 @@ def dulwich_tsinfo_to_timestamp(
     timezone_bytes: Optional[bytes],
 ) -> TimestampWithTimezone:
     """Convert the dulwich timestamp information to a structure compatible with
-    Software Heritage."""
-    ts = Timestamp(
-        seconds=int(timestamp),
-        microseconds=0,
-    )
+    Software Heritage.
+
+    Returns epoch if the timestamp overflows :class:`Timestamp`."""
+    try:
+        ts = Timestamp(seconds=int(timestamp), microseconds=0)
+    except TimestampOverflowException:
+        ts = Timestamp(seconds=0, microseconds=0)
     if timezone_bytes is None:
         # Failed to parse from the raw manifest, fallback to what Dulwich managed to
         # parse.
diff --git a/swh/loader/git/tests/test_converters.py b/swh/loader/git/tests/test_converters.py
index 67cd1f87..cfd9459c 100644
--- a/swh/loader/git/tests/test_converters.py
+++ b/swh/loader/git/tests/test_converters.py
@@ -566,6 +566,69 @@ class TestConverters:
             raw_manifest=b"commit 161\x00" + raw_string2,
         )
 
+    def test_commit_timestamp_overflow(self):
+        """Checks an overflowing timestamp becomes epoch and raw_manifest is set"""
+
+        # Well-formed manifest, but the timestamps overflow Timestamp
+        raw_string = (
+            b"tree 641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce\n"
+            b"author Foo <foo@example.org> 99999999999999999 +0200\n"
+            b"committer Foo <foo@example.org> 99999999999999999 +0200\n\n"
+            b"some commit message"
+        )
+        commit = Commit.from_raw_string(Commit.type_name, raw_string)
+        date = TimestampWithTimezone(
+            timestamp=Timestamp(seconds=0, microseconds=0),
+            offset_bytes=b"+0200",
+        )
+        assert converters.dulwich_commit_to_revision(commit) == Revision(
+            message=b"some commit message",
+            directory=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"),
+            synthetic=False,
+            author=Person.from_fullname(
+                b"Foo <foo@example.org>",
+            ),
+            committer=Person.from_fullname(
+                b"Foo <foo@example.org>",
+            ),
+            date=date,
+            committer_date=date,
+            type=RevisionType.GIT,
+            raw_manifest=b"commit 175\x00" + raw_string,
+        )
+
+    def test_commit_timestamp_large_offset(self):
+        """Checks commits with an offset too large to fit in :class:`datetime` can
+        still be parsed, without needing a raw_manifest."""
+
+        # Well-formed manifest with a very large timezone offset
+        raw_string = (
+            b"tree 641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce\n"
+            b"author Foo <foo@example.org> 1640191028 +99999999\n"
+            b"committer Foo <foo@example.org> 1640191028 +99999999\n\n"
+            b"some commit message"
+        )
+        commit = Commit.from_raw_string(Commit.type_name, raw_string)
+        date = TimestampWithTimezone(
+            timestamp=Timestamp(seconds=1640191028, microseconds=0),
+            offset_bytes=b"+99999999",
+        )
+        assert converters.dulwich_commit_to_revision(commit) == Revision(
+            message=b"some commit message",
+            directory=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"),
+            synthetic=False,
+            author=Person.from_fullname(
+                b"Foo <foo@example.org>",
+            ),
+            committer=Person.from_fullname(
+                b"Foo <foo@example.org>",
+            ),
+            date=date,
+            committer_date=date,
+            type=RevisionType.GIT,
+            raw_manifest=None,
+        )
+
     def test_author_line_to_author(self):
         # edge case out of the way
         with pytest.raises(TypeError):
-- 
GitLab