From 2c4114770987039999a28b01092510ab6ef449d5 Mon Sep 17 00:00:00 2001
From: vlorentz <vlorentz@softwareheritage.org>
Date: Wed, 19 Mar 2025 08:13:13 +0000
Subject: [PATCH] Add support for loading commits whose timestamp cannot be represented by replacing their timestamp with epoch, and relying on the raw_manifest to store the timestamp instead

---
 requirements-swh.txt                    |  2 +-
 swh/loader/git/converters.py            | 13 +++--
 swh/loader/git/tests/test_converters.py | 63 +++++++++++++++++++++++++
 3 files changed, 72 insertions(+), 6 deletions(-)

diff --git a/requirements-swh.txt b/requirements-swh.txt
index 27ee6f53..7a6d16a6 100644
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,5 +1,5 @@
 swh.core >= 2.22.0
 swh.loader.core >= 5.18.3
-swh.model >= 6.13.0
+swh.model >= 7.1.0
 swh.scheduler >= 0.0.39
 swh.storage >= 2.4.1
diff --git a/swh/loader/git/converters.py b/swh/loader/git/converters.py
index 7588d6ab..16f95b03 100644
--- a/swh/loader/git/converters.py
+++ b/swh/loader/git/converters.py
@@ -33,6 +33,7 @@ from swh.model.model import (
     SkippedContent,
     SnapshotTargetType,
     Timestamp,
+    TimestampOverflowException,
     TimestampWithTimezone,
 )
 
@@ -161,11 +162,13 @@ def dulwich_tsinfo_to_timestamp(
     timezone_bytes: Optional[bytes],
 ) -> TimestampWithTimezone:
     """Convert the dulwich timestamp information to a structure compatible with
-    Software Heritage."""
-    ts = Timestamp(
-        seconds=int(timestamp),
-        microseconds=0,
-    )
+    Software Heritage.
+
+    Returns epoch if the timestamp overflows :class:`Timestamp`."""
+    try:
+        ts = Timestamp(seconds=int(timestamp), microseconds=0)
+    except TimestampOverflowException:
+        ts = Timestamp(seconds=0, microseconds=0)
     if timezone_bytes is None:
         # Failed to parse from the raw manifest, fallback to what Dulwich managed to
         # parse.
diff --git a/swh/loader/git/tests/test_converters.py b/swh/loader/git/tests/test_converters.py
index 67cd1f87..cfd9459c 100644
--- a/swh/loader/git/tests/test_converters.py
+++ b/swh/loader/git/tests/test_converters.py
@@ -566,6 +566,69 @@ class TestConverters:
             raw_manifest=b"commit 161\x00" + raw_string2,
         )
 
+    def test_commit_timestamp_overflow(self):
+        """Checks raw_manifest is set when the commit cannot fit the data model"""
+
+        # Well-formed manifest
+        raw_string = (
+            b"tree 641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce\n"
+            b"author Foo <foo@example.org> 99999999999999999 +0200\n"
+            b"committer Foo <foo@example.org> 99999999999999999 +0200\n\n"
+            b"some commit message"
+        )
+        commit = Commit.from_raw_string(Commit.type_name, raw_string)
+        date = TimestampWithTimezone(
+            timestamp=Timestamp(seconds=0, microseconds=0),
+            offset_bytes=b"+0200",
+        )
+        assert converters.dulwich_commit_to_revision(commit) == Revision(
+            message=b"some commit message",
+            directory=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"),
+            synthetic=False,
+            author=Person.from_fullname(
+                b"Foo <foo@example.org>",
+            ),
+            committer=Person.from_fullname(
+                b"Foo <foo@example.org>",
+            ),
+            date=date,
+            committer_date=date,
+            type=RevisionType.GIT,
+            raw_manifest=b"commit 175\x00" + raw_string,
+        )
+
+    def test_commit_timestamp_large_offset(self):
+        """Checks commits with an offset too large to fit in :class:`datetime` can
+        still be parsed."""
+
+        # Well-formed manifest
+        raw_string = (
+            b"tree 641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce\n"
+            b"author Foo <foo@example.org> 1640191028 +99999999\n"
+            b"committer Foo <foo@example.org> 1640191028 +99999999\n\n"
+            b"some commit message"
+        )
+        commit = Commit.from_raw_string(Commit.type_name, raw_string)
+        date = TimestampWithTimezone(
+            timestamp=Timestamp(seconds=1640191028, microseconds=0),
+            offset_bytes=b"+99999999",
+        )
+        assert converters.dulwich_commit_to_revision(commit) == Revision(
+            message=b"some commit message",
+            directory=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"),
+            synthetic=False,
+            author=Person.from_fullname(
+                b"Foo <foo@example.org>",
+            ),
+            committer=Person.from_fullname(
+                b"Foo <foo@example.org>",
+            ),
+            date=date,
+            committer_date=date,
+            type=RevisionType.GIT,
+            raw_manifest=None,
+        )
+
     def test_author_line_to_author(self):
         # edge case out of the way
         with pytest.raises(TypeError):
-- 
GitLab