diff --git a/requirements-swh.txt b/requirements-swh.txt index 27ee6f53d7eb3ccb1673950cf2b9e23a6f2381f5..7a6d16a69669550ddd72b891cc5ec1d42363a014 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,5 +1,5 @@ swh.core >= 2.22.0 swh.loader.core >= 5.18.3 -swh.model >= 6.13.0 +swh.model >= 7.1.0 swh.scheduler >= 0.0.39 swh.storage >= 2.4.1 diff --git a/swh/loader/git/converters.py b/swh/loader/git/converters.py index 7588d6abfa1f553db0be938d049556a87bf3492b..16f95b03e7cd9e1e6a30ca736f654a2547802098 100644 --- a/swh/loader/git/converters.py +++ b/swh/loader/git/converters.py @@ -33,6 +33,7 @@ from swh.model.model import ( SkippedContent, SnapshotTargetType, Timestamp, + TimestampOverflowException, TimestampWithTimezone, ) @@ -161,11 +162,13 @@ def dulwich_tsinfo_to_timestamp( timezone_bytes: Optional[bytes], ) -> TimestampWithTimezone: """Convert the dulwich timestamp information to a structure compatible with - Software Heritage.""" - ts = Timestamp( - seconds=int(timestamp), - microseconds=0, - ) + Software Heritage. + + Returns epoch if the timestamp overflows :class:`Timestamp`.""" + try: + ts = Timestamp(seconds=int(timestamp), microseconds=0) + except TimestampOverflowException: + ts = Timestamp(seconds=0, microseconds=0) if timezone_bytes is None: # Failed to parse from the raw manifest, fallback to what Dulwich managed to # parse. diff --git a/swh/loader/git/tests/test_converters.py b/swh/loader/git/tests/test_converters.py index 67cd1f87e07a3bfe2f6dc4d0df51bc17d2bacddd..cfd9459ccea307b729198a8165abfe393e9eb7e0 100644 --- a/swh/loader/git/tests/test_converters.py +++ b/swh/loader/git/tests/test_converters.py @@ -566,6 +566,69 @@ class TestConverters: raw_manifest=b"commit 161\x00" + raw_string2, ) + def test_commit_timestamp_overflow(self): + """Checks raw_manifest is set when the commit cannot fit the data model""" + + # Well-formed manifest + raw_string = ( + b"tree 641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce\n" + b"author Foo <foo@example.org> 99999999999999999 +0200\n" + b"committer Foo <foo@example.org> 99999999999999999 +0200\n\n" + b"some commit message" + ) + commit = Commit.from_raw_string(Commit.type_name, raw_string) + date = TimestampWithTimezone( + timestamp=Timestamp(seconds=0, microseconds=0), + offset_bytes=b"+0200", + ) + assert converters.dulwich_commit_to_revision(commit) == Revision( + message=b"some commit message", + directory=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"), + synthetic=False, + author=Person.from_fullname( + b"Foo <foo@example.org>", + ), + committer=Person.from_fullname( + b"Foo <foo@example.org>", + ), + date=date, + committer_date=date, + type=RevisionType.GIT, + raw_manifest=b"commit 175\x00" + raw_string, + ) + + def test_commit_timestamp_large_offset(self): + """Checks commits with an offset too large to fit in :class:`datetime` can + still be parsed.""" + + # Well-formed manifest + raw_string = ( + b"tree 641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce\n" + b"author Foo <foo@example.org> 1640191028 +99999999\n" + b"committer Foo <foo@example.org> 1640191028 +99999999\n\n" + b"some commit message" + ) + commit = Commit.from_raw_string(Commit.type_name, raw_string) + date = TimestampWithTimezone( + timestamp=Timestamp(seconds=1640191028, microseconds=0), + offset_bytes=b"+99999999", + ) + assert converters.dulwich_commit_to_revision(commit) == Revision( + message=b"some commit message", + directory=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"), + synthetic=False, + author=Person.from_fullname( + b"Foo <foo@example.org>", + ), + committer=Person.from_fullname( + b"Foo <foo@example.org>", + ), + date=date, + committer_date=date, + type=RevisionType.GIT, + raw_manifest=None, + ) + def test_author_line_to_author(self): # edge case out of the way with pytest.raises(TypeError):