From 3ce412506cd942d1a5e7340bc309f43da2b2adeb Mon Sep 17 00:00:00 2001 From: Valentin Lorentz <vlorentz@softwareheritage.org> Date: Thu, 4 Feb 2021 11:02:39 +0100 Subject: [PATCH] identifiers: Change the manifest format of raw_extrinsic_metadata to use integer instead of ISO8601 Serializing as ISO8601 makes the hash brittle, because the database may change the timezone silently and/or lose precision in the microseconds. As we do not need precise timestamp, using an integer is good enough, and is consistant with the git format. The manifest also does not need to contain a timezone, as it only represents the timezone of the system that fetched this metadata, which is useless data. --- swh/model/identifiers.py | 12 +++- swh/model/tests/test_identifiers.py | 85 ++++++++++++++++++++++++++--- swh/model/tests/test_model.py | 6 +- 3 files changed, 89 insertions(+), 14 deletions(-) diff --git a/swh/model/identifiers.py b/swh/model/identifiers.py index dce259ed..9fed8ddc 100644 --- a/swh/model/identifiers.py +++ b/swh/model/identifiers.py @@ -733,7 +733,7 @@ def raw_extrinsic_metadata_identifier(metadata: Dict[str, Any]) -> str: ``` target $ExtendedSwhid - discovery_date $ISO8601 + discovery_date $Timestamp authority $StrWithoutSpaces $IRI fetcher $Str $Version format $StrWithoutSpaces @@ -759,6 +759,12 @@ def raw_extrinsic_metadata_identifier(metadata: Dict[str, Any]) -> str: $ExtendedSwhid is a core SWHID, with extra types allowed ('ori' for origins and 'emd' for raw extrinsic metadata) + $Timestamp is a decimal representation of the integer number of seconds since + the UNIX epoch (1970-01-01 00:00:00 UTC), with no leading '0' + (unless the timestamp value is zero) and no timezone. + It may be negative by prefixing it with a '-', which must not be followed + by a '0'. + Newlines in $Bytes, $Str, and $Iri are escaped as with other git fields, ie. by adding a space after them. @@ -766,9 +772,11 @@ def raw_extrinsic_metadata_identifier(metadata: Dict[str, Any]) -> str: str: the intrinsic identifier for `metadata` """ + timestamp = metadata["discovery_date"].timestamp() + headers = [ (b"target", str(metadata["target"]).encode()), - (b"discovery_date", metadata["discovery_date"].isoformat().encode("ascii")), + (b"discovery_date", str(int(timestamp)).encode("ascii")), ( b"authority", f"{metadata['authority']['type']} {metadata['authority']['url']}".encode(), diff --git a/swh/model/tests/test_identifiers.py b/swh/model/tests/test_identifiers.py index 8a0c4b81..7d5a0989 100644 --- a/swh/model/tests/test_identifiers.py +++ b/swh/model/tests/test_identifiers.py @@ -807,9 +807,9 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase): def test_minimal(self): manifest = ( - b"raw_extrinsic_metadata 225\0" + b"raw_extrinsic_metadata 210\0" b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n" - b"discovery_date 2021-01-25T11:27:51+00:00\n" + b"discovery_date 1611574071\n" b"authority forge https://forge.softwareheritage.org/\n" b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n" b"format json\n" @@ -823,14 +823,14 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase): ) self.assertEqual( identifiers.raw_extrinsic_metadata_identifier(self.minimal), - "df16b5ea35b12f530fb7ecd0eb10b87a8b1fc3d2", + "5c13f20ba336e44549baf3d7b9305b027ec9f43d", ) def test_maximal(self): manifest = ( - b"raw_extrinsic_metadata 548\0" + b"raw_extrinsic_metadata 533\0" b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n" - b"discovery_date 2021-01-25T11:27:51+00:00\n" + b"discovery_date 1611574071\n" b"authority forge https://forge.softwareheritage.org/\n" b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n" b"format json\n" @@ -851,7 +851,7 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase): ) self.assertEqual( identifiers.raw_extrinsic_metadata_identifier(self.maximal), - "55563d91a3f9cb41aa36c60c2b518433bf318ae4", + "f96966e1093d15236a31fde07e47d5b1c9428049", ) def test_nonascii_path(self): @@ -860,9 +860,9 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase): "path": b"/ab\nc/d\xf0\x9f\xa4\xb7e\x00f", } manifest = ( - b"raw_extrinsic_metadata 246\0" + b"raw_extrinsic_metadata 231\0" b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n" - b"discovery_date 2021-01-25T11:27:51+00:00\n" + b"discovery_date 1611574071\n" b"authority forge https://forge.softwareheritage.org/\n" b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n" b"format json\n" @@ -878,7 +878,74 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase): ) self.assertEqual( identifiers.raw_extrinsic_metadata_identifier(metadata), - "d8e5856601cdae96dfdfb5147235895949c9322d", + "7cc83fd1912176510c083f5df43f01b09af4b333", + ) + + def test_timezone_insensitive(self): + """Checks the timezone of the datetime.datetime does not affect the + hashed manifest.""" + utc_plus_one = datetime.timezone(datetime.timedelta(hours=1)) + metadata = { + **self.minimal, + "discovery_date": datetime.datetime( + 2021, 1, 25, 12, 27, 51, tzinfo=utc_plus_one, + ), + } + + self.assertEqual( + identifiers.raw_extrinsic_metadata_identifier(self.minimal), + identifiers.raw_extrinsic_metadata_identifier(metadata), + ) + self.assertEqual( + identifiers.raw_extrinsic_metadata_identifier(metadata), + "5c13f20ba336e44549baf3d7b9305b027ec9f43d", + ) + + def test_microsecond_insensitive(self): + """Checks the microseconds of the datetime.datetime does not affect the + hashed manifest.""" + metadata = { + **self.minimal, + "discovery_date": datetime.datetime( + 2021, 1, 25, 11, 27, 51, 123456, tzinfo=datetime.timezone.utc, + ), + } + + self.assertEqual( + identifiers.raw_extrinsic_metadata_identifier(self.minimal), + identifiers.raw_extrinsic_metadata_identifier(metadata), + ) + self.assertEqual( + identifiers.raw_extrinsic_metadata_identifier(metadata), + "5c13f20ba336e44549baf3d7b9305b027ec9f43d", + ) + + def test_negative_timestamp(self): + metadata = { + **self.minimal, + "discovery_date": datetime.datetime( + 1960, 1, 25, 11, 27, 51, tzinfo=datetime.timezone.utc, + ), + } + + manifest = ( + b"raw_extrinsic_metadata 210\0" + b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n" + b"discovery_date -313504329\n" + b"authority forge https://forge.softwareheritage.org/\n" + b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n" + b"format json\n" + b"\n" + b'{"foo": "bar"}' + ) + + self.assertEqual( + identifiers.raw_extrinsic_metadata_identifier(metadata), + hashlib.sha1(manifest).hexdigest(), + ) + self.assertEqual( + identifiers.raw_extrinsic_metadata_identifier(metadata), + "895d0821a2991dd376ddc303424aceb7c68280f9", ) diff --git a/swh/model/tests/test_model.py b/swh/model/tests/test_model.py index bdc741a4..8325bf51 100644 --- a/swh/model/tests/test_model.py +++ b/swh/model/tests/test_model.py @@ -855,7 +855,7 @@ def test_metadata_to_dict(): m = RawExtrinsicMetadata(target=_origin_swhid, **_common_metadata_fields,) assert m.to_dict() == { "target": str(_origin_swhid), - "id": b"\xeck\x9cQ\xf1\x1f\xeb\xde\x85{\x7f\xf0\x83\x9c\x8a\xd5\xfb\x8e2\xef", + "id": b"@j\xc9\x01\xbc\x1e#p*\xf3q9\xa7u\x97\x00\x14\x02xa", **common_fields, } assert RawExtrinsicMetadata.from_dict(m.to_dict()) == m @@ -863,7 +863,7 @@ def test_metadata_to_dict(): m = RawExtrinsicMetadata(target=_content_swhid, **_common_metadata_fields,) assert m.to_dict() == { "target": "swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2", - "id": b"\x8e:_;eb\xe6\xf1Y\xd9\xa5aG[\rt\x89\xa1\x0b\xe4", + "id": b"\xbc\xa3U\xddf\x19U\xc5\xd2\xd7\xdfK\xd7c\x1f\xa8\xfeh\x992", **common_fields, } assert RawExtrinsicMetadata.from_dict(m.to_dict()) == m @@ -882,7 +882,7 @@ def test_metadata_to_dict(): ) assert m.to_dict() == { "target": "swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2", - "id": b"\x07\xf1aS\xbe\xda\xd2\xf2\xd7\xaf:\xc7\xb7\x91C\x87W\x85R\x19", + "id": b"\x14l\xb0\x1f\xb9\xc0{)\xc7\x0f\xbd\xc0*,YZ\xf5C\xab\xfc", **common_fields, "origin": "https://example.org/", "snapshot": f"swh:1:snp:{hash_hex}", -- GitLab