Skip to content
Snippets Groups Projects
Commit 3ce41250 authored by vlorentz's avatar vlorentz
Browse files

identifiers: Change the manifest format of raw_extrinsic_metadata to use integer instead of ISO8601

Serializing as ISO8601 makes the hash brittle, because the database may
change the timezone silently and/or lose precision in the microseconds.

As we do not need precise timestamp, using an integer is good enough,
and is consistant with the git format.

The manifest also does not need to contain a timezone, as it only
represents the timezone of the system that fetched this metadata,
which is useless data.
parent fc808e1f
No related branches found
No related tags found
No related merge requests found
...@@ -733,7 +733,7 @@ def raw_extrinsic_metadata_identifier(metadata: Dict[str, Any]) -> str: ...@@ -733,7 +733,7 @@ def raw_extrinsic_metadata_identifier(metadata: Dict[str, Any]) -> str:
``` ```
target $ExtendedSwhid target $ExtendedSwhid
discovery_date $ISO8601 discovery_date $Timestamp
authority $StrWithoutSpaces $IRI authority $StrWithoutSpaces $IRI
fetcher $Str $Version fetcher $Str $Version
format $StrWithoutSpaces format $StrWithoutSpaces
...@@ -759,6 +759,12 @@ def raw_extrinsic_metadata_identifier(metadata: Dict[str, Any]) -> str: ...@@ -759,6 +759,12 @@ def raw_extrinsic_metadata_identifier(metadata: Dict[str, Any]) -> str:
$ExtendedSwhid is a core SWHID, with extra types allowed ('ori' for $ExtendedSwhid is a core SWHID, with extra types allowed ('ori' for
origins and 'emd' for raw extrinsic metadata) origins and 'emd' for raw extrinsic metadata)
$Timestamp is a decimal representation of the integer number of seconds since
the UNIX epoch (1970-01-01 00:00:00 UTC), with no leading '0'
(unless the timestamp value is zero) and no timezone.
It may be negative by prefixing it with a '-', which must not be followed
by a '0'.
Newlines in $Bytes, $Str, and $Iri are escaped as with other git fields, Newlines in $Bytes, $Str, and $Iri are escaped as with other git fields,
ie. by adding a space after them. ie. by adding a space after them.
...@@ -766,9 +772,11 @@ def raw_extrinsic_metadata_identifier(metadata: Dict[str, Any]) -> str: ...@@ -766,9 +772,11 @@ def raw_extrinsic_metadata_identifier(metadata: Dict[str, Any]) -> str:
str: the intrinsic identifier for `metadata` str: the intrinsic identifier for `metadata`
""" """
timestamp = metadata["discovery_date"].timestamp()
headers = [ headers = [
(b"target", str(metadata["target"]).encode()), (b"target", str(metadata["target"]).encode()),
(b"discovery_date", metadata["discovery_date"].isoformat().encode("ascii")), (b"discovery_date", str(int(timestamp)).encode("ascii")),
( (
b"authority", b"authority",
f"{metadata['authority']['type']} {metadata['authority']['url']}".encode(), f"{metadata['authority']['type']} {metadata['authority']['url']}".encode(),
......
...@@ -807,9 +807,9 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase): ...@@ -807,9 +807,9 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase):
def test_minimal(self): def test_minimal(self):
manifest = ( manifest = (
b"raw_extrinsic_metadata 225\0" b"raw_extrinsic_metadata 210\0"
b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n" b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n"
b"discovery_date 2021-01-25T11:27:51+00:00\n" b"discovery_date 1611574071\n"
b"authority forge https://forge.softwareheritage.org/\n" b"authority forge https://forge.softwareheritage.org/\n"
b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n" b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n"
b"format json\n" b"format json\n"
...@@ -823,14 +823,14 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase): ...@@ -823,14 +823,14 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase):
) )
self.assertEqual( self.assertEqual(
identifiers.raw_extrinsic_metadata_identifier(self.minimal), identifiers.raw_extrinsic_metadata_identifier(self.minimal),
"df16b5ea35b12f530fb7ecd0eb10b87a8b1fc3d2", "5c13f20ba336e44549baf3d7b9305b027ec9f43d",
) )
def test_maximal(self): def test_maximal(self):
manifest = ( manifest = (
b"raw_extrinsic_metadata 548\0" b"raw_extrinsic_metadata 533\0"
b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n" b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n"
b"discovery_date 2021-01-25T11:27:51+00:00\n" b"discovery_date 1611574071\n"
b"authority forge https://forge.softwareheritage.org/\n" b"authority forge https://forge.softwareheritage.org/\n"
b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n" b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n"
b"format json\n" b"format json\n"
...@@ -851,7 +851,7 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase): ...@@ -851,7 +851,7 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase):
) )
self.assertEqual( self.assertEqual(
identifiers.raw_extrinsic_metadata_identifier(self.maximal), identifiers.raw_extrinsic_metadata_identifier(self.maximal),
"55563d91a3f9cb41aa36c60c2b518433bf318ae4", "f96966e1093d15236a31fde07e47d5b1c9428049",
) )
def test_nonascii_path(self): def test_nonascii_path(self):
...@@ -860,9 +860,9 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase): ...@@ -860,9 +860,9 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase):
"path": b"/ab\nc/d\xf0\x9f\xa4\xb7e\x00f", "path": b"/ab\nc/d\xf0\x9f\xa4\xb7e\x00f",
} }
manifest = ( manifest = (
b"raw_extrinsic_metadata 246\0" b"raw_extrinsic_metadata 231\0"
b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n" b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n"
b"discovery_date 2021-01-25T11:27:51+00:00\n" b"discovery_date 1611574071\n"
b"authority forge https://forge.softwareheritage.org/\n" b"authority forge https://forge.softwareheritage.org/\n"
b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n" b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n"
b"format json\n" b"format json\n"
...@@ -878,7 +878,74 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase): ...@@ -878,7 +878,74 @@ class RawExtrinsicMetadataIdentifier(unittest.TestCase):
) )
self.assertEqual( self.assertEqual(
identifiers.raw_extrinsic_metadata_identifier(metadata), identifiers.raw_extrinsic_metadata_identifier(metadata),
"d8e5856601cdae96dfdfb5147235895949c9322d", "7cc83fd1912176510c083f5df43f01b09af4b333",
)
def test_timezone_insensitive(self):
"""Checks the timezone of the datetime.datetime does not affect the
hashed manifest."""
utc_plus_one = datetime.timezone(datetime.timedelta(hours=1))
metadata = {
**self.minimal,
"discovery_date": datetime.datetime(
2021, 1, 25, 12, 27, 51, tzinfo=utc_plus_one,
),
}
self.assertEqual(
identifiers.raw_extrinsic_metadata_identifier(self.minimal),
identifiers.raw_extrinsic_metadata_identifier(metadata),
)
self.assertEqual(
identifiers.raw_extrinsic_metadata_identifier(metadata),
"5c13f20ba336e44549baf3d7b9305b027ec9f43d",
)
def test_microsecond_insensitive(self):
"""Checks the microseconds of the datetime.datetime does not affect the
hashed manifest."""
metadata = {
**self.minimal,
"discovery_date": datetime.datetime(
2021, 1, 25, 11, 27, 51, 123456, tzinfo=datetime.timezone.utc,
),
}
self.assertEqual(
identifiers.raw_extrinsic_metadata_identifier(self.minimal),
identifiers.raw_extrinsic_metadata_identifier(metadata),
)
self.assertEqual(
identifiers.raw_extrinsic_metadata_identifier(metadata),
"5c13f20ba336e44549baf3d7b9305b027ec9f43d",
)
def test_negative_timestamp(self):
metadata = {
**self.minimal,
"discovery_date": datetime.datetime(
1960, 1, 25, 11, 27, 51, tzinfo=datetime.timezone.utc,
),
}
manifest = (
b"raw_extrinsic_metadata 210\0"
b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n"
b"discovery_date -313504329\n"
b"authority forge https://forge.softwareheritage.org/\n"
b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n"
b"format json\n"
b"\n"
b'{"foo": "bar"}'
)
self.assertEqual(
identifiers.raw_extrinsic_metadata_identifier(metadata),
hashlib.sha1(manifest).hexdigest(),
)
self.assertEqual(
identifiers.raw_extrinsic_metadata_identifier(metadata),
"895d0821a2991dd376ddc303424aceb7c68280f9",
) )
......
...@@ -855,7 +855,7 @@ def test_metadata_to_dict(): ...@@ -855,7 +855,7 @@ def test_metadata_to_dict():
m = RawExtrinsicMetadata(target=_origin_swhid, **_common_metadata_fields,) m = RawExtrinsicMetadata(target=_origin_swhid, **_common_metadata_fields,)
assert m.to_dict() == { assert m.to_dict() == {
"target": str(_origin_swhid), "target": str(_origin_swhid),
"id": b"\xeck\x9cQ\xf1\x1f\xeb\xde\x85{\x7f\xf0\x83\x9c\x8a\xd5\xfb\x8e2\xef", "id": b"@j\xc9\x01\xbc\x1e#p*\xf3q9\xa7u\x97\x00\x14\x02xa",
**common_fields, **common_fields,
} }
assert RawExtrinsicMetadata.from_dict(m.to_dict()) == m assert RawExtrinsicMetadata.from_dict(m.to_dict()) == m
...@@ -863,7 +863,7 @@ def test_metadata_to_dict(): ...@@ -863,7 +863,7 @@ def test_metadata_to_dict():
m = RawExtrinsicMetadata(target=_content_swhid, **_common_metadata_fields,) m = RawExtrinsicMetadata(target=_content_swhid, **_common_metadata_fields,)
assert m.to_dict() == { assert m.to_dict() == {
"target": "swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2", "target": "swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2",
"id": b"\x8e:_;eb\xe6\xf1Y\xd9\xa5aG[\rt\x89\xa1\x0b\xe4", "id": b"\xbc\xa3U\xddf\x19U\xc5\xd2\xd7\xdfK\xd7c\x1f\xa8\xfeh\x992",
**common_fields, **common_fields,
} }
assert RawExtrinsicMetadata.from_dict(m.to_dict()) == m assert RawExtrinsicMetadata.from_dict(m.to_dict()) == m
...@@ -882,7 +882,7 @@ def test_metadata_to_dict(): ...@@ -882,7 +882,7 @@ def test_metadata_to_dict():
) )
assert m.to_dict() == { assert m.to_dict() == {
"target": "swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2", "target": "swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2",
"id": b"\x07\xf1aS\xbe\xda\xd2\xf2\xd7\xaf:\xc7\xb7\x91C\x87W\x85R\x19", "id": b"\x14l\xb0\x1f\xb9\xc0{)\xc7\x0f\xbd\xc0*,YZ\xf5C\xab\xfc",
**common_fields, **common_fields,
"origin": "https://example.org/", "origin": "https://example.org/",
"snapshot": f"swh:1:snp:{hash_hex}", "snapshot": f"swh:1:snp:{hash_hex}",
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment