Limit Origin URLs to fewer than 2048 UTF-8 encoded bytes.

* swh/model/model.py: add a validator on Origin.url that raises
  ValueError("Origin URL is too long") when len(url.encode()) >= 2048.
* swh/model/hypothesis_strategies.py: filter the default `url` strategy of
  origins_d() with the matching bound (len(iri.encode()) < 2048).
* swh/model/tests/test_model.py: check that over-long URLs are rejected.

NOTE(review): the line breaks and indentation of this patch were
reconstructed from the hunk line counts and Python syntax; confirm the
whitespace against the target tree before applying.
NOTE(review): the new test only uses URLs far above the limit; the
2047/2048-byte boundary and a non-ASCII URL are not exercised.
NOTE(review): the 2048 bound is written out separately in the strategy
and in the validator; the two must be kept in sync by hand.

diff --git a/swh/model/hypothesis_strategies.py b/swh/model/hypothesis_strategies.py
index 8beca3427c8d79bda322f4711b4ed896c97f0b2d..e41e275de1d71278f176e87b1396ee2ae8c5882c 100644
--- a/swh/model/hypothesis_strategies.py
+++ b/swh/model/hypothesis_strategies.py
@@ -175,7 +175,7 @@ timestamps_with_timezone = timestamps_with_timezone_d().map(
 )
 
 
-def origins_d(*, url=iris()):
+def origins_d(*, url=iris().filter(lambda iri: len(iri.encode()) < 2048)):
     return builds(dict, url=url)
 
 
diff --git a/swh/model/model.py b/swh/model/model.py
index 136cf82b53f4d9dd6307f6c04e3f4079d6ab6cff..06e1901351deaed88ff4df688b2e9543f29c8506 100644
--- a/swh/model/model.py
+++ b/swh/model/model.py
@@ -809,6 +809,19 @@ class Origin(BaseHashableModel):
             object_id=self.id,
         )
 
+    @url.validator
+    def check_url(self, attribute, value):
+        if len(value.encode()) >= 2048:
+            # Rationale for this value:
+            # 1. Needs to be stored in a postgresql btree, which is limited to
+            #    somewhere around 2700 bytes
+            # 2. URLs longer than 2048 characters won't work very well in browsers,
+            #    and repository URLs are often meant to at least display something
+            #    when opened in a browser. https://stackoverflow.com/a/417184/539465
+            # 3. Even though this field is actually an IRI, it is usually in ASCII
+            #    so this should be a good-enough approximation
+            raise ValueError("Origin URL is too long")
+
 
 @attr.s(frozen=True, slots=True, field_transformer=optimize_all_validators)
 class OriginVisit(BaseModel):
diff --git a/swh/model/tests/test_model.py b/swh/model/tests/test_model.py
index 11d0260fa0dcbfef14beeaf7edd1f225d00f851c..38caf19c75d1f266959311af5d19fbba06e4bdc2 100644
--- a/swh/model/tests/test_model.py
+++ b/swh/model/tests/test_model.py
@@ -393,6 +393,13 @@ def test_todict_origins(origin):
     assert type(origin)(url=origin.url) == type(origin).from_dict(obj)
 
 
+def test_origin_long_url():
+    with pytest.raises(ValueError, match="Origin URL is too long"):
+        Origin(url="https://" + "a" * 3000)
+    with pytest.raises(ValueError, match="Origin URL is too long"):
+        Origin(url="https://example.org/" + "a" * 3050)
+
+
 @given(strategies.origin_visits())
 def test_todict_origin_visits(origin_visit):
     obj = origin_visit.to_dict()