From 559a283cb095de2b7da173e5995883b0af801f64 Mon Sep 17 00:00:00 2001
From: "Antoine R. Dumont (@ardumont)" <ardumont@softwareheritage.org>
Date: Thu, 12 Nov 2020 10:39:24 +0100
Subject: [PATCH] identifiers.parse_swhid: Make SWHIDs with whitespaces invalid

parse_swhid now raises a ValidationError when the SWHID contains any whitespace character.

Related to T2769
---
 swh/model/identifiers.py            |  6 +++++-
 swh/model/tests/test_identifiers.py | 17 +++++++++++++++++
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/swh/model/identifiers.py b/swh/model/identifiers.py
index e4f3a637..c479f3ba 100644
--- a/swh/model/identifiers.py
+++ b/swh/model/identifiers.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-2019  The Software Heritage developers
+# Copyright (C) 2015-2020  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -7,6 +7,7 @@ import binascii
 import datetime
 from functools import lru_cache
 import hashlib
+import re
 from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 
 import attr
@@ -823,6 +824,9 @@ def parse_swhid(swhid: str) -> SWHID:
         a named tuple holding the parsing result
 
     """
+    if re.search(r"[ \t\n\r\f\v]", swhid):
+        raise ValidationError("Invalid SwHID: SWHIDs cannot contain whitespaces")
+
     # <swhid>;<contextual-information>
     swhid_parts = swhid.split(SWHID_CTXT_SEP)
     swhid_data = swhid_parts.pop(0).split(":")
diff --git a/swh/model/tests/test_identifiers.py b/swh/model/tests/test_identifiers.py
index cb21bb4b..73515c65 100644
--- a/swh/model/tests/test_identifiers.py
+++ b/swh/model/tests/test_identifiers.py
@@ -1069,6 +1069,23 @@ def test_normalize_timestamp_dict_invalid_timestamp(dict_input):
         "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;origin=something;anchor=1;visit=1;pa=/",  # noqa
         # wrong qualifier: line should be lines
         "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;line=10;origin=something;anchor=1;visit=1;path=/",  # noqa
+        # wrong qualifier value: it contains space before or after
+        "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;origin=  https://some-url",  # noqa
+        "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;origin=something;anchor=some-anchor    ",  # noqa
+        "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;origin=something;anchor=some-anchor    ;visit=1",  # noqa
+        # invalid swhid: whitespaces
+        "swh :1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;ori=something;anchor=1;visit=1;path=/",  # noqa
+        "swh: 1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;ori=something;anchor=1;visit=1;path=/",  # noqa
+        "swh: 1: dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;ori=something;anchor=1;visit=1;path=/",  # noqa
+        "swh:1: dir: 0b6959356d30f1a4e9b7f6bca59b9a336464c03d",
+        "swh:1: dir: 0b6959356d30f1a4e9b7f6bca59b9a336464c03d; origin=blah",
+        "swh:1: dir: 0b6959356d30f1a4e9b7f6bca59b9a336464c03d;lines=12",
+        # other whitespaces
+        "swh\t:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;lines=12",
+        "swh:1\n:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;lines=12",
+        "swh:1:\rdir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;lines=12",
+        "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d\f;lines=12",
+        "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;lines=12\v",
     ],
 )
 def test_parse_swhid_parsing_error(invalid_swhid):
-- 
GitLab