diff --git a/PKG-INFO b/PKG-INFO index 6d30a917c19afba7f5bbdf82b9f05c362db5f9c1..7082d8cc50e208c7367bc7b38eda7780765f8f2d 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: swh.model -Version: 0.7.3 +Version: 0.8.0 Summary: Software Heritage data model Home-page: https://forge.softwareheritage.org/diffusion/DMOD/ Author: Software Heritage developers diff --git a/swh.model.egg-info/PKG-INFO b/swh.model.egg-info/PKG-INFO index 6d30a917c19afba7f5bbdf82b9f05c362db5f9c1..7082d8cc50e208c7367bc7b38eda7780765f8f2d 100644 --- a/swh.model.egg-info/PKG-INFO +++ b/swh.model.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: swh.model -Version: 0.7.3 +Version: 0.8.0 Summary: Software Heritage data model Home-page: https://forge.softwareheritage.org/diffusion/DMOD/ Author: Software Heritage developers diff --git a/swh/model/identifiers.py b/swh/model/identifiers.py index 92664cc4f922593e0afe6ea672be904917f99112..274cb3563b907cbcea74f661ea0bc17b128677fa 100644 --- a/swh/model/identifiers.py +++ b/swh/model/identifiers.py @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2019 The Software Heritage developers +# Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -7,6 +7,7 @@ import binascii import datetime from functools import lru_cache import hashlib +import re from typing import Any, Dict, Iterable, List, Optional, Tuple, Union import attr @@ -728,21 +729,23 @@ class SWHID: def check_namespace(self, attribute, value): if value != SWHID_NAMESPACE: raise ValidationError( - "Wrong format: only supported namespace is '%s'" % SWHID_NAMESPACE + f"Invalid SWHID: namespace is '{value}' but must be '{SWHID_NAMESPACE}'" ) @scheme_version.validator def check_scheme_version(self, attribute, value): if value != SWHID_VERSION: raise ValidationError( - "Wrong format: only supported version is %d" % SWHID_VERSION + f"Invalid SWHID: version is {value} but must be {SWHID_VERSION}" ) @object_type.validator def check_object_type(self, attribute, value): if value not in _object_type_map: + supported_types = ", ".join(_object_type_map.keys()) raise ValidationError( - "Wrong input: Supported types are %s" % (list(_object_type_map.keys())) + f"Invalid SWHID: object type is {value} but must be " + f"one of {supported_types}" ) @object_id.validator @@ -798,6 +801,9 @@ def swhid( return str(swhid) +CONTEXT_QUALIFIERS = {"origin", "anchor", "visit", "path", "lines"} + + def parse_swhid(swhid: str) -> SWHID: """Parse :ref:`persistent-identifiers`. @@ -818,12 +824,17 @@ def parse_swhid(swhid: str) -> SWHID: a named tuple holding the parsing result """ + if re.search(r"[ \t\n\r\f\v]", swhid): + raise ValidationError("Invalid SwHID: SWHIDs cannot contain whitespaces") + # <swhid>;<contextual-information> swhid_parts = swhid.split(SWHID_CTXT_SEP) swhid_data = swhid_parts.pop(0).split(":") if len(swhid_data) != 4: - raise ValidationError("Wrong format: There should be 4 mandatory values") + raise ValidationError( + "Invalid SWHID, format must be 'swh:1:OBJECT_TYPE:OBJECT_ID'" + ) # Checking for parsing errors _ns, _version, _type, _id = swhid_data @@ -834,16 +845,29 @@ def parse_swhid(swhid: str) -> SWHID: break if not _id: - raise ValidationError("Wrong format: Identifier should be present") + raise ValidationError( + "Invalid SWHID: missing OBJECT_ID (as a 40 hex digit string)" + ) _metadata = {} for part in swhid_parts: try: - key, val = part.split("=") - _metadata[key] = val + qualifier, val = part.split("=") + _metadata[qualifier] = val except Exception: - msg = "Contextual data is badly formatted, form key=val expected" - raise ValidationError(msg) + raise ValidationError( + "Invalid SWHID: contextual data must be a ;-separated list of " + "key=value pairs" + ) + + wrong_qualifiers = set(_metadata) - set(CONTEXT_QUALIFIERS) + if wrong_qualifiers: + error_msg = ( + f"Invalid SWHID: Wrong qualifiers {', '.join(wrong_qualifiers)}. " + f"The qualifiers must be one of {', '.join(CONTEXT_QUALIFIERS)}" + ) + raise ValidationError(error_msg) + return SWHID( _ns, int(_version), diff --git a/swh/model/tests/test_identifiers.py b/swh/model/tests/test_identifiers.py index ed34d23dd8fcc5e34c51c9123d54f344ff87ae79..73515c6589cf22a91e7828291a39414b74392aee 100644 --- a/swh/model/tests/test_identifiers.py +++ b/swh/model/tests/test_identifiers.py @@ -927,37 +927,6 @@ class SnapshotIdentifier(unittest.TestCase): }, ) - def test_parse_swhid_parsing_error(self): - for swhid in [ - ("swh:1:cnt"), - ("swh:1:"), - ("swh:"), - ("swh:1:cnt:"), - ("foo:1:cnt:abc8bc9d7a6bcf6db04f476d29314f157507d505"), - ("swh:2:dir:def8bc9d7a6bcf6db04f476d29314f157507d505"), - ("swh:1:foo:fed8bc9d7a6bcf6db04f476d29314f157507d505"), - ("swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;invalid;" "malformed"), - ("swh:1:snp:gh6959356d30f1a4e9b7f6bca59b9a336464c03d"), - ("swh:1:snp:foo"), - ]: - with self.assertRaises(ValidationError): - identifiers.parse_swhid(swhid) - - def test_persistentid_class_validation_error(self): - for _ns, _version, _type, _id in [ - ("foo", 1, CONTENT, "abc8bc9d7a6bcf6db04f476d29314f157507d505"), - ("swh", 2, DIRECTORY, "def8bc9d7a6bcf6db04f476d29314f157507d505"), - ("swh", 1, "foo", "fed8bc9d7a6bcf6db04f476d29314f157507d505"), - ("swh", 1, SNAPSHOT, "gh6959356d30f1a4e9b7f6bca59b9a336464c03d"), - ]: - with self.assertRaises(ValidationError): - SWHID( - namespace=_ns, - scheme_version=_version, - object_type=_type, - object_id=_id, - ) - class OriginIdentifier(unittest.TestCase): def setUp(self): @@ -1077,6 +1046,69 @@ def test_normalize_timestamp_dict_invalid_timestamp(dict_input): normalize_timestamp(dict_input) +@pytest.mark.parametrize( + "invalid_swhid", + [ + "swh:1:cnt", + "swh:1:", + "swh:", + "swh:1:cnt:", + "foo:1:cnt:abc8bc9d7a6bcf6db04f476d29314f157507d505", + "swh:2:dir:def8bc9d7a6bcf6db04f476d29314f157507d505", + "swh:1:foo:fed8bc9d7a6bcf6db04f476d29314f157507d505", + "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;invalid;malformed", + "swh:1:snp:gh6959356d30f1a4e9b7f6bca59b9a336464c03d", + "swh:1:snp:foo", + # wrong qualifier: ori should be origin + "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;ori=something;anchor=1;visit=1;path=/", # noqa + # wrong qualifier: anc should be anchor + "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;origin=something;anc=1;visit=1;path=/", # noqa + # wrong qualifier: vis should be visit + "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;origin=something;anchor=1;vis=1;path=/", # noqa + # wrong qualifier: pa should be path + "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;origin=something;anchor=1;visit=1;pa=/", # noqa + # wrong qualifier: line should be lines + "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;line=10;origin=something;anchor=1;visit=1;path=/", # noqa + # wrong qualifier value: it contains space before of after + "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;origin= https://some-url", # noqa + "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;origin=something;anchor=some-anchor ", # noqa + "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;origin=something;anchor=some-anchor ;visit=1", # noqa + # invalid swhid: whitespaces + "swh :1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;ori=something;anchor=1;visit=1;path=/", # noqa + "swh: 1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;ori=something;anchor=1;visit=1;path=/", # noqa + "swh: 1: dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;ori=something;anchor=1;visit=1;path=/", # noqa + "swh:1: dir: 0b6959356d30f1a4e9b7f6bca59b9a336464c03d", + "swh:1: dir: 0b6959356d30f1a4e9b7f6bca59b9a336464c03d; origin=blah", + "swh:1: dir: 0b6959356d30f1a4e9b7f6bca59b9a336464c03d;lines=12", + # other whitespaces + "swh\t:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;lines=12", + "swh:1\n:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;lines=12", + "swh:1:\rdir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;lines=12", + "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d\f;lines=12", + "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;lines=12\v", + ], +) +def test_parse_swhid_parsing_error(invalid_swhid): + with pytest.raises(ValidationError): + identifiers.parse_swhid(invalid_swhid) + + +@pytest.mark.parametrize( + "ns,version,type,id", + [ + ("foo", 1, CONTENT, "abc8bc9d7a6bcf6db04f476d29314f157507d505",), + ("swh", 2, DIRECTORY, "def8bc9d7a6bcf6db04f476d29314f157507d505",), + ("swh", 1, "foo", "fed8bc9d7a6bcf6db04f476d29314f157507d505",), + ("swh", 1, SNAPSHOT, "gh6959356d30f1a4e9b7f6bca59b9a336464c03d",), + ], +) +def test_SWHID_class_validation_error(ns, version, type, id): + with pytest.raises(ValidationError): + SWHID( + namespace=ns, scheme_version=version, object_type=type, object_id=id, + ) + + def test_swhid_hash(): object_id = "94a9ed024d3859793618152ea559a168bbcbb5e2"