diff --git a/swh/model/identifiers.py b/swh/model/identifiers.py --- a/swh/model/identifiers.py +++ b/swh/model/identifiers.py @@ -7,6 +7,8 @@ import datetime from functools import lru_cache +from .exceptions import ValidationError +from .fields.hashes import validate_sha1 from .hashutil import hash_data, hash_git_data, DEFAULT_ALGORITHMS from .hashutil import hash_to_hex @@ -603,7 +605,7 @@ Args: type (str): Object's type - object (str): Object's dict representation + object (dict): Object's dict representation version (int): persistent identifier version (default to 1) Returns: @@ -637,12 +639,21 @@ return 'swh:%s:%s:%s' % (version, o['short_name'], _hash) +PERSISTENT_IDENTIFIER_TYPES = ['snp', 'rel', 'rev', 'dir', 'cnt'] + PERSISTENT_IDENTIFIER_KEYS = [ 'namespace', 'scheme_version', 'object_type', 'object_id', 'metadata'] PERSISTENT_IDENTIFIER_PARTS_SEP = ';' +class SWHMalformedIdentifierException(ValueError): + """Exception when a string representing an identifier is badly formatted. + + """ + pass + + def parse_persistent_identifier(persistent_id): """Parse swh's :ref:`persistent-identifiers` scheme. 
@@ -659,14 +670,47 @@ * metadata, holding dict value """ + # <namespace>:<version>:<type>:<id>;<key>=<val>;... persistent_id_parts = persistent_id.split(PERSISTENT_IDENTIFIER_PARTS_SEP) - data = persistent_id_parts.pop(0).split(':') + pid_data = persistent_id_parts.pop(0).split(':') + + if len(pid_data) != 4: + raise SWHMalformedIdentifierException( + 'Wrong format: There should be 4 mandatory parameters') + + # Checking for parsing errors + _ns, _version, _type, _id = pid_data + if _ns != 'swh': + raise SWHMalformedIdentifierException( + 'Wrong format: Supported namespace is \'swh\'') + + if _version != '1': + raise SWHMalformedIdentifierException( + 'Wrong format: Supported version is 1') + + expected_types = PERSISTENT_IDENTIFIER_TYPES + if _type not in expected_types: + raise SWHMalformedIdentifierException( + 'Wrong format: Supported types are %s' % ( + ', '.join(expected_types))) + + if not _id: + raise SWHMalformedIdentifierException( + 'Wrong format: Identifier should be present') + + try: + validate_sha1(_id) + except ValidationError: + raise SWHMalformedIdentifierException( + 'Wrong format: Identifier should be a valid hash') + persistent_id_metadata = {} for part in persistent_id_parts: try: key, val = part.split('=') persistent_id_metadata[key] = val except Exception: - pass - data.append(persistent_id_metadata) - return dict(zip(PERSISTENT_IDENTIFIER_KEYS, data)) + msg = 'Contextual data is badly formatted, form key=val expected' + raise SWHMalformedIdentifierException(msg) + pid_data.append(persistent_id_metadata) + return dict(zip(PERSISTENT_IDENTIFIER_KEYS, pid_data)) diff --git a/swh/model/tests/test_identifiers.py b/swh/model/tests/test_identifiers.py --- a/swh/model/tests/test_identifiers.py +++ b/swh/model/tests/test_identifiers.py @@ -804,8 +804,8 @@ for pid, _type, _version, _hash in [ ('swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2', 'cnt', '1', '94a9ed024d3859793618152ea559a168bbcbb5e2'), - ('swh:2:dir:d198bc9d7a6bcf6db04f476d29314f157507d505', 'dir', - '2',
'd198bc9d7a6bcf6db04f476d29314f157507d505'), + ('swh:1:dir:d198bc9d7a6bcf6db04f476d29314f157507d505', 'dir', + '1', 'd198bc9d7a6bcf6db04f476d29314f157507d505'), ('swh:1:rev:309cf2674ee7a0749978cf8265ab91a60aea0f7d', 'rev', '1', '309cf2674ee7a0749978cf8265ab91a60aea0f7d'), ('swh:1:rel:22ece559cc7cc2364edc5e5593d63ae8bd229f9f', 'rel', @@ -834,9 +834,7 @@ 'dir', '1', '0b6959356d30f1a4e9b7f6bca59b9a336464c03d', { 'origin': 'deb://Debian/packages/linuxdoc-tools' - }), - ('swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;invalid;malformed', # noqa - 'dir', '1', '0b6959356d30f1a4e9b7f6bca59b9a336464c03d', {}) + }) ]: expected_result = { 'namespace': 'swh', @@ -847,3 +845,34 @@ } actual_result = identifiers.parse_persistent_identifier(pid) self.assertEquals(actual_result, expected_result) + + def test_parse_persistent_identifier_parsing_error(self): + from swh.model.identifiers import (SWHMalformedIdentifierException, + PERSISTENT_IDENTIFIER_TYPES) + for pid, _error in [ + ('swh:1:cnt', + 'Wrong format: There should be 4 mandatory parameters'), + ('swh:1:', + 'Wrong format: There should be 4 mandatory parameters'), + ('swh:', + 'Wrong format: There should be 4 mandatory parameters'), + ('swh:1:cnt:', + 'Wrong format: Identifier should be present'), + ('foo:1:cnt:abc8bc9d7a6bcf6db04f476d29314f157507d505', + 'Wrong format: Supported namespace is \'swh\''), + ('swh:2:dir:def8bc9d7a6bcf6db04f476d29314f157507d505', + 'Wrong format: Supported version is 1'), + ('swh:1:foo:fed8bc9d7a6bcf6db04f476d29314f157507d505', + 'Wrong format: Supported types are %s' % ( + ', '.join(PERSISTENT_IDENTIFIER_TYPES))), + ('swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;invalid;' + 'malformed', + 'Contextual data is badly formatted, form key=val expected'), + ('swh:1:snp:gh6959356d30f1a4e9b7f6bca59b9a336464c03d', + 'Wrong format: Identifier should be a valid hash'), + ('swh:1:snp:foo', + 'Wrong format: Identifier should be a valid hash') + ]: + with self.assertRaisesRegex( + 
SWHMalformedIdentifierException, _error): + identifiers.parse_persistent_identifier(pid)