diff --git a/swh/model/cli.py b/swh/model/cli.py --- a/swh/model/cli.py +++ b/swh/model/cli.py @@ -9,6 +9,7 @@ from swh.model import identifiers as pids from swh.model.from_disk import Content, Directory +from swh.model.identifiers import SWHMalformedIdentifierException class PidParamType(click.ParamType): @@ -16,12 +17,10 @@ def convert(self, value, param, ctx): try: - _parsed_pid = pids.parse_persistent_identifier(value) # noqa + pids.parse_persistent_identifier(value) return value # return as string, as we need just that - except Exception: - # TODO catch more specific parsing exception. Requires - # https://forge.softwareheritage.org/T1104 to be addressed first. - self.fail('%s is not a valid PID' % value, param, ctx) + except SWHMalformedIdentifierException as e: + self.fail('%s is not a valid PID. %s.' % (value, e), param, ctx) def pid_of_file(path): diff --git a/swh/model/identifiers.py b/swh/model/identifiers.py --- a/swh/model/identifiers.py +++ b/swh/model/identifiers.py @@ -7,6 +7,8 @@ import datetime from functools import lru_cache +from .exceptions import ValidationError +from .fields.hashes import validate_sha1 from .hashutil import hash_data, hash_git_data, DEFAULT_ALGORITHMS from .hashutil import hash_to_hex @@ -603,7 +605,8 @@ Args: type (str): Object's type - object (str): Object's dict representation + object (dict/bytes/str): Object's dict representation or object + identifier version (int): persistent identifier version (default to 1) Returns: @@ -633,16 +636,29 @@ }, } o = _map[type] - _hash = hash_to_hex(object[o['key_id']]) + if isinstance(object, dict): # internal swh representation resolution + _hash = object[o['key_id']] + else: # client passed direct identifier (bytes/str) + _hash = object + _hash = hash_to_hex(_hash) return 'swh:%s:%s:%s' % (version, o['short_name'], _hash) +PERSISTENT_IDENTIFIER_TYPES = ['snp', 'rel', 'rev', 'dir', 'cnt'] + PERSISTENT_IDENTIFIER_KEYS = [ 'namespace', 'scheme_version', 'object_type', 'object_id', 'metadata'] PERSISTENT_IDENTIFIER_PARTS_SEP = ';' +class SWHMalformedIdentifierException(ValueError): + """Exception when a string representing an identifier is badly formatted. + + """ + pass + + def parse_persistent_identifier(persistent_id): """Parse swh's :ref:`persistent-identifiers` scheme. @@ -659,14 +675,47 @@ * metadata, holding dict value """ + # ; persistent_id_parts = persistent_id.split(PERSISTENT_IDENTIFIER_PARTS_SEP) - data = persistent_id_parts.pop(0).split(':') + pid_data = persistent_id_parts.pop(0).split(':') + + if len(pid_data) != 4: + raise SWHMalformedIdentifierException( + 'Wrong format: There should be 4 mandatory parameters') + + # Checking for parsing errors + _ns, _version, _type, _id = pid_data + if _ns != 'swh': + raise SWHMalformedIdentifierException( + 'Wrong format: Supported namespace is \'swh\'') + + if _version != '1': + raise SWHMalformedIdentifierException( + 'Wrong format: Supported version is 1') + + expected_types = PERSISTENT_IDENTIFIER_TYPES + if _type not in expected_types: + raise SWHMalformedIdentifierException( + 'Wrong format: Supported types are %s' % ( + ', '.join(expected_types))) + + if not _id: + raise SWHMalformedIdentifierException( + 'Wrong format: Identifier should be present') + + try: + validate_sha1(_id) + except ValidationError: + raise SWHMalformedIdentifierException( + 'Wrong format: Identifier should be a valid hash') + persistent_id_metadata = {} for part in persistent_id_parts: try: key, val = part.split('=') persistent_id_metadata[key] = val except Exception: - pass - data.append(persistent_id_metadata) - return dict(zip(PERSISTENT_IDENTIFIER_KEYS, data)) + msg = 'Contextual data is badly formatted, form key=val expected' + raise SWHMalformedIdentifierException(msg) + pid_data.append(persistent_id_metadata) + return dict(zip(PERSISTENT_IDENTIFIER_KEYS, pid_data)) diff --git a/swh/model/tests/test_identifiers.py b/swh/model/tests/test_identifiers.py --- a/swh/model/tests/test_identifiers.py +++ b/swh/model/tests/test_identifiers.py @@ -773,13 +773,29 @@ ) def test_persistent_identifier(self): - _snapshot = {'id': hashutil.hash_to_bytes( - 'c7c108084bc0bf3d81436bf980b46e98bd338453')} - _release = {'id': '22ece559cc7cc2364edc5e5593d63ae8bd229f9f'} - _revision = {'id': '309cf2674ee7a0749978cf8265ab91a60aea0f7d'} - _directory = {'id': 'd198bc9d7a6bcf6db04f476d29314f157507d505'} - _content = {'sha1_git': '94a9ed024d3859793618152ea559a168bbcbb5e2'} + _snapshot_id = hashutil.hash_to_bytes( + 'c7c108084bc0bf3d81436bf980b46e98bd338453') + _release_id = '22ece559cc7cc2364edc5e5593d63ae8bd229f9f' + _revision_id = '309cf2674ee7a0749978cf8265ab91a60aea0f7d' + _directory_id = 'd198bc9d7a6bcf6db04f476d29314f157507d505' + _content_id = '94a9ed024d3859793618152ea559a168bbcbb5e2' + _snapshot = {'id': _snapshot_id} + _release = {'id': _release_id} + _revision = {'id': _revision_id} + _directory = {'id': _directory_id} + _content = {'sha1_git': _content_id} + for full_type, _hash, expected_persistent_id, version in [ + (SNAPSHOT, _snapshot_id, + 'swh:1:snp:c7c108084bc0bf3d81436bf980b46e98bd338453', None), + (RELEASE, _release_id, + 'swh:2:rel:22ece559cc7cc2364edc5e5593d63ae8bd229f9f', 2), + (REVISION, _revision_id, + 'swh:1:rev:309cf2674ee7a0749978cf8265ab91a60aea0f7d', None), + (DIRECTORY, _directory_id, + 'swh:1:dir:d198bc9d7a6bcf6db04f476d29314f157507d505', None), + (CONTENT, _content_id, + 'swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2', 1), (SNAPSHOT, _snapshot, 'swh:1:snp:c7c108084bc0bf3d81436bf980b46e98bd338453', None), (RELEASE, _release, @@ -789,7 +805,7 @@ (DIRECTORY, _directory, 'swh:1:dir:d198bc9d7a6bcf6db04f476d29314f157507d505', None), (CONTENT, _content, - 'swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2', 1) + 'swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2', 1), ]: if version: actual_value = identifiers.persistent_identifier( @@ -804,8 +820,8 @@ for pid, _type, _version, _hash in [ ('swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2', 'cnt', '1', '94a9ed024d3859793618152ea559a168bbcbb5e2'), - ('swh:2:dir:d198bc9d7a6bcf6db04f476d29314f157507d505', 'dir', - '2', 'd198bc9d7a6bcf6db04f476d29314f157507d505'), + ('swh:1:dir:d198bc9d7a6bcf6db04f476d29314f157507d505', 'dir', + '1', 'd198bc9d7a6bcf6db04f476d29314f157507d505'), ('swh:1:rev:309cf2674ee7a0749978cf8265ab91a60aea0f7d', 'rev', '1', '309cf2674ee7a0749978cf8265ab91a60aea0f7d'), ('swh:1:rel:22ece559cc7cc2364edc5e5593d63ae8bd229f9f', 'rel', @@ -834,9 +850,7 @@ 'dir', '1', '0b6959356d30f1a4e9b7f6bca59b9a336464c03d', { 'origin': 'deb://Debian/packages/linuxdoc-tools' - }), - ('swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;invalid;malformed', # noqa - 'dir', '1', '0b6959356d30f1a4e9b7f6bca59b9a336464c03d', {}) + }) ]: expected_result = { 'namespace': 'swh', @@ -847,3 +861,34 @@ } actual_result = identifiers.parse_persistent_identifier(pid) self.assertEquals(actual_result, expected_result) + + def test_parse_persistent_identifier_parsing_error(self): + from swh.model.identifiers import (SWHMalformedIdentifierException, + PERSISTENT_IDENTIFIER_TYPES) + for pid, _error in [ + ('swh:1:cnt', + 'Wrong format: There should be 4 mandatory parameters'), + ('swh:1:', + 'Wrong format: There should be 4 mandatory parameters'), + ('swh:', + 'Wrong format: There should be 4 mandatory parameters'), + ('swh:1:cnt:', + 'Wrong format: Identifier should be present'), + ('foo:1:cnt:abc8bc9d7a6bcf6db04f476d29314f157507d505', + 'Wrong format: Supported namespace is \'swh\''), + ('swh:2:dir:def8bc9d7a6bcf6db04f476d29314f157507d505', + 'Wrong format: Supported version is 1'), + ('swh:1:foo:fed8bc9d7a6bcf6db04f476d29314f157507d505', + 'Wrong format: Supported types are %s' % ( + ', '.join(PERSISTENT_IDENTIFIER_TYPES))), + ('swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;invalid;' + 'malformed', + 'Contextual data is badly formatted, form key=val expected'), + ('swh:1:snp:gh6959356d30f1a4e9b7f6bca59b9a336464c03d', + 'Wrong format: Identifier should be a valid hash'), + ('swh:1:snp:foo', + 'Wrong format: Identifier should be a valid hash') + ]: + with self.assertRaisesRegex( + SWHMalformedIdentifierException, _error): + identifiers.parse_persistent_identifier(pid)