Changeset View
Changeset View
Standalone View
Standalone View
swh/model/identifiers.py
# Copyright (C) 2015-2018 The Software Heritage developers | # Copyright (C) 2015-2019 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import binascii | import binascii | ||||
import datetime | import datetime | ||||
import hashlib | import hashlib | ||||
from collections import namedtuple | from collections import namedtuple | ||||
from functools import lru_cache | from functools import lru_cache | ||||
from .exceptions import ValidationError | from .exceptions import ValidationError | ||||
from .fields.hashes import validate_sha1 | from .fields.hashes import validate_sha1 | ||||
from .hashutil import hash_git_data, hash_to_hex, MultiHash | from .hashutil import hash_git_data, hash_to_hex, MultiHash | ||||
ORIGIN = 'origin' | ORIGIN = 'origin' | ||||
SNAPSHOT = 'snapshot' | SNAPSHOT = 'snapshot' | ||||
REVISION = 'revision' | REVISION = 'revision' | ||||
RELEASE = 'release' | RELEASE = 'release' | ||||
DIRECTORY = 'directory' | DIRECTORY = 'directory' | ||||
CONTENT = 'content' | CONTENT = 'content' | ||||
PID_NAMESPACE = 'swh' | |||||
PID_VERSION = 1 | |||||
PID_TYPES = ['ori', 'snp', 'rel', 'rev', 'dir', 'cnt'] | |||||
PID_KEYS = ['namespace', 'scheme_version', 'object_type', 'object_id', | |||||
'metadata'] | |||||
PID_SEP = ':' | |||||
PID_CTXT_SEP = ';' | |||||
@lru_cache() | @lru_cache() | ||||
def identifier_to_bytes(identifier): | def identifier_to_bytes(identifier): | ||||
"""Convert a text identifier to bytes. | """Convert a text identifier to bytes. | ||||
Args: | Args: | ||||
identifier: an identifier, either a 40-char hexadecimal string or a | identifier: an identifier, either a 40-char hexadecimal string or a | ||||
bytes object of length 20 | bytes object of length 20 | ||||
▲ Show 20 Lines • Show All 593 Lines • ▼ Show 20 Lines | DIRECTORY: { | ||||
'key_id': 'id' | 'key_id': 'id' | ||||
}, | }, | ||||
CONTENT: { | CONTENT: { | ||||
'short_name': 'cnt', | 'short_name': 'cnt', | ||||
'key_id': 'sha1_git' | 'key_id': 'sha1_git' | ||||
} | } | ||||
} | } | ||||
PERSISTENT_IDENTIFIER_TYPES = ['ori', 'snp', 'rel', 'rev', 'dir', 'cnt'] | |||||
PERSISTENT_IDENTIFIER_KEYS = [ | |||||
'namespace', 'scheme_version', 'object_type', 'object_id', 'metadata'] | |||||
PERSISTENT_IDENTIFIER_PARTS_SEP = ';' | |||||
class PersistentId(namedtuple('PersistentId', PERSISTENT_IDENTIFIER_KEYS)): | class PersistentId(namedtuple('PersistentId', PID_KEYS)): | ||||
""" | """ | ||||
Named tuple holding the relevant info associated to a Software Heritage | Named tuple holding the relevant info associated to a Software Heritage | ||||
persistent identifier. | persistent identifier. | ||||
Args: | Args: | ||||
namespace (str): the namespace of the identifier, defaults to 'swh' | namespace (str): the namespace of the identifier, defaults to 'swh' | ||||
scheme_version (int): the scheme version of the identifier, | scheme_version (int): the scheme version of the identifier, | ||||
defaults to 1 | defaults to 1 | ||||
Show All 24 Lines | this named tuple, use the :func:`str` function:: | ||||
object_type='content', | object_type='content', | ||||
object_id='8ff44f081d43176474b267de5451f2c2e88089d0' | object_id='8ff44f081d43176474b267de5451f2c2e88089d0' | ||||
) | ) | ||||
pid_str = str(pid) | pid_str = str(pid) | ||||
# 'swh:1:cnt:8ff44f081d43176474b267de5451f2c2e88089d0' | # 'swh:1:cnt:8ff44f081d43176474b267de5451f2c2e88089d0' | ||||
""" | """ | ||||
__slots__ = () | __slots__ = () | ||||
def __new__(cls, namespace='swh', scheme_version=1, | def __new__(cls, namespace=PID_NAMESPACE, scheme_version=PID_VERSION, | ||||
object_type='', object_id='', metadata={}): | object_type='', object_id='', metadata={}): | ||||
o = _object_type_map.get(object_type) | o = _object_type_map.get(object_type) | ||||
if not o: | if not o: | ||||
raise ValidationError('Wrong input: Supported types are %s' % ( | raise ValidationError('Wrong input: Supported types are %s' % ( | ||||
list(_object_type_map.keys()))) | list(_object_type_map.keys()))) | ||||
# internal swh representation resolution | # internal swh representation resolution | ||||
if isinstance(object_id, dict): | if isinstance(object_id, dict): | ||||
object_id = object_id[o['key_id']] | object_id = object_id[o['key_id']] | ||||
validate_sha1(object_id) # can raise if invalid hash | validate_sha1(object_id) # can raise if invalid hash | ||||
object_id = hash_to_hex(object_id) | object_id = hash_to_hex(object_id) | ||||
return super(cls, PersistentId).__new__( | return super(cls, PersistentId).__new__( | ||||
cls, namespace, scheme_version, object_type, object_id, metadata) | cls, namespace, scheme_version, object_type, object_id, metadata) | ||||
def __str__(self): | def __str__(self): | ||||
o = _object_type_map.get(self.object_type) | o = _object_type_map.get(self.object_type) | ||||
pid = '%s:%s:%s:%s' % (self.namespace, self.scheme_version, | pid = PID_SEP.join([self.namespace, str(self.scheme_version), | ||||
o['short_name'], self.object_id) | o['short_name'], self.object_id]) | ||||
if self.metadata: | if self.metadata: | ||||
for k, v in self.metadata.items(): | for k, v in self.metadata.items(): | ||||
pid += '%s%s=%s' % (PERSISTENT_IDENTIFIER_PARTS_SEP, k, v) | pid += '%s%s=%s' % (PID_CTXT_SEP, k, v) | ||||
return pid | return pid | ||||
def persistent_identifier(object_type, object_id, scheme_version=1, | def persistent_identifier(object_type, object_id, scheme_version=1, | ||||
metadata={}): | metadata={}): | ||||
"""Compute persistent identifier (stable over time) as per | """Compute persistent identifier (stable over time) as per | ||||
documentation. | documentation. | ||||
Show All 38 Lines | Raises: | ||||
* missing hash | * missing hash | ||||
* invalid hash identifier supplied | * invalid hash identifier supplied | ||||
Returns: | Returns: | ||||
PersistentId: a named tuple holding the parsing result | PersistentId: a named tuple holding the parsing result | ||||
""" | """ | ||||
# <pid>;<contextual-information> | # <pid>;<contextual-information> | ||||
persistent_id_parts = persistent_id.split(PERSISTENT_IDENTIFIER_PARTS_SEP) | persistent_id_parts = persistent_id.split(PID_CTXT_SEP) | ||||
pid_data = persistent_id_parts.pop(0).split(':') | pid_data = persistent_id_parts.pop(0).split(':') | ||||
if len(pid_data) != 4: | if len(pid_data) != 4: | ||||
raise ValidationError( | raise ValidationError( | ||||
'Wrong format: There should be 4 mandatory values') | 'Wrong format: There should be 4 mandatory values') | ||||
# Checking for parsing errors | # Checking for parsing errors | ||||
_ns, _version, _type, _id = pid_data | _ns, _version, _type, _id = pid_data | ||||
if _ns != 'swh': | if _ns != PID_NAMESPACE: | ||||
raise ValidationError( | raise ValidationError( | ||||
'Wrong format: Supported namespace is \'swh\'') | "Wrong format: only supported namespace is '%s'" % PID_NAMESPACE) | ||||
if _version != '1': | if _version != str(PID_VERSION): | ||||
raise ValidationError( | raise ValidationError( | ||||
'Wrong format: Supported version is 1') | 'Wrong format: only supported version is %d' % PID_VERSION) | ||||
pid_data[1] = int(pid_data[1]) | pid_data[1] = int(pid_data[1]) | ||||
expected_types = PERSISTENT_IDENTIFIER_TYPES | expected_types = PID_TYPES | ||||
if _type not in expected_types: | if _type not in expected_types: | ||||
raise ValidationError( | raise ValidationError( | ||||
'Wrong format: Supported types are %s' % ( | 'Wrong format: Supported types are %s' % ( | ||||
', '.join(expected_types))) | ', '.join(expected_types))) | ||||
for otype, data in _object_type_map.items(): | for otype, data in _object_type_map.items(): | ||||
if _type == data['short_name']: | if _type == data['short_name']: | ||||
pid_data[2] = otype | pid_data[2] = otype | ||||
Show All 22 Lines |