Changeset View
Changeset View
Standalone View
Standalone View
swh/model/identifiers.py
# Copyright (C) 2015-2019 The Software Heritage developers | # Copyright (C) 2015-2019 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import binascii
import datetime
import hashlib
from functools import lru_cache
from typing import Any, Dict, NamedTuple, Optional

from deprecated import deprecated

from .exceptions import ValidationError
from .fields.hashes import validate_sha1
from .hashutil import hash_git_data, hash_to_hex, MultiHash
# Long names of the object types handled by this module.
ORIGIN = "origin"
SNAPSHOT = "snapshot"
REVISION = "revision"
RELEASE = "release"
DIRECTORY = "directory"
CONTENT = "content"

# Building blocks of the textual SWHID syntax:
#   <namespace>:<version>:<type>:<id>[;<key>=<value>...]
SWHID_NAMESPACE = "swh"
SWHID_VERSION = 1
SWHID_TYPES = ["ori", "snp", "rel", "rev", "dir", "cnt"]
SWHID_SEP = ":"
SWHID_CTXT_SEP = ";"

# deprecated variables
# Kept so that code written against the former "PID" naming keeps working.
# Note that PID_TYPES is the very same list object as SWHID_TYPES, not a copy.
PID_NAMESPACE = SWHID_NAMESPACE
PID_VERSION = SWHID_VERSION
PID_TYPES = SWHID_TYPES
PID_SEP = SWHID_SEP
PID_CTXT_SEP = SWHID_CTXT_SEP
@lru_cache() | @lru_cache() | ||||
def identifier_to_bytes(identifier): | def identifier_to_bytes(identifier): | ||||
"""Convert a text identifier to bytes. | """Convert a text identifier to bytes. | ||||
Args: | Args: | ||||
identifier: an identifier, either a 40-char hexadecimal string or a | identifier: an identifier, either a 40-char hexadecimal string or a | ||||
▲ Show 20 Lines • Show All 606 Lines • ▼ Show 20 Lines | _object_type_map = { | ||||
SNAPSHOT: {"short_name": "snp", "key_id": "id"}, | SNAPSHOT: {"short_name": "snp", "key_id": "id"}, | ||||
RELEASE: {"short_name": "rel", "key_id": "id"}, | RELEASE: {"short_name": "rel", "key_id": "id"}, | ||||
REVISION: {"short_name": "rev", "key_id": "id"}, | REVISION: {"short_name": "rev", "key_id": "id"}, | ||||
DIRECTORY: {"short_name": "dir", "key_id": "id"}, | DIRECTORY: {"short_name": "dir", "key_id": "id"}, | ||||
CONTENT: {"short_name": "cnt", "key_id": "sha1_git"}, | CONTENT: {"short_name": "cnt", "key_id": "sha1_git"}, | ||||
} | } | ||||
_SWHID = NamedTuple(
    "SWHID",
    [
        ("namespace", str),
        ("scheme_version", int),
        ("object_type", str),
        ("object_id", str),
        ("metadata", Dict[str, Any]),
    ],
)


class SWHID(_SWHID):
    """
    Named tuple holding the relevant info associated to a SoftWare Heritage
    persistent IDentifier (SWHID)

    Args:
        namespace (str): the namespace of the identifier, defaults to ``swh``
        scheme_version (int): the scheme version of the identifier,
            defaults to 1
        object_type (str): the type of object the identifier points to,
            either ``content``, ``directory``, ``release``, ``revision`` or
            ``snapshot``
        object_id (str): object's identifier. A dict is also accepted, in
            which case the identifier is read from the entry whose key is
            the ``key_id`` registered for ``object_type``
        metadata (dict): optional dict filled with metadata related to
            pointed object; a fresh empty dict is used when omitted

    Raises:
        swh.model.exceptions.ValidationError: In case of invalid object type,
            namespace, scheme version or id

    Once created, it contains the following attributes:

    Attributes:
        namespace (str): the namespace of the identifier
        scheme_version (int): the scheme version of the identifier
        object_type (str): the type of object the identifier points to
        object_id (str): hexadecimal representation of the object hash
        metadata (dict): metadata related to the pointed object

    To get the raw SWHID string from an instance of this named tuple,
    use the :func:`str` function::

        swhid = SWHID(
            object_type='content',
            object_id='8ff44f081d43176474b267de5451f2c2e88089d0'
        )

        swhid_str = str(swhid)
        # 'swh:1:cnt:8ff44f081d43176474b267de5451f2c2e88089d0'
    """

    __slots__ = ()

    def __new__(
        cls,
        namespace: str = SWHID_NAMESPACE,
        scheme_version: int = SWHID_VERSION,
        object_type: str = "",
        object_id: str = "",
        metadata: Optional[Dict[str, Any]] = None,
    ):
        o = _object_type_map.get(object_type)
        if not o:
            raise ValidationError(
                "Wrong input: Supported types are %s" % (list(_object_type_map.keys()))
            )
        if namespace != SWHID_NAMESPACE:
            raise ValidationError(
                "Wrong format: only supported namespace is '%s'" % SWHID_NAMESPACE
            )
        if scheme_version != SWHID_VERSION:
            raise ValidationError(
                "Wrong format: only supported version is %d" % SWHID_VERSION
            )

        # internal swh representation resolution
        if isinstance(object_id, dict):
            object_id = object_id[o["key_id"]]

        validate_sha1(object_id)  # can raise if invalid hash
        object_id = hash_to_hex(object_id)

        # The dict ends up stored in the tuple, so a default shared between
        # calls would be shared (and mutable) across every instance built
        # without explicit metadata.
        if metadata is None:
            metadata = {}

        return super().__new__(
            cls, namespace, scheme_version, object_type, object_id, metadata
        )

    def __str__(self) -> str:
        # object_type was checked against _object_type_map in __new__, so a
        # plain lookup is used here rather than an assert (asserts disappear
        # under ``python -O``).
        short_name = _object_type_map[self.object_type]["short_name"]
        core = SWHID_SEP.join(
            [self.namespace, str(self.scheme_version), short_name, self.object_id]
        )
        if not self.metadata:
            return core
        qualifiers = [
            "%s%s=%s" % (SWHID_CTXT_SEP, k, v) for k, v in self.metadata.items()
        ]
        return core + "".join(qualifiers)
@deprecated("Use swh.model.identifiers.SWHID instead")
class PersistentId(SWHID):
    """
    Named tuple holding the relevant info associated to a SoftWare Heritage
    persistent IDentifier.

    .. deprecated:: 0.3.8
        Use :class:`swh.model.identifiers.SWHID` instead

    """

    # Same as the parent class: no per-instance __dict__.
    __slots__ = ()

    def __new__(cls, *args, **kwargs):
        # ``super(cls, PersistentId)`` had its two arguments swapped: it only
        # worked while ``cls`` was PersistentId itself and raised TypeError
        # for any subclass. The zero-argument form always starts the lookup
        # right after PersistentId in the MRO.
        return super().__new__(cls, *args, **kwargs)
def swhid(
    object_type: str,
    object_id: str,
    scheme_version: int = 1,
    metadata: Optional[Dict[str, Any]] = None,
) -> str:
    """Compute :ref:`persistent-identifiers`

    Args:
        object_type: object's type, either ``content``, ``directory``,
            ``release``, ``revision`` or ``snapshot``
        object_id: object's identifier
        scheme_version: SWHID scheme version, defaults to 1
        metadata: metadata related to the pointed object; treated as an
            empty dict when omitted

    Raises:
        swh.model.exceptions.ValidationError: In case of invalid object type or id

    Returns:
        the SWHID of the object

    """
    # Build the tuple only to render it; the local is deliberately not named
    # ``swhid`` so it does not shadow this function.
    identifier = SWHID(
        scheme_version=scheme_version,
        object_type=object_type,
        object_id=object_id,
        metadata={} if metadata is None else metadata,
    )
    return str(identifier)
@deprecated("Use swh.model.identifiers.swhid instead")
def persistent_identifier(*args, **kwargs) -> str:
    """Compute :ref:`persistent-identifiers`

    Backward-compatible alias: every positional and keyword argument is
    forwarded unchanged to :func:`swhid` and its result is returned as is.

    .. deprecated:: 0.3.8
        Use :func:`swh.model.identifiers.swhid` instead

    """
    identifier = swhid(*args, **kwargs)
    return identifier
def parse_swhid(swhid: str) -> SWHID:
    """Parse :ref:`persistent-identifiers`.

    Args:
        swhid (str): A persistent identifier

    Raises:
        swh.model.exceptions.ValidationError: in case of:

            * missing mandatory values (4)
            * invalid namespace supplied
            * invalid version supplied
            * invalid type supplied
            * missing hash
            * invalid hash identifier supplied

    Returns:
        a named tuple holding the parsing result

    """
    # <swhid>;<contextual-information>
    core, *qualifiers = swhid.split(SWHID_CTXT_SEP)
    swhid_data = core.split(SWHID_SEP)

    if len(swhid_data) != 4:
        raise ValidationError("Wrong format: There should be 4 mandatory values")

    # Checking for parsing errors
    _ns, _version, _type, _id = swhid_data

    # Translate the short type tag (e.g. "cnt") into the long object type
    # name expected by SWHID; unmatched values are passed through and left
    # for the SWHID constructor to reject.
    for otype, data in _object_type_map.items():
        if _type == data["short_name"]:
            _type = otype
            break

    if not _id:
        raise ValidationError("Wrong format: Identifier should be present")

    _metadata: Dict[str, Any] = {}
    for part in qualifiers:
        try:
            # Split on the first "=" only, so that values which themselves
            # contain "=" (e.g. a URL with a query string) stay intact.
            key, val = part.split("=", 1)
        except ValueError:
            msg = "Contextual data is badly formatted, form key=val expected"
            raise ValidationError(msg)
        _metadata[key] = val

    # A non-numeric version must surface as the documented ValidationError,
    # not as a bare ValueError from int().
    try:
        version = int(_version)
    except ValueError:
        raise ValidationError(
            "Wrong format: only supported version is %d" % SWHID_VERSION
        )

    return SWHID(_ns, version, _type, _id, _metadata)
@deprecated("Use swh.model.identifiers.parse_swhid instead")
def parse_persistent_identifier(persistent_id: str) -> PersistentId:
    """Parse :ref:`persistent-identifiers`.

    Backward-compatible alias: the string is parsed with :func:`parse_swhid`
    and the resulting fields are repackaged as a :class:`PersistentId`.

    .. deprecated:: 0.3.8
        Use :func:`swh.model.identifiers.parse_swhid` instead

    """
    parsed = parse_swhid(persistent_id)
    fields = parsed._asdict()
    return PersistentId(**fields)