Changeset View
Changeset View
Standalone View
Standalone View
swh/model/identifiers.py
Show All 18 Lines | |||||
from .collections import ImmutableDict | from .collections import ImmutableDict | ||||
from .exceptions import ValidationError | from .exceptions import ValidationError | ||||
from .fields.hashes import validate_sha1 | from .fields.hashes import validate_sha1 | ||||
from .hashutil import MultiHash, hash_git_data, hash_to_bytes, hash_to_hex | from .hashutil import MultiHash, hash_git_data, hash_to_bytes, hash_to_hex | ||||
class ObjectType(enum.Enum): | class ObjectType(enum.Enum): | ||||
"""Possible object types of a QualifiedSWHID. | """Possible object types of a QualifiedSWHID or CoreSWHID. | ||||
The value of each variant is what is used in the SWHID's string representation.""" | The value of each variant is what is used in the SWHID's string representation.""" | ||||
ORIGIN = "ori" | |||||
SNAPSHOT = "snp" | SNAPSHOT = "snp" | ||||
REVISION = "rev" | REVISION = "rev" | ||||
RELEASE = "rel" | RELEASE = "rel" | ||||
DIRECTORY = "dir" | DIRECTORY = "dir" | ||||
CONTENT = "cnt" | CONTENT = "cnt" | ||||
class ExtendedObjectType(enum.Enum): | |||||
"""Possible object types of an ExtendedSWHID. | |||||
The variants are a superset of :class:`ObjectType`'s""" | |||||
SNAPSHOT = "snp" | |||||
REVISION = "rev" | |||||
RELEASE = "rel" | |||||
DIRECTORY = "dir" | |||||
CONTENT = "cnt" | |||||
ORIGIN = "ori" | |||||
zack: "RAW_EXTRINSIC_METADATA" is a horribly long mouthful.
(But I don't have better suggestions at the moment...) | |||||
Done Inline Actions — vlorentz: I agree, but we don't have a better name; and that one is already used in various places. | |||||
RAW_EXTRINSIC_METADATA = "emd" | |||||
# The following are deprecated aliases of the variants defined in ObjectType | # The following are deprecated aliases of the variants defined in ObjectType | ||||
# while transitioning from SWHID to QualifiedSWHID | # while transitioning from SWHID to QualifiedSWHID | ||||
ORIGIN = "origin" | ORIGIN = "origin" | ||||
SNAPSHOT = "snapshot" | SNAPSHOT = "snapshot" | ||||
REVISION = "revision" | REVISION = "revision" | ||||
RELEASE = "release" | RELEASE = "release" | ||||
DIRECTORY = "directory" | DIRECTORY = "directory" | ||||
CONTENT = "content" | CONTENT = "content" | ||||
RAW_EXTRINSIC_METADATA = "raw_extrinsic_metadata" | |||||
SWHID_NAMESPACE = "swh" | SWHID_NAMESPACE = "swh" | ||||
SWHID_VERSION = 1 | SWHID_VERSION = 1 | ||||
SWHID_TYPES = ["ori", "snp", "rel", "rev", "dir", "cnt"] | SWHID_TYPES = ["snp", "rel", "rev", "dir", "cnt"] | ||||
EXTENDED_SWHID_TYPES = SWHID_TYPES + ["ori", "emd"] | |||||
SWHID_SEP = ":" | SWHID_SEP = ":" | ||||
Not Done Inline Actions — olasd: Could we use the enum values instead of duplicating these vars? | |||||
SWHID_CTXT_SEP = ";" | SWHID_CTXT_SEP = ";" | ||||
SWHID_QUALIFIERS = {"origin", "anchor", "visit", "path", "lines"} | SWHID_QUALIFIERS = {"origin", "anchor", "visit", "path", "lines"} | ||||
SWHID_RE_RAW = ( | SWHID_RE_RAW = ( | ||||
f"(?P<namespace>{SWHID_NAMESPACE})" | f"(?P<namespace>{SWHID_NAMESPACE})" | ||||
f"{SWHID_SEP}(?P<scheme_version>{SWHID_VERSION})" | f"{SWHID_SEP}(?P<scheme_version>{SWHID_VERSION})" | ||||
f"{SWHID_SEP}(?P<object_type>{'|'.join(SWHID_TYPES)})" | f"{SWHID_SEP}(?P<object_type>{'|'.join(EXTENDED_SWHID_TYPES)})" | ||||
f"{SWHID_SEP}(?P<object_id>[0-9a-f]{{40}})" | f"{SWHID_SEP}(?P<object_id>[0-9a-f]{{40}})" | ||||
f"({SWHID_CTXT_SEP}(?P<qualifiers>\\S+))?" | f"({SWHID_CTXT_SEP}(?P<qualifiers>\\S+))?" | ||||
) | ) | ||||
SWHID_RE = re.compile(SWHID_RE_RAW) | SWHID_RE = re.compile(SWHID_RE_RAW) | ||||
@lru_cache() | @lru_cache() | ||||
def identifier_to_bytes(identifier): | def identifier_to_bytes(identifier): | ||||
▲ Show 20 Lines • Show All 627 Lines • ▼ Show 20 Lines | def origin_identifier(origin): | ||||
"""Return the intrinsic identifier for an origin. | """Return the intrinsic identifier for an origin. | ||||
An origin's identifier is the sha1 checksum of the entire origin URL | An origin's identifier is the sha1 checksum of the entire origin URL | ||||
""" | """ | ||||
return hashlib.sha1(origin["url"].encode("utf-8")).hexdigest() | return hashlib.sha1(origin["url"].encode("utf-8")).hexdigest() | ||||
_object_type_map = { | _object_type_map = { | ||||
ORIGIN: {"short_name": "ori", "key_id": "id"}, | ORIGIN: {"short_name": "ori", "key_id": "id"}, | ||||
SNAPSHOT: {"short_name": "snp", "key_id": "id"}, | SNAPSHOT: {"short_name": "snp", "key_id": "id"}, | ||||
RELEASE: {"short_name": "rel", "key_id": "id"}, | RELEASE: {"short_name": "rel", "key_id": "id"}, | ||||
REVISION: {"short_name": "rev", "key_id": "id"}, | REVISION: {"short_name": "rev", "key_id": "id"}, | ||||
DIRECTORY: {"short_name": "dir", "key_id": "id"}, | DIRECTORY: {"short_name": "dir", "key_id": "id"}, | ||||
CONTENT: {"short_name": "cnt", "key_id": "sha1_git"}, | CONTENT: {"short_name": "cnt", "key_id": "sha1_git"}, | ||||
RAW_EXTRINSIC_METADATA: {"short_name": "emd", "key_id": "id"}, | |||||
} | } | ||||
_swhid_type_map = { | _swhid_type_map = { | ||||
"ori": ORIGIN, | "ori": ORIGIN, | ||||
"snp": SNAPSHOT, | "snp": SNAPSHOT, | ||||
"rel": RELEASE, | "rel": RELEASE, | ||||
"rev": REVISION, | "rev": REVISION, | ||||
"dir": DIRECTORY, | "dir": DIRECTORY, | ||||
"cnt": CONTENT, | "cnt": CONTENT, | ||||
"emd": RAW_EXTRINSIC_METADATA, | |||||
} | } | ||||
Not Done Inline Actions — olasd: These two dicts look like fairly easy candidates for deletion (in a further diff)? | |||||
@attr.s(frozen=True, kw_only=True) | @attr.s(frozen=True, kw_only=True) | ||||
class CoreSWHID: | class CoreSWHID: | ||||
""" | """ | ||||
Dataclass holding the relevant info associated to a SoftWare Heritage | Dataclass holding the relevant info associated to a SoftWare Heritage | ||||
persistent IDentifier (SWHID). | persistent IDentifier (SWHID). | ||||
▲ Show 20 Lines • Show All 256 Lines • ▼ Show 20 Lines | def from_string(cls, s: str) -> QualifiedSWHID: | ||||
"Invalid qualifier(s): {', '.join(invalid_qualifiers)}" | "Invalid qualifier(s): {', '.join(invalid_qualifiers)}" | ||||
) | ) | ||||
try: | try: | ||||
return QualifiedSWHID(**parts, **qualifiers) | return QualifiedSWHID(**parts, **qualifiers) | ||||
except ValueError as e: | except ValueError as e: | ||||
raise ValidationError(*e.args) from None | raise ValidationError(*e.args) from None | ||||
@attr.s(frozen=True, kw_only=True) | |||||
class ExtendedSWHID: | |||||
""" | |||||
Dataclass holding the relevant info associated to a SoftWare Heritage | |||||
persistent IDentifier (SWHID). | |||||
It extends `CoreSWHID` by allowing non-standard object types, and should | |||||
only be used internally to Software Heritage. | |||||
Raises: | |||||
swh.model.exceptions.ValidationError: In case of invalid object type or id | |||||
To get the raw SWHID string from an instance of this class, | |||||
use the :func:`str` function: | |||||
>>> swhid = ExtendedSWHID( | |||||
... object_type=ExtendedObjectType.CONTENT, | |||||
... object_id=bytes.fromhex('8ff44f081d43176474b267de5451f2c2e88089d0'), | |||||
... ) | |||||
>>> str(swhid) | |||||
'swh:1:cnt:8ff44f081d43176474b267de5451f2c2e88089d0' | |||||
And vice-versa with :meth:`ExtendedSWHID.from_string`: | |||||
>>> swhid == ExtendedSWHID.from_string( | |||||
... "swh:1:cnt:8ff44f081d43176474b267de5451f2c2e88089d0" | |||||
... ) | |||||
True | |||||
""" | |||||
namespace = attr.ib(type=str, default=SWHID_NAMESPACE) | |||||
"""the namespace of the identifier, defaults to ``swh``""" | |||||
scheme_version = attr.ib(type=int, default=SWHID_VERSION) | |||||
"""the scheme version of the identifier, defaults to 1""" | |||||
object_type = attr.ib( | |||||
type=ExtendedObjectType, | |||||
validator=type_validator(), | |||||
converter=ExtendedObjectType, | |||||
) | |||||
"""the type of object the identifier points to""" | |||||
object_id = attr.ib(type=bytes, validator=type_validator()) | |||||
"""object's identifier""" | |||||
@namespace.validator | |||||
def check_namespace(self, attribute, value): | |||||
if value != SWHID_NAMESPACE: | |||||
raise ValidationError( | |||||
"Invalid SWHID: invalid namespace: %(namespace)s", | |||||
params={"namespace": value}, | |||||
) | |||||
@scheme_version.validator | |||||
def check_scheme_version(self, attribute, value): | |||||
if value != SWHID_VERSION: | |||||
raise ValidationError( | |||||
"Invalid SWHID: invalid version: %(version)s", params={"version": value} | |||||
) | |||||
@object_id.validator | |||||
def check_object_id(self, attribute, value): | |||||
if len(value) != 20: | |||||
raise ValidationError( | |||||
"Invalid SWHID: invalid checksum: %(object_id)s", | |||||
params={"object_id": hash_to_hex(value)}, | |||||
) | |||||
def __str__(self) -> str: | |||||
return SWHID_SEP.join( | |||||
[ | |||||
self.namespace, | |||||
str(self.scheme_version), | |||||
self.object_type.value, | |||||
hash_to_hex(self.object_id), | |||||
] | |||||
) | |||||
@classmethod | |||||
def from_string(cls, s: str) -> ExtendedSWHID: | |||||
parts = _parse_swhid(s) | |||||
if parts.pop("qualifiers"): | |||||
raise ValidationError("ExtendedSWHID does not support qualifiers.") | |||||
return ExtendedSWHID(**parts) | |||||
@attr.s(frozen=True) | @attr.s(frozen=True) | ||||
class SWHID: | class SWHID: | ||||
""" | """ | ||||
Deprecated alternative to QualifiedSWHID. | Deprecated alternative to QualifiedSWHID. | ||||
Args: | Args: | ||||
namespace (str): the namespace of the identifier, defaults to ``swh`` | namespace (str): the namespace of the identifier, defaults to ``swh`` | ||||
scheme_version (int): the scheme version of the identifier, | scheme_version (int): the scheme version of the identifier, | ||||
▲ Show 20 Lines • Show All 195 Lines • Show Last 20 Lines |
"RAW_EXTRINSIC_METADATA" is a horribly long mouthful.
(But I don't have better suggestions at the moment...)