class ExtendedObjectType(enum.Enum):
    """Possible object types of an ExtendedSWHID.

    The variants are a superset of :class:`ObjectType`'s: the same five
    object types, plus ``ORIGIN`` and ``RAW_EXTRINSIC_METADATA``.

    The value of each variant is what is used in the SWHID's string
    representation."""

    # Object types shared with ObjectType.
    SNAPSHOT = "snp"
    REVISION = "rev"
    RELEASE = "rel"
    DIRECTORY = "dir"
    CONTENT = "cnt"

    # Object types that only exist in the extended set.
    ORIGIN = "ori"
    RAW_EXTRINSIC_METADATA = "emd"
@attr.s(frozen=True, kw_only=True)
class ExtendedSWHID:
    """
    Dataclass holding the relevant info associated to a SoftWare Heritage
    persistent IDentifier (SWHID).

    It is a variant of `CoreSWHID` that also allows non-standard object types
    (see :class:`ExtendedObjectType`); it should only be used internally to
    Software Heritage.

    Raises:
        swh.model.exceptions.ValidationError: In case of invalid namespace,
            scheme version, or object id length
        ValueError: If ``object_type`` cannot be converted to an
            :class:`ExtendedObjectType` (the converter runs before validation)

    To get the raw SWHID string from an instance of this class,
    use the :func:`str` function:

    >>> swhid = ExtendedSWHID(
    ...     object_type=ExtendedObjectType.CONTENT,
    ...     object_id=bytes.fromhex('8ff44f081d43176474b267de5451f2c2e88089d0'),
    ... )
    >>> str(swhid)
    'swh:1:cnt:8ff44f081d43176474b267de5451f2c2e88089d0'

    And vice-versa with :meth:`ExtendedSWHID.from_string`:

    >>> swhid == ExtendedSWHID.from_string(
    ...     "swh:1:cnt:8ff44f081d43176474b267de5451f2c2e88089d0"
    ... )
    True
    """

    namespace = attr.ib(type=str, default=SWHID_NAMESPACE)
    """the namespace of the identifier, defaults to ``swh``"""

    scheme_version = attr.ib(type=int, default=SWHID_VERSION)
    """the scheme version of the identifier, defaults to 1"""

    object_type = attr.ib(
        type=ExtendedObjectType,
        validator=type_validator(),
        converter=ExtendedObjectType,
    )
    """the type of object the identifier points to"""

    object_id = attr.ib(type=bytes, validator=type_validator())
    """object's identifier"""

    @namespace.validator
    def check_namespace(self, attribute, value):
        """Reject any namespace other than ``SWHID_NAMESPACE``."""
        if value != SWHID_NAMESPACE:
            raise ValidationError(
                "Invalid SWHID: invalid namespace: %(namespace)s",
                params={"namespace": value},
            )

    @scheme_version.validator
    def check_scheme_version(self, attribute, value):
        """Reject any scheme version other than ``SWHID_VERSION``."""
        if value != SWHID_VERSION:
            raise ValidationError(
                "Invalid SWHID: invalid version: %(version)s", params={"version": value}
            )

    @object_id.validator
    def check_object_id(self, attribute, value):
        """Reject object ids that are not exactly 20 bytes long."""
        if len(value) != 20:
            raise ValidationError(
                "Invalid SWHID: invalid checksum: %(object_id)s",
                params={"object_id": hash_to_hex(value)},
            )

    def __str__(self) -> str:
        """Return the ``<namespace>:<version>:<type>:<hex id>`` string form."""
        return SWHID_SEP.join(
            [
                self.namespace,
                str(self.scheme_version),
                self.object_type.value,
                hash_to_hex(self.object_id),
            ]
        )

    @classmethod
    def from_string(cls, s: str) -> "ExtendedSWHID":
        """Build an instance from the string form of a SWHID.

        Args:
            s: the SWHID string to parse

        Returns:
            an instance of ``cls`` built from the parsed components

        Raises:
            swh.model.exceptions.ValidationError: if the parsed SWHID carries
                qualifiers, which this class does not support
        """
        parts = _parse_swhid(s)
        if parts.pop("qualifiers"):
            raise ValidationError("ExtendedSWHID does not support qualifiers.")
        # Instantiate through ``cls`` so that subclasses get their own type back.
        return cls(**parts)
@pytest.mark.parametrize(
    "string,object_type,scheme_version,hex_id",
    [
        (
            "swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2",
            ExtendedObjectType.CONTENT,
            1,
            "94a9ed024d3859793618152ea559a168bbcbb5e2",
        ),
        (
            "swh:1:dir:d198bc9d7a6bcf6db04f476d29314f157507d505",
            ExtendedObjectType.DIRECTORY,
            1,
            "d198bc9d7a6bcf6db04f476d29314f157507d505",
        ),
        (
            "swh:1:rev:309cf2674ee7a0749978cf8265ab91a60aea0f7d",
            ExtendedObjectType.REVISION,
            1,
            "309cf2674ee7a0749978cf8265ab91a60aea0f7d",
        ),
        (
            "swh:1:rel:22ece559cc7cc2364edc5e5593d63ae8bd229f9f",
            ExtendedObjectType.RELEASE,
            1,
            "22ece559cc7cc2364edc5e5593d63ae8bd229f9f",
        ),
        (
            "swh:1:snp:c7c108084bc0bf3d81436bf980b46e98bd338453",
            ExtendedObjectType.SNAPSHOT,
            1,
            "c7c108084bc0bf3d81436bf980b46e98bd338453",
        ),
        (
            "swh:1:ori:c7c108084bc0bf3d81436bf980b46e98bd338453",
            ExtendedObjectType.ORIGIN,
            1,
            "c7c108084bc0bf3d81436bf980b46e98bd338453",
        ),
        (
            "swh:1:emd:c7c108084bc0bf3d81436bf980b46e98bd338453",
            ExtendedObjectType.RAW_EXTRINSIC_METADATA,
            1,
            "c7c108084bc0bf3d81436bf980b46e98bd338453",
        ),
    ],
)
def test_parse_serialize_extended_swhid(
    string, object_type, scheme_version, hex_id
):
    """Parsing a SWHID string and serializing the result round-trips."""
    expected = ExtendedSWHID(
        namespace="swh",
        scheme_version=scheme_version,
        object_type=object_type,
        object_id=_x(hex_id),
    )
    parsed = ExtendedSWHID.from_string(string)
    assert parsed == expected
    assert str(expected) == str(parsed) == string


@pytest.mark.parametrize(
    "invalid_swhid",
    [
        "swh:1:cnt",
        "swh:1:",
        "swh:",
        "swh:1:cnt:",
        "foo:1:cnt:abc8bc9d7a6bcf6db04f476d29314f157507d505",
        "swh:2:dir:def8bc9d7a6bcf6db04f476d29314f157507d505",
        "swh:1:foo:fed8bc9d7a6bcf6db04f476d29314f157507d505",
        "swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;visit=swh:1:snp:gh6959356d30f1a4e9b7f6bca59b9a336464c03d",  # noqa
        "swh:1:snp:gh6959356d30f1a4e9b7f6bca59b9a336464c03d",
        "swh:1:snp:foo",
        "swh:1: dir: 0b6959356d30f1a4e9b7f6bca59b9a336464c03d",
    ],
)
def test_parse_extended_swhid_parsing_error(invalid_swhid):
    """Malformed or qualified SWHID strings are rejected by ``from_string``."""
    with pytest.raises(ValidationError):
        ExtendedSWHID.from_string(invalid_swhid)


@pytest.mark.filterwarnings("ignore:.*SWHID.*:DeprecationWarning")
@pytest.mark.parametrize(
    "ns,version,object_type,hex_id",
    [
        (
            "foo",
            1,
            ExtendedObjectType.CONTENT,
            "abc8bc9d7a6bcf6db04f476d29314f157507d505",
        ),
        (
            "swh",
            2,
            ExtendedObjectType.CONTENT,
            "def8bc9d7a6bcf6db04f476d29314f157507d505",
        ),
        ("swh", 1, ExtendedObjectType.DIRECTORY, "aaaa"),
    ],
)
def test_ExtendedSWHID_validation_error(ns, version, object_type, hex_id):
    """Invalid namespace, version or object id length fail validation."""
    with pytest.raises(ValidationError):
        ExtendedSWHID(
            namespace=ns,
            scheme_version=version,
            object_type=object_type,
            object_id=_x(hex_id),
        )


def test_ExtendedSWHID_hash():
    """Two instances built from the same fields hash identically."""
    object_id = _x("94a9ed024d3859793618152ea559a168bbcbb5e2")

    first = ExtendedSWHID(object_type=ExtendedObjectType.DIRECTORY, object_id=object_id)
    second = ExtendedSWHID(
        object_type=ExtendedObjectType.DIRECTORY, object_id=object_id
    )

    assert hash(first) == hash(second)
    # Equal, equally-hashed instances must collapse to a single set member.
    assert len({first, second}) == 1


def test_ExtendedSWHID_eq():
    """Equality compares the object type and the object id."""
    object_id = _x("94a9ed024d3859793618152ea559a168bbcbb5e2")
    other_id = _x("c7c108084bc0bf3d81436bf980b46e98bd338453")

    reference = ExtendedSWHID(
        object_type=ExtendedObjectType.DIRECTORY, object_id=object_id
    )

    assert reference == ExtendedSWHID(
        object_type=ExtendedObjectType.DIRECTORY, object_id=object_id
    )
    assert reference != ExtendedSWHID(
        object_type=ExtendedObjectType.CONTENT, object_id=object_id
    )
    assert reference != ExtendedSWHID(
        object_type=ExtendedObjectType.DIRECTORY, object_id=other_id
    )