diff --git a/docs/data-model.rst b/docs/data-model.rst --- a/docs/data-model.rst +++ b/docs/data-model.rst @@ -271,13 +271,19 @@ **extid** a relationship between an original identifier of an artifact, in its native/upstream environment, and a `core SWHID`, - which is specific to Software Heritage. As such, it is a triple made of: + which is specific to Software Heritage. As such, it includes: * the external identifier, stored as bytes whose format is opaque to the data model * a type (a simple name and a version), to identify the type of relationship * the "target", which is a core SWHID + An extid may also include a "payload", which is arbitrary data about + the relationship. For example, an extid might link a directory to + the cryptographic hash of the tarball that originally contained it. + In this case, the payload could include data useful for + reconstructing the original tarball from the directory. + **raw extrinsic metadata** an opaque bytestring, along with its format (a simple name), an identifier of the object the metadata is about and in which context (similar to a diff --git a/swh/model/git_objects.py b/swh/model/git_objects.py --- a/swh/model/git_objects.py +++ b/swh/model/git_objects.py @@ -631,6 +631,8 @@ [extid_version $Str] extid $Bytes target $CoreSwhid + [payload_type $StrWithoutSpaces] + [payload $Sha1] ``` $StrWithoutSpaces is an ASCII string, and may not contain spaces. @@ -639,6 +641,10 @@ space after them. The extid_version line is only generated if the version is non-zero. + + The payload_type and payload lines are only generated if they are not + `None`. $Sha1 means the 20 bytes of a SHA-1 hash value. 
+ """ headers = [ @@ -655,4 +661,12 @@ ] ) + payload_type = extid.payload_type + if payload_type is not None: + headers.append((b"payload_type", str(payload_type).encode("ascii"))) + + payload = extid.payload + if payload is not None: + headers.append((b"payload", payload)) + return format_git_object_from_headers("extid", headers) diff --git a/swh/model/model.py b/swh/model/model.py --- a/swh/model/model.py +++ b/swh/model/model.py @@ -1845,10 +1845,23 @@ target = attr.ib(type=CoreSWHID, validator=generic_type_validator) extid_version = attr.ib(type=int, validator=generic_type_validator, default=0) - id = attr.ib( - type=Sha1Git, validator=generic_type_validator, default=b"", repr=hash_repr + payload_type = attr.ib(type=Optional[str], validator=generic_type_validator, default=None) + payload = attr.ib( + type=Optional[Sha1], validator=generic_type_validator, default=None, repr=hash_repr ) + id = attr.ib(type=Sha1Git, validator=generic_type_validator, default=b"", repr=hash_repr) + + @payload_type.validator + def check_payload_type(self, attribute, value): + if value is not None and self.payload is None: + raise ValueError("'payload' must be set if 'payload_type' is.") + + @payload.validator + def check_payload(self, attribute, value): + if value is not None and self.payload_type is None: + raise ValueError("'payload_type' must be set if 'payload' is.") + @classmethod def from_dict(cls, d): return cls( @@ -1856,6 +1869,8 @@ extid_type=d["extid_type"], target=CoreSWHID.from_string(d["target"]), extid_version=d.get("extid_version", 0), + payload_type=d.get("payload_type"), + payload=d.get("payload"), ) def _compute_hash_from_attributes(self) -> bytes: diff --git a/swh/model/tests/swh_model_data.py b/swh/model/tests/swh_model_data.py --- a/swh/model/tests/swh_model_data.py +++ b/swh/model/tests/swh_model_data.py @@ -160,25 +160,6 @@ ), ] -EXTIDS = [ - ExtID( - extid_type="git256", - extid=b"\x03" * 32, - target=REVISIONS[0].swhid(), - ), - ExtID( - extid_type="hg", - 
extid=b"\x04" * 20, - target=REVISIONS[1].swhid(), - ), - ExtID( - extid_type="hg-nodeid", - extid=b"\x05" * 20, - target=REVISIONS[1].swhid(), - extid_version=1, - ), -] - RELEASES = [ Release( id=hash_to_bytes("8059dc4e17fcd0e51ca3bcd6b80f4577d281fd08"), @@ -441,6 +422,31 @@ ), ] +EXTIDS = [ + ExtID( + extid_type="git256", + extid=b"\x03" * 32, + target=REVISIONS[0].swhid(), + ), + ExtID( + extid_type="hg", + extid=b"\x04" * 20, + target=REVISIONS[1].swhid(), + ), + ExtID( + extid_type="hg-nodeid", + extid=b"\x05" * 20, + target=REVISIONS[1].swhid(), + extid_version=1, + ), + ExtID( + extid_type="tarball-sha256", + extid=b"\x03" * 32, + target=DIRECTORIES[0].swhid(), + payload_type="disarchive", + payload=CONTENTS[0].sha1, + ), +] TEST_OBJECTS: Dict[str, Sequence[BaseModel]] = { "content": CONTENTS, diff --git a/swh/model/tests/test_identifiers.py b/swh/model/tests/test_identifiers.py --- a/swh/model/tests/test_identifiers.py +++ b/swh/model/tests/test_identifiers.py @@ -1343,3 +1343,14 @@ ExtID.from_dict({**extid_dict, "extid_version": 1}).id != ExtID.from_dict(extid_dict).id ) + + assert ( + ExtID.from_dict( + { + **extid_dict, + "payload_type": "test", + "payload": bytes.fromhex("f1d2d2f924e986ac86fdf7b36c94bcdf32beec15"), + } + ).id + != ExtID.from_dict(extid_dict).id + )