diff --git a/swh/model/model.py b/swh/model/model.py --- a/swh/model/model.py +++ b/swh/model/model.py @@ -35,6 +35,10 @@ pass +KeyType = Union[Dict[str, str], Dict[str, bytes], bytes] +"""The type returned by BaseModel.unique_key().""" + + SHA1_SIZE = 20 # TODO: Limit this to 20 bytes @@ -98,6 +102,11 @@ """ return None + def unique_key(self) -> KeyType: + """Returns a unique key for this object, that can be used for + deduplication.""" + raise NotImplementedError(f"unique_key for {self}") + class HashableObject(metaclass=ABCMeta): """Mixin to automatically compute object identifier hash when @@ -115,6 +124,9 @@ obj_id = hash_to_bytes(self.compute_hash(self.to_dict())) object.__setattr__(self, "id", obj_id) + def unique_key(self) -> KeyType: + return self.id # type: ignore + @attr.s(frozen=True) class Person(BaseModel): @@ -252,6 +264,9 @@ url = attr.ib(type=str, validator=type_validator()) + def unique_key(self) -> KeyType: + return {"url": self.url} + @attr.s(frozen=True) class OriginVisit(BaseModel): @@ -280,6 +295,9 @@ del ov["visit"] return ov + def unique_key(self) -> KeyType: + return {"origin": self.origin, "date": str(self.date)} + @attr.s(frozen=True) class OriginVisitStatus(BaseModel): @@ -311,6 +329,9 @@ if value is not None and value.tzinfo is None: raise ValueError("date must be a timezone-aware datetime.") + def unique_key(self) -> KeyType: + return {"origin": self.origin, "visit": str(self.visit), "date": str(self.date)} + class TargetType(Enum): """The type of content pointed to by a snapshot branch. Usually a @@ -357,7 +378,7 @@ @attr.s(frozen=True) -class Snapshot(BaseModel, HashableObject): +class Snapshot(HashableObject, BaseModel): """Represents the full state of an origin at a given point in time.""" object_type: Final = "snapshot" @@ -386,7 +407,7 @@ @attr.s(frozen=True) -class Release(BaseModel, HashableObject): +class Release(HashableObject, BaseModel): object_type: Final = "release" name = attr.ib(type=bytes, validator=type_validator()) @@ -453,7 +474,7 @@ @attr.s(frozen=True) -class Revision(BaseModel, HashableObject): +class Revision(HashableObject, BaseModel): object_type: Final = "revision" message = attr.ib(type=Optional[bytes], validator=type_validator()) @@ -543,7 +564,7 @@ @attr.s(frozen=True) -class Directory(BaseModel, HashableObject): +class Directory(HashableObject, BaseModel): object_type: Final = "directory" entries = attr.ib(type=Tuple[DirectoryEntry, ...], validator=type_validator()) @@ -675,6 +696,9 @@ raise MissingData("Content data is None.") return self + def unique_key(self) -> KeyType: + return self.sha1 # TODO: use a dict of hashes + @attr.s(frozen=True) class SkippedContent(BaseContent): @@ -752,6 +776,9 @@ raise ValueError('SkippedContent has no "data" attribute %r' % d) return super().from_dict(d2, use_subclass=False) + def unique_key(self) -> KeyType: + return self.hashes() + class MetadataAuthorityType(Enum): DEPOSIT_CLIENT = "deposit_client" @@ -786,6 +813,9 @@ d["type"] = MetadataAuthorityType(d["type"]) return super().from_dict(d) + def unique_key(self) -> KeyType: + return {"type": self.type.value, "url": self.url} + @attr.s(frozen=True) class MetadataFetcher(BaseModel): @@ -809,6 +839,9 @@ del d["metadata"] return d + def unique_key(self) -> KeyType: + return {"name": self.name, "version": self.version} + class MetadataTargetType(Enum): """The type of object extrinsic metadata refer to.""" @@ -1024,3 +1057,14 @@ d[swhid_key] = parse_swhid(d[swhid_key]) return super().from_dict(d) + + def unique_key(self) -> KeyType: + return { + "type": self.type.value, + "id": str(self.id), + "authority_type": self.authority.type.value, + "authority_url": self.authority.url, + "discovery_date": str(self.discovery_date), + "fetcher_name": self.fetcher.name, + "fetcher_version": self.fetcher.version, + } diff --git a/swh/model/tests/test_model.py b/swh/model/tests/test_model.py --- a/swh/model/tests/test_model.py +++ b/swh/model/tests/test_model.py @@ -71,6 +71,36 @@ assert obj_as_dict == type(obj).from_dict(obj_as_dict).to_dict() +def test_unique_key(): + url = "http://example.org/" + date = datetime.datetime.now(tz=datetime.timezone.utc) + id_ = b"42" * 10 + assert Origin(url=url).unique_key() == {"url": url} + assert OriginVisit(origin=url, date=date, type="git").unique_key() == { + "origin": url, + "date": str(date), + } + assert OriginVisitStatus( + origin=url, visit=42, date=date, status="created", snapshot=None + ).unique_key() == {"origin": url, "visit": "42", "date": str(date),} + + assert Snapshot.from_dict({**snapshot_example, "id": id_}).unique_key() == id_ + assert Release.from_dict({**release_example, "id": id_}).unique_key() == id_ + assert Revision.from_dict({**revision_example, "id": id_}).unique_key() == id_ + assert Directory.from_dict({**directory_example, "id": id_}).unique_key() == id_ + + cont = Content.from_data(b"foo") + assert cont.unique_key().hex() == "0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33" + + kwargs = { + **cont.to_dict(), + "reason": "foo", + "status": "absent", + } + del kwargs["data"] + assert SkippedContent(**kwargs).unique_key() == cont.hashes() + + # Anonymization