Page MenuHomeSoftware Heritage

D4078.diff
No OneTemporary

D4078.diff

diff --git a/swh/model/model.py b/swh/model/model.py
--- a/swh/model/model.py
+++ b/swh/model/model.py
@@ -35,6 +35,10 @@
pass
+KeyType = Union[Dict[str, str], Dict[str, bytes], bytes]
+"""The type returned by BaseModel.unique_key()."""
+
+
SHA1_SIZE = 20
# TODO: Limit this to 20 bytes
@@ -98,6 +102,11 @@
"""
return None
+ def unique_key(self) -> KeyType:
+ """Returns a unique key for this object that can be used for
+ deduplication."""
+ raise NotImplementedError(f"unique_key for {self}")
+
class HashableObject(metaclass=ABCMeta):
"""Mixin to automatically compute object identifier hash when
@@ -115,6 +124,9 @@
obj_id = hash_to_bytes(self.compute_hash(self.to_dict()))
object.__setattr__(self, "id", obj_id)
+ def unique_key(self) -> KeyType:
+ return self.id # type: ignore
+
@attr.s(frozen=True)
class Person(BaseModel):
@@ -252,6 +264,9 @@
url = attr.ib(type=str, validator=type_validator())
+ def unique_key(self) -> KeyType:
+ return {"url": self.url}
+
@attr.s(frozen=True)
class OriginVisit(BaseModel):
@@ -280,6 +295,9 @@
del ov["visit"]
return ov
+ def unique_key(self) -> KeyType:
+ return {"origin": self.origin, "date": str(self.date)}
+
@attr.s(frozen=True)
class OriginVisitStatus(BaseModel):
@@ -311,6 +329,9 @@
if value is not None and value.tzinfo is None:
raise ValueError("date must be a timezone-aware datetime.")
+ def unique_key(self) -> KeyType:
+ return {"origin": self.origin, "visit": str(self.visit), "date": str(self.date)}
+
class TargetType(Enum):
"""The type of content pointed to by a snapshot branch. Usually a
@@ -357,7 +378,7 @@
@attr.s(frozen=True)
-class Snapshot(BaseModel, HashableObject):
+class Snapshot(HashableObject, BaseModel):
"""Represents the full state of an origin at a given point in time."""
object_type: Final = "snapshot"
@@ -386,7 +407,7 @@
@attr.s(frozen=True)
-class Release(BaseModel, HashableObject):
+class Release(HashableObject, BaseModel):
object_type: Final = "release"
name = attr.ib(type=bytes, validator=type_validator())
@@ -453,7 +474,7 @@
@attr.s(frozen=True)
-class Revision(BaseModel, HashableObject):
+class Revision(HashableObject, BaseModel):
object_type: Final = "revision"
message = attr.ib(type=Optional[bytes], validator=type_validator())
@@ -543,7 +564,7 @@
@attr.s(frozen=True)
-class Directory(BaseModel, HashableObject):
+class Directory(HashableObject, BaseModel):
object_type: Final = "directory"
entries = attr.ib(type=Tuple[DirectoryEntry, ...], validator=type_validator())
@@ -675,6 +696,9 @@
raise MissingData("Content data is None.")
return self
+ def unique_key(self) -> KeyType:
+ return self.sha1 # TODO: use a dict of hashes
+
@attr.s(frozen=True)
class SkippedContent(BaseContent):
@@ -752,6 +776,9 @@
raise ValueError('SkippedContent has no "data" attribute %r' % d)
return super().from_dict(d2, use_subclass=False)
+ def unique_key(self) -> KeyType:
+ return self.hashes()
+
class MetadataAuthorityType(Enum):
DEPOSIT_CLIENT = "deposit_client"
@@ -786,6 +813,9 @@
d["type"] = MetadataAuthorityType(d["type"])
return super().from_dict(d)
+ def unique_key(self) -> KeyType:
+ return {"type": self.type.value, "url": self.url}
+
@attr.s(frozen=True)
class MetadataFetcher(BaseModel):
@@ -809,6 +839,9 @@
del d["metadata"]
return d
+ def unique_key(self) -> KeyType:
+ return {"name": self.name, "version": self.version}
+
class MetadataTargetType(Enum):
"""The type of object extrinsic metadata refer to."""
@@ -1024,3 +1057,14 @@
d[swhid_key] = parse_swhid(d[swhid_key])
return super().from_dict(d)
+
+ def unique_key(self) -> KeyType:
+ return {
+ "type": self.type.value,
+ "id": str(self.id),
+ "authority_type": self.authority.type.value,
+ "authority_url": self.authority.url,
+ "discovery_date": str(self.discovery_date),
+ "fetcher_name": self.fetcher.name,
+ "fetcher_version": self.fetcher.version,
+ }
diff --git a/swh/model/tests/test_model.py b/swh/model/tests/test_model.py
--- a/swh/model/tests/test_model.py
+++ b/swh/model/tests/test_model.py
@@ -71,6 +71,36 @@
assert obj_as_dict == type(obj).from_dict(obj_as_dict).to_dict()
+def test_unique_key():
+ url = "http://example.org/"
+ date = datetime.datetime.now(tz=datetime.timezone.utc)
+ id_ = b"42" * 10
+ assert Origin(url=url).unique_key() == {"url": url}
+ assert OriginVisit(origin=url, date=date, type="git").unique_key() == {
+ "origin": url,
+ "date": str(date),
+ }
+ assert OriginVisitStatus(
+ origin=url, visit=42, date=date, status="created", snapshot=None
+ ).unique_key() == {"origin": url, "visit": "42", "date": str(date),}
+
+ assert Snapshot.from_dict({**snapshot_example, "id": id_}).unique_key() == id_
+ assert Release.from_dict({**release_example, "id": id_}).unique_key() == id_
+ assert Revision.from_dict({**revision_example, "id": id_}).unique_key() == id_
+ assert Directory.from_dict({**directory_example, "id": id_}).unique_key() == id_
+
+ cont = Content.from_data(b"foo")
+ assert cont.unique_key().hex() == "0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33"
+
+ kwargs = {
+ **cont.to_dict(),
+ "reason": "foo",
+ "status": "absent",
+ }
+ del kwargs["data"]
+ assert SkippedContent(**kwargs).unique_key() == cont.hashes()
+
+
# Anonymization

File Metadata

Mime Type
text/plain
Expires
Dec 21 2024, 9:24 AM (11 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3217038

Event Timeline