Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7124383
D4078.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
5 KB
Subscribers
None
D4078.diff
View Options
diff --git a/swh/model/model.py b/swh/model/model.py
--- a/swh/model/model.py
+++ b/swh/model/model.py
@@ -35,6 +35,10 @@
pass
+KeyType = Union[Dict[str, str], Dict[str, bytes], bytes]
+"""The type returned by BaseModel.unique_key()."""
+
+
SHA1_SIZE = 20
# TODO: Limit this to 20 bytes
@@ -98,6 +102,11 @@
"""
return None
+ def unique_key(self) -> KeyType:
+ """Returns a unique key for this object, that can be used for
+ deduplication."""
+ raise NotImplementedError(f"unique_key for {self}")
+
class HashableObject(metaclass=ABCMeta):
"""Mixin to automatically compute object identifier hash when
@@ -115,6 +124,9 @@
obj_id = hash_to_bytes(self.compute_hash(self.to_dict()))
object.__setattr__(self, "id", obj_id)
+ def unique_key(self) -> KeyType:
+ return self.id # type: ignore
+
@attr.s(frozen=True)
class Person(BaseModel):
@@ -252,6 +264,9 @@
url = attr.ib(type=str, validator=type_validator())
+ def unique_key(self) -> KeyType:
+ return {"url": self.url}
+
@attr.s(frozen=True)
class OriginVisit(BaseModel):
@@ -280,6 +295,9 @@
del ov["visit"]
return ov
+ def unique_key(self) -> KeyType:
+ return {"origin": self.origin, "date": str(self.date)}
+
@attr.s(frozen=True)
class OriginVisitStatus(BaseModel):
@@ -311,6 +329,9 @@
if value is not None and value.tzinfo is None:
raise ValueError("date must be a timezone-aware datetime.")
+ def unique_key(self) -> KeyType:
+ return {"origin": self.origin, "visit": str(self.visit), "date": str(self.date)}
+
class TargetType(Enum):
"""The type of content pointed to by a snapshot branch. Usually a
@@ -357,7 +378,7 @@
@attr.s(frozen=True)
-class Snapshot(BaseModel, HashableObject):
+class Snapshot(HashableObject, BaseModel):
"""Represents the full state of an origin at a given point in time."""
object_type: Final = "snapshot"
@@ -386,7 +407,7 @@
@attr.s(frozen=True)
-class Release(BaseModel, HashableObject):
+class Release(HashableObject, BaseModel):
object_type: Final = "release"
name = attr.ib(type=bytes, validator=type_validator())
@@ -453,7 +474,7 @@
@attr.s(frozen=True)
-class Revision(BaseModel, HashableObject):
+class Revision(HashableObject, BaseModel):
object_type: Final = "revision"
message = attr.ib(type=Optional[bytes], validator=type_validator())
@@ -543,7 +564,7 @@
@attr.s(frozen=True)
-class Directory(BaseModel, HashableObject):
+class Directory(HashableObject, BaseModel):
object_type: Final = "directory"
entries = attr.ib(type=Tuple[DirectoryEntry, ...], validator=type_validator())
@@ -675,6 +696,9 @@
raise MissingData("Content data is None.")
return self
+ def unique_key(self) -> KeyType:
+ return self.sha1 # TODO: use a dict of hashes
+
@attr.s(frozen=True)
class SkippedContent(BaseContent):
@@ -752,6 +776,9 @@
raise ValueError('SkippedContent has no "data" attribute %r' % d)
return super().from_dict(d2, use_subclass=False)
+ def unique_key(self) -> KeyType:
+ return self.hashes()
+
class MetadataAuthorityType(Enum):
DEPOSIT_CLIENT = "deposit_client"
@@ -786,6 +813,9 @@
d["type"] = MetadataAuthorityType(d["type"])
return super().from_dict(d)
+ def unique_key(self) -> KeyType:
+ return {"type": self.type.value, "url": self.url}
+
@attr.s(frozen=True)
class MetadataFetcher(BaseModel):
@@ -809,6 +839,9 @@
del d["metadata"]
return d
+ def unique_key(self) -> KeyType:
+ return {"name": self.name, "version": self.version}
+
class MetadataTargetType(Enum):
"""The type of object extrinsic metadata refer to."""
@@ -1024,3 +1057,14 @@
d[swhid_key] = parse_swhid(d[swhid_key])
return super().from_dict(d)
+
+ def unique_key(self) -> KeyType:
+ return {
+ "type": self.type.value,
+ "id": str(self.id),
+ "authority_type": self.authority.type.value,
+ "authority_url": self.authority.url,
+ "discovery_date": str(self.discovery_date),
+ "fetcher_name": self.fetcher.name,
+ "fetcher_version": self.fetcher.version,
+ }
diff --git a/swh/model/tests/test_model.py b/swh/model/tests/test_model.py
--- a/swh/model/tests/test_model.py
+++ b/swh/model/tests/test_model.py
@@ -71,6 +71,36 @@
assert obj_as_dict == type(obj).from_dict(obj_as_dict).to_dict()
+def test_unique_key():
+ url = "http://example.org/"
+ date = datetime.datetime.now(tz=datetime.timezone.utc)
+ id_ = b"42" * 10
+ assert Origin(url=url).unique_key() == {"url": url}
+ assert OriginVisit(origin=url, date=date, type="git").unique_key() == {
+ "origin": url,
+ "date": str(date),
+ }
+ assert OriginVisitStatus(
+ origin=url, visit=42, date=date, status="created", snapshot=None
+ ).unique_key() == {"origin": url, "visit": "42", "date": str(date),}
+
+ assert Snapshot.from_dict({**snapshot_example, "id": id_}).unique_key() == id_
+ assert Release.from_dict({**release_example, "id": id_}).unique_key() == id_
+ assert Revision.from_dict({**revision_example, "id": id_}).unique_key() == id_
+ assert Directory.from_dict({**directory_example, "id": id_}).unique_key() == id_
+
+ cont = Content.from_data(b"foo")
+ assert cont.unique_key().hex() == "0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33"
+
+ kwargs = {
+ **cont.to_dict(),
+ "reason": "foo",
+ "status": "absent",
+ }
+ del kwargs["data"]
+ assert SkippedContent(**kwargs).unique_key() == cont.hashes()
+
+
# Anonymization
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Dec 21 2024, 9:24 AM (11 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3217038
Attached To
D4078: Add a 'unique_key' method on model objects
Event Timeline
Log In to Comment