diff --git a/swh/model/identifiers.py b/swh/model/identifiers.py
--- a/swh/model/identifiers.py
+++ b/swh/model/identifiers.py
@@ -8,10 +8,11 @@
 import binascii
 import datetime
 from functools import lru_cache
-import hashlib
 from typing import Any, Dict, Iterable, List, Optional, Tuple
 
-from .hashutil import MultiHash, git_object_header
+from . import model
+from .collections import ImmutableDict
+from .hashutil import MultiHash, git_object_header, hash_to_bytehex, hash_to_hex
 
 # Reexport for backward compatibility
 from .swhids import *  # noqa
@@ -116,12 +117,15 @@
     return MultiHash.from_data(content["data"]).digest()
 
 
-def directory_entry_sort_key(entry):
+def directory_entry_sort_key(entry: model.DirectoryEntry):
     """The sorting key for tree entries"""
-    if entry["type"] == "dir":
-        return entry["name"] + b"/"
+    if isinstance(entry, dict):
+        # For backward compatibility
+        entry = model.DirectoryEntry.from_dict(entry)
+    if entry.type == "dir":
+        return entry.name + b"/"
     else:
-        return entry["name"]
+        return entry.name
 
 
 @lru_cache()
@@ -181,28 +185,25 @@
     (Note that there is no separator between entries)
 
     """
-    git_object = directory_git_object(directory)
-    return hashlib.new("sha1", git_object).hexdigest()
+    return hash_to_hex(model.Directory.from_dict(directory).id)
 
 
-def directory_git_object(directory: Dict[str, Any]) -> bytes:
+def directory_git_object(directory: model.Directory) -> bytes:
+    if isinstance(directory, dict):
+        # For backward compatibility
+        directory = model.Directory.from_dict(directory)
+
     components = []
-    for entry in sorted(directory["entries"], key=directory_entry_sort_key):
+    for entry in sorted(directory.entries, key=directory_entry_sort_key):
         components.extend(
-            [
-                _perms_to_bytes(entry["perms"]),
-                b"\x20",
-                entry["name"],
-                b"\x00",
-                identifier_to_bytes(entry["target"]),
-            ]
+            [_perms_to_bytes(entry.perms), b"\x20", entry.name, b"\x00", entry.target,]
        )
     return format_git_object_from_parts("tree", components)
 
 
-def format_date(date):
+def format_date(date: model.Timestamp) -> bytes:
     """Convert a date object into an UTC timestamp encoded as ascii bytes.
 
     Git stores timestamps as an integer number of seconds since the UNIX epoch.
@@ -216,20 +217,19 @@
     representation if we ever need more precision in timestamps.
 
     """
-    if not isinstance(date, dict):
-        raise ValueError("format_date only supports dicts, %r received" % date)
+    if isinstance(date, dict):
+        # For backward compatibility
+        date = model.Timestamp.from_dict(date)
 
-    seconds = date.get("seconds", 0)
-    microseconds = date.get("microseconds", 0)
-    if not microseconds:
-        return str(seconds).encode()
+    if not date.microseconds:
+        return str(date.seconds).encode()
     else:
-        float_value = "%d.%06d" % (seconds, microseconds)
+        float_value = "%d.%06d" % (date.seconds, date.microseconds)
         return float_value.rstrip("0").encode()
 
 
 @lru_cache()
-def format_offset(offset, negative_utc=None):
+def format_offset(offset: int, negative_utc: Optional[bool] = None) -> bytes:
     """Convert an integer number of minutes into an offset representation.
 
     The offset representation is [+-]hhmm where:
@@ -276,86 +276,10 @@
               when offset = 0.
""" - if time_representation is None: return None - - negative_utc = False - - if isinstance(time_representation, dict): - ts = time_representation["timestamp"] - if isinstance(ts, dict): - seconds = ts.get("seconds", 0) - microseconds = ts.get("microseconds", 0) - elif isinstance(ts, int): - seconds = ts - microseconds = 0 - else: - raise ValueError( - "normalize_timestamp received non-integer timestamp member:" " %r" % ts - ) - offset = time_representation["offset"] - if "negative_utc" in time_representation: - negative_utc = time_representation["negative_utc"] - if negative_utc is None: - negative_utc = False - elif isinstance(time_representation, datetime.datetime): - microseconds = time_representation.microsecond - if microseconds: - time_representation = time_representation.replace(microsecond=0) - seconds = int(time_representation.timestamp()) - utcoffset = time_representation.utcoffset() - if utcoffset is None: - raise ValueError( - "normalize_timestamp received datetime without timezone: %s" - % time_representation - ) - - # utcoffset is an integer number of minutes - seconds_offset = utcoffset.total_seconds() - offset = int(seconds_offset) // 60 - elif isinstance(time_representation, int): - seconds = time_representation - microseconds = 0 - offset = 0 else: - raise ValueError( - "normalize_timestamp received non-integer timestamp:" - " %r" % time_representation - ) - - return { - "timestamp": {"seconds": seconds, "microseconds": microseconds,}, - "offset": offset, - "negative_utc": negative_utc, - } - - -def format_author(author): - """Format the specification of an author. - - An author is either a byte string (passed unchanged), or a dict with three - keys, fullname, name and email. - - If the fullname exists, return it; if it doesn't, we construct a fullname - using the following heuristics: if the name value is None, we return the - email in angle brackets, else, we return the name, a space, and the email - in angle brackets. - - """ - if isinstance(author, bytes) or author is None: - return author - - if "fullname" in author: - return author["fullname"] - - ret = [] - if author["name"] is not None: - ret.append(author["name"]) - if author["email"] is not None: - ret.append(b"".join([b"<", author["email"], b">"])) - - return b" ".join(ret) + return model.TimestampWithTimezone.from_dict(time_representation).to_dict() def format_git_object_from_headers( @@ -411,7 +335,9 @@ return header + concatenated_parts -def format_author_data(author, date_offset) -> bytes: +def format_author_data( + author: model.Person, date_offset: Optional[model.TimestampWithTimezone] +) -> bytes: """Format authorship data according to git standards. Git authorship data has two components: @@ -433,24 +359,16 @@ tools can pass a negative offset corresponding to the UTC timezone ('-0000'), which is valid and is encoded as such. - Args: - author: an author specification (dict with two bytes values: name and - email, or byte value) - date_offset: a normalized date/time representation as returned by - :func:`normalize_timestamp`. 
-
     Returns:
         the byte string containing the authorship data
 
     """
-    ret = [format_author(author)]
-
-    date_offset = normalize_timestamp(date_offset)
+    ret = [author.fullname]
 
     if date_offset is not None:
-        date_f = format_date(date_offset["timestamp"])
-        offset_f = format_offset(date_offset["offset"], date_offset["negative_utc"])
+        date_f = format_date(date_offset.timestamp)
+        offset_f = format_offset(date_offset.offset, date_offset.negative_utc)
 
         ret.extend([b" ", date_f, b" ", offset_f])
 
@@ -489,7 +407,8 @@
     The directory identifier is the ascii representation of its hexadecimal
     encoding.
 
-    Author and committer are formatted with the :func:`format_author` function.
+    Author and committer are formatted as described in the :meth:`Person.from_dict`
+    function.
     Dates are formatted with the :func:`format_offset` function.
 
     Extra headers are an ordered list of [key, value] pairs. Keys are strings
@@ -507,74 +426,71 @@
     type.
 
     """
-    git_object = revision_git_object(revision)
-    return hashlib.new("sha1", git_object).hexdigest()
+    return hash_to_hex(model.Revision.from_dict(revision).id)
 
 
-def revision_git_object(revision: Dict[str, Any]) -> bytes:
+def revision_git_object(revision: model.Revision) -> bytes:
     """Formats the git_object of a revision. See :func:`revision_identifier`
     for details on the format."""
-    headers = [(b"tree", identifier_to_str(revision["directory"]).encode())]
-    for parent in revision["parents"]:
+    if isinstance(revision, dict):
+        # For backward compatibility
+        revision = model.Revision.from_dict(revision)
+
+    headers = [(b"tree", hash_to_bytehex(revision.directory))]
+    for parent in revision.parents:
         if parent:
-            headers.append((b"parent", identifier_to_str(parent).encode()))
+            headers.append((b"parent", hash_to_bytehex(parent)))
 
+    headers.append((b"author", format_author_data(revision.author, revision.date)))
     headers.append(
-        (b"author", format_author_data(revision["author"], revision["date"]))
-    )
-    headers.append(
-        (
-            b"committer",
-            format_author_data(revision["committer"], revision["committer_date"]),
-        )
+        (b"committer", format_author_data(revision.committer, revision.committer_date),)
     )
 
     # Handle extra headers
-    metadata = revision.get("metadata") or {}
-    extra_headers = revision.get("extra_headers", ())
+    metadata = revision.metadata or ImmutableDict()
+    extra_headers = revision.extra_headers or ()
     if not extra_headers and "extra_headers" in metadata:
         extra_headers = metadata["extra_headers"]
 
     headers.extend(extra_headers)
 
-    return format_git_object_from_headers("commit", headers, revision["message"])
+    return format_git_object_from_headers("commit", headers, revision.message)
 
 
-def target_type_to_git(target_type: str) -> bytes:
+def target_type_to_git(target_type: model.ObjectType) -> bytes:
     """Convert a software heritage target type to a git object type"""
     return {
-        "content": b"blob",
-        "directory": b"tree",
-        "revision": b"commit",
-        "release": b"tag",
-        "snapshot": b"refs",
+        model.ObjectType.CONTENT: b"blob",
+        model.ObjectType.DIRECTORY: b"tree",
+        model.ObjectType.REVISION: b"commit",
+        model.ObjectType.RELEASE: b"tag",
+        model.ObjectType.SNAPSHOT: b"refs",
     }[target_type]
 
 
 def release_identifier(release: Dict[str, Any]) -> str:
     """Return the intrinsic identifier for a release."""
-    git_object = release_git_object(release)
-    return hashlib.new("sha1", git_object).hexdigest()
+    return hash_to_hex(model.Release.from_dict(release).id)
 
+
+def release_git_object(release: model.Release) -> bytes:
+    if isinstance(release, dict):
+        # For backward compatibility
+        release = model.Release.from_dict(release)
 
-def release_git_object(release: Dict[str, Any]) -> bytes:
     headers = [
-        (b"object", identifier_to_str(release["target"]).encode()),
-        (b"type", target_type_to_git(release["target_type"])),
-        (b"tag", release["name"]),
+        (b"object", hash_to_bytehex(release.target)),
+        (b"type", target_type_to_git(release.target_type)),
+        (b"tag", release.name),
     ]
 
-    if "author" in release and release["author"]:
-        headers.append(
-            (b"tagger", format_author_data(release["author"], release["date"]))
-        )
+    if release.author is not None:
+        headers.append((b"tagger", format_author_data(release.author, release.date)))
 
-    return format_git_object_from_headers("tag", headers, release["message"])
+    return format_git_object_from_headers("tag", headers, release.message)
 
 
-def snapshot_identifier(
-    snapshot: Dict[str, Any], *, ignore_unresolved: bool = False
-) -> str:
+def snapshot_identifier(snapshot: Dict[str, Any]) -> str:
     """Return the intrinsic identifier for a snapshot.
 
     Snapshots are a set of named branches, which are pointers to objects at any
@@ -623,36 +539,36 @@
         snapshot (dict): the snapshot of which to compute the identifier. A
             single entry is needed, ``'branches'``, which is itself a
             :class:`dict` mapping each branch to its target
-        ignore_unresolved (bool): if `True`, ignore unresolved branch aliases.
 
     Returns:
         str: the intrinsic identifier for `snapshot`
 
     """
-    git_object = snapshot_git_object(snapshot, ignore_unresolved=ignore_unresolved)
-    return hashlib.new("sha1", git_object).hexdigest()
+    return hash_to_hex(model.Snapshot.from_dict(snapshot).id)
 
 
-def snapshot_git_object(
-    snapshot: Dict[str, Any], *, ignore_unresolved: bool = False
-) -> bytes:
+def snapshot_git_object(snapshot: model.Snapshot) -> bytes:
     """Formats the git_object of a revision. See :func:`snapshot_identifier`
     for details on the format."""
+    if isinstance(snapshot, dict):
+        # For backward compatibility
+        snapshot = model.Snapshot.from_dict(snapshot)
+
     unresolved = []
     lines = []
 
-    for name, target in sorted(snapshot["branches"].items()):
+    for name, target in sorted(snapshot.branches.items()):
         if not target:
             target_type = b"dangling"
             target_id = b""
-        elif target["target_type"] == "alias":
+        elif target.target_type == model.TargetType.ALIAS:
             target_type = b"alias"
-            target_id = target["target"]
-            if target_id not in snapshot["branches"] or target_id == name:
+            target_id = target.target
+            if target_id not in snapshot.branches or target_id == name:
                 unresolved.append((name, target_id))
         else:
-            target_type = target["target_type"].encode()
-            target_id = identifier_to_bytes(target["target"])
+            target_type = target.target_type.value.encode()
+            target_id = target.target
 
         lines.extend(
             [
@@ -665,7 +581,7 @@
             ]
         )
 
-    if unresolved and not ignore_unresolved:
+    if unresolved:
         raise ValueError(
             "Branch aliases unresolved: %s"
             % ", ".join("%r -> %r" % x for x in unresolved),
@@ -681,7 +597,7 @@
    An origin's identifier is the sha1 checksum of the entire origin URL
 
     """
-    return hashlib.sha1(origin["url"].encode("utf-8")).hexdigest()
+    return hash_to_hex(model.Origin.from_dict(origin).id)
 
 
 def raw_extrinsic_metadata_identifier(metadata: Dict[str, Any]) -> str:
@@ -730,14 +646,17 @@
         str: the intrinsic identifier for ``metadata``
 
     """
-    git_object = raw_extrinsic_metadata_git_object(metadata)
-    return hashlib.new("sha1", git_object).hexdigest()
+    return hash_to_hex(model.RawExtrinsicMetadata.from_dict(metadata).id)
 
 
-def raw_extrinsic_metadata_git_object(metadata: Dict[str, Any]) -> bytes:
+def raw_extrinsic_metadata_git_object(metadata: model.RawExtrinsicMetadata) -> bytes:
     """Formats the git_object of a raw_extrinsic_metadata object.
 
     See :func:`raw_extrinsic_metadata_identifier` for details on the format."""
+    if isinstance(metadata, dict):
+        # For backward compatibility
+        metadata = model.RawExtrinsicMetadata.from_dict(metadata)
+
     # equivalent to using math.floor(dt.timestamp()) to round down,
     # as int(dt.timestamp()) rounds toward zero,
     # which would map two seconds on the 0 timestamp.
@@ -745,25 +664,21 @@
     # This should never be an issue in practice as Software Heritage didn't
     # start collecting metadata before 2015.
     timestamp = (
-        metadata["discovery_date"]
-        .astimezone(datetime.timezone.utc)
+        metadata.discovery_date.astimezone(datetime.timezone.utc)
         .replace(microsecond=0)
         .timestamp()
     )
     assert timestamp.is_integer()
 
     headers = [
-        (b"target", str(metadata["target"]).encode()),
+        (b"target", str(metadata.target).encode()),
         (b"discovery_date", str(int(timestamp)).encode("ascii")),
         (
             b"authority",
-            f"{metadata['authority']['type']} {metadata['authority']['url']}".encode(),
-        ),
-        (
-            b"fetcher",
-            f"{metadata['fetcher']['name']} {metadata['fetcher']['version']}".encode(),
+            f"{metadata.authority.type.value} {metadata.authority.url}".encode(),
         ),
-        (b"format", metadata["format"].encode()),
+        (b"fetcher", f"{metadata.fetcher.name} {metadata.fetcher.version}".encode(),),
+        (b"format", metadata.format.encode()),
     ]
 
     for key in (
@@ -775,17 +690,17 @@
         "path",
         "directory",
     ):
-        if metadata.get(key) is not None:
+        if getattr(metadata, key, None) is not None:
            value: bytes
            if key == "path":
-                value = metadata[key]
+                value = getattr(metadata, key)
            else:
-                value = str(metadata[key]).encode()
+                value = str(getattr(metadata, key)).encode()
 
            headers.append((key.encode("ascii"), value))
 
     return format_git_object_from_headers(
-        "raw_extrinsic_metadata", headers, metadata["metadata"]
+        "raw_extrinsic_metadata", headers, metadata.metadata
     )
 
 
@@ -814,16 +729,19 @@
 
     """
+    return hash_to_hex(model.ExtID.from_dict(extid).id)
+
+
+def extid_git_object(extid: model.ExtID) -> bytes:
     headers = [
-        (b"extid_type", extid["extid_type"].encode("ascii")),
+        (b"extid_type", extid.extid_type.encode("ascii")),
     ]
-    extid_version = extid.get("extid_version", 0)
+    extid_version = extid.extid_version
     if extid_version != 0:
         headers.append((b"extid_version", str(extid_version).encode("ascii")))
 
     headers.extend(
-        [(b"extid", extid["extid"]), (b"target", str(extid["target"]).encode("ascii")),]
+        [(b"extid", extid.extid), (b"target", str(extid.target).encode("ascii")),]
     )
 
-    git_object = format_git_object_from_headers("extid", headers)
-    return hashlib.new("sha1", git_object).hexdigest()
+    return format_git_object_from_headers("extid", headers)
diff --git a/swh/model/model.py b/swh/model/model.py
--- a/swh/model/model.py
+++ b/swh/model/model.py
@@ -6,7 +6,7 @@
 from abc import ABCMeta, abstractmethod
 import datetime
 from enum import Enum
-from hashlib import sha256
+import hashlib
 from typing import Any, Dict, Iterable, Optional, Tuple, TypeVar, Union
 
 import attr
@@ -15,18 +15,9 @@
 import iso8601
 from typing_extensions import Final
 
+from . import identifiers
 from .collections import ImmutableDict
-from .hashutil import DEFAULT_ALGORITHMS, MultiHash, hash_to_bytes
-from .identifiers import (
-    directory_identifier,
-    extid_identifier,
-    normalize_timestamp,
-    origin_identifier,
-    raw_extrinsic_metadata_identifier,
-    release_identifier,
-    revision_identifier,
-    snapshot_identifier,
-)
+from .hashutil import DEFAULT_ALGORITHMS, MultiHash
 from .swhids import CoreSWHID
 from .swhids import ExtendedObjectType as SwhidExtendedObjectType
 from .swhids import ExtendedSWHID
@@ -193,7 +184,29 @@
         Anonymization is simply a Person which fullname is the hashed, with unset name
         or email.
""" - return Person(fullname=sha256(self.fullname).digest(), name=None, email=None,) + return Person( + fullname=hashlib.sha256(self.fullname).digest(), name=None, email=None, + ) + + @classmethod + def from_dict(cls, d): + """ + If the fullname is missing, construct a fullname + using the following heuristics: if the name value is None, we return the + email in angle brackets, else, we return the name, a space, and the email + in angle brackets. + """ + if "fullname" not in d: + parts = [] + if d["name"] is not None: + parts.append(d["name"]) + if d["email"] is not None: + parts.append(b"".join([b"<", d["email"], b">"])) + + fullname = b" ".join(parts) + d = {**d, "fullname": fullname} + d = {"name": None, "email": None, **d} + return super().from_dict(d) @attr.s(frozen=True, slots=True) @@ -243,16 +256,60 @@ raise ValueError("negative_utc can only be True is offset=0") @classmethod - def from_dict(cls, obj: Union[Dict, datetime.datetime, int]): + def from_dict(cls, time_representation: Union[Dict, datetime.datetime, int]): """Builds a TimestampWithTimezone from any of the formats accepted by :func:`swh.model.normalize_timestamp`.""" # TODO: this accept way more types than just dicts; find a better # name - d = normalize_timestamp(obj) + negative_utc = False + + if isinstance(time_representation, dict): + ts = time_representation["timestamp"] + if isinstance(ts, dict): + seconds = ts.get("seconds", 0) + microseconds = ts.get("microseconds", 0) + elif isinstance(ts, int): + seconds = ts + microseconds = 0 + else: + raise ValueError( + "normalize_timestamp received non-integer timestamp member:" + " %r" % ts + ) + offset = time_representation["offset"] + if "negative_utc" in time_representation: + negative_utc = time_representation["negative_utc"] + if negative_utc is None: + negative_utc = False + elif isinstance(time_representation, datetime.datetime): + microseconds = time_representation.microsecond + if microseconds: + time_representation = time_representation.replace(microsecond=0) + seconds = int(time_representation.timestamp()) + utcoffset = time_representation.utcoffset() + if utcoffset is None: + raise ValueError( + "normalize_timestamp received datetime without timezone: %s" + % time_representation + ) + + # utcoffset is an integer number of minutes + seconds_offset = utcoffset.total_seconds() + offset = int(seconds_offset) // 60 + elif isinstance(time_representation, int): + seconds = time_representation + microseconds = 0 + offset = 0 + else: + raise ValueError( + "normalize_timestamp received non-integer timestamp:" + " %r" % time_representation + ) + return cls( - timestamp=Timestamp.from_dict(d["timestamp"]), - offset=d["offset"], - negative_utc=d["negative_utc"], + timestamp=Timestamp(seconds=seconds, microseconds=microseconds), + offset=offset, + negative_utc=negative_utc, ) @classmethod @@ -286,21 +343,25 @@ @attr.s(frozen=True, slots=True) -class Origin(BaseModel): +class Origin(HashableObject, BaseModel): """Represents a software source: a VCS and an URL.""" object_type: Final = "origin" url = attr.ib(type=str, validator=type_validator()) + id = attr.ib(type=Sha1Git, validator=type_validator(), default=b"") + def unique_key(self) -> KeyType: return {"url": self.url} + def compute_hash(self) -> bytes: + return hashlib.sha1(self.url.encode("utf-8")).digest() + def swhid(self) -> ExtendedSWHID: """Returns a SWHID representing this origin.""" return ExtendedSWHID( - object_type=SwhidExtendedObjectType.ORIGIN, - object_id=hash_to_bytes(origin_identifier(self.unique_key())), + 
         )
 
 
@@ -431,7 +492,8 @@
     id = attr.ib(type=Sha1Git, validator=type_validator(), default=b"")
 
     def compute_hash(self) -> bytes:
-        return hash_to_bytes(snapshot_identifier(self.to_dict()))
+        git_object = identifiers.snapshot_git_object(self)
+        return hashlib.new("sha1", git_object).digest()
 
     @classmethod
     def from_dict(cls, d):
@@ -471,7 +533,8 @@
     id = attr.ib(type=Sha1Git, validator=type_validator(), default=b"")
 
     def compute_hash(self) -> bytes:
-        return hash_to_bytes(release_identifier(self.to_dict()))
+        git_object = identifiers.release_git_object(self)
+        return hashlib.new("sha1", git_object).digest()
 
     @author.validator
     def check_author(self, attribute, value):
@@ -564,7 +627,8 @@
         object.__setattr__(self, "metadata", metadata)
 
     def compute_hash(self) -> bytes:
-        return hash_to_bytes(revision_identifier(self.to_dict()))
+        git_object = identifiers.revision_git_object(self)
+        return hashlib.new("sha1", git_object).digest()
 
     @classmethod
     def from_dict(cls, d):
@@ -621,7 +685,8 @@
     id = attr.ib(type=Sha1Git, validator=type_validator(), default=b"")
 
     def compute_hash(self) -> bytes:
-        return hash_to_bytes(directory_identifier(self.to_dict()))
+        git_object = identifiers.directory_git_object(self)
+        return hashlib.new("sha1", git_object).digest()
 
     @classmethod
     def from_dict(cls, d):
@@ -950,7 +1015,8 @@
     id = attr.ib(type=Sha1Git, validator=type_validator(), default=b"")
 
     def compute_hash(self) -> bytes:
-        return hash_to_bytes(raw_extrinsic_metadata_identifier(self.to_dict()))
+        git_object = identifiers.raw_extrinsic_metadata_git_object(self)
+        return hashlib.new("sha1", git_object).digest()
 
     @origin.validator
     def check_origin(self, attribute, value):
@@ -1150,4 +1216,5 @@
     )
 
     def compute_hash(self) -> bytes:
-        return hash_to_bytes(extid_identifier(self.to_dict()))
+        git_object = identifiers.extid_git_object(self)
+        return hashlib.new("sha1", git_object).digest()
diff --git a/swh/model/tests/test_identifiers.py b/swh/model/tests/test_identifiers.py
--- a/swh/model/tests/test_identifiers.py
+++ b/swh/model/tests/test_identifiers.py
@@ -794,11 +794,11 @@
             **self.minimal,
             "origin": "https://forge.softwareheritage.org/source/swh-model/",
             "visit": 42,
-            "snapshot": CoreSWHID.from_string("swh:1:snp:" + "00" * 20),
-            "release": CoreSWHID.from_string("swh:1:rel:" + "01" * 20),
-            "revision": CoreSWHID.from_string("swh:1:rev:" + "02" * 20),
+            "snapshot": "swh:1:snp:" + "00" * 20,
+            "release": "swh:1:rel:" + "01" * 20,
+            "revision": "swh:1:rev:" + "02" * 20,
             "path": b"/abc/def",
-            "directory": CoreSWHID.from_string("swh:1:dir:" + "03" * 20),
+            "directory": "swh:1:dir:" + "03" * 20,
         }
 
     def test_minimal(self):
@@ -1812,9 +1812,7 @@
         extid_dict = {
            "extid_type": "test-type",
            "extid": b"extid",
-           "target": ExtendedSWHID(
-               object_type=ExtendedObjectType.DIRECTORY, object_id=b"\x00" * 20
-           ),
+           "target": "swh:1:dir:" + "00" * 20,
        }
 
        assert (
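
For illustration only (not part of the patch): after this change, the dict-based `*_identifier` helpers in `swh.model.identifiers` become thin wrappers around the hashes computed by the classes in `swh.model.model`, and the `*_git_object` helpers accept either a model object or, for backward compatibility, a plain dict. A minimal sketch under those assumptions; the origin URL and the empty snapshot below are made-up example values.

    # Illustrative sketch, not part of the patch; assumes swh.model as patched above.
    from swh.model import identifiers, model
    from swh.model.hashutil import hash_to_hex

    # Dict-based identifier computation keeps working (backward-compatible wrapper)...
    origin_dict = {"url": "https://example.org/repo.git"}  # made-up URL
    legacy_hex_id = identifiers.origin_identifier(origin_dict)

    # ...and now matches the hash computed by the model class itself.
    origin = model.Origin.from_dict(origin_dict)
    assert legacy_hex_id == hash_to_hex(origin.id)

    # The *_git_object helpers accept a model object or, for backward
    # compatibility, a plain dict, via the isinstance() guards added above.
    empty_snapshot = {"branches": {}}  # made-up, trivially valid snapshot
    manifest_from_dict = identifiers.snapshot_git_object(empty_snapshot)
    manifest_from_model = identifiers.snapshot_git_object(
        model.Snapshot.from_dict(empty_snapshot)
    )
    assert manifest_from_dict == manifest_from_model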