diff --git a/swh/model/cli.py b/swh/model/cli.py --- a/swh/model/cli.py +++ b/swh/model/cli.py @@ -26,7 +26,7 @@ swh_cli_group = click # type: ignore from swh.model.from_disk import Directory -from swh.model.identifiers import CoreSWHID, ObjectType +from swh.model.swhids import CoreSWHID CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"]) @@ -42,7 +42,7 @@ class CoreSWHIDParamType(click.ParamType): """Click argument that accepts a core SWHID and returns them as - :class:`swh.model.identifiers.CoreSWHID` instances """ + :class:`swh.model.swhids.CoreSWHID` instances """ name = "SWHID" @@ -87,17 +87,9 @@ def swhid_of_origin(url): - from swh.model.hashutil import hash_to_bytes - from swh.model.identifiers import ( - ExtendedObjectType, - ExtendedSWHID, - origin_identifier, - ) + from swh.model.model import Origin - return ExtendedSWHID( - object_type=ExtendedObjectType.ORIGIN, - object_id=hash_to_bytes(origin_identifier({"url": url})), - ) + return Origin(url).swhid() def swhid_of_git_repo(path) -> CoreSWHID: @@ -110,7 +102,7 @@ ) from swh.model import hashutil - from swh.model.identifiers import snapshot_identifier + from swh.model.model import Snapshot repo = dulwich.repo.Repo(path) @@ -133,10 +125,7 @@ snapshot = {"branches": branches} - return CoreSWHID( - object_type=ObjectType.SNAPSHOT, - object_id=hashutil.hash_to_bytes(snapshot_identifier(snapshot)), - ) + return Snapshot.from_dict(snapshot).swhid() def identify_object( diff --git a/swh/model/from_disk.py b/swh/model/from_disk.py --- a/swh/model/from_disk.py +++ b/swh/model/from_disk.py @@ -18,14 +18,10 @@ from . 
import model from .exceptions import InvalidDirectoryPath -from .hashutil import MultiHash, hash_to_bytes, hash_to_hex -from .identifiers import ( - CoreSWHID, - ObjectType, - directory_entry_sort_key, - directory_identifier, -) +from .git_objects import directory_entry_sort_key +from .hashutil import MultiHash, hash_to_hex from .merkle import MerkleLeaf, MerkleNode +from .swhids import CoreSWHID, ObjectType @attr.s(frozen=True, slots=True) @@ -477,8 +473,8 @@ @property def entries(self): - """Child nodes, sorted by name in the same way `directory_identifier` - does.""" + """Child nodes, sorted by name in the same way + :func:`swh.model.git_objects.directory_git_object` does.""" if self.__entries is None: self.__entries = sorted( ( @@ -496,7 +492,7 @@ return CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=self.hash) def compute_hash(self): - return hash_to_bytes(directory_identifier({"entries": self.entries})) + return model.Directory.from_dict({"entries": self.entries}).id def to_model(self) -> model.Directory: """Builds a `model.Directory` object based on this node; diff --git a/swh/model/git_objects.py b/swh/model/git_objects.py --- a/swh/model/git_objects.py +++ b/swh/model/git_objects.py @@ -27,7 +27,7 @@ @lru_cache() def _perms_to_bytes(perms): - """Convert the perms value to its bytes representation""" + """Convert the perms value to its canonical bytes representation""" oc = oct(perms)[2:] return oc.encode("ascii") @@ -117,7 +117,6 @@ UTC - negative_utc: a boolean representing whether the offset is -0000 when offset = 0. - """ if time_representation is None: return None @@ -126,6 +125,41 @@ def directory_git_object(directory: model.Directory) -> bytes: + """Formats a directory as a git tree. + + A directory's identifier is the tree sha1 à la git of a directory listing, + using the following algorithm, which is equivalent to the git algorithm for + trees: + + 1. 
Entries of the directory are sorted using the name (or the name with '/' + appended for directory entries) as key, in bytes order. + + 2. For each entry of the directory, the following bytes are output: + + - the octal representation of the permissions for the entry (stored in + the 'perms' member), which is a representation of the entry type: + + - b'100644' (int 33188) for files + - b'100755' (int 33261) for executable files + - b'120000' (int 40960) for symbolic links + - b'40000' (int 16384) for directories + - b'160000' (int 57344) for references to revisions + + - an ascii space (b'\\x20') + - the entry's name (as raw bytes), stored in the 'name' member + - a null byte (b'\\x00') + - the 20 byte long identifier of the object pointed at by the entry, + stored in the 'target' member: + + - for files or executable files: their blob sha1_git + - for symbolic links: the blob sha1_git of a file containing the link + destination + - for directories: their intrinsic identifier + - for revisions: their intrinsic identifier + + (Note that there is no separator between entries) + + """ if isinstance(directory, dict): # For backward compatibility directory = model.Directory.from_dict(directory) @@ -219,7 +253,6 @@ Returns: the byte string containing the authorship data - """ ret = [author.fullname] @@ -234,8 +267,55 @@ def revision_git_object(revision: model.Revision) -> bytes: - """Formats the git_object of a revision. See :func:`revision_identifier` for details - on the format.""" + """Formats a revision as a git commit. 
+ + The fields used for the revision identifier computation are: + + - directory + - parents + - author + - author_date + - committer + - committer_date + - extra_headers or metadata -> extra_headers + - message + + A revision's identifier is the 'git'-checksum of a commit manifest + constructed as follows (newlines are a single ASCII newline character):: + + tree + [for each parent in parents] + parent + [end for each parents] + author + committer + [for each key, value in extra_headers] + + [end for each extra_headers] + + + + The directory identifier is the ascii representation of its hexadecimal + encoding. + + Author and committer are formatted using the :attr:`Person.fullname` attribute only. + Dates are formatted with the :func:`format_offset` function. + + Extra headers are an ordered list of [key, value] pairs. Keys are strings + and get encoded to utf-8 for identifier computation. Values are either byte + strings, unicode strings (that get encoded to utf-8), or integers (that get + encoded to their utf-8 decimal representation). + + Multiline extra header values are escaped by indenting the continuation + lines with one ascii space. + + If the message is None, the manifest ends with the last header. Else, the + message is appended to the headers after an empty line. + + The checksum of the full manifest is computed using the 'commit' git object + type. + + """ if isinstance(revision, dict): # For backward compatibility revision = model.Revision.from_dict(revision) @@ -290,8 +370,50 @@ def snapshot_git_object(snapshot: model.Snapshot) -> bytes: - """Formats the git_object of a revision. See :func:`snapshot_identifier` for details - on the format.""" + """Formats a snapshot as a git-like object. + + Snapshots are a set of named branches, which are pointers to objects at any + level of the Software Heritage DAG. 
+ + As well as pointing to other objects in the Software Heritage DAG, branches + can also be *alias*es, in which case their target is the name of another + branch in the same snapshot, or *dangling*, in which case the target is + unknown (and represented by the ``None`` value). + + A snapshot identifier is a salted sha1 (using the git hashing algorithm + with the ``snapshot`` object type) of a manifest following the algorithm: + + 1. Branches are sorted using the name as key, in bytes order. + + 2. For each branch, the following bytes are output: + + - the type of the branch target: + + - ``content``, ``directory``, ``revision``, ``release`` or ``snapshot`` + for the corresponding entries in the DAG; + - ``alias`` for branches referencing another branch; + - ``dangling`` for dangling branches + + - an ascii space (``\\x20``) + - the branch name (as raw bytes) + - a null byte (``\\x00``) + - the length of the target identifier, as an ascii-encoded decimal number + (``20`` for current intrinsic identifiers, ``0`` for dangling + branches, the length of the target branch name for branch aliases) + - a colon (``:``) + - the identifier of the target object pointed at by the branch, + stored in the 'target' member: + + - for contents: their *sha1_git* + - for directories, revisions, releases or snapshots: their intrinsic + identifier + - for branch aliases, the name of the target branch (as raw bytes) + - for dangling branches, the empty string + + Note that, akin to directory manifests, there is no separator between + entries. Because of symbolic branches, identifiers are of arbitrary + length but are length-encoded to avoid ambiguity. + """ if isinstance(snapshot, dict): # For backward compatibility snapshot = model.Snapshot.from_dict(snapshot) @@ -334,9 +456,47 @@ def raw_extrinsic_metadata_git_object(metadata: model.RawExtrinsicMetadata) -> bytes: - """Formats the git_object of a raw_extrinsic_metadata object. 
- See :func:`raw_extrinsic_metadata_identifier` for details - on the format.""" + """Formats RawExtrinsicMetadata as a git-like object. + + A raw_extrinsic_metadata identifier is a salted sha1 (using the git + hashing algorithm with the ``raw_extrinsic_metadata`` object type) of + a manifest following the format:: + + target $ExtendedSwhid + discovery_date $Timestamp + authority $StrWithoutSpaces $IRI + fetcher $Str $Version + format $StrWithoutSpaces + origin $IRI <- optional + visit $IntInDecimal <- optional + snapshot $CoreSwhid <- optional + release $CoreSwhid <- optional + revision $CoreSwhid <- optional + path $Bytes <- optional + directory $CoreSwhid <- optional + + $MetadataBytes + + $IRI must be RFC 3987 IRIs (so they may contain newlines, that are escaped as + described below) + + $StrWithoutSpaces and $Version are ASCII strings, and may not contain spaces. + + $Str is an UTF-8 string. + + $CoreSwhid are core SWHIDs, as defined in :ref:`persistent-identifiers`. + $ExtendedSwhid is a core SWHID, with extra types allowed ('ori' for + origins and 'emd' for raw extrinsic metadata) + + $Timestamp is a decimal representation of the rounded-down integer number of + seconds since the UNIX epoch (1970-01-01 00:00:00 UTC), + with no leading '0' (unless the timestamp value is zero) and no timezone. + It may be negative by prefixing it with a '-', which must not be followed + by a '0'. + + Newlines in $Bytes, $Str, and $IRI are escaped as with other git fields, + ie. by adding a space after them. + """ if isinstance(metadata, dict): # For backward compatibility metadata = model.RawExtrinsicMetadata.from_dict(metadata) @@ -389,6 +549,26 @@ def extid_git_object(extid: model.ExtID) -> bytes: + """Formats an extid as a git-like object. 
+ + An ExtID identifier is a salted sha1 (using the git hashing algorithm with + the ``extid`` object type) of a manifest following the format: + + ``` + extid_type $StrWithoutSpaces + [extid_version $Str] + extid $Bytes + target $CoreSwhid + ``` + + $StrWithoutSpaces is an ASCII string, and may not contain spaces. + + Newlines in $Bytes are escaped as with other git fields, ie. by adding a + space after them. + + The extid_version line is only generated if the version is non-zero. + """ + headers = [ (b"extid_type", extid.extid_type.encode("ascii")), ] diff --git a/swh/model/hypothesis_strategies.py b/swh/model/hypothesis_strategies.py --- a/swh/model/hypothesis_strategies.py +++ b/swh/model/hypothesis_strategies.py @@ -29,8 +29,6 @@ ) from .from_disk import DentryPerms -from .hashutil import hash_to_bytes -from .identifiers import ExtendedObjectType, ExtendedSWHID, snapshot_identifier from .model import ( BaseContent, Content, @@ -54,6 +52,7 @@ Timestamp, TimestampWithTimezone, ) +from .swhids import ExtendedObjectType, ExtendedSWHID pgsql_alphabet = characters( blacklist_categories=("Cs",), blacklist_characters=["\u0000"] @@ -396,7 +395,7 @@ # Ensure no cycles between aliases while True: try: - id_ = snapshot_identifier( + snapshot = Snapshot.from_dict( { "branches": { name: branch or None for (name, branch) in branches.items() @@ -409,7 +408,7 @@ else: break - return dict(id=hash_to_bytes(id_), branches=branches) + return snapshot.to_dict() def snapshots(*, min_size=0, max_size=100, only_objects=False): diff --git a/swh/model/identifiers.py b/swh/model/identifiers.py --- a/swh/model/identifiers.py +++ b/swh/model/identifiers.py @@ -3,9 +3,8 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from __future__ import annotations - from typing import Any, Dict +import warnings from . 
import model @@ -16,6 +15,14 @@ # Reexport for backward compatibility from .swhids import * # noqa +warnings.warn( + "The swh.model.identifiers module is deprecated. " + "SWHID-related classes were moved to swh.model.swhids, and identifier " + "computation is now done directly with swh.model.model classes.", + DeprecationWarning, + stacklevel=2, +) + # The following are deprecated aliases of the variants defined in ObjectType # while transitioning from SWHID to QualifiedSWHID ORIGIN = "origin" @@ -28,260 +35,57 @@ def content_identifier(content: Dict[str, Any]) -> Dict[str, bytes]: - """Return the intrinsic identifier for a content. - - A content's identifier is the sha1, sha1_git and sha256 checksums of its - data. - - Args: - content: a content conforming to the Software Heritage schema - - Returns: - A dictionary with all the hashes for the data - - Raises: - KeyError: if the content doesn't have a data member. - + """Deprecated, use :class:`swh.model.Content` instead: + ``content_identifier(d)`` is equivalent to: + ``Content.from_data(d["data"]).hashes()`` """ - return MultiHash.from_data(content["data"]).digest() def directory_identifier(directory: Dict[str, Any]) -> str: - """Return the intrinsic identifier for a directory. - - A directory's identifier is the tree sha1 à la git of a directory listing, - using the following algorithm, which is equivalent to the git algorithm for - trees: + """Deprecated, use :class:`swh.model.Directory` instead: + ``directory_identifier(d)`` is equivalent to: + ``hash_to_hex(Directory.from_dict(d).id)``. - 1. Entries of the directory are sorted using the name (or the name with '/' - appended for directory entries) as key, in bytes order. - - 2. 
For each entry of the directory, the following bytes are output: - - - the octal representation of the permissions for the entry (stored in - the 'perms' member), which is a representation of the entry type: - - - b'100644' (int 33188) for files - - b'100755' (int 33261) for executable files - - b'120000' (int 40960) for symbolic links - - b'40000' (int 16384) for directories - - b'160000' (int 57344) for references to revisions - - - an ascii space (b'\x20') - - the entry's name (as raw bytes), stored in the 'name' member - - a null byte (b'\x00') - - the 20 byte long identifier of the object pointed at by the entry, - stored in the 'target' member: - - - for files or executable files: their blob sha1_git - - for symbolic links: the blob sha1_git of a file containing the link - destination - - for directories: their intrinsic identifier - - for revisions: their intrinsic identifier - - (Note that there is no separator between entries) - - """ + See :func:`swh.model.git_objects.directory_git_object` for details of the + format used to generate this identifier.""" return hash_to_hex(model.Directory.from_dict(directory).id) def revision_identifier(revision: Dict[str, Any]) -> str: - """Return the intrinsic identifier for a revision. - - The fields used for the revision identifier computation are: + """Deprecated, use :class:`swh.model.Revision` instead: + ``revision_identifier(d)`` is equivalent to: + ``hash_to_hex(Revision.from_dict(d).id)``. 
- - directory - - parents - - author - - author_date - - committer - - committer_date - - extra_headers or metadata -> extra_headers - - message - - A revision's identifier is the 'git'-checksum of a commit manifest - constructed as follows (newlines are a single ASCII newline character):: - - tree - [for each parent in parents] - parent - [end for each parents] - author - committer - [for each key, value in extra_headers] - - [end for each extra_headers] - - - - The directory identifier is the ascii representation of its hexadecimal - encoding. - - Author and committer are formatted using the :attr:`Person.fullname` attribute only. - Dates are formatted with the :func:`format_offset` function. - - Extra headers are an ordered list of [key, value] pairs. Keys are strings - and get encoded to utf-8 for identifier computation. Values are either byte - strings, unicode strings (that get encoded to utf-8), or integers (that get - encoded to their utf-8 decimal representation). - - Multiline extra header values are escaped by indenting the continuation - lines with one ascii space. - - If the message is None, the manifest ends with the last header. Else, the - message is appended to the headers after an empty line. - - The checksum of the full manifest is computed using the 'commit' git object - type. - - """ + See :func:`swh.model.git_objects.revision_git_object` for details of the + format used to generate this identifier.""" return hash_to_hex(model.Revision.from_dict(revision).id) def release_identifier(release: Dict[str, Any]) -> str: - """Return the intrinsic identifier for a release.""" + """Deprecated, use :class:`swh.model.Release` instead: + ``release_identifier(d)`` is equivalent to: + ``hash_to_hex(Release.from_dict(d).id)``. 
+ + See :func:`swh.model.git_objects.release_git_object` for details of the + format used to generate this identifier.""" return hash_to_hex(model.Release.from_dict(release).id) def snapshot_identifier(snapshot: Dict[str, Any]) -> str: - """Return the intrinsic identifier for a snapshot. - - Snapshots are a set of named branches, which are pointers to objects at any - level of the Software Heritage DAG. - - As well as pointing to other objects in the Software Heritage DAG, branches - can also be *alias*es, in which case their target is the name of another - branch in the same snapshot, or *dangling*, in which case the target is - unknown (and represented by the ``None`` value). - - A snapshot identifier is a salted sha1 (using the git hashing algorithm - with the ``snapshot`` object type) of a manifest following the algorithm: - - 1. Branches are sorted using the name as key, in bytes order. - - 2. For each branch, the following bytes are output: - - - the type of the branch target: + """Deprecated, use :class:`swh.model.Snapshot` instead: + ``snapshot_identifier(d)`` is equivalent to: + ``hash_to_hex(Snapshot.from_dict(d).id)``. 
- - ``content``, ``directory``, ``revision``, ``release`` or ``snapshot`` - for the corresponding entries in the DAG; - - ``alias`` for branches referencing another branch; - - ``dangling`` for dangling branches - - - an ascii space (``\\x20``) - - the branch name (as raw bytes) - - a null byte (``\\x00``) - - the length of the target identifier, as an ascii-encoded decimal number - (``20`` for current intrinsic identifiers, ``0`` for dangling - branches, the length of the target branch name for branch aliases) - - a colon (``:``) - - the identifier of the target object pointed at by the branch, - stored in the 'target' member: - - - for contents: their *sha1_git* - - for directories, revisions, releases or snapshots: their intrinsic - identifier - - for branch aliases, the name of the target branch (as raw bytes) - - for dangling branches, the empty string - - Note that, akin to directory manifests, there is no separator between - entries. Because of symbolic branches, identifiers are of arbitrary - length but are length-encoded to avoid ambiguity. - - Args: - snapshot (dict): the snapshot of which to compute the identifier. A - single entry is needed, ``'branches'``, which is itself a :class:`dict` - mapping each branch to its target - - Returns: - str: the intrinsic identifier for `snapshot` - - """ + See :func:`swh.model.git_objects.snapshot_git_object` for details of the + format used to generate this identifier.""" return hash_to_hex(model.Snapshot.from_dict(snapshot).id) def origin_identifier(origin): - """Return the intrinsic identifier for an origin. - - An origin's identifier is the sha1 checksum of the entire origin URL - + """Deprecated, use :class:`swh.model.Origin` instead: + ``origin_identifier({"url": url})`` is equivalent to: + ``hash_to_hex(Origin(url=url).id)``. 
""" - return hash_to_hex(model.Origin.from_dict(origin).id) - - -def raw_extrinsic_metadata_identifier(metadata: Dict[str, Any]) -> str: - """Return the intrinsic identifier for a RawExtrinsicMetadata object. - - A raw_extrinsic_metadata identifier is a salted sha1 (using the git - hashing algorithm with the ``raw_extrinsic_metadata`` object type) of - a manifest following the format:: - - target $ExtendedSwhid - discovery_date $Timestamp - authority $StrWithoutSpaces $IRI - fetcher $Str $Version - format $StrWithoutSpaces - origin $IRI <- optional - visit $IntInDecimal <- optional - snapshot $CoreSwhid <- optional - release $CoreSwhid <- optional - revision $CoreSwhid <- optional - path $Bytes <- optional - directory $CoreSwhid <- optional - - $MetadataBytes - - $IRI must be RFC 3987 IRIs (so they may contain newlines, that are escaped as - described below) - - $StrWithoutSpaces and $Version are ASCII strings, and may not contain spaces. - - $Str is an UTF-8 string. - $CoreSwhid are core SWHIDs, as defined in :ref:`persistent-identifiers`. - $ExtendedSwhid is a core SWHID, with extra types allowed ('ori' for - origins and 'emd' for raw extrinsic metadata) - - $Timestamp is a decimal representation of the rounded-down integer number of - seconds since the UNIX epoch (1970-01-01 00:00:00 UTC), - with no leading '0' (unless the timestamp value is zero) and no timezone. - It may be negative by prefixing it with a '-', which must not be followed - by a '0'. - - Newlines in $Bytes, $Str, and $Iri are escaped as with other git fields, - ie. by adding a space after them. - - Returns: - str: the intrinsic identifier for ``metadata`` - - """ - return hash_to_hex(model.RawExtrinsicMetadata.from_dict(metadata).id) - - -def extid_identifier(extid: Dict[str, Any]) -> str: - """Return the intrinsic identifier for an ExtID object. 
- - An ExtID identifier is a salted sha1 (using the git hashing algorithm with - the ``extid`` object type) of a manifest following the format: - - ``` - extid_type $StrWithoutSpaces - [extid_version $Str] - extid $Bytes - target $CoreSwhid - ``` - - $StrWithoutSpaces is an ASCII string, and may not contain spaces. - - Newlines in $Bytes are escaped as with other git fields, ie. by adding a - space after them. - - The extid_version line is only generated if the version is non-zero. - - Returns: - str: the intrinsic identifier for `extid` - - """ - - return hash_to_hex(model.ExtID.from_dict(extid).id) + return hash_to_hex(model.Origin.from_dict(origin).id) diff --git a/swh/model/tests/swh_model_data.py b/swh/model/tests/swh_model_data.py --- a/swh/model/tests/swh_model_data.py +++ b/swh/model/tests/swh_model_data.py @@ -9,7 +9,6 @@ import attr from swh.model.hashutil import MultiHash, hash_to_bytes -from swh.model.identifiers import ExtendedSWHID from swh.model.model import ( BaseModel, Content, @@ -35,6 +34,7 @@ Timestamp, TimestampWithTimezone, ) +from swh.model.swhids import ExtendedSWHID UTC = datetime.timezone.utc diff --git a/swh/model/tests/test_identifiers.py b/swh/model/tests/test_identifiers.py --- a/swh/model/tests/test_identifiers.py +++ b/swh/model/tests/test_identifiers.py @@ -12,18 +12,27 @@ import attr import pytest -from swh.model import hashutil, identifiers +from swh.model import git_objects, hashutil from swh.model.exceptions import ValidationError from swh.model.hashutil import hash_to_bytes as _x -from swh.model.hashutil import hash_to_hex -from swh.model.identifiers import ( +from swh.model.model import ( + Content, + Directory, + ExtID, + Origin, + RawExtrinsicMetadata, + Release, + Revision, + Snapshot, + TimestampWithTimezone, +) +from swh.model.swhids import ( SWHID_QUALIFIERS, CoreSWHID, ExtendedObjectType, ExtendedSWHID, ObjectType, QualifiedSWHID, - normalize_timestamp, ) @@ -51,11 +60,11 @@ def test_format_date(self): for date_repr, 
date in self.dates.items(): - self.assertEqual(identifiers.format_date(date), date_repr) + self.assertEqual(git_objects.format_date(date), date_repr) def test_format_offset(self): for offset, res in self.offsets.items(): - self.assertEqual(identifiers.format_offset(offset), res) + self.assertEqual(git_objects.format_offset(offset), res) content_example = { @@ -72,7 +81,7 @@ def test_content_identifier(self): self.assertEqual( - identifiers.content_identifier(content_example), self.content_id + Content.from_data(content_example["data"]).hashes(), self.content_id ) @@ -199,26 +208,22 @@ } def test_dir_identifier(self): + self.assertEqual(Directory.from_dict(self.directory).id, self.directory["id"]) self.assertEqual( - _x(identifiers.directory_identifier(self.directory)), self.directory["id"] - ) - self.assertEqual( - _x(identifiers.directory_identifier(remove_id(self.directory))), - self.directory["id"], + Directory.from_dict(remove_id(self.directory)).id, self.directory["id"], ) def test_dir_identifier_entry_order(self): # Reverse order of entries, check the id is still the same. 
directory = {"entries": reversed(self.directory["entries"])} self.assertEqual( - _x(identifiers.directory_identifier(remove_id(directory))), - self.directory["id"], + Directory.from_dict(remove_id(directory)).id, self.directory["id"], ) def test_dir_identifier_empty_directory(self): self.assertEqual( - identifiers.directory_identifier(remove_id(self.empty_directory)), - self.empty_directory["id"], + Directory.from_dict(remove_id(self.empty_directory)).id, + _x(self.empty_directory["id"]), ) @@ -270,7 +275,7 @@ self.revision = revision_example self.revision_none_metadata = { - "id": "bc0195aad0daa2ad5b0d76cce22b167bc3435590", + "id": _x("bc0195aad0daa2ad5b0d76cce22b167bc3435590"), "directory": _x("85a74718d377195e1efd0843ba4f3260bad4fe07"), "parents": [_x("01e2d0627a9a6edb24c37db45db5ecb31e9de808")], "author": { @@ -328,7 +333,7 @@ # cat commit.txt | git hash-object -t commit --stdin self.revision_with_extra_headers = { - "id": "010d34f384fa99d047cdd5e2f41e56e5c2feee45", + "id": _x("010d34f384fa99d047cdd5e2f41e56e5c2feee45"), "directory": _x("85a74718d377195e1efd0843ba4f3260bad4fe07"), "parents": [_x("01e2d0627a9a6edb24c37db45db5ecb31e9de808")], "author": { @@ -355,7 +360,7 @@ } self.revision_with_gpgsig = { - "id": "44cc742a8ca17b9c279be4cc195a93a6ef7a320e", + "id": _x("44cc742a8ca17b9c279be4cc195a93a6ef7a320e"), "directory": _x("b134f9b7dc434f593c0bab696345548b37de0558"), "parents": [ _x("689664ae944b4692724f13b709a4e4de28b54e57"), @@ -380,7 +385,7 @@ } self.revision_no_message = { - "id": "4cfc623c9238fa92c832beed000ce2d003fd8333", + "id": _x("4cfc623c9238fa92c832beed000ce2d003fd8333"), "directory": _x("b134f9b7dc434f593c0bab696345548b37de0558"), "parents": [ _x("689664ae944b4692724f13b709a4e4de28b54e57"), @@ -400,7 +405,7 @@ } self.revision_empty_message = { - "id": "7442cd78bd3b4966921d6a7f7447417b7acb15eb", + "id": _x("7442cd78bd3b4966921d6a7f7447417b7acb15eb"), "directory": _x("b134f9b7dc434f593c0bab696345548b37de0558"), "parents": [ 
_x("689664ae944b4692724f13b709a4e4de28b54e57"), @@ -420,7 +425,7 @@ } self.revision_only_fullname = { - "id": "010d34f384fa99d047cdd5e2f41e56e5c2feee45", + "id": _x("010d34f384fa99d047cdd5e2f41e56e5c2feee45"), "directory": _x("85a74718d377195e1efd0843ba4f3260bad4fe07"), "parents": [_x("01e2d0627a9a6edb24c37db45db5ecb31e9de808")], "author": {"fullname": b"Linus Torvalds ",}, @@ -442,56 +447,52 @@ def test_revision_identifier(self): self.assertEqual( - identifiers.revision_identifier(self.revision), - hash_to_hex(self.revision["id"]), + Revision.from_dict(self.revision).id, self.revision["id"], ) self.assertEqual( - identifiers.revision_identifier(remove_id(self.revision)), - hash_to_hex(self.revision["id"]), + Revision.from_dict(remove_id(self.revision)).id, self.revision["id"], ) def test_revision_identifier_none_metadata(self): self.assertEqual( - identifiers.revision_identifier(remove_id(self.revision_none_metadata)), - hash_to_hex(self.revision_none_metadata["id"]), + Revision.from_dict(remove_id(self.revision_none_metadata)).id, + self.revision_none_metadata["id"], ) def test_revision_identifier_synthetic(self): self.assertEqual( - identifiers.revision_identifier(remove_id(self.synthetic_revision)), - hash_to_hex(self.synthetic_revision["id"]), + Revision.from_dict(remove_id(self.synthetic_revision)).id, + self.synthetic_revision["id"], ) def test_revision_identifier_with_extra_headers(self): self.assertEqual( - identifiers.revision_identifier( - remove_id(self.revision_with_extra_headers) - ), - hash_to_hex(self.revision_with_extra_headers["id"]), + Revision.from_dict(remove_id(self.revision_with_extra_headers)).id, + self.revision_with_extra_headers["id"], ) def test_revision_identifier_with_gpgsig(self): self.assertEqual( - identifiers.revision_identifier(remove_id(self.revision_with_gpgsig)), - hash_to_hex(self.revision_with_gpgsig["id"]), + Revision.from_dict(remove_id(self.revision_with_gpgsig)).id, + self.revision_with_gpgsig["id"], ) def 
test_revision_identifier_no_message(self): self.assertEqual( - identifiers.revision_identifier(remove_id(self.revision_no_message)), - hash_to_hex(self.revision_no_message["id"]), + Revision.from_dict(remove_id(self.revision_no_message)).id, + self.revision_no_message["id"], ) def test_revision_identifier_empty_message(self): self.assertEqual( - identifiers.revision_identifier(remove_id(self.revision_empty_message)), - hash_to_hex(self.revision_empty_message["id"]), + Revision.from_dict(remove_id(self.revision_empty_message)).id, + self.revision_empty_message["id"], ) def test_revision_identifier_only_fullname(self): self.assertEqual( - identifiers.revision_identifier(remove_id(self.revision_only_fullname)), - hash_to_hex(self.revision_only_fullname["id"]), + Revision.from_dict(remove_id(self.revision_only_fullname)).id, + self.revision_only_fullname["id"], ) @@ -608,48 +609,46 @@ def test_release_identifier(self): self.assertEqual( - identifiers.release_identifier(self.release), - hash_to_hex(self.release["id"]), + Release.from_dict(self.release).id, self.release["id"], ) self.assertEqual( - identifiers.release_identifier(remove_id(self.release)), - hash_to_hex(self.release["id"]), + Release.from_dict(remove_id(self.release)).id, self.release["id"], ) def test_release_identifier_no_author(self): self.assertEqual( - identifiers.release_identifier(remove_id(self.release_no_author)), - hash_to_hex(self.release_no_author["id"]), + Release.from_dict(remove_id(self.release_no_author)).id, + self.release_no_author["id"], ) def test_release_identifier_no_message(self): self.assertEqual( - identifiers.release_identifier(remove_id(self.release_no_message)), - hash_to_hex(self.release_no_message["id"]), + Release.from_dict(remove_id(self.release_no_message)).id, + self.release_no_message["id"], ) def test_release_identifier_empty_message(self): self.assertEqual( - identifiers.release_identifier(remove_id(self.release_empty_message)), - 
hash_to_hex(self.release_empty_message["id"]), + Release.from_dict(remove_id(self.release_empty_message)).id, + self.release_empty_message["id"], ) def test_release_identifier_negative_utc(self): self.assertEqual( - identifiers.release_identifier(remove_id(self.release_negative_utc)), - hash_to_hex(self.release_negative_utc["id"]), + Release.from_dict(remove_id(self.release_negative_utc)).id, + self.release_negative_utc["id"], ) def test_release_identifier_newline_in_author(self): self.assertEqual( - identifiers.release_identifier(remove_id(self.release_newline_in_author)), - hash_to_hex(self.release_newline_in_author["id"]), + Release.from_dict(remove_id(self.release_newline_in_author)).id, + self.release_newline_in_author["id"], ) def test_release_identifier_snapshot_target(self): self.assertEqual( - identifiers.release_identifier(self.release_snapshot_target), - hash_to_hex(self.release_snapshot_target["id"]), + Release.from_dict(self.release_snapshot_target).id, + self.release_snapshot_target["id"], ) @@ -687,17 +686,17 @@ super().setUp() self.empty = { - "id": "1a8893e6a86f444e8be8e7bda6cb34fb1735a00e", + "id": _x("1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"), "branches": {}, } self.dangling_branch = { - "id": "c84502e821eb21ed84e9fd3ec40973abc8b32353", + "id": _x("c84502e821eb21ed84e9fd3ec40973abc8b32353"), "branches": {b"HEAD": None,}, } self.unresolved = { - "id": "84b4548ea486e4b0a7933fa541ff1503a0afe1e0", + "id": _x("84b4548ea486e4b0a7933fa541ff1503a0afe1e0"), "branches": {b"foo": {"target": b"bar", "target_type": "alias",},}, } @@ -705,24 +704,22 @@ def test_empty_snapshot(self): self.assertEqual( - identifiers.snapshot_identifier(remove_id(self.empty)), - hash_to_hex(self.empty["id"]), + Snapshot.from_dict(remove_id(self.empty)).id, self.empty["id"], ) def test_dangling_branch(self): self.assertEqual( - identifiers.snapshot_identifier(remove_id(self.dangling_branch)), - hash_to_hex(self.dangling_branch["id"]), + 
Snapshot.from_dict(remove_id(self.dangling_branch)).id, + self.dangling_branch["id"], ) def test_unresolved(self): with self.assertRaisesRegex(ValueError, "b'foo' -> b'bar'"): - identifiers.snapshot_identifier(remove_id(self.unresolved)) + Snapshot.from_dict(remove_id(self.unresolved)) def test_all_types(self): self.assertEqual( - identifiers.snapshot_identifier(remove_id(self.all_types)), - hash_to_hex(self.all_types["id"]), + Snapshot.from_dict(remove_id(self.all_types)).id, self.all_types["id"], ) @@ -775,15 +772,18 @@ ) self.assertEqual( - identifiers.raw_extrinsic_metadata_git_object(self.minimal), git_object, + git_objects.raw_extrinsic_metadata_git_object( + RawExtrinsicMetadata.from_dict(self.minimal) + ), + git_object, ) self.assertEqual( - identifiers.raw_extrinsic_metadata_identifier(self.minimal), - hashlib.sha1(git_object).hexdigest(), + RawExtrinsicMetadata.from_dict(self.minimal).id, + hashlib.sha1(git_object).digest(), ) self.assertEqual( - identifiers.raw_extrinsic_metadata_identifier(self.minimal), - "5c13f20ba336e44549baf3d7b9305b027ec9f43d", + RawExtrinsicMetadata.from_dict(self.minimal).id, + _x("5c13f20ba336e44549baf3d7b9305b027ec9f43d"), ) def test_maximal(self): @@ -806,15 +806,18 @@ ) self.assertEqual( - identifiers.raw_extrinsic_metadata_git_object(self.maximal), git_object, + git_objects.raw_extrinsic_metadata_git_object( + RawExtrinsicMetadata.from_dict(self.maximal) + ), + git_object, ) self.assertEqual( - identifiers.raw_extrinsic_metadata_identifier(self.maximal), - hashlib.sha1(git_object).hexdigest(), + RawExtrinsicMetadata.from_dict(self.maximal).id, + hashlib.sha1(git_object).digest(), ) self.assertEqual( - identifiers.raw_extrinsic_metadata_identifier(self.maximal), - "f96966e1093d15236a31fde07e47d5b1c9428049", + RawExtrinsicMetadata.from_dict(self.maximal).id, + _x("f96966e1093d15236a31fde07e47d5b1c9428049"), ) def test_nonascii_path(self): @@ -836,15 +839,18 @@ ) self.assertEqual( - 
identifiers.raw_extrinsic_metadata_git_object(metadata), git_object, + git_objects.raw_extrinsic_metadata_git_object( + RawExtrinsicMetadata.from_dict(metadata) + ), + git_object, ) self.assertEqual( - identifiers.raw_extrinsic_metadata_identifier(metadata), - hashlib.sha1(git_object).hexdigest(), + RawExtrinsicMetadata.from_dict(metadata).id, + hashlib.sha1(git_object).digest(), ) self.assertEqual( - identifiers.raw_extrinsic_metadata_identifier(metadata), - "7cc83fd1912176510c083f5df43f01b09af4b333", + RawExtrinsicMetadata.from_dict(metadata).id, + _x("7cc83fd1912176510c083f5df43f01b09af4b333"), ) def test_timezone_insensitive(self): @@ -859,16 +865,20 @@ } self.assertEqual( - identifiers.raw_extrinsic_metadata_git_object(self.minimal), - identifiers.raw_extrinsic_metadata_git_object(metadata), + git_objects.raw_extrinsic_metadata_git_object( + RawExtrinsicMetadata.from_dict(self.minimal) + ), + git_objects.raw_extrinsic_metadata_git_object( + RawExtrinsicMetadata.from_dict(metadata) + ), ) self.assertEqual( - identifiers.raw_extrinsic_metadata_identifier(self.minimal), - identifiers.raw_extrinsic_metadata_identifier(metadata), + RawExtrinsicMetadata.from_dict(self.minimal).id, + RawExtrinsicMetadata.from_dict(metadata).id, ) self.assertEqual( - identifiers.raw_extrinsic_metadata_identifier(metadata), - "5c13f20ba336e44549baf3d7b9305b027ec9f43d", + RawExtrinsicMetadata.from_dict(metadata).id, + _x("5c13f20ba336e44549baf3d7b9305b027ec9f43d"), ) def test_microsecond_insensitive(self): @@ -882,16 +892,20 @@ } self.assertEqual( - identifiers.raw_extrinsic_metadata_git_object(self.minimal), - identifiers.raw_extrinsic_metadata_git_object(metadata), + git_objects.raw_extrinsic_metadata_git_object( + RawExtrinsicMetadata.from_dict(self.minimal) + ), + git_objects.raw_extrinsic_metadata_git_object( + RawExtrinsicMetadata.from_dict(metadata) + ), ) self.assertEqual( - identifiers.raw_extrinsic_metadata_identifier(self.minimal), - 
identifiers.raw_extrinsic_metadata_identifier(metadata), + RawExtrinsicMetadata.from_dict(self.minimal).id, + RawExtrinsicMetadata.from_dict(metadata).id, ) self.assertEqual( - identifiers.raw_extrinsic_metadata_identifier(metadata), - "5c13f20ba336e44549baf3d7b9305b027ec9f43d", + RawExtrinsicMetadata.from_dict(metadata).id, + _x("5c13f20ba336e44549baf3d7b9305b027ec9f43d"), ) def test_noninteger_timezone(self): @@ -906,16 +920,20 @@ } self.assertEqual( - identifiers.raw_extrinsic_metadata_git_object(self.minimal), - identifiers.raw_extrinsic_metadata_git_object(metadata), + git_objects.raw_extrinsic_metadata_git_object( + RawExtrinsicMetadata.from_dict(self.minimal) + ), + git_objects.raw_extrinsic_metadata_git_object( + RawExtrinsicMetadata.from_dict(metadata) + ), ) self.assertEqual( - identifiers.raw_extrinsic_metadata_identifier(self.minimal), - identifiers.raw_extrinsic_metadata_identifier(metadata), + RawExtrinsicMetadata.from_dict(self.minimal).id, + RawExtrinsicMetadata.from_dict(metadata).id, ) self.assertEqual( - identifiers.raw_extrinsic_metadata_identifier(metadata), - "5c13f20ba336e44549baf3d7b9305b027ec9f43d", + RawExtrinsicMetadata.from_dict(metadata).id, + _x("5c13f20ba336e44549baf3d7b9305b027ec9f43d"), ) def test_negative_timestamp(self): @@ -938,15 +956,18 @@ ) self.assertEqual( - identifiers.raw_extrinsic_metadata_git_object(metadata), git_object, + git_objects.raw_extrinsic_metadata_git_object( + RawExtrinsicMetadata.from_dict(metadata) + ), + git_object, ) self.assertEqual( - identifiers.raw_extrinsic_metadata_identifier(metadata), - hashlib.sha1(git_object).hexdigest(), + RawExtrinsicMetadata.from_dict(metadata).id, + hashlib.sha1(git_object).digest(), ) self.assertEqual( - identifiers.raw_extrinsic_metadata_identifier(metadata), - "895d0821a2991dd376ddc303424aceb7c68280f9", + RawExtrinsicMetadata.from_dict(metadata).id, + _x("895d0821a2991dd376ddc303424aceb7c68280f9"), ) def test_epoch(self): @@ -969,15 +990,18 @@ ) self.assertEqual( - 
identifiers.raw_extrinsic_metadata_git_object(metadata), git_object, + git_objects.raw_extrinsic_metadata_git_object( + RawExtrinsicMetadata.from_dict(metadata) + ), + git_object, ) self.assertEqual( - identifiers.raw_extrinsic_metadata_identifier(metadata), - hashlib.sha1(git_object).hexdigest(), + RawExtrinsicMetadata.from_dict(metadata).id, + hashlib.sha1(git_object).digest(), ) self.assertEqual( - identifiers.raw_extrinsic_metadata_identifier(metadata), - "27a53df54ace35ebd910493cdc70b334d6b7cb88", + RawExtrinsicMetadata.from_dict(metadata).id, + _x("27a53df54ace35ebd910493cdc70b334d6b7cb88"), ) def test_negative_epoch(self): @@ -1000,15 +1024,18 @@ ) self.assertEqual( - identifiers.raw_extrinsic_metadata_git_object(metadata), git_object, + git_objects.raw_extrinsic_metadata_git_object( + RawExtrinsicMetadata.from_dict(metadata) + ), + git_object, ) self.assertEqual( - identifiers.raw_extrinsic_metadata_identifier(metadata), - hashlib.sha1(git_object).hexdigest(), + RawExtrinsicMetadata.from_dict(metadata).id, + hashlib.sha1(git_object).digest(), ) self.assertEqual( - identifiers.raw_extrinsic_metadata_identifier(metadata), - "be7154a8fd49d87f81547ea634d1e2152907d089", + RawExtrinsicMetadata.from_dict(metadata).id, + _x("be7154a8fd49d87f81547ea634d1e2152907d089"), ) @@ -1020,8 +1047,8 @@ class OriginIdentifier(unittest.TestCase): def test_content_identifier(self): self.assertEqual( - identifiers.origin_identifier(origin_example), - "b63a575fe3faab7692c9f38fb09d4bb45651bb0f", + Origin.from_dict(origin_example).id, + _x("b63a575fe3faab7692c9f38fb09d4bb45651bb0f"), ) @@ -1111,7 +1138,7 @@ @pytest.mark.parametrize("dict_input,expected", TS_DICTS) def test_normalize_timestamp_dict(dict_input, expected): - assert normalize_timestamp(dict_input) == expected + assert TimestampWithTimezone.from_dict(dict_input).to_dict() == expected TS_DICTS_INVALID_TIMESTAMP = [ @@ -1127,7 +1154,7 @@ @pytest.mark.parametrize("dict_input", TS_DICTS_INVALID_TIMESTAMP) def 
test_normalize_timestamp_dict_invalid_timestamp(dict_input): with pytest.raises(ValueError, match="non-integer timestamp"): - normalize_timestamp(dict_input) + TimestampWithTimezone.from_dict(dict_input) UTC = datetime.timezone.utc @@ -1152,7 +1179,7 @@ @pytest.mark.parametrize("microsecond", [0, 1, 10, 100, 1000, 999999]) def test_normalize_timestamp_datetime(date, seconds, tz, offset, microsecond): date = date.astimezone(tz).replace(microsecond=microsecond) - assert normalize_timestamp(date) == { + assert TimestampWithTimezone.from_dict(date).to_dict() == { "timestamp": {"seconds": seconds, "microseconds": microsecond}, "offset": offset, "negative_utc": False, @@ -1776,18 +1803,19 @@ "target": "swh:1:dir:" + "00" * 20, } - assert ( - identifiers.extid_identifier(extid_dict) - == "b9295e1931c31e40a7e3e1e967decd1c89426455" + assert ExtID.from_dict(extid_dict).id == _x( + "b9295e1931c31e40a7e3e1e967decd1c89426455" ) - assert identifiers.extid_identifier( - {**extid_dict, "extid_version": 0} - ) == identifiers.extid_identifier(extid_dict) + assert ( + ExtID.from_dict({**extid_dict, "extid_version": 0}).id + == ExtID.from_dict(extid_dict).id + ) - assert identifiers.extid_identifier( - {**extid_dict, "extid_version": 1} - ) != identifiers.extid_identifier(extid_dict) + assert ( + ExtID.from_dict({**extid_dict, "extid_version": 1}).id + != ExtID.from_dict(extid_dict).id + ) def test_object_types(): diff --git a/swh/model/tests/test_model.py b/swh/model/tests/test_model.py --- a/swh/model/tests/test_model.py +++ b/swh/model/tests/test_model.py @@ -12,20 +12,8 @@ from hypothesis.strategies import binary import pytest -from swh.model.hashutil import MultiHash, hash_to_bytes, hash_to_hex +from swh.model.hashutil import MultiHash, hash_to_bytes import swh.model.hypothesis_strategies as strategies -from swh.model.identifiers import ( - CoreSWHID, - ExtendedSWHID, - ObjectType, - content_identifier, - directory_identifier, - origin_identifier, - 
raw_extrinsic_metadata_identifier, - release_identifier, - revision_identifier, - snapshot_identifier, -) from swh.model.model import ( BaseModel, Content, @@ -46,14 +34,13 @@ Timestamp, TimestampWithTimezone, ) +from swh.model.swhids import CoreSWHID, ExtendedSWHID, ObjectType from swh.model.tests.swh_model_data import TEST_OBJECTS from swh.model.tests.test_identifiers import ( TS_DATETIMES, TS_TIMEZONES, - content_example, directory_example, metadata_example, - origin_example, release_example, revision_example, snapshot_example, @@ -736,94 +723,6 @@ assert rev_model.extra_headers == extra_headers -# ID computation - - -def test_content_model_id_computation(): - cnt_dict = content_example.copy() - - cnt_id_str = hash_to_hex(content_identifier(cnt_dict)["sha1_git"]) - cnt_model = Content.from_data(cnt_dict["data"]) - assert str(cnt_model.swhid()) == "swh:1:cnt:" + cnt_id_str - - -def test_directory_model_id_computation(): - dir_dict = directory_example.copy() - del dir_dict["id"] - - dir_id_str = directory_identifier(dir_dict) - dir_id = hash_to_bytes(dir_id_str) - dir_model = Directory.from_dict(dir_dict) - assert dir_model.id == dir_id - assert str(dir_model.swhid()) == "swh:1:dir:" + dir_id_str - - -def test_revision_model_id_computation(): - rev_dict = revision_example.copy() - del rev_dict["id"] - - rev_id_str = revision_identifier(rev_dict) - rev_id = hash_to_bytes(rev_id_str) - rev_model = Revision.from_dict(rev_dict) - assert rev_model.id == rev_id - assert str(rev_model.swhid()) == "swh:1:rev:" + rev_id_str - - -def test_revision_model_id_computation_with_no_date(): - """We can have revision with date to None - - """ - rev_dict = revision_example.copy() - rev_dict["date"] = None - rev_dict["committer_date"] = None - del rev_dict["id"] - - rev_id = hash_to_bytes(revision_identifier(rev_dict)) - rev_model = Revision.from_dict(rev_dict) - assert rev_model.date is None - assert rev_model.committer_date is None - assert rev_model.id == rev_id - - -def 
test_release_model_id_computation(): - rel_dict = release_example.copy() - del rel_dict["id"] - - rel_id_str = release_identifier(rel_dict) - rel_id = hash_to_bytes(rel_id_str) - rel_model = Release.from_dict(rel_dict) - assert isinstance(rel_model.date, TimestampWithTimezone) - assert rel_model.id == hash_to_bytes(rel_id) - assert str(rel_model.swhid()) == "swh:1:rel:" + rel_id_str - - -def test_snapshot_model_id_computation(): - snp_dict = snapshot_example.copy() - del snp_dict["id"] - - snp_id_str = snapshot_identifier(snp_dict) - snp_id = hash_to_bytes(snp_id_str) - snp_model = Snapshot.from_dict(snp_dict) - assert snp_model.id == snp_id - assert str(snp_model.swhid()) == "swh:1:snp:" + snp_id_str - - -def test_origin_model_id_computation(): - ori_dict = origin_example.copy() - - ori_id_str = origin_identifier(ori_dict) - ori_model = Origin.from_dict(ori_dict) - assert str(ori_model.swhid()) == "swh:1:ori:" + ori_id_str - - -def test_raw_extrinsic_metadata_model_id_computation(): - emd_dict = metadata_example.copy() - - emd_id_str = raw_extrinsic_metadata_identifier(emd_dict) - emd_model = RawExtrinsicMetadata.from_dict(emd_dict) - assert str(emd_model.swhid()) == "swh:1:emd:" + emd_id_str - - @given(strategies.objects(split_content=True)) def test_object_type(objtype_and_obj): obj_type, obj = objtype_and_obj