Changeset View
Changeset View
Standalone View
Standalone View
swh/model/identifiers.py
Show First 20 Lines • Show All 150 Lines • ▼ Show 20 Lines | if isinstance(identifier, bytes): | ||||
return binascii.hexlify(identifier).decode() | return binascii.hexlify(identifier).decode() | ||||
raise ValueError( | raise ValueError( | ||||
"Wrong type for identifier %s, expected bytes or str" | "Wrong type for identifier %s, expected bytes or str" | ||||
% identifier.__class__.__name__ | % identifier.__class__.__name__ | ||||
) | ) | ||||
def content_identifier(content): | def content_identifier(content: Dict[str, Any]) -> Dict[str, bytes]: | ||||
"""Return the intrinsic identifier for a content. | """Return the intrinsic identifier for a content. | ||||
A content's identifier is the sha1, sha1_git and sha256 checksums of its | A content's identifier is the sha1, sha1_git and sha256 checksums of its | ||||
data. | data. | ||||
Args: | Args: | ||||
content: a content conforming to the Software Heritage schema | content: a content conforming to the Software Heritage schema | ||||
Show All 32 Lines | def escape_newlines(snippet): | ||||
""" | """ | ||||
if b"\n" in snippet: | if b"\n" in snippet: | ||||
return b"\n ".join(snippet.split(b"\n")) | return b"\n ".join(snippet.split(b"\n")) | ||||
else: | else: | ||||
return snippet | return snippet | ||||
def directory_identifier(directory): | def directory_identifier(directory: Dict[str, Any]) -> str: | ||||
"""Return the intrinsic identifier for a directory. | """Return the intrinsic identifier for a directory. | ||||
A directory's identifier is the tree sha1 à la git of a directory listing, | A directory's identifier is the tree sha1 à la git of a directory listing, | ||||
using the following algorithm, which is equivalent to the git algorithm for | using the following algorithm, which is equivalent to the git algorithm for | ||||
trees: | trees: | ||||
1. Entries of the directory are sorted using the name (or the name with '/' | 1. Entries of the directory are sorted using the name (or the name with '/' | ||||
appended for directory entries) as key, in bytes order. | appended for directory entries) as key, in bytes order. | ||||
Show All 19 Lines | 2. For each entry of the directory, the following bytes are output: | ||||
- for symbolic links: the blob sha1_git of a file containing the link | - for symbolic links: the blob sha1_git of a file containing the link | ||||
destination | destination | ||||
- for directories: their intrinsic identifier | - for directories: their intrinsic identifier | ||||
- for revisions: their intrinsic identifier | - for revisions: their intrinsic identifier | ||||
(Note that there is no separator between entries) | (Note that there is no separator between entries) | ||||
""" | """ | ||||
manifest = directory_manifest(directory) | |||||
return identifier_to_str(hash_git_data(manifest, "tree")) | |||||
def directory_manifest(directory: Dict[str, Any]) -> bytes: | |||||
components = [] | components = [] | ||||
for entry in sorted(directory["entries"], key=directory_entry_sort_key): | for entry in sorted(directory["entries"], key=directory_entry_sort_key): | ||||
components.extend( | components.extend( | ||||
[ | [ | ||||
_perms_to_bytes(entry["perms"]), | _perms_to_bytes(entry["perms"]), | ||||
b"\x20", | b"\x20", | ||||
entry["name"], | entry["name"], | ||||
b"\x00", | b"\x00", | ||||
identifier_to_bytes(entry["target"]), | identifier_to_bytes(entry["target"]), | ||||
] | ] | ||||
) | ) | ||||
return identifier_to_str(hash_git_data(b"".join(components), "tree")) | return b"".join(components) | ||||
def format_date(date): | def format_date(date): | ||||
"""Convert a date object into a UTC timestamp encoded as ascii bytes. | """Convert a date object into a UTC timestamp encoded as ascii bytes. | ||||
Git stores timestamps as an integer number of seconds since the UNIX epoch. | Git stores timestamps as an integer number of seconds since the UNIX epoch. | ||||
However, Software Heritage stores timestamps as an integer number of | However, Software Heritage stores timestamps as an integer number of | ||||
▲ Show 20 Lines • Show All 178 Lines • ▼ Show 20 Lines | for key, value in headers: | ||||
entries.extend((key, b" ", escape_newlines(value), b"\n")) | entries.extend((key, b" ", escape_newlines(value), b"\n")) | ||||
if message is not None: | if message is not None: | ||||
entries.extend((b"\n", message)) | entries.extend((b"\n", message)) | ||||
return b"".join(entries) | return b"".join(entries) | ||||
def hash_manifest( | |||||
type: str, headers: Iterable[Tuple[bytes, bytes]], message: Optional[bytes] = None, | |||||
): | |||||
"""Hash the manifest of an object of type `type`, comprised of a sequence | |||||
of `headers` and an optional `message`. | |||||
Before hashing, the manifest is serialized with the :func:`format_manifest` | |||||
function. | |||||
We then use the git "salted sha1" (:func:`swh.model.hashutil.hash_git_data`) | |||||
with the given `type` to hash the manifest. | |||||
Args: | |||||
type: the type of object for which we're computing a manifest (e.g. | |||||
"tag", "commit", ...) | |||||
headers: a sequence of key/value headers stored in the manifest; | |||||
message: an optional message used to trail the manifest. | |||||
""" | |||||
manifest = format_manifest(headers, message) | |||||
return hash_git_data(manifest, type) | |||||
def format_author_data(author, date_offset) -> bytes: | def format_author_data(author, date_offset) -> bytes: | ||||
"""Format authorship data according to git standards. | """Format authorship data according to git standards. | ||||
Git authorship data has two components: | Git authorship data has two components: | ||||
- an author specification, usually a name and email, but in practice an | - an author specification, usually a name and email, but in practice an | ||||
arbitrary bytestring | arbitrary bytestring | ||||
- optionally, a timestamp with a UTC offset specification | - optionally, a timestamp with a UTC offset specification | ||||
Show All 30 Lines | if date_offset is not None: | ||||
date_f = format_date(date_offset["timestamp"]) | date_f = format_date(date_offset["timestamp"]) | ||||
offset_f = format_offset(date_offset["offset"], date_offset["negative_utc"]) | offset_f = format_offset(date_offset["offset"], date_offset["negative_utc"]) | ||||
ret.extend([b" ", date_f, b" ", offset_f]) | ret.extend([b" ", date_f, b" ", offset_f]) | ||||
return b"".join(ret) | return b"".join(ret) | ||||
def revision_identifier(revision): | def revision_identifier(revision: Dict[str, Any]) -> str: | ||||
"""Return the intrinsic identifier for a revision. | """Return the intrinsic identifier for a revision. | ||||
The fields used for the revision identifier computation are: | The fields used for the revision identifier computation are: | ||||
- directory | - directory | ||||
- parents | - parents | ||||
- author | - author | ||||
- author_date | - author_date | ||||
Show All 33 Lines | def revision_identifier(revision: Dict[str, Any]) -> str: | ||||
If the message is None, the manifest ends with the last header. Else, the | If the message is None, the manifest ends with the last header. Else, the | ||||
message is appended to the headers after an empty line. | message is appended to the headers after an empty line. | ||||
The checksum of the full manifest is computed using the 'commit' git object | The checksum of the full manifest is computed using the 'commit' git object | ||||
type. | type. | ||||
""" | """ | ||||
manifest = revision_manifest(revision) | |||||
return identifier_to_str(hash_git_data(manifest, "commit")) | |||||
def revision_manifest(revision: Dict[str, Any]) -> bytes: | |||||
"""Formats the manifest of a revision. See :func:`revision_identifier` for details | |||||
on the format.""" | |||||
headers = [(b"tree", identifier_to_str(revision["directory"]).encode())] | headers = [(b"tree", identifier_to_str(revision["directory"]).encode())] | ||||
for parent in revision["parents"]: | for parent in revision["parents"]: | ||||
if parent: | if parent: | ||||
headers.append((b"parent", identifier_to_str(parent).encode())) | headers.append((b"parent", identifier_to_str(parent).encode())) | ||||
headers.append( | headers.append( | ||||
(b"author", format_author_data(revision["author"], revision["date"])) | (b"author", format_author_data(revision["author"], revision["date"])) | ||||
) | ) | ||||
headers.append( | headers.append( | ||||
( | ( | ||||
b"committer", | b"committer", | ||||
format_author_data(revision["committer"], revision["committer_date"]), | format_author_data(revision["committer"], revision["committer_date"]), | ||||
) | ) | ||||
) | ) | ||||
# Handle extra headers | # Handle extra headers | ||||
metadata = revision.get("metadata") or {} | metadata = revision.get("metadata") or {} | ||||
extra_headers = revision.get("extra_headers", ()) | extra_headers = revision.get("extra_headers", ()) | ||||
if not extra_headers and "extra_headers" in metadata: | if not extra_headers and "extra_headers" in metadata: | ||||
extra_headers = metadata["extra_headers"] | extra_headers = metadata["extra_headers"] | ||||
headers.extend(extra_headers) | headers.extend(extra_headers) | ||||
return identifier_to_str(hash_manifest("commit", headers, revision["message"])) | return format_manifest(headers, revision["message"]) | ||||
def target_type_to_git(target_type): | def target_type_to_git(target_type: str) -> bytes: | ||||
"""Convert a software heritage target type to a git object type""" | """Convert a software heritage target type to a git object type""" | ||||
return { | return { | ||||
"content": b"blob", | "content": b"blob", | ||||
"directory": b"tree", | "directory": b"tree", | ||||
"revision": b"commit", | "revision": b"commit", | ||||
"release": b"tag", | "release": b"tag", | ||||
"snapshot": b"refs", | "snapshot": b"refs", | ||||
}[target_type] | }[target_type] | ||||
def release_identifier(release): | def release_identifier(release: Dict[str, Any]) -> str: | ||||
"""Return the intrinsic identifier for a release.""" | """Return the intrinsic identifier for a release.""" | ||||
manifest = release_manifest(release) | |||||
return identifier_to_str(hash_git_data(manifest, "tag")) | |||||
def release_manifest(release: Dict[str, Any]) -> bytes: | |||||
headers = [ | headers = [ | ||||
(b"object", identifier_to_str(release["target"]).encode()), | (b"object", identifier_to_str(release["target"]).encode()), | ||||
(b"type", target_type_to_git(release["target_type"])), | (b"type", target_type_to_git(release["target_type"])), | ||||
(b"tag", release["name"]), | (b"tag", release["name"]), | ||||
] | ] | ||||
if "author" in release and release["author"]: | if "author" in release and release["author"]: | ||||
headers.append( | headers.append( | ||||
(b"tagger", format_author_data(release["author"], release["date"])) | (b"tagger", format_author_data(release["author"], release["date"])) | ||||
) | ) | ||||
return identifier_to_str(hash_manifest("tag", headers, release["message"])) | return format_manifest(headers, release["message"]) | ||||
def snapshot_identifier(snapshot, *, ignore_unresolved=False): | def snapshot_identifier( | ||||
snapshot: Dict[str, Any], *, ignore_unresolved: bool = False | |||||
) -> str: | |||||
"""Return the intrinsic identifier for a snapshot. | """Return the intrinsic identifier for a snapshot. | ||||
Snapshots are a set of named branches, which are pointers to objects at any | Snapshots are a set of named branches, which are pointers to objects at any | ||||
level of the Software Heritage DAG. | level of the Software Heritage DAG. | ||||
As well as pointing to other objects in the Software Heritage DAG, branches | As well as pointing to other objects in the Software Heritage DAG, branches | ||||
can also be *alias*es, in which case their target is the name of another | can also be *alias*es, in which case their target is the name of another | ||||
branch in the same snapshot, or *dangling*, in which case the target is | branch in the same snapshot, or *dangling*, in which case the target is | ||||
Show All 38 Lines | Args: | ||||
single entry is needed, ``'branches'``, which is itself a :class:`dict` | single entry is needed, ``'branches'``, which is itself a :class:`dict` | ||||
mapping each branch to its target | mapping each branch to its target | ||||
ignore_unresolved (bool): if `True`, ignore unresolved branch aliases. | ignore_unresolved (bool): if `True`, ignore unresolved branch aliases. | ||||
Returns: | Returns: | ||||
str: the intrinsic identifier for `snapshot` | str: the intrinsic identifier for `snapshot` | ||||
""" | """ | ||||
manifest = snapshot_manifest(snapshot, ignore_unresolved=ignore_unresolved) | |||||
return identifier_to_str(hash_git_data(manifest, "snapshot")) | |||||
def snapshot_manifest( | |||||
snapshot: Dict[str, Any], *, ignore_unresolved: bool = False | |||||
) -> bytes: | |||||
"""Formats the manifest of a snapshot. See :func:`snapshot_identifier` for details | |||||
on the format.""" | |||||
unresolved = [] | unresolved = [] | ||||
lines = [] | lines = [] | ||||
for name, target in sorted(snapshot["branches"].items()): | for name, target in sorted(snapshot["branches"].items()): | ||||
if not target: | if not target: | ||||
target_type = b"dangling" | target_type = b"dangling" | ||||
target_id = b"" | target_id = b"" | ||||
elif target["target_type"] == "alias": | elif target["target_type"] == "alias": | ||||
Show All 14 Lines | for name, target in sorted(snapshot["branches"].items()): | ||||
("%d:" % len(target_id)).encode(), | ("%d:" % len(target_id)).encode(), | ||||
target_id, | target_id, | ||||
] | ] | ||||
) | ) | ||||
if unresolved and not ignore_unresolved: | if unresolved and not ignore_unresolved: | ||||
raise ValueError( | raise ValueError( | ||||
"Branch aliases unresolved: %s" | "Branch aliases unresolved: %s" | ||||
% ", ".join("%s -> %s" % x for x in unresolved), | % ", ".join("%r -> %r" % x for x in unresolved), | ||||
unresolved, | unresolved, | ||||
) | ) | ||||
return identifier_to_str(hash_git_data(b"".join(lines), "snapshot")) | return b"".join(lines) | ||||
def origin_identifier(origin): | def origin_identifier(origin): | ||||
"""Return the intrinsic identifier for an origin. | """Return the intrinsic identifier for an origin. | ||||
An origin's identifier is the sha1 checksum of the entire origin URL | An origin's identifier is the sha1 checksum of the entire origin URL | ||||
""" | """ | ||||
▲ Show 20 Lines • Show All 41 Lines • ▼ Show 20 Lines | def raw_extrinsic_metadata_identifier(metadata: Dict[str, Any]) -> str: | ||||
Newlines in $Bytes, $Str, and $Iri are escaped as with other git fields, | Newlines in $Bytes, $Str, and $Iri are escaped as with other git fields, | ||||
ie. by adding a space after them. | ie. by adding a space after them. | ||||
Returns: | Returns: | ||||
str: the intrinsic identifier for ``metadata`` | str: the intrinsic identifier for ``metadata`` | ||||
""" | """ | ||||
manifest = raw_extrinsic_metadata_manifest(metadata) | |||||
return identifier_to_str(hash_git_data(manifest, "raw_extrinsic_metadata")) | |||||
def raw_extrinsic_metadata_manifest(metadata: Dict[str, Any]) -> bytes: | |||||
"""Formats the manifest of a raw_extrinsic_metadata object. | |||||
See :func:`raw_extrinsic_metadata_identifier` for details | |||||
on the format.""" | |||||
# equivalent to using math.floor(dt.timestamp()) to round down, | # equivalent to using math.floor(dt.timestamp()) to round down, | ||||
# as int(dt.timestamp()) rounds toward zero, | # as int(dt.timestamp()) rounds toward zero, | ||||
# which would map two seconds on the 0 timestamp. | # which would map two seconds on the 0 timestamp. | ||||
# | # | ||||
# This should never be an issue in practice as Software Heritage didn't | # This should never be an issue in practice as Software Heritage didn't | ||||
# start collecting metadata before 2015. | # start collecting metadata before 2015. | ||||
timestamp = ( | timestamp = ( | ||||
metadata["discovery_date"] | metadata["discovery_date"] | ||||
Show All 30 Lines | ): | ||||
value: bytes | value: bytes | ||||
if key == "path": | if key == "path": | ||||
value = metadata[key] | value = metadata[key] | ||||
else: | else: | ||||
value = str(metadata[key]).encode() | value = str(metadata[key]).encode() | ||||
headers.append((key.encode("ascii"), value)) | headers.append((key.encode("ascii"), value)) | ||||
return identifier_to_str( | return format_manifest(headers, metadata["metadata"]) | ||||
hash_manifest("raw_extrinsic_metadata", headers, metadata["metadata"]) | |||||
) | |||||
def extid_identifier(extid: Dict[str, Any]) -> str: | def extid_identifier(extid: Dict[str, Any]) -> str: | ||||
"""Return the intrinsic identifier for an ExtID object. | """Return the intrinsic identifier for an ExtID object. | ||||
An ExtID identifier is a salted sha1 (using the git hashing algorithm with | An ExtID identifier is a salted sha1 (using the git hashing algorithm with | ||||
the ``extid`` object type) of a manifest following the format: | the ``extid`` object type) of a manifest following the format: | ||||
Show All 14 Lines | def extid_identifier(extid: Dict[str, Any]) -> str: | ||||
""" | """ | ||||
headers = [ | headers = [ | ||||
(b"extid_type", extid["extid_type"].encode("ascii")), | (b"extid_type", extid["extid_type"].encode("ascii")), | ||||
(b"extid", extid["extid"]), | (b"extid", extid["extid"]), | ||||
(b"target", str(extid["target"]).encode("ascii")), | (b"target", str(extid["target"]).encode("ascii")), | ||||
] | ] | ||||
return identifier_to_str(hash_manifest("extid", headers)) | manifest = format_manifest(headers) | ||||
return identifier_to_str(hash_git_data(manifest, "extid")) | |||||
# type of the "object_type" attribute of the SWHID class; either | # type of the "object_type" attribute of the SWHID class; either | ||||
# ObjectType or ExtendedObjectType | # ObjectType or ExtendedObjectType | ||||
_TObjectType = TypeVar("_TObjectType", ObjectType, ExtendedObjectType) | _TObjectType = TypeVar("_TObjectType", ObjectType, ExtendedObjectType) | ||||
# the SWHID class itself (this is used so that X.from_string() can return X | # the SWHID class itself (this is used so that X.from_string() can return X | ||||
# for all X subclass of _BaseSWHID) | # for all X subclass of _BaseSWHID) | ||||
▲ Show 20 Lines • Show All 379 Lines • Show Last 20 Lines |