Changeset View
Changeset View
Standalone View
Standalone View
swh/model/identifiers.py
Show First 20 Lines • Show All 718 Lines • ▼ Show 20 Lines | def origin_identifier(origin): | ||||
"""Return the intrinsic identifier for an origin. | """Return the intrinsic identifier for an origin. | ||||
An origin's identifier is the sha1 checksum of the entire origin URL | An origin's identifier is the sha1 checksum of the entire origin URL | ||||
""" | """ | ||||
return hashlib.sha1(origin["url"].encode("utf-8")).hexdigest() | return hashlib.sha1(origin["url"].encode("utf-8")).hexdigest() | ||||
def raw_extrinsic_metadata_identifier(metadata: Dict[str, Any]) -> str:
    """Return the intrinsic identifier for a RawExtrinsicMetadata object.

    A raw_extrinsic_metadata identifier is a salted sha1 (using the git
    hashing algorithm with the ``raw_extrinsic_metadata`` object type) of
    a manifest following the format::

        target $ExtendedSwhid
        discovery_date $ISO8601
        authority $StrWithoutSpaces $IRI
        fetcher $Str $Version
        format $StrWithoutSpaces
        origin $IRI                         <- optional
        visit $IntInDecimal                 <- optional
        snapshot $CoreSwhid                 <- optional
        release $CoreSwhid                  <- optional
        revision $CoreSwhid                 <- optional
        path $Bytes                         <- optional
        directory $CoreSwhid                <- optional

        $MetadataBytes

    $IRI values must be RFC 3987 IRIs (so they may contain newlines, which are
    escaped as described below).

    $StrWithoutSpaces and $Version are ASCII strings, and may not contain
    spaces.

    $Str is a UTF-8 string.

    $CoreSwhid are core SWHIDs, as defined in :ref:`persistent-identifiers`.
    $ExtendedSwhid is a core SWHID, with extra types allowed ('ori' for
    origins and 'emd' for raw extrinsic metadata).

    Newlines in $Bytes, $Str, and $IRI are escaped as with other git fields,
    i.e. by adding a space after them.

    Args:
        metadata: dict representation of the object. The keys ``target``,
            ``discovery_date`` (an object providing ``isoformat()``),
            ``authority`` (with ``type`` and ``url``), ``fetcher`` (with
            ``name`` and ``version``), ``format`` and ``metadata`` are
            required. The keys ``origin``, ``visit``, ``snapshot``,
            ``release``, ``revision``, ``path`` and ``directory`` are
            optional and are skipped when absent or ``None``.

    Returns:
        str: the intrinsic identifier for `metadata`

    Raises:
        KeyError: if one of the required keys is missing from `metadata`.

    """
    # Every text field is encoded with an explicit codec rather than relying
    # on the default of str.encode(); the resulting bytes feed the hash, so
    # the choice of encoding is part of the identifier's definition.
    target = str(metadata["target"])
    discovery_date = metadata["discovery_date"].isoformat()
    authority = metadata["authority"]
    authority_line = f"{authority['type']} {authority['url']}"
    fetcher = metadata["fetcher"]
    fetcher_line = f"{fetcher['name']} {fetcher['version']}"

    headers = [
        (b"target", target.encode("utf-8")),
        (b"discovery_date", discovery_date.encode("ascii")),
        (b"authority", authority_line.encode("utf-8")),
        (b"fetcher", fetcher_line.encode("utf-8")),
        (b"format", metadata["format"].encode("utf-8")),
    ]

    # Optional context fields, appended in this fixed order when present.
    optional_keys = (
        "origin",
        "visit",
        "snapshot",
        "release",
        "revision",
        "path",
        "directory",
    )
    for key in optional_keys:
        raw_value = metadata.get(key)
        if raw_value is None:
            continue

        value: bytes
        if key == "path":
            # "path" is documented as $Bytes: it is used as-is, not re-encoded.
            value = raw_value
        else:
            value = str(raw_value).encode("utf-8")

        headers.append((key.encode("ascii"), value))

    return identifier_to_str(
        hash_manifest("raw_extrinsic_metadata", headers, metadata["metadata"])
    )
# type of the "object_type" attribute of the SWHID class; either | # type of the "object_type" attribute of the SWHID class; either | ||||
# ObjectType or ExtendedObjectType | # ObjectType or ExtendedObjectType | ||||
_TObjectType = TypeVar("_TObjectType", ObjectType, ExtendedObjectType) | _TObjectType = TypeVar("_TObjectType", ObjectType, ExtendedObjectType) | ||||
# the SWHID class itself (this is used so that X.from_string() can return X | # the SWHID class itself (this is used so that X.from_string() can return X | ||||
# for all X subclass of _BaseSWHID) | # for all X subclass of _BaseSWHID) | ||||
_TSWHID = TypeVar("_TSWHID", bound="_BaseSWHID") | _TSWHID = TypeVar("_TSWHID", bound="_BaseSWHID") | ||||
▲ Show 20 Lines • Show All 377 Lines • Show Last 20 Lines |
This does not compute a snapshot identifier.