Changeset View
Changeset View
Standalone View
Standalone View
swh/web/common/identifiers.py
# Copyright (C) 2020 The Software Heritage developers | # Copyright (C) 2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU Affero General Public License version 3, or any later version | # License: GNU Affero General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from urllib.parse import quote | from urllib.parse import quote | ||||
from typing import Any, Dict, Iterable, List, Optional | from typing import cast, Any, Dict, Iterable, List, Optional | ||||
from typing_extensions import TypedDict | from typing_extensions import TypedDict | ||||
from django.http import QueryDict | from django.http import QueryDict | ||||
from swh.model.exceptions import ValidationError | from swh.model.exceptions import ValidationError | ||||
from swh.model.hashutil import hash_to_bytes | from swh.model.hashutil import hash_to_bytes | ||||
from swh.model.identifiers import ( | from swh.model.identifiers import ( | ||||
persistent_identifier, | swhid, | ||||
parse_persistent_identifier, | parse_swhid, | ||||
CONTENT, | CONTENT, | ||||
DIRECTORY, | DIRECTORY, | ||||
ORIGIN, | ORIGIN, | ||||
RELEASE, | RELEASE, | ||||
REVISION, | REVISION, | ||||
SNAPSHOT, | SNAPSHOT, | ||||
PersistentId, | SWHID, | ||||
) | ) | ||||
from swh.web.common import service | from swh.web.common import service | ||||
from swh.web.common.exc import BadInputExc | from swh.web.common.exc import BadInputExc | ||||
from swh.web.common.typing import ( | from swh.web.common.typing import ( | ||||
QueryParameters, | QueryParameters, | ||||
SnapshotContext, | SnapshotContext, | ||||
SWHObjectInfo, | SWHObjectInfo, | ||||
SWHIDInfo, | SWHIDInfo, | ||||
SWHIDContext, | SWHIDContext, | ||||
) | ) | ||||
from swh.web.common.utils import reverse | from swh.web.common.utils import reverse | ||||
def gen_swhid(
    object_type: str,
    object_id: str,
    scheme_version: int = 1,
    metadata: Optional[SWHIDContext] = None,
) -> str:
    """
    Returns the SoftWare Heritage persistent IDentifier for a swh object based on:

        * the object type
        * the object id
        * the SWHID scheme version

    Args:
        object_type: the swh object type
            (content/directory/release/revision/snapshot)
        object_id: the swh object id (hexadecimal representation
            of its hash value)
        scheme_version: the scheme version of the SWHIDs
        metadata: optional contextual info to attach to the SWHID (callers
            in this module pass keys such as origin, visit, anchor and
            path); no context is attached when omitted

    Returns:
        the SWHID of the object

    Raises:
        BadInputExc: if the provided parameters do not allow
            generating a valid identifier

    """
    # Use a fresh dict per call rather than a shared mutable default, so that
    # nothing downstream can leak state between unrelated calls.
    if metadata is None:
        metadata = {}
    try:
        return swhid(
            object_type, object_id, scheme_version, cast(Dict[str, Any], metadata)
        )
    except ValidationError as e:
        # Chain the original validation error so its traceback is preserved.
        raise BadInputExc("Invalid object (%s) for SWHID. %s" % (object_id, e)) from e
class ResolvedSWHID(TypedDict):
    """Result of resolving a SWHID: the parsed identifier and its browse URL."""

    # parsed SWHID with context
    swhid_parsed: SWHID
    # URL to browse object according to SWHID context
    # (None when no browse URL could be built for the object)
    browse_url: Optional[str]
def resolve_swh_persistent_id( | def resolve_swhid( | ||||
swh_id: str, query_params: Optional[QueryParameters] = None | swhid: str, query_params: Optional[QueryParameters] = None | ||||
) -> ResolvedPersistentId: | ) -> ResolvedSWHID: | ||||
""" | """ | ||||
Try to resolve a Software Heritage persistent id into an url for | Try to resolve a SoftWare Heritage persistent IDentifier into an url for | ||||
browsing the targeted object. | browsing the targeted object. | ||||
Args: | Args: | ||||
swh_id: a Software Heritage persistent identifier | swhid: a SoftWare Heritage persistent IDentifier | ||||
query_params: optional dict filled with | query_params: optional dict filled with | ||||
query parameters to append to the browse url | query parameters to append to the browse url | ||||
Returns: | Returns: | ||||
a dict with the following keys: | a dict with the following keys: | ||||
* **swh_id_parsed**: the parsed identifier | * **swhid_parsed**: the parsed identifier | ||||
* **browse_url**: the url for browsing the targeted object | * **browse_url**: the url for browsing the targeted object | ||||
""" | """ | ||||
swh_id_parsed = get_persistent_identifier(swh_id) | swhid_parsed = get_swhid(swhid) | ||||
object_type = swh_id_parsed.object_type | object_type = swhid_parsed.object_type | ||||
object_id = swh_id_parsed.object_id | object_id = swhid_parsed.object_id | ||||
browse_url = None | browse_url = None | ||||
url_args = {} | url_args = {} | ||||
query_dict = QueryDict("", mutable=True) | query_dict = QueryDict("", mutable=True) | ||||
fragment = "" | fragment = "" | ||||
anchor_swhid_parsed = None | anchor_swhid_parsed = None | ||||
process_lines = object_type is CONTENT | process_lines = object_type is CONTENT | ||||
if query_params and len(query_params) > 0: | if query_params and len(query_params) > 0: | ||||
for k in sorted(query_params.keys()): | for k in sorted(query_params.keys()): | ||||
query_dict[k] = query_params[k] | query_dict[k] = query_params[k] | ||||
if "origin" in swh_id_parsed.metadata: | if "origin" in swhid_parsed.metadata: | ||||
query_dict["origin_url"] = swh_id_parsed.metadata["origin"] | query_dict["origin_url"] = swhid_parsed.metadata["origin"] | ||||
if "anchor" in swh_id_parsed.metadata: | if "anchor" in swhid_parsed.metadata: | ||||
anchor_swhid_parsed = get_persistent_identifier( | anchor_swhid_parsed = get_swhid(swhid_parsed.metadata["anchor"]) | ||||
swh_id_parsed.metadata["anchor"] | |||||
) | |||||
if "path" in swh_id_parsed.metadata and swh_id_parsed.metadata["path"] != "/": | if "path" in swhid_parsed.metadata and swhid_parsed.metadata["path"] != "/": | ||||
query_dict["path"] = swh_id_parsed.metadata["path"] | query_dict["path"] = swhid_parsed.metadata["path"] | ||||
if anchor_swhid_parsed: | if anchor_swhid_parsed: | ||||
directory = "" | directory = "" | ||||
if anchor_swhid_parsed.object_type == DIRECTORY: | if anchor_swhid_parsed.object_type == DIRECTORY: | ||||
directory = anchor_swhid_parsed.object_id | directory = anchor_swhid_parsed.object_id | ||||
elif anchor_swhid_parsed.object_type == REVISION: | elif anchor_swhid_parsed.object_type == REVISION: | ||||
revision = service.lookup_revision(anchor_swhid_parsed.object_id) | revision = service.lookup_revision(anchor_swhid_parsed.object_id) | ||||
directory = revision["directory"] | directory = revision["directory"] | ||||
elif anchor_swhid_parsed.object_type == RELEASE: | elif anchor_swhid_parsed.object_type == RELEASE: | ||||
release = service.lookup_release(anchor_swhid_parsed.object_id) | release = service.lookup_release(anchor_swhid_parsed.object_id) | ||||
if release["target_type"] == REVISION: | if release["target_type"] == REVISION: | ||||
revision = service.lookup_revision(release["target"]) | revision = service.lookup_revision(release["target"]) | ||||
directory = revision["directory"] | directory = revision["directory"] | ||||
if object_type == CONTENT: | if object_type == CONTENT: | ||||
if "origin" not in swh_id_parsed.metadata: | if "origin" not in swhid_parsed.metadata: | ||||
# when no origin context, content objects need to have their | # when no origin context, content objects need to have their | ||||
# path prefixed by root directory id for proper breadcrumbs display | # path prefixed by root directory id for proper breadcrumbs display | ||||
query_dict["path"] = directory + query_dict["path"] | query_dict["path"] = directory + query_dict["path"] | ||||
else: | else: | ||||
# remove leading slash from SWHID content path | # remove leading slash from SWHID content path | ||||
query_dict["path"] = query_dict["path"][1:] | query_dict["path"] = query_dict["path"][1:] | ||||
elif object_type == DIRECTORY: | elif object_type == DIRECTORY: | ||||
object_id = directory | object_id = directory | ||||
# remove leading and trailing slashes from SWHID directory path | # remove leading and trailing slashes from SWHID directory path | ||||
query_dict["path"] = query_dict["path"][1:-1] | query_dict["path"] = query_dict["path"][1:-1] | ||||
# snapshot context | # snapshot context | ||||
if "visit" in swh_id_parsed.metadata: | if "visit" in swhid_parsed.metadata: | ||||
snp_swhid_parsed = get_persistent_identifier(swh_id_parsed.metadata["visit"]) | snp_swhid_parsed = get_swhid(swhid_parsed.metadata["visit"]) | ||||
if snp_swhid_parsed.object_type != SNAPSHOT: | if snp_swhid_parsed.object_type != SNAPSHOT: | ||||
raise BadInputExc("Visit must be a snapshot SWHID.") | raise BadInputExc("Visit must be a snapshot SWHID.") | ||||
query_dict["snapshot"] = snp_swhid_parsed.object_id | query_dict["snapshot"] = snp_swhid_parsed.object_id | ||||
if anchor_swhid_parsed: | if anchor_swhid_parsed: | ||||
if anchor_swhid_parsed.object_type == REVISION: | if anchor_swhid_parsed.object_type == REVISION: | ||||
# check if the anchor revision is the tip of a branch | # check if the anchor revision is the tip of a branch | ||||
branch_name = service.lookup_snapshot_branch_name_from_tip_revision( | branch_name = service.lookup_snapshot_branch_name_from_tip_revision( | ||||
Show All 34 Lines | elif object_type == RELEASE: | ||||
url_args["sha1_git"] = object_id | url_args["sha1_git"] = object_id | ||||
elif object_type == REVISION: | elif object_type == REVISION: | ||||
url_args["sha1_git"] = object_id | url_args["sha1_git"] = object_id | ||||
elif object_type == SNAPSHOT: | elif object_type == SNAPSHOT: | ||||
url_args["snapshot_id"] = object_id | url_args["snapshot_id"] = object_id | ||||
elif object_type == ORIGIN: | elif object_type == ORIGIN: | ||||
raise BadInputExc( | raise BadInputExc( | ||||
( | ( | ||||
"Origin PIDs (Persistent Identifiers) are not " | "Origin SWHIDs are not publicly resolvable because they are for " | ||||
"publicly resolvable because they are for " | |||||
"internal usage only" | "internal usage only" | ||||
) | ) | ||||
) | ) | ||||
if "lines" in swh_id_parsed.metadata and process_lines: | if "lines" in swhid_parsed.metadata and process_lines: | ||||
lines = swh_id_parsed.metadata["lines"].split("-") | lines = swhid_parsed.metadata["lines"].split("-") | ||||
fragment += "#L" + lines[0] | fragment += "#L" + lines[0] | ||||
if len(lines) > 1: | if len(lines) > 1: | ||||
fragment += "-L" + lines[1] | fragment += "-L" + lines[1] | ||||
if url_args: | if url_args: | ||||
browse_url = ( | browse_url = ( | ||||
reverse( | reverse( | ||||
f"browse-{object_type}", url_args=url_args, query_params=query_dict, | f"browse-{object_type}", url_args=url_args, query_params=query_dict, | ||||
) | ) | ||||
+ fragment | + fragment | ||||
) | ) | ||||
return ResolvedPersistentId(swh_id_parsed=swh_id_parsed, browse_url=browse_url) | return ResolvedSWHID(swhid_parsed=swhid_parsed, browse_url=browse_url) | ||||
def get_swhid(swhid: str) -> SWHID:
    """Parse a SWHID string and return its parsed form.

    Args:
        swhid: a SoftWare Heritage persistent IDentifier, as a string.

    Raises:
        BadInputExc: if the provided string is not a parsable SWHID; the
            message lists the validation errors reported by the parser.

    Return:
        The parsed SWHID object.
    """
    try:
        return parse_swhid(swhid)
    except ValidationError as ve:
        details = " ".join(ve.messages)
        raise BadInputExc("Error when parsing identifier: %s" % details)
def group_swhids(swhids: Iterable[SWHID],) -> Dict[str, List[bytes]]:
    """
    Bucket parsed SWHIDs by the type of object they reference.

    Args:
        swhids: an iterable of parsed SoftWare Heritage persistent
            IDentifier objects

    Returns:
        A dictionary with one entry per supported object type (content,
        directory, revision, release, snapshot):

            keys: object types
            values: list of object hashes (as bytes) of that type, in
                input order; empty when no SWHID of that type was given
    """
    grouped: Dict[str, List[bytes]] = {
        kind: [] for kind in (CONTENT, DIRECTORY, REVISION, RELEASE, SNAPSHOT)
    }
    for parsed in swhids:
        # Types outside the buckets above are not handled here and will
        # raise KeyError, as before.
        grouped[parsed.object_type].append(hash_to_bytes(parsed.object_id))
    return grouped
def get_swhids_info( | def get_swhids_info( | ||||
swh_objects: Iterable[SWHObjectInfo], | swh_objects: Iterable[SWHObjectInfo], | ||||
snapshot_context: Optional[SnapshotContext] = None, | snapshot_context: Optional[SnapshotContext] = None, | ||||
extra_context: Optional[Dict[str, Any]] = None, | extra_context: Optional[Dict[str, Any]] = None, | ||||
) -> List[SWHIDInfo]: | ) -> List[SWHIDInfo]: | ||||
""" | """ | ||||
Returns a list of dict containing info related to persistent | Returns a list of dict containing info related to SWHIDs of objects. | ||||
identifiers of swh objects. | |||||
Args: | Args: | ||||
swh_objects: an iterable of dict describing archived objects | swh_objects: an iterable of dict describing archived objects | ||||
snapshot_context: optional dict parameter describing the snapshot in | snapshot_context: optional dict parameter describing the snapshot in | ||||
which the objects have been found | which the objects have been found | ||||
extra_context: optional dict filled with extra contextual info about | extra_context: optional dict filled with extra contextual info about | ||||
the objects | the objects | ||||
Returns: | Returns: | ||||
a list of dict containing persistent identifiers info | a list of dict containing SWHIDs info | ||||
""" | """ | ||||
swhids_info = [] | swhids_info = [] | ||||
for swh_object in swh_objects: | for swh_object in swh_objects: | ||||
if not swh_object["object_id"]: | if not swh_object["object_id"]: | ||||
swhids_info.append( | swhids_info.append( | ||||
SWHIDInfo( | SWHIDInfo( | ||||
object_type=swh_object["object_type"], | object_type=swh_object["object_type"], | ||||
Show All 10 Lines | for swh_object in swh_objects: | ||||
object_id = swh_object["object_id"] | object_id = swh_object["object_id"] | ||||
swhid_context: SWHIDContext = {} | swhid_context: SWHIDContext = {} | ||||
if snapshot_context: | if snapshot_context: | ||||
if snapshot_context["origin_info"] is not None: | if snapshot_context["origin_info"] is not None: | ||||
swhid_context["origin"] = quote( | swhid_context["origin"] = quote( | ||||
snapshot_context["origin_info"]["url"], safe="/?:@&" | snapshot_context["origin_info"]["url"], safe="/?:@&" | ||||
) | ) | ||||
if object_type != SNAPSHOT: | if object_type != SNAPSHOT: | ||||
swhid_context["visit"] = get_swh_persistent_id( | swhid_context["visit"] = gen_swhid( | ||||
SNAPSHOT, snapshot_context["snapshot_id"] | SNAPSHOT, snapshot_context["snapshot_id"] | ||||
) | ) | ||||
if object_type in (CONTENT, DIRECTORY): | if object_type in (CONTENT, DIRECTORY): | ||||
if snapshot_context["release_id"] is not None: | if snapshot_context["release_id"] is not None: | ||||
swhid_context["anchor"] = get_swh_persistent_id( | swhid_context["anchor"] = gen_swhid( | ||||
RELEASE, snapshot_context["release_id"] | RELEASE, snapshot_context["release_id"] | ||||
) | ) | ||||
elif snapshot_context["revision_id"] is not None: | elif snapshot_context["revision_id"] is not None: | ||||
swhid_context["anchor"] = get_swh_persistent_id( | swhid_context["anchor"] = gen_swhid( | ||||
REVISION, snapshot_context["revision_id"] | REVISION, snapshot_context["revision_id"] | ||||
) | ) | ||||
if object_type in (CONTENT, DIRECTORY): | if object_type in (CONTENT, DIRECTORY): | ||||
if ( | if ( | ||||
extra_context | extra_context | ||||
and "revision" in extra_context | and "revision" in extra_context | ||||
and extra_context["revision"] | and extra_context["revision"] | ||||
and "anchor" not in swhid_context | and "anchor" not in swhid_context | ||||
): | ): | ||||
swhid_context["anchor"] = get_swh_persistent_id( | swhid_context["anchor"] = gen_swhid(REVISION, extra_context["revision"]) | ||||
REVISION, extra_context["revision"] | |||||
) | |||||
elif ( | elif ( | ||||
extra_context | extra_context | ||||
and "root_directory" in extra_context | and "root_directory" in extra_context | ||||
and extra_context["root_directory"] | and extra_context["root_directory"] | ||||
and "anchor" not in swhid_context | and "anchor" not in swhid_context | ||||
and ( | and ( | ||||
object_type != DIRECTORY | object_type != DIRECTORY | ||||
or extra_context["root_directory"] != object_id | or extra_context["root_directory"] != object_id | ||||
) | ) | ||||
): | ): | ||||
swhid_context["anchor"] = get_swh_persistent_id( | swhid_context["anchor"] = gen_swhid( | ||||
DIRECTORY, extra_context["root_directory"] | DIRECTORY, extra_context["root_directory"] | ||||
) | ) | ||||
path = None | path = None | ||||
if extra_context and "path" in extra_context: | if extra_context and "path" in extra_context: | ||||
path = extra_context["path"] or "/" | path = extra_context["path"] or "/" | ||||
if "filename" in extra_context and object_type == CONTENT: | if "filename" in extra_context and object_type == CONTENT: | ||||
path += extra_context["filename"] | path += extra_context["filename"] | ||||
if path: | if path: | ||||
swhid_context["path"] = quote(path, safe="/?:@&") | swhid_context["path"] = quote(path, safe="/?:@&") | ||||
swhid = get_swh_persistent_id(object_type, object_id) | swhid = gen_swhid(object_type, object_id) | ||||
swhid_url = reverse("browse-swh-id", url_args={"swh_id": swhid}) | swhid_url = reverse("browse-swhid", url_args={"swhid": swhid}) | ||||
swhid_with_context = None | swhid_with_context = None | ||||
swhid_with_context_url = None | swhid_with_context_url = None | ||||
if swhid_context: | if swhid_context: | ||||
swhid_with_context = get_swh_persistent_id( | swhid_with_context = gen_swhid( | ||||
object_type, object_id, metadata=swhid_context | object_type, object_id, metadata=swhid_context | ||||
) | ) | ||||
swhid_with_context_url = reverse( | swhid_with_context_url = reverse( | ||||
"browse-swh-id", url_args={"swh_id": swhid_with_context} | "browse-swhid", url_args={"swhid": swhid_with_context} | ||||
) | ) | ||||
swhids_info.append( | swhids_info.append( | ||||
SWHIDInfo( | SWHIDInfo( | ||||
object_type=object_type, | object_type=object_type, | ||||
object_id=object_id, | object_id=object_id, | ||||
swhid=swhid, | swhid=swhid, | ||||
swhid_url=swhid_url, | swhid_url=swhid_url, | ||||
context=swhid_context, | context=swhid_context, | ||||
swhid_with_context=swhid_with_context, | swhid_with_context=swhid_with_context, | ||||
swhid_with_context_url=swhid_with_context_url, | swhid_with_context_url=swhid_with_context_url, | ||||
) | ) | ||||
) | ) | ||||
return swhids_info | return swhids_info |