Changeset View
Standalone View
swh/graphql/resolvers/content.py
# Copyright (C) 2022 The Software Heritage developers | # Copyright (C) 2022 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from typing import Union | from typing import Union | ||||
from swh.graphql.backends import archive | from swh.graphql.backends import archive | ||||
from swh.model import hashutil | |||||
from .base_node import BaseSWHNode | from .base_node import BaseSWHNode | ||||
from .directory_entry import DirectoryEntryNode | from .directory_entry import DirectoryEntryNode | ||||
from .release import BaseReleaseNode | from .release import BaseReleaseNode | ||||
from .snapshot_branch import SnapshotBranchNode | from .snapshot_branch import SnapshotBranchNode | ||||
class BaseContentNode(BaseSWHNode): | class BaseContentNode(BaseSWHNode): | ||||
""" | """ | ||||
Base resolver for all the content nodes | Base resolver for all the content nodes | ||||
""" | """ | ||||
def _get_content_by_id(self, content_id): | def _get_content_by_hash(self, hash_value: bytes, hash_type="sha1_git"): | ||||
content = archive.Archive().get_content(content_id) | content = archive.Archive().get_content( | ||||
hash_value=hash_value, hash_type=hash_type | |||||
) | |||||
return content[0] if content else None | return content[0] if content else None | ||||
@property | @property | ||||
def checksum(self): | def checksum(self): | ||||
# FIXME, return a Node object | # FIXME, return a Node object | ||||
return {k: v.hex() for (k, v) in self._node.hashes().items()} | return {k: v.hex() for (k, v) in self._node.hashes().items()} | ||||
@property | @property | ||||
def id(self): | def id(self): | ||||
return self._node.sha1_git | return self._node.sha1_git | ||||
@property | |||||
def data(self): | |||||
# FIXME, return a Node object | |||||
content_sha1 = self._node.hashes()["sha1"] | |||||
return {"raw": archive.Archive().get_content_data(content_sha1)} | |||||
anlambert: I think it will be safer to reference content data by a URL here instead of raw bytes, you… | |||||
Done Inline Actions I agree. Or is there a way to have a static URL instead, like an S3 one? jayeshv: I agree.
Do you think adding a link to https://archive.softwareheritage. | |||||
Not Done Inline Actions
I think so, this endpoint does not use SWHID by the way as they were not yet in place when it was implemented.
For the record, you can define default config values using Python code (see swh-web config.py).
Not that I know. anlambert: > Do you think adding a link to https://archive.softwareheritage.org/api/1/content/<swhid>/raw/… | |||||
Done Inline Actions
Ok. Here, too, the data is served by an application server (Django in this case), so too many requests could bring that down as well. About rate limiting, a basic rate limiter is in place for GraphQL and I'm planning to improve that soon. jayeshv: > > Do you think adding a link to https://archive.softwareheritage. | |||||
Done Inline Actions @anlambert For the time being I have added both the raw binary data and a URL to get file data. I have added the API URL in the code instead of a config for now. I will move that to a config in another diff that improves the config. jayeshv: @anlambert For the time being I have added both the raw binary data and a URL to get file data. | |||||
Not Done Inline Actions I still think that raw content data should not be included in GraphQL responses, as it remains a flaw that makes it easy to DDoS us with a single request. In Web API responses (be they REST or GraphQL) we should only return metadata about the archived objects; getting raw content data should be done separately. Anyway, do as you want, but my guess is that we will not keep that feature in the future.
Ideally the credits for getting a raw content should be proportional to its size in bytes. anlambert: I am still thinking that raw content data should not be included in GraphQL responses as it… | |||||
def is_type_of(self): | def is_type_of(self): | ||||
# is_type_of is required only when resolving a UNION type | # is_type_of is required only when resolving a UNION type | ||||
Not Done Inline Actions s/Conetent/Content/ anlambert: s/Conetent/Content/ | |||||
# This is for ariadne to return the right type | # This is for ariadne to return the right type | ||||
return "Content" | return "Content" | ||||
class ContentNode(BaseContentNode): | class ContentNode(BaseContentNode): | ||||
""" | """ | ||||
Node resolver for a content requested directly with its SWHID | Node resolver for a content requested directly with its SWHID | ||||
or with a contentHash (hash_type:hash_value) string | |||||
""" | """ | ||||
Not Done Inline Actions s/Conetent/Content/ anlambert: s/Conetent/Content/ | |||||
def _get_node_data(self): | def _get_node_data(self): | ||||
return self._get_content_by_id(self.kwargs.get("swhid").object_id) | if self.kwargs.get("swhid"): | ||||
# SWHID gets precedence in case both the arguments are available | |||||
hash_type, hash_value = "sha1_git", self.kwargs.get("swhid").object_id | |||||
Not Done Inline Actions s/Conetent/Content/ anlambert: s/Conetent/Content/ | |||||
elif self.kwargs.get("contentHash"): | |||||
hash_type, hash_string = self.kwargs.get("contentHash").split(":") | |||||
hash_value = hashutil.hash_to_bytes(hash_string) | |||||
else: | |||||
# raise ObjectNotFoundError in case both the arguments are missing | |||||
return None | |||||
return self._get_content_by_hash(hash_value=hash_value, hash_type=hash_type) | |||||
class TargetContentNode(BaseContentNode): | class TargetContentNode(BaseContentNode): | ||||
""" | """ | ||||
Node resolver for a content requested from a | Node resolver for a content requested from a | ||||
directory entry or from a release target | directory entry or from a release target | ||||
""" | """ | ||||
obj: Union[DirectoryEntryNode, BaseReleaseNode, SnapshotBranchNode] | obj: Union[DirectoryEntryNode, BaseReleaseNode, SnapshotBranchNode] | ||||
def _get_node_data(self): | def _get_node_data(self): | ||||
content_id = self.obj.target_hash | return self._get_content_by_hash(hash_value=self.obj.target_hash) | ||||
return self._get_content_by_id(content_id) |
I think it will be safer to reference content data by a URL here instead of raw bytes; you could use this Web API endpoint, for instance.
Fetching content data could put some pressure on our object storage depending on the input GraphQL query, and responses could also be very large depending on content sizes,
so it is better to offer links here.