diff --git a/swh/graphql/app.py b/swh/graphql/app.py
--- a/swh/graphql/app.py
+++ b/swh/graphql/app.py
@@ -39,4 +39,5 @@
     scalars.id_scalar,
     scalars.datetime_scalar,
     scalars.swhid_scalar,
+    scalars.content_hash_scalar,
 )
diff --git a/swh/graphql/backends/archive.py b/swh/graphql/backends/archive.py
--- a/swh/graphql/backends/archive.py
+++ b/swh/graphql/backends/archive.py
@@ -69,10 +69,6 @@
             directory_id, limit=first, page_token=after
         )
 
-    def get_content(self, content_id):
-        # FIXME, only for tests
-        return self.storage.content_find({"sha1_git": content_id})
-
     def is_object_available(self, object_id: str, object_type: ObjectType) -> bool:
         mapping = {
             ObjectType.CONTENT: self.storage.content_missing_per_sha1_git,
@@ -81,4 +77,10 @@
             ObjectType.REVISION: self.storage.revision_missing,
             ObjectType.SNAPSHOT: self.storage.snapshot_missing,
         }
-        return not mapping[object_type]([object_id])
+        return not list(mapping[object_type]([object_id]))
+
+    def get_contents(self, checksums: dict):
+        return self.storage.content_find(checksums)
+
+    def get_content_data(self, content_sha1):
+        return self.storage.content_get_data(content_sha1)
diff --git a/swh/graphql/resolvers/content.py b/swh/graphql/resolvers/content.py
--- a/swh/graphql/resolvers/content.py
+++ b/swh/graphql/resolvers/content.py
@@ -18,19 +18,41 @@
     Base resolver for all the content nodes
     """
 
-    def _get_content_by_id(self, content_id):
-        content = archive.Archive().get_content(content_id)
+    def _get_content_by_hash(self, checksums: dict):
+        content = archive.Archive().get_contents(checksums)
+        # in case of a conflict, return the first element
         return content[0] if content else None
 
     @property
     def checksum(self):
-        # FIXME, return a Node object
+        # FIXME, use a Node instead
         return {k: v.hex() for (k, v) in self._node.hashes().items()}
 
     @property
     def id(self):
         return self._node.sha1_git
 
+    @property
+    def data(self):
+        # FIXME, return a Node object
+        # FIXME, add more ways to retrieve data (eg: a static URL)
+        content_sha1 = self._node.hashes()["sha1"]
+        return {"raw": archive.Archive().get_content_data(content_sha1)}
+
+    # NOTE(review): "Conetent" looks like a typo of "Content", and none of these
+    # names match the schema fields (fileType, language, license) -- confirm
+    def ConetentFileType(self):
+        # FIXME, fetch data from the indexers
+        return None
+
+    def ConetentLanguage(self):
+        # FIXME, fetch data from the indexers
+        return None
+
+    def ConetentLicense(self):
+        # FIXME, fetch data from the indexers
+        return None
+
     def is_type_of(self):
         # is_type_of is required only when resolving a UNION type
         # This is for ariadne to return the right type
@@ -43,17 +65,27 @@
     """
 
     def _get_node_data(self):
-        return self._get_content_by_id(self.kwargs.get("swhid").object_id)
+        checksums = {"sha1_git": self.kwargs.get("swhid").object_id}
+        return self._get_content_by_hash(checksums)
+
+
+class HashContentNode(BaseContentNode):
+    """
+    Node resolver for a content requested with one or more checksums
+    """
+
+    def _get_node_data(self):
+        checksums = dict(self.kwargs.get("checksums"))
+        return self._get_content_by_hash(checksums)
 
 
 class TargetContentNode(BaseContentNode):
     """
-    Node resolver for a content requested from a
-    directory entry or from a release target
+    Node resolver for a content requested as a target
+    This request could be from directory entry, release or a branch
     """
 
     obj: Union[DirectoryEntryNode, BaseReleaseNode, SnapshotBranchNode]
 
     def _get_node_data(self):
-        content_id = self.obj.target_hash
-        return self._get_content_by_id(content_id)
+        return self._get_content_by_hash(checksums={"sha1_git": self.obj.target_hash})
diff --git a/swh/graphql/resolvers/resolver_factory.py b/swh/graphql/resolvers/resolver_factory.py
--- a/swh/graphql/resolvers/resolver_factory.py
+++ b/swh/graphql/resolvers/resolver_factory.py
@@ -3,7 +3,7 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-from .content import ContentNode, TargetContentNode
+from .content import ContentNode, HashContentNode, TargetContentNode
 from .directory import DirectoryNode, RevisionDirectoryNode, TargetDirectoryNode
 from .directory_entry import DirectoryEntryConnection
 from .origin import OriginConnection, OriginNode
@@ -49,6 +49,7 @@
         "release-content": TargetContentNode,
         "directory": DirectoryNode,
         "content": ContentNode,
+        "content-by-hash": HashContentNode,
         "dir-entry-dir": TargetDirectoryNode,
         "dir-entry-file": TargetContentNode,
         "search-result-snapshot": TargetSnapshotNode,
diff --git a/swh/graphql/resolvers/resolvers.py b/swh/graphql/resolvers/resolvers.py
--- a/swh/graphql/resolvers/resolvers.py
+++ b/swh/graphql/resolvers/resolvers.py
@@ -183,6 +183,14 @@
     return resolver(obj, info, **kw)
 
 
+@query.field("contentByHash")
+def content_by_hash_resolver(
+    obj: None, info: GraphQLResolveInfo, **kw
+) -> rs.content.HashContentNode:
+    resolver = get_node_resolver("content-by-hash")
+    return resolver(obj, info, **kw)
+
+
 # Connection resolvers
 # A connection resolver should return an instance of BaseConnection
 
diff --git a/swh/graphql/resolvers/scalars.py b/swh/graphql/resolvers/scalars.py
--- a/swh/graphql/resolvers/scalars.py
+++ b/swh/graphql/resolvers/scalars.py
@@ -8,12 +8,14 @@
 from ariadne import ScalarType
 
 from swh.graphql.utils import utils
+from swh.model import hashutil
 from swh.model.model import TimestampWithTimezone
 from swh.model.swhids import CoreSWHID
 
 datetime_scalar = ScalarType("DateTime")
 swhid_scalar = ScalarType("SWHID")
 id_scalar = ScalarType("ID")
+content_hash_scalar = ScalarType("ContentHash")
 
 
 @id_scalar.serializer
@@ -41,3 +43,20 @@
 @swhid_scalar.serializer
 def serialize_swhid(value):
     return str(value)
+
+
+@content_hash_scalar.value_parser
+def validate_content_hash(value):
+    """
+    Parse a "hash-type:hash-value" string into a (hash_type, hash_bytes) tuple
+    """
+    try:
+        hash_type, hash_string = value.split(":")
+        hash_value = hashutil.hash_to_bytes(hash_string)
+    except Exception as e:
+        # A malformed value can fail in several ways; all of them are
+        # reported to the client as the same error
+        # FIXME, log this error
+        raise AttributeError("Invalid content checksum") from e
+    # FIXME, add validation for the hash_type
+    return hash_type, hash_value
diff --git a/swh/graphql/schema/schema.graphql b/swh/graphql/schema/schema.graphql
--- a/swh/graphql/schema/schema.graphql
+++ b/swh/graphql/schema/schema.graphql
@@ -8,6 +8,11 @@
 """
 scalar DateTime
 
+"""
+Content identifier in the form hash-type:hash-value
+"""
+scalar ContentHash
+
 """
 Object with an id
 """
@@ -755,24 +760,49 @@
 }
 
 """
-An object with different checksums
+An object with different content checksums
 """
 type ContentChecksum {
+  blake2s256: String
+  sha1: String
+  sha1_git: String
+  sha256: String
+}
+
+"""
+Object with different content data representations
+"""
+type ContentData {
   """
+  Content as a base64 string
   """
-  blake2s256: String
+  raw: BinaryString
+}
 
+type ContentFileType {
   """
+  Detected content encoding
   """
-  sha1: String
+  encoding: String
 
   """
+  Detected MIME type of the content
   """
-  sha1_git: String
+  mimetype: String
+}
 
+type ContentLanguage {
   """
+  Detected programming language if any
   """
-  sha256: String
+  lang: String
+}
+
+type ContentLicense {
+  """
+  Array of strings containing the detected license names
+  """
+  licenses: [String]
 }
 
 """
@@ -803,6 +833,26 @@
   Content status, visible or hidden
   """
   status: String
+
+  """
+  File content
+  """
+  data: ContentData
+
+  """
+  Information about the content MIME type
+  """
+  fileType: ContentFileType
+
+  """
+  Information about the programming language used in the content
+  """
+  language: ContentLanguage
+
+  """
+  Information about the license of the content
+  """
+  license: ContentLicense
 }
 
 """
@@ -970,6 +1020,17 @@
     swhid: SWHID!
   ): Content
 
+  """
+  Get the content by one or more hashes
+  Use multiple hashes for an accurate result
+  """
+  contentByHash(
+    """
+    List of hashType:hashValue strings
+    """
+    checksums: [ContentHash]!
+  ): Content
+
   """
   Resolve the given SWHID to an object
   """
diff --git a/swh/graphql/tests/functional/test_content.py b/swh/graphql/tests/functional/test_content.py
new file mode 100644
--- /dev/null
+++ b/swh/graphql/tests/functional/test_content.py
@@ -0,0 +1,148 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import pytest
+
+from . import utils
+from ..data import get_contents
+
+
+@pytest.mark.parametrize("content", get_contents())
+def test_get_content_with_swhid(client, content):
+    query_str = """
+    {
+      content(swhid: "%s") {
+        swhid
+        checksum {
+          blake2s256
+          sha1
+          sha1_git
+          sha256
+        }
+        length
+        status
+        data {
+          raw {
+            text
+          }
+        }
+        fileType {
+          encoding
+        }
+        language {
+          lang
+        }
+        license {
+          licenses
+        }
+      }
+    }
+    """
+    data, _ = utils.get_query_response(client, query_str % content.swhid())
+    response = {
+        "swhid": str(content.swhid()),
+        "checksum": {
+            "blake2s256": content.blake2s256.hex(),
+            "sha1": content.sha1.hex(),
+            "sha1_git": content.sha1_git.hex(),
+            "sha256": content.sha256.hex(),
+        },
+        "length": content.length,
+        "status": content.status,
+        "data": {"raw": {"text": content.data.decode()}},
+        "fileType": None,
+        "language": None,
+        "license": None,
+    }
+    assert data["content"] == response
+
+
+@pytest.mark.parametrize("content", get_contents())
+def test_get_content_with_hash(client, content):
+    query_str = """
+    {
+      contentByHash(checksums: ["blake2s256:%s", "sha1:%s", "sha1_git:%s", "sha256:%s"]) {
+        swhid
+      }
+    }
+    """
+    data, _ = utils.get_query_response(
+        client,
+        query_str
+        % (
+            content.blake2s256.hex(),
+            content.sha1.hex(),
+            content.sha1_git.hex(),
+            content.sha256.hex(),
+        ),
+    )
+    assert data["contentByHash"] == {"swhid": str(content.swhid())}
+
+
+def test_get_content_with_invalid_swhid(client):
+    query_str = """
+    {
+      content(swhid: "swh:1:cnt:invalid") {
+        swhid
+      }
+    }
+    """
+    errors = utils.get_error_response(client, query_str)
+    # API will throw an error in case of an invalid SWHID
+    assert len(errors) == 1
+    assert "Invalid SWHID: invalid syntax" in errors[0]["message"]
+
+
+def test_get_content_with_invalid_hashes(client):
+    content = get_contents()[0]
+    query_str = """
+    {
+      contentByHash(checksums: ["blake2s256:%s", "sha1:%s", "sha1_git:%s", "sha256:%s"]) {
+        swhid
+      }
+    }
+    """
+    errors = utils.get_error_response(
+        client,
+        query_str
+        % (
+            "invalid",  # Only one hash is invalid
+            content.sha1.hex(),
+            content.sha1_git.hex(),
+            content.sha256.hex(),
+        ),
+    )
+    # API will throw an error in case of an invalid content hash
+    assert len(errors) == 1
+    assert "Invalid content checksum" in errors[0]["message"]
+
+
+def test_get_content_as_target(client):
+    # SWHID of a test dir with a file entry
+    directory_swhid = "swh:1:dir:87b339104f7dc2a8163dec988445e3987995545f"
+    query_str = """
+    {
+      directory(swhid: "%s") {
+        swhid
+        entries(first: 2) {
+          nodes {
+            type
+            target {
+              ...on Content {
+                swhid
+                length
+              }
+            }
+          }
+        }
+      }
+    }
+    """
+    data, _ = utils.get_query_response(client, query_str % directory_swhid)
+    content_obj = data["directory"]["entries"]["nodes"][1]["target"]
+    assert content_obj == {
+        "length": 4,
+        "swhid": "swh:1:cnt:86bc6b377e9d25f9d26777a4a28d08e63e7c5779",
+    }