diff --git a/swh/web/api/views/identifiers.py b/swh/web/api/views/identifiers.py
--- a/swh/web/api/views/identifiers.py
+++ b/swh/web/api/views/identifiers.py
@@ -3,10 +3,13 @@
 # License: GNU Affero General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-from swh.web.common import service
+from swh.web.common import service, utils
 from swh.web.common.utils import resolve_swh_persistent_id
+from swh.web.common.exc import BadInputExc
 from swh.web.api.apidoc import api_doc, format_docstring
 from swh.web.api.apiurls import api_route
+from swh.model.identifiers import parse_persistent_identifier
+from swh.model.exceptions import ValidationError
 
 
 @api_route(r'/resolve/(?P<swh_id>.*)/',
@@ -61,3 +64,45 @@
         swh_id_data['browse_url'] = request.build_absolute_uri(
             swh_id_resolved['browse_url'])
     return swh_id_data
+
+
+@api_route(r'/known/',
+           'api-1-swh-pid-known', methods=['POST'])
+@api_doc('/known/', noargs=True, tags=['hidden'])
+@format_docstring()
+def api_swh_pid_known(request):
+    """
+    .. http:post:: /api/1/known/
+
+        Check if a list of Software Heritage persistent identifiers is
+        present in the archive, depending on their id (sha1_git).
+
+        Returns:
+            A dictionary with:
+                keys(str): Persistent identifier
+                values(dict): A dictionary containing the key 'known'
+                    (true if the pid is present, false otherwise)
+
+    """
+    response = {}
+
+    if request.method == 'POST':
+        try:
+            persistent_ids = [parse_persistent_identifier(pid)
+                              for pid in request.data]
+        except ValidationError as e:
+            raise BadInputExc('Error when parsing identifier: %s' %
+                              ' '.join(e.messages))
+
+        response = {str(pid): {'known': False} for pid in persistent_ids}
+
+        # group pids by their type
+        pids_by_type = utils.group_swh_persistent_identifiers(persistent_ids)
+        # search for hashes not present in the storage
+        missing_hashes = service.lookup_missing_hashes(pids_by_type)
+
+        for pid in persistent_ids:
+            if pid.object_id not in missing_hashes:
+                response[str(pid)]['known'] = True
+
+    return response
diff --git a/swh/web/common/service.py b/swh/web/common/service.py
--- a/swh/web/common/service.py
+++ b/swh/web/common/service.py
@@ -8,7 +8,7 @@
 import re
 
 from collections import defaultdict
 
-from typing import Any, Dict
+from typing import Any, Dict, List, Set
 
 from swh.model import hashutil
@@ -1171,3 +1171,35 @@
     raise BadInputExc(('Invalid swh object type! Valid types are '
                        f'{CONTENT}, {DIRECTORY}, {RELEASE} '
                        f'{REVISION} or {SNAPSHOT}.'))
+
+
+def lookup_missing_hashes(grouped_pids: Dict[str, List[bytes]]) -> Set[str]:
+    """Lookup missing Software Heritage persistent identifier hash, using
+    batch processing.
+
+    Args:
+        A dictionary with:
+            keys: persistent identifier type
+            values: list(bytes) persistent identifier hash
+    Returns:
+        A set(hexadecimal) of the hashes not found in the storage
+    """
+    missing_hashes = []
+
+    for obj_type, obj_ids in grouped_pids.items():
+        if obj_type == CONTENT:
+            missing_hashes.append(
+                storage.content_missing_per_sha1_git(obj_ids))
+        elif obj_type == DIRECTORY:
+            missing_hashes.append(storage.directory_missing(obj_ids))
+        elif obj_type == REVISION:
+            missing_hashes.append(storage.revision_missing(obj_ids))
+        elif obj_type == RELEASE:
+            missing_hashes.append(storage.release_missing(obj_ids))
+        elif obj_type == SNAPSHOT:
+            missing_hashes.append(storage.snapshot_missing(obj_ids))
+
+    missing = set(map(hashutil.hash_to_hex,
+                      itertools.chain(*missing_hashes)))
+
+    return missing
diff --git a/swh/web/common/utils.py b/swh/web/common/utils.py
--- a/swh/web/common/utils.py
+++ b/swh/web/common/utils.py
@@ -21,6 +21,7 @@
 from rest_framework.authentication import SessionAuthentication
 
 from swh.model.exceptions import ValidationError
+from swh.model.hashutil import hash_to_bytes
 from swh.model.identifiers import (
     persistent_identifier, parse_persistent_identifier,
     CONTENT, DIRECTORY, ORIGIN, RELEASE, REVISION, SNAPSHOT
@@ -386,3 +387,37 @@
     else:
         branch = None
     return branch
+
+
+def group_swh_persistent_identifiers(persistent_ids):
+    """
+    Groups many Software Heritage persistent identifiers into a
+    dictionary depending on their type.
+
+    Args:
+        persistent_ids (list): a list of Software Heritage persistent
+            identifier objects
+
+    Returns:
+        A dictionary with:
+            keys: persistent identifier types
+            values: list(bytes) persistent identifiers id
+
+    Raises:
+        KeyError: if one of the provided persistent identifiers has a
+            type that is not handled here (e.g. origin)
+    """
+    pids_by_type = {
+        CONTENT: [],
+        DIRECTORY: [],
+        REVISION: [],
+        RELEASE: [],
+        SNAPSHOT: []
+    }
+
+    for pid in persistent_ids:
+        obj_id = pid.object_id
+        obj_type = pid.object_type
+        pids_by_type[obj_type].append(hash_to_bytes(obj_id))
+
+    return pids_by_type
diff --git a/swh/web/tests/api/views/test_identifiers.py b/swh/web/tests/api/views/test_identifiers.py
--- a/swh/web/tests/api/views/test_identifiers.py
+++ b/swh/web/tests/api/views/test_identifiers.py
@@ -10,6 +10,7 @@
 )
 
 from swh.web.common.utils import reverse
+from swh.web.tests.data import random_sha1
 from swh.web.tests.strategies import (
     content, directory, origin, release, revision, snapshot,
     unknown_content, unknown_directory, unknown_release,
@@ -95,3 +96,47 @@
                   url_args={'swh_id': ori_pid})
     resp = api_client.get(url)
     assert resp.status_code == 400, resp.data
+
+
+@given(content(), directory())
+def test_api_known_swhpid_some_present(api_client, content, directory):
+    content_ = 'swh:1:cnt:%s' % content['sha1_git']
+    directory_ = 'swh:1:dir:%s' % directory
+    unknown_revision_ = 'swh:1:rev:%s' % random_sha1()
+    unknown_release_ = 'swh:1:rel:%s' % random_sha1()
+    unknown_snapshot_ = 'swh:1:snp:%s' % random_sha1()
+
+    input_pids = [content_, directory_, unknown_revision_,
+                  unknown_release_, unknown_snapshot_]
+
+    url = reverse('api-1-swh-pid-known')
+
+    resp = api_client.post(url, data=input_pids, format='json',
+                           HTTP_ACCEPT='application/json')
+
+    assert resp.status_code == 200, resp.data
+    assert resp['Content-Type'] == 'application/json'
+    assert resp.data == {
+        content_: {'known': True},
+        directory_: {'known': True},
+        unknown_revision_: {'known': False},
+        unknown_release_: {'known': False},
+        unknown_snapshot_: {'known': False}
+    }
+
+
+def test_api_known_invalid_swhpid(api_client):
+    invalid_pid_sha1 = ['swh:1:cnt:8068d0075010b590762c6cb5682ed53cb3c13de;']
+    invalid_pid_type = ['swh:1:cnn:8068d0075010b590762c6cb5682ed53cb3c13deb']
+
+    url = reverse('api-1-swh-pid-known')
+
+    resp = api_client.post(url, data=invalid_pid_sha1, format='json',
+                           HTTP_ACCEPT='application/json')
+
+    assert resp.status_code == 400, resp.data
+
+    resp2 = api_client.post(url, data=invalid_pid_type, format='json',
+                            HTTP_ACCEPT='application/json')
+
+    assert resp2.status_code == 400, resp2.data
diff --git a/swh/web/tests/common/test_service.py b/swh/web/tests/common/test_service.py
--- a/swh/web/tests/common/test_service.py
+++ b/swh/web/tests/common/test_service.py
@@ -876,3 +876,43 @@
     with pytest.raises(BadInputExc) as e:
         service.lookup_object(SNAPSHOT, invalid_sha1)
     assert e.match('Invalid checksum')
+
+
+def test_lookup_missing_hashes_non_present():
+    missing_cnt = random_sha1()
+    missing_dir = random_sha1()
+    missing_rev = random_sha1()
+    missing_rel = random_sha1()
+    missing_snp = random_sha1()
+
+    grouped_pids = {
+        CONTENT: [hash_to_bytes(missing_cnt)],
+        DIRECTORY: [hash_to_bytes(missing_dir)],
+        REVISION: [hash_to_bytes(missing_rev)],
+        RELEASE: [hash_to_bytes(missing_rel)],
+        SNAPSHOT: [hash_to_bytes(missing_snp)],
+    }
+
+    actual_result = service.lookup_missing_hashes(grouped_pids)
+
+    assert actual_result == {missing_cnt, missing_dir, missing_rev,
+                             missing_rel, missing_snp}
+
+
+@given(content(), directory())
+def test_lookup_missing_hashes_some_present(archive_data, content, directory):
+    missing_rev = random_sha1()
+    missing_rel = random_sha1()
+    missing_snp = random_sha1()
+
+    grouped_pids = {
+        CONTENT: [hash_to_bytes(content['sha1_git'])],
+        DIRECTORY: [hash_to_bytes(directory)],
+        REVISION: [hash_to_bytes(missing_rev)],
+        RELEASE: [hash_to_bytes(missing_rel)],
+        SNAPSHOT: [hash_to_bytes(missing_snp)],
+    }
+
+    actual_result = service.lookup_missing_hashes(grouped_pids)
+
+    assert actual_result == {missing_rev, missing_rel, missing_snp}