diff --git a/swh/web/api/urls.py b/swh/web/api/urls.py
--- a/swh/web/api/urls.py
+++ b/swh/web/api/urls.py
@@ -13,6 +13,7 @@
 import swh.web.api.views.snapshot # noqa
 import swh.web.api.views.stat # noqa
 import swh.web.api.views.vault # noqa
+import swh.web.api.views.known # noqa
 
 from swh.web.api.apiurls import APIUrls
 
diff --git a/swh/web/api/views/known.py b/swh/web/api/views/known.py
new file mode 100644
--- /dev/null
+++ b/swh/web/api/views/known.py
@@ -0,0 +1,46 @@
+# Copyright (C) 2018-2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU Affero General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.web.common import service, utils
+from swh.web.api.apidoc import api_doc, format_docstring
+from swh.web.api.apiurls import api_route
+
+
+@api_route(r'/known/',
+           'api-1-swh-pid-known', methods=['POST'])
+@api_doc('/known/', noargs=True, tags=['hidden'])
+@format_docstring()
+def api_swh_pid_known(request):
+    """
+    .. http:post:: /api/1/known/
+
+        Check if a list of Software Heritage persistent identifiers is present
+        in the archive depending on their id (sha1_git).
+
+        Returns:
+            A dictionary with:
+                keys(str): Persistent identifier
+                values(dict): A dictionary containing the key 'found'. (True if
+                the pid is present, False otherwise)
+
+    """
+    if request.method == 'POST':
+        persistent_ids = request.data
+        response = {pid: {'found': False} for pid in persistent_ids}
+
+        # group pid by their type
+        pids_by_type = utils.group_swh_persistent_identifiers(persistent_ids)
+        # search for hash present in the storage
+        missing_hashes = service.lookup_missing_hashes(pids_by_type)
+
+        for pid, attr in response.items():
+            # NOTE(review): an identifier may presumably carry ';'-separated
+            # qualifiers after its hash; strip them before comparing, else
+            # the lookup below can never match a missing hash
+            obj_hash = pid.split(';', 1)[0].split(':')[3]
+            if obj_hash not in missing_hashes:
+                attr['found'] = True
+
+        return response
diff --git a/swh/web/common/service.py b/swh/web/common/service.py
--- a/swh/web/common/service.py
+++ b/swh/web/common/service.py
@@ -1171,3 +1171,37 @@
     raise BadInputExc(('Invalid swh object type! Valid types are '
                        f'{CONTENT}, {DIRECTORY}, {RELEASE} '
                        f'{REVISION} or {SNAPSHOT}.'))
+
+
+def lookup_missing_hashes(grouped_pids):
+    """Lookup missing Software Heritage persistent identifier hash, using
+    batch processing.
+
+    Args:
+        grouped_pids (dict): A dictionary with:
+            keys: persistent identifier type
+            values: list(bytes) persistent identifier hash
+
+    Returns:
+        A set(hexadecimal) of the hashes not found in the storage
+    """
+    missing_hashes = []
+
+    # each object type has its own dedicated "missing" storage endpoint
+    for obj_type, obj_ids in grouped_pids.items():
+        if obj_type == CONTENT:
+            missing_hashes.append(
+                storage.content_missing_per_sha1_git(obj_ids))
+        elif obj_type == DIRECTORY:
+            missing_hashes.append(storage.directory_missing(obj_ids))
+        elif obj_type == REVISION:
+            missing_hashes.append(storage.revision_missing(obj_ids))
+        elif obj_type == RELEASE:
+            missing_hashes.append(storage.release_missing(obj_ids))
+        elif obj_type == SNAPSHOT:
+            missing_hashes.append(storage.snapshot_missing(obj_ids))
+
+    missing = set(map(hashutil.hash_to_hex,
+                      itertools.chain.from_iterable(missing_hashes)))
+
+    return missing
diff --git a/swh/web/common/utils.py b/swh/web/common/utils.py
--- a/swh/web/common/utils.py
+++ b/swh/web/common/utils.py
@@ -21,6 +21,7 @@
 from rest_framework.authentication import SessionAuthentication
 
 from swh.model.exceptions import ValidationError
+from swh.model.hashutil import hash_to_bytes
 from swh.model.identifiers import (
     persistent_identifier, parse_persistent_identifier,
     CONTENT, DIRECTORY, ORIGIN, RELEASE, REVISION, SNAPSHOT
@@ -386,3 +387,48 @@
             else:
                 branch = None
     return branch
+
+
+def group_swh_persistent_identifiers(persistent_ids):
+    """
+    Groups many Software Heritage persistent identifiers into a
+    dictionary depending on their type.
+
+    Args:
+        persistent_ids (list): a list of Software Heritage persistent
+            identifiers
+
+    Returns:
+        A dictionary with:
+            keys: persistent identifier types
+            values: list(bytes) persistent identifiers id
+
+    Raises:
+        BadInputExc: if one of the provided identifiers is not valid or
+            has an object type that cannot be grouped
+    """
+    pids_by_type = {
+        CONTENT: [],
+        DIRECTORY: [],
+        REVISION: [],
+        RELEASE: [],
+        SNAPSHOT: []
+    }
+
+    try:
+        for pid in persistent_ids:
+            parsed_pid = parse_persistent_identifier(pid)
+            obj_id = parsed_pid.object_id
+            obj_type = parsed_pid.object_type
+            # a well-formed identifier can still have a type with no group
+            # above (ORIGIN is imported alongside the others, for instance):
+            # report it as bad input rather than letting a KeyError escape
+            if obj_type not in pids_by_type:
+                raise BadInputExc('Error when parsing identifier: '
+                                  'unsupported object type in %s' % pid)
+            pids_by_type[obj_type].append(hash_to_bytes(obj_id))
+    except ValidationError as v:
+        raise BadInputExc('Error when parsing identifier: %s' %
+                          ' '.join(v.messages))
+
+    return pids_by_type
diff --git a/swh/web/tests/common/test_service.py b/swh/web/tests/common/test_service.py
--- a/swh/web/tests/common/test_service.py
+++ b/swh/web/tests/common/test_service.py
@@ -876,3 +876,24 @@
     with pytest.raises(BadInputExc) as e:
         service.lookup_object(SNAPSHOT, invalid_sha1)
     assert e.match('Invalid checksum')
+
+
+def test_lookup_missing_hashes():
+    missing_cnt = random_sha1()
+    missing_dir = random_sha1()
+    missing_rev = random_sha1()
+    missing_rel = random_sha1()
+    missing_snp = random_sha1()
+
+    grouped_pids = {
+        CONTENT: [hash_to_bytes(missing_cnt)],
+        DIRECTORY: [hash_to_bytes(missing_dir)],
+        REVISION: [hash_to_bytes(missing_rev)],
+        RELEASE: [hash_to_bytes(missing_rel)],
+        SNAPSHOT: [hash_to_bytes(missing_snp)],
+    }
+
+    actual_result = service.lookup_missing_hashes(grouped_pids)
+
+    assert actual_result == {missing_cnt, missing_dir, missing_rev,
+                             missing_rel, missing_snp}
diff --git a/swh/web/tests/common/test_utils.py b/swh/web/tests/common/test_utils.py
--- a/swh/web/tests/common/test_utils.py
+++ b/swh/web/tests/common/test_utils.py
@@ -112,3 +112,19 @@
     with pytest.raises(BadInputExc) as e:
         utils.get_swh_persistent_id(swh_object_type, 'not a valid id')
     assert e.match('Invalid object')
+
+
+def test_group_swh_persistent_identifiers_bad_input():
+    sha1_git = 'aafb16d69fd30ff58afdd69036a26047f3aebdc6'
+    invalid_pid_sha1 = ['swh:1:cnt:aafb16d69fd30ff58afdd69036a26047f3aebdc;']
+    invalid_pid_type = ['swh:1:dri:%s' % sha1_git]
+    unsupported_pid_type = ['swh:1:ori:%s' % sha1_git]
+
+    with pytest.raises(BadInputExc):
+        utils.group_swh_persistent_identifiers(invalid_pid_sha1)
+
+    with pytest.raises(BadInputExc):
+        utils.group_swh_persistent_identifiers(invalid_pid_type)
+
+    with pytest.raises(BadInputExc):
+        utils.group_swh_persistent_identifiers(unsupported_pid_type)