diff --git a/swh/web/api/views/identifiers.py b/swh/web/api/views/identifiers.py
--- a/swh/web/api/views/identifiers.py
+++ b/swh/web/api/views/identifiers.py
@@ -3,8 +3,11 @@
 # License: GNU Affero General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-from swh.web.common import service
-from swh.web.common.utils import resolve_swh_persistent_id
+from swh.web.common import service, utils
+from swh.web.common.utils import (
+    resolve_swh_persistent_id,
+    get_persistent_identifier
+)
 from swh.web.api.apidoc import api_doc, format_docstring
 from swh.web.api.apiurls import api_route
 
@@ -61,3 +64,41 @@
     swh_id_data['browse_url'] = request.build_absolute_uri(
         swh_id_resolved['browse_url'])
     return swh_id_data
+
+
+@api_route(r'/known/',
+           'api-1-swh-pid-known', methods=['POST'])
+@api_doc('/known/', noargs=True, tags=['hidden'])
+@format_docstring()
+def api_swh_pid_known(request):
+    """
+    .. http:post:: /api/1/known/
+
+        Check if a list of Software Heritage persistent identifiers is
+        present in the archive, based on their id (sha1_git).
+
+        Returns:
+            A dictionary with:
+                keys(str): Persistent identifier
+                values(dict): A dictionary containing the key 'known'. (True if
+                    the pid is present, False otherwise)
+
+    """
+    response = {}
+
+    if request.method == 'POST':
+        persistent_ids = [get_persistent_identifier(pid)
+                          for pid in request.data]
+
+        response = {str(pid): {'known': False} for pid in persistent_ids}
+
+        # group pids by their type
+        pids_by_type = utils.group_swh_persistent_identifiers(persistent_ids)
+        # search for hashes not present in the storage
+        missing_hashes = service.lookup_missing_hashes(pids_by_type)
+
+        for pid in persistent_ids:
+            if pid.object_id not in missing_hashes:
+                response[str(pid)]['known'] = True
+
+    return response
diff --git a/swh/web/common/service.py b/swh/web/common/service.py
--- a/swh/web/common/service.py
+++ b/swh/web/common/service.py
@@ -8,7 +8,7 @@
 import re
 
 from collections import defaultdict
-from typing import Any, Dict
+from typing import Any, Dict, List, Set
 
 from swh.model import hashutil
 
@@ -1171,3 +1171,35 @@
     raise BadInputExc(('Invalid swh object type! Valid types are '
                        f'{CONTENT}, {DIRECTORY}, {RELEASE} '
                        f'{REVISION} or {SNAPSHOT}.'))
+
+
+def lookup_missing_hashes(grouped_pids: Dict[str, List[bytes]]) -> Set[str]:
+    """Lookup missing Software Heritage persistent identifier hashes, using
+    batch processing.
+
+    Args:
+        grouped_pids: A dictionary with:
+            keys: persistent identifier type
+            values: list(bytes) persistent identifier hash
+    Returns:
+        A set(hexadecimal) of the hashes not found in the storage
+    """
+    missing_hashes = []
+
+    for obj_type, obj_ids in grouped_pids.items():
+        if obj_type == CONTENT:
+            missing_hashes.append(
+                storage.content_missing_per_sha1_git(obj_ids))
+        elif obj_type == DIRECTORY:
+            missing_hashes.append(storage.directory_missing(obj_ids))
+        elif obj_type == REVISION:
+            missing_hashes.append(storage.revision_missing(obj_ids))
+        elif obj_type == RELEASE:
+            missing_hashes.append(storage.release_missing(obj_ids))
+        elif obj_type == SNAPSHOT:
+            missing_hashes.append(storage.snapshot_missing(obj_ids))
+
+    missing = set(map(lambda x: hashutil.hash_to_hex(x),
+                      itertools.chain(*missing_hashes)))
+
+    return missing
diff --git a/swh/web/common/utils.py b/swh/web/common/utils.py
--- a/swh/web/common/utils.py
+++ b/swh/web/common/utils.py
@@ -21,6 +21,7 @@
 from rest_framework.authentication import SessionAuthentication
 
 from swh.model.exceptions import ValidationError
+from swh.model.hashutil import hash_to_bytes
 from swh.model.identifiers import (
     persistent_identifier, parse_persistent_identifier,
     CONTENT, DIRECTORY, ORIGIN, RELEASE, REVISION, SNAPSHOT
@@ -244,58 +245,51 @@
             * **swh_id_parsed (swh.model.identifiers.PersistentId)**: the
               parsed identifier
             * **browse_url (str)**: the url for browsing the pointed object
-
-    Raises:
-        BadInputExc: if the provided identifier can not be parsed
     """
-    try:
-        swh_id_parsed = parse_persistent_identifier(swh_id)
-        object_type = swh_id_parsed.object_type
-        object_id = swh_id_parsed.object_id
-        browse_url = None
-        query_dict = QueryDict('', mutable=True)
-        if query_params and len(query_params) > 0:
-            for k in sorted(query_params.keys()):
-                query_dict[k] = query_params[k]
-        if 'origin' in swh_id_parsed.metadata:
-            query_dict['origin'] = swh_id_parsed.metadata['origin']
-        if object_type == CONTENT:
-            query_string = 'sha1_git:' + object_id
-            fragment = ''
-            if 'lines' in swh_id_parsed.metadata:
-                lines = swh_id_parsed.metadata['lines'].split('-')
-                fragment += '#L' + lines[0]
-                if len(lines) > 1:
-                    fragment += '-L' + lines[1]
-            browse_url = reverse('browse-content',
-                                 url_args={'query_string': query_string},
-                                 query_params=query_dict) + fragment
-        elif object_type == DIRECTORY:
-            browse_url = reverse('browse-directory',
-                                 url_args={'sha1_git': object_id},
-                                 query_params=query_dict)
-        elif object_type == RELEASE:
-            browse_url = reverse('browse-release',
-                                 url_args={'sha1_git': object_id},
-                                 query_params=query_dict)
-        elif object_type == REVISION:
-            browse_url = reverse('browse-revision',
-                                 url_args={'sha1_git': object_id},
-                                 query_params=query_dict)
-        elif object_type == SNAPSHOT:
-            browse_url = reverse('browse-snapshot',
-                                 url_args={'snapshot_id': object_id},
-                                 query_params=query_dict)
-        elif object_type == ORIGIN:
-            raise BadInputExc(('Origin PIDs (Persistent Identifiers) are not '
-                               'publicly resolvable because they are for '
-                               'internal usage only'))
-    except ValidationError as ve:
-        raise BadInputExc('Error when parsing identifier. %s' %
-                          ' '.join(ve.messages))
-    else:
-        return {'swh_id_parsed': swh_id_parsed,
-                'browse_url': browse_url}
+    swh_id_parsed = get_persistent_identifier(swh_id)
+    object_type = swh_id_parsed.object_type
+    object_id = swh_id_parsed.object_id
+    browse_url = None
+    query_dict = QueryDict('', mutable=True)
+    if query_params and len(query_params) > 0:
+        for k in sorted(query_params.keys()):
+            query_dict[k] = query_params[k]
+    if 'origin' in swh_id_parsed.metadata:
+        query_dict['origin'] = swh_id_parsed.metadata['origin']
+    if object_type == CONTENT:
+        query_string = 'sha1_git:' + object_id
+        fragment = ''
+        if 'lines' in swh_id_parsed.metadata:
+            lines = swh_id_parsed.metadata['lines'].split('-')
+            fragment += '#L' + lines[0]
+            if len(lines) > 1:
+                fragment += '-L' + lines[1]
+        browse_url = reverse('browse-content',
+                             url_args={'query_string': query_string},
+                             query_params=query_dict) + fragment
+    elif object_type == DIRECTORY:
+        browse_url = reverse('browse-directory',
+                             url_args={'sha1_git': object_id},
+                             query_params=query_dict)
+    elif object_type == RELEASE:
+        browse_url = reverse('browse-release',
+                             url_args={'sha1_git': object_id},
+                             query_params=query_dict)
+    elif object_type == REVISION:
+        browse_url = reverse('browse-revision',
+                             url_args={'sha1_git': object_id},
+                             query_params=query_dict)
+    elif object_type == SNAPSHOT:
+        browse_url = reverse('browse-snapshot',
+                             url_args={'snapshot_id': object_id},
+                             query_params=query_dict)
+    elif object_type == ORIGIN:
+        raise BadInputExc(('Origin PIDs (Persistent Identifiers) are not '
+                           'publicly resolvable because they are for '
+                           'internal usage only'))
+
+    return {'swh_id_parsed': swh_id_parsed,
+            'browse_url': browse_url}
 
 
 def parse_rst(text, report_level=2):
@@ -386,3 +380,60 @@
         else:
             branch = None
     return branch
+
+
+def get_persistent_identifier(persistent_id):
+    """Check if a persistent identifier is valid and parse it.
+
+    Args:
+        persistent_id: A string representing a Software Heritage
+            persistent identifier.
+
+    Raises:
+        BadInputExc: if the provided persistent identifier can
+            not be parsed.
+
+    Returns:
+        A persistent identifier object.
+    """
+    try:
+        pid_object = parse_persistent_identifier(persistent_id)
+    except ValidationError as ve:
+        raise BadInputExc('Error when parsing identifier: %s' %
+                          ' '.join(ve.messages))
+    else:
+        return pid_object
+
+
+def group_swh_persistent_identifiers(persistent_ids):
+    """
+    Groups many Software Heritage persistent identifiers into a
+    dictionary depending on their type.
+
+    Args:
+        persistent_ids (list): a list of Software Heritage persistent
+            identifier objects
+
+    Returns:
+        A dictionary with:
+            keys: persistent identifier types
+            values: list(bytes) persistent identifiers id
+
+    Raises:
+        BadInputExc: if one of the provided persistent identifiers can
+            not be parsed.
+ """ + pids_by_type = { + CONTENT: [], + DIRECTORY: [], + REVISION: [], + RELEASE: [], + SNAPSHOT: [] + } + + for pid in persistent_ids: + obj_id = pid.object_id + obj_type = pid.object_type + pids_by_type[obj_type].append(hash_to_bytes(obj_id)) + + return pids_by_type diff --git a/swh/web/tests/api/views/test_identifiers.py b/swh/web/tests/api/views/test_identifiers.py --- a/swh/web/tests/api/views/test_identifiers.py +++ b/swh/web/tests/api/views/test_identifiers.py @@ -10,6 +10,7 @@ ) from swh.web.common.utils import reverse +from swh.web.tests.data import random_sha1 from swh.web.tests.strategies import ( content, directory, origin, release, revision, snapshot, unknown_content, unknown_directory, unknown_release, @@ -95,3 +96,47 @@ url = reverse('api-1-resolve-swh-pid', url_args={'swh_id': ori_pid}) resp = api_client.get(url) assert resp.status_code == 400, resp.data + + +@given(content(), directory()) +def test_api_known_swhpid_some_present(api_client, content, directory): + content_ = 'swh:1:cnt:%s' % content['sha1_git'] + directory_ = 'swh:1:dir:%s' % directory + unknown_revision_ = 'swh:1:rev:%s' % random_sha1() + unknown_release_ = 'swh:1:rel:%s' % random_sha1() + unknown_snapshot_ = 'swh:1:snp:%s' % random_sha1() + + input_pids = [content_, directory_, unknown_revision_, + unknown_release_, unknown_snapshot_] + + url = reverse('api-1-swh-pid-known') + + resp = api_client.post(url, data=input_pids, format='json', + HTTP_ACCEPT='application/json') + + assert resp.status_code == 200, resp.data + assert resp['Content-Type'] == 'application/json' + assert resp.data == { + content_: {'known': True}, + directory_: {'known': True}, + unknown_revision_: {'known': False}, + unknown_release_: {'known': False}, + unknown_snapshot_: {'known': False} + } + + +def test_api_known_invalid_swhpid(api_client): + invalid_pid_sha1 = ['swh:1:cnt:8068d0075010b590762c6cb5682ed53cb3c13de;'] + invalid_pid_type = ['swh:1:cnn:8068d0075010b590762c6cb5682ed53cb3c13deb'] + + url = reverse('api-1-swh-pid-known') + + resp = api_client.post(url, data=invalid_pid_sha1, format='json', + HTTP_ACCEPT='application/json') + + assert resp.status_code == 400, resp.data + + resp2 = api_client.post(url, data=invalid_pid_type, format='json', + HTTP_ACCEPT='application/json') + + assert resp2.status_code == 400, resp.data diff --git a/swh/web/tests/common/test_service.py b/swh/web/tests/common/test_service.py --- a/swh/web/tests/common/test_service.py +++ b/swh/web/tests/common/test_service.py @@ -876,3 +876,43 @@ with pytest.raises(BadInputExc) as e: service.lookup_object(SNAPSHOT, invalid_sha1) assert e.match('Invalid checksum') + + +def test_lookup_missing_hashes_non_present(): + missing_cnt = random_sha1() + missing_dir = random_sha1() + missing_rev = random_sha1() + missing_rel = random_sha1() + missing_snp = random_sha1() + + grouped_pids = { + CONTENT: [hash_to_bytes(missing_cnt)], + DIRECTORY: [hash_to_bytes(missing_dir)], + REVISION: [hash_to_bytes(missing_rev)], + RELEASE: [hash_to_bytes(missing_rel)], + SNAPSHOT: [hash_to_bytes(missing_snp)], + } + + actual_result = service.lookup_missing_hashes(grouped_pids) + + assert actual_result == {missing_cnt, missing_dir, missing_rev, + missing_rel, missing_snp} + + +@given(content(), directory()) +def test_lookup_missing_hashes_some_present(archive_data, content, directory): + missing_rev = random_sha1() + missing_rel = random_sha1() + missing_snp = random_sha1() + + grouped_pids = { + CONTENT: [hash_to_bytes(content['sha1_git'])], + DIRECTORY: 
+        REVISION: [hash_to_bytes(missing_rev)],
+        RELEASE: [hash_to_bytes(missing_rel)],
+        SNAPSHOT: [hash_to_bytes(missing_snp)],
+    }
+
+    actual_result = service.lookup_missing_hashes(grouped_pids)
+
+    assert actual_result == {missing_rev, missing_rel, missing_snp}
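
Reviewer note (not part of the patch): a minimal sketch of how a client could exercise the new /api/1/known/ endpoint once this change is deployed. The URL path and the response shape follow the api_swh_pid_known docstring above; the host, the use of the requests library and the example identifiers are illustrative assumptions, not something this diff ships.

    import requests

    # Persistent identifiers to check; these values are placeholders.
    pids = [
        'swh:1:cnt:8068d0075010b590762c6cb5682ed53cb3c13deb',
        'swh:1:rev:0000000000000000000000000000000000000000',
    ]

    # POST the list as JSON; the response maps each pid to a dict with a
    # boolean 'known' entry, as described in the endpoint docstring.
    resp = requests.post('https://archive.softwareheritage.org/api/1/known/',
                         json=pids,
                         headers={'Accept': 'application/json'})
    resp.raise_for_status()
    for pid, status in resp.json().items():
        print(pid, status['known'])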