diff --git a/swh/web/api/views/identifiers.py b/swh/web/api/views/identifiers.py
--- a/swh/web/api/views/identifiers.py
+++ b/swh/web/api/views/identifiers.py
@@ -3,10 +3,11 @@
 # License: GNU Affero General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-from swh.web.common import service
+from swh.web.common import service, utils
 from swh.web.common.utils import resolve_swh_persistent_id
 from swh.web.api.apidoc import api_doc, format_docstring
 from swh.web.api.apiurls import api_route
+from swh.model.identifiers import parse_persistent_identifier
 
 
 @api_route(r'/resolve/(?P.*)/',
@@ -61,3 +62,39 @@
     swh_id_data['browse_url'] = request.build_absolute_uri(
         swh_id_resolved['browse_url'])
     return swh_id_data
+
+
+@api_route(r'/known/',
+           'api-1-swh-pid-known', methods=['POST'])
+@api_doc('/known/', noargs=True, tags=['hidden'])
+@format_docstring()
+def api_swh_pid_known(request):
+    """
+    .. http:post:: /api/1/known/
+
+        Check if a list of Software Heritage persistent identifiers is
+        present in the archive depending on their id (sha1_git).
+
+        Returns:
+            A dictionary with:
+                keys(str): Persistent identifier
+                values(dict): A dictionary containing the key 'found'
+                (True if the pid is present, False otherwise)
+
+    """
+    response = {}
+
+    if request.method == 'POST':
+        persistent_ids = [parse_persistent_identifier(pid)
+                          for pid in request.data]
+
+        # group pids by their type
+        pids_by_type = utils.group_swh_persistent_identifiers(persistent_ids)
+        # search for hashes that are missing from the storage
+        missing_hashes = service.lookup_missing_hashes(pids_by_type)
+
+        # a pid is found when its hash is not reported as missing
+        response = {str(pid): {'found': pid.object_id not in missing_hashes}
+                    for pid in persistent_ids}
+
+    return response
diff --git a/swh/web/common/service.py b/swh/web/common/service.py
--- a/swh/web/common/service.py
+++ b/swh/web/common/service.py
@@ -1171,3 +1171,37 @@
     raise BadInputExc(('Invalid swh object type! Valid types are '
                        f'{CONTENT}, {DIRECTORY}, {RELEASE} '
                        f'{REVISION} or {SNAPSHOT}.'))
+
+
+def lookup_missing_hashes(grouped_pids):
+    """Lookup missing Software Heritage persistent identifier hashes,
+    using batch processing.
+
+    Args:
+        grouped_pids (dict): A dictionary with:
+            keys: persistent identifier type
+            values: list(bytes) persistent identifier hash
+
+    Returns:
+        A set(hexadecimal) of the hashes not found in the storage
+    """
+    missing_hashes = []
+
+    for obj_type, obj_ids in grouped_pids.items():
+        # each object type is checked against its own storage endpoint
+        if obj_type == CONTENT:
+            missing_hashes.append(
+                storage.content_missing_per_sha1_git(obj_ids))
+        elif obj_type == DIRECTORY:
+            missing_hashes.append(storage.directory_missing(obj_ids))
+        elif obj_type == REVISION:
+            missing_hashes.append(storage.revision_missing(obj_ids))
+        elif obj_type == RELEASE:
+            missing_hashes.append(storage.release_missing(obj_ids))
+        elif obj_type == SNAPSHOT:
+            missing_hashes.append(storage.snapshot_missing(obj_ids))
+
+    missing = set(map(hashutil.hash_to_hex,
+                      itertools.chain.from_iterable(missing_hashes)))
+
+    return missing
diff --git a/swh/web/common/utils.py b/swh/web/common/utils.py
--- a/swh/web/common/utils.py
+++ b/swh/web/common/utils.py
@@ -21,6 +21,7 @@
 from rest_framework.authentication import SessionAuthentication
 
 from swh.model.exceptions import ValidationError
+from swh.model.hashutil import hash_to_bytes
 from swh.model.identifiers import (
     persistent_identifier, parse_persistent_identifier,
     CONTENT, DIRECTORY, ORIGIN, RELEASE, REVISION, SNAPSHOT
@@ -386,3 +387,34 @@
     else:
         branch = None
     return branch
+
+
+def group_swh_persistent_identifiers(persistent_ids):
+    """
+    Groups many Software Heritage persistent identifiers into a
+    dictionary depending on their type.
+
+    Args:
+        persistent_ids (list): a list of Software Heritage persistent
+            identifier objects
+
+    Returns:
+        A dictionary with:
+            keys: persistent identifier types
+            values: list(bytes) persistent identifiers id
+
+    """
+    pids_by_type = {
+        CONTENT: [],
+        DIRECTORY: [],
+        REVISION: [],
+        RELEASE: [],
+        SNAPSHOT: []
+    }
+
+    for pid in persistent_ids:
+        obj_id = pid.object_id
+        obj_type = pid.object_type
+        pids_by_type[obj_type].append(hash_to_bytes(obj_id))
+
+    return pids_by_type
diff --git a/swh/web/tests/common/test_service.py b/swh/web/tests/common/test_service.py
--- a/swh/web/tests/common/test_service.py
+++ b/swh/web/tests/common/test_service.py
@@ -876,3 +876,43 @@
     with pytest.raises(BadInputExc) as e:
         service.lookup_object(SNAPSHOT, invalid_sha1)
     assert e.match('Invalid checksum')
+
+
+def test_lookup_missing_hashes_non_present():
+    missing_cnt = random_sha1()
+    missing_dir = random_sha1()
+    missing_rev = random_sha1()
+    missing_rel = random_sha1()
+    missing_snp = random_sha1()
+
+    grouped_pids = {
+        CONTENT: [hash_to_bytes(missing_cnt)],
+        DIRECTORY: [hash_to_bytes(missing_dir)],
+        REVISION: [hash_to_bytes(missing_rev)],
+        RELEASE: [hash_to_bytes(missing_rel)],
+        SNAPSHOT: [hash_to_bytes(missing_snp)],
+    }
+
+    actual_result = service.lookup_missing_hashes(grouped_pids)
+
+    assert actual_result == {missing_cnt, missing_dir, missing_rev,
+                             missing_rel, missing_snp}
+
+
+@given(content(), directory())
+def test_lookup_missing_hashes_some_present(archive_data, content, directory):
+    missing_rev = random_sha1()
+    missing_rel = random_sha1()
+    missing_snp = random_sha1()
+
+    grouped_pids = {
+        CONTENT: [hash_to_bytes(content['sha1_git'])],
+        DIRECTORY: [hash_to_bytes(directory)],
+        REVISION: [hash_to_bytes(missing_rev)],
+        RELEASE: [hash_to_bytes(missing_rel)],
+        SNAPSHOT: [hash_to_bytes(missing_snp)],
+    }
+
+    actual_result = service.lookup_missing_hashes(grouped_pids)
+
+    assert actual_result == {missing_rev, missing_rel, missing_snp}