diff --git a/docker/Dockerfile b/docker/Dockerfile
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -23,7 +23,7 @@
 RUN python3 -m venv /srv/softwareheritage/venv
 ENV PATH="/srv/softwareheritage/venv/bin:${PATH}"
 RUN pip install --upgrade pip setuptools wheel
-RUN pip install gunicorn
+RUN pip install gunicorn httpie
 
 ARG CASS_DRIVER_NO_CYTHON
 ENV CASS_DRIVER_NO_CYTHON ${CASS_DRIVER_NO_CYTHON}
diff --git a/docker/tests/conftest.py b/docker/tests/conftest.py
--- a/docker/tests/conftest.py
+++ b/docker/tests/conftest.py
@@ -9,6 +9,8 @@
 
 import requests
 
+from os.path import join
+from typing import Generator, Mapping, Tuple
 from urllib.parse import urljoin
 
 import pytest
@@ -124,3 +126,28 @@
     resp = requests.request(verb, url, **kwargs)
     assert resp.status_code == 200, f'failed to retrieve {url} ({resp})'
     return resp.json()
+
+
+def pollapi(path: str, verb: str = 'GET', **kwargs):
+    """Poll the API at path until it returns an OK result (up to 60s)"""
+    url = urljoin(APIURL, path)
+    for _ in range(60):
+        resp = requests.request(verb, url, **kwargs)
+        if resp.ok:
+            break
+        time.sleep(1)
+    else:
+        assert False, f"Polling {url} failed"
+    return resp
+
+
+def getdirectory(dirid: str, currentpath: str = '') \
+        -> Generator[Tuple[str, Mapping], None, None]:
+    """Recursively yield (path, entry) pairs for non-dir entries under dirid"""
+    directory = apiget(f'directory/{dirid}')
+    for direntry in directory:
+        path = join(currentpath, direntry['name'])
+        if direntry['type'] != 'dir':
+            yield (path, direntry)
+        else:
+            yield from getdirectory(direntry['target'], path)
diff --git a/docker/tests/test_vault.py b/docker/tests/test_vault.py
new file mode 100644
--- /dev/null
+++ b/docker/tests/test_vault.py
@@ -0,0 +1,63 @@
+# Copyright (C) 2019-2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import io
+import hashlib
+import tarfile
+
+from os.path import join
+from urllib.parse import quote_plus
+
+from .conftest import apiget, getdirectory, pollapi
+
+
+def test_vault_directory(scheduler_host, git_origin):
+    # retrieve the root directory of the master branch of the git repository
+    # ingested by the git_origin fixture
+    visit = apiget(f'origin/{quote_plus(git_origin)}/visit/latest')
+    snapshot = apiget(f'snapshot/{visit["snapshot"]}')
+    rev_id = snapshot["branches"]["refs/heads/master"]["target"]
+    revision = apiget(f'revision/{rev_id}')
+    dir_id = revision['directory']
+
+    # now ask the vault to cook it
+    cook = apiget(f'vault/directory/{dir_id}/', 'POST')
+    assert cook['obj_type'] == 'directory'
+    assert cook['obj_id'] == dir_id
+    assert cook['fetch_url'].endswith(f'vault/directory/{dir_id}/raw/')
+
+    # while it's cooking, get the directory tree from the archive
+    directory = getdirectory(dir_id)
+
+    # retrieve the cooked tar file
+    resp = pollapi(f'vault/directory/{dir_id}/raw')
+    tarf = tarfile.open(fileobj=io.BytesIO(resp.content))
+
+    # and check the tarfile is consistent with the 'directory' listing
+    assert tarf.getnames()[0] == dir_id
+    tarfiles = {t.name: t for t in tarf.getmembers()}
+
+    for fname, fdesc in directory:
+        tfinfo = tarfiles.get(join(dir_id, fname))
+        assert tfinfo, f"Missing path {fname} in retrieved tarfile"
+        if fdesc['type'] == 'file':
+            assert fdesc['length'] == tfinfo.size, \
+                f"File {fname}: length mismatch"
+            fdata = tarf.extractfile(tfinfo).read()
+            for algo in fdesc['checksums']:
+                if algo not in hashlib.algorithms_available:
+                    continue
+                hash = hashlib.new(algo, fdata).hexdigest()
+                assert hash == fdesc['checksums'][algo], \
+                    f"File {fname}: {algo} mismatch"
+        # XXX what to check for dir? symlink? (other?)
+
+    # check that asking for the same directory a second time returns the
+    # same cooking task and does not trigger a new cooking
+    recook = apiget(f'vault/directory/{dir_id}/', 'POST')
+    assert recook['obj_type'] == 'directory'
+    assert recook['obj_id'] == dir_id
+    assert recook['id'] == cook['id']
+    assert recook['status'] == 'done'  # no need to wait for this to be true
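
Note on polling: pollapi() waits for the raw tarball URL to become available.
An alternative, not part of this diff, would be to poll the cooking status
itself. The sketch below assumes that a plain GET on the same
vault/directory/{dir_id}/ endpoint reports the 'status' field that the recook
check at the end of the test relies on, and reuses the apiget helper from
conftest.py; wait_for_cooking is a hypothetical helper name.

    # sketch only -- assumes GET vault/directory/<dir_id>/ reports the cooking status
    import time

    from .conftest import apiget


    def wait_for_cooking(dir_id: str, timeout: int = 60) -> dict:
        """Wait until the vault reports the cooking of dir_id as 'done'."""
        for _ in range(timeout):
            task = apiget(f'vault/directory/{dir_id}/')
            if task['status'] == 'done':
                return task
            # a 'failed' cooking will never turn into 'done', so bail out early
            assert task['status'] != 'failed', f"cooking of {dir_id} failed"
            time.sleep(1)
        assert False, f"cooking of {dir_id} did not finish within {timeout}s"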
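
On the "XXX what to check for dir? symlink?" comment: one possible check is
sketched below, under the assumption that the directory listing returned by
the archive exposes the git mode of each entry in a 'perms' field, with
symlinks reported as type 'file' and mode 0o120000; check_member_kind is a
hypothetical helper, not part of this diff.

    # sketch only -- the 'perms' field and its symlink encoding are assumptions
    import stat
    import tarfile
    from typing import Mapping


    def check_member_kind(tfinfo: tarfile.TarInfo, fdesc: Mapping) -> None:
        """Check that the kind of tar member matches the archived entry."""
        if fdesc['type'] == 'file' and stat.S_ISLNK(fdesc.get('perms', 0)):
            # symlinks are stored as contents with git mode 0o120000
            assert tfinfo.issym(), f"{tfinfo.name} should be a symlink"
        elif fdesc['type'] == 'file':
            assert tfinfo.isfile(), f"{tfinfo.name} should be a regular file"
        # 'rev' entries (submodules) are not handled here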