diff --git a/docker/Dockerfile b/docker/Dockerfile index 4b59471..ec80cf5 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,56 +1,56 @@ FROM python:3.7 RUN export DEBIAN_FRONTEND=noninteractive && \ apt-get update && apt-get upgrade -y && \ apt-get install -y \ libapr1-dev \ libaprutil1-dev \ libpq-dev \ libsvn-dev \ libsystemd-dev \ postgresql-client \ wait-for-it \ ngrep && \ apt-get install -y --no-install-recommends \ r-base-core \ r-cran-jsonlite && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* RUN useradd -md /srv/softwareheritage -s /bin/bash swh USER swh RUN python3 -m venv /srv/softwareheritage/venv ENV PATH="/srv/softwareheritage/venv/bin:${PATH}" RUN pip install --upgrade pip setuptools wheel -RUN pip install gunicorn +RUN pip install gunicorn httpie ARG CASS_DRIVER_NO_CYTHON ENV CASS_DRIVER_NO_CYTHON ${CASS_DRIVER_NO_CYTHON} ARG CASS_DRIVER_BUILD_CONCURRENCY ENV CASS_DRIVER_BUILD_CONCURRENCY ${CASS_DRIVER_BUILD_CONCURRENCY:-1} RUN pip install cassandra-driver # Enforce installation of django 1 otherwise pip will choose django 2 when # installing the swh stack due to poor version dependency support in pip RUN pip install 'Django<2' RUN pip install \ swh-core[db,http] \ swh-deposit[server] \ swh-indexer \ swh-journal \ swh-lister \ swh-loader-core \ swh-loader-git \ swh-loader-mercurial \ swh-loader-svn \ swh-storage \ swh-objstorage \ swh-scheduler \ swh-vault \ swh-web COPY utils/*.sh /srv/softwareheritage/utils/ RUN mkdir -p /srv/softwareheritage/objects RUN rm -rd /srv/softwareheritage/.cache diff --git a/docker/tests/conftest.py b/docker/tests/conftest.py index f987692..c3f8605 100644 --- a/docker/tests/conftest.py +++ b/docker/tests/conftest.py @@ -1,126 +1,153 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import re import 
import subprocess
import time

import requests
import pytest
import testinfra

from os.path import join
from typing import Generator, Mapping, Tuple
from urllib.parse import urljoin

# Base URL of the swh-web API exposed by the docker-compose cluster.
APIURL = 'http://127.0.0.1:5080/api/1/'

# NOTE(review): the XML markup of this Atom deposit document appears to have
# been stripped during extraction; only text content survives. Restore the
# original <entry> markup from the repository before relying on this fixture.
SAMPLE_METADATA = '''\
Test Software swh test-software No One
'''


# scope='session' so we use the same container for all the tests;
@pytest.fixture(scope='session')
def docker_compose(request):
    """Bring the whole docker-compose cluster up for the test session."""
    # start the whole cluster
    subprocess.check_output(['docker-compose', 'up', '-d'])
    yield
    # and stop it
    subprocess.check_call(['docker-compose', 'down'])


@pytest.fixture
def scheduler_host(request, docker_compose):
    """Yield a testinfra connection to a throw-away swh-scheduler container.

    The container sleeps for 1h so test commands can be executed in it; it is
    force-removed when the fixture is torn down.
    """
    # run a container in which test commands are executed
    docker_id = subprocess.check_output(
        ['docker-compose', 'run', '-d', 'swh-scheduler', 'shell',
         'sleep', '1h']).decode().strip()
    scheduler_host = testinfra.get_host("docker://" + docker_id)
    # wait for the scheduler and storage RPC services to accept connections
    scheduler_host.check_output('wait-for-it swh-scheduler:5008 -t 30')
    scheduler_host.check_output('wait-for-it swh-storage:5002 -t 30')
    # return a testinfra connection to the container
    yield scheduler_host
    # at the end of the test suite, destroy the container
    subprocess.check_call(['docker', 'rm', '-f', docker_id])


# NOTE(review): the original comment here claimed scope='session', but the
# fixture is function-scoped (a fresh container per test) — confirm intent.
@pytest.fixture
def deposit_host(request, docker_compose):
    """Yield a testinfra connection to a throw-away swh-deposit container.

    Pre-populates /tmp with a sample archive and metadata file for deposit
    tests; the container is force-removed on teardown.
    """
    # run a container in which test commands are executed
    docker_id = subprocess.check_output(
        ['docker-compose', 'run', '-d', 'swh-deposit', 'shell',
         'sleep', '1h']).decode().strip()
    deposit_host = testinfra.get_host("docker://" + docker_id)
    deposit_host.check_output(
        'echo \'print("Hello World!")\n\' > /tmp/hello.py')
    deposit_host.check_output(
        'tar -C /tmp -czf /tmp/archive.tgz /tmp/hello.py')
    deposit_host.check_output(
        f'echo \'{SAMPLE_METADATA}\' > /tmp/metadata.xml')
    deposit_host.check_output('wait-for-it swh-deposit:5006 -t 30')
    # return a testinfra connection to the container
    yield deposit_host
    # at the end of the test suite, destroy the container
    subprocess.check_call(['docker', 'rm', '-f', docker_id])
@pytest.fixture
def git_url():
    """URL of a small public git repository used as the loading subject."""
    return 'https://forge.softwareheritage.org/source/swh-core'


@pytest.fixture
def git_origin(scheduler_host, git_url):
    """Schedule a load-git task for ``git_url`` and wait for it to succeed.

    Returns the origin URL once the task reports an eventful execution; fails
    the test if the task fails or does not complete within ~60 polls.
    """
    task = scheduler_host.check_output(
        'swh scheduler task add load-git '
        f'url={git_url}'
    )
    # NOTE(review): the regex named group was stripped during extraction;
    # '(?P<id>...)' is reconstructed from the .group('id') call below.
    taskid = re.search(r'^Task (?P<id>\d+)$', task,
                       flags=re.MULTILINE).group('id')
    assert int(taskid) > 0

    for i in range(60):
        status = scheduler_host.check_output(
            f'swh scheduler task list --list-runs --task-id {taskid}')
        # NOTE(review): while the task has no 'Executions:' section yet this
        # loop polls without sleeping — consider an unconditional sleep.
        if 'Executions:' in status:
            if '[eventful]' in status:
                break
            if '[started]' in status:
                time.sleep(1)
                continue
            if '[failed]' in status:
                loader_logs = subprocess.check_output(
                    ['docker-compose', 'logs', 'swh-loader'])
                assert False, ('Loading execution failed\n'
                               f'status: {status}\n'
                               f'loader logs: {loader_logs}')
            assert False, f'Loading execution failed, task status is {status}'
    return git_url


# Utility functions

def apiget(path: str, verb: str = 'GET', **kwargs):
    """Query the API at path and return the json result or raise an
    AssertionError"""
    url = urljoin(APIURL, path)
    resp = requests.request(verb, url, **kwargs)
    assert resp.status_code == 200, f'failed to retrieve {url} ({resp})'
    return resp.json()


def pollapi(path: str, verb: str = 'GET', **kwargs):
    """Poll the API at path until it returns an OK result"""
    url = urljoin(APIURL, path)
    for i in range(60):
        resp = requests.request(verb, url, **kwargs)
        if resp.ok:
            break
        time.sleep(1)
    else:
        # for/else: only reached when the loop exhausted without break
        assert False, f"Polling {url} failed"
    return resp


def getdirectory(dirid: str, currentpath: str = '') \
        -> Generator[Tuple[str, Mapping], None, None]:
    """Recursively retrieve directory description from the archive

    Yields (path, entry) pairs for every non-directory entry reachable from
    the directory identified by ``dirid``.
    """
    directory = apiget(f'directory/{dirid}')
    for direntry in directory:
        path = join(currentpath, direntry['name'])
        if direntry['type'] != 'dir':
            yield (path, direntry)
        else:
            # recurse into sub-directories via their target id
            yield from getdirectory(direntry['target'], path)
# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import io
import hashlib
import tarfile

from os.path import join
from urllib.parse import quote_plus

from .conftest import apiget, getdirectory, pollapi


def test_vault_directory(scheduler_host, git_origin):
    """End-to-end check of the vault directory cooker.

    Cooks the root directory of the ingested git origin, then verifies the
    resulting tarball against the directory tree reported by the archive API,
    and checks that re-cooking returns the cached result.
    """
    # retrieve the root directory of the master branch of the ingested git
    # repository (by the git_origin fixture)
    visit = apiget(f'origin/{quote_plus(git_origin)}/visit/latest')
    snapshot = apiget(f'snapshot/{visit["snapshot"]}')
    rev_id = snapshot["branches"]["refs/heads/master"]["target"]
    revision = apiget(f'revision/{rev_id}')
    dir_id = revision['directory']

    # now cook it
    cook = apiget(f'vault/directory/{dir_id}/', 'POST')
    assert cook['obj_type'] == 'directory'
    assert cook['obj_id'] == dir_id
    assert cook['fetch_url'].endswith(f'vault/directory/{dir_id}/raw/')

    # while it's cooking, get the directory tree from the archive
    directory = getdirectory(dir_id)

    # retrieve the cooked tar file
    resp = pollapi(f'vault/directory/{dir_id}/raw')
    # context manager closes the tarfile (the original leaked the handle)
    with tarfile.open(fileobj=io.BytesIO(resp.content)) as tarf:
        # and check the tarfile seems ok wrt. 'directory'
        assert tarf.getnames()[0] == dir_id
        tarfiles = {t.name: t for t in tarf.getmembers()}

        for fname, fdesc in directory:
            tfinfo = tarfiles.get(join(dir_id, fname))
            assert tfinfo, f"Missing path {fname} in retrieved tarfile"
            if fdesc['type'] == 'file':
                assert fdesc['length'] == tfinfo.size, \
                    f"File {fname}: length mismatch"
                fdata = tarf.extractfile(tfinfo).read()
                for algo in fdesc['checksums']:
                    # skip algorithms this hashlib build cannot compute
                    if algo not in hashlib.algorithms_available:
                        continue
                    # 'digest' rather than 'hash': avoid shadowing the builtin
                    digest = hashlib.new(algo, fdata).hexdigest()
                    assert digest == fdesc['checksums'][algo], \
                        f"File {fname}: {algo} mismatch"
            # XXX what to check for dir? symlink? (other?)

    # check that if we ask a second time this directory, it returns the same
    # and does not cook it again
    recook = apiget(f'vault/directory/{dir_id}/', 'POST')
    assert recook['obj_type'] == 'directory'
    assert recook['obj_id'] == dir_id
    assert recook['id'] == cook['id']
    assert recook['status'] == 'done'  # no need to wait for this to be true