diff --git a/docker/Dockerfile b/docker/Dockerfile
index 4b59471..ec80cf5 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -1,56 +1,56 @@
FROM python:3.7

# DEBIAN_FRONTEND is exported inside the RUN only (not via ENV) so it does
# not leak into the runtime environment of containers built from this image.
RUN export DEBIAN_FRONTEND=noninteractive && \
    apt-get update && apt-get upgrade -y && \
    apt-get install -y \
        libapr1-dev \
        libaprutil1-dev \
        libpq-dev \
        libsvn-dev \
        libsystemd-dev \
        postgresql-client \
        wait-for-it \
        ngrep && \
    apt-get install -y --no-install-recommends \
        r-base-core \
        r-cran-jsonlite && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# run the swh stack as an unprivileged user
RUN useradd -md /srv/softwareheritage -s /bin/bash swh
USER swh
RUN python3 -m venv /srv/softwareheritage/venv
ENV PATH="/srv/softwareheritage/venv/bin:${PATH}"
RUN pip install --upgrade pip setuptools wheel
# httpie is handy for manually poking at the HTTP RPC services
RUN pip install gunicorn httpie

# cassandra-driver build knobs (key=value ENV form; the legacy
# space-separated form is deprecated)
ARG CASS_DRIVER_NO_CYTHON
ENV CASS_DRIVER_NO_CYTHON=${CASS_DRIVER_NO_CYTHON}
ARG CASS_DRIVER_BUILD_CONCURRENCY
ENV CASS_DRIVER_BUILD_CONCURRENCY=${CASS_DRIVER_BUILD_CONCURRENCY:-1}
RUN pip install cassandra-driver

# Enforce installation of django 1 otherwise pip will choose django 2 when
# installing the swh stack due to poor version dependency support in pip
RUN pip install 'Django<2'
RUN pip install \
    swh-core[db,http] \
    swh-deposit[server] \
    swh-indexer \
    swh-journal \
    swh-lister \
    swh-loader-core \
    swh-loader-git \
    swh-loader-mercurial \
    swh-loader-svn \
    swh-storage \
    swh-objstorage \
    swh-scheduler \
    swh-vault \
    swh-web

COPY utils/*.sh /srv/softwareheritage/utils/
RUN mkdir -p /srv/softwareheritage/objects
# BUGFIX: was 'rm -rd', which errors out when ~/.cache does not exist (and
# '-d' is redundant with '-r'); '-f' makes the pip-cache cleanup unconditional.
RUN rm -rf /srv/softwareheritage/.cache
diff --git a/docker/tests/conftest.py b/docker/tests/conftest.py
index f987692..c3f8605 100644
--- a/docker/tests/conftest.py
+++ b/docker/tests/conftest.py
@@ -1,126 +1,153 @@
# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import re
import subprocess
import time
import requests
+from os.path import join
+from typing import Generator, Mapping, Tuple
from urllib.parse import urljoin
import pytest
import testinfra
# Base URL of the archive web API exposed by the docker-compose cluster.
APIURL = 'http://127.0.0.1:5080/api/1/'

# Atom entry used as sample metadata for deposit tests.
# BUGFIX: the XML markup had been stripped from this literal (only the text
# nodes "Test Software", "swh", "test-software", "No One" remained), which
# made /tmp/metadata.xml invalid XML; restore the Atom/CodeMeta entry.
SAMPLE_METADATA = '''\
<?xml version="1.0" encoding="utf-8"?>
<entry xmlns="http://www.w3.org/2005/Atom"
       xmlns:codemeta="https://doi.org/10.5063/SCHEMA/CODEMETA-2.0">
    <title>Test Software</title>
    <client>swh</client>
    <external_identifier>test-software</external_identifier>
    <codemeta:author>
        <codemeta:name>No One</codemeta:name>
    </codemeta:author>
</entry>
'''
# scope='session' so we use the same container for all the tests;
# the whole cluster is brought up once and torn down at session end.
@pytest.fixture(scope='session')
def docker_compose(request):
    """Bring up the docker-compose cluster for the whole test session,
    and shut it down afterwards."""
    # start the whole cluster
    subprocess.check_output(['docker-compose', 'up', '-d'])
    yield
    # and stop it
    subprocess.check_call(['docker-compose', 'down'])
@pytest.fixture
def scheduler_host(request, docker_compose):
    """Testinfra connection to a throw-away container running the
    swh-scheduler image, in which the test commands are executed."""
    # run a container in which test commands are executed
    docker_id = subprocess.check_output(
        ['docker-compose', 'run', '-d',
         'swh-scheduler', 'shell', 'sleep', '1h']).decode().strip()
    scheduler_host = testinfra.get_host("docker://" + docker_id)
    # wait for the scheduler and storage RPC services to accept connections
    scheduler_host.check_output('wait-for-it swh-scheduler:5008 -t 30')
    scheduler_host.check_output('wait-for-it swh-storage:5002 -t 30')
    # return a testinfra connection to the container
    yield scheduler_host
    # at the end of the test suite, destroy the container
    subprocess.check_call(['docker', 'rm', '-f', docker_id])
# NOTE(review): unlike docker_compose above, this fixture has no
# scope='session', so a fresh container is created for each test using it;
# the previous comment claiming session scope was stale.
@pytest.fixture
def deposit_host(request, docker_compose):
    """Testinfra connection to a throw-away swh-deposit container,
    pre-provisioned with a sample archive and metadata file under /tmp."""
    # run a container in which test commands are executed
    docker_id = subprocess.check_output(
        ['docker-compose', 'run', '-d',
         'swh-deposit', 'shell', 'sleep', '1h']).decode().strip()
    deposit_host = testinfra.get_host("docker://" + docker_id)
    # sample deposit payload: a tiny python file, tarred up, plus its
    # XML metadata (SAMPLE_METADATA)
    deposit_host.check_output(
        'echo \'print("Hello World!")\n\' > /tmp/hello.py')
    deposit_host.check_output(
        'tar -C /tmp -czf /tmp/archive.tgz /tmp/hello.py')
    deposit_host.check_output(
        f'echo \'{SAMPLE_METADATA}\' > /tmp/metadata.xml')
    # wait for the deposit RPC service to accept connections
    deposit_host.check_output('wait-for-it swh-deposit:5006 -t 30')
    # return a testinfra connection to the container
    yield deposit_host
    # at the end of the test suite, destroy the container
    subprocess.check_call(['docker', 'rm', '-f', docker_id])
@pytest.fixture
def git_url():
    """URL of the git repository ingested by the `git_origin` fixture."""
    origin_url = 'https://forge.softwareheritage.org/source/swh-core'
    return origin_url
@pytest.fixture
def git_origin(scheduler_host, git_url):
    """Schedule a load-git task for `git_url`, poll the scheduler until the
    loading completes (up to ~60 attempts), and return the origin URL.

    Fails with AssertionError if the task ends in a failed or unexpected
    state.
    """
    task = scheduler_host.check_output(
        'swh scheduler task add load-git '
        f'url={git_url}'
    )
    # BUGFIX: the named group had been garbled to '(?P\d+)', which is not a
    # valid regex and cannot satisfy .group('id'); restore '(?P<id>\d+)'.
    taskid = re.search(r'^Task (?P<id>\d+)$', task,
                       flags=re.MULTILINE).group('id')
    assert int(taskid) > 0

    for _ in range(60):
        status = scheduler_host.check_output(
            f'swh scheduler task list --list-runs --task-id {taskid}')
        if 'Executions:' in status:
            if '[eventful]' in status:
                # loading succeeded
                break
            if '[started]' in status:
                # still running: poll again in a second
                time.sleep(1)
                continue
            if '[failed]' in status:
                loader_logs = subprocess.check_output(
                    ['docker-compose', 'logs', 'swh-loader'])
                assert False, ('Loading execution failed\n'
                               f'status: {status}\n'
                               f'loader logs: {loader_logs}')
            # any other terminal execution state is unexpected
            assert False, f'Loading execution failed, task status is {status}'
        # task not executed yet: avoid busy-looping on 'docker-compose run'
        time.sleep(1)
    return git_url
# Utility functions
def apiget(path: str, verb: str = 'GET', **kwargs):
    """Send a single `verb` request to the archive API endpoint `path`.

    Returns the decoded JSON body of the response; any status code other
    than 200 triggers an AssertionError.
    """
    url = urljoin(APIURL, path)
    resp = requests.request(verb, url, **kwargs)
    status_ok = resp.status_code == 200
    assert status_ok, f'failed to retrieve {url} ({resp})'
    payload = resp.json()
    return payload
+
+
def pollapi(path: str, verb: str = 'GET', **kwargs):
    """Repeatedly query the archive API at `path` (at most 60 attempts, one
    second apart) until it answers with an OK status, and return that
    response.

    Raises AssertionError when no OK response is seen within the attempts.
    """
    url = urljoin(APIURL, path)
    attempts = 0
    while attempts < 60:
        resp = requests.request(verb, url, **kwargs)
        if resp.ok:
            return resp
        time.sleep(1)
        attempts += 1
    assert False, f"Polling {url} failed"
+
+
def getdirectory(dirid: str, currentpath: str = '') \
        -> Generator[Tuple[str, Mapping], None, None]:
    """Walk the archived directory `dirid` depth-first, yielding a
    (relative path, entry description) pair for every non-directory entry."""
    for entry in apiget(f'directory/{dirid}'):
        entrypath = join(currentpath, entry['name'])
        if entry['type'] == 'dir':
            # descend into sub-directories
            yield from getdirectory(entry['target'], entrypath)
        else:
            yield (entrypath, entry)
diff --git a/docker/tests/test_vault.py b/docker/tests/test_vault.py
new file mode 100644
index 0000000..6c7c571
--- /dev/null
+++ b/docker/tests/test_vault.py
@@ -0,0 +1,63 @@
+# Copyright (C) 2019-2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import io
+import hashlib
+import tarfile
+
+from os.path import join
+from urllib.parse import quote_plus
+
+from .conftest import apiget, getdirectory, pollapi
+
+
def test_vault_directory(scheduler_host, git_origin):
    """End-to-end test of vault directory cooking: cook the root directory
    of the ingested git repository, then check the resulting tarball against
    the directory listing reported by the archive API."""
    # retrieve the root directory of the master branch of the ingested git
    # repository (by the git_origin fixture)
    visit = apiget(f'origin/{quote_plus(git_origin)}/visit/latest')
    snapshot = apiget(f'snapshot/{visit["snapshot"]}')
    rev_id = snapshot["branches"]["refs/heads/master"]["target"]
    revision = apiget(f'revision/{rev_id}')
    dir_id = revision['directory']

    # now cook it
    cook = apiget(f'vault/directory/{dir_id}/', 'POST')
    assert cook['obj_type'] == 'directory'
    assert cook['obj_id'] == dir_id
    assert cook['fetch_url'].endswith(f'vault/directory/{dir_id}/raw/')

    # while it's cooking, get the directory tree from the archive
    # (getdirectory is a lazy generator; it is consumed in the loop below)
    directory = getdirectory(dir_id)

    # retrieve the cooked tar file (pollapi blocks until cooking is done)
    resp = pollapi(f'vault/directory/{dir_id}/raw')
    tarf = tarfile.open(fileobj=io.BytesIO(resp.content))

    # and check the tarfile seems ok wrt. 'directory'
    assert tarf.getnames()[0] == dir_id
    tarfiles = {t.name: t for t in tarf.getmembers()}

    for fname, fdesc in directory:
        tfinfo = tarfiles.get(join(dir_id, fname))
        assert tfinfo, f"Missing path {fname} in retrieved tarfile"
        if fdesc['type'] == 'file':
            assert fdesc['length'] == tfinfo.size, \
                f"File {fname}: length mismatch"
            fdata = tarf.extractfile(tfinfo).read()
            # only verify checksums for algorithms this hashlib knows about
            for algo in fdesc['checksums']:
                if algo not in hashlib.algorithms_available:
                    continue
                # NOTE(review): 'hash' shadows the builtin; kept unchanged
                hash = hashlib.new(algo, fdata).hexdigest()
                assert hash == fdesc['checksums'][algo], \
                    f"File {fname}: {algo} mismatch"
        # XXX what to check for dir? symlink? (other?)

    # check that if we ask a second time this directory, it returns the same
    # and does not cook it again
    recook = apiget(f'vault/directory/{dir_id}/', 'POST')
    assert recook['obj_type'] == 'directory'
    assert recook['obj_id'] == dir_id
    assert recook['id'] == cook['id']
    assert recook['status'] == 'done'  # no need to wait for this to be true