diff --git a/docker/run_full_export.py b/docker/run_full_export.py
index 20ab4df..d260578 100644
--- a/docker/run_full_export.py
+++ b/docker/run_full_export.py
@@ -1,166 +1,193 @@
 # Copyright (C) 2021-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import logging
 import click
 import sys
 import datetime
 import glob
 import re
 from subprocess import check_call
 from os import chdir, makedirs
 from os.path import getsize, isabs, isdir, isfile, join, basename
 from pathlib import Path
 from shutil import copy2
 from urllib.parse import urljoin
 
 import requests
 
 
 logger = logging.getLogger(__name__)
 
 
 MAVEN_INDEX_NAME = "nexus-maven-repository-index"
 MAVEN_INDEX_ARCHIVE = f"{MAVEN_INDEX_NAME}.gz"
 
 
-def _download_indexes(work_dir: str, instance_url: str) -> None:
+def _download_indexes(base_url: str, work_dir: str) -> None:
     """Download all required indexes from the .index/ directory
     of the specified instance.
 
+    Index archives already present in ``work_dir`` are not downloaded again;
+    the properties file is always fetched.
+
+    Args:
+        base_url: base url of the maven repository instance; a ``test://``
+            url skips every download (testing mode).
+        work_dir: directory the index files are written to.
+
+    Raises:
+        requests.HTTPError: if the instance answers with an error status.
+
     """
-    logger.info("Downloading all required indexes")
+    if base_url.startswith('test://'):
+        logger.info("(Testing) Fake downloading required indexes")
+        return None
 
-    index_url = urljoin(instance_url, ".index/")
+    logger.info("Downloading required indexes")
+
+    index_url = urljoin(base_url, ".index/")
 
     properties_name = f"{MAVEN_INDEX_NAME}.properties"
     properties_file = join(work_dir, properties_name)
     properties_url = urljoin(index_url, properties_name)
 
     # Retrieve properties file.
     logger.info(" - Downloading %s.", properties_file)
-    content = requests.get(properties_url).content.decode()
-    open(properties_file, "w").write(content)
+    response = requests.get(properties_url)
+    # Never parse or store an HTTP error page as the properties file.
+    response.raise_for_status()
+    content = response.content.decode()
+    with open(properties_file, "w") as properties_fh:
+        properties_fh.write(content)
 
     diff_re = re.compile("^nexus.index.incremental-[0-9]+=([0-9]+)")
     for line in content.split("\n"):
         diff_group = diff_re.match(line)
         if diff_group is not None:
             ind_name = f"{MAVEN_INDEX_NAME}.{diff_group.group(1)}.gz"
             ind_path = join(work_dir, ind_name)
             ind_url = urljoin(index_url, ind_name)
             if isfile(ind_path):
                 logger.info(
                     " - File %s exists, skipping download.", basename(ind_path)
                 )
             else:
                 logger.info(
                     " - File %s doesn't exist. Downloading file from %s.",
                     basename(ind_path),
                     ind_url,
                 )
                 # Retrieve incremental gz file
-                contentb = requests.get(ind_url).content
-                open(ind_path, "wb").write(contentb)
+                response = requests.get(ind_url)
+                # An error page written here would satisfy the isfile() check
+                # above on every later run, so fail rather than cache it.
+                response.raise_for_status()
+                with open(ind_path, "wb") as ind_fh:
+                    ind_fh.write(response.content)
 
     # Retrieve main index file.
     ind_path = join(work_dir, MAVEN_INDEX_ARCHIVE)
     ind_url = urljoin(index_url, MAVEN_INDEX_ARCHIVE)
     if isfile(ind_path):
         logger.info(" - File %s exists, skipping download.", basename(ind_path))
     else:
         logger.info(
             " - File %s doesn't exist. Downloading file from %s",
             basename(ind_path),
             ind_url,
         )
-        contentb = requests.get(ind_url).content
-        open(ind_path, "wb").write(contentb)
+        response = requests.get(ind_url)
+        response.raise_for_status()
+        with open(ind_path, "wb") as ind_fh:
+            ind_fh.write(response.content)
 
 
 @click.command()
 @click.option(
     "--base-url",
     required=True,
     help=(
         "Base url of the maven repository instance. \n"
         "Example: https://repo.maven.apache.org/maven2/"
     ),
 )
 @click.option(
     "--work-dir",
     help="Absolute path to the temp directory.",
     default="/tmp/maven-index-exporter/",
 )
 @click.option(
     "--publish-dir",
     help="Absolute path to the final directory.",
     default="/tmp/maven-index-exporter/publish/",
 )
 def main(base_url, work_dir, publish_dir):
     now = datetime.datetime.now()
     logger.info("Script: run_full_export")
     logger.info("Timestamp: %s", now.strftime("%Y-%m-%d %H:%M:%S"))
     logger.info("* URL: %s", base_url)
     logger.info("* Working directory: %s", work_dir)
     logger.info("* Publish directory: %s", publish_dir)
 
     # Check work_dir and create it if needed.
     if isdir(work_dir):
         logger.info("Work_Dir %s exists. Reusing it.", work_dir)
     else:
         try:
             logger.info("Cannot find work_dir %s. Creating it.", work_dir)
             Path(work_dir).mkdir(parents=True, exist_ok=True)
         except OSError as error:
             logger.info("Could not create work_dir %s: %s.", work_dir, error)
 
     assert isdir(work_dir)
     assert isabs(work_dir)
 
     # Grab all the indexes
     # Only fetch the new ones, existing files won't be re-downloaded.
-    _download_indexes(work_dir, base_url)
+    _download_indexes(base_url, work_dir)
 
     # Extract indexes into a .fld file
     check_call(["/opt/extract_indexes.sh", work_dir])
 
     logger.info("Export directory has the following files:")
     export_dir = join(work_dir, "export")
     makedirs(export_dir, exist_ok=True)
     chdir(export_dir)
 
     myfile = None
     re_fld = re.compile(r".*\.fld$")
     for file_ in glob.glob("*.*"):
         logger.info(" - %s size %s", file_, getsize(file_))
         if re_fld.match(file_):
             myfile = file_
 
     # Now copy the results to the desired location: publish_dir.
-    if isfile(myfile):
+    # myfile is still None when no .fld file was produced; isfile(None) raises
+    # TypeError, which would skip the explicit exit below.
+    if myfile is not None and isfile(myfile):
         logger.info("Found fld file: %s", myfile)
     else:
         logger.info("Cannot find .fld file. Exiting")
         sys.exit(4)
 
     makedirs(publish_dir, exist_ok=True)
     publish_file = join(publish_dir, "export.fld")
     logger.info("Copying files to %s.", publish_file)
     try:
         copy2(myfile, publish_file)
     except OSError as error:
         logger.info("Could not publish results in %s: %s.", publish_dir, error)
 
     now = datetime.datetime.now()
     logger.info("Script finished on %s", now.strftime("%Y-%m-%d %H:%M:%S"))
 
 
 ###############################################
 # Start execution
 ###############################################
 if __name__ == "__main__":
     logging.basicConfig(level=logging.INFO)
     main(auto_envvar_prefix='MVN_IDX_EXPORTER')
diff --git a/scripts/requirements.txt b/scripts/requirements.txt
deleted file mode 100644
index 48acf7b..0000000
--- a/scripts/requirements.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-certifi==2021.5.30
-charset-normalizer==2.0.4
-docker==5.0.0
-idna==3.2
-pkg-resources==0.0.0
-requests==2.26.0
-six==1.16.0
-urllib3==1.26.6
-websocket-client==1.2.1
-click
diff --git a/tests/.gitignore b/tests/.gitignore
new file mode 100644
index 0000000..8a0bacd
--- /dev/null
+++ b/tests/.gitignore
@@ -0,0 +1,2 @@
+publish/
+export/
diff --git a/tests/Makefile b/tests/Makefile
new file mode 100644
index 0000000..88ce33e
--- /dev/null
+++ b/tests/Makefile
@@ -0,0 +1,5 @@
+build-and-test:
+	./build_and_test_image.sh
+
+test:
+	./test_image.sh
diff --git a/tests/build_and_test_image.sh b/tests/build_and_test_image.sh
new file mode 100755
index 0000000..19bdb54
--- /dev/null
+++ b/tests/build_and_test_image.sh
@@ -0,0 +1,59 @@
+#!/bin/bash
+
+# Copyright (C) 2021-2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+DOCKER_IMAGE="softwareheritage/maven-index-exporter"
+# Absolute path: this script changes directory before building, so a
+# relative name would spread the log over several directories.
+LOG="$(pwd)/test_docker_image.log"
+
+# This script builds the docker image for maven-index-exporter, and
+# executes it on a known set of indexes and checks the results in order
+# to test the full tool chain.
+
+echo "Script started on `date +%Y%m%d_%H%M%S`."
+echo "* Writing log to $LOG."
+
+# Find location of script directory
+OLD_DIR=$(pwd)
+REPO_DIR=$(CDPATH= cd -- "$(dirname -- "$0")/.." && pwd)
+cd "$OLD_DIR"
+
+# First clean up and remove any docker image with our own name
+docker rmi $DOCKER_IMAGE >>"$LOG" 2>&1
+RET=$?
+if [[ $RET -eq 0 ]]; then
+    echo "* Docker image [$DOCKER_IMAGE] deleted."
+elif [[ $RET -eq 1 ]]; then
+    echo "* Docker image [$DOCKER_IMAGE] doesn't exist."
+else
+    echo "Error when deleting docker image [$DOCKER_IMAGE]."
+fi
+
+# Build the image and tag it as $DOCKER_IMAGE
+cd "$REPO_DIR/docker"
+echo "* Building docker image."
+docker build . -t $DOCKER_IMAGE >>"$LOG"
+RET=$?
+if [[ $RET -eq 0 ]]; then
+    echo "PASS: docker build returned 0."
+else
+    echo "FAIL: docker build returned $RET."
+    exit 20
+fi
+
+# Assert docker image has been created.
+COUNT=$(docker images | grep -E "^$DOCKER_IMAGE\s" | wc -l)
+if [[ $COUNT -eq 0 ]]; then
+    echo "FAIL: Docker image cannot be listed."
+    exit 10
+else
+    echo "PASS: Docker image is listed."
+fi
+
+cd "$OLD_DIR"
+# Run the test script from the repository, wherever this one was started.
+"$REPO_DIR/tests/test_image.sh"
diff --git a/scripts/test_docker_image.sh b/tests/test_image.sh
similarity index 62%
rename from scripts/test_docker_image.sh
rename to tests/test_image.sh
index 7331a29..47065ac 100755
--- a/scripts/test_docker_image.sh
+++ b/tests/test_image.sh
@@ -1,121 +1,95 @@
 #!/bin/bash
 
-# Copyright (C) 2021 The Software Heritage developers
+# Copyright (C) 2021-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-DOCKER_IMAGE="maven-index-exporter"
+DOCKER_IMAGE="softwareheritage/maven-index-exporter"
 LOG=test_docker_image.log
 
-# This script builds the docker image for maven-index-exporter, and
-# executes it on a known set of indexes and checks the results in order
-# to test the full tool chain.
-
-echo "Script started on `date +%Y%m%d_%H%M%S`."
-echo "* Writing log to $LOG."
-
-# Find location of script directory
-OLD_DIR=$(pwd)
 REPO_DIR=$(CDPATH= cd -- "$(dirname -- "$0")/.." && pwd)
-cd $OLD_DIR
-
 WORK_DIR=$REPO_DIR/tests/repository_test
 EXPORT_DIR=$WORK_DIR/export
+PUBLISH_DIR=$REPO_DIR/tests/publish
 
-# First clean up and remove any docker image with our own name
-docker rmi $DOCKER_IMAGE >>$LOG 2>&1
-RET=$?
-if [[ $RET -eq 0 ]]; then
-    echo "* Docker image [$DOCKER_IMAGE] deleted."
-elif [[ $RET -eq 1 ]]; then
-    echo "* Docker image [$DOCKER_IMAGE] doesn't exist."
-else
-    echo "Error when deleting docker image [$DOCKER_IMAGE]."
-fi
-
-
-# Build the image and tag it as $DOCKER_IMAGE
-cd $REPO_DIR/docker
-echo "* Building docker image."
-docker build . -t $DOCKER_IMAGE --no-cache >>$LOG
-RET=$?
-if [[ $RET -eq 0 ]]; then
-    echo "PASS: docker build returned 0."
-else
-    echo "FAIL: docker build returned $RET."
-    exit 20
-fi
-
-# Assert docker image has been created.
-COUNT=$(docker images | grep -E "^$DOCKER_IMAGE\s" | wc -l)
-if [[ $COUNT -eq 0 ]]; then
-    echo "FAIL: Docker image cannot be listed."
-    exit 10
-else
-    echo "PASS: Docker image is listed."
-fi
+# clean up publish directory
+rm -rf "$PUBLISH_DIR"/*
+mkdir -p "$PUBLISH_DIR"
 
+# This will mock out the download part of the python code
+export MVN_IDX_EXPORTER_BASE_URL='test://example.org'
 # Run the image on the maven indexes.
-docker run -v $WORK_DIR:/work $DOCKER_IMAGE >>$LOG 2>&1
+docker run -v "$WORK_DIR":/work \
+           -v "$PUBLISH_DIR":/publish \
+           -e MVN_IDX_EXPORTER_BASE_URL \
+           $DOCKER_IMAGE >>$LOG 2>&1
 
 # Assert exported text files are there, with the correct content.
 EXPORT_FILE=$(ls $EXPORT_DIR/*.fld)
 if [[ -e $EXPORT_FILE ]]; then
     echo "PASS: file [$EXPORT_FILE] has been created."
 else
     echo "FAIL: file [$EXPORT_FILE] has NOT been created."
     exit 20
 fi
 
 DOCS=$(grep -E "^doc" $EXPORT_FILE | wc -l)
 if [[ $DOCS -eq 10 ]]; then
     echo "PASS: file [$EXPORT_FILE] has 10 docs."
 else
     echo "FAIL: file [$EXPORT_FILE] has $DOCS docs, should be 10."
     exit 20
 fi
 
 FIELDS=$(grep -E "^ field" $EXPORT_FILE | wc -l)
 if [[ $FIELDS -eq 35 ]]; then
     echo "PASS: file [$EXPORT_FILE] has 35 fields."
 else
     echo "FAIL: file [$EXPORT_FILE] has $FIELDS fields, should be 35."
     exit 20
 fi
 
 FIELDS=$(grep "value al.aldi|sprova4j|0.1.0|sources|jar" $EXPORT_FILE | wc -l)
 if [[ $FIELDS -eq 1 ]]; then
     echo "PASS: file [$EXPORT_FILE] has sprova4j-0.1.0-sources.jar."
 else
     echo "FAIL: file [$EXPORT_FILE] has NOT sprova4j-0.1.0-sources.jar."
     exit 20
 fi
 
 FIELDS=$(grep "value al.aldi|sprova4j|0.1.0|NA|pom" $EXPORT_FILE | wc -l)
 if [[ $FIELDS -eq 1 ]]; then
     echo "PASS: file [$EXPORT_FILE] has sprova4j-0.1.0.pom."
 else
     echo "FAIL: file [$EXPORT_FILE] has NOT sprova4j-0.1.0.pom."
     exit 20
 fi
 
 FIELDS=$(grep "value al.aldi|sprova4j|0.1.1|sources|jar" $EXPORT_FILE | wc -l)
 if [[ $FIELDS -eq 1 ]]; then
     echo "PASS: file [$EXPORT_FILE] has sprova4j-0.1.1-sources.jar."
 else
     echo "FAIL: file [$EXPORT_FILE] has NOT sprova4j-0.1.1-sources.jar."
     exit 20
 fi
 
 FIELDS=$(grep "value al.aldi|sprova4j|0.1.1|NA|pom" $EXPORT_FILE | wc -l)
 if [[ $FIELDS -eq 1 ]]; then
     echo "PASS: file [$EXPORT_FILE] has sprova4j-0.1.1.pom."
 else
     echo "FAIL: file [$EXPORT_FILE] has NOT sprova4j-0.1.1.pom."
     exit 20
 fi
 
+PUBLISH_FILE=$PUBLISH_DIR/export.fld
+
+if [[ -f $PUBLISH_FILE ]]; then
+    echo "PASS: file [$PUBLISH_FILE] exists."
+else
+    echo "FAIL: file [$PUBLISH_FILE] does not exist."
+    exit 20
+fi
+
 # Cleanup
 rm -rf $EXPORT_DIR
-cd $OLD_DIR