FROM adoptopenjdk/openjdk11:debian-jre

# Runtime dependencies: python3 with click and requests for
# run_full_export.py, and wget to fetch the jars below.
RUN export DEBIAN_FRONTEND=noninteractive && \
    apt-get update && apt-get upgrade -y && \
    apt-get install -y python3 python3-click python3-requests wget && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Download and install jars.
# clue comes out of
# https://github.com/javasoze/clue/releases/tag/release-6.2.0-1.0.0
# TODO: no checksum is recorded for the clue jar; pin one when available.
RUN wget -q -P /opt \
    https://github.com/javasoze/clue/releases/download/release-6.2.0-1.0.0/clue-6.2.0-1.0.0.jar

# indexer-cli is verified against the SHA-1 recorded for
# https://repo1.maven.org/maven2/org/apache/maven/indexer/indexer-cli/6.0.0/indexer-cli-6.0.0.jar.sha1
# so that a corrupted or substituted download fails the build instead of
# silently ending up in the image.
RUN wget -q -P /opt \
    https://repo1.maven.org/maven2/org/apache/maven/indexer/indexer-cli/6.0.0/indexer-cli-6.0.0.jar && \
    echo "eeb98596b7fed4aa13fa13ecafcbb843ef8ab697  /opt/indexer-cli-6.0.0.jar" | sha1sum -c -

# Copy the export driver and the index extraction script it invokes.
COPY run_full_export.py /opt/
COPY extract_indexes.sh /opt/
# run_full_export.py executes the script directly, so it must be executable.
RUN mkdir /work/ && chmod +x /opt/extract_indexes.sh

WORKDIR /work/

# Default option values for run_full_export.py, which reads its options from
# environment variables carrying the MVN_IDX_EXPORTER prefix.
ENV MVN_IDX_EXPORTER_BASE_URL=https://repo.maven.apache.org/maven2/
ENV MVN_IDX_EXPORTER_WORK_DIR=/work
ENV MVN_IDX_EXPORTER_PUBLISH_DIR=/publish

CMD ["python3", "/opt/run_full_export.py"]
#!/bin/bash

# Copyright (C) 2021-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

# Usage: extract_indexes.sh [WORKDIR]
#
# Unpacks $WORKDIR/nexus-maven-repository-index.gz into $WORKDIR/indexes with
# the maven indexer cli, then exports those indexes as text into
# $WORKDIR/export with clue, keeping only the .fld extract.
#
# Exit codes:
#    4  index archive not readable
#    6  indexer jar not found under /opt
#    8  clue jar not found under /opt
#   10  index unpacking (java) failed
#   12  index export (java) failed

# ":-" rather than "-": an empty argument also falls back to /work, otherwise
# the paths below would resolve directly under "/".
WORKDIR=${1:-/work}
FILE_IN=$WORKDIR/nexus-maven-repository-index.gz

localtime=$(date +"%Y-%m-%d %H:%M:%S")
echo "Docker Script started on $localtime."

echo "# Checks.."
echo "* Content of /opt:"
ls -l /opt
echo "* Content of $WORKDIR:"
ls -l "$WORKDIR"

echo "* Will read files from [$FILE_IN]."

if [ ! -r "$FILE_IN" ]; then
    echo "Cannot find file [$FILE_IN]."
    echo "Need an index file to work on. Exiting 4."
    exit 4
else
    echo "* Found file [$FILE_IN]."
fi

# Keep a single match so that "$indexer" is always exactly one path, even if
# several versions of the jar are present.
indexer=$(find /opt/ -name "indexer-cli-*.jar" | sort | head -n 1)
if [ "$indexer" = "" ]; then
    echo "Cannot find indexer. Exiting 6."
    exit 6
else
    echo "* Found indexer [$indexer]."
fi

clue=$(find /opt/ -name "clue-*.jar" | sort | head -n 1)
if [ "$clue" = "" ]; then
    echo "Cannot find clue. Exiting 8."
    exit 8
else
    echo "* Found clue [$clue]."
fi

echo "* Java version:."
java -version

echo "#############################"

if [ -d "$WORKDIR/indexes" ]; then
    echo "Found $WORKDIR/indexes, skipping index generation."
    du -sh "$WORKDIR/indexes/"
else
    echo "Unpacking [$FILE_IN] to $WORKDIR/indexes"
    java --illegal-access=permit -jar "$indexer" \
        --unpack "$FILE_IN" \
        --destination "$WORKDIR/indexes/" \
        --type full 2>&1 | grep -v WARNING
    # The pipeline's own status is grep's; read java's status explicitly so a
    # failed unpack is reported to the caller instead of being ignored.
    status=${PIPESTATUS[0]}
    if [ "$status" -ne 0 ]; then
        echo "Unpacking failed (java exited with status $status)."
        echo "Remove $WORKDIR/indexes before retrying. Exiting 10."
        exit 10
    fi
fi

localtime=$(date +"%Y-%m-%d %H:%M:%S")
echo "Unpacking finished on $localtime."

echo "#############################"

if [ -d "$WORKDIR/export" ]; then
    echo "Found $WORKDIR/export, skipping index export."
    ls -lh "$WORKDIR/export/"
else
    echo "Exporting indexes $WORKDIR/indexes to $WORKDIR/export"
    java --illegal-access=permit -jar "$clue" \
        "$WORKDIR/indexes/" \
        export "$WORKDIR/export/" text 2>&1 | grep -v WARNING
    # Same as above: report java's status, not grep's.
    status=${PIPESTATUS[0]}
    if [ "$status" -ne 0 ]; then
        echo "Export failed (java exited with status $status)."
        echo "Remove $WORKDIR/export before retrying. Exiting 12."
        exit 12
    fi
fi

localtime=$(date +"%Y-%m-%d %H:%M:%S")
echo "Exporting finished on $localtime."

echo "#############################"

echo "Cleaning useless files."
echo "Size before cleaning:"
du -sh "$WORKDIR"/*

# We might want or not to delete the indexes.
# Remember that when they are not present, everything
# gets recomputed on every run.
#echo "* Removing indexes."
#rm -rf "$WORKDIR/indexes/"

# If files other than the .fld one are required, please comment out
# the following lines.
echo "* Removing useless exports."
echo " Keeping only fld text extract."
rm -f "$WORKDIR"/export/*.inf
rm -f "$WORKDIR"/export/*.len
rm -f "$WORKDIR"/export/*.pst
rm -f "$WORKDIR"/export/*.si
rm -f "$WORKDIR"/export/segments*

echo " Size after cleaning:"
du -sh "$WORKDIR"/*

echo "* Make files modifiable by the end-user."
# NOTE(review): 777 makes everything world-writable; confirm this is still
# needed now that the script is no longer run against a bind-mounted volume.
chmod -R 777 "$WORKDIR/export/"
chmod -R 777 "$WORKDIR/indexes/"

localtime=$(date +"%Y-%m-%d %H:%M:%S")
echo "Docker Script execution finished on $localtime."
# Copyright (C) 2021-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import logging
import click
import sys
import datetime
import glob
import re

from subprocess import check_call
from os import chdir, makedirs
from os.path import getsize, isabs, isdir, isfile, join, basename
from pathlib import Path
from shutil import copy2
from urllib.parse import urljoin

import requests

logger = logging.getLogger(__name__)

MAVEN_INDEX_NAME = "nexus-maven-repository-index"
MAVEN_INDEX_ARCHIVE = f"{MAVEN_INDEX_NAME}.gz"

# Seconds to wait for the remote server (connection and each read) before
# giving up, so a stalled server cannot hang the export forever.
_HTTP_TIMEOUT = 60

# Size of the chunks streamed to disk when downloading index archives.
_DOWNLOAD_CHUNK_SIZE = 1024 * 1024


def _fetch_to_file(url: str, dest_path: str) -> None:
    """Stream the content found at ``url`` into ``dest_path``.

    The data is first written to ``<dest_path>.part`` and renamed once the
    download is complete. Callers skip files that already exist, so a
    truncated file left behind by an interrupted run must never sit at the
    final path.

    Raises:
        requests.HTTPError: the server answered with an error status.
        requests.RequestException: any other network failure.
        OSError: the file could not be written or renamed.
    """
    part_path = f"{dest_path}.part"
    with requests.get(url, stream=True, timeout=_HTTP_TIMEOUT) as response:
        # Without this an HTML error page would be stored as an index archive.
        response.raise_for_status()
        with open(part_path, "wb") as part_file:
            for chunk in response.iter_content(chunk_size=_DOWNLOAD_CHUNK_SIZE):
                part_file.write(chunk)
    Path(part_path).replace(dest_path)


def _download_indexes(work_dir: str, instance_url: str) -> None:
    """Download all required indexes from the .index/ directory
    of the specified instance.

    The properties file is downloaded on every call; each incremental
    archive it lists and the main index archive are downloaded only when no
    file of that name exists in ``work_dir`` yet.

    Args:
        work_dir: existing directory receiving the downloaded files.
        instance_url: base url of the maven repository instance.

    Raises:
        requests.HTTPError: a download was answered with an error status.
        requests.RequestException: any other network failure.
        OSError: a file could not be written.
    """
    logger.info("Downloading all required indexes")

    index_url = urljoin(instance_url, ".index/")

    properties_name = f"{MAVEN_INDEX_NAME}.properties"
    properties_file = join(work_dir, properties_name)
    properties_url = urljoin(index_url, properties_name)

    # Retrieve properties file.
    logger.info(" - Downloading %s.", properties_file)
    response = requests.get(properties_url, timeout=_HTTP_TIMEOUT)
    response.raise_for_status()
    content = response.content.decode()
    with open(properties_file, "w", encoding="utf-8") as properties_fd:
        properties_fd.write(content)

    # Each "nexus.index.incremental-<n>=<id>" entry names one incremental
    # archive; the dots are escaped so they only match literal dots.
    diff_re = re.compile(r"^nexus\.index\.incremental-[0-9]+=([0-9]+)")
    for line in content.splitlines():
        diff_group = diff_re.match(line)
        if diff_group is None:
            continue

        ind_name = f"{MAVEN_INDEX_NAME}.{diff_group.group(1)}.gz"
        ind_path = join(work_dir, ind_name)
        ind_url = urljoin(index_url, ind_name)
        if isfile(ind_path):
            logger.info(" - File %s exists, skipping download.", basename(ind_path))
        else:
            logger.info(
                " - File %s doesn't exist. Downloading file from %s.",
                basename(ind_path),
                ind_url,
            )
            # Retrieve incremental gz file
            _fetch_to_file(ind_url, ind_path)

    # Retrieve main index file.
    ind_path = join(work_dir, MAVEN_INDEX_ARCHIVE)
    ind_url = urljoin(index_url, MAVEN_INDEX_ARCHIVE)
    if isfile(ind_path):
        logger.info(" - File %s exists, skipping download.", basename(ind_path))
    else:
        logger.info(
            " - File %s doesn't exist. Downloading file from %s",
            basename(ind_path),
            ind_url,
        )
        _fetch_to_file(ind_url, ind_path)


@click.command()
@click.option(
    "--base-url",
    required=True,
    help=(
        "Base url of the maven repository instance. \n"
        "Example: https://repo.maven.apache.org/maven2/"
    ),
)
@click.option(
    "--work-dir",
    help="Absolute path to the temp directory.",
    default="/tmp/maven-index-exporter/",
)
@click.option(
    "--publish-dir",
    help="Absolute path to the final directory.",
    default="/tmp/maven-index-exporter/publish/",
)
def main(base_url, work_dir, publish_dir):
    """Download the maven indexes of an instance and publish their .fld export.

    The indexes are downloaded into the work directory, extracted with
    /opt/extract_indexes.sh, and the resulting .fld file is copied to
    export.fld in the publish directory.
    """
    # Exit status: 4 when the extraction produced no .fld file; the status
    # click assigns to the raised click exceptions for an unusable work
    # directory or a failed publication. A failing extraction script
    # propagates subprocess.CalledProcessError.
    now = datetime.datetime.now()
    logger.info("Script: run_full_export")
    logger.info("Timestamp: %s", now.strftime("%Y-%m-%d %H:%M:%S"))
    logger.info("* URL: %s", base_url)
    logger.info("* Working directory: %s", work_dir)
    logger.info("* Publish directory: %s", publish_dir)

    # Validate explicitly instead of with assert, which is stripped under -O.
    if not isabs(work_dir):
        raise click.BadParameter(
            f"{work_dir!r} is not an absolute path.", param_hint="--work-dir"
        )

    # Check work_dir and create it if needed.
    if isdir(work_dir):
        logger.info("Work_Dir %s exists. Reusing it.", work_dir)
    else:
        logger.info("Cannot find work_dir %s. Creating it.", work_dir)
        try:
            Path(work_dir).mkdir(parents=True, exist_ok=True)
        except OSError as error:
            raise click.ClickException(
                f"Could not create work_dir {work_dir}: {error}."
            ) from error

    # Grab all the indexes
    # Only fetch the new ones, existing files won't be re-downloaded.
    _download_indexes(work_dir, base_url)

    # Extract indexes into a .fld file
    check_call(["/opt/extract_indexes.sh", work_dir])

    logger.info("Export directory has the following files:")
    export_dir = join(work_dir, "export")
    makedirs(export_dir, exist_ok=True)

    # Paths are built explicitly rather than through chdir(), so a relative
    # publish_dir is resolved against the caller's working directory.
    fld_file = None
    for file_ in sorted(glob.glob(join(glob.escape(export_dir), "*.*"))):
        logger.info(" - %s size %s", basename(file_), getsize(file_))
        if file_.endswith(".fld"):
            fld_file = file_

    # Now copy the results to the desired location: publish_dir.
    # fld_file stays None when the export produced nothing; isfile(None)
    # would raise TypeError, hence the explicit None test.
    if fld_file is None or not isfile(fld_file):
        logger.error("Cannot find .fld file. Exiting")
        sys.exit(4)
    logger.info("Found fld file: %s", basename(fld_file))

    publish_file = join(publish_dir, "export.fld")
    logger.info("Copying files to %s.", publish_file)
    try:
        makedirs(publish_dir, exist_ok=True)
        copy2(fld_file, publish_file)
    except OSError as error:
        # Publishing is the purpose of the run: report failure to the caller
        # through a non-zero exit status instead of finishing "successfully".
        raise click.ClickException(
            f"Could not publish results in {publish_dir}: {error}."
        ) from error

    now = datetime.datetime.now()
    logger.info("Script finished on %s", now.strftime("%Y-%m-%d %H:%M:%S"))


###############################################
# Start execution
###############################################

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    # Options may also be supplied as MVN_IDX_EXPORTER_<OPTION> variables.
    main(auto_envvar_prefix="MVN_IDX_EXPORTER")