diff --git a/docker/Dockerfile b/docker/Dockerfile --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,4 +1,4 @@ -FROM adoptopenjdk/openjdk11:alpine-jre +FROM adoptopenjdk/openjdk11:debian-jre # Download and install jars ADD https://github.com/javasoze/clue/releases/download/release-6.2.0-1.0.0/clue-6.2.0-1.0.0.jar /opt/ @@ -13,4 +13,4 @@ RUN ls -R /work/ # Parse default index file (will be overriden by cli parameters) -CMD ["sh", "/opt/extract_indexes.sh", "/work/nexus-maven-repository-index.gz"] +CMD ["sh", "/opt/extract_indexes.sh"] diff --git a/docker/extract_indexes.sh b/docker/extract_indexes.sh --- a/docker/extract_indexes.sh +++ b/docker/extract_indexes.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright (C) 2021 The Software Heritage developers +# Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -52,7 +52,10 @@ du -sh $WORKDIR/indexes/ else echo "Unpacking [$FILE_IN] to $WORKDIR/indexes" - java --illegal-access=permit -jar $indexer --unpack $FILE_IN --destination $WORKDIR/indexes/ --type full 2>&1 | grep -v WARNING + java --illegal-access=permit -jar $indexer \ + --unpack $FILE_IN \ + --destination $WORKDIR/indexes/ \ + --type full 2>&1 | grep -v WARNING fi localtime=$(date +"%Y-%m-%d %H:%M:%S") @@ -64,7 +67,9 @@ ls -lh $WORKDIR/export/ else echo "Exporting indexes $WORKDIR/indexes to $WORKDIR/export" - java --illegal-access=permit -jar $clue $WORKDIR/indexes/ export $WORKDIR/export/ text 2>&1 | grep -v WARNING + java --illegal-access=permit -jar $clue \ + $WORKDIR/indexes/ \ + export $WORKDIR/export/ text 2>&1 | grep -v WARNING fi localtime=$(date +"%Y-%m-%d %H:%M:%S") @@ -102,4 +107,3 @@ localtime=$(date +"%Y-%m-%d %H:%M:%S") echo "Docker Script execution finished on $localtime." 
- diff --git a/docs/maven_repositories.md b/docs/maven_repositories.md --- a/docs/maven_repositories.md +++ b/docs/maven_repositories.md @@ -1,9 +1,6 @@ - - A list of remote Maven repositories using [Maven Indexer](https://maven.apache.org/maven-indexer/) for their catalogue. - # Introduction In the Maven ecosystem, dependencies and artefacts required to develop Java projects can @@ -116,7 +113,7 @@ ## Checking compatibility -To ensure that these repositories can be actually parsed with the Maen index exporter, +To ensure that these repositories can be actually parsed with the Maven index exporter, there is no better way than parsing them and generating the index and text export. For this, we first need to download all indexes from all servers: @@ -135,7 +132,7 @@ ```shell mkdir -p ../maven_repositories/ for i in `ls`; do - time docker run -v /data/work/$i:/work bbaldassari/maven-index-exporter | tee ../logs/$i.log; + time docker run -v /data/work/$i:/work maven-index-exporter | tee ../logs/$i.log; mv $i/ ../maven_repositories/; done ``` diff --git a/docs/run_maven_index_exporter.md b/docs/run_maven_index_exporter.md --- a/docs/run_maven_index_exporter.md +++ b/docs/run_maven_index_exporter.md @@ -12,12 +12,15 @@ disk (see warning below) and pass it to docker: ``` -$ LOCAL_DIR=/tmp/work -$ docker run -v $LOCAL_DIR:/work $USER/maven-index-exporter +$ LOCAL_DIR=/tmp/maven-index-exporter +# build the image +$ cd docker && docker build -f Dockerfile -t maven-index-exporter . +# run the image +$ docker run -v $LOCAL_DIR:/work maven-index-exporter ``` -Please note that the local work dir MUST be an absolute path, as docker won't mount -relative paths as volumes. +Please note that `LOCAL_DIR` *MUST* be an absolute path, as docker won't mount relative +paths as volumes. For our purpose only the fld file is kept, so if you need other export files you should simply edit the `extract_indexes.sh` script and comment the lines that do the cleaning. 
@@ -29,24 +32,30 @@ The `run_full_export.py` script located in `scripts/` provides an easy way to run the export as a cron batch job, and copy the resulting text export to a specific location. - %Simply use and adapt the crontab command as follows: +Simply use and adapt the crontab command as follows: ``` cd $HOME/maven-index-exporter/scripts/ && \ - ./myvenv/bin/python $HOME/maven-index-exporter/scripts/run_full_export.py https://repo.maven.apache.org/maven2/ /tmp/maven-index \ - -exporter/ /var/www/html/maven_index_exporter/ 2>&1 > /tmp/run_maven_exporter_$(date +"%Y%m%d-%H%M%S").log + ./myvenv/bin/python $HOME/maven-index-exporter/scripts/run_full_export.py \ + --base-url https://repo.maven.apache.org/maven2/ \ + > /tmp/run_maven_exporter_$(date +"%Y%m%d-%H%M%S").log 2>&1 ``` -The script takes three mandatory arguments: +Script usage: ``` -Usage: run_full_export.py - - url is the base url of the maven repository instance. - Example: https://repo.maven.apache.org/maven2/ - - work_dir must be an absolute path to the temp directory. - Example: /tmp/maven-index-exporter/ - - publish_dir must be an absolute path to the final directory. - Example: /var/www/html/ +$ python3 run_full_export.py --help +Usage: run_full_export.py [OPTIONS] + +Options: + --base-url TEXT Base url of the maven repository instance. Example: + https://repo.maven.apache.org/maven2/ [required] + + --work-dir TEXT Absolute path to the temp directory. + --publish-dir TEXT Absolute path to the final directory. + --docker-image-name TEXT Docker image + --docker-image-update Trigger a docker image update. + --help Show this message and exit. ``` It is recommended to setup a virtual environment to run the script. 
diff --git a/scripts/requirements.txt b/scripts/requirements.txt --- a/scripts/requirements.txt +++ b/scripts/requirements.txt @@ -7,3 +7,4 @@ six==1.16.0 urllib3==1.26.6 websocket-client==1.2.1 +click diff --git a/scripts/run_full_export.py b/scripts/run_full_export.py --- a/scripts/run_full_export.py +++ b/scripts/run_full_export.py @@ -1,14 +1,16 @@ -# Copyright (C) 2021 The Software Heritage developers +# Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import logging +import click +import sys import datetime import glob import re -import sys -from os import chdir, getcwd -from os.path import getsize, isabs, isdir, isfile, join +from os import chdir, makedirs +from os.path import getsize, isabs, isdir, isfile, join, basename from pathlib import Path from shutil import copy2 from urllib.parse import urljoin @@ -16,66 +18,72 @@ import requests import docker +from docker.client import DockerClient +from docker.models.images import Image + +logger = logging.getLogger(__name__) -# Check paramaters -if len(sys.argv) != 4: - print("Usage:", sys.argv[0], "url work_dir publish_dir") - print(" - url is the base url of the maven repository instance.") - print(" Example: https://repo.maven.apache.org/maven2/") - print(" - work_dir must be an absolute path to the temp directory.") - print(" Example: /tmp/maven-index-exporter/") - print(" - publish_dir must be an absolute path to the final directory.") - print(" Example: /var/www/html/") - exit() - -base_url = sys.argv[1] -work_dir = sys.argv[2] -publish_dir = sys.argv[3] - - -def _docker_run(docker_image: str): - """ Start the container for the maven index export, using the image - 'bbaldassari/maven-index-exporter'. If needed the image is pulled from - docker hub. If it already exists, simply use the local one. 
- """ - # Initialise the docker client. - client = docker.from_env() - myimage = None - for image in client.images.list(name=docker_image): - myimage = image - break +MAVEN_INDEX_NAME = "nexus-maven-repository-index" +MAVEN_INDEX_ARCHIVE = f"{MAVEN_INDEX_NAME}.gz" - if myimage is None: - print(f"Docker: Could not find {docker_image}. Pulling it.") - myimage = client.images.pull(repository=docker_image) + +def _docker_image_get( + client: DockerClient, docker_image_name: str, docker_image_update: bool = False +) -> Image: + """Retrieve docker image locally.""" + if docker_image_update: + return client.images.pull(repository=docker_image_name) + try: + image = next(iter(client.images.list(name=docker_image_name))) + except StopIteration: + logger.info("Docker: Could not find %s. Pulling it.", docker_image_name) + image = client.images.pull(repository=docker_image_name) else: - print("Docker: Found image {myimage} locally, ID is {myimage.attrs['Id']}.") + logger.info( + "Docker: Found image %s locally, ID is %s.", image, image.attrs["Id"] + ) + return image + +def _extract_indexes( + work_dir: str, docker_image_name: str, docker_image_update: bool = False +) -> None: + """Extract indexes out of the working directory. + + As an implementation details, this starts the maven indexer exporter container using + a docker image. This will use the local image if present, otherwise pull it from + docker hub first. + + """ + # Initialise the docker client. + client = docker.from_env() + image = _docker_image_get(client, docker_image_name, docker_image_update) + # Run the extraction process through the docker image (which runs the extract index + # script), see ../docker/Dockerfile. 
ret = client.containers.run( - myimage, + image, tty=True, - command=["sh", "/opt/extract_indexes.sh", "/work/"], volumes={work_dir: {"bind": "/work", "mode": "rw"}}, ) - print(f"Docker log:\n{ret.decode()}") + logger.info("Docker log:\n%s", ret.decode()) -def _download_indexes(instance_url: str): +def _download_indexes(work_dir: str, instance_url: str) -> None: """ Download all required indexes from the .index/ directory of the specified instance. """ - print(f"# Downloading all required indexes") + logger.info("Downloading all required indexes") index_url = urljoin(instance_url, ".index/") - properties_name = "nexus-maven-repository-index.properties" + properties_name = f"{MAVEN_INDEX_NAME}.properties" properties_file = join(work_dir, properties_name) properties_url = urljoin(index_url, properties_name) # Retrieve properties file. - print(f" - Downloading {properties_file}.") + logger.info(" - Downloading %s.", properties_file) content = requests.get(properties_url).content.decode() open(properties_file, "w").write(content) @@ -83,86 +91,131 @@ for line in content.split("\n"): diff_group = diff_re.match(line) if diff_group is not None: - ind_name = "nexus-maven-repository-index." + diff_group.group(1) + ".gz" + ind_name = f"{MAVEN_INDEX_NAME}.{diff_group.group(1)}.gz" ind_path = join(work_dir, ind_name) ind_url = urljoin(index_url, ind_name) if isfile(ind_path): - print(f" - File {ind_path} exists, skipping download.") + logger.info( + " - File %s exists, skipping download.", basename(ind_path) + ) else: - print( - ( - f" - File {ind_path} doesn't exist. " - f"Downloading file from {ind_url}." - ) + logger.info( + " - File %s doesn't exist. Downloading file from %s.", + basename(ind_path), + ind_url, ) # Retrieve incremental gz file contentb = requests.get(ind_url).content open(ind_path, "wb").write(contentb) # Retrieve main index file. 
- ind_path = join(work_dir, "nexus-maven-repository-index.gz") - ind_url = urljoin(index_url, "nexus-maven-repository-index.gz") + ind_path = join(work_dir, MAVEN_INDEX_ARCHIVE) + ind_url = urljoin(index_url, MAVEN_INDEX_ARCHIVE) if isfile(ind_path): - print(f" - File {ind_path} exists, skipping download.") + logger.info(" - File %s exists, skipping download.", basename(ind_path)) else: - print(f" - File {ind_path} doesn't exist. Downloading file from {ind_url}") + logger.info( + " - File %s doesn't exist. Downloading file from %s", + basename(ind_path), + ind_url, + ) + contentb = requests.get(ind_url).content open(ind_path, "wb").write(contentb) -############################################### -# Start execution -############################################### +@click.command() +@click.option( + "--base-url", + required=True, + help=( + "Base url of the maven repository instance. \n" + "Example: https://repo.maven.apache.org/maven2/" + ), +) +@click.option( + "--work-dir", + help="Absolute path to the temp directory.", + default="/tmp/maven-index-exporter/", +) +@click.option( + "--publish-dir", + help="Absolute path to the final directory.", + default="/tmp/maven-index-exporter/publish/", +) +@click.option( + "--docker-image-name", help="Docker image", default="maven-index-exporter" +) +@click.option( + "--docker-image-update", + is_flag=True, + help="Trigger a docker image update.", + default=False, +) +def main(base_url, work_dir, publish_dir, docker_image_name, docker_image_update): + now = datetime.datetime.now() + logger.info("Script: run_full_export") + logger.info("Timestamp: %s", now.strftime("%Y-%m-%d %H:%M:%S")) + logger.info("* URL: %s", base_url) + logger.info("* Working directory: %s", work_dir) + logger.info("* Publish directory: %s", publish_dir) + + # Check work_dir and create it if needed. + if isdir(work_dir): + logger.info("Work_Dir %s exists. Reusing it.", work_dir) + else: + try: + logger.info("Cannot find work_dir %s. 
Creating it.", work_dir) + Path(work_dir).mkdir(parents=True, exist_ok=True) + except OSError as error: + logger.error("Could not create work_dir %s: %s.", work_dir, error) + + assert isdir(work_dir) + assert isabs(work_dir) + + # Grab all the indexes + # Only fetch the new ones, existing files won't be re-downloaded. + _download_indexes(work_dir, base_url) + + # Run Docker on the downloaded indexes. + _extract_indexes( + work_dir, docker_image_name, docker_image_update=docker_image_update + ) -now = datetime.datetime.now() -print(f"Script: {sys.argv[0]}") -print("Timestamp:", now.strftime("%Y-%m-%d %H:%M:%S")) -print(f"* URL: {base_url}") -print(f"* Work_Dir: {work_dir}") + logger.info("Export directory has the following files:") + export_dir = join(work_dir, "export") + makedirs(export_dir, exist_ok=True) + chdir(export_dir) + myfile = None + re_fld = re.compile(r".*\.fld$") + for file_ in glob.glob("*.*"): + logger.info(" - %s size %s", file_, getsize(file_)) + if re_fld.match(file_): + myfile = file_ + + # Now copy the results to the desired location: publish_dir. + if myfile is not None and isfile(myfile): + logger.info("Found fld file: %s", myfile) + else: + logger.error("Cannot find .fld file. Exiting") + sys.exit(4) -# Check work_dir and create it if needed. -if isdir(work_dir): - print("Work_Dir {work_dir} exists. Reusing it.") -else: + makedirs(publish_dir, exist_ok=True) + publish_file = join(publish_dir, "export.fld") + logger.info("Copying files to %s.", publish_file) try: - print("Cannot find work_dir {work_dir}. Creating it.") - Path(work_dir).mkdir(parents=True, exist_ok=True) + copy2(myfile, publish_file) except OSError as error: - print(f"Could not create work_dir {work_dir}: {error}.") - -assert isdir(work_dir) -assert isabs(work_dir) - -# Grab all the indexes -# Only fetch the new ones, existing files won't be re-downloaded. -_download_indexes(base_url) - -# Run Docker on the downloaded indexes. 
-_docker_run("bbaldassari/maven-index-exporter") - -print("Export directory has the following files:") -owd = getcwd() -chdir(join(work_dir, "export")) -myfile = None -re_fld = re.compile(r".*\.fld$") -for file in glob.glob("*.*"): - print(" -", file, "size", getsize(file)) - if re_fld.match(file): - myfile = file - -# Now copy the results to the desired location: publish_dir. -if isfile(myfile): - print("Found fld file:", myfile) -else: - print("Cannot find .fld file. Exiting") - exit(4) - -publish_file = join(publish_dir, "export.fld") -print(f"Copying files to {publish_file}..") -try: - copy2(myfile, publish_file) -except OSError as error: - print(f"Could not publish results in {publish_dir}: {error}.") - -now = datetime.datetime.now() -print(f"Script finished on", now.strftime("%Y-%m-%d %H:%M:%S")) + logger.error("Could not publish results in %s: %s.", publish_dir, error) + + now = datetime.datetime.now() + logger.info("Script finished on %s", now.strftime("%Y-%m-%d %H:%M:%S")) + + +############################################### +# Start execution +############################################### + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + main()