diff --git a/docker/extract_indexes.sh b/docker/extract_indexes.sh
--- a/docker/extract_indexes.sh
+++ b/docker/extract_indexes.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Copyright (C) 2021 The Software Heritage developers
+# Copyright (C) 2021-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -52,7 +52,10 @@
     du -sh $WORKDIR/indexes/
 else
     echo "Unpacking [$FILE_IN] to $WORKDIR/indexes"
-    java --illegal-access=permit -jar $indexer --unpack $FILE_IN --destination $WORKDIR/indexes/ --type full 2>&1 | grep -v WARNING
+    java --illegal-access=permit -jar $indexer \
+         --unpack $FILE_IN \
+         --destination $WORKDIR/indexes/ \
+         --type full 2>&1 | grep -v WARNING
 fi
 
 localtime=$(date +"%Y-%m-%d %H:%M:%S")
@@ -64,7 +67,9 @@
     ls -lh $WORKDIR/export/
 else
     echo "Exporting indexes $WORKDIR/indexes to $WORKDIR/export"
-    java --illegal-access=permit -jar $clue $WORKDIR/indexes/ export $WORKDIR/export/ text 2>&1 | grep -v WARNING
+    java --illegal-access=permit -jar $clue \
+         $WORKDIR/indexes/ \
+         export $WORKDIR/export/ text 2>&1 | grep -v WARNING
 fi
 
 localtime=$(date +"%Y-%m-%d %H:%M:%S")
@@ -102,4 +107,3 @@
 
 localtime=$(date +"%Y-%m-%d %H:%M:%S")
 echo "Docker Script execution finished on $localtime."
-
diff --git a/docs/run_maven_index_exporter.md b/docs/run_maven_index_exporter.md
--- a/docs/run_maven_index_exporter.md
+++ b/docs/run_maven_index_exporter.md
@@ -12,12 +12,15 @@
 disk (see warning below) and pass it to docker:
 
 ```
-$ LOCAL_DIR=/tmp/work
-$ docker run -v $LOCAL_DIR:/work $USER/maven-index-exporter
+$ LOCAL_DIR=/tmp/maven-index-exporter
+# build the image
+$ cd docker && docker build -t maven-index-exporter -f Dockerfile .
+# run the image
+$ docker run -v $LOCAL_DIR:/work maven-index-exporter
 ```
 
-Please note that the local work dir MUST be an absolute path, as docker won't mount
-relative paths as volumes.
+Please note that `LOCAL_DIR` *MUST* be an absolute path, as docker won't mount relative
+paths as volumes.
 
 For our purpose only the fld file is kept, so if you need other export files you should
 simply edit the `extract_indexes.sh` script and comment the lines that do the cleaning.
@@ -29,24 +32,29 @@
 The `run_full_export.py` script located in `scripts/` provides an easy way to run the
 export as a cron batch job, and copy the resulting text export to a specific location.
 
- %Simply use and adapt the crontab command as follows:
+Simply use and adapt the crontab command as follows:
 
 ```
 cd $HOME/maven-index-exporter/scripts/ && \
-    ./myvenv/bin/python $HOME/maven-index-exporter/scripts/run_full_export.py https://repo.maven.apache.org/maven2/ /tmp/maven-index \
-    -exporter/ /var/www/html/maven_index_exporter/ 2>&1 > /tmp/run_maven_exporter_$(date +"%Y%m%d-%H%M%S").log
+    ./myvenv/bin/python $HOME/maven-index-exporter/scripts/run_full_export.py \
+    --base-url https://repo.maven.apache.org/maven2/ \
+    > /tmp/run_maven_exporter_$(date +"%Y%m%d-%H%M%S").log 2>&1
 ```
 
-The script takes three mandatory arguments:
+Script usage:
 
 ```
-Usage: run_full_export.py
- - url is the base url of the maven repository instance.
-   Example: https://repo.maven.apache.org/maven2/
- - work_dir must be an absolute path to the temp directory.
-   Example: /tmp/maven-index-exporter/
- - publish_dir must be an absolute path to the final directory.
-   Example: /var/www/html/
+$ python3 run_full_export.py --help
+Usage: run_full_export.py [OPTIONS]
+
+Options:
+  --base-url TEXT      Base url of the maven repository instance. Example:
+                       https://repo.maven.apache.org/maven2/  [required]
+
+  --work-dir TEXT      Absolute path to the temp directory.
+  --publish-dir TEXT   Absolute path to the final directory.
+  --docker-image TEXT  Docker image
+  --help               Show this message and exit.
 ```
 
 It is recommended to setup a virtual environment to run the script.
diff --git a/scripts/requirements.txt b/scripts/requirements.txt
--- a/scripts/requirements.txt
+++ b/scripts/requirements.txt
@@ -7,3 +7,4 @@
 six==1.16.0
 urllib3==1.26.6
 websocket-client==1.2.1
+click
diff --git a/scripts/run_full_export.py b/scripts/run_full_export.py
--- a/scripts/run_full_export.py
+++ b/scripts/run_full_export.py
@@ -1,13 +1,15 @@
-# Copyright (C) 2021 The Software Heritage developers
+# Copyright (C) 2021-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+import logging
+import click
+import sys
 import datetime
 import glob
 import re
-import sys
-from os import chdir, getcwd
+from os import chdir, getcwd, makedirs
 from os.path import getsize, isabs, isdir, isfile, join
 from pathlib import Path
 from shutil import copy2
@@ -17,56 +19,45 @@
 import docker
 
 
-# Check paramaters
-if len(sys.argv) != 4:
-    print("Usage:", sys.argv[0], "url work_dir publish_dir")
-    print(" - url is the base url of the maven repository instance.")
-    print("   Example: https://repo.maven.apache.org/maven2/")
-    print(" - work_dir must be an absolute path to the temp directory.")
-    print("   Example: /tmp/maven-index-exporter/")
-    print(" - publish_dir must be an absolute path to the final directory.")
-    print("   Example: /var/www/html/")
-    exit()
-
-base_url = sys.argv[1]
-work_dir = sys.argv[2]
-publish_dir = sys.argv[3]
-
-
-def _docker_run(docker_image: str):
-    """ Start the container for the maven index export, using the image
-    'bbaldassari/maven-index-exporter'. If needed the image is pulled from
-    docker hub. If it already exists, simply use the local one.
+logger = logging.getLogger(__name__)
+
+
+def _extract_indexes(work_dir: str, docker_image: str) -> None:
+    """Extract indexes out of the working directory.
+
+    As an implementation detail, this starts the maven indexer exporter container using
+    the docker image. If the image already exists locally, simply use the local one,
+    otherwise pull it from docker hub.
+
     """
     # Initialise the docker client.
     client = docker.from_env()
 
-    myimage = None
-    for image in client.images.list(name=docker_image):
-        myimage = image
-        break
-
-    if myimage is None:
-        print(f"Docker: Could not find {docker_image}. Pulling it.")
+    try:
+        myimage = next(iter(client.images.list(name=docker_image)))
+    except StopIteration:
+        logger.info("Docker: Could not find %s. Pulling it.", docker_image)
         myimage = client.images.pull(repository=docker_image)
     else:
-        print("Docker: Found image {myimage} locally, ID is {myimage.attrs['Id']}.")
+        logger.info(
+            "Docker: Found image %s locally, ID is %s.", myimage, myimage.attrs['Id']
+        )
 
     ret = client.containers.run(
         myimage,
         tty=True,
-        command=["sh", "/opt/extract_indexes.sh", "/work/"],
+        command=["sh", "/opt/extract_indexes.sh"],
         volumes={work_dir: {"bind": "/work", "mode": "rw"}},
     )
 
-    print(f"Docker log:\n{ret.decode()}")
+    logger.info("Docker log:\n%s", ret.decode())
 
 
-def _download_indexes(instance_url: str):
+def _download_indexes(work_dir: str, instance_url: str) -> None:
     """ Download all required indexes from the .index/ directory
     of the specified instance.
     """
-    print(f"# Downloading all required indexes")
+    logger.info("Downloading all required indexes")
 
     index_url = urljoin(instance_url, ".index/")
 
@@ -75,7 +66,7 @@
     properties_url = urljoin(index_url, properties_name)
 
     # Retrieve properties file.
-    print(f" - Downloading {properties_file}.")
+    logger.info(" - Downloading %s.", properties_file)
     content = requests.get(properties_url).content.decode()
     open(properties_file, "w").write(content)
 
@@ -83,16 +74,16 @@
     for line in content.split("\n"):
         diff_group = diff_re.match(line)
         if diff_group is not None:
-            ind_name = "nexus-maven-repository-index." + diff_group.group(1) + ".gz"
+            ind_name = f"nexus-maven-repository-index.{diff_group.group(1)}.gz"
             ind_path = join(work_dir, ind_name)
             ind_url = urljoin(index_url, ind_name)
             if isfile(ind_path):
-                print(f" - File {ind_path} exists, skipping download.")
+                logger.info(" - File %s exists, skipping download.", ind_path)
             else:
-                print(
-                    (
-                        f" - File {ind_path} doesn't exist. "
-                        f"Downloading file from {ind_url}."
-                    )
+                logger.info(
+                    " - File %s doesn't exist. "
+                    "Downloading file from %s.",
+                    ind_path,
+                    ind_url,
                 )
                 # Retrieve incremental gz file
@@ -103,66 +94,103 @@
     ind_path = join(work_dir, "nexus-maven-repository-index.gz")
     ind_url = urljoin(index_url, "nexus-maven-repository-index.gz")
     if isfile(ind_path):
-        print(f" - File {ind_path} exists, skipping download.")
+        logger.info(" - File %s exists, skipping download.", ind_path)
     else:
-        print(f" - File {ind_path} doesn't exist. Downloading file from {ind_url}")
+        logger.info(
+            " - File %s doesn't exist. Downloading file from %s", ind_path, ind_url
+        )
         contentb = requests.get(ind_url).content
         open(ind_path, "wb").write(contentb)
 
 
+# Command-line entry point: download the indexes into work_dir, run the docker
+# export on them, then copy the resulting .fld file to publish_dir.
+@click.command()
+@click.option(
+    "--base-url",
+    required=True,
+    help=(
+        "Base url of the maven repository instance. \n"
+        "Example: https://repo.maven.apache.org/maven2/"
+    ),
+)
+@click.option(
+    "--work-dir",
+    help="Absolute path to the temp directory.",
+    default="/tmp/maven-index-exporter/")
+@click.option(
+    "--publish-dir",
+    help="Absolute path to the final directory.",
+    default="/tmp/maven-index-exporter/publish/"
+)
+@click.option(
+    "--docker-image",
+    help="Docker image",
+    default="maven-index-exporter"
+)
+def main(base_url, work_dir, publish_dir, docker_image):
+    now = datetime.datetime.now()
+    logger.info("Script: run_full_export")
+    logger.info("Timestamp: %s", now.strftime("%Y-%m-%d %H:%M:%S"))
+    logger.info("* URL: %s", base_url)
+    logger.info("* Working directory: %s", work_dir)
+    logger.info("* Publish directory: %s", publish_dir)
+
+    # Check work_dir and create it if needed.
+    if isdir(work_dir):
+        logger.info("Work_Dir %s exists. Reusing it.", work_dir)
+    else:
+        try:
+            logger.info("Cannot find work_dir %s. Creating it.", work_dir)
+            Path(work_dir).mkdir(parents=True, exist_ok=True)
+        except OSError as error:
+            logger.info("Could not create work_dir %s: %s.", work_dir, error)
+
+    assert isdir(work_dir)
+    assert isabs(work_dir)
+
+    # Grab all the indexes
+    # Only fetch the new ones, existing files won't be re-downloaded.
+    _download_indexes(work_dir, base_url)
+
+    # Run Docker on the downloaded indexes.
+    _extract_indexes(work_dir, docker_image)
+
+    logger.info("Export directory has the following files:")
+    export_dir = join(work_dir, "export")
+    makedirs(export_dir, exist_ok=True)
+    chdir(export_dir)
+    myfile = None
+    re_fld = re.compile(r".*\.fld$")
+    for file_ in glob.glob("*.*"):
+        logger.info(" - %s size %s", file_, getsize(file_))
+        if re_fld.match(file_):
+            myfile = file_
+
+    # Now copy the results to the desired location: publish_dir.
+    # myfile is still None when the export produced no .fld file; isfile(None) raises.
+    if myfile is not None and isfile(myfile):
+        logger.info("Found fld file: %s", myfile)
+    else:
+        logger.info("Cannot find .fld file. Exiting")
+        sys.exit(4)
+
+    makedirs(publish_dir, exist_ok=True)
+    publish_file = join(publish_dir, "export.fld")
+    logger.info("Copying files to %s.", publish_file)
+    try:
+        copy2(myfile, publish_file)
+    except OSError as error:
+        logger.info("Could not publish results in %s: %s.", publish_dir, error)
+
+    now = datetime.datetime.now()
+    logger.info("Script finished on %s", now.strftime("%Y-%m-%d %H:%M:%S"))
+
+
 ###############################################
 # Start execution
 ###############################################
 
-now = datetime.datetime.now()
-print(f"Script: {sys.argv[0]}")
-print("Timestamp:", now.strftime("%Y-%m-%d %H:%M:%S"))
-print(f"* URL: {base_url}")
-print(f"* Work_Dir: {work_dir}")
-
-# Check work_dir and create it if needed.
-if isdir(work_dir):
-    print("Work_Dir {work_dir} exists. Reusing it.")
-else:
-    try:
-        print("Cannot find work_dir {work_dir}. Creating it.")
-        Path(work_dir).mkdir(parents=True, exist_ok=True)
-    except OSError as error:
-        print(f"Could not create work_dir {work_dir}: {error}.")
-
-assert isdir(work_dir)
-assert isabs(work_dir)
-
-# Grab all the indexes
-# Only fetch the new ones, existing files won't be re-downloaded.
-_download_indexes(base_url)
-
-# Run Docker on the downloaded indexes.
-_docker_run("bbaldassari/maven-index-exporter")
-
-print("Export directory has the following files:")
-owd = getcwd()
-chdir(join(work_dir, "export"))
-myfile = None
-re_fld = re.compile(r".*\.fld$")
-for file in glob.glob("*.*"):
-    print(" -", file, "size", getsize(file))
-    if re_fld.match(file):
-        myfile = file
-
-# Now copy the results to the desired location: publish_dir.
-if isfile(myfile):
-    print("Found fld file:", myfile)
-else:
-    print("Cannot find .fld file. Exiting")
-    exit(4)
-
-publish_file = join(publish_dir, "export.fld")
-print(f"Copying files to {publish_file}..")
-try:
-    copy2(myfile, publish_file)
-except OSError as error:
-    print(f"Could not publish results in {publish_dir}: {error}.")
-
-now = datetime.datetime.now()
-print(f"Script finished on", now.strftime("%Y-%m-%d %H:%M:%S"))
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO)
+    main()