Page MenuHomeSoftware Heritage

D7412.id26829.diff
No OneTemporary

D7412.id26829.diff

diff --git a/docker/Dockerfile b/docker/Dockerfile
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -1,4 +1,4 @@
-FROM adoptopenjdk/openjdk11:alpine-jre
+FROM adoptopenjdk/openjdk11:debian-jre
# Download and install jars
ADD https://github.com/javasoze/clue/releases/download/release-6.2.0-1.0.0/clue-6.2.0-1.0.0.jar /opt/
diff --git a/docker/extract_indexes.sh b/docker/extract_indexes.sh
--- a/docker/extract_indexes.sh
+++ b/docker/extract_indexes.sh
@@ -1,6 +1,6 @@
#!/bin/bash
-# Copyright (C) 2021 The Software Heritage developers
+# Copyright (C) 2021-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -52,7 +52,10 @@
du -sh $WORKDIR/indexes/
else
echo "Unpacking [$FILE_IN] to $WORKDIR/indexes"
- java --illegal-access=permit -jar $indexer --unpack $FILE_IN --destination $WORKDIR/indexes/ --type full 2>&1 | grep -v WARNING
+ java --illegal-access=permit -jar $indexer \
+ --unpack $FILE_IN \
+ --destination $WORKDIR/indexes/ \
+ --type full 2>&1 | grep -v WARNING
fi
localtime=$(date +"%Y-%m-%d %H:%M:%S")
@@ -64,7 +67,9 @@
ls -lh $WORKDIR/export/
else
echo "Exporting indexes $WORKDIR/indexes to $WORKDIR/export"
- java --illegal-access=permit -jar $clue $WORKDIR/indexes/ export $WORKDIR/export/ text 2>&1 | grep -v WARNING
+ java --illegal-access=permit -jar $clue \
+ $WORKDIR/indexes/ \
+ export $WORKDIR/export/ text 2>&1 | grep -v WARNING
fi
localtime=$(date +"%Y-%m-%d %H:%M:%S")
@@ -102,4 +107,3 @@
localtime=$(date +"%Y-%m-%d %H:%M:%S")
echo "Docker Script execution finished on $localtime."
-
diff --git a/docs/run_maven_index_exporter.md b/docs/run_maven_index_exporter.md
--- a/docs/run_maven_index_exporter.md
+++ b/docs/run_maven_index_exporter.md
@@ -12,12 +12,15 @@
disk (see warning below) and pass it to docker:
```
-$ LOCAL_DIR=/tmp/work
-$ docker run -v $LOCAL_DIR:/work $USER/maven-index-exporter
+$ LOCAL_DIR=/tmp/maven-index-exporter
+# build the image
+$ cd docker && docker build -f Dockerfile .
+# run the image
+$ docker run -v $LOCAL_DIR:/work maven-index-exporter
```
-Please note that the local work dir MUST be an absolute path, as docker won't mount
-relative paths as volumes.
+Please note that `LOCAL_DIR` *MUST* be an absolute path, as docker won't mount relative
+paths as volumes.
For our purpose only the fld file is kept, so if you need other export files you should
simply edit the `extract_indexes.sh` script and comment the lines that do the cleaning.
@@ -29,24 +32,29 @@
The `run_full_export.py` script located in `scripts/` provides an easy way to run the
export as a cron batch job, and copy the resulting text export to a specific location.
- %Simply use and adapt the crontab command as follows:
+Simply use and adapt the crontab command as follows:
```
cd $HOME/maven-index-exporter/scripts/ && \
- ./myvenv/bin/python $HOME/maven-index-exporter/scripts/run_full_export.py https://repo.maven.apache.org/maven2/ /tmp/maven-index \
- -exporter/ /var/www/html/maven_index_exporter/ 2>&1 > /tmp/run_maven_exporter_$(date +"%Y%m%d-%H%M%S").log
+ ./myvenv/bin/python $HOME/maven-index-exporter/scripts/run_full_export.py \
+ --base-url https://repo.maven.apache.org/maven2/ \
+ > /tmp/run_maven_exporter_$(date +"%Y%m%d-%H%M%S").log 2>&1
```
-The script takes three mandatory arguments:
+Script usage:
```
-Usage: run_full_export.py <url> <work_dir> <publish_dir>
- - url is the base url of the maven repository instance.
- Example: https://repo.maven.apache.org/maven2/
- - work_dir must be an absolute path to the temp directory.
- Example: /tmp/maven-index-exporter/
- - publish_dir must be an absolute path to the final directory.
- Example: /var/www/html/
+$ python3 run_full_export.py --help
+Usage: run_full_export.py [OPTIONS]
+
+Options:
+ --base-url TEXT Base url of the maven repository instance. Example:
+ https://repo.maven.apache.org/maven2/ [required]
+
+ --work-dir TEXT Absolute path to the temp directory.
+ --publish-dir TEXT Absolute path to the final directory.
+ --docker-image TEXT Docker image
+ --help Show this message and exit.
```
It is recommended to setup a virtual environment to run the script.
diff --git a/scripts/requirements.txt b/scripts/requirements.txt
--- a/scripts/requirements.txt
+++ b/scripts/requirements.txt
@@ -7,3 +7,4 @@
six==1.16.0
urllib3==1.26.6
websocket-client==1.2.1
+click
diff --git a/scripts/run_full_export.py b/scripts/run_full_export.py
--- a/scripts/run_full_export.py
+++ b/scripts/run_full_export.py
@@ -1,13 +1,15 @@
-# Copyright (C) 2021 The Software Heritage developers
+# Copyright (C) 2021-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+import logging
+import click
+import sys
import datetime
import glob
import re
-import sys
-from os import chdir, getcwd
+from os import chdir, getcwd, makedirs
from os.path import getsize, isabs, isdir, isfile, join
from pathlib import Path
from shutil import copy2
@@ -17,56 +19,45 @@
import docker
-# Check paramaters
-if len(sys.argv) != 4:
- print("Usage:", sys.argv[0], "url work_dir publish_dir")
- print(" - url is the base url of the maven repository instance.")
- print(" Example: https://repo.maven.apache.org/maven2/")
- print(" - work_dir must be an absolute path to the temp directory.")
- print(" Example: /tmp/maven-index-exporter/")
- print(" - publish_dir must be an absolute path to the final directory.")
- print(" Example: /var/www/html/")
- exit()
-
-base_url = sys.argv[1]
-work_dir = sys.argv[2]
-publish_dir = sys.argv[3]
-
-
-def _docker_run(docker_image: str):
- """ Start the container for the maven index export, using the image
- 'bbaldassari/maven-index-exporter'. If needed the image is pulled from
- docker hub. If it already exists, simply use the local one.
+logger = logging.getLogger(__name__)
+
+
+def _extract_indexes(work_dir: str, docker_image: str) -> None:
+ """Extract indexes out of the working directory.
+
+ As an implementation detail, this starts the maven indexer exporter container using
+ the docker image. If the image already exists locally, simply use the local one,
+ otherwise pull it from docker hub.
+
"""
# Initialise the docker client.
client = docker.from_env()
- myimage = None
- for image in client.images.list(name=docker_image):
- myimage = image
- break
-
- if myimage is None:
- print(f"Docker: Could not find {docker_image}. Pulling it.")
- myimage = client.images.pull(repository=docker_image)
+ try:
+ myimage = next(iter(client.images.list(name=docker_image)))
+ except StopIteration:
+ logger.info("Docker: Could not find %s. Pulling it.", docker_image)
+ myimage = client.images.pull(repository=docker_image, force_pull=True)
else:
- print("Docker: Found image {myimage} locally, ID is {myimage.attrs['Id']}.")
+ logger.info(
+ "Docker: Found image %s locally, ID is %s.", myimage, myimage.attrs['Id']
+ )
ret = client.containers.run(
myimage,
tty=True,
- command=["sh", "/opt/extract_indexes.sh", "/work/"],
+ command=["sh", "/opt/extract_indexes.sh"],
volumes={work_dir: {"bind": "/work", "mode": "rw"}},
)
- print(f"Docker log:\n{ret.decode()}")
+ logger.info("Docker log:\n%s", ret.decode())
-def _download_indexes(instance_url: str):
+def _download_indexes(work_dir: str, instance_url: str) -> None:
""" Download all required indexes from the .index/ directory
of the specified instance.
"""
- print(f"# Downloading all required indexes")
+ logger.info("Downloading all required indexes")
index_url = urljoin(instance_url, ".index/")
@@ -75,7 +66,7 @@
properties_url = urljoin(index_url, properties_name)
# Retrieve properties file.
- print(f" - Downloading {properties_file}.")
+ logger.info(" - Downloading %s.", properties_file)
content = requests.get(properties_url).content.decode()
open(properties_file, "w").write(content)
@@ -83,16 +74,16 @@
for line in content.split("\n"):
diff_group = diff_re.match(line)
if diff_group is not None:
- ind_name = "nexus-maven-repository-index." + diff_group.group(1) + ".gz"
+ ind_name = f"nexus-maven-repository-index.{diff_group.group(1)}.gz"
ind_path = join(work_dir, ind_name)
ind_url = urljoin(index_url, ind_name)
if isfile(ind_path):
- print(f" - File {ind_path} exists, skipping download.")
+ logger.info(" - File %s exists, skipping download.", ind_path)
else:
- print(
+ logger.info(
(
- f" - File {ind_path} doesn't exist. "
- f"Downloading file from {ind_url}."
+ " - File %s doesn't exist. "
+ "Downloading file from %s.", ind_path, ind_url
)
)
# Retrieve incremental gz file
@@ -103,66 +94,100 @@
ind_path = join(work_dir, "nexus-maven-repository-index.gz")
ind_url = urljoin(index_url, "nexus-maven-repository-index.gz")
if isfile(ind_path):
- print(f" - File {ind_path} exists, skipping download.")
+ logger.info(f" - File {ind_path} exists, skipping download.")
else:
- print(f" - File {ind_path} doesn't exist. Downloading file from {ind_url}")
+ logger.info(
+ f" - File {ind_path} doesn't exist. Downloading file from {ind_url}"
+ )
contentb = requests.get(ind_url).content
open(ind_path, "wb").write(contentb)
+@click.command()
+@click.option(
+ "--base-url",
+ required=True,
+ help=(
+ "Base url of the maven repository instance. \n"
+ "Example: https://repo.maven.apache.org/maven2/"
+ ),
+)
+@click.option(
+ "--work-dir",
+ help="Absolute path to the temp directory.",
+ default="/tmp/maven-index-exporter/")
+@click.option(
+ "--publish-dir",
+ help="Absolute path to the final directory.",
+ default="/tmp/maven-index-exporter/publish/"
+)
+@click.option(
+ "--docker-image",
+ help="Docker image",
+ default="maven-index-exporter"
+)
+def main(base_url, work_dir, publish_dir, docker_image):
+ now = datetime.datetime.now()
+ logger.info("Script: run_full_export")
+ logger.info("Timestamp: %s", now.strftime("%Y-%m-%d %H:%M:%S"))
+ logger.info("* URL: %s", base_url)
+ logger.info("* Working directory: %s", work_dir)
+ logger.info("* Publish directory: %s", publish_dir)
+
+ # Check work_dir and create it if needed.
+ if isdir(work_dir):
+ logger.info("Work_Dir %s exists. Reusing it.", work_dir)
+ else:
+ try:
+ logger.info("Cannot find work_dir %s. Creating it.", work_dir)
+ Path(work_dir).mkdir(parents=True, exist_ok=True)
+ except OSError as error:
+ logger.info("Could not create work_dir %s: %s.", work_dir, error)
+
+ assert isdir(work_dir)
+ assert isabs(work_dir)
+
+ # Grab all the indexes
+ # Only fetch the new ones, existing files won't be re-downloaded.
+ _download_indexes(work_dir, base_url)
+
+ # Run Docker on the downloaded indexes.
+ _extract_indexes(work_dir, docker_image)
+
+ logger.info("Export directory has the following files:")
+ export_dir = join(work_dir, "export")
+ makedirs(export_dir, exist_ok=True)
+ chdir(export_dir)
+ myfile = None
+ re_fld = re.compile(r".*\.fld$")
+ for file_ in glob.glob("*.*"):
+ logger.info(" - %s size %s", file_, getsize(file_))
+ if re_fld.match(file_):
+ myfile = file_
+
+ # Now copy the results to the desired location: publish_dir.
+ if isfile(myfile):
+ logger.info("Found fld file: %s", myfile)
+ else:
+ logger.info("Cannot find .fld file. Exiting")
+ sys.exit(4)
+
+ makedirs(publish_dir, exist_ok=True)
+ publish_file = join(publish_dir, "export.fld")
+ logger.info("Copying files to %s.", publish_file)
+ try:
+ copy2(myfile, publish_file)
+ except OSError as error:
+ logger.info("Could not publish results in %s: %s.", publish_dir, error)
+
+ now = datetime.datetime.now()
+ logger.info("Script finished on %s", now.strftime("%Y-%m-%d %H:%M:%S"))
+
+
###############################################
# Start execution
###############################################
-now = datetime.datetime.now()
-print(f"Script: {sys.argv[0]}")
-print("Timestamp:", now.strftime("%Y-%m-%d %H:%M:%S"))
-print(f"* URL: {base_url}")
-print(f"* Work_Dir: {work_dir}")
-
-# Check work_dir and create it if needed.
-if isdir(work_dir):
- print("Work_Dir {work_dir} exists. Reusing it.")
-else:
- try:
- print("Cannot find work_dir {work_dir}. Creating it.")
- Path(work_dir).mkdir(parents=True, exist_ok=True)
- except OSError as error:
- print(f"Could not create work_dir {work_dir}: {error}.")
-
-assert isdir(work_dir)
-assert isabs(work_dir)
-
-# Grab all the indexes
-# Only fetch the new ones, existing files won't be re-downloaded.
-_download_indexes(base_url)
-
-# Run Docker on the downloaded indexes.
-_docker_run("bbaldassari/maven-index-exporter")
-
-print("Export directory has the following files:")
-owd = getcwd()
-chdir(join(work_dir, "export"))
-myfile = None
-re_fld = re.compile(r".*\.fld$")
-for file in glob.glob("*.*"):
- print(" -", file, "size", getsize(file))
- if re_fld.match(file):
- myfile = file
-
-# Now copy the results to the desired location: publish_dir.
-if isfile(myfile):
- print("Found fld file:", myfile)
-else:
- print("Cannot find .fld file. Exiting")
- exit(4)
-
-publish_file = join(publish_dir, "export.fld")
-print(f"Copying files to {publish_file}..")
-try:
- copy2(myfile, publish_file)
-except OSError as error:
- print(f"Could not publish results in {publish_dir}: {error}.")
-
-now = datetime.datetime.now()
-print(f"Script finished on", now.strftime("%Y-%m-%d %H:%M:%S"))
+if __name__ == "__main__":
+ logging.basicConfig(level=logging.INFO)
+ main()

File Metadata

Mime Type
text/plain
Expires
Wed, Sep 17, 4:36 AM (8 h, 48 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3226349

Event Timeline