Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F11012804
D7412.id26829.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
14 KB
Subscribers
None
D7412.id26829.diff
View Options
diff --git a/docker/Dockerfile b/docker/Dockerfile
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -1,4 +1,4 @@
-FROM adoptopenjdk/openjdk11:alpine-jre
+FROM adoptopenjdk/openjdk11:debian-jre
# Download and install jars
ADD https://github.com/javasoze/clue/releases/download/release-6.2.0-1.0.0/clue-6.2.0-1.0.0.jar /opt/
diff --git a/docker/extract_indexes.sh b/docker/extract_indexes.sh
--- a/docker/extract_indexes.sh
+++ b/docker/extract_indexes.sh
@@ -1,6 +1,6 @@
#!/bin/bash
-# Copyright (C) 2021 The Software Heritage developers
+# Copyright (C) 2021-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -52,7 +52,10 @@
du -sh $WORKDIR/indexes/
else
echo "Unpacking [$FILE_IN] to $WORKDIR/indexes"
- java --illegal-access=permit -jar $indexer --unpack $FILE_IN --destination $WORKDIR/indexes/ --type full 2>&1 | grep -v WARNING
+ java --illegal-access=permit -jar $indexer \
+ --unpack $FILE_IN \
+ --destination $WORKDIR/indexes/ \
+ --type full 2>&1 | grep -v WARNING
fi
localtime=$(date +"%Y-%m-%d %H:%M:%S")
@@ -64,7 +67,9 @@
ls -lh $WORKDIR/export/
else
echo "Exporting indexes $WORKDIR/indexes to $WORKDIR/export"
- java --illegal-access=permit -jar $clue $WORKDIR/indexes/ export $WORKDIR/export/ text 2>&1 | grep -v WARNING
+ java --illegal-access=permit -jar $clue \
+ $WORKDIR/indexes/ \
+ export $WORKDIR/export/ text 2>&1 | grep -v WARNING
fi
localtime=$(date +"%Y-%m-%d %H:%M:%S")
@@ -102,4 +107,3 @@
localtime=$(date +"%Y-%m-%d %H:%M:%S")
echo "Docker Script execution finished on $localtime."
-
diff --git a/docs/run_maven_index_exporter.md b/docs/run_maven_index_exporter.md
--- a/docs/run_maven_index_exporter.md
+++ b/docs/run_maven_index_exporter.md
@@ -12,12 +12,15 @@
disk (see warning below) and pass it to docker:
```
-$ LOCAL_DIR=/tmp/work
-$ docker run -v $LOCAL_DIR:/work $USER/maven-index-exporter
+$ LOCAL_DIR=/tmp/maven-index-exporter
+# build the image
+$ cd docker && docker build -t maven-index-exporter -f Dockerfile .
+# run the image
+$ docker run -v $LOCAL_DIR:/work maven-index-exporter
```
-Please note that the local work dir MUST be an absolute path, as docker won't mount
-relative paths as volumes.
+Please note that `LOCAL_DIR` *MUST* be an absolute path, as docker won't mount relative
+paths as volumes.
For our purpose only the fld file is kept, so if you need other export files you should
simply edit the `extract_indexes.sh` script and comment the lines that do the cleaning.
@@ -29,24 +32,29 @@
The `run_full_export.py` script located in `scripts/` provides an easy way to run the
export as a cron batch job, and copy the resulting text export to a specific location.
- %Simply use and adapt the crontab command as follows:
+Simply use and adapt the crontab command as follows:
```
cd $HOME/maven-index-exporter/scripts/ && \
- ./myvenv/bin/python $HOME/maven-index-exporter/scripts/run_full_export.py https://repo.maven.apache.org/maven2/ /tmp/maven-index \
- -exporter/ /var/www/html/maven_index_exporter/ 2>&1 > /tmp/run_maven_exporter_$(date +"%Y%m%d-%H%M%S").log
+ ./myvenv/bin/python $HOME/maven-index-exporter/scripts/run_full_export.py \
+ --base-url https://repo.maven.apache.org/maven2/ \
+ > /tmp/run_maven_exporter_$(date +"%Y%m%d-%H%M%S").log 2>&1
```
-The script takes three mandatory arguments:
+Script usage:
```
-Usage: run_full_export.py <url> <work_dir> <publish_dir>
- - url is the base url of the maven repository instance.
- Example: https://repo.maven.apache.org/maven2/
- - work_dir must be an absolute path to the temp directory.
- Example: /tmp/maven-index-exporter/
- - publish_dir must be an absolute path to the final directory.
- Example: /var/www/html/
+$ python3 run_full_export.py --help
+Usage: run_full_export.py [OPTIONS]
+
+Options:
+ --base-url TEXT Base url of the maven repository instance. Example:
+ https://repo.maven.apache.org/maven2/ [required]
+
+ --work-dir TEXT Absolute path to the temp directory.
+ --publish-dir TEXT Absolute path to the final directory.
+ --docker-image TEXT Docker image
+ --help Show this message and exit.
```
It is recommended to setup a virtual environment to run the script.
diff --git a/scripts/requirements.txt b/scripts/requirements.txt
--- a/scripts/requirements.txt
+++ b/scripts/requirements.txt
@@ -7,3 +7,4 @@
six==1.16.0
urllib3==1.26.6
websocket-client==1.2.1
+click
diff --git a/scripts/run_full_export.py b/scripts/run_full_export.py
--- a/scripts/run_full_export.py
+++ b/scripts/run_full_export.py
@@ -1,13 +1,15 @@
-# Copyright (C) 2021 The Software Heritage developers
+# Copyright (C) 2021-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+import logging
+import click
+import sys
import datetime
import glob
import re
-import sys
-from os import chdir, getcwd
+from os import chdir, getcwd, makedirs
from os.path import getsize, isabs, isdir, isfile, join
from pathlib import Path
from shutil import copy2
@@ -17,56 +19,45 @@
import docker
-# Check paramaters
-if len(sys.argv) != 4:
- print("Usage:", sys.argv[0], "url work_dir publish_dir")
- print(" - url is the base url of the maven repository instance.")
- print(" Example: https://repo.maven.apache.org/maven2/")
- print(" - work_dir must be an absolute path to the temp directory.")
- print(" Example: /tmp/maven-index-exporter/")
- print(" - publish_dir must be an absolute path to the final directory.")
- print(" Example: /var/www/html/")
- exit()
-
-base_url = sys.argv[1]
-work_dir = sys.argv[2]
-publish_dir = sys.argv[3]
-
-
-def _docker_run(docker_image: str):
- """ Start the container for the maven index export, using the image
- 'bbaldassari/maven-index-exporter'. If needed the image is pulled from
- docker hub. If it already exists, simply use the local one.
+logger = logging.getLogger(__name__)
+
+
+def _extract_indexes(work_dir: str, docker_image: str) -> None:
+ """Extract indexes out of the working directory.
+
+As an implementation detail, this starts the maven indexer exporter container using
+ the docker image. If the image already exists locally, simply use the local one,
+ otherwise pull it from docker hub.
+
"""
# Initialise the docker client.
client = docker.from_env()
- myimage = None
- for image in client.images.list(name=docker_image):
- myimage = image
- break
-
- if myimage is None:
- print(f"Docker: Could not find {docker_image}. Pulling it.")
- myimage = client.images.pull(repository=docker_image)
+ try:
+ myimage = next(iter(client.images.list(name=docker_image)))
+ except StopIteration:
+ logger.info("Docker: Could not find %s. Pulling it.", docker_image)
+ myimage = client.images.pull(repository=docker_image, force_pull=True)
else:
- print("Docker: Found image {myimage} locally, ID is {myimage.attrs['Id']}.")
+ logger.info(
+ "Docker: Found image %s locally, ID is %s.", myimage, myimage.attrs['Id']
+ )
ret = client.containers.run(
myimage,
tty=True,
- command=["sh", "/opt/extract_indexes.sh", "/work/"],
+ command=["sh", "/opt/extract_indexes.sh"],
volumes={work_dir: {"bind": "/work", "mode": "rw"}},
)
- print(f"Docker log:\n{ret.decode()}")
+ logger.info("Docker log:\n%s", ret.decode())
-def _download_indexes(instance_url: str):
+def _download_indexes(work_dir: str, instance_url: str) -> None:
""" Download all required indexes from the .index/ directory
of the specified instance.
"""
- print(f"# Downloading all required indexes")
+ logger.info("Downloading all required indexes")
index_url = urljoin(instance_url, ".index/")
@@ -75,7 +66,7 @@
properties_url = urljoin(index_url, properties_name)
# Retrieve properties file.
- print(f" - Downloading {properties_file}.")
+ logger.info(" - Downloading %s.", properties_file)
content = requests.get(properties_url).content.decode()
open(properties_file, "w").write(content)
@@ -83,16 +74,16 @@
for line in content.split("\n"):
diff_group = diff_re.match(line)
if diff_group is not None:
- ind_name = "nexus-maven-repository-index." + diff_group.group(1) + ".gz"
+ ind_name = f"nexus-maven-repository-index.{diff_group.group(1)}.gz"
ind_path = join(work_dir, ind_name)
ind_url = urljoin(index_url, ind_name)
if isfile(ind_path):
- print(f" - File {ind_path} exists, skipping download.")
+ logger.info(" - File %s exists, skipping download.", ind_path)
else:
- print(
+ logger.info(
(
- f" - File {ind_path} doesn't exist. "
- f"Downloading file from {ind_url}."
+ " - File %s doesn't exist. "
+ "Downloading file from %s.", ind_path, ind_url
)
)
# Retrieve incremental gz file
@@ -103,66 +94,100 @@
ind_path = join(work_dir, "nexus-maven-repository-index.gz")
ind_url = urljoin(index_url, "nexus-maven-repository-index.gz")
if isfile(ind_path):
- print(f" - File {ind_path} exists, skipping download.")
+ logger.info(f" - File {ind_path} exists, skipping download.")
else:
- print(f" - File {ind_path} doesn't exist. Downloading file from {ind_url}")
+ logger.info(
+ f" - File {ind_path} doesn't exist. Downloading file from {ind_url}"
+ )
contentb = requests.get(ind_url).content
open(ind_path, "wb").write(contentb)
+@click.command()
+@click.option(
+ "--base-url",
+ required=True,
+ help=(
+ "Base url of the maven repository instance. \n"
+ "Example: https://repo.maven.apache.org/maven2/"
+ ),
+)
+@click.option(
+ "--work-dir",
+ help="Absolute path to the temp directory.",
+ default="/tmp/maven-index-exporter/")
+@click.option(
+ "--publish-dir",
+ help="Absolute path to the final directory.",
+ default="/tmp/maven-index-exporter/publish/"
+)
+@click.option(
+ "--docker-image",
+ help="Docker image",
+ default="maven-index-exporter"
+)
+def main(base_url, work_dir, publish_dir, docker_image):
+ now = datetime.datetime.now()
+ logger.info("Script: run_full_export")
+ logger.info("Timestamp: %s", now.strftime("%Y-%m-%d %H:%M:%S"))
+ logger.info("* URL: %s", base_url)
+ logger.info("* Working directory: %s", work_dir)
+ logger.info("* Publish directory: %s", publish_dir)
+
+ # Check work_dir and create it if needed.
+ if isdir(work_dir):
+ logger.info("Work_Dir %s exists. Reusing it.", work_dir)
+ else:
+ try:
+ logger.info("Cannot find work_dir %s. Creating it.", work_dir)
+ Path(work_dir).mkdir(parents=True, exist_ok=True)
+ except OSError as error:
+ logger.info("Could not create work_dir %s: %s.", work_dir, error)
+
+ assert isdir(work_dir)
+ assert isabs(work_dir)
+
+ # Grab all the indexes
+ # Only fetch the new ones, existing files won't be re-downloaded.
+ _download_indexes(work_dir, base_url)
+
+ # Run Docker on the downloaded indexes.
+ _extract_indexes(work_dir, docker_image)
+
+ logger.info("Export directory has the following files:")
+ export_dir = join(work_dir, "export")
+ makedirs(export_dir, exist_ok=True)
+ chdir(export_dir)
+ myfile = None
+ re_fld = re.compile(r".*\.fld$")
+ for file_ in glob.glob("*.*"):
+ logger.info(" - %s size %s", file_, getsize(file_))
+ if re_fld.match(file_):
+ myfile = file_
+
+ # Now copy the results to the desired location: publish_dir.
+ if isfile(myfile):
+ logger.info("Found fld file: %s", myfile)
+ else:
+ logger.info("Cannot find .fld file. Exiting")
+ sys.exit(4)
+
+ makedirs(publish_dir, exist_ok=True)
+ publish_file = join(publish_dir, "export.fld")
+ logger.info("Copying files to %s.", publish_file)
+ try:
+ copy2(myfile, publish_file)
+ except OSError as error:
+ logger.info("Could not publish results in %s: %s.", publish_dir, error)
+
+ now = datetime.datetime.now()
+ logger.info("Script finished on %s", now.strftime("%Y-%m-%d %H:%M:%S"))
+
+
###############################################
# Start execution
###############################################
-now = datetime.datetime.now()
-print(f"Script: {sys.argv[0]}")
-print("Timestamp:", now.strftime("%Y-%m-%d %H:%M:%S"))
-print(f"* URL: {base_url}")
-print(f"* Work_Dir: {work_dir}")
-
-# Check work_dir and create it if needed.
-if isdir(work_dir):
- print("Work_Dir {work_dir} exists. Reusing it.")
-else:
- try:
- print("Cannot find work_dir {work_dir}. Creating it.")
- Path(work_dir).mkdir(parents=True, exist_ok=True)
- except OSError as error:
- print(f"Could not create work_dir {work_dir}: {error}.")
-
-assert isdir(work_dir)
-assert isabs(work_dir)
-
-# Grab all the indexes
-# Only fetch the new ones, existing files won't be re-downloaded.
-_download_indexes(base_url)
-
-# Run Docker on the downloaded indexes.
-_docker_run("bbaldassari/maven-index-exporter")
-
-print("Export directory has the following files:")
-owd = getcwd()
-chdir(join(work_dir, "export"))
-myfile = None
-re_fld = re.compile(r".*\.fld$")
-for file in glob.glob("*.*"):
- print(" -", file, "size", getsize(file))
- if re_fld.match(file):
- myfile = file
-
-# Now copy the results to the desired location: publish_dir.
-if isfile(myfile):
- print("Found fld file:", myfile)
-else:
- print("Cannot find .fld file. Exiting")
- exit(4)
-
-publish_file = join(publish_dir, "export.fld")
-print(f"Copying files to {publish_file}..")
-try:
- copy2(myfile, publish_file)
-except OSError as error:
- print(f"Could not publish results in {publish_dir}: {error}.")
-
-now = datetime.datetime.now()
-print(f"Script finished on", now.strftime("%Y-%m-%d %H:%M:%S"))
+if __name__ == "__main__":
+ logging.basicConfig(level=logging.INFO)
+ main()
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Wed, Sep 17, 4:36 AM (8 h, 48 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3226349
Attached To
D7412: Adapt run_full_export according to swh cli conventions
Event Timeline
Log In to Comment