Patch summary (commentary before the first "diff --git" header):

  extract_indexes.sh
    - Rewords the cleanup log messages and drops the "ls -lh $WORKDIR/"
      listing that followed the removal of the unneeded export files.

  resources/run_full_export.py
    - Adds imports for glob, sys, datetime, os, os.path, pathlib,
      urllib.parse and shutil.copy2.
    - Expects three command-line arguments (url, work_dir, publish_dir).
    - Replaces %-style print arguments with f-strings.
    - _download_indexes() now uses its instance_url parameter instead of
      the module-level base_url.
    - Fixes the workdir -> work_dir typo and drops the sizeof() wrapper.
    - Copies the .fld export to publish_dir, reporting OSError instead of
      crashing.

  Review amendments applied on top of the submitted patch:
    - Added the missing "f" prefix to three messages that contain
      {placeholders} ("Docker: Found image ...", "Work_Dir ... exists",
      "Cannot find work_dir ...").
    - Guarded isfile(myfile) against myfile being None, so that a missing
      .fld file reaches the "Cannot find .fld file" branch instead of
      raising TypeError.
    - Dropped the "f" prefix from two strings without placeholders.

  NOTE(review): this patch was recovered from whitespace-flattened text.
  Blank context lines, exact indentation, runs of spaces inside string
  literals and docstring line breaks could not be recovered; the hunk
  headers are kept as given. Regenerate the patch against the real tree
  before applying it.

diff --git a/extract_indexes.sh b/extract_indexes.sh
index a12d907..2d60e88 100644
--- a/extract_indexes.sh
+++ b/extract_indexes.sh
@@ -1,103 +1,101 @@
 WORKDIR=/work
 FILE_IN=$WORKDIR/nexus-maven-repository-index.gz
 localtime=$(date +"%Y-%m-%d %H:%M:%S")
 echo "Docker Script started on $localtime."
 echo "# Checks.."
 echo "* Content of /opt:"
 ls -l /opt
 echo "* Content of $WORKDIR:"
 ls -l $WORKDIR
 echo "* Will read files from [$FILE_IN]."
 if [ ! -r "$FILE_IN" ]; then
     echo "Cannot find file [$FILE_IN]."
     echo "Need an index file to work on. Exiting 4."
     exit 4
 else
     echo "* Found file [$FILE_IN]."
 fi
 indexer=$(find /opt/ -name "indexer-cli-*.jar")
 if [ "$indexer" = "" ]; then
     echo "Cannot find indexer. Exiting 6."
     exit 6
 else
     echo "* Found indexer [$indexer]."
 fi
 clue=$(find /opt/ -name "clue-*.jar")
 if [ "$clue" = "" ]; then
     echo "Cannot find clue. Exiting 8."
     exit 8
 else
     echo "* Found clue [$clue]."
 fi
 echo "* Java version:."
 java -version
 echo "#############################"
 if [ -d $WORKDIR/indexes ]; then
     echo "Found $WORKDIR/indexes, skipping index generation."
     du -sh $WORKDIR/indexes/
 else
     echo "Unpacking [$FILE_IN] to $WORKDIR/indexes"
     java --illegal-access=permit -jar $indexer --unpack $FILE_IN --destination $WORKDIR/indexes/ --type full 2>&1 | grep -v WARNING
 fi
 localtime=$(date +"%Y-%m-%d %H:%M:%S")
 echo "Unpacking finished on $localtime."
 echo "#############################"
 if [ -d $WORKDIR/export ]; then
     echo "Found $WORKDIR/export, skipping index export."
     ls -lh $WORKDIR/export/
 else
     echo "Exporting indexes $WORKDIR/indexes to $WORKDIR/export"
     java --illegal-access=permit -jar $clue $WORKDIR/indexes/ export $WORKDIR/export/ text 2>&1 | grep -v WARNING
 fi
 localtime=$(date +"%Y-%m-%d %H:%M:%S")
 echo "Exporting finished on $localtime."
 echo "#############################"
 echo "Cleaning useless files."
 echo "Size before cleaning:"
 du -sh $WORKDIR/*
 # We might want or not to delete the indexes
 # Remember that when they're not present, everything
 # gets recomputed every run..
 #echo "* Removing indexes."
 #rm -rf $WORKDIR/indexes/
 # If files others than the .fld one are required, please comment
 # the following lines.
 echo "* Removing useless exports."
+echo " Keeping only fld text extract."
 rm -f $WORKDIR/export/*.inf
 rm -f $WORKDIR/export/*.len
 rm -f $WORKDIR/export/*.pst
 rm -f $WORKDIR/export/*.si
 rm -f $WORKDIR/export/segments*
-echo "Keeping only fld text extract:"
-ls -lh $WORKDIR/
-
-echo "Size after cleaning:"
+echo " Size after cleaning:"
 du -sh $WORKDIR/*
-echo "Make files modifiable by the end-user."
+echo "* Make files modifiable by the end-user."
 chmod -R 777 $WORKDIR/export/
 chmod -R 777 $WORKDIR/indexes/
 localtime=$(date +"%Y-%m-%d %H:%M:%S")
 echo "Docker Script execution finished on $localtime."
diff --git a/resources/run_full_export.py b/resources/run_full_export.py
index 61cdb90..0f40fdb 100644
--- a/resources/run_full_export.py
+++ b/resources/run_full_export.py
@@ -1,150 +1,166 @@
 # Copyright (C) 2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 import docker
 import requests
 import re
+import glob
+import sys
+import datetime
+from os import getcwd, chdir
+from os.path import getsize, isdir, isfile, isabs, join
+from pathlib import Path
+from urllib.parse import urljoin
+from shutil import copy2
 # Check paramaters
-if len(sys.argv) != 3:
+if len(sys.argv) != 4:
     print("Usage:", sys.argv[0], "url work_dir publish_dir")
     print(" - url is the base url of the maven repository instance.")
     print(" Example: https://repo.maven.apache.org/maven2/")
     print(" - work_dir must be an absolute path to the temp directory.")
     print(" Example: /tmp/maven-index-exporter/")
     print(" - publish_dir must be an absolute path to the final directory.")
     print(" Example: /var/www/html/maven_index_exporter/")
     exit()
 base_url = sys.argv[1]
 work_dir = sys.argv[2]
 publish_dir = sys.argv[3]
 def _docker_run(docker_image: str):
     """
     Start the container for the maven index export, using the image
     'bbaldassari/maven-index-exporter'. If needed the image is pulled from
     docker hub. If it already exists, simply use the local one.
     """
     # Initialise the docker client.
     client = docker.from_env()
     myimage = None
     for image in client.images.list(name=docker_image):
-        myimage = image
-        break
-
+        myimage = image
+        break
+
     if myimage is None:
-        print("Docker: Could not find %s. Pulling it.", docker_image)
+        print(f"Docker: Could not find {docker_image}. Pulling it.")
         myimage = client.images.pull(repository=docker_image)
     else:
-        print(
-            "Docker: Found image %s locally, ID is %s.",
-            myimage,
-            myimage.attrs["Id"],
-        )
-
+        print(f"Docker: Found image {myimage} locally, ID is {myimage.attrs['Id']}.")
+
     ret = client.containers.run(
         myimage,
         tty=True,
         command=["sh", "/opt/extract_indexes.sh", "/work/"],
         volumes={work_dir: {"bind": "/work", "mode": "rw"}},
     )
-    print("Docker log:\n%s", ret.decode())
+    print(f"Docker log:\n{ret.decode()}")
 def _download_indexes(instance_url: str):
     """
     Download all required indexes from the .index/ directory
     of the specified instance.
     """
-    index_url = urljoin(base_url, ".index/")
+    print("# Downloading all required indexes")

-    properties_file = join(work_dir, "nexus-maven-repository-index.properties")
-    properties_url = urljoin(index_url, "nexus-maven-repository-index.properties")
+    index_url = urljoin(instance_url, ".index/")
+
+    properties_name = "nexus-maven-repository-index.properties"
+    properties_file = join(work_dir, properties_name)
+    properties_url = urljoin(index_url, properties_name)
     # Retrieve properties file.
+    print(f" - Downloading {properties_file}.")
     content = requests.get(properties_url).content.decode()
     open(properties_file, "w").write(content)
     diff_re = re.compile("^nexus.index.incremental-[0-9]+=([0-9]+)")
     for line in content.split("\n"):
         diff_group = diff_re.match(line)
         if diff_group is not None:
             ind_name = "nexus-maven-repository-index." + diff_group.group(1) + ".gz"
             ind_path = join(work_dir, ind_name)
             ind_url = urljoin(index_url, ind_name)
             if isfile(ind_path):
-                print(f"File {ind_path} exists, skipping download.")
+                print(f" - File {ind_path} exists, skipping download.")
             else:
                 print(
                     (
-                        f"File {ind_path} doesn't exist. "
-                        f"Downloading file from {ind_url}"
+                        f" - File {ind_path} doesn't exist. "
+                        f"Downloading file from {ind_url}."
                     )
                 )
                 # Retrieve incremental gz file
                 contentb = requests.get(ind_url).content
                 open(ind_path, "wb").write(contentb)
     # Retrieve main index file.
     ind_path = join(work_dir, "nexus-maven-repository-index.gz")
     ind_url = urljoin(index_url, "nexus-maven-repository-index.gz")
     if isfile(ind_path):
-        print(f"File {ind_path} exists, skipping download.")
+        print(f" - File {ind_path} exists, skipping download.")
     else:
-        print(f"File {ind_path} doesn't exist. Downloading file from {ind_url}")
+        print(f" - File {ind_path} doesn't exist. Downloading file from {ind_url}")
         contentb = requests.get(ind_url).content
         open(ind_path, "wb").write(contentb)
 ###############################################
 # Start execution
 ###############################################
 now = datetime.datetime.now()
 print(f"Script: {sys.argv[0]}")
 print("Timestamp:", now.strftime("%Y-%m-%d %H:%M:%S"))
 print(f"* URL: {base_url}")
 print(f"* Work_Dir: {work_dir}")
 # Check work_dir and create it if needed.
 if isdir(work_dir):
-    print("WORKD_DIR exists. Reusing it.")
+    print(f"Work_Dir {work_dir} exists. Reusing it.")
 else:
     try:
+        print(f"Cannot find work_dir {work_dir}. Creating it.")
         Path(work_dir).mkdir(parents=True, exist_ok=True)
     except OSError as error:
-        print(f"Could not create WORK_DIR {work_dir}: {error}.")
+        print(f"Could not create work_dir {work_dir}: {error}.")
 assert isdir(work_dir)
 assert isabs(work_dir)
+# Grab all the indexes
+# Only fetch the new ones, existing files won't be re-downloaded.
 _download_indexes(base_url)
+# Run Docker on the downloaded indexes.
 _docker_run("bbaldassari/maven-index-exporter")
-
-now = datetime.datetime.now()
-print("Timestamp:", now.strftime("%Y-%m-%d %H:%M:%S"))
-
 print("Export directory has the following files:")
-
 owd = getcwd()
-chdir(join(workdir, "export"))
+chdir(join(work_dir, "export"))
 myfile = None
 re_fld = re.compile(r".*\.fld$")
 for file in glob.glob("*.*"):
-    print(" -", file, "size", sizeof(getsize(file)))
+    print(" -", file, "size", getsize(file))
     if (re_fld.match(file)):
         myfile = file
+
+# Now copy the results to the desired location: publish_dir.
-if isfile(myfile):
+if myfile is not None and isfile(myfile):
     print("Found fld file:", myfile)
 else:
     print("Cannot find .fld file. Exiting")
     exit(4)
-shutil.copy2(myfile, publish_dir)
+print(f"Copying files to {publish_dir}..")
+try:
+    copy2(myfile, publish_dir)
+except OSError as error:
+    print(f"Could not publish results in {publish_dir}: {error}.")
+now = datetime.datetime.now()
+print("Script finished on", now.strftime("%Y-%m-%d %H:%M:%S"))
+