# Copyright (C) 2021-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

"""Download the index files of a maven repository and publish their export.

The script downloads the index files found under ``<base-url>/.index/`` into a
working directory, runs the maven index exporter docker image on that directory
and copies the ``.fld`` file found in ``<work-dir>/export`` to the publish
directory.
"""

import datetime
import glob
import logging
import re
import sys
from os import chdir, makedirs, replace
from os.path import basename, getsize, isabs, isdir, isfile, join
from pathlib import Path
from shutil import copy2
from urllib.parse import urljoin

import click
import docker
import requests
from docker.client import DockerClient
from docker.models.images import Image

logger = logging.getLogger(__name__)


MAVEN_INDEX_NAME = "nexus-maven-repository-index"
MAVEN_INDEX_ARCHIVE = f"{MAVEN_INDEX_NAME}.gz"

# Matches the lines of the properties file that declare an incremental index;
# the captured group is the number used in the incremental archive file name.
_INCREMENTAL_RE = re.compile(r"^nexus\.index\.incremental-[0-9]+=([0-9]+)")

# Size of the chunks read from the network when streaming an archive to disk.
_DOWNLOAD_CHUNK_SIZE = 1024 * 1024


def _docker_image_get(
    client: DockerClient, docker_image_name: str, docker_image_update: bool = False
) -> Image:
    """Return the docker image named ``docker_image_name``.

    Args:
        client: docker client used to list and pull images.
        docker_image_name: name of the image to look up.
        docker_image_update: when true, always pull the image instead of
            looking for a local copy first.

    Returns:
        The pulled image, or the first local image matching the name.
    """
    if docker_image_update:
        return client.images.pull(repository=docker_image_name)

    image = next(iter(client.images.list(name=docker_image_name)), None)
    if image is None:
        logger.info("Docker: Could not find %s. Pulling it.", docker_image_name)
        return client.images.pull(repository=docker_image_name)

    logger.info(
        "Docker: Found image %s locally, ID is %s.", image, image.attrs["Id"]
    )
    return image


def _extract_indexes(
    work_dir: str, docker_image_name: str, docker_image_update: bool = False
) -> None:
    """Extract indexes out of the working directory.

    As an implementation detail, this starts the maven indexer exporter
    container using a docker image. This will use the local image if present,
    otherwise pull it first.

    No ``command`` is passed to the container: the image's default command
    (``CMD ["sh", "/opt/extract_indexes.sh"]``, see ../docker/Dockerfile) runs
    the extraction script.

    Args:
        work_dir: host directory mounted read-write as ``/work`` in the container.
        docker_image_name: name of the docker image to run.
        docker_image_update: when true, pull the image before running it.
    """
    # Initialise the docker client.
    client = docker.from_env()
    image = _docker_image_get(client, docker_image_name, docker_image_update)

    ret = client.containers.run(
        image,
        tty=True,
        volumes={work_dir: {"bind": "/work", "mode": "rw"}},
    )

    logger.info("Docker log:\n%s", ret.decode())


def _download_file(url: str, dest_path: str) -> None:
    """Stream ``url`` to ``dest_path``.

    The payload is first written to ``<dest_path>.part`` and renamed once the
    transfer is complete: callers skip files that already exist, so a truncated
    file left behind by an interrupted download must never carry the final name.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    partial_path = f"{dest_path}.part"
    with requests.get(url, stream=True) as response:
        # Without this check an HTTP error page would be saved as the index.
        response.raise_for_status()
        with open(partial_path, "wb") as partial_file:
            for chunk in response.iter_content(chunk_size=_DOWNLOAD_CHUNK_SIZE):
                partial_file.write(chunk)
    replace(partial_path, dest_path)


def _download_if_missing(index_url: str, work_dir: str, file_name: str) -> None:
    """Download ``file_name`` from ``index_url`` unless ``work_dir`` already has it."""
    file_path = join(work_dir, file_name)
    if isfile(file_path):
        logger.info(" - File %s exists, skipping download.", basename(file_path))
        return

    file_url = urljoin(index_url, file_name)
    logger.info(
        " - File %s doesn't exist. Downloading file from %s.",
        basename(file_path),
        file_url,
    )
    _download_file(file_url, file_path)


def _download_indexes(work_dir: str, instance_url: str) -> None:
    """Download all required indexes from the .index/ directory of the instance.

    The properties file is always downloaded again. The incremental archives it
    lists and the main index archive are only downloaded when they are not
    already present in ``work_dir``.

    Args:
        work_dir: directory the files are written to.
        instance_url: base url of the maven repository instance.

    Raises:
        requests.HTTPError: if one of the downloads gets an error status.
    """
    logger.info("Downloading all required indexes")

    index_url = urljoin(instance_url, ".index/")

    properties_name = f"{MAVEN_INDEX_NAME}.properties"
    properties_file = join(work_dir, properties_name)
    properties_url = urljoin(index_url, properties_name)

    # Retrieve properties file.
    logger.info(" - Downloading %s.", properties_file)
    response = requests.get(properties_url)
    response.raise_for_status()
    content = response.content.decode()
    # Written with the same encoding bytes.decode() used above (utf-8).
    with open(properties_file, "w", encoding="utf-8") as properties_fd:
        properties_fd.write(content)

    # Retrieve the incremental gz files listed in the properties file.
    for line in content.splitlines():
        incremental = _INCREMENTAL_RE.match(line)
        if incremental is not None:
            _download_if_missing(
                index_url, work_dir, f"{MAVEN_INDEX_NAME}.{incremental.group(1)}.gz"
            )

    # Retrieve main index file.
    _download_if_missing(index_url, work_dir, MAVEN_INDEX_ARCHIVE)


@click.command()
@click.option(
    "--base-url",
    required=True,
    help=(
        "Base url of the maven repository instance. \n"
        "Example: https://repo.maven.apache.org/maven2/"
    ),
)
@click.option(
    "--work-dir",
    help="Absolute path to the temp directory.",
    default="/tmp/maven-index-exporter/",
)
@click.option(
    "--publish-dir",
    help="Absolute path to the final directory.",
    default="/tmp/maven-index-exporter/publish/",
)
@click.option(
    "--docker-image-name", help="Docker image", default="maven-index-exporter"
)
@click.option(
    "--docker-image-update",
    is_flag=True,
    help="Trigger a docker image update.",
    default=False,
)
def main(base_url, work_dir, publish_dir, docker_image_name, docker_image_update):
    """Download the maven indexes of BASE_URL, export them and publish the result.

    Exits with status 4 when no .fld file is found in the export directory.
    """
    now = datetime.datetime.now()
    logger.info("Script: run_full_export")
    logger.info("Timestamp: %s", now.strftime("%Y-%m-%d %H:%M:%S"))
    logger.info("* URL: %s", base_url)
    logger.info("* Working directory: %s", work_dir)
    logger.info("* Publish directory: %s", publish_dir)

    # Validate explicitly rather than with assert, which is stripped under -O.
    if not isabs(work_dir):
        raise click.BadParameter(
            f"{work_dir!r} is not an absolute path.", param_hint="--work-dir"
        )

    # Check work_dir and create it if needed.
    if isdir(work_dir):
        logger.info("Work_Dir %s exists. Reusing it.", work_dir)
    else:
        logger.info("Cannot find work_dir %s. Creating it.", work_dir)
        try:
            Path(work_dir).mkdir(parents=True, exist_ok=True)
        except OSError as error:
            raise click.ClickException(
                f"Could not create work_dir {work_dir}: {error}."
            ) from error

    # Grab all the indexes
    # Only fetch the new ones, existing files won't be re-downloaded.
    _download_indexes(work_dir, base_url)

    # Run Docker on the downloaded indexes.
    _extract_indexes(
        work_dir, docker_image_name, docker_image_update=docker_image_update
    )

    logger.info("Export directory has the following files:")
    export_dir = join(work_dir, "export")
    makedirs(export_dir, exist_ok=True)
    # From here on, relative paths are resolved against the export directory.
    chdir(export_dir)
    fld_file = None
    for file_ in glob.glob("*.*"):
        logger.info(" - %s size %s", file_, getsize(file_))
        if file_.endswith(".fld"):
            fld_file = file_

    # fld_file stays None when nothing matched; isfile(None) would raise a
    # TypeError instead of reaching the intended exit below.
    if fld_file is None or not isfile(fld_file):
        logger.error("Cannot find .fld file. Exiting")
        sys.exit(4)
    logger.info("Found fld file: %s", fld_file)

    # Now copy the results to the desired location: publish_dir.
    makedirs(publish_dir, exist_ok=True)
    publish_file = join(publish_dir, "export.fld")
    logger.info("Copying files to %s.", publish_file)
    try:
        copy2(fld_file, publish_file)
    except OSError as error:
        logger.error("Could not publish results in %s: %s.", publish_dir, error)

    now = datetime.datetime.now()
    logger.info("Script finished on %s", now.strftime("%Y-%m-%d %H:%M:%S"))


###############################################
# Start execution
###############################################

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    main()