# Copyright (C) 2021-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import datetime
import glob
import logging
import re
import sys
from os import chdir, makedirs
from os.path import getsize, isabs, isdir, isfile, join
from pathlib import Path
from shutil import copy2
from urllib.parse import urljoin

import click
import docker
import requests
from docker.client import DockerClient
from docker.models.images import Image

logger = logging.getLogger(__name__)


def _docker_image_get(
    client: DockerClient, docker_image_name: str, docker_image_update: bool = False
) -> Image:
    """Return the docker image to run, pulling it when required.

    Args:
        client: Docker client used to list and pull images.
        docker_image_name: Name of the image to look up or pull.
        docker_image_update: When true, always pull the image, even if a copy
            is already available locally.

    Returns:
        The image returned by the pull, or the first image the client lists
        under ``docker_image_name``.
    """
    if docker_image_update:
        return client.images.pull(repository=docker_image_name)

    image = next(iter(client.images.list(name=docker_image_name)), None)
    if image is None:
        logger.info("Docker: Could not find %s. Pulling it.", docker_image_name)
        return client.images.pull(repository=docker_image_name)

    logger.info("Docker: Found image %s locally, ID is %s.", image, image.attrs["Id"])
    return image


def _extract_indexes(
    work_dir: str, docker_image_name: str, docker_image_update: bool = False
) -> None:
    """Extract indexes out of the working directory.

    As an implementation details, this starts the maven indexer exporter container using
    a docker image. This will use the local image if present, otherwise pull it from
    docker hub first.

    Args:
        work_dir: Host directory bind-mounted read-write on ``/work`` in the
            container.
        docker_image_name: Name of the docker image to run.
        docker_image_update: When true, pull the image before running it.
    """
    # Initialise the docker client.
    client = docker.from_env()
    image = _docker_image_get(client, docker_image_name, docker_image_update)
    ret = client.containers.run(
        image,
        tty=True,
        command=["sh", "/opt/extract_indexes.sh"],
        volumes={work_dir: {"bind": "/work", "mode": "rw"}},
    )

    logger.info("Docker log:\n%s", ret.decode())


def _fetch(url: str) -> bytes:
    """Return the body served at ``url``, raising on an HTTP error status."""
    response = requests.get(url)
    # Without this check an error page (404, 500, ...) would be saved to disk
    # as if it were an index, and later runs would skip it as "already there".
    response.raise_for_status()
    return response.content


def _download_indexes(work_dir: str, instance_url: str) -> None:
    """Download all required indexes from the .index/ directory
    of the specified instance.

    The properties file is always downloaded again. The incremental index
    files it lists and the main index file are only downloaded when they are
    not already present in ``work_dir``.

    Args:
        work_dir: Directory the files are written to.
        instance_url: Base url of the maven repository instance.

    Raises:
        requests.HTTPError: If the remote instance answers with an error status.
    """
    logger.info("Downloading all required indexes")

    index_url = urljoin(instance_url, ".index/")

    properties_name = "nexus-maven-repository-index.properties"
    properties_file = join(work_dir, properties_name)
    properties_url = urljoin(index_url, properties_name)

    # Retrieve properties file.
    logger.info("  - Downloading %s.", properties_file)
    content = _fetch(properties_url).decode()
    # The payload was decoded as UTF-8 above; write it back with the same
    # encoding so the file on disk does not depend on the locale.
    with open(properties_file, "w", encoding="utf-8") as properties:
        properties.write(content)

    diff_re = re.compile(r"^nexus\.index\.incremental-[0-9]+=([0-9]+)")
    for line in content.split("\n"):
        diff_group = diff_re.match(line)
        if diff_group is None:
            continue

        ind_name = f"nexus-maven-repository-index.{diff_group.group(1)}.gz"
        ind_path = join(work_dir, ind_name)
        ind_url = urljoin(index_url, ind_name)
        if isfile(ind_path):
            logger.info("  - File %s exists, skipping download.", ind_path)
        else:
            logger.info(
                "  - File %s doesn't exist. Downloading file from %s.",
                ind_path,
                ind_url,
            )
            # Retrieve incremental gz file
            contentb = _fetch(ind_url)
            with open(ind_path, "wb") as index_file:
                index_file.write(contentb)

    # Retrieve main index file.
    ind_path = join(work_dir, "nexus-maven-repository-index.gz")
    ind_url = urljoin(index_url, "nexus-maven-repository-index.gz")
    if isfile(ind_path):
        logger.info("  - File %s exists, skipping download.", ind_path)
    else:
        logger.info(
            "  - File %s doesn't exist. Downloading file from %s", ind_path, ind_url
        )
        contentb = _fetch(ind_url)
        with open(ind_path, "wb") as index_file:
            index_file.write(contentb)


@click.command()
@click.option(
    "--base-url",
    required=True,
    help=(
        "Base url of the maven repository instance. \n"
        "Example: https://repo.maven.apache.org/maven2/"
    ),
)
@click.option(
    "--work-dir",
    help="Absolute path to the temp directory.",
    default="/tmp/maven-index-exporter/",
)
@click.option(
    "--publish-dir",
    help="Absolute path to the final directory.",
    default="/tmp/maven-index-exporter/publish/",
)
@click.option(
    "--docker-image-name", help="Docker image", default="maven-index-exporter"
)
@click.option(
    "--docker-image-update",
    is_flag=True,
    help="Trigger a docker image update.",
    default=False,
)
def main(
    base_url: str,
    work_dir: str,
    publish_dir: str,
    docker_image_name: str,
    docker_image_update: bool,
) -> None:
    """Download the maven indexes, run the exporter and publish the result."""
    now = datetime.datetime.now()
    logger.info("Script: run_full_export")
    logger.info("Timestamp: %s", now.strftime("%Y-%m-%d %H:%M:%S"))
    logger.info("* URL: %s", base_url)
    logger.info("* Working directory: %s", work_dir)
    logger.info("* Publish directory: %s", publish_dir)

    # Validate explicitly rather than with ``assert``, which is stripped when
    # python runs with -O. The check comes first so that a relative path does
    # not get created before being rejected.
    if not isabs(work_dir):
        raise click.ClickException(f"work_dir {work_dir} must be an absolute path.")

    # Check work_dir and create it if needed.
    if isdir(work_dir):
        logger.info("Work_Dir %s exists. Reusing it.", work_dir)
    else:
        logger.info("Cannot find work_dir %s. Creating it.", work_dir)
        try:
            Path(work_dir).mkdir(parents=True, exist_ok=True)
        except OSError as error:
            raise click.ClickException(
                f"Could not create work_dir {work_dir}: {error}."
            ) from error

    # Grab all the indexes
    # Only fetch the new ones, existing files won't be re-downloaded.
    _download_indexes(work_dir, base_url)

    # Run Docker on the downloaded indexes.
    _extract_indexes(
        work_dir, docker_image_name, docker_image_update=docker_image_update
    )

    logger.info("Export directory has the following files:")
    export_dir = join(work_dir, "export")
    makedirs(export_dir, exist_ok=True)
    # The file names handled below are relative to the export directory.
    chdir(export_dir)
    myfile = None
    re_fld = re.compile(r".*\.fld$")
    for file_ in glob.glob("*.*"):
        logger.info("  - %s size %s", file_, getsize(file_))
        if re_fld.match(file_):
            myfile = file_

    # Now copy the results to the desired location: publish_dir.
    # ``myfile`` is still None when nothing matched; isfile(None) would raise
    # a TypeError instead of reaching the intended exit below.
    if myfile is not None and isfile(myfile):
        logger.info("Found fld file: %s", myfile)
    else:
        logger.info("Cannot find .fld file. Exiting")
        sys.exit(4)

    makedirs(publish_dir, exist_ok=True)
    publish_file = join(publish_dir, "export.fld")
    logger.info("Copying files to %s.", publish_file)
    try:
        copy2(myfile, publish_file)
    except OSError as error:
        logger.info("Could not publish results in %s: %s.", publish_dir, error)

    now = datetime.datetime.now()
    logger.info("Script finished on %s", now.strftime("%Y-%m-%d %H:%M:%S"))


###############################################
# Start execution
###############################################

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    main()