diff --git a/bin/ls-tarball-size b/bin/ls-tarball-size
new file mode 100755
index 0000000..df56eab
--- /dev/null
+++ b/bin/ls-tarball-size
@@ -0,0 +1,39 @@
+#!/usr/bin/env python3
+
+# Copyright (C) 2015 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import argparse
+import os
+
+from swh.loader.tar import file
+
+
+def parse_args():
+    """Parse the configuration from the cli.
+
+    Returns:
+        Namespace whose mirror_root_dir attribute is the directory (or
+        listing file) to list tarballs from.
+
+    """
+    cli = argparse.ArgumentParser(
+        description='Tarball listing tarballs size.')
+    # mandatory: without it root_dir would be None and nothing can be listed
+    cli.add_argument('--mirror-root-dir', '-m', required=True,
+                     help='path to the root dir.')
+
+    args = cli.parse_args()
+
+    return args
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    root_dir = args.mirror_root_dir
+
+    # one "<tarball path> <size in bytes>" line per archive found
+    for tarpath, _ in file.list_archives_from(root_dir):
+        print('%s %s' % (tarpath, os.path.getsize(tarpath)))
diff --git a/bin/swh-loader-tar-lister b/bin/swh-loader-tar-lister
index 634ae49..f200092 100755
--- a/bin/swh-loader-tar-lister
+++ b/bin/swh-loader-tar-lister
@@ -1,231 +1,171 @@
 #!/usr/bin/env python3
 
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import argparse
-import os
 import sys
 
 from swh.core import config
-from swh.loader.tar import tarball, build
-
-
-def list_archives_from_dir(path):
-    """Given a path to a directory, walk such directory and yield tuple of
-    tarpath, fname.
-
-    Args:
-        path: top level directory
-
-    Returns:
-        Generator of tuple tarpath, filename with tarpath a tarball.
-
-    """
-    for dirpath, dirnames, filenames in os.walk(path):
-        for fname in filenames:
-            tarpath = os.path.join(dirpath, fname)
-            if not os.path.exists(tarpath):
-                continue
-
-            if tarball.is_tarball(tarpath):
-                yield tarpath, fname
-
-
-def list_archives_from_file(mirror_file):
-    """Given a path to a file containing one tarball per line, yield a tuple of
-    tarpath, fname.
-
-    Args:
-        mirror_file: path to the file containing list of tarpath.
-
-    Returns:
-        Generator of tuple tarpath, filename with tarpath a tarball.
-
-    """
-    with open(mirror_file, 'r') as f:
-        for tarpath in f.readlines():
-            tarpath = tarpath.strip()
-            if not os.path.exists(tarpath):
-                print('WARN: %s does not exist. Skipped.' % tarpath)
-                continue
-
-            if tarball.is_tarball(tarpath):
-                yield tarpath, os.path.basename(tarpath)
-
-
-def list_archives_from(path):
-    """From path, list tuple of tarpath, fname.
-
-    Args:
-        path: top directory to list archives from or custom file format.
-
-    """
-    if os.path.isfile(path):
-        yield from list_archives_from_file(path)
-    elif os.path.isdir(path):
-        yield from list_archives_from_dir(path)
-    else:
-        raise ValueError(
-            'Input incorrect, %s must be a file or a directory.' % path)
+from swh.loader.tar import build, file
 
 
 def compute_message_from(app, conf, root_dir, tarpath, filename,
                          dry_run=False):
     """Compute and post the message to worker for the archive tarpath.
 
     Args:
         app: instance of the celery app
         conf: dictionary holding static metadata
         root_dir: root directory
         tarball: the archive's representation
         dry_run: will compute but not send messages
 
     Returns:
         None
 
     Raises:
         ValueError when release number computation error arise.
 
     """
     origin = build.compute_origin(conf['url_scheme'],
                                   conf['type'],
                                   root_dir,
                                   tarpath)
     revision = build.compute_revision()
     occurrences = [build.occurrence_with_mtime(GNU_AUTHORITY, tarpath),
                    build.occurrence_with_ctime(SWH_AUTHORITY, tarpath)]
     release = build.compute_release(filename, tarpath)
 
     if not dry_run:
         app.tasks['swh.loader.tar.tasks.LoadTarRepository'].delay(tarpath,
                                                                   origin,
                                                                   revision,
                                                                   release,
                                                                   occurrences)
 
 
 def produce_archive_messages_from(app, conf, path,
                                   mirror_file=None, dry_run=False):
     """From path, produce archive tarball messages to celery.
 
     Will print error message when some computation arise on archive
     and continue.
 
     Args:
         app: instance of the celery app
         conf: dictionary holding static metadata
         path: top directory to list archives from.
         mirror_file: a filtering file of tarballs to load
         dry_run: will compute but not send messages
 
     Returns:
         None
 
     Raises:
         None
 
     """
 
     LIMIT = conf['limit']
 
     count = 0
     path_source_tarballs = mirror_file if mirror_file else path
 
-    for tarpath, fname in list_archives_from(path_source_tarballs):
+    for tarpath, fname in file.list_archives_from(path_source_tarballs):
         count += 1
         try:
             compute_message_from(app, conf, path, tarpath, fname, dry_run)
         except ValueError:
             print('Problem with the following archive: %s' % tarpath)
 
         if LIMIT and count >= LIMIT:
             return count
 
     return count
 
 
 def load_config(conf_file):
     """Load the configuration from file.
 
     Args:
         conf_file: path to a configuration file with the following content:
 
         [main]
 
         # mirror's root directory holding tarballs to load into swh
         mirror_root_directory = /home/storage/space/mirrors/gnu.org/gnu/
 
         # origin setup's possible scheme url
         url_scheme = rsync://ftp.gnu.org/gnu/
 
         # origin type used for those tarballs
         type = ftp
 
         # For tryouts purposes (no limit if not specified)
         limit = 1
 
     Returns:
         dictionary of data present in the configuration file.
 
     """
     conf = config.read(conf_file, default_conf={'limit': ('int', None)})
     url_scheme = conf['url_scheme']
     mirror_dir = conf['mirror_root_directory']
 
     # remove trailing / in configuration (to ease ulterior computation)
     if url_scheme[-1] == '/':
         conf.update({
             'url_scheme': url_scheme[0:-1]
         })
 
     if mirror_dir[-1] == '/':
         conf.update({
             'mirror_root_directory': mirror_dir[0:-1]
         })
 
     return conf
 
 
 def parse_args():
     """Parse the configuration from the cli.
 
     """
     cli = argparse.ArgumentParser(
         description='Tarball producer of local fs tarballs.')
     cli.add_argument('--dry-run', '-n',
                      action='store_true',
                      help='Dry run (print repo only)')
     cli.add_argument('--config', '-c', help='configuration file path')
 
     args = cli.parse_args()
 
     return args
 
 
 if __name__ == '__main__':
     args = parse_args()
     config_file = args.config
     if not config_file:
         print('Missing configuration file option.')
         sys.exit(1)
 
     # instantiate celery app with its configuration
     from swh.core.worker import app
     from swh.loader.tar import tasks  # noqa
 
     conf = load_config(config_file)
 
     # state...
     SWH_AUTHORITY = conf['swh_authority']
     GNU_AUTHORITY = conf['gnu_authority']
 
     nb_tarballs = produce_archive_messages_from(
         app, conf,
         conf['mirror_root_directory'],
         conf.get('mirror_subset_archives'),
         args.dry_run)
 
     print('%s tarball(s) sent to worker.' % nb_tarballs)
diff --git a/swh/loader/tar/file.py b/swh/loader/tar/file.py
new file mode 100644
index 0000000..24ac56c
--- /dev/null
+++ b/swh/loader/tar/file.py
@@ -0,0 +1,83 @@
+# Copyright (C) 2015 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+
+from swh.loader.tar import tarball
+
+
+def list_archives_from_dir(path):
+    """Walk the directory path and yield a tuple of tarpath, fname for
+    each tarball found.
+
+    Args:
+        path: top level directory
+
+    Returns:
+        Generator of tuple tarpath, filename with tarpath a tarball.
+
+    """
+    for dirpath, _, filenames in os.walk(path):
+        for fname in filenames:
+            tarpath = os.path.join(dirpath, fname)
+            # skip entries that cannot be resolved (e.g. dangling symlinks)
+            if not os.path.exists(tarpath):
+                continue
+
+            if tarball.is_tarball(tarpath):
+                yield tarpath, fname
+
+
+def list_archives_from_file(mirror_file):
+    """Given a path to a file containing one tarball per line, yield a tuple of
+    tarpath, fname.
+
+    Blank lines are ignored. Paths which do not exist are reported on stdout
+    and skipped.
+
+    Args:
+        mirror_file: path to the file containing list of tarpath.
+
+    Returns:
+        Generator of tuple tarpath, filename with tarpath a tarball.
+
+    """
+    with open(mirror_file, 'r') as f:
+        # iterate lazily: no need to load the whole listing in memory
+        for line in f:
+            tarpath = line.strip()
+            if not tarpath:
+                continue
+
+            if not os.path.exists(tarpath):
+                print('WARN: %s does not exist. Skipped.' % tarpath)
+                continue
+
+            if tarball.is_tarball(tarpath):
+                yield tarpath, os.path.basename(tarpath)
+
+
+def list_archives_from(path):
+    """From path, list tuple of tarpath, fname.
+
+    Args:
+        path: top directory to list archives from, or file listing one
+            tarball path per line.
+
+    Returns:
+        Generator of tuple tarpath, filename with tarpath a tarball.
+
+    Raises:
+        ValueError (once iterated) when path is neither a file nor a
+        directory.
+
+    """
+    if os.path.isfile(path):
+        yield from list_archives_from_file(path)
+    elif os.path.isdir(path):
+        yield from list_archives_from_dir(path)
+    else:
+        raise ValueError(
+            'Input incorrect, %s must be a file or a directory.' % path)