Rename locals to snake_case and fix the unittest class-fixture hook name.

* bin/swh-loader-tar-producer: LIMIT/BLOCK -> limit/block in
  produce_archive_messages_from().
* bin/swh-loader-tar-retrieve-tarball: errorMsgs -> errors in check_args();
  build_tree() now calls os.symlink(linkdest, path) so that the link is
  created at `path` (inside the temporary tree) and points to `linkdest`.
* swh/loader/tar/tests/test_utils.py: setupClass -> setUpClass, chaining to
  super().setUpClass().

diff --git a/bin/swh-loader-tar-producer b/bin/swh-loader-tar-producer
index 512b7fa..5f944c0 100755
--- a/bin/swh-loader-tar-producer
+++ b/bin/swh-loader-tar-producer
@@ -1,174 +1,174 @@
 #!/usr/bin/env python3
 
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import argparse
 import sys
 
 from swh.core import config
 from swh.loader.tar import build, file
 
 
 def compute_message_from(app, conf, root_dir, tarpath, filename,
                          dry_run=False):
     """Compute and post the message to worker for the archive tarpath.
 
     Args:
         app: instance of the celery app
         conf: dictionary holding static metadata
         root_dir: root directory
         tarball: the archive's representation
         dry_run: will compute but not send messages
 
     Returns:
         None
 
     Raises:
         ValueError when release number computation error arise.
 
     """
     origin = build.compute_origin(conf['url_scheme'],
                                   conf['type'],
                                   root_dir,
                                   tarpath)
     revision = build.compute_revision(tarpath)
     occurrences = [build.occurrence_with_mtime(GNU_AUTHORITY, tarpath),
                    build.occurrence_with_ctime(SWH_AUTHORITY, tarpath)]
     release = build.compute_release(filename, tarpath)
 
     if not dry_run:
         app.tasks['swh.loader.tar.tasks.LoadTarRepository'].delay(tarpath,
                                                                   origin,
                                                                   revision,
                                                                   release,
                                                                   occurrences)
 
 
 def produce_archive_messages_from(app, conf, path,
                                   mirror_file=None,
                                   dry_run=False):
     """From path, produce archive tarball messages to celery.
 
     Will print error message when some computation arise on archive
     and continue.
 
     Args:
         app: instance of the celery app
         conf: dictionary holding static metadata
         path: top directory to list archives from.
         mirror_file: a filtering file of tarballs to load
         dry_run: will compute but not send messages
 
     Returns:
         None
 
     Raises:
         None
 
     """
-    LIMIT = conf['limit']
-    BLOCK = int(conf['block_messages'])
+    limit = conf['limit']
+    block = int(conf['block_messages'])
 
     count = 0
     path_source_tarballs = mirror_file if mirror_file else path
 
     for tarpath, fname in file.random_archives_from(path_source_tarballs,
-                                                    BLOCK,
-                                                    LIMIT):
+                                                    block,
+                                                    limit):
         try:
             compute_message_from(app, conf, path, tarpath, fname, dry_run)
             count += 1
         except ValueError:
             print('Problem with the following archive: %s' % tarpath)
 
     return count
 
 
 def load_config(conf_file):
     """Load the configuration from file.
 
     Args:
         conf_file: path to a configuration file with the following content:
 
         [main]
 
         # mirror's root directory holding tarballs to load into swh
         mirror_root_directory = /home/storage/space/mirrors/gnu.org/gnu/
 
         # origin setup's possible scheme url
         url_scheme = rsync://ftp.gnu.org/gnu/
 
         # origin type used for those tarballs
         type = ftp
 
         # For tryouts purposes (no limit if not specified)
         limit = 1
 
     Returns:
         dictionary of data present in the configuration file.
 
     """
     conf = config.read(conf_file, default_conf={'limit': ('int', None)})
     url_scheme = conf['url_scheme']
     mirror_dir = conf['mirror_root_directory']
 
     # remove trailing / in configuration (to ease ulterior computation)
     if url_scheme[-1] == '/':
         conf.update({
             'url_scheme': url_scheme[0:-1]
         })
 
     if mirror_dir[-1] == '/':
         conf.update({
             'mirror_root_directory': mirror_dir[0:-1]
         })
 
     return conf
 
 
 def parse_args():
     """Parse the configuration from the cli.
 
     """
     cli = argparse.ArgumentParser(
         description='Tarball producer of local fs tarballs.')
     cli.add_argument('--dry-run', '-n',
                      action='store_true',
                      help='Dry run (print repo only)')
     cli.add_argument('--config', '-c',
                      help='configuration file path')
 
     args = cli.parse_args()
 
     return args
 
 
 if __name__ == '__main__':
     args = parse_args()
 
     config_file = args.config
     if not config_file:
         print('Missing configuration file option.')
         sys.exit(1)
 
     # instantiate celery app with its configuration
     from swh.core.worker import app
     from swh.loader.tar import tasks  # noqa
 
     conf = load_config(config_file)
 
     # state...
     SWH_AUTHORITY = conf['swh_authority']
     GNU_AUTHORITY = conf['gnu_authority']
 
     nb_tarballs = produce_archive_messages_from(
         app,
         conf,
         conf['mirror_root_directory'],
         conf.get('mirror_subset_archives'),
         args.dry_run)
 
     print('%s tarball(s) sent to worker.' % nb_tarballs)
diff --git a/bin/swh-loader-tar-retrieve-tarball b/bin/swh-loader-tar-retrieve-tarball
index 9778729..0ef0a12 100755
--- a/bin/swh-loader-tar-retrieve-tarball
+++ b/bin/swh-loader-tar-retrieve-tarball
@@ -1,190 +1,190 @@
 #!/usr/bin/env python3
 
 # NOT FOR PRODUCTION
 # - use swh storage api
 # - does not deal with missing contents yet so the tarball could be uncomplete
 
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import argparse
 import collections
 import itertools
 import os
 import shutil
 import sys
 import tempfile
 
 from swh.core import hashutil, config
 from swh.loader.tar import tarball, utils
 
 
 def escape_hash(sha1):
     """Escape an hexa sha1 to a ready queryable sha1."""
     if isinstance(sha1, bytes):
         sha1 = hashutil.hash_to_hex(sha1)
     return '\\x%s' % sha1
 
 
 def get_revision(revision_id):
     """Return the directory sha1 the revision with id revision_id points to.
 
     """
     for revision in storage.revision_get([revision_id]):
         if 'metadata' in revision:
             meta = revision['metadata']
             artifact = meta['original_artifact'][0]
             revision['archive_name'] = artifact['name']
             revision['archive_type'] = artifact['archive_type']
         return revision
 
 
 CONTENTS_BLOCK_SIZE = 10000
 
 
 def _directory_ls_contents(ls_contents):
     # Split in iterable blocks of size CONTENTS_BLOCK_SIZE
     # filled with empty values if need for the last block
     blocks_contents = utils.grouper(ls_contents, CONTENTS_BLOCK_SIZE,
                                     fillvalue=None)
 
     for block_contents in blocks_contents:
         full_contents = []
         content_by_sha = collections.defaultdict(list)
 
         # iter over contents (beware the last block can contain empty ones)
         for content in block_contents:
             if not content or content['status'] != 'visible':
                 continue
 
             full_contents.append(content['sha1'])
             content_by_sha[content['sha1']].append(content)
 
         for c in storage.content_get(full_contents):
             for content in content_by_sha[c['sha1']]:
                 content['data'] = c['data']
                 yield content
 
 
 def directory_ls_with_content(directory_id, recursive=True):
     """List directories with their data when content targeted is a file.
 
     """
     ls_dirs, ls_contents = itertools.tee(
         storage.directory_get(directory_id, recursive=recursive))
 
     yield from itertools.chain(
         (e for e in ls_dirs if e['type'] == 'dir'),
         _directory_ls_contents((e for e in ls_contents if e['type'] != 'dir')))
 
 
 def build_archive_from_revision(revision_id, archive_type=None,
                                 directory_dest='.'):
     def mkdir(path):
         os.makedirs(path, exist_ok=True)
         os.chmod(path, 0o755)
 
     def build_tree():
         # build fs structure
         tmpdir = tempfile.mkdtemp(suffix='create-tarball',
                                   prefix='swh.loader.tar-',
                                   dir='/tmp')
 
         for entry in directory_ls_with_content(directory_id, recursive=True):
             name = entry['name'].decode('utf-8')
             perms = entry['perms']
             path = os.path.join(tmpdir, name)
 
             if perms == 40000:  # dir
                 mkdir(path)
             else:
                 dirpath = os.path.dirname(path)
                 mkdir(dirpath)
 
                 if perms == 100644:  # file
                     file_content = entry['data']
                     with open(path, 'wb') as f:
                         f.write(file_content)
 
                     os.chmod(path, 0o644)
                 else:  # symlink
                     linkdest = entry['data']
-                    os.symlink(path, linkdest)
+                    os.symlink(linkdest, path)
 
             yield path, name
 
         # clean up tmp directory
         shutil.rmtree(tmpdir)
 
     revision = get_revision(revision_id)
     directory_id = revision['directory']
     tarpath = os.path.join(directory_dest, revision['archive_name'])
     archive_type = archive_type or revision['archive_type']
 
     files = build_tree()
 
     # build archive from the tree
     tarball.compress(tarpath, archive_type, files)
 
 
 def parse_args():
     """Parse the configuration from the cli.
 
     """
     cli = argparse.ArgumentParser(
         description='Tarball creation from swh-storage.')
     cli.add_argument('--config-file', '-c',
                      help='configuration file')
     cli.add_argument('--type-archive', '-t',
                      help='archive type (zip or tar)')
     cli.add_argument('--directory', '-d',
                      help='configuration file path')
     cli.add_argument('--revision', '-r',
                      help='revision checksum')
 
     args = cli.parse_args()
 
     return args
 
 
 def check_args(args):
     """Check cli args and returns the error msg.
 
     Returns:
         List of error messages as string if some.
 
     """
-    errorMsgs = []
+    errors = []
     if not args.config_file:
-        errorMsgs.append('\n- Configuration file option.')
+        errors.append('\n- Configuration file option.')
 
     if not args.revision:
-        errorMsgs.append('\n- Revision checksum')
+        errors.append('\n- Revision checksum')
 
-    return errorMsgs
+    return errors
 
 
 if __name__ == '__main__':
     args = parse_args()
 
     errorMsgs = check_args(args)
     if errorMsgs:
         print('Some mandatory options are missing: %s' % ''.join(errorMsgs))
         sys.exit(1)
 
     conf = config.read(args.config_file)
     type_archive = args.type_archive or None
     directory_dest = args.directory or '.'
     revision_hex = args.revision
 
     if conf['storage_class'] == 'remote_storage':
         from swh.storage.api.client import RemoteStorage as Storage
     else:
         from swh.storage import Storage
 
     storage = Storage(conf['storage_args'])
 
     revision_id = hashutil.hex_to_hash(revision_hex)
     build_archive_from_revision(revision_id, type_archive, directory_dest)
diff --git a/swh/loader/tar/tests/test_utils.py b/swh/loader/tar/tests/test_utils.py
index 3dd870b..1aa16d5 100644
--- a/swh/loader/tar/tests/test_utils.py
+++ b/swh/loader/tar/tests/test_utils.py
@@ -1,157 +1,160 @@
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import unittest
 
 from nose.tools import istest
 
 from swh.loader.tar import utils
 
 
 class TestUtils(unittest.TestCase):
     @classmethod
-    def setupClass(cls):
+    def setUpClass(cls):
+
+        super().setUpClass()
+
         cls.files = {
             'free-ipmi-1.2.2.tar': ('free-ipmi-', '1.2.2', '.tar'),
             'free-ipmi-1.2.2.tar.gz': ('free-ipmi-', '1.2.2', '.tar.gz'),
             'free-ipmi-1.2.2.tar.tgz': ('free-ipmi-', '1.2.2', '.tar.tgz'),
             'gcc-testsuite-4.4.2-4.4.3.diff.bz2': (
                 'gcc-testsuite-', '4.4.2-4.4.3', '.diff.bz2'),
             'gcc-java-4.0.4.tar.gz': ('gcc-java-', '4.0.4', '.tar.gz'),
             'gmp-2.0.tar.lzma': ('gmp-', '2.0', '.tar.lzma'),
             'win-gerwin-0.6.zip': ('win-gerwin-', '0.6', '.zip'),
             'ballandpaddle-0.8.0.tar.xz': (
                 'ballandpaddle-', '0.8.0', '.tar.xz'),
             'mail-1.1.1.some.lz': ('mail-', '1.1.1.some', '.lz'),
             'gmp-4.1.1-4.1.2.diff.tar.Z': (
                 'gmp-', '4.1.1-4.1.2', '.diff.tar.Z'),
             'findutils-4.2.18.tar.bzip2': (
                 'findutils-', '4.2.18', '.tar.bzip2'),
             'gnunet-java-0.9.4.jar': ('gnunet-java-', '0.9.4', '.jar'),
             'pycdio-0.15-py2.5-linux-i686.egg': (
                 'pycdio-', '0.15-py2.5-linux-i686', '.egg'),
             'rbcdio-0.04.gem': ('rbcdio-', '0.04', '.gem'),
             'librejs-6.0.5.xpi': ('librejs-', '6.0.5', '.xpi'),
             'icecat-31.8.0.csb.langpack.xpi': (
                 'icecat-', '31.8.0.csb.langpack', '.xpi'),
             'icecatmobile-31.8.0.en-US.android-arm.apk': (
                 'icecatmobile-', '31.8.0.en-US.android-arm', '.apk'),
             'icecat-31.8.0.en-US.mac.dmg': (
                 'icecat-', '31.8.0.en-US.mac', '.dmg'),
             'gnutls-3.0.21-1gn.DevPak': ('gnutls-', '3.0.21-1gn', '.DevPak'),
 
             # . separator
             'greg-1.4.tar.gz': ('greg-', '1.4', '.tar.gz'),
 
             # number in software product
             'aspell6-pt_BR-20070411-0.tar.bz2': (
                 'aspell6-pt_BR-', '20070411-0', '.tar.bz2'),
             'libosip2-3.3.0.tar.gz': ('libosip2-', '3.3.0', '.tar.gz'),
 
             # other cases
             'hurd-F2-main.iso': ('hurd-F2-main', None, '.iso'),
 
             'winboard-4_0_5.exe': ('winboard-', '4_0_5', '.exe'),
 
             # particular patterns...
             'gift-0.1.9+3epsilon.tar.gz': (
                 'gift-', '0.1.9+3epsilon', '.tar.gz'),
             'gift-0.1.6pre2.tgz': ('gift-', '0.1.6pre2', '.tgz'),
             'binutils-2.19.1a.tar.bz2': ('binutils-', '2.19.1a', '.tar.bz2'),
             'readline-4.2-4.2a.diff.gz': ('readline-', '4.2-4.2a', '.diff.gz'),
 
             # with arch patterns
             'cvs-1.12.6-BSD.bin.gz': ('cvs-', '1.12.6-BSD.bin', '.gz'),
             'cvs-1.12.12-SunOS-5.8-i386.gz': (
                 'cvs-', '1.12.12-SunOS-5.8-i386', '.gz'),
             'gnutls-3.0.20-w32.zip': ('gnutls-', '3.0.20-w32', '.zip'),
             'mit-scheme_7.7.90+20080130-0gutsy1.diff.gz': (
                 'mit-scheme_', '7.7.90+20080130-0gutsy1', '.diff.gz'),
 
             # no release number
             'gnu.ps.gz': ('gnu', None, '.ps.gz'),
             'direvent-latest.tar.gz': ('direvent-latest', None, '.tar.gz'),
         }
 
         cls.files_error = ['.tar', '.anything']
 
     @istest
     def parse_filename(self):
         for f in self.files:
             # when
             actual_components = utils.parse_filename(f)
 
             # then
             name, version, ext = self.files[f]
             expected_components = {
                 'software_name': name,
                 'release_number': version,
                 'extension': ext,
             }
 
             self.assertEquals(actual_components, expected_components)
 
     @istest
     def parse_filename_not_parseable_file(self):
         for f in self.files_error:
             with self.assertRaises(ValueError):
                 utils.parse_filename(f)
 
     @istest
     def release_number(self):
         for f in self.files.keys():
             # when
             actual_ext = utils.release_number(f)
 
             # then
             _, expected_rel_num, _ = self.files[f]
             self.assertEquals(
                 actual_ext,
                 expected_rel_num,
                 'for %s, the version should be %s' % (f, expected_rel_num))
 
     @istest
     def commonname(self):
         # when
         actual_commonname = utils.commonname('/some/where/to/',
                                              '/some/where/to/go/to')
         # then
         self.assertEquals('go/to', actual_commonname)
 
         # when
         actual_commonname2 = utils.commonname(b'/some/where/to/',
                                               b'/some/where/to/go/to')
         # then
         self.assertEquals(b'go/to', actual_commonname2)
 
     @istest
     def convert_to_hex(self):
         # given
         input_dict = {
             'sha1_git': b'\xf6\xb7 \x8b+\xcd \x9fq5E\xe6\x03\xffg\x87\xd7\xb9D\xa1',  # noqa
             'sha1': b'\xf4O\xf0\xd4\xc0\xb0\xae\xca\xe4C\xab%\x10\xf7\x12h\x1e\x9f\xac\xeb',  # noqa
             'sha256': b'\xa8\xf9=\xf3\xfek\xa2$\xee\xc7\x1b\xc2\x83\xca\x96\xae8\xaf&\xab\x08\xfa\xb1\x13\xec(.s]\xf6Yb'}  # noqa
 
         expected_dict = {'sha1_git': 'f6b7208b2bcd209f713545e603ff6'
                                      '787d7b944a1',
                          'sha1': 'f44ff0d4c0b0aecae443ab2510f712681e'
                                  '9faceb',
                          'sha256': 'a8f93df3fe6ba224eec71bc283ca96ae3'
                                    '8af26ab08fab113ec282e735df65962'}
 
         # when
         actual_dict = utils.convert_to_hex(input_dict)
 
         # then
         self.assertDictEqual(actual_dict, expected_dict)
 
     @istest
     def convert_to_hex_edge_cases(self):
         # when
         actual_dict = utils.convert_to_hex({})
         # then
         self.assertDictEqual(actual_dict, {})
 
         self.assertIsNone(utils.convert_to_hex(None))