diff --git a/MANIFEST.in b/MANIFEST.in index 08ebc95..e7c46fc 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,4 @@ include Makefile include requirements.txt +include requirements-swh.txt include version.txt diff --git a/PKG-INFO b/PKG-INFO index 11aeca8..ebc0730 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.loader.tar -Version: 0.0.22 +Version: 0.0.23 Summary: Software Heritage Tarball Loader Home-page: https://forge.softwareheritage.org/diffusion/DLDTAR Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/README b/README index 47d02ec..45586c0 100644 --- a/README +++ b/README @@ -1,204 +1,195 @@ SWH-loader-tar ============== -The Software Heritage Tarball Loader is a tool and a library to uncompress a local -tarball and inject into the SWH dataset all unknown contained files. +The Software Heritage Tarball Loader is a tool and a library to +uncompress a local tarball and inject into the SWH dataset all unknown +contained files. Tarball loader ============== -Its job is to uncompress a tarball and load its content in swh storage. +Its job is to uncompress a tarball and load its content in swh +storage. ### Configuration This is the loader's (or task's) configuration file. 
-loader/tar.ini: - - [main] - - # the path where to extract the tarball before loading it into swh +loader/tar.yml: + +``` + extraction_dir: /home/storage/tmp/ + storage: + cls: local + args: + db: service=swh-dev + objstorage: + cls: pathslicing + args: + root: /home/storage/swh-storage + slicing: 0:2/2:4/4:6 + send_contents: True + send_directories: True + send_revisions: True + send_releases: True + send_occurrences: True + + content_packet_size: 10000 + content_packet_block_size_bytes: 104857600 + content_packet_size_bytes: 1073741824 + directory_packet_size: 25000 + revision_packet_size: 100000 + release_packet_size: 100000 + occurrence_packet_size: 100000 extraction_dir = /home/storage/tmp/ - - # access to swh's storage - storage_class = remote_storage - storage_args = http://localhost:5000/ - - # parameters to condition loading into swh storage - send_contents = True - send_directories = True - send_revisions = True - send_releases = True - send_occurrences = True - content_packet_size = 10000 - content_packet_size_bytes = 1073741824 - directory_packet_size = 25000 - revision_packet_size = 100000 - release_packet_size = 100000 - occurrence_packet_size = 100000 +``` Present in possible locations: - ~/.config/swh/loader/tar.ini - ~/.swh/loader/tar.ini - /etc/softwareheritage/loader/tar.ini ### API Load tarball directly from code or toplevel: - from swh.loader.tar.tasks import LoadTarRepository + from swh.loader.tar.loader import TarLoader + tarpath = '/some/path/to/blah-7.8.3.tgz' # Fill in those - origin = {} - release = None + origin = {'url': 'some-origin', 'type': 'dir'} + visit_date = 'Tue, 3 May 2017 17:16:32 +0200' revision = {} occurrence = {} - LoadTarRepository().run('/some/path/to/blah-7.8.3.tgz', - origin, - revision, - release, - [occurrence]) + TarLoader().load(tarpath, origin, visit_date, revision, [occurrence]) ### Celery Load tarball using celery. 
Providing you have a properly configured celery up and running, worker.ini needs to be updated with the following keys: task_modules = swh.loader.tar.tasks task_queues = swh_loader_tar cf. https://forge.softwareheritage.org/diffusion/DCORE/browse/master/README.md for more details #### Toplevel You can send the following message to the task queue: from swh.loader.tar.tasks import LoadTarRepository # Fill in those - origin = {} - release = None + tarpath = '/some/path/to/blah-7.8.3.tgz' + origin = {'url': 'some-origin', 'type': 'dir'} + visit_date = 'Tue, 3 May 2017 17:16:32 +0200' revision = {} occurrence = {} # Send message to the task queue - LoadTarRepository().apply_async(('/some/path/to/blah-7.8.3.tgz', - origin, - revision, - release, - [occurrence])) + LoadTarRepository().run((tarpath, origin, visit_date, revision, [occurrence])) Tar Producer ============ Its job is to compile from a file or a folder a list of existing -tarball. From this list, compute the corresponding messages to -send to the broker. +tarballs. From this list, compute the corresponding messages to send +to the broker. #### Configuration Message producer's configuration file: [main] # Mirror's root directory holding tarballs to load into swh mirror_root_directory = /srv/storage/space/mirrors/gnu.org/gnu/ # mirror_root_directory = /srv/storage/space/mirrors/gnu.org/old-gnu/ # Url scheme prefix used to create the origin url url_scheme = http://ftp.gnu.org/gnu/ # url_scheme = rsync://ftp.gnu.org/old-gnu/ # Origin type used for tarballs type = ftp # File containing a subset list of tarballs from mirror_root_directory to load. # The file's format is one absolute path name to a tarball per line. 
# NOTE: # - This file must contain data consistent with the mirror_root_directory # - if this option is not provided, the mirror_root_directory is scanned # completely as usual # mirror_subset_archives = /home/storage/missing-archives - # Authorities - gnu_authority = 4706c92a-8173-45d9-93d7-06523f249398 - swh_authority = 5f4d4c51-498a-4e28-88b3-b3e4e8396cba - # Randomize blocks of messages and send for consumption block_messages = 250 - # DEV options - - # Tryouts purposes (no limit if not specified) - # limit = 10 - - #### Run Trigger the message computations: - swh-loader-tar-producer --config ~/.swh/producer/tar.ini + python3 -m swh.loader.tar.producer --config ~/.swh/producer/tar.ini This will walk the `mirror_root_directory` folder and send encountered tarball messages for the swh-loader-tar to uncompress (through celery). If the `mirror_subset_archives` is provided, the tarball messages will be computed from that file (the mirror_root_directory is still used so please be consistent). If a problem arises during tarball message computation, a message will be output with the tarball that presents a problem. It will display the number of tarball messages sent at the end. Dry run: - swh-loader-tar-producer --config ~/.swh/producer/tar.ini --dry-run + python3 -m swh.loader.tar.producer --config-file ~/.swh/producer/tar.ini --dry-run This will do the same as previously described but only display the number of potential tarball messages computed. Help: - swh-loader-tar-producer -h + python3 -m swh.loader.tar.producer --help diff-db-mirror ============== Utility to compute the difference between the `occurrence_history` table (column branch) and the actual mirror path on disk. This will output the path to the tarballs not injected in db (for any reason). This output is to be consumed by the swh-loader-tar-producer in replay mode. 
Sample use: ./bin/swh-diff-db-mirror \ --db-url 'host= dbname= user= password=' \ --mirror-root-directory /path/to/mirrors/gnu.org/old-gnu Here is a sample output: ... /home/storage/space/mirrors/gnu.org/gnu/miscfiles/miscfiles-1.4.2.tar.gz /home/storage/space/mirrors/gnu.org/gnu/zile/zile-2.4.5.tar.gz /home/storage/space/mirrors/gnu.org/gnu/zile/zile-2.3.10.tar.gz /home/storage/space/mirrors/gnu.org/gnu/zile/zile-2.4.8.tar.gz /home/storage/space/mirrors/gnu.org/gnu/zile/zile-2.3.5.tar.gz /home/storage/space/mirrors/gnu.org/gnu/zile/zile-2.3.7.tar.gz /home/storage/space/mirrors/gnu.org/gnu/zile/zile-2.3.14.tar.gz /home/storage/space/mirrors/gnu.org/gnu/zile/zile-2.2.59.tar.gz /home/storage/space/mirrors/gnu.org/gnu/zile/zile-2.3.9.tar.gz /home/storage/space/mirrors/gnu.org/gnu/zile/zile-2.3.11.tar.gz diff --git a/bin/swh-diff-db-mirror b/bin/swh-diff-db-mirror deleted file mode 100755 index b4a4def..0000000 --- a/bin/swh-diff-db-mirror +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import argparse -import os -import sys - -from swh.loader.tar import tarball, db - - -def list_branches(db_url): - """Return the distinct list of branches present in occurrence_history. - - """ - with db.connect(db_url) as db_conn: - for branch in db.query_fetch(db_conn, - 'select distinct branch from ' - 'occurrence_history'): - yield branch[0] - - -def diff_branch(root_dir, existing_set): - """Walk the root_dir and for every tarball not in existing_set, - yield its absolute path. 
- - """ - for dirpath, _, filenames in os.walk(root_dir): - for filename in filenames: - filepath = os.path.join(dirpath, filename) - if not os.path.exists(filepath): - continue - if not tarball.is_tarball(filepath): - continue - if filename in existing_set: - continue - - yield filepath - - -def parse_args(): - """Parse the configuration from the cli. - - """ - cli = argparse.ArgumentParser( - description='Diff between db and local fs mirror tarballs directory.' - 'Output what\'s on disk and not on db.') - cli.add_argument('--db-url', '-d', - help='db-url string') - cli.add_argument('--mirror-root-directory', '-m', - help='mirror root directory') - - args = cli.parse_args() - if not args.db_url or not args.mirror_root_directory: - print('Bad usage, cf. --help') - sys.exit(1) - - return args - - -if __name__ == '__main__': - args = parse_args() - - db_url = args.db_url - - already_present = set(list_branches(db_url)) - - root_dir = args.mirror_root_directory - for filepath in diff_branch(root_dir, already_present): - print(filepath) diff --git a/bin/swh-loader-tar-retrieve-tarball b/bin/swh-loader-tar-retrieve-tarball deleted file mode 100755 index 0ef0a12..0000000 --- a/bin/swh-loader-tar-retrieve-tarball +++ /dev/null @@ -1,190 +0,0 @@ -#!/usr/bin/env python3 - -# NOT FOR PRODUCTION -# - use swh storage api -# - does not deal with missing contents yet so the tarball could be uncomplete - -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import argparse -import collections -import itertools -import os -import shutil -import sys -import tempfile - -from swh.core import hashutil, config -from swh.loader.tar import tarball, utils - - -def escape_hash(sha1): - """Escape an hexa sha1 to a ready queryable sha1.""" - if isinstance(sha1, bytes): - sha1 = 
hashutil.hash_to_hex(sha1) - return '\\x%s' % sha1 - - -def get_revision(revision_id): - """Return the directory sha1 the revision with id revision_id points to. - - """ - for revision in storage.revision_get([revision_id]): - if 'metadata' in revision: - meta = revision['metadata'] - artifact = meta['original_artifact'][0] - revision['archive_name'] = artifact['name'] - revision['archive_type'] = artifact['archive_type'] - - return revision - - -CONTENTS_BLOCK_SIZE = 10000 - - -def _directory_ls_contents(ls_contents): - # Split in iterable blocks of size CONTENTS_BLOCK_SIZE - # filled with empty values if need for the last block - blocks_contents = utils.grouper(ls_contents, CONTENTS_BLOCK_SIZE, - fillvalue=None) - - for block_contents in blocks_contents: - full_contents = [] - content_by_sha = collections.defaultdict(list) - - # iter over contents (beware the last block can contain empty ones) - for content in block_contents: - if not content or content['status'] != 'visible': - continue - - full_contents.append(content['sha1']) - content_by_sha[content['sha1']].append(content) - - for c in storage.content_get(full_contents): - for content in content_by_sha[c['sha1']]: - content['data'] = c['data'] - yield content - - -def directory_ls_with_content(directory_id, recursive=True): - """List directories with their data when content targeted is a file. 
- - """ - ls_dirs, ls_contents = itertools.tee( - storage.directory_get(directory_id, recursive=recursive)) - - yield from itertools.chain( - (e for e in ls_dirs if e['type'] == 'dir'), - _directory_ls_contents((e for e in ls_contents if e['type'] != 'dir'))) - - -def build_archive_from_revision(revision_id, - archive_type=None, - directory_dest='.'): - def mkdir(path): - os.makedirs(path, exist_ok=True) - os.chmod(path, 0o755) - - def build_tree(): - # build fs structure - tmpdir = tempfile.mkdtemp(suffix='create-tarball', - prefix='swh.loader.tar-', - dir='/tmp') - - for entry in directory_ls_with_content(directory_id, recursive=True): - name = entry['name'].decode('utf-8') - perms = entry['perms'] - - path = os.path.join(tmpdir, name) - if perms == 40000: # dir - mkdir(path) - else: - dirpath = os.path.dirname(path) - mkdir(dirpath) - - if perms == 100644: # file - file_content = entry['data'] - with open(path, 'wb') as f: - f.write(file_content) - - os.chmod(path, 0o644) - else: # symlink - linkdest = entry['data'] - os.symlink(path, linkdest) - - yield path, name - - # clean up tmp directory - shutil.rmtree(tmpdir) - - revision = get_revision(revision_id) - directory_id = revision['directory'] - tarpath = os.path.join(directory_dest, revision['archive_name']) - archive_type = archive_type or revision['archive_type'] - - files = build_tree() - # build archive from the tree - tarball.compress(tarpath, archive_type, files) - - -def parse_args(): - """Parse the configuration from the cli. 
- - """ - cli = argparse.ArgumentParser( - description='Tarball creation from swh-storage.') - cli.add_argument('--config-file', '-c', help='configuration file') - cli.add_argument('--type-archive', '-t', - help='archive type (zip or tar)') - cli.add_argument('--directory', '-d', - help='configuration file path') - cli.add_argument('--revision', '-r', - help='revision checksum') - - args = cli.parse_args() - - return args - - -def check_args(args): - """Check cli args and returns the error msg. - - Returns: - List of error messages as string if some. - - """ - errors = [] - if not args.config_file: - errors.append('\n- Configuration file option.') - - if not args.revision: - errors.append('\n- Revision checksum') - - return errors - - -if __name__ == '__main__': - args = parse_args() - - errorMsgs = check_args(args) - if errorMsgs: - print('Some mandatory options are missing: %s' % ''.join(errorMsgs)) - sys.exit(1) - - conf = config.read(args.config_file) - type_archive = args.type_archive or None - directory_dest = args.directory or '.' - revision_hex = args.revision - - if conf['storage_class'] == 'remote_storage': - from swh.storage.api.client import RemoteStorage as Storage - else: - from swh.storage import Storage - - storage = Storage(conf['storage_args']) - - revision_id = hashutil.hex_to_hash(revision_hex) - build_archive_from_revision(revision_id, type_archive, directory_dest) diff --git a/bin/swh-ls-tarball-size b/bin/swh-ls-tarball-size deleted file mode 100755 index 1eb8a77..0000000 --- a/bin/swh-ls-tarball-size +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import argparse -import os - -from swh.loader.tar import file - - -def parse_args(): - """Parse the configuration from the cli. 
- - """ - cli = argparse.ArgumentParser( - description='Tarball listing tarballs size.') - cli.add_argument('--mirror-root-dir', '-m', help='path to the root dir.') - - args = cli.parse_args() - - return args - - -if __name__ == '__main__': - args = parse_args() - root_dir = args.mirror_root_dir - - for tarpath, _ in file.archives_from(root_dir): - print('%s %s' % (tarpath, os.path.getsize(tarpath))) diff --git a/bin/swh-update-tarball-size b/bin/swh-update-tarball-size deleted file mode 100755 index 00754c2..0000000 --- a/bin/swh-update-tarball-size +++ /dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import argparse -import os -import psycopg2 - -from contextlib import contextmanager - -from swh.core import hashutil -from swh.loader.tar import utils - - -def entry_to_bytes(entry): - """Convert an entry coming from the database to bytes""" - if isinstance(entry, memoryview): - return entry.tobytes() - if isinstance(entry, list): - return [entry_to_bytes(value) for value in entry] - return entry - - -def line_to_bytes(line): - """Convert a line coming from the database to bytes""" - return line.__class__(entry_to_bytes(entry) for entry in line) - - -def cursor_to_bytes(cursor): - """Yield all the data from a cursor as bytes""" - yield from (line_to_bytes(line) for line in cursor) - - -class Db: - """Proxy to the SWH DB, with wrappers around stored procedures - - """ - - @classmethod - def connect(cls, *args, **kwargs): - """factory method to create a DB proxy - - Accepts all arguments of psycopg2.connect; only some specific - possibilities are reported below. 
- - Args: - connstring: libpq2 connection string - - """ - conn = psycopg2.connect(*args, **kwargs) - return cls(conn) - - def _cursor(self, cur_arg): - """get a cursor: from cur_arg if given, or a fresh one otherwise - - meant to avoid boilerplate if/then/else in methods that proxy stored - procedures - - """ - if cur_arg is not None: - return cur_arg - # elif self.cur is not None: - # return self.cur - else: - return self.conn.cursor() - - def __init__(self, conn): - """create a DB proxy - - Args: - conn: psycopg2 connection to the SWH DB - - """ - self.conn = conn - - @contextmanager - def transaction(self): - """context manager to execute within a DB transaction - - Yields: - a psycopg2 cursor - - """ - with self.conn.cursor() as cur: - try: - yield cur - self.conn.commit() - except: - if not self.conn.closed: - self.conn.rollback() - raise - - def read_archives(self, cur=None): - cur = self._cursor(cur) - q = """select target, o.url, r.metadata#>>'{original_artifact,0,name}', - r.metadata#>>'{original_artifact,0,archive_type}', - r.metadata#>>'{original_artifact,0,sha1}', - r.metadata#>>'{original_artifact,0,sha256}', - r.metadata#>>'{original_artifact,0,sha1_git}' - from occurrence_history occ - inner join origin o on o.id=occ.origin - inner join revision r on occ.target = r.id - where target_type='revision' - and o.url like 'rsync://%gnu%' - and r.metadata#>>'{original_artifact,0,length}' is null - and r.metadata#>>'{original_artifact,0,archive_type}' is - not null; - """ - cur.execute(q) - for entry in cursor_to_bytes(cur): - url = entry[1] - name = entry[2] - path = os.path.join(url.replace('rsync://ftp.gnu.org/', ''), name) - yield { - 'revision_id': hashutil.hash_to_hex(entry[0]), - 'name': name, - 'path': path, - 'sha1': entry[3], - 'sha256': entry[4], - 'sha1_git': entry[5], - } - - -def parse_args(): - """Parse the configuration from the cli. 
- - """ - cli = argparse.ArgumentParser( - description='Tarball listing tarballs size.') - cli.add_argument('--mirror-root-dir', '-m', help='path to the root dir.') - cli.add_argument('--db-url', '-u', default=None, help='path to root dir.') - cli.add_argument('--dry-run', action='store_true', help='dry run.') - - args = cli.parse_args() - - return args - - -def read_revisions_per_tarname_from_db(root_dir, db_url): - db = Db.connect(db_url) - with db.transaction() as cur: - for data in db.read_archives(cur): - revision_id = data['revision_id'] - path = os.path.join(root_dir, data['path']) - - yield { - 'path': path, - 'revision_id': revision_id, - 'sha1': data['sha1'], - 'sha256': data['sha256'], - 'sha1_git': data['sha1_git'], - } - - -if __name__ == '__main__': - args = parse_args() - root_dir = args.mirror_root_dir - if root_dir.endswith('/'): - root_dir = root_dir.rstrip('/') - - dry_run = args.dry_run - db_url = args.db_url - revisions = read_revisions_per_tarname_from_db(root_dir, db_url) - - db = Db.connect(db_url) - with db.transaction() as cur: - # scan folder - count = 0 - for data in revisions: - tarpath = data['path'] - if not os.path.exists(tarpath): - print('%s skipped' % tarpath) - continue - - length = os.path.getsize(tarpath) - name = os.path.basename(tarpath) - - checksums = utils.convert_to_hex(hashutil.hashfile(tarpath)) - revid = data['revision_id'] - - if not revid: - print('%s %s %s' % (name, tarpath, checksums)) - continue - - count += 1 - - print('revision %s tarpath %s' % ( - revid, tarpath)) - - if dry_run: - continue - - query = """ - update revision - set metadata = jsonb_set(metadata, - '{original_artifact,0,length}', '%s') - where id='\\x%s' and - metadata#>>'{original_artifact,0,sha1}' = '%s' and - metadata#>>'{original_artifact,0,sha256}' = '%s' and - metadata#>>'{original_artifact,0,sha1_git}' = '%s' and - metadata#>>'{original_artifact,0,name}' = '%s'""" % ( - length, revid, checksums['sha1'], - checksums['sha256'], 
checksums['sha1_git'], name) - cur.execute(query) - - print('%s updates' % count) diff --git a/bin/test-archive.sh b/bin/test-archive.sh deleted file mode 100755 index cb3c03f..0000000 --- a/bin/test-archive.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env bash - -generated_ar=./emacs-19.34.6-src.tar.gz -original_ar=/home/storage/space/mirrors/gnu.org/old-gnu/emacs/windows/19.34/src/emacs-19.34.6-src.tar.gz - -tmpdir=$(mktemp -d) -tmpdirgnu=$(mktemp -d) - -tar xvf $generated_ar -C $tmpdir -tar xvf $original_ar -C $tmpdirgnu - -echo "diff -r ar:$tmpdir gnu:$tmpdirgnu" -diff -r $tmpdir $tmpdirgnu - - -rm -rf $tmpdir $tmpdirgnu diff --git a/debian/control b/debian/control index 0de36fe..0bec5e7 100644 --- a/debian/control +++ b/debian/control @@ -1,22 +1,24 @@ Source: swh-loader-tar Maintainer: Software Heritage developers Section: python Priority: optional Build-Depends: debhelper (>= 9), dh-python, python3-all, python3-nose, python3-setuptools, python3-swh.core (>= 0.0.14~), python3-swh.scheduler, python3-swh.storage (>= 0.0.76~), - python3-swh.loader.dir (>= 0.0.23~), + python3-swh.loader.dir (>= 0.0.24~), python3-vcversioner Standards-Version: 3.9.6 Homepage: https://forge.softwareheritage.org/diffusion/DLDTAR/ Package: python3-swh.loader.tar Architecture: all -Depends: ${misc:Depends}, +Depends: python3-swh.core (>= 0.0.14~), python3-swh.storage (>= 0.0.76~), + python3-swh.loader.dir (>= 0.0.24~), python3-swh.scheduler, + ${misc:Depends}, ${python3:Depends} Description: Software Heritage Tarball Loader diff --git a/requirements-swh.txt b/requirements-swh.txt new file mode 100644 index 0000000..cd41b69 --- /dev/null +++ b/requirements-swh.txt @@ -0,0 +1,4 @@ +swh.core >= 0.0.14 +swh.scheduler +swh.storage >= 0.0.76 +swh.loader.dir >= 0.0.24 diff --git a/requirements.txt b/requirements.txt index bd9617b..8df856d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,7 @@ # Add here external Python modules dependencies, one per line. 
Module names # should match https://pypi.python.org/pypi names. For the full spec or # dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html vcversioner -swh.core >= 0.0.14 -swh.scheduler -swh.storage >= 0.0.76 -swh.loader.dir >= 0.0.23 retrying +click +python-dateutil diff --git a/resources/create-tarball.ini b/resources/create-tarball.ini index 4e5da01..d4f6d6c 100644 --- a/resources/create-tarball.ini +++ b/resources/create-tarball.ini @@ -1,4 +1,4 @@ [main] storage_class = remote_storage -storage_args = http://localhost:5000/ +storage_args = http://localhost:5002/ diff --git a/resources/producer/tar-gnu.ini b/resources/producer/tar-gnu.ini index 98e01c6..a1660bc 100644 --- a/resources/producer/tar-gnu.ini +++ b/resources/producer/tar-gnu.ini @@ -1,30 +1,24 @@ [main] # Mirror's root directory holding tarballs to load into swh mirror_root_directory = /home/storage/space/mirrors/gnu.org/gnu/ # Origin setup's possible scheme url url_scheme = rsync://ftp.gnu.org/gnu/ # Origin type used for tarballs type = ftp # File containing a subset list tarballs from mirror_root_directory to load. # The file's format is one absolute path name to a tarball per line. # NOTE: # - This file must contain data consistent with the mirror_root_directory # - if this option is not provided, the mirror_root_directory is scanned # completely as usual # mirror_subset_archives = /home/storage/missing-archives # Retrieval date information (rsync, etc...) 
date = Fri, 28 Aug 2015 13:13:26 +0200 # Randomize blocks of messages and send for consumption block_messages = 250 - -# DEV options - -# Tryouts purposes (no limit if not specified) -# limit = 10 - diff --git a/resources/producer/tar-old-gnu.ini b/resources/producer/tar-old-gnu.ini index db5cec1..54b78ff 100644 --- a/resources/producer/tar-old-gnu.ini +++ b/resources/producer/tar-old-gnu.ini @@ -1,30 +1,24 @@ [main] # Mirror's root directory holding tarballs to load into swh mirror_root_directory = /home/storage/space/mirrors/gnu.org/old-gnu/ # Origin setup's possible scheme url url_scheme = rsync://ftp.gnu.org/old-gnu/ # Origin type used for tarballs type = ftp # File containing a subset list tarballs from mirror_root_directory to load. # The file's format is one absolute path name to a tarball per line. # NOTE: # - This file must contain data consistent with the mirror_root_directory # - if this option is not provided, the mirror_root_directory is scanned # completely as usual # mirror_subset_archives = /home/tony/work/inria/repo/swh-environment/swh-loader-tar/old-gnu-missing # Retrieval date information (rsync, etc...) date = Fri, 28 Aug 2015 13:13:26 +0200 # Randomize blocks of messages and send for consumption block_messages = 100 - -# DEV options - -# Tryouts purposes (no limit if not specified) -#limit = 10 - diff --git a/scratch/count_tarballs.py b/scratch/count_tarballs.py deleted file mode 100755 index 8e7634d..0000000 --- a/scratch/count_tarballs.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 - -import os -import tarfile -import zipfile - - -def is_tarball(filepath): - """Determine if the filepath is an tarball or not. - - This is dependent on the filepath only. - - Args: - filepath: the filepath without any paths. - - Returns: - Boolean True if an tarball, False otherwise. 
- - """ - - return tarfile.is_tarfile(filepath) or zipfile.is_zipfile(filepath) - - -def list_tarballs_from(path): - """From path, produce tarball tarball message to celery. - - Args: - path: top directory to list tarballs from. - - """ - for dirpath, dirnames, filenames in os.walk(path): - for fname in filenames: - tarpath = os.path.join(dirpath, fname) - if os.path.exists(tarpath) and is_tarball(tarpath): - yield dirpath, fname - - -def count_tarballs_from(path): - count = 0 - for dirpath, fname in list_tarballs_from(path): - count += 1 - - return count - - -if __name__ == '__main__': - for path in ['/home/storage/space/mirrors/gnu.org/gnu', - '/home/storage/space/mirrors/gnu.org/old-gnu']: - print("%s %s" % (path, count_tarballs_from(path))) diff --git a/setup.py b/setup.py index e2fd83b..067f83a 100644 --- a/setup.py +++ b/setup.py @@ -1,30 +1,28 @@ from setuptools import setup def parse_requirements(): requirements = [] - with open('requirements.txt') as f: - for line in f.readlines(): - line = line.strip() - if not line or line.startswith('#'): - continue - requirements.append(line) - + for reqf in ('requirements.txt', 'requirements-swh.txt'): + with open(reqf) as f: + for line in f.readlines(): + line = line.strip() + if not line or line.startswith('#'): + continue + requirements.append(line) return requirements setup( name='swh.loader.tar', description='Software Heritage Tarball Loader', author='Software Heritage developers', author_email='swh-devel@inria.fr', url='https://forge.softwareheritage.org/diffusion/DLDTAR', packages=['swh.loader.tar', 'swh.loader.tar.tests'], - scripts=['bin/swh-diff-db-mirror', - 'bin/swh-ls-tarball-size', - 'bin/swh-update-tarball-size'], + scripts=[], install_requires=parse_requirements(), setup_requires=['vcversioner'], vcversioner={}, include_package_data=True, ) diff --git a/swh.loader.tar.egg-info/PKG-INFO b/swh.loader.tar.egg-info/PKG-INFO index 11aeca8..ebc0730 100644 --- a/swh.loader.tar.egg-info/PKG-INFO +++ 
b/swh.loader.tar.egg-info/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.loader.tar -Version: 0.0.22 +Version: 0.0.23 Summary: Software Heritage Tarball Loader Home-page: https://forge.softwareheritage.org/diffusion/DLDTAR Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/swh.loader.tar.egg-info/SOURCES.txt b/swh.loader.tar.egg-info/SOURCES.txt index f30f4ee..d59bf44 100644 --- a/swh.loader.tar.egg-info/SOURCES.txt +++ b/swh.loader.tar.egg-info/SOURCES.txt @@ -1,41 +1,36 @@ .gitignore AUTHORS LICENSE MANIFEST.in Makefile README +requirements-swh.txt requirements.txt setup.py version.txt -bin/swh-diff-db-mirror -bin/swh-loader-tar-retrieve-tarball -bin/swh-ls-tarball-size -bin/swh-update-tarball-size -bin/test-archive.sh debian/changelog debian/compat debian/control debian/copyright debian/rules debian/source/format resources/create-tarball.ini resources/loader/tar.ini resources/producer/tar-gnu.ini resources/producer/tar-old-gnu.ini -scratch/count_tarballs.py swh.loader.tar.egg-info/PKG-INFO swh.loader.tar.egg-info/SOURCES.txt swh.loader.tar.egg-info/dependency_links.txt swh.loader.tar.egg-info/requires.txt swh.loader.tar.egg-info/top_level.txt swh/loader/tar/__init__.py swh/loader/tar/build.py swh/loader/tar/db.py swh/loader/tar/file.py swh/loader/tar/loader.py swh/loader/tar/producer.py swh/loader/tar/tarball.py swh/loader/tar/tasks.py swh/loader/tar/utils.py swh/loader/tar/tests/test_build.py swh/loader/tar/tests/test_utils.py \ No newline at end of file diff --git a/swh.loader.tar.egg-info/requires.txt b/swh.loader.tar.egg-info/requires.txt index dc85f9d..f64313a 100644 --- a/swh.loader.tar.egg-info/requires.txt +++ b/swh.loader.tar.egg-info/requires.txt @@ -1,6 +1,8 @@ +click +python-dateutil retrying swh.core>=0.0.14 -swh.loader.dir>=0.0.23 +swh.loader.dir>=0.0.24 swh.scheduler swh.storage>=0.0.76 vcversioner diff --git a/swh/loader/tar/build.py 
b/swh/loader/tar/build.py index 83f9372..c5ff02d 100755 --- a/swh/loader/tar/build.py +++ b/swh/loader/tar/build.py @@ -1,136 +1,116 @@ -# Copyright (C) 2015 The Software Heritage developers +# Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os from swh.loader.tar import utils # Static setup EPOCH = 0 UTC_OFFSET = 0 SWH_PERSON = { 'name': 'Software Heritage', 'fullname': 'Software Heritage', 'email': 'robot@softwareheritage.org' } REVISION_MESSAGE = 'synthetic revision message' -RELEASE_MESSAGE = 'synthetic release message' REVISION_TYPE = 'tar' def compute_origin(url_scheme, url_type, root_dirpath, tarpath): """Compute the origin. Args: - url_scheme: scheme to build the origin's url - url_type: origin's type - root_dirpath: the top level root directory path - tarpath: file's absolute path Returns: Dictionary origin with keys: - url: origin's url - type: origin's type """ relative_path = utils.commonname(root_dirpath, tarpath) return { 'url': ''.join([url_scheme, os.path.dirname(relative_path)]), 'type': url_type, } -def occurrence_with_date(date, tarpath): +def compute_occurrence(tarpath): """Compute the occurrence using the tarpath's ctime. Args: - authority: the authority's uuid tarpath: file's path Returns: - Occurrence dictionary (cf. _build_occurrence) + Occurrence dictionary. """ return { 'branch': os.path.basename(tarpath), - 'date': date } def _time_from_path(tarpath): """Compute the modification time from the tarpath. + Args: + tarpath (str|bytes): Full path to the archive to extract the + date from. + + Returns: + dict representing a timestamp with keys seconds and microseconds keys. 
+ """ - return os.lstat(tarpath).st_mtime + mtime = os.lstat(tarpath).st_mtime + if isinstance(mtime, float): + normalized_time = list(map(int, str(mtime).split('.'))) + else: # assuming int + normalized_time = [mtime, 0] + + return { + 'seconds': normalized_time[0], + 'microseconds': normalized_time[1] + } def compute_revision(tarpath): """Compute a revision. Args: tarpath: absolute path to the tarball Returns: Revision as dict: - - date: the modification timestamp as returned by a fstat call - - committer_date: the modification timestamp as returned by a fstat - call + - date (dict): the modification timestamp as returned by + _time_from_path function + - committer_date: the modification timestamp as returned by + _time_from_path function - author: cf. SWH_PERSON - committer: cf. SWH_PERSON - type: cf. REVISION_TYPE - message: cf. REVISION_MESSAGE """ ts = _time_from_path(tarpath) return { 'date': { 'timestamp': ts, 'offset': UTC_OFFSET, }, 'committer_date': { 'timestamp': ts, 'offset': UTC_OFFSET, }, 'author': SWH_PERSON, 'committer': SWH_PERSON, 'type': REVISION_TYPE, 'message': REVISION_MESSAGE, } - - -def compute_release(filename, tarpath): - """Compute a release from a given tarpath, filename. - If the tarpath does not contain a recognizable release number, the release - can be skipped. - - Args: - filename: file's name without path - tarpath: file's absolute path - - Returns: - None if the release number cannot be extracted from the filename. 
- Otherwise a synthetic release is computed with the following keys: - - name: the release computed from the filename - - date: the modification timestamp as returned by a fstat call - - offset: 0 - - author_name: '' - - author_email: '' - - comment: '' - - """ - release_number = utils.release_number(filename) - if release_number: - return { - 'name': release_number, - 'date': { - 'timestamp': _time_from_path(tarpath), - 'offset': UTC_OFFSET, - }, - 'author': SWH_PERSON, - 'message': RELEASE_MESSAGE, - } - return None diff --git a/swh/loader/tar/loader.py b/swh/loader/tar/loader.py index 88a9f9b..7d320f4 100644 --- a/swh/loader/tar/loader.py +++ b/swh/loader/tar/loader.py @@ -1,130 +1,100 @@ -# Copyright (C) 2015-2016 The Software Heritage developers +# Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import datetime + import os import tempfile import shutil from swh.core import hashutil from swh.loader.dir import loader from swh.loader.tar import tarball, utils class TarLoader(loader.DirLoader): - """A tarball loader. - - """ - CONFIG_BASE_FILENAME = 'loader/tar' - - ADDITIONAL_CONFIG = { - 'extraction_dir': ('string', '/tmp') - } - - def __init__(self): - super().__init__(logging_class='swh.loader.tar.TarLoader') - - def load(self, tarpath, origin, visit, revision, release, occurrences): - """ - Load a tarball in backend. 
+ """A tarball loader: - This will: + - creates an origin if it does not exist + - creates a fetch_history entry + - creates an origin_visit - uncompress locally the tarballs in a temporary location - process the content of the tarballs to persist on swh storage - clean up the temporary location - write an entry in fetch_history to mark the loading tarball end + (success or failure) Args: - tarpath: path to the tarball to uncompress - origin: Dictionary origin - url: url origin we fetched - type: type of the origin - - visit: Numbered visit + - visit_date (str): To override the visit date - revision: Dictionary of information needed, keys are: - author_name: revision's author name - author_email: revision's author email - author_date: timestamp (e.g. 1444054085) - author_offset: date offset e.g. -0220, +0100 - committer_name: revision's committer name - committer_email: revision's committer email - committer_date: timestamp - committer_offset: date offset e.g. -0220, +0100 - type: type of revision dir, tar - message: synthetic message for the revision - - release: Dictionary of information needed, keys are: - - name: release name - - date: release timestamp (e.g. 1444054085) - - offset: release date offset e.g. -0220, +0100 - - author_name: release author's name - - author_email: release author's email - - comment: release's comment message - occurrences: List of occurrence dictionary. Information needed, keys are: - branch: occurrence's branch name - authority_id: authority id (e.g. 1 for swh) - validity: validity date (e.g. 2015-01-01 00:00:00+00) + """ + CONFIG_BASE_FILENAME = 'loader/tar' + + ADDITIONAL_CONFIG = { + 'extraction_dir': ('string', '/tmp') + } + + def __init__(self): + super().__init__(logging_class='swh.loader.tar.TarLoader') + + def prepare(self, *args, **kwargs): + """1. Uncompress the tarball in a temporary directory. + 2. Compute some metadata to update the revision. 
+ """ + tarpath, origin, visit_date, revision, occs = args + + if 'type' not in origin: # let the type flow if present + origin['type'] = 'tar' + # Prepare the extraction path extraction_dir = self.config['extraction_dir'] os.makedirs(extraction_dir, 0o755, exist_ok=True) dir_path = tempfile.mkdtemp(prefix='swh.loader.tar-', dir=extraction_dir) # add checksums in revision artifact = utils.convert_to_hex(hashutil.hashfile(tarpath)) artifact['name'] = os.path.basename(tarpath) - try: - self.log.info('Uncompress %s to %s' % (tarpath, dir_path)) - nature = tarball.uncompress(tarpath, dir_path) - artifact['archive_type'] = nature - artifact['length'] = os.path.getsize(tarpath) + self.log.info('Uncompress %s to %s' % (tarpath, dir_path)) + nature = tarball.uncompress(tarpath, dir_path) + artifact['archive_type'] = nature + artifact['length'] = os.path.getsize(tarpath) - revision['metadata'] = { - 'original_artifact': [artifact], - } + revision['metadata'] = { + 'original_artifact': [artifact], + } - return super().load( - dir_path, origin, visit, revision, release, occurrences) - finally: - shutil.rmtree(dir_path) + self.dir_path = dir_path - def prepare_and_load(self, - tarpath, origin, revision, release, occurrences): - """ - Prepare origin, fetch_origin, origin_visit - Then load a tarball 'tarpath'. - Then close origin_visit, fetch_history + super().prepare(dir_path, origin, visit_date, revision, None, occs) - First: - - creates an origin if it does not exist - - creates a fetch_history entry - - creates an origin_visit - - Then loads the tarball + def cleanup(self): + """Clean up temporary directory where we uncompress the tarball. 
""" - if 'type' not in origin: # let the type flow if present - origin['type'] = 'tar' - - self.origin_id = self.storage.origin_add_one(origin) - origin['id'] = self.origin_id - - date_visit = datetime.datetime.now(tz=datetime.timezone.utc) - origin_visit = self.storage.origin_visit_add(origin['id'], date_visit) - visit = origin_visit['visit'] - - fetch_history_id = self.open_fetch_history() - - try: - self.load(tarpath, origin, visit, revision, release, occurrences) - self.close_fetch_history_success(fetch_history_id) - self.storage.origin_visit_update( - self.origin_id, origin_visit['visit'], status='full') - except: - self.close_fetch_history_failure(fetch_history_id) - self.storage.origin_visit_update( - self.origin_id, origin_visit['visit'], status='partial') - raise + dir_path = self.dir_path + if dir_path and os.path.exists(dir_path): + shutil.rmtree(dir_path) diff --git a/swh/loader/tar/producer.py b/swh/loader/tar/producer.py index de21c40..e692b7f 100755 --- a/swh/loader/tar/producer.py +++ b/swh/loader/tar/producer.py @@ -1,179 +1,101 @@ -#!/usr/bin/env python3 - -# Copyright (C) 2015 The Software Heritage developers +# Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import argparse -import sys +import click +import dateutil.parser + +from swh.scheduler.utils import get_task from swh.core import config from swh.loader.tar import build, file -task_queue = 'swh.loader.tar.tasks.LoadTarRepository' - +TASK_QUEUE = 'swh.loader.tar.tasks.LoadTarRepository' -def compute_message_from(app, conf, root_dir, tarpath, filename, - retrieval_date, dry_run=False): - """Compute and post the message to worker for the archive tarpath. 
- Args: - app: instance of the celery app - conf: dictionary holding static metadata - root_dir: root directory - tarball: the archive's representation - retrieval_date: retrieval date of information - dry_run: will compute but not send messages - - Returns: - None - - Raises: - ValueError when release number computation error arise. - - """ - origin = build.compute_origin(conf['url_scheme'], - conf['type'], - root_dir, - tarpath) - revision = build.compute_revision(tarpath) - occurrence = build.occurrence_with_date(retrieval_date, tarpath) - release = build.compute_release(filename, tarpath) - - if not dry_run: - app.tasks[task_queue].delay(tarpath, - origin, - revision, - release, - [occurrence]) - - -def produce_archive_messages_from(app, conf, path, - retrieval_date, - mirror_file=None, - dry_run=False): - """From path, produce archive tarball messages to celery. +def produce_archive_messages_from( + conf, root_dir, visit_date, mirror_file=None, dry_run=False): + """From root_dir, produce archive tarball messages to celery. Will print error message when some computation arise on archive and continue. Args: - app: instance of the celery app conf: dictionary holding static metadata - path: top directory to list archives from. - retrieval_date: retrieval date of information + root_dir: top directory to list archives from. 
+ visit_date: override origin's visit date of information mirror_file: a filtering file of tarballs to load dry_run: will compute but not send messages Returns: - None - - Raises: - None + Number of messages generated """ limit = conf['limit'] block = int(conf['block_messages']) count = 0 - path_source_tarballs = mirror_file if mirror_file else path + path_source_tarballs = mirror_file if mirror_file else root_dir - for tarpath, fname in file.random_archives_from(path_source_tarballs, - block, - limit): + visit_date = dateutil.parser.parse(visit_date) + if not dry_run: + task = get_task(TASK_QUEUE) + + for tarpath, _ in file.random_archives_from( + path_source_tarballs, block, limit): try: - compute_message_from(app, conf, path, tarpath, fname, - retrieval_date, dry_run) + origin = build.compute_origin( + conf['url_scheme'], conf['type'], root_dir, tarpath) + revision = build.compute_revision(tarpath) + occurrence = build.compute_occurrence(tarpath) + + if not dry_run: + task.delay(tarpath, origin, visit_date, revision, [occurrence]) + count += 1 except ValueError: print('Problem with the following archive: %s' % tarpath) return count -def load_config(conf_file): - """Load the configuration from file. - - Args: - conf_file: path to a configuration file with the following content: - [main] - - # mirror's root directory holding tarballs to load into swh - mirror_root_directory = /home/storage/space/mirrors/gnu.org/gnu/ - - # origin setup's possible scheme url - url_scheme = rsync://ftp.gnu.org/gnu/ - - # origin type used for those tarballs - type = ftp - - # For tryouts purposes (no limit if not specified) - limit = 1 - - Returns: - dictionary of data present in the configuration file. 
- +@click.command() +@click.option('--config-file', required=1, + help='Configuration file path') +@click.option('--dry-run/--no-dry-run', default=False, + help='Dry run (print repo only)') +@click.option('--limit', default=None, + help='Number of origins limit to send') +def main(config_file, dry_run, limit): + """Tarball producer of local fs tarballs. """ - conf = config.read(conf_file, - default_conf={'limit': ('int', None)}) + conf = config.read(config_file) url_scheme = conf['url_scheme'] mirror_dir = conf['mirror_root_directory'] # remove trailing / in configuration (to ease ulterior computation) if url_scheme[-1] == '/': - conf.update({ - 'url_scheme': url_scheme[0:-1] - }) + conf['url_scheme'] = url_scheme[0:-1] if mirror_dir[-1] == '/': - conf.update({ - 'mirror_root_directory': mirror_dir[0:-1] - }) - - return conf + conf['mirror_root_directory'] = mirror_dir[0:-1] + if limit: + conf['limit'] = int(limit) -def parse_args(): - """Parse the configuration from the cli. - - """ - cli = argparse.ArgumentParser( - description='Tarball producer of local fs tarballs.') - cli.add_argument('--dry-run', '-n', - action='store_true', - help='Dry run (print repo only)') - cli.add_argument('--config', '-c', help='configuration file path') - - args = cli.parse_args() + nb_tarballs = produce_archive_messages_from( + conf=conf, + root_dir=conf['mirror_root_directory'], + visit_date=conf['date'], + mirror_file=conf.get('mirror_subset_archives'), + dry_run=dry_run) - return args + print('%s tarball(s) sent to worker.' 
% nb_tarballs) if __name__ == '__main__': - args = parse_args() - config_file = args.config - if not config_file: - print('Missing configuration file option.') - sys.exit(1) - - # instantiate celery app with its configuration - from swh.scheduler.celery_backend.config import app - from swh.loader.tar import tasks # noqa - - conf = load_config(config_file) - - retrieval_date = conf['date'] - - nb_tarballs = produce_archive_messages_from( - app, - conf, - conf['mirror_root_directory'], - retrieval_date, - conf.get('mirror_subset_archives'), - args.dry_run) - - print('%s tarball(s) sent to worker.' % nb_tarballs) + main() diff --git a/swh/loader/tar/tasks.py b/swh/loader/tar/tasks.py index c6ea825..cab4721 100644 --- a/swh/loader/tar/tasks.py +++ b/swh/loader/tar/tasks.py @@ -1,27 +1,28 @@ -# Copyright (C) 2015-2016 The Software Heritage developers +# Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.scheduler.task import Task from swh.loader.tar.loader import TarLoader class LoadTarRepository(Task): """Import a directory to Software Heritage """ task_queue = 'swh_loader_tar' - def run(self, tarpath, origin, revision, release, occurrences): + def run(self, tarpath, origin, visit_date, revision, occurrences): """Import a tarball into swh. Args: - tarpath: path to a tarball file - - origin, revision, release, occurrences: - cf. swh.loader.dir.loader.run docstring + - origin, visit_date, revision, release, occurrences: + cf. 
swh.loader.dir.loader.prepare docstring """ - TarLoader().prepare_and_load( - tarpath, origin, revision, release, occurrences) + loader = TarLoader() + loader.log = self.log + loader.load(tarpath, origin, visit_date, revision, occurrences) diff --git a/swh/loader/tar/tests/test_build.py b/swh/loader/tar/tests/test_build.py index e4a7763..94e2235 100644 --- a/swh/loader/tar/tests/test_build.py +++ b/swh/loader/tar/tests/test_build.py @@ -1,106 +1,106 @@ -# Copyright (C) 2015 The Software Heritage developers +# Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest from nose.tools import istest from unittest.mock import patch from swh.loader.tar import build class TestBuildUtils(unittest.TestCase): @istest def compute_origin(self): # given expected_origin = { 'url': 'rsync://some/url/package-foo', 'type': 'rsync', } # when actual_origin = build.compute_origin( 'rsync://some/url/', 'rsync', '/some/root/path/', '/some/root/path/package-foo/package-foo-1.2.3.tgz') # then self.assertEquals(actual_origin, expected_origin) @istest - def occurrence_with_date(self): + def compute_occurrence(self): # given expected_occurrence = { 'branch': b'package-bar.tgz', - 'date': '2015-10-22 08:44:47.422384+00' } # when - actual_occurrence = build.occurrence_with_date( - '2015-10-22 08:44:47.422384+00', b'/path/to/package-bar.tgz',) + actual_occurrence = build.compute_occurrence( + b'/path/to/package-bar.tgz') # then self.assertEquals(actual_occurrence, expected_occurrence) + @patch('swh.loader.tar.build._time_from_path') @istest - def compute_release__no_release(self): - # given - - # when - actual_release = build.compute_release( - 'pack-without-version.tgz', - '/some/path/to/pack-without-version.tgz') - - # then - self.assertIsNone(actual_release) - - @istest - def 
compute_release(self): - # given - expected_release = { - 'name': '1.2.3rc1', - 'date': { - 'timestamp': 'some-time', - 'offset': build.UTC_OFFSET, - }, - 'author': build.SWH_PERSON, - 'message': build.RELEASE_MESSAGE, - } - - # when - with patch('swh.loader.tar.build._time_from_path', - return_value='some-time'): - actual_release = build.compute_release( - 'foobar-1.2.3rc1.tgz', - '/some/path/to/path-without-version.tgz') + def compute_revision(self, mock_time_from_path): + mock_time_from_path.return_value = 'some-other-time' - # then - self.assertEquals(expected_release, actual_release) - - @istest - def compute_revision(self): # when - with patch('swh.loader.tar.build._time_from_path', - return_value='some-other-time'): - actual_revision = build.compute_revision('/some/path') + actual_revision = build.compute_revision('/some/path') expected_revision = { 'date': { 'timestamp': 'some-other-time', 'offset': build.UTC_OFFSET, }, 'committer_date': { 'timestamp': 'some-other-time', 'offset': build.UTC_OFFSET, }, 'author': build.SWH_PERSON, 'committer': build.SWH_PERSON, 'type': build.REVISION_TYPE, 'message': build.REVISION_MESSAGE, } # then self.assertEquals(actual_revision, expected_revision) + + mock_time_from_path.assert_called_once_with('/some/path') + + @patch('swh.loader.tar.build.os') + @istest + def time_from_path_with_float(self, mock_os): + class MockStat: + st_mtime = 1445348286.8308342 + mock_os.lstat.return_value = MockStat() + + actual_time = build._time_from_path('some/path') + + self.assertEquals(actual_time, { + 'seconds': 1445348286, + 'microseconds': 8308342 + }) + + mock_os.lstat.assert_called_once_with('some/path') + + @patch('swh.loader.tar.build.os') + @istest + def time_from_path_with_int(self, mock_os): + class MockStat: + st_mtime = 1445348286 + + mock_os.lstat.return_value = MockStat() + + actual_time = build._time_from_path('some/path') + + self.assertEquals(actual_time, { + 'seconds': 1445348286, + 'microseconds': 0 + }) + + 
mock_os.lstat.assert_called_once_with('some/path') diff --git a/swh/loader/tar/tests/test_utils.py b/swh/loader/tar/tests/test_utils.py index 1aa16d5..80d4da7 100644 --- a/swh/loader/tar/tests/test_utils.py +++ b/swh/loader/tar/tests/test_utils.py @@ -1,160 +1,56 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest from nose.tools import istest from swh.loader.tar import utils class TestUtils(unittest.TestCase): - @classmethod - def setUpClass(cls): - - super().setUpClass() - - cls.files = { - 'free-ipmi-1.2.2.tar': ('free-ipmi-', '1.2.2', '.tar'), - 'free-ipmi-1.2.2.tar.gz': ('free-ipmi-', '1.2.2', '.tar.gz'), - 'free-ipmi-1.2.2.tar.tgz': ('free-ipmi-', '1.2.2', '.tar.tgz'), - 'gcc-testsuite-4.4.2-4.4.3.diff.bz2': ( - 'gcc-testsuite-', '4.4.2-4.4.3', '.diff.bz2'), - 'gcc-java-4.0.4.tar.gz': ('gcc-java-', '4.0.4', '.tar.gz'), - 'gmp-2.0.tar.lzma': ('gmp-', '2.0', '.tar.lzma'), - 'win-gerwin-0.6.zip': ('win-gerwin-', '0.6', '.zip'), - 'ballandpaddle-0.8.0.tar.xz': ( - 'ballandpaddle-', '0.8.0', '.tar.xz'), - 'mail-1.1.1.some.lz': ('mail-', '1.1.1.some', '.lz'), - 'gmp-4.1.1-4.1.2.diff.tar.Z': ( - 'gmp-', '4.1.1-4.1.2', '.diff.tar.Z'), - 'findutils-4.2.18.tar.bzip2': ( - 'findutils-', '4.2.18', '.tar.bzip2'), - 'gnunet-java-0.9.4.jar': ('gnunet-java-', '0.9.4', '.jar'), - 'pycdio-0.15-py2.5-linux-i686.egg': ( - 'pycdio-', '0.15-py2.5-linux-i686', '.egg'), - 'rbcdio-0.04.gem': ('rbcdio-', '0.04', '.gem'), - 'librejs-6.0.5.xpi': ('librejs-', '6.0.5', '.xpi'), - 'icecat-31.8.0.csb.langpack.xpi': ( - 'icecat-', '31.8.0.csb.langpack', '.xpi'), - 'icecatmobile-31.8.0.en-US.android-arm.apk': ( - 'icecatmobile-', '31.8.0.en-US.android-arm', '.apk'), - 'icecat-31.8.0.en-US.mac.dmg': ( - 'icecat-', '31.8.0.en-US.mac', '.dmg'), - 'gnutls-3.0.21-1gn.DevPak': ('gnutls-', 
'3.0.21-1gn', '.DevPak'), - - # . separator - 'greg-1.4.tar.gz': ('greg-', '1.4', '.tar.gz'), - - # number in software product - 'aspell6-pt_BR-20070411-0.tar.bz2': ( - 'aspell6-pt_BR-', '20070411-0', '.tar.bz2'), - 'libosip2-3.3.0.tar.gz': ('libosip2-', '3.3.0', '.tar.gz'), - - # other cases - 'hurd-F2-main.iso': ('hurd-F2-main', None, '.iso'), - - 'winboard-4_0_5.exe': ('winboard-', '4_0_5', '.exe'), - - # particular patterns... - 'gift-0.1.9+3epsilon.tar.gz': ( - 'gift-', '0.1.9+3epsilon', '.tar.gz'), - 'gift-0.1.6pre2.tgz': ('gift-', '0.1.6pre2', '.tgz'), - 'binutils-2.19.1a.tar.bz2': ('binutils-', '2.19.1a', '.tar.bz2'), - 'readline-4.2-4.2a.diff.gz': ('readline-', '4.2-4.2a', '.diff.gz'), - - # with arch patterns - 'cvs-1.12.6-BSD.bin.gz': ('cvs-', '1.12.6-BSD.bin', '.gz'), - 'cvs-1.12.12-SunOS-5.8-i386.gz': ( - 'cvs-', '1.12.12-SunOS-5.8-i386', '.gz'), - 'gnutls-3.0.20-w32.zip': ('gnutls-', '3.0.20-w32', '.zip'), - 'mit-scheme_7.7.90+20080130-0gutsy1.diff.gz': ( - 'mit-scheme_', '7.7.90+20080130-0gutsy1', '.diff.gz'), - - # no release number - 'gnu.ps.gz': ('gnu', None, '.ps.gz'), - 'direvent-latest.tar.gz': ('direvent-latest', None, '.tar.gz'), - } - - cls.files_error = ['.tar', '.anything'] - - @istest - def parse_filename(self): - for f in self.files: - # when - actual_components = utils.parse_filename(f) - - # then - name, version, ext = self.files[f] - expected_components = { - 'software_name': name, - 'release_number': version, - 'extension': ext, - } - - self.assertEquals(actual_components, expected_components) - - @istest - def parse_filename_not_parseable_file(self): - for f in self.files_error: - with self.assertRaises(ValueError): - utils.parse_filename(f) - - @istest - def release_number(self): - for f in self.files.keys(): - # when - actual_ext = utils.release_number(f) - - # then - _, expected_rel_num, _ = self.files[f] - self.assertEquals( - actual_ext, - expected_rel_num, - 'for %s, the version should be %s' % (f, expected_rel_num)) - @istest 
def commonname(self): # when actual_commonname = utils.commonname('/some/where/to/', '/some/where/to/go/to') # then self.assertEquals('go/to', actual_commonname) # when actual_commonname2 = utils.commonname(b'/some/where/to/', b'/some/where/to/go/to') # then self.assertEquals(b'go/to', actual_commonname2) @istest def convert_to_hex(self): # given input_dict = { 'sha1_git': b'\xf6\xb7 \x8b+\xcd \x9fq5E\xe6\x03\xffg\x87\xd7\xb9D\xa1', # noqa 'sha1': b'\xf4O\xf0\xd4\xc0\xb0\xae\xca\xe4C\xab%\x10\xf7\x12h\x1e\x9f\xac\xeb', # noqa 'sha256': b'\xa8\xf9=\xf3\xfek\xa2$\xee\xc7\x1b\xc2\x83\xca\x96\xae8\xaf&\xab\x08\xfa\xb1\x13\xec(.s]\xf6Yb'} # noqa expected_dict = {'sha1_git': 'f6b7208b2bcd209f713545e603ff6' '787d7b944a1', 'sha1': 'f44ff0d4c0b0aecae443ab2510f712681e' '9faceb', 'sha256': 'a8f93df3fe6ba224eec71bc283ca96ae3' '8af26ab08fab113ec282e735df65962'} # when actual_dict = utils.convert_to_hex(input_dict) # then self.assertDictEqual(actual_dict, expected_dict) @istest def convert_to_hex_edge_cases(self): # when actual_dict = utils.convert_to_hex({}) # then self.assertDictEqual(actual_dict, {}) self.assertIsNone(utils.convert_to_hex(None)) diff --git a/swh/loader/tar/utils.py b/swh/loader/tar/utils.py index 7e90536..67706d6 100644 --- a/swh/loader/tar/utils.py +++ b/swh/loader/tar/utils.py @@ -1,170 +1,78 @@ -# Copyright (C) 2015 The Software Heritage developers +# Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import itertools import random -import re from swh.core import hashutil -# FIXME; extract this in property -# to recognize existing naming pattern -extensions = [ - 'ps', - 'zip', - 'tar', - 'gz', 'tgz', - 'bz2', 'bzip2', - 'lzma', 'lz', - 'xz', - 'Z', - 'diff', - 'iso', - 'exe', - 'jar', - 'egg', - 'gem', - 'xpi', - 'apk', - 'dmg', - 'DevPak', -] - - -pattern = 
re.compile(r''' -^ -(?: - # We have a software name and a release number, separated with a - # -, _ or dot. - (?P.+?[-_.]) - (?P[0-9][0-9a-zA-Z_.+:~-]*?) -| - # We couldn't match a release number, put everything in the - # software name. - (?P.+?) -) -(?P(?:\.(?:%s))+) -$ -''' % '|'.join(extensions), - flags=re.VERBOSE) - - -def parse_filename(filename): - """Parse a filename into its components. - - Parsing policy: - We use Debian's release number heuristic: A release number starts - with a digit, and is followed by alphanumeric characters or any of - ., +, :, ~ and - - - We hardcode a list of possible extensions, as this release number - scheme would match them too... We match on any combination of those. - - Greedy matching is done right to left (we only match the extension - greedily with +, software_name and release_number are matched lazily - with +? and *?). - - Args: - filename: filename without path. - - Returns: - Dictionary with the following keys: - - software_name - - release_number: can be None if it could not be found. - - extension - - Raises: - ValueError if the filename could not be parsed. - -""" - m = pattern.match(filename) - if not m: - raise ValueError('Filename %s could not be parsed.' % filename) - - d = m.groupdict() - return { - 'software_name': d['software_name1'] or d['software_name2'], - 'release_number': d['release_number'], - 'extension': d['extension'], - } - - -def release_number(filename): - """Compute the release number from the filename. - - cf. parse_filename's docstring - - """ - return parse_filename(filename)['release_number'] - - def commonname(path0, path1, as_str=False): """Compute the commonname between the path0 and path1. """ return path1.split(path0)[1] def convert_to_hex(d): """Convert a flat dictionary with bytes in values to the same dictionary with hex as values. Args: dict: flat dictionary with sha bytes in their values. Returns: Mirror dictionary with values as string hex. 
""" if not d: return d checksums = {} for key, h in d.items(): checksums[key] = hashutil.hash_to_hex(h) return checksums def grouper(iterable, n, fillvalue=None): """Collect data into fixed-length chunks or blocks. Args: iterable: an iterable n: size of block fillvalue: value to use for the last block Returns: fixed-length chunks of blocks as iterables """ args = [iter(iterable)] * n return itertools.zip_longest(*args, fillvalue=fillvalue) def random_blocks(iterable, block=100, fillvalue=None): """Given an iterable: - slice the iterable in data set of block-sized elements - randomized the data set - yield each element Args: iterable: iterable of data block: number of elements per block fillvalue: a fillvalue for the last block if not enough values in last block Returns: An iterable of randomized per block-size elements. """ count = 0 for iterable in grouper(iterable, block, fillvalue=fillvalue): count += 1 l = list(iterable) random.shuffle(l) for e in l: yield e diff --git a/version.txt b/version.txt index 53c1e92..1683b16 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.22-0-g79254b5 \ No newline at end of file +v0.0.23-0-gd4fe87e \ No newline at end of file