diff --git a/bin/swh-diff-db-mirror b/bin/swh-diff-db-mirror deleted file mode 100755 index b4a4def..0000000 --- a/bin/swh-diff-db-mirror +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import argparse -import os -import sys - -from swh.loader.tar import tarball, db - - -def list_branches(db_url): - """Return the distinct list of branches present in occurrence_history. - - """ - with db.connect(db_url) as db_conn: - for branch in db.query_fetch(db_conn, - 'select distinct branch from ' - 'occurrence_history'): - yield branch[0] - - -def diff_branch(root_dir, existing_set): - """Walk the root_dir and for every tarball not in existing_set, - yield its absolute path. - - """ - for dirpath, _, filenames in os.walk(root_dir): - for filename in filenames: - filepath = os.path.join(dirpath, filename) - if not os.path.exists(filepath): - continue - if not tarball.is_tarball(filepath): - continue - if filename in existing_set: - continue - - yield filepath - - -def parse_args(): - """Parse the configuration from the cli. - - """ - cli = argparse.ArgumentParser( - description='Diff between db and local fs mirror tarballs directory.' - 'Output what\'s on disk and not on db.') - cli.add_argument('--db-url', '-d', - help='db-url string') - cli.add_argument('--mirror-root-directory', '-m', - help='mirror root directory') - - args = cli.parse_args() - if not args.db_url or not args.mirror_root_directory: - print('Bad usage, cf. --help') - sys.exit(1) - - return args - - -if __name__ == '__main__': - args = parse_args() - - db_url = args.db_url - - already_present = set(list_branches(db_url)) - - root_dir = args.mirror_root_directory - for filepath in diff_branch(root_dir, already_present): - print(filepath) diff --git a/bin/swh-ls-tarball-size b/bin/swh-ls-tarball-size deleted file mode 100755 index ae23d47..0000000 --- a/bin/swh-ls-tarball-size +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (C) 2015-2017 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - - -import click -import os - -from swh.loader.tar import file - - -@click.command() -@click.option('--mirror-root-dir', required=1, help='path to the root dir.') -def main(mirror_root_dir): - """Parse the configuration from the cli. - - """ - for tarpath, _ in file.archives_from(mirror_root_dir): - print(tarpath, os.path.getsize(tarpath)) - - -if __name__ == '__main__': - main() diff --git a/bin/swh-update-tarball-size b/bin/swh-update-tarball-size deleted file mode 100755 index 00754c2..0000000 --- a/bin/swh-update-tarball-size +++ /dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import argparse -import os -import psycopg2 - -from contextlib import contextmanager - -from swh.core import hashutil -from swh.loader.tar import utils - - -def entry_to_bytes(entry): - """Convert an entry coming from the database to bytes""" - if isinstance(entry, memoryview): - return entry.tobytes() - if isinstance(entry, list): - return [entry_to_bytes(value) for value in entry] - return entry - - -def line_to_bytes(line): - """Convert a line coming from the database to bytes""" - return line.__class__(entry_to_bytes(entry) for entry in line) - - -def cursor_to_bytes(cursor): - """Yield all the data from a cursor as bytes""" - yield from (line_to_bytes(line) for line in cursor) - - -class Db: - """Proxy to the SWH DB, with wrappers around stored procedures - - """ - - @classmethod - def connect(cls, *args, **kwargs): - """factory method to create a DB proxy - - Accepts all arguments of psycopg2.connect; only some specific - possibilities are reported below. - - Args: - connstring: libpq2 connection string - - """ - conn = psycopg2.connect(*args, **kwargs) - return cls(conn) - - def _cursor(self, cur_arg): - """get a cursor: from cur_arg if given, or a fresh one otherwise - - meant to avoid boilerplate if/then/else in methods that proxy stored - procedures - - """ - if cur_arg is not None: - return cur_arg - # elif self.cur is not None: - # return self.cur - else: - return self.conn.cursor() - - def __init__(self, conn): - """create a DB proxy - - Args: - conn: psycopg2 connection to the SWH DB - - """ - self.conn = conn - - @contextmanager - def transaction(self): - """context manager to execute within a DB transaction - - Yields: - a psycopg2 cursor - - """ - with self.conn.cursor() as cur: - try: - yield cur - self.conn.commit() - except: - if not self.conn.closed: - self.conn.rollback() - raise - - def read_archives(self, cur=None): - cur = self._cursor(cur) - q = """select target, o.url, r.metadata#>>'{original_artifact,0,name}', - r.metadata#>>'{original_artifact,0,archive_type}', - r.metadata#>>'{original_artifact,0,sha1}', - r.metadata#>>'{original_artifact,0,sha256}', - r.metadata#>>'{original_artifact,0,sha1_git}' - from occurrence_history occ - inner join origin o on o.id=occ.origin - inner join revision r on occ.target = r.id - where target_type='revision' - and o.url like 'rsync://%gnu%' - and r.metadata#>>'{original_artifact,0,length}' is null - and r.metadata#>>'{original_artifact,0,archive_type}' is - not null; - """ - cur.execute(q) - for entry in cursor_to_bytes(cur): - url = entry[1] - name = entry[2] - path = os.path.join(url.replace('rsync://ftp.gnu.org/', ''), name) - yield { - 'revision_id': hashutil.hash_to_hex(entry[0]), - 'name': name, - 'path': path, - 'sha1': entry[3], - 'sha256': entry[4], - 'sha1_git': entry[5], - } - - -def parse_args(): - """Parse the configuration from the cli. - - """ - cli = argparse.ArgumentParser( - description='Tarball listing tarballs size.') - cli.add_argument('--mirror-root-dir', '-m', help='path to the root dir.') - cli.add_argument('--db-url', '-u', default=None, help='path to root dir.') - cli.add_argument('--dry-run', action='store_true', help='dry run.') - - args = cli.parse_args() - - return args - - -def read_revisions_per_tarname_from_db(root_dir, db_url): - db = Db.connect(db_url) - with db.transaction() as cur: - for data in db.read_archives(cur): - revision_id = data['revision_id'] - path = os.path.join(root_dir, data['path']) - - yield { - 'path': path, - 'revision_id': revision_id, - 'sha1': data['sha1'], - 'sha256': data['sha256'], - 'sha1_git': data['sha1_git'], - } - - -if __name__ == '__main__': - args = parse_args() - root_dir = args.mirror_root_dir - if root_dir.endswith('/'): - root_dir = root_dir.rstrip('/') - - dry_run = args.dry_run - db_url = args.db_url - revisions = read_revisions_per_tarname_from_db(root_dir, db_url) - - db = Db.connect(db_url) - with db.transaction() as cur: - # scan folder - count = 0 - for data in revisions: - tarpath = data['path'] - if not os.path.exists(tarpath): - print('%s skipped' % tarpath) - continue - - length = os.path.getsize(tarpath) - name = os.path.basename(tarpath) - - checksums = utils.convert_to_hex(hashutil.hashfile(tarpath)) - revid = data['revision_id'] - - if not revid: - print('%s %s %s' % (name, tarpath, checksums)) - continue - - count += 1 - - print('revision %s tarpath %s' % ( - revid, tarpath)) - - if dry_run: - continue - - query = """ - update revision - set metadata = jsonb_set(metadata, - '{original_artifact,0,length}', '%s') - where id='\\x%s' and - metadata#>>'{original_artifact,0,sha1}' = '%s' and - metadata#>>'{original_artifact,0,sha256}' = '%s' and - metadata#>>'{original_artifact,0,sha1_git}' = '%s' and - metadata#>>'{original_artifact,0,name}' = '%s'""" % ( - length, revid, checksums['sha1'], - checksums['sha256'], checksums['sha1_git'], name) - cur.execute(query) - - print('%s updates' % count) diff --git a/scratch/count_tarballs.py b/scratch/count_tarballs.py deleted file mode 100755 index 8e7634d..0000000 --- a/scratch/count_tarballs.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 - -import os -import tarfile -import zipfile - - -def is_tarball(filepath): - """Determine if the filepath is an tarball or not. - - This is dependent on the filepath only. - - Args: - filepath: the filepath without any paths. - - Returns: - Boolean True if an tarball, False otherwise. - - """ - - return tarfile.is_tarfile(filepath) or zipfile.is_zipfile(filepath) - - -def list_tarballs_from(path): - """From path, produce tarball tarball message to celery. - - Args: - path: top directory to list tarballs from. - - """ - for dirpath, dirnames, filenames in os.walk(path): - for fname in filenames: - tarpath = os.path.join(dirpath, fname) - if os.path.exists(tarpath) and is_tarball(tarpath): - yield dirpath, fname - - -def count_tarballs_from(path): - count = 0 - for dirpath, fname in list_tarballs_from(path): - count += 1 - - return count - - -if __name__ == '__main__': - for path in ['/home/storage/space/mirrors/gnu.org/gnu', - '/home/storage/space/mirrors/gnu.org/old-gnu']: - print("%s %s" % (path, count_tarballs_from(path))) diff --git a/setup.py b/setup.py index a1a4e31..067f83a 100644 --- a/setup.py +++ b/setup.py @@ -1,30 +1,28 @@ from setuptools import setup def parse_requirements(): requirements = [] for reqf in ('requirements.txt', 'requirements-swh.txt'): with open(reqf) as f: for line in f.readlines(): line = line.strip() if not line or line.startswith('#'): continue requirements.append(line) return requirements setup( name='swh.loader.tar', description='Software Heritage Tarball Loader', author='Software Heritage developers', author_email='swh-devel@inria.fr', url='https://forge.softwareheritage.org/diffusion/DLDTAR', packages=['swh.loader.tar', 'swh.loader.tar.tests'], - scripts=['bin/swh-diff-db-mirror', - 'bin/swh-ls-tarball-size', - 'bin/swh-update-tarball-size'], + scripts=[], install_requires=parse_requirements(), setup_requires=['vcversioner'], vcversioner={}, include_package_data=True, )