diff --git a/PKG-INFO b/PKG-INFO index 6e91a52..11aeca8 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.loader.tar -Version: 0.0.21 +Version: 0.0.22 Summary: Software Heritage Tarball Loader Home-page: https://forge.softwareheritage.org/diffusion/DLDTAR Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/bin/swh-update-tarball-size b/bin/swh-update-tarball-size index 3171dd9..00754c2 100755 --- a/bin/swh-update-tarball-size +++ b/bin/swh-update-tarball-size @@ -1,208 +1,209 @@ #!/usr/bin/env python3 # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import argparse import os import psycopg2 from contextlib import contextmanager from swh.core import hashutil from swh.loader.tar import utils def entry_to_bytes(entry): """Convert an entry coming from the database to bytes""" if isinstance(entry, memoryview): return entry.tobytes() if isinstance(entry, list): return [entry_to_bytes(value) for value in entry] return entry def line_to_bytes(line): """Convert a line coming from the database to bytes""" return line.__class__(entry_to_bytes(entry) for entry in line) def cursor_to_bytes(cursor): """Yield all the data from a cursor as bytes""" yield from (line_to_bytes(line) for line in cursor) class Db: """Proxy to the SWH DB, with wrappers around stored procedures """ @classmethod def connect(cls, *args, **kwargs): """factory method to create a DB proxy Accepts all arguments of psycopg2.connect; only some specific possibilities are reported below. Args: connstring: libpq2 connection string """ conn = psycopg2.connect(*args, **kwargs) return cls(conn) def _cursor(self, cur_arg): """get a cursor: from cur_arg if given, or a fresh one otherwise meant to avoid boilerplate if/then/else in methods that proxy stored procedures """ if cur_arg is not None: return cur_arg # elif self.cur is not None: # return self.cur else: return self.conn.cursor() def __init__(self, conn): """create a DB proxy Args: conn: psycopg2 connection to the SWH DB """ self.conn = conn @contextmanager def transaction(self): """context manager to execute within a DB transaction Yields: a psycopg2 cursor """ with self.conn.cursor() as cur: try: yield cur self.conn.commit() except: if not self.conn.closed: self.conn.rollback() raise def read_archives(self, cur=None): cur = self._cursor(cur) q = """select target, o.url, r.metadata#>>'{original_artifact,0,name}', r.metadata#>>'{original_artifact,0,archive_type}', r.metadata#>>'{original_artifact,0,sha1}', r.metadata#>>'{original_artifact,0,sha256}', r.metadata#>>'{original_artifact,0,sha1_git}' from occurrence_history occ inner join origin o on o.id=occ.origin inner join revision r on occ.target = r.id where target_type='revision' and o.url like 'rsync://%gnu%' and r.metadata#>>'{original_artifact,0,length}' is null and r.metadata#>>'{original_artifact,0,archive_type}' is not null; """ cur.execute(q) for entry in cursor_to_bytes(cur): url = entry[1] name = entry[2] path = os.path.join(url.replace('rsync://ftp.gnu.org/', ''), name) yield { 'revision_id': hashutil.hash_to_hex(entry[0]), 'name': name, 'path': path, 'sha1': entry[3], 'sha256': entry[4], 'sha1_git': entry[5], } def parse_args(): """Parse the configuration from the cli. """ cli = argparse.ArgumentParser( description='Tarball listing tarballs size.') cli.add_argument('--mirror-root-dir', '-m', help='path to the root dir.') cli.add_argument('--db-url', '-u', default=None, help='path to root dir.') cli.add_argument('--dry-run', action='store_true', help='dry run.') args = cli.parse_args() return args def read_revisions_per_tarname_from_db(root_dir, db_url): db = Db.connect(db_url) with db.transaction() as cur: for data in db.read_archives(cur): revision_id = data['revision_id'] path = os.path.join(root_dir, data['path']) yield { 'path': path, 'revision_id': revision_id, 'sha1': data['sha1'], 'sha256': data['sha256'], 'sha1_git': data['sha1_git'], } + if __name__ == '__main__': args = parse_args() root_dir = args.mirror_root_dir if root_dir.endswith('/'): root_dir = root_dir.rstrip('/') dry_run = args.dry_run db_url = args.db_url revisions = read_revisions_per_tarname_from_db(root_dir, db_url) db = Db.connect(db_url) with db.transaction() as cur: # scan folder count = 0 for data in revisions: tarpath = data['path'] if not os.path.exists(tarpath): print('%s skipped' % tarpath) continue length = os.path.getsize(tarpath) name = os.path.basename(tarpath) checksums = utils.convert_to_hex(hashutil.hashfile(tarpath)) revid = data['revision_id'] if not revid: print('%s %s %s' % (name, tarpath, checksums)) continue count += 1 print('revision %s tarpath %s' % ( revid, tarpath)) if dry_run: continue query = """ update revision set metadata = jsonb_set(metadata, '{original_artifact,0,length}', '%s') where id='\\x%s' and metadata#>>'{original_artifact,0,sha1}' = '%s' and metadata#>>'{original_artifact,0,sha256}' = '%s' and metadata#>>'{original_artifact,0,sha1_git}' = '%s' and metadata#>>'{original_artifact,0,name}' = '%s'""" % ( length, revid, checksums['sha1'], checksums['sha256'], checksums['sha1_git'], name) cur.execute(query) print('%s updates' % count) diff --git a/debian/control b/debian/control index 5fe4dc6..0de36fe 100644 --- a/debian/control +++ b/debian/control @@ -1,22 +1,22 @@ Source: swh-loader-tar Maintainer: Software Heritage developers Section: python Priority: optional Build-Depends: debhelper (>= 9), dh-python, python3-all, python3-nose, python3-setuptools, python3-swh.core (>= 0.0.14~), python3-swh.scheduler, - python3-swh.storage (>= 0.0.31~), - python3-swh.loader.dir (>= 0.0.22~), + python3-swh.storage (>= 0.0.76~), + python3-swh.loader.dir (>= 0.0.23~), python3-vcversioner Standards-Version: 3.9.6 Homepage: https://forge.softwareheritage.org/diffusion/DLDTAR/ Package: python3-swh.loader.tar Architecture: all Depends: ${misc:Depends}, ${python3:Depends} Description: Software Heritage Tarball Loader diff --git a/requirements.txt b/requirements.txt index cfc0175..bd9617b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,9 @@ # Add here external Python modules dependencies, one per line. Module names # should match https://pypi.python.org/pypi names. For the full spec or # dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html vcversioner swh.core >= 0.0.14 swh.scheduler -swh.storage >= 0.0.31 -swh.loader.dir >= 0.0.22 +swh.storage >= 0.0.76 +swh.loader.dir >= 0.0.23 retrying diff --git a/swh.loader.tar.egg-info/PKG-INFO b/swh.loader.tar.egg-info/PKG-INFO index 6e91a52..11aeca8 100644 --- a/swh.loader.tar.egg-info/PKG-INFO +++ b/swh.loader.tar.egg-info/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.loader.tar -Version: 0.0.21 +Version: 0.0.22 Summary: Software Heritage Tarball Loader Home-page: https://forge.softwareheritage.org/diffusion/DLDTAR Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/swh.loader.tar.egg-info/requires.txt b/swh.loader.tar.egg-info/requires.txt index c895c5d..dc85f9d 100644 --- a/swh.loader.tar.egg-info/requires.txt +++ b/swh.loader.tar.egg-info/requires.txt @@ -1,6 +1,6 @@ retrying swh.core>=0.0.14 -swh.loader.dir>=0.0.22 +swh.loader.dir>=0.0.23 swh.scheduler -swh.storage>=0.0.31 +swh.storage>=0.0.76 vcversioner diff --git a/swh/loader/tar/loader.py b/swh/loader/tar/loader.py index 9ef1376..88a9f9b 100644 --- a/swh/loader/tar/loader.py +++ b/swh/loader/tar/loader.py @@ -1,130 +1,130 @@ # Copyright (C) 2015-2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import os import tempfile import shutil from swh.core import hashutil from swh.loader.dir import loader from swh.loader.tar import tarball, utils class TarLoader(loader.DirLoader): """A tarball loader. """ - CONFIG_BASE_FILENAME = 'loader/tar.ini' + CONFIG_BASE_FILENAME = 'loader/tar' ADDITIONAL_CONFIG = { 'extraction_dir': ('string', '/tmp') } def __init__(self): super().__init__(logging_class='swh.loader.tar.TarLoader') def load(self, tarpath, origin, visit, revision, release, occurrences): """ Load a tarball in backend. This will: - uncompress locally the tarballs in a temporary location - process the content of the tarballs to persist on swh storage - clean up the temporary location - write an entry in fetch_history to mark the loading tarball end Args: - tarpath: path to the tarball to uncompress - origin: Dictionary origin - url: url origin we fetched - type: type of the origin - visit: Numbered visit - revision: Dictionary of information needed, keys are: - author_name: revision's author name - author_email: revision's author email - author_date: timestamp (e.g. 1444054085) - author_offset: date offset e.g. -0220, +0100 - committer_name: revision's committer name - committer_email: revision's committer email - committer_date: timestamp - committer_offset: date offset e.g. -0220, +0100 - type: type of revision dir, tar - message: synthetic message for the revision - release: Dictionary of information needed, keys are: - name: release name - date: release timestamp (e.g. 1444054085) - offset: release date offset e.g. -0220, +0100 - author_name: release author's name - author_email: release author's email - comment: release's comment message - occurrences: List of occurrence dictionary. Information needed, keys are: - branch: occurrence's branch name - authority_id: authority id (e.g. 1 for swh) - validity: validity date (e.g. 2015-01-01 00:00:00+00) """ # Prepare the extraction path extraction_dir = self.config['extraction_dir'] os.makedirs(extraction_dir, 0o755, exist_ok=True) dir_path = tempfile.mkdtemp(prefix='swh.loader.tar-', dir=extraction_dir) # add checksums in revision artifact = utils.convert_to_hex(hashutil.hashfile(tarpath)) artifact['name'] = os.path.basename(tarpath) try: self.log.info('Uncompress %s to %s' % (tarpath, dir_path)) nature = tarball.uncompress(tarpath, dir_path) artifact['archive_type'] = nature artifact['length'] = os.path.getsize(tarpath) revision['metadata'] = { 'original_artifact': [artifact], } return super().load( dir_path, origin, visit, revision, release, occurrences) finally: shutil.rmtree(dir_path) def prepare_and_load(self, tarpath, origin, revision, release, occurrences): """ Prepare origin, fetch_origin, origin_visit Then load a tarball 'tarpath'. Then close origin_visit, fetch_history First: - creates an origin if it does not exist - creates a fetch_history entry - creates an origin_visit - Then loads the tarball """ if 'type' not in origin: # let the type flow if present origin['type'] = 'tar' self.origin_id = self.storage.origin_add_one(origin) origin['id'] = self.origin_id date_visit = datetime.datetime.now(tz=datetime.timezone.utc) origin_visit = self.storage.origin_visit_add(origin['id'], date_visit) visit = origin_visit['visit'] fetch_history_id = self.open_fetch_history() try: self.load(tarpath, origin, visit, revision, release, occurrences) self.close_fetch_history_success(fetch_history_id) self.storage.origin_visit_update( self.origin_id, origin_visit['visit'], status='full') except: self.close_fetch_history_failure(fetch_history_id) self.storage.origin_visit_update( self.origin_id, origin_visit['visit'], status='partial') raise diff --git a/version.txt b/version.txt index b9937ac..53c1e92 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.21-0-gf330178 \ No newline at end of file +v0.0.22-0-g79254b5 \ No newline at end of file