diff --git a/PKG-INFO b/PKG-INFO index 726dc92..ae9713f 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.loader.git -Version: 0.0.31 +Version: 0.0.32 Summary: Software Heritage git loader Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/debian/changelog b/debian/changelog index 2dcfb2d..34bfad1 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,221 +1,222 @@ -swh-loader-git (0.0.31-1~swh1~bpo9+1) stretch-swh; urgency=medium +swh-loader-git (0.0.32-1~swh1) unstable-swh; urgency=medium - * Rebuild for stretch-backports. + * Release swh.loader.git 0.0.32 + * Update tasks to new swh.scheduler API - -- Nicolas Dandrimont Fri, 17 Mar 2017 17:40:16 +0100 + -- Nicolas Dandrimont Mon, 12 Jun 2017 18:04:50 +0200 swh-loader-git (0.0.31-1~swh1) unstable-swh; urgency=medium * Release swh.loader.git v0.0.31 * Migrate from swh.core.hashutil to swh.model.hashutil * Only send objects that are actually missing -- Nicolas Dandrimont Fri, 17 Mar 2017 17:40:17 +0100 swh-loader-git (0.0.30-1~swh1) unstable-swh; urgency=medium * Release swh.loader.git v0.0.30 * Fix handling of mergetag headers -- Nicolas Dandrimont Thu, 09 Mar 2017 11:30:08 +0100 swh-loader-git (0.0.29-1~swh1) unstable-swh; urgency=medium * v0.0.29 * GitLoaderFromArchive: Use the same configuration file as * GitLoader (permit to deploy both as the same unit) * git reader: Refactor to allow listing revisions as well as contents -- Antoine R. Dumont (@ardumont) Mon, 20 Feb 2017 11:32:24 +0100 swh-loader-git (0.0.28-1~swh1) unstable-swh; urgency=medium * v0.0.28 * loader: Fix fetch_date override -- Antoine R. Dumont (@ardumont) Wed, 15 Feb 2017 18:43:32 +0100 swh-loader-git (0.0.27-1~swh1) unstable-swh; urgency=medium * v0.0.27 * Add loader-git from archive -- Antoine R. Dumont (@ardumont) Tue, 14 Feb 2017 18:56:52 +0100 swh-loader-git (0.0.26-1~swh1) unstable-swh; urgency=medium * v0.0.26 * Add a git loader able to deal with git repository in archive -- Antoine R. Dumont (@ardumont) Tue, 14 Feb 2017 16:24:50 +0100 swh-loader-git (0.0.25-1~swh1) unstable-swh; urgency=medium * v0.0.25 * Fix to permit to actually pass the fetch date as parameter for * the loading git disk loader -- Antoine R. Dumont (@ardumont) Fri, 10 Feb 2017 17:34:35 +0100 swh-loader-git (0.0.24-1~swh1) unstable-swh; urgency=medium * v0.0.24 * Update storage configuration reading -- Antoine R. Dumont (@ardumont) Thu, 15 Dec 2016 18:40:29 +0100 swh-loader-git (0.0.23-1~swh1) unstable-swh; urgency=medium * Release swh.loader.git v0.0.23 * Make the save_data mechanism generic -- Nicolas Dandrimont Fri, 02 Dec 2016 15:34:05 +0100 swh-loader-git (0.0.22-1~swh1) unstable-swh; urgency=medium * v0.0.22 * Improve reader to permit to use it as analyzer tool -- Antoine R. Dumont (@ardumont) Fri, 04 Nov 2016 10:37:24 +0100 swh-loader-git (0.0.21-1~swh1) unstable-swh; urgency=medium * v0.0.21 * Improve the reader git to load all contents from a pack. * Improve to avoid unnecessary readings from db -- Antoine R. Dumont (@ardumont) Wed, 26 Oct 2016 17:06:12 +0200 swh-loader-git (0.0.20-1~swh1) unstable-swh; urgency=medium * v0.0.20 * Add new reader git task -- Antoine R. Dumont (@ardumont) Tue, 25 Oct 2016 18:40:17 +0200 swh-loader-git (0.0.19-1~swh1) unstable-swh; urgency=medium * v0.0.19 * Update git loaders to register origin_visit's state -- Antoine R. Dumont (@ardumont) Tue, 23 Aug 2016 16:34:15 +0200 swh-loader-git (0.0.18-1~swh1) unstable-swh; urgency=medium * Release swh.loader.git v0.0.18 * Properly handle skipped contents -- Nicolas Dandrimont Fri, 19 Aug 2016 18:12:44 +0200 swh-loader-git (0.0.16-1~swh1) unstable-swh; urgency=medium * Release swh.loader.git v0.0.16 * Add exist_ok to packfile cache directory creation -- Nicolas Dandrimont Mon, 01 Aug 2016 15:53:07 +0200 swh-loader-git (0.0.15-1~swh1) unstable-swh; urgency=medium * Release swh.loader.git v0.0.15 * Absence of remote refs doesn't throw an error in updater -- Nicolas Dandrimont Wed, 15 Jun 2016 01:20:37 +0200 swh-loader-git (0.0.14-1~swh1) unstable-swh; urgency=medium * Release swh.loader.git v0.0.14 * Add a disk loader using dulwich * Rework the loader logic to use a single pattern for both loaders * Allow caching of packfiles for the remote loader -- Nicolas Dandrimont Tue, 14 Jun 2016 18:10:21 +0200 swh-loader-git (0.0.13-1~swh1) unstable-swh; urgency=medium * Release swh.loader.git v0.0.13 * Update for latest schema revision -- Nicolas Dandrimont Fri, 08 Apr 2016 16:46:41 +0200 swh-loader-git (0.0.12-1~swh1) unstable-swh; urgency=medium * Release swh-loader-git v0.0.12 * Update to use new swh.storage api for object listing * Add a size limit to packfiles * Return a proper eventfulness for empty repositories * Do not crawl the pack file if unnecessary -- Nicolas Dandrimont Thu, 25 Feb 2016 18:21:34 +0100 swh-loader-git (0.0.11-1~swh1) unstable-swh; urgency=medium * Release swh.loader.git v0.0.11 * Implement git updater -- Nicolas Dandrimont Fri, 19 Feb 2016 19:13:22 +0100 swh-loader-git (0.0.10-1~swh1) unstable-swh; urgency=medium * Prepare swh.loader.git release v0.0.10 * Update for swh.model * Use new swh.storage -- Nicolas Dandrimont Mon, 07 Dec 2015 18:59:46 +0100 swh-loader-git (0.0.9-1~swh1) unstable-swh; urgency=medium * Prepare deployment of swh.loader.git v0.0.9 * Close fetch_history on failure too -- Nicolas Dandrimont Wed, 04 Nov 2015 10:54:37 +0100 swh-loader-git (0.0.8-1~swh1) unstable-swh; urgency=medium * Prepare deployment of swh.loader.git v0.0.8 * New database schema (v028) * Populate fetch_history (T121) -- Nicolas Dandrimont Tue, 27 Oct 2015 18:11:26 +0100 swh-loader-git (0.0.7-1~swh1) unstable-swh; urgency=medium * Prepare swh.loader.git v0.0.7 deployment -- Nicolas Dandrimont Mon, 19 Oct 2015 12:37:09 +0200 swh-loader-git (0.0.6-1~swh1) unstable-swh; urgency=medium * Prepare deployment of swh.loader.git v0.0.6 -- Nicolas Dandrimont Fri, 09 Oct 2015 17:50:35 +0200 swh-loader-git (0.0.5-1~swh1) unstable-swh; urgency=medium * Prepare deployment of swh.loader.git v0.0.5 -- Nicolas Dandrimont Tue, 06 Oct 2015 17:42:11 +0200 swh-loader-git (0.0.4-1~swh1) unstable-swh; urgency=medium * Prepare deployment of swh.loader.git v0.0.4 -- Nicolas Dandrimont Fri, 02 Oct 2015 14:54:04 +0200 swh-loader-git (0.0.3-1~swh1) unstable-swh; urgency=medium * Prepare deployment of swh.loader.git v0.0.3 -- Nicolas Dandrimont Thu, 01 Oct 2015 11:36:28 +0200 swh-loader-git (0.0.2-1~swh1) unstable-swh; urgency=medium * Prepare deploying swh.loader.git v0.0.2 -- Nicolas Dandrimont Tue, 29 Sep 2015 17:22:09 +0200 swh-loader-git (0.0.1-1~swh1) unstable-swh; urgency=medium * Initial release * Tagging swh.loader.git v0.0.1 -- Nicolas Dandrimont Fri, 25 Sep 2015 16:04:00 +0200 diff --git a/requirements-swh.txt b/requirements-swh.txt index 498ea27..f0dd184 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,4 +1,4 @@ swh.core >= 0.0.7 -swh.model >= 0.0.14 -swh.scheduler -swh.storage >= 0.0.76 +swh.model >= 0.0.15 +swh.scheduler >= 0.0.14 +swh.storage >= 0.0.83 diff --git a/swh.loader.git.egg-info/PKG-INFO b/swh.loader.git.egg-info/PKG-INFO index 726dc92..ae9713f 100644 --- a/swh.loader.git.egg-info/PKG-INFO +++ b/swh.loader.git.egg-info/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.loader.git -Version: 0.0.31 +Version: 0.0.32 Summary: Software Heritage git loader Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/swh.loader.git.egg-info/requires.txt b/swh.loader.git.egg-info/requires.txt index f258b7b..9e96a57 100644 --- a/swh.loader.git.egg-info/requires.txt +++ b/swh.loader.git.egg-info/requires.txt @@ -1,8 +1,8 @@ click dulwich retrying swh.core>=0.0.7 -swh.model>=0.0.14 -swh.scheduler -swh.storage>=0.0.76 +swh.model>=0.0.15 +swh.scheduler>=0.0.14 +swh.storage>=0.0.83 vcversioner diff --git a/swh/loader/git/converters.py b/swh/loader/git/converters.py index 2da671d..fc28332 100644 --- a/swh/loader/git/converters.py +++ b/swh/loader/git/converters.py @@ -1,229 +1,229 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Convert dulwich objects to dictionaries suitable for swh.storage""" from swh.model import hashutil -HASH_ALGORITHMS = hashutil.ALGORITHMS - {'sha1_git'} +HASH_ALGORITHMS = hashutil.DEFAULT_ALGORITHMS - {'sha1_git'} def origin_url_to_origin(origin_url): """Format a pygit2.Repository as an origin suitable for swh.storage""" return { 'type': 'git', 'url': origin_url, } def dulwich_blob_to_content_id(blob): """Convert a dulwich blob to a Software Heritage content id""" if blob.type_name != b'blob': return size = blob.raw_length() ret = { 'sha1_git': blob.sha().digest(), 'length': size, } data = blob.as_raw_string() ret.update(hashutil.hash_data(data, HASH_ALGORITHMS)) return ret def dulwich_blob_to_content(blob, log=None, max_content_size=None, origin_id=None): """Convert a dulwich blob to a Software Heritage content""" if blob.type_name != b'blob': return ret = dulwich_blob_to_content_id(blob) size = ret['length'] if max_content_size: if size > max_content_size: id = hashutil.hash_to_hex(ret['sha1_git']) if log: log.info('Skipping content %s, too large (%s > %s)' % (id, size, max_content_size), extra={ 'swh_type': 'loader_git_content_skip', 'swh_id': id, 'swh_size': size, }) ret['status'] = 'absent' ret['reason'] = 'Content too large' ret['origin'] = origin_id return ret data = blob.as_raw_string() ret['data'] = data ret['status'] = 'visible' return ret def dulwich_tree_to_directory(tree, log=None): """Format a tree as a directory""" if tree.type_name != b'tree': return ret = { 'id': tree.sha().digest(), } entries = [] ret['entries'] = entries entry_mode_map = { 0o040000: 'dir', 0o160000: 'rev', 0o100644: 'file', 0o100755: 'file', 0o120000: 'file', } for entry in tree.iteritems(): entries.append({ 'type': entry_mode_map.get(entry.mode, 'file'), 'perms': entry.mode, 'name': entry.path, 'target': hashutil.hash_to_bytes(entry.sha.decode('ascii')), }) return ret def parse_author(name_email): """Parse an author line""" if name_email is None: return None try: open_bracket = name_email.index(b'<') except ValueError: name = email = None else: raw_name = name_email[:open_bracket] raw_email = name_email[open_bracket+1:] if not raw_name: name = None elif raw_name.endswith(b' '): name = raw_name[:-1] else: name = raw_name try: close_bracket = raw_email.index(b'>') except ValueError: email = None else: email = raw_email[:close_bracket] return { 'name': name, 'email': email, 'fullname': name_email, } def dulwich_tsinfo_to_timestamp(timestamp, timezone, timezone_neg_utc): """Convert the dulwich timestamp information to a structure compatible with Software Heritage""" return { 'timestamp': timestamp, 'offset': timezone // 60, 'negative_utc': timezone_neg_utc if timezone == 0 else None, } def dulwich_commit_to_revision(commit, log=None): if commit.type_name != b'commit': return ret = { 'id': commit.sha().digest(), 'author': parse_author(commit.author), 'date': dulwich_tsinfo_to_timestamp( commit.author_time, commit.author_timezone, commit._author_timezone_neg_utc, ), 'committer': parse_author(commit.committer), 'committer_date': dulwich_tsinfo_to_timestamp( commit.commit_time, commit.commit_timezone, commit._commit_timezone_neg_utc, ), 'type': 'git', 'directory': bytes.fromhex(commit.tree.decode()), 'message': commit.message, 'metadata': None, 'synthetic': False, 'parents': [bytes.fromhex(p.decode()) for p in commit.parents], } git_metadata = [] if commit.encoding is not None: git_metadata.append(['encoding', commit.encoding]) if commit.mergetag: for mergetag in commit.mergetag: raw_string = mergetag.as_raw_string() assert raw_string.endswith(b'\n') git_metadata.append(['mergetag', raw_string[:-1]]) if commit.extra: git_metadata.extend([k.decode('utf-8'), v] for k, v in commit.extra) if commit.gpgsig: git_metadata.append(['gpgsig', commit.gpgsig]) if git_metadata: ret['metadata'] = { 'extra_headers': git_metadata, } return ret DULWICH_TYPES = { b'blob': 'content', b'tree': 'directory', b'commit': 'revision', b'tag': 'release', } def dulwich_tag_to_release(tag, log=None): if tag.type_name != b'tag': return target_type, target = tag.object ret = { 'id': tag.sha().digest(), 'name': tag.name, 'target': bytes.fromhex(target.decode()), 'target_type': DULWICH_TYPES[target_type.type_name], 'message': tag._message, 'metadata': None, 'synthetic': False, } if tag.tagger: ret['author'] = parse_author(tag.tagger) ret['date'] = dulwich_tsinfo_to_timestamp( tag.tag_time, tag.tag_timezone, tag._tag_timezone_neg_utc, ) else: ret['author'] = ret['date'] = None return ret diff --git a/swh/loader/git/tasks.py b/swh/loader/git/tasks.py index f941e08..5f449af 100644 --- a/swh/loader/git/tasks.py +++ b/swh/loader/git/tasks.py @@ -1,70 +1,70 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import dateutil.parser from swh.scheduler.task import Task from .loader import GitLoader, GitLoaderFromArchive from .updater import BulkUpdater from .reader import GitSha1RemoteReaderAndSendToQueue # TODO: rename to LoadRemoteGitRepository class UpdateGitRepository(Task): """Import a git repository from a remote location""" task_queue = 'swh_loader_git' - def run(self, repo_url, base_url=None): + def run_task(self, repo_url, base_url=None): """Import a git repository""" loader = BulkUpdater() loader.log = self.log return loader.load(repo_url, base_url) class LoadDiskGitRepository(Task): """Import a git repository from disk""" task_queue = 'swh_loader_git_express' - def run(self, origin_url, directory, date): + def run_task(self, origin_url, directory, date): """Import a git repository, cloned in `directory` from `origin_url` at `date`.""" loader = GitLoader() loader.log = self.log return loader.load(origin_url, directory, dateutil.parser.parse(date)) class UncompressAndLoadDiskGitRepository(Task): """Import a git repository from a zip archive""" task_queue = 'swh_loader_git_archive' - def run(self, origin_url, archive_path, date): + def run_task(self, origin_url, archive_path, date): """1. Uncompress an archive repository in a local and temporary folder 2. Load it through the git disk loader 3. Clean up the temporary folder """ loader = GitLoaderFromArchive() loader.log = self.log return loader.load( origin_url, archive_path, dateutil.parser.parse(date)) class ReaderGitRepository(Task): task_queue = 'swh_reader_git' - def run(self, repo_url, base_url=None): + def run_task(self, repo_url, base_url=None): """Read a git repository from a remote location and send sha1 to archival. """ loader = GitSha1RemoteReaderAndSendToQueue() loader.log = self.log return loader.load(repo_url) diff --git a/swh/loader/git/tests/test_converters.py b/swh/loader/git/tests/test_converters.py index bca61d2..c5ca08e 100644 --- a/swh/loader/git/tests/test_converters.py +++ b/swh/loader/git/tests/test_converters.py @@ -1,173 +1,176 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import shutil import subprocess import tempfile import unittest from nose.tools import istest import dulwich.repo import swh.loader.git.converters as converters from swh.model.hashutil import bytehex_to_hash, hash_to_bytes class TestConverters(unittest.TestCase): @classmethod def setUpClass(cls): super().setUpClass() cls.repo_path = tempfile.mkdtemp() cls.repo = dulwich.repo.Repo.init_bare(cls.repo_path) fast_export = os.path.join(os.path.dirname(__file__), '../../../../..', 'swh-storage-testdata', 'git-repos', 'example-submodule.fast-export.xz') xz = subprocess.Popen( ['xzcat'], stdin=open(fast_export, 'rb'), stdout=subprocess.PIPE, ) git = subprocess.Popen( ['git', 'fast-import', '--quiet'], stdin=xz.stdout, cwd=cls.repo_path, ) # flush stdout of xz xz.stdout.close() git.communicate() @classmethod def tearDownClass(cls): super().tearDownClass() shutil.rmtree(cls.repo_path) - print(cls.repo_path) def setUp(self): super().setUp() self.blob_id = b'28c6f4023d65f74e3b59a2dea3c4277ed9ee07b0' self.blob = { 'sha1_git': bytehex_to_hash(self.blob_id), 'sha1': hash_to_bytes('4850a3420a2262ff061cb296fb915430fa92301c'), 'sha256': hash_to_bytes('fee7c8a485a10321ad94b64135073cb5' '5f22cb9f57fa2417d2adfb09d310adef'), + 'blake2s256': hash_to_bytes('5d71873f42a137f6d89286e43677721e574' + '1fa05ce4cd5e3c7ea7c44d4c2d10b'), 'data': (b'[submodule "example-dependency"]\n' b'\tpath = example-dependency\n' b'\turl = https://github.com/githubtraining/' b'example-dependency.git\n'), 'length': 124, 'status': 'visible', - } + } self.blob_hidden = { 'sha1_git': bytehex_to_hash(self.blob_id), 'sha1': hash_to_bytes('4850a3420a2262ff061cb296fb915430fa92301c'), 'sha256': hash_to_bytes('fee7c8a485a10321ad94b64135073cb5' '5f22cb9f57fa2417d2adfb09d310adef'), + 'blake2s256': hash_to_bytes('5d71873f42a137f6d89286e43677721e574' + '1fa05ce4cd5e3c7ea7c44d4c2d10b'), 'length': 124, 'status': 'absent', 'reason': 'Content too large', 'origin': None, - } + } @istest def blob_to_content(self): content = converters.dulwich_blob_to_content(self.repo[self.blob_id]) self.assertEqual(self.blob, content) @istest def blob_to_content_absent(self): max_length = self.blob['length'] - 1 content = converters.dulwich_blob_to_content( self.repo[self.blob_id], max_content_size=max_length) self.assertEqual(self.blob_hidden, content) @istest def commit_to_revision(self): sha1 = b'9768d0b576dbaaecd80abedad6dfd0d72f1476da' revision = converters.dulwich_commit_to_revision(self.repo[sha1]) expected_revision = { 'id': hash_to_bytes('9768d0b576dbaaecd80abedad6dfd0d72f1476da'), 'directory': b'\xf0i\\./\xa7\xce\x9dW@#\xc3A7a\xa4s\xe5\x00\xca', 'type': 'git', 'committer': { 'name': b'Stefano Zacchiroli', 'fullname': b'Stefano Zacchiroli ', 'email': b'zack@upsilon.cc', }, 'author': { 'name': b'Stefano Zacchiroli', 'fullname': b'Stefano Zacchiroli ', 'email': b'zack@upsilon.cc', }, 'committer_date': { 'negative_utc': None, 'timestamp': 1443083765, 'offset': 120, }, 'message': b'add submodule dependency\n', 'metadata': None, 'date': { 'negative_utc': None, 'timestamp': 1443083765, 'offset': 120, }, 'parents': [ b'\xc3\xc5\x88q23`\x9f[\xbb\xb2\xd9\xe7\xf3\xfbJf\x0f?r' ], 'synthetic': False, } self.assertEquals(revision, expected_revision) @istest def author_line_to_author(self): tests = { b'a ': { 'name': b'a', 'email': b'b@c.com', 'fullname': b'a ', }, b'': { 'name': None, 'email': b'foo@bar.com', 'fullname': b'', }, b'malformed ': { 'name': b'trailing', 'email': b'sp@c.e', 'fullname': b'trailing ', }, b'no': { 'name': b'no', 'email': b'sp@c.e', 'fullname': b'no', }, b' <>': { 'name': b'', 'email': b'', 'fullname': b' <>', }, } for author in sorted(tests): parsed_author = tests[author] self.assertEquals(parsed_author, converters.parse_author(author)) diff --git a/version.txt b/version.txt index a61968d..bba90a3 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.31-0-ga582b76 \ No newline at end of file +v0.0.32-0-gaf7b3c4 \ No newline at end of file