diff --git a/PKG-INFO b/PKG-INFO index 726dc92..ae9713f 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.loader.git -Version: 0.0.31 +Version: 0.0.32 Summary: Software Heritage git loader Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/debian/control b/debian/control index f4e57fb..22c71ac 100644 --- a/debian/control +++ b/debian/control @@ -1,28 +1,29 @@ Source: swh-loader-git Maintainer: Software Heritage developers Section: python Priority: optional Build-Depends: debhelper (>= 9), dh-python, python3-all, - python3-nose, + python3-click, python3-dulwich, + python3-nose, python3-retrying, python3-setuptools, - python3-click, python3-swh.core (>= 0.0.7~), - python3-swh.model (>= 0.0.14~), - python3-swh.scheduler, - python3-swh.storage (>= 0.0.76~), + python3-swh.model (>= 0.0.15~), + python3-swh.scheduler (>= 0.0.14~), + python3-swh.storage (>= 0.0.83~), python3-vcversioner Standards-Version: 3.9.6 Homepage: https://forge.softwareheritage.org/diffusion/DLDG/ Package: python3-swh.loader.git Architecture: all Depends: python3-swh.core (>= 0.0.7~), - python3-swh.storage (>= 0.0.76~), - python3-swh.model (>= 0.0.14~), + python3-swh.model (>= 0.0.15~), + python3-swh.scheduler (>= 0.0.14~), + python3-swh.storage (>= 0.0.83~), ${misc:Depends}, ${python3:Depends} Description: Software Heritage Git loader diff --git a/requirements-swh.txt b/requirements-swh.txt index 498ea27..f0dd184 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,4 +1,4 @@ swh.core >= 0.0.7 -swh.model >= 0.0.14 -swh.scheduler -swh.storage >= 0.0.76 +swh.model >= 0.0.15 +swh.scheduler >= 0.0.14 +swh.storage >= 0.0.83 diff --git a/swh.loader.git.egg-info/PKG-INFO b/swh.loader.git.egg-info/PKG-INFO index 726dc92..ae9713f 100644 --- a/swh.loader.git.egg-info/PKG-INFO +++ b/swh.loader.git.egg-info/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.loader.git -Version: 0.0.31 +Version: 0.0.32 Summary: Software Heritage git loader Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/swh.loader.git.egg-info/requires.txt b/swh.loader.git.egg-info/requires.txt index f258b7b..9e96a57 100644 --- a/swh.loader.git.egg-info/requires.txt +++ b/swh.loader.git.egg-info/requires.txt @@ -1,8 +1,8 @@ click dulwich retrying swh.core>=0.0.7 -swh.model>=0.0.14 -swh.scheduler -swh.storage>=0.0.76 +swh.model>=0.0.15 +swh.scheduler>=0.0.14 +swh.storage>=0.0.83 vcversioner diff --git a/swh/loader/git/converters.py b/swh/loader/git/converters.py index 2da671d..fc28332 100644 --- a/swh/loader/git/converters.py +++ b/swh/loader/git/converters.py @@ -1,229 +1,229 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Convert dulwich objects to dictionaries suitable for swh.storage""" from swh.model import hashutil -HASH_ALGORITHMS = hashutil.ALGORITHMS - {'sha1_git'} +HASH_ALGORITHMS = hashutil.DEFAULT_ALGORITHMS - {'sha1_git'} def origin_url_to_origin(origin_url): """Format a pygit2.Repository as an origin suitable for swh.storage""" return { 'type': 'git', 'url': origin_url, } def dulwich_blob_to_content_id(blob): """Convert a dulwich blob to a Software Heritage content id""" if blob.type_name != b'blob': return size = blob.raw_length() ret = { 'sha1_git': blob.sha().digest(), 'length': size, } data = blob.as_raw_string() ret.update(hashutil.hash_data(data, HASH_ALGORITHMS)) return ret def dulwich_blob_to_content(blob, log=None, max_content_size=None, origin_id=None): """Convert a dulwich blob to a Software Heritage content""" if blob.type_name != b'blob': return ret = dulwich_blob_to_content_id(blob) size = ret['length'] if max_content_size: if size > max_content_size: id = hashutil.hash_to_hex(ret['sha1_git']) if log: log.info('Skipping content %s, too large (%s > %s)' % (id, size, max_content_size), extra={ 'swh_type': 'loader_git_content_skip', 'swh_id': id, 'swh_size': size, }) ret['status'] = 'absent' ret['reason'] = 'Content too large' ret['origin'] = origin_id return ret data = blob.as_raw_string() ret['data'] = data ret['status'] = 'visible' return ret def dulwich_tree_to_directory(tree, log=None): """Format a tree as a directory""" if tree.type_name != b'tree': return ret = { 'id': tree.sha().digest(), } entries = [] ret['entries'] = entries entry_mode_map = { 0o040000: 'dir', 0o160000: 'rev', 0o100644: 'file', 0o100755: 'file', 0o120000: 'file', } for entry in tree.iteritems(): entries.append({ 'type': entry_mode_map.get(entry.mode, 'file'), 'perms': entry.mode, 'name': entry.path, 'target': hashutil.hash_to_bytes(entry.sha.decode('ascii')), }) return ret def parse_author(name_email): """Parse an author line""" if name_email is None: return None try: open_bracket = name_email.index(b'<') except ValueError: name = email = None else: raw_name = name_email[:open_bracket] raw_email = name_email[open_bracket+1:] if not raw_name: name = None elif raw_name.endswith(b' '): name = raw_name[:-1] else: name = raw_name try: close_bracket = raw_email.index(b'>') except ValueError: email = None else: email = raw_email[:close_bracket] return { 'name': name, 'email': email, 'fullname': name_email, } def dulwich_tsinfo_to_timestamp(timestamp, timezone, timezone_neg_utc): """Convert the dulwich timestamp information to a structure compatible with Software Heritage""" return { 'timestamp': timestamp, 'offset': timezone // 60, 'negative_utc': timezone_neg_utc if timezone == 0 else None, } def dulwich_commit_to_revision(commit, log=None): if commit.type_name != b'commit': return ret = { 'id': commit.sha().digest(), 'author': parse_author(commit.author), 'date': dulwich_tsinfo_to_timestamp( commit.author_time, commit.author_timezone, commit._author_timezone_neg_utc, ), 'committer': parse_author(commit.committer), 'committer_date': dulwich_tsinfo_to_timestamp( commit.commit_time, commit.commit_timezone, commit._commit_timezone_neg_utc, ), 'type': 'git', 'directory': bytes.fromhex(commit.tree.decode()), 'message': commit.message, 'metadata': None, 'synthetic': False, 'parents': [bytes.fromhex(p.decode()) for p in commit.parents], } git_metadata = [] if commit.encoding is not None: git_metadata.append(['encoding', commit.encoding]) if commit.mergetag: for mergetag in commit.mergetag: raw_string = mergetag.as_raw_string() assert raw_string.endswith(b'\n') git_metadata.append(['mergetag', raw_string[:-1]]) if commit.extra: git_metadata.extend([k.decode('utf-8'), v] for k, v in commit.extra) if commit.gpgsig: git_metadata.append(['gpgsig', commit.gpgsig]) if git_metadata: ret['metadata'] = { 'extra_headers': git_metadata, } return ret DULWICH_TYPES = { b'blob': 'content', b'tree': 'directory', b'commit': 'revision', b'tag': 'release', } def dulwich_tag_to_release(tag, log=None): if tag.type_name != b'tag': return target_type, target = tag.object ret = { 'id': tag.sha().digest(), 'name': tag.name, 'target': bytes.fromhex(target.decode()), 'target_type': DULWICH_TYPES[target_type.type_name], 'message': tag._message, 'metadata': None, 'synthetic': False, } if tag.tagger: ret['author'] = parse_author(tag.tagger) ret['date'] = dulwich_tsinfo_to_timestamp( tag.tag_time, tag.tag_timezone, tag._tag_timezone_neg_utc, ) else: ret['author'] = ret['date'] = None return ret diff --git a/swh/loader/git/tasks.py b/swh/loader/git/tasks.py index f941e08..5f449af 100644 --- a/swh/loader/git/tasks.py +++ b/swh/loader/git/tasks.py @@ -1,70 +1,70 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import dateutil.parser from swh.scheduler.task import Task from .loader import GitLoader, GitLoaderFromArchive from .updater import BulkUpdater from .reader import GitSha1RemoteReaderAndSendToQueue # TODO: rename to LoadRemoteGitRepository class UpdateGitRepository(Task): """Import a git repository from a remote location""" task_queue = 'swh_loader_git' - def run(self, repo_url, base_url=None): + def run_task(self, repo_url, base_url=None): """Import a git repository""" loader = BulkUpdater() loader.log = self.log return loader.load(repo_url, base_url) class LoadDiskGitRepository(Task): """Import a git repository from disk""" task_queue = 'swh_loader_git_express' - def run(self, origin_url, directory, date): + def run_task(self, origin_url, directory, date): """Import a git repository, cloned in `directory` from `origin_url` at `date`.""" loader = GitLoader() loader.log = self.log return loader.load(origin_url, directory, dateutil.parser.parse(date)) class UncompressAndLoadDiskGitRepository(Task): """Import a git repository from a zip archive""" task_queue = 'swh_loader_git_archive' - def run(self, origin_url, archive_path, date): + def run_task(self, origin_url, archive_path, date): """1. Uncompress an archive repository in a local and temporary folder 2. Load it through the git disk loader 3. Clean up the temporary folder """ loader = GitLoaderFromArchive() loader.log = self.log return loader.load( origin_url, archive_path, dateutil.parser.parse(date)) class ReaderGitRepository(Task): task_queue = 'swh_reader_git' - def run(self, repo_url, base_url=None): + def run_task(self, repo_url, base_url=None): """Read a git repository from a remote location and send sha1 to archival. """ loader = GitSha1RemoteReaderAndSendToQueue() loader.log = self.log return loader.load(repo_url) diff --git a/swh/loader/git/tests/test_converters.py b/swh/loader/git/tests/test_converters.py index bca61d2..c5ca08e 100644 --- a/swh/loader/git/tests/test_converters.py +++ b/swh/loader/git/tests/test_converters.py @@ -1,173 +1,176 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import shutil import subprocess import tempfile import unittest from nose.tools import istest import dulwich.repo import swh.loader.git.converters as converters from swh.model.hashutil import bytehex_to_hash, hash_to_bytes class TestConverters(unittest.TestCase): @classmethod def setUpClass(cls): super().setUpClass() cls.repo_path = tempfile.mkdtemp() cls.repo = dulwich.repo.Repo.init_bare(cls.repo_path) fast_export = os.path.join(os.path.dirname(__file__), '../../../../..', 'swh-storage-testdata', 'git-repos', 'example-submodule.fast-export.xz') xz = subprocess.Popen( ['xzcat'], stdin=open(fast_export, 'rb'), stdout=subprocess.PIPE, ) git = subprocess.Popen( ['git', 'fast-import', '--quiet'], stdin=xz.stdout, cwd=cls.repo_path, ) # flush stdout of xz xz.stdout.close() git.communicate() @classmethod def tearDownClass(cls): super().tearDownClass() shutil.rmtree(cls.repo_path) - print(cls.repo_path) def setUp(self): super().setUp() self.blob_id = b'28c6f4023d65f74e3b59a2dea3c4277ed9ee07b0' self.blob = { 'sha1_git': bytehex_to_hash(self.blob_id), 'sha1': hash_to_bytes('4850a3420a2262ff061cb296fb915430fa92301c'), 'sha256': hash_to_bytes('fee7c8a485a10321ad94b64135073cb5' '5f22cb9f57fa2417d2adfb09d310adef'), + 'blake2s256': hash_to_bytes('5d71873f42a137f6d89286e43677721e574' + '1fa05ce4cd5e3c7ea7c44d4c2d10b'), 'data': (b'[submodule "example-dependency"]\n' b'\tpath = example-dependency\n' b'\turl = https://github.com/githubtraining/' b'example-dependency.git\n'), 'length': 124, 'status': 'visible', - } + } self.blob_hidden = { 'sha1_git': bytehex_to_hash(self.blob_id), 'sha1': hash_to_bytes('4850a3420a2262ff061cb296fb915430fa92301c'), 'sha256': hash_to_bytes('fee7c8a485a10321ad94b64135073cb5' '5f22cb9f57fa2417d2adfb09d310adef'), + 'blake2s256': hash_to_bytes('5d71873f42a137f6d89286e43677721e574' + '1fa05ce4cd5e3c7ea7c44d4c2d10b'), 'length': 124, 'status': 'absent', 'reason': 'Content too large', 'origin': None, - } + } @istest def blob_to_content(self): content = converters.dulwich_blob_to_content(self.repo[self.blob_id]) self.assertEqual(self.blob, content) @istest def blob_to_content_absent(self): max_length = self.blob['length'] - 1 content = converters.dulwich_blob_to_content( self.repo[self.blob_id], max_content_size=max_length) self.assertEqual(self.blob_hidden, content) @istest def commit_to_revision(self): sha1 = b'9768d0b576dbaaecd80abedad6dfd0d72f1476da' revision = converters.dulwich_commit_to_revision(self.repo[sha1]) expected_revision = { 'id': hash_to_bytes('9768d0b576dbaaecd80abedad6dfd0d72f1476da'), 'directory': b'\xf0i\\./\xa7\xce\x9dW@#\xc3A7a\xa4s\xe5\x00\xca', 'type': 'git', 'committer': { 'name': b'Stefano Zacchiroli', 'fullname': b'Stefano Zacchiroli ', 'email': b'zack@upsilon.cc', }, 'author': { 'name': b'Stefano Zacchiroli', 'fullname': b'Stefano Zacchiroli ', 'email': b'zack@upsilon.cc', }, 'committer_date': { 'negative_utc': None, 'timestamp': 1443083765, 'offset': 120, }, 'message': b'add submodule dependency\n', 'metadata': None, 'date': { 'negative_utc': None, 'timestamp': 1443083765, 'offset': 120, }, 'parents': [ b'\xc3\xc5\x88q23`\x9f[\xbb\xb2\xd9\xe7\xf3\xfbJf\x0f?r' ], 'synthetic': False, } self.assertEquals(revision, expected_revision) @istest def author_line_to_author(self): tests = { b'a ': { 'name': b'a', 'email': b'b@c.com', 'fullname': b'a ', }, b'': { 'name': None, 'email': b'foo@bar.com', 'fullname': b'', }, b'malformed ': { 'name': b'trailing', 'email': b'sp@c.e', 'fullname': b'trailing ', }, b'no': { 'name': b'no', 'email': b'sp@c.e', 'fullname': b'no', }, b' <>': { 'name': b'', 'email': b'', 'fullname': b' <>', }, } for author in sorted(tests): parsed_author = tests[author] self.assertEquals(parsed_author, converters.parse_author(author)) diff --git a/version.txt b/version.txt index a61968d..bba90a3 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.31-0-ga582b76 \ No newline at end of file +v0.0.32-0-gaf7b3c4 \ No newline at end of file