diff --git a/PKG-INFO b/PKG-INFO index ba0aa5c..24dfc16 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.loader.git -Version: 0.0.9 +Version: 0.0.10 Summary: Software Heritage git loader Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/debian/control b/debian/control index 97483d2..e78ccae 100644 --- a/debian/control +++ b/debian/control @@ -1,24 +1,25 @@ Source: swh-loader-git Maintainer: Software Heritage developers Section: python Priority: optional Build-Depends: debhelper (>= 9), dh-python, python3-all, python3-nose, python3-pygit2, python3-retrying, python3-setuptools, python3-swh.core (>= 0.0.7~), - python3-swh.storage (>= 0.0.15~), + python3-swh.model, + python3-swh.storage (>= 0.0.20~), python3-vcversioner Standards-Version: 3.9.6 Homepage: https://forge.softwareheritage.org/diffusion/DLDG/ Package: python3-swh.loader.git Architecture: all Depends: python3-swh.core (>= 0.0.7~), - python3-swh.storage (>= 0.0.15~), + python3-swh.storage (>= 0.0.20~), ${misc:Depends}, ${python3:Depends} Description: Software Heritage Git loader diff --git a/requirements.txt b/requirements.txt index a29d1a0..d7697e0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ pygit2 retrying vcversioner swh.core >= 0.0.7 -swh.storage >= 0.0.15 +swh.model +swh.storage >= 0.0.20 diff --git a/swh.loader.git.egg-info/PKG-INFO b/swh.loader.git.egg-info/PKG-INFO index ba0aa5c..24dfc16 100644 --- a/swh.loader.git.egg-info/PKG-INFO +++ b/swh.loader.git.egg-info/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.loader.git -Version: 0.0.9 +Version: 0.0.10 Summary: Software Heritage git loader Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/swh.loader.git.egg-info/requires.txt b/swh.loader.git.egg-info/requires.txt index e119675..5f1dfe1 100644 --- a/swh.loader.git.egg-info/requires.txt +++ b/swh.loader.git.egg-info/requires.txt @@ -1,5 +1,6 @@ pygit2 retrying swh.core>=0.0.7 -swh.storage>=0.0.15 +swh.model +swh.storage>=0.0.20 vcversioner diff --git a/swh/loader/git/converters.py b/swh/loader/git/converters.py index aef8f7c..9136e31 100644 --- a/swh/loader/git/converters.py +++ b/swh/loader/git/converters.py @@ -1,162 +1,162 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Convert pygit2 objects to dictionaries suitable for swh.storage""" from pygit2 import GIT_OBJ_COMMIT from swh.core import hashutil from .utils import format_date HASH_ALGORITHMS = ['sha1', 'sha256'] def blob_to_content(id, repo, log=None, max_content_size=None, origin_id=None): """Format a blob as a content""" blob = repo[id] size = blob.size ret = { 'sha1_git': id.raw, 'length': blob.size, 'status': 'absent' } if max_content_size: if size > max_content_size: if log: log.info('Skipping content %s, too large (%s > %s)' % (id.hex, size, max_content_size), extra={ 'swh_type': 'loader_git_content_skip', 'swh_repo': repo.path, 'swh_id': id.hex, 'swh_size': size, }) ret['reason'] = 'Content too large' ret['origin'] = origin_id return ret data = blob.data hashes = hashutil.hashdata(data, HASH_ALGORITHMS) ret.update(hashes) ret['data'] = data ret['status'] = 'visible' return ret def tree_to_directory(id, repo, log=None): """Format a tree as a directory""" ret = { 'id': id.raw, } entries = [] ret['entries'] = entries entry_type_map = { 'tree': 'dir', 'blob': 'file', 'commit': 'rev', } for entry in repo[id]: entries.append({ 'type': entry_type_map[entry.type], 'perms': entry.filemode, 'name': entry._name, 'target': entry.id.raw, }) return ret def commit_to_revision(id, repo, log=None): """Format a commit as a revision""" commit = repo[id] author = commit.author committer = commit.committer return { 'id': id.raw, 'date': format_date(author), - 'date_offset': author.offset, 'committer_date': format_date(committer), - 'committer_date_offset': committer.offset, 'type': 'git', 'directory': commit.tree_id.raw, 'message': commit.raw_message, - 'author_name': author.raw_name, - 'author_email': author.raw_email, - 'committer_name': committer.raw_name, - 'committer_email': committer.raw_email, + 'metadata': None, + 'author': { + 'name': author.raw_name, + 'email': author.raw_email, + }, + 'committer': { + 'name': committer.raw_name, + 'email': committer.raw_email, + }, 'synthetic': False, 'parents': [p.raw for p in commit.parent_ids], } def annotated_tag_to_release(id, repo, log=None): """Format an annotated tag as a release""" tag = repo[id] tag_pointer = repo[tag.target] if tag_pointer.type != GIT_OBJ_COMMIT: if log: log.warn("Ignoring tag %s pointing at %s %s" % ( tag.id.hex, tag_pointer.__class__.__name__, tag_pointer.id.hex), extra={ 'swh_type': 'loader_git_tag_ignore', 'swh_repo': repo.path, 'swh_tag_id': tag.id.hex, 'swh_tag_dest': { 'type': tag_pointer.__class__.__name__, 'id': tag_pointer.id.hex, }, }) return - author = tag.tagger - - if not author: + if not tag.tagger: if log: log.warn("Tag %s has no author, using default values" % id.hex, extra={ 'swh_type': 'loader_git_tag_author_default', 'swh_repo': repo.path, 'swh_tag_id': tag.id.hex, }) - author_name = b'' - author_email = b'' + author = None date = None - date_offset = 0 else: - author_name = author.raw_name - author_email = author.raw_email - date = format_date(author) - date_offset = author.offset + author = { + 'name': tag.tagger.raw_name, + 'email': tag.tagger.raw_email, + } + date = format_date(tag.tagger) return { 'id': id.raw, 'date': date, - 'date_offset': date_offset, - 'revision': tag.target.raw, - 'comment': tag._message, - 'name': tag.name, - 'author_name': author_name, - 'author_email': author_email, + 'target': tag.target.raw, + 'target_type': 'revision', + 'message': tag._message, + 'name': tag.name.encode('utf-8'), + 'author': author, + 'metadata': None, 'synthetic': False, } def ref_to_occurrence(ref): """Format a reference as an occurrence""" return ref def origin_url_to_origin(origin_url): """Format a pygit2.Repository as an origin suitable for swh.storage""" return { 'type': 'git', 'url': origin_url, } diff --git a/swh/loader/git/tests/test_converters.py b/swh/loader/git/tests/test_converters.py index b66d59e..852f959 100644 --- a/swh/loader/git/tests/test_converters.py +++ b/swh/loader/git/tests/test_converters.py @@ -1,124 +1,132 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import shutil import subprocess import tempfile import unittest import datetime from nose.tools import istest import pygit2 import swh.loader.git.converters as converters from swh.core.hashutil import hex_to_hash class TestConverters(unittest.TestCase): @classmethod def setUpClass(cls): super().setUpClass() cls.repo_path = tempfile.mkdtemp() cls.repo = pygit2.init_repository(cls.repo_path, bare=True) fast_export = os.path.join(os.path.dirname(__file__), '../../../../..', 'swh-storage-testdata', 'git-repos', 'example-submodule.fast-export.xz') xz = subprocess.Popen( ['xzcat'], stdin=open(fast_export, 'rb'), stdout=subprocess.PIPE, ) git = subprocess.Popen( ['git', 'fast-import', '--quiet'], stdin=xz.stdout, cwd=cls.repo_path, ) # flush stdout of xz xz.stdout.close() git.communicate() @classmethod def tearDownClass(cls): super().tearDownClass() shutil.rmtree(cls.repo_path) print(cls.repo_path) def setUp(self): super().setUp() self.blob_id = pygit2.Oid( hex='28c6f4023d65f74e3b59a2dea3c4277ed9ee07b0') self.blob = { 'sha1_git': self.blob_id.raw, 'sha1': hex_to_hash('4850a3420a2262ff061cb296fb915430fa92301c'), 'sha256': hex_to_hash('fee7c8a485a10321ad94b64135073cb5' '5f22cb9f57fa2417d2adfb09d310adef'), 'data': (b'[submodule "example-dependency"]\n' b'\tpath = example-dependency\n' b'\turl = https://github.com/githubtraining/' b'example-dependency.git\n'), 'length': 124, 'status': 'visible', } self.blob_hidden = { 'sha1_git': self.blob_id.raw, 'length': 124, 'status': 'absent', 'reason': 'Content too large', 'origin': None, } @istest def blob_to_content(self): content = converters.blob_to_content(self.blob_id, self.repo) self.assertEqual(self.blob, content) @istest def blob_to_content_absent(self): max_length = self.blob['length'] - 1 content = converters.blob_to_content(self.blob_id, self.repo, max_content_size=max_length) self.assertEqual(self.blob_hidden, content) @istest def commit_to_revision(self): sha1 = '9768d0b576dbaaecd80abedad6dfd0d72f1476da' commit = self.repo.revparse_single(sha1) # when actual_revision = converters.commit_to_revision(commit.id, self.repo) + offset = datetime.timedelta(minutes=120) + tzoffset = datetime.timezone(offset) expected_revision = { - 'author_email': b'zack@upsilon.cc', 'id': hex_to_hash('9768d0b576dbaaecd80abedad6dfd0d72f1476da'), 'directory': b'\xf0i\\./\xa7\xce\x9dW@#\xc3A7a\xa4s\xe5\x00\xca', 'type': 'git', - 'committer_name': b'Stefano Zacchiroli', - 'date_offset': 120, - 'committer_email': b'zack@upsilon.cc', - 'committer_date': datetime.datetime(2015, 9, 24, 8, 36, 5, - tzinfo=datetime.timezone.utc), - 'author_name': b'Stefano Zacchiroli', + 'committer': { + 'name': b'Stefano Zacchiroli', + 'email': b'zack@upsilon.cc', + }, + 'author': { + 'name': b'Stefano Zacchiroli', + 'email': b'zack@upsilon.cc', + }, + 'committer_date': datetime.datetime(2015, 9, 24, 10, 36, 5, + tzinfo=tzoffset), 'message': b'add submodule dependency\n', - 'date': datetime.datetime(2015, 9, 24, 8, 36, 5, - tzinfo=datetime.timezone.utc), - 'committer_date_offset': 120, + 'metadata': None, + 'date': datetime.datetime(2015, 9, 24, 10, 36, 5, + tzinfo=tzoffset), 'parents': [ b'\xc3\xc5\x88q23`\x9f[\xbb\xb2\xd9\xe7\xf3\xfbJf\x0f?r' ], 'synthetic': False, } # then self.assertEquals(actual_revision, expected_revision) + self.assertEquals(offset, expected_revision['date'].utcoffset()) + self.assertEquals(offset, + expected_revision['committer_date'].utcoffset()) diff --git a/swh/loader/git/utils.py b/swh/loader/git/utils.py index 7ef91e5..e6fb155 100644 --- a/swh/loader/git/utils.py +++ b/swh/loader/git/utils.py @@ -1,89 +1,89 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import glob import os import subprocess from collections import defaultdict from pygit2 import Oid def format_date(signature): """Convert the date from a signature to a datetime""" - return datetime.datetime.fromtimestamp(signature.time, - datetime.timezone.utc) + tz = datetime.timezone(datetime.timedelta(minutes=signature.offset)) + return datetime.datetime.fromtimestamp(signature.time, tz) def list_objects_from_packfile_index(packfile_index): """List the objects indexed by this packfile, in packfile offset order. """ input_file = open(packfile_index, 'rb') with subprocess.Popen( ['/usr/bin/git', 'show-index'], stdin=input_file, stdout=subprocess.PIPE, ) as process: data = [] for line in process.stdout.readlines(): # git show-index returns the line as: # () line_components = line.split() offset = int(line_components[0]) object_id = line_components[1] data.append((offset, object_id)) yield from (Oid(hex=object_id.decode('ascii')) for _, object_id in sorted(data)) input_file.close() def simple_list_objects(repo): """List the objects in a given repository. Watch out for duplicates!""" objects_dir = os.path.join(repo.path, 'objects') # Git hashes are 40-character long objects_glob = os.path.join(objects_dir, '[0-9a-f]' * 2, '[0-9a-f]' * 38) packfile_dir = os.path.join(objects_dir, 'pack') if os.path.isdir(packfile_dir): for packfile_index in os.listdir(packfile_dir): if not packfile_index.endswith('.idx'): # Not an index file continue packfile_index_path = os.path.join(packfile_dir, packfile_index) yield from list_objects_from_packfile_index(packfile_index_path) for object_file in glob.glob(objects_glob): # Rebuild the object id as the last two components of the path yield Oid(hex=''.join(object_file.split(os.path.sep)[-2:])) def list_objects(repo): """List the objects in a given repository, removing duplicates""" seen = set() for oid in simple_list_objects(repo): if oid not in seen: yield oid seen.add(oid) def get_objects_per_object_type(repo): """Get all the (pygit2-parsed) objects from repo per object type""" objects_per_object_type = defaultdict(list) for object_id in list_objects(repo): object = repo[object_id] objects_per_object_type[object.type].append(object_id) return objects_per_object_type diff --git a/version.txt b/version.txt index 05e3d51..b3a0857 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.9-0-g551638e \ No newline at end of file +v0.0.10-0-g3d36175 \ No newline at end of file