diff --git a/README b/README index 617f8ba..f246fb7 100644 --- a/README +++ b/README @@ -1,106 +1,82 @@ The Software Heritage Git Loader is a tool and a library to walk a local Git repository and inject into the SWH dataset all contained files that weren't known before. License ======= This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. Dependencies ============ Runtime ------- - python3 -- python3-pygit2 +- python3-dulwich +- python3-retrying - python3-swh.core +- python3-swh.model - python3-swh.storage Test ---- - python3-nose Requirements ============ - implementation language, Python3 - coding guidelines: conform to PEP8 -- Git access: via libgit2/pygit +- Git access: via dulwich Configuration ============= -bin/swh-loader-git takes one argument: a configuration file in .ini format. +You can run the loader or the updater directly by calling python3 -m swh.loader.git.{loader,updater}. + +Both tools expect a configuration file in .ini format to be present in ~/.config/swh/loader/git-{loader,updater}.ini The configuration file contains the following directives: ``` [main] # the storage class used. one of remote_storage, local_storage storage_class = remote_storage # arguments passed to the storage class # for remote_storage: URI of the storage server storage_args = http://localhost:5000/ # for local_storage: database connection string and root of the # storage, comma separated # storage_args = dbname=softwareheritage-dev, /tmp/swh/storage -# The path to the repository to load -repo_path = /tmp/git_repo - -# The URL of the origin for the repo -origin_url = https://github.com/hylang/hy - -# The UUID of the authority that dated the validity of the repo -authority = 5f4d4c51-498a-4e28-88b3-b3e4e8396cba - -# The validity date of the refs in the given repo, in Postgres -# timestamptz format -validity = 2015-01-01 00:00:00+00 - # Whether to send the given types of objects send_contents = True send_directories = True send_revisions = True send_releases = True send_occurrences = True # The size of the packets sent to storage for each kind of object content_packet_size = 100000 content_packet_size_bytes = 1073741824 directory_packet_size = 25000 revision_packet_size = 100000 release_packet_size = 100000 occurrence_packet_size = 100000 ``` - -bin/swh-loader-git-multi takes the same arguments, and adds: - -``` -[main] -# database connection string to the lister-github database -lister_db = dbname=lister-github - -# base path of the github repositories -repo_basepath = /srv/storage/space/data/github - -# Whether to run the mass loading or just list the repos -dry_run = False - -``` diff --git a/bin/swh-loader-git b/bin/swh-loader-git deleted file mode 100755 index 9ed1d92..0000000 --- a/bin/swh-loader-git +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env python3 - -import logging -import sys - -from swh.loader.git import BulkLoader - -ADDITIONAL_CONFIG = { - 'repo_path': ('str', None), - - 'origin_url': ('str', 'file:///dev/null'), - 'authority': ('str', '5f4d4c51-498a-4e28-88b3-b3e4e8396cba'), - 'validity': ('str', '2015-01-01 00:00:00+00'), -} - -my_config = BulkLoader.parse_config_file( - config_filename=sys.argv[1], additional_configs=[ADDITIONAL_CONFIG]) - -logging.basicConfig( - level=logging.DEBUG, - format='%(asctime)s %(name)s %(levelname)s %(message)s', - handlers=[ - logging.StreamHandler(), - ], -) - -requests_log = logging.getLogger("requests") -requests_log.setLevel(logging.CRITICAL) - -loader = BulkLoader(my_config) -loader.process(my_config['repo_path'], - my_config['origin_url'], - my_config['authority'], - my_config['validity']) diff --git a/bin/swh-loader-git-multi b/bin/swh-loader-git-multi deleted file mode 100755 index 715828b..0000000 --- a/bin/swh-loader-git-multi +++ /dev/null @@ -1,117 +0,0 @@ -#!/usr/bin/env python3 - -import datetime -import logging -import os -import sys - -import psycopg2 -import psycopg2.extras - -from swh.core import config -from swh.loader.git import BulkLoader - -DEFAULT_CONFIG = { - 'lister_db': ('str', 'dbname=lister-github'), - 'repo_basepath': ('str', '/srv/storage/space/data/github'), - 'dry_run': ('bool', True), - - 'db': ('str', 'dbname=softwareheritage-dev'), - 'storage_base': ('str', '/tmp/swh-loader-git/test'), - 'repo_path': ('str', None), - - 'origin_url': ('str', 'file:///dev/null'), - 'authority': ('str', '5f4d4c51-498a-4e28-88b3-b3e4e8396cba'), - 'validity': ('str', '2015-01-01 00:00:00+00'), - - 'send_contents': ('bool', True), - 'send_directories': ('bool', True), - 'send_revisions': ('bool', True), - 'send_releases': ('bool', True), - 'send_occurrences': ('bool', True), - - 'content_packet_size': ('int', 100000), - 'directory_packet_size': ('int', 25000), - 'revision_packet_size': ('int', 100000), - 'release_packet_size': ('int', 100000), - 'occurrence_packet_size': ('int', 100000), -} - -logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(name)s %(levelname)s %(message)s') - -logger = logging.getLogger('test-bulk-loader-all') - -base_config = config.read(sys.argv[1], DEFAULT_CONFIG) - - -def process_one_repository(base_config, repo_name): - my_config = base_config.copy() - - basepath = my_config['repo_basepath'] - - my_path = os.path.join(basepath, repo_name[0], repo_name) - my_config['repo_path'] = my_path - - if not os.path.exists(my_path): - logger.error('Repository %s does not exist at %s' % (repo_name, - my_path)) - return - - witness_file = os.path.join(my_path, 'witness') - if not os.path.exists(witness_file): - logger.warn('No witness file for repository %s, using default value ' - '%s' % (repo_name, my_config['validity'])) - else: - validity_timestamp = os.stat(witness_file).st_mtime - my_config['validity'] = "%s+00" % datetime.datetime.utcfromtimestamp( - validity_timestamp) - - logger.info('Processing repository %s fetched on %s' % ( - repo_name, - my_config['validity'])) - - if my_config['dry_run']: - return - - loader = BulkLoader(my_config) - origin = loader.get_origin() - if origin['id']: - logger.info('Repository %s already loaded (origin id=%s), skipping' % ( - repo_name, origin['id'])) - return - loader.process() - - -def list_random_repos(config): - db = psycopg2.connect(config['lister_db'], - cursor_factory=psycopg2.extras.NamedTupleCursor) - query = '''select full_name from repos_random_sample(0.001) r - inner join crawl_history c on r.id = c.repo where status=true''' - cur = db.cursor() - cur.execute(query) - ret = cur.fetchall() - cur.close() - db.close() - - return ret - -processed_repos = set() - -print('Needs updating for new BulkLoader') -sys.exit(1) - -while True: - logger.info('listing 0.001% random repos') - random_repos = list_random_repos(base_config) - logger.info('done') - for repo in random_repos: - repo_name = repo.full_name - if repo_name not in processed_repos: - try: - process_one_repository(base_config, repo_name) - except Exception: - logger.exception('Failed processing repository %s' % - repo_name) - finally: - processed_repos.add(repo_name) diff --git a/debian/control b/debian/control index c6d5571..2599b49 100644 --- a/debian/control +++ b/debian/control @@ -1,28 +1,27 @@ Source: swh-loader-git Maintainer: Software Heritage developers Section: python Priority: optional Build-Depends: debhelper (>= 9), dh-python, python3-all, python3-nose, python3-dulwich, - python3-pygit2, python3-retrying, python3-setuptools, python3-swh.core (>= 0.0.7~), python3-swh.model (>= 0.0.3~), python3-swh.scheduler, python3-swh.storage (>= 0.0.37~), python3-vcversioner Standards-Version: 3.9.6 Homepage: https://forge.softwareheritage.org/diffusion/DLDG/ Package: python3-swh.loader.git Architecture: all Depends: python3-swh.core (>= 0.0.7~), python3-swh.storage (>= 0.0.37~), python3-swh.model (>= 0.0.3~), ${misc:Depends}, ${python3:Depends} Description: Software Heritage Git loader diff --git a/requirements.txt b/requirements.txt index eac3ae2..0bcbab8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,7 @@ dulwich -pygit2 retrying vcversioner swh.core >= 0.0.7 swh.model >= 0.0.3 swh.scheduler swh.storage >= 0.0.37 diff --git a/scratch/repo_walk.py b/scratch/repo_walk.py deleted file mode 100755 index 58c0c23..0000000 --- a/scratch/repo_walk.py +++ /dev/null @@ -1,20 +0,0 @@ -#!/usr/bin/env python3 - -import pygit2 -from pygit2 import GIT_SORT_TOPOLOGICAL - -import sys - -repo_path = sys.argv[1] -ref_name = sys.argv[2] - -repo = pygit2.Repository(repo_path) - -ref = repo.lookup_reference(ref_name) - -head_rev = repo[ref.target] - -for rev in repo.walk(head_rev.hex, GIT_SORT_TOPOLOGICAL): - print(rev.hex, rev.tree.hex) - for tree_entry in rev.tree: - print(repo.get(tree_entry.oid)) diff --git a/swh/loader/git/converters.py b/swh/loader/git/converters.py index 4beaf04..657c525 100644 --- a/swh/loader/git/converters.py +++ b/swh/loader/git/converters.py @@ -1,363 +1,213 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -"""Convert pygit2 objects to dictionaries suitable for swh.storage""" - -from pygit2 import GIT_OBJ_COMMIT +"""Convert dulwich objects to dictionaries suitable for swh.storage""" from swh.core import hashutil -from .utils import format_date - -HASH_ALGORITHMS = ['sha1', 'sha256'] - - -def blob_to_content(id, repo, log=None, max_content_size=None, origin_id=None): - """Format a blob as a content""" - blob = repo[id] - size = blob.size - ret = { - 'sha1_git': id.raw, - 'length': blob.size, - 'status': 'absent' - } - - if max_content_size: - if size > max_content_size: - if log: - log.info('Skipping content %s, too large (%s > %s)' % - (id.hex, size, max_content_size), extra={ - 'swh_type': 'loader_git_content_skip', - 'swh_repo': repo.path, - 'swh_id': id.hex, - 'swh_size': size, - }) - ret['reason'] = 'Content too large' - ret['origin'] = origin_id - return ret - - data = blob.data - hashes = hashutil.hashdata(data, HASH_ALGORITHMS) - ret.update(hashes) - ret['data'] = data - ret['status'] = 'visible' - - return ret - - -def tree_to_directory(id, repo, log=None): - """Format a tree as a directory""" - ret = { - 'id': id.raw, - } - entries = [] - ret['entries'] = entries - - entry_type_map = { - 'tree': 'dir', - 'blob': 'file', - 'commit': 'rev', - } - - for entry in repo[id]: - entries.append({ - 'type': entry_type_map[entry.type], - 'perms': entry.filemode, - 'name': entry._name, - 'target': entry.id.raw, - }) - - return ret - -def commit_to_revision(id, repo, log=None): - """Format a commit as a revision""" - commit = repo[id] - - author = commit.author - committer = commit.committer - return { - 'id': id.raw, - 'date': format_date(author), - 'committer_date': format_date(committer), - 'type': 'git', - 'directory': commit.tree_id.raw, - 'message': commit.raw_message, - 'metadata': None, - 'author': { - 'name': author.raw_name, - 'email': author.raw_email, - }, - 'committer': { - 'name': committer.raw_name, - 'email': committer.raw_email, - }, - 'synthetic': False, - 'parents': [p.raw for p in commit.parent_ids], - } - - -def annotated_tag_to_release(id, repo, log=None): - """Format an annotated tag as a release""" - tag = repo[id] - - tag_pointer = repo[tag.target] - if tag_pointer.type != GIT_OBJ_COMMIT: - if log: - log.warn("Ignoring tag %s pointing at %s %s" % ( - tag.id.hex, tag_pointer.__class__.__name__, - tag_pointer.id.hex), extra={ - 'swh_type': 'loader_git_tag_ignore', - 'swh_repo': repo.path, - 'swh_tag_id': tag.id.hex, - 'swh_tag_dest': { - 'type': tag_pointer.__class__.__name__, - 'id': tag_pointer.id.hex, - }, - }) - return - - if not tag.tagger: - if log: - log.warn("Tag %s has no author, using default values" - % id.hex, extra={ - 'swh_type': 'loader_git_tag_author_default', - 'swh_repo': repo.path, - 'swh_tag_id': tag.id.hex, - }) - author = None - date = None - else: - author = { - 'name': tag.tagger.raw_name, - 'email': tag.tagger.raw_email, - } - date = format_date(tag.tagger) - - return { - 'id': id.raw, - 'date': date, - 'target': tag.target.raw, - 'target_type': 'revision', - 'message': tag._message, - 'name': tag.name.raw, - 'author': author, - 'metadata': None, - 'synthetic': False, - } - - -def ref_to_occurrence(ref): - """Format a reference as an occurrence""" - occ = ref.copy() - if 'branch' in ref: - branch = ref['branch'] - if isinstance(branch, str): - occ['branch'] = branch.encode('utf-8') - else: - occ['branch'] = branch - return occ +HASH_ALGORITHMS = hashutil.ALGORITHMS - {'sha1_git'} def origin_url_to_origin(origin_url): """Format a pygit2.Repository as an origin suitable for swh.storage""" return { 'type': 'git', 'url': origin_url, } def dulwich_blob_to_content(blob, log=None, max_content_size=None, origin_id=None): """Convert a dulwich blob to a Software Heritage content""" if blob.type_name != b'blob': return size = blob.raw_length() ret = { 'sha1_git': blob.sha().digest(), 'length': size, 'status': 'absent' } if max_content_size: if size > max_content_size: if log: log.info('Skipping content %s, too large (%s > %s)' % (blob.id.encode(), size, max_content_size), extra={ 'swh_type': 'loader_git_content_skip', 'swh_id': id.hex, 'swh_size': size, }) ret['reason'] = 'Content too large' ret['origin'] = origin_id return ret data = blob.as_raw_string() hashes = hashutil.hashdata(data, HASH_ALGORITHMS) ret.update(hashes) ret['data'] = data ret['status'] = 'visible' return ret def dulwich_tree_to_directory(tree, log=None): """Format a tree as a directory""" if tree.type_name != b'tree': return ret = { 'id': tree.sha().digest(), } entries = [] ret['entries'] = entries entry_mode_map = { 0o040000: 'dir', 0o160000: 'rev', 0o100644: 'file', 0o100755: 'file', 0o120000: 'file', } for entry in tree.iteritems(): entries.append({ 'type': entry_mode_map.get(entry.mode, 'file'), 'perms': entry.mode, 'name': entry.path, 'target': hashutil.hex_to_hash(entry.sha.decode('ascii')), }) return ret def parse_author(name_email): """Parse an author line""" if name_email is None: return None try: open_bracket = name_email.index(b'<') except ValueError: name = email = None else: raw_name = name_email[:open_bracket] raw_email = name_email[open_bracket+1:] if not raw_name: name = None elif raw_name.endswith(b' '): name = raw_name[:-1] else: name = raw_name try: close_bracket = raw_email.index(b'>') except ValueError: email = None else: email = raw_email[:close_bracket] return { 'name': name, 'email': email, 'fullname': name_email, } def dulwich_tsinfo_to_timestamp(timestamp, timezone, timezone_neg_utc): """Convert the dulwich timestamp information to a structure compatible with Software Heritage""" return { 'timestamp': timestamp, 'offset': timezone // 60, 'negative_utc': timezone_neg_utc if timezone == 0 else None, } def dulwich_commit_to_revision(commit, log=None): if commit.type_name != b'commit': return ret = { 'id': commit.sha().digest(), 'author': parse_author(commit.author), 'date': dulwich_tsinfo_to_timestamp( commit.author_time, commit.author_timezone, commit._author_timezone_neg_utc, ), 'committer': parse_author(commit.committer), 'committer_date': dulwich_tsinfo_to_timestamp( commit.commit_time, commit.commit_timezone, commit._commit_timezone_neg_utc, ), 'type': 'git', 'directory': bytes.fromhex(commit.tree.decode()), 'message': commit.message, 'metadata': None, 'synthetic': False, 'parents': [bytes.fromhex(p.decode()) for p in commit.parents], } git_metadata = [] if commit.encoding is not None: git_metadata.append(['encoding', commit.encoding]) if commit.mergetag: for mergetag in commit.mergetag: git_metadata.append(['mergetag', mergetag.as_raw_string()]) if commit.extra: git_metadata.extend([k.decode('utf-8'), v] for k, v in commit.extra) if commit.gpgsig: git_metadata.append(['gpgsig', commit.gpgsig]) if git_metadata: ret['metadata'] = { 'extra_headers': git_metadata, } return ret DULWICH_TYPES = { b'blob': 'content', b'tree': 'directory', b'commit': 'revision', b'tag': 'release', } def dulwich_tag_to_release(tag, log=None): if tag.type_name != b'tag': return target_type, target = tag.object ret = { 'id': tag.sha().digest(), 'name': tag.name, 'target': bytes.fromhex(target.decode()), 'target_type': DULWICH_TYPES[target_type.type_name], 'message': tag._message, 'metadata': None, 'synthetic': False, } if tag.tagger: ret['author'] = parse_author(tag.tagger) ret['date'] = dulwich_tsinfo_to_timestamp( tag.tag_time, tag.tag_timezone, tag._tag_timezone_neg_utc, ) else: ret['author'] = ret['date'] = None return ret diff --git a/swh/loader/git/tests/test_converters.py b/swh/loader/git/tests/test_converters.py index c634333..fddf249 100644 --- a/swh/loader/git/tests/test_converters.py +++ b/swh/loader/git/tests/test_converters.py @@ -1,199 +1,170 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import shutil import subprocess import tempfile import unittest -import datetime from nose.tools import istest -import pygit2 +import dulwich.repo import swh.loader.git.converters as converters -from swh.core.hashutil import hex_to_hash +from swh.core.hashutil import bytehex_to_hash, hex_to_hash class TestConverters(unittest.TestCase): @classmethod def setUpClass(cls): super().setUpClass() cls.repo_path = tempfile.mkdtemp() - cls.repo = pygit2.init_repository(cls.repo_path, bare=True) + cls.repo = dulwich.repo.Repo.init_bare(cls.repo_path) fast_export = os.path.join(os.path.dirname(__file__), '../../../../..', 'swh-storage-testdata', 'git-repos', 'example-submodule.fast-export.xz') xz = subprocess.Popen( ['xzcat'], stdin=open(fast_export, 'rb'), stdout=subprocess.PIPE, ) git = subprocess.Popen( ['git', 'fast-import', '--quiet'], stdin=xz.stdout, cwd=cls.repo_path, ) # flush stdout of xz xz.stdout.close() git.communicate() @classmethod def tearDownClass(cls): super().tearDownClass() shutil.rmtree(cls.repo_path) print(cls.repo_path) def setUp(self): super().setUp() - self.blob_id = pygit2.Oid( - hex='28c6f4023d65f74e3b59a2dea3c4277ed9ee07b0') + self.blob_id = b'28c6f4023d65f74e3b59a2dea3c4277ed9ee07b0' self.blob = { - 'sha1_git': self.blob_id.raw, + 'sha1_git': bytehex_to_hash(self.blob_id), 'sha1': hex_to_hash('4850a3420a2262ff061cb296fb915430fa92301c'), 'sha256': hex_to_hash('fee7c8a485a10321ad94b64135073cb5' '5f22cb9f57fa2417d2adfb09d310adef'), 'data': (b'[submodule "example-dependency"]\n' b'\tpath = example-dependency\n' b'\turl = https://github.com/githubtraining/' b'example-dependency.git\n'), 'length': 124, 'status': 'visible', } self.blob_hidden = { - 'sha1_git': self.blob_id.raw, + 'sha1_git': bytehex_to_hash(self.blob_id), 'length': 124, 'status': 'absent', 'reason': 'Content too large', 'origin': None, } @istest def blob_to_content(self): - content = converters.blob_to_content(self.blob_id, self.repo) + content = converters.dulwich_blob_to_content(self.repo[self.blob_id]) self.assertEqual(self.blob, content) @istest def blob_to_content_absent(self): max_length = self.blob['length'] - 1 - content = converters.blob_to_content(self.blob_id, self.repo, - max_content_size=max_length) + content = converters.dulwich_blob_to_content( + self.repo[self.blob_id], max_content_size=max_length) self.assertEqual(self.blob_hidden, content) @istest def commit_to_revision(self): - sha1 = '9768d0b576dbaaecd80abedad6dfd0d72f1476da' - commit = self.repo.revparse_single(sha1) + sha1 = b'9768d0b576dbaaecd80abedad6dfd0d72f1476da' - # when - actual_revision = converters.commit_to_revision(commit.id, self.repo) + revision = converters.dulwich_commit_to_revision(self.repo[sha1]) - offset = datetime.timedelta(minutes=120) - tzoffset = datetime.timezone(offset) expected_revision = { 'id': hex_to_hash('9768d0b576dbaaecd80abedad6dfd0d72f1476da'), 'directory': b'\xf0i\\./\xa7\xce\x9dW@#\xc3A7a\xa4s\xe5\x00\xca', 'type': 'git', 'committer': { 'name': b'Stefano Zacchiroli', + 'fullname': b'Stefano Zacchiroli ', 'email': b'zack@upsilon.cc', }, 'author': { 'name': b'Stefano Zacchiroli', + 'fullname': b'Stefano Zacchiroli ', 'email': b'zack@upsilon.cc', }, - 'committer_date': datetime.datetime(2015, 9, 24, 10, 36, 5, - tzinfo=tzoffset), + 'committer_date': { + 'negative_utc': None, + 'timestamp': 1443083765, + 'offset': 120, + }, 'message': b'add submodule dependency\n', 'metadata': None, - 'date': datetime.datetime(2015, 9, 24, 10, 36, 5, - tzinfo=tzoffset), + 'date': { + 'negative_utc': None, + 'timestamp': 1443083765, + 'offset': 120, + }, 'parents': [ b'\xc3\xc5\x88q23`\x9f[\xbb\xb2\xd9\xe7\xf3\xfbJf\x0f?r' ], 'synthetic': False, } - # then - self.assertEquals(actual_revision, expected_revision) - self.assertEquals(offset, expected_revision['date'].utcoffset()) - self.assertEquals(offset, - expected_revision['committer_date'].utcoffset()) - - @istest - def ref_to_occurrence_1(self): - # when - actual_occ = converters.ref_to_occurrence({ - 'id': 'some-id', - 'branch': 'some/branch' - }) - # then - self.assertEquals(actual_occ, { - 'id': 'some-id', - 'branch': b'some/branch' - }) - - @istest - def ref_to_occurrence_2(self): - # when - actual_occ = converters.ref_to_occurrence({ - 'id': 'some-id', - 'branch': b'some/branch' - }) - - # then - self.assertEquals(actual_occ, { - 'id': 'some-id', - 'branch': b'some/branch' - }) + self.assertEquals(revision, expected_revision) @istest def author_line_to_author(self): tests = { b'a ': { 'name': b'a', 'email': b'b@c.com', 'fullname': b'a ', }, b'': { 'name': None, 'email': b'foo@bar.com', 'fullname': b'', }, b'malformed ': { 'name': b'trailing', 'email': b'sp@c.e', 'fullname': b'trailing ', }, b'no': { 'name': b'no', 'email': b'sp@c.e', 'fullname': b'no', }, b' <>': { 'name': b'', 'email': b'', 'fullname': b' <>', }, } for author in sorted(tests): parsed_author = tests[author] self.assertEquals(parsed_author, converters.parse_author(author)) diff --git a/swh/loader/git/utils.py b/swh/loader/git/utils.py deleted file mode 100644 index e6fb155..0000000 --- a/swh/loader/git/utils.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import datetime -import glob -import os -import subprocess - -from collections import defaultdict - -from pygit2 import Oid - - -def format_date(signature): - """Convert the date from a signature to a datetime""" - tz = datetime.timezone(datetime.timedelta(minutes=signature.offset)) - return datetime.datetime.fromtimestamp(signature.time, tz) - - -def list_objects_from_packfile_index(packfile_index): - """List the objects indexed by this packfile, in packfile offset - order. - """ - input_file = open(packfile_index, 'rb') - - with subprocess.Popen( - ['/usr/bin/git', 'show-index'], - stdin=input_file, - stdout=subprocess.PIPE, - ) as process: - - data = [] - - for line in process.stdout.readlines(): - # git show-index returns the line as: - # () - line_components = line.split() - offset = int(line_components[0]) - object_id = line_components[1] - - data.append((offset, object_id)) - - yield from (Oid(hex=object_id.decode('ascii')) - for _, object_id in sorted(data)) - - input_file.close() - - -def simple_list_objects(repo): - """List the objects in a given repository. Watch out for duplicates!""" - objects_dir = os.path.join(repo.path, 'objects') - # Git hashes are 40-character long - objects_glob = os.path.join(objects_dir, '[0-9a-f]' * 2, '[0-9a-f]' * 38) - - packfile_dir = os.path.join(objects_dir, 'pack') - - if os.path.isdir(packfile_dir): - for packfile_index in os.listdir(packfile_dir): - if not packfile_index.endswith('.idx'): - # Not an index file - continue - packfile_index_path = os.path.join(packfile_dir, packfile_index) - yield from list_objects_from_packfile_index(packfile_index_path) - - for object_file in glob.glob(objects_glob): - # Rebuild the object id as the last two components of the path - yield Oid(hex=''.join(object_file.split(os.path.sep)[-2:])) - - -def list_objects(repo): - """List the objects in a given repository, removing duplicates""" - seen = set() - for oid in simple_list_objects(repo): - if oid not in seen: - yield oid - seen.add(oid) - - -def get_objects_per_object_type(repo): - """Get all the (pygit2-parsed) objects from repo per object type""" - objects_per_object_type = defaultdict(list) - - for object_id in list_objects(repo): - object = repo[object_id] - objects_per_object_type[object.type].append(object_id) - - return objects_per_object_type