# Copyright (C) 2015 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import glob
import logging
import os
import subprocess
import time

from collections import defaultdict
from datetime import datetime
from enum import Enum

import pygit2
from pygit2 import GIT_REF_OID, Oid
from pygit2 import (GIT_OBJ_BLOB, GIT_OBJ_TREE, GIT_OBJ_COMMIT, GIT_OBJ_TAG,
                    GIT_SORT_TOPOLOGICAL)

from swh.core import hashutil
# Note: the code below references storage.Type; the exact module providing
# those constants is assumed here.
from swh.loader.git import storage
from swh.loader.git.data import swhrepo
from swh.storage.storage import Storage


class DirectoryTypeEntry(Enum):
    """Types of git objects.
    """
    file = 'file'
    directory = 'directory'


def date_format(d):
    """d is expected to be a datetime object.
    """
    return time.strftime("%a, %d %b %Y %H:%M:%S +0000", d.timetuple())


def now():
    """Cheat time values."""
    return date_format(datetime.utcnow())


def timestamp_to_string(timestamp):
    """Convert a Unix timestamp to a formatted date string.
    """
    return date_format(datetime.utcfromtimestamp(timestamp))


def list_objects_from_packfile_index(packfile_index):
    """List the objects indexed by this packfile, in packfile offset
    order.
    """
    data = []
    with open(packfile_index, 'rb') as input_file:
        with subprocess.Popen(
            ['/usr/bin/git', 'show-index'],
            stdin=input_file,
            stdout=subprocess.PIPE,
        ) as process:
            for line in process.stdout.readlines():
                # git show-index returns the line as:
                # <packfile offset> <object_id> (<object CRC>)
                line_components = line.split()
                offset = int(line_components[0])
                object_id = line_components[1]
                data.append((offset, object_id))

    # Sort by offset so objects come out in packfile order
    yield from (Oid(hex=object_id.decode('ascii'))
                for _, object_id in sorted(data))
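

# A minimal usage sketch (the pack index path is hypothetical): `git
# show-index` prints one "<offset> <object_id> (<CRC>)" line per object,
# and the generator above re-sorts them by offset before yielding pygit2
# Oids.
#
#   idx = '/srv/git/example.git/objects/pack/pack-deadbeef.idx'
#   for oid in list_objects_from_packfile_index(idx):
#       print(oid.hex)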


def simple_list_objects(repo):
    """List the objects in a given repository. Watch out for duplicates!"""
    objects_dir = os.path.join(repo.path, 'objects')
    # Git hashes are 40 characters long
    objects_glob = os.path.join(objects_dir, '[0-9a-f]' * 2, '[0-9a-f]' * 38)

    packfile_dir = os.path.join(objects_dir, 'pack')

    if os.path.isdir(packfile_dir):
        for packfile_index in os.listdir(packfile_dir):
            if not packfile_index.endswith('.idx'):
                # Not an index file
                continue
            packfile_index_path = os.path.join(packfile_dir, packfile_index)
            yield from list_objects_from_packfile_index(packfile_index_path)

    for object_file in glob.glob(objects_glob):
        # Rebuild the object id as the last two components of the path
        yield Oid(hex=''.join(object_file.split(os.path.sep)[-2:]))


def list_objects(repo):
    """List the objects in a given repository, removing duplicates"""
    seen = set()
    for oid in simple_list_objects(repo):
        if oid not in seen:
            yield oid
            seen.add(oid)
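

# A minimal usage sketch (hypothetical repository path):
#
#   repo = pygit2.Repository('/srv/git/example.git')
#   object_count = sum(1 for _ in list_objects(repo))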


def get_objects_per_object_type(repo):
    """Get all the (pygit2-parsed) objects from repo per object type"""
    objects_per_object_type = defaultdict(list)

    for object_id in list_objects(repo):
        obj = repo[object_id]
        objects_per_object_type[obj.type].append(object_id)

    return objects_per_object_type


HASH_ALGORITHMS = ['sha1', 'sha256']


def send_in_packets(repo, source_list, formatter, sender, packet_size,
                    extra_data=None):
    """Send objects from `source_list`, passed through `formatter` (being
    passed the `repo` and `extra_data`), by the `sender`, in packets
    of `packet_size` objects
    """
    if extra_data is None:
        extra_data = {}

    objects = []
    for obj_id in source_list:
        objects.append(formatter(repo, obj_id, **extra_data))
        if len(objects) >= packet_size:
            sender(objects)
            objects = []

    # Send the last, possibly incomplete, packet
    if objects:
        sender(objects)
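

# A minimal usage sketch with hypothetical blob ids: with packet_size=2,
# the three formatted objects below would be sent in two calls to
# send_contents (a full packet of two, then the remaining one).
#
#   send_in_packets(repo, [oid1, oid2, oid3], blob_to_content,
#                   send_contents, packet_size=2)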


def send_contents(content_list):
    """Actually send properly formatted contents to the database"""
    logging.info("Sending %d contents" % len(content_list))
    s = Storage('dbname=softwareheritage-dev', '/tmp/swh-loader-git/test')
    s.content_add(content_list)
    logging.info("Done sending %d contents" % len(content_list))


def send_directories(directory_list):
    """Actually send properly formatted directories to the database"""
    logging.info("Sending %d directories" % len(directory_list))
    s = Storage('dbname=softwareheritage-dev', '/tmp/swh-loader-git/test')
    s.directory_add(directory_list)
    logging.info("Done sending %d directories" % len(directory_list))


def send_revisions(revision_list):
    """Actually send properly formatted revisions to the database"""
    logging.info("Sending %d revisions" % len(revision_list))
    # TODO: send revisions
    logging.info("Done sending %d revisions" % len(revision_list))


def send_releases(release_list):
    """Actually send properly formatted releases to the database"""
    logging.info("Sending %d releases" % len(release_list))
    # TODO: send releases
    logging.info("Done sending %d releases" % len(release_list))


def blob_to_content(repo, id):
    """Format a blob as a content"""
    blob = repo[id]
    data = blob.data
    hashes = hashutil.hashdata(data, HASH_ALGORITHMS)
    return {
        'sha1_git': id.raw,
        'sha1': hashes['sha1'],
        'sha256': hashes['sha256'],
        'data': data,
        'length': blob.size,
    }
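

# For illustration, the mapping returned above is what send_contents passes
# on to Storage.content_add, e.g. for a blob holding b'foo' (hash values
# abbreviated):
#
#   {'sha1_git': b'...', 'sha1': b'...', 'sha256': b'...',
#    'data': b'foo', 'length': 3}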


def tree_to_directory(repo, id):
    """Format a tree as a directory"""
    ret = {
        'id': id.raw,
    }
    entries = []
    ret['entries'] = entries

    entry_type_map = {
        'tree': 'dir',
        'blob': 'file',
        'commit': 'rev',
    }

    for entry in repo[id]:
        entries.append({
            'type': entry_type_map[entry.type],
            'perms': entry.filemode,
            'name': entry.name,
            'target': entry.id.raw,
            'atime': None,
            'mtime': None,
            'ctime': None,
        })

    return ret


def commit_to_revision(repo, id):
    """Format a commit as a revision"""
    # TODO: format commits
    return {
        'id': id,
    }


def annotated_tag_to_release(repo, id):
    """Format an annotated tag as a release"""
    # TODO: format annotated tags
    return {
        'id': id,
    }


def unannotated_tag_to_release(repo, id, commits=None):
    """Format an unannotated tag as a release.

    `commits` is the extra data passed along by bulk_send_unannotated_tags.
    """
    # TODO: format unannotated tags
    return {
        'id': id,
    }


def bulk_send_blobs(repo, blob_dict):
    """Format blobs as swh contents and send them to the database in
    packets of at most `content_packet_size` objects
    """
    # TODO: move to config file
    content_packet_size = 100000

    send_in_packets(repo, blob_dict, blob_to_content, send_contents,
                    content_packet_size)


def bulk_send_trees(repo, tree_dict):
    """Format trees as swh directories and send them to the database
    """
    # TODO: move to config file
    directory_packet_size = 25000

    send_in_packets(repo, tree_dict, tree_to_directory, send_directories,
                    directory_packet_size)


def bulk_send_commits(repo, commit_dict):
    """Format commits as swh revisions and send them to the database
    """
    # TODO: move to config file
    revision_packet_size = 10000

    send_in_packets(repo, commit_dict, commit_to_revision, send_revisions,
                    revision_packet_size)


def bulk_send_annotated_tags(repo, tag_dict):
    """Format annotated tags (pygit2.Tag objects) as swh releases and send
    them to the database
    """
    # TODO: move to config file
    release_packet_size = 10000

    send_in_packets(repo, tag_dict, annotated_tag_to_release, send_releases,
                    release_packet_size)


def bulk_send_unannotated_tags(repo, tag_dict, commit_dict):
    """Format unannotated tags (strings) as swh releases and send
    them to the database
    """
    # TODO: move to config file
    release_packet_size = 10000

    extra_data = {
        'commits': commit_dict,
    }

    send_in_packets(repo, tag_dict, unannotated_tag_to_release,
                    send_releases, release_packet_size, extra_data)


def parse_via_object_list(repo_path):
    """List all the objects of the repository at repo_path, grouped by
    object type, and send them to the database in bulk.
    """
    logging.info("Started loading %s" % repo_path)
    repo = pygit2.Repository(repo_path)
    objects_per_object_type = get_objects_per_object_type(repo)

    logging.info("Done listing the objects in %s: will load %d contents, "
                 "%d directories, %d revisions, %d releases" % (
                     repo_path,
                     len(objects_per_object_type[GIT_OBJ_BLOB]),
                     len(objects_per_object_type[GIT_OBJ_TREE]),
                     len(objects_per_object_type[GIT_OBJ_COMMIT]),
                     len(objects_per_object_type[GIT_OBJ_TAG])))

    # bulk_send_blobs(repo, objects_per_object_type[GIT_OBJ_BLOB])
    bulk_send_trees(repo, objects_per_object_type[GIT_OBJ_TREE])
    bulk_send_commits(repo, objects_per_object_type[GIT_OBJ_COMMIT])
    bulk_send_annotated_tags(repo, objects_per_object_type[GIT_OBJ_TAG])
    # TODO: send unannotated tags
    bulk_send_unannotated_tags(repo, [],
                               objects_per_object_type[GIT_OBJ_COMMIT])

    return objects_per_object_type, {obj_type: len(obj_list)
                                     for obj_type, obj_list
                                     in objects_per_object_type.items()}
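

# A minimal usage sketch (hypothetical path): the second return value maps
# each pygit2 object type to the number of objects of that type.
#
#   objects, counts = parse_via_object_list('/srv/git/example.git')
#   logging.info('found %d commits' % counts[GIT_OBJ_COMMIT])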


def parse(repo_path):
    """Given a repository path, parse and return a memory model of such
    repository."""
    def read_signature(signature):
        return '%s <%s>' % (signature.name, signature.email)

    def treewalk(repo, tree):
        """Walk a tree, in the same fashion as os.walk.
        Yields: tree, dir_entries, trees, blobs
        """
        trees, blobs, dir_entries = [], [], []
        for tree_entry in tree:
            if swh_repo.already_visited(tree_entry.hex):
                logging.debug('tree_entry %s already visited, skipped'
                              % tree_entry.hex)
                continue

            obj = repo.get(tree_entry.oid)
            if obj is None:  # or obj.type == GIT_OBJ_COMMIT:
                logging.warning('skip submodule-commit %s' % tree_entry.hex)
                continue  # submodule!

            if obj.type == GIT_OBJ_TREE:
                logging.debug('found tree %s' % tree_entry.hex)
                nature = DirectoryTypeEntry.directory.value
                trees.append(tree_entry)
            else:
                logging.debug('found content %s' % tree_entry.hex)
                data = obj.data
                nature = DirectoryTypeEntry.file.value
                hashes = hashutil.hashdata(data, HASH_ALGORITHMS)
                blobs.append({'id': obj.hex,
                              'type': storage.Type.content,
                              'content-sha1': hashes['sha1'],
                              'content-sha256': hashes['sha256'],
                              'content': data,
                              'size': obj.size})

            dir_entries.append({'name': tree_entry.name,
                                'type': storage.Type.directory_entry,
                                'target-sha1': obj.hex,
                                'nature': nature,
                                'perms': tree_entry.filemode,
                                'atime': None,
                                'mtime': None,
                                'ctime': None,
                                'parent': tree.hex})

        yield tree, dir_entries, trees, blobs
        for tree_entry in trees:
            yield from treewalk(repo, repo[tree_entry.oid])

    def walk_tree(repo, swh_repo, rev):
        """Walk the rev revision's directories.
        """
        if swh_repo.already_visited(rev.hex):
            logging.debug('commit %s already visited, skipped' % rev.hex)
            return swh_repo

        for dir_root, dir_entries, _, contents_ref in treewalk(repo,
                                                               rev.tree):
            for content_ref in contents_ref:
                swh_repo.add_content(content_ref)

            swh_repo.add_directory({'id': dir_root.hex,
                                    'type': storage.Type.directory,
                                    'entries': dir_entries})

        revision_parent_sha1s = list(map(str, rev.parent_ids))
        author = {'name': rev.author.name,
                  'email': rev.author.email,
                  'type': storage.Type.person}
        committer = {'name': rev.committer.name,
                     'email': rev.committer.email,
                     'type': storage.Type.person}
        swh_repo.add_revision({'id': rev.hex,
                               'type': storage.Type.revision,
                               'date': timestamp_to_string(rev.commit_time),
                               'directory': rev.tree.hex,
                               'message': rev.message,
                               'committer': committer,
                               'author': author,
                               'parent-sha1s': revision_parent_sha1s
                               })

        swh_repo.add_person(read_signature(rev.author), author)
        swh_repo.add_person(read_signature(rev.committer), committer)

        return swh_repo

    def walk_revision_from(repo, swh_repo, head_rev):
        """Walk the revision history log from head_rev.
        - repo is the current repository
        - head_rev is the latest revision to start from.
        """
        for rev in repo.walk(head_rev.id, GIT_SORT_TOPOLOGICAL):
            swh_repo = walk_tree(repo, swh_repo, rev)

        return swh_repo

    repo = pygit2.Repository(repo_path)

    # memory model
    swh_repo = swhrepo.SWHRepo()

    # add origin
    origin = {'type': 'git',
              'url': 'file://' + repo.path}
    swh_repo.add_origin(origin)

    # add references and crawl them
    for ref_name in repo.listall_references():
        logging.info('walk reference %s' % ref_name)
        ref = repo.lookup_reference(ref_name)

        if ref.type == GIT_REF_OID:
            head_rev = repo[ref.target]
        else:
            head_rev = ref.peel(GIT_OBJ_COMMIT)

        if isinstance(head_rev, pygit2.Tag):
            head_start = head_rev.get_object()
            taggerSig = head_rev.tagger
            author = {'name': taggerSig.name,
                      'email': taggerSig.email,
                      'type': storage.Type.person}
            release = {'id': head_rev.hex,
                       'type': storage.Type.release,
                       'revision': head_rev.target.hex,
                       'name': ref_name,
                       'date': now(),  # FIXME: find the tag's date
                       'author': author,
                       'comment': head_rev.message}

            swh_repo.add_release(release)
            swh_repo.add_person(read_signature(taggerSig), author)
        else:
            swh_repo.add_occurrence({'id': head_rev.hex,
                                     'revision': head_rev.hex,
                                     'reference': ref_name,
                                     'url-origin': origin['url'],
                                     'type': storage.Type.occurrence})
            head_start = head_rev

        # crawl commits and trees
        walk_revision_from(repo, swh_repo, head_start)

    return swh_repo
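

# A minimal usage sketch (hypothetical path): parse() builds the in-memory
# model, which can then be inspected or sent to the storage layer.
#
#   swh_repo = parse('/srv/git/example.git')
#   # swh_repo now holds the origin, contents, directories, revisions,
#   # releases and occurrences gathered from the repository's references.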
