diff --git a/swh/loader/git/git.py b/swh/loader/git/git.py index 381eaef..64bb603 100644 --- a/swh/loader/git/git.py +++ b/swh/loader/git/git.py @@ -1,509 +1,536 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import glob import logging import os import subprocess import time import pygit2 from collections import defaultdict from pygit2 import GIT_REF_OID, Oid from pygit2 import GIT_OBJ_BLOB, GIT_OBJ_TREE, GIT_OBJ_COMMIT, GIT_OBJ_TAG, GIT_SORT_TOPOLOGICAL from enum import Enum from swh.core import hashutil from swh.loader.git.data import swhrepo from swh.storage.storage import Storage class DirectoryTypeEntry(Enum): """Types of git objects. """ file = 'file' directory = 'directory' def date_format(d): """d is expected to be a datetime object. """ return time.strftime("%a, %d %b %Y %H:%M:%S +0000", d.timetuple()) def now(): """Cheat time values.""" return date_format(datetime.utcnow()) def timestamp_to_string(timestamp): """Convert a timestamps to string. """ return date_format(datetime.utcfromtimestamp(timestamp)) def format_date(signature): """Convert the date from a signature to a datetime""" return datetime.datetime.fromtimestamp(signature.time, datetime.timezone.utc) def list_objects_from_packfile_index(packfile_index): """List the objects indexed by this packfile, in packfile offset order. """ input_file = open(packfile_index, 'rb') with subprocess.Popen( ['/usr/bin/git', 'show-index'], stdin=input_file, stdout=subprocess.PIPE, ) as process: data = [] for line in process.stdout.readlines(): # git show-index returns the line as: # () line_components = line.split() offset = int(line_components[0]) object_id = line_components[1] data.append((offset, object_id)) yield from (Oid(hex=object_id.decode('ascii')) for _, object_id in sorted(data)) input_file.close() def simple_list_objects(repo): """List the objects in a given repository. Watch out for duplicates!""" objects_dir = os.path.join(repo.path, 'objects') # Git hashes are 40-character long objects_glob = os.path.join(objects_dir, '[0-9a-f]' * 2, '[0-9a-f]' * 38) packfile_dir = os.path.join(objects_dir, 'pack') if os.path.isdir(packfile_dir): for packfile_index in os.listdir(packfile_dir): if not packfile_index.endswith('.idx'): # Not an index file continue packfile_index_path = os.path.join(packfile_dir, packfile_index) yield from list_objects_from_packfile_index(packfile_index_path) for object_file in glob.glob(objects_glob): # Rebuild the object id as the last two components of the path yield Oid(hex=''.join(object_file.split(os.path.sep)[-2:])) def list_objects(repo): """List the objects in a given repository, removing duplicates""" seen = set() for oid in simple_list_objects(repo): if oid not in seen: yield oid seen.add(oid) def get_objects_per_object_type(repo): """Get all the (pygit2-parsed) objects from repo per object type""" objects_per_object_type = defaultdict(list) for object_id in list_objects(repo): object = repo[object_id] objects_per_object_type[object.type].append(object_id) return objects_per_object_type HASH_ALGORITHMS = ['sha1', 'sha256'] def send_in_packets(repo, source_list, formatter, sender, packet_size): """Send objects from `source_list`, passed through `formatter` (being passed the `repo`), by the `sender`, in packets of `packet_size` objects """ formatted_objects = [] for obj in source_list: formatted_object = formatter(repo, obj) if formatted_object: formatted_objects.append(formatted_object) if len(formatted_objects) >= packet_size: sender(formatted_objects) formatted_objects = [] sender(formatted_objects) def send_contents(content_list): """Actually send properly formatted contents to the database""" logging.info("Sending %d contents" % len(content_list)) s = Storage('dbname=softwareheritage-dev', '/tmp/swh-loader-git/test') s.content_add(content_list) logging.info("Done sending %d contents" % len(content_list)) def send_directories(directory_list): """Actually send properly formatted directories to the database""" logging.info("Sending %d directories" % len(directory_list)) s = Storage('dbname=softwareheritage-dev', '/tmp/swh-loader-git/test') s.directory_add(directory_list) logging.info("Done sending %d directories" % len(directory_list)) def send_revisions(revision_list): """Actually send properly formatted revisions to the database""" logging.info("Sending %d revisions" % len(revision_list)) s = Storage('dbname=softwareheritage-dev', '/tmp/swh-loader-git/test') s.revision_add(revision_list) logging.info("Done sending %d revisions" % len(revision_list)) def send_releases(release_list): """Actually send properly formatted releases to the database""" logging.info("Sending %d releases" % len(release_list)) s = Storage('dbname=softwareheritage-dev', '/tmp/swh-loader-git/test') s.release_add(release_list) logging.info("Done sending %d releases" % len(release_list)) def send_occurrences(occurrence_list): """Actually send properly formatted occurrences to the database""" logging.info("Sending %d occurrences" % len(occurrence_list)) s = Storage('dbname=softwareheritage-dev', '/tmp/swh-loader-git/test') s.occurrence_add(occurrence_list) logging.info("Done sending %d occurrences" % len(occurrence_list)) def blob_to_content(repo, id): """Format a blob as a content""" blob = repo[id] data = blob.data hashes = hashutil.hashdata(data, HASH_ALGORITHMS) return { 'sha1_git': id.raw, 'sha1': hashes['sha1'], 'sha256': hashes['sha256'], 'data': data, 'length': blob.size, } def tree_to_directory(repo, id): """Format a tree as a directory""" ret = { 'id': id.raw, } entries = [] ret['entries'] = entries entry_type_map = { 'tree': 'dir', 'blob': 'file', 'commit': 'rev', } for entry in repo[id]: entries.append({ 'type': entry_type_map[entry.type], 'perms': entry.filemode, 'name': entry.name, 'target': entry.id.raw, 'atime': None, 'mtime': None, 'ctime': None, }) return ret def commit_to_revision(repo, id): """Format a commit as a revision""" commit = repo[id] author = commit.author committer = commit.committer return { 'id': id.raw, 'date': format_date(author), 'date_offset': author.offset, 'committer_date': format_date(committer), 'committer_date_offset': committer.offset, 'type': 'git', 'directory': commit.tree_id.raw, 'message': commit.raw_message, 'author_name': author.name, 'author_email': author.email, 'committer_name': committer.name, 'committer_email': committer.email, 'parents': [p.raw for p in commit.parent_ids], } def annotated_tag_to_release(repo, id): """Format an annotated tag as a release""" - # TODO: format annotated tags + tag = repo[id] + + tag_pointer = repo[tag.target] + if tag_pointer.type != GIT_OBJ_COMMIT: + logging.warn("Ignoring tag %s pointing at %s %s" % ( + tag.id.hex, tag_pointer.__class__.__name__, tag_pointer.id.hex)) + return + + author = tag.tagger + + if not author: + logging.warn("Tag %s has no author, using default values" % id.hex) + author_name = '' + author_email = '' + date = None + date_offset = 0 + else: + author_name = author.name + author_email = author.email + date = format_date(author) + date_offset = author.offset + return { - 'id': id, + 'id': id.raw, + 'date': date, + 'date_offset': date_offset, + 'revision': tag.target.raw, + 'comment': tag.message.encode('utf-8'), + 'author_name': author_name, + 'author_email': author_email, } def ref_to_occurrence(repo, ref): """Format a reference as an occurrence""" return ref def bulk_send_blobs(repo, blob_dict): """Format blobs as swh contents and send them to the database in bulks of maximum `threshold` objects """ # TODO: move to config file content_packet_size = 100000 send_in_packets(repo, blob_dict, blob_to_content, send_contents, content_packet_size) def bulk_send_trees(repo, tree_dict): """Format trees as swh directories and send them to the database """ # TODO: move to config file directory_packet_size = 25000 send_in_packets(repo, tree_dict, tree_to_directory, send_directories, directory_packet_size) def bulk_send_commits(repo, commit_dict): """Format commits as swh revisions and send them to the database """ # TODO: move to config file revision_packet_size = 10000 send_in_packets(repo, commit_dict, commit_to_revision, send_revisions, revision_packet_size) def bulk_send_annotated_tags(repo, tag_dict): """Format annotated tags (pygit2.Tag objects) as swh releases and send them to the database """ # TODO: move to config file release_packet_size = 10000 send_in_packets(repo, tag_dict, annotated_tag_to_release, send_releases, release_packet_size) def bulk_send_refs(repo, refs): """Format git references as swh occurrences and send them to the database """ # TODO: move to config file occurrence_packet_size = 100000 send_in_packets(repo, refs, ref_to_occurrence, send_occurrences, occurrence_packet_size) def parse_via_object_list(repo_path): logging.info("Started loading %s" % repo_path) repo = pygit2.Repository(repo_path) objects_per_object_type = get_objects_per_object_type(repo) refs = [] ref_names = repo.listall_references() for ref_name in ref_names: ref = repo.lookup_reference(ref_name) target = ref.target if not isinstance(target, Oid): logging.info("Skipping symbolic ref %s pointing at %s" % ( ref_name, ref.target)) continue target_obj = repo[target] if not target_obj.type == GIT_OBJ_COMMIT: logging.info("Skipping ref %s pointing to %s %s" % ( ref_name, target_obj.__class__.__name__, target.hex)) refs.append({ 'name': ref_name, 'revision': target.hex, }) logging.info("Done listing the objects in %s: will load %d contents, " "%d directories, %d revisions, %d releases, " "%d occurrences" % ( repo_path, len(objects_per_object_type[GIT_OBJ_BLOB]), len(objects_per_object_type[GIT_OBJ_TREE]), len(objects_per_object_type[GIT_OBJ_COMMIT]), len(objects_per_object_type[GIT_OBJ_TAG]), len(refs) )) bulk_send_blobs(repo, objects_per_object_type[GIT_OBJ_BLOB]) bulk_send_trees(repo, objects_per_object_type[GIT_OBJ_TREE]) bulk_send_commits(repo, objects_per_object_type[GIT_OBJ_COMMIT]) bulk_send_annotated_tags(repo, objects_per_object_type[GIT_OBJ_TAG]) bulk_send_refs(repo, refs) def parse(repo_path): """Given a repository path, parse and return a memory model of such repository.""" def read_signature(signature): return '%s <%s>' % (signature.name, signature.email) def treewalk(repo, tree): """Walk a tree with the same implementation as `os.path`. Returns: tree, trees, blobs """ trees, blobs, dir_entries = [], [], [] for tree_entry in tree: if swh_repo.already_visited(tree_entry.hex): logging.debug('tree_entry %s already visited, skipped' % tree_entry.hex) continue obj = repo.get(tree_entry.oid) if obj is None: # or obj.type == GIT_OBJ_COMMIT: logging.warn('skip submodule-commit %s' % tree_entry.hex) continue # submodule! if obj.type == GIT_OBJ_TREE: logging.debug('found tree %s' % tree_entry.hex) nature = DirectoryTypeEntry.directory.value trees.append(tree_entry) else: logging.debug('found content %s' % tree_entry.hex) data = obj.data nature = DirectoryTypeEntry.file.value hashes = hashutil.hashdata(data, HASH_ALGORITHMS) blobs.append({'id': obj.hex, 'type': storage.Type.content, 'content-sha1': hashes['sha1'], 'content-sha256': hashes['sha256'], 'content': data, 'size': obj.size}) dir_entries.append({'name': tree_entry.name, 'type': storage.Type.directory_entry, 'target-sha1': obj.hex, 'nature': nature, 'perms': tree_entry.filemode, 'atime': None, 'mtime': None, 'ctime': None, 'parent': tree.hex}) yield tree, dir_entries, trees, blobs for tree_entry in trees: for x in treewalk(repo, repo[tree_entry.oid]): yield x def walk_tree(repo, swh_repo, rev): """Walk the rev revision's directories. """ if swh_repo.already_visited(rev.hex): logging.debug('commit %s already visited, skipped' % rev.hex) return swh_repo for dir_root, dir_entries, _, contents_ref in treewalk(repo, rev.tree): for content_ref in contents_ref: swh_repo.add_content(content_ref) swh_repo.add_directory({'id': dir_root.hex, 'type': storage.Type.directory, 'entries': dir_entries}) revision_parent_sha1s = list(map(str, rev.parent_ids)) author = {'name': rev.author.name, 'email': rev.author.email, 'type': storage.Type.person} committer = {'name': rev.committer.name, 'email': rev.committer.email, 'type': storage.Type.person} swh_repo.add_revision({'id': rev.hex, 'type':storage.Type.revision, 'date': timestamp_to_string(rev.commit_time), 'directory': rev.tree.hex, 'message': rev.message, 'committer': committer, 'author': author, 'parent-sha1s': revision_parent_sha1s }) swh_repo.add_person(read_signature(rev.author), author) swh_repo.add_person(read_signature(rev.committer), committer) return swh_repo def walk_revision_from(repo, swh_repo, head_rev): """Walk the rev history log from head_rev. - repo is the current repository - rev is the latest rev to start from. """ for rev in repo.walk(head_rev.id, GIT_SORT_TOPOLOGICAL): swh_repo = walk_tree(repo, swh_repo, rev) return swh_repo repo = pygit2.Repository(repo_path) # memory model swh_repo = swhrepo.SWHRepo() # add origin origin = {'type': 'git', 'url': 'file://' + repo.path} swh_repo.add_origin(origin) # add references and crawl them for ref_name in repo.listall_references(): logging.info('walk reference %s' % ref_name) ref = repo.lookup_reference(ref_name) head_rev = repo[ref.target] \ if ref.type is GIT_REF_OID \ else ref.peel(GIT_OBJ_COMMIT) # noqa if isinstance(head_rev, pygit2.Tag): head_start = head_rev.get_object() taggerSig = head_rev.tagger author = {'name': taggerSig.name, 'email': taggerSig.email, 'type': storage.Type.person} release = {'id': head_rev.hex, 'type': storage.Type.release, 'revision': head_rev.target.hex, 'name': ref_name, 'date': now(), # FIXME: find the tag's date, 'author': author, 'comment': head_rev.message} swh_repo.add_release(release) swh_repo.add_person(read_signature(taggerSig), author) else: swh_repo.add_occurrence({'id': head_rev.hex, 'revision': head_rev.hex, 'reference': ref_name, 'url-origin': origin['url'], 'type': storage.Type.occurrence}) head_start = head_rev # crawl commits and trees walk_revision_from(repo, swh_repo, head_start) return swh_repo