diff --git a/swh/loader/git/git.py b/swh/loader/git/git.py index 8a1cea7..0a705b1 100644 --- a/swh/loader/git/git.py +++ b/swh/loader/git/git.py @@ -1,234 +1,234 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import glob import logging import os import subprocess import time import pygit2 from datetime import datetime from pygit2 import GIT_REF_OID from pygit2 import GIT_OBJ_COMMIT, GIT_OBJ_TREE, GIT_SORT_TOPOLOGICAL from enum import Enum from swh.core import hashutil from swh.loader.git.data import swhrepo from swh.loader.git.storage import storage def date_format(d): """d is expected to be a datetime object. """ return time.strftime("%a, %d %b %Y %H:%M:%S +0000", d.timetuple()) def now(): """Cheat time values. """ return date_format(datetime.utcnow()) def timestamp_to_string(timestamp): """Convert a timestamps to string. """ return date_format(datetime.utcfromtimestamp(timestamp)) def list_objects_from_packfile_index(packfile_index): """List the objects indexed by this packfile. """ input_file = open(packfile_index, 'rb') with subprocess.Popen( ['/usr/bin/git', 'show-index'], stdin=input_file, stdout=subprocess.PIPE, ) as process: for line in process.stdout.readlines(): obj_id = line.decode('utf-8', 'ignore').split()[1] yield obj_id def list_objects(repo): """List the objects in a given repository. """ objects_dir = os.path.join(repo.path, 'objects') objects_glob = os.path.join(objects_dir, '[0-9a-f]' * 2, '[0-9a-f]' * 38) packfile_dir = os.path.join(objects_dir, 'pack') if os.path.isdir(packfile_dir): for packfile_index in os.listdir(packfile_dir): if not packfile_index.endswith('.idx'): # Not an index file continue packfile_index_path = os.path.join(packfile_dir, packfile_index) yield from list_objects_from_packfile_index(packfile_index_path) for object_file in glob.glob(objects_glob): yield ''.join(object_file.split(os.path.sep)[-2:]) HASH_ALGORITHMS = ['sha1', 'sha256'] def parse(repo_path): """Given a repository path, parse and return a memory model of such repository. """ def read_signature(signature): return '%s <%s>' % (signature.name, signature.email) def treewalk(repo, tree): """Walk a tree with the same implementation as `os.path`. Returns: tree, trees, contents """ trees, contents, dir_entry_dirs, dir_entry_files = [], [], [], [] for tree_entry in tree: if swh_repo.already_visited(tree_entry.hex): logging.debug('tree_entry %s already visited,' ' skipped' % tree_entry.hex) continue obj = repo.get(tree_entry.oid) if obj is None: # or obj.type == GIT_OBJ_COMMIT: logging.warn('skip submodule-commit %s' % tree_entry.hex) continue # submodule! dir_entry = {'name': tree_entry.name, 'type': storage.Type.directory_entry, 'target-sha1': obj.hex, 'perms': tree_entry.filemode, 'atime': None, 'mtime': None, 'ctime': None} if obj.type == GIT_OBJ_TREE: logging.debug('found tree %s' % tree_entry.hex) trees.append(tree_entry) dir_entry_dirs.append(dir_entry) else: logging.debug('found content %s' % tree_entry.hex) data = obj.data hashes = hashutil.hashdata(data, HASH_ALGORITHMS) contents.append({'id': hashes['sha1'], 'type': storage.Type.content, 'git-sha1': obj.hex, 'content-sha256': hashes['sha256'], 'content': data, 'size': obj.size}) dir_entry_files.append(dir_entry) yield tree, dir_entry_dirs, dir_entry_files, trees, contents for tree_entry in trees: for x in treewalk(repo, repo[tree_entry.oid]): yield x def walk_tree(repo, swh_repo, rev): """Walk the rev revision's directories. """ - if swh_repo.already_visited(rev.hex): - logging.debug('commit %s already visited, skipped' % rev.hex) - return swh_repo - for dir_root, dir_entry_dirs, dir_entry_files, _, contents_ref \ - in treewalk(repo, rev.tree): + in treewalk(repo, rev.tree): for content_ref in contents_ref: swh_repo.add_content(content_ref) swh_repo.add_directory({'id': dir_root.hex, 'type': storage.Type.directory, 'entry-dirs': dir_entry_dirs, 'entry-files': dir_entry_files}) revision_parent_sha1s = list(map(str, rev.parent_ids)) author = {'name': rev.author.name, 'email': rev.author.email, 'type': storage.Type.person} committer = {'name': rev.committer.name, 'email': rev.committer.email, 'type': storage.Type.person} swh_repo.add_revision({'id': rev.hex, 'type': storage.Type.revision, 'date': timestamp_to_string(rev.commit_time), 'directory': rev.tree.hex, 'message': rev.message, 'committer': committer, 'author': author, 'parent-sha1s': revision_parent_sha1s }) swh_repo.add_person(read_signature(rev.author), author) swh_repo.add_person(read_signature(rev.committer), committer) return swh_repo def walk_revision_from(repo, swh_repo, head_rev): """Walk the rev history log from head_rev. - repo is the current repository - rev is the latest rev to start from. """ for rev in repo.walk(head_rev.id, GIT_SORT_TOPOLOGICAL): + sha1 = rev.hex + if swh_repo.already_visited(sha1): + logging.debug('commit %s already visited, skipped' % sha1) + continue swh_repo = walk_tree(repo, swh_repo, rev) return swh_repo repo = pygit2.Repository(repo_path) # memory model swh_repo = swhrepo.SWHRepo() # add origin origin = {'type': 'git', 'url': 'file://' + repo.path} swh_repo.add_origin(origin) # add references and crawl them for ref_name in repo.listall_references(): logging.info('walk reference %s' % ref_name) ref = repo.lookup_reference(ref_name) head_rev = repo[ref.target] \ if ref.type is GIT_REF_OID \ else ref.peel(GIT_OBJ_COMMIT) # noqa if isinstance(head_rev, pygit2.Tag): head_start = head_rev.get_object() taggerSig = head_rev.tagger author = {'name': taggerSig.name, 'email': taggerSig.email, 'type': storage.Type.person} release = {'id': head_rev.hex, 'type': storage.Type.release, 'revision': head_rev.target.hex, 'name': ref_name, 'date': now(), # FIXME: find the tag's date, 'author': author, 'comment': head_rev.message} swh_repo.add_release(release) swh_repo.add_person(read_signature(taggerSig), author) else: swh_repo.add_occurrence({'id': head_rev.hex, 'revision': head_rev.hex, 'branch': ref_name, 'url-origin': origin['url'], 'type': storage.Type.occurrence}) head_start = head_rev # crawl commits and trees walk_revision_from(repo, swh_repo, head_start) return swh_repo