diff --git a/swh/loader/git/git.py b/swh/loader/git/git.py
index 768c5df..3a6829b 100644
--- a/swh/loader/git/git.py
+++ b/swh/loader/git/git.py
@@ -1,227 +1,227 @@
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import glob
 import logging
 import os
 import subprocess
 import time

 import pygit2

 from pygit2 import GIT_REF_OID
 from pygit2 import GIT_OBJ_COMMIT, GIT_OBJ_TREE, GIT_SORT_TOPOLOGICAL
 from enum import Enum

 from swh.core import hashutil
 from swh.loader.git import date
 from swh.loader.git.data import swhrepo
 from swh.loader.git.storage import storage

 SWH_AUTHORITY = 'softwareheritage'


 def list_objects_from_packfile_index(packfile_index):
     """List the objects indexed by this packfile.

     """
     input_file = open(packfile_index, 'rb')
     with subprocess.Popen(
         ['/usr/bin/git', 'show-index'],
         stdin=input_file,
         stdout=subprocess.PIPE,
     ) as process:
         for line in process.stdout.readlines():
             obj_id = line.decode('utf-8', 'ignore').split()[1]
             yield obj_id


 def list_objects(repo):
     """List the objects in a given repository.

     """
     objects_dir = os.path.join(repo.path, 'objects')
     objects_glob = os.path.join(objects_dir, '[0-9a-f]' * 2,
                                 '[0-9a-f]' * 38)

     packfile_dir = os.path.join(objects_dir, 'pack')

     if os.path.isdir(packfile_dir):
         for packfile_index in os.listdir(packfile_dir):
             if not packfile_index.endswith('.idx'):
                 # Not an index file
                 continue
             packfile_index_path = os.path.join(packfile_dir,
                                                packfile_index)
             yield from list_objects_from_packfile_index(packfile_index_path)

     for object_file in glob.glob(objects_glob):
         yield ''.join(object_file.split(os.path.sep)[-2:])


 HASH_ALGORITHMS = ['sha1', 'sha256']


 def parse(repo_path):
     """Given a repository path, parse and return a memory model of that
     repository.

     """
     def read_signature(signature):
         return '%s <%s>' % (signature.name, signature.email)

     def treewalk(repo, tree):
         """Walk a tree with the same implementation as `os.walk`.

         Returns: tree, trees, contents

         """
         trees, contents = [], []
         dir_entry_dirs, dir_entry_files, dir_entry_revs = [], [], []
         for tree_entry in tree:
             tree_sha1 = hashutil.hex_to_hash(tree_entry.hex)
             if swh_repo.already_visited(tree_sha1):
                 logging.debug('tree_entry %s already visited,'
                               ' skipped' % tree_entry.hex)
                 continue

             dir_entry = {'name': tree_entry.name,
                          'type': storage.Type.directory_entry,
                          'target-sha1': tree_sha1,
                          'perms': tree_entry.filemode,
                          'atime': None,
                          'mtime': None,
                          'ctime': None}

             obj = repo.get(tree_entry.oid)

             if obj is None:  # submodule
                 logging.debug('found rev %s' % tree_entry.hex)
                 dir_entry_revs.append(dir_entry)
             elif obj.type == GIT_OBJ_TREE:
                 logging.debug('found tree %s' % tree_entry.hex)
                 trees.append(tree_entry)
                 dir_entry_dirs.append(dir_entry)
             else:
                 logging.debug('found content %s' % tree_entry.hex)
                 data = obj.data
                 hashes = hashutil.hashdata(data, HASH_ALGORITHMS)
                 contents.append({'id': hashes['sha1'],
                                  'type': storage.Type.content,
                                  'git-sha1': hashutil.hex_to_hash(obj.hex),
                                  'content-sha256': hashes['sha256'],
                                  'content': data,
                                  'size': obj.size})
                 dir_entry_files.append(dir_entry)

         yield (tree, dir_entry_dirs, dir_entry_files, dir_entry_revs,
                trees, contents)

         for tree_entry in trees:
             for x in treewalk(repo, repo[tree_entry.oid]):
                 yield x

     def walk_tree(repo, swh_repo, rev):
         """Walk the rev revision's directories.
""" for dir_root, dir_entry_dirs, dir_entry_files, dir_entry_revs, _, contents_ref \ in treewalk(repo, rev.tree): for content_ref in contents_ref: swh_repo.add_content(content_ref) swh_repo.add_directory({'id': hashutil.hex_to_hash(dir_root.hex), 'type': storage.Type.directory, 'entry-dirs': dir_entry_dirs, 'entry-files': dir_entry_files, 'entry-revs': dir_entry_revs}) - revision_parent_sha1s = map(lambda x: hashutil.hex_to_hash(str(x)), rev.parent_ids) + revision_parent_sha1s = list(map(lambda x: hashutil.hex_to_hash(str(x)), rev.parent_ids)) author = {'name': rev.author.name, 'email': rev.author.email, 'type': storage.Type.person} committer = {'name': rev.committer.name, 'email': rev.committer.email, 'type': storage.Type.person} swh_repo.add_revision({'id': hashutil.hex_to_hash(rev.hex), 'type': storage.Type.revision, 'date': date.ts_to_str( rev.author.time, rev.author.offset), 'committer-date': date.ts_to_str( rev.commit_time, rev.commit_time_offset), 'directory': hashutil.hex_to_hash(rev.tree.hex), 'message': rev.message, 'committer': committer, 'author': author, 'parent-sha1s': revision_parent_sha1s }) swh_repo.add_person(read_signature(rev.author), author) swh_repo.add_person(read_signature(rev.committer), committer) return swh_repo def walk_revision_from(repo, swh_repo, head_rev): """Walk the rev history log from head_rev. - repo is the current repository - rev is the latest rev to start from. """ for rev in repo.walk(head_rev.id, GIT_SORT_TOPOLOGICAL): sha1 = hashutil.hex_to_hash(rev.hex) if swh_repo.already_visited(sha1): logging.debug('commit %s already visited, skipped' % sha1) continue swh_repo = walk_tree(repo, swh_repo, rev) return swh_repo repo = pygit2.Repository(repo_path) # memory model swh_repo = swhrepo.SWHRepo() # add origin origin = {'type': 'git', 'url': 'file://' + repo.path} swh_repo.add_origin(origin) # add references and crawl them for ref_name in repo.listall_references(): logging.info('walk reference %s' % ref_name) ref = repo.lookup_reference(ref_name) head_rev = repo[ref.target] \ if ref.type is GIT_REF_OID \ else ref.peel(GIT_OBJ_COMMIT) # noqa if isinstance(head_rev, pygit2.Tag): head_start = head_rev.get_object() taggerSig = head_rev.tagger author = {'name': taggerSig.name, 'email': taggerSig.email, 'type': storage.Type.person} release = {'id': hashutil.hex_to_hash(head_rev.hex), 'type': storage.Type.release, 'revision': hashutil.hex_to_hash(head_rev.target.hex), 'name': ref_name, 'date': date.ts_to_str(taggerSig.time, taggerSig.offset), 'author': author, 'comment': head_rev.message} swh_repo.add_release(release) swh_repo.add_person(read_signature(taggerSig), author) else: swh_repo.add_occurrence({'id': hashutil.hex_to_hash(head_rev.hex), 'revision': hashutil.hex_to_hash(head_rev.hex), 'authority': SWH_AUTHORITY, 'branch': ref_name, 'url-origin': origin['url'], 'type': storage.Type.occurrence}) head_start = head_rev # crawl commits and trees walk_revision_from(repo, swh_repo, head_start) return swh_repo diff --git a/swh/loader/git/storage/storage.py b/swh/loader/git/storage/storage.py index 13f2fce..7a2c1ff 100755 --- a/swh/loader/git/storage/storage.py +++ b/swh/loader/git/storage/storage.py @@ -1,251 +1,251 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from io import StringIO from swh.loader.git.storage import models from swh.core import hashutil Type = 
diff --git a/swh/loader/git/storage/storage.py b/swh/loader/git/storage/storage.py
index 13f2fce..7a2c1ff 100755
--- a/swh/loader/git/storage/storage.py
+++ b/swh/loader/git/storage/storage.py
@@ -1,251 +1,251 @@
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 from io import StringIO

 from swh.loader.git.storage import models
 from swh.core import hashutil

 Type = models.Type

 _find_object = {Type.occurrence: models.find_occurrences_for_revision,
                 Type.content: lambda *args: models.find_object(
                     *args, column='sha1')}

 hex_to_sha1 = lambda x: '\\x%s' % hashutil.hash_to_hex(x)


 def find(db_conn, id, type):
     """Find an object according to its id and type.

     """
     return _find_object.get(type, models.find_object)(db_conn, id, type)


 _find_unknown = {Type.revision: models.find_unknown_revisions,
                  Type.content: models.find_unknown_contents,
                  Type.directory: models.find_unknown_directories}


 def find_unknowns(db_conn, obj_type, sha1s_hex):
     """Given a list of sha1s, return the ones not present in storage.

     """
     def row_to_sha1(row):
         """Convert a row (memoryview) to a string sha1.

         """
-        return row[0].tobytes()
+        return bytes(row[0])

     cpy_data_buffer = StringIO()
     vals = '\n'.join(map(hex_to_sha1, sha1s_hex))
     cpy_data_buffer.write(vals)
     cpy_data_buffer.seek(0)  # move file cursor back at start of file

     find_unknown_fn = _find_unknown[obj_type]
     unknowns = find_unknown_fn(db_conn, cpy_data_buffer)
     cpy_data_buffer.close()

-    return map(row_to_sha1, unknowns)
+    # hack: force resolution for remote loader
+    return list(map(row_to_sha1, unknowns))


 def _add_content(db_conn, vcs_object, id):
     """Add a blob to storage.
     Designed to be wrapped in a db transaction.
     Returns:
     - the sha1 if everything went alright.
     - None if something went wrong
     Writing exceptions can also be raised and expected to be handled by
     the caller.

     """
     models.add_content(db_conn,
                        id,
                        vcs_object['git-sha1'],
                        vcs_object['content-sha256'],
                        vcs_object['size'])
     return id


 def _add_directory(db_conn, vcs_object, id):
     """Add a directory to storage.
     Designed to be wrapped in a db transaction.

     """
     parent_id = models.add_directory(db_conn, id)
     for directory_entry_dir in vcs_object['entry-dirs']:
         _add_directory_entry_dir(db_conn, parent_id, directory_entry_dir)
     for directory_entry_file in vcs_object['entry-files']:
         _add_directory_entry_file(db_conn, parent_id, directory_entry_file)
     for directory_entry_rev in vcs_object['entry-revs']:
         _add_directory_entry_rev(db_conn, parent_id, directory_entry_rev)
     return id


 def _add_directory_entry_dir(db_conn, parent_id, vcs_object):
     """Add a directory entry dir to storage.
     Designed to be wrapped in a db transaction.
     Returns:
     - the sha1 if everything went alright.
     - None if something went wrong
     Writing exceptions can also be raised and expected to be handled by
     the caller.

     """
     name = vcs_object['name']
     models.add_directory_entry_dir(db_conn,
                                    name,
                                    vcs_object['target-sha1'],
                                    vcs_object['perms'],
                                    vcs_object['atime'],
                                    vcs_object['mtime'],
                                    vcs_object['ctime'],
                                    parent_id)
     return name, parent_id


 def _add_directory_entry_file(db_conn, parent_id, vcs_object):
     """Add a directory entry file to storage.
     Designed to be wrapped in a db transaction.
     Returns:
     - the sha1 if everything went alright.
     - None if something went wrong
     Writing exceptions can also be raised and expected to be handled by
     the caller.

     """
     name = vcs_object['name']
     models.add_directory_entry_file(db_conn,
                                     name,
                                     vcs_object['target-sha1'],
                                     vcs_object['perms'],
                                     vcs_object['atime'],
                                     vcs_object['mtime'],
                                     vcs_object['ctime'],
                                     parent_id)
     return name, parent_id


 def _add_directory_entry_rev(db_conn, parent_id, vcs_object):
     """Add a directory entry rev to storage.
     Designed to be wrapped in a db transaction.
     Returns:
     - the sha1 if everything went alright.
     - None if something went wrong
     Writing exceptions can also be raised and expected to be handled by
     the caller.
""" name = vcs_object['name'] models.add_directory_entry_rev(db_conn, name, vcs_object['target-sha1'], vcs_object['perms'], vcs_object['atime'], vcs_object['mtime'], vcs_object['ctime'], parent_id) return parent_id def _add_revision(db_conn, vcs_object, id): """Add a revision to storage. Designed to be wrapped in a db transaction. Returns: - the sha1 if everything went alright. - None if something went wrong Writing exceptions can also be raised and expected to be handled by the caller. """ models.add_revision(db_conn, id, vcs_object['date'], vcs_object['committer-date'], vcs_object['directory'], vcs_object['message'], vcs_object['author'], vcs_object['committer'], vcs_object['parent-sha1s']) return id def _add_release(db_conn, vcs_object, id): """Add a release. """ models.add_release(db_conn, id, vcs_object['revision'], vcs_object['date'], vcs_object['name'], vcs_object['comment'], vcs_object['author']) return id def _add_occurrence(db_conn, vcs_object, id): """Add an occurrence. """ models.add_occurrence_history(db_conn, vcs_object['url-origin'], vcs_object['branch'], vcs_object['revision'], vcs_object['authority']) return id def add_person(db_conn, vcs_object): """Add an author. """ return models.add_person(db_conn, vcs_object['name'], vcs_object['email']) _store_fn = {Type.directory: _add_directory, Type.revision: _add_revision, Type.release: _add_release, Type.occurrence: _add_occurrence} def add_origin(db_conn, origin): """A a new origin and returns its id. """ return models.add_origin(db_conn, origin['url'], origin['type']) def find_origin(db_conn, origin): """Find an existing origin. """ return models.find_origin(db_conn, origin['url'], origin['type']) def find_person(db_conn, person): """Find an existing person. """ return models.find_person(db_conn, person['email'], person['name']) def add_with_fs_storage(db_conn, config, id, type, vcs_object): """Add vcs_object in the storage - db_conn is the opened connection to the db - config is the map of configuration needed for core layer - type is not used here but represent the type of vcs_object - vcs_object is the object meant to be persisted in fs and db """ config['objstorage'].add_bytes(vcs_object['content'], id) return _add_content(db_conn, vcs_object, id) def add(db_conn, config, id, type, vcs_object): """Given a id, type and content, store a given object in the store. - db_conn is the opened connection to the db - config is not used here - type is the object's type - vcs_object is the object meant to be persisted in db """ return _store_fn[type](db_conn, vcs_object, id) hex_to_sha1_2 = lambda x: '\\x%s' % hashutil.hash_to_hex(x) def add_revision_history(db_conn, tuple_parents): """Given a list of tuple (sha, parent_sha), store in revision_history. """ if len(tuple_parents) > 0: models.add_revision_history( db_conn, map(lambda t: (hex_to_sha1_2(t[0]), hex_to_sha1_2(t[1]), t[2]), tuple_parents))