diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,6 @@
-python-dateutil
+click
+python-fastimport
+flask
 psycopg2
+python-dateutil
 vcversioner
-
-# remote storage API server
-flask
-
-
-click
diff --git a/swh/storage/vault/cookers/__init__.py b/swh/storage/vault/cookers/__init__.py
--- a/swh/storage/vault/cookers/__init__.py
+++ b/swh/storage/vault/cookers/__init__.py
@@ -1,12 +1,14 @@
-# Copyright (C) 2016 The Software Heritage developers
+# Copyright (C) 2017 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from .directory import DirectoryCooker
 from .revision_flat import RevisionFlatCooker
+from .revision_git import RevisionGitCooker
 
 COOKER_TYPES = {
     'directory': DirectoryCooker,
     'revision_flat': RevisionFlatCooker,
+    'revision_git': RevisionGitCooker,
 }
diff --git a/swh/storage/vault/cookers/revision_git.py b/swh/storage/vault/cookers/revision_git.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/vault/cookers/revision_git.py
@@ -0,0 +1,235 @@
+# Copyright (C) 2017 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import fastimport.commands
+
+from .base import BaseVaultCooker
+from swh.core import hashutil
+
+
+class RevisionGitCooker(BaseVaultCooker):
+    """Cooker to create a git fast-import bundle from a revision.
+
+    The bundle is a stream of fast-import commands: one blob command per
+    distinct file content, then one commit command per revision found in
+    the log of the requested revision.
+
+    """
+    CACHE_TYPE_KEY = 'revision_git'
+
+    def cook(self, obj_id):
+        """Cook the requested revision into a bundle and cache it.
+
+        Args:
+            obj_id (bytes): the id of the revision to be cooked.
+
+        """
+        bundle_content = self.fastexport(obj_id)
+
+        # Cache the bundle
+        self.update_cache(obj_id, bundle_content)
+        # Make a notification that the bundle has been cooked
+        # NOT YET IMPLEMENTED see TODO in function.
+        self.notify_bundle_ready(
+            notif_data='Bundle %s ready' % hashutil.hash_to_hex(obj_id),
+            bundle_id=obj_id)
+
+    def fastexport(self, obj_id):
+        """Serialize the log of a revision as a fast-import stream.
+
+        Args:
+            obj_id (bytes): the id of the revision to export.
+
+        Returns:
+            bytes: the fast-import commands, joined with newlines.
+
+        """
+        log = self.storage.revision_log([obj_id])
+        return b'\n'.join(bytes(command)
+                          for command in self.fastexport_log(log))
+
+    def fastexport_log(self, log):
+        """Generate the fast-import commands for a revision log.
+
+        Args:
+            log: an iterable of revisions (dicts), each one providing at
+                least the 'id', 'parents' and 'directory' keys.
+
+        Yields:
+            the blob commands of all the files of the revisions, then
+            the commit commands, parents before children.
+
+        """
+        self.rev_by_id = {r['id']: r for r in log}
+        self.rev_sorted = list(self.toposort(self.rev_by_id))
+        self.dir_by_id = {}
+        self.obj_done = set()
+        self.obj_to_mark = {}
+        self.next_available_mark = 1
+
+        yield from self.register_all_blobs()
+        yield from self.visit_commits()
+
+    def register_all_blobs(self):
+        """Yield the blob commands of every revision's directory."""
+        for rev in self.rev_sorted:
+            yield from self.register_directory(rev['directory'])
+
+    def register_directory(self, dir_id):
+        """Record the files of a directory and yield their blob commands.
+
+        Args:
+            dir_id: the id of the directory to list recursively.
+
+        Yields:
+            one BlobCommand per file content not exported yet.
+
+        """
+        if dir_id in self.dir_by_id:
+            # Several revisions may share the same root directory: its
+            # files are already recorded and its blobs already emitted.
+            return
+
+        data = self.storage.directory_ls(dir_id, recursive=True)
+        files_data = [entry for entry in data if entry['type'] == 'file']
+        self.dir_by_id[dir_id] = files_data
+        for file_data in files_data:
+            obj_id = file_data['sha1']
+            if obj_id in self.obj_done:
+                continue
+            content = list(self.storage.content_get([obj_id]))[0]['data']
+            yield fastimport.commands.BlobCommand(
+                mark=str(self.mark(obj_id)).encode(),
+                data=content,
+            )
+            self.obj_done.add(obj_id)
+
+    def toposort(self, rev_by_id):
+        """Yield the revisions so that parents come before their children.
+
+        Args:
+            rev_by_id (dict): mapping from revision id to revision.
+
+        Raises:
+            ValueError: if some revisions can never be emitted, because
+                one of their parents is not in rev_by_id or because the
+                parent links form a cycle.
+
+        """
+        done = set()
+        remaining = rev_by_id.copy()
+
+        while remaining:
+            progressed = False
+            for rev_id, rev in list(remaining.items()):
+                if set(rev['parents']) <= done:
+                    yield rev
+                    done.add(rev_id)
+                    del remaining[rev_id]
+                    progressed = True
+            if not progressed:
+                # Without this guard, a parent missing from the log
+                # would make the loop spin forever.
+                raise ValueError(
+                    'Cannot sort revisions: %d of them have unresolved '
+                    'parents' % len(remaining))
+
+    def visit_commits(self):
+        """Yield the commit command of every revision, parents first."""
+        for rev in self.rev_sorted:
+            yield from self.visit_commit(rev)
+
+    def filecommands(self, rev, parent=None):
+        """Yield the file commands turning parent's tree into rev's tree.
+
+        Args:
+            rev (dict): the revision being exported.
+            parent (dict): its first parent, or None for a root revision,
+                in which case every file is emitted as a modification.
+
+        Yields:
+            FileDeleteCommand for the files that disappeared, then
+            FileModifyCommand for the files added or whose content or
+            permissions changed.
+
+        """
+        parent_files = {}
+        if parent:
+            parent_files = {f['name']: f
+                            for f in self.dir_by_id[parent['directory']]}
+        cur_files = {f['name']: f for f in self.dir_by_id[rev['directory']]}
+
+        # Deletions go first: fast-import deletes recursively, so a
+        # "D a" emitted after "M a/b" would remove the new file when a
+        # file is replaced by a directory of the same name.
+        for fname in parent_files:
+            if fname not in cur_files:
+                yield fastimport.commands.FileDeleteCommand(path=fname)
+
+        for fname, f in cur_files.items():
+            old = parent_files.get(fname)
+            if (old is None
+                    or f['sha1'] != old['sha1']
+                    or f['perms'] != old['perms']):
+                yield fastimport.commands.FileModifyCommand(
+                    path=fname,
+                    mode=f['perms'],
+                    dataref=self._mark_ref(f['sha1']),
+                    data=None,
+                )
+
+    def visit_commit(self, rev):
+        """Yield the commit command of a revision.
+
+        The first parent becomes the 'from' of the commit and is the
+        reference used to compute the file commands; the other parents
+        are declared as merges.
+
+        """
+        from_ = None
+        merges = None
+        parent = None
+        parents = rev.get('parents')
+        if parents:
+            from_ = self._mark_ref(parents[0])
+            # Merge parents are mark references too: they need the same
+            # ':' prefix as from_ to be parsed as marks.
+            merges = [self._mark_ref(p) for p in parents[1:]]
+            parent = self.rev_by_id[parents[0]]
+        files = self.filecommands(rev, parent)
+        author = (rev['author']['name'],
+                  rev['author']['email'],
+                  rev['date']['timestamp']['seconds'],
+                  rev['date']['offset'])
+        committer = (rev['committer']['name'],
+                     rev['committer']['email'],
+                     rev['committer_date']['timestamp']['seconds'],
+                     rev['committer_date']['offset'])
+        yield fastimport.commands.CommitCommand(
+            ref=b'refs/heads/master',
+            mark=str(self.mark(rev['id'])).encode(),
+            author=author,
+            committer=committer,
+            message=rev['message'],
+            from_=from_,
+            merges=merges,
+            file_iter=files,
+        )
+
+    def mark(self, obj_id):
+        """Return the mark number of an object, allocating one if needed."""
+        if obj_id not in self.obj_to_mark:
+            self.obj_to_mark[obj_id] = self.next_available_mark
+            self.next_available_mark += 1
+        return self.obj_to_mark[obj_id]
+
+    def _mark_ref(self, obj_id):
+        """Return the b':<mark>' reference to an object's mark."""
+        return b':' + str(self.mark(obj_id)).encode()
+
+    def notify_bundle_ready(self, notif_data, bundle_id):
+        # TODO plug this method with the notification method once
+        # done.
+        pass