Page MenuHomeSoftware Heritage

No OneTemporary

diff --git a/requirements.txt b/requirements.txt
index 2661e11..48f0a0b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,6 @@
-python-dateutil
+click
+flask
psycopg2
+python-dateutil
+python-fastimport
vcversioner
-
-# remote storage API server
-flask
-
-
-click
diff --git a/swh/vault/cookers/__init__.py b/swh/vault/cookers/__init__.py
index 18385a2..fd84f0f 100644
--- a/swh/vault/cookers/__init__.py
+++ b/swh/vault/cookers/__init__.py
@@ -1,12 +1,14 @@
-# Copyright (C) 2016 The Software Heritage developers
+# Copyright (C) 2017 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from .directory import DirectoryCooker
from .revision_flat import RevisionFlatCooker
+from .revision_git import RevisionGitCooker
COOKER_TYPES = {
'directory': DirectoryCooker,
'revision_flat': RevisionFlatCooker,
+ 'revision_git': RevisionGitCooker,
}
diff --git a/swh/vault/cookers/revision_git.py b/swh/vault/cookers/revision_git.py
new file mode 100644
index 0000000..c403084
--- /dev/null
+++ b/swh/vault/cookers/revision_git.py
@@ -0,0 +1,152 @@
+# Copyright (C) 2017 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import fastimport.commands
+
+from .base import BaseVaultCooker
+
+
+class RevisionGitCooker(BaseVaultCooker):
+ """Cooker to create a git fast-import bundle """
+ CACHE_TYPE_KEY = 'revision_git'
+
+ def prepare_bundle(self, obj_id):
+ commands = self.fastexport(self.storage.revision_log([obj_id]))
+ bundle_content = b'\n'.join(bytes(command) for command in commands)
+ return bundle_content
+
+ def fastexport(self, log):
+ """Generate all the git fast-import commands from a given log.
+ """
+ self.rev_by_id = {r['id']: r for r in log}
+ self.rev_sorted = list(self._toposort(self.rev_by_id))
+ self.dir_by_id = {}
+ self.obj_done = set()
+ self.obj_to_mark = {}
+ self.next_available_mark = 1
+
+ yield from self._compute_all_blob_commands()
+ yield from self._compute_all_commit_commands()
+
+ def _toposort(self, rev_by_id):
+ """Perform a topological sort on the revision graph.
+ """
+ done = set()
+ remaining = rev_by_id.copy()
+
+ while remaining:
+ for rev_id, rev in list(remaining.items()):
+ parents = rev['parents']
+ if set(parents) <= done:
+ yield rev
+ done.add(rev_id)
+ del remaining[rev_id]
+
+ def mark(self, obj_id):
+ """Get the mark ID as bytes of a git object.
+
+ If the object has not yet been marked, assign a new ID and add it to
+ the mark dictionary.
+ """
+ if obj_id not in self.obj_to_mark:
+ self.obj_to_mark[obj_id] = self.next_available_mark
+ self.next_available_mark += 1
+ return str(self.obj_to_mark[obj_id]).encode()
+
+ def _compute_all_blob_commands(self):
+ """Compute all the blob commands to populate the empty git repository.
+
+ Mark the populated blobs so that we are able to reference them in file
+ commands.
+
+ """
+ for rev in self.rev_sorted:
+ yield from self._compute_blob_commands_in_dir(rev['directory'])
+
+ def _compute_blob_commands_in_dir(self, dir_id):
+ """Find all the blobs in a directory and generate their blob commands.
+
+ If a blob has already been visited and marked, skip it.
+ """
+ data = self.storage.directory_ls(dir_id, recursive=True)
+ files_data = list(entry for entry in data if entry['type'] == 'file')
+ self.dir_by_id[dir_id] = files_data
+ for file_data in files_data:
+ obj_id = file_data['sha1']
+ if obj_id in self.obj_done:
+ continue
+ content = list(self.storage.content_get([obj_id]))[0]['data']
+ yield fastimport.commands.BlobCommand(
+ mark=self.mark(obj_id),
+ data=content,
+ )
+ self.obj_done.add(obj_id)
+
+ def _compute_all_commit_commands(self):
+ """Compute all the commit commands.
+ """
+ for rev in self.rev_sorted:
+ yield from self._compute_commit_command(rev)
+
+ def _compute_commit_command(self, rev):
+ """Compute a commit command from a specific revision.
+ """
+ from_ = None
+ merges = None
+ parent = None
+ if 'parents' in rev and rev['parents']:
+ from_ = b':' + self.mark(rev['parents'][0])
+ merges = [b':' + self.mark(r) for r in rev['parents'][1:]]
+ parent = self.rev_by_id[rev['parents'][0]]
+ files = self._compute_file_commands(rev, parent)
+ author = (rev['author']['name'],
+ rev['author']['email'],
+ rev['date']['timestamp']['seconds'],
+ rev['date']['offset'] * 60)
+ committer = (rev['committer']['name'],
+ rev['committer']['email'],
+ rev['committer_date']['timestamp']['seconds'],
+ rev['committer_date']['offset'] * 60)
+ yield fastimport.commands.CommitCommand(
+ ref=b'refs/heads/master',
+ mark=self.mark(rev['id']),
+ author=author,
+ committer=committer,
+ message=rev['message'],
+ from_=from_,
+ merges=merges,
+ file_iter=files,
+ )
+
+ def _compute_file_commands(self, rev, parent=None):
+ """Compute all the file commands of a revision.
+
+ Generate a diff of the files between the revision and its main parent
+ to find the necessary file commands to apply.
+ """
+ if not parent:
+ parent_dir = []
+ else:
+ parent_dir = self.dir_by_id[parent['directory']]
+ cur_dir = self.dir_by_id[rev['directory']]
+ parent_dir = {f['name']: f for f in parent_dir}
+ cur_dir = {f['name']: f for f in cur_dir}
+
+ for fname, f in cur_dir.items():
+ if ((fname not in parent_dir
+ or f['sha1'] != parent_dir[fname]['sha1']
+ or f['perms'] != parent_dir[fname]['perms'])):
+ yield fastimport.commands.FileModifyCommand(
+ path=f['name'],
+ mode=f['perms'],
+ dataref=(b':' + self.mark(f['sha1'])),
+ data=None,
+ )
+
+ for fname, f in parent_dir.items():
+ if fname not in cur_dir:
+ yield fastimport.commands.FileDeleteCommand(
+ path=f['name']
+ )

File Metadata

Mime Type
text/x-diff
Expires
Fri, Jul 4, 11:05 AM (3 w, 6 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3317401

Event Timeline