diff --git a/swh/loader/dir/git/git.py b/swh/loader/dir/git/git.py index 52d5052..c57321e 100644 --- a/swh/loader/dir/git/git.py +++ b/swh/loader/dir/git/git.py @@ -1,331 +1,324 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os from enum import Enum from swh.loader.dir.git import utils class GitType(Enum): BLOB = b'blob' TREE = b'tree' EXEC = b'exec' LINK = b'link' COMM = b'commit' RELE = b'release' class GitPerm(Enum): BLOB = b'100644' TREE = b'40000' EXEC = b'100755' LINK = b'120000' def compute_directory_git_sha1(dirpath, hashes): """Compute a directory git sha1 for a dirpath. Args: dirpath: the directory's absolute path hashes: list of tree entries with keys: - sha1_git: the tree entry's sha1 - name: file or subdir's name - perms: the tree entry's sha1 permissions Returns: dictionary with sha1_git as key and the actual binary sha1 as value. Assumes: Every path exists in hashes. """ def sort_by_entry_name(hashes): return sorted(hashes, key=lambda entry: entry['name']) def row_entry_tree_format(hashes): return map(lambda entry: b''.join([entry['perms'].value, b' ', entry['name'], b'\0', entry['sha1_git']]), hashes) rows = row_entry_tree_format(sort_by_entry_name(hashes[dirpath])) return utils.hashdata(b''.join(rows), 'tree') def compute_revision_git_sha1(tree_hash, info): """Compute a revision representation targeting the tree_hash. Args: tree_hash: binary form of the tree hash info: Additional dictionary information needed to compute a synthetic revision. Following keys are expected: - revision_author_name - revision_author_email - revision_author_date - revision_author_offset - revision_committer_name - revision_committer_email - revision_committer_date - revision_committer_offset - revision_message - revision_type """ revision_author_name = info['revision_author_name'] revision_author_email = info['revision_author_email'] revision_author_date = info['revision_author_date'] revision_author_offset = info['revision_author_offset'] revision_committer_name = info['revision_committer_name'] revision_committer_email = info['revision_committer_email'] revision_committer_date = info['revision_committer_date'] revision_committer_offset = info['revision_committer_offset'] revision_message = info['revision_message'] revision_content = ("""tree %s author %s <%s> %s %s committer %s <%s> %s %s %s """ % (utils.hash_to_hex(tree_hash), revision_author_name, revision_author_email, revision_author_date, revision_author_offset, revision_committer_name, revision_committer_email, revision_committer_date, revision_committer_offset, revision_message)).encode('utf-8') hashes = utils.hashdata(revision_content, 'commit') # and update other information hashes.update({ 'revision_author_name': revision_author_name, 'revision_author_email': revision_author_email, 'revision_author_date': revision_author_date, 'revision_author_offset': revision_author_offset, 'revision_committer_name': revision_committer_name, 'revision_committer_email': revision_committer_email, 'revision_committer_date': revision_committer_date, 'revision_committer_offset': revision_committer_offset, 'revision_message': revision_message, 'revision_type': info['revision_type'] }) return hashes def compute_release(revision_hash, info): """Compute a release representation. This release representation will contain the computed sha1_git for such release. This release will point to the revision_hash. The additional informations are present in the dictionary info. Args: revision_hash: binary form of the sha1_git revision targeted by this release info: Additional dictionary information needed to compute a synthetic release. Following keys are expected: - release_name - release_comment - release_date - release_offset - release_author_name - release_author_email """ release_name = info['release_name'] release_author_name = info['release_author_name'] release_author_email = info['release_author_email'] release_date = info['release_date'] release_offset = info['release_offset'] release_comment = info['release_comment'] release_content_to_hash = ("""object %s type commit tag %s tagger %s <%s> %s %s %s """ % (utils.hash_to_hex(revision_hash), release_name, release_author_name, release_author_email, release_date, release_offset, release_comment)).encode('utf-8') hashes = utils.hashdata(release_content_to_hash, 'tag') hashes.update({ 'revision_sha1_git': revision_hash, 'release_name': release_name, 'release_comment': release_comment, 'release_date': release_date, 'release_offset': release_offset, 'release_author_name': release_author_name, 'release_author_email': release_author_email, }) return hashes def compute_link_metadata(linkpath): """Given a linkpath, compute the git metadata. Args: linkpath: absolute pathname of the link Returns: Dictionary of values: - name: basename of the link - perms: git permission for link - type: git type for link """ m_hashes = utils.hashlink(linkpath) m_hashes.update({ 'name': bytes(os.path.basename(linkpath), 'utf-8'), 'perms': GitPerm.LINK, 'type': GitType.BLOB, 'path': linkpath }) return m_hashes def compute_blob_metadata(filepath): """Given a filepath, compute the git metadata. Args: filepath: absolute pathname of the file. Returns: Dictionary of values: - name: basename of the file - perms: git permission for file - type: git type for file """ m_hashes = utils.hashfile(filepath) perms = GitPerm.EXEC if os.access(filepath, os.X_OK) else GitPerm.BLOB m_hashes.update({ 'name': bytes(os.path.basename(filepath), 'utf-8'), 'perms': perms, 'type': GitType.BLOB, 'path': filepath }) return m_hashes def compute_tree_metadata(dirname, ls_hashes): """Given a dirname, compute the git metadata. Args: dirname: absolute pathname of the directory. Returns: Dictionary of values: - name: basename of the directory - perms: git permission for directory - type: git type for directory """ tree_hash = compute_directory_git_sha1(dirname, ls_hashes) tree_hash.update({ 'name': bytes(os.path.basename(dirname), 'utf-8'), 'perms': GitPerm.TREE, 'type': GitType.TREE, 'path': dirname }) return tree_hash def walk_and_compute_sha1_from_directory(rootdir): """Compute git sha1 from directory rootdir. - Empty directories are skipped. - Returns: Dictionary of entries with keys and as values a list of directory entries. Those are list of dictionary with keys: - 'perms' - 'type' - 'name' - 'sha1_git' - and specifically content: 'sha1', 'sha256', ... Note: One special key is '' to indicate the upper root of the - directory (this is the entry point of the revision). + directory (this is the revision's directory). Raises: Nothing If something is raised, this is a programmatic error. """ ls_hashes = {} - empty_dirs = set() - link_dirs = set() + all_links = set() for dirpath, dirnames, filenames in os.walk(rootdir, topdown=False): hashes = [] - if not(dirnames) and not(filenames): - empty_dirs.add(dirpath) - continue - links = [os.path.join(dirpath, file) for file in (filenames+dirnames) if os.path.islink(os.path.join(dirpath, file))] for linkpath in links: - link_dirs.add(linkpath) + all_links.add(linkpath) m_hashes = compute_link_metadata(linkpath) hashes.append(m_hashes) only_files = [os.path.join(dirpath, file) for file in filenames - if os.path.join(dirpath, file) not in link_dirs] + if os.path.join(dirpath, file) not in all_links] for filepath in only_files: m_hashes = compute_blob_metadata(filepath) hashes.append(m_hashes) ls_hashes.update({ dirpath: hashes }) dir_hashes = [] subdirs = [os.path.join(dirpath, dir) for dir in dirnames if os.path.join(dirpath, dir) - not in (empty_dirs | link_dirs)] + not in all_links] for fulldirname in subdirs: tree_hash = compute_tree_metadata(fulldirname, ls_hashes) dir_hashes.append(tree_hash) ls_hashes.update({ dirpath: ls_hashes.get(dirpath, []) + dir_hashes }) # compute the current directory hashes root_hash = compute_directory_git_sha1(rootdir, ls_hashes) root_hash.update({ 'path': rootdir, 'name': bytes(os.path.basename(rootdir), 'utf-8'), 'perms': GitPerm.TREE, 'type': GitType.TREE }) ls_hashes.update({ '': [root_hash] }) return ls_hashes diff --git a/swh/loader/dir/tests/test_converters.py b/swh/loader/dir/tests/test_converters.py index a3ab907..c71dead 100644 --- a/swh/loader/dir/tests/test_converters.py +++ b/swh/loader/dir/tests/test_converters.py @@ -1,206 +1,206 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest from nose.tools import istest from datetime import datetime from swh.loader.dir import converters from swh.loader.dir.git.git import GitType, GitPerm class TestConverters(unittest.TestCase): @istest def format_to_minutes(self): self.assertEquals(converters.format_to_minutes('+0100'), 60) self.assertEquals(converters.format_to_minutes('-0200'), -120) self.assertEquals(converters.format_to_minutes('+1250'), 12*60+50) self.assertEquals(converters.format_to_minutes('+0000'), 0) self.assertEquals(converters.format_to_minutes('-0000'), 0) @istest def origin_url_to_origin(self): # given origin_url = 'foobar' # when self.assertDictEqual({ 'type': 'dir', 'url': origin_url, }, converters.origin_url_to_origin(origin_url)) @istest def annotated_tag_to_release(self): # given release = { 'sha1_git': '123', 'revision_sha1_git': '456', 'release_name': 'some-release', 'release_comment': 'some-comment-on-release', 'release_date': 1444054085, 'release_offset': '-0300', 'release_author_name': 'someone', 'release_author_email': 'someone@whatelse.eu' } expected_release = { 'id': '123', 'revision': '456', 'name': 'some-release', 'comment': 'some-comment-on-release', 'date': datetime.fromtimestamp(1444054085), 'date_offset': -180, 'author_name': 'someone', 'author_email': 'someone@whatelse.eu', } # when actual_release = converters.annotated_tag_to_release(release) # then self.assertDictEqual( expected_release, actual_release) @istest - def blob_to_content_visible(self): + def _blob_to_content_visible(self): obj = { 'length': 9, 'data': b'some-data', 'sha1': b'sha1', 'sha1_git': b'sha1-git', 'sha256': b'sha256', 'perms': GitPerm.BLOB, 'type': GitType.BLOB } expected_content = { 'length': 9, 'data': b'some-data', 'sha1': b'sha1', 'sha1_git': b'sha1-git', 'sha256': b'sha256', 'perms': GitPerm.BLOB.value, 'type': GitType.BLOB.value, 'status': 'visible' } # when - actual_content = converters.blob_to_content(obj) + actual_content = converters._blob_to_content(obj) # then self.assertEqual(expected_content, actual_content) @istest - def blob_to_content_absent(self): + def _blob_to_content_absent(self): obj = { 'length': 9, 'data': b'some-data', 'sha1': b'sha1', 'sha1_git': b'sha1-git', 'sha256': b'sha256', 'perms': GitPerm.BLOB, 'type': GitType.BLOB } expected_content = { 'length': 9, 'data': b'some-data', 'sha1': b'sha1', 'sha1_git': b'sha1-git', 'sha256': b'sha256', 'perms': GitPerm.BLOB.value, 'type': GitType.BLOB.value, 'status': 'absent', 'reason': 'Content too large', 'origin': 3} # when - actual_content = converters.blob_to_content(obj, - max_content_size=5, - origin_id=3) + actual_content = converters._blob_to_content(obj, + max_content_size=5, + origin_id=3) # then self.assertEqual(expected_content, actual_content) @istest def tree_to_directory_no_entries(self): # given tree = { 'path': 'foo', 'sha1_git': b'tree_sha1_git' } objects = { 'foo': [{'type': GitType.TREE, 'perms': GitPerm.TREE, 'name': 'bar', 'sha1_git': b'sha1-target'}, {'type': GitType.BLOB, 'perms': GitPerm.BLOB, 'name': 'file-foo', 'sha1_git': b'file-foo-sha1-target'}] } expected_directory = { 'id': b'tree_sha1_git', 'entries': [{'type': 'dir', 'perms': int(GitPerm.TREE.value), 'name': 'bar', 'target': b'sha1-target'}, {'type': 'file', 'perms': int(GitPerm.BLOB.value), 'name': 'file-foo', 'target': b'file-foo-sha1-target'}] } # when actual_directory = converters.tree_to_directory(tree, objects) # then self.assertEqual(actual_directory, expected_directory) @istest def commit_to_revision(self): # given commit = { 'sha1_git': 'commit-git-sha1', 'revision_author_date': 1444054085, 'revision_author_offset': '+0000', 'revision_committer_date': 1444054085, 'revision_committer_offset': '-0000', 'revision_type': 'tar', 'revision_message': 'synthetic-message-input', 'revision_author_name': 'author-name', 'revision_author_email': 'author-email', 'revision_committer_name': 'committer-name', 'revision_committer_email': 'committer-email', } objects = { '': [{'sha1_git': 'targeted-tree-sha1'}] } expected_revision = { 'id': 'commit-git-sha1', 'date': datetime.fromtimestamp(1444054085), 'date_offset': 0, 'committer_date': datetime.fromtimestamp(1444054085), 'committer_date_offset': 0, 'type': 'tar', 'directory': 'targeted-tree-sha1', 'message': 'synthetic-message-input', 'author_name': 'author-name', 'author_email': 'author-email', 'committer_name': 'committer-name', 'committer_email': 'committer-email', 'parents': [], } # when actual_revision = converters.commit_to_revision(commit, objects) # then self.assertEquals(actual_revision, expected_revision) diff --git a/swh/loader/dir/tests/test_loader.py b/swh/loader/dir/tests/test_loader.py index 5ee1129..fcb80fe 100644 --- a/swh/loader/dir/tests/test_loader.py +++ b/swh/loader/dir/tests/test_loader.py @@ -1,98 +1,105 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import shutil import subprocess import tempfile import unittest from nose.tools import istest from swh.core.hashutil import hex_to_hash from swh.loader.dir.loader import DirLoader from swh.loader.dir.git.git import GitType class TestLoader(unittest.TestCase): @classmethod def setUpClass(cls): super().setUpClass() cls.tmp_root_path = tempfile.mkdtemp() sample_folder_archive = os.path.join(os.path.dirname(__file__), '../../../../..', 'swh-storage-testdata', 'dir-folders', 'sample-folder.tgz') cls.root_path = os.path.join(cls.tmp_root_path, 'sample-folder') # uncompress the sample folder subprocess.check_output( ['tar', 'xvf', sample_folder_archive, '-C', cls.tmp_root_path], ) @classmethod def tearDownClass(cls): super().tearDownClass() shutil.rmtree(cls.tmp_root_path) print(cls.tmp_root_path) def setUp(self): super().setUp() self.info = { 'storage_class': 'remote_storage', 'storage_args': ['http://localhost:5000/'], # origin information 'origin_url': 'file:///dev/null', # occurrence information 'branch': 'master', 'authority_id': 1, 'validity': '2015-01-01 00:00:00+00', # revision information 'revision_author_name': 'swh author', 'revision_author_email': 'swh@inria.fr', 'revision_author_date': '1444054085', 'revision_author_offset': '+0200', 'revision_committer_name': 'swh committer', 'revision_committer_email': 'swh@inria.fr', 'revision_committer_date': '1444054085', 'revision_committer_offset': '+0200', 'revision_type': 'tar', 'revision_message': 'synthetic revision', # release information 'release_name': 'v0.0.1', 'release_date': '1444054085', 'release_offset': '+0200', 'release_author_name': 'swh author', 'release_author_email': 'swh@inria.fr', 'release_comment': 'synthetic release', } self.dirloader = DirLoader(self.info) @istest def load_without_storage(self): # when - objects, objects_per_path = self.dirloader.list_repo_objs(self.root_path, self.info) + objects, objects_per_path = self.dirloader.list_repo_objs( + self.root_path, + self.info) # then - self.assertEquals(len(objects), 4, "4 objects types, blob, tree, revision, release") - self.assertEquals(len(objects[GitType.BLOB]), 8, "8 contents: 3 files + 5 links") - self.assertEquals(len(objects[GitType.TREE]), 4, "4 directories: 3 subdir + 1 main dir") + self.assertEquals(len(objects), 4, + "4 objects types, blob, tree, revision, release") + self.assertEquals(len(objects[GitType.BLOB]), 8, + "8 contents: 3 files + 5 links") + self.assertEquals(len(objects[GitType.TREE]), 5, + "5 directories: 4 subdirs + 1 empty one + 1 main dir") self.assertEquals(len(objects[GitType.COMM]), 1, "1 synthetic revision") self.assertEquals(len(objects[GitType.RELE]), 1, "1 synthetic release") - self.assertEquals(len(objects_per_path), 5, "4 folders + ") + self.assertEquals(len(objects_per_path), 6, "5 folders + ") -# print('objects: %s\n objects-per-path: %s\n' % (objects.keys(), objects_per_path.keys())) + # print('objects: %s\n objects-per-path: %s\n' % + # (objects.keys(), + # objects_per_path.keys()))