diff --git a/swh/loader/dir/converters.py b/swh/loader/dir/converters.py index 1dda58e..2623ccd 100644 --- a/swh/loader/dir/converters.py +++ b/swh/loader/dir/converters.py @@ -1,139 +1,156 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Convert dir objects to dictionaries suitable for swh.storage""" import datetime import os from swh.loader.dir.git.git import GitType from swh.loader.dir.git import git, utils def to_datetime(ts): """Convert a timestamp to utc datetime. """ return datetime.datetime.utcfromtimestamp(ts).replace( tzinfo=datetime.timezone.utc) def format_to_minutes(offset_str): """Convert a git string timezone format string (e.g +0200, -0310) to minutes. Args: offset_str: a string representing an offset. Returns: A positive or negative number of minutes of such input """ sign = offset_str[0] hours = int(offset_str[1:3]) minutes = int(offset_str[3:]) + (hours * 60) return minutes if sign == '+' else -1 * minutes def blob_to_content(obj, log=None, max_content_size=None, origin_id=None): """Convert obj to a swh storage content. + Note: + - If obj represents a link, the length and data are already + provided so we use them directly. + - 'data' is returned only if max_content_size is not reached. + + Returns: + obj converted to content as a dictionary. + """ filepath = obj['path'] - size = os.lstat(filepath).st_size + if 'length' in obj: # link already has it + size = obj['length'] + else: + size = os.lstat(filepath).st_size + ret = { 'sha1': obj['sha1'], 'sha256': obj['sha256'], 'sha1_git': obj['sha1_git'], 'length': size, 'perms': obj['perms'].value, 'type': obj['type'].value, } if max_content_size and size > max_content_size: if log: log.info('Skipping content %s, too large (%s > %s)' % (utils.hash_to_hex(obj['sha1_git']), size, max_content_size)) ret.update({'status': 'absent', 'reason': 'Content too large', 'origin': origin_id}) return ret + if 'data' in obj: # link already has it + data = obj['data'] + else: + data = open(filepath, 'rb').read() + ret.update({ - 'data': open(filepath, 'rb').read(), + 'data': data, 'status': 'visible' }) return ret # Map of type to swh types _entry_type_map = { GitType.TREE: 'dir', GitType.BLOB: 'file', GitType.COMM: 'rev', } def tree_to_directory(tree, objects, log=None): """Format a tree as a directory """ entries = [] for entry in objects[tree['path']]: entries.append({ 'type': _entry_type_map[entry['type']], 'perms': int(entry['perms'].value), 'name': entry['name'], 'target': entry['sha1_git'] }) return { 'id': tree['sha1_git'], 'entries': entries } def commit_to_revision(commit, objects, log=None): """Format a commit as a revision. """ upper_directory = objects[git.ROOT_TREE_KEY][0] return { 'id': commit['sha1_git'], 'date': to_datetime(commit['author_date']), 'date_offset': format_to_minutes(commit['author_offset']), 'committer_date': to_datetime(commit['committer_date']), 'committer_date_offset': format_to_minutes(commit['committer_offset']), 'type': commit['type'], 'directory': upper_directory['sha1_git'], 'message': commit['message'], 'author_name': commit['author_name'], 'author_email': commit['author_email'], 'committer_name': commit['committer_name'], 'committer_email': commit['committer_email'], 'synthetic': True, 'metadata': commit['metadata'], 'parents': [], } def annotated_tag_to_release(release, log=None): """Format a swh release. """ return { 'id': release['sha1_git'], 'revision': release['revision'], 'name': release['name'], 'comment': release['comment'], 'date': to_datetime(release['date']), 'date_offset': format_to_minutes(release['offset']), 'author_name': release['author_name'], 'author_email': release['author_email'], 'synthetic': True, } diff --git a/swh/loader/dir/tests/test_converters.py b/swh/loader/dir/tests/test_converters.py index b2d6aa4..fc13af8 100644 --- a/swh/loader/dir/tests/test_converters.py +++ b/swh/loader/dir/tests/test_converters.py @@ -1,272 +1,304 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import os import shutil import tempfile import unittest from nose.tools import istest from swh.loader.dir import converters from swh.loader.dir.git import git from swh.loader.dir.git.git import GitType, GitPerm def tmpfile_with_content(fromdir, contentfile): """Create a temporary file with content contentfile in directory fromdir. """ tmpfilepath = tempfile.mktemp( suffix='.swh', prefix='tmp-file-for-test', dir=fromdir) with open(tmpfilepath, 'wb') as f: f.write(contentfile) return tmpfilepath class TestConverters(unittest.TestCase): @classmethod def setupClass(cls): cls.tmpdir = tempfile.mkdtemp(prefix='test-swh-loader-dir.') @classmethod def tearDownClass(cls): shutil.rmtree(cls.tmpdir) @istest def format_to_minutes(self): self.assertEquals(converters.format_to_minutes('+0100'), 60) self.assertEquals(converters.format_to_minutes('-0200'), -120) self.assertEquals(converters.format_to_minutes('+1250'), 12*60+50) self.assertEquals(converters.format_to_minutes('+0000'), 0) self.assertEquals(converters.format_to_minutes('-0000'), 0) @istest def annotated_tag_to_release(self): # given release = { 'sha1_git': '123', 'revision': '456', 'name': 'some-release', 'comment': 'some-comment-on-release', 'date': 1444054085, 'offset': '-0300', 'author_name': 'someone', 'author_email': 'someone@whatelse.eu' } expected_release = { 'id': '123', 'revision': '456', 'name': 'some-release', 'comment': 'some-comment-on-release', 'date': datetime.datetime.fromtimestamp( 1444054085, tz=datetime.timezone.utc), 'date_offset': -180, 'author_name': 'someone', 'author_email': 'someone@whatelse.eu', 'synthetic': True, } # when actual_release = converters.annotated_tag_to_release(release) # then self.assertDictEqual(actual_release, expected_release) @istest def blob_to_content_visible_data(self): # given contentfile = b'temp file for testing blob to content conversion' tmpfilepath = tmpfile_with_content(self.tmpdir, contentfile) obj = { 'path': tmpfilepath, 'perms': GitPerm.BLOB, 'type': GitType.BLOB, 'sha1': 'some-sha1', 'sha256': 'some-sha256', 'sha1_git': 'some-sha1git', } expected_blob = { 'data': contentfile, 'length': len(contentfile), 'status': 'visible', 'sha1': 'some-sha1', 'sha256': 'some-sha256', 'sha1_git': 'some-sha1git', 'perms': GitPerm.BLOB.value, 'type': GitType.BLOB.value, } # when actual_blob = converters.blob_to_content(obj) # then self.assertEqual(actual_blob, expected_blob) @istest def blob_to_content_link(self): # given contentfile = b'temp file for testing blob to content conversion' tmpfilepath = tmpfile_with_content(self.tmpdir, contentfile) tmplinkpath = tempfile.mktemp(dir=self.tmpdir) os.symlink(tmpfilepath, tmplinkpath) - print(tmpfilepath, tmplinkpath) + obj = { 'path': tmplinkpath, 'perms': GitPerm.BLOB, 'type': GitType.BLOB, 'sha1': 'some-sha1', 'sha256': 'some-sha256', 'sha1_git': 'some-sha1git', } expected_blob = { 'data': contentfile, 'length': len(tmpfilepath), 'status': 'visible', 'sha1': 'some-sha1', 'sha256': 'some-sha256', 'sha1_git': 'some-sha1git', 'perms': GitPerm.BLOB.value, 'type': GitType.BLOB.value, } # when actual_blob = converters.blob_to_content(obj) # then self.assertEqual(actual_blob, expected_blob) + @istest + def blob_to_content_link_with_data_length_populated(self): + # given + tmplinkpath = tempfile.mktemp(dir=self.tmpdir) + obj = { + 'length': 10, # wrong for test purposes + 'data': 'something wrong', # again for test purposes + 'path': tmplinkpath, + 'perms': GitPerm.BLOB, + 'type': GitType.BLOB, + 'sha1': 'some-sha1', + 'sha256': 'some-sha256', + 'sha1_git': 'some-sha1git', + } + + expected_blob = { + 'length': 10, + 'data': 'something wrong', + 'status': 'visible', + 'sha1': 'some-sha1', + 'sha256': 'some-sha256', + 'sha1_git': 'some-sha1git', + 'perms': GitPerm.BLOB.value, + 'type': GitType.BLOB.value, + } + + # when + actual_blob = converters.blob_to_content(obj) + + # then + self.assertEqual(actual_blob, expected_blob) + @istest def blob_to_content2_absent_data(self): # given contentfile = b'temp file for testing blob to content conversion' tmpfilepath = tmpfile_with_content(self.tmpdir, contentfile) obj = { 'path': tmpfilepath, 'perms': GitPerm.BLOB, 'type': GitType.BLOB, 'sha1': 'some-sha1', 'sha256': 'some-sha256', 'sha1_git': 'some-sha1git', } expected_blob = { 'length': len(contentfile), 'status': 'absent', 'sha1': 'some-sha1', 'sha256': 'some-sha256', 'sha1_git': 'some-sha1git', 'perms': GitPerm.BLOB.value, 'type': GitType.BLOB.value, 'reason': 'Content too large', 'origin': 190 } # when actual_blob = converters.blob_to_content(obj, None, max_content_size=10, origin_id=190) # then self.assertEqual(actual_blob, expected_blob) @istest def tree_to_directory_no_entries(self): # given tree = { 'path': 'foo', 'sha1_git': b'tree_sha1_git' } objects = { 'foo': [{'type': GitType.TREE, 'perms': GitPerm.TREE, 'name': 'bar', 'sha1_git': b'sha1-target'}, {'type': GitType.BLOB, 'perms': GitPerm.BLOB, 'name': 'file-foo', 'sha1_git': b'file-foo-sha1-target'}] } expected_directory = { 'id': b'tree_sha1_git', 'entries': [{'type': 'dir', 'perms': int(GitPerm.TREE.value), 'name': 'bar', 'target': b'sha1-target'}, {'type': 'file', 'perms': int(GitPerm.BLOB.value), 'name': 'file-foo', 'target': b'file-foo-sha1-target'}] } # when actual_directory = converters.tree_to_directory(tree, objects) # then self.assertEqual(actual_directory, expected_directory) @istest def commit_to_revision(self): # given commit = { 'sha1_git': 'commit-git-sha1', 'author_date': 1444054085, 'author_offset': '+0000', 'committer_date': 1444054085, 'committer_offset': '-0000', 'type': 'tar', 'message': 'synthetic-message-input', 'author_name': 'author-name', 'author_email': 'author-email', 'committer_name': 'committer-name', 'committer_email': 'committer-email', 'metadata': {'checksums': {'sha1': b'sha1-as-bytes'}}, 'directory': 'targeted-tree-sha1', } objects = { git.ROOT_TREE_KEY: [{'sha1_git': 'targeted-tree-sha1'}] } expected_revision = { 'id': 'commit-git-sha1', 'date': datetime.datetime.fromtimestamp( 1444054085, tz=datetime.timezone.utc), 'date_offset': 0, 'committer_date': datetime.datetime.fromtimestamp( 1444054085, tz=datetime.timezone.utc), 'committer_date_offset': 0, 'type': 'tar', 'directory': 'targeted-tree-sha1', 'message': 'synthetic-message-input', 'author_name': 'author-name', 'author_email': 'author-email', 'committer_name': 'committer-name', 'committer_email': 'committer-email', 'synthetic': True, 'metadata': {'checksums': {'sha1': b'sha1-as-bytes'}}, 'parents': [], } # when actual_revision = converters.commit_to_revision(commit, objects) # then self.assertEquals(actual_revision, expected_revision)