diff --git a/PKG-INFO b/PKG-INFO index 1764905..bb5c3cd 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.loader.dir -Version: 0.0.10 +Version: 0.0.11 Summary: Software Heritage Directory Loader Home-page: https://forge.softwareheritage.org/diffusion/DLDDIR Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/swh.loader.dir.egg-info/PKG-INFO b/swh.loader.dir.egg-info/PKG-INFO index 1764905..bb5c3cd 100644 --- a/swh.loader.dir.egg-info/PKG-INFO +++ b/swh.loader.dir.egg-info/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.loader.dir -Version: 0.0.10 +Version: 0.0.11 Summary: Software Heritage Directory Loader Home-page: https://forge.softwareheritage.org/diffusion/DLDDIR Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/swh/loader/dir/converters.py b/swh/loader/dir/converters.py index 034ea54..0cfd945 100644 --- a/swh/loader/dir/converters.py +++ b/swh/loader/dir/converters.py @@ -1,148 +1,139 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Convert dir objects to dictionaries suitable for swh.storage""" import datetime +import os from swh.loader.dir.git.git import GitType from swh.loader.dir.git import git, utils def to_datetime(ts): """Convert a timestamp to utc datetime. """ return datetime.datetime.utcfromtimestamp(ts).replace( tzinfo=datetime.timezone.utc) def format_to_minutes(offset_str): """Convert a git string timezone format string (e.g +0200, -0310) to minutes. Args: offset_str: a string representing an offset. Returns: A positive or negative number of minutes of such input """ sign = offset_str[0] hours = int(offset_str[1:3]) minutes = int(offset_str[3:]) + (hours * 60) return minutes if sign == '+' else -1 * minutes def blob_to_content(obj, log=None, max_content_size=None, origin_id=None): - if 'data' not in obj: - filepath = obj['path'] - content_raw, length = utils._read_raw(filepath) - obj.update({'data': content_raw, - 'length': length}) - return _blob_to_content(obj, log, max_content_size, origin_id) - - -def _blob_to_content(obj, log=None, - max_content_size=None, - origin_id=None): - """Convert to a compliant swh content. + """Convert obj to a swh storage content. """ - size = obj['length'] + filepath = obj['path'] + size = os.path.getsize(filepath) ret = { 'sha1': obj['sha1'], 'sha256': obj['sha256'], 'sha1_git': obj['sha1_git'], 'length': size, 'perms': obj['perms'].value, 'type': obj['type'].value, } if max_content_size and size > max_content_size: if log: log.info('Skipping content %s, too large (%s > %s)' % (utils.hash_to_hex(obj['sha1_git']), size, max_content_size)) ret.update({'status': 'absent', 'reason': 'Content too large', 'origin': origin_id}) return ret ret.update({ - 'status': 'visible', - 'data': obj['data'], + 'data': open(filepath, 'rb').read(), + 'status': 'visible' }) return ret # Map of type to swh types _entry_type_map = { GitType.TREE: 'dir', GitType.BLOB: 'file', GitType.COMM: 'rev', } def tree_to_directory(tree, objects, log=None): """Format a tree as a directory """ entries = [] for entry in objects[tree['path']]: entries.append({ 'type': _entry_type_map[entry['type']], 'perms': int(entry['perms'].value), 'name': entry['name'], 'target': entry['sha1_git'] }) return { 'id': tree['sha1_git'], 'entries': entries } def commit_to_revision(commit, objects, log=None): """Format a commit as a revision. """ upper_directory = objects[git.ROOT_TREE_KEY][0] return { 'id': commit['sha1_git'], 'date': to_datetime(commit['author_date']), 'date_offset': format_to_minutes(commit['author_offset']), 'committer_date': to_datetime(commit['committer_date']), 'committer_date_offset': format_to_minutes(commit['committer_offset']), 'type': commit['type'], 'directory': upper_directory['sha1_git'], 'message': commit['message'], 'author_name': commit['author_name'], 'author_email': commit['author_email'], 'committer_name': commit['committer_name'], 'committer_email': commit['committer_email'], 'synthetic': True, 'metadata': commit['metadata'], 'parents': [], } def annotated_tag_to_release(release, log=None): """Format a swh release. """ return { 'id': release['sha1_git'], 'revision': release['revision'], 'name': release['name'], 'comment': release['comment'], 'date': to_datetime(release['date']), 'date_offset': format_to_minutes(release['offset']), 'author_name': release['author_name'], 'author_email': release['author_email'], 'synthetic': True, } diff --git a/swh/loader/dir/git/utils.py b/swh/loader/dir/git/utils.py index 04afe0d..b1f8037 100644 --- a/swh/loader/dir/git/utils.py +++ b/swh/loader/dir/git/utils.py @@ -1,121 +1,98 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import hashlib import os from io import BytesIO from swh.core import hashutil hashfile = hashutil.hashfile hash_to_hex = hashutil.hash_to_hex hex_to_hash = hashutil.hex_to_hash def _new_hash(header_type, length): """Initialize a digest object (as returned by python's hashlib) for the git sha1 algorithm. This is in charge of pre-computing the needed header for git. Args: header_type: a git sha1 type ('blob', 'tree', 'commit', 'tag') length: Length of content to hash. Could be None if when hashing with sha1 and sha256 Returns: A digest object Raises: ValueError if header_type is not one of 'blob', 'commit', 'tree', 'tag' """ h = hashlib.new('sha1') if header_type not in ('blob', 'commit', 'tree', 'tag'): raise ValueError( 'Only supported types are blob, commit, tree, tag') h.update(('%s %d\0' % (header_type, length)).encode('ascii')) return h def _hash_file_obj(f, header_type, length): """hash (git sha1) the content of a file-like object f with header_type and length. Returns: A dictionary with 'sha1_git' as key and value the computed sha1_git. Raises: ValueError if header_type is not one of 'blob', 'commit', 'tree', 'tag' """ h = _new_hash(header_type, length) while True: chunk = f.read(hashutil.HASH_BLOCK_SIZE) if not chunk: break h.update(chunk) return {'sha1_git': h.digest()} def hashdata(data, header_type): """Hash data as git sha1 with header_type. Returns: A dictionary with 'sha1_git' as key and value the computed sha1_git. Raises: ValueError if header_type is not one of 'blob', 'commit', 'tree', 'tag' """ buf = BytesIO(data) return _hash_file_obj(buf, header_type, len(data)) -def _read_raw(filepath): - """Read filepath's raw content and returns it. - - Args: - filepath: absolute path to an existing file. - - Returns: - raw content in bytes + its length - - """ - content_raw = b'' - length = 0 - with open(filepath, 'rb') as f: - while True: - chunk = f.read(hashutil.HASH_BLOCK_SIZE) - if not chunk: - break - content_raw += chunk - length += len(chunk) - - return content_raw, length - - def hashlink(linkpath): """Compute hashes for a link. Args: linkpath: the absolute path name to a symbolic link. Returns: dictionary with sha1_git as key and the actual binary sha1 as value. """ raw_data = os.readlink(linkpath) hashes = hashutil.hashdata(raw_data) hashes.update({ 'data': raw_data, 'length': len(raw_data) }) return hashes diff --git a/swh/loader/dir/tests/test_converters.py b/swh/loader/dir/tests/test_converters.py index f071365..0949a77 100644 --- a/swh/loader/dir/tests/test_converters.py +++ b/swh/loader/dir/tests/test_converters.py @@ -1,204 +1,237 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import unittest import datetime +import shutil +import tempfile +import unittest from nose.tools import istest from swh.loader.dir import converters from swh.loader.dir.git import git from swh.loader.dir.git.git import GitType, GitPerm +def tmpfile_with_content(fromdir, contentfile): + """Create a temporary file with content contentfile in directory fromdir. + + """ + tmpfilepath = tempfile.mktemp( + suffix='.swh', + prefix='tmp-file-for-test', + dir=fromdir) + + with open(tmpfilepath, 'wb') as f: + f.write(contentfile) + + return tmpfilepath + + class TestConverters(unittest.TestCase): + + @classmethod + def setupClass(cls): + cls.tmpdir = tempfile.mkdtemp(prefix='test-swh-loader-dir.') + + @classmethod + def tearDownClass(cls): + shutil.rmtree(cls.tmpdir) + @istest def format_to_minutes(self): self.assertEquals(converters.format_to_minutes('+0100'), 60) self.assertEquals(converters.format_to_minutes('-0200'), -120) self.assertEquals(converters.format_to_minutes('+1250'), 12*60+50) self.assertEquals(converters.format_to_minutes('+0000'), 0) self.assertEquals(converters.format_to_minutes('-0000'), 0) @istest def annotated_tag_to_release(self): # given release = { 'sha1_git': '123', 'revision': '456', 'name': 'some-release', 'comment': 'some-comment-on-release', 'date': 1444054085, 'offset': '-0300', 'author_name': 'someone', 'author_email': 'someone@whatelse.eu' } expected_release = { 'id': '123', 'revision': '456', 'name': 'some-release', 'comment': 'some-comment-on-release', 'date': datetime.datetime.fromtimestamp( 1444054085, tz=datetime.timezone.utc), 'date_offset': -180, 'author_name': 'someone', 'author_email': 'someone@whatelse.eu', 'synthetic': True, } # when actual_release = converters.annotated_tag_to_release(release) # then self.assertDictEqual(actual_release, expected_release) @istest - def blob_to_content_visible(self): + def blob_to_content_visible_data(self): + # given + contentfile = b'temp file for testing blob to content conversion' + tmpfilepath = tmpfile_with_content(self.tmpdir, contentfile) + obj = { - 'length': 9, - 'data': b'some-data', - 'sha1': b'sha1', - 'sha1_git': b'sha1-git', - 'sha256': b'sha256', + 'path': tmpfilepath, 'perms': GitPerm.BLOB, - 'type': GitType.BLOB + 'type': GitType.BLOB, + 'sha1': 'some-sha1', + 'sha256': 'some-sha256', + 'sha1_git': 'some-sha1git', } - expected_content = { - 'length': 9, - 'data': b'some-data', - 'sha1': b'sha1', - 'sha1_git': b'sha1-git', - 'sha256': b'sha256', + expected_blob = { + 'data': contentfile, + 'length': len(contentfile), + 'status': 'visible', + 'sha1': 'some-sha1', + 'sha256': 'some-sha256', + 'sha1_git': 'some-sha1git', 'perms': GitPerm.BLOB.value, 'type': GitType.BLOB.value, - 'status': 'visible' } # when - actual_content = converters._blob_to_content(obj) + actual_blob = converters.blob_to_content(obj) # then - self.assertEqual(actual_content, expected_content) + self.assertEqual(actual_blob, expected_blob) @istest - def blob_to_content_absent(self): + def blob_to_content2_absent_data(self): + # given + contentfile = b'temp file for testing blob to content conversion' + tmpfilepath = tmpfile_with_content(self.tmpdir, contentfile) + obj = { - 'length': 9, - 'data': b'some-data', - 'sha1': b'sha1', - 'sha1_git': b'sha1-git', - 'sha256': b'sha256', + 'path': tmpfilepath, 'perms': GitPerm.BLOB, - 'type': GitType.BLOB + 'type': GitType.BLOB, + 'sha1': 'some-sha1', + 'sha256': 'some-sha256', + 'sha1_git': 'some-sha1git', } - expected_content = { - 'length': 9, - 'sha1': b'sha1', - 'sha1_git': b'sha1-git', - 'sha256': b'sha256', + expected_blob = { + 'length': len(contentfile), + 'status': 'absent', + 'sha1': 'some-sha1', + 'sha256': 'some-sha256', + 'sha1_git': 'some-sha1git', 'perms': GitPerm.BLOB.value, 'type': GitType.BLOB.value, - 'status': 'absent', 'reason': 'Content too large', - 'origin': 3} + 'origin': 190 + } # when - actual_content = converters._blob_to_content(obj, - max_content_size=5, - origin_id=3) + actual_blob = converters.blob_to_content(obj, None, + max_content_size=10, + origin_id=190) # then - self.assertDictEqual(actual_content, expected_content) + self.assertEqual(actual_blob, expected_blob) @istest def tree_to_directory_no_entries(self): # given tree = { 'path': 'foo', 'sha1_git': b'tree_sha1_git' } objects = { 'foo': [{'type': GitType.TREE, 'perms': GitPerm.TREE, 'name': 'bar', 'sha1_git': b'sha1-target'}, {'type': GitType.BLOB, 'perms': GitPerm.BLOB, 'name': 'file-foo', 'sha1_git': b'file-foo-sha1-target'}] } expected_directory = { 'id': b'tree_sha1_git', 'entries': [{'type': 'dir', 'perms': int(GitPerm.TREE.value), 'name': 'bar', 'target': b'sha1-target'}, {'type': 'file', 'perms': int(GitPerm.BLOB.value), 'name': 'file-foo', 'target': b'file-foo-sha1-target'}] } # when actual_directory = converters.tree_to_directory(tree, objects) # then self.assertEqual(actual_directory, expected_directory) @istest def commit_to_revision(self): # given commit = { 'sha1_git': 'commit-git-sha1', 'author_date': 1444054085, 'author_offset': '+0000', 'committer_date': 1444054085, 'committer_offset': '-0000', 'type': 'tar', 'message': 'synthetic-message-input', 'author_name': 'author-name', 'author_email': 'author-email', 'committer_name': 'committer-name', 'committer_email': 'committer-email', 'metadata': {'checksums': {'sha1': b'sha1-as-bytes'}}, 'directory': 'targeted-tree-sha1', } objects = { git.ROOT_TREE_KEY: [{'sha1_git': 'targeted-tree-sha1'}] } expected_revision = { 'id': 'commit-git-sha1', 'date': datetime.datetime.fromtimestamp( 1444054085, tz=datetime.timezone.utc), 'date_offset': 0, 'committer_date': datetime.datetime.fromtimestamp( 1444054085, tz=datetime.timezone.utc), 'committer_date_offset': 0, 'type': 'tar', 'directory': 'targeted-tree-sha1', 'message': 'synthetic-message-input', 'author_name': 'author-name', 'author_email': 'author-email', 'committer_name': 'committer-name', 'committer_email': 'committer-email', 'synthetic': True, 'metadata': {'checksums': {'sha1': b'sha1-as-bytes'}}, 'parents': [], } # when actual_revision = converters.commit_to_revision(commit, objects) # then self.assertEquals(actual_revision, expected_revision) diff --git a/version.txt b/version.txt index 58a9f8a..6774e97 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.10-0-g78274a1 \ No newline at end of file +v0.0.11-0-gc7afc41 \ No newline at end of file