diff --git a/debian/control b/debian/control index eec09f5..b28478b 100644 --- a/debian/control +++ b/debian/control @@ -1,22 +1,22 @@ Source: swh-loader-core Maintainer: Software Heritage developers <swh-devel@inria.fr> Section: python Priority: optional Build-Depends: debhelper (>= 9), dh-python, python3-all, python3-nose, python3-setuptools, python3-swh.core, python3-swh.storage (>= 0.0.76~), - python3-swh.model (>= 0.0.11~), + python3-swh.model (>= 0.0.15~), python3-retrying, python3-vcversioner Standards-Version: 3.9.6 Homepage: https://forge.softwareheritage.org/diffusion/60/ Package: python3-swh.loader.core Architecture: all Depends: python3-swh.core, python3-swh.storage (>= 0.0.76~), - python3-swh.model (>= 0.0.11~), + python3-swh.model (>= 0.0.15~), ${misc:Depends}, ${python3:Depends} Description: Software Heritage Loader Core diff --git a/requirements-swh.txt b/requirements-swh.txt index 3d7ee25..e065aad 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,3 +1,3 @@ swh.core swh.storage >= 0.0.76 -swh.model >= 0.0.11 +swh.model >= 0.0.15 diff --git a/swh/loader/core/converters.py b/swh/loader/core/converters.py index 780825f..7e5bcfe 100644 --- a/swh/loader/core/converters.py +++ b/swh/loader/core/converters.py @@ -1,152 +1,154 @@ -# Copyright (C) 2015-2016 The Software Heritage developers +# Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Convert objects to dictionaries suitable for swh.storage""" import os from swh.model.hashutil import hash_to_hex from swh.model import git def blob_to_content(obj, log=None, max_content_size=None, origin_id=None): """Convert obj to a swh storage content. Note: - If obj represents a link, the length and data are already provided so we use them directly. - 'data' is returned only if max_content_size is not reached. 
Returns: obj converted to content as a dictionary. """ filepath = obj['path'] if 'length' in obj: # link already has it size = obj['length'] else: size = os.lstat(filepath).st_size ret = { 'sha1': obj['sha1'], 'sha256': obj['sha256'], 'sha1_git': obj['sha1_git'], + 'blake2s256': obj['blake2s256'], 'length': size, 'perms': obj['perms'].value, 'type': obj['type'].value, } if max_content_size and size > max_content_size: if log: log.info('Skipping content %s, too large (%s > %s)' % (hash_to_hex(obj['sha1_git']), size, max_content_size)) ret.update({'status': 'absent', 'reason': 'Content too large', 'origin': origin_id}) return ret if 'data' in obj: # link already has it data = obj['data'] else: data = open(filepath, 'rb').read() ret.update({ 'data': data, 'status': 'visible' }) return ret # Map of type to swh types _entry_type_map = { git.GitType.TREE: 'dir', git.GitType.BLOB: 'file', git.GitType.COMM: 'rev', } def tree_to_directory(tree, log=None): """Format a tree as a directory """ entries = [] for entry in tree['children']: entries.append({ 'type': _entry_type_map[entry['type']], 'perms': int(entry['perms'].value), 'name': entry['name'], 'target': entry['sha1_git'] }) return { 'id': tree['sha1_git'], 'entries': entries } def ref_to_occurrence(ref): """Format a reference as an occurrence""" occ = ref.copy() if 'branch' in ref: branch = ref['branch'] if isinstance(branch, str): occ['branch'] = branch.encode('utf-8') else: occ['branch'] = branch return occ def shallow_blob(obj): """Convert a full swh content/blob to just what's needed by swh-storage for filtering. Returns: A shallow copy of a full swh content/blob object. """ return { 'sha1': obj['sha1'], 'sha256': obj['sha256'], 'sha1_git': obj['sha1_git'], + 'blake2s256': obj['blake2s256'], 'length': obj['length'] } def shallow_tree(tree): """Convert a full swh directory/tree to just what's needed by swh-storage for filtering. Returns: A shallow copy of a full swh directory/tree object. 
""" return tree['sha1_git'] def shallow_commit(commit): """Convert a full swh revision/commit to just what's needed by swh-storage for filtering. Returns: A shallow copy of a full swh revision/commit object. """ return commit['id'] def shallow_tag(tag): """Convert a full swh release/tag to just what's needed by swh-storage for filtering. Returns: A shallow copy of a full swh release/tag object. """ return tag['id'] diff --git a/swh/loader/core/tests/test_converters.py b/swh/loader/core/tests/test_converters.py index 1425dfb..8035e14 100644 --- a/swh/loader/core/tests/test_converters.py +++ b/swh/loader/core/tests/test_converters.py @@ -1,309 +1,319 @@ -# Copyright (C) 2015 The Software Heritage developers +# Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import shutil import tempfile import unittest from nose.tools import istest from swh.loader.core import converters from swh.model import git def tmpfile_with_content(fromdir, contentfile): """Create a temporary file with content contentfile in directory fromdir. 
""" tmpfilepath = tempfile.mktemp( suffix='.swh', prefix='tmp-file-for-test', dir=fromdir) with open(tmpfilepath, 'wb') as f: f.write(contentfile) return tmpfilepath class TestConverters(unittest.TestCase): @classmethod def setUpClass(cls): super().setUpClass() cls.tmpdir = tempfile.mkdtemp(prefix='test-swh-loader-dir.') @classmethod def tearDownClass(cls): shutil.rmtree(cls.tmpdir) super().tearDownClass() @istest def blob_to_content_visible_data(self): # given contentfile = b'temp file for testing blob to content conversion' tmpfilepath = tmpfile_with_content(self.tmpdir, contentfile) obj = { 'path': tmpfilepath, 'perms': git.GitPerm.BLOB, 'type': git.GitType.BLOB, 'sha1': 'some-sha1', 'sha256': 'some-sha256', + 'blake2s256': 'some-blak2s256', 'sha1_git': 'some-sha1git', } expected_blob = { 'data': contentfile, 'length': len(contentfile), 'status': 'visible', 'sha1': 'some-sha1', 'sha256': 'some-sha256', + 'blake2s256': 'some-blak2s256', 'sha1_git': 'some-sha1git', 'perms': git.GitPerm.BLOB.value, 'type': git.GitType.BLOB.value, } # when actual_blob = converters.blob_to_content(obj) # then self.assertEqual(actual_blob, expected_blob) @istest def blob_to_content_link(self): # given contentfile = b'temp file for testing blob to content conversion' tmpfilepath = tmpfile_with_content(self.tmpdir, contentfile) tmplinkpath = tempfile.mktemp(dir=self.tmpdir) os.symlink(tmpfilepath, tmplinkpath) obj = { 'path': tmplinkpath, 'perms': git.GitPerm.BLOB, 'type': git.GitType.BLOB, 'sha1': 'some-sha1', 'sha256': 'some-sha256', 'sha1_git': 'some-sha1git', + 'blake2s256': 'some-blak2s256', } expected_blob = { 'data': contentfile, 'length': len(tmpfilepath), 'status': 'visible', 'sha1': 'some-sha1', 'sha256': 'some-sha256', 'sha1_git': 'some-sha1git', + 'blake2s256': 'some-blak2s256', 'perms': git.GitPerm.BLOB.value, 'type': git.GitType.BLOB.value, } # when actual_blob = converters.blob_to_content(obj) # then self.assertEqual(actual_blob, expected_blob) @istest def 
blob_to_content_link_with_data_length_populated(self): # given tmplinkpath = tempfile.mktemp(dir=self.tmpdir) obj = { 'length': 10, # wrong for test purposes 'data': 'something wrong', # again for test purposes 'path': tmplinkpath, 'perms': git.GitPerm.BLOB, 'type': git.GitType.BLOB, 'sha1': 'some-sha1', 'sha256': 'some-sha256', 'sha1_git': 'some-sha1git', + 'blake2s256': 'some-blak2s256', } expected_blob = { 'length': 10, 'data': 'something wrong', 'status': 'visible', 'sha1': 'some-sha1', 'sha256': 'some-sha256', 'sha1_git': 'some-sha1git', + 'blake2s256': 'some-blak2s256', 'perms': git.GitPerm.BLOB.value, 'type': git.GitType.BLOB.value, } # when actual_blob = converters.blob_to_content(obj) # then self.assertEqual(actual_blob, expected_blob) @istest def blob_to_content2_absent_data(self): # given contentfile = b'temp file for testing blob to content conversion' tmpfilepath = tmpfile_with_content(self.tmpdir, contentfile) obj = { 'path': tmpfilepath, 'perms': git.GitPerm.BLOB, 'type': git.GitType.BLOB, 'sha1': 'some-sha1', 'sha256': 'some-sha256', 'sha1_git': 'some-sha1git', + 'blake2s256': 'some-blak2s256', } expected_blob = { 'length': len(contentfile), 'status': 'absent', 'sha1': 'some-sha1', 'sha256': 'some-sha256', 'sha1_git': 'some-sha1git', + 'blake2s256': 'some-blak2s256', 'perms': git.GitPerm.BLOB.value, 'type': git.GitType.BLOB.value, 'reason': 'Content too large', 'origin': 190 } # when actual_blob = converters.blob_to_content(obj, None, max_content_size=10, origin_id=190) # then self.assertEqual(actual_blob, expected_blob) @istest def tree_to_directory_no_entries(self): # given tree = { 'path': 'foo', 'sha1_git': b'tree_sha1_git', 'children': [{'type': git.GitType.TREE, 'perms': git.GitPerm.TREE, 'name': 'bar', 'sha1_git': b'sha1-target'}, {'type': git.GitType.BLOB, 'perms': git.GitPerm.BLOB, 'name': 'file-foo', 'sha1_git': b'file-foo-sha1-target'}] } expected_directory = { 'id': b'tree_sha1_git', 'entries': [{'type': 'dir', 'perms': 
int(git.GitPerm.TREE.value), 'name': 'bar', 'target': b'sha1-target'}, {'type': 'file', 'perms': int(git.GitPerm.BLOB.value), 'name': 'file-foo', 'target': b'file-foo-sha1-target'}] } # when actual_directory = converters.tree_to_directory(tree) # then self.assertEqual(actual_directory, expected_directory) @istest def ref_to_occurrence_1(self): # when actual_occ = converters.ref_to_occurrence({ 'id': 'some-id', 'branch': 'some/branch' }) # then self.assertEquals(actual_occ, { 'id': 'some-id', 'branch': b'some/branch' }) @istest def ref_to_occurrence_2(self): # when actual_occ = converters.ref_to_occurrence({ 'id': 'some-id', 'branch': b'some/branch' }) # then self.assertEquals(actual_occ, { 'id': 'some-id', 'branch': b'some/branch' }) @istest def shallow_blob(self): # when actual_blob = converters.shallow_blob({ 'length': 1451, 'sha1_git': b'\xd1\xdd\x9a@\xeb\xf6!\x99\xd4[S\x05\xa8Y\xa3\x80\xa7\xb1;\x9c', 'name': b'LDPCL', 'type': b'blob', 'sha256': b'\xe6it!\x99\xb37UT\x8f\x0e\x8f\xd7o\x92"\xce\xa3\x1d\xd2\xe5D>M\xaaj/\x03\x138\xad\x1b', # noqa 'perms': b'100644', 'sha1': b'.\x18Y\xd6M\x8c\x9a\xa4\xe1\xf1\xc7\x95\x082\xcf\xc9\xd8\nV)', + 'blake2s256': 'some-blak2s256', 'path': b'/tmp/tmp.c86tq5o9.swh.loader/pkg-doc-linux/copyrights/non-free/LDPCL' # noqa }) # then self.assertEqual(actual_blob, { 'sha1': b'.\x18Y\xd6M\x8c\x9a\xa4\xe1\xf1\xc7\x95\x082\xcf\xc9\xd8\nV)', 'sha1_git': b'\xd1\xdd\x9a@\xeb\xf6!\x99\xd4[S\x05\xa8Y\xa3\x80\xa7\xb1;\x9c', 'sha256': b'\xe6it!\x99\xb37UT\x8f\x0e\x8f\xd7o\x92"\xce\xa3\x1d\xd2\xe5D>M\xaaj/\x03\x138\xad\x1b', # noqa + 'blake2s256': 'some-blak2s256', 'length': 1451, }) @istest def shallow_tree(self): # when actual_shallow_tree = converters.shallow_tree({ 'length': 1451, 'sha1_git': b'tree-id', 'type': b'tree', 'sha256': b'\xe6it!\x99\xb37UT\x8f\x0e\x8f\xd7o\x92"\xce\xa3\x1d\xd2\xe5D>M\xaaj/\x03\x138\xad\x1b', # noqa 'perms': b'100644', 'sha1': b'.\x18Y\xd6M\x8c\x9a\xa4\xe1\xf1\xc7\x95\x082\xcf\xc9\xd8\nV)', }) # then 
self.assertEqual(actual_shallow_tree, b'tree-id') @istest def shallow_commit(self): # when actual_shallow_commit = converters.shallow_commit({ 'sha1_git': b'\xd1\xdd\x9a@\xeb\xf6!\x99\xd4[S\x05\xa8Y\xa3\x80\xa7\xb1;\x9c', 'type': b'commit', 'id': b'let-me-see-some-id', }) # then self.assertEqual(actual_shallow_commit, b'let-me-see-some-id') @istest def shallow_tag(self): # when actual_shallow_tag = converters.shallow_tag({ 'sha1': b'\xd1\xdd\x9a@\xeb\xf6!\x99\xd4[S\x05\xa8Y\xa3\x80\xa7\xb1;\x9c', 'type': b'tag', 'id': b'this-is-not-the-id-you-are-looking-for', }) # then self.assertEqual(actual_shallow_tag, b'this-is-not-the-id-you-are-looking-for') # noqa