# Patch summary (unified diff; the line breaks inside the hunks appear
# collapsed in this copy, the patch text below is otherwise untouched).
#
# swh/vault/cookers/base.py
#   * SKIPPED_MESSAGE, HIDDEN_MESSAGE and get_tar_bytes() move from above
#     BaseVaultCooker to below it. get_tar_bytes() is moved unchanged; the two
#     messages are reworded ("have not" -> "has not", trailing period added).
#   * New get_filtered_file_content(storage, file_data): returns
#     SKIPPED_MESSAGE when file_data['status'] is 'absent', HIDDEN_MESSAGE when
#     it is 'hidden', otherwise the 'data' field of the first result of
#     storage.content_get([file_data['sha1']]).
#   * DirectoryBuilder now creates every file through
#     get_filtered_file_content() + _create_file(); the helpers
#     _create_file_absent() / _create_file_hidden() are removed (both called
#     self._create_file(self, ...), i.e. passed `self` in the `path` position).
#
# swh/vault/cookers/revision_gitfast.py
#   * The blob content is obtained through get_filtered_file_content() instead
#     of calling storage.content_get() directly.
#
# swh/vault/tests/
#   * hash_content() becomes a module-level function in vault_testing.py and
#     the method of the same name is removed from BaseTestBackend.
#   * Two test_filtered_objects tests are added to test_cookers.py (one using
#     cook_extract_directory, one using cook_extract_revision_gitfast); both
#     set content.status with raw SQL through self.storage.db._cursor(None).
#
# NOTE(review): get_tar_bytes() returns tar_buffer.getbuffer() without ever
#   closing the TarFile it opened -- confirm whether consumers need the
#   end-of-archive blocks that TarFile.close() appends.
# NOTE(review): get_filtered_file_content() checks file_data['type'] with a
#   bare `assert`, which is skipped when Python runs with -O -- confirm this
#   is meant as a debug-only check.
diff --git a/swh/vault/cookers/base.py b/swh/vault/cookers/base.py --- a/swh/vault/cookers/base.py +++ b/swh/vault/cookers/base.py @@ -17,23 +17,6 @@ from swh.storage import get_storage -def get_tar_bytes(path, arcname=None): - path = Path(path) - if not arcname: - arcname = path.name - tar_buffer = io.BytesIO() - tar = tarfile.open(fileobj=tar_buffer, mode='w') - tar.add(str(path), arcname=arcname) - return tar_buffer.getbuffer() - - -SKIPPED_MESSAGE = (b'This content have not been retrieved in ' - b'Software Heritage archive due to its size') - - -HIDDEN_MESSAGE = (b'This content is hidden') - - class BaseVaultCooker(metaclass=abc.ABCMeta): """Abstract base class for the vault's bundle creators @@ -111,6 +94,47 @@ self.backend.send_all_notifications(self.obj_type, self.obj_id) +SKIPPED_MESSAGE = (b'This content has not been retrieved in the ' + b'Software Heritage archive due to its size.') + +HIDDEN_MESSAGE = (b'This content is hidden.') + + +def get_filtered_file_content(storage, file_data): + """Retrieve the file specified by file_data and apply filters for skipped + and missing contents. + + Args: + storage: the storage from which to retrieve the object + file_data: file entry descriptor as returned by directory_ls() + + Returns: + Bytes containing the specified content. The content will be replaced by + a specific message to indicate that the content could not be retrieved + (either due to privacy policy or because its size was too big for us to + archive it). 
+ """ + + assert file_data['type'] == 'file' + + if file_data['status'] == 'absent': + return SKIPPED_MESSAGE + elif file_data['status'] == 'hidden': + return HIDDEN_MESSAGE + else: + return list(storage.content_get([file_data['sha1']]))[0]['data'] + + +def get_tar_bytes(path, arcname=None): + path = Path(path) + if not arcname: + arcname = path.name + tar_buffer = io.BytesIO() + tar = tarfile.open(fileobj=tar_buffer, mode='w') + tar.add(str(path), arcname=arcname) + return tar_buffer.getbuffer() + + class DirectoryBuilder: """Creates a cooked directory from its sha1_git in the db. @@ -170,15 +194,8 @@ # Then create the files for file_data in file_datas: path = os.path.join(root, file_data['name']) - status = file_data['status'] - perms = file_data['perms'] - if status == 'absent': - self._create_file_absent(path) - elif status == 'hidden': - self._create_file_hidden(path) - else: - content = self._get_file_content(file_data['sha1']) - self._create_file(path, content, perms) + content = get_filtered_file_content(self.storage, file_data) + self._create_file(path, content, file_data['perms']) def _create_file(self, path, content, perms=0o100644): """Create the given file and fill it with content. @@ -203,26 +220,6 @@ content = list(self.storage.content_get([obj_id]))[0]['data'] return content - def _create_file_absent(self, path): - """Create a file that indicates a skipped content - - Create the given file but fill it with a specific content to - indicate that the content have not been retrieved by the - software heritage archive due to its size. - - """ - self._create_file(self, SKIPPED_MESSAGE) - - def _create_file_hidden(self, path): - """Create a file that indicates an hidden content - - Create the given file but fill it with a specific content to - indicate that the content could not be retrieved due to - privacy policy. 
- - """ - self._create_file(self, HIDDEN_MESSAGE) - def _create_bundle_content(self, path, hex_dir_id): """Create a bundle from the given directory diff --git a/swh/vault/cookers/revision_gitfast.py b/swh/vault/cookers/revision_gitfast.py --- a/swh/vault/cookers/revision_gitfast.py +++ b/swh/vault/cookers/revision_gitfast.py @@ -10,7 +10,7 @@ import time import zlib -from .base import BaseVaultCooker +from .base import BaseVaultCooker, get_filtered_file_content from swh.model import hashutil @@ -103,7 +103,7 @@ obj_id = file_data['sha1'] if obj_id in self.obj_done: return - content = list(self.storage.content_get([obj_id]))[0]['data'] + content = get_filtered_file_content(self.storage, file_data) yield fastimport.commands.BlobCommand( mark=self.mark(obj_id), data=content, diff --git a/swh/vault/tests/test_backend.py b/swh/vault/tests/test_backend.py --- a/swh/vault/tests/test_backend.py +++ b/swh/vault/tests/test_backend.py @@ -13,7 +13,7 @@ from swh.core.tests.db_testing import DbTestFixture from swh.model import hashutil from swh.storage.tests.storage_testing import StorageTestFixture -from swh.vault.tests.vault_testing import VaultTestFixture +from swh.vault.tests.vault_testing import VaultTestFixture, hash_content class BaseTestBackend(VaultTestFixture, StorageTestFixture, DbTestFixture): @@ -37,12 +37,8 @@ creation_delta_secs = (ts - now).total_seconds() self.assertLess(creation_delta_secs, tolerance_secs) - def hash_content(self, content): - obj_id = hashutil.hash_data(content)['sha1'] - return content, obj_id - def fake_cook(self, obj_type, result_content, sticky=False): - content, obj_id = self.hash_content(result_content) + content, obj_id = hash_content(result_content) with self.mock_cooking(): self.vault_backend.create_task(obj_type, obj_id, sticky) self.vault_backend.cache.add(obj_type, obj_id, b'content') diff --git a/swh/vault/tests/test_cookers.py b/swh/vault/tests/test_cookers.py --- a/swh/vault/tests/test_cookers.py +++ 
b/swh/vault/tests/test_cookers.py @@ -26,7 +26,8 @@ from swh.model.from_disk import Directory from swh.storage.tests.storage_testing import StorageTestFixture from swh.vault.cookers import DirectoryCooker, RevisionGitfastCooker -from swh.vault.tests.vault_testing import VaultTestFixture +from swh.vault.cookers.base import SKIPPED_MESSAGE, HIDDEN_MESSAGE +from swh.vault.tests.vault_testing import VaultTestFixture, hash_content class TestRepo: @@ -148,6 +149,39 @@ directory = Directory.from_disk(path=bytes(p)) self.assertEqual(obj_id_hex, hashutil.hash_to_hex(directory.hash)) + def test_filtered_objects(self): + repo = TestRepo() + with repo as rp: + file_1, id_1 = hash_content(b'test1') + file_2, id_2 = hash_content(b'test2') + file_3, id_3 = hash_content(b'test3') + + (rp / 'file').write_bytes(file_1) + (rp / 'hidden_file').write_bytes(file_2) + (rp / 'absent_file').write_bytes(file_3) + + c = repo.commit() + self.load(str(rp)) + + obj_id_hex = repo.repo[c].tree.decode() + obj_id = hashutil.hash_to_bytes(obj_id_hex) + + # FIXME: storage.content_update() should be changed to allow things + # like that + cur = self.storage.db._cursor(None) + cur.execute("""update content set status = 'visible' + where sha1 = %s""", (id_1,)) + cur.execute("""update content set status = 'hidden' + where sha1 = %s""", (id_2,)) + cur.execute("""update content set status = 'absent' + where sha1 = %s""", (id_3,)) + cur.close() + + with self.cook_extract_directory(obj_id) as p: + self.assertEqual((p / 'file').read_bytes(), b'test1') + self.assertEqual((p / 'hidden_file').read_bytes(), HIDDEN_MESSAGE) + self.assertEqual((p / 'absent_file').read_bytes(), SKIPPED_MESSAGE) + class TestRevisionGitfastCooker(BaseTestCookers, unittest.TestCase): def test_revision_simple(self): @@ -270,3 +304,36 @@ with self.cook_extract_revision_gitfast(obj_id) as (ert, p): self.assertEqual(ert.repo.refs[b'HEAD'].decode(), obj_id_hex) + + def test_filtered_objects(self): + repo = TestRepo() + with repo as rp: + 
file_1, id_1 = hash_content(b'test1') + file_2, id_2 = hash_content(b'test2') + file_3, id_3 = hash_content(b'test3') + + (rp / 'file').write_bytes(file_1) + (rp / 'hidden_file').write_bytes(file_2) + (rp / 'absent_file').write_bytes(file_3) + + repo.commit() + obj_id_hex = repo.repo.refs[b'HEAD'].decode() + obj_id = hashutil.hash_to_bytes(obj_id_hex) + self.load(str(rp)) + + # FIXME: storage.content_update() should be changed to allow things + # like that + cur = self.storage.db._cursor(None) + cur.execute("""update content set status = 'visible' + where sha1 = %s""", (id_1,)) + cur.execute("""update content set status = 'hidden' + where sha1 = %s""", (id_2,)) + cur.execute("""update content set status = 'absent' + where sha1 = %s""", (id_3,)) + cur.close() + + with self.cook_extract_revision_gitfast(obj_id) as (ert, p): + ert.checkout(b'HEAD') + self.assertEqual((p / 'file').read_bytes(), b'test1') + self.assertEqual((p / 'hidden_file').read_bytes(), HIDDEN_MESSAGE) + self.assertEqual((p / 'absent_file').read_bytes(), SKIPPED_MESSAGE) diff --git a/swh/vault/tests/vault_testing.py b/swh/vault/tests/vault_testing.py --- a/swh/vault/tests/vault_testing.py +++ b/swh/vault/tests/vault_testing.py @@ -5,6 +5,8 @@ import tempfile import pathlib + +from swh.model import hashutil from swh.vault.backend import VaultBackend @@ -61,3 +63,8 @@ def reset_vault_tables(self): excluded = {'dbversion'} self.reset_db_tables(self.TEST_VAULT_DB_NAME, excluded=excluded) + + +def hash_content(content): + obj_id = hashutil.hash_data(content)['sha1'] + return content, obj_id