diff --git a/swh/model/from_disk.py b/swh/model/from_disk.py
--- a/swh/model/from_disk.py
+++ b/swh/model/from_disk.py
@@ -7,15 +7,36 @@
 import os
 import stat
 
-from typing import List
+import attr
+from typing import List, Optional
 
-from .hashutil import MultiHash, HASH_BLOCK_SIZE
+from .hashutil import MultiHash
 from .merkle import MerkleLeaf, MerkleNode
 from .identifiers import (
     directory_entry_sort_key, directory_identifier,
     identifier_to_bytes as id_to_bytes,
     identifier_to_str as id_to_str,
 )
+from . import model
+
+
+@attr.s
+class DiskBackedContent(model.Content):
+    """Subclass of Content, which allows lazy-loading data from the disk."""
+    path = attr.ib(type=Optional[bytes], default=None)
+
+    def __attrs_post_init__(self):
+        if self.path is None:
+            raise TypeError('path must not be None.')
+
+    def with_data(self) -> model.Content:
+        args = self.to_dict()
+        del args['path']
+        assert self.path is not None
+        with open(self.path, 'rb') as fd:
+            return model.Content.from_dict({
+                **args,
+                'data': fd.read()})
 
 
 class DentryPerms(enum.IntEnum):
@@ -94,8 +115,7 @@
 
     @classmethod
     def from_file(
-            cls, *, path, data=False, save_path=False,
-            max_content_length=None):
+            cls, *, path, max_content_length=None):
         """Compute the Software Heritage content entry corresponding to an
         on-disk file.
 
@@ -104,9 +124,6 @@
         - using the content as a directory entry in a directory
 
         Args:
-          path (bytes): path to the file for which we're computing the
-            content entry
-          data (bool): add the file data to the entry
           save_path (bool): add the file path to the entry
           max_content_length (Optional[int]): if given, all contents larger
             than this will be skipped.
@@ -137,36 +154,23 @@
 
         if too_large:
             skip_reason = 'Content too large'
-        elif not data:
-            skip_reason = 'Skipping file content'
         else:
             skip_reason = None
 
+        hashes = MultiHash.from_path(path).digest()
         if skip_reason:
             ret = {
-                **MultiHash.from_path(path).digest(),
+                **hashes,
                 'status': 'absent',
                 'reason': skip_reason,
             }
         else:
-            h = MultiHash(length=length)
-            chunks = []
-            with open(path, 'rb') as fobj:
-                while True:
-                    chunk = fobj.read(HASH_BLOCK_SIZE)
-                    if not chunk:
-                        break
-                    h.update(chunk)
-                    chunks.append(chunk)
-
             ret = {
-                **h.digest(),
+                **hashes,
                 'status': 'visible',
-                'data': b''.join(chunks),
             }
 
-        if save_path:
-            ret['path'] = path
+        ret['path'] = path
         ret['perms'] = mode_to_perms(mode)
         ret['length'] = length
 
@@ -179,6 +183,18 @@
     def compute_hash(self):
         return self.data['sha1_git']
 
+    def to_model(self) -> model.BaseContent:
+        """Builds a `model.BaseContent` object based on this leaf."""
+        data = self.get_data().copy()
+        data.pop('perms', None)
+        if data['status'] == 'absent':
+            data.pop('path', None)
+            return model.SkippedContent.from_dict(data)
+        elif 'data' in data:
+            return model.Content.from_dict(data)
+        else:
+            return DiskBackedContent.from_dict(data)
+
 
 def accept_all_directories(dirname, entries):
     """Default filter for :func:`Directory.from_disk` accepting all
@@ -250,7 +266,7 @@
     type = 'directory'
 
     @classmethod
-    def from_disk(cls, *, path, data=False, save_path=False,
+    def from_disk(cls, *, path,
                   dir_filter=accept_all_directories,
                   max_content_length=None):
         """Compute the Software Heritage objects for a given directory tree
@@ -278,8 +294,7 @@
                 path = os.path.join(root, name)
                 if not os.path.isdir(path) or os.path.islink(path):
                     content = Content.from_file(
-                        path=path, data=data, save_path=save_path,
-                        max_content_length=max_content_length)
+                        path=path, max_content_length=max_content_length)
                     entries[name] = content
                 else:
                     if dir_filter(name, dirs[path].entries):
@@ -338,6 +353,11 @@
     def compute_hash(self):
         return id_to_bytes(directory_identifier({'entries': self.entries}))
 
+    def to_model(self) -> model.Directory:
+        """Builds a `model.Directory` object based on this node;
+        ignoring its children."""
+        return model.Directory.from_dict(self.get_data())
+
     def __getitem__(self, key):
         if not isinstance(key, bytes):
             raise ValueError('Can only get a bytes from Directory')
diff --git a/swh/model/model.py b/swh/model/model.py
--- a/swh/model/model.py
+++ b/swh/model/model.py
@@ -18,6 +18,13 @@
 )
 
 from .hashutil import DEFAULT_ALGORITHMS, hash_to_bytes
 
+
+class MissingData(Exception):
+    """Raised by `Content.with_data` when it has no way of fetching the
+    data (but not when fetching the data fails)."""
+    pass
+
+
 SHA1_SIZE = 20
 
 # TODO: Limit this to 20 bytes
@@ -362,6 +369,10 @@
 
 @attr.s(frozen=True)
 class BaseContent(BaseModel):
+    status = attr.ib(
+        type=str,
+        validator=attr.validators.in_(['visible', 'hidden', 'absent']))
+
     def to_dict(self):
         content = super().to_dict()
         if content['ctime'] is None:
@@ -402,8 +413,8 @@
         type=str,
         default='visible',
         validator=attr.validators.in_(['visible', 'hidden']))
-    data = attr.ib(type=Optional[bytes],
-                   default=None)
+
+    data = attr.ib(type=Optional[bytes], default=None)
 
     ctime = attr.ib(type=Optional[datetime.datetime],
                     default=None)
@@ -424,6 +435,16 @@
     def from_dict(cls, d):
         return super().from_dict(d, use_subclass=False)
 
+    def with_data(self) -> 'Content':
+        """Loads the `data` attribute; meaning that it is guaranteed not to
+        be None after this call.
+
+        This call is almost a no-op, but subclasses may overload this method
+        to lazy-load data (eg. from disk or objstorage)."""
+        if self.data is None:
+            raise MissingData('Content data is None.')
+        return self
+
 
 @attr.s(frozen=True)
 class SkippedContent(BaseContent):
@@ -432,7 +453,7 @@
     sha256 = attr.ib(type=Optional[bytes])
     blake2s256 = attr.ib(type=Optional[bytes])
 
-    length = attr.ib(type=int)
+    length = attr.ib(type=Optional[int])
 
     status = attr.ib(
         type=str,
diff --git a/swh/model/tests/test_from_disk.py b/swh/model/tests/test_from_disk.py
--- a/swh/model/tests/test_from_disk.py
+++ b/swh/model/tests/test_from_disk.py
@@ -12,8 +12,11 @@
 from typing import ClassVar, Optional
 
 from swh.model import from_disk
-from swh.model.from_disk import Content, DentryPerms, Directory
+from swh.model.from_disk import (
+    Content, DentryPerms, Directory, DiskBackedContent
+)
 from swh.model.hashutil import DEFAULT_ALGORITHMS, hash_to_bytes, hash_to_hex
+from swh.model import model
 
 TEST_DATA = os.path.join(os.path.dirname(__file__), 'data')
@@ -48,6 +51,57 @@
             self.assertEqual(perm, from_disk.mode_to_perms(fmode))
 
 
+class TestDiskBackedContent(unittest.TestCase):
+    def test_with_data(self):
+        expected_content = model.Content(
+            length=42, status='visible', data=b'foo bar',
+            sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+        with tempfile.NamedTemporaryFile(mode='w+b') as fd:
+            content = DiskBackedContent(
+                length=42, status='visible', path=fd.name,
+                sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+            fd.write(b'foo bar')
+            fd.seek(0)
+            content_with_data = content.with_data()
+
+        assert expected_content == content_with_data
+
+    def test_lazy_data(self):
+        with tempfile.NamedTemporaryFile(mode='w+b') as fd:
+            fd.write(b'foo')
+            fd.seek(0)
+            content = DiskBackedContent(
+                length=42, status='visible', path=fd.name,
+                sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+            fd.write(b'bar')
+            fd.seek(0)
+            content_with_data = content.with_data()
+            fd.write(b'baz')
+            fd.seek(0)
+
+        assert content_with_data.data == b'bar'
+
+    def test_with_data_cannot_read(self):
+        with tempfile.NamedTemporaryFile(mode='w+b') as fd:
+            content = DiskBackedContent(
+                length=42, status='visible', path=fd.name,
+                sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+
+        with pytest.raises(OSError):
+            content.with_data()
+
+    def test_missing_path(self):
+        with pytest.raises(TypeError):
+            DiskBackedContent(
+                length=42, status='visible',
+                sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+
+        with pytest.raises(TypeError):
+            DiskBackedContent(
+                length=42, status='visible', path=None,
+                sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+
+
 class DataMixin:
     maxDiff = None  # type: ClassVar[Optional[int]]
 
@@ -401,19 +455,19 @@
     def tearDown(self):
         self.tmpdir.cleanup()
 
-    def assertContentEqual(self, left, right, *, check_data=False,  # noqa
+    def assertContentEqual(self, left, right, *,  # noqa
                            check_path=False):
         if not isinstance(left, Content):
             raise ValueError('%s is not a Content' % left)
         if isinstance(right, Content):
             right = right.get_data()
 
+        # Compare dictionaries
+
         keys = DEFAULT_ALGORITHMS | {
             'length',
             'perms',
         }
-        if check_data:
-            keys |= {'data'}
         if check_path:
             keys |= {'path'}
@@ -449,6 +503,9 @@
             right = right.get_data()
 
         assert left.entries == right['entries']
+        assert left.hash == right['id']
+
+        assert left.to_model() == model.Directory.from_dict(right)
 
     def make_contents(self, directory):
         for filename, content in self.contents.items():
@@ -498,6 +555,19 @@
             conv_content = Content.from_symlink(path=path, mode=perms)
             self.assertContentEqual(conv_content, symlink)
 
+    def test_symlink_to_base_model(self):
+        for filename, symlink in self.symlinks.items():
+            path = os.path.join(self.tmpdir_name, filename)
+            perms = 0o120000
+            model_content = \
+                Content.from_symlink(path=path, mode=perms).to_model()
+
+            right = symlink.copy()
+            for key in ('perms', 'path', 'mode'):
+                right.pop(key, None)
+            right['status'] = 'visible'
+            assert model_content == model.Content.from_dict(right)
+
 
 class FileToContent(DataMixin, unittest.TestCase):
     def setUp(self):
@@ -507,45 +577,86 @@
         self.make_specials(self.tmpdir_name)
 
     def test_symlink_to_content(self):
-        # Check whether loading the data works
-        for data in [True, False]:
-            for filename, symlink in self.symlinks.items():
-                path = os.path.join(self.tmpdir_name, filename)
-                conv_content = Content.from_file(path=path, data=data)
-                self.assertContentEqual(conv_content, symlink, check_data=data)
+        for filename, symlink in self.symlinks.items():
+            path = os.path.join(self.tmpdir_name, filename)
+            conv_content = Content.from_file(path=path)
+            self.assertContentEqual(conv_content, symlink)
 
     def test_file_to_content(self):
-        for data in [True, False]:
-            for filename, content in self.contents.items():
-                path = os.path.join(self.tmpdir_name, filename)
-                conv_content = Content.from_file(path=path, data=data)
-                self.assertContentEqual(conv_content, content, check_data=data)
+        for filename, content in self.contents.items():
+            path = os.path.join(self.tmpdir_name, filename)
+            conv_content = Content.from_file(path=path)
+            self.assertContentEqual(conv_content, content)
 
     def test_special_to_content(self):
-        for data in [True, False]:
-            for filename in self.specials:
-                path = os.path.join(self.tmpdir_name, filename)
-                conv_content = Content.from_file(path=path, data=data)
-                self.assertContentEqual(conv_content, self.empty_content)
+        for filename in self.specials:
+            path = os.path.join(self.tmpdir_name, filename)
+            conv_content = Content.from_file(path=path)
+            self.assertContentEqual(conv_content, self.empty_content)
 
-            for path in ['/dev/null', '/dev/zero']:
-                path = os.path.join(self.tmpdir_name, filename)
-                conv_content = Content.from_file(path=path, data=data)
-                self.assertContentEqual(conv_content, self.empty_content)
+        for path in ['/dev/null', '/dev/zero']:
+            path = os.path.join(self.tmpdir_name, filename)
+            conv_content = Content.from_file(path=path)
+            self.assertContentEqual(conv_content, self.empty_content)
+
+    def test_symlink_to_content_model(self):
+        for filename, symlink in self.symlinks.items():
+            path = os.path.join(self.tmpdir_name, filename)
+            model_content = Content.from_file(path=path).to_model()
+
+            right = symlink.copy()
+            for key in ('perms', 'path', 'mode'):
+                right.pop(key, None)
+            right['status'] = 'visible'
+            assert model_content == model.Content.from_dict(right)
+
+    def test_file_to_content_model(self):
+        for filename, content in self.contents.items():
+            path = os.path.join(self.tmpdir_name, filename)
+            model_content = Content.from_file(path=path).to_model()
+
+            right = content.copy()
+            for key in ('perms', 'mode'):
+                right.pop(key, None)
+            assert model_content.with_data() == model.Content.from_dict(right)
+
+            right['path'] = path
+            del right['data']
+            assert model_content == DiskBackedContent.from_dict(right)
+
+    def test_special_to_content_model(self):
+        for filename in self.specials:
+            path = os.path.join(self.tmpdir_name, filename)
+            model_content = Content.from_file(path=path).to_model()
+
+            right = self.empty_content.copy()
+            for key in ('perms', 'path', 'mode'):
+                right.pop(key, None)
+            right['status'] = 'visible'
+            assert model_content == model.Content.from_dict(right)
+
+        for path in ['/dev/null', '/dev/zero']:
+            model_content = Content.from_file(path=path).to_model()
+
+            right = self.empty_content.copy()
+            for key in ('perms', 'path', 'mode'):
+                right.pop(key, None)
+            right['status'] = 'visible'
+            assert model_content == model.Content.from_dict(right)
 
     def test_symlink_max_length(self):
         for max_content_length in [4, 10]:
             for filename, symlink in self.symlinks.items():
                 path = os.path.join(self.tmpdir_name, filename)
-                content = Content.from_file(path=path, data=True)
+                content = Content.from_file(path=path)
                 if content.data['length'] > max_content_length:
                     with pytest.raises(Exception, match='too large'):
                         Content.from_file(
-                            path=path, data=True,
+                            path=path,
                             max_content_length=max_content_length)
                 else:
                     limited_content = Content.from_file(
-                        path=path, data=True,
+                        path=path,
                         max_content_length=max_content_length)
                     assert content == limited_content
@@ -553,9 +664,9 @@
         for max_content_length in [2, 4]:
             for filename, content in self.contents.items():
                 path = os.path.join(self.tmpdir_name, filename)
-                content = Content.from_file(path=path, data=True)
+                content = Content.from_file(path=path)
                 limited_content = Content.from_file(
-                    path=path, data=True,
+                    path=path,
                     max_content_length=max_content_length)
                 assert content.data['length'] == limited_content.data['length']
                 assert content.data['status'] == 'visible'
@@ -565,15 +676,14 @@
                         == 'Content too large'
                 else:
                     assert limited_content.data['status'] == 'visible'
-                    assert limited_content.data['data'] == content.data['data']
 
     def test_special_file_max_length(self):
         for max_content_length in [None, 0, 1]:
             for filename in self.specials:
                 path = os.path.join(self.tmpdir_name, filename)
-                content = Content.from_file(path=path, data=True)
+                content = Content.from_file(path=path)
                 limited_content = Content.from_file(
-                    path=path, data=True,
+                    path=path,
                     max_content_length=max_content_length)
                 assert limited_content == content
@@ -582,7 +692,7 @@
             content_w_path = content.copy()
             path = os.path.join(self.tmpdir_name, filename)
             content_w_path['path'] = path
-            conv_content = Content.from_file(path=path, save_path=True)
+            conv_content = Content.from_file(path=path)
             self.assertContentEqual(conv_content, content_w_path,
                                     check_path=True)
@@ -762,12 +872,12 @@
             path=os.path.join(self.tmpdir_name, b'sample-folder')
         )
 
-        for name, data in self.tarball_contents.items():
+        for name, expected in self.tarball_contents.items():
             obj = directory[name]
             if isinstance(obj, Content):
-                self.assertContentEqual(obj, data)
+                self.assertContentEqual(obj, expected)
             elif isinstance(obj, Directory):
-                self.assertDirectoryEqual(obj, data)
+                self.assertDirectoryEqual(obj, expected)
             else:
                 raise self.failureException('Unknown type for %s' % obj)
diff --git a/swh/model/tests/test_model.py b/swh/model/tests/test_model.py
--- a/swh/model/tests/test_model.py
+++ b/swh/model/tests/test_model.py
@@ -6,8 +6,10 @@
 import copy
 
 from hypothesis import given
+import pytest
 
 from swh.model.model import Content, Directory, Revision, Release, Snapshot
+from swh.model.model import MissingData
 from swh.model.hashutil import hash_to_bytes
 from swh.model.hypothesis_strategies import objects, origins, origin_visits
 from swh.model.identifiers import (
@@ -69,6 +71,21 @@
     assert c.hashes() == hashes
 
 
+def test_content_data():
+    c = Content(
+        length=42, status='visible', data=b'foo',
+        sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+    assert c.with_data() == c
+
+
+def test_content_data_missing():
+    c = Content(
+        length=42, status='visible',
+        sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+    with pytest.raises(MissingData):
+        c.with_data()
+
+
 def test_directory_model_id_computation():
     dir_dict = dict(directory_example)
     del dir_dict['id']
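The sketch below is not part of the patch; it illustrates the `Content.from_file` behaviour changed above, assuming the patch is applied: file bytes are no longer loaded into the entry, the path is always recorded, and contents larger than `max_content_length` are marked absent. The temporary file and its contents are illustrative only.

import tempfile

from swh.model.from_disk import Content

with tempfile.NamedTemporaryFile() as fd:
    fd.write(b'0123456789')
    fd.flush()

    # Regular case: hashes are computed from the on-disk file, but the
    # bytes themselves are not kept in the returned entry.
    entry = Content.from_file(path=fd.name)
    assert entry.data['status'] == 'visible'
    assert 'data' not in entry.data       # no in-memory copy anymore
    assert entry.data['path'] == fd.name  # path is now always recorded

    # Over the limit: the content is skipped rather than loaded.
    limited = Content.from_file(path=fd.name, max_content_length=4)
    assert limited.data['status'] == 'absent'
    assert limited.data['reason'] == 'Content too large'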
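A minimal sketch of the lazy-loading flow introduced by `DiskBackedContent`, again assuming the patch is applied; the file name and contents are illustrative. `to_model()` dispatches to `DiskBackedContent` for a visible entry with no in-memory data, and `with_data()` re-reads the file to produce a plain `model.Content`.

import tempfile

from swh.model.from_disk import Content, DiskBackedContent

with tempfile.NamedTemporaryFile() as fd:
    fd.write(b'hello world')
    fd.flush()

    # The from_disk leaf converts to a DiskBackedContent: hashes and length
    # are known, the bytes stay on disk behind the recorded path.
    model_content = Content.from_file(path=fd.name).to_model()
    assert isinstance(model_content, DiskBackedContent)

    # with_data() opens the path and returns a plain model.Content whose
    # `data` attribute is guaranteed to be set.
    assert model_content.with_data().data == b'hello world'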
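For a plain `model.Content`, `with_data()` is essentially a no-op, as the new docstring says. A short sketch of that contract, mirroring the added tests (the hash values are placeholders, just as in the tests):

import pytest

from swh.model.model import Content, MissingData

with_bytes = Content(
    length=3, status='visible', data=b'foo',
    sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
assert with_bytes.with_data() == with_bytes  # data already present: no-op

without_bytes = Content(
    length=3, status='visible',
    sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
with pytest.raises(MissingData):
    without_bytes.with_data()  # no data and no way to fetch it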