diff --git a/swh/model/from_disk.py b/swh/model/from_disk.py --- a/swh/model/from_disk.py +++ b/swh/model/from_disk.py @@ -83,6 +83,7 @@ ret['length'] = len(data) ret['perms'] = mode_to_perms(mode) ret['data'] = data + ret['status'] = 'visible' return cls(ret) @@ -92,7 +93,9 @@ return cls.from_bytes(mode=mode, data=os.readlink(path)) @classmethod - def from_file(cls, *, path, data=False, save_path=False): + def from_file( + cls, *, path, data=False, save_path=False, + max_content_length=None): """Compute the Software Heritage content entry corresponding to an on-disk file. @@ -105,22 +108,46 @@ content entry data (bool): add the file data to the entry save_path (bool): add the file path to the entry + max_content_length (Optional[int]): if given, all contents larger + than this will be skipped. """ file_stat = os.lstat(path) mode = file_stat.st_mode + length = file_stat.st_size + too_large = max_content_length is not None \ + and length > max_content_length if stat.S_ISLNK(mode): # Symbolic link: return a file whose contents are the link target + + if too_large: + # Unlike large contents, we can't stream symlinks to + # MultiHash, and we don't want to fit them in memory if + # they exceed max_content_length either. + # Thankfully, this should not happen for reasonable values of + # max_content_length because of OS/filesystem limitations, + # so let's just raise an error. + raise Exception(f'Symlink too large ({length} bytes)') + return cls.from_symlink(path=path, mode=mode) elif not stat.S_ISREG(mode): # not a regular file: return the empty file instead return cls.from_bytes(mode=mode, data=b'') - length = file_stat.st_size + if too_large: + skip_reason = 'Content too large' + elif not data: + skip_reason = 'Skipping file content' + else: + skip_reason = None - if not data: - ret = MultiHash.from_path(path).digest() + if skip_reason: + ret = { + **MultiHash.from_path(path).digest(), + 'status': 'absent', + 'reason': skip_reason, + } else: h = MultiHash(length=length) chunks = [] @@ -132,8 +159,11 @@ h.update(chunk) chunks.append(chunk) - ret = h.digest() - ret['data'] = b''.join(chunks) + ret = { + **h.digest(), + 'status': 'visible', + 'data': b''.join(chunks), + } if save_path: ret['path'] = path @@ -221,7 +251,8 @@ @classmethod def from_disk(cls, *, path, data=False, save_path=False, - dir_filter=accept_all_directories): + dir_filter=accept_all_directories, + max_content_length=None): """Compute the Software Heritage objects for a given directory tree Args: @@ -232,6 +263,8 @@ name or contents. Takes two arguments: dirname and entries, and returns True if the directory should be added, False if the directory should be ignored. + max_content_length (Optional[int]): if given, all contents larger + than this will be skipped. """ top_path = path @@ -244,8 +277,9 @@ for name in fentries + dentries: path = os.path.join(root, name) if not os.path.isdir(path) or os.path.islink(path): - content = Content.from_file(path=path, data=data, - save_path=save_path) + content = Content.from_file( + path=path, data=data, save_path=save_path, + max_content_length=max_content_length) entries[name] = content else: if dir_filter(name, dirs[path].entries): diff --git a/swh/model/tests/test_from_disk.py b/swh/model/tests/test_from_disk.py --- a/swh/model/tests/test_from_disk.py +++ b/swh/model/tests/test_from_disk.py @@ -525,6 +525,50 @@ conv_content = Content.from_file(path=path, data=data) self.assertContentEqual(conv_content, self.empty_content) + def test_symlink_max_length(self): + for max_content_length in [4, 10]: + for filename, symlink in self.symlinks.items(): + path = os.path.join(self.tmpdir_name, filename) + content = Content.from_file(path=path, data=True) + if content.data['length'] > max_content_length: + with pytest.raises(Exception, match='too large'): + Content.from_file( + path=path, data=True, + max_content_length=max_content_length) + else: + limited_content = Content.from_file( + path=path, data=True, + max_content_length=max_content_length) + assert content == limited_content + + def test_file_max_length(self): + for max_content_length in [2, 4]: + for filename, content in self.contents.items(): + path = os.path.join(self.tmpdir_name, filename) + content = Content.from_file(path=path, data=True) + limited_content = Content.from_file( + path=path, data=True, + max_content_length=max_content_length) + assert content.data['length'] == limited_content.data['length'] + assert content.data['status'] == 'visible' + if content.data['length'] > max_content_length: + assert limited_content.data['status'] == 'absent' + assert limited_content.data['reason'] \ + == 'Content too large' + else: + assert limited_content.data['status'] == 'visible' + assert limited_content.data['data'] == content.data['data'] + + def test_special_file_max_length(self): + for max_content_length in [None, 0, 1]: + for filename in self.specials: + path = os.path.join(self.tmpdir_name, filename) + content = Content.from_file(path=path, data=True) + limited_content = Content.from_file( + path=path, data=True, + max_content_length=max_content_length) + assert limited_content == content + def test_file_to_content_with_path(self): for filename, content in self.contents.items(): content_w_path = content.copy()