D2703.id9660.diff

diff --git a/swh/model/from_disk.py b/swh/model/from_disk.py
--- a/swh/model/from_disk.py
+++ b/swh/model/from_disk.py
@@ -7,15 +7,36 @@
import os
import stat
-from typing import List
+import attr
+from typing import List, Optional
-from .hashutil import MultiHash, HASH_BLOCK_SIZE
+from .hashutil import MultiHash
from .merkle import MerkleLeaf, MerkleNode
from .identifiers import (
directory_entry_sort_key, directory_identifier,
identifier_to_bytes as id_to_bytes,
identifier_to_str as id_to_str,
)
+from . import model
+
+
+@attr.s
+class DiskBackedContent(model.Content):
+ """Subclass of Content, which allows lazy-loading data from the disk."""
+ path = attr.ib(type=Optional[bytes], default=None)
+
+ def __attrs_post_init__(self):
+ if self.path is None:
+ raise TypeError('path must not be None.')
+
+ def with_data(self) -> model.Content:
+ args = self.to_dict()
+ del args['path']
+ assert self.path is not None
+ with open(self.path, 'rb') as fd:
+ return model.Content.from_dict({
+ **args,
+ 'data': fd.read()})
class DentryPerms(enum.IntEnum):
@@ -94,8 +115,7 @@
@classmethod
def from_file(
- cls, *, path, data=False, save_path=False,
- max_content_length=None):
+ cls, *, path, max_content_length=None):
"""Compute the Software Heritage content entry corresponding to an
on-disk file.
@@ -104,9 +124,6 @@
- using the content as a directory entry in a directory
Args:
- path (bytes): path to the file for which we're computing the
- content entry
- data (bool): add the file data to the entry
save_path (bool): add the file path to the entry
max_content_length (Optional[int]): if given, all contents larger
than this will be skipped.
@@ -137,36 +154,23 @@
if too_large:
skip_reason = 'Content too large'
- elif not data:
- skip_reason = 'Skipping file content'
else:
skip_reason = None
+ hashes = MultiHash.from_path(path).digest()
if skip_reason:
ret = {
- **MultiHash.from_path(path).digest(),
+ **hashes,
'status': 'absent',
'reason': skip_reason,
}
else:
- h = MultiHash(length=length)
- chunks = []
- with open(path, 'rb') as fobj:
- while True:
- chunk = fobj.read(HASH_BLOCK_SIZE)
- if not chunk:
- break
- h.update(chunk)
- chunks.append(chunk)
-
ret = {
- **h.digest(),
+ **hashes,
'status': 'visible',
- 'data': b''.join(chunks),
}
- if save_path:
- ret['path'] = path
+ ret['path'] = path
ret['perms'] = mode_to_perms(mode)
ret['length'] = length
@@ -179,6 +183,18 @@
def compute_hash(self):
return self.data['sha1_git']
+ def to_model(self) -> model.BaseContent:
+ """Builds a `model.BaseContent` object based on this leaf."""
+ data = self.get_data().copy()
+ data.pop('perms', None)
+ if data['status'] == 'absent':
+ data.pop('path', None)
+ return model.SkippedContent.from_dict(data)
+ elif 'data' in data:
+ return model.Content.from_dict(data)
+ else:
+ return DiskBackedContent.from_dict(data)
+
def accept_all_directories(dirname, entries):
"""Default filter for :func:`Directory.from_disk` accepting all
@@ -250,7 +266,7 @@
type = 'directory'
@classmethod
- def from_disk(cls, *, path, data=False, save_path=False,
+ def from_disk(cls, *, path,
dir_filter=accept_all_directories,
max_content_length=None):
"""Compute the Software Heritage objects for a given directory tree
@@ -278,8 +294,7 @@
path = os.path.join(root, name)
if not os.path.isdir(path) or os.path.islink(path):
content = Content.from_file(
- path=path, data=data, save_path=save_path,
- max_content_length=max_content_length)
+ path=path, max_content_length=max_content_length)
entries[name] = content
else:
if dir_filter(name, dirs[path].entries):
@@ -338,6 +353,11 @@
def compute_hash(self):
return id_to_bytes(directory_identifier({'entries': self.entries}))
+ def to_model(self) -> model.Directory:
+ """Builds a `model.Directory` object based on this node;
+ ignoring its children."""
+ return model.Directory.from_dict(self.get_data())
+
def __getitem__(self, key):
if not isinstance(key, bytes):
raise ValueError('Can only get a bytes from Directory')
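
To illustrate the new from_disk API, here is a minimal sketch (not part of the diff; the temporary file, its name and its contents are made up for illustration): Content.from_file() now only hashes the file instead of buffering it, and the bytes are materialized later through DiskBackedContent.with_data().

import os
import tempfile

from swh.model import model
from swh.model.from_disk import Content, DiskBackedContent

with tempfile.TemporaryDirectory() as tmpdir:
    path = os.path.join(tmpdir.encode(), b'example.txt')
    with open(path, 'wb') as f:
        f.write(b'hello world\n')

    # from_file() now only hashes the file; no 'data' key is stored.
    content = Content.from_file(path=path)

    # to_model() therefore returns a DiskBackedContent, whose bytes are
    # read from `path` only when with_data() is called.
    obj = content.to_model()
    assert isinstance(obj, DiskBackedContent)

    full = obj.with_data()
    assert isinstance(full, model.Content)
    assert full.data == b'hello world\n'
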
diff --git a/swh/model/model.py b/swh/model/model.py
--- a/swh/model/model.py
+++ b/swh/model/model.py
@@ -18,6 +18,13 @@
)
from .hashutil import DEFAULT_ALGORITHMS, hash_to_bytes
+
+class MissingData(Exception):
+ """Raised by `Content.with_data` when it has no way of fetching the
+ data (but not when fetching the data fails)."""
+ pass
+
+
SHA1_SIZE = 20
# TODO: Limit this to 20 bytes
@@ -362,6 +369,10 @@
@attr.s(frozen=True)
class BaseContent(BaseModel):
+ status = attr.ib(
+ type=str,
+ validator=attr.validators.in_(['visible', 'hidden', 'absent']))
+
def to_dict(self):
content = super().to_dict()
if content['ctime'] is None:
@@ -402,8 +413,8 @@
type=str,
default='visible',
validator=attr.validators.in_(['visible', 'hidden']))
- data = attr.ib(type=Optional[bytes],
- default=None)
+
+ data = attr.ib(type=Optional[bytes], default=None)
ctime = attr.ib(type=Optional[datetime.datetime],
default=None)
@@ -424,6 +435,16 @@
def from_dict(cls, d):
return super().from_dict(d, use_subclass=False)
+ def with_data(self) -> 'Content':
+ """Loads the `data` attribute; meaning that it is guaranteed not to
+ be None after this call.
+
+ This call is almost a no-op, but subclasses may overload this method
+ to lazy-load data (eg. from disk or objstorage)."""
+ if self.data is None:
+ raise MissingData('Content data is None.')
+ return self
+
@attr.s(frozen=True)
class SkippedContent(BaseContent):
@@ -432,7 +453,7 @@
sha256 = attr.ib(type=Optional[bytes])
blake2s256 = attr.ib(type=Optional[bytes])
- length = attr.ib(type=int)
+ length = attr.ib(type=Optional[int])
status = attr.ib(
type=str,
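
For context, a minimal sketch of the with_data() contract added above (the literal hash values mirror the ones used in the tests below and are not real digests): a plain Content returns itself when it already carries its bytes, and raises MissingData when it has no way to fetch them.

from swh.model.model import Content, MissingData

inline = Content(
    length=42, status='visible', data=b'foo',
    sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
assert inline.with_data() == inline  # almost a no-op: data is already there

dataless = Content(
    length=42, status='visible',
    sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
try:
    dataless.with_data()
except MissingData:
    pass  # a plain Content has no way of fetching its bytes
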
diff --git a/swh/model/tests/test_from_disk.py b/swh/model/tests/test_from_disk.py
--- a/swh/model/tests/test_from_disk.py
+++ b/swh/model/tests/test_from_disk.py
@@ -12,8 +12,11 @@
from typing import ClassVar, Optional
from swh.model import from_disk
-from swh.model.from_disk import Content, DentryPerms, Directory
+from swh.model.from_disk import (
+ Content, DentryPerms, Directory, DiskBackedContent
+)
from swh.model.hashutil import DEFAULT_ALGORITHMS, hash_to_bytes, hash_to_hex
+from swh.model import model
TEST_DATA = os.path.join(os.path.dirname(__file__), 'data')
@@ -48,6 +51,60 @@
self.assertEqual(perm, from_disk.mode_to_perms(fmode))
+class TestDiskBackedContent(unittest.TestCase):
+ def test_with_data(self):
+ expected_content = model.Content(
+ length=42, status='visible', data=b'foo bar',
+ sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+ with tempfile.NamedTemporaryFile(mode='w+b') as fd:
+ content = DiskBackedContent(
+ length=42, status='visible', path=fd.name,
+ sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+ fd.write(b'foo bar')
+ fd.seek(0)
+ content_with_data = content.with_data()
+
+ assert expected_content == content_with_data
+
+ def test_lazy_data(self):
+ expected_content = model.Content(
+ length=42, status='visible', data=b'bar',
+ sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+ with tempfile.NamedTemporaryFile(mode='w+b') as fd:
+ fd.write(b'foo')
+ fd.seek(0)
+ content = DiskBackedContent(
+ length=42, status='visible', path=fd.name,
+ sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+ fd.write(b'bar')
+ fd.seek(0)
+ content_with_data = content.with_data()
+ fd.write(b'baz')
+ fd.seek(0)
+
+ assert expected_content == content_with_data
+
+ def test_with_data_cannot_read(self):
+ with tempfile.NamedTemporaryFile(mode='w+b') as fd:
+ content = DiskBackedContent(
+ length=42, status='visible', path=fd.name,
+ sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+
+ with pytest.raises(OSError):
+ content.with_data()
+
+ def test_missing_path(self):
+ with pytest.raises(TypeError):
+ DiskBackedContent(
+ length=42, status='visible',
+ sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+
+ with pytest.raises(TypeError):
+ DiskBackedContent(
+ length=42, status='visible', path=None,
+ sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+
+
class DataMixin:
maxDiff = None # type: ClassVar[Optional[int]]
@@ -401,19 +458,19 @@
def tearDown(self):
self.tmpdir.cleanup()
- def assertContentEqual(self, left, right, *, check_data=False, # noqa
+ def assertContentEqual(self, left, right, *, # noqa
check_path=False):
if not isinstance(left, Content):
raise ValueError('%s is not a Content' % left)
if isinstance(right, Content):
right = right.get_data()
+ # Compare dictionaries
+
keys = DEFAULT_ALGORITHMS | {
'length',
'perms',
}
- if check_data:
- keys |= {'data'}
if check_path:
keys |= {'path'}
@@ -449,6 +506,9 @@
right = right.get_data()
assert left.entries == right['entries']
+ assert left.hash == right['id']
+
+ assert left.to_model() == model.Directory.from_dict(right)
def make_contents(self, directory):
for filename, content in self.contents.items():
@@ -498,6 +558,19 @@
conv_content = Content.from_symlink(path=path, mode=perms)
self.assertContentEqual(conv_content, symlink)
+ def test_symlink_to_base_model(self):
+ for filename, symlink in self.symlinks.items():
+ path = os.path.join(self.tmpdir_name, filename)
+ perms = 0o120000
+ model_content = \
+ Content.from_symlink(path=path, mode=perms).to_model()
+
+ right = symlink.copy()
+ for key in ('perms', 'path', 'mode'):
+ right.pop(key, None)
+ right['status'] = 'visible'
+ assert model_content == model.Content.from_dict(right)
+
class FileToContent(DataMixin, unittest.TestCase):
def setUp(self):
@@ -507,45 +580,86 @@
self.make_specials(self.tmpdir_name)
def test_symlink_to_content(self):
- # Check whether loading the data works
- for data in [True, False]:
- for filename, symlink in self.symlinks.items():
- path = os.path.join(self.tmpdir_name, filename)
- conv_content = Content.from_file(path=path, data=data)
- self.assertContentEqual(conv_content, symlink, check_data=data)
+ for filename, symlink in self.symlinks.items():
+ path = os.path.join(self.tmpdir_name, filename)
+ conv_content = Content.from_file(path=path)
+ self.assertContentEqual(conv_content, symlink)
def test_file_to_content(self):
- for data in [True, False]:
- for filename, content in self.contents.items():
- path = os.path.join(self.tmpdir_name, filename)
- conv_content = Content.from_file(path=path, data=data)
- self.assertContentEqual(conv_content, content, check_data=data)
+ for filename, content in self.contents.items():
+ path = os.path.join(self.tmpdir_name, filename)
+ conv_content = Content.from_file(path=path)
+ self.assertContentEqual(conv_content, content)
def test_special_to_content(self):
- for data in [True, False]:
- for filename in self.specials:
- path = os.path.join(self.tmpdir_name, filename)
- conv_content = Content.from_file(path=path, data=data)
- self.assertContentEqual(conv_content, self.empty_content)
+ for filename in self.specials:
+ path = os.path.join(self.tmpdir_name, filename)
+ conv_content = Content.from_file(path=path)
+ self.assertContentEqual(conv_content, self.empty_content)
- for path in ['/dev/null', '/dev/zero']:
- path = os.path.join(self.tmpdir_name, filename)
- conv_content = Content.from_file(path=path, data=data)
- self.assertContentEqual(conv_content, self.empty_content)
+ for path in ['/dev/null', '/dev/zero']:
+ path = os.path.join(self.tmpdir_name, filename)
+ conv_content = Content.from_file(path=path)
+ self.assertContentEqual(conv_content, self.empty_content)
+
+ def test_symlink_to_content_model(self):
+ for filename, symlink in self.symlinks.items():
+ path = os.path.join(self.tmpdir_name, filename)
+ model_content = Content.from_file(path=path).to_model()
+
+ right = symlink.copy()
+ for key in ('perms', 'path', 'mode'):
+ right.pop(key, None)
+ right['status'] = 'visible'
+ assert model_content == model.Content.from_dict(right)
+
+ def test_file_to_content_model(self):
+ for filename, content in self.contents.items():
+ path = os.path.join(self.tmpdir_name, filename)
+ model_content = Content.from_file(path=path).to_model()
+
+ right = content.copy()
+ for key in ('perms', 'mode'):
+ right.pop(key, None)
+ assert model_content.with_data() == model.Content.from_dict(right)
+
+ right['path'] = path
+ del right['data']
+ assert model_content == DiskBackedContent.from_dict(right)
+
+ def test_special_to_content_model(self):
+ for filename in self.specials:
+ path = os.path.join(self.tmpdir_name, filename)
+ model_content = Content.from_file(path=path).to_model()
+
+ right = self.empty_content.copy()
+ for key in ('perms', 'path', 'mode'):
+ right.pop(key, None)
+ right['status'] = 'visible'
+ assert model_content == model.Content.from_dict(right)
+
+ for path in ['/dev/null', '/dev/zero']:
+ model_content = Content.from_file(path=path).to_model()
+
+ right = self.empty_content.copy()
+ for key in ('perms', 'path', 'mode'):
+ right.pop(key, None)
+ right['status'] = 'visible'
+ assert model_content == model.Content.from_dict(right)
def test_symlink_max_length(self):
for max_content_length in [4, 10]:
for filename, symlink in self.symlinks.items():
path = os.path.join(self.tmpdir_name, filename)
- content = Content.from_file(path=path, data=True)
+ content = Content.from_file(path=path)
if content.data['length'] > max_content_length:
with pytest.raises(Exception, match='too large'):
Content.from_file(
- path=path, data=True,
+ path=path,
max_content_length=max_content_length)
else:
limited_content = Content.from_file(
- path=path, data=True,
+ path=path,
max_content_length=max_content_length)
assert content == limited_content
@@ -553,9 +667,9 @@
for max_content_length in [2, 4]:
for filename, content in self.contents.items():
path = os.path.join(self.tmpdir_name, filename)
- content = Content.from_file(path=path, data=True)
+ content = Content.from_file(path=path)
limited_content = Content.from_file(
- path=path, data=True,
+ path=path,
max_content_length=max_content_length)
assert content.data['length'] == limited_content.data['length']
assert content.data['status'] == 'visible'
@@ -565,15 +679,14 @@
== 'Content too large'
else:
assert limited_content.data['status'] == 'visible'
- assert limited_content.data['data'] == content.data['data']
def test_special_file_max_length(self):
for max_content_length in [None, 0, 1]:
for filename in self.specials:
path = os.path.join(self.tmpdir_name, filename)
- content = Content.from_file(path=path, data=True)
+ content = Content.from_file(path=path)
limited_content = Content.from_file(
- path=path, data=True,
+ path=path,
max_content_length=max_content_length)
assert limited_content == content
@@ -582,7 +695,7 @@
content_w_path = content.copy()
path = os.path.join(self.tmpdir_name, filename)
content_w_path['path'] = path
- conv_content = Content.from_file(path=path, save_path=True)
+ conv_content = Content.from_file(path=path)
self.assertContentEqual(conv_content, content_w_path,
check_path=True)
@@ -762,12 +875,12 @@
path=os.path.join(self.tmpdir_name, b'sample-folder')
)
- for name, data in self.tarball_contents.items():
+ for name, expected in self.tarball_contents.items():
obj = directory[name]
if isinstance(obj, Content):
- self.assertContentEqual(obj, data)
+ self.assertContentEqual(obj, expected)
elif isinstance(obj, Directory):
- self.assertDirectoryEqual(obj, data)
+ self.assertDirectoryEqual(obj, expected)
else:
raise self.failureException('Unknown type for %s' % obj)
diff --git a/swh/model/tests/test_model.py b/swh/model/tests/test_model.py
--- a/swh/model/tests/test_model.py
+++ b/swh/model/tests/test_model.py
@@ -6,8 +6,10 @@
import copy
from hypothesis import given
+import pytest
from swh.model.model import Content, Directory, Revision, Release, Snapshot
+from swh.model.model import MissingData
from swh.model.hashutil import hash_to_bytes
from swh.model.hypothesis_strategies import objects, origins, origin_visits
from swh.model.identifiers import (
@@ -69,6 +71,21 @@
assert c.hashes() == hashes
+def test_content_data():
+ c = Content(
+ length=42, status='visible', data=b'foo',
+ sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+ assert c.with_data() == c
+
+
+def test_content_data_missing():
+ c = Content(
+ length=42, status='visible',
+ sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+ with pytest.raises(MissingData):
+ c.with_data()
+
+
def test_directory_model_id_computation():
dir_dict = dict(directory_example)
del dir_dict['id']
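
Finally, an end-to-end sketch of the workflow this diff enables for loaders (illustrative only: the temporary tree and the walk() helper are assumptions, not part of swh.model): scan a tree with Directory.from_disk(), convert each node with to_model(), and defer file reads to with_data().

import os
import tempfile

from swh.model import model
from swh.model.from_disk import DiskBackedContent, Directory


def walk(node):
    """Yield a from_disk node, then all of its descendants."""
    yield node
    for child in node.values():  # from_disk nodes are dict-like
        yield from walk(child)


with tempfile.TemporaryDirectory() as tmpdir:
    root = tmpdir.encode()
    os.makedirs(os.path.join(root, b'src'))
    with open(os.path.join(root, b'src', b'hello.py'), 'wb') as f:
        f.write(b'print("hello")\n')

    tree = Directory.from_disk(path=root)
    for node in walk(tree):
        obj = node.to_model()
        if isinstance(obj, DiskBackedContent):
            obj = obj.with_data()  # read the bytes only when needed
        assert isinstance(
            obj, (model.Directory, model.Content, model.SkippedContent))
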
