Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7163638
D2703.id9660.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
20 KB
Subscribers
None
D2703.id9660.diff
View Options
diff --git a/swh/model/from_disk.py b/swh/model/from_disk.py
--- a/swh/model/from_disk.py
+++ b/swh/model/from_disk.py
@@ -7,15 +7,36 @@
import os
import stat
-from typing import List
+import attr
+from typing import List, Optional
-from .hashutil import MultiHash, HASH_BLOCK_SIZE
+from .hashutil import MultiHash
from .merkle import MerkleLeaf, MerkleNode
from .identifiers import (
directory_entry_sort_key, directory_identifier,
identifier_to_bytes as id_to_bytes,
identifier_to_str as id_to_str,
)
+from . import model
+
+
+@attr.s
+class DiskBackedContent(model.Content):
+ """Subclass of Content, which allows lazy-loading data from the disk."""
+ path = attr.ib(type=Optional[bytes], default=None)
+
+ def __attrs_post_init__(self):
+ if self.path is None:
+ raise TypeError('path must not be None.')
+
+ def with_data(self) -> model.Content:
+ args = self.to_dict()
+ del args['path']
+ assert self.path is not None
+ with open(self.path, 'rb') as fd:
+ return model.Content.from_dict({
+ **args,
+ 'data': fd.read()})
class DentryPerms(enum.IntEnum):
@@ -94,8 +115,7 @@
@classmethod
def from_file(
- cls, *, path, data=False, save_path=False,
- max_content_length=None):
+ cls, *, path, max_content_length=None):
"""Compute the Software Heritage content entry corresponding to an
on-disk file.
@@ -104,9 +124,6 @@
- using the content as a directory entry in a directory
Args:
- path (bytes): path to the file for which we're computing the
- content entry
- data (bool): add the file data to the entry
save_path (bool): add the file path to the entry
max_content_length (Optional[int]): if given, all contents larger
than this will be skipped.
@@ -137,36 +154,23 @@
if too_large:
skip_reason = 'Content too large'
- elif not data:
- skip_reason = 'Skipping file content'
else:
skip_reason = None
+ hashes = MultiHash.from_path(path).digest()
if skip_reason:
ret = {
- **MultiHash.from_path(path).digest(),
+ **hashes,
'status': 'absent',
'reason': skip_reason,
}
else:
- h = MultiHash(length=length)
- chunks = []
- with open(path, 'rb') as fobj:
- while True:
- chunk = fobj.read(HASH_BLOCK_SIZE)
- if not chunk:
- break
- h.update(chunk)
- chunks.append(chunk)
-
ret = {
- **h.digest(),
+ **hashes,
'status': 'visible',
- 'data': b''.join(chunks),
}
- if save_path:
- ret['path'] = path
+ ret['path'] = path
ret['perms'] = mode_to_perms(mode)
ret['length'] = length
@@ -179,6 +183,18 @@
def compute_hash(self):
return self.data['sha1_git']
+ def to_model(self) -> model.BaseContent:
+ """Builds a `model.BaseContent` object based on this leaf."""
+ data = self.get_data().copy()
+ data.pop('perms', None)
+ if data['status'] == 'absent':
+ data.pop('path', None)
+ return model.SkippedContent.from_dict(data)
+ elif 'data' in data:
+ return model.Content.from_dict(data)
+ else:
+ return DiskBackedContent.from_dict(data)
+
def accept_all_directories(dirname, entries):
"""Default filter for :func:`Directory.from_disk` accepting all
@@ -250,7 +266,7 @@
type = 'directory'
@classmethod
- def from_disk(cls, *, path, data=False, save_path=False,
+ def from_disk(cls, *, path,
dir_filter=accept_all_directories,
max_content_length=None):
"""Compute the Software Heritage objects for a given directory tree
@@ -278,8 +294,7 @@
path = os.path.join(root, name)
if not os.path.isdir(path) or os.path.islink(path):
content = Content.from_file(
- path=path, data=data, save_path=save_path,
- max_content_length=max_content_length)
+ path=path, max_content_length=max_content_length)
entries[name] = content
else:
if dir_filter(name, dirs[path].entries):
@@ -338,6 +353,11 @@
def compute_hash(self):
return id_to_bytes(directory_identifier({'entries': self.entries}))
+ def to_model(self) -> model.Directory:
+ """Builds a `model.Directory` object based on this node;
+ ignoring its children."""
+ return model.Directory.from_dict(self.get_data())
+
def __getitem__(self, key):
if not isinstance(key, bytes):
raise ValueError('Can only get a bytes from Directory')
diff --git a/swh/model/model.py b/swh/model/model.py
--- a/swh/model/model.py
+++ b/swh/model/model.py
@@ -18,6 +18,13 @@
)
from .hashutil import DEFAULT_ALGORITHMS, hash_to_bytes
+
+class MissingData(Exception):
+ """Raised by `Content.with_data` when it has no way of fetching the
+ data (but not when fetching the data fails)."""
+ pass
+
+
SHA1_SIZE = 20
# TODO: Limit this to 20 bytes
@@ -362,6 +369,10 @@
@attr.s(frozen=True)
class BaseContent(BaseModel):
+ status = attr.ib(
+ type=str,
+ validator=attr.validators.in_(['visible', 'hidden', 'absent']))
+
def to_dict(self):
content = super().to_dict()
if content['ctime'] is None:
@@ -402,8 +413,8 @@
type=str,
default='visible',
validator=attr.validators.in_(['visible', 'hidden']))
- data = attr.ib(type=Optional[bytes],
- default=None)
+
+ data = attr.ib(type=Optional[bytes], default=None)
ctime = attr.ib(type=Optional[datetime.datetime],
default=None)
@@ -424,6 +435,16 @@
def from_dict(cls, d):
return super().from_dict(d, use_subclass=False)
+ def with_data(self) -> 'Content':
+ """Loads the `data` attribute; meaning that it is guaranteed not to
+ be None after this call.
+
+ This call is almost a no-op, but subclasses may overload this method
+ to lazy-load data (eg. from disk or objstorage)."""
+ if self.data is None:
+ raise MissingData('Content data is None.')
+ return self
+
@attr.s(frozen=True)
class SkippedContent(BaseContent):
@@ -432,7 +453,7 @@
sha256 = attr.ib(type=Optional[bytes])
blake2s256 = attr.ib(type=Optional[bytes])
- length = attr.ib(type=int)
+ length = attr.ib(type=Optional[int])
status = attr.ib(
type=str,
diff --git a/swh/model/tests/test_from_disk.py b/swh/model/tests/test_from_disk.py
--- a/swh/model/tests/test_from_disk.py
+++ b/swh/model/tests/test_from_disk.py
@@ -12,8 +12,11 @@
from typing import ClassVar, Optional
from swh.model import from_disk
-from swh.model.from_disk import Content, DentryPerms, Directory
+from swh.model.from_disk import (
+ Content, DentryPerms, Directory, DiskBackedContent
+)
from swh.model.hashutil import DEFAULT_ALGORITHMS, hash_to_bytes, hash_to_hex
+from swh.model import model
TEST_DATA = os.path.join(os.path.dirname(__file__), 'data')
@@ -48,6 +51,60 @@
self.assertEqual(perm, from_disk.mode_to_perms(fmode))
+class TestDiskBackedContent(unittest.TestCase):
+ def test_with_data(self):
+ expected_content = model.Content(
+ length=42, status='visible', data=b'foo bar',
+ sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+ with tempfile.NamedTemporaryFile(mode='w+b') as fd:
+ content = DiskBackedContent(
+ length=42, status='visible', path=fd.name,
+ sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+ fd.write(b'foo bar')
+ fd.seek(0)
+ content_with_data = content.with_data()
+
+ assert expected_content == content_with_data
+
+ def test_lazy_data(self):
+ expected_content = model.Content(
+ length=42, status='visible', data=b'bar',
+ sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+ with tempfile.NamedTemporaryFile(mode='w+b') as fd:
+ fd.write(b'foo')
+ fd.seek(0)
+ content = DiskBackedContent(
+ length=42, status='visible', path=fd.name,
+ sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+ fd.write(b'bar')
+ fd.seek(0)
+ content_with_data = content.with_data()
+ fd.write(b'baz')
+ fd.seek(0)
+
+ assert expected_content == content_with_data
+
+ def test_with_data_cannot_read(self):
+ with tempfile.NamedTemporaryFile(mode='w+b') as fd:
+ content = DiskBackedContent(
+ length=42, status='visible', path=fd.name,
+ sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+
+ with pytest.raises(OSError):
+ content.with_data()
+
+ def test_missing_path(self):
+ with pytest.raises(TypeError):
+ DiskBackedContent(
+ length=42, status='visible',
+ sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+
+ with pytest.raises(TypeError):
+ DiskBackedContent(
+ length=42, status='visible', path=None,
+ sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+
+
class DataMixin:
maxDiff = None # type: ClassVar[Optional[int]]
@@ -401,19 +458,19 @@
def tearDown(self):
self.tmpdir.cleanup()
- def assertContentEqual(self, left, right, *, check_data=False, # noqa
+ def assertContentEqual(self, left, right, *, # noqa
check_path=False):
if not isinstance(left, Content):
raise ValueError('%s is not a Content' % left)
if isinstance(right, Content):
right = right.get_data()
+ # Compare dictionaries
+
keys = DEFAULT_ALGORITHMS | {
'length',
'perms',
}
- if check_data:
- keys |= {'data'}
if check_path:
keys |= {'path'}
@@ -449,6 +506,9 @@
right = right.get_data()
assert left.entries == right['entries']
+ assert left.hash == right['id']
+
+ assert left.to_model() == model.Directory.from_dict(right)
def make_contents(self, directory):
for filename, content in self.contents.items():
@@ -498,6 +558,19 @@
conv_content = Content.from_symlink(path=path, mode=perms)
self.assertContentEqual(conv_content, symlink)
+ def test_symlink_to_base_model(self):
+ for filename, symlink in self.symlinks.items():
+ path = os.path.join(self.tmpdir_name, filename)
+ perms = 0o120000
+ model_content = \
+ Content.from_symlink(path=path, mode=perms).to_model()
+
+ right = symlink.copy()
+ for key in ('perms', 'path', 'mode'):
+ right.pop(key, None)
+ right['status'] = 'visible'
+ assert model_content == model.Content.from_dict(right)
+
class FileToContent(DataMixin, unittest.TestCase):
def setUp(self):
@@ -507,45 +580,86 @@
self.make_specials(self.tmpdir_name)
def test_symlink_to_content(self):
- # Check whether loading the data works
- for data in [True, False]:
- for filename, symlink in self.symlinks.items():
- path = os.path.join(self.tmpdir_name, filename)
- conv_content = Content.from_file(path=path, data=data)
- self.assertContentEqual(conv_content, symlink, check_data=data)
+ for filename, symlink in self.symlinks.items():
+ path = os.path.join(self.tmpdir_name, filename)
+ conv_content = Content.from_file(path=path)
+ self.assertContentEqual(conv_content, symlink)
def test_file_to_content(self):
- for data in [True, False]:
- for filename, content in self.contents.items():
- path = os.path.join(self.tmpdir_name, filename)
- conv_content = Content.from_file(path=path, data=data)
- self.assertContentEqual(conv_content, content, check_data=data)
+ for filename, content in self.contents.items():
+ path = os.path.join(self.tmpdir_name, filename)
+ conv_content = Content.from_file(path=path)
+ self.assertContentEqual(conv_content, content)
def test_special_to_content(self):
- for data in [True, False]:
- for filename in self.specials:
- path = os.path.join(self.tmpdir_name, filename)
- conv_content = Content.from_file(path=path, data=data)
- self.assertContentEqual(conv_content, self.empty_content)
+ for filename in self.specials:
+ path = os.path.join(self.tmpdir_name, filename)
+ conv_content = Content.from_file(path=path)
+ self.assertContentEqual(conv_content, self.empty_content)
- for path in ['/dev/null', '/dev/zero']:
- path = os.path.join(self.tmpdir_name, filename)
- conv_content = Content.from_file(path=path, data=data)
- self.assertContentEqual(conv_content, self.empty_content)
+ for path in ['/dev/null', '/dev/zero']:
+ path = os.path.join(self.tmpdir_name, filename)
+ conv_content = Content.from_file(path=path)
+ self.assertContentEqual(conv_content, self.empty_content)
+
+ def test_symlink_to_content_model(self):
+ for filename, symlink in self.symlinks.items():
+ path = os.path.join(self.tmpdir_name, filename)
+ model_content = Content.from_file(path=path).to_model()
+
+ right = symlink.copy()
+ for key in ('perms', 'path', 'mode'):
+ right.pop(key, None)
+ right['status'] = 'visible'
+ assert model_content == model.Content.from_dict(right)
+
+ def test_file_to_content_model(self):
+ for filename, content in self.contents.items():
+ path = os.path.join(self.tmpdir_name, filename)
+ model_content = Content.from_file(path=path).to_model()
+
+ right = content.copy()
+ for key in ('perms', 'mode'):
+ right.pop(key, None)
+ assert model_content.with_data() == model.Content.from_dict(right)
+
+ right['path'] = path
+ del right['data']
+ assert model_content == DiskBackedContent.from_dict(right)
+
+ def test_special_to_content_model(self):
+ for filename in self.specials:
+ path = os.path.join(self.tmpdir_name, filename)
+ model_content = Content.from_file(path=path).to_model()
+
+ right = self.empty_content.copy()
+ for key in ('perms', 'path', 'mode'):
+ right.pop(key, None)
+ right['status'] = 'visible'
+ assert model_content == model.Content.from_dict(right)
+
+ for path in ['/dev/null', '/dev/zero']:
+ model_content = Content.from_file(path=path).to_model()
+
+ right = self.empty_content.copy()
+ for key in ('perms', 'path', 'mode'):
+ right.pop(key, None)
+ right['status'] = 'visible'
+ assert model_content == model.Content.from_dict(right)
def test_symlink_max_length(self):
for max_content_length in [4, 10]:
for filename, symlink in self.symlinks.items():
path = os.path.join(self.tmpdir_name, filename)
- content = Content.from_file(path=path, data=True)
+ content = Content.from_file(path=path)
if content.data['length'] > max_content_length:
with pytest.raises(Exception, match='too large'):
Content.from_file(
- path=path, data=True,
+ path=path,
max_content_length=max_content_length)
else:
limited_content = Content.from_file(
- path=path, data=True,
+ path=path,
max_content_length=max_content_length)
assert content == limited_content
@@ -553,9 +667,9 @@
for max_content_length in [2, 4]:
for filename, content in self.contents.items():
path = os.path.join(self.tmpdir_name, filename)
- content = Content.from_file(path=path, data=True)
+ content = Content.from_file(path=path)
limited_content = Content.from_file(
- path=path, data=True,
+ path=path,
max_content_length=max_content_length)
assert content.data['length'] == limited_content.data['length']
assert content.data['status'] == 'visible'
@@ -565,15 +679,14 @@
== 'Content too large'
else:
assert limited_content.data['status'] == 'visible'
- assert limited_content.data['data'] == content.data['data']
def test_special_file_max_length(self):
for max_content_length in [None, 0, 1]:
for filename in self.specials:
path = os.path.join(self.tmpdir_name, filename)
- content = Content.from_file(path=path, data=True)
+ content = Content.from_file(path=path)
limited_content = Content.from_file(
- path=path, data=True,
+ path=path,
max_content_length=max_content_length)
assert limited_content == content
@@ -582,7 +695,7 @@
content_w_path = content.copy()
path = os.path.join(self.tmpdir_name, filename)
content_w_path['path'] = path
- conv_content = Content.from_file(path=path, save_path=True)
+ conv_content = Content.from_file(path=path)
self.assertContentEqual(conv_content, content_w_path,
check_path=True)
@@ -762,12 +875,12 @@
path=os.path.join(self.tmpdir_name, b'sample-folder')
)
- for name, data in self.tarball_contents.items():
+ for name, expected in self.tarball_contents.items():
obj = directory[name]
if isinstance(obj, Content):
- self.assertContentEqual(obj, data)
+ self.assertContentEqual(obj, expected)
elif isinstance(obj, Directory):
- self.assertDirectoryEqual(obj, data)
+ self.assertDirectoryEqual(obj, expected)
else:
raise self.failureException('Unknown type for %s' % obj)
diff --git a/swh/model/tests/test_model.py b/swh/model/tests/test_model.py
--- a/swh/model/tests/test_model.py
+++ b/swh/model/tests/test_model.py
@@ -6,8 +6,10 @@
import copy
from hypothesis import given
+import pytest
from swh.model.model import Content, Directory, Revision, Release, Snapshot
+from swh.model.model import MissingData
from swh.model.hashutil import hash_to_bytes
from swh.model.hypothesis_strategies import objects, origins, origin_visits
from swh.model.identifiers import (
@@ -69,6 +71,21 @@
assert c.hashes() == hashes
+def test_content_data():
+ c = Content(
+ length=42, status='visible', data=b'foo',
+ sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+ assert c.with_data() == c
+
+
+def test_content_data_missing():
+ c = Content(
+ length=42, status='visible',
+ sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux')
+ with pytest.raises(MissingData):
+ c.with_data()
+
+
def test_directory_model_id_computation():
dir_dict = dict(directory_example)
del dir_dict['id']
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Jan 30, 11:35 AM (17 h, 31 m ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3233842
Attached To
D2703: Add to_model() method to from_disk.{Content,Directory}, to convert to canonical model objects.
Event Timeline
Log In to Comment