Changeset View
Changeset View
Standalone View
Standalone View
swh/model/from_disk.py
# Copyright (C) 2017-2018 The Software Heritage developers | # Copyright (C) 2017-2018 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import enum | import enum | ||||
import os | import os | ||||
import stat | import stat | ||||
from typing import List | import attr | ||||
from typing import List, Optional | |||||
from .hashutil import MultiHash, HASH_BLOCK_SIZE | from .hashutil import MultiHash | ||||
from .merkle import MerkleLeaf, MerkleNode | from .merkle import MerkleLeaf, MerkleNode | ||||
from .identifiers import ( | from .identifiers import ( | ||||
directory_entry_sort_key, directory_identifier, | directory_entry_sort_key, directory_identifier, | ||||
identifier_to_bytes as id_to_bytes, | identifier_to_bytes as id_to_bytes, | ||||
identifier_to_str as id_to_str, | identifier_to_str as id_to_str, | ||||
) | ) | ||||
from . import model | |||||
@attr.s
class DiskBackedContent(model.Content):
    """Content subclass whose bytes live on disk and are loaded lazily.

    The object carries all content metadata (hashes, length, status) but
    not the data itself; `with_data` materializes a plain
    `model.Content` by reading the file at `path`.
    """

    # Filesystem location of the content bytes. The None default only
    # exists to satisfy attrs' ordering rules; it is rejected below.
    path = attr.ib(type=Optional[bytes], default=None)

    def __attrs_post_init__(self):
        # A disk-backed content without a backing file is meaningless.
        if self.path is None:
            raise TypeError('path must not be None.')

    def with_data(self) -> model.Content:
        """Return an equivalent in-memory `model.Content`, reading the
        bytes from `self.path`."""
        attributes = self.to_dict()
        del attributes['path']
        assert self.path is not None
        with open(self.path, 'rb') as fd:
            attributes['data'] = fd.read()
            return model.Content.from_dict(attributes)
class DentryPerms(enum.IntEnum): | class DentryPerms(enum.IntEnum): | ||||
"""Admissible permissions for directory entries.""" | """Admissible permissions for directory entries.""" | ||||
content = 0o100644 | content = 0o100644 | ||||
"""Content""" | """Content""" | ||||
executable_content = 0o100755 | executable_content = 0o100755 | ||||
"""Executable content (e.g. executable script)""" | """Executable content (e.g. executable script)""" | ||||
▲ Show 20 Lines • Show All 62 Lines • ▼ Show 20 Lines | class Content(MerkleLeaf): | ||||
@classmethod | @classmethod | ||||
def from_symlink(cls, *, path, mode): | def from_symlink(cls, *, path, mode): | ||||
"""Convert a symbolic link to a Software Heritage content entry""" | """Convert a symbolic link to a Software Heritage content entry""" | ||||
return cls.from_bytes(mode=mode, data=os.readlink(path)) | return cls.from_bytes(mode=mode, data=os.readlink(path)) | ||||
@classmethod | @classmethod | ||||
def from_file( | def from_file( | ||||
cls, *, path, data=False, save_path=False, | cls, *, path, max_content_length=None): | ||||
max_content_length=None): | |||||
"""Compute the Software Heritage content entry corresponding to an | """Compute the Software Heritage content entry corresponding to an | ||||
on-disk file. | on-disk file. | ||||
The returned dictionary contains keys useful for both: | The returned dictionary contains keys useful for both: | ||||
- loading the content in the archive (hashes, `length`) | - loading the content in the archive (hashes, `length`) | ||||
- using the content as a directory entry in a directory | - using the content as a directory entry in a directory | ||||
Args: | Args: | ||||
path (bytes): path to the file for which we're computing the | |||||
content entry | |||||
data (bool): add the file data to the entry | |||||
save_path (bool): add the file path to the entry | save_path (bool): add the file path to the entry | ||||
max_content_length (Optional[int]): if given, all contents larger | max_content_length (Optional[int]): if given, all contents larger | ||||
than this will be skipped. | than this will be skipped. | ||||
""" | """ | ||||
file_stat = os.lstat(path) | file_stat = os.lstat(path) | ||||
mode = file_stat.st_mode | mode = file_stat.st_mode | ||||
length = file_stat.st_size | length = file_stat.st_size | ||||
Show All 14 Lines | def from_file( | ||||
return cls.from_symlink(path=path, mode=mode) | return cls.from_symlink(path=path, mode=mode) | ||||
elif not stat.S_ISREG(mode): | elif not stat.S_ISREG(mode): | ||||
# not a regular file: return the empty file instead | # not a regular file: return the empty file instead | ||||
return cls.from_bytes(mode=mode, data=b'') | return cls.from_bytes(mode=mode, data=b'') | ||||
if too_large: | if too_large: | ||||
skip_reason = 'Content too large' | skip_reason = 'Content too large' | ||||
elif not data: | |||||
skip_reason = 'Skipping file content' | |||||
else: | else: | ||||
skip_reason = None | skip_reason = None | ||||
hashes = MultiHash.from_path(path).digest() | |||||
if skip_reason: | if skip_reason: | ||||
ret = { | ret = { | ||||
**MultiHash.from_path(path).digest(), | **hashes, | ||||
'status': 'absent', | 'status': 'absent', | ||||
'reason': skip_reason, | 'reason': skip_reason, | ||||
} | } | ||||
else: | else: | ||||
h = MultiHash(length=length) | |||||
chunks = [] | |||||
with open(path, 'rb') as fobj: | |||||
while True: | |||||
chunk = fobj.read(HASH_BLOCK_SIZE) | |||||
if not chunk: | |||||
break | |||||
h.update(chunk) | |||||
chunks.append(chunk) | |||||
ret = { | ret = { | ||||
**h.digest(), | **hashes, | ||||
'status': 'visible', | 'status': 'visible', | ||||
'data': b''.join(chunks), | |||||
} | } | ||||
if save_path: | |||||
ret['path'] = path | ret['path'] = path | ||||
ret['perms'] = mode_to_perms(mode) | ret['perms'] = mode_to_perms(mode) | ||||
ret['length'] = length | ret['length'] = length | ||||
obj = cls(ret) | obj = cls(ret) | ||||
return obj | return obj | ||||
def __repr__(self): | def __repr__(self): | ||||
return 'Content(id=%s)' % id_to_str(self.hash) | return 'Content(id=%s)' % id_to_str(self.hash) | ||||
def compute_hash(self): | def compute_hash(self): | ||||
return self.data['sha1_git'] | return self.data['sha1_git'] | ||||
def to_model(self) -> model.BaseContent: | |||||
"""Builds a `model.BaseContent` object based on this leaf.""" | |||||
data = self.get_data().copy() | |||||
data.pop('perms', None) | |||||
if data['status'] == 'absent': | |||||
data.pop('path', None) | |||||
olasd: By default, `from_disk` will not pull the full data from disk into memory, so by doing this… | |||||
return model.SkippedContent.from_dict(data) | |||||
elif 'data' in data: | |||||
return model.Content.from_dict(data) | |||||
else: | |||||
return DiskBackedContent.from_dict(data) | |||||
def accept_all_directories(dirname, entries): | def accept_all_directories(dirname, entries): | ||||
"""Default filter for :func:`Directory.from_disk` accepting all | """Default filter for :func:`Directory.from_disk` accepting all | ||||
directories | directories | ||||
Args: | Args: | ||||
dirname (bytes): directory name | dirname (bytes): directory name | ||||
entries (list): directory entries | entries (list): directory entries | ||||
▲ Show 20 Lines • Show All 55 Lines • ▼ Show 20 Lines | class Directory(MerkleNode): | ||||
the affected levels of hierarchy are reset and can be collected again using | the affected levels of hierarchy are reset and can be collected again using | ||||
the same method. This enables the efficient collection of updated nodes, | the same method. This enables the efficient collection of updated nodes, | ||||
for instance when the client is applying diffs. | for instance when the client is applying diffs. | ||||
""" | """ | ||||
__slots__ = ['__entries'] | __slots__ = ['__entries'] | ||||
type = 'directory' | type = 'directory' | ||||
@classmethod | @classmethod | ||||
def from_disk(cls, *, path, data=False, save_path=False, | def from_disk(cls, *, path, | ||||
dir_filter=accept_all_directories, | dir_filter=accept_all_directories, | ||||
max_content_length=None): | max_content_length=None): | ||||
"""Compute the Software Heritage objects for a given directory tree | """Compute the Software Heritage objects for a given directory tree | ||||
Args: | Args: | ||||
path (bytes): the directory to traverse | path (bytes): the directory to traverse | ||||
data (bool): whether to add the data to the content objects | data (bool): whether to add the data to the content objects | ||||
save_path (bool): whether to add the path to the content objects | save_path (bool): whether to add the path to the content objects | ||||
Show All 11 Lines | def from_disk(cls, *, path, | ||||
for root, dentries, fentries in os.walk(top_path, topdown=False): | for root, dentries, fentries in os.walk(top_path, topdown=False): | ||||
entries = {} | entries = {} | ||||
# Join fentries and dentries in the same processing, as symbolic | # Join fentries and dentries in the same processing, as symbolic | ||||
# links to directories appear in dentries... | # links to directories appear in dentries... | ||||
for name in fentries + dentries: | for name in fentries + dentries: | ||||
path = os.path.join(root, name) | path = os.path.join(root, name) | ||||
if not os.path.isdir(path) or os.path.islink(path): | if not os.path.isdir(path) or os.path.islink(path): | ||||
content = Content.from_file( | content = Content.from_file( | ||||
path=path, data=data, save_path=save_path, | path=path, max_content_length=max_content_length) | ||||
max_content_length=max_content_length) | |||||
entries[name] = content | entries[name] = content | ||||
else: | else: | ||||
if dir_filter(name, dirs[path].entries): | if dir_filter(name, dirs[path].entries): | ||||
entries[name] = dirs[path] | entries[name] = dirs[path] | ||||
dirs[root] = cls({'name': os.path.basename(root)}) | dirs[root] = cls({'name': os.path.basename(root)}) | ||||
dirs[root].update(entries) | dirs[root].update(entries) | ||||
▲ Show 20 Lines • Show All 42 Lines • ▼ Show 20 Lines | def entries(self): | ||||
for name, child in self.items() | for name, child in self.items() | ||||
), key=directory_entry_sort_key) | ), key=directory_entry_sort_key) | ||||
return self.__entries | return self.__entries | ||||
def compute_hash(self): | def compute_hash(self): | ||||
return id_to_bytes(directory_identifier({'entries': self.entries})) | return id_to_bytes(directory_identifier({'entries': self.entries})) | ||||
def to_model(self) -> model.Directory: | |||||
"""Builds a `model.Directory` object based on this node; | |||||
ignoring its children.""" | |||||
return model.Directory.from_dict(self.get_data()) | |||||
def __getitem__(self, key): | def __getitem__(self, key): | ||||
if not isinstance(key, bytes): | if not isinstance(key, bytes): | ||||
raise ValueError('Can only get a bytes from Directory') | raise ValueError('Can only get a bytes from Directory') | ||||
# Convenience shortcut | # Convenience shortcut | ||||
if key == b'': | if key == b'': | ||||
return self | return self | ||||
Show All 39 Lines |
By default, from_disk will not pull the full data from disk into memory, so by doing this there's a good chance you'll end up with a Content object that has an empty data attribute.
We already have lots of issues with loaders having an "optimistic" usage of memory, so I'd be tempted to introduce a DiskBackedContent (DiskContent?) model, inheriting from Content, with a path attribute as well as a lazy data attribute that will read data from disk the first time it's accessed.