Changeset View
Changeset View
Standalone View
Standalone View
swh/model/from_disk.py
# Copyright (C) 2017-2018 The Software Heritage developers | # Copyright (C) 2017-2018 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import enum | import enum | ||||
import os | import os | ||||
import stat | import stat | ||||
from typing import List | import attr | ||||
from typing import List, Optional | |||||
from .hashutil import MultiHash, HASH_BLOCK_SIZE | from .hashutil import MultiHash | ||||
from .merkle import MerkleLeaf, MerkleNode | from .merkle import MerkleLeaf, MerkleNode | ||||
from .identifiers import ( | from .identifiers import ( | ||||
directory_entry_sort_key, directory_identifier, | directory_entry_sort_key, directory_identifier, | ||||
identifier_to_bytes as id_to_bytes, | identifier_to_bytes as id_to_bytes, | ||||
identifier_to_str as id_to_str, | identifier_to_str as id_to_str, | ||||
) | ) | ||||
from . import model | |||||
@attr.s
class DiskBackedContent(model.Content):
    """Content subclass whose bytes live on disk and are loaded lazily.

    The object carries all content metadata (hashes, length, status) but
    not the data itself; `with_data` materializes a plain
    `model.Content` by reading the file at `path`.
    """

    # Filesystem location of the content bytes. The None default only
    # exists to satisfy attrs' ordering rules; it is rejected below.
    path = attr.ib(type=Optional[bytes], default=None)

    def __attrs_post_init__(self):
        # A disk-backed content without a backing file is meaningless.
        if self.path is None:
            raise TypeError('path must not be None.')

    def with_data(self) -> model.Content:
        """Return an equivalent in-memory `model.Content`, reading the
        bytes from `self.path`."""
        attributes = self.to_dict()
        del attributes['path']
        assert self.path is not None
        with open(self.path, 'rb') as fd:
            attributes['data'] = fd.read()
            return model.Content.from_dict(attributes)
class DentryPerms(enum.IntEnum): | class DentryPerms(enum.IntEnum): | ||||
"""Admissible permissions for directory entries.""" | """Admissible permissions for directory entries.""" | ||||
content = 0o100644 | content = 0o100644 | ||||
"""Content""" | """Content""" | ||||
executable_content = 0o100755 | executable_content = 0o100755 | ||||
"""Executable content (e.g. executable script)""" | """Executable content (e.g. executable script)""" | ||||
▲ Show 20 Lines • Show All 62 Lines • ▼ Show 20 Lines | class Content(MerkleLeaf): | ||||
@classmethod | @classmethod | ||||
def from_symlink(cls, *, path, mode): | def from_symlink(cls, *, path, mode): | ||||
"""Convert a symbolic link to a Software Heritage content entry""" | """Convert a symbolic link to a Software Heritage content entry""" | ||||
return cls.from_bytes(mode=mode, data=os.readlink(path)) | return cls.from_bytes(mode=mode, data=os.readlink(path)) | ||||
@classmethod | @classmethod | ||||
def from_file( | def from_file( | ||||
cls, *, path, data=False, save_path=False, | cls, *, path, max_content_length=None): | ||||
max_content_length=None): | |||||
"""Compute the Software Heritage content entry corresponding to an | """Compute the Software Heritage content entry corresponding to an | ||||
on-disk file. | on-disk file. | ||||
The returned dictionary contains keys useful for both: | The returned dictionary contains keys useful for both: | ||||
- loading the content in the archive (hashes, `length`) | - loading the content in the archive (hashes, `length`) | ||||
- using the content as a directory entry in a directory | - using the content as a directory entry in a directory | ||||
Args: | Args: | ||||
path (bytes): path to the file for which we're computing the | |||||
content entry | |||||
data (bool): add the file data to the entry | |||||
save_path (bool): add the file path to the entry | save_path (bool): add the file path to the entry | ||||
max_content_length (Optional[int]): if given, all contents larger | max_content_length (Optional[int]): if given, all contents larger | ||||
than this will be skipped. | than this will be skipped. | ||||
""" | """ | ||||
file_stat = os.lstat(path) | file_stat = os.lstat(path) | ||||
mode = file_stat.st_mode | mode = file_stat.st_mode | ||||
length = file_stat.st_size | length = file_stat.st_size | ||||
Show All 14 Lines | def from_file( | ||||
return cls.from_symlink(path=path, mode=mode) | return cls.from_symlink(path=path, mode=mode) | ||||
elif not stat.S_ISREG(mode): | elif not stat.S_ISREG(mode): | ||||
# not a regular file: return the empty file instead | # not a regular file: return the empty file instead | ||||
return cls.from_bytes(mode=mode, data=b'') | return cls.from_bytes(mode=mode, data=b'') | ||||
if too_large: | if too_large: | ||||
skip_reason = 'Content too large' | skip_reason = 'Content too large' | ||||
elif not data: | |||||
skip_reason = 'Skipping file content' | |||||
else: | else: | ||||
skip_reason = None | skip_reason = None | ||||
hashes = MultiHash.from_path(path).digest() | |||||
if skip_reason: | if skip_reason: | ||||
ret = { | ret = { | ||||
**MultiHash.from_path(path).digest(), | **hashes, | ||||
'status': 'absent', | 'status': 'absent', | ||||
'reason': skip_reason, | 'reason': skip_reason, | ||||
} | } | ||||
else: | else: | ||||
h = MultiHash(length=length) | |||||
chunks = [] | |||||
with open(path, 'rb') as fobj: | |||||
while True: | |||||
chunk = fobj.read(HASH_BLOCK_SIZE) | |||||
if not chunk: | |||||
break | |||||
h.update(chunk) | |||||
chunks.append(chunk) | |||||
ret = { | ret = { | ||||
**h.digest(), | **hashes, | ||||
'status': 'visible', | 'status': 'visible', | ||||
'data': b''.join(chunks), | |||||
} | } | ||||
if save_path: | |||||
ret['path'] = path | ret['path'] = path | ||||
ret['perms'] = mode_to_perms(mode) | ret['perms'] = mode_to_perms(mode) | ||||
ret['length'] = length | ret['length'] = length | ||||
obj = cls(ret) | obj = cls(ret) | ||||
return obj | return obj | ||||
def __repr__(self): | def __repr__(self): | ||||
return 'Content(id=%s)' % id_to_str(self.hash) | return 'Content(id=%s)' % id_to_str(self.hash) | ||||
def compute_hash(self): | def compute_hash(self): | ||||
return self.data['sha1_git'] | return self.data['sha1_git'] | ||||
def to_model(self) -> model.BaseContent: | |||||
"""Builds a `model.BaseContent` object based on this leaf.""" | |||||
data = self.get_data().copy() | |||||
data.pop('perms', None) | |||||
if data['status'] == 'absent': | |||||
data.pop('path', None) | |||||
olasd: By default, `from_disk` will not pull the full data from disk into memory, so by doing this… | |||||
return model.SkippedContent.from_dict(data) | |||||
elif 'data' in data: | |||||
return model.Content.from_dict(data) | |||||
else: | |||||
return DiskBackedContent.from_dict(data) | |||||
def accept_all_directories(dirname, entries): | def accept_all_directories(dirname, entries): | ||||
"""Default filter for :func:`Directory.from_disk` accepting all | """Default filter for :func:`Directory.from_disk` accepting all | ||||
directories | directories | ||||
Args: | Args: | ||||
dirname (bytes): directory name | dirname (bytes): directory name | ||||
entries (list): directory entries | entries (list): directory entries | ||||
▲ Show 20 Lines • Show All 55 Lines • ▼ Show 20 Lines | class Directory(MerkleNode): | ||||
the affected levels of hierarchy are reset and can be collected again using | the affected levels of hierarchy are reset and can be collected again using | ||||
the same method. This enables the efficient collection of updated nodes, | the same method. This enables the efficient collection of updated nodes, | ||||
for instance when the client is applying diffs. | for instance when the client is applying diffs. | ||||
""" | """ | ||||
__slots__ = ['__entries'] | __slots__ = ['__entries'] | ||||
type = 'directory' | type = 'directory' | ||||
@classmethod | @classmethod | ||||
def from_disk(cls, *, path, data=False, save_path=False, | def from_disk(cls, *, path, | ||||
dir_filter=accept_all_directories, | dir_filter=accept_all_directories, | ||||
max_content_length=None): | max_content_length=None): | ||||
"""Compute the Software Heritage objects for a given directory tree | """Compute the Software Heritage objects for a given directory tree | ||||
Args: | Args: | ||||
path (bytes): the directory to traverse | path (bytes): the directory to traverse | ||||
data (bool): whether to add the data to the content objects | data (bool): whether to add the data to the content objects | ||||
save_path (bool): whether to add the path to the content objects | save_path (bool): whether to add the path to the content objects | ||||
Show All 11 Lines | def from_disk(cls, *, path, | ||||
for root, dentries, fentries in os.walk(top_path, topdown=False): | for root, dentries, fentries in os.walk(top_path, topdown=False): | ||||
entries = {} | entries = {} | ||||
# Join fentries and dentries in the same processing, as symbolic | # Join fentries and dentries in the same processing, as symbolic | ||||
# links to directories appear in dentries... | # links to directories appear in dentries... | ||||
for name in fentries + dentries: | for name in fentries + dentries: | ||||
path = os.path.join(root, name) | path = os.path.join(root, name) | ||||
if not os.path.isdir(path) or os.path.islink(path): | if not os.path.isdir(path) or os.path.islink(path): | ||||
content = Content.from_file( | content = Content.from_file( | ||||
path=path, data=data, save_path=save_path, | path=path, max_content_length=max_content_length) | ||||
max_content_length=max_content_length) | |||||
entries[name] = content | entries[name] = content | ||||
else: | else: | ||||
if dir_filter(name, dirs[path].entries): | if dir_filter(name, dirs[path].entries): | ||||
entries[name] = dirs[path] | entries[name] = dirs[path] | ||||
dirs[root] = cls({'name': os.path.basename(root)}) | dirs[root] = cls({'name': os.path.basename(root)}) | ||||
dirs[root].update(entries) | dirs[root].update(entries) | ||||
▲ Show 20 Lines • Show All 42 Lines • ▼ Show 20 Lines | def entries(self): | ||||
for name, child in self.items() | for name, child in self.items() | ||||
), key=directory_entry_sort_key) | ), key=directory_entry_sort_key) | ||||
return self.__entries | return self.__entries | ||||
def compute_hash(self): | def compute_hash(self): | ||||
return id_to_bytes(directory_identifier({'entries': self.entries})) | return id_to_bytes(directory_identifier({'entries': self.entries})) | ||||
def to_model(self) -> model.Directory: | |||||
"""Builds a `model.Directory` object based on this node; | |||||
ignoring its children.""" | |||||
return model.Directory.from_dict(self.get_data()) | |||||
def __getitem__(self, key): | def __getitem__(self, key): | ||||
if not isinstance(key, bytes): | if not isinstance(key, bytes): | ||||
raise ValueError('Can only get a bytes from Directory') | raise ValueError('Can only get a bytes from Directory') | ||||
# Convenience shortcut | # Convenience shortcut | ||||
if key == b'': | if key == b'': | ||||
return self | return self | ||||
Show All 39 Lines |
By default, from_disk will not pull the full data from disk into memory, so by doing this there's a good chance you'll end up with a Content object that has an empty data attribute.
We already have lots of issues with loaders having an "optimistic" usage of memory, so I'd be tempted to introduce a DiskBackedContent (DiskContent?) model, inheriting from Content, with a path attribute as well as a lazy data attribute that will read data from disk the first time it's accessed.