diff --git a/swh/model/from_disk.py b/swh/model/from_disk.py
new file mode 100644
--- /dev/null
+++ b/swh/model/from_disk.py
@@ -0,0 +1,196 @@
+# Copyright (C) 2017 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+import stat
+
+from . import hashutil
+from .identifiers import (
+    directory_identifier, identifier_to_bytes as id_to_bytes
+)
+
+# Permissions for directory entries
+DENTRY_PERMS = {
+    'content': 0o100644,
+    'executable_content': 0o100755,
+    'symlink': 0o120000,
+    'directory': 0o040000,
+    'revision': 0o160000,
+}
+
+
+def mode_to_perms(mode):
+    """Convert a file mode to a permission compatible with Software Heritage
+    directory entries
+
+    Args:
+        mode (int): a file mode as returned by :func:`os.stat` in
+                    :attr:`os.stat_result.st_mode`
+
+    Returns:
+        int: one of the following values:
+            :0o100644: plain file
+            :0o100755: executable file
+            :0o120000: symbolic link
+            :0o040000: directory
+
+    """
+    if stat.S_ISLNK(mode):
+        return DENTRY_PERMS['symlink']
+    if stat.S_ISDIR(mode):
+        return DENTRY_PERMS['directory']
+    else:
+        # file is executable in any way
+        if mode & (0o111):
+            return DENTRY_PERMS['executable_content']
+        else:
+            return DENTRY_PERMS['content']
+
+
+def data_to_content(*, filename, mode, data):
+    """Convert data (raw :class:`bytes`) to a Software Heritage content entry
+
+    Args:
+      filename (bytes): the pathname to the loaded file
+      mode (int): a file mode (passed to :func:`mode_to_perms`)
+      data (bytes): raw contents of the file
+
+    Returns:
+      dict: A Software Heritage compatible content with the following keys:
+        :sha1, sha1_git, ... (bytes): one key per hash algorithm
+        :length (int): the length of the data passed in
+        :type (str): hardcoded to 'file' (used for directory entries)
+        :perms (int): the permissions of the file (used for directory entries)
+        :name (bytes): the base name of the file
+        :data (bytes): the raw contents passed in as the `data` argument
+    """
+    ret = hashutil.hash_data(data)
+    ret['length'] = len(data)
+    ret['perms'] = mode_to_perms(mode)
+    ret['name'] = os.path.basename(filename)
+    ret['type'] = 'file'
+    ret['data'] = data
+
+    return ret
+
+
+def symlink_to_content(*, filename, mode):
+    """Convert a symbolic link to a Software Heritage content entry"""
+    return data_to_content(filename=filename, mode=mode,
+                           data=os.readlink(filename))
+
+
+def file_to_content(*, filename, data=False):
+    """Compute the Software Heritage content entry corresponding to an on-disk
+    file.
+
+    The returned dictionary contains keys useful for both:
+    - loading the content in the archive (hashes, `length`)
+    - using the content as a directory entry in a directory
+
+    Args:
+      filename (bytes): name of the file for which we're computing the content
+        entry
+      data (bool): add the file data to the entry
+    Returns:
+      dict: A dictionary with the following keys:
+        :sha1, sha1_git, ... (bytes): one key per hash algorithm (defined in
+            :const:`swh.model.hashutil.HASH_ALGORITHMS`)
+        :length (int): the length of the content in bytes
+        :type (str): hardcoded to 'file' (used for directory entries)
+        :perms (int): the permissions of the file (used for directory entries)
+        :name (bytes): the base name of the file
+        :data (bytes): the raw contents of the file (present if `data` is
+            true; always present for symlinks and special files)
+    """
+    file_stat = os.lstat(filename)
+    mode = file_stat.st_mode
+
+    if stat.S_ISLNK(mode):
+        # Symbolic link: return a file whose contents are the link target
+        return symlink_to_content(filename=filename, mode=mode)
+    elif not stat.S_ISREG(mode):
+        # not a regular file: return the empty file instead
+        return data_to_content(filename=filename, mode=mode, data=b'')
+
+    length = file_stat.st_size
+
+    if not data:
+        ret = hashutil.hash_path(filename)
+    else:
+        chunks = []
+
+        def append_chunk(x, chunks=chunks):
+            chunks.append(x)
+
+        with open(filename, 'rb') as fobj:
+            ret = hashutil.hash_file(fobj, length=length,
+                                     chunk_cb=append_chunk)
+
+        ret['data'] = b''.join(chunks)
+
+    ret['type'] = 'file'
+    ret['perms'] = mode_to_perms(mode)
+    ret['length'] = length
+    ret['name'] = os.path.basename(filename)
+
+    return ret
+
+
+def directory_to_objects(*, directory, data=False, ignore_empty_dirs=False):
+    """Compute the Software Heritage objects for a given directory tree
+
+    Args:
+      directory (bytes): the directory name to traverse
+      data (bool): add the data to the content objects
+      ignore_empty_dirs (bool): whether to ignore empty directories
+
+    Returns:
+      a dictionary with two keys: 'directory' and 'content', each containing a
+      list of objects ready to be consumed by the Software Heritage database.
+    """
+    def content_to_directory_entry(content):
+        return {
+            'type': content['type'],
+            'perms': content['perms'],
+            'target': content['sha1_git'],
+            'name': content['name'],
+        }
+
+    def directory_to_directory_entry(directory):
+        return {
+            'type': 'dir',
+            'perms': DENTRY_PERMS['directory'],
+            'target': directory['id'],
+            'name': directory['name']
+        }
+
+    dirs = {}
+    files = {}
+    # Walk bottom-up so that every subdirectory is already in `dirs` (with its
+    # id computed) by the time its parent directory is processed.
+    for root, dentries, fentries in os.walk(directory, topdown=False):
+        entries = []
+        # Join fentries and dentries in the same processing, as symbolic links
+        # to directories appear in dentries...
+        for name in fentries + dentries:
+            filename = os.path.join(root, name)
+            if not os.path.isdir(filename) or os.path.islink(filename):
+                content = file_to_content(filename=filename, data=data)
+                files[filename] = content
+                entries.append(content_to_directory_entry(content))
+            else:
+                subdir = dirs[filename]
+                if subdir['entries'] or not ignore_empty_dirs:
+                    entries.append(directory_to_directory_entry(subdir))
+
+        current = {'entries': entries, 'name': os.path.basename(root)}
+        current['id'] = id_to_bytes(directory_identifier(current))
+        dirs[root] = current
+
+    return {
+        'content': list(files.values()),
+        'directory': list(dirs.values()),
+    }
diff --git a/swh/model/tests/test_from_disk.py b/swh/model/tests/test_from_disk.py
new file mode 100644
--- /dev/null
+++ b/swh/model/tests/test_from_disk.py
@@ -0,0 +1,218 @@
+# Copyright (C) 2017 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+import tempfile
+import unittest
+
+from swh.model import from_disk
+from swh.model.hashutil import hash_to_bytes
+
+
+class ModeToPerms(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+
+        # Generate a full permissions map
+        self.perms_map = {}
+
+        # Symlinks
+        for i in range(0o120000, 0o127777 + 1):
+            self.perms_map[i] = 0o120000
+
+        # Directories
+        for i in range(0o040000, 0o047777 + 1):
+            self.perms_map[i] = 0o040000
+
+        # Other file types: socket, regular file, block device, character
+        # device, fifo all map to regular files
+        for ft in [0o140000, 0o100000, 0o060000, 0o020000, 0o010000]:
+            for i in range(ft, ft + 0o7777 + 1):
+                if i & 0o111:
+                    # executable bits are set
+                    self.perms_map[i] = 0o100755
+                else:
+                    self.perms_map[i] = 0o100644
+
+    def test_exhaustive_mode_to_perms(self):
+        for fmode, perm in self.perms_map.items():
+            self.assertEqual(perm, from_disk.mode_to_perms(fmode))
+
+
+class DataMixIn:
+    maxDiff = None
+
+    def setUp(self):
+        self.tmpdir = tempfile.TemporaryDirectory(
+            prefix=b'swh.model.from_disk'
+        )
+        self.contents = {
+            b'file': {
+                'data': b'42\n',
+                'sha1': hash_to_bytes(
+                    '34973274ccef6ab4dfaaf86599792fa9c3fe4689'
+                ),
+                'sha256': hash_to_bytes(
+                    '084c799cd551dd1d8d5c5f9a5d593b2e'
+                    '931f5e36122ee5c793c1d08a19839cc0'
+                ),
+                'sha1_git': hash_to_bytes(
+                    'd81cc0710eb6cf9efd5b920a8453e1e07157b6cd'),
+                'blake2s256': hash_to_bytes(
+                    'd5fe1939576527e42cfd76a9455a2432'
+                    'fe7f56669564577dd93c4280e76d661d'
+                ),
+                'length': 3,
+                'mode': 0o100644
+            },
+        }
+
+        self.symlinks = {
+            b'symlink': {
+                'data': b'target',
+                'blake2s256': hash_to_bytes(
+                    '595d221b30fdd8e10e2fdf18376e688e'
+                    '9f18d56fd9b6d1eb6a822f8c146c6da6'
+                ),
+                'sha1': hash_to_bytes(
+                    '0e8a3ad980ec179856012b7eecf4327e99cd44cd'
+                ),
+                'sha1_git': hash_to_bytes(
+                    '1de565933b05f74c75ff9a6520af5f9f8a5a2f1d'
+                ),
+                'sha256': hash_to_bytes(
+                    '34a04005bcaf206eec990bd9637d9fdb'
+                    '6725e0a0c0d4aebf003f17f4c956eb5c'
+                ),
+                'length': 6,
+            }
+        }
+
+        self.specials = {
+            b'fifo': os.mkfifo,
+            b'devnull': lambda path: os.mknod(path, device=os.makedev(1, 3)),
+        }
+
+        self.empty_content = {
+            'data': b'',
+            'length': 0,
+            'blake2s256': hash_to_bytes(
+                '69217a3079908094e11121d042354a7c'
+                '1f55b6482ca1a51e1b250dfd1ed0eef9'
+            ),
+            'sha1': hash_to_bytes(
+                'da39a3ee5e6b4b0d3255bfef95601890afd80709'
+            ),
+            'sha1_git': hash_to_bytes(
+                'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391'
+            ),
+            'sha256': hash_to_bytes(
+                'e3b0c44298fc1c149afbf4c8996fb924'
+                '27ae41e4649b934ca495991b7852b855'
+            ),
+        }
+
+    def tearDown(self):
+        self.tmpdir.cleanup()
+
+    def make_contents(self, directory):
+        for filename, content in self.contents.items():
+            path = os.path.join(directory, filename)
+            with open(path, 'wb') as f:
+                f.write(content['data'])
+            os.chmod(path, content['mode'])
+
+    def make_symlinks(self, directory):
+        for filename, symlink in self.symlinks.items():
+            path = os.path.join(directory, filename)
+            os.symlink(symlink['data'], path)
+
+    def make_specials(self, directory):
+        for filename, fn in self.specials.items():
+            path = os.path.join(directory, filename)
+            fn(path)
+
+
+class DataToContent(DataMixIn, unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+
+    def test_data_to_content(self):
+        for filename, content in self.contents.items():
+            content = content.copy()
+            mode = content.pop('mode')
+            conv_content = from_disk.data_to_content(
+                filename=filename, mode=mode, data=content['data'])
+            self.assertEqual(conv_content.pop('name'), filename)
+            self.assertEqual(conv_content.pop('perms'),
+                             from_disk.mode_to_perms(mode))
+            self.assertEqual(conv_content.pop('type'), 'file')
+            self.assertEqual(conv_content, content)
+
+
+class SymlinkToContent(DataMixIn, unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        self.make_symlinks(self.tmpdir.name)
+
+    def test_symlink_to_content(self):
+        for filename, symlink in self.symlinks.items():
+            path = os.path.join(self.tmpdir.name, filename)
+            perms = 0o120000
+            conv_content = from_disk.symlink_to_content(filename=path,
+                                                        mode=perms)
+            self.assertEqual(conv_content.pop('name'), filename)
+            self.assertEqual(conv_content.pop('perms'), perms)
+            self.assertEqual(conv_content.pop('type'), 'file')
+            self.assertEqual(conv_content, symlink)
+
+
+class FileToContent(DataMixIn, unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        self.make_contents(self.tmpdir.name)
+        self.make_symlinks(self.tmpdir.name)
+        self.make_specials(self.tmpdir.name)
+
+    def test_file_to_content(self):
+        for data in [False, True]:
+            for filename, symlink in self.symlinks.items():
+                path = os.path.join(self.tmpdir.name, filename)
+                perms = 0o120000
+                conv_content = from_disk.file_to_content(filename=path,
+                                                         data=data)
+                self.assertEqual(conv_content.pop('name'), filename)
+                self.assertEqual(conv_content.pop('perms'), perms)
+                self.assertEqual(conv_content.pop('type'), 'file')
+                if not data:
+                    conv_content['data'] = symlink['data']
+                self.assertEqual(conv_content, symlink)
+
+            for filename, content in self.contents.items():
+                content = content.copy()
+                path = os.path.join(self.tmpdir.name, filename)
+                perms = 0o100644
+                if content.pop('mode') & 0o111:
+                    perms = 0o100755
+                conv_content = from_disk.file_to_content(filename=path,
+                                                         data=data)
+                self.assertEqual(conv_content.pop('name'), filename)
+                self.assertEqual(conv_content.pop('perms'), perms)
+                self.assertEqual(conv_content.pop('type'), 'file')
+                if not data:
+                    conv_content['data'] = content['data']
+                self.assertEqual(conv_content, content)
+
+            for filename in self.specials:
+                path = os.path.join(self.tmpdir.name, filename)
+                perms = 0o100644
+                conv_content = from_disk.file_to_content(filename=path,
+                                                         data=data)
+                self.assertEqual(conv_content.pop('name'), filename)
+                self.assertEqual(conv_content.pop('perms'), perms)
+                self.assertEqual(conv_content.pop('type'), 'file')
+                if not data:
+                    conv_content['data'] = b''
+                self.assertEqual(conv_content, self.empty_content)