Page Menu | Home | Software Heritage

D248.id819.diff
No One | Temporary

D248.id819.diff

diff --git a/swh/model/from_disk.py b/swh/model/from_disk.py
new file mode 100644
--- /dev/null
+++ b/swh/model/from_disk.py
@@ -0,0 +1,250 @@
+# Copyright (C) 2017 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+import stat
+
+from . import hashutil
+from .identifiers import (
+ directory_identifier, identifier_to_bytes as id_to_bytes
+)
+
+# Permissions for directory entries
+#
+# Maps each kind of directory entry to the single mode value recorded for it.
+# The leading octal digits are file-type bits (0o100000 regular file,
+# 0o120000 symbolic link, 0o040000 directory); the trailing 644/755 of the two
+# content values are the only permission bits that are kept.
+DENTRY_PERMS = {
+ 'content': 0o100644,
+ 'executable_content': 0o100755,
+ 'symlink': 0o120000,
+ 'directory': 0o040000,
+ # NOTE(review): 0o160000 is not a POSIX file type; presumably this mirrors
+ # git's "gitlink" (submodule) entry mode -- confirm.
+ 'revision': 0o160000,
+}
+
+
+def mode_to_perms(mode):
+    """Map a raw file mode onto a Software Heritage directory entry permission
+
+    Only the file type and the presence of an executable bit are taken into
+    account; every other permission bit is discarded.
+
+    Args:
+        mode (int): a file mode as returned by :func:`os.stat` in
+          :attr:`os.stat_result.st_mode`
+
+    Returns:
+        int: one of the following values:
+          :0o100644: plain file
+          :0o100755: executable file
+          :0o120000: symbolic link
+          :0o040000: directory
+
+    """
+    if stat.S_ISLNK(mode):
+        kind = 'symlink'
+    elif stat.S_ISDIR(mode):
+        kind = 'directory'
+    elif mode & 0o111:
+        # at least one of the user/group/other executable bits is set
+        kind = 'executable_content'
+    else:
+        kind = 'content'
+
+    return DENTRY_PERMS[kind]
+
+
+def data_to_content(*, filename, mode, data):
+    """Build a Software Heritage content entry from in-memory data
+
+    Args:
+        filename (bytes): the pathname to the loaded file
+        mode (int): a file mode (passed to :func:`mode_to_perms`)
+        data (bytes): raw contents of the file
+
+    Returns:
+        dict: A Software Heritage compatible content with the following keys:
+          :sha1, sha1_git, ... (bytes): one key per hash algorithm
+          :length (int): the length of the data passed in
+          :type (str): hardcoded to 'file' (used for directory entries)
+          :perms (int): the permissions of the file (used for directory
+            entries)
+          :name (bytes): the base name of the file
+          :data (bytes): the raw contents passed in as the `data` argument
+    """
+    # Start from the hashes, then add the metadata on top of them
+    content = hashutil.hash_data(data)
+    content.update({
+        'length': len(data),
+        'perms': mode_to_perms(mode),
+        'name': os.path.basename(filename),
+        'type': 'file',
+        'data': data,
+    })
+    return content
+
+
+def symlink_to_content(*, filename, mode):
+    """Convert a symbolic link to a Software Heritage content entry
+
+    The target of the link, as returned by :func:`os.readlink`, is used as
+    the data of the entry.
+
+    Args:
+        filename (bytes): path to the symbolic link
+        mode (int): file mode of the link (handed to
+          :func:`data_to_content`)
+
+    Returns:
+        dict: the entry built by :func:`data_to_content`
+    """
+    target = os.readlink(filename)
+    return data_to_content(filename=filename, mode=mode, data=target)
+
+
+def file_to_content(*, filename, data=False):
+    """Compute the Software Heritage content entry for an on-disk file
+
+    The returned dictionary is suitable both for:
+     - loading the content in the archive (hashes, `length`)
+     - referencing the content from a directory (`type`, `perms`, `name`)
+
+    Symbolic links yield a content whose data is the link target, and other
+    non-regular files yield the empty content; for both, the `data` key is
+    always present, whatever the value of the `data` argument.
+
+    Args:
+        filename (bytes): path of the file to convert
+        data (bool): for regular files, whether to add the raw file contents
+          to the entry
+
+    Returns:
+        dict: A dictionary with the following keys:
+          :sha1, sha1_git, ... (bytes): one key per hash algorithm (defined in
+            :const:`swh.model.hashutil.HASH_ALGORITHMS`)
+          :length (int): the length of the content
+          :type (str): hardcoded to 'file' (used for directory entries)
+          :perms (int): the permissions of the file (used for directory
+            entries)
+          :name (bytes): the base name of the file
+          :data (bytes): the raw contents of the file (for regular files,
+            only when the `data` argument is true)
+    """
+    # lstat, not stat: symbolic links must not be followed
+    st = os.lstat(filename)
+
+    if stat.S_ISLNK(st.st_mode):
+        # Symbolic link: return a file whose contents are the link target
+        return symlink_to_content(filename=filename, mode=st.st_mode)
+
+    if not stat.S_ISREG(st.st_mode):
+        # not a regular file: return the empty file instead
+        return data_to_content(filename=filename, mode=st.st_mode, data=b'')
+
+    if data:
+        # Collect the chunks as they are hashed, to read the file only once
+        chunks = []
+        with open(filename, 'rb') as fobj:
+            entry = hashutil.hash_file(fobj, length=st.st_size,
+                                       chunk_cb=chunks.append)
+        entry['data'] = b''.join(chunks)
+    else:
+        entry = hashutil.hash_path(filename)
+
+    entry['type'] = 'file'
+    entry['perms'] = mode_to_perms(st.st_mode)
+    entry['length'] = st.st_size
+    entry['name'] = os.path.basename(filename)
+
+    return entry
+
+
+def accept_all_directories(dirname, entries):
+    """Directory filter keeping every directory
+
+    This is the default filter of :func:`directory_to_objects`; both
+    arguments are ignored.
+
+    Args:
+        dirname (bytes): directory name
+        entries (list): directory entries
+    Returns:
+        bool: always True
+    """
+    return True
+
+
+def ignore_empty_directories(dirname, entries):
+    """Directory filter for :func:`directory_to_objects` dropping the
+    directories that have no entries
+
+    Args:
+        dirname (bytes): directory name (unused)
+        entries (list): directory entries
+    Returns:
+        bool: True if the directory has at least one entry, False if it is
+        empty
+    """
+    return True if entries else False
+
+
+def ignore_named_directories(names, *, case_sensitive=True):
+    """Filter for :func:`directory_to_objects` to ignore directories named one
+    of names.
+
+    The comparison is done on the base name of the directory, so the filter
+    behaves the same whether it is handed a bare name or a full path.
+
+    Args:
+        names (list of bytes): names to ignore
+        case_sensitive (bool): whether to do the filtering in a case sensitive
+          way
+    Returns:
+        a directory filter for :func:`directory_to_objects`: a function
+        taking (dirname, entries) and returning False for the directories to
+        ignore, True for all the others
+    """
+    if not case_sensitive:
+        names = [name.lower() for name in names]
+
+    # Frozen once: constant-time lookups, and safe if `names` is an iterator
+    ignored = frozenset(names)
+
+    def named_filter(dirname, entries,
+                     names=ignored, case_sensitive=case_sensitive):
+        # The filter can be called with the path of the directory rather than
+        # its bare name; only the last component must be compared.
+        basename = os.path.basename(dirname)
+        if not case_sensitive:
+            basename = basename.lower()
+        return basename not in names
+
+    return named_filter
+
+
+def directory_to_objects(*, directory, data=False,
+                         dir_filter=accept_all_directories):
+    """Compute the Software Heritage objects for a given directory tree
+
+    The tree is walked bottom-up, so that the entries of a directory are
+    known by the time the directory itself is processed. Directories
+    rejected by `dir_filter` are left out of their parent's entries, and
+    neither they nor anything found below them is returned. The filter is
+    not applied to `directory` itself.
+
+    Args:
+        directory (bytes): the directory name to traverse
+        data (bool): whether to add the data to the content objects
+        dir_filter (function): a filter to ignore some directories by
+          name or contents. Takes two arguments: dirname and entries, and
+          returns True if the directory should be added, False if the
+          directory should be ignored.
+
+    Returns:
+        a dictionary with two keys: 'directory' and 'content', each containing
+        a list of objects ready to be consumed by the Software Heritage
+        database.
+    """
+    def content_to_directory_entry(content):
+        return {
+            'type': content['type'],
+            'perms': content['perms'],
+            'target': content['sha1_git'],
+            'name': content['name'],
+        }
+
+    def directory_to_directory_entry(dir_obj):
+        return {
+            'type': 'dir',
+            'perms': DENTRY_PERMS['directory'],
+            'target': dir_obj['id'],
+            'name': dir_obj['name']
+        }
+
+    # Objects found so far, indexed by their path on disk
+    dirs = {}
+    files = {}
+
+    def prune(path):
+        """Forget the directory at `path` and every object found below it"""
+        # `path[:0]` is the empty string of the same type (bytes or str) as
+        # `path`: joining it appends the trailing separator, so that sibling
+        # directories merely sharing a name prefix are not matched.
+        prefix = os.path.join(path, path[:0])
+        dirs.pop(path, None)
+        for objects in (dirs, files):
+            for key in [key for key in objects if key.startswith(prefix)]:
+                del objects[key]
+
+    for root, dentries, fentries in os.walk(directory, topdown=False):
+        entries = []
+        # Join fentries and dentries in the same processing, as symbolic links
+        # to directories appear in dentries...
+        for name in fentries + dentries:
+            path = os.path.join(root, name)
+            if not os.path.isdir(path) or os.path.islink(path):
+                content = file_to_content(filename=path, data=data)
+                files[path] = content
+                entries.append(content_to_directory_entry(content))
+                continue
+
+            # The walk is bottom-up: the subdirectory was already processed
+            subdir = dirs[path]
+            if dir_filter(path, subdir['entries']):
+                entries.append(directory_to_directory_entry(subdir))
+            else:
+                # Everything below `path` has been collected by now, so the
+                # whole subtree can be dropped at once
+                prune(path)
+
+        current = {'entries': entries, 'name': os.path.basename(root)}
+        current['id'] = id_to_bytes(directory_identifier(current))
+        dirs[root] = current
+
+    return {
+        'content': list(files.values()),
+        'directory': list(dirs.values()),
+    }
diff --git a/swh/model/tests/test_from_disk.py b/swh/model/tests/test_from_disk.py
new file mode 100644
--- /dev/null
+++ b/swh/model/tests/test_from_disk.py
@@ -0,0 +1,274 @@
+# Copyright (C) 2017 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+import tempfile
+import unittest
+
+from swh.model import from_disk
+from swh.model.hashutil import hash_to_bytes
+
+
+class ModeToPerms(unittest.TestCase):
+    """Check :func:`from_disk.mode_to_perms` against a map of expected
+    permissions covering every mode of each tested file type"""
+
+    def setUp(self):
+        super().setUp()
+
+        def all_modes(file_type):
+            # every combination of the twelve low bits for a file type
+            return range(file_type, file_type + 0o7777 + 1)
+
+        # Expected permissions, indexed by file mode
+        self.perms_map = {}
+
+        # Symlinks and directories keep their type and lose all other bits
+        for file_type in (0o120000, 0o040000):
+            self.perms_map.update(
+                (fmode, file_type) for fmode in all_modes(file_type)
+            )
+
+        # Other file types: socket, regular file, block device, character
+        # device, fifo all map to regular files, executable if any of the
+        # executable bits is set
+        for file_type in (0o140000, 0o100000, 0o060000, 0o020000, 0o010000):
+            for fmode in all_modes(file_type):
+                self.perms_map[fmode] = 0o100755 if fmode & 0o111 else 0o100644
+
+    def test_exhaustive_mode_to_perms(self):
+        for fmode, expected in self.perms_map.items():
+            self.assertEqual(expected, from_disk.mode_to_perms(fmode))
+
+
+class DataMixIn:
+    """Fixtures shared by the from_disk test cases
+
+    Provides a temporary directory, the expected content entries for a few
+    sample files, and helpers creating those files on disk.
+    """
+    maxDiff = None
+
+    def setUp(self):
+        # The prefix is bytes: the helpers below join the directory name with
+        # bytes file names
+        self.tmpdir = tempfile.TemporaryDirectory(
+            prefix=b'swh.model.from_disk'
+        )
+
+        # Regular files, by name: expected entry plus the 'mode' to chmod to
+        self.contents = {
+            b'file': {
+                'data': b'42\n',
+                'sha1': hash_to_bytes(
+                    '34973274ccef6ab4dfaaf86599792fa9c3fe4689'
+                ),
+                'sha256': hash_to_bytes(
+                    '084c799cd551dd1d8d5c5f9a5d593b2e'
+                    '931f5e36122ee5c793c1d08a19839cc0'
+                ),
+                'sha1_git': hash_to_bytes(
+                    'd81cc0710eb6cf9efd5b920a8453e1e07157b6cd'),
+                'blake2s256': hash_to_bytes(
+                    'd5fe1939576527e42cfd76a9455a2432'
+                    'fe7f56669564577dd93c4280e76d661d'
+                ),
+                'length': 3,
+                'mode': 0o100644
+            },
+        }
+
+        # Symbolic links, by name: 'data' is the target of the link
+        self.symlinks = {
+            b'symlink': {
+                'data': b'target',
+                'blake2s256': hash_to_bytes(
+                    '595d221b30fdd8e10e2fdf18376e688e'
+                    '9f18d56fd9b6d1eb6a822f8c146c6da6'
+                ),
+                'sha1': hash_to_bytes(
+                    '0e8a3ad980ec179856012b7eecf4327e99cd44cd'
+                ),
+                'sha1_git': hash_to_bytes(
+                    '1de565933b05f74c75ff9a6520af5f9f8a5a2f1d'
+                ),
+                'sha256': hash_to_bytes(
+                    '34a04005bcaf206eec990bd9637d9fdb'
+                    '6725e0a0c0d4aebf003f17f4c956eb5c'
+                ),
+                'length': 6,
+            }
+        }
+
+        # Special files, by name: function creating the file at a given path
+        self.specials = {
+            b'fifo': os.mkfifo,
+            # NOTE(review): no file type is passed in `mode`, so this
+            # presumably creates a plain empty file rather than a character
+            # device (creating a real device node needs privileges) -- confirm
+            b'devnull': lambda path: os.mknod(path, device=os.makedev(1, 3)),
+        }
+
+        # Entry expected for any file converted to the empty content
+        self.empty_content = {
+            'data': b'',
+            'length': 0,
+            'blake2s256': hash_to_bytes(
+                '69217a3079908094e11121d042354a7c'
+                '1f55b6482ca1a51e1b250dfd1ed0eef9'
+            ),
+            'sha1': hash_to_bytes(
+                'da39a3ee5e6b4b0d3255bfef95601890afd80709'
+            ),
+            'sha1_git': hash_to_bytes(
+                'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391'
+            ),
+            'sha256': hash_to_bytes(
+                'e3b0c44298fc1c149afbf4c8996fb924'
+                '27ae41e4649b934ca495991b7852b855'
+            ),
+        }
+
+    def tearDown(self):
+        self.tmpdir.cleanup()
+
+    def make_contents(self, directory):
+        """Write every file of `self.contents` in `directory`"""
+        for filename, content in self.contents.items():
+            path = os.path.join(directory, filename)
+            with open(path, 'wb') as f:
+                f.write(content['data'])
+            os.chmod(path, content['mode'])
+
+    def make_symlinks(self, directory):
+        """Create every symbolic link of `self.symlinks` in `directory`"""
+        for filename, symlink in self.symlinks.items():
+            path = os.path.join(directory, filename)
+            os.symlink(symlink['data'], path)
+
+    def make_specials(self, directory):
+        """Create every special file of `self.specials` in `directory`"""
+        for filename, fn in self.specials.items():
+            path = os.path.join(directory, filename)
+            fn(path)
+
+
+class DataToContent(DataMixIn, unittest.TestCase):
+    """Tests for :func:`from_disk.data_to_content`"""
+
+    def test_data_to_content(self):
+        for filename, content in self.contents.items():
+            # Work on a copy: the shared fixture must not be altered while
+            # it is being iterated over
+            expected = content.copy()
+            mode = expected.pop('mode')
+            conv_content = from_disk.data_to_content(
+                filename=filename, mode=mode, data=expected['data'])
+            self.assertEqual(conv_content.pop('name'), filename)
+            self.assertEqual(conv_content.pop('perms'),
+                             from_disk.mode_to_perms(mode))
+            self.assertEqual(conv_content.pop('type'), 'file')
+            # Only the hashes, the length and the data are left to compare
+            self.assertEqual(conv_content, expected)
+
+
+class SymlinkToContent(DataMixIn, unittest.TestCase):
+    """Tests for :func:`from_disk.symlink_to_content`"""
+
+    def setUp(self):
+        super().setUp()
+        self.make_symlinks(self.tmpdir.name)
+
+    def test_symlink_to_content(self):
+        link_perms = 0o120000
+        for name, expected in self.symlinks.items():
+            link_path = os.path.join(self.tmpdir.name, name)
+            entry = from_disk.symlink_to_content(filename=link_path,
+                                                 mode=link_perms)
+            # Check and remove the directory entry metadata; what is left
+            # must match the fixture
+            self.assertEqual(entry.pop('name'), name)
+            self.assertEqual(entry.pop('perms'), link_perms)
+            self.assertEqual(entry.pop('type'), 'file')
+            self.assertEqual(entry, expected)
+
+
+class FileToContent(DataMixIn, unittest.TestCase):
+    """Tests for :func:`from_disk.file_to_content` on symbolic links,
+    regular files and special files"""
+
+    def setUp(self):
+        super().setUp()
+        self.make_contents(self.tmpdir.name)
+        self.make_symlinks(self.tmpdir.name)
+        self.make_specials(self.tmpdir.name)
+
+    def check_entry(self, filename, data, perms, expected):
+        """Convert `filename` and compare the entry with `expected`"""
+        path = os.path.join(self.tmpdir.name, filename)
+        conv_content = from_disk.file_to_content(filename=path, data=data)
+        self.assertEqual(conv_content.pop('name'), filename)
+        self.assertEqual(conv_content.pop('perms'), perms)
+        self.assertEqual(conv_content.pop('type'), 'file')
+        if not data:
+            # the fixtures always hold the data: set it before comparing
+            conv_content['data'] = expected['data']
+        self.assertEqual(conv_content, expected)
+
+    def test_file_to_content(self):
+        for data in [False, True]:
+            for filename, symlink in self.symlinks.items():
+                self.check_entry(filename, data, 0o120000, symlink)
+
+            for filename, content in self.contents.items():
+                expected = content.copy()
+                executable = expected.pop('mode') & 0o111
+                perms = 0o100755 if executable else 0o100644
+                self.check_entry(filename, data, perms, expected)
+
+            for filename in self.specials:
+                self.check_entry(filename, data, 0o100644,
+                                 self.empty_content)
+
+
+class DirectoryToObjects(DataMixIn, unittest.TestCase):
+    """Tests for :func:`from_disk.directory_to_objects`
+
+    The tree under test holds six directories: the root, `contents`,
+    `symlinks`, `specials`, `empty1` and `empty1/empty2`.
+    """
+
+    def setUp(self):
+        super().setUp()
+        contents = os.path.join(self.tmpdir.name, b'contents')
+        os.mkdir(contents)
+        self.make_contents(contents)
+        symlinks = os.path.join(self.tmpdir.name, b'symlinks')
+        os.mkdir(symlinks)
+        self.make_symlinks(symlinks)
+        specials = os.path.join(self.tmpdir.name, b'specials')
+        os.mkdir(specials)
+        self.make_specials(specials)
+        empties = os.path.join(self.tmpdir.name, b'empty1', b'empty2')
+        os.makedirs(empties)
+
+    def test_directory_to_objects(self):
+        objs = from_disk.directory_to_objects(directory=self.tmpdir.name)
+        self.assertIn('content', objs)
+        self.assertIn('directory', objs)
+
+        self.assertEqual(len(objs['directory']), 6)
+        self.assertEqual(len(objs['content']),
+                         len(self.contents)
+                         + len(self.symlinks)
+                         + len(self.specials))
+
+    def test_directory_to_objects_ignore_empty(self):
+        objs = from_disk.directory_to_objects(
+            directory=self.tmpdir.name,
+            dir_filter=from_disk.ignore_empty_directories
+        )
+
+        self.assertIn('content', objs)
+        self.assertIn('directory', objs)
+
+        # empty2 is ignored, which leaves empty1 empty and ignored in turn
+        self.assertEqual(len(objs['directory']), 4)
+        self.assertEqual(len(objs['content']),
+                         len(self.contents)
+                         + len(self.symlinks)
+                         + len(self.specials))
+
+    def test_directory_to_objects_ignore_name(self):
+        # The name must be bytes and match the directory made in setUp
+        objs = from_disk.directory_to_objects(
+            directory=self.tmpdir.name,
+            dir_filter=from_disk.ignore_named_directories([b'symlinks'])
+        )
+
+        self.assertIn('content', objs)
+        self.assertIn('directory', objs)
+
+        # The symlinks directory is ignored, along with the links it holds
+        self.assertEqual(len(objs['directory']), 5)
+        self.assertEqual(len(objs['content']),
+                         len(self.contents)
+                         + len(self.specials))

File Metadata

Mime Type
text/plain
Expires
Thu, Jul 3, 11:55 AM (3 w, 3 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3222769

Event Timeline