Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9341297
D248.id819.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
18 KB
Subscribers
None
D248.id819.diff
View Options
diff --git a/swh/model/from_disk.py b/swh/model/from_disk.py
new file mode 100644
--- /dev/null
+++ b/swh/model/from_disk.py
@@ -0,0 +1,250 @@
+# Copyright (C) 2017 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+import stat
+
+from . import hashutil
+from .identifiers import (
+ directory_identifier, identifier_to_bytes as id_to_bytes
+)
+
+# Permissions for directory entries, keyed by the kind of object the entry
+# points to. The values are the modes stored in directory entries.
+DENTRY_PERMS = {
+    'content': 0o100644,  # plain (non-executable) file
+    'executable_content': 0o100755,  # file with an executable bit set
+    'symlink': 0o120000,  # symbolic link
+    'directory': 0o040000,  # directory
+    'revision': 0o160000,  # not produced by any function visible here
+}
+
+
+def mode_to_perms(mode):
+    """Map a raw file mode onto a Software Heritage directory entry
+    permission.
+
+    Args:
+        mode (int): a file mode, as found in
+            :attr:`os.stat_result.st_mode` of an :func:`os.stat` result
+
+    Returns:
+        int: one of the following values:
+            :0o120000: symbolic link
+            :0o040000: directory
+            :0o100755: any other file type with an executable bit set
+            :0o100644: any other file type without executable bits
+
+    """
+    if stat.S_ISLNK(mode):
+        kind = 'symlink'
+    elif stat.S_ISDIR(mode):
+        kind = 'directory'
+    elif mode & 0o111:
+        # at least one of the user/group/other executable bits is set
+        kind = 'executable_content'
+    else:
+        kind = 'content'
+
+    return DENTRY_PERMS[kind]
+
+
+def data_to_content(*, filename, mode, data):
+    """Build a Software Heritage content entry out of raw :class:`bytes`.
+
+    Args:
+        filename (bytes): the pathname of the file the data comes from; only
+            its base name is kept
+        mode (int): a file mode (passed to :func:`mode_to_perms`)
+        data (bytes): raw contents of the file
+
+    Returns:
+        dict: the result of :func:`hashutil.hash_data` on `data`, extended
+        with the following keys:
+            :length (int): the length of the data passed in
+            :perms (int): the permissions of the file (used for directory
+                entries)
+            :name (bytes): the base name of the file
+            :type (str): hardcoded to 'file' (used for directory entries)
+            :data (bytes): the raw contents passed in as the `data` argument
+    """
+    content = hashutil.hash_data(data)
+    content.update({
+        'length': len(data),
+        'perms': mode_to_perms(mode),
+        'name': os.path.basename(filename),
+        'type': 'file',
+        'data': data,
+    })
+
+    return content
+
+
+def symlink_to_content(*, filename, mode):
+    """Build the Software Heritage content entry for a symbolic link.
+
+    The data of the content is the link target, as returned by
+    :func:`os.readlink`; the link itself is never followed.
+
+    Args:
+        filename (bytes): path of the symbolic link
+        mode (int): file mode of the link (passed to :func:`mode_to_perms`)
+
+    Returns:
+        dict: the content entry built by :func:`data_to_content`
+    """
+    target = os.readlink(filename)
+    return data_to_content(filename=filename, mode=mode, data=target)
+
+
+def file_to_content(*, filename, data=False):
+    """Build the Software Heritage content entry for a file found on disk.
+
+    The file is inspected with :func:`os.lstat`, so symbolic links are never
+    followed:
+
+    - a symbolic link yields a content whose data is the link target;
+    - a regular file is hashed from disk;
+    - anything else (fifo, device, ...) yields the empty content.
+
+    The returned dictionary is useful both for loading the content in the
+    archive (hashes, `length`) and for building a directory entry.
+
+    Args:
+        filename (bytes): path of the file to convert
+        data (bool): for regular files, whether to read the file contents
+            and store them under the `data` key
+
+    Returns:
+        dict: A dictionary with the following keys:
+            :sha1, sha1_git, ... (bytes): one key per hash algorithm
+                (defined in :const:`swh.model.hashutil.HASH_ALGORITHMS`)
+            :length (int): the size of the content, in bytes
+            :type (str): hardcoded to 'file' (used for directory entries)
+            :perms (int): the permissions of the file (used for directory
+                entries)
+            :name (bytes): the base name of the file
+            :data (bytes): the raw contents. For regular files it is only
+                present when the `data` argument is true; symbolic links
+                and special files always carry it.
+    """
+    st = os.lstat(filename)
+    mode = st.st_mode
+
+    # Symbolic link: the content is the link target
+    if stat.S_ISLNK(mode):
+        return symlink_to_content(filename=filename, mode=mode)
+
+    # Not a regular file either: record it as the empty file
+    if not stat.S_ISREG(mode):
+        return data_to_content(filename=filename, mode=mode, data=b'')
+
+    size = st.st_size
+
+    if data:
+        # Collect the chunks as they are hashed, so the file is read once
+        chunks = []
+        with open(filename, 'rb') as fobj:
+            content = hashutil.hash_file(fobj, length=size,
+                                         chunk_cb=chunks.append)
+        content['data'] = b''.join(chunks)
+    else:
+        content = hashutil.hash_path(filename)
+
+    content['type'] = 'file'
+    content['perms'] = mode_to_perms(mode)
+    content['length'] = size
+    content['name'] = os.path.basename(filename)
+
+    return content
+
+
+def accept_all_directories(dirname, entries):
+    """Directory filter that keeps every directory.
+
+    This is the default filter of :func:`directory_to_objects`; both
+    arguments are ignored.
+
+    Args:
+        dirname (bytes): directory name
+        entries (list): directory entries
+    Returns:
+        True, unconditionally
+    """
+    return True
+
+
+def ignore_empty_directories(dirname, entries):
+    """Directory filter for :func:`directory_to_objects` that drops
+    directories without any entry.
+
+    Args:
+        dirname (bytes): directory name (unused)
+        entries (list): directory entries
+    Returns:
+        True if the directory has at least one entry, False if it is empty
+    """
+    return True if entries else False
+
+
+def ignore_named_directories(names, *, case_sensitive=True):
+    """Filter for :func:`directory_to_objects` to ignore directories named
+    one of names.
+
+    Only the last path component of the directory is compared against
+    `names`: the filter may be handed a full path
+    (:func:`directory_to_objects` does so), and comparing the whole path
+    would never match a plain name.
+
+    Args:
+        names (list of bytes): names to ignore
+        case_sensitive (bool): whether to do the filtering in a case
+            sensitive way
+    Returns:
+        a directory filter for :func:`directory_to_objects`, returning False
+        for the directories to ignore and True for all others
+    """
+    if not case_sensitive:
+        names = [name.lower() for name in names]
+
+    # Frozen once here: constant-time lookups, and `names` may be any
+    # iterable without being consumed by the first call of the filter
+    ignored = frozenset(names)
+
+    def named_filter(dirname, entries):
+        basename = os.path.basename(dirname)
+        if not case_sensitive:
+            basename = basename.lower()
+        return basename not in ignored
+
+    return named_filter
+
+
+def directory_to_objects(*, directory, data=False,
+                         dir_filter=accept_all_directories):
+    """Compute the Software Heritage objects for a given directory tree
+
+    The tree is walked bottom-up, so that the identifier of every
+    subdirectory is known when its parent is processed. Symbolic links
+    (including links to directories) and special files are converted to
+    contents by :func:`file_to_content`.
+
+    Args:
+        directory (bytes): the directory name to traverse
+        data (bool): whether to add the data to the content objects
+        dir_filter (function): a filter to ignore some directories by
+            name or contents. Takes two arguments: dirname (the path of the
+            directory, `directory` included) and entries, and returns True
+            if the directory should be added, False if the directory should
+            be ignored. It is never called for `directory` itself.
+
+    Returns:
+        a dictionary with two keys: 'directory' and 'content', each
+        containing a list of objects ready to be consumed by the Software
+        Heritage database. Ignored directories, and every content or
+        directory below them, are left out of both lists.
+    """
+    def content_to_directory_entry(content):
+        return {
+            'type': content['type'],
+            'perms': content['perms'],
+            'target': content['sha1_git'],
+            'name': content['name'],
+        }
+
+    def directory_to_directory_entry(subdir):
+        return {
+            'type': 'dir',
+            'perms': DENTRY_PERMS['directory'],
+            'target': subdir['id'],
+            'name': subdir['name']
+        }
+
+    dirs = {}
+    files = {}
+    pruned = []
+    for root, dentries, fentries in os.walk(directory, topdown=False):
+        entries = []
+        # Join fentries and dentries in the same processing, as symbolic
+        # links to directories appear in dentries...
+        for name in fentries + dentries:
+            path = os.path.join(root, name)
+            if not os.path.isdir(path) or os.path.islink(path):
+                content = file_to_content(filename=path, data=data)
+                files[path] = content
+                entries.append(content_to_directory_entry(content))
+                continue
+
+            # Bottom-up walk: the subdirectory has already been computed
+            subdir = dirs[path]
+            if dir_filter(path, subdir['entries']):
+                entries.append(directory_to_directory_entry(subdir))
+            else:
+                pruned.append(path)
+
+        current = {'entries': entries, 'name': os.path.basename(root)}
+        current['id'] = id_to_bytes(directory_identifier(current))
+        dirs[root] = current
+
+    if pruned:
+        # Drop the ignored directories and everything below them, so that
+        # they do not leak to the caller. `path[:0]` is the empty bytes/str
+        # matching the type of the path: joining it appends the separator.
+        pruned_paths = set(pruned)
+        pruned_prefixes = tuple(
+            os.path.join(path, path[:0]) for path in pruned
+        )
+
+        def is_kept(path):
+            return (path not in pruned_paths
+                    and not path.startswith(pruned_prefixes))
+
+        dirs = {path: obj for path, obj in dirs.items() if is_kept(path)}
+        files = {path: obj for path, obj in files.items() if is_kept(path)}
+
+    return {
+        'content': list(files.values()),
+        'directory': list(dirs.values()),
+    }
diff --git a/swh/model/tests/test_from_disk.py b/swh/model/tests/test_from_disk.py
new file mode 100644
--- /dev/null
+++ b/swh/model/tests/test_from_disk.py
@@ -0,0 +1,274 @@
+# Copyright (C) 2017 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+import tempfile
+import unittest
+
+from swh.model import from_disk
+from swh.model.hashutil import hash_to_bytes
+
+
+class ModeToPerms(unittest.TestCase):
+    """Check :func:`from_disk.mode_to_perms` against every file type and
+    every combination of the twelve permission bits."""
+
+    def setUp(self):
+        super().setUp()
+
+        # Expected permissions for each possible file mode
+        perms_map = {}
+
+        # Symlinks and directories map to their bare file type, whatever
+        # the permission bits are
+        for file_type in (0o120000, 0o040000):
+            for fmode in range(file_type, file_type + 0o7777 + 1):
+                perms_map[fmode] = file_type
+
+        # Other file types: socket, regular file, block device, character
+        # device, fifo all map to regular files, executable or not
+        for file_type in (0o140000, 0o100000, 0o060000, 0o020000, 0o010000):
+            for fmode in range(file_type, file_type + 0o7777 + 1):
+                perms_map[fmode] = 0o100755 if fmode & 0o111 else 0o100644
+
+        self.perms_map = perms_map
+
+    def test_exhaustive_mode_to_perms(self):
+        for fmode, expected in self.perms_map.items():
+            self.assertEqual(expected, from_disk.mode_to_perms(fmode))
+
+
+class DataMixIn:
+    """Fixtures shared by the from_disk test cases.
+
+    `setUp` creates a temporary directory (with a bytes name) and the
+    expected content dictionaries; the `make_*` helpers materialize the
+    fixtures on disk inside a given directory.
+    """
+    maxDiff = None
+
+    def setUp(self):
+        # A bytes prefix makes `self.tmpdir.name` a bytes path
+        self.tmpdir = tempfile.TemporaryDirectory(
+            prefix=b'swh.model.from_disk'
+        )
+
+        # Regular files: name -> expected content, plus the 'mode' to
+        # chmod the file to
+        self.contents = {
+            b'file': {
+                'data': b'42\n',
+                'sha1': hash_to_bytes(
+                    '34973274ccef6ab4dfaaf86599792fa9c3fe4689'
+                ),
+                'sha256': hash_to_bytes(
+                    '084c799cd551dd1d8d5c5f9a5d593b2e'
+                    '931f5e36122ee5c793c1d08a19839cc0'
+                ),
+                'sha1_git': hash_to_bytes(
+                    'd81cc0710eb6cf9efd5b920a8453e1e07157b6cd'),
+                'blake2s256': hash_to_bytes(
+                    'd5fe1939576527e42cfd76a9455a2432'
+                    'fe7f56669564577dd93c4280e76d661d'
+                ),
+                'length': 3,
+                'mode': 0o100644
+            },
+        }
+
+        # Symbolic links: name -> expected content ('data' is the target)
+        self.symlinks = {
+            b'symlink': {
+                'data': b'target',
+                'blake2s256': hash_to_bytes(
+                    '595d221b30fdd8e10e2fdf18376e688e'
+                    '9f18d56fd9b6d1eb6a822f8c146c6da6'
+                ),
+                'sha1': hash_to_bytes(
+                    '0e8a3ad980ec179856012b7eecf4327e99cd44cd'
+                ),
+                'sha1_git': hash_to_bytes(
+                    '1de565933b05f74c75ff9a6520af5f9f8a5a2f1d'
+                ),
+                'sha256': hash_to_bytes(
+                    '34a04005bcaf206eec990bd9637d9fdb'
+                    '6725e0a0c0d4aebf003f17f4c956eb5c'
+                ),
+                'length': 6,
+            }
+        }
+
+        # Special files: name -> function creating the file at a path.
+        # NOTE(review): os.mknod is called without a `mode`, so 'devnull'
+        # is presumably created as a regular file rather than as a
+        # character device -- confirm this is intended.
+        self.specials = {
+            b'fifo': os.mkfifo,
+            b'devnull': lambda path: os.mknod(path, device=os.makedev(1, 3)),
+        }
+
+        # Expected content for the empty file
+        self.empty_content = {
+            'data': b'',
+            'length': 0,
+            'blake2s256': hash_to_bytes(
+                '69217a3079908094e11121d042354a7c'
+                '1f55b6482ca1a51e1b250dfd1ed0eef9'
+            ),
+            'sha1': hash_to_bytes(
+                'da39a3ee5e6b4b0d3255bfef95601890afd80709'
+            ),
+            'sha1_git': hash_to_bytes(
+                'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391'
+            ),
+            'sha256': hash_to_bytes(
+                'e3b0c44298fc1c149afbf4c8996fb924'
+                '27ae41e4649b934ca495991b7852b855'
+            ),
+        }
+
+    def tearDown(self):
+        self.tmpdir.cleanup()
+
+    def make_contents(self, directory):
+        """Write every regular file fixture in `directory`."""
+        for filename, content in self.contents.items():
+            path = os.path.join(directory, filename)
+            with open(path, 'wb') as f:
+                f.write(content['data'])
+            os.chmod(path, content['mode'])
+
+    def make_symlinks(self, directory):
+        """Create every symbolic link fixture in `directory`."""
+        for filename, symlink in self.symlinks.items():
+            path = os.path.join(directory, filename)
+            os.symlink(symlink['data'], path)
+
+    def make_specials(self, directory):
+        """Create every special file fixture in `directory`."""
+        for filename, fn in self.specials.items():
+            path = os.path.join(directory, filename)
+            fn(path)
+
+
+class DataToContent(DataMixIn, unittest.TestCase):
+    """Check :func:`from_disk.data_to_content` on the regular file
+    fixtures, without touching the disk."""
+
+    def test_data_to_content(self):
+        for filename, content in self.contents.items():
+            # Work on a copy: 'mode' is not part of the converted content,
+            # and the shared fixture must stay untouched
+            content = content.copy()
+            mode = content.pop('mode')
+            conv_content = from_disk.data_to_content(
+                filename=filename, mode=mode, data=content['data'])
+            self.assertEqual(conv_content.pop('name'), filename)
+            self.assertEqual(conv_content.pop('perms'),
+                             from_disk.mode_to_perms(mode))
+            self.assertEqual(conv_content.pop('type'), 'file')
+            self.assertEqual(conv_content, content)
+
+
+class SymlinkToContent(DataMixIn, unittest.TestCase):
+    """Check :func:`from_disk.symlink_to_content` on symbolic links
+    created in the temporary directory."""
+
+    def setUp(self):
+        super().setUp()
+        self.make_symlinks(self.tmpdir.name)
+
+    def test_symlink_to_content(self):
+        symlink_perms = 0o120000
+        for filename, expected in self.symlinks.items():
+            link_path = os.path.join(self.tmpdir.name, filename)
+            converted = from_disk.symlink_to_content(filename=link_path,
+                                                     mode=symlink_perms)
+            self.assertEqual(converted.pop('name'), filename)
+            self.assertEqual(converted.pop('perms'), symlink_perms)
+            self.assertEqual(converted.pop('type'), 'file')
+            # What is left is the hashes, the length and the data
+            self.assertEqual(converted, expected)
+
+
+class FileToContent(DataMixIn, unittest.TestCase):
+    """Check :func:`from_disk.file_to_content` on regular files, symbolic
+    links and special files, with and without loading the data."""
+
+    def setUp(self):
+        super().setUp()
+        self.make_contents(self.tmpdir.name)
+        self.make_symlinks(self.tmpdir.name)
+        self.make_specials(self.tmpdir.name)
+
+    def _check_conversion(self, filename, data, perms, expected):
+        """Convert `filename` (relative to the temporary directory) and
+        compare the result with the `expected` content."""
+        path = os.path.join(self.tmpdir.name, filename)
+        converted = from_disk.file_to_content(filename=path, data=data)
+        self.assertEqual(converted.pop('name'), filename)
+        self.assertEqual(converted.pop('perms'), perms)
+        self.assertEqual(converted.pop('type'), 'file')
+        if not data:
+            # The fixtures always carry the data: align the conversion
+            converted['data'] = expected['data']
+        self.assertEqual(converted, expected)
+
+    def test_file_to_content(self):
+        for data in [False, True]:
+            for filename, symlink in self.symlinks.items():
+                self._check_conversion(filename, data, 0o120000, symlink)
+
+            for filename, content in self.contents.items():
+                expected = content.copy()
+                if expected.pop('mode') & 0o111:
+                    perms = 0o100755
+                else:
+                    perms = 0o100644
+                self._check_conversion(filename, data, perms, expected)
+
+            # Special files are converted to the empty, non-executable file
+            for filename in self.specials:
+                self._check_conversion(filename, data, 0o100644,
+                                       self.empty_content)
+
+
+class DirectoryToObjects(DataMixIn, unittest.TestCase):
+    """Check :func:`from_disk.directory_to_objects` on a small tree.
+
+    The tree is made of six directories: the temporary directory itself,
+    `contents`, `symlinks` and `specials` (one per kind of fixture), and
+    the nested empty directories `empty1/empty2`.
+    """
+
+    def setUp(self):
+        super().setUp()
+        fixtures = [
+            (b'contents', self.make_contents),
+            (b'symlinks', self.make_symlinks),
+            (b'specials', self.make_specials),
+        ]
+        for dirname, populate in fixtures:
+            path = os.path.join(self.tmpdir.name, dirname)
+            os.mkdir(path)
+            populate(path)
+
+        empties = os.path.join(self.tmpdir.name, b'empty1', b'empty2')
+        os.makedirs(empties)
+
+    def test_directory_to_objects(self):
+        objs = from_disk.directory_to_objects(directory=self.tmpdir.name)
+        self.assertIn('content', objs)
+        self.assertIn('directory', objs)
+
+        self.assertEqual(len(objs['directory']), 6)
+        self.assertEqual(len(objs['content']),
+                         len(self.contents)
+                         + len(self.symlinks)
+                         + len(self.specials))
+
+    def test_directory_to_objects_ignore_empty(self):
+        objs = from_disk.directory_to_objects(
+            directory=self.tmpdir.name,
+            dir_filter=from_disk.ignore_empty_directories
+        )
+
+        self.assertIn('content', objs)
+        self.assertIn('directory', objs)
+
+        # empty2 is empty, which in turn leaves empty1 without entries
+        self.assertEqual(len(objs['directory']), 4)
+        self.assertEqual(len(objs['content']),
+                         len(self.contents)
+                         + len(self.symlinks)
+                         + len(self.specials))
+
+    def test_directory_to_objects_ignore_name(self):
+        # The tree is walked with bytes paths and the directory holding
+        # the symbolic links is named b'symlinks'
+        objs = from_disk.directory_to_objects(
+            directory=self.tmpdir.name,
+            dir_filter=from_disk.ignore_named_directories([b'symlinks'])
+        )
+
+        self.assertIn('content', objs)
+        self.assertIn('directory', objs)
+
+        self.assertEqual(len(objs['directory']), 5)
+        self.assertEqual(len(objs['content']),
+                         len(self.contents)
+                         + len(self.specials))
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Jul 3, 11:55 AM (3 w, 3 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3222769
Attached To
D248: from_disk: convert on-disk data to Software Heritage archive objects
Event Timeline
Log In to Comment