diff --git a/PKG-INFO b/PKG-INFO index c9b5ff8..a630e6b 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.core -Version: 0.0.35 +Version: 0.0.36 Summary: Software Heritage core utilities Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/docs/index.rst b/docs/index.rst index 8b64117..1954db2 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,15 +1,17 @@ +.. _swh-core: + Software Heritage - Development Documentation ============================================= .. toctree:: :maxdepth: 2 :caption: Contents: Indices and tables ================== * :ref:`genindex` * :ref:`modindex` * :ref:`search` diff --git a/swh.core.egg-info/PKG-INFO b/swh.core.egg-info/PKG-INFO index c9b5ff8..a630e6b 100644 --- a/swh.core.egg-info/PKG-INFO +++ b/swh.core.egg-info/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.core -Version: 0.0.35 +Version: 0.0.36 Summary: Software Heritage core utilities Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/swh.core.egg-info/SOURCES.txt b/swh.core.egg-info/SOURCES.txt index 73c6b68..fd3b0f2 100644 --- a/swh.core.egg-info/SOURCES.txt +++ b/swh.core.egg-info/SOURCES.txt @@ -1,43 +1,44 @@ .gitignore AUTHORS LICENSE MANIFEST.in Makefile README.md requirements-swh.txt requirements.txt setup.py version.txt bin/swh-hashdir bin/swh-hashfile debian/changelog debian/compat debian/control debian/copyright debian/rules debian/source/format docs/.gitignore docs/Makefile docs/conf.py docs/index.rst docs/_static/.placeholder docs/_templates/.placeholder sql/log-schema.sql swh/__init__.py swh.core.egg-info/PKG-INFO swh.core.egg-info/SOURCES.txt swh.core.egg-info/dependency_links.txt swh.core.egg-info/requires.txt swh.core.egg-info/top_level.txt swh/core/__init__.py swh/core/api.py swh/core/api_async.py swh/core/config.py swh/core/logger.py swh/core/serializers.py +swh/core/tarball.py swh/core/utils.py swh/core/tests/db_testing.py swh/core/tests/test_config.py swh/core/tests/test_logger.py swh/core/tests/test_serializers.py swh/core/tests/test_utils.py \ No newline at end of file diff --git a/swh/core/tarball.py b/swh/core/tarball.py new file mode 100644 index 0000000..69d56f4 --- /dev/null +++ b/swh/core/tarball.py @@ -0,0 +1,227 @@ +# Copyright (C) 2015-2017 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import os +import stat +import tarfile +import zipfile + +from os.path import abspath, realpath, join, dirname +from . import utils + + +def _canonical_abspath(path): + """Resolve all paths to an absolute and real one. + + Args: + path: to resolve + + Returns: + canonical absolute path to path + + """ + return realpath(abspath(path)) + + +def _badpath(path, basepath): + """Determine if a path is outside basepath. + + Args: + path: a relative or absolute path of a file or directory + basepath: the basepath path must be in + + Returns: + True if path is outside basepath, false otherwise. + + """ + return not _canonical_abspath(join(basepath, path)).startswith(basepath) + + +def _badlink(info, basepath): + """Determine if the tarinfo member is outside basepath. + + Args: + info: TarInfo member representing a symlink or hardlink of tar archive + basepath: the basepath the info member must be in + + Returns: + True if info is outside basepath, false otherwise. + + """ + tippath = _canonical_abspath(join(basepath, dirname(info.name))) + return _badpath(info.linkname, basepath=tippath) + + +def is_tarball(filepath): + """Given a filepath, determine if it represents an archive. + + Args: + filepath: file to test for tarball property + + Returns: + Bool, True if it's a tarball, False otherwise + + """ + return tarfile.is_tarfile(filepath) or zipfile.is_zipfile(filepath) + + +def _uncompress_zip(tarpath, dirpath): + """Uncompress zip archive safely. + + As per zipfile is concerned + (cf. note on https://docs.python.org/3.5/library/zipfile.html#zipfile.ZipFile.extract) # noqa + + Args: + tarpath: path to the archive + dirpath: directory to uncompress the archive to + + """ + with zipfile.ZipFile(tarpath) as z: + z.extractall(path=dirpath) + + +def _uncompress_tar(tarpath, dirpath): + """Uncompress tarpath if the tarpath is safe. + Safe means, no file will be uncompressed outside of dirpath. + + Args: + tarpath: path to the archive + dirpath: directory to uncompress the archive to + + Raises: + ValueError when a member would be extracted outside dirpath. + + """ + def safemembers(tarpath, members, basepath): + """Given a list of archive members, yield the members (directory, + file, hard-link) that stays in bounds with basepath. Note + that symbolic link are authorized to point outside the + basepath though. + + Args: + tarpath: Name of the tarball + members: Archive members for such tarball + basepath: the basepath sandbox + + Yields: + Safe TarInfo member + + Raises: + ValueError when a member would be extracted outside basepath + + """ + errormsg = 'Archive {} blocked. Illegal path to %s %s'.format(tarpath) + + for finfo in members: + if finfo.isdir() and _badpath(finfo.name, basepath): + raise ValueError(errormsg % ('directory', finfo.name)) + elif finfo.isfile() and _badpath(finfo.name, basepath): + raise ValueError(errormsg % ('file', finfo.name)) + elif finfo.islnk() and _badlink(finfo, basepath): + raise ValueError(errormsg % ('hard-link', finfo.linkname)) + # Authorize symlinks to point outside basepath + # elif finfo.issym() and _badlink(finfo, basepath): + # raise ValueError(errormsg % ('symlink', finfo.linkname)) + else: + yield finfo + + with tarfile.open(tarpath) as t: + members = t.getmembers() + t.extractall(path=dirpath, + members=safemembers(tarpath, members, dirpath)) + + +def uncompress(tarpath, dest): + """Uncompress tarpath to dest folder if tarball is supported and safe. + Safe means, no file will be uncompressed outside of dirpath. + + Note that this fixes permissions after successfully + uncompressing the archive. + + Args: + tarpath: path to tarball to uncompress + dest: the destination folder where to uncompress the tarball + + Returns: + The nature of the tarball, zip or tar. + + Raises: + ValueError when: + - an archive member would be extracted outside basepath + - the archive is not supported + + """ + if tarfile.is_tarfile(tarpath): + _uncompress_tar(tarpath, dest) + nature = 'tar' + elif zipfile.is_zipfile(tarpath): + _uncompress_zip(tarpath, dest) + nature = 'zip' + else: + raise ValueError('File %s is not a supported archive.' % tarpath) + + # Fix permissions + for dirpath, _, fnames in os.walk(dest): + os.chmod(dirpath, 0o755) + for fname in fnames: + fpath = os.path.join(dirpath, fname) + if not os.path.islink(fpath): + fpath_exec = os.stat(fpath).st_mode & stat.S_IXUSR + if not fpath_exec: + os.chmod(fpath, 0o644) + + return nature + + +def _ls(rootdir): + """Generator of filepath, filename from rootdir. + + """ + for dirpath, dirnames, fnames in os.walk(rootdir): + for fname in (dirnames+fnames): + fpath = os.path.join(dirpath, fname) + fname = utils.commonname(rootdir, fpath) + yield fpath, fname + + +def _compress_zip(tarpath, files): + """Compress dirpath's content as tarpath. + + """ + with zipfile.ZipFile(tarpath, 'w') as z: + for fpath, fname in files: + z.write(fpath, arcname=fname) + + +def _compress_tar(tarpath, files): + """Compress dirpath's content as tarpath. + + """ + with tarfile.open(tarpath, 'w:bz2') as t: + for fpath, fname in files: + t.add(fpath, arcname=fname, recursive=False) + + +def compress(tarpath, nature, dirpath_or_files): + """Create a tarball tarpath with nature nature. + The content of the tarball is either dirpath's content (if representing + a directory path) or dirpath's iterable contents. + + Compress the directory dirpath's content to a tarball. + The tarball being dumped at tarpath. + The nature of the tarball is determined by the nature argument. + + """ + if isinstance(dirpath_or_files, str): + files = _ls(dirpath_or_files) + else: # iterable of 'filepath, filename' + files = dirpath_or_files + + if nature == 'zip': + _compress_zip(tarpath, files) + else: + _compress_tar(tarpath, files) + + return tarpath diff --git a/swh/core/tests/test_utils.py b/swh/core/tests/test_utils.py index 931647e..85403c4 100644 --- a/swh/core/tests/test_utils.py +++ b/swh/core/tests/test_utils.py @@ -1,110 +1,124 @@ -# Copyright (C) 2015-2016 The Software Heritage developers +# Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest from nose.tools import istest from swh.core import utils class UtilsLib(unittest.TestCase): @istest def grouper(self): # given actual_data = utils.grouper((i for i in range(0, 9)), 2) out = [] for d in actual_data: out.append(list(d)) # force generator resolution for checks self.assertEqual(out, [[0, 1], [2, 3], [4, 5], [6, 7], [8]]) # given actual_data = utils.grouper((i for i in range(9, 0, -1)), 4) out = [] for d in actual_data: out.append(list(d)) # force generator resolution for checks self.assertEqual(out, [[9, 8, 7, 6], [5, 4, 3, 2], [1]]) @istest def backslashescape_errors(self): raw_data_err = b'abcd\x80' with self.assertRaises(UnicodeDecodeError): raw_data_err.decode('utf-8', 'strict') self.assertEquals( raw_data_err.decode('utf-8', 'backslashescape'), 'abcd\\x80', ) raw_data_ok = b'abcd\xc3\xa9' self.assertEquals( raw_data_ok.decode('utf-8', 'backslashescape'), raw_data_ok.decode('utf-8', 'strict'), ) unicode_data = 'abcdef\u00a3' self.assertEquals( unicode_data.encode('ascii', 'backslashescape'), b'abcdef\\xa3', ) @istest def encode_with_unescape(self): valid_data = '\\x01020304\\x00' valid_data_encoded = b'\x01020304\x00' self.assertEquals( valid_data_encoded, utils.encode_with_unescape(valid_data) ) @istest def encode_with_unescape_invalid_escape(self): invalid_data = 'test\\abcd' with self.assertRaises(ValueError) as exc: utils.encode_with_unescape(invalid_data) self.assertIn('invalid escape', exc.exception.args[0]) self.assertIn('position 4', exc.exception.args[0]) @istest def decode_with_escape(self): backslashes = b'foo\\bar\\\\baz' backslashes_escaped = 'foo\\\\bar\\\\\\\\baz' self.assertEquals( backslashes_escaped, utils.decode_with_escape(backslashes), ) valid_utf8 = b'foo\xc3\xa2' valid_utf8_escaped = 'foo\u00e2' self.assertEquals( valid_utf8_escaped, utils.decode_with_escape(valid_utf8), ) invalid_utf8 = b'foo\xa2' invalid_utf8_escaped = 'foo\\xa2' self.assertEquals( invalid_utf8_escaped, utils.decode_with_escape(invalid_utf8), ) valid_utf8_nul = b'foo\xc3\xa2\x00' valid_utf8_nul_escaped = 'foo\u00e2\\x00' self.assertEquals( valid_utf8_nul_escaped, utils.decode_with_escape(valid_utf8_nul), ) + + @istest + def commonname(self): + # when + actual_commonname = utils.commonname('/some/where/to/', + '/some/where/to/go/to') + # then + self.assertEquals('go/to', actual_commonname) + + # when + actual_commonname2 = utils.commonname(b'/some/where/to/', + b'/some/where/to/go/to') + # then + self.assertEquals(b'go/to', actual_commonname2) diff --git a/swh/core/utils.py b/swh/core/utils.py index a4921d0..0e748a1 100644 --- a/swh/core/utils.py +++ b/swh/core/utils.py @@ -1,96 +1,103 @@ -# Copyright (C) 2016 The Software Heritage developers +# Copyright (C) 2016-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import itertools import codecs from contextlib import contextmanager @contextmanager def cwd(path): """Contextually change the working directory to do thy bidding. Then gets back to the original location. """ prev_cwd = os.getcwd() os.chdir(path) try: yield finally: os.chdir(prev_cwd) def grouper(iterable, n): """Collect data into fixed-length chunks or blocks. Args: iterable: an iterable n: size of block fillvalue: value to use for the last block Returns: fixed-length chunks of blocks as iterables """ args = [iter(iterable)] * n for _data in itertools.zip_longest(*args, fillvalue=None): yield (d for d in _data if d is not None) def backslashescape_errors(exception): if isinstance(exception, UnicodeDecodeError): bad_data = exception.object[exception.start:exception.end] escaped = ''.join(r'\x%02x' % x for x in bad_data) return escaped, exception.end return codecs.backslashreplace_errors(exception) codecs.register_error('backslashescape', backslashescape_errors) def encode_with_unescape(value): """Encode an unicode string containing \\x backslash escapes""" slices = [] start = 0 odd_backslashes = False i = 0 while i < len(value): if value[i] == '\\': odd_backslashes = not odd_backslashes else: if odd_backslashes: if value[i] != 'x': raise ValueError('invalid escape for %r at position %d' % (value, i-1)) slices.append( value[start:i-1].replace('\\\\', '\\').encode('utf-8') ) slices.append(bytes.fromhex(value[i+1:i+3])) odd_backslashes = False start = i = i + 3 continue i += 1 slices.append( value[start:i].replace('\\\\', '\\').encode('utf-8') ) return b''.join(slices) def decode_with_escape(value): """Decode a bytestring as utf-8, escaping the bytes of invalid utf-8 sequences as \\x. We also escape NUL bytes as they are invalid in JSON strings. """ # escape backslashes value = value.replace(b'\\', b'\\\\') value = value.replace(b'\x00', b'\\x00') return value.decode('utf-8', 'backslashescape') + + +def commonname(path0, path1, as_str=False): + """Compute the commonname between the path0 and path1. + + """ + return path1.split(path0)[1] diff --git a/version.txt b/version.txt index 73d8b02..64c0eae 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.35-0-g0dab089 \ No newline at end of file +v0.0.36-0-g141d1a3 \ No newline at end of file