diff --git a/PKG-INFO b/PKG-INFO index c9b5ff8..a630e6b 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.core -Version: 0.0.35 +Version: 0.0.36 Summary: Software Heritage core utilities Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/debian/changelog b/debian/changelog index 4ff49de..6d8d7e9 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,245 +1,246 @@ -swh-core (0.0.35-1~swh1~bpo9+1) stretch-swh; urgency=medium +swh-core (0.0.36-1~swh1) unstable-swh; urgency=medium - * Rebuild for stretch-backports. + * v0.0.36 + * Migrate swh.loader.tar.tarball module in swh.core - -- Nicolas Dandrimont Thu, 12 Oct 2017 18:07:50 +0200 + -- Antoine R. Dumont (@ardumont) Wed, 06 Dec 2017 12:03:29 +0100 swh-core (0.0.35-1~swh1) unstable-swh; urgency=medium * Release swh.core version 0.0.35 * Update packaging runes -- Nicolas Dandrimont Thu, 12 Oct 2017 18:07:50 +0200 swh-core (0.0.34-1~swh1) unstable-swh; urgency=medium * Release swh.core v0.0.34 * New modular database test fixture -- Nicolas Dandrimont Mon, 07 Aug 2017 18:29:48 +0200 swh-core (0.0.33-1~swh1) unstable-swh; urgency=medium * Release swh.core v0.0.33 * Be more conservative with remote API responses -- Nicolas Dandrimont Mon, 19 Jun 2017 19:01:38 +0200 swh-core (0.0.32-1~swh1) unstable-swh; urgency=medium * Release swh-core v0.0.32 * Add asynchronous streaming methods for internal APIs * Remove task arguments from systemd-journal loggers -- Nicolas Dandrimont Tue, 09 May 2017 14:04:22 +0200 swh-core (0.0.31-1~swh1) unstable-swh; urgency=medium * Release swh.core v0.0.31 * Add explicit dependency on python3-systemd -- Nicolas Dandrimont Fri, 07 Apr 2017 15:11:26 +0200 swh-core (0.0.30-1~swh1) unstable-swh; urgency=medium * Release swh.core v0.0.30 * drop swh.core.hashutil (moved to swh.model.hashutil) * add a systemd logger -- Nicolas Dandrimont Fri, 07 Apr 2017 11:49:15 +0200 swh-core (0.0.29-1~swh1) unstable-swh; urgency=medium * Release swh.core v0.0.29 * Catch proper exception in the base API client -- Nicolas Dandrimont Thu, 02 Feb 2017 00:19:25 +0100 swh-core (0.0.28-1~swh1) unstable-swh; urgency=medium * v0.0.28 * Refactoring some common code into swh.core -- Antoine R. Dumont (@ardumont) Thu, 26 Jan 2017 14:54:22 +0100 swh-core (0.0.27-1~swh1) unstable-swh; urgency=medium * v0.0.27 * Fix issue with default boolean value -- Antoine R. Dumont (@ardumont) Thu, 20 Oct 2016 16:15:20 +0200 swh-core (0.0.26-1~swh1) unstable-swh; urgency=medium * Release swh.core v0.0.26 * Raise an exception when a configuration file exists and is unreadable -- Nicolas Dandrimont Wed, 12 Oct 2016 10:16:09 +0200 swh-core (0.0.25-1~swh1) unstable-swh; urgency=medium * v0.0.25 * Add new function utils.cwd -- Antoine R. Dumont (@ardumont) Thu, 29 Sep 2016 21:29:37 +0200 swh-core (0.0.24-1~swh1) unstable-swh; urgency=medium * v0.0.24 * Deal with edge case in logger regarding json -- Antoine R. Dumont (@ardumont) Thu, 22 Sep 2016 12:21:09 +0200 swh-core (0.0.23-1~swh1) unstable-swh; urgency=medium * Release swh.core v0.0.23 * Properly fix the PyYAML dependency -- Nicolas Dandrimont Tue, 23 Aug 2016 16:20:29 +0200 swh-core (0.0.22-1~swh1) unstable-swh; urgency=medium * Release swh.core v0.0.22 * Proper loading of yaml and ini files in all paths -- Nicolas Dandrimont Fri, 19 Aug 2016 15:45:55 +0200 swh-core (0.0.21-1~swh1) unstable-swh; urgency=medium * v0.0.21 * Update test tools -- Antoine R. Dumont (@ardumont) Tue, 19 Jul 2016 14:47:01 +0200 swh-core (0.0.20-1~swh1) unstable-swh; urgency=medium * Release swh.core v0.0.20 * Add some generic bytes <-> escaped unicode methods -- Nicolas Dandrimont Tue, 14 Jun 2016 16:54:41 +0200 swh-core (0.0.19-1~swh1) unstable-swh; urgency=medium * v0.0.19 * Resurrect swh.core.utils -- Antoine R. Dumont (@ardumont) Fri, 15 Apr 2016 12:40:43 +0200 swh-core (0.0.18-1~swh1) unstable-swh; urgency=medium * v0.0.18 * Add swh.core.utils * serializers: support UUIDs all around -- Antoine R. Dumont (@ardumont) Sat, 26 Mar 2016 11:16:33 +0100 swh-core (0.0.17-1~swh1) unstable-swh; urgency=medium * Release swh.core v0.0.17 * Allow serialization of UUIDs -- Nicolas Dandrimont Fri, 04 Mar 2016 11:40:56 +0100 swh-core (0.0.16-1~swh1) unstable-swh; urgency=medium * Release swh.core version 0.0.16 * add bytehex_to_hash and hash_to_bytehex in hashutil * move scheduling utilities to swh.scheduler -- Nicolas Dandrimont Fri, 19 Feb 2016 18:12:10 +0100 swh-core (0.0.15-1~swh1) unstable-swh; urgency=medium * Release v0.0.15 * Add hashutil.hash_git_object -- Nicolas Dandrimont Wed, 16 Dec 2015 16:31:26 +0100 swh-core (0.0.14-1~swh1) unstable-swh; urgency=medium * v0.0.14 * Add simple README * Update license * swh.core.hashutil.hashfile can now deal with filepath as bytes -- Antoine R. Dumont (@ardumont) Fri, 23 Oct 2015 11:13:14 +0200 swh-core (0.0.13-1~swh1) unstable-swh; urgency=medium * Prepare deployment of swh.core v0.0.13 -- Nicolas Dandrimont Fri, 09 Oct 2015 17:32:49 +0200 swh-core (0.0.12-1~swh1) unstable-swh; urgency=medium * Prepare deployment of swh.core v0.0.12 -- Nicolas Dandrimont Tue, 06 Oct 2015 17:34:34 +0200 swh-core (0.0.11-1~swh1) unstable-swh; urgency=medium * Prepare deployment of swh.core v0.0.11 -- Nicolas Dandrimont Sat, 03 Oct 2015 15:57:03 +0200 swh-core (0.0.10-1~swh1) unstable-swh; urgency=medium * Prepare deploying swh.core v0.0.10 -- Nicolas Dandrimont Sat, 03 Oct 2015 12:28:52 +0200 swh-core (0.0.9-1~swh1) unstable-swh; urgency=medium * Prepare deploying swh.core v0.0.9 -- Nicolas Dandrimont Sat, 03 Oct 2015 11:36:55 +0200 swh-core (0.0.8-1~swh1) unstable-swh; urgency=medium * Prepare deployment of swh.core v0.0.8 -- Nicolas Dandrimont Thu, 01 Oct 2015 12:31:44 +0200 swh-core (0.0.7-1~swh1) unstable-swh; urgency=medium * Prepare deployment of swh.core v0.0.7 -- Nicolas Dandrimont Thu, 01 Oct 2015 11:29:04 +0200 swh-core (0.0.6-1~swh1) unstable-swh; urgency=medium * Prepare deployment of swh.core v0.0.6 -- Nicolas Dandrimont Tue, 29 Sep 2015 16:48:44 +0200 swh-core (0.0.5-1~swh1) unstable-swh; urgency=medium * Prepare v0.0.5 deployment -- Nicolas Dandrimont Tue, 29 Sep 2015 16:08:32 +0200 swh-core (0.0.4-1~swh1) unstable-swh; urgency=medium * Tagging swh.core 0.0.4 -- Nicolas Dandrimont Fri, 25 Sep 2015 15:41:26 +0200 swh-core (0.0.3-1~swh1) unstable-swh; urgency=medium * Tag swh.core v0.0.3 -- Nicolas Dandrimont Fri, 25 Sep 2015 11:07:10 +0200 swh-core (0.0.2-1~swh1) unstable-swh; urgency=medium * Deploy v0.0.2 -- Nicolas Dandrimont Wed, 23 Sep 2015 12:08:50 +0200 swh-core (0.0.1-1~swh1) unstable-swh; urgency=medium * Initial release * Tag v0.0.1 for deployment -- Nicolas Dandrimont Tue, 22 Sep 2015 14:52:26 +0200 diff --git a/docs/index.rst b/docs/index.rst index 8b64117..1954db2 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,15 +1,17 @@ +.. _swh-core: + Software Heritage - Development Documentation ============================================= .. toctree:: :maxdepth: 2 :caption: Contents: Indices and tables ================== * :ref:`genindex` * :ref:`modindex` * :ref:`search` diff --git a/swh.core.egg-info/PKG-INFO b/swh.core.egg-info/PKG-INFO index c9b5ff8..a630e6b 100644 --- a/swh.core.egg-info/PKG-INFO +++ b/swh.core.egg-info/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.core -Version: 0.0.35 +Version: 0.0.36 Summary: Software Heritage core utilities Home-page: https://forge.softwareheritage.org/diffusion/DCORE/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/swh.core.egg-info/SOURCES.txt b/swh.core.egg-info/SOURCES.txt index 73c6b68..fd3b0f2 100644 --- a/swh.core.egg-info/SOURCES.txt +++ b/swh.core.egg-info/SOURCES.txt @@ -1,43 +1,44 @@ .gitignore AUTHORS LICENSE MANIFEST.in Makefile README.md requirements-swh.txt requirements.txt setup.py version.txt bin/swh-hashdir bin/swh-hashfile debian/changelog debian/compat debian/control debian/copyright debian/rules debian/source/format docs/.gitignore docs/Makefile docs/conf.py docs/index.rst docs/_static/.placeholder docs/_templates/.placeholder sql/log-schema.sql swh/__init__.py swh.core.egg-info/PKG-INFO swh.core.egg-info/SOURCES.txt swh.core.egg-info/dependency_links.txt swh.core.egg-info/requires.txt swh.core.egg-info/top_level.txt swh/core/__init__.py swh/core/api.py swh/core/api_async.py swh/core/config.py swh/core/logger.py swh/core/serializers.py +swh/core/tarball.py swh/core/utils.py swh/core/tests/db_testing.py swh/core/tests/test_config.py swh/core/tests/test_logger.py swh/core/tests/test_serializers.py swh/core/tests/test_utils.py \ No newline at end of file diff --git a/swh/core/tarball.py b/swh/core/tarball.py new file mode 100644 index 0000000..69d56f4 --- /dev/null +++ b/swh/core/tarball.py @@ -0,0 +1,227 @@ +# Copyright (C) 2015-2017 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import os +import stat +import tarfile +import zipfile + +from os.path import abspath, realpath, join, dirname +from . import utils + + +def _canonical_abspath(path): + """Resolve all paths to an absolute and real one. + + Args: + path: to resolve + + Returns: + canonical absolute path to path + + """ + return realpath(abspath(path)) + + +def _badpath(path, basepath): + """Determine if a path is outside basepath. + + Args: + path: a relative or absolute path of a file or directory + basepath: the basepath path must be in + + Returns: + True if path is outside basepath, false otherwise. + + """ + return not _canonical_abspath(join(basepath, path)).startswith(basepath) + + +def _badlink(info, basepath): + """Determine if the tarinfo member is outside basepath. + + Args: + info: TarInfo member representing a symlink or hardlink of tar archive + basepath: the basepath the info member must be in + + Returns: + True if info is outside basepath, false otherwise. + + """ + tippath = _canonical_abspath(join(basepath, dirname(info.name))) + return _badpath(info.linkname, basepath=tippath) + + +def is_tarball(filepath): + """Given a filepath, determine if it represents an archive. + + Args: + filepath: file to test for tarball property + + Returns: + Bool, True if it's a tarball, False otherwise + + """ + return tarfile.is_tarfile(filepath) or zipfile.is_zipfile(filepath) + + +def _uncompress_zip(tarpath, dirpath): + """Uncompress zip archive safely. + + As per zipfile is concerned + (cf. note on https://docs.python.org/3.5/library/zipfile.html#zipfile.ZipFile.extract) # noqa + + Args: + tarpath: path to the archive + dirpath: directory to uncompress the archive to + + """ + with zipfile.ZipFile(tarpath) as z: + z.extractall(path=dirpath) + + +def _uncompress_tar(tarpath, dirpath): + """Uncompress tarpath if the tarpath is safe. + Safe means, no file will be uncompressed outside of dirpath. + + Args: + tarpath: path to the archive + dirpath: directory to uncompress the archive to + + Raises: + ValueError when a member would be extracted outside dirpath. + + """ + def safemembers(tarpath, members, basepath): + """Given a list of archive members, yield the members (directory, + file, hard-link) that stays in bounds with basepath. Note + that symbolic link are authorized to point outside the + basepath though. + + Args: + tarpath: Name of the tarball + members: Archive members for such tarball + basepath: the basepath sandbox + + Yields: + Safe TarInfo member + + Raises: + ValueError when a member would be extracted outside basepath + + """ + errormsg = 'Archive {} blocked. Illegal path to %s %s'.format(tarpath) + + for finfo in members: + if finfo.isdir() and _badpath(finfo.name, basepath): + raise ValueError(errormsg % ('directory', finfo.name)) + elif finfo.isfile() and _badpath(finfo.name, basepath): + raise ValueError(errormsg % ('file', finfo.name)) + elif finfo.islnk() and _badlink(finfo, basepath): + raise ValueError(errormsg % ('hard-link', finfo.linkname)) + # Authorize symlinks to point outside basepath + # elif finfo.issym() and _badlink(finfo, basepath): + # raise ValueError(errormsg % ('symlink', finfo.linkname)) + else: + yield finfo + + with tarfile.open(tarpath) as t: + members = t.getmembers() + t.extractall(path=dirpath, + members=safemembers(tarpath, members, dirpath)) + + +def uncompress(tarpath, dest): + """Uncompress tarpath to dest folder if tarball is supported and safe. + Safe means, no file will be uncompressed outside of dirpath. + + Note that this fixes permissions after successfully + uncompressing the archive. + + Args: + tarpath: path to tarball to uncompress + dest: the destination folder where to uncompress the tarball + + Returns: + The nature of the tarball, zip or tar. + + Raises: + ValueError when: + - an archive member would be extracted outside basepath + - the archive is not supported + + """ + if tarfile.is_tarfile(tarpath): + _uncompress_tar(tarpath, dest) + nature = 'tar' + elif zipfile.is_zipfile(tarpath): + _uncompress_zip(tarpath, dest) + nature = 'zip' + else: + raise ValueError('File %s is not a supported archive.' % tarpath) + + # Fix permissions + for dirpath, _, fnames in os.walk(dest): + os.chmod(dirpath, 0o755) + for fname in fnames: + fpath = os.path.join(dirpath, fname) + if not os.path.islink(fpath): + fpath_exec = os.stat(fpath).st_mode & stat.S_IXUSR + if not fpath_exec: + os.chmod(fpath, 0o644) + + return nature + + +def _ls(rootdir): + """Generator of filepath, filename from rootdir. + + """ + for dirpath, dirnames, fnames in os.walk(rootdir): + for fname in (dirnames+fnames): + fpath = os.path.join(dirpath, fname) + fname = utils.commonname(rootdir, fpath) + yield fpath, fname + + +def _compress_zip(tarpath, files): + """Compress dirpath's content as tarpath. + + """ + with zipfile.ZipFile(tarpath, 'w') as z: + for fpath, fname in files: + z.write(fpath, arcname=fname) + + +def _compress_tar(tarpath, files): + """Compress dirpath's content as tarpath. + + """ + with tarfile.open(tarpath, 'w:bz2') as t: + for fpath, fname in files: + t.add(fpath, arcname=fname, recursive=False) + + +def compress(tarpath, nature, dirpath_or_files): + """Create a tarball tarpath with nature nature. + The content of the tarball is either dirpath's content (if representing + a directory path) or dirpath's iterable contents. + + Compress the directory dirpath's content to a tarball. + The tarball being dumped at tarpath. + The nature of the tarball is determined by the nature argument. + + """ + if isinstance(dirpath_or_files, str): + files = _ls(dirpath_or_files) + else: # iterable of 'filepath, filename' + files = dirpath_or_files + + if nature == 'zip': + _compress_zip(tarpath, files) + else: + _compress_tar(tarpath, files) + + return tarpath diff --git a/swh/core/tests/test_utils.py b/swh/core/tests/test_utils.py index 931647e..85403c4 100644 --- a/swh/core/tests/test_utils.py +++ b/swh/core/tests/test_utils.py @@ -1,110 +1,124 @@ -# Copyright (C) 2015-2016 The Software Heritage developers +# Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest from nose.tools import istest from swh.core import utils class UtilsLib(unittest.TestCase): @istest def grouper(self): # given actual_data = utils.grouper((i for i in range(0, 9)), 2) out = [] for d in actual_data: out.append(list(d)) # force generator resolution for checks self.assertEqual(out, [[0, 1], [2, 3], [4, 5], [6, 7], [8]]) # given actual_data = utils.grouper((i for i in range(9, 0, -1)), 4) out = [] for d in actual_data: out.append(list(d)) # force generator resolution for checks self.assertEqual(out, [[9, 8, 7, 6], [5, 4, 3, 2], [1]]) @istest def backslashescape_errors(self): raw_data_err = b'abcd\x80' with self.assertRaises(UnicodeDecodeError): raw_data_err.decode('utf-8', 'strict') self.assertEquals( raw_data_err.decode('utf-8', 'backslashescape'), 'abcd\\x80', ) raw_data_ok = b'abcd\xc3\xa9' self.assertEquals( raw_data_ok.decode('utf-8', 'backslashescape'), raw_data_ok.decode('utf-8', 'strict'), ) unicode_data = 'abcdef\u00a3' self.assertEquals( unicode_data.encode('ascii', 'backslashescape'), b'abcdef\\xa3', ) @istest def encode_with_unescape(self): valid_data = '\\x01020304\\x00' valid_data_encoded = b'\x01020304\x00' self.assertEquals( valid_data_encoded, utils.encode_with_unescape(valid_data) ) @istest def encode_with_unescape_invalid_escape(self): invalid_data = 'test\\abcd' with self.assertRaises(ValueError) as exc: utils.encode_with_unescape(invalid_data) self.assertIn('invalid escape', exc.exception.args[0]) self.assertIn('position 4', exc.exception.args[0]) @istest def decode_with_escape(self): backslashes = b'foo\\bar\\\\baz' backslashes_escaped = 'foo\\\\bar\\\\\\\\baz' self.assertEquals( backslashes_escaped, utils.decode_with_escape(backslashes), ) valid_utf8 = b'foo\xc3\xa2' valid_utf8_escaped = 'foo\u00e2' self.assertEquals( valid_utf8_escaped, utils.decode_with_escape(valid_utf8), ) invalid_utf8 = b'foo\xa2' invalid_utf8_escaped = 'foo\\xa2' self.assertEquals( invalid_utf8_escaped, utils.decode_with_escape(invalid_utf8), ) valid_utf8_nul = b'foo\xc3\xa2\x00' valid_utf8_nul_escaped = 'foo\u00e2\\x00' self.assertEquals( valid_utf8_nul_escaped, utils.decode_with_escape(valid_utf8_nul), ) + + @istest + def commonname(self): + # when + actual_commonname = utils.commonname('/some/where/to/', + '/some/where/to/go/to') + # then + self.assertEquals('go/to', actual_commonname) + + # when + actual_commonname2 = utils.commonname(b'/some/where/to/', + b'/some/where/to/go/to') + # then + self.assertEquals(b'go/to', actual_commonname2) diff --git a/swh/core/utils.py b/swh/core/utils.py index a4921d0..0e748a1 100644 --- a/swh/core/utils.py +++ b/swh/core/utils.py @@ -1,96 +1,103 @@ -# Copyright (C) 2016 The Software Heritage developers +# Copyright (C) 2016-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import itertools import codecs from contextlib import contextmanager @contextmanager def cwd(path): """Contextually change the working directory to do thy bidding. Then gets back to the original location. """ prev_cwd = os.getcwd() os.chdir(path) try: yield finally: os.chdir(prev_cwd) def grouper(iterable, n): """Collect data into fixed-length chunks or blocks. Args: iterable: an iterable n: size of block fillvalue: value to use for the last block Returns: fixed-length chunks of blocks as iterables """ args = [iter(iterable)] * n for _data in itertools.zip_longest(*args, fillvalue=None): yield (d for d in _data if d is not None) def backslashescape_errors(exception): if isinstance(exception, UnicodeDecodeError): bad_data = exception.object[exception.start:exception.end] escaped = ''.join(r'\x%02x' % x for x in bad_data) return escaped, exception.end return codecs.backslashreplace_errors(exception) codecs.register_error('backslashescape', backslashescape_errors) def encode_with_unescape(value): """Encode an unicode string containing \\x backslash escapes""" slices = [] start = 0 odd_backslashes = False i = 0 while i < len(value): if value[i] == '\\': odd_backslashes = not odd_backslashes else: if odd_backslashes: if value[i] != 'x': raise ValueError('invalid escape for %r at position %d' % (value, i-1)) slices.append( value[start:i-1].replace('\\\\', '\\').encode('utf-8') ) slices.append(bytes.fromhex(value[i+1:i+3])) odd_backslashes = False start = i = i + 3 continue i += 1 slices.append( value[start:i].replace('\\\\', '\\').encode('utf-8') ) return b''.join(slices) def decode_with_escape(value): """Decode a bytestring as utf-8, escaping the bytes of invalid utf-8 sequences as \\x. We also escape NUL bytes as they are invalid in JSON strings. """ # escape backslashes value = value.replace(b'\\', b'\\\\') value = value.replace(b'\x00', b'\\x00') return value.decode('utf-8', 'backslashescape') + + +def commonname(path0, path1, as_str=False): + """Compute the commonname between the path0 and path1. + + """ + return path1.split(path0)[1] diff --git a/version.txt b/version.txt index 73d8b02..64c0eae 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.35-0-g0dab089 \ No newline at end of file +v0.0.36-0-g141d1a3 \ No newline at end of file