Changeset View
Standalone View
swh/core/tarball.py
# Copyright (C) 2015-2017 The Software Heritage developers | # Copyright (C) 2015-2017 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import os | import os | ||||
import shutil | |||||
import stat | import stat | ||||
import tarfile | import tarfile | ||||
import zipfile | import zipfile | ||||
from os.path import abspath, realpath, join, dirname | |||||
from . import utils | from . import utils | ||||
def _canonical_abspath(path): | def unpack_specific_tar(tarpath: str, extract_dir: str) -> str: | ||||
douardda: As always, naming... what does 'specific' mean here? | |||||
ardumontAuthorUnsubmitted Done Inline ActionsI will improve on this. It's explained in the docstring though. It was initially named "unpack_tar_Z" as the first implementation was "specific" to tar.Z. ardumont: I will improve on this. It's explained in the docstring though.
It was initially named… | |||||
"""Resolve all paths to an absolute and real one. | """Unpack specific tarballs (.tar.Z, .tar.lz, .tar.x) file. Returns the | ||||
douarddaUnsubmitted Not Done Inline ActionsThis example list should at least be terminated by a '...'. The description should mention that since it uses the tar command, it supports any compression format the latter supports. douardda: This example list should at least be terminated by a '...'. The description should mention that… | |||||
ardumontAuthorUnsubmitted Done Inline ActionsI did not put the ellipsis because the use case foreseen at the time was only for those. I agree we could use this for more though. ardumont: I did not put the ellipsis because the use case foreseen at the time was only for those.
(See… | |||||
full path to the uncompressed directory. | |||||
This relies on the `tar` command. | |||||
Raises | |||||
ReadError in case of issue uncompressing the archive. | |||||
""" | |||||
try: | |||||
if not os.path.exists(tarpath): | |||||
raise ValueError(f'{tarpath} not found') | |||||
douarddaUnsubmitted Not Done Inline ActionsWhy raise this exception from within the surrounding try block? douardda: Why raise this exception from within the surrounding try block?
A non existing file should not… | |||||
ardumontAuthorUnsubmitted Done Inline ActionsThis implementation is oriented toward the shutil registration... I also want to avoid indirection in function to translate error (one function calling this one and trapping error the shutil expected way). What should be a proper way to do so? I could also make the _unpack_tar a private function. ardumont: This implementation is oriented toward the shutil registration...
Thus raising the same error… | |||||
filename = os.path.basename(tarpath) | |||||
output_directory = os.path.join(extract_dir, filename) | |||||
os.makedirs(output_directory, exist_ok=True) | |||||
douarddaUnsubmitted Not Done Inline ActionsThis 'exist_ok' is a bit scary. Not sure we do want that. douardda: This 'exist_ok' is a bit scary. Not sure we do want that. | |||||
ardumontAuthorUnsubmitted Done Inline Actionswell, i want to make a dedicated directory containing only the content of the tarball. ardumont: well, i want to make a dedicated directory containing only the content of the tarball.
I guess… | |||||
from subprocess import run | |||||
douarddaUnsubmitted Not Done Inline ActionsNo need for this "embedded" import statement; move it to the top of the module. douardda: No need for this "embedded" import statement; move it to the top of the module. | |||||
run(['tar', 'xf', tarpath, '-C', output_directory]) | |||||
# data = os.listdir(output_directory) | |||||
# assert len(data) > 0 | |||||
return output_directory | |||||
except Exception as e: | |||||
raise shutil.ReadError( | |||||
f'Unable to uncompress {tarpath} to {extract_dir}. Reason: {e}') | |||||
def register_new_archive_formats(): | |||||
"""Register new archive formats to uncompress | |||||
""" | |||||
registered_formats = [f[0] for f in shutil.get_unpack_formats()] | |||||
for format_id in ADDITIONAL_ARCHIVE_FORMATS: | |||||
douarddaUnsubmitted Not Done Inline Actionsbetter dispatch the elements of the expected 3-tuple in 3 variables here. Like douardda: better dispatch the elements of the expected 3-tuple in 3 variables here. Like
`for name… | |||||
ardumontAuthorUnsubmitted Done Inline Actionsoh yes, i did that and reverted it with something else. ardumont: oh yes, i did that and reverted it with something else.
I'll add it back. | |||||
name = format_id[0] | |||||
if name in registered_formats: | |||||
continue | |||||
shutil.register_unpack_format( | |||||
name=format_id[0], extensions=format_id[1], function=format_id[2]) | |||||
Args: | |||||
path: to resolve | |||||
Returns: | |||||
canonical absolute path to path | |||||
""" | |||||
return realpath(abspath(path)) | |||||
def _badpath(path, basepath): | |||||
"""Determine if a path is outside basepath. | |||||
Args: | |||||
path: a relative or absolute path of a file or directory | |||||
basepath: the basepath path must be in | |||||
Returns: | |||||
True if path is outside basepath, false otherwise. | |||||
""" | |||||
return not _canonical_abspath(join(basepath, path)).startswith(basepath) | |||||
def _badlink(info, basepath):
    """Tell whether a link member's target falls outside its sandbox.

    The directory holding the member (``dirname(info.name)`` joined onto
    ``basepath``) is canonicalised and used as the reference directory;
    ``info.linkname`` is then checked against that directory.

    Args:
        info: TarInfo member representing a symlink or hardlink of tar archive
        basepath: the basepath the info member must be in

    Returns:
        True if info is outside basepath, false otherwise.

    """
    member_dir = dirname(info.name)
    anchor = _canonical_abspath(join(basepath, member_dir))
    return _badpath(info.linkname, basepath=anchor)
def is_tarball(filepath):
    """Tell whether filepath is an archive this module recognises.

    The file is probed as a tar archive first, then as a zip archive.

    Args:
        filepath: file to test for tarball property

    Returns:
        Bool, True if it's a tarball (tar or zip), False otherwise

    """
    if tarfile.is_tarfile(filepath):
        return True
    return zipfile.is_zipfile(filepath)
def _uncompress_zip(tarpath, dirpath): | |||||
"""Uncompress zip archive safely. | |||||
As per zipfile is concerned | def uncompress(tarpath: str, dest: str): | ||||
(cf. note on https://docs.python.org/3.5/library/zipfile.html#zipfile.ZipFile.extract) # noqa | |||||
Args: | |||||
tarpath: path to the archive | |||||
dirpath: directory to uncompress the archive to | |||||
""" | |||||
with zipfile.ZipFile(tarpath) as z: | |||||
z.extractall(path=dirpath) | |||||
def _safemembers(tarpath, members, basepath): | |||||
"""Given a list of archive members, yield the members (directory, | |||||
file, hard-link) that stays in bounds with basepath. Note | |||||
that symbolic link are authorized to point outside the | |||||
basepath though. | |||||
Args: | |||||
tarpath: Name of the tarball | |||||
members: Archive members for such tarball | |||||
basepath: the basepath sandbox | |||||
Yields: | |||||
Safe TarInfo member | |||||
Raises: | |||||
ValueError when a member would be extracted outside basepath | |||||
""" | |||||
errormsg = 'Archive {} blocked. Illegal path to %s %s'.format(tarpath) | |||||
for finfo in members: | |||||
if finfo.isdir() and _badpath(finfo.name, basepath): | |||||
raise ValueError(errormsg % ('directory', finfo.name)) | |||||
elif finfo.isfile() and _badpath(finfo.name, basepath): | |||||
raise ValueError(errormsg % ('file', finfo.name)) | |||||
elif finfo.islnk() and _badlink(finfo, basepath): | |||||
raise ValueError(errormsg % ('hard-link', finfo.linkname)) | |||||
# Authorize symlinks to point outside basepath | |||||
# elif finfo.issym() and _badlink(finfo, basepath): | |||||
# raise ValueError(errormsg % ('symlink', finfo.linkname)) | |||||
else: | |||||
yield finfo | |||||
def _uncompress_tar(tarpath, dirpath):
    """Extract the tar archive tarpath into dirpath, safely.

    Every member is first run through the sandbox check so that no file
    gets uncompressed outside of dirpath.

    Args:
        tarpath: path to the archive
        dirpath: directory to uncompress the archive to

    Raises:
        ValueError when a member would be extracted outside dirpath.

    """
    with tarfile.open(tarpath) as archive:
        # Lazy generator: members are vetted one by one while extractall()
        # consumes them.
        vetted = _safemembers(tarpath, archive.getmembers(), dirpath)
        archive.extractall(path=dirpath, members=vetted)
def uncompress(tarpath, dest): | |||||
"""Uncompress tarpath to dest folder if tarball is supported and safe. | """Uncompress tarpath to dest folder if tarball is supported and safe. | ||||
Safe means, no file will be uncompressed outside of dirpath. | Safe means, no file will be uncompressed outside of dirpath. | ||||
Note that this fixes permissions after successfully | Note that this fixes permissions after successfully | ||||
uncompressing the archive. | uncompressing the archive. | ||||
Args: | Args: | ||||
tarpath: path to tarball to uncompress | tarpath: path to tarball to uncompress | ||||
dest: the destination folder where to uncompress the tarball | dest: the destination folder where to uncompress the tarball | ||||
Returns: | Returns: | ||||
The nature of the tarball, zip or tar. | The nature of the tarball, zip or tar. | ||||
Raises: | Raises: | ||||
ValueError when: | ValueError when the archive is not supported | ||||
douarddaUnsubmitted Not Done Inline ActionsUnless I'm mistaken, this is not true. The ValueError will be raised in case of a corrupted file for example, which is IMHO not what we want. douardda: Unless I'm mistaken, this is not true. The ValueError will be raised in case of a corrupted… | |||||
- an archive member would be extracted outside basepath | |||||
- the archive is not supported | |||||
""" | """ | ||||
if tarfile.is_tarfile(tarpath): | try: | ||||
_uncompress_tar(tarpath, dest) | shutil.unpack_archive(tarpath, extract_dir=dest) | ||||
nature = 'tar' | except shutil.ReadError: | ||||
elif zipfile.is_zipfile(tarpath): | raise ValueError(f'File {tarpath} is not a supported archive.') | ||||
_uncompress_zip(tarpath, dest) | |||||
nature = 'zip' | |||||
else: | |||||
raise ValueError('File %s is not a supported archive.' % tarpath) | |||||
ardumontAuthorUnsubmitted Done Inline ActionsNo other client of this relies on the actual 'nature' returned here. ardumont: No other client of this relies on the actual 'nature' returned here. | |||||
# Fix permissions | # Fix permissions | ||||
for dirpath, _, fnames in os.walk(dest): | for dirpath, _, fnames in os.walk(dest): | ||||
os.chmod(dirpath, 0o755) | os.chmod(dirpath, 0o755) | ||||
for fname in fnames: | for fname in fnames: | ||||
fpath = os.path.join(dirpath, fname) | fpath = os.path.join(dirpath, fname) | ||||
if not os.path.islink(fpath): | if not os.path.islink(fpath): | ||||
fpath_exec = os.stat(fpath).st_mode & stat.S_IXUSR | fpath_exec = os.stat(fpath).st_mode & stat.S_IXUSR | ||||
if not fpath_exec: | if not fpath_exec: | ||||
os.chmod(fpath, 0o644) | os.chmod(fpath, 0o644) | ||||
return nature | |||||
def _ls(rootdir): | def _ls(rootdir): | ||||
"""Generator of filepath, filename from rootdir. | """Generator of filepath, filename from rootdir. | ||||
""" | """ | ||||
for dirpath, dirnames, fnames in os.walk(rootdir): | for dirpath, dirnames, fnames in os.walk(rootdir): | ||||
for fname in (dirnames+fnames): | for fname in (dirnames+fnames): | ||||
fpath = os.path.join(dirpath, fname) | fpath = os.path.join(dirpath, fname) | ||||
Show All 35 Lines | else: # iterable of 'filepath, filename' | ||||
files = dirpath_or_files | files = dirpath_or_files | ||||
if nature == 'zip': | if nature == 'zip': | ||||
_compress_zip(tarpath, files) | _compress_zip(tarpath, files) | ||||
else: | else: | ||||
_compress_tar(tarpath, files) | _compress_tar(tarpath, files) | ||||
return tarpath | return tarpath | ||||
# Additional uncompression archive format support | |||||
ADDITIONAL_ARCHIVE_FORMATS = [ | |||||
# name , extensions, function | |||||
('tar.Z|x', ['.tar.Z', '.tar.x'], unpack_specific_tar), | |||||
# FIXME: make this optional depending on the runtime lzip package install | |||||
('tar.lz', ['.tar.lz'], unpack_specific_tar), | |||||
] |
As always, naming... what does 'specific' mean here?