Changeset View
Standalone View
swh/core/tarball.py
# Copyright (C) 2015-2017 The Software Heritage developers | # Copyright (C) 2015-2017 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import os | import os | ||||
import shutil | |||||
import stat | import stat | ||||
import tarfile | import tarfile | ||||
import zipfile | import zipfile | ||||
from os.path import abspath, realpath, join, dirname | from os.path import abspath, realpath, join, dirname | ||||
from . import utils | from . import utils | ||||
def _canonical_abspath(path): | def _canonical_abspath(path): | ||||
"""Resolve all paths to an absolute and real one. | """Resolve all paths to an absolute and real one. | ||||
douardda: As always, naming... what does 'specific' mean here? | |||||
Done Inline ActionsI will improve on this. It's explained in the docstring though. It was initially named "unpack_tar_Z" as the first implementation was "specific" to tar.Z. ardumont: I will improve on this. It's explained in the docstring though.
It was initially named… | |||||
Not Done Inline Actions: This example list should at least be terminated by a '...'. The description should mention that, since it uses the tar command, it supports any compression format the latter supports. douardda: This example list should at least be terminated by a '...'. The description should mention that… | |||||
Done Inline ActionsI did not put the ellipsis because the use case foreseen at the time was only for those. I agree we could use this for more though. ardumont: I did not put the ellipsis because the use case foreseen at the time was only for those.
(See… | |||||
Args: | Args: | ||||
path: to resolve | path: to resolve | ||||
Returns: | Returns: | ||||
canonical absolute path to path | canonical absolute path to path | ||||
""" | """ | ||||
▲ Show 20 Lines • Show All 45 Lines • ▼ Show 20 Lines | |||||
def _uncompress_zip(tarpath, dirpath): | def _uncompress_zip(tarpath, dirpath): | ||||
"""Uncompress zip archive safely. | """Uncompress zip archive safely. | ||||
As per zipfile is concerned | As per zipfile is concerned | ||||
(cf. note on https://docs.python.org/3.5/library/zipfile.html#zipfile.ZipFile.extract) # noqa | (cf. note on https://docs.python.org/3.5/library/zipfile.html#zipfile.ZipFile.extract) # noqa | ||||
Args: | Args: | ||||
tarpath: path to the archive | tarpath: path to the archive | ||||
dirpath: directory to uncompress the archive to | dirpath: directory to uncompress the archive to | ||||
Not Done Inline ActionsWhy raise this exception from within the surrounding try block? douardda: Why raise this exception from within the surrounding try block?
A non existing file should not… | |||||
Done Inline ActionsThis implementation is oriented toward the shutil registration... I also want to avoid indirection in function to translate error (one function calling this one and trapping error the shutil expected way). What should be a proper way to do so? I could also make the _unpack_tar a private function. ardumont: This implementation is oriented toward the shutil registration...
Thus raising the same error… | |||||
""" | """ | ||||
with zipfile.ZipFile(tarpath) as z: | with zipfile.ZipFile(tarpath) as z: | ||||
z.extractall(path=dirpath) | z.extractall(path=dirpath) | ||||
Not Done Inline ActionsThis 'exist_ok' is a bit scary. Not sure we do want that. douardda: This 'exist_ok' is a bit scary. Not sure we do want that. | |||||
Done Inline Actionswell, i want to make a dedicated directory containing only the content of the tarball. ardumont: well, i want to make a dedicated directory containing only the content of the tarball.
I guess… | |||||
Not Done Inline Actionsno need for this "embedded" import statement, raise to the top of the module. douardda: no need for this "embedded" import statement, raise to the top of the module. | |||||
Not Done Inline Actionsbetter dispatch the elements of the expected 3-tuple in 3 variables here. Like douardda: better dispatch the elements of the expected 3-tuple in 3 variables here. Like
`for name… | |||||
Done Inline Actionsoh yes, i did that and reverted it with something else. ardumont: oh yes, i did that and reverted it with something else.
I'll add it back. | |||||
def _safemembers(tarpath, members, basepath): | def _safemembers(tarpath, members, basepath): | ||||
"""Given a list of archive members, yield the members (directory, | """Given a list of archive members, yield the members (directory, | ||||
file, hard-link) that stays in bounds with basepath. Note | file, hard-link) that stays in bounds with basepath. Note | ||||
that symbolic link are authorized to point outside the | that symbolic link are authorized to point outside the | ||||
basepath though. | basepath though. | ||||
Args: | Args: | ||||
Show All 37 Lines | def _uncompress_tar(tarpath, dirpath): | ||||
""" | """ | ||||
with tarfile.open(tarpath) as t: | with tarfile.open(tarpath) as t: | ||||
members = t.getmembers() | members = t.getmembers() | ||||
t.extractall(path=dirpath, | t.extractall(path=dirpath, | ||||
members=_safemembers(tarpath, members, dirpath)) | members=_safemembers(tarpath, members, dirpath)) | ||||
def unpack_tar_Z(tarpath: str, extract_dir: str) -> str:
    """Unpack a .tar.Z archive using the system ``tar`` command.

    The archive is extracted into a dedicated sub-directory of
    extract_dir, named after the archive file name. Every failure is
    reported as shutil.ReadError, which is the error uncompress() catches
    around shutil.unpack_archive.

    Args:
        tarpath: path to the archive to unpack
        extract_dir: directory below which the extraction directory is
            created

    Returns:
        Full path to the directory holding the uncompressed content

    Raises:
        shutil.ReadError: when the archive does not exist, the output
            directory cannot be created, or the tar command cannot be
            started or exits with a non-zero status.

    """
    # Function-scope import so this block does not depend on an edit of
    # the module-level import section.
    import subprocess

    def _read_error(reason):
        return shutil.ReadError(
            f'Unable to uncompress {tarpath} to {extract_dir}. '
            f'Reason: {reason}')

    if not os.path.exists(tarpath):
        raise _read_error(f'{tarpath} not found')

    output_directory = os.path.join(extract_dir, os.path.basename(tarpath))
    try:
        os.makedirs(output_directory, exist_ok=True)
        # check=True: a failing tar (corrupted or unsupported archive) must
        # not be reported as a successful extraction.
        subprocess.run(
            ['tar', 'xf', tarpath, '-C', output_directory], check=True)
    except (OSError, subprocess.SubprocessError) as e:
        raise _read_error(e) from e
    return output_directory
def register_new_archive_formats():
    """Register the additional archive formats with shutil.

    Each entry of ADDITIONAL_ARCHIVE_FORMATS is a (name, extensions,
    function) triple. Formats whose name shutil already knows are skipped,
    so calling this function several times is harmless.

    """
    registered_formats = {f[0] for f in shutil.get_unpack_formats()}
    for name, extensions, function in ADDITIONAL_ARCHIVE_FORMATS:
        if name in registered_formats:
            continue
        shutil.register_unpack_format(
            name=name, extensions=extensions, function=function)
        # Track what was just registered so that a name listed twice in
        # the table is not registered a second time.
        registered_formats.add(name)
def uncompress(tarpath: str, dest: str):
    """Uncompress tarpath to dest folder if the archive is supported.

    Unpacking is delegated to shutil.unpack_archive, which selects the
    unpack function among the formats registered with shutil.

    NOTE(review): the guarantee that no file is uncompressed outside of
    dest now depends on what shutil.unpack_archive provides for the
    selected format -- confirm before using this on untrusted archives.

    Note that this fixes permissions after successfully uncompressing
    the archive: every directory below dest is set to 0o755 and every
    regular file without the owner-executable bit is set to 0o644.

    Args:
        tarpath: path to tarball to uncompress
        dest: the destination folder where to uncompress the tarball

    Raises:
        ValueError when shutil.unpack_archive raises shutil.ReadError,
        i.e. the format is not supported or the archive could not be
        read. The original error is kept as the exception's cause.

    """
    try:
        shutil.unpack_archive(tarpath, extract_dir=dest)
    except shutil.ReadError as e:
        # Keep the cause: it tells an unknown format apart from an
        # archive that failed to unpack.
        raise ValueError(
            f'File {tarpath} is not a supported archive.') from e

    # Fix permissions
    for dirpath, _, fnames in os.walk(dest):
        os.chmod(dirpath, 0o755)
        for fname in fnames:
            fpath = os.path.join(dirpath, fname)
            if os.path.islink(fpath):
                continue
            # Executable files keep the mode they were extracted with.
            if not os.stat(fpath).st_mode & stat.S_IXUSR:
                os.chmod(fpath, 0o644)
def _ls(rootdir): | def _ls(rootdir): | ||||
"""Generator of filepath, filename from rootdir. | """Generator of filepath, filename from rootdir. | ||||
""" | """ | ||||
for dirpath, dirnames, fnames in os.walk(rootdir): | for dirpath, dirnames, fnames in os.walk(rootdir): | ||||
for fname in (dirnames+fnames): | for fname in (dirnames+fnames): | ||||
fpath = os.path.join(dirpath, fname) | fpath = os.path.join(dirpath, fname) | ||||
Show All 35 Lines | else: # iterable of 'filepath, filename' | ||||
files = dirpath_or_files | files = dirpath_or_files | ||||
if nature == 'zip': | if nature == 'zip': | ||||
_compress_zip(tarpath, files) | _compress_zip(tarpath, files) | ||||
else: | else: | ||||
_compress_tar(tarpath, files) | _compress_tar(tarpath, files) | ||||
return tarpath | return tarpath | ||||
# Additional archive formats to register with shutil for uncompression.
# Each entry is a (name, extensions, function) triple, consumed by
# register_new_archive_formats() and handed to shutil.register_unpack_format.
ADDITIONAL_ARCHIVE_FORMATS = [
    # name  , extensions, function
    ('tar.Z', ['.tar.Z'], unpack_tar_Z),
]
As always, naming... what does 'specific' mean here?