diff --git a/swh/core/tarball.py b/swh/core/tarball.py index 261bfe7800fa5b4758c838a186b21f0f2742e506..154443280f4191ecf4ec31721c734377b2062e1e 100644 --- a/swh/core/tarball.py +++ b/swh/core/tarball.py @@ -1,142 +1,58 @@ -# Copyright (C) 2015-2017 The Software Heritage developers +# Copyright (C) 2015-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os +import shutil import stat import tarfile import zipfile -from os.path import abspath, realpath, join, dirname -from . import utils - - -def _canonical_abspath(path): - """Resolve all paths to an absolute and real one. - - Args: - path: to resolve - - Returns: - canonical absolute path to path - - """ - return realpath(abspath(path)) - - -def _badpath(path, basepath): - """Determine if a path is outside basepath. - - Args: - path: a relative or absolute path of a file or directory - basepath: the basepath path must be in - - Returns: - True if path is outside basepath, false otherwise. - - """ - return not _canonical_abspath(join(basepath, path)).startswith(basepath) - +from subprocess import run -def _badlink(info, basepath): - """Determine if the tarinfo member is outside basepath. - - Args: - info: TarInfo member representing a symlink or hardlink of tar archive - basepath: the basepath the info member must be in - - Returns: - True if info is outside basepath, false otherwise. - - """ - tippath = _canonical_abspath(join(basepath, dirname(info.name))) - return _badpath(info.linkname, basepath=tippath) - - -def is_tarball(filepath): - """Given a filepath, determine if it represents an archive. +from . import utils - Args: - filepath: file to test for tarball property - Returns: - Bool, True if it's a tarball, False otherwise +def _unpack_tar(tarpath: str, extract_dir: str) -> str: + """Unpack tarballs unsupported by the standard python library. Examples + include tar.Z, tar.lz, tar.x, etc.... - """ - return tarfile.is_tarfile(filepath) or zipfile.is_zipfile(filepath) + As this implementation relies on the `tar` command, this function supports + the same compression the tar command supports. + This expects the `extract_dir` to exist. -def _uncompress_zip(tarpath, dirpath): - """Uncompress zip archive safely. + Raises - As per zipfile is concerned - (cf. note on https://docs.python.org/3.5/library/zipfile.html#zipfile.ZipFile.extract) # noqa + shutil.ReadError in case of issue uncompressing the archive (tarpath + does not exist, extract_dir does not exist, etc...) - Args: - tarpath: path to the archive - dirpath: directory to uncompress the archive to + Returns + full path to the uncompressed directory. """ - with zipfile.ZipFile(tarpath) as z: - z.extractall(path=dirpath) - - -def _safemembers(tarpath, members, basepath): - """Given a list of archive members, yield the members (directory, - file, hard-link) that stays in bounds with basepath. Note - that symbolic link are authorized to point outside the - basepath though. + try: + run(['tar', 'xf', tarpath, '-C', extract_dir], check=True) + return extract_dir + except Exception as e: + raise shutil.ReadError( + f'Unable to uncompress {tarpath} to {extract_dir}. Reason: {e}') - Args: - tarpath: Name of the tarball - members: Archive members for such tarball - basepath: the basepath sandbox - - Yields: - Safe TarInfo member - Raises: - ValueError when a member would be extracted outside basepath +def register_new_archive_formats(): + """Register new archive formats to uncompress """ - errormsg = 'Archive {} blocked. Illegal path to %s %s'.format(tarpath) - - for finfo in members: - if finfo.isdir() and _badpath(finfo.name, basepath): - raise ValueError(errormsg % ('directory', finfo.name)) - elif finfo.isfile() and _badpath(finfo.name, basepath): - raise ValueError(errormsg % ('file', finfo.name)) - elif finfo.islnk() and _badlink(finfo, basepath): - raise ValueError(errormsg % ('hard-link', finfo.linkname)) - # Authorize symlinks to point outside basepath - # elif finfo.issym() and _badlink(finfo, basepath): - # raise ValueError(errormsg % ('symlink', finfo.linkname)) - else: - yield finfo - - -def _uncompress_tar(tarpath, dirpath): - """Uncompress tarpath if the tarpath is safe. - Safe means, no file will be uncompressed outside of dirpath. - - Args: - tarpath: path to the archive - dirpath: directory to uncompress the archive to - - Raises: - ValueError when a member would be extracted outside dirpath. - - """ - with tarfile.open(tarpath) as t: - members = t.getmembers() - t.extractall(path=dirpath, - members=_safemembers(tarpath, members, dirpath)) + registered_formats = [f[0] for f in shutil.get_unpack_formats()] + for name, extensions, function in ADDITIONAL_ARCHIVE_FORMATS: + if name in registered_formats: + continue + shutil.register_unpack_format(name, extensions, function) -def uncompress(tarpath, dest): - """Uncompress tarpath to dest folder if tarball is supported and safe. - Safe means, no file will be uncompressed outside of dirpath. +def uncompress(tarpath: str, dest: str): + """Uncompress tarpath to dest folder if tarball is supported. Note that this fixes permissions after successfully uncompressing the archive. @@ -149,19 +65,13 @@ def uncompress(tarpath, dest): The nature of the tarball, zip or tar. Raises: - ValueError when: - - an archive member would be extracted outside basepath - - the archive is not supported + ValueError when a problem occurs during unpacking """ - if tarfile.is_tarfile(tarpath): - _uncompress_tar(tarpath, dest) - nature = 'tar' - elif zipfile.is_zipfile(tarpath): - _uncompress_zip(tarpath, dest) - nature = 'zip' - else: - raise ValueError('File %s is not a supported archive.' % tarpath) + try: + shutil.unpack_archive(tarpath, extract_dir=dest) + except shutil.ReadError as e: + raise ValueError(f'Problem during unpacking {tarpath}. Reason: {e}') # Fix permissions for dirpath, _, fnames in os.walk(dest): @@ -173,8 +83,6 @@ def uncompress(tarpath, dest): if not fpath_exec: os.chmod(fpath, 0o644) - return nature - def _ls(rootdir): """Generator of filepath, filename from rootdir. @@ -226,3 +134,14 @@ def compress(tarpath, nature, dirpath_or_files): _compress_tar(tarpath, files) return tarpath + + +# Additional uncompression archive format support +ADDITIONAL_ARCHIVE_FORMATS = [ + # name , extensions, function + ('tar.Z|x', ['.tar.Z', '.tar.x'], _unpack_tar), + # FIXME: make this optional depending on the runtime lzip package install + ('tar.lz', ['.tar.lz'], _unpack_tar), +] + +register_new_archive_formats() diff --git a/swh/core/tests/data/archives/groff-1.02.tar.Z b/swh/core/tests/data/archives/groff-1.02.tar.Z new file mode 100644 index 0000000000000000000000000000000000000000..973ffb6f7e0d216413e88472cf66af581b76819d Binary files /dev/null and b/swh/core/tests/data/archives/groff-1.02.tar.Z differ diff --git a/swh/core/tests/data/archives/hello.tar b/swh/core/tests/data/archives/hello.tar new file mode 100644 index 0000000000000000000000000000000000000000..73fafe8ed56bf5bf5c42f71c0fb89f2788898847 Binary files /dev/null and b/swh/core/tests/data/archives/hello.tar differ diff --git a/swh/core/tests/data/archives/hello.tar.bz2 b/swh/core/tests/data/archives/hello.tar.bz2 new file mode 100644 index 0000000000000000000000000000000000000000..730b09fcbcedc390e496a8acc501c15ed3e5d152 Binary files /dev/null and b/swh/core/tests/data/archives/hello.tar.bz2 differ diff --git a/swh/core/tests/data/archives/hello.tar.gz b/swh/core/tests/data/archives/hello.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..87727dbfd745257a5c2bc1bdbc4da2574aa33852 Binary files /dev/null and b/swh/core/tests/data/archives/hello.tar.gz differ diff --git a/swh/core/tests/data/archives/hello.tar.lz b/swh/core/tests/data/archives/hello.tar.lz new file mode 100644 index 0000000000000000000000000000000000000000..8195e331841f012ef9016125e90809d54b26093d Binary files /dev/null and b/swh/core/tests/data/archives/hello.tar.lz differ diff --git a/swh/core/tests/data/archives/hello.tar.x b/swh/core/tests/data/archives/hello.tar.x new file mode 100644 index 0000000000000000000000000000000000000000..09298d3aa33e111ef9404f0ec1dc491bf09000a1 Binary files /dev/null and b/swh/core/tests/data/archives/hello.tar.x differ diff --git a/swh/core/tests/data/archives/hello.zip b/swh/core/tests/data/archives/hello.zip new file mode 100644 index 0000000000000000000000000000000000000000..e9dde2adf56a4eafdaae4abd67f9ca7d5d7fef43 Binary files /dev/null and b/swh/core/tests/data/archives/hello.zip differ diff --git a/swh/core/tests/test_tarball.py b/swh/core/tests/test_tarball.py index 92e4f5e7e384375e237683ab6251df2fb7c975f9..7c7f189025886c010dace8af9c52ca234eea0312 100644 --- a/swh/core/tests/test_tarball.py +++ b/swh/core/tests/test_tarball.py @@ -3,28 +3,27 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from zipfile import ZipFile +import os +import pytest +import shutil from swh.core import tarball -def test_is_tarball(tmp_path): +@pytest.fixture +def prepare_shutil_state(): + """Reset any shutil modification in its current state - nozip = tmp_path / 'nozip.zip' - nozip.write_text('Im no zip') + """ + import shutil - assert tarball.is_tarball(str(nozip)) is False + registered_formats = [f[0] for f in shutil.get_unpack_formats()] + for format_id in tarball.ADDITIONAL_ARCHIVE_FORMATS: + name = format_id[0] + if name in registered_formats: + shutil.unregister_unpack_format(name) - notar = tmp_path / 'notar.tar' - notar.write_text('Im no tar') - - assert tarball.is_tarball(str(notar)) is False - - zipfile = tmp_path / 'truezip.zip' - with ZipFile(str(zipfile), 'w') as myzip: - myzip.writestr('file1.txt', 'some content') - - assert tarball.is_tarball(str(zipfile)) is True + return shutil def test_compress_uncompress_zip(tmp_path): @@ -38,8 +37,6 @@ def test_compress_uncompress_zip(tmp_path): zipfile = tmp_path / 'archive.zip' tarball.compress(str(zipfile), 'zip', str(tocompress)) - assert tarball.is_tarball(str(zipfile)) - destdir = tmp_path / 'destdir' tarball.uncompress(str(zipfile), str(destdir)) @@ -58,10 +55,115 @@ def test_compress_uncompress_tar(tmp_path): tarfile = tmp_path / 'archive.tar' tarball.compress(str(tarfile), 'tar', str(tocompress)) - assert tarball.is_tarball(str(tarfile)) - destdir = tmp_path / 'destdir' tarball.uncompress(str(tarfile), str(destdir)) lsdir = sorted(x.name for x in destdir.iterdir()) assert ['file%s.txt' % i for i in range(10)] == lsdir + + +def test__unpack_tar_failure(tmp_path, datadir): + """Unpack inexistent tarball should fail + + """ + tarpath = os.path.join(datadir, 'archives', 'inexistent-archive.tar.Z') + + assert not os.path.exists(tarpath) + + with pytest.raises(shutil.ReadError, + match=f'Unable to uncompress {tarpath} to {tmp_path}'): + tarball._unpack_tar(tarpath, tmp_path) + + +def test__unpack_tar_failure2(tmp_path, datadir): + """Unpack Existent tarball into an inexistent folder should fail + + """ + filename = 'groff-1.02.tar.Z' + tarpath = os.path.join(datadir, 'archives', filename) + + assert os.path.exists(tarpath) + + extract_dir = os.path.join(tmp_path, 'dir', 'inexistent') + + with pytest.raises(shutil.ReadError, + match=f'Unable to uncompress {tarpath} to {tmp_path}'): + tarball._unpack_tar(tarpath, extract_dir) + + +def test__unpack_tar_failure3(tmp_path, datadir): + """Unpack unsupported tarball should fail + + """ + filename = 'hello.zip' + tarpath = os.path.join(datadir, 'archives', filename) + + assert os.path.exists(tarpath) + + with pytest.raises(shutil.ReadError, + match=f'Unable to uncompress {tarpath} to {tmp_path}'): + tarball._unpack_tar(tarpath, tmp_path) + + +def test__unpack_tar(tmp_path, datadir): + """Unpack supported tarball into an existent folder should be ok + + """ + filename = 'groff-1.02.tar.Z' + tarpath = os.path.join(datadir, 'archives', filename) + + assert os.path.exists(tarpath) + + extract_dir = os.path.join(tmp_path, filename) + os.makedirs(extract_dir, exist_ok=True) + + output_directory = tarball._unpack_tar(tarpath, extract_dir) + + assert extract_dir == output_directory + assert len(os.listdir(extract_dir)) > 0 + + +def test_register_new_archive_formats(prepare_shutil_state): + """Registering new archive formats should be fine + + """ + unpack_formats_v1 = [f[0] for f in shutil.get_unpack_formats()] + for format_id in tarball.ADDITIONAL_ARCHIVE_FORMATS: + assert format_id[0] not in unpack_formats_v1 + + # when + tarball.register_new_archive_formats() + + # then + unpack_formats_v2 = [f[0] for f in shutil.get_unpack_formats()] + for format_id in tarball.ADDITIONAL_ARCHIVE_FORMATS: + assert format_id[0] in unpack_formats_v2 + + +def test_uncompress_tarpaths(tmp_path, datadir, prepare_shutil_state): + """High level call uncompression on un/supported tarballs + + """ + archive_dir = os.path.join(datadir, 'archives') + tarfiles = os.listdir(archive_dir) + tarpaths = [os.path.join(archive_dir, tarfile) for tarfile in tarfiles] + + unsupported_tarpaths = [] + for t in tarpaths: + if t.endswith('.Z') or t.endswith('.x') or t.endswith('.lz'): + unsupported_tarpaths.append(t) + + # not supported yet + for tarpath in unsupported_tarpaths: + with pytest.raises(ValueError, + match=f'Problem during unpacking {tarpath}.'): + tarball.uncompress(tarpath, dest=tmp_path) + + # register those unsupported formats + tarball.register_new_archive_formats() + + # unsupported formats are now supported + for n, tarpath in enumerate(tarpaths, start=1): + tarball.uncompress(tarpath, dest=tmp_path) + + assert n == len(tarpaths)