From 7f85bd7cea2cc493e9b29307131bab2476077c9c Mon Sep 17 00:00:00 2001
From: Antoine Lambert <anlambert@softwareheritage.org>
Date: Thu, 9 Jun 2022 14:49:05 +0200
Subject: [PATCH] tarball: Use standard Python module zipfile to extract jar
 archive

There are many cases where using unzip to extract a jar archive
fails while using the zipfile module succeeds.

So prefer to use the zipfile module to uncompress jar archives.

Related to T4318
---
 swh/core/tarball.py                    |  26 ++++++++++++++++++++++++-
 swh/core/tests/data/archives/hello.jar | Bin 0 -> 550 bytes
 2 files changed, 25 insertions(+), 1 deletion(-)
 create mode 100644 swh/core/tests/data/archives/hello.jar

diff --git a/swh/core/tarball.py b/swh/core/tarball.py
index d7b01e90..e2d06b99 100644
--- a/swh/core/tarball.py
+++ b/swh/core/tarball.py
@@ -64,6 +64,29 @@ def _unpack_zip(zippath: str, extract_dir: str) -> str:
         )
 
 
+def _unpack_jar(jarpath: str, extract_dir: str) -> str:
+    """Unpack jar files using standard Python module zipfile.
+
+    This expects the `extract_dir` to exist.
+
+    Raises:
+        shutil.ReadError in case of issue uncompressing the archive (jarpath
+        does not exist, is not a valid zip file, etc...); the original
+        exception is chained as its cause.
+
+    Returns:
+        full path to the uncompressed directory.
+    """
+    try:
+        with zipfile.ZipFile(jarpath) as jar:
+            jar.extractall(path=extract_dir)
+        return extract_dir
+    except Exception as e:
+        raise shutil.ReadError(
+            f"Unable to uncompress {jarpath} to {extract_dir}. Reason: {e}"
+        ) from e
+
+
 def register_new_archive_formats():
     """Register new archive formats to uncompress"""
     registered_formats = [f[0] for f in shutil.get_unpack_formats()]
@@ -80,6 +103,7 @@ _mime_to_archive_format = {
     "application/gzip": "gztar",
     "application/x-lzip": "tar.lz",
     "application/zip": "zip",
+    "application/java-archive": "jar",
 }
 
 
@@ -192,7 +216,7 @@ def compress(tarpath, nature, dirpath_or_files):
 ADDITIONAL_ARCHIVE_FORMATS = [
     # name, extensions, function
     ("tar.Z|x", [".tar.Z", ".tar.x"], _unpack_tar),
-    ("jar", [".jar"], _unpack_zip),
+    ("jar", [".jar"], _unpack_jar),
     ("tbz2", [".tbz", "tbz2"], _unpack_tar),
     # FIXME: make this optional depending on the runtime lzip package install
     ("tar.lz", [".tar.lz"], _unpack_tar),
diff --git a/swh/core/tests/data/archives/hello.jar b/swh/core/tests/data/archives/hello.jar
new file mode 100644
index 0000000000000000000000000000000000000000..7b7a00fc8fb49ecc64b0678234dd8768c369affc
GIT binary patch
literal 550
zcmWIWW@Zs#;Nak3@Gd<W!hi%g8CV#6T|*poJ^kGD|D9rBU}gyLX6FE@V1g<Oz^PdT
zr~<Cp*U`_@%{4eg&)4m<@0rs+-nx1hdA)VD&Yd~GImqCO@q?#DdS1Rdp1v1LSFvz0
zxPIaB6*Y+onrtNe*`#u3#ZIujd^_@4%z&nv12M>6E(QjmTQ*nL_(L57rr3aFMruw@
zzCOZjJJHne!PTT?=A`PCRFs_dID7JBxW;+?Q=Si*z_#gHbF2rc;{oCTZ$>5&2Gl@;
zg#aj!PyyW6pddrnitGtc5FvmqkO|j{6ks3|xEOGI7Gc6xAQR?wgo8nzNA?WJQwVSq
a$b{&D1x0{2D<}+s-eO^R3#649K|BCgoMET{

literal 0
HcmV?d00001

-- 
GitLab