From 9ff5f6492f46b7f3342d47f138b590f09e939865 Mon Sep 17 00:00:00 2001
From: Yorick van Pelt <yorick@yorickvanpelt.nl>
Date: Sat, 7 Dec 2019 22:35:14 +0700
Subject: [PATCH] libarchive proof of concept

Replace the Rust unpack_tarfile FFI implementation with libarchive.
libarchive auto-detects both the compression filter and the archive
format (archive_read_support_filter_all / archive_read_support_format_all),
so unpackTarfile no longer needs a baseName argument to pick a
decompressor by file suffix, and nix-prefetch-url --unpack no longer
needs to shell out to `unzip` for zip files. Requires libarchive >= 3.4.0.

---
 Makefile.config.in                       |   1 +
 configure.ac                             |   2 +
 release-common.nix                       |   1 +
 src/libstore/download.cc                 |   2 +-
 src/libutil/local.mk                     |   2 +-
 src/libutil/tarfile.cc                   | 135 +++++++++++++++++++----
 src/libutil/tarfile.hh                   |   3 +-
 src/nix-prefetch-url/nix-prefetch-url.cc |   5 +-
 8 files changed, 124 insertions(+), 27 deletions(-)

diff --git a/Makefile.config.in b/Makefile.config.in
index 7e3b35b98..fe609ce06 100644
--- a/Makefile.config.in
+++ b/Makefile.config.in
@@ -18,6 +18,7 @@ SODIUM_LIBS = @SODIUM_LIBS@
 LIBLZMA_LIBS = @LIBLZMA_LIBS@
 SQLITE3_LIBS = @SQLITE3_LIBS@
 LIBBROTLI_LIBS = @LIBBROTLI_LIBS@
+LIBARCHIVE_LIBS = @LIBARCHIVE_LIBS@
 EDITLINE_LIBS = @EDITLINE_LIBS@
 bash = @bash@
 bindir = @bindir@
diff --git a/configure.ac b/configure.ac
index 9dd0acd86..29835195f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -178,6 +178,8 @@ AC_CHECK_LIB([bz2], [BZ2_bzWriteOpen], [true],
   [AC_MSG_ERROR([Nix requires libbz2, which is part of bzip2.  See https://web.archive.org/web/20180624184756/http://www.bzip.org/.])])
 AC_CHECK_HEADERS([bzlib.h], [true],
   [AC_MSG_ERROR([Nix requires libbz2, which is part of bzip2.  See https://web.archive.org/web/20180624184756/http://www.bzip.org/.])])
+# Checks for libarchive
+PKG_CHECK_MODULES([LIBARCHIVE], [libarchive >= 3.4.0], [CXXFLAGS="$LIBARCHIVE_CFLAGS $CXXFLAGS"])
 
 # Look for SQLite, a required dependency.
 PKG_CHECK_MODULES([SQLITE3], [sqlite3 >= 3.6.19], [CXXFLAGS="$SQLITE3_CFLAGS $CXXFLAGS"])
diff --git a/release-common.nix b/release-common.nix
index dd5f939d9..f8c93f76e 100644
--- a/release-common.nix
+++ b/release-common.nix
@@ -49,6 +49,7 @@ rec {
     [ curl
       bzip2 xz brotli editline
       openssl pkgconfig sqlite boehmgc
+      libarchive
       boost
       nlohmann_json
       rustc cargo
diff --git a/src/libstore/download.cc b/src/libstore/download.cc
index 61e88c5c1..c7c1b93ad 100644
--- a/src/libstore/download.cc
+++ b/src/libstore/download.cc
@@ -907,7 +907,7 @@ CachedDownloadResult Downloader::downloadCached(
             printInfo("unpacking '%s'...", url);
             Path tmpDir = createTempDir();
             AutoDelete autoDelete(tmpDir, true);
-            unpackTarfile(store->toRealPath(storePath), tmpDir, baseNameOf(url));
+            unpackTarfile(store->toRealPath(storePath), tmpDir);
             auto members = readDirectory(tmpDir);
             if (members.size() != 1)
                 throw nix::Error("tarball '%s' contains an unexpected number of top-level files", url);
diff --git a/src/libutil/local.mk b/src/libutil/local.mk
index 35c1f6c13..16c1fa03f 100644
--- a/src/libutil/local.mk
+++ b/src/libutil/local.mk
@@ -6,6 +6,6 @@ libutil_DIR := $(d)
 
 libutil_SOURCES := $(wildcard $(d)/*.cc)
 
-libutil_LDFLAGS = $(LIBLZMA_LIBS) -lbz2 -pthread $(OPENSSL_LIBS) $(LIBBROTLI_LIBS) $(BOOST_LDFLAGS) -lboost_context
+libutil_LDFLAGS = $(LIBLZMA_LIBS) -lbz2 -pthread $(OPENSSL_LIBS) $(LIBBROTLI_LIBS) $(LIBARCHIVE_LIBS) $(BOOST_LDFLAGS) -lboost_context
 
 libutil_LIBS = libnixrust
diff --git a/src/libutil/tarfile.cc b/src/libutil/tarfile.cc
index 2cc7793fd..ab30002dd 100644
--- a/src/libutil/tarfile.cc
+++ b/src/libutil/tarfile.cc
@@ -1,5 +1,8 @@
 #include "rust-ffi.hh"
 #include "compression.hh"
+#include <archive.h>
+#include <archive_entry.h>
+#include "finally.hh"
 
 extern "C" {
     rust::Result<std::tuple<>> *
@@ -8,29 +11,123 @@ extern "C" {
 
 namespace nix {
 
-void unpackTarfile(Source & source, const Path & destDir)
-{
-    rust::Source source2(source);
-    rust::CBox(unpack_tarfile(source2, destDir))->unwrap();
+std::shared_ptr<struct archive> archive_read_ptr() {
+    return std::shared_ptr<struct archive>(archive_read_new(),
+        [](auto p) {
+            archive_read_close(p);
+            archive_read_free(p);
+        });
 }
-
-void unpackTarfile(const Path & tarFile, const Path & destDir,
-    std::optional<std::string> baseName)
+void archive_read_open_source(std::shared_ptr<struct archive> a, Source& s, unsigned int bufsize = 1024) {
+    std::shared_ptr<unsigned char> buffer((unsigned char*)malloc(bufsize), [](auto p) { free(p); });
+    typedef struct {
+        decltype(buffer) buf;
+        Source& src;
+        unsigned int bs;
+    } St;
+    St* state = new St({buffer, s, bufsize});
+    if (archive_read_open(a.get(), state,
+            NULL /* open */,
+            ([] (struct archive*, void* sptr, const void** buf) -> long int {
+                 St& s = *(static_cast<St*>(sptr));
+                 *buf = s.buf.get();
+                 try {
+                     return s.src.read(s.buf.get(), s.bs);
+                 } catch (EndOfFile &) {
+                     return 0;
+                 }
+                 /* TODO: I don't know what happens if anything else is thrown here */
+             }), [] (struct archive*, void* sptr) {
+                     delete static_cast<St*>(sptr);
+                     return ARCHIVE_OK;
+                 })) {
+        throw Error("archive is corrupt (%s)", archive_error_string(a.get()));
+    }
+}
+std::shared_ptr<struct archive> archive_write_ptr() {
+    return std::shared_ptr<struct archive>(archive_write_disk_new(),
+        [](auto p) {
+            archive_write_close(p);
+            archive_write_free(p);
+        });
+}
+static void copy_data(std::shared_ptr<struct archive> ar, std::shared_ptr<struct archive> aw)
 {
-    if (!baseName) baseName = baseNameOf(tarFile);
+  int r;
+  const void *buff;
+  size_t size;
+  la_int64_t offset;
 
-    auto source = sinkToSource([&](Sink & sink) {
-        // FIXME: look at first few bytes to determine compression type.
-        auto decompressor =
-            // FIXME: add .gz support
-            hasSuffix(*baseName, ".bz2") ? makeDecompressionSink("bzip2", sink) :
-            hasSuffix(*baseName, ".xz") ? makeDecompressionSink("xz", sink) :
-            makeDecompressionSink("none", sink);
-        readFile(tarFile, *decompressor);
-        decompressor->finish();
-    });
+  for (;;) {
+      r = archive_read_data_block(ar.get(), &buff, &size, &offset);
+      if (r == ARCHIVE_EOF) return;
+      if (r < ARCHIVE_OK) {
+          throw Error("archive is corrupt (%s)", archive_error_string(ar.get()));
+      }
+      r = archive_write_data_block(aw.get(), buff, size, offset);
+      if (r < ARCHIVE_OK) {
+          throw Error("could not write archive output (%s)", archive_error_string(aw.get()));
+      }
+  }
+}
 
-    unpackTarfile(*source, destDir);
+static void extract_archive(std::shared_ptr<struct archive> a, const Path & destDir) {
+    char * cwd = getcwd(0, 0);
+    if (!cwd) throw SysError("getting current directory");
+    Finally freeCwd([&]() { free(cwd); });
+    int r = chdir(destDir.c_str());
+    if (r != 0) throw SysError("setting directory to tar output path");
+    struct archive_entry *entry;
+    r = archive_read_next_header(a.get(), &entry);
+    if (r != ARCHIVE_OK) {
+        throw Error("archive is corrupt (%s)", archive_error_string(a.get()));
+    }
+    int flags = 0;
+    auto ext = archive_write_ptr();
+    flags |= ARCHIVE_EXTRACT_PERM;
+    flags |= ARCHIVE_EXTRACT_FFLAGS;
+    archive_write_disk_set_options(ext.get(), flags);
+    archive_write_disk_set_standard_lookup(ext.get());
+    for(;;) {
+        r = archive_read_next_header(a.get(), &entry);
+        if (r == ARCHIVE_EOF) break;
+        if (r == ARCHIVE_WARN) {
+            std::cerr << "warning: " << archive_error_string(a.get());
+        } else if (r < ARCHIVE_WARN) {
+            throw Error("archive is corrupt (%s)", archive_error_string(a.get()));
+        }
+        r = archive_write_header(ext.get(), entry);
+        if (r != ARCHIVE_OK) {
+            throw Error("could not write archive output (%s)", archive_error_string(ext.get()));
+        }
+        if (archive_entry_size(entry) > 0) {
+            copy_data(a, ext);
+        }
+        archive_write_finish_entry(ext.get());
+    }
+    r = chdir(cwd);
+    if (r != 0) throw SysError("resetting directory after archive extraction");
+}
+void unpackTarfile(Source & source, const Path & destDir)
+{
+    auto a = archive_read_ptr();
+    archive_read_support_filter_all(a.get());
+    archive_read_support_format_all(a.get());
+    archive_read_open_source(a, source);
+    createDirs(destDir);
+    extract_archive(a, destDir);
+}
+void unpackTarfile(const Path & tarFile, const Path & destDir)
+{
+    auto a = archive_read_ptr();
+    archive_read_support_filter_all(a.get());
+    archive_read_support_format_all(a.get());
+    int r = archive_read_open_filename(a.get(), tarFile.c_str(), 16384);
+    if (r != ARCHIVE_OK) {
+        throw Error("archive is corrupt (%s)", archive_error_string(a.get()));
+    }
+    createDirs(destDir);
+    extract_archive(a, destDir);
 }
 
 }
diff --git a/src/libutil/tarfile.hh b/src/libutil/tarfile.hh
index ce0911e2a..89a024f1d 100644
--- a/src/libutil/tarfile.hh
+++ b/src/libutil/tarfile.hh
@@ -4,7 +4,6 @@ namespace nix {
 
 void unpackTarfile(Source & source, const Path & destDir);
 
-void unpackTarfile(const Path & tarFile, const Path & destDir,
-    std::optional<std::string> baseName = {});
+void unpackTarfile(const Path & tarFile, const Path & destDir);
 
 }
diff --git a/src/nix-prefetch-url/nix-prefetch-url.cc b/src/nix-prefetch-url/nix-prefetch-url.cc
index 78c883833..48714446b 100644
--- a/src/nix-prefetch-url/nix-prefetch-url.cc
+++ b/src/nix-prefetch-url/nix-prefetch-url.cc
@@ -190,10 +190,7 @@ static int _main(int argc, char * * argv)
                 printInfo("unpacking...");
                 Path unpacked = (Path) tmpDir + "/unpacked";
                 createDirs(unpacked);
-                if (hasSuffix(baseNameOf(uri), ".zip"))
-                    runProgram("unzip", true, {"-qq", tmpFile, "-d", unpacked});
-                else
-                    unpackTarfile(tmpFile, unpacked, baseNameOf(uri));
+                unpackTarfile(tmpFile, unpacked);
 
                 /* If the archive unpacks to a single file/directory, then use
                    that as the top-level. */
-- 
GitLab