diff --git a/Makefile b/Makefile
index f549f234..c23c96ea 100644
--- a/Makefile
+++ b/Makefile
@@ -14,7 +14,7 @@ LDFLAGS=-L$(TBB_LIBDIR) -Wl,-rpath=$(TBB_LIBDIR) \
 LIBS=-lcrypto -pthread -ltbb -lmimalloc -lz -lxxhash -ldl
 OBJS=main.o object_file.o input_sections.o output_chunks.o mapfile.o perf.o \
      linker_script.o archive_file.o output_file.o subprocess.o gc_sections.o \
-     icf.o symbols.o cmdline.o filepath.o glob.o passes.o tar.o \
+     icf.o symbols.o cmdline.o filepath.o glob.o passes.o tar.o compress.o \
      arch_x86_64.o arch_i386.o
 
 DEBUG ?= 0
diff --git a/compress.cc b/compress.cc
new file mode 100644
index 00000000..09ed03d5
--- /dev/null
+++ b/compress.cc
@@ -0,0 +1,102 @@
+// This file implements a multi-threaded zlib compression routine.
+//
+// Multiple pieces of raw deflate data (zlib's compressed format without
+// its header and trailer) can be merged just by concatenation as long
+// as each piece is terminated with Z_SYNC_FLUSH.
+
+#include "mold.h"
+
+#include <tbb/parallel_for_each.h>
+#include <zlib.h>
+
+static constexpr i64 SHARD_SIZE = 1024 * 1024; // max input bytes per shard
+
+// Cuts `input` into consecutive views of at most SHARD_SIZE bytes each.
+static std::vector<std::string_view> split(std::string_view input) {
+  std::vector<std::string_view> pieces;
+  // Peel off full-size pieces while at least SHARD_SIZE bytes remain.
+  for (; input.size() >= SHARD_SIZE; input.remove_prefix(SHARD_SIZE))
+    pieces.push_back(input.substr(0, SHARD_SIZE));
+  // Whatever is left is shorter than SHARD_SIZE; keep it unless empty.
+  if (input.size() > 0)
+    pieces.push_back(input);
+  return pieces;
+}
+
+// Deflates `input` into a raw deflate fragment ending in a Z_SYNC_FLUSH.
+static std::vector<u8> do_compress(std::string_view input) {
+  // Debug info generally compresses well, so level 3 is enough. The
+  // negative windowBits (-15) requests raw deflate with no zlib wrapper.
+  z_stream zs;
+  zs.zalloc = Z_NULL;
+  zs.zfree = Z_NULL;
+  zs.opaque = Z_NULL;
+  int rc = deflateInit2(&zs, 3, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY);
+  assert(rc == Z_OK);
+  // Hand the whole input to zlib in one go.
+  zs.next_in = (u8 *)input.data();
+  zs.avail_in = input.size();
+
+  // deflateBound() is an upper bound on the compressed size; the extra
+  // 16 bytes leave room for what Z_SYNC_FLUSH appends.
+  std::vector<u8> out(deflateBound(&zs, zs.avail_in) + 16);
+  zs.next_out = out.data();
+  zs.avail_out = out.size();
+
+  // One call must consume everything and still leave spare output room.
+  rc = deflate(&zs, Z_SYNC_FLUSH);
+  assert(rc == Z_OK);
+  assert(zs.avail_out > 0);
+  // Drop the unused tail of the buffer before returning it.
+  out.resize(out.size() - zs.avail_out);
+  deflateEnd(&zs);
+  return out;
+}
+
+// Compresses input shard by shard in parallel and checksums all of it.
+Compress::Compress(std::string_view input) {
+  std::vector<std::string_view> inputs = split(input);
+  std::vector<u64> adlers(inputs.size());
+  shards.resize(inputs.size());
+  // Checksum and deflate every shard independently of the others.
+  tbb::parallel_for((i64)0, (i64)inputs.size(), [&](i64 i) {
+    adlers[i] = adler32(1, (u8 *)inputs[i].data(), inputs[i].size());
+    shards[i] = do_compress(inputs[i]);
+  });
+  // Combine checksums. split() yields no shards for an empty input, so
+  // start from 1 (Adler-32's initial value) rather than read adlers[0].
+  checksum = adlers.empty() ? 1 : adlers[0];
+  for (i64 i = 1; i < inputs.size(); i++)
+    checksum = adler32_combine(checksum, adlers[i], inputs[i].size());
+}
+
+i64 Compress::size() const { // byte count that write_to() produces
+  i64 total = 2 + 6; // 2-byte header + 6 bytes of trailer and checksum
+  for (const std::vector<u8> &piece : shards)
+    total += piece.size();
+  return total;
+}
+
+// Writes size() bytes to buf: zlib header, shards, final block, Adler-32.
+void Compress::write_to(u8 *buf) {
+  // Write a zlib-format header
+  buf[0] = 0x78;
+  buf[1] = 0x9c;
+  // Compute where each shard goes. A running position is used rather
+  // than seeding offsets[0], which would be out of bounds with no shards.
+  std::vector<i64> offsets(shards.size());
+  i64 pos = 2; // +2 for header
+  for (i64 i = 0; i < shards.size(); i++) {
+    offsets[i] = pos;
+    pos += shards[i].size();
+  }
+  // Copy compressed data
+  tbb::parallel_for((i64)0, (i64)shards.size(), [&](i64 i) {
+    memcpy(&buf[offsets[i]], shards[i].data(), shards[i].size());
+  });
+  // Write a trailer: an empty final deflate block ends the stream
+  buf[pos] = 3;
+  buf[pos + 1] = 0;
+  // Write a checksum, stored big-endian after the trailer
+  write32be(buf + pos + 2, checksum);
+}
diff --git a/mold.h b/mold.h
index 25cfe30e..9c4f6b66 100644
--- a/mold.h
+++ b/mold.h
@@ -53,6 +53,7 @@ template <typename E> class Symbol;
 template <typename E> struct Context;
 template <typename E> struct FdeRecord;
 template <typename E> struct CieRecord;
+class Compress;
 class TarFile;
 
 template <typename E> void cleanup();
@@ -812,7 +813,8 @@ public:
   void copy_buf(Context<E> &ctx) override;
 
 private:
-  std::unique_ptr<u8[]> contents;
+  ElfChdr<E> chdr = {};
+  std::unique_ptr<Compress> contents;
 };
 
 template <typename E>
@@ -1241,6 +1243,21 @@ template <typename E>
 void parse_nonpositional_args(Context<E> &ctx,
                               std::vector<std::string_view> &remaining);
 
+//
+// compress.cc
+//
+
+class Compress { // multi-threaded zlib compressor; see compress.cc
+public:
+  Compress(std::string_view input); // compresses `input` right away
+  void write_to(u8 *buf);           // emits size() bytes into buf
+  i64 size() const;                 // byte length of the compressed stream
+
+private:
+  std::vector<std::vector<u8>> shards; // compressed pieces, in input order
+  u64 checksum = 0;                    // combined Adler-32 of the input
+};
+
 //
 // tar.cc
 //
@@ -2127,3 +2144,17 @@ template <typename T, typename U>
 inline void sort(T &vec, U less) {
   std::stable_sort(vec.begin(), vec.end(), less);
 }
+
+inline u64 read64be(u8 *buf) { // decodes 8 bytes, most significant first
+  u64 val = 0;
+  for (int i = 0; i < 8; i++)
+    val = (val << 8) | buf[i];
+  return val;
+}
+
+// Stores val into buf[0..3] in big-endian order (high byte first).
+inline void write32be(u8 *buf, u32 val) {
+  // Fill from the last byte backwards, shifting val down as we go.
+  for (int i = 3; i >= 0; i--, val >>= 8)
+    buf[i] = val;
+}
diff --git a/object_file.cc b/object_file.cc
index 4bd71d34..70abaf5f 100644
--- a/object_file.cc
+++ b/object_file.cc
@@ -181,13 +181,6 @@ u32 ObjectFile<E>::read_note_gnu_property(Context<E> &ctx,
   return ret;
 }
 
-static u64 read64be(u8 *buf) {
-  return ((u64)buf[0] << 56) | ((u64)buf[1] << 48) |
-         ((u64)buf[2] << 40) | ((u64)buf[3] << 32) |
-         ((u64)buf[4] << 24) | ((u64)buf[5] << 16) |
-         ((u64)buf[6] << 8)  | (u64)buf[7];
-}
-
 template <typename E>
 std::pair<std::string_view, const ElfShdr<E> *>
 ObjectFile<E>::uncompress_contents(Context<E> &ctx, const ElfShdr<E> &shdr,
diff --git a/output_chunks.cc b/output_chunks.cc
index 122262d0..2c3526ea 100644
--- a/output_chunks.cc
+++ b/output_chunks.cc
@@ -1634,26 +1634,23 @@ CompressedSection<E>::CompressedSection(Context<E> &ctx, OutputChunk<E> &chunk)
   std::unique_ptr<u8[]> buf(new u8[chunk.shdr.sh_size]);
   chunk.write_to(ctx, buf.get());
 
-  ElfChdr<E> hdr = {};
-  hdr.ch_type = ELFCOMPRESS_ZLIB;
-  hdr.ch_size = chunk.shdr.sh_size;
-  hdr.ch_addralign = chunk.shdr.sh_addralign;
+  chdr.ch_type = ELFCOMPRESS_ZLIB;
+  chdr.ch_size = chunk.shdr.sh_size;
+  chdr.ch_addralign = chunk.shdr.sh_addralign;
 
-  unsigned long size = compressBound(chunk.shdr.sh_size);
-  contents.reset(new u8[sizeof(hdr) + size]);
-  memcpy(contents.get(), &hdr, sizeof(hdr));
-  int res = compress2(contents.get() + sizeof(hdr), &size, buf.get(),
-                      chunk.shdr.sh_size, Z_DEFAULT_COMPRESSION);
+  contents.reset(new Compress({(char *)buf.get(), chunk.shdr.sh_size}));
 
   this->shdr = chunk.shdr;
   this->shdr.sh_flags |= SHF_COMPRESSED;
   this->shdr.sh_addralign = 1;
-  this->shdr.sh_size = sizeof(hdr) + size;
+  this->shdr.sh_size = sizeof(chdr) + contents->size();
 }
 
 template <typename E>
 void CompressedSection<E>::copy_buf(Context<E> &ctx) {
-  memcpy(ctx.buf + this->shdr.sh_offset, contents.get(), this->shdr.sh_size);
+  u8 *dst = ctx.buf + this->shdr.sh_offset; // this section's spot in ctx.buf
+  memcpy(dst, &chdr, sizeof(chdr));         // ELF compression header first
+  contents->write_to(dst + sizeof(chdr));   // then the compressed stream
 }
 
 template <typename E>