diff --git a/CMakeLists.txt b/CMakeLists.txt
index dcf935d0..7a6ed662 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -272,6 +272,12 @@ if(NOT APPLE AND NOT MSVC)
   target_link_options(mold PRIVATE -pthread)
 endif()
 
+# shm_open needs -lrt; link against it only if the library is found.
+find_library(LIBRT rt)
+if(LIBRT)
+  target_link_libraries(mold PRIVATE rt)
+endif()
+
 check_symbol_exists(madvise sys/mman.h HAVE_MADVISE)
 
 # Create a .cc file containing the current git hash for `mold --version`.
diff --git a/common/jobs-unix.cc b/common/jobs-unix.cc
index 9912ab52..0070cd88 100644
--- a/common/jobs-unix.cc
+++ b/common/jobs-unix.cc
@@ -8,43 +8,138 @@
 // This file implements a feature that limits the number of concurrent
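-// mold processes to just 1 for each user. It is intended to be used as
-// `MOLD_JOBS=1 ninja` or `MOLD_JOBS=1 make -j$(nproc)`.
+// mold processes for each user to the value of MOLD_JOBS. It is intended
+// to be used as `MOLD_JOBS=1 ninja` or `MOLD_JOBS=1 make -j$(nproc)`.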
+//
+// We can't use POSIX semaphores because the counter will not be
+// decremented automatically when a process exits abnormally. That would
+// result in a deadlock. Therefore, we use lockf-based regional file
+// locking instead. Unlike POSIX semaphores, the lock will be automatically
+// released on process termination.
+//
+// To wake processes that may be waiting on the lock file, we use a
+// pthread condition variable. On normal exit, mold sends notifications to
+// all waiting processes. In case of abnormal exit, we use
+// pthread_cond_timedwait so that waiters will not wait forever.
 
 #include "common.h"
 
+#include <atomic>
 #include <fcntl.h>
+#include <pthread.h>
 #include <pwd.h>
 #include <stdlib.h>
+#include <sys/mman.h>
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <unistd.h>
 
 namespace mold {
 
+static constexpr i64 MAX_JOBS = 128; // upper bound applied to MOLD_JOBS
+
+struct SharedData {
+  std::atomic_bool initialized; // set (before mu/cond init) by the initializer
+  pthread_mutex_t mu;           // process-shared robust mutex used with cond
+  pthread_cond_t cond;          // process-shared; broadcast on lock release
+};
+
+static int num_jobs = -1; // MOLD_JOBS value in effect; number of lock slots
 static int lock_fd = -1;
+static SharedData *shared_data = nullptr; // set in acquire_global_lock()
+
+// Returns MOLD_JOBS clamped to [0, MAX_JOBS]; 0 if it is unset or invalid.
+static i64 get_mold_jobs() {
+  char *env = getenv("MOLD_JOBS");
+  if (!env)
+    return 0;
+  // strtol never throws, whereas std::stol throws on e.g. MOLD_JOBS=abc.
+  char *end = nullptr;
+  i64 jobs = strtol(env, &end, 10);
+  return (end == env || jobs < 0) ? 0 : std::min(jobs, MAX_JOBS);
+}
+
+// Claims the first free one-byte slot among the first num_jobs bytes of the
+// lock file without blocking; returns false if no slot could be locked.
+static bool do_lock() {
+  for (i64 slot = 0; slot < num_jobs; slot++)
+    if (lseek(lock_fd, slot, SEEK_SET); lockf(lock_fd, F_TLOCK, 1) == 0)
+      return true;
+  return false;
+}
+
+// Maps the per-user shm object; its first user initializes the mutex/condvar.
+static SharedData *get_shared_data() {
+  auto die = [](const char *what) { perror(what); exit(-1); };
+  std::string name = "/mold-signal-" + std::to_string(getuid());
+  int shm_fd = shm_open(name.c_str(), O_CREAT | O_RDWR, 0600);
+  if (shm_fd == -1)
+    die("shm_open");
+
+  i64 size = sizeof(SharedData);
+  ftruncate(shm_fd, size); // new objects start out empty
+  void *buf = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, shm_fd, 0);
+  if (buf == MAP_FAILED)
+    die("mmap"); // MAP_FAILED is not a usable pointer
+  close(shm_fd);
+
+  SharedData *data = (SharedData *)buf;
+  if (data->initialized.exchange(true) == false) {
+    pthread_mutexattr_t mu_attr;
+    pthread_mutexattr_init(&mu_attr);
+    pthread_mutexattr_setpshared(&mu_attr, PTHREAD_PROCESS_SHARED);
+    pthread_mutexattr_setrobust(&mu_attr, PTHREAD_MUTEX_ROBUST);
+    pthread_mutex_init(&data->mu, &mu_attr);
+
+    pthread_condattr_t cond_attr;
+    pthread_condattr_init(&cond_attr);
+    pthread_condattr_setpshared(&cond_attr, PTHREAD_PROCESS_SHARED);
+    pthread_cond_init(&data->cond, &cond_attr);
+  }
+  return data;
+}
 
 void acquire_global_lock() {
-  char *jobs = getenv("MOLD_JOBS");
-  if (!jobs || jobs != "1"s)
+  num_jobs = get_mold_jobs();
+  if (num_jobs == 0)
     return;
 
+  shared_data = get_shared_data();
+
   std::string path;
   if (char *dir = getenv("XDG_RUNTIME_DIR"))
-    path = dir + "/mold-lock"s;
+    path = dir + "/mold.lock"s;
   else
-    path = "/tmp/mold-lock-"s + getpwuid(getuid())->pw_name;
+    path = "/tmp/mold-" + std::to_string(getuid()) + ".lock";
 
-  int fd = open(path.c_str(), O_WRONLY | O_CREAT | O_CLOEXEC, 0600);
-  if (fd == -1)
+  lock_fd = open(path.c_str(), O_WRONLY | O_CREAT | O_CLOEXEC, 0600);
+  if (lock_fd == -1 || do_lock())
     return;
 
-  if (lockf(fd, F_LOCK, 0) == -1)
-    return;
-  lock_fd = fd;
+  pthread_mutex_t *mu = &shared_data->mu;
+  pthread_cond_t *cond = &shared_data->cond;
+
+  // If the previous process got killed while holding the mutex, the
+  // mutex has became inconsistent. We need to fix it in that case.
+  if (pthread_mutex_lock(mu) == EOWNERDEAD)
+    pthread_mutex_consistent(mu);
+
+  for (;;) {
+    struct timespec ts;
+    clock_gettime(CLOCK_REALTIME, &ts);
+    ts.tv_sec += 1;
+
+    int r = pthread_cond_timedwait(cond, mu, &ts);
+    if (do_lock() || r != ETIMEDOUT)
+      break;
+  }
+
+  pthread_mutex_unlock(mu);
 }
 
 void release_global_lock() {
-  if (lock_fd != -1)
-    close(lock_fd);
+  if (lock_fd != -1) {
+    close(lock_fd); // closing the file releases our lockf slot
+    pthread_cond_broadcast(&shared_data->cond); // wake waiters to retry
+  }
 }
 
 } // namespace mold
diff --git a/docs/mold.md b/docs/mold.md
index dae4ab78..7ba64401 100644
--- a/docs/mold.md
+++ b/docs/mold.md
@@ -832,8 +832,6 @@ but as `-o magic`.
   consider setting this environment variable to `1` to see if it addresses the
   OOM issue.
 
-  Currently, any value other than `1` is silently ignored.
-
 * `MOLD_DEBUG`:
   If this variable is set to a non-empty string, `mold` embeds its
   command-line options in the output file's `.comment` section.
diff --git a/test/elf/mold-jobs.sh b/test/elf/mold-jobs.sh
new file mode 100755
index 00000000..13b1d28c
--- /dev/null
+++ b/test/elf/mold-jobs.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+. $(dirname $0)/common.inc
+
+cat <<EOF | $CC -o $t/a.o -c -xc - -fno-PIE
+#include <stdio.h>
+int main() {
+  printf("Hello world\n");
+}
+EOF
+# Run 20 links in parallel with MOLD_JOBS=2 set for each of them.
+for i in $(seq 1 20); do
+  rm -f $t/exe$i
+  ( MOLD_JOBS=2 $CC -B. -o $t/exe$i $t/a.o -no-pie; echo $i ) &
+done
+wait
+
+# Each of the 20 outputs has to run and print the greeting.
+for i in $(seq 1 20); do
+  $QEMU $t/exe$i | grep -q 'Hello world'
+done