diff --git a/elf/arch-arm32.cc b/elf/arch-arm32.cc
index 53497a50..97dd82c7 100644
--- a/elf/arch-arm32.cc
+++ b/elf/arch-arm32.cc
@@ -272,11 +272,11 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
       *(ul32 *)loc = sym.get_gottp_addr(ctx) + A - P;
       continue;
     case R_ARM_TLS_LE32:
-      *(ul32 *)loc = S + A - ctx.tls_begin + 8;
+      *(ul32 *)loc = S + A - ctx.tls_begin + E::tls_offset;
       continue;
     case R_ARM_TLS_GOTDESC:
       if (sym.get_tlsdesc_idx(ctx) == -1)
-        *(ul32 *)loc = S - ctx.tls_begin + 8;
+        *(ul32 *)loc = S - ctx.tls_begin + E::tls_offset;
       else
         *(ul32 *)loc = sym.get_tlsdesc_addr(ctx) + A - P - 6;
       continue;
diff --git a/elf/arch-arm64.cc b/elf/arch-arm64.cc
index 02d101b5..e9d0b9b0 100644
--- a/elf/arch-arm64.cc
+++ b/elf/arch-arm64.cc
@@ -279,14 +279,14 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
       *(ul32 *)loc |= bits(sym.get_gottp_addr(ctx) + A, 11, 3) << 10;
       continue;
     case R_AARCH64_TLSLE_ADD_TPREL_HI12: {
-      i64 val = S + A - ctx.tls_begin + 16;
+      i64 val = S + A - ctx.tls_begin + E::tls_offset;
       overflow_check(val, 0, (i64)1 << 24);
       *(ul32 *)loc |= bits(val, 23, 12) << 10;
       continue;
     }
     case R_AARCH64_TLSLE_ADD_TPREL_LO12:
     case R_AARCH64_TLSLE_ADD_TPREL_LO12_NC:
-      *(ul32 *)loc |= bits(S + A - ctx.tls_begin + 16, 11, 0) << 10;
+      *(ul32 *)loc |= bits(S + A - ctx.tls_begin + E::tls_offset, 11, 0) << 10;
       continue;
     case R_AARCH64_TLSGD_ADR_PAGE21: {
       i64 val = page(sym.get_tlsgd_addr(ctx) + A) - page(P);
@@ -300,7 +300,7 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
     case R_AARCH64_TLSDESC_ADR_PAGE21: {
       if (ctx.relax_tlsdesc && !sym.is_imported) {
         // adrp x0, 0 -> movz x0, #tls_offset_hi, lsl #16
-        i64 val = (S + A - ctx.tls_begin + 16);
+        i64 val = (S + A - ctx.tls_begin + E::tls_offset);
         overflow_check(val, -((i64)1 << 32), (i64)1 << 32);
         *(ul32 *)loc = 0xd2a00000 | (bits(val, 32, 16) << 5);
       } else {
@@ -313,7 +313,7 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
     case R_AARCH64_TLSDESC_LD64_LO12:
       if (ctx.relax_tlsdesc && !sym.is_imported) {
         // ldr x2, [x0] -> movk x0, #tls_offset_lo
-        u32 offset_lo = (S + A - ctx.tls_begin + 16) & 0xffff;
+        u32 offset_lo = (S + A - ctx.tls_begin + E::tls_offset) & 0xffff;
         *(ul32 *)loc = 0xf2800000 | (offset_lo << 5);
       } else {
         *(ul32 *)loc |= bits(sym.get_tlsdesc_addr(ctx) + A, 11, 3) << 10;
diff --git a/elf/elf.h b/elf/elf.h
index 240044a8..093835c9 100644
--- a/elf/elf.h
+++ b/elf/elf.h
@@ -1428,6 +1428,7 @@ struct ARM64 {
   static constexpr u32 plt_hdr_size = 32;
   static constexpr u32 plt_size = 16;
   static constexpr u32 pltgot_size = 16;
+  static constexpr u32 tls_offset = 16;
   static constexpr bool is_rel = false;
   static constexpr bool supports_tlsdesc = true;
 };
@@ -1461,6 +1462,7 @@ struct ARM32 {
   static constexpr u32 plt_hdr_size = 32;
   static constexpr u32 plt_size = 16;
   static constexpr u32 pltgot_size = 16;
+  static constexpr u32 tls_offset = 8;
   static constexpr bool is_rel = true;
   static constexpr bool supports_tlsdesc = true;
 };
@@ -1493,6 +1495,7 @@ struct RISCV64 {
   static constexpr u32 plt_hdr_size = 32;
   static constexpr u32 plt_size = 16;
   static constexpr u32 pltgot_size = 16;
+  static constexpr u32 tls_offset = 0;
   static constexpr bool is_rel = false;
   static constexpr bool supports_tlsdesc = false;
 };
diff --git a/elf/output-chunks.cc b/elf/output-chunks.cc
index 37378f31..a5efc443 100644
--- a/elf/output-chunks.cc
+++ b/elf/output-chunks.cc
@@ -356,7 +356,7 @@ void RelDynSection<E>::update_shdr(Context<E> &ctx) {
 
 template <typename E>
 static ElfRel<E> reloc(u64 offset, u32 type, u32 sym, i64 addend = 0) {
-  if constexpr (std::is_same_v<E, I386> || std::is_same_v<E, ARM32>)
+  if constexpr (E::is_rel)
     return {(u32)offset, (u8)type, sym};
   else
     return {offset, type, sym, addend};
@@ -1077,17 +1077,18 @@ std::vector<GotEntry<E>> GotSection<E>::get_entries(Context<E> &ctx) const {
       continue;
     }
 
-    // Otherwise, we know the offset at link-time, so fill the GOT entry.
+    // Otherwise, we know the offset from the thread pointer (TP) at
+    // link-time, so we can fill the GOT entry directly.
+    //
+    // On x86, TP (%gs for 32-bit, %fs for 64-bit) points to the end of
+    // all thread-local variables for a historical reason, so the offset
+    // we calculate here will be negative. On other architectures, TP
+    // points to an optional padding whose size is architecture-dependent
+    // followed by thread-local variables.
     if constexpr (std::is_same_v<E, X86_64> || std::is_same_v<E, I386>)
       entries.push_back({idx, sym->get_addr(ctx) - ctx.tls_end});
-    else if constexpr (std::is_same_v<E, ARM32>)
-      entries.push_back({idx, sym->get_addr(ctx) - ctx.tls_begin + 8});
-    else if constexpr (std::is_same_v<E, ARM64>)
-      entries.push_back({idx, sym->get_addr(ctx) - ctx.tls_begin + 16});
-    else if constexpr (std::is_same_v<E, RISCV64>)
-      entries.push_back({idx, sym->get_addr(ctx) - ctx.tls_begin});
     else
-      unreachable();
+      entries.push_back({idx, sym->get_addr(ctx) - ctx.tls_begin + E::tls_offset});
   }
 
   if (tlsld_idx != -1)
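
The three tls_offset constants added to elf/elf.h encode where the thread pointer (TP) sits relative to the TLS segment on each target. The sketch below is illustrative only, not mold source: tprel, tprel_x86, and the sample addresses are hypothetical, and tls_begin/tls_end stand in for the linker's Context fields. It shows how the single per-target constant replaces the hard-coded +8/+16/+0 terms this patch removes:

#include <cstdint>
#include <cstdio>

using u32 = uint32_t;
using u64 = uint64_t;
using i64 = int64_t;

// Illustrative sketch, not mold source. These mirror the constants added
// to elf/elf.h: the distance from TP to the start of the TLS segment.
struct ARM64   { static constexpr u32 tls_offset = 16; }; // 16-byte TCB before TLS
struct ARM32   { static constexpr u32 tls_offset = 8; };  //  8-byte TCB before TLS
struct RISCV64 { static constexpr u32 tls_offset = 0; };  // TP points straight at TLS

// TP-relative offset on targets where TP sits just below the TLS segment
// ("variant I" in TLS ABI terms). This is the expression the patch factors
// out of every relocation handler: S + A - tls_begin + tls_offset.
template <typename E>
i64 tprel(u64 sym_addr, u64 tls_begin) {
  return (i64)(sym_addr - tls_begin) + E::tls_offset;
}

// On x86/x86-64 ("variant II"), TP points to the *end* of the TLS segment,
// so TP-relative offsets come out negative and no tls_offset is involved.
i64 tprel_x86(u64 sym_addr, u64 tls_end) {
  return (i64)(sym_addr - tls_end);
}

int main() {
  u64 tls_begin = 0x1000, tls_end = 0x1100;
  u64 var = tls_begin + 16; // a thread-local variable 16 bytes into the segment

  std::printf("arm64:   %+lld\n", (long long)tprel<ARM64>(var, tls_begin));   // +32
  std::printf("arm32:   %+lld\n", (long long)tprel<ARM32>(var, tls_begin));   // +24
  std::printf("riscv64: %+lld\n", (long long)tprel<RISCV64>(var, tls_begin)); // +16
  std::printf("x86-64:  %+lld\n", (long long)tprel_x86(var, tls_end));        // -240
}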
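
The reloc() change in elf/output-chunks.cc uses the same trick for a different target property: E::is_rel distinguishes REL targets (i386 and ARM32 here), whose dynamic relocations carry no explicit addend, from RELA targets, whose records do. Below is a hedged sketch of that distinction; make_reloc is hypothetical and the struct layouts follow the generic ELF spec rather than mold's ElfRel definitions:

#include <cstdint>

// Illustrative sketch, not mold source.
//
// REL: the addend is stored in the bytes being relocated, so a record is
// just an offset plus a symbol/type word (Elf32_Rel in the ELF spec).
struct Rel32 { uint32_t r_offset; uint32_t r_info; };

// RELA: the addend travels inside the relocation record (Elf64_Rela).
struct Rela64 { uint64_t r_offset; uint64_t r_info; int64_t r_addend; };

struct ARM32 { static constexpr bool is_rel = true; };
struct ARM64 { static constexpr bool is_rel = false; };

// One `if constexpr` on a per-target constant replaces the growing
// std::is_same_v<E, I386> || std::is_same_v<E, ARM32> list; a new REL
// target only has to set is_rel = true in its arch struct.
template <typename E>
auto make_reloc(uint64_t offset, uint64_t info, int64_t addend = 0) {
  if constexpr (E::is_rel)
    return Rel32{(uint32_t)offset, (uint32_t)info}; // addend applied in-place
  else
    return Rela64{offset, info, addend};
}

// The caller stays target-generic; the record layout is picked at compile time.
auto r1 = make_reloc<ARM32>(0x1000, 21);       // 8-byte REL record
auto r2 = make_reloc<ARM64>(0x2000, 1027, 16); // 24-byte RELA record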