diff --git a/compiler/build/Cargo.toml b/compiler/build/Cargo.toml index a2a0eea088..4b48dd2d26 100644 --- a/compiler/build/Cargo.toml +++ b/compiler/build/Cargo.toml @@ -54,3 +54,8 @@ maplit = "1.0.1" indoc = "0.3.3" quickcheck = "0.8" quickcheck_macros = "0.8" + +[features] +target-arm = [] +target-aarch64 = [] +target-webassembly = [] diff --git a/compiler/builtins/bitcode/src/main.zig b/compiler/builtins/bitcode/src/main.zig index ed40fb62d2..500f482eb1 100644 --- a/compiler/builtins/bitcode/src/main.zig +++ b/compiler/builtins/bitcode/src/main.zig @@ -19,6 +19,7 @@ comptime { exportStrFn(str.countSegments, "count_segments"); exportStrFn(str.countGraphemeClusters, "count_grapheme_clusters"); exportStrFn(str.startsWith, "starts_with"); + exportStrFn(str.strConcat, "concat"); } // Export helpers - Must be run inside a comptime diff --git a/compiler/builtins/bitcode/src/str.zig b/compiler/builtins/bitcode/src/str.zig index 0c9958622f..ddbb09b06e 100644 --- a/compiler/builtins/bitcode/src/str.zig +++ b/compiler/builtins/bitcode/src/str.zig @@ -122,6 +122,10 @@ const RocStr = extern struct { return if (self.is_small_str()) small_len else big_len; } + pub fn is_empty(self: RocStr) bool { + return self.len() == 0; + } + // Given a pointer to some bytes, write the first (len) bytes of this // RocStr's contents into it. // @@ -586,3 +590,145 @@ test "startsWith: 12345678912345678910 starts with 123456789123456789" { expect(startsWith(str_ptr, str_len, prefix_ptr, prefix_len)); } + +// Str.concat + +test "RocStr.concat: small concat small" { + const str1_len = 3; + var str1: [str1_len]u8 = "foo".*; + const str1_ptr: [*]u8 = &str1; + var roc_str1 = RocStr.init(str1_ptr, str1_len); + + const str2_len = 3; + var str2: [str2_len]u8 = "abc".*; + const str2_ptr: [*]u8 = &str2; + var roc_str2 = RocStr.init(str2_ptr, str2_len); + + const str3_len = 6; + var str3: [str3_len]u8 = "fooabc".*; + const str3_ptr: [*]u8 = &str3; + var roc_str3 = RocStr.init(str3_ptr, str3_len); + + const result = strConcat(8, InPlace.Clone, roc_str1, roc_str2); + + expect(roc_str3.eq(result)); + + roc_str1.drop(); + roc_str2.drop(); + roc_str3.drop(); + result.drop(); +} + +pub fn strConcat(ptr_size: u32, result_in_place: InPlace, arg1: RocStr, arg2: RocStr) callconv(.C) RocStr { + return switch (ptr_size) { + 4 => strConcatHelp(i32, result_in_place, arg1, arg2), + 8 => strConcatHelp(i64, result_in_place, arg1, arg2), + else => unreachable, + }; +} + +fn strConcatHelp(comptime T: type, result_in_place: InPlace, arg1: RocStr, arg2: RocStr) RocStr { + if (arg1.is_empty()) { + return cloneNonemptyStr(T, result_in_place, arg2); + } else if (arg2.is_empty()) { + return cloneNonemptyStr(T, result_in_place, arg1); + } else { + const combined_length = arg1.len() + arg2.len(); + + const small_str_bytes = 2 * @sizeOf(T); + const result_is_big = combined_length >= small_str_bytes; + + if (result_is_big) { + var result = allocate_str(T, result_in_place, combined_length); + + { + const old_if_small = &@bitCast([16]u8, arg1); + const old_if_big = @ptrCast([*]u8, arg1.str_bytes); + const old_bytes = if (arg1.is_small_str()) old_if_small else old_if_big; + + const new_bytes: [*]u8 = @ptrCast([*]u8, result.str_bytes); + + @memcpy(new_bytes, old_bytes, arg1.len()); + } + + { + const old_if_small = &@bitCast([16]u8, arg2); + const old_if_big = @ptrCast([*]u8, arg2.str_bytes); + const old_bytes = if (arg2.is_small_str()) old_if_small else old_if_big; + + const new_bytes = @ptrCast([*]u8, result.str_bytes) + arg1.len(); + + @memcpy(new_bytes, 
old_bytes, arg2.len()); + } + + return result; + } else { + var result = [16]u8{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + + // if the result is small, then for sure arg1 and arg2 are also small + + { + var old_bytes: [*]u8 = @ptrCast([*]u8, &@bitCast([16]u8, arg1)); + var new_bytes: [*]u8 = @ptrCast([*]u8, &result); + + @memcpy(new_bytes, old_bytes, arg1.len()); + } + + { + var old_bytes: [*]u8 = @ptrCast([*]u8, &@bitCast([16]u8, arg2)); + var new_bytes = @ptrCast([*]u8, &result) + arg1.len(); + + @memcpy(new_bytes, old_bytes, arg2.len()); + } + + const mask: u8 = 0b1000_0000; + const final_byte = @truncate(u8, combined_length) | mask; + + result[small_str_bytes - 1] = final_byte; + + return @bitCast(RocStr, result); + } + + return result; + } +} + +const InPlace = packed enum(u8) { + InPlace, + Clone, +}; + +fn cloneNonemptyStr(comptime T: type, in_place: InPlace, str: RocStr) RocStr { + if (str.is_small_str() or str.is_empty()) { + // just return the bytes + return str; + } else { + var new_str = allocate_str(T, in_place, str.str_len); + + var old_bytes: [*]u8 = @ptrCast([*]u8, str.str_bytes); + var new_bytes: [*]u8 = @ptrCast([*]u8, new_str.str_bytes); + + @memcpy(new_bytes, old_bytes, str.str_len); + + return new_str; + } +} + +fn allocate_str(comptime T: type, in_place: InPlace, number_of_chars: u64) RocStr { + const length = @sizeOf(T) + number_of_chars; + var new_bytes: [*]T = @ptrCast([*]T, @alignCast(@alignOf(T), malloc(length))); + + if (in_place == InPlace.InPlace) { + new_bytes[0] = @intCast(T, number_of_chars); + } else { + new_bytes[0] = std.math.minInt(T); + } + + var first_element = @ptrCast([*]align(@alignOf(T)) u8, new_bytes); + first_element += 8; + + return RocStr{ + .str_bytes = first_element, + .str_len = number_of_chars, + }; +} diff --git a/compiler/builtins/src/bitcode.rs b/compiler/builtins/src/bitcode.rs index aefb7bd186..2ad70214e6 100644 --- a/compiler/builtins/src/bitcode.rs +++ b/compiler/builtins/src/bitcode.rs @@ -24,6 +24,7 @@ pub const NUM_IS_FINITE: &str = "roc_builtins.num.is_finite"; pub const NUM_POW_INT: &str = "roc_builtins.num.pow_int"; pub const STR_COUNT_SEGMENTS: &str = "roc_builtins.str.count_segments"; +pub const STR_CONCAT: &str = "roc_builtins.str.concat"; pub const STR_STR_SPLIT_IN_PLACE: &str = "roc_builtins.str.str_split_in_place"; pub const STR_COUNT_GRAPEHEME_CLUSTERS: &str = "roc_builtins.str.count_grapheme_clusters"; pub const STR_STARTS_WITH: &str = "roc_builtins.str.starts_with"; diff --git a/compiler/gen/src/llvm/build.rs b/compiler/gen/src/llvm/build.rs index cf88609054..b42461d47d 100644 --- a/compiler/gen/src/llvm/build.rs +++ b/compiler/gen/src/llvm/build.rs @@ -604,7 +604,9 @@ pub fn build_exp_expr<'a, 'ctx, 'env>( match expr { Literal(literal) => build_exp_literal(env, literal), - RunLowLevel(op, symbols) => run_low_level(env, scope, parent, layout, *op, symbols), + RunLowLevel(op, symbols) => { + run_low_level(env, layout_ids, scope, parent, layout, *op, symbols) + } ForeignCall { foreign_symbol, @@ -1165,12 +1167,10 @@ fn list_literal<'a, 'ctx, 'env>( let builder = env.builder; let len_u64 = elems.len() as u64; - let elem_bytes = elem_layout.stack_size(env.ptr_bytes) as u64; let ptr = { - let bytes_len = elem_bytes * len_u64; let len_type = env.ptr_int(); - let len = len_type.const_int(bytes_len, false); + let len = len_type.const_int(len_u64, false); allocate_list(env, inplace, elem_layout, len) @@ -2383,6 +2383,7 @@ fn call_with_args<'a, 'ctx, 'env>( } #[derive(Copy, Clone)] +#[repr(u8)] pub enum InPlace { 
InPlace, Clone, @@ -2409,6 +2410,7 @@ pub static COLD_CALL_CONV: u32 = 9; fn run_low_level<'a, 'ctx, 'env>( env: &Env<'a, 'ctx, 'env>, + layout_ids: &mut LayoutIds<'a>, scope: &Scope<'a, 'ctx>, parent: FunctionValue<'ctx>, layout: &Layout<'a>, @@ -2522,7 +2524,16 @@ fn run_low_level<'a, 'ctx, 'env>( let inplace = get_inplace_from_layout(layout); - list_map(env, inplace, parent, func, func_layout, list, list_layout) + list_map( + env, + layout_ids, + inplace, + parent, + func, + func_layout, + list, + list_layout, + ) } ListKeepIf => { // List.keepIf : List elem, (elem -> Bool) -> List elem diff --git a/compiler/gen/src/llvm/build_list.rs b/compiler/gen/src/llvm/build_list.rs index 904c3f0fe9..9c6c20bbb7 100644 --- a/compiler/gen/src/llvm/build_list.rs +++ b/compiler/gen/src/llvm/build_list.rs @@ -3,12 +3,13 @@ use crate::llvm::build::{ }; use crate::llvm::compare::build_eq; use crate::llvm::convert::{basic_type_from_layout, collection, get_ptr_type}; +use crate::llvm::refcounting::decrement_refcount_layout; use inkwell::builder::Builder; use inkwell::context::Context; use inkwell::types::{BasicTypeEnum, PointerType}; use inkwell::values::{BasicValueEnum, FunctionValue, IntValue, PointerValue, StructValue}; use inkwell::{AddressSpace, IntPredicate}; -use roc_mono::layout::{Builtin, Layout, MemoryMode}; +use roc_mono::layout::{Builtin, Layout, LayoutIds, MemoryMode}; /// List.single : a -> List a pub fn list_single<'a, 'ctx, 'env>( @@ -1318,8 +1319,10 @@ pub fn list_keep_if_help<'a, 'ctx, 'env>( } /// List.map : List before, (before -> after) -> List after +#[allow(clippy::too_many_arguments)] pub fn list_map<'a, 'ctx, 'env>( env: &Env<'a, 'ctx, 'env>, + layout_ids: &mut LayoutIds<'a>, inplace: InPlace, parent: FunctionValue<'ctx>, func: BasicValueEnum<'ctx>, @@ -1365,7 +1368,11 @@ pub fn list_map<'a, 'ctx, 'env>( incrementing_elem_loop(builder, ctx, parent, list_ptr, len, "#index", list_loop); - store_list(env, ret_list_ptr, len) + let result = store_list(env, ret_list_ptr, len); + + decrement_refcount_layout(env, parent, layout_ids, list, list_layout); + + result }; if_list_is_not_empty(env, parent, non_empty_fn, list, list_layout, "List.map") @@ -2043,7 +2050,6 @@ pub fn allocate_list<'a, 'ctx, 'env>( let len_type = env.ptr_int(); let elem_bytes = elem_layout.stack_size(env.ptr_bytes) as u64; let bytes_per_element = len_type.const_int(elem_bytes, false); - let number_of_data_bytes = builder.build_int_mul(bytes_per_element, length, "data_length"); let rc1 = match inplace { diff --git a/compiler/gen/src/llvm/build_str.rs b/compiler/gen/src/llvm/build_str.rs index 90dd22e58e..6b4a885331 100644 --- a/compiler/gen/src/llvm/build_str.rs +++ b/compiler/gen/src/llvm/build_str.rs @@ -1,9 +1,7 @@ use crate::llvm::build::{ call_bitcode_fn, call_void_bitcode_fn, ptr_from_symbol, Env, InPlace, Scope, }; -use crate::llvm::build_list::{ - allocate_list, build_basic_phi2, empty_list, incrementing_elem_loop, load_list_ptr, store_list, -}; +use crate::llvm::build_list::{allocate_list, build_basic_phi2, load_list_ptr, store_list}; use crate::llvm::convert::collection; use inkwell::builder::Builder; use inkwell::types::BasicTypeEnum; @@ -90,333 +88,117 @@ pub fn str_split<'a, 'ctx, 'env>( ) } +/* +fn cast_to_zig_str( + env: &Env<'a, 'ctx, 'env>, + str_as_struct: StructValue<'ctx>, +) -> BasicValueEnum<'ctx> { + // get the RocStr type defined by zig + let roc_str_type = env.module.get_struct_type("str.RocStr").unwrap(); + + // convert `{ *mut u8, i64 }` to `RocStr` + builder.build_bitcast(str_as_struct, 
roc_str_type, "convert_to_zig_rocstr"); +} + +fn cast_from_zig_str( + env: &Env<'a, 'ctx, 'env>, + str_as_struct: StructValue<'ctx>, +) -> BasicValueEnum<'ctx> { + let ret_type = BasicTypeEnum::StructType(collection(ctx, env.ptr_bytes)); + + // convert `RocStr` to `{ *mut u8, i64 }` + builder.build_bitcast(str_as_struct, ret_type, "convert_from_zig_rocstr"); +} +*/ + +fn str_symbol_to_i128<'a, 'ctx, 'env>( + env: &Env<'a, 'ctx, 'env>, + scope: &Scope<'a, 'ctx>, + symbol: Symbol, +) -> IntValue<'ctx> { + let str_ptr = ptr_from_symbol(scope, symbol); + + let i128_ptr = env + .builder + .build_bitcast( + *str_ptr, + env.context.i128_type().ptr_type(AddressSpace::Generic), + "cast", + ) + .into_pointer_value(); + + env.builder + .build_load(i128_ptr, "load_as_i128") + .into_int_value() +} + +fn zig_str_to_struct<'a, 'ctx, 'env>( + env: &Env<'a, 'ctx, 'env>, + zig_str: StructValue<'ctx>, +) -> StructValue<'ctx> { + let builder = env.builder; + + // get the RocStr type defined by zig + let zig_str_type = env.module.get_struct_type("str.RocStr").unwrap(); + + let ret_type = BasicTypeEnum::StructType(collection(env.context, env.ptr_bytes)); + + // a roundabout way of casting (LLVM does not accept a standard bitcast) + let allocation = builder.build_alloca(zig_str_type, "zig_result"); + + builder.build_store(allocation, zig_str); + + let ptr3 = builder + .build_bitcast( + allocation, + env.context.i128_type().ptr_type(AddressSpace::Generic), + "cast", + ) + .into_pointer_value(); + + let ptr4 = builder + .build_bitcast( + ptr3, + ret_type.into_struct_type().ptr_type(AddressSpace::Generic), + "cast", + ) + .into_pointer_value(); + + builder.build_load(ptr4, "load").into_struct_value() +} + /// Str.concat : Str, Str -> Str pub fn str_concat<'a, 'ctx, 'env>( env: &Env<'a, 'ctx, 'env>, inplace: InPlace, scope: &Scope<'a, 'ctx>, - parent: FunctionValue<'ctx>, - first_str_symbol: Symbol, - second_str_symbol: Symbol, + _parent: FunctionValue<'ctx>, + str1_symbol: Symbol, + str2_symbol: Symbol, ) -> BasicValueEnum<'ctx> { - let builder = env.builder; - let ctx = env.context; + // swap the arguments; second argument comes before the second in the output string + let str1_i128 = str_symbol_to_i128(env, scope, str1_symbol); + let str2_i128 = str_symbol_to_i128(env, scope, str2_symbol); - let second_str_ptr = ptr_from_symbol(scope, second_str_symbol); - let first_str_ptr = ptr_from_symbol(scope, first_str_symbol); - - let ret_type = BasicTypeEnum::StructType(collection(ctx, env.ptr_bytes)); - - load_str( + let zig_result = call_bitcode_fn( env, - parent, - *second_str_ptr, - ret_type, - |second_str_ptr, second_str_len, second_str_smallness| { - load_str( - env, - parent, - *first_str_ptr, - ret_type, - |first_str_ptr, first_str_len, first_str_smallness| { - // first_str_len > 0 - // We do this check to avoid allocating memory. If the first input - // str is empty, then we can just return the second str cloned - let first_str_length_comparison = str_is_not_empty(env, first_str_len); - - let if_first_str_is_empty = || { - // second_str_len > 0 - // We do this check to avoid allocating memory. 
If the second input - // str is empty, then we can just return an empty str - let second_str_length_comparison = str_is_not_empty(env, second_str_len); - - let if_second_str_is_nonempty = || { - let (new_wrapper, _) = clone_nonempty_str( - env, - inplace, - second_str_smallness, - second_str_len, - second_str_ptr, - ); - - BasicValueEnum::StructValue(new_wrapper) - }; - - let if_second_str_is_empty = || empty_list(env); - - build_basic_phi2( - env, - parent, - second_str_length_comparison, - if_second_str_is_nonempty, - if_second_str_is_empty, - ret_type, - ) - }; - - let if_first_str_is_not_empty = || { - let if_second_str_is_empty = || { - let (new_wrapper, _) = clone_nonempty_str( - env, - inplace, - first_str_smallness, - first_str_len, - first_str_ptr, - ); - - BasicValueEnum::StructValue(new_wrapper) - }; - - // second_str_len > 0 - // We do this check to avoid allocating memory. If the second input - // str is empty, then we can just return the first str cloned - let second_str_length_comparison = str_is_not_empty(env, second_str_len); - - let if_second_str_is_not_empty = || { - let combined_str_len = builder.build_int_add( - first_str_len, - second_str_len, - "add_list_lengths", - ); - - // The combined string is big iff its length is - // greater than or equal to the size in memory - // of a small str (e.g. len >= 16 on 64-bit targets) - let is_big = env.builder.build_int_compare( - IntPredicate::UGE, - combined_str_len, - env.ptr_int().const_int(env.small_str_bytes() as u64, false), - "str_is_big", - ); - - let if_big = || { - let combined_str_ptr = - allocate_list(env, inplace, &CHAR_LAYOUT, combined_str_len); - - // TODO replace FIRST_LOOP with a memcpy! - // FIRST LOOP - let first_loop = |first_index, first_str_elem| { - // The pointer to the element in the combined list - let combined_str_elem_ptr = unsafe { - builder.build_in_bounds_gep( - combined_str_ptr, - &[first_index], - "load_index_combined_list", - ) - }; - - // Mutate the new array in-place to change the element. - builder.build_store(combined_str_elem_ptr, first_str_elem); - }; - - let index_name = "#index"; - - let index_alloca = incrementing_elem_loop( - builder, - ctx, - parent, - first_str_ptr, - first_str_len, - index_name, - first_loop, - ); - - // Reset the index variable to 0 - builder - .build_store(index_alloca, ctx.i64_type().const_int(0, false)); - - // TODO replace SECOND_LOOP with a memcpy! - // SECOND LOOP - let second_loop = |second_index, second_str_elem| { - // The pointer to the element in the combined str. - // Note that the pointer does not start at the index - // 0, it starts at the index of first_str_len. In that - // sense it is "offset". - let offset_combined_str_char_ptr = unsafe { - builder.build_in_bounds_gep( - combined_str_ptr, - &[first_str_len], - "elem", - ) - }; - - // The pointer to the char from the second str - // in the combined list - let combined_str_char_ptr = unsafe { - builder.build_in_bounds_gep( - offset_combined_str_char_ptr, - &[second_index], - "load_index_combined_list", - ) - }; - - // Mutate the new array in-place to change the element. 
- builder.build_store(combined_str_char_ptr, second_str_elem); - }; - - incrementing_elem_loop( - builder, - ctx, - parent, - second_str_ptr, - second_str_len, - index_name, - second_loop, - ); - - store_list(env, combined_str_ptr, combined_str_len) - }; - - let if_small = || { - let combined_str_ptr = builder.build_array_alloca( - ctx.i8_type(), - ctx.i8_type().const_int(env.small_str_bytes() as u64, false), - "alloca_small_str", - ); - - // TODO replace FIRST_LOOP with a memcpy! - // FIRST LOOP - let first_loop = |first_index, first_str_elem| { - // The pointer to the element in the combined list - let combined_str_elem_ptr = unsafe { - builder.build_in_bounds_gep( - combined_str_ptr, - &[first_index], - "load_index_combined_list", - ) - }; - - // Mutate the new array in-place to change the element. - builder.build_store(combined_str_elem_ptr, first_str_elem); - }; - - let index_name = "#index"; - - let index_alloca = incrementing_elem_loop( - builder, - ctx, - parent, - first_str_ptr, - first_str_len, - index_name, - first_loop, - ); - - // Reset the index variable to 0 - builder - .build_store(index_alloca, ctx.i64_type().const_int(0, false)); - - // TODO replace SECOND_LOOP with a memcpy! - // SECOND LOOP - let second_loop = |second_index, second_str_elem| { - // The pointer to the element in the combined str. - // Note that the pointer does not start at the index - // 0, it starts at the index of first_str_len. In that - // sense it is "offset". - let offset_combined_str_char_ptr = unsafe { - builder.build_in_bounds_gep( - combined_str_ptr, - &[first_str_len], - "elem", - ) - }; - - // The pointer to the char from the second str - // in the combined list - let combined_str_char_ptr = unsafe { - builder.build_in_bounds_gep( - offset_combined_str_char_ptr, - &[second_index], - "load_index_combined_list", - ) - }; - - // Mutate the new array in-place to change the element. - builder.build_store(combined_str_char_ptr, second_str_elem); - }; - - incrementing_elem_loop( - builder, - ctx, - parent, - second_str_ptr, - second_str_len, - index_name, - second_loop, - ); - - let final_byte = builder.build_int_cast( - combined_str_len, - ctx.i8_type(), - "str_len_to_i8", - ); - - let final_byte = builder.build_or( - final_byte, - ctx.i8_type().const_int(0b1000_0000, false), - "str_len_set_discriminant", - ); - - let final_byte_ptr = unsafe { - builder.build_in_bounds_gep( - combined_str_ptr, - &[ctx - .i8_type() - .const_int(env.small_str_bytes() as u64 - 1, false)], - "str_literal_final_byte", - ) - }; - - builder.build_store(final_byte_ptr, final_byte); - - builder.build_load( - builder - .build_bitcast( - combined_str_ptr, - collection(ctx, env.ptr_bytes) - .ptr_type(AddressSpace::Generic), - "cast_collection", - ) - .into_pointer_value(), - "small_str_array", - ) - }; - - // If the combined length fits in a small string, - // write into a small string! 
- build_basic_phi2( - env, - parent, - is_big, - // the result of a Str.concat is most likely big - if_big, - if_small, - BasicTypeEnum::StructType(collection(ctx, env.ptr_bytes)), - ) - }; - - build_basic_phi2( - env, - parent, - second_str_length_comparison, - if_second_str_is_not_empty, - if_second_str_is_empty, - BasicTypeEnum::StructType(collection(ctx, env.ptr_bytes)), - ) - }; - - build_basic_phi2( - env, - parent, - first_str_length_comparison, - if_first_str_is_not_empty, - if_first_str_is_empty, - BasicTypeEnum::StructType(collection(ctx, env.ptr_bytes)), - ) - }, - ) - }, + &[ + env.context + .i32_type() + .const_int(env.ptr_bytes as u64, false) + .into(), + env.context + .i8_type() + .const_int(inplace as u64, false) + .into(), + str1_i128.into(), + str2_i128.into(), + ], + &bitcode::STR_CONCAT, ) + .into_struct_value(); + + zig_str_to_struct(env, zig_result).into() } /// Obtain the string's length, cast from i8 to usize @@ -511,82 +293,6 @@ enum Smallness { Big, } -fn clone_nonempty_str<'a, 'ctx, 'env>( - env: &Env<'a, 'ctx, 'env>, - inplace: InPlace, - smallness: Smallness, - len: IntValue<'ctx>, - bytes_ptr: PointerValue<'ctx>, -) -> (StructValue<'ctx>, PointerValue<'ctx>) { - let builder = env.builder; - let ctx = env.context; - let ptr_bytes = env.ptr_bytes; - - // Allocate space for the new str that we'll copy into. - match smallness { - Smallness::Small => { - let wrapper_struct_ptr = cast_str_bytes_to_wrapper(env, bytes_ptr); - let wrapper_struct = builder.build_load(wrapper_struct_ptr, "str_wrapper"); - let alloca = builder.build_alloca(collection(ctx, ptr_bytes), "small_str_clone"); - - builder.build_store(alloca, wrapper_struct); - - (wrapper_struct.into_struct_value(), alloca) - } - Smallness::Big => { - let clone_ptr = allocate_list(env, inplace, &CHAR_LAYOUT, len); - - // TODO check if malloc returned null; if so, runtime error for OOM! - - // Copy the bytes from the original array into the new - // one we just malloc'd. 
- builder - .build_memcpy(clone_ptr, ptr_bytes, bytes_ptr, ptr_bytes, len) - .unwrap(); - - // Create a fresh wrapper struct for the newly populated array - let struct_type = collection(ctx, env.ptr_bytes); - let mut struct_val; - - // Store the pointer - struct_val = builder - .build_insert_value( - struct_type.get_undef(), - clone_ptr, - Builtin::WRAPPER_PTR, - "insert_ptr", - ) - .unwrap(); - - // Store the length - struct_val = builder - .build_insert_value(struct_val, len, Builtin::WRAPPER_LEN, "insert_len") - .unwrap(); - - let answer = builder - .build_bitcast( - struct_val.into_struct_value(), - collection(ctx, ptr_bytes), - "cast_collection", - ) - .into_struct_value(); - - (answer, clone_ptr) - } - } -} - -fn cast_str_bytes_to_wrapper<'a, 'ctx, 'env>( - env: &Env<'a, 'ctx, 'env>, - bytes_ptr: PointerValue<'ctx>, -) -> PointerValue<'ctx> { - let struct_ptr_type = collection(env.context, env.ptr_bytes).ptr_type(AddressSpace::Generic); - - env.builder - .build_bitcast(bytes_ptr, struct_ptr_type, "str_as_struct_ptr") - .into_pointer_value() -} - fn cast_str_wrapper_to_array<'a, 'ctx, 'env>( env: &Env<'a, 'ctx, 'env>, wrapper_ptr: PointerValue<'ctx>, @@ -661,6 +367,7 @@ fn big_str_len<'ctx>(builder: &Builder<'ctx>, wrapper_struct: StructValue<'ctx>) .into_int_value() } +#[allow(dead_code)] fn str_is_not_empty<'ctx>(env: &Env<'_, 'ctx, '_>, len: IntValue<'ctx>) -> IntValue<'ctx> { env.builder.build_int_compare( IntPredicate::UGT, diff --git a/compiler/gen_dev/Cargo.toml b/compiler/gen_dev/Cargo.toml index e27ab63099..f7c41ebbe0 100644 --- a/compiler/gen_dev/Cargo.toml +++ b/compiler/gen_dev/Cargo.toml @@ -42,3 +42,6 @@ bumpalo = { version = "3.2", features = ["collections"] } libc = "0.2" tempfile = "3.1.0" itertools = "0.9" + +[features] +target-aarch64 = ["roc_build/target-aarch64"] diff --git a/compiler/gen_dev/src/generic64/aarch64.rs b/compiler/gen_dev/src/generic64/aarch64.rs new file mode 100644 index 0000000000..df58823d12 --- /dev/null +++ b/compiler/gen_dev/src/generic64/aarch64.rs @@ -0,0 +1,814 @@ +use crate::generic64::{Assembler, CallConv, GPRegTrait}; +use bumpalo::collections::Vec; + +#[derive(Copy, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Debug)] +#[allow(dead_code)] +pub enum AArch64GPReg { + X0 = 0, + X1 = 1, + X2 = 2, + X3 = 3, + X4 = 4, + X5 = 5, + X6 = 6, + X7 = 7, + XR = 8, + X9 = 9, + X10 = 10, + X11 = 11, + X12 = 12, + X13 = 13, + X14 = 14, + X15 = 15, + IP0 = 16, + IP1 = 17, + PR = 18, + X19 = 19, + X20 = 20, + X21 = 21, + X22 = 22, + X23 = 23, + X24 = 24, + X25 = 25, + X26 = 26, + X27 = 27, + X28 = 28, + FP = 29, + LR = 30, + /// This can mean Zero or Stack Pointer depending on the context. + ZRSP = 31, +} + +impl GPRegTrait for AArch64GPReg {} + +pub struct AArch64Assembler {} + +// AArch64Call may need to eventually be split by OS, +// but I think with how we use it, they may all be the same. +pub struct AArch64Call {} + +const STACK_ALIGNMENT: u8 = 16; + +impl CallConv for AArch64Call { + const GP_PARAM_REGS: &'static [AArch64GPReg] = &[ + AArch64GPReg::X0, + AArch64GPReg::X1, + AArch64GPReg::X2, + AArch64GPReg::X3, + AArch64GPReg::X4, + AArch64GPReg::X5, + AArch64GPReg::X6, + AArch64GPReg::X7, + ]; + const GP_RETURN_REGS: &'static [AArch64GPReg] = Self::GP_PARAM_REGS; + const GP_DEFAULT_FREE_REGS: &'static [AArch64GPReg] = &[ + // The regs we want to use first should be at the end of this vec. 
+ // We will use pop to get which reg to use next + + // Don't use frame pointer: AArch64GPReg::FP, + // Don't user indirect result location: AArch64GPReg::XR, + // Don't use platform register: AArch64GPReg::PR, + // Don't use link register: AArch64GPReg::LR, + // Don't use zero register/stack pointer: AArch64GPReg::ZRSP, + + // Use callee saved regs last. + AArch64GPReg::X19, + AArch64GPReg::X20, + AArch64GPReg::X21, + AArch64GPReg::X22, + AArch64GPReg::X23, + AArch64GPReg::X24, + AArch64GPReg::X25, + AArch64GPReg::X26, + AArch64GPReg::X27, + AArch64GPReg::X28, + // Use caller saved regs first. + AArch64GPReg::X0, + AArch64GPReg::X1, + AArch64GPReg::X2, + AArch64GPReg::X3, + AArch64GPReg::X4, + AArch64GPReg::X5, + AArch64GPReg::X6, + AArch64GPReg::X7, + AArch64GPReg::X9, + AArch64GPReg::X10, + AArch64GPReg::X11, + AArch64GPReg::X12, + AArch64GPReg::X13, + AArch64GPReg::X14, + AArch64GPReg::X15, + AArch64GPReg::IP0, + AArch64GPReg::IP1, + ]; + + const SHADOW_SPACE_SIZE: u8 = 0; + + #[inline(always)] + fn callee_saved(reg: &AArch64GPReg) -> bool { + matches!( + reg, + AArch64GPReg::X19 + | AArch64GPReg::X20 + | AArch64GPReg::X21 + | AArch64GPReg::X22 + | AArch64GPReg::X23 + | AArch64GPReg::X24 + | AArch64GPReg::X25 + | AArch64GPReg::X26 + | AArch64GPReg::X27 + | AArch64GPReg::X28 + ) + } + + #[inline(always)] + fn setup_stack<'a>( + buf: &mut Vec<'a, u8>, + leaf_function: bool, + saved_regs: &[AArch64GPReg], + requested_stack_size: i32, + ) -> Result { + // full size is upcast to i64 to make sure we don't overflow here. + let mut full_size = 8 * saved_regs.len() as i64 + requested_stack_size as i64; + if !leaf_function { + full_size += 8; + } + let alignment = if full_size <= 0 { + 0 + } else { + full_size % STACK_ALIGNMENT as i64 + }; + let offset = if alignment == 0 { + 0 + } else { + STACK_ALIGNMENT - alignment as u8 + }; + if let Some(aligned_stack_size) = + requested_stack_size.checked_add(8 * saved_regs.len() as i32 + offset as i32) + { + if aligned_stack_size > 0 { + AArch64Assembler::sub_reg64_reg64_imm32( + buf, + AArch64GPReg::ZRSP, + AArch64GPReg::ZRSP, + aligned_stack_size, + ); + + // All the following stores could be optimized by using `STP` to store pairs. + let mut offset = aligned_stack_size; + if !leaf_function { + offset -= 8; + AArch64Assembler::mov_stack32_reg64(buf, offset, AArch64GPReg::LR); + offset -= 8; + AArch64Assembler::mov_stack32_reg64(buf, offset, AArch64GPReg::FP); + } + for reg in saved_regs { + offset -= 8; + AArch64Assembler::mov_stack32_reg64(buf, offset, *reg); + } + Ok(aligned_stack_size) + } else { + Ok(0) + } + } else { + Err("Ran out of stack space".to_string()) + } + } + + #[inline(always)] + fn cleanup_stack<'a>( + buf: &mut Vec<'a, u8>, + leaf_function: bool, + saved_regs: &[AArch64GPReg], + aligned_stack_size: i32, + ) -> Result<(), String> { + if aligned_stack_size > 0 { + // All the following stores could be optimized by using `STP` to store pairs. 
+            let mut offset = aligned_stack_size;
+            if !leaf_function {
+                offset -= 8;
+                AArch64Assembler::mov_reg64_stack32(buf, AArch64GPReg::LR, offset);
+                offset -= 8;
+                AArch64Assembler::mov_reg64_stack32(buf, AArch64GPReg::FP, offset);
+            }
+            for reg in saved_regs {
+                offset -= 8;
+                AArch64Assembler::mov_reg64_stack32(buf, *reg, offset);
+            }
+            AArch64Assembler::add_reg64_reg64_imm32(
+                buf,
+                AArch64GPReg::ZRSP,
+                AArch64GPReg::ZRSP,
+                aligned_stack_size,
+            );
+        }
+        Ok(())
+    }
+}
+
+impl Assembler for AArch64Assembler {
+    #[inline(always)]
+    fn abs_reg64_reg64<'a>(_buf: &mut Vec<'a, u8>, _dst: AArch64GPReg, _src: AArch64GPReg) {
+        unimplemented!("abs_reg64_reg64 is not yet implemented for AArch64");
+    }
+
+    #[inline(always)]
+    fn add_reg64_reg64_imm32<'a>(
+        buf: &mut Vec<'a, u8>,
+        dst: AArch64GPReg,
+        src: AArch64GPReg,
+        imm32: i32,
+    ) {
+        if imm32 < 0 {
+            unimplemented!("immediate additions with values less than 0 are not yet implemented");
+        } else if imm32 < 0xFFF {
+            add_reg64_reg64_imm12(buf, dst, src, imm32 as u16);
+        } else {
+            unimplemented!(
+                "immediate additions with values greater than 12bits are not yet implemented"
+            );
+        }
+    }
+
+    #[inline(always)]
+    fn add_reg64_reg64_reg64<'a>(
+        buf: &mut Vec<'a, u8>,
+        dst: AArch64GPReg,
+        src1: AArch64GPReg,
+        src2: AArch64GPReg,
+    ) {
+        add_reg64_reg64_reg64(buf, dst, src1, src2);
+    }
+
+    #[inline(always)]
+    fn mov_reg64_imm64<'a>(buf: &mut Vec<'a, u8>, dst: AArch64GPReg, imm: i64) {
+        let mut remaining = imm as u64;
+        movz_reg64_imm16(buf, dst, remaining as u16, 0);
+        remaining >>= 16;
+        if remaining > 0 {
+            movk_reg64_imm16(buf, dst, remaining as u16, 1);
+        }
+        remaining >>= 16;
+        if remaining > 0 {
+            movk_reg64_imm16(buf, dst, remaining as u16, 2);
+        }
+        remaining >>= 16;
+        if remaining > 0 {
+            movk_reg64_imm16(buf, dst, remaining as u16, 3);
+        }
+    }
+
+    #[inline(always)]
+    fn mov_reg64_reg64<'a>(buf: &mut Vec<'a, u8>, dst: AArch64GPReg, src: AArch64GPReg) {
+        mov_reg64_reg64(buf, dst, src);
+    }
+
+    #[inline(always)]
+    fn mov_reg64_stack32<'a>(buf: &mut Vec<'a, u8>, dst: AArch64GPReg, offset: i32) {
+        if offset < 0 {
+            unimplemented!("negative stack offsets are not yet implemented for AArch64");
+        } else if offset < (0xFFF << 8) {
+            debug_assert!(offset % 8 == 0);
+            ldr_reg64_imm12(buf, dst, AArch64GPReg::ZRSP, (offset as u16) >> 3);
+        } else {
+            unimplemented!("stack offsets over 32k are not yet implemented for AArch64");
+        }
+    }
+
+    #[inline(always)]
+    fn mov_stack32_reg64<'a>(buf: &mut Vec<'a, u8>, offset: i32, src: AArch64GPReg) {
+        if offset < 0 {
+            unimplemented!("negative stack offsets are not yet implemented for AArch64");
+        } else if offset < (0xFFF << 8) {
+            debug_assert!(offset % 8 == 0);
+            str_reg64_imm12(buf, src, AArch64GPReg::ZRSP, (offset as u16) >> 3);
+        } else {
+            unimplemented!("stack offsets over 32k are not yet implemented for AArch64");
+        }
+    }
+
+    #[inline(always)]
+    fn sub_reg64_reg64_imm32<'a>(
+        buf: &mut Vec<'a, u8>,
+        dst: AArch64GPReg,
+        src: AArch64GPReg,
+        imm32: i32,
+    ) {
+        if imm32 < 0 {
+            unimplemented!(
+                "immediate subtractions with values less than 0 are not yet implemented"
+            );
+        } else if imm32 < 0xFFF {
+            sub_reg64_reg64_imm12(buf, dst, src, imm32 as u16);
+        } else {
+            unimplemented!(
+                "immediate subtractions with values greater than 12bits are not yet implemented"
+            );
+        }
+    }
+
+    #[inline(always)]
+    fn ret<'a>(buf: &mut Vec<'a, u8>) {
+        ret_reg64(buf, AArch64GPReg::LR)
+    }
+}
+
+impl AArch64Assembler {}
+
+/// AArch64Instruction maps all instructions to an enum.
+/// Decoding the function should be cheap because we will always inline. +/// All of the operations should resolved by constants, leave just some bit manipulation. +/// Enums may not be complete since we will only add what we need. +#[derive(Debug)] +enum AArch64Instruction { + _Reserved, + _SVE, + DPImm(DPImmGroup), + Branch(BranchGroup), + LdStr(LdStrGroup), + DPReg(DPRegGroup), + _DPFloat, +} + +#[derive(Debug)] +enum BranchGroup { + UnconditionBranchReg { + opc: u8, + op2: u8, + op3: u8, + reg_n: AArch64GPReg, + op4: u8, + }, +} + +#[derive(Debug)] +enum DPRegGroup { + AddSubShifted { + sf: bool, + subtract: bool, + set_flags: bool, + shift: u8, + reg_m: AArch64GPReg, + imm6: u8, + reg_n: AArch64GPReg, + reg_d: AArch64GPReg, + }, + Logical { + sf: bool, + op: DPRegLogicalOp, + shift: u8, + reg_m: AArch64GPReg, + imm6: u8, + reg_n: AArch64GPReg, + reg_d: AArch64GPReg, + }, +} + +#[derive(Debug)] +enum DPImmGroup { + AddSubImm { + sf: bool, + subtract: bool, + set_flags: bool, + shift: bool, + imm12: u16, + reg_n: AArch64GPReg, + reg_d: AArch64GPReg, + }, + MoveWide { + sf: bool, + opc: u8, + hw: u8, + imm16: u16, + reg_d: AArch64GPReg, + }, +} + +#[derive(Debug)] +enum LdStrGroup { + UnsignedImm { + size: u8, + v: bool, + opc: u8, + imm12: u16, + reg_n: AArch64GPReg, + reg_t: AArch64GPReg, + }, +} + +#[derive(Debug)] +#[allow(dead_code)] +enum DPRegLogicalOp { + AND, + BIC, + ORR, + ORN, + EOR, + EON, + ANDS, + BICS, +} + +#[inline(always)] +fn build_instruction(inst: AArch64Instruction) -> [u8; 4] { + let mut out: u32 = 0; + match inst { + AArch64Instruction::Branch(branch) => { + out |= 0b101 << 26; + match branch { + BranchGroup::UnconditionBranchReg { + opc, + op2, + op3, + reg_n, + op4, + } => { + debug_assert!(opc <= 0b1111); + debug_assert!(op2 <= 0b11111); + debug_assert!(op3 <= 0b111111); + debug_assert!(op4 <= 0b1111); + out |= 0b1101011 << 25; + out |= (opc as u32) << 21; + out |= (op2 as u32) << 16; + out |= (op3 as u32) << 10; + out |= (reg_n as u32) << 5; + out |= op4 as u32; + } + } + } + AArch64Instruction::DPImm(dpimm) => { + out |= 0b100 << 26; + match dpimm { + DPImmGroup::MoveWide { + sf, + opc, + hw, + imm16, + reg_d, + } => { + out |= (sf as u32) << 31; + out |= (opc as u32) << 29; + out |= 0b101 << 23; + out |= (hw as u32) << 21; + out |= (imm16 as u32) << 5; + out |= reg_d as u32; + } + DPImmGroup::AddSubImm { + sf, + subtract, + set_flags, + shift, + imm12, + reg_n, + reg_d, + } => { + debug_assert!(imm12 <= 0xFFF); + out |= (sf as u32) << 31; + out |= (subtract as u32) << 30; + out |= (set_flags as u32) << 29; + out |= 0b010 << 23; + out |= (shift as u32) << 22; + out |= (imm12 as u32) << 10; + out |= (reg_n as u32) << 5; + out |= reg_d as u32; + } + } + } + AArch64Instruction::DPReg(dpreg) => { + out |= 0b101 << 25; + match dpreg { + DPRegGroup::Logical { + sf, + op, + shift, + reg_m, + imm6, + reg_n, + reg_d, + } => { + debug_assert!(shift <= 0b11); + debug_assert!(imm6 <= 0b111111); + let (opc, n) = match op { + DPRegLogicalOp::AND => (0b00, 0), + DPRegLogicalOp::BIC => (0b00, 1), + DPRegLogicalOp::ORR => (0b01, 0), + DPRegLogicalOp::ORN => (0b01, 1), + DPRegLogicalOp::EOR => (0b10, 0), + DPRegLogicalOp::EON => (0b10, 1), + DPRegLogicalOp::ANDS => (0b11, 0), + DPRegLogicalOp::BICS => (0b11, 1), + }; + out |= (sf as u32) << 31; + out |= opc << 29; + out |= (shift as u32) << 22; + out |= n << 21; + out |= (reg_m as u32) << 16; + out |= (imm6 as u32) << 10; + out |= (reg_n as u32) << 5; + out |= reg_d as u32; + } + DPRegGroup::AddSubShifted { + sf, + subtract, 
+                set_flags,
+                shift,
+                reg_m,
+                imm6,
+                reg_n,
+                reg_d,
+            } => {
+                debug_assert!(shift <= 0b11);
+                debug_assert!(imm6 <= 0b111111);
+                out |= (sf as u32) << 31;
+                out |= (subtract as u32) << 30;
+                out |= (set_flags as u32) << 29;
+                out |= 0b1 << 24;
+                out |= (shift as u32) << 22;
+                out |= (reg_m as u32) << 16;
+                out |= (imm6 as u32) << 10;
+                out |= (reg_n as u32) << 5;
+                out |= reg_d as u32;
+            }
+        }
+    }
+    AArch64Instruction::LdStr(ldstr) => {
+        out |= 0b1 << 27;
+        match ldstr {
+            LdStrGroup::UnsignedImm {
+                size,
+                v,
+                opc,
+                imm12,
+                reg_n,
+                reg_t,
+            } => {
+                debug_assert!(size <= 0b11);
+                debug_assert!(imm12 <= 0xFFF);
+                out |= (size as u32) << 30;
+                out |= 0b11 << 28;
+                out |= (v as u32) << 26;
+                out |= 0b1 << 24;
+                out |= (opc as u32) << 22;
+                out |= (imm12 as u32) << 10;
+                out |= (reg_n as u32) << 5;
+                out |= reg_t as u32;
+            }
+        }
+    }
+    x => unimplemented!("The instruction, {:?}, has not been implemented yet", x),
+    }
+    out.to_le_bytes()
+}
+
+// Below here are the functions for all of the assembly instructions.
+// Their names are based on the instruction and operators combined.
+// You should call `buf.reserve()` if you push or extend more than once.
+// Unit tests are added at the bottom of the file to ensure correct asm generation.
+// Please keep these in alphanumeric order.
+
+/// `ADD Xd, Xn, imm12` -> Add Xn and imm12 and place the result into Xd.
+#[inline(always)]
+fn add_reg64_reg64_imm12<'a>(
+    buf: &mut Vec<'a, u8>,
+    dst: AArch64GPReg,
+    src: AArch64GPReg,
+    imm12: u16,
+) {
+    buf.extend(&build_instruction(AArch64Instruction::DPImm(
+        DPImmGroup::AddSubImm {
+            sf: true,
+            subtract: false,
+            set_flags: false,
+            shift: false,
+            imm12,
+            reg_n: src,
+            reg_d: dst,
+        },
+    )));
+}
+
+/// `ADD Xd, Xm, Xn` -> Add Xm and Xn and place the result into Xd.
+#[inline(always)]
+fn add_reg64_reg64_reg64<'a>(
+    buf: &mut Vec<'a, u8>,
+    dst: AArch64GPReg,
+    src1: AArch64GPReg,
+    src2: AArch64GPReg,
+) {
+    buf.extend(&build_instruction(AArch64Instruction::DPReg(
+        DPRegGroup::AddSubShifted {
+            sf: true,
+            subtract: false,
+            set_flags: false,
+            shift: 0,
+            reg_m: src1,
+            imm6: 0,
+            reg_n: src2,
+            reg_d: dst,
+        },
+    )));
+}
+
+/// `LDR Xt, [Xn, #offset]` -> Load Xn + Offset into Xt. ZRSP is SP.
+/// Note: imm12 is the offset divided by 8.
+#[inline(always)]
+fn ldr_reg64_imm12<'a>(buf: &mut Vec<'a, u8>, dst: AArch64GPReg, base: AArch64GPReg, imm12: u16) {
+    debug_assert!(imm12 <= 0xFFF);
+    buf.extend(&build_instruction(AArch64Instruction::LdStr(
+        LdStrGroup::UnsignedImm {
+            size: 0b11,
+            v: false,
+            opc: 0b01,
+            imm12,
+            reg_n: base,
+            reg_t: dst,
+        },
+    )));
+}
+
+/// `MOV Xd, Xm` -> Move Xm to Xd.
+#[inline(always)]
+fn mov_reg64_reg64<'a>(buf: &mut Vec<'a, u8>, dst: AArch64GPReg, src: AArch64GPReg) {
+    // MOV is equivalent to `ORR Xd, XZR, XM` in AARCH64.
+    buf.extend(&build_instruction(AArch64Instruction::DPReg(
+        DPRegGroup::Logical {
+            sf: true,
+            op: DPRegLogicalOp::ORR,
+            shift: 0,
+            reg_m: src,
+            imm6: 0,
+            reg_n: AArch64GPReg::ZRSP,
+            reg_d: dst,
+        },
+    )));
+}
+
+/// `MOVK Xd, imm16` -> Keeps Xd and moves an optionally shifted imm16 to Xd.
+#[inline(always)]
+fn movk_reg64_imm16<'a>(buf: &mut Vec<'a, u8>, dst: AArch64GPReg, imm16: u16, hw: u8) {
+    debug_assert!(hw <= 0b11);
+    // MOV is equivalent to `ORR Xd, XZR, XM` in AARCH64.
+    buf.extend(&build_instruction(AArch64Instruction::DPImm(
+        DPImmGroup::MoveWide {
+            sf: true,
+            opc: 0b11,
+            hw,
+            imm16,
+            reg_d: dst,
+        },
+    )));
+}
+
+/// `MOVZ Xd, imm16` -> Zeros Xd and moves an optionally shifted imm16 to Xd.
+#[inline(always)] +fn movz_reg64_imm16<'a>(buf: &mut Vec<'a, u8>, dst: AArch64GPReg, imm16: u16, hw: u8) { + debug_assert!(hw <= 0b11); + // MOV is equvalent to `ORR Xd, XZR, XM` in AARCH64. + buf.extend(&build_instruction(AArch64Instruction::DPImm( + DPImmGroup::MoveWide { + sf: true, + opc: 0b10, + hw, + imm16, + reg_d: dst, + }, + ))); +} + +/// `STR Xt, [Xn, #offset]` -> Store Xt to Xn + Offset. ZRSP is SP. +/// Note: imm12 is the offest divided by 8. +#[inline(always)] +fn str_reg64_imm12<'a>(buf: &mut Vec<'a, u8>, src: AArch64GPReg, base: AArch64GPReg, imm12: u16) { + debug_assert!(imm12 <= 0xFFF); + buf.extend(&build_instruction(AArch64Instruction::LdStr( + LdStrGroup::UnsignedImm { + size: 0b11, + v: false, + opc: 0b00, + imm12, + reg_n: base, + reg_t: src, + }, + ))); +} + +/// `SUB Xd, Xn, imm12` -> Subtract Xn and imm12 and place the result into Xd. +#[inline(always)] +fn sub_reg64_reg64_imm12<'a>( + buf: &mut Vec<'a, u8>, + dst: AArch64GPReg, + src: AArch64GPReg, + imm12: u16, +) { + buf.extend(&build_instruction(AArch64Instruction::DPImm( + DPImmGroup::AddSubImm { + sf: true, + subtract: true, + set_flags: false, + shift: false, + imm12, + reg_n: src, + reg_d: dst, + }, + ))); +} + +/// `RET Xn` -> Return to the address stored in Xn. +#[inline(always)] +fn ret_reg64<'a>(buf: &mut Vec<'a, u8>, xn: AArch64GPReg) { + buf.extend(&build_instruction(AArch64Instruction::Branch( + BranchGroup::UnconditionBranchReg { + opc: 0b0010, + op2: 0b11111, + op3: 0b000000, + reg_n: xn, + op4: 0b000, + }, + ))); +} + +#[cfg(test)] +mod tests { + use super::*; + + const TEST_U16: u16 = 0x1234; + //const TEST_I32: i32 = 0x12345678; + //const TEST_I64: i64 = 0x12345678_9ABCDEF0; + + #[test] + fn test_add_reg64_reg64_reg64() { + let arena = bumpalo::Bump::new(); + let mut buf = bumpalo::vec![in &arena]; + add_reg64_reg64_reg64( + &mut buf, + AArch64GPReg::X10, + AArch64GPReg::ZRSP, + AArch64GPReg::X21, + ); + assert_eq!(&buf, &[0xAA, 0x02, 0x1F, 0x8B]); + } + + #[test] + fn test_add_reg64_reg64_imm12() { + let arena = bumpalo::Bump::new(); + let mut buf = bumpalo::vec![in &arena]; + add_reg64_reg64_imm12(&mut buf, AArch64GPReg::X10, AArch64GPReg::X21, 0x123); + assert_eq!(&buf, &[0xAA, 0x8E, 0x04, 0x91]); + } + + #[test] + fn test_ldr_reg64_imm12() { + let arena = bumpalo::Bump::new(); + let mut buf = bumpalo::vec![in &arena]; + ldr_reg64_imm12(&mut buf, AArch64GPReg::X21, AArch64GPReg::ZRSP, 0x123); + assert_eq!(&buf, &[0xF5, 0x8F, 0x44, 0xF9]); + } + + #[test] + fn test_mov_reg64_reg64() { + let arena = bumpalo::Bump::new(); + let mut buf = bumpalo::vec![in &arena]; + mov_reg64_reg64(&mut buf, AArch64GPReg::X10, AArch64GPReg::X21); + assert_eq!(&buf, &[0xEA, 0x03, 0x15, 0xAA]); + } + + #[test] + fn test_movk_reg64_imm16() { + let arena = bumpalo::Bump::new(); + let mut buf = bumpalo::vec![in &arena]; + movk_reg64_imm16(&mut buf, AArch64GPReg::X21, TEST_U16, 3); + assert_eq!(&buf, &[0x95, 0x46, 0xE2, 0xF2]); + } + + #[test] + fn test_movz_reg64_imm16() { + let arena = bumpalo::Bump::new(); + let mut buf = bumpalo::vec![in &arena]; + movz_reg64_imm16(&mut buf, AArch64GPReg::X21, TEST_U16, 3); + assert_eq!(&buf, &[0x95, 0x46, 0xE2, 0xD2]); + } + + #[test] + fn test_str_reg64_imm12() { + let arena = bumpalo::Bump::new(); + let mut buf = bumpalo::vec![in &arena]; + str_reg64_imm12(&mut buf, AArch64GPReg::X21, AArch64GPReg::ZRSP, 0x123); + assert_eq!(&buf, &[0xF5, 0x8F, 0x04, 0xF9]); + } + + #[test] + fn test_sub_reg64_reg64_imm12() { + let arena = bumpalo::Bump::new(); + let mut buf = bumpalo::vec![in 
&arena];
+        sub_reg64_reg64_imm12(&mut buf, AArch64GPReg::X10, AArch64GPReg::X21, 0x123);
+        assert_eq!(&buf, &[0xAA, 0x8E, 0x04, 0xD1]);
+    }
+
+    #[test]
+    fn test_ret_reg64() {
+        let arena = bumpalo::Bump::new();
+        let mut buf = bumpalo::vec![in &arena];
+        ret_reg64(&mut buf, AArch64GPReg::LR);
+        assert_eq!(&buf, &[0xC0, 0x03, 0x5F, 0xD6]);
+    }
+}
diff --git a/compiler/gen_dev/src/generic64/mod.rs b/compiler/gen_dev/src/generic64/mod.rs
index 3d8c041776..fdc1519bb9 100644
--- a/compiler/gen_dev/src/generic64/mod.rs
+++ b/compiler/gen_dev/src/generic64/mod.rs
@@ -1,49 +1,61 @@
 use crate::{Backend, Env, Relocation};
 use bumpalo::collections::Vec;
-use roc_collections::all::{ImSet, MutMap, MutSet};
+use roc_collections::all::{MutMap, MutSet};
 use roc_module::symbol::Symbol;
 use roc_mono::ir::{Literal, Stmt};
 use std::marker::PhantomData;
 use target_lexicon::Triple;
+pub mod aarch64;
 pub mod x86_64;
-pub trait CallConv {
-    fn gp_param_regs() -> &'static [GPReg];
-    fn gp_return_regs() -> &'static [GPReg];
-    fn gp_default_free_regs() -> &'static [GPReg];
+pub trait CallConv {
+    const GP_PARAM_REGS: &'static [GPReg];
+    const GP_RETURN_REGS: &'static [GPReg];
+    const GP_DEFAULT_FREE_REGS: &'static [GPReg];
-    // A linear scan of an array may be faster than a set technically.
-    // That being said, fastest would likely be a trait based on calling convention/register.
-    fn caller_saved_regs() -> ImSet;
-    fn callee_saved_regs() -> ImSet;
+    const SHADOW_SPACE_SIZE: u8;
-    fn stack_pointer() -> GPReg;
-    fn frame_pointer() -> GPReg;
+    fn callee_saved(reg: &GPReg) -> bool;
+    #[inline(always)]
+    fn caller_saved_regs(reg: &GPReg) -> bool {
+        !Self::callee_saved(reg)
+    }
-    fn shadow_space_size() -> u8;
-    // It may be worth ignoring the red zone and keeping things simpler.
-    fn red_zone_size() -> u8;
+    fn setup_stack<'a>(
+        buf: &mut Vec<'a, u8>,
+        leaf_function: bool,
+        saved_regs: &[GPReg],
+        requested_stack_size: i32,
+    ) -> Result;
+    fn cleanup_stack<'a>(
+        buf: &mut Vec<'a, u8>,
+        leaf_function: bool,
+        saved_regs: &[GPReg],
+        aligned_stack_size: i32,
+    ) -> Result<(), String>;
 }
-pub trait Assembler {
-    fn add_register64bit_immediate32bit<'a>(buf: &mut Vec<'a, u8>, dst: GPReg, imm: i32);
-    fn add_register64bit_register64bit<'a>(buf: &mut Vec<'a, u8>, dst: GPReg, src: GPReg);
-    fn cmovl_register64bit_register64bit<'a>(buf: &mut Vec<'a, u8>, dst: GPReg, src: GPReg);
-    fn mov_register64bit_immediate32bit<'a>(buf: &mut Vec<'a, u8>, dst: GPReg, imm: i32);
-    fn mov_register64bit_immediate64bit<'a>(buf: &mut Vec<'a, u8>, dst: GPReg, imm: i64);
-    fn mov_register64bit_register64bit<'a>(buf: &mut Vec<'a, u8>, dst: GPReg, src: GPReg);
-    fn mov_register64bit_stackoffset32bit<'a>(buf: &mut Vec<'a, u8>, dst: GPReg, offset: i32);
-    fn mov_stackoffset32bit_register64bit<'a>(buf: &mut Vec<'a, u8>, offset: i32, src: GPReg);
-    fn neg_register64bit<'a>(buf: &mut Vec<'a, u8>, reg: GPReg);
+/// Assembler contains calls to the backend assembly generator.
+/// These calls do not necessarily map directly to a single assembly instruction.
+/// They are higher level in cases where an instruction would not be common and shared between multiple architectures.
+/// Thus, some backends will need to use multiple instructions to perform a single one of these calls.
+/// Generally, I prefer explicit sources, as opposed to dst being one of the sources. Ex: `x = x + y` would be `add x, x, y` instead of `add x, y`.
+/// dst should always come before sources.
+pub trait Assembler { + fn abs_reg64_reg64<'a>(buf: &mut Vec<'a, u8>, dst: GPReg, src: GPReg); + fn add_reg64_reg64_imm32<'a>(buf: &mut Vec<'a, u8>, dst: GPReg, src1: GPReg, imm32: i32); + fn add_reg64_reg64_reg64<'a>(buf: &mut Vec<'a, u8>, dst: GPReg, src1: GPReg, src2: GPReg); + fn mov_reg64_imm64<'a>(buf: &mut Vec<'a, u8>, dst: GPReg, imm: i64); + fn mov_reg64_reg64<'a>(buf: &mut Vec<'a, u8>, dst: GPReg, src: GPReg); + fn mov_reg64_stack32<'a>(buf: &mut Vec<'a, u8>, dst: GPReg, offset: i32); + fn mov_stack32_reg64<'a>(buf: &mut Vec<'a, u8>, offset: i32, src: GPReg); + fn sub_reg64_reg64_imm32<'a>(buf: &mut Vec<'a, u8>, dst: GPReg, src1: GPReg, imm32: i32); fn ret<'a>(buf: &mut Vec<'a, u8>); - fn sub_register64bit_immediate32bit<'a>(buf: &mut Vec<'a, u8>, dst: GPReg, imm: i32); - fn pop_register64bit<'a>(buf: &mut Vec<'a, u8>, reg: GPReg); - fn push_register64bit<'a>(buf: &mut Vec<'a, u8>, reg: GPReg); } #[derive(Clone, Debug, PartialEq)] -enum SymbolStorage { +enum SymbolStorage { // These may need layout, but I am not sure. // I think whenever a symbol would be used, we specify layout anyways. GPRegeg(GPReg), @@ -69,7 +81,7 @@ pub struct Backend64Bit<'a, GPReg: GPRegTrait, ASM: Assembler, CC: CallCo literal_map: MutMap>, // This should probably be smarter than a vec. - // There are certain registers we should always use first. With pushing and poping, this could get mixed. + // There are certain registers we should always use first. With pushing and popping, this could get mixed. gp_free_regs: Vec<'a, GPReg>, // The last major thing we need is a way to decide what reg to free when all of them are full. @@ -109,7 +121,7 @@ impl<'a, GPReg: GPRegTrait, ASM: Assembler, CC: CallConv> Backend< } fn reset(&mut self) { - self.stack_size = -(CC::red_zone_size() as i32); + self.stack_size = 0; self.leaf_function = true; self.last_seen_map.clear(); self.free_map.clear(); @@ -119,13 +131,12 @@ impl<'a, GPReg: GPRegTrait, ASM: Assembler, CC: CallConv> Backend< self.gp_free_regs.clear(); self.gp_used_regs.clear(); self.gp_free_regs - .extend_from_slice(CC::gp_default_free_regs()); + .extend_from_slice(CC::GP_DEFAULT_FREE_REGS); } fn set_not_leaf_function(&mut self) { self.leaf_function = false; - // If this is not a leaf function, it can't use the shadow space. - self.stack_size = CC::shadow_space_size() as i32 - CC::red_zone_size() as i32; + self.stack_size = CC::SHADOW_SPACE_SIZE as i32; } fn literal_map(&mut self) -> &mut MutMap> { @@ -147,38 +158,17 @@ impl<'a, GPReg: GPRegTrait, ASM: Assembler, CC: CallConv> Backend< fn finalize(&mut self) -> Result<(&'a [u8], &[Relocation]), String> { let mut out = bumpalo::vec![in self.env.arena]; - if !self.leaf_function { - // I believe that this will have to move away from push and to mov to be generic across backends. - ASM::push_register64bit(&mut out, CC::frame_pointer()); - ASM::mov_register64bit_register64bit( - &mut out, - CC::frame_pointer(), - CC::stack_pointer(), - ); - } - // Save data in all callee saved regs. - let mut pop_order = bumpalo::vec![in self.env.arena]; - for reg in &self.used_callee_saved_regs { - ASM::push_register64bit(&mut out, *reg); - pop_order.push(*reg); - } - if self.stack_size > 0 { - ASM::sub_register64bit_immediate32bit(&mut out, CC::stack_pointer(), self.stack_size); - } + // Setup stack. + let mut used_regs = bumpalo::vec![in self.env.arena]; + used_regs.extend(&self.used_callee_saved_regs); + let aligned_stack_size = + CC::setup_stack(&mut out, self.leaf_function, &used_regs, self.stack_size)?; // Add function body. 
out.extend(&self.buf); - if self.stack_size > 0 { - ASM::add_register64bit_immediate32bit(&mut out, CC::stack_pointer(), self.stack_size); - } - // Restore data in callee saved regs. - while let Some(reg) = pop_order.pop() { - ASM::pop_register64bit(&mut out, reg); - } - if !self.leaf_function { - ASM::pop_register64bit(&mut out, CC::frame_pointer()); - } + // Cleanup stack. + CC::cleanup_stack(&mut out, self.leaf_function, &used_regs, aligned_stack_size)?; ASM::ret(&mut out); Ok((out.into_bump_slice(), &[])) @@ -187,9 +177,7 @@ impl<'a, GPReg: GPRegTrait, ASM: Assembler, CC: CallConv> Backend< fn build_num_abs_i64(&mut self, dst: &Symbol, src: &Symbol) -> Result<(), String> { let dst_reg = self.claim_gp_reg(dst)?; let src_reg = self.load_to_reg(src)?; - ASM::mov_register64bit_register64bit(&mut self.buf, dst_reg, src_reg); - ASM::neg_register64bit(&mut self.buf, dst_reg); - ASM::cmovl_register64bit_register64bit(&mut self.buf, dst_reg, src_reg); + ASM::abs_reg64_reg64(&mut self.buf, dst_reg, src_reg); Ok(()) } @@ -201,9 +189,8 @@ impl<'a, GPReg: GPRegTrait, ASM: Assembler, CC: CallConv> Backend< ) -> Result<(), String> { let dst_reg = self.claim_gp_reg(dst)?; let src1_reg = self.load_to_reg(src1)?; - ASM::mov_register64bit_register64bit(&mut self.buf, dst_reg, src1_reg); let src2_reg = self.load_to_reg(src2)?; - ASM::add_register64bit_register64bit(&mut self.buf, dst_reg, src2_reg); + ASM::add_reg64_reg64_reg64(&mut self.buf, dst_reg, src1_reg, src2_reg); Ok(()) } @@ -212,7 +199,7 @@ impl<'a, GPReg: GPRegTrait, ASM: Assembler, CC: CallConv> Backend< Literal::Int(x) => { let reg = self.claim_gp_reg(sym)?; let val = *x; - ASM::mov_register64bit_immediate64bit(&mut self.buf, reg, val); + ASM::mov_reg64_imm64(&mut self.buf, reg, val); Ok(()) } x => Err(format!("loading literal, {:?}, is not yet implemented", x)), @@ -234,11 +221,11 @@ impl<'a, GPReg: GPRegTrait, ASM: Assembler, CC: CallConv> Backend< fn return_symbol(&mut self, sym: &Symbol) -> Result<(), String> { let val = self.symbols_map.get(sym); match val { - Some(SymbolStorage::GPRegeg(reg)) if *reg == CC::gp_return_regs()[0] => Ok(()), + Some(SymbolStorage::GPRegeg(reg)) if *reg == CC::GP_RETURN_REGS[0] => Ok(()), Some(SymbolStorage::GPRegeg(reg)) => { // If it fits in a general purpose register, just copy it over to. // Technically this can be optimized to produce shorter instructions if less than 64bits. 
- ASM::mov_register64bit_register64bit(&mut self.buf, CC::gp_return_regs()[0], *reg); + ASM::mov_reg64_reg64(&mut self.buf, CC::GP_RETURN_REGS[0], *reg); Ok(()) } Some(x) => Err(format!( @@ -258,7 +245,7 @@ impl<'a, GPReg: GPRegTrait, ASM: Assembler, CC: CallConv> fn claim_gp_reg(&mut self, sym: &Symbol) -> Result { let reg = if !self.gp_free_regs.is_empty() { let free_reg = self.gp_free_regs.pop().unwrap(); - if CC::callee_saved_regs().contains(&free_reg) { + if CC::callee_saved(&free_reg) { self.used_callee_saved_regs.insert(free_reg); } Ok(free_reg) @@ -291,7 +278,7 @@ impl<'a, GPReg: GPRegTrait, ASM: Assembler, CC: CallConv> let reg = self.claim_gp_reg(sym)?; self.symbols_map .insert(*sym, SymbolStorage::StackAndGPRegeg(reg, offset)); - ASM::mov_register64bit_stackoffset32bit(&mut self.buf, reg, offset as i32); + ASM::mov_reg64_stack32(&mut self.buf, reg, offset as i32); Ok(reg) } None => Err(format!("Unknown symbol: {}", sym)), @@ -302,19 +289,9 @@ impl<'a, GPReg: GPRegTrait, ASM: Assembler, CC: CallConv> let val = self.symbols_map.remove(sym); match val { Some(SymbolStorage::GPRegeg(reg)) => { - let offset = self.stack_size; - self.stack_size += 8; - if let Some(size) = self.stack_size.checked_add(8) { - self.stack_size = size; - } else { - return Err(format!( - "Ran out of stack space while saving symbol: {}", - sym - )); - } - ASM::mov_stackoffset32bit_register64bit(&mut self.buf, offset as i32, reg); - self.symbols_map - .insert(*sym, SymbolStorage::Stack(offset as i32)); + let offset = self.increase_stack_size(8)?; + ASM::mov_stack32_reg64(&mut self.buf, offset as i32, reg); + self.symbols_map.insert(*sym, SymbolStorage::Stack(offset)); Ok(()) } Some(SymbolStorage::StackAndGPRegeg(_, offset)) => { @@ -328,4 +305,16 @@ impl<'a, GPReg: GPRegTrait, ASM: Assembler, CC: CallConv> None => Err(format!("Unknown symbol: {}", sym)), } } + + /// increase_stack_size increase the current stack size and returns the offset of the stack. + fn increase_stack_size(&mut self, amount: i32) -> Result { + debug_assert!(amount > 0); + let offset = self.stack_size; + if let Some(new_size) = self.stack_size.checked_add(amount) { + self.stack_size = new_size; + Ok(offset) + } else { + Err("Ran out of stack space".to_string()) + } + } } diff --git a/compiler/gen_dev/src/generic64/x86_64.rs b/compiler/gen_dev/src/generic64/x86_64.rs index 97a97bc20d..98fa94afa6 100644 --- a/compiler/gen_dev/src/generic64/x86_64.rs +++ b/compiler/gen_dev/src/generic64/x86_64.rs @@ -1,6 +1,5 @@ use crate::generic64::{Assembler, CallConv, GPRegTrait}; use bumpalo::collections::Vec; -use roc_collections::all::ImSet; // Not sure exactly how I want to represent registers. // If we want max speed, we would likely make them structs that impl the same trait to avoid ifs. @@ -26,10 +25,312 @@ pub enum X86_64GPReg { impl GPRegTrait for X86_64GPReg {} +pub struct X86_64Assembler {} +pub struct X86_64WindowsFastcall {} +pub struct X86_64SystemV {} + +const STACK_ALIGNMENT: u8 = 16; + +impl CallConv for X86_64SystemV { + const GP_PARAM_REGS: &'static [X86_64GPReg] = &[ + X86_64GPReg::RDI, + X86_64GPReg::RSI, + X86_64GPReg::RDX, + X86_64GPReg::RCX, + X86_64GPReg::R8, + X86_64GPReg::R9, + ]; + const GP_RETURN_REGS: &'static [X86_64GPReg] = &[X86_64GPReg::RAX, X86_64GPReg::RDX]; + + const GP_DEFAULT_FREE_REGS: &'static [X86_64GPReg] = &[ + // The regs we want to use first should be at the end of this vec. + // We will use pop to get which reg to use next + // Use callee saved regs last. 
+ X86_64GPReg::RBX, + // Don't use frame pointer: X86_64GPReg::RBP, + X86_64GPReg::R12, + X86_64GPReg::R13, + X86_64GPReg::R14, + X86_64GPReg::R15, + // Use caller saved regs first. + X86_64GPReg::RAX, + X86_64GPReg::RCX, + X86_64GPReg::RDX, + // Don't use stack pionter: X86_64GPReg::RSP, + X86_64GPReg::RSI, + X86_64GPReg::RDI, + X86_64GPReg::R8, + X86_64GPReg::R9, + X86_64GPReg::R10, + X86_64GPReg::R11, + ]; + const SHADOW_SPACE_SIZE: u8 = 0; + + #[inline(always)] + fn callee_saved(reg: &X86_64GPReg) -> bool { + matches!( + reg, + X86_64GPReg::RBX + | X86_64GPReg::RBP + | X86_64GPReg::R12 + | X86_64GPReg::R13 + | X86_64GPReg::R14 + | X86_64GPReg::R15 + ) + } + + #[inline(always)] + fn setup_stack<'a>( + buf: &mut Vec<'a, u8>, + leaf_function: bool, + saved_regs: &[X86_64GPReg], + requested_stack_size: i32, + ) -> Result { + x86_64_generic_setup_stack(buf, leaf_function, saved_regs, requested_stack_size) + } + + #[inline(always)] + fn cleanup_stack<'a>( + buf: &mut Vec<'a, u8>, + leaf_function: bool, + saved_regs: &[X86_64GPReg], + aligned_stack_size: i32, + ) -> Result<(), String> { + x86_64_generic_cleanup_stack(buf, leaf_function, saved_regs, aligned_stack_size) + } +} + +impl CallConv for X86_64WindowsFastcall { + const GP_PARAM_REGS: &'static [X86_64GPReg] = &[ + X86_64GPReg::RCX, + X86_64GPReg::RDX, + X86_64GPReg::R8, + X86_64GPReg::R9, + ]; + const GP_RETURN_REGS: &'static [X86_64GPReg] = &[X86_64GPReg::RAX]; + const GP_DEFAULT_FREE_REGS: &'static [X86_64GPReg] = &[ + // The regs we want to use first should be at the end of this vec. + // We will use pop to get which reg to use next + + // Don't use stack pionter: X86_64GPReg::RSP, + // Don't use frame pointer: X86_64GPReg::RBP, + + // Use callee saved regs last. + X86_64GPReg::RBX, + X86_64GPReg::RSI, + X86_64GPReg::RDI, + X86_64GPReg::R12, + X86_64GPReg::R13, + X86_64GPReg::R14, + X86_64GPReg::R15, + // Use caller saved regs first. + X86_64GPReg::RAX, + X86_64GPReg::RCX, + X86_64GPReg::RDX, + X86_64GPReg::R8, + X86_64GPReg::R9, + X86_64GPReg::R10, + X86_64GPReg::R11, + ]; + const SHADOW_SPACE_SIZE: u8 = 32; + + #[inline(always)] + fn callee_saved(reg: &X86_64GPReg) -> bool { + matches!( + reg, + X86_64GPReg::RBX + | X86_64GPReg::RBP + | X86_64GPReg::RSI + | X86_64GPReg::RSP + | X86_64GPReg::RDI + | X86_64GPReg::R12 + | X86_64GPReg::R13 + | X86_64GPReg::R14 + | X86_64GPReg::R15 + ) + } + + #[inline(always)] + fn setup_stack<'a>( + buf: &mut Vec<'a, u8>, + leaf_function: bool, + saved_regs: &[X86_64GPReg], + requested_stack_size: i32, + ) -> Result { + x86_64_generic_setup_stack(buf, leaf_function, saved_regs, requested_stack_size) + } + + #[inline(always)] + fn cleanup_stack<'a>( + buf: &mut Vec<'a, u8>, + leaf_function: bool, + saved_regs: &[X86_64GPReg], + aligned_stack_size: i32, + ) -> Result<(), String> { + x86_64_generic_cleanup_stack(buf, leaf_function, saved_regs, aligned_stack_size) + } +} + +#[inline(always)] +fn x86_64_generic_setup_stack<'a>( + buf: &mut Vec<'a, u8>, + leaf_function: bool, + saved_regs: &[X86_64GPReg], + requested_stack_size: i32, +) -> Result { + if !leaf_function { + X86_64Assembler::push_reg64(buf, X86_64GPReg::RBP); + X86_64Assembler::mov_reg64_reg64(buf, X86_64GPReg::RBP, X86_64GPReg::RSP); + } + for reg in saved_regs { + X86_64Assembler::push_reg64(buf, *reg); + } + + // full size is upcast to i64 to make sure we don't overflow here. 
+ let full_size = 8 * saved_regs.len() as i64 + requested_stack_size as i64; + let alignment = if full_size <= 0 { + 0 + } else { + full_size % STACK_ALIGNMENT as i64 + }; + let offset = if alignment == 0 { + 0 + } else { + STACK_ALIGNMENT - alignment as u8 + }; + if let Some(aligned_stack_size) = requested_stack_size.checked_add(offset as i32) { + if aligned_stack_size > 0 { + X86_64Assembler::sub_reg64_reg64_imm32( + buf, + X86_64GPReg::RSP, + X86_64GPReg::RSP, + aligned_stack_size, + ); + Ok(aligned_stack_size) + } else { + Ok(0) + } + } else { + Err("Ran out of stack space".to_string()) + } +} + +#[inline(always)] +fn x86_64_generic_cleanup_stack<'a>( + buf: &mut Vec<'a, u8>, + leaf_function: bool, + saved_regs: &[X86_64GPReg], + aligned_stack_size: i32, +) -> Result<(), String> { + if aligned_stack_size > 0 { + X86_64Assembler::add_reg64_reg64_imm32( + buf, + X86_64GPReg::RSP, + X86_64GPReg::RSP, + aligned_stack_size, + ); + } + for reg in saved_regs.iter().rev() { + X86_64Assembler::pop_reg64(buf, *reg); + } + if !leaf_function { + X86_64Assembler::mov_reg64_reg64(buf, X86_64GPReg::RSP, X86_64GPReg::RBP); + X86_64Assembler::pop_reg64(buf, X86_64GPReg::RBP); + } + Ok(()) +} + +impl Assembler for X86_64Assembler { + // These functions should map to the raw assembly functions below. + // In some cases, that means you can just directly call one of the direct assembly functions. + #[inline(always)] + fn abs_reg64_reg64<'a>(buf: &mut Vec<'a, u8>, dst: X86_64GPReg, src: X86_64GPReg) { + mov_reg64_reg64(buf, dst, src); + neg_reg64(buf, dst); + cmovl_reg64_reg64(buf, dst, src); + } + #[inline(always)] + fn add_reg64_reg64_imm32<'a>( + buf: &mut Vec<'a, u8>, + dst: X86_64GPReg, + src1: X86_64GPReg, + imm32: i32, + ) { + if dst == src1 { + add_reg64_imm32(buf, dst, imm32); + } else { + mov_reg64_reg64(buf, dst, src1); + add_reg64_imm32(buf, dst, imm32); + } + } + #[inline(always)] + fn add_reg64_reg64_reg64<'a>( + buf: &mut Vec<'a, u8>, + dst: X86_64GPReg, + src1: X86_64GPReg, + src2: X86_64GPReg, + ) { + if dst == src1 { + add_reg64_reg64(buf, dst, src2); + } else if dst == src2 { + add_reg64_reg64(buf, dst, src1); + } else { + mov_reg64_reg64(buf, dst, src1); + add_reg64_reg64(buf, dst, src2); + } + } + #[inline(always)] + fn mov_reg64_imm64<'a>(buf: &mut Vec<'a, u8>, dst: X86_64GPReg, imm: i64) { + mov_reg64_imm64(buf, dst, imm); + } + #[inline(always)] + fn mov_reg64_reg64<'a>(buf: &mut Vec<'a, u8>, dst: X86_64GPReg, src: X86_64GPReg) { + mov_reg64_reg64(buf, dst, src); + } + #[inline(always)] + fn mov_reg64_stack32<'a>(buf: &mut Vec<'a, u8>, dst: X86_64GPReg, offset: i32) { + mov_reg64_stack32(buf, dst, offset); + } + #[inline(always)] + fn mov_stack32_reg64<'a>(buf: &mut Vec<'a, u8>, offset: i32, src: X86_64GPReg) { + mov_stack32_reg64(buf, offset, src); + } + #[inline(always)] + fn sub_reg64_reg64_imm32<'a>( + buf: &mut Vec<'a, u8>, + dst: X86_64GPReg, + src1: X86_64GPReg, + imm32: i32, + ) { + if dst == src1 { + sub_reg64_imm32(buf, dst, imm32); + } else { + mov_reg64_reg64(buf, dst, src1); + sub_reg64_imm32(buf, dst, imm32); + } + } + #[inline(always)] + fn ret<'a>(buf: &mut Vec<'a, u8>) { + ret(buf); + } +} + +impl X86_64Assembler { + #[inline(always)] + fn pop_reg64<'a>(buf: &mut Vec<'a, u8>, reg: X86_64GPReg) { + pop_reg64(buf, reg); + } + + #[inline(always)] + fn push_reg64<'a>(buf: &mut Vec<'a, u8>, reg: X86_64GPReg) { + push_reg64(buf, reg); + } +} const REX: u8 = 0x40; const REX_W: u8 = REX + 0x8; -fn add_rm_extension(reg: X86_64GPReg, byte: u8) -> u8 { +#[inline(always)] +const 
fn add_rm_extension(reg: X86_64GPReg, byte: u8) -> u8 { if reg as u8 > 7 { byte + 1 } else { @@ -37,11 +338,13 @@ fn add_rm_extension(reg: X86_64GPReg, byte: u8) -> u8 { } } -fn add_opcode_extension(reg: X86_64GPReg, byte: u8) -> u8 { +#[inline(always)] +const fn add_opcode_extension(reg: X86_64GPReg, byte: u8) -> u8 { add_rm_extension(reg, byte) } -fn add_reg_extension(reg: X86_64GPReg, byte: u8) -> u8 { +#[inline(always)] +const fn add_reg_extension(reg: X86_64GPReg, byte: u8) -> u8 { if reg as u8 > 7 { byte + 4 } else { @@ -49,316 +352,149 @@ fn add_reg_extension(reg: X86_64GPReg, byte: u8) -> u8 { } } -pub struct X86_64Assembler {} -pub struct X86_64WindowsFastcall {} -pub struct X86_64SystemV {} +// Below here are the functions for all of the assembly instructions. +// Their names are based on the instruction and operators combined. +// You should call `buf.reserve()` if you push or extend more than once. +// Unit tests are added at the bottom of the file to ensure correct asm generation. +// Please keep these in alphanumeric order. -impl CallConv for X86_64SystemV { - fn gp_param_regs() -> &'static [X86_64GPReg] { - &[ - X86_64GPReg::RDI, - X86_64GPReg::RSI, - X86_64GPReg::RDX, - X86_64GPReg::RCX, - X86_64GPReg::R8, - X86_64GPReg::R9, - ] - } - fn gp_return_regs() -> &'static [X86_64GPReg] { - &[X86_64GPReg::RAX, X86_64GPReg::RDX] - } - fn gp_default_free_regs() -> &'static [X86_64GPReg] { - &[ - // The regs we want to use first should be at the end of this vec. - // We will use pop to get which reg to use next - // Use callee saved regs last. - X86_64GPReg::RBX, - // Don't use frame pointer: X86_64GPReg::RBP, - X86_64GPReg::R12, - X86_64GPReg::R13, - X86_64GPReg::R14, - X86_64GPReg::R15, - // Use caller saved regs first. - X86_64GPReg::RAX, - X86_64GPReg::RCX, - X86_64GPReg::RDX, - // Don't use stack pionter: X86_64GPReg::RSP, - X86_64GPReg::RSI, - X86_64GPReg::RDI, - X86_64GPReg::R8, - X86_64GPReg::R9, - X86_64GPReg::R10, - X86_64GPReg::R11, - ] - } - fn caller_saved_regs() -> ImSet { - // TODO: stop using vec! here. I was just have trouble with some errors, but it shouldn't be needed. - ImSet::from(vec![ - X86_64GPReg::RAX, - X86_64GPReg::RCX, - X86_64GPReg::RDX, - X86_64GPReg::RSP, - X86_64GPReg::RSI, - X86_64GPReg::RDI, - X86_64GPReg::R8, - X86_64GPReg::R9, - X86_64GPReg::R10, - X86_64GPReg::R11, - ]) - } - fn callee_saved_regs() -> ImSet { - // TODO: stop using vec! here. I was just have trouble with some errors, but it shouldn't be needed. - ImSet::from(vec![ - X86_64GPReg::RBX, - X86_64GPReg::RBP, - X86_64GPReg::R12, - X86_64GPReg::R13, - X86_64GPReg::R14, - X86_64GPReg::R15, - ]) - } - fn stack_pointer() -> X86_64GPReg { - X86_64GPReg::RSP - } - fn frame_pointer() -> X86_64GPReg { - X86_64GPReg::RBP - } - fn shadow_space_size() -> u8 { - 0 - } - fn red_zone_size() -> u8 { - 128 +/// `ADD r/m64, imm32` -> Add imm32 sign-extended to 64-bits from r/m64. +#[inline(always)] +fn add_reg64_imm32<'a>(buf: &mut Vec<'a, u8>, dst: X86_64GPReg, imm: i32) { + // This can be optimized if the immediate is 1 byte. + let rex = add_rm_extension(dst, REX_W); + let dst_mod = dst as u8 % 8; + buf.reserve(7); + buf.extend(&[rex, 0x81, 0xC0 + dst_mod]); + buf.extend(&imm.to_le_bytes()); +} + +/// `ADD r/m64,r64` -> Add r64 to r/m64. 
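As a concrete check of how the REX helpers above combine with a ModRM byte, here is a standalone sketch (not part of this patch) that recomputes the `ADD r/m64, imm32` encoding for R15 the same way `add_reg64_imm32` does; it reproduces the `[0x49, 0x81, 0xC7, ...]` bytes that the unit tests at the bottom of this file expect.

```rust
const REX: u8 = 0x40;
const REX_W: u8 = REX + 0x8;

fn main() {
    // X86_64GPReg::R15 has register number 15, so it needs the REX.B extension.
    let reg_num: u8 = 15;
    // add_rm_extension: regs 8..=15 add 1 to the REX byte (REX.B) -> 0x49.
    let rex = if reg_num > 7 { REX_W + 1 } else { REX_W };
    // ModRM: mod = 11 (register direct), reg = /0 (ADD), rm = reg_num % 8.
    let modrm = 0xC0 + (reg_num % 8);
    let imm: i32 = 0x1234_5678; // arbitrary immediate for the sketch

    let mut buf = vec![rex, 0x81, modrm];
    buf.extend(&imm.to_le_bytes());
    assert_eq!(&buf[..3], &[0x49, 0x81, 0xC7]);
    assert_eq!(buf.len(), 7); // matches the buf.reserve(7) above
}
```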
+#[inline(always)] +fn add_reg64_reg64<'a>(buf: &mut Vec<'a, u8>, dst: X86_64GPReg, src: X86_64GPReg) { + let rex = add_rm_extension(dst, REX_W); + let rex = add_reg_extension(src, rex); + let dst_mod = dst as u8 % 8; + let src_mod = (src as u8 % 8) << 3; + buf.extend(&[rex, 0x01, 0xC0 + dst_mod + src_mod]); +} + +/// `CMOVL r64,r/m64` -> Move if less (SF≠ OF). +#[inline(always)] +fn cmovl_reg64_reg64<'a>(buf: &mut Vec<'a, u8>, dst: X86_64GPReg, src: X86_64GPReg) { + let rex = add_reg_extension(dst, REX_W); + let rex = add_rm_extension(src, rex); + let dst_mod = (dst as u8 % 8) << 3; + let src_mod = src as u8 % 8; + buf.extend(&[rex, 0x0F, 0x4C, 0xC0 + dst_mod + src_mod]); +} + +/// `MOV r/m64, imm32` -> Move imm32 sign extended to 64-bits to r/m64. +#[inline(always)] +fn mov_reg64_imm32<'a>(buf: &mut Vec<'a, u8>, dst: X86_64GPReg, imm: i32) { + let rex = add_rm_extension(dst, REX_W); + let dst_mod = dst as u8 % 8; + buf.reserve(7); + buf.extend(&[rex, 0xC7, 0xC0 + dst_mod]); + buf.extend(&imm.to_le_bytes()); +} + +/// `MOV r64, imm64` -> Move imm64 to r64. +#[inline(always)] +fn mov_reg64_imm64<'a>(buf: &mut Vec<'a, u8>, dst: X86_64GPReg, imm: i64) { + if imm <= i32::MAX as i64 && imm >= i32::MIN as i64 { + mov_reg64_imm32(buf, dst, imm as i32) + } else { + let rex = add_opcode_extension(dst, REX_W); + let dst_mod = dst as u8 % 8; + buf.reserve(10); + buf.extend(&[rex, 0xB8 + dst_mod]); + buf.extend(&imm.to_le_bytes()); } } -impl CallConv for X86_64WindowsFastcall { - fn gp_param_regs() -> &'static [X86_64GPReg] { - &[ - X86_64GPReg::RCX, - X86_64GPReg::RDX, - X86_64GPReg::R8, - X86_64GPReg::R9, - ] - } - fn gp_return_regs() -> &'static [X86_64GPReg] { - &[X86_64GPReg::RAX] - } - fn gp_default_free_regs() -> &'static [X86_64GPReg] { - &[ - // The regs we want to use first should be at the end of this vec. - // We will use pop to get which reg to use next - // Use callee saved regs last. - X86_64GPReg::RBX, - // Don't use frame pointer: X86_64GPReg::RBP, - X86_64GPReg::RSI, - // Don't use stack pionter: X86_64GPReg::RSP, - X86_64GPReg::RDI, - X86_64GPReg::R12, - X86_64GPReg::R13, - X86_64GPReg::R14, - X86_64GPReg::R15, - // Use caller saved regs first. - X86_64GPReg::RAX, - X86_64GPReg::RCX, - X86_64GPReg::RDX, - X86_64GPReg::R8, - X86_64GPReg::R9, - X86_64GPReg::R10, - X86_64GPReg::R11, - ] - } - fn caller_saved_regs() -> ImSet { - // TODO: stop using vec! here. I was just have trouble with some errors, but it shouldn't be needed. - ImSet::from(vec![ - X86_64GPReg::RAX, - X86_64GPReg::RCX, - X86_64GPReg::RDX, - X86_64GPReg::R8, - X86_64GPReg::R9, - X86_64GPReg::R10, - X86_64GPReg::R11, - ]) - } - fn callee_saved_regs() -> ImSet { - // TODO: stop using vec! here. I was just have trouble with some errors, but it shouldn't be needed. - ImSet::from(vec![ - X86_64GPReg::RBX, - X86_64GPReg::RBP, - X86_64GPReg::RSI, - X86_64GPReg::RSP, - X86_64GPReg::RDI, - X86_64GPReg::R12, - X86_64GPReg::R13, - X86_64GPReg::R14, - X86_64GPReg::R15, - ]) - } - fn stack_pointer() -> X86_64GPReg { - X86_64GPReg::RSP - } - fn frame_pointer() -> X86_64GPReg { - X86_64GPReg::RBP - } - fn shadow_space_size() -> u8 { - 32 - } - fn red_zone_size() -> u8 { - 0 +/// `MOV r/m64,r64` -> Move r64 to r/m64. 
+#[inline(always)]
+fn mov_reg64_reg64<'a>(buf: &mut Vec<'a, u8>, dst: X86_64GPReg, src: X86_64GPReg) {
+ let rex = add_rm_extension(dst, REX_W);
+ let rex = add_reg_extension(src, rex);
+ let dst_mod = dst as u8 % 8;
+ let src_mod = (src as u8 % 8) << 3;
+ buf.extend(&[rex, 0x89, 0xC0 + dst_mod + src_mod]);
+}
+
+/// `MOV r64,r/m64` -> Move r/m64 to r64.
+#[inline(always)]
+fn mov_reg64_stack32<'a>(buf: &mut Vec<'a, u8>, dst: X86_64GPReg, offset: i32) {
+ // This can be optimized based on how many bytes the offset actually is.
+ // This function can probably be made to take any memory offset; I didn't feel like figuring that out right now.
+ // Also, this may technically be faster generation since stack operations should be so common.
+ let rex = add_reg_extension(dst, REX_W);
+ let dst_mod = (dst as u8 % 8) << 3;
+ buf.reserve(8);
+ buf.extend(&[rex, 0x8B, 0x84 + dst_mod, 0x24]);
+ buf.extend(&offset.to_le_bytes());
+}
+
+/// `MOV r/m64,r64` -> Move r64 to r/m64.
+#[inline(always)]
+fn mov_stack32_reg64<'a>(buf: &mut Vec<'a, u8>, offset: i32, src: X86_64GPReg) {
+ // This can be optimized based on how many bytes the offset actually is.
+ // This function can probably be made to take any memory offset; I didn't feel like figuring that out right now.
+ // Also, this may technically be faster generation since stack operations should be so common.
+ let rex = add_reg_extension(src, REX_W);
+ let src_mod = (src as u8 % 8) << 3;
+ buf.reserve(8);
+ buf.extend(&[rex, 0x89, 0x84 + src_mod, 0x24]);
+ buf.extend(&offset.to_le_bytes());
+}
+
+/// `NEG r/m64` -> Two's complement negate r/m64.
+#[inline(always)]
+fn neg_reg64<'a>(buf: &mut Vec<'a, u8>, reg: X86_64GPReg) {
+ let rex = add_rm_extension(reg, REX_W);
+ let reg_mod = reg as u8 % 8;
+ buf.extend(&[rex, 0xF7, 0xD8 + reg_mod]);
+}
+
+/// `RET` -> Near return to calling procedure.
+#[inline(always)]
+fn ret<'a>(buf: &mut Vec<'a, u8>) {
+ buf.push(0xC3);
+}
+
+/// `SUB r/m64, imm32` -> Subtract imm32 sign-extended to 64-bits from r/m64.
+#[inline(always)]
+fn sub_reg64_imm32<'a>(buf: &mut Vec<'a, u8>, dst: X86_64GPReg, imm: i32) {
+ // This can be optimized if the immediate is 1 byte.
+ let rex = add_rm_extension(dst, REX_W);
+ let dst_mod = dst as u8 % 8;
+ buf.reserve(7);
+ buf.extend(&[rex, 0x81, 0xE8 + dst_mod]);
+ buf.extend(&imm.to_le_bytes());
+}
+
+/// `POP r64` -> Pop top of stack into r64; increment stack pointer. Cannot encode 32-bit operand size.
+#[inline(always)]
+fn pop_reg64<'a>(buf: &mut Vec<'a, u8>, reg: X86_64GPReg) {
+ let reg_mod = reg as u8 % 8;
+ if reg as u8 > 7 {
+ let rex = add_opcode_extension(reg, REX);
+ buf.extend(&[rex, 0x58 + reg_mod]);
+ } else {
+ buf.push(0x58 + reg_mod);
 }
 }
 
-impl Assembler for X86_64Assembler {
- // Below here are the functions for all of the assembly instructions.
- // Their names are based on the instruction and operators combined.
- // You should call `buf.reserve()` if you push or extend more than once.
- // Unit tests are added at the bottom of the file to ensure correct asm generation.
- // Please keep these in alphanumeric order.
-
- /// `ADD r/m64, imm32` -> Add imm32 sign-extended to 64-bits from r/m64.
- fn add_register64bit_immediate32bit<'a>(buf: &mut Vec<'a, u8>, dst: X86_64GPReg, imm: i32) {
- // This can be optimized if the immediate is 1 byte.
- let rex = add_rm_extension(dst, REX_W);
- let dst_mod = dst as u8 % 8;
- buf.reserve(7);
- buf.extend(&[rex, 0x81, 0xC0 + dst_mod]);
- buf.extend(&imm.to_le_bytes());
- }
-
- /// `ADD r/m64,r64` -> Add r64 to r/m64.
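The two stack moves above hard-code `0x84 + reg` and `0x24` after the opcode. Those are the ModRM and SIB bytes for a `[RSP + disp32]` operand; the little decoder below (illustrative only, not from this patch) spells out the fields of the `[0x48, 0x8B, 0x84, 0x24]` sequence that the `test_mov_reg64_stack32` case later in this file expects for RAX.

```rust
fn main() {
    // MOV RAX, [RSP + disp32] is emitted above as 0x48 0x8B 0x84 0x24 <disp32 LE>.
    let modrm: u8 = 0x84; // 0b10_000_100
    let sib: u8 = 0x24; // 0b00_100_100

    // ModRM: mod = 10 (32-bit displacement), reg = 000 (RAX), rm = 100 (SIB byte follows).
    assert_eq!(modrm >> 6, 0b10);
    assert_eq!((modrm >> 3) & 0b111, 0b000);
    assert_eq!(modrm & 0b111, 0b100);

    // SIB: scale = 00, index = 100 (no index), base = 100 (RSP).
    assert_eq!(sib >> 6, 0b00);
    assert_eq!((sib >> 3) & 0b111, 0b100);
    assert_eq!(sib & 0b111, 0b100);
}
```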
- fn add_register64bit_register64bit<'a>( - buf: &mut Vec<'a, u8>, - dst: X86_64GPReg, - src: X86_64GPReg, - ) { - let rex = add_rm_extension(dst, REX_W); - let rex = add_reg_extension(src, rex); - let dst_mod = dst as u8 % 8; - let src_mod = (src as u8 % 8) << 3; - buf.extend(&[rex, 0x01, 0xC0 + dst_mod + src_mod]); - } - - /// `CMOVL r64,r/m64` -> Move if less (SF≠ OF). - fn cmovl_register64bit_register64bit<'a>( - buf: &mut Vec<'a, u8>, - dst: X86_64GPReg, - src: X86_64GPReg, - ) { - let rex = add_reg_extension(dst, REX_W); - let rex = add_rm_extension(src, rex); - let dst_mod = (dst as u8 % 8) << 3; - let src_mod = src as u8 % 8; - buf.extend(&[rex, 0x0F, 0x4C, 0xC0 + dst_mod + src_mod]); - } - - /// `MOV r/m64, imm32` -> Move imm32 sign extended to 64-bits to r/m64. - fn mov_register64bit_immediate32bit<'a>(buf: &mut Vec<'a, u8>, dst: X86_64GPReg, imm: i32) { - let rex = add_rm_extension(dst, REX_W); - let dst_mod = dst as u8 % 8; - buf.reserve(7); - buf.extend(&[rex, 0xC7, 0xC0 + dst_mod]); - buf.extend(&imm.to_le_bytes()); - } - - /// `MOV r64, imm64` -> Move imm64 to r64. - fn mov_register64bit_immediate64bit<'a>(buf: &mut Vec<'a, u8>, dst: X86_64GPReg, imm: i64) { - if imm <= i32::MAX as i64 && imm >= i32::MIN as i64 { - Self::mov_register64bit_immediate32bit(buf, dst, imm as i32) - } else { - let rex = add_opcode_extension(dst, REX_W); - let dst_mod = dst as u8 % 8; - buf.reserve(10); - buf.extend(&[rex, 0xB8 + dst_mod]); - buf.extend(&imm.to_le_bytes()); - } - } - - /// `MOV r/m64,r64` -> Move r64 to r/m64. - fn mov_register64bit_register64bit<'a>( - buf: &mut Vec<'a, u8>, - dst: X86_64GPReg, - src: X86_64GPReg, - ) { - let rex = add_rm_extension(dst, REX_W); - let rex = add_reg_extension(src, rex); - let dst_mod = dst as u8 % 8; - let src_mod = (src as u8 % 8) << 3; - buf.extend(&[rex, 0x89, 0xC0 + dst_mod + src_mod]); - } - - /// `MOV r64,r/m64` -> Move r/m64 to r64. - fn mov_register64bit_stackoffset32bit<'a>( - buf: &mut Vec<'a, u8>, - dst: X86_64GPReg, - offset: i32, - ) { - // This can be optimized based on how many bytes the offset actually is. - // This function can probably be made to take any memory offset, I didn't feel like figuring it out rn. - // Also, this may technically be faster genration since stack operations should be so common. - let rex = add_reg_extension(dst, REX_W); - let dst_mod = (dst as u8 % 8) << 3; - buf.reserve(8); - buf.extend(&[rex, 0x8B, 0x84 + dst_mod, 0x24]); - buf.extend(&offset.to_le_bytes()); - } - - /// `MOV r/m64,r64` -> Move r64 to r/m64. - fn mov_stackoffset32bit_register64bit<'a>( - buf: &mut Vec<'a, u8>, - offset: i32, - src: X86_64GPReg, - ) { - // This can be optimized based on how many bytes the offset actually is. - // This function can probably be made to take any memory offset, I didn't feel like figuring it out rn. - // Also, this may technically be faster genration since stack operations should be so common. - let rex = add_reg_extension(src, REX_W); - let src_mod = (src as u8 % 8) << 3; - buf.reserve(8); - buf.extend(&[rex, 0x89, 0x84 + src_mod, 0x24]); - buf.extend(&offset.to_le_bytes()); - } - - /// `NEG r/m64` -> Two's complement negate r/m64. - fn neg_register64bit<'a>(buf: &mut Vec<'a, u8>, reg: X86_64GPReg) { - let rex = add_rm_extension(reg, REX_W); - let reg_mod = reg as u8 % 8; - buf.extend(&[rex, 0xF7, 0xD8 + reg_mod]); - } - - /// `RET` -> Near return to calling procedure. - fn ret<'a>(buf: &mut Vec<'a, u8>) { - buf.push(0xC3); - } - - /// `SUB r/m64, imm32` -> Subtract imm32 sign-extended to 64-bits from r/m64. 
- fn sub_register64bit_immediate32bit<'a>(buf: &mut Vec<'a, u8>, dst: X86_64GPReg, imm: i32) { - // This can be optimized if the immediate is 1 byte. - let rex = add_rm_extension(dst, REX_W); - let dst_mod = dst as u8 % 8; - buf.reserve(7); - buf.extend(&[rex, 0x81, 0xE8 + dst_mod]); - buf.extend(&imm.to_le_bytes()); - } - - /// `POP r64` -> Pop top of stack into r64; increment stack pointer. Cannot encode 32-bit operand size. - fn pop_register64bit<'a>(buf: &mut Vec<'a, u8>, reg: X86_64GPReg) { - let reg_mod = reg as u8 % 8; - if reg as u8 > 7 { - let rex = add_opcode_extension(reg, REX); - buf.extend(&[rex, 0x58 + reg_mod]); - } else { - buf.push(0x58 + reg_mod); - } - } - - /// `PUSH r64` -> Push r64, - fn push_register64bit<'a>(buf: &mut Vec<'a, u8>, reg: X86_64GPReg) { - let reg_mod = reg as u8 % 8; - if reg as u8 > 7 { - let rex = add_opcode_extension(reg, REX); - buf.extend(&[rex, 0x50 + reg_mod]); - } else { - buf.push(0x50 + reg_mod); - } +/// `PUSH r64` -> Push r64, +#[inline(always)] +fn push_reg64<'a>(buf: &mut Vec<'a, u8>, reg: X86_64GPReg) { + let reg_mod = reg as u8 % 8; + if reg as u8 > 7 { + let rex = add_opcode_extension(reg, REX); + buf.extend(&[rex, 0x50 + reg_mod]); + } else { + buf.push(0x50 + reg_mod); } } @@ -372,7 +508,7 @@ mod tests { const TEST_I64: i64 = 0x12345678_9ABCDEF0; #[test] - fn test_add_register64bit_immediate32bit() { + fn test_add_reg64_imm32() { let arena = bumpalo::Bump::new(); let mut buf = bumpalo::vec![in &arena]; for (dst, expected) in &[ @@ -380,14 +516,14 @@ mod tests { (X86_64GPReg::R15, [0x49, 0x81, 0xC7]), ] { buf.clear(); - X86_64Assembler::add_register64bit_immediate32bit(&mut buf, *dst, TEST_I32); + add_reg64_imm32(&mut buf, *dst, TEST_I32); assert_eq!(expected, &buf[..3]); assert_eq!(TEST_I32.to_le_bytes(), &buf[3..]); } } #[test] - fn test_add_register64bit_register64bit() { + fn test_add_reg64_reg64() { let arena = bumpalo::Bump::new(); let mut buf = bumpalo::vec![in &arena]; for ((dst, src), expected) in &[ @@ -397,13 +533,13 @@ mod tests { ((X86_64GPReg::R15, X86_64GPReg::R15), [0x4D, 0x01, 0xFF]), ] { buf.clear(); - X86_64Assembler::add_register64bit_register64bit(&mut buf, *dst, *src); + add_reg64_reg64(&mut buf, *dst, *src); assert_eq!(expected, &buf[..]); } } #[test] - fn test_cmovl_register64bit_register64bit() { + fn test_cmovl_reg64_reg64() { let arena = bumpalo::Bump::new(); let mut buf = bumpalo::vec![in &arena]; for ((dst, src), expected) in &[ @@ -425,13 +561,13 @@ mod tests { ), ] { buf.clear(); - X86_64Assembler::cmovl_register64bit_register64bit(&mut buf, *dst, *src); + cmovl_reg64_reg64(&mut buf, *dst, *src); assert_eq!(expected, &buf[..]); } } #[test] - fn test_mov_register64bit_immediate32bit() { + fn test_mov_reg64_imm32() { let arena = bumpalo::Bump::new(); let mut buf = bumpalo::vec![in &arena]; for (dst, expected) in &[ @@ -439,14 +575,14 @@ mod tests { (X86_64GPReg::R15, [0x49, 0xC7, 0xC7]), ] { buf.clear(); - X86_64Assembler::mov_register64bit_immediate32bit(&mut buf, *dst, TEST_I32); + mov_reg64_imm32(&mut buf, *dst, TEST_I32); assert_eq!(expected, &buf[..3]); assert_eq!(TEST_I32.to_le_bytes(), &buf[3..]); } } #[test] - fn test_mov_register64bit_immediate64bit() { + fn test_mov_reg64_imm64() { let arena = bumpalo::Bump::new(); let mut buf = bumpalo::vec![in &arena]; for (dst, expected) in &[ @@ -454,7 +590,7 @@ mod tests { (X86_64GPReg::R15, [0x49, 0xBF]), ] { buf.clear(); - X86_64Assembler::mov_register64bit_immediate64bit(&mut buf, *dst, TEST_I64); + mov_reg64_imm64(&mut buf, *dst, TEST_I64); 
assert_eq!(expected, &buf[..2]); assert_eq!(TEST_I64.to_le_bytes(), &buf[2..]); } @@ -463,14 +599,14 @@ mod tests { (X86_64GPReg::R15, [0x49, 0xC7, 0xC7]), ] { buf.clear(); - X86_64Assembler::mov_register64bit_immediate64bit(&mut buf, *dst, TEST_I32 as i64); + mov_reg64_imm64(&mut buf, *dst, TEST_I32 as i64); assert_eq!(expected, &buf[..3]); assert_eq!(TEST_I32.to_le_bytes(), &buf[3..]); } } #[test] - fn test_mov_register64bit_register64bit() { + fn test_mov_reg64_reg64() { let arena = bumpalo::Bump::new(); let mut buf = bumpalo::vec![in &arena]; for ((dst, src), expected) in &[ @@ -480,13 +616,13 @@ mod tests { ((X86_64GPReg::R15, X86_64GPReg::R15), [0x4D, 0x89, 0xFF]), ] { buf.clear(); - X86_64Assembler::mov_register64bit_register64bit(&mut buf, *dst, *src); + mov_reg64_reg64(&mut buf, *dst, *src); assert_eq!(expected, &buf[..]); } } #[test] - fn test_mov_register64bit_stackoffset32bit() { + fn test_mov_reg64_stack32() { let arena = bumpalo::Bump::new(); let mut buf = bumpalo::vec![in &arena]; for ((dst, offset), expected) in &[ @@ -494,14 +630,14 @@ mod tests { ((X86_64GPReg::R15, TEST_I32), [0x4C, 0x8B, 0xBC, 0x24]), ] { buf.clear(); - X86_64Assembler::mov_register64bit_stackoffset32bit(&mut buf, *dst, *offset); + mov_reg64_stack32(&mut buf, *dst, *offset); assert_eq!(expected, &buf[..4]); assert_eq!(TEST_I32.to_le_bytes(), &buf[4..]); } } #[test] - fn test_mov_stackoffset32bit_register64bit() { + fn test_mov_stack32_reg64() { let arena = bumpalo::Bump::new(); let mut buf = bumpalo::vec![in &arena]; for ((offset, src), expected) in &[ @@ -509,14 +645,14 @@ mod tests { ((TEST_I32, X86_64GPReg::R15), [0x4C, 0x89, 0xBC, 0x24]), ] { buf.clear(); - X86_64Assembler::mov_stackoffset32bit_register64bit(&mut buf, *offset, *src); + mov_stack32_reg64(&mut buf, *offset, *src); assert_eq!(expected, &buf[..4]); assert_eq!(TEST_I32.to_le_bytes(), &buf[4..]); } } #[test] - fn test_neg_register64bit() { + fn test_neg_reg64() { let arena = bumpalo::Bump::new(); let mut buf = bumpalo::vec![in &arena]; for (reg, expected) in &[ @@ -524,7 +660,7 @@ mod tests { (X86_64GPReg::R15, [0x49, 0xF7, 0xDF]), ] { buf.clear(); - X86_64Assembler::neg_register64bit(&mut buf, *reg); + neg_reg64(&mut buf, *reg); assert_eq!(expected, &buf[..]); } } @@ -533,12 +669,12 @@ mod tests { fn test_ret() { let arena = bumpalo::Bump::new(); let mut buf = bumpalo::vec![in &arena]; - X86_64Assembler::ret(&mut buf); + ret(&mut buf); assert_eq!(&[0xC3], &buf[..]); } #[test] - fn test_sub_register64bit_immediate32bit() { + fn test_sub_reg64_imm32() { let arena = bumpalo::Bump::new(); let mut buf = bumpalo::vec![in &arena]; for (dst, expected) in &[ @@ -546,14 +682,14 @@ mod tests { (X86_64GPReg::R15, [0x49, 0x81, 0xEF]), ] { buf.clear(); - X86_64Assembler::sub_register64bit_immediate32bit(&mut buf, *dst, TEST_I32); + sub_reg64_imm32(&mut buf, *dst, TEST_I32); assert_eq!(expected, &buf[..3]); assert_eq!(TEST_I32.to_le_bytes(), &buf[3..]); } } #[test] - fn test_pop_register64bit() { + fn test_pop_reg64() { let arena = bumpalo::Bump::new(); let mut buf = bumpalo::vec![in &arena]; for (dst, expected) in &[ @@ -561,13 +697,13 @@ mod tests { (X86_64GPReg::R15, vec![0x41, 0x5F]), ] { buf.clear(); - X86_64Assembler::pop_register64bit(&mut buf, *dst); + pop_reg64(&mut buf, *dst); assert_eq!(&expected[..], &buf[..]); } } #[test] - fn test_push_register64bit() { + fn test_push_reg64() { let arena = bumpalo::Bump::new(); let mut buf = bumpalo::vec![in &arena]; for (src, expected) in &[ @@ -575,7 +711,7 @@ mod tests { (X86_64GPReg::R15, vec![0x41, 
0x57]), ] { buf.clear(); - X86_64Assembler::push_register64bit(&mut buf, *src); + push_reg64(&mut buf, *src); assert_eq!(&expected[..], &buf[..]); } } diff --git a/compiler/gen_dev/src/object_builder.rs b/compiler/gen_dev/src/object_builder.rs index 6c16325d9c..588d206e23 100644 --- a/compiler/gen_dev/src/object_builder.rs +++ b/compiler/gen_dev/src/object_builder.rs @@ -1,4 +1,4 @@ -use crate::generic64::{x86_64, Backend64Bit}; +use crate::generic64::{aarch64, x86_64, Backend64Bit}; use crate::{Backend, Env, Relocation, INLINED_SYMBOLS}; use bumpalo::collections::Vec; use object::write; @@ -22,7 +22,7 @@ pub fn build_module<'a>( target: &Triple, procedures: MutMap<(symbol::Symbol, Layout<'a>), Proc<'a>>, ) -> Result { - let (mut output, mut backend) = match target { + match target { Triple { architecture: TargetArch::X86_64, binary_format: TargetBF::Elf, @@ -33,15 +33,42 @@ pub fn build_module<'a>( x86_64::X86_64Assembler, x86_64::X86_64SystemV, > = Backend::new(env, target)?; - Ok(( - Object::new(BinaryFormat::Elf, Architecture::X86_64, Endianness::Little), + build_object( + env, + procedures, backend, - )) + Object::new(BinaryFormat::Elf, Architecture::X86_64, Endianness::Little), + ) + } + Triple { + architecture: TargetArch::Aarch64(_), + binary_format: TargetBF::Elf, + .. + } => { + let backend: Backend64Bit< + aarch64::AArch64GPReg, + aarch64::AArch64Assembler, + aarch64::AArch64Call, + > = Backend::new(env, target)?; + build_object( + env, + procedures, + backend, + Object::new(BinaryFormat::Elf, Architecture::Aarch64, Endianness::Little), + ) } x => Err(format! { "the target, {:?}, is not yet implemented", x}), - }?; + } +} + +fn build_object<'a, B: Backend<'a>>( + env: &'a Env, + procedures: MutMap<(symbol::Symbol, Layout<'a>), Proc<'a>>, + mut backend: B, + mut output: Object, +) -> Result { let text = output.section_id(StandardSection::Text); let data_section = output.section_id(StandardSection::Data); let comment = output.add_section(vec![], b"comment".to_vec(), SectionKind::OtherString); diff --git a/compiler/gen_dev/tests/gen_num.rs b/compiler/gen_dev/tests/gen_num.rs index c2550052bc..7b73fe1d37 100644 --- a/compiler/gen_dev/tests/gen_num.rs +++ b/compiler/gen_dev/tests/gen_num.rs @@ -9,7 +9,7 @@ extern crate libc; #[macro_use] mod helpers; -#[cfg(all(test, target_os = "linux", target_arch = "x86_64"))] +#[cfg(all(test, target_os = "linux", any(target_arch = "x86_64"/*, target_arch = "aarch64"*/)))] mod gen_num { //use roc_std::RocOrder;
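One last detail from the new `Assembler` impl that deserves a note: `abs_reg64_reg64` lowers absolute value to the branchless `mov dst, src; neg dst; cmovl dst, src` sequence. A hedged Rust model of why that works is below; the `dst < 0` check stands in for CMOVL's SF != OF condition after the NEG, and the names are illustrative rather than taken from the patch.

```rust
// Model of the mov/neg/cmovl lowering used by abs_reg64_reg64; not from the patch.
fn branchless_abs(src: i64) -> i64 {
    let mut dst = src; // mov dst, src
    dst = dst.wrapping_neg(); // neg dst (sets SF/OF from the result)
    if dst < 0 {
        // cmovl dst, src: if the negation came out negative, src was positive,
        // so restore the original (already non-negative) value.
        dst = src;
    }
    dst
}

fn main() {
    assert_eq!(branchless_abs(7), 7);
    assert_eq!(branchless_abs(-7), 7);
    assert_eq!(branchless_abs(0), 0);
    // Like the hardware sequence, i64::MIN stays i64::MIN (negation overflows).
    assert_eq!(branchless_abs(i64::MIN), i64::MIN);
}
```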