diff --git a/.travis.yml b/.travis.yml index b32df5005..88a3865e0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -80,14 +80,14 @@ matrix: # All examples work - rust: nightly - env: JOB=examples-build + name: "examples - almost all examples" install: - *INSTALL_NODE_VIA_NVM - *INSTALL_AWS - npm install script: - | - for dir in `ls examples | grep -v README | grep -v asm.js | grep -v no_modules`; do + for dir in `ls examples | grep -v README | grep -v asm.js | grep -v raytrace | grep -v no_modules`; do (cd examples/$dir && sed -i "s|: \"webpack-dev-server\"|: \"webpack --output-path $HOME/$TRAVIS_BUILD_NUMBER/exbuild/$dir\"|" package.json && sed -i 's/npm install//' build.sh && @@ -96,6 +96,21 @@ matrix: done - if [ "$TRAVIS_PULL_REQUEST" = "false" ]; then aws s3 sync --quiet ~/$TRAVIS_BUILD_NUMBER s3://wasm-bindgen-ci/$TRAVIS_BUILD_NUMBER; fi if: branch = master + - rust: nightly + name: "examples - raytracer" + install: + - *INSTALL_AWS + - rustup component add rust-src + - curl -L https://github.com/japaric/xargo/releases/download/v0.3.11/xargo-v0.3.11-x86_64-unknown-linux-musl.tar.gz | tar xzf - + - export PATH=$PATH:`pwd` + script: + - sed -i 's/python/#python/' examples/raytrace-parallel/build.sh + - (cd examples/raytrace-parallel && ./build.sh) + - dst=$HOME/$TRAVIS_BUILD_NUMBER/exbuild/raytrace-parallel + - mkdir -p $dst + - cp examples/raytrace-parallel/*.{js,html,wasm} $dst + - if [ "$TRAVIS_PULL_REQUEST" = "false" ]; then aws s3 sync ~/$TRAVIS_BUILD_NUMBER s3://wasm-bindgen-ci/$TRAVIS_BUILD_NUMBER; fi + if: branch = master # The `web-sys` crate's tests pass - rust: beta diff --git a/Cargo.toml b/Cargo.toml index f558ada82..7947a6a0f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -72,6 +72,7 @@ members = [ "examples/no_modules", "examples/paint", "examples/performance", + "examples/raytrace-parallel", "examples/wasm-in-wasm", "examples/wasm2js", "examples/webaudio", diff --git a/crates/cli-support/Cargo.toml b/crates/cli-support/Cargo.toml index 
49f207221..e25704472 100644 --- a/crates/cli-support/Cargo.toml +++ b/crates/cli-support/Cargo.toml @@ -15,6 +15,7 @@ base64 = "0.9" failure = "0.1.2" parity-wasm = "0.35" tempfile = "3.0" -wasm-bindgen-shared = { path = "../shared", version = '=0.2.25' } -wasm-bindgen-wasm-interpreter = { path = "../wasm-interpreter", version = '=0.2.25' } wasm-bindgen-gc = { path = '../gc', version = '=0.2.25' } +wasm-bindgen-shared = { path = "../shared", version = '=0.2.25' } +wasm-bindgen-threads-xform = { path = '../threads-xform', version = '=0.2.25' } +wasm-bindgen-wasm-interpreter = { path = "../wasm-interpreter", version = '=0.2.25' } diff --git a/crates/cli-support/src/js/mod.rs b/crates/cli-support/src/js/mod.rs index 8d6d28457..f5349db33 100644 --- a/crates/cli-support/src/js/mod.rs +++ b/crates/cli-support/src/js/mod.rs @@ -6,7 +6,7 @@ use decode; use failure::{Error, ResultExt}; use parity_wasm::elements::*; use shared; -use wasm_bindgen_gc; +use gc; use super::Bindgen; use descriptor::{Descriptor, VectorKind}; @@ -388,12 +388,26 @@ impl<'a> Context<'a> { )) })?; + self.bind("__wbindgen_module", &|me| { + if !me.config.no_modules { + bail!("`wasm_bindgen::module` is currently only supported with \ + --no-modules"); + } + me.expose_add_heap_object(); + Ok(format!( + " + function() {{ + return addHeapObject(init.__wbindgen_wasm_module); + }} + ", + )) + })?; + self.bind("__wbindgen_rethrow", &|me| { me.expose_take_object(); Ok(String::from("function(idx) { throw takeObject(idx); }")) })?; - self.create_memory_export(); self.unexport_unused_internal_exports(); closures::rewrite(self)?; self.gc(); @@ -415,7 +429,76 @@ impl<'a> Context<'a> { self.rewrite_imports(module_name); - let mut js = if self.config.no_modules { + let mut js = if self.config.threads.is_some() { + // TODO: It's not clear right now how to best use threads with + // bundlers like webpack. 
We need a way to get the existing + // module/memory into web workers for now and we don't quite know + // idiomatically how to do that! In the meantime, always require + // `--no-modules` + if !self.config.no_modules { + bail!("must use `--no-modules` with threads for now") + } + self.memory(); // set `memory_limit` if it's not already set + let limits = match &self.memory_init { + Some(l) if l.shared() => l.clone(), + _ => bail!("must import a shared memory with threads"), + }; + + let mut memory = String::from("new WebAssembly.Memory({"); + memory.push_str(&format!("initial:{}", limits.initial())); + if let Some(max) = limits.maximum() { + memory.push_str(&format!(",maximum:{}", max)); + } + if limits.shared() { + memory.push_str(",shared:true"); + } + memory.push_str("})"); + + format!( + "\ +(function() {{ + var wasm; + var memory; + const __exports = {{}}; + {globals} + function init(module_or_path, maybe_memory) {{ + let result; + const imports = {{ './{module}': __exports }}; + if (module_or_path instanceof WebAssembly.Module) {{ + memory = __exports.memory = maybe_memory; + result = WebAssembly.instantiate(module_or_path, imports) + .then(instance => {{ + return {{ instance, module: module_or_path }} + }}); + }} else {{ + memory = __exports.memory = {init_memory}; + const response = fetch(module_or_path); + if (typeof WebAssembly.instantiateStreaming === 'function') {{ + result = WebAssembly.instantiateStreaming(response, imports); + }} else {{ + result = response + .then(r => r.arrayBuffer()) + .then(bytes => WebAssembly.instantiate(bytes, imports)); + }} + }} + return result.then(({{instance, module}}) => {{ + wasm = init.wasm = instance.exports; + init.__wbindgen_wasm_instance = instance; + init.__wbindgen_wasm_module = module; + init.__wbindgen_wasm_memory = __exports.memory; + }}); + }}; + self.{global_name} = Object.assign(init, __exports); +}})();", + globals = self.globals, + module = module_name, + global_name = self.config.no_modules_global + 
.as_ref() + .map(|s| &**s) + .unwrap_or("wasm_bindgen"), + init_memory = memory, + ) + } else if self.config.no_modules { format!( "\ (function() {{ @@ -637,20 +720,6 @@ impl<'a> Context<'a> { } } - fn create_memory_export(&mut self) { - let limits = match self.memory_init.clone() { - Some(limits) => limits, - None => return, - }; - let mut initializer = String::from("new WebAssembly.Memory({"); - initializer.push_str(&format!("initial:{}", limits.initial())); - if let Some(max) = limits.maximum() { - initializer.push_str(&format!(",maximum:{}", max)); - } - initializer.push_str("})"); - self.export("memory", &initializer, None); - } - fn rewrite_imports(&mut self, module_name: &str) { for (name, contents) in self._rewrite_imports(module_name) { self.export(&name, &contents, None); @@ -1621,7 +1690,7 @@ impl<'a> Context<'a> { fn gc(&mut self) { self.parse_wasm_names(); - wasm_bindgen_gc::Config::new() + gc::Config::new() .demangle(self.config.demangle) .keep_debug(self.config.keep_debug || self.config.debug) .run(&mut self.module); @@ -1671,7 +1740,6 @@ impl<'a> Context<'a> { _ => None, }).next() .expect("must import memory"); - assert_eq!(entry.module(), "env"); assert_eq!(entry.field(), "memory"); self.memory_init = Some(mem.limits().clone()); "memory" diff --git a/crates/cli-support/src/lib.rs b/crates/cli-support/src/lib.rs index bb7fddafa..72ee7eba7 100644 --- a/crates/cli-support/src/lib.rs +++ b/crates/cli-support/src/lib.rs @@ -3,10 +3,11 @@ extern crate parity_wasm; #[macro_use] extern crate wasm_bindgen_shared as shared; -extern crate wasm_bindgen_gc; +extern crate wasm_bindgen_gc as gc; #[macro_use] extern crate failure; extern crate wasm_bindgen_wasm_interpreter as wasm_interpreter; +extern crate wasm_bindgen_threads_xform as threads_xform; use std::collections::BTreeSet; use std::env; @@ -37,6 +38,9 @@ pub struct Bindgen { // Experimental support for `WeakRefGroup`, an upcoming ECMAScript feature. // Currently only enable-able through an env var. 
weak_refs: bool, + // Experimental support for the wasm threads proposal, transforms the wasm + // module to be "ready to be instantiated on any thread" + threads: Option<threads_xform::Config>, } enum Input { @@ -59,6 +63,7 @@ impl Bindgen { demangle: true, keep_debug: false, weak_refs: env::var("WASM_BINDGEN_WEAKREF").is_ok(), + threads: threads_config(), } } @@ -143,6 +148,11 @@ impl Bindgen { let programs = extract_programs(&mut module, &mut program_storage) .with_context(|_| "failed to extract wasm-bindgen custom sections")?; + if let Some(cfg) = &self.threads { + cfg.run(&mut module) + .with_context(|_| "failed to prepare module for threading")?; + } + // Here we're actually instantiating the module we've parsed above for // execution. Why, you might be asking, are we executing wasm code? A // good question! @@ -447,3 +457,20 @@ fn reset_indentation(s: &str) -> String { } return dst; } + +// Eventually these will all be CLI options, but while they're unstable features +// they're left as environment variables. We don't guarantee anything about +// backwards-compatibility with these options. 
+fn threads_config() -> Option<threads_xform::Config> { + if env::var("WASM_BINDGEN_THREADS").is_err() { + return None + } + let mut cfg = threads_xform::Config::new(); + if let Ok(s) = env::var("WASM_BINDGEN_THREADS_MAX_MEMORY") { + cfg.maximum_memory(s.parse().unwrap()); + } + if let Ok(s) = env::var("WASM_BINDGEN_THREADS_STACK_SIZE") { + cfg.thread_stack_size(s.parse().unwrap()); + } + Some(cfg) +} diff --git a/crates/threads-xform/Cargo.toml b/crates/threads-xform/Cargo.toml new file mode 100644 index 000000000..ffe4a1116 --- /dev/null +++ b/crates/threads-xform/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "wasm-bindgen-threads-xform" +version = "0.2.25" +authors = ["The wasm-bindgen Developers"] +license = "MIT/Apache-2.0" +repository = "https://github.com/rustwasm/wasm-bindgen/tree/master/crates/threads-xform" +homepage = "https://rustwasm.github.io/wasm-bindgen/" +documentation = "https://docs.rs/wasm-bindgen-threads-xform" +description = """ +Support for threading-related transformations in wasm-bindgen +""" + +[dependencies] +parity-wasm = "0.35" +failure = "0.1" diff --git a/crates/threads-xform/src/lib.rs b/crates/threads-xform/src/lib.rs new file mode 100644 index 000000000..518061009 --- /dev/null +++ b/crates/threads-xform/src/lib.rs @@ -0,0 +1,672 @@ +#[macro_use] +extern crate failure; +extern crate parity_wasm; + +use std::collections::HashMap; + +use failure::{Error, ResultExt}; +use parity_wasm::elements::*; + +const PAGE_SIZE: u32 = 1 << 16; + +/// Configuration for the transformation pass in this module. +/// +/// Created primarily through `new` and then executed through `run`. +pub struct Config { + maximum_memory: u32, + thread_stack_size: u32, +} + +impl Config { + /// Create a new configuration with default settings. + pub fn new() -> Config { + Config { + maximum_memory: 1 << 30, // 1GB + thread_stack_size: 1 << 20, // 1MB + } + } + + /// Specify the maximum amount of memory the wasm module can ever have. 
+ /// + /// We'll be specifying that the memory for this wasm module is shared, and + /// all shared memories must have their maximum limit specified (whereas + /// by default Rust/LLVM/LLD don't specify a maximum). + /// + /// The default for this option is 1GB, and this can be used to change + /// the maximum memory we'll be specifying. + /// + /// The `max` argument is in units of bytes. + /// + /// If the maximum memory is already specified this setting won't have any + /// effect. + pub fn maximum_memory(&mut self, max: u32) -> &mut Config { + self.maximum_memory = max; + self + } + + /// Specify the stack size for all threads spawned. + /// + /// The stack size is typically set by rustc as an argument to LLD and + /// defaults to 1MB for the main thread. All threads spawned by the + /// main thread, however, need to allocate their own stack! + /// + /// This configuration option indicates how large the stack of each child + /// thread will be. This will be allocated as part of the `start` function + /// and will be stored in LLVM's global stack pointer. + pub fn thread_stack_size(&mut self, size: u32) -> &mut Config { + self.thread_stack_size = size; + self + } + + /// Execute the transformation on the parsed wasm module specified. + /// + /// This function will prepare `Module` to be run on multiple threads, + /// performing steps such as: + /// + /// * All data segments are switched to "passive" data segments to ensure + /// they're only initialized once (coming later) + /// * If memory is exported from this module, it is instead switched to + /// being imported (with the same parameters). + /// * The imported memory is required to be `shared`, ensuring it's backed + /// by a `SharedArrayBuffer` on the web. + /// * A `global` for a thread ID is injected. + /// * Four bytes in linear memory are reserved for the counter of thread + /// IDs. 
+ /// A `start` function is injected (or prepended if one already exists) + /// which initializes memory for the first thread and otherwise allocates + /// thread ids for all threads. + /// + /// More and/or less may happen here over time, stay tuned! + pub fn run(&self, module: &mut Module) -> Result<(), Error> { + let segments = switch_data_segments_to_passive(module)?; + import_memory_zero(module)?; + share_imported_memory_zero(module, self.maximum_memory)?; + let stack_pointer_idx = find_stack_pointer(module)?; + let globals = inject_thread_globals(module); + let addr = inject_thread_id_counter(module)?; + start_with_init_memory( + module, + &segments, + &globals, + addr, + stack_pointer_idx, + self.thread_stack_size, + ); + implement_thread_intrinsics(module, &globals)?; + Ok(()) + } +} + +struct PassiveSegment { + idx: u32, + offset: u32, + len: u32, +} + +fn switch_data_segments_to_passive(module: &mut Module) + -> Result<Vec<PassiveSegment>, Error> +{ + // If there's no data, nothing to make passive! + let section = match module.data_section_mut() { + Some(section) => section, + None => return Ok(Vec::new()), + }; + + let mut ret = Vec::new(); + for (i, segment) in section.entries_mut().iter_mut().enumerate() { + let mut offset = match segment.offset_mut().take() { + Some(offset) => offset, + // already passive ... 
+ None => continue, + }; + assert!(!segment.passive()); + + let offset = *get_offset(&mut offset) + .with_context(|_| format!("failed to read data segment {}", i))?; + + // Flag it as passive after validation, and we've removed the offset via + // `take`, so time to process the next one + *segment.passive_mut() = true; + ret.push(PassiveSegment { + idx: i as u32, + offset: offset as u32, + len: segment.value().len() as u32, + }); + } + + Ok(ret) +} + +fn get_offset(offset: &mut InitExpr) -> Result<&mut i32, Error> { + if offset.code().len() != 2 || offset.code()[1] != Instruction::End { + bail!("unrecognized offset") + } + match &mut offset.code_mut()[0] { + Instruction::I32Const(n) => Ok(n), + _ => bail!("unrecognized offset"), + } +} + +fn import_memory_zero(module: &mut Module) + -> Result<(), Error> +{ + // If memory is exported, let's switch it to imported. If memory isn't + // exported then there's nothing to do as we'll deal with importing it + // later. + let limits = { + let section = match module.memory_section_mut() { + Some(section) => section, + None => return Ok(()), + }; + let limits = match section.entries_mut().pop() { + Some(limits) => limits, + None => return Ok(()), + }; + if section.entries().len() > 0 { + bail!("too many memories in wasm module for this tool to work"); + } + limits + }; + + // Remove all memory sections as well as exported memory, we're switching to + // an import + module.sections_mut().retain(|s| { + match s { + Section::Memory(_) => false, + _ => true, + } + }); + if let Some(s) = module.export_section_mut() { + s.entries_mut().retain(|s| { + match s.internal() { + Internal::Memory(_) => false, + _ => true, + } + }); + } + + // Add our new import to the import section + let pos = maybe_add_import_section(module); + let imports = match &mut module.sections_mut()[pos] { + Section::Import(s) => s, + _ => unreachable!(), + }; + + // Hardcode the field names for now, these are all internal details anyway + let entry = 
ImportEntry::new( + "env".to_string(), + "memory".to_string(), + External::Memory(limits), + ); + imports.entries_mut().push(entry); + Ok(()) +} + +fn maybe_add_import_section(module: &mut Module) -> usize { + let mut pos = None; + // See this URL for section orderings, but the import section comes just + // after the type section. + // + // https://github.com/WebAssembly/design/blob/master/BinaryEncoding.md#high-level-structure + for i in 0..module.sections().len() { + match &mut module.sections_mut()[i] { + Section::Type(_) => continue, + Section::Import(_) => return i, + _ => {} + } + pos = Some(i); + break + } + let empty = ImportSection::with_entries(Vec::new()); + let section = Section::Import(empty); + let len = module.sections().len(); + let pos = pos.unwrap_or_else(|| len - 1); + module.sections_mut().insert(pos, section); + return pos +} + +fn share_imported_memory_zero(module: &mut Module, memory_max: u32) -> Result<(), Error> { + assert!(memory_max % PAGE_SIZE == 0); + // NB: this function assumes `import_memory_zero` has been called first to + // function correctly, which means we should find an imported memory here + // which we can rewrite to be unconditionally shared. + let imports = match module.import_section_mut() { + Some(s) => s, + None => panic!("failed to find an import section"), + }; + + for entry in imports.entries_mut() { + let mem = match entry.external_mut() { + External::Memory(m) => m, + _ => continue, + }; + *mem = MemoryType::new( + mem.limits().initial(), + Some(mem.limits().maximum().unwrap_or(memory_max / PAGE_SIZE)), + true, + ); + return Ok(()) + } + panic!("failed to find an imported memory") +} + +struct Globals { + thread_id: u32, + thread_tcb: u32, +} + +fn inject_thread_globals(module: &mut Module) -> Globals { + let pos = maybe_add_global_section(module); + let globals = match &mut module.sections_mut()[pos] { + Section::Global(s) => s, + _ => unreachable!(), + }; + + // First up, our thread ID. 
The initial expression here isn't actually ever + // used but it's required. All threads will start off by setting this + // global to the thread's id. + globals.entries_mut().push(GlobalEntry::new( + GlobalType::new(ValueType::I32, true), + InitExpr::new(vec![Instruction::I32Const(0), Instruction::End]), + )); + + // Next up the thread TCB, this is always set to null to start off with. + globals.entries_mut().push(GlobalEntry::new( + GlobalType::new(ValueType::I32, true), + InitExpr::new(vec![Instruction::I32Const(0), Instruction::End]), + )); + + // ... and note that if either of the above globals isn't actually necessary + // we'll gc it away later. + + let len = globals.entries().len() as u32; + Globals { + thread_id: len - 2, + thread_tcb: len - 1, + } +} + +fn maybe_add_global_section(module: &mut Module) -> usize { + let mut pos = None; + // See this URL for section orderings: + // + // https://github.com/WebAssembly/design/blob/master/BinaryEncoding.md#high-level-structure + for i in 0..module.sections().len() { + match &mut module.sections_mut()[i] { + Section::Type(_) | + Section::Import(_) | + Section::Function(_) | + Section::Table(_) | + Section::Memory(_) => continue, + Section::Global(_) => return i, + _ => {} + } + pos = Some(i); + break + } + let empty = GlobalSection::with_entries(Vec::new()); + let section = Section::Global(empty); + let len = module.sections().len(); + let pos = pos.unwrap_or_else(|| len - 1); + module.sections_mut().insert(pos, section); + return pos +} + +fn inject_thread_id_counter(module: &mut Module) -> Result<u32, Error> { + // First up, look for a `__heap_base` export which is injected by LLD as + // part of the linking process. Note that `__heap_base` should in theory be + // *after* the stack and data, which means it's at the very end of the + // address space and should be safe for us to inject 4 bytes of data at. 
+ let heap_base = { + let exports = match module.export_section() { + Some(s) => s, + None => bail!("failed to find `__heap_base` for injecting thread id"), + }; + + exports.entries() + .iter() + .filter(|e| e.field() == "__heap_base") + .filter_map(|e| { + match e.internal() { + Internal::Global(idx) => Some(*idx), + _ => None, + } + }) + .next() + }; + let heap_base = match heap_base { + Some(idx) => idx, + None => bail!("failed to find `__heap_base` for injecting thread id"), + }; + + // Now we need to bump up `__heap_base` by 4 bytes as we'd like to reserve + // those 4 bytes for our thread id counter. Do lots of validation here to + // make sure that `__heap_base` is an non-mutable integer, and then do + // some logic: + // + // * We require that `__heap_base` is aligned to 4 as that's what the atomic + // will require anyway. + // * We *may* have to add another page to the minimum for this module. If by + // reserving 4 bytes the heap base now lies on a different page then we + // probably went past our minimum page requirement, so we'll need to + // update our memory limits to add one. + // + // Otherwise here we'll rewrite the `__heap_base` global's initializer to be + // 4 larger, reserving us those 4 bytes for a thread id counter. 
+ let (address, add_a_page) = { + let globals = match module.global_section_mut() { + Some(s) => s, + None => bail!("failed to find globals section"), + }; + let entry = match globals.entries_mut().get_mut(heap_base as usize) { + Some(i) => i, + None => bail!("the `__heap_base` export index is out of bounds"), + }; + if entry.global_type().content_type() != ValueType::I32 { + bail!("the `__heap_base` global doesn't have the type `i32`"); + } + if entry.global_type().is_mutable() { + bail!("the `__heap_base` global is unexpectedly mutable"); + } + let offset = get_offset(entry.init_expr_mut())?; + let address = (*offset as u32 + 3) & !3; // align up + let add_a_page = (address + 4) / PAGE_SIZE != address / PAGE_SIZE; + *offset = (address + 4) as i32; + (address, add_a_page) + }; + + if add_a_page { + add_one_to_imported_memory_limits_minimum(module); + } + Ok(address) +} + +// see `inject_thread_id_counter` for why this is used and where it's called +fn add_one_to_imported_memory_limits_minimum(module: &mut Module) { + let imports = match module.import_section_mut() { + Some(s) => s, + None => panic!("failed to find import section"), + }; + + for entry in imports.entries_mut() { + let mem = match entry.external_mut() { + External::Memory(m) => m, + _ => continue, + }; + *mem = MemoryType::new( + mem.limits().initial() + 1, + mem.limits().maximum().map(|m| { + if m == mem.limits().initial() { + m + 1 + } else { + m + } + }), + mem.limits().shared(), + ); + return; + } + panic!("failed to find an imported memory") +} + +fn find_stack_pointer(module: &mut Module) -> Result<Option<u32>, Error> { + let globals = match module.global_section() { + Some(s) => s, + None => bail!("failed to find the stack pointer"), + }; + let candidates = globals.entries() + .iter() + .enumerate() + .filter(|(_, g)| g.global_type().content_type() == ValueType::I32) + .filter(|(_, g)| g.global_type().is_mutable()) + .collect::<Vec<_>>(); + + // If there are no mutable i32 globals, assume this module doesn't 
even need + // a stack pointer! + if candidates.len() == 0 { + return Ok(None) + } + + // Currently LLVM/LLD always use global 0 as the stack pointer, let's just + // blindly assume that. + if candidates[0].0 == 0 { + return Ok(Some(0)) + } + + bail!("the first global wasn't a mutable i32, has LLD changed or was \ + this wasm file not produced by LLD?") +} + +fn start_with_init_memory( + module: &mut Module, + segments: &[PassiveSegment], + globals: &Globals, + addr: u32, + stack_pointer_idx: Option, + stack_size: u32, +) { + assert!(stack_size % PAGE_SIZE == 0); + let mut instrs = Vec::new(); + + // Execute an atomic add to learn what our thread ID is + instrs.push(Instruction::I32Const(addr as i32)); + instrs.push(Instruction::I32Const(1)); + let mem = parity_wasm::elements::MemArg { align: 2, offset: 0 }; + instrs.push(Instruction::I32AtomicRmwAdd(mem)); + + // Store this thread ID into our thread ID global + instrs.push(Instruction::TeeLocal(0)); + instrs.push(Instruction::SetGlobal(globals.thread_id)); + + // Perform an if/else based on whether we're the first thread or not. Our + // thread ID will be zero if we're the first thread, otherwise it'll be + // nonzero (assuming we don't overflow...) + // + // In the nonzero case (the first block) we give ourselves a stack via + // memory.grow and we update our stack pointer. + // + // In the zero case (the second block) we can skip both of those operations, + // but we need to initialize all our memory data segments. 
+ instrs.push(Instruction::GetLocal(0)); + instrs.push(Instruction::If(BlockType::NoResult)); + + if let Some(stack_pointer_idx) = stack_pointer_idx { + // local0 = grow_memory(stack_size); + instrs.push(Instruction::I32Const((stack_size / PAGE_SIZE) as i32)); + instrs.push(Instruction::GrowMemory(0)); + instrs.push(Instruction::SetLocal(0)); + + // if local0 == -1 then trap + instrs.push(Instruction::Block(BlockType::NoResult)); + instrs.push(Instruction::GetLocal(0)); + instrs.push(Instruction::I32Const(-1)); + instrs.push(Instruction::I32Ne); + instrs.push(Instruction::BrIf(0)); + instrs.push(Instruction::Unreachable); + instrs.push(Instruction::End); // end block + + // stack_pointer = local0 + stack_size + instrs.push(Instruction::GetLocal(0)); + instrs.push(Instruction::I32Const(PAGE_SIZE as i32)); + instrs.push(Instruction::I32Mul); + instrs.push(Instruction::I32Const(stack_size as i32)); + instrs.push(Instruction::I32Add); + instrs.push(Instruction::SetGlobal(stack_pointer_idx)); + } + + instrs.push(Instruction::Else); + for segment in segments { + // offset into memory + instrs.push(Instruction::I32Const(segment.offset as i32)); + // offset into segment + instrs.push(Instruction::I32Const(0)); // offset into segment + // amount to copy + instrs.push(Instruction::I32Const(segment.len as i32)); + instrs.push(Instruction::MemoryInit(segment.idx)); + } + instrs.push(Instruction::End); // endif + + // On all threads now memory segments are no longer needed + for segment in segments { + instrs.push(Instruction::MemoryDrop(segment.idx)); + } + + // If a start function previously existed we're done with our own + // initialization so delegate to them now. + if let Some(idx) = module.start_section() { + instrs.push(Instruction::Call(idx)); + } + + // End the function + instrs.push(Instruction::End); + + // Add this newly generated function to the code section ... 
+ let instrs = Instructions::new(instrs); + let local = Local::new(1, ValueType::I32); + let body = FuncBody::new(vec![local], instrs); + let code_idx = { + let s = module.code_section_mut().expect("module had no code"); + s.bodies_mut().push(body); + (s.bodies().len() - 1) as u32 + }; + // ... and also be sure to add its signature to the function section ... + let type_idx = { + let section = module.type_section_mut().expect("module has no type section"); + let pos = section.types() + .iter() + .map(|t| { + match t { + Type::Function(t) => t, + } + }) + .position(|t| t.params().is_empty() && t.return_type().is_none()); + match pos { + Some(i) => i as u32, + None => { + let f = FunctionType::new(Vec::new(), None); + section.types_mut().push(Type::Function(f)); + (section.types().len() - 1) as u32 + } + } + }; + module.function_section_mut() + .expect("module has no function section") + .entries_mut() + .push(Func::new(type_idx)); + + // ... and finally flag it as the new start function + let idx = code_idx + (module.import_count(ImportCountType::Function) as u32); + update_start_section(module, idx); +} + +fn update_start_section(module: &mut Module, start: u32) { + // See this URL for section orderings: + // + // https://github.com/WebAssembly/design/blob/master/BinaryEncoding.md#high-level-structure + let mut pos = None; + for i in 0..module.sections().len() { + match &mut module.sections_mut()[i] { + Section::Type(_) | + Section::Import(_) | + Section::Function(_) | + Section::Table(_) | + Section::Memory(_) | + Section::Global(_) | + Section::Export(_) => continue, + Section::Start(start_idx) => { + *start_idx = start; + return + } + _ => {} + } + pos = Some(i); + break + } + let section = Section::Start(start); + let len = module.sections().len(); + let pos = pos.unwrap_or_else(|| len - 1); + module.sections_mut().insert(pos, section); +} + +fn implement_thread_intrinsics( + module: &mut Module, + globals: &Globals, +) -> Result<(), Error> { + let mut map = 
HashMap::new(); + { + let imports = match module.import_section() { + Some(i) => i, + None => return Ok(()), + }; + let entries = imports.entries() + .iter() + .filter(|i| { + match i.external() { + External::Function(_) => true, + _ => false, + } + }) + .enumerate() + .filter(|(_, entry)| { + entry.module() == "__wbindgen_thread_xform__" + }); + for (idx, entry) in entries { + let type_idx = match entry.external() { + External::Function(i) => *i, + _ => unreachable!(), + }; + let types = module.type_section().unwrap(); + let fty = match &types.types()[type_idx as usize] { + Type::Function(f) => f, + }; + // Validate the type for this intrinsic + match entry.field() { + "__wbindgen_thread_id" => { + if !fty.params().is_empty() || + fty.return_type() != Some(ValueType::I32) + { + bail!("__wbindgen_thread_id intrinsic has the wrong signature"); + } + map.insert(idx as u32, Instruction::GetGlobal(globals.thread_id)); + } + "__wbindgen_tcb_get" => { + if !fty.params().is_empty() || + fty.return_type() != Some(ValueType::I32) + { + bail!("__wbindgen_tcb_get intrinsic has the wrong signature"); + } + map.insert(idx as u32, Instruction::GetGlobal(globals.thread_tcb)); + } + "__wbindgen_tcb_set" => { + if fty.params().len() != 1 || fty.return_type().is_some() { + bail!("__wbindgen_tcb_set intrinsic has the wrong signature"); + } + map.insert(idx as u32, Instruction::SetGlobal(globals.thread_tcb)); + } + other => bail!("unknown thread intrinsic: {}", other), + } + } + }; + + // Rewrite everything that calls `import_idx` to instead load the global + // `thread_id` + for body in module.code_section_mut().unwrap().bodies_mut() { + for instr in body.code_mut().elements_mut() { + let other = match instr { + Instruction::Call(idx) => { + match map.get(idx) { + Some(other) => other, + None => continue, + } + } + _ => continue, + }; + *instr = other.clone(); + } + } + + // ... and in theory we'd remove `import_idx` here but we let `wasm-gc` + // take care of that later. 
+ + Ok(()) +} diff --git a/examples/raytrace-parallel/.gitignore b/examples/raytrace-parallel/.gitignore new file mode 100644 index 000000000..6ea626597 --- /dev/null +++ b/examples/raytrace-parallel/.gitignore @@ -0,0 +1,3 @@ +package-lock.json +raytrace_parallel.js +raytrace_parallel_bg.wasm diff --git a/examples/raytrace-parallel/Cargo.toml b/examples/raytrace-parallel/Cargo.toml new file mode 100644 index 000000000..e2ae2e9db --- /dev/null +++ b/examples/raytrace-parallel/Cargo.toml @@ -0,0 +1,29 @@ +[package] +name = "raytrace-parallel" +version = "0.1.0" +authors = ["The wasm-bindgen Developers"] + +[lib] +crate-type = ["cdylib"] + +[dependencies] +console_error_panic_hook = "0.1" +futures = "0.1" +js-sys = { path = '../../crates/js-sys' } +raytracer = { git = 'https://github.com/alexcrichton/raytracer', branch = 'update-deps' } +wasm-bindgen = { path = "../..", features = ['serde-serialize'] } +wasm-bindgen-futures = { path = '../../crates/futures' } + +[dependencies.web-sys] +path = '../../crates/web-sys' +features = [ + 'CanvasRenderingContext2d', + 'ErrorEvent', + 'Event', + 'ImageData', + 'Navigator', + 'Window', + 'Worker', + 'DedicatedWorkerGlobalScope', + 'MessageEvent', +] diff --git a/examples/raytrace-parallel/README.md b/examples/raytrace-parallel/README.md new file mode 100644 index 000000000..eb3f8af9d --- /dev/null +++ b/examples/raytrace-parallel/README.md @@ -0,0 +1,17 @@ +# Parallel Raytracing + +[View documentation for this example online][dox] or [View compiled example +online][compiled] + +[dox]: https://rustwasm.github.io/wasm-bindgen/examples/raytrace.html +[compiled]: https://rustwasm.github.io/wasm-bindgen/exbuild/raytrace/ + +You can build the example locally with: + +``` +$ ./build.sh +``` + +(or running the commands on Windows manually) + +and then visiting http://localhost:8080 in a browser should run the example! 
diff --git a/examples/raytrace-parallel/Xargo.toml b/examples/raytrace-parallel/Xargo.toml new file mode 100644 index 000000000..883133e4b --- /dev/null +++ b/examples/raytrace-parallel/Xargo.toml @@ -0,0 +1,3 @@ +[dependencies.std] +stage = 0 +features = ['wasm-bindgen-threads'] diff --git a/examples/raytrace-parallel/build.sh b/examples/raytrace-parallel/build.sh new file mode 100755 index 000000000..fc52e0774 --- /dev/null +++ b/examples/raytrace-parallel/build.sh @@ -0,0 +1,25 @@ +#!/bin/sh + +set -ex + +# Two critical steps are required here to get this working: +# +# * First, the Rust standard library needs to be compiled. The default version +# is not compatible with atomics so we need to compile a version, with xargo, +# that is compatible. +# +# * Next we need to compile everything with the `atomics` feature enabled, +# ensuring that LLVM will generate atomic instructions and such. +RUSTFLAGS='-C target-feature=+atomics' \ + rustup run nightly xargo build --target wasm32-unknown-unknown --release + +# Threading support is disabled by default in wasm-bindgen, so use an env var +# here to turn it on for our bindings generation. Also note that webpack isn't +# currently compatible with atomics, so we go with the --no-modules output. +WASM_BINDGEN_THREADS=1 \ + cargo +nightly run --manifest-path ../../crates/cli/Cargo.toml \ + --bin wasm-bindgen -- \ + ../../target/wasm32-unknown-unknown/release/raytrace_parallel.wasm --out-dir . \ + --no-modules + +python3 -m http.server diff --git a/examples/raytrace-parallel/index.html b/examples/raytrace-parallel/index.html new file mode 100644 index 000000000..222588758 --- /dev/null +++ b/examples/raytrace-parallel/index.html @@ -0,0 +1,226 @@ + + + + + + + + + +
+

Concurrency: 1

+
+ +
+
+ Render duration: +

+
+ + + + + + + + + diff --git a/examples/raytrace-parallel/index.js b/examples/raytrace-parallel/index.js new file mode 100644 index 000000000..9d3e566ea --- /dev/null +++ b/examples/raytrace-parallel/index.js @@ -0,0 +1,119 @@ +const button = document.getElementById('render'); +const canvas = document.getElementById('canvas'); +const scene = document.getElementById('scene'); +const concurrency = document.getElementById('concurrency'); +const concurrencyAmt = document.getElementById('concurrency-amt'); +const timing = document.getElementById('timing'); +const timingVal = document.getElementById('timing-val'); +const ctx = canvas.getContext('2d'); + +button.disabled = true; +concurrency.disabled = true; + +// First up, but try to do feature detection to provide better error messages +function loadWasm() { + if (typeof SharedArrayBuffer !== 'function') { + alert('this browser does not have SharedArrayBuffer support enabled'); + return + } + // Test for bulk memory operations with passive data segments + // (module (memory 1) (data passive "")) + const buf = new Uint8Array([0x00, 0x61, 0x73, 0x6d, 0x01, 0x00, 0x00, 0x00, + 0x05, 0x03, 0x01, 0x00, 0x01, 0x0b, 0x03, 0x01, 0x01, 0x00]); + if (!WebAssembly.validate(buf)) { + alert('this browser does not support passive wasm memory, demo does not work'); + return + } + + wasm_bindgen('./raytrace_parallel_bg.wasm') + .then(run) + .catch(console.error); +} + +loadWasm(); + +const { Scene, WorkerPool } = wasm_bindgen; + +function run() { + // The maximal concurrency of our web worker pool is `hardwareConcurrency`, + // so set that up here and this ideally is the only location we create web + // workers. + pool = new WorkerPool(navigator.hardwareConcurrency); + + // Configure various buttons and such. 
+ button.onclick = function() { + console.time('render'); + let json; + try { + json = JSON.parse(scene.value); + } catch(e) { + alert(`invalid json: ${e}`); + return + } + canvas.width = json.width; + canvas.height = json.height; + render(new Scene(json)); + }; + button.innerText = 'Render!'; + button.disabled = false; + + concurrency.oninput = function() { + concurrencyAmt.innerText = 'Concurrency: ' + concurrency.value; + }; + concurrency.min = 1; + concurrency.step = 1; + concurrency.max = navigator.hardwareConcurrency; + concurrency.value = concurrency.max; + concurrency.oninput(); + concurrency.disabled = false; +} + +let rendering = null; +let start = null; +let interval = null; +let pool = null; + +class State { + constructor(wasm) { + this.start = performance.now(); + this.wasm = wasm; + this.running = true; + this.counter = 1; + + this.interval = setInterval(() => this.updateTimer(), 100); + + wasm.promise() + .then(() => { + this.updateTimer(); + this.stop(); + }) + .catch(console.error); + } + + updateTimer() { + const dur = performance.now() - this.start; + timingVal.innerText = `${dur}ms`; + this.counter += 1; + if (this.wasm && this.counter % 3 == 0) + this.wasm.requestUpdate(); + } + + stop() { + if (!this.running) + return; + console.timeEnd('render'); + this.running = false; + pool = this.wasm.cancel(); // this frees `wasm`, returning the worker pool + this.wasm = null; + clearInterval(this.interval); + } +} + +function render(scene) { + if (rendering) { + rendering.stop(); + rendering = null; + } + rendering = new State(scene.render(concurrency.value, pool, ctx)); + pool = null; // previous call took ownership of `pool`, zero it out here too +} diff --git a/examples/raytrace-parallel/src/lib.rs b/examples/raytrace-parallel/src/lib.rs new file mode 100644 index 000000000..c8d4a3ff1 --- /dev/null +++ b/examples/raytrace-parallel/src/lib.rs @@ -0,0 +1,364 @@ +extern crate futures; +extern crate js_sys; +extern crate raytracer; +extern crate 
wasm_bindgen; +extern crate web_sys; + +use std::cell::RefCell; +use std::cmp; +use std::rc::Rc; +use std::sync::atomic::{AtomicUsize, AtomicBool, Ordering::SeqCst}; +use std::sync::atomic::ATOMIC_USIZE_INIT; +use std::sync::{Arc, Mutex, MutexGuard}; + +use futures::Future; +use futures::sync::oneshot; +use js_sys::{Promise, Error, WebAssembly, Uint8ClampedArray, Array}; +use wasm_bindgen::JsCast; +use wasm_bindgen::prelude::*; +use web_sys::{CanvasRenderingContext2d, Worker, Event, ErrorEvent}; +use web_sys::{DedicatedWorkerGlobalScope, MessageEvent}; + +macro_rules! console_log { + ($($t:tt)*) => (log(&format_args!($($t)*).to_string())) +} + +#[wasm_bindgen] +extern "C" { + #[wasm_bindgen(js_namespace = console)] + fn log(s: &str); +} + +#[wasm_bindgen] +pub struct Scene { + inner: raytracer::scene::Scene, +} + +static NEXT_ID: AtomicUsize = ATOMIC_USIZE_INIT; + +#[wasm_bindgen] +impl Scene { + #[wasm_bindgen(constructor)] + pub fn new(object: &JsValue) -> Result { + console_error_panic_hook::set_once(); + Ok(Scene { + inner: object.into_serde() + .map_err(|e| JsValue::from(e.to_string()))?, + }) + } + + pub fn render( + self, + concurrency: usize, + pool: WorkerPool, + ctx: CanvasRenderingContext2d, + ) -> Result { + let (tx, rx) = oneshot::channel(); + let rx = rx.then(|_| Ok(JsValue::undefined())); + + let data = Rc::new(RefCell::new(None::)); + + let pixels = (self.inner.width * self.inner.height) as usize; + let mut r = Render { + tx: Some(tx), + callback: None, + shared: Arc::new(Shared { + id: NEXT_ID.fetch_add(1, SeqCst), + need_update: AtomicBool::new(false), + scene: self.inner, + next_pixel: AtomicUsize::new(0), + remaining: AtomicUsize::new(concurrency), + rgb_data: Mutex::new(vec![0; 4 * pixels]), + }), + ctx, + }; + + let data2 = data.clone(); + let callback = Closure::wrap(Box::new(move |msg: Event| -> Result<(), JsValue> { + let mut slot = data2.borrow_mut(); + if let Some(mut data) = slot.take() { + match data.event(&msg) { + Ok(true) => {} + 
Ok(false) => *slot = Some(data), + Err(e) => { + *slot = Some(data); + return Err(e) + } + } + } + Ok(()) + }) as Box Result<(), JsValue>>); + + for worker in &pool.workers[..concurrency] { + let ptr_to_send = Arc::into_raw(r.shared.clone()) as u32; + let ptr_to_send = JsValue::from(ptr_to_send); + worker.post_message(&ptr_to_send)?; + worker.set_onmessage(Some(callback.as_ref().unchecked_ref())); + worker.set_onerror(Some(callback.as_ref().unchecked_ref())); + } + + r.callback = Some(callback); + *data.borrow_mut() = Some(r); + + Ok(RenderingScene { + inner: data, + promise: wasm_bindgen_futures::future_to_promise(rx), + pool, + }) + } +} + +#[wasm_bindgen] +pub struct WorkerPool { + workers: Vec, + callback: Closure, +} + +#[wasm_bindgen] +impl WorkerPool { + #[wasm_bindgen(constructor)] + pub fn new(max: u32) -> Result { + let callback = Closure::wrap(Box::new(|event: Event| { + console_log!("unhandled event: {}", event.type_()); + }) as Box); + let mut workers = Vec::new(); + for _ in 0..max { + // TODO: what do do about `./worker.js`: + // + // * the path is only known by the bundler. How can we, as a + // library, know what's going on? + // * How do we not fetch a script N times? It internally then + // causes another script to get fetched N times... + let worker = Worker::new("./worker.js")?; + let array = js_sys::Array::new(); + array.push(&wasm_bindgen::module()); + + // TODO: memory allocation error handling here is hard: + // + // * How to we make sure that our strong ref made it to a client + // thread? + // * Need to handle the `?` on `post_message` as well. 
+ array.push(&wasm_bindgen::memory()); + worker.post_message(array.as_ref())?; + worker.set_onmessage(Some(callback.as_ref().unchecked_ref())); + worker.set_onerror(Some(callback.as_ref().unchecked_ref())); + workers.push(worker); + } + + Ok(WorkerPool { + workers, + callback, + }) + } +} + +impl Drop for WorkerPool { + fn drop(&mut self) { + for worker in self.workers.iter() { + worker.terminate(); + } + } +} + +#[wasm_bindgen] +pub struct RenderingScene { + inner: Rc>>, + promise: Promise, + pool: WorkerPool, +} + +#[wasm_bindgen] +impl RenderingScene { + pub fn promise(&self) -> Promise { + self.promise.clone() + } + + #[wasm_bindgen(js_name = requestUpdate)] + pub fn request_update(&self) { + if let Some(render) = self.inner.borrow().as_ref() { + render.shared.need_update.store(true, SeqCst); + } + } + + pub fn cancel(self) -> WorkerPool { + if let Some(render) = self.inner.borrow_mut().take() { + // drain the rest of the pixels to cause all workers to cancel ASAP. + let pixels = render.shared.scene.width * render.shared.scene.height; + render.shared.next_pixel.fetch_add(pixels as usize, SeqCst); + } + for worker in self.pool.workers.iter() { + worker.set_onmessage(Some(&self.pool.callback.as_ref().unchecked_ref())); + worker.set_onerror(Some(&self.pool.callback.as_ref().unchecked_ref())); + } + self.pool + } +} + +struct Render { + callback: Option Result<(), JsValue>>>, + tx: Option>, + shared: Arc, + ctx: CanvasRenderingContext2d, +} + +struct Shared { + id: usize, + need_update: AtomicBool, + scene: raytracer::scene::Scene, + next_pixel: AtomicUsize, + remaining: AtomicUsize, + rgb_data: Mutex>, +} + +#[wasm_bindgen] +extern { + type ImageData; + + #[wasm_bindgen(constructor, catch)] + fn new(data: &Uint8ClampedArray, width: f64, height: f64) -> Result; +} + +impl Render { + fn event(&mut self, event: &Event) -> Result { + if let Some(error) = event.dyn_ref::() { + let msg = format!("error in worker: {}", error.message()); + return 
Err(Error::new(&msg).into()); + } + + if let Some(msg) = event.dyn_ref::() { + let data = msg.data(); + if let Some(data) = data.dyn_ref::() { + let id = data.pop(); + let done = data.pop(); + let image = data.pop(); + if let Some(id) = id.as_f64() { + if id == self.shared.id as f64 { + self.ctx.put_image_data(image.unchecked_ref(), 0.0, 0.0)?; + return Ok(done.as_bool() == Some(true)) + } + } + } + console_log!("unhandled message: {:?}", data); + return Ok(false) + } + + console_log!("unhandled event: {}", event.type_()); + + Ok(false) + } +} + +#[wasm_bindgen] +pub fn child_entry_point(ptr: u32) -> Result<(), JsValue> { + let ptr = unsafe { + Arc::from_raw(ptr as *const Shared) + }; + assert_send(&ptr); + + let global = js_sys::global() + .unchecked_into::(); + ptr.work(&global)?; + + return Ok(()); + + fn assert_send(_: &T) {} +} + +impl Shared { + fn work(&self, global: &DedicatedWorkerGlobalScope) -> Result<(), JsValue> { + // Once we're done raytracing a pixel we need to actually write its rgb + // value into the shared memory buffer for our image. This, however, + // requires synchronization with other threads (as currently + // implemented). To help amortize the cost of synchronization each + // thread processes a chunk of pixels at a time, and this number is how + // many pixes will be rendered synchronously before committing them back + // to memory. + const BLOCK: usize = 1024; + + let width = self.scene.width as usize; + let height = self.scene.height as usize; + let end = width * height; + + // Thread-local storage for our RGB data, commited back in one batch to + // the main image memory. + let mut local_rgb = [0; BLOCK * 4]; + + loop { + // First up, grab a block of pixels to render using an atomic add. + // If we're beyond the end then we're done! + let start = self.next_pixel.fetch_add(BLOCK, SeqCst); + if start >= end { + break + } + + // Raytrace all our pixels synchronously, writing all the results + // into our local memory buffer. 
+ let len = cmp::min(end, start + BLOCK) - start; + for (i, dst) in local_rgb.chunks_mut(4).enumerate().take(len) { + let x = (start + i) % width; + let y = (start + i) / width; + let ray = raytracer::Ray::create_prime(x as u32, y as u32, &self.scene); + let result = raytracer::cast_ray(&self.scene, &ray, 0).to_rgba(); + dst[0] = result.data[0]; + dst[1] = result.data[1]; + dst[2] = result.data[2]; + dst[3] = result.data[3]; + } + + // Ok, time to synchronize and commit this data back into the main + // image buffer for other threads and the main thread to see. + let mut data = self.rgb_data.lock().unwrap(); + data[start * 4..(start + len) * 4] + .copy_from_slice(&mut local_rgb[..len * 4]); + + // As a "nifty feature" we try to have a live progressive rendering. + // That means that we need to periodically send an `ImageData` to + // the main thread. Do so whenever the main thread requests it. + if self.need_update.swap(false, SeqCst) { + self.update_image(false, data, global)?; + } + } + + + // If we're the last thread out, be sure to update the main thread's + // image as this is the last chance we'll get! + if self.remaining.fetch_sub(1, SeqCst) == 1 { + let data = self.rgb_data.lock().unwrap(); + self.update_image(true, data, global)?; + } + + Ok(()) + } + + fn update_image( + &self, + done: bool, + data: MutexGuard>, + global: &DedicatedWorkerGlobalScope, + ) -> Result<(), JsValue> { + // This is pretty icky. We can't create an `ImageData` backed by + // `SharedArrayBuffer`, so we need to copy the memory into a local + // JS array using `slice`. This means we can't use + // `web_sys::ImageData` right now but rather we have to use our own + // binding. 
+ let mem = wasm_bindgen::memory() + .unchecked_into::(); + let mem = Uint8ClampedArray::new(&mem.buffer()) + .slice( + data.as_ptr() as u32, + data.as_ptr() as u32 + data.len() as u32, + ); + drop(data); // unlock the lock, we've copied the data now + let data = ImageData::new( + &mem, + self.scene.width as f64, + self.scene.height as f64, + )?; + let arr = Array::new(); + arr.push(data.as_ref()); + arr.push(&JsValue::from(done)); + arr.push(&JsValue::from(self.id as f64)); + global.post_message(arr.as_ref())?; + Ok(()) + } +} diff --git a/examples/raytrace-parallel/worker.js b/examples/raytrace-parallel/worker.js new file mode 100644 index 000000000..dae9ae087 --- /dev/null +++ b/examples/raytrace-parallel/worker.js @@ -0,0 +1,32 @@ +// synchronously, using the browser, import out shim JS scripts +importScripts('raytrace_parallel.js'); + +let booted = false; +let lastPtr = null; + +// Wait for the main thread to send us the shared module/memory. Once we've got +// it, initialize it all with the `wasm_bindgen` global we imported via +// `importScripts`. +// +// After our first message all subsequent messages are an entry point to run, +// so we just do that. 
+self.onmessage = function(args) { + self.onmessage = event => run(event.data); + const [module, memory] = args.data; + wasm_bindgen(module, memory) + .then(() => { + booted = true; + if (lastPtr) + run(lastPtr); + }) + .catch(e => setTimeout(() => { throw e; })); // propagate to main `onerror` +}; + +function run(ptr) { + if (!booted) { + lastPtr = ptr; + return; + } + lastPtr = null; + wasm_bindgen.child_entry_point(ptr); +} diff --git a/guide/src/SUMMARY.md b/guide/src/SUMMARY.md index 0bf293de6..a119dc336 100644 --- a/guide/src/SUMMARY.md +++ b/guide/src/SUMMARY.md @@ -26,6 +26,7 @@ - [web-sys: WebAudio](./examples/web-audio.md) - [web-sys: WebGL](./examples/webgl.md) - [web-sys: A Simple Paint Program](./examples/paint.md) + - [Parallel Raytracing](./examples/raytrace.md) - [Reference](./reference/index.md) - [Passing Rust Closures to JS](./reference/passing-rust-closures-to-js.md) - [Receiving JS Closures in Rust](./reference/receiving-js-closures-in-rust.md) diff --git a/guide/src/examples/raytrace.md b/guide/src/examples/raytrace.md new file mode 100644 index 000000000..b061a1c54 --- /dev/null +++ b/guide/src/examples/raytrace.md @@ -0,0 +1,26 @@ +# Parallel Raytracing + +[View full source code][code] or [view the compiled example online][online] + +[online]: https://rustwasm.github.io/wasm-bindgen/exbuild/raytrace-parallel/ +[code]: https://github.com/rustwasm/wasm-bindgen/tree/master/examples/raytrace-parallel + +**This is an unstable and experimental example** of using threads with +WebAssembly and Rust, culminating in a parallel raytracer demo. Current +requirements for viewing this demo are: + +* Firefox Nightly - other browsers haven't implemented the proposed WebAssembly + features yet. +* `SharedArrayBuffer` is enabled in `about:config` in Firefox + +This demo may also break over time as Firefox updates, but we'll try to keep it +working! 
+ +Locally to build this demo you'll need `xargo` and the `rust-src` rustup +component, and afterwards `./build.sh` like other examples should build the +example. + +Again, to reiterate, this is all experimental and we're working through various +issues as we're working on this. If you're curious to see how this works it's +best to explore via the source code right now! More info will be available here +once WebAssembly threads are closer to stabilization. diff --git a/src/lib.rs b/src/lib.rs index 2b89cb918..3b8624698 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -491,6 +491,7 @@ externs! { fn __wbindgen_jsval_eq(a: u32, b: u32) -> u32; fn __wbindgen_memory() -> u32; + fn __wbindgen_module() -> u32; } impl Clone for JsValue { @@ -633,6 +634,19 @@ pub fn throw_val(s: JsValue) -> ! { } } +/// Returns a handle to this wasm instance's `WebAssembly.Module` +/// +/// Note that this is only available when the final wasm app is built with +/// `--no-modules`, it's not recommended to rely on this API yet! This is +/// largely just an experimental addition to enable threading demos. Using this +/// may prevent your wasm module from building down the road. +#[doc(hidden)] +pub fn module() -> JsValue { + unsafe { + JsValue::_new(__wbindgen_module()) + } +} + /// Returns a handle to this wasm instance's `WebAssembly.Memory` pub fn memory() -> JsValue { unsafe {