diff --git a/CHANGELOG.md b/CHANGELOG.md
index 681fa59a..db2f658a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,11 +11,15 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 ### Added

 ### Fixed
+- Multi-loss now casts to the type of the first loss before accumulation (previously aborted due to the missing cast)
+- Throw `ShapeSizeException` if the total expanded shape size exceeds the numeric capacity of `int` (2^31-1)
+- During mini-batch fitting, catch `ShapeSizeException` and try another sizing hint; outside of mini-batch fitting the exception still aborts
 - Fix incorrect/missing gradient accumulation with delay > 1 or large effective batch size of biases of affine operations.
 - Fixed case augmentation with multi-threaded reading.
 - Scripts using PyYAML now use `safe_load`; see https://msg.pyyaml.org/load

 ### Changed
+- A negative `--workspace -N` value allocates the workspace as the total available GPU memory minus N megabytes.
 - Set default parameters for cost-scaling to 8.f 10000 1.f 8.f, i.e. when scaling, scale by 8 and do not try to automatically scale up or down. This seems most stable.
 - Make guided-alignment faster via a sparse memory layout, add alignment points for EOS, remove losses other than CE.
 - Changed the minimal C++ standard to C++17
diff --git a/VERSION b/VERSION
index a130ad69..77418c85 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-v1.11.6
+v1.11.7
diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp
index e3ac2108..404b43f1 100644
--- a/src/common/config_parser.cpp
+++ b/src/common/config_parser.cpp
@@ -118,8 +118,8 @@ void ConfigParser::addOptionsGeneral(cli::CLIWrapper& cli) {
     ->implicit_val("basic");
   cli.add<std::vector<std::string>>("--config,-c",
      "Configuration file(s). If multiple, later overrides earlier");
-  cli.add<size_t>("--workspace,-w",
-     "Preallocate arg MB of work space",
+  cli.add<int>("--workspace,-w",
+     "Preallocate arg MB of work space. Negative `--workspace -N` value allocates workspace as total available GPU memory minus N megabytes.",
      defaultWorkspace);
   cli.add<std::string>("--log",
      "Log training process information to file given by arg");
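As an illustration of the new `--workspace` semantics, here is a minimal standalone sketch of the size computation. This is not Marian's implementation (that lives in `ExpressionGraph::reserveWorkspaceMB` further down); `workspaceBytes` and `totalGpuBytes` are hypothetical names used only for this example.

#include <cstddef>  // size_t
#include <cstdlib>  // std::abs

// A positive value preallocates N MB; a negative value leaves |N| MB free
// and uses the remaining device memory as workspace.
size_t workspaceBytes(int workspaceMB, size_t totalGpuBytes) {
  const size_t MB = 1024 * 1024;
  if(workspaceMB > 0)
    return (size_t)workspaceMB * MB;
  size_t keptFree = (size_t)std::abs(workspaceMB) * MB;
  return totalGpuBytes - keptFree;  // caller must ensure keptFree < totalGpuBytes
}

For example, `--workspace -2000` on a 16 GB card reserves roughly 14 GB of workspace.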
diff --git a/src/common/shape.h b/src/common/shape.h
index 59e6cf21..270b3537 100644
--- a/src/common/shape.h
+++ b/src/common/shape.h
@@ -12,6 +12,26 @@
 namespace marian {

+class ShapeSizeException : public std::exception {
+private:
+  char* message_;
+
+public:
+  ShapeSizeException(size_t available, size_t asked) {
+    std::string mstr = "Expanded shape size " + std::to_string(asked)
+                     + " exceeds numeric capacity " + std::to_string(available);
+
+    message_ = new char[mstr.size() + 1];
+    std::copy(mstr.begin(), mstr.end(), message_);
+    message_[mstr.size()] = 0;
+  }
+
+  ~ShapeSizeException() { delete[] message_; }
+
+  virtual const char* what() const noexcept override { return message_; }
+};
+
+
 struct Slice // Python-like slice/index descriptor
 {
   Slice(int b, int e, int s) : begin(b), end(e), stride(s) {}
@@ -110,10 +130,12 @@ public:
   template <typename T = size_t> // using a template so that FactoredSegmenter, which uses this as well, can pass size_t
   inline T elements() const {
-    T el = 1;
+    size_t el = 1;
     for(auto s : shape_)
-      el *= (T)s;
-    return el;
+      el *= (size_t)s;
+    if(el > (size_t)std::numeric_limits<T>::max())
+      throw ShapeSizeException(std::numeric_limits<T>::max(), el);
+    return (T)el;
   }

   inline void dims(int i, std::vector<int>& d) const {
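A minimal sketch of how the new overflow check surfaces to callers (an illustrative program, not part of the patch; it assumes `Shape`'s initializer-list construction): `elements<int>()` now throws once the element count exceeds 2^31-1 instead of silently truncating.

#include <iostream>
#include "common/shape.h"  // marian::Shape, marian::ShapeSizeException

int main() {
  marian::Shape big({65536, 65536});     // 2^32 elements in total
  try {
    int n = big.elements<int>();         // exceeds int capacity -> throws
    std::cout << n << std::endl;
  } catch(const marian::ShapeSizeException& e) {
    std::cerr << e.what() << std::endl;  // "Expanded shape size 4294967296 exceeds numeric capacity 2147483647"
  }
  size_t m = big.elements();             // default T = size_t still fits
  std::cout << m << std::endl;
  return 0;
}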
diff --git a/src/embedder/embedder.h b/src/embedder/embedder.h
index f2e4a10c..36b3df44 100644
--- a/src/embedder/embedder.h
+++ b/src/embedder/embedder.h
@@ -84,7 +84,7 @@ public:
       auto precision = options_->get<std::vector<std::string>>("precision", {"float32"});
       graph->setDefaultElementType(typeFromString(precision[0])); // only use first type, used for parameter type in graph
       graph->setDevice(device);
-      graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
+      graph->reserveWorkspaceMB(options_->get<int>("workspace"));

       graphs_.push_back(graph);
     }
diff --git a/src/graph/expression_graph.cpp b/src/graph/expression_graph.cpp
index 12a1195e..146f7c4c 100644
--- a/src/graph/expression_graph.cpp
+++ b/src/graph/expression_graph.cpp
@@ -23,6 +23,23 @@ void ExpressionGraph::setDevice(DeviceId deviceId, Ptr<Device> device) {
   }
 }

+void ExpressionGraph::reserveWorkspaceMB(int num) {
+  size_t bytes;
+  if(num > 0) {
+    bytes = (size_t)num * 1024 * 1024 - 1;
+  } else if(num < 0) {
+    ABORT_IF(getDeviceId().type == DeviceType::cpu, "Negative workspace not allowed on CPU device");
+    size_t globalMemorySize = backend_->getGlobalMemorySize(); // in bytes, only implemented for the GPU backend
+    size_t notWorkspaceSize = (size_t)std::abs(num) * 1024 * 1024 - 1;
+    ABORT_IF(notWorkspaceSize >= globalMemorySize, "Negative workspace {} larger than or equal to total memory {}?", notWorkspaceSize, globalMemorySize);
+    bytes = globalMemorySize - notWorkspaceSize;
+    LOG(debug, "Reserving {} = {} - {} bytes as workspace", bytes, globalMemorySize, notWorkspaceSize);
+  } else {
+    ABORT("Allocating 0 bytes?");
+  }
+  tensors_->reserve(bytes);
+}
+
 Expr ExpressionGraph::add(Expr node) {
   auto found = tensors_->findOrRemember(node);
   if(found) {
diff --git a/src/graph/expression_graph.h b/src/graph/expression_graph.h
index e3222a0f..9272e42a 100644
--- a/src/graph/expression_graph.h
+++ b/src/graph/expression_graph.h
@@ -244,11 +244,10 @@ public:
   /**
    * Preallocate workspace memory (MB) for the graph.
    * Sets the size of the memory available for the forward and backward steps of the training procedure.
    * This does not include model size and optimizer parameters that are allocated outside the workspace.
+   * If memory is negative (<0), the total available GPU memory is used with the absolute value subtracted.
+   * Negative workspace is not supported on CPU.
    */
-  void reserveWorkspaceMB(size_t num) {
-    size_t bytes = num * 1024 * 1024 - 1;
-    tensors_->reserve(bytes);
-  }
+  void reserveWorkspaceMB(int num);

   /** Copy tensor objects from one graph to current graph */
   void reuseWorkspace(Ptr<ExpressionGraph> graph) {
@@ -277,7 +276,7 @@ public:
       tensors_->throwAtReallocation(true);
       backprop();
       tensors_->throwAtReallocation(false);
-    } catch(AllocationException&) {
+    } catch(const AllocationException&) {
       tensors_->throwAtReallocation(false);
       return false;
     }
diff --git a/src/layers/guided_alignment.h b/src/layers/guided_alignment.h
index d2171c50..d5929a6d 100644
--- a/src/layers/guided_alignment.h
+++ b/src/layers/guided_alignment.h
@@ -53,9 +53,9 @@ static inline RationalLoss guidedAlignmentCost(Ptr<ExpressionGraph> graph,
     auto attentionAtAligned = cols(flatten(attention), alignmentIndices);

     float epsilon = 1e-6f;
-    Expr alignmentLoss = -sum(alignmentValues * log(attentionAtAligned + epsilon));
+    Expr alignmentLoss = -sum(cast(alignmentValues * log(attentionAtAligned + epsilon), Type::float32));
     size_t numLabels = alignmentIndices->shape().elements();
-    
+
     // Create label node, also weigh by scalar so labels and cost are in the same domain.
     // Fractional label counts are OK. But only if combined as "sum".
     // @TODO: It is ugly to check the multi-loss type here, but doing this right requires
diff --git a/src/rescorer/rescorer.h b/src/rescorer/rescorer.h
index af0a1606..26d74917 100644
--- a/src/rescorer/rescorer.h
+++ b/src/rescorer/rescorer.h
@@ -73,7 +73,7 @@ public:
       auto precision = options_->get<std::vector<std::string>>("precision", {"float32"});
       graph->setDefaultElementType(typeFromString(precision[0])); // only use first type, used for parameter type in graph
       graph->setDevice(device);
-      graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
+      graph->reserveWorkspaceMB(options_->get<int>("workspace"));

       graphs_.push_back(graph);
     }
diff --git a/src/tensors/backend.h b/src/tensors/backend.h
index e0e93039..64a28f92 100644
--- a/src/tensors/backend.h
+++ b/src/tensors/backend.h
@@ -29,6 +29,7 @@ public:
   // for GPU only, calls cudaSetDevice, does nothing on CPU. Maybe change name.
   virtual void setDevice() = 0;
   virtual void synchronize() = 0;
+  virtual size_t getGlobalMemorySize() = 0;

   // for CPU, sets to use optimized code for inference.
   // for GPU, this is invalid. for gpu, isOptimized() function always returns false.
diff --git a/src/tensors/cpu/backend.h b/src/tensors/cpu/backend.h
index f52ff6a3..76c47a79 100644
--- a/src/tensors/cpu/backend.h
+++ b/src/tensors/cpu/backend.h
@@ -20,6 +20,10 @@ public:
   void setDevice() override {}
   void synchronize() override {}

+  size_t getGlobalMemorySize() override {
+    ABORT("Not implemented on CPU");
+  }
+
   // for CPU & inference only, sets to use optimized code for inference. Does nothing for GPU.
   void setOptimized(bool optimize) override { optimized_ = optimize; }
   bool isOptimized() override { return optimized_; }
diff --git a/src/tensors/gpu/backend.h b/src/tensors/gpu/backend.h
index 410b41a4..022e4f3f 100644
--- a/src/tensors/gpu/backend.h
+++ b/src/tensors/gpu/backend.h
@@ -96,6 +96,12 @@ public:

   CudaCompute getCudaComputeCapability() { return compute_; }

+  size_t getGlobalMemorySize() override {
+    cudaDeviceProp prop;
+    CUDA_CHECK(cudaGetDeviceProperties(&prop, (int)deviceId_.no));
+    return prop.totalGlobalMem;
+  }
+
 private:
   cublasHandle_t cublasHandle_{0};     // make sure it's 0, so it can be initialized lazily
   cusparseHandle_t cusparseHandle_{0}; // as above
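For reference, a standalone sketch of the device query behind `getGlobalMemorySize()` (plain CUDA runtime API, outside Marian). Note that `cudaGetDeviceProperties` reports the card's total physical memory, not the currently free amount (`cudaMemGetInfo` would give that), so the negative-workspace computation is relative to total, not free, memory.

#include <cstdio>
#include <cuda_runtime.h>

int main() {
  cudaDeviceProp prop;
  // Query device 0; totalGlobalMem is the total memory of the card in bytes.
  if(cudaGetDeviceProperties(&prop, /*device=*/0) != cudaSuccess)
    return 1;
  std::printf("total global memory: %zu bytes\n", prop.totalGlobalMem);
  return 0;
}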
diff --git a/src/training/graph_group.cpp b/src/training/graph_group.cpp
index 59cd4b6d..4d92b1c9 100644
--- a/src/training/graph_group.cpp
+++ b/src/training/graph_group.cpp
@@ -82,7 +82,7 @@ void GraphGroup::initGraphsAndOpts() {

     graph->setDevice(device);

-    graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
+    graph->reserveWorkspaceMB(options_->get<int>("workspace"));

     graphs_.push_back(graph);

@@ -510,8 +510,18 @@ Ptr<data::BatchStats> GraphGroup::collectStats(Ptr<ExpressionGraph> graph,
       lengths[j] = std::min(lengths[j], localMaxes[j]);

     auto batch = data::CorpusBatch::fakeBatch(lengths, vocabs, maxBatch, options_);
-    auto loss = model->build(graph, batch);
-    fits = graph->fits();
+
+    // We check for a ShapeSizeException (happens if the total shape size would exceed max int).
+    // If caught, we reduce the batch size. In any other context, this exception will cause
+    // an error and exit Marian.
+    try {
+      auto loss = model->build(graph, batch);
+      fits = graph->fits();
+    } catch(const ShapeSizeException& e) {
+      LOG(debug, "Exception for maxBatch size {}: {}", maxBatch, e.what());
+      fits = false;
+    }
+
     if(fits)
       maxBatch *= 2;
   }
@@ -530,8 +540,15 @@ Ptr<data::BatchStats> GraphGroup::collectStats(Ptr<ExpressionGraph> graph,
     do {
       size_t current = (start + end) / 2;
       auto batch = data::CorpusBatch::fakeBatch(lengths, vocabs, current, options_);
-      auto loss = model->build(graph, batch);
-      fits = graph->fits();
+
+      // Same as above, but log the binary-search candidate size.
+      try {
+        auto loss = model->build(graph, batch);
+        fits = graph->fits();
+      } catch(const ShapeSizeException& e) {
+        LOG(debug, "Exception for batch size {}: {}", current, e.what());
+        fits = false;
+      }

       LOG(debug, "[batching] length: {} - size: {} - fits: {}", lengths[0], current, fits);
diff --git a/src/translator/translator.h b/src/translator/translator.h
index 75b5070b..3103e7dd 100644
--- a/src/translator/translator.h
+++ b/src/translator/translator.h
@@ -98,7 +98,7 @@ public:
         graph->getBackend()->setGemmType(options_->get<std::string>("gemm-type"));
         graph->getBackend()->setQuantizeRange(options_->get<float>("quantize-range"));
       }
-      graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
+      graph->reserveWorkspaceMB(options_->get<int>("workspace"));
       graphs_[id] = graph;

       std::vector<Ptr<Scorer>> scorers;
@@ -311,7 +311,7 @@ public:
       graph->getBackend()->setGemmType(options_->get<std::string>("gemm-type"));
       graph->getBackend()->setQuantizeRange(options_->get<float>("quantize-range"));
     }
-    graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
+    graph->reserveWorkspaceMB(options_->get<int>("workspace"));
     graphs_.push_back(graph);

     auto scorers = createScorers(options_, model_items_);
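Schematically, the mini-batch-fitting fallback above follows this pattern. This is a sketch under assumed names, not the Marian API: `tryBuild` and `findMaxBatch` are hypothetical stand-ins for `model->build(...)` plus `graph->fits()` and the surrounding search loop.

#include <cstddef>         // size_t
#include <functional>      // std::function
#include "common/shape.h"  // marian::ShapeSizeException

// Doubles the candidate batch size until it no longer fits. An oversized
// expanded shape (ShapeSizeException) is treated the same as running out
// of workspace memory instead of aborting the process.
size_t findMaxBatch(const std::function<bool(size_t)>& tryBuild) {
  size_t candidate = 64;
  size_t lastFit = 0;  // 0 means no candidate fit
  for(;;) {
    bool fits = false;
    try {
      fits = tryBuild(candidate);
    } catch(const marian::ShapeSizeException&) {
      fits = false;  // oversized shape -> "does not fit"
    }
    if(!fits)
      break;
    lastFit = candidate;
    candidate *= 2;
  }
  return lastFit;
}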