Merged PR 15233: Sync internal master with public master

Regular sync of public and internal master.
Martin Junczys-Dowmunt 2020-09-07 19:37:41 +00:00
parent ea3ac624c6
commit e3916b3d08
28 changed files with 521 additions and 191 deletions

View File

@ -1,49 +0,0 @@
name: windows-2019-cpu
on:
push:
branches: [ master ]
pull_request:
branches: [ master ]
jobs:
build:
runs-on: windows-2019
steps:
- name: Checkout
uses: actions/checkout@v2
with:
submodules: recursive
- name: Prepare vcpkg
uses: lukka/run-vcpkg@v3
with:
vcpkgArguments: protobuf
vcpkgGitCommitId: 6185aa76504a5025f36754324abf307cc776f3da
vcpkgDirectory: ${{ github.workspace }}/vcpkg/
vcpkgTriplet: x64-windows-static
# Note that we build with a simplified CMake settings JSON file
- name: Run CMake
uses: lukka/run-cmake@v2
with:
buildDirectory: ${{ github.workspace }}/build/
cmakeAppendedArgs: -G Ninja
cmakeListsOrSettingsJson: CMakeSettingsJson
cmakeSettingsJsonPath: ${{ github.workspace }}/CMakeSettingsCI.json
useVcpkgToolchainFile: true
- name: Run unit tests
working-directory: build/Debug/
run: ctest
- name: Print versions
working-directory: build/Debug/
run: |
.\marian.exe --version
.\marian-decoder.exe --version
.\marian-scorer.exe --version
.\spm_encode.exe --version

View File

@ -1,4 +1,4 @@
name: macos-10.5-cpu
name: macOS CPU-only
on:
push:
@ -7,7 +7,7 @@ on:
branches: [ master ]
jobs:
build:
build-macos:
runs-on: macos-10.15

View File

@ -1,4 +1,4 @@
name: ubuntu-18.04-cpu
name: Ubuntu 18.04 CPU-only
on:
push:
@ -7,7 +7,7 @@ on:
branches: [ master ]
jobs:
build:
build-ubuntu:
runs-on: ubuntu-18.04
@ -41,7 +41,8 @@ jobs:
run: |
mkdir -p build
cd build
cmake .. -DCOMPILE_CPU=on -DCOMPILE_CUDA=off -DCOMPILE_EXAMPLES=on -DCOMPILE_SERVER=on -DCOMPILE_TESTS=on \
cmake .. -DCMAKE_BUILD_TYPE=Release \
-DCOMPILE_CPU=on -DCOMPILE_CUDA=off -DCOMPILE_EXAMPLES=on -DCOMPILE_SERVER=on -DCOMPILE_TESTS=on \
-DUSE_FBGEMM=on -DUSE_SENTENCEPIECE=on \
-DBOOST_ROOT=$BOOST_ROOT_1_69_0 -DBOOST_INCLUDEDIR=$BOOST_ROOT_1_69_0/include -DBOOST_LIBRARYDIR=$BOOST_ROOT_1_69_0/lib \
-DBoost_ARCHITECTURE=-x64
@ -62,3 +63,13 @@ jobs:
./marian-scorer --version
./spm_encode --version
- name: Prepare archive
working-directory: build
run: tar zcvf marian-ubuntu-release-static.tar.gz marian*
# Marian is built with FBGEMM, so there are some restrictions on which CPUs the executables can run
- name: Upload archive
uses: actions/upload-artifact@v2
with:
name: marian-ubuntu-release-static.tar.gz
path: build/marian-ubuntu-release-static.tar.gz

.github/workflows/ubuntu-gpu.yml vendored Normal file (96 lines)
View File

@ -0,0 +1,96 @@
name: Ubuntu CPU+CUDA
on:
push:
branches: [ master ]
pull_request:
branches: [ master ]
jobs:
build-ubuntu-cuda:
runs-on: ${{ matrix.os }}
continue-on-error: ${{ matrix.experimental }}
strategy:
matrix:
include:
# Ubuntu 20.04 supports CUDA 11+
#- os: ubuntu-20.04
#cuda: "11.0"
#gcc: 9
#boost: false # ubuntu-20.04 image does not have Boost pre-installed yet
#experimental: true # continue even if the job fails
# Ubuntu 18.04 supports CUDA 10.1+
- os: ubuntu-18.04
cuda: "10.2"
gcc: 8
boost: true
experimental: false
# Ubuntu 16.04 supports CUDA 8+
- os: ubuntu-16.04
cuda: "10.2"
gcc: 7
boost: true
experimental: false
- os: ubuntu-16.04
cuda: 9.2
gcc: 7
boost: true
experimental: false
steps:
- name: Checkout
uses: actions/checkout@v2
with:
submodules: recursive
# The following packages are already installed on GitHub-hosted runners: build-essential openssl libssl-dev
# No need to install libprotobuf{17,10,9v5} on Ubuntu {20,18,16}.04 because it is installed together with libprotobuf-dev
- name: Install dependencies
run: sudo apt-get install -y libgoogle-perftools-dev libprotobuf-dev protobuf-compiler
# https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html
- name: Install MKL
run: |
wget -qO- "https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB" | sudo apt-key add -
sudo sh -c "echo deb https://apt.repos.intel.com/mkl all main > /etc/apt/sources.list.d/intel-mkl.list"
sudo apt-get update -o Dir::Etc::sourcelist="/etc/apt/sources.list.d/intel-mkl.list"
sudo apt-get install -y --no-install-recommends intel-mkl-64bit-2020.0-088
# The script simplifies installation of different versions of CUDA
- name: Install CUDA
run: ./scripts/ci/install_cuda_ubuntu.sh ${{ matrix.cuda }}
# Boost is already installed on GitHub-hosted runners in a non-standard location
# https://github.com/actions/virtual-environments/issues/687#issuecomment-610471671
- name: Configure CMake
run: |
mkdir -p build
cd build
CC=/usr/bin/gcc-${{ matrix.gcc }} CXX=/usr/bin/g++-${{ matrix.gcc }} CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }} \
cmake .. \
-DCOMPILE_CPU=on -DCOMPILE_CUDA=on -DCOMPILE_EXAMPLES=on -DCOMPILE_SERVER=${{ matrix.boost }} -DCOMPILE_TESTS=on \
-DUSE_FBGEMM=on -DUSE_SENTENCEPIECE=on \
-DBOOST_ROOT=$BOOST_ROOT_1_69_0 -DBOOST_INCLUDEDIR=$BOOST_ROOT_1_69_0/include -DBOOST_LIBRARYDIR=$BOOST_ROOT_1_69_0/lib \
-DBoost_ARCHITECTURE=-x64 \
-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-${{ matrix.cuda }}
- name: Compile
working-directory: build
run: make -j2
# Unit tests are not run because GitHub-hosted runners do not have GPUs
# TODO: add a flag to CMake to compile unit tests only on CPU
#- name: Run unit tests
#working-directory: build
#run: make test
- name: Print versions
working-directory: build
run: |
./marian --version
./marian-decoder --version
./marian-scorer --version
./spm_encode --version

.github/workflows/windows-cpu.yml vendored Normal file (67 lines)
View File

@ -0,0 +1,67 @@
name: Windows CPU-only
on:
push:
branches: [ master ]
pull_request:
branches: [ master ]
jobs:
build-windows:
runs-on: windows-2019
steps:
- name: Checkout
uses: actions/checkout@v2
with:
submodules: recursive
- name: Download MKL
run: |
# Wget can retry downloading files, so it is used instead of Invoke-WebRequest
C:\msys64\usr\bin\wget.exe -nv https://romang.blob.core.windows.net/mariandev/ci/mkl-2020.1-windows-static.zip -O mkl.zip
Expand-Archive -Force mkl.zip ${{ github.workspace }}/mkl
# Set the MKLROOT environment variable so that CMake can find MKL.
# GITHUB_WORKSPACE is an environment variable available on all GitHub-hosted runners
echo "::set-env name=MKLROOT::$env:GITHUB_WORKSPACE/mkl"
shell: powershell
- name: Prepare vcpkg
uses: lukka/run-vcpkg@v2
with:
vcpkgArguments: protobuf
vcpkgGitCommitId: 6185aa76504a5025f36754324abf307cc776f3da
vcpkgDirectory: ${{ github.workspace }}/vcpkg/
vcpkgTriplet: x64-windows-static
# Build with a simplified CMake settings JSON file
- name: Run CMake
uses: lukka/run-cmake@v2
with:
buildDirectory: ${{ github.workspace }}/build/
cmakeAppendedArgs: -G Ninja
cmakeListsOrSettingsJson: CMakeSettingsJson
# JSON file must be in the same directory as the main CMakeLists.txt
cmakeSettingsJsonPath: ${{ github.workspace }}/_CMakeSettingsCI_CPU.json
useVcpkgToolchainFile: true
- name: Run unit tests
working-directory: build/Release/
run: ctest
#- name: Print versions
#working-directory: build/Release/
#run: |
#.\marian.exe --version
#.\marian-decoder.exe --version
#.\marian-scorer.exe --version
#.\spm_encode.exe --version
#shell: powershell
# Marian is built with FBGEMM, so there are some restrictions on which CPUs the executables can run
- name: Upload archive
uses: actions/upload-artifact@v2
with:
name: marian-windows-release-static
path: build/Release/marian*.exe

.github/workflows/windows-gpu.yml vendored Normal file (64 lines)
View File

@ -0,0 +1,64 @@
name: Windows CPU+CUDA
on:
push:
branches: [ master ]
pull_request:
branches: [ master ]
jobs:
build-windows-cuda:
runs-on: windows-2019
steps:
- name: Checkout
uses: actions/checkout@v2
with:
submodules: recursive
- name: Install CUDA
run: |
.\scripts\ci\install_cuda_windows.ps1 "10.2"
# Set path to CUDA for subsequent steps so that CMake can find it
echo "::set-env name=CUDA_PATH::$env:CUDA_PATH"
echo "::add-path::$env:CUDA_PATH/bin"
shell: powershell
- name: Download MKL
run: |
# Wget can retry downloading files, so it is used instead of Invoke-WebRequest
C:\msys64\usr\bin\wget.exe -nv https://romang.blob.core.windows.net/mariandev/ci/mkl-2020.1-windows-static.zip -O mkl.zip
Expand-Archive -Force mkl.zip ${{ github.workspace }}/mkl
# Set the MKLROOT environment variable so that CMake can find MKL.
# GITHUB_WORKSPACE is an environment variable available on all GitHub-hosted runners
echo "::set-env name=MKLROOT::$env:GITHUB_WORKSPACE/mkl"
shell: powershell
- name: Prepare vcpkg
uses: lukka/run-vcpkg@v2
with:
vcpkgArguments: protobuf
vcpkgGitCommitId: 6185aa76504a5025f36754324abf307cc776f3da
vcpkgDirectory: ${{ github.workspace }}/vcpkg/
vcpkgTriplet: x64-windows-static
# Build with a simplified CMake settings JSON file.
# On Windows+CUDA we compile with COMPILE_CUDA=on and USE_NCCL=off
- name: Run CMake
uses: lukka/run-cmake@v2
with:
buildDirectory: ${{ github.workspace }}/build/
cmakeAppendedArgs: -G Ninja
cmakeListsOrSettingsJson: CMakeSettingsJson
# JSON file must be in the same directory as the main CMakeLists.txt
cmakeSettingsJsonPath: ${{ github.workspace }}/_CMakeSettingsCI_GPU.json
useVcpkgToolchainFile: true
- name: Print versions
working-directory: build/Debug/
run: |
.\marian.exe --version
.\marian-decoder.exe --version
.\marian-scorer.exe --version
.\spm_encode.exe --version

View File

@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
## [Unreleased]
### Added
- Turing and Ampere GPU optimisation support, if the CUDA version supports it.
- Printing word-level scores in marian-scorer
- Optimize LayerNormalization on CPU by 6x through vectorization (ffast-math) and fixing performance regression introduced with strides in 77a420
- Decoding multi-source models in marian-server with --tsv
@ -24,6 +25,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Internal optional parameter in n-best list generation that skips empty hypotheses.
### Fixed
- Print "server is listening on port" message after it is accepting connections
- Fix compilation without BLAS installed
- Providing a single value to vector-like options using the equals sign, e.g. --models=model.npz
- Fix quiet-translation in marian-server
@ -39,6 +41,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Properly record cmake variables in the cmake build directory instead of the source tree.
- Added default "none" for option shuffle in BatchGenerator, so that it works in executables where shuffle is not an option.
- Added a few missing header files in shortlist.h and beam_search.h.
- Improved handling for receiving SIGTERM during training. By default, SIGTERM triggers 'save (now) and exit'. Prior to this fix, batch pre-fetching did not check for this signal, potentially delaying exit considerably; it now does. Also, the default save-and-exit behaviour can now be disabled on the command line with --sigterm exit-immediately.
### Changed
- Move Simple-WebSocket-Server to submodule

View File

@ -13,10 +13,6 @@ set(BUILD_ARCH native CACHE STRING "Compile for this CPU architecture.")
# Custom CMake options
option(COMPILE_CPU "Compile CPU version" ON)
option(COMPILE_CUDA "Compile GPU version" ON)
option(COMPILE_CUDA_SM35 "Compile GPU version with SM35 support" ON)
option(COMPILE_CUDA_SM50 "Compile GPU version with SM50 support" ON)
option(COMPILE_CUDA_SM60 "Compile GPU version with SM60 support" ON)
option(COMPILE_CUDA_SM70 "Compile GPU version with SM70 support" ON)
option(COMPILE_EXAMPLES "Compile examples" OFF)
option(COMPILE_SERVER "Compile marian-server" OFF)
option(COMPILE_TESTS "Compile tests" OFF)
@ -243,6 +239,30 @@ if(CUDA_FOUND)
message(WARNING "On some Unix systems CUDA 10.0+ requires CMake 3.12.2+; you use CMake ${CMAKE_VERSION}")
endif()
# We want to compile as many targets as possible, but different CUDA versions support different targets.
# Enable the SM options below based on which CUDA version is available.
if((CUDA_VERSION VERSION_EQUAL "9.0" OR CUDA_VERSION VERSION_GREATER "9.0") AND CUDA_VERSION VERSION_LESS "11.0")
option(COMPILE_CUDA_SM35 "Compile GPU version with SM35 support" ON)
option(COMPILE_CUDA_SM50 "Compile GPU version with SM50 support" ON)
option(COMPILE_CUDA_SM60 "Compile GPU version with SM60 support" ON)
option(COMPILE_CUDA_SM70 "Compile GPU version with SM70 support" ON)
endif()
if((CUDA_VERSION VERSION_EQUAL "10.0" OR CUDA_VERSION VERSION_GREATER "10.0") AND CUDA_VERSION VERSION_LESS "11.0")
option(COMPILE_CUDA_SM35 "Compile GPU version with SM35 support" ON)
option(COMPILE_CUDA_SM50 "Compile GPU version with SM50 support" ON)
option(COMPILE_CUDA_SM60 "Compile GPU version with SM60 support" ON)
option(COMPILE_CUDA_SM70 "Compile GPU version with SM70 support" ON)
option(COMPILE_CUDA_SM75 "Compile GPU version with SM75 support" ON)
endif()
if(CUDA_VERSION VERSION_EQUAL "11.0" OR CUDA_VERSION VERSION_GREATER "11.0")
option(COMPILE_CUDA_SM35 "Compile GPU version with SM35 support" ON)
option(COMPILE_CUDA_SM50 "Compile GPU version with SM50 support" ON)
option(COMPILE_CUDA_SM60 "Compile GPU version with SM60 support" ON)
option(COMPILE_CUDA_SM70 "Compile GPU version with SM70 support" ON)
option(COMPILE_CUDA_SM75 "Compile GPU version with SM75 support" ON)
option(COMPILE_CUDA_SM80 "Compile GPU version with SM80 support" ON)
endif()
if(COMPILE_CUDA_SM35)
LIST(APPEND COMPUTE -arch=sm_35; -gencode=arch=compute_35,code=sm_35;) # Tesla K40 and above
endif(COMPILE_CUDA_SM35)
@ -255,6 +275,16 @@ if(CUDA_FOUND)
if(COMPILE_CUDA_SM70)
LIST(APPEND COMPUTE -gencode=arch=compute_70,code=sm_70; -gencode=arch=compute_70,code=compute_70) # Volta GPUs
endif(COMPILE_CUDA_SM70)
if(CUDA_VERSION VERSION_EQUAL "10.0" OR CUDA_VERSION VERSION_GREATER "10.0")
if(COMPILE_CUDA_SM75)
LIST(APPEND COMPUTE -gencode=arch=compute_75,code=sm_75; -gencode=arch=compute_75,code=compute_75) # Turing GPUs
endif(COMPILE_CUDA_SM75)
endif()
if(CUDA_VERSION VERSION_EQUAL "11.0" OR CUDA_VERSION VERSION_GREATER "11.0")
if(COMPILE_CUDA_SM80)
LIST(APPEND COMPUTE -gencode=arch=compute_80,code=sm_80; -gencode=arch=compute_80,code=compute_80) # Ampere GPUs
endif(COMPILE_CUDA_SM80)
endif()
if(USE_STATIC_LIBS)
set(EXT_LIBS ${EXT_LIBS} ${CUDA_curand_LIBRARY} ${CUDA_cusparse_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})
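For reference, the COMPILE_CUDA_SM* numbers above are GPU compute capabilities (e.g. sm_75 for Turing, sm_80 for Ampere). A minimal, hypothetical check, not part of this change, that prints the compute capability of the installed devices so it can be compared against the architectures compiled in:

// Hypothetical helper, not part of this PR: report each device's compute capability.
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  int deviceCount = 0;
  if(cudaGetDeviceCount(&deviceCount) != cudaSuccess || deviceCount == 0) {
    std::fprintf(stderr, "No CUDA device found\n");
    return 1;
  }
  for(int i = 0; i < deviceCount; ++i) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, i);
    // e.g. 7.5 -> sm_75 (Turing, needs CUDA 10.0+), 8.0 -> sm_80 (Ampere, needs CUDA 11.0+)
    std::printf("Device %d: %s, compute capability %d.%d\n",
                i, prop.name, prop.major, prop.minor);
  }
  return 0;
}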

View File

@ -1 +1 @@
v1.9.35
v1.9.36

_CMakeSettingsCI_CPU.json Normal file (28 lines)
View File

@ -0,0 +1,28 @@
{
"configurations": [
{
"name": "Release",
"generator": "Ninja",
"configurationType": "Release",
"inheritEnvironments": [ "msvc_x64" ],
"cmakeCommandArgs": "",
"buildCommandArgs": "-v",
"ctestCommandArgs": "",
"variables": [
{ "name": "OPENSSL_USE_STATIC_LIBS:BOOL", "value": "TRUE" },
{ "name": "OPENSSL_MSVC_STATIC_RT:BOOL", "value": "TRUE" },
{ "name": "COMPILE_CUDA:BOOL", "value": "FALSE" },
{ "name": "COMPILE_CPU:BOOL", "value": "TRUE" },
{ "name": "COMPILE_EXAMPLES:BOOL", "value": "FALSE" },
{ "name": "COMPILE_SERVER:BOOL", "value": "FALSE" },
{ "name": "COMPILE_TESTS:BOOL", "value": "TRUE" },
{ "name": "USE_FBGEMM:BOOL", "value": "TRUE" },
{ "name": "USE_MPI:BOOL", "value": "FALSE" },
{ "name": "USE_SENTENCEPIECE:BOOL", "value": "TRUE" },
{ "name": "USE_STATIC_LIBS:BOOL", "value": "TRUE" }
]
}
]
}

View File

@ -12,7 +12,7 @@
{ "name": "OPENSSL_USE_STATIC_LIBS:BOOL", "value": "TRUE" },
{ "name": "OPENSSL_MSVC_STATIC_RT:BOOL", "value": "TRUE" },
{ "name": "COMPILE_CUDA:BOOL", "value": "FALSE" },
{ "name": "COMPILE_CUDA:BOOL", "value": "TRUE" },
{ "name": "COMPILE_CPU:BOOL", "value": "TRUE" },
{ "name": "COMPILE_EXAMPLES:BOOL", "value": "FALSE" },
{ "name": "COMPILE_SERVER:BOOL", "value": "FALSE" },
@ -20,6 +20,7 @@
{ "name": "USE_FBGEMM:BOOL", "value": "TRUE" },
{ "name": "USE_MPI:BOOL", "value": "FALSE" },
{ "name": "USE_NCCL:BOOL", "value": "FALSE" },
{ "name": "USE_SENTENCEPIECE:BOOL", "value": "TRUE" },
{ "name": "USE_STATIC_LIBS:BOOL", "value": "TRUE" }
]
@ -36,7 +37,7 @@
{ "name": "OPENSSL_MSVC_STATIC_RT:BOOL", "value": "TRUE" },
{ "name": "OPENSSL_USE_STATIC_LIBS:BOOL", "value": "TRUE" },
{ "name": "COMPILE_CUDA:BOOL", "value": "FALSE" },
{ "name": "COMPILE_CUDA:BOOL", "value": "TRUE" },
{ "name": "COMPILE_CPU:BOOL", "value": "TRUE" },
{ "name": "COMPILE_EXAMPLES:BOOL", "value": "FALSE" },
{ "name": "COMPILE_SERVER:BOOL", "value": "FALSE" },
@ -44,6 +45,7 @@
{ "name": "USE_FBGEMM:BOOL", "value": "TRUE" },
{ "name": "USE_MPI:BOOL", "value": "FALSE" },
{ "name": "USE_NCCL:BOOL", "value": "FALSE" },
{ "name": "USE_SENTENCEPIECE:BOOL", "value": "TRUE" },
{ "name": "USE_STATIC_LIBS:BOOL", "value": "TRUE" }
]

@ -1 +1 @@
Subproject commit 0d0da014671ac3366d5021a4b33fe2efa1809a15
Subproject commit cdad78089484d7817d91c803d6fc7049328e20db

scripts/ci/install_mkl.sh Executable file (7 lines)
View File

@ -0,0 +1,7 @@
#!/usr/bin/env bash
# https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html
wget -qO- "https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB" | sudo apt-key add -
sudo sh -c "echo deb https://apt.repos.intel.com/mkl all main > /etc/apt/sources.list.d/intel-mkl.list"
sudo apt-get update -o Dir::Etc::sourcelist="/etc/apt/sources.list.d/intel-mkl.list"
sudo apt-get install --no-install-recommends intel-mkl-64bit-2020.0-088

View File

@ -133,7 +133,7 @@ if(CUDA_FOUND)
BINARY_DIR ${CMAKE_CURRENT_SOURCE_DIR}/nccl
CONFIGURE_COMMAND ""
BUILD_COMMAND
$(MAKE) -f ${CMAKE_CURRENT_SOURCE_DIR}/nccl/Makefile src.build
${CMAKE_MAKE_PROGRAM} -f ${CMAKE_CURRENT_SOURCE_DIR}/nccl/Makefile src.build
BUILDDIR=${CMAKE_BINARY_DIR}/local CUDA_HOME=${CUDA_TOOLKIT_ROOT_DIR}
CUDA8_GENCODE=${GENCODE} CXX=${CMAKE_CXX_COMPILER}
INSTALL_COMMAND "")

src/3rd_party/nccl vendored (2 lines)

@ -1 +1 @@
Subproject commit b56650c7f59b8cd40d18809784a6d6be38ef8acb
Subproject commit 7d3486128ebc865b9f2cad63a5cfd3a8f6abcb5a

@ -1 +1 @@
Subproject commit 417a2a9e9dbd720b8d2dfa1dafe57cf1b37ca0d7
Subproject commit 257439f5bd0a15f315c1c2733ea8a4fb0e32c1db

View File

@ -25,6 +25,7 @@ add_library(marian STATIC
common/filesystem.cpp
common/file_stream.cpp
common/file_utils.cpp
common/signal_handling.cpp
common/types.cpp
data/alignment.cpp
@ -99,7 +100,6 @@ add_library(marian STATIC
training/graph_group_singleton.cpp
training/validator.cpp
training/communicator.cpp
training/scheduler.cpp
# this is only compiled to catch build errors, but not linked
microsoft/quicksand.cpp

View File

@ -37,9 +37,8 @@ int main(int argc, char **argv) {
// Send translation back
connection->send(sendStream, [](const SimpleWeb::error_code &ec) {
if(ec) {
if(ec)
LOG(error, "Error sending message: ({}) {}", ec.value(), ec.message());
}
});
};
@ -52,8 +51,9 @@ int main(int argc, char **argv) {
// Start server thread
std::thread serverThread([&server]() {
LOG(info, "Server is listening on port {}", server.config.port);
server.start();
server.start([](unsigned short port) {
LOG(info, "Server is listening on port {}", port);
});
});
serverThread.join();

View File

@ -1,6 +1,7 @@
#include <signal.h>
#include "marian.h"
#include "common/signal_handling.h"
#include "training/graph_group_async.h"
#include "training/graph_group_singleton.h"
#include "training/graph_group_sync.h"
@ -42,14 +43,12 @@ int main(int argc, char** argv) {
New<Train<AsyncGraphGroup>>(options)->run();
}
}
// If we exit due to SIGTERM, exit with 128 + the signal number, as suggested
// for bash in http://tldp.org/LDP/abs/html/exitcodes.html. This allows parent
// If we exit due to a graceful exit request via SIGTERM, exit with 128 + SIGTERM,
// as suggested for bash in http://tldp.org/LDP/abs/html/exitcodes.html. This allows parent
// scripts to determine if training terminated naturally or via SIGTERM.
// With this approach we can accommodate additional signals in the future.
// An alternative would be to return 124, which is what the timeout command
// An alternative would be to exit with code 124, which is what the timeout command
// returns for timeout -s SIGTERM <seconds> ...., because exiting after SIGTERM
// is not technically a fatal error (which is what the 128+x convention usually
// stands for).
return getSigtermFlag() ? (128 + SIGTERM) : 0;
exit(getSignalFlag(SIGTERM) ? 128 + SIGTERM : EXIT_SUCCESS);
}
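As an aside on the exit-code convention above: a parent process or wrapper (a hypothetical sketch, not part of this change) can distinguish a SIGTERM-triggered save-and-exit (exit code 128 + 15 = 143) from a normal completion (0), for example:

// Hypothetical wrapper sketch (not part of this PR): interpret marian's exit code.
#include <csignal>
#include <cstdio>
#include <cstdlib>
#include <sys/wait.h>   // WIFEXITED / WEXITSTATUS (POSIX)

int main() {
  // std::system() returns a wait status; on POSIX, WIFEXITED/WEXITSTATUS decode it.
  int status = std::system("./marian --sigterm save-and-exit -c config.yml");
  if(!WIFEXITED(status)) {
    std::printf("marian did not exit normally\n");
    return 1;
  }
  int exitCode = WEXITSTATUS(status);
  if(exitCode == 128 + SIGTERM)      // 143: training saved its state and exited on SIGTERM
    std::printf("training stopped gracefully on SIGTERM\n");
  else if(exitCode == EXIT_SUCCESS)  // 0: training ran to completion
    std::printf("training finished normally\n");
  else
    std::printf("training failed with exit code %d\n", exitCode);
  return 0;
}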

View File

@ -143,6 +143,15 @@ void ConfigParser::addOptionsGeneral(cli::CLIWrapper& cli) {
cli.add<std::string>("--dump-config",
"Dump current (modified) configuration to stdout and exit. Possible values: full, minimal, expand")
->implicit_val("full");
if(mode_ == cli::mode::training) {
// --sigterm is deliberately not a boolean, to allow for a consistent
// pattern of specifying custom signal handling in the future.
// (e.g., dump model but continue training upon SIGUSR1, or report current
// training status upon SIGINFO.)
cli.add<std::string>("--sigterm",
"What to do with SIGTERM: save-and-exit or exit-immediately.",
"save-and-exit");
}
// clang-format on
}

View File

@ -175,8 +175,8 @@ void TemporaryFile::MakeTemp(const std::string &base) {
// open again with c++
streamBuf1_.reset(new std::filebuf());
auto ret = static_cast<std::filebuf*>(streamBuf1_.get())->open(name, std::ios::out | std::ios_base::binary);
ABORT_IF(!streamBuf1_, "File cannot be temp opened", name);
ABORT_IF(ret != streamBuf1_.get(), "Return value is not equal to streambuf pointer, that is weird");
ABORT_IF(!streamBuf1_, "File {} cannot be temp opened", name);
ABORT_IF(ret != streamBuf1_.get(), "Return value ({}) is not equal to streambuf pointer ({}), that is weird.", (size_t)ret, (size_t)streamBuf1_.get());
this->init(streamBuf1_.get());

View File

@ -0,0 +1,58 @@
#include "common/logging.h"
#include "signal_handling.h"
// The simplest (and recommended) way to handle signals is to simply set a flag
// in the signal handler and check that flag later.
//
// We provide setSignalFlag as the most generic signal handler. This handler uses a
// single sig_atomic_t as a bit field. On Linux, sig_atomic_t is equivalent to a signed int,
// theoretically providing 32 binary flags; in practice, the signals for which we are most
// likely to want to install handlers are
// - SIGTERM (15): which by default signals the request for a graceful shutdown
// - SIGUSR1 (10): intended for custom use, default action in Linux is termination
// - SIGUSR2 (12): intended for custom use, default action in Linux is termination
// - SIGINT (2): interrupt from the console
// Just to be safe, we accommodate signals up to signal No. 30.
// In addition, we also provide requestSaveAndExit() and saveAndExitRequested() as a signal
// handler / checker pair for graceful shutdown requests during training.
constexpr int maxSignalForSetSignalFlag{30};
// Make sure sig_atomic_t is large enough as a bit field for our purposes.
// That said, I'm not aware of any platform where this would be a problem.
static_assert(SIG_ATOMIC_MAX > (1U<<maxSignalForSetSignalFlag),
"sig_atomic_t is too small for signal flags on this platform.");
namespace marian{
volatile std::sig_atomic_t sigflags_{0};
volatile std::sig_atomic_t saveAndExit_{0};
void setSignalFlag(int sig) {
// sigflags_ is an int type serving as a bit field for flags corresponding
// to signals (lower or equal to maxSignalForSetSignalFlag). We set the
// flag via a bitwise OR (|=) of the bit field and an int value with exactly
// one bit set (1<<sig).
sigflags_ |= (1<<sig);
}
// Check if the flag for the signal sig is set in the bit field sigflags_
bool getSignalFlag(const int sig) {
ABORT_IF(sig > maxSignalForSetSignalFlag,
"Signal out of range (must be < {}, is {}).", maxSignalForSetSignalFlag, sig);
// Do bitwise AND between sigflags_ and an int value that has exactly one bit set that
// corresponds to the signal in question. If the bit is set (see setSignalFlag above),
// the bitwise AND will return a non-zero integer; if it is not set, the result will
// be zero.
return (sigflags_ & (1<<sig)) != 0;
}
void requestSaveAndExit(int sig) {
setSignalFlag(sig); // keep track of triggering signal
saveAndExit_ = 1; // set flag to exit gracefully
}
bool saveAndExitRequested() {
return saveAndExit_ == 1;
}
}
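A minimal usage sketch for the generic flag handler (illustrative only, not part of this file): install setSignalFlag for a signal of interest and poll getSignalFlag from the main loop.

// Illustrative sketch, not part of this PR: using the generic flag handler.
#include <csignal>
#include "common/signal_handling.h"

void runLoop() {
  std::signal(SIGUSR1, marian::setSignalFlag);  // record SIGUSR1 in the flag bit field
  for(;;) {
    // ... do one unit of work ...
    if(marian::getSignalFlag(SIGUSR1)) {
      // e.g. dump a checkpoint or report status, then continue or break
      break;
    }
  }
}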

View File

@ -0,0 +1,39 @@
#pragma once
#include <csignal>
#include <string>
// SIGNAL HANDLING
// The signal handlers (and checkers) here are implemented in line with the recommendations
// for signal handling in the SEI CERT C Coding Standard, specifically
//
// - SIG30-C:
// https://wiki.sei.cmu.edu/confluence/display/c/SIG30-C.+Call+only+asynchronous-safe+functions+within+signal+handlers
//
// - SIG31-C:
// https://wiki.sei.cmu.edu/confluence/display/c/SIG31-C.+Do+not+access+shared+objects+in+signal+handlers
//
// The exact behavior of 'graceful exit' depends on the application; for training, it means 'save model and exit',
// for a server (not implemented yet): 'block new requests but serve pending requests and then exit'.
//
// Graceful exit for training is useful for training on clusters with time limits on jobs. Slurm, for example, can be
// set up to send a custom signal at a set time before the end of the time slot, giving Marian time to save its current
// state before getting killed.
namespace marian {
/// Request graceful exit (signal handler)
void requestSaveAndExit(int sig);
/// Check if graceful exit was requested.
bool saveAndExitRequested();
/// General purpose signal handler that simply sets a flag when a signal is received.
// (only for SIGNAL No. < 32).
void setSignalFlag(int sig); // custom handler (set flag) for sig
/// Check if a setSignalFlag was triggered for this signal
bool getSignalFlag(int sig);
} // End of namespace marian
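A condensed sketch of how these two entry points fit together for the training use case described above (it mirrors what the training/ changes further down in this diff do; everything except the two functions is illustrative):

// Condensed illustration of the save-and-exit flow (see training/ changes below).
#include <csignal>
#include "common/signal_handling.h"

void trainLoop() {
  // e.g. Slurm can be configured to send SIGTERM some time before the job's time limit
  std::signal(SIGTERM, marian::requestSaveAndExit);
  while(!marian::saveAndExitRequested()) {
    // ... fetch the next batch and run one training update ...
  }
  // saveModel();  // hypothetical: persist the current state before exiting
}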

View File

@ -1,6 +1,7 @@
#pragma once
#include "common/options.h"
#include "common/signal_handling.h"
#include "data/batch_stats.h"
#include "data/rng_engine.h"
#include "training/training_state.h"
@ -136,6 +137,8 @@ private:
}
size_t sets = 0;
while(current_ != data_->end() && maxiBatch->size() < maxSize) { // loop over data
if (saveAndExitRequested()) // stop generating batches
return std::deque<BatchPtr>();
maxiBatch->push(*current_);
sets = current_->size();
// do not consume more than required for the maxi batch as this causes
@ -161,6 +164,8 @@ private:
if (stats_)
cachedStatsIter = stats_->begin();
while(!maxiBatch->empty()) { // while there are sentences in the queue
if (saveAndExitRequested()) // stop generating batches
return std::deque<BatchPtr>();
// push item onto batch
batchVector.push_back(maxiBatch->top());
maxiBatch->pop(); // fetch next-shortest
@ -249,7 +254,7 @@ private:
"If you have changed the training corpus, add --no-restore-corpus to the training command and run it again.");
bufferedBatches_ = std::move(futureBufferedBatches_.get());
// if bg thread returns an empty swath, we hit the end of the epoch
if (bufferedBatches_.empty()) {
if (bufferedBatches_.empty() || saveAndExitRequested()) {
return nullptr;
}
// and kick off the next bg operation
@ -257,7 +262,7 @@ private:
} else { // don't spawn any threads, i.e. batch fetching is blocking.
bufferedBatches_ = fetchBatches();
// if bufferedBatches is empty we hit the end of the epoch
if (bufferedBatches_.empty()) {
if (bufferedBatches_.empty() || saveAndExitRequested()) {
return nullptr;
}
}

View File

@ -12,63 +12,6 @@
#include "tensors/gpu/cuda_helpers.h"
// clang-format on
// recreations of a few cusparse functions that were deprecated in CUDA 11
// @TODO: Fill these in. This is not trivial. Until then, using these with CUDA 11 will fail.
#if CUDA_VERSION >= 11000
cusparseStatus_t
cusparseSgemmi10(cusparseHandle_t handle,
int m,
int n,
int k,
int nnz,
const float* alpha,
const float* A,
int lda,
const float* cscValB,
const int* cscColPtrB,
const int* cscRowIndB,
const float* beta,
float* C,
int ldc) {
ABORT("Sparse matrix operations are currently not supported by Marian under CUDA 11");
}
#define cusparseSgemmi cusparseSgemmi10
cusparseStatus_t
cusparseScsr2csc(cusparseHandle_t handle,
int m,
int n,
int nnz,
const float* csrVal,
const int* csrRowPtr,
const int* csrColInd,
float* cscVal,
int* cscRowInd,
int* cscColPtr,
cusparseAction_t copyValues,
cusparseIndexBase_t idxBase) {
ABORT("Sparse matrix operations are currently not supported by Marian under CUDA 11");
}
cusparseStatus_t
cusparseScsrmm(cusparseHandle_t handle,
cusparseOperation_t transA,
int m,
int n,
int k,
int nnz,
const float* alpha,
const cusparseMatDescr_t descrA,
const float* csrValA,
const int* csrRowPtrA,
const int* csrColIndA,
const float* B,
int ldb,
const float* beta,
float* C,
int ldc) {
ABORT("Sparse matrix operations are currently not supported by Marian under CUDA 11");
}
#endif
namespace marian {
namespace gpu {
@ -420,6 +363,9 @@ static cusparseSgemmiEx(cusparseHandle_t handle, int m,
const float *cscValB, const int *cscColPtrB, const int *cscRowIndB, const float *beta,
float *C, int ldc)
{
#if CUDA_VERSION >= 11000
ABORT("cusparseSgemmi is not available in CUDA VERSION >= 11.");
#else
const int nMax = 65535; // max. number of columns allowed by cuSparse 10 implementation
for (int j0 = 0; j0 < n; j0 += 65535) { // loop over column slices, j0 = index of first column
// Call original function on a column slice.
@ -432,6 +378,7 @@ static cusparseSgemmiEx(cusparseHandle_t handle, int m,
if (rc != CUSPARSE_STATUS_SUCCESS)
return rc;
}
#endif
return CUSPARSE_STATUS_SUCCESS;
}
@ -483,6 +430,45 @@ void CSRProd(marian::Tensor C,
St_indices = allocator->alloc<int>(numValues);
St_offsets = allocator->alloc<int>(colsS + 1);
// transpose the second argument
#if CUDA_VERSION >= 11000
size_t buffer_size;
CUSPARSE_CHECK(cusparseCsr2cscEx2_bufferSize(cusparseHandle,
/*m=*/ rowsS, // number of rows of matrix
/*n=*/ colsS, // number of columns of matrix
/*nnz=*/ (int)numValues,
/*csrcVal=*/ S_values ->data<float>(),
/*csrcRowPtr=*/ (int*)S_offsets->data<IndexType>(),
/*csrcColInd=*/ (int*)S_indices->data<IndexType>(),
/*cscVal=*/ St_values ->data<float>(), // transposed version goes here
/*cscColPtr=*/ St_offsets->data<int>(),
/*cscRowInd=*/ St_indices->data<int>(),
/*valType*/ CUDA_R_32F,
/*copyValues=*/ CUSPARSE_ACTION_NUMERIC,
/*idxBase=*/ CUSPARSE_INDEX_BASE_ZERO,
/*alg*/ CUSPARSE_CSR2CSC_ALG1,
/*bufferSize*/ &buffer_size));
MemoryPiece::PtrType buffer= (buffer_size > 0) ? allocator->alloc<uint8_t>(buffer_size) : nullptr;
CUSPARSE_CHECK(cusparseCsr2cscEx2(cusparseHandle,
/*m=*/ rowsS, // number of rows of matrix
/*n=*/ colsS, // number of columns of matrix
/*nnz=*/ (int)numValues,
/*csrcVal=*/ S_values ->data<float>(),
/*csrcRowPtr=*/ (int*)S_offsets->data<IndexType>(),
/*csrcColInd=*/ (int*)S_indices->data<IndexType>(),
/*cscVal=*/ St_values ->data<float>(), // transposed version goes here
/*cscColPtr=*/ St_offsets->data<int>(),
/*cscRowInd=*/ St_indices->data<int>(),
/*valType=*/ CUDA_R_32F,
/*copyValues=*/ CUSPARSE_ACTION_NUMERIC,
/*idxBase=*/ CUSPARSE_INDEX_BASE_ZERO,
/*alg=*/ CUSPARSE_CSR2CSC_ALG1,
/*buffer=*/ buffer->data<uint8_t>()));
if (buffer)
allocator->free(buffer);
ABORT("This code is untested. Please remove this ABORT once tests exist and pass.");
#else
CUSPARSE_CHECK(cusparseScsr2csc(cusparseHandle,
/*m=*/ rowsS, // number of rows of matrix
/*n=*/ colsS, // number of columns of matrix
@ -495,12 +481,16 @@ void CSRProd(marian::Tensor C,
/*cscColPtr=*/ St_offsets->data<int>(),
/*copyValues=*/ CUSPARSE_ACTION_NUMERIC,
/*idxBase=*/ CUSPARSE_INDEX_BASE_ZERO));
#endif
std::swap(rowsS, colsS); // these variables now represent the dims of the explicitly transposed object
}
if (swapOperands) {
// C = D x S for row-major matrices
// Implemented via cusparse as C' = S' x D' ("csrmm") where C' and D' are column-major,
// and S' is CSR (if not transS then we make a transposed copy).
#if CUDA_VERSION >= 11000
ABORT("CSRProd is not yet implemented for CUDA VERSION >= 11");
#else
cusparseMatDescr_t descrA;
CUSPARSE_CHECK(cusparseCreateMatDescr(&descrA));
cusparseSetMatType (descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
@ -521,6 +511,7 @@ void CSRProd(marian::Tensor C,
C->data(),
/*ldc=*/ colsC)); // stride
cusparseDestroyMatDescr(descrA);
#endif
}
else {
// C = S x D for row-major matrices

View File

@ -1,43 +0,0 @@
#include "scheduler.h"
#include <signal.h>
#include <cassert>
namespace marian {
// SIGNAL HANDLING, see scheduler.cpp for definitions
// Currently, only the following is handled by a custom signal handler:
// SIGTERM: When SIGTERM is received, the global (static member) flag sigterm_ (false by default) is set to true
// by signalHandler(). When sigterm_ is true, keepGoing() returns false, and the current state of training models
// is saved prior to exiting.
// This functionality is helpful when training on clusters with time limits on compute slots, e.g., on
// clusters managed by slurm. Slurm can be asked to send a (custom) warning signal to a process at a given
// point in time prior to the hard "time's up".
bool sigterm_{false}; // flag signalling that SIGTERM has been received false by default, set to true by signalHandler(SIGTERM)
void signalHandler(int sig) {
// Note: sys_siglist[sig] or stdsignal() describe the effect (e.g.,
// 'Terminated' rather than provide the signal name (which are #define(s)
// in signal.h), so we have to do custom log messages here.
switch (sig) {
case SIGTERM: // save models and exit
LOG(info, "[training] Scheduler received signal SIGTERM"); // @TODO: figure out if this is safe. The logs are global and thread-safe, so should be OK?
sigterm_ = true;
break;
default:
ABORT("No action defined for signal {}", sig);
}
}
// installs signalHandler() for select signals (currently only SIGTERM)
void installSignalHandlers() {
// TODO: use sigaction instead of signal,
// cf. https://stackoverflow.com/questions/231912/what-is-the-difference-between-sigaction-and-signal
signal(SIGTERM, signalHandler);
}
bool getSigtermFlag() {
return sigterm_;
}
}

View File

@ -1,6 +1,7 @@
#pragma once
#include "common/options.h"
#include "common/signal_handling.h"
#include "training/training_state.h"
#include "training/validator.h"
#include "training/communicator.h"
@ -8,9 +9,6 @@
namespace marian {
bool getSigtermFlag();
void installSignalHandlers();
class Scheduler : public TrainingObserver {
private:
Ptr<Options> options_;
@ -154,11 +152,10 @@ public:
: options_(options), state_(state) {
ABORT_IF(state_->factor != 1, "state.factor unexpectedly not 1 at this point??");
updateLearningRate(*state);
installSignalHandlers();
}
bool keepGoing() {
if(getSigtermFlag()) // received signal SIGERM => exit gracefully
if(saveAndExitRequested()) // via SIGTERM
return false;
// stop if it reached the maximum number of epochs
@ -192,13 +189,12 @@ public:
void started() { LOG(info, "Training started"); }
void finished() {
if (getSigtermFlag())
LOG(info, "Training interrupted (SIGTERM).");
if (saveAndExitRequested())
LOG(info, "Training interrupted (via signal).");
else
LOG(info, "Training finished");
}
void addValidator(Ptr<ValidatorBase> validator) {
validators_.push_back(validator);
@ -223,9 +219,10 @@ public:
void validate(const std::vector<Ptr<ExpressionGraph>>& graphs,
bool isFinal = false) {
// Do not validate if already validated (for instance, after the model is
// loaded) or if validation is scheduled for another update, or when signal SIGTERM was received
if(getSigtermFlag() // SIGTERM was received
// Do not validate if already validated (for instance, after the model is loaded)
// or if validation is scheduled for another update, or when a graceful shutdown
// was requested.
if(saveAndExitRequested()
|| state_->validated // already validated (in resumed training, for example)
|| (!state_->enteredNewPeriodOf(options_->get<std::string>("valid-freq")) && !isFinal)) // not now
return;

View File

@ -16,6 +16,7 @@ template <class ModelWrapper>
class Train : public ModelTask {
private:
Ptr<Options> options_;
void installCustomSignalHandlers();
public:
Train(Ptr<Options> options) : options_(options) {}
@ -77,6 +78,9 @@ public:
bool restored = !options_->get<bool>("no-restore-corpus")
&& batchGenerator->restore(trainState);
// We only want custom behavior once training starts.
installCustomSignalHandlers();
// -- main training loop
scheduler->started();
while(scheduler->keepGoing()) {
@ -107,4 +111,16 @@ public:
finalizeMPI(std::move(mpi));
}
};
template <class ModelWrapper>
void Train<ModelWrapper>::installCustomSignalHandlers(){
const std::string sigTermAction = options_->get<std::string>("sigterm");
if (sigTermAction == "save-and-exit") {
LOG(debug, "Will save before exiting upon SIGTERM.");
signal(SIGTERM, requestSaveAndExit);
}
else if (sigTermAction != "exit-immediately")
ABORT("Unrecognized value '{}' for --sigterm", sigTermAction);
}
} // namespace marian