Merged PR 15233: Sync internal master with public master

Regular sync of public and internal master.
Martin Junczys-Dowmunt 2020-09-07 19:37:41 +00:00
parent ea3ac624c6
commit e3916b3d08
28 changed files with 521 additions and 191 deletions

View File

@ -1,49 +0,0 @@
name: windows-2019-cpu
on:
push:
branches: [ master ]
pull_request:
branches: [ master ]
jobs:
build:
runs-on: windows-2019
steps:
- name: Checkout
uses: actions/checkout@v2
with:
submodules: recursive
- name: Prepare vcpkg
uses: lukka/run-vcpkg@v3
with:
vcpkgArguments: protobuf
vcpkgGitCommitId: 6185aa76504a5025f36754324abf307cc776f3da
vcpkgDirectory: ${{ github.workspace }}/vcpkg/
vcpkgTriplet: x64-windows-static
# Note that we build with a simplified CMake settings JSON file
- name: Run CMake
uses: lukka/run-cmake@v2
with:
buildDirectory: ${{ github.workspace }}/build/
cmakeAppendedArgs: -G Ninja
cmakeListsOrSettingsJson: CMakeSettingsJson
cmakeSettingsJsonPath: ${{ github.workspace }}/CMakeSettingsCI.json
useVcpkgToolchainFile: true
- name: Run unit tests
working-directory: build/Debug/
run: ctest
- name: Print versions
working-directory: build/Debug/
run: |
.\marian.exe --version
.\marian-decoder.exe --version
.\marian-scorer.exe --version
.\spm_encode.exe --version

View File

@ -1,4 +1,4 @@
name: macos-10.5-cpu
name: macOS CPU-only
on:
push:
@ -7,7 +7,7 @@ on:
branches: [ master ]
jobs:
build:
build-macos:
runs-on: macos-10.15

View File

@ -1,4 +1,4 @@
name: ubuntu-18.04-cpu
name: Ubuntu 18.04 CPU-only
on:
push:
@ -7,7 +7,7 @@ on:
branches: [ master ]
jobs:
build:
build-ubuntu:
runs-on: ubuntu-18.04
@ -41,7 +41,8 @@ jobs:
run: |
mkdir -p build
cd build
cmake .. -DCOMPILE_CPU=on -DCOMPILE_CUDA=off -DCOMPILE_EXAMPLES=on -DCOMPILE_SERVER=on -DCOMPILE_TESTS=on \
cmake .. -DCMAKE_BUILD_TYPE=Release \
-DCOMPILE_CPU=on -DCOMPILE_CUDA=off -DCOMPILE_EXAMPLES=on -DCOMPILE_SERVER=on -DCOMPILE_TESTS=on \
-DUSE_FBGEMM=on -DUSE_SENTENCEPIECE=on \
-DBOOST_ROOT=$BOOST_ROOT_1_69_0 -DBOOST_INCLUDEDIR=$BOOST_ROOT_1_69_0/include -DBOOST_LIBRARYDIR=$BOOST_ROOT_1_69_0/lib \
-DBoost_ARCHITECTURE=-x64
@ -62,3 +63,13 @@ jobs:
./marian-scorer --version
./spm_encode --version
- name: Prepare archive
working-directory: build
run: tar zcvf marian-ubuntu-release-static.tar.gz marian*
# Marian is built with FBGEMM, so there are some restrictions on which CPUs the executables can run
- name: Upload archive
uses: actions/upload-artifact@v2
with:
name: marian-ubuntu-release-static.tar.gz
path: build/marian-ubuntu-release-static.tar.gz

.github/workflows/ubuntu-gpu.yml vendored Normal file (96 lines)
View File

@ -0,0 +1,96 @@
name: Ubuntu CPU+CUDA
on:
push:
branches: [ master ]
pull_request:
branches: [ master ]
jobs:
build-ubuntu-cuda:
runs-on: ${{ matrix.os }}
continue-on-error: ${{ matrix.experimental }}
strategy:
matrix:
include:
# Ubuntu 20.04 supports CUDA 11+
#- os: ubuntu-20.04
#cuda: "11.0"
#gcc: 9
#boost: false # ubuntu-20.04 image does not have Boost pre-installed yet
#experimental: true # continue even if the job fails
# Ubuntu 18.04 supports CUDA 10.1+
- os: ubuntu-18.04
cuda: "10.2"
gcc: 8
boost: true
experimental: false
# Ubuntu 16.04 supports CUDA 8+
- os: ubuntu-16.04
cuda: "10.2"
gcc: 7
boost: true
experimental: false
- os: ubuntu-16.04
cuda: 9.2
gcc: 7
boost: true
experimental: false
steps:
- name: Checkout
uses: actions/checkout@v2
with:
submodules: recursive
# The following packages are already installed on GitHub-hosted runners: build-essential openssl libssl-dev
# No need to install libprotobuf{17,10,9v5} on Ubuntu {20,18,16}.04 because it is installed together with libprotobuf-dev
- name: Install dependencies
run: sudo apt-get install -y libgoogle-perftools-dev libprotobuf-dev protobuf-compiler
# https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html
- name: Install MKL
run: |
wget -qO- "https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB" | sudo apt-key add -
sudo sh -c "echo deb https://apt.repos.intel.com/mkl all main > /etc/apt/sources.list.d/intel-mkl.list"
sudo apt-get update -o Dir::Etc::sourcelist="/etc/apt/sources.list.d/intel-mkl.list"
sudo apt-get install -y --no-install-recommends intel-mkl-64bit-2020.0-088
# The script simplifies installation of different versions of CUDA
- name: Install CUDA
run: ./scripts/ci/install_cuda_ubuntu.sh ${{ matrix.cuda }}
# Boost is already installed on GitHub-hosted runners in a non-standard location
# https://github.com/actions/virtual-environments/issues/687#issuecomment-610471671
- name: Configure CMake
run: |
mkdir -p build
cd build
CC=/usr/bin/gcc-${{ matrix.gcc }} CXX=/usr/bin/g++-${{ matrix.gcc }} CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }} \
cmake .. \
-DCOMPILE_CPU=on -DCOMPILE_CUDA=on -DCOMPILE_EXAMPLES=on -DCOMPILE_SERVER=${{ matrix.boost }} -DCOMPILE_TESTS=on \
-DUSE_FBGEMM=on -DUSE_SENTENCEPIECE=on \
-DBOOST_ROOT=$BOOST_ROOT_1_69_0 -DBOOST_INCLUDEDIR=$BOOST_ROOT_1_69_0/include -DBOOST_LIBRARYDIR=$BOOST_ROOT_1_69_0/lib \
-DBoost_ARCHITECTURE=-x64 \
-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-${{ matrix.cuda }}
- name: Compile
working-directory: build
run: make -j2
# Unit tests are not run because GitHub-hosted runners do not have GPUs
# TODO: add a flag to CMake to compile unit tests only on CPU
#- name: Run unit tests
#working-directory: build
#run: make test
- name: Print versions
working-directory: build
run: |
./marian --version
./marian-decoder --version
./marian-scorer --version
./spm_encode --version

.github/workflows/windows-cpu.yml vendored Normal file (67 lines)
View File

@ -0,0 +1,67 @@
name: Windows CPU-only
on:
push:
branches: [ master ]
pull_request:
branches: [ master ]
jobs:
build-windows:
runs-on: windows-2019
steps:
- name: Checkout
uses: actions/checkout@v2
with:
submodules: recursive
- name: Download MKL
run: |
# Wget can retry downloading files, so it is used instead of Invoke-WebRequest
C:\msys64\usr\bin\wget.exe -nv https://romang.blob.core.windows.net/mariandev/ci/mkl-2020.1-windows-static.zip -O mkl.zip
Expand-Archive -Force mkl.zip ${{ github.workspace }}/mkl
# Set the MKLROOT environment variable so that CMake can find MKL.
# GITHUB_WORKSPACE is an environment variable available on all GitHub-hosted runners
echo "::set-env name=MKLROOT::$env:GITHUB_WORKSPACE/mkl"
shell: powershell
- name: Prepare vcpkg
uses: lukka/run-vcpkg@v2
with:
vcpkgArguments: protobuf
vcpkgGitCommitId: 6185aa76504a5025f36754324abf307cc776f3da
vcpkgDirectory: ${{ github.workspace }}/vcpkg/
vcpkgTriplet: x64-windows-static
# Build with a simplified CMake settings JSON file
- name: Run CMake
uses: lukka/run-cmake@v2
with:
buildDirectory: ${{ github.workspace }}/build/
cmakeAppendedArgs: -G Ninja
cmakeListsOrSettingsJson: CMakeSettingsJson
# JSON file must be in the same directory as the main CMakeLists.txt
cmakeSettingsJsonPath: ${{ github.workspace }}/_CMakeSettingsCI_CPU.json
useVcpkgToolchainFile: true
- name: Run unit tests
working-directory: build/Release/
run: ctest
#- name: Print versions
#working-directory: build/Release/
#run: |
#.\marian.exe --version
#.\marian-decoder.exe --version
#.\marian-scorer.exe --version
#.\spm_encode.exe --version
#shell: powershell
# Marian is built with FBGEMM, so there are some restrictions on which CPUs the executables can run
- name: Upload archive
uses: actions/upload-artifact@v2
with:
name: marian-windows-release-static
path: build/Release/marian*.exe

.github/workflows/windows-gpu.yml vendored Normal file (64 lines)
View File

@ -0,0 +1,64 @@
name: Windows CPU+CUDA
on:
push:
branches: [ master ]
pull_request:
branches: [ master ]
jobs:
build-windows-cuda:
runs-on: windows-2019
steps:
- name: Checkout
uses: actions/checkout@v2
with:
submodules: recursive
- name: Install CUDA
run: |
.\scripts\ci\install_cuda_windows.ps1 "10.2"
# Set path to CUDA for subsequent steps so that CMake can find it
echo "::set-env name=CUDA_PATH::$env:CUDA_PATH"
echo "::add-path::$env:CUDA_PATH/bin"
shell: powershell
- name: Download MKL
run: |
# Wget can retry downloading files, so it is used instead of Invoke-WebRequest
C:\msys64\usr\bin\wget.exe -nv https://romang.blob.core.windows.net/mariandev/ci/mkl-2020.1-windows-static.zip -O mkl.zip
Expand-Archive -Force mkl.zip ${{ github.workspace }}/mkl
# Set the MKLROOT environment variable so that CMake can find MKL.
# GITHUB_WORKSPACE is an environment variable available on all GitHub-hosted runners
echo "::set-env name=MKLROOT::$env:GITHUB_WORKSPACE/mkl"
shell: powershell
- name: Prepare vcpkg
uses: lukka/run-vcpkg@v2
with:
vcpkgArguments: protobuf
vcpkgGitCommitId: 6185aa76504a5025f36754324abf307cc776f3da
vcpkgDirectory: ${{ github.workspace }}/vcpkg/
vcpkgTriplet: x64-windows-static
# Build with a simplified CMake settings JSON file.
# On Windows+CUDA we compile with COMPILE_CUDA=on and USE_NCCL=off
- name: Run CMake
uses: lukka/run-cmake@v2
with:
buildDirectory: ${{ github.workspace }}/build/
cmakeAppendedArgs: -G Ninja
cmakeListsOrSettingsJson: CMakeSettingsJson
# JSON file must be in the same directory as the main CMakeLists.txt
cmakeSettingsJsonPath: ${{ github.workspace }}/_CMakeSettingsCI_GPU.json
useVcpkgToolchainFile: true
- name: Print versions
working-directory: build/Debug/
run: |
.\marian.exe --version
.\marian-decoder.exe --version
.\marian-scorer.exe --version
.\spm_encode.exe --version

View File

@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
## [Unreleased]
### Added
- Turing and Ampere GPU optimisation support, if the CUDA version supports it.
- Printing word-level scores in marian-scorer
- Optimize LayerNormalization on CPU by 6x through vectorization (ffast-math) and fixing performance regression introduced with strides in 77a420
- Decoding multi-source models in marian-server with --tsv
@ -24,6 +25,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Internal optional parameter in n-best list generation that skips empty hypotheses.
### Fixed
- Print "server is listening on port" message after it is accepting connections
- Fix compilation without BLAS installed
- Providing a single value to vector-like options using the equals sign, e.g. --models=model.npz
- Fix quiet-translation in marian-server
@ -39,6 +41,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Properly record cmake variables in the cmake build directory instead of the source tree.
- Added default "none" for option shuffle in BatchGenerator, so that it works in executables where shuffle is not an option.
- Added a few missing header files in shortlist.h and beam_search.h.
- Improved handling for receiving SIGTERM during training. By default, SIGTERM triggers 'save (now) and exit'. Prior to this fix, batch pre-fetching did not check for this signal, potentially delaying exit considerably; it now does. Also, the default save-and-exit behaviour can now be disabled on the command line with --sigterm exit-immediately.
### Changed
- Move Simple-WebSocket-Server to submodule

View File

@ -13,10 +13,6 @@ set(BUILD_ARCH native CACHE STRING "Compile for this CPU architecture.")
# Custom CMake options
option(COMPILE_CPU "Compile CPU version" ON)
option(COMPILE_CUDA "Compile GPU version" ON)
option(COMPILE_CUDA_SM35 "Compile GPU version with SM35 support" ON)
option(COMPILE_CUDA_SM50 "Compile GPU version with SM50 support" ON)
option(COMPILE_CUDA_SM60 "Compile GPU version with SM60 support" ON)
option(COMPILE_CUDA_SM70 "Compile GPU version with SM70 support" ON)
option(COMPILE_EXAMPLES "Compile examples" OFF)
option(COMPILE_SERVER "Compile marian-server" OFF)
option(COMPILE_TESTS "Compile tests" OFF)
@ -243,6 +239,30 @@ if(CUDA_FOUND)
message(WARNING "On some Unix systems CUDA 10.0+ requires CMake 3.12.2+; you use CMake ${CMAKE_VERSION}")
endif()
# We want to compile as many targets as possible, but different CUDA versions support different targets.
# Enable the SM options below based on which CUDA version is available.
if((CUDA_VERSION VERSION_EQUAL "9.0" OR CUDA_VERSION VERSION_GREATER "9.0") AND CUDA_VERSION VERSION_LESS "11.0")
option(COMPILE_CUDA_SM35 "Compile GPU version with SM35 support" ON)
option(COMPILE_CUDA_SM50 "Compile GPU version with SM50 support" ON)
option(COMPILE_CUDA_SM60 "Compile GPU version with SM60 support" ON)
option(COMPILE_CUDA_SM70 "Compile GPU version with SM70 support" ON)
endif()
if((CUDA_VERSION VERSION_EQUAL "10.0" OR CUDA_VERSION VERSION_GREATER "10.0") AND CUDA_VERSION VERSION_LESS "11.0")
option(COMPILE_CUDA_SM35 "Compile GPU version with SM35 support" ON)
option(COMPILE_CUDA_SM50 "Compile GPU version with SM50 support" ON)
option(COMPILE_CUDA_SM60 "Compile GPU version with SM60 support" ON)
option(COMPILE_CUDA_SM70 "Compile GPU version with SM70 support" ON)
option(COMPILE_CUDA_SM75 "Compile GPU version with SM75 support" ON)
endif()
if(CUDA_VERSION VERSION_EQUAL "11.0" OR CUDA_VERSION VERSION_GREATER "11.0")
option(COMPILE_CUDA_SM35 "Compile GPU version with SM35 support" ON)
option(COMPILE_CUDA_SM50 "Compile GPU version with SM50 support" ON)
option(COMPILE_CUDA_SM60 "Compile GPU version with SM60 support" ON)
option(COMPILE_CUDA_SM70 "Compile GPU version with SM70 support" ON)
option(COMPILE_CUDA_SM75 "Compile GPU version with SM75 support" ON)
option(COMPILE_CUDA_SM80 "Compile GPU version with SM80 support" ON)
endif()
if(COMPILE_CUDA_SM35)
LIST(APPEND COMPUTE -arch=sm_35; -gencode=arch=compute_35,code=sm_35;) # Tesla K40 and above
endif(COMPILE_CUDA_SM35)
@ -255,6 +275,16 @@ if(CUDA_FOUND)
if(COMPILE_CUDA_SM70)
LIST(APPEND COMPUTE -gencode=arch=compute_70,code=sm_70; -gencode=arch=compute_70,code=compute_70) # Volta GPUs
endif(COMPILE_CUDA_SM70)
if(CUDA_VERSION VERSION_EQUAL "10.0" OR CUDA_VERSION VERSION_GREATER "10.0")
if(COMPILE_CUDA_SM75)
LIST(APPEND COMPUTE -gencode=arch=compute_75,code=sm_75; -gencode=arch=compute_75,code=compute_75) # Turing GPUs
endif(COMPILE_CUDA_SM75)
endif()
if(CUDA_VERSION VERSION_EQUAL "11.0" OR CUDA_VERSION VERSION_GREATER "11.0")
if(COMPILE_CUDA_SM80)
LIST(APPEND COMPUTE -gencode=arch=compute_80,code=sm_80; -gencode=arch=compute_80,code=compute_80) # Ampere GPUs
endif(COMPILE_CUDA_SM80)
endif()
if(USE_STATIC_LIBS)
set(EXT_LIBS ${EXT_LIBS} ${CUDA_curand_LIBRARY} ${CUDA_cusparse_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})
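For reference, the COMPILE_CUDA_SM* numbers above are GPU compute capabilities (e.g. sm_75 for Turing, sm_80 for Ampere). A minimal, hypothetical check, not part of this change, that prints the compute capability of the installed devices so it can be compared against the architectures compiled in:

// Hypothetical helper, not part of this PR: report each device's compute capability.
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  int deviceCount = 0;
  if(cudaGetDeviceCount(&deviceCount) != cudaSuccess || deviceCount == 0) {
    std::fprintf(stderr, "No CUDA device found\n");
    return 1;
  }
  for(int i = 0; i < deviceCount; ++i) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, i);
    // e.g. 7.5 -> sm_75 (Turing, needs CUDA 10.0+), 8.0 -> sm_80 (Ampere, needs CUDA 11.0+)
    std::printf("Device %d: %s, compute capability %d.%d\n",
                i, prop.name, prop.major, prop.minor);
  }
  return 0;
}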

View File

@ -1 +1 @@
v1.9.35
v1.9.36

_CMakeSettingsCI_CPU.json Normal file (28 lines)
View File

@ -0,0 +1,28 @@
{
"configurations": [
{
"name": "Release",
"generator": "Ninja",
"configurationType": "Release",
"inheritEnvironments": [ "msvc_x64" ],
"cmakeCommandArgs": "",
"buildCommandArgs": "-v",
"ctestCommandArgs": "",
"variables": [
{ "name": "OPENSSL_USE_STATIC_LIBS:BOOL", "value": "TRUE" },
{ "name": "OPENSSL_MSVC_STATIC_RT:BOOL", "value": "TRUE" },
{ "name": "COMPILE_CUDA:BOOL", "value": "FALSE" },
{ "name": "COMPILE_CPU:BOOL", "value": "TRUE" },
{ "name": "COMPILE_EXAMPLES:BOOL", "value": "FALSE" },
{ "name": "COMPILE_SERVER:BOOL", "value": "FALSE" },
{ "name": "COMPILE_TESTS:BOOL", "value": "TRUE" },
{ "name": "USE_FBGEMM:BOOL", "value": "TRUE" },
{ "name": "USE_MPI:BOOL", "value": "FALSE" },
{ "name": "USE_SENTENCEPIECE:BOOL", "value": "TRUE" },
{ "name": "USE_STATIC_LIBS:BOOL", "value": "TRUE" }
]
}
]
}

View File

@ -12,7 +12,7 @@
{ "name": "OPENSSL_USE_STATIC_LIBS:BOOL", "value": "TRUE" },
{ "name": "OPENSSL_MSVC_STATIC_RT:BOOL", "value": "TRUE" },
{ "name": "COMPILE_CUDA:BOOL", "value": "FALSE" },
{ "name": "COMPILE_CUDA:BOOL", "value": "TRUE" },
{ "name": "COMPILE_CPU:BOOL", "value": "TRUE" },
{ "name": "COMPILE_EXAMPLES:BOOL", "value": "FALSE" },
{ "name": "COMPILE_SERVER:BOOL", "value": "FALSE" },
@ -20,6 +20,7 @@
{ "name": "USE_FBGEMM:BOOL", "value": "TRUE" },
{ "name": "USE_MPI:BOOL", "value": "FALSE" },
{ "name": "USE_NCCL:BOOL", "value": "FALSE" },
{ "name": "USE_SENTENCEPIECE:BOOL", "value": "TRUE" },
{ "name": "USE_STATIC_LIBS:BOOL", "value": "TRUE" }
]
@ -36,7 +37,7 @@
{ "name": "OPENSSL_MSVC_STATIC_RT:BOOL", "value": "TRUE" },
{ "name": "OPENSSL_USE_STATIC_LIBS:BOOL", "value": "TRUE" },
{ "name": "COMPILE_CUDA:BOOL", "value": "FALSE" },
{ "name": "COMPILE_CUDA:BOOL", "value": "TRUE" },
{ "name": "COMPILE_CPU:BOOL", "value": "TRUE" },
{ "name": "COMPILE_EXAMPLES:BOOL", "value": "FALSE" },
{ "name": "COMPILE_SERVER:BOOL", "value": "FALSE" },
@ -44,6 +45,7 @@
{ "name": "USE_FBGEMM:BOOL", "value": "TRUE" },
{ "name": "USE_MPI:BOOL", "value": "FALSE" },
{ "name": "USE_NCCL:BOOL", "value": "FALSE" },
{ "name": "USE_SENTENCEPIECE:BOOL", "value": "TRUE" },
{ "name": "USE_STATIC_LIBS:BOOL", "value": "TRUE" }
]

@ -1 +1 @@
Subproject commit 0d0da014671ac3366d5021a4b33fe2efa1809a15
Subproject commit cdad78089484d7817d91c803d6fc7049328e20db

scripts/ci/install_mkl.sh Executable file (7 lines)
View File

@ -0,0 +1,7 @@
#!/usr/bin/env bash
# https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html
wget -qO- "https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB" | sudo apt-key add -
sudo sh -c "echo deb https://apt.repos.intel.com/mkl all main > /etc/apt/sources.list.d/intel-mkl.list"
sudo apt-get update -o Dir::Etc::sourcelist="/etc/apt/sources.list.d/intel-mkl.list"
sudo apt-get install --no-install-recommends intel-mkl-64bit-2020.0-088

View File

@ -133,7 +133,7 @@ if(CUDA_FOUND)
BINARY_DIR ${CMAKE_CURRENT_SOURCE_DIR}/nccl
CONFIGURE_COMMAND ""
BUILD_COMMAND
$(MAKE) -f ${CMAKE_CURRENT_SOURCE_DIR}/nccl/Makefile src.build
${CMAKE_MAKE_PROGRAM} -f ${CMAKE_CURRENT_SOURCE_DIR}/nccl/Makefile src.build
BUILDDIR=${CMAKE_BINARY_DIR}/local CUDA_HOME=${CUDA_TOOLKIT_ROOT_DIR}
CUDA8_GENCODE=${GENCODE} CXX=${CMAKE_CXX_COMPILER}
INSTALL_COMMAND "")

src/3rd_party/nccl vendored (2 lines)

@ -1 +1 @@
Subproject commit b56650c7f59b8cd40d18809784a6d6be38ef8acb
Subproject commit 7d3486128ebc865b9f2cad63a5cfd3a8f6abcb5a

@ -1 +1 @@
Subproject commit 417a2a9e9dbd720b8d2dfa1dafe57cf1b37ca0d7
Subproject commit 257439f5bd0a15f315c1c2733ea8a4fb0e32c1db

View File

@ -25,6 +25,7 @@ add_library(marian STATIC
common/filesystem.cpp
common/file_stream.cpp
common/file_utils.cpp
common/signal_handling.cpp
common/types.cpp
data/alignment.cpp
@ -99,7 +100,6 @@ add_library(marian STATIC
training/graph_group_singleton.cpp
training/validator.cpp
training/communicator.cpp
training/scheduler.cpp
# this is only compiled to catch build errors, but not linked
microsoft/quicksand.cpp

View File

@ -37,9 +37,8 @@ int main(int argc, char **argv) {
// Send translation back
connection->send(sendStream, [](const SimpleWeb::error_code &ec) {
if(ec) {
if(ec)
LOG(error, "Error sending message: ({}) {}", ec.value(), ec.message());
}
});
};
@ -52,8 +51,9 @@ int main(int argc, char **argv) {
// Start server thread
std::thread serverThread([&server]() {
LOG(info, "Server is listening on port {}", server.config.port);
server.start();
server.start([](unsigned short port) {
LOG(info, "Server is listening on port {}", port);
});
});
serverThread.join();

View File

@ -1,6 +1,7 @@
#include <signal.h>
#include "marian.h"
#include "common/signal_handling.h"
#include "training/graph_group_async.h"
#include "training/graph_group_singleton.h"
#include "training/graph_group_sync.h"
@ -42,14 +43,12 @@ int main(int argc, char** argv) {
New<Train<AsyncGraphGroup>>(options)->run();
}
}
// If we exit due to SIGTERM, exit with 128 + the signal number, as suggested
// for bash in http://tldp.org/LDP/abs/html/exitcodes.html. This allows parent
// If we exit due to a graceful exit request via SIGTERM, exit with 128 + SIGTERM,
// as suggested for bash in http://tldp.org/LDP/abs/html/exitcodes.html. This allows parent
// scripts to determine if training terminated naturally or via SIGTERM.
// With this approach we can accommodate additional signals in the future.
// An alternative would be to return 124, which is what the timeout command
// An alternative would be to exit with code 124, which is what the timeout command
// returns for timeout -s SIGTERM <seconds> ...., because exiting after SIGTERM
// is not technically a fatal error (which is what the 128+x convention usually
// stands for).
return getSigtermFlag() ? (128 + SIGTERM) : 0;
exit(getSignalFlag(SIGTERM) ? 128 + SIGTERM : EXIT_SUCCESS);
}
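As an aside on the exit-code convention above: a parent process or wrapper (a hypothetical sketch, not part of this change) can distinguish a SIGTERM-triggered save-and-exit (exit code 128 + 15 = 143) from a normal completion (0), for example:

// Hypothetical wrapper sketch (not part of this PR): interpret marian's exit code.
#include <csignal>
#include <cstdio>
#include <cstdlib>
#include <sys/wait.h>   // WIFEXITED / WEXITSTATUS (POSIX)

int main() {
  // std::system() returns a wait status; on POSIX, WIFEXITED/WEXITSTATUS decode it.
  int status = std::system("./marian --sigterm save-and-exit -c config.yml");
  if(!WIFEXITED(status)) {
    std::printf("marian did not exit normally\n");
    return 1;
  }
  int exitCode = WEXITSTATUS(status);
  if(exitCode == 128 + SIGTERM)      // 143: training saved its state and exited on SIGTERM
    std::printf("training stopped gracefully on SIGTERM\n");
  else if(exitCode == EXIT_SUCCESS)  // 0: training ran to completion
    std::printf("training finished normally\n");
  else
    std::printf("training failed with exit code %d\n", exitCode);
  return 0;
}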

View File

@ -143,6 +143,15 @@ void ConfigParser::addOptionsGeneral(cli::CLIWrapper& cli) {
cli.add<std::string>("--dump-config",
"Dump current (modified) configuration to stdout and exit. Possible values: full, minimal, expand")
->implicit_val("full");
if(mode_ == cli::mode::training) {
// --sigterm is deliberately not a boolean, to allow for a consistent
// pattern of specifying custom signal handling in the future.
// (e.g., dump model but continue training upon SIGUSR1, or report current
// training status upon SIGINFO.)
cli.add<std::string>("--sigterm",
"What to do with SIGTERM: save-and-exit or exit-immediately.",
"save-and-exit");
}
// clang-format on
}

View File

@ -175,8 +175,8 @@ void TemporaryFile::MakeTemp(const std::string &base) {
// open again with c++
streamBuf1_.reset(new std::filebuf());
auto ret = static_cast<std::filebuf*>(streamBuf1_.get())->open(name, std::ios::out | std::ios_base::binary);
ABORT_IF(!streamBuf1_, "File cannot be temp opened", name);
ABORT_IF(ret != streamBuf1_.get(), "Return value is not equal to streambuf pointer, that is weird");
ABORT_IF(!streamBuf1_, "File {} cannot be temp opened", name);
ABORT_IF(ret != streamBuf1_.get(), "Return value ({}) is not equal to streambuf pointer ({}), that is weird.", (size_t)ret, (size_t)streamBuf1_.get());
this->init(streamBuf1_.get());

View File

@ -0,0 +1,58 @@
#include "common/logging.h"
#include "signal_handling.h"
// The simplest (and recommended) way to handle signals is to simply set a flag
// in the signal handler and check that flag later.
//
// We provide setSignalFlag as the most generic signal handler. This handler uses a
// single sig_atomic_t as a bit field. On Linux, sig_atomic_t is equivalent to a signed int,
// theoretically providing 32 binary flags; in practice, the signals for which we are most
// likely to want to install handlers are
// - SIGTERM (15): which by default signals the request for a graceful shutdown
// - SIGUSR1 (10): intended for custom use, default action in Linux is termination
// - SIGUSR2 (12): intended for custom use, default action in Linux is termination
// - SIGINT (2): interrupt from the console
// Just to be safe, we accommodate signals up to signal No. 30.
// In addition, we also provide requestSaveAndExit() and saveAndExitRequested() as a signal
// handler / checker pair for graceful shutdown requests during training.
constexpr int maxSignalForSetSignalFlag{30};
// Make sure sig_atomic_t is large enough as a bit field for our purposes.
// That said, I'm not aware of any platform where this would be a problem.
static_assert(SIG_ATOMIC_MAX > (1U<<maxSignalForSetSignalFlag),
"sig_atomic_t is too small for signal flags on this platform.");
namespace marian{
volatile std::sig_atomic_t sigflags_{0};
volatile std::sig_atomic_t saveAndExit_{0};
void setSignalFlag(int sig) {
// sigflags_ is an int type serving as a bit field for flags corresponding
// to signals (lower or equal to maxSignalForSetSignalFlag). We set the
// flag via a bitwise OR (|=) of the bit field and an int value with exactly
// one bit set (1<<sig).
sigflags_ |= (1<<sig);
}
// Check if the flag for the signal sig is set in the bit field sigflags_
bool getSignalFlag(const int sig) {
ABORT_IF(sig > maxSignalForSetSignalFlag,
"Signal out of range (must be < {}, is {}).", maxSignalForSetSignalFlag, sig);
// Do bitwise AND between sigflags_ and an int value that has exactly one bit set that
// corresponds to the signal in question. If the bit is set (see setSignalFlag above),
// the bitwise AND will return a non-zero integer; if it is not set, the result will
// be zero.
return (sigflags_ & (1<<sig)) != 0;
}
void requestSaveAndExit(int sig) {
setSignalFlag(sig); // keep track of triggering signal
saveAndExit_ = 1; // set flag to exit gracefully
}
bool saveAndExitRequested() {
return saveAndExit_ == 1;
}
}
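A minimal usage sketch for the generic flag handler (illustrative only, not part of this file): install setSignalFlag for a signal of interest and poll getSignalFlag from the main loop.

// Illustrative sketch, not part of this PR: using the generic flag handler.
#include <csignal>
#include "common/signal_handling.h"

void runLoop() {
  std::signal(SIGUSR1, marian::setSignalFlag);  // record SIGUSR1 in the flag bit field
  for(;;) {
    // ... do one unit of work ...
    if(marian::getSignalFlag(SIGUSR1)) {
      // e.g. dump a checkpoint or report status, then continue or break
      break;
    }
  }
}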

View File

@ -0,0 +1,39 @@
#pragma once
#include <csignal>
#include <string>
// SIGNAL HANDLING
// The signal handlers (and checkers) here are implemented in line with the recommendations
// for signal handling in the SEI CERT C Coding Standard, specifically
//
// - SIG30-C:
// https://wiki.sei.cmu.edu/confluence/display/c/SIG30-C.+Call+only+asynchronous-safe+functions+within+signal+handlers
//
// - SIG31-C:
// https://wiki.sei.cmu.edu/confluence/display/c/SIG31-C.+Do+not+access+shared+objects+in+signal+handlers
//
// The exact behavior of 'graceful exit' depends on the application; for training, it means 'save model and exit',
// for a server (not implemented yet): 'block new requests but serve pending requests and then exit'.
//
// Graceful exit for training is useful for training on clusters with time limits on jobs. Slurm, for example, can be
// set up to send a custom signal at a set time before the end of the time slot, giving Marian time to save its current
// state before getting killed.
namespace marian {
/// Request graceful exit (signal handler)
void requestSaveAndExit(int sig);
/// Check if graceful exit was requested.
bool saveAndExitRequested();
/// General purpose signal handler that simply sets a flag when a signal is received.
// (only for SIGNAL No. < 32).
void setSignalFlag(int sig); // custom handler (set flag) for sig
/// Check if a setSignalFlag was triggered for this signal
bool getSignalFlag(int sig);
} // End of namespace marian
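A condensed sketch of how these two entry points fit together for the training use case described above (it mirrors what the training/ changes further down in this diff do; everything except the two functions is illustrative):

// Condensed illustration of the save-and-exit flow (see training/ changes below).
#include <csignal>
#include "common/signal_handling.h"

void trainLoop() {
  // e.g. Slurm can be configured to send SIGTERM some time before the job's time limit
  std::signal(SIGTERM, marian::requestSaveAndExit);
  while(!marian::saveAndExitRequested()) {
    // ... fetch the next batch and run one training update ...
  }
  // saveModel();  // hypothetical: persist the current state before exiting
}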

View File

@ -1,6 +1,7 @@
#pragma once
#include "common/options.h"
#include "common/signal_handling.h"
#include "data/batch_stats.h"
#include "data/rng_engine.h"
#include "training/training_state.h"
@ -136,6 +137,8 @@ private:
}
size_t sets = 0;
while(current_ != data_->end() && maxiBatch->size() < maxSize) { // loop over data
if (saveAndExitRequested()) // stop generating batches
return std::deque<BatchPtr>();
maxiBatch->push(*current_);
sets = current_->size();
// do not consume more than required for the maxi batch as this causes
@ -161,6 +164,8 @@ private:
if (stats_)
cachedStatsIter = stats_->begin();
while(!maxiBatch->empty()) { // while there are sentences in the queue
if (saveAndExitRequested()) // stop generating batches
return std::deque<BatchPtr>();
// push item onto batch
batchVector.push_back(maxiBatch->top());
maxiBatch->pop(); // fetch next-shortest
@ -249,7 +254,7 @@ private:
"If you have changed the training corpus, add --no-restore-corpus to the training command and run it again.");
bufferedBatches_ = std::move(futureBufferedBatches_.get());
// if bg thread returns an empty swath, we hit the end of the epoch
if (bufferedBatches_.empty()) {
if (bufferedBatches_.empty() || saveAndExitRequested()) {
return nullptr;
}
// and kick off the next bg operation
@ -257,7 +262,7 @@ private:
} else { // don't spawn any threads, i.e. batch fetching is blocking.
bufferedBatches_ = fetchBatches();
// if bufferedBatches is empty we hit the end of the epoch
if (bufferedBatches_.empty()) {
if (bufferedBatches_.empty() || saveAndExitRequested()) {
return nullptr;
}
}

View File

@ -12,63 +12,6 @@
#include "tensors/gpu/cuda_helpers.h"
// clang-format on
// recreations of a few cusparse functions that were deprecated in CUDA 11
// @TODO: Fill these in. This is not trivial. Until then, using these with CUDA 11 will fail.
#if CUDA_VERSION >= 11000
cusparseStatus_t
cusparseSgemmi10(cusparseHandle_t handle,
int m,
int n,
int k,
int nnz,
const float* alpha,
const float* A,
int lda,
const float* cscValB,
const int* cscColPtrB,
const int* cscRowIndB,
const float* beta,
float* C,
int ldc) {
ABORT("Sparse matrix operations are currently not supported by Marian under CUDA 11");
}
#define cusparseSgemmi cusparseSgemmi10
cusparseStatus_t
cusparseScsr2csc(cusparseHandle_t handle,
int m,
int n,
int nnz,
const float* csrVal,
const int* csrRowPtr,
const int* csrColInd,
float* cscVal,
int* cscRowInd,
int* cscColPtr,
cusparseAction_t copyValues,
cusparseIndexBase_t idxBase) {
ABORT("Sparse matrix operations are currently not supported by Marian under CUDA 11");
}
cusparseStatus_t
cusparseScsrmm(cusparseHandle_t handle,
cusparseOperation_t transA,
int m,
int n,
int k,
int nnz,
const float* alpha,
const cusparseMatDescr_t descrA,
const float* csrValA,
const int* csrRowPtrA,
const int* csrColIndA,
const float* B,
int ldb,
const float* beta,
float* C,
int ldc) {
ABORT("Sparse matrix operations are currently not supported by Marian under CUDA 11");
}
#endif
namespace marian {
namespace gpu {
@ -420,6 +363,9 @@ static cusparseSgemmiEx(cusparseHandle_t handle, int m,
const float *cscValB, const int *cscColPtrB, const int *cscRowIndB, const float *beta,
float *C, int ldc)
{
#if CUDA_VERSION >= 11000
ABORT("cusparseSgemmi is not available in CUDA VERSION >= 11.");
#else
const int nMax = 65535; // max. number of columns allowed by cuSparse 10 implementation
for (int j0 = 0; j0 < n; j0 += 65535) { // loop over column slices, j0 = index of first column
// Call original function on a column slice.
@ -432,6 +378,7 @@ static cusparseSgemmiEx(cusparseHandle_t handle, int m,
if (rc != CUSPARSE_STATUS_SUCCESS)
return rc;
}
#endif
return CUSPARSE_STATUS_SUCCESS;
}
@ -483,6 +430,45 @@ void CSRProd(marian::Tensor C,
St_indices = allocator->alloc<int>(numValues);
St_offsets = allocator->alloc<int>(colsS + 1);
// transpose the second argument
#if CUDA_VERSION >= 11000
size_t buffer_size;
CUSPARSE_CHECK(cusparseCsr2cscEx2_bufferSize(cusparseHandle,
/*m=*/ rowsS, // number of rows of matrix
/*n=*/ colsS, // number of columns of matrix
/*nnz=*/ (int)numValues,
/*csrcVal=*/ S_values ->data<float>(),
/*csrcRowPtr=*/ (int*)S_offsets->data<IndexType>(),
/*csrcColInd=*/ (int*)S_indices->data<IndexType>(),
/*cscVal=*/ St_values ->data<float>(), // transposed version goes here
/*cscColPtr=*/ St_offsets->data<int>(),
/*cscRowInd=*/ St_indices->data<int>(),
/*valType*/ CUDA_R_32F,
/*copyValues=*/ CUSPARSE_ACTION_NUMERIC,
/*idxBase=*/ CUSPARSE_INDEX_BASE_ZERO,
/*alg*/ CUSPARSE_CSR2CSC_ALG1,
/*bufferSize*/ &buffer_size));
MemoryPiece::PtrType buffer= (buffer_size > 0) ? allocator->alloc<uint8_t>(buffer_size) : nullptr;
CUSPARSE_CHECK(cusparseCsr2cscEx2(cusparseHandle,
/*m=*/ rowsS, // number of rows of matrix
/*n=*/ colsS, // number of columns of matrix
/*nnz=*/ (int)numValues,
/*csrcVal=*/ S_values ->data<float>(),
/*csrcRowPtr=*/ (int*)S_offsets->data<IndexType>(),
/*csrcColInd=*/ (int*)S_indices->data<IndexType>(),
/*cscVal=*/ St_values ->data<float>(), // transposed version goes here
/*cscColPtr=*/ St_offsets->data<int>(),
/*cscRowInd=*/ St_indices->data<int>(),
/*valType=*/ CUDA_R_32F,
/*copyValues=*/ CUSPARSE_ACTION_NUMERIC,
/*idxBase=*/ CUSPARSE_INDEX_BASE_ZERO,
/*alg=*/ CUSPARSE_CSR2CSC_ALG1,
/*buffer=*/ buffer->data<uint8_t>()));
if (buffer)
allocator->free(buffer);
ABORT("This code is untested. Please remove this ABORT once tests exist and pass.");
#else
CUSPARSE_CHECK(cusparseScsr2csc(cusparseHandle,
/*m=*/ rowsS, // number of rows of matrix
/*n=*/ colsS, // number of columns of matrix
@ -495,12 +481,16 @@ void CSRProd(marian::Tensor C,
/*cscColPtr=*/ St_offsets->data<int>(),
/*copyValues=*/ CUSPARSE_ACTION_NUMERIC,
/*idxBase=*/ CUSPARSE_INDEX_BASE_ZERO));
#endif
std::swap(rowsS, colsS); // these variables now represent the dims of the explicitly transposed object
}
if (swapOperands) {
// C = D x S for row-major matrices
// Implemented via cusparse as C' = S' x D' ("csrmm") where C' and D' are column-major,
// and S' is CSR (if not transS then we make a transposed copy).
#if CUDA_VERSION >= 11000
ABORT("CSRProd is not yet implemented for CUDA VERSION >= 11");
#else
cusparseMatDescr_t descrA;
CUSPARSE_CHECK(cusparseCreateMatDescr(&descrA));
cusparseSetMatType (descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
@ -521,6 +511,7 @@ void CSRProd(marian::Tensor C,
C->data(),
/*ldc=*/ colsC)); // stride
cusparseDestroyMatDescr(descrA);
#endif
}
else {
// C = S x D for row-major matrices

View File

@ -1,43 +0,0 @@
#include "scheduler.h"
#include <signal.h>
#include <cassert>
namespace marian {
// SIGNAL HANDLING, see scheduler.cpp for definitions
// Currently, only the following is handled by a custom signal handler:
// SIGTERM: When SIGTERM is received, the global (static member) flag sigterm_ (false by default) is set to true
// by signalHandler(). When sigterm_ is true, keepGoing() returns false, and the current state of training models
// is saved prior to exiting.
// This functionality is helpful when training on clusters with time limits on compute slots, e.g., on
// clusters managed by slurm. Slurm can be asked to send a (custom) warning signal to a process at a given
// point in time prior to the hard "time's up".
bool sigterm_{false}; // flag signalling that SIGTERM has been received false by default, set to true by signalHandler(SIGTERM)
void signalHandler(int sig) {
// Note: sys_siglist[sig] or stdsignal() describe the effect (e.g.,
// 'Terminated' rather than provide the signal name (which are #define(s)
// in signal.h), so we have to do custom log messages here.
switch (sig) {
case SIGTERM: // save models and exit
LOG(info, "[training] Scheduler received signal SIGTERM"); // @TODO: figure out if this is safe. The logs are global and thread-safe, so should be OK?
sigterm_ = true;
break;
default:
ABORT("No action defined for signal {}", sig);
}
}
// installs signalHandler() for select signals (currently only SIGTERM)
void installSignalHandlers() {
// TODO: use sigaction instead of signal,
// cf. https://stackoverflow.com/questions/231912/what-is-the-difference-between-sigaction-and-signal
signal(SIGTERM, signalHandler);
}
bool getSigtermFlag() {
return sigterm_;
}
}

View File

@ -1,6 +1,7 @@
#pragma once
#include "common/options.h"
#include "common/signal_handling.h"
#include "training/training_state.h"
#include "training/validator.h"
#include "training/communicator.h"
@ -8,9 +9,6 @@
namespace marian {
bool getSigtermFlag();
void installSignalHandlers();
class Scheduler : public TrainingObserver {
private:
Ptr<Options> options_;
@ -154,11 +152,10 @@ public:
: options_(options), state_(state) {
ABORT_IF(state_->factor != 1, "state.factor unexpectedly not 1 at this point??");
updateLearningRate(*state);
installSignalHandlers();
}
bool keepGoing() {
if(getSigtermFlag()) // received signal SIGERM => exit gracefully
if(saveAndExitRequested()) // via SIGTERM
return false;
// stop if it reached the maximum number of epochs
@ -192,13 +189,12 @@ public:
void started() { LOG(info, "Training started"); }
void finished() {
if (getSigtermFlag())
LOG(info, "Training interrupted (SIGTERM).");
if (saveAndExitRequested())
LOG(info, "Training interrupted (via signal).");
else
LOG(info, "Training finished");
}
void addValidator(Ptr<ValidatorBase> validator) {
validators_.push_back(validator);
@ -223,9 +219,10 @@ public:
void validate(const std::vector<Ptr<ExpressionGraph>>& graphs,
bool isFinal = false) {
// Do not validate if already validated (for instance, after the model is
// loaded) or if validation is scheduled for another update, or when signal SIGTERM was received
if(getSigtermFlag() // SIGTERM was received
// Do not validate if already validated (for instance, after the model is loaded)
// or if validation is scheduled for another update, or when a graceful shutdown
// was requested.
if(saveAndExitRequested()
|| state_->validated // already validated (in resumed training, for example)
|| (!state_->enteredNewPeriodOf(options_->get<std::string>("valid-freq")) && !isFinal)) // not now
return;

View File

@ -16,6 +16,7 @@ template <class ModelWrapper>
class Train : public ModelTask {
private:
Ptr<Options> options_;
void installCustomSignalHandlers();
public:
Train(Ptr<Options> options) : options_(options) {}
@ -77,6 +78,9 @@ public:
bool restored = !options_->get<bool>("no-restore-corpus")
&& batchGenerator->restore(trainState);
// We only want custom behavior once training starts.
installCustomSignalHandlers();
// -- main training loop
scheduler->started();
while(scheduler->keepGoing()) {
@ -107,4 +111,16 @@ public:
finalizeMPI(std::move(mpi));
}
};
template <class ModelWrapper>
void Train<ModelWrapper>::installCustomSignalHandlers(){
const std::string sigTermAction = options_->get<std::string>("sigterm");
if (sigTermAction == "save-and-exit") {
LOG(debug, "Will save before exiting upon SIGTERM.");
signal(SIGTERM, requestSaveAndExit);
}
else if (sigTermAction != "exit-immediately")
ABORT("Unrecognized value '{}' for --sigterm", sigTermAction);
}
} // namespace marian