Merged PR 11920: Compare external master against internal master

Compare external master against internal master. Just double checking.
This commit is contained in:
Martin Junczys-Dowmunt 2020-03-10 00:29:55 +00:00
parent 9f29403627
commit cf7f0321f8
79 changed files with 533 additions and 332 deletions

View File

@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
## [Unreleased]
### Added
- Add support for compiling on Mac (and clang)
- An option for resetting stalled validation metrics
- Add CMAKE options to disable compilation for specific GPU SM types
- An option to print word-level translation scores
- An option to turn off automatic detokenization from SentencePiece
@ -59,6 +61,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Compilation with CUDA 10.1
### Changed
- Combine two for-loops in nth_element.cpp on CPU
- Revert LayerNorm eps to old position, i.e. sigma' = sqrt(sigma^2 + eps)
- Downgrade NCCL to 2.3.7 as 2.4.2 is buggy (hangs with larger models)
- Return error signal on SIGTERM

View File

@ -24,6 +24,7 @@ option(USE_CCACHE "Use ccache compiler cache (https://ccache.dev)" OFF)
option(USE_CUDNN "Use CUDNN library" OFF)
option(USE_DOXYGEN "Build documentation with Doxygen" ON)
option(USE_FBGEMM "Use FBGEMM" OFF)
option(USE_MKL "Compile with MKL support" ON)
option(USE_MPI "Use MPI library" OFF)
option(USE_NCCL "Use NCCL library" ON)
option(USE_SENTENCEPIECE "Download and compile SentencePiece" OFF)
@ -33,7 +34,7 @@ option(USE_STATIC_LIBS "Link statically against non-system libs" OFF)
if(USE_CCACHE)
find_program(CCACHE_PROGRAM ccache)
if(CCACHE_PROGRAM)
message(STATUS "Found and will be using ccache for faster repeat compilation (use cmake -DUSE_CCACHE=off to disable).")
message(STATUS "Will be using ccache for faster repeat compilation (use cmake -DUSE_CCACHE=off to disable).")
set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CCACHE_PROGRAM}")
else(CCACHE_PROGRAM)
message(WARNING "Compilation with ccache requested but no ccache found.")
@ -141,20 +142,32 @@ else(MSVC)
add_definitions(-DUSE_FBGEMM=1)
endif(USE_FBGEMM)
set(DISABLE_GLOBALLY "-Wno-unused-result")
if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0)
# Clang-10.0.0 complains when CUDA is newer than 10.1
set(CLANG_IGNORE_UNKNOWN_CUDA "-Wno-unknown-cuda-version")
endif()
set(DISABLE_GLOBALLY "-Wno-unused-result -Wno-unknown-warning-option ${CLANG_IGNORE_UNKNOWN_CUDA}")
# These are used in src/CMakeLists.txt on a per-target basis
list(APPEND ALL_WARNINGS -Wall; -Werror; -Wno-unused-result; -Wno-deprecated; -Wno-pragmas; -Wno-unused-parameter; -Wextra; -Wno-unused-function;
-Wno-unused-value; -Wno-unknown-pragmas; -Wno-sign-compare; -Wno-missing-field-initializers;)
list(APPEND ALL_WARNINGS -Wall; -Werror; -Wextra; -Wno-unused-result; -Wno-deprecated;
-Wno-pragmas; -Wno-unused-parameter; -Wno-unused-function;
-Wno-unused-value; -Wno-unknown-pragmas; -Wno-sign-compare;
-Wno-missing-field-initializers;)
# This warning does not exist prior to gcc 5.0
if(CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.0)
list(APPEND ALL_WARNINGS -Wsuggest-override)
list(APPEND ALL_WARNINGS -Wsuggest-override -Wno-int-in-bool-context)
endif()
set(CMAKE_CXX_FLAGS "-std=c++11 -pthread -Wl,--no-as-needed -fPIC ${DISABLE_GLOBALLY} -march=${BUILD_ARCH} ${INTRINSICS}")
set(CMAKE_CXX_FLAGS_RELEASE "-Ofast -m64 -funroll-loops -ffinite-math-only -g -rdynamic")
set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -rdynamic")
if(CMAKE_COMPILER_IS_GNUCC)
# these flags are not known to clang
set(CMAKE_GCC_FLAGS "-Wl,--no-as-needed")
set(CMAKE_RDYNAMIC_FLAG "-rdynamic")
endif(CMAKE_COMPILER_IS_GNUCC)
set(CMAKE_CXX_FLAGS "-std=c++11 -pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} -march=${BUILD_ARCH} ${INTRINSICS}")
set(CMAKE_CXX_FLAGS_RELEASE "-Ofast -m64 -funroll-loops -ffinite-math-only -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_CXX_FLAGS_SLIM "-Ofast -m64 -funroll-loops -ffinite-math-only -DNDEBUG")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELEASE}")
set(CMAKE_CXX_FLAGS_PROFILE "${CMAKE_CXX_FLAGS_RELEASE} -pg")
@ -162,9 +175,9 @@ else(MSVC)
set(CMAKE_CXX_FLAGS_PROFUSE "${CMAKE_CXX_FLAGS_RELEASE} -fprofile-use -fprofile-correction")
# these need to be set separately
set(CMAKE_C_FLAGS "-pthread -Wl,--no-as-needed -fPIC ${DISABLE_GLOBALLY} -march=${BUILD_ARCH} ${INTRINSICS}")
set(CMAKE_C_FLAGS_RELEASE "-O3 -m64 -funroll-loops -ffinite-math-only -g -rdynamic")
set(CMAKE_C_FLAGS_DEBUG "-O0 -g -rdynamic")
set(CMAKE_C_FLAGS "-pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} -march=${BUILD_ARCH} ${INTRINSICS}")
set(CMAKE_C_FLAGS_RELEASE "-O3 -m64 -funroll-loops -ffinite-math-only -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_C_FLAGS_DEBUG "-O0 -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_C_FLAGS_SLIM "-O3 -m64 -funroll-loops -ffinite-math-only -DNDEBUG")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELEASE}")
set(CMAKE_C_FLAGS_PROFILE "${CMAKE_C_FLAGS_RELEASE} -pg")
@ -204,7 +217,7 @@ if(CUDA_FOUND)
if((CUDA_VERSION VERSION_EQUAL "10.0" OR CUDA_VERSION VERSION_GREATER "10.0") AND (CMAKE_VERSION VERSION_LESS "3.12.2"))
message(WARNING "On some Unix systems CUDA 10.0+ requires CMake 3.12.2+; you use CMake ${CMAKE_VERSION}")
endif()
if(COMPILE_CUDA_SM35)
LIST(APPEND COMPUTE -arch=sm_35; -gencode=arch=compute_35,code=sm_35;) # Tesla K40 and above
endif(COMPILE_CUDA_SM35)
@ -323,13 +336,15 @@ if(USE_MPI)
endif(USE_MPI)
if(COMPILE_CPU)
find_package(MKL)
if(USE_MKL)
find_package(MKL)
endif(USE_MKL)
if(MKL_FOUND)
include_directories(${MKL_INCLUDE_DIR})
set(EXT_LIBS ${EXT_LIBS} ${MKL_LIBRARIES})
add_definitions(-DBLAS_FOUND=1 -DMKL_FOUND=1)
else(MKL_FOUND)
set(BLA_VENDOR "OpenBLAS")
set(BLAS_VENDOR "OpenBLAS")
find_package(BLAS)
if(BLAS_FOUND)
include(FindCBLAS)

View File

@ -1 +1 @@
v1.8.43
v1.8.51

View File

@ -54,7 +54,7 @@ MACRO(CHECK_ALL_LIBRARIES LIBRARIES INCLUDE _prefix _name _flags _list _include
IF(APPLE)
FIND_LIBRARY(${_prefix}_${_library}_LIBRARY
NAMES ${_library}
PATHS /usr/local/lib /usr/lib /usr/local/lib64 /usr/lib64 ENV
PATHS /usr/local/lib /usr/lib /usr/local/lib64 /usr/lib64 /usr/local/opt/openblas/lib ENV
DYLD_LIBRARY_PATH
)
ELSE(APPLE)

View File

@ -9,18 +9,22 @@ import numpy as np
def main():
desc = """Export word embedding from model"""
desc = """Export word embeddings from model"""
parser = argparse.ArgumentParser(
formatter_class=argparse.RawDescriptionHelpFormatter, description=desc)
parser.add_argument("-m", "--model", help="Model file", required=True)
parser.add_argument(
"-o", "--output-prefix", help="Output files prefix", required=True)
parser.add_argument("-m", "--model", help="path to model.npz file", required=True)
parser.add_argument("-o", "--output-prefix", help="prefix for output files", required=True)
args = parser.parse_args()
print("Loading model")
model = np.load(args.model)
special = yaml.load(model["special:model.yml"][:-1].tobytes())
if special["tied-embeddings-all"] or special["tied-embeddings-src"]:
all_emb = model["Wemb"]
export_emb(args.output_prefix + ".all", all_emb)
exit()
if special["type"] == "amun":
enc_emb = model["Wemb"]
dec_emb = model["Wemb_dec"]
@ -28,16 +32,15 @@ def main():
enc_emb = model["encoder_Wemb"]
dec_emb = model["decoder_Wemb"]
with open(args.output_prefix + ".src", "w") as out:
out.write("{0} {1}\n".format(*enc_emb.shape))
for i in range(enc_emb.shape[0]):
vec = " ".join("{0:.8f}".format(v) for v in enc_emb[i])
out.write("{0} {1}\n".format(i, vec))
export_emb(args.output_prefix + ".src", enc_emb)
export_emb(args.output_prefix + ".trg", dec_emb)
with open(args.output_prefix + ".trg", "w") as out:
out.write("{0} {1}\n".format(*dec_emb.shape))
for i in range(dec_emb.shape[0]):
vec = " ".join("{0:.8f}".format(v) for v in dec_emb[i])
def export_emb(filename, emb):
with open(filename, "w") as out:
out.write("{0} {1}\n".format(*emb.shape))
for i in range(emb.shape[0]):
vec = " ".join("{0:.8f}".format(v) for v in emb[i])
out.write("{0} {1}\n".format(i, vec))

View File

@ -15,12 +15,22 @@ if(USE_FBGEMM)
if(NOT MSVC)
# only locally disabled for the 3rd_party folder
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-value -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function")
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-value -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused")
endif()
set(FBGEMM_BUILD_TESTS OFF CACHE BOOL "Disable fbgemm tests")
set(FBGEMM_BUILD_BENCHMARKS OFF CACHE BOOL "Disable fbgemm benchmark")
add_subdirectory(./fbgemm)
# asmjit (3rd-party submodule of fbgemm) sets -Wall -Wextra near the end of
# the compile options, invalidating any -Wno-... flags that we may have set
# earlier. Let's remove them.
get_property(ASMJIT_COMPILE_OPTIONS TARGET asmjit PROPERTY COMPILE_OPTIONS)
list(REMOVE_ITEM ASMJIT_COMPILE_OPTIONS -Wall -Wextra)
set_property(TARGET asmjit PROPERTY COMPILE_OPTIONS ${ASMJIT_COMPILE_OPTIONS})
message(" ASMJIT COMPILE FLAGS: ${ASMJIT_COMPILE_OPTIONS}")
endif(USE_FBGEMM)
if(USE_SENTENCEPIECE)
@ -39,7 +49,7 @@ if(USE_SENTENCEPIECE)
message(WARNING "You are compiling SentencePiece binaries with -DUSE_STATIC_LIBS=on. \
This will cause spm_train to segfault. No need to worry if you do not intend to use that binary. \
Marian support for SentencePiece will work fine.")
set(SPM_ENABLE_SHARED OFF CACHE BOOL "Builds shared libaries in addition to static libraries." FORCE)
set(SPM_TCMALLOC_STATIC ON CACHE BOOL "Link static library of TCMALLOC." FORCE)
else(USE_STATIC_LIBS)
@ -51,8 +61,19 @@ if(USE_SENTENCEPIECE)
include_directories(./sentencepiece)
set_target_properties(spm_encode spm_decode spm_train spm_normalize spm_export_vocab
PROPERTIES
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
foreach(t sentencepiece sentencepiece_train sentencepiece_train-static
spm_decode spm_encode spm_export_vocab spm_normalize spm_train)
set_property(TARGET ${t} APPEND_STRING PROPERTY COMPILE_FLAGS " -Wno-tautological-compare -Wno-unused")
if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0)
set_property(TARGET ${t} APPEND_STRING PROPERTY COMPILE_FLAGS " -Wno-range-loop-construct")
endif()
# get_property(SENTENCEPIECE_COMPILE_FLAGS TARGET ${t} PROPERTY COMPILE_FLAGS)
# message("-- SENTENCPIECE: compile flags for target ${t}: ${SENTENCEPIECE_COMPILE_FLAGS}")
endforeach(t)
endif()
if(USE_STATIC_LIBS)
set(CMAKE_FIND_LIBRARY_SUFFIXES ${_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES})
@ -63,6 +84,22 @@ include_directories(./SQLiteCpp/include)
include_directories(./CLI)
include_directories(./pathie-cpp/include)
if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
#set_target_properties(SQLiteCpp PROPERTIES COMPILE_FLAGS
set_property(TARGET SQLiteCpp APPEND_STRING PROPERTY COMPILE_FLAGS
" -Wno-parentheses-equality -Wno-unused-value")
if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0)
set_property(TARGET SQLiteCpp APPEND_STRING PROPERTY COMPILE_FLAGS
" -Wno-implicit-int-float-conversion")
endif()
set_property(TARGET libyaml-cpp APPEND_STRING PROPERTY COMPILE_FLAGS
" -fPIC -Wno-unused-value")
set_property(TARGET pathie-cpp APPEND_STRING PROPERTY COMPILE_FLAGS
" -fPIC -Wno-unused-value")
endif()
include_directories(./zlib)
include(ExternalProject)

View File

@ -186,7 +186,7 @@ inline HalfFloat& HalfFloat::operator= (float other)
inline bool HalfFloat::operator== (HalfFloat other) const
{
// +0 and -0 are considered to be equal
if (!(bits << 1u) && !(other.bits << 1u))return true;
if ((bits << 1u) == 0 && (other.bits << 1u) == 0) return true;
return bits == other.bits && !this->IsNaN();
}
@ -194,7 +194,7 @@ inline bool HalfFloat::operator== (HalfFloat other) const
inline bool HalfFloat::operator!= (HalfFloat other) const
{
// +0 and -0 are considered to be equal
if (!(bits << 1u) && !(other.bits << 1u))return false;
if ((bits << 1u) == 0 && (other.bits << 1u) == 0) return false;
return bits != other.bits || this->IsNaN();
}

View File

@ -31,7 +31,7 @@
#include "../include/path.hpp"
#include "../include/errors.hpp"
#if defined(__unix__)
#if defined(__unix__) || defined(__APPLE__)
#include <sys/types.h>
#include <dirent.h>
#include <errno.h>

View File

@ -902,7 +902,7 @@ Path Path::pwd()
*/
Path Path::exe()
{
#if defined(__linux__)
#if defined(__linux__) || defined(__APPLE__)
char buf[PATH_MAX];
ssize_t size = ::readlink("/proc/self/exe", buf, PATH_MAX);

View File

@ -143,7 +143,7 @@ std::string Pathie::convert_encodings(const char* from_encoding, const char* to_
errno = 0;
errsav = 0;
#ifdef BSD
#if defined(BSD) && ! defined(__APPLE__) //Since MacOS evolved from BSD, it is captured here but the iconv on macos behaves differently
// What the heck. FreeBSD violates POSIX.1-2008: it declares iconv()
// differently than mandated by POSIX: http://pubs.opengroup.org/onlinepubs/9699919799/functions/iconv.html
// (it declares a `const' where it must not be).
@ -181,11 +181,10 @@ std::string Pathie::convert_encodings(const char* from_encoding, const char* to_
std::string Pathie::utf8_to_filename(const std::string& utf8)
{
bool fs_encoding_is_utf8 = false;
char* fsencoding = NULL;
#if defined(__APPLE__) || defined(PATHIE_ASSUME_UTF8_ON_UNIX)
fs_encoding_is_utf8 = true;
#else
char* fsencoding = NULL;
fsencoding = nl_langinfo(CODESET);
fs_encoding_is_utf8 = (strcmp(fsencoding, "UTF-8") == 0);
#endif
@ -206,11 +205,10 @@ std::string Pathie::utf8_to_filename(const std::string& utf8)
std::string Pathie::filename_to_utf8(const std::string& native_filename)
{
bool fs_encoding_is_utf8 = false;
char* fsencoding = NULL;
#if defined(__APPLE__) || defined(PATHIE_ASSUME_UTF8_ON_UNIX)
fs_encoding_is_utf8 = true;
#else
char* fsencoding = NULL;
fsencoding = nl_langinfo(CODESET);
fs_encoding_is_utf8 = (strcmp(fsencoding, "UTF-8") == 0);
#endif

View File

@ -27,7 +27,7 @@ static std::string strerror()
{
buff = "Unknown error";
}
#elif (_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600) && ! _GNU_SOURCE
#elif (_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || __APPLE__) && ! _GNU_SOURCE
// XSI-compliant strerror_r()
if (strerror_r(errno, &buff[0], buff.size()) != 0)
{

View File

@ -215,6 +215,10 @@ if(COMPILE_SERVER)
set(EXECUTABLES ${EXECUTABLES} marian_server)
endif(COMPILE_SERVER)
if(APPLE) # This is a dependency of pathie but I can't seem to link it into that CMakeLists because we're not compiling it as a library.
set(EXT_LIBS ${EXT_LIBS} iconv)
endif()
foreach(exec ${EXECUTABLES})
target_link_libraries(${exec} marian ${EXT_LIBS} ${EXT_LIBS} ${CMAKE_THREAD_LIBS_INIT})
if(CUDA_FOUND)

View File

@ -44,7 +44,7 @@ int main(int argc, char **argv) {
// Error Codes for error code meanings
// http://www.boost.org/doc/libs/1_55_0/doc/html/boost_asio/reference.html
translate.on_error = [](Ptr<WSServer::Connection> connection,
translate.on_error = [](Ptr<WSServer::Connection> /*connection*/,
const SimpleWeb::error_code &ec) {
LOG(error, "Connection error: ({}) {}", ec.value(), ec.message());
};

View File

@ -15,7 +15,6 @@ static inline std::string InterpolateEnvVars(std::string str) {
// presently has the form /hdfs/VC instead of /{gfs,hdfs}/CLUSTER/VC
// Catch stdin/stdout and do not process
std::cerr << str << std::endl;
if(str == "stdin" || str == "stdout") {
return str;
}

View File

@ -525,6 +525,8 @@ void ConfigParser::addOptionsValidation(cli::CLIWrapper& cli) {
"Metric to use during validation: cross-entropy, ce-mean-words, perplexity, valid-script, "
"translation, bleu, bleu-detok. Multiple metrics can be specified",
{"cross-entropy"});
cli.add<bool>("--valid-reset-stalled",
"Reset all stalled validation metrics when the training is restarted");
cli.add<size_t>("--early-stopping",
"Stop if the first validation metric does not improve for arg consecutive validation steps",
10);
@ -553,7 +555,8 @@ void ConfigParser::addOptionsValidation(cli::CLIWrapper& cli) {
"Size of mini-batch used during validation",
32);
cli.add<size_t>("--valid-max-length",
"Maximum length of a sentence in a validating sentence pair",
"Maximum length of a sentence in a validating sentence pair. "
"Sentences longer than valid-max-length are cropped to valid-max-length",
1000);
// options for validation script

View File

@ -10,6 +10,21 @@
#include <string>
#include <vector>
// The macro MAYBE_UNUSED is used to selectively disable
// unused-variable warnings. C++17 defines the attribute
// [[maybe_unused]], but I don't think we're at C++17 yet. We can add it when we reach C++17.
// The compilers gcc and clang (and maybe others) define
// __has_attribute and support __attribute__((unused)) in C++11.
#if defined __has_attribute
# if __has_attribute(unused)
# define MAYBE_UNUSED __attribute__((unused))
# else
# define MAYBE_UNUSED
# endif
#else
# define MAYBE_UNUSED
#endif
#define THREAD_GUARD(body) [&]() { body; }() // test if THREAD_GUARD is neccessary, remove if no problems occur.
#define NodeOp(op) [=]() { op; }

View File

@ -84,10 +84,16 @@ std::vector<T> As<std::vector<T>>::apply(const FastOpt& node) {
// specializations for simple vector types
template struct As<std::vector<bool>>;
template struct As<std::vector<int>>;
// Windows and Unix based OS have different type definitions for 'unsigned long'.
// So, we need an explicit definition for uint64_t. Otherwise, there's a linking error on windows.
// Windows, Linux based OS and Mac have different type definitions for 'unsigned long'.
// So, we need explicit definitions for uint64_t that cover the different platforms.
// Otherwise, there's a linking error on Windows, Linux, or Mac.
// https://software.intel.com/en-us/articles/size-of-long-integer-type-on-different-architecture-and-os/
template struct As<std::vector<uint64_t>>;
// https://stackoverflow.com/questions/32021860/c-should-you-size-t-with-a-regular-array
// MacOS: size_t = unsigned long (8 bytes), uint64_t = unsigned long long (8 bytes)
// Linux: size_t = unsigned long (8 bytes), uint64_t = unsigned long (8 bytes)
// Windows: size_t = unsigned long long (8 bytes), uint64_t = unsigned long long (8 bytes)
template struct As<std::vector<unsigned long long>>;
template struct As<std::vector<unsigned long>>;
template struct As<std::vector<float>>;
template struct As<std::vector<double>>;
template struct As<std::vector<std::string>>;
@ -103,4 +109,4 @@ std::pair<T1, T2> As<std::pair<T1, T2>>::apply(const FastOpt& node) {
template struct As<std::pair<int, int>>;
}
}
}

View File

@ -367,7 +367,8 @@ public:
}
const FastOpt& operator[](const char* const key) const {
return operator[](crc::crc(key));
// MacOS requires explicit cast to size_t before we can use it.
return operator[]((size_t)crc::crc(key));
}
const FastOpt& operator[](const std::string& key) const {
@ -375,4 +376,4 @@ public:
}
};
}
}

View File

@ -7,10 +7,21 @@
#include "common/filesystem.h"
#include "common/logging.h"
// Even when compiling with clang, __GNUC__ may be defined, so
// we need to add some extra checks to avoid compile errors with
// respect to -Wsuggest-override.
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wsuggest-override"
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wunused-value"
# if defined(__has_warning)
# if __has_warning("-Wsuggest-override")
# pragma GCC diagnostic ignored "-Wsuggest-override"
# endif
# else
# pragma GCC diagnostic ignored "-Wsuggest-override"
# endif
#endif
#ifdef _MSC_VER
#pragma warning(push) // 4101: 'identifier' : unreferenced local variable. One parameter variable in zstr.hpp is not used.
#pragma warning(disable : 4101)
@ -82,7 +93,7 @@ protected:
void NormalizeTempPrefix(std::string& base) const;
void MakeTemp(const std::string& base);
};
} // namespace io

View File

@ -7,9 +7,19 @@
// @TODO: go back to canonical names for functions and objects
// as specified in C++17 so it becomes easy to move in the future
// Even when compiling with clang, __GNUC__ may be defined, so
// we need to add some extra checks to avoid compile errors with
// respect to -Wsuggest-override.
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wsuggest-override"
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wunused-value"
# if defined(__has_warning)
# if __has_warning("-Wsuggest-override")
# pragma GCC diagnostic ignored "-Wsuggest-override"
# endif
# else
# pragma GCC diagnostic ignored "-Wsuggest-override"
# endif
#endif
#include "3rd_party/pathie-cpp/include/path.hpp" // @TODO: update to latest Pathie

View File

@ -147,16 +147,6 @@ inline bool operator!=(const IntrusivePtr<T>& a, const IntrusivePtr<U>& b) {
return a.get() != b.get();
}
template<class T>
inline bool operator==(const IntrusivePtr<T>& a, T* b) {
return a.get() == b;
}
template<class T>
inline bool operator!=(const IntrusivePtr<T>& a, T* b) {
return a.get() != b;
}
template<class T>
inline bool operator==(const IntrusivePtr<T>& a, std::nullptr_t) {
return a.get() == 0;
@ -167,14 +157,24 @@ inline bool operator!=(const IntrusivePtr<T>& a, std::nullptr_t) {
return a.get() != 0;
}
template<class T>
inline bool operator==(const IntrusivePtr<T>& a, T* b) {
return a.get() == b;
}
template<class T>
inline bool operator!=(const IntrusivePtr<T>& a, T* b) {
return a.get() != b;
}
template<class T>
inline bool operator==(T* a, const IntrusivePtr<T>& b) {
return b.get();
return a == b.get();
}
template<class T>
inline bool operator!=(T* a, const IntrusivePtr<T>& b) {
return b.get();
return a != b.get();
}
template<class T, class U>
@ -223,5 +223,3 @@ namespace std {
}
};
}

View File

@ -124,7 +124,7 @@ static void setErrorHandlers() {
std::set_terminate(unhandledException);
#ifdef __unix__
// catch segfaults
struct sigaction sa = { 0 };
struct sigaction sa = { {0} };
sigemptyset(&sa.sa_mask);
sa.sa_flags = SA_SIGINFO;
sa.sa_sigaction = [](int /*signal*/, siginfo_t*, void*) { ABORT("Segmentation fault"); };

View File

@ -254,7 +254,7 @@ enum class Type : size_t {
packed16 = TypeClass::packed_type + 2u, // special type for FBGEMM, not meant to be used anywhere else, not meant to be accessed invidually. Internal actual type (uint16) is meaningless.
packed8avx2 = TypeClass::packed_type + 1u + TypeClass::avx2_type, // special type for FBGEMM with AVX2, not meant to be used anywhere else, not meant to be accessed invidually. Internal actual type (uint8) is meaningless.
packed8avx512 = TypeClass::packed_type + 1u + TypeClass::avx512_type, // special type for FBGEMM with AVX512, not meant to be used anywhere else, not meant to be accessed invidually. Internal actual type (uint8) is meaningless.
};
static inline size_t operator&(TypeClass typeClass, Type type) {
@ -394,7 +394,7 @@ static Type inline typeFromString(const std::string& str) {
return Type::float32;
if(str == "float64")
return Type::float64;
if(str == "packed16")
return Type::packed16;
if(str == "packed8avx2")
@ -437,19 +437,35 @@ void matchOrAbort(Type type) {
namespace typeFitting { // own namespace instead of in class, otherwise we get error "explicit specialization in non-namespace scope"
// compares max for different types as constexpr, so can be used at compile-time to determine if RequestType type max fits into ReturnType max, see std::conditional below.
template <typename RequestType, typename ReturnType>
constexpr bool fitsIntoMax() { return std::numeric_limits<RequestType>::max() <= std::numeric_limits<ReturnType>::max(); } // for built-in types everything is constexpr
// Helper function for fitsIntoMax() below
// Returns the 'capacity' of a type: number of digits for integers,
// max_exponent for floats. We ignore the mantissa for floats.
template<typename X> constexpr int capacity() {
static_assert(std::is_arithmetic<X>::value || std::is_same<X,HalfFloat>::value,
"Wrong type for this template");
return (std::is_integral<X>::value
? std::numeric_limits<X>::digits
: std::numeric_limits<X>::max_exponent);
}
// Compare max for different types as constexpr, so can be used at compile-time to determine if RequestType type max fits into ReturnType max, see std::conditional below.
template <typename RequestType, typename ReturnType>
constexpr bool fitsIntoMax() {
// We can't just compare std::numeric_limits<>::max(), because Clang-10
// complains about rounding errors when implicitly converting int to float
return ((!std::is_integral<RequestType>::value // RequestType is a float
&& std::is_integral<ReturnType>::value) // ReturnType an integer
? capacity<RequestType>() < capacity<ReturnType>() // special case
: capacity<RequestType>() <= capacity<ReturnType>()); // normal case
} // for built-in types everything is constexpr
// add specializations here when needed
template <> constexpr bool fitsIntoMax<float16, float>() { return true; }; // for float16 conversion to float is not constexpr, hence specializations
template <> constexpr bool fitsIntoMax<float, float16>() { return false; }; // for float16 conversion to float is not constexpr, hence specializations
}
template <typename ReturnType>
class NumericLimits {
private:
template <typename MaxType> void setLimitsMax() {
max = (ReturnType)std::numeric_limits<MaxType>::max();
lowest = (ReturnType)std::numeric_limits<MaxType>::lowest();
@ -459,10 +475,14 @@ private:
void setLimits() {
// check if the maximum of type RequestType fits into ReturnType
constexpr bool fits = typeFitting::fitsIntoMax<RequestType, ReturnType>();
// sanity check:
static_assert(fits || typeFitting::fitsIntoMax<ReturnType, RequestType>(),
"RequestType doesn't fit into ReturnType, and ReturnType doesn't "
"fit into RequestType. fitsIntoMax is broken!");
// and then use the smaller of each types to determine max, min, lowest.
using MaxType = typename std::conditional<fits, RequestType, ReturnType>::type;
setLimitsMax<MaxType>();
// @TODO: should we rather abort if the RequestType does not fit into ReturnType instead of clipping to smaller type?
// @TODO: should we rather abort if the RequestType does not fit into ReturnType instead of clipping to smaller type?
// ABORT_IF(!fits, "Type {} is too small to contain max of type {}", typeId<ReturnType>(), typeId<RequestType>());
}

View File

@ -8,12 +8,22 @@
#include <sstream>
#include <string>
#include <set>
#ifdef __unix__
#if defined(__unix__) || defined(__APPLE__)
#include <unistd.h>
#endif
#include <codecvt>
#include <cwctype>
// macOS lacks HOST_NAME_MAX
#ifndef HOST_NAME_MAX
# if defined(_POSIX_HOST_NAME_MAX)
# define HOST_NAME_MAX _POSIX_HOST_NAME_MAX
# elif defined(MAXHOSTNAMELEN)
# define HOST_NAME_MAX MAXHOSTNAMELEN
# endif
#endif
namespace marian {
namespace utils {

View File

@ -26,7 +26,7 @@ public:
virtual void setGuidedAlignment(std::vector<float>&&) = 0;
virtual void setDataWeights(const std::vector<float>&) = 0;
virtual ~Batch() {};
protected:
std::vector<size_t> sentenceIds_;
};

View File

@ -66,10 +66,10 @@ protected:
Ptr<Options> options_;
bool restored_{false};
// replacing old shuffle_ with two variants that determine more fine-grained shuffling behavior.
// Both set to false is equivalent to old shuffle_ == false.
// replacing old shuffle_ with two variants that determine more fine-grained shuffling behavior.
// Both set to false is equivalent to old shuffle_ == false.
// Now we can not shuffle the data, but shuffle batches. Useful for linear reading of very large data sets with pre-reading.
// Parameters like maxi-batch determine how much data is pre-read and sorted by length or other criteria.
// Parameters like maxi-batch determine how much data is pre-read and sorted by length or other criteria.
bool shuffleData_{false}; // determine if full data should be shuffled before reading and batching.
bool shuffleBatches_{false}; // determine if batches should be shuffled after batching.
@ -103,7 +103,7 @@ private:
};
auto cmpNone = [](const Sample& a, const Sample& b) { return a.getId() < b.getId(); }; // sort in order of original ids = original data order unless shuffling
typedef std::function<bool(const Sample&, const Sample&)> cmp_type;
typedef std::priority_queue<Sample, Samples, cmp_type> sample_queue;
@ -229,7 +229,7 @@ private:
// this starts fillBatches() as a background operation
void fetchBatchesAsync() {
ABORT_IF(futureBufferedBatches_.valid(), "attempted to restart futureBufferedBatches_ while still running");
ABORT_IF(futureBufferedBatches_.valid(), "Attempted to restart futureBufferedBatches_ while still running");
futureBufferedBatches_ = threadPool_.enqueue([this]() {
return fetchBatches();
});
@ -239,7 +239,9 @@ private:
if(bufferedBatches_.empty()) {
// out of data: need to get next batch from background thread
// We only get here if the future has been scheduled to run; it must be valid.
ABORT_IF(!futureBufferedBatches_.valid(), "attempted to wait for futureBufferedBatches_ when none pending");
ABORT_IF(!futureBufferedBatches_.valid(), "Attempted to wait for futureBufferedBatches_ when none pending.\n"
"This error often occurs when Marian tries to restore the training data iterator, but the corpus has been changed or replaced.\n"
"If you have changed the training corpus, add --no-restore-corpus to the training command and run it again.");
bufferedBatches_ = std::move(futureBufferedBatches_.get());
// if bg thread returns an empty swath, we hit the end of the epoch
if (bufferedBatches_.empty()) {

View File

@ -525,6 +525,7 @@ public:
const std::vector<Ptr<Vocab>>& vocabs,
Ptr<Options> options);
virtual ~CorpusBase() {}
virtual std::vector<Ptr<Vocab>>& getVocabs() = 0;
protected:

View File

@ -45,6 +45,7 @@ protected:
public:
// @TODO: choose between 'virtual' and 'final'. Can we derive from this class?
virtual ~DefaultVocab() {};
virtual const std::string& canonicalExtension() const override { return suffixes_[0]; }
virtual const std::vector<std::string>& suffixes() const override { return suffixes_; }
@ -295,7 +296,7 @@ private:
class ClassVocab : public DefaultVocab {
private:
// Do nothing.
virtual void addRequiredVocabulary(const std::string& vocabPath, bool isJson) override { vocabPath; isJson; }
virtual void addRequiredVocabulary(const std::string& /*vocabPath*/, bool /*isJson*/) override {}
// Not adding special class labels, only seen classes.
virtual void create(const std::string& vocabPath,

View File

@ -36,6 +36,8 @@ public:
class ShortlistGenerator {
public:
virtual ~ShortlistGenerator() {}
virtual Ptr<Shortlist> generate(Ptr<data::CorpusBatch> batch) const = 0;
// Writes text version of (possibly) pruned short list to file
@ -129,7 +131,6 @@ private:
Ptr<const Vocab> trgVocab_;
size_t srcIdx_;
size_t trgIdx_;
bool shared_{false};
size_t firstNum_{100};
@ -183,13 +184,12 @@ public:
Ptr<const Vocab> srcVocab,
Ptr<const Vocab> trgVocab,
size_t srcIdx = 0,
size_t trgIdx = 1,
size_t /*trgIdx*/ = 1,
bool shared = false)
: options_(options),
srcVocab_(srcVocab),
trgVocab_(trgVocab),
srcIdx_(srcIdx),
trgIdx_(trgIdx),
shared_(shared) {
std::vector<std::string> vals = options_->get<std::vector<std::string>>("shortlist");
@ -235,7 +235,6 @@ public:
virtual Ptr<Shortlist> generate(Ptr<data::CorpusBatch> batch) const override {
auto srcBatch = (*batch)[srcIdx_];
// auto trgBatch = (*batch)[trgIdx_];
// add firstNum most frequent words
std::unordered_set<WordIndex> indexSet;

View File

@ -37,6 +37,7 @@ public:
typedef SentenceTuple Sample;
TextInput(std::vector<std::string> inputs, std::vector<Ptr<Vocab>> vocabs, Ptr<Options> options);
virtual ~TextInput() {}
Sample next() override;

View File

@ -57,6 +57,7 @@ public:
virtual Word randWord() const {
return Word::fromWordIndex(rand() % size());
}
virtual ~IVocab() {};
};
class Options;

View File

@ -62,6 +62,7 @@ private:
std::vector<Input> inputs_;
public:
std::vector<Input>& inputs() { return inputs_; }
const std::vector<Input>& inputs() const { return inputs_; }
@ -144,6 +145,8 @@ public:
loadData();
}
virtual ~MNISTData(){}
void loadData() override {
ABORT_IF(paths_.size() != 2, "Paths to MNIST data files are not specified");

View File

@ -47,6 +47,8 @@ class MNISTLogsoftmax : public ILogProb {
public:
MNISTLogsoftmax() {}
virtual ~MNISTLogsoftmax(){}
Logits apply(Ptr<IModel> model,
Ptr<ExpressionGraph> graph,
Ptr<data::Batch> batch,
@ -61,13 +63,15 @@ public:
typedef data::MNISTData dataset_type;
template <class... Args>
MnistFeedForwardNet(Ptr<Options> options, Args... args)
MnistFeedForwardNet(Ptr<Options> options, Args... /*args*/)
: options_(options), inference_(options->get<bool>("inference", false)) {}
virtual ~MnistFeedForwardNet(){}
virtual Logits build(Ptr<ExpressionGraph> graph,
Ptr<data::Batch> batch,
bool /*clean*/ = false) override {
return Logits(apply(graph, batch, inference_));
}

View File

@ -19,7 +19,9 @@ public:
builder_ = models::createModelFromOptions(options, models::usage::translation);
}
virtual void keepBest(const std::vector<Ptr<ExpressionGraph>>& graphs) override {
virtual ~MNISTAccuracyValidator(){}
virtual void keepBest(const std::vector<Ptr<ExpressionGraph>>& /*graphs*/) override {
LOG(warn, "Keeping best model for MNIST examples is not supported");
}

View File

@ -7,55 +7,58 @@ namespace marian {
namespace functional {
// General template, will be used for any type without specializations
// and will fail with an abort message.
// and will fail at runtime with an abort message. Note that the
// general template functions don't have named parameters on purpose,
// because clang will warn about unused parameters during compilation.
template <typename T>
struct Ops {
static HOST_DEVICE_INLINE T tanh(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T sin(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T cos(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T tan(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T log(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T exp(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T abs(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T sqrt(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T neg(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T sgn(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T tanh(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T sin(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T cos(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T tan(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T log(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T exp(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T abs(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T sqrt(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T neg(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T sgn(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T add(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T sub(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T mul(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T div(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T add(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T sub(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T mul(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T div(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T max(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T min(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T pow(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T max(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T min(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T pow(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T negate(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T eq(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T neq(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T gt(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T lt(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T geq(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T leq(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T _and(const T& x, const T& y) { ABORT("Unknown type"); } // 'and' is used by gcc
static HOST_DEVICE_INLINE T _or(const T& x, const T& y) { ABORT("Unknown type"); } // 'or' is used by gcc
static HOST_DEVICE_INLINE T negate(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T eq(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T neq(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T gt(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T lt(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T geq(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T leq(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T _and(const T&, const T&) { ABORT("Unknown type"); } // 'and' is used by gcc
static HOST_DEVICE_INLINE T _or(const T&, const T&) { ABORT("Unknown type"); } // 'or' is used by gcc
// Neural Networks specific functions
static HOST_DEVICE_INLINE T sigmoid(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T logaddexp(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T clip(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T sigmoid(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T logaddexp(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T clip(const T&, const T&) { ABORT("Unknown type"); }
// derivative of Clip, cut-off function
static HOST_DEVICE_INLINE T bump(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T relu(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T reluBack(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T prelu(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T preluBack(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T bump(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T relu(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T reluBack(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T prelu(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T preluBack(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T if_then_else(const T& x, const T& y, const T& z) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T if_then_else(const T&, const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T sumReduce(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T maxReduce(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T minReduce(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T sumReduce(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T maxReduce(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T minReduce(const T&) { ABORT("Unknown type"); }
};
// Specialization for float
@ -127,14 +130,14 @@ template <>
struct Ops<double> {
typedef double Single;
static HOST_DEVICE_INLINE double tanh(const double& x) { return tanh(x); }
static HOST_DEVICE_INLINE double sin(const double& x) { return sin(x); }
static HOST_DEVICE_INLINE double cos(const double& x) { return cos(x); }
static HOST_DEVICE_INLINE double tan(const double& x) { return tan(x); }
static HOST_DEVICE_INLINE double log(const double& x) { return log(x); }
static HOST_DEVICE_INLINE double exp(const double& x) { return exp(x); }
static HOST_DEVICE_INLINE double abs(const double& x) { return abs(x); }
static HOST_DEVICE_INLINE double sqrt(const double& x) { return sqrt(x); }
static HOST_DEVICE_INLINE double tanh(const double& x) { return std::tanh(x); }
static HOST_DEVICE_INLINE double sin(const double& x) { return std::sin(x); }
static HOST_DEVICE_INLINE double cos(const double& x) { return std::cos(x); }
static HOST_DEVICE_INLINE double tan(const double& x) { return std::tan(x); }
static HOST_DEVICE_INLINE double log(const double& x) { return std::log(x); }
static HOST_DEVICE_INLINE double exp(const double& x) { return std::exp(x); }
static HOST_DEVICE_INLINE double abs(const double& x) { return std::abs(x); }
static HOST_DEVICE_INLINE double sqrt(const double& x) { return std::sqrt(x); }
static HOST_DEVICE_INLINE double neg(const double& x) { return -x; }
static HOST_DEVICE_INLINE double sgn(const double& x) { return (0 < x) - (x < 0); }
@ -145,7 +148,7 @@ struct Ops<double> {
static HOST_DEVICE_INLINE double max(const double& x, const double& y) { return x < y ? y : x; }
static HOST_DEVICE_INLINE double min(const double& x, const double& y) { return x < y ? x : y; }
static HOST_DEVICE_INLINE double pow(const double& x, const double& y) { return pow(x, y); }
static HOST_DEVICE_INLINE double pow(const double& x, const double& y) { return std::pow(x, y); }
static HOST_DEVICE_INLINE double negate(const double& x) { return !(bool)x; }
@ -460,7 +463,7 @@ struct Ops<half> {
static DEVICE_INLINE half exp(const half& x) { return hexp(x); }
static DEVICE_INLINE half sqrt(const half& x) { return hsqrt(x); }
static DEVICE_INLINE half neg(const half& x) { return -x; }
static DEVICE_INLINE half abs(const half& x) { return fabs((float)x); }// @TODO half has this information somewhere in the struct, right?
static DEVICE_INLINE half sgn(const half& x) { half zero = 0.f; return (zero < x) - (x < zero); } // @TODO half has this information somewhere in the struct, right?

View File

@ -130,9 +130,6 @@ class ExpressionGraph : public std::enable_shared_from_this<ExpressionGraph> {
bool inferenceOnly_{false};
// during inference, use optimizations that might lead to precision loss, e.g. 8-bit MatMul.
// At this moment, this is used for int16 qunatized Matmul - 11/1/2019
bool optimized_{false};
bool checkpointing_{false}; // use gradient checkpointing if true
bool reloaded_{false};
@ -178,9 +175,6 @@ public:
void setInference(bool inference) { inferenceOnly_ = inference; }
bool isInference() { return inferenceOnly_; }
void setOptimized(bool optimized) { optimized_ = optimized; }
bool isOptimized() { return (optimized_ && inferenceOnly_); }
void setCheckpointing(bool checkpointing) { checkpointing_ = checkpointing; }
bool isCheckpointing() { return checkpointing_; }

View File

@ -40,7 +40,7 @@ protected:
std::string debugMessage_;
Ptr<std::list<Expr>> subtape_; // a subtape is used to keep track of nodes that need to be freed and recomputed with gradient-checkpointing.
bool isCheckpoint_{false}; // true if this node has been selected to be a checkpoint, currently only done manually.
bool isCheckpoint_{false}; // true if this node has been selected to be a checkpoint, currently only done manually.
Ptr<AutoTunerRecorder> recorder_;
size_t recorderHash_;
@ -138,7 +138,7 @@ public:
virtual std::string graphviz() override {
std::stringstream ss;
ss << "\"" << this << "\" ["
ss << "\"" << this << "\" ["
<< "shape=\"" << form() << "\", "
<< "label=" << label() << ", "
<< "style=\"filled\", "
@ -147,7 +147,7 @@ public:
for(auto&& child : children())
ss << "\"" << child << "\" -> \"" << this << "\";" << std::endl;
if(subtape_) {
for(auto&& dep : *subtape_)
ss << "\"" << dep << "\" -> \"" << this << "\" [style=dotted];" << std::endl;
@ -188,9 +188,9 @@ struct NaryNodeOp : public Node {
// Deduce type automatically, but then all types must be the same
// this is called automatically when no output type is specified.
// If the input types are mixed, the output type needs to be specified
// If the input types are mixed, the output type needs to be specified
// in the constructor.
Type commonType(const std::vector<Expr>& nodes) {
static Type commonType(const std::vector<Expr>& nodes) {
ABORT_IF(nodes.size() == 0, "NaryNodeOp has no children");
Type type = nodes[0]->value_type();
for(int i = 1; i < nodes.size(); ++i)

View File

@ -17,9 +17,9 @@ namespace inits {
/**
* Base class for specialized NodeInitializers.
*
* A NodeInitializer is a functor that is associated with parameters
* and constants, and is invoked on a tensor during node intialization.
* You need to override NodeIntializer::apply(Tensor) with your own
* A NodeInitializer is a functor that is associated with parameters
* and constants, and is invoked on a tensor during node intialization.
* You need to override NodeIntializer::apply(Tensor) with your own
* functionality or use a fromLambda intializer.
*
* See node_initializers.cpp for examples.
@ -31,6 +31,7 @@ protected:
public:
virtual void apply(Tensor t) = 0;
void setAllocator(Ptr<Allocator> allocator) { allocator_ = allocator; }
virtual ~NodeInitializer() {}
};
/**
@ -135,7 +136,7 @@ Ptr<NodeInitializer> dropout(float dropoutProbabilty);
/**
* Intialize with gumbel noise, i.e. -log(-log(u)) where u ~ Uniform(0 + eps, 1 - eps)
*
*
* @return A NodeInitializer
*/
Ptr<NodeInitializer> gumbel(float eps = 1e-5f);
@ -163,7 +164,7 @@ Ptr<NodeInitializer> fromWord2vec(const std::string& file,
/**
* Computes Google's Transformer-style sinusoidal position embeddings
* starting from position 'start' taking into account batch and time
* starting from position 'start' taking into account batch and time
* dimensions of the tensor.
*
* Expected tensor layout {-2: time, -1: model}

View File

@ -480,9 +480,12 @@ class CSRDotNodeOp : public NaryNodeOp {
bool transS_;
bool swapOperands_;
public:
CSRDotNodeOp(const Shape& S_shape, Expr S_values, Expr S_indices, Expr S_offsets, Expr D, bool transS, bool swapOperands)
: NaryNodeOp({ S_values, S_indices, S_offsets, D }, newShape(S_shape, S_values, S_indices, S_offsets, D, transS, swapOperands), commonType({S_values, D})),
transS_(transS), swapOperands_(swapOperands) {
CSRDotNodeOp(const Shape& S_shape, Expr S_values, Expr S_indices,
Expr S_offsets, Expr D, bool transS, bool swapOperands)
: NaryNodeOp({ S_values, S_indices, S_offsets, D },
newShape(S_shape, S_values, S_indices, S_offsets, D, transS, swapOperands),
NaryNodeOp::commonType({S_values, D})),
transS_(transS), swapOperands_(swapOperands) {
matchOrAbort<IndexType>(S_indices->value_type());
matchOrAbort<IndexType>(S_offsets->value_type());
}
@ -513,7 +516,7 @@ public:
NodeOps backwardOps() override {
return { nullptr, // can't backprop into the sparse matrix (the gradient is dense)
nullptr,
nullptr,
nullptr,
NodeOp(CSRProd(child(3)->grad(), // child(3) = D
graph()->allocator(),
@ -527,7 +530,7 @@ public:
virtual size_t hash() override {
size_t seed = NaryNodeOp::hash();
for(auto s : shape())
util::hash_combine(seed, s);
util::hash_combine(seed, s);
util::hash_combine(seed, transS_);
util::hash_combine(seed, swapOperands_);
return seed;
@ -1050,8 +1053,8 @@ struct ConcatenateNodeOp : public NaryNodeOp {
auto checkShape = shape;
for(auto child : nodes) {
checkShape.set(ax_, child->shape()[ax_]); // don't abort on different sizes on axis dim.
ABORT_IF(checkShape != child->shape(),
"Child shapes {} and {} cannot be concatenated along axis {}",
ABORT_IF(checkShape != child->shape(),
"Child shapes {} and {} cannot be concatenated along axis {}",
shape, child->shape(), ax);
sum += child->shape()[ax_];

View File

@ -10,10 +10,10 @@
namespace marian {
// @TODO: Currently an ExpressionGraph only supports one Parameters object and
// @TODO: Currently an ExpressionGraph only supports one Parameters object and
// the type of parameters has to be the inside on Parameters object. This limits
// parameter types to a single chosen type, e.g. only fp32 or only fp16. This should
// be extended to allow multiple sets of parameters.
// be extended to allow multiple sets of parameters.
// The reason here is to be able to efficiently compute updates of whole parameter
// sets of one type.
class Parameters {
@ -40,7 +40,7 @@ public:
LOG(debug, "Created parameter object of type {}", acceptedElementType_);
}
~Parameters() {
virtual ~Parameters() {
LOG(debug, "Destroyed parameter object of type {}", acceptedElementType_);
}
@ -88,7 +88,7 @@ public:
// sort parameters by name before allocation to make sure the memory layout after allocation is always the same
std::sort(params_.begin(), params_.end(), [](Expr n1, Expr n2){ return n1->name() < n2->name(); });
for(auto p : params_) {
if(!p->val()) {
vals_->allocate(p->val(), p->shape(), p->value_type());

View File

@ -39,6 +39,7 @@ public:
// Simplest layer interface: Unary function
struct IUnaryLayer {
virtual ~IUnaryLayer() {}
virtual Expr apply(Expr) = 0;
virtual Expr apply(const std::vector<Expr>& es) {
ABORT_IF(es.size() > 1, "Not implemented"); // simple stub
@ -59,6 +60,7 @@ struct IEmbeddingLayer {
// alternative from indices directly
virtual Expr applyIndices(const std::vector<WordIndex>& embIdx, const Shape& shape) const = 0;
virtual ~IEmbeddingLayer() {}
};
// base class for Encoder and Decoder classes, which have embeddings and a batch index (=stream index)

View File

@ -5,14 +5,14 @@
namespace marian {
static inline RationalLoss guidedAlignmentCost(Ptr<ExpressionGraph> graph,
static inline RationalLoss guidedAlignmentCost(Ptr<ExpressionGraph> /*graph*/,
Ptr<data::CorpusBatch> batch,
Ptr<Options> options,
Expr attention) { // [beam depth=1, max src length, batch size, tgt length]
std::string guidedLossType = options->get<std::string>("guided-alignment-cost"); // @TODO: change "cost" to "loss"
float guidedLossWeight = options->get<float>("guided-alignment-weight");
const auto& shape = attention->shape(); // [beam depth=1, max src length, batch size, tgt length]
float epsilon = 1e-6f;
Expr alignmentLoss; // sum up loss over all attention/alignment positions
@ -55,8 +55,8 @@ static inline RationalLoss guidedAlignmentCost(Ptr<ExpressionGraph> graph,
else
ABORT("Unknown alignment cost type: {}", guidedLossType);
// every position is a label as they should all agree
// @TODO: there should be positional masking here ... on the other hand, positions that are not
// in a sentence should always agree (both being 0). Lack of masking affects label count only which is
// @TODO: there should be positional masking here ... on the other hand, positions that are not
// in a sentence should always agree (both being 0). Lack of masking affects label count only which is
// probably negligible?
numLabels = shape.elements();
}

View File

@ -331,6 +331,7 @@ public:
: LabelwiseLoss(axes), // cross-entropy already reduces over axis -1
labelSmoothing_(labelSmoothing), factorWeight_(factorWeight) {}
virtual ~CrossEntropyLoss() {}
protected:
float labelSmoothing_; // interpolation factor for label smoothing, see below
float factorWeight_; // give extra weight to factors
@ -368,7 +369,7 @@ protected:
if(labelWeights) {
// We currently do not know how to use target factors and word-level label weights together
bool wordlevel = labelWeights->shape()[-3] > 1; // Time-dimension is not trivially 1, hence we have word-level weights.
bool wordlevel = labelWeights->shape()[-3] > 1; // Time-dimension is not trivially 1, hence we have word-level weights.
ABORT_IF(wordlevel && logits.getNumFactorGroups() > 1, "CE loss with word-level label weights is not implemented for factors");
ce = ce * cast(labelWeights, Type::float32);
}
@ -379,15 +380,15 @@ protected:
/**
* @brief Unlikelihood loss across last axis, summed up over batch and time dimensions. This is an
* implementation of sequence-level unlikelihood loss from https://arxiv.org/abs/1908.04319.
* @brief Unlikelihood loss across last axis, summed up over batch and time dimensions. This is an
* implementation of sequence-level unlikelihood loss from https://arxiv.org/abs/1908.04319.
* We rely on word-level label weights where 1 is correct and 0 is marking an error. If there are not
* zeros for a sentence it going to be trained with normal CE loss if there is at least one 0 it is going
* to flip over to use SUL for that sentence to penalize the selected word.
*
*
* SUL is implemented as:
* -log(gather(1 - softmax(logits), -1, indices))
*
*
* Factors are currently not supported.
*/
class SequenceUnlikelihoodLoss : public CrossEntropyLoss {
@ -411,17 +412,17 @@ protected:
ABORT_IF(!mask, "mask is required"); // @TODO: check this, it seems weights for padding are by default 1, which would make this obsolete.
// use label weights, where 1 is GOOD and 0 is BAD. After inversion here, now 1 marks, mask again to eliminate padding (might be obsolete)
auto errorMask = (1.f - cast(labelWeights, Type::float32)) * cast(mask, Type::float32);
auto ceUl = logits.applyLossFunction(labels, [&](Expr logits, Expr indices) {
return cast(unlikelihood(logits, indices), Type::float32);
});
// compute if want to use CE or UL. If there are no errors train with CE, otherwise train _only on_ the errors with UL. This is the "mixed" training
// schedule from https://arxiv.org/abs/1908.04319. Providing labels with or without error scores we can easily switch between CE and UL.
// schedule from https://arxiv.org/abs/1908.04319. Providing labels with or without error scores we can easily switch between CE and UL.
auto onlyCe = eq(sum(errorMask, /*axis=*/-3), 0.f); // [1, 1, dimBatch, 1] - equal 1 if no errors are present
ceUl = errorMask * ceUl; // don't use for correct label or padding
auto cost = onlyCe * ce + (1.f - onlyCe) * ceUl; // ce or unlikelihood part are never simultanously used as cost per batch entry
auto cost = onlyCe * ce + (1.f - onlyCe) * ceUl; // ce or unlikelihood part are never simultanously used as cost per batch entry
return cost;
}

View File

@ -17,6 +17,7 @@ public:
virtual void debugWeighting(std::vector<float> /*weightedMask*/,
std::vector<float> /*freqMask*/,
Ptr<data::CorpusBatch> /*batch*/){};
virtual ~WeightingBase() {}
};
class DataWeighting : public WeightingBase {

View File

@ -41,6 +41,7 @@ class VocabWrapper : public IVocabWrapper {
Ptr<Vocab> pImpl_;
public:
VocabWrapper(Ptr<Vocab> vocab) : pImpl_(vocab) {}
virtual ~VocabWrapper() {}
WordIndex encode(const std::string& word) const override { return (*pImpl_)[word].toWordIndex(); }
std::string decode(WordIndex id) const override { return (*pImpl_)[Word::fromWordIndex(id)]; }
size_t size() const override { return pImpl_->size(); }
@ -243,7 +244,7 @@ DecoderCpuAvxVersion parseCpuAvxVersion(std::string name) {
}
}
// @TODO: clean-up this code and unify with marian-conv. The targetPrec parameter is not clear enought etc.
// @TODO: clean-up this code and unify with marian-conv. The targetPrec parameter is not clear enought etc.
bool convertModel(std::string inputFile, std::string outputFile, int32_t targetPrec) {
std::cout << "Converting from: " << inputFile << ", to: " << outputFile << std::endl;

View File

@ -54,6 +54,8 @@ public:
const std::vector<const void*>& ptrs)
: options_(options), ptrs_(ptrs) {}
virtual ~IBeamSearchDecoder() {}
virtual QSNBestBatch decode(const QSBatch& qsBatch,
size_t maxLength,
const std::unordered_set<WordIndex>& shortlist)

View File

@ -25,6 +25,7 @@ public:
Ptr<ExpressionGraph> graph, // @TODO: why needed? Can it be gotten from model?
Ptr<data::Batch> batch,
bool clearGraph = true) = 0;
virtual ~ICost() {}
};
class EncoderDecoderCECost : public ICost {
@ -51,6 +52,8 @@ public:
weighter_ = WeightingFactory(options_);
}
virtual ~EncoderDecoderCECost() {}
Ptr<MultiRationalLoss> apply(Ptr<IModel> model,
Ptr<ExpressionGraph> graph,
Ptr<data::Batch> batch,
@ -136,6 +139,8 @@ public:
Trainer(Ptr<IModel> model, Ptr<ICost> cost)
: model_(model), cost_(cost) {}
virtual ~Trainer() {}
Ptr<IModel> getModel() { return model_; }
virtual void load(Ptr<ExpressionGraph> graph,
@ -179,6 +184,8 @@ public:
Scorer(Ptr<IModel> model, Ptr<ILogProb> cost)
: model_(model), logProb_(cost) {}
virtual ~Scorer(){}
Ptr<IModel> getModel() { return model_; }
virtual void load(Ptr<ExpressionGraph> graph,
@ -211,6 +218,7 @@ public:
class LogSoftmaxStep : public ILogProbStep {
public:
virtual ~LogSoftmaxStep() {}
virtual Ptr<DecoderState> apply(Ptr<DecoderState> state) override {
// decoder needs normalized probabilities (note: skipped if beam 1 and --skip-cost)
state->setLogProbs(state->getLogProbs().applyUnaryFunction(logsoftmax));
@ -224,6 +232,7 @@ public:
// with --output-sampling during translation with marian-decoder
class GumbelSoftmaxStep : public ILogProbStep {
public:
virtual ~GumbelSoftmaxStep() {}
virtual Ptr<DecoderState> apply(Ptr<DecoderState> state) override {
state->setLogProbs(state->getLogProbs().applyUnaryFunctions(
[](Expr logits){ // lemma gets gumbelled

View File

@ -11,6 +11,7 @@ namespace marian {
class IEncoderDecoder : public models::IModel {
public:
virtual ~IEncoderDecoder() {}
virtual void load(Ptr<ExpressionGraph> graph,
const std::string& name,
bool markedReloaded = true) override

View File

@ -41,6 +41,8 @@ public:
// @TODO: Is there a better name?
class ICriterionFunction {
public:
virtual ~ICriterionFunction() {}
virtual void load(Ptr<ExpressionGraph>,
const std::string&,
bool markReloaded = true)

View File

@ -5,10 +5,12 @@
namespace marian {
struct ModelTask {
virtual ~ModelTask() {}
virtual void run() = 0;
};
struct ModelServiceTask {
virtual ~ModelServiceTask() {}
virtual std::string run(const std::string&) = 0;
};
} // namespace marian

View File

@ -11,6 +11,7 @@ namespace marian {
class EncoderS2S : public EncoderBase {
using EncoderBase::EncoderBase;
public:
virtual ~EncoderS2S() {}
Expr applyEncoderRNN(Ptr<ExpressionGraph> graph,
Expr embeddings,
Expr mask,
@ -254,7 +255,7 @@ public:
auto embeddings = state->getTargetHistoryEmbeddings();
// The batch dimension of the inputs can change due to batch-pruning, in that case
// cached elements need to be rebuilt, in this case the mapped encoder context in the
// cached elements need to be rebuilt, in this case the mapped encoder context in the
// attention mechanism of the decoder RNN.
int currDimBatch = embeddings->shape()[-2];
if(!rnn_ || lastDimBatch_ != currDimBatch) // if currDimBatch is different, rebuild the cached RNN
@ -263,7 +264,7 @@ public:
// Also @TODO: maybe implement a Cached(build, updateIf) that runs a check and rebuild if required
// at dereferecing :
// rnn_ = Cached<decltype(constructDecoderRNN(graph, state))>(
// /*build=*/[]{ return constructDecoderRNN(graph, state); },
// /*build=*/[]{ return constructDecoderRNN(graph, state); },
// /*updateIf=*/[]{ return state->batchDimChanged() });
// rnn_->transduce(...);

View File

@ -17,6 +17,7 @@ public:
: context_(context), mask_(mask), batch_(batch) {}
EncoderState() {}
virtual ~EncoderState() {}
virtual Expr getContext() const { return context_; }
virtual Expr getAttended() const { return context_; }
@ -53,6 +54,7 @@ public:
const std::vector<Ptr<EncoderState>>& encStates,
Ptr<data::CorpusBatch> batch)
: states_(states), logProbs_(logProbs), encStates_(encStates), batch_(batch) {}
virtual ~DecoderState() {}
// @TODO: Do we need all these to be virtual?
virtual const std::vector<Ptr<EncoderState>>& getEncoderStates() const {
@ -68,10 +70,10 @@ public:
int beamSize) const {
std::vector<Ptr<EncoderState>> newEncStates;
for(auto& es : encStates_)
for(auto& es : encStates_)
// If the size of the batch dimension of the encoder state context changed, subselect the correct batch entries
newEncStates.push_back(es->getContext()->shape()[-2] == batchIndices.size() ? es : es->select(batchIndices));
// hypindices matches batchIndices in terms of batch dimension, so we only need hypIndices
auto selectedState = New<DecoderState>(
states_.select(hypIndices, beamSize, /*isBatchMajor=*/false), logProbs_, newEncStates, batch_);
@ -121,6 +123,7 @@ private:
Words targetWords_;
public:
virtual ~ClassifierState() {}
virtual Expr getLogProbs() const { return logProbs_; }
virtual void setLogProbs(Expr logProbs) { logProbs_ = logProbs; }

View File

@ -16,6 +16,7 @@ namespace marian {
class ClipperBase {
public:
virtual void clip(Tensor) = 0;
virtual ~ClipperBase() {}
};
typedef std::shared_ptr<ClipperBase> ClipperPtr;

View File

@ -29,6 +29,8 @@ public:
LOG(info, "[optimizers] Learning rate gets automatically adjusted as if minibatch size was {}", refMBWordsParam_);
}
virtual ~OptimizerBase() {}
static constexpr size_t mbSizeNotProvided = SIZE_MAX;
void update(Ptr<ExpressionGraph> graph, size_t mbSize = mbSizeNotProvided) {
@ -114,7 +116,7 @@ class Sgd : public OptimizerBase {
public:
Sgd(float eta, size_t refMBWordsParam = 0, Ptr<ClipperBase> clipper = nullptr)
: OptimizerBase(eta, refMBWordsParam, clipper) {}
virtual ~Sgd() {}
virtual void setParams(const std::vector<float>& /*params*/) override {}
private:
void updateImpl(Tensor params, Tensor grads, size_t actualMBSize, size_t refMBWords) override;

View File

@ -13,6 +13,7 @@ namespace marian {
class ScoreCollector {
public:
ScoreCollector(const Ptr<Options>& options);
virtual ~ScoreCollector() {}
virtual void Write(long id, const std::string& message);
virtual void Write(long id,

View File

@ -35,7 +35,7 @@ protected:
public:
BaseRNN(Ptr<ExpressionGraph> graph, Ptr<Options> options)
: graph_(graph), options_(options) {}
virtual ~BaseRNN() {}
virtual Expr transduce(Expr, Expr = nullptr) = 0;
virtual Expr transduce(Expr, State, Expr = nullptr) = 0;
virtual Expr transduce(Expr, States, Expr = nullptr) = 0;
@ -113,6 +113,7 @@ private:
public:
friend RNN;
virtual ~SingleLayerRNN() {}
// @TODO: benchmark whether this concatenation is a good idea
virtual Expr transduce(Expr input, Expr mask = nullptr) override {

View File

@ -17,7 +17,7 @@ protected:
public:
Backend(DeviceId deviceId, size_t seed)
: deviceId_(deviceId), seed_(seed), randomGenerator_(createRandomGenerator(seed, deviceId)) {}
virtual ~Backend() {};
virtual DeviceId getDeviceId() { return deviceId_; };
virtual Ptr<RandomGenerator> getRandomGenerator() { return randomGenerator_; }

View File

@ -8,29 +8,40 @@
namespace marian {
namespace cpu {
namespace {
// allocate function for tensor reserve() below.
// Needed for AVX512, while not available on all compilers. It seems clang
// does not have aligned_alloc for all cstlib versions. If AVX512 is not used
// a simple malloc is probably fine.
// Should generate a runtime error otherwise as we have a check in the AVX512
// functions which tests for alignment.
#ifdef _WIN32
#define MALLOC(size) _aligned_malloc(size, alignment_)
#elif __GNUC__
#define MALLOC(size) aligned_alloc(alignment_, size)
#else
#define MALLOC(size) malloc(size)
#endif
// Alignment is needed because we use AVX512 and AVX2 vectors. We should fail if we can't allocate aligned memory.
#ifdef _WIN32
#define FREE(ptr) _aligned_free(ptr)
void *genericMalloc(size_t alignment, size_t size) {
void *ret = _aligned_malloc(size, alignment);
ABORT_IF(!ret, "Failed to allocate memory on CPU");
return ret;
}
void genericFree(void *ptr) {
_aligned_free(ptr);
}
#else
#define FREE(ptr) free(ptr)
// Linux and OS X. There is no fallback to malloc because we need it to be aligned.
void *genericMalloc(size_t alignment, size_t size) {
// On macos, aligned_alloc is available only on c++17
// Furthermore, it requires that the memory requested is an exact multiple of the alignment, otherwise it fails.
// posix_memalign is available both Mac (Since 2016) and Linux and in both gcc and clang
void *result;
// Error could be detected by return value or just remaining nullptr.
ABORT_IF(posix_memalign(&result, alignment, size), "Failed to allocate memory on CPU");
return result;
}
void genericFree(void *ptr) {
free(ptr);
}
#endif
} // namespace
Device::~Device() {
FREE(data_);
genericFree(data_);
}
void Device::reserve(size_t size) {
@ -38,14 +49,12 @@ void Device::reserve(size_t size) {
ABORT_IF(size < size_ || size == 0,
"New size must be larger than old size and larger than 0");
uint8_t *temp = static_cast<uint8_t*>(genericMalloc(alignment_, size));
if(data_) {
uint8_t *temp = static_cast<uint8_t*>(MALLOC(size));
std::copy(data_, data_ + size_, temp);
FREE(data_);
data_ = temp;
} else {
data_ = static_cast<uint8_t*>(MALLOC(size));
genericFree(data_);
}
data_ = temp;
size_ = size;
}
} // namespace cpu

View File

@ -17,6 +17,7 @@
#endif
using namespace fbgemm;
// @TODO: don't use using namespace ...; in header files. Just don't. [UG]
#endif // USE_FBGEMM
namespace marian {
@ -96,7 +97,7 @@ struct FbgemmPacked16PackNodeOp : public UnaryNodeOp {
const std::string type() override { return "packMatFp16"; }
Shape newShape(Expr a, bool transpose) {
Shape newShape(Expr MAYBE_UNUSED a, bool MAYBE_UNUSED transpose) {
#if USE_FBGEMM
auto shapeMat = a->shape();
// Should be 2D - weight matrix
@ -115,9 +116,8 @@ struct FbgemmPacked16PackNodeOp : public UnaryNodeOp {
packsize_);
Shape outShape({(int)packsize_});
return outShape;
#else // USE_FBGEMM
#else
ABORT("Packed GEMM requires a build with USE_FBGEMM enabled");
return Shape();
#endif // USE_FBGEMM
@ -180,19 +180,21 @@ struct FbgemmPacked8PackNodeOp : public UnaryNodeOp {
const std::string type() override { return "packMatInt8"; }
Shape newShape(Expr a, bool transpose) {
#if USE_FBGEMM
Shape newShape(Expr a, bool transpose) {
fbgemmPacked8PackInfo(a->shape(), packType_, transpose, nrow_, ncol_, packsize_);
Shape outShape({(int)packsize_});
return outShape;
#else // USE_FBGEMM
}
#else
Shape newShape(Expr /*a*/, bool /*transpose*/) {
ABORT("Packed GEMM requires a build with USE_FBGEMM enabled");
return Shape();
#endif // USE_FBGEMM
}
#endif // USE_FBGEMM
};
// Affine transform (matrix multiplication) using packed B matrix
// float scalar_: scalar multiplier
// size_t m_: the number of rows in A and C
@ -202,7 +204,6 @@ struct FbgemmPacked8PackNodeOp : public UnaryNodeOp {
// bool transB_: transpose B
class FbgemmPacked16AffineNodeOp : public NaryNodeOp {
private:
float scalar_;
size_t m_;
size_t n_;
size_t k_;
@ -210,9 +211,8 @@ private:
bool transB_;
public:
FbgemmPacked16AffineNodeOp(const std::vector<Expr>& nodes, Shape bShape, bool transA, bool transB, float scalar)
: NaryNodeOp(nodes, newShape(nodes[0], bShape, transA, transB), Type::float32),
scalar_(scalar) {
FbgemmPacked16AffineNodeOp(const std::vector<Expr>& nodes, Shape bShape, bool transA, bool transB, float /*scalar*/)
: NaryNodeOp(nodes, newShape(nodes[0], bShape, transA, transB), Type::float32)/*, scalar_(scalar)*/ {
transA_ = transA;
transB_ = transB;
m_ = nodes[0]->shape().elements() / nodes[0]->shape()[-1];
@ -281,7 +281,6 @@ public:
// bool transB_: transpose B
class FbgemmPacked8AffineNodeOp : public NaryNodeOp {
private:
float scalar_;
size_t m_;
size_t n_;
size_t k_;
@ -289,9 +288,8 @@ private:
bool transB_;
public:
FbgemmPacked8AffineNodeOp(const std::vector<Expr>& nodes, Shape bShape, bool transA, bool transB, float scalar)
: NaryNodeOp(nodes, newShape(nodes[0], bShape, transA, transB), Type::float32),
scalar_(scalar) {
FbgemmPacked8AffineNodeOp(const std::vector<Expr>& nodes, Shape bShape, bool transA, bool transB, float /*scalar*/)
: NaryNodeOp(nodes, newShape(nodes[0], bShape, transA, transB), Type::float32)/*, scalar_(scalar) */ {
transA_ = transA;
transB_ = transB;
m_ = nodes[0]->shape().elements() / nodes[0]->shape()[-1];
@ -302,7 +300,7 @@ public:
size_t l = bShape.elements() / bShape[-1];
n_ = bShape[-1];
if(transB)
std::swap(l, n_);
std::swap(l, n_);
}
Shape newShape(Expr a, Shape bShape, bool transA, bool transB) {
@ -369,9 +367,9 @@ static inline Expr affine(Expr a, Expr b, Shape bShape, Expr c, bool transA, boo
Type elementType = b->value_type();
if (elementType == Type::packed16)
return Expression<cpu::variant::FbgemmPacked16AffineNodeOp>(nodes, bShape, transA, transB, scalar);
return Expression<FbgemmPacked16AffineNodeOp>(nodes, bShape, transA, transB, scalar);
else if (isPacked(elementType) && sizeOf(elementType) == 1)
return Expression<cpu::variant::FbgemmPacked8AffineNodeOp>(nodes, bShape, transA, transB, scalar);
return Expression<FbgemmPacked8AffineNodeOp>(nodes, bShape, transA, transB, scalar);
else {
ABORT("Only int8 and fp16 are available. {}", elementType);
return nullptr;
@ -380,9 +378,9 @@ static inline Expr affine(Expr a, Expr b, Shape bShape, Expr c, bool transA, boo
static inline Expr pack(Type elementType, Expr a, PackMatrix packMat, bool transpose, float clipValue) {
if (elementType == Type::packed16)
return Expression<cpu::variant::FbgemmPacked16PackNodeOp>(a, packMat, transpose, clipValue);
return Expression<FbgemmPacked16PackNodeOp>(a, packMat, transpose, clipValue);
else if (isPacked(elementType) && sizeOf(elementType) == 1)
return Expression<cpu::variant::FbgemmPacked8PackNodeOp>(a, packMat, elementType, transpose, clipValue);
return Expression<FbgemmPacked8PackNodeOp>(a, packMat, elementType, transpose, clipValue);
else {
ABORT("Only int8 and fp16 are available. {}", elementType);
return nullptr;
@ -394,9 +392,9 @@ static inline Expr dot(Expr a, Expr b, Shape bShape, bool transA, bool transB, f
Type elementType = b->value_type();
if (elementType == Type::packed16)
return Expression<cpu::variant::FbgemmPacked16AffineNodeOp>(nodes, bShape, transA, transB, scalar);
return Expression<FbgemmPacked16AffineNodeOp>(nodes, bShape, transA, transB, scalar);
else if (isPacked(elementType) && sizeOf(elementType) == 1)
return Expression<cpu::variant::FbgemmPacked8AffineNodeOp>(nodes, bShape, transA, transB, scalar);
return Expression<FbgemmPacked8AffineNodeOp>(nodes, bShape, transA, transB, scalar);
else {
ABORT("Only int8 and fp16 are available. {}", elementType);
return nullptr;

View File

@ -20,7 +20,7 @@ namespace marian {
namespace cpu {
void IsNaN(const Tensor in, Ptr<Allocator> allocator, bool& /*isNaN*/, bool& /*isInf*/) {
void IsNaN(const Tensor /*in*/, Ptr<Allocator> /*allocator*/, bool& /*isNaN*/, bool& /*isInf*/) {
ABORT("Not implemented");
}
@ -214,9 +214,11 @@ void Transpose0213(Tensor out, Tensor in) {
}
}
// This function is called only when MKL is available.
#if MKL_FOUND
// Given a 4D array, transpose (swap) the initial 3 dimensions while keeping the last dimension.
// e.g. 1234 --> 2134, 1234 --> 3214 (4 is always kept).
// This is an optimized version for swapping first 3 dimensions
// This is an optimized version for swapping first 3 dimensions
// assuming the last dimension is large enough to get benefits from vectorized copy.
//
// @param out output tensor
@ -225,14 +227,13 @@ void Transpose0213(Tensor out, Tensor in) {
template <bool add>
void TransposeFirst3In4(Tensor out, Tensor in, const std::vector<int>& vAxis) {
ABORT_IF(vAxis.size() != 4, "This function handles only 4D arrays.");
#if MKL_FOUND
int innermost = in->shape()[-1];
int l1 = in->shape()[vAxis[0]];
int l2 = in->shape()[vAxis[1]];
int l3 = in->shape()[vAxis[2]];
// find the mapping between the transposed output dimensional indices (oi, oj, ok)
// find the mapping between the transposed output dimensional indices (oi, oj, ok)
// and original input dimensional indices (i, j, k)
int oi, oj, ok;
#pragma omp parallel for
@ -275,11 +276,8 @@ void TransposeFirst3In4(Tensor out, Tensor in, const std::vector<int>& vAxis) {
}
}
}
#else
// it shouldn't come into here. This function is called only when MKL is available.
ABORT("Should not get here");
#endif // MKL_FOUND
}
#endif // MKL_FOUND
inline void transpose4x4_SSE(const float* A,
float* B,
@ -656,7 +654,7 @@ void SelectAxis2(Tensor out,
functional::Shape outShape = out->shape();
functional::Shape inShape = in->shape();
auto idxData = indices->data<IndexType>();
auto odata = out->data();
const auto idata = in->data();

View File

@ -15,11 +15,11 @@ protected:
public:
RandomGenerator(size_t seed) : seed_(seed) { }
virtual ~RandomGenerator() {}
virtual void uniform(Tensor, float a, float b) = 0;
virtual void normal(Tensor, float mean, float stddev) = 0;
};
Ptr<RandomGenerator> createRandomGenerator(size_t /*seed*/, DeviceId);
}
}

View File

@ -25,7 +25,7 @@
namespace marian {
template <typename InIt, typename OutIt>
void copy(Ptr<Backend> backend, const InIt beg, const InIt end, OutIt it) {
void copy(Ptr<Backend>& MAYBE_UNUSED backend, const InIt beg, const InIt end, OutIt it) {
#ifdef CUDA_FOUND
if(backend->getDeviceId().type == DeviceType::gpu)
gpu::copy(backend, beg, end, it);
@ -119,7 +119,7 @@ DISPATCH3(Concatenate, marian::Tensor, const std::vector<marian::Tensor>&, int)
// clang-format on
// Bernoulli(tensor, 0.5f, 2.f, -1.f) generates a tensor composed of 50% of 1 and 50% of -1.
// Bernoulli(tensor, 0.5f, 2.f, -1.f) generates a tensor composed of 50% of 1 and 50% of -1.
static inline void Bernoulli(Tensor resultTensor, float keepProb, float scale = 1.f, float shift = 0.f) {
// in-place uniform distribution
auto rnd = resultTensor->getBackend()->getRandomGenerator();
@ -190,7 +190,7 @@ void LayerNormalizationGrad(Tensor gradX,
}
static inline void LayerNormalizationGrad(
Ptr<Allocator> allocator,
Ptr<Allocator> MAYBE_UNUSED allocator,
Tensor gradX,
Tensor gradGamma,
Tensor gradBeta,

View File

@ -1,7 +1,7 @@
#include "marian.h"
#include "common/timer.h"
int main(int argc, char** argv) {
int main(int /*argc*/, char** /*argv*/) {
using namespace marian;
{

View File

@ -8,6 +8,8 @@
#include <fstream>
int main(int argc, char** argv) {
ABORT_IF(argc != 3, "FATAL ERROR: Incorrect number of command line arguments "
"(expected: 2) for command {}.",argv[0]);
SQLite::Database db("corpus.db", SQLite::OPEN_READWRITE|SQLite::OPEN_CREATE);
db.exec("PRAGMA temp_store_directory = '/data1/marcinjd';");

View File

@ -38,7 +38,7 @@ Ptr<ICommunicator> createCommunicator(
}
// the actual implementation is inside communicator.cu
return New<NCCLCommunicator>(graphs, mpi);
return New<NCCLCommunicator>(graphs, mpi);
#else // no CUDA or no NCCL
noNccl; // (unused)
return New<DefaultCommunicator>(graphs, mpi);
@ -141,7 +141,7 @@ public:
FakeMPIWrapper(bool) {
LOG(warn, "Compiled without MPI support. Falling back to FakeMPIWrapper");
}
virtual ~FakeMPIWrapper() {}
virtual size_t myMPIRank() const override { return 0; };
virtual size_t numMPIProcesses() const override { return 1; };

View File

@ -156,11 +156,8 @@ public:
void scatterReduceAndResetGrads() const override {
const_cast<DefaultCommunicator*>(this)->lazyInit();
int totalSize = (int)graphs_[0]->params()->vals()->size();
int shardSize = (int)ceil(totalSize / (float)graphs_.size());
// Gather gradients from different devices into current gradient shards
auto scatter = [this, shardSize](size_t idx, size_t begin, size_t end) {
auto scatter = [this](size_t idx, size_t begin, size_t end) {
auto curGrad = graphs_[idx]->params()->grads()->subtensor(begin, end-begin);
// collect and sum gradients
@ -176,7 +173,7 @@ public:
};
// reset gradients outside current shard
auto reset = [this, shardSize](size_t idx, size_t begin, size_t end) {
auto reset = [this](size_t idx, size_t begin, size_t end) {
auto grad = graphs_[idx]->params()->grads();
if (begin > 0)
grad->subtensor(0, begin)->set(0);
@ -189,11 +186,9 @@ public:
}
void allGatherParams() const override {
int totalSize = (int)graphs_[0]->params()->vals()->size();
int shardSize = (int)ceil(totalSize / (float)graphs_.size());
// Update all graphs with parameter shard
auto gather = [this, shardSize](size_t idx, size_t begin, size_t end) {
auto gather = [this](size_t idx, size_t begin, size_t end) {
auto getShard = [&](Ptr<ExpressionGraph> graph) {
return graph->params()->vals()->subtensor(begin, end-begin);
};

View File

@ -118,7 +118,7 @@ public:
}
// Convert a tensor into a sparse tensor format
void fromDense(Tensor t) {
void fromDense(Tensor MAYBE_UNUSED t) {
if(backend_->getDeviceId().type == DeviceType::cpu) {
ABORT("Gradient Dropping for CPU is not yet supported");
}

View File

@ -54,10 +54,10 @@ public:
* number of devices, which is passed in as the 'multiplier'.
*/
// @TODO: Can this be made const? It seems wrong to have a stateful method that still returns a result.
virtual Ptr<data::BatchStats> collectStats(Ptr<ExpressionGraph> graph,
Ptr<models::ICriterionFunction> model,
const std::vector<Ptr<Vocab>>& vocabs,
double multiplier = 1.) {
Ptr<data::BatchStats> collectStats(Ptr<ExpressionGraph> graph,
Ptr<models::ICriterionFunction> model,
const std::vector<Ptr<Vocab>>& vocabs,
double multiplier = 1.) {
auto stats = New<data::BatchStats>();
size_t numFiles = options_->get<std::vector<std::string>>("train-sets").size();
@ -92,8 +92,8 @@ public:
maxBatch *= 2;
}
// Do a binary search for maximum batch size that fits into given workspace memory
// for a tested sentence length.
// Do a binary search for maximum batch size that fits into given workspace memory
// for a tested sentence length.
for(size_t i = step; i <= maxLength; i += step) {
size_t start = 1;
size_t end = maxBatch;

View File

@ -64,7 +64,7 @@ public:
void save(Ptr<ExpressionGraph>, bool final = false);
// @TODO: give it a fake batch generator which own vocabs instead of passing vocabs
Ptr<data::BatchStats> collectStats(const std::vector<Ptr<Vocab>>& vocabs) {
virtual Ptr<data::BatchStats> collectStats(const std::vector<Ptr<Vocab>>& vocabs) {
return GraphGroup::collectStats(graphs_[0], builders_[0], vocabs);
}

View File

@ -63,7 +63,6 @@ private:
Tensor paramsAvg_;
std::vector<float> accGradientsSync_cpu;
std::vector<float> receiveBuffer_cpu;
bool synchronization_happened{false};
Ptr<OptimizerBase> syncOptimizer_;

View File

@ -26,7 +26,6 @@ class SyncGraphGroup : public GraphGroup, public ExponentialSmoothing {
// state for update()
bool first_{ true }; // gets interpreted and cleared by update()
std::vector<Ptr<data::Batch>> pendingBatches_; // in case of dynamic MB-size scaling, we temporarily buffer up batches across update() calls until enough
size_t typicalTrgWords_{}; // typical batch size in words (labels), 0 if unknown (e.g. specified in sentences)
double updateMultiplier_{1}; // multiplier not applied in collectStats() (no multiplier if not mini-batch-fit)
void initialize(const Ptr<data::Batch>& exampleBatch);

View File

@ -379,6 +379,14 @@ public:
state_->wordsDisp = 0;
}
if(options_->get<bool>("valid-reset-stalled")) {
state_->stalled = 0;
state_->maxStalled = 0;
for(const auto& validator : validators_) {
state_->validators[validator->type()]["stalled"] = 0;
}
}
state_->newLoad();
}

View File

@ -14,7 +14,7 @@ class TrainingState;
class TrainingObserver {
public:
virtual ~TrainingObserver() {}
virtual void init(TrainingState&) {}
virtual void actAfterEpoch(TrainingState&) {}
virtual void actAfterBatches(TrainingState&) {}

View File

@ -36,6 +36,7 @@ protected:
public:
ValidatorBase(bool lowerIsBetter) : lowerIsBetter_(lowerIsBetter), lastBest_{initScore()} {}
virtual ~ValidatorBase() {}
virtual float validate(const std::vector<Ptr<ExpressionGraph>>& graphs,
Ptr<const TrainingState> state) = 0;
@ -51,6 +52,7 @@ public:
template <class DataSet, class BuilderType> // @TODO: BuilderType doesn't really serve a purpose here? Review and remove.
class Validator : public ValidatorBase {
public:
virtual ~Validator() {}
Validator(std::vector<Ptr<Vocab>> vocabs, Ptr<Options> options, bool lowerIsBetter = true)
: ValidatorBase(lowerIsBetter),
vocabs_(vocabs),
@ -137,6 +139,7 @@ class CrossEntropyValidator : public Validator<data::Corpus, models::ICriterionF
public:
CrossEntropyValidator(std::vector<Ptr<Vocab>> vocabs, Ptr<Options> options);
virtual ~CrossEntropyValidator() {}
std::string type() override { return options_->get<std::string>("cost-type"); }
@ -148,6 +151,7 @@ protected:
class AccuracyValidator : public Validator<data::Corpus, models::IModel> {
public:
AccuracyValidator(std::vector<Ptr<Vocab>> vocabs, Ptr<Options> options);
virtual ~AccuracyValidator() {}
std::string type() override { return "accuracy"; }
@ -161,6 +165,7 @@ private:
public:
BertAccuracyValidator(std::vector<Ptr<Vocab>> vocabs, Ptr<Options> options, bool evalMaskedLM);
virtual ~BertAccuracyValidator() {}
std::string type() override {
if(evalMaskedLM_)
@ -177,6 +182,7 @@ protected:
class ScriptValidator : public Validator<data::Corpus, models::IModel> {
public:
ScriptValidator(std::vector<Ptr<Vocab>> vocabs, Ptr<Options> options);
virtual ~ScriptValidator() {}
virtual float validate(const std::vector<Ptr<ExpressionGraph>>& graphs,
Ptr<const TrainingState> /*ignored*/) override;
@ -193,6 +199,7 @@ protected:
class TranslationValidator : public Validator<data::Corpus, models::IModel> {
public:
TranslationValidator(std::vector<Ptr<Vocab>> vocabs, Ptr<Options> options);
virtual ~TranslationValidator() {}
virtual float validate(const std::vector<Ptr<ExpressionGraph>>& graphs,
Ptr<const TrainingState> state) override;
@ -212,6 +219,7 @@ protected:
class BleuValidator : public Validator<data::Corpus, models::IModel> {
public:
BleuValidator(std::vector<Ptr<Vocab>> vocabs, Ptr<Options> options, bool detok = false);
virtual ~BleuValidator() {}
virtual float validate(const std::vector<Ptr<ExpressionGraph>>& graphs,
Ptr<const TrainingState> state) override;

View File

@ -16,7 +16,7 @@ private:
Ptr<Options> options_;
std::vector<Ptr<Scorer>> scorers_;
size_t beamSize_;
Ptr<Vocab> trgVocab_;
Ptr<const Vocab> trgVocab_;
const float INVALID_PATH_SCORE = std::numeric_limits<float>::lowest(); // @TODO: observe this closely
const bool PURGE_BATCH = true; // @TODO: diagnostic, to-be-removed once confirmed there are no issues.
@ -24,7 +24,7 @@ private:
public:
BeamSearch(Ptr<Options> options,
const std::vector<Ptr<Scorer>>& scorers,
Ptr<Vocab> trgVocab)
const Ptr<const Vocab> trgVocab)
: options_(options),
scorers_(scorers),
beamSize_(options_->get<size_t>("beam-size")),
@ -42,8 +42,8 @@ public:
const std::vector<bool>& dropBatchEntries, // [origDimBatch] - empty source batch entries are marked with true, should be cleared after first use.
const std::vector<IndexType>& batchIdxMap) const { // [origBatchIdx -> currentBatchIdx]
std::vector<float> align; // collects alignment information from the last executed time step
if(options_->hasAndNotEmpty("alignment") && factorGroup == 0)
align = scorers_[0]->getAlignment(); // [beam depth * max src length * current batch size] -> P(s|t); use alignments from the first scorer, even if ensemble,
if(options_->hasAndNotEmpty("alignment") && factorGroup == 0)
align = scorers_[0]->getAlignment(); // [beam depth * max src length * current batch size] -> P(s|t); use alignments from the first scorer, even if ensemble,
const auto origDimBatch = beams.size(); // see function search for definition of origDimBatch and currentDimBatch etc.
Beams newBeams(origDimBatch); // return value of this function goes here. There are always origDimBatch beams.
@ -56,7 +56,7 @@ public:
reverseBatchIdxMap.resize(batchIdxMap.size()); // adjust size if doing batch purging.
currentDimBatch = 0;
for(int i = 0; i < batchIdxMap.size(); ++i) {
reverseBatchIdxMap[batchIdxMap[i]] = i; // reverse batch index mapping, multiple occurrences get overwritten with the last one,
reverseBatchIdxMap[batchIdxMap[i]] = i; // reverse batch index mapping, multiple occurrences get overwritten with the last one,
// which is expected due to down-shifting
if(!beams[i].empty())
currentDimBatch++;
@ -154,12 +154,12 @@ public:
auto lval = states[j]->getLogProbs().getFactoredLogitsTensor(factorGroup); // [maxBeamSize, 1, currentDimBatch, dimFactorVocab]
// The flatting happens based on actual (current) batch size and batch index computed with batch-pruning as we are looking into the pruned tensor
size_t flattenedLogitIndex = (beamHypIdx * currentDimBatch + currentBatchIdx) * vocabSize + wordIdx; // (beam idx, batch idx, word idx); note: beam and batch are transposed, compared to 'key'
// @TODO: use a function on shape() to index, or new method val->at({i1, i2, i3, i4}) with broadcasting
ABORT_IF(lval->shape() != Shape({(int)nBestBeamSize, 1, (int)currentDimBatch, (int)vocabSize}) &&
(beamHypIdx == 0 && lval->shape() != Shape({1, 1, (int)currentDimBatch, (int)vocabSize})),
"Unexpected shape of logits?? {} != {}", lval->shape(), Shape({(int)nBestBeamSize, 1, (int)currentDimBatch, (int)vocabSize}));
breakDown[j] += lval->get(flattenedLogitIndex);
}
hyp->setScoreBreakdown(breakDown);
@ -173,7 +173,7 @@ public:
newBeam.push_back(hyp);
}
// if factored vocab and this is not the first factor, we need to
// also propagate factored hypotheses that do not get expanded in this step because they don't have this factor
if (factorGroup > 0) {
@ -225,7 +225,7 @@ public:
// in a single beam, i.e.:
// * [word1-batch1, word1-batch2, ..., word2-batch1, ...]
//
size_t origDimBatch = batch->size(); // number of sentences in batch
size_t batchWidth = batch->width(); // max src length
@ -254,7 +254,7 @@ public:
for(auto beam : beams) {
Beam newBeam; // a beam of surviving hyps
for(auto hyp : beam)
if(hyp->getWord() != trgEosId) // if this hyp is not finished,
if(hyp->getWord() != trgEosId) // if this hyp is not finished,
newBeam.push_back(hyp); // move over to beam of surviving hyps
if(PURGE_BATCH)
@ -309,8 +309,8 @@ public:
// create one beam per batch entry with sentence-start hypothesis
Beams beams(origDimBatch, Beam(beamSize_, Hypothesis::New())); // array [origDimBatch] of array [maxBeamSize] of Hypothesis, keeps full size through search.
// batch purging is determined from an empty sub-beam.
std::vector<IndexType> batchIdxMap(origDimBatch); // Record at which batch entry a beam is looking.
// By default that corresponds to position in array,
std::vector<IndexType> batchIdxMap(origDimBatch); // Record at which batch entry a beam is looking.
// By default that corresponds to position in array,
// but shifts in the course of removing batch entries when they are finished.
const std::vector<bool> emptyBatchEntries; // used for recording if there are empty input batch entries
@ -370,7 +370,7 @@ public:
std::vector<IndexType> hypIndices; // [maxBeamSize, 1, currentDimBatch, 1] (flattened) tensor index ((beamHypIdx, batchIdx), flattened) of prev hyp that a hyp originated from
std::vector<Word> prevWords; // [maxBeamSize, 1, currentDimBatch, 1] (flattened) word that a hyp ended in, for advancing the decoder-model's history
Expr prevPathScores; // [maxBeamSize, 1, currentDimBatch, 1], path score that a hyp ended in (last axis will broadcast into vocab size when adding expandedPathScores)
bool anyCanExpand = false; // stays false if all hyps are invalid factor expansions
if(t == 0 && factorGroup == 0) { // no scores yet
prevPathScores = graph->constant({1, 1, 1, 1}, inits::fromValue(0));
@ -384,7 +384,7 @@ public:
for(int currentBatchIdx = 0; currentBatchIdx < beams.size(); ++currentBatchIdx) // loop over batch entries (active sentences)
if(!beams[currentBatchIdx].empty() || !PURGE_BATCH) // for each beam check
batchIndices.push_back(prevBatchIdxMap[currentBatchIdx]); // which batch entries were active in previous step
std::vector<float> prevScores;
for(size_t beamHypIdx = 0; beamHypIdx < maxBeamSize; ++beamHypIdx) { // loop over globally maximal beam-size (maxBeamSize)
for(int origBatchIdx = 0; origBatchIdx < origDimBatch; ++origBatchIdx) { // loop over all batch entries (active and inactive)
@ -401,11 +401,11 @@ public:
if(factorGroup == 0)
currentBatchIdx = prevBatchIdxMap[origBatchIdx]; // subselection may happen for factorGroup == 0
else
currentBatchIdx = batchIdxMap[origBatchIdx]; // no subselection happens for factorGroup > 0,
// but we treat it like a next step, since a step
currentBatchIdx = batchIdxMap[origBatchIdx]; // no subselection happens for factorGroup > 0,
// but we treat it like a next step, since a step
// happened for factorGroup == 0
}
auto hypIndex = (IndexType)(hyp->getPrevStateIndex() * currentDimBatch + currentBatchIdx); // (beamHypIdx, batchIdx), flattened, for index_select() operation
hypIndices.push_back(hypIndex); // (beamHypIdx, batchIdx), flattened as said above.
@ -420,7 +420,7 @@ public:
}
}
}
if(factorGroup == 0)
if(factorGroup == 0)
currentDimBatch = (IndexType) batchIndices.size(); // keep batch size constant for all factor groups in a time step
prevPathScores = graph->constant({(int)maxBeamSize, 1, (int)currentDimBatch, 1}, inits::fromVector(prevScores));
}
@ -505,7 +505,7 @@ public:
beams,
states, // used for keeping track of per-ensemble-member path score
batch, // only used for propagating alignment info
factoredVocab, factorGroup,
factoredVocab, factorGroup,
emptyBatchEntries, // [origDimBatch] - empty source batch entries are marked with true
batchIdxMap); // used to create a reverse batch index map to recover original batch indices for this step
} // END FOR factorGroup = 0 .. numFactorGroups-1

View File

@ -20,34 +20,6 @@ public:
NthElementCPU() {}
NthElementCPU(const NthElementCPU& copy) = delete;
private:
// for each batch, select the max N elements, where N is the beam size for this batch.
void selectNBest(const float* scores,
const std::vector<int>& batchFirstElementIdxs,
const std::vector<int>& cumulativeBeamSizes) {
int numProbs = batchFirstElementIdxs.back();
std::vector<int> idxs(numProbs);
std::iota(idxs.begin(), idxs.end(), 0);
size_t numBatches = batchFirstElementIdxs.size() - 1;
for(size_t batchIdx = 0; batchIdx < numBatches; ++batchIdx) {
int pos = cumulativeBeamSizes[batchIdx];
int beamSize = cumulativeBeamSizes[batchIdx + 1] - pos;
std::vector<int>::iterator begin = idxs.begin() + batchFirstElementIdxs[batchIdx];
std::vector<int>::iterator middle = begin + beamSize;
std::vector<int>::iterator end = idxs.begin() + batchFirstElementIdxs[batchIdx + 1];
std::partial_sort(
begin, middle, end, [&](int a, int b) { return scores[a] > scores[b]; });
while(begin != middle) {
int idx = *begin++;
h_res_idx[pos] = idx;
h_res[pos] = scores[idx];
++pos;
}
}
}
public:
void getNBestList(Tensor scores, // [dimBatch, 1, beamSize, dimVocab or dimShortlist]
@ -59,23 +31,39 @@ public:
const auto inputN = scores->shape()[-2];
const auto dimBatch = scores->shape()[-4];
ABORT_IF(inputN != (isFirst ? 1 : N), "Input tensor has wrong beam dim??"); // @TODO: Remove isFirst argument altogether
std::vector<int> cumulativeBeamSizes(dimBatch + 1, 0);
std::vector<int> batchFirstElementIdxs(dimBatch + 1, 0);
for(int batchIdx = 0; batchIdx < dimBatch; ++batchIdx) {
cumulativeBeamSizes[batchIdx + 1] = (batchIdx + 1) * (int)N;
batchFirstElementIdxs[batchIdx + 1] += (batchIdx + 1) * inputN * vocabSize;
ABORT_IF(cumulativeBeamSizes[batchIdx + 1] != cumulativeBeamSizes[batchIdx] + (int)N, "cumulativeBeamSizes wrong??");
ABORT_IF((isFirst ? batchIdx + 1 : cumulativeBeamSizes[batchIdx + 1]) != (batchIdx + 1) * inputN, "inputN wrong??");
}
ABORT_IF(cumulativeBeamSizes.back() != dimBatch * N, "cumulativeBeamSizes.back() wrong??");
const float* scoresData = scores->data();
size_t maxSize = N * dimBatch;
h_res.resize(maxSize);
h_res_idx.resize(maxSize);
size_t pos = 0; // iterates through h_res and h_res_idx
selectNBest(scores->data(), batchFirstElementIdxs, cumulativeBeamSizes);
size_t batchOffset = inputN * vocabSize;
std::vector<int> idxs(batchOffset); // re-used for each batch
std::iota(idxs.begin(), idxs.end(), 0);
for(size_t batchIdx = 0; batchIdx < dimBatch; ++batchIdx) {
std::partial_sort(
// sorts the top N (beam size) idxs by score to the front
idxs.begin(),
idxs.begin() + N,
idxs.end(),
[&](int a, int b) { return scoresData[a] > scoresData[b]; }
);
// copy top N idxs and scores to return vectors
for(size_t i = 0; i < N; ++i) {
int idx = idxs[i];
// since idxs is re-used for each batch, add batch offset to each idx to get absolute position
h_res_idx[pos] = idx + batchIdx * batchOffset;
h_res[pos] = scoresData[idx];
++pos;
}
// advance pointer to next batch's beginning
scoresData += batchOffset;
}
getPairs(/*cumulativeBeamSizes.back(),*/ outKeys, outPathScores);
}

View File

@ -11,6 +11,7 @@ namespace marian {
class PrintingStrategy {
public:
virtual ~PrintingStrategy() {}
virtual bool shouldBePrinted(long) = 0;
};

View File

@ -10,6 +10,8 @@ namespace marian {
class ScorerState {
public:
virtual ~ScorerState(){}
virtual Logits getLogProbs() const = 0;
virtual void blacklist(Expr /*totalCosts*/, Ptr<data::CorpusBatch> /*batch*/){};
@ -24,6 +26,8 @@ public:
Scorer(const std::string& name, float weight)
: name_(name), weight_(weight) {}
virtual ~Scorer(){}
std::string getName() { return name_; }
float getWeight() { return weight_; }
@ -53,6 +57,7 @@ protected:
public:
ScorerWrapperState(Ptr<DecoderState> state) : state_(state) {}
virtual ~ScorerWrapperState() {}
virtual Ptr<DecoderState> getState() { return state_; }
@ -88,6 +93,8 @@ public:
encdec_(std::static_pointer_cast<IEncoderDecoder>(encdec)),
ptr_{ptr} {}
virtual ~ScorerWrapper() {}
virtual void init(Ptr<ExpressionGraph> graph) override {
graph->switchParams(getName());
if(ptr_)

View File

@ -109,6 +109,17 @@ public:
threadPool.enqueue(task, device, id++);
}
if(options_->get<bool>("output-sampling", false)) {
if(options_->get<size_t>("beam-size") > 1)
LOG(warn,
"[warning] Output sampling and beam search (beam-size > 1) are contradictory methods "
"and using them together is not recommended. Set beam-size to 1");
if(options_->get<std::vector<std::string>>("models").size() > 1)
LOG(warn,
"[warning] Output sampling and model ensembling are contradictory methods and using "
"them together is not recommended. Use a single model");
}
}
void run() override {