mirror of
https://github.com/marian-nmt/marian.git
synced 2024-11-03 20:13:47 +03:00
Update sentencepiece to newest version (#753)
- Updates sentencepiece to the newest version (removes dependency on protobuf) - Enables SentencePiece compilation by default since there is no dependency on protobuf anymore.
This commit is contained in:
parent
65ea504ec8
commit
bbdccd1c92
@ -32,6 +32,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
|
||||
- Quantized training (fixed point or log-based quantization) with --quantize-bits N command
|
||||
|
||||
### Fixed
|
||||
- Segfault of spm_train when compiled with -DUSE_STATIC_LIBS=ON seems to have gone away with update to newer SentencePiece version.
|
||||
- Fix bug causing certain reductions into scalars to be 0 on the GPU backend. Removed unnecessary warp shuffle instructions.
|
||||
- Do not apply dropout in embeddings layers during inference with dropout-src/trg
|
||||
- Print "server is listening on port" message after it is accepting connections
|
||||
@ -53,6 +54,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
|
||||
- Improved handling for receiving SIGTERM during training. By default, SIGTERM triggers 'save (now) and exit'. Prior to this fix, batch pre-fetching did not check for this signal, potentially delaying exit considerably. Batch pre-fetching now checks for the signal. Also, the default behaviour of save-and-exit can now be disabled on the command line with --sigterm exit-immediately.
|
||||
|
||||
### Changed
|
||||
- Updated SentencePiece repository to version 8336bbd0c1cfba02a879afe625bf1ddaf7cd93c5 from https://github.com/google/sentencepiece.
|
||||
- Enabled compilation of SentencePiece by default since there is no longer a dependency on protobuf.
|
||||
- Changed default value of --sentencepiece-max-lines from 10000000 to 2000000 since apparently the new version doesn't sample automatically anymore (it is not quite clear how that affects the quality of the vocabulary).
|
||||
- --metric bleu now always detokenizes SacreBLEU-style if a vocabulary knows how to; use bleu-segmented to compute BLEU on word ids. bleu-detok is now a synonym for bleu.
|
||||
- Move label-smoothing computation into Cross-entropy node
|
||||
- Move Simple-WebSocket-Server to submodule
|
||||
|
@ -23,7 +23,7 @@ option(USE_FBGEMM "Use FBGEMM" OFF)
|
||||
option(USE_MKL "Compile with MKL support" ON)
|
||||
option(USE_MPI "Use MPI library" OFF)
|
||||
option(USE_NCCL "Use NCCL library" ON)
|
||||
option(USE_SENTENCEPIECE "Download and compile SentencePiece" OFF)
|
||||
option(USE_SENTENCEPIECE "Download and compile SentencePiece" ON)
|
||||
option(USE_STATIC_LIBS "Link statically against non-system libs" OFF)
|
||||
option(GENERATE_MARIAN_INSTALL_TARGETS "Generate Marian install targets (requires CMake 3.12+)" OFF)
|
||||
|
||||
@ -236,6 +236,7 @@ endif()
|
||||
if(USE_ONNX)
|
||||
message(STATUS "Enabling experimental ONNX support")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_ONNX")
|
||||
# TODO: likely required to find protobuf by itself, we should check/fix this. Before it would take advantage of sentencepiece doing that.
|
||||
set(EXT_LIBS ${EXT_LIBS} protobuf)
|
||||
include_directories(${Protobuf_INCLUDE_DIRS})
|
||||
endif()
|
||||
|
@ -1 +1 @@
|
||||
Subproject commit cdad78089484d7817d91c803d6fc7049328e20db
|
||||
Subproject commit 17dacc949bf837d5998cbdcc9edcec60495e6d6c
|
4
src/3rd_party/CMakeLists.txt
vendored
4
src/3rd_party/CMakeLists.txt
vendored
@ -66,10 +66,6 @@ if(USE_SENTENCEPIECE)
|
||||
set(SPM_ENABLE_TCMALLOC ON CACHE BOOL "Enable TCMalloc if available.")
|
||||
|
||||
if(USE_STATIC_LIBS)
|
||||
message(WARNING "You are compiling SentencePiece binaries with -DUSE_STATIC_LIBS=on. \
|
||||
This will cause spm_train to segfault. No need to worry if you do not intend to use that binary. \
|
||||
Marian support for SentencePiece will work fine.")
|
||||
|
||||
set(SPM_ENABLE_SHARED OFF CACHE BOOL "Builds shared libaries in addition to static libraries." FORCE)
|
||||
set(SPM_TCMALLOC_STATIC ON CACHE BOOL "Link static library of TCMALLOC." FORCE)
|
||||
else(USE_STATIC_LIBS)
|
||||
|
2
src/3rd_party/sentencepiece
vendored
2
src/3rd_party/sentencepiece
vendored
@ -1 +1 @@
|
||||
Subproject commit c0a84a4ff8bdc200480e179d57ece43d4929d242
|
||||
Subproject commit 8336bbd0c1cfba02a879afe625bf1ddaf7cd93c5
|
@ -372,7 +372,7 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) {
|
||||
cli.add<size_t>("--sentencepiece-max-lines",
|
||||
"Maximum lines to train SentencePiece vocabulary, selected with sampling from all data. "
|
||||
"When set to 0 all lines are going to be used.",
|
||||
10000000);
|
||||
2000000);
|
||||
#endif
|
||||
// scheduling options
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user