Merged PR 16337: Update sentencepiece to new version

This updates the SentencePiece version in Marian to a much more recent revision. As a result, there is no longer a dependency on Protobuf.
This commit is contained in:
Martin Junczys-Dowmunt 2020-11-11 00:38:37 +00:00
parent b90229d8ee
commit 9dad84ae9b
6 changed files with 12 additions and 8 deletions

View File

@@ -32,6 +32,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Quantized training (fixed-point or log-based quantization) with the --quantize-bits N option
### Fixed
- The segfault of spm_train when compiled with -DUSE_STATIC_LIBS=ON seems to have gone away with the update to the newer SentencePiece version.
- Fix bug causing certain reductions into scalars to be 0 on the GPU backend. Removed unnecessary warp shuffle instructions.
- Do not apply dropout in embedding layers during inference with dropout-src/trg
- Print "server is listening on port" message after it is accepting connections
@@ -53,6 +54,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Improved handling of receiving SIGTERM during training. By default, SIGTERM triggers 'save (now) and exit'. Prior to this fix, batch pre-fetching did not check for this signal, potentially delaying exit considerably; it now does. Also, the default save-and-exit behaviour can now be disabled on the command line with --sigterm exit-immediately (a minimal sketch of the signal-flag pattern follows this changelog excerpt).
### Changed
- Updated SentencePiece repository to version 8336bbd0c1cfba02a879afe625bf1ddaf7cd93c5 from https://github.com/google/sentencepiece.
- Enabled compilation of SentencePiece by default since there is no longer a dependency on Protobuf.
- Changed the default value of --sentencepiece-max-lines from 10000000 to 2000000 since the new version apparently no longer samples automatically (it is not quite clear how this affects the quality of the vocabulary).
- Change the mini-batch-fit search stopping criterion to stop at the ideal binary-search threshold.
- --metric bleu now always detokenizes SacreBLEU-style if the vocabulary knows how to; use bleu-segmented to compute BLEU on word IDs. bleu-detok is now a synonym for bleu.
- Move label-smoothing computation into Cross-entropy node
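
The SIGTERM entry above boils down to a common pattern: the signal handler only records the request in a flag, and both the training loop and the batch pre-fetcher poll that flag so a pending exit is not delayed. Below is a minimal, self-contained sketch of that pattern; the names (gotSigterm, the placeholder prefetch/save calls) are illustrative assumptions, not Marian's actual identifiers.

```cpp
#include <csignal>
#include <cstdio>

// Signal handlers should only touch async-signal-safe state such as a
// volatile std::sig_atomic_t flag.
static volatile std::sig_atomic_t gotSigterm = 0;

static void handleSigterm(int /*signum*/) {
  gotSigterm = 1;  // record the request; act on it from normal code
}

int main() {
  std::signal(SIGTERM, handleSigterm);

  for(int update = 0; update < 1000000; ++update) {
    // Hypothetical batch pre-fetch loop: also check the flag here, so a
    // pending SIGTERM does not have to wait for a long pre-fetch pass.
    for(int batch = 0; batch < 100 && !gotSigterm; ++batch) {
      /* prefetchBatch(batch); */
    }

    if(gotSigterm) {
      std::puts("SIGTERM received: saving model and exiting");
      /* saveCheckpoint(); */  // skipped with --sigterm exit-immediately
      break;
    }

    /* trainOnBatches(); */
  }
  return 0;
}
```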

View File

@@ -23,7 +23,7 @@ option(USE_FBGEMM "Use FBGEMM" OFF)
option(USE_MKL "Compile with MKL support" ON)
option(USE_MPI "Use MPI library" OFF)
option(USE_NCCL "Use NCCL library" ON)
option(USE_SENTENCEPIECE "Download and compile SentencePiece" OFF)
option(USE_SENTENCEPIECE "Download and compile SentencePiece" ON)
option(USE_STATIC_LIBS "Link statically against non-system libs" OFF)
option(GENERATE_MARIAN_INSTALL_TARGETS "Generate Marian install targets (requires CMake 3.12+)" OFF)
@@ -236,6 +236,7 @@ endif()
if(USE_ONNX)
message(STATUS "Enabling experimental ONNX support")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_ONNX")
# TODO: ONNX support likely needs to find Protobuf on its own now; we should check/fix this. Previously it relied on SentencePiece doing that.
set(EXT_LIBS ${EXT_LIBS} protobuf)
include_directories(${Protobuf_INCLUDE_DIRS})
endif()

@@ -1 +1 @@
Subproject commit 910d489b7b71f306ab3867d696f86ab25f7a1b42
Subproject commit 16914ae94c80f338c678f0461c4e45965149f6aa

View File

@@ -66,10 +66,6 @@ if(USE_SENTENCEPIECE)
set(SPM_ENABLE_TCMALLOC ON CACHE BOOL "Enable TCMalloc if available.")
if(USE_STATIC_LIBS)
message(WARNING "You are compiling SentencePiece binaries with -DUSE_STATIC_LIBS=on. \
This will cause spm_train to segfault. No need to worry if you do not intend to use that binary. \
Marian support for SentencePiece will work fine.")
set(SPM_ENABLE_SHARED OFF CACHE BOOL "Builds shared libraries in addition to static libraries." FORCE)
set(SPM_TCMALLOC_STATIC ON CACHE BOOL "Link static library of TCMALLOC." FORCE)
else(USE_STATIC_LIBS)

View File

@@ -372,7 +372,7 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) {
cli.add<size_t>("--sentencepiece-max-lines",
"Maximum lines to train SentencePiece vocabulary, selected with sampling from all data. "
"When set to 0 all lines are going to be used.",
10000000);
2000000);
#endif
// scheduling options
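
For context on what the --sentencepiece-max-lines cap does: the vocabulary trainer sees at most that many lines, sampled from the full training data, and 0 means use everything. Since the new SentencePiece version apparently no longer samples on its own, the cap set here matters more. Below is a rough sketch of such a sampling step using reservoir sampling; sampleLines is a hypothetical helper for illustration, not Marian's or SentencePiece's actual code.

```cpp
#include <iostream>
#include <random>
#include <string>
#include <vector>

// Keep a uniform random sample of up to maxLines lines from an arbitrarily
// long input stream (maxLines == 0 keeps every line).
std::vector<std::string> sampleLines(std::istream& in, size_t maxLines, unsigned seed = 1234) {
  std::vector<std::string> reservoir;
  std::mt19937 rng(seed);
  std::string line;
  size_t seen = 0;
  while(std::getline(in, line)) {
    ++seen;
    if(maxLines == 0 || reservoir.size() < maxLines) {
      reservoir.push_back(line);
    } else {
      // Replace a random existing element with probability maxLines / seen.
      std::uniform_int_distribution<size_t> pick(0, seen - 1);
      size_t j = pick(rng);
      if(j < maxLines)
        reservoir[j] = line;
    }
  }
  return reservoir;
}

int main() {
  auto lines = sampleLines(std::cin, /*maxLines=*/2000000);
  std::cerr << "sampled " << lines.size() << " lines\n";
  return 0;
}
```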

View File

@@ -90,6 +90,8 @@ void ModelQuantizer::quantize(Ptr<ExpressionGraph> graph) {
allocator->reserveExact(graph->params()->vals()->memory()->size());
allocator->allocate(errorResidual_, {1, numElements});
errorResidual_->set(0);
allocators_.push_back(allocator);
isFirstError_ = true;
}
@@ -140,7 +142,6 @@ void ModelQuantizer::quantizeImpl(Tensor t) {
allocators_.push_back(allocator);
}
Tensor q = delta_->subtensor(0, t->size()); // to store the quantized t
Tensor tflat = t->subtensor(0, t->size()); // flatten t for reduce
float S = 0.0f; // scaling factor S
@@ -153,6 +154,8 @@ void ModelQuantizer::quantizeImpl(Tensor t) {
// optimize the scaling factor S
for(int i = 0; i < optSteps_; i++) {
Tensor q = delta_->subtensor(0, t->size()); // to store the quantized t
// Let t be the original tensor and q the quantized tensor, with q = S*a where S is the
// scaling factor. We want to optimize S to minimize MSE(S*a - t); therefore S =
// sum(a*t)/sum(a*a). See https://www.aclweb.org/anthology/2020.ngt-1.4.pdf for more details.
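
The comment above gives the closed-form update for the scaling factor: with q = S*a, the S that minimizes MSE(S*a - t) is sum(a*t)/sum(a*a). The sketch below runs a few such refinement steps on a plain vector; refitScale is a hypothetical helper assuming signed fixed-point quantization, not Marian's tensor implementation.

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// One alternating-optimization step: quantize t with the current scale S,
// then re-fit S in closed form as sum(a*t) / sum(a*a), where a = round(t / S)
// clipped to the representable N-bit range.
float refitScale(const std::vector<float>& t, float S, int bits) {
  const float maxLevel = static_cast<float>((1 << (bits - 1)) - 1);  // e.g. 7 for 4 bits
  double sumAT = 0.0, sumAA = 0.0;
  for(float x : t) {
    float a = std::round(x / S);                       // fixed-point level for x
    a = std::fmax(-maxLevel, std::fmin(maxLevel, a));  // clip to representable range
    sumAT += a * x;
    sumAA += a * a;
  }
  return sumAA > 0.0 ? static_cast<float>(sumAT / sumAA) : S;
}

int main() {
  std::vector<float> t = {0.31f, -0.12f, 0.05f, 0.44f, -0.27f};
  float S = 0.1f;              // initial scale, e.g. max|t| / maxLevel
  for(int i = 0; i < 5; ++i)   // a few refinement steps, as with optSteps_ above
    S = refitScale(t, S, /*bits=*/4);
  std::printf("refined scale: %f\n", S);
  return 0;
}
```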