mirror of
https://github.com/google/sentencepiece.git
synced 2025-01-08 18:26:38 +03:00
Added Tensorflow module
This commit is contained in:
parent
65da95da9a
commit
81d1a091fb
@ -22,12 +22,17 @@ matrix:
|
|||||||
- os: linux
|
- os: linux
|
||||||
env: IMAGE=x86_64 COMMAND=make_py_wheel
|
env: IMAGE=x86_64 COMMAND=make_py_wheel
|
||||||
script:
|
script:
|
||||||
- $TRAVIS_BUILD_DIR/make_py_wheel.sh ${IMAGE}
|
- $TRAVIS_BUILD_DIR/python/make_py_wheel.sh ${IMAGE}
|
||||||
services: docker
|
services: docker
|
||||||
- os: linux
|
- os: linux
|
||||||
env: IMAGE=i686 COMMAND=make_py_wheel
|
env: IMAGE=i686 COMMAND=make_py_wheel
|
||||||
script:
|
script:
|
||||||
- $TRAVIS_BUILD_DIR/make_py_wheel.sh ${IMAGE}
|
- $TRAVIS_BUILD_DIR/python/make_py_wheel.sh ${IMAGE}
|
||||||
|
services: docker
|
||||||
|
- os: linux
|
||||||
|
env: IMAGE=x86_64 COMMAND=make_py_wheel
|
||||||
|
script:
|
||||||
|
- $TRAVIS_BUILD_DIR/tensorflow/make_py_wheel.sh
|
||||||
services: docker
|
services: docker
|
||||||
- os: osx
|
- os: osx
|
||||||
osx_image: xcode9.3
|
osx_image: xcode9.3
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#!/bin/sh
|
#!/bin/bash
|
||||||
|
|
||||||
# Copyright 2018 Google Inc.
|
# Copyright 2018 Google Inc.
|
||||||
#
|
#
|
||||||
@ -14,27 +14,30 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.!
|
# limitations under the License.!
|
||||||
|
|
||||||
# Usage:
|
|
||||||
# > sudo sh make_py_wheel.sh
|
|
||||||
# wheel packages are built under <pwd>/manylinux_wh dir
|
|
||||||
|
|
||||||
set -e # exit immediately on error
|
set -e # exit immediately on error
|
||||||
set -x # display all commands
|
set -x # display all commands
|
||||||
|
|
||||||
|
PROTOBUF_VERSION=3.6.0
|
||||||
|
|
||||||
run_docker() {
|
run_docker() {
|
||||||
rm -fr manylinux_wh/$2
|
cd `dirname $0`
|
||||||
mkdir -p manylinux_wh/$2
|
docker pull $1
|
||||||
docker pull "$1"
|
docker run --rm -ti --name py_sentencepiece \
|
||||||
docker run --rm -ti --name manylinux -v `pwd`:/sentencepiece -w /sentencepiece/manylinux_wh/$2 -td "$1" /bin/bash
|
-v `pwd`/../:/sentencepiece -w /sentencepiece/python \
|
||||||
docker exec manylinux bash -c "../../make_py_wheel.sh make_wheel $2"
|
-td $1 /bin/bash
|
||||||
docker stop manylinux
|
docker exec py_sentencepiece bash -c "./make_py_wheel.sh native"
|
||||||
|
docker stop py_sentencepiece
|
||||||
}
|
}
|
||||||
|
|
||||||
make_wheel() {
|
build() {
|
||||||
export PATH="/usr/local/bin:$PATH"
|
rm -fr tmp
|
||||||
TRG=$1
|
mkdir -p tmp
|
||||||
|
|
||||||
wget http://ftpmirror.gnu.org/libtool/libtool-2.4.6.tar.gz
|
# Installs necessary libraries under `tmp` sub directory.
|
||||||
|
cd tmp
|
||||||
|
|
||||||
|
# Install libtool
|
||||||
|
curl -L -O http://ftpmirror.gnu.org/libtool/libtool-2.4.6.tar.gz
|
||||||
tar zxfv libtool-2.4.6.tar.gz
|
tar zxfv libtool-2.4.6.tar.gz
|
||||||
cd libtool-2.4.6
|
cd libtool-2.4.6
|
||||||
./configure
|
./configure
|
||||||
@ -42,23 +45,27 @@ make_wheel() {
|
|||||||
make install
|
make install
|
||||||
cd ..
|
cd ..
|
||||||
|
|
||||||
git clone https://github.com/google/protobuf.git
|
# Install protobuf
|
||||||
cd protobuf
|
curl -L -O https://github.com/google/protobuf/releases/download/v${PROTOBUF_VERSION}/protobuf-cpp-${PROTOBUF_VERSION}.tar.gz
|
||||||
./autogen.sh
|
tar zxfv protobuf-cpp-${PROTOBUF_VERSION}.tar.gz
|
||||||
|
cd protobuf-${PROTOBUF_VERSION}
|
||||||
./configure --disable-shared --with-pic
|
./configure --disable-shared --with-pic
|
||||||
make -j4
|
make CXXFLAGS+="-std=c++11 -O3" \
|
||||||
make install
|
CFLAGS+="-std=c++11 -O3" -j4
|
||||||
cd ..
|
make install || true
|
||||||
|
cd ../..
|
||||||
|
|
||||||
cd ../../
|
# Install sentencepiece
|
||||||
|
cd ..
|
||||||
make distclean || true
|
make distclean || true
|
||||||
./autogen.sh
|
./autogen.sh
|
||||||
grep -v PKG_CHECK_MODULES configure > tmp
|
grep -v PKG_CHECK_MODULES configure > tmp
|
||||||
mv tmp -f configure
|
mv tmp -f configure
|
||||||
chmod +x configure
|
chmod +x configure
|
||||||
LIBS+="-pthread -L/usr/local/lib -lprotobuf" ./configure --disable-shared --with-pic
|
LIBS+="-pthread -L/usr/local/lib -lprotobuf" ./configure --disable-shared --with-pic
|
||||||
make -j4
|
make CXXFLAGS+="-std=c++11 -O3" \
|
||||||
make install
|
CFLAGS+="-std=c++11 -O3 -D_GLIBCXX_USE_CXX11_ABI=0" -j4
|
||||||
|
make install || true
|
||||||
|
|
||||||
cd python
|
cd python
|
||||||
for i in /opt/python/*
|
for i in /opt/python/*
|
||||||
@ -77,14 +84,16 @@ make_wheel() {
|
|||||||
auditwheel repair $i
|
auditwheel repair $i
|
||||||
done
|
done
|
||||||
|
|
||||||
mv -f wheelhouse/*${TRG}.whl ../../manylinux_wh
|
mv -f wheelhouse/*${TRG}.whl .
|
||||||
|
cd .. && rm -fr tmp
|
||||||
|
cd .. && make distclean
|
||||||
}
|
}
|
||||||
|
|
||||||
if [ "$#" -eq 2 ]; then
|
if [ "$1" = "native" ]; then
|
||||||
eval "$1" $2
|
build
|
||||||
elif [ "$#" -eq 1 ]; then
|
elif [ "$#" -eq 1 ]; then
|
||||||
run_docker quay.io/pypa/manylinux1_${1} ${1}
|
run_docker quay.io/pypa/manylinux1_${1} ${1}
|
||||||
else
|
else
|
||||||
run_docker quay.io/pypa/manylinux1_i686 i686
|
run_docker quay.io/pypa/manylinux1_i686
|
||||||
run_docker quay.io/pypa/manylinux1_x86_64 x86_64
|
run_docker quay.io/pypa/manylinux1_x86_64
|
||||||
fi
|
fi
|
5
tensorflow/.gitignore
vendored
Normal file
5
tensorflow/.gitignore
vendored
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
build/
|
||||||
|
sdist/
|
||||||
|
dist/
|
||||||
|
tmp/
|
||||||
|
*py[cod]
|
0
tensorflow/__init__.py
Normal file
0
tensorflow/__init__.py
Normal file
103
tensorflow/make_py_wheel.sh
Executable file
103
tensorflow/make_py_wheel.sh
Executable file
@ -0,0 +1,103 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Copyright 2018 Google Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.!
|
||||||
|
|
||||||
|
set -e # exit immediately on error
|
||||||
|
set -x # display all commands
|
||||||
|
|
||||||
|
PROTOBUF_VERSION=3.6.0
|
||||||
|
|
||||||
|
run_docker() {
|
||||||
|
cd `dirname $0`
|
||||||
|
docker pull $1
|
||||||
|
docker run --rm -ti --name tf_sentencepiece \
|
||||||
|
-v `pwd`/../:/sentencepiece -w /sentencepiece/tensorflow \
|
||||||
|
-td $1 /bin/bash
|
||||||
|
docker exec tf_sentencepiece bash -c "./build.sh native"
|
||||||
|
docker stop tf_sentencepiece
|
||||||
|
}
|
||||||
|
|
||||||
|
build() {
|
||||||
|
rm -fr tmp
|
||||||
|
mkdir -p tmp
|
||||||
|
|
||||||
|
export PATH="/opt/python/cp27-cp27mu/bin:${PATH}"
|
||||||
|
|
||||||
|
# Installs necessary libraries under `tmp` sub directory.
|
||||||
|
cd tmp
|
||||||
|
|
||||||
|
# Install libtool
|
||||||
|
curl -L -O http://ftpmirror.gnu.org/libtool/libtool-2.4.6.tar.gz
|
||||||
|
tar zxfv libtool-2.4.6.tar.gz
|
||||||
|
cd libtool-2.4.6
|
||||||
|
./configure
|
||||||
|
make -j4
|
||||||
|
make install
|
||||||
|
cd ..
|
||||||
|
|
||||||
|
# Install protobuf
|
||||||
|
curl -L -O https://github.com/google/protobuf/releases/download/v${PROTOBUF_VERSION}/protobuf-cpp-${PROTOBUF_VERSION}.tar.gz
|
||||||
|
tar zxfv protobuf-cpp-${PROTOBUF_VERSION}.tar.gz
|
||||||
|
cd protobuf-${PROTOBUF_VERSION}
|
||||||
|
./configure --disable-shared --with-pic
|
||||||
|
make CXXFLAGS+="-std=c++11 -O3 -D_GLIBCXX_USE_CXX11_ABI=0" \
|
||||||
|
CFLAGS+="-std=c++11 -O3 -D_GLIBCXX_USE_CXX11_ABI=0" -j4
|
||||||
|
make install || true
|
||||||
|
cd ../..
|
||||||
|
|
||||||
|
# Install sentencepiece
|
||||||
|
cd ..
|
||||||
|
make distclean || true
|
||||||
|
./autogen.sh
|
||||||
|
grep -v PKG_CHECK_MODULES configure > tmp
|
||||||
|
mv tmp -f configure
|
||||||
|
chmod +x configure
|
||||||
|
LIBS+="-pthread -L/usr/local/lib -lprotobuf" ./configure --disable-shared --with-pic
|
||||||
|
make CXXFLAGS+="-std=c++11 -O3 -D_GLIBCXX_USE_CXX11_ABI=0" \
|
||||||
|
CFLAGS+="-std=c++11 -O3 -D_GLIBCXX_USE_CXX11_ABI=0" -j4
|
||||||
|
make install || true
|
||||||
|
|
||||||
|
# Builds _sentencepiece_processor_ops.so
|
||||||
|
cd tensorflow
|
||||||
|
pip install tensorflow
|
||||||
|
TF_CFLAGS="-I/opt/python/cp27-cp27mu/lib/python2.7/site-packages/tensorflow/include"
|
||||||
|
TF_LFLAGS="-L/opt/python/cp27-cp27mu/lib/python2.7/site-packages/tensorflow -ltensorflow_framework"
|
||||||
|
|
||||||
|
g++ -std=c++11 -shared \
|
||||||
|
-I../src \
|
||||||
|
-fPIC ${TF_CFLAGS[@]} -O2 \
|
||||||
|
-D_GLIBCXX_USE_CXX11_ABI=0 \
|
||||||
|
-Wl,--whole-archive \
|
||||||
|
/usr/local/lib/libprotobuf.a \
|
||||||
|
/usr/local/lib/libsentencepiece.a \
|
||||||
|
-Wl,--no-whole-archive \
|
||||||
|
sentencepiece_processor_ops.cc \
|
||||||
|
-o tf_sentencepiece/_sentencepiece_processor_ops.so \
|
||||||
|
${TF_LFLAGS[@]}
|
||||||
|
strip tf_sentencepiece/_sentencepiece_processor_ops.so
|
||||||
|
|
||||||
|
# Builds Python manylinux wheel package.
|
||||||
|
python setup.py bdist_wheel --universal --plat-name=linux_x86_64
|
||||||
|
python setup.py sdist
|
||||||
|
|
||||||
|
rm -fr build tf_sentencepiece.egg-info tmp
|
||||||
|
cd .. && make distclean
|
||||||
|
}
|
||||||
|
|
||||||
|
if [ "$1" = "native" ]; then
|
||||||
|
build
|
||||||
|
else
|
||||||
|
run_docker quay.io/pypa/manylinux1_x86_64
|
||||||
|
fi
|
532
tensorflow/sentencepiece_processor_ops.cc
Normal file
532
tensorflow/sentencepiece_processor_ops.cc
Normal file
@ -0,0 +1,532 @@
|
|||||||
|
// Copyright 2016 Google Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.!
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "sentencepiece_processor.h"
|
||||||
|
#include "tensorflow/core/framework/op.h"
|
||||||
|
#include "tensorflow/core/framework/op_kernel.h"
|
||||||
|
#include "tensorflow/core/framework/shape_inference.h"
|
||||||
|
#include "tensorflow/core/framework/tensor_shape.h"
|
||||||
|
|
||||||
|
typedef int int32;
|
||||||
|
typedef long long int int64;
|
||||||
|
|
||||||
|
namespace sentencepiece {
|
||||||
|
using ::tensorflow::DEVICE_CPU;
|
||||||
|
using ::tensorflow::OpKernel;
|
||||||
|
using ::tensorflow::OpKernelConstruction;
|
||||||
|
using ::tensorflow::OpKernelContext;
|
||||||
|
using ::tensorflow::Tensor;
|
||||||
|
using ::tensorflow::TensorShapeUtils;
|
||||||
|
using ::tensorflow::shape_inference::DimensionHandle;
|
||||||
|
using ::tensorflow::shape_inference::InferenceContext;
|
||||||
|
using ::tensorflow::shape_inference::ShapeHandle;
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
// A utility function to convert sentencepiece::util::Status to
|
||||||
|
// ::tensorflow::Status
|
||||||
|
::tensorflow::Status ToTFStatus(const sentencepiece::util::Status& s) {
|
||||||
|
if (s.ok()) return ::tensorflow::Status();
|
||||||
|
return ::tensorflow::Status(static_cast<::tensorflow::error::Code>(s.code()),
|
||||||
|
::tensorflow::string(s.error_message()));
|
||||||
|
}
|
||||||
|
|
||||||
|
// A factory function to initialize SentencePieceProcessor with
|
||||||
|
// OpKernelConstruction `context`.
|
||||||
|
enum InitType { GENERAL, ENCODE, DECODE }; // purpose of processor.
|
||||||
|
|
||||||
|
void InitializeModel(OpKernelConstruction* context,
|
||||||
|
SentencePieceProcessor* sentencepiece_processor,
|
||||||
|
InitType type) {
|
||||||
|
std::string model_file_attr, model_proto_attr;
|
||||||
|
OP_REQUIRES_OK(context, context->GetAttr("model_file", &model_file_attr));
|
||||||
|
OP_REQUIRES_OK(context, context->GetAttr("model_proto", &model_proto_attr));
|
||||||
|
|
||||||
|
if (!model_file_attr.empty()) {
|
||||||
|
OP_REQUIRES(
|
||||||
|
context, model_proto_attr.empty(),
|
||||||
|
::tensorflow::errors::InvalidArgument(
|
||||||
|
"`model_proto` must be empty when `model_file` is specified."));
|
||||||
|
OP_REQUIRES_OK(context,
|
||||||
|
ToTFStatus(sentencepiece_processor->Load(model_file_attr)));
|
||||||
|
} else {
|
||||||
|
// Loads serialized sentencepiece model proto to enable embedding the
|
||||||
|
// relatively small sentencepiece model proto into the tensorflow graph
|
||||||
|
// such that the tensorflow graph is self-contained.
|
||||||
|
OP_REQUIRES_OK(context,
|
||||||
|
ToTFStatus(sentencepiece_processor->LoadFromSerializedProto(
|
||||||
|
model_proto_attr)));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sets extra options to add <s>, </s>.
|
||||||
|
std::string options;
|
||||||
|
auto add_options = [&options, &context](const std::string& name,
|
||||||
|
const std::string& v) {
|
||||||
|
bool flag = false;
|
||||||
|
OP_REQUIRES_OK(context, context->GetAttr(name, &flag));
|
||||||
|
if (flag) {
|
||||||
|
if (!options.empty()) options += ':';
|
||||||
|
options += v;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
if (type == ENCODE || type == DECODE) {
|
||||||
|
add_options("reverse", "reverse");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (type == ENCODE) {
|
||||||
|
add_options("add_bos", "bos");
|
||||||
|
add_options("add_eos", "eos");
|
||||||
|
OP_REQUIRES_OK(
|
||||||
|
context,
|
||||||
|
ToTFStatus(sentencepiece_processor->SetEncodeExtraOptions(options)));
|
||||||
|
} else if (type == DECODE) {
|
||||||
|
OP_REQUIRES_OK(
|
||||||
|
context,
|
||||||
|
ToTFStatus(sentencepiece_processor->SetDecodeExtraOptions(options)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
class SentencePieceGetPieceSizeOp : public OpKernel {
|
||||||
|
public:
|
||||||
|
explicit SentencePieceGetPieceSizeOp(OpKernelConstruction* context)
|
||||||
|
: OpKernel(context) {
|
||||||
|
SentencePieceProcessor sp;
|
||||||
|
InitializeModel(context, &sp, GENERAL);
|
||||||
|
vocab_size_ = sp.GetPieceSize();
|
||||||
|
}
|
||||||
|
|
||||||
|
void Compute(OpKernelContext* context) override {
|
||||||
|
Tensor* vocab_size_tensor = nullptr;
|
||||||
|
OP_REQUIRES_OK(context,
|
||||||
|
context->allocate_output(0, {}, &vocab_size_tensor));
|
||||||
|
vocab_size_tensor->scalar<int32>()() = vocab_size_;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
int32 vocab_size_ = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename S, typename T>
|
||||||
|
class SentencePieceConvertPieceOp : public OpKernel {
|
||||||
|
public:
|
||||||
|
explicit SentencePieceConvertPieceOp(OpKernelConstruction* context)
|
||||||
|
: OpKernel(context) {
|
||||||
|
InitializeModel(context, &sentencepiece_processor_, GENERAL);
|
||||||
|
}
|
||||||
|
|
||||||
|
void Compute(OpKernelContext* context) override {
|
||||||
|
const Tensor* input_tensor = nullptr;
|
||||||
|
OP_REQUIRES_OK(context, context->input("input", &input_tensor));
|
||||||
|
|
||||||
|
Tensor* output_tensor = nullptr;
|
||||||
|
OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor->shape(),
|
||||||
|
&output_tensor));
|
||||||
|
for (int i = 0; i < input_tensor->NumElements(); ++i)
|
||||||
|
output_tensor->flat<T>()(i) = Convert(input_tensor->flat<S>()(i));
|
||||||
|
}
|
||||||
|
|
||||||
|
int32 Convert(const std::string& piece) const {
|
||||||
|
return sentencepiece_processor_.PieceToId(piece);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string Convert(int32 id) const {
|
||||||
|
if (id >= 0 && id < sentencepiece_processor_.GetPieceSize()) {
|
||||||
|
return sentencepiece_processor_.IdToPiece(id);
|
||||||
|
}
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
SentencePieceProcessor sentencepiece_processor_;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
class SentencePieceEncodeOpBase : public OpKernel {
|
||||||
|
public:
|
||||||
|
explicit SentencePieceEncodeOpBase(OpKernelConstruction* context)
|
||||||
|
: OpKernel(context) {
|
||||||
|
InitializeModel(context, &sentencepiece_processor_, ENCODE);
|
||||||
|
}
|
||||||
|
|
||||||
|
void Compute(OpKernelContext* context) override {
|
||||||
|
const Tensor* input_tensor = nullptr;
|
||||||
|
|
||||||
|
OP_REQUIRES_OK(context, context->input("input", &input_tensor));
|
||||||
|
OP_REQUIRES(context, TensorShapeUtils::IsVector(input_tensor->shape()),
|
||||||
|
::tensorflow::errors::InvalidArgument(
|
||||||
|
"`input` must be a vector, got shape: ",
|
||||||
|
input_tensor->shape().DebugString()));
|
||||||
|
const auto& input_sentences = input_tensor->vec<std::string>();
|
||||||
|
const int64 batch_size = input_sentences.size();
|
||||||
|
|
||||||
|
const Tensor* nbest_size_tensor = nullptr;
|
||||||
|
OP_REQUIRES_OK(context, context->input("nbest_size", &nbest_size_tensor));
|
||||||
|
OP_REQUIRES(context, nbest_size_tensor->dims() <= 1,
|
||||||
|
::tensorflow::errors::InvalidArgument(
|
||||||
|
"`nbest_size` must be a scalar or vector. got shape: ",
|
||||||
|
nbest_size_tensor->shape().DebugString()));
|
||||||
|
if (nbest_size_tensor->dims() == 1) {
|
||||||
|
OP_REQUIRES(
|
||||||
|
context, batch_size == nbest_size_tensor->dim_size(0),
|
||||||
|
::tensorflow::errors::InvalidArgument(
|
||||||
|
"`nbest_size` must have the same batch size as `input`."));
|
||||||
|
}
|
||||||
|
|
||||||
|
const Tensor* alpha_tensor = nullptr;
|
||||||
|
OP_REQUIRES_OK(context, context->input("alpha", &alpha_tensor));
|
||||||
|
OP_REQUIRES(context, alpha_tensor->dims() <= 1,
|
||||||
|
::tensorflow::errors::InvalidArgument(
|
||||||
|
"`alpha` must be a scalar or vector, got shape: ",
|
||||||
|
alpha_tensor->shape().DebugString()));
|
||||||
|
if (alpha_tensor->dims() == 1) {
|
||||||
|
OP_REQUIRES(context, batch_size == alpha_tensor->dim_size(0),
|
||||||
|
::tensorflow::errors::InvalidArgument(
|
||||||
|
"`alpha` must have the same batch size as `input`."));
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::vector<T>> pieces(batch_size);
|
||||||
|
|
||||||
|
for (int64 i = 0; i < batch_size; ++i) {
|
||||||
|
const int32 nbest_size = nbest_size_tensor->dims() == 1
|
||||||
|
? nbest_size_tensor->vec<int32>()(i)
|
||||||
|
: nbest_size_tensor->scalar<int32>()();
|
||||||
|
if (nbest_size == 0 || nbest_size == 1) {
|
||||||
|
OP_REQUIRES_OK(context, ToTFStatus(sentencepiece_processor_.Encode(
|
||||||
|
input_sentences(i), &pieces[i])));
|
||||||
|
} else {
|
||||||
|
const float alpha = alpha_tensor->dims() == 1
|
||||||
|
? alpha_tensor->vec<float>()(i)
|
||||||
|
: alpha_tensor->scalar<float>()();
|
||||||
|
OP_REQUIRES_OK(context,
|
||||||
|
ToTFStatus(sentencepiece_processor_.SampleEncode(
|
||||||
|
input_sentences(i), nbest_size, alpha, &pieces[i])));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
MakeOutputTensor(context, pieces);
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
virtual void MakeOutputTensor(OpKernelContext* context,
|
||||||
|
const std::vector<std::vector<T>>& pieces) = 0;
|
||||||
|
|
||||||
|
SentencePieceProcessor sentencepiece_processor_;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
class SentencePieceEncodeSparseOp : public SentencePieceEncodeOpBase<T> {
|
||||||
|
public:
|
||||||
|
explicit SentencePieceEncodeSparseOp(OpKernelConstruction* context)
|
||||||
|
: SentencePieceEncodeOpBase<T>(context) {}
|
||||||
|
|
||||||
|
private:
|
||||||
|
void MakeOutputTensor(OpKernelContext* context,
|
||||||
|
const std::vector<std::vector<T>>& pieces) override {
|
||||||
|
const int64 batch_size = pieces.size();
|
||||||
|
|
||||||
|
int64 max_sequence_length = 0;
|
||||||
|
int64 indices_size = 0;
|
||||||
|
for (int row = 0; row < batch_size; ++row) {
|
||||||
|
const int col_size = pieces[row].size();
|
||||||
|
max_sequence_length = std::max<int64>(col_size, max_sequence_length);
|
||||||
|
indices_size += col_size;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Creates the indices output tensor.
|
||||||
|
Tensor* indices_tensor = nullptr;
|
||||||
|
OP_REQUIRES_OK(context, context->allocate_output(0, {indices_size, 2},
|
||||||
|
&indices_tensor));
|
||||||
|
|
||||||
|
auto indices_tensor_output = indices_tensor->matrix<int64>();
|
||||||
|
int item_idx = 0;
|
||||||
|
for (int row = 0; row < batch_size; ++row) {
|
||||||
|
for (int col = 0; col < pieces[row].size(); ++col) {
|
||||||
|
indices_tensor_output(item_idx, 0) = row;
|
||||||
|
indices_tensor_output(item_idx, 1) = col;
|
||||||
|
++item_idx;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Creates the values output tensor.
|
||||||
|
Tensor* values_tensor = nullptr;
|
||||||
|
OP_REQUIRES_OK(context,
|
||||||
|
context->allocate_output(1, {indices_size}, &values_tensor));
|
||||||
|
|
||||||
|
auto values_tensor_output = values_tensor->flat<T>();
|
||||||
|
item_idx = 0;
|
||||||
|
for (int row = 0; row < batch_size; ++row) {
|
||||||
|
std::copy(pieces[row].begin(), pieces[row].end(),
|
||||||
|
&values_tensor_output(item_idx));
|
||||||
|
item_idx += pieces[row].size();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Creates the shape output tensor.
|
||||||
|
Tensor* shape_tensor = nullptr;
|
||||||
|
OP_REQUIRES_OK(context, context->allocate_output(2, {2}, &shape_tensor));
|
||||||
|
|
||||||
|
auto shape_tensor_output = shape_tensor->flat<int64>();
|
||||||
|
shape_tensor_output(0) = batch_size;
|
||||||
|
shape_tensor_output(1) = max_sequence_length;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
class SentencePieceEncodeDenseOp : public SentencePieceEncodeOpBase<T> {
|
||||||
|
public:
|
||||||
|
explicit SentencePieceEncodeDenseOp(OpKernelConstruction* context)
|
||||||
|
: SentencePieceEncodeOpBase<T>(context) {}
|
||||||
|
|
||||||
|
private:
|
||||||
|
void MakeOutputTensor(OpKernelContext* context,
|
||||||
|
const std::vector<std::vector<T>>& pieces) override {
|
||||||
|
const int64 batch_size = pieces.size();
|
||||||
|
|
||||||
|
int64 max_sequence_length = 0;
|
||||||
|
for (int row = 0; row < batch_size; ++row) {
|
||||||
|
max_sequence_length =
|
||||||
|
std::max<int64>(pieces[row].size(), max_sequence_length);
|
||||||
|
}
|
||||||
|
|
||||||
|
Tensor* values_tensor = nullptr;
|
||||||
|
Tensor* length_tensor = nullptr;
|
||||||
|
|
||||||
|
OP_REQUIRES_OK(
|
||||||
|
context, context->allocate_output(0, {batch_size, max_sequence_length},
|
||||||
|
&values_tensor));
|
||||||
|
OP_REQUIRES_OK(context,
|
||||||
|
context->allocate_output(1, {batch_size}, &length_tensor));
|
||||||
|
|
||||||
|
auto values_tensor_output = values_tensor->matrix<T>();
|
||||||
|
auto length_tensor_output = length_tensor->vec<int32>();
|
||||||
|
|
||||||
|
for (int row = 0; row < batch_size; ++row) {
|
||||||
|
for (int col = 0; col < max_sequence_length; ++col) {
|
||||||
|
values_tensor_output(row, col) =
|
||||||
|
col < pieces[row].size() ? pieces[row][col] : T();
|
||||||
|
}
|
||||||
|
length_tensor_output(row) = pieces[row].size();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
class SentencePieceDecodeOp : public OpKernel {
|
||||||
|
public:
|
||||||
|
explicit SentencePieceDecodeOp(OpKernelConstruction* context)
|
||||||
|
: OpKernel(context) {
|
||||||
|
InitializeModel(context, &sentencepiece_processor_, DECODE);
|
||||||
|
}
|
||||||
|
|
||||||
|
void Compute(OpKernelContext* context) override {
|
||||||
|
const Tensor* input_tensor = nullptr;
|
||||||
|
const Tensor* length_tensor = nullptr;
|
||||||
|
|
||||||
|
OP_REQUIRES_OK(context, context->input("input", &input_tensor));
|
||||||
|
OP_REQUIRES(context, TensorShapeUtils::IsMatrix(input_tensor->shape()),
|
||||||
|
::tensorflow::errors::InvalidArgument(
|
||||||
|
"`input` must be a 2-D matrix. got shape: ",
|
||||||
|
input_tensor->shape().DebugString()));
|
||||||
|
OP_REQUIRES_OK(context, context->input("sequence_length", &length_tensor));
|
||||||
|
OP_REQUIRES(context, TensorShapeUtils::IsVector(length_tensor->shape()),
|
||||||
|
::tensorflow::errors::InvalidArgument(
|
||||||
|
"`sequence_length` must be a vector. got shape: ",
|
||||||
|
length_tensor->shape().DebugString()));
|
||||||
|
OP_REQUIRES(
|
||||||
|
context, input_tensor->dim_size(0) == length_tensor->dim_size(0),
|
||||||
|
::tensorflow::errors::InvalidArgument(
|
||||||
|
"`sequence_length` must have the same batch size as `input`."));
|
||||||
|
|
||||||
|
const auto& input_sentences = input_tensor->matrix<T>();
|
||||||
|
const auto& sequence_length = length_tensor->vec<int32>();
|
||||||
|
const int64 batch_size = input_tensor->dim_size(0);
|
||||||
|
const int max_sequence_length = input_tensor->dim_size(1);
|
||||||
|
|
||||||
|
Tensor* values_tensor = nullptr;
|
||||||
|
OP_REQUIRES_OK(context,
|
||||||
|
context->allocate_output(0, {batch_size}, &values_tensor));
|
||||||
|
auto values_tensor_output = values_tensor->vec<std::string>();
|
||||||
|
|
||||||
|
for (int64 i = 0; i < batch_size; ++i) {
|
||||||
|
OP_REQUIRES(context,
|
||||||
|
(sequence_length(i) >= 0 &&
|
||||||
|
sequence_length(i) <= max_sequence_length),
|
||||||
|
::tensorflow::errors::InvalidArgument(
|
||||||
|
"`sequence_length` is out-of-range."));
|
||||||
|
const std::vector<T> pieces(&input_sentences(i, 0),
|
||||||
|
&input_sentences(i, 0) + sequence_length(i));
|
||||||
|
OP_REQUIRES_OK(context, ToTFStatus(sentencepiece_processor_.Decode(
|
||||||
|
pieces, &values_tensor_output(i))));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
SentencePieceProcessor sentencepiece_processor_;
|
||||||
|
};
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
// The snake case of this variables are used as the function names.
|
||||||
|
constexpr char kGetPieceSizeOpName[] = "SentencepieceGetPieceSize";
|
||||||
|
constexpr char kPieceToIdOpName[] = "SentencepiecePieceToId";
|
||||||
|
constexpr char kIdToPieceOpName[] = "SentencepieceIdToPiece";
|
||||||
|
constexpr char kEncodeDenseOpName[] = "SentencepieceEncodeDense";
|
||||||
|
constexpr char kEncodeSparseOpName[] = "SentencepieceEncodeSparse";
|
||||||
|
constexpr char kDecodeOpName[] = "SentencepieceDecode";
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
REGISTER_OP(kGetPieceSizeOpName)
|
||||||
|
.Output("vocab_size: int32")
|
||||||
|
.Attr("model_file: string = ''")
|
||||||
|
.Attr("model_proto: string = ''")
|
||||||
|
.SetShapeFn([](InferenceContext* c) {
|
||||||
|
c->set_output(0, c->MakeShape({}));
|
||||||
|
return ::tensorflow::Status::OK();
|
||||||
|
});
|
||||||
|
|
||||||
|
REGISTER_KERNEL_BUILDER(Name(kGetPieceSizeOpName).Device(DEVICE_CPU),
|
||||||
|
SentencePieceGetPieceSizeOp);
|
||||||
|
|
||||||
|
REGISTER_OP(kPieceToIdOpName)
|
||||||
|
.Input("input: string")
|
||||||
|
.Output("values: int32")
|
||||||
|
.Attr("model_file: string = ''")
|
||||||
|
.Attr("model_proto: string = ''")
|
||||||
|
.SetShapeFn([](InferenceContext* c) {
|
||||||
|
c->set_output(0, c->input(0));
|
||||||
|
return ::tensorflow::Status::OK();
|
||||||
|
});
|
||||||
|
|
||||||
|
REGISTER_KERNEL_BUILDER(Name(kPieceToIdOpName).Device(DEVICE_CPU),
|
||||||
|
SentencePieceConvertPieceOp<std::string, int32>);
|
||||||
|
|
||||||
|
REGISTER_OP(kIdToPieceOpName)
|
||||||
|
.Input("input: int32")
|
||||||
|
.Output("values: string")
|
||||||
|
.Attr("model_file: string = ''")
|
||||||
|
.Attr("model_proto: string = ''")
|
||||||
|
.SetShapeFn([](InferenceContext* c) {
|
||||||
|
c->set_output(0, c->input(0));
|
||||||
|
return ::tensorflow::Status::OK();
|
||||||
|
});
|
||||||
|
|
||||||
|
REGISTER_KERNEL_BUILDER(Name(kIdToPieceOpName).Device(DEVICE_CPU),
|
||||||
|
SentencePieceConvertPieceOp<int32, std::string>);
|
||||||
|
|
||||||
|
REGISTER_OP(kEncodeDenseOpName)
|
||||||
|
.Attr("out_type: {int32, string} = DT_INT32")
|
||||||
|
.Input("input: string")
|
||||||
|
.Input("nbest_size: int32")
|
||||||
|
.Input("alpha: float")
|
||||||
|
.Output("values: out_type")
|
||||||
|
.Output("sequence_length: int32")
|
||||||
|
.Attr("model_file: string = ''")
|
||||||
|
.Attr("model_proto: string = ''")
|
||||||
|
.Attr("reverse: bool = false")
|
||||||
|
.Attr("add_bos: bool = false")
|
||||||
|
.Attr("add_eos: bool = false")
|
||||||
|
.SetShapeFn([](InferenceContext* c) {
|
||||||
|
ShapeHandle input, nbest, alpha;
|
||||||
|
TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &input));
|
||||||
|
TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(1), 1, &nbest));
|
||||||
|
TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(2), 1, &alpha));
|
||||||
|
DimensionHandle batch_size = c->Dim(input, 0);
|
||||||
|
if (c->Rank(nbest) == 1)
|
||||||
|
TF_RETURN_IF_ERROR(c->Merge(batch_size, c->Dim(nbest, 0), &batch_size));
|
||||||
|
if (c->Rank(alpha) == 1)
|
||||||
|
TF_RETURN_IF_ERROR(c->Merge(batch_size, c->Dim(alpha, 0), &batch_size));
|
||||||
|
c->set_output(0, c->MakeShape({batch_size, c->UnknownDim()}));
|
||||||
|
c->set_output(1, c->MakeShape({batch_size}));
|
||||||
|
return ::tensorflow::Status::OK();
|
||||||
|
});
|
||||||
|
|
||||||
|
REGISTER_KERNEL_BUILDER(Name(kEncodeDenseOpName)
|
||||||
|
.Device(DEVICE_CPU)
|
||||||
|
.TypeConstraint<int32>("out_type"),
|
||||||
|
SentencePieceEncodeDenseOp<int32>);
|
||||||
|
|
||||||
|
REGISTER_KERNEL_BUILDER(Name(kEncodeDenseOpName)
|
||||||
|
.Device(DEVICE_CPU)
|
||||||
|
.TypeConstraint<std::string>("out_type"),
|
||||||
|
SentencePieceEncodeDenseOp<std::string>);
|
||||||
|
|
||||||
|
REGISTER_OP(kEncodeSparseOpName)
|
||||||
|
.Attr("out_type: {int32, string} = DT_INT32")
|
||||||
|
.Input("input: string")
|
||||||
|
.Input("nbest_size: int32")
|
||||||
|
.Input("alpha: float")
|
||||||
|
.Output("indices: int64")
|
||||||
|
.Output("values: out_type")
|
||||||
|
.Output("dense_shape: int64")
|
||||||
|
.Attr("model_file: string = ''")
|
||||||
|
.Attr("model_proto: string = ''")
|
||||||
|
.Attr("reverse: bool = false")
|
||||||
|
.Attr("add_bos: bool = false")
|
||||||
|
.Attr("add_eos: bool = false")
|
||||||
|
.SetShapeFn([](InferenceContext* c) {
|
||||||
|
ShapeHandle input, nbest, alpha;
|
||||||
|
TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &input));
|
||||||
|
TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(1), 1, &nbest));
|
||||||
|
TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(2), 1, &alpha));
|
||||||
|
DimensionHandle batch_size = c->Dim(input, 0);
|
||||||
|
if (c->Rank(nbest) == 1)
|
||||||
|
TF_RETURN_IF_ERROR(c->Merge(batch_size, c->Dim(nbest, 0), &batch_size));
|
||||||
|
if (c->Rank(alpha) == 1)
|
||||||
|
TF_RETURN_IF_ERROR(c->Merge(batch_size, c->Dim(alpha, 0), &batch_size));
|
||||||
|
c->set_output(0, c->MakeShape({c->UnknownDim(), 2}));
|
||||||
|
c->set_output(1, c->MakeShape({c->UnknownDim()}));
|
||||||
|
c->set_output(2, c->MakeShape({2}));
|
||||||
|
return ::tensorflow::Status::OK();
|
||||||
|
});
|
||||||
|
|
||||||
|
REGISTER_KERNEL_BUILDER(Name(kEncodeSparseOpName)
|
||||||
|
.Device(DEVICE_CPU)
|
||||||
|
.TypeConstraint<int32>("out_type"),
|
||||||
|
SentencePieceEncodeSparseOp<int32>);
|
||||||
|
|
||||||
|
REGISTER_KERNEL_BUILDER(Name(kEncodeSparseOpName)
|
||||||
|
.Device(DEVICE_CPU)
|
||||||
|
.TypeConstraint<std::string>("out_type"),
|
||||||
|
SentencePieceEncodeSparseOp<std::string>);
|
||||||
|
|
||||||
|
REGISTER_OP(kDecodeOpName)
|
||||||
|
.Attr("T: {int32, string}")
|
||||||
|
.Input("input: T")
|
||||||
|
.Input("sequence_length: int32")
|
||||||
|
.Output("values: string")
|
||||||
|
.Attr("model_file: string = ''")
|
||||||
|
.Attr("model_proto: string = ''")
|
||||||
|
.Attr("reverse: bool = false")
|
||||||
|
.SetShapeFn([](InferenceContext* c) {
|
||||||
|
ShapeHandle input, sequence_length;
|
||||||
|
TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &input));
|
||||||
|
TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &sequence_length));
|
||||||
|
DimensionHandle batch_size = c->Dim(input, 0);
|
||||||
|
TF_RETURN_IF_ERROR(
|
||||||
|
c->Merge(batch_size, c->Dim(sequence_length, 0), &batch_size));
|
||||||
|
c->set_output(0, c->MakeShape({batch_size}));
|
||||||
|
return ::tensorflow::Status::OK();
|
||||||
|
});
|
||||||
|
|
||||||
|
REGISTER_KERNEL_BUILDER(
|
||||||
|
Name(kDecodeOpName).Device(DEVICE_CPU).TypeConstraint<int32>("T"),
|
||||||
|
SentencePieceDecodeOp<int32>);
|
||||||
|
|
||||||
|
REGISTER_KERNEL_BUILDER(
|
||||||
|
Name(kDecodeOpName).Device(DEVICE_CPU).TypeConstraint<std::string>("T"),
|
||||||
|
SentencePieceDecodeOp<std::string>);
|
||||||
|
} // namespace sentencepiece
|
46
tensorflow/setup.py
Executable file
46
tensorflow/setup.py
Executable file
@ -0,0 +1,46 @@
|
|||||||
|
#!/usr/bin/env python

# Copyright 2018 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.!

# Packaging script for the tf_sentencepiece wheel: ships the Python
# wrappers plus the prebuilt _sentencepiece_processor_ops.so kernel library.

from setuptools import setup
from setuptools import find_packages
import string  # NOTE(review): appears unused in this file -- confirm and remove.
import sys

# Make ./test importable so the `test_suite` entry below can resolve
# `tf_sentencepiece_test.suite` when running `python setup.py test`.
sys.path.append('./test')

setup(name = 'tf_sentencepiece',
      author = 'Taku Kudo',
      author_email='taku@google.com',
      description = 'SentencePiece Encode/Decode ops for TensorFlow',
      version='0.1.1',
      url = 'https://github.com/google/sentencepiece',
      license = 'Apache',
      platforms = 'Unix',
      # Exclude the test package from the wheel itself.
      packages=find_packages(exclude=['test']),
      # Bundle the compiled custom-op shared library with the package.
      package_data={'tf_sentencepiece': ['_sentencepiece_processor_ops.so']},
      classifiers = [
          'Development Status :: 5 - Production/Stable',
          'Environment :: Console',
          'Intended Audience :: Developers',
          'Intended Audience :: Science/Research',
          'License :: OSI Approved :: Apache Software License',
          'Operating System :: Unix',
          'Programming Language :: Python',
          'Topic :: Text Processing :: Linguistic',
          'Topic :: Software Development :: Libraries :: Python Modules'
      ],
      keywords='tensorflow machine learning sentencepiece NLP segmentation',
      test_suite = 'tf_sentencepiece_test.suite')
|
0
tensorflow/test/__init__.py
Normal file
0
tensorflow/test/__init__.py
Normal file
283
tensorflow/test/tf_sentencepiece_test.py
Executable file
283
tensorflow/test/tf_sentencepiece_test.py
Executable file
@ -0,0 +1,283 @@
|
|||||||
|
#!/usr/bin/python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import itertools as it
|
||||||
|
import os
|
||||||
|
import unittest
|
||||||
|
import tensorflow as tf
|
||||||
|
import sentencepiece as spm
|
||||||
|
import tf_sentencepiece as tfspm
|
||||||
|
|
||||||
|
class SentencePieceProcssorOpTest(unittest.TestCase):
  """End-to-end tests for the tf_sentencepiece TensorFlow ops.

  Each test compares the op outputs against the reference results produced
  by the plain `sentencepiece` Python bindings on the same model.
  """

  def _getSentencePieceModelFile(self):
    # Reuses the Japanese test model shipped with the python bindings.
    # NOTE(review): relative path -- tests must run from the tensorflow/ dir.
    return '../python/test/test_ja_model.model'

  def _getExpected(self, processor, reverse=False, add_bos=False,
                   add_eos=False, padding=''):
    """Computes reference pieces/ids/lengths with the spm processor.

    Returns (sentences, pieces, ids, seq_len) where pieces/ids are padded
    to the max sequence length (pieces with `padding`, ids with 0).
    """
    options = []
    if reverse:
      options.append('reverse')
    if add_bos:
      options.append('bos')
    if add_eos:
      options.append('eos')

    # Apply the same extra options the ops under test will use.
    processor.SetEncodeExtraOptions(':'.join(options))
    processor.SetDecodeExtraOptions(':'.join(options))

    sentences = ['Hello world.', 'I have a pen.',
                 'I saw a girl with a telescope.']
    pieces = []
    ids = []
    seq_len = []

    for s in sentences:
      x = processor.EncodeAsPieces(s)
      y = processor.EncodeAsIds(s)
      pieces.append(x)
      ids.append(y)
      seq_len.append(len(x))
      # Piece and id encodings must have identical lengths.
      self.assertEqual(len(x), len(y))

    # padding
    max_len = max(seq_len)
    pieces = [x + [padding] * (max_len - len(x)) for x in pieces]
    ids = [x + [0] * (max_len - len(x)) for x in ids]

    return sentences, pieces, ids, seq_len

  def testGetPieceSize(self):
    """piece_size op matches the processor's vocabulary size."""
    sentencepiece_model_file = self._getSentencePieceModelFile()
    processor = spm.SentencePieceProcessor()
    processor.Load(sentencepiece_model_file)

    with tf.Session():
      s = tfspm.piece_size(
          model_file=sentencepiece_model_file)
      self.assertEqual(s.eval(), processor.GetPieceSize())

  def testConvertPiece(self):
    """piece_to_id / id_to_piece round-trip on matrix, vector and scalar."""
    sentencepiece_model_file = self._getSentencePieceModelFile()
    processor = spm.SentencePieceProcessor()
    processor.Load(sentencepiece_model_file)
    (sentences, expected_pieces,
     expected_ids, expected_seq_len) = self._getExpected(processor,
                                                         padding='<unk>')

    with tf.Session():
      ids_matrix = tfspm.piece_to_id(
          tf.constant(expected_pieces),
          model_file=sentencepiece_model_file)
      ids_vec = tfspm.piece_to_id(
          tf.constant(expected_pieces[0]),
          model_file=sentencepiece_model_file)
      ids_scalar = tfspm.piece_to_id(
          tf.constant(expected_pieces[0][0]),
          model_file=sentencepiece_model_file)

      self.assertEqual(ids_matrix.eval().tolist(), expected_ids)
      self.assertEqual(ids_vec.eval().tolist(), expected_ids[0])
      self.assertEqual(ids_scalar.eval(), expected_ids[0][0])

      pieces_matrix = tfspm.id_to_piece(
          tf.constant(expected_ids),
          model_file=sentencepiece_model_file)
      pieces_vec = tfspm.id_to_piece(
          tf.constant(expected_ids[0]),
          model_file=sentencepiece_model_file)
      pieces_scalar = tfspm.id_to_piece(
          tf.constant(expected_ids[0][0]),
          model_file=sentencepiece_model_file)

      self.assertEqual(pieces_matrix.eval().tolist(), expected_pieces)
      self.assertEqual(pieces_vec.eval().tolist(), expected_pieces[0])
      self.assertEqual(pieces_scalar.eval(), expected_pieces[0][0])

  def testEncodeAndDecode(self):
    """Dense encode/decode round-trips for all reverse/bos/eos combos."""
    sentencepiece_model_file = self._getSentencePieceModelFile()
    processor = spm.SentencePieceProcessor()
    processor.Load(sentencepiece_model_file)

    with tf.Session():
      # Exercise every combination of the three boolean options.
      for reverse, add_bos, add_eos in list(it.product(
          (True, False), repeat=3)):
        (sentences, expected_pieces,
         expected_ids, expected_seq_len) = self._getExpected(
             processor, reverse, add_bos, add_eos)

        # Encode sentences into pieces/ids.
        s = tf.constant(sentences)
        pieces, seq_len1 = tfspm.encode(
            s, model_file=sentencepiece_model_file,
            reverse=reverse, add_bos=add_bos, add_eos=add_eos,
            out_type=tf.string)
        ids, seq_len2 = tfspm.encode(
            s, model_file=sentencepiece_model_file,
            reverse=reverse, add_bos=add_bos, add_eos=add_eos)

        self.assertEqual(pieces.eval().tolist(), expected_pieces)
        self.assertEqual(ids.eval().tolist(), expected_ids)
        self.assertEqual(seq_len1.eval().tolist(), expected_seq_len)
        self.assertEqual(seq_len2.eval().tolist(), expected_seq_len)

        # Decode pieces into sentences/ids.
        pieces = tf.constant(expected_pieces)
        ids = tf.constant(expected_ids)
        seq_len = tf.constant(expected_seq_len, dtype=tf.int32)
        decoded_sentences1 = tfspm.decode(
            pieces, seq_len, model_file=sentencepiece_model_file,
            reverse=reverse)
        decoded_sentences2 = tfspm.decode(
            ids, seq_len, model_file=sentencepiece_model_file,
            reverse=reverse)

        self.assertEqual(decoded_sentences1.eval().tolist(), sentences)
        self.assertEqual(decoded_sentences2.eval().tolist(), sentences)

  def testSampleEncodeAndDecode(self):
    """Sampled encodings still decode back to the original sentences."""
    sentencepiece_model_file = self._getSentencePieceModelFile()
    processor = spm.SentencePieceProcessor()
    processor.Load(sentencepiece_model_file)
    sentences, _, _, _ = self._getExpected(processor)

    with tf.Session():
      for n, a in [(-1, 0.1), (64, 0.1), (0, 0.0)]:
        # Round trip test.
        nbest_size = tf.constant(n)
        alpha = tf.constant(a)
        s = tf.constant(sentences)

        pieces, seq_len1 = tfspm.encode(
            s, nbest_size=nbest_size, alpha=alpha,
            model_file=sentencepiece_model_file, out_type=tf.string)
        ids, seq_len2 = tfspm.encode(
            s, nbest_size=nbest_size, alpha=alpha,
            model_file=sentencepiece_model_file)
        decoded_sentences1 = tfspm.decode(
            pieces, seq_len1, model_file=sentencepiece_model_file)
        decoded_sentences2 = tfspm.decode(
            ids, seq_len2, model_file=sentencepiece_model_file)

        self.assertEqual(decoded_sentences1.eval().tolist(), sentences)
        self.assertEqual(decoded_sentences2.eval().tolist(), sentences)

  def testEncodeAndDecodeSparse(self):
    """Sparse encode densifies to the same padded pieces/ids as reference."""
    sentencepiece_model_file = self._getSentencePieceModelFile()
    processor = spm.SentencePieceProcessor()
    processor.Load(sentencepiece_model_file)

    with tf.Session():
      for reverse, add_bos, add_eos in list(it.product(
          (True, False), repeat=3)):
        (sentences, expected_pieces, expected_ids,
         _) = self._getExpected(processor, reverse, add_bos, add_eos)

        # Encode sentences into sparse pieces/ids.
        s = tf.constant(sentences)
        pieces = tfspm.encode_sparse(
            s, model_file=sentencepiece_model_file,
            reverse=reverse, add_bos=add_bos, add_eos=add_eos,
            out_type=tf.string)
        ids = tfspm.encode_sparse(
            s, model_file=sentencepiece_model_file,
            reverse=reverse, add_bos=add_bos, add_eos=add_eos)
        # Default values match the padding used by _getExpected.
        pieces = tf.sparse_tensor_to_dense(pieces, default_value='')
        ids = tf.sparse_tensor_to_dense(ids, default_value=0)

        self.assertEqual(ids.eval().tolist(), expected_ids)
        self.assertEqual(pieces.eval().tolist(), expected_pieces)

  def testLoadModelProto(self):
    """Ops accept a serialized model proto instead of a file path."""
    # Makes a serialized model proto.
    model_proto = open(self._getSentencePieceModelFile(), 'rb').read()
    with tf.Session() as sess:
      sentences = ['Hello world.']
      a = tf.constant(sentences)
      sess.run(tfspm.encode(
          a, model_proto=model_proto,
          out_type=tf.string))

  def testInvalidModelPath(self):
    """A missing model file surfaces as NotFoundError at run time."""
    with tf.Session() as sess:
      with self.assertRaises(tf.errors.NotFoundError):
        sentences = ['Hello world.']
        a = tf.constant(sentences)
        sess.run(tfspm.encode(
            a, model_file='invalid path', out_type=tf.string))

  def testInvalidModelProto(self):
    """An unparsable model proto surfaces as InternalError at run time."""
    with tf.Session() as sess:
      with self.assertRaises(tf.errors.InternalError):
        sentences = ['Hello world.']
        a = tf.constant(sentences)
        sess.run(tfspm.encode(
            a, model_proto='invalid proto', out_type=tf.string))

  def testInvalidInput(self):
    """Shape validation: per-batch scalars/vectors OK, wrong ranks raise."""
    sentences = ['Hello world.', 'This is a test.']
    ids = [[0,1],[2,3]]
    model_file = self._getSentencePieceModelFile()
    with tf.Session() as sess:
      a = tf.constant(sentences)
      b = tf.constant(ids)

      # Vector alpha/nbest_size with one entry per sentence is accepted.
      alpha = tf.constant([1.0, 2.0])
      sess.run(tfspm.encode(
          a, model_file=model_file, alpha=alpha, name='foo'))

      nbest_size = tf.constant([1, 2], dtype=tf.int32)
      sess.run(tfspm.encode(
          a, model_file=model_file, nbest_size=nbest_size, name='foo'))

      # Scalars broadcast over the batch and are also accepted.
      alpha = tf.constant(1.0)
      sess.run(tfspm.encode(
          a, model_file=model_file, alpha=alpha, name='foo'))

      nbest_size = tf.constant(10, dtype=tf.int32)
      sess.run(tfspm.encode(
          a, model_file=model_file, nbest_size=nbest_size, name='foo'))

      sess.run(tfspm.decode(
          b, sequence_length=tf.constant([2, 2]), model_file=model_file))

      # Vector length mismatching the batch size must raise.
      with self.assertRaises(ValueError):
        a = tf.constant(sentences)
        alpha = tf.constant([1.0, 2.0, 3.0])
        sess.run(tfspm.encode(
            a, model_file=model_file, alpha=alpha))
      with self.assertRaises(ValueError):
        a = tf.constant(sentences)
        nbest_size = tf.constant([1, 2, 3], dtype=tf.int32)
        sess.run(tfspm.encode(
            a, model_file=model_file, nbest_size=nbest_size))
      # Rank-2 alpha/nbest_size must raise.
      with self.assertRaises(ValueError):
        a = tf.constant(sentences)
        alpha = tf.constant([[1.0], [2.0]])
        sess.run(tfspm.encode(
            a, model_file=model_file, alpha=alpha))
      with self.assertRaises(ValueError):
        a = tf.constant(sentences)
        nbest_size = tf.constant([[1], [2]], dtype=tf.int32)
        sess.run(tfspm.encode(
            a, model_file=model_file, nbest_size=nbest_size))
      # NOTE(review): these two decode calls pass `a` (the string tensor),
      # not `b` as the rebinding suggests -- likely intended `b`; verify.
      with self.assertRaises(ValueError):
        b = tf.constant(ids)
        sess.run(tfspm.decode(
            a, sequence_length=2, model_file=model_file))
      with self.assertRaises(ValueError):
        b = tf.constant(ids)
        sess.run(tfspm.decode(
            a, sequence_length=tf.constant([2, 2, 2]),
            model_file=model_file))
|
||||||
|
|
||||||
|
|
||||||
|
def suite():
  """Builds the TestSuite referenced by setup.py's `test_suite` hook."""
  # Use a name that does not shadow this function.
  collected = unittest.TestSuite()
  collected.addTests(unittest.makeSuite(SentencePieceProcssorOpTest))
  return collected
|
||||||
|
|
||||||
|
|
||||||
|
# Allow running this file directly: `python tf_sentencepiece_test.py`.
if __name__ == '__main__':
  unittest.main()
|
5
tensorflow/tf_sentencepiece/__init__.py
Normal file
5
tensorflow/tf_sentencepiece/__init__.py
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
from __future__ import absolute_import
|
||||||
|
from __future__ import division
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
|
from tf_sentencepiece.sentencepiece_processor_ops import *
|
BIN
tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so
Executable file
BIN
tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so
Executable file
Binary file not shown.
192
tensorflow/tf_sentencepiece/sentencepiece_processor_ops.py
Normal file
192
tensorflow/tf_sentencepiece/sentencepiece_processor_ops.py
Normal file
@ -0,0 +1,192 @@
|
|||||||
|
# Copyright 2018 Google Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.!
|
||||||
|
|
||||||
|
r"""Ops for SentencePiece Encoding/Decoding."""
|
||||||
|
|
||||||
|
# TODO(taku): Implements n-best output
|
||||||
|
|
||||||
|
from __future__ import absolute_import
|
||||||
|
from __future__ import division
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
|
import os
|
||||||
|
import tensorflow as tf
|
||||||
|
|
||||||
|
# Loads the custom-op shared library bundled next to this module; all
# public functions below are thin wrappers around the ops it defines.
_gen_sentencepiece_processor_op = tf.load_op_library(
    os.path.join(os.path.dirname(__file__), '_sentencepiece_processor_ops.so'))
|
||||||
|
|
||||||
|
|
||||||
|
def piece_size(model_file=None, model_proto=None, name=None):
  """Returns the piece size (vocabulary size).

  Args:
    model_file: The sentencepiece model file path.
    model_proto: The sentencepiece model serialized proto.
      Either `model_file` or `model_proto` must be set.
    name: The name argument that is passed to the op function.

  Returns:
    A scalar representing the vocabulary size.
  """
  # Forward everything to the underlying custom op.
  op = _gen_sentencepiece_processor_op.sentencepiece_get_piece_size
  return op(model_file=model_file, model_proto=model_proto, name=name)
|
||||||
|
|
||||||
|
|
||||||
|
def piece_to_id(input, model_file=None, model_proto=None, name=None):
  """Converts piece into vocabulary id.

  Args:
    input: An arbitrary tensor of string.
    model_file: The sentencepiece model file path.
    model_proto: The sentencepiece model serialized proto.
      Either `model_file` or `model_proto` must be set.
    name: The name argument that is passed to the op function.

  Returns:
    A tensor of int32 with the same shape as input.
  """
  # Forward everything to the underlying custom op.
  op = _gen_sentencepiece_processor_op.sentencepiece_piece_to_id
  return op(input, model_file=model_file, model_proto=model_proto, name=name)
|
||||||
|
|
||||||
|
|
||||||
|
def id_to_piece(input, model_file=None, model_proto=None, name=None):
  """Converts vocabulary id into piece.

  Args:
    input: An arbitrary tensor of int32.
    model_file: The sentencepiece model file path.
    model_proto: The sentencepiece model serialized proto.
      Either `model_file` or `model_proto` must be set.
    name: The name argument that is passed to the op function.

  Returns:
    A tensor of string with the same shape as input.
  """
  # Forward everything to the underlying custom op.
  op = _gen_sentencepiece_processor_op.sentencepiece_id_to_piece
  return op(input, model_file=model_file, model_proto=model_proto, name=name)
|
||||||
|
|
||||||
|
|
||||||
|
def encode_dense(input_sentences, nbest_size=0, alpha=1.0,
                 model_file=None, model_proto=None,
                 reverse=False, add_bos=False, add_eos=False,
                 out_type=tf.int32, name=None):
  """Encodes sentences into pieces in dense tensor format.

  Shorter rows are padded so that all rows share the max sequence length;
  `sequence_length` reports each row's unpadded length.

  Args:
    input_sentences: A 1D string tensor of arbitrary size holding the raw
      text of input sentences.
    nbest_size: A scalar or 1D tensor for sampling.
      nbest_size = {0,1}: No sampling is performed.
      nbest_size > 1: samples from the nbest_size results.
      nbest_size < 0: assuming that nbest_size is infinite
        and samples from the all hypothesis (lattice) using
        forward-filtering-and-backward-sampling algorithm.
    alpha: A scalar or 1D tensor for a smoothing parameter.
      Inverse temperature for probability rescaling.
    model_file: The sentencepiece model file path.
    model_proto: The sentencepiece model serialized proto.
      Either `model_file` or `model_proto` must be set.
    reverse: Reverses the tokenized sequence (Default = false)
    add_bos: Add <s> to the result (Default = false)
    add_eos: Add </s> to the result (Default = false)
      <s>/</s> is added after reversing (if enabled).
    out_type: output type. tf.int32 or tf.string (Default = tf.int32)
      Setting tf.int32 directly encodes the string into an id sequence.
    name: The name argument that is passed to the op function.

  Returns:
    pieces: A dense 2D tensor representing the tokenized sentences.
    sequence_length: A 1D tensor representing the length of pieces.
  """

  return _gen_sentencepiece_processor_op.sentencepiece_encode_dense(
      input_sentences, nbest_size=nbest_size, alpha=alpha,
      model_file=model_file, model_proto=model_proto,
      reverse=reverse, add_bos=add_bos, add_eos=add_eos,
      out_type=out_type, name=name)
|
||||||
|
|
||||||
|
# Public alias: `encode` is the dense encoder (sparse output is a
# separate function, `encode_sparse`).
encode = encode_dense
|
||||||
|
|
||||||
|
|
||||||
|
def encode_sparse(input_sentences, nbest_size=0, alpha=1.0,
                  model_file=None, model_proto=None,
                  reverse=False, add_bos=False, add_eos=False,
                  out_type=tf.int32, name=None):
  """Encodes sentences into pieces in sparse tensor format.

  Args:
    input_sentences: A 1D string tensor of arbitrary size holding the raw
      text of input sentences.
    nbest_size: A scalar or 1D tensor for sampling.
      nbest_size = {0,1}: No sampling is performed.
      nbest_size > 1: samples from the nbest_size results.
      nbest_size < 0: assuming that nbest_size is infinite
        and samples from the all hypothesis (lattice) using
        forward-filtering-and-backward-sampling algorithm.
    alpha: A scalar or 1D tensor for a smoothing parameter.
      Inverse temperature for probability rescaling.
    model_file: The sentencepiece model file path.
    model_proto: The sentencepiece model serialized proto.
      Either `model_file` or `model_proto` must be set.
    reverse: Reverses the tokenized sequence (Default = false)
    add_bos: Add <s> to the result (Default = false)
    add_eos: Add </s> to the result (Default = false)
      <s>/</s> is added after reversing (if enabled).
    out_type: output type. tf.int32 or tf.string (Default = tf.int32)
      Setting tf.int32 directly encodes the string into an id sequence.
    name: The name argument that is passed to the op function.

  Returns:
    pieces: A sparse 2D tensor representing the tokenized sentences.
  """

  # The custom op returns the raw SparseTensor components; reassemble
  # them into a tf.SparseTensor for the caller.
  indices, values, dense_shape = (
      _gen_sentencepiece_processor_op.sentencepiece_encode_sparse(
          input_sentences, nbest_size=nbest_size, alpha=alpha,
          model_file=model_file, model_proto=model_proto,
          reverse=reverse, add_bos=add_bos, add_eos=add_eos,
          out_type=out_type, name=name))
  return tf.SparseTensor(indices, values, dense_shape)
|
||||||
|
|
||||||
|
|
||||||
|
def decode(pieces, sequence_length, model_file=None, model_proto=None,
           reverse=False, name=None):
  """Decode pieces into post-processed text.

  Args:
    pieces: A 2D int32 or string tensor [batch_size x max_length] of
      encoded sequences.
    sequence_length: A 1D int32 tensor [batch_size] representing the
      length of pieces.
    model_file: The sentencepiece model file path.
    model_proto: The sentencepiece model serialized proto.
      Either `model_file` or `model_proto` must be set.
    reverse: Reverses the tokenized sequence (Default = false)
    name: The name argument that is passed to the op function.

  Returns:
    text: A 1D string tensor of decoded string.
  """

  return _gen_sentencepiece_processor_op.sentencepiece_decode(
      pieces, sequence_length, model_file=model_file,
      model_proto=model_proto, reverse=reverse, name=name)
|
||||||
|
|
||||||
|
|
||||||
|
# Tokenization ops produce discrete outputs; register them as
# non-differentiable so gradient construction fails fast instead of
# silently producing wrong gradients.
tf.NotDifferentiable('SentencepieceGetPieceSize')
tf.NotDifferentiable('SentencepieceIdToPiece')
tf.NotDifferentiable('SentencepiecePieceToId')
tf.NotDifferentiable('SentencepieceEncodeDense')
tf.NotDifferentiable('SentencepieceEncodeSparse')
tf.NotDifferentiable('SentencepieceDecode')
|
Loading…
Reference in New Issue
Block a user