mirror of
https://github.com/google/sentencepiece.git
synced 2025-01-08 18:26:38 +03:00
Added Tensorflow module
This commit is contained in:
parent
65da95da9a
commit
81d1a091fb
@ -22,12 +22,17 @@ matrix:
|
|||||||
- os: linux
|
- os: linux
|
||||||
env: IMAGE=x86_64 COMMAND=make_py_wheel
|
env: IMAGE=x86_64 COMMAND=make_py_wheel
|
||||||
script:
|
script:
|
||||||
- $TRAVIS_BUILD_DIR/make_py_wheel.sh ${IMAGE}
|
- $TRAVIS_BUILD_DIR/python/make_py_wheel.sh ${IMAGE}
|
||||||
services: docker
|
services: docker
|
||||||
- os: linux
|
- os: linux
|
||||||
env: IMAGE=i686 COMMAND=make_py_wheel
|
env: IMAGE=i686 COMMAND=make_py_wheel
|
||||||
script:
|
script:
|
||||||
- $TRAVIS_BUILD_DIR/make_py_wheel.sh ${IMAGE}
|
- $TRAVIS_BUILD_DIR/python/make_py_wheel.sh ${IMAGE}
|
||||||
|
services: docker
|
||||||
|
- os: linux
|
||||||
|
env: IMAGE=x86_64 COMMAND=make_py_wheel
|
||||||
|
script:
|
||||||
|
- $TRAVIS_BUILD_DIR/tensorflow/make_py_wheel.sh
|
||||||
services: docker
|
services: docker
|
||||||
- os: osx
|
- os: osx
|
||||||
osx_image: xcode9.3
|
osx_image: xcode9.3
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#!/bin/sh
|
#!/bin/bash
|
||||||
|
|
||||||
# Copyright 2018 Google Inc.
|
# Copyright 2018 Google Inc.
|
||||||
#
|
#
|
||||||
@ -14,27 +14,30 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.!
|
# limitations under the License.!
|
||||||
|
|
||||||
# Usage:
|
|
||||||
# > sudo sh make_py_wheel.sh
|
|
||||||
# wheel packages are built under <pwd>/manylinux_wh dir
|
|
||||||
|
|
||||||
set -e # exit immediately on error
|
set -e # exit immediately on error
|
||||||
set -x # display all commands
|
set -x # display all commands
|
||||||
|
|
||||||
|
PROTOBUF_VERSION=3.6.0
|
||||||
|
|
||||||
run_docker() {
|
run_docker() {
|
||||||
rm -fr manylinux_wh/$2
|
cd `dirname $0`
|
||||||
mkdir -p manylinux_wh/$2
|
docker pull $1
|
||||||
docker pull "$1"
|
docker run --rm -ti --name py_sentencepiece \
|
||||||
docker run --rm -ti --name manylinux -v `pwd`:/sentencepiece -w /sentencepiece/manylinux_wh/$2 -td "$1" /bin/bash
|
-v `pwd`/../:/sentencepiece -w /sentencepiece/python \
|
||||||
docker exec manylinux bash -c "../../make_py_wheel.sh make_wheel $2"
|
-td $1 /bin/bash
|
||||||
docker stop manylinux
|
docker exec py_sentencepiece bash -c "./make_py_wheel.sh native"
|
||||||
|
docker stop py_sentencepiece
|
||||||
}
|
}
|
||||||
|
|
||||||
make_wheel() {
|
build() {
|
||||||
export PATH="/usr/local/bin:$PATH"
|
rm -fr tmp
|
||||||
TRG=$1
|
mkdir -p tmp
|
||||||
|
|
||||||
wget http://ftpmirror.gnu.org/libtool/libtool-2.4.6.tar.gz
|
# Installs necessary libraries under `tmp` sub directory.
|
||||||
|
cd tmp
|
||||||
|
|
||||||
|
# Install libtool
|
||||||
|
curl -L -O http://ftpmirror.gnu.org/libtool/libtool-2.4.6.tar.gz
|
||||||
tar zxfv libtool-2.4.6.tar.gz
|
tar zxfv libtool-2.4.6.tar.gz
|
||||||
cd libtool-2.4.6
|
cd libtool-2.4.6
|
||||||
./configure
|
./configure
|
||||||
@ -42,23 +45,27 @@ make_wheel() {
|
|||||||
make install
|
make install
|
||||||
cd ..
|
cd ..
|
||||||
|
|
||||||
git clone https://github.com/google/protobuf.git
|
# Install protobuf
|
||||||
cd protobuf
|
curl -L -O https://github.com/google/protobuf/releases/download/v${PROTOBUF_VERSION}/protobuf-cpp-${PROTOBUF_VERSION}.tar.gz
|
||||||
./autogen.sh
|
tar zxfv protobuf-cpp-${PROTOBUF_VERSION}.tar.gz
|
||||||
|
cd protobuf-${PROTOBUF_VERSION}
|
||||||
./configure --disable-shared --with-pic
|
./configure --disable-shared --with-pic
|
||||||
make -j4
|
make CXXFLAGS+="-std=c++11 -O3" \
|
||||||
make install
|
CFLAGS+="-std=c++11 -O3" -j4
|
||||||
cd ..
|
make install || true
|
||||||
|
cd ../..
|
||||||
|
|
||||||
cd ../../
|
# Install sentencepiece
|
||||||
|
cd ..
|
||||||
make distclean || true
|
make distclean || true
|
||||||
./autogen.sh
|
./autogen.sh
|
||||||
grep -v PKG_CHECK_MODULES configure > tmp
|
grep -v PKG_CHECK_MODULES configure > tmp
|
||||||
mv tmp -f configure
|
mv tmp -f configure
|
||||||
chmod +x configure
|
chmod +x configure
|
||||||
LIBS+="-pthread -L/usr/local/lib -lprotobuf" ./configure --disable-shared --with-pic
|
LIBS+="-pthread -L/usr/local/lib -lprotobuf" ./configure --disable-shared --with-pic
|
||||||
make -j4
|
make CXXFLAGS+="-std=c++11 -O3" \
|
||||||
make install
|
CFLAGS+="-std=c++11 -O3 -D_GLIBCXX_USE_CXX11_ABI=0" -j4
|
||||||
|
make install || true
|
||||||
|
|
||||||
cd python
|
cd python
|
||||||
for i in /opt/python/*
|
for i in /opt/python/*
|
||||||
@ -77,14 +84,16 @@ make_wheel() {
|
|||||||
auditwheel repair $i
|
auditwheel repair $i
|
||||||
done
|
done
|
||||||
|
|
||||||
mv -f wheelhouse/*${TRG}.whl ../../manylinux_wh
|
mv -f wheelhouse/*${TRG}.whl .
|
||||||
|
cd .. && rm -fr tmp
|
||||||
|
cd .. && make distclean
|
||||||
}
|
}
|
||||||
|
|
||||||
if [ "$#" -eq 2 ]; then
|
if [ "$1" = "native" ]; then
|
||||||
eval "$1" $2
|
build
|
||||||
elif [ "$#" -eq 1 ]; then
|
elif [ "$#" -eq 1 ]; then
|
||||||
run_docker quay.io/pypa/manylinux1_${1} ${1}
|
run_docker quay.io/pypa/manylinux1_${1} ${1}
|
||||||
else
|
else
|
||||||
run_docker quay.io/pypa/manylinux1_i686 i686
|
run_docker quay.io/pypa/manylinux1_i686
|
||||||
run_docker quay.io/pypa/manylinux1_x86_64 x86_64
|
run_docker quay.io/pypa/manylinux1_x86_64
|
||||||
fi
|
fi
|
5
tensorflow/.gitignore
vendored
Normal file
5
tensorflow/.gitignore
vendored
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
build/
|
||||||
|
sdist/
|
||||||
|
dist/
|
||||||
|
tmp/
|
||||||
|
*py[cod]
|
0
tensorflow/__init__.py
Normal file
0
tensorflow/__init__.py
Normal file
103
tensorflow/make_py_wheel.sh
Executable file
103
tensorflow/make_py_wheel.sh
Executable file
@ -0,0 +1,103 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Copyright 2018 Google Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.!
|
||||||
|
|
||||||
|
set -e # exit immediately on error
|
||||||
|
set -x # display all commands
|
||||||
|
|
||||||
|
PROTOBUF_VERSION=3.6.0
|
||||||
|
|
||||||
|
run_docker() {
|
||||||
|
cd `dirname $0`
|
||||||
|
docker pull $1
|
||||||
|
docker run --rm -ti --name tf_sentencepiece \
|
||||||
|
-v `pwd`/../:/sentencepiece -w /sentencepiece/tensorflow \
|
||||||
|
-td $1 /bin/bash
|
||||||
|
docker exec tf_sentencepiece bash -c "./build.sh native"
|
||||||
|
docker stop tf_sentencepiece
|
||||||
|
}
|
||||||
|
|
||||||
|
build() {
|
||||||
|
rm -fr tmp
|
||||||
|
mkdir -p tmp
|
||||||
|
|
||||||
|
export PATH="/opt/python/cp27-cp27mu/bin:${PATH}"
|
||||||
|
|
||||||
|
# Installs necessary libraries under `tmp` sub directory.
|
||||||
|
cd tmp
|
||||||
|
|
||||||
|
# Install libtool
|
||||||
|
curl -L -O http://ftpmirror.gnu.org/libtool/libtool-2.4.6.tar.gz
|
||||||
|
tar zxfv libtool-2.4.6.tar.gz
|
||||||
|
cd libtool-2.4.6
|
||||||
|
./configure
|
||||||
|
make -j4
|
||||||
|
make install
|
||||||
|
cd ..
|
||||||
|
|
||||||
|
# Install protobuf
|
||||||
|
curl -L -O https://github.com/google/protobuf/releases/download/v${PROTOBUF_VERSION}/protobuf-cpp-${PROTOBUF_VERSION}.tar.gz
|
||||||
|
tar zxfv protobuf-cpp-${PROTOBUF_VERSION}.tar.gz
|
||||||
|
cd protobuf-${PROTOBUF_VERSION}
|
||||||
|
./configure --disable-shared --with-pic
|
||||||
|
make CXXFLAGS+="-std=c++11 -O3 -D_GLIBCXX_USE_CXX11_ABI=0" \
|
||||||
|
CFLAGS+="-std=c++11 -O3 -D_GLIBCXX_USE_CXX11_ABI=0" -j4
|
||||||
|
make install || true
|
||||||
|
cd ../..
|
||||||
|
|
||||||
|
# Install sentencepiece
|
||||||
|
cd ..
|
||||||
|
make distclean || true
|
||||||
|
./autogen.sh
|
||||||
|
grep -v PKG_CHECK_MODULES configure > tmp
|
||||||
|
mv tmp -f configure
|
||||||
|
chmod +x configure
|
||||||
|
LIBS+="-pthread -L/usr/local/lib -lprotobuf" ./configure --disable-shared --with-pic
|
||||||
|
make CXXFLAGS+="-std=c++11 -O3 -D_GLIBCXX_USE_CXX11_ABI=0" \
|
||||||
|
CFLAGS+="-std=c++11 -O3 -D_GLIBCXX_USE_CXX11_ABI=0" -j4
|
||||||
|
make install || true
|
||||||
|
|
||||||
|
# Builds _sentencepiece_processor_ops.so
|
||||||
|
cd tensorflow
|
||||||
|
pip install tensorflow
|
||||||
|
TF_CFLAGS="-I/opt/python/cp27-cp27mu/lib/python2.7/site-packages/tensorflow/include"
|
||||||
|
TF_LFLAGS="-L/opt/python/cp27-cp27mu/lib/python2.7/site-packages/tensorflow -ltensorflow_framework"
|
||||||
|
|
||||||
|
g++ -std=c++11 -shared \
|
||||||
|
-I../src \
|
||||||
|
-fPIC ${TF_CFLAGS[@]} -O2 \
|
||||||
|
-D_GLIBCXX_USE_CXX11_ABI=0 \
|
||||||
|
-Wl,--whole-archive \
|
||||||
|
/usr/local/lib/libprotobuf.a \
|
||||||
|
/usr/local/lib/libsentencepiece.a \
|
||||||
|
-Wl,--no-whole-archive \
|
||||||
|
sentencepiece_processor_ops.cc \
|
||||||
|
-o tf_sentencepiece/_sentencepiece_processor_ops.so \
|
||||||
|
${TF_LFLAGS[@]}
|
||||||
|
strip tf_sentencepiece/_sentencepiece_processor_ops.so
|
||||||
|
|
||||||
|
# Builds Python manylinux wheel package.
|
||||||
|
python setup.py bdist_wheel --universal --plat-name=linux_x86_64
|
||||||
|
python setup.py sdist
|
||||||
|
|
||||||
|
rm -fr build tf_sentencepiece.egg-info tmp
|
||||||
|
cd .. && make distclean
|
||||||
|
}
|
||||||
|
|
||||||
|
if [ "$1" = "native" ]; then
|
||||||
|
build
|
||||||
|
else
|
||||||
|
run_docker quay.io/pypa/manylinux1_x86_64
|
||||||
|
fi
|
532
tensorflow/sentencepiece_processor_ops.cc
Normal file
532
tensorflow/sentencepiece_processor_ops.cc
Normal file
@ -0,0 +1,532 @@
|
|||||||
|
// Copyright 2016 Google Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.!
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "sentencepiece_processor.h"
|
||||||
|
#include "tensorflow/core/framework/op.h"
|
||||||
|
#include "tensorflow/core/framework/op_kernel.h"
|
||||||
|
#include "tensorflow/core/framework/shape_inference.h"
|
||||||
|
#include "tensorflow/core/framework/tensor_shape.h"
|
||||||
|
|
||||||
|
typedef int int32;
|
||||||
|
typedef long long int int64;
|
||||||
|
|
||||||
|
namespace sentencepiece {
|
||||||
|
using ::tensorflow::DEVICE_CPU;
|
||||||
|
using ::tensorflow::OpKernel;
|
||||||
|
using ::tensorflow::OpKernelConstruction;
|
||||||
|
using ::tensorflow::OpKernelContext;
|
||||||
|
using ::tensorflow::Tensor;
|
||||||
|
using ::tensorflow::TensorShapeUtils;
|
||||||
|
using ::tensorflow::shape_inference::DimensionHandle;
|
||||||
|
using ::tensorflow::shape_inference::InferenceContext;
|
||||||
|
using ::tensorflow::shape_inference::ShapeHandle;
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
// A utility function to convert sentencepiece::util::Status to
|
||||||
|
// ::tensorflow::Status
|
||||||
|
::tensorflow::Status ToTFStatus(const sentencepiece::util::Status& s) {
|
||||||
|
if (s.ok()) return ::tensorflow::Status();
|
||||||
|
return ::tensorflow::Status(static_cast<::tensorflow::error::Code>(s.code()),
|
||||||
|
::tensorflow::string(s.error_message()));
|
||||||
|
}
|
||||||
|
|
||||||
|
// A factory function to initialize SentencePieceProcessor with
|
||||||
|
// OpKernelConstruction `context`.
|
||||||
|
enum InitType { GENERAL, ENCODE, DECODE }; // purpose of processor.
|
||||||
|
|
||||||
|
void InitializeModel(OpKernelConstruction* context,
|
||||||
|
SentencePieceProcessor* sentencepiece_processor,
|
||||||
|
InitType type) {
|
||||||
|
std::string model_file_attr, model_proto_attr;
|
||||||
|
OP_REQUIRES_OK(context, context->GetAttr("model_file", &model_file_attr));
|
||||||
|
OP_REQUIRES_OK(context, context->GetAttr("model_proto", &model_proto_attr));
|
||||||
|
|
||||||
|
if (!model_file_attr.empty()) {
|
||||||
|
OP_REQUIRES(
|
||||||
|
context, model_proto_attr.empty(),
|
||||||
|
::tensorflow::errors::InvalidArgument(
|
||||||
|
"`model_proto` must be empty when `model_file` is specified."));
|
||||||
|
OP_REQUIRES_OK(context,
|
||||||
|
ToTFStatus(sentencepiece_processor->Load(model_file_attr)));
|
||||||
|
} else {
|
||||||
|
// Loads serialized sentencepiece model proto to enable embedding the
|
||||||
|
// relatively small sentencepiece model proto into the tensorflow graph
|
||||||
|
// such that the tensorflow graph is self-contained.
|
||||||
|
OP_REQUIRES_OK(context,
|
||||||
|
ToTFStatus(sentencepiece_processor->LoadFromSerializedProto(
|
||||||
|
model_proto_attr)));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sets extra options to add <s>, </s>.
|
||||||
|
std::string options;
|
||||||
|
auto add_options = [&options, &context](const std::string& name,
|
||||||
|
const std::string& v) {
|
||||||
|
bool flag = false;
|
||||||
|
OP_REQUIRES_OK(context, context->GetAttr(name, &flag));
|
||||||
|
if (flag) {
|
||||||
|
if (!options.empty()) options += ':';
|
||||||
|
options += v;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
if (type == ENCODE || type == DECODE) {
|
||||||
|
add_options("reverse", "reverse");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (type == ENCODE) {
|
||||||
|
add_options("add_bos", "bos");
|
||||||
|
add_options("add_eos", "eos");
|
||||||
|
OP_REQUIRES_OK(
|
||||||
|
context,
|
||||||
|
ToTFStatus(sentencepiece_processor->SetEncodeExtraOptions(options)));
|
||||||
|
} else if (type == DECODE) {
|
||||||
|
OP_REQUIRES_OK(
|
||||||
|
context,
|
||||||
|
ToTFStatus(sentencepiece_processor->SetDecodeExtraOptions(options)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
class SentencePieceGetPieceSizeOp : public OpKernel {
|
||||||
|
public:
|
||||||
|
explicit SentencePieceGetPieceSizeOp(OpKernelConstruction* context)
|
||||||
|
: OpKernel(context) {
|
||||||
|
SentencePieceProcessor sp;
|
||||||
|
InitializeModel(context, &sp, GENERAL);
|
||||||
|
vocab_size_ = sp.GetPieceSize();
|
||||||
|
}
|
||||||
|
|
||||||
|
void Compute(OpKernelContext* context) override {
|
||||||
|
Tensor* vocab_size_tensor = nullptr;
|
||||||
|
OP_REQUIRES_OK(context,
|
||||||
|
context->allocate_output(0, {}, &vocab_size_tensor));
|
||||||
|
vocab_size_tensor->scalar<int32>()() = vocab_size_;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
int32 vocab_size_ = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename S, typename T>
|
||||||
|
class SentencePieceConvertPieceOp : public OpKernel {
|
||||||
|
public:
|
||||||
|
explicit SentencePieceConvertPieceOp(OpKernelConstruction* context)
|
||||||
|
: OpKernel(context) {
|
||||||
|
InitializeModel(context, &sentencepiece_processor_, GENERAL);
|
||||||
|
}
|
||||||
|
|
||||||
|
void Compute(OpKernelContext* context) override {
|
||||||
|
const Tensor* input_tensor = nullptr;
|
||||||
|
OP_REQUIRES_OK(context, context->input("input", &input_tensor));
|
||||||
|
|
||||||
|
Tensor* output_tensor = nullptr;
|
||||||
|
OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor->shape(),
|
||||||
|
&output_tensor));
|
||||||
|
for (int i = 0; i < input_tensor->NumElements(); ++i)
|
||||||
|
output_tensor->flat<T>()(i) = Convert(input_tensor->flat<S>()(i));
|
||||||
|
}
|
||||||
|
|
||||||
|
int32 Convert(const std::string& piece) const {
|
||||||
|
return sentencepiece_processor_.PieceToId(piece);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string Convert(int32 id) const {
|
||||||
|
if (id >= 0 && id < sentencepiece_processor_.GetPieceSize()) {
|
||||||
|
return sentencepiece_processor_.IdToPiece(id);
|
||||||
|
}
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
SentencePieceProcessor sentencepiece_processor_;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
class SentencePieceEncodeOpBase : public OpKernel {
|
||||||
|
public:
|
||||||
|
explicit SentencePieceEncodeOpBase(OpKernelConstruction* context)
|
||||||
|
: OpKernel(context) {
|
||||||
|
InitializeModel(context, &sentencepiece_processor_, ENCODE);
|
||||||
|
}
|
||||||
|
|
||||||
|
void Compute(OpKernelContext* context) override {
|
||||||
|
const Tensor* input_tensor = nullptr;
|
||||||
|
|
||||||
|
OP_REQUIRES_OK(context, context->input("input", &input_tensor));
|
||||||
|
OP_REQUIRES(context, TensorShapeUtils::IsVector(input_tensor->shape()),
|
||||||
|
::tensorflow::errors::InvalidArgument(
|
||||||
|
"`input` must be a vector, got shape: ",
|
||||||
|
input_tensor->shape().DebugString()));
|
||||||
|
const auto& input_sentences = input_tensor->vec<std::string>();
|
||||||
|
const int64 batch_size = input_sentences.size();
|
||||||
|
|
||||||
|
const Tensor* nbest_size_tensor = nullptr;
|
||||||
|
OP_REQUIRES_OK(context, context->input("nbest_size", &nbest_size_tensor));
|
||||||
|
OP_REQUIRES(context, nbest_size_tensor->dims() <= 1,
|
||||||
|
::tensorflow::errors::InvalidArgument(
|
||||||
|
"`nbest_size` must be a scalar or vector. got shape: ",
|
||||||
|
nbest_size_tensor->shape().DebugString()));
|
||||||
|
if (nbest_size_tensor->dims() == 1) {
|
||||||
|
OP_REQUIRES(
|
||||||
|
context, batch_size == nbest_size_tensor->dim_size(0),
|
||||||
|
::tensorflow::errors::InvalidArgument(
|
||||||
|
"`nbest_size` must have the same batch size as `input`."));
|
||||||
|
}
|
||||||
|
|
||||||
|
const Tensor* alpha_tensor = nullptr;
|
||||||
|
OP_REQUIRES_OK(context, context->input("alpha", &alpha_tensor));
|
||||||
|
OP_REQUIRES(context, alpha_tensor->dims() <= 1,
|
||||||
|
::tensorflow::errors::InvalidArgument(
|
||||||
|
"`alpha` must be a scalar or vector, got shape: ",
|
||||||
|
alpha_tensor->shape().DebugString()));
|
||||||
|
if (alpha_tensor->dims() == 1) {
|
||||||
|
OP_REQUIRES(context, batch_size == alpha_tensor->dim_size(0),
|
||||||
|
::tensorflow::errors::InvalidArgument(
|
||||||
|
"`alpha` must have the same batch size as `input`."));
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::vector<T>> pieces(batch_size);
|
||||||
|
|
||||||
|
for (int64 i = 0; i < batch_size; ++i) {
|
||||||
|
const int32 nbest_size = nbest_size_tensor->dims() == 1
|
||||||
|
? nbest_size_tensor->vec<int32>()(i)
|
||||||
|
: nbest_size_tensor->scalar<int32>()();
|
||||||
|
if (nbest_size == 0 || nbest_size == 1) {
|
||||||
|
OP_REQUIRES_OK(context, ToTFStatus(sentencepiece_processor_.Encode(
|
||||||
|
input_sentences(i), &pieces[i])));
|
||||||
|
} else {
|
||||||
|
const float alpha = alpha_tensor->dims() == 1
|
||||||
|
? alpha_tensor->vec<float>()(i)
|
||||||
|
: alpha_tensor->scalar<float>()();
|
||||||
|
OP_REQUIRES_OK(context,
|
||||||
|
ToTFStatus(sentencepiece_processor_.SampleEncode(
|
||||||
|
input_sentences(i), nbest_size, alpha, &pieces[i])));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
MakeOutputTensor(context, pieces);
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
virtual void MakeOutputTensor(OpKernelContext* context,
|
||||||
|
const std::vector<std::vector<T>>& pieces) = 0;
|
||||||
|
|
||||||
|
SentencePieceProcessor sentencepiece_processor_;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
class SentencePieceEncodeSparseOp : public SentencePieceEncodeOpBase<T> {
|
||||||
|
public:
|
||||||
|
explicit SentencePieceEncodeSparseOp(OpKernelConstruction* context)
|
||||||
|
: SentencePieceEncodeOpBase<T>(context) {}
|
||||||
|
|
||||||
|
private:
|
||||||
|
void MakeOutputTensor(OpKernelContext* context,
|
||||||
|
const std::vector<std::vector<T>>& pieces) override {
|
||||||
|
const int64 batch_size = pieces.size();
|
||||||
|
|
||||||
|
int64 max_sequence_length = 0;
|
||||||
|
int64 indices_size = 0;
|
||||||
|
for (int row = 0; row < batch_size; ++row) {
|
||||||
|
const int col_size = pieces[row].size();
|
||||||
|
max_sequence_length = std::max<int64>(col_size, max_sequence_length);
|
||||||
|
indices_size += col_size;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Creates the indices output tensor.
|
||||||
|
Tensor* indices_tensor = nullptr;
|
||||||
|
OP_REQUIRES_OK(context, context->allocate_output(0, {indices_size, 2},
|
||||||
|
&indices_tensor));
|
||||||
|
|
||||||
|
auto indices_tensor_output = indices_tensor->matrix<int64>();
|
||||||
|
int item_idx = 0;
|
||||||
|
for (int row = 0; row < batch_size; ++row) {
|
||||||
|
for (int col = 0; col < pieces[row].size(); ++col) {
|
||||||
|
indices_tensor_output(item_idx, 0) = row;
|
||||||
|
indices_tensor_output(item_idx, 1) = col;
|
||||||
|
++item_idx;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Creates the values output tensor.
|
||||||
|
Tensor* values_tensor = nullptr;
|
||||||
|
OP_REQUIRES_OK(context,
|
||||||
|
context->allocate_output(1, {indices_size}, &values_tensor));
|
||||||
|
|
||||||
|
auto values_tensor_output = values_tensor->flat<T>();
|
||||||
|
item_idx = 0;
|
||||||
|
for (int row = 0; row < batch_size; ++row) {
|
||||||
|
std::copy(pieces[row].begin(), pieces[row].end(),
|
||||||
|
&values_tensor_output(item_idx));
|
||||||
|
item_idx += pieces[row].size();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Creates the shape output tensor.
|
||||||
|
Tensor* shape_tensor = nullptr;
|
||||||
|
OP_REQUIRES_OK(context, context->allocate_output(2, {2}, &shape_tensor));
|
||||||
|
|
||||||
|
auto shape_tensor_output = shape_tensor->flat<int64>();
|
||||||
|
shape_tensor_output(0) = batch_size;
|
||||||
|
shape_tensor_output(1) = max_sequence_length;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
class SentencePieceEncodeDenseOp : public SentencePieceEncodeOpBase<T> {
|
||||||
|
public:
|
||||||
|
explicit SentencePieceEncodeDenseOp(OpKernelConstruction* context)
|
||||||
|
: SentencePieceEncodeOpBase<T>(context) {}
|
||||||
|
|
||||||
|
private:
|
||||||
|
void MakeOutputTensor(OpKernelContext* context,
|
||||||
|
const std::vector<std::vector<T>>& pieces) override {
|
||||||
|
const int64 batch_size = pieces.size();
|
||||||
|
|
||||||
|
int64 max_sequence_length = 0;
|
||||||
|
for (int row = 0; row < batch_size; ++row) {
|
||||||
|
max_sequence_length =
|
||||||
|
std::max<int64>(pieces[row].size(), max_sequence_length);
|
||||||
|
}
|
||||||
|
|
||||||
|
Tensor* values_tensor = nullptr;
|
||||||
|
Tensor* length_tensor = nullptr;
|
||||||
|
|
||||||
|
OP_REQUIRES_OK(
|
||||||
|
context, context->allocate_output(0, {batch_size, max_sequence_length},
|
||||||
|
&values_tensor));
|
||||||
|
OP_REQUIRES_OK(context,
|
||||||
|
context->allocate_output(1, {batch_size}, &length_tensor));
|
||||||
|
|
||||||
|
auto values_tensor_output = values_tensor->matrix<T>();
|
||||||
|
auto length_tensor_output = length_tensor->vec<int32>();
|
||||||
|
|
||||||
|
for (int row = 0; row < batch_size; ++row) {
|
||||||
|
for (int col = 0; col < max_sequence_length; ++col) {
|
||||||
|
values_tensor_output(row, col) =
|
||||||
|
col < pieces[row].size() ? pieces[row][col] : T();
|
||||||
|
}
|
||||||
|
length_tensor_output(row) = pieces[row].size();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
class SentencePieceDecodeOp : public OpKernel {
|
||||||
|
public:
|
||||||
|
explicit SentencePieceDecodeOp(OpKernelConstruction* context)
|
||||||
|
: OpKernel(context) {
|
||||||
|
InitializeModel(context, &sentencepiece_processor_, DECODE);
|
||||||
|
}
|
||||||
|
|
||||||
|
void Compute(OpKernelContext* context) override {
|
||||||
|
const Tensor* input_tensor = nullptr;
|
||||||
|
const Tensor* length_tensor = nullptr;
|
||||||
|
|
||||||
|
OP_REQUIRES_OK(context, context->input("input", &input_tensor));
|
||||||
|
OP_REQUIRES(context, TensorShapeUtils::IsMatrix(input_tensor->shape()),
|
||||||
|
::tensorflow::errors::InvalidArgument(
|
||||||
|
"`input` must be a 2-D matrix. got shape: ",
|
||||||
|
input_tensor->shape().DebugString()));
|
||||||
|
OP_REQUIRES_OK(context, context->input("sequence_length", &length_tensor));
|
||||||
|
OP_REQUIRES(context, TensorShapeUtils::IsVector(length_tensor->shape()),
|
||||||
|
::tensorflow::errors::InvalidArgument(
|
||||||
|
"`sequence_length` must be a vector. got shape: ",
|
||||||
|
length_tensor->shape().DebugString()));
|
||||||
|
OP_REQUIRES(
|
||||||
|
context, input_tensor->dim_size(0) == length_tensor->dim_size(0),
|
||||||
|
::tensorflow::errors::InvalidArgument(
|
||||||
|
"`sequence_length` must have the same batch size as `input`."));
|
||||||
|
|
||||||
|
const auto& input_sentences = input_tensor->matrix<T>();
|
||||||
|
const auto& sequence_length = length_tensor->vec<int32>();
|
||||||
|
const int64 batch_size = input_tensor->dim_size(0);
|
||||||
|
const int max_sequence_length = input_tensor->dim_size(1);
|
||||||
|
|
||||||
|
Tensor* values_tensor = nullptr;
|
||||||
|
OP_REQUIRES_OK(context,
|
||||||
|
context->allocate_output(0, {batch_size}, &values_tensor));
|
||||||
|
auto values_tensor_output = values_tensor->vec<std::string>();
|
||||||
|
|
||||||
|
for (int64 i = 0; i < batch_size; ++i) {
|
||||||
|
OP_REQUIRES(context,
|
||||||
|
(sequence_length(i) >= 0 &&
|
||||||
|
sequence_length(i) <= max_sequence_length),
|
||||||
|
::tensorflow::errors::InvalidArgument(
|
||||||
|
"`sequence_length` is out-of-range."));
|
||||||
|
const std::vector<T> pieces(&input_sentences(i, 0),
|
||||||
|
&input_sentences(i, 0) + sequence_length(i));
|
||||||
|
OP_REQUIRES_OK(context, ToTFStatus(sentencepiece_processor_.Decode(
|
||||||
|
pieces, &values_tensor_output(i))));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
SentencePieceProcessor sentencepiece_processor_;
|
||||||
|
};
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
// The snake case of this variables are used as the function names.
|
||||||
|
constexpr char kGetPieceSizeOpName[] = "SentencepieceGetPieceSize";
|
||||||
|
constexpr char kPieceToIdOpName[] = "SentencepiecePieceToId";
|
||||||
|
constexpr char kIdToPieceOpName[] = "SentencepieceIdToPiece";
|
||||||
|
constexpr char kEncodeDenseOpName[] = "SentencepieceEncodeDense";
|
||||||
|
constexpr char kEncodeSparseOpName[] = "SentencepieceEncodeSparse";
|
||||||
|
constexpr char kDecodeOpName[] = "SentencepieceDecode";
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
REGISTER_OP(kGetPieceSizeOpName)
|
||||||
|
.Output("vocab_size: int32")
|
||||||
|
.Attr("model_file: string = ''")
|
||||||
|
.Attr("model_proto: string = ''")
|
||||||
|
.SetShapeFn([](InferenceContext* c) {
|
||||||
|
c->set_output(0, c->MakeShape({}));
|
||||||
|
return ::tensorflow::Status::OK();
|
||||||
|
});
|
||||||
|
|
||||||
|
REGISTER_KERNEL_BUILDER(Name(kGetPieceSizeOpName).Device(DEVICE_CPU),
|
||||||
|
SentencePieceGetPieceSizeOp);
|
||||||
|
|
||||||
|
REGISTER_OP(kPieceToIdOpName)
|
||||||
|
.Input("input: string")
|
||||||
|
.Output("values: int32")
|
||||||
|
.Attr("model_file: string = ''")
|
||||||
|
.Attr("model_proto: string = ''")
|
||||||
|
.SetShapeFn([](InferenceContext* c) {
|
||||||
|
c->set_output(0, c->input(0));
|
||||||
|
return ::tensorflow::Status::OK();
|
||||||
|
});
|
||||||
|
|
||||||
|
REGISTER_KERNEL_BUILDER(Name(kPieceToIdOpName).Device(DEVICE_CPU),
|
||||||
|
SentencePieceConvertPieceOp<std::string, int32>);
|
||||||
|
|
||||||
|
REGISTER_OP(kIdToPieceOpName)
|
||||||
|
.Input("input: int32")
|
||||||
|
.Output("values: string")
|
||||||
|
.Attr("model_file: string = ''")
|
||||||
|
.Attr("model_proto: string = ''")
|
||||||
|
.SetShapeFn([](InferenceContext* c) {
|
||||||
|
c->set_output(0, c->input(0));
|
||||||
|
return ::tensorflow::Status::OK();
|
||||||
|
});
|
||||||
|
|
||||||
|
REGISTER_KERNEL_BUILDER(Name(kIdToPieceOpName).Device(DEVICE_CPU),
|
||||||
|
SentencePieceConvertPieceOp<int32, std::string>);
|
||||||
|
|
||||||
|
REGISTER_OP(kEncodeDenseOpName)
|
||||||
|
.Attr("out_type: {int32, string} = DT_INT32")
|
||||||
|
.Input("input: string")
|
||||||
|
.Input("nbest_size: int32")
|
||||||
|
.Input("alpha: float")
|
||||||
|
.Output("values: out_type")
|
||||||
|
.Output("sequence_length: int32")
|
||||||
|
.Attr("model_file: string = ''")
|
||||||
|
.Attr("model_proto: string = ''")
|
||||||
|
.Attr("reverse: bool = false")
|
||||||
|
.Attr("add_bos: bool = false")
|
||||||
|
.Attr("add_eos: bool = false")
|
||||||
|
.SetShapeFn([](InferenceContext* c) {
|
||||||
|
ShapeHandle input, nbest, alpha;
|
||||||
|
TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &input));
|
||||||
|
TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(1), 1, &nbest));
|
||||||
|
TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(2), 1, &alpha));
|
||||||
|
DimensionHandle batch_size = c->Dim(input, 0);
|
||||||
|
if (c->Rank(nbest) == 1)
|
||||||
|
TF_RETURN_IF_ERROR(c->Merge(batch_size, c->Dim(nbest, 0), &batch_size));
|
||||||
|
if (c->Rank(alpha) == 1)
|
||||||
|
TF_RETURN_IF_ERROR(c->Merge(batch_size, c->Dim(alpha, 0), &batch_size));
|
||||||
|
c->set_output(0, c->MakeShape({batch_size, c->UnknownDim()}));
|
||||||
|
c->set_output(1, c->MakeShape({batch_size}));
|
||||||
|
return ::tensorflow::Status::OK();
|
||||||
|
});
|
||||||
|
|
||||||
|
REGISTER_KERNEL_BUILDER(Name(kEncodeDenseOpName)
|
||||||
|
.Device(DEVICE_CPU)
|
||||||
|
.TypeConstraint<int32>("out_type"),
|
||||||
|
SentencePieceEncodeDenseOp<int32>);
|
||||||
|
|
||||||
|
REGISTER_KERNEL_BUILDER(Name(kEncodeDenseOpName)
|
||||||
|
.Device(DEVICE_CPU)
|
||||||
|
.TypeConstraint<std::string>("out_type"),
|
||||||
|
SentencePieceEncodeDenseOp<std::string>);
|
||||||
|
|
||||||
|
REGISTER_OP(kEncodeSparseOpName)
|
||||||
|
.Attr("out_type: {int32, string} = DT_INT32")
|
||||||
|
.Input("input: string")
|
||||||
|
.Input("nbest_size: int32")
|
||||||
|
.Input("alpha: float")
|
||||||
|
.Output("indices: int64")
|
||||||
|
.Output("values: out_type")
|
||||||
|
.Output("dense_shape: int64")
|
||||||
|
.Attr("model_file: string = ''")
|
||||||
|
.Attr("model_proto: string = ''")
|
||||||
|
.Attr("reverse: bool = false")
|
||||||
|
.Attr("add_bos: bool = false")
|
||||||
|
.Attr("add_eos: bool = false")
|
||||||
|
.SetShapeFn([](InferenceContext* c) {
|
||||||
|
ShapeHandle input, nbest, alpha;
|
||||||
|
TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &input));
|
||||||
|
TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(1), 1, &nbest));
|
||||||
|
TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(2), 1, &alpha));
|
||||||
|
DimensionHandle batch_size = c->Dim(input, 0);
|
||||||
|
if (c->Rank(nbest) == 1)
|
||||||
|
TF_RETURN_IF_ERROR(c->Merge(batch_size, c->Dim(nbest, 0), &batch_size));
|
||||||
|
if (c->Rank(alpha) == 1)
|
||||||
|
TF_RETURN_IF_ERROR(c->Merge(batch_size, c->Dim(alpha, 0), &batch_size));
|
||||||
|
c->set_output(0, c->MakeShape({c->UnknownDim(), 2}));
|
||||||
|
c->set_output(1, c->MakeShape({c->UnknownDim()}));
|
||||||
|
c->set_output(2, c->MakeShape({2}));
|
||||||
|
return ::tensorflow::Status::OK();
|
||||||
|
});
|
||||||
|
|
||||||
|
REGISTER_KERNEL_BUILDER(Name(kEncodeSparseOpName)
|
||||||
|
.Device(DEVICE_CPU)
|
||||||
|
.TypeConstraint<int32>("out_type"),
|
||||||
|
SentencePieceEncodeSparseOp<int32>);
|
||||||
|
|
||||||
|
REGISTER_KERNEL_BUILDER(Name(kEncodeSparseOpName)
|
||||||
|
.Device(DEVICE_CPU)
|
||||||
|
.TypeConstraint<std::string>("out_type"),
|
||||||
|
SentencePieceEncodeSparseOp<std::string>);
|
||||||
|
|
||||||
|
REGISTER_OP(kDecodeOpName)
|
||||||
|
.Attr("T: {int32, string}")
|
||||||
|
.Input("input: T")
|
||||||
|
.Input("sequence_length: int32")
|
||||||
|
.Output("values: string")
|
||||||
|
.Attr("model_file: string = ''")
|
||||||
|
.Attr("model_proto: string = ''")
|
||||||
|
.Attr("reverse: bool = false")
|
||||||
|
.SetShapeFn([](InferenceContext* c) {
|
||||||
|
ShapeHandle input, sequence_length;
|
||||||
|
TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &input));
|
||||||
|
TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &sequence_length));
|
||||||
|
DimensionHandle batch_size = c->Dim(input, 0);
|
||||||
|
TF_RETURN_IF_ERROR(
|
||||||
|
c->Merge(batch_size, c->Dim(sequence_length, 0), &batch_size));
|
||||||
|
c->set_output(0, c->MakeShape({batch_size}));
|
||||||
|
return ::tensorflow::Status::OK();
|
||||||
|
});
|
||||||
|
|
||||||
|
REGISTER_KERNEL_BUILDER(
|
||||||
|
Name(kDecodeOpName).Device(DEVICE_CPU).TypeConstraint<int32>("T"),
|
||||||
|
SentencePieceDecodeOp<int32>);
|
||||||
|
|
||||||
|
REGISTER_KERNEL_BUILDER(
|
||||||
|
Name(kDecodeOpName).Device(DEVICE_CPU).TypeConstraint<std::string>("T"),
|
||||||
|
SentencePieceDecodeOp<std::string>);
|
||||||
|
} // namespace sentencepiece
|
46
tensorflow/setup.py
Executable file
46
tensorflow/setup.py
Executable file
@ -0,0 +1,46 @@
|
|||||||
|
#!/usr/bin/env python

# Copyright 2018 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.!

# Packaging script for the tf_sentencepiece wheel: ships the Python
# wrappers plus the prebuilt _sentencepiece_processor_ops.so kernel library.

from setuptools import setup
from setuptools import find_packages
import string  # NOTE(review): appears unused in this file -- confirm and remove.
import sys

# Make ./test importable so the `test_suite` entry below can resolve
# `tf_sentencepiece_test.suite` when running `python setup.py test`.
sys.path.append('./test')

setup(name = 'tf_sentencepiece',
      author = 'Taku Kudo',
      author_email='taku@google.com',
      description = 'SentencePiece Encode/Decode ops for TensorFlow',
      version='0.1.1',
      url = 'https://github.com/google/sentencepiece',
      license = 'Apache',
      platforms = 'Unix',
      # Exclude the test package from the wheel itself.
      packages=find_packages(exclude=['test']),
      # Bundle the compiled custom-op shared library with the package.
      package_data={'tf_sentencepiece': ['_sentencepiece_processor_ops.so']},
      classifiers = [
          'Development Status :: 5 - Production/Stable',
          'Environment :: Console',
          'Intended Audience :: Developers',
          'Intended Audience :: Science/Research',
          'License :: OSI Approved :: Apache Software License',
          'Operating System :: Unix',
          'Programming Language :: Python',
          'Topic :: Text Processing :: Linguistic',
          'Topic :: Software Development :: Libraries :: Python Modules'
      ],
      keywords='tensorflow machine learning sentencepiece NLP segmentation',
      test_suite = 'tf_sentencepiece_test.suite')
|
0
tensorflow/test/__init__.py
Normal file
0
tensorflow/test/__init__.py
Normal file
283
tensorflow/test/tf_sentencepiece_test.py
Executable file
283
tensorflow/test/tf_sentencepiece_test.py
Executable file
@ -0,0 +1,283 @@
|
|||||||
|
#!/usr/bin/python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import itertools as it
|
||||||
|
import os
|
||||||
|
import unittest
|
||||||
|
import tensorflow as tf
|
||||||
|
import sentencepiece as spm
|
||||||
|
import tf_sentencepiece as tfspm
|
||||||
|
|
||||||
|
class SentencePieceProcssorOpTest(unittest.TestCase):
  """End-to-end tests for the tf_sentencepiece TensorFlow ops.

  Each test compares the op outputs against the reference results produced
  by the plain `sentencepiece` Python bindings on the same model.
  """

  def _getSentencePieceModelFile(self):
    # Reuses the Japanese test model shipped with the python bindings.
    # NOTE(review): relative path -- tests must run from the tensorflow/ dir.
    return '../python/test/test_ja_model.model'

  def _getExpected(self, processor, reverse=False, add_bos=False,
                   add_eos=False, padding=''):
    """Computes reference pieces/ids/lengths with the spm processor.

    Returns (sentences, pieces, ids, seq_len) where pieces/ids are padded
    to the max sequence length (pieces with `padding`, ids with 0).
    """
    options = []
    if reverse:
      options.append('reverse')
    if add_bos:
      options.append('bos')
    if add_eos:
      options.append('eos')

    # Apply the same extra options the ops under test will use.
    processor.SetEncodeExtraOptions(':'.join(options))
    processor.SetDecodeExtraOptions(':'.join(options))

    sentences = ['Hello world.', 'I have a pen.',
                 'I saw a girl with a telescope.']
    pieces = []
    ids = []
    seq_len = []

    for s in sentences:
      x = processor.EncodeAsPieces(s)
      y = processor.EncodeAsIds(s)
      pieces.append(x)
      ids.append(y)
      seq_len.append(len(x))
      # Piece and id encodings must have identical lengths.
      self.assertEqual(len(x), len(y))

    # padding
    max_len = max(seq_len)
    pieces = [x + [padding] * (max_len - len(x)) for x in pieces]
    ids = [x + [0] * (max_len - len(x)) for x in ids]

    return sentences, pieces, ids, seq_len

  def testGetPieceSize(self):
    """piece_size op matches the processor's vocabulary size."""
    sentencepiece_model_file = self._getSentencePieceModelFile()
    processor = spm.SentencePieceProcessor()
    processor.Load(sentencepiece_model_file)

    with tf.Session():
      s = tfspm.piece_size(
          model_file=sentencepiece_model_file)
      self.assertEqual(s.eval(), processor.GetPieceSize())

  def testConvertPiece(self):
    """piece_to_id / id_to_piece round-trip on matrix, vector and scalar."""
    sentencepiece_model_file = self._getSentencePieceModelFile()
    processor = spm.SentencePieceProcessor()
    processor.Load(sentencepiece_model_file)
    (sentences, expected_pieces,
     expected_ids, expected_seq_len) = self._getExpected(processor,
                                                         padding='<unk>')

    with tf.Session():
      ids_matrix = tfspm.piece_to_id(
          tf.constant(expected_pieces),
          model_file=sentencepiece_model_file)
      ids_vec = tfspm.piece_to_id(
          tf.constant(expected_pieces[0]),
          model_file=sentencepiece_model_file)
      ids_scalar = tfspm.piece_to_id(
          tf.constant(expected_pieces[0][0]),
          model_file=sentencepiece_model_file)

      self.assertEqual(ids_matrix.eval().tolist(), expected_ids)
      self.assertEqual(ids_vec.eval().tolist(), expected_ids[0])
      self.assertEqual(ids_scalar.eval(), expected_ids[0][0])

      pieces_matrix = tfspm.id_to_piece(
          tf.constant(expected_ids),
          model_file=sentencepiece_model_file)
      pieces_vec = tfspm.id_to_piece(
          tf.constant(expected_ids[0]),
          model_file=sentencepiece_model_file)
      pieces_scalar = tfspm.id_to_piece(
          tf.constant(expected_ids[0][0]),
          model_file=sentencepiece_model_file)

      self.assertEqual(pieces_matrix.eval().tolist(), expected_pieces)
      self.assertEqual(pieces_vec.eval().tolist(), expected_pieces[0])
      self.assertEqual(pieces_scalar.eval(), expected_pieces[0][0])

  def testEncodeAndDecode(self):
    """Dense encode/decode round-trips for all reverse/bos/eos combos."""
    sentencepiece_model_file = self._getSentencePieceModelFile()
    processor = spm.SentencePieceProcessor()
    processor.Load(sentencepiece_model_file)

    with tf.Session():
      # Exercise every combination of the three boolean options.
      for reverse, add_bos, add_eos in list(it.product(
          (True, False), repeat=3)):
        (sentences, expected_pieces,
         expected_ids, expected_seq_len) = self._getExpected(
             processor, reverse, add_bos, add_eos)

        # Encode sentences into pieces/ids.
        s = tf.constant(sentences)
        pieces, seq_len1 = tfspm.encode(
            s, model_file=sentencepiece_model_file,
            reverse=reverse, add_bos=add_bos, add_eos=add_eos,
            out_type=tf.string)
        ids, seq_len2 = tfspm.encode(
            s, model_file=sentencepiece_model_file,
            reverse=reverse, add_bos=add_bos, add_eos=add_eos)

        self.assertEqual(pieces.eval().tolist(), expected_pieces)
        self.assertEqual(ids.eval().tolist(), expected_ids)
        self.assertEqual(seq_len1.eval().tolist(), expected_seq_len)
        self.assertEqual(seq_len2.eval().tolist(), expected_seq_len)

        # Decode pieces into sentences/ids.
        pieces = tf.constant(expected_pieces)
        ids = tf.constant(expected_ids)
        seq_len = tf.constant(expected_seq_len, dtype=tf.int32)
        decoded_sentences1 = tfspm.decode(
            pieces, seq_len, model_file=sentencepiece_model_file,
            reverse=reverse)
        decoded_sentences2 = tfspm.decode(
            ids, seq_len, model_file=sentencepiece_model_file,
            reverse=reverse)

        self.assertEqual(decoded_sentences1.eval().tolist(), sentences)
        self.assertEqual(decoded_sentences2.eval().tolist(), sentences)

  def testSampleEncodeAndDecode(self):
    """Sampled encodings still decode back to the original sentences."""
    sentencepiece_model_file = self._getSentencePieceModelFile()
    processor = spm.SentencePieceProcessor()
    processor.Load(sentencepiece_model_file)
    sentences, _, _, _ = self._getExpected(processor)

    with tf.Session():
      for n, a in [(-1, 0.1), (64, 0.1), (0, 0.0)]:
        # Round trip test.
        nbest_size = tf.constant(n)
        alpha = tf.constant(a)
        s = tf.constant(sentences)

        pieces, seq_len1 = tfspm.encode(
            s, nbest_size=nbest_size, alpha=alpha,
            model_file=sentencepiece_model_file, out_type=tf.string)
        ids, seq_len2 = tfspm.encode(
            s, nbest_size=nbest_size, alpha=alpha,
            model_file=sentencepiece_model_file)
        decoded_sentences1 = tfspm.decode(
            pieces, seq_len1, model_file=sentencepiece_model_file)
        decoded_sentences2 = tfspm.decode(
            ids, seq_len2, model_file=sentencepiece_model_file)

        self.assertEqual(decoded_sentences1.eval().tolist(), sentences)
        self.assertEqual(decoded_sentences2.eval().tolist(), sentences)

  def testEncodeAndDecodeSparse(self):
    """Sparse encode densifies to the same padded pieces/ids as reference."""
    sentencepiece_model_file = self._getSentencePieceModelFile()
    processor = spm.SentencePieceProcessor()
    processor.Load(sentencepiece_model_file)

    with tf.Session():
      for reverse, add_bos, add_eos in list(it.product(
          (True, False), repeat=3)):
        (sentences, expected_pieces, expected_ids,
         _) = self._getExpected(processor, reverse, add_bos, add_eos)

        # Encode sentences into sparse pieces/ids.
        s = tf.constant(sentences)
        pieces = tfspm.encode_sparse(
            s, model_file=sentencepiece_model_file,
            reverse=reverse, add_bos=add_bos, add_eos=add_eos,
            out_type=tf.string)
        ids = tfspm.encode_sparse(
            s, model_file=sentencepiece_model_file,
            reverse=reverse, add_bos=add_bos, add_eos=add_eos)
        # Default values match the padding used by _getExpected.
        pieces = tf.sparse_tensor_to_dense(pieces, default_value='')
        ids = tf.sparse_tensor_to_dense(ids, default_value=0)

        self.assertEqual(ids.eval().tolist(), expected_ids)
        self.assertEqual(pieces.eval().tolist(), expected_pieces)

  def testLoadModelProto(self):
    """Ops accept a serialized model proto instead of a file path."""
    # Makes a serialized model proto.
    model_proto = open(self._getSentencePieceModelFile(), 'rb').read()
    with tf.Session() as sess:
      sentences = ['Hello world.']
      a = tf.constant(sentences)
      sess.run(tfspm.encode(
          a, model_proto=model_proto,
          out_type=tf.string))

  def testInvalidModelPath(self):
    """A missing model file surfaces as NotFoundError at run time."""
    with tf.Session() as sess:
      with self.assertRaises(tf.errors.NotFoundError):
        sentences = ['Hello world.']
        a = tf.constant(sentences)
        sess.run(tfspm.encode(
            a, model_file='invalid path', out_type=tf.string))

  def testInvalidModelProto(self):
    """An unparsable model proto surfaces as InternalError at run time."""
    with tf.Session() as sess:
      with self.assertRaises(tf.errors.InternalError):
        sentences = ['Hello world.']
        a = tf.constant(sentences)
        sess.run(tfspm.encode(
            a, model_proto='invalid proto', out_type=tf.string))

  def testInvalidInput(self):
    """Shape validation: per-batch scalars/vectors OK, wrong ranks raise."""
    sentences = ['Hello world.', 'This is a test.']
    ids = [[0,1],[2,3]]
    model_file = self._getSentencePieceModelFile()
    with tf.Session() as sess:
      a = tf.constant(sentences)
      b = tf.constant(ids)

      # Vector alpha/nbest_size with one entry per sentence is accepted.
      alpha = tf.constant([1.0, 2.0])
      sess.run(tfspm.encode(
          a, model_file=model_file, alpha=alpha, name='foo'))

      nbest_size = tf.constant([1, 2], dtype=tf.int32)
      sess.run(tfspm.encode(
          a, model_file=model_file, nbest_size=nbest_size, name='foo'))

      # Scalars broadcast over the batch and are also accepted.
      alpha = tf.constant(1.0)
      sess.run(tfspm.encode(
          a, model_file=model_file, alpha=alpha, name='foo'))

      nbest_size = tf.constant(10, dtype=tf.int32)
      sess.run(tfspm.encode(
          a, model_file=model_file, nbest_size=nbest_size, name='foo'))

      sess.run(tfspm.decode(
          b, sequence_length=tf.constant([2, 2]), model_file=model_file))

      # Vector length mismatching the batch size must raise.
      with self.assertRaises(ValueError):
        a = tf.constant(sentences)
        alpha = tf.constant([1.0, 2.0, 3.0])
        sess.run(tfspm.encode(
            a, model_file=model_file, alpha=alpha))
      with self.assertRaises(ValueError):
        a = tf.constant(sentences)
        nbest_size = tf.constant([1, 2, 3], dtype=tf.int32)
        sess.run(tfspm.encode(
            a, model_file=model_file, nbest_size=nbest_size))
      # Rank-2 alpha/nbest_size must raise.
      with self.assertRaises(ValueError):
        a = tf.constant(sentences)
        alpha = tf.constant([[1.0], [2.0]])
        sess.run(tfspm.encode(
            a, model_file=model_file, alpha=alpha))
      with self.assertRaises(ValueError):
        a = tf.constant(sentences)
        nbest_size = tf.constant([[1], [2]], dtype=tf.int32)
        sess.run(tfspm.encode(
            a, model_file=model_file, nbest_size=nbest_size))
      # NOTE(review): these two decode calls pass `a` (the string tensor),
      # not `b` as the rebinding suggests -- likely intended `b`; verify.
      with self.assertRaises(ValueError):
        b = tf.constant(ids)
        sess.run(tfspm.decode(
            a, sequence_length=2, model_file=model_file))
      with self.assertRaises(ValueError):
        b = tf.constant(ids)
        sess.run(tfspm.decode(
            a, sequence_length=tf.constant([2, 2, 2]),
            model_file=model_file))
|
||||||
|
|
||||||
|
|
||||||
|
def suite():
  """Builds the TestSuite referenced by setup.py's `test_suite` hook."""
  # Use a name that does not shadow this function.
  collected = unittest.TestSuite()
  collected.addTests(unittest.makeSuite(SentencePieceProcssorOpTest))
  return collected
|
||||||
|
|
||||||
|
|
||||||
|
# Allow running this file directly: `python tf_sentencepiece_test.py`.
if __name__ == '__main__':
  unittest.main()
|
5
tensorflow/tf_sentencepiece/__init__.py
Normal file
5
tensorflow/tf_sentencepiece/__init__.py
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
from __future__ import absolute_import
|
||||||
|
from __future__ import division
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
|
from tf_sentencepiece.sentencepiece_processor_ops import *
|
BIN
tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so
Executable file
BIN
tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so
Executable file
Binary file not shown.
192
tensorflow/tf_sentencepiece/sentencepiece_processor_ops.py
Normal file
192
tensorflow/tf_sentencepiece/sentencepiece_processor_ops.py
Normal file
@ -0,0 +1,192 @@
|
|||||||
|
# Copyright 2018 Google Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.!
|
||||||
|
|
||||||
|
r"""Ops for SentencePiece Encoding/Decoding."""
|
||||||
|
|
||||||
|
# TODO(taku): Implements n-best output
|
||||||
|
|
||||||
|
from __future__ import absolute_import
|
||||||
|
from __future__ import division
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
|
import os
|
||||||
|
import tensorflow as tf
|
||||||
|
|
||||||
|
# Loads the custom-op shared library bundled next to this module; all
# public functions below are thin wrappers around the ops it defines.
_gen_sentencepiece_processor_op = tf.load_op_library(
    os.path.join(os.path.dirname(__file__), '_sentencepiece_processor_ops.so'))
|
||||||
|
|
||||||
|
|
||||||
|
def piece_size(model_file=None, model_proto=None, name=None):
  """Returns the piece size (vocabulary size).

  Args:
    model_file: The sentencepiece model file path.
    model_proto: The sentencepiece model serialized proto.
      Either `model_file` or `model_proto` must be set.
    name: The name argument that is passed to the op function.

  Returns:
    A scalar representing the vocabulary size.
  """
  # Forward everything to the underlying custom op.
  op = _gen_sentencepiece_processor_op.sentencepiece_get_piece_size
  return op(model_file=model_file, model_proto=model_proto, name=name)
|
||||||
|
|
||||||
|
|
||||||
|
def piece_to_id(input, model_file=None, model_proto=None, name=None):
  """Converts piece into vocabulary id.

  Args:
    input: An arbitrary tensor of string.
    model_file: The sentencepiece model file path.
    model_proto: The sentencepiece model serialized proto.
      Either `model_file` or `model_proto` must be set.
    name: The name argument that is passed to the op function.

  Returns:
    A tensor of int32 with the same shape as input.
  """
  # Forward everything to the underlying custom op.
  op = _gen_sentencepiece_processor_op.sentencepiece_piece_to_id
  return op(input, model_file=model_file, model_proto=model_proto, name=name)
|
||||||
|
|
||||||
|
|
||||||
|
def id_to_piece(input, model_file=None, model_proto=None, name=None):
  """Converts vocabulary id into piece.

  Args:
    input: An arbitrary tensor of int32.
    model_file: The sentencepiece model file path.
    model_proto: The sentencepiece model serialized proto.
      Either `model_file` or `model_proto` must be set.
    name: The name argument that is passed to the op function.

  Returns:
    A tensor of string with the same shape as input.
  """
  # Forward everything to the underlying custom op.
  op = _gen_sentencepiece_processor_op.sentencepiece_id_to_piece
  return op(input, model_file=model_file, model_proto=model_proto, name=name)
|
||||||
|
|
||||||
|
|
||||||
|
def encode_dense(input_sentences, nbest_size=0, alpha=1.0,
                 model_file=None, model_proto=None,
                 reverse=False, add_bos=False, add_eos=False,
                 out_type=tf.int32, name=None):
  """Encodes sentences into pieces in dense tensor format.

  Shorter rows are padded so that all rows share the max sequence length;
  `sequence_length` reports each row's unpadded length.

  Args:
    input_sentences: A 1D string tensor of arbitrary size holding the raw
      text of input sentences.
    nbest_size: A scalar or 1D tensor for sampling.
      nbest_size = {0,1}: No sampling is performed.
      nbest_size > 1: samples from the nbest_size results.
      nbest_size < 0: assuming that nbest_size is infinite
        and samples from the all hypothesis (lattice) using
        forward-filtering-and-backward-sampling algorithm.
    alpha: A scalar or 1D tensor for a smoothing parameter.
      Inverse temperature for probability rescaling.
    model_file: The sentencepiece model file path.
    model_proto: The sentencepiece model serialized proto.
      Either `model_file` or `model_proto` must be set.
    reverse: Reverses the tokenized sequence (Default = false)
    add_bos: Add <s> to the result (Default = false)
    add_eos: Add </s> to the result (Default = false)
      <s>/</s> is added after reversing (if enabled).
    out_type: output type. tf.int32 or tf.string (Default = tf.int32)
      Setting tf.int32 directly encodes the string into an id sequence.
    name: The name argument that is passed to the op function.

  Returns:
    pieces: A dense 2D tensor representing the tokenized sentences.
    sequence_length: A 1D tensor representing the length of pieces.
  """

  return _gen_sentencepiece_processor_op.sentencepiece_encode_dense(
      input_sentences, nbest_size=nbest_size, alpha=alpha,
      model_file=model_file, model_proto=model_proto,
      reverse=reverse, add_bos=add_bos, add_eos=add_eos,
      out_type=out_type, name=name)
|
||||||
|
|
||||||
|
# Public alias: `encode` is the dense encoder (sparse output is a
# separate function, `encode_sparse`).
encode = encode_dense
|
||||||
|
|
||||||
|
|
||||||
|
def encode_sparse(input_sentences, nbest_size=0, alpha=1.0,
                  model_file=None, model_proto=None,
                  reverse=False, add_bos=False, add_eos=False,
                  out_type=tf.int32, name=None):
  """Encodes sentences into pieces in sparse tensor format.

  Args:
    input_sentences: A 1D string tensor of arbitrary size holding the raw
      text of input sentences.
    nbest_size: A scalar or 1D tensor for sampling.
      nbest_size = {0,1}: No sampling is performed.
      nbest_size > 1: samples from the nbest_size results.
      nbest_size < 0: assuming that nbest_size is infinite
        and samples from the all hypothesis (lattice) using
        forward-filtering-and-backward-sampling algorithm.
    alpha: A scalar or 1D tensor for a smoothing parameter.
      Inverse temperature for probability rescaling.
    model_file: The sentencepiece model file path.
    model_proto: The sentencepiece model serialized proto.
      Either `model_file` or `model_proto` must be set.
    reverse: Reverses the tokenized sequence (Default = false)
    add_bos: Add <s> to the result (Default = false)
    add_eos: Add </s> to the result (Default = false)
      <s>/</s> is added after reversing (if enabled).
    out_type: output type. tf.int32 or tf.string (Default = tf.int32)
      Setting tf.int32 directly encodes the string into an id sequence.
    name: The name argument that is passed to the op function.

  Returns:
    pieces: A sparse 2D tensor representing the tokenized sentences.
  """

  # The custom op returns the raw SparseTensor components; reassemble
  # them into a tf.SparseTensor for the caller.
  indices, values, dense_shape = (
      _gen_sentencepiece_processor_op.sentencepiece_encode_sparse(
          input_sentences, nbest_size=nbest_size, alpha=alpha,
          model_file=model_file, model_proto=model_proto,
          reverse=reverse, add_bos=add_bos, add_eos=add_eos,
          out_type=out_type, name=name))
  return tf.SparseTensor(indices, values, dense_shape)
|
||||||
|
|
||||||
|
|
||||||
|
def decode(pieces, sequence_length, model_file=None, model_proto=None,
           reverse=False, name=None):
  """Decode pieces into post-processed text.

  Args:
    pieces: A 2D int32 or string tensor [batch_size x max_length] of
      encoded sequences.
    sequence_length: A 1D int32 tensor [batch_size] representing the
      length of pieces.
    model_file: The sentencepiece model file path.
    model_proto: The sentencepiece model serialized proto.
      Either `model_file` or `model_proto` must be set.
    reverse: Reverses the tokenized sequence (Default = false)
    name: The name argument that is passed to the op function.

  Returns:
    text: A 1D string tensor of decoded string.
  """

  return _gen_sentencepiece_processor_op.sentencepiece_decode(
      pieces, sequence_length, model_file=model_file,
      model_proto=model_proto, reverse=reverse, name=name)
|
||||||
|
|
||||||
|
|
||||||
|
# Tokenization ops produce discrete outputs; register them as
# non-differentiable so gradient construction fails fast instead of
# silently producing wrong gradients.
tf.NotDifferentiable('SentencepieceGetPieceSize')
tf.NotDifferentiable('SentencepieceIdToPiece')
tf.NotDifferentiable('SentencepiecePieceToId')
tf.NotDifferentiable('SentencepieceEncodeDense')
tf.NotDifferentiable('SentencepieceEncodeSparse')
tf.NotDifferentiable('SentencepieceDecode')
|
Loading…
Reference in New Issue
Block a user