mirror of
https://github.com/google/sentencepiece.git
synced 2024-09-19 06:40:00 +03:00
remove tensorflow module
This commit is contained in:
parent
9cf136582d
commit
8f56103758
18
.travis.yml
18
.travis.yml
@ -8,13 +8,13 @@ matrix:
|
||||
env: IMAGE=i386/ubuntu:rolling COMMAND=build_linux_gcc_ubuntu_i386
|
||||
services: docker
|
||||
- os: linux
|
||||
env: IMAGE=ubuntu:bionic COMMAND=build_linux_gcc_ubuntu_no_tf
|
||||
env: IMAGE=ubuntu:bionic COMMAND=build_linux_gcc_ubuntu
|
||||
services: docker
|
||||
- os: linux
|
||||
env: IMAGE=ubuntu:xenial COMMAND=build_linux_gcc_ubuntu_no_tf
|
||||
env: IMAGE=ubuntu:xenial COMMAND=build_linux_gcc_ubuntu
|
||||
services: docker
|
||||
- os: linux
|
||||
env: IMAGE=ubuntu:trusty COMMAND=build_linux_gcc_ubuntu_no_tf
|
||||
env: IMAGE=ubuntu:trusty COMMAND=build_linux_gcc_ubuntu
|
||||
services: docker
|
||||
- os: linux
|
||||
env: IMAGE=debian:stable COMMAND=build_linux_gcc_debian
|
||||
@ -37,12 +37,6 @@ matrix:
|
||||
- $TRAVIS_BUILD_DIR/python/make_py_wheel.sh ${IMAGE}
|
||||
- if [[ "$RELEASE_FILES" != "" ]]; then ls -l $RELEASE_FILES ; fi
|
||||
services: docker
|
||||
- os: linux
|
||||
env: IMAGE=x86_64 COMMAND=make_py_wheel_tf RELEASE_FILES="$TRAVIS_BUILD_DIR/tensorflow/dist/*.whl"
|
||||
script:
|
||||
- $TRAVIS_BUILD_DIR/tensorflow/make_py_wheel.sh
|
||||
- if [[ "$RELEASE_FILES" != "" ]]; then ls -l $RELEASE_FILES ; fi
|
||||
services: docker
|
||||
- os: osx
|
||||
osx_image: xcode9.4
|
||||
env: IMAGE=native COMMAND=build_osx
|
||||
@ -52,12 +46,6 @@ matrix:
|
||||
script:
|
||||
- $TRAVIS_BUILD_DIR/python/make_py_wheel_mac.sh
|
||||
- if [[ "$RELEASE_FILES" != "" ]]; then ls -l $RELEASE_FILES ; fi
|
||||
- os: osx
|
||||
osx_image: xcode9.4
|
||||
env: IMAGE=native COMMAND=make_py_wheel_mac_tf RELEASE_FILES="$TRAVIS_BUILD_DIR/tensorflow/dist/*.whl"
|
||||
script:
|
||||
- $TRAVIS_BUILD_DIR/tensorflow/make_py_wheel_mac.sh
|
||||
- if [[ "$RELEASE_FILES" != "" ]]; then ls -l $RELEASE_FILES ; fi
|
||||
script:
|
||||
- $TRAVIS_BUILD_DIR/test.sh ${IMAGE} ${COMMAND}
|
||||
- if [[ "$RELEASE_FILES" != "" ]]; then ls -l $RELEASE_FILES ; fi
|
||||
|
@ -1,88 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Copyright 2018 Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
set -e # exit immediately on error
|
||||
set -x # display all commands
|
||||
|
||||
CMAKE_VERSION=3.12.0
|
||||
|
||||
# Builds the wheels inside a Docker container.
#
# Arguments:
#   $1: Docker image to pull and start.
#   $2: space-separated TensorFlow versions, forwarded to
#       "./make_py_wheel.sh native" inside the container.
# Returns:
#   The exit status of the in-container build.
run_docker() {
  local image="$1"
  local tf_versions="$2"
  local status=0

  cd "$(dirname "$0")"
  docker pull "${image}"
  docker run --rm -ti --name tf_sentencepiece \
    -v "$(pwd)/../:/sentencepiece" -w /sentencepiece/tensorflow \
    -td "${image}" /bin/bash

  # Capture the status instead of letting `set -e` abort here: the container
  # must be stopped even when the build fails, otherwise the fixed name
  # "tf_sentencepiece" stays taken and the next `docker run` cannot start.
  # ${tf_versions} is intentionally expanded into the command string so each
  # version becomes a separate argument inside the container.
  docker exec tf_sentencepiece bash -c "./make_py_wheel.sh native ${tf_versions}" || status=$?
  docker stop tf_sentencepiece
  return "${status}"
}
|
||||
|
||||
# Builds and tests the TensorFlow op library for one TensorFlow release.
#
# Arguments:
#   $1: TensorFlow version to install with pip3 (e.g. "2.1.0").
# Outputs:
#   tf_sentencepiece/_sentencepiece_processor_ops.so.<tf version>
build_tf_wrapper() {
  local pkg_name="==$1"

  pip3 install "tensorflow${pkg_name}" --upgrade

  # Ask the installed TensorFlow for its compile/link flags. The flag lists
  # are stored as arrays so each flag is passed as a separate word.
  TF_CFLAGS=( $(python3 -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_compile_flags()))') )
  TF_LFLAGS=( $(python3 -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))') )
  TF_VERSION=$(python3 -c 'import tensorflow as tf; print(tf.__version__)')

  echo "TF_CFLAGS=${TF_CFLAGS[*]}"
  echo "TF_LFLAGS=${TF_LFLAGS[*]}"
  echo "TF_VERSION=${TF_VERSION}"

  local output="tf_sentencepiece/_sentencepiece_processor_ops.so.${TF_VERSION}"

  # libsentencepiece.a is linked with --whole-archive so that all of its
  # objects end up in the shared library.
  g++ -std=c++11 -shared \
    -I../../src \
    -D_USE_TF_STRING_VIEW \
    -fPIC "${TF_CFLAGS[@]}" -O2 \
    -Wl,--whole-archive \
    /usr/local/lib/libsentencepiece.a \
    -Wl,--no-whole-archive \
    sentencepiece_processor_ops.cc \
    -o "${output}" \
    "${TF_LFLAGS[@]}"

  strip "${output}"

  python3 setup.py test
}
|
||||
|
||||
# Builds and installs the static sentencepiece library, builds the op library
# for every TensorFlow version given as argument, then packages the wheel and
# the source distribution.
#
# Arguments:
#   $@: TensorFlow versions to build against.
build() {
  rm -fr build
  mkdir -p build

  # Configure, compile and install sentencepiece from a scratch directory.
  # The subshell keeps the caller's working directory untouched.
  (
    cd build
    cmake ../.. -DSPM_ENABLE_SHARED=OFF -DSPM_ENABLE_TENSORFLOW_SHARED=ON
    make -j4
    make install
  )

  # $@ is deliberately left unquoted: callers may pass the versions as a
  # single space-separated string.
  local tf_version
  for tf_version in $@; do
    build_tf_wrapper $tf_version
  done

  python3 setup.py bdist_wheel --universal --plat-name=manylinux1_x86_64
  python3 setup.py sdist
  rm -fr build tf_sentencepiece.egg-info
}
|
||||
|
||||
# Entry point.
#   "native <versions...>": build directly on this machine.
#   anything else:          build inside the TensorFlow custom-op containers.
case "$1" in
  native)
    shift
    build $@
    ;;
  *)
    # TF<=1.14 is not supported because an API compatibility issue is not fixed.
    # run_docker tensorflow/tensorflow:custom-op-ubuntu14 "1.13.1 1.13.2 1.14.0"
    run_docker tensorflow/tensorflow:custom-op-ubuntu16 "1.15.0 1.15.2 2.0.0 2.0.1"
    run_docker tensorflow/tensorflow:2.1.0-custom-op-ubuntu16 "2.1.0 2.2.0"
    ;;
esac
|
@ -1,116 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Copyright 2018 Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
set -e # exit immediately on error
|
||||
set -x # display all commands
|
||||
|
||||
# Builds the TensorFlow op library on macOS for one TensorFlow release.
#
# Arguments:
#   $1: TensorFlow version to install (e.g. "2.1.0"); when empty, pip
#       installs whatever version it resolves by default.
# Outputs:
#   tf_sentencepiece/_sentencepiece_processor_ops.so.<tf version>
build_tf_wrapper() {
  # Local and explicitly reset, so a version from a previous call is never
  # reused when this call is made without an argument.
  local pkg_name=""
  if [ "$1" != "" ]; then
    pkg_name="==$1"
  fi

  # Builds _sentencepiece_processor_ops.so
  pip install "tensorflow${pkg_name}" --upgrade --no-cache-dir -I

  # numpy is uninstalled repeatedly before being reinstalled
  # (presumably several copies can be layered on top of each other -- TODO
  # confirm); failures are ignored on purpose.
  local attempt
  for attempt in 1 2 3; do
    pip uninstall numpy -y || true
  done
  pip install numpy --upgrade --no-cache-dir -I

  TF_CFLAGS=( $(python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_compile_flags()))') )
  TF_LFLAGS=( $(python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))') )
  TF_VERSION=$(python -c 'import tensorflow as tf; print(tf.__version__)')

  # Rewrite "-l:libNAME.<1|2>.dylib" into "-lNAME". The dots are escaped so
  # only a literal ".1.dylib" / ".2.dylib" suffix is removed.
  TF_LFLAGS2=$(echo -n "${TF_LFLAGS[@]}" | sed -e 's/-l:lib/-l/' -e 's/\.[12]\.dylib//')

  local output="tf_sentencepiece/_sentencepiece_processor_ops.so.${TF_VERSION}"

  # ${TF_LFLAGS2} is intentionally unquoted: it holds several flags that must
  # be split into separate words.
  g++ -std=c++11 -shared -undefined dynamic_lookup \
    -I../../src \
    -D_USE_TF_STRING_VIEW \
    -fPIC "${TF_CFLAGS[@]}" -O2 \
    -D_GLIBCXX_USE_CXX11_ABI=0 \
    -Wl,-force_load \
    /usr/local/lib/libsentencepiece.a \
    sentencepiece_processor_ops.cc \
    -o "${output}" \
    ${TF_LFLAGS2}

  strip -x "${output}"
}
|
||||
|
||||
# Installs Python 3.7, builds and installs the static sentencepiece library,
# builds the op library for each supported TensorFlow version and packages the
# macOS wheel and source distribution.
build() {
  VERSION="3.7"
  URL="https://www.python.org/ftp/python/3.7.0/python-3.7.0-macosx10.6.pkg"
  INSTALL_PATH="/Library/Frameworks/Python.framework/Versions/${VERSION}/bin"
  CURRENT_PATH=${PATH}

  curl -L -o python.pkg "${URL}"
  sudo installer -pkg python.pkg -target /

  # Expose the python3 tools under their unversioned names. -f makes the
  # step repeatable: plain `ln -s` fails (and aborts under `set -e`) when the
  # links already exist from an earlier run.
  if [ -f "${INSTALL_PATH}/python3" ]; then
    ln -sf "${INSTALL_PATH}/python3" "${INSTALL_PATH}/python"
    ln -sf "${INSTALL_PATH}/python3-config" "${INSTALL_PATH}/python-config"
    ln -sf "${INSTALL_PATH}/pip3" "${INSTALL_PATH}/pip"
  fi

  curl -L -O https://bootstrap.pypa.io/get-pip.py

  export PATH="${INSTALL_PATH}:${CURRENT_PATH}"
  ls -l "${INSTALL_PATH}"
  which python
  which pip
  python --version
  sudo python get-pip.py --no-setuptools --no-wheel --ignore-installed
  pip install --upgrade setuptools
  pip install wheel
  pip install delocate

  cd tensorflow
  rm -fr build
  mkdir -p build
  cd build

  # Install sentencepiece
  cmake ../.. -DSPM_ENABLE_SHARED=OFF -DSPM_ENABLE_TENSORFLOW_SHARED=ON
  make -j4 VERBOSE=1
  make install
  cd ..

  # Remove pre-installed Linux so files.
  rm -f tf_sentencepiece/*.so.*

  build_tf_wrapper "2.2.0"
  build_tf_wrapper "2.1.0"
  # build_tf_wrapper "2.0.1"
  build_tf_wrapper "2.0.0"
  # build_tf_wrapper "1.15.2"
  build_tf_wrapper "1.15.0"
  # build_tf_wrapper "1.14.0"
  # build_tf_wrapper "1.13.2"
  # build_tf_wrapper "1.13.1"

  # Builds the macOS Python wheel package.
  # Platform name is determined by the tensorflow pip package.
  # TODO(taku): Automatically detect the platform name of tensorflow-pip
  # PLAT_NAME=$(python -c 'import distutils.util; print(distutils.util.get_platform())')
  PLAT_NAME=macosx_10_10_x86_64
  python setup.py bdist_wheel --universal --plat-name="${PLAT_NAME}"
  # python setup.py test
  python setup.py sdist

  rm -fr build tf_sentencepiece.egg-info tmp
}

build
|
@ -1,652 +0,0 @@
|
||||
// Copyright 2018 Google Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <mutex>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include "sentencepiece_processor.h"
|
||||
#include "tensorflow/core/framework/op.h"
|
||||
#include "tensorflow/core/framework/op_kernel.h"
|
||||
#include "tensorflow/core/framework/shape_inference.h"
|
||||
#include "tensorflow/core/framework/tensor_shape.h"
|
||||
#include "tensorflow/core/lib/hash/hash.h"
|
||||
|
||||
typedef int int32;
|
||||
typedef long long int int64;
|
||||
typedef unsigned long long int uint64;
|
||||
|
||||
namespace sentencepiece {
|
||||
using ::tensorflow::DEVICE_CPU;
|
||||
using ::tensorflow::Hash64;
|
||||
using ::tensorflow::OpKernel;
|
||||
using ::tensorflow::OpKernelConstruction;
|
||||
using ::tensorflow::OpKernelContext;
|
||||
using ::tensorflow::Tensor;
|
||||
using ::tensorflow::TensorShapeUtils;
|
||||
using ::tensorflow::tstring;
|
||||
using ::tensorflow::shape_inference::DimensionHandle;
|
||||
using ::tensorflow::shape_inference::InferenceContext;
|
||||
using ::tensorflow::shape_inference::ShapeHandle;
|
||||
|
||||
namespace {
|
||||
|
||||
// A utility function to convert sentencepiece::util::Status to
|
||||
// ::tensorflow::Status
|
||||
// Translates a sentencepiece status into its TensorFlow counterpart. An OK
// status maps to a default-constructed ::tensorflow::Status; otherwise the
// numeric code is cast to ::tensorflow::error::Code and the message is copied.
::tensorflow::Status ToTFStatus(const sentencepiece::util::Status& s) {
  if (!s.ok()) {
    const auto tf_code = static_cast<::tensorflow::error::Code>(s.code());
    return ::tensorflow::Status(tf_code,
                                ::tensorflow::string(s.error_message()));
  }
  return ::tensorflow::Status();
}
|
||||
|
||||
// Global cache to reuse SentencePieceProcessor with the same
|
||||
// model file or model proto. The instance is managed with shared_ptr so
|
||||
// the instance is deleted when no client is using it (refcount is zero).
|
||||
class SentencePieceProcessorCache {
|
||||
public:
|
||||
std::shared_ptr<SentencePieceProcessor> get(
|
||||
const std::string key, bool is_proto,
|
||||
sentencepiece::util::Status* status) {
|
||||
std::lock_guard<std::mutex> l(mutex_);
|
||||
|
||||
const uint64 fp = Hash64(key.data(), key.size());
|
||||
auto sp = data_[fp].lock();
|
||||
|
||||
if (sp) {
|
||||
*status = sp->status();
|
||||
return sp;
|
||||
}
|
||||
|
||||
sp = std::make_shared<SentencePieceProcessor>();
|
||||
*status = is_proto ? sp->LoadFromSerializedProto(key) : sp->Load(key);
|
||||
if (!status->ok()) return nullptr;
|
||||
|
||||
data_[fp] = sp;
|
||||
return sp;
|
||||
}
|
||||
|
||||
private:
|
||||
std::mutex mutex_;
|
||||
std::unordered_map<uint64, std::weak_ptr<SentencePieceProcessor>> data_;
|
||||
};
|
||||
|
||||
class SentencePieceBaseOp : public OpKernel {
|
||||
public:
|
||||
explicit SentencePieceBaseOp(OpKernelConstruction* context)
|
||||
: OpKernel(context) {
|
||||
std::string model_file_attr, model_proto_attr;
|
||||
OP_REQUIRES_OK(context, context->GetAttr("model_file", &model_file_attr));
|
||||
OP_REQUIRES_OK(context, context->GetAttr("model_proto", &model_proto_attr));
|
||||
|
||||
// Initializes global cache.
|
||||
static SentencePieceProcessorCache* cache = new SentencePieceProcessorCache;
|
||||
sentencepiece::util::Status status;
|
||||
|
||||
OP_REQUIRES(context,
|
||||
((model_proto_attr.empty() && !model_file_attr.empty()) ||
|
||||
(!model_proto_attr.empty() && model_file_attr.empty())),
|
||||
::tensorflow::errors::InvalidArgument(
|
||||
"Either `model_proto` or `model_file` must be set."));
|
||||
|
||||
if (!model_file_attr.empty()) {
|
||||
sentencepiece_processor_ = cache->get(model_file_attr, false, &status);
|
||||
} else {
|
||||
// Loads serialized sentencepiece model proto to enable embedding the
|
||||
// relatively small sentencepiece model proto into the tensorflow graph
|
||||
// such that the tensorflow graph is self-contained.
|
||||
sentencepiece_processor_ = cache->get(model_proto_attr, true, &status);
|
||||
}
|
||||
|
||||
OP_REQUIRES_OK(context, ToTFStatus(status));
|
||||
OP_REQUIRES(context, sentencepiece_processor_,
|
||||
::tensorflow::errors::InvalidArgument(
|
||||
"Failed to initialize SentencePieceProcessor"));
|
||||
|
||||
// Sets extra options to add <s>, </s>.
|
||||
auto has_attribute = [&context](const std::string& name) {
|
||||
bool flag = false;
|
||||
context->GetAttr(name, &flag).IgnoreError();
|
||||
return flag;
|
||||
};
|
||||
|
||||
if (has_attribute("add_bos")) {
|
||||
bos_id_ = sentencepiece_processor_->bos_id();
|
||||
OP_REQUIRES(context, bos_id_ >= 0,
|
||||
::tensorflow::errors::InvalidArgument(
|
||||
"`bos_id` is not defined in model"));
|
||||
}
|
||||
|
||||
if (has_attribute("add_eos")) {
|
||||
eos_id_ = sentencepiece_processor_->eos_id();
|
||||
OP_REQUIRES(context, eos_id_ >= 0,
|
||||
::tensorflow::errors::InvalidArgument(
|
||||
"`eos_id` is not defined in model"));
|
||||
}
|
||||
|
||||
reverse_ = has_attribute("reverse");
|
||||
|
||||
pad_id_ = sentencepiece_processor_->pad_id();
|
||||
if (pad_id_ == -1) pad_id_ = sentencepiece_processor_->unk_id();
|
||||
}
|
||||
|
||||
protected:
|
||||
void GetPad(int32* pad) const { *pad = pad_id_; }
|
||||
|
||||
void GetPad(tstring* pad) const {
|
||||
pad->clear();
|
||||
if (sentencepiece_processor_ && pad_id_ >= 0 &&
|
||||
pad_id_ != sentencepiece_processor_->unk_id())
|
||||
*pad = sentencepiece_processor_->IdToPiece(pad_id_);
|
||||
}
|
||||
|
||||
std::shared_ptr<SentencePieceProcessor> sentencepiece_processor_;
|
||||
int bos_id_ = -1;
|
||||
int eos_id_ = -1;
|
||||
int pad_id_ = -1;
|
||||
bool reverse_ = false;
|
||||
};
|
||||
} // namespace
|
||||
|
||||
class SentencePieceGetPieceSizeOp : public SentencePieceBaseOp {
|
||||
public:
|
||||
explicit SentencePieceGetPieceSizeOp(OpKernelConstruction* context)
|
||||
: SentencePieceBaseOp(context) {}
|
||||
|
||||
void Compute(OpKernelContext* context) override {
|
||||
Tensor* vocab_size_tensor = nullptr;
|
||||
OP_REQUIRES_OK(context,
|
||||
context->allocate_output(0, {}, &vocab_size_tensor));
|
||||
vocab_size_tensor->scalar<int32>()() =
|
||||
sentencepiece_processor_->GetPieceSize();
|
||||
}
|
||||
};
|
||||
|
||||
template <typename S, typename T>
|
||||
class SentencePieceConvertPieceOp : public SentencePieceBaseOp {
|
||||
public:
|
||||
explicit SentencePieceConvertPieceOp(OpKernelConstruction* context)
|
||||
: SentencePieceBaseOp(context) {}
|
||||
|
||||
void Compute(OpKernelContext* context) override {
|
||||
const Tensor* input_tensor = nullptr;
|
||||
OP_REQUIRES_OK(context, context->input("input", &input_tensor));
|
||||
|
||||
Tensor* output_tensor = nullptr;
|
||||
OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor->shape(),
|
||||
&output_tensor));
|
||||
for (int i = 0; i < input_tensor->NumElements(); ++i)
|
||||
output_tensor->flat<T>()(i) = Convert(input_tensor->flat<S>()(i));
|
||||
}
|
||||
|
||||
int32 Convert(const std::string& piece) const {
|
||||
return sentencepiece_processor_->PieceToId(piece);
|
||||
}
|
||||
|
||||
std::string Convert(int32 id) const {
|
||||
if (id >= 0 && id < sentencepiece_processor_->GetPieceSize()) {
|
||||
return sentencepiece_processor_->IdToPiece(id);
|
||||
}
|
||||
return "";
|
||||
}
|
||||
};
|
||||
|
||||
class SentencePieceGetPieceTypeOp : public SentencePieceBaseOp {
|
||||
public:
|
||||
explicit SentencePieceGetPieceTypeOp(OpKernelConstruction* context)
|
||||
: SentencePieceBaseOp(context) {
|
||||
OP_REQUIRES_OK(context, context->GetAttr("piece_type", &piece_type_));
|
||||
}
|
||||
|
||||
void Compute(OpKernelContext* context) override {
|
||||
const Tensor* input_tensor = nullptr;
|
||||
OP_REQUIRES_OK(context, context->input("input", &input_tensor));
|
||||
|
||||
Tensor* output_tensor = nullptr;
|
||||
OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor->shape(),
|
||||
&output_tensor));
|
||||
|
||||
for (int i = 0; i < input_tensor->NumElements(); ++i) {
|
||||
const int id = input_tensor->flat<int32>()(i);
|
||||
switch (piece_type_) {
|
||||
case 0:
|
||||
output_tensor->flat<bool>()(i) =
|
||||
sentencepiece_processor_->IsUnknown(id);
|
||||
break;
|
||||
case 1:
|
||||
output_tensor->flat<bool>()(i) =
|
||||
sentencepiece_processor_->IsControl(id);
|
||||
break;
|
||||
case 2:
|
||||
output_tensor->flat<bool>()(i) =
|
||||
sentencepiece_processor_->IsUnused(id);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
int piece_type_;
|
||||
};
|
||||
|
||||
template <typename T, typename U = T>
|
||||
class SentencePieceEncodeOpBase : public SentencePieceBaseOp {
|
||||
public:
|
||||
explicit SentencePieceEncodeOpBase(OpKernelConstruction* context)
|
||||
: SentencePieceBaseOp(context) {}
|
||||
|
||||
void Compute(OpKernelContext* context) override {
|
||||
const Tensor* input_tensor = nullptr;
|
||||
|
||||
OP_REQUIRES_OK(context, context->input("input", &input_tensor));
|
||||
OP_REQUIRES(context, TensorShapeUtils::IsVector(input_tensor->shape()),
|
||||
::tensorflow::errors::InvalidArgument(
|
||||
"`input` must be a vector, got shape: ",
|
||||
input_tensor->shape().DebugString()));
|
||||
const auto& input_sentences = input_tensor->vec<tstring>();
|
||||
const int64 batch_size = input_sentences.size();
|
||||
|
||||
const Tensor* nbest_size_tensor = nullptr;
|
||||
OP_REQUIRES_OK(context, context->input("nbest_size", &nbest_size_tensor));
|
||||
OP_REQUIRES(context, nbest_size_tensor->dims() <= 1,
|
||||
::tensorflow::errors::InvalidArgument(
|
||||
"`nbest_size` must be a scalar or vector. got shape: ",
|
||||
nbest_size_tensor->shape().DebugString()));
|
||||
if (nbest_size_tensor->dims() == 1) {
|
||||
OP_REQUIRES(
|
||||
context, batch_size == nbest_size_tensor->dim_size(0),
|
||||
::tensorflow::errors::InvalidArgument(
|
||||
"`nbest_size` must have the same batch size as `input`."));
|
||||
}
|
||||
|
||||
const Tensor* alpha_tensor = nullptr;
|
||||
OP_REQUIRES_OK(context, context->input("alpha", &alpha_tensor));
|
||||
OP_REQUIRES(context, alpha_tensor->dims() <= 1,
|
||||
::tensorflow::errors::InvalidArgument(
|
||||
"`alpha` must be a scalar or vector, got shape: ",
|
||||
alpha_tensor->shape().DebugString()));
|
||||
if (alpha_tensor->dims() == 1) {
|
||||
OP_REQUIRES(context, batch_size == alpha_tensor->dim_size(0),
|
||||
::tensorflow::errors::InvalidArgument(
|
||||
"`alpha` must have the same batch size as `input`."));
|
||||
}
|
||||
|
||||
std::vector<std::vector<U>> pieces(batch_size);
|
||||
|
||||
for (int64 i = 0; i < batch_size; ++i) {
|
||||
const int32 nbest_size = nbest_size_tensor->dims() == 1
|
||||
? nbest_size_tensor->vec<int32>()(i)
|
||||
: nbest_size_tensor->scalar<int32>()();
|
||||
if (nbest_size == 0 || nbest_size == 1) {
|
||||
OP_REQUIRES_OK(context,
|
||||
ToTFStatus(sentencepiece_processor_->Encode(
|
||||
absl::string_view(input_sentences(i)), &pieces[i])));
|
||||
} else {
|
||||
const float alpha = alpha_tensor->dims() == 1
|
||||
? alpha_tensor->vec<float>()(i)
|
||||
: alpha_tensor->scalar<float>()();
|
||||
OP_REQUIRES_OK(context,
|
||||
ToTFStatus(sentencepiece_processor_->SampleEncode(
|
||||
absl::string_view(input_sentences(i)), nbest_size,
|
||||
alpha, &pieces[i])));
|
||||
}
|
||||
RewritePieces(&pieces[i]);
|
||||
}
|
||||
|
||||
MakeOutputTensor(context, pieces);
|
||||
}
|
||||
|
||||
protected:
|
||||
void RewritePieces(std::vector<std::string>* pieces) const {
|
||||
if (reverse_) std::reverse(pieces->begin(), pieces->end());
|
||||
if (bos_id_ > 0)
|
||||
pieces->insert(pieces->begin(),
|
||||
sentencepiece_processor_->IdToPiece(bos_id_));
|
||||
if (eos_id_ > 0)
|
||||
pieces->push_back(sentencepiece_processor_->IdToPiece(eos_id_));
|
||||
}
|
||||
|
||||
void RewritePieces(std::vector<int32>* pieces) const {
|
||||
if (reverse_) std::reverse(pieces->begin(), pieces->end());
|
||||
if (bos_id_ > 0) pieces->insert(pieces->begin(), bos_id_);
|
||||
if (eos_id_ > 0) pieces->push_back(eos_id_);
|
||||
}
|
||||
|
||||
virtual void MakeOutputTensor(OpKernelContext* context,
|
||||
const std::vector<std::vector<U>>& pieces) = 0;
|
||||
};
|
||||
|
||||
template <typename T, typename U = T>
|
||||
class SentencePieceEncodeSparseOp : public SentencePieceEncodeOpBase<T, U> {
|
||||
public:
|
||||
explicit SentencePieceEncodeSparseOp(OpKernelConstruction* context)
|
||||
: SentencePieceEncodeOpBase<T, U>(context) {}
|
||||
|
||||
protected:
|
||||
void MakeOutputTensor(OpKernelContext* context,
|
||||
const std::vector<std::vector<U>>& pieces) override {
|
||||
const int64 batch_size = pieces.size();
|
||||
|
||||
int64 max_sequence_length = 0;
|
||||
int64 indices_size = 0;
|
||||
for (int row = 0; row < batch_size; ++row) {
|
||||
const int col_size = pieces[row].size();
|
||||
max_sequence_length = std::max<int64>(col_size, max_sequence_length);
|
||||
indices_size += col_size;
|
||||
}
|
||||
|
||||
// Creates the indices output tensor.
|
||||
Tensor* indices_tensor = nullptr;
|
||||
OP_REQUIRES_OK(context, context->allocate_output(0, {indices_size, 2},
|
||||
&indices_tensor));
|
||||
|
||||
auto indices_tensor_output = indices_tensor->matrix<int64>();
|
||||
int item_idx = 0;
|
||||
for (int row = 0; row < batch_size; ++row) {
|
||||
for (int col = 0; col < pieces[row].size(); ++col) {
|
||||
indices_tensor_output(item_idx, 0) = row;
|
||||
indices_tensor_output(item_idx, 1) = col;
|
||||
++item_idx;
|
||||
}
|
||||
}
|
||||
|
||||
// Creates the values output tensor.
|
||||
Tensor* values_tensor = nullptr;
|
||||
OP_REQUIRES_OK(context,
|
||||
context->allocate_output(1, {indices_size}, &values_tensor));
|
||||
|
||||
auto values_tensor_output = values_tensor->flat<T>();
|
||||
item_idx = 0;
|
||||
for (int row = 0; row < batch_size; ++row) {
|
||||
std::copy(pieces[row].begin(), pieces[row].end(),
|
||||
&values_tensor_output(item_idx));
|
||||
item_idx += pieces[row].size();
|
||||
}
|
||||
|
||||
// Creates the shape output tensor.
|
||||
Tensor* shape_tensor = nullptr;
|
||||
OP_REQUIRES_OK(context, context->allocate_output(2, {2}, &shape_tensor));
|
||||
|
||||
auto shape_tensor_output = shape_tensor->flat<int64>();
|
||||
shape_tensor_output(0) = batch_size;
|
||||
shape_tensor_output(1) = max_sequence_length;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, typename U = T>
|
||||
class SentencePieceEncodeDenseOp : public SentencePieceEncodeOpBase<T, U> {
|
||||
public:
|
||||
explicit SentencePieceEncodeDenseOp(OpKernelConstruction* context)
|
||||
: SentencePieceEncodeOpBase<T, U>(context) {
|
||||
this->GetPad(&pad_);
|
||||
}
|
||||
|
||||
// protected:
|
||||
void MakeOutputTensor(OpKernelContext* context,
|
||||
const std::vector<std::vector<U>>& pieces) override {
|
||||
const int64 batch_size = pieces.size();
|
||||
|
||||
int64 max_sequence_length = 0;
|
||||
for (int row = 0; row < batch_size; ++row) {
|
||||
max_sequence_length =
|
||||
std::max<int64>(pieces[row].size(), max_sequence_length);
|
||||
}
|
||||
|
||||
Tensor* values_tensor = nullptr;
|
||||
Tensor* length_tensor = nullptr;
|
||||
|
||||
OP_REQUIRES_OK(
|
||||
context, context->allocate_output(0, {batch_size, max_sequence_length},
|
||||
&values_tensor));
|
||||
OP_REQUIRES_OK(context,
|
||||
context->allocate_output(1, {batch_size}, &length_tensor));
|
||||
|
||||
auto values_tensor_output = values_tensor->matrix<T>();
|
||||
auto length_tensor_output = length_tensor->vec<int32>();
|
||||
|
||||
U pad = pad_;
|
||||
|
||||
for (int row = 0; row < batch_size; ++row) {
|
||||
for (int col = 0; col < max_sequence_length; ++col) {
|
||||
values_tensor_output(row, col) =
|
||||
col < pieces[row].size() ? pieces[row][col] : pad;
|
||||
}
|
||||
length_tensor_output(row) = pieces[row].size();
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
T pad_;
|
||||
};
|
||||
|
||||
template <typename T, typename U = T>
|
||||
class SentencePieceDecodeOp : public SentencePieceBaseOp {
|
||||
public:
|
||||
explicit SentencePieceDecodeOp(OpKernelConstruction* context)
|
||||
: SentencePieceBaseOp(context) {}
|
||||
|
||||
void Compute(OpKernelContext* context) override {
|
||||
const Tensor* input_tensor = nullptr;
|
||||
const Tensor* length_tensor = nullptr;
|
||||
|
||||
OP_REQUIRES_OK(context, context->input("input", &input_tensor));
|
||||
OP_REQUIRES(context, TensorShapeUtils::IsMatrix(input_tensor->shape()),
|
||||
::tensorflow::errors::InvalidArgument(
|
||||
"`input` must be a 2-D matrix. got shape: ",
|
||||
input_tensor->shape().DebugString()));
|
||||
OP_REQUIRES_OK(context, context->input("sequence_length", &length_tensor));
|
||||
OP_REQUIRES(context, TensorShapeUtils::IsVector(length_tensor->shape()),
|
||||
::tensorflow::errors::InvalidArgument(
|
||||
"`sequence_length` must be a vector. got shape: ",
|
||||
length_tensor->shape().DebugString()));
|
||||
OP_REQUIRES(
|
||||
context, input_tensor->dim_size(0) == length_tensor->dim_size(0),
|
||||
::tensorflow::errors::InvalidArgument(
|
||||
"`sequence_length` must have the same batch size as `input`."));
|
||||
|
||||
const auto& input_sentences = input_tensor->matrix<T>();
|
||||
const auto& sequence_length = length_tensor->vec<int32>();
|
||||
const int64 batch_size = input_tensor->dim_size(0);
|
||||
const int max_sequence_length = input_tensor->dim_size(1);
|
||||
|
||||
Tensor* values_tensor = nullptr;
|
||||
OP_REQUIRES_OK(context,
|
||||
context->allocate_output(0, {batch_size}, &values_tensor));
|
||||
auto values_tensor_output = values_tensor->vec<tstring>();
|
||||
|
||||
for (int64 i = 0; i < batch_size; ++i) {
|
||||
OP_REQUIRES(context,
|
||||
(sequence_length(i) >= 0 &&
|
||||
sequence_length(i) <= max_sequence_length),
|
||||
::tensorflow::errors::InvalidArgument(
|
||||
"`sequence_length` is out-of-range."));
|
||||
std::vector<U> pieces(&input_sentences(i, 0),
|
||||
&input_sentences(i, 0) + sequence_length(i));
|
||||
if (reverse_) std::reverse(pieces.begin(), pieces.end());
|
||||
std::string detokenized_str;
|
||||
OP_REQUIRES_OK(context, ToTFStatus(sentencepiece_processor_->Decode(
|
||||
pieces, &detokenized_str)));
|
||||
values_tensor_output(i) = detokenized_str;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
namespace {
|
||||
// The snake-case forms of these names are used as the function names.
|
||||
constexpr char kGetPieceSizeOpName[] = "SentencepieceGetPieceSize";
|
||||
constexpr char kPieceToIdOpName[] = "SentencepiecePieceToId";
|
||||
constexpr char kIdToPieceOpName[] = "SentencepieceIdToPiece";
|
||||
constexpr char kGetPieceTypeOpName[] = "SentencepieceGetPieceType";
|
||||
constexpr char kEncodeDenseOpName[] = "SentencepieceEncodeDense";
|
||||
constexpr char kEncodeSparseOpName[] = "SentencepieceEncodeSparse";
|
||||
constexpr char kDecodeOpName[] = "SentencepieceDecode";
|
||||
} // namespace
|
||||
|
||||
// SentencepieceGetPieceSize: no inputs, one scalar int32 output `vocab_size`.
// The model is selected through the `model_file` / `model_proto` attributes.
REGISTER_OP(kGetPieceSizeOpName)
    .Output("vocab_size: int32")
    .Attr("model_file: string = ''")
    .Attr("model_proto: string = ''")
    .SetShapeFn([](InferenceContext* c) {
      // The output is a scalar.
      c->set_output(0, c->MakeShape({}));
      return ::tensorflow::Status::OK();
    });

// CPU kernel for SentencepieceGetPieceSize.
REGISTER_KERNEL_BUILDER(Name(kGetPieceSizeOpName).Device(DEVICE_CPU),
                        SentencePieceGetPieceSizeOp);
|
||||
|
||||
// SentencepiecePieceToId: string `input` -> int32 `values`.
REGISTER_OP(kPieceToIdOpName)
    .Input("input: string")
    .Output("values: int32")
    .Attr("model_file: string = ''")
    .Attr("model_proto: string = ''")
    .SetShapeFn([](InferenceContext* c) {
      // The output has the same shape as the input.
      c->set_output(0, c->input(0));
      return ::tensorflow::Status::OK();
    });

// CPU kernel for SentencepiecePieceToId (tstring in, int32 out).
REGISTER_KERNEL_BUILDER(Name(kPieceToIdOpName).Device(DEVICE_CPU),
                        SentencePieceConvertPieceOp<tstring, int32>);
|
||||
|
||||
// SentencepieceIdToPiece: int32 `input` -> string `values`.
REGISTER_OP(kIdToPieceOpName)
    .Input("input: int32")
    .Output("values: string")
    .Attr("model_file: string = ''")
    .Attr("model_proto: string = ''")
    .SetShapeFn([](InferenceContext* c) {
      // The output has the same shape as the input.
      c->set_output(0, c->input(0));
      return ::tensorflow::Status::OK();
    });

// CPU kernel for SentencepieceIdToPiece (int32 in, tstring out).
REGISTER_KERNEL_BUILDER(Name(kIdToPieceOpName).Device(DEVICE_CPU),
                        SentencePieceConvertPieceOp<int32, tstring>);
|
||||
|
||||
// SentencepieceGetPieceType: int32 `input` -> bool `values`. The integer
// attribute `piece_type` (default 0) selects which predicate the kernel
// evaluates.
REGISTER_OP(kGetPieceTypeOpName)
    .Input("input: int32")
    .Output("values: bool")
    .Attr("model_file: string = ''")
    .Attr("model_proto: string = ''")
    .Attr("piece_type: int = 0")
    .SetShapeFn([](InferenceContext* c) {
      // The output has the same shape as the input.
      c->set_output(0, c->input(0));
      return ::tensorflow::Status::OK();
    });

// CPU kernel for SentencepieceGetPieceType.
REGISTER_KERNEL_BUILDER(Name(kGetPieceTypeOpName).Device(DEVICE_CPU),
                        SentencePieceGetPieceTypeOp);
|
||||
|
||||
// SentencepieceEncodeDense: encodes a vector of strings into a dense matrix
// `values` (int32 or string, chosen by `out_type`) and a `sequence_length`
// vector. `nbest_size` and `alpha` may be scalars or vectors.
REGISTER_OP(kEncodeDenseOpName)
    .Attr("out_type: {int32, string} = DT_INT32")
    .Input("input: string")
    .Input("nbest_size: int32")
    .Input("alpha: float")
    .Output("values: out_type")
    .Output("sequence_length: int32")
    .Attr("model_file: string = ''")
    .Attr("model_proto: string = ''")
    .Attr("reverse: bool = false")
    .Attr("add_bos: bool = false")
    .Attr("add_eos: bool = false")
    .SetShapeFn([](InferenceContext* c) {
      ShapeHandle input, nbest, alpha;
      // `input` must be rank 1; `nbest_size` and `alpha` at most rank 1.
      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &input));
      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(1), 1, &nbest));
      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(2), 1, &alpha));
      // Vector-valued `nbest_size` / `alpha` are merged with the batch
      // dimension of `input`.
      DimensionHandle batch_size = c->Dim(input, 0);
      if (c->Rank(nbest) == 1)
        TF_RETURN_IF_ERROR(c->Merge(batch_size, c->Dim(nbest, 0), &batch_size));
      if (c->Rank(alpha) == 1)
        TF_RETURN_IF_ERROR(c->Merge(batch_size, c->Dim(alpha, 0), &batch_size));
      // values: [batch_size, unknown]; sequence_length: [batch_size].
      c->set_output(0, c->MakeShape({batch_size, c->UnknownDim()}));
      c->set_output(1, c->MakeShape({batch_size}));
      return ::tensorflow::Status::OK();
    });

// CPU kernel producing int32 ids.
REGISTER_KERNEL_BUILDER(Name(kEncodeDenseOpName)
                            .Device(DEVICE_CPU)
                            .TypeConstraint<int32>("out_type"),
                        SentencePieceEncodeDenseOp<int32>);

// CPU kernel producing string pieces.
REGISTER_KERNEL_BUILDER(Name(kEncodeDenseOpName)
                            .Device(DEVICE_CPU)
                            .TypeConstraint<tstring>("out_type"),
                        SentencePieceEncodeDenseOp<tstring, std::string>);
|
||||
|
||||
// SentencepieceEncodeSparse: encodes a vector of strings into the three
// outputs `indices`, `values` (int32 or string, chosen by `out_type`) and
// `dense_shape`. `nbest_size` and `alpha` may be scalars or vectors.
REGISTER_OP(kEncodeSparseOpName)
    .Attr("out_type: {int32, string} = DT_INT32")
    .Input("input: string")
    .Input("nbest_size: int32")
    .Input("alpha: float")
    .Output("indices: int64")
    .Output("values: out_type")
    .Output("dense_shape: int64")
    .Attr("model_file: string = ''")
    .Attr("model_proto: string = ''")
    .Attr("reverse: bool = false")
    .Attr("add_bos: bool = false")
    .Attr("add_eos: bool = false")
    .SetShapeFn([](InferenceContext* c) {
      ShapeHandle input, nbest, alpha;
      // `input` must be rank 1; `nbest_size` and `alpha` at most rank 1.
      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &input));
      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(1), 1, &nbest));
      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(2), 1, &alpha));
      // batch_size is merged only to validate that the batch dimensions
      // agree; it does not appear in the output shapes below.
      DimensionHandle batch_size = c->Dim(input, 0);
      if (c->Rank(nbest) == 1)
        TF_RETURN_IF_ERROR(c->Merge(batch_size, c->Dim(nbest, 0), &batch_size));
      if (c->Rank(alpha) == 1)
        TF_RETURN_IF_ERROR(c->Merge(batch_size, c->Dim(alpha, 0), &batch_size));
      // indices: [unknown, 2]; values: [unknown]; dense_shape: [2].
      c->set_output(0, c->MakeShape({c->UnknownDim(), 2}));
      c->set_output(1, c->MakeShape({c->UnknownDim()}));
      c->set_output(2, c->MakeShape({2}));
      return ::tensorflow::Status::OK();
    });

// CPU kernel producing int32 ids.
REGISTER_KERNEL_BUILDER(Name(kEncodeSparseOpName)
                            .Device(DEVICE_CPU)
                            .TypeConstraint<int32>("out_type"),
                        SentencePieceEncodeSparseOp<int32>);

// CPU kernel producing string pieces.
REGISTER_KERNEL_BUILDER(Name(kEncodeSparseOpName)
                            .Device(DEVICE_CPU)
                            .TypeConstraint<tstring>("out_type"),
                        SentencePieceEncodeSparseOp<tstring, std::string>);
|
||||
|
||||
// SentencepieceDecode: decodes a rank-2 `input` of ids or pieces (type T)
// together with a rank-1 `sequence_length` into one string per row.
REGISTER_OP(kDecodeOpName)
    .Attr("T: {int32, string}")
    .Input("input: T")
    .Input("sequence_length: int32")
    .Output("values: string")
    .Attr("model_file: string = ''")
    .Attr("model_proto: string = ''")
    .Attr("reverse: bool = false")
    .SetShapeFn([](InferenceContext* c) {
      ShapeHandle input, sequence_length;
      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &input));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &sequence_length));
      // Both inputs must agree on the batch dimension.
      DimensionHandle batch_size = c->Dim(input, 0);
      TF_RETURN_IF_ERROR(
          c->Merge(batch_size, c->Dim(sequence_length, 0), &batch_size));
      // values: [batch_size].
      c->set_output(0, c->MakeShape({batch_size}));
      return ::tensorflow::Status::OK();
    });

// CPU kernel decoding int32 ids.
REGISTER_KERNEL_BUILDER(
    Name(kDecodeOpName).Device(DEVICE_CPU).TypeConstraint<int32>("T"),
    SentencePieceDecodeOp<int32>);

// CPU kernel decoding string pieces.
REGISTER_KERNEL_BUILDER(
    Name(kDecodeOpName).Device(DEVICE_CPU).TypeConstraint<tstring>("T"),
    SentencePieceDecodeOp<tstring, std::string>);
|
||||
} // namespace sentencepiece
|
@ -1,52 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2018 Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from setuptools import setup
|
||||
from setuptools import find_packages
|
||||
import tensorflow as tf
|
||||
import codecs
|
||||
import string
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Make the modules under ./test importable so that `test_suite` below can be
# resolved by setuptools.
sys.path.append(os.path.join('.', 'test'))

# The package version is the content of the VERSION file one directory up.
# Surrounding whitespace (typically the trailing newline) is stripped so it
# does not end up inside the version string.
with codecs.open(os.path.join('..', 'VERSION'), 'r', 'utf-8') as f:
  version = f.read().strip()

setup(
    name='tf_sentencepiece',
    author='Taku Kudo',
    author_email='taku@google.com',
    description='SentencePiece Encode/Decode ops for TensorFlow',
    version=version,
    url='https://github.com/google/sentencepiece',
    license='Apache',
    platforms='Unix',
    packages=find_packages(exclude=['test']),
    # Ship every versioned copy of the compiled op library with the package.
    package_data={'tf_sentencepiece': ['_sentencepiece_processor_ops.so*']},
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'Environment :: Console',
        'Intended Audience :: Developers',
        'Intended Audience :: Science/Research',
        'License :: OSI Approved :: Apache Software License',
        'Operating System :: Unix',
        'Programming Language :: Python',
        'Topic :: Text Processing :: Linguistic',
        'Topic :: Software Development :: Libraries :: Python Modules'
    ],
    keywords='tensorflow machine learning sentencepiece NLP segmentation',
    test_suite='tf_sentencepiece_test.suite')
|
@ -1,322 +0,0 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import itertools as it
|
||||
import os
|
||||
import sys
|
||||
import unittest
|
||||
import tensorflow as tf
|
||||
import tf_sentencepiece as tfspm
|
||||
|
||||
# Expose the TF1-style names used by the tests when the installed TensorFlow
# provides them under `tf.compat.v1`.
try:
  tf.Session = tf.compat.v1.Session
  tf.sparse_tensor_to_dense = tf.compat.v1.sparse_tensor_to_dense
except AttributeError:
  # `tf.compat.v1` (or one of the names) is missing; keep whatever `tf`
  # already exposes. Only AttributeError is swallowed so that interrupts and
  # unrelated failures are not hidden.
  pass
|
||||
|
||||
|
||||
class SentencePieceProcssorOpTest(unittest.TestCase):
|
||||
|
||||
def _getSentencePieceModelFile(self):
|
||||
return os.path.join('..', 'python', 'test', 'test_model.model')
|
||||
|
||||
def _getPieceSize(self):
|
||||
return 1000
|
||||
|
||||
def _getExpected(self,
|
||||
reverse=False,
|
||||
add_bos=False,
|
||||
add_eos=False,
|
||||
padding=''):
|
||||
# TF uses str(bytes) as a string representation.
|
||||
padding = padding.encode('utf8')
|
||||
sentences = [
|
||||
b'Hello world.', b'I have a pen.', b'I saw a girl with a telescope.'
|
||||
]
|
||||
pieces = [[b'\xe2\x96\x81He', b'll', b'o', b'\xe2\x96\x81world', b'.'],
|
||||
[
|
||||
b'\xe2\x96\x81I', b'\xe2\x96\x81have', b'\xe2\x96\x81a',
|
||||
b'\xe2\x96\x81p', b'en', b'.'
|
||||
],
|
||||
[
|
||||
b'\xe2\x96\x81I', b'\xe2\x96\x81saw', b'\xe2\x96\x81a',
|
||||
b'\xe2\x96\x81girl', b'\xe2\x96\x81with', b'\xe2\x96\x81a',
|
||||
b'\xe2\x96\x81', b'te', b'le', b's', b'c', b'o', b'pe', b'.'
|
||||
]]
|
||||
ids = [[151, 88, 21, 887, 6], [9, 76, 11, 68, 98, 6],
|
||||
[9, 459, 11, 939, 44, 11, 4, 142, 82, 8, 28, 21, 132, 6]]
|
||||
seq_len = [5, 6, 14]
|
||||
|
||||
if reverse:
|
||||
ids = [x[::-1] for x in ids]
|
||||
pieces = [x[::-1] for x in pieces]
|
||||
|
||||
if add_bos:
|
||||
ids = [[1] + x for x in ids]
|
||||
pieces = [[b'<s>'] + x for x in pieces]
|
||||
seq_len = [x + 1 for x in seq_len]
|
||||
|
||||
if add_eos:
|
||||
ids = [x + [2] for x in ids]
|
||||
pieces = [x + [b'</s>'] for x in pieces]
|
||||
seq_len = [x + 1 for x in seq_len]
|
||||
|
||||
max_len = max(seq_len)
|
||||
pieces = [x + [padding] * (max_len - len(x)) for x in pieces]
|
||||
ids = [x + [0] * (max_len - len(x)) for x in ids]
|
||||
|
||||
return sentences, pieces, ids, seq_len
|
||||
|
||||
def testGetPieceSize(self):
|
||||
sentencepiece_model_file = self._getSentencePieceModelFile()
|
||||
|
||||
with tf.Session():
|
||||
s = tfspm.piece_size(model_file=sentencepiece_model_file)
|
||||
self.assertEqual(s.eval(), self._getPieceSize())
|
||||
|
||||
def testConvertPiece(self):
|
||||
sentencepiece_model_file = self._getSentencePieceModelFile()
|
||||
(sentences, expected_pieces, expected_ids,
|
||||
expected_seq_len) = self._getExpected(padding='<unk>')
|
||||
|
||||
with tf.Session():
|
||||
ids_matrix = tfspm.piece_to_id(
|
||||
tf.constant(expected_pieces), model_file=sentencepiece_model_file)
|
||||
ids_vec = tfspm.piece_to_id(
|
||||
tf.constant(expected_pieces[0]), model_file=sentencepiece_model_file)
|
||||
ids_scalar = tfspm.piece_to_id(
|
||||
tf.constant(expected_pieces[0][0]),
|
||||
model_file=sentencepiece_model_file)
|
||||
|
||||
self.assertEqual(ids_matrix.eval().tolist(), expected_ids)
|
||||
self.assertEqual(ids_vec.eval().tolist(), expected_ids[0])
|
||||
self.assertEqual(ids_scalar.eval(), expected_ids[0][0])
|
||||
|
||||
pieces_matrix = tfspm.id_to_piece(
|
||||
tf.constant(expected_ids), model_file=sentencepiece_model_file)
|
||||
pieces_vec = tfspm.id_to_piece(
|
||||
tf.constant(expected_ids[0]), model_file=sentencepiece_model_file)
|
||||
pieces_scalar = tfspm.id_to_piece(
|
||||
tf.constant(expected_ids[0][0]), model_file=sentencepiece_model_file)
|
||||
|
||||
self.assertEqual(pieces_matrix.eval().tolist(), expected_pieces)
|
||||
self.assertEqual(pieces_vec.eval().tolist(), expected_pieces[0])
|
||||
self.assertEqual(pieces_scalar.eval(), expected_pieces[0][0])
|
||||
|
||||
def testEncodeAndDecode(self):
|
||||
sentencepiece_model_file = self._getSentencePieceModelFile()
|
||||
|
||||
with tf.Session():
|
||||
for reverse, add_bos, add_eos in list(
|
||||
it.product((True, False), repeat=3)):
|
||||
(sentences, expected_pieces, expected_ids,
|
||||
expected_seq_len) = self._getExpected(
|
||||
reverse=reverse, add_bos=add_bos, add_eos=add_eos)
|
||||
|
||||
# Encode sentences into pieces/ids.
|
||||
s = tf.constant(sentences)
|
||||
pieces, seq_len1 = tfspm.encode(
|
||||
s,
|
||||
model_file=sentencepiece_model_file,
|
||||
reverse=reverse,
|
||||
add_bos=add_bos,
|
||||
add_eos=add_eos,
|
||||
out_type=tf.string)
|
||||
ids, seq_len2 = tfspm.encode(
|
||||
s,
|
||||
model_file=sentencepiece_model_file,
|
||||
reverse=reverse,
|
||||
add_bos=add_bos,
|
||||
add_eos=add_eos)
|
||||
|
||||
self.assertEqual(pieces.eval().tolist(), expected_pieces)
|
||||
self.assertEqual(ids.eval().tolist(), expected_ids)
|
||||
self.assertEqual(seq_len1.eval().tolist(), expected_seq_len)
|
||||
self.assertEqual(seq_len2.eval().tolist(), expected_seq_len)
|
||||
|
||||
# Decode pieces into sentences/ids.
|
||||
pieces = tf.constant(expected_pieces)
|
||||
ids = tf.constant(expected_ids)
|
||||
seq_len = tf.constant(expected_seq_len, dtype=tf.int32)
|
||||
decoded_sentences1 = tfspm.decode(
|
||||
pieces,
|
||||
seq_len,
|
||||
model_file=sentencepiece_model_file,
|
||||
reverse=reverse)
|
||||
decoded_sentences2 = tfspm.decode(
|
||||
ids, seq_len, model_file=sentencepiece_model_file, reverse=reverse)
|
||||
|
||||
self.assertEqual(decoded_sentences1.eval().tolist(), sentences)
|
||||
self.assertEqual(decoded_sentences2.eval().tolist(), sentences)
|
||||
|
||||
def testSampleEncodeAndDecode(self):
|
||||
sentencepiece_model_file = self._getSentencePieceModelFile()
|
||||
sentences, _, _, _ = self._getExpected()
|
||||
|
||||
with tf.Session():
|
||||
for n, a in [(-1, 0.1), (64, 0.1), (0, 0.0)]:
|
||||
# Round trip test.
|
||||
nbest_size = tf.constant(n)
|
||||
alpha = tf.constant(a)
|
||||
s = tf.constant(sentences)
|
||||
|
||||
pieces, seq_len1 = tfspm.encode(
|
||||
s,
|
||||
nbest_size=nbest_size,
|
||||
alpha=alpha,
|
||||
model_file=sentencepiece_model_file,
|
||||
out_type=tf.string)
|
||||
ids, seq_len2 = tfspm.encode(
|
||||
s,
|
||||
nbest_size=nbest_size,
|
||||
alpha=alpha,
|
||||
model_file=sentencepiece_model_file)
|
||||
decoded_sentences1 = tfspm.decode(
|
||||
pieces, seq_len1, model_file=sentencepiece_model_file)
|
||||
decoded_sentences2 = tfspm.decode(
|
||||
ids, seq_len2, model_file=sentencepiece_model_file)
|
||||
|
||||
self.assertEqual(decoded_sentences1.eval().tolist(), sentences)
|
||||
self.assertEqual(decoded_sentences2.eval().tolist(), sentences)
|
||||
|
||||
def testEncodeAndDecodeSparse(self):
|
||||
sentencepiece_model_file = self._getSentencePieceModelFile()
|
||||
|
||||
with tf.Session():
|
||||
for reverse, add_bos, add_eos in list(
|
||||
it.product((True, False), repeat=3)):
|
||||
(sentences, expected_pieces, expected_ids,
|
||||
_) = self._getExpected(reverse, add_bos, add_eos)
|
||||
|
||||
# Encode sentences into sparse pieces/ids.
|
||||
s = tf.constant(sentences)
|
||||
pieces = tfspm.encode_sparse(
|
||||
s,
|
||||
model_file=sentencepiece_model_file,
|
||||
reverse=reverse,
|
||||
add_bos=add_bos,
|
||||
add_eos=add_eos,
|
||||
out_type=tf.string)
|
||||
ids = tfspm.encode_sparse(
|
||||
s,
|
||||
model_file=sentencepiece_model_file,
|
||||
reverse=reverse,
|
||||
add_bos=add_bos,
|
||||
add_eos=add_eos)
|
||||
pieces = tf.sparse_tensor_to_dense(pieces, default_value='')
|
||||
ids = tf.sparse_tensor_to_dense(ids, default_value=0)
|
||||
|
||||
self.assertEqual(ids.eval().tolist(), expected_ids)
|
||||
self.assertEqual(pieces.eval().tolist(), expected_pieces)
|
||||
|
||||
def testGetPieceType(self):
|
||||
sentencepiece_model_file = self._getSentencePieceModelFile()
|
||||
expected_is_unknown = []
|
||||
expected_is_control = []
|
||||
expected_is_unused = []
|
||||
ids = []
|
||||
|
||||
for i in range(self._getPieceSize()):
|
||||
ids.append(i)
|
||||
expected_is_unknown.append(i == 0)
|
||||
expected_is_control.append(i == 1 or i == 2)
|
||||
expected_is_unused.append(False)
|
||||
|
||||
with tf.Session():
|
||||
s = tf.constant(ids)
|
||||
is_unknown = tfspm.is_unknown(s, model_file=sentencepiece_model_file)
|
||||
is_control = tfspm.is_control(s, model_file=sentencepiece_model_file)
|
||||
is_unused = tfspm.is_unused(s, model_file=sentencepiece_model_file)
|
||||
|
||||
self.assertEqual(is_unknown.eval().tolist(), expected_is_unknown)
|
||||
self.assertEqual(is_control.eval().tolist(), expected_is_control)
|
||||
self.assertEqual(is_unused.eval().tolist(), expected_is_unused)
|
||||
|
||||
def testLoadModelProto(self):
|
||||
# Makes a serialized model proto.
|
||||
with open(self._getSentencePieceModelFile(), 'rb') as f:
|
||||
model_proto = f.read()
|
||||
with tf.Session() as sess:
|
||||
sentences = ['Hello world.']
|
||||
a = tf.constant(sentences)
|
||||
sess.run(tfspm.encode(a, model_proto=model_proto, out_type=tf.string))
|
||||
|
||||
def testInvalidModelPath(self):
|
||||
with tf.Session() as sess:
|
||||
with self.assertRaises(tf.errors.NotFoundError):
|
||||
sentences = ['Hello world.']
|
||||
a = tf.constant(sentences)
|
||||
sess.run(tfspm.encode(a, model_file='invalid path', out_type=tf.string))
|
||||
|
||||
def testInvalidModelProto(self):
|
||||
with tf.Session() as sess:
|
||||
with self.assertRaises(tf.errors.InternalError):
|
||||
sentences = ['Hello world.']
|
||||
a = tf.constant(sentences)
|
||||
sess.run(
|
||||
tfspm.encode(a, model_proto='invalid proto', out_type=tf.string))
|
||||
|
||||
def testInvalidInput(self):
|
||||
sentences = ['Hello world.', 'This is a test.']
|
||||
ids = [[0, 1], [2, 3]]
|
||||
model_file = self._getSentencePieceModelFile()
|
||||
with tf.Session() as sess:
|
||||
a = tf.constant(sentences)
|
||||
b = tf.constant(ids)
|
||||
|
||||
alpha = tf.constant([1.0, 2.0])
|
||||
sess.run(tfspm.encode(a, model_file=model_file, alpha=alpha, name='foo'))
|
||||
|
||||
nbest_size = tf.constant([1, 2], dtype=tf.int32)
|
||||
sess.run(
|
||||
tfspm.encode(
|
||||
a, model_file=model_file, nbest_size=nbest_size, name='foo'))
|
||||
|
||||
alpha = tf.constant(1.0)
|
||||
sess.run(tfspm.encode(a, model_file=model_file, alpha=alpha, name='foo'))
|
||||
|
||||
nbest_size = tf.constant(10, dtype=tf.int32)
|
||||
sess.run(
|
||||
tfspm.encode(
|
||||
a, model_file=model_file, nbest_size=nbest_size, name='foo'))
|
||||
|
||||
sess.run(
|
||||
tfspm.decode(
|
||||
b, sequence_length=tf.constant([2, 2]), model_file=model_file))
|
||||
|
||||
with self.assertRaises(ValueError):
|
||||
a = tf.constant(sentences)
|
||||
alpha = tf.constant([1.0, 2.0, 3.0])
|
||||
sess.run(tfspm.encode(a, model_file=model_file, alpha=alpha))
|
||||
with self.assertRaises(ValueError):
|
||||
a = tf.constant(sentences)
|
||||
nbest_size = tf.constant([1, 2, 3], dtype=tf.int32)
|
||||
sess.run(tfspm.encode(a, model_file=model_file, nbest_size=nbest_size))
|
||||
with self.assertRaises(ValueError):
|
||||
a = tf.constant(sentences)
|
||||
alpha = tf.constant([[1.0], [2.0]])
|
||||
sess.run(tfspm.encode(a, model_file=model_file, alpha=alpha))
|
||||
with self.assertRaises(ValueError):
|
||||
a = tf.constant(sentences)
|
||||
nbest_size = tf.constant([[1], [2]], dtype=tf.int32)
|
||||
sess.run(tfspm.encode(a, model_file=model_file, nbest_size=nbest_size))
|
||||
with self.assertRaises(ValueError):
|
||||
b = tf.constant(ids)
|
||||
sess.run(tfspm.decode(a, sequence_length=2, model_file=model_file))
|
||||
with self.assertRaises(ValueError):
|
||||
b = tf.constant(ids)
|
||||
sess.run(
|
||||
tfspm.decode(
|
||||
a,
|
||||
sequence_length=tf.constant([2, 2, 2]),
|
||||
model_file=model_file))
|
||||
|
||||
|
||||
def suite():
  """Returns a TestSuite with every test of SentencePieceProcssorOpTest.

  The tests are collected with the default test loader rather than
  `unittest.makeSuite`, which is deprecated and absent from newer Python
  releases.
  """
  tests = unittest.defaultTestLoader.loadTestsFromTestCase(
      SentencePieceProcssorOpTest)
  result = unittest.TestSuite()
  result.addTests(tests)
  return result
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
@ -1,5 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from tf_sentencepiece.sentencepiece_processor_ops import *
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1,317 +0,0 @@
|
||||
# Copyright 2018 Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
r"""Ops for SentencePiece Encoding/Decoding."""
|
||||
|
||||
# TODO(taku): Implement n-best output.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from distutils.version import LooseVersion
|
||||
|
||||
import warnings
|
||||
import glob
|
||||
import re
|
||||
import os
|
||||
import tensorflow as tf
|
||||
|
||||
# Path of the compiled op library without its version suffix. The libraries
# on disk are looked up as `<so_base>.<version>`.
so_base = os.path.join(
    os.path.dirname(__file__), '_sentencepiece_processor_ops.so')
# Preferred library: the one suffixed with the running TensorFlow version.
so_file = so_base + '.' + tf.__version__

# Provide `tf.no_gradient` on TensorFlow builds that only have
# `tf.NotDifferentiable`.
if not hasattr(tf, 'no_gradient'):
  tf.no_gradient = tf.NotDifferentiable

if not os.path.exists(so_file):
  # No exact match: collect the version suffix of every library that is
  # present. File names that do not carry a version suffix are ignored
  # instead of raising AttributeError on a failed match.
  versions = []
  for _so_path in glob.glob(so_base + '.*'):
    _match = re.search(r'so\.([0-9]+\.[0-9\.]+.*)$',
                       os.path.basename(_so_path))
    if _match:
      versions.append(_match.group(1))
  warnings.warn('No so file is found for [%s] from [%s]' %
                (tf.__version__, ', '.join(versions)))
  # Fall back to the newest available build. When nothing is available,
  # `so_file` is left untouched so that the load below reports the missing
  # path instead of this block failing with an IndexError.
  if versions:
    latest = sorted(versions, key=LooseVersion)[-1]
    warnings.warn('use the latest version %s' % (latest))
    so_file = so_base + '.' + latest

_gen_sentencepiece_processor_op = tf.load_op_library(so_file)
|
||||
|
||||
|
||||
def piece_size(model_file=None, model_proto=None, name=None):
  """Looks up the number of pieces (vocabulary size) of a model.

  Args:
    model_file: Path of the sentencepiece model file.
    model_proto: Serialized sentencepiece model proto. One of `model_file`
      and `model_proto` has to be provided.
    name: Name forwarded to the underlying op.

  Returns:
    A scalar holding the vocabulary size.
  """
  op = _gen_sentencepiece_processor_op.sentencepiece_get_piece_size
  return op(model_file=model_file, model_proto=model_proto, name=name)
|
||||
|
||||
|
||||
def piece_to_id(input, model_file=None, model_proto=None, name=None):
  """Maps pieces to their vocabulary ids.

  Args:
    input: A string tensor of any shape.
    model_file: Path of the sentencepiece model file.
    model_proto: Serialized sentencepiece model proto. One of `model_file`
      and `model_proto` has to be provided.
    name: Name forwarded to the underlying op.

  Returns:
    An int32 tensor shaped like `input`.
  """
  op = _gen_sentencepiece_processor_op.sentencepiece_piece_to_id
  return op(input, model_file=model_file, model_proto=model_proto, name=name)
|
||||
|
||||
|
||||
def id_to_piece(input, model_file=None, model_proto=None, name=None):
  """Maps vocabulary ids to their pieces.

  Args:
    input: An int32 tensor of any shape.
    model_file: Path of the sentencepiece model file.
    model_proto: Serialized sentencepiece model proto. One of `model_file`
      and `model_proto` has to be provided.
    name: Name forwarded to the underlying op.

  Returns:
    A string tensor shaped like `input`.
  """
  op = _gen_sentencepiece_processor_op.sentencepiece_id_to_piece
  return op(input, model_file=model_file, model_proto=model_proto, name=name)
|
||||
|
||||
|
||||
def is_unknown(input, model_file=None, model_proto=None, name=None):
  """Tells, per id, whether it refers to the unknown piece.

  Args:
    input: An int32 tensor of any shape.
    model_file: Path of the sentencepiece model file.
    model_proto: Serialized sentencepiece model proto. One of `model_file`
      and `model_proto` has to be provided.
    name: Name forwarded to the underlying op.

  Returns:
    A bool tensor shaped like `input`.
  """
  # piece_type=0 is the value this wrapper passes for the "unknown" query.
  op = _gen_sentencepiece_processor_op.sentencepiece_get_piece_type
  return op(
      input,
      model_file=model_file,
      model_proto=model_proto,
      name=name,
      piece_type=0)
|
||||
|
||||
|
||||
def is_control(input, model_file=None, model_proto=None, name=None):
  """Tells, per id, whether it refers to a control piece.

  Args:
    input: An int32 tensor of any shape.
    model_file: Path of the sentencepiece model file.
    model_proto: Serialized sentencepiece model proto. One of `model_file`
      and `model_proto` has to be provided.
    name: Name forwarded to the underlying op.

  Returns:
    A bool tensor shaped like `input`.
  """
  # piece_type=1 is the value this wrapper passes for the "control" query.
  op = _gen_sentencepiece_processor_op.sentencepiece_get_piece_type
  return op(
      input,
      model_file=model_file,
      model_proto=model_proto,
      name=name,
      piece_type=1)
|
||||
|
||||
|
||||
def is_unused(input, model_file=None, model_proto=None, name=None):
  """Tells, per id, whether it refers to an unused piece.

  Args:
    input: An int32 tensor of any shape.
    model_file: Path of the sentencepiece model file.
    model_proto: Serialized sentencepiece model proto. One of `model_file`
      and `model_proto` has to be provided.
    name: Name forwarded to the underlying op.

  Returns:
    A bool tensor shaped like `input`.
  """
  # piece_type=2 is the value this wrapper passes for the "unused" query.
  op = _gen_sentencepiece_processor_op.sentencepiece_get_piece_type
  return op(
      input,
      model_file=model_file,
      model_proto=model_proto,
      name=name,
      piece_type=2)
|
||||
|
||||
|
||||
def encode_dense(input_sentences,
                 nbest_size=0,
                 alpha=1.0,
                 model_file=None,
                 model_proto=None,
                 reverse=False,
                 add_bos=False,
                 add_eos=False,
                 out_type=tf.int32,
                 name=None):
  """Tokenizes sentences and returns the result as a dense tensor.

  Args:
    input_sentences: 1D string tensor (any length) with the raw sentences.
    nbest_size: Scalar or 1D tensor controlling sampling.
      0 or 1: no sampling.
      greater than 1: sample from the `nbest_size` best results.
      negative: treat `nbest_size` as infinite and sample from all
        hypotheses (the lattice) with the
        forward-filtering-and-backward-sampling algorithm.
    alpha: Scalar or 1D tensor; smoothing parameter (inverse temperature
      used for probability rescaling).
    model_file: Path of the sentencepiece model file.
    model_proto: Serialized sentencepiece model proto. One of `model_file`
      and `model_proto` has to be provided.
    reverse: Whether to reverse the tokenized sequence (default false).
    add_bos: Whether to add <s> to the result (default false).
    add_eos: Whether to add </s> to the result (default false). <s>/</s>
      are added after reversing, when reversing is enabled.
    out_type: tf.int32 (default) or tf.string. With tf.int32 the text is
      encoded directly into an id sequence.
    name: Name forwarded to the underlying op.

  Returns:
    pieces: Dense 2D tensor with the tokenized sentences.
    sequence_length: 1D tensor with the length of each row of `pieces`.
  """
  op = _gen_sentencepiece_processor_op.sentencepiece_encode_dense
  return op(
      input_sentences,
      nbest_size=nbest_size,
      alpha=alpha,
      model_file=model_file,
      model_proto=model_proto,
      reverse=reverse,
      add_bos=add_bos,
      add_eos=add_eos,
      out_type=out_type,
      name=name)
|
||||
|
||||
|
||||
def encode_sparse(input_sentences,
                  nbest_size=0,
                  alpha=1.0,
                  model_file=None,
                  model_proto=None,
                  reverse=False,
                  add_bos=False,
                  add_eos=False,
                  out_type=tf.int32,
                  name=None):
  """Tokenizes sentences and returns the result as a sparse tensor.

  Args:
    input_sentences: 1D string tensor (any length) with the raw sentences.
    nbest_size: Scalar or 1D tensor controlling sampling.
      0 or 1: no sampling.
      greater than 1: sample from the `nbest_size` best results.
      negative: treat `nbest_size` as infinite and sample from all
        hypotheses (the lattice) with the
        forward-filtering-and-backward-sampling algorithm.
    alpha: Scalar or 1D tensor; smoothing parameter (inverse temperature
      used for probability rescaling).
    model_file: Path of the sentencepiece model file.
    model_proto: Serialized sentencepiece model proto. One of `model_file`
      and `model_proto` has to be provided.
    reverse: Whether to reverse the tokenized sequence (default false).
    add_bos: Whether to add <s> to the result (default false).
    add_eos: Whether to add </s> to the result (default false). <s>/</s>
      are added after reversing, when reversing is enabled.
    out_type: tf.int32 (default) or tf.string. With tf.int32 the text is
      encoded directly into an id sequence.
    name: Name forwarded to the underlying op.

  Returns:
    pieces: Sparse 2D tensor with the tokenized sentences.
  """
  op = _gen_sentencepiece_processor_op.sentencepiece_encode_sparse
  components = op(
      input_sentences,
      nbest_size=nbest_size,
      alpha=alpha,
      model_file=model_file,
      model_proto=model_proto,
      reverse=reverse,
      add_bos=add_bos,
      add_eos=add_eos,
      out_type=out_type,
      name=name)
  # The op yields the three components of a sparse tensor, in this order.
  indices, values, dense_shape = components
  return tf.SparseTensor(indices, values, dense_shape)
|
||||
|
||||
|
||||
def decode(pieces,
           sequence_length,
           model_file=None,
           model_proto=None,
           reverse=False,
           name=None):
  """Turns encoded sequences back into postprocessed text.

  Args:
    pieces: 2D int32 or string tensor [batch_size x max_length] holding the
      encoded sequences.
    sequence_length: 1D int32 tensor [batch_size] with the length of each
      row of `pieces`.
    model_file: Path of the sentencepiece model file.
    model_proto: Serialized sentencepiece model proto. One of `model_file`
      and `model_proto` has to be provided.
    reverse: Whether to reverse the tokenized sequence (default false).
    name: Name forwarded to the underlying op.

  Returns:
    text: 1D string tensor with the decoded strings.
  """
  op = _gen_sentencepiece_processor_op.sentencepiece_decode
  return op(
      pieces,
      sequence_length,
      model_file=model_file,
      model_proto=model_proto,
      reverse=reverse,
      name=name)
|
||||
|
||||
|
||||
# Aliases: `encode` and `dense_encode` both refer to `encode_dense`;
# `sparse_encode` refers to `encode_sparse`.
encode = encode_dense
sparse_encode = encode_sparse
dense_encode = encode_dense

# Register every op type below as having no gradient.
for _op_type in (
    'SentencepieceGetPieceSize',
    'SentencepieceIdToPiece',
    'SentencepiecePieceToId',
    'SentencepieceGetPieceType',
    'SentencepieceEncodeDense',
    'SentencepieceEncodeSparse',
    'SentencepieceDecode',
):
  tf.no_gradient(_op_type)
# Do not leave the loop variable behind as a module attribute.
del _op_type
|
19
test.sh
19
test.sh
@ -62,15 +62,6 @@ build_python() {
|
||||
cd ..
|
||||
}
|
||||
|
||||
# Installs TensorFlow, then builds the wheel and the source distribution of
# the package in ./tensorflow and runs its test suite.
build_tensorflow() {
  # The steps run in a subshell so the caller's working directory is never
  # changed, and each step aborts the subshell on failure so that later
  # commands cannot run in the wrong directory. The function returns the
  # status of the first failing step (0 when all succeed).
  (
    cd tensorflow || exit 1
    pip3 install tensorflow || exit 1
    python3 setup.py bdist_wheel || exit 1
    python3 setup.py sdist || exit 1
    python3 setup.py test || exit 1
  )
}
|
||||
|
||||
build_linux_gcc_coverall_ubuntu() {
|
||||
setup_debian
|
||||
apt-get install -y lcov
|
||||
@ -78,7 +69,6 @@ build_linux_gcc_coverall_ubuntu() {
|
||||
pip3 install 'requests[security]'
|
||||
build_generic
|
||||
build_python
|
||||
build_tensorflow
|
||||
mkdir -p build
|
||||
cd build
|
||||
cmake .. -DSPM_COVERAGE=ON
|
||||
@ -92,13 +82,6 @@ build_linux_gcc_ubuntu() {
|
||||
setup_ubuntu
|
||||
build_generic
|
||||
build_python
|
||||
build_tensorflow
|
||||
}
|
||||
|
||||
# Ubuntu build that runs the setup, generic and Python steps only;
# build_tensorflow is not invoked.
build_linux_gcc_ubuntu_no_tf() {
  setup_ubuntu
  build_generic
  build_python
}
|
||||
|
||||
build_linux_gcc_ubuntu_i386() {
|
||||
@ -111,14 +94,12 @@ build_linux_gcc_debian() {
|
||||
setup_debian
|
||||
build_generic
|
||||
build_python
|
||||
build_tensorflow
|
||||
}
|
||||
|
||||
# Fedora build: setup, generic build, then the Python build.
# build_tensorflow is not invoked here (the call was already commented out).
build_linux_gcc_fedora() {
  setup_fedora
  build_generic
  build_python
}
|
||||
|
||||
build_linux_clang_ubuntu() {
|
||||
|
Loading…
Reference in New Issue
Block a user