mirror of
https://github.com/google/sentencepiece.git
synced 2024-10-26 11:38:45 +03:00
Added Python wrapper
This commit is contained in:
parent
f56e2aac6a
commit
8177dab62b
8
.gitignore
vendored
8
.gitignore
vendored
@ -30,6 +30,7 @@ Makefile.in
|
|||||||
*.lo
|
*.lo
|
||||||
*.a
|
*.a
|
||||||
*.la
|
*.la
|
||||||
|
*.pyc
|
||||||
|
|
||||||
.libs
|
.libs
|
||||||
.deps
|
.deps
|
||||||
@ -49,3 +50,10 @@ spm_test
|
|||||||
|
|
||||||
*.pb.cc
|
*.pb.cc
|
||||||
*.pb.h
|
*.pb.h
|
||||||
|
|
||||||
|
.DS_Store
|
||||||
|
*.egg-info/
|
||||||
|
dist/
|
||||||
|
*.swp
|
||||||
|
*.swo
|
||||||
|
*.pyc
|
||||||
|
@ -1,10 +1,13 @@
|
|||||||
AUTOMAKE_OPTIONS = foreign
|
AUTOMAKE_OPTIONS = foreign
|
||||||
SUBDIRS = src
|
SUBDIRS = src
|
||||||
|
|
||||||
EXTRA_DIRS = m4 third_party data doc
|
EXTRA_DIRS = m4 third_party data doc python
|
||||||
EXTRA_DIST = README.md LICENSE
|
EXTRA_DIST = README.md LICENSE
|
||||||
ACLOCAL_AMFLAGS = -I third_party/m4
|
ACLOCAL_AMFLAGS = -I third_party/m4
|
||||||
|
|
||||||
|
pkgconfigdir = @pkgconfigdir@
|
||||||
|
pkgconfig_DATA = sentencepiece.pc
|
||||||
|
|
||||||
dist-hook:
|
dist-hook:
|
||||||
for subdir in $(EXTRA_DIRS); do \
|
for subdir in $(EXTRA_DIRS); do \
|
||||||
cp -rp $$subdir $(distdir); \
|
cp -rp $$subdir $(distdir); \
|
||||||
@ -17,6 +20,6 @@ dist-hook:
|
|||||||
rm -rf $(distdir)/*/*/.svn; \
|
rm -rf $(distdir)/*/*/.svn; \
|
||||||
rm -rf $(distdir)/$$subdir/*/CVS; \
|
rm -rf $(distdir)/$$subdir/*/CVS; \
|
||||||
rm -rf $(distdir)/$$subdir/*/.svn; \
|
rm -rf $(distdir)/$$subdir/*/.svn; \
|
||||||
rm -rf $(distdir)/$$subdir/*/.pb.cc; \
|
rm -rf $(distdir)/$$subdir/*/.pb.cc; \
|
||||||
find $(distdir) -name .svn | xargs rm -fr; \
|
find $(distdir) -name .svn | xargs rm -fr; \
|
||||||
done
|
done
|
||||||
|
16
configure.ac
16
configure.ac
@ -58,6 +58,19 @@ if test "${enable_gcov}" = "yes"; then
|
|||||||
LIBS="$LIBS -lgcov"
|
LIBS="$LIBS -lgcov"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# pkgconfigdir
|
||||||
|
AC_ARG_WITH(pkgconfigdir,
|
||||||
|
AC_HELP_STRING([--with-pkgconfigdir],
|
||||||
|
[Use the specified pkgconfig dir (default is libdir/pkgconfig)]),
|
||||||
|
[pkgconfigdir=${withval}],
|
||||||
|
[pkgconfigdir='${libdir}/pkgconfig'])
|
||||||
|
AC_MSG_NOTICE([pkgconfig directory is ${pkgconfigdir}])
|
||||||
|
pkgconfigcflags=$CFLAGS
|
||||||
|
pkgconfiglibs=$LIBS
|
||||||
|
AC_SUBST([pkgconfigdir])
|
||||||
|
AC_SUBST([pkgconfigcflags])
|
||||||
|
AC_SUBST([pkgconfiglibs])
|
||||||
|
|
||||||
# Checks for header files.
|
# Checks for header files.
|
||||||
AC_CHECK_HEADERS([unistd.h])
|
AC_CHECK_HEADERS([unistd.h])
|
||||||
|
|
||||||
@ -69,6 +82,7 @@ AC_FUNC_STRTOD
|
|||||||
AC_CHECK_FUNCS([memchr memset])
|
AC_CHECK_FUNCS([memchr memset])
|
||||||
|
|
||||||
AC_CONFIG_FILES([Makefile
|
AC_CONFIG_FILES([Makefile
|
||||||
src/Makefile])
|
src/Makefile
|
||||||
|
sentencepiece.pc])
|
||||||
|
|
||||||
AC_OUTPUT
|
AC_OUTPUT
|
||||||
|
3
python/MANIFEST.in
Normal file
3
python/MANIFEST.in
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
recursive-include test *.py *.model
|
||||||
|
include *.i *.md
|
||||||
|
|
47
python/README.md
Normal file
47
python/README.md
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
# SentencePiece Python Wrapper
|
||||||
|
|
||||||
|
Python wrapper for SentencePiece with SWIG. This module wraps the sentencepiece::SentencePieceProcessor class with the following modifications:
|
||||||
|
* Encode and Decode methods are re-defined as EncodeAsIds, EncodeAsPieces, DecodeIds and DecodePieces respectively.
|
||||||
|
* SentencePieceText proto is not supported.
|
||||||
|
|
||||||
|
## Build and Install SentencePiece
|
||||||
|
You need to install SentencePiece before installing this Python wrapper.
|
||||||
|
|
||||||
|
```
|
||||||
|
% pip install sentencepiece
|
||||||
|
```
|
||||||
|
|
||||||
|
You can install this module manually as follows:
|
||||||
|
```
|
||||||
|
% python setup.py build
|
||||||
|
% sudo python setup.py install
|
||||||
|
```
|
||||||
|
|
||||||
|
If you don’t have write permission to the global site-packages directory or don’t want to install into it, please try:
|
||||||
|
```
|
||||||
|
% python setup.py install --user
|
||||||
|
```
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
```
|
||||||
|
% python
|
||||||
|
>>> import sentencepiece as spm
|
||||||
|
>>> sp = spm.SentencePieceProcessor()
|
||||||
|
>>> sp.Load("test/test_model.model")
|
||||||
|
True
|
||||||
|
>>> sp.EncodeAsPieces("This is a test")
|
||||||
|
['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'est']
|
||||||
|
>>> sp.EncodeAsIds("This is a test")
|
||||||
|
[284, 47, 11, 4, 15, 400]
|
||||||
|
>>> sp.DecodePieces(['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'est'])
|
||||||
|
'This is a test'
|
||||||
|
>>> sp.DecodeIds([284, 47, 11, 4, 15, 400])
|
||||||
|
'This is a test'
|
||||||
|
>>> sp.GetPieceSize()
|
||||||
|
1000
|
||||||
|
>>> sp.IdToPiece(2)
|
||||||
|
'</s>'
|
||||||
|
>>> sp.PieceToId('</s>')
|
||||||
|
2
|
||||||
|
```
|
155
python/sentencepiece.i
Normal file
155
python/sentencepiece.i
Normal file
@ -0,0 +1,155 @@
|
|||||||
|
%module sentencepiece
|
||||||
|
|
||||||
|
# Python wrapper is generated with:
|
||||||
|
# % swig -python -c++ sentencepiece.i
|
||||||
|
|
||||||
|
%{
|
||||||
|
#include <sentencepiece_processor.h>
|
||||||
|
%}
|
||||||
|
|
||||||
|
%ignore sentencepiece::SentencePieceProcessor::Encode(std::string const &, std::vector<std::string>*) const;
|
||||||
|
%ignore sentencepiece::SentencePieceProcessor::Encode(std::string const &, std::vector<int>*) const;
|
||||||
|
%ignore sentencepiece::SentencePieceProcessor::Encode(std::string const &, SentencePieceText *) const;
|
||||||
|
%ignore sentencepiece::SentencePieceProcessor::Decode(std::vector<std::string> const &,std::string *) const;
|
||||||
|
%ignore sentencepiece::SentencePieceProcessor::Decode(std::vector<int> const &, std::string *) const;
|
||||||
|
%ignore sentencepiece::SentencePieceProcessor::Decode(std::vector<std::string> const &, SentencePieceText *) const;
|
||||||
|
%ignore sentencepiece::SentencePieceProcessor::Decode(std::vector<int> const &, SentencePieceText *) const;
|
||||||
|
%ignore sentencepiece::SentencePieceProcessor::model_proto;
|
||||||
|
%ignore sentencepiece::SentencePieceProcessor::Load(std::istream *);
|
||||||
|
%ignore sentencepiece::SentencePieceProcessor::LoadOrDie(std::istream *);
|
||||||
|
%ignore sentencepiece::SentencePieceProcessor::model_proto();
|
||||||
|
|
||||||
|
%extend sentencepiece::SentencePieceProcessor {
|
||||||
|
std::vector<std::string> Encode(const std::string& input) const {
|
||||||
|
std::vector<std::string> output;
|
||||||
|
$self->Encode(input, &output);
|
||||||
|
return output;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::string> EncodeAsPieces(const std::string& input) const {
|
||||||
|
std::vector<std::string> output;
|
||||||
|
$self->Encode(input, &output);
|
||||||
|
return output;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<int> EncodeAsIds(const std::string& input) const {
|
||||||
|
std::vector<int> output;
|
||||||
|
$self->Encode(input, &output);
|
||||||
|
return output;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string Decode(const std::vector<std::string>& input) const {
|
||||||
|
std::string output;
|
||||||
|
$self->Decode(input, &output);
|
||||||
|
return output;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string DecodePieces(const std::vector<std::string>& input) const {
|
||||||
|
std::string output;
|
||||||
|
$self->Decode(input, &output);
|
||||||
|
return output;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string DecodeIds(const std::vector<int>& input) const {
|
||||||
|
std::string output;
|
||||||
|
$self->Decode(input, &output);
|
||||||
|
return output;
|
||||||
|
}
|
||||||
|
|
||||||
|
int __len__() {
|
||||||
|
return $self->GetPieceSize();
|
||||||
|
}
|
||||||
|
|
||||||
|
int __getitem__(const std::string& key) const {
|
||||||
|
return $self->PieceToId(key);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
%typemap(out) std::vector<int> {
  // Convert the returned vector of ids into a Python list of ints.
  const size_t num_ids = $1.size();
  $result = PyList_New(num_ids);
  for (size_t idx = 0; idx < num_ids; ++idx) {
    // PyList_SetItem takes ownership of the freshly created int object.
    PyList_SetItem($result, idx, PyInt_FromLong(static_cast<long>($1[idx])));
  }
}
|
||||||
|
|
||||||
|
%typemap(out) std::vector<std::string> {
  // Convert the returned vector of pieces into a Python list of str objects.
  const size_t num_pieces = $1.size();
  $result = PyList_New(num_pieces);
  for (size_t idx = 0; idx < num_pieces; ++idx) {
    // Pass the explicit length so that the exact byte sequence is copied.
    PyObject *piece = PyString_FromStringAndSize($1[idx].data(), $1[idx].size());
    PyList_SetItem($result, idx, piece);
  }
}
|
||||||
|
|
||||||
|
%typemap(out) std::string {
  // Convert the returned std::string into a Python str, copying exactly
  // size() bytes starting at data().
  PyObject *py_text = PyString_FromStringAndSize($1.data(), $1.size());
  $result = py_text;
}
|
||||||
|
|
||||||
|
%typemap(in) const std::string & {
  // Accept only Python str objects; anything else raises TypeError.
  if (!PyString_Check($input)) {
    PyErr_SetString(PyExc_TypeError,"not a string");
    return NULL;
  }
  // Copy the bytes into a heap-allocated std::string for the wrapped call.
  char *bytes = nullptr;
  Py_ssize_t num_bytes = 0;
  PyString_AsStringAndSize($input, &bytes, &num_bytes);
  $1 = new std::string(bytes, num_bytes);
}
|
||||||
|
|
||||||
|
%typemap(in) const std::vector<std::string>& {
  // Convert a Python list of str objects into a heap-allocated
  // std::vector<std::string> that is handed to the wrapped call.
  // Raises TypeError when the argument is not a list or when any element
  // is not a str.
  std::vector<std::string> *out = nullptr;
  if (PyList_Check($input)) {
    const size_t size = PyList_Size($input);
    out = new std::vector<std::string>(size);
    for (size_t i = 0; i < size; ++i) {
      PyObject *o = PyList_GetItem($input, i);
      if (PyString_Check(o)) {
        char *str = nullptr;
        Py_ssize_t str_size = 0;
        PyString_AsStringAndSize(o, &str, &str_size);
        (*out)[i] = std::string(str, static_cast<size_t>(str_size));
      } else {
        // The early return leaves the wrapper immediately, so the
        // partially filled vector must be released here to avoid a leak.
        delete out;
        PyErr_SetString(PyExc_TypeError,"list must contain strings");
        return NULL;
      }
    }
  } else {
    PyErr_SetString(PyExc_TypeError,"not a list");
    return NULL;
  }
  $1 = out;
}
|
||||||
|
|
||||||
|
%typemap(in) const std::vector<int>& {
  // Convert a Python list of ints into a heap-allocated std::vector<int>
  // that is handed to the wrapped call.
  // Raises TypeError when the argument is not a list or when any element
  // is not an int.
  std::vector<int> *out = nullptr;
  if (PyList_Check($input)) {
    const size_t size = PyList_Size($input);
    out = new std::vector<int>(size);
    for (size_t i = 0; i < size; ++i) {
      PyObject *o = PyList_GetItem($input, i);
      if (PyInt_Check(o)) {
        (*out)[i] = static_cast<int>(PyInt_AsLong(o));
      } else {
        // The early return leaves the wrapper immediately, so the
        // partially filled vector must be released here to avoid a leak.
        delete out;
        PyErr_SetString(PyExc_TypeError,"list must contain integers");
        return NULL;
      }
    }
  } else {
    PyErr_SetString(PyExc_TypeError,"not a list");
    return NULL;
  }
  $1 = out;
}
|
||||||
|
|
||||||
|
%typemap(freearg) const std::string& {
|
||||||
|
delete $1;
|
||||||
|
}
|
||||||
|
|
||||||
|
%typemap(freearg) const std::vector<std::string>& {
|
||||||
|
delete $1;
|
||||||
|
}
|
||||||
|
|
||||||
|
%typemap(freearg) const std::vector<int>& {
|
||||||
|
delete $1;
|
||||||
|
}
|
||||||
|
|
||||||
|
%include <sentencepiece_processor.h>
|
107
python/sentencepiece.py
Normal file
107
python/sentencepiece.py
Normal file
@ -0,0 +1,107 @@
|
|||||||
|
# This file was automatically generated by SWIG (http://www.swig.org).
|
||||||
|
# Version 2.0.11
|
||||||
|
#
|
||||||
|
# Do not make changes to this file unless you know what you are doing--modify
|
||||||
|
# the SWIG interface file instead.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
from sys import version_info
|
||||||
|
if version_info >= (2,6,0):
|
||||||
|
def swig_import_helper():
|
||||||
|
from os.path import dirname
|
||||||
|
import imp
|
||||||
|
fp = None
|
||||||
|
try:
|
||||||
|
fp, pathname, description = imp.find_module('_sentencepiece', [dirname(__file__)])
|
||||||
|
except ImportError:
|
||||||
|
import _sentencepiece
|
||||||
|
return _sentencepiece
|
||||||
|
if fp is not None:
|
||||||
|
try:
|
||||||
|
_mod = imp.load_module('_sentencepiece', fp, pathname, description)
|
||||||
|
finally:
|
||||||
|
fp.close()
|
||||||
|
return _mod
|
||||||
|
_sentencepiece = swig_import_helper()
|
||||||
|
del swig_import_helper
|
||||||
|
else:
|
||||||
|
import _sentencepiece
|
||||||
|
del version_info
|
||||||
|
try:
|
||||||
|
_swig_property = property
|
||||||
|
except NameError:
|
||||||
|
pass # Python < 2.2 doesn't have 'property'.
|
||||||
|
def _swig_setattr_nondynamic(self,class_type,name,value,static=1):
|
||||||
|
if (name == "thisown"): return self.this.own(value)
|
||||||
|
if (name == "this"):
|
||||||
|
if type(value).__name__ == 'SwigPyObject':
|
||||||
|
self.__dict__[name] = value
|
||||||
|
return
|
||||||
|
method = class_type.__swig_setmethods__.get(name,None)
|
||||||
|
if method: return method(self,value)
|
||||||
|
if (not static):
|
||||||
|
self.__dict__[name] = value
|
||||||
|
else:
|
||||||
|
raise AttributeError("You cannot add attributes to %s" % self)
|
||||||
|
|
||||||
|
def _swig_setattr(self,class_type,name,value):
|
||||||
|
return _swig_setattr_nondynamic(self,class_type,name,value,0)
|
||||||
|
|
||||||
|
def _swig_getattr(self,class_type,name):
|
||||||
|
if (name == "thisown"): return self.this.own()
|
||||||
|
method = class_type.__swig_getmethods__.get(name,None)
|
||||||
|
if method: return method(self)
|
||||||
|
raise AttributeError(name)
|
||||||
|
|
||||||
|
def _swig_repr(self):
|
||||||
|
try: strthis = "proxy of " + self.this.__repr__()
|
||||||
|
except: strthis = ""
|
||||||
|
return "<%s.%s; %s >" % (self.__class__.__module__, self.__class__.__name__, strthis,)
|
||||||
|
|
||||||
|
try:
|
||||||
|
_object = object
|
||||||
|
_newclass = 1
|
||||||
|
except AttributeError:
|
||||||
|
class _object : pass
|
||||||
|
_newclass = 0
|
||||||
|
|
||||||
|
|
||||||
|
class SentencePieceProcessor(_object):
|
||||||
|
__swig_setmethods__ = {}
|
||||||
|
__setattr__ = lambda self, name, value: _swig_setattr(self, SentencePieceProcessor, name, value)
|
||||||
|
__swig_getmethods__ = {}
|
||||||
|
__getattr__ = lambda self, name: _swig_getattr(self, SentencePieceProcessor, name)
|
||||||
|
__repr__ = _swig_repr
|
||||||
|
def __init__(self):
|
||||||
|
this = _sentencepiece.new_SentencePieceProcessor()
|
||||||
|
try: self.this.append(this)
|
||||||
|
except: self.this = this
|
||||||
|
__swig_destroy__ = _sentencepiece.delete_SentencePieceProcessor
|
||||||
|
__del__ = lambda self : None;
|
||||||
|
def Load(self, *args): return _sentencepiece.SentencePieceProcessor_Load(self, *args)
|
||||||
|
def LoadOrDie(self, *args): return _sentencepiece.SentencePieceProcessor_LoadOrDie(self, *args)
|
||||||
|
def SetEncodeExtraOptions(self, *args): return _sentencepiece.SentencePieceProcessor_SetEncodeExtraOptions(self, *args)
|
||||||
|
def SetDecodeExtraOptions(self, *args): return _sentencepiece.SentencePieceProcessor_SetDecodeExtraOptions(self, *args)
|
||||||
|
def GetPieceSize(self): return _sentencepiece.SentencePieceProcessor_GetPieceSize(self)
|
||||||
|
def PieceToId(self, *args): return _sentencepiece.SentencePieceProcessor_PieceToId(self, *args)
|
||||||
|
def IdToPiece(self, *args): return _sentencepiece.SentencePieceProcessor_IdToPiece(self, *args)
|
||||||
|
def GetScore(self, *args): return _sentencepiece.SentencePieceProcessor_GetScore(self, *args)
|
||||||
|
def IsUnknown(self, *args): return _sentencepiece.SentencePieceProcessor_IsUnknown(self, *args)
|
||||||
|
def IsControl(self, *args): return _sentencepiece.SentencePieceProcessor_IsControl(self, *args)
|
||||||
|
def Encode(self, *args): return _sentencepiece.SentencePieceProcessor_Encode(self, *args)
|
||||||
|
def EncodeAsPieces(self, *args): return _sentencepiece.SentencePieceProcessor_EncodeAsPieces(self, *args)
|
||||||
|
def EncodeAsIds(self, *args): return _sentencepiece.SentencePieceProcessor_EncodeAsIds(self, *args)
|
||||||
|
def Decode(self, *args): return _sentencepiece.SentencePieceProcessor_Decode(self, *args)
|
||||||
|
def DecodePieces(self, *args): return _sentencepiece.SentencePieceProcessor_DecodePieces(self, *args)
|
||||||
|
def DecodeIds(self, *args): return _sentencepiece.SentencePieceProcessor_DecodeIds(self, *args)
|
||||||
|
def __len__(self): return _sentencepiece.SentencePieceProcessor___len__(self)
|
||||||
|
def __getitem__(self, *args): return _sentencepiece.SentencePieceProcessor___getitem__(self, *args)
|
||||||
|
SentencePieceProcessor_swigregister = _sentencepiece.SentencePieceProcessor_swigregister
|
||||||
|
SentencePieceProcessor_swigregister(SentencePieceProcessor)
|
||||||
|
|
||||||
|
# This file is compatible with both classic and new-style classes.
|
||||||
|
|
||||||
|
|
4764
python/sentencepiece_wrap.cxx
Normal file
4764
python/sentencepiece_wrap.cxx
Normal file
File diff suppressed because it is too large
Load Diff
2
python/setup.cfg
Normal file
2
python/setup.cfg
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
[metadata]
|
||||||
|
description-file = README.md
|
42
python/setup.py
Executable file
42
python/setup.py
Executable file
@ -0,0 +1,42 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
from setuptools import setup, Extension
|
||||||
|
import string
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
sys.path.append('./test')
|
||||||
|
|
||||||
|
with open("README.md") as f:
|
||||||
|
long_description = f.read()
|
||||||
|
|
||||||
|
def cmd(line):
    """Run a shell command and return the first line of its output as tokens.

    Args:
        line: shell command line to execute (e.g. a pkg-config query).

    Returns:
        The whitespace-separated tokens of the first output line, or an
        empty list when the command prints nothing.
    """
    # Close the pipe deterministically instead of relying on garbage collection.
    with os.popen(line) as pipe:
        first_line = pipe.readline()
    # split() ignores the trailing newline and copes with empty output, so a
    # failing command yields [] rather than raising IndexError.
    return first_line.split()
|
||||||
|
|
||||||
|
setup(name = 'sentencepiece',
|
||||||
|
author = 'Taku Kudo',
|
||||||
|
author_email='taku@google.com',
|
||||||
|
description = 'SentencePiece python wrapper',
|
||||||
|
long_description = long_description,
|
||||||
|
url = 'https://github.com/google/sentencepiece',
|
||||||
|
license = 'Apache',
|
||||||
|
platforms = 'Unix',
|
||||||
|
py_modules=['sentencepiece'],
|
||||||
|
ext_modules = [Extension('_sentencepiece',
|
||||||
|
sources=['sentencepiece_wrap.cxx'],
|
||||||
|
extra_compile_args=['-std=c++11'] +
|
||||||
|
cmd('pkg-config sentencepiece --cflags'),
|
||||||
|
extra_link_args=cmd('pkg-config sentencepiece --libs'))
|
||||||
|
],
|
||||||
|
classifiers = [
|
||||||
|
'Development Status :: 5 - Production/Stable',
|
||||||
|
'Environment :: Console',
|
||||||
|
'Intended Audience :: Developers',
|
||||||
|
'Intended Audience :: Science/Research',
|
||||||
|
'License :: OSI Approved :: Apache Software License',
|
||||||
|
'Operating System :: Unix',
|
||||||
|
'Programming Language :: Python',
|
||||||
|
'Topic :: Text Processing :: Linguistic',
|
||||||
|
'Topic :: Software Development :: Libraries :: Python Modules'
|
||||||
|
],
|
||||||
|
test_suite = 'sentencepiece_test.suite')
|
0
python/test/__init__.py
Normal file
0
python/test/__init__.py
Normal file
42
python/test/sentencepiece_test.py
Executable file
42
python/test/sentencepiece_test.py
Executable file
@ -0,0 +1,42 @@
|
|||||||
|
#!/usr/bin/python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import sentencepiece as spm
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
class TestSentencepieceProcessor(unittest.TestCase):
|
||||||
|
"""Test case for SentencePieceProcessor"""
|
||||||
|
|
||||||
|
def setUp(self):
|
||||||
|
self.sp_ = spm.SentencePieceProcessor()
|
||||||
|
self.assertTrue(self.sp_.Load('test/test_model.model'))
|
||||||
|
|
||||||
|
def test_load(self):
|
||||||
|
self.assertEqual(1000, self.sp_.GetPieceSize())
|
||||||
|
self.assertEqual(0, self.sp_.PieceToId('<unk>'))
|
||||||
|
self.assertEqual(1, self.sp_.PieceToId('<s>'))
|
||||||
|
self.assertEqual(2, self.sp_.PieceToId('</s>'))
|
||||||
|
self.assertEqual('<unk>', self.sp_.IdToPiece(0))
|
||||||
|
self.assertEqual('<s>', self.sp_.IdToPiece(1))
|
||||||
|
self.assertEqual('</s>', self.sp_.IdToPiece(2))
|
||||||
|
for i in range(self.sp_.GetPieceSize()):
|
||||||
|
piece = self.sp_.IdToPiece(i)
|
||||||
|
self.assertEqual(i, self.sp_.PieceToId(piece))
|
||||||
|
|
||||||
|
def test_roundtrip(self):
|
||||||
|
text = 'I saw a girl with a telescope.'
|
||||||
|
ids = self.sp_.EncodeAsIds(text)
|
||||||
|
pieces1 = self.sp_.EncodeAsPieces(text)
|
||||||
|
pieces2 = self.sp_.Encode(text)
|
||||||
|
self.assertEqual(pieces1, pieces2)
|
||||||
|
self.assertEqual(text, self.sp_.Decode(pieces1))
|
||||||
|
self.assertEqual(text, self.sp_.DecodePieces(pieces2))
|
||||||
|
self.assertEqual(text, self.sp_.DecodeIds(ids))
|
||||||
|
|
||||||
|
def suite():
    """Build the test suite containing every TestSentencepieceProcessor test.

    Returns:
        A unittest.TestSuite holding the tests of TestSentencepieceProcessor.
    """
    # unittest.makeSuite is deprecated; TestLoader is the supported way to
    # collect the test methods of a TestCase class.
    loader = unittest.TestLoader()
    tests = unittest.TestSuite()
    tests.addTests(loader.loadTestsFromTestCase(TestSentencepieceProcessor))
    return tests
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
unittest.main()
|
BIN
python/test/test_model.model
Normal file
BIN
python/test/test_model.model
Normal file
Binary file not shown.
@ -17,6 +17,7 @@ libsentencepiece_la_SOURCES = \
|
|||||||
word_model.h word_model.cc \
|
word_model.h word_model.cc \
|
||||||
char_model.h char_model.cc \
|
char_model.h char_model.cc \
|
||||||
bpe_model.h bpe_model.cc
|
bpe_model.h bpe_model.cc
|
||||||
|
include_HEADERS = sentencepiece_processor.h
|
||||||
|
|
||||||
noinst_LIBRARIES = libtrain.a
|
noinst_LIBRARIES = libtrain.a
|
||||||
libtrain_a_SOURCES = builder.cc builder.h \
|
libtrain_a_SOURCES = builder.cc builder.h \
|
||||||
@ -37,7 +38,7 @@ BUILT_SOURCES = \
|
|||||||
sentencepiece.pb.cc \
|
sentencepiece.pb.cc \
|
||||||
sentencepiece_model.pb.cc
|
sentencepiece_model.pb.cc
|
||||||
|
|
||||||
EXTRA_DIST = sentencepiece.proto sentencepiece_model.proto
|
EXTRA_DIST = sentencepiece.proto sentencepiece_model.proto
|
||||||
|
|
||||||
bin_PROGRAMS = spm_encode spm_decode spm_normalize spm_train spm_export_vocab
|
bin_PROGRAMS = spm_encode spm_decode spm_normalize spm_train spm_export_vocab
|
||||||
noinst_PROGRAMS = compile_charsmap
|
noinst_PROGRAMS = compile_charsmap
|
||||||
|
@ -168,6 +168,7 @@ class SentencePieceProcessor {
|
|||||||
// Returns true if |id| is control symbol.
|
// Returns true if |id| is control symbol.
|
||||||
virtual bool IsControl(int id) const;
|
virtual bool IsControl(int id) const;
|
||||||
|
|
||||||
|
#ifndef SWIG
|
||||||
//////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////
|
||||||
// Model management.
|
// Model management.
|
||||||
//
|
//
|
||||||
@ -176,6 +177,7 @@ class SentencePieceProcessor {
|
|||||||
|
|
||||||
// Allows injection of a normalizer instance. |normalizer| is moved.
|
// Allows injection of a normalizer instance. |normalizer| is moved.
|
||||||
void SetNormalizer(std::unique_ptr<normalizer::Normalizer> &&normalizer);
|
void SetNormalizer(std::unique_ptr<normalizer::Normalizer> &&normalizer);
|
||||||
|
#endif
|
||||||
|
|
||||||
// Returns immutable model proto. Useful to obtain extended
|
// Returns immutable model proto. Useful to obtain extended
|
||||||
// or experimental parameters encoded in model_proto.
|
// or experimental parameters encoded in model_proto.
|
||||||
|
Loading…
Reference in New Issue
Block a user