mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-26 13:23:25 +03:00
Bug fix: tokenizer used in mert; add unit tests for that.
When tokenizing a string delimited by spaces (say, "9 9 8 7 ") with Tokenize(), the resulting sequence of strings is {"9", "9", "8", "7", ""}, which is different from what we expected. We are not interested in empty strings. This commit fixes this issue and adds unit tests for the tokenize functions.
This commit is contained in:
parent
4b6232b757
commit
8c7dfe04e7
@ -44,5 +44,6 @@ exe pro : pro.cpp mert_lib ..//boost_program_options ;
|
||||
alias programs : mert extractor evaluator pro ;
|
||||
|
||||
unit-test data_test : DataTest.cpp mert_lib ..//boost_unit_test_framework ;
|
||||
unit-test util_test : UtilTest.cpp mert_lib ..//boost_unit_test_framework ;
|
||||
|
||||
install legacy : programs : <location>. ;
|
||||
|
@ -67,7 +67,8 @@ void Tokenize(const char *str, const char delim,
|
||||
while (1) {
|
||||
const char *begin = str;
|
||||
while (*str != delim && *str) str++;
|
||||
res->push_back(std::string(begin, str));
|
||||
if (begin != str) // Don't create empty string objects.
|
||||
res->push_back(std::string(begin, str));
|
||||
if (*str++ == 0) break;
|
||||
}
|
||||
}
|
||||
|
65
mert/UtilTest.cpp
Normal file
65
mert/UtilTest.cpp
Normal file
@ -0,0 +1,65 @@
|
||||
#include "Util.h"
|
||||
|
||||
#define BOOST_TEST_MODULE UtilTest
|
||||
#include <boost/test/unit_test.hpp>
|
||||
|
||||
// Exercises getNextPound() in two scenarios: the two-argument form on a
// space-separated string, and the three-argument form with an explicit ","
// delimiter. Each scenario calls it in a loop until the input string is empty
// and collects every extracted substring for comparison.
// NOTE(review): getNextPound() is declared outside this file (Util.h); the
// loops presumably rely on it removing the extracted piece from `str` so that
// `str` eventually becomes empty -- confirm against its definition.
BOOST_AUTO_TEST_CASE(util_get_next_pound_test) {
|
||||
{
|
||||
std::string str("9 9 7 ");  // input ends with a trailing space
|
||||
std::string substr;
|
||||
std::vector<std::string> res;
|
||||
|
||||
while (!str.empty()) {
|
||||
getNextPound(str, substr);  // two-argument form: no delimiter passed
|
||||
res.push_back(substr);
|
||||
}
|
||||
BOOST_REQUIRE(res.size() == 3);  // REQUIRE: stop here on failure so res[0..2] below is never indexed out of range
|
||||
BOOST_CHECK_EQUAL("9", res[0]);
|
||||
BOOST_CHECK_EQUAL("9", res[1]);
|
||||
BOOST_CHECK_EQUAL("7", res[2]);
|
||||
}
|
||||
|
||||
{
|
||||
std::string str("ref.0,ref.1,ref.2");  // comma-separated input, no trailing delimiter
|
||||
std::string substr;
|
||||
std::vector<std::string> res;
|
||||
const std::string delim(",");  // delimiter passed explicitly in this scenario
|
||||
|
||||
while (!str.empty()) {
|
||||
getNextPound(str, substr, delim);
|
||||
res.push_back(substr);
|
||||
}
|
||||
BOOST_REQUIRE(res.size() == 3);  // REQUIRE: guards the indexed checks below
|
||||
BOOST_CHECK_EQUAL("ref.0", res[0]);
|
||||
BOOST_CHECK_EQUAL("ref.1", res[1]);
|
||||
BOOST_CHECK_EQUAL("ref.2", res[2]);
|
||||
}
|
||||
}
|
||||
|
||||
// Exercises Tokenize(const char*, char, std::vector<std::string>*) on three
// inputs: no trailing delimiter, a trailing space, and a trailing comma.
// The expected sizes assert that a trailing delimiter does not add an empty
// string to the result.
BOOST_AUTO_TEST_CASE(util_tokenize_test) {
|
||||
{
|
||||
std::vector<std::string> res;
|
||||
Tokenize("9 9 7", ' ', &res);  // baseline: no trailing delimiter
|
||||
BOOST_REQUIRE(res.size() == 3);  // REQUIRE: stop here on failure so res[0..2] below is never indexed out of range
|
||||
BOOST_CHECK_EQUAL("9", res[0]);
|
||||
BOOST_CHECK_EQUAL("9", res[1]);
|
||||
BOOST_CHECK_EQUAL("7", res[2]);
|
||||
}
|
||||
|
||||
{
|
||||
std::vector<std::string> res;
|
||||
Tokenize("9 8 7 ", ' ', &res);  // trailing space must not yield a fourth (empty) token
|
||||
BOOST_REQUIRE(res.size() == 3);
|
||||
BOOST_CHECK_EQUAL("9", res[0]);
|
||||
BOOST_CHECK_EQUAL("8", res[1]);
|
||||
BOOST_CHECK_EQUAL("7", res[2]);
|
||||
}
|
||||
|
||||
{
|
||||
std::vector<std::string> res;
|
||||
Tokenize("ref.0,ref.1,", ',', &res);  // same trailing-delimiter case with ',' instead of ' '
|
||||
BOOST_REQUIRE(res.size() == 2);
|
||||
BOOST_CHECK_EQUAL("ref.0", res[0]);
|
||||
BOOST_CHECK_EQUAL("ref.1", res[1]);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user