Bug fix: tokenizer used in mert; add unit tests for that.

When tokenizing a space-delimited string that ends with a delimiter
(say, "9 9 8 7 ") with Tokenize(), the resulting sequence of strings is
{"9", "9", "8", "7", "" }, which is different
from what we expected. We are not interested in empty strings.

This commit fixes this issue and adds unit tests for
the tokenize functions.
This commit is contained in:
Tetsuo Kiso 2012-02-20 07:39:24 +09:00
parent 4b6232b757
commit 8c7dfe04e7
3 changed files with 68 additions and 1 deletions

View File

@ -44,5 +44,6 @@ exe pro : pro.cpp mert_lib ..//boost_program_options ;
alias programs : mert extractor evaluator pro ;
unit-test data_test : DataTest.cpp mert_lib ..//boost_unit_test_framework ;
unit-test util_test : UtilTest.cpp mert_lib ..//boost_unit_test_framework ;
install legacy : programs : <location>. ;

View File

@ -67,7 +67,8 @@ void Tokenize(const char *str, const char delim,
while (1) {
const char *begin = str;
while (*str != delim && *str) str++;
res->push_back(std::string(begin, str));
if (begin != str) // Don't create empty string objects.
res->push_back(std::string(begin, str));
if (*str++ == 0) break;
}
}

65
mert/UtilTest.cpp Normal file
View File

@ -0,0 +1,65 @@
#include "Util.h"
#define BOOST_TEST_MODULE UtilTest
#include <boost/test/unit_test.hpp>
// Checks getNextPound() by draining an input string one token at a time and
// comparing the collected tokens against the expected sequence.
BOOST_AUTO_TEST_CASE(util_get_next_pound_test) {
  // Case 1: two-argument call on space-separated input with a trailing space.
  // NOTE(review): presumably the default delimiter is a space -- confirm
  // against the declaration in Util.h.
  {
    std::vector<std::string> res;
    std::string input("9 9 7 ");
    std::string token;
    // The loop can only terminate if getNextPound() consumes `input` while
    // it extracts each token into `token`.
    while (!input.empty()) {
      getNextPound(input, token);
      res.push_back(token);
    }

    BOOST_REQUIRE(res.size() == 3);
    BOOST_CHECK_EQUAL("9", res[0]);
    BOOST_CHECK_EQUAL("9", res[1]);
    BOOST_CHECK_EQUAL("7", res[2]);
  }

  // Case 2: explicit "," delimiter, no trailing delimiter in the input.
  {
    const std::string comma = ",";
    std::vector<std::string> res;
    std::string input("ref.0,ref.1,ref.2");
    std::string token;
    while (!input.empty()) {
      getNextPound(input, token, comma);
      res.push_back(token);
    }

    BOOST_REQUIRE(res.size() == 3);
    BOOST_CHECK_EQUAL("ref.0", res[0]);
    BOOST_CHECK_EQUAL("ref.1", res[1]);
    BOOST_CHECK_EQUAL("ref.2", res[2]);
  }
}
// Checks Tokenize() on delimiter-separated C strings, including inputs that
// end with a delimiter (which must not yield a trailing empty token).
BOOST_AUTO_TEST_CASE(util_tokenize_test) {
  // Space-delimited, no trailing delimiter.
  {
    const char *const input = "9 9 7";
    std::vector<std::string> res;
    Tokenize(input, ' ', &res);

    BOOST_REQUIRE(res.size() == 3);
    BOOST_CHECK_EQUAL("9", res[0]);
    BOOST_CHECK_EQUAL("9", res[1]);
    BOOST_CHECK_EQUAL("7", res[2]);
  }

  // Space-delimited with a trailing space: still exactly three tokens.
  {
    const char *const input = "9 8 7 ";
    std::vector<std::string> res;
    Tokenize(input, ' ', &res);

    BOOST_REQUIRE(res.size() == 3);
    BOOST_CHECK_EQUAL("9", res[0]);
    BOOST_CHECK_EQUAL("8", res[1]);
    BOOST_CHECK_EQUAL("7", res[2]);
  }

  // Comma-delimited with a trailing comma: exactly two tokens.
  {
    const char *const input = "ref.0,ref.1,";
    std::vector<std::string> res;
    Tokenize(input, ',', &res);

    BOOST_REQUIRE(res.size() == 2);
    BOOST_CHECK_EQUAL("ref.0", res[0]);
    BOOST_CHECK_EQUAL("ref.1", res[1]);
  }
}