Bug fix: tokenizer used in mert; add unit tests for it.

When tokenizing a string delimited by spaces (say, "9 9 8 7 ") with Tokenize(), the resulting sequence of strings is {"9", "9", "8", "7", ""}, which is different from what we expected: we are not interested in empty strings. This commit fixes the issue and adds unit tests for the tokenize functions.
2024-12-26 13:23:25 +03:00 · 2012-02-20 07:39:24 +09:00 · 2012-02-20 07:39:24 +09:00 · 8c7dfe04e7
commit 8c7dfe04e7
parent 4b6232b757
3 changed files with 68 additions and 1 deletions
--- a/mert/Jamfile
+++ b/mert/Jamfile
@ -44,5 +44,6 @@ exe pro : pro.cpp mert_lib ..//boost_program_options ;
 alias programs : mert extractor evaluator pro ;

 unit-test data_test : DataTest.cpp mert_lib ..//boost_unit_test_framework ;
+unit-test util_test : UtilTest.cpp mert_lib ..//boost_unit_test_framework ;

 install legacy : programs : <location>. ;
--- a/mert/Util.cpp
+++ b/mert/Util.cpp
@ -67,7 +67,8 @@ void Tokenize(const char *str, const char delim,
  while (1) {
    const char *begin = str;
    while (*str != delim && *str) str++;
-    res->push_back(std::string(begin, str));
+    if (begin != str)            // Don't create empty string objects.
+      res->push_back(std::string(begin, str));
    if (*str++ == 0) break;
  }
 }
--- a/mert/UtilTest.cpp
+++ b/mert/UtilTest.cpp
@ -0,0 +1,65 @@
+#include "Util.h"
+
+#define BOOST_TEST_MODULE UtilTest
+#include <boost/test/unit_test.hpp>
+
+BOOST_AUTO_TEST_CASE(util_get_next_pound_test) {
+  {
+    std::string str("9 9 7 ");
+    std::string substr;
+    std::vector<std::string> res;
+
+    while (!str.empty()) {
+      getNextPound(str, substr);
+      res.push_back(substr);
+    }
+    BOOST_REQUIRE(res.size() == 3);
+    BOOST_CHECK_EQUAL("9", res[0]);
+    BOOST_CHECK_EQUAL("9", res[1]);
+    BOOST_CHECK_EQUAL("7", res[2]);
+  }
+
+  {
+    std::string str("ref.0,ref.1,ref.2");
+    std::string substr;
+    std::vector<std::string> res;
+    const std::string delim(",");
+
+    while (!str.empty()) {
+      getNextPound(str, substr, delim);
+      res.push_back(substr);
+    }
+    BOOST_REQUIRE(res.size() == 3);
+    BOOST_CHECK_EQUAL("ref.0", res[0]);
+    BOOST_CHECK_EQUAL("ref.1", res[1]);
+    BOOST_CHECK_EQUAL("ref.2", res[2]);
+  }
+}
+
+BOOST_AUTO_TEST_CASE(util_tokenize_test) {
+  {
+    std::vector<std::string> res;
+    Tokenize("9 9 7", ' ', &res);
+    BOOST_REQUIRE(res.size() == 3);
+    BOOST_CHECK_EQUAL("9", res[0]);
+    BOOST_CHECK_EQUAL("9", res[1]);
+    BOOST_CHECK_EQUAL("7", res[2]);
+  }
+
+  {
+    std::vector<std::string> res;
+    Tokenize("9 8 7 ", ' ', &res);
+    BOOST_REQUIRE(res.size() == 3);
+    BOOST_CHECK_EQUAL("9", res[0]);
+    BOOST_CHECK_EQUAL("8", res[1]);
+    BOOST_CHECK_EQUAL("7", res[2]);
+  }
+
+  {
+    std::vector<std::string> res;
+    Tokenize("ref.0,ref.1,", ',', &res);
+    BOOST_REQUIRE(res.size() == 2);
+    BOOST_CHECK_EQUAL("ref.0", res[0]);
+    BOOST_CHECK_EQUAL("ref.1", res[1]);
+  }
+}