fix tokenization

This commit is contained in:
Hieu Hoang 2015-12-22 18:15:00 +00:00
parent 1f2c4785b4
commit 139ee7e0c8
2 changed files with 23 additions and 8 deletions

View File

@ -2,41 +2,48 @@
/**
 * Split one text line into its "|||"-separated fields.
 *
 * Fields are consumed in this order: source phrase, target phrase,
 * probabilities, word alignment, counts, sparse score, property.
 * Every field is passed through Trim() before being stored, so the blanks
 * that surround the separator are not part of the stored value.
 *
 * The first three fields are read unconditionally. The remaining four are
 * optional: as soon as the tokenizer is exhausted the function returns and
 * the members of the result that were not reached are left untouched.
 *
 * NOTE(review): there is no end-of-tokens check before reading the first
 * three fields; what happens on a line with fewer than three fields depends
 * on util::TokenIter's dereference behaviour -- confirm against its header.
 * NOTE(review): the stored values are presumably views into textin's buffer
 * rather than copies -- confirm against the declaration of line_text.
 *
 * @param textin the line to split
 * @return a line_text holding the trimmed fields that were present
 */
line_text splitLine(StringPiece textin)
{
  typedef util::TokenIter<util::MultiCharacter> FieldIter;

  // Split on the bare separator; surrounding whitespace is removed by Trim().
  const char delim[] = "|||";
  line_text output;

  FieldIter it(textin, util::MultiCharacter(delim));

  // Source phrase
  output.source_phrase = Trim(*it);

  // Target phrase
  ++it;
  output.target_phrase = Trim(*it);

  // Probabilities
  ++it;
  output.prob = Trim(*it);

  // Word alignment (optional)
  ++it;
  if (it == FieldIter::end()) return output;
  output.word_align = Trim(*it);

  // Counts (optional)
  ++it;
  if (it == FieldIter::end()) return output;
  output.counts = Trim(*it);

  // Sparse score (optional)
  ++it;
  if (it == FieldIter::end()) return output;
  output.sparse_score = Trim(*it);

  // Property (optional)
  ++it;
  if (it == FieldIter::end()) return output;
  output.property = Trim(*it);

  return output;
}

View File

@ -257,6 +257,14 @@ inline bool operator>=(const StringPiece& x, const StringPiece& y) {
return !(x < y);
}
/**
 * Return the part of str that remains after removing every leading and
 * trailing character contained in dropChars.
 *
 * The result is produced with StringPiece::substr on str; no characters are
 * copied by this function itself.
 *
 * @param str       the text to trim
 * @param dropChars the set of characters to strip from both ends
 *                  (defaults to space, tab, newline and carriage return)
 * @return the trimmed piece; an empty piece when str is empty or consists
 *         solely of characters from dropChars
 */
inline StringPiece Trim(const StringPiece& str, const std::string& dropChars = " \t\n\r")
{
  const StringPiece::size_type startPos = str.find_first_not_of(dropChars);
  if (startPos == StringPiece::npos) {
    // Nothing but dropChars (or nothing at all). Return an empty piece
    // explicitly rather than feeding npos into the length arithmetic below.
    return str.substr(str.size(), 0);
  }

  // A kept character exists, so the backwards search succeeds and
  // endPos >= startPos.
  const StringPiece::size_type endPos = str.find_last_not_of(dropChars);
  return str.substr(startPos, endPos - startPos + 1);
}
// allow StringPiece to be logged (needed for unit testing).
inline std::ostream& operator<<(std::ostream& o, const StringPiece& piece) {
return o.write(piece.data(), static_cast<std::streamsize>(piece.size()));