fix tokenization

This commit is contained in:
Hieu Hoang 2015-12-22 18:15:00 +00:00
parent 1f2c4785b4
commit 139ee7e0c8
2 changed files with 23 additions and 8 deletions

View File

@ -2,41 +2,48 @@
line_text splitLine(StringPiece textin) line_text splitLine(StringPiece textin)
{ {
const char delim[] = " ||| "; const char delim[] = "|||";
line_text output; line_text output;
//Tokenize //Tokenize
util::TokenIter<util::MultiCharacter> it(textin, util::MultiCharacter(delim)); util::TokenIter<util::MultiCharacter> it(textin, util::MultiCharacter(delim));
//Get source phrase //Get source phrase
output.source_phrase = *it; output.source_phrase = Trim(*it);
//std::cerr << "output.source_phrase=" << output.source_phrase << "AAAA" << std::endl;
//Get target_phrase //Get target_phrase
it++; it++;
output.target_phrase = *it; output.target_phrase = Trim(*it);
//std::cerr << "output.target_phrase=" << output.target_phrase << "AAAA" << std::endl;
//Get probabilities //Get probabilities
it++; it++;
output.prob = *it; output.prob = Trim(*it);
//std::cerr << "output.prob=" << output.prob << "AAAA" << std::endl;
//Get WordAllignment //Get WordAllignment
it++; it++;
if (it == util::TokenIter<util::MultiCharacter>::end()) return output; if (it == util::TokenIter<util::MultiCharacter>::end()) return output;
output.word_align = *it; output.word_align = Trim(*it);
//std::cerr << "output.word_align=" << output.word_align << "AAAA" << std::endl;
//Get count //Get count
it++; it++;
if (it == util::TokenIter<util::MultiCharacter>::end()) return output; if (it == util::TokenIter<util::MultiCharacter>::end()) return output;
output.counts = *it; output.counts = Trim(*it);
//std::cerr << "output.counts=" << output.counts << "AAAA" << std::endl;
//Get sparse_score //Get sparse_score
it++; it++;
if (it == util::TokenIter<util::MultiCharacter>::end()) return output; if (it == util::TokenIter<util::MultiCharacter>::end()) return output;
output.sparse_score = *it; output.sparse_score = Trim(*it);
//std::cerr << "output.sparse_score=" << output.sparse_score << "AAAA" << std::endl;
//Get property //Get property
it++; it++;
if (it == util::TokenIter<util::MultiCharacter>::end()) return output; if (it == util::TokenIter<util::MultiCharacter>::end()) return output;
output.property = *it; output.property = Trim(*it);
//std::cerr << "output.property=" << output.property << "AAAA" << std::endl;
return output; return output;
} }

View File

@ -257,6 +257,14 @@ inline bool operator>=(const StringPiece& x, const StringPiece& y) {
return !(x < y); return !(x < y);
} }
/**
 * Strip leading and trailing characters from a StringPiece.
 *
 * @param str        the piece to trim.
 * @param dropChars  set of characters to remove from both ends
 *                   (defaults to space, tab, newline and carriage return).
 * @return the sub-piece of str (obtained via substr) that starts at the first
 *         character not in dropChars and ends at the last such character.
 *         If str is empty or consists solely of dropChars, an empty piece
 *         is returned.
 */
inline StringPiece Trim(const StringPiece& str, const std::string& dropChars = " \t\n\r")
{
  const StringPiece::size_type startPos = str.find_first_not_of(dropChars);
  if (startPos == StringPiece::npos) {
    // Nothing survives trimming. Return an empty piece explicitly instead of
    // calling substr() with an out-of-range position and relying on it to clamp.
    return str.substr(str.size());
  }
  // startPos is valid, so at least one kept character exists and
  // find_last_not_of cannot return npos; endPos >= startPos.
  const StringPiece::size_type endPos = str.find_last_not_of(dropChars);
  return str.substr(startPos, endPos - startPos + 1);
}
// allow StringPiece to be logged (needed for unit testing). // allow StringPiece to be logged (needed for unit testing).
inline std::ostream& operator<<(std::ostream& o, const StringPiece& piece) { inline std::ostream& operator<<(std::ostream& o, const StringPiece& piece) {
return o.write(piece.data(), static_cast<std::streamsize>(piece.size())); return o.write(piece.data(), static_cast<std::streamsize>(piece.size()));