Bugfix / Stephan Peitz and more paranoid error checking

Kenneth Heafield 2014-08-27 23:23:39 -04:00
parent 1c45d780d4
commit 02ab8f5102
5 changed files with 28 additions and 22 deletions

View File

@@ -176,7 +176,7 @@ template <class M> void MinimalState(const M &model) {
AppendTest("to", 1, -1.687872, false);
AppendTest("look", 2, -0.2922095, true);
BOOST_CHECK_EQUAL(2, state.length);
AppendTest("good", 3, -7, true);
AppendTest("a", 3, -7, true);
}
template <class M> void ExtendLeftTest(const M &model) {

View File

@@ -41,29 +41,24 @@ class PositiveProbWarn {
WarningAction action_;
};
-template <class Weights> StringPiece Read1Gram(util::FilePiece &f, Weights &weights, PositiveProbWarn &warn) {
+template <class Voc, class Weights> void Read1Gram(util::FilePiece &f, Voc &vocab, Weights *unigrams, PositiveProbWarn &warn) {
try {
-weights.prob = f.ReadFloat();
-if (weights.prob > 0.0) {
-warn.Warn(weights.prob);
-weights.prob = 0.0;
+float prob = f.ReadFloat();
+if (prob > 0.0) {
+warn.Warn(prob);
+prob = 0.0;
}
UTIL_THROW_IF(f.get() != '\t', FormatLoadException, "Expected tab after probability");
-StringPiece ret(f.ReadDelimited(kARPASpaces));
-ReadBackoff(f, weights);
-return ret;
+WordIndex word = vocab.Insert(f.ReadDelimited(kARPASpaces));
+Weights &w = unigrams[word];
+w.prob = prob;
+ReadBackoff(f, w);
} catch(util::Exception &e) {
e << " in the 1-gram at byte " << f.Offset();
throw;
}
}
-template <class Voc, class Weights> void Read1Gram(util::FilePiece &f, Voc &vocab, Weights *unigrams, PositiveProbWarn &warn) {
-Weights temp;
-WordIndex word = vocab.Insert(Read1Gram(f, temp, warn));
-unigrams[word] = temp;
-}
template <class Voc, class Weights> void Read1Grams(util::FilePiece &f, std::size_t count, Voc &vocab, Weights *unigrams, PositiveProbWarn &warn) {
ReadNGramHeader(f, 1);
for (std::size_t i = 0; i < count; ++i) {
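
The hunk above merges the two old Read1Gram overloads into one: instead of parsing into a temporary Weights and copying it into unigrams[word] afterwards, the new version inserts the word into the vocabulary first and fills its unigram slot in place. Below is a minimal standalone sketch of that pattern; the toy Vocab, ProbBackoff, and stream parsing are assumptions for illustration only, not KenLM's actual vocabulary or FilePiece API.

#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>

typedef unsigned int WordIndex;
struct ProbBackoff { float prob; float backoff; };

// Toy vocabulary: Insert assigns ids in order of first appearance, id 0 reserved for <unk>.
struct Vocab {
  Vocab() { ids["<unk>"] = 0; }
  WordIndex Insert(const std::string &word) {
    std::map<std::string, WordIndex>::const_iterator it = ids.find(word);
    if (it != ids.end()) return it->second;
    WordIndex next = static_cast<WordIndex>(ids.size());
    ids[word] = next;
    return next;
  }
  std::map<std::string, WordIndex> ids;
};

// Analogue of the merged Read1Gram: parse one "prob word backoff" line and
// write the weights directly into unigrams[word], with no temporary copy.
void Read1Gram(std::istream &line, Vocab &vocab, ProbBackoff *unigrams) {
  float prob, backoff;
  std::string word;
  line >> prob >> word >> backoff;
  if (prob > 0.0f) prob = 0.0f;  // clamp positive log probabilities, as the real code does after warning
  ProbBackoff &w = unigrams[vocab.Insert(word)];
  w.prob = prob;
  w.backoff = backoff;
}

int main() {
  Vocab vocab;
  std::vector<ProbBackoff> unigrams(8);
  std::istringstream one("-1.687872 to -0.30103");
  Read1Gram(one, vocab, &unigrams[0]);
  std::cout << "to -> id " << vocab.ids["to"] << ", prob " << unigrams[vocab.ids["to"]].prob << std::endl;
  return 0;
}
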
@@ -81,7 +76,12 @@ template <class Voc, class Weights, class Iterator> void ReadNGram(util::FilePie
weights.prob = 0.0;
}
for (unsigned char i = 0; i < n; ++i, ++indices_out) {
-*indices_out = vocab.Index(f.ReadDelimited(kARPASpaces));
+StringPiece word(f.ReadDelimited(kARPASpaces));
+WordIndex index = vocab.Index(word);
+*indices_out = index;
+// Check for words mapped to <unk> that are not the string <unk>.
+UTIL_THROW_IF(index == 0 /* mapped to <unk> */ && (word != StringPiece("<unk>", 5)) && (word != StringPiece("<UNK>", 5)),
+FormatLoadException, "Word " << word << " was not seen in the unigrams (which are supposed to list the entire vocabulary) but appears");
}
ReadBackoff(f, weights);
} catch(util::Exception &e) {
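
The UTIL_THROW_IF added to ReadNGram above is the "more paranoid error checking" from the commit title: if a token inside a higher-order n-gram maps to vocabulary id 0 (<unk>) but is not literally spelled <unk> or <UNK>, then the ARPA file uses a word that was never listed in the unigram section, and loading now fails with an explicit message instead of silently treating the word as unknown. The sketch below reproduces just that check outside KenLM; std::unordered_map and std::runtime_error stand in for the library's vocabulary class and FormatLoadException (assumptions for illustration).

#include <iostream>
#include <stdexcept>
#include <string>
#include <unordered_map>

typedef unsigned int WordIndex;
typedef std::unordered_map<std::string, WordIndex> Vocab;

// Lookup that mimics returning 0 (<unk>) for any out-of-vocabulary word.
WordIndex Index(const Vocab &vocab, const std::string &word) {
  Vocab::const_iterator it = vocab.find(word);
  return it == vocab.end() ? 0 : it->second;
}

// The paranoid check: id 0 is only acceptable for the literal unknown-word token.
void CheckNGramWord(const Vocab &vocab, const std::string &word) {
  WordIndex index = Index(vocab, word);
  if (index == 0 && word != "<unk>" && word != "<UNK>")
    throw std::runtime_error("Word " + word +
        " was not seen in the unigrams (which are supposed to list the entire vocabulary) but appears");
}

int main() {
  Vocab vocab;
  vocab["<unk>"] = 0;
  vocab["to"] = 1;
  vocab["look"] = 2;
  vocab["a"] = 3;
  CheckNGramWord(vocab, "a");      // fine: listed in the unigrams
  CheckNGramWord(vocab, "<unk>");  // fine: the literal unknown token
  try {
    CheckNGramWord(vocab, "good"); // throws: never listed as a unigram
  } catch (const std::exception &e) {
    std::cerr << e.what() << std::endl;
  }
  return 0;
}

This stricter check is presumably also why the two test ARPA files below swap the trigram word "good" for "a", and why the model test at the top now appends "a" instead of "good".
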

View File

@@ -105,7 +105,7 @@ ngram 5=4
-0.04835128 looking on a -0.4771212
-3 also would consider -7
-6 <unk> however <unk> -12
--7 to look good
+-7 to look a
\4-grams:
-0.009249173 looking on a little -0.4771212

View File

@@ -101,7 +101,7 @@ ngram 5=4
-0.1892331 little more loin
-0.04835128 looking on a -0.4771212
-3 also would consider -7
--7 to look good
+-7 to look a
\4-grams:
-0.009249173 looking on a little -0.4771212

View File

@@ -107,14 +107,20 @@ FILE *WriteContextFile(uint8_t *begin, uint8_t *end, const std::string &temp_pre
}
struct ThrowCombine {
-void operator()(std::size_t /*entry_size*/, const void * /*first*/, const void * /*second*/, FILE * /*out*/) const {
-UTIL_THROW(FormatLoadException, "Duplicate n-gram detected.");
+void operator()(std::size_t entry_size, unsigned char order, const void *first, const void *second, FILE * /*out*/) const {
+const WordIndex *base = reinterpret_cast<const WordIndex*>(first);
+FormatLoadException e;
+e << "Duplicate n-gram detected with vocab ids";
+for (const WordIndex *i = base; i != base + order; ++i) {
+e << ' ' << *i;
+}
+throw e;
}
};
// Useful for context files that just contain records with no value.
struct FirstCombine {
-void operator()(std::size_t entry_size, const void *first, const void * /*second*/, FILE *out) const {
+void operator()(std::size_t entry_size, unsigned char /*order*/, const void *first, const void * /*second*/, FILE *out) const {
util::WriteOrThrow(out, first, entry_size);
}
};
@@ -134,7 +140,7 @@ template <class Combine> FILE *MergeSortedFiles(FILE *first_file, FILE *second_f
util::WriteOrThrow(out_file.get(), second.Data(), entry_size);
++second;
} else {
-combine(entry_size, first.Data(), second.Data(), out_file.get());
+combine(entry_size, order, first.Data(), second.Data(), out_file.get());
++first; ++second;
}
}
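
ThrowCombine and FirstCombine now take the n-gram order, and MergeSortedFiles passes it through at the call site, so a duplicate n-gram no longer produces the bare "Duplicate n-gram detected." but lists the vocabulary ids of the offending record. Below is a standalone sketch of that diagnostic; std::ostringstream and std::runtime_error stand in for streaming into FormatLoadException (an assumption for illustration).

#include <iostream>
#include <sstream>
#include <stdexcept>

typedef unsigned int WordIndex;

// A sorted record begins with `order` vocabulary ids; include them all in the error message.
void ThrowDuplicate(const void *record, unsigned char order) {
  const WordIndex *base = reinterpret_cast<const WordIndex*>(record);
  std::ostringstream message;
  message << "Duplicate n-gram detected with vocab ids";
  for (const WordIndex *i = base; i != base + order; ++i) {
    message << ' ' << *i;
  }
  throw std::runtime_error(message.str());
}

int main() {
  WordIndex trigram[3] = {5, 17, 2};
  try {
    ThrowDuplicate(trigram, 3);
  } catch (const std::exception &e) {
    // Prints: Duplicate n-gram detected with vocab ids 5 17 2
    std::cerr << e.what() << std::endl;
  }
  return 0;
}

FirstCombine accepts and ignores the extra order argument so the two functors keep the same call signature inside MergeSortedFiles.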