Merged PR 10692: new factor conditioning, inline fixing suppression, nan suppression

This adds several new features; the first two relate to factored vocabs:
* a new conditioning mechanism that mimics a mini transformer layer between the emitted lemma and the factors. This affects only factored vocabs and must be enabled explicitly. Change is in `generic.cpp`. (A standalone sketch of the gating arithmetic follows this list.)
* in case of inline phrase-fixing, cross-attention is no longer allowed to look into the inline-fix source tokens. This only affects inputs with `|is` factors or `<IOPEN>` tags. Change is in `states.h`.
* Adam optimizer now skips the update if the gradient contains a NaN. Does not affect existing configs unless they produce NaNs. Change is in `optimizers.cpp`.
* reverts to the old `LayerNorm` routine. *TODO*: Is this change still needed?
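For orientation, below is a standalone, single-head sketch of the gating arithmetic behind the new conditioning. The actual implementation (in the `generic.cpp` diff further down) uses Marian expressions with multiple heads, layer normalization, and a feed-forward block; every name in this sketch is illustrative.

```cpp
#include <cmath>
#include <vector>

static float dotProd(const std::vector<float>& a, const std::vector<float>& b) {
  float s = 0.f;
  for (size_t i = 0; i < a.size(); ++i) s += a[i] * b[i];
  return s;
}

// r = sigmoid(q . k) * (ve - vi) + vi
// The factor input becomes an interpolation between the lemma value vector (ve)
// and the decoder-state value vector (vi), gated by a sigmoid of a query-key
// dot product (a sigmoid on the difference, rather than a softmax over positions).
std::vector<float> gatedMix(const std::vector<float>& q,   // projected query (from the lemma embedding)
                            const std::vector<float>& k,   // projected key (lemma minus decoder state)
                            const std::vector<float>& ve,  // lemma embedding projected as value
                            const std::vector<float>& vi)  // decoder state projected as value
{
  float gate = 1.f / (1.f + std::exp(-dotProd(q, k)));      // sigmoid gate in (0,1)
  std::vector<float> r(vi.size());
  for (size_t i = 0; i < r.size(); ++i)
    r[i] = gate * (ve[i] - vi[i]) + vi[i];                  // gate→1 picks ve, gate→0 picks vi
  return r;
}
```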

Additional changes:
* new method `locate()` for accessing batch data with array coordinates (see the usage sketch after this list)
* new overloads of `constant_like()` that take a vector directly (the most common use case)
* rvalue-ref version of `fromVector()`
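A hypothetical usage sketch of these helpers (declarations are in the `corpus_base.h` and `expression_operators.h` diffs below; the function name, indices, and shapes are made up):

```cpp
#include "data/corpus_base.h"            // SubBatch::locate()
#include "graph/expression_operators.h"  // constant_like() vector overloads

using namespace marian;

// Read one word and build a constant from the mask vector, without writing the
// "wordPos * batchSize + batchIdx" index arithmetic by hand. 'like' is assumed
// to have as many elements as the mask.
Expr maskConstantFor(Ptr<data::SubBatch> sb, Expr like) {
  size_t b = 0, s = 3;  // batch entry, word position (arbitrary for illustration)
  Word w = sb->data()[sb->locate(/*batchIdx=*/b, /*wordPos=*/s)];
  (void)w;
  // the temporary vector is an rvalue, so this goes through the new
  // rvalue-ref fromVector() and avoids an extra copy
  return constant_like(like, std::vector<float>(sb->mask()));
}
```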
Frank Seide 2019-12-20 01:46:37 +00:00 committed by Martin Junczys-Dowmunt
parent bab02e3b84
commit f882f27c09
24 changed files with 356 additions and 132 deletions

src/common/file_stream.cpp Normal file → Executable file

src/data/corpus.cpp Normal file → Executable file

@ -235,11 +235,12 @@ CorpusBase::batch_ptr Corpus::toBatch(const std::vector<Sample>& batchVector) {
}
std::vector<size_t> words(maxDims.size(), 0);
for(size_t i = 0; i < batchSize; ++i) {
for(size_t j = 0; j < maxDims.size(); ++j) {
for(size_t k = 0; k < batchVector[i][j].size(); ++k) {
subBatches[j]->data()[k * batchSize + i] = batchVector[i][j][k];
subBatches[j]->mask()[k * batchSize + i] = 1.f;
for(size_t b = 0; b < batchSize; ++b) { // loop over batch entries
for(size_t j = 0; j < maxDims.size(); ++j) { // loop over streams
auto subBatch = subBatches[j];
for(size_t s = 0; s < batchVector[b][j].size(); ++s) { // loop over word positions
subBatch->data()[subBatch->locate(/*batchIdx=*/b, /*wordPos=*/s)/*s * batchSize + b*/] = batchVector[b][j][s];
subBatch->mask()[subBatch->locate(/*batchIdx=*/b, /*wordPos=*/s)/*s * batchSize + b*/] = 1.f;
words[j]++;
}
}


@ -1,6 +1,7 @@
#include <random>
#include "data/corpus.h"
#include "data/factored_vocab.h"
namespace marian {
namespace data {
@ -330,5 +331,54 @@ void CorpusBase::initEOS(bool training = true) {
}
}
// experimental: hide inline-fix source tokens from cross attention
std::vector<float> SubBatch::crossMaskWithInlineFixSourceSuppressed() const
{
const auto& srcVocab = *vocab();
auto factoredVocab = vocab()->tryAs<FactoredVocab>();
size_t inlineFixGroupIndex = 0, inlineFixSrc = 0;
auto hasInlineFixFactors = factoredVocab && factoredVocab->tryGetFactor(FactoredVocab_INLINE_FIX_WHAT_serialized, /*out*/ inlineFixGroupIndex, /*out*/ inlineFixSrc);
auto fixSrcId = srcVocab[FactoredVocab_FIX_SRC_ID_TAG];
auto fixTgtId = srcVocab[FactoredVocab_FIX_TGT_ID_TAG];
auto fixEndId = srcVocab[FactoredVocab_FIX_END_ID_TAG];
auto unkId = srcVocab.getUnkId();
auto hasInlineFixTags = fixSrcId != unkId && fixTgtId != unkId && fixEndId != unkId;
auto m = mask(); // default return value, which we will modify in-place below in case we need to
if (hasInlineFixFactors || hasInlineFixTags) {
LOG_ONCE(info, "[data] Suppressing cross-attention into inline-fix source tokens");
// example: force French translation of name "frank" to always be "franck"
// - hasInlineFixFactors: "frank|is franck|it", "frank|is" cannot be cross-attended to
// - hasInlineFixTags: "<IOPEN> frank <IDELIM> franck <ICLOSE>", "frank" and all tags cannot be cross-attended to
auto dimBatch = batchSize(); // number of sentences in the batch
auto dimWidth = batchWidth(); // number of words in the longest sentence in the batch
const auto& d = data();
size_t numWords = 0;
for (size_t b = 0; b < dimBatch; b++) { // loop over batch entries
bool inside = false;
for (size_t s = 0; s < dimWidth; s++) { // loop over source positions
auto i = locate(/*batchIdx=*/b, /*wordPos=*/s);
if (!m[i])
break;
numWords++;
// keep track of entering/exiting the inline-fix source tags
auto w = d[i];
if (w == fixSrcId)
inside = true;
else if (w == fixTgtId)
inside = false;
bool wHasSrcIdFactor = hasInlineFixFactors && factoredVocab->getFactor(w, inlineFixGroupIndex) == inlineFixSrc;
if (inside || w == fixSrcId || w == fixTgtId || w == fixEndId || wHasSrcIdFactor)
m[i] = 0.0f; // decoder must not look at embedded source, nor the markup tokens
}
}
ABORT_IF(batchWords() != 0/*n/a*/ && numWords != batchWords(), "batchWords() inconsistency??");
}
return m;
}
} // namespace data
} // namespace marian
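As a standalone illustration of the tag-based case (hypothetical token sequence), this is the per-position decision the loop above makes: only the inline-fix target token and the surrounding sentence remain visible to cross-attention.

```cpp
#include <iostream>
#include <string>
#include <vector>

int main() {
  // force "frank" to be translated as "franck"
  std::vector<std::string> src = {"the", "<IOPEN>", "frank", "<IDELIM>", "franck", "<ICLOSE>", "here"};
  bool inside = false;
  for (const auto& w : src) {
    if (w == "<IOPEN>")       inside = true;   // entering the inline-fix source span
    else if (w == "<IDELIM>") inside = false;  // switching to the inline-fix target span
    bool suppressed = inside || w == "<IOPEN>" || w == "<IDELIM>" || w == "<ICLOSE>";
    std::cout << w << " -> mask " << (suppressed ? 0 : 1) << "\n";
  }
  // output: the 1, <IOPEN> 0, frank 0, <IDELIM> 0, franck 1, <ICLOSE> 0, here 1
}
```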

src/data/corpus_base.h Normal file → Executable file

@ -143,12 +143,19 @@ public:
* words (width) and \f$s\f$ is the number of sentences (size).
*/
Words& data() { return indices_; }
const Words& data() const { return indices_; }
/**
* @brief compute flat index into data() and mask() vectors for given batch index and word index in sentence
*/
size_t locate(size_t batchIdx, size_t wordPos) const { return locate(batchIdx, wordPos, size_); }
static size_t locate(size_t batchIdx, size_t wordPos, size_t batchSize) { return wordPos * batchSize + batchIdx; }
/**
* @brief Flat masking vector; 0 is used for masked words.
*
* @see data()
*/
std::vector<float>& mask() { return mask_; }
const std::vector<float>& mask() const { return mask_; }
/**
* @brief Accessors to the vocab_ field.
@ -158,15 +165,15 @@ public:
/**
* @brief The number of sentences in the batch.
*/
size_t batchSize() { return size_; }
size_t batchSize() const { return size_; }
/**
* @brief The number of words in the longest sentence in the batch.
*/
size_t batchWidth() { return width_; };
size_t batchWidth() const { return width_; };
/**
* @brief The total number of words in the batch (not counting masked-out words).
*/
size_t batchWords() { return words_; }
size_t batchWords() const { return words_; }
/**
* @brief Splits the stream into sub-batches of equal size (except for last).
@ -179,7 +186,7 @@ public:
*
* @see marian::data::Batch::split(size_t n)
*/
std::vector<Ptr<SubBatch>> split(size_t n, size_t sizeLimit /*or SIZE_MAX*/) {
std::vector<Ptr<SubBatch>> split(size_t n, size_t sizeLimit /*or SIZE_MAX*/) const {
ABORT_IF(size_ == 0, "Encountered sub-batch size of 0");
auto size = std::min(size_, sizeLimit); // if limit is given then pretend the batch only has that many sentences
@ -191,11 +198,11 @@ public:
// determine actual width (=max length) of this sub-batch, which may be smaller than the overall max length
size_t subWidth = 0;
for(size_t j = 0; j < width_; ++j) {
for(size_t i = 0; i < subSize; ++i) {
if(mask_[j * size_ + (pos + i)] != 0)
if (subWidth < j + 1)
subWidth = j + 1;
for(size_t s = 0; s < width_; ++s) {
for(size_t b = 0; b < subSize; ++b) {
if(mask_[locate(/*batchIdx=*/pos + b, /*wordPos=*/s)] != 0) // s * size_ + (pos + b)
if (subWidth < s + 1)
subWidth = s + 1;
}
}
@ -203,12 +210,12 @@ public:
auto sb = New<SubBatch>(subSize, subWidth, vocab_);
size_t words = 0;
for(size_t j = 0; j < subWidth; ++j) {
for(size_t i = 0; i < subSize; ++i) {
sb->data()[j * subSize + i] = indices_[j * size_ + (pos + i)];
sb->mask()[j * subSize + i] = mask_[j * size_ + (pos + i)];
for(size_t s = 0; s < subWidth; ++s) {
for(size_t b = 0; b < subSize; ++b) {
sb->data()[locate(/*batchIdx=*/b, /*wordPos=*/s, /*batchSize=*/subSize)/*s * subSize + b*/] = indices_[locate(/*batchIdx=*/pos + b, /*wordPos=*/s)]; // s * size_ + (pos + b)
sb->mask()[locate(/*batchIdx=*/b, /*wordPos=*/s, /*batchSize=*/subSize)/*s * subSize + b*/] = mask_[locate(/*batchIdx=*/pos + b, /*wordPos=*/s)]; // s * size_ + (pos + b)
if(mask_[j * size_ + (pos + i)] != 0)
if(mask_[locate(/*batchIdx=*/pos + b, /*wordPos=*/s)/*s * size_ + (pos + b)*/] != 0)
words++;
}
}
@ -220,6 +227,9 @@ public:
}
void setWords(size_t words) { words_ = words; }
// experimental: hide inline-fix source tokens from cross attention
std::vector<float> crossMaskWithInlineFixSourceSuppressed() const;
};
/**
@ -229,7 +239,7 @@ public:
class CorpusBatch : public Batch {
protected:
std::vector<Ptr<SubBatch>> subBatches_;
std::vector<float> guidedAlignment_;
std::vector<float> guidedAlignment_; // [max source len, batch size, max target len] flattened
std::vector<float> dataWeights_;
public:
@ -300,7 +310,8 @@ public:
/**
* @brief Creates a batch filled with fake data. Used to determine the size of
* the batch object.
* the batch object. With guided-alignments and multiple encoders, those
* multiple source streams are expected to have the same lengths.
*
* @param lengths List of subbatch sizes.
* @param batchSize Number of sentences in the batch.
@ -333,6 +344,7 @@ public:
return batch;
if(options->get("guided-alignment", std::string("none")) != "none") {
// @TODO: if > 1 encoder, verify that all encoders have the same sentence lengths
std::vector<float> alignment(batchSize * lengths.front() * lengths.back(),
0.f);
batch->setGuidedAlignment(std::move(alignment));
@ -406,7 +418,7 @@ public:
size_t bi = i + pos;
for(size_t sid = 0; sid < srcWords; ++sid) {
for(size_t tid = 0; tid < trgWords; ++tid) {
size_t bidx = sid * oldSize * oldTrgWords + bi * oldTrgWords + tid;
size_t bidx = sid * oldSize * oldTrgWords + bi * oldTrgWords + tid; // [sid, bi, tid]
size_t idx = sid * dimBatch * trgWords + i * trgWords + tid;
aligns[idx] = guidedAlignment_[bidx];
}
@ -432,9 +444,9 @@ public:
// this needs to be split along the batch dimension
// which is here the innermost dimension.
// Should work for sentence-based weights, too.
for(size_t j = 0; j < width; ++j) {
for(size_t i = 0; i < split->size(); ++i) {
ws[j * split->size() + i] = dataWeights_[j * oldSize + i + pos];
for(size_t s = 0; s < width; ++s) {
for(size_t b = 0; b < split->size(); ++b) {
ws[s * split->size() + b] = dataWeights_[s * oldSize + b + pos]; // @TODO: use locate() as well
}
}
split->setDataWeights(ws);
@ -445,9 +457,13 @@ public:
return splits;
}
std::vector<float>& getGuidedAlignment() { return guidedAlignment_; }
const std::vector<float>& getGuidedAlignment() const { return guidedAlignment_; } // [dimSrcWords, dimBatch, dimTrgWords] flattened
void setGuidedAlignment(std::vector<float>&& aln) override {
guidedAlignment_ = std::move(aln);
guidedAlignment_ = std::move(aln);
}
size_t locateInGuidedAlignments(size_t b, size_t s, size_t t) {
return ((s * size()) + b) * widthTrg() + t;
}
std::vector<float>& getDataWeights() { return dataWeights_; }
@ -469,15 +485,14 @@ public:
std::cerr << std::endl;
}
size_t b = 0;
size_t subBatchIndex = 0;
for(auto sb : subBatches_) {
std::cerr << "batch " << b++ << ": " << std::endl;
std::cerr << "stream " << subBatchIndex++ << ": " << std::endl;
const auto& vocab = sb->vocab();
for(size_t i = 0; i < sb->batchWidth(); i++) {
for(size_t s = 0; s < sb->batchWidth(); s++) {
std::cerr << "\t w: ";
for(size_t j = 0; j < sb->batchSize(); j++) {
size_t idx = i * sb->batchSize() + j;
Word w = sb->data()[idx];
for(size_t b = 0; b < sb->batchSize(); b++) {
Word w = sb->data()[sb->locate(/*batchIdx=*/b, /*wordPos=*/s)]; // s * sb->batchSize() + b;
if (vocab && !printIndices)
std::cerr << (*vocab)[w] << " ";
else


@ -400,7 +400,7 @@ std::string FactoredVocab::word2string(Word word) const {
res.append("?");
}
else
res.append(factorVocab_[(WordIndex)(index + groupRanges_[g].first)]);
res.append(getFactorName(g, index));
}
return res;
}
@ -431,6 +431,21 @@ Word FactoredVocab::string2word(const std::string& w) const {
return word;
}
// does a specific factor exist in the vocabulary
// Factor name must be given without separator. This function cannot be used for lemmas.
bool FactoredVocab::tryGetFactor(const std::string& factorName, size_t& groupIndex, size_t& factorIndex) const {
WordIndex u;
if (factorVocab_.tryFind(factorSeparator_ + factorName, u))
{
groupIndex = factorGroups_[u];
ABORT_IF(u < groupRanges_[groupIndex].first || u >= groupRanges_[groupIndex].second, "Invalid factorGroups_ entry??");
factorIndex = u - groupRanges_[groupIndex].first;
return true;
}
else
return false;
}
// extract the factor index of a given factor type from the 'Word' representation
size_t FactoredVocab::getFactor(Word word, size_t groupIndex) const {
size_t index = word.toWordIndex();
@ -565,12 +580,18 @@ void FactoredVocab::constructNormalizationInfoForVocab() {
// decode a 'Word' array into the external string representation of that token sequence, as written to output files
/*virtual*/ std::string FactoredVocab::decode(const Words& sentence, bool ignoreEOS /*= true*/) const /*override final*/ {
std::vector<std::string> decoded;
decoded.reserve(sentence.size());
for(auto w : sentence) {
std::vector<std::string> decoded; decoded.reserve(sentence.size());
for(auto w : sentence)
if((w != getEosId() || !ignoreEOS))
decoded.push_back((*this)[w]);
}
return utils::join(decoded, " ");
}
// diagnostics version of decode() that will not fail on partial words, will print EOS, and is a little slower
std::string FactoredVocab::decodeForDiagnostics(const Words& sentence) const {
std::vector<std::string> decoded; decoded.reserve(sentence.size());
for (auto w : sentence)
decoded.push_back(word2string(w));
return utils::join(decoded, " ");
}
@ -740,7 +761,7 @@ Ptr<IVocab> createFactoredVocab(const std::string& vocabPath) {
static std::map<std::string, Ptr<IVocab>> s_cache;
auto iter = s_cache.find(vocabPath);
if (iter != s_cache.end()) {
LOG(info, "[vocab] Reusing existing vocabulary object in memory (vocab size {})", iter->second->size());
LOG_ONCE(info, "[vocab] Reusing existing vocabulary object in memory (vocab size {})", iter->second->size());
return iter->second;
}
auto vocab = New<FactoredVocab>();

src/data/factored_vocab.h Normal file → Executable file

@ -66,6 +66,9 @@ public:
bool canExpandFactoredWord(Word word, size_t groupIndex) const { return lemmaHasFactorGroup(getFactor(word, 0), groupIndex); }
size_t getFactor(Word word, size_t groupIndex) const;
bool lemmaHasFactorGroup(size_t factor0Index, size_t g) const { return lemmaHasFactorGroup_[factor0Index][g]; }
const std::string& getFactorGroupPrefix(size_t groupIndex) const { return groupPrefixes_[groupIndex]; } // for diagnostics only
const std::string& getFactorName(size_t groupIndex, size_t factorIndex) const { return factorVocab_[(WordIndex)(factorIndex + groupRanges_[groupIndex].first)]; }
std::string decodeForDiagnostics(const Words& sentence) const;
static constexpr size_t FACTOR_NOT_APPLICABLE = (SIZE_MAX - 1);
static constexpr size_t FACTOR_NOT_SPECIFIED = (SIZE_MAX - 2);
@ -74,6 +77,17 @@ public:
static Ptr<FactoredVocab> tryCreateAndLoad(const std::string& path); // load from "vocab" option if it specifies a factored vocab
std::string word2string(Word word) const;
Word string2word(const std::string& w) const;
bool tryGetFactor(const std::string& factorGroupName, size_t& groupIndex, size_t& factorIndex) const; // note: factorGroupName given without separator
// some hard-coded constants from FactoredSegmenter
// The naming mimics the names in FactoredSegmenter.cs, and therefore intentionally does not follow Marian conventions.
// @TODO: We have more hard-coded constants throughout the code. Move them all here.
// @TODO: figure out how to do this with static const*/constexpr
#define FactoredVocab_INLINE_FIX_WHAT_serialized "is"
#define FactoredVocab_FIX_SRC_ID_TAG "<IOPEN>"
#define FactoredVocab_FIX_TGT_ID_TAG "<IDELIM>"
#define FactoredVocab_FIX_END_ID_TAG "<ICLOSE>"
private:
void constructGroupInfoFromFactorVocab();
void constructFactorIndexConversion();

src/data/vocab.cpp Normal file → Executable file

src/data/vocab_base.h Normal file → Executable file


@ -284,11 +284,6 @@ Expr stopGradient(Expr a) {
return res;
}
Expr constant_like(Expr a, const Ptr<inits::NodeInitializer>& init) {
auto graph = a->graph();
return graph->constant(a->shape(), init, a->value_type());
}
// gather() -- gather arbitrary elements along an axis; batched or non-batched
Expr gather(Expr a, int axis, Expr indices) {
return Expression<GatherNodeOp>(a, axis, indices);


@ -141,7 +141,17 @@ Expr atleast_4d(Expr a);
Expr atleast_nd(Expr a, size_t dims);
// create a constant of shape a->shape() and initialize with init
Expr constant_like(Expr a, const Ptr<inits::NodeInitializer>& init);
// @TODO: add a && version, to avoid a ref count. NodeInitializers are typically temps.
// @TODO: and/or make this a template on init
static inline Expr constant_like(Expr a, const Ptr<inits::NodeInitializer>& init) {
return a->graph()->constant(a->shape(), init, a->value_type());
}
// short-cut to init from std::vector, since we do this so often
template<typename ElementType>
Expr constant_like(Expr a, const std::vector<ElementType>& v) { return constant_like(a, inits::fromVector(std::move(v))); }
template<typename ElementType>
Expr constant_like(Expr a, std::vector<ElementType>&& v) { return constant_like(a, inits::fromVector(v)); }
Expr flatten(Expr a);
Expr flatten_2d(Expr a);


@ -145,10 +145,20 @@ Ptr<NodeInitializer> fromVector(const std::vector<T>& v) {
return fromLambda([v](Tensor t) { t->set(v.data(), v.data() + v.size()); }, typeId<T>());
}
template <typename T>
Ptr<NodeInitializer> fromVector(std::vector<T>&& v) {
return fromLambda([v](Tensor t) { t->set(v.data(), v.data() + v.size()); }, typeId<T>());
}
template Ptr<NodeInitializer> fromVector<float16>(const std::vector<float16>& v);
template Ptr<NodeInitializer> fromVector<float>(const std::vector<float>& v);
template Ptr<NodeInitializer> fromVector<IndexType>(const std::vector<IndexType>& v);
// @TODO: can we remove the const& ones above? They always make a copy anyways, and often from a temp
template Ptr<NodeInitializer> fromVector<float16> (std::vector<float16> && v);
template Ptr<NodeInitializer> fromVector<float> (std::vector<float> && v);
template Ptr<NodeInitializer> fromVector<IndexType>(std::vector<IndexType>&& v);
Ptr<NodeInitializer> fromSparseVector(std::pair<std::vector<size_t>, std::vector<float>>& v) {
return fromLambda([v](Tensor t) { t->set(1e-6); t->setSparse(v.first, v.second); });
}

src/graph/node_initializers.h Normal file → Executable file

@ -143,6 +143,8 @@ Ptr<NodeInitializer> gumbel(float eps = 1e-5f);
// @TODO: add documentation
template <typename T>
Ptr<NodeInitializer> fromVector(const std::vector<T>& v);
template <typename T>
Ptr<NodeInitializer> fromVector(std::vector<T>&& v);
// @TODO: add documentation
Ptr<NodeInitializer> fromSparseVector(std::pair<std::vector<size_t>, std::vector<float>>& v);

src/graph/node_operators_binary.h Normal file → Executable file

@ -859,7 +859,9 @@ struct MinimumNodeOp : public ElementBinaryNodeOp {
struct CmpNodeOp : public ElementBinaryNodeOp {
CmpNodeOp(Expr a, Expr b, int cmp_, bool not_) : ElementBinaryNodeOp(a, b), cmp_(cmp_), not_(not_) {
setTrainable(false); // has no gradient
//setTrainable(false); // has no gradient
// Note: ^^ Disabled because it currently causes Marian to choke, for unknown reasons.
// Not setting this will not change the result since the vector of gradient functions is empty.
}
NodeOps forwardOps() override {


@ -4,7 +4,8 @@
#include "layers/constructors.h"
#include "layers/loss.h"
#include "data/factored_vocab.h"
#include "rnn/types.h" // for State::select()
#include "rnn/types.h" // for State::select()
#include "models/states.h" // for EncoderState
//using std::size_t; // not sure why this is needed
@ -219,7 +220,7 @@ namespace marian {
factoredVocab_ = FactoredVocab::tryCreateAndLoad(options_->get<std::string>("vocab", ""));
if (factoredVocab_) {
numOutputClasses = (int)factoredVocab_->factorVocabSize();
LOG(info, "[embedding] Factored outputs enabled");
LOG_ONCE(info, "[embedding] Factored outputs enabled");
}
if(tiedParam_) {
@ -237,10 +238,10 @@ namespace marian {
/*const*/ int lemmaDimEmb = options_->get<int>("lemma-dim-emb", 0);
ABORT_IF(lemmaDimEmb && !factoredVocab_, "--lemma-dim-emb requires a factored vocabulary");
if (lemmaDimEmb > 0) {
if (lemmaDimEmb > 0) { // > 0 means to embed the (expected) word with a different embedding matrix
#define HARDMAX_HACK
#ifdef HARDMAX_HACK
lemmaDimEmb = lemmaDimEmb & 0xfffffffe;
lemmaDimEmb = lemmaDimEmb & 0xfffffffe; // hack to select hard-max: use an odd number
#endif
auto range = factoredVocab_->getGroupRange(0);
auto lemmaVocabDim = (int)(range.second - range.first);
@ -263,8 +264,9 @@ namespace marian {
// project each factor separately
auto numGroups = factoredVocab_->getNumGroups();
std::vector<Ptr<RationalLoss>> allLogits(numGroups, nullptr); // (note: null entries for absent factors)
Expr input1 = input;
Expr Plemma = nullptr;
Expr input1 = input; // [B... x D]
Expr Plemma = nullptr; // used for lemmaDimEmb=-1
Expr inputLemma = nullptr; // used for lemmaDimEmb=-2, -3
for (size_t g = 0; g < numGroups; g++) {
auto range = factoredVocab_->getGroupRange(g);
if (g > 0 && range.first == range.second) // empty entry
@ -280,6 +282,52 @@ namespace marian {
factorWt = slice(Wt_, isLegacyUntransposedW ? -1 : 0, Slice((int)range.first, (int)range.second));
factorB = slice(b_, -1, Slice((int)range.first, (int)range.second));
}
/*const*/ int lemmaDimEmb = options_->get<int>("lemma-dim-emb", 0);
if ((lemmaDimEmb == -2 || lemmaDimEmb == -3) && g > 0) { // -2/-3 means a gated transformer-like structure (-3 = hard-max)
LOG_ONCE(info, "[embedding] using lemma conditioning with gate");
// this mimics one transformer layer
// - attention over two inputs:
// - e = current lemma. We use the original embedding vector; specifically, expectation over all lemmas.
// - input = hidden state FF(h_enc+h_dec)
// - dot-prod attention to allow both sides to influence (unlike our recurrent self-attention)
// - multi-head to allow for multiple conditions to be modeled
// - add & norm, for gradient flow and scaling
// - FF layer --this is expensive; it is per-factor
// multi-head attention
int inputDim = input->shape()[-1];
int heads = 8;
auto name = options_->get<std::string>("prefix") + "_factor" + std::to_string(g);
auto Wq = graph_->param(name + "_Wq", { inputDim, inputDim }, inits::glorotUniform());
auto Wk = graph_->param(name + "_Wk", { inputDim, inputDim }, inits::glorotUniform());
auto Wv = graph_->param(name + "_Wv", { inputDim, inputDim }, inits::glorotUniform());
auto toMultiHead = [&](Expr x, int heads) {
const auto& shape = x->shape();
int inputDim = shape[-1];
int otherDim = shape.elements() / inputDim;
ABORT_IF(inputDim / heads * heads != inputDim, "inputDim ({}) must be multiple of number of heads ({})", inputDim, heads);
return reshape(x, { otherDim, heads, 1, inputDim / heads });
};
input1 = inputLemma;
auto qm = toMultiHead(dot(input1, Wq), heads); // [B... x H x D/H] projected query
auto kdm = toMultiHead(dot(input1 - input, Wk), heads); // [B... x H x D/H] the two data vectors projected as keys. Use diff and sigmoid, instead of softmax.
auto vem = toMultiHead(dot(input1, Wv), heads); // [B... x H x D/H] one of the two data vectors projected as values
auto vim = toMultiHead(dot( input, Wv), heads); // [B... x H x D/H] the other
auto zm = bdot(qm, kdm, false, true); // [B... x H x 1]
auto sm = sigmoid(zm); // [B... x H x 1]
auto rm = sm * (vem - vim) + vim; // [B... x H x D/H]
auto r = reshape(rm, input->shape()); // [B... x D]
// add & norm
input1 = r + input1;
input1 = layerNorm(input1, name + "_att");
// FF layer
auto ffnDropProb = 0.1f; // @TODO: get as a parameter
auto ffnDim = inputDim * 2; // @TODO: get as a parameter
auto f = denseInline(input1, name + "_ffn", /*suffix=*/"1", ffnDim, (ActivationFunction*)relu, ffnDropProb);
f = denseInline(f, name + "_ffn", /*suffix=*/"2", inputDim);
// add & norm
input1 = f + input1;
input1 = layerNorm(input1, name + "_ffn");
}
// @TODO: b_ should be a vector, not a matrix; but shotlists use cols() in, which requires a matrix
auto factorLogits = affine(input1, factorWt, factorB, false, /*transB=*/isLegacyUntransposedW ? false : true, /*scale=*/1.0f); // [B... x U] factor logits
// optionally add lemma-dependent bias
@ -294,15 +342,28 @@ namespace marian {
allLogits[g] = New<RationalLoss>(factorLogits, nullptr);
// optionally add a soft embedding of lemma back to create some lemma dependency
// @TODO: if this works, move it into lazyConstruct
/*const*/ int lemmaDimEmb = options_->get<int>("lemma-dim-emb", 0);
if (lemmaDimEmb < 0 && g == 0) {
ABORT_IF(shortlist_ && lemmaDimEmb != 0, "Lemma-dependent bias with short list is not yet implemented");
if (lemmaDimEmb == -2 && g == 0) { // -2 means a gated transformer-like structure
LOG_ONCE(info, "[embedding] using lemma conditioning with gate, soft-max version");
// get expected lemma embedding vector
auto factorLogSoftmax = logsoftmax(factorLogits); // [B... x U] note: with shortlist, this is not the full lemma set
auto factorSoftmax = exp(factorLogSoftmax);
inputLemma = dot(factorSoftmax, factorWt, false, /*transB=*/isLegacyUntransposedW ? true : false); // [B... x D]
}
else if (lemmaDimEmb == -3 && g == 0) { // same as -2 except with hard max
LOG_ONCE(info, "[embedding] using lemma conditioning with gate, hard-max version");
// get max-lemma embedding vector
auto maxVal = max(factorLogits, -1); // [B... x U] note: with shortlist, this is not the full lemma set
auto factorHardmax = eq(factorLogits, maxVal);
inputLemma = dot(factorHardmax, factorWt, false, /*transB=*/isLegacyUntransposedW ? true : false); // [B... x D]
}
else if (lemmaDimEmb == -1 && g == 0) { // -1 means learn a lemma-dependent bias
ABORT_IF(shortlist_, "Lemma-dependent bias with short list is not yet implemented");
LOG_ONCE(info, "[embedding] using lemma-dependent bias");
auto factorLogSoftmax = logsoftmax(factorLogits); // (we do that again later, CSE will kick in)
auto z = /*stopGradient*/(factorLogSoftmax);
Plemma = exp(z); // [B... x U]
}
if (lemmaDimEmb > 0 && g == 0) {
else if (lemmaDimEmb > 0 && g == 0) { // > 0 means learn a re-embedding matrix
LOG_ONCE(info, "[embedding] enabled re-embedding of lemma, at dim {}", lemmaDimEmb);
// compute softmax. We compute logsoftmax() separately because this way, computation will be reused later via CSE
auto factorLogSoftmax = logsoftmax(factorLogits);
@ -349,7 +410,7 @@ namespace marian {
factoredVocab_ = FactoredVocab::tryCreateAndLoad(options_->get<std::string>("vocab", ""));
if (factoredVocab_) {
dimVoc = (int)factoredVocab_->factorVocabSize();
LOG(info, "[embedding] Factored embeddings enabled");
LOG_ONCE(info, "[embedding] Factored embeddings enabled");
}
// Embedding layer initialization should depend only on embedding size, hence fanIn=false
@ -389,7 +450,7 @@ namespace marian {
auto graph = E_->graph();
int dimBatch = (int)subBatch->batchSize();
int dimEmb = E_->shape()[-1];
int dimWords = (int)subBatch->batchWidth();
int dimWidth = (int)subBatch->batchWidth();
// factored embeddings:
// - regular:
@ -419,9 +480,16 @@ namespace marian {
// - but forward pass weighs them down, so that all factors are in a similar numeric range
// - if it is required to be in a different range, the embeddings can still learn that, but more slowly
auto batchEmbeddings = apply(subBatch->data(), {dimWords, dimBatch, dimEmb});
auto batchMask = graph->constant({dimWords, dimBatch, 1},
auto batchEmbeddings = apply(subBatch->data(), {dimWidth, dimBatch, dimEmb});
#if 0
auto batchMask = graph->constant({dimWidth, dimBatch, 1},
inits::fromVector(subBatch->mask()));
#else
// experimental: hide inline-fix source tokens from cross attention
auto batchMask = graph->constant({dimWidth, dimBatch, 1},
inits::fromVector(subBatch->crossMaskWithInlineFixSourceSuppressed()));
#endif
return std::make_tuple(batchEmbeddings, batchMask);
}


@ -412,4 +412,32 @@ public:
ABORT("not implemented"); // @TODO: implement me
}
};
// --- a few layers with built-in parameters created on the fly, without proper object
// @TODO: change to a proper layer object
// like affine() but with built-in parameters, activation, and dropout
static inline
Expr denseInline(Expr x, std::string prefix, std::string suffix, int outDim, const std::function<Expr(Expr)>& actFn = nullptr, float dropProb = 0.0f)
{
auto graph = x->graph();
auto W = graph->param(prefix + "_W" + suffix, { x->shape()[-1], outDim }, inits::glorotUniform());
auto b = graph->param(prefix + "_b" + suffix, { 1, outDim }, inits::zeros());
x = affine(x, W, b);
if (actFn)
x = actFn(x);
x = dropout(x, dropProb);
return x;
}
static inline
Expr layerNorm(Expr x, std::string prefix, std::string suffix = std::string()) {
int dimModel = x->shape()[-1];
auto scale = x->graph()->param(prefix + "_ln_scale" + suffix, { 1, dimModel }, inits::ones());
auto bias = x->graph()->param(prefix + "_ln_bias" + suffix, { 1, dimModel }, inits::zeros());
return marian::layerNorm(x, scale, bias, 1e-6f);
}
} // namespace marian
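A hypothetical use of the two helpers above, mirroring how `generic.cpp` wires them into the lemma-conditioning block (prefix, dimensions, and dropout rate are illustrative):

```cpp
// residual feed-forward block built from denseInline() and layerNorm()
static Expr feedForwardBlock(Expr x, const std::string& prefix) {
  int dimModel = x->shape()[-1];
  auto f = denseInline(x, prefix + "_ffn", /*suffix=*/"1", dimModel * 2,
                       (ActivationFunction*)relu, /*dropProb=*/0.1f);
  f = denseInline(f, prefix + "_ffn", /*suffix=*/"2", dimModel);
  return layerNorm(f + x, prefix + "_ffn"); // add & norm
}
```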


@ -18,29 +18,36 @@ static inline RationalLoss guidedAlignmentCost(Ptr<ExpressionGraph> graph,
Expr alignmentLoss; // sum up loss over all attention/alignment positions
size_t numLabels;
if(guidedLossType == "ce") {
// ce requires normalized probabilities
// normalizedAlignment is multi-hot, but ce requires normalized probabilities, so need to normalize to P(s|t)
auto dimBatch = shape[-2];
auto dimTrgWords = shape[-1];
auto dimSrcWords = shape[-3];
ABORT_IF(shape[-4] != 1, "Guided alignments with beam??");
auto normalizedAlignment = batch->getGuidedAlignment();
auto normalizedAlignment = batch->getGuidedAlignment(); // [dimSrcWords, dimBatch, dimTrgWords] flattened, matches shape of 'attention'
auto srcBatch = batch->front();
const auto& srcMask = srcBatch->mask();
ABORT_IF(shape.elements() != normalizedAlignment.size(), "Attention-matrix and alignment shapes differ??");
ABORT_IF(dimBatch != batch->size() || dimTrgWords != batch->widthTrg() || dimSrcWords != batch->width(), "Attention-matrix and batch shapes differ??");
auto locate = [=](size_t s, size_t b, size_t t) { return ((s * dimBatch) + b) * dimTrgWords + t; };
for (size_t b = 0; b < dimBatch; b++) {
for (size_t t = 0; t < dimTrgWords; t++) {
for (size_t s = 0; s < dimSrcWords; s++)
ABORT_IF(locate(s, b, t) != batch->locateInGuidedAlignments(b, s, t), "locate() and locateInGuidedAlignments() differ??");
// renormalize the alignment such that it sums up to 1
float sum = 0;
for (size_t s = 0; s < dimSrcWords; s++)
sum += normalizedAlignment[locate(s, b, t)];
sum += srcMask[srcBatch->locate(b, s)] * normalizedAlignment[locate(s, b, t)]; // these values are 0 or 1
if (sum != 0 && sum != 1)
for (size_t s = 0; s < dimSrcWords; s++)
normalizedAlignment[locate(s, b, t)] /= sum;
}
}
auto alignment = constant_like(attention, inits::fromVector(normalizedAlignment));
auto alignment = constant_like(attention, std::move(normalizedAlignment));
alignmentLoss = -sum(flatten(alignment * log(attention + epsilon)));
numLabels = batch->back()->batchWords();
ABORT_IF(numLabels > shape.elements() / shape[-3], "Num labels of guided alignment cost is off??");
} else {
auto alignment = constant_like(attention, inits::fromVector(batch->getGuidedAlignment()));
auto alignment = constant_like(attention, batch->getGuidedAlignment());
if(guidedLossType == "mse")
alignmentLoss = sum(flatten(square(attention - alignment))) / 2.f;
else if(guidedLossType == "mult") // @TODO: I don't know what this criterion is for. Can we remove it?
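A minimal standalone sketch of the renormalization added above (values are hypothetical): for each target position, the multi-hot source alignments are masked by the source mask and rescaled so they sum to 1.

```cpp
#include <vector>

// alignments for one (batch entry, target position) over all source positions
void renormalize(std::vector<float>& align, const std::vector<float>& srcMask) {
  float sum = 0.f;
  for (size_t s = 0; s < align.size(); ++s)
    sum += srcMask[s] * align[s];   // masked-out source positions contribute nothing
  if (sum != 0.f && sum != 1.f)     // e.g. {1, 1, 0} with full mask -> {0.5, 0.5, 0}
    for (auto& a : align)
      a /= sum;
}
```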


@ -206,7 +206,7 @@ private:
virtual Expr accumulateLoss(const RationalLoss& current) override {
if(loss_) {
const auto& first = partialLosses_.front();
return loss_ + first.count() * (current.loss() / current.count()); // scale up/down to match scale of first loss
return loss_ + current.loss() * first.count() / current.count(); // scale up/down to match scale of first loss
} else {
return current.loss(); // first reference loss, keeps to scale with this one
}
@ -344,8 +344,8 @@ protected:
// for bert training or classification the time dimension is lost.
// Here safeguard against 2d classifier output, adds 1 on the left, non-op.
Expr ce = cast(cross_entropy(logits, indices), Type::float32);
if (inFactor) {
LOG_ONCE("scaling factor losses with weight {}", factorWeight_);
if (inFactor && factorWeight_ != 1.0f) {
LOG_ONCE(info, "scaling factor losses with weight {}", factorWeight_);
ce = ce * factorWeight_;
}
if (labelSmoothing_ > 0) {

src/models/states.h Normal file → Executable file

@ -9,7 +9,7 @@ namespace marian {
class EncoderState {
private:
Expr context_;
Expr mask_;
Expr mask_; // [beam depth=1, max length, batch size, vector dim=1] source mask
Ptr<data::CorpusBatch> batch_;
public:
@ -18,9 +18,9 @@ public:
EncoderState() {}
virtual Expr getContext() { return context_; }
virtual Expr getAttended() { return context_; }
virtual Expr getMask() { return mask_; }
virtual Expr getContext() const { return context_; }
virtual Expr getAttended() const { return context_; }
virtual Expr getMask() const { return mask_; } // source batch mask; may have additional positions suppressed
virtual const Words& getSourceWords() {
return batch_->front()->data();


@ -142,29 +142,6 @@ public:
return reshape(output, {dimBeam, dimBatch, dimSteps, dimModel});
}
// like affine() but with built-in parameters, activation, and dropout
static inline
Expr dense(Expr x, std::string prefix, std::string suffix, int outDim, const std::function<Expr(Expr)>& actFn = nullptr, float dropProb = 0.0f)
{
auto graph = x->graph();
auto W = graph->param(prefix + "_W" + suffix, { x->shape()[-1], outDim }, inits::glorotUniform());
auto b = graph->param(prefix + "_b" + suffix, { 1, outDim }, inits::zeros());
x = affine(x, W, b);
if (actFn)
x = actFn(x);
x = dropout(x, dropProb);
return x;
}
Expr layerNorm(Expr x, std::string prefix, std::string suffix = std::string()) const {
int dimModel = x->shape()[-1];
auto scale = graph_->param(prefix + "_ln_scale" + suffix, { 1, dimModel }, inits::ones());
auto bias = graph_->param(prefix + "_ln_bias" + suffix, { 1, dimModel }, inits::zeros());
return marian::layerNorm(x, scale, bias, 1e-6f);
}
Expr preProcess(std::string prefix, std::string ops, Expr input, float dropProb = 0.0f) const {
auto output = input;
for(auto op : ops) {
@ -192,7 +169,7 @@ public:
// highway connection
else if(op == 'h') {
int dimModel = input->shape()[-1];
auto t = dense(prevInput, prefix, /*suffix=*/"h", dimModel);
auto t = denseInline(prevInput, prefix, /*suffix=*/"h", dimModel);
output = highway(output, prevInput, t);
}
// layer normalization
@ -402,8 +379,8 @@ public:
// the stack of FF layers
for(int i = 1; i < depthFfn; ++i)
output = dense(output, prefix, /*suffix=*/std::to_string(i), dimFfn, actFn, ffnDropProb);
output = dense(output, prefix, /*suffix=*/std::to_string(depthFfn), dimModel);
output = denseInline(output, prefix, /*suffix=*/std::to_string(i), dimFfn, actFn, ffnDropProb);
output = denseInline(output, prefix, /*suffix=*/std::to_string(depthFfn), dimModel);
auto opsPost = opt<std::string>("transformer-postprocess");
output
@ -430,14 +407,14 @@ public:
// the stack of AAN layers
for(int i = 1; i < depthAan; ++i)
y = dense(y, prefix, /*suffix=*/std::to_string(i), dimAan, actFn, aanDropProb);
y = denseInline(y, prefix, /*suffix=*/std::to_string(i), dimAan, actFn, aanDropProb);
if(y->shape()[-1] != dimModel) // bring it back to the desired dimension if needed
y = dense(y, prefix, std::to_string(depthAan), dimModel);
y = denseInline(y, prefix, std::to_string(depthAan), dimModel);
bool noGate = opt<bool>("transformer-aan-nogate");
if(!noGate) {
auto gi = dense(x, prefix, /*suffix=*/"i", dimModel, (ActivationFunction*)sigmoid);
auto gf = dense(y, prefix, /*suffix=*/"f", dimModel, (ActivationFunction*)sigmoid);
auto gi = denseInline(x, prefix, /*suffix=*/"i", dimModel, (ActivationFunction*)sigmoid);
auto gf = denseInline(y, prefix, /*suffix=*/"f", dimModel, (ActivationFunction*)sigmoid);
y = gi * x + gf * y;
}
@ -533,28 +510,29 @@ public:
batchEmbeddings = addSpecialEmbeddings(batchEmbeddings, /*start=*/0, batch);
// reorganize batch and timestep
batchEmbeddings = atleast_nd(batchEmbeddings, 4);
batchMask = atleast_nd(batchMask, 4);
auto layer = transposeTimeBatch(batchEmbeddings); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim]
auto layerMask
= reshape(transposeTimeBatch(batchMask), {1, dimBatch, 1, dimSrcWords}); // [-4: beam depth=1, -3: batch size, -2: vector dim=1, -1: max length]
batchEmbeddings = atleast_nd(batchEmbeddings, 4); // [beam depth=1, max length, batch size, vector dim]
batchMask = atleast_nd(batchMask, 4); // [beam depth=1, max length, batch size, vector dim=1]
auto layer = transposeTimeBatch(batchEmbeddings); // [beam depth=1, batch size, max length, vector dim]
auto layerMask = transposeTimeBatch(batchMask); // [beam depth=1, batch size, max length, vector dim=1]
auto opsEmb = opt<std::string>("transformer-postprocess-emb");
float dropProb = inference_ ? 0 : opt<float>("transformer-dropout");
layer = preProcess(prefix_ + "_emb", opsEmb, layer, dropProb);
layerMask = transposedLogMask(layerMask); // [-4: batch size, -3: 1, -2: vector dim=1, -1: max length]
// LayerAttention expects mask in a different layout
layerMask = reshape(layerMask, {1, dimBatch, 1, dimSrcWords}); // [1, batch size, 1, max length]
layerMask = transposedLogMask(layerMask); // [batch size, num heads broadcast=1, max length broadcast=1, max length]
// apply encoder layers
// This is the Transformer Encoder stack.
auto encDepth = opt<int>("enc-depth");
for(int i = 1; i <= encDepth; ++i) {
layer = LayerAttention(prefix_ + "_l" + std::to_string(i) + "_self",
layer, // query
layer, // keys
layer, // values
layerMask);
layerMask); // [batch size, num heads broadcast=1, max length broadcast=1, max length]
layer = LayerFFN(prefix_ + "_l" + std::to_string(i) + "_ffn", layer);
}
@ -698,12 +676,14 @@ public:
std::vector<Expr> encoderContexts;
std::vector<Expr> encoderMasks;
for(auto encoderState : state->getEncoderStates()) {
auto encoderContext = encoderState->getContext();
auto encoderMask = encoderState->getMask();
auto encoderContext = encoderState->getContext(); // encoder output
auto encoderMask = encoderState->getMask(); // note: may differ from Encoder self-attention mask in that additional positions are banned for cross-attention
encoderMask = atleast_nd(encoderMask, 4);
encoderContext = transposeTimeBatch(encoderContext); // [beam depth=1, batch size, max length, vector dim]
encoderMask = transposeTimeBatch(encoderMask); // [beam depth=1, max length, batch size, vector dim=1]
encoderContext = transposeTimeBatch(encoderContext); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim]
int dimSrcWords = encoderContext->shape()[-2];
// This would happen if something goes wrong during batch pruning.
@ -712,10 +692,9 @@ public:
encoderContext->shape()[-3],
dimBatch);
encoderMask = atleast_nd(encoderMask, 4);
encoderMask = reshape(transposeTimeBatch(encoderMask),
{1, dimBatch, 1, dimSrcWords});
encoderMask = transposedLogMask(encoderMask);
// LayerAttention expects mask in a different layout
encoderMask = reshape(encoderMask, { 1, dimBatch, 1, dimSrcWords }); // [1, batch size, 1, max length]
encoderMask = transposedLogMask(encoderMask); // [batch size, num heads broadcast=1, max length broadcast=1, max length]
if(dimBeam > 1)
encoderMask = repeat(encoderMask, dimBeam, /*axis=*/ -4);

src/optimizers/optimizers.cpp Normal file → Executable file

src/tensors/gpu/tensor_operators.cu Normal file → Executable file

@ -980,7 +980,7 @@ __global__ void gPasteRows(T* out,
const IndexType* targetRowIdx,
size_t rows) {
for(int bid = 0; bid < rows; bid += gridDim.x) {
int j = bid + blockIdx.x;
int j = bid + blockIdx.x; // index into 'indices' vector
if(j < rows) {
size_t dstId = targetRowIdx[j];
size_t srcId = j;
@ -988,11 +988,15 @@ __global__ void gPasteRows(T* out,
T* rowOut = out + dstId * cols;
const T* rowIn = in + srcId * cols;
// aggregate the entire row
for(int tid = 0; tid < cols; tid += blockDim.x) {
int i = tid + threadIdx.x;
int i = tid + threadIdx.x; // column index --@TODO: column index should be called 'j'
if(i < cols) {
// @TODO: Do we need to get rid of this atomic add? It seems slow for fp16
atomics::atomicAdd(rowOut + i, rowIn[i]);
// Note: atomicAdd() not needed if number of blocks is 1. Avoid it because it is slow for fp16.
if (gridDim.x == 1)
rowOut[i] += rowIn[i];
else
atomics::atomicAdd(rowOut + i, rowIn[i]);
}
}
}
@ -1011,7 +1015,15 @@ void PasteRows(Tensor out,
size_t rowsToCopy = indices->size();
int threads = std::min(MAX_THREADS, (int)cols);
#if 1 // @TODO: make this configurable with a 'deterministic' flag
// If we only use one block, then each core operates on a different column,
// hence the summation becomes deterministic.
// However, we only use e.g. 512 cores out of possibly 3000+, so this will be
// 6 x slower in this example.
int blocks = 1;
#else
int blocks = std::min(MAX_BLOCKS, (int)rowsToCopy);
#endif
if(out->type() == Type::float32) {
gPasteRows<<<blocks, threads>>>(


@ -401,15 +401,20 @@ void SyncGraphGroup::update(std::vector<Ptr<data::Batch>> subBatches, size_t num
paramsAvg_[idx], curParam, scheduler_->numberOfBatches(), updateTrgWords);
};
comm_->scatterReduceAndResetGrads(); // reduce gradients across all devices (globally) into shards
comm_->foreach(update); // per-shard model-update
comm_->allGatherParams(); // distribute param value shards back
// cost across all local devices (scheduler will aggregate cross-process)
StaticLoss localLoss;
for(auto& l : localDeviceLosses) // localDeviceLosses is already summed up over delay steps
localLoss += l;
// model update
if (std::isfinite(localLoss.loss) || mpi_->numMPIProcesses() > 1) { // guard against NaN (except with MPI, as this simple way could hang it)
comm_->scatterReduceAndResetGrads(); // reduce gradients across all devices and MPI nodes into shards
comm_->foreach(update); // per-shard model-update
comm_->allGatherParams(); // distribute param value shards back
}
else
LOG(info, "[training] skipping {}-th update due to loss being {}", scheduler_->numberOfBatches(), localLoss.loss);
if(scheduler_) {
// track and log localLoss
scheduler_->update(localLoss, numReadBatches, effectiveBatchSize, effectiveBatchTrgWords, mpi_);

src/training/scheduler.h Normal file → Executable file

@ -358,7 +358,7 @@ public:
&& heartBeatTimer_.elapsed<std::chrono::minutes>() >= 10) {
printf("PROGRESS: %.2f%%\nEVALERR: %.7f%%\n",
(double)state_->epochs,
state_->costSum / state_->costCount / (mpi ? mpi->numMPIProcesses() : 1));
state_->costSum / (state_->costCount ? state_->costCount : 1) / (mpi ? mpi->numMPIProcesses() : 1));
fflush(stdout);
std::cout << "MBSIZE: " << batchLabels << " after " << state_->batches << " updates = " << state_->labelsTotal << " labels" << std::endl << std::flush;
heartBeatTimer_.start();


@ -99,12 +99,17 @@ public:
// starting with the lemma, then adding factors one by one.
if (factorGroup == 0) {
word = factoredVocab->lemma2Word(shortlist ? shortlist->reverseMap(wordIdx) : wordIdx); // @BUGBUG: reverseMap is only correct if factoredVocab_->getGroupRange(0).first == 0
//std::vector<size_t> factorIndices; factoredVocab->word2factors(word, factorIndices);
//LOG(info, "new lemma {},{}={} -> {}->{}", word.toWordIndex(), factorIndices[0], factoredVocab->word2string(word), prevHyp->getPathScore(), pathScore);
std::vector<size_t> factorIndices; factoredVocab->word2factors(word, factorIndices);
//LOG(info, "{} + {} ({}) -> {} -> {}",
// factoredVocab->decode(prevHyp->tracebackWords()),
// factoredVocab->word2string(word), factorIndices[0], prevHyp->getPathScore(), pathScore);
}
else {
//LOG(info, "expand word {}={} with factor[{}] {} -> {}->{}", beam[beamHypIdx]->getWord().toWordIndex(),
// factoredVocab->word2string(beam[beamHypIdx]->getWord()), factorGroup, wordIdx, prevHyp->getPathScore(), pathScore);
//LOG(info, "{} |{} ({}) = {} ({}) -> {} -> {}",
// factoredVocab->decodeForDiagnostics(beam[beamHypIdx]->tracebackWords()),
// factoredVocab->getFactorGroupPrefix(factorGroup), factorGroup,
// factoredVocab->getFactorName(factorGroup, wordIdx), wordIdx,
// prevHyp->getPathScore(), pathScore);
word = beam[beamHypIdx]->getWord();
ABORT_IF(!factoredVocab->canExpandFactoredWord(word, factorGroup),
"A word without this factor snuck through to here??");