Merge branch 'master' of github.com:moses-smt/mosesdecoder

Hieu Hoang 2015-04-05 16:45:17 +04:00
commit 4cb8a1837e
10 changed files with 95 additions and 34 deletions

View File

@@ -6,7 +6,7 @@
 #include <vector>
 #include <algorithm>
 #include <cstring>
-#include <unordered_set>
+#include <set>
 #include <glib.h>
 #include <stdexcept>
 #include <boost/thread.hpp>
@@ -1557,9 +1557,9 @@ Tokenizer::tokenize(std::istream& is, std::ostream& os)
 {
   std::size_t line_no = 0;
   std::size_t perchunk = chunksize ? chunksize : 2000;
-  std::vector< std::string > lines[nthreads];
-  std::vector< std::string > results[nthreads];
-  boost::thread workers[nthreads];
+  std::vector< std::vector< std::string > > lines(nthreads);
+  std::vector< std::vector< std::string > > results(nthreads);
+  std::vector< boost::thread > workers(nthreads);
   bool done_p = !(is.good() && os.good());
@@ -1589,20 +1589,20 @@ Tokenizer::tokenize(std::istream& is, std::ostream& os)
           results[ithread].resize(line_pos);
           break;
         }
         lines[ithread][line_pos].clear();
       } else if (skip_xml_p &&
                  (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x))) {
         lines[ithread][line_pos].clear();
       } else {
         lines[ithread][line_pos] =
           std::string(SPC_BYTE).append(istr).append(SPC_BYTE);
       }
     }
-    if (line_pos)
+    if (line_pos) {
       workers[ithread] =
         boost::thread(VectorTokenizerCallable(this,lines[ithread],results[ithread]));
+    }
   } // end for loop starting threads
   for (std::size_t ithread = 0; ithread < nthreads; ++ithread) {
@@ -1772,12 +1772,12 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
   std::size_t finilen = 0;
   std::size_t dotslen = 0;
-  static std::size_t SEQ_LIM = 6;
+  const std::size_t SEQ_LIM = 6;
   charclass_t prev_class = empty;
   charclass_t curr_class = empty;
-  charclass_t seq[SEQ_LIM] = { empty };
-  std::size_t pos[SEQ_LIM] = { 0 };
+  std::vector<charclass_t> seq(SEQ_LIM, empty);
+  std::vector<std::size_t> pos(SEQ_LIM, 0);
   std::size_t seqpos = 0;
   GUnicodeType curr_type = G_UNICODE_UNASSIGNED;
@@ -1785,7 +1785,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
   bool curr_word_p = false;
   std::vector<std::size_t> breaks;
-  std::unordered_set<std::size_t> suppress;
+  std::set<std::size_t> suppress;
   for (; icp <= ncp; ++icp) {
     currwc = wchar_t(ucs4[icp]);
@@ -1822,7 +1822,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
     } else if (currwc >= SMAL_HYPH) {
       curr_word_p = true;
     } else {
-      curr_word_p = currwc >= WAVE_DASH && curr_word_p <= KANA_DHYP;
+      curr_word_p = (currwc >= WAVE_DASH) && (currwc <= KANA_DHYP);
     }
     break;
   case G_UNICODE_CLOSE_PUNCTUATION:
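
The tokenizer hunks above replace runtime-sized C arrays of non-POD types (lines[nthreads], workers[nthreads], seq[SEQ_LIM]) with std::vector; such arrays rely on a GCC variable-length-array extension and are rejected by stricter compilers. A minimal standalone sketch of the pattern, not the tokenizer itself; nthreads is a placeholder constant here:

#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

int main() {
  const std::size_t nthreads = 4;  // placeholder; normally a command-line option

  // Per-thread input and output buffers, sized at run time with std::vector
  // instead of the non-standard `std::vector<std::string> lines[nthreads];`.
  std::vector< std::vector< std::string > > lines(nthreads);
  std::vector< std::vector< std::string > > results(nthreads);

  lines[0].push_back("example sentence");
  results[0].push_back("example sentence .");

  std::cout << "buffers per thread: " << lines.size() << std::endl;
  return 0;
}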

View File

@@ -180,7 +180,7 @@ HypergraphHopeFearDecoder::HypergraphHopeFearDecoder
   references_.Load(referenceFiles, vocab_);
   SparseVector weights;
-  wv.ToSparse(&weights);
+  wv.ToSparse(&weights,num_dense_);
   scorer_ = scorer;
   static const string kWeights = "weights";
@@ -243,7 +243,7 @@ void HypergraphHopeFearDecoder::HopeFear(
 {
   size_t sentenceId = *sentenceIdIter_;
   SparseVector weights;
-  wv.ToSparse(&weights);
+  wv.ToSparse(&weights, num_dense_);
   const Graph& graph = *(graphs_[sentenceId]);
   // ValType hope_scale = 1.0;
@@ -338,7 +338,7 @@ void HypergraphHopeFearDecoder::MaxModel(const AvgWeightVector& wv, vector<ValTy
   HgHypothesis bestHypo;
   size_t sentenceId = *sentenceIdIter_;
   SparseVector weights;
-  wv.ToSparse(&weights);
+  wv.ToSparse(&weights, num_dense_);
   vector<ValType> bg(scorer_->NumberOfScores());
   //cerr << "Calculating bleu on " << sentenceId << endl;
   Viterbi(*(graphs_[sentenceId]), weights, 0, references_, sentenceId, bg, &bestHypo);

View File

@@ -77,6 +77,7 @@ unit-test feature_data_test : FeatureDataTest.cpp mert_lib ..//boost_unit_test_framework ;
 unit-test data_test : DataTest.cpp mert_lib ..//boost_unit_test_framework ;
 unit-test forest_rescore_test : ForestRescoreTest.cpp mert_lib ..//boost_unit_test_framework ;
 unit-test hypergraph_test : HypergraphTest.cpp mert_lib ..//boost_unit_test_framework ;
+unit-test mira_feature_vector_test : MiraFeatureVectorTest.cpp mert_lib ..//boost_unit_test_framework ;
 unit-test ngram_test : NgramTest.cpp mert_lib ..//boost_unit_test_framework ;
 unit-test optimizer_factory_test : OptimizerFactoryTest.cpp mert_lib ..//boost_unit_test_framework ;
 unit-test point_test : PointTest.cpp mert_lib ..//boost_unit_test_framework ;

View File

@@ -0,0 +1,49 @@
+#include "MiraFeatureVector.h"
+#include "MiraWeightVector.h"
+
+#define BOOST_TEST_MODULE MiraFeatureVector
+#include <boost/test/unit_test.hpp>
+
+using namespace MosesTuning;
+
+/* Note that the conversion to and from SparseVector needs to know
+how many of the features are really "dense". This is because in hg mira
+all features (sparse and dense) get rolled into SparseVector.
+*/
+
+BOOST_AUTO_TEST_CASE(from_sparse) {
+  SparseVector sp;
+  sp.set("dense0", 0.2);
+  sp.set("dense1", 0.3);
+  sp.set("sparse0", 0.7);
+  sp.set("sparse1", 0.9);
+  sp.set("sparse2", 0.1);
+
+  MiraFeatureVector mfv(sp,2);
+
+  BOOST_CHECK_EQUAL(mfv.size(),5);
+
+  BOOST_CHECK_EQUAL(mfv.feat(0),0);
+  BOOST_CHECK_EQUAL(mfv.feat(1),1);
+  BOOST_CHECK_EQUAL(mfv.feat(2),4);
+  BOOST_CHECK_EQUAL(mfv.feat(3),5);
+  BOOST_CHECK_EQUAL(mfv.feat(4),6);
+
+  BOOST_CHECK_CLOSE(mfv.val(0), 0.2, 1e-5);
+  BOOST_CHECK_CLOSE(mfv.val(1), 0.3, 1e-5);
+  BOOST_CHECK_CLOSE(mfv.val(2), 0.7, 1e-5);
+  BOOST_CHECK_CLOSE(mfv.val(3), 0.9, 1e-5);
+  BOOST_CHECK_CLOSE(mfv.val(4), 0.1, 1e-5);
+
+  MiraWeightVector mwv;
+  mwv.update(mfv,1.0);
+  SparseVector sp2;
+  mwv.ToSparse(&sp2,2);
+
+  // Check we get back what we started with.
+  BOOST_CHECK_CLOSE(sp2.get("dense0"), 0.2, 1e-5);
+  BOOST_CHECK_CLOSE(sp2.get("dense1"), 0.3, 1e-5);
+  BOOST_CHECK_CLOSE(sp2.get("sparse0"), 0.7, 1e-5);
+  BOOST_CHECK_CLOSE(sp2.get("sparse1"), 0.9, 1e-5);
+  BOOST_CHECK_CLOSE(sp2.get("sparse2"), 0.1, 1e-5);
+}

View File

@@ -93,11 +93,17 @@ void MiraWeightVector::update(size_t index, ValType delta)
   m_lastUpdated[index] = m_numUpdates;
 }
 
-void MiraWeightVector::ToSparse(SparseVector* sparse) const
+void MiraWeightVector::ToSparse(SparseVector* sparse, size_t denseSize) const
 {
   for (size_t i = 0; i < m_weights.size(); ++i) {
     if(abs(m_weights[i])>1e-8) {
-      sparse->set(i,m_weights[i]);
+      if (i < denseSize) {
+        sparse->set(i,m_weights[i]);
+      } else {
+        //The ids in MiraFeatureVector/MiraWeightVector for sparse features
+        //need to be translated when converting back to SparseVector.
+        sparse->set(i-denseSize, m_weights[i]);
+      }
     }
   }
 }
@@ -172,12 +178,18 @@ size_t AvgWeightVector::size() const
   return m_wv.m_weights.size();
 }
 
-void AvgWeightVector::ToSparse(SparseVector* sparse) const
+void AvgWeightVector::ToSparse(SparseVector* sparse, size_t denseSize) const
 {
   for (size_t i = 0; i < size(); ++i) {
     ValType w = weight(i);
     if(abs(w)>1e-8) {
-      sparse->set(i,w);
+      if (i < denseSize) {
+        sparse->set(i,w);
+      } else {
+        //The ids in MiraFeatureVector/MiraWeightVector for sparse features
+        //need to be translated when converting back to SparseVector.
+        sparse->set(i-denseSize, w);
+      }
     }
   }
 }
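
The new denseSize parameter lets ToSparse undo the id shift applied to sparse features: internal ids below denseSize are dense and keep their index, while ids at or above denseSize are sparse and are shifted back down by denseSize. A self-contained sketch of that translation, using a plain std::map as a stand-in for Moses' SparseVector and the id layout from the unit test above (dense ids 0-1, sparse ids 4-6, with ids 2-3 unused in that example):

#include <cmath>
#include <cstddef>
#include <iostream>
#include <map>
#include <vector>

typedef double ValType;

int main() {
  const std::size_t denseSize = 2;

  // Internal weight ids: 0-1 dense ("dense0","dense1"), 4-6 sparse
  // ("sparse0".."sparse2" in the test), 2-3 unused and left at zero.
  std::vector<ValType> weights;
  weights.push_back(0.2);  // dense0
  weights.push_back(0.3);  // dense1
  weights.push_back(0.0);  // unused
  weights.push_back(0.0);  // unused
  weights.push_back(0.7);  // sparse0
  weights.push_back(0.9);  // sparse1
  weights.push_back(0.1);  // sparse2

  std::map<std::size_t, ValType> exported;      // stand-in for the SparseVector
  for (std::size_t i = 0; i < weights.size(); ++i) {
    if (std::fabs(weights[i]) > 1e-8) {         // skip (near-)zero weights
      if (i < denseSize) {
        exported[i] = weights[i];               // dense feature: id unchanged
      } else {
        exported[i - denseSize] = weights[i];   // sparse feature: id shifted back
      }
    }
  }

  // Prints ids 0,1 (dense) and 2,3,4 (the three sparse features).
  for (std::map<std::size_t, ValType>::const_iterator it = exported.begin();
       it != exported.end(); ++it) {
    std::cout << "feature id " << it->first << " -> " << it->second << "\n";
  }
  return 0;
}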

View File

@@ -64,9 +64,9 @@ public:
   AvgWeightVector avg();
   /**
-   * Convert to sparse vector, interpreting all features as sparse.
+   * Convert to sparse vector, interpreting all features as sparse. Only used by hgmira.
   **/
-  void ToSparse(SparseVector* sparse) const;
+  void ToSparse(SparseVector* sparse, size_t denseSize) const;
   friend class AvgWeightVector;
@@ -104,7 +104,7 @@ public:
   ValType score(const MiraFeatureVector& fv) const;
   ValType weight(std::size_t index) const;
   std::size_t size() const;
-  void ToSparse(SparseVector* sparse) const;
+  void ToSparse(SparseVector* sparse, size_t num_dense) const;
 private:
   const MiraWeightVector& m_wv;
 };

View File

@@ -3,7 +3,7 @@
 #include <string>
 #include <boost/thread/tss.hpp>
-#include "Classifier.h"
+#include "vw/Classifier.h"
 #include "moses/TypeDef.h"
 #include "moses/Util.h"
 #include "moses/FF/StatelessFeatureFunction.h"

View File

@@ -484,9 +484,7 @@ namespace Moses
     targetOffset += tp.GetSize();
   }
-  // Removing std::endl here breaks -alignment-output-file, so stop doing that, please :)
-  // Or fix it somewhere else.
-  out << std::endl;
+  // Used by --print-alignment-info, so no endl
 }
 
 void

View File

@@ -1868,8 +1868,7 @@ void Manager::OutputAlignment(ostream &out, const vector<const Hypothesis *> &ed
     targetOffset += tp.GetSize();
   }
-  // Removing std::endl here breaks -alignment-output-file, so stop doing that, please :)
-  // Or fix it somewhere else.
+  // Used by --alignment-output-file so requires endl
   out << std::endl;
 }
@@ -2024,6 +2023,8 @@ void Manager::OutputBestHypo(const Moses::TrellisPath &path, long /*translationI
 void Manager::OutputAlignment(std::ostringstream &out, const TrellisPath &path) const
 {
   Hypothesis::OutputAlignment(out, path.GetEdges());
+  // Used by --alignment-output-file so requires endl
+  out << std::endl;
 }
 
 } // namespace

View File

@@ -49,7 +49,7 @@ my $l1input = "$corpus.$l1";
 if (-e $l1input) {
   $opn = $l1input;
 } elsif (-e $l1input.".gz") {
-  $opn = "zcat $l1input.gz |";
+  $opn = "gunzip -c $l1input.gz |";
 } else {
   die "Error: $l1input does not exist";
 }
@@ -59,7 +59,7 @@ my $l2input = "$corpus.$l2";
 if (-e $l2input) {
   $opn = $l2input;
 } elsif (-e $l2input.".gz") {
-  $opn = "zcat $l2input.gz |";
+  $opn = "gunzip -c $l2input.gz |";
 } else {
   die "Error: $l2input does not exist";
 }
@@ -154,7 +154,7 @@ print STDERR "Input sentences: $innr Output sentences: $outnr\n";
 sub word_count {
   my ($line) = @_;
   if ($ignore_xml) {
-    $line =~ s/<\S[^>]*\S>//g;
+    $line =~ s/<\S[^>]*\S>/ /g;
     $line =~ s/\s+/ /g;
     $line =~ s/^ //g;
     $line =~ s/ $//g;