mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-27 22:14:57 +03:00
Merge branch 'master' of github.com:moses-smt/mosesdecoder
This commit is contained in:
commit
4cb8a1837e
@ -6,7 +6,7 @@
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <cstring>
|
||||
#include <unordered_set>
|
||||
#include <set>
|
||||
#include <glib.h>
|
||||
#include <stdexcept>
|
||||
#include <boost/thread.hpp>
|
||||
@ -1557,9 +1557,9 @@ Tokenizer::tokenize(std::istream& is, std::ostream& os)
|
||||
{
|
||||
std::size_t line_no = 0;
|
||||
std::size_t perchunk = chunksize ? chunksize : 2000;
|
||||
std::vector< std::string > lines[nthreads];
|
||||
std::vector< std::string > results[nthreads];
|
||||
boost::thread workers[nthreads];
|
||||
std::vector< std::vector< std::string > > lines(nthreads);
|
||||
std::vector< std::vector< std::string > > results(nthreads);
|
||||
std::vector< boost::thread > workers(nthreads);
|
||||
bool done_p = !(is.good() && os.good());
|
||||
|
||||
|
||||
@ -1589,20 +1589,20 @@ Tokenizer::tokenize(std::istream& is, std::ostream& os)
|
||||
results[ithread].resize(line_pos);
|
||||
break;
|
||||
}
|
||||
lines[ithread][line_pos].clear();
|
||||
lines[ithread][line_pos].clear();
|
||||
} else if (skip_xml_p &&
|
||||
(RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x))) {
|
||||
lines[ithread][line_pos].clear();
|
||||
lines[ithread][line_pos].clear();
|
||||
} else {
|
||||
lines[ithread][line_pos] =
|
||||
std::string(SPC_BYTE).append(istr).append(SPC_BYTE);
|
||||
std::string(SPC_BYTE).append(istr).append(SPC_BYTE);
|
||||
}
|
||||
}
|
||||
|
||||
if (line_pos)
|
||||
if (line_pos) {
|
||||
workers[ithread] =
|
||||
boost::thread(VectorTokenizerCallable(this,lines[ithread],results[ithread]));
|
||||
|
||||
boost::thread(VectorTokenizerCallable(this,lines[ithread],results[ithread]));
|
||||
}
|
||||
} // end for loop starting threads
|
||||
|
||||
for (std::size_t ithread = 0; ithread < nthreads; ++ithread) {
|
||||
@ -1772,12 +1772,12 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
|
||||
std::size_t finilen = 0;
|
||||
std::size_t dotslen = 0;
|
||||
|
||||
static std::size_t SEQ_LIM = 6;
|
||||
const std::size_t SEQ_LIM = 6;
|
||||
|
||||
charclass_t prev_class = empty;
|
||||
charclass_t curr_class = empty;
|
||||
charclass_t seq[SEQ_LIM] = { empty };
|
||||
std::size_t pos[SEQ_LIM] = { 0 };
|
||||
std::vector<charclass_t> seq(SEQ_LIM, empty);
|
||||
std::vector<std::size_t> pos(SEQ_LIM, 0);
|
||||
std::size_t seqpos = 0;
|
||||
|
||||
GUnicodeType curr_type = G_UNICODE_UNASSIGNED;
|
||||
@ -1785,7 +1785,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
|
||||
bool curr_word_p = false;
|
||||
|
||||
std::vector<std::size_t> breaks;
|
||||
std::unordered_set<std::size_t> suppress;
|
||||
std::set<std::size_t> suppress;
|
||||
|
||||
for (; icp <= ncp; ++icp) {
|
||||
currwc = wchar_t(ucs4[icp]);
|
||||
@ -1822,7 +1822,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
|
||||
} else if (currwc >= SMAL_HYPH) {
|
||||
curr_word_p = true;
|
||||
} else {
|
||||
curr_word_p = currwc >= WAVE_DASH && curr_word_p <= KANA_DHYP;
|
||||
curr_word_p = (currwc >= WAVE_DASH) && (currwc <= KANA_DHYP);
|
||||
}
|
||||
break;
|
||||
case G_UNICODE_CLOSE_PUNCTUATION:
|
||||
|
@ -180,7 +180,7 @@ HypergraphHopeFearDecoder::HypergraphHopeFearDecoder
|
||||
references_.Load(referenceFiles, vocab_);
|
||||
|
||||
SparseVector weights;
|
||||
wv.ToSparse(&weights);
|
||||
wv.ToSparse(&weights,num_dense_);
|
||||
scorer_ = scorer;
|
||||
|
||||
static const string kWeights = "weights";
|
||||
@ -243,7 +243,7 @@ void HypergraphHopeFearDecoder::HopeFear(
|
||||
{
|
||||
size_t sentenceId = *sentenceIdIter_;
|
||||
SparseVector weights;
|
||||
wv.ToSparse(&weights);
|
||||
wv.ToSparse(&weights, num_dense_);
|
||||
const Graph& graph = *(graphs_[sentenceId]);
|
||||
|
||||
// ValType hope_scale = 1.0;
|
||||
@ -338,7 +338,7 @@ void HypergraphHopeFearDecoder::MaxModel(const AvgWeightVector& wv, vector<ValTy
|
||||
HgHypothesis bestHypo;
|
||||
size_t sentenceId = *sentenceIdIter_;
|
||||
SparseVector weights;
|
||||
wv.ToSparse(&weights);
|
||||
wv.ToSparse(&weights, num_dense_);
|
||||
vector<ValType> bg(scorer_->NumberOfScores());
|
||||
//cerr << "Calculating bleu on " << sentenceId << endl;
|
||||
Viterbi(*(graphs_[sentenceId]), weights, 0, references_, sentenceId, bg, &bestHypo);
|
||||
|
@ -77,6 +77,7 @@ unit-test feature_data_test : FeatureDataTest.cpp mert_lib ..//boost_unit_test_f
|
||||
unit-test data_test : DataTest.cpp mert_lib ..//boost_unit_test_framework ;
|
||||
unit-test forest_rescore_test : ForestRescoreTest.cpp mert_lib ..//boost_unit_test_framework ;
|
||||
unit-test hypergraph_test : HypergraphTest.cpp mert_lib ..//boost_unit_test_framework ;
|
||||
unit-test mira_feature_vector_test : MiraFeatureVectorTest.cpp mert_lib ..//boost_unit_test_framework ;
|
||||
unit-test ngram_test : NgramTest.cpp mert_lib ..//boost_unit_test_framework ;
|
||||
unit-test optimizer_factory_test : OptimizerFactoryTest.cpp mert_lib ..//boost_unit_test_framework ;
|
||||
unit-test point_test : PointTest.cpp mert_lib ..//boost_unit_test_framework ;
|
||||
|
49
mert/MiraFeatureVectorTest.cpp
Normal file
49
mert/MiraFeatureVectorTest.cpp
Normal file
@ -0,0 +1,49 @@
|
||||
#include "MiraFeatureVector.h"
|
||||
#include "MiraWeightVector.h"
|
||||
|
||||
#define BOOST_TEST_MODULE MiraFeatureVector
|
||||
#include <boost/test/unit_test.hpp>
|
||||
|
||||
using namespace MosesTuning;
|
||||
|
||||
/* Note that the conversion to and from SparseVector needs to know
|
||||
how many of the features are really "dense". This is because in hg mira
|
||||
all features (sparse and dense) get rolled into SparseVector.
|
||||
*/
|
||||
|
||||
/* Round-trip test: fills a SparseVector with five named entries, builds a
   MiraFeatureVector from it (passing 2 as the second constructor argument),
   checks the resulting feature ids and values, then feeds that vector into a
   fresh MiraWeightVector and converts back with ToSparse(&sp2, 2), expecting
   the original named values to reappear. */
BOOST_AUTO_TEST_CASE(from_sparse) {
|
||||
SparseVector sp;
|
||||
sp.set("dense0", 0.2);
|
||||
sp.set("dense1", 0.3);
|
||||
sp.set("sparse0", 0.7);
|
||||
sp.set("sparse1", 0.9);
|
||||
sp.set("sparse2", 0.1);
|
||||
|
||||
// NOTE(review): the 2 is presumably the number of dense features — confirm against MiraFeatureVector.h
MiraFeatureVector mfv(sp,2);
|
||||
BOOST_CHECK_EQUAL(mfv.size(),5);
|
||||
|
||||
// Expected ids: the first two entries keep 0 and 1, the remaining three land on 4, 5, 6.
// NOTE(review): the gap (ids 2 and 3 unused) looks like the dense count being added to the sparse ids — confirm.
BOOST_CHECK_EQUAL(mfv.feat(0),0);
|
||||
BOOST_CHECK_EQUAL(mfv.feat(1),1);
|
||||
BOOST_CHECK_EQUAL(mfv.feat(2),4);
|
||||
BOOST_CHECK_EQUAL(mfv.feat(3),5);
|
||||
BOOST_CHECK_EQUAL(mfv.feat(4),6);
|
||||
|
||||
// Values are expected in the same order as they were set; the third argument is the
// tolerance, which Boost.Test interprets as a percentage.
BOOST_CHECK_CLOSE(mfv.val(0), 0.2,1e-5);
|
||||
BOOST_CHECK_CLOSE(mfv.val(1), 0.3,1e-5);
|
||||
BOOST_CHECK_CLOSE(mfv.val(2), 0.7,1e-5);
|
||||
BOOST_CHECK_CLOSE(mfv.val(3), 0.9,1e-5);
|
||||
BOOST_CHECK_CLOSE(mfv.val(4), 0.1,1e-5);
|
||||
|
||||
// Apply mfv to a default-constructed weight vector (second argument 1.0), then convert
// back to a SparseVector using the same value 2 that was passed to the constructor above.
MiraWeightVector mwv;
|
||||
mwv.update(mfv,1.0);
|
||||
SparseVector sp2;
|
||||
mwv.ToSparse(&sp2,2);
|
||||
|
||||
//check we get back what we started with
|
||||
BOOST_CHECK_CLOSE(sp2.get("dense0"), 0.2,1e-5);
|
||||
BOOST_CHECK_CLOSE(sp2.get("dense1"), 0.3,1e-5);
|
||||
BOOST_CHECK_CLOSE(sp2.get("sparse0"), 0.7,1e-5);
|
||||
BOOST_CHECK_CLOSE(sp2.get("sparse1"), 0.9,1e-5);
|
||||
BOOST_CHECK_CLOSE(sp2.get("sparse2"), 0.1,1e-5);
|
||||
|
||||
}
|
@ -93,11 +93,17 @@ void MiraWeightVector::update(size_t index, ValType delta)
|
||||
m_lastUpdated[index] = m_numUpdates;
|
||||
}
|
||||
|
||||
void MiraWeightVector::ToSparse(SparseVector* sparse) const
|
||||
void MiraWeightVector::ToSparse(SparseVector* sparse, size_t denseSize) const
|
||||
{
|
||||
for (size_t i = 0; i < m_weights.size(); ++i) {
|
||||
if(abs(m_weights[i])>1e-8) {
|
||||
sparse->set(i,m_weights[i]);
|
||||
if (i < denseSize) {
|
||||
sparse->set(i,m_weights[i]);
|
||||
} else {
|
||||
//The ids in MiraFeatureVector/MiraWeightVector for sparse features
|
||||
//need to be translated when converting back to SparseVector.
|
||||
sparse->set(i-denseSize, m_weights[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -172,12 +178,18 @@ size_t AvgWeightVector::size() const
|
||||
return m_wv.m_weights.size();
|
||||
}
|
||||
|
||||
void AvgWeightVector::ToSparse(SparseVector* sparse) const
|
||||
void AvgWeightVector::ToSparse(SparseVector* sparse, size_t denseSize) const
|
||||
{
|
||||
for (size_t i = 0; i < size(); ++i) {
|
||||
ValType w = weight(i);
|
||||
if(abs(w)>1e-8) {
|
||||
sparse->set(i,w);
|
||||
if (i < denseSize) {
|
||||
sparse->set(i,w);
|
||||
} else {
|
||||
//The ids in MiraFeatureVector/MiraWeightVector for sparse features
|
||||
//need to be translated when converting back to SparseVector.
|
||||
sparse->set(i-denseSize, w);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -64,9 +64,9 @@ public:
|
||||
AvgWeightVector avg();
|
||||
|
||||
/**
|
||||
* Convert to sparse vector, interpreting all features as sparse.
|
||||
* Convert to sparse vector, interpreting all features as sparse; weights at indices >= denseSize are stored under (index - denseSize). Only used by hgmira.
|
||||
**/
|
||||
void ToSparse(SparseVector* sparse) const;
|
||||
void ToSparse(SparseVector* sparse, size_t denseSize) const;
|
||||
|
||||
friend class AvgWeightVector;
|
||||
|
||||
@ -104,7 +104,7 @@ public:
|
||||
ValType score(const MiraFeatureVector& fv) const;
|
||||
ValType weight(std::size_t index) const;
|
||||
std::size_t size() const;
|
||||
void ToSparse(SparseVector* sparse) const;
|
||||
void ToSparse(SparseVector* sparse, size_t num_dense) const;
|
||||
private:
|
||||
const MiraWeightVector& m_wv;
|
||||
};
|
||||
|
@ -3,7 +3,7 @@
|
||||
#include <string>
|
||||
#include <boost/thread/tss.hpp>
|
||||
|
||||
#include "Classifier.h"
|
||||
#include "vw/Classifier.h"
|
||||
#include "moses/TypeDef.h"
|
||||
#include "moses/Util.h"
|
||||
#include "moses/FF/StatelessFeatureFunction.h"
|
||||
|
@ -484,9 +484,7 @@ namespace Moses
|
||||
|
||||
targetOffset += tp.GetSize();
|
||||
}
|
||||
// Removing std::endl here breaks -alignment-output-file, so stop doing that, please :)
|
||||
// Or fix it somewhere else.
|
||||
out << std::endl;
|
||||
// Used by --print-alignment-info, so no endl
|
||||
}
|
||||
|
||||
void
|
||||
|
@ -1868,8 +1868,7 @@ void Manager::OutputAlignment(ostream &out, const vector<const Hypothesis *> &ed
|
||||
|
||||
targetOffset += tp.GetSize();
|
||||
}
|
||||
// Removing std::endl here breaks -alignment-output-file, so stop doing that, please :)
|
||||
// Or fix it somewhere else.
|
||||
// Used by --alignment-output-file so requires endl
|
||||
out << std::endl;
|
||||
}
|
||||
|
||||
@ -2024,6 +2023,8 @@ void Manager::OutputBestHypo(const Moses::TrellisPath &path, long /*translationI
|
||||
// Writes the alignment for the edges of `path` to `out` by delegating to
// Hypothesis::OutputAlignment, then terminates the line with std::endl.
void Manager::OutputAlignment(std::ostringstream &out, const TrellisPath &path) const
|
||||
{
|
||||
Hypothesis::OutputAlignment(out, path.GetEdges());
|
||||
// Used by --alignment-output-file so requires endl
|
||||
out << std::endl;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
@ -49,7 +49,7 @@ my $l1input = "$corpus.$l1";
|
||||
if (-e $l1input) {
|
||||
$opn = $l1input;
|
||||
} elsif (-e $l1input.".gz") {
|
||||
$opn = "zcat $l1input.gz |";
|
||||
$opn = "gunzip -c $l1input.gz |";
|
||||
} else {
|
||||
die "Error: $l1input does not exist";
|
||||
}
|
||||
@ -59,7 +59,7 @@ my $l2input = "$corpus.$l2";
|
||||
if (-e $l2input) {
|
||||
$opn = $l2input;
|
||||
} elsif (-e $l2input.".gz") {
|
||||
$opn = "zcat $l2input.gz |";
|
||||
$opn = "gunzip -c $l2input.gz |";
|
||||
} else {
|
||||
die "Error: $l2input does not exist";
|
||||
}
|
||||
@ -154,7 +154,7 @@ print STDERR "Input sentences: $innr Output sentences: $outnr\n";
|
||||
sub word_count {
|
||||
my ($line) = @_;
|
||||
if ($ignore_xml) {
|
||||
$line =~ s/<\S[^>]*\S>//g;
|
||||
$line =~ s/<\S[^>]*\S>/ /g;
|
||||
$line =~ s/\s+/ /g;
|
||||
$line =~ s/^ //g;
|
||||
$line =~ s/ $//g;
|
||||
|
Loading…
Reference in New Issue
Block a user