mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-10-27 03:49:57 +03:00
Merge branch 'master' of https://github.com/moses-smt/mosesdecoder into mmt-dev
Conflicts: moses/Syntax/F2S/Manager-inl.h moses/TranslationModel/UG/mmsapt.cpp
This commit is contained in:
commit
0d72cdd72c
@ -109,14 +109,17 @@ size_t lookup( string query )
|
||||
return suffixArray.Count( queryString );
|
||||
}
|
||||
|
||||
// Duplicate of definition in util/tokenize.hh.
|
||||
// TODO: Can we de-duplicate this? At the time of writing biconcor does not
|
||||
// use util at all.
|
||||
vector<string> tokenize(const char input[])
|
||||
{
|
||||
vector< string > token;
|
||||
bool betweenWords = true;
|
||||
int start=0;
|
||||
int i=0;
|
||||
for(; input[i] != '\0'; i++) {
|
||||
bool isSpace = (input[i] == ' ' || input[i] == '\t');
|
||||
int i;
|
||||
for(i = 0; input[i] != '\0'; i++) {
|
||||
const bool isSpace = (input[i] == ' ' || input[i] == '\t');
|
||||
|
||||
if (!isSpace && betweenWords) {
|
||||
start = i;
|
||||
|
@ -45,8 +45,8 @@ struct LMClient {
|
||||
exit(1);
|
||||
}
|
||||
|
||||
bzero((char *)&server, sizeof(server));
|
||||
bcopy(hp->h_addr, (char *)&server.sin_addr, hp->h_length);
|
||||
memset(&server, '\0', sizeof(server));
|
||||
memcpy((char *)&server.sin_addr, hp->h_addr, hp->h_length);
|
||||
server.sin_family = hp->h_addrtype;
|
||||
server.sin_port = htons(port);
|
||||
|
||||
|
46
contrib/mada/qsub-madamira.perl
Executable file
46
contrib/mada/qsub-madamira.perl
Executable file
@ -0,0 +1,46 @@
|
||||
#!/usr/bin/env perl

# Submit one MADAMIRA job per input split to a PBS/Torque cluster.
#
# Usage: qsub-madamira.perl SPLIT_DIR
#
# For every entry in SPLIT_DIR a small bash job script is written to
# <parent of SPLIT_DIR>/qsub/<entry>.sh and handed to `qsub`. Each job runs
# MADAMIRA on that entry and writes its output to <parent of SPLIT_DIR>/out2.
#
# NOTE: the PBS working directory, PATH, module names and MADAMIRA location
# written into the job scripts are hard-coded for one specific cluster;
# adjust them before using this script anywhere else.

use warnings;
use strict;
use File::Slurp;
use File::Basename;
use File::Path 'make_path';
use Cwd 'abs_path';

# Quote a string so bash treats it as a single literal word, even when it
# contains spaces or shell metacharacters.
sub shell_quote {
    my ($text) = @_;
    $text =~ s/'/'\\''/g;
    return "'$text'";
}

die "Usage: $0 SPLIT_DIR\n" unless @ARGV == 1;

my $splitDir = abs_path($ARGV[0]);
die "Not a directory: $ARGV[0]\n"
    unless defined $splitDir && -d $splitDir;

# read_dir (File::Slurp) already leaves out "." and "..".
my @files = read_dir($splitDir);

# Job scripts and MADAMIRA output live next to the split directory.
my $qsubDir = dirname($splitDir) . "/qsub";
print STDERR "qsubDir=$qsubDir\n";
make_path($qsubDir);

my $out2Dir = dirname($splitDir) . "/out2";
print STDERR "out2Dir=$out2Dir\n";
make_path($out2Dir);

for my $file ( @files ) {
    print STDERR "$file ";

    my $qsubFile = "$qsubDir/$file.sh";
    open(my $runFile, '>', $qsubFile)
        or die "Cannot write $qsubFile: $!\n";

    # Job header: PBS directives plus the environment MADAMIRA needs.
    print {$runFile} "#!/usr/bin/env bash\n"
        ."#PBS -d/scratch/hh65/workspace/experiment/ar-en \n"
        ."#PBS -l mem=5gb \n\n"
        ."export PATH=\"/scratch/statmt/bin:/share/apps/NYUAD/perl/gcc_4.9.1/5.20.1/bin:/share/apps/NYUAD/jdk/1.8.0_31/bin:/share/apps/NYUAD/zlib/gcc_4.9.1/1.2.8/bin:/share/apps/NYUAD/cmake/gcc_4.9.1/3.1.0-rc3/bin:/share/apps/NYUAD/boost/gcc_4.9.1/openmpi_1.8.3/1.57.0/bin:/share/apps/NYUAD/openmpi/gcc_4.9.1/1.8.3/bin:/share/apps/NYUAD/python/gcc_4.9.1/2.7.9/bin:/share/apps/NYUAD/gcc/binutils/2.21/el6/bin:/share/apps/NYUAD/gcc/gcc/4.9.1/el6/bin:/usr/lib64/qt-3.3/bin:/usr/local/bin:/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/sbin:/opt/bio/ncbi/bin:/opt/bio/mpiblast/bin:/opt/bio/EMBOSS/bin:/opt/bio/clustalw/bin:/opt/bio/tcoffee/bin:/opt/bio/hmmer/bin:/opt/bio/phylip/exe:/opt/bio/mrbayes:/opt/bio/fasta:/opt/bio/glimmer/bin:/opt/bio/glimmer/scripts:/opt/bio/gromacs/bin:/opt/bio/gmap/bin:/opt/bio/tigr/bin:/opt/bio/autodocksuite/bin:/opt/bio/wgs/bin:/opt/ganglia/bin:/opt/ganglia/sbin:/opt/bin:/usr/java/latest/bin:/opt/pdsh/bin:/opt/rocks/bin:/opt/rocks/sbin:/opt/torque/bin:/opt/torque/sbin:/home/hh65/bin:/home/hh65/bin\" \n"
        ."module load NYUAD/2.0 \n"
        ."module load gcc python/2.7.9 openmpi/1.8.3 boost cmake zlib jdk perl expat \n"
        ."cd /scratch/statmt/MADAMIRA-release-20140709-1.0 \n";

    # The actual work. User-derived paths are quoted so that names with
    # spaces or shell metacharacters survive inside the job script.
    print {$runFile} "java -Xmx2500m -Xms2500m -XX:NewRatio=3 -jar /scratch/statmt/MADAMIRA-release-20140709-1.0/MADAMIRA.jar "
        ."-rawinput " . shell_quote("$splitDir/$file")
        ." -rawoutdir " . shell_quote($out2Dir)
        ." -rawconfig /scratch/statmt/MADAMIRA-release-20140709-1.0/samples/sampleConfigFile.xml \n";

    # Buffered write errors only surface at close, so check it.
    close($runFile) or die "Cannot finish writing $qsubFile: $!\n";

    # List-form pipe open runs qsub without a shell. Its stdout (the job id)
    # is read and discarded, so this script still prints nothing to STDOUT.
    open(my $qsubOut, '-|', 'qsub', $qsubFile)
        or die "Cannot run qsub: $!\n";
    my @jobInfo = <$qsubOut>;
    # A failed submission is reported but does not stop the remaining ones.
    close($qsubOut)
        or warn "qsub failed for $qsubFile (wait status $?)\n";
}
|
||||
|
@ -46,6 +46,7 @@ namespace mpi = boost::mpi;
|
||||
#include "moses/FF/PhrasePairFeature.h"
|
||||
#include "moses/FF/WordPenaltyProducer.h"
|
||||
#include "moses/LM/Base.h"
|
||||
#include "util/random.hh"
|
||||
|
||||
using namespace Mira;
|
||||
using namespace std;
|
||||
@ -54,6 +55,7 @@ namespace po = boost::program_options;
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
util::rand_init();
|
||||
size_t rank = 0;
|
||||
size_t size = 1;
|
||||
#ifdef MPI_ENABLE
|
||||
|
@ -25,6 +25,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#include "moses/Word.h"
|
||||
#include "moses/FF/FeatureFunction.h"
|
||||
#include "Decoder.h"
|
||||
#include "util/random.hh"
|
||||
|
||||
typedef std::map<const Moses::FeatureFunction*, std::vector< float > > ProducerWeightMap;
|
||||
typedef std::pair<const Moses::FeatureFunction*, std::vector< float > > ProducerWeightPair;
|
||||
@ -37,8 +38,7 @@ template <class T> bool from_string(T& t, const std::string& s, std::ios_base& (
|
||||
|
||||
struct RandomIndex {
|
||||
ptrdiff_t operator()(ptrdiff_t max) {
|
||||
srand(time(0)); // Initialize random number generator with current time.
|
||||
return static_cast<ptrdiff_t> (rand() % max);
|
||||
return util::rand_excl(max);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -1,5 +1,22 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<CodeLite_Project Name="manual-label" InternalType="Console">
|
||||
<Plugins>
|
||||
<Plugin Name="CMakePlugin">
|
||||
<![CDATA[[{
|
||||
"name": "Debug",
|
||||
"enabled": false,
|
||||
"buildDirectory": "build",
|
||||
"sourceDirectory": "$(ProjectPath)",
|
||||
"generator": "",
|
||||
"buildType": "",
|
||||
"arguments": [],
|
||||
"parentProject": ""
|
||||
}]]]>
|
||||
</Plugin>
|
||||
<Plugin Name="qmake">
|
||||
<![CDATA[00010001N0005Debug000000000000]]>
|
||||
</Plugin>
|
||||
</Plugins>
|
||||
<Description/>
|
||||
<Dependencies/>
|
||||
<VirtualDirectory Name="manual-label">
|
||||
@ -14,6 +31,8 @@
|
||||
<File Name="Main.cpp"/>
|
||||
<File Name="Main.h"/>
|
||||
</VirtualDirectory>
|
||||
<Dependencies Name="Debug"/>
|
||||
<Dependencies Name="Release"/>
|
||||
<Settings Type="Executable">
|
||||
<GlobalSettings>
|
||||
<Compiler Options="" C_Options="" Assembler="">
|
||||
@ -33,6 +52,8 @@
|
||||
<Linker Options="" Required="yes">
|
||||
<LibraryPath Value="/Users/hieu/workspace/github/mosesdecoder/boost/lib64"/>
|
||||
<Library Value="boost_program_options"/>
|
||||
<Library Value="boost_filesystem"/>
|
||||
<Library Value="boost_system"/>
|
||||
</Linker>
|
||||
<ResourceCompiler Options="" Required="no"/>
|
||||
<General OutputFile="$(IntermediateDirectory)/$(ProjectName)" IntermediateDirectory="./Debug" Command="./$(ProjectName)" CommandArguments="" UseSeparateDebugArgs="no" DebugArguments="" WorkingDirectory="$(IntermediateDirectory)" PauseExecWhenProcTerminates="yes" IsGUIProgram="no" IsEnabled="yes"/>
|
||||
@ -107,6 +128,4 @@
|
||||
</Completion>
|
||||
</Configuration>
|
||||
</Settings>
|
||||
<Dependencies Name="Debug"/>
|
||||
<Dependencies Name="Release"/>
|
||||
</CodeLite_Project>
|
||||
|
@ -1,5 +1,22 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<CodeLite_Project Name="moses-cmd" InternalType="Console">
|
||||
<Plugins>
|
||||
<Plugin Name="CMakePlugin">
|
||||
<![CDATA[[{
|
||||
"name": "Debug",
|
||||
"enabled": false,
|
||||
"buildDirectory": "build",
|
||||
"sourceDirectory": "$(ProjectPath)",
|
||||
"generator": "",
|
||||
"buildType": "",
|
||||
"arguments": [],
|
||||
"parentProject": ""
|
||||
}]]]>
|
||||
</Plugin>
|
||||
<Plugin Name="qmake">
|
||||
<![CDATA[00010001N0005Debug000000000000]]>
|
||||
</Plugin>
|
||||
</Plugins>
|
||||
<Description/>
|
||||
<Dependencies/>
|
||||
<VirtualDirectory Name="src"/>
|
||||
@ -9,6 +26,14 @@
|
||||
<File Name="../../../moses-cmd/MainVW.cpp" ExcludeProjConfig="Debug"/>
|
||||
<File Name="../../../moses-cmd/MainVW.h" ExcludeProjConfig="Debug"/>
|
||||
</VirtualDirectory>
|
||||
<Dependencies Name="Release"/>
|
||||
<Dependencies Name="Debug">
|
||||
<Project Name="OnDiskPt"/>
|
||||
<Project Name="lm"/>
|
||||
<Project Name="moses"/>
|
||||
<Project Name="search"/>
|
||||
<Project Name="util"/>
|
||||
</Dependencies>
|
||||
<Settings Type="Executable">
|
||||
<GlobalSettings>
|
||||
<Compiler Options="" C_Options="" Assembler="">
|
||||
@ -53,7 +78,7 @@
|
||||
<Library Value="rt"/>
|
||||
</Linker>
|
||||
<ResourceCompiler Options="" Required="no"/>
|
||||
<General OutputFile="$(IntermediateDirectory)/$(ProjectName)" IntermediateDirectory="./Debug" Command="./$(ProjectName)" CommandArguments="" UseSeparateDebugArgs="no" DebugArguments="" WorkingDirectory="$(IntermediateDirectory)" PauseExecWhenProcTerminates="yes" IsGUIProgram="no" IsEnabled="yes"/>
|
||||
<General OutputFile="$(IntermediateDirectory)/$(ProjectName)" IntermediateDirectory="./Debug" Command="./$(ProjectName)" CommandArguments="-f /var/folders/c4/2p48fcwx611dmkdqq44mbblm0000gn/T/ZVd8xvuJAR.ini -i /Users/hieu/workspace/github/moses-regression-tests/tests/phrase.basic-surface-binptable.oldformat/to-translate.txt" UseSeparateDebugArgs="yes" DebugArguments="" WorkingDirectory="$(IntermediateDirectory)" PauseExecWhenProcTerminates="yes" IsGUIProgram="no" IsEnabled="yes"/>
|
||||
<Environment EnvVarSetName="<Use Defaults>" DbgSetName="<Use Defaults>">
|
||||
<![CDATA[]]>
|
||||
</Environment>
|
||||
@ -125,12 +150,4 @@
|
||||
</Completion>
|
||||
</Configuration>
|
||||
</Settings>
|
||||
<Dependencies Name="Release"/>
|
||||
<Dependencies Name="Debug">
|
||||
<Project Name="OnDiskPt"/>
|
||||
<Project Name="lm"/>
|
||||
<Project Name="moses"/>
|
||||
<Project Name="search"/>
|
||||
<Project Name="util"/>
|
||||
</Dependencies>
|
||||
</CodeLite_Project>
|
||||
|
@ -474,8 +474,6 @@
|
||||
<File Name="../../../moses/FF/DistortionScoreProducer.h"/>
|
||||
<File Name="../../../moses/FF/DynamicCacheBasedLanguageModel.cpp"/>
|
||||
<File Name="../../../moses/FF/DynamicCacheBasedLanguageModel.h"/>
|
||||
<File Name="../../../moses/FF/ExternalFeature.cpp"/>
|
||||
<File Name="../../../moses/FF/ExternalFeature.h"/>
|
||||
<File Name="../../../moses/FF/Factory.cpp"/>
|
||||
<File Name="../../../moses/FF/Factory.h"/>
|
||||
<File Name="../../../moses/FF/FeatureFunction.cpp"/>
|
||||
|
@ -42,6 +42,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#include "RelativeEntropyCalc.h"
|
||||
#include "LexicalReordering.h"
|
||||
#include "LexicalReorderingState.h"
|
||||
#include "util/random.hh"
|
||||
|
||||
#ifdef HAVE_PROTOBUF
|
||||
#include "hypergraph.pb.h"
|
||||
@ -205,7 +206,7 @@ int main(int argc, char** argv)
|
||||
|
||||
|
||||
//initialise random numbers
|
||||
srand(time(NULL));
|
||||
rand_init();
|
||||
|
||||
// set up read/writing class
|
||||
IOWrapper* ioWrapper = GetIOWrapper(staticData);
|
||||
|
@ -536,7 +536,7 @@ public:
|
||||
{
|
||||
// should the score breakdown be reported in a more structured manner?
|
||||
ostringstream buf;
|
||||
path.GetScoreBreakdown().OutputAllFeatureScores(buf);
|
||||
path.GetScoreBreakdown()->OutputAllFeatureScores(buf);
|
||||
nBestXMLItem["fvals"] = xmlrpc_c::value_string(buf.str());
|
||||
}
|
||||
|
||||
|
@ -17,6 +17,7 @@
|
||||
#include "util/exception.hh"
|
||||
|
||||
#include "util/file_piece.hh"
|
||||
#include "util/random.hh"
|
||||
#include "util/tokenize_piece.hh"
|
||||
#include "util/string_piece.hh"
|
||||
#include "FeatureDataIterator.h"
|
||||
@ -286,7 +287,7 @@ void Data::createShards(size_t shard_count, float shard_size, const string& scor
|
||||
} else {
|
||||
//create shards by randomly sampling
|
||||
for (size_t i = 0; i < floor(shard_size+0.5); ++i) {
|
||||
shard_contents.push_back(rand() % data_size);
|
||||
shard_contents.push_back(util::rand_excl(data_size));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -13,6 +13,8 @@
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
#include "util/unistd.hh"
|
||||
|
||||
#if defined(__GLIBCXX__) || defined(__GLIBCPP__)
|
||||
#include <ext/stdio_filebuf.h>
|
||||
|
||||
|
@ -40,28 +40,3 @@ inputfilestream::~inputfilestream()
|
||||
void inputfilestream::close()
|
||||
{
|
||||
}
|
||||
|
||||
outputfilestream::outputfilestream(const std::string &filePath)
|
||||
: std::ostream(0), m_streambuf(0), m_is_good(false)
|
||||
{
|
||||
// check if file is readable
|
||||
std::filebuf* fb = new std::filebuf();
|
||||
m_is_good = (fb->open(filePath.c_str(), std::ios::out) != NULL);
|
||||
|
||||
if (IsGzipFile(filePath)) {
|
||||
throw runtime_error("Output to a zipped file not supported!");
|
||||
} else {
|
||||
m_streambuf = fb;
|
||||
}
|
||||
this->init(m_streambuf);
|
||||
}
|
||||
|
||||
outputfilestream::~outputfilestream()
|
||||
{
|
||||
delete m_streambuf;
|
||||
m_streambuf = 0;
|
||||
}
|
||||
|
||||
void outputfilestream::close()
|
||||
{
|
||||
}
|
||||
|
@ -22,20 +22,4 @@ public:
|
||||
void close();
|
||||
};
|
||||
|
||||
class outputfilestream : public std::ostream
|
||||
{
|
||||
protected:
|
||||
std::streambuf *m_streambuf;
|
||||
bool m_is_good;
|
||||
|
||||
public:
|
||||
explicit outputfilestream(const std::string &filePath);
|
||||
virtual ~outputfilestream();
|
||||
|
||||
bool good() const {
|
||||
return m_is_good;
|
||||
}
|
||||
void close();
|
||||
};
|
||||
|
||||
#endif // MERT_FILE_STREAM_H_
|
||||
|
@ -1,6 +1,9 @@
|
||||
#include <iostream>
|
||||
|
||||
#include "util/tokenize_piece.hh"
|
||||
|
||||
#include "ForestRescore.h"
|
||||
#include "MiraFeatureVector.h"
|
||||
|
||||
#define BOOST_TEST_MODULE MertForestRescore
|
||||
#include <boost/test/unit_test.hpp>
|
||||
@ -10,8 +13,7 @@
|
||||
using namespace std;
|
||||
using namespace MosesTuning;
|
||||
|
||||
BOOST_AUTO_TEST_CASE(viterbi_simple_lattice)
|
||||
{
|
||||
BOOST_AUTO_TEST_CASE(viterbi_simple_lattice) {
|
||||
Vocab vocab;
|
||||
WordVec words;
|
||||
string wordStrings[] =
|
||||
@ -242,5 +244,101 @@ BOOST_AUTO_TEST_CASE(viterbi_3branch_lattice)
|
||||
BOOST_CHECK_EQUAL(6, hopeHypo.bleuStats[8]);
|
||||
}
|
||||
|
||||
// Runs Viterbi over a hypergraph read from mert/hgtest/0.gz: the graph is
// pruned to 500 edges, the best hypothesis is extracted, and both its token
// sequence and its feature values are compared with known-good values.
BOOST_AUTO_TEST_CASE(viterbi_full_hypergraph) {
  Vocab vocab;

  // One reference translation, registered for sentence 0.
  ReferenceSet references;
  references.AddLine(0,"in addition to EU support for businesses , also the administration of national business support will be concentrated in four Centres for Economic Development , Transport and Environment ( ELY Centres ) , starting from mid @-@ September .",vocab);

  // Read the hypergraph fixture; FilePiece takes over the descriptor.
  Graph graph(vocab);
  util::scoped_fd fd(util::OpenReadOrThrow("mert/hgtest/0.gz"));
  util::FilePiece file(fd.release());
  ReadGraph(file,graph);

  // Feature weights used both for pruning and for the search, applied in
  // table order.
  static const struct {
    const char* name;
    double value;
  } kWeights[] = {
    {"OpSequenceModel0_1", 0.011187},
    {"OpSequenceModel0_2", -0.002797},
    {"OpSequenceModel0_3", 0.002797},
    {"OpSequenceModel0_4", -0.000140},
    {"OpSequenceModel0_5", 0.004195},
    {"Distortion0", 0.041952},
    {"PhrasePenalty0", 0.027968},
    {"WordPenalty0", -0.139841},
    {"UnknownWordPenalty0", 1.000000},
    {"LM0", 0.069920},
    {"LexicalReordering0_1", 0.041952},
    {"LexicalReordering0_2", 0.041952},
    {"LexicalReordering0_3", 0.041952},
    {"LexicalReordering0_4", 0.041952},
    {"LexicalReordering0_5", 0.041952},
    {"LexicalReordering0_6", 0.041952},
    {"LexicalReordering0_7", 0.041952},
    {"LexicalReordering0_8", 0.041952},
    {"TranslationModel0_1", 0.027968},
    {"TranslationModel0_2", 0.027968},
    {"TranslationModel0_3", 0.027968},
    {"TranslationModel0_4", 0.027968},
    {"TranslationModel0_5", 0.027968},
    {"TranslationModel0_6", 0.027968},
    {"TranslationModel0_7", 0.027968},
    {"TranslationModel0_8", 0.027968},
    {"TranslationModel0_9", 0.027968},
    {"TranslationModel0_10", 0.027968},
    {"TranslationModel0_11", 0.027968},
    {"TranslationModel0_12", 0.027968},
    {"TranslationModel0_13", 0.027968}
  };
  SparseVector weights;
  for (size_t w = 0; w < sizeof(kWeights) / sizeof(kWeights[0]); ++w) {
    weights.set(kWeights[w].name, kWeights[w].value);
  }

  // Prune into a fresh graph that shares the vocabulary.
  const size_t edgeCount = 500;
  boost::shared_ptr<Graph> prunedGraph(new Graph(vocab));
  graph.Prune(prunedGraph.get(), weights, edgeCount);

  // Search for the best hypothesis in the pruned graph.
  vector<ValType> backgroundBleu(9);
  HgHypothesis bestHypo;
  Viterbi(*prunedGraph, weights, 0, references, 0, backgroundBleu, &bestHypo);

  // The hypothesis must spell out exactly the expected token sequence.
  string expectedStr = "<s> the EU matters , but also the national matters management focus since mid @-@ September four ely @-@ centre . </s>";
  util::TokenIter<util::SingleCharacter, true> expected(expectedStr, util::SingleCharacter(' '));
  for (size_t pos = 0; pos < bestHypo.text.size(); ++pos) {
    BOOST_CHECK_EQUAL(*expected,bestHypo.text[pos]->first);
    ++expected;
  }
  // No expected tokens may be left once the hypothesis is consumed.
  BOOST_CHECK(!expected);

  // Feature values of the best hypothesis.
  BOOST_CHECK_CLOSE(-80.062,bestHypo.featureVector.get("OpSequenceModel0_1"), 0.001);
  BOOST_CHECK_CLOSE(2,bestHypo.featureVector.get("OpSequenceModel0_2"), 0.001);
  BOOST_CHECK_CLOSE(2,bestHypo.featureVector.get("OpSequenceModel0_3"), 0.001);
  BOOST_CHECK_CLOSE(3,bestHypo.featureVector.get("OpSequenceModel0_4"), 0.001);
  BOOST_CHECK_CLOSE(0,bestHypo.featureVector.get("OpSequenceModel0_5"), 0.001);
  BOOST_CHECK_CLOSE(-6,bestHypo.featureVector.get("Distortion0"), 0.001);
  BOOST_CHECK_CLOSE(14,bestHypo.featureVector.get("PhrasePenalty0"), 0.001);
  BOOST_CHECK_CLOSE(-20,bestHypo.featureVector.get("WordPenalty0"), 0.001);
  BOOST_CHECK_CLOSE(-100,bestHypo.featureVector.get("UnknownWordPenalty0"), 0.001);
  BOOST_CHECK_CLOSE(-126.616,bestHypo.featureVector.get("LM0"), 0.001);
  BOOST_CHECK_CLOSE(-5.2238,bestHypo.featureVector.get("LexicalReordering0_1"), 0.001);
  BOOST_CHECK_CLOSE(-0.29515,bestHypo.featureVector.get("LexicalReordering0_2"), 0.001);
  BOOST_CHECK_CLOSE(0,bestHypo.featureVector.get("LexicalReordering0_3"), 0.001);
  BOOST_CHECK_CLOSE(-0.470004,bestHypo.featureVector.get("LexicalReordering0_4"), 0.001);
  BOOST_CHECK_CLOSE(-9.28267,bestHypo.featureVector.get("LexicalReordering0_5"), 0.001);
  BOOST_CHECK_CLOSE(-0.470004,bestHypo.featureVector.get("LexicalReordering0_6"), 0.001);
  BOOST_CHECK_CLOSE(0,bestHypo.featureVector.get("LexicalReordering0_7"), 0.001);
  BOOST_CHECK_CLOSE(-0.402678,bestHypo.featureVector.get("LexicalReordering0_8"), 0.001);
  BOOST_CHECK_CLOSE(-54.3119,bestHypo.featureVector.get("TranslationModel0_1"), 0.001);
  BOOST_CHECK_CLOSE(-62.2619,bestHypo.featureVector.get("TranslationModel0_2"), 0.001);
  BOOST_CHECK_CLOSE(-23.8782,bestHypo.featureVector.get("TranslationModel0_3"), 0.001);
  BOOST_CHECK_CLOSE(-25.1626,bestHypo.featureVector.get("TranslationModel0_4"), 0.001);
  BOOST_CHECK_CLOSE(12.9986,bestHypo.featureVector.get("TranslationModel0_5"), 0.001);
  BOOST_CHECK_CLOSE(3.99959,bestHypo.featureVector.get("TranslationModel0_6"), 0.001);
  BOOST_CHECK_CLOSE(1.99979,bestHypo.featureVector.get("TranslationModel0_7"), 0.001);
  BOOST_CHECK_CLOSE(1.99979,bestHypo.featureVector.get("TranslationModel0_8"), 0.001);
  BOOST_CHECK_CLOSE(0,bestHypo.featureVector.get("TranslationModel0_9"), 0.001);
  BOOST_CHECK_CLOSE(0,bestHypo.featureVector.get("TranslationModel0_10"), 0.001);
  BOOST_CHECK_CLOSE(0,bestHypo.featureVector.get("TranslationModel0_11"), 0.001);
  BOOST_CHECK_CLOSE(0.999896,bestHypo.featureVector.get("TranslationModel0_12"), 0.001);
  BOOST_CHECK_CLOSE(7.99917,bestHypo.featureVector.get("TranslationModel0_13"), 0.001);
}
|
||||
|
||||
|
||||
|
@ -18,6 +18,7 @@
|
||||
|
||||
#include "ScoreStats.h"
|
||||
#include "Util.h"
|
||||
#include "util/unistd.hh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -25,7 +26,7 @@ namespace MosesTuning
|
||||
{
|
||||
|
||||
// Meteor supported
|
||||
#if defined(__GLIBCXX__) || defined(__GLIBCPP__)
|
||||
#if (defined(__GLIBCXX__) || defined(__GLIBCPP__)) && !defined(_WIN32)
|
||||
|
||||
// for clarity
|
||||
#define CHILD_STDIN_READ pipefds_input[0]
|
||||
|
@ -3,6 +3,7 @@
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
#include "util/exception.hh"
|
||||
#include "util/random.hh"
|
||||
#include "FeatureStats.h"
|
||||
#include "Optimizer.h"
|
||||
|
||||
@ -57,10 +58,8 @@ void Point::Randomize()
|
||||
UTIL_THROW_IF(m_min.size() != Point::m_dim, util::Exception, "Error");
|
||||
UTIL_THROW_IF(m_max.size() != Point::m_dim, util::Exception, "Error");
|
||||
|
||||
for (unsigned int i = 0; i < size(); i++) {
|
||||
operator[](i) = m_min[i] +
|
||||
static_cast<float>(random()) / static_cast<float>(RAND_MAX) * (m_max[i] - m_min[i]);
|
||||
}
|
||||
for (unsigned int i = 0; i < size(); i++)
|
||||
operator[](i) = util::rand_incl(m_min[i], m_max[i]);
|
||||
}
|
||||
|
||||
double Point::operator*(const FeatureStats& F) const
|
||||
|
@ -5,11 +5,8 @@
|
||||
|
||||
- check that --pairwise-ranked is compatible with all optimization metrics
|
||||
|
||||
- Replace the standard rand() currently used in MERT and PRO with better
|
||||
random generators such as Boost's random generators (e.g., boost::mt19937).
|
||||
- create a Random class to hide the details, i.e., how to generate
|
||||
random numbers, which allows us to use custom random generators more
|
||||
easily.
|
||||
- Use better random generators in util/random.cc, e.g. boost::mt19937.
|
||||
- Support plugging of custom random generators.
|
||||
|
||||
Pros:
|
||||
- In MERT, you might want to use the random restarting technique to avoid
|
||||
|
@ -11,7 +11,20 @@ using namespace MosesTuning;
|
||||
BOOST_AUTO_TEST_CASE(timer_basic_test)
|
||||
{
|
||||
Timer timer;
|
||||
const int sleep_time_microsec = 40; // ad-hoc microseconds to pass unit tests.
|
||||
|
||||
// Sleep time. The test will sleep for this number of microseconds, and
|
||||
// expect the elapsed time to be noticeable.
|
||||
// Keep this number low to avoid wasting test time sleeping, but at least as
|
||||
// high as the Boost timer's resolution. Tests must pass consistently, not
|
||||
// just on lucky runs.
|
||||
#if defined(WIN32)
|
||||
// Timer resolution on Windows seems to be a millisecond. Anything less and
|
||||
// the test fails consistently.
|
||||
const int sleep_time_microsec = 1000;
|
||||
#else
|
||||
// Unix-like systems seem to have more fine-grained clocks.
|
||||
const int sleep_time_microsec = 40;
|
||||
#endif
|
||||
|
||||
timer.start();
|
||||
BOOST_REQUIRE(timer.is_running());
|
||||
|
@ -1,3 +1,4 @@
|
||||
#include <cstdlib>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
@ -15,6 +16,7 @@
|
||||
#include "Timer.h"
|
||||
#include "Util.h"
|
||||
#include "Data.h"
|
||||
#include "util/random.hh"
|
||||
|
||||
using namespace std;
|
||||
using namespace MosesTuning;
|
||||
@ -91,17 +93,15 @@ void EvaluatorUtil::evaluate(const string& candFile, int bootstrap, bool nbest_i
|
||||
if (bootstrap) {
|
||||
vector<float> scores;
|
||||
for (int i = 0; i < bootstrap; ++i) {
|
||||
// TODO: Use smart pointer for exceptional-safety.
|
||||
ScoreData* scoredata = new ScoreData(g_scorer);
|
||||
ScoreData scoredata(g_scorer);
|
||||
for (int j = 0; j < n; ++j) {
|
||||
int randomIndex = random() % n;
|
||||
scoredata->add(entries[randomIndex], j);
|
||||
const int randomIndex = util::rand_excl(n);
|
||||
scoredata.add(entries[randomIndex], j);
|
||||
}
|
||||
g_scorer->setScoreData(scoredata);
|
||||
g_scorer->setScoreData(&scoredata);
|
||||
candidates_t candidates(n, 0);
|
||||
float score = g_scorer->score(candidates);
|
||||
scores.push_back(score);
|
||||
delete scoredata;
|
||||
}
|
||||
|
||||
float avg = average(scores);
|
||||
@ -121,15 +121,13 @@ void EvaluatorUtil::evaluate(const string& candFile, int bootstrap, bool nbest_i
|
||||
cout.precision(4);
|
||||
cout << avg << "\t[" << lb << "," << rb << "]" << endl;
|
||||
} else {
|
||||
// TODO: Use smart pointer for exceptional-safety.
|
||||
ScoreData* scoredata = new ScoreData(g_scorer);
|
||||
ScoreData scoredata(g_scorer);
|
||||
for (int sid = 0; sid < n; ++sid) {
|
||||
scoredata->add(entries[sid], sid);
|
||||
scoredata.add(entries[sid], sid);
|
||||
}
|
||||
g_scorer->setScoreData(scoredata);
|
||||
g_scorer->setScoreData(&scoredata);
|
||||
candidates_t candidates(n, 0);
|
||||
float score = g_scorer->score(candidates);
|
||||
delete scoredata;
|
||||
|
||||
if (g_has_more_files) cout << candFile << "\t";
|
||||
if (g_has_more_scorers) cout << g_scorer->getName() << "\t";
|
||||
@ -287,10 +285,10 @@ void InitSeed(const ProgramOption *opt)
|
||||
{
|
||||
if (opt->has_seed) {
|
||||
cerr << "Seeding random numbers with " << opt->seed << endl;
|
||||
srandom(opt->seed);
|
||||
util::rand_init(opt->seed);
|
||||
} else {
|
||||
cerr << "Seeding random numbers with system clock " << endl;
|
||||
srandom(time(NULL));
|
||||
util::rand_init();
|
||||
}
|
||||
}
|
||||
|
||||
|
BIN
mert/hgtest/0.gz
Normal file
BIN
mert/hgtest/0.gz
Normal file
Binary file not shown.
@ -40,6 +40,7 @@ de recherches du Canada
|
||||
#include <boost/scoped_ptr.hpp>
|
||||
|
||||
#include "util/exception.hh"
|
||||
#include "util/random.hh"
|
||||
|
||||
#include "BleuScorer.h"
|
||||
#include "HopeFearDecoder.h"
|
||||
@ -122,10 +123,10 @@ int main(int argc, char** argv)
|
||||
|
||||
if (vm.count("random-seed")) {
|
||||
cerr << "Initialising random seed to " << seed << endl;
|
||||
srand(seed);
|
||||
util::rand_init(seed);
|
||||
} else {
|
||||
cerr << "Initialising random seed from system clock" << endl;
|
||||
srand(time(NULL));
|
||||
util::rand_init();
|
||||
}
|
||||
|
||||
// Initialize weights
|
||||
|
@ -24,6 +24,7 @@
|
||||
#include "Types.h"
|
||||
#include "Timer.h"
|
||||
#include "Util.h"
|
||||
#include "util/random.hh"
|
||||
|
||||
#include "moses/ThreadPool.h"
|
||||
|
||||
@ -289,10 +290,10 @@ int main(int argc, char **argv)
|
||||
|
||||
if (option.has_seed) {
|
||||
cerr << "Seeding random numbers with " << option.seed << endl;
|
||||
srandom(option.seed);
|
||||
util::rand_init(option.seed);
|
||||
} else {
|
||||
cerr << "Seeding random numbers with system clock " << endl;
|
||||
srandom(time(NULL));
|
||||
util::rand_init();
|
||||
}
|
||||
|
||||
if (option.sparse_weights_file.size()) ++option.pdim;
|
||||
|
@ -43,6 +43,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#include "ScoreDataIterator.h"
|
||||
#include "BleuScorer.h"
|
||||
#include "Util.h"
|
||||
#include "util/random.hh"
|
||||
|
||||
using namespace std;
|
||||
using namespace MosesTuning;
|
||||
@ -141,10 +142,10 @@ int main(int argc, char** argv)
|
||||
|
||||
if (vm.count("random-seed")) {
|
||||
cerr << "Initialising random seed to " << seed << endl;
|
||||
srand(seed);
|
||||
util::rand_init(seed);
|
||||
} else {
|
||||
cerr << "Initialising random seed from system clock" << endl;
|
||||
srand(time(NULL));
|
||||
util::rand_init();
|
||||
}
|
||||
|
||||
if (scoreFiles.size() == 0 || featureFiles.size() == 0) {
|
||||
@ -211,11 +212,11 @@ int main(int argc, char** argv)
|
||||
vector<float> scores;
|
||||
size_t n_translations = hypotheses.size();
|
||||
for(size_t i=0; i<n_candidates; i++) {
|
||||
size_t rand1 = rand() % n_translations;
|
||||
size_t rand1 = util::rand_excl(n_translations);
|
||||
pair<size_t,size_t> translation1 = hypotheses[rand1];
|
||||
float bleu1 = smoothedSentenceBleu(scoreDataIters[translation1.first]->operator[](translation1.second), bleuSmoothing, smoothBP);
|
||||
|
||||
size_t rand2 = rand() % n_translations;
|
||||
size_t rand2 = util::rand_excl(n_translations);
|
||||
pair<size_t,size_t> translation2 = hypotheses[rand2];
|
||||
float bleu2 = smoothedSentenceBleu(scoreDataIters[translation2.first]->operator[](translation2.second), bleuSmoothing, smoothBP);
|
||||
|
||||
|
@ -45,6 +45,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#include "moses/FF/StatefulFeatureFunction.h"
|
||||
#include "moses/FF/StatelessFeatureFunction.h"
|
||||
#include "moses/TrainingTask.h"
|
||||
#include "util/random.hh"
|
||||
|
||||
#ifdef HAVE_PROTOBUF
|
||||
#include "hypergraph.pb.h"
|
||||
@ -117,7 +118,7 @@ int main(int argc, char** argv)
|
||||
|
||||
|
||||
//initialise random numbers
|
||||
srand(time(NULL));
|
||||
util::rand_init();
|
||||
|
||||
// set up read/writing class
|
||||
IFVERBOSE(1) {
|
||||
|
@ -27,6 +27,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#include <sstream>
|
||||
#include <vector>
|
||||
|
||||
#include "util/random.hh"
|
||||
#include "util/usage.hh"
|
||||
|
||||
#ifdef WIN32
|
||||
@ -91,7 +92,7 @@ SimpleTranslationInterface::SimpleTranslationInterface(const string &mosesIni):
|
||||
exit(1);
|
||||
}
|
||||
|
||||
srand(time(NULL));
|
||||
util::rand_init();
|
||||
|
||||
}
|
||||
|
||||
@ -185,7 +186,7 @@ batch_run()
|
||||
const StaticData& staticData = StaticData::Instance();
|
||||
|
||||
//initialise random numbers
|
||||
srand(time(NULL));
|
||||
util::rand_init();
|
||||
|
||||
IFVERBOSE(1) PrintUserTime("Created input-output object");
|
||||
|
||||
|
@ -13,8 +13,11 @@
|
||||
#include "LexicalReordering.h"
|
||||
#include "SparseReordering.h"
|
||||
|
||||
#include <boost/algorithm/string/predicate.hpp>
|
||||
|
||||
|
||||
using namespace std;
|
||||
using namespace boost::algorithm;
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
@ -57,6 +60,7 @@ const std::string& SparseReorderingFeatureKey::Name (const string& wordListId)
|
||||
|
||||
SparseReordering::SparseReordering(const map<string,string>& config, const LexicalReordering* producer)
|
||||
: m_producer(producer)
|
||||
, m_useWeightMap(false)
|
||||
{
|
||||
static const string kSource= "source";
|
||||
static const string kTarget = "target";
|
||||
@ -80,6 +84,14 @@ SparseReordering::SparseReordering(const map<string,string>& config, const Lexic
|
||||
} else {
|
||||
UTIL_THROW(util::Exception, "Sparse reordering requires source or target, not " << fields[1]);
|
||||
}
|
||||
} else if (fields[0] == "weights") {
|
||||
ReadWeightMap(i->second);
|
||||
m_useWeightMap = true;
|
||||
for (int reoType=0; reoType<=LRModel::MAX; ++reoType) {
|
||||
ostringstream buf;
|
||||
buf << reoType;
|
||||
m_featureMap2.push_back(m_producer->GetFeatureName(buf.str()));
|
||||
}
|
||||
|
||||
} else if (fields[0] == "phrase") {
|
||||
m_usePhrase = true;
|
||||
@ -175,8 +187,17 @@ void SparseReordering::AddFeatures(
|
||||
SparseReorderingFeatureKey key(id, type, wordFactor, false, position, side, reoType);
|
||||
FeatureMap::const_iterator fmi = m_featureMap.find(key);
|
||||
assert(fmi != m_featureMap.end());
|
||||
if (m_useWeightMap) {
|
||||
WeightMap::const_iterator wmi = m_weightMap.find(fmi->second.name());
|
||||
if (wmi != m_weightMap.end()) {
|
||||
if (wmi->second != 0) {
|
||||
scores->SparsePlusEquals(m_featureMap2[reoType], wmi->second);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
scores->SparsePlusEquals(fmi->second, 1.0);
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t id = 0; id < clusterMaps->size(); ++id) {
|
||||
const ClusterMap& clusterMap = (*clusterMaps)[id];
|
||||
@ -186,9 +207,18 @@ void SparseReordering::AddFeatures(
|
||||
SparseReorderingFeatureKey key(id, type, clusterIter->second, true, position, side, reoType);
|
||||
FeatureMap::const_iterator fmi = m_featureMap.find(key);
|
||||
assert(fmi != m_featureMap.end());
|
||||
if (m_useWeightMap) {
|
||||
WeightMap::const_iterator wmi = m_weightMap.find(fmi->second.name());
|
||||
if (wmi != m_weightMap.end()) {
|
||||
if (wmi->second != 0) {
|
||||
scores->SparsePlusEquals(m_featureMap2[reoType], wmi->second);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
scores->SparsePlusEquals(fmi->second, 1.0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -256,5 +286,29 @@ void SparseReordering::CopyScores(
|
||||
|
||||
}
|
||||
|
||||
|
||||
void SparseReordering::ReadWeightMap(const string& filename)
|
||||
{
|
||||
util::FilePiece file(filename.c_str());
|
||||
StringPiece line;
|
||||
while (true) {
|
||||
try {
|
||||
line = file.ReadLine();
|
||||
} catch (const util::EndOfFileException &e) {
|
||||
break;
|
||||
}
|
||||
util::TokenIter<util::SingleCharacter, true> lineIter(line,util::SingleCharacter(' '));
|
||||
UTIL_THROW_IF2(!lineIter, "Malformed weight line: '" << line << "'");
|
||||
const std::string& name = lineIter->as_string();
|
||||
++lineIter;
|
||||
UTIL_THROW_IF2(!lineIter, "Malformed weight line: '" << line << "'");
|
||||
float weight = Moses::Scan<float>(lineIter->as_string());
|
||||
|
||||
std::pair< WeightMap::iterator, bool> inserted = m_weightMap.insert( std::make_pair(name, weight) );
|
||||
UTIL_THROW_IF2(!inserted.second, "Duplicate weight: '" << name << "'");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} //namespace
|
||||
|
||||
|
@ -112,10 +112,16 @@ private:
|
||||
typedef boost::unordered_map<SparseReorderingFeatureKey, FName, HashSparseReorderingFeatureKey, EqualsSparseReorderingFeatureKey> FeatureMap;
|
||||
FeatureMap m_featureMap;
|
||||
|
||||
typedef boost::unordered_map<std::string, float> WeightMap;
|
||||
WeightMap m_weightMap;
|
||||
bool m_useWeightMap;
|
||||
std::vector<FName> m_featureMap2;
|
||||
|
||||
void ReadWordList(const std::string& filename, const std::string& id,
|
||||
SparseReorderingFeatureKey::Side side, std::vector<WordList>* pWordLists);
|
||||
void ReadClusterMap(const std::string& filename, const std::string& id, SparseReorderingFeatureKey::Side side, std::vector<ClusterMap>* pClusterMaps);
|
||||
void PreCalculateFeatureNames(size_t index, const std::string& id, SparseReorderingFeatureKey::Side side, const Factor* factor, bool isCluster);
|
||||
void ReadWeightMap(const std::string& filename);
|
||||
|
||||
void AddFeatures(
|
||||
SparseReorderingFeatureKey::Type type, SparseReorderingFeatureKey::Side side,
|
||||
|
@ -86,6 +86,10 @@ struct VWTargetSentence {
|
||||
int src = it->first;
|
||||
int tgt = it->second;
|
||||
|
||||
if (src >= m_sourceConstraints.size() || tgt >= m_targetConstraints.size()) {
|
||||
UTIL_THROW2("VW :: alignment point out of bounds: " << src << "-" << tgt);
|
||||
}
|
||||
|
||||
m_sourceConstraints[src].Update(tgt);
|
||||
m_targetConstraints[tgt].Update(src);
|
||||
}
|
||||
|
@ -98,6 +98,7 @@ HypergraphOutput<M>::HypergraphOutput(size_t precision) :
|
||||
// If this line gives you compile errors,
|
||||
// contact Lane Schwartz on the Moses mailing list
|
||||
m_hypergraphDir = nbestPath.parent_path().string();
|
||||
if (m_hypergraphDir.empty()) m_hypergraphDir=".";
|
||||
|
||||
} else {
|
||||
stringstream hypergraphDirName;
|
||||
|
@ -1,14 +1,15 @@
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <unistd.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/socket.h>
|
||||
#include <netinet/in.h>
|
||||
#include <arpa/inet.h>
|
||||
#include <netdb.h>
|
||||
#include "Remote.h"
|
||||
#include "moses/Factor.h"
|
||||
|
||||
#if !defined(_WIN32) && !defined(_WIN64)
|
||||
#include <arpa/inet.h>
|
||||
#endif
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
@ -41,12 +42,16 @@ bool LanguageModelRemote::start(const std::string& host, int port)
|
||||
sock = socket(AF_INET, SOCK_STREAM, 0);
|
||||
hp = gethostbyname(host.c_str());
|
||||
if (hp==NULL) {
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
fprintf(stderr, "gethostbyname failed\n");
|
||||
#else
|
||||
herror("gethostbyname failed");
|
||||
#endif
|
||||
exit(1);
|
||||
}
|
||||
|
||||
bzero((char *)&server, sizeof(server));
|
||||
bcopy(hp->h_addr, (char *)&server.sin_addr, hp->h_length);
|
||||
memset(&server, '\0', sizeof(server));
|
||||
memcpy((char *)&server.sin_addr, hp->h_addr, hp->h_length);
|
||||
server.sin_family = hp->h_addrtype;
|
||||
server.sin_port = htons(port);
|
||||
|
||||
|
@ -4,9 +4,15 @@
|
||||
#include "SingleFactor.h"
|
||||
#include "moses/TypeDef.h"
|
||||
#include "moses/Factor.h"
|
||||
#include <sys/socket.h>
|
||||
#include <sys/types.h>
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
#include <winsock2.h>
|
||||
#else
|
||||
#include <sys/socket.h>
|
||||
#include <netinet/in.h>
|
||||
#include <netdb.h>
|
||||
#endif
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
@ -55,6 +55,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#endif
|
||||
|
||||
#include "util/exception.hh"
|
||||
#include "util/random.hh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -426,7 +427,7 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const
|
||||
//cerr << endl;
|
||||
|
||||
//draw the sample
|
||||
float frandom = log((float)rand()/RAND_MAX);
|
||||
const float frandom = log(util::rand_incl(0.0f, 1.0f));
|
||||
size_t position = 1;
|
||||
float sum = candidateScores[0];
|
||||
for (; position < candidateScores.size() && sum < frandom; ++position) {
|
||||
@ -1645,7 +1646,7 @@ void Manager::OutputNBest(std::ostream& out
|
||||
out << " |||";
|
||||
|
||||
// print scores with feature names
|
||||
path.GetScoreBreakdown().OutputAllFeatureScores(out );
|
||||
path.GetScoreBreakdown()->OutputAllFeatureScores(out);
|
||||
|
||||
// total
|
||||
out << " ||| " << path.GetTotalScore();
|
||||
|
@ -31,6 +31,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#include "InputFileStream.h"
|
||||
#include "StaticData.h"
|
||||
#include "util/exception.hh"
|
||||
#include "util/random.hh"
|
||||
#include <boost/program_options.hpp>
|
||||
|
||||
|
||||
@ -1393,7 +1394,7 @@ struct Credit {
|
||||
this->contact = contact ;
|
||||
this->currentPursuits = currentPursuits ;
|
||||
this->areaResponsibility = areaResponsibility;
|
||||
this->sortId = rand() % 1000;
|
||||
this->sortId = util::rand_excl(1000);
|
||||
}
|
||||
|
||||
bool operator<(const Credit &other) const {
|
||||
|
@ -40,12 +40,12 @@ bool HyperTreeLoader::Load(const std::vector<FactorType> &input,
|
||||
const std::vector<FactorType> &output,
|
||||
const std::string &inFile,
|
||||
const RuleTableFF &ff,
|
||||
HyperTree &trie)
|
||||
HyperTree &trie,
|
||||
boost::unordered_set<std::size_t> &sourceTermSet)
|
||||
{
|
||||
PrintUserTime(std::string("Start loading HyperTree"));
|
||||
|
||||
// const StaticData &staticData = StaticData::Instance();
|
||||
// const std::string &factorDelimiter = staticData.GetFactorDelimiter();
|
||||
sourceTermSet.clear();
|
||||
|
||||
std::size_t count = 0;
|
||||
|
||||
@ -106,6 +106,7 @@ bool HyperTreeLoader::Load(const std::vector<FactorType> &input,
|
||||
// Source-side
|
||||
HyperPath sourceFragment;
|
||||
hyperPathLoader.Load(sourceString, sourceFragment);
|
||||
ExtractSourceTerminalSetFromHyperPath(sourceFragment, sourceTermSet);
|
||||
|
||||
// Target-side
|
||||
TargetPhrase *targetPhrase = new TargetPhrase(&ff);
|
||||
@ -144,6 +145,23 @@ bool HyperTreeLoader::Load(const std::vector<FactorType> &input,
|
||||
return true;
|
||||
}
|
||||
|
||||
void HyperTreeLoader::ExtractSourceTerminalSetFromHyperPath(
|
||||
const HyperPath &hp, boost::unordered_set<std::size_t> &sourceTerminalSet)
|
||||
{
|
||||
for (std::vector<HyperPath::NodeSeq>::const_iterator p = hp.nodeSeqs.begin();
|
||||
p != hp.nodeSeqs.end(); ++p) {
|
||||
for (std::vector<std::size_t>::const_iterator q = p->begin();
|
||||
q != p->end(); ++q) {
|
||||
const std::size_t factorId = *q;
|
||||
if (factorId >= moses_MaxNumNonterminals &&
|
||||
factorId != HyperPath::kComma &&
|
||||
factorId != HyperPath::kEpsilon) {
|
||||
sourceTerminalSet.insert(factorId);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace F2S
|
||||
} // namespace Syntax
|
||||
} // namespace Moses
|
||||
|
@ -3,9 +3,12 @@
|
||||
#include <istream>
|
||||
#include <vector>
|
||||
|
||||
#include <boost/unordered_set.hpp>
|
||||
|
||||
#include "moses/TypeDef.h"
|
||||
#include "moses/Syntax/RuleTableFF.h"
|
||||
|
||||
#include "HyperPath.h"
|
||||
#include "HyperTree.h"
|
||||
#include "HyperTreeCreator.h"
|
||||
|
||||
@ -23,7 +26,12 @@ public:
|
||||
const std::vector<FactorType> &output,
|
||||
const std::string &inFile,
|
||||
const RuleTableFF &,
|
||||
HyperTree &);
|
||||
HyperTree &,
|
||||
boost::unordered_set<std::size_t> &);
|
||||
|
||||
private:
|
||||
void ExtractSourceTerminalSetFromHyperPath(
|
||||
const HyperPath &, boost::unordered_set<std::size_t> &);
|
||||
};
|
||||
|
||||
} // namespace F2S
|
||||
|
@ -39,6 +39,7 @@ Manager<RuleMatcher>::Manager(ttasksptr const& ttask)
|
||||
if (const ForestInput *p = dynamic_cast<const ForestInput*>(&m_source)) {
|
||||
m_forest = p->GetForest();
|
||||
m_rootVertex = p->GetRootVertex();
|
||||
m_sentenceLength = p->GetSize();
|
||||
} else if (const TreeInput *p = dynamic_cast<const TreeInput*>(&m_source)) {
|
||||
T2S::InputTreeBuilder builder;
|
||||
T2S::InputTree tmpTree;
|
||||
@ -46,6 +47,7 @@ Manager<RuleMatcher>::Manager(ttasksptr const& ttask)
|
||||
boost::shared_ptr<Forest> forest = boost::make_shared<Forest>();
|
||||
m_rootVertex = T2S::InputTreeToForest(tmpTree, *forest);
|
||||
m_forest = forest;
|
||||
m_sentenceLength = p->GetSize();
|
||||
} else {
|
||||
UTIL_THROW2("ERROR: F2S::Manager requires input to be a tree or forest");
|
||||
}
|
||||
@ -83,8 +85,13 @@ void Manager<RuleMatcher>::Decode()
|
||||
p = sortedVertices.begin(); p != sortedVertices.end(); ++p) {
|
||||
const Forest::Vertex &vertex = **p;
|
||||
|
||||
// Skip terminal vertices.
|
||||
// Skip terminal vertices (after checking if they are OOVs).
|
||||
if (vertex.incoming.empty()) {
|
||||
if (vertex.pvertex.span.GetStartPos() > 0 &&
|
||||
vertex.pvertex.span.GetEndPos() < m_sentenceLength-1 &&
|
||||
IsUnknownSourceWord(vertex.pvertex.symbol)) {
|
||||
m_oovs.insert(vertex.pvertex.symbol);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -190,6 +197,21 @@ void Manager<RuleMatcher>::InitializeStacks()
|
||||
}
|
||||
}
|
||||
|
||||
template<typename RuleMatcher>
|
||||
bool Manager<RuleMatcher>::IsUnknownSourceWord(const Word &w) const
|
||||
{
|
||||
const std::size_t factorId = w[0]->GetId();
|
||||
const std::vector<RuleTableFF*> &ffs = RuleTableFF::Instances();
|
||||
for (std::size_t i = 0; i < ffs.size(); ++i) {
|
||||
RuleTableFF *ff = ffs[i];
|
||||
const boost::unordered_set<std::size_t> &sourceTerms =
|
||||
ff->GetSourceTerminalSet();
|
||||
if (sourceTerms.find(factorId) != sourceTerms.end()) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template<typename RuleMatcher>
|
||||
const SHyperedge *Manager<RuleMatcher>::GetBestSHyperedge() const
|
||||
|
@ -50,10 +50,13 @@ private:
|
||||
|
||||
void InitializeStacks();
|
||||
|
||||
bool IsUnknownSourceWord(const Word &) const;
|
||||
|
||||
void RecombineAndSort(const std::vector<SHyperedge*> &, SVertexStack &);
|
||||
|
||||
boost::shared_ptr<const Forest> m_forest;
|
||||
const Forest::Vertex *m_rootVertex;
|
||||
std::size_t m_sentenceLength; // Includes <s> and </s>
|
||||
PVertexToStackMap m_stackMap;
|
||||
boost::shared_ptr<HyperTree> m_glueRuleTrie;
|
||||
std::vector<boost::shared_ptr<RuleMatcher> > m_mainRuleMatchers;
|
||||
|
@ -35,7 +35,8 @@ void RuleTableFF::Load()
|
||||
staticData.GetSearchAlgorithm() == SyntaxT2S) {
|
||||
F2S::HyperTree *trie = new F2S::HyperTree(this);
|
||||
F2S::HyperTreeLoader loader;
|
||||
loader.Load(m_input, m_output, m_filePath, *this, *trie);
|
||||
loader.Load(m_input, m_output, m_filePath, *this, *trie,
|
||||
m_sourceTerminalSet);
|
||||
m_table = trie;
|
||||
} else if (staticData.GetSearchAlgorithm() == SyntaxS2T) {
|
||||
S2TParsingAlgorithm algorithm = staticData.GetS2TParsingAlgorithm();
|
||||
|
@ -43,10 +43,17 @@ public:
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Get the source terminal vocabulary for this table's grammar (as a set of
|
||||
// factor IDs)
|
||||
const boost::unordered_set<std::size_t> &GetSourceTerminalSet() const {
|
||||
return m_sourceTerminalSet;
|
||||
}
|
||||
|
||||
private:
|
||||
static std::vector<RuleTableFF*> s_instances;
|
||||
|
||||
const RuleTable *m_table;
|
||||
boost::unordered_set<std::size_t> m_sourceTerminalSet;
|
||||
};
|
||||
|
||||
} // Syntax
|
||||
|
@ -24,14 +24,18 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
|
||||
#include <limits>
|
||||
#include <iostream>
|
||||
#include <sys/mman.h>
|
||||
#include <cstdio>
|
||||
#include <unistd.h>
|
||||
|
||||
#ifndef __MMAN_PAGE_SIZE__
|
||||
#define __MMAN_PAGE_SIZE__ sysconf(_SC_PAGE_SIZE)
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
#include <windows.h>
|
||||
#include <io.h>
|
||||
#else
|
||||
#include <sys/mman.h>
|
||||
#endif
|
||||
|
||||
#include "util/mmap.hh"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
template <class T>
|
||||
@ -60,25 +64,25 @@ public:
|
||||
|
||||
MmapAllocator() throw()
|
||||
: m_file_ptr(std::tmpfile()), m_file_desc(fileno(m_file_ptr)),
|
||||
m_page_size(__MMAN_PAGE_SIZE__), m_map_size(0), m_data_ptr(0),
|
||||
m_page_size(util::SizePage()), m_map_size(0), m_data_ptr(0),
|
||||
m_data_offset(0), m_fixed(false), m_count(new size_t(0)) {
|
||||
}
|
||||
|
||||
MmapAllocator(std::FILE* f_ptr) throw()
|
||||
: m_file_ptr(f_ptr), m_file_desc(fileno(m_file_ptr)),
|
||||
m_page_size(__MMAN_PAGE_SIZE__), m_map_size(0), m_data_ptr(0),
|
||||
m_page_size(util::SizePage()), m_map_size(0), m_data_ptr(0),
|
||||
m_data_offset(0), m_fixed(false), m_count(new size_t(0)) {
|
||||
}
|
||||
|
||||
MmapAllocator(std::FILE* f_ptr, size_t data_offset) throw()
|
||||
: m_file_ptr(f_ptr), m_file_desc(fileno(m_file_ptr)),
|
||||
m_page_size(__MMAN_PAGE_SIZE__), m_map_size(0), m_data_ptr(0),
|
||||
m_page_size(util::SizePage()), m_map_size(0), m_data_ptr(0),
|
||||
m_data_offset(data_offset), m_fixed(true), m_count(new size_t(0)) {
|
||||
}
|
||||
|
||||
MmapAllocator(std::string fileName) throw()
|
||||
: m_file_ptr(std::fopen(fileName.c_str(), "wb+")), m_file_desc(fileno(m_file_ptr)),
|
||||
m_page_size(__MMAN_PAGE_SIZE__), m_map_size(0), m_data_ptr(0),
|
||||
m_page_size(util::SizePage()), m_map_size(0), m_data_ptr(0),
|
||||
m_data_offset(0), m_fixed(false), m_count(new size_t(0)) {
|
||||
}
|
||||
|
||||
@ -92,7 +96,7 @@ public:
|
||||
|
||||
~MmapAllocator() throw() {
|
||||
if(m_data_ptr && *m_count == 0) {
|
||||
munmap(m_data_ptr, m_map_size);
|
||||
util::UnmapOrThrow(m_data_ptr, m_map_size);
|
||||
if(!m_fixed && std::ftell(m_file_ptr) != -1)
|
||||
std::fclose(m_file_ptr);
|
||||
}
|
||||
@ -119,13 +123,17 @@ public:
|
||||
pointer allocate (size_type num, const void* = 0) {
|
||||
m_map_size = num * sizeof(T);
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
// On Windows, MAP_SHARED is not defined and MapOrThrow ignores the flags.
|
||||
const int map_shared = 0;
|
||||
#else
|
||||
const int map_shared = MAP_SHARED;
|
||||
#endif
|
||||
if(!m_fixed) {
|
||||
size_t read = 0;
|
||||
read += ftruncate(m_file_desc, m_map_size);
|
||||
m_data_ptr = (char*)mmap(0, m_map_size, PROT_READ|PROT_WRITE, MAP_SHARED,
|
||||
m_file_desc, 0);
|
||||
if(m_data_ptr == MAP_FAILED)
|
||||
std::cerr << "Error: mmapping" << std::endl;
|
||||
m_data_ptr = (char *)util::MapOrThrow(
|
||||
m_map_size, true, map_shared, false, m_file_desc, 0);
|
||||
return (pointer)m_data_ptr;
|
||||
} else {
|
||||
size_t map_offset = (m_data_offset / m_page_size) * m_page_size;
|
||||
@ -133,8 +141,8 @@ public:
|
||||
|
||||
size_t map_size = m_map_size + relative_offset;
|
||||
|
||||
m_data_ptr = (char*)mmap(0, map_size, PROT_READ, MAP_SHARED,
|
||||
m_file_desc, map_offset);
|
||||
m_data_ptr = (char *)util::MapOrThrow(
|
||||
m_map_size, false, map_shared, false, m_file_desc, map_offset);
|
||||
|
||||
return (pointer)(m_data_ptr + relative_offset);
|
||||
}
|
||||
@ -142,11 +150,11 @@ public:
|
||||
|
||||
void deallocate (pointer p, size_type num) {
|
||||
if(!m_fixed) {
|
||||
munmap(p, num * sizeof(T));
|
||||
util::UnmapOrThrow(p, num * sizeof(T));
|
||||
} else {
|
||||
size_t map_offset = (m_data_offset / m_page_size) * m_page_size;
|
||||
size_t relative_offset = m_data_offset - map_offset;
|
||||
munmap((pointer)((char*)p - relative_offset), num * sizeof(T));
|
||||
util::UnmapOrThrow((pointer)((char*)p - relative_offset), num * sizeof(T));
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,7 +1,9 @@
|
||||
#include "FileHandler.h"
|
||||
#include <cstdio>
|
||||
|
||||
#ifdef WIN32
|
||||
// Workaround: plain Windows does not have popen()/pclose().
|
||||
// (MinGW already #define's them, so skip the workaround there.)
|
||||
#if defined(WIN32) && !defined(__MINGW32__)
|
||||
#define popen(A, B) _popen(A, B)
|
||||
#define pclose(A) _pclose(A)
|
||||
#endif
|
||||
|
@ -6,6 +6,7 @@
|
||||
#include "utils.h"
|
||||
#include "FileHandler.h"
|
||||
#include "util/exception.hh"
|
||||
#include "util/random.hh"
|
||||
|
||||
using namespace Moses;
|
||||
typedef uint64_t P; // largest input range is 2^64
|
||||
@ -162,7 +163,7 @@ void Hash_shiftAddXOR<T>::initSeeds()
|
||||
{
|
||||
v_ = new T[this->H_];
|
||||
for(count_t i=0; i < this->H_; i++)
|
||||
v_[i] = Utils::rand<T>() + 1;
|
||||
v_[i] = util::wide_rand<T>() + 1;
|
||||
}
|
||||
template <typename T>
|
||||
T Hash_shiftAddXOR<T>::hash(const char* s, count_t h)
|
||||
@ -187,9 +188,8 @@ void UnivHash_tableXOR<T>::initSeeds()
|
||||
// fill with random values
|
||||
for(count_t j=0; j < this->H_; j++) {
|
||||
table_[j] = new T[tblLen_];
|
||||
for(count_t i=0; i < tblLen_; i++) {
|
||||
table_[j][i] = Utils::rand<T>(this->m_-1);
|
||||
}
|
||||
for(count_t i=0; i < tblLen_; i++)
|
||||
table_[j][i] = util::wide_rand_excl(this->m_-1);
|
||||
}
|
||||
}
|
||||
template <typename T>
|
||||
@ -218,7 +218,7 @@ void UnivHash_noPrimes<T>::initSeeds()
|
||||
{
|
||||
a_ = new P[this->H_];
|
||||
for(T i=0; i < this->H_; i++) {
|
||||
a_[i] = Utils::rand<P>();
|
||||
a_[i] = util::wide_rand<P>();
|
||||
if(a_[i] % 2 == 0) a_[i]++; // a must be odd
|
||||
}
|
||||
}
|
||||
@ -284,8 +284,8 @@ void UnivHash_linear<T>::initSeeds()
|
||||
a_[i] = new T[MAX_NGRAM_ORDER];
|
||||
b_[i] = new T[MAX_NGRAM_ORDER];
|
||||
for(count_t j=0; j < MAX_NGRAM_ORDER; j++) {
|
||||
a_[i][j] = 1 + Utils::rand<T>();
|
||||
b_[i][j] = Utils::rand<T>();
|
||||
a_[i][j] = 1 + util::wide_rand<T>();
|
||||
b_[i][j] = util::wide_rand<T>();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -302,7 +302,8 @@ float OnlineRLM<T>::getProb(const wordID_t* ngram, int len,
|
||||
}
|
||||
while(num_fnd > 1) { // get lower order count
|
||||
//get sub-context of size one less than length found (excluding target)
|
||||
if(((den_val = query(&ngram[len - num_fnd], num_fnd - 1)) > 0) &&
|
||||
den_val = query(&ngram[len - num_fnd], num_fnd - 1);
|
||||
if((den_val > 0) &&
|
||||
(den_val >= in[len - num_fnd]) && (in[len - num_fnd] > 0)) {
|
||||
break;
|
||||
} else --num_fnd; // else backoff to lower ngram order
|
||||
|
@ -62,22 +62,6 @@ public:
|
||||
str[i] = tolower(str[i]);
|
||||
}
|
||||
}
|
||||
// TODO: interface with decent PRG
|
||||
template<typename T>
|
||||
static T rand(T mod_bnd = 0) {
|
||||
T random = 0;
|
||||
if(sizeof(T) <= 4) {
|
||||
random = static_cast<T>(std::rand());
|
||||
} else if(sizeof(T) == 8) {
|
||||
random = static_cast<T>(std::rand());
|
||||
random <<= 31;
|
||||
random <<= 1;
|
||||
random |= static_cast<T>(std::rand());
|
||||
}
|
||||
if(mod_bnd != 0)
|
||||
return random % mod_bnd;
|
||||
else return random;
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
|
@ -1,4 +1,6 @@
|
||||
#include "DynSuffixArray.h"
|
||||
#include "util/random.hh"
|
||||
|
||||
#include <iostream>
|
||||
#include <boost/foreach.hpp>
|
||||
|
||||
@ -315,33 +317,31 @@ int DynSuffixArray::Compare(int pos1, int pos2, int max)
|
||||
return 0;
|
||||
}
|
||||
|
||||
namespace
{
/// Helper: exchange the values held at positions `one` and `other` of an
/// int array. Passing the same position twice leaves the array unchanged.
inline void swap_ints(int array[], int one, int other)
{
  int &first = array[one];
  int &second = array[other];
  const int saved = first;
  first = second;
  second = saved;
}
}
|
||||
|
||||
void DynSuffixArray::Qsort(int* array, int begin, int end)
|
||||
{
|
||||
if(end > begin) {
|
||||
int index;
|
||||
int index = util::rand_incl(begin, end);
|
||||
{
|
||||
index = begin + (rand() % (end - begin + 1));
|
||||
int pivot = array[index];
|
||||
{
|
||||
int tmp = array[index];
|
||||
array[index] = array[end];
|
||||
array[end] = tmp;
|
||||
}
|
||||
const int pivot = array[index];
|
||||
swap_ints(array, index, end);
|
||||
for(int i=index=begin; i < end; ++i) {
|
||||
if (Compare(array[i], pivot, 20) <= 0) {
|
||||
{
|
||||
int tmp = array[index];
|
||||
array[index] = array[i];
|
||||
array[i] = tmp;
|
||||
swap_ints(array, index, i);
|
||||
index++;
|
||||
}
|
||||
}
|
||||
}
|
||||
{
|
||||
int tmp = array[index];
|
||||
array[index] = array[end];
|
||||
array[end] = tmp;
|
||||
}
|
||||
swap_ints(array, index, end);
|
||||
}
|
||||
Qsort(array, begin, index - 1);
|
||||
Qsort(array, index + 1, end);
|
||||
|
@ -17,6 +17,7 @@ License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
***********************************************************************/
|
||||
#include "util/exception.hh"
|
||||
#include "util/tokenize.hh"
|
||||
#include "moses/TranslationModel/PhraseDictionaryMultiModelCounts.h"
|
||||
|
||||
using namespace std;
|
||||
@ -30,29 +31,6 @@ void OutputVec(const vector<T> &vec)
|
||||
cerr << endl;
|
||||
}
|
||||
|
||||
// from phrase-extract/tables-core.cpp
|
||||
inline vector<string> tokenize( const char* input )
|
||||
{
|
||||
vector< string > token;
|
||||
bool betweenWords = true;
|
||||
int start=0;
|
||||
int i=0;
|
||||
for(; input[i] != '\0'; i++) {
|
||||
bool isSpace = (input[i] == ' ' || input[i] == '\t');
|
||||
|
||||
if (!isSpace && betweenWords) {
|
||||
start = i;
|
||||
betweenWords = false;
|
||||
} else if (isSpace && !betweenWords) {
|
||||
token.push_back( string( input+start, i-start ) );
|
||||
betweenWords = true;
|
||||
}
|
||||
}
|
||||
if (!betweenWords)
|
||||
token.push_back( string( input+start, i-start ) );
|
||||
return token;
|
||||
}
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
@ -464,7 +442,7 @@ void PhraseDictionaryMultiModelCounts::LoadLexicalTable( string &fileName, lexic
|
||||
i++;
|
||||
if (i%100000 == 0) cerr << "." << flush;
|
||||
|
||||
vector<string> token = tokenize( line.c_str() );
|
||||
const vector<string> token = util::tokenize( line );
|
||||
if (token.size() != 4) {
|
||||
cerr << "line " << i << " in " << fileName
|
||||
<< " has wrong number of tokens, skipping:\n"
|
||||
|
@ -1,11 +1,11 @@
|
||||
// vim:tabstop=2
|
||||
#include <cstdlib>
|
||||
#include <boost/filesystem.hpp>
|
||||
|
||||
#include "PhraseDictionaryTransliteration.h"
|
||||
#include "moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.h"
|
||||
#include "moses/DecodeGraph.h"
|
||||
#include "moses/DecodeStep.h"
|
||||
#include "util/tempfile.hh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -70,11 +70,10 @@ void PhraseDictionaryTransliteration::GetTargetPhraseCollection(InputPath &input
|
||||
inputPath.SetTargetPhrases(*this, tpColl, NULL);
|
||||
} else {
|
||||
// TRANSLITERATE
|
||||
const boost::filesystem::path
|
||||
inFile = boost::filesystem::unique_path(),
|
||||
outDir = boost::filesystem::unique_path();
|
||||
const util::temp_file inFile;
|
||||
const util::temp_dir outDir;
|
||||
|
||||
ofstream inStream(inFile.c_str());
|
||||
ofstream inStream(inFile.path().c_str());
|
||||
inStream << sourcePhrase.ToString() << endl;
|
||||
inStream.close();
|
||||
|
||||
@ -84,14 +83,14 @@ void PhraseDictionaryTransliteration::GetTargetPhraseCollection(InputPath &input
|
||||
" --external-bin-dir " + m_externalDir +
|
||||
" --input-extension " + m_inputLang +
|
||||
" --output-extension " + m_outputLang +
|
||||
" --oov-file " + inFile.native() +
|
||||
" --out-dir " + outDir.native();
|
||||
" --oov-file " + inFile.path() +
|
||||
" --out-dir " + outDir.path();
|
||||
|
||||
int ret = system(cmd.c_str());
|
||||
UTIL_THROW_IF2(ret != 0, "Transliteration script error");
|
||||
|
||||
TargetPhraseCollection *tpColl = new TargetPhraseCollection();
|
||||
vector<TargetPhrase*> targetPhrases = CreateTargetPhrases(sourcePhrase, outDir.native());
|
||||
vector<TargetPhrase*> targetPhrases = CreateTargetPhrases(sourcePhrase, outDir.path());
|
||||
vector<TargetPhrase*>::const_iterator iter;
|
||||
for (iter = targetPhrases.begin(); iter != targetPhrases.end(); ++iter) {
|
||||
TargetPhrase *tp = *iter;
|
||||
@ -102,10 +101,6 @@ void PhraseDictionaryTransliteration::GetTargetPhraseCollection(InputPath &input
|
||||
cache[hash] = value;
|
||||
|
||||
inputPath.SetTargetPhrases(*this, tpColl, NULL);
|
||||
|
||||
// clean up temporary files
|
||||
remove(inFile.c_str());
|
||||
boost::filesystem::remove_all(outDir);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -45,6 +45,7 @@
|
||||
#include "moses/TranslationModel/fuzzy-match/SentenceAlignment.h"
|
||||
#include "util/file.hh"
|
||||
#include "util/exception.hh"
|
||||
#include "util/random.hh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -62,8 +63,8 @@ char *mkdtemp(char *tempbuf)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
srand((unsigned)time(0));
|
||||
rand_value = (int)((rand() / ((double)RAND_MAX+1.0)) * 1e6);
|
||||
util::rand_init();
|
||||
rand_value = util::rand_excl(1e6);
|
||||
tempbase = strrchr(tempbuf, '/');
|
||||
tempbase = tempbase ? tempbase+1 : tempbuf;
|
||||
strcpy(tempbasebuf, tempbase);
|
||||
@ -130,10 +131,6 @@ int removedirectoryrecursively(const char *dirname)
|
||||
struct dirent *entry;
|
||||
char path[PATH_MAX];
|
||||
|
||||
if (path == NULL) {
|
||||
fprintf(stderr, "Out of memory error\n");
|
||||
return 0;
|
||||
}
|
||||
dir = opendir(dirname);
|
||||
if (dir == NULL) {
|
||||
perror("Error opendir()");
|
||||
|
@ -2,6 +2,9 @@
|
||||
#define __sampling_h
|
||||
#include <boost/dynamic_bitset.hpp>
|
||||
#include <vector>
|
||||
|
||||
#include "util/random.hh"
|
||||
|
||||
// Utility functions for proper sub-sampling.
|
||||
// (c) 2007-2012 Ulrich Germann
|
||||
|
||||
@ -9,12 +12,6 @@
|
||||
namespace Moses
|
||||
{
|
||||
using namespace std;
|
||||
inline
|
||||
size_t
|
||||
randInt(size_t N)
|
||||
{
|
||||
return N*(rand()/(RAND_MAX+1.));
|
||||
}
|
||||
|
||||
// select a random sample of size /s/ without restitution from the range of
|
||||
// integers [0,N);
|
||||
@ -35,15 +32,15 @@ randomSample(vector<idx_t>& v, size_t s, size_t N)
|
||||
if (s*10<N) {
|
||||
boost::dynamic_bitset<uint64_t> check(N,0);
|
||||
for (size_t i = 0; i < v.size(); i++) {
|
||||
size_t x = randInt(N);
|
||||
while (check[x]) x = randInt(N);
|
||||
size_t x = util::rand_excl(N);
|
||||
while (check[x]) x = util::rand_excl(N);
|
||||
check[x]=true;
|
||||
v[i] = x;
|
||||
}
|
||||
} else {
|
||||
size_t m=0;
|
||||
for (size_t t = 0; m <= s && t < N; t++)
|
||||
if (s==N || randInt(N-t) < s-m) v[m++] = t;
|
||||
if (s==N || util::rand_excl(N-t) < s-m) v[m++] = t;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -345,7 +345,7 @@
|
||||
// {
|
||||
// boost::lock_guard<boost::mutex> lock(stats->lock);
|
||||
// if (stats->raw_cnt == ctr) ++stats->raw_cnt;
|
||||
// size_t rnum = randInt(stats->raw_cnt - ctr++);
|
||||
// size_t rnum = util::rand_excl(stats->raw_cnt - ctr++);
|
||||
// // cout << stats->raw_cnt << " " << ctr-1 << " "
|
||||
// // << rnum << " " << max_samples - stats->good << endl;
|
||||
// if (rnum < max_samples - stats->good)
|
||||
|
@ -69,7 +69,7 @@ namespace ugdiss
|
||||
// while (chosen < samplesize && next < stop)
|
||||
// {
|
||||
// root->readEntry(next,*this);
|
||||
// if (randInt(N - sampled++) < samplesize - chosen)
|
||||
// if (util::rand_excl(N - sampled++) < samplesize - chosen)
|
||||
// {
|
||||
// ++chosen;
|
||||
// return true;
|
||||
|
@ -9,6 +9,7 @@
|
||||
#include <iostream>
|
||||
#include "util/exception.hh"
|
||||
#include "moses/Util.h"
|
||||
#include "util/random.hh"
|
||||
//#include <cassert>
|
||||
|
||||
// #include "ug_bv_iter.h"
|
||||
@ -896,13 +897,6 @@ namespace ugdiss
|
||||
return bv;
|
||||
}
|
||||
|
||||
inline
|
||||
size_t
|
||||
randInt(size_t N)
|
||||
{
|
||||
return size_t(N*(rand()/(RAND_MAX+1.)));
|
||||
}
|
||||
|
||||
/// randomly select up to N occurrences of the sequence
|
||||
template<typename Token>
|
||||
sptr<vector<typename ttrack::Position> >
|
||||
@ -924,8 +918,8 @@ namespace ugdiss
|
||||
root->readEntry(I.next,I);
|
||||
|
||||
// t: expected number of remaining samples
|
||||
double t = (stop - I.pos)/root->aveIndexEntrySize();
|
||||
double r = t*rand()/(RAND_MAX+1.);
|
||||
const double t = (stop - I.pos)/root->aveIndexEntrySize();
|
||||
const double r = util::rand_excl(t);
|
||||
if (r < N-m)
|
||||
{
|
||||
ret->at(m).offset = I.offset;
|
||||
|
@ -16,7 +16,7 @@ namespace Moses
|
||||
{
|
||||
using namespace bitext;
|
||||
using namespace std;
|
||||
// using namespace boost;
|
||||
using namespace boost;
|
||||
|
||||
void
|
||||
fillIdSeq(Phrase const& mophrase, size_t const ifactor,
|
||||
@ -155,6 +155,10 @@ namespace Moses
|
||||
input_factor = atoi(param.insert(dflt).first->second.c_str());
|
||||
// shouldn't that be a string?
|
||||
|
||||
dflt = pair<string,string> ("output-factor","0");
|
||||
output_factor = atoi(param.insert(dflt).first->second.c_str());
|
||||
ofactor.assign(1,output_factor);
|
||||
|
||||
dflt = pair<string,string> ("smooth",".01");
|
||||
m_lbop_conf = atof(param.insert(dflt).first->second.c_str());
|
||||
|
||||
|
@ -31,7 +31,6 @@ namespace Moses
|
||||
TrellisPath::TrellisPath(const Hypothesis *hypo)
|
||||
: m_prevEdgeChanged(NOT_FOUND)
|
||||
{
|
||||
m_scoreBreakdown = hypo->GetScoreBreakdown();
|
||||
m_totalScore = hypo->GetTotalScore();
|
||||
|
||||
// enumerate path using prevHypo
|
||||
@ -41,10 +40,9 @@ TrellisPath::TrellisPath(const Hypothesis *hypo)
|
||||
}
|
||||
}
|
||||
|
||||
void TrellisPath::InitScore()
|
||||
void TrellisPath::InitTotalScore()
|
||||
{
|
||||
m_totalScore = m_path[0]->GetWinningHypo()->GetTotalScore();
|
||||
m_scoreBreakdown= m_path[0]->GetWinningHypo()->GetScoreBreakdown();
|
||||
|
||||
//calc score
|
||||
size_t sizePath = m_path.size();
|
||||
@ -53,12 +51,8 @@ void TrellisPath::InitScore()
|
||||
const Hypothesis *winningHypo = hypo->GetWinningHypo();
|
||||
if (hypo != winningHypo) {
|
||||
m_totalScore = m_totalScore - winningHypo->GetTotalScore() + hypo->GetTotalScore();
|
||||
m_scoreBreakdown.MinusEquals(winningHypo->GetScoreBreakdown());
|
||||
m_scoreBreakdown.PlusEquals(hypo->GetScoreBreakdown());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
TrellisPath::TrellisPath(const TrellisPath &copy, size_t edgeIndex, const Hypothesis *arc)
|
||||
@ -80,7 +74,7 @@ TrellisPath::TrellisPath(const TrellisPath &copy, size_t edgeIndex, const Hypoth
|
||||
prevHypo = prevHypo->GetPrevHypo();
|
||||
}
|
||||
|
||||
InitScore();
|
||||
InitTotalScore();
|
||||
}
|
||||
|
||||
TrellisPath::TrellisPath(const vector<const Hypothesis*> edges)
|
||||
@ -88,9 +82,7 @@ TrellisPath::TrellisPath(const vector<const Hypothesis*> edges)
|
||||
{
|
||||
m_path.resize(edges.size());
|
||||
copy(edges.rbegin(),edges.rend(),m_path.begin());
|
||||
InitScore();
|
||||
|
||||
|
||||
InitTotalScore();
|
||||
}
|
||||
|
||||
|
||||
@ -172,6 +164,32 @@ void TrellisPath::CreateDeviantPaths(TrellisPathList &pathColl) const
|
||||
}
|
||||
}
|
||||
|
||||
const boost::shared_ptr<ScoreComponentCollection> TrellisPath::GetScoreBreakdown() const
|
||||
{
|
||||
if (!m_scoreBreakdown) {
|
||||
float totalScore = m_path[0]->GetWinningHypo()->GetTotalScore(); // calculated for sanity check only
|
||||
|
||||
m_scoreBreakdown = boost::shared_ptr<ScoreComponentCollection>(new ScoreComponentCollection());
|
||||
m_scoreBreakdown->PlusEquals(ScoreComponentCollection(m_path[0]->GetWinningHypo()->GetScoreBreakdown()));
|
||||
|
||||
//calc score
|
||||
size_t sizePath = m_path.size();
|
||||
for (size_t pos = 0 ; pos < sizePath ; pos++) {
|
||||
const Hypothesis *hypo = m_path[pos];
|
||||
const Hypothesis *winningHypo = hypo->GetWinningHypo();
|
||||
if (hypo != winningHypo) {
|
||||
totalScore = totalScore - winningHypo->GetTotalScore() + hypo->GetTotalScore();
|
||||
m_scoreBreakdown->MinusEquals(winningHypo->GetScoreBreakdown());
|
||||
m_scoreBreakdown->PlusEquals(hypo->GetScoreBreakdown());
|
||||
}
|
||||
}
|
||||
|
||||
assert(totalScore == m_totalScore);
|
||||
}
|
||||
|
||||
return m_scoreBreakdown;
|
||||
}
|
||||
|
||||
Phrase TrellisPath::GetTargetPhrase() const
|
||||
{
|
||||
Phrase targetPhrase(ARRAY_SIZE_INCR);
|
||||
|
@ -19,14 +19,14 @@ License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
***********************************************************************/
|
||||
|
||||
#ifndef moses_TrellisPath_h
|
||||
#define moses_TrellisPath_h
|
||||
#pragma once
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <limits>
|
||||
#include "Hypothesis.h"
|
||||
#include "TypeDef.h"
|
||||
#include <boost/shared_ptr.hpp>
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
@ -50,13 +50,13 @@ protected:
|
||||
, or NOT_FOUND if this path is the best trans so consist of only hypos
|
||||
*/
|
||||
|
||||
ScoreComponentCollection m_scoreBreakdown;
|
||||
float m_totalScore;
|
||||
mutable boost::shared_ptr<ScoreComponentCollection> m_scoreBreakdown;
|
||||
|
||||
//Used by Manager::LatticeSample()
|
||||
explicit TrellisPath(const std::vector<const Hypothesis*> edges);
|
||||
|
||||
void InitScore();
|
||||
void InitTotalScore();
|
||||
|
||||
public:
|
||||
TrellisPath(); // not implemented
|
||||
@ -91,9 +91,7 @@ public:
|
||||
//! create a list of next best paths by wiggling 1 of the node at a time.
|
||||
void CreateDeviantPaths(TrellisPathList &pathColl) const;
|
||||
|
||||
inline const ScoreComponentCollection &GetScoreBreakdown() const {
|
||||
return m_scoreBreakdown;
|
||||
}
|
||||
const boost::shared_ptr<ScoreComponentCollection> GetScoreBreakdown() const;
|
||||
|
||||
//! get target words range of the hypo within n-best trellis. not necessarily the same as hypo.GetCurrTargetWordsRange()
|
||||
WordsRange GetTargetWordsRange(const Hypothesis &hypo) const;
|
||||
@ -123,4 +121,4 @@ inline std::ostream& operator<<(std::ostream& out, const TrellisPath& path)
|
||||
}
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -502,13 +502,11 @@ inline std::string GetFirstString(const std::string& str, int& first_pos, const
|
||||
template<class T>
|
||||
T log_sum (T log_a, T log_b)
|
||||
{
|
||||
T v;
|
||||
if (log_a < log_b) {
|
||||
v = log_b+log ( 1 + exp ( log_a-log_b ));
|
||||
return log_b + log1p(exp(log_a - log_b));
|
||||
} else {
|
||||
v = log_a+log ( 1 + exp ( log_b-log_a ));
|
||||
return log_a + log1p(exp(log_b - log_a));
|
||||
}
|
||||
return ( v );
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -105,13 +105,13 @@ const TrellisPath doMBR(const TrellisPathList& nBestList)
|
||||
for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
|
||||
const TrellisPath &path = **iter;
|
||||
float score = StaticData::Instance().GetMBRScale()
|
||||
* path.GetScoreBreakdown().GetWeightedScore();
|
||||
* path.GetScoreBreakdown()->GetWeightedScore();
|
||||
if (maxScore < score) maxScore = score;
|
||||
}
|
||||
|
||||
for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
|
||||
const TrellisPath &path = **iter;
|
||||
joint_prob = UntransformScore(StaticData::Instance().GetMBRScale() * path.GetScoreBreakdown().GetWeightedScore() - maxScore);
|
||||
joint_prob = UntransformScore(StaticData::Instance().GetMBRScale() * path.GetScoreBreakdown()->GetWeightedScore() - maxScore);
|
||||
marginal += joint_prob;
|
||||
joint_prob_vec.push_back(joint_prob);
|
||||
|
||||
|
@ -166,7 +166,7 @@ namespace MosesServer
|
||||
{
|
||||
// should the score breakdown be reported in a more structured manner?
|
||||
ostringstream buf;
|
||||
path->GetScoreBreakdown().OutputAllFeatureScores(buf);
|
||||
path->GetScoreBreakdown()->OutputAllFeatureScores(buf);
|
||||
nBestXmlItem["fvals"] = xmlrpc_c::value_string(buf.str());
|
||||
}
|
||||
|
||||
|
@ -2,6 +2,7 @@
|
||||
#include "ExtractionPhrasePair.h"
|
||||
#include "tables-core.h"
|
||||
#include "InputFileStream.h"
|
||||
#include "util/tokenize.hh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -17,7 +18,7 @@ void Domain::load( const std::string &domainFileName )
|
||||
string line;
|
||||
while(getline(*fileP, line)) {
|
||||
// read
|
||||
vector< string > domainSpecLine = tokenize( line.c_str() );
|
||||
const vector< string > domainSpecLine = util::tokenize( line );
|
||||
int lineNumber;
|
||||
if (domainSpecLine.size() != 2 ||
|
||||
! sscanf(domainSpecLine[0].c_str(), "%d", &lineNumber)) {
|
||||
@ -25,7 +26,7 @@ void Domain::load( const std::string &domainFileName )
|
||||
exit(1);
|
||||
}
|
||||
// store
|
||||
string &name = domainSpecLine[1];
|
||||
const string &name = domainSpecLine[1];
|
||||
spec.push_back( make_pair( lineNumber, name ));
|
||||
if (name2id.find( name ) == name2id.end()) {
|
||||
name2id[ name ] = list.size();
|
||||
|
@ -14,8 +14,6 @@
|
||||
|
||||
#include "ScoreFeature.h"
|
||||
|
||||
extern std::vector<std::string> tokenize( const char*);
|
||||
|
||||
namespace MosesTraining
|
||||
{
|
||||
|
||||
|
@ -24,6 +24,7 @@
|
||||
#include <string>
|
||||
|
||||
#include "tables-core.h"
|
||||
#include "util/tokenize.hh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -40,7 +41,7 @@ void addBoundaryWords(vector<string> &phrase)
|
||||
|
||||
bool SentenceAlignment::processTargetSentence(const char * targetString, int, bool boundaryRules)
|
||||
{
|
||||
target = tokenize(targetString);
|
||||
target = util::tokenize(targetString);
|
||||
if (boundaryRules)
|
||||
addBoundaryWords(target);
|
||||
return true;
|
||||
@ -48,7 +49,7 @@ bool SentenceAlignment::processTargetSentence(const char * targetString, int, bo
|
||||
|
||||
bool SentenceAlignment::processSourceSentence(const char * sourceString, int, bool boundaryRules)
|
||||
{
|
||||
source = tokenize(sourceString);
|
||||
source = util::tokenize(sourceString);
|
||||
if (boundaryRules)
|
||||
addBoundaryWords(source);
|
||||
return true;
|
||||
@ -89,7 +90,7 @@ bool SentenceAlignment::create(const char targetString[],
|
||||
}
|
||||
|
||||
// reading in alignments
|
||||
vector<string> alignmentSequence = tokenize( alignmentString );
|
||||
vector<string> alignmentSequence = util::tokenize( alignmentString );
|
||||
for(size_t i=0; i<alignmentSequence.size(); i++) {
|
||||
int s,t;
|
||||
// cout << "scaning " << alignmentSequence[i].c_str() << endl;
|
||||
|
@ -26,6 +26,7 @@
|
||||
#include "tables-core.h"
|
||||
#include "XmlException.h"
|
||||
#include "XmlTree.h"
|
||||
#include "util/tokenize.hh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -49,7 +50,7 @@ bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetStrin
|
||||
<< sentenceID << ": " << e.getMsg() << std::endl;
|
||||
return false;
|
||||
}
|
||||
target = tokenize(targetStringCPP.c_str());
|
||||
target = util::tokenize(targetStringCPP);
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -70,11 +71,8 @@ bool SentenceAlignmentWithSyntax::processSourceSentence(const char * sourceStrin
|
||||
<< sentenceID << ": " << e.getMsg() << std::endl;
|
||||
return false;
|
||||
}
|
||||
source = tokenize(sourceStringCPP.c_str());
|
||||
source = util::tokenize(sourceStringCPP);
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
|
||||
|
||||
|
@ -25,11 +25,10 @@
|
||||
#include <cstdlib>
|
||||
#include "InputFileStream.h"
|
||||
#include "OutputFileStream.h"
|
||||
#include "util/tokenize.hh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
std::vector<std::string> tokenize( const char [] );
|
||||
|
||||
vector< string > splitLine(const char *line)
|
||||
{
|
||||
vector< string > item;
|
||||
@ -109,7 +108,7 @@ int main(int argc, char* argv[])
|
||||
if (! getLine(fileDirectP, itemDirect ))
|
||||
break;
|
||||
|
||||
vector< string > count = tokenize( itemDirect[4].c_str() );
|
||||
const vector< string > count = util::tokenize( itemDirect[4] );
|
||||
float countEF = atof(count[0].c_str());
|
||||
float countF = atof(count[1].c_str());
|
||||
float prob = countF/countEF;
|
||||
|
@ -28,6 +28,7 @@
|
||||
|
||||
#include "tables-core.h"
|
||||
#include "InputFileStream.h"
|
||||
#include "util/tokenize.hh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -165,8 +166,8 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
|
||||
fileConsolidated << " ||| " << reverseAlignment(itemDirect[3]);
|
||||
|
||||
// counts, for debugging
|
||||
vector<string> directCounts = tokenize(itemDirect[4].c_str());
|
||||
vector<string> indirectCounts = tokenize(itemIndirect[4].c_str());
|
||||
const vector<string> directCounts = util::tokenize(itemDirect[4]);
|
||||
const vector<string> indirectCounts = util::tokenize(itemIndirect[4]);
|
||||
fileConsolidated << "||| " << directCounts[0] << " " << indirectCounts[0];
|
||||
// output rule count if present in either file
|
||||
if (indirectCounts.size() > 1) {
|
||||
@ -199,7 +200,6 @@ bool getLine( istream &fileP, vector< string > &item )
|
||||
vector< string > splitLine(const char *line)
|
||||
{
|
||||
vector< string > item;
|
||||
bool betweenWords = true;
|
||||
int start=0;
|
||||
int i=0;
|
||||
for(; line[i] != '\0'; i++) {
|
||||
@ -223,10 +223,10 @@ string reverseAlignment(const string &alignments)
|
||||
{
|
||||
stringstream ret("");
|
||||
|
||||
vector<string> alignToks = tokenize(alignments.c_str());
|
||||
const vector<string> alignToks = util::tokenize(alignments);
|
||||
|
||||
for (size_t i = 0; i < alignToks.size(); ++i) {
|
||||
string &alignPair = alignToks[i];
|
||||
const string &alignPair = alignToks[i];
|
||||
vector<string> alignPoints;
|
||||
Tokenize(alignPoints, alignPair, "-");
|
||||
assert(alignPoints.size() == 2);
|
||||
|
@ -23,6 +23,7 @@
|
||||
#include "tables-core.h"
|
||||
#include "XmlException.h"
|
||||
#include "XmlTree.h"
|
||||
#include "util/tokenize.hh"
|
||||
|
||||
#include <cassert>
|
||||
#include <vector>
|
||||
@ -56,7 +57,7 @@ std::auto_ptr<ParseTree> XmlTreeParser::Parse(const std::string &line)
|
||||
m_tree.ConnectNodes();
|
||||
SyntaxNode *root = m_tree.GetTop();
|
||||
assert(root);
|
||||
m_words = tokenize(m_line.c_str());
|
||||
m_words = util::tokenize(m_line);
|
||||
return ConvertTree(*root, m_words);
|
||||
}
|
||||
|
||||
|
@ -25,6 +25,7 @@
|
||||
#include "tables-core.h"
|
||||
#include "XmlException.h"
|
||||
#include "XmlTree.h"
|
||||
#include "util/tokenize.hh"
|
||||
|
||||
#include "syntax-common/exception.h"
|
||||
|
||||
@ -51,7 +52,7 @@ std::auto_ptr<PcfgTree> XmlTreeParser::Parse(const std::string &line) {
|
||||
// There is no XML tree.
|
||||
return std::auto_ptr<PcfgTree>();
|
||||
}
|
||||
m_words = tokenize(m_line.c_str());
|
||||
m_words = util::tokenize(m_line);
|
||||
return ConvertTree(*root, m_words);
|
||||
}
|
||||
|
||||
|
@ -21,6 +21,7 @@
|
||||
|
||||
#include "relax-parse.h"
|
||||
#include "tables-core.h"
|
||||
#include "util/tokenize.hh"
|
||||
|
||||
using namespace std;
|
||||
using namespace MosesTraining;
|
||||
@ -44,7 +45,7 @@ int main(int argc, char* argv[])
|
||||
map< string, int > topLabelCollection; // count of top labels, not used
|
||||
SyntaxTree tree;
|
||||
ProcessAndStripXMLTags( inBufferString, tree, labelCollection, topLabelCollection, false );
|
||||
vector< string > inWords = tokenize( inBufferString.c_str() );
|
||||
const vector< string > inWords = util::tokenize( inBufferString );
|
||||
|
||||
// output tree
|
||||
// cerr << "BEFORE:" << endl << tree;
|
||||
@ -104,7 +105,7 @@ void init(int argc, char* argv[])
|
||||
}
|
||||
}
|
||||
|
||||
void store( SyntaxTree &tree, vector< string > &words )
|
||||
void store( SyntaxTree &tree, const vector< string > &words )
|
||||
{
|
||||
// output words
|
||||
for( size_t i=0; i<words.size(); i++ ) {
|
||||
|
@ -39,7 +39,7 @@ char SAMTLevel = 0;
|
||||
|
||||
// functions
|
||||
void init(int argc, char* argv[]);
|
||||
void store( MosesTraining::SyntaxTree &tree, std::vector<std::string> &words );
|
||||
void store( MosesTraining::SyntaxTree &tree, const std::vector<std::string> &words );
|
||||
void LeftBinarize( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents );
|
||||
void RightBinarize( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents );
|
||||
void SAMT( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents );
|
||||
|
@ -14,6 +14,7 @@
|
||||
#include "AlignmentPhrase.h"
|
||||
#include "tables-core.h"
|
||||
#include "InputFileStream.h"
|
||||
#include "util/tokenize.hh"
|
||||
|
||||
using namespace std;
|
||||
using namespace MosesTraining;
|
||||
@ -237,7 +238,7 @@ void processPhrasePairs( vector< PhraseAlignment > &phrasePair )
|
||||
|
||||
bool PhraseAlignment::create(const char line[], int lineID )
|
||||
{
|
||||
vector< string > token = tokenize( line );
|
||||
const vector< string > token = util::tokenize( line );
|
||||
int item = 1;
|
||||
PHRASE phraseF, phraseE;
|
||||
for (size_t j=0; j<token.size(); j++) {
|
||||
@ -321,7 +322,7 @@ void LexicalTable::load( const string &filePath )
|
||||
i++;
|
||||
if (i%100000 == 0) cerr << "." << flush;
|
||||
|
||||
vector<string> token = tokenize( line.c_str() );
|
||||
const vector<string> token = util::tokenize( line );
|
||||
if (token.size() != 3) {
|
||||
cerr << "line " << i << " in " << filePath << " has wrong number of tokens, skipping:\n" <<
|
||||
token.size() << " " << token[0] << " " << line << endl;
|
||||
|
@ -3,6 +3,7 @@
|
||||
#include "tables-core.h"
|
||||
#include "XmlException.h"
|
||||
#include "XmlTree.h"
|
||||
#include "util/tokenize.hh"
|
||||
|
||||
#include <cassert>
|
||||
#include <vector>
|
||||
@ -24,7 +25,7 @@ StringTree *XmlTreeParser::Parse(const std::string &line) {
|
||||
tree_.ConnectNodes();
|
||||
SyntaxNode *root = tree_.GetTop();
|
||||
assert(root);
|
||||
words_ = tokenize(line_.c_str());
|
||||
words_ = util::tokenize(line_);
|
||||
return ConvertTree(*root, words_);
|
||||
}
|
||||
|
||||
|
@ -1,5 +1,6 @@
|
||||
// $Id$
|
||||
//#include "beammain.h"
|
||||
#include "util/tokenize.hh"
|
||||
#include "tables-core.h"
|
||||
|
||||
#define TABLE_LINE_MAX_LENGTH 1000
|
||||
@ -7,37 +8,9 @@
|
||||
|
||||
using namespace std;
|
||||
|
||||
// as in beamdecoder/tables.cpp
|
||||
vector<string> tokenize( const char* input )
|
||||
{
|
||||
vector< string > token;
|
||||
bool betweenWords = true;
|
||||
int start=0;
|
||||
int i=0;
|
||||
for(; input[i] != '\0'; i++) {
|
||||
bool isSpace = (input[i] == ' ' || input[i] == '\t');
|
||||
|
||||
if (!isSpace && betweenWords) {
|
||||
start = i;
|
||||
betweenWords = false;
|
||||
} else if (isSpace && !betweenWords) {
|
||||
token.push_back( string( input+start, i-start ) );
|
||||
betweenWords = true;
|
||||
}
|
||||
}
|
||||
if (!betweenWords)
|
||||
token.push_back( string( input+start, i-start ) );
|
||||
return token;
|
||||
}
|
||||
|
||||
namespace MosesTraining
|
||||
{
|
||||
|
||||
bool isNonTerminal( const WORD &symbol )
|
||||
{
|
||||
return symbol.substr(0, 1) == "[" && symbol.substr(symbol.size()-1, 1) == "]";
|
||||
}
|
||||
|
||||
WORD_ID Vocabulary::storeIfNew( const WORD& word )
|
||||
{
|
||||
map<WORD, WORD_ID>::iterator i = lookup.find( word );
|
||||
@ -107,7 +80,7 @@ void DTable::load( const string& fileName )
|
||||
abort();
|
||||
}
|
||||
|
||||
vector<string> token = tokenize(line.c_str());
|
||||
const vector<string> token = util::tokenize(line);
|
||||
if (token.size() < 2) {
|
||||
cerr << "line " << i << " in " << fileName << " too short, skipping\n";
|
||||
continue;
|
||||
|
@ -12,8 +12,6 @@
|
||||
#include <map>
|
||||
#include <cmath>
|
||||
|
||||
extern std::vector<std::string> tokenize( const char*);
|
||||
|
||||
namespace MosesTraining
|
||||
{
|
||||
|
||||
|
@ -1,5 +1,6 @@
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use warnings;
|
||||
use strict;
|
||||
use Getopt::Long "GetOptions";
|
||||
use FindBin qw($RealBin);
|
||||
|
@ -1,5 +1,6 @@
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use warnings;
|
||||
use Getopt::Std;
|
||||
getopts('q');
|
||||
|
||||
|
@ -1,4 +1,6 @@
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use warnings;
|
||||
use strict;
|
||||
|
||||
my $file = shift(@ARGV);
|
||||
|
@ -1,6 +1,7 @@
|
||||
#!/usr/bin/env perl
|
||||
|
||||
#input hindi word urdu word, delete all those entries that have number on any side
|
||||
use warnings;
|
||||
use utf8;
|
||||
|
||||
use Getopt::Std;
|
||||
|
@ -1,5 +1,6 @@
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use warnings;
|
||||
use strict;
|
||||
|
||||
use utf8;
|
||||
|
@ -1,5 +1,6 @@
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use warnings;
|
||||
use strict;
|
||||
|
||||
use utf8;
|
||||
|
@ -1,5 +1,6 @@
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use warnings;
|
||||
use strict;
|
||||
|
||||
use utf8;
|
||||
|
@ -1,5 +1,6 @@
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use warnings;
|
||||
use strict;
|
||||
|
||||
use utf8;
|
||||
|
@ -1,5 +1,6 @@
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use warnings;
|
||||
use utf8;
|
||||
require Encode;
|
||||
use IO::Handle;
|
||||
|
@ -1,5 +1,6 @@
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use warnings;
|
||||
use utf8;
|
||||
use strict;
|
||||
use Getopt::Long "GetOptions";
|
||||
|
@ -14,6 +14,7 @@ use utf8;
|
||||
# 23.01.2010: added NIST p-value and interval computation
|
||||
###############################################
|
||||
|
||||
use warnings;
|
||||
use strict;
|
||||
|
||||
#constants
|
||||
|
@ -4,6 +4,7 @@
|
||||
#sentence-by-sentence: take in a system output, with any number of factors, and a reference translation, also maybe with factors, and show each sentence and its errors
|
||||
#usage: sentence-by-sentence SYSOUT [REFERENCE]+ > sentences.html
|
||||
|
||||
use warnings;
|
||||
use strict;
|
||||
use Getopt::Long;
|
||||
|
||||
|
@ -4,6 +4,7 @@
|
||||
# Script to convert MOSES searchgraph to DOT format
|
||||
#
|
||||
|
||||
use warnings;
|
||||
use strict;
|
||||
use File::Path;
|
||||
use File::Basename;
|
||||
|
@ -5,7 +5,9 @@
|
||||
#usage: show-phrases-used DECODER_OUTFILE > output.html
|
||||
# where DECODER_OUTFILE is the output of moses with the -T (show alignments) option
|
||||
|
||||
use warnings;
|
||||
use strict;
|
||||
|
||||
BEGIN
|
||||
{
|
||||
my $wd= `pawd 2>/dev/null`;
|
||||
|
@ -9,6 +9,7 @@
|
||||
#similar function to filter-model-given-input.pl, but only operates
|
||||
#on the phrase table and doesn't require that any subdirectories exist
|
||||
|
||||
use warnings;
|
||||
use strict;
|
||||
|
||||
my $MAX_LENGTH = 10;
|
||||
|
@ -7,8 +7,15 @@ get-corpus
|
||||
default-name: corpus/txt
|
||||
rerun-on-change: input-extension output-extension
|
||||
template: IN OUT $input-extension $output-extension
|
||||
tokenize
|
||||
pre-tok-clean
|
||||
in: raw-stem
|
||||
out: pre-tok-cleaned
|
||||
default-name: corpus/pre-tok-cleaned
|
||||
pass-unless: pre-tok-clean
|
||||
template: $pre-tok-clean IN $input-extension $output-extension OUT OUT.lines-retained
|
||||
parallelizable: yes
|
||||
tokenize
|
||||
in: pre-tok-cleaned
|
||||
out: tokenized-stem
|
||||
default-name: corpus/tok
|
||||
pass-unless: input-tokenizer output-tokenizer
|
||||
@ -158,11 +165,18 @@ get-corpus
|
||||
pass-unless: get-corpus-script
|
||||
default-name: lm/txt
|
||||
template: $get-corpus-script > OUT
|
||||
use-parallel-corpus
|
||||
in: parallel-corpus-stem
|
||||
out: tokenized-corpus
|
||||
default-name: lm/tok
|
||||
ignore-unless: parallel-corpus-stem
|
||||
template: ln -s IN.$output-extension OUT
|
||||
tokenize
|
||||
in: raw-corpus
|
||||
out: tokenized-corpus
|
||||
default-name: lm/tok
|
||||
pass-unless: output-tokenizer
|
||||
ignore-if: parallel-corpus-stem
|
||||
template: $output-tokenizer < IN > OUT
|
||||
parallelizable: yes
|
||||
mock-parse
|
||||
@ -204,8 +218,14 @@ split
|
||||
default-name: lm/split
|
||||
pass-unless: output-splitter
|
||||
template: $output-splitter -model IN1.$output-extension < IN > OUT
|
||||
train
|
||||
strip
|
||||
in: split-corpus
|
||||
out: stripped-corpus
|
||||
default-name: lm/stripped
|
||||
pass-unless: mock-output-parser-lm
|
||||
template: $moses-script-dir/training/strip-xml.perl < IN > OUT
|
||||
train
|
||||
in: stripped-corpus
|
||||
out: lm
|
||||
default-name: lm/lm
|
||||
ignore-if: rlm-training
|
||||
@ -220,7 +240,7 @@ randomize
|
||||
pass-unless: lm-randomizer
|
||||
ignore-if: rlm-training
|
||||
train-randomized
|
||||
in: split-corpus
|
||||
in: stripped-corpus
|
||||
out: rlm
|
||||
default-name: lm/rlm
|
||||
ignore-unless: rlm-training
|
||||
@ -953,21 +973,21 @@ split-reference-devtest
|
||||
ignore-unless: use-mira
|
||||
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
|
||||
template: $output-splitter -model IN1.$output-extension < IN > OUT
|
||||
reduce-reference
|
||||
strip-reference
|
||||
in: split-ref
|
||||
out: reference
|
||||
default-name: tuning/reference.reduced
|
||||
default-name: tuning/reference.stripped
|
||||
pass-unless: mock-output-parser-references
|
||||
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
|
||||
template: $moses-script-dir/training/reduce-factors.perl --factor 0 --xml 1 --corpus IN --reduced-corpus OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees
|
||||
reduce-reference-devtest
|
||||
template: $moses-script-dir/training/strip-xml.perl < IN > OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees
|
||||
strip-reference-devtest
|
||||
in: split-ref-devtest
|
||||
out: reference
|
||||
default-name: tuning/reference.devtest.reduced
|
||||
default-name: tuning/reference.devtest.stripped
|
||||
pass-unless: mock-output-parser-references
|
||||
ignore-unless: use-mira
|
||||
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
|
||||
template: $moses-script-dir/training/reduce-factors.perl --factor 0 --xml 1 --corpus IN --reduced-corpus OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees
|
||||
template: $moses-script-dir/training/strip-xml.perl < IN > OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees
|
||||
filter
|
||||
in: input TRAINING:sigtest-filter-phrase-translation-table TRAINING:sigtest-filter-reordering-table TRAINING:corpus-mml-prefilter=OR=TRAINING:corpus-mml-postfilter=OR=TRAINING:domains TRAINING:transliteration-table
|
||||
out: filtered-dir
|
||||
@ -1224,13 +1244,13 @@ lowercase-reference
|
||||
pass-if: recaser
|
||||
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
|
||||
template: $output-lowercaser < IN > OUT
|
||||
reduce-reference
|
||||
strip-reference
|
||||
in: lowercased-reference
|
||||
out: reference
|
||||
default-name: evaluation/reference
|
||||
pass-unless: mock-output-parser-references
|
||||
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
|
||||
template: $moses-script-dir/training/reduce-factors.perl --factor 0 --xml 1 --corpus IN --reduced-corpus OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees
|
||||
template: $moses-script-dir/training/strip-xml.perl < IN > OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees
|
||||
wade
|
||||
in: filtered-dir truecased-input tokenized-reference alignment system-output
|
||||
out: wade-analysis
|
||||
|
@ -3,6 +3,7 @@
|
||||
# Experiment Management System
|
||||
# Documentation at http://www.statmt.org/moses/?n=FactoredTraining.EMS
|
||||
|
||||
use warnings;
|
||||
use strict;
|
||||
use Getopt::Long "GetOptions";
|
||||
use FindBin qw($RealBin);
|
||||
|
@ -1,5 +1,6 @@
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use warnings;
|
||||
use strict;
|
||||
|
||||
my ($file,$step) = @ARGV;
|
||||
|
@ -1,5 +1,6 @@
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use warnings;
|
||||
use strict;
|
||||
use Getopt::Long "GetOptions";
|
||||
|
||||
|
@ -1,5 +1,6 @@
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use warnings;
|
||||
use strict;
|
||||
|
||||
# Create domain file from corpora
|
||||
|
@ -1,5 +1,6 @@
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use warnings;
|
||||
use strict;
|
||||
|
||||
# Build necessary files for sparse lexical features
|
||||
|
@ -2,6 +2,7 @@
|
||||
|
||||
# $Id: consolidate-training-data.perl 928 2009-09-02 02:58:01Z philipp $
|
||||
|
||||
use warnings;
|
||||
use strict;
|
||||
|
||||
my ($in,$out,$consolidated,@PART) = @ARGV;
|
||||
|
@ -1,5 +1,6 @@
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use warnings;
|
||||
use strict;
|
||||
|
||||
my $cores = 8;
|
||||
|
@ -1,5 +1,6 @@
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use warnings;
|
||||
use strict;
|
||||
|
||||
my $jobs = 20;
|
||||
|
@ -1,5 +1,6 @@
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use warnings;
|
||||
use strict;
|
||||
|
||||
die("ERROR syntax: input-from-sgm.perl < in.sgm > in.txt")
|
||||
|
@ -1,5 +1,6 @@
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use warnings;
|
||||
use strict;
|
||||
use IPC::Open3;
|
||||
use File::Temp qw/tempdir/;
|
||||
|
@ -1,10 +1,13 @@
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use warnings;
|
||||
use strict;
|
||||
use Getopt::Long "GetOptions";
|
||||
|
||||
Getopt::Long::config("no_auto_abbrev");
|
||||
Getopt::Long::config("pass_through");
|
||||
|
||||
|
||||
my ($TEXT,$ORDER,$BIN,$LM);
|
||||
|
||||
&GetOptions('text=s' => \$TEXT,
|
||||
@ -15,8 +18,9 @@ my ($TEXT,$ORDER,$BIN,$LM);
|
||||
die("ERROR: specify at least --bin BIN --text CORPUS --lm LM and --order N!")
|
||||
unless defined($BIN) && defined($TEXT) && defined($LM) && defined($ORDER);
|
||||
|
||||
my $cmd = "$BIN --text $TEXT --order $ORDER --arpa $LM";
|
||||
$cmd .= " " . join(' ', @ARGV) if scalar(@ARGV); # Pass remaining args through.
|
||||
my $settings = join(' ', @ARGV);
|
||||
#print STDERR "settngs=$settings \n";
|
||||
|
||||
my $cmd = "$BIN --text $TEXT --order $ORDER --arpa $LM $settings";
|
||||
print "exec: $cmd\n";
|
||||
`$cmd`;
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user