diff --git a/biconcor/phrase-lookup.cpp b/biconcor/phrase-lookup.cpp index 3ef82e73a..60ab8db66 100644 --- a/biconcor/phrase-lookup.cpp +++ b/biconcor/phrase-lookup.cpp @@ -109,14 +109,17 @@ size_t lookup( string query ) return suffixArray.Count( queryString ); } -vector tokenize( const char input[] ) +// Duplicate of definition in util/tokenize.hh. +// TODO: Can we de-duplicate this? At the time of writing biconcor does not +// use util at all. +vector tokenize(const char input[]) { vector< string > token; bool betweenWords = true; int start=0; - int i=0; - for(; input[i] != '\0'; i++) { - bool isSpace = (input[i] == ' ' || input[i] == '\t'); + int i; + for(i = 0; input[i] != '\0'; i++) { + const bool isSpace = (input[i] == ' ' || input[i] == '\t'); if (!isSpace && betweenWords) { start = i; diff --git a/contrib/lmserver/examples/lmclient.cc b/contrib/lmserver/examples/lmclient.cc index b26984df9..0d9fc23ff 100644 --- a/contrib/lmserver/examples/lmclient.cc +++ b/contrib/lmserver/examples/lmclient.cc @@ -45,8 +45,8 @@ struct LMClient { exit(1); } - bzero((char *)&server, sizeof(server)); - bcopy(hp->h_addr, (char *)&server.sin_addr, hp->h_length); + memset(&server, '\0', sizeof(server)); + memcpy((char *)&server.sin_addr, hp->h_addr, hp->h_length); server.sin_family = hp->h_addrtype; server.sin_port = htons(port); diff --git a/contrib/mada/qsub-madamira.perl b/contrib/mada/qsub-madamira.perl new file mode 100755 index 000000000..bb7ecd06b --- /dev/null +++ b/contrib/mada/qsub-madamira.perl @@ -0,0 +1,46 @@ +#!/usr/bin/env perl + +use warnings; +use strict; +use File::Slurp; +use File::Basename; +use Cwd 'abs_path'; + +my $splitDir = $ARGV[0]; +$splitDir = abs_path($splitDir); + +my @files = read_dir $splitDir; + +my $qsubDir=dirname($splitDir) ."/qsub"; +print STDERR "qsubDir=$qsubDir\n"; +`mkdir -p $qsubDir`; + +my $out2Dir=dirname($splitDir) ."/out2"; +print STDERR "out2Dir=$out2Dir\n"; +`mkdir -p $out2Dir`; + +for my $file ( @files ) { + print STDERR "$file "; + + my $qsubFile = "$qsubDir/$file.sh"; + open(RUN_FILE, ">$qsubFile"); + + print RUN_FILE "#!/usr/bin/env bash\n" + ."#PBS -d/scratch/hh65/workspace/experiment/ar-en \n" + ."#PBS -l mem=5gb \n\n" + ."export PATH=\"/scratch/statmt/bin:/share/apps/NYUAD/perl/gcc_4.9.1/5.20.1/bin:/share/apps/NYUAD/jdk/1.8.0_31/bin:/share/apps/NYUAD/zlib/gcc_4.9.1/1.2.8/bin:/share/apps/NYUAD/cmake/gcc_4.9.1/3.1.0-rc3/bin:/share/apps/NYUAD/boost/gcc_4.9.1/openmpi_1.8.3/1.57.0/bin:/share/apps/NYUAD/openmpi/gcc_4.9.1/1.8.3/bin:/share/apps/NYUAD/python/gcc_4.9.1/2.7.9/bin:/share/apps/NYUAD/gcc/binutils/2.21/el6/bin:/share/apps/NYUAD/gcc/gcc/4.9.1/el6/bin:/usr/lib64/qt-3.3/bin:/usr/local/bin:/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/sbin:/opt/bio/ncbi/bin:/opt/bio/mpiblast/bin:/opt/bio/EMBOSS/bin:/opt/bio/clustalw/bin:/opt/bio/tcoffee/bin:/opt/bio/hmmer/bin:/opt/bio/phylip/exe:/opt/bio/mrbayes:/opt/bio/fasta:/opt/bio/glimmer/bin:/opt/bio/glimmer/scripts:/opt/bio/gromacs/bin:/opt/bio/gmap/bin:/opt/bio/tigr/bin:/opt/bio/autodocksuite/bin:/opt/bio/wgs/bin:/opt/ganglia/bin:/opt/ganglia/sbin:/opt/bin:/usr/java/latest/bin:/opt/pdsh/bin:/opt/rocks/bin:/opt/rocks/sbin:/opt/torque/bin:/opt/torque/sbin:/home/hh65/bin:/home/hh65/bin\" \n" + + ."module load NYUAD/2.0 \n" + ."module load gcc python/2.7.9 openmpi/1.8.3 boost cmake zlib jdk perl expat \n" + + ."cd /scratch/statmt/MADAMIRA-release-20140709-1.0 \n"; + print RUN_FILE "java -Xmx2500m -Xms2500m -XX:NewRatio=3 -jar /scratch/statmt/MADAMIRA-release-20140709-1.0/MADAMIRA.jar " + ."-rawinput $splitDir/$file 
-rawoutdir $out2Dir -rawconfig /scratch/statmt/MADAMIRA-release-20140709-1.0/samples/sampleConfigFile.xml \n"; + + close(RUN_FILE); + + my $cmd = "qsub $qsubFile"; + `$cmd`; + +} + diff --git a/contrib/mira/Main.cpp b/contrib/mira/Main.cpp index abf92b598..acc2f8886 100644 --- a/contrib/mira/Main.cpp +++ b/contrib/mira/Main.cpp @@ -46,6 +46,7 @@ namespace mpi = boost::mpi; #include "moses/FF/PhrasePairFeature.h" #include "moses/FF/WordPenaltyProducer.h" #include "moses/LM/Base.h" +#include "util/random.hh" using namespace Mira; using namespace std; @@ -54,6 +55,7 @@ namespace po = boost::program_options; int main(int argc, char** argv) { + util::rand_init(); size_t rank = 0; size_t size = 1; #ifdef MPI_ENABLE diff --git a/contrib/mira/Main.h b/contrib/mira/Main.h index 8736257f6..b8faedae7 100644 --- a/contrib/mira/Main.h +++ b/contrib/mira/Main.h @@ -25,6 +25,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "moses/Word.h" #include "moses/FF/FeatureFunction.h" #include "Decoder.h" +#include "util/random.hh" typedef std::map > ProducerWeightMap; typedef std::pair > ProducerWeightPair; @@ -37,8 +38,7 @@ template bool from_string(T& t, const std::string& s, std::ios_base& ( struct RandomIndex { ptrdiff_t operator()(ptrdiff_t max) { - srand(time(0)); // Initialize random number generator with current time. - return static_cast (rand() % max); + return util::rand_excl(max); } }; diff --git a/contrib/other-builds/manual-label/manual-label.project b/contrib/other-builds/manual-label/manual-label.project index 2bc69a6ca..3e3efcddb 100644 --- a/contrib/other-builds/manual-label/manual-label.project +++ b/contrib/other-builds/manual-label/manual-label.project @@ -1,5 +1,22 @@ + + + + + + + + @@ -14,6 +31,8 @@ + + @@ -33,6 +52,8 @@ + + @@ -107,6 +128,4 @@ - - diff --git a/contrib/other-builds/moses-cmd/moses-cmd.project b/contrib/other-builds/moses-cmd/moses-cmd.project index b978b451e..ecef4038b 100644 --- a/contrib/other-builds/moses-cmd/moses-cmd.project +++ b/contrib/other-builds/moses-cmd/moses-cmd.project @@ -1,5 +1,22 @@ + + + + + + + + @@ -9,6 +26,14 @@ + + + + + + + + @@ -53,7 +78,7 @@ - + @@ -125,12 +150,4 @@ - - - - - - - - diff --git a/contrib/other-builds/moses/moses.project b/contrib/other-builds/moses/moses.project index 7d666558f..55bf4e8f1 100644 --- a/contrib/other-builds/moses/moses.project +++ b/contrib/other-builds/moses/moses.project @@ -474,8 +474,6 @@ - - diff --git a/contrib/relent-filter/src/Main.cpp b/contrib/relent-filter/src/Main.cpp index 1f86e2cc7..3c7911248 100755 --- a/contrib/relent-filter/src/Main.cpp +++ b/contrib/relent-filter/src/Main.cpp @@ -42,6 +42,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "RelativeEntropyCalc.h" #include "LexicalReordering.h" #include "LexicalReorderingState.h" +#include "util/random.hh" #ifdef HAVE_PROTOBUF #include "hypergraph.pb.h" @@ -205,7 +206,7 @@ int main(int argc, char** argv) //initialise random numbers - srand(time(NULL)); + rand_init(); // set up read/writing class IOWrapper* ioWrapper = GetIOWrapper(staticData); diff --git a/contrib/server/mosesserver.cpp b/contrib/server/mosesserver.cpp index befebd8d2..edf7daa13 100644 --- a/contrib/server/mosesserver.cpp +++ b/contrib/server/mosesserver.cpp @@ -536,7 +536,7 @@ public: { // should the score breakdown be reported in a more structured manner? 
ostringstream buf; - path.GetScoreBreakdown().OutputAllFeatureScores(buf); + path.GetScoreBreakdown()->OutputAllFeatureScores(buf); nBestXMLItem["fvals"] = xmlrpc_c::value_string(buf.str()); } diff --git a/mert/Data.cpp b/mert/Data.cpp index 49c1239e5..98f6c8399 100644 --- a/mert/Data.cpp +++ b/mert/Data.cpp @@ -17,6 +17,7 @@ #include "util/exception.hh" #include "util/file_piece.hh" +#include "util/random.hh" #include "util/tokenize_piece.hh" #include "util/string_piece.hh" #include "FeatureDataIterator.h" @@ -286,7 +287,7 @@ void Data::createShards(size_t shard_count, float shard_size, const string& scor } else { //create shards by randomly sampling for (size_t i = 0; i < floor(shard_size+0.5); ++i) { - shard_contents.push_back(rand() % data_size); + shard_contents.push_back(util::rand_excl(data_size)); } } diff --git a/mert/Fdstream.h b/mert/Fdstream.h index 2258ef4a5..23eecc466 100644 --- a/mert/Fdstream.h +++ b/mert/Fdstream.h @@ -13,6 +13,8 @@ #include #include +#include "util/unistd.hh" + #if defined(__GLIBCXX__) || defined(__GLIBCPP__) #include diff --git a/mert/FileStream.cpp b/mert/FileStream.cpp index 800ce1bfe..3d908de4f 100644 --- a/mert/FileStream.cpp +++ b/mert/FileStream.cpp @@ -40,28 +40,3 @@ inputfilestream::~inputfilestream() void inputfilestream::close() { } - -outputfilestream::outputfilestream(const std::string &filePath) - : std::ostream(0), m_streambuf(0), m_is_good(false) -{ - // check if file is readable - std::filebuf* fb = new std::filebuf(); - m_is_good = (fb->open(filePath.c_str(), std::ios::out) != NULL); - - if (IsGzipFile(filePath)) { - throw runtime_error("Output to a zipped file not supported!"); - } else { - m_streambuf = fb; - } - this->init(m_streambuf); -} - -outputfilestream::~outputfilestream() -{ - delete m_streambuf; - m_streambuf = 0; -} - -void outputfilestream::close() -{ -} diff --git a/mert/FileStream.h b/mert/FileStream.h index 582cbcb59..8cbf4f591 100644 --- a/mert/FileStream.h +++ b/mert/FileStream.h @@ -22,20 +22,4 @@ public: void close(); }; -class outputfilestream : public std::ostream -{ -protected: - std::streambuf *m_streambuf; - bool m_is_good; - -public: - explicit outputfilestream(const std::string &filePath); - virtual ~outputfilestream(); - - bool good() const { - return m_is_good; - } - void close(); -}; - #endif // MERT_FILE_STREAM_H_ diff --git a/mert/ForestRescoreTest.cpp b/mert/ForestRescoreTest.cpp index 4b62e8317..23668ab20 100644 --- a/mert/ForestRescoreTest.cpp +++ b/mert/ForestRescoreTest.cpp @@ -1,6 +1,9 @@ #include +#include "util/tokenize_piece.hh" + #include "ForestRescore.h" +#include "MiraFeatureVector.h" #define BOOST_TEST_MODULE MertForestRescore #include @@ -10,8 +13,7 @@ using namespace std; using namespace MosesTuning; -BOOST_AUTO_TEST_CASE(viterbi_simple_lattice) -{ +BOOST_AUTO_TEST_CASE(viterbi_simple_lattice) { Vocab vocab; WordVec words; string wordStrings[] = @@ -242,5 +244,101 @@ BOOST_AUTO_TEST_CASE(viterbi_3branch_lattice) BOOST_CHECK_EQUAL(6, hopeHypo.bleuStats[8]); } +BOOST_AUTO_TEST_CASE(viterbi_full_hypergraph) { + Vocab vocab; + //References + ReferenceSet references; + references.AddLine(0,"in addition to EU support for businesses , also the administration of national business support will be concentrated in four Centres for Economic Development , Transport and Environment ( ELY Centres ) , starting from mid @-@ September .",vocab); + //Load the hypergraph + Graph graph(vocab); + util::scoped_fd fd(util::OpenReadOrThrow("mert/hgtest/0.gz")); + util::FilePiece file(fd.release()); + 
ReadGraph(file,graph); + + //prune + SparseVector weights; + weights.set("OpSequenceModel0_1",0.011187); + weights.set("OpSequenceModel0_2",-0.002797); + weights.set("OpSequenceModel0_3",0.002797); + weights.set("OpSequenceModel0_4",-0.000140); + weights.set("OpSequenceModel0_5",0.004195); + weights.set("Distortion0",0.041952); + weights.set("PhrasePenalty0",0.027968); + weights.set("WordPenalty0",-0.139841); + weights.set("UnknownWordPenalty0",1.000000); + weights.set("LM0",0.069920); + weights.set("LexicalReordering0_1",0.041952); + weights.set("LexicalReordering0_2",0.041952); + weights.set("LexicalReordering0_3",0.041952); + weights.set("LexicalReordering0_4",0.041952); + weights.set("LexicalReordering0_5",0.041952); + weights.set("LexicalReordering0_6",0.041952); + weights.set("LexicalReordering0_7",0.041952); + weights.set("LexicalReordering0_8",0.041952); + weights.set("TranslationModel0_1",0.027968); + weights.set("TranslationModel0_2",0.027968); + weights.set("TranslationModel0_3",0.027968); + weights.set("TranslationModel0_4",0.027968); + weights.set("TranslationModel0_5",0.027968); + weights.set("TranslationModel0_6",0.027968); + weights.set("TranslationModel0_7",0.027968); + weights.set("TranslationModel0_8",0.027968); + weights.set("TranslationModel0_9",0.027968); + weights.set("TranslationModel0_10",0.027968); + weights.set("TranslationModel0_11",0.027968); + weights.set("TranslationModel0_12",0.027968); + weights.set("TranslationModel0_13",0.027968); + size_t edgeCount = 500; + boost::shared_ptr prunedGraph; + prunedGraph.reset(new Graph(vocab)); + graph.Prune(prunedGraph.get(), weights, edgeCount); + + vector bg(9); + HgHypothesis bestHypo; + //best hypothesis + Viterbi(*prunedGraph, weights, 0, references, 0, bg, &bestHypo); + //check output as expected + string expectedStr = " the EU matters , but also the national matters management focus since mid @-@ September four ely @-@ centre . 
"; + util::TokenIter expected(expectedStr, util::SingleCharacter(' ')); + for (size_t i = 0; i < bestHypo.text.size(); ++i) { + //cerr << bestHypo.text[i]->first << " "; + BOOST_CHECK_EQUAL(*expected,bestHypo.text[i]->first); + ++expected; + } + BOOST_CHECK(!expected); + //cerr << endl; + //check scores + BOOST_CHECK_CLOSE(-80.062,bestHypo.featureVector.get("OpSequenceModel0_1"), 0.001); + BOOST_CHECK_CLOSE(2,bestHypo.featureVector.get("OpSequenceModel0_2"), 0.001); + BOOST_CHECK_CLOSE(2,bestHypo.featureVector.get("OpSequenceModel0_3"), 0.001); + BOOST_CHECK_CLOSE(3,bestHypo.featureVector.get("OpSequenceModel0_4"), 0.001); + BOOST_CHECK_CLOSE(0,bestHypo.featureVector.get("OpSequenceModel0_5"), 0.001); + BOOST_CHECK_CLOSE(-6,bestHypo.featureVector.get("Distortion0"), 0.001); + BOOST_CHECK_CLOSE(14,bestHypo.featureVector.get("PhrasePenalty0"), 0.001); + BOOST_CHECK_CLOSE(-20,bestHypo.featureVector.get("WordPenalty0"), 0.001); + BOOST_CHECK_CLOSE(-100,bestHypo.featureVector.get("UnknownWordPenalty0"), 0.001); + BOOST_CHECK_CLOSE(-126.616,bestHypo.featureVector.get("LM0"), 0.001); + BOOST_CHECK_CLOSE(-5.2238,bestHypo.featureVector.get("LexicalReordering0_1"), 0.001); + BOOST_CHECK_CLOSE(-0.29515,bestHypo.featureVector.get("LexicalReordering0_2"), 0.001); + BOOST_CHECK_CLOSE(0,bestHypo.featureVector.get("LexicalReordering0_3"), 0.001); + BOOST_CHECK_CLOSE(-0.470004,bestHypo.featureVector.get("LexicalReordering0_4"), 0.001); + BOOST_CHECK_CLOSE(-9.28267,bestHypo.featureVector.get("LexicalReordering0_5"), 0.001); + BOOST_CHECK_CLOSE(-0.470004,bestHypo.featureVector.get("LexicalReordering0_6"), 0.001); + BOOST_CHECK_CLOSE(0,bestHypo.featureVector.get("LexicalReordering0_7"), 0.001); + BOOST_CHECK_CLOSE(-0.402678,bestHypo.featureVector.get("LexicalReordering0_8"), 0.001); + BOOST_CHECK_CLOSE(-54.3119,bestHypo.featureVector.get("TranslationModel0_1"), 0.001); + BOOST_CHECK_CLOSE(-62.2619,bestHypo.featureVector.get("TranslationModel0_2"), 0.001); + BOOST_CHECK_CLOSE(-23.8782,bestHypo.featureVector.get("TranslationModel0_3"), 0.001); + BOOST_CHECK_CLOSE(-25.1626,bestHypo.featureVector.get("TranslationModel0_4"), 0.001); + BOOST_CHECK_CLOSE(12.9986,bestHypo.featureVector.get("TranslationModel0_5"), 0.001); + BOOST_CHECK_CLOSE(3.99959,bestHypo.featureVector.get("TranslationModel0_6"), 0.001); + BOOST_CHECK_CLOSE(1.99979,bestHypo.featureVector.get("TranslationModel0_7"), 0.001); + BOOST_CHECK_CLOSE(1.99979,bestHypo.featureVector.get("TranslationModel0_8"), 0.001); + BOOST_CHECK_CLOSE(0,bestHypo.featureVector.get("TranslationModel0_9"), 0.001); + BOOST_CHECK_CLOSE(0,bestHypo.featureVector.get("TranslationModel0_10"), 0.001); + BOOST_CHECK_CLOSE(0,bestHypo.featureVector.get("TranslationModel0_11"), 0.001); + BOOST_CHECK_CLOSE(0.999896,bestHypo.featureVector.get("TranslationModel0_12"), 0.001); + BOOST_CHECK_CLOSE(7.99917,bestHypo.featureVector.get("TranslationModel0_13"), 0.001); +} diff --git a/mert/MeteorScorer.cpp b/mert/MeteorScorer.cpp index 1254ec95f..f4c7997ee 100644 --- a/mert/MeteorScorer.cpp +++ b/mert/MeteorScorer.cpp @@ -18,6 +18,7 @@ #include "ScoreStats.h" #include "Util.h" +#include "util/unistd.hh" using namespace std; @@ -25,7 +26,7 @@ namespace MosesTuning { // Meteor supported -#if defined(__GLIBCXX__) || defined(__GLIBCPP__) +#if (defined(__GLIBCXX__) || defined(__GLIBCPP__)) && !defined(_WIN32) // for clarity #define CHILD_STDIN_READ pipefds_input[0] diff --git a/mert/Point.cpp b/mert/Point.cpp index 55dc6a6b2..681d3ab3e 100644 --- a/mert/Point.cpp +++ b/mert/Point.cpp @@ -3,6 +3,7 @@ 
#include #include #include "util/exception.hh" +#include "util/random.hh" #include "FeatureStats.h" #include "Optimizer.h" @@ -57,10 +58,8 @@ void Point::Randomize() UTIL_THROW_IF(m_min.size() != Point::m_dim, util::Exception, "Error"); UTIL_THROW_IF(m_max.size() != Point::m_dim, util::Exception, "Error"); - for (unsigned int i = 0; i < size(); i++) { - operator[](i) = m_min[i] + - static_cast(random()) / static_cast(RAND_MAX) * (m_max[i] - m_min[i]); - } + for (unsigned int i = 0; i < size(); i++) + operator[](i) = util::rand_incl(m_min[i], m_max[i]); } double Point::operator*(const FeatureStats& F) const diff --git a/mert/TODO b/mert/TODO index 21b4ce04e..4ceb628d3 100644 --- a/mert/TODO +++ b/mert/TODO @@ -5,11 +5,8 @@ - check that --pairwise-ranked is compatible with all optimization metrics -- Replace the standard rand() currently used in MERT and PRO with better - random generators such as Boost's random generators (e.g., boost::mt19937). - - create a Random class to hide the details, i.e., how to generate - random numbers, which allows us to use custom random generators more - easily. +- Use better random generators in util/random.cc, e.g. boost::mt19937. + - Support plugging of custom random generators. Pros: - In MERT, you might want to use the random restarting technique to avoid diff --git a/mert/TimerTest.cpp b/mert/TimerTest.cpp index d72b1c312..532e44fc1 100644 --- a/mert/TimerTest.cpp +++ b/mert/TimerTest.cpp @@ -11,7 +11,20 @@ using namespace MosesTuning; BOOST_AUTO_TEST_CASE(timer_basic_test) { Timer timer; - const int sleep_time_microsec = 40; // ad-hoc microseconds to pass unit tests. + + // Sleep time. The test will sleep for this number of microseconds, and + // expect the elapsed time to be noticeable. + // Keep this number low to avoid wasting test time sleeping, but at least as + // high as the Boost timer's resolution. Tests must pass consistently, not + // just on lucky runs. +#if defined(WIN32) + // Timer resolution on Windows seems to be a millisecond. Anything less and + // the test fails consistently. + const int sleep_time_microsec = 1000; +#else + // Unix-like systems seem to have more fine-grained clocks. + const int sleep_time_microsec = 40; +#endif timer.start(); BOOST_REQUIRE(timer.is_running()); diff --git a/mert/evaluator.cpp b/mert/evaluator.cpp index 25da9adbc..59ffaf3cd 100644 --- a/mert/evaluator.cpp +++ b/mert/evaluator.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -15,6 +16,7 @@ #include "Timer.h" #include "Util.h" #include "Data.h" +#include "util/random.hh" using namespace std; using namespace MosesTuning; @@ -91,17 +93,15 @@ void EvaluatorUtil::evaluate(const string& candFile, int bootstrap, bool nbest_i if (bootstrap) { vector scores; for (int i = 0; i < bootstrap; ++i) { - // TODO: Use smart pointer for exceptional-safety. 
- ScoreData* scoredata = new ScoreData(g_scorer); + ScoreData scoredata(g_scorer); for (int j = 0; j < n; ++j) { - int randomIndex = random() % n; - scoredata->add(entries[randomIndex], j); + const int randomIndex = util::rand_excl(n); + scoredata.add(entries[randomIndex], j); } - g_scorer->setScoreData(scoredata); + g_scorer->setScoreData(&scoredata); candidates_t candidates(n, 0); float score = g_scorer->score(candidates); scores.push_back(score); - delete scoredata; } float avg = average(scores); @@ -121,15 +121,13 @@ void EvaluatorUtil::evaluate(const string& candFile, int bootstrap, bool nbest_i cout.precision(4); cout << avg << "\t[" << lb << "," << rb << "]" << endl; } else { - // TODO: Use smart pointer for exceptional-safety. - ScoreData* scoredata = new ScoreData(g_scorer); + ScoreData scoredata(g_scorer); for (int sid = 0; sid < n; ++sid) { - scoredata->add(entries[sid], sid); + scoredata.add(entries[sid], sid); } - g_scorer->setScoreData(scoredata); + g_scorer->setScoreData(&scoredata); candidates_t candidates(n, 0); float score = g_scorer->score(candidates); - delete scoredata; if (g_has_more_files) cout << candFile << "\t"; if (g_has_more_scorers) cout << g_scorer->getName() << "\t"; @@ -287,10 +285,10 @@ void InitSeed(const ProgramOption *opt) { if (opt->has_seed) { cerr << "Seeding random numbers with " << opt->seed << endl; - srandom(opt->seed); + util::rand_init(opt->seed); } else { cerr << "Seeding random numbers with system clock " << endl; - srandom(time(NULL)); + util::rand_init(); } } diff --git a/mert/hgtest/0.gz b/mert/hgtest/0.gz new file mode 100644 index 000000000..012f9efbe Binary files /dev/null and b/mert/hgtest/0.gz differ diff --git a/mert/kbmira.cpp b/mert/kbmira.cpp index 5a119e875..092176984 100644 --- a/mert/kbmira.cpp +++ b/mert/kbmira.cpp @@ -40,6 +40,7 @@ de recherches du Canada #include #include "util/exception.hh" +#include "util/random.hh" #include "BleuScorer.h" #include "HopeFearDecoder.h" @@ -122,10 +123,10 @@ int main(int argc, char** argv) if (vm.count("random-seed")) { cerr << "Initialising random seed to " << seed << endl; - srand(seed); + util::rand_init(seed); } else { cerr << "Initialising random seed from system clock" << endl; - srand(time(NULL)); + util::rand_init(); } // Initialize weights diff --git a/mert/mert.cpp b/mert/mert.cpp index 275aa7b09..82b4cc34d 100644 --- a/mert/mert.cpp +++ b/mert/mert.cpp @@ -24,6 +24,7 @@ #include "Types.h" #include "Timer.h" #include "Util.h" +#include "util/random.hh" #include "moses/ThreadPool.h" @@ -289,10 +290,10 @@ int main(int argc, char **argv) if (option.has_seed) { cerr << "Seeding random numbers with " << option.seed << endl; - srandom(option.seed); + util::rand_init(option.seed); } else { cerr << "Seeding random numbers with system clock " << endl; - srandom(time(NULL)); + util::rand_init(); } if (option.sparse_weights_file.size()) ++option.pdim; diff --git a/mert/pro.cpp b/mert/pro.cpp index 7660fe7d0..c0f9f7b57 100644 --- a/mert/pro.cpp +++ b/mert/pro.cpp @@ -43,6 +43,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "ScoreDataIterator.h" #include "BleuScorer.h" #include "Util.h" +#include "util/random.hh" using namespace std; using namespace MosesTuning; @@ -141,10 +142,10 @@ int main(int argc, char** argv) if (vm.count("random-seed")) { cerr << "Initialising random seed to " << seed << endl; - srand(seed); + util::rand_init(seed); } else { cerr << "Initialising random seed from system clock" << endl; - srand(time(NULL)); + util::rand_init(); } 
if (scoreFiles.size() == 0 || featureFiles.size() == 0) { @@ -211,11 +212,11 @@ int main(int argc, char** argv) vector scores; size_t n_translations = hypotheses.size(); for(size_t i=0; i translation1 = hypotheses[rand1]; float bleu1 = smoothedSentenceBleu(scoreDataIters[translation1.first]->operator[](translation1.second), bleuSmoothing, smoothBP); - size_t rand2 = rand() % n_translations; + size_t rand2 = util::rand_excl(n_translations); pair translation2 = hypotheses[rand2]; float bleu2 = smoothedSentenceBleu(scoreDataIters[translation2.first]->operator[](translation2.second), bleuSmoothing, smoothBP); diff --git a/moses-cmd/MainVW.cpp b/moses-cmd/MainVW.cpp index c55b0fe2c..302866733 100644 --- a/moses-cmd/MainVW.cpp +++ b/moses-cmd/MainVW.cpp @@ -45,6 +45,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "moses/FF/StatefulFeatureFunction.h" #include "moses/FF/StatelessFeatureFunction.h" #include "moses/TrainingTask.h" +#include "util/random.hh" #ifdef HAVE_PROTOBUF #include "hypergraph.pb.h" @@ -117,7 +118,7 @@ int main(int argc, char** argv) //initialise random numbers - srand(time(NULL)); + util::rand_init(); // set up read/writing class IFVERBOSE(1) { diff --git a/moses/ExportInterface.cpp b/moses/ExportInterface.cpp index b6a5ec255..3a64ac8ac 100644 --- a/moses/ExportInterface.cpp +++ b/moses/ExportInterface.cpp @@ -27,6 +27,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include #include +#include "util/random.hh" #include "util/usage.hh" #ifdef WIN32 @@ -91,7 +92,7 @@ SimpleTranslationInterface::SimpleTranslationInterface(const string &mosesIni): exit(1); } - srand(time(NULL)); + util::rand_init(); } @@ -185,7 +186,7 @@ batch_run() const StaticData& staticData = StaticData::Instance(); //initialise random numbers - srand(time(NULL)); + util::rand_init(); IFVERBOSE(1) PrintUserTime("Created input-output object"); diff --git a/moses/FF/LexicalReordering/SparseReordering.cpp b/moses/FF/LexicalReordering/SparseReordering.cpp index 040b94988..27e090ccd 100644 --- a/moses/FF/LexicalReordering/SparseReordering.cpp +++ b/moses/FF/LexicalReordering/SparseReordering.cpp @@ -13,8 +13,11 @@ #include "LexicalReordering.h" #include "SparseReordering.h" +#include + using namespace std; +using namespace boost::algorithm; namespace Moses { @@ -57,6 +60,7 @@ const std::string& SparseReorderingFeatureKey::Name (const string& wordListId) SparseReordering::SparseReordering(const map& config, const LexicalReordering* producer) : m_producer(producer) + , m_useWeightMap(false) { static const string kSource= "source"; static const string kTarget = "target"; @@ -80,6 +84,14 @@ SparseReordering::SparseReordering(const map& config, const Lexic } else { UTIL_THROW(util::Exception, "Sparse reordering requires source or target, not " << fields[1]); } + } else if (fields[0] == "weights") { + ReadWeightMap(i->second); + m_useWeightMap = true; + for (int reoType=0; reoType<=LRModel::MAX; ++reoType) { + ostringstream buf; + buf << reoType; + m_featureMap2.push_back(m_producer->GetFeatureName(buf.str())); + } } else if (fields[0] == "phrase") { m_usePhrase = true; @@ -175,7 +187,16 @@ void SparseReordering::AddFeatures( SparseReorderingFeatureKey key(id, type, wordFactor, false, position, side, reoType); FeatureMap::const_iterator fmi = m_featureMap.find(key); assert(fmi != m_featureMap.end()); - scores->SparsePlusEquals(fmi->second, 1.0); + if (m_useWeightMap) { + WeightMap::const_iterator wmi = m_weightMap.find(fmi->second.name()); + if 
(wmi != m_weightMap.end()) { + if (wmi->second != 0) { + scores->SparsePlusEquals(m_featureMap2[reoType], wmi->second); + } + } + } else { + scores->SparsePlusEquals(fmi->second, 1.0); + } } for (size_t id = 0; id < clusterMaps->size(); ++id) { @@ -186,7 +207,16 @@ void SparseReordering::AddFeatures( SparseReorderingFeatureKey key(id, type, clusterIter->second, true, position, side, reoType); FeatureMap::const_iterator fmi = m_featureMap.find(key); assert(fmi != m_featureMap.end()); - scores->SparsePlusEquals(fmi->second, 1.0); + if (m_useWeightMap) { + WeightMap::const_iterator wmi = m_weightMap.find(fmi->second.name()); + if (wmi != m_weightMap.end()) { + if (wmi->second != 0) { + scores->SparsePlusEquals(m_featureMap2[reoType], wmi->second); + } + } + } else { + scores->SparsePlusEquals(fmi->second, 1.0); + } } } @@ -256,5 +286,29 @@ void SparseReordering::CopyScores( } + +void SparseReordering::ReadWeightMap(const string& filename) +{ + util::FilePiece file(filename.c_str()); + StringPiece line; + while (true) { + try { + line = file.ReadLine(); + } catch (const util::EndOfFileException &e) { + break; + } + util::TokenIter lineIter(line,util::SingleCharacter(' ')); + UTIL_THROW_IF2(!lineIter, "Malformed weight line: '" << line << "'"); + const std::string& name = lineIter->as_string(); + ++lineIter; + UTIL_THROW_IF2(!lineIter, "Malformed weight line: '" << line << "'"); + float weight = Moses::Scan(lineIter->as_string()); + + std::pair< WeightMap::iterator, bool> inserted = m_weightMap.insert( std::make_pair(name, weight) ); + UTIL_THROW_IF2(!inserted.second, "Duplicate weight: '" << name << "'"); + } +} + + } //namespace diff --git a/moses/FF/LexicalReordering/SparseReordering.h b/moses/FF/LexicalReordering/SparseReordering.h index 8a2495ce8..958ce998b 100644 --- a/moses/FF/LexicalReordering/SparseReordering.h +++ b/moses/FF/LexicalReordering/SparseReordering.h @@ -112,10 +112,16 @@ private: typedef boost::unordered_map FeatureMap; FeatureMap m_featureMap; + typedef boost::unordered_map WeightMap; + WeightMap m_weightMap; + bool m_useWeightMap; + std::vector m_featureMap2; + void ReadWordList(const std::string& filename, const std::string& id, SparseReorderingFeatureKey::Side side, std::vector* pWordLists); void ReadClusterMap(const std::string& filename, const std::string& id, SparseReorderingFeatureKey::Side side, std::vector* pClusterMaps); void PreCalculateFeatureNames(size_t index, const std::string& id, SparseReorderingFeatureKey::Side side, const Factor* factor, bool isCluster); + void ReadWeightMap(const std::string& filename); void AddFeatures( SparseReorderingFeatureKey::Type type, SparseReorderingFeatureKey::Side side, diff --git a/moses/FF/VW/VW.h b/moses/FF/VW/VW.h index 6bdb1416c..dd9d0b858 100644 --- a/moses/FF/VW/VW.h +++ b/moses/FF/VW/VW.h @@ -86,6 +86,10 @@ struct VWTargetSentence { int src = it->first; int tgt = it->second; + if (src >= m_sourceConstraints.size() || tgt >= m_targetConstraints.size()) { + UTIL_THROW2("VW :: alignment point out of bounds: " << src << "-" << tgt); + } + m_sourceConstraints[src].Update(tgt); m_targetConstraints[tgt].Update(src); } diff --git a/moses/HypergraphOutput.cpp b/moses/HypergraphOutput.cpp index 47c564882..6b353a83b 100644 --- a/moses/HypergraphOutput.cpp +++ b/moses/HypergraphOutput.cpp @@ -98,6 +98,7 @@ HypergraphOutput::HypergraphOutput(size_t precision) : // If this line gives you compile errors, // contact Lane Schwartz on the Moses mailing list m_hypergraphDir = nbestPath.parent_path().string(); + if 
(m_hypergraphDir.empty()) m_hypergraphDir="."; } else { stringstream hypergraphDirName; diff --git a/moses/LM/Remote.cpp b/moses/LM/Remote.cpp index af02a6617..33946442a 100644 --- a/moses/LM/Remote.cpp +++ b/moses/LM/Remote.cpp @@ -1,14 +1,15 @@ #include #include +#include #include #include -#include -#include -#include -#include #include "Remote.h" #include "moses/Factor.h" +#if !defined(_WIN32) && !defined(_WIN64) +#include +#endif + namespace Moses { @@ -41,12 +42,16 @@ bool LanguageModelRemote::start(const std::string& host, int port) sock = socket(AF_INET, SOCK_STREAM, 0); hp = gethostbyname(host.c_str()); if (hp==NULL) { +#if defined(_WIN32) || defined(_WIN64) + fprintf(stderr, "gethostbyname failed\n"); +#else herror("gethostbyname failed"); +#endif exit(1); } - bzero((char *)&server, sizeof(server)); - bcopy(hp->h_addr, (char *)&server.sin_addr, hp->h_length); + memset(&server, '\0', sizeof(server)); + memcpy((char *)&server.sin_addr, hp->h_addr, hp->h_length); server.sin_family = hp->h_addrtype; server.sin_port = htons(port); diff --git a/moses/LM/Remote.h b/moses/LM/Remote.h index d50e3e9b4..b7a72d853 100644 --- a/moses/LM/Remote.h +++ b/moses/LM/Remote.h @@ -4,9 +4,15 @@ #include "SingleFactor.h" #include "moses/TypeDef.h" #include "moses/Factor.h" -#include #include + +#if defined(_WIN32) || defined(_WIN64) +#include +#else +#include #include +#include +#endif namespace Moses { diff --git a/moses/Manager.cpp b/moses/Manager.cpp index cb91a9d29..a936fa7c7 100644 --- a/moses/Manager.cpp +++ b/moses/Manager.cpp @@ -55,6 +55,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #endif #include "util/exception.hh" +#include "util/random.hh" using namespace std; @@ -426,7 +427,7 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const //cerr << endl; //draw the sample - float frandom = log((float)rand()/RAND_MAX); + const float frandom = log(util::rand_incl(0.0f, 1.0f)); size_t position = 1; float sum = candidateScores[0]; for (; position < candidateScores.size() && sum < frandom; ++position) { @@ -1645,7 +1646,7 @@ void Manager::OutputNBest(std::ostream& out out << " |||"; // print scores with feature names - path.GetScoreBreakdown().OutputAllFeatureScores(out ); + path.GetScoreBreakdown()->OutputAllFeatureScores(out); // total out << " ||| " << path.GetTotalScore(); diff --git a/moses/Parameter.cpp b/moses/Parameter.cpp index 98ed1f439..3c21a6725 100644 --- a/moses/Parameter.cpp +++ b/moses/Parameter.cpp @@ -31,6 +31,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "InputFileStream.h" #include "StaticData.h" #include "util/exception.hh" +#include "util/random.hh" #include @@ -1393,7 +1394,7 @@ struct Credit { this->contact = contact ; this->currentPursuits = currentPursuits ; this->areaResponsibility = areaResponsibility; - this->sortId = rand() % 1000; + this->sortId = util::rand_excl(1000); } bool operator<(const Credit &other) const { diff --git a/moses/Syntax/F2S/HyperTreeLoader.cpp b/moses/Syntax/F2S/HyperTreeLoader.cpp index f3caa2cec..bd19cbace 100644 --- a/moses/Syntax/F2S/HyperTreeLoader.cpp +++ b/moses/Syntax/F2S/HyperTreeLoader.cpp @@ -40,12 +40,12 @@ bool HyperTreeLoader::Load(const std::vector &input, const std::vector &output, const std::string &inFile, const RuleTableFF &ff, - HyperTree &trie) + HyperTree &trie, + boost::unordered_set &sourceTermSet) { PrintUserTime(std::string("Start loading HyperTree")); - // const StaticData &staticData = StaticData::Instance(); - // const 
std::string &factorDelimiter = staticData.GetFactorDelimiter(); + sourceTermSet.clear(); std::size_t count = 0; @@ -106,6 +106,7 @@ bool HyperTreeLoader::Load(const std::vector &input, // Source-side HyperPath sourceFragment; hyperPathLoader.Load(sourceString, sourceFragment); + ExtractSourceTerminalSetFromHyperPath(sourceFragment, sourceTermSet); // Target-side TargetPhrase *targetPhrase = new TargetPhrase(&ff); @@ -144,6 +145,23 @@ bool HyperTreeLoader::Load(const std::vector &input, return true; } +void HyperTreeLoader::ExtractSourceTerminalSetFromHyperPath( + const HyperPath &hp, boost::unordered_set &sourceTerminalSet) +{ + for (std::vector::const_iterator p = hp.nodeSeqs.begin(); + p != hp.nodeSeqs.end(); ++p) { + for (std::vector::const_iterator q = p->begin(); + q != p->end(); ++q) { + const std::size_t factorId = *q; + if (factorId >= moses_MaxNumNonterminals && + factorId != HyperPath::kComma && + factorId != HyperPath::kEpsilon) { + sourceTerminalSet.insert(factorId); + } + } + } +} + } // namespace F2S } // namespace Syntax } // namespace Moses diff --git a/moses/Syntax/F2S/HyperTreeLoader.h b/moses/Syntax/F2S/HyperTreeLoader.h index ea009022d..088c7eaf5 100644 --- a/moses/Syntax/F2S/HyperTreeLoader.h +++ b/moses/Syntax/F2S/HyperTreeLoader.h @@ -3,9 +3,12 @@ #include #include +#include + #include "moses/TypeDef.h" #include "moses/Syntax/RuleTableFF.h" +#include "HyperPath.h" #include "HyperTree.h" #include "HyperTreeCreator.h" @@ -23,7 +26,12 @@ public: const std::vector &output, const std::string &inFile, const RuleTableFF &, - HyperTree &); + HyperTree &, + boost::unordered_set &); + +private: + void ExtractSourceTerminalSetFromHyperPath( + const HyperPath &, boost::unordered_set &); }; } // namespace F2S diff --git a/moses/Syntax/F2S/Manager-inl.h b/moses/Syntax/F2S/Manager-inl.h index 6c289440c..3aedc640e 100644 --- a/moses/Syntax/F2S/Manager-inl.h +++ b/moses/Syntax/F2S/Manager-inl.h @@ -39,6 +39,7 @@ Manager::Manager(ttasksptr const& ttask) if (const ForestInput *p = dynamic_cast(&m_source)) { m_forest = p->GetForest(); m_rootVertex = p->GetRootVertex(); + m_sentenceLength = p->GetSize(); } else if (const TreeInput *p = dynamic_cast(&m_source)) { T2S::InputTreeBuilder builder; T2S::InputTree tmpTree; @@ -46,6 +47,7 @@ Manager::Manager(ttasksptr const& ttask) boost::shared_ptr forest = boost::make_shared(); m_rootVertex = T2S::InputTreeToForest(tmpTree, *forest); m_forest = forest; + m_sentenceLength = p->GetSize(); } else { UTIL_THROW2("ERROR: F2S::Manager requires input to be a tree or forest"); } @@ -83,8 +85,13 @@ void Manager::Decode() p = sortedVertices.begin(); p != sortedVertices.end(); ++p) { const Forest::Vertex &vertex = **p; - // Skip terminal vertices. + // Skip terminal vertices (after checking if they are OOVs). 
if (vertex.incoming.empty()) { + if (vertex.pvertex.span.GetStartPos() > 0 && + vertex.pvertex.span.GetEndPos() < m_sentenceLength-1 && + IsUnknownSourceWord(vertex.pvertex.symbol)) { + m_oovs.insert(vertex.pvertex.symbol); + } continue; } @@ -190,6 +197,21 @@ void Manager::InitializeStacks() } } +template +bool Manager::IsUnknownSourceWord(const Word &w) const +{ + const std::size_t factorId = w[0]->GetId(); + const std::vector &ffs = RuleTableFF::Instances(); + for (std::size_t i = 0; i < ffs.size(); ++i) { + RuleTableFF *ff = ffs[i]; + const boost::unordered_set &sourceTerms = + ff->GetSourceTerminalSet(); + if (sourceTerms.find(factorId) != sourceTerms.end()) { + return false; + } + } + return true; +} template const SHyperedge *Manager::GetBestSHyperedge() const diff --git a/moses/Syntax/F2S/Manager.h b/moses/Syntax/F2S/Manager.h index 53f4cff13..44128ad65 100644 --- a/moses/Syntax/F2S/Manager.h +++ b/moses/Syntax/F2S/Manager.h @@ -50,10 +50,13 @@ private: void InitializeStacks(); + bool IsUnknownSourceWord(const Word &) const; + void RecombineAndSort(const std::vector &, SVertexStack &); boost::shared_ptr m_forest; const Forest::Vertex *m_rootVertex; + std::size_t m_sentenceLength; // Includes and PVertexToStackMap m_stackMap; boost::shared_ptr m_glueRuleTrie; std::vector > m_mainRuleMatchers; diff --git a/moses/Syntax/RuleTableFF.cpp b/moses/Syntax/RuleTableFF.cpp index f4e06f489..37063e048 100644 --- a/moses/Syntax/RuleTableFF.cpp +++ b/moses/Syntax/RuleTableFF.cpp @@ -35,7 +35,8 @@ void RuleTableFF::Load() staticData.GetSearchAlgorithm() == SyntaxT2S) { F2S::HyperTree *trie = new F2S::HyperTree(this); F2S::HyperTreeLoader loader; - loader.Load(m_input, m_output, m_filePath, *this, *trie); + loader.Load(m_input, m_output, m_filePath, *this, *trie, + m_sourceTerminalSet); m_table = trie; } else if (staticData.GetSearchAlgorithm() == SyntaxS2T) { S2TParsingAlgorithm algorithm = staticData.GetS2TParsingAlgorithm(); diff --git a/moses/Syntax/RuleTableFF.h b/moses/Syntax/RuleTableFF.h index 4d6132e86..25e7d8428 100644 --- a/moses/Syntax/RuleTableFF.h +++ b/moses/Syntax/RuleTableFF.h @@ -43,10 +43,17 @@ public: return 0; } + // Get the source terminal vocabulary for this table's grammar (as a set of + // factor IDs) + const boost::unordered_set &GetSourceTerminalSet() const { + return m_sourceTerminalSet; + } + private: static std::vector s_instances; const RuleTable *m_table; + boost::unordered_set m_sourceTerminalSet; }; } // Syntax diff --git a/moses/TranslationModel/CompactPT/MmapAllocator.h b/moses/TranslationModel/CompactPT/MmapAllocator.h index bf08574ff..389b60359 100644 --- a/moses/TranslationModel/CompactPT/MmapAllocator.h +++ b/moses/TranslationModel/CompactPT/MmapAllocator.h @@ -24,14 +24,18 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include #include -#include #include #include -#ifndef __MMAN_PAGE_SIZE__ -#define __MMAN_PAGE_SIZE__ sysconf(_SC_PAGE_SIZE) +#if defined(_WIN32) || defined(_WIN64) +#include +#include +#else +#include #endif +#include "util/mmap.hh" + namespace Moses { template @@ -60,25 +64,25 @@ public: MmapAllocator() throw() : m_file_ptr(std::tmpfile()), m_file_desc(fileno(m_file_ptr)), - m_page_size(__MMAN_PAGE_SIZE__), m_map_size(0), m_data_ptr(0), + m_page_size(util::SizePage()), m_map_size(0), m_data_ptr(0), m_data_offset(0), m_fixed(false), m_count(new size_t(0)) { } MmapAllocator(std::FILE* f_ptr) throw() : m_file_ptr(f_ptr), m_file_desc(fileno(m_file_ptr)), - m_page_size(__MMAN_PAGE_SIZE__), m_map_size(0), 
m_data_ptr(0), + m_page_size(util::SizePage()), m_map_size(0), m_data_ptr(0), m_data_offset(0), m_fixed(false), m_count(new size_t(0)) { } MmapAllocator(std::FILE* f_ptr, size_t data_offset) throw() : m_file_ptr(f_ptr), m_file_desc(fileno(m_file_ptr)), - m_page_size(__MMAN_PAGE_SIZE__), m_map_size(0), m_data_ptr(0), + m_page_size(util::SizePage()), m_map_size(0), m_data_ptr(0), m_data_offset(data_offset), m_fixed(true), m_count(new size_t(0)) { } MmapAllocator(std::string fileName) throw() : m_file_ptr(std::fopen(fileName.c_str(), "wb+")), m_file_desc(fileno(m_file_ptr)), - m_page_size(__MMAN_PAGE_SIZE__), m_map_size(0), m_data_ptr(0), + m_page_size(util::SizePage()), m_map_size(0), m_data_ptr(0), m_data_offset(0), m_fixed(false), m_count(new size_t(0)) { } @@ -92,7 +96,7 @@ public: ~MmapAllocator() throw() { if(m_data_ptr && *m_count == 0) { - munmap(m_data_ptr, m_map_size); + util::UnmapOrThrow(m_data_ptr, m_map_size); if(!m_fixed && std::ftell(m_file_ptr) != -1) std::fclose(m_file_ptr); } @@ -119,13 +123,17 @@ public: pointer allocate (size_type num, const void* = 0) { m_map_size = num * sizeof(T); +#if defined(_WIN32) || defined(_WIN64) + // On Windows, MAP_SHARED is not defined and MapOrThrow ignores the flags. + const int map_shared = 0; +#else + const int map_shared = MAP_SHARED; +#endif if(!m_fixed) { size_t read = 0; read += ftruncate(m_file_desc, m_map_size); - m_data_ptr = (char*)mmap(0, m_map_size, PROT_READ|PROT_WRITE, MAP_SHARED, - m_file_desc, 0); - if(m_data_ptr == MAP_FAILED) - std::cerr << "Error: mmapping" << std::endl; + m_data_ptr = (char *)util::MapOrThrow( + m_map_size, true, map_shared, false, m_file_desc, 0); return (pointer)m_data_ptr; } else { size_t map_offset = (m_data_offset / m_page_size) * m_page_size; @@ -133,8 +141,8 @@ public: size_t map_size = m_map_size + relative_offset; - m_data_ptr = (char*)mmap(0, map_size, PROT_READ, MAP_SHARED, - m_file_desc, map_offset); + m_data_ptr = (char *)util::MapOrThrow( + m_map_size, false, map_shared, false, m_file_desc, map_offset); return (pointer)(m_data_ptr + relative_offset); } @@ -142,11 +150,11 @@ public: void deallocate (pointer p, size_type num) { if(!m_fixed) { - munmap(p, num * sizeof(T)); + util::UnmapOrThrow(p, num * sizeof(T)); } else { size_t map_offset = (m_data_offset / m_page_size) * m_page_size; size_t relative_offset = m_data_offset - map_offset; - munmap((pointer)((char*)p - relative_offset), num * sizeof(T)); + util::UnmapOrThrow((pointer)((char*)p - relative_offset), num * sizeof(T)); } } diff --git a/moses/TranslationModel/DynSAInclude/FileHandler.cpp b/moses/TranslationModel/DynSAInclude/FileHandler.cpp index 9413ffd7c..ecde3c644 100644 --- a/moses/TranslationModel/DynSAInclude/FileHandler.cpp +++ b/moses/TranslationModel/DynSAInclude/FileHandler.cpp @@ -1,7 +1,9 @@ #include "FileHandler.h" #include -#ifdef WIN32 +// Workaround: plain Windows does not have popen()/pclose(). +// (MinGW already #define's them, so skip the workaround there.) 
+#if defined(WIN32) && !defined(__MINGW32__) #define popen(A, B) _popen(A, B) #define pclose(A) _pclose(A) #endif diff --git a/moses/TranslationModel/DynSAInclude/hash.h b/moses/TranslationModel/DynSAInclude/hash.h index 8536c46f5..4cf69bf2f 100644 --- a/moses/TranslationModel/DynSAInclude/hash.h +++ b/moses/TranslationModel/DynSAInclude/hash.h @@ -6,6 +6,7 @@ #include "utils.h" #include "FileHandler.h" #include "util/exception.hh" +#include "util/random.hh" using namespace Moses; typedef uint64_t P; // largest input range is 2^64 @@ -162,7 +163,7 @@ void Hash_shiftAddXOR::initSeeds() { v_ = new T[this->H_]; for(count_t i=0; i < this->H_; i++) - v_[i] = Utils::rand() + 1; + v_[i] = util::wide_rand() + 1; } template T Hash_shiftAddXOR::hash(const char* s, count_t h) @@ -187,9 +188,8 @@ void UnivHash_tableXOR::initSeeds() // fill with random values for(count_t j=0; j < this->H_; j++) { table_[j] = new T[tblLen_]; - for(count_t i=0; i < tblLen_; i++) { - table_[j][i] = Utils::rand(this->m_-1); - } + for(count_t i=0; i < tblLen_; i++) + table_[j][i] = util::wide_rand_excl(this->m_-1); } } template @@ -218,7 +218,7 @@ void UnivHash_noPrimes::initSeeds() { a_ = new P[this->H_]; for(T i=0; i < this->H_; i++) { - a_[i] = Utils::rand

<P>();
+    a_[i] = util::wide_rand<P>
(); if(a_[i] % 2 == 0) a_[i]++; // a must be odd } } @@ -284,8 +284,8 @@ void UnivHash_linear::initSeeds() a_[i] = new T[MAX_NGRAM_ORDER]; b_[i] = new T[MAX_NGRAM_ORDER]; for(count_t j=0; j < MAX_NGRAM_ORDER; j++) { - a_[i][j] = 1 + Utils::rand(); - b_[i][j] = Utils::rand(); + a_[i][j] = 1 + util::wide_rand(); + b_[i][j] = util::wide_rand(); } } } diff --git a/moses/TranslationModel/DynSAInclude/onlineRLM.h b/moses/TranslationModel/DynSAInclude/onlineRLM.h index 1d3f66eac..050e016c9 100644 --- a/moses/TranslationModel/DynSAInclude/onlineRLM.h +++ b/moses/TranslationModel/DynSAInclude/onlineRLM.h @@ -302,7 +302,8 @@ float OnlineRLM::getProb(const wordID_t* ngram, int len, } while(num_fnd > 1) { // get lower order count //get sub-context of size one less than length found (exluding target) - if(((den_val = query(&ngram[len - num_fnd], num_fnd - 1)) > 0) && + den_val = query(&ngram[len - num_fnd], num_fnd - 1); + if((den_val > 0) && (den_val >= in[len - num_fnd]) && (in[len - num_fnd] > 0)) { break; } else --num_fnd; // else backoff to lower ngram order diff --git a/moses/TranslationModel/DynSAInclude/utils.h b/moses/TranslationModel/DynSAInclude/utils.h index e2f24fd4f..485e4a065 100644 --- a/moses/TranslationModel/DynSAInclude/utils.h +++ b/moses/TranslationModel/DynSAInclude/utils.h @@ -62,22 +62,6 @@ public: str[i] = tolower(str[i]); } } - // TODO: interface with decent PRG - template - static T rand(T mod_bnd = 0) { - T random = 0; - if(sizeof(T) <= 4) { - random = static_cast(std::rand()); - } else if(sizeof(T) == 8) { - random = static_cast(std::rand()); - random <<= 31; - random <<= 1; - random |= static_cast(std::rand()); - } - if(mod_bnd != 0) - return random % mod_bnd; - else return random; - } }; #endif diff --git a/moses/TranslationModel/DynSuffixArray.cpp b/moses/TranslationModel/DynSuffixArray.cpp index 3e8c79c0e..c1dc62f12 100644 --- a/moses/TranslationModel/DynSuffixArray.cpp +++ b/moses/TranslationModel/DynSuffixArray.cpp @@ -1,4 +1,6 @@ #include "DynSuffixArray.h" +#include "util/random.hh" + #include #include @@ -315,33 +317,31 @@ int DynSuffixArray::Compare(int pos1, int pos2, int max) return 0; } +namespace +{ +/// Helper: swap two entries in an int array. 
+inline void swap_ints(int array[], int one, int other) +{ + const int tmp = array[one]; + array[one] = array[other]; + array[other] = tmp; +} +} + void DynSuffixArray::Qsort(int* array, int begin, int end) { if(end > begin) { - int index; + int index = util::rand_incl(begin, end); { - index = begin + (rand() % (end - begin + 1)); - int pivot = array[index]; - { - int tmp = array[index]; - array[index] = array[end]; - array[end] = tmp; - } + const int pivot = array[index]; + swap_ints(array, index, end); for(int i=index=begin; i < end; ++i) { if (Compare(array[i], pivot, 20) <= 0) { - { - int tmp = array[index]; - array[index] = array[i]; - array[i] = tmp; - index++; - } + swap_ints(array, index, i); + index++; } } - { - int tmp = array[index]; - array[index] = array[end]; - array[end] = tmp; - } + swap_ints(array, index, end); } Qsort(array, begin, index - 1); Qsort(array, index + 1, end); diff --git a/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp b/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp index c632f9ff2..773e027cc 100644 --- a/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp +++ b/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp @@ -17,6 +17,7 @@ License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ***********************************************************************/ #include "util/exception.hh" +#include "util/tokenize.hh" #include "moses/TranslationModel/PhraseDictionaryMultiModelCounts.h" using namespace std; @@ -30,29 +31,6 @@ void OutputVec(const vector &vec) cerr << endl; } -// from phrase-extract/tables-core.cpp -inline vector tokenize( const char* input ) -{ - vector< string > token; - bool betweenWords = true; - int start=0; - int i=0; - for(; input[i] != '\0'; i++) { - bool isSpace = (input[i] == ' ' || input[i] == '\t'); - - if (!isSpace && betweenWords) { - start = i; - betweenWords = false; - } else if (isSpace && !betweenWords) { - token.push_back( string( input+start, i-start ) ); - betweenWords = true; - } - } - if (!betweenWords) - token.push_back( string( input+start, i-start ) ); - return token; -} - namespace Moses { @@ -464,7 +442,7 @@ void PhraseDictionaryMultiModelCounts::LoadLexicalTable( string &fileName, lexic i++; if (i%100000 == 0) cerr << "." 
<< flush; - vector token = tokenize( line.c_str() ); + const vector token = util::tokenize( line ); if (token.size() != 4) { cerr << "line " << i << " in " << fileName << " has wrong number of tokens, skipping:\n" diff --git a/moses/TranslationModel/PhraseDictionaryTransliteration.cpp b/moses/TranslationModel/PhraseDictionaryTransliteration.cpp index 69b7e9f5f..1d654f4b0 100644 --- a/moses/TranslationModel/PhraseDictionaryTransliteration.cpp +++ b/moses/TranslationModel/PhraseDictionaryTransliteration.cpp @@ -1,11 +1,11 @@ // vim:tabstop=2 #include -#include #include "PhraseDictionaryTransliteration.h" #include "moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.h" #include "moses/DecodeGraph.h" #include "moses/DecodeStep.h" +#include "util/tempfile.hh" using namespace std; @@ -70,11 +70,10 @@ void PhraseDictionaryTransliteration::GetTargetPhraseCollection(InputPath &input inputPath.SetTargetPhrases(*this, tpColl, NULL); } else { // TRANSLITERATE - const boost::filesystem::path - inFile = boost::filesystem::unique_path(), - outDir = boost::filesystem::unique_path(); + const util::temp_file inFile; + const util::temp_dir outDir; - ofstream inStream(inFile.c_str()); + ofstream inStream(inFile.path().c_str()); inStream << sourcePhrase.ToString() << endl; inStream.close(); @@ -84,14 +83,14 @@ void PhraseDictionaryTransliteration::GetTargetPhraseCollection(InputPath &input " --external-bin-dir " + m_externalDir + " --input-extension " + m_inputLang + " --output-extension " + m_outputLang + - " --oov-file " + inFile.native() + - " --out-dir " + outDir.native(); + " --oov-file " + inFile.path() + + " --out-dir " + outDir.path(); int ret = system(cmd.c_str()); UTIL_THROW_IF2(ret != 0, "Transliteration script error"); TargetPhraseCollection *tpColl = new TargetPhraseCollection(); - vector targetPhrases = CreateTargetPhrases(sourcePhrase, outDir.native()); + vector targetPhrases = CreateTargetPhrases(sourcePhrase, outDir.path()); vector::const_iterator iter; for (iter = targetPhrases.begin(); iter != targetPhrases.end(); ++iter) { TargetPhrase *tp = *iter; @@ -102,10 +101,6 @@ void PhraseDictionaryTransliteration::GetTargetPhraseCollection(InputPath &input cache[hash] = value; inputPath.SetTargetPhrases(*this, tpColl, NULL); - - // clean up temporary files - remove(inFile.c_str()); - boost::filesystem::remove_all(outDir); } } diff --git a/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp b/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp index 1ca9dce67..9135b7e73 100644 --- a/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp +++ b/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp @@ -45,6 +45,7 @@ #include "moses/TranslationModel/fuzzy-match/SentenceAlignment.h" #include "util/file.hh" #include "util/exception.hh" +#include "util/random.hh" using namespace std; @@ -62,8 +63,8 @@ char *mkdtemp(char *tempbuf) return NULL; } - srand((unsigned)time(0)); - rand_value = (int)((rand() / ((double)RAND_MAX+1.0)) * 1e6); + util::rand_init(); + rand_value = util::rand_excl(1e6); tempbase = strrchr(tempbuf, '/'); tempbase = tempbase ? 
tempbase+1 : tempbuf; strcpy(tempbasebuf, tempbase); @@ -130,10 +131,6 @@ int removedirectoryrecursively(const char *dirname) struct dirent *entry; char path[PATH_MAX]; - if (path == NULL) { - fprintf(stderr, "Out of memory error\n"); - return 0; - } dir = opendir(dirname); if (dir == NULL) { perror("Error opendir()"); diff --git a/moses/TranslationModel/UG/generic/sampling/Sampling.h b/moses/TranslationModel/UG/generic/sampling/Sampling.h index c60953d5d..652e532bc 100644 --- a/moses/TranslationModel/UG/generic/sampling/Sampling.h +++ b/moses/TranslationModel/UG/generic/sampling/Sampling.h @@ -2,19 +2,16 @@ #define __sampling_h #include #include + +#include "util/random.hh" + // Utility functions for proper sub-sampling. // (c) 2007-2012 Ulrich Germann namespace Moses { - using namespace std; -inline -size_t -randInt(size_t N) -{ - return N*(rand()/(RAND_MAX+1.)); -} +using namespace std; // select a random sample of size /s/ without restitution from the range of // integers [0,N); @@ -35,15 +32,15 @@ randomSample(vector& v, size_t s, size_t N) if (s*10 check(N,0); for (size_t i = 0; i < v.size(); i++) { - size_t x = randInt(N); - while (check[x]) x = randInt(N); + size_t x = util::rand_excl(N); + while (check[x]) x = util::rand_excl(N); check[x]=true; v[i] = x; } } else { size_t m=0; for (size_t t = 0; m <= s && t < N; t++) - if (s==N || randInt(N-t) < s-m) v[m++] = t; + if (s==N || util::rand_excl(N-t) < s-m) v[m++] = t; } } diff --git a/moses/TranslationModel/UG/mm/ug_mmbitext.cc b/moses/TranslationModel/UG/mm/ug_mmbitext.cc index 8f1a4aa12..2c00665bb 100644 --- a/moses/TranslationModel/UG/mm/ug_mmbitext.cc +++ b/moses/TranslationModel/UG/mm/ug_mmbitext.cc @@ -345,7 +345,7 @@ // { // boost::lock_guard lock(stats->lock); // if (stats->raw_cnt == ctr) ++stats->raw_cnt; -// size_t rnum = randInt(stats->raw_cnt - ctr++); +// size_t rnum = util::rand_excl(stats->raw_cnt - ctr++); // // cout << stats->raw_cnt << " " << ctr-1 << " " // // << rnum << " " << max_samples - stats->good << endl; // if (rnum < max_samples - stats->good) diff --git a/moses/TranslationModel/UG/mm/ug_tsa_array_entry.h b/moses/TranslationModel/UG/mm/ug_tsa_array_entry.h index fc4b9f0ad..034a74bd9 100644 --- a/moses/TranslationModel/UG/mm/ug_tsa_array_entry.h +++ b/moses/TranslationModel/UG/mm/ug_tsa_array_entry.h @@ -69,7 +69,7 @@ namespace ugdiss // while (chosen < samplesize && next < stop) // { // root->readEntry(next,*this); - // if (randInt(N - sampled++) < samplesize - chosen) + // if (util::rand_excl(N - sampled++) < samplesize - chosen) // { // ++chosen; // return true; diff --git a/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h b/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h index 096739fe9..508f09304 100644 --- a/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h +++ b/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h @@ -9,6 +9,7 @@ #include #include "util/exception.hh" #include "moses/Util.h" +#include "util/random.hh" //#include // #include "ug_bv_iter.h" @@ -896,13 +897,6 @@ namespace ugdiss return bv; } - inline - size_t - randInt(size_t N) - { - return size_t(N*(rand()/(RAND_MAX+1.))); - } - /// randomly select up to N occurrences of the sequence template sptr > @@ -924,8 +918,8 @@ namespace ugdiss root->readEntry(I.next,I); // t: expected number of remaining samples - double t = (stop - I.pos)/root->aveIndexEntrySize(); - double r = t*rand()/(RAND_MAX+1.); + const double t = (stop - I.pos)/root->aveIndexEntrySize(); + const double r = util::rand_excl(t); if (r < N-m) { ret->at(m).offset = 
I.offset; diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp index af1053438..83b3db6a3 100644 --- a/moses/TranslationModel/UG/mmsapt.cpp +++ b/moses/TranslationModel/UG/mmsapt.cpp @@ -16,7 +16,7 @@ namespace Moses { using namespace bitext; using namespace std; - // using namespace boost; + using namespace boost; void fillIdSeq(Phrase const& mophrase, size_t const ifactor, @@ -155,6 +155,10 @@ namespace Moses input_factor = atoi(param.insert(dflt).first->second.c_str()); // shouldn't that be a string? + dflt = pair ("output-factor","0"); + output_factor = atoi(param.insert(dflt).first->second.c_str()); + ofactor.assign(1,output_factor); + dflt = pair ("smooth",".01"); m_lbop_conf = atof(param.insert(dflt).first->second.c_str()); diff --git a/moses/TrellisPath.cpp b/moses/TrellisPath.cpp index e76adc2db..36397e006 100644 --- a/moses/TrellisPath.cpp +++ b/moses/TrellisPath.cpp @@ -31,7 +31,6 @@ namespace Moses TrellisPath::TrellisPath(const Hypothesis *hypo) : m_prevEdgeChanged(NOT_FOUND) { - m_scoreBreakdown = hypo->GetScoreBreakdown(); m_totalScore = hypo->GetTotalScore(); // enumerate path using prevHypo @@ -41,10 +40,9 @@ TrellisPath::TrellisPath(const Hypothesis *hypo) } } -void TrellisPath::InitScore() +void TrellisPath::InitTotalScore() { m_totalScore = m_path[0]->GetWinningHypo()->GetTotalScore(); - m_scoreBreakdown= m_path[0]->GetWinningHypo()->GetScoreBreakdown(); //calc score size_t sizePath = m_path.size(); @@ -53,12 +51,8 @@ void TrellisPath::InitScore() const Hypothesis *winningHypo = hypo->GetWinningHypo(); if (hypo != winningHypo) { m_totalScore = m_totalScore - winningHypo->GetTotalScore() + hypo->GetTotalScore(); - m_scoreBreakdown.MinusEquals(winningHypo->GetScoreBreakdown()); - m_scoreBreakdown.PlusEquals(hypo->GetScoreBreakdown()); } } - - } TrellisPath::TrellisPath(const TrellisPath ©, size_t edgeIndex, const Hypothesis *arc) @@ -80,7 +74,7 @@ TrellisPath::TrellisPath(const TrellisPath ©, size_t edgeIndex, const Hypoth prevHypo = prevHypo->GetPrevHypo(); } - InitScore(); + InitTotalScore(); } TrellisPath::TrellisPath(const vector edges) @@ -88,9 +82,7 @@ TrellisPath::TrellisPath(const vector edges) { m_path.resize(edges.size()); copy(edges.rbegin(),edges.rend(),m_path.begin()); - InitScore(); - - + InitTotalScore(); } @@ -172,6 +164,32 @@ void TrellisPath::CreateDeviantPaths(TrellisPathList &pathColl) const } } +const boost::shared_ptr TrellisPath::GetScoreBreakdown() const +{ + if (!m_scoreBreakdown) { + float totalScore = m_path[0]->GetWinningHypo()->GetTotalScore(); // calculated for sanity check only + + m_scoreBreakdown = boost::shared_ptr(new ScoreComponentCollection()); + m_scoreBreakdown->PlusEquals(ScoreComponentCollection(m_path[0]->GetWinningHypo()->GetScoreBreakdown())); + + //calc score + size_t sizePath = m_path.size(); + for (size_t pos = 0 ; pos < sizePath ; pos++) { + const Hypothesis *hypo = m_path[pos]; + const Hypothesis *winningHypo = hypo->GetWinningHypo(); + if (hypo != winningHypo) { + totalScore = totalScore - winningHypo->GetTotalScore() + hypo->GetTotalScore(); + m_scoreBreakdown->MinusEquals(winningHypo->GetScoreBreakdown()); + m_scoreBreakdown->PlusEquals(hypo->GetScoreBreakdown()); + } + } + + assert(totalScore == m_totalScore); + } + + return m_scoreBreakdown; +} + Phrase TrellisPath::GetTargetPhrase() const { Phrase targetPhrase(ARRAY_SIZE_INCR); diff --git a/moses/TrellisPath.h b/moses/TrellisPath.h index def86549b..89efb32e4 100644 --- a/moses/TrellisPath.h +++ b/moses/TrellisPath.h @@ -19,14 +19,14 
@@ License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ***********************************************************************/ -#ifndef moses_TrellisPath_h -#define moses_TrellisPath_h +#pragma once #include #include #include #include "Hypothesis.h" #include "TypeDef.h" +#include namespace Moses { @@ -50,13 +50,13 @@ protected: , or NOT_FOUND if this path is the best trans so consist of only hypos */ - ScoreComponentCollection m_scoreBreakdown; float m_totalScore; + mutable boost::shared_ptr m_scoreBreakdown; //Used by Manager::LatticeSample() explicit TrellisPath(const std::vector edges); - void InitScore(); + void InitTotalScore(); public: TrellisPath(); // not implemented @@ -91,9 +91,7 @@ public: //! create a list of next best paths by wiggling 1 of the node at a time. void CreateDeviantPaths(TrellisPathList &pathColl) const; - inline const ScoreComponentCollection &GetScoreBreakdown() const { - return m_scoreBreakdown; - } + const boost::shared_ptr GetScoreBreakdown() const; //! get target words range of the hypo within n-best trellis. not necessarily the same as hypo.GetCurrTargetWordsRange() WordsRange GetTargetWordsRange(const Hypothesis &hypo) const; @@ -123,4 +121,4 @@ inline std::ostream& operator<<(std::ostream& out, const TrellisPath& path) } } -#endif + diff --git a/moses/Util.h b/moses/Util.h index 48e6a51ae..68989721c 100644 --- a/moses/Util.h +++ b/moses/Util.h @@ -502,13 +502,11 @@ inline std::string GetFirstString(const std::string& str, int& first_pos, const template T log_sum (T log_a, T log_b) { - T v; if (log_a < log_b) { - v = log_b+log ( 1 + exp ( log_a-log_b )); + return log_b + log1p(exp(log_a - log_b)); } else { - v = log_a+log ( 1 + exp ( log_b-log_a )); + return log_a + log1p(exp(log_b - log_a)); } - return ( v ); } /** diff --git a/moses/mbr.cpp b/moses/mbr.cpp index df2313b66..66dac47f7 100644 --- a/moses/mbr.cpp +++ b/moses/mbr.cpp @@ -105,13 +105,13 @@ const TrellisPath doMBR(const TrellisPathList& nBestList) for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) { const TrellisPath &path = **iter; float score = StaticData::Instance().GetMBRScale() - * path.GetScoreBreakdown().GetWeightedScore(); + * path.GetScoreBreakdown()->GetWeightedScore(); if (maxScore < score) maxScore = score; } for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) { const TrellisPath &path = **iter; - joint_prob = UntransformScore(StaticData::Instance().GetMBRScale() * path.GetScoreBreakdown().GetWeightedScore() - maxScore); + joint_prob = UntransformScore(StaticData::Instance().GetMBRScale() * path.GetScoreBreakdown()->GetWeightedScore() - maxScore); marginal += joint_prob; joint_prob_vec.push_back(joint_prob); diff --git a/moses/server/TranslationRequest.cpp b/moses/server/TranslationRequest.cpp index 02558fa84..aab8867b5 100644 --- a/moses/server/TranslationRequest.cpp +++ b/moses/server/TranslationRequest.cpp @@ -166,7 +166,7 @@ namespace MosesServer { // should the score breakdown be reported in a more structured manner? 
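In the Util.h hunk above, log_sum now computes log1p(exp(d)) with d = log_a - log_b (or the reverse) instead of log(1 + exp(d)). Because the smaller argument is always subtracted from the larger one, exp(d) lies in (0, 1], and log1p keeps full precision when that value is tiny, where adding 1 explicitly would round the term away. A stand-alone version of the same arithmetic, mirroring the patched template with plain doubles:

#include <cmath>

// log(exp(a) + exp(b)): factor out the larger term so exp() cannot overflow,
// and hand the small ratio to log1p(), which is accurate near zero.
double log_sum(double log_a, double log_b) {
  if (log_a < log_b) {
    return log_b + std::log1p(std::exp(log_a - log_b));
  }
  return log_a + std::log1p(std::exp(log_b - log_a));
}
// log_sum(std::log(0.5), std::log(0.25)) is approximately std::log(0.75).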
ostringstream buf; - path->GetScoreBreakdown().OutputAllFeatureScores(buf); + path->GetScoreBreakdown()->OutputAllFeatureScores(buf); nBestXmlItem["fvals"] = xmlrpc_c::value_string(buf.str()); } diff --git a/phrase-extract/DomainFeature.cpp b/phrase-extract/DomainFeature.cpp index 899eb9f1c..d5138ba9b 100644 --- a/phrase-extract/DomainFeature.cpp +++ b/phrase-extract/DomainFeature.cpp @@ -2,6 +2,7 @@ #include "ExtractionPhrasePair.h" #include "tables-core.h" #include "InputFileStream.h" +#include "util/tokenize.hh" using namespace std; @@ -17,7 +18,7 @@ void Domain::load( const std::string &domainFileName ) string line; while(getline(*fileP, line)) { // read - vector< string > domainSpecLine = tokenize( line.c_str() ); + const vector< string > domainSpecLine = util::tokenize( line ); int lineNumber; if (domainSpecLine.size() != 2 || ! sscanf(domainSpecLine[0].c_str(), "%d", &lineNumber)) { @@ -25,7 +26,7 @@ void Domain::load( const std::string &domainFileName ) exit(1); } // store - string &name = domainSpecLine[1]; + const string &name = domainSpecLine[1]; spec.push_back( make_pair( lineNumber, name )); if (name2id.find( name ) == name2id.end()) { name2id[ name ] = list.size(); diff --git a/phrase-extract/DomainFeature.h b/phrase-extract/DomainFeature.h index 040a5fc72..95babb6c2 100644 --- a/phrase-extract/DomainFeature.h +++ b/phrase-extract/DomainFeature.h @@ -14,8 +14,6 @@ #include "ScoreFeature.h" -extern std::vector tokenize( const char*); - namespace MosesTraining { diff --git a/phrase-extract/SentenceAlignment.cpp b/phrase-extract/SentenceAlignment.cpp index ee7f27ed9..21c1a1dbd 100644 --- a/phrase-extract/SentenceAlignment.cpp +++ b/phrase-extract/SentenceAlignment.cpp @@ -24,6 +24,7 @@ #include #include "tables-core.h" +#include "util/tokenize.hh" using namespace std; @@ -40,7 +41,7 @@ void addBoundaryWords(vector &phrase) bool SentenceAlignment::processTargetSentence(const char * targetString, int, bool boundaryRules) { - target = tokenize(targetString); + target = util::tokenize(targetString); if (boundaryRules) addBoundaryWords(target); return true; @@ -48,7 +49,7 @@ bool SentenceAlignment::processTargetSentence(const char * targetString, int, bo bool SentenceAlignment::processSourceSentence(const char * sourceString, int, bool boundaryRules) { - source = tokenize(sourceString); + source = util::tokenize(sourceString); if (boundaryRules) addBoundaryWords(source); return true; @@ -89,7 +90,7 @@ bool SentenceAlignment::create(const char targetString[], } // reading in alignments - vector alignmentSequence = tokenize( alignmentString ); + vector alignmentSequence = util::tokenize( alignmentString ); for(size_t i=0; i #include "InputFileStream.h" #include "OutputFileStream.h" +#include "util/tokenize.hh" using namespace std; -std::vector tokenize( const char [] ); - vector< string > splitLine(const char *line) { vector< string > item; @@ -109,7 +108,7 @@ int main(int argc, char* argv[]) if (! 
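The doMBR() loop shown just above (moses/mbr.cpp) turns the n-best list's weighted scores into normalized probabilities: each score is multiplied by the MBR scale, the maximum scaled score is subtracted before exponentiation so exp() stays bounded, and the results are divided by their sum. Below is a stand-alone sketch of that normalization with plain doubles; it assumes, as the surrounding code suggests, that UntransformScore() maps a log-domain score back via exp().

#include <algorithm>
#include <cmath>
#include <vector>

// p_i = exp(scale*s_i - max_j(scale*s_j)) / marginal, i.e. a softmax over path scores.
std::vector<double> Posteriors(const std::vector<double> &weightedScores, double mbrScale) {
  std::vector<double> p(weightedScores.size());
  if (p.empty()) return p;

  double maxScore = mbrScale * weightedScores[0];
  for (size_t i = 1; i < weightedScores.size(); ++i)
    maxScore = std::max(maxScore, mbrScale * weightedScores[i]);

  double marginal = 0.0;
  for (size_t i = 0; i < weightedScores.size(); ++i) {
    p[i] = std::exp(mbrScale * weightedScores[i] - maxScore);  // unnormalized joint
    marginal += p[i];
  }
  for (size_t i = 0; i < p.size(); ++i) p[i] /= marginal;      // normalized posterior
  return p;
}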
getLine(fileDirectP, itemDirect )) break; - vector< string > count = tokenize( itemDirect[4].c_str() ); + const vector< string > count = util::tokenize( itemDirect[4] ); float countEF = atof(count[0].c_str()); float countF = atof(count[1].c_str()); float prob = countF/countEF; diff --git a/phrase-extract/consolidate-reverse-main.cpp b/phrase-extract/consolidate-reverse-main.cpp index e2b0ad473..bce496a0c 100644 --- a/phrase-extract/consolidate-reverse-main.cpp +++ b/phrase-extract/consolidate-reverse-main.cpp @@ -28,6 +28,7 @@ #include "tables-core.h" #include "InputFileStream.h" +#include "util/tokenize.hh" using namespace std; @@ -165,8 +166,8 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC fileConsolidated << " ||| " << reverseAlignment(itemDirect[3]); // counts, for debugging - vector directCounts = tokenize(itemDirect[4].c_str()); - vector indirectCounts = tokenize(itemIndirect[4].c_str()); + const vector directCounts = util::tokenize(itemDirect[4]); + const vector indirectCounts = util::tokenize(itemIndirect[4]); fileConsolidated << "||| " << directCounts[0] << " " << indirectCounts[0]; // output rule count if present in either file if (indirectCounts.size() > 1) { @@ -199,7 +200,6 @@ bool getLine( istream &fileP, vector< string > &item ) vector< string > splitLine(const char *line) { vector< string > item; - bool betweenWords = true; int start=0; int i=0; for(; line[i] != '\0'; i++) { @@ -223,10 +223,10 @@ string reverseAlignment(const string &alignments) { stringstream ret(""); - vector alignToks = tokenize(alignments.c_str()); + const vector alignToks = util::tokenize(alignments); for (size_t i = 0; i < alignToks.size(); ++i) { - string &alignPair = alignToks[i]; + const string &alignPair = alignToks[i]; vector alignPoints; Tokenize(alignPoints, alignPair, "-"); assert(alignPoints.size() == 2); diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.cpp b/phrase-extract/extract-ghkm/XmlTreeParser.cpp index 2f28c3244..f9800c8e0 100644 --- a/phrase-extract/extract-ghkm/XmlTreeParser.cpp +++ b/phrase-extract/extract-ghkm/XmlTreeParser.cpp @@ -23,6 +23,7 @@ #include "tables-core.h" #include "XmlException.h" #include "XmlTree.h" +#include "util/tokenize.hh" #include #include @@ -56,7 +57,7 @@ std::auto_ptr XmlTreeParser::Parse(const std::string &line) m_tree.ConnectNodes(); SyntaxNode *root = m_tree.GetTop(); assert(root); - m_words = tokenize(m_line.c_str()); + m_words = util::tokenize(m_line); return ConvertTree(*root, m_words); } diff --git a/phrase-extract/pcfg-common/xml_tree_parser.cc b/phrase-extract/pcfg-common/xml_tree_parser.cc index 3d9291994..29e46a9f2 100644 --- a/phrase-extract/pcfg-common/xml_tree_parser.cc +++ b/phrase-extract/pcfg-common/xml_tree_parser.cc @@ -25,6 +25,7 @@ #include "tables-core.h" #include "XmlException.h" #include "XmlTree.h" +#include "util/tokenize.hh" #include "syntax-common/exception.h" @@ -51,7 +52,7 @@ std::auto_ptr XmlTreeParser::Parse(const std::string &line) { // There is no XML tree. 
return std::auto_ptr(); } - m_words = tokenize(m_line.c_str()); + m_words = util::tokenize(m_line); return ConvertTree(*root, m_words); } diff --git a/phrase-extract/relax-parse-main.cpp b/phrase-extract/relax-parse-main.cpp index a6d50cef5..5c9daa7ae 100644 --- a/phrase-extract/relax-parse-main.cpp +++ b/phrase-extract/relax-parse-main.cpp @@ -21,6 +21,7 @@ #include "relax-parse.h" #include "tables-core.h" +#include "util/tokenize.hh" using namespace std; using namespace MosesTraining; @@ -44,7 +45,7 @@ int main(int argc, char* argv[]) map< string, int > topLabelCollection; // count of top labels, not used SyntaxTree tree; ProcessAndStripXMLTags( inBufferString, tree, labelCollection, topLabelCollection, false ); - vector< string > inWords = tokenize( inBufferString.c_str() ); + const vector< string > inWords = util::tokenize( inBufferString ); // output tree // cerr << "BEFORE:" << endl << tree; @@ -104,7 +105,7 @@ void init(int argc, char* argv[]) } } -void store( SyntaxTree &tree, vector< string > &words ) +void store( SyntaxTree &tree, const vector< string > &words ) { // output words for( size_t i=0; i &words ); +void store( MosesTraining::SyntaxTree &tree, const std::vector &words ); void LeftBinarize( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents ); void RightBinarize( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents ); void SAMT( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents ); diff --git a/phrase-extract/statistics-main.cpp b/phrase-extract/statistics-main.cpp index a6c0b74db..840f18602 100644 --- a/phrase-extract/statistics-main.cpp +++ b/phrase-extract/statistics-main.cpp @@ -14,6 +14,7 @@ #include "AlignmentPhrase.h" #include "tables-core.h" #include "InputFileStream.h" +#include "util/tokenize.hh" using namespace std; using namespace MosesTraining; @@ -237,7 +238,7 @@ void processPhrasePairs( vector< PhraseAlignment > &phrasePair ) bool PhraseAlignment::create(const char line[], int lineID ) { - vector< string > token = tokenize( line ); + const vector< string > token = util::tokenize( line ); int item = 1; PHRASE phraseF, phraseE; for (size_t j=0; j token = tokenize( line.c_str() ); + const vector token = util::tokenize( line ); if (token.size() != 3) { cerr << "line " << i << " in " << filePath << " has wrong number of tokens, skipping:\n" << token.size() << " " << token[0] << " " << line << endl; diff --git a/phrase-extract/syntax-common/xml_tree_parser.cc b/phrase-extract/syntax-common/xml_tree_parser.cc index c4363a3e2..c6e3cd3c3 100644 --- a/phrase-extract/syntax-common/xml_tree_parser.cc +++ b/phrase-extract/syntax-common/xml_tree_parser.cc @@ -3,6 +3,7 @@ #include "tables-core.h" #include "XmlException.h" #include "XmlTree.h" +#include "util/tokenize.hh" #include #include @@ -24,7 +25,7 @@ StringTree *XmlTreeParser::Parse(const std::string &line) { tree_.ConnectNodes(); SyntaxNode *root = tree_.GetTop(); assert(root); - words_ = tokenize(line_.c_str()); + words_ = util::tokenize(line_); return ConvertTree(*root, words_); } diff --git a/phrase-extract/tables-core.cpp b/phrase-extract/tables-core.cpp index 30c1544e9..4dd8e704a 100644 --- a/phrase-extract/tables-core.cpp +++ b/phrase-extract/tables-core.cpp @@ -1,5 +1,6 @@ // $Id$ //#include "beammain.h" +#include "util/tokenize.hh" #include "tables-core.h" #define TABLE_LINE_MAX_LENGTH 1000 @@ -7,37 +8,9 @@ using namespace std; -// as in beamdecoder/tables.cpp -vector tokenize( const char* input ) -{ - vector< string > token; - bool betweenWords = true; - 
int start=0; - int i=0; - for(; input[i] != '\0'; i++) { - bool isSpace = (input[i] == ' ' || input[i] == '\t'); - - if (!isSpace && betweenWords) { - start = i; - betweenWords = false; - } else if (isSpace && !betweenWords) { - token.push_back( string( input+start, i-start ) ); - betweenWords = true; - } - } - if (!betweenWords) - token.push_back( string( input+start, i-start ) ); - return token; -} - namespace MosesTraining { -bool isNonTerminal( const WORD &symbol ) -{ - return symbol.substr(0, 1) == "[" && symbol.substr(symbol.size()-1, 1) == "]"; -} - WORD_ID Vocabulary::storeIfNew( const WORD& word ) { map::iterator i = lookup.find( word ); @@ -107,7 +80,7 @@ void DTable::load( const string& fileName ) abort(); } - vector token = tokenize(line.c_str()); + const vector token = util::tokenize(line); if (token.size() < 2) { cerr << "line " << i << " in " << fileName << " too short, skipping\n"; continue; diff --git a/phrase-extract/tables-core.h b/phrase-extract/tables-core.h index 44545d3a0..011fe09e6 100644 --- a/phrase-extract/tables-core.h +++ b/phrase-extract/tables-core.h @@ -12,8 +12,6 @@ #include #include -extern std::vector tokenize( const char*); - namespace MosesTraining { diff --git a/scripts/OSM/OSM-Train.perl b/scripts/OSM/OSM-Train.perl index 27ecfe342..e2b604f0b 100755 --- a/scripts/OSM/OSM-Train.perl +++ b/scripts/OSM/OSM-Train.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; use Getopt::Long "GetOptions"; use FindBin qw($RealBin); diff --git a/scripts/OSM/extract-singletons.perl b/scripts/OSM/extract-singletons.perl index faa4e8dd6..83719502f 100755 --- a/scripts/OSM/extract-singletons.perl +++ b/scripts/OSM/extract-singletons.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use Getopt::Std; getopts('q'); diff --git a/scripts/OSM/flipAlignment.perl b/scripts/OSM/flipAlignment.perl index 3c14a4542..3559bf79b 100755 --- a/scripts/OSM/flipAlignment.perl +++ b/scripts/OSM/flipAlignment.perl @@ -1,5 +1,7 @@ #!/usr/bin/env perl - use strict; + +use warnings; +use strict; my $file = shift(@ARGV); open(MYFILE, $file); diff --git a/scripts/Transliteration/clean.pl b/scripts/Transliteration/clean.pl index 252a25075..c59bf0798 100755 --- a/scripts/Transliteration/clean.pl +++ b/scripts/Transliteration/clean.pl @@ -1,6 +1,7 @@ #!/usr/bin/env perl #input hindi word urdu word, delete all those entries that have number on any side +use warnings; use utf8; use Getopt::Std; diff --git a/scripts/Transliteration/corpusCreator.pl b/scripts/Transliteration/corpusCreator.pl index 8c8dab863..d2df8323c 100755 --- a/scripts/Transliteration/corpusCreator.pl +++ b/scripts/Transliteration/corpusCreator.pl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; use utf8; diff --git a/scripts/Transliteration/in-decoding-transliteration.pl b/scripts/Transliteration/in-decoding-transliteration.pl index e4e8b41e3..216d99a3e 100755 --- a/scripts/Transliteration/in-decoding-transliteration.pl +++ b/scripts/Transliteration/in-decoding-transliteration.pl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; use utf8; diff --git a/scripts/Transliteration/post-decoding-transliteration.pl b/scripts/Transliteration/post-decoding-transliteration.pl index 7e6f249ae..201f40d97 100755 --- a/scripts/Transliteration/post-decoding-transliteration.pl +++ b/scripts/Transliteration/post-decoding-transliteration.pl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; use utf8; diff --git a/scripts/Transliteration/prepare-transliteration-phrase-table.pl 
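Several phrase-extract tools in the hunks above stop using a locally duplicated tokenize(const char*) and call util::tokenize from util/tokenize.hh instead, and the copy in tables-core.cpp is deleted outright. The contract both sides share is plain whitespace splitting on spaces and tabs with no empty tokens; the helper below is an illustrative stand-alone equivalent written for this note, not the actual util implementation.

#include <string>
#include <vector>

// Split on runs of ' ' and '\t'; never emits empty tokens.
std::vector<std::string> WhitespaceTokenize(const std::string &input) {
  std::vector<std::string> tokens;
  size_t start = 0;
  bool inToken = false;
  for (size_t i = 0; i < input.size(); ++i) {
    const bool isSpace = (input[i] == ' ' || input[i] == '\t');
    if (!isSpace && !inToken) {
      start = i;
      inToken = true;
    } else if (isSpace && inToken) {
      tokens.push_back(input.substr(start, i - start));
      inToken = false;
    }
  }
  if (inToken) tokens.push_back(input.substr(start));
  return tokens;
}
// WhitespaceTokenize("0-1 1-2\t2-3") yields {"0-1", "1-2", "2-3"}.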
b/scripts/Transliteration/prepare-transliteration-phrase-table.pl index 565a98297..4fc03b526 100755 --- a/scripts/Transliteration/prepare-transliteration-phrase-table.pl +++ b/scripts/Transliteration/prepare-transliteration-phrase-table.pl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; use utf8; diff --git a/scripts/Transliteration/threshold.pl b/scripts/Transliteration/threshold.pl index 8af699821..8e3704fd6 100755 --- a/scripts/Transliteration/threshold.pl +++ b/scripts/Transliteration/threshold.pl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use utf8; require Encode; use IO::Handle; diff --git a/scripts/Transliteration/train-transliteration-module.pl b/scripts/Transliteration/train-transliteration-module.pl index 54c2ccf78..05804afb6 100755 --- a/scripts/Transliteration/train-transliteration-module.pl +++ b/scripts/Transliteration/train-transliteration-module.pl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use utf8; use strict; use Getopt::Long "GetOptions"; diff --git a/scripts/analysis/bootstrap-hypothesis-difference-significance.pl b/scripts/analysis/bootstrap-hypothesis-difference-significance.pl index b74aa003d..149676b6f 100755 --- a/scripts/analysis/bootstrap-hypothesis-difference-significance.pl +++ b/scripts/analysis/bootstrap-hypothesis-difference-significance.pl @@ -14,6 +14,7 @@ use utf8; # 23.01.2010: added NIST p-value and interval computation ############################################### +use warnings; use strict; #constants diff --git a/scripts/analysis/sentence-by-sentence.pl b/scripts/analysis/sentence-by-sentence.pl index c8bc367b2..4f6560a56 100755 --- a/scripts/analysis/sentence-by-sentence.pl +++ b/scripts/analysis/sentence-by-sentence.pl @@ -4,6 +4,7 @@ #sentence-by-sentence: take in a system output, with any number of factors, and a reference translation, also maybe with factors, and show each sentence and its errors #usage: sentence-by-sentence SYSOUT [REFERENCE]+ > sentences.html +use warnings; use strict; use Getopt::Long; diff --git a/scripts/analysis/sg2dot.perl b/scripts/analysis/sg2dot.perl index a165cf25e..b17dfd9fb 100755 --- a/scripts/analysis/sg2dot.perl +++ b/scripts/analysis/sg2dot.perl @@ -4,6 +4,7 @@ # Script to convert MOSES searchgraph to DOT format # +use warnings; use strict; use File::Path; use File::Basename; diff --git a/scripts/analysis/show-phrases-used.pl b/scripts/analysis/show-phrases-used.pl index c31e930d5..0a719d207 100755 --- a/scripts/analysis/show-phrases-used.pl +++ b/scripts/analysis/show-phrases-used.pl @@ -5,7 +5,9 @@ #usage: show-phrases-used DECODER_OUTFILE > output.html # where DECODER_OUTFILE is the output of moses with the -T (show alignments) option +use warnings; use strict; + BEGIN { my $wd= `pawd 2>/dev/null`; diff --git a/scripts/analysis/smtgui/filter-phrase-table.pl b/scripts/analysis/smtgui/filter-phrase-table.pl index ed09d0b3f..9f411f3fa 100755 --- a/scripts/analysis/smtgui/filter-phrase-table.pl +++ b/scripts/analysis/smtgui/filter-phrase-table.pl @@ -9,6 +9,7 @@ #similar function to filter-model-given-input.pl, but only operates #on the phrase table and doesn't require that any subdirectories exist +use warnings; use strict; my $MAX_LENGTH = 10; diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index bc0a3b6b9..57ef4f9d6 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -7,8 +7,15 @@ get-corpus default-name: corpus/txt rerun-on-change: input-extension output-extension template: IN OUT $input-extension $output-extension 
+pre-tok-clean + in: raw-stem + out: pre-tok-cleaned + default-name: corpus/pre-tok-cleaned + pass-unless: pre-tok-clean + template: $pre-tok-clean IN $input-extension $output-extension OUT OUT.lines-retained + parallelizable: yes tokenize - in: raw-stem + in: pre-tok-cleaned out: tokenized-stem default-name: corpus/tok pass-unless: input-tokenizer output-tokenizer @@ -158,11 +165,18 @@ get-corpus pass-unless: get-corpus-script default-name: lm/txt template: $get-corpus-script > OUT +use-parallel-corpus + in: parallel-corpus-stem + out: tokenized-corpus + default-name: lm/tok + ignore-unless: parallel-corpus-stem + template: ln -s IN.$output-extension OUT tokenize in: raw-corpus out: tokenized-corpus default-name: lm/tok pass-unless: output-tokenizer + ignore-if: parallel-corpus-stem template: $output-tokenizer < IN > OUT parallelizable: yes mock-parse @@ -204,8 +218,14 @@ split default-name: lm/split pass-unless: output-splitter template: $output-splitter -model IN1.$output-extension < IN > OUT +strip + in: split-corpus + out: stripped-corpus + default-name: lm/stripped + pass-unless: mock-output-parser-lm + template: $moses-script-dir/training/strip-xml.perl < IN > OUT train - in: split-corpus + in: stripped-corpus out: lm default-name: lm/lm ignore-if: rlm-training @@ -220,7 +240,7 @@ randomize pass-unless: lm-randomizer ignore-if: rlm-training train-randomized - in: split-corpus + in: stripped-corpus out: rlm default-name: lm/rlm ignore-unless: rlm-training @@ -953,21 +973,21 @@ split-reference-devtest ignore-unless: use-mira multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl template: $output-splitter -model IN1.$output-extension < IN > OUT -reduce-reference +strip-reference in: split-ref out: reference - default-name: tuning/reference.reduced + default-name: tuning/reference.stripped pass-unless: mock-output-parser-references multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl - template: $moses-script-dir/training/reduce-factors.perl --factor 0 --xml 1 --corpus IN --reduced-corpus OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees -reduce-reference-devtest + template: $moses-script-dir/training/strip-xml.perl < IN > OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees +strip-reference-devtest in: split-ref-devtest out: reference - default-name: tuning/reference.devtest.reduced + default-name: tuning/reference.devtest.stripped pass-unless: mock-output-parser-references ignore-unless: use-mira multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl - template: $moses-script-dir/training/reduce-factors.perl --factor 0 --xml 1 --corpus IN --reduced-corpus OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees + template: $moses-script-dir/training/strip-xml.perl < IN > OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees filter in: input TRAINING:sigtest-filter-phrase-translation-table TRAINING:sigtest-filter-reordering-table TRAINING:corpus-mml-prefilter=OR=TRAINING:corpus-mml-postfilter=OR=TRAINING:domains TRAINING:transliteration-table out: filtered-dir @@ -1224,13 +1244,13 @@ lowercase-reference pass-if: recaser multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl template: $output-lowercaser < IN > OUT -reduce-reference +strip-reference in: lowercased-reference out: reference default-name: evaluation/reference pass-unless: mock-output-parser-references multiref: 
$moses-script-dir/ems/support/run-command-on-multiple-refsets.perl - template: $moses-script-dir/training/reduce-factors.perl --factor 0 --xml 1 --corpus IN --reduced-corpus OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees + template: $moses-script-dir/training/strip-xml.perl < IN > OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees wade in: filtered-dir truecased-input tokenized-reference alignment system-output out: wade-analysis diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl index a7ce88622..7070a7c9e 100755 --- a/scripts/ems/experiment.perl +++ b/scripts/ems/experiment.perl @@ -3,6 +3,7 @@ # Experiment Management System # Documentation at http://www.statmt.org/moses/?n=FactoredTraining.EMS +use warnings; use strict; use Getopt::Long "GetOptions"; use FindBin qw($RealBin); diff --git a/scripts/ems/fix-info.perl b/scripts/ems/fix-info.perl index 98139f211..8f83d4ccf 100755 --- a/scripts/ems/fix-info.perl +++ b/scripts/ems/fix-info.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; my ($file,$step) = @ARGV; diff --git a/scripts/ems/support/analysis.perl b/scripts/ems/support/analysis.perl index 8df3d6551..cea2657c9 100755 --- a/scripts/ems/support/analysis.perl +++ b/scripts/ems/support/analysis.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; use Getopt::Long "GetOptions"; diff --git a/scripts/ems/support/build-domain-file-from-subcorpora.perl b/scripts/ems/support/build-domain-file-from-subcorpora.perl index 683ef1ed7..f166c8927 100755 --- a/scripts/ems/support/build-domain-file-from-subcorpora.perl +++ b/scripts/ems/support/build-domain-file-from-subcorpora.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; # Create domain file from corpora diff --git a/scripts/ems/support/build-sparse-features.perl b/scripts/ems/support/build-sparse-features.perl index 04da69873..5d9b786ad 100755 --- a/scripts/ems/support/build-sparse-features.perl +++ b/scripts/ems/support/build-sparse-features.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; # Build necessary files for sparse lexical features diff --git a/scripts/ems/support/consolidate-training-data.perl b/scripts/ems/support/consolidate-training-data.perl index f312b1649..170ba999c 100755 --- a/scripts/ems/support/consolidate-training-data.perl +++ b/scripts/ems/support/consolidate-training-data.perl @@ -2,6 +2,7 @@ # $Id: consolidate-training-data.perl 928 2009-09-02 02:58:01Z philipp $ +use warnings; use strict; my ($in,$out,$consolidated,@PART) = @ARGV; diff --git a/scripts/ems/support/generic-multicore-parallelizer.perl b/scripts/ems/support/generic-multicore-parallelizer.perl index 073e0f62e..e5a12adce 100755 --- a/scripts/ems/support/generic-multicore-parallelizer.perl +++ b/scripts/ems/support/generic-multicore-parallelizer.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; my $cores = 8; diff --git a/scripts/ems/support/generic-parallelizer.perl b/scripts/ems/support/generic-parallelizer.perl index db4d2f492..0b248be7e 100755 --- a/scripts/ems/support/generic-parallelizer.perl +++ b/scripts/ems/support/generic-parallelizer.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; my $jobs = 20; diff --git a/scripts/ems/support/input-from-sgm.perl b/scripts/ems/support/input-from-sgm.perl index 81f177d6c..223996676 100755 --- a/scripts/ems/support/input-from-sgm.perl +++ b/scripts/ems/support/input-from-sgm.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use 
warnings; use strict; die("ERROR syntax: input-from-sgm.perl < in.sgm > in.txt") diff --git a/scripts/ems/support/interpolate-lm.perl b/scripts/ems/support/interpolate-lm.perl index 34bd2219d..a2fe62b22 100755 --- a/scripts/ems/support/interpolate-lm.perl +++ b/scripts/ems/support/interpolate-lm.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; use IPC::Open3; use File::Temp qw/tempdir/; diff --git a/scripts/ems/support/lmplz-wrapper.perl b/scripts/ems/support/lmplz-wrapper.perl index 0fa676ce8..eadca6263 100755 --- a/scripts/ems/support/lmplz-wrapper.perl +++ b/scripts/ems/support/lmplz-wrapper.perl @@ -1,10 +1,13 @@ #!/usr/bin/env perl +use warnings; use strict; use Getopt::Long "GetOptions"; +Getopt::Long::config("no_auto_abbrev"); Getopt::Long::config("pass_through"); + my ($TEXT,$ORDER,$BIN,$LM); &GetOptions('text=s' => \$TEXT, @@ -15,8 +18,9 @@ my ($TEXT,$ORDER,$BIN,$LM); die("ERROR: specify at least --bin BIN --text CORPUS --lm LM and --order N!") unless defined($BIN) && defined($TEXT) && defined($LM) && defined($ORDER); -my $cmd = "$BIN --text $TEXT --order $ORDER --arpa $LM"; -$cmd .= " " . join(' ', @ARGV) if scalar(@ARGV); # Pass remaining args through. +my $settings = join(' ', @ARGV); +#print STDERR "settngs=$settings \n"; +my $cmd = "$BIN --text $TEXT --order $ORDER --arpa $LM $settings"; print "exec: $cmd\n"; `$cmd`; diff --git a/scripts/ems/support/mml-filter.perl b/scripts/ems/support/mml-filter.perl index 5b6e02834..c50725aae 100755 --- a/scripts/ems/support/mml-filter.perl +++ b/scripts/ems/support/mml-filter.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; use FindBin qw($RealBin); diff --git a/scripts/ems/support/mml-score.perl b/scripts/ems/support/mml-score.perl index 1fe065586..449d6a05c 100755 --- a/scripts/ems/support/mml-score.perl +++ b/scripts/ems/support/mml-score.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; # diff --git a/scripts/ems/support/mml-train.perl b/scripts/ems/support/mml-train.perl index aacf153a7..1f0548082 100755 --- a/scripts/ems/support/mml-train.perl +++ b/scripts/ems/support/mml-train.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; my ($indomain_source,,$indomain_target,$outdomain_source,$outdomain_target,$lm_training,$lm_binarizer,$order,$lm_settings,$line_count,$model); diff --git a/scripts/ems/support/prepare-fast-align.perl b/scripts/ems/support/prepare-fast-align.perl index 1d6e75422..54c124af0 100755 --- a/scripts/ems/support/prepare-fast-align.perl +++ b/scripts/ems/support/prepare-fast-align.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; my ($source_file,$target_file,$alignment_factors) = @ARGV; @@ -22,7 +23,7 @@ while(my $source = ) { # remove markup foreach my $line (\$source,\$target) { - $$line =~ s/\<[^\>]+\>//g; + $$line =~ s/\<[^\>]+\>/ /g; $$line =~ s/\s+/ /g; $$line =~ s/^ //; $$line =~ s/ $//; diff --git a/scripts/ems/support/reference-from-sgm.perl b/scripts/ems/support/reference-from-sgm.perl index 0749648c0..595226bf1 100755 --- a/scripts/ems/support/reference-from-sgm.perl +++ b/scripts/ems/support/reference-from-sgm.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; die("ERROR syntax: reference-from-sgm.perl ref src out") diff --git a/scripts/ems/support/remove-segmentation-markup.perl b/scripts/ems/support/remove-segmentation-markup.perl index b345c9a7e..d6333f813 100755 --- a/scripts/ems/support/remove-segmentation-markup.perl +++ b/scripts/ems/support/remove-segmentation-markup.perl @@ -1,5 +1,6 @@ 
#!/usr/bin/env perl +use warnings; use strict; $|++; diff --git a/scripts/ems/support/report-experiment-scores.perl b/scripts/ems/support/report-experiment-scores.perl index 5bcf32f48..2e433f291 100755 --- a/scripts/ems/support/report-experiment-scores.perl +++ b/scripts/ems/support/report-experiment-scores.perl @@ -2,6 +2,7 @@ # $Id: report-experiment-scores.perl 407 2008-11-10 14:43:31Z philipp $ +use warnings; use strict; my $email; diff --git a/scripts/ems/support/run-command-on-multiple-refsets.perl b/scripts/ems/support/run-command-on-multiple-refsets.perl index f8e211582..c3db3c4dc 100755 --- a/scripts/ems/support/run-command-on-multiple-refsets.perl +++ b/scripts/ems/support/run-command-on-multiple-refsets.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; die("ERROR: syntax: run-command-on-multiple-refsets.perl cmd in out") diff --git a/scripts/ems/support/run-wade.perl b/scripts/ems/support/run-wade.perl index cf4121a14..25cda3bb3 100755 --- a/scripts/ems/support/run-wade.perl +++ b/scripts/ems/support/run-wade.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; use File::Temp qw/ tempfile tempdir /; diff --git a/scripts/ems/support/split-sentences.perl b/scripts/ems/support/split-sentences.perl index 6537e84b3..f1af451b3 100755 --- a/scripts/ems/support/split-sentences.perl +++ b/scripts/ems/support/split-sentences.perl @@ -6,6 +6,7 @@ binmode(STDIN, ":utf8"); binmode(STDOUT, ":utf8"); binmode(STDERR, ":utf8"); +use warnings; use FindBin qw($RealBin); use strict; diff --git a/scripts/ems/support/submit-grid.perl b/scripts/ems/support/submit-grid.perl index 6e6193674..9997241e7 100755 --- a/scripts/ems/support/submit-grid.perl +++ b/scripts/ems/support/submit-grid.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; use Cwd; use FindBin qw($RealBin); @@ -37,7 +38,7 @@ print $runFile "export PATH=\"$path\"\n\n"; print $runFile "export PERL5LIB=\"/share/apps/NYUAD/perl/gcc_4.9.1/5.20.1:/home/$user/perl5/lib/perl5\"\n\n"; print $runFile "module load NYUAD/2.0 \n"; -print $runFile "module load gcc/4.9.1 python/2.7.9 openmpi/1.8.3 boost cmake zlib jdk perl expat\n\n"; +print $runFile "module load gcc python/2.7.9 boost cmake zlib jdk perl expat \n\n"; my $emsDir = dirname($RealBin); diff --git a/scripts/ems/support/substitute-filtered-tables-and-weights.perl b/scripts/ems/support/substitute-filtered-tables-and-weights.perl index 9c06b54f8..681d251c7 100755 --- a/scripts/ems/support/substitute-filtered-tables-and-weights.perl +++ b/scripts/ems/support/substitute-filtered-tables-and-weights.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; use Getopt::Long "GetOptions"; use FindBin qw($RealBin); diff --git a/scripts/ems/support/substitute-filtered-tables.perl b/scripts/ems/support/substitute-filtered-tables.perl index eee454728..e7d9f55f8 100755 --- a/scripts/ems/support/substitute-filtered-tables.perl +++ b/scripts/ems/support/substitute-filtered-tables.perl @@ -1,5 +1,7 @@ #!/usr/bin/env perl +use warnings; + # experiment.perl support script # get filtered rule and reordering tables and place them into a configuration file diff --git a/scripts/ems/support/substitute-weights.perl b/scripts/ems/support/substitute-weights.perl index 24ac034e8..42357ed1e 100755 --- a/scripts/ems/support/substitute-weights.perl +++ b/scripts/ems/support/substitute-weights.perl @@ -1,5 +1,7 @@ #!/usr/bin/env perl +use warnings; + # experiment.perl support script # get filtered rule and reordering tables and place them into a configuration 
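The prepare-fast-align.perl hunk above changes the markup-stripping substitution to replace each tag with a single space instead of deleting it, so neighbouring words that were separated only by a tag do not get glued together; the substitutions that follow then collapse doubled spaces and trim the line ends. The same three steps in C++ with std::regex, as a sketch rather than a port of the script:

#include <regex>
#include <string>

// Replace tags with a space, collapse whitespace runs, trim the line.
std::string StripMarkup(std::string line) {
  line = std::regex_replace(line, std::regex("<[^>]+>"), " ");
  line = std::regex_replace(line, std::regex("\\s+"), " ");
  line = std::regex_replace(line, std::regex("^ | $"), "");
  return line;
}
// StripMarkup("he said <b>hi</b>there") == "he said hi there"
// (deleting the tags outright would have produced "he said hithere").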
file diff --git a/scripts/ems/support/symmetrize-fast-align.perl b/scripts/ems/support/symmetrize-fast-align.perl index f93af642d..90621dea9 100755 --- a/scripts/ems/support/symmetrize-fast-align.perl +++ b/scripts/ems/support/symmetrize-fast-align.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; die("ERROR: syntax is fastalign2bal.perl direct-alignment inverse-alignment source-file target-file out-stem symmetrization-method symal\n") unless scalar(@ARGV) == 7; diff --git a/scripts/ems/support/thot-lm-wrapper.perl b/scripts/ems/support/thot-lm-wrapper.perl index bd1f89c7b..222623c5b 100755 --- a/scripts/ems/support/thot-lm-wrapper.perl +++ b/scripts/ems/support/thot-lm-wrapper.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; use Getopt::Long "GetOptions"; diff --git a/scripts/ems/support/wrap-xml.perl b/scripts/ems/support/wrap-xml.perl index 587e4c541..28708a62a 100755 --- a/scripts/ems/support/wrap-xml.perl +++ b/scripts/ems/support/wrap-xml.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; my ($language,$src,$system) = @ARGV; diff --git a/scripts/ems/web/analysis.php b/scripts/ems/web/analysis.php index a64d5977f..00bb9e15f 100644 --- a/scripts/ems/web/analysis.php +++ b/scripts/ems/web/analysis.php @@ -1261,8 +1261,8 @@ function input_annotation($sentence,$input,$segmentation,$filter) { for($j=$from;$j<=$to;$j++) { if ($j>$from) { $phrase .= " "; } $phrase .= $word[$j]; - $highlightwords .= " document.getElementById('inputword-$i-$j').style.backgroundColor='#ffff80';"; - $lowlightwords .= " document.getElementById('inputword-$i-$j').style.backgroundColor='".coverage_color($coverage[$j][$j])."';"; + $highlightwords .= " document.getElementById('inputword-$sentence-$j').style.backgroundColor='#ffff80';"; + $lowlightwords .= " document.getElementById('inputword-$sentence-$j').style.backgroundColor='".coverage_color($coverage[$j][$j])."';"; } print "

"; } @@ -1443,10 +1443,10 @@ function biconcor($query) { $sentence = $_GET['sentence']; $biconcor = get_biconcor_version($dir,$set,$id); print "
-
+ - +
"; $cmd = "./biconcor -html -l $dir/model/biconcor.$biconcor -Q ".base64_encode($query)." 2>/dev/null"; diff --git a/scripts/ems/web/base64.js b/scripts/ems/web/base64.js index e0e94d765..67fd9ad8d 100644 --- a/scripts/ems/web/base64.js +++ b/scripts/ems/web/base64.js @@ -1,108 +1,193 @@ -var END_OF_INPUT = -1; +/* + * $Id: base64.js,v 2.15 2014/04/05 12:58:57 dankogai Exp dankogai $ + * + * Licensed under the MIT license. + * http://opensource.org/licenses/mit-license + * + * References: + * http://en.wikipedia.org/wiki/Base64 + */ -var base64Chars = new Array( - 'A','B','C','D','E','F','G','H', - 'I','J','K','L','M','N','O','P', - 'Q','R','S','T','U','V','W','X', - 'Y','Z','a','b','c','d','e','f', - 'g','h','i','j','k','l','m','n', - 'o','p','q','r','s','t','u','v', - 'w','x','y','z','0','1','2','3', - '4','5','6','7','8','9','+','/' -); - -var reverseBase64Chars = new Array(); -for (var i=0; i < base64Chars.length; i++){ - reverseBase64Chars[base64Chars[i]] = i; -} - -var base64Str; -var base64Count; -function setBase64Str(str){ - base64Str = str; - base64Count = 0; -} -function readBase64(){ - if (!base64Str) return END_OF_INPUT; - if (base64Count >= base64Str.length) return END_OF_INPUT; - var c = base64Str.charCodeAt(base64Count) & 0xff; - base64Count++; - return c; -} -function encodeBase64(str){ - setBase64Str(str); - var result = ''; - var inBuffer = new Array(3); - var lineCount = 0; - var done = false; - while (!done && (inBuffer[0] = readBase64()) != END_OF_INPUT){ - inBuffer[1] = readBase64(); - inBuffer[2] = readBase64(); - result += (base64Chars[ inBuffer[0] >> 2 ]); - if (inBuffer[1] != END_OF_INPUT){ - result += (base64Chars [(( inBuffer[0] << 4 ) & 0x30) | (inBuffer[1] >> 4) ]); - if (inBuffer[2] != END_OF_INPUT){ - result += (base64Chars [((inBuffer[1] << 2) & 0x3c) | (inBuffer[2] >> 6) ]); - result += (base64Chars [inBuffer[2] & 0x3F]); - } else { - result += (base64Chars [((inBuffer[1] << 2) & 0x3c)]); - result += ('='); - done = true; - } +(function(global) { + 'use strict'; + // existing version for noConflict() + var _Base64 = global.Base64; + var version = "2.1.7"; + // if node.js, we use Buffer + var buffer; + if (typeof module !== 'undefined' && module.exports) { + buffer = require('buffer').Buffer; + } + // constants + var b64chars + = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'; + var b64tab = function(bin) { + var t = {}; + for (var i = 0, l = bin.length; i < l; i++) t[bin.charAt(i)] = i; + return t; + }(b64chars); + var fromCharCode = String.fromCharCode; + // encoder stuff + var cb_utob = function(c) { + if (c.length < 2) { + var cc = c.charCodeAt(0); + return cc < 0x80 ? c + : cc < 0x800 ? 
(fromCharCode(0xc0 | (cc >>> 6)) + + fromCharCode(0x80 | (cc & 0x3f))) + : (fromCharCode(0xe0 | ((cc >>> 12) & 0x0f)) + + fromCharCode(0x80 | ((cc >>> 6) & 0x3f)) + + fromCharCode(0x80 | ( cc & 0x3f))); } else { - result += (base64Chars [(( inBuffer[0] << 4 ) & 0x30)]); - result += ('='); - result += ('='); - done = true; - } - lineCount += 4; - if (lineCount >= 76){ - result += ('\n'); - lineCount = 0; + var cc = 0x10000 + + (c.charCodeAt(0) - 0xD800) * 0x400 + + (c.charCodeAt(1) - 0xDC00); + return (fromCharCode(0xf0 | ((cc >>> 18) & 0x07)) + + fromCharCode(0x80 | ((cc >>> 12) & 0x3f)) + + fromCharCode(0x80 | ((cc >>> 6) & 0x3f)) + + fromCharCode(0x80 | ( cc & 0x3f))); } + }; + var re_utob = /[\uD800-\uDBFF][\uDC00-\uDFFFF]|[^\x00-\x7F]/g; + var utob = function(u) { + return u.replace(re_utob, cb_utob); + }; + var cb_encode = function(ccc) { + var padlen = [0, 2, 1][ccc.length % 3], + ord = ccc.charCodeAt(0) << 16 + | ((ccc.length > 1 ? ccc.charCodeAt(1) : 0) << 8) + | ((ccc.length > 2 ? ccc.charCodeAt(2) : 0)), + chars = [ + b64chars.charAt( ord >>> 18), + b64chars.charAt((ord >>> 12) & 63), + padlen >= 2 ? '=' : b64chars.charAt((ord >>> 6) & 63), + padlen >= 1 ? '=' : b64chars.charAt(ord & 63) + ]; + return chars.join(''); + }; + var btoa = global.btoa ? function(b) { + return global.btoa(b); + } : function(b) { + return b.replace(/[\s\S]{1,3}/g, cb_encode); + }; + var _encode = buffer ? function (u) { + return (u.constructor === buffer.constructor ? u : new buffer(u)) + .toString('base64') } - return result; -} -function readReverseBase64(){ - if (!base64Str) return END_OF_INPUT; - while (true){ - if (base64Count >= base64Str.length) return END_OF_INPUT; - var nextCharacter = base64Str.charAt(base64Count); - base64Count++; - if (reverseBase64Chars[nextCharacter]){ - return reverseBase64Chars[nextCharacter]; + : function (u) { return btoa(utob(u)) } + ; + var encode = function(u, urisafe) { + return !urisafe + ? _encode(String(u)) + : _encode(String(u)).replace(/[+\/]/g, function(m0) { + return m0 == '+' ? '-' : '_'; + }).replace(/=/g, ''); + }; + var encodeURI = function(u) { return encode(u, true) }; + // decoder stuff + var re_btou = new RegExp([ + '[\xC0-\xDF][\x80-\xBF]', + '[\xE0-\xEF][\x80-\xBF]{2}', + '[\xF0-\xF7][\x80-\xBF]{3}' + ].join('|'), 'g'); + var cb_btou = function(cccc) { + switch(cccc.length) { + case 4: + var cp = ((0x07 & cccc.charCodeAt(0)) << 18) + | ((0x3f & cccc.charCodeAt(1)) << 12) + | ((0x3f & cccc.charCodeAt(2)) << 6) + | (0x3f & cccc.charCodeAt(3)), + offset = cp - 0x10000; + return (fromCharCode((offset >>> 10) + 0xD800) + + fromCharCode((offset & 0x3FF) + 0xDC00)); + case 3: + return fromCharCode( + ((0x0f & cccc.charCodeAt(0)) << 12) + | ((0x3f & cccc.charCodeAt(1)) << 6) + | (0x3f & cccc.charCodeAt(2)) + ); + default: + return fromCharCode( + ((0x1f & cccc.charCodeAt(0)) << 6) + | (0x3f & cccc.charCodeAt(1)) + ); } - if (nextCharacter == 'A') return 0; + }; + var btou = function(b) { + return b.replace(re_btou, cb_btou); + }; + var cb_decode = function(cccc) { + var len = cccc.length, + padlen = len % 4, + n = (len > 0 ? b64tab[cccc.charAt(0)] << 18 : 0) + | (len > 1 ? b64tab[cccc.charAt(1)] << 12 : 0) + | (len > 2 ? b64tab[cccc.charAt(2)] << 6 : 0) + | (len > 3 ? b64tab[cccc.charAt(3)] : 0), + chars = [ + fromCharCode( n >>> 16), + fromCharCode((n >>> 8) & 0xff), + fromCharCode( n & 0xff) + ]; + chars.length -= [0, 0, 2, 1][padlen]; + return chars.join(''); + }; + var atob = global.atob ? 
function(a) { + return global.atob(a); + } : function(a){ + return a.replace(/[\s\S]{1,4}/g, cb_decode); + }; + var _decode = buffer ? function(a) { + return (a.constructor === buffer.constructor + ? a : new buffer(a, 'base64')).toString(); } - return END_OF_INPUT; -} -function ntos(n){ - n=n.toString(16); - if (n.length == 1) n="0"+n; - n="%"+n; - return unescape(n); -} + : function(a) { return btou(atob(a)) }; + var decode = function(a){ + return _decode( + String(a).replace(/[-_]/g, function(m0) { return m0 == '-' ? '+' : '/' }) + .replace(/[^A-Za-z0-9\+\/]/g, '') + ); + }; + var noConflict = function() { + var Base64 = global.Base64; + global.Base64 = _Base64; + return Base64; + }; + // export Base64 + global.Base64 = { + VERSION: version, + atob: atob, + btoa: btoa, + fromBase64: decode, + toBase64: encode, + utob: utob, + encode: encode, + encodeURI: encodeURI, + btou: btou, + decode: decode, + noConflict: noConflict + }; + // if ES5 is available, make Base64.extendString() available + if (typeof Object.defineProperty === 'function') { + var noEnum = function(v){ + return {value:v,enumerable:false,writable:true,configurable:true}; + }; + global.Base64.extendString = function () { + Object.defineProperty( + String.prototype, 'fromBase64', noEnum(function () { + return decode(this) + })); + Object.defineProperty( + String.prototype, 'toBase64', noEnum(function (urisafe) { + return encode(this, urisafe) + })); + Object.defineProperty( + String.prototype, 'toBase64URI', noEnum(function () { + return encode(this, true) + })); + }; + } + // that's it! +})(this); -function decodeBase64(str){ - setBase64Str(str); - var result = ""; - var inBuffer = new Array(4); - var done = false; - while (!done && (inBuffer[0] = readReverseBase64()) != END_OF_INPUT - && (inBuffer[1] = readReverseBase64()) != END_OF_INPUT){ - inBuffer[2] = readReverseBase64(); - inBuffer[3] = readReverseBase64(); - result += ntos((((inBuffer[0] << 2) & 0xff)| inBuffer[1] >> 4)); - if (inBuffer[2] != END_OF_INPUT){ - result += ntos((((inBuffer[1] << 4) & 0xff)| inBuffer[2] >> 2)); - if (inBuffer[3] != END_OF_INPUT){ - result += ntos((((inBuffer[2] << 6) & 0xff) | inBuffer[3])); - } else { - done = true; - } - } else { - done = true; - } - } - return result; +if (this['Meteor']) { + Base64 = global.Base64; // for normal export in Meteor.js } diff --git a/scripts/ems/web/bilingual-concordance.css b/scripts/ems/web/bilingual-concordance.css index e232337d2..4648a21dd 100644 --- a/scripts/ems/web/bilingual-concordance.css +++ b/scripts/ems/web/bilingual-concordance.css @@ -93,5 +93,6 @@ span.mismatch_aligned { td.pp_more { font-size: 70%; + color: navy; text-align: center; } diff --git a/scripts/ems/web/index.php b/scripts/ems/web/index.php index 6b785cf3f..d216b114a 100644 --- a/scripts/ems/web/index.php +++ b/scripts/ems/web/index.php @@ -8,7 +8,7 @@ require("diff.php"); require("sgviz.php"); function head($title) { - print ' + print ' '.$title.' diff --git a/scripts/ems/web/javascripts/scriptaculous-js-1.8.3/README.rdoc b/scripts/ems/web/javascripts/scriptaculous-js-1.8.3/README.rdoc index 21f8c8cf6..57f78eb53 100644 --- a/scripts/ems/web/javascripts/scriptaculous-js-1.8.3/README.rdoc +++ b/scripts/ems/web/javascripts/scriptaculous-js-1.8.3/README.rdoc @@ -32,8 +32,8 @@ in a directory of your website, e.g. /javascripts. 
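The replacement base64.js above (Dan Kogai's base64.js, version 2.1.7 per its header) does its encoding in cb_encode: each group of up to three bytes is packed into a 24-bit integer, split into four 6-bit indices into the alphabet, and padded with '=' when the group is short; cb_decode reverses the packing. The same arithmetic in C++, covering only the byte-level step (the utob/btou UTF-8 handling is left out):

#include <string>

static const char B64[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

// Encode raw bytes, padding the final group with '=' exactly as cb_encode does.
std::string Base64Encode(const std::string &in) {
  std::string out;
  for (size_t i = 0; i < in.size(); i += 3) {
    const size_t n = in.size() - i;                       // bytes in this group: 1, 2 or 3
    unsigned long ord = (unsigned long)(unsigned char)in[i] << 16;
    if (n > 1) ord |= (unsigned long)(unsigned char)in[i + 1] << 8;
    if (n > 2) ord |= (unsigned char)in[i + 2];
    out += B64[(ord >> 18) & 63];
    out += B64[(ord >> 12) & 63];
    out += (n >= 2) ? B64[(ord >> 6) & 63] : '=';
    out += (n >= 3) ? B64[ord & 63] : '=';
  }
  return out;
}
// Base64Encode("Man") == "TWFu", Base64Encode("Ma") == "TWE=", Base64Encode("M") == "TQ==".

On the server side, analysis.php already base64-encodes the concordance query it passes to the biconcor binary; base64.js appears to be the matching client-side encoder.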
Now, you can include the scripts by adding the following tags to the HEAD section of your HTML pages: - - + + scriptaculous.js will automatically load the other files of the script.aculo.us distribution in, provided they are accessible @@ -56,4 +56,4 @@ the sources of the examples provided. == License script.aculo.us is licensed under the terms of the MIT License, -see the included MIT-LICENSE file. \ No newline at end of file +see the included MIT-LICENSE file. diff --git a/scripts/ems/web/overview.php b/scripts/ems/web/overview.php index e56ed6f08..ce0434bb8 100644 --- a/scripts/ems/web/overview.php +++ b/scripts/ems/web/overview.php @@ -1,6 +1,5 @@ $dir[0]$dir[1]$dir[2]$dir[3]\n"; } print "\n"; - print "

To add experiment, edit /fs/thor4/html/experiment/setup"; + print "

To add experiment, edit the \"setup\" file.

"; } function overview() { @@ -26,7 +25,7 @@ function overview() { head("Task: $task ($user)"); print "Wiki Notes"; - print "     |     Overview of experiments     |     $dir

"; + print "     |     Overview of experiments     |     $dir

"; reset($experiment); print "

\n"; diff --git a/scripts/ems/web/progress.perl b/scripts/ems/web/progress.perl index 0612a0a44..fd742e410 100755 --- a/scripts/ems/web/progress.perl +++ b/scripts/ems/web/progress.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; use Date::Parse; diff --git a/scripts/fuzzy-match/create_xml.perl b/scripts/fuzzy-match/create_xml.perl index 56d4dff0f..80a1b3120 100755 --- a/scripts/fuzzy-match/create_xml.perl +++ b/scripts/fuzzy-match/create_xml.perl @@ -3,6 +3,7 @@ binmode( STDIN, ":utf8" ); binmode( STDOUT, ":utf8" ); +use warnings; use strict; use FindBin qw($RealBin); use File::Basename; diff --git a/scripts/generic/compound-splitter.perl b/scripts/generic/compound-splitter.perl index bbbccc8ef..c0b25f519 100755 --- a/scripts/generic/compound-splitter.perl +++ b/scripts/generic/compound-splitter.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; use Getopt::Long "GetOptions"; diff --git a/scripts/generic/extract-factors.pl b/scripts/generic/extract-factors.pl index 566849053..56c719051 100755 --- a/scripts/generic/extract-factors.pl +++ b/scripts/generic/extract-factors.pl @@ -6,6 +6,7 @@ #factor indices start at 0 #factor indices too large ought to be ignored +use warnings; use strict; my ($filename, @factors) = @ARGV; diff --git a/scripts/generic/extract-parallel.perl b/scripts/generic/extract-parallel.perl index 687a21e28..2b02fa869 100755 --- a/scripts/generic/extract-parallel.perl +++ b/scripts/generic/extract-parallel.perl @@ -3,6 +3,7 @@ # example # ./extract-parallel.perl 8 ./coreutils-8.9/src/split "./coreutils-8.9/src/sort --batch-size=253" ./extract ./corpus.5.en ./corpus.5.ar ./align.ar-en.grow-diag-final-and ./extracted 7 --NoFileLimit orientation --GZOutput +use warnings; use strict; use File::Basename; @@ -32,8 +33,8 @@ my $glueFile; my $phraseOrientation = 0; my $phraseOrientationPriorsFile; -my $GZIP_EXEC; # = which("pigz"); -if(-f "/usr/bin/pigz") { +my $GZIP_EXEC; +if(`which pigz`) { $GZIP_EXEC = 'pigz'; } else { diff --git a/scripts/generic/fsa2fsal.pl b/scripts/generic/fsa2fsal.pl index 8cfdc0462..50bff1404 100755 --- a/scripts/generic/fsa2fsal.pl +++ b/scripts/generic/fsa2fsal.pl @@ -5,6 +5,7 @@ # Some rudimentary sanity checks are done on the fly. # Ondrej Bojar, bojar@ufal.mff.cuni.cz +use warnings; use strict; my $errs = 0; diff --git a/scripts/generic/fsa2plf.pl b/scripts/generic/fsa2plf.pl index 1177b01d5..4e7454a9f 100755 --- a/scripts/generic/fsa2plf.pl +++ b/scripts/generic/fsa2plf.pl @@ -8,6 +8,7 @@ # Note that the output format may not contain any spaces. 
# Ondrej Bojar, bojar@ufal.mff.cuni.cz +use warnings; use strict; use Getopt::Long; diff --git a/scripts/generic/fsal2fsa.pl b/scripts/generic/fsal2fsa.pl index 26258587d..d1aa461ac 100755 --- a/scripts/generic/fsal2fsa.pl +++ b/scripts/generic/fsal2fsa.pl @@ -2,6 +2,7 @@ # A very simple script that converts fsal back to fsa format (openfst lattices) # Ondrej Bojar, bojar@ufal.mff.cuni.cz +use warnings; use strict; while (<>) { diff --git a/scripts/generic/generic-parallel.perl b/scripts/generic/generic-parallel.perl index b7dca1bc9..653912c5c 100755 --- a/scripts/generic/generic-parallel.perl +++ b/scripts/generic/generic-parallel.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; use utf8; diff --git a/scripts/generic/giza-parallel.perl b/scripts/generic/giza-parallel.perl index b5575e4d0..8793d3d8e 100755 --- a/scripts/generic/giza-parallel.perl +++ b/scripts/generic/giza-parallel.perl @@ -3,6 +3,7 @@ # example # ~/giza-parallel.perl 10 split ~/workspace/sourceforge/trunk/scripts/training/train-model.perl ar en train align +use warnings; use strict; use File::Basename; diff --git a/scripts/generic/lopar2pos.pl b/scripts/generic/lopar2pos.pl index d95389c05..c75069135 100755 --- a/scripts/generic/lopar2pos.pl +++ b/scripts/generic/lopar2pos.pl @@ -4,6 +4,8 @@ #lopar2pos: extract POSs from LOPAR output #usage: lopar2pos.pl CORPUS.lopar > CORPUS.pos +use warnings; + my $infilename = shift @ARGV; open(INFILE, "<$infilename") or die "couldn't open '$infilename' for read: $!\n"; while(my $line = ) diff --git a/scripts/generic/moses-parallel.pl b/scripts/generic/moses-parallel.pl index 4890864aa..7c0f56c70 100755 --- a/scripts/generic/moses-parallel.pl +++ b/scripts/generic/moses-parallel.pl @@ -15,6 +15,7 @@ # added checks for existence of decoder and configuration file # 26 Jul 2006 fix a bug related to the use of absolute path for srcfile and nbestfile +use warnings; use strict; ####################### diff --git a/scripts/generic/mteval-v12.pl b/scripts/generic/mteval-v12.pl index 0c771fc77..360376242 100755 --- a/scripts/generic/mteval-v12.pl +++ b/scripts/generic/mteval-v12.pl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; use utf8; use Encode; diff --git a/scripts/generic/multi-bleu.perl b/scripts/generic/multi-bleu.perl index 5ed6add74..2f44d419f 100755 --- a/scripts/generic/multi-bleu.perl +++ b/scripts/generic/multi-bleu.perl @@ -1,6 +1,7 @@ #!/usr/bin/env perl # $Id$ +use warnings; use strict; my $lowercase = 0; diff --git a/scripts/generic/ph_numbers.perl b/scripts/generic/ph_numbers.perl index b33cd2805..ea56927ac 100755 --- a/scripts/generic/ph_numbers.perl +++ b/scripts/generic/ph_numbers.perl @@ -7,6 +7,7 @@ package ph_numbers; # # (c) 2013 TAUS +use warnings; use strict; run() unless caller(); diff --git a/scripts/generic/qsub-wrapper.pl b/scripts/generic/qsub-wrapper.pl index c5b63a71b..622323bdb 100755 --- a/scripts/generic/qsub-wrapper.pl +++ b/scripts/generic/qsub-wrapper.pl @@ -1,6 +1,7 @@ #!/usr/bin/env perl # $Id$ +use warnings; use strict; ####################### diff --git a/scripts/generic/reverse-alignment.perl b/scripts/generic/reverse-alignment.perl index fc8c33dff..d00140c74 100755 --- a/scripts/generic/reverse-alignment.perl +++ b/scripts/generic/reverse-alignment.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; my $line; diff --git a/scripts/generic/score-parallel.perl b/scripts/generic/score-parallel.perl index d6f16b2fc..9e5ee0025 100755 --- a/scripts/generic/score-parallel.perl +++ 
b/scripts/generic/score-parallel.perl @@ -4,6 +4,7 @@ # ./score-parallel.perl 8 "gsort --batch-size=253" ./score ./extract.2.sorted.gz ./lex.2.f2e ./phrase-table.2.half.f2e --GoodTuring ./phrase-table.2.coc 0 # ./score-parallel.perl 8 "gsort --batch-size=253" ./score ./extract.2.inv.sorted.gz ./lex.2.e2f ./phrase-table.2.half.e2f --Inverse 1 +use warnings; use strict; use File::Basename; @@ -13,8 +14,8 @@ sub GetSourcePhrase($); sub NumStr($); sub CutContextFile($$$); -my $GZIP_EXEC; # = which("pigz"); -if(-f "/usr/bin/pigz") { +my $GZIP_EXEC; +if(`which pigz`) { $GZIP_EXEC = 'pigz'; } else { diff --git a/scripts/generic/strip-xml.perl b/scripts/generic/strip-xml.perl index 61b823ce2..95513b608 100755 --- a/scripts/generic/strip-xml.perl +++ b/scripts/generic/strip-xml.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; while (my $line = ) { diff --git a/scripts/generic/trainlm-irst2.perl b/scripts/generic/trainlm-irst2.perl index a84ea1c61..596143386 100755 --- a/scripts/generic/trainlm-irst2.perl +++ b/scripts/generic/trainlm-irst2.perl @@ -10,6 +10,7 @@ # irst-dir = /Users/hieu/workspace/irstlm/trunk/bin # Set smoothing method in settings, if different from modified Kneser-Ney +use warnings; use strict; use FindBin qw($RealBin); use Getopt::Long; diff --git a/scripts/generic/trainlm-lmplz.perl b/scripts/generic/trainlm-lmplz.perl deleted file mode 100755 index 045248675..000000000 --- a/scripts/generic/trainlm-lmplz.perl +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env perl - -# Compatible with sri LM-creating script, eg. -# ngram-count -order 5 -interpolate -wbdiscount -unk -text corpus.txt -lm lm.txt -# To use it in the EMS, add this to the [LM] section -# lm-training = "$moses-script-dir/generic/trainlm-lmplz.perl -lmplz $lmplz" -# settings = "-T $working-dir/tmp -S 10G" -# Also, make sure that $lmplz is defined (in the [LM] or [GENERAL] section. 
-# It should point to the binary file -# lmplz = /home/waziz/workspace/github/moses/bin/lmplz - -use strict; -use FindBin qw($RealBin); -use Getopt::Long qw/GetOptionsFromArray/; -#use Getopt::Long; -Getopt::Long::Configure("pass_through", "no_ignore_case"); - -my $order = 3; # order of language model (default trigram) -my $corpus; # input text data -my $lm; # generated language model -my $lmplz; # bin directory of IRSTLM -my $help = 0; - -my @optconfig = ( - "-order=s" => \$order, - "-text=s" => \$corpus, - "-lm=s" => \$lm, - "-lmplz=s" => \$lmplz, -); - -GetOptionsFromArray(\@ARGV, @optconfig); -die("ERROR: please set text") unless defined($corpus); -die("ERROR: please set lm") unless defined($lm); -die("ERROR: please set lmplz") unless defined($lmplz); - -my $settings = join(' ', @ARGV); -my $cmd = "$lmplz --order $order $settings < $corpus > $lm"; - -print STDERR "EXECUTING $cmd\n"; -`$cmd`; diff --git a/scripts/other/beautify.perl b/scripts/other/beautify.perl index 73ea51beb..130afd56b 100755 --- a/scripts/other/beautify.perl +++ b/scripts/other/beautify.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; use File::Basename; use FindBin qw($RealBin); diff --git a/scripts/other/delete-scores.perl b/scripts/other/delete-scores.perl index c0b723d64..08316c95b 100755 --- a/scripts/other/delete-scores.perl +++ b/scripts/other/delete-scores.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; use Getopt::Long "GetOptions"; diff --git a/scripts/other/get_many_translations_from_google.perl b/scripts/other/get_many_translations_from_google.perl index 6ef83e240..512b84e36 100755 --- a/scripts/other/get_many_translations_from_google.perl +++ b/scripts/other/get_many_translations_from_google.perl @@ -6,6 +6,7 @@ # # Ondrej Bojar, bojar@ufal.mff.cuni.cz +use warnings; use strict; use Getopt::Long; use CGI; diff --git a/scripts/recaser/detruecase.perl b/scripts/recaser/detruecase.perl index efa5e12b6..549cd8abe 100755 --- a/scripts/recaser/detruecase.perl +++ b/scripts/recaser/detruecase.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; use Getopt::Long "GetOptions"; diff --git a/scripts/recaser/recase.perl b/scripts/recaser/recase.perl index 0b1ded200..3ba83712a 100755 --- a/scripts/recaser/recase.perl +++ b/scripts/recaser/recase.perl @@ -1,6 +1,7 @@ #!/usr/bin/env perl # $Id$ +use warnings; use strict; use Getopt::Long "GetOptions"; diff --git a/scripts/recaser/train-recaser.perl b/scripts/recaser/train-recaser.perl index 27c5da198..87a720f6e 100755 --- a/scripts/recaser/train-recaser.perl +++ b/scripts/recaser/train-recaser.perl @@ -1,6 +1,7 @@ #!/usr/bin/env perl # $Id$ +use warnings; use strict; use FindBin qw($Bin); use Getopt::Long "GetOptions"; diff --git a/scripts/recaser/train-truecaser.perl b/scripts/recaser/train-truecaser.perl index b6e5c3884..b653a8ca5 100755 --- a/scripts/recaser/train-truecaser.perl +++ b/scripts/recaser/train-truecaser.perl @@ -8,6 +8,7 @@ # --possiblyUseFirstToken : boolean option; the default behaviour (when this option is not provided) is that the first token of a sentence is ignored, on the basis that the first word of a sentence is always capitalized; if this option is provided then: a) if a sentence-initial token is *not* capitalized, then it is counted, and b) if a capitalized sentence-initial token is the only token of the segment, then it is counted, but with only 10% of the weight of a normal token. 
# +use warnings; use strict; use Getopt::Long "GetOptions"; diff --git a/scripts/recaser/truecase.perl b/scripts/recaser/truecase.perl index d14d7ebe4..373aa509f 100755 --- a/scripts/recaser/truecase.perl +++ b/scripts/recaser/truecase.perl @@ -1,6 +1,8 @@ #!/usr/bin/env perl # $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $ + +use warnings; use strict; use Getopt::Long "GetOptions"; diff --git a/scripts/regression-testing/compare-results.pl b/scripts/regression-testing/compare-results.pl index 0d77ef8fc..df14d444f 100755 --- a/scripts/regression-testing/compare-results.pl +++ b/scripts/regression-testing/compare-results.pl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; my ($results, $truth) = @ARGV; diff --git a/scripts/regression-testing/create_localized_moses_ini.pl b/scripts/regression-testing/create_localized_moses_ini.pl index 78a033b32..612a39e82 100755 --- a/scripts/regression-testing/create_localized_moses_ini.pl +++ b/scripts/regression-testing/create_localized_moses_ini.pl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; my $script_dir; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; } use MosesScriptsRegressionTesting; diff --git a/scripts/regression-testing/modify-pars.pl b/scripts/regression-testing/modify-pars.pl index 4669ae0b6..5ad2514a4 100755 --- a/scripts/regression-testing/modify-pars.pl +++ b/scripts/regression-testing/modify-pars.pl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; my $argv=join(" ",@ARGV); diff --git a/scripts/regression-testing/moses-virtual.pl b/scripts/regression-testing/moses-virtual.pl index 55198900b..41ddd6b13 100755 --- a/scripts/regression-testing/moses-virtual.pl +++ b/scripts/regression-testing/moses-virtual.pl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; my %opt = (); diff --git a/scripts/regression-testing/run-single-test.pl b/scripts/regression-testing/run-single-test.pl index 2fa7b4dce..bb66e96f6 100755 --- a/scripts/regression-testing/run-single-test.pl +++ b/scripts/regression-testing/run-single-test.pl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; my $script_dir; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; } use MosesScriptsRegressionTesting; diff --git a/scripts/regression-testing/run-test-suite.pl b/scripts/regression-testing/run-test-suite.pl index d90dfa35d..8ae9ec60f 100755 --- a/scripts/regression-testing/run-test-suite.pl +++ b/scripts/regression-testing/run-test-suite.pl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; my $script_dir; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; } use Getopt::Long; diff --git a/scripts/tokenizer/deescape-special-chars-PTB.perl b/scripts/tokenizer/deescape-special-chars-PTB.perl index 17fe650d2..0e73a7718 100755 --- a/scripts/tokenizer/deescape-special-chars-PTB.perl +++ b/scripts/tokenizer/deescape-special-chars-PTB.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; while() { diff --git a/scripts/tokenizer/deescape-special-chars.perl b/scripts/tokenizer/deescape-special-chars.perl index dc810d817..076d1e62f 100755 --- a/scripts/tokenizer/deescape-special-chars.perl +++ b/scripts/tokenizer/deescape-special-chars.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; while() { diff --git a/scripts/tokenizer/detokenizer.perl b/scripts/tokenizer/detokenizer.perl 
index 14d6666c9..7874d5d04 100755 --- a/scripts/tokenizer/detokenizer.perl +++ b/scripts/tokenizer/detokenizer.perl @@ -7,6 +7,8 @@ binmode(STDIN, ":utf8"); binmode(STDOUT, ":utf8"); + +use warnings; use strict; use utf8; # tell perl this script file is in UTF-8 (see all funny punct below) @@ -36,7 +38,7 @@ if ($HELP) { exit; } -if ($language !~ /^(cs|en|fr|it)$/) { +if ($language !~ /^(cs|en|fr|it|fi)$/) { print STDERR "Warning: No built-in rules for language $language.\n" } @@ -176,6 +178,11 @@ sub detokenize { } + } elsif (($language eq "fi") && ($words[$i-1] =~ /:$/) && ($words[$i] =~ /^(N|n|A|a|Ä|ä|ssa|Ssa|ssä|Ssä|sta|stä|Sta|Stä|hun|Hun|hyn|Hyn|han|Han|hän|Hän|hön|Hön|un|Un|yn|Yn|an|An|än|Än|ön|Ön|seen|Seen|lla|Lla|llä|Llä|lta|Lta|ltä|Ltä|lle|Lle|ksi|Ksi|kse|Kse|tta|Tta|ine|Ine)(ni|si|mme|nne|nsa)?(ko|kö|han|hän|pa|pä|kaan|kään|kin)?$/)) { + # Finnish : without intervening space if followed by case suffix + # EU:N EU:n EU:ssa EU:sta EU:hun EU:iin ... + $text=$text. lc $words[$i]; + $prependSpace = " "; } else { $text=$text.$prependSpace.$words[$i]; $prependSpace = " "; diff --git a/scripts/tokenizer/escape-special-chars.perl b/scripts/tokenizer/escape-special-chars.perl index 79ae39469..e94b91744 100755 --- a/scripts/tokenizer/escape-special-chars.perl +++ b/scripts/tokenizer/escape-special-chars.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; while() { diff --git a/scripts/tokenizer/lowercase.perl b/scripts/tokenizer/lowercase.perl index cb1250938..9ee307bc2 100755 --- a/scripts/tokenizer/lowercase.perl +++ b/scripts/tokenizer/lowercase.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; binmode(STDIN, ":utf8"); diff --git a/scripts/tokenizer/normalize-punctuation.perl b/scripts/tokenizer/normalize-punctuation.perl index 8f779449f..db8f9c60e 100755 --- a/scripts/tokenizer/normalize-punctuation.perl +++ b/scripts/tokenizer/normalize-punctuation.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; my $language = "en"; diff --git a/scripts/tokenizer/pre-tok-clean.perl b/scripts/tokenizer/pre-tok-clean.perl new file mode 100755 index 000000000..900e992ee --- /dev/null +++ b/scripts/tokenizer/pre-tok-clean.perl @@ -0,0 +1,46 @@ +#!/usr/bin/env perl + +use strict; + +my $minChars = $ARGV[0]; +my $maxChars = $ARGV[1]; +my $inputStem = $ARGV[2]; +my $source = $ARGV[3]; +my $target = $ARGV[4]; +my $outputStem = $ARGV[5]; +my $linesRetained = $ARGV[6]; + +open(IN_SOURCE, "<:encoding(UTF-8)", "$inputStem.$source") or die "cannot open $inputStem.$source"; +open(IN_TARGET, "<:encoding(UTF-8)", "$inputStem.$target") or die "cannot open $inputStem.$target"; + +open(OUT_SOURCE, ">:encoding(UTF-8)", "$outputStem.$source") or die "cannot open $outputStem.$source"; +open(OUT_TARGET, ">:encoding(UTF-8)", "$outputStem.$target") or die "cannot open $outputStem.$target"; + +open(LINE_RETAINED, ">:encoding(UTF-8)", "$linesRetained"); + +my $lineNum = 0; +while (my $lineSource = ) { + ++$lineNum; + #print STDERR "$lineNum "; + + chomp($lineSource); + my $lineTarget = ; + chomp($lineTarget); + + my $lenSource = length($lineSource); + my $lenTarget = length($lineTarget); + + if ($lenSource < $minChars || $lenSource > $maxChars + || $lenTarget < $minChars || $lenTarget > $maxChars) { + # do nothing + } + else { + print OUT_SOURCE "$lineSource\n"; + print OUT_TARGET "$lineTarget\n"; + print LINE_RETAINED "$lineNum\n"; + } +} + +close(OUT_SOURCE); +close(OUT_SOURCE); +close(LINE_RETAINED); diff --git a/scripts/tokenizer/pre-tokenizer.perl 
b/scripts/tokenizer/pre-tokenizer.perl index 35134a9c0..499671b44 100755 --- a/scripts/tokenizer/pre-tokenizer.perl +++ b/scripts/tokenizer/pre-tokenizer.perl @@ -4,6 +4,7 @@ # Start by Ulrich Germann, after noticing systematic preprocessing errors # in some of the English Europarl data. +use warnings; use strict; use Getopt::Std; diff --git a/scripts/tokenizer/remove-non-printing-char.perl b/scripts/tokenizer/remove-non-printing-char.perl index 4dadd1d77..2b90dfd3b 100755 --- a/scripts/tokenizer/remove-non-printing-char.perl +++ b/scripts/tokenizer/remove-non-printing-char.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use utf8; binmode(STDIN, ":utf8"); diff --git a/scripts/tokenizer/replace-unicode-punctuation.perl b/scripts/tokenizer/replace-unicode-punctuation.perl index 748e1d063..08eb766bf 100755 --- a/scripts/tokenizer/replace-unicode-punctuation.perl +++ b/scripts/tokenizer/replace-unicode-punctuation.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; #binmode(STDIN, ":utf8"); diff --git a/scripts/tokenizer/tokenizer.perl b/scripts/tokenizer/tokenizer.perl index eeede0af0..8abffbea4 100755 --- a/scripts/tokenizer/tokenizer.perl +++ b/scripts/tokenizer/tokenizer.perl @@ -16,6 +16,7 @@ use warnings; binmode(STDIN, ":utf8"); binmode(STDOUT, ":utf8"); +use warnings; use FindBin qw($RealBin); use strict; use Time::HiRes; diff --git a/scripts/tokenizer/tokenizer_PTB.perl b/scripts/tokenizer/tokenizer_PTB.perl index 6417b7d6e..bce7a38a0 100755 --- a/scripts/tokenizer/tokenizer_PTB.perl +++ b/scripts/tokenizer/tokenizer_PTB.perl @@ -14,6 +14,7 @@ binmode(STDIN, ":utf8"); binmode(STDOUT, ":utf8"); +use warnings; use FindBin qw($RealBin); use strict; use Time::HiRes; diff --git a/scripts/training/absolutize_moses_model.pl b/scripts/training/absolutize_moses_model.pl index ecfcb3395..5c9c0970a 100755 --- a/scripts/training/absolutize_moses_model.pl +++ b/scripts/training/absolutize_moses_model.pl @@ -6,6 +6,8 @@ # # Ondrej Bojar. 
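The Finnish-specific rule added to scripts/tokenizer/detokenizer.perl above attaches a case-suffix token to a preceding token ending in ":", without a space and lowercased, so that a token sequence like "EU : ssa" is rendered as "EU:ssa". The following is a minimal C++ sketch of that joining rule only, written for illustration: join_finnish is a hypothetical helper, the suffix list is a reduced ASCII-only subset of the Perl regex, and it assumes the ":" has already been attached to the preceding word (as the detokenizer's punctuation handling does). The Perl rule in the patch is the authoritative version.

#include <cctype>
#include <iostream>
#include <regex>
#include <string>
#include <vector>

// Sketch of the Finnish case-suffix join (reduced suffix list, not the patch itself).
std::string join_finnish(const std::vector<std::string> &words) {
  // Case ending, optional possessive ending, optional clitic -- mirrors the shape
  // of the Perl pattern, but with far fewer alternatives.
  const std::regex suffix(
      "(n|an|in|un|ssa|sta|lla|lta|lle|ksi|seen|hun)"
      "(ni|si|mme|nne|nsa)?(kin|kaan|han|pa)?",
      std::regex::icase);
  std::string text;
  for (std::size_t i = 0; i < words.size(); ++i) {
    const bool prevEndsInColon =
        i > 0 && !words[i - 1].empty() && words[i - 1][words[i - 1].size() - 1] == ':';
    if (prevEndsInColon && std::regex_match(words[i], suffix)) {
      // Attach the suffix directly after the colon, lowercased.
      std::string low = words[i];
      for (std::size_t j = 0; j < low.size(); ++j)
        low[j] = std::tolower(static_cast<unsigned char>(low[j]));
      text += low;
    } else {
      if (!text.empty()) text += ' ';
      text += words[i];
    }
  }
  return text;
}

int main() {
  const char *toks[] = {"EU:", "ssa", "pidettiin", "kokous"};
  std::cout << join_finnish(std::vector<std::string>(toks, toks + 4)) << "\n";
  // Prints: EU:ssa pidettiin kokous
}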
+use warnings; + my $ini = shift; die "usage: absolutize_moses_model.pl path-to-moses.ini > moses.abs.ini" if !defined $ini; diff --git a/scripts/training/bilingual-lm/extract_training.py b/scripts/training/bilingual-lm/extract_training.py index 66f8f0413..cd8755580 100755 --- a/scripts/training/bilingual-lm/extract_training.py +++ b/scripts/training/bilingual-lm/extract_training.py @@ -147,7 +147,7 @@ def main(): #Numberize the file for line in ngrams_file_handle: - numberized_file_handle.write(extract.numberize(line, m, n, tvocab_idmap, tvocab_idmap)) + numberized_file_handle.write(extract.numberize(line, options.m, options.n, svocab_idmap, tvocab_idmap)) numberized_file_handle.close() ngrams_file_handle.close() diff --git a/scripts/training/binarize-model.perl b/scripts/training/binarize-model.perl index 0246190f2..3d4798ffd 100755 --- a/scripts/training/binarize-model.perl +++ b/scripts/training/binarize-model.perl @@ -4,6 +4,7 @@ # Binarize a Moses model # +use warnings; use strict; use Getopt::Long "GetOptions"; diff --git a/scripts/training/build-generation-table.perl b/scripts/training/build-generation-table.perl index 8b1303795..fb59f4acc 100755 --- a/scripts/training/build-generation-table.perl +++ b/scripts/training/build-generation-table.perl @@ -1,6 +1,7 @@ #!/usr/bin/env perl # $Id$ +use warnings; use strict; use Getopt::Long "GetOptions"; diff --git a/scripts/training/build-mmsapt.perl b/scripts/training/build-mmsapt.perl index bd8d1078f..a7ddaff70 100755 --- a/scripts/training/build-mmsapt.perl +++ b/scripts/training/build-mmsapt.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; use Getopt::Long "GetOptions"; use FindBin qw($RealBin); diff --git a/scripts/training/clean-corpus-n.perl b/scripts/training/clean-corpus-n.perl index 40e4d8935..e1e96528c 100755 --- a/scripts/training/clean-corpus-n.perl +++ b/scripts/training/clean-corpus-n.perl @@ -1,6 +1,7 @@ #!/usr/bin/env perl # $Id: clean-corpus-n.perl 3633 2010-10-21 09:49:27Z phkoehn $ +use warnings; use strict; use Getopt::Long; my $help; diff --git a/scripts/training/clone_moses_model.pl b/scripts/training/clone_moses_model.pl index 93e37b803..5e9dff72a 100755 --- a/scripts/training/clone_moses_model.pl +++ b/scripts/training/clone_moses_model.pl @@ -5,6 +5,7 @@ # in the current directory # All relevant files are hardlinked or copied to the directory, too. 
+use warnings; use strict; use Getopt::Long; diff --git a/scripts/training/convert-moses-ini-to-v2.perl b/scripts/training/convert-moses-ini-to-v2.perl index 1bc4fe79d..25c562ef4 100755 --- a/scripts/training/convert-moses-ini-to-v2.perl +++ b/scripts/training/convert-moses-ini-to-v2.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; my $header = ""; diff --git a/scripts/training/corpus-sizes.perl b/scripts/training/corpus-sizes.perl index 1eccf9bd5..02dd4ae9b 100755 --- a/scripts/training/corpus-sizes.perl +++ b/scripts/training/corpus-sizes.perl @@ -2,6 +2,7 @@ # $Id: consolidate-training-data.perl 928 2009-09-02 02:58:01Z philipp $ +use warnings; use strict; my ($in,$out,@PART) = @ARGV; diff --git a/scripts/training/exodus.perl b/scripts/training/exodus.perl index ef3d8df92..d3466f5dd 100755 --- a/scripts/training/exodus.perl +++ b/scripts/training/exodus.perl @@ -2,6 +2,7 @@ # $Id$ +use warnings; use strict; my @LINE = ; diff --git a/scripts/training/filter-model-given-input.pl b/scripts/training/filter-model-given-input.pl index dbafc73be..7dec0762c 100755 --- a/scripts/training/filter-model-given-input.pl +++ b/scripts/training/filter-model-given-input.pl @@ -8,6 +8,7 @@ # changes by Ondrej Bojar # adapted for hierarchical models by Phil Williams +use warnings; use strict; use FindBin qw($RealBin); diff --git a/scripts/training/get-lexical.perl b/scripts/training/get-lexical.perl index 2dcf7437f..45fe6d54c 100755 --- a/scripts/training/get-lexical.perl +++ b/scripts/training/get-lexical.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; use FindBin qw($RealBin); BEGIN { require "$RealBin/LexicalTranslationModel.pm"; "LexicalTranslationModel"->import; } diff --git a/scripts/training/giza2bal.pl b/scripts/training/giza2bal.pl index 8b2150e31..56fc9a466 100755 --- a/scripts/training/giza2bal.pl +++ b/scripts/training/giza2bal.pl @@ -7,6 +7,8 @@ #Copyright Marcello Federico, November 2004 +#use warnings; + ($cnt,$dir,$inv)=(); while ($w=shift @ARGV){ @@ -17,7 +19,7 @@ while ($w=shift @ARGV){ my $lc = 0; -if (!$dir || !inv){ +if (!$dir || !$inv){ print "usage: giza2bal.pl [-c ] -d -i \n"; print "input files can be also commands, e.g. 
-d \"gunzip -c file.gz\"\n"; exit(0); diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl index 5d1f9b368..86084abbf 100755 --- a/scripts/training/mert-moses.pl +++ b/scripts/training/mert-moses.pl @@ -47,6 +47,7 @@ # 13 Oct 2004 Use alternative decoders (DWC) # Original version by Philipp Koehn +use warnings; use strict; use FindBin qw($RealBin); use File::Basename; diff --git a/scripts/training/postprocess-lopar.perl b/scripts/training/postprocess-lopar.perl index 9962d5594..5171e02fb 100755 --- a/scripts/training/postprocess-lopar.perl +++ b/scripts/training/postprocess-lopar.perl @@ -2,6 +2,7 @@ # $Id$ +use warnings; use strict; use utf8; diff --git a/scripts/training/reduce-factors.perl b/scripts/training/reduce-factors.perl index bc08a3a9d..c265652f6 100755 --- a/scripts/training/reduce-factors.perl +++ b/scripts/training/reduce-factors.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; use Getopt::Long "GetOptions"; use FindBin qw($RealBin); diff --git a/scripts/training/remove-orphan-phrase-pairs-from-reordering-table.perl b/scripts/training/remove-orphan-phrase-pairs-from-reordering-table.perl index 2f412cd28..bd5d7f1d2 100755 --- a/scripts/training/remove-orphan-phrase-pairs-from-reordering-table.perl +++ b/scripts/training/remove-orphan-phrase-pairs-from-reordering-table.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; my ($ttable_file) = @ARGV; diff --git a/scripts/training/strip-xml.perl b/scripts/training/strip-xml.perl new file mode 100755 index 000000000..0f403d15d --- /dev/null +++ b/scripts/training/strip-xml.perl @@ -0,0 +1,17 @@ +#!/usr/bin/env perl + +# strip text file of any XML markup + +binmode(STDIN, ":utf8"); +binmode(STDOUT, ":utf8"); + +use strict; + +while() { + s/<\S[^>]*>/ /g; + chomp; + s/ +/ /g; + s/^ //; + print $_; + print "\n"; +} diff --git a/scripts/training/threshold-filter.perl b/scripts/training/threshold-filter.perl index f8d15a8ae..a23fb8b5c 100755 --- a/scripts/training/threshold-filter.perl +++ b/scripts/training/threshold-filter.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; my %MIN_SCORE; diff --git a/scripts/training/train-global-lexicon-model.perl b/scripts/training/train-global-lexicon-model.perl index 20ee42b72..0e7d3077d 100755 --- a/scripts/training/train-global-lexicon-model.perl +++ b/scripts/training/train-global-lexicon-model.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; use Getopt::Long "GetOptions"; use Switch; diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl index ade5c5277..4c355479c 100755 --- a/scripts/training/train-model.perl +++ b/scripts/training/train-model.perl @@ -1,5 +1,6 @@ -#!/usr/bin/env perl +#!/usr/bin/env perl +use warnings; use strict; use Getopt::Long "GetOptions"; use FindBin qw($RealBin); @@ -404,8 +405,8 @@ else { $SORT_EXEC = 'sort'; } -my $GZIP_EXEC; # = which("pigz"); -if(-f "/usr/bin/pigz") { +my $GZIP_EXEC; +if(`which pigz`) { $GZIP_EXEC = 'pigz'; } else { diff --git a/scripts/training/wrappers/berkeleyparsed2mosesxml.perl b/scripts/training/wrappers/berkeleyparsed2mosesxml.perl index 3bbf982b7..3dd8fc4ac 100755 --- a/scripts/training/wrappers/berkeleyparsed2mosesxml.perl +++ b/scripts/training/wrappers/berkeleyparsed2mosesxml.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; while() { diff --git a/scripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl b/scripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl index 91fc515cb..e61a53652 100755 
--- a/scripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl +++ b/scripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; while() { diff --git a/scripts/training/wrappers/filter-excluded-lines.perl b/scripts/training/wrappers/filter-excluded-lines.perl index 2f1e25ad4..7f9da3efa 100755 --- a/scripts/training/wrappers/filter-excluded-lines.perl +++ b/scripts/training/wrappers/filter-excluded-lines.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; use Getopt::Long; diff --git a/scripts/training/wrappers/find-unparseable.perl b/scripts/training/wrappers/find-unparseable.perl index 0bbf35df4..b0d38027b 100755 --- a/scripts/training/wrappers/find-unparseable.perl +++ b/scripts/training/wrappers/find-unparseable.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; my $lineNum = 1; diff --git a/scripts/training/wrappers/mada-wrapper.perl b/scripts/training/wrappers/mada-wrapper.perl index eec10a3ef..20f76f821 100755 --- a/scripts/training/wrappers/mada-wrapper.perl +++ b/scripts/training/wrappers/mada-wrapper.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; use File::Temp qw/tempfile/; use Getopt::Long "GetOptions"; diff --git a/scripts/training/wrappers/madamira-wrapper.perl b/scripts/training/wrappers/madamira-wrapper.perl new file mode 100755 index 000000000..6e7efe245 --- /dev/null +++ b/scripts/training/wrappers/madamira-wrapper.perl @@ -0,0 +1,93 @@ +#!/usr/bin/env perl + +use warnings; +use strict; +use File::Temp qw/tempfile/; +use Getopt::Long "GetOptions"; +use File::Basename; +use FindBin qw($RealBin); +use Cwd 'abs_path'; + +my $TMPDIR = "tmp"; +my $SCHEME = "D2"; +my $KEEP_TMP = 0; +my $MADA_DIR; + +GetOptions( + "scheme=s" => \$SCHEME, + "tmpdir=s" => \$TMPDIR, + "keep-tmp" => \$KEEP_TMP, + "mada-dir=s" => \$MADA_DIR + ) or die("ERROR: unknown options"); + +$TMPDIR = abs_path($TMPDIR); +print STDERR "TMPDIR=$TMPDIR \n"; + +#binmode(STDIN, ":utf8"); +#binmode(STDOUT, ":utf8"); + +$TMPDIR = "$TMPDIR/madamira.$$"; +`mkdir -p $TMPDIR`; +`mkdir -p $TMPDIR/split`; +`mkdir -p $TMPDIR/out`; + +my $infile = "$TMPDIR/input"; +print STDERR $infile."\n"; + +open(TMP,">$infile"); +while() { + print TMP $_; +} +close(TMP); + +my $cmd; + +# split input file +my $SPLIT_EXEC = `gsplit --help 2>/dev/null`; +if($SPLIT_EXEC) { + $SPLIT_EXEC = 'gsplit'; +} +else { + $SPLIT_EXEC = 'split'; +} + +$cmd = "$SPLIT_EXEC -l 10000 -a 7 -d $TMPDIR/input $TMPDIR/split/x"; +`$cmd`; + +$cmd = "cd $MADA_DIR && parallel --jobs 4 java -Xmx2500m -Xms2500m -XX:NewRatio=3 -jar $MADA_DIR/MADAMIRA.jar -rawinput {} -rawoutdir $TMPDIR/out -rawconfig $MADA_DIR/samples/sampleConfigFile.xml ::: $TMPDIR/split/x*"; +print STDERR "Executing: $cmd\n"; +`$cmd`; + +$cmd = "cat $TMPDIR/out/x*.mada > $infile.mada"; +print STDERR "Executing: $cmd\n"; +`$cmd`; + +# get stuff out of mada output +open(MADA_OUT,"<$infile.mada"); +#binmode(MADA_OUT, ":utf8"); +while(my $line = ) { + chop($line); + #print STDERR "line=$line \n"; + + if (index($line, "SENTENCE BREAK") == 0) { + # new sentence + #print STDERR "BREAK\n"; + print "\n"; + } + elsif (index($line, ";;WORD") == 0) { + # word + my $word = substr($line, 7, length($line) - 8); + #print STDERR "FOund $word\n"; + print "$word "; + } + else { + #print STDERR "NADA\n"; + } +} +close (MADA_OUT); + + +if ($KEEP_TMP == 0) { +# `rm -rf $TMPDIR`; +} + diff --git a/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl 
b/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl index cf7473e44..88d16b3f6 100755 --- a/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl +++ b/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; my ($lowercase, $cluster_file,$in,$out,$tmp) = @ARGV; diff --git a/scripts/training/wrappers/make-factor-de-morph.perl b/scripts/training/wrappers/make-factor-de-morph.perl index 4b2c90495..1cc917bce 100755 --- a/scripts/training/wrappers/make-factor-de-morph.perl +++ b/scripts/training/wrappers/make-factor-de-morph.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; use Encode; use FindBin qw($RealBin); diff --git a/scripts/training/wrappers/make-factor-de-pos.perl b/scripts/training/wrappers/make-factor-de-pos.perl index 8cc28695a..2eadd4123 100755 --- a/scripts/training/wrappers/make-factor-de-pos.perl +++ b/scripts/training/wrappers/make-factor-de-pos.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; my ($in,$out,$tmpdir) = @ARGV; diff --git a/scripts/training/wrappers/make-factor-en-pos.mxpost.perl b/scripts/training/wrappers/make-factor-en-pos.mxpost.perl index 3ab2b1ca4..0d27aa12f 100755 --- a/scripts/training/wrappers/make-factor-en-pos.mxpost.perl +++ b/scripts/training/wrappers/make-factor-en-pos.mxpost.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; use FindBin qw($RealBin); diff --git a/scripts/training/wrappers/make-factor-pos.tree-tagger.perl b/scripts/training/wrappers/make-factor-pos.tree-tagger.perl index 1e00a8fa3..2af6eb75c 100755 --- a/scripts/training/wrappers/make-factor-pos.tree-tagger.perl +++ b/scripts/training/wrappers/make-factor-pos.tree-tagger.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; # handle switches diff --git a/scripts/training/wrappers/make-factor-stem.perl b/scripts/training/wrappers/make-factor-stem.perl index c222ad0df..60aca0b34 100755 --- a/scripts/training/wrappers/make-factor-stem.perl +++ b/scripts/training/wrappers/make-factor-stem.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; my ($size,$in,$out) = @ARGV; diff --git a/scripts/training/wrappers/make-factor-suffix.perl b/scripts/training/wrappers/make-factor-suffix.perl index d13c43230..7e864ea0c 100755 --- a/scripts/training/wrappers/make-factor-suffix.perl +++ b/scripts/training/wrappers/make-factor-suffix.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; my ($size,$in,$out) = @ARGV; diff --git a/scripts/training/wrappers/mosesxml2berkeleyparsed.perl b/scripts/training/wrappers/mosesxml2berkeleyparsed.perl index f7855e06d..fc1f0c532 100755 --- a/scripts/training/wrappers/mosesxml2berkeleyparsed.perl +++ b/scripts/training/wrappers/mosesxml2berkeleyparsed.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; #( (NP (NP (NN resumption)) (PP (IN of) (NP (DT the) (NN session)))) ) diff --git a/scripts/training/wrappers/parse-de-berkeley.perl b/scripts/training/wrappers/parse-de-berkeley.perl index b8b546953..68df07c49 100755 --- a/scripts/training/wrappers/parse-de-berkeley.perl +++ b/scripts/training/wrappers/parse-de-berkeley.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; use Getopt::Long "GetOptions"; use FindBin qw($RealBin); diff --git a/scripts/training/wrappers/parse-de-bitpar.perl b/scripts/training/wrappers/parse-de-bitpar.perl index 8cb34055c..4723d6aa0 100755 --- a/scripts/training/wrappers/parse-de-bitpar.perl +++ 
b/scripts/training/wrappers/parse-de-bitpar.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; use Getopt::Long "GetOptions"; use FindBin qw($RealBin); diff --git a/scripts/training/wrappers/parse-en-collins.perl b/scripts/training/wrappers/parse-en-collins.perl index 3d879c06b..27b33a2dd 100755 --- a/scripts/training/wrappers/parse-en-collins.perl +++ b/scripts/training/wrappers/parse-en-collins.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; use File::Basename; use File::Temp qw/tempfile/; diff --git a/scripts/training/wrappers/parse-en-egret.perl b/scripts/training/wrappers/parse-en-egret.perl index fc330c70f..c3d23a4ee 100755 --- a/scripts/training/wrappers/parse-en-egret.perl +++ b/scripts/training/wrappers/parse-en-egret.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; use Getopt::Long "GetOptions"; use FindBin qw($RealBin); diff --git a/scripts/training/wrappers/syntax-hyphen-splitting.perl b/scripts/training/wrappers/syntax-hyphen-splitting.perl index 2c830f6b6..1bb616939 100755 --- a/scripts/training/wrappers/syntax-hyphen-splitting.perl +++ b/scripts/training/wrappers/syntax-hyphen-splitting.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; use Getopt::Long "GetOptions"; diff --git a/scripts/training/wrappers/tagger-german-chunk.perl b/scripts/training/wrappers/tagger-german-chunk.perl index b6b2871ba..4f26efabe 100755 --- a/scripts/training/wrappers/tagger-german-chunk.perl +++ b/scripts/training/wrappers/tagger-german-chunk.perl @@ -1,5 +1,6 @@ #!/usr/bin/env perl +use warnings; use strict; use Getopt::Long "GetOptions"; diff --git a/symal/symal.cpp b/symal/symal.cpp index dbe68f1b9..249aa6caa 100644 --- a/symal/symal.cpp +++ b/symal/symal.cpp @@ -67,7 +67,7 @@ int verbose=0; int lc = 0; -int getals(fstream& inp,int& m, int *a,int& n, int *b) +int getals(istream& inp,int& m, int *a,int& n, int *b) { char w[MAX_WORD], dummy[10]; int i,j,freq; @@ -121,7 +121,7 @@ int getals(fstream& inp,int& m, int *a,int& n, int *b) //compute union alignment -int prunionalignment(fstream& out,int m,int *a,int n,int* b) +int prunionalignment(ostream& out,int m,int *a,int n,int* b) { ostringstream sout; @@ -150,7 +150,7 @@ int prunionalignment(fstream& out,int m,int *a,int n,int* b) //Compute intersection alignment -int printersect(fstream& out,int m,int *a,int n,int* b) +int printersect(ostream& out,int m,int *a,int n,int* b) { ostringstream sout; @@ -174,7 +174,7 @@ int printersect(fstream& out,int m,int *a,int n,int* b) //Compute target-to-source alignment -int printtgttosrc(fstream& out,int m,int *a,int n,int* b) +int printtgttosrc(ostream& out,int m,int *a,int n,int* b) { ostringstream sout; @@ -198,7 +198,7 @@ int printtgttosrc(fstream& out,int m,int *a,int n,int* b) //Compute source-to-target alignment -int printsrctotgt(fstream& out,int m,int *a,int n,int* b) +int printsrctotgt(ostream& out,int m,int *a,int n,int* b) { ostringstream sout; @@ -226,7 +226,7 @@ int printsrctotgt(fstream& out,int m,int *a,int n,int* b) //to represent the grow alignment as the unionalignment of a //directed and inverted alignment -int printgrow(fstream& out,int m,int *a,int n,int* b, bool diagonal=false,bool final=false,bool bothuncovered=false) +int printgrow(ostream& out,int m,int *a,int n,int* b, bool diagonal=false,bool final=false,bool bothuncovered=false) { ostringstream sout; @@ -392,8 +392,8 @@ int main(int argc, char** argv) { int alignment=0; - char* input=(char*)"/dev/stdin"; - char* output=(char*)"/dev/stdout"; + char* input= 
NULL; + char* output= NULL; int diagonal=false; int final=false; int bothuncovered=false; @@ -421,23 +421,29 @@ int main(int argc, char** argv) << "Input file or std must be in .bal format (see script giza2bal.pl).\n"; exit(1); - } - fstream inp(input,ios::in); - fstream out(output,ios::out); + istream *inp = &std::cin; + ostream *out = &std::cout; - if (!inp.is_open()) { - cerr << "cannot open " << input << "\n"; - exit(1); + if (input) { + fstream *fin = new fstream(input,ios::in); + if (!fin->is_open()) { + cerr << "cannot open " << input << "\n"; + exit(1); + } + inp = fin; } - if (!out.is_open()) { - cerr << "cannot open " << output << "\n"; - exit(1); + if (output) { + fstream *fout = new fstream(output,ios::out); + if (!fout->is_open()) { + cerr << "cannot open " << output << "\n"; + exit(1); + } + out = fout; } - int a[MAX_M],b[MAX_N],m,n; fa=new int[MAX_M+1]; ea=new int[MAX_N+1]; @@ -450,16 +456,16 @@ int main(int argc, char** argv) switch (alignment) { case UNION: cerr << "symal: computing union alignment\n"; - while(getals(inp,m,a,n,b)) { - prunionalignment(out,m,a,n,b); + while(getals(*inp,m,a,n,b)) { + prunionalignment(*out,m,a,n,b); sents++; } cerr << "Sents: " << sents << endl; break; case INTERSECT: cerr << "symal: computing intersect alignment\n"; - while(getals(inp,m,a,n,b)) { - printersect(out,m,a,n,b); + while(getals(*inp,m,a,n,b)) { + printersect(*out,m,a,n,b); sents++; } cerr << "Sents: " << sents << endl; @@ -469,15 +475,15 @@ int main(int argc, char** argv) << diagonal << ") final ("<< final << ")" << "both-uncovered (" << bothuncovered <<")\n"; - while(getals(inp,m,a,n,b)) - printgrow(out,m,a,n,b,diagonal,final,bothuncovered); + while(getals(*inp,m,a,n,b)) + printgrow(*out,m,a,n,b,diagonal,final,bothuncovered); break; case TGTTOSRC: cerr << "symal: computing target-to-source alignment\n"; - while(getals(inp,m,a,n,b)) { - printtgttosrc(out,m,a,n,b); + while(getals(*inp,m,a,n,b)) { + printtgttosrc(*out,m,a,n,b); sents++; } cerr << "Sents: " << sents << endl; @@ -485,8 +491,8 @@ int main(int argc, char** argv) case SRCTOTGT: cerr << "symal: computing source-to-target alignment\n"; - while(getals(inp,m,a,n,b)) { - printsrctotgt(out,m,a,n,b); + while(getals(*inp,m,a,n,b)) { + printsrctotgt(*out,m,a,n,b); sents++; } cerr << "Sents: " << sents << endl; @@ -500,5 +506,12 @@ int main(int argc, char** argv) for (int i=1; i<=MAX_N; i++) delete [] A[i]; delete [] A; + if (inp != &std::cin) { + delete inp; + } + if (out != &std::cout) { + delete inp; + } + exit(0); } diff --git a/util/Jamfile b/util/Jamfile index 18b20a33a..2d3cede01 100644 --- a/util/Jamfile +++ b/util/Jamfile @@ -21,7 +21,7 @@ obj file_piece_test.o : file_piece_test.cc /top//boost_unit_test_framework : $(c fakelib parallel_read : parallel_read.cc : multi:/top//boost_thread multi:WITH_THREADS : : .. ; -fakelib kenutil : bit_packing.cc ersatz_progress.cc exception.cc file.cc file_piece.cc mmap.cc murmur_hash.cc parallel_read pool.cc read_compressed scoped.cc string_piece.cc usage.cc double-conversion//double-conversion : .. LINUX,single:rt : : .. ; +fakelib kenutil : bit_packing.cc ersatz_progress.cc exception.cc file.cc file_piece.cc mmap.cc murmur_hash.cc parallel_read pool.cc random.cc read_compressed scoped.cc string_piece.cc usage.cc double-conversion//double-conversion : .. LINUX,single:rt : : .. 
; exe cat_compressed : cat_compressed_main.cc kenutil ; @@ -32,5 +32,5 @@ import testing ; run file_piece_test.o kenutil /top//boost_unit_test_framework : : file_piece.cc ; for local t in [ glob *_test.cc : file_piece_test.cc read_compressed_test.cc ] { local name = [ MATCH "(.*)\.cc" : $(t) ] ; - unit-test $(name) : $(t) kenutil /top//boost_unit_test_framework /top//boost_system ; + unit-test $(name) : $(t) kenutil /top//boost_unit_test_framework /top//boost_filesystem /top//boost_system ; } diff --git a/util/mmap.hh b/util/mmap.hh index 9b1e120f3..37feb5bee 100644 --- a/util/mmap.hh +++ b/util/mmap.hh @@ -100,9 +100,12 @@ typedef enum { extern const int kFileFlags; -// Wrapper around mmap to check it worked and hide some platform macros. +// Cross-platform, error-checking wrapper for mmap(). void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, uint64_t offset = 0); +// Cross-platform, error-checking wrapper for munmap(). +void UnmapOrThrow(void *start, size_t length); + void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scoped_memory &out); void MapAnonymous(std::size_t size, scoped_memory &to); diff --git a/util/random.cc b/util/random.cc new file mode 100644 index 000000000..4db1a61ee --- /dev/null +++ b/util/random.cc @@ -0,0 +1,43 @@ +#include "util/random.hh" + +#include + +#include +#include +#include + +namespace util +{ +namespace +{ +/** Lock to protect randomizer. + * + * This module is implemented in terms of rand()/srand() from . + * These functions are standard C, but they're not thread-safe. Scalability + * is not worth much complexity here, so just slap a mutex around it. + */ +boost::mutex rand_lock; +} // namespace + +void rand_init(unsigned int seed) +{ + boost::lock_guard lock(rand_lock); + srand(seed); +} + + +void rand_init() +{ + rand_init(time(NULL)); +} + +namespace internal +{ +// This is the one call to the actual randomizer. All else is built on this. +int rand_int() +{ + boost::lock_guard lock(rand_lock); + return std::rand(); +} +} // namespace internal +} // namespace util diff --git a/util/random.hh b/util/random.hh new file mode 100644 index 000000000..6c2773520 --- /dev/null +++ b/util/random.hh @@ -0,0 +1,229 @@ +#ifndef UTIL_RANDOM_H +#define UTIL_RANDOM_H + +#include +#include + +namespace util +{ +/** Thread-safe, cross-platform random number generator. + * + * This is not for proper security-grade randomness, but should be "good + * enough" for producing arbitrary values of various numeric types. + * + * Before starting, call rand_init() to seed the randomizer. There is no need + * to do this more than once; in fact doing it more often is likely to make the + * randomizer less effective. Once that is done, call the rand(), rand_excl(), + * and rand_incl() functions as needed to generate pseudo-random numbers. + * + * Probability distribution is roughly uniform, but for integral types is + * skewed slightly towards lower numbers depending on how close "top" comes to + * RAND_MAX. + * + * For floating-point types, resolution is limited; there will actually be + * only RAND_MAX different possible values. + */ + +/** Initialize randomizer with a fixed seed. + * + * After this, unless the randomizer gets seeded again, consecutive calls to + * the random functions will return a sequence of pseudo-random numbers + * determined by the seed. Every time the randomizer is seeded with this same + * seed, it will again start returning the same sequence of numbers. 
+ */ +void rand_init(unsigned int); + +/** Initialize randomizer based on current time. + * + * Call this to make the randomizer return hard-to-predict numbers. It won't + * produce high-grade randomness, but enough to make the program act + * differently on different runs. + * + * The seed will be based on the current time in seconds. So calling it twice + * within the same second will just reset the randomizer to where it was before. + * Don't do that. + */ +void rand_init(); + + +/** Return a pseudorandom number between 0 and RAND_MAX inclusive. + * + * Initialize (seed) the randomizer before starting to call this. + */ +template inline T rand(); + + +/** Return a pseudorandom number in the half-open interval [bottom, top). + * + * Generates a value between "bottom" (inclusive) and "top" (exclusive), + * assuming that (top - bottom) <= RAND_MAX. + */ +template inline T rand_excl(T bottom, T top); + + +/** Return a pseudorandom number in the half-open interval [0, top). + * + * Generates a value between 0 (inclusive) and "top" (exclusive), assuming that + * bottom <= RAND_MAX. + */ +template inline T rand_excl(T top); + + +/** Return a pseudorandom number in the open interval [bottom, top]. + * + * Generates a value between "bottom" and "top" inclusive, assuming that + * (top - bottom) < RAND_MAX. + */ +template inline T rand_incl(T bottom, T top); + + +/** Return a pseudorandom number in the open interval [0, top]. + * + * Generates a value between 0 and "top" inclusive, assuming that + * bottom < RAND_MAX. + */ +template inline T rand_incl(T top); + + +/** Return a pseudorandom number which may be larger than RAND_MAX. + * + * The requested type must be integral, and its size must be an even multiple + * of the size of an int. The return value will combine one or more random + * ints into a single value, which could get quite large. + * + * The result is nonnegative. Because the constituent ints are also + * nonnegative, the most significant bit in each of the ints will be zero, + * so for a wider type, there will be "gaps" in the range of possible outputs. + */ +template inline T wide_rand(); + +/** Return a pseudorandom number in [0, top), not limited to RAND_MAX. + * + * Works like wide_rand(), but if the requested type is wider than an int, it + * accommodates larger top values than an int can represent. + */ +template inline T wide_rand_excl(T top); + +/** Return a pseudorandom number in [bottom, top), not limited to RAND_MAX. + * + * Works like wide_rand(), but if the requested type is wider than an int, it + * accommodates larger value ranges than an int can represent. + */ +template inline T wide_rand_excl(T bottom, T top); + +/** Return a pseudorandom number in [0, top], not limited to RAND_MAX. + * + * Works like wide_rand(), but if the requested type is wider than an int, it + * accommodates larger top values than an int can represent. + */ +template inline T wide_rand_incl(T top); + +/** Return a pseudorandom number in [bottom, top], not limited to RAND_MAX. + * + * Works like wide_rand(), but if the requested type is wider than an int, it + * accommodates larger top values than an int can represent. + */ +template inline T wide_rand_incl(T bottom, T top); + + +/// Implementation detail. For the random module's internal use only. +namespace internal +{ +/// The central call to the randomizer upon which this whole module is built. +int rand_int(); + +/// Helper template: customize random values to required ranges. 
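A short usage sketch for the util::random interface declared above. It assumes the declarations take a single template parameter, i.e. template<class T> (the angle-bracketed parameters are not visible in the patch as rendered here); the main() scaffolding is illustration only and not part of the patch.

#include <cstddef>
#include <iostream>
#include "util/random.hh"

int main() {
  util::rand_init();                       // seed once, based on the current time

  const int raw = util::rand<int>();       // 0 .. RAND_MAX inclusive
  const int die = util::rand_incl(1, 6);   // 1 .. 6 inclusive
  const std::size_t idx =
      util::rand_excl(std::size_t(10));    // 0 .. 9, e.g. an index into a container
  const float f = util::rand_excl(0.0f, 1.0f);   // [0, 1), with limited resolution
  const long long big = util::wide_rand<long long>();  // may exceed RAND_MAX on
                                           // platforms where sizeof(long long) is a
                                           // multiple of sizeof(int)

  // The documented skew for integral ranges comes from the modulo scaling: with
  // RAND_MAX == 32767 there are 32768 raw values, so for top == 1000 the results
  // 0..767 each arise from 33 raw values while 768..999 arise from only 32.
  std::cout << raw << ' ' << die << ' ' << idx << ' ' << f << ' ' << big << '\n';
  return 0;
}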
+template struct random_scaler; + +/// Specialized random_scaler for integral types. +template struct random_scaler +{ + static T rnd_excl(T value, T range) { return value % range; } + static T rnd_incl(T value, T range) { return value % (range + 1); } +}; + +/// Specialized random_scaler for non-integral types. +template struct random_scaler +{ + static T rnd_excl(T value, T range) + { + // Promote RAND_MAX to T before adding one to avoid overflow. + return range * value / (T(RAND_MAX) + 1); + } + static T rnd_incl(T value, T range) { return range * value / RAND_MAX; } +}; + +/// Helper for filling a wider variable with random ints. +template struct wide_random_collector +{ + static T generate() + { + T one_int = util::rand() << (8 * sizeof(int)); + return one_int | wide_random_collector::generate(); + } +}; +/// Specialized wide_random_collector for generating just a single int. +template struct wide_random_collector +{ + static T generate() { return util::rand(); } +}; + +} // namespace internal + + +template inline T rand() +{ + return T(util::internal::rand_int()); +} + +template inline T rand_excl(T top) +{ + typedef internal::random_scaler::is_integer> scaler; + return scaler::rnd_excl(util::rand(), top); +} + +template inline T rand_excl(T bottom, T top) +{ + return bottom + rand_excl(top - bottom); +} + +template inline T rand_incl(T top) +{ + typedef internal::random_scaler::is_integer> scaler; + return scaler::rnd_incl(util::rand(), top); +} + +template inline T rand_incl(T bottom, T top) +{ + return bottom + rand_incl(top - bottom); +} + +template inline T wide_rand() +{ + return internal::wide_random_collector::generate(); +} + +template inline T wide_rand_excl(T top) +{ + typedef internal::random_scaler::is_integer> scaler; + return scaler::rnd_excl(util::wide_rand(), top); +} + +template inline T wide_rand_excl(T bottom, T top) +{ + return bottom + wide_rand_excl(top - bottom); +} + +template inline T wide_rand_incl(T top) +{ + typedef internal::random_scaler::is_integer> scaler; + return scaler::rnd_incl(util::wide_rand(), top); +} + +template inline T wide_rand_incl(T bottom, T top) +{ + return bottom + wide_rand_incl(top - bottom); +} +} // namespace util + +#endif diff --git a/util/random_test.cc b/util/random_test.cc new file mode 100644 index 000000000..6d8981de8 --- /dev/null +++ b/util/random_test.cc @@ -0,0 +1,191 @@ +#include + +#include "util/random.hh" + +#define BOOST_TEST_MODULE RandomTest +#include + +namespace util +{ +namespace +{ + +BOOST_AUTO_TEST_CASE(rand_int_returns_positive_no_greater_than_RAND_MAX) +{ + rand_init(); + for (int i=0; i<100; i++) + { + const int random_number = rand(); + BOOST_CHECK(random_number >= 0); + BOOST_CHECK(random_number <= RAND_MAX); + } +} + +BOOST_AUTO_TEST_CASE(rand_int_returns_different_consecutive_numbers) +{ + rand_init(99); + const int first = rand(), second = rand(), third = rand(); + // Sometimes you'll get the same number twice in a row, but generally the + // randomizer returns different numbers. 
+ BOOST_CHECK(second != first || third != first); +} + +BOOST_AUTO_TEST_CASE(rand_int_returns_different_numbers_for_different_seeds) +{ + rand_init(1); + const int one1 = rand(), one2 = rand(); + rand_init(2); + const int two1 = rand(), two2 = rand(); + BOOST_CHECK(two1 != one1 || two2 != one2); +} + +BOOST_AUTO_TEST_CASE(rand_int_returns_same_sequence_for_same_seed) +{ + rand_init(1); + const int first = rand(); + rand_init(1); + const int second = rand(); + BOOST_CHECK_EQUAL(first, second); +} + +BOOST_AUTO_TEST_CASE(rand_excl_int_returns_number_in_range) +{ + const int bottom = 10, top = 50; + for (int i=0; i<100; i++) + { + const int random_number = rand_excl(bottom, top); + BOOST_CHECK(random_number >= bottom); + BOOST_CHECK(random_number < top); + } +} + +BOOST_AUTO_TEST_CASE(rand_excl_int_covers_full_range) +{ + // The spread of random numbers really goes all the way from 0 (inclusive) + // to "top" (exclusive). It's not some smaller subset. + // This test will randomly fail sometimes, though very very rarely, when the + // random numbers don't actually have enough different values. + const int bottom = 1, top = 4; + int lowest = 99, highest = -1; + for (int i=0; i<100; i++) + { + const int random_number = rand_excl(bottom, top); + lowest = std::min(lowest, random_number); + highest = std::max(highest, random_number); + } + + BOOST_CHECK_EQUAL(lowest, bottom); + BOOST_CHECK_EQUAL(highest, top - 1); +} + +BOOST_AUTO_TEST_CASE(rand_incl_int_returns_number_in_range) +{ + const int bottom = 10, top = 50; + for (int i=0; i<100; i++) + { + const int random_number = rand_incl(bottom, top); + BOOST_CHECK(random_number >= 0); + BOOST_CHECK(random_number <= top); + } +} + +BOOST_AUTO_TEST_CASE(rand_incl_int_covers_full_range) +{ + // The spread of random numbers really goes all the way from 0 to "top" + // inclusive. It's not some smaller subset. + // This test will randomly fail sometimes, though very very rarely, when the + // random numbers don't actually have enough different values. 
+ const int bottom = 1, top = 4; + int lowest = 99, highest = -1; + for (int i=0; i<100; i++) + { + const int random_number = rand_incl(bottom, top); + lowest = std::min(lowest, random_number); + highest = std::max(highest, random_number); + } + + BOOST_CHECK_EQUAL(lowest, bottom); + BOOST_CHECK_EQUAL(highest, top); +} + +BOOST_AUTO_TEST_CASE(rand_excl_float_returns_float_in_range) +{ + const float bottom = 5, top = 10; + for (int i=0; i<100; i++) + { + const float random_number = rand_excl(bottom, top); + BOOST_CHECK(random_number >= bottom); + BOOST_CHECK(random_number < top); + } +} + +BOOST_AUTO_TEST_CASE(rand_excl_float_returns_different_values) +{ + const float bottom = 5, top = 10; + float lowest = 99, highest = -1; + for (int i=0; i<10; i++) + { + const float random_number = rand_excl(bottom, top); + lowest = std::min(lowest, random_number); + highest = std::max(highest, random_number); + } + BOOST_CHECK(lowest < highest); +} + +BOOST_AUTO_TEST_CASE(rand_float_incl_returns_float_in_range) +{ + const float bottom = 5, top = 10; + for (int i=0; i<1000; i++) + { + const float random_number = rand_excl(bottom, top); + BOOST_CHECK(random_number >= bottom); + BOOST_CHECK(random_number <= top); + } +} + +BOOST_AUTO_TEST_CASE(rand_float_incl_returns_different_values) +{ + const float bottom = 0, top = 10; + float lowest = 99, highest = -1; + for (int i=0; i<10; i++) + { + const float random_number = rand_excl(bottom, top); + lowest = std::min(lowest, random_number); + highest = std::max(highest, random_number); + } + BOOST_CHECK(lowest < highest); +} + +BOOST_AUTO_TEST_CASE(wide_rand_int_returns_different_numbers_in_range) +{ + for (int i=0; i<100; i++) + { + const int random_number = wide_rand(); + BOOST_CHECK(random_number >= 0); + BOOST_CHECK(random_number <= RAND_MAX); + } +} + +BOOST_AUTO_TEST_CASE(wide_rand_long_long_returns_big_numbers) +{ + long long one = wide_rand(), two = wide_rand(); + // This test will fail sometimes because of unlucky random numbers, but only + // very very rarely. + BOOST_CHECK(one > RAND_MAX || two > RAND_MAX); +} + +BOOST_AUTO_TEST_CASE(wide_rand_excl_supports_larger_range) +{ + const long long top = 1000 * (long long)RAND_MAX; + long long + one = wide_rand_excl(top), + two = wide_rand_excl(top); + BOOST_CHECK(one < top); + BOOST_CHECK(two < top); + // This test will fail sometimes because of unlucky random numbers, but only + // very very rarely. + BOOST_CHECK(one > RAND_MAX || two > RAND_MAX); +} + +} // namespace +} // namespace util diff --git a/util/tempfile.hh b/util/tempfile.hh new file mode 100644 index 000000000..9b872a27e --- /dev/null +++ b/util/tempfile.hh @@ -0,0 +1,151 @@ +#ifndef UTIL_TEMPFILE_H +#define UTIL_TEMPFILE_H + +// Utilities for creating temporary files and directories. + +#include +#include +#include +#include + +#if defined(_WIN32) || defined(_WIN64) +#include +#endif + +#include +#include + +#include "util/exception.hh" +#include "util/unistd.hh" + +namespace util +{ + +/// Obtain a directory for temporary files, e.g. /tmp. +std::string temp_location() +{ +#if defined(_WIN32) || defined(_WIN64) + char dir_buffer[1000]; + if (GetTempPath(1000, dir_buffer) == 0) + throw std::runtime_error("Could not read temporary directory."); + return std::string(dir_buffer); +#else + // POSIX says to try these environment variables, in this order: + const char *const vars[] = {"TMPDIR", "TMP", "TEMPDIR", "TEMP", 0}; + for (int i=0; vars[i]; ++i) + { + const char *val = getenv(vars[i]); + // Environment variable is set and nonempty. Use it. 
+ if (val && *val) return val; + } + // No environment variables set. Default to /tmp. + return "/tmp"; +#endif +} + + +#if defined(_WIN32) || defined(_WIN64) +/// Windows helper: create temporary filename. +std::string windows_tmpnam() +{ + const std::string tmp = temp_location(); + char output_buffer[MAX_PATH]; + if (GetTempFileName(tmp.c_str(), "tmp", 0, output_buffer) == 0) + throw std::runtime_error("Could not create temporary file name."); + return output_buffer; +} +#else +/** POSIX helper: create template for temporary filename. + * + * Writes the template into buf, which must have room for at least PATH_MAX + * bytes. The function fails if the template is too long. + */ +void posix_tmp_template(char *buf) +{ + const std::string tmp = temp_location(); + const std::string name_template = tmp + "/tmp.XXXXXX"; + if (name_template.size() >= PATH_MAX-1) + throw std::runtime_error("Path for temp files is too long: " + tmp); + strcpy(buf, name_template.c_str()); +} +#endif + + +/** Temporary directory. + * + * Automatically creates, and on destruction deletes, a temporary directory. + * The actual directory in the filesystem will only exist while the temp_dir + * object exists. + * + * If the directory no longer exists by the time the temp_dir is destroyed, + * cleanup is skipped. + */ +class temp_dir : boost::noncopyable +{ +public: + temp_dir() + { +#if defined(_WIN32) || defined(_WIN64) + m_path = windows_tmpnam(); + boost::filesystem::create_directory(m_path); +#else + char buf[PATH_MAX]; + posix_tmp_template(buf); + m_path = std::string(mkdtemp(buf)); +#endif + } + + ~temp_dir() + { + boost::filesystem::remove_all(path()); + } + + /// Return the temporary directory's full path. + const std::string &path() const { return m_path; } + +private: + std::string m_path; +}; + + +/** Temporary file. + * + * Automatically creates, and on destruction deletes, a temporary file. + * + * If the file no longer exists by the time the temp_file is destroyed, + * cleanup is skipped. + */ +class temp_file : boost::noncopyable +{ +public: + temp_file() + { +#if defined(_WIN32) || defined(_WIN64) + m_path = windows_tmpnam(); + std::ofstream out(m_path.c_str()); + out.flush(); +#else + char buf[PATH_MAX]; + posix_tmp_template(buf); + const int fd = mkstemp(buf); + if (fd == -1) throw ErrnoException(); + close(fd); + m_path = buf; +#endif + } + + ~temp_file() + { + boost::filesystem::remove(path()); + } + + /// Return the temporary file's full path. 
+ const std::string &path() const { return m_path; } + +private: + std::string m_path; +}; + +} // namespace util + +#endif diff --git a/util/tempfile_test.cc b/util/tempfile_test.cc new file mode 100644 index 000000000..49736fe0c --- /dev/null +++ b/util/tempfile_test.cc @@ -0,0 +1,119 @@ +#include "util/tempfile.hh" + +#include + +#include + +#define BOOST_TEST_MODULE TempFileTest +#include + +namespace util +{ +namespace +{ + +BOOST_AUTO_TEST_CASE(temp_dir_has_path) +{ + BOOST_CHECK(temp_dir().path().size() > 0); +} + +BOOST_AUTO_TEST_CASE(temp_dir_creates_temp_directory) +{ + const temp_dir t; + BOOST_CHECK(boost::filesystem::exists(t.path())); + BOOST_CHECK(boost::filesystem::is_directory(t.path())); +} + +BOOST_AUTO_TEST_CASE(temp_dir_creates_unique_directory) +{ + BOOST_CHECK(temp_dir().path() != temp_dir().path()); +} + +BOOST_AUTO_TEST_CASE(temp_dir_cleans_up_directory) +{ + std::string path; + { + const temp_dir t; + path = t.path(); + } + BOOST_CHECK(!boost::filesystem::exists(path)); +} + +BOOST_AUTO_TEST_CASE(temp_dir_cleanup_succeeds_if_directory_contains_file) +{ + std::string path; + { + const temp_dir t; + path = t.path(); + boost::filesystem::create_directory(path + "/directory"); + std::ofstream file((path + "/file").c_str()); + file << "Text"; + file.flush(); + } + BOOST_CHECK(!boost::filesystem::exists(path)); +} + +BOOST_AUTO_TEST_CASE(temp_dir_cleanup_succeeds_if_directory_is_gone) +{ + std::string path; + { + const temp_dir t; + path = t.path(); + boost::filesystem::remove_all(path); + } + BOOST_CHECK(!boost::filesystem::exists(path)); +} + +BOOST_AUTO_TEST_CASE(temp_file_has_path) +{ + BOOST_CHECK(temp_file().path().size() > 0); +} + +BOOST_AUTO_TEST_CASE(temp_file_creates_temp_file) +{ + const temp_file f; + BOOST_CHECK(boost::filesystem::exists(f.path())); + BOOST_CHECK(boost::filesystem::is_regular_file(f.path())); +} + +BOOST_AUTO_TEST_CASE(temp_file_creates_unique_file) +{ + BOOST_CHECK(temp_file().path() != temp_file().path()); +} + +BOOST_AUTO_TEST_CASE(temp_file_creates_writable_file) +{ + const std::string data = "Test-data-goes-here"; + const temp_file f; + std::ofstream outfile(f.path().c_str()); + outfile << data; + outfile.flush(); + std::string read_data; + std::ifstream infile(f.path().c_str()); + infile >> read_data; + BOOST_CHECK_EQUAL(data, read_data); +} + +BOOST_AUTO_TEST_CASE(temp_file_cleans_up_file) +{ + std::string path; + { + const temp_file f; + path = f.path(); + } + BOOST_CHECK(!boost::filesystem::exists(path)); +} + +BOOST_AUTO_TEST_CASE(temp_file_cleanup_succeeds_if_file_is_gone) +{ + std::string path; + { + const temp_file t; + path = t.path(); + boost::filesystem::remove(path); + } + BOOST_CHECK(!boost::filesystem::exists(path)); +} + +} // namespace anonymous +} // namespace util diff --git a/util/tokenize.hh b/util/tokenize.hh new file mode 100644 index 000000000..5d8430222 --- /dev/null +++ b/util/tokenize.hh @@ -0,0 +1,51 @@ +#ifndef TOKENIZE_H +#define TOKENIZE_H + +#include +#include + +namespace util +{ + +/** Split input text into a series of tokens. + * + * Splits on spaces and tabs, no other whitespace characters, and is not + * locale-sensitive. + * + * The spaces themselves are not included. A sequence of consecutive space/tab + * characters count as one. 
+ */ +inline std::vector tokenize(const char input[]) +{ + std::vector token; + bool betweenWords = true; + int start = 0; + int i; + for(i = 0; input[i] != '\0'; i++) { + const bool isSpace = (input[i] == ' ' || input[i] == '\t'); + + if (!isSpace && betweenWords) { + start = i; + betweenWords = false; + } else if (isSpace && !betweenWords) { + token.push_back( std::string( input+start, i-start ) ); + betweenWords = true; + } + } + if (!betweenWords) + token.push_back( std::string( input+start, i-start ) ); + return token; +} + +/** Split input string into a series of tokens. + * + * Like tokenize(const char[]), but takes a std::string. + */ +inline std::vector tokenize(const std::string &input) +{ + return tokenize(input.c_str()); +} + +} // namespace util + +#endif diff --git a/util/tokenize_test.cc b/util/tokenize_test.cc new file mode 100644 index 000000000..d879fa97f --- /dev/null +++ b/util/tokenize_test.cc @@ -0,0 +1,69 @@ +#include "util/tokenize.hh" + +#define BOOST_TEST_MODULE TokenizeTest +#include + +namespace util +{ +namespace +{ + +BOOST_AUTO_TEST_CASE(empty_text_yields_empty_vector) +{ + const std::vector tokens = util::tokenize(""); + BOOST_CHECK_EQUAL(tokens.size(), 0); +} + +BOOST_AUTO_TEST_CASE(whitespace_only_yields_empty_vector) +{ + const std::vector tokens = util::tokenize(" "); + BOOST_CHECK_EQUAL(tokens.size(), 0); +} + +BOOST_AUTO_TEST_CASE(parses_single_token) +{ + const std::vector tokens = util::tokenize("mytoken"); + BOOST_CHECK_EQUAL(tokens.size(), 1); + BOOST_CHECK_EQUAL(tokens[0], "mytoken"); +} + +BOOST_AUTO_TEST_CASE(ignores_leading_whitespace) +{ + const std::vector tokens = util::tokenize(" \t mytoken"); + BOOST_CHECK_EQUAL(tokens.size(), 1); + BOOST_CHECK_EQUAL(tokens[0], "mytoken"); +} + +BOOST_AUTO_TEST_CASE(ignores_trailing_whitespace) +{ + const std::vector tokens = util::tokenize("mytoken \t "); + BOOST_CHECK_EQUAL(tokens.size(), 1); + BOOST_CHECK_EQUAL(tokens[0], "mytoken"); +} + +BOOST_AUTO_TEST_CASE(splits_tokens_on_tabs) +{ + const std::vector tokens = util::tokenize("one\ttwo"); + BOOST_CHECK_EQUAL(tokens.size(), 2); + BOOST_CHECK_EQUAL(tokens[0], "one"); + BOOST_CHECK_EQUAL(tokens[1], "two"); +} + +BOOST_AUTO_TEST_CASE(splits_tokens_on_spaces) +{ + const std::vector tokens = util::tokenize("one two"); + BOOST_CHECK_EQUAL(tokens.size(), 2); + BOOST_CHECK_EQUAL(tokens[0], "one"); + BOOST_CHECK_EQUAL(tokens[1], "two"); +} + +BOOST_AUTO_TEST_CASE(treats_sequence_of_space_as_one_space) +{ + const std::vector tokens = util::tokenize("one\t \ttwo"); + BOOST_CHECK_EQUAL(tokens.size(), 2); + BOOST_CHECK_EQUAL(tokens[0], "one"); + BOOST_CHECK_EQUAL(tokens[1], "two"); +} + +} // namespace +} // namespace util diff --git a/util/unistd.hh b/util/unistd.hh index 0379c4914..f99be592a 100644 --- a/util/unistd.hh +++ b/util/unistd.hh @@ -1,7 +1,7 @@ #ifndef UTIL_UNISTD_H #define UTIL_UNISTD_H -#if defined(_WIN32) || defined(_WIN64) +#if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__) // Windows doesn't define //
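To round off the new helpers, here is a combined usage sketch for util/tempfile.hh and util/tokenize.hh as added above. It assumes the elided template arguments are std::string (i.e. tokenize returns std::vector<std::string>); the program itself is an illustration, not part of the patch.

#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#include "util/tempfile.hh"
#include "util/tokenize.hh"

int main() {
  // RAII temporary file: created here, removed when `f` goes out of scope.
  util::temp_file f;
  {
    std::ofstream out(f.path().c_str());
    out << "one two\tthree\n";
  }  // ofstream closed and flushed here

  std::ifstream in(f.path().c_str());
  std::string line;
  std::getline(in, line);

  // Splits on spaces and tabs; consecutive whitespace counts as one separator,
  // so this prints three tokens: "one", "two", "three".
  const std::vector<std::string> tokens = util::tokenize(line);
  for (std::size_t i = 0; i < tokens.size(); ++i)
    std::cout << i << ": " << tokens[i] << '\n';
  return 0;
}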