Merge branch 'master' of github.com:moses-smt/mosesdecoder

2024-12-26 21:42:19 +03:00 · 2015-04-14 11:30:33 +04:00 · 2015-04-14 11:30:33 +04:00 · 044968bb4b
commit 044968bb4b
parent 7af653ac80 6162223690
136 changed files with 231 additions and 14 deletions
--- a/contrib/other-builds/all.workspace
+++ b/contrib/other-builds/all.workspace
@ -7,8 +7,8 @@
  <Project Name="lm" Path="lm/lm.project" Active="No"/>
  <Project Name="OnDiskPt" Path="OnDiskPt/OnDiskPt.project" Active="No"/>
  <Project Name="search" Path="search/search.project" Active="No"/>
-  <Project Name="moses" Path="moses/moses.project" Active="No"/>
-  <Project Name="moses-cmd" Path="moses-cmd/moses-cmd.project" Active="Yes"/>
+  <Project Name="moses" Path="moses/moses.project" Active="Yes"/>
+  <Project Name="moses-cmd" Path="moses-cmd/moses-cmd.project" Active="No"/>
  <Project Name="score" Path="score/score.project" Active="No"/>
  <Project Name="consolidate" Path="consolidate/consolidate.project" Active="No"/>
  <BuildMatrix>
--- a/contrib/other-builds/manual-label/manual-label.project
+++ b/contrib/other-builds/manual-label/manual-label.project
@ -1,5 +1,22 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <CodeLite_Project Name="manual-label" InternalType="Console">
+  <Plugins>
+    <Plugin Name="CMakePlugin">
+      <![CDATA[[{
+  "name": "Debug",
+  "enabled": false,
+  "buildDirectory": "build",
+  "sourceDirectory": "$(ProjectPath)",
+  "generator": "",
+  "buildType": "",
+  "arguments": [],
+  "parentProject": ""
+ }]]]>
+    </Plugin>
+    <Plugin Name="qmake">
+      <![CDATA[00010001N0005Debug000000000000]]>
+    </Plugin>
+  </Plugins>
  <Description/>
  <Dependencies/>
  <VirtualDirectory Name="manual-label">
@ -14,6 +31,8 @@
    <File Name="Main.cpp"/>
    <File Name="Main.h"/>
  </VirtualDirectory>
+  <Dependencies Name="Debug"/>
+  <Dependencies Name="Release"/>
  <Settings Type="Executable">
    <GlobalSettings>
      <Compiler Options="" C_Options="" Assembler="">
@ -33,6 +52,8 @@
      <Linker Options="" Required="yes">
        <LibraryPath Value="/Users/hieu/workspace/github/mosesdecoder/boost/lib64"/>
        <Library Value="boost_program_options"/>
+        <Library Value="boost_filesystem"/>
+        <Library Value="boost_system"/>
      </Linker>
      <ResourceCompiler Options="" Required="no"/>
      <General OutputFile="$(IntermediateDirectory)/$(ProjectName)" IntermediateDirectory="./Debug" Command="./$(ProjectName)" CommandArguments="" UseSeparateDebugArgs="no" DebugArguments="" WorkingDirectory="$(IntermediateDirectory)" PauseExecWhenProcTerminates="yes" IsGUIProgram="no" IsEnabled="yes"/>
@ -107,6 +128,4 @@
      </Completion>
    </Configuration>
  </Settings>
-  <Dependencies Name="Debug"/>
-  <Dependencies Name="Release"/>
 </CodeLite_Project>
--- a/contrib/other-builds/moses/moses.project
+++ b/contrib/other-builds/moses/moses.project
@ -474,8 +474,6 @@
    <File Name="../../../moses/FF/DistortionScoreProducer.h"/>
    <File Name="../../../moses/FF/DynamicCacheBasedLanguageModel.cpp"/>
    <File Name="../../../moses/FF/DynamicCacheBasedLanguageModel.h"/>
-    <File Name="../../../moses/FF/ExternalFeature.cpp"/>
-    <File Name="../../../moses/FF/ExternalFeature.h"/>
    <File Name="../../../moses/FF/Factory.cpp"/>
    <File Name="../../../moses/FF/Factory.h"/>
    <File Name="../../../moses/FF/FeatureFunction.cpp"/>
--- a/moses/Syntax/F2S/HyperTreeLoader.cpp
+++ b/moses/Syntax/F2S/HyperTreeLoader.cpp
@ -40,12 +40,12 @@ bool HyperTreeLoader::Load(const std::vector<FactorType> &input,
                           const std::vector<FactorType> &output,
                           const std::string &inFile,
                           const RuleTableFF &ff,
-                           HyperTree &trie)
+                           HyperTree &trie,
+                           boost::unordered_set<std::size_t> &sourceTermSet)
 {
  PrintUserTime(std::string("Start loading HyperTree"));

-  // const StaticData &staticData = StaticData::Instance();
-  // const std::string &factorDelimiter = staticData.GetFactorDelimiter();
+  sourceTermSet.clear();

  std::size_t count = 0;

@ -106,6 +106,7 @@ bool HyperTreeLoader::Load(const std::vector<FactorType> &input,
    // Source-side
    HyperPath sourceFragment;
    hyperPathLoader.Load(sourceString, sourceFragment);
+    ExtractSourceTerminalSetFromHyperPath(sourceFragment, sourceTermSet);

    // Target-side
    TargetPhrase *targetPhrase = new TargetPhrase(&ff);
@ -144,6 +145,23 @@ bool HyperTreeLoader::Load(const std::vector<FactorType> &input,
  return true;
 }

+void HyperTreeLoader::ExtractSourceTerminalSetFromHyperPath(
+    const HyperPath &hp, boost::unordered_set<std::size_t> &sourceTerminalSet)
+{  // Adds to sourceTerminalSet each ID in hp.nodeSeqs that passes the filter below.
+  for (std::vector<HyperPath::NodeSeq>::const_iterator p = hp.nodeSeqs.begin();
+       p != hp.nodeSeqs.end(); ++p) {  // p: one node sequence of the hyperpath
+    for (std::vector<std::size_t>::const_iterator q = p->begin();
+         q != p->end(); ++q) {  // q: one ID within that sequence
+      const std::size_t factorId = *q;
+      if (factorId >= moses_MaxNumNonterminals &&  // NOTE(review): smaller IDs are presumably non-terminals - confirm
+          factorId != HyperPath::kComma &&  // the kComma ID is never recorded
+          factorId != HyperPath::kEpsilon) {  // the kEpsilon ID is never recorded
+        sourceTerminalSet.insert(factorId);  // the set is only added to; it is not cleared in this function
+      }
+    }
+  }
+}
+
 }  // namespace F2S
 }  // namespace Syntax
 }  // namespace Moses
--- a/moses/Syntax/F2S/HyperTreeLoader.h
+++ b/moses/Syntax/F2S/HyperTreeLoader.h
@ -3,9 +3,12 @@
 #include <istream>
 #include <vector>

+#include <boost/unordered_set.hpp>
+
 #include "moses/TypeDef.h"
 #include "moses/Syntax/RuleTableFF.h"

+#include "HyperPath.h"
 #include "HyperTree.h"
 #include "HyperTreeCreator.h"

@ -23,7 +26,12 @@ public:
            const std::vector<FactorType> &output,
            const std::string &inFile,
            const RuleTableFF &,
-            HyperTree &);
+            HyperTree &,
+            boost::unordered_set<std::size_t> &);
+
+private:
+  void ExtractSourceTerminalSetFromHyperPath(
+     const HyperPath &, boost::unordered_set<std::size_t> &);
 };

 }  // namespace F2S
--- a/moses/Syntax/F2S/Manager-inl.h
+++ b/moses/Syntax/F2S/Manager-inl.h
@ -38,6 +38,7 @@ Manager<RuleMatcher>::Manager(const InputType &source)
  if (const ForestInput *p = dynamic_cast<const ForestInput*>(&source)) {
    m_forest = p->GetForest();
    m_rootVertex = p->GetRootVertex();
+    m_sentenceLength = p->GetSize();
  } else if (const TreeInput *p = dynamic_cast<const TreeInput*>(&source)) {
    T2S::InputTreeBuilder builder;
    T2S::InputTree tmpTree;
@ -45,6 +46,7 @@ Manager<RuleMatcher>::Manager(const InputType &source)
    boost::shared_ptr<Forest> forest = boost::make_shared<Forest>();
    m_rootVertex = T2S::InputTreeToForest(tmpTree, *forest);
    m_forest = forest;
+    m_sentenceLength = p->GetSize();
  } else {
    UTIL_THROW2("ERROR: F2S::Manager requires input to be a tree or forest");
  }
@ -82,8 +84,13 @@ void Manager<RuleMatcher>::Decode()
       p = sortedVertices.begin(); p != sortedVertices.end(); ++p) {
    const Forest::Vertex &vertex = **p;

-    // Skip terminal vertices.
+    // Skip terminal vertices (after checking if they are OOVs).
    if (vertex.incoming.empty()) {
+      if (vertex.pvertex.span.GetStartPos() > 0 &&
+          vertex.pvertex.span.GetEndPos() < m_sentenceLength-1 &&
+          IsUnknownSourceWord(vertex.pvertex.symbol)) {
+        m_oovs.insert(vertex.pvertex.symbol);
+      }
      continue;
    }

@ -189,6 +196,21 @@ void Manager<RuleMatcher>::InitializeStacks()
  }
 }

+template<typename RuleMatcher>
+bool Manager<RuleMatcher>::IsUnknownSourceWord(const Word &w) const
+{  // Returns true iff w's ID is absent from the source terminal set of every RuleTableFF instance.
+  const std::size_t factorId = w[0]->GetId();  // ID taken from element 0 of the word
+  const std::vector<RuleTableFF*> &ffs = RuleTableFF::Instances();
+  for (std::size_t i = 0; i < ffs.size(); ++i) {
+    RuleTableFF *ff = ffs[i];
+    const boost::unordered_set<std::size_t> &sourceTerms =
+      ff->GetSourceTerminalSet();
+    if (sourceTerms.find(factorId) != sourceTerms.end()) {
+      return false;  // at least one rule table lists this ID
+    }
+  }
+  return true;  // no table lists the ID (also the result when there are no instances)
+}

 template<typename RuleMatcher>
 const SHyperedge *Manager<RuleMatcher>::GetBestSHyperedge() const
--- a/moses/Syntax/F2S/Manager.h
+++ b/moses/Syntax/F2S/Manager.h
@ -51,10 +51,13 @@ private:

  void InitializeStacks();

+  bool IsUnknownSourceWord(const Word &) const;
+
  void RecombineAndSort(const std::vector<SHyperedge*> &, SVertexStack &);

  boost::shared_ptr<const Forest> m_forest;
  const Forest::Vertex *m_rootVertex;
+  std::size_t m_sentenceLength;  // Includes <s> and </s>
  PVertexToStackMap m_stackMap;
  boost::shared_ptr<HyperTree> m_glueRuleTrie;
  std::vector<boost::shared_ptr<RuleMatcher> > m_mainRuleMatchers;
--- a/moses/Syntax/RuleTableFF.cpp
+++ b/moses/Syntax/RuleTableFF.cpp
@ -35,7 +35,8 @@ void RuleTableFF::Load()
      staticData.GetSearchAlgorithm() == SyntaxT2S) {
    F2S::HyperTree *trie = new F2S::HyperTree(this);
    F2S::HyperTreeLoader loader;
-    loader.Load(m_input, m_output, m_filePath, *this, *trie);
+    loader.Load(m_input, m_output, m_filePath, *this, *trie,
+                m_sourceTerminalSet);
    m_table = trie;
  } else if (staticData.GetSearchAlgorithm() == SyntaxS2T) {
    S2TParsingAlgorithm algorithm = staticData.GetS2TParsingAlgorithm();
--- a/moses/Syntax/RuleTableFF.h
+++ b/moses/Syntax/RuleTableFF.h
@ -43,10 +43,17 @@ public:
    return 0;
  }

+  // Get the source terminal vocabulary for this table's grammar (as a set of
+  // factor IDs)
+  const boost::unordered_set<std::size_t> &GetSourceTerminalSet() const {
+    return m_sourceTerminalSet;
+  }
+
 private:
  static std::vector<RuleTableFF*> s_instances;

  const RuleTable *m_table;
+  boost::unordered_set<std::size_t> m_sourceTerminalSet;
 };

 }  // Syntax
--- a/scripts/OSM/OSM-Train.perl
+++ b/scripts/OSM/OSM-Train.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;
 use Getopt::Long "GetOptions";
 use FindBin qw($RealBin);
--- a/scripts/OSM/extract-singletons.perl
+++ b/scripts/OSM/extract-singletons.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use Getopt::Std;
 getopts('q');

--- a/scripts/OSM/flipAlignment.perl
+++ b/scripts/OSM/flipAlignment.perl
@ -1,5 +1,7 @@
 #!/usr/bin/env perl 
-  use strict;
+
+use warnings;
+use strict;

  my $file = shift(@ARGV);
  open(MYFILE, $file);
--- a/scripts/Transliteration/clean.pl
+++ b/scripts/Transliteration/clean.pl
@ -1,6 +1,7 @@
 #!/usr/bin/env perl 

 #input hindi word urdu word, delete all those entries that have number on any side
+use warnings;
 use utf8;

 use Getopt::Std;
--- a/scripts/Transliteration/corpusCreator.pl
+++ b/scripts/Transliteration/corpusCreator.pl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;

 use utf8;
--- a/scripts/Transliteration/in-decoding-transliteration.pl
+++ b/scripts/Transliteration/in-decoding-transliteration.pl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;

 use utf8;
--- a/scripts/Transliteration/post-decoding-transliteration.pl
+++ b/scripts/Transliteration/post-decoding-transliteration.pl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;

 use utf8;
--- a/scripts/Transliteration/prepare-transliteration-phrase-table.pl
+++ b/scripts/Transliteration/prepare-transliteration-phrase-table.pl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;

 use utf8;
--- a/scripts/Transliteration/threshold.pl
+++ b/scripts/Transliteration/threshold.pl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use utf8;
 require Encode;
 use IO::Handle;
--- a/scripts/Transliteration/train-transliteration-module.pl
+++ b/scripts/Transliteration/train-transliteration-module.pl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use utf8;
 use strict;
 use Getopt::Long "GetOptions";
--- a/scripts/analysis/bootstrap-hypothesis-difference-significance.pl
+++ b/scripts/analysis/bootstrap-hypothesis-difference-significance.pl
@ -14,6 +14,7 @@ use utf8;
 # 23.01.2010: added NIST p-value and interval computation
 ###############################################

+use warnings;
 use strict;

 #constants
--- a/scripts/analysis/sentence-by-sentence.pl
+++ b/scripts/analysis/sentence-by-sentence.pl
@ -4,6 +4,7 @@
 #sentence-by-sentence: take in a system output, with any number of factors, and a reference translation, also maybe with factors, and show each sentence and its errors
 #usage: sentence-by-sentence SYSOUT [REFERENCE]+ > sentences.html

+use warnings;
 use strict;
 use Getopt::Long;

--- a/scripts/analysis/sg2dot.perl
+++ b/scripts/analysis/sg2dot.perl
@ -4,6 +4,7 @@
 # Script to convert MOSES searchgraph to DOT format
 #

+use warnings;
 use strict;
 use File::Path;
 use File::Basename;
--- a/scripts/analysis/show-phrases-used.pl
+++ b/scripts/analysis/show-phrases-used.pl
@ -5,7 +5,9 @@
 #usage: show-phrases-used DECODER_OUTFILE > output.html
 #  where DECODER_OUTFILE is the output of moses with the -T (show alignments) option

+use warnings;
 use strict;
+
 BEGIN
 {
    my $wd= `pawd 2>/dev/null`;
--- a/scripts/analysis/smtgui/filter-phrase-table.pl
+++ b/scripts/analysis/smtgui/filter-phrase-table.pl
@ -9,6 +9,7 @@
 #similar function to filter-model-given-input.pl, but only operates
 #on the phrase table and doesn't require that any subdirectories exist

+use warnings;
 use strict;

 my $MAX_LENGTH = 10;
--- a/scripts/ems/experiment.perl
+++ b/scripts/ems/experiment.perl
@ -3,6 +3,7 @@
 # Experiment Management System
 # Documentation at http://www.statmt.org/moses/?n=FactoredTraining.EMS

+use warnings;
 use strict;
 use Getopt::Long "GetOptions";
 use FindBin qw($RealBin);
--- a/scripts/ems/fix-info.perl
+++ b/scripts/ems/fix-info.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;

 my ($file,$step) = @ARGV;
--- a/scripts/ems/support/analysis.perl
+++ b/scripts/ems/support/analysis.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;
 use Getopt::Long "GetOptions";

--- a/scripts/ems/support/build-domain-file-from-subcorpora.perl
+++ b/scripts/ems/support/build-domain-file-from-subcorpora.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;

 # Create domain file from corpora
--- a/scripts/ems/support/build-sparse-features.perl
+++ b/scripts/ems/support/build-sparse-features.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;

 # Build necessary files for sparse lexical features
--- a/scripts/ems/support/consolidate-training-data.perl
+++ b/scripts/ems/support/consolidate-training-data.perl
@ -2,6 +2,7 @@

 # $Id: consolidate-training-data.perl 928 2009-09-02 02:58:01Z philipp $

+use warnings;
 use strict;

 my ($in,$out,$consolidated,@PART) = @ARGV;
--- a/scripts/ems/support/generic-multicore-parallelizer.perl
+++ b/scripts/ems/support/generic-multicore-parallelizer.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;

 my $cores = 8;
--- a/scripts/ems/support/generic-parallelizer.perl
+++ b/scripts/ems/support/generic-parallelizer.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;

 my $jobs = 20;
--- a/scripts/ems/support/input-from-sgm.perl
+++ b/scripts/ems/support/input-from-sgm.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;

 die("ERROR syntax: input-from-sgm.perl < in.sgm > in.txt") 
--- a/scripts/ems/support/interpolate-lm.perl
+++ b/scripts/ems/support/interpolate-lm.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;
 use IPC::Open3;
 use File::Temp qw/tempdir/;
--- a/scripts/ems/support/lmplz-wrapper.perl
+++ b/scripts/ems/support/lmplz-wrapper.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;
 use Getopt::Long "GetOptions";

--- a/scripts/ems/support/mml-filter.perl
+++ b/scripts/ems/support/mml-filter.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;
 use FindBin qw($RealBin);

--- a/scripts/ems/support/mml-score.perl
+++ b/scripts/ems/support/mml-score.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;

 #
--- a/scripts/ems/support/mml-train.perl
+++ b/scripts/ems/support/mml-train.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;

 my ($indomain_source,,$indomain_target,$outdomain_source,$outdomain_target,$lm_training,$lm_binarizer,$order,$lm_settings,$line_count,$model);
--- a/scripts/ems/support/prepare-fast-align.perl
+++ b/scripts/ems/support/prepare-fast-align.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;

 my ($source_file,$target_file,$alignment_factors) = @ARGV;
--- a/scripts/ems/support/reference-from-sgm.perl
+++ b/scripts/ems/support/reference-from-sgm.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;

 die("ERROR syntax: reference-from-sgm.perl ref src out") 
--- a/scripts/ems/support/remove-segmentation-markup.perl
+++ b/scripts/ems/support/remove-segmentation-markup.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;

 $|++;
--- a/scripts/ems/support/report-experiment-scores.perl
+++ b/scripts/ems/support/report-experiment-scores.perl
@ -2,6 +2,7 @@

 # $Id: report-experiment-scores.perl 407 2008-11-10 14:43:31Z philipp $

+use warnings;
 use strict;

 my $email;
--- a/scripts/ems/support/run-command-on-multiple-refsets.perl
+++ b/scripts/ems/support/run-command-on-multiple-refsets.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;

 die("ERROR: syntax: run-command-on-multiple-refsets.perl cmd in out") 
--- a/scripts/ems/support/run-wade.perl
+++ b/scripts/ems/support/run-wade.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;
 use File::Temp qw/ tempfile tempdir /;

--- a/scripts/ems/support/split-sentences.perl
+++ b/scripts/ems/support/split-sentences.perl
@ -6,6 +6,7 @@ binmode(STDIN, ":utf8");
 binmode(STDOUT, ":utf8");
 binmode(STDERR, ":utf8");

+use warnings;
 use FindBin qw($RealBin);
 use strict;

--- a/scripts/ems/support/submit-grid.perl
+++ b/scripts/ems/support/submit-grid.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl

+use warnings;
 use strict;
 use Cwd;
 use FindBin qw($RealBin);
--- a/scripts/ems/support/substitute-filtered-tables-and-weights.perl
+++ b/scripts/ems/support/substitute-filtered-tables-and-weights.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;
 use Getopt::Long "GetOptions";
 use FindBin qw($RealBin);
--- a/scripts/ems/support/substitute-filtered-tables.perl
+++ b/scripts/ems/support/substitute-filtered-tables.perl
@ -1,5 +1,7 @@
 #!/usr/bin/env perl 

+use warnings;
+
 # experiment.perl support script
 # get filtered rule and reordering tables and place them into a configuration file

--- a/scripts/ems/support/substitute-weights.perl
+++ b/scripts/ems/support/substitute-weights.perl
@ -1,5 +1,7 @@
 #!/usr/bin/env perl 

+use warnings;
+
 # experiment.perl support script
 # get filtered rule and reordering tables and place them into a configuration file

--- a/scripts/ems/support/symmetrize-fast-align.perl
+++ b/scripts/ems/support/symmetrize-fast-align.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;

 die("ERROR: syntax is fastalign2bal.perl direct-alignment inverse-alignment source-file target-file out-stem symmetrization-method symal\n") unless scalar(@ARGV) == 7;
--- a/scripts/ems/support/thot-lm-wrapper.perl
+++ b/scripts/ems/support/thot-lm-wrapper.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;
 use Getopt::Long "GetOptions";

--- a/scripts/ems/support/wrap-xml.perl
+++ b/scripts/ems/support/wrap-xml.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;

 my ($language,$src,$system) = @ARGV;
--- a/scripts/ems/web/progress.perl
+++ b/scripts/ems/web/progress.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;
 use Date::Parse;

--- a/scripts/fuzzy-match/create_xml.perl
+++ b/scripts/fuzzy-match/create_xml.perl
@ -3,6 +3,7 @@
 binmode( STDIN,  ":utf8" );
 binmode( STDOUT, ":utf8" );

+use warnings;
 use strict;
 use FindBin qw($RealBin);
 use File::Basename;
--- a/scripts/generic/compound-splitter.perl
+++ b/scripts/generic/compound-splitter.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;
 use Getopt::Long "GetOptions";

--- a/scripts/generic/extract-factors.pl
+++ b/scripts/generic/extract-factors.pl
@ -6,6 +6,7 @@
 #factor indices start at 0
 #factor indices too large ought to be ignored

+use warnings;
 use strict;

 my ($filename, @factors) = @ARGV;
--- a/scripts/generic/extract-parallel.perl
+++ b/scripts/generic/extract-parallel.perl
@ -3,6 +3,7 @@
 # example
 #  ./extract-parallel.perl 8 ./coreutils-8.9/src/split "./coreutils-8.9/src/sort --batch-size=253" ./extract ./corpus.5.en ./corpus.5.ar ./align.ar-en.grow-diag-final-and ./extracted 7 --NoFileLimit orientation --GZOutput

+use warnings;
 use strict;
 use File::Basename;

--- a/scripts/generic/fsa2fsal.pl
+++ b/scripts/generic/fsa2fsal.pl
@ -5,6 +5,7 @@
 # Some rudimentary sanity checks are done on the fly.
 # Ondrej Bojar, bojar@ufal.mff.cuni.cz

+use warnings;
 use strict;

 my $errs = 0;
--- a/scripts/generic/fsa2plf.pl
+++ b/scripts/generic/fsa2plf.pl
@ -8,6 +8,7 @@
 # Note that the output format may not contain any spaces.
 # Ondrej Bojar, bojar@ufal.mff.cuni.cz

+use warnings;
 use strict;
 use Getopt::Long;

--- a/scripts/generic/fsal2fsa.pl
+++ b/scripts/generic/fsal2fsa.pl
@ -2,6 +2,7 @@
 # A very simple script that converts fsal back to fsa format (openfst lattices)
 # Ondrej Bojar, bojar@ufal.mff.cuni.cz

+use warnings;
 use strict;

 while (<>) {
--- a/scripts/generic/generic-parallel.perl
+++ b/scripts/generic/generic-parallel.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;
 use utf8;

--- a/scripts/generic/giza-parallel.perl
+++ b/scripts/generic/giza-parallel.perl
@ -3,6 +3,7 @@
 # example
 # ~/giza-parallel.perl 10 split ~/workspace/sourceforge/trunk/scripts/training/train-model.perl ar en train align

+use warnings;
 use strict;
 use File::Basename;

--- a/scripts/generic/lopar2pos.pl
+++ b/scripts/generic/lopar2pos.pl
@ -4,6 +4,8 @@
 #lopar2pos: extract POSs from LOPAR output
 #usage: lopar2pos.pl CORPUS.lopar > CORPUS.pos

+use warnings;
+
 my $infilename = shift @ARGV;
 open(INFILE, "<$infilename") or die "couldn't open '$infilename' for read: $!\n";
 while(my $line = <INFILE>)
--- a/scripts/generic/moses-parallel.pl
+++ b/scripts/generic/moses-parallel.pl
@ -15,6 +15,7 @@
 #             added checks for existence of decoder and configuration file
 # 26 Jul 2006 fix a bug related to the use of absolute path for srcfile and nbestfile

+use warnings;
 use strict;

 #######################
--- a/scripts/generic/mteval-v12.pl
+++ b/scripts/generic/mteval-v12.pl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 
 
+use warnings;
 use strict;
 use utf8;
 use Encode;
--- a/scripts/generic/multi-bleu.perl
+++ b/scripts/generic/multi-bleu.perl
@ -1,6 +1,7 @@
 #!/usr/bin/env perl 

 # $Id$
+use warnings;
 use strict;

 my $lowercase = 0;
--- a/scripts/generic/ph_numbers.perl
+++ b/scripts/generic/ph_numbers.perl
@ -7,6 +7,7 @@ package ph_numbers;
 #
 # (c) 2013 TAUS

+use warnings;
 use strict;

 run() unless caller();
--- a/scripts/generic/qsub-wrapper.pl
+++ b/scripts/generic/qsub-wrapper.pl
@ -1,6 +1,7 @@
 #!/usr/bin/env perl 

 # $Id$
+use warnings;
 use strict;

 #######################
--- a/scripts/generic/reverse-alignment.perl
+++ b/scripts/generic/reverse-alignment.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;

 my $line;
--- a/scripts/generic/score-parallel.perl
+++ b/scripts/generic/score-parallel.perl
@ -4,6 +4,7 @@
 # ./score-parallel.perl 8 "gsort --batch-size=253" ./score ./extract.2.sorted.gz ./lex.2.f2e ./phrase-table.2.half.f2e  --GoodTuring ./phrase-table.2.coc 0
 # ./score-parallel.perl 8 "gsort --batch-size=253" ./score ./extract.2.inv.sorted.gz ./lex.2.e2f ./phrase-table.2.half.e2f  --Inverse 1

+use warnings;
 use strict;
 use File::Basename;

--- a/scripts/generic/strip-xml.perl
+++ b/scripts/generic/strip-xml.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;

 while (my $line = <STDIN>) {
--- a/scripts/generic/trainlm-irst2.perl
+++ b/scripts/generic/trainlm-irst2.perl
@ -10,6 +10,7 @@
 #    irst-dir = /Users/hieu/workspace/irstlm/trunk/bin
 # Set smoothing method in settings, if different from modified Kneser-Ney 

+use warnings;
 use strict;
 use FindBin qw($RealBin);
 use Getopt::Long;
--- a/scripts/generic/trainlm-lmplz.perl
+++ b/scripts/generic/trainlm-lmplz.perl
@ -9,6 +9,7 @@
 # It should point to the binary file
 #    lmplz = /home/waziz/workspace/github/moses/bin/lmplz

+use warnings;
 use strict;
 use FindBin qw($RealBin);
 use Getopt::Long qw/GetOptionsFromArray/;
--- a/scripts/other/beautify.perl
+++ b/scripts/other/beautify.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;
 use File::Basename;
 use FindBin qw($RealBin);
--- a/scripts/other/delete-scores.perl
+++ b/scripts/other/delete-scores.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;
 use Getopt::Long "GetOptions";

--- a/scripts/other/get_many_translations_from_google.perl
+++ b/scripts/other/get_many_translations_from_google.perl
@ -6,6 +6,7 @@
 #
 # Ondrej Bojar, bojar@ufal.mff.cuni.cz

+use warnings;
 use strict;
 use Getopt::Long;
 use CGI;
--- a/scripts/recaser/detruecase.perl
+++ b/scripts/recaser/detruecase.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;
 use Getopt::Long "GetOptions";

--- a/scripts/recaser/recase.perl
+++ b/scripts/recaser/recase.perl
@ -1,6 +1,7 @@
 #!/usr/bin/env perl 

 # $Id$
+use warnings;
 use strict;
 use Getopt::Long "GetOptions";

--- a/scripts/recaser/train-recaser.perl
+++ b/scripts/recaser/train-recaser.perl
@ -1,6 +1,7 @@
 #!/usr/bin/env perl 

 # $Id$
+use warnings;
 use strict;
 use FindBin qw($Bin);
 use Getopt::Long "GetOptions";
--- a/scripts/recaser/train-truecaser.perl
+++ b/scripts/recaser/train-truecaser.perl
@ -8,6 +8,7 @@
 # --possiblyUseFirstToken : boolean option; the default behaviour (when this option is not provided) is that the first token of a sentence is ignored, on the basis that the first word of a sentence is always capitalized; if this option is provided then: a) if a sentence-initial token is *not* capitalized, then it is counted, and b) if a capitalized sentence-initial token is the only token of the segment, then it is counted, but with only 10% of the weight of a normal token.
 #

+use warnings;
 use strict;
 use Getopt::Long "GetOptions";

--- a/scripts/recaser/truecase.perl
+++ b/scripts/recaser/truecase.perl
@ -1,6 +1,8 @@
 #!/usr/bin/env perl 

 # $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $
+
+use warnings;
 use strict;
 use Getopt::Long "GetOptions";

--- a/scripts/regression-testing/compare-results.pl
+++ b/scripts/regression-testing/compare-results.pl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;
 my ($results, $truth) = @ARGV;

--- a/scripts/regression-testing/create_localized_moses_ini.pl
+++ b/scripts/regression-testing/create_localized_moses_ini.pl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;
 my $script_dir; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; }
 use MosesScriptsRegressionTesting;
--- a/scripts/regression-testing/modify-pars.pl
+++ b/scripts/regression-testing/modify-pars.pl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;
 	
 my $argv=join(" ",@ARGV);
--- a/scripts/regression-testing/moses-virtual.pl
+++ b/scripts/regression-testing/moses-virtual.pl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;

 my %opt = ();
--- a/scripts/regression-testing/run-single-test.pl
+++ b/scripts/regression-testing/run-single-test.pl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;
 my $script_dir; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; }
 use MosesScriptsRegressionTesting;
--- a/scripts/regression-testing/run-test-suite.pl
+++ b/scripts/regression-testing/run-test-suite.pl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;
 my $script_dir; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; }
 use Getopt::Long;
--- a/scripts/tokenizer/deescape-special-chars-PTB.perl
+++ b/scripts/tokenizer/deescape-special-chars-PTB.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;

 while(<STDIN>) {
--- a/scripts/tokenizer/deescape-special-chars.perl
+++ b/scripts/tokenizer/deescape-special-chars.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;

 while(<STDIN>) {
--- a/scripts/tokenizer/detokenizer.perl
+++ b/scripts/tokenizer/detokenizer.perl
@ -7,6 +7,8 @@

 binmode(STDIN, ":utf8");
 binmode(STDOUT, ":utf8");
+
+use warnings;
 use strict;
 use utf8; # tell perl this script file is in UTF-8 (see all funny punct below)

@ -36,7 +38,7 @@ if ($HELP) {
 	exit;
 }

-if ($language !~ /^(cs|en|fr|it)$/) {
+if ($language !~ /^(cs|en|fr|it|fi)$/) {
  print STDERR "Warning: No built-in rules for language $language.\n"
 }

@ -176,6 +178,11 @@ sub detokenize {

 			}
 			
+        } elsif (($language eq "fi") && ($words[$i-1] =~ /:$/) && ($words[$i] =~ /^(N|n|A|a|Ä|ä|ssa|Ssa|ssä|Ssä|sta|stä|Sta|Stä|hun|Hun|hyn|Hyn|han|Han|hän|Hän|hön|Hön|un|Un|yn|Yn|an|An|än|Än|ön|Ön|seen|Seen|lla|Lla|llä|Llä|lta|Lta|ltä|Ltä|lle|Lle|ksi|Ksi|kse|Kse|tta|Tta|ine|Ine)(ni|si|mme|nne|nsa)?(ko|kö|han|hän|pa|pä|kaan|kään|kin)?$/)) {
+            # Finnish: no space between ":" and a following case suffix; the suffix is lower-cased
+            # e.g. EU:N -> EU:n, EU:ssa, EU:sta, EU:hun (suffixes not in the list above, such as "iin", are not handled)
+            $text=$text. lc $words[$i];
+            $prependSpace = " ";
 		} else {
 			$text=$text.$prependSpace.$words[$i];
 			$prependSpace = " ";
--- a/scripts/tokenizer/escape-special-chars.perl
+++ b/scripts/tokenizer/escape-special-chars.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;

 while(<STDIN>) {
--- a/scripts/tokenizer/lowercase.perl
+++ b/scripts/tokenizer/lowercase.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;

 binmode(STDIN, ":utf8");
--- a/scripts/tokenizer/normalize-punctuation.perl
+++ b/scripts/tokenizer/normalize-punctuation.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;

 my $language = "en";
--- a/scripts/tokenizer/pre-tokenizer.perl
+++ b/scripts/tokenizer/pre-tokenizer.perl
@ -4,6 +4,7 @@
 # Start by Ulrich Germann, after noticing systematic preprocessing errors
 # in some of the English Europarl data.

+use warnings;
 use strict;
 use Getopt::Std;

--- a/scripts/tokenizer/remove-non-printing-char.perl
+++ b/scripts/tokenizer/remove-non-printing-char.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use utf8; 

 binmode(STDIN, ":utf8");
--- a/scripts/tokenizer/replace-unicode-punctuation.perl
+++ b/scripts/tokenizer/replace-unicode-punctuation.perl
@ -1,5 +1,6 @@
 #!/usr/bin/env perl 

+use warnings;
 use strict;

 #binmode(STDIN, ":utf8");
--- a/scripts/tokenizer/tokenizer.perl
+++ b/scripts/tokenizer/tokenizer.perl
@ -16,6 +16,7 @@ use warnings;
 binmode(STDIN, ":utf8");
 binmode(STDOUT, ":utf8");

+use warnings;
 use FindBin qw($RealBin);
 use strict;
 use Time::HiRes;
--- a/scripts/tokenizer/tokenizer_PTB.perl
+++ b/scripts/tokenizer/tokenizer_PTB.perl
@ -14,6 +14,7 @@
 binmode(STDIN, ":utf8");
 binmode(STDOUT, ":utf8");

+use warnings;
 use FindBin qw($RealBin);
 use strict;
 use Time::HiRes;
--- a/scripts/training/absolutize_moses_model.pl
+++ b/scripts/training/absolutize_moses_model.pl
@ -6,6 +6,8 @@
 #
 # Ondrej Bojar.

+use warnings;
+
 my $ini = shift;
 die "usage: absolutize_moses_model.pl path-to-moses.ini > moses.abs.ini"
  if !defined $ini;
--- a/scripts/training/binarize-model.perl
+++ b/scripts/training/binarize-model.perl
@ -4,6 +4,7 @@
 # Binarize a Moses model
 #

+use warnings;
 use strict;

 use Getopt::Long "GetOptions";
--- a/Show More
+++ b/Show More