diff --git a/phrase-extract/SentenceAlignmentWithSyntax.h b/phrase-extract/SentenceAlignmentWithSyntax.h index 8b9088770..604b6d0e2 100644 --- a/phrase-extract/SentenceAlignmentWithSyntax.h +++ b/phrase-extract/SentenceAlignmentWithSyntax.h @@ -28,7 +28,7 @@ #include "RuleExtractionOptions.h" #include "SentenceAlignment.h" -#include "SyntaxTree.h" +#include "SyntaxNodeCollection.h" namespace MosesTraining { @@ -36,8 +36,8 @@ namespace MosesTraining class SentenceAlignmentWithSyntax : public SentenceAlignment { public: - SyntaxTree targetTree; - SyntaxTree sourceTree; + SyntaxNodeCollection targetTree; + SyntaxNodeCollection sourceTree; std::set & m_targetLabelCollection; std::set & m_sourceLabelCollection; std::map & m_targetTopLabelCollection; diff --git a/phrase-extract/SyntaxNode.h b/phrase-extract/SyntaxNode.h new file mode 100644 index 000000000..46e0f456f --- /dev/null +++ b/phrase-extract/SyntaxNode.h @@ -0,0 +1,75 @@ +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2009 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#pragma once + +#include +#include +#include +#include + +namespace MosesTraining +{ + +class SyntaxNode +{ +protected: + int m_start, m_end; + std::string m_label; + std::vector< SyntaxNode* > m_children; + SyntaxNode* m_parent; + float m_pcfgScore; +public: + SyntaxNode( int startPos, int endPos, std::string label ) + :m_start(startPos) + ,m_end(endPos) + ,m_label(label) + ,m_parent(0) + ,m_pcfgScore(0.0f) { + } + int GetStart() const { + return m_start; + } + int GetEnd() const { + return m_end; + } + std::string GetLabel() const { + return m_label; + } + float GetPcfgScore() const { + return m_pcfgScore; + } + void SetPcfgScore(float score) { + m_pcfgScore = score; + } + SyntaxNode *GetParent() { + return m_parent; + } + void SetParent(SyntaxNode *parent) { + m_parent = parent; + } + void AddChild(SyntaxNode* child) { + m_children.push_back(child); + } + const std::vector< SyntaxNode* > &GetChildren() const { + return m_children; + } +}; + +} // namespace MosesTraining diff --git a/phrase-extract/SyntaxTree.cpp b/phrase-extract/SyntaxNodeCollection.cpp similarity index 80% rename from phrase-extract/SyntaxTree.cpp rename to phrase-extract/SyntaxNodeCollection.cpp index c50693e0d..099a5697f 100644 --- a/phrase-extract/SyntaxTree.cpp +++ b/phrase-extract/SyntaxNodeCollection.cpp @@ -1,6 +1,3 @@ -// $Id: SyntaxTree.cpp 1960 2008-12-15 12:52:38Z phkoehn $ -// vim:tabstop=2 - /*********************************************************************** Moses - factored phrase-based language decoder Copyright (C) 2009 University of Edinburgh @@ -21,7 +18,7 @@ ***********************************************************************/ -#include "SyntaxTree.h" +#include "SyntaxNodeCollection.h" #include #include @@ -29,12 +26,12 @@ namespace MosesTraining { -SyntaxTree::~SyntaxTree() +SyntaxNodeCollection::~SyntaxNodeCollection() { Clear(); } -void SyntaxTree::Clear() +void SyntaxNodeCollection::Clear() { m_top = 0; // loop through all m_nodes, delete them @@ -45,7 +42,8 @@ void SyntaxTree::Clear() m_index.clear(); } -SyntaxNode *SyntaxTree::AddNode( int startPos, int endPos, std::string label ) +SyntaxNode *SyntaxNodeCollection::AddNode(int startPos, int endPos, + const std::string &label) { SyntaxNode* newNode = new SyntaxNode( startPos, endPos, label ); m_nodes.push_back( newNode ); @@ -54,7 +52,7 @@ SyntaxNode *SyntaxTree::AddNode( int startPos, int endPos, std::string label ) return newNode; } -ParentNodes SyntaxTree::Parse() +ParentNodes SyntaxNodeCollection::Parse() { ParentNodes parents; @@ -94,12 +92,12 @@ ParentNodes SyntaxTree::Parse() return parents; } -bool SyntaxTree::HasNode( int startPos, int endPos ) const +bool SyntaxNodeCollection::HasNode( int startPos, int endPos ) const { return GetNodes( startPos, endPos).size() > 0; } -const std::vector< SyntaxNode* >& SyntaxTree::GetNodes( int startPos, int endPos ) const +const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodes( int startPos, int endPos ) const { SyntaxTreeIndexIterator startIndex = m_index.find( startPos ); if (startIndex == m_index.end() ) @@ -112,15 +110,7 @@ const std::vector< SyntaxNode* >& SyntaxTree::GetNodes( int startPos, int endPos return endIndex->second; } -// for printing out tree -std::string SyntaxTree::ToString() const -{ - std::stringstream out; - out << *this; - return out.str(); -} - -void SyntaxTree::ConnectNodes() +void SyntaxNodeCollection::ConnectNodes() { typedef SyntaxTreeIndex2::const_reverse_iterator InnerIterator; @@ -162,27 +152,4 @@ void SyntaxTree::ConnectNodes() } } -std::ostream& operator<<(std::ostream& os, const SyntaxTree& t) -{ - size_t size = t.m_index.size(); - for(size_t length=1; length<=size; length++) { - for(size_t space=0; spaceGetLabel() + "#######"; - - os << label.substr(0,7) << " "; - } else { - os << "------- "; - } - } - os << std::endl; - } - return os; -} - -} - +} // namespace MosesTraining diff --git a/phrase-extract/SyntaxTree.h b/phrase-extract/SyntaxNodeCollection.h similarity index 62% rename from phrase-extract/SyntaxTree.h rename to phrase-extract/SyntaxNodeCollection.h index 6ffb5da34..70b14206d 100644 --- a/phrase-extract/SyntaxTree.h +++ b/phrase-extract/SyntaxNodeCollection.h @@ -1,6 +1,3 @@ -// $Id: SyntaxTree.h 1960 2008-12-15 12:52:38Z phkoehn $ -// vim:tabstop=2 - /*********************************************************************** Moses - factored phrase-based language decoder Copyright (C) 2009 University of Edinburgh @@ -20,66 +17,22 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ***********************************************************************/ - #pragma once -#include -#include + #include #include +#include +#include + +#include "SyntaxNode.h" namespace MosesTraining { -class SyntaxNode -{ -protected: - int m_start, m_end; - std::string m_label; - std::vector< SyntaxNode* > m_children; - SyntaxNode* m_parent; - float m_pcfgScore; -public: - SyntaxNode( int startPos, int endPos, std::string label ) - :m_start(startPos) - ,m_end(endPos) - ,m_label(label) - ,m_parent(0) - ,m_pcfgScore(0.0f) { - } - int GetStart() const { - return m_start; - } - int GetEnd() const { - return m_end; - } - std::string GetLabel() const { - return m_label; - } - float GetPcfgScore() const { - return m_pcfgScore; - } - void SetPcfgScore(float score) { - m_pcfgScore = score; - } - SyntaxNode *GetParent() { - return m_parent; - } - void SetParent(SyntaxNode *parent) { - m_parent = parent; - } - void AddChild(SyntaxNode* child) { - m_children.push_back(child); - } - const std::vector< SyntaxNode* > &GetChildren() const { - return m_children; - } -}; - - typedef std::vector< int > SplitPoints; typedef std::vector< SplitPoints > ParentNodes; -class SyntaxTree +class SyntaxNodeCollection { protected: std::vector< SyntaxNode* > m_nodes; @@ -93,16 +46,14 @@ protected: int m_size; std::vector< SyntaxNode* > m_emptyNode; - friend std::ostream& operator<<(std::ostream&, const SyntaxTree&); - public: - SyntaxTree() + SyntaxNodeCollection() : m_top(0) // m_top doesn't get set unless ConnectNodes is called. , m_size(0) {} - ~SyntaxTree(); + ~SyntaxNodeCollection(); - SyntaxNode *AddNode( int startPos, int endPos, std::string label ); + SyntaxNode *AddNode( int startPos, int endPos, const std::string &label ); SyntaxNode *GetTop() { return m_top; @@ -119,10 +70,6 @@ public: } void ConnectNodes(); void Clear(); - std::string ToString() const; }; -std::ostream& operator<<(std::ostream&, const SyntaxTree&); - -} - +} // namespace MosesTraining diff --git a/phrase-extract/XmlTree.cpp b/phrase-extract/XmlTree.cpp index 6efa1bf5c..0f068fca7 100644 --- a/phrase-extract/XmlTree.cpp +++ b/phrase-extract/XmlTree.cpp @@ -1,6 +1,3 @@ -// $Id: XmlOption.cpp 1960 2008-12-15 12:52:38Z phkoehn $ -// vim:tabstop=2 - /*********************************************************************** Moses - factored phrase-based language decoder Copyright (C) 2006 University of Edinburgh @@ -27,7 +24,8 @@ #include #include #include -#include "SyntaxTree.h" + +#include "SyntaxNodeCollection.h" #include "XmlException.h" using namespace std; @@ -228,7 +226,10 @@ vector TokenizeXml(const string& str) parse because we don't have the completed source parsed until after this function removes all the markup from it (CreateFromString in Sentence::Read). */ -bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &labelCollection, map< string, int > &topLabelCollection, bool unescapeSpecialChars ) +bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection, + set< string > &labelCollection, + map< string, int > &topLabelCollection, + bool unescapeSpecialChars ) { //parse XML markup in translation line @@ -374,7 +375,7 @@ bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &label cerr << "XML TAG LABEL IS: '" << label << "'" << endl; cerr << "XML SPAN IS: " << startPos << "-" << (endPos-1) << endl; } - SyntaxNode *node = tree.AddNode( startPos, endPos-1, label ); + SyntaxNode *node = nodeCollection.AddNode( startPos, endPos-1, label ); node->SetPcfgScore(pcfgScore); } } @@ -386,7 +387,7 @@ bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &label } // collect top labels - const vector< SyntaxNode* >& topNodes = tree.GetNodes( 0, wordPos-1 ); + const vector< SyntaxNode* >& topNodes = nodeCollection.GetNodes( 0, wordPos-1 ); for( vector< SyntaxNode* >::const_iterator node = topNodes.begin(); node != topNodes.end(); node++ ) { SyntaxNode *n = *node; const string &label = n->GetLabel(); diff --git a/phrase-extract/XmlTree.h b/phrase-extract/XmlTree.h index 50b1c0acc..3b5afd4dd 100644 --- a/phrase-extract/XmlTree.h +++ b/phrase-extract/XmlTree.h @@ -1,6 +1,3 @@ -// $Id: XmlOption.cpp 1960 2008-12-15 12:52:38Z phkoehn $ -// vim:tabstop=2 - /*********************************************************************** Moses - factored phrase-based language decoder Copyright (C) 2006 University of Edinburgh @@ -21,11 +18,13 @@ ***********************************************************************/ #pragma once + #include #include #include #include -#include "SyntaxTree.h" + +#include "SyntaxNodeCollection.h" namespace MosesTraining { @@ -35,9 +34,8 @@ std::string Trim(const std::string& str, const std::string dropChars = " \t\n\r" std::string TrimXml(const std::string& str); bool isXmlTag(const std::string& tag); std::vector TokenizeXml(const std::string& str); -bool ProcessAndStripXMLTags(std::string &line, SyntaxTree &tree, std::set< std::string > &labelCollection, std::map< std::string, int > &topLabelCollection, bool unescape = true); +bool ProcessAndStripXMLTags(std::string &line, SyntaxNodeCollection &tree, std::set< std::string > &labelCollection, std::map< std::string, int > &topLabelCollection, bool unescape = true); std::string unescape(const std::string &str); -} // namespace - +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp index bc687ec6b..937d88030 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp +++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp @@ -33,7 +33,8 @@ #include "Span.h" #include "StsgRule.h" #include "StsgRuleWriter.h" -#include "SyntaxTree.h" +#include "SyntaxNode.h" +#include "SyntaxNodeCollection.h" #include "tables-core.h" #include "XmlException.h" #include "XmlTree.h" @@ -172,7 +173,7 @@ int ExtractGHKM::Main(int argc, char *argv[]) // Parse source tree and construct a SyntaxTree object. - MosesTraining::SyntaxTree sourceSyntaxTree; + MosesTraining::SyntaxNodeCollection sourceSyntaxTree; MosesTraining::SyntaxNode *sourceSyntaxTreeRoot=NULL; if (options.sourceLabels) { @@ -196,7 +197,7 @@ int ExtractGHKM::Main(int argc, char *argv[]) // Read source tokens. std::vector sourceTokens(ReadTokens(sourceLine)); - // Construct a source ParseTree object from the SyntaxTree object. + // Construct a source ParseTree object from the SyntaxNodeCollection object. std::auto_ptr sourceParseTree; if (options.sourceLabels) { diff --git a/phrase-extract/extract-ghkm/ScfgRule.cpp b/phrase-extract/extract-ghkm/ScfgRule.cpp index 01178b72c..918c88eeb 100644 --- a/phrase-extract/extract-ghkm/ScfgRule.cpp +++ b/phrase-extract/extract-ghkm/ScfgRule.cpp @@ -19,11 +19,12 @@ #include "ScfgRule.h" +#include + #include "Node.h" #include "Subgraph.h" -#include "SyntaxTree.h" - -#include +#include "SyntaxNode.h" +#include "SyntaxNodeCollection.h" namespace Moses { @@ -31,7 +32,7 @@ namespace GHKM { ScfgRule::ScfgRule(const Subgraph &fragment, - const MosesTraining::SyntaxTree *sourceSyntaxTree) + const MosesTraining::SyntaxNodeCollection *sourceSyntaxTree) : m_graphFragment(fragment) , m_sourceLHS("X", NonTerminal) , m_targetLHS(fragment.GetRoot()->GetLabel(), NonTerminal) @@ -133,9 +134,9 @@ ScfgRule::ScfgRule(const Subgraph &fragment, } } -void ScfgRule::PushSourceLabel(const MosesTraining::SyntaxTree *sourceSyntaxTree, - const Node *node, - const std::string &nonMatchingLabel) +void ScfgRule::PushSourceLabel( + const MosesTraining::SyntaxNodeCollection *sourceSyntaxTree, + const Node *node, const std::string &nonMatchingLabel) { ContiguousSpan span = Closure(node->GetSpan()); if (sourceSyntaxTree->HasNode(span.first,span.second)) { // does a source constituent match the span? diff --git a/phrase-extract/extract-ghkm/ScfgRule.h b/phrase-extract/extract-ghkm/ScfgRule.h index 94ee7b82e..c8b76114a 100644 --- a/phrase-extract/extract-ghkm/ScfgRule.h +++ b/phrase-extract/extract-ghkm/ScfgRule.h @@ -19,16 +19,16 @@ #pragma once -#include "Alignment.h" -#include "Rule.h" -#include "SyntaxTree.h" - #include #include #include #include #include +#include "Alignment.h" +#include "Rule.h" +#include "SyntaxNodeCollection.h" + namespace Moses { namespace GHKM @@ -41,7 +41,7 @@ class ScfgRule : public Rule { public: ScfgRule(const Subgraph &fragment, - const MosesTraining::SyntaxTree *sourceSyntaxTree = 0); + const MosesTraining::SyntaxNodeCollection *sourceSyntaxTree = 0); const Subgraph &GetGraphFragment() const { return m_graphFragment; @@ -78,9 +78,9 @@ public: } private: - void PushSourceLabel(const MosesTraining::SyntaxTree *sourceSyntaxTree, - const Node *node, - const std::string &nonMatchingLabel); + void PushSourceLabel( + const MosesTraining::SyntaxNodeCollection *sourceSyntaxTree, + const Node *node, const std::string &nonMatchingLabel); const Subgraph& m_graphFragment; Symbol m_sourceLHS; @@ -95,4 +95,3 @@ private: } // namespace GHKM } // namespace Moses - diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.h b/phrase-extract/extract-ghkm/XmlTreeParser.h index ff0baeace..db9fa8bf2 100644 --- a/phrase-extract/extract-ghkm/XmlTreeParser.h +++ b/phrase-extract/extract-ghkm/XmlTreeParser.h @@ -23,14 +23,15 @@ #include "Exception.h" -#include "SyntaxTree.h" - #include #include #include #include #include +#include "SyntaxNode.h" +#include "SyntaxNodeCollection.h" + namespace Moses { namespace GHKM @@ -58,7 +59,7 @@ private: std::set &m_labelSet; std::map &m_topLabelSet; std::string m_line; - MosesTraining::SyntaxTree m_tree; + MosesTraining::SyntaxNodeCollection m_tree; std::vector m_words; }; diff --git a/phrase-extract/extract-rules-main.cpp b/phrase-extract/extract-rules-main.cpp index 50baa4e0d..825f12d89 100644 --- a/phrase-extract/extract-rules-main.cpp +++ b/phrase-extract/extract-rules-main.cpp @@ -41,7 +41,7 @@ #include "HoleCollection.h" #include "RuleExist.h" #include "SentenceAlignmentWithSyntax.h" -#include "SyntaxTree.h" +#include "SyntaxNode.h" #include "tables-core.h" #include "XmlTree.h" #include "InputFileStream.h" diff --git a/phrase-extract/pcfg-common/xml_tree_parser.h b/phrase-extract/pcfg-common/xml_tree_parser.h index 675a112d8..8605c0691 100644 --- a/phrase-extract/pcfg-common/xml_tree_parser.h +++ b/phrase-extract/pcfg-common/xml_tree_parser.h @@ -28,7 +28,8 @@ #include #include "pcfg_tree.h" -#include "SyntaxTree.h" +#include "SyntaxNode.h" +#include "SyntaxNodeCollection.h" namespace MosesTraining { namespace Syntax { @@ -47,7 +48,7 @@ class XmlTreeParser { std::set m_labelSet; std::map m_topLabelSet; std::string m_line; - MosesTraining::SyntaxTree m_tree; + MosesTraining::SyntaxNodeCollection m_tree; std::vector m_words; }; diff --git a/phrase-extract/relax-parse-main.cpp b/phrase-extract/relax-parse-main.cpp index 5c9daa7ae..5bca886bf 100644 --- a/phrase-extract/relax-parse-main.cpp +++ b/phrase-extract/relax-parse-main.cpp @@ -43,7 +43,7 @@ int main(int argc, char* argv[]) // process into syntax tree representation set< string > labelCollection; // set of labels, not used map< string, int > topLabelCollection; // count of top labels, not used - SyntaxTree tree; + SyntaxNodeCollection tree; ProcessAndStripXMLTags( inBufferString, tree, labelCollection, topLabelCollection, false ); const vector< string > inWords = util::tokenize( inBufferString ); @@ -105,7 +105,7 @@ void init(int argc, char* argv[]) } } -void store( SyntaxTree &tree, const vector< string > &words ) +void store( SyntaxNodeCollection &tree, const vector< string > &words ) { // output words for( size_t i=0; i &words ) cout << endl; } -void LeftBinarize( SyntaxTree &tree, ParentNodes &parents ) +void LeftBinarize( SyntaxNodeCollection &tree, ParentNodes &parents ) { for(ParentNodes::const_iterator p = parents.begin(); p != parents.end(); p++) { const SplitPoints &point = *p; @@ -143,7 +143,7 @@ void LeftBinarize( SyntaxTree &tree, ParentNodes &parents ) } } -void RightBinarize( SyntaxTree &tree, ParentNodes &parents ) +void RightBinarize( SyntaxNodeCollection &tree, ParentNodes &parents ) { for(ParentNodes::const_iterator p = parents.begin(); p != parents.end(); p++) { const SplitPoints &point = *p; @@ -161,11 +161,11 @@ void RightBinarize( SyntaxTree &tree, ParentNodes &parents ) } } -void SAMT( SyntaxTree &tree, ParentNodes &parents ) +void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents ) { int numWords = tree.GetNumWords(); - SyntaxTree newTree; // to store new nodes + SyntaxNodeCollection newTree; // to store new nodes // look through parents to combine children for(ParentNodes::const_iterator p = parents.begin(); p != parents.end(); p++) { diff --git a/phrase-extract/relax-parse.h b/phrase-extract/relax-parse.h index 9bd0bfb23..a00aa6deb 100644 --- a/phrase-extract/relax-parse.h +++ b/phrase-extract/relax-parse.h @@ -28,7 +28,7 @@ #include #include -#include "SyntaxTree.h" +#include "SyntaxNodeCollection.h" #include "XmlTree.h" #define LINE_MAX_LENGTH 1000000 @@ -39,8 +39,8 @@ char SAMTLevel = 0; // functions void init(int argc, char* argv[]); -void store( MosesTraining::SyntaxTree &tree, const std::vector &words ); -void LeftBinarize( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents ); -void RightBinarize( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents ); -void SAMT( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents ); +void store( MosesTraining::SyntaxNodeCollection &tree, const std::vector &words ); +void LeftBinarize( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents ); +void RightBinarize( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents ); +void SAMT( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents ); diff --git a/phrase-extract/syntax-common/xml_tree_parser.cc b/phrase-extract/syntax-common/xml_tree_parser.cc index c6e3cd3c3..2f8a904fa 100644 --- a/phrase-extract/syntax-common/xml_tree_parser.cc +++ b/phrase-extract/syntax-common/xml_tree_parser.cc @@ -13,17 +13,17 @@ namespace Syntax { StringTree *XmlTreeParser::Parse(const std::string &line) { line_ = line; - tree_.Clear(); + node_collection_.Clear(); try { - if (!ProcessAndStripXMLTags(line_, tree_, label_set_, top_label_set_, - false)) { + if (!ProcessAndStripXMLTags(line_, node_collection_, label_set_, + top_label_set_, false)) { throw Exception(""); } } catch (const XmlException &e) { throw Exception(e.getMsg()); } - tree_.ConnectNodes(); - SyntaxNode *root = tree_.GetTop(); + node_collection_.ConnectNodes(); + SyntaxNode *root = node_collection_.GetTop(); assert(root); words_ = util::tokenize(line_); return ConvertTree(*root, words_); diff --git a/phrase-extract/syntax-common/xml_tree_parser.h b/phrase-extract/syntax-common/xml_tree_parser.h index a5563f63a..c84ea25ec 100644 --- a/phrase-extract/syntax-common/xml_tree_parser.h +++ b/phrase-extract/syntax-common/xml_tree_parser.h @@ -5,7 +5,8 @@ #include #include -#include "SyntaxTree.h" +#include "SyntaxNode.h" +#include "SyntaxNodeCollection.h" #include "exception.h" #include "string_tree.h" @@ -26,7 +27,7 @@ class XmlTreeParser { std::set label_set_; std::map top_label_set_; std::string line_; - MosesTraining::SyntaxTree tree_; + MosesTraining::SyntaxNodeCollection node_collection_; std::vector words_; }; diff --git a/scripts/OSM/OSM-Train.perl b/scripts/OSM/OSM-Train.perl index 895a821db..07ad71f68 100755 --- a/scripts/OSM/OSM-Train.perl +++ b/scripts/OSM/OSM-Train.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/OSM/extract-singletons.perl b/scripts/OSM/extract-singletons.perl index 5a1665a8c..6295edfad 100755 --- a/scripts/OSM/extract-singletons.perl +++ b/scripts/OSM/extract-singletons.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. #use strict; use warnings; diff --git a/scripts/OSM/flipAlignment.perl b/scripts/OSM/flipAlignment.perl index b896c0a23..57a1e9bb0 100755 --- a/scripts/OSM/flipAlignment.perl +++ b/scripts/OSM/flipAlignment.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/Transliteration/clean.pl b/scripts/Transliteration/clean.pl index ccc364fc9..7a08271da 100755 --- a/scripts/Transliteration/clean.pl +++ b/scripts/Transliteration/clean.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. #input hindi word urdu word, delete all those entries that have number on any side use warnings; @@ -314,4 +317,4 @@ sub charFreqFilter{ } } } -} \ No newline at end of file +} diff --git a/scripts/Transliteration/corpusCreator.pl b/scripts/Transliteration/corpusCreator.pl index 4c62449df..ac67f5d74 100755 --- a/scripts/Transliteration/corpusCreator.pl +++ b/scripts/Transliteration/corpusCreator.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/Transliteration/in-decoding-transliteration.pl b/scripts/Transliteration/in-decoding-transliteration.pl index c3cc31f26..e8130db02 100755 --- a/scripts/Transliteration/in-decoding-transliteration.pl +++ b/scripts/Transliteration/in-decoding-transliteration.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/Transliteration/post-decoding-transliteration.pl b/scripts/Transliteration/post-decoding-transliteration.pl index 60c3200f6..2c7908085 100755 --- a/scripts/Transliteration/post-decoding-transliteration.pl +++ b/scripts/Transliteration/post-decoding-transliteration.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/Transliteration/prepare-transliteration-phrase-table.pl b/scripts/Transliteration/prepare-transliteration-phrase-table.pl index df3b1ceca..0a9f554c5 100755 --- a/scripts/Transliteration/prepare-transliteration-phrase-table.pl +++ b/scripts/Transliteration/prepare-transliteration-phrase-table.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/Transliteration/threshold.pl b/scripts/Transliteration/threshold.pl index bf6657742..3baa8e0a7 100755 --- a/scripts/Transliteration/threshold.pl +++ b/scripts/Transliteration/threshold.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use utf8; diff --git a/scripts/Transliteration/train-transliteration-module.pl b/scripts/Transliteration/train-transliteration-module.pl index 35e4ee396..b1d4d0ff5 100755 --- a/scripts/Transliteration/train-transliteration-module.pl +++ b/scripts/Transliteration/train-transliteration-module.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use utf8; diff --git a/scripts/analysis/bootstrap-hypothesis-difference-significance.pl b/scripts/analysis/bootstrap-hypothesis-difference-significance.pl index 8e6a6255a..9a3f63d69 100755 --- a/scripts/analysis/bootstrap-hypothesis-difference-significance.pl +++ b/scripts/analysis/bootstrap-hypothesis-difference-significance.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use utf8; ############################################### diff --git a/scripts/analysis/extract-target-trees.py b/scripts/analysis/extract-target-trees.py index 3a92fdc4d..7166211d9 100755 --- a/scripts/analysis/extract-target-trees.py +++ b/scripts/analysis/extract-target-trees.py @@ -1,9 +1,13 @@ #!/usr/bin/env python - -# Usage: extract-target-trees.py [FILE] # -# Reads moses-chart's -T output from FILE or standard input and writes trees to -# standard output in Moses' XML tree format. +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + +"""Usage: extract-target-trees.py [FILE] + +Reads moses-chart's -T output from FILE or standard input and writes trees to +standard output in Moses' XML tree format. +""" import re import sys diff --git a/scripts/analysis/nontranslated_words.pl b/scripts/analysis/nontranslated_words.pl index 51a4f9d20..7213deb76 100755 --- a/scripts/analysis/nontranslated_words.pl +++ b/scripts/analysis/nontranslated_words.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ # Reads a source and hypothesis file and counts equal tokens. Some of these diff --git a/scripts/analysis/oov.pl b/scripts/analysis/oov.pl index 052c9994d..9756887c9 100755 --- a/scripts/analysis/oov.pl +++ b/scripts/analysis/oov.pl @@ -1,6 +1,9 @@ #!/usr/bin/env perl # Display OOV rate of a test set against a training corpus or a phrase table. # Ondrej Bojar +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use strict; use warnings; diff --git a/scripts/analysis/sentence-by-sentence.pl b/scripts/analysis/sentence-by-sentence.pl index 72b70dc72..b9eb6e56d 100755 --- a/scripts/analysis/sentence-by-sentence.pl +++ b/scripts/analysis/sentence-by-sentence.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ #sentence-by-sentence: take in a system output, with any number of factors, and a reference translation, also maybe with factors, and show each sentence and its errors diff --git a/scripts/analysis/sg2dot.perl b/scripts/analysis/sg2dot.perl index e9c1639ed..5f9a5ea1d 100755 --- a/scripts/analysis/sg2dot.perl +++ b/scripts/analysis/sg2dot.perl @@ -3,6 +3,8 @@ # Author : Loic BARRAULT # Script to convert MOSES searchgraph to DOT format # +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/analysis/show-phrases-used.pl b/scripts/analysis/show-phrases-used.pl index 522e6d3ff..9428ea9b8 100755 --- a/scripts/analysis/show-phrases-used.pl +++ b/scripts/analysis/show-phrases-used.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ #show-phrases-used: display all source and target phrases for each sentence in a corpus, and give average phrase length used diff --git a/scripts/analysis/smtgui/Corpus.pm b/scripts/analysis/smtgui/Corpus.pm index f050a9f6d..2391a6c15 100644 --- a/scripts/analysis/smtgui/Corpus.pm +++ b/scripts/analysis/smtgui/Corpus.pm @@ -1,5 +1,8 @@ #package Corpus: hold a bunch of sentences in any language, with translation factors and stats about individual sentences and the corpus as a whole #Evan Herbst, 7 / 25 / 06 +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. package Corpus; BEGIN diff --git a/scripts/analysis/smtgui/filter-phrase-table.pl b/scripts/analysis/smtgui/filter-phrase-table.pl index 55f2619c0..cd0f6b91b 100755 --- a/scripts/analysis/smtgui/filter-phrase-table.pl +++ b/scripts/analysis/smtgui/filter-phrase-table.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ #by Philipp Koehn, de-augmented by Evan Herbst diff --git a/scripts/analysis/smtgui/newsmtgui.cgi b/scripts/analysis/smtgui/newsmtgui.cgi index 32ad3a948..034ee265e 100755 --- a/scripts/analysis/smtgui/newsmtgui.cgi +++ b/scripts/analysis/smtgui/newsmtgui.cgi @@ -1,4 +1,7 @@ #!/usr/bin/perl -w +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ use strict; diff --git a/scripts/analysis/suspicious_tokenization.pl b/scripts/analysis/suspicious_tokenization.pl index 3ea15154e..f807153d9 100755 --- a/scripts/analysis/suspicious_tokenization.pl +++ b/scripts/analysis/suspicious_tokenization.pl @@ -2,6 +2,9 @@ # Collects and prints all n-grams that appear in the given corpus both # tokenized as well as untokenized. # Ondrej Bojar +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use strict; use warnings; diff --git a/scripts/analysis/weight-scan-summarize.sh b/scripts/analysis/weight-scan-summarize.sh index 237182736..2fccb6470 100755 --- a/scripts/analysis/weight-scan-summarize.sh +++ b/scripts/analysis/weight-scan-summarize.sh @@ -1,4 +1,8 @@ #!/bin/bash +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + # Hackish summarization of weight-scan.pl results, heavily relies on tools by # Ondrej Bojar (bojar@ufal.mff.cuni.cz), some of which need Mercury; beware. diff --git a/scripts/analysis/weight-scan.pl b/scripts/analysis/weight-scan.pl index b33360694..b51a6bcd1 100755 --- a/scripts/analysis/weight-scan.pl +++ b/scripts/analysis/weight-scan.pl @@ -1,4 +1,8 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + # runs Moses many times changing the values of one weight, all others fixed # nbest lists are always produced to allow for comparison of real and # 'projected' BLEU (BLEU estimated from n-best lists collected at a neighouring diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl index ece110fbc..a3f5310a5 100755 --- a/scripts/ems/experiment.perl +++ b/scripts/ems/experiment.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # Experiment Management System # Documentation at http://www.statmt.org/moses/?n=FactoredTraining.EMS diff --git a/scripts/ems/fix-info.perl b/scripts/ems/fix-info.perl index abe58fe83..6659027b2 100755 --- a/scripts/ems/fix-info.perl +++ b/scripts/ems/fix-info.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/analysis.perl b/scripts/ems/support/analysis.perl index f4d5a55b4..01bb21773 100755 --- a/scripts/ems/support/analysis.perl +++ b/scripts/ems/support/analysis.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/berkeley-process.sh b/scripts/ems/support/berkeley-process.sh index e68056c96..347ebba3c 100755 --- a/scripts/ems/support/berkeley-process.sh +++ b/scripts/ems/support/berkeley-process.sh @@ -1,4 +1,7 @@ #!/bin/sh +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. if [ $# -lt 8 ] then diff --git a/scripts/ems/support/berkeley-train.sh b/scripts/ems/support/berkeley-train.sh index 96f6b648c..530cf978f 100755 --- a/scripts/ems/support/berkeley-train.sh +++ b/scripts/ems/support/berkeley-train.sh @@ -1,4 +1,7 @@ #!/bin/sh +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. if [ $# -lt 6 ] then diff --git a/scripts/ems/support/build-domain-file-from-subcorpora.perl b/scripts/ems/support/build-domain-file-from-subcorpora.perl index 085fd2629..f45b5ba2a 100755 --- a/scripts/ems/support/build-domain-file-from-subcorpora.perl +++ b/scripts/ems/support/build-domain-file-from-subcorpora.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/build-sparse-features.perl b/scripts/ems/support/build-sparse-features.perl index 79fc1e394..b134cee69 100755 --- a/scripts/ems/support/build-sparse-features.perl +++ b/scripts/ems/support/build-sparse-features.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/consolidate-training-data.perl b/scripts/ems/support/consolidate-training-data.perl index 4ab7f82cf..2a732be77 100755 --- a/scripts/ems/support/consolidate-training-data.perl +++ b/scripts/ems/support/consolidate-training-data.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id: consolidate-training-data.perl 928 2009-09-02 02:58:01Z philipp $ diff --git a/scripts/ems/support/defaultconfig.py b/scripts/ems/support/defaultconfig.py index a118e96b3..53913da08 100644 --- a/scripts/ems/support/defaultconfig.py +++ b/scripts/ems/support/defaultconfig.py @@ -1,4 +1,7 @@ #!/usr/bin/env python2 +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. """Version of ConfigParser which accepts default values.""" diff --git a/scripts/ems/support/fast-align-in-parts.perl b/scripts/ems/support/fast-align-in-parts.perl index f777d7e52..bc340a50f 100755 --- a/scripts/ems/support/fast-align-in-parts.perl +++ b/scripts/ems/support/fast-align-in-parts.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. ####################### # Revision history diff --git a/scripts/ems/support/generic-multicore-parallelizer.perl b/scripts/ems/support/generic-multicore-parallelizer.perl index 0f7910603..d821aa114 100755 --- a/scripts/ems/support/generic-multicore-parallelizer.perl +++ b/scripts/ems/support/generic-multicore-parallelizer.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/generic-parallelizer.perl b/scripts/ems/support/generic-parallelizer.perl index 811a99bde..087498ccf 100755 --- a/scripts/ems/support/generic-parallelizer.perl +++ b/scripts/ems/support/generic-parallelizer.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/input-from-sgm.perl b/scripts/ems/support/input-from-sgm.perl index 18000581a..eb6a2e3a1 100755 --- a/scripts/ems/support/input-from-sgm.perl +++ b/scripts/ems/support/input-from-sgm.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/interpolate-lm.perl b/scripts/ems/support/interpolate-lm.perl index 7d52fd877..4d9a513f6 100755 --- a/scripts/ems/support/interpolate-lm.perl +++ b/scripts/ems/support/interpolate-lm.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/lmplz-wrapper.perl b/scripts/ems/support/lmplz-wrapper.perl index df503754f..89b2847d6 100755 --- a/scripts/ems/support/lmplz-wrapper.perl +++ b/scripts/ems/support/lmplz-wrapper.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/mml-filter.perl b/scripts/ems/support/mml-filter.perl index 51bc4cda5..32bca335b 100755 --- a/scripts/ems/support/mml-filter.perl +++ b/scripts/ems/support/mml-filter.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/mml-score.perl b/scripts/ems/support/mml-score.perl index 6f7b724ea..f88021818 100755 --- a/scripts/ems/support/mml-score.perl +++ b/scripts/ems/support/mml-score.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/mml-train.perl b/scripts/ems/support/mml-train.perl index dcc998711..bdf6c1c1a 100755 --- a/scripts/ems/support/mml-train.perl +++ b/scripts/ems/support/mml-train.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/prepare-fast-align.perl b/scripts/ems/support/prepare-fast-align.perl index 80fec36b2..68b1f0189 100755 --- a/scripts/ems/support/prepare-fast-align.perl +++ b/scripts/ems/support/prepare-fast-align.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/reference-from-sgm.perl b/scripts/ems/support/reference-from-sgm.perl index ebb9ae4ae..b8e1d108d 100755 --- a/scripts/ems/support/reference-from-sgm.perl +++ b/scripts/ems/support/reference-from-sgm.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/remove-segmentation-markup.perl b/scripts/ems/support/remove-segmentation-markup.perl index a0bd61fff..3b02bceaf 100755 --- a/scripts/ems/support/remove-segmentation-markup.perl +++ b/scripts/ems/support/remove-segmentation-markup.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/report-experiment-scores.perl b/scripts/ems/support/report-experiment-scores.perl index b649951ce..c859508cb 100755 --- a/scripts/ems/support/report-experiment-scores.perl +++ b/scripts/ems/support/report-experiment-scores.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id: report-experiment-scores.perl 407 2008-11-10 14:43:31Z philipp $ diff --git a/scripts/ems/support/run-command-on-multiple-refsets.perl b/scripts/ems/support/run-command-on-multiple-refsets.perl index 1e914b44b..41823b4ee 100755 --- a/scripts/ems/support/run-command-on-multiple-refsets.perl +++ b/scripts/ems/support/run-command-on-multiple-refsets.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/run-wade.perl b/scripts/ems/support/run-wade.perl index 175948b98..dfdb8e59d 100755 --- a/scripts/ems/support/run-wade.perl +++ b/scripts/ems/support/run-wade.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/split-sentences.perl b/scripts/ems/support/split-sentences.perl index 02a1e2315..f72767054 100755 --- a/scripts/ems/support/split-sentences.perl +++ b/scripts/ems/support/split-sentences.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # Based on Preprocessor written by Philipp Koehn diff --git a/scripts/ems/support/submit-grid.perl b/scripts/ems/support/submit-grid.perl index a0967f9a5..ff43cd123 100755 --- a/scripts/ems/support/submit-grid.perl +++ b/scripts/ems/support/submit-grid.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/substitute-filtered-tables-and-weights.perl b/scripts/ems/support/substitute-filtered-tables-and-weights.perl index 13be52c6b..2e6908ab4 100755 --- a/scripts/ems/support/substitute-filtered-tables-and-weights.perl +++ b/scripts/ems/support/substitute-filtered-tables-and-weights.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/substitute-filtered-tables.perl b/scripts/ems/support/substitute-filtered-tables.perl index c5ebabded..548982592 100755 --- a/scripts/ems/support/substitute-filtered-tables.perl +++ b/scripts/ems/support/substitute-filtered-tables.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; diff --git a/scripts/ems/support/substitute-weights.perl b/scripts/ems/support/substitute-weights.perl index b692f3f85..efa9338ca 100755 --- a/scripts/ems/support/substitute-weights.perl +++ b/scripts/ems/support/substitute-weights.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; diff --git a/scripts/ems/support/symmetrize-fast-align.perl b/scripts/ems/support/symmetrize-fast-align.perl index 9f7fec248..4ed3e087d 100755 --- a/scripts/ems/support/symmetrize-fast-align.perl +++ b/scripts/ems/support/symmetrize-fast-align.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/thot-lm-wrapper.perl b/scripts/ems/support/thot-lm-wrapper.perl index 59d483e65..ffbcb50e2 100755 --- a/scripts/ems/support/thot-lm-wrapper.perl +++ b/scripts/ems/support/thot-lm-wrapper.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/tree-converter-wrapper.perl b/scripts/ems/support/tree-converter-wrapper.perl index aae55991a..ae7e2c5a6 100755 --- a/scripts/ems/support/tree-converter-wrapper.perl +++ b/scripts/ems/support/tree-converter-wrapper.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/wrap-xml.perl b/scripts/ems/support/wrap-xml.perl index 52190309a..09ea2a2f8 100755 --- a/scripts/ems/support/wrap-xml.perl +++ b/scripts/ems/support/wrap-xml.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/web/analysis.php b/scripts/ems/web/analysis.php index 57776dd22..5e5f707f6 100644 --- a/scripts/ems/web/analysis.php +++ b/scripts/ems/web/analysis.php @@ -1,5 +1,10 @@ Search Graph Visualization, Sentence <?php $sentence ?> diff --git a/scripts/fuzzy-match/create_xml.perl b/scripts/fuzzy-match/create_xml.perl index 4ab281eae..97025d62a 100755 --- a/scripts/fuzzy-match/create_xml.perl +++ b/scripts/fuzzy-match/create_xml.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. binmode( STDIN, ":utf8" ); binmode( STDOUT, ":utf8" ); diff --git a/scripts/generic/bsbleu.py b/scripts/generic/bsbleu.py index 12d2201de..296900b18 100755 --- a/scripts/generic/bsbleu.py +++ b/scripts/generic/bsbleu.py @@ -1,6 +1,9 @@ #!/usr/bin/env python # compute Bleu scores with confidence intervals via boostrap resampling # written by Ulrich Germann +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. from argparse import ArgumentParser import math diff --git a/scripts/generic/compound-splitter.perl b/scripts/generic/compound-splitter.perl index b39d4d660..2ece80a60 100755 --- a/scripts/generic/compound-splitter.perl +++ b/scripts/generic/compound-splitter.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/generic/extract-factors.pl b/scripts/generic/extract-factors.pl index 38cf97bd4..2b1c51cd1 100755 --- a/scripts/generic/extract-factors.pl +++ b/scripts/generic/extract-factors.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ #extract-factors.pl: extract only the desired factors from a factored corpus diff --git a/scripts/generic/extract-parallel.perl b/scripts/generic/extract-parallel.perl index be30ff652..3240f24eb 100755 --- a/scripts/generic/extract-parallel.perl +++ b/scripts/generic/extract-parallel.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # example # ./extract-parallel.perl 8 ./coreutils-8.9/src/split "./coreutils-8.9/src/sort --batch-size=253" ./extract ./corpus.5.en ./corpus.5.ar ./align.ar-en.grow-diag-final-and ./extracted 7 --NoFileLimit orientation --GZOutput diff --git a/scripts/generic/fsa2fsal.pl b/scripts/generic/fsa2fsal.pl index 7dc7751ee..28ec28a26 100755 --- a/scripts/generic/fsa2fsal.pl +++ b/scripts/generic/fsa2fsal.pl @@ -4,6 +4,9 @@ # ' ' to delimit nodes (i.e. original lines). # Some rudimentary sanity checks are done on the fly. # Ondrej Bojar, bojar@ufal.mff.cuni.cz +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/generic/fsa2plf.pl b/scripts/generic/fsa2plf.pl index 07c8a4cc1..4b9474d5a 100755 --- a/scripts/generic/fsa2plf.pl +++ b/scripts/generic/fsa2plf.pl @@ -7,6 +7,9 @@ # final nodes. # Note that the output format may not contain any spaces. # Ondrej Bojar, bojar@ufal.mff.cuni.cz +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/generic/fsal2fsa.pl b/scripts/generic/fsal2fsa.pl index a21305dad..158dab5b3 100755 --- a/scripts/generic/fsal2fsa.pl +++ b/scripts/generic/fsal2fsa.pl @@ -1,6 +1,9 @@ #!/usr/bin/env perl # A very simple script that converts fsal back to fsa format (openfst lattices) # Ondrej Bojar, bojar@ufal.mff.cuni.cz +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/generic/generic-parallel.perl b/scripts/generic/generic-parallel.perl index a9bc73d85..07f6a210a 100755 --- a/scripts/generic/generic-parallel.perl +++ b/scripts/generic/generic-parallel.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/generic/giza-parallel.perl b/scripts/generic/giza-parallel.perl index 9a6516a8f..a9921a992 100755 --- a/scripts/generic/giza-parallel.perl +++ b/scripts/generic/giza-parallel.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # example # ~/giza-parallel.perl 10 split ~/workspace/sourceforge/trunk/scripts/training/train-model.perl ar en train align diff --git a/scripts/generic/lopar2pos.pl b/scripts/generic/lopar2pos.pl index 2b9245e0f..fc2c35c7f 100755 --- a/scripts/generic/lopar2pos.pl +++ b/scripts/generic/lopar2pos.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ #lopar2pos: extract POSs from LOPAR output diff --git a/scripts/generic/moses-parallel.pl b/scripts/generic/moses-parallel.pl index eb51daa98..144b7d6b2 100755 --- a/scripts/generic/moses-parallel.pl +++ b/scripts/generic/moses-parallel.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ ####################### diff --git a/scripts/generic/moses_sim_pe.py b/scripts/generic/moses_sim_pe.py index 32f785961..3497ca558 100755 --- a/scripts/generic/moses_sim_pe.py +++ b/scripts/generic/moses_sim_pe.py @@ -1,20 +1,25 @@ #!/usr/bin/env python # Written by Michael Denkowski +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. -# This script parallelizes decoding with simulated post-editing via moses XML -# input (XML entities need to be escaped in tokenization). Memory mapped -# dynamic phrase tables (Ulrich Germann, -# www.statmt.org/moses/?n=Moses.AdvancedFeatures#ntoc40) and language models -# (Kenneth Heafield, -# http://www.statmt.org/moses/?n=FactoredTraining.BuildingLanguageModel#ntoc19) -# facilitate memory efficient multi process decoding. Input is divided into -# batches, each of which is decoded sequentially. Each batch pre-loads the -# data from previous batches. +"""Parallelize decoding with simulated post-editing via moses XML input. -# To use in tuning, run mert-moses.pl with --sim-pe=SYMAL where SYMAL is the -# alignment from input to references. Specify the number of jobs with -# --decoder-flags="-threads N". +(XML entities need to be escaped in tokenization). Memory mapped +dynamic phrase tables (Ulrich Germann, +www.statmt.org/moses/?n=Moses.AdvancedFeatures#ntoc40) and language models +(Kenneth Heafield, +http://www.statmt.org/moses/?n=FactoredTraining.BuildingLanguageModel#ntoc19) +facilitate memory efficient multi process decoding. Input is divided into +batches, each of which is decoded sequentially. Each batch pre-loads the +data from previous batches. + +To use in tuning, run mert-moses.pl with --sim-pe=SYMAL where SYMAL is the +alignment from input to references. Specify the number of jobs with +--decoder-flags="-threads N". +""" import gzip import itertools diff --git a/scripts/generic/mteval-v12.pl b/scripts/generic/mteval-v12.pl index 2666c8012..b4dfbf83a 100755 --- a/scripts/generic/mteval-v12.pl +++ b/scripts/generic/mteval-v12.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/generic/mteval-v13a.pl b/scripts/generic/mteval-v13a.pl index 41a88800a..bdc2d9479 100755 --- a/scripts/generic/mteval-v13a.pl +++ b/scripts/generic/mteval-v13a.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/generic/multi-bleu.perl b/scripts/generic/multi-bleu.perl index 344f58c6f..61de10d45 100755 --- a/scripts/generic/multi-bleu.perl +++ b/scripts/generic/multi-bleu.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ use warnings; diff --git a/scripts/generic/ph_numbers.perl b/scripts/generic/ph_numbers.perl index 612263249..f0ae1f851 100755 --- a/scripts/generic/ph_numbers.perl +++ b/scripts/generic/ph_numbers.perl @@ -6,6 +6,9 @@ package ph_numbers; # and decoder input # # (c) 2013 TAUS +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/generic/qsub-wrapper.pl b/scripts/generic/qsub-wrapper.pl index ac3d0900a..ef9938e07 100755 --- a/scripts/generic/qsub-wrapper.pl +++ b/scripts/generic/qsub-wrapper.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ use warnings; diff --git a/scripts/generic/reverse-alignment.perl b/scripts/generic/reverse-alignment.perl index 681b3221e..f01acf5b0 100755 --- a/scripts/generic/reverse-alignment.perl +++ b/scripts/generic/reverse-alignment.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/generic/score-parallel.perl b/scripts/generic/score-parallel.perl index 81bc6f7d0..625b449c0 100755 --- a/scripts/generic/score-parallel.perl +++ b/scripts/generic/score-parallel.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # example # ./score-parallel.perl 8 "gsort --batch-size=253" ./score ./extract.2.sorted.gz ./lex.2.f2e ./phrase-table.2.half.f2e --GoodTuring ./phrase-table.2.coc 0 diff --git a/scripts/generic/strip-xml.perl b/scripts/generic/strip-xml.perl index c993421f0..a5dbbaa37 100755 --- a/scripts/generic/strip-xml.perl +++ b/scripts/generic/strip-xml.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/generic/trainlm-irst2.perl b/scripts/generic/trainlm-irst2.perl index f664e96ee..8af372fac 100755 --- a/scripts/generic/trainlm-irst2.perl +++ b/scripts/generic/trainlm-irst2.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # Compatible with sri LM-creating script, eg. # ngram-count -order 5 -interpolate -wbdiscount -unk -text corpus.txt -lm lm.txt diff --git a/scripts/other/beautify.py b/scripts/other/beautify.py index f03a58ce7..56df24bc8 100755 --- a/scripts/other/beautify.py +++ b/scripts/other/beautify.py @@ -1,4 +1,9 @@ #! /usr/bin/env python +# +# Originally written in 2015 by Jeroen Vermeulen (Precision Translation Tools). +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. """Reformat project source code, and/or check for style errors ("lint"). @@ -38,6 +43,17 @@ BEAUTIFY_IGNORE = '.beautify-ignore' class LintCheckFailure(Exception): """Lint was found, or the lint checker otherwise returned failure.""" + exit_code = 1 + + +class ProgramFailure(Exception): + """The program failed, but it's not a bug. No traceback.""" + exit_code = 2 + + +class CommandLineError(Exception): + """Something wrong with the command-line arguments.""" + exit_code = 3 def read_ignore_file(root_dir): @@ -52,7 +68,7 @@ def read_ignore_file(root_dir): ignore_contents = ignore_file.read() except IOError as error: if error.errno == ENOENT: - raise Exception( + raise ProgramFailure( "No .gitignore file found in %s. " "Is it really the project's root directory?" % root_dir) @@ -200,7 +216,7 @@ def check_astyle_version(verbose=False): ['astyle', '--version'], verbose=verbose, env={'LC_ALL': 'C'}) version = version.strip() if version != EXPECTED_ASTYLE_VERSION: - raise Exception( + raise ProgramFailure( "Wrong astyle version. " "Expected '%s', but got version string '%s'." % (EXPECTED_ASTYLE_VERSION, version)) @@ -226,8 +242,15 @@ def run_perltidy(source_files, verbose=False, dry_run=False): # Write "} else {", with 'else' on the same line as the braces. '--cuddled-else', ] - _, stderr = run_command( - command_line + source_files, verbose=verbose, dry_run=dry_run) + try: + _, stderr = run_command( + command_line + source_files, verbose=verbose, dry_run=dry_run) + except OSError as error: + if error.errno == ENOENT: + raise ProgramFailure( + "Could not run 'perltidy'. Make sure that it is installed.") + else: + raise if stderr != '': sys.stderr.write(stderr) @@ -386,7 +409,7 @@ def main(): """Find and format source files.""" args = parse_arguments() if not args.format and not args.lint: - raise Exception("Select action: --format, --lint, or both.") + raise CommandLineError("Select action: --format, --lint, or both.") ignore = read_ignore_file(args.root_dir) @@ -409,8 +432,8 @@ def main(): if __name__ == '__main__': try: main() - except LintCheckFailure as error: + except (CommandLineError, LintCheckFailure, ProgramFailure) as error: # This is a failure, but not a bug. Print a friendly error # message, not a traceback. sys.stderr.write('%s\n' % error) - sys.exit(1) + sys.exit(error.exit_code) diff --git a/scripts/other/convert-pt.perl b/scripts/other/convert-pt.perl index e087126f1..60c8cbdb2 100755 --- a/scripts/other/convert-pt.perl +++ b/scripts/other/convert-pt.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ # convert a phrase-table with alignment in Moses' dead-end format diff --git a/scripts/other/delete-scores.perl b/scripts/other/delete-scores.perl index ffb788867..ebaf277fa 100755 --- a/scripts/other/delete-scores.perl +++ b/scripts/other/delete-scores.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/other/gacha_filter.py b/scripts/other/gacha_filter.py index 0deb45761..af5921d41 100644 --- a/scripts/other/gacha_filter.py +++ b/scripts/other/gacha_filter.py @@ -1,4 +1,7 @@ #!/usr/bin/env python3 -*- coding: utf-8 -*- +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. """ The Gacha filter cleans out sentence pairs that have global character mean diff --git a/scripts/other/get_many_translations_from_google.perl b/scripts/other/get_many_translations_from_google.perl index 0b1436c20..ac2933296 100755 --- a/scripts/other/get_many_translations_from_google.perl +++ b/scripts/other/get_many_translations_from_google.perl @@ -5,6 +5,9 @@ # Expects one sentence per line, not tokenized! # # Ondrej Bojar, bojar@ufal.mff.cuni.cz +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/other/retain-lines.perl b/scripts/other/retain-lines.perl index f04a8ebad..c789f96c7 100755 --- a/scripts/other/retain-lines.perl +++ b/scripts/other/retain-lines.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. #retain lines in clean.lines-retained.1 use strict; diff --git a/scripts/other/translate_by_microsoft_bing.perl b/scripts/other/translate_by_microsoft_bing.perl index c9b1b31de..d4222878e 100755 --- a/scripts/other/translate_by_microsoft_bing.perl +++ b/scripts/other/translate_by_microsoft_bing.perl @@ -2,6 +2,9 @@ # Script implemented by Pranava Swaroop Madhyastha (a student at Charles # University, UFAL) +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use strict; use warnings; diff --git a/scripts/recaser/detruecase.perl b/scripts/recaser/detruecase.perl index b882852a0..66ca24fa2 100755 --- a/scripts/recaser/detruecase.perl +++ b/scripts/recaser/detruecase.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/recaser/recase.perl b/scripts/recaser/recase.perl index 52cec36ea..b951ca764 100755 --- a/scripts/recaser/recase.perl +++ b/scripts/recaser/recase.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ use warnings; diff --git a/scripts/recaser/train-recaser.perl b/scripts/recaser/train-recaser.perl index dce388bca..cb3388c38 100755 --- a/scripts/recaser/train-recaser.perl +++ b/scripts/recaser/train-recaser.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ use warnings; diff --git a/scripts/recaser/train-truecaser.perl b/scripts/recaser/train-truecaser.perl index 753183324..7f8909082 100755 --- a/scripts/recaser/train-truecaser.perl +++ b/scripts/recaser/train-truecaser.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $ diff --git a/scripts/recaser/truecase.perl b/scripts/recaser/truecase.perl index 544b79c47..aab185ce9 100755 --- a/scripts/recaser/truecase.perl +++ b/scripts/recaser/truecase.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $ diff --git a/scripts/regression-testing/MosesScriptsRegressionTesting.pm b/scripts/regression-testing/MosesScriptsRegressionTesting.pm index d8b0590c8..acc134d70 100644 --- a/scripts/regression-testing/MosesScriptsRegressionTesting.pm +++ b/scripts/regression-testing/MosesScriptsRegressionTesting.pm @@ -1,3 +1,6 @@ +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + package MosesScriptsRegressionTesting; use strict; diff --git a/scripts/regression-testing/compare-results.pl b/scripts/regression-testing/compare-results.pl index 572431951..8f1461cec 100755 --- a/scripts/regression-testing/compare-results.pl +++ b/scripts/regression-testing/compare-results.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/regression-testing/create_localized_moses_ini.pl b/scripts/regression-testing/create_localized_moses_ini.pl index 1d03e5ab8..3e2b6f37f 100755 --- a/scripts/regression-testing/create_localized_moses_ini.pl +++ b/scripts/regression-testing/create_localized_moses_ini.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/regression-testing/modify-pars.pl b/scripts/regression-testing/modify-pars.pl index de2df2919..7726af9e6 100755 --- a/scripts/regression-testing/modify-pars.pl +++ b/scripts/regression-testing/modify-pars.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/regression-testing/moses-virtual.pl b/scripts/regression-testing/moses-virtual.pl index 3af3c79e4..3b23b525a 100755 --- a/scripts/regression-testing/moses-virtual.pl +++ b/scripts/regression-testing/moses-virtual.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/regression-testing/run-single-test.pl b/scripts/regression-testing/run-single-test.pl index e8307da36..037de8285 100755 --- a/scripts/regression-testing/run-single-test.pl +++ b/scripts/regression-testing/run-single-test.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/regression-testing/run-test-suite.pl b/scripts/regression-testing/run-test-suite.pl index b384f8b98..a12938e61 100755 --- a/scripts/regression-testing/run-test-suite.pl +++ b/scripts/regression-testing/run-test-suite.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/server/moses.py b/scripts/server/moses.py index 7cf152187..e825ab39e 100644 --- a/scripts/server/moses.py +++ b/scripts/server/moses.py @@ -1,5 +1,8 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. """ Python utilities for moses diff --git a/scripts/server/sim-pe.py b/scripts/server/sim-pe.py index 5f1407524..6f76bf46d 100755 --- a/scripts/server/sim-pe.py +++ b/scripts/server/sim-pe.py @@ -2,8 +2,14 @@ # -*- coding: utf-8 -*- # Written by Ulrich Germann on the basis of contrib/server/client.py. -# This script simulates post-editing of MT output and incrementally -# updates the dynamic phrase tables in the moses server. +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + +"""Simulate post-editing of MT output. + +Incrementally updates the dynamic phrase tables in the moses server. +""" import argparse import os diff --git a/scripts/tokenizer/deescape-special-chars-PTB.perl b/scripts/tokenizer/deescape-special-chars-PTB.perl index f9601924f..ad2529b21 100755 --- a/scripts/tokenizer/deescape-special-chars-PTB.perl +++ b/scripts/tokenizer/deescape-special-chars-PTB.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/tokenizer/deescape-special-chars.perl b/scripts/tokenizer/deescape-special-chars.perl index 002955e62..b9d1ad74c 100755 --- a/scripts/tokenizer/deescape-special-chars.perl +++ b/scripts/tokenizer/deescape-special-chars.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/tokenizer/detokenizer.perl b/scripts/tokenizer/detokenizer.perl index 3a92bd024..881b93dd1 100755 --- a/scripts/tokenizer/detokenizer.perl +++ b/scripts/tokenizer/detokenizer.perl @@ -4,6 +4,9 @@ # Sample De-Tokenizer # written by Josh Schroeder, based on code by Philipp Koehn # further modifications by Ondrej Bojar +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. binmode(STDIN, ":utf8"); binmode(STDOUT, ":utf8"); diff --git a/scripts/tokenizer/escape-special-chars.perl b/scripts/tokenizer/escape-special-chars.perl index fbbbae292..143e85490 100755 --- a/scripts/tokenizer/escape-special-chars.perl +++ b/scripts/tokenizer/escape-special-chars.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/tokenizer/lowercase.perl b/scripts/tokenizer/lowercase.perl index e5c41bbed..bc75e5e5c 100755 --- a/scripts/tokenizer/lowercase.perl +++ b/scripts/tokenizer/lowercase.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/tokenizer/normalize-punctuation.perl b/scripts/tokenizer/normalize-punctuation.perl index 13e9fd3fc..7dab7543a 100755 --- a/scripts/tokenizer/normalize-punctuation.perl +++ b/scripts/tokenizer/normalize-punctuation.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/tokenizer/pre-tok-clean.perl b/scripts/tokenizer/pre-tok-clean.perl index 900e992ee..064f7b187 100755 --- a/scripts/tokenizer/pre-tok-clean.perl +++ b/scripts/tokenizer/pre-tok-clean.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use strict; diff --git a/scripts/tokenizer/pre-tokenizer.perl b/scripts/tokenizer/pre-tokenizer.perl index 514d8da8d..541ce77fb 100755 --- a/scripts/tokenizer/pre-tokenizer.perl +++ b/scripts/tokenizer/pre-tokenizer.perl @@ -3,6 +3,9 @@ # script for preprocessing language data prior to tokenization # Start by Ulrich Germann, after noticing systematic preprocessing errors # in some of the English Europarl data. +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/tokenizer/pre_tokenize_cleaning.py b/scripts/tokenizer/pre_tokenize_cleaning.py index 096a45dc4..c03af8f66 100644 --- a/scripts/tokenizer/pre_tokenize_cleaning.py +++ b/scripts/tokenizer/pre_tokenize_cleaning.py @@ -1,4 +1,7 @@ #!/usr/bin/env python -*- coding: utf-8 -*- +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. """ The Gacha filter cleans out sentence pairs that have global character mean diff --git a/scripts/tokenizer/remove-non-printing-char.perl b/scripts/tokenizer/remove-non-printing-char.perl index 9125b7691..92f6ade16 100755 --- a/scripts/tokenizer/remove-non-printing-char.perl +++ b/scripts/tokenizer/remove-non-printing-char.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use utf8; diff --git a/scripts/tokenizer/replace-unicode-punctuation.perl b/scripts/tokenizer/replace-unicode-punctuation.perl index cda69ddf7..c2c7088d6 100755 --- a/scripts/tokenizer/replace-unicode-punctuation.perl +++ b/scripts/tokenizer/replace-unicode-punctuation.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/tokenizer/tokenizer.perl b/scripts/tokenizer/tokenizer.perl index a5d4fadd3..e08bac941 100755 --- a/scripts/tokenizer/tokenizer.perl +++ b/scripts/tokenizer/tokenizer.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; diff --git a/scripts/tokenizer/tokenizer_PTB.perl b/scripts/tokenizer/tokenizer_PTB.perl index 6fff8d7f7..46b14775c 100755 --- a/scripts/tokenizer/tokenizer_PTB.perl +++ b/scripts/tokenizer/tokenizer_PTB.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # Sample Tokenizer ### Version 1.1 diff --git a/scripts/training/LexicalTranslationModel.pm b/scripts/training/LexicalTranslationModel.pm index c5dad60fb..3adc45f5e 100644 --- a/scripts/training/LexicalTranslationModel.pm +++ b/scripts/training/LexicalTranslationModel.pm @@ -1,3 +1,6 @@ +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + package LexicalTranslationModel; use strict; diff --git a/scripts/training/absolutize_moses_model.pl b/scripts/training/absolutize_moses_model.pl index bb7085895..27eccd8c7 100755 --- a/scripts/training/absolutize_moses_model.pl +++ b/scripts/training/absolutize_moses_model.pl @@ -5,6 +5,9 @@ # paths with absolute paths. # # Ondrej Bojar. +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; diff --git a/scripts/training/analyse_moses_model.pl b/scripts/training/analyse_moses_model.pl index 656f4a59b..7a5c2e701 100755 --- a/scripts/training/analyse_moses_model.pl +++ b/scripts/training/analyse_moses_model.pl @@ -4,6 +4,9 @@ # given a moses.ini file, checks the translation and generation tables and reports # statistics on ambiguity # Ondrej Bojar +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/bilingual-lm/averageNullEmbedding.py b/scripts/training/bilingual-lm/averageNullEmbedding.py index 891595aff..54c9a1bc4 100755 --- a/scripts/training/bilingual-lm/averageNullEmbedding.py +++ b/scripts/training/bilingual-lm/averageNullEmbedding.py @@ -1,4 +1,7 @@ #!/usr/bin/env python2 +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. import sys import numpy import argparse diff --git a/scripts/training/bilingual-lm/extract.py b/scripts/training/bilingual-lm/extract.py index f620edb5d..876fba9ee 100755 --- a/scripts/training/bilingual-lm/extract.py +++ b/scripts/training/bilingual-lm/extract.py @@ -1,4 +1,7 @@ #!/usr/bin/env python +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. from collections import Counter import logging diff --git a/scripts/training/bilingual-lm/extract_test.py b/scripts/training/bilingual-lm/extract_test.py index 3c9a03b85..8cade1e04 100755 --- a/scripts/training/bilingual-lm/extract_test.py +++ b/scripts/training/bilingual-lm/extract_test.py @@ -1,4 +1,7 @@ #!/usr/bin/env python +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. """Create a test corpus, using a previously pruned vocabulary.""" diff --git a/scripts/training/bilingual-lm/extract_training.py b/scripts/training/bilingual-lm/extract_training.py index bd3538188..e39a70318 100755 --- a/scripts/training/bilingual-lm/extract_training.py +++ b/scripts/training/bilingual-lm/extract_training.py @@ -1,4 +1,7 @@ #!/usr/bin/env python +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. from collections import Counter import logging diff --git a/scripts/training/bilingual-lm/reduce_ngrams.py b/scripts/training/bilingual-lm/reduce_ngrams.py index 3442fb302..4db41378d 100755 --- a/scripts/training/bilingual-lm/reduce_ngrams.py +++ b/scripts/training/bilingual-lm/reduce_ngrams.py @@ -1,6 +1,9 @@ #!/usr/bin/env python3 +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. -"""Reduces an ngrams file for training nplm to a smaller version of it. +"""Reduce an ngrams file for training nplm to a smaller version of it. The smaller version will have fewer ngrams. """ diff --git a/scripts/training/bilingual-lm/test_nplm.py b/scripts/training/bilingual-lm/test_nplm.py index 737266bc3..3a59fd344 100755 --- a/scripts/training/bilingual-lm/test_nplm.py +++ b/scripts/training/bilingual-lm/test_nplm.py @@ -1,4 +1,7 @@ #!/usr/bin/env python +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. import logging import optparse diff --git a/scripts/training/bilingual-lm/train_nplm.py b/scripts/training/bilingual-lm/train_nplm.py index 7bc74429e..572076006 100755 --- a/scripts/training/bilingual-lm/train_nplm.py +++ b/scripts/training/bilingual-lm/train_nplm.py @@ -1,4 +1,7 @@ #!/usr/bin/env python +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. from __future__ import print_function, unicode_literals @@ -36,7 +39,8 @@ parser.add_argument("--input-words-file", dest="input_words_file") parser.add_argument("--output-words-file", dest="output_words_file") parser.add_argument("--input_vocab_size", dest="input_vocab_size", type=int) parser.add_argument("--output_vocab_size", dest="output_vocab_size", type=int) - +parser.add_argument("--mmap", dest="mmap", action="store_true", + help="Use memory-mapped file (for lower memory consumption).") parser.set_defaults( working_dir="working", @@ -110,6 +114,11 @@ def main(options): options.working_dir, os.path.basename(options.corpus_stem) + ".numberized") + mmap_command = [] + if options.mmap: + in_file += '.mmap' + mmap_command = ['--mmap_file', '1'] + model_prefix = os.path.join( options.output_dir, options.output_model + ".model.nplm") train_args = [ @@ -124,9 +133,9 @@ def main(options): "--input_embedding_dimension", str(options.input_embedding), "--output_embedding_dimension", str(options.output_embedding), "--num_threads", str(options.threads), - "--activation_function", - options.activation_fn, - ] + validations_command + vocab_command + "--activation_function", options.activation_fn, + "--ngram_size", str(options.ngram_size), + ] + validations_command + vocab_command + mmap_command print("Train model command: ") print(', '.join(train_args)) diff --git a/scripts/training/binarize-model.perl b/scripts/training/binarize-model.perl index 0239f5fc8..0131d2222 100755 --- a/scripts/training/binarize-model.perl +++ b/scripts/training/binarize-model.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # # Binarize a Moses model diff --git a/scripts/training/build-generation-table.perl b/scripts/training/build-generation-table.perl index 435f7f58e..14176908a 100755 --- a/scripts/training/build-generation-table.perl +++ b/scripts/training/build-generation-table.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ use warnings; diff --git a/scripts/training/build-mmsapt.perl b/scripts/training/build-mmsapt.perl index 00cbd09d6..d0c5b818e 100755 --- a/scripts/training/build-mmsapt.perl +++ b/scripts/training/build-mmsapt.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/clean-corpus-n.perl b/scripts/training/clean-corpus-n.perl index cee4c76a2..76a09e539 100755 --- a/scripts/training/clean-corpus-n.perl +++ b/scripts/training/clean-corpus-n.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id: clean-corpus-n.perl 3633 2010-10-21 09:49:27Z phkoehn $ use warnings; diff --git a/scripts/training/clone_moses_model.pl b/scripts/training/clone_moses_model.pl index bf6708fca..18dc4aa41 100755 --- a/scripts/training/clone_moses_model.pl +++ b/scripts/training/clone_moses_model.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ # given a moses.ini file, creates a fresh version of it diff --git a/scripts/training/combine_factors.pl b/scripts/training/combine_factors.pl index fa6f15db2..fcc9ab3f5 100755 --- a/scripts/training/combine_factors.pl +++ b/scripts/training/combine_factors.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ # given a list of files, combines them to a single corpus (sent to stdout) diff --git a/scripts/training/convert-moses-ini-to-v2.perl b/scripts/training/convert-moses-ini-to-v2.perl index e091a710d..3fdfa53a6 100755 --- a/scripts/training/convert-moses-ini-to-v2.perl +++ b/scripts/training/convert-moses-ini-to-v2.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/convert-moses-ini-v2-to-v1.py b/scripts/training/convert-moses-ini-v2-to-v1.py index 44f192efe..3ef7d7c0d 100755 --- a/scripts/training/convert-moses-ini-v2-to-v1.py +++ b/scripts/training/convert-moses-ini-v2-to-v1.py @@ -1,5 +1,8 @@ #! /usr/bin/env python # -*- coding: utf8 -*- +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 3 or, at your option, any later version. from __future__ import ( diff --git a/scripts/training/corpus-sizes.perl b/scripts/training/corpus-sizes.perl index 30ae67ebb..1a6db669b 100755 --- a/scripts/training/corpus-sizes.perl +++ b/scripts/training/corpus-sizes.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id: consolidate-training-data.perl 928 2009-09-02 02:58:01Z philipp $ diff --git a/scripts/training/create_count_tables.py b/scripts/training/create_count_tables.py index 2288c034a..12499b1d7 100755 --- a/scripts/training/create_count_tables.py +++ b/scripts/training/create_count_tables.py @@ -1,6 +1,9 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # Author: Rico Sennrich +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # This script creates tables that store phrase pair frequencies rather than # probabilities. diff --git a/scripts/training/exodus.perl b/scripts/training/exodus.perl index bb8616007..579056ff0 100755 --- a/scripts/training/exodus.perl +++ b/scripts/training/exodus.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ diff --git a/scripts/training/filter-model-given-input.pl b/scripts/training/filter-model-given-input.pl index e3a34c40b..a44d9c193 100755 --- a/scripts/training/filter-model-given-input.pl +++ b/scripts/training/filter-model-given-input.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ # Given a moses.ini file and an input text prepare minimized translation diff --git a/scripts/training/filter-rule-table.py b/scripts/training/filter-rule-table.py index 14736fe1f..d28fa0c89 100755 --- a/scripts/training/filter-rule-table.py +++ b/scripts/training/filter-rule-table.py @@ -1,25 +1,29 @@ #!/usr/bin/env python # Author: Phil Williams +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. -# Usage: filter-rule-table.py [--min-non-initial-rule-count=N] INPUT -# -# Given a rule table (on stdin) and an input text, filter out rules that -# couldn't be used in parsing the input and write the resulting rule table -# to stdout. The input text is assumed to contain the same factors as -# the rule table and is assumed to be small (not more than a few thousand -# sentences): the current algorithm won't scale well to large input sets. -# -# The filtering algorithm considers a source RHS to be a sequence of -# words and gaps, which must match a sequence of words in one of the -# input sentences, with at least one input word per gap. The NT labels -# are ignored, so for example a rule with the source RHS "the JJ dog" -# would be allowed if the sequence "the slobbering dog" occurs in one of -# the input sentences, even if there's no rule to derive a JJ from -# "slobbering." (If "slobbering" were an unknown word, the 'unknown-lhs' -# decoder option would allow it to take a number of NT labels, likely -# including JJ, with varying probabilities, so removing the rule would -# be a bad idea.) +"""Usage: filter-rule-table.py [--min-non-initial-rule-count=N] INPUT + +Given a rule table (on stdin) and an input text, filter out rules that +couldn't be used in parsing the input and write the resulting rule table +to stdout. The input text is assumed to contain the same factors as +the rule table and is assumed to be small (not more than a few thousand +sentences): the current algorithm won't scale well to large input sets. + +The filtering algorithm considers a source RHS to be a sequence of +words and gaps, which must match a sequence of words in one of the +input sentences, with at least one input word per gap. The NT labels +are ignored, so for example a rule with the source RHS "the JJ dog" +would be allowed if the sequence "the slobbering dog" occurs in one of +the input sentences, even if there's no rule to derive a JJ from +"slobbering." (If "slobbering" were an unknown word, the 'unknown-lhs' +decoder option would allow it to take a number of NT labels, likely +including JJ, with varying probabilities, so removing the rule would +be a bad idea.) +""" import optparse import sys diff --git a/scripts/training/flexibility_score.py b/scripts/training/flexibility_score.py index 496184616..56d4f9425 100755 --- a/scripts/training/flexibility_score.py +++ b/scripts/training/flexibility_score.py @@ -2,6 +2,9 @@ # -*- coding: utf-8 -*- # author: Rico Sennrich +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. """Add flexibility scores to a phrase table half. diff --git a/scripts/training/giza2bal.pl b/scripts/training/giza2bal.pl index 27ba9d659..ad9edb584 100755 --- a/scripts/training/giza2bal.pl +++ b/scripts/training/giza2bal.pl @@ -6,6 +6,9 @@ #produced by giza containing the frequency of each traning sentence. #Copyright Marcello Federico, November 2004 +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. #use warnings; diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl index 92e1a79ff..c73e75a87 100755 --- a/scripts/training/mert-moses.pl +++ b/scripts/training/mert-moses.pl @@ -1,4 +1,8 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + # $Id$ # Usage: # mert-moses.pl diff --git a/scripts/training/postprocess-lopar.perl b/scripts/training/postprocess-lopar.perl index 44be9c26c..05a56a3b5 100755 --- a/scripts/training/postprocess-lopar.perl +++ b/scripts/training/postprocess-lopar.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ diff --git a/scripts/training/rdlm/average_null_embedding.py b/scripts/training/rdlm/average_null_embedding.py index 28abc9508..899b402c1 100755 --- a/scripts/training/rdlm/average_null_embedding.py +++ b/scripts/training/rdlm/average_null_embedding.py @@ -1,6 +1,9 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # Author: Rico Sennrich +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. """Average embeddings of special null words for RDLM. diff --git a/scripts/training/rdlm/extract_syntactic_ngrams.py b/scripts/training/rdlm/extract_syntactic_ngrams.py index 1292e90f2..be4ed2335 100755 --- a/scripts/training/rdlm/extract_syntactic_ngrams.py +++ b/scripts/training/rdlm/extract_syntactic_ngrams.py @@ -1,6 +1,9 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # Author: Rico Sennrich +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. """ Extract syntactic n-grams from dependency treebank in Moses XML format for diff --git a/scripts/training/rdlm/extract_vocab.py b/scripts/training/rdlm/extract_vocab.py index ed9266fd9..48e5215c3 100755 --- a/scripts/training/rdlm/extract_vocab.py +++ b/scripts/training/rdlm/extract_vocab.py @@ -1,6 +1,9 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # Author: Rico Sennrich +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # extract 5 vocabulary files from parsed corpus in moses XML format diff --git a/scripts/training/rdlm/train_rdlm.py b/scripts/training/rdlm/train_rdlm.py index 639c1b32c..289ab405c 100755 --- a/scripts/training/rdlm/train_rdlm.py +++ b/scripts/training/rdlm/train_rdlm.py @@ -1,5 +1,8 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. from __future__ import print_function, unicode_literals @@ -91,11 +94,14 @@ parser.add_argument( "--output-words-file", dest="output_words_file", metavar="PATH", help="Output vocabulary (default: %(default)s).") parser.add_argument( - "--input_vocab_size", dest="input_vocab_size", type=int, metavar="INT", + "--input-vocab-size", dest="input_vocab_size", type=int, metavar="INT", help="Input vocabulary size (default: %(default)s).") parser.add_argument( "--output-vocab-size", dest="output_vocab_size", type=int, metavar="INT", help="Output vocabulary size (default: %(default)s).") +parser.add_argument( + "--mmap", dest="mmap", action="store_true", + help="Use memory-mapped file (for lower memory consumption).") parser.set_defaults( @@ -192,11 +198,14 @@ def main(options): "extracting vocabulary from training text.\n") prepare_vocabulary(options) + numberized_file = os.path.basename(options.corpus_stem) + '.numberized' + train_file = numberized_file + if options.mmap: + train_file += '.mmap' + extract_options = extract_syntactic_ngrams.create_parser().parse_args([ '--input', options.corpus_stem, - '--output', os.path.join( - options.working_dir, - os.path.basename(options.corpus_stem) + '.numberized'), + '--output', os.path.join(options.working_dir, numberized_file), '--vocab', options.input_words_file, '--output_vocab', options.output_words_file, '--right_context', str(options.right_context_size), @@ -219,6 +228,23 @@ def main(options): else: options.validation_file = None + if options.mmap: + try: + os.remove(os.path.join(options.working_dir, train_file)) + except OSError: + pass + mmap_cmd = [os.path.join(options.nplm_home, 'src', 'createMmap'), + '--input_file', + os.path.join(options.working_dir, numberized_file), + '--output_file', + os.path.join(options.working_dir, train_file) + ] + sys.stderr.write('creating memory-mapped file\n') + sys.stderr.write('executing: ' + ', '.join(mmap_cmd) + '\n') + ret = subprocess.call(mmap_cmd) + if ret: + raise Exception("creating memory-mapped file failed") + sys.stderr.write('training neural network\n') train_nplm.main(options) @@ -231,7 +257,7 @@ def main(options): options.output_model + '.model.nplm.' + str(options.epochs)), os.path.join( options.working_dir, - os.path.basename(options.corpus_stem) + '.numberized'), + numberized_file), os.path.join(options.output_dir, options.output_model + '.model.nplm') ]) if ret: diff --git a/scripts/training/reduce-factors.perl b/scripts/training/reduce-factors.perl index 09f9c7f2b..82aed4355 100755 --- a/scripts/training/reduce-factors.perl +++ b/scripts/training/reduce-factors.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/reduce-topt-count.pl b/scripts/training/reduce-topt-count.pl index f760051c4..85ce0d6d9 100755 --- a/scripts/training/reduce-topt-count.pl +++ b/scripts/training/reduce-topt-count.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # given a moses.ini, filter the phrase tables to contain # only ttable-limit options per source phrase diff --git a/scripts/training/reduce_combine.pl b/scripts/training/reduce_combine.pl index a7614f73e..2055bed5b 100755 --- a/scripts/training/reduce_combine.pl +++ b/scripts/training/reduce_combine.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ # given a pathname to a factored corpus, a list of (numeric) factors to keep diff --git a/scripts/training/remove-orphan-phrase-pairs-from-reordering-table.perl b/scripts/training/remove-orphan-phrase-pairs-from-reordering-table.perl index eda529393..25c5cc028 100755 --- a/scripts/training/remove-orphan-phrase-pairs-from-reordering-table.perl +++ b/scripts/training/remove-orphan-phrase-pairs-from-reordering-table.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/threshold-filter.perl b/scripts/training/threshold-filter.perl index 3e42ca795..0aed67d25 100755 --- a/scripts/training/threshold-filter.perl +++ b/scripts/training/threshold-filter.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/train-global-lexicon-model.perl b/scripts/training/train-global-lexicon-model.perl index d3c55789d..528bfbd72 100755 --- a/scripts/training/train-global-lexicon-model.perl +++ b/scripts/training/train-global-lexicon-model.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl index 5a304c2f9..b693d774d 100755 --- a/scripts/training/train-model.perl +++ b/scripts/training/train-model.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/train-neurallm.py b/scripts/training/train-neurallm.py index 2d2f12015..ae77a42af 100755 --- a/scripts/training/train-neurallm.py +++ b/scripts/training/train-neurallm.py @@ -1,8 +1,12 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. -""" train feed-forward neural network LM with NPLM tool -resulting model can be used in Moses as feature function NeuralLM +"""Train feed-forward neural network LM with NPLM tool. + +The resulting model can be used in Moses as feature function NeuralLM. """ from __future__ import print_function, unicode_literals @@ -83,6 +87,9 @@ parser.add_argument( parser.add_argument( "--vocab-size", dest="vocab_size", type=int, metavar="INT", help="Vocabulary size (default: %(default)s).") +parser.add_argument( + "--mmap", dest="mmap", action="store_true", + help="Use memory-mapped file (for lower memory consumption).") parser.set_defaults( working_dir="working", @@ -117,20 +124,43 @@ def main(options): if not os.path.exists(options.output_dir): os.makedirs(options.output_dir) + numberized_file = os.path.basename(options.corpus_stem) + '.numberized' + train_file = numberized_file + if options.mmap: + train_file += '.mmap' + extraction_cmd = [os.path.join(options.nplm_home, 'src', 'prepareNeuralLM'), '--train_text', options.corpus_stem, '--ngramize', '1', '--ngram_size', str(options.ngram_size), '--vocab_size', str(options.vocab_size), '--write_words_file', os.path.join(options.working_dir, options.words_file), - '--train_file', os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + '.numberized') + '--train_file', os.path.join(options.working_dir, numberized_file) ] sys.stderr.write('extracting n-grams\n') + sys.stderr.write('executing: ' + ', '.join(extraction_cmd) + '\n') ret = subprocess.call(extraction_cmd) if ret: raise Exception("preparing neural LM failed") - + + if options.mmap: + try: + os.remove(os.path.join(options.working_dir, train_file)) + except OSError: + pass + mmap_cmd = [os.path.join(options.nplm_home, 'src', 'createMmap'), + '--input_file', + os.path.join(options.working_dir, numberized_file), + '--output_file', + os.path.join(options.working_dir, train_file) + ] + sys.stderr.write('creating memory-mapped file\n') + sys.stderr.write('executing: ' + ', '.join(mmap_cmd) + '\n') + ret = subprocess.call(mmap_cmd) + if ret: + raise Exception("creating memory-mapped file failed") + if options.validation_corpus: extraction_cmd = [os.path.join(options.nplm_home, 'src', 'prepareNeuralLM'), @@ -143,6 +173,7 @@ def main(options): ] sys.stderr.write('extracting n-grams (validation file)\n') + sys.stderr.write('executing: ' + ', '.join(extraction_cmd) + '\n') ret = subprocess.call(extraction_cmd) if ret: raise Exception("preparing neural LM failed") @@ -162,7 +193,7 @@ def main(options): average_options = averageNullEmbedding.parser.parse_args( ['-i', os.path.join(options.output_dir, options.output_model + '.model.nplm.' + str(options.epochs)), '-o', os.path.join(options.output_dir, options.output_model + '.model.nplm'), - '-t', os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + '.numberized'), + '-t', os.path.join(options.working_dir, numberized_file), '-p', os.path.join(options.nplm_home, 'python')]) averageNullEmbedding.main(average_options) diff --git a/scripts/training/wrappers/adam-suffix-array/suffix-array-create.sh b/scripts/training/wrappers/adam-suffix-array/suffix-array-create.sh index 238a53349..5db5e9aa9 100755 --- a/scripts/training/wrappers/adam-suffix-array/suffix-array-create.sh +++ b/scripts/training/wrappers/adam-suffix-array/suffix-array-create.sh @@ -1,4 +1,7 @@ #!/bin/bash +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # execute: ~/workspace/bin/moses-smt/scripts/training/wrappers/suffix-array-create.sh $SA_EXEC_DIR $SOURCE_CORPUS $TARGET_CORPUS $ALIGNMENT $SA_OUTPUT diff --git a/scripts/training/wrappers/adam-suffix-array/suffix-array-extract.sh b/scripts/training/wrappers/adam-suffix-array/suffix-array-extract.sh index 8c255b1b6..128ccaa9e 100755 --- a/scripts/training/wrappers/adam-suffix-array/suffix-array-extract.sh +++ b/scripts/training/wrappers/adam-suffix-array/suffix-array-extract.sh @@ -1,4 +1,7 @@ #!/bin/bash +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # execute: ~/workspace/bin/moses-smt/scripts/training/wrappers/adam-suffix-array/suffix-array-extract.sh $SA_EXEC_DIR $MODEL_DIR $INPUT_FILE $OUTPUT_DIR diff --git a/scripts/training/wrappers/berkeleyparsed2mosesxml.perl b/scripts/training/wrappers/berkeleyparsed2mosesxml.perl index 232cfefab..9c376200c 100755 --- a/scripts/training/wrappers/berkeleyparsed2mosesxml.perl +++ b/scripts/training/wrappers/berkeleyparsed2mosesxml.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl b/scripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl index 9e8c30d42..b8ba146c9 100755 --- a/scripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl +++ b/scripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/conll2mosesxml.py b/scripts/training/wrappers/conll2mosesxml.py index 761037488..6473166d9 100755 --- a/scripts/training/wrappers/conll2mosesxml.py +++ b/scripts/training/wrappers/conll2mosesxml.py @@ -1,6 +1,9 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # Author: Rico Sennrich +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. """ Takes a file in the CoNLL dependency format (from the CoNLL-X shared task on diff --git a/scripts/training/wrappers/filter-excluded-lines.perl b/scripts/training/wrappers/filter-excluded-lines.perl index dff104dba..508ab8a06 100755 --- a/scripts/training/wrappers/filter-excluded-lines.perl +++ b/scripts/training/wrappers/filter-excluded-lines.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/find-unparseable.perl b/scripts/training/wrappers/find-unparseable.perl index 00009e2e9..fd0664f1d 100755 --- a/scripts/training/wrappers/find-unparseable.perl +++ b/scripts/training/wrappers/find-unparseable.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/mada-wrapper.perl b/scripts/training/wrappers/mada-wrapper.perl index f2cf14f40..d4124e34c 100755 --- a/scripts/training/wrappers/mada-wrapper.perl +++ b/scripts/training/wrappers/mada-wrapper.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/madamira-tok.perl b/scripts/training/wrappers/madamira-tok.perl index 37e70079e..e9f19d53a 100755 --- a/scripts/training/wrappers/madamira-tok.perl +++ b/scripts/training/wrappers/madamira-tok.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/madamira-wrapper.perl b/scripts/training/wrappers/madamira-wrapper.perl index 6535b6187..05ec44d7d 100755 --- a/scripts/training/wrappers/madamira-wrapper.perl +++ b/scripts/training/wrappers/madamira-wrapper.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl b/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl index 1e3a1ce3f..a8ce5f24e 100755 --- a/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl +++ b/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/make-factor-de-lemma.perl b/scripts/training/wrappers/make-factor-de-lemma.perl index db978317e..0b93002a9 100755 --- a/scripts/training/wrappers/make-factor-de-lemma.perl +++ b/scripts/training/wrappers/make-factor-de-lemma.perl @@ -1,4 +1,7 @@ #!/usr/bin/perl -w +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use strict; use Encode; diff --git a/scripts/training/wrappers/make-factor-de-morph.perl b/scripts/training/wrappers/make-factor-de-morph.perl index 366a5a76d..d09196745 100755 --- a/scripts/training/wrappers/make-factor-de-morph.perl +++ b/scripts/training/wrappers/make-factor-de-morph.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/make-factor-de-pos.perl b/scripts/training/wrappers/make-factor-de-pos.perl index 495517352..585323bd4 100755 --- a/scripts/training/wrappers/make-factor-de-pos.perl +++ b/scripts/training/wrappers/make-factor-de-pos.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/make-factor-en-porter.perl b/scripts/training/wrappers/make-factor-en-porter.perl index 749dc1318..7ae5fd0b3 100755 --- a/scripts/training/wrappers/make-factor-en-porter.perl +++ b/scripts/training/wrappers/make-factor-en-porter.perl @@ -1,4 +1,7 @@ #!/usr/bin/perl -w +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use strict; use FindBin qw($RealBin); diff --git a/scripts/training/wrappers/make-factor-en-pos.mxpost.perl b/scripts/training/wrappers/make-factor-en-pos.mxpost.perl index 4aa66bac6..2bff8e329 100755 --- a/scripts/training/wrappers/make-factor-en-pos.mxpost.perl +++ b/scripts/training/wrappers/make-factor-en-pos.mxpost.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/make-factor-pos.tree-tagger.perl b/scripts/training/wrappers/make-factor-pos.tree-tagger.perl index 0ad04d4de..1e8ccd0ee 100755 --- a/scripts/training/wrappers/make-factor-pos.tree-tagger.perl +++ b/scripts/training/wrappers/make-factor-pos.tree-tagger.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/make-factor-stem.perl b/scripts/training/wrappers/make-factor-stem.perl index 662f1d882..9bde7648f 100755 --- a/scripts/training/wrappers/make-factor-stem.perl +++ b/scripts/training/wrappers/make-factor-stem.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/make-factor-suffix.perl b/scripts/training/wrappers/make-factor-suffix.perl index 6a59254e4..015df3874 100755 --- a/scripts/training/wrappers/make-factor-suffix.perl +++ b/scripts/training/wrappers/make-factor-suffix.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/morfessor-wrapper.perl b/scripts/training/wrappers/morfessor-wrapper.perl index c65a2cebc..0269045a0 100755 --- a/scripts/training/wrappers/morfessor-wrapper.perl +++ b/scripts/training/wrappers/morfessor-wrapper.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/mosesxml2berkeleyparsed.perl b/scripts/training/wrappers/mosesxml2berkeleyparsed.perl index e929658ff..02bc7b88e 100755 --- a/scripts/training/wrappers/mosesxml2berkeleyparsed.perl +++ b/scripts/training/wrappers/mosesxml2berkeleyparsed.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/mosesxml2brackets.py b/scripts/training/wrappers/mosesxml2brackets.py index 6ff1d20c9..6b90aa256 100755 --- a/scripts/training/wrappers/mosesxml2brackets.py +++ b/scripts/training/wrappers/mosesxml2brackets.py @@ -1,8 +1,11 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # Author: Rico Sennrich +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. -# convert trees in moses XML format to PTB-style bracketed format +"""Convert trees in moses XML format to PTB-style bracketed format.""" from __future__ import print_function, unicode_literals import sys diff --git a/scripts/training/wrappers/parse-de-berkeley.perl b/scripts/training/wrappers/parse-de-berkeley.perl index 596fb3eff..f605a37ae 100755 --- a/scripts/training/wrappers/parse-de-berkeley.perl +++ b/scripts/training/wrappers/parse-de-berkeley.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/parse-de-bitpar.perl b/scripts/training/wrappers/parse-de-bitpar.perl index 1bbcf5329..0d5346058 100755 --- a/scripts/training/wrappers/parse-de-bitpar.perl +++ b/scripts/training/wrappers/parse-de-bitpar.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/parse-en-collins.perl b/scripts/training/wrappers/parse-en-collins.perl index 252d3d2b7..c9a960912 100755 --- a/scripts/training/wrappers/parse-en-collins.perl +++ b/scripts/training/wrappers/parse-en-collins.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/parse-en-egret.perl b/scripts/training/wrappers/parse-en-egret.perl index 9f434063b..e97bc1ae0 100755 --- a/scripts/training/wrappers/parse-en-egret.perl +++ b/scripts/training/wrappers/parse-en-egret.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/parse-en-senna.perl b/scripts/training/wrappers/parse-en-senna.perl index f271633ea..2df46284b 100755 --- a/scripts/training/wrappers/parse-en-senna.perl +++ b/scripts/training/wrappers/parse-en-senna.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use strict; use warnings; diff --git a/scripts/training/wrappers/parse-en-stanford.py b/scripts/training/wrappers/parse-en-stanford.py index 7d8be4bcf..06b027e55 100755 --- a/scripts/training/wrappers/parse-en-stanford.py +++ b/scripts/training/wrappers/parse-en-stanford.py @@ -1,11 +1,17 @@ #!/usr/bin/python # -*- coding: utf-8 -*- # Author: Rico Sennrich +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. -# (hacky) wrapper around Stanford CoreNLP to produce CoNLL dependency format. -# assumes tokenized and sentence-split text. +""" +(Hacky) wrapper around Stanford CoreNLP to produce CoNLL dependency format. +Assumes tokenized and sentence-split text. -# to get Moses XML format, first projectivize the trees, then use conll2mosesxml.py. +To get Moses XML format, first projectivize the trees, then use +conll2mosesxml.py. +""" from __future__ import print_function, unicode_literals import os diff --git a/scripts/training/wrappers/senna2brackets.py b/scripts/training/wrappers/senna2brackets.py index 4fc71ed44..a81100277 100755 --- a/scripts/training/wrappers/senna2brackets.py +++ b/scripts/training/wrappers/senna2brackets.py @@ -1,19 +1,24 @@ #!/usr/bin/env python +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. -# Read SENNA output (from stdin), extract the parse trees, and write them in -# PTB-style bracketed format (to stdout). -# -# The SENNA output is assumed to contain tokens in the first column, POS tags -# in the second column, and PSG fragments in the final column. -# -# It is also assumed that SENNA was run through the parse-en-senna.perl wrapper, -# which: -# -# - Substitutes the special "SENTENCE_TOO_LONG" token for sentences that -# exceed SENNA's hardcoded limit. -# -# - Replaces the bracket-like tokens "-LRB-", "-RRB-", etc. with "(", ")", -# etc. +""" +Read SENNA output (from stdin), extract the parse trees, and write them in +PTB-style bracketed format (to stdout). + +The SENNA output is assumed to contain tokens in the first column, POS tags +in the second column, and PSG fragments in the final column. + +It is also assumed that SENNA was run through the parse-en-senna.perl wrapper, +which: + + - Substitutes the special "SENTENCE_TOO_LONG" token for sentences that + exceed SENNA's hardcoded limit. + + - Replaces the bracket-like tokens "-LRB-", "-RRB-", etc. with "(", ")", + etc. +""" import optparse import os diff --git a/scripts/training/wrappers/syntax-hyphen-splitting.perl b/scripts/training/wrappers/syntax-hyphen-splitting.perl index 653b410d0..1a260df10 100755 --- a/scripts/training/wrappers/syntax-hyphen-splitting.perl +++ b/scripts/training/wrappers/syntax-hyphen-splitting.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/tagger-german-chunk.perl b/scripts/training/wrappers/tagger-german-chunk.perl index c57031889..0b707a579 100755 --- a/scripts/training/wrappers/tagger-german-chunk.perl +++ b/scripts/training/wrappers/tagger-german-chunk.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict;