From 33eee96f35b07168d5912b35baed7df74d4cea23 Mon Sep 17 00:00:00 2001
From: Ulrich Germann
Date: Thu, 21 May 2015 16:36:51 +0000
Subject: [PATCH 001/108] Bug fix related to context-sensitive decoding:
 --context-string had no effect, even when --context-window was not specified.

---
 moses/IOWrapper.cpp | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/moses/IOWrapper.cpp b/moses/IOWrapper.cpp
index 29769f180..8cbf4f091 100644
--- a/moses/IOWrapper.cpp
+++ b/moses/IOWrapper.cpp
@@ -296,16 +296,19 @@ GetBufferedInput()
 }
 
 boost::shared_ptr
-IOWrapper::ReadInput()
+IOWrapper::
+ReadInput()
 {
 #ifdef WITH_THREADS
   boost::lock_guard lock(m_lock);
 #endif
   boost::shared_ptr source = GetBufferedInput();
-  if (source) {
-    source->SetTranslationId(m_currentLine++);
-    this->set_context_for(*source);
-  }
+  if (source)
+    {
+      source->SetTranslationId(m_currentLine++);
+      if (m_look_ahead || m_look_back)
+        this->set_context_for(*source);
+    }
   m_past_input.push_back(source);
   return source;
 }
@@ -344,7 +347,7 @@ set_context_for(InputType& source)
     }
   }
   // cerr << string(80,'=') << endl;
-  source.SetContext(context);
+  if (context->size()) source.SetContext(context);
 }
 

From a1678187fead90da0e19da5a71a82e421b57ff06 Mon Sep 17 00:00:00 2001
From: Rico Sennrich
Date: Fri, 22 May 2015 15:28:42 +0100
Subject: [PATCH 002/108] wrapper for stanford dependency parser

---
 .../training/wrappers/parse-en-stanford.py | 129 ++++++++++++
 1 file changed, 129 insertions(+)
 create mode 100755 scripts/training/wrappers/parse-en-stanford.py

diff --git a/scripts/training/wrappers/parse-en-stanford.py b/scripts/training/wrappers/parse-en-stanford.py
new file mode 100755
index 000000000..7d8be4bcf
--- /dev/null
+++ b/scripts/training/wrappers/parse-en-stanford.py
@@ -0,0 +1,129 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Author: Rico Sennrich
+
+# (hacky) wrapper around Stanford CoreNLP to produce CoNLL dependency format.
+# assumes tokenized and sentence-split text.
+
+# to get Moses XML format, first projectivize the trees, then use conll2mosesxml.py.
+
+from __future__ import print_function, unicode_literals
+import os
+import sys
+import codecs
+import argparse
+
+from collections import defaultdict
+from subprocess import Popen, PIPE
+
+# hack for python2/3 compatibility
+from io import open
+argparse.open = open
+
+
+def create_parser():
+    parser = argparse.ArgumentParser(
+        description=(
+            """Wrapper around Stanford CoreNLP to produce CoNLL dependency format.
+            Assumes that text is tokenized and has one sentence per line."""))
+
+    parser.add_argument(
+        '--stanford', type=str,
+        metavar='PATH', required=True,
+        help='path to Stanford CoreNLP')
+
+    parser.add_argument(
+        '--java', type=str, default='java',
+        metavar='PATH',
+        help='path to java executable')
+
+    parser.add_argument(
+        '--input', '-i', type=argparse.FileType('r'), default=sys.stdin,
+        metavar='PATH',
+        help="Input text (default: standard input).")
+    parser.add_argument(
+        '--output', '-o', type=argparse.FileType('w'), default=sys.stdout,
+        metavar='PATH',
+        help="Output text (default: standard output).")
+
+    return parser
+
+def process_stanford(infile, javacmd, stanfordpath):
+
+    stanford = Popen([javacmd,
+                      '-cp', os.path.join(stanfordpath, 'stanford-corenlp-3.5.0.jar') + ':' + os.path.join(stanfordpath, 'stanford-corenlp-3.5.0-models.jar'),
+                      'edu.stanford.nlp.pipeline.StanfordCoreNLP',
+                      '-annotators', 'tokenize, ssplit, pos, depparse, lemma',
+                      '-ssplit.eolonly', 'true',
+                      '-tokenize.whitespace', 'true',
+                      '-numThreads', '8',
+                      '-textFile', '-',
+                      'outFile', '-'], stdin=infile, stdout = PIPE, stderr = open('/dev/null', 'w'))
+    return stanford.stdout
+
+
+def get_sentences(instream):
+    sentence = []
+    expect = 0
+
+    for line in instream:
+        if expect == 0 and line.startswith('Sentence #'):
+            if sentence:
+                yield sentence
+            sentence = []
+            expect = 1
+
+        elif line == '\n':
+            expect = 0
+
+        elif expect == 3:
+            rel, remainder = line.split('(')
+            head, dep = remainder.split()
+            head_int = int(head.split('-')[-1][:-1])
+            dep_int = int(dep.split('-')[-1][:-1])
+            sentence[dep_int-1]['head'] = head_int
+            sentence[dep_int-1]['label'] = rel
+
+        elif expect == 2:
+            linesplit = line.split('[',1)[1].rsplit(']',1)[0].split('] [')
+            if len(linesplit) != len(sentence):
+                sys.stderr.write('Warning: mismatch in number of words in sentence\n')
+                sys.stderr.write(' '.join(w['word'] for w in sentence))
+                for i in range(len(sentence)):
+                    sentence[i]['pos'] = '-'
+                    sentence[i]['lemma'] = '-'
+                    sentence[i]['head'] = 0
+                    sentence[i]['label'] = '-'
+                expect = 0
+                continue
+            for i,w in enumerate(linesplit):
+                sentence[i]['pos'] = w.split(' PartOfSpeech=')[-1].split()[0]
+                sentence[i]['lemma'] = w.split(' Lemma=')[-1]
+            expect = 3
+
+        elif expect == 1:
+            for w in line.split():
+                sentence.append({'word':w})
+            expect = 2
+
+    if sentence:
+        yield sentence
+
+def write(sentence, outstream):
+    for i, w in enumerate(sentence):
+        outstream.write('{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\n'.format(i+1, w['word'], w['lemma'], w['pos'], w['pos'], '-', w['head'], w['label']))
+
+if __name__ == '__main__':
+    if sys.version_info < (3, 0):
+        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
+        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
+        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
+
+
+    parser = create_parser()
+    options = parser.parse_args()
+
+    stanford = process_stanford(options.input, options.java, options.stanford)
+    for sentence in get_sentences(codecs.getreader('UTF-8')(stanford)):
+        write(sentence, options.output)
+        options.output.write('\n')
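The wrapper added above is a plain stdin-to-stdout filter: it expects tokenized, sentence-split text on standard input and writes CoNLL-style dependencies to standard output. A minimal usage sketch, not part of the patch, follows; the corpus file names and the CoreNLP install path are placeholders, and only --stanford is required.

# Illustrative only -- file names and the CoreNLP path are assumptions.
import subprocess

with open('corpus.tok.en') as tokenized, open('corpus.conll', 'w') as conll:
    subprocess.check_call(
        ['scripts/training/wrappers/parse-en-stanford.py',
         '--stanford', '/path/to/stanford-corenlp',  # directory holding the 3.5.0 jars
         '--java', 'java'],                          # optional; defaults to 'java'
        stdin=tokenized, stdout=conll)
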

From 43527c82fc8f759b13807fb10b17a3d5fcd47561 Mon Sep 17 00:00:00 2001
From: Rico Sennrich
Date: Fri, 22 May 2015 15:31:08 +0100
Subject: [PATCH 003/108] training script for monolingual Neural LM (+bugfixes
 and usability improvements for RDLM training)

---
 .../bilingual-lm/averageNullEmbedding.py      |  16 +-
 .../training/rdlm/extract_syntactic_ngrams.py |   2 +-
 scripts/training/rdlm/train_rdlm.py           |  15 +-
 scripts/training/train-neurallm.py            | 179 ++++++++++++++
 4 files changed, 201 insertions(+), 11 deletions(-)
 create mode 100755 scripts/training/train-neurallm.py

diff --git a/scripts/training/bilingual-lm/averageNullEmbedding.py b/scripts/training/bilingual-lm/averageNullEmbedding.py
index aca03aaae..891595aff 100755
--- a/scripts/training/bilingual-lm/averageNullEmbedding.py
+++ b/scripts/training/bilingual-lm/averageNullEmbedding.py
@@ -21,16 +21,10 @@ parser.add_argument(
     required=True)
 
-options = parser.parse_args()
-
-sys.path.append(options.nplm_python_path)
-import nplm
-
-
 def load_model(model_file):
+    import nplm
     return nplm.NeuralLM.from_file(model_file)
 
-
 def get_weights(path, length):
     counter = [0] * length
     for line in open(path):
@@ -38,8 +32,9 @@ def get_weights(path, length):
         counter[last_context] += 1
     return counter
 
+def main(options):
+
-if __name__ == "__main__":
+    sys.path.append(options.nplm_python_path)
 
     model = load_model(options.input_model)
     if options.null_idx == -1:
@@ -50,3 +45,8 @@ if __name__ == "__main__":
     model.input_embeddings[options.null_idx] = numpy.average(
         numpy.array(model.input_embeddings), weights=weights, axis=0)
     model.to_file(open(options.output_model, 'w'))
+
+if __name__ == "__main__":
+
+    options = parser.parse_args()
+    main(options)
diff --git a/scripts/training/rdlm/extract_syntactic_ngrams.py b/scripts/training/rdlm/extract_syntactic_ngrams.py
index c6d4b7968..1292e90f2 100755
--- a/scripts/training/rdlm/extract_syntactic_ngrams.py
+++ b/scripts/training/rdlm/extract_syntactic_ngrams.py
@@ -133,7 +133,7 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab,
         skip_glue_labels = [
             options.glue_symbol,
             options.start_symbol,
-            options.end_symbo,
+            options.end_symbol,
         ]
         if xml.get('label') in skip_glue_labels:
             for child in xml:
diff --git a/scripts/training/rdlm/train_rdlm.py b/scripts/training/rdlm/train_rdlm.py
index ae57e8dfc..639c1b32c 100755
--- a/scripts/training/rdlm/train_rdlm.py
+++ b/scripts/training/rdlm/train_rdlm.py
@@ -23,7 +23,7 @@ parser = argparse.ArgumentParser()
 parser.add_argument(
     "--working-dir", dest="working_dir", metavar="PATH")
 parser.add_argument(
-    "--corpus", dest="corpus_stem", metavar="PATH", help="Input file.")
+    "--corpus", '-text', dest="corpus_stem", metavar="PATH", help="Input file.")
 parser.add_argument(
     "--nplm-home", dest="nplm_home", metavar="PATH", required=True,
     help="Location of NPLM.")
@@ -169,6 +169,13 @@ def prepare_vocabulary(options):
 
 def main(options):
 
+    if options.output_dir is None:
+        options.output_dir = options.working_dir
+    else:
+        # Create output dir if necessary
+        if not os.path.exists(options.output_dir):
+            os.makedirs(options.output_dir)
+
     options.ngram_size = (
         2 * options.up_context_size +
         2 * options.left_context_size +
@@ -209,6 +216,8 @@ def main(options):
         sys.stderr.write('extracting syntactic n-grams (validation file)\n')
         extract_syntactic_ngrams.main(extract_options)
         extract_options.output.close()
+    else:
+        options.validation_file = None
 
     sys.stderr.write('training neural network\n')
     train_nplm.main(options)
@@ -235,5 +244,7 @@ if __name__ == "__main__":
         sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
         sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
 
-    options = parser.parse_args()
+    options = parser.parse_known_args()[0]
+    if parser.parse_known_args()[1]:
+        sys.stderr.write('Warning: unknown arguments: {0}\n'.format(parser.parse_known_args()[1]))
     main(options)
diff --git a/scripts/training/train-neurallm.py b/scripts/training/train-neurallm.py
new file mode 100755
index 000000000..2d2f12015
--- /dev/null
+++ b/scripts/training/train-neurallm.py
@@ -0,0 +1,179 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+""" train feed-forward neural network LM with NPLM tool
+resulting model can be used in Moses as feature function NeuralLM
+"""
+
+from __future__ import print_function, unicode_literals
+
+import logging
+import argparse
+import subprocess
+import sys
+import os
+import codecs
+
+# ./bilingual-lm
+sys.path.append(os.path.join(sys.path[0], 'bilingual-lm'))
+import train_nplm
+import averageNullEmbedding
+
+logging.basicConfig(
+    format='%(asctime)s %(levelname)s: %(message)s',
+    datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--working-dir", dest="working_dir", metavar="PATH")
+parser.add_argument(
+    "--corpus", '-text', dest="corpus_stem", metavar="PATH", help="Input file.")
+parser.add_argument(
+    "--nplm-home", dest="nplm_home", metavar="PATH", required=True,
+    help="Location of NPLM.")
+parser.add_argument(
+    "--epochs", dest="epochs", type=int, metavar="INT",
+    help="Number of training epochs (default: %(default)s).")
+parser.add_argument(
+    "--order", dest="order", type=int, metavar="INT",
+    help="N-gram order of language model (default: %(default)s).")
+parser.add_argument(
+    "--minibatch-size", dest="minibatch_size", type=int, metavar="INT",
+    help="Minibatch size (default: %(default)s).")
+parser.add_argument(
+    "--noise", dest="noise", type=int, metavar="INT",
+    help="Number of noise samples for NCE (default: %(default)s).")
+parser.add_argument(
+    "--hidden", dest="hidden", type=int, metavar="INT",
+    help=(
+        "Size of hidden layer (0 for single hidden layer) "
+        "(default: %(default)s)"))
+parser.add_argument(
+    "--input-embedding", dest="input_embedding", type=int, metavar="INT",
+    help="Size of input embedding layer (default: %(default)s).")
+parser.add_argument(
+    "--output-embedding", dest="output_embedding", type=int, metavar="INT",
+    help="Size of output embedding layer (default: %(default)s).")
+parser.add_argument(
+    "--threads", "-t", dest="threads", type=int, metavar="INT",
+    help="Number of threads (default: %(default)s).")
+parser.add_argument(
+    "--output-model", dest="output_model", metavar="PATH",
+    help="Name of output model (default: %(default)s).")
+parser.add_argument(
+    "--output-dir", dest="output_dir", metavar="PATH",
+    help="Output directory (default: same as working-dir).")
+parser.add_argument(
+    "--config-options-file", dest="config_options_file", metavar="PATH")
+parser.add_argument(
+    "--log-file", dest="log_file", metavar="PATH",
+    help="Log file to write to (default: %(default)s).")
+parser.add_argument(
+    "--validation-corpus", dest="validation_corpus", metavar="PATH",
+    help="Validation file (default: %(default)s).")
+parser.add_argument(
+    "--activation-function", dest="activation_fn",
+    choices=['identity', 'rectifier', 'tanh', 'hardtanh'],
+    help="Activation function (default: %(default)s).")
+parser.add_argument(
+    "--learning-rate", dest="learning_rate", type=float, metavar="FLOAT",
+    help="Learning rate (default: %(default)s).")
+parser.add_argument(
+    "--words-file", dest="words_file", metavar="PATH",
+    help="Output vocabulary file (default: %(default)s).")
+parser.add_argument(
+    "--vocab-size", dest="vocab_size", type=int, metavar="INT",
+    help="Vocabulary size (default: %(default)s).")
+
+parser.set_defaults(
+    working_dir="working",
+    corpus_stem="train",
+    nplm_home="/home/bhaddow/tools/nplm",
+    epochs=2,
+    order=5,
+    minibatch_size=1000,
+    noise=100,
+    hidden=0,
+    input_embedding=150,
+    output_embedding=750,
+    threads=4,
+    output_model="train",
+    output_dir=None,
+    config_options_file="config",
+    log_file="log",
+    validation_corpus=None,
+    activation_fn="rectifier",
+    learning_rate=1,
+    words_file='vocab',
+    vocab_size=500000)
+
+def main(options):
+
+    options.ngram_size = options.order
+
+    if options.output_dir is None:
+        options.output_dir = options.working_dir
+    else:
+        # Create output dir if necessary
+        if not os.path.exists(options.output_dir):
+            os.makedirs(options.output_dir)
+
+    extraction_cmd = [os.path.join(options.nplm_home, 'src', 'prepareNeuralLM'),
+                      '--train_text', options.corpus_stem,
+                      '--ngramize', '1',
+                      '--ngram_size', str(options.ngram_size),
+                      '--vocab_size', str(options.vocab_size),
+                      '--write_words_file', os.path.join(options.working_dir, options.words_file),
+                      '--train_file', os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + '.numberized')
+                      ]
+
+    sys.stderr.write('extracting n-grams\n')
+    ret = subprocess.call(extraction_cmd)
+    if ret:
+        raise Exception("preparing neural LM failed")
+
+    if options.validation_corpus:
+
+        extraction_cmd = [os.path.join(options.nplm_home, 'src', 'prepareNeuralLM'),
+                          '--train_text', options.validation_corpus,
+                          '--ngramize', '1',
+                          '--ngram_size', str(options.ngram_size),
+                          '--vocab_size', str(options.vocab_size),
+                          '--words_file', os.path.join(options.working_dir, options.words_file),
+                          '--train_file', os.path.join(options.working_dir, os.path.basename(options.validation_corpus) + '.numberized')
+                          ]
+
+        sys.stderr.write('extracting n-grams (validation file)\n')
+        ret = subprocess.call(extraction_cmd)
+        if ret:
+            raise Exception("preparing neural LM failed")
+
+    else:
+        options.validation_file = None
+
+    options.input_words_file = options.words_file
+    options.output_words_file = options.words_file
+    options.input_vocab_size = options.vocab_size
+    options.output_vocab_size = options.vocab_size
+
+    sys.stderr.write('training neural network\n')
+    train_nplm.main(options)
+
+    sys.stderr.write('averaging null words\n')
+    average_options = averageNullEmbedding.parser.parse_args(
+        ['-i', os.path.join(options.output_dir, options.output_model + '.model.nplm.' + str(options.epochs)),
+         '-o', os.path.join(options.output_dir, options.output_model + '.model.nplm'),
+         '-t', os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + '.numberized'),
+         '-p', os.path.join(options.nplm_home, 'python')])
+    averageNullEmbedding.main(average_options)
+
+
+if __name__ == "__main__":
+    if sys.version_info < (3, 0):
+        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
+        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
+        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
+
+    options = parser.parse_known_args()[0]
+    if parser.parse_known_args()[1]:
+        sys.stderr.write('Warning: unknown arguments: {0}\n'.format(parser.parse_known_args()[1]))
+    main(options)
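For reference, a sketch of how the new training script might be invoked; it is not part of the patch, the corpus name, NPLM path, and output locations are placeholders, and every option shown mirrors an argparse definition above (anything omitted falls back to parser.set_defaults).

# Illustrative only -- paths below are assumptions; see parser.set_defaults above for defaults.
import subprocess

subprocess.check_call(
    ['scripts/training/train-neurallm.py',
     '--working-dir', 'lm.working',
     '--corpus', 'corpus.tok.en',        # tokenized monolingual training text
     '--nplm-home', '/path/to/nplm',
     '--order', '5',
     '--epochs', '2',
     '--output-dir', 'lm.out',
     '--output-model', 'neurallm'])
# Per the script above, the averaged model is written to lm.out/neurallm.model.nplm,
# which the docstring says can be used in Moses as the NeuralLM feature function.
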

From 502e72ce91e749e3e24480bb0d2692c4bf6b0b83 Mon Sep 17 00:00:00 2001
From: Hieu Hoang
Date: Sun, 24 May 2015 17:08:57 +0400
Subject: [PATCH 004/108] eclipse

---
 contrib/other-builds/extract-rules/.cproject  |   9 +-
 contrib/other-builds/extract/.cproject        |   9 +-
 contrib/other-builds/manual-label/.cproject   | 132 ----------
 contrib/other-builds/manual-label/.project    |  27 ---
 contrib/other-builds/manual-label/DeEn.cpp    |  46 ----
 contrib/other-builds/manual-label/DeEn.h      |   5 -
 .../manual-label/EnOpenNLPChunker.cpp         | 202 ----------------
 .../manual-label/EnOpenNLPChunker.h           |  29 ---
 .../manual-label/EnPhrasalVerb.cpp            | 226 ------------------
 .../other-builds/manual-label/EnPhrasalVerb.h |  11 -
 .../manual-label/LabelByInitialLetter.cpp     |  29 ---
 .../manual-label/LabelByInitialLetter.h       |   6 -
 contrib/other-builds/manual-label/Main.cpp    | 195 ---------------
 contrib/other-builds/manual-label/Main.h      |  27 ---
 contrib/other-builds/manual-label/Makefile    |  14 --
 .../manual-label/manual-label.project         | 131 ----------
 contrib/other-builds/moses/.project           |  10 -
 contrib/other-builds/score/.cproject          |   1 -
 contrib/other-builds/server/.cproject         |   5 +-
 19 files changed, 12 insertions(+), 1102 deletions(-)
 delete mode 100644 contrib/other-builds/manual-label/.cproject
 delete mode 100644 contrib/other-builds/manual-label/.project
 delete mode 100644 contrib/other-builds/manual-label/DeEn.cpp
 delete mode 100644 contrib/other-builds/manual-label/DeEn.h
 delete mode 100644 contrib/other-builds/manual-label/EnOpenNLPChunker.cpp
 delete mode 100644 contrib/other-builds/manual-label/EnOpenNLPChunker.h
 delete mode 100644 contrib/other-builds/manual-label/EnPhrasalVerb.cpp
 delete mode 100644 contrib/other-builds/manual-label/EnPhrasalVerb.h
 delete mode 100644 contrib/other-builds/manual-label/LabelByInitialLetter.cpp
 delete mode 100644 contrib/other-builds/manual-label/LabelByInitialLetter.h
 delete mode 100644 contrib/other-builds/manual-label/Main.cpp
 delete mode 100644 contrib/other-builds/manual-label/Main.h
 delete mode 100644 contrib/other-builds/manual-label/Makefile
 delete mode 100644 contrib/other-builds/manual-label/manual-label.project

diff --git a/contrib/other-builds/extract-rules/.cproject b/contrib/other-builds/extract-rules/.cproject
index e79f0f526..86e38979e 100644
--- a/contrib/other-builds/extract-rules/.cproject
+++ b/contrib/other-builds/extract-rules/.cproject
@@ -5,16 +5,16 @@
+
-
-
+
@@ -25,6 +25,7 @@
@@ -60,16 +61,16 @@
+
-
-
+
diff --git a/contrib/other-builds/extract/.cproject b/contrib/other-builds/extract/.cproject
index 10701cb6e..4c80306be 100644
--- a/contrib/other-builds/extract/.cproject
+++ b/contrib/other-builds/extract/.cproject
@@ -5,16 +5,16 @@
+
-
-
+
@@ -25,6 +25,7 @@
@@ -61,16 +62,16 @@
+
-
-
+
diff --git a/contrib/other-builds/manual-label/.cproject b/contrib/other-builds/manual-label/.cproject
deleted file mode 100644 index d9297a9fc..000000000 --- a/contrib/other-builds/manual-label/.cproject +++ /dev/null @@ -1,132 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/contrib/other-builds/manual-label/.project b/contrib/other-builds/manual-label/.project deleted file mode 100644 index d0c1dba19..000000000 --- a/contrib/other-builds/manual-label/.project +++ /dev/null @@ -1,27 +0,0 @@ - - - manual-label - - - - - - org.eclipse.cdt.managedbuilder.core.genmakebuilder - clean,full,incremental, - - - - - org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder - full,incremental, - - - - - - org.eclipse.cdt.core.cnature - org.eclipse.cdt.core.ccnature - org.eclipse.cdt.managedbuilder.core.managedBuildNature - org.eclipse.cdt.managedbuilder.core.ScannerConfigNature - - diff --git a/contrib/other-builds/manual-label/DeEn.cpp b/contrib/other-builds/manual-label/DeEn.cpp deleted file mode 100644 index ea2934c5a..000000000 --- a/contrib/other-builds/manual-label/DeEn.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#include -#include "DeEn.h" -#include "Main.h" -#include "moses/Util.h" - -using namespace std; - -extern bool g_debug; - -bool Contains(const Phrase &source, int start, int end, int factor, const string &str) -{ - for (int pos = start; pos <= end; ++pos) { - bool found = IsA(source, pos, 0, factor, str); - if (found) { - return true; - } - } - return false; -} - -void LabelDeEn(const Phrase &source, ostream &out) -{ - Ranges ranges; - - // find ranges to label - for (int start = 0; start < source.size(); ++start) { - for (int end = start; end < source.size(); ++end) { - if (IsA(source, start, -1, 1, "VAFIN") - && IsA(source, end, +1, 1, "VVINF VVPP") - && !Contains(source, start, end, 1, "VAFIN VVINF VVPP VVFIN")) { - Range range(start, end, "reorder-label"); - ranges.push_back(range); - } - else if ((start == 0 || IsA(source, start, -1, 1, "$,")) - && IsA(source, end, +1, 0, "zu") - && IsA(source, end, +2, 1, "VVINF") - && !Contains(source, start, end, 1, "$,")) { - Range range(start, end, "reorder-label"); - ranges.push_back(range); - } - } - } - - OutputWithLabels(source, ranges, out); -} - diff --git a/contrib/other-builds/manual-label/DeEn.h b/contrib/other-builds/manual-label/DeEn.h deleted file mode 100644 index c24ce0079..000000000 --- a/contrib/other-builds/manual-label/DeEn.h +++ /dev/null @@ -1,5 +0,0 @@ -#pragma once - -#include "Main.h" - -void LabelDeEn(const Phrase &source, std::ostream &out); diff --git a/contrib/other-builds/manual-label/EnOpenNLPChunker.cpp b/contrib/other-builds/manual-label/EnOpenNLPChunker.cpp deleted file mode 100644 index 538aa9746..000000000 --- a/contrib/other-builds/manual-label/EnOpenNLPChunker.cpp +++ /dev/null @@ -1,202 +0,0 @@ -/* - * EnApacheChunker.cpp - * - * Created on: 28 Feb 2014 - * Author: hieu - */ -#include -#include -#include -#include -#include -#include -#include "EnOpenNLPChunker.h" -#include "moses/Util.h" - -using namespace std; -using namespace boost::algorithm; - -EnOpenNLPChunker::EnOpenNLPChunker(const std::string &openNLPPath) -:m_openNLPPath(openNLPPath) -{ - // TODO Auto-generated constructor stub - -} - -EnOpenNLPChunker::~EnOpenNLPChunker() { - // TODO Auto-generated destructor stub -} - -void EnOpenNLPChunker::Process(std::istream &in, std::ostream &out, const vector &filterList) -{ - const boost::filesystem::path - inPath = 
boost::filesystem::unique_path(), - outPath = boost::filesystem::unique_path(); - // read all input to a temp file - ofstream inFile(inPath.c_str()); - - string line; - while (getline(in, line)) { - Unescape(line); - inFile << line << endl; - } - inFile.close(); - - // execute chunker - string cmd = "cat " + inPath.native() + " | " - + m_openNLPPath + "/bin/opennlp POSTagger " - + m_openNLPPath + "/models/en-pos-maxent.bin | " - + m_openNLPPath + "/bin/opennlp ChunkerME " - + m_openNLPPath + "/models/en-chunker.bin > " - + outPath.native(); - //g << "Executing:" << cmd << endl; - int ret = system(cmd.c_str()); - - // read result of chunker and output as Moses xml trees - ifstream outFile(outPath.c_str()); - - size_t lineNum = 0; - while (getline(outFile, line)) { - //cerr << line << endl; - MosesReformat(line, out, filterList); - out << endl; - ++lineNum; - } - outFile.close(); - - // clean up temporary files - remove(inPath.c_str()); - remove(outPath.c_str()); -} - -void EnOpenNLPChunker::MosesReformat(const string &line, std::ostream &out, const vector &filterList) -{ - //cerr << "REFORMATING:" << line << endl; - bool inLabel = false; - vector toks; - Moses::Tokenize(toks, line); - for (size_t i = 0; i < toks.size(); ++i) { - const string &tok = toks[i]; - - if (tok.substr(0, 1) == "[" && tok.substr(1,1) != "_") { - // start of chunk - string label = tok.substr(1); - if (UseLabel(label, filterList)) { - out << ""; - inLabel = true; - } - } - else if (ends_with(tok, "]")) { - // end of chunk - if (tok.size() > 1) { - if (tok.substr(1,1) == "_") { - // just a word that happens to be ] - vector factors; - Moses::Tokenize(factors, tok, "_"); - assert(factors.size() == 2); - - Escape(factors[0]); - out << factors[0] << " "; - } - else { - // a word and end of tree - string word = tok.substr(0, tok.size()-1); - - vector factors; - Moses::Tokenize(factors, word, "_"); - assert(factors.size() == 2); - - Escape(factors[0]); - out << factors[0] << " "; - } - - if (inLabel) { - out << " "; - inLabel = false; - } - } - else { - if (inLabel) { - out << " "; - inLabel = false; - } - } - - } - else { - // lexical item - vector factors; - Moses::Tokenize(factors, tok, "_"); - if (factors.size() == 2) { - Escape(factors[0]); - out << factors[0] << " "; - } - else if (factors.size() == 1) { - // word is _ - assert(tok.substr(0, 2) == "__"); - out << "_ "; - } - else { - throw "Unknown format:" + tok; - } - } - } -} - -std::string -replaceAll( std::string const& original, - std::string const& before, - std::string const& after ) -{ - std::string retval; - std::string::const_iterator end = original.end(); - std::string::const_iterator current = original.begin(); - std::string::const_iterator next = - std::search( current, end, before.begin(), before.end() ); - while ( next != end ) { - retval.append( current, next ); - retval.append( after ); - current = next + before.size(); - next = std::search( current, end, before.begin(), before.end() ); - } - retval.append( current, next ); - return retval; -} - -void EnOpenNLPChunker::Escape(string &line) -{ - line = replaceAll(line, "&", "&"); - line = replaceAll(line, "|", "|"); - line = replaceAll(line, "<", "<"); - line = replaceAll(line, ">", ">"); - line = replaceAll(line, "'", "'"); - line = replaceAll(line, "\"", """); - line = replaceAll(line, "[", "["); - line = replaceAll(line, "]", "]"); -} - -void EnOpenNLPChunker::Unescape(string &line) -{ - line = replaceAll(line, "|", "|"); - line = replaceAll(line, "<", "<"); - line = replaceAll(line, ">", ">"); - 
line = replaceAll(line, """, "\""); - line = replaceAll(line, "'", "'"); - line = replaceAll(line, "[", "["); - line = replaceAll(line, "]", "]"); - line = replaceAll(line, "&", "&"); -} - -bool EnOpenNLPChunker::UseLabel(const std::string &label, const std::vector &filterList) const -{ - if (filterList.size() == 0) { - return true; - } - - for (size_t i = 0; i < filterList.size(); ++i) { - if (label == filterList[i]) { - return true; - } - } - return false; -} diff --git a/contrib/other-builds/manual-label/EnOpenNLPChunker.h b/contrib/other-builds/manual-label/EnOpenNLPChunker.h deleted file mode 100644 index df9f90e42..000000000 --- a/contrib/other-builds/manual-label/EnOpenNLPChunker.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * EnApacheChunker.h - * - * Created on: 28 Feb 2014 - * Author: hieu - */ - -#pragma once - -#include -#include -#include - -class EnOpenNLPChunker { -public: - EnOpenNLPChunker(const std::string &openNLPPath); - virtual ~EnOpenNLPChunker(); - void Process(std::istream &in, std::ostream &out, const std::vector &filterList); -protected: - const std::string m_openNLPPath; - - void Escape(std::string &line); - void Unescape(std::string &line); - - void MosesReformat(const std::string &line, std::ostream &out, const std::vector &filterList); - - bool UseLabel(const std::string &label, const std::vector &filterList) const; -}; - diff --git a/contrib/other-builds/manual-label/EnPhrasalVerb.cpp b/contrib/other-builds/manual-label/EnPhrasalVerb.cpp deleted file mode 100644 index 4bee9b941..000000000 --- a/contrib/other-builds/manual-label/EnPhrasalVerb.cpp +++ /dev/null @@ -1,226 +0,0 @@ -#include -#include -#include -#include -#include "EnPhrasalVerb.h" -#include "moses/Util.h" - -using namespace std; - -void EnPhrasalVerb(const Phrase &source, int revision, ostream &out) -{ - Ranges ranges; - - // find ranges to label - for (int start = 0; start < source.size(); ++start) { - size_t end = std::numeric_limits::max(); - - if (IsA(source, start, 0, 0, "ask asked asking")) { - end = Found(source, start, 0, "out"); - } - else if (IsA(source, start, 0, 0, "back backed backing")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "blow blown blew")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "break broke broken")) { - end = Found(source, start, 0, "down up in"); - } - else if (IsA(source, start, 0, 0, "bring brought bringing")) { - end = Found(source, start, 0, "down up in"); - } - else if (IsA(source, start, 0, 0, "call called calling")) { - end = Found(source, start, 0, "back up off"); - } - else if (IsA(source, start, 0, 0, "check checked checking")) { - end = Found(source, start, 0, "out in"); - } - else if (IsA(source, start, 0, 0, "cheer cheered cheering")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "clean cleaned cleaning")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "cross crossed crossing")) { - end = Found(source, start, 0, "out"); - } - else if (IsA(source, start, 0, 0, "cut cutting")) { - end = Found(source, start, 0, "down off out"); - } - else if (IsA(source, start, 0, 0, "do did done")) { - end = Found(source, start, 0, "over up"); - } - else if (IsA(source, start, 0, 0, "drop dropped dropping")) { - end = Found(source, start, 0, "off"); - } - else if (IsA(source, start, 0, 0, "figure figured figuring")) { - end = Found(source, start, 0, "out"); - } - else if (IsA(source, start, 0, 0, "fill filled filling")) { - end = 
Found(source, start, 0, "in out up"); - } - else if (IsA(source, start, 0, 0, "find found finding")) { - end = Found(source, start, 0, "out"); - } - else if (IsA(source, start, 0, 0, "get got getting gotten")) { - end = Found(source, start, 0, "across over back"); - } - else if (IsA(source, start, 0, 0, "give given gave giving")) { - end = Found(source, start, 0, "away back out up"); - } - else if (IsA(source, start, 0, 0, "hand handed handing")) { - end = Found(source, start, 0, "down in over"); - } - else if (IsA(source, start, 0, 0, "hold held holding")) { - end = Found(source, start, 0, "back up"); - } - else if (IsA(source, start, 0, 0, "keep kept keeping")) { - end = Found(source, start, 0, "from up"); - } - else if (IsA(source, start, 0, 0, "let letting")) { - end = Found(source, start, 0, "down in"); - } - else if (IsA(source, start, 0, 0, "look looked looking")) { - end = Found(source, start, 0, "over up"); - } - else if (IsA(source, start, 0, 0, "make made making")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "mix mixed mixing")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "pass passed passing")) { - end = Found(source, start, 0, "out up"); - } - else if (IsA(source, start, 0, 0, "pay payed paying")) { - end = Found(source, start, 0, "back"); - } - else if (IsA(source, start, 0, 0, "pick picked picking")) { - end = Found(source, start, 0, "out"); - } - else if (IsA(source, start, 0, 0, "point pointed pointing")) { - end = Found(source, start, 0, "out"); - } - else if (IsA(source, start, 0, 0, "put putting")) { - end = Found(source, start, 0, "down off out together on"); - } - else if (IsA(source, start, 0, 0, "send sending")) { - end = Found(source, start, 0, "back"); - } - else if (IsA(source, start, 0, 0, "set setting")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "sort sorted sorting")) { - end = Found(source, start, 0, "out"); - } - else if (IsA(source, start, 0, 0, "switch switched switching")) { - end = Found(source, start, 0, "off on"); - } - else if (IsA(source, start, 0, 0, "take took taking")) { - end = Found(source, start, 0, "apart back off out"); - } - else if (IsA(source, start, 0, 0, "tear torn tearing")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "think thought thinking")) { - end = Found(source, start, 0, "over"); - } - else if (IsA(source, start, 0, 0, "thrown threw thrown throwing")) { - end = Found(source, start, 0, "away"); - } - else if (IsA(source, start, 0, 0, "turn turned turning")) { - end = Found(source, start, 0, "down off on"); - } - else if (IsA(source, start, 0, 0, "try tried trying")) { - end = Found(source, start, 0, "on out"); - } - else if (IsA(source, start, 0, 0, "use used using")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "warm warmed warming")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "work worked working")) { - end = Found(source, start, 0, "out"); - } - - // found range to label - if (end != std::numeric_limits::max() && - end > start + 1) { - bool add = true; - if (revision == 1 && Exist(source, - start + 1, - end - 1, - 1, - "VB VBD VBG VBN VBP VBZ")) { - // there's a verb in between - add = false; - } - - if (add) { - Range range(start + 1, end - 1, "reorder-label"); - ranges.push_back(range); - } - } - } - - OutputWithLabels(source, ranges, out); -} - -bool Exist(const Phrase &source, int start, int end, int 
factor, const std::string &str) -{ - vector soughts = Moses::Tokenize(str, " "); - for (size_t i = start; i <= end; ++i) { - const Word &word = source[i]; - bool found = Found(word, factor, soughts); - if (found) { - return true; - } - } - - return false; -} - -size_t Found(const Phrase &source, int pos, int factor, const std::string &str) -{ - const size_t MAX_RANGE = 10; - - vector soughts = Moses::Tokenize(str, " "); - vector puncts = Moses::Tokenize(". : , ;", " "); - - - size_t maxEnd = std::min(source.size(), (size_t) pos + MAX_RANGE); - for (size_t i = pos + 1; i < maxEnd; ++i) { - const Word &word = source[i]; - bool found; - - found = Found(word, factor, puncts); - if (found) { - return std::numeric_limits::max(); - } - - found = Found(word, factor, soughts); - if (found) { - return i; - } - } - - return std::numeric_limits::max(); -} - - -bool Found(const Word &word, int factor, const vector &soughts) -{ - const string &element = word[factor]; - for (size_t i = 0; i < soughts.size(); ++i) { - const string &sought = soughts[i]; - bool found = (element == sought); - if (found) { - return true; - } - } - return false; -} - - diff --git a/contrib/other-builds/manual-label/EnPhrasalVerb.h b/contrib/other-builds/manual-label/EnPhrasalVerb.h deleted file mode 100644 index 4cb5f7348..000000000 --- a/contrib/other-builds/manual-label/EnPhrasalVerb.h +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once - -#include "Main.h" - -// roll your own identification of phrasal verbs -void EnPhrasalVerb(const Phrase &source, int revision, std::ostream &out); - -bool Exist(const Phrase &source, int start, int end, int factor, const std::string &str); -size_t Found(const Phrase &source, int pos, int factor, const std::string &str); -bool Found(const Word &word, int factor, const std::vector &soughts); - diff --git a/contrib/other-builds/manual-label/LabelByInitialLetter.cpp b/contrib/other-builds/manual-label/LabelByInitialLetter.cpp deleted file mode 100644 index e4136a7ea..000000000 --- a/contrib/other-builds/manual-label/LabelByInitialLetter.cpp +++ /dev/null @@ -1,29 +0,0 @@ -#include "LabelByInitialLetter.h" -#include "Main.h" - -using namespace std; - -void LabelByInitialLetter(const Phrase &source, std::ostream &out) -{ - Ranges ranges; - - for (int start = 0; start < source.size(); ++start) { - const string &startWord = source[start][0]; - string startChar = startWord.substr(0,1); - - for (int end = start + 1; end < source.size(); ++end) { - const string &endWord = source[end][0]; - string endChar = endWord.substr(0,1); - - if (startChar == endChar) { - Range range(start, end, startChar + "-label"); - ranges.push_back(range); - } - } - } - - OutputWithLabels(source, ranges, out); - -} - - diff --git a/contrib/other-builds/manual-label/LabelByInitialLetter.h b/contrib/other-builds/manual-label/LabelByInitialLetter.h deleted file mode 100644 index ba8d34c19..000000000 --- a/contrib/other-builds/manual-label/LabelByInitialLetter.h +++ /dev/null @@ -1,6 +0,0 @@ -#pragma once - -#include "Main.h" - -void LabelByInitialLetter(const Phrase &source, std::ostream &out); - diff --git a/contrib/other-builds/manual-label/Main.cpp b/contrib/other-builds/manual-label/Main.cpp deleted file mode 100644 index 896f70590..000000000 --- a/contrib/other-builds/manual-label/Main.cpp +++ /dev/null @@ -1,195 +0,0 @@ -#include -#include -#include -#include "moses/Util.h" -#include "Main.h" -#include "DeEn.h" -#include "EnPhrasalVerb.h" -#include "EnOpenNLPChunker.h" -#include "LabelByInitialLetter.h" - -using namespace std; - 
-bool g_debug = false; - -Phrase Tokenize(const string &line); - -int main(int argc, char** argv) -{ - cerr << "Starting" << endl; - - namespace po = boost::program_options; - po::options_description desc("Options"); - desc.add_options() - ("help", "Print help messages") - - ("input,i", po::value(), "Input file. Otherwise it will read from standard in") - ("output,o", po::value(), "Output file. Otherwise it will print from standard out") - - ("source-language,s", po::value()->required(), "Source Language") - ("target-language,t", po::value()->required(), "Target Language") - ("revision,r", po::value()->default_value(0), "Revision") - ("filter", po::value(), "Only use labels from this comma-separated list") - - ("opennlp", po::value()->default_value(""), "Path to Apache OpenNLP toolkit") - - ; - - po::variables_map vm; - try - { - po::store(po::parse_command_line(argc, argv, desc), - vm); // can throw - - /** --help option - */ - if ( vm.count("help") ) - { - std::cout << "Basic Command Line Parameter App" << std::endl - << desc << std::endl; - return EXIT_SUCCESS; - } - - po::notify(vm); // throws on error, so do after help in case - // there are any problems - } - catch(po::error& e) - { - std::cerr << "ERROR: " << e.what() << std::endl << std::endl; - std::cerr << desc << std::endl; - return EXIT_FAILURE; - } - - istream *inStrm = &cin; - if (vm.count("input")) { - string inStr = vm["input"].as(); - cerr << "inStr=" << inStr << endl; - ifstream *inFile = new ifstream(inStr.c_str()); - inStrm = inFile; - } - - ostream *outStrm = &cout; - if (vm.count("output")) { - string outStr = vm["output"].as(); - cerr << "outStr=" << outStr << endl; - ostream *outFile = new ofstream(outStr.c_str()); - outStrm = outFile; - } - - vector filterList; - if (vm.count("filter")) { - string filter = vm["filter"].as(); - Moses::Tokenize(filterList, filter, ","); - } - - string sourceLang = vm["source-language"].as(); - string targetLang = vm["target-language"].as(); - int revision = vm["revision"].as(); - - cerr << sourceLang << " " << targetLang << " " << revision << endl; - - if (sourceLang == "en" && revision == 2) { - if (vm.count("opennlp") == 0) { - throw "Need path to openNLP toolkit"; - } - - string openNLPPath = vm["opennlp"].as(); - EnOpenNLPChunker chunker(openNLPPath); - chunker.Process(*inStrm, *outStrm, filterList); - } - else { - // process line-by-line - string line; - size_t lineNum = 1; - - while (getline(*inStrm, line)) { - //cerr << lineNum << ":" << line << endl; - if (lineNum % 1000 == 0) { - cerr << lineNum << " "; - } - - Phrase source = Tokenize(line); - - if (revision == 600 ) { - LabelByInitialLetter(source, *outStrm); - } - else if (sourceLang == "de" && targetLang == "en") { - LabelDeEn(source, *outStrm); - } - else if (sourceLang == "en") { - if (revision == 0 || revision == 1) { - EnPhrasalVerb(source, revision, *outStrm); - } - else if (revision == 2) { - string openNLPPath = vm["opennlp-path"].as(); - EnOpenNLPChunker chunker(openNLPPath); - } - } - - ++lineNum; - } - } - - - cerr << "Finished" << endl; - return EXIT_SUCCESS; -} - -Phrase Tokenize(const string &line) -{ - Phrase ret; - - vector toks = Moses::Tokenize(line); - for (size_t i = 0; i < toks.size(); ++i) { - Word word = Moses::Tokenize(toks[i], "|"); - ret.push_back(word); - } - - return ret; -} - -bool IsA(const Phrase &source, int pos, int offset, int factor, const string &str) -{ - pos += offset; - if (pos >= source.size() || pos < 0) { - return false; - } - - const string &word = source[pos][factor]; - vector 
soughts = Moses::Tokenize(str, " "); - for (int i = 0; i < soughts.size(); ++i) { - string &sought = soughts[i]; - bool found = (word == sought); - if (found) { - return true; - } - } - return false; -} - - -void OutputWithLabels(const Phrase &source, const Ranges ranges, ostream &out) -{ - // output sentence, with labels - for (int pos = 0; pos < source.size(); ++pos) { - // output beginning of label - for (Ranges::const_iterator iter = ranges.begin(); iter != ranges.end(); ++iter) { - const Range &range = *iter; - if (range.range.first == pos) { - out << " "; - } - } - - const Word &word = source[pos]; - out << word[0] << " "; - - for (Ranges::const_iterator iter = ranges.begin(); iter != ranges.end(); ++iter) { - const Range &range = *iter; - if (range.range.second == pos) { - out << " "; - } - } - } - out << endl; - -} diff --git a/contrib/other-builds/manual-label/Main.h b/contrib/other-builds/manual-label/Main.h deleted file mode 100644 index 036da0d45..000000000 --- a/contrib/other-builds/manual-label/Main.h +++ /dev/null @@ -1,27 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -typedef std::vector Word; -typedef std::vector Phrase; - -struct Range -{ - Range(int start,int end, const std::string &l) - :range(start, end) - ,label(l) - {} - - std::pair range; - std::string label; -}; - -typedef std::list Ranges; - -bool IsA(const Phrase &source, int pos, int offset, int factor, const std::string &str); -void OutputWithLabels(const Phrase &source, const Ranges ranges, std::ostream &out); - - diff --git a/contrib/other-builds/manual-label/Makefile b/contrib/other-builds/manual-label/Makefile deleted file mode 100644 index f24d69dc7..000000000 --- a/contrib/other-builds/manual-label/Makefile +++ /dev/null @@ -1,14 +0,0 @@ -all: manual-label - -clean: - rm -f *.o manual-label - -.cpp.o: - g++ -I../../../boost/include -I../../../ -O3 -g -c $< - -OBJECTS = DeEn.o EnOpenNLPChunker.o EnPhrasalVerb.o Main.o LabelByInitialLetter.o - -manual-label: $(OBJECTS) - g++ $(OBJECTS) -L../../../boost/lib64 -lz -lboost_program_options-mt -o manual-label - - diff --git a/contrib/other-builds/manual-label/manual-label.project b/contrib/other-builds/manual-label/manual-label.project deleted file mode 100644 index 5c678561a..000000000 --- a/contrib/other-builds/manual-label/manual-label.project +++ /dev/null @@ -1,131 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - None - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - None - - - - - - - - - - - - - - - - - diff --git a/contrib/other-builds/moses/.project b/contrib/other-builds/moses/.project index 7ed5723ea..389f71297 100644 --- a/contrib/other-builds/moses/.project +++ b/contrib/other-builds/moses/.project @@ -1140,16 +1140,6 @@ 1 PARENT-3-PROJECT_LOC/moses/FF/DynamicCacheBasedLanguageModel.h - - FF/ExternalFeature.cpp - 1 - PARENT-3-PROJECT_LOC/moses/FF/ExternalFeature.cpp - - - FF/ExternalFeature.h - 1 - PARENT-3-PROJECT_LOC/moses/FF/ExternalFeature.h - FF/FFState.cpp 1 diff --git a/contrib/other-builds/score/.cproject b/contrib/other-builds/score/.cproject index 78a5e13f9..d904122eb 100644 --- a/contrib/other-builds/score/.cproject +++ b/contrib/other-builds/score/.cproject @@ -59,7 +59,6 @@ - diff --git a/contrib/other-builds/server/.cproject b/contrib/other-builds/server/.cproject index 688221af6..78c5185f9 100644 --- a/contrib/other-builds/server/.cproject +++ b/contrib/other-builds/server/.cproject @@ -75,7 +75,6 
@@ - @@ -159,10 +158,10 @@ - + - + From df5aff2d827a976edf81c3d2b3baae9a5f27eb3f Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sun, 24 May 2015 15:12:05 +0100 Subject: [PATCH 005/108] eclipse --- contrib/other-builds/OnDiskPt/.cproject | 8 +- contrib/other-builds/manual-label/.cproject | 132 ---------- contrib/other-builds/manual-label/.project | 27 --- contrib/other-builds/manual-label/DeEn.cpp | 46 ---- contrib/other-builds/manual-label/DeEn.h | 5 - .../manual-label/EnOpenNLPChunker.cpp | 202 ---------------- .../manual-label/EnOpenNLPChunker.h | 29 --- .../manual-label/EnPhrasalVerb.cpp | 226 ------------------ .../other-builds/manual-label/EnPhrasalVerb.h | 11 - .../manual-label/LabelByInitialLetter.cpp | 29 --- .../manual-label/LabelByInitialLetter.h | 6 - contrib/other-builds/manual-label/Main.cpp | 195 --------------- contrib/other-builds/manual-label/Main.h | 27 --- contrib/other-builds/manual-label/Makefile | 14 -- .../manual-label/manual-label.project | 131 ---------- contrib/other-builds/moses/.cproject | 4 +- contrib/other-builds/moses/.project | 10 + 17 files changed, 16 insertions(+), 1086 deletions(-) delete mode 100644 contrib/other-builds/manual-label/.cproject delete mode 100644 contrib/other-builds/manual-label/.project delete mode 100644 contrib/other-builds/manual-label/DeEn.cpp delete mode 100644 contrib/other-builds/manual-label/DeEn.h delete mode 100644 contrib/other-builds/manual-label/EnOpenNLPChunker.cpp delete mode 100644 contrib/other-builds/manual-label/EnOpenNLPChunker.h delete mode 100644 contrib/other-builds/manual-label/EnPhrasalVerb.cpp delete mode 100644 contrib/other-builds/manual-label/EnPhrasalVerb.h delete mode 100644 contrib/other-builds/manual-label/LabelByInitialLetter.cpp delete mode 100644 contrib/other-builds/manual-label/LabelByInitialLetter.h delete mode 100644 contrib/other-builds/manual-label/Main.cpp delete mode 100644 contrib/other-builds/manual-label/Main.h delete mode 100644 contrib/other-builds/manual-label/Makefile delete mode 100644 contrib/other-builds/manual-label/manual-label.project diff --git a/contrib/other-builds/OnDiskPt/.cproject b/contrib/other-builds/OnDiskPt/.cproject index f551380fd..e32a5baea 100644 --- a/contrib/other-builds/OnDiskPt/.cproject +++ b/contrib/other-builds/OnDiskPt/.cproject @@ -11,12 +11,12 @@ - - + + @@ -72,13 +72,13 @@ - - + + diff --git a/contrib/other-builds/manual-label/.cproject b/contrib/other-builds/manual-label/.cproject deleted file mode 100644 index d9297a9fc..000000000 --- a/contrib/other-builds/manual-label/.cproject +++ /dev/null @@ -1,132 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/contrib/other-builds/manual-label/.project b/contrib/other-builds/manual-label/.project deleted file mode 100644 index d0c1dba19..000000000 --- a/contrib/other-builds/manual-label/.project +++ /dev/null @@ -1,27 +0,0 @@ - - - manual-label - - - - - - org.eclipse.cdt.managedbuilder.core.genmakebuilder - clean,full,incremental, - - - - - org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder - full,incremental, - - - - - - org.eclipse.cdt.core.cnature - org.eclipse.cdt.core.ccnature - org.eclipse.cdt.managedbuilder.core.managedBuildNature - org.eclipse.cdt.managedbuilder.core.ScannerConfigNature - - diff --git a/contrib/other-builds/manual-label/DeEn.cpp b/contrib/other-builds/manual-label/DeEn.cpp deleted 
file mode 100644 index ea2934c5a..000000000 --- a/contrib/other-builds/manual-label/DeEn.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#include -#include "DeEn.h" -#include "Main.h" -#include "moses/Util.h" - -using namespace std; - -extern bool g_debug; - -bool Contains(const Phrase &source, int start, int end, int factor, const string &str) -{ - for (int pos = start; pos <= end; ++pos) { - bool found = IsA(source, pos, 0, factor, str); - if (found) { - return true; - } - } - return false; -} - -void LabelDeEn(const Phrase &source, ostream &out) -{ - Ranges ranges; - - // find ranges to label - for (int start = 0; start < source.size(); ++start) { - for (int end = start; end < source.size(); ++end) { - if (IsA(source, start, -1, 1, "VAFIN") - && IsA(source, end, +1, 1, "VVINF VVPP") - && !Contains(source, start, end, 1, "VAFIN VVINF VVPP VVFIN")) { - Range range(start, end, "reorder-label"); - ranges.push_back(range); - } - else if ((start == 0 || IsA(source, start, -1, 1, "$,")) - && IsA(source, end, +1, 0, "zu") - && IsA(source, end, +2, 1, "VVINF") - && !Contains(source, start, end, 1, "$,")) { - Range range(start, end, "reorder-label"); - ranges.push_back(range); - } - } - } - - OutputWithLabels(source, ranges, out); -} - diff --git a/contrib/other-builds/manual-label/DeEn.h b/contrib/other-builds/manual-label/DeEn.h deleted file mode 100644 index c24ce0079..000000000 --- a/contrib/other-builds/manual-label/DeEn.h +++ /dev/null @@ -1,5 +0,0 @@ -#pragma once - -#include "Main.h" - -void LabelDeEn(const Phrase &source, std::ostream &out); diff --git a/contrib/other-builds/manual-label/EnOpenNLPChunker.cpp b/contrib/other-builds/manual-label/EnOpenNLPChunker.cpp deleted file mode 100644 index 538aa9746..000000000 --- a/contrib/other-builds/manual-label/EnOpenNLPChunker.cpp +++ /dev/null @@ -1,202 +0,0 @@ -/* - * EnApacheChunker.cpp - * - * Created on: 28 Feb 2014 - * Author: hieu - */ -#include -#include -#include -#include -#include -#include -#include "EnOpenNLPChunker.h" -#include "moses/Util.h" - -using namespace std; -using namespace boost::algorithm; - -EnOpenNLPChunker::EnOpenNLPChunker(const std::string &openNLPPath) -:m_openNLPPath(openNLPPath) -{ - // TODO Auto-generated constructor stub - -} - -EnOpenNLPChunker::~EnOpenNLPChunker() { - // TODO Auto-generated destructor stub -} - -void EnOpenNLPChunker::Process(std::istream &in, std::ostream &out, const vector &filterList) -{ - const boost::filesystem::path - inPath = boost::filesystem::unique_path(), - outPath = boost::filesystem::unique_path(); - // read all input to a temp file - ofstream inFile(inPath.c_str()); - - string line; - while (getline(in, line)) { - Unescape(line); - inFile << line << endl; - } - inFile.close(); - - // execute chunker - string cmd = "cat " + inPath.native() + " | " - + m_openNLPPath + "/bin/opennlp POSTagger " - + m_openNLPPath + "/models/en-pos-maxent.bin | " - + m_openNLPPath + "/bin/opennlp ChunkerME " - + m_openNLPPath + "/models/en-chunker.bin > " - + outPath.native(); - //g << "Executing:" << cmd << endl; - int ret = system(cmd.c_str()); - - // read result of chunker and output as Moses xml trees - ifstream outFile(outPath.c_str()); - - size_t lineNum = 0; - while (getline(outFile, line)) { - //cerr << line << endl; - MosesReformat(line, out, filterList); - out << endl; - ++lineNum; - } - outFile.close(); - - // clean up temporary files - remove(inPath.c_str()); - remove(outPath.c_str()); -} - -void EnOpenNLPChunker::MosesReformat(const string &line, std::ostream &out, const vector &filterList) -{ - 
//cerr << "REFORMATING:" << line << endl; - bool inLabel = false; - vector toks; - Moses::Tokenize(toks, line); - for (size_t i = 0; i < toks.size(); ++i) { - const string &tok = toks[i]; - - if (tok.substr(0, 1) == "[" && tok.substr(1,1) != "_") { - // start of chunk - string label = tok.substr(1); - if (UseLabel(label, filterList)) { - out << ""; - inLabel = true; - } - } - else if (ends_with(tok, "]")) { - // end of chunk - if (tok.size() > 1) { - if (tok.substr(1,1) == "_") { - // just a word that happens to be ] - vector factors; - Moses::Tokenize(factors, tok, "_"); - assert(factors.size() == 2); - - Escape(factors[0]); - out << factors[0] << " "; - } - else { - // a word and end of tree - string word = tok.substr(0, tok.size()-1); - - vector factors; - Moses::Tokenize(factors, word, "_"); - assert(factors.size() == 2); - - Escape(factors[0]); - out << factors[0] << " "; - } - - if (inLabel) { - out << " "; - inLabel = false; - } - } - else { - if (inLabel) { - out << " "; - inLabel = false; - } - } - - } - else { - // lexical item - vector factors; - Moses::Tokenize(factors, tok, "_"); - if (factors.size() == 2) { - Escape(factors[0]); - out << factors[0] << " "; - } - else if (factors.size() == 1) { - // word is _ - assert(tok.substr(0, 2) == "__"); - out << "_ "; - } - else { - throw "Unknown format:" + tok; - } - } - } -} - -std::string -replaceAll( std::string const& original, - std::string const& before, - std::string const& after ) -{ - std::string retval; - std::string::const_iterator end = original.end(); - std::string::const_iterator current = original.begin(); - std::string::const_iterator next = - std::search( current, end, before.begin(), before.end() ); - while ( next != end ) { - retval.append( current, next ); - retval.append( after ); - current = next + before.size(); - next = std::search( current, end, before.begin(), before.end() ); - } - retval.append( current, next ); - return retval; -} - -void EnOpenNLPChunker::Escape(string &line) -{ - line = replaceAll(line, "&", "&"); - line = replaceAll(line, "|", "|"); - line = replaceAll(line, "<", "<"); - line = replaceAll(line, ">", ">"); - line = replaceAll(line, "'", "'"); - line = replaceAll(line, "\"", """); - line = replaceAll(line, "[", "["); - line = replaceAll(line, "]", "]"); -} - -void EnOpenNLPChunker::Unescape(string &line) -{ - line = replaceAll(line, "|", "|"); - line = replaceAll(line, "<", "<"); - line = replaceAll(line, ">", ">"); - line = replaceAll(line, """, "\""); - line = replaceAll(line, "'", "'"); - line = replaceAll(line, "[", "["); - line = replaceAll(line, "]", "]"); - line = replaceAll(line, "&", "&"); -} - -bool EnOpenNLPChunker::UseLabel(const std::string &label, const std::vector &filterList) const -{ - if (filterList.size() == 0) { - return true; - } - - for (size_t i = 0; i < filterList.size(); ++i) { - if (label == filterList[i]) { - return true; - } - } - return false; -} diff --git a/contrib/other-builds/manual-label/EnOpenNLPChunker.h b/contrib/other-builds/manual-label/EnOpenNLPChunker.h deleted file mode 100644 index df9f90e42..000000000 --- a/contrib/other-builds/manual-label/EnOpenNLPChunker.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * EnApacheChunker.h - * - * Created on: 28 Feb 2014 - * Author: hieu - */ - -#pragma once - -#include -#include -#include - -class EnOpenNLPChunker { -public: - EnOpenNLPChunker(const std::string &openNLPPath); - virtual ~EnOpenNLPChunker(); - void Process(std::istream &in, std::ostream &out, const std::vector &filterList); -protected: - const 
std::string m_openNLPPath; - - void Escape(std::string &line); - void Unescape(std::string &line); - - void MosesReformat(const std::string &line, std::ostream &out, const std::vector &filterList); - - bool UseLabel(const std::string &label, const std::vector &filterList) const; -}; - diff --git a/contrib/other-builds/manual-label/EnPhrasalVerb.cpp b/contrib/other-builds/manual-label/EnPhrasalVerb.cpp deleted file mode 100644 index 4bee9b941..000000000 --- a/contrib/other-builds/manual-label/EnPhrasalVerb.cpp +++ /dev/null @@ -1,226 +0,0 @@ -#include -#include -#include -#include -#include "EnPhrasalVerb.h" -#include "moses/Util.h" - -using namespace std; - -void EnPhrasalVerb(const Phrase &source, int revision, ostream &out) -{ - Ranges ranges; - - // find ranges to label - for (int start = 0; start < source.size(); ++start) { - size_t end = std::numeric_limits::max(); - - if (IsA(source, start, 0, 0, "ask asked asking")) { - end = Found(source, start, 0, "out"); - } - else if (IsA(source, start, 0, 0, "back backed backing")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "blow blown blew")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "break broke broken")) { - end = Found(source, start, 0, "down up in"); - } - else if (IsA(source, start, 0, 0, "bring brought bringing")) { - end = Found(source, start, 0, "down up in"); - } - else if (IsA(source, start, 0, 0, "call called calling")) { - end = Found(source, start, 0, "back up off"); - } - else if (IsA(source, start, 0, 0, "check checked checking")) { - end = Found(source, start, 0, "out in"); - } - else if (IsA(source, start, 0, 0, "cheer cheered cheering")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "clean cleaned cleaning")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "cross crossed crossing")) { - end = Found(source, start, 0, "out"); - } - else if (IsA(source, start, 0, 0, "cut cutting")) { - end = Found(source, start, 0, "down off out"); - } - else if (IsA(source, start, 0, 0, "do did done")) { - end = Found(source, start, 0, "over up"); - } - else if (IsA(source, start, 0, 0, "drop dropped dropping")) { - end = Found(source, start, 0, "off"); - } - else if (IsA(source, start, 0, 0, "figure figured figuring")) { - end = Found(source, start, 0, "out"); - } - else if (IsA(source, start, 0, 0, "fill filled filling")) { - end = Found(source, start, 0, "in out up"); - } - else if (IsA(source, start, 0, 0, "find found finding")) { - end = Found(source, start, 0, "out"); - } - else if (IsA(source, start, 0, 0, "get got getting gotten")) { - end = Found(source, start, 0, "across over back"); - } - else if (IsA(source, start, 0, 0, "give given gave giving")) { - end = Found(source, start, 0, "away back out up"); - } - else if (IsA(source, start, 0, 0, "hand handed handing")) { - end = Found(source, start, 0, "down in over"); - } - else if (IsA(source, start, 0, 0, "hold held holding")) { - end = Found(source, start, 0, "back up"); - } - else if (IsA(source, start, 0, 0, "keep kept keeping")) { - end = Found(source, start, 0, "from up"); - } - else if (IsA(source, start, 0, 0, "let letting")) { - end = Found(source, start, 0, "down in"); - } - else if (IsA(source, start, 0, 0, "look looked looking")) { - end = Found(source, start, 0, "over up"); - } - else if (IsA(source, start, 0, 0, "make made making")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "mix mixed 
mixing")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "pass passed passing")) { - end = Found(source, start, 0, "out up"); - } - else if (IsA(source, start, 0, 0, "pay payed paying")) { - end = Found(source, start, 0, "back"); - } - else if (IsA(source, start, 0, 0, "pick picked picking")) { - end = Found(source, start, 0, "out"); - } - else if (IsA(source, start, 0, 0, "point pointed pointing")) { - end = Found(source, start, 0, "out"); - } - else if (IsA(source, start, 0, 0, "put putting")) { - end = Found(source, start, 0, "down off out together on"); - } - else if (IsA(source, start, 0, 0, "send sending")) { - end = Found(source, start, 0, "back"); - } - else if (IsA(source, start, 0, 0, "set setting")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "sort sorted sorting")) { - end = Found(source, start, 0, "out"); - } - else if (IsA(source, start, 0, 0, "switch switched switching")) { - end = Found(source, start, 0, "off on"); - } - else if (IsA(source, start, 0, 0, "take took taking")) { - end = Found(source, start, 0, "apart back off out"); - } - else if (IsA(source, start, 0, 0, "tear torn tearing")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "think thought thinking")) { - end = Found(source, start, 0, "over"); - } - else if (IsA(source, start, 0, 0, "thrown threw thrown throwing")) { - end = Found(source, start, 0, "away"); - } - else if (IsA(source, start, 0, 0, "turn turned turning")) { - end = Found(source, start, 0, "down off on"); - } - else if (IsA(source, start, 0, 0, "try tried trying")) { - end = Found(source, start, 0, "on out"); - } - else if (IsA(source, start, 0, 0, "use used using")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "warm warmed warming")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "work worked working")) { - end = Found(source, start, 0, "out"); - } - - // found range to label - if (end != std::numeric_limits::max() && - end > start + 1) { - bool add = true; - if (revision == 1 && Exist(source, - start + 1, - end - 1, - 1, - "VB VBD VBG VBN VBP VBZ")) { - // there's a verb in between - add = false; - } - - if (add) { - Range range(start + 1, end - 1, "reorder-label"); - ranges.push_back(range); - } - } - } - - OutputWithLabels(source, ranges, out); -} - -bool Exist(const Phrase &source, int start, int end, int factor, const std::string &str) -{ - vector soughts = Moses::Tokenize(str, " "); - for (size_t i = start; i <= end; ++i) { - const Word &word = source[i]; - bool found = Found(word, factor, soughts); - if (found) { - return true; - } - } - - return false; -} - -size_t Found(const Phrase &source, int pos, int factor, const std::string &str) -{ - const size_t MAX_RANGE = 10; - - vector soughts = Moses::Tokenize(str, " "); - vector puncts = Moses::Tokenize(". 
: , ;", " "); - - - size_t maxEnd = std::min(source.size(), (size_t) pos + MAX_RANGE); - for (size_t i = pos + 1; i < maxEnd; ++i) { - const Word &word = source[i]; - bool found; - - found = Found(word, factor, puncts); - if (found) { - return std::numeric_limits::max(); - } - - found = Found(word, factor, soughts); - if (found) { - return i; - } - } - - return std::numeric_limits::max(); -} - - -bool Found(const Word &word, int factor, const vector &soughts) -{ - const string &element = word[factor]; - for (size_t i = 0; i < soughts.size(); ++i) { - const string &sought = soughts[i]; - bool found = (element == sought); - if (found) { - return true; - } - } - return false; -} - - diff --git a/contrib/other-builds/manual-label/EnPhrasalVerb.h b/contrib/other-builds/manual-label/EnPhrasalVerb.h deleted file mode 100644 index 4cb5f7348..000000000 --- a/contrib/other-builds/manual-label/EnPhrasalVerb.h +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once - -#include "Main.h" - -// roll your own identification of phrasal verbs -void EnPhrasalVerb(const Phrase &source, int revision, std::ostream &out); - -bool Exist(const Phrase &source, int start, int end, int factor, const std::string &str); -size_t Found(const Phrase &source, int pos, int factor, const std::string &str); -bool Found(const Word &word, int factor, const std::vector &soughts); - diff --git a/contrib/other-builds/manual-label/LabelByInitialLetter.cpp b/contrib/other-builds/manual-label/LabelByInitialLetter.cpp deleted file mode 100644 index e4136a7ea..000000000 --- a/contrib/other-builds/manual-label/LabelByInitialLetter.cpp +++ /dev/null @@ -1,29 +0,0 @@ -#include "LabelByInitialLetter.h" -#include "Main.h" - -using namespace std; - -void LabelByInitialLetter(const Phrase &source, std::ostream &out) -{ - Ranges ranges; - - for (int start = 0; start < source.size(); ++start) { - const string &startWord = source[start][0]; - string startChar = startWord.substr(0,1); - - for (int end = start + 1; end < source.size(); ++end) { - const string &endWord = source[end][0]; - string endChar = endWord.substr(0,1); - - if (startChar == endChar) { - Range range(start, end, startChar + "-label"); - ranges.push_back(range); - } - } - } - - OutputWithLabels(source, ranges, out); - -} - - diff --git a/contrib/other-builds/manual-label/LabelByInitialLetter.h b/contrib/other-builds/manual-label/LabelByInitialLetter.h deleted file mode 100644 index ba8d34c19..000000000 --- a/contrib/other-builds/manual-label/LabelByInitialLetter.h +++ /dev/null @@ -1,6 +0,0 @@ -#pragma once - -#include "Main.h" - -void LabelByInitialLetter(const Phrase &source, std::ostream &out); - diff --git a/contrib/other-builds/manual-label/Main.cpp b/contrib/other-builds/manual-label/Main.cpp deleted file mode 100644 index 896f70590..000000000 --- a/contrib/other-builds/manual-label/Main.cpp +++ /dev/null @@ -1,195 +0,0 @@ -#include -#include -#include -#include "moses/Util.h" -#include "Main.h" -#include "DeEn.h" -#include "EnPhrasalVerb.h" -#include "EnOpenNLPChunker.h" -#include "LabelByInitialLetter.h" - -using namespace std; - -bool g_debug = false; - -Phrase Tokenize(const string &line); - -int main(int argc, char** argv) -{ - cerr << "Starting" << endl; - - namespace po = boost::program_options; - po::options_description desc("Options"); - desc.add_options() - ("help", "Print help messages") - - ("input,i", po::value(), "Input file. Otherwise it will read from standard in") - ("output,o", po::value(), "Output file. 
Otherwise it will print from standard out") - - ("source-language,s", po::value()->required(), "Source Language") - ("target-language,t", po::value()->required(), "Target Language") - ("revision,r", po::value()->default_value(0), "Revision") - ("filter", po::value(), "Only use labels from this comma-separated list") - - ("opennlp", po::value()->default_value(""), "Path to Apache OpenNLP toolkit") - - ; - - po::variables_map vm; - try - { - po::store(po::parse_command_line(argc, argv, desc), - vm); // can throw - - /** --help option - */ - if ( vm.count("help") ) - { - std::cout << "Basic Command Line Parameter App" << std::endl - << desc << std::endl; - return EXIT_SUCCESS; - } - - po::notify(vm); // throws on error, so do after help in case - // there are any problems - } - catch(po::error& e) - { - std::cerr << "ERROR: " << e.what() << std::endl << std::endl; - std::cerr << desc << std::endl; - return EXIT_FAILURE; - } - - istream *inStrm = &cin; - if (vm.count("input")) { - string inStr = vm["input"].as(); - cerr << "inStr=" << inStr << endl; - ifstream *inFile = new ifstream(inStr.c_str()); - inStrm = inFile; - } - - ostream *outStrm = &cout; - if (vm.count("output")) { - string outStr = vm["output"].as(); - cerr << "outStr=" << outStr << endl; - ostream *outFile = new ofstream(outStr.c_str()); - outStrm = outFile; - } - - vector filterList; - if (vm.count("filter")) { - string filter = vm["filter"].as(); - Moses::Tokenize(filterList, filter, ","); - } - - string sourceLang = vm["source-language"].as(); - string targetLang = vm["target-language"].as(); - int revision = vm["revision"].as(); - - cerr << sourceLang << " " << targetLang << " " << revision << endl; - - if (sourceLang == "en" && revision == 2) { - if (vm.count("opennlp") == 0) { - throw "Need path to openNLP toolkit"; - } - - string openNLPPath = vm["opennlp"].as(); - EnOpenNLPChunker chunker(openNLPPath); - chunker.Process(*inStrm, *outStrm, filterList); - } - else { - // process line-by-line - string line; - size_t lineNum = 1; - - while (getline(*inStrm, line)) { - //cerr << lineNum << ":" << line << endl; - if (lineNum % 1000 == 0) { - cerr << lineNum << " "; - } - - Phrase source = Tokenize(line); - - if (revision == 600 ) { - LabelByInitialLetter(source, *outStrm); - } - else if (sourceLang == "de" && targetLang == "en") { - LabelDeEn(source, *outStrm); - } - else if (sourceLang == "en") { - if (revision == 0 || revision == 1) { - EnPhrasalVerb(source, revision, *outStrm); - } - else if (revision == 2) { - string openNLPPath = vm["opennlp-path"].as(); - EnOpenNLPChunker chunker(openNLPPath); - } - } - - ++lineNum; - } - } - - - cerr << "Finished" << endl; - return EXIT_SUCCESS; -} - -Phrase Tokenize(const string &line) -{ - Phrase ret; - - vector toks = Moses::Tokenize(line); - for (size_t i = 0; i < toks.size(); ++i) { - Word word = Moses::Tokenize(toks[i], "|"); - ret.push_back(word); - } - - return ret; -} - -bool IsA(const Phrase &source, int pos, int offset, int factor, const string &str) -{ - pos += offset; - if (pos >= source.size() || pos < 0) { - return false; - } - - const string &word = source[pos][factor]; - vector soughts = Moses::Tokenize(str, " "); - for (int i = 0; i < soughts.size(); ++i) { - string &sought = soughts[i]; - bool found = (word == sought); - if (found) { - return true; - } - } - return false; -} - - -void OutputWithLabels(const Phrase &source, const Ranges ranges, ostream &out) -{ - // output sentence, with labels - for (int pos = 0; pos < source.size(); ++pos) { - // output beginning of 
label - for (Ranges::const_iterator iter = ranges.begin(); iter != ranges.end(); ++iter) { - const Range &range = *iter; - if (range.range.first == pos) { - out << " "; - } - } - - const Word &word = source[pos]; - out << word[0] << " "; - - for (Ranges::const_iterator iter = ranges.begin(); iter != ranges.end(); ++iter) { - const Range &range = *iter; - if (range.range.second == pos) { - out << " "; - } - } - } - out << endl; - -} diff --git a/contrib/other-builds/manual-label/Main.h b/contrib/other-builds/manual-label/Main.h deleted file mode 100644 index 036da0d45..000000000 --- a/contrib/other-builds/manual-label/Main.h +++ /dev/null @@ -1,27 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -typedef std::vector Word; -typedef std::vector Phrase; - -struct Range -{ - Range(int start,int end, const std::string &l) - :range(start, end) - ,label(l) - {} - - std::pair range; - std::string label; -}; - -typedef std::list Ranges; - -bool IsA(const Phrase &source, int pos, int offset, int factor, const std::string &str); -void OutputWithLabels(const Phrase &source, const Ranges ranges, std::ostream &out); - - diff --git a/contrib/other-builds/manual-label/Makefile b/contrib/other-builds/manual-label/Makefile deleted file mode 100644 index f24d69dc7..000000000 --- a/contrib/other-builds/manual-label/Makefile +++ /dev/null @@ -1,14 +0,0 @@ -all: manual-label - -clean: - rm -f *.o manual-label - -.cpp.o: - g++ -I../../../boost/include -I../../../ -O3 -g -c $< - -OBJECTS = DeEn.o EnOpenNLPChunker.o EnPhrasalVerb.o Main.o LabelByInitialLetter.o - -manual-label: $(OBJECTS) - g++ $(OBJECTS) -L../../../boost/lib64 -lz -lboost_program_options-mt -o manual-label - - diff --git a/contrib/other-builds/manual-label/manual-label.project b/contrib/other-builds/manual-label/manual-label.project deleted file mode 100644 index 5c678561a..000000000 --- a/contrib/other-builds/manual-label/manual-label.project +++ /dev/null @@ -1,131 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - None - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - None - - - - - - - - - - - - - - - - - diff --git a/contrib/other-builds/moses/.cproject b/contrib/other-builds/moses/.cproject index 960a13947..2fd2601c6 100644 --- a/contrib/other-builds/moses/.cproject +++ b/contrib/other-builds/moses/.cproject @@ -11,11 +11,11 @@ - + @@ -79,12 +79,12 @@ - + diff --git a/contrib/other-builds/moses/.project b/contrib/other-builds/moses/.project index 389f71297..fcc6b8948 100644 --- a/contrib/other-builds/moses/.project +++ b/contrib/other-builds/moses/.project @@ -220,6 +220,16 @@ 1 PARENT-3-PROJECT_LOC/moses/ConfusionNet.h + + ContextParameters.cpp + 1 + PARENT-3-PROJECT_LOC/moses/parameters/ContextParameters.cpp + + + ContextParameters.h + 1 + PARENT-3-PROJECT_LOC/moses/parameters/ContextParameters.h + DecodeGraph.cpp 1 From da052b7f2b05f886960dd60f175b977e6f254f5e Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Sun, 24 May 2015 16:05:14 +0100 Subject: [PATCH 006/108] Removed dependency on libcurlpp, as it was difficult to link that staticly. 
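The patch below replaces the curlpp-based bias lookup with a small boost::asio HTTP client (ug_http_client.h/.cc) plus a uri_encode() helper, so nothing has to link against libcurlpp any more. A minimal usage sketch, modelled on the test program added in this patch (the URL is a placeholder and error handling is omitted):

    #include <iostream>
    #include <string>
    #include <boost/asio.hpp>
    #include "ug_http_client.h"   // added by this patch

    int main()
    {
      boost::asio::io_service io_service;
      // The single-string constructor splits the URL into server and path.
      std::string url = "bias.example.org/lookup?context="
                        + Moses::uri_encode("some text");
      Moses::http_client c(io_service, url);
      io_service.run();                       // drives resolve -> connect -> write -> read to EOF
      std::cout << c.content() << std::endl;  // body of the HTTP response
      return 0;
    }

query_bias_server() in ug_sampling_bias.cc now follows the same pattern in place of the old curlpp::Easy request.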
--- Jamroot | 2 +- .../UG/mm/test-http-client.cc | 27 +++ .../UG/mm/test-xml-escaping.cc | 13 ++ .../TranslationModel/UG/mm/ug_http_client.cc | 200 ++++++++++++++++++ moses/TranslationModel/UG/mm/ug_http_client.h | 57 +++++ .../UG/mm/ug_sampling_bias.cc | 36 ++-- moses/TranslationModel/UG/mmsapt.cpp | 10 +- 7 files changed, 321 insertions(+), 24 deletions(-) create mode 100644 moses/TranslationModel/UG/mm/test-http-client.cc create mode 100644 moses/TranslationModel/UG/mm/test-xml-escaping.cc create mode 100644 moses/TranslationModel/UG/mm/ug_http_client.cc create mode 100644 moses/TranslationModel/UG/mm/ug_http_client.h diff --git a/Jamroot b/Jamroot index 65282ff63..119c6183e 100644 --- a/Jamroot +++ b/Jamroot @@ -89,7 +89,7 @@ if [ path.exists $(home)/moses-environment.jam ] include $(TOP)/jam-files/check-environment.jam ; # get resource locations # from environment variables include $(TOP)/jam-files/xmlrpc-c.jam ; # xmlrpc-c stuff for the server -include $(TOP)/jam-files/curlpp.jam ; # curlpp stuff for bias lookup (MMT only) +# include $(TOP)/jam-files/curlpp.jam ; # curlpp stuff for bias lookup (MMT only) # exit "done" : 0 ; diff --git a/moses/TranslationModel/UG/mm/test-http-client.cc b/moses/TranslationModel/UG/mm/test-http-client.cc new file mode 100644 index 000000000..f50f3b468 --- /dev/null +++ b/moses/TranslationModel/UG/mm/test-http-client.cc @@ -0,0 +1,27 @@ +// -*- c++ -*- +#include "ug_http_client.h" + +int main(int argc, char* argv[]) +{ + try + { + if (argc != 2) + { + std::cout << "Usage: async_client \n"; + std::cout << "Example:\n"; + std::cout << " async_client www.boost.org/LICENSE_1_0.txt\n"; + return 1; + } + + boost::asio::io_service io_service; + Moses::http_client c(io_service, argv[1]); + io_service.run(); + std::cout << c.content() << std::endl; + } + catch (std::exception& e) + { + std::cout << "Exception: " << e.what() << "\n"; + } + + return 0; +} diff --git a/moses/TranslationModel/UG/mm/test-xml-escaping.cc b/moses/TranslationModel/UG/mm/test-xml-escaping.cc new file mode 100644 index 000000000..a99471a9b --- /dev/null +++ b/moses/TranslationModel/UG/mm/test-xml-escaping.cc @@ -0,0 +1,13 @@ +#include +#include +#include +#include "ug_http_client.h" + +using namespace std; +int main() +{ + string line; + while (getline(cin,line)) + cout << Moses::uri_encode(line) << endl; +} + diff --git a/moses/TranslationModel/UG/mm/ug_http_client.cc b/moses/TranslationModel/UG/mm/ug_http_client.cc new file mode 100644 index 000000000..1d6d70edb --- /dev/null +++ b/moses/TranslationModel/UG/mm/ug_http_client.cc @@ -0,0 +1,200 @@ +#include "ug_http_client.h" +namespace Moses +{ +using boost::asio::ip::tcp; + +std::string http_client::content() const { return m_content.str(); } + +http_client:: +http_client(boost::asio::io_service& io_service, + const std::string& server, const std::string& path) + : resolver_(io_service), socket_(io_service) +{ + init(server,path); +} + +http_client:: +http_client(boost::asio::io_service& io_service, std::string url) + : resolver_(io_service), socket_(io_service) +{ + size_t p = url.find("://"); + if (p < url.size()) url.erase(0,p+3); + p = url.find("/"); + if (p < url.size()) + init(url.substr(0,p),url.substr(p)); + else + init(url,"/"); +} + +void +http_client:: +init(std::string const& server, std::string const& path) +{ + // Form the request. We specify the "Connection: close" header so + // that the server will close the socket after transmitting the + // response. 
This will allow us to treat all data up until the EOF + // as the content. + + std::ostream request_stream(&request_); + request_stream << "GET " << path << " HTTP/1.0\r\n"; + request_stream << "Host: " << server << "\r\n"; + request_stream << "Accept: */*\r\n"; + request_stream << "Connection: close\r\n\r\n"; + + // Start an asynchronous resolve to translate the server and service names + // into a list of endpoints. + tcp::resolver::query query(server, "http"); + resolver_.async_resolve(query, + boost::bind(&http_client::handle_resolve, this, + boost::asio::placeholders::error, + boost::asio::placeholders::iterator)); + +} + +void +http_client:: +handle_resolve(const boost::system::error_code& err, + tcp::resolver::iterator endpoint_iterator) +{ + if (!err) + { + // Attempt a connection to the first endpoint in the list. Each endpoint + // will be tried until we successfully establish a connection. + tcp::endpoint endpoint = *endpoint_iterator; + socket_.async_connect(endpoint, + boost::bind(&http_client::handle_connect, this, + boost::asio::placeholders::error, ++endpoint_iterator)); + } + else + { + m_error << "Error: " << err.message() << "\n"; + } +} + +void +http_client:: +handle_connect(const boost::system::error_code& err, + tcp::resolver::iterator endpoint_iterator) +{ + if (!err) + { + // The connection was successful. Send the request. + boost::asio::async_write(socket_, request_, + boost::bind(&http_client::handle_write_request, this, + boost::asio::placeholders::error)); + } + else if (endpoint_iterator != tcp::resolver::iterator()) + { + // The connection failed. Try the next endpoint in the list. + socket_.close(); + tcp::endpoint endpoint = *endpoint_iterator; + socket_.async_connect(endpoint, + boost::bind(&http_client::handle_connect, this, + boost::asio::placeholders::error, ++endpoint_iterator)); + } + else m_error << "Error: " << err.message() << "\n"; +} + +void +http_client:: +handle_write_request(const boost::system::error_code& err) +{ + using namespace boost::asio; + if (err) { m_error << "Error: " << err.message() << "\n"; return; } + + // Read the response status line. The response_ streambuf will + // automatically grow to accommodate the entire line. The growth may be + // limited by passing a maximum size to the streambuf constructor. + async_read_until(socket_, response_, "\r\n", + boost::bind(&http_client::handle_read_status_line, + this, placeholders::error)); +} + +void +http_client:: +handle_read_status_line(const boost::system::error_code& err) +{ + if (err) { m_error << "Error: " << err << "\n"; return; } + + using namespace boost::asio; + // Check that response is OK. + std::istream response_stream(&response_); + response_stream >> m_http_version >> m_status_code; + std::getline(response_stream, m_status_message); + if (!response_stream || m_http_version.substr(0, 5) != "HTTP/") + m_error << "Invalid response\n"; + else if (m_status_code != 200) + m_error << "Response returned with status code " << m_status_code << "\n"; + else // Read the response headers, which are terminated by a blank line. + async_read_until(socket_, response_, "\r\n\r\n", + boost::bind(&http_client::handle_read_headers, this, + placeholders::error)); +} + + +void +http_client:: +handle_read_headers(const boost::system::error_code& err) +{ + if (err) { m_error << "Error: " << err << "\n"; return; } + + // Process the response headers. 
+ std::istream response_stream(&response_); + std::string line; + while (std::getline(response_stream, line) && line != "\r") + m_header.push_back(line); + + // Write whatever content we already have to output. + if (response_.size() > 0) + m_content << &response_; + + using namespace boost::asio; + // Start reading remaining data until EOF. + async_read(socket_, response_, transfer_at_least(1), + boost::bind(&http_client::handle_read_content, this, + placeholders::error)); +} + +void +http_client:: +handle_read_content(const boost::system::error_code& err) +{ + using namespace boost::asio; + if(!err) + { + // Write all of the data that has been read so far. + // Then continue reading remaining data until EOF. + m_content << &response_; + async_read(socket_, response_, transfer_at_least(1), + boost::bind(&http_client::handle_read_content, this, + placeholders::error)); + } + else if (err != boost::asio::error::eof) + { + m_error << "Error: " << err << "\n"; + } +} + +std::string +uri_encode(std::string const& in) +{ + char buf[3 * in.size() + 1]; + size_t i = 0; + for (unsigned char const* c = (unsigned char const*)in.c_str(); *c; ++c) + { + // cout << *c << " " << int(*c) << endl; + if (*c == ' ') buf[i++] = '+'; + else if (*c == '.' || *c == '~' || *c == '_' || *c == '-') buf[i++] = *c; + else if (*c < '0') i += sprintf(buf+i, "%%%x", int(*c)); + else if (*c <= '9') buf[i++] = *c; + else if (*c < 'A') i += sprintf(buf+i, "%%%x", int(*c)); + else if (*c <= 'Z') buf[i++] = *c; + else if (*c < 'a') i += sprintf(buf+i, "%%%x", int(*c)); + else if (*c <= 'z') buf[i++] = *c; + else i += sprintf(buf+i, "%%%x", int(*c)); + } + buf[i] = 0; + return std::string(buf); +} + +} diff --git a/moses/TranslationModel/UG/mm/ug_http_client.h b/moses/TranslationModel/UG/mm/ug_http_client.h new file mode 100644 index 000000000..53ee258f9 --- /dev/null +++ b/moses/TranslationModel/UG/mm/ug_http_client.h @@ -0,0 +1,57 @@ +// -*- c++ -*- +// Adapted by Ulrich Germann from: +// async_client.cpp +// ~~~~~~~~~~~~~~~~ +// +// Copyright (c) 2003-2011 Christopher M. Kohlhoff (chris at kohlhoff dot com) +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +#pragma once +#include +#include +#include +#include +#include +#include +#include + +namespace Moses +{ +using boost::asio::ip::tcp; + +std::string uri_encode(std::string const& in); + +class http_client +{ + std::ostringstream m_content; + std::vector m_header; + std::string m_http_version; + unsigned int m_status_code; + std::string m_status_message; + std::ostringstream m_error; + +public: + http_client(boost::asio::io_service& io_service, std::string url); + http_client(boost::asio::io_service& io_service, + const std::string& server, const std::string& path); +private: + void init(std::string const& server, std::string const& path); + void handle_resolve(const boost::system::error_code& err, + tcp::resolver::iterator endpoint_iterator); + void handle_connect(const boost::system::error_code& err, + tcp::resolver::iterator endpoint_iterator); + void handle_write_request(const boost::system::error_code& err); + void handle_read_status_line(const boost::system::error_code& err); + void handle_read_headers(const boost::system::error_code& err); + void handle_read_content(const boost::system::error_code& err); + tcp::resolver resolver_; + tcp::socket socket_; + boost::asio::streambuf request_; + boost::asio::streambuf response_; +public: + std::string content() const; +}; + +} diff --git a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc index ebe1ce91d..31046e178 100644 --- a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc +++ b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc @@ -3,11 +3,15 @@ #include #include "moses/Timer.h" -#ifdef HAVE_CURLPP -#include -#include -#include -#endif +// #ifdef HAVE_CURLPP +// #include +// #include +// #include +// #endif + +// #ifdef WITH_MMT_BIAS_CLIENT +#include "ug_http_client.h" +// #endif namespace Moses { @@ -15,21 +19,17 @@ namespace Moses { using ugdiss::id_type; -#ifdef HAVE_CURLPP + // #ifdef WITH_MMT_BIAS_CLIENT std::string query_bias_server(std::string const& url, std::string const& text) { - // communicate with the bias server; resuts will be in ... - std::ostringstream os; - curlpp::Easy myRequest; - std::string query = url+curlpp::escape(text); - myRequest.setOpt(new curlpp::options::Url(query)); - curlpp::options::WriteStream ws(&os); - myRequest.setOpt(ws); // Give it to your request - myRequest.perform(); // This will output to os - return os.str(); + std::string query = url+uri_encode(text); + boost::asio::io_service io_service; + Moses::client c(io_service, query); + io_service.run(); + return c.content(); } -#endif + // #endif DocumentBias ::DocumentBias @@ -40,13 +40,13 @@ namespace Moses : m_sid2docid(sid2doc) , m_bias(docname2docid.size(), 0) { -#ifdef HAVE_CURLPP + // #ifdef HAVE_CURLPP Timer timer; if (log) timer.start(NULL); std::string json = query_bias_server(server_url, text); init_from_json(json, docname2docid, log); if (log) *log << "Bias query took " << timer << " seconds." 
<< std::endl; -#endif + // #endif } void diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp index 4ce775877..f05c0d59b 100644 --- a/moses/TranslationModel/UG/mmsapt.cpp +++ b/moses/TranslationModel/UG/mmsapt.cpp @@ -1,8 +1,8 @@ -#ifdef HAVE_CURLPP -#include -#include -#include -#endif +// #ifdef HAVE_CURLPP +// #include +// #include +// #include +// #endif #include "mmsapt.h" #include From c82ee9a4e9fb5727b946bb0fe0f805f8ce8c044c Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Sun, 24 May 2015 16:44:41 +0100 Subject: [PATCH 007/108] Bug fix. --- moses/TranslationModel/UG/mm/ug_sampling_bias.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc index 31046e178..da408dfb3 100644 --- a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc +++ b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc @@ -25,7 +25,7 @@ namespace Moses { std::string query = url+uri_encode(text); boost::asio::io_service io_service; - Moses::client c(io_service, query); + Moses::http_client c(io_service, query); io_service.run(); return c.content(); } From 582a845524774ecea69fed7232c3cd56fe1ed3a1 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sun, 24 May 2015 20:04:01 +0400 Subject: [PATCH 008/108] don't use zcat --- scripts/generic/score-parallel.perl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/generic/score-parallel.perl b/scripts/generic/score-parallel.perl index e911cd4a3..81bc6f7d0 100755 --- a/scripts/generic/score-parallel.perl +++ b/scripts/generic/score-parallel.perl @@ -188,7 +188,7 @@ for (my $i = 0; $i < $fileCount; ++$i) print STDERR $cmd; if ($FlexibilityScore) { - $cmd .= "zcat $TMPDIR/phrase-table.half.$numStr.gz | $FlexibilityCmd $TMPDIR/extract.context.$i.gz"; + $cmd .= "gzip -cd $TMPDIR/phrase-table.half.$numStr.gz | $FlexibilityCmd $TMPDIR/extract.context.$i.gz"; $cmd .= " --Inverse" if ($otherExtractArgs =~ /--Inverse/); $cmd .= " --Hierarchical" if ($otherExtractArgs =~ /--Hierarchical/); $cmd .= " | $GZIP_EXEC -c > $TMPDIR/phrase-table.half.$numStr.flex.gz\n"; From f6f56d11af1868e3cf0104b6ac2fc27f65f92ed4 Mon Sep 17 00:00:00 2001 From: Rico Sennrich Date: Mon, 25 May 2015 15:50:45 +0100 Subject: [PATCH 009/108] ems: parse-relax comes last in train; do same for dev/test --- scripts/ems/experiment.meta | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index 910c0c040..dafbe4a42 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -858,22 +858,22 @@ parse-input-devtest ignore-unless: use-mira template: $input-parser < IN > OUT parse-relax-input - in: parsed-input - out: parse-relaxed-input + in: split-input + out: input default-name: tuning/input.parse-relaxed pass-unless: input-parse-relaxer pass-if: skip-parse-input-devtesteval mock-input-parser-devtesteval template: $input-parse-relaxer < IN > OUT parse-relax-input-devtest - in: parsed-input-devtest - out: parse-relaxed-input-devtest + in: split-input-devtest + out: input-devtest default-name: tuning/input.devtest.parse-relaxed pass-unless: input-parse-relaxer pass-if: skip-parse-input-devtesteval mock-input-parser-devtesteval ignore-unless: use-mira template: $input-parse-relaxer < IN > OUT factorize-input - in: parse-relaxed-input + in: parsed-input out: factorized-input default-name: tuning/input.factorized rerun-on-change: 
TRAINING:input-factors @@ -881,7 +881,7 @@ factorize-input error: can't open error: incompatible number of words in factor factorize-input-devtest - in: parse-relaxed-input-devtest + in: parsed-input-devtest out: factorized-input-devtest default-name: tuning/input.devtest.factorized rerun-on-change: TRAINING:input-factors @@ -934,14 +934,14 @@ truecase-input-devtest template: $input-truecaser -model IN1.$input-extension < IN > OUT split-input in: truecased-input SPLITTER:splitter-model - out: input + out: split-input rerun-on-change: input-splitter default-name: tuning/input.split pass-unless: input-splitter template: $input-splitter -model IN1.$input-extension < IN > OUT split-input-devtest in: truecased-input-devtest SPLITTER:splitter-model - out: input-devtest + out: split-input-devtest rerun-on-change: input-splitter default-name: tuning/input.devtest.split pass-unless: input-splitter @@ -1148,14 +1148,14 @@ parse-input pass-if: skip-parse-input-devtesteval mock-input-parser-devtesteval template: $input-parser < IN > OUT parse-relax-input - in: parsed-input - out: parse-relaxed-input + in: split-input + out: input default-name: evaluation/input.parse-relaxed pass-unless: input-parse-relaxer pass-if: skip-parse-input-devtesteval mock-input-parser-devtesteval template: $input-parse-relaxer < IN > OUT factorize-input - in: parse-relaxed-input + in: parsed-input out: factorized-input default-name: evaluation/input.factorized rerun-on-change: TRAINING:input-factors @@ -1187,7 +1187,7 @@ truecase-input template: $input-truecaser -model IN1.$input-extension < IN > OUT split-input in: truecased-input SPLITTER:splitter-model - out: input + out: split-input default-name: evaluation/input.split pass-unless: input-splitter template: $input-splitter -model IN1.$input-extension < IN > OUT From ea9b097aba6ac422d346a8766ef27607bd31e787 Mon Sep 17 00:00:00 2001 From: Jeroen Vermeulen Date: Tue, 26 May 2015 15:06:04 +0700 Subject: [PATCH 010/108] =?UTF-8?q?OutputFileStream:=20accept=20=E2=80=98-?= =?UTF-8?q?=E2=80=99=20for=20=E2=80=9Cstdout=E2=80=9D.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a common convention: when a program gets a dash as the path of a file that it should write, it writes to standard output instead. Enhances portability to systems that don't have /dev/stdout. 
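A hypothetical caller illustrating the behaviour introduced by the diff that follows (this snippet is not part of the patch): the same code path now writes gzip-compressed output for "*.gz" names, a plain file otherwise, and standard output when the name is just "-".

    #include <stdexcept>
    #include <string>
    #include "OutputFileStream.h"

    void write_lines(const std::string &path)
    {
      Moses::OutputFileStream out;
      // path may be "table.txt", "table.gz" (compressed), or "-" (stdout).
      if (!out.Open(path)) {
        throw std::runtime_error("cannot open " + path);
      }
      out << "a line of output" << '\n';
      out.Close();   // flushes; the stream can be opened again afterwards
    }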
--- phrase-extract/OutputFileStream.cpp | 45 +++++++++++++++++------------ phrase-extract/OutputFileStream.h | 35 ++++++++++++++++++++-- 2 files changed, 60 insertions(+), 20 deletions(-) diff --git a/phrase-extract/OutputFileStream.cpp b/phrase-extract/OutputFileStream.cpp index 15c2bd73e..d7874b06f 100644 --- a/phrase-extract/OutputFileStream.cpp +++ b/phrase-extract/OutputFileStream.cpp @@ -19,6 +19,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ***********************************************************************/ +#include #include #include #include "OutputFileStream.h" @@ -32,11 +33,13 @@ namespace Moses OutputFileStream::OutputFileStream() :boost::iostreams::filtering_ostream() ,m_outFile(NULL) + ,m_open(false) { } OutputFileStream::OutputFileStream(const std::string &filePath) - : m_outFile(NULL) + :m_outFile(NULL) + ,m_open(false) { Open(filePath); } @@ -48,32 +51,38 @@ OutputFileStream::~OutputFileStream() bool OutputFileStream::Open(const std::string &filePath) { - m_outFile = new ofstream(filePath.c_str(), ios_base::out | ios_base::binary); - if (m_outFile->fail()) { - return false; + assert(!m_open); + if (filePath == std::string("-")) { + // Write to standard output. Leave m_outFile null. + this->push(std::cout); + } else { + m_outFile = new ofstream(filePath.c_str(), ios_base::out | ios_base::binary); + if (m_outFile->fail()) { + return false; + } + + if (ends_with(filePath, ".gz")) { + this->push(boost::iostreams::gzip_compressor()); + } + this->push(*m_outFile); } - if (ends_with(filePath, ".gz")) { - this->push(boost::iostreams::gzip_compressor()); - } - this->push(*m_outFile); - + m_open = true; return true; } void OutputFileStream::Close() { - if (m_outFile == NULL) { - return; - } - + if (!m_open) return; this->flush(); - this->pop(); // file + if (m_outFile) { + this->pop(); // file - m_outFile->close(); - delete m_outFile; - m_outFile = NULL; - return; + m_outFile->close(); + delete m_outFile; + m_outFile = NULL; + } + m_open = false; } diff --git a/phrase-extract/OutputFileStream.h b/phrase-extract/OutputFileStream.h index f52e36d76..b77741a73 100644 --- a/phrase-extract/OutputFileStream.h +++ b/phrase-extract/OutputFileStream.h @@ -30,19 +30,50 @@ namespace Moses { -/** Used in place of std::istream, can read zipped files if it ends in .gz +/** Version of std::ostream with transparent compression. + * + * Transparently compresses output when writing to a file whose name ends in + * ".gz". Or, writes to stdout instead of a file when given a filename + * consisting of just a dash ("-"). */ class OutputFileStream : public boost::iostreams::filtering_ostream { -protected: +private: + /** File that needs flushing & closing when we close this stream. + * + * Is NULL when no file is opened, e.g. when writing to standard output. + */ std::ofstream *m_outFile; + + /// Is this stream open? + bool m_open; + public: + /** Create an unopened OutputFileStream. + * + * Until it's been opened, nothing can be done with this stream. + */ OutputFileStream(); + /// Create an OutputFileStream, and open it by calling Open(). OutputFileStream(const std::string &filePath); virtual ~OutputFileStream(); + // TODO: Can we please just always throw an exception when this fails? + /** Open stream. + * + * If filePath is "-" (just a dash), this opens the stream for writing to + * standard output. Otherwise, it opens the given file. If the filename + * has the ".gz" suffix, output will be transparently compressed. + * + * Call Close() to close the file. 
+ * + * Returns whether opening the file was successful. It may also throw an + * exception on failure. + */ bool Open(const std::string &filePath); + + /// Flush and close stream. After this, the stream can be opened again. void Close(); }; From c086a8ee5054f5fc56298736f1e4ca2ba441c51b Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Tue, 26 May 2015 16:44:13 +0100 Subject: [PATCH 011/108] Add a wrapper script for parsing English text with SENNA --- scripts/training/wrappers/parse-en-senna.perl | 149 ++++++++++++++++++ scripts/training/wrappers/senna2brackets.py | 98 ++++++++++++ 2 files changed, 247 insertions(+) create mode 100755 scripts/training/wrappers/parse-en-senna.perl create mode 100755 scripts/training/wrappers/senna2brackets.py diff --git a/scripts/training/wrappers/parse-en-senna.perl b/scripts/training/wrappers/parse-en-senna.perl new file mode 100755 index 000000000..f271633ea --- /dev/null +++ b/scripts/training/wrappers/parse-en-senna.perl @@ -0,0 +1,149 @@ +#!/usr/bin/env perl + +use strict; +use warnings; + +use autodie; +use FindBin qw($RealBin); +use Getopt::Long "GetOptions"; + +my ($SENNA, + $SENNA_DIR, + $SENNA_OPTIONS, + $SPLIT_HYPHEN, + $SPLIT_SLASH, + $MARK_SPLIT, + $BINARIZE, + $UNPARSEABLE, + $RAW_IN, + $RAW_OUT); + +$UNPARSEABLE = 0; + +die("ERROR: syntax is: parse-en-senna.perl [-senna-options OPTIONS] [-split-hyphen] [-split-slash] [-mark-split] [-binarize] [-unparseable] [-raw-in PATH] [-raw-out PATH] -senna PATH -senna-dir PATH < in > out\n") + unless &GetOptions + ('senna=s' => \$SENNA, + 'senna-dir=s' => \$SENNA_DIR, + 'senna-options=s' => \$SENNA_OPTIONS, + 'split-hyphen' => \$SPLIT_HYPHEN, + 'split-slash' => \$SPLIT_SLASH, + 'mark-split' => \$MARK_SPLIT, + 'binarize' => \$BINARIZE, + 'unparseable' => \$UNPARSEABLE, + 'raw-in=s' => \$RAW_IN, + 'raw-out=s' => \$RAW_OUT + ) + && defined($SENNA); + +die("ERROR: file not found or not executable: '$SENNA'\n") unless -x $SENNA; +die("ERROR: could not find SENNA directory: '$SENNA_DIR'\n") unless -d $SENNA_DIR; + +# Step 1: Read standard input and write two temporary files: +# +# $tmpOriginal Contains a copy of the input as-is +# +# $tmpProcessed Contains a copy of the input after pre-processing ready +# for input to SENNA + +my $tmpOriginal = "/tmp/parse-en-senna.1.$$"; +my $tmpProcessed = "/tmp/parse-en-senna.2.$$"; + +open(TMP_ORIGINAL, ">$tmpOriginal"); + +open(TMP_PROCESSED, + "| $RealBin/../../tokenizer/deescape-special-chars.perl > $tmpProcessed;"); + +while() { + print TMP_ORIGINAL $_; + + # If the line is longer than 1023 bytes (including the newline) then replace + # it with "SENTENCE_TOO_LONG\n". This is because SENNA reads lines into a + # 1024 character array and if a line is longer than 1023 characters then it + # gets read in stages and treated as multiple input lines. + my $num_bytes; + { + use bytes; + $num_bytes = length($_); + } + if ($num_bytes > 1023) { + print TMP_PROCESSED "SENTENCE_TOO_LONG\n"; + next; + } + + # Replace "-LRB-", "-RRB-", etc. with "(", ")", etc. + s/-LRB-/(/g; + s/-RRB-/)/g; + s/-LSB-/[/g; + s/-RSB-/]/g; + s/-LCB-/{/g; + s/-RCB-/}/g; + + # Unsplit hyphens. + s/ \@-\@ /-/g if $SPLIT_HYPHEN; + # Unsplit slashes. + s/ \@\/\@ /\//g if $SPLIT_SLASH; + + print TMP_PROCESSED $_; +} + +close(TMP_ORIGINAL); +close(TMP_PROCESSED); + +# Step 2: Parse $tmpProcessed then pass the raw output through a post-processing +# pipeline. + +my $pipeline = ""; + +# Stage 1: Parse input (unless given pre-parsed input via -raw-in option). 
+if (defined($RAW_IN)) { + $pipeline .= "cat \"$RAW_IN\" |"; +} else { + $pipeline .= "cat $tmpProcessed |"; + my $path = $SENNA_DIR; + # SENNA requires -path's argument to end with a slash. + if ($path !~ /\/$/) { + $path .= "/"; + } + $pipeline .= " $SENNA -path $path -usrtokens"; + $pipeline .= " $SENNA_OPTIONS" if defined($SENNA_OPTIONS); + $pipeline .= " |"; +} + +if (defined($RAW_OUT)) { + $pipeline .= " tee \"$RAW_OUT\" |"; +} + +# Stage 2: Convert SENNA output to Moses XML (via Berkeley output format) +$pipeline .= " $RealBin/senna2brackets.py --berkeley-style |"; +$pipeline .= " $RealBin/berkeleyparsed2mosesxml.perl |"; + +# Stage 3: Re-split hyphens / slashes. +if ($SPLIT_HYPHEN) { + $pipeline .= " $RealBin/syntax-hyphen-splitting.perl"; + $pipeline .= " -binarize" if $BINARIZE; + $pipeline .= " -mark-split" if $MARK_SPLIT; + $pipeline .= " |"; +} +if ($SPLIT_SLASH) { + $pipeline .= " $RealBin/syntax-hyphen-splitting.perl -slash"; + $pipeline .= " -binarize" if $BINARIZE; + $pipeline .= " -mark-split" if $MARK_SPLIT; + $pipeline .= " |"; +} + +# Run the parsing + post-processing pipeline. +open(PARSE, $pipeline); +open(TMP_ORIGINAL, $tmpOriginal); +while () { + my $parsedLine = $_; + my $originalLine = ; + if ($UNPARSEABLE == 1 && length($parsedLine) == 1) { + print $originalLine; + } else { + print $parsedLine; + } +} +close(PARSE); + +`rm $tmpOriginal`; +`rm $tmpProcessed`; diff --git a/scripts/training/wrappers/senna2brackets.py b/scripts/training/wrappers/senna2brackets.py new file mode 100755 index 000000000..28fa6d2d7 --- /dev/null +++ b/scripts/training/wrappers/senna2brackets.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python + +# Read SENNA output (from stdin), extract the parse trees, and write them in +# PTB-style bracketed format (to stdout). +# +# The SENNA output is assumed to contain tokens in the first column, POS tags +# in the second column, and PSG fragments in the final column. +# +# It is also assumed that SENNA was run through the parse-en-senna.perl wrapper, +# which: +# +# - Substitutes the special "SENTENCE_TOO_LONG" token for sentences that +# exceed SENNA's hardcoded limit. +# +# - Replaces the bracket-like tokens "-LRB-", "-RRB-", etc. with "(", ")", +# etc. + +import optparse +import os +import sys + +def main(): + usage = "usage: %prog [options]" + parser = optparse.OptionParser(usage=usage) + parser.add_option("--berkeley-style", action="store_true", default=False, + dest="berkeley", + help="mimic the Berkeley Parser's output format") + (options, args) = parser.parse_args() + if len(args) > 0: + parser.error("incorrect number of arguments") + + tree = "" + for line in sys.stdin: + if line.strip() == "": + if not balanced(tree): + warn("unbalanced parentheses at line %d: " + "discarding tree" % line_num) + tree = "" + if tree == "" and options.berkeley: + print "(())" + else: + tree = beautify(tree) + if options.berkeley: + tree = berkelify(tree) + print tree + tree = "" + continue + tokens = line.split() + word, pos, frag = tokens[0], tokens[1], tokens[-1] + # Check for the special "SENTENCE_TOO_LONG" token (see + # parse-en-senna.perl) + if word == "SENTENCE_TOO_LONG": + continue + # Restore -LRB-, -RRB-, etc. 
+ if word == "(": + word = "-LRB-" + elif word == ")": + word = "-RRB-" + elif word == "[": + word = "-LSB-" + elif word == "]": + word = "-RSB-" + elif word == "{": + word = "-LCB-" + elif word == "}": + word = "-RCB-" + tree += frag.replace("*", "(%s %s)" % (pos, word)) + +def balanced(s): + num_left = 0 + num_right = 0 + for char in s: + if char == "(": + num_left += 1 + elif char == ")": + num_right += 1 + return num_left == num_right + +def beautify(tree): + s = tree.replace("(", " (") + return s.strip() + +def berkelify(tree): + if len(tree) == 0: + return tree + assert tree[0] == "(" + pos = tree.find(" (", 1) + assert pos != -1 + old_root = tree[1:pos] + return tree.replace(old_root, "TOP") + +def warn(msg): + prog_name = os.path.basename(sys.argv[0]) + sys.stderr.write("%s: warning: %s" % (prog_name, msg)) + sys.exit(1) + +if __name__ == "__main__": + main() From 842fc9780e8f86aea076cc9de39d4039039da904 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Wed, 27 May 2015 20:33:43 +0100 Subject: [PATCH 012/108] senna2brackets.py: bug fixes + clean-up --- scripts/training/wrappers/senna2brackets.py | 23 ++++++++++----------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/scripts/training/wrappers/senna2brackets.py b/scripts/training/wrappers/senna2brackets.py index 28fa6d2d7..4fc71ed44 100755 --- a/scripts/training/wrappers/senna2brackets.py +++ b/scripts/training/wrappers/senna2brackets.py @@ -30,19 +30,19 @@ def main(): parser.error("incorrect number of arguments") tree = "" + line_num = 0 for line in sys.stdin: + line_num += 1 + # Check for a blank line (the sentence delimiter). if line.strip() == "": if not balanced(tree): - warn("unbalanced parentheses at line %d: " + warn("unbalanced parentheses in tree ending at line %d: " "discarding tree" % line_num) tree = "" - if tree == "" and options.berkeley: - print "(())" - else: - tree = beautify(tree) - if options.berkeley: - tree = berkelify(tree) - print tree + tree = beautify(tree) + if options.berkeley: + tree = berkelify(tree) + print tree tree = "" continue tokens = line.split() @@ -81,8 +81,8 @@ def beautify(tree): return s.strip() def berkelify(tree): - if len(tree) == 0: - return tree + if tree == "": + return "(())" assert tree[0] == "(" pos = tree.find(" (", 1) assert pos != -1 @@ -91,8 +91,7 @@ def berkelify(tree): def warn(msg): prog_name = os.path.basename(sys.argv[0]) - sys.stderr.write("%s: warning: %s" % (prog_name, msg)) - sys.exit(1) + sys.stderr.write("%s: warning: %s\n" % (prog_name, msg)) if __name__ == "__main__": main() From 7ff1f9c06370ba10eb7951b86002fde171e97b7f Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Wed, 27 May 2015 20:45:55 +0100 Subject: [PATCH 013/108] Option bundling. 
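The patch below starts moving closely related decoder switches out of StaticData into small self-initialising bundles (NBestOptions, BookkeepingOptions), each of which parses only the parameters it owns. Reduced to its essentials, the pattern looks like this sketch (names follow the patch; it is not a copy of it):

    #include <cstddef>
    #include <string>

    namespace Moses {

    class Parameter;   // parsed command line / moses.ini, as in the patch

    // One bundle per concern; StaticData keeps one member of each type.
    struct NBestOptions
    {
      std::size_t nbest_size;
      std::size_t factor;
      bool enabled;        // true once an n-best list (or MBR, search-graph
                           // output, lattice samples, ...) is actually requested
      bool only_distinct;
      std::string output_file_path;

      bool init(Parameter const& param);  // reads -n-best-list, -n-best-factor, ...
    };

    } // namespace Moses

    // StaticData then forwards its accessors, e.g.
    //   bool IsNBestEnabled() const { return m_nbest_options.enabled; }
    // instead of re-deriving the answer from half a dozen separate members.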
--- moses/StaticData.cpp | 71 +++++++------------- moses/StaticData.h | 87 +++++++++++++++---------- moses/parameters/BookkeepingOptions.cpp | 18 +++++ moses/parameters/BookkeepingOptions.h | 15 +++++ moses/parameters/NBestOptions.cpp | 40 ++++++++++++ moses/parameters/NBestOptions.h | 29 +++++++++ 6 files changed, 179 insertions(+), 81 deletions(-) create mode 100644 moses/parameters/BookkeepingOptions.cpp create mode 100644 moses/parameters/BookkeepingOptions.h create mode 100644 moses/parameters/NBestOptions.cpp create mode 100644 moses/parameters/NBestOptions.h diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp index 9cf97657a..ac0c3c990 100644 --- a/moses/StaticData.cpp +++ b/moses/StaticData.cpp @@ -63,8 +63,8 @@ StaticData::StaticData() : m_sourceStartPosMattersForRecombination(false) , m_requireSortingAfterSourceContext(false) , m_inputType(SentenceInput) - , m_onlyDistinctNBest(false) - , m_needAlignmentInfo(false) + // , m_onlyDistinctNBest(false) + // , m_needAlignmentInfo(false) , m_lmEnableOOVFeature(false) , m_isAlwaysCreateDirectTranslationOption(false) , m_currentWeightSetting("default") @@ -203,25 +203,26 @@ StaticData //word-to-word alignment // alignments m_parameter->SetParameter(m_PrintAlignmentInfo, "print-alignment-info", false ); - if (m_PrintAlignmentInfo) { - m_needAlignmentInfo = true; - } + + // if (m_PrintAlignmentInfo) { // => now in BookkeepingOptions::init() + // m_needAlignmentInfo = true; + // } m_parameter->SetParameter(m_wordAlignmentSort, "sort-word-alignment", NoSort); - if (m_PrintAlignmentInfoNbest) { - m_needAlignmentInfo = true; - } + // if (m_PrintAlignmentInfoNbest) { // => now in BookkeepingOptions::init() + // m_needAlignmentInfo = true; + // } params = m_parameter->GetParam("alignment-output-file"); if (params && params->size()) { m_alignmentOutputFile = Scan(params->at(0)); - m_needAlignmentInfo = true; + // m_needAlignmentInfo = true; // => now in BookkeepingOptions::init() } m_parameter->SetParameter( m_PrintID, "print-id", false ); m_parameter->SetParameter( m_PrintPassthroughInformation, "print-passthrough", false ); - m_parameter->SetParameter( m_PrintPassthroughInformationInNBest, "print-passthrough-in-n-best", false ); + // m_parameter->SetParameter( m_PrintPassthroughInformationInNBest, "print-passthrough-in-n-best", false ); // => now in BookkeepingOptions::init() // word graph params = m_parameter->GetParam("output-word-graph"); @@ -327,41 +328,7 @@ bool StaticData ::ini_nbest_options() { - const PARAM_VEC *params; - // n-best - params = m_parameter->GetParam("n-best-list"); - if (params) { - if (params->size() >= 2) { - m_nBestFilePath = params->at(0); - m_nBestSize = Scan( params->at(1) ); - m_onlyDistinctNBest=(params->size()>2 && params->at(2)=="distinct"); - } else { - std::cerr << "wrong format for switch -n-best-list file size [disinct]"; - return false; - } - } else { - m_nBestSize = 0; - } - - m_parameter->SetParameter(m_nBestFactor, "n-best-factor", 20); - - - m_parameter->SetParameter(m_PrintAlignmentInfoNbest, - "print-alignment-info-in-n-best", false ); - - // include feature names in the n-best list - m_parameter->SetParameter(m_labeledNBestList, "labeled-n-best-list", true ); - - // include word alignment in the n-best list - m_parameter->SetParameter(m_nBestIncludesSegmentation, - "include-segmentation-in-n-best", false ); - - // print all factors of output translations - m_parameter->SetParameter(m_reportAllFactorsNBest, - "report-all-factors-in-n-best", false ); - - 
m_parameter->SetParameter(m_printNBestTrees, "n-best-trees", false ); - return true; + return m_nbest_options.init(*m_parameter); } void @@ -625,8 +592,9 @@ bool StaticData::LoadData(Parameter *parameter) // input, output ini_factor_maps(); ini_input_options(); + m_bookkeeping_options.init(*parameter); + m_nbest_options.init(*parameter); // if (!ini_nbest_options()) return false; if (!ini_output_options()) return false; - if (!ini_nbest_options()) return false; // threading etc. if (!ini_performance_options()) return false; @@ -647,6 +615,17 @@ bool StaticData::LoadData(Parameter *parameter) ini_mira_options(); + // set m_nbest_options.enabled = true if necessary: + if (m_mbr || m_useLatticeMBR || m_outputSearchGraph || m_outputSearchGraphSLF + || m_mira || m_outputSearchGraphHypergraph || m_useConsensusDecoding +#ifdef HAVE_PROTOBUF + || m_outputSearchGraphPB +#endif + || m_latticeSamplesFilePath.size()) + { + m_nbest_options.enabled = true; + } + // S2T decoder m_parameter->SetParameter(m_s2tParsingAlgorithm, "s2t-parsing-algorithm", RecursiveCYKPlus); diff --git a/moses/StaticData.h b/moses/StaticData.h index 7e71f0881..2b1e37b83 100644 --- a/moses/StaticData.h +++ b/moses/StaticData.h @@ -45,6 +45,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "moses/PP/Factory.h" #include "moses/parameters/ContextParameters.h" +#include "moses/parameters/NBestOptions.h" +#include "moses/parameters/BookkeepingOptions.h" namespace Moses { @@ -95,18 +97,21 @@ protected: // 0 = no disortion (monotone in old pharaoh) bool m_reorderingConstraint; //! use additional reordering constraints bool m_useEarlyDistortionCost; - size_t - m_maxHypoStackSize //! hypothesis-stack size that triggers pruning - , m_minHypoStackDiversity //! minimum number of hypothesis in stack for each source word coverage - , m_nBestSize - , m_latticeSamplesSize - , m_nBestFactor - , m_maxNoTransOptPerCoverage - , m_maxNoPartTransOpt - , m_maxPhraseLength; + size_t m_maxHypoStackSize; //! hypothesis-stack size that triggers pruning + size_t m_minHypoStackDiversity; //! minimum number of hypothesis in stack for each source word coverage; + NBestOptions m_nbest_options; + BookkeepingOptions m_bookkeeping_options; + // size_t m_nBestSize; + // size_t m_nBestFactor; + + size_t m_latticeSamplesSize; + size_t m_maxNoTransOptPerCoverage; + size_t m_maxNoPartTransOpt; + size_t m_maxPhraseLength; - std::string m_nBestFilePath, m_latticeSamplesFilePath; - bool m_labeledNBestList,m_nBestIncludesSegmentation; + // std::string m_nBestFilePath; + std::string m_latticeSamplesFilePath; + // bool m_labeledNBestList,m_nBestIncludesSegmentation; bool m_dropUnknown; //! false = treat unknown words as unknowns, and translate them as themselves; true = drop (ignore) them bool m_markUnknown; //! 
false = treat unknown words as unknowns, and translate them as themselves; true = mark and (ignore) them bool m_wordDeletionEnabled; @@ -128,21 +133,21 @@ protected: bool m_reportSegmentation; bool m_reportSegmentationEnriched; bool m_reportAllFactors; - bool m_reportAllFactorsNBest; + // bool m_reportAllFactorsNBest; std::string m_detailedTranslationReportingFilePath; std::string m_detailedTreeFragmentsTranslationReportingFilePath; //DIMw std::string m_detailedAllTranslationReportingFilePath; - bool m_onlyDistinctNBest; + // bool m_onlyDistinctNBest; bool m_PrintAlignmentInfo; - bool m_needAlignmentInfo; - bool m_PrintAlignmentInfoNbest; + // bool m_needAlignmentInfo; // => BookkeepingOptions + // bool m_PrintAlignmentInfoNbest; bool m_PrintID; bool m_PrintPassthroughInformation; - bool m_PrintPassthroughInformationInNBest; + // bool m_PrintPassthroughInformationInNBest; std::string m_alignmentOutputFile; @@ -214,7 +219,7 @@ protected: bool m_useLegacyPT; bool m_defaultNonTermOnlyForEmptyRange; S2TParsingAlgorithm m_s2tParsingAlgorithm; - bool m_printNBestTrees; + // bool m_printNBestTrees; FeatureRegistry m_registry; PhrasePropertyFactory m_phrasePropertyFactory; @@ -361,7 +366,8 @@ public: return m_PrintPassthroughInformation; } bool IsPassthroughInNBestEnabled() const { - return m_PrintPassthroughInformationInNBest; + return m_nbest_options.include_passthrough; + // return m_PrintPassthroughInformationInNBest; } int GetMaxDistortion() const { return m_maxDistortion; @@ -410,7 +416,8 @@ public: return m_reportAllFactors; } bool GetReportAllFactorsNBest() const { - return m_reportAllFactorsNBest; + return m_nbest_options.include_all_factors; + // return m_reportAllFactorsNBest; } bool IsDetailedTranslationReportingEnabled() const { return !m_detailedTranslationReportingFilePath.empty(); @@ -430,7 +437,8 @@ public: return m_detailedTreeFragmentsTranslationReportingFilePath; } bool IsLabeledNBestList() const { - return m_labeledNBestList; + return m_nbest_options.include_feature_labels; + // return m_labeledNBestList; } bool UseMinphrInMemory() const { @@ -443,21 +451,24 @@ public: // for mert size_t GetNBestSize() const { - return m_nBestSize; + return m_nbest_options.nbest_size; + // return m_nBestSize; } const std::string &GetNBestFilePath() const { - return m_nBestFilePath; + return m_nbest_options.output_file_path; + // return m_nBestFilePath; } bool IsNBestEnabled() const { - return (!m_nBestFilePath.empty() || m_mbr || m_useLatticeMBR || m_mira || - m_outputSearchGraph || m_outputSearchGraphSLF || - m_outputSearchGraphHypergraph || m_useConsensusDecoding || -#ifdef HAVE_PROTOBUF - m_outputSearchGraphPB || -#endif - !m_latticeSamplesFilePath.empty()); + return m_nbest_options.enabled; + // return (!m_nBestFilePath.empty() || m_mbr || m_useLatticeMBR || m_mira || + // m_outputSearchGraph || m_outputSearchGraphSLF || + // m_outputSearchGraphHypergraph || m_useConsensusDecoding || + // #ifdef HAVE_PROTOBUF + // m_outputSearchGraphPB || + // #endif + // !m_latticeSamplesFilePath.empty()); } size_t GetLatticeSamplesSize() const { @@ -469,7 +480,8 @@ public: } size_t GetNBestFactor() const { - return m_nBestFactor; + return m_nbest_options.factor; + // return m_nBestFactor; } bool GetOutputWordGraph() const { return m_outputWordGraph; @@ -527,7 +539,8 @@ public: void SetWeights(const FeatureFunction* sp, const std::vector& weights); bool GetDistinctNBest() const { - return m_onlyDistinctNBest; + return m_nbest_options.only_distinct; + // return m_onlyDistinctNBest; } const std::string& 
GetFactorDelimiter() const { return m_factorDelimiter; @@ -692,7 +705,8 @@ public: const std::string &GetBinDirectory() const; bool NeedAlignmentInfo() const { - return m_needAlignmentInfo; + return m_bookkeeping_options.need_alignment_info; + // return m_needAlignmentInfo; } const std::string &GetAlignmentOutputFile() const { return m_alignmentOutputFile; @@ -701,14 +715,16 @@ public: return m_PrintAlignmentInfo; } bool PrintAlignmentInfoInNbest() const { - return m_PrintAlignmentInfoNbest; + return m_nbest_options.include_alignment_info; + // return m_PrintAlignmentInfoNbest; } WordAlignmentSort GetWordAlignmentSort() const { return m_wordAlignmentSort; } bool NBestIncludesSegmentation() const { - return m_nBestIncludesSegmentation; + return m_nbest_options.include_segmentation; + // return m_nBestIncludesSegmentation; } bool GetHasAlternateWeightSettings() const { @@ -849,7 +865,8 @@ public: } bool PrintNBestTrees() const { - return m_printNBestTrees; + return m_nbest_options.print_trees; + // return m_printNBestTrees; } bool RequireSortingAfterSourceContext() const { diff --git a/moses/parameters/BookkeepingOptions.cpp b/moses/parameters/BookkeepingOptions.cpp new file mode 100644 index 000000000..875c605bf --- /dev/null +++ b/moses/parameters/BookkeepingOptions.cpp @@ -0,0 +1,18 @@ +#include "BookkeepingOptions.h" + +namespace Moses { + bool + BookkeepingOptions:: + init(Parameter const& P) + { + bool& x = need_alignment_info; + P.SetParameter(x, "print-alignment-info", false); + if (!x) P.SetParameter(x, "print-alignment-info-in-n-best", false); + if (!x) + { + PARAM_VEC const* params = P.GetParam("alignment-output-file"); + x = params && params->size(); + } + return true; + } +} diff --git a/moses/parameters/BookkeepingOptions.h b/moses/parameters/BookkeepingOptions.h new file mode 100644 index 000000000..8e800c587 --- /dev/null +++ b/moses/parameters/BookkeepingOptions.h @@ -0,0 +1,15 @@ +// -*- mode: c++; cc-style: gnu -*- +#include "moses/Parameter.h" +// #include + +namespace Moses { + + struct BookkeepingOptions + { + bool need_alignment_info; + bool init(Parameter const& param); + }; + + + +} diff --git a/moses/parameters/NBestOptions.cpp b/moses/parameters/NBestOptions.cpp new file mode 100644 index 000000000..6ec97c91b --- /dev/null +++ b/moses/parameters/NBestOptions.cpp @@ -0,0 +1,40 @@ +// -*- mode: c++; cc-style: gnu -*- +#include "moses/Parameter.h" +#include "NBestOptions.h" + +namespace Moses { + +bool +NBestOptions:: +init(Parameter const& P) +{ + const PARAM_VEC *params; + params = P.GetParam("n-best-list"); + if (params) + { + if (params->size() >= 2) + { + output_file_path = params->at(0); + nbest_size = Scan( params->at(1) ); + only_distinct = (params->size()>2 && params->at(2)=="distinct"); + } + else + { + std::cerr << "wrong format for switch -n-best-list file size [disinct]"; + return false; + } + } + else nbest_size = 0; + + P.SetParameter(factor, "n-best-factor", 20); + P.SetParameter(include_alignment_info, "print-alignment-info-in-n-best", false ); + P.SetParameter(include_feature_labels, "labeled-n-best-list", true ); + P.SetParameter(include_segmentation, "include-segmentation-in-n-best", false ); + P.SetParameter(include_passthrough, "print-passthrough-in-n-best", false ); + P.SetParameter(include_all_factors, "report-all-factors-in-n-best", false ); + P.SetParameter(print_trees, "n-best-trees", false ); + + enabled = output_file_path.size(); + return true; +} +} // namespace Moses diff --git a/moses/parameters/NBestOptions.h 
b/moses/parameters/NBestOptions.h new file mode 100644 index 000000000..e844c1eac --- /dev/null +++ b/moses/parameters/NBestOptions.h @@ -0,0 +1,29 @@ +// -*- mode: c++; cc-style: gnu -*- +#include + +namespace Moses { + + struct NBestOptions + { + size_t nbest_size; + size_t factor; + bool enabled; + bool print_trees; + bool only_distinct; + + bool include_alignment_info; + bool include_segmentation; + bool include_feature_labels; + bool include_passthrough; + + bool include_all_factors; + + std::string output_file_path; + + bool init(Parameter const& param); + + }; + + + +} From ab2d396781e57d8d7e3526d102db24597e289fab Mon Sep 17 00:00:00 2001 From: Barry Haddow Date: Thu, 28 May 2015 17:10:21 +0100 Subject: [PATCH 014/108] Min score parameter --- scripts/training/binarize-model.perl | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/training/binarize-model.perl b/scripts/training/binarize-model.perl index 0239f5fc8..cca74f1ab 100755 --- a/scripts/training/binarize-model.perl +++ b/scripts/training/binarize-model.perl @@ -17,12 +17,14 @@ if ($SCRIPTS_ROOTDIR eq '') { } $SCRIPTS_ROOTDIR =~ s/\/training$//; -my ($binarizer, $input_config, $output_config); +my ($binarizer, $input_config, $output_config, $min_score); my $opt_hierarchical = 0; -$binarizer = "$SCRIPTS_ROOTDIR/../bin/processPhraseTable"; +$binarizer = "$SCRIPTS_ROOTDIR/../bin/processPhraseTableMin"; +$min_score = "0"; GetOptions( "Hierarchical" => \$opt_hierarchical, - "Binarizer=s" => \$binarizer + "Binarizer=s" => \$binarizer, + "MinScore=s" => \$min_score ) or exit(1); $input_config = shift; @@ -37,7 +39,9 @@ my $hierarchical = ""; $hierarchical = "-Hierarchical" if $opt_hierarchical; my $targetdir = "$output_config.tables"; -safesystem("$RealBin/filter-model-given-input.pl $targetdir $input_config /dev/null $hierarchical -nofilter -Binarizer $binarizer") || die "binarising failed"; +my $cmd = "$RealBin/filter-model-given-input.pl $targetdir $input_config /dev/null $hierarchical -nofilter -Binarizer $binarizer"; +$cmd .= "--MinScore $min_score" if (defined $min_score); +safesystem($cmd) || die "binarising failed"; safesystem("rm -f $output_config; ln -s $targetdir/moses.ini $output_config") || die "failed to link new ini file"; #FIXME: Why isn't this in a module? From c27aa193eaa3c73754c8d90dea0cd32dd5a22e7d Mon Sep 17 00:00:00 2001 From: Barry Haddow Date: Thu, 28 May 2015 17:44:26 +0100 Subject: [PATCH 015/108] Revert "Min score parameter". Doesn't work without filter. This reverts commit ab2d396781e57d8d7e3526d102db24597e289fab. 
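As an aside on the two patches above: the reverted change appends the minimum-score flag directly onto the command string ($cmd .= "--MinScore $min_score") with no separating space, so it ends up glued to the -Binarizer argument. The fragment below is only an illustrative sketch of how such an optional flag could be appended from an argument list instead; it is not part of the patch series, it reuses the variables defined in binarize-model.perl as shown in the patch, and the -MinScore spelling is taken from the patch itself rather than checked against filter-model-given-input.pl.

# Sketch only -- not part of the patches above.
# $RealBin, $targetdir, $input_config, $opt_hierarchical, $binarizer,
# $min_score and safesystem() are the names used in binarize-model.perl;
# the -MinScore flag name is an assumption taken from the patch.
my @cmd = ("$RealBin/filter-model-given-input.pl",
           $targetdir, $input_config, "/dev/null");
push @cmd, "-Hierarchical" if $opt_hierarchical;
push @cmd, "-nofilter", "-Binarizer", $binarizer;
push @cmd, "-MinScore", $min_score if defined $min_score;
safesystem(join(" ", @cmd)) || die "binarising failed";

Building the command as a list and joining it once at the end keeps a separator between every token, so a conditionally appended flag cannot run into the argument before it.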
--- scripts/training/binarize-model.perl | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/scripts/training/binarize-model.perl b/scripts/training/binarize-model.perl index cca74f1ab..0239f5fc8 100755 --- a/scripts/training/binarize-model.perl +++ b/scripts/training/binarize-model.perl @@ -17,14 +17,12 @@ if ($SCRIPTS_ROOTDIR eq '') { } $SCRIPTS_ROOTDIR =~ s/\/training$//; -my ($binarizer, $input_config, $output_config, $min_score); +my ($binarizer, $input_config, $output_config); my $opt_hierarchical = 0; -$binarizer = "$SCRIPTS_ROOTDIR/../bin/processPhraseTableMin"; -$min_score = "0"; +$binarizer = "$SCRIPTS_ROOTDIR/../bin/processPhraseTable"; GetOptions( "Hierarchical" => \$opt_hierarchical, - "Binarizer=s" => \$binarizer, - "MinScore=s" => \$min_score + "Binarizer=s" => \$binarizer ) or exit(1); $input_config = shift; @@ -39,9 +37,7 @@ my $hierarchical = ""; $hierarchical = "-Hierarchical" if $opt_hierarchical; my $targetdir = "$output_config.tables"; -my $cmd = "$RealBin/filter-model-given-input.pl $targetdir $input_config /dev/null $hierarchical -nofilter -Binarizer $binarizer"; -$cmd .= "--MinScore $min_score" if (defined $min_score); -safesystem($cmd) || die "binarising failed"; +safesystem("$RealBin/filter-model-given-input.pl $targetdir $input_config /dev/null $hierarchical -nofilter -Binarizer $binarizer") || die "binarising failed"; safesystem("rm -f $output_config; ln -s $targetdir/moses.ini $output_config") || die "failed to link new ini file"; #FIXME: Why isn't this in a module? From 26170a41790bc1dfbc01c90dbcbf2699a0fe3cd0 Mon Sep 17 00:00:00 2001 From: Jeroen Vermeulen Date: Fri, 29 May 2015 09:37:37 +0700 Subject: [PATCH 016/108] Friendlier error reporting in beautify.py. --- scripts/other/beautify.py | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/scripts/other/beautify.py b/scripts/other/beautify.py index f03a58ce7..0caa6b162 100755 --- a/scripts/other/beautify.py +++ b/scripts/other/beautify.py @@ -38,6 +38,17 @@ BEAUTIFY_IGNORE = '.beautify-ignore' class LintCheckFailure(Exception): """Lint was found, or the lint checker otherwise returned failure.""" + exit_code = 1 + + +class ProgramFailure(Exception): + """The program failed, but it's not a bug. No traceback.""" + exit_code = 2 + + +class CommandLineError(Exception): + """Something wrong with the command-line arguments.""" + exit_code = 3 def read_ignore_file(root_dir): @@ -52,7 +63,7 @@ def read_ignore_file(root_dir): ignore_contents = ignore_file.read() except IOError as error: if error.errno == ENOENT: - raise Exception( + raise ProgramFailure( "No .gitignore file found in %s. " "Is it really the project's root directory?" % root_dir) @@ -200,7 +211,7 @@ def check_astyle_version(verbose=False): ['astyle', '--version'], verbose=verbose, env={'LC_ALL': 'C'}) version = version.strip() if version != EXPECTED_ASTYLE_VERSION: - raise Exception( + raise ProgramFailure( "Wrong astyle version. " "Expected '%s', but got version string '%s'." % (EXPECTED_ASTYLE_VERSION, version)) @@ -226,8 +237,15 @@ def run_perltidy(source_files, verbose=False, dry_run=False): # Write "} else {", with 'else' on the same line as the braces. '--cuddled-else', ] - _, stderr = run_command( - command_line + source_files, verbose=verbose, dry_run=dry_run) + try: + _, stderr = run_command( + command_line + source_files, verbose=verbose, dry_run=dry_run) + except OSError as error: + if error.errno == ENOENT: + raise ProgramFailure( + "Could not run 'perltidy'. 
Make sure that it is installed.") + else: + raise if stderr != '': sys.stderr.write(stderr) @@ -386,7 +404,7 @@ def main(): """Find and format source files.""" args = parse_arguments() if not args.format and not args.lint: - raise Exception("Select action: --format, --lint, or both.") + raise CommandLineError("Select action: --format, --lint, or both.") ignore = read_ignore_file(args.root_dir) @@ -409,8 +427,8 @@ def main(): if __name__ == '__main__': try: main() - except LintCheckFailure as error: + except (CommandLineError, LintCheckFailure, ProgramFailure) as error: # This is a failure, but not a bug. Print a friendly error # message, not a traceback. sys.stderr.write('%s\n' % error) - sys.exit(1) + sys.exit(error.exit_code) From ef028446f3640e007215b4576a4dc52a9c9de6db Mon Sep 17 00:00:00 2001 From: Jeroen Vermeulen Date: Fri, 29 May 2015 18:30:26 +0700 Subject: [PATCH 017/108] Add license notices to scripts. This is not pleasant to read (and much, much less pleasant to write!) but sort of necessary in an open project. Right now it's quite hard to figure out what is licensed how, which doesn't matter much to most people but can suddenly become very important when people want to know what they're being allowed to do. I kept the notices as short as I could. As far as I could see, everything without a clear license notice is LGPL v2.1 or later. --- scripts/OSM/OSM-Train.perl | 3 ++ scripts/OSM/extract-singletons.perl | 3 ++ scripts/OSM/flipAlignment.perl | 3 ++ scripts/Transliteration/clean.pl | 5 ++- scripts/Transliteration/corpusCreator.pl | 3 ++ .../in-decoding-transliteration.pl | 3 ++ .../post-decoding-transliteration.pl | 3 ++ .../prepare-transliteration-phrase-table.pl | 3 ++ scripts/Transliteration/threshold.pl | 3 ++ .../train-transliteration-module.pl | 3 ++ ...trap-hypothesis-difference-significance.pl | 3 ++ scripts/analysis/extract-target-trees.py | 12 ++++-- scripts/analysis/nontranslated_words.pl | 3 ++ scripts/analysis/oov.pl | 3 ++ scripts/analysis/sentence-by-sentence.pl | 3 ++ scripts/analysis/sg2dot.perl | 2 + scripts/analysis/show-phrases-used.pl | 3 ++ scripts/analysis/smtgui/Corpus.pm | 3 ++ .../analysis/smtgui/filter-phrase-table.pl | 3 ++ scripts/analysis/smtgui/newsmtgui.cgi | 3 ++ scripts/analysis/suspicious_tokenization.pl | 3 ++ scripts/analysis/weight-scan-summarize.sh | 4 ++ scripts/analysis/weight-scan.pl | 4 ++ scripts/ems/experiment.perl | 3 ++ scripts/ems/fix-info.perl | 3 ++ scripts/ems/support/analysis.perl | 3 ++ scripts/ems/support/berkeley-process.sh | 3 ++ scripts/ems/support/berkeley-train.sh | 3 ++ .../build-domain-file-from-subcorpora.perl | 3 ++ .../ems/support/build-sparse-features.perl | 3 ++ .../support/consolidate-training-data.perl | 3 ++ scripts/ems/support/defaultconfig.py | 3 ++ scripts/ems/support/fast-align-in-parts.perl | 3 ++ .../generic-multicore-parallelizer.perl | 3 ++ scripts/ems/support/generic-parallelizer.perl | 3 ++ scripts/ems/support/input-from-sgm.perl | 3 ++ scripts/ems/support/interpolate-lm.perl | 3 ++ scripts/ems/support/lmplz-wrapper.perl | 3 ++ scripts/ems/support/mml-filter.perl | 3 ++ scripts/ems/support/mml-score.perl | 3 ++ scripts/ems/support/mml-train.perl | 3 ++ scripts/ems/support/prepare-fast-align.perl | 3 ++ scripts/ems/support/reference-from-sgm.perl | 3 ++ .../support/remove-segmentation-markup.perl | 3 ++ .../ems/support/report-experiment-scores.perl | 3 ++ .../run-command-on-multiple-refsets.perl | 3 ++ scripts/ems/support/run-wade.perl | 3 ++ scripts/ems/support/split-sentences.perl | 3 ++ 
scripts/ems/support/submit-grid.perl | 3 ++ ...ubstitute-filtered-tables-and-weights.perl | 3 ++ .../support/substitute-filtered-tables.perl | 3 ++ scripts/ems/support/substitute-weights.perl | 3 ++ .../ems/support/symmetrize-fast-align.perl | 3 ++ scripts/ems/support/thot-lm-wrapper.perl | 3 ++ .../ems/support/tree-converter-wrapper.perl | 3 ++ scripts/ems/support/wrap-xml.perl | 3 ++ scripts/ems/web/analysis.php | 5 +++ scripts/ems/web/analysis_diff.php | 4 ++ scripts/ems/web/diff.php | 5 +++ scripts/ems/web/hierarchical-segmentation.js | 4 ++ scripts/ems/web/index.php | 5 +++ scripts/ems/web/lib.php | 5 +++ scripts/ems/web/overview.php | 4 ++ scripts/ems/web/progress.perl | 3 ++ scripts/ems/web/sgviz.js | 4 ++ scripts/ems/web/sgviz.php | 6 +++ scripts/fuzzy-match/create_xml.perl | 3 ++ scripts/generic/bsbleu.py | 3 ++ scripts/generic/compound-splitter.perl | 3 ++ scripts/generic/extract-factors.pl | 3 ++ scripts/generic/extract-parallel.perl | 3 ++ scripts/generic/fsa2fsal.pl | 3 ++ scripts/generic/fsa2plf.pl | 3 ++ scripts/generic/fsal2fsa.pl | 3 ++ scripts/generic/generic-parallel.perl | 3 ++ scripts/generic/giza-parallel.perl | 3 ++ scripts/generic/lopar2pos.pl | 3 ++ scripts/generic/moses-parallel.pl | 3 ++ scripts/generic/moses_sim_pe.py | 29 ++++++++------ scripts/generic/mteval-v12.pl | 3 ++ scripts/generic/mteval-v13a.pl | 3 ++ scripts/generic/multi-bleu.perl | 3 ++ scripts/generic/ph_numbers.perl | 3 ++ scripts/generic/qsub-wrapper.pl | 3 ++ scripts/generic/reverse-alignment.perl | 3 ++ scripts/generic/score-parallel.perl | 3 ++ scripts/generic/strip-xml.perl | 3 ++ scripts/generic/trainlm-irst2.perl | 3 ++ scripts/other/beautify.py | 5 +++ scripts/other/convert-pt.perl | 3 ++ scripts/other/delete-scores.perl | 3 ++ scripts/other/gacha_filter.py | 3 ++ .../get_many_translations_from_google.perl | 3 ++ scripts/other/retain-lines.perl | 3 ++ .../other/translate_by_microsoft_bing.perl | 3 ++ scripts/recaser/detruecase.perl | 3 ++ scripts/recaser/recase.perl | 3 ++ scripts/recaser/train-recaser.perl | 3 ++ scripts/recaser/train-truecaser.perl | 3 ++ scripts/recaser/truecase.perl | 3 ++ .../MosesScriptsRegressionTesting.pm | 3 ++ scripts/regression-testing/compare-results.pl | 3 ++ .../create_localized_moses_ini.pl | 3 ++ scripts/regression-testing/modify-pars.pl | 3 ++ scripts/regression-testing/moses-virtual.pl | 3 ++ scripts/regression-testing/run-single-test.pl | 3 ++ scripts/regression-testing/run-test-suite.pl | 3 ++ scripts/server/moses.py | 3 ++ scripts/server/sim-pe.py | 10 ++++- .../tokenizer/deescape-special-chars-PTB.perl | 3 ++ scripts/tokenizer/deescape-special-chars.perl | 3 ++ scripts/tokenizer/detokenizer.perl | 3 ++ scripts/tokenizer/escape-special-chars.perl | 3 ++ scripts/tokenizer/lowercase.perl | 3 ++ scripts/tokenizer/normalize-punctuation.perl | 3 ++ scripts/tokenizer/pre-tok-clean.perl | 3 ++ scripts/tokenizer/pre-tokenizer.perl | 3 ++ scripts/tokenizer/pre_tokenize_cleaning.py | 3 ++ .../tokenizer/remove-non-printing-char.perl | 3 ++ .../replace-unicode-punctuation.perl | 3 ++ scripts/tokenizer/tokenizer.perl | 3 ++ scripts/tokenizer/tokenizer_PTB.perl | 3 ++ scripts/training/LexicalTranslationModel.pm | 3 ++ scripts/training/absolutize_moses_model.pl | 3 ++ scripts/training/analyse_moses_model.pl | 3 ++ .../bilingual-lm/averageNullEmbedding.py | 3 ++ scripts/training/bilingual-lm/extract.py | 3 ++ scripts/training/bilingual-lm/extract_test.py | 3 ++ .../training/bilingual-lm/extract_training.py | 3 ++ .../training/bilingual-lm/reduce_ngrams.py | 5 ++- 
scripts/training/bilingual-lm/test_nplm.py | 3 ++ scripts/training/bilingual-lm/train_nplm.py | 3 ++ scripts/training/binarize-model.perl | 3 ++ scripts/training/build-generation-table.perl | 3 ++ scripts/training/build-mmsapt.perl | 3 ++ scripts/training/clean-corpus-n.perl | 3 ++ scripts/training/clone_moses_model.pl | 3 ++ scripts/training/combine_factors.pl | 3 ++ scripts/training/convert-moses-ini-to-v2.perl | 3 ++ .../training/convert-moses-ini-v2-to-v1.py | 3 ++ scripts/training/corpus-sizes.perl | 3 ++ scripts/training/create_count_tables.py | 3 ++ scripts/training/exodus.perl | 3 ++ scripts/training/filter-model-given-input.pl | 3 ++ scripts/training/filter-rule-table.py | 40 ++++++++++--------- scripts/training/flexibility_score.py | 3 ++ scripts/training/giza2bal.pl | 3 ++ scripts/training/mert-moses.pl | 4 ++ scripts/training/postprocess-lopar.perl | 3 ++ .../training/rdlm/average_null_embedding.py | 3 ++ .../training/rdlm/extract_syntactic_ngrams.py | 3 ++ scripts/training/rdlm/extract_vocab.py | 3 ++ scripts/training/rdlm/train_rdlm.py | 3 ++ scripts/training/reduce-factors.perl | 3 ++ scripts/training/reduce-topt-count.pl | 3 ++ scripts/training/reduce_combine.pl | 3 ++ ...an-phrase-pairs-from-reordering-table.perl | 3 ++ scripts/training/threshold-filter.perl | 3 ++ .../training/train-global-lexicon-model.perl | 3 ++ scripts/training/train-model.perl | 3 ++ scripts/training/train-neurallm.py | 8 +++- .../adam-suffix-array/suffix-array-create.sh | 3 ++ .../adam-suffix-array/suffix-array-extract.sh | 3 ++ .../wrappers/berkeleyparsed2mosesxml.perl | 3 ++ .../wrappers/berkeleyparsed2mosesxml_PTB.perl | 3 ++ scripts/training/wrappers/conll2mosesxml.py | 3 ++ .../wrappers/filter-excluded-lines.perl | 3 ++ .../training/wrappers/find-unparseable.perl | 3 ++ scripts/training/wrappers/mada-wrapper.perl | 3 ++ scripts/training/wrappers/madamira-tok.perl | 3 ++ .../training/wrappers/madamira-wrapper.perl | 3 ++ .../make-factor-brown-cluster-mkcls.perl | 3 ++ .../wrappers/make-factor-de-lemma.perl | 3 ++ .../wrappers/make-factor-de-morph.perl | 3 ++ .../training/wrappers/make-factor-de-pos.perl | 3 ++ .../wrappers/make-factor-en-porter.perl | 3 ++ .../wrappers/make-factor-en-pos.mxpost.perl | 3 ++ .../wrappers/make-factor-pos.tree-tagger.perl | 3 ++ .../training/wrappers/make-factor-stem.perl | 3 ++ .../training/wrappers/make-factor-suffix.perl | 3 ++ .../training/wrappers/morfessor-wrapper.perl | 3 ++ .../wrappers/mosesxml2berkeleyparsed.perl | 3 ++ .../training/wrappers/mosesxml2brackets.py | 5 ++- .../training/wrappers/parse-de-berkeley.perl | 3 ++ .../training/wrappers/parse-de-bitpar.perl | 3 ++ .../training/wrappers/parse-en-collins.perl | 3 ++ scripts/training/wrappers/parse-en-egret.perl | 3 ++ scripts/training/wrappers/parse-en-senna.perl | 3 ++ .../training/wrappers/parse-en-stanford.py | 12 ++++-- scripts/training/wrappers/senna2brackets.py | 33 ++++++++------- .../wrappers/syntax-hyphen-splitting.perl | 3 ++ .../wrappers/tagger-german-chunk.perl | 3 ++ 192 files changed, 666 insertions(+), 58 deletions(-) diff --git a/scripts/OSM/OSM-Train.perl b/scripts/OSM/OSM-Train.perl index 895a821db..07ad71f68 100755 --- a/scripts/OSM/OSM-Train.perl +++ b/scripts/OSM/OSM-Train.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
use warnings; use strict; diff --git a/scripts/OSM/extract-singletons.perl b/scripts/OSM/extract-singletons.perl index 5a1665a8c..6295edfad 100755 --- a/scripts/OSM/extract-singletons.perl +++ b/scripts/OSM/extract-singletons.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. #use strict; use warnings; diff --git a/scripts/OSM/flipAlignment.perl b/scripts/OSM/flipAlignment.perl index b896c0a23..57a1e9bb0 100755 --- a/scripts/OSM/flipAlignment.perl +++ b/scripts/OSM/flipAlignment.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/Transliteration/clean.pl b/scripts/Transliteration/clean.pl index ccc364fc9..7a08271da 100755 --- a/scripts/Transliteration/clean.pl +++ b/scripts/Transliteration/clean.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. #input hindi word urdu word, delete all those entries that have number on any side use warnings; @@ -314,4 +317,4 @@ sub charFreqFilter{ } } } -} \ No newline at end of file +} diff --git a/scripts/Transliteration/corpusCreator.pl b/scripts/Transliteration/corpusCreator.pl index 4c62449df..ac67f5d74 100755 --- a/scripts/Transliteration/corpusCreator.pl +++ b/scripts/Transliteration/corpusCreator.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/Transliteration/in-decoding-transliteration.pl b/scripts/Transliteration/in-decoding-transliteration.pl index c3cc31f26..e8130db02 100755 --- a/scripts/Transliteration/in-decoding-transliteration.pl +++ b/scripts/Transliteration/in-decoding-transliteration.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/Transliteration/post-decoding-transliteration.pl b/scripts/Transliteration/post-decoding-transliteration.pl index 60c3200f6..2c7908085 100755 --- a/scripts/Transliteration/post-decoding-transliteration.pl +++ b/scripts/Transliteration/post-decoding-transliteration.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/Transliteration/prepare-transliteration-phrase-table.pl b/scripts/Transliteration/prepare-transliteration-phrase-table.pl index df3b1ceca..0a9f554c5 100755 --- a/scripts/Transliteration/prepare-transliteration-phrase-table.pl +++ b/scripts/Transliteration/prepare-transliteration-phrase-table.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
use warnings; use strict; diff --git a/scripts/Transliteration/threshold.pl b/scripts/Transliteration/threshold.pl index bf6657742..3baa8e0a7 100755 --- a/scripts/Transliteration/threshold.pl +++ b/scripts/Transliteration/threshold.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use utf8; diff --git a/scripts/Transliteration/train-transliteration-module.pl b/scripts/Transliteration/train-transliteration-module.pl index 35e4ee396..b1d4d0ff5 100755 --- a/scripts/Transliteration/train-transliteration-module.pl +++ b/scripts/Transliteration/train-transliteration-module.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use utf8; diff --git a/scripts/analysis/bootstrap-hypothesis-difference-significance.pl b/scripts/analysis/bootstrap-hypothesis-difference-significance.pl index 8e6a6255a..9a3f63d69 100755 --- a/scripts/analysis/bootstrap-hypothesis-difference-significance.pl +++ b/scripts/analysis/bootstrap-hypothesis-difference-significance.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use utf8; ############################################### diff --git a/scripts/analysis/extract-target-trees.py b/scripts/analysis/extract-target-trees.py index 3a92fdc4d..7166211d9 100755 --- a/scripts/analysis/extract-target-trees.py +++ b/scripts/analysis/extract-target-trees.py @@ -1,9 +1,13 @@ #!/usr/bin/env python - -# Usage: extract-target-trees.py [FILE] # -# Reads moses-chart's -T output from FILE or standard input and writes trees to -# standard output in Moses' XML tree format. +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + +"""Usage: extract-target-trees.py [FILE] + +Reads moses-chart's -T output from FILE or standard input and writes trees to +standard output in Moses' XML tree format. +""" import re import sys diff --git a/scripts/analysis/nontranslated_words.pl b/scripts/analysis/nontranslated_words.pl index 51a4f9d20..7213deb76 100755 --- a/scripts/analysis/nontranslated_words.pl +++ b/scripts/analysis/nontranslated_words.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ # Reads a source and hypothesis file and counts equal tokens. Some of these diff --git a/scripts/analysis/oov.pl b/scripts/analysis/oov.pl index 052c9994d..9756887c9 100755 --- a/scripts/analysis/oov.pl +++ b/scripts/analysis/oov.pl @@ -1,6 +1,9 @@ #!/usr/bin/env perl # Display OOV rate of a test set against a training corpus or a phrase table. # Ondrej Bojar +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use strict; use warnings; diff --git a/scripts/analysis/sentence-by-sentence.pl b/scripts/analysis/sentence-by-sentence.pl index 72b70dc72..b9eb6e56d 100755 --- a/scripts/analysis/sentence-by-sentence.pl +++ b/scripts/analysis/sentence-by-sentence.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. 
Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ #sentence-by-sentence: take in a system output, with any number of factors, and a reference translation, also maybe with factors, and show each sentence and its errors diff --git a/scripts/analysis/sg2dot.perl b/scripts/analysis/sg2dot.perl index e9c1639ed..5f9a5ea1d 100755 --- a/scripts/analysis/sg2dot.perl +++ b/scripts/analysis/sg2dot.perl @@ -3,6 +3,8 @@ # Author : Loic BARRAULT # Script to convert MOSES searchgraph to DOT format # +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/analysis/show-phrases-used.pl b/scripts/analysis/show-phrases-used.pl index 522e6d3ff..9428ea9b8 100755 --- a/scripts/analysis/show-phrases-used.pl +++ b/scripts/analysis/show-phrases-used.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ #show-phrases-used: display all source and target phrases for each sentence in a corpus, and give average phrase length used diff --git a/scripts/analysis/smtgui/Corpus.pm b/scripts/analysis/smtgui/Corpus.pm index f050a9f6d..2391a6c15 100644 --- a/scripts/analysis/smtgui/Corpus.pm +++ b/scripts/analysis/smtgui/Corpus.pm @@ -1,5 +1,8 @@ #package Corpus: hold a bunch of sentences in any language, with translation factors and stats about individual sentences and the corpus as a whole #Evan Herbst, 7 / 25 / 06 +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. package Corpus; BEGIN diff --git a/scripts/analysis/smtgui/filter-phrase-table.pl b/scripts/analysis/smtgui/filter-phrase-table.pl index 55f2619c0..cd0f6b91b 100755 --- a/scripts/analysis/smtgui/filter-phrase-table.pl +++ b/scripts/analysis/smtgui/filter-phrase-table.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ #by Philipp Koehn, de-augmented by Evan Herbst diff --git a/scripts/analysis/smtgui/newsmtgui.cgi b/scripts/analysis/smtgui/newsmtgui.cgi index 32ad3a948..034ee265e 100755 --- a/scripts/analysis/smtgui/newsmtgui.cgi +++ b/scripts/analysis/smtgui/newsmtgui.cgi @@ -1,4 +1,7 @@ #!/usr/bin/perl -w +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ use strict; diff --git a/scripts/analysis/suspicious_tokenization.pl b/scripts/analysis/suspicious_tokenization.pl index 3ea15154e..f807153d9 100755 --- a/scripts/analysis/suspicious_tokenization.pl +++ b/scripts/analysis/suspicious_tokenization.pl @@ -2,6 +2,9 @@ # Collects and prints all n-grams that appear in the given corpus both # tokenized as well as untokenized. # Ondrej Bojar +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
use strict; use warnings; diff --git a/scripts/analysis/weight-scan-summarize.sh b/scripts/analysis/weight-scan-summarize.sh index 237182736..2fccb6470 100755 --- a/scripts/analysis/weight-scan-summarize.sh +++ b/scripts/analysis/weight-scan-summarize.sh @@ -1,4 +1,8 @@ #!/bin/bash +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + # Hackish summarization of weight-scan.pl results, heavily relies on tools by # Ondrej Bojar (bojar@ufal.mff.cuni.cz), some of which need Mercury; beware. diff --git a/scripts/analysis/weight-scan.pl b/scripts/analysis/weight-scan.pl index b33360694..b51a6bcd1 100755 --- a/scripts/analysis/weight-scan.pl +++ b/scripts/analysis/weight-scan.pl @@ -1,4 +1,8 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + # runs Moses many times changing the values of one weight, all others fixed # nbest lists are always produced to allow for comparison of real and # 'projected' BLEU (BLEU estimated from n-best lists collected at a neighouring diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl index ece110fbc..a3f5310a5 100755 --- a/scripts/ems/experiment.perl +++ b/scripts/ems/experiment.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # Experiment Management System # Documentation at http://www.statmt.org/moses/?n=FactoredTraining.EMS diff --git a/scripts/ems/fix-info.perl b/scripts/ems/fix-info.perl index abe58fe83..6659027b2 100755 --- a/scripts/ems/fix-info.perl +++ b/scripts/ems/fix-info.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/analysis.perl b/scripts/ems/support/analysis.perl index f4d5a55b4..01bb21773 100755 --- a/scripts/ems/support/analysis.perl +++ b/scripts/ems/support/analysis.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/berkeley-process.sh b/scripts/ems/support/berkeley-process.sh index e68056c96..347ebba3c 100755 --- a/scripts/ems/support/berkeley-process.sh +++ b/scripts/ems/support/berkeley-process.sh @@ -1,4 +1,7 @@ #!/bin/sh +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. if [ $# -lt 8 ] then diff --git a/scripts/ems/support/berkeley-train.sh b/scripts/ems/support/berkeley-train.sh index 96f6b648c..530cf978f 100755 --- a/scripts/ems/support/berkeley-train.sh +++ b/scripts/ems/support/berkeley-train.sh @@ -1,4 +1,7 @@ #!/bin/sh +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
if [ $# -lt 6 ] then diff --git a/scripts/ems/support/build-domain-file-from-subcorpora.perl b/scripts/ems/support/build-domain-file-from-subcorpora.perl index 085fd2629..f45b5ba2a 100755 --- a/scripts/ems/support/build-domain-file-from-subcorpora.perl +++ b/scripts/ems/support/build-domain-file-from-subcorpora.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/build-sparse-features.perl b/scripts/ems/support/build-sparse-features.perl index 79fc1e394..b134cee69 100755 --- a/scripts/ems/support/build-sparse-features.perl +++ b/scripts/ems/support/build-sparse-features.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/consolidate-training-data.perl b/scripts/ems/support/consolidate-training-data.perl index 4ab7f82cf..2a732be77 100755 --- a/scripts/ems/support/consolidate-training-data.perl +++ b/scripts/ems/support/consolidate-training-data.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id: consolidate-training-data.perl 928 2009-09-02 02:58:01Z philipp $ diff --git a/scripts/ems/support/defaultconfig.py b/scripts/ems/support/defaultconfig.py index a118e96b3..53913da08 100644 --- a/scripts/ems/support/defaultconfig.py +++ b/scripts/ems/support/defaultconfig.py @@ -1,4 +1,7 @@ #!/usr/bin/env python2 +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. """Version of ConfigParser which accepts default values.""" diff --git a/scripts/ems/support/fast-align-in-parts.perl b/scripts/ems/support/fast-align-in-parts.perl index f777d7e52..bc340a50f 100755 --- a/scripts/ems/support/fast-align-in-parts.perl +++ b/scripts/ems/support/fast-align-in-parts.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. ####################### # Revision history diff --git a/scripts/ems/support/generic-multicore-parallelizer.perl b/scripts/ems/support/generic-multicore-parallelizer.perl index 0f7910603..d821aa114 100755 --- a/scripts/ems/support/generic-multicore-parallelizer.perl +++ b/scripts/ems/support/generic-multicore-parallelizer.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/generic-parallelizer.perl b/scripts/ems/support/generic-parallelizer.perl index 811a99bde..087498ccf 100755 --- a/scripts/ems/support/generic-parallelizer.perl +++ b/scripts/ems/support/generic-parallelizer.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
use warnings; use strict; diff --git a/scripts/ems/support/input-from-sgm.perl b/scripts/ems/support/input-from-sgm.perl index 18000581a..eb6a2e3a1 100755 --- a/scripts/ems/support/input-from-sgm.perl +++ b/scripts/ems/support/input-from-sgm.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/interpolate-lm.perl b/scripts/ems/support/interpolate-lm.perl index 7d52fd877..4d9a513f6 100755 --- a/scripts/ems/support/interpolate-lm.perl +++ b/scripts/ems/support/interpolate-lm.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/lmplz-wrapper.perl b/scripts/ems/support/lmplz-wrapper.perl index df503754f..89b2847d6 100755 --- a/scripts/ems/support/lmplz-wrapper.perl +++ b/scripts/ems/support/lmplz-wrapper.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/mml-filter.perl b/scripts/ems/support/mml-filter.perl index 51bc4cda5..32bca335b 100755 --- a/scripts/ems/support/mml-filter.perl +++ b/scripts/ems/support/mml-filter.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/mml-score.perl b/scripts/ems/support/mml-score.perl index 6f7b724ea..f88021818 100755 --- a/scripts/ems/support/mml-score.perl +++ b/scripts/ems/support/mml-score.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/mml-train.perl b/scripts/ems/support/mml-train.perl index dcc998711..bdf6c1c1a 100755 --- a/scripts/ems/support/mml-train.perl +++ b/scripts/ems/support/mml-train.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/prepare-fast-align.perl b/scripts/ems/support/prepare-fast-align.perl index 80fec36b2..68b1f0189 100755 --- a/scripts/ems/support/prepare-fast-align.perl +++ b/scripts/ems/support/prepare-fast-align.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/reference-from-sgm.perl b/scripts/ems/support/reference-from-sgm.perl index ebb9ae4ae..b8e1d108d 100755 --- a/scripts/ems/support/reference-from-sgm.perl +++ b/scripts/ems/support/reference-from-sgm.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
use warnings; use strict; diff --git a/scripts/ems/support/remove-segmentation-markup.perl b/scripts/ems/support/remove-segmentation-markup.perl index a0bd61fff..3b02bceaf 100755 --- a/scripts/ems/support/remove-segmentation-markup.perl +++ b/scripts/ems/support/remove-segmentation-markup.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/report-experiment-scores.perl b/scripts/ems/support/report-experiment-scores.perl index b649951ce..c859508cb 100755 --- a/scripts/ems/support/report-experiment-scores.perl +++ b/scripts/ems/support/report-experiment-scores.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id: report-experiment-scores.perl 407 2008-11-10 14:43:31Z philipp $ diff --git a/scripts/ems/support/run-command-on-multiple-refsets.perl b/scripts/ems/support/run-command-on-multiple-refsets.perl index 1e914b44b..41823b4ee 100755 --- a/scripts/ems/support/run-command-on-multiple-refsets.perl +++ b/scripts/ems/support/run-command-on-multiple-refsets.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/run-wade.perl b/scripts/ems/support/run-wade.perl index 175948b98..dfdb8e59d 100755 --- a/scripts/ems/support/run-wade.perl +++ b/scripts/ems/support/run-wade.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/split-sentences.perl b/scripts/ems/support/split-sentences.perl index 02a1e2315..f72767054 100755 --- a/scripts/ems/support/split-sentences.perl +++ b/scripts/ems/support/split-sentences.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # Based on Preprocessor written by Philipp Koehn diff --git a/scripts/ems/support/submit-grid.perl b/scripts/ems/support/submit-grid.perl index a0967f9a5..ff43cd123 100755 --- a/scripts/ems/support/submit-grid.perl +++ b/scripts/ems/support/submit-grid.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/substitute-filtered-tables-and-weights.perl b/scripts/ems/support/substitute-filtered-tables-and-weights.perl index 13be52c6b..2e6908ab4 100755 --- a/scripts/ems/support/substitute-filtered-tables-and-weights.perl +++ b/scripts/ems/support/substitute-filtered-tables-and-weights.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
use warnings; use strict; diff --git a/scripts/ems/support/substitute-filtered-tables.perl b/scripts/ems/support/substitute-filtered-tables.perl index c5ebabded..548982592 100755 --- a/scripts/ems/support/substitute-filtered-tables.perl +++ b/scripts/ems/support/substitute-filtered-tables.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; diff --git a/scripts/ems/support/substitute-weights.perl b/scripts/ems/support/substitute-weights.perl index b692f3f85..efa9338ca 100755 --- a/scripts/ems/support/substitute-weights.perl +++ b/scripts/ems/support/substitute-weights.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; diff --git a/scripts/ems/support/symmetrize-fast-align.perl b/scripts/ems/support/symmetrize-fast-align.perl index 9f7fec248..4ed3e087d 100755 --- a/scripts/ems/support/symmetrize-fast-align.perl +++ b/scripts/ems/support/symmetrize-fast-align.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/thot-lm-wrapper.perl b/scripts/ems/support/thot-lm-wrapper.perl index 59d483e65..ffbcb50e2 100755 --- a/scripts/ems/support/thot-lm-wrapper.perl +++ b/scripts/ems/support/thot-lm-wrapper.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/tree-converter-wrapper.perl b/scripts/ems/support/tree-converter-wrapper.perl index aae55991a..ae7e2c5a6 100755 --- a/scripts/ems/support/tree-converter-wrapper.perl +++ b/scripts/ems/support/tree-converter-wrapper.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/wrap-xml.perl b/scripts/ems/support/wrap-xml.perl index 52190309a..09ea2a2f8 100755 --- a/scripts/ems/support/wrap-xml.perl +++ b/scripts/ems/support/wrap-xml.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/web/analysis.php b/scripts/ems/web/analysis.php index 57776dd22..5e5f707f6 100644 --- a/scripts/ems/web/analysis.php +++ b/scripts/ems/web/analysis.php @@ -1,5 +1,10 @@ Search Graph Visualization, Sentence <?php $sentence ?> diff --git a/scripts/fuzzy-match/create_xml.perl b/scripts/fuzzy-match/create_xml.perl index 4ab281eae..97025d62a 100755 --- a/scripts/fuzzy-match/create_xml.perl +++ b/scripts/fuzzy-match/create_xml.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
binmode( STDIN, ":utf8" ); binmode( STDOUT, ":utf8" ); diff --git a/scripts/generic/bsbleu.py b/scripts/generic/bsbleu.py index 12d2201de..296900b18 100755 --- a/scripts/generic/bsbleu.py +++ b/scripts/generic/bsbleu.py @@ -1,6 +1,9 @@ #!/usr/bin/env python # compute Bleu scores with confidence intervals via boostrap resampling # written by Ulrich Germann +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. from argparse import ArgumentParser import math diff --git a/scripts/generic/compound-splitter.perl b/scripts/generic/compound-splitter.perl index b39d4d660..2ece80a60 100755 --- a/scripts/generic/compound-splitter.perl +++ b/scripts/generic/compound-splitter.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/generic/extract-factors.pl b/scripts/generic/extract-factors.pl index 38cf97bd4..2b1c51cd1 100755 --- a/scripts/generic/extract-factors.pl +++ b/scripts/generic/extract-factors.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ #extract-factors.pl: extract only the desired factors from a factored corpus diff --git a/scripts/generic/extract-parallel.perl b/scripts/generic/extract-parallel.perl index be30ff652..3240f24eb 100755 --- a/scripts/generic/extract-parallel.perl +++ b/scripts/generic/extract-parallel.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # example # ./extract-parallel.perl 8 ./coreutils-8.9/src/split "./coreutils-8.9/src/sort --batch-size=253" ./extract ./corpus.5.en ./corpus.5.ar ./align.ar-en.grow-diag-final-and ./extracted 7 --NoFileLimit orientation --GZOutput diff --git a/scripts/generic/fsa2fsal.pl b/scripts/generic/fsa2fsal.pl index 7dc7751ee..28ec28a26 100755 --- a/scripts/generic/fsa2fsal.pl +++ b/scripts/generic/fsa2fsal.pl @@ -4,6 +4,9 @@ # ' ' to delimit nodes (i.e. original lines). # Some rudimentary sanity checks are done on the fly. # Ondrej Bojar, bojar@ufal.mff.cuni.cz +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/generic/fsa2plf.pl b/scripts/generic/fsa2plf.pl index 07c8a4cc1..4b9474d5a 100755 --- a/scripts/generic/fsa2plf.pl +++ b/scripts/generic/fsa2plf.pl @@ -7,6 +7,9 @@ # final nodes. # Note that the output format may not contain any spaces. # Ondrej Bojar, bojar@ufal.mff.cuni.cz +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/generic/fsal2fsa.pl b/scripts/generic/fsal2fsa.pl index a21305dad..158dab5b3 100755 --- a/scripts/generic/fsal2fsa.pl +++ b/scripts/generic/fsal2fsa.pl @@ -1,6 +1,9 @@ #!/usr/bin/env perl # A very simple script that converts fsal back to fsa format (openfst lattices) # Ondrej Bojar, bojar@ufal.mff.cuni.cz +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
use warnings; use strict; diff --git a/scripts/generic/generic-parallel.perl b/scripts/generic/generic-parallel.perl index a9bc73d85..07f6a210a 100755 --- a/scripts/generic/generic-parallel.perl +++ b/scripts/generic/generic-parallel.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/generic/giza-parallel.perl b/scripts/generic/giza-parallel.perl index 9a6516a8f..a9921a992 100755 --- a/scripts/generic/giza-parallel.perl +++ b/scripts/generic/giza-parallel.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # example # ~/giza-parallel.perl 10 split ~/workspace/sourceforge/trunk/scripts/training/train-model.perl ar en train align diff --git a/scripts/generic/lopar2pos.pl b/scripts/generic/lopar2pos.pl index 2b9245e0f..fc2c35c7f 100755 --- a/scripts/generic/lopar2pos.pl +++ b/scripts/generic/lopar2pos.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ #lopar2pos: extract POSs from LOPAR output diff --git a/scripts/generic/moses-parallel.pl b/scripts/generic/moses-parallel.pl index eb51daa98..144b7d6b2 100755 --- a/scripts/generic/moses-parallel.pl +++ b/scripts/generic/moses-parallel.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ ####################### diff --git a/scripts/generic/moses_sim_pe.py b/scripts/generic/moses_sim_pe.py index 32f785961..3497ca558 100755 --- a/scripts/generic/moses_sim_pe.py +++ b/scripts/generic/moses_sim_pe.py @@ -1,20 +1,25 @@ #!/usr/bin/env python # Written by Michael Denkowski +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. -# This script parallelizes decoding with simulated post-editing via moses XML -# input (XML entities need to be escaped in tokenization). Memory mapped -# dynamic phrase tables (Ulrich Germann, -# www.statmt.org/moses/?n=Moses.AdvancedFeatures#ntoc40) and language models -# (Kenneth Heafield, -# http://www.statmt.org/moses/?n=FactoredTraining.BuildingLanguageModel#ntoc19) -# facilitate memory efficient multi process decoding. Input is divided into -# batches, each of which is decoded sequentially. Each batch pre-loads the -# data from previous batches. +"""Parallelize decoding with simulated post-editing via moses XML input. -# To use in tuning, run mert-moses.pl with --sim-pe=SYMAL where SYMAL is the -# alignment from input to references. Specify the number of jobs with -# --decoder-flags="-threads N". +(XML entities need to be escaped in tokenization). Memory mapped +dynamic phrase tables (Ulrich Germann, +www.statmt.org/moses/?n=Moses.AdvancedFeatures#ntoc40) and language models +(Kenneth Heafield, +http://www.statmt.org/moses/?n=FactoredTraining.BuildingLanguageModel#ntoc19) +facilitate memory efficient multi process decoding. Input is divided into +batches, each of which is decoded sequentially. Each batch pre-loads the +data from previous batches. 
+ +To use in tuning, run mert-moses.pl with --sim-pe=SYMAL where SYMAL is the +alignment from input to references. Specify the number of jobs with +--decoder-flags="-threads N". +""" import gzip import itertools diff --git a/scripts/generic/mteval-v12.pl b/scripts/generic/mteval-v12.pl index 2666c8012..b4dfbf83a 100755 --- a/scripts/generic/mteval-v12.pl +++ b/scripts/generic/mteval-v12.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/generic/mteval-v13a.pl b/scripts/generic/mteval-v13a.pl index 41a88800a..bdc2d9479 100755 --- a/scripts/generic/mteval-v13a.pl +++ b/scripts/generic/mteval-v13a.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/generic/multi-bleu.perl b/scripts/generic/multi-bleu.perl index 344f58c6f..61de10d45 100755 --- a/scripts/generic/multi-bleu.perl +++ b/scripts/generic/multi-bleu.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ use warnings; diff --git a/scripts/generic/ph_numbers.perl b/scripts/generic/ph_numbers.perl index 612263249..f0ae1f851 100755 --- a/scripts/generic/ph_numbers.perl +++ b/scripts/generic/ph_numbers.perl @@ -6,6 +6,9 @@ package ph_numbers; # and decoder input # # (c) 2013 TAUS +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/generic/qsub-wrapper.pl b/scripts/generic/qsub-wrapper.pl index ac3d0900a..ef9938e07 100755 --- a/scripts/generic/qsub-wrapper.pl +++ b/scripts/generic/qsub-wrapper.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ use warnings; diff --git a/scripts/generic/reverse-alignment.perl b/scripts/generic/reverse-alignment.perl index 681b3221e..f01acf5b0 100755 --- a/scripts/generic/reverse-alignment.perl +++ b/scripts/generic/reverse-alignment.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/generic/score-parallel.perl b/scripts/generic/score-parallel.perl index 81bc6f7d0..625b449c0 100755 --- a/scripts/generic/score-parallel.perl +++ b/scripts/generic/score-parallel.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # example # ./score-parallel.perl 8 "gsort --batch-size=253" ./score ./extract.2.sorted.gz ./lex.2.f2e ./phrase-table.2.half.f2e --GoodTuring ./phrase-table.2.coc 0 diff --git a/scripts/generic/strip-xml.perl b/scripts/generic/strip-xml.perl index c993421f0..a5dbbaa37 100755 --- a/scripts/generic/strip-xml.perl +++ b/scripts/generic/strip-xml.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. 
Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/generic/trainlm-irst2.perl b/scripts/generic/trainlm-irst2.perl index f664e96ee..8af372fac 100755 --- a/scripts/generic/trainlm-irst2.perl +++ b/scripts/generic/trainlm-irst2.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # Compatible with sri LM-creating script, eg. # ngram-count -order 5 -interpolate -wbdiscount -unk -text corpus.txt -lm lm.txt diff --git a/scripts/other/beautify.py b/scripts/other/beautify.py index 0caa6b162..56df24bc8 100755 --- a/scripts/other/beautify.py +++ b/scripts/other/beautify.py @@ -1,4 +1,9 @@ #! /usr/bin/env python +# +# Originally written in 2015 by Jeroen Vermeulen (Precision Translation Tools). +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. """Reformat project source code, and/or check for style errors ("lint"). diff --git a/scripts/other/convert-pt.perl b/scripts/other/convert-pt.perl index e087126f1..60c8cbdb2 100755 --- a/scripts/other/convert-pt.perl +++ b/scripts/other/convert-pt.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ # convert a phrase-table with alignment in Moses' dead-end format diff --git a/scripts/other/delete-scores.perl b/scripts/other/delete-scores.perl index ffb788867..ebaf277fa 100755 --- a/scripts/other/delete-scores.perl +++ b/scripts/other/delete-scores.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/other/gacha_filter.py b/scripts/other/gacha_filter.py index 0deb45761..af5921d41 100644 --- a/scripts/other/gacha_filter.py +++ b/scripts/other/gacha_filter.py @@ -1,4 +1,7 @@ #!/usr/bin/env python3 -*- coding: utf-8 -*- +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. """ The Gacha filter cleans out sentence pairs that have global character mean diff --git a/scripts/other/get_many_translations_from_google.perl b/scripts/other/get_many_translations_from_google.perl index 0b1436c20..ac2933296 100755 --- a/scripts/other/get_many_translations_from_google.perl +++ b/scripts/other/get_many_translations_from_google.perl @@ -5,6 +5,9 @@ # Expects one sentence per line, not tokenized! # # Ondrej Bojar, bojar@ufal.mff.cuni.cz +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/other/retain-lines.perl b/scripts/other/retain-lines.perl index f04a8ebad..c789f96c7 100755 --- a/scripts/other/retain-lines.perl +++ b/scripts/other/retain-lines.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
#retain lines in clean.lines-retained.1 use strict; diff --git a/scripts/other/translate_by_microsoft_bing.perl b/scripts/other/translate_by_microsoft_bing.perl index c9b1b31de..d4222878e 100755 --- a/scripts/other/translate_by_microsoft_bing.perl +++ b/scripts/other/translate_by_microsoft_bing.perl @@ -2,6 +2,9 @@ # Script implemented by Pranava Swaroop Madhyastha (a student at Charles # University, UFAL) +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use strict; use warnings; diff --git a/scripts/recaser/detruecase.perl b/scripts/recaser/detruecase.perl index b882852a0..66ca24fa2 100755 --- a/scripts/recaser/detruecase.perl +++ b/scripts/recaser/detruecase.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/recaser/recase.perl b/scripts/recaser/recase.perl index 52cec36ea..b951ca764 100755 --- a/scripts/recaser/recase.perl +++ b/scripts/recaser/recase.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ use warnings; diff --git a/scripts/recaser/train-recaser.perl b/scripts/recaser/train-recaser.perl index dce388bca..cb3388c38 100755 --- a/scripts/recaser/train-recaser.perl +++ b/scripts/recaser/train-recaser.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ use warnings; diff --git a/scripts/recaser/train-truecaser.perl b/scripts/recaser/train-truecaser.perl index 753183324..7f8909082 100755 --- a/scripts/recaser/train-truecaser.perl +++ b/scripts/recaser/train-truecaser.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $ diff --git a/scripts/recaser/truecase.perl b/scripts/recaser/truecase.perl index 544b79c47..aab185ce9 100755 --- a/scripts/recaser/truecase.perl +++ b/scripts/recaser/truecase.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $ diff --git a/scripts/regression-testing/MosesScriptsRegressionTesting.pm b/scripts/regression-testing/MosesScriptsRegressionTesting.pm index d8b0590c8..acc134d70 100644 --- a/scripts/regression-testing/MosesScriptsRegressionTesting.pm +++ b/scripts/regression-testing/MosesScriptsRegressionTesting.pm @@ -1,3 +1,6 @@ +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + package MosesScriptsRegressionTesting; use strict; diff --git a/scripts/regression-testing/compare-results.pl b/scripts/regression-testing/compare-results.pl index 572431951..8f1461cec 100755 --- a/scripts/regression-testing/compare-results.pl +++ b/scripts/regression-testing/compare-results.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. 
Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/regression-testing/create_localized_moses_ini.pl b/scripts/regression-testing/create_localized_moses_ini.pl index 1d03e5ab8..3e2b6f37f 100755 --- a/scripts/regression-testing/create_localized_moses_ini.pl +++ b/scripts/regression-testing/create_localized_moses_ini.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/regression-testing/modify-pars.pl b/scripts/regression-testing/modify-pars.pl index de2df2919..7726af9e6 100755 --- a/scripts/regression-testing/modify-pars.pl +++ b/scripts/regression-testing/modify-pars.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/regression-testing/moses-virtual.pl b/scripts/regression-testing/moses-virtual.pl index 3af3c79e4..3b23b525a 100755 --- a/scripts/regression-testing/moses-virtual.pl +++ b/scripts/regression-testing/moses-virtual.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/regression-testing/run-single-test.pl b/scripts/regression-testing/run-single-test.pl index e8307da36..037de8285 100755 --- a/scripts/regression-testing/run-single-test.pl +++ b/scripts/regression-testing/run-single-test.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/regression-testing/run-test-suite.pl b/scripts/regression-testing/run-test-suite.pl index b384f8b98..a12938e61 100755 --- a/scripts/regression-testing/run-test-suite.pl +++ b/scripts/regression-testing/run-test-suite.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/server/moses.py b/scripts/server/moses.py index 7cf152187..e825ab39e 100644 --- a/scripts/server/moses.py +++ b/scripts/server/moses.py @@ -1,5 +1,8 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. """ Python utilities for moses diff --git a/scripts/server/sim-pe.py b/scripts/server/sim-pe.py index 5f1407524..6f76bf46d 100755 --- a/scripts/server/sim-pe.py +++ b/scripts/server/sim-pe.py @@ -2,8 +2,14 @@ # -*- coding: utf-8 -*- # Written by Ulrich Germann on the basis of contrib/server/client.py. -# This script simulates post-editing of MT output and incrementally -# updates the dynamic phrase tables in the moses server. +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + +"""Simulate post-editing of MT output. + +Incrementally updates the dynamic phrase tables in the moses server. 
+""" import argparse import os diff --git a/scripts/tokenizer/deescape-special-chars-PTB.perl b/scripts/tokenizer/deescape-special-chars-PTB.perl index f9601924f..ad2529b21 100755 --- a/scripts/tokenizer/deescape-special-chars-PTB.perl +++ b/scripts/tokenizer/deescape-special-chars-PTB.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/tokenizer/deescape-special-chars.perl b/scripts/tokenizer/deescape-special-chars.perl index 002955e62..b9d1ad74c 100755 --- a/scripts/tokenizer/deescape-special-chars.perl +++ b/scripts/tokenizer/deescape-special-chars.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/tokenizer/detokenizer.perl b/scripts/tokenizer/detokenizer.perl index 3a92bd024..881b93dd1 100755 --- a/scripts/tokenizer/detokenizer.perl +++ b/scripts/tokenizer/detokenizer.perl @@ -4,6 +4,9 @@ # Sample De-Tokenizer # written by Josh Schroeder, based on code by Philipp Koehn # further modifications by Ondrej Bojar +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. binmode(STDIN, ":utf8"); binmode(STDOUT, ":utf8"); diff --git a/scripts/tokenizer/escape-special-chars.perl b/scripts/tokenizer/escape-special-chars.perl index fbbbae292..143e85490 100755 --- a/scripts/tokenizer/escape-special-chars.perl +++ b/scripts/tokenizer/escape-special-chars.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/tokenizer/lowercase.perl b/scripts/tokenizer/lowercase.perl index e5c41bbed..bc75e5e5c 100755 --- a/scripts/tokenizer/lowercase.perl +++ b/scripts/tokenizer/lowercase.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/tokenizer/normalize-punctuation.perl b/scripts/tokenizer/normalize-punctuation.perl index 13e9fd3fc..7dab7543a 100755 --- a/scripts/tokenizer/normalize-punctuation.perl +++ b/scripts/tokenizer/normalize-punctuation.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/tokenizer/pre-tok-clean.perl b/scripts/tokenizer/pre-tok-clean.perl index 900e992ee..064f7b187 100755 --- a/scripts/tokenizer/pre-tok-clean.perl +++ b/scripts/tokenizer/pre-tok-clean.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
use strict; diff --git a/scripts/tokenizer/pre-tokenizer.perl b/scripts/tokenizer/pre-tokenizer.perl index 514d8da8d..541ce77fb 100755 --- a/scripts/tokenizer/pre-tokenizer.perl +++ b/scripts/tokenizer/pre-tokenizer.perl @@ -3,6 +3,9 @@ # script for preprocessing language data prior to tokenization # Start by Ulrich Germann, after noticing systematic preprocessing errors # in some of the English Europarl data. +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/tokenizer/pre_tokenize_cleaning.py b/scripts/tokenizer/pre_tokenize_cleaning.py index 096a45dc4..c03af8f66 100644 --- a/scripts/tokenizer/pre_tokenize_cleaning.py +++ b/scripts/tokenizer/pre_tokenize_cleaning.py @@ -1,4 +1,7 @@ #!/usr/bin/env python -*- coding: utf-8 -*- +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. """ The Gacha filter cleans out sentence pairs that have global character mean diff --git a/scripts/tokenizer/remove-non-printing-char.perl b/scripts/tokenizer/remove-non-printing-char.perl index 9125b7691..92f6ade16 100755 --- a/scripts/tokenizer/remove-non-printing-char.perl +++ b/scripts/tokenizer/remove-non-printing-char.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use utf8; diff --git a/scripts/tokenizer/replace-unicode-punctuation.perl b/scripts/tokenizer/replace-unicode-punctuation.perl index cda69ddf7..c2c7088d6 100755 --- a/scripts/tokenizer/replace-unicode-punctuation.perl +++ b/scripts/tokenizer/replace-unicode-punctuation.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/tokenizer/tokenizer.perl b/scripts/tokenizer/tokenizer.perl index a5d4fadd3..e08bac941 100755 --- a/scripts/tokenizer/tokenizer.perl +++ b/scripts/tokenizer/tokenizer.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; diff --git a/scripts/tokenizer/tokenizer_PTB.perl b/scripts/tokenizer/tokenizer_PTB.perl index 6fff8d7f7..46b14775c 100755 --- a/scripts/tokenizer/tokenizer_PTB.perl +++ b/scripts/tokenizer/tokenizer_PTB.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # Sample Tokenizer ### Version 1.1 diff --git a/scripts/training/LexicalTranslationModel.pm b/scripts/training/LexicalTranslationModel.pm index c5dad60fb..3adc45f5e 100644 --- a/scripts/training/LexicalTranslationModel.pm +++ b/scripts/training/LexicalTranslationModel.pm @@ -1,3 +1,6 @@ +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
+ package LexicalTranslationModel; use strict; diff --git a/scripts/training/absolutize_moses_model.pl b/scripts/training/absolutize_moses_model.pl index bb7085895..27eccd8c7 100755 --- a/scripts/training/absolutize_moses_model.pl +++ b/scripts/training/absolutize_moses_model.pl @@ -5,6 +5,9 @@ # paths with absolute paths. # # Ondrej Bojar. +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; diff --git a/scripts/training/analyse_moses_model.pl b/scripts/training/analyse_moses_model.pl index 656f4a59b..7a5c2e701 100755 --- a/scripts/training/analyse_moses_model.pl +++ b/scripts/training/analyse_moses_model.pl @@ -4,6 +4,9 @@ # given a moses.ini file, checks the translation and generation tables and reports # statistics on ambiguity # Ondrej Bojar +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/bilingual-lm/averageNullEmbedding.py b/scripts/training/bilingual-lm/averageNullEmbedding.py index 891595aff..54c9a1bc4 100755 --- a/scripts/training/bilingual-lm/averageNullEmbedding.py +++ b/scripts/training/bilingual-lm/averageNullEmbedding.py @@ -1,4 +1,7 @@ #!/usr/bin/env python2 +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. import sys import numpy import argparse diff --git a/scripts/training/bilingual-lm/extract.py b/scripts/training/bilingual-lm/extract.py index f620edb5d..876fba9ee 100755 --- a/scripts/training/bilingual-lm/extract.py +++ b/scripts/training/bilingual-lm/extract.py @@ -1,4 +1,7 @@ #!/usr/bin/env python +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. from collections import Counter import logging diff --git a/scripts/training/bilingual-lm/extract_test.py b/scripts/training/bilingual-lm/extract_test.py index 3c9a03b85..8cade1e04 100755 --- a/scripts/training/bilingual-lm/extract_test.py +++ b/scripts/training/bilingual-lm/extract_test.py @@ -1,4 +1,7 @@ #!/usr/bin/env python +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. """Create a test corpus, using a previously pruned vocabulary.""" diff --git a/scripts/training/bilingual-lm/extract_training.py b/scripts/training/bilingual-lm/extract_training.py index bd3538188..e39a70318 100755 --- a/scripts/training/bilingual-lm/extract_training.py +++ b/scripts/training/bilingual-lm/extract_training.py @@ -1,4 +1,7 @@ #!/usr/bin/env python +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. from collections import Counter import logging diff --git a/scripts/training/bilingual-lm/reduce_ngrams.py b/scripts/training/bilingual-lm/reduce_ngrams.py index 3442fb302..4db41378d 100755 --- a/scripts/training/bilingual-lm/reduce_ngrams.py +++ b/scripts/training/bilingual-lm/reduce_ngrams.py @@ -1,6 +1,9 @@ #!/usr/bin/env python3 +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. -"""Reduces an ngrams file for training nplm to a smaller version of it. 
+"""Reduce an ngrams file for training nplm to a smaller version of it. The smaller version will have fewer ngrams. """ diff --git a/scripts/training/bilingual-lm/test_nplm.py b/scripts/training/bilingual-lm/test_nplm.py index 737266bc3..3a59fd344 100755 --- a/scripts/training/bilingual-lm/test_nplm.py +++ b/scripts/training/bilingual-lm/test_nplm.py @@ -1,4 +1,7 @@ #!/usr/bin/env python +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. import logging import optparse diff --git a/scripts/training/bilingual-lm/train_nplm.py b/scripts/training/bilingual-lm/train_nplm.py index 7bc74429e..cb5980a91 100755 --- a/scripts/training/bilingual-lm/train_nplm.py +++ b/scripts/training/bilingual-lm/train_nplm.py @@ -1,4 +1,7 @@ #!/usr/bin/env python +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. from __future__ import print_function, unicode_literals diff --git a/scripts/training/binarize-model.perl b/scripts/training/binarize-model.perl index 0239f5fc8..0131d2222 100755 --- a/scripts/training/binarize-model.perl +++ b/scripts/training/binarize-model.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # # Binarize a Moses model diff --git a/scripts/training/build-generation-table.perl b/scripts/training/build-generation-table.perl index 435f7f58e..14176908a 100755 --- a/scripts/training/build-generation-table.perl +++ b/scripts/training/build-generation-table.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ use warnings; diff --git a/scripts/training/build-mmsapt.perl b/scripts/training/build-mmsapt.perl index 00cbd09d6..d0c5b818e 100755 --- a/scripts/training/build-mmsapt.perl +++ b/scripts/training/build-mmsapt.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/clean-corpus-n.perl b/scripts/training/clean-corpus-n.perl index cee4c76a2..76a09e539 100755 --- a/scripts/training/clean-corpus-n.perl +++ b/scripts/training/clean-corpus-n.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id: clean-corpus-n.perl 3633 2010-10-21 09:49:27Z phkoehn $ use warnings; diff --git a/scripts/training/clone_moses_model.pl b/scripts/training/clone_moses_model.pl index bf6708fca..18dc4aa41 100755 --- a/scripts/training/clone_moses_model.pl +++ b/scripts/training/clone_moses_model.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
# $Id$ # given a moses.ini file, creates a fresh version of it diff --git a/scripts/training/combine_factors.pl b/scripts/training/combine_factors.pl index fa6f15db2..fcc9ab3f5 100755 --- a/scripts/training/combine_factors.pl +++ b/scripts/training/combine_factors.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ # given a list of files, combines them to a single corpus (sent to stdout) diff --git a/scripts/training/convert-moses-ini-to-v2.perl b/scripts/training/convert-moses-ini-to-v2.perl index e091a710d..3fdfa53a6 100755 --- a/scripts/training/convert-moses-ini-to-v2.perl +++ b/scripts/training/convert-moses-ini-to-v2.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/convert-moses-ini-v2-to-v1.py b/scripts/training/convert-moses-ini-v2-to-v1.py index 44f192efe..3ef7d7c0d 100755 --- a/scripts/training/convert-moses-ini-v2-to-v1.py +++ b/scripts/training/convert-moses-ini-v2-to-v1.py @@ -1,5 +1,8 @@ #! /usr/bin/env python # -*- coding: utf8 -*- +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 3 or, at your option, any later version. from __future__ import ( diff --git a/scripts/training/corpus-sizes.perl b/scripts/training/corpus-sizes.perl index 30ae67ebb..1a6db669b 100755 --- a/scripts/training/corpus-sizes.perl +++ b/scripts/training/corpus-sizes.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id: consolidate-training-data.perl 928 2009-09-02 02:58:01Z philipp $ diff --git a/scripts/training/create_count_tables.py b/scripts/training/create_count_tables.py index 2288c034a..12499b1d7 100755 --- a/scripts/training/create_count_tables.py +++ b/scripts/training/create_count_tables.py @@ -1,6 +1,9 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # Author: Rico Sennrich +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # This script creates tables that store phrase pair frequencies rather than # probabilities. diff --git a/scripts/training/exodus.perl b/scripts/training/exodus.perl index bb8616007..579056ff0 100755 --- a/scripts/training/exodus.perl +++ b/scripts/training/exodus.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ diff --git a/scripts/training/filter-model-given-input.pl b/scripts/training/filter-model-given-input.pl index e3a34c40b..a44d9c193 100755 --- a/scripts/training/filter-model-given-input.pl +++ b/scripts/training/filter-model-given-input.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
# $Id$ # Given a moses.ini file and an input text prepare minimized translation diff --git a/scripts/training/filter-rule-table.py b/scripts/training/filter-rule-table.py index 14736fe1f..d28fa0c89 100755 --- a/scripts/training/filter-rule-table.py +++ b/scripts/training/filter-rule-table.py @@ -1,25 +1,29 @@ #!/usr/bin/env python # Author: Phil Williams +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. -# Usage: filter-rule-table.py [--min-non-initial-rule-count=N] INPUT -# -# Given a rule table (on stdin) and an input text, filter out rules that -# couldn't be used in parsing the input and write the resulting rule table -# to stdout. The input text is assumed to contain the same factors as -# the rule table and is assumed to be small (not more than a few thousand -# sentences): the current algorithm won't scale well to large input sets. -# -# The filtering algorithm considers a source RHS to be a sequence of -# words and gaps, which must match a sequence of words in one of the -# input sentences, with at least one input word per gap. The NT labels -# are ignored, so for example a rule with the source RHS "the JJ dog" -# would be allowed if the sequence "the slobbering dog" occurs in one of -# the input sentences, even if there's no rule to derive a JJ from -# "slobbering." (If "slobbering" were an unknown word, the 'unknown-lhs' -# decoder option would allow it to take a number of NT labels, likely -# including JJ, with varying probabilities, so removing the rule would -# be a bad idea.) +"""Usage: filter-rule-table.py [--min-non-initial-rule-count=N] INPUT + +Given a rule table (on stdin) and an input text, filter out rules that +couldn't be used in parsing the input and write the resulting rule table +to stdout. The input text is assumed to contain the same factors as +the rule table and is assumed to be small (not more than a few thousand +sentences): the current algorithm won't scale well to large input sets. + +The filtering algorithm considers a source RHS to be a sequence of +words and gaps, which must match a sequence of words in one of the +input sentences, with at least one input word per gap. The NT labels +are ignored, so for example a rule with the source RHS "the JJ dog" +would be allowed if the sequence "the slobbering dog" occurs in one of +the input sentences, even if there's no rule to derive a JJ from +"slobbering." (If "slobbering" were an unknown word, the 'unknown-lhs' +decoder option would allow it to take a number of NT labels, likely +including JJ, with varying probabilities, so removing the rule would +be a bad idea.) +""" import optparse import sys diff --git a/scripts/training/flexibility_score.py b/scripts/training/flexibility_score.py index 496184616..56d4f9425 100755 --- a/scripts/training/flexibility_score.py +++ b/scripts/training/flexibility_score.py @@ -2,6 +2,9 @@ # -*- coding: utf-8 -*- # author: Rico Sennrich +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. """Add flexibility scores to a phrase table half. diff --git a/scripts/training/giza2bal.pl b/scripts/training/giza2bal.pl index 27ba9d659..ad9edb584 100755 --- a/scripts/training/giza2bal.pl +++ b/scripts/training/giza2bal.pl @@ -6,6 +6,9 @@ #produced by giza containing the frequency of each traning sentence. #Copyright Marcello Federico, November 2004 +# +# This file is part of moses. 
Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. #use warnings; diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl index 92e1a79ff..c73e75a87 100755 --- a/scripts/training/mert-moses.pl +++ b/scripts/training/mert-moses.pl @@ -1,4 +1,8 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + # $Id$ # Usage: # mert-moses.pl diff --git a/scripts/training/postprocess-lopar.perl b/scripts/training/postprocess-lopar.perl index 44be9c26c..05a56a3b5 100755 --- a/scripts/training/postprocess-lopar.perl +++ b/scripts/training/postprocess-lopar.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ diff --git a/scripts/training/rdlm/average_null_embedding.py b/scripts/training/rdlm/average_null_embedding.py index 28abc9508..899b402c1 100755 --- a/scripts/training/rdlm/average_null_embedding.py +++ b/scripts/training/rdlm/average_null_embedding.py @@ -1,6 +1,9 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # Author: Rico Sennrich +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. """Average embeddings of special null words for RDLM. diff --git a/scripts/training/rdlm/extract_syntactic_ngrams.py b/scripts/training/rdlm/extract_syntactic_ngrams.py index 1292e90f2..be4ed2335 100755 --- a/scripts/training/rdlm/extract_syntactic_ngrams.py +++ b/scripts/training/rdlm/extract_syntactic_ngrams.py @@ -1,6 +1,9 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # Author: Rico Sennrich +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. """ Extract syntactic n-grams from dependency treebank in Moses XML format for diff --git a/scripts/training/rdlm/extract_vocab.py b/scripts/training/rdlm/extract_vocab.py index ed9266fd9..48e5215c3 100755 --- a/scripts/training/rdlm/extract_vocab.py +++ b/scripts/training/rdlm/extract_vocab.py @@ -1,6 +1,9 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # Author: Rico Sennrich +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # extract 5 vocabulary files from parsed corpus in moses XML format diff --git a/scripts/training/rdlm/train_rdlm.py b/scripts/training/rdlm/train_rdlm.py index 639c1b32c..a7edbab36 100755 --- a/scripts/training/rdlm/train_rdlm.py +++ b/scripts/training/rdlm/train_rdlm.py @@ -1,5 +1,8 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. from __future__ import print_function, unicode_literals diff --git a/scripts/training/reduce-factors.perl b/scripts/training/reduce-factors.perl index 09f9c7f2b..82aed4355 100755 --- a/scripts/training/reduce-factors.perl +++ b/scripts/training/reduce-factors.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
use warnings; use strict; diff --git a/scripts/training/reduce-topt-count.pl b/scripts/training/reduce-topt-count.pl index f760051c4..85ce0d6d9 100755 --- a/scripts/training/reduce-topt-count.pl +++ b/scripts/training/reduce-topt-count.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # given a moses.ini, filter the phrase tables to contain # only ttable-limit options per source phrase diff --git a/scripts/training/reduce_combine.pl b/scripts/training/reduce_combine.pl index a7614f73e..2055bed5b 100755 --- a/scripts/training/reduce_combine.pl +++ b/scripts/training/reduce_combine.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ # given a pathname to a factored corpus, a list of (numeric) factors to keep diff --git a/scripts/training/remove-orphan-phrase-pairs-from-reordering-table.perl b/scripts/training/remove-orphan-phrase-pairs-from-reordering-table.perl index eda529393..25c5cc028 100755 --- a/scripts/training/remove-orphan-phrase-pairs-from-reordering-table.perl +++ b/scripts/training/remove-orphan-phrase-pairs-from-reordering-table.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/threshold-filter.perl b/scripts/training/threshold-filter.perl index 3e42ca795..0aed67d25 100755 --- a/scripts/training/threshold-filter.perl +++ b/scripts/training/threshold-filter.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/train-global-lexicon-model.perl b/scripts/training/train-global-lexicon-model.perl index d3c55789d..528bfbd72 100755 --- a/scripts/training/train-global-lexicon-model.perl +++ b/scripts/training/train-global-lexicon-model.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl index 5a304c2f9..b693d774d 100755 --- a/scripts/training/train-model.perl +++ b/scripts/training/train-model.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/train-neurallm.py b/scripts/training/train-neurallm.py index 2d2f12015..fec859611 100755 --- a/scripts/training/train-neurallm.py +++ b/scripts/training/train-neurallm.py @@ -1,8 +1,12 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. -""" train feed-forward neural network LM with NPLM tool -resulting model can be used in Moses as feature function NeuralLM +"""Train feed-forward neural network LM with NPLM tool. + +The resulting model can be used in Moses as feature function NeuralLM. 
""" from __future__ import print_function, unicode_literals diff --git a/scripts/training/wrappers/adam-suffix-array/suffix-array-create.sh b/scripts/training/wrappers/adam-suffix-array/suffix-array-create.sh index 238a53349..5db5e9aa9 100755 --- a/scripts/training/wrappers/adam-suffix-array/suffix-array-create.sh +++ b/scripts/training/wrappers/adam-suffix-array/suffix-array-create.sh @@ -1,4 +1,7 @@ #!/bin/bash +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # execute: ~/workspace/bin/moses-smt/scripts/training/wrappers/suffix-array-create.sh $SA_EXEC_DIR $SOURCE_CORPUS $TARGET_CORPUS $ALIGNMENT $SA_OUTPUT diff --git a/scripts/training/wrappers/adam-suffix-array/suffix-array-extract.sh b/scripts/training/wrappers/adam-suffix-array/suffix-array-extract.sh index 8c255b1b6..128ccaa9e 100755 --- a/scripts/training/wrappers/adam-suffix-array/suffix-array-extract.sh +++ b/scripts/training/wrappers/adam-suffix-array/suffix-array-extract.sh @@ -1,4 +1,7 @@ #!/bin/bash +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # execute: ~/workspace/bin/moses-smt/scripts/training/wrappers/adam-suffix-array/suffix-array-extract.sh $SA_EXEC_DIR $MODEL_DIR $INPUT_FILE $OUTPUT_DIR diff --git a/scripts/training/wrappers/berkeleyparsed2mosesxml.perl b/scripts/training/wrappers/berkeleyparsed2mosesxml.perl index 232cfefab..9c376200c 100755 --- a/scripts/training/wrappers/berkeleyparsed2mosesxml.perl +++ b/scripts/training/wrappers/berkeleyparsed2mosesxml.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl b/scripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl index 9e8c30d42..b8ba146c9 100755 --- a/scripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl +++ b/scripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/conll2mosesxml.py b/scripts/training/wrappers/conll2mosesxml.py index 761037488..6473166d9 100755 --- a/scripts/training/wrappers/conll2mosesxml.py +++ b/scripts/training/wrappers/conll2mosesxml.py @@ -1,6 +1,9 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # Author: Rico Sennrich +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. """ Takes a file in the CoNLL dependency format (from the CoNLL-X shared task on diff --git a/scripts/training/wrappers/filter-excluded-lines.perl b/scripts/training/wrappers/filter-excluded-lines.perl index dff104dba..508ab8a06 100755 --- a/scripts/training/wrappers/filter-excluded-lines.perl +++ b/scripts/training/wrappers/filter-excluded-lines.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
use warnings; use strict; diff --git a/scripts/training/wrappers/find-unparseable.perl b/scripts/training/wrappers/find-unparseable.perl index 00009e2e9..fd0664f1d 100755 --- a/scripts/training/wrappers/find-unparseable.perl +++ b/scripts/training/wrappers/find-unparseable.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/mada-wrapper.perl b/scripts/training/wrappers/mada-wrapper.perl index f2cf14f40..d4124e34c 100755 --- a/scripts/training/wrappers/mada-wrapper.perl +++ b/scripts/training/wrappers/mada-wrapper.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/madamira-tok.perl b/scripts/training/wrappers/madamira-tok.perl index 37e70079e..e9f19d53a 100755 --- a/scripts/training/wrappers/madamira-tok.perl +++ b/scripts/training/wrappers/madamira-tok.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/madamira-wrapper.perl b/scripts/training/wrappers/madamira-wrapper.perl index 6535b6187..05ec44d7d 100755 --- a/scripts/training/wrappers/madamira-wrapper.perl +++ b/scripts/training/wrappers/madamira-wrapper.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl b/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl index 1e3a1ce3f..a8ce5f24e 100755 --- a/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl +++ b/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/make-factor-de-lemma.perl b/scripts/training/wrappers/make-factor-de-lemma.perl index db978317e..0b93002a9 100755 --- a/scripts/training/wrappers/make-factor-de-lemma.perl +++ b/scripts/training/wrappers/make-factor-de-lemma.perl @@ -1,4 +1,7 @@ #!/usr/bin/perl -w +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use strict; use Encode; diff --git a/scripts/training/wrappers/make-factor-de-morph.perl b/scripts/training/wrappers/make-factor-de-morph.perl index 366a5a76d..d09196745 100755 --- a/scripts/training/wrappers/make-factor-de-morph.perl +++ b/scripts/training/wrappers/make-factor-de-morph.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
use warnings; use strict; diff --git a/scripts/training/wrappers/make-factor-de-pos.perl b/scripts/training/wrappers/make-factor-de-pos.perl index 495517352..585323bd4 100755 --- a/scripts/training/wrappers/make-factor-de-pos.perl +++ b/scripts/training/wrappers/make-factor-de-pos.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/make-factor-en-porter.perl b/scripts/training/wrappers/make-factor-en-porter.perl index 749dc1318..7ae5fd0b3 100755 --- a/scripts/training/wrappers/make-factor-en-porter.perl +++ b/scripts/training/wrappers/make-factor-en-porter.perl @@ -1,4 +1,7 @@ #!/usr/bin/perl -w +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use strict; use FindBin qw($RealBin); diff --git a/scripts/training/wrappers/make-factor-en-pos.mxpost.perl b/scripts/training/wrappers/make-factor-en-pos.mxpost.perl index 4aa66bac6..2bff8e329 100755 --- a/scripts/training/wrappers/make-factor-en-pos.mxpost.perl +++ b/scripts/training/wrappers/make-factor-en-pos.mxpost.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/make-factor-pos.tree-tagger.perl b/scripts/training/wrappers/make-factor-pos.tree-tagger.perl index 0ad04d4de..1e8ccd0ee 100755 --- a/scripts/training/wrappers/make-factor-pos.tree-tagger.perl +++ b/scripts/training/wrappers/make-factor-pos.tree-tagger.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/make-factor-stem.perl b/scripts/training/wrappers/make-factor-stem.perl index 662f1d882..9bde7648f 100755 --- a/scripts/training/wrappers/make-factor-stem.perl +++ b/scripts/training/wrappers/make-factor-stem.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/make-factor-suffix.perl b/scripts/training/wrappers/make-factor-suffix.perl index 6a59254e4..015df3874 100755 --- a/scripts/training/wrappers/make-factor-suffix.perl +++ b/scripts/training/wrappers/make-factor-suffix.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/morfessor-wrapper.perl b/scripts/training/wrappers/morfessor-wrapper.perl index c65a2cebc..0269045a0 100755 --- a/scripts/training/wrappers/morfessor-wrapper.perl +++ b/scripts/training/wrappers/morfessor-wrapper.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
use warnings; use strict; diff --git a/scripts/training/wrappers/mosesxml2berkeleyparsed.perl b/scripts/training/wrappers/mosesxml2berkeleyparsed.perl index e929658ff..02bc7b88e 100755 --- a/scripts/training/wrappers/mosesxml2berkeleyparsed.perl +++ b/scripts/training/wrappers/mosesxml2berkeleyparsed.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/mosesxml2brackets.py b/scripts/training/wrappers/mosesxml2brackets.py index 6ff1d20c9..6b90aa256 100755 --- a/scripts/training/wrappers/mosesxml2brackets.py +++ b/scripts/training/wrappers/mosesxml2brackets.py @@ -1,8 +1,11 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # Author: Rico Sennrich +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. -# convert trees in moses XML format to PTB-style bracketed format +"""Convert trees in moses XML format to PTB-style bracketed format.""" from __future__ import print_function, unicode_literals import sys diff --git a/scripts/training/wrappers/parse-de-berkeley.perl b/scripts/training/wrappers/parse-de-berkeley.perl index 596fb3eff..f605a37ae 100755 --- a/scripts/training/wrappers/parse-de-berkeley.perl +++ b/scripts/training/wrappers/parse-de-berkeley.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/parse-de-bitpar.perl b/scripts/training/wrappers/parse-de-bitpar.perl index 1bbcf5329..0d5346058 100755 --- a/scripts/training/wrappers/parse-de-bitpar.perl +++ b/scripts/training/wrappers/parse-de-bitpar.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/parse-en-collins.perl b/scripts/training/wrappers/parse-en-collins.perl index 252d3d2b7..c9a960912 100755 --- a/scripts/training/wrappers/parse-en-collins.perl +++ b/scripts/training/wrappers/parse-en-collins.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/parse-en-egret.perl b/scripts/training/wrappers/parse-en-egret.perl index 9f434063b..e97bc1ae0 100755 --- a/scripts/training/wrappers/parse-en-egret.perl +++ b/scripts/training/wrappers/parse-en-egret.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/parse-en-senna.perl b/scripts/training/wrappers/parse-en-senna.perl index f271633ea..2df46284b 100755 --- a/scripts/training/wrappers/parse-en-senna.perl +++ b/scripts/training/wrappers/parse-en-senna.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
use strict; use warnings; diff --git a/scripts/training/wrappers/parse-en-stanford.py b/scripts/training/wrappers/parse-en-stanford.py index 7d8be4bcf..06b027e55 100755 --- a/scripts/training/wrappers/parse-en-stanford.py +++ b/scripts/training/wrappers/parse-en-stanford.py @@ -1,11 +1,17 @@ #!/usr/bin/python # -*- coding: utf-8 -*- # Author: Rico Sennrich +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. -# (hacky) wrapper around Stanford CoreNLP to produce CoNLL dependency format. -# assumes tokenized and sentence-split text. +""" +(Hacky) wrapper around Stanford CoreNLP to produce CoNLL dependency format. +Assumes tokenized and sentence-split text. -# to get Moses XML format, first projectivize the trees, then use conll2mosesxml.py. +To get Moses XML format, first projectivize the trees, then use +conll2mosesxml.py. +""" from __future__ import print_function, unicode_literals import os diff --git a/scripts/training/wrappers/senna2brackets.py b/scripts/training/wrappers/senna2brackets.py index 4fc71ed44..a81100277 100755 --- a/scripts/training/wrappers/senna2brackets.py +++ b/scripts/training/wrappers/senna2brackets.py @@ -1,19 +1,24 @@ #!/usr/bin/env python +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. -# Read SENNA output (from stdin), extract the parse trees, and write them in -# PTB-style bracketed format (to stdout). -# -# The SENNA output is assumed to contain tokens in the first column, POS tags -# in the second column, and PSG fragments in the final column. -# -# It is also assumed that SENNA was run through the parse-en-senna.perl wrapper, -# which: -# -# - Substitutes the special "SENTENCE_TOO_LONG" token for sentences that -# exceed SENNA's hardcoded limit. -# -# - Replaces the bracket-like tokens "-LRB-", "-RRB-", etc. with "(", ")", -# etc. +""" +Read SENNA output (from stdin), extract the parse trees, and write them in +PTB-style bracketed format (to stdout). + +The SENNA output is assumed to contain tokens in the first column, POS tags +in the second column, and PSG fragments in the final column. + +It is also assumed that SENNA was run through the parse-en-senna.perl wrapper, +which: + + - Substitutes the special "SENTENCE_TOO_LONG" token for sentences that + exceed SENNA's hardcoded limit. + + - Replaces the bracket-like tokens "-LRB-", "-RRB-", etc. with "(", ")", + etc. +""" import optparse import os diff --git a/scripts/training/wrappers/syntax-hyphen-splitting.perl b/scripts/training/wrappers/syntax-hyphen-splitting.perl index 653b410d0..1a260df10 100755 --- a/scripts/training/wrappers/syntax-hyphen-splitting.perl +++ b/scripts/training/wrappers/syntax-hyphen-splitting.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/tagger-german-chunk.perl b/scripts/training/wrappers/tagger-german-chunk.perl index c57031889..0b707a579 100755 --- a/scripts/training/wrappers/tagger-german-chunk.perl +++ b/scripts/training/wrappers/tagger-german-chunk.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
use warnings; use strict; From 5d8af9c2896d86785c5db2fd3a8029ae9b741e26 Mon Sep 17 00:00:00 2001 From: Rico Sennrich Date: Fri, 29 May 2015 16:07:26 +0100 Subject: [PATCH 018/108] support memory-mapped files for NPLM training --- scripts/training/bilingual-lm/train_nplm.py | 14 ++++++--- scripts/training/rdlm/train_rdlm.py | 33 +++++++++++++++++---- scripts/training/train-neurallm.py | 33 +++++++++++++++++++-- 3 files changed, 68 insertions(+), 12 deletions(-) diff --git a/scripts/training/bilingual-lm/train_nplm.py b/scripts/training/bilingual-lm/train_nplm.py index cb5980a91..572076006 100755 --- a/scripts/training/bilingual-lm/train_nplm.py +++ b/scripts/training/bilingual-lm/train_nplm.py @@ -39,7 +39,8 @@ parser.add_argument("--input-words-file", dest="input_words_file") parser.add_argument("--output-words-file", dest="output_words_file") parser.add_argument("--input_vocab_size", dest="input_vocab_size", type=int) parser.add_argument("--output_vocab_size", dest="output_vocab_size", type=int) - +parser.add_argument("--mmap", dest="mmap", action="store_true", + help="Use memory-mapped file (for lower memory consumption).") parser.set_defaults( working_dir="working", @@ -113,6 +114,11 @@ def main(options): options.working_dir, os.path.basename(options.corpus_stem) + ".numberized") + mmap_command = [] + if options.mmap: + in_file += '.mmap' + mmap_command = ['--mmap_file', '1'] + model_prefix = os.path.join( options.output_dir, options.output_model + ".model.nplm") train_args = [ @@ -127,9 +133,9 @@ def main(options): "--input_embedding_dimension", str(options.input_embedding), "--output_embedding_dimension", str(options.output_embedding), "--num_threads", str(options.threads), - "--activation_function", - options.activation_fn, - ] + validations_command + vocab_command + "--activation_function", options.activation_fn, + "--ngram_size", str(options.ngram_size), + ] + validations_command + vocab_command + mmap_command print("Train model command: ") print(', '.join(train_args)) diff --git a/scripts/training/rdlm/train_rdlm.py b/scripts/training/rdlm/train_rdlm.py index a7edbab36..289ab405c 100755 --- a/scripts/training/rdlm/train_rdlm.py +++ b/scripts/training/rdlm/train_rdlm.py @@ -94,11 +94,14 @@ parser.add_argument( "--output-words-file", dest="output_words_file", metavar="PATH", help="Output vocabulary (default: %(default)s).") parser.add_argument( - "--input_vocab_size", dest="input_vocab_size", type=int, metavar="INT", + "--input-vocab-size", dest="input_vocab_size", type=int, metavar="INT", help="Input vocabulary size (default: %(default)s).") parser.add_argument( "--output-vocab-size", dest="output_vocab_size", type=int, metavar="INT", help="Output vocabulary size (default: %(default)s).") +parser.add_argument( + "--mmap", dest="mmap", action="store_true", + help="Use memory-mapped file (for lower memory consumption).") parser.set_defaults( @@ -195,11 +198,14 @@ def main(options): "extracting vocabulary from training text.\n") prepare_vocabulary(options) + numberized_file = os.path.basename(options.corpus_stem) + '.numberized' + train_file = numberized_file + if options.mmap: + train_file += '.mmap' + extract_options = extract_syntactic_ngrams.create_parser().parse_args([ '--input', options.corpus_stem, - '--output', os.path.join( - options.working_dir, - os.path.basename(options.corpus_stem) + '.numberized'), + '--output', os.path.join(options.working_dir, numberized_file), '--vocab', options.input_words_file, '--output_vocab', options.output_words_file, '--right_context', 
str(options.right_context_size), @@ -222,6 +228,23 @@ def main(options): else: options.validation_file = None + if options.mmap: + try: + os.remove(os.path.join(options.working_dir, train_file)) + except OSError: + pass + mmap_cmd = [os.path.join(options.nplm_home, 'src', 'createMmap'), + '--input_file', + os.path.join(options.working_dir, numberized_file), + '--output_file', + os.path.join(options.working_dir, train_file) + ] + sys.stderr.write('creating memory-mapped file\n') + sys.stderr.write('executing: ' + ', '.join(mmap_cmd) + '\n') + ret = subprocess.call(mmap_cmd) + if ret: + raise Exception("creating memory-mapped file failed") + sys.stderr.write('training neural network\n') train_nplm.main(options) @@ -234,7 +257,7 @@ def main(options): options.output_model + '.model.nplm.' + str(options.epochs)), os.path.join( options.working_dir, - os.path.basename(options.corpus_stem) + '.numberized'), + numberized_file), os.path.join(options.output_dir, options.output_model + '.model.nplm') ]) if ret: diff --git a/scripts/training/train-neurallm.py b/scripts/training/train-neurallm.py index fec859611..ae77a42af 100755 --- a/scripts/training/train-neurallm.py +++ b/scripts/training/train-neurallm.py @@ -87,6 +87,9 @@ parser.add_argument( parser.add_argument( "--vocab-size", dest="vocab_size", type=int, metavar="INT", help="Vocabulary size (default: %(default)s).") +parser.add_argument( + "--mmap", dest="mmap", action="store_true", + help="Use memory-mapped file (for lower memory consumption).") parser.set_defaults( working_dir="working", @@ -121,20 +124,43 @@ def main(options): if not os.path.exists(options.output_dir): os.makedirs(options.output_dir) + numberized_file = os.path.basename(options.corpus_stem) + '.numberized' + train_file = numberized_file + if options.mmap: + train_file += '.mmap' + extraction_cmd = [os.path.join(options.nplm_home, 'src', 'prepareNeuralLM'), '--train_text', options.corpus_stem, '--ngramize', '1', '--ngram_size', str(options.ngram_size), '--vocab_size', str(options.vocab_size), '--write_words_file', os.path.join(options.working_dir, options.words_file), - '--train_file', os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + '.numberized') + '--train_file', os.path.join(options.working_dir, numberized_file) ] sys.stderr.write('extracting n-grams\n') + sys.stderr.write('executing: ' + ', '.join(extraction_cmd) + '\n') ret = subprocess.call(extraction_cmd) if ret: raise Exception("preparing neural LM failed") - + + if options.mmap: + try: + os.remove(os.path.join(options.working_dir, train_file)) + except OSError: + pass + mmap_cmd = [os.path.join(options.nplm_home, 'src', 'createMmap'), + '--input_file', + os.path.join(options.working_dir, numberized_file), + '--output_file', + os.path.join(options.working_dir, train_file) + ] + sys.stderr.write('creating memory-mapped file\n') + sys.stderr.write('executing: ' + ', '.join(mmap_cmd) + '\n') + ret = subprocess.call(mmap_cmd) + if ret: + raise Exception("creating memory-mapped file failed") + if options.validation_corpus: extraction_cmd = [os.path.join(options.nplm_home, 'src', 'prepareNeuralLM'), @@ -147,6 +173,7 @@ def main(options): ] sys.stderr.write('extracting n-grams (validation file)\n') + sys.stderr.write('executing: ' + ', '.join(extraction_cmd) + '\n') ret = subprocess.call(extraction_cmd) if ret: raise Exception("preparing neural LM failed") @@ -166,7 +193,7 @@ def main(options): average_options = averageNullEmbedding.parser.parse_args( ['-i', os.path.join(options.output_dir, 
options.output_model + '.model.nplm.' + str(options.epochs)), '-o', os.path.join(options.output_dir, options.output_model + '.model.nplm'), - '-t', os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + '.numberized'), + '-t', os.path.join(options.working_dir, numberized_file), '-p', os.path.join(options.nplm_home, 'python')]) averageNullEmbedding.main(average_options) From 2f735998ca8755263ec8dcc30303358988519091 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Fri, 29 May 2015 18:46:02 +0100 Subject: [PATCH 019/108] Rename MosesTraining::SyntaxTree to MosesTraining::SyntaxNodeCollection This is the first step in a small-scale refactoring effort that will touch a lot of the syntax-related code in moses/phrase-extract. The end goals are: - a storage mechanism for general attribute/value pairs in XML-style tree / lattice input. E.g. the "pcfg-score" and "semantic-role" attributes in: I - consolidation of the various near-duplicate Tree / XmlTreeParser classes that have accumulated over the years (my fault) - general de-crufting --- phrase-extract/SentenceAlignmentWithSyntax.h | 4 +- phrase-extract/SyntaxTree.cpp | 48 +++---------------- phrase-extract/SyntaxTree.h | 23 +++------ phrase-extract/XmlTree.cpp | 12 ++--- phrase-extract/XmlTree.h | 2 +- phrase-extract/extract-ghkm/ExtractGHKM.cpp | 4 +- phrase-extract/extract-ghkm/ScfgRule.cpp | 8 ++-- phrase-extract/extract-ghkm/ScfgRule.h | 8 ++-- phrase-extract/extract-ghkm/XmlTreeParser.h | 2 +- phrase-extract/pcfg-common/xml_tree_parser.h | 2 +- phrase-extract/relax-parse-main.cpp | 12 ++--- phrase-extract/relax-parse.h | 8 ++-- .../syntax-common/xml_tree_parser.cc | 10 ++-- .../syntax-common/xml_tree_parser.h | 2 +- 14 files changed, 51 insertions(+), 94 deletions(-) diff --git a/phrase-extract/SentenceAlignmentWithSyntax.h b/phrase-extract/SentenceAlignmentWithSyntax.h index 8b9088770..a603f7722 100644 --- a/phrase-extract/SentenceAlignmentWithSyntax.h +++ b/phrase-extract/SentenceAlignmentWithSyntax.h @@ -36,8 +36,8 @@ namespace MosesTraining class SentenceAlignmentWithSyntax : public SentenceAlignment { public: - SyntaxTree targetTree; - SyntaxTree sourceTree; + SyntaxNodeCollection targetTree; + SyntaxNodeCollection sourceTree; std::set & m_targetLabelCollection; std::set & m_sourceLabelCollection; std::map & m_targetTopLabelCollection; diff --git a/phrase-extract/SyntaxTree.cpp b/phrase-extract/SyntaxTree.cpp index c50693e0d..7f641125e 100644 --- a/phrase-extract/SyntaxTree.cpp +++ b/phrase-extract/SyntaxTree.cpp @@ -1,6 +1,3 @@ -// $Id: SyntaxTree.cpp 1960 2008-12-15 12:52:38Z phkoehn $ -// vim:tabstop=2 - /*********************************************************************** Moses - factored phrase-based language decoder Copyright (C) 2009 University of Edinburgh @@ -29,12 +26,12 @@ namespace MosesTraining { -SyntaxTree::~SyntaxTree() +SyntaxNodeCollection::~SyntaxNodeCollection() { Clear(); } -void SyntaxTree::Clear() +void SyntaxNodeCollection::Clear() { m_top = 0; // loop through all m_nodes, delete them @@ -45,7 +42,7 @@ void SyntaxTree::Clear() m_index.clear(); } -SyntaxNode *SyntaxTree::AddNode( int startPos, int endPos, std::string label ) +SyntaxNode *SyntaxNodeCollection::AddNode( int startPos, int endPos, std::string label ) { SyntaxNode* newNode = new SyntaxNode( startPos, endPos, label ); m_nodes.push_back( newNode ); @@ -54,7 +51,7 @@ SyntaxNode *SyntaxTree::AddNode( int startPos, int endPos, std::string label ) return newNode; } -ParentNodes SyntaxTree::Parse() +ParentNodes 
SyntaxNodeCollection::Parse() { ParentNodes parents; @@ -94,12 +91,12 @@ ParentNodes SyntaxTree::Parse() return parents; } -bool SyntaxTree::HasNode( int startPos, int endPos ) const +bool SyntaxNodeCollection::HasNode( int startPos, int endPos ) const { return GetNodes( startPos, endPos).size() > 0; } -const std::vector< SyntaxNode* >& SyntaxTree::GetNodes( int startPos, int endPos ) const +const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodes( int startPos, int endPos ) const { SyntaxTreeIndexIterator startIndex = m_index.find( startPos ); if (startIndex == m_index.end() ) @@ -112,15 +109,7 @@ const std::vector< SyntaxNode* >& SyntaxTree::GetNodes( int startPos, int endPos return endIndex->second; } -// for printing out tree -std::string SyntaxTree::ToString() const -{ - std::stringstream out; - out << *this; - return out.str(); -} - -void SyntaxTree::ConnectNodes() +void SyntaxNodeCollection::ConnectNodes() { typedef SyntaxTreeIndex2::const_reverse_iterator InnerIterator; @@ -162,27 +151,4 @@ void SyntaxTree::ConnectNodes() } } -std::ostream& operator<<(std::ostream& os, const SyntaxTree& t) -{ - size_t size = t.m_index.size(); - for(size_t length=1; length<=size; length++) { - for(size_t space=0; spaceGetLabel() + "#######"; - - os << label.substr(0,7) << " "; - } else { - os << "------- "; - } - } - os << std::endl; - } - return os; } - -} - diff --git a/phrase-extract/SyntaxTree.h b/phrase-extract/SyntaxTree.h index 6ffb5da34..649a6197b 100644 --- a/phrase-extract/SyntaxTree.h +++ b/phrase-extract/SyntaxTree.h @@ -1,6 +1,3 @@ -// $Id: SyntaxTree.h 1960 2008-12-15 12:52:38Z phkoehn $ -// vim:tabstop=2 - /*********************************************************************** Moses - factored phrase-based language decoder Copyright (C) 2009 University of Edinburgh @@ -20,12 +17,12 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ***********************************************************************/ - #pragma once -#include -#include + #include #include +#include +#include namespace MosesTraining { @@ -79,7 +76,7 @@ public: typedef std::vector< int > SplitPoints; typedef std::vector< SplitPoints > ParentNodes; -class SyntaxTree +class SyntaxNodeCollection { protected: std::vector< SyntaxNode* > m_nodes; @@ -93,14 +90,12 @@ protected: int m_size; std::vector< SyntaxNode* > m_emptyNode; - friend std::ostream& operator<<(std::ostream&, const SyntaxTree&); - public: - SyntaxTree() + SyntaxNodeCollection() : m_top(0) // m_top doesn't get set unless ConnectNodes is called. , m_size(0) {} - ~SyntaxTree(); + ~SyntaxNodeCollection(); SyntaxNode *AddNode( int startPos, int endPos, std::string label ); @@ -119,10 +114,6 @@ public: } void ConnectNodes(); void Clear(); - std::string ToString() const; }; -std::ostream& operator<<(std::ostream&, const SyntaxTree&); - -} - +} // namespace MosesTraining diff --git a/phrase-extract/XmlTree.cpp b/phrase-extract/XmlTree.cpp index 6efa1bf5c..d45fd99eb 100644 --- a/phrase-extract/XmlTree.cpp +++ b/phrase-extract/XmlTree.cpp @@ -1,6 +1,3 @@ -// $Id: XmlOption.cpp 1960 2008-12-15 12:52:38Z phkoehn $ -// vim:tabstop=2 - /*********************************************************************** Moses - factored phrase-based language decoder Copyright (C) 2006 University of Edinburgh @@ -228,7 +225,10 @@ vector TokenizeXml(const string& str) parse because we don't have the completed source parsed until after this function removes all the markup from it (CreateFromString in Sentence::Read). 
*/ -bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &labelCollection, map< string, int > &topLabelCollection, bool unescapeSpecialChars ) +bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection, + set< string > &labelCollection, + map< string, int > &topLabelCollection, + bool unescapeSpecialChars ) { //parse XML markup in translation line @@ -374,7 +374,7 @@ bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &label cerr << "XML TAG LABEL IS: '" << label << "'" << endl; cerr << "XML SPAN IS: " << startPos << "-" << (endPos-1) << endl; } - SyntaxNode *node = tree.AddNode( startPos, endPos-1, label ); + SyntaxNode *node = nodeCollection.AddNode( startPos, endPos-1, label ); node->SetPcfgScore(pcfgScore); } } @@ -386,7 +386,7 @@ bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &label } // collect top labels - const vector< SyntaxNode* >& topNodes = tree.GetNodes( 0, wordPos-1 ); + const vector< SyntaxNode* >& topNodes = nodeCollection.GetNodes( 0, wordPos-1 ); for( vector< SyntaxNode* >::const_iterator node = topNodes.begin(); node != topNodes.end(); node++ ) { SyntaxNode *n = *node; const string &label = n->GetLabel(); diff --git a/phrase-extract/XmlTree.h b/phrase-extract/XmlTree.h index 50b1c0acc..392192ae6 100644 --- a/phrase-extract/XmlTree.h +++ b/phrase-extract/XmlTree.h @@ -35,7 +35,7 @@ std::string Trim(const std::string& str, const std::string dropChars = " \t\n\r" std::string TrimXml(const std::string& str); bool isXmlTag(const std::string& tag); std::vector TokenizeXml(const std::string& str); -bool ProcessAndStripXMLTags(std::string &line, SyntaxTree &tree, std::set< std::string > &labelCollection, std::map< std::string, int > &topLabelCollection, bool unescape = true); +bool ProcessAndStripXMLTags(std::string &line, SyntaxNodeCollection &tree, std::set< std::string > &labelCollection, std::map< std::string, int > &topLabelCollection, bool unescape = true); std::string unescape(const std::string &str); diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp index bc687ec6b..9e6aacc20 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp +++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp @@ -172,7 +172,7 @@ int ExtractGHKM::Main(int argc, char *argv[]) // Parse source tree and construct a SyntaxTree object. - MosesTraining::SyntaxTree sourceSyntaxTree; + MosesTraining::SyntaxNodeCollection sourceSyntaxTree; MosesTraining::SyntaxNode *sourceSyntaxTreeRoot=NULL; if (options.sourceLabels) { @@ -196,7 +196,7 @@ int ExtractGHKM::Main(int argc, char *argv[]) // Read source tokens. std::vector sourceTokens(ReadTokens(sourceLine)); - // Construct a source ParseTree object from the SyntaxTree object. + // Construct a source ParseTree object from the SyntaxNodeCollection object. 
std::auto_ptr sourceParseTree; if (options.sourceLabels) { diff --git a/phrase-extract/extract-ghkm/ScfgRule.cpp b/phrase-extract/extract-ghkm/ScfgRule.cpp index 01178b72c..94ff3c605 100644 --- a/phrase-extract/extract-ghkm/ScfgRule.cpp +++ b/phrase-extract/extract-ghkm/ScfgRule.cpp @@ -31,7 +31,7 @@ namespace GHKM { ScfgRule::ScfgRule(const Subgraph &fragment, - const MosesTraining::SyntaxTree *sourceSyntaxTree) + const MosesTraining::SyntaxNodeCollection *sourceSyntaxTree) : m_graphFragment(fragment) , m_sourceLHS("X", NonTerminal) , m_targetLHS(fragment.GetRoot()->GetLabel(), NonTerminal) @@ -133,9 +133,9 @@ ScfgRule::ScfgRule(const Subgraph &fragment, } } -void ScfgRule::PushSourceLabel(const MosesTraining::SyntaxTree *sourceSyntaxTree, - const Node *node, - const std::string &nonMatchingLabel) +void ScfgRule::PushSourceLabel( + const MosesTraining::SyntaxNodeCollection *sourceSyntaxTree, + const Node *node, const std::string &nonMatchingLabel) { ContiguousSpan span = Closure(node->GetSpan()); if (sourceSyntaxTree->HasNode(span.first,span.second)) { // does a source constituent match the span? diff --git a/phrase-extract/extract-ghkm/ScfgRule.h b/phrase-extract/extract-ghkm/ScfgRule.h index 94ee7b82e..b3d8ad017 100644 --- a/phrase-extract/extract-ghkm/ScfgRule.h +++ b/phrase-extract/extract-ghkm/ScfgRule.h @@ -41,7 +41,7 @@ class ScfgRule : public Rule { public: ScfgRule(const Subgraph &fragment, - const MosesTraining::SyntaxTree *sourceSyntaxTree = 0); + const MosesTraining::SyntaxNodeCollection *sourceSyntaxTree = 0); const Subgraph &GetGraphFragment() const { return m_graphFragment; @@ -78,9 +78,9 @@ public: } private: - void PushSourceLabel(const MosesTraining::SyntaxTree *sourceSyntaxTree, - const Node *node, - const std::string &nonMatchingLabel); + void PushSourceLabel( + const MosesTraining::SyntaxNodeCollection *sourceSyntaxTree, + const Node *node, const std::string &nonMatchingLabel); const Subgraph& m_graphFragment; Symbol m_sourceLHS; diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.h b/phrase-extract/extract-ghkm/XmlTreeParser.h index ff0baeace..03450383a 100644 --- a/phrase-extract/extract-ghkm/XmlTreeParser.h +++ b/phrase-extract/extract-ghkm/XmlTreeParser.h @@ -58,7 +58,7 @@ private: std::set &m_labelSet; std::map &m_topLabelSet; std::string m_line; - MosesTraining::SyntaxTree m_tree; + MosesTraining::SyntaxNodeCollection m_tree; std::vector m_words; }; diff --git a/phrase-extract/pcfg-common/xml_tree_parser.h b/phrase-extract/pcfg-common/xml_tree_parser.h index 675a112d8..69754bb56 100644 --- a/phrase-extract/pcfg-common/xml_tree_parser.h +++ b/phrase-extract/pcfg-common/xml_tree_parser.h @@ -47,7 +47,7 @@ class XmlTreeParser { std::set m_labelSet; std::map m_topLabelSet; std::string m_line; - MosesTraining::SyntaxTree m_tree; + MosesTraining::SyntaxNodeCollection m_tree; std::vector m_words; }; diff --git a/phrase-extract/relax-parse-main.cpp b/phrase-extract/relax-parse-main.cpp index 5c9daa7ae..5bca886bf 100644 --- a/phrase-extract/relax-parse-main.cpp +++ b/phrase-extract/relax-parse-main.cpp @@ -43,7 +43,7 @@ int main(int argc, char* argv[]) // process into syntax tree representation set< string > labelCollection; // set of labels, not used map< string, int > topLabelCollection; // count of top labels, not used - SyntaxTree tree; + SyntaxNodeCollection tree; ProcessAndStripXMLTags( inBufferString, tree, labelCollection, topLabelCollection, false ); const vector< string > inWords = util::tokenize( inBufferString ); @@ -105,7 +105,7 @@ void init(int argc, 
char* argv[]) } } -void store( SyntaxTree &tree, const vector< string > &words ) +void store( SyntaxNodeCollection &tree, const vector< string > &words ) { // output words for( size_t i=0; i &words ) cout << endl; } -void LeftBinarize( SyntaxTree &tree, ParentNodes &parents ) +void LeftBinarize( SyntaxNodeCollection &tree, ParentNodes &parents ) { for(ParentNodes::const_iterator p = parents.begin(); p != parents.end(); p++) { const SplitPoints &point = *p; @@ -143,7 +143,7 @@ void LeftBinarize( SyntaxTree &tree, ParentNodes &parents ) } } -void RightBinarize( SyntaxTree &tree, ParentNodes &parents ) +void RightBinarize( SyntaxNodeCollection &tree, ParentNodes &parents ) { for(ParentNodes::const_iterator p = parents.begin(); p != parents.end(); p++) { const SplitPoints &point = *p; @@ -161,11 +161,11 @@ void RightBinarize( SyntaxTree &tree, ParentNodes &parents ) } } -void SAMT( SyntaxTree &tree, ParentNodes &parents ) +void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents ) { int numWords = tree.GetNumWords(); - SyntaxTree newTree; // to store new nodes + SyntaxNodeCollection newTree; // to store new nodes // look through parents to combine children for(ParentNodes::const_iterator p = parents.begin(); p != parents.end(); p++) { diff --git a/phrase-extract/relax-parse.h b/phrase-extract/relax-parse.h index 9bd0bfb23..af41b0945 100644 --- a/phrase-extract/relax-parse.h +++ b/phrase-extract/relax-parse.h @@ -39,8 +39,8 @@ char SAMTLevel = 0; // functions void init(int argc, char* argv[]); -void store( MosesTraining::SyntaxTree &tree, const std::vector &words ); -void LeftBinarize( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents ); -void RightBinarize( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents ); -void SAMT( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents ); +void store( MosesTraining::SyntaxNodeCollection &tree, const std::vector &words ); +void LeftBinarize( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents ); +void RightBinarize( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents ); +void SAMT( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents ); diff --git a/phrase-extract/syntax-common/xml_tree_parser.cc b/phrase-extract/syntax-common/xml_tree_parser.cc index c6e3cd3c3..2f8a904fa 100644 --- a/phrase-extract/syntax-common/xml_tree_parser.cc +++ b/phrase-extract/syntax-common/xml_tree_parser.cc @@ -13,17 +13,17 @@ namespace Syntax { StringTree *XmlTreeParser::Parse(const std::string &line) { line_ = line; - tree_.Clear(); + node_collection_.Clear(); try { - if (!ProcessAndStripXMLTags(line_, tree_, label_set_, top_label_set_, - false)) { + if (!ProcessAndStripXMLTags(line_, node_collection_, label_set_, + top_label_set_, false)) { throw Exception(""); } } catch (const XmlException &e) { throw Exception(e.getMsg()); } - tree_.ConnectNodes(); - SyntaxNode *root = tree_.GetTop(); + node_collection_.ConnectNodes(); + SyntaxNode *root = node_collection_.GetTop(); assert(root); words_ = util::tokenize(line_); return ConvertTree(*root, words_); diff --git a/phrase-extract/syntax-common/xml_tree_parser.h b/phrase-extract/syntax-common/xml_tree_parser.h index a5563f63a..e530b84ef 100644 --- a/phrase-extract/syntax-common/xml_tree_parser.h +++ b/phrase-extract/syntax-common/xml_tree_parser.h @@ -26,7 +26,7 @@ class XmlTreeParser { std::set label_set_; std::map top_label_set_; std::string line_; - MosesTraining::SyntaxTree tree_; + 
MosesTraining::SyntaxNodeCollection node_collection_; std::vector words_; }; From 985e7bbfc30c6f124c546e769948caf22eacfc66 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Fri, 29 May 2015 20:57:25 +0100 Subject: [PATCH 020/108] Ongoing moses/phrase-extract refactoring --- phrase-extract/SentenceAlignmentWithSyntax.h | 2 +- phrase-extract/SyntaxNode.h | 75 +++++++++++++++++++ ...yntaxTree.cpp => SyntaxNodeCollection.cpp} | 7 +- .../{SyntaxTree.h => SyntaxNodeCollection.h} | 50 +------------ phrase-extract/XmlTree.cpp | 3 +- phrase-extract/XmlTree.h | 10 +-- phrase-extract/extract-ghkm/ExtractGHKM.cpp | 3 +- phrase-extract/extract-ghkm/ScfgRule.cpp | 7 +- phrase-extract/extract-ghkm/ScfgRule.h | 9 +-- phrase-extract/extract-ghkm/XmlTreeParser.h | 5 +- phrase-extract/extract-rules-main.cpp | 2 +- phrase-extract/pcfg-common/xml_tree_parser.h | 3 +- phrase-extract/relax-parse.h | 2 +- .../syntax-common/xml_tree_parser.h | 3 +- 14 files changed, 108 insertions(+), 73 deletions(-) create mode 100644 phrase-extract/SyntaxNode.h rename phrase-extract/{SyntaxTree.cpp => SyntaxNodeCollection.cpp} (96%) rename phrase-extract/{SyntaxTree.h => SyntaxNodeCollection.h} (69%) diff --git a/phrase-extract/SentenceAlignmentWithSyntax.h b/phrase-extract/SentenceAlignmentWithSyntax.h index a603f7722..604b6d0e2 100644 --- a/phrase-extract/SentenceAlignmentWithSyntax.h +++ b/phrase-extract/SentenceAlignmentWithSyntax.h @@ -28,7 +28,7 @@ #include "RuleExtractionOptions.h" #include "SentenceAlignment.h" -#include "SyntaxTree.h" +#include "SyntaxNodeCollection.h" namespace MosesTraining { diff --git a/phrase-extract/SyntaxNode.h b/phrase-extract/SyntaxNode.h new file mode 100644 index 000000000..46e0f456f --- /dev/null +++ b/phrase-extract/SyntaxNode.h @@ -0,0 +1,75 @@ +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2009 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#pragma once + +#include +#include +#include +#include + +namespace MosesTraining +{ + +class SyntaxNode +{ +protected: + int m_start, m_end; + std::string m_label; + std::vector< SyntaxNode* > m_children; + SyntaxNode* m_parent; + float m_pcfgScore; +public: + SyntaxNode( int startPos, int endPos, std::string label ) + :m_start(startPos) + ,m_end(endPos) + ,m_label(label) + ,m_parent(0) + ,m_pcfgScore(0.0f) { + } + int GetStart() const { + return m_start; + } + int GetEnd() const { + return m_end; + } + std::string GetLabel() const { + return m_label; + } + float GetPcfgScore() const { + return m_pcfgScore; + } + void SetPcfgScore(float score) { + m_pcfgScore = score; + } + SyntaxNode *GetParent() { + return m_parent; + } + void SetParent(SyntaxNode *parent) { + m_parent = parent; + } + void AddChild(SyntaxNode* child) { + m_children.push_back(child); + } + const std::vector< SyntaxNode* > &GetChildren() const { + return m_children; + } +}; + +} // namespace MosesTraining diff --git a/phrase-extract/SyntaxTree.cpp b/phrase-extract/SyntaxNodeCollection.cpp similarity index 96% rename from phrase-extract/SyntaxTree.cpp rename to phrase-extract/SyntaxNodeCollection.cpp index 7f641125e..099a5697f 100644 --- a/phrase-extract/SyntaxTree.cpp +++ b/phrase-extract/SyntaxNodeCollection.cpp @@ -18,7 +18,7 @@ ***********************************************************************/ -#include "SyntaxTree.h" +#include "SyntaxNodeCollection.h" #include #include @@ -42,7 +42,8 @@ void SyntaxNodeCollection::Clear() m_index.clear(); } -SyntaxNode *SyntaxNodeCollection::AddNode( int startPos, int endPos, std::string label ) +SyntaxNode *SyntaxNodeCollection::AddNode(int startPos, int endPos, + const std::string &label) { SyntaxNode* newNode = new SyntaxNode( startPos, endPos, label ); m_nodes.push_back( newNode ); @@ -151,4 +152,4 @@ void SyntaxNodeCollection::ConnectNodes() } } -} +} // namespace MosesTraining diff --git a/phrase-extract/SyntaxTree.h b/phrase-extract/SyntaxNodeCollection.h similarity index 69% rename from phrase-extract/SyntaxTree.h rename to phrase-extract/SyntaxNodeCollection.h index 649a6197b..70b14206d 100644 --- a/phrase-extract/SyntaxTree.h +++ b/phrase-extract/SyntaxNodeCollection.h @@ -24,55 +24,11 @@ #include #include +#include "SyntaxNode.h" + namespace MosesTraining { -class SyntaxNode -{ -protected: - int m_start, m_end; - std::string m_label; - std::vector< SyntaxNode* > m_children; - SyntaxNode* m_parent; - float m_pcfgScore; -public: - SyntaxNode( int startPos, int endPos, std::string label ) - :m_start(startPos) - ,m_end(endPos) - ,m_label(label) - ,m_parent(0) - ,m_pcfgScore(0.0f) { - } - int GetStart() const { - return m_start; - } - int GetEnd() const { - return m_end; - } - std::string GetLabel() const { - return m_label; - } - float GetPcfgScore() const { - return m_pcfgScore; - } - void SetPcfgScore(float score) { - m_pcfgScore = score; - } - SyntaxNode *GetParent() { - return m_parent; - } - void SetParent(SyntaxNode *parent) { - m_parent = parent; - } - void AddChild(SyntaxNode* child) { - m_children.push_back(child); - } - const std::vector< SyntaxNode* > &GetChildren() const { - return m_children; - } -}; - - typedef std::vector< int > SplitPoints; typedef std::vector< 
SplitPoints > ParentNodes; @@ -97,7 +53,7 @@ public: ~SyntaxNodeCollection(); - SyntaxNode *AddNode( int startPos, int endPos, std::string label ); + SyntaxNode *AddNode( int startPos, int endPos, const std::string &label ); SyntaxNode *GetTop() { return m_top; diff --git a/phrase-extract/XmlTree.cpp b/phrase-extract/XmlTree.cpp index d45fd99eb..0f068fca7 100644 --- a/phrase-extract/XmlTree.cpp +++ b/phrase-extract/XmlTree.cpp @@ -24,7 +24,8 @@ #include #include #include -#include "SyntaxTree.h" + +#include "SyntaxNodeCollection.h" #include "XmlException.h" using namespace std; diff --git a/phrase-extract/XmlTree.h b/phrase-extract/XmlTree.h index 392192ae6..3b5afd4dd 100644 --- a/phrase-extract/XmlTree.h +++ b/phrase-extract/XmlTree.h @@ -1,6 +1,3 @@ -// $Id: XmlOption.cpp 1960 2008-12-15 12:52:38Z phkoehn $ -// vim:tabstop=2 - /*********************************************************************** Moses - factored phrase-based language decoder Copyright (C) 2006 University of Edinburgh @@ -21,11 +18,13 @@ ***********************************************************************/ #pragma once + #include #include #include #include -#include "SyntaxTree.h" + +#include "SyntaxNodeCollection.h" namespace MosesTraining { @@ -39,5 +38,4 @@ bool ProcessAndStripXMLTags(std::string &line, SyntaxNodeCollection &tree, std:: std::string unescape(const std::string &str); -} // namespace - +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp index 9e6aacc20..937d88030 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp +++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp @@ -33,7 +33,8 @@ #include "Span.h" #include "StsgRule.h" #include "StsgRuleWriter.h" -#include "SyntaxTree.h" +#include "SyntaxNode.h" +#include "SyntaxNodeCollection.h" #include "tables-core.h" #include "XmlException.h" #include "XmlTree.h" diff --git a/phrase-extract/extract-ghkm/ScfgRule.cpp b/phrase-extract/extract-ghkm/ScfgRule.cpp index 94ff3c605..918c88eeb 100644 --- a/phrase-extract/extract-ghkm/ScfgRule.cpp +++ b/phrase-extract/extract-ghkm/ScfgRule.cpp @@ -19,11 +19,12 @@ #include "ScfgRule.h" +#include + #include "Node.h" #include "Subgraph.h" -#include "SyntaxTree.h" - -#include +#include "SyntaxNode.h" +#include "SyntaxNodeCollection.h" namespace Moses { diff --git a/phrase-extract/extract-ghkm/ScfgRule.h b/phrase-extract/extract-ghkm/ScfgRule.h index b3d8ad017..c8b76114a 100644 --- a/phrase-extract/extract-ghkm/ScfgRule.h +++ b/phrase-extract/extract-ghkm/ScfgRule.h @@ -19,16 +19,16 @@ #pragma once -#include "Alignment.h" -#include "Rule.h" -#include "SyntaxTree.h" - #include #include #include #include #include +#include "Alignment.h" +#include "Rule.h" +#include "SyntaxNodeCollection.h" + namespace Moses { namespace GHKM @@ -95,4 +95,3 @@ private: } // namespace GHKM } // namespace Moses - diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.h b/phrase-extract/extract-ghkm/XmlTreeParser.h index 03450383a..db9fa8bf2 100644 --- a/phrase-extract/extract-ghkm/XmlTreeParser.h +++ b/phrase-extract/extract-ghkm/XmlTreeParser.h @@ -23,14 +23,15 @@ #include "Exception.h" -#include "SyntaxTree.h" - #include #include #include #include #include +#include "SyntaxNode.h" +#include "SyntaxNodeCollection.h" + namespace Moses { namespace GHKM diff --git a/phrase-extract/extract-rules-main.cpp b/phrase-extract/extract-rules-main.cpp index 50baa4e0d..825f12d89 100644 --- a/phrase-extract/extract-rules-main.cpp +++ b/phrase-extract/extract-rules-main.cpp @@ 
-41,7 +41,7 @@ #include "HoleCollection.h" #include "RuleExist.h" #include "SentenceAlignmentWithSyntax.h" -#include "SyntaxTree.h" +#include "SyntaxNode.h" #include "tables-core.h" #include "XmlTree.h" #include "InputFileStream.h" diff --git a/phrase-extract/pcfg-common/xml_tree_parser.h b/phrase-extract/pcfg-common/xml_tree_parser.h index 69754bb56..8605c0691 100644 --- a/phrase-extract/pcfg-common/xml_tree_parser.h +++ b/phrase-extract/pcfg-common/xml_tree_parser.h @@ -28,7 +28,8 @@ #include #include "pcfg_tree.h" -#include "SyntaxTree.h" +#include "SyntaxNode.h" +#include "SyntaxNodeCollection.h" namespace MosesTraining { namespace Syntax { diff --git a/phrase-extract/relax-parse.h b/phrase-extract/relax-parse.h index af41b0945..a00aa6deb 100644 --- a/phrase-extract/relax-parse.h +++ b/phrase-extract/relax-parse.h @@ -28,7 +28,7 @@ #include #include -#include "SyntaxTree.h" +#include "SyntaxNodeCollection.h" #include "XmlTree.h" #define LINE_MAX_LENGTH 1000000 diff --git a/phrase-extract/syntax-common/xml_tree_parser.h b/phrase-extract/syntax-common/xml_tree_parser.h index e530b84ef..c84ea25ec 100644 --- a/phrase-extract/syntax-common/xml_tree_parser.h +++ b/phrase-extract/syntax-common/xml_tree_parser.h @@ -5,7 +5,8 @@ #include #include -#include "SyntaxTree.h" +#include "SyntaxNode.h" +#include "SyntaxNodeCollection.h" #include "exception.h" #include "string_tree.h" From ab9b9ae3493da391d19b98551af966b6426bd400 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sun, 31 May 2015 21:27:55 +0400 Subject: [PATCH 021/108] 1st pass to automatically beautify --- cruise-control/test_all_new_commits.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cruise-control/test_all_new_commits.sh b/cruise-control/test_all_new_commits.sh index 1e0a9c47f..7f1520452 100755 --- a/cruise-control/test_all_new_commits.sh +++ b/cruise-control/test_all_new_commits.sh @@ -107,6 +107,7 @@ function run_single_test () { #regtest_dir=$PWD/$(basename $regtest_file .tgz) cd .. + ./scripts/other/beautify.py --format echo "## ./bjam clean" >> $longlog ./bjam clean $MCC_CONFIGURE_ARGS --with-regtest=$regtest_dir >> $longlog 2>&1 || warn "bjam clean failed, suspicious" @@ -153,8 +154,10 @@ function run_single_test () { date >> $longlog if [ -z "$err" ]; then + git commit -am "automatic daily beautifier" status="OK" else + git reset --hard HEAD status="FAIL:$err" fi echo "## Status: $status" >> $longlog From afb032014dc22cc184046fbf99fc08569781afe5 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sun, 31 May 2015 21:51:43 +0400 Subject: [PATCH 022/108] skip perltidy. Not available on thor (Ubuntu 12.04) --- cruise-control/test_all_new_commits.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cruise-control/test_all_new_commits.sh b/cruise-control/test_all_new_commits.sh index 7f1520452..bb9305768 100755 --- a/cruise-control/test_all_new_commits.sh +++ b/cruise-control/test_all_new_commits.sh @@ -107,7 +107,7 @@ function run_single_test () { #regtest_dir=$PWD/$(basename $regtest_file .tgz) cd .. 
- ./scripts/other/beautify.py --format + ./scripts/other/beautify.py --format --skip-perltidy echo "## ./bjam clean" >> $longlog ./bjam clean $MCC_CONFIGURE_ARGS --with-regtest=$regtest_dir >> $longlog 2>&1 || warn "bjam clean failed, suspicious" @@ -190,7 +190,7 @@ done #### Main loop over all commits for i in $MCC_SCAN_BRANCHES; do - warn "On brach $i" + warn "On branch $i" git rev-list $i \ | while read commit; do first_char=$(echo $commit | grep -o '^.') From c754aef37a804c5ee74ff7b5ccbe1b4cdc80e81c Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Mon, 1 Jun 2015 08:45:04 +0100 Subject: [PATCH 023/108] Oops. Fix compile error. --- phrase-extract/extract-ghkm/StsgRule.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/phrase-extract/extract-ghkm/StsgRule.cpp b/phrase-extract/extract-ghkm/StsgRule.cpp index 83398f80a..271249e1b 100644 --- a/phrase-extract/extract-ghkm/StsgRule.cpp +++ b/phrase-extract/extract-ghkm/StsgRule.cpp @@ -2,7 +2,6 @@ #include "Node.h" #include "Subgraph.h" -#include "SyntaxTree.h" #include From f3ccd68bee73e2d8dfe8b7d57c9ea2b33d0d99ae Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Mon, 1 Jun 2015 10:35:50 +0100 Subject: [PATCH 024/108] Add ConstPreOrderIterator to MosesTraining::Syntax::Tree --- phrase-extract/syntax-common/tree-inl.h | 43 +++++++++++++---------- phrase-extract/syntax-common/tree.h | 28 +++++++++------ phrase-extract/syntax-common/tree_test.cc | 40 +++++++++++++++++++++ 3 files changed, 83 insertions(+), 28 deletions(-) diff --git a/phrase-extract/syntax-common/tree-inl.h b/phrase-extract/syntax-common/tree-inl.h index 2ba55df1a..9101fc490 100644 --- a/phrase-extract/syntax-common/tree-inl.h +++ b/phrase-extract/syntax-common/tree-inl.h @@ -35,23 +35,24 @@ std::size_t Tree::Depth() const { } template -class Tree::PreOrderIterator { +template +class Tree::PreOrderIter { public: - PreOrderIterator(); - PreOrderIterator(Tree &); + PreOrderIter(); + PreOrderIter(V &); - Tree &operator*() { return *node_; } - Tree *operator->() { return node_; } + V &operator*() { return *node_; } + V *operator->() { return node_; } - PreOrderIterator &operator++(); - PreOrderIterator operator++(int); + PreOrderIter &operator++(); + PreOrderIter operator++(int); - bool operator==(const Tree::PreOrderIterator &); - bool operator!=(const Tree::PreOrderIterator &); + bool operator==(const PreOrderIter &); + bool operator!=(const PreOrderIter &); private: // Pointer to the current node. - Tree *node_; + V *node_; // Stack of indices defining the position of node_ within the child vectors // of its ancestors. @@ -59,17 +60,20 @@ class Tree::PreOrderIterator { }; template -Tree::PreOrderIterator::PreOrderIterator() +template +Tree::PreOrderIter::PreOrderIter() : node_(0) { } template -Tree::PreOrderIterator::PreOrderIterator(Tree &t) +template +Tree::PreOrderIter::PreOrderIter(V &t) : node_(&t) { } template -typename Tree::PreOrderIterator &Tree::PreOrderIterator::operator++() { +template +Tree::PreOrderIter &Tree::PreOrderIter::operator++() { // If the current node has children then visit the left-most child next. if (!node_->children().empty()) { index_stack_.push(0); @@ -79,7 +83,7 @@ typename Tree::PreOrderIterator &Tree::PreOrderIterator::operator++() { // Otherwise, try node's ancestors until either a node is found with a // sibling to the right or we reach the root (in which case the traversal // is complete). 
- Tree *ancestor = node_->parent_; + V *ancestor = node_->parent_; while (ancestor) { std::size_t index = index_stack_.top(); index_stack_.pop(); @@ -95,19 +99,22 @@ typename Tree::PreOrderIterator &Tree::PreOrderIterator::operator++() { } template -typename Tree::PreOrderIterator Tree::PreOrderIterator::operator++(int) { - PreOrderIterator tmp(*this); +template +Tree::PreOrderIter Tree::PreOrderIter::operator++(int) { + PreOrderIter tmp(*this); ++*this; return tmp; } template -bool Tree::PreOrderIterator::operator==(const PreOrderIterator &rhs) { +template +bool Tree::PreOrderIter::operator==(const PreOrderIter &rhs) { return node_ == rhs.node_; } template -bool Tree::PreOrderIterator::operator!=(const PreOrderIterator &rhs) { +template +bool Tree::PreOrderIter::operator!=(const PreOrderIter &rhs) { return node_ != rhs.node_; } diff --git a/phrase-extract/syntax-common/tree.h b/phrase-extract/syntax-common/tree.h index 52adaa699..e37c2c21f 100644 --- a/phrase-extract/syntax-common/tree.h +++ b/phrase-extract/syntax-common/tree.h @@ -61,23 +61,31 @@ class Tree { // // All iterators are forward iterators. Example use: // - // Tree &root = GetMeATree(); - // for (Tree::PreOrderIterator p(root); - // p != Tree::PreOrderIterator(); ++p) { - // std::cout << p->value() << " "; + // const Tree &root = GetMeATree(); + // for (Tree::ConstPreOrderIterator p(root); + // p != Tree::ConstPreOrderIterator(); ++p) { + // std::cout << p->value() << "\n"; // } + private: + // Use templates to avoid code duplication between const and non-const + // iterators. V is the value type: either Tree or const Tree. + template class PreOrderIter; + // template class PostOrderIter; TODO + // template class LeafIter; TODO + + public: // Pre-order iterators. - class PreOrderIterator; - // class ConstPreOrderIterator; TODO + typedef PreOrderIter > PreOrderIterator; + typedef PreOrderIter > ConstPreOrderIterator; // Post-order iterators. - // class PostOrderIterator; TODO - // class ConstPostOrderIterator; TODO + // typedef PostOrderIter > PostOrderIterator; TODO + // typedef PostOrderIter > ConstPostOrderIterator; TODO // Leaf iterators (left-to-right). 
- // class LeafIterator; TODO - // class ConstLeafIterator; TODO + // typedef LeafIter > LeafIterator; TODO + // typedef LeafIter > ConstLeafIterator; TODO private: T value_; diff --git a/phrase-extract/syntax-common/tree_test.cc b/phrase-extract/syntax-common/tree_test.cc index 0a54ad3f1..198f52310 100644 --- a/phrase-extract/syntax-common/tree_test.cc +++ b/phrase-extract/syntax-common/tree_test.cc @@ -61,6 +61,46 @@ BOOST_AUTO_TEST_CASE(pre_order_2) { BOOST_REQUIRE(p == end); } +// Test Tree<>::ConstPreOrderIterator on this tree: (1 (2 (3 (4 (5) (6)))) (7)) +BOOST_AUTO_TEST_CASE(const_pre_order_1) { + boost::scoped_ptr > root(new Tree(1)); + root->children().push_back(new Tree(2)); + root->children()[0]->children().push_back(new Tree(3)); + root->children()[0]->children()[0]->children().push_back(new Tree(4)); + root->children()[0]->children()[0]->children()[0]->children().push_back( + new Tree(5)); + root->children()[0]->children()[0]->children()[0]->children().push_back( + new Tree(6)); + root->children().push_back(new Tree(7)); + root->SetParents(); + + Tree::ConstPreOrderIterator p(*root); + Tree::ConstPreOrderIterator end; + + BOOST_REQUIRE(p != end); + BOOST_REQUIRE(p->value() == 1); + ++p; + BOOST_REQUIRE(p != end); + BOOST_REQUIRE(p->value() == 2); + ++p; + BOOST_REQUIRE(p != end); + BOOST_REQUIRE(p->value() == 3); + ++p; + BOOST_REQUIRE(p != end); + BOOST_REQUIRE(p->value() == 4); + ++p; + BOOST_REQUIRE(p != end); + BOOST_REQUIRE(p->value() == 5); + ++p; + BOOST_REQUIRE(p != end); + BOOST_REQUIRE(p->value() == 6); + ++p; + BOOST_REQUIRE(p != end); + BOOST_REQUIRE(p->value() == 7); + ++p; + BOOST_REQUIRE(p == end); +} + } // namespace } // namespace Syntax } // namespace MosesTraining From bf42fa058c424b642afd91a40257bff1c4c82241 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Mon, 1 Jun 2015 11:01:00 +0100 Subject: [PATCH 025/108] Add LeafIterator and ConstLeafIterator to MosesTraining::Syntax::Tree --- phrase-extract/syntax-common/tree-inl.h | 87 +++++++++++++++++++++++ phrase-extract/syntax-common/tree.h | 6 +- phrase-extract/syntax-common/tree_test.cc | 40 +++++++++++ 3 files changed, 130 insertions(+), 3 deletions(-) diff --git a/phrase-extract/syntax-common/tree-inl.h b/phrase-extract/syntax-common/tree-inl.h index 9101fc490..811bae2d2 100644 --- a/phrase-extract/syntax-common/tree-inl.h +++ b/phrase-extract/syntax-common/tree-inl.h @@ -118,5 +118,92 @@ bool Tree::PreOrderIter::operator!=(const PreOrderIter &rhs) { return node_ != rhs.node_; } +template +template +class Tree::LeafIter { + public: + LeafIter(); + LeafIter(V &); + + V &operator*() { return *node_; } + V *operator->() { return node_; } + + LeafIter &operator++(); + LeafIter operator++(int); + + bool operator==(const LeafIter &); + bool operator!=(const LeafIter &); + + private: + // Pointer to the current node. + V *node_; + + // Stack of indices defining the position of node_ within the child vectors + // of its ancestors. + std::stack index_stack_; +}; + +template +template +Tree::LeafIter::LeafIter() + : node_(0) { +} + +template +template +Tree::LeafIter::LeafIter(V &t) + : node_(&t) { + // Navigate to the first leaf. + while (!node_->IsLeaf()) { + index_stack_.push(0); + node_ = node_->children()[0]; + } +} + +template +template +Tree::LeafIter &Tree::LeafIter::operator++() { + // Try node's ancestors until either a node is found with a sibling to the + // right or we reach the root (in which case the traversal is complete). 
+ V *ancestor = node_->parent_; + while (ancestor) { + std::size_t index = index_stack_.top(); + index_stack_.pop(); + if (index+1 < ancestor->children_.size()) { + index_stack_.push(index+1); + node_ = ancestor->children()[index+1]; + // Navigate to the first leaf. + while (!node_->IsLeaf()) { + index_stack_.push(0); + node_ = node_->children()[0]; + } + return *this; + } + ancestor = ancestor->parent_; + } + node_ = 0; + return *this; +} + +template +template +Tree::LeafIter Tree::LeafIter::operator++(int) { + LeafIter tmp(*this); + ++*this; + return tmp; +} + +template +template +bool Tree::LeafIter::operator==(const LeafIter &rhs) { + return node_ == rhs.node_; +} + +template +template +bool Tree::LeafIter::operator!=(const LeafIter &rhs) { + return node_ != rhs.node_; +} + } // namespace Syntax } // namespace MosesTraining diff --git a/phrase-extract/syntax-common/tree.h b/phrase-extract/syntax-common/tree.h index e37c2c21f..8cec07a54 100644 --- a/phrase-extract/syntax-common/tree.h +++ b/phrase-extract/syntax-common/tree.h @@ -72,7 +72,7 @@ class Tree { // iterators. V is the value type: either Tree or const Tree. template class PreOrderIter; // template class PostOrderIter; TODO - // template class LeafIter; TODO + template class LeafIter; public: // Pre-order iterators. @@ -84,8 +84,8 @@ class Tree { // typedef PostOrderIter > ConstPostOrderIterator; TODO // Leaf iterators (left-to-right). - // typedef LeafIter > LeafIterator; TODO - // typedef LeafIter > ConstLeafIterator; TODO + typedef LeafIter > LeafIterator; + typedef LeafIter > ConstLeafIterator; private: T value_; diff --git a/phrase-extract/syntax-common/tree_test.cc b/phrase-extract/syntax-common/tree_test.cc index 198f52310..8e689f000 100644 --- a/phrase-extract/syntax-common/tree_test.cc +++ b/phrase-extract/syntax-common/tree_test.cc @@ -101,6 +101,46 @@ BOOST_AUTO_TEST_CASE(const_pre_order_1) { BOOST_REQUIRE(p == end); } +// Test Tree<>::LeafIterator with a trivial, single-node tree. 
+BOOST_AUTO_TEST_CASE(leaf_1) { + boost::scoped_ptr > root(new Tree(123)); + Tree::LeafIterator p(*root); + BOOST_REQUIRE(p != Tree::LeafIterator()); + BOOST_REQUIRE(p->value() == 123); + ++p; + BOOST_REQUIRE(p == Tree::LeafIterator()); +} + +// Test Tree<>::LeafIterator on this tree: (1 (2 3) (4) (5 6 (7 8))) +BOOST_AUTO_TEST_CASE(leaf_2) { + boost::scoped_ptr > root(new Tree(1)); + root->children().push_back(new Tree(2)); + root->children()[0]->children().push_back(new Tree(3)); + root->children().push_back(new Tree(4)); + root->children().push_back(new Tree(5)); + root->children()[2]->children().push_back(new Tree(6)); + root->children()[2]->children().push_back(new Tree(7)); + root->children()[2]->children()[1]->children().push_back(new Tree(8)); + root->SetParents(); + + Tree::LeafIterator p(*root); + Tree::LeafIterator end; + + BOOST_REQUIRE(p != end); + BOOST_REQUIRE(p->value() == 3); + ++p; + BOOST_REQUIRE(p != end); + BOOST_REQUIRE(p->value() == 4); + ++p; + BOOST_REQUIRE(p != end); + BOOST_REQUIRE(p->value() == 6); + ++p; + BOOST_REQUIRE(p != end); + BOOST_REQUIRE(p->value() == 8); + ++p; + BOOST_REQUIRE(p == end); +} + } // namespace } // namespace Syntax } // namespace MosesTraining From f61091e38dc597644c76b65f3c1e0ed6cbc641ab Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Mon, 1 Jun 2015 14:23:25 +0100 Subject: [PATCH 026/108] Ongoing moses/phrase-extract refactoring --- .../extract-ghkm/AlignmentGraph.cpp | 6 +- phrase-extract/extract-ghkm/AlignmentGraph.h | 2 +- phrase-extract/extract-ghkm/ExtractGHKM.cpp | 30 ++++----- phrase-extract/extract-ghkm/ExtractGHKM.h | 3 +- phrase-extract/extract-ghkm/Jamfile | 2 +- phrase-extract/extract-ghkm/ParseTree.cpp | 56 ---------------- phrase-extract/extract-ghkm/ParseTree.h | 67 ++----------------- phrase-extract/extract-ghkm/XmlTreeParser.cpp | 14 ++-- phrase-extract/extract-ghkm/XmlTreeParser.h | 3 +- 9 files changed, 32 insertions(+), 151 deletions(-) delete mode 100644 phrase-extract/extract-ghkm/ParseTree.cpp diff --git a/phrase-extract/extract-ghkm/AlignmentGraph.cpp b/phrase-extract/extract-ghkm/AlignmentGraph.cpp index 974188dbd..52a4b41db 100644 --- a/phrase-extract/extract-ghkm/AlignmentGraph.cpp +++ b/phrase-extract/extract-ghkm/AlignmentGraph.cpp @@ -212,13 +212,13 @@ Node *AlignmentGraph::CopyParseTree(const ParseTree *root) { NodeType nodeType = (root->IsLeaf()) ? 
TARGET : TREE; - std::auto_ptr n(new Node(root->GetLabel(), nodeType)); + std::auto_ptr n(new Node(root->value().GetLabel(), nodeType)); if (nodeType == TREE) { - n->SetPcfgScore(root->GetPcfgScore()); + n->SetPcfgScore(root->value().GetPcfgScore()); } - const std::vector &children = root->GetChildren(); + const std::vector &children = root->children(); std::vector childNodes; childNodes.reserve(children.size()); for (std::vector::const_iterator p(children.begin()); diff --git a/phrase-extract/extract-ghkm/AlignmentGraph.h b/phrase-extract/extract-ghkm/AlignmentGraph.h index cf26b8c27..7ae3784cd 100644 --- a/phrase-extract/extract-ghkm/AlignmentGraph.h +++ b/phrase-extract/extract-ghkm/AlignmentGraph.h @@ -23,6 +23,7 @@ #include "Alignment.h" #include "Options.h" +#include "ParseTree.h" #include #include @@ -34,7 +35,6 @@ namespace GHKM { class Node; -class ParseTree; class Subgraph; class AlignmentGraph diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp index 937d88030..7891bc2c7 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp +++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp @@ -828,24 +828,22 @@ void ExtractGHKM::CollectWordLabelCounts( std::map &wordCount, std::map &wordLabel) { - std::vector leaves; - root.GetLeaves(std::back_inserter(leaves)); - for (std::vector::const_iterator p = leaves.begin(); - p != leaves.end(); ++p) { - const ParseTree &leaf = **p; - const std::string &word = leaf.GetLabel(); - const ParseTree *ancestor = leaf.GetParent(); + for (ParseTree::ConstLeafIterator p(root); + p != ParseTree::ConstLeafIterator(); ++p) { + const ParseTree &leaf = *p; + const std::string &word = leaf.value().GetLabel(); + const ParseTree *ancestor = leaf.parent(); // If unary rule elimination is enabled and this word is at the end of a // chain of unary rewrites, e.g. // PN-SB -> NE -> word // then record the constituent label at the top of the chain instead of // the part-of-speech label. 
while (!options.allowUnary && - ancestor->GetParent() && - ancestor->GetParent()->GetChildren().size() == 1) { - ancestor = ancestor->GetParent(); + ancestor->parent() && + ancestor->parent()->children().size() == 1) { + ancestor = ancestor->parent(); } - const std::string &label = ancestor->GetLabel(); + const std::string &label = ancestor->value().GetLabel(); ++wordCount[word]; wordLabel[word] = label; } @@ -854,12 +852,10 @@ void ExtractGHKM::CollectWordLabelCounts( std::vector ExtractGHKM::ReadTokens(const ParseTree &root) const { std::vector tokens; - std::vector leaves; - root.GetLeaves(std::back_inserter(leaves)); - for (std::vector::const_iterator p = leaves.begin(); - p != leaves.end(); ++p) { - const ParseTree &leaf = **p; - const std::string &word = leaf.GetLabel(); + for (ParseTree::ConstLeafIterator p(root); + p != ParseTree::ConstLeafIterator(); ++p) { + const ParseTree &leaf = *p; + const std::string &word = leaf.value().GetLabel(); tokens.push_back(word); } return tokens; diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.h b/phrase-extract/extract-ghkm/ExtractGHKM.h index 68babdccf..5954e7425 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.h +++ b/phrase-extract/extract-ghkm/ExtractGHKM.h @@ -25,6 +25,8 @@ #include #include +#include "ParseTree.h" + namespace Moses { @@ -34,7 +36,6 @@ namespace GHKM { struct Options; -class ParseTree; class ExtractGHKM { diff --git a/phrase-extract/extract-ghkm/Jamfile b/phrase-extract/extract-ghkm/Jamfile index f2d1ac5a8..4692937de 100644 --- a/phrase-extract/extract-ghkm/Jamfile +++ b/phrase-extract/extract-ghkm/Jamfile @@ -1 +1 @@ -exe extract-ghkm : [ glob *.cpp ] ..//deps ../..//boost_iostreams ../..//boost_program_options ../..//z : .. ; +exe extract-ghkm : [ glob *.cpp ] ..//syntax-common ..//deps ../..//boost_iostreams ../..//boost_program_options ../..//z : .. ; diff --git a/phrase-extract/extract-ghkm/ParseTree.cpp b/phrase-extract/extract-ghkm/ParseTree.cpp deleted file mode 100644 index f86486487..000000000 --- a/phrase-extract/extract-ghkm/ParseTree.cpp +++ /dev/null @@ -1,56 +0,0 @@ -/*********************************************************************** - Moses - statistical machine translation system - Copyright (C) 2006-2011 University of Edinburgh - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ - -#include "ParseTree.h" - -namespace Moses -{ -namespace GHKM -{ - -ParseTree::~ParseTree() -{ - for (std::vector::iterator p(m_children.begin()); - p != m_children.end(); ++p) { - delete *p; - } -} - -void ParseTree::SetChildren(const std::vector &children) -{ - m_children = children; -} - -void ParseTree::SetParent(ParseTree *parent) -{ - m_parent = parent; -} - -void ParseTree::AddChild(ParseTree *child) -{ - m_children.push_back(child); -} - -bool ParseTree::IsLeaf() const -{ - return m_children.empty(); -} - -} // namespace GHKM -} // namespace Moses diff --git a/phrase-extract/extract-ghkm/ParseTree.h b/phrase-extract/extract-ghkm/ParseTree.h index 694286c9d..f0b83f63f 100644 --- a/phrase-extract/extract-ghkm/ParseTree.h +++ b/phrase-extract/extract-ghkm/ParseTree.h @@ -21,75 +21,16 @@ #ifndef EXTRACT_GHKM_PARSE_TREE_H_ #define EXTRACT_GHKM_PARSE_TREE_H_ -#include -#include +#include "syntax-common/tree.h" + +#include "SyntaxNode.h" namespace Moses { namespace GHKM { -class ParseTree -{ -public: - ParseTree(const std::string &label) - : m_label(label) - , m_parent(0) - , m_pcfgScore(0.0) {} - - ~ParseTree(); - - const std::string &GetLabel() const { - return m_label; - } - const std::vector &GetChildren() const { - return m_children; - } - const ParseTree *GetParent() const { - return m_parent; - } - float GetPcfgScore() const { - return m_pcfgScore; - } - - void SetParent(ParseTree *); - void SetChildren(const std::vector &); - void SetPcfgScore(float score) { - m_pcfgScore = score; - } - - void AddChild(ParseTree *); - - bool IsLeaf() const; - - template - void GetLeaves(OutputIterator) const; - -private: - // Disallow copying - ParseTree(const ParseTree &); - ParseTree &operator=(const ParseTree &); - - std::string m_label; - std::vector m_children; - ParseTree *m_parent; - float m_pcfgScore; // log probability -}; - -template -void ParseTree::GetLeaves(OutputIterator result) const -{ - if (IsLeaf()) { - *result++ = this; - } else { - std::vector::const_iterator p = m_children.begin(); - std::vector::const_iterator end = m_children.end(); - while (p != end) { - ParseTree &child = **p++; - child.GetLeaves(result); - } - } -} +typedef MosesTraining::Syntax::Tree ParseTree; } // namespace GHKM } // namespace Moses diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.cpp b/phrase-extract/extract-ghkm/XmlTreeParser.cpp index f9800c8e0..671b03a78 100644 --- a/phrase-extract/extract-ghkm/XmlTreeParser.cpp +++ b/phrase-extract/extract-ghkm/XmlTreeParser.cpp @@ -66,8 +66,7 @@ std::auto_ptr XmlTreeParser::ConvertTree( const SyntaxNode &tree, const std::vector &words) { - std::auto_ptr root(new ParseTree(tree.GetLabel())); - root->SetPcfgScore(tree.GetPcfgScore()); + std::auto_ptr root(new ParseTree(tree)); const std::vector &children = tree.GetChildren(); if (children.empty()) { if (tree.GetStart() != tree.GetEnd()) { @@ -76,16 +75,17 @@ std::auto_ptr XmlTreeParser::ConvertTree( << "-" << tree.GetEnd() << "): this is currently unsupported"; throw Exception(msg.str()); } - std::auto_ptr leaf(new ParseTree(words[tree.GetStart()])); - leaf->SetParent(root.get()); - root->AddChild(leaf.release()); + SyntaxNode value(tree.GetStart(), tree.GetStart(), words[tree.GetStart()]); + std::auto_ptr leaf(new 
ParseTree(value)); + leaf->parent() = root.get(); + root->children().push_back(leaf.release()); } else { for (std::vector::const_iterator p = children.begin(); p != children.end(); ++p) { assert(*p); std::auto_ptr child = ConvertTree(**p, words); - child->SetParent(root.get()); - root->AddChild(child.release()); + child->parent() = root.get(); + root->children().push_back(child.release()); } } return root; diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.h b/phrase-extract/extract-ghkm/XmlTreeParser.h index db9fa8bf2..a82862428 100644 --- a/phrase-extract/extract-ghkm/XmlTreeParser.h +++ b/phrase-extract/extract-ghkm/XmlTreeParser.h @@ -29,6 +29,7 @@ #include #include +#include "ParseTree.h" #include "SyntaxNode.h" #include "SyntaxNodeCollection.h" @@ -37,8 +38,6 @@ namespace Moses namespace GHKM { -class ParseTree; - // Parses a string in Moses' XML parse tree format and returns a ParseTree // object. class XmlTreeParser From f37415a259f19116d90c7bc82ecf16fd8bbbf23b Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Mon, 1 Jun 2015 16:40:35 +0100 Subject: [PATCH 027/108] Ongoing moses/phrase-extract refactoring --- moses/FF/PhraseOrientationFeature.cpp | 70 +++++++------- moses/FF/PhraseOrientationFeature.h | 6 +- phrase-extract/InternalStructFeature.h | 4 - phrase-extract/SyntaxTree.h | 12 +++ phrase-extract/extract-ghkm/Alignment.cpp | 4 +- phrase-extract/extract-ghkm/Alignment.h | 4 +- .../extract-ghkm/AlignmentGraph.cpp | 25 ++--- phrase-extract/extract-ghkm/AlignmentGraph.h | 17 ++-- phrase-extract/extract-ghkm/ComposedRule.cpp | 12 +-- phrase-extract/extract-ghkm/ComposedRule.h | 8 +- phrase-extract/extract-ghkm/Exception.h | 4 +- phrase-extract/extract-ghkm/ExtractGHKM.cpp | 95 ++++++++++--------- phrase-extract/extract-ghkm/ExtractGHKM.h | 19 ++-- phrase-extract/extract-ghkm/Main.cpp | 2 +- phrase-extract/extract-ghkm/Node.cpp | 4 +- phrase-extract/extract-ghkm/Node.h | 8 +- phrase-extract/extract-ghkm/Options.h | 4 +- phrase-extract/extract-ghkm/ParseTree.h | 38 -------- .../extract-ghkm/PhraseOrientation.cpp | 4 +- .../extract-ghkm/PhraseOrientation.h | 18 ++-- phrase-extract/extract-ghkm/Rule.cpp | 4 +- phrase-extract/extract-ghkm/Rule.h | 4 +- phrase-extract/extract-ghkm/ScfgRule.cpp | 14 +-- phrase-extract/extract-ghkm/ScfgRule.h | 11 +-- .../extract-ghkm/ScfgRuleWriter.cpp | 12 +-- phrase-extract/extract-ghkm/ScfgRuleWriter.h | 8 +- phrase-extract/extract-ghkm/Span.cpp | 4 +- phrase-extract/extract-ghkm/Span.h | 4 +- phrase-extract/extract-ghkm/StsgRule.cpp | 8 +- phrase-extract/extract-ghkm/StsgRule.h | 8 +- .../extract-ghkm/StsgRuleWriter.cpp | 12 +-- phrase-extract/extract-ghkm/StsgRuleWriter.h | 8 +- phrase-extract/extract-ghkm/Subgraph.cpp | 9 +- phrase-extract/extract-ghkm/Subgraph.h | 8 +- phrase-extract/extract-ghkm/XmlTreeParser.cpp | 29 +++--- phrase-extract/extract-ghkm/XmlTreeParser.h | 18 ++-- 36 files changed, 246 insertions(+), 273 deletions(-) create mode 100644 phrase-extract/SyntaxTree.h delete mode 100644 phrase-extract/extract-ghkm/ParseTree.h diff --git a/moses/FF/PhraseOrientationFeature.cpp b/moses/FF/PhraseOrientationFeature.cpp index 1c9a3f738..fea8dafad 100644 --- a/moses/FF/PhraseOrientationFeature.cpp +++ b/moses/FF/PhraseOrientationFeature.cpp @@ -134,7 +134,7 @@ void PhraseOrientationFeature::EvaluateInIsolation(const Phrase &source, if (targetPhrase.GetAlignNonTerm().GetSize() != 0) { // Initialize phrase orientation scoring object - Moses::GHKM::PhraseOrientation phraseOrientation(source.GetSize(), targetPhrase.GetSize(), + 
MosesTraining::GHKM::PhraseOrientation phraseOrientation(source.GetSize(), targetPhrase.GetSize(), targetPhrase.GetAlignTerm(), targetPhrase.GetAlignNonTerm()); PhraseOrientationFeature::ReoClassData* reoClassData = new PhraseOrientationFeature::ReoClassData(); @@ -150,7 +150,7 @@ void PhraseOrientationFeature::EvaluateInIsolation(const Phrase &source, // LEFT-TO-RIGHT DIRECTION - Moses::GHKM::PhraseOrientation::REO_CLASS l2rOrientation = phraseOrientation.GetOrientationInfo(sourceIndex,sourceIndex,Moses::GHKM::PhraseOrientation::REO_DIR_L2R); + MosesTraining::GHKM::PhraseOrientation::REO_CLASS l2rOrientation = phraseOrientation.GetOrientationInfo(sourceIndex,sourceIndex,MosesTraining::GHKM::PhraseOrientation::REO_DIR_L2R); if ( ((targetIndex == 0) || !phraseOrientation.TargetSpanIsAligned(0,targetIndex)) // boundary non-terminal in rule-initial position (left boundary) && (targetPhraseLHS != m_glueTargetLHS) ) { // and not glue rule @@ -170,7 +170,7 @@ void PhraseOrientationFeature::EvaluateInIsolation(const Phrase &source, if (reoClassData->firstNonTerminalPreviousSourceSpanIsAligned && reoClassData->firstNonTerminalFollowingSourceSpanIsAligned) { // discontinuous - l2rOrientation = Moses::GHKM::PhraseOrientation::REO_CLASS_DLEFT; + l2rOrientation = MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT; } else { reoClassData->firstNonTerminalIsBoundary = true; } @@ -180,7 +180,7 @@ void PhraseOrientationFeature::EvaluateInIsolation(const Phrase &source, // RIGHT-TO-LEFT DIRECTION - Moses::GHKM::PhraseOrientation::REO_CLASS r2lOrientation = phraseOrientation.GetOrientationInfo(sourceIndex,sourceIndex,Moses::GHKM::PhraseOrientation::REO_DIR_R2L); + MosesTraining::GHKM::PhraseOrientation::REO_CLASS r2lOrientation = phraseOrientation.GetOrientationInfo(sourceIndex,sourceIndex,MosesTraining::GHKM::PhraseOrientation::REO_DIR_R2L); if ( ((targetIndex == targetPhrase.GetSize()-1) || !phraseOrientation.TargetSpanIsAligned(targetIndex,targetPhrase.GetSize()-1)) // boundary non-terminal in rule-final position (right boundary) && (targetPhraseLHS != m_glueTargetLHS) ) { // and not glue rule @@ -200,7 +200,7 @@ void PhraseOrientationFeature::EvaluateInIsolation(const Phrase &source, if (reoClassData->lastNonTerminalPreviousSourceSpanIsAligned && reoClassData->lastNonTerminalFollowingSourceSpanIsAligned) { // discontinuous - r2lOrientation = Moses::GHKM::PhraseOrientation::REO_CLASS_DLEFT; + r2lOrientation = MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT; } else { reoClassData->lastNonTerminalIsBoundary = true; } @@ -335,25 +335,25 @@ FFState* PhraseOrientationFeature::EvaluateWhenApplied( // LEFT-TO-RIGHT DIRECTION - Moses::GHKM::PhraseOrientation::REO_CLASS l2rOrientation = reoClassData->nonTerminalReoClassL2R[nNT]; + MosesTraining::GHKM::PhraseOrientation::REO_CLASS l2rOrientation = reoClassData->nonTerminalReoClassL2R[nNT]; IFFEATUREVERBOSE(2) { FEATUREVERBOSE(2, "l2rOrientation "); switch (l2rOrientation) { - case Moses::GHKM::PhraseOrientation::REO_CLASS_LEFT: + case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_LEFT: FEATUREVERBOSE2(2, "mono" << std::endl); break; - case Moses::GHKM::PhraseOrientation::REO_CLASS_RIGHT: + case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_RIGHT: FEATUREVERBOSE2(2, "swap" << std::endl); break; - case Moses::GHKM::PhraseOrientation::REO_CLASS_DLEFT: + case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT: FEATUREVERBOSE2(2, "dleft" << std::endl); break; - case Moses::GHKM::PhraseOrientation::REO_CLASS_DRIGHT: + case 
MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DRIGHT: FEATUREVERBOSE2(2, "dright" << std::endl); break; - case Moses::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN: - // modelType == Moses::GHKM::PhraseOrientation::REO_MSLR + case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN: + // modelType == MosesTraining::GHKM::PhraseOrientation::REO_MSLR FEATUREVERBOSE2(2, "unknown->dleft" << std::endl); break; default: @@ -396,23 +396,23 @@ FFState* PhraseOrientationFeature::EvaluateWhenApplied( } else { - if ( l2rOrientation == Moses::GHKM::PhraseOrientation::REO_CLASS_LEFT ) { + if ( l2rOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_LEFT ) { newScores[0] += TransformScore(orientationPhraseProperty->GetLeftToRightProbabilityMono()); // if sub-derivation has left-boundary non-terminal: // add recursive actual score of boundary non-terminal from subderivation LeftBoundaryL2RScoreRecursive(featureID, prevState, 0x1, newScores, accumulator); - } else if ( l2rOrientation == Moses::GHKM::PhraseOrientation::REO_CLASS_RIGHT ) { + } else if ( l2rOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_RIGHT ) { newScores[1] += TransformScore(orientationPhraseProperty->GetLeftToRightProbabilitySwap()); // if sub-derivation has left-boundary non-terminal: // add recursive actual score of boundary non-terminal from subderivation LeftBoundaryL2RScoreRecursive(featureID, prevState, 0x2, newScores, accumulator); - } else if ( ( l2rOrientation == Moses::GHKM::PhraseOrientation::REO_CLASS_DLEFT ) || - ( l2rOrientation == Moses::GHKM::PhraseOrientation::REO_CLASS_DRIGHT ) || - ( l2rOrientation == Moses::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN ) ) { + } else if ( ( l2rOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT ) || + ( l2rOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DRIGHT ) || + ( l2rOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN ) ) { newScores[2] += TransformScore(orientationPhraseProperty->GetLeftToRightProbabilityDiscontinuous()); // if sub-derivation has left-boundary non-terminal: @@ -437,25 +437,25 @@ FFState* PhraseOrientationFeature::EvaluateWhenApplied( // RIGHT-TO-LEFT DIRECTION - Moses::GHKM::PhraseOrientation::REO_CLASS r2lOrientation = reoClassData->nonTerminalReoClassR2L[nNT]; + MosesTraining::GHKM::PhraseOrientation::REO_CLASS r2lOrientation = reoClassData->nonTerminalReoClassR2L[nNT]; IFFEATUREVERBOSE(2) { FEATUREVERBOSE(2, "r2lOrientation "); switch (r2lOrientation) { - case Moses::GHKM::PhraseOrientation::REO_CLASS_LEFT: + case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_LEFT: FEATUREVERBOSE2(2, "mono" << std::endl); break; - case Moses::GHKM::PhraseOrientation::REO_CLASS_RIGHT: + case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_RIGHT: FEATUREVERBOSE2(2, "swap" << std::endl); break; - case Moses::GHKM::PhraseOrientation::REO_CLASS_DLEFT: + case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT: FEATUREVERBOSE2(2, "dleft" << std::endl); break; - case Moses::GHKM::PhraseOrientation::REO_CLASS_DRIGHT: + case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DRIGHT: FEATUREVERBOSE2(2, "dright" << std::endl); break; - case Moses::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN: - // modelType == Moses::GHKM::PhraseOrientation::REO_MSLR + case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN: + // modelType == MosesTraining::GHKM::PhraseOrientation::REO_MSLR FEATUREVERBOSE2(2, "unknown->dleft" << std::endl); break; default: @@ -498,23 +498,23 @@ FFState* 
PhraseOrientationFeature::EvaluateWhenApplied( } else { - if ( r2lOrientation == Moses::GHKM::PhraseOrientation::REO_CLASS_LEFT ) { + if ( r2lOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_LEFT ) { newScores[m_offsetR2LScores+0] += TransformScore(orientationPhraseProperty->GetRightToLeftProbabilityMono()); // if sub-derivation has right-boundary non-terminal: // add recursive actual score of boundary non-terminal from subderivation RightBoundaryR2LScoreRecursive(featureID, prevState, 0x1, newScores, accumulator); - } else if ( r2lOrientation == Moses::GHKM::PhraseOrientation::REO_CLASS_RIGHT ) { + } else if ( r2lOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_RIGHT ) { newScores[m_offsetR2LScores+1] += TransformScore(orientationPhraseProperty->GetRightToLeftProbabilitySwap()); // if sub-derivation has right-boundary non-terminal: // add recursive actual score of boundary non-terminal from subderivation RightBoundaryR2LScoreRecursive(featureID, prevState, 0x2, newScores, accumulator); - } else if ( ( r2lOrientation == Moses::GHKM::PhraseOrientation::REO_CLASS_DLEFT ) || - ( r2lOrientation == Moses::GHKM::PhraseOrientation::REO_CLASS_DRIGHT ) || - ( r2lOrientation == Moses::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN ) ) { + } else if ( ( r2lOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT ) || + ( r2lOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DRIGHT ) || + ( r2lOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN ) ) { newScores[m_offsetR2LScores+2] += TransformScore(orientationPhraseProperty->GetRightToLeftProbabilityDiscontinuous()); // if sub-derivation has right-boundary non-terminal: @@ -862,17 +862,17 @@ void PhraseOrientationFeature::SparseNonTerminalR2LScore(const Factor* nonTermin } -const std::string* PhraseOrientationFeature::ToString(const Moses::GHKM::PhraseOrientation::REO_CLASS o) const +const std::string* PhraseOrientationFeature::ToString(const MosesTraining::GHKM::PhraseOrientation::REO_CLASS o) const { - if ( o == Moses::GHKM::PhraseOrientation::REO_CLASS_LEFT ) { + if ( o == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_LEFT ) { return &MORIENT; - } else if ( o == Moses::GHKM::PhraseOrientation::REO_CLASS_RIGHT ) { + } else if ( o == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_RIGHT ) { return &SORIENT; - } else if ( ( o == Moses::GHKM::PhraseOrientation::REO_CLASS_DLEFT ) || - ( o == Moses::GHKM::PhraseOrientation::REO_CLASS_DRIGHT ) || - ( o == Moses::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN ) ) { + } else if ( ( o == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT ) || + ( o == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DRIGHT ) || + ( o == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN ) ) { return &DORIENT; } else { diff --git a/moses/FF/PhraseOrientationFeature.h b/moses/FF/PhraseOrientationFeature.h index 4460a1ea7..7c429dd1c 100644 --- a/moses/FF/PhraseOrientationFeature.h +++ b/moses/FF/PhraseOrientationFeature.h @@ -302,8 +302,8 @@ public: struct ReoClassData { public: - std::vector nonTerminalReoClassL2R; - std::vector nonTerminalReoClassR2L; + std::vector nonTerminalReoClassL2R; + std::vector nonTerminalReoClassR2L; bool firstNonTerminalIsBoundary; bool firstNonTerminalPreviousSourceSpanIsAligned; bool firstNonTerminalFollowingSourceSpanIsAligned; @@ -401,7 +401,7 @@ protected: ScoreComponentCollection* scoreBreakdown, const std::string* o) const; - const std::string* ToString(const Moses::GHKM::PhraseOrientation::REO_CLASS 
o) const; + const std::string* ToString(const MosesTraining::GHKM::PhraseOrientation::REO_CLASS o) const; static const std::string MORIENT; static const std::string SORIENT; diff --git a/phrase-extract/InternalStructFeature.h b/phrase-extract/InternalStructFeature.h index 2ac3ecd9d..66d61c6f9 100644 --- a/phrase-extract/InternalStructFeature.h +++ b/phrase-extract/InternalStructFeature.h @@ -10,10 +10,6 @@ #include "ScoreFeature.h" #include "extract-ghkm/Node.h" -using namespace MosesTraining; -using namespace Moses; -using namespace GHKM; - namespace MosesTraining { diff --git a/phrase-extract/SyntaxTree.h b/phrase-extract/SyntaxTree.h new file mode 100644 index 000000000..c2132fda3 --- /dev/null +++ b/phrase-extract/SyntaxTree.h @@ -0,0 +1,12 @@ +#pragma once + +#include "syntax-common/tree.h" + +#include "SyntaxNode.h" + +namespace MosesTraining +{ + +typedef Syntax::Tree SyntaxTree; + +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/Alignment.cpp b/phrase-extract/extract-ghkm/Alignment.cpp index 7e084e495..6f946fe5a 100644 --- a/phrase-extract/extract-ghkm/Alignment.cpp +++ b/phrase-extract/extract-ghkm/Alignment.cpp @@ -25,7 +25,7 @@ #include #include -namespace Moses +namespace MosesTraining { namespace GHKM { @@ -70,4 +70,4 @@ void FlipAlignment(Alignment &a) } } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/Alignment.h b/phrase-extract/extract-ghkm/Alignment.h index e8381a602..154e1fc4f 100644 --- a/phrase-extract/extract-ghkm/Alignment.h +++ b/phrase-extract/extract-ghkm/Alignment.h @@ -23,7 +23,7 @@ #include #include -namespace Moses +namespace MosesTraining { namespace GHKM { @@ -35,5 +35,5 @@ void ReadAlignment(const std::string &, Alignment &); void FlipAlignment(Alignment &); } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/AlignmentGraph.cpp b/phrase-extract/extract-ghkm/AlignmentGraph.cpp index 52a4b41db..3fa65656c 100644 --- a/phrase-extract/extract-ghkm/AlignmentGraph.cpp +++ b/phrase-extract/extract-ghkm/AlignmentGraph.cpp @@ -19,23 +19,24 @@ #include "AlignmentGraph.h" -#include "ComposedRule.h" -#include "Node.h" -#include "Options.h" -#include "ParseTree.h" -#include "Subgraph.h" - #include #include #include #include -namespace Moses +#include "SyntaxTree.h" + +#include "ComposedRule.h" +#include "Node.h" +#include "Options.h" +#include "Subgraph.h" + +namespace MosesTraining { namespace GHKM { -AlignmentGraph::AlignmentGraph(const ParseTree *t, +AlignmentGraph::AlignmentGraph(const SyntaxTree *t, const std::vector &s, const Alignment &a) { @@ -208,7 +209,7 @@ void AlignmentGraph::ExtractComposedRules(Node *node, const Options &options) } } -Node *AlignmentGraph::CopyParseTree(const ParseTree *root) +Node *AlignmentGraph::CopyParseTree(const SyntaxTree *root) { NodeType nodeType = (root->IsLeaf()) ? 
TARGET : TREE; @@ -218,10 +219,10 @@ Node *AlignmentGraph::CopyParseTree(const ParseTree *root) n->SetPcfgScore(root->value().GetPcfgScore()); } - const std::vector &children = root->children(); + const std::vector &children = root->children(); std::vector childNodes; childNodes.reserve(children.size()); - for (std::vector::const_iterator p(children.begin()); + for (std::vector::const_iterator p(children.begin()); p != children.end(); ++p) { Node *child = CopyParseTree(*p); child->AddParent(n.get()); @@ -385,4 +386,4 @@ Node *AlignmentGraph::DetermineAttachmentPoint(int index) } } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/AlignmentGraph.h b/phrase-extract/extract-ghkm/AlignmentGraph.h index 7ae3784cd..032b946f0 100644 --- a/phrase-extract/extract-ghkm/AlignmentGraph.h +++ b/phrase-extract/extract-ghkm/AlignmentGraph.h @@ -21,15 +21,16 @@ #ifndef EXTRACT_GHKM_ALIGNMENT_GRAPH_H_ #define EXTRACT_GHKM_ALIGNMENT_GRAPH_H_ -#include "Alignment.h" -#include "Options.h" -#include "ParseTree.h" - #include #include #include -namespace Moses +#include "SyntaxTree.h" + +#include "Alignment.h" +#include "Options.h" + +namespace MosesTraining { namespace GHKM { @@ -40,7 +41,7 @@ class Subgraph; class AlignmentGraph { public: - AlignmentGraph(const ParseTree *, + AlignmentGraph(const SyntaxTree *, const std::vector &, const Alignment &); @@ -61,7 +62,7 @@ private: AlignmentGraph(const AlignmentGraph &); AlignmentGraph &operator=(const AlignmentGraph &); - Node *CopyParseTree(const ParseTree *); + Node *CopyParseTree(const SyntaxTree *); void ComputeFrontierSet(Node *, const Options &, std::set &) const; void CalcComplementSpans(Node *); void GetTargetTreeLeaves(Node *, std::vector &); @@ -77,6 +78,6 @@ private: }; } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining #endif diff --git a/phrase-extract/extract-ghkm/ComposedRule.cpp b/phrase-extract/extract-ghkm/ComposedRule.cpp index e9fc826b7..d322a255f 100644 --- a/phrase-extract/extract-ghkm/ComposedRule.cpp +++ b/phrase-extract/extract-ghkm/ComposedRule.cpp @@ -19,15 +19,15 @@ #include "ComposedRule.h" -#include "Node.h" -#include "Options.h" -#include "Subgraph.h" - #include #include #include -namespace Moses +#include "Node.h" +#include "Options.h" +#include "Subgraph.h" + +namespace MosesTraining { namespace GHKM { @@ -128,4 +128,4 @@ Subgraph ComposedRule::CreateSubgraph() } } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/ComposedRule.h b/phrase-extract/extract-ghkm/ComposedRule.h index b5f72a492..d456fd27c 100644 --- a/phrase-extract/extract-ghkm/ComposedRule.h +++ b/phrase-extract/extract-ghkm/ComposedRule.h @@ -21,12 +21,12 @@ #ifndef EXTRACT_GHKM_COMPOSED_RULE_H_ #define EXTRACT_GHKM_COMPOSED_RULE_H_ -#include "Subgraph.h" - #include #include -namespace Moses +#include "Subgraph.h" + +namespace MosesTraining { namespace GHKM { @@ -67,6 +67,6 @@ private: }; } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining #endif diff --git a/phrase-extract/extract-ghkm/Exception.h b/phrase-extract/extract-ghkm/Exception.h index a1e623cd1..99e1067f4 100644 --- a/phrase-extract/extract-ghkm/Exception.h +++ b/phrase-extract/extract-ghkm/Exception.h @@ -23,7 +23,7 @@ #include -namespace Moses +namespace MosesTraining { namespace GHKM { @@ -41,6 +41,6 @@ private: }; } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining #endif diff --git 
a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp index 7891bc2c7..0c7dadd4d 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp +++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp @@ -19,29 +19,6 @@ #include "ExtractGHKM.h" -#include "Alignment.h" -#include "AlignmentGraph.h" -#include "Exception.h" -#include "InputFileStream.h" -#include "Node.h" -#include "OutputFileStream.h" -#include "Options.h" -#include "ParseTree.h" -#include "PhraseOrientation.h" -#include "ScfgRule.h" -#include "ScfgRuleWriter.h" -#include "Span.h" -#include "StsgRule.h" -#include "StsgRuleWriter.h" -#include "SyntaxNode.h" -#include "SyntaxNodeCollection.h" -#include "tables-core.h" -#include "XmlException.h" -#include "XmlTree.h" -#include "XmlTreeParser.h" - -#include - #include #include #include @@ -51,13 +28,40 @@ #include #include -namespace Moses +#include + +#include "InputFileStream.h" +#include "OutputFileStream.h" +#include "SyntaxNode.h" +#include "SyntaxNodeCollection.h" +#include "SyntaxTree.h" +#include "tables-core.h" +#include "XmlException.h" +#include "XmlTree.h" + +#include "Alignment.h" +#include "AlignmentGraph.h" +#include "Exception.h" +#include "Node.h" +#include "Options.h" +#include "PhraseOrientation.h" +#include "ScfgRule.h" +#include "ScfgRuleWriter.h" +#include "Span.h" +#include "StsgRule.h" +#include "StsgRuleWriter.h" +#include "XmlTreeParser.h" + +namespace MosesTraining { namespace GHKM { int ExtractGHKM::Main(int argc, char *argv[]) { + using Moses::InputFileStream; + using Moses::OutputFileStream; + // Process command-line options. Options options; ProcessOptions(argc, argv, options); @@ -158,7 +162,7 @@ int ExtractGHKM::Main(int argc, char *argv[]) std::cerr << "skipping line " << lineNum << " with empty target tree\n"; continue; } - std::auto_ptr targetParseTree; + std::auto_ptr targetParseTree; try { targetParseTree = targetXmlTreeParser.Parse(targetLine); assert(targetParseTree.get()); @@ -173,8 +177,8 @@ int ExtractGHKM::Main(int argc, char *argv[]) // Parse source tree and construct a SyntaxTree object. - MosesTraining::SyntaxNodeCollection sourceSyntaxTree; - MosesTraining::SyntaxNode *sourceSyntaxTreeRoot=NULL; + SyntaxNodeCollection sourceSyntaxTree; + SyntaxNode *sourceSyntaxTreeRoot=NULL; if (options.sourceLabels) { try { @@ -197,8 +201,9 @@ int ExtractGHKM::Main(int argc, char *argv[]) // Read source tokens. std::vector sourceTokens(ReadTokens(sourceLine)); - // Construct a source ParseTree object from the SyntaxNodeCollection object. - std::auto_ptr sourceParseTree; + // Construct a source SyntaxTree object from the SyntaxNodeCollection + // object. 
+ std::auto_ptr sourceParseTree; if (options.sourceLabels) { try { @@ -264,12 +269,12 @@ int ExtractGHKM::Main(int argc, char *argv[]) const std::vector &rules = (*p)->GetRules(); - Moses::GHKM::PhraseOrientation::REO_CLASS l2rOrientation=Moses::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN, r2lOrientation=Moses::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN; + PhraseOrientation::REO_CLASS l2rOrientation=PhraseOrientation::REO_CLASS_UNKNOWN, r2lOrientation=PhraseOrientation::REO_CLASS_UNKNOWN; if (options.phraseOrientation && !rules.empty()) { int sourceSpanBegin = *((*p)->GetSpan().begin()); int sourceSpanEnd = *((*p)->GetSpan().rbegin()); - l2rOrientation = phraseOrientation.GetOrientationInfo(sourceSpanBegin,sourceSpanEnd,Moses::GHKM::PhraseOrientation::REO_DIR_L2R); - r2lOrientation = phraseOrientation.GetOrientationInfo(sourceSpanBegin,sourceSpanEnd,Moses::GHKM::PhraseOrientation::REO_DIR_R2L); + l2rOrientation = phraseOrientation.GetOrientationInfo(sourceSpanBegin,sourceSpanEnd,PhraseOrientation::REO_DIR_L2R); + r2lOrientation = phraseOrientation.GetOrientationInfo(sourceSpanBegin,sourceSpanEnd,PhraseOrientation::REO_DIR_R2L); // std::cerr << "span " << sourceSpanBegin << " " << sourceSpanEnd << std::endl; // std::cerr << "phraseOrientation " << phraseOrientation.GetOrientationInfo(sourceSpanBegin,sourceSpanEnd) << std::endl; } @@ -310,8 +315,8 @@ int ExtractGHKM::Main(int argc, char *argv[]) fwdExtractStream << " "; phraseOrientation.WriteOrientation(fwdExtractStream,r2lOrientation); fwdExtractStream << "}}"; - phraseOrientation.IncrementPriorCount(Moses::GHKM::PhraseOrientation::REO_DIR_L2R,l2rOrientation,1); - phraseOrientation.IncrementPriorCount(Moses::GHKM::PhraseOrientation::REO_DIR_R2L,r2lOrientation,1); + phraseOrientation.IncrementPriorCount(PhraseOrientation::REO_DIR_L2R,l2rOrientation,1); + phraseOrientation.IncrementPriorCount(PhraseOrientation::REO_DIR_R2L,r2lOrientation,1); } fwdExtractStream << std::endl; invExtractStream << std::endl; @@ -400,7 +405,7 @@ void ExtractGHKM::OpenOutputFileOrDie(const std::string &filename, } void ExtractGHKM::OpenOutputFileOrDie(const std::string &filename, - OutputFileStream &stream) + Moses::OutputFileStream &stream) { bool ret = stream.Open(filename); if (!ret) { @@ -823,16 +828,16 @@ void ExtractGHKM::WriteSourceLabelSet( } void ExtractGHKM::CollectWordLabelCounts( - ParseTree &root, + SyntaxTree &root, const Options &options, std::map &wordCount, std::map &wordLabel) { - for (ParseTree::ConstLeafIterator p(root); - p != ParseTree::ConstLeafIterator(); ++p) { - const ParseTree &leaf = *p; + for (SyntaxTree::ConstLeafIterator p(root); + p != SyntaxTree::ConstLeafIterator(); ++p) { + const SyntaxTree &leaf = *p; const std::string &word = leaf.value().GetLabel(); - const ParseTree *ancestor = leaf.parent(); + const SyntaxTree *ancestor = leaf.parent(); // If unary rule elimination is enabled and this word is at the end of a // chain of unary rewrites, e.g. 
// PN-SB -> NE -> word @@ -849,12 +854,12 @@ void ExtractGHKM::CollectWordLabelCounts( } } -std::vector ExtractGHKM::ReadTokens(const ParseTree &root) const +std::vector ExtractGHKM::ReadTokens(const SyntaxTree &root) const { std::vector tokens; - for (ParseTree::ConstLeafIterator p(root); - p != ParseTree::ConstLeafIterator(); ++p) { - const ParseTree &leaf = *p; + for (SyntaxTree::ConstLeafIterator p(root); + p != SyntaxTree::ConstLeafIterator(); ++p) { + const SyntaxTree &leaf = *p; const std::string &word = leaf.value().GetLabel(); tokens.push_back(word); } @@ -956,4 +961,4 @@ void ExtractGHKM::StripBitParLabels( } } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.h b/phrase-extract/extract-ghkm/ExtractGHKM.h index 5954e7425..66c4c55f8 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.h +++ b/phrase-extract/extract-ghkm/ExtractGHKM.h @@ -25,13 +25,11 @@ #include #include -#include "ParseTree.h" +#include "OutputFileStream.h" +#include "SyntaxTree.h" -namespace Moses +namespace MosesTraining { - -class OutputFileStream; - namespace GHKM { @@ -52,9 +50,9 @@ private: void Error(const std::string &) const; void OpenInputFileOrDie(const std::string &, std::ifstream &); void OpenOutputFileOrDie(const std::string &, std::ofstream &); - void OpenOutputFileOrDie(const std::string &, OutputFileStream &); - void RecordTreeLabels(const ParseTree &, std::set &); - void CollectWordLabelCounts(ParseTree &, + void OpenOutputFileOrDie(const std::string &, Moses::OutputFileStream &); + void RecordTreeLabels(const SyntaxTree &, std::set &); + void CollectWordLabelCounts(SyntaxTree &, const Options &, std::map &, std::map &); @@ -78,7 +76,7 @@ private: std::map &outTopLabelSet) const; std::vector ReadTokens(const std::string &) const; - std::vector ReadTokens(const ParseTree &root) const; + std::vector ReadTokens(const SyntaxTree &root) const; void ProcessOptions(int, char *[], Options &) const; @@ -86,5 +84,4 @@ private: }; } // namespace GHKM -} // namespace Moses - +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/Main.cpp b/phrase-extract/extract-ghkm/Main.cpp index 14064406b..64b3e0f00 100644 --- a/phrase-extract/extract-ghkm/Main.cpp +++ b/phrase-extract/extract-ghkm/Main.cpp @@ -21,6 +21,6 @@ int main(int argc, char *argv[]) { - Moses::GHKM::ExtractGHKM tool; + MosesTraining::GHKM::ExtractGHKM tool; return tool.Main(argc, argv); } diff --git a/phrase-extract/extract-ghkm/Node.cpp b/phrase-extract/extract-ghkm/Node.cpp index e14d8c050..384db3306 100644 --- a/phrase-extract/extract-ghkm/Node.cpp +++ b/phrase-extract/extract-ghkm/Node.cpp @@ -21,7 +21,7 @@ #include "Subgraph.h" -namespace Moses +namespace MosesTraining { namespace GHKM { @@ -70,4 +70,4 @@ void Node::GetTargetWords(std::vector &targetWords) const } } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/Node.h b/phrase-extract/extract-ghkm/Node.h index 2eed01311..71a24b28e 100644 --- a/phrase-extract/extract-ghkm/Node.h +++ b/phrase-extract/extract-ghkm/Node.h @@ -21,14 +21,14 @@ #ifndef EXTRACT_GHKM_NODE_H_ #define EXTRACT_GHKM_NODE_H_ -#include "Span.h" - #include #include #include #include -namespace Moses +#include "Span.h" + +namespace MosesTraining { namespace GHKM { @@ -215,6 +215,6 @@ Node *Node::LowestCommonAncestor(InputIterator first, InputIterator last) } } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining #endif diff --git 
a/phrase-extract/extract-ghkm/Options.h b/phrase-extract/extract-ghkm/Options.h index 00d59f9c7..f694fb55c 100644 --- a/phrase-extract/extract-ghkm/Options.h +++ b/phrase-extract/extract-ghkm/Options.h @@ -21,7 +21,7 @@ #include -namespace Moses +namespace MosesTraining { namespace GHKM { @@ -89,5 +89,5 @@ public: }; } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/ParseTree.h b/phrase-extract/extract-ghkm/ParseTree.h deleted file mode 100644 index f0b83f63f..000000000 --- a/phrase-extract/extract-ghkm/ParseTree.h +++ /dev/null @@ -1,38 +0,0 @@ -/*********************************************************************** - Moses - statistical machine translation system - Copyright (C) 2006-2011 University of Edinburgh - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ - -#pragma once -#ifndef EXTRACT_GHKM_PARSE_TREE_H_ -#define EXTRACT_GHKM_PARSE_TREE_H_ - -#include "syntax-common/tree.h" - -#include "SyntaxNode.h" - -namespace Moses -{ -namespace GHKM -{ - -typedef MosesTraining::Syntax::Tree ParseTree; - -} // namespace GHKM -} // namespace Moses - -#endif diff --git a/phrase-extract/extract-ghkm/PhraseOrientation.cpp b/phrase-extract/extract-ghkm/PhraseOrientation.cpp index 8f1356cb3..57952d580 100644 --- a/phrase-extract/extract-ghkm/PhraseOrientation.cpp +++ b/phrase-extract/extract-ghkm/PhraseOrientation.cpp @@ -26,7 +26,7 @@ #include -namespace Moses +namespace MosesTraining { namespace GHKM { @@ -469,5 +469,5 @@ void PhraseOrientation::WritePriorCounts(std::ostream& out, const REO_MODEL_TYPE } } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/PhraseOrientation.h b/phrase-extract/extract-ghkm/PhraseOrientation.h index d826c127c..572124e61 100644 --- a/phrase-extract/extract-ghkm/PhraseOrientation.h +++ b/phrase-extract/extract-ghkm/PhraseOrientation.h @@ -1,4 +1,3 @@ - /*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh @@ -20,16 +19,18 @@ #pragma once -#include "Alignment.h" -#include "moses/AlignmentInfo.h" - #include #include #include #include + #include -namespace Moses +#include "moses/AlignmentInfo.h" + +#include "Alignment.h" + +namespace MosesTraining { namespace GHKM { @@ -53,8 +54,8 @@ public: PhraseOrientation(int sourceSize, int targetSize, - const AlignmentInfo &alignTerm, - const AlignmentInfo &alignNonTerm); + const Moses::AlignmentInfo &alignTerm, + const Moses::AlignmentInfo &alignNonTerm); REO_CLASS GetOrientationInfo(int startF, int endF, REO_DIR direction) const; REO_CLASS GetOrientationInfo(int startF, int startE, int endF, int endE, REO_DIR direction) const; @@ -119,5 +120,4 @@ private: }; } 
// namespace GHKM -} // namespace Moses - +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/Rule.cpp b/phrase-extract/extract-ghkm/Rule.cpp index da6b2ff23..1b7207c3c 100644 --- a/phrase-extract/extract-ghkm/Rule.cpp +++ b/phrase-extract/extract-ghkm/Rule.cpp @@ -3,7 +3,7 @@ #include "Node.h" #include "Subgraph.h" -namespace Moses +namespace MosesTraining { namespace GHKM { @@ -38,4 +38,4 @@ bool Rule::PartitionOrderComp(const Node *a, const Node *b) } } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/Rule.h b/phrase-extract/extract-ghkm/Rule.h index 36e24c799..b87934735 100644 --- a/phrase-extract/extract-ghkm/Rule.h +++ b/phrase-extract/extract-ghkm/Rule.h @@ -7,7 +7,7 @@ #include "Alignment.h" -namespace Moses +namespace MosesTraining { namespace GHKM { @@ -54,6 +54,6 @@ protected: }; } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining #endif diff --git a/phrase-extract/extract-ghkm/ScfgRule.cpp b/phrase-extract/extract-ghkm/ScfgRule.cpp index 918c88eeb..fc960b598 100644 --- a/phrase-extract/extract-ghkm/ScfgRule.cpp +++ b/phrase-extract/extract-ghkm/ScfgRule.cpp @@ -26,13 +26,13 @@ #include "SyntaxNode.h" #include "SyntaxNodeCollection.h" -namespace Moses +namespace MosesTraining { namespace GHKM { ScfgRule::ScfgRule(const Subgraph &fragment, - const MosesTraining::SyntaxNodeCollection *sourceSyntaxTree) + const SyntaxNodeCollection *sourceSyntaxTree) : m_graphFragment(fragment) , m_sourceLHS("X", NonTerminal) , m_targetLHS(fragment.GetRoot()->GetLabel(), NonTerminal) @@ -134,13 +134,13 @@ ScfgRule::ScfgRule(const Subgraph &fragment, } } -void ScfgRule::PushSourceLabel( - const MosesTraining::SyntaxNodeCollection *sourceSyntaxTree, - const Node *node, const std::string &nonMatchingLabel) +void ScfgRule::PushSourceLabel(const SyntaxNodeCollection *sourceSyntaxTree, + const Node *node, + const std::string &nonMatchingLabel) { ContiguousSpan span = Closure(node->GetSpan()); if (sourceSyntaxTree->HasNode(span.first,span.second)) { // does a source constituent match the span? 
- std::vector sourceLabels = + std::vector sourceLabels = sourceSyntaxTree->GetNodes(span.first,span.second); if (!sourceLabels.empty()) { // store the topmost matching label from the source syntax tree @@ -197,4 +197,4 @@ void ScfgRule::UpdateSourceLabelCoocCounts(std::map< std::string, std::map #include #include @@ -30,7 +26,11 @@ #include #include -namespace Moses +#include "Alignment.h" +#include "Options.h" +#include "ScfgRule.h" + +namespace MosesTraining { namespace GHKM { @@ -229,4 +229,4 @@ void ScfgRuleWriter::WriteSymbol(const Symbol &symbol, std::ostream &out) } } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/ScfgRuleWriter.h b/phrase-extract/extract-ghkm/ScfgRuleWriter.h index 41ef9a6d2..31358c57d 100644 --- a/phrase-extract/extract-ghkm/ScfgRuleWriter.h +++ b/phrase-extract/extract-ghkm/ScfgRuleWriter.h @@ -19,11 +19,11 @@ #pragma once -#include "Subgraph.h" - #include -namespace Moses +#include "Subgraph.h" + +namespace MosesTraining { namespace GHKM { @@ -57,5 +57,5 @@ private: }; } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/Span.cpp b/phrase-extract/extract-ghkm/Span.cpp index d637ec3d2..f6636cebb 100644 --- a/phrase-extract/extract-ghkm/Span.cpp +++ b/phrase-extract/extract-ghkm/Span.cpp @@ -19,7 +19,7 @@ #include "Span.h" -namespace Moses +namespace MosesTraining { namespace GHKM { @@ -45,4 +45,4 @@ ContiguousSpan Closure(const Span &s) } } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/Span.h b/phrase-extract/extract-ghkm/Span.h index c4d146c4e..90bed416a 100644 --- a/phrase-extract/extract-ghkm/Span.h +++ b/phrase-extract/extract-ghkm/Span.h @@ -24,7 +24,7 @@ #include #include -namespace Moses +namespace MosesTraining { namespace GHKM { @@ -36,7 +36,7 @@ bool SpansIntersect(const Span &, const ContiguousSpan &); ContiguousSpan Closure(const Span &); -} // namespace Moses +} // namespace MosesTraining } // namespace GHKM #endif diff --git a/phrase-extract/extract-ghkm/StsgRule.cpp b/phrase-extract/extract-ghkm/StsgRule.cpp index 271249e1b..10368e4c0 100644 --- a/phrase-extract/extract-ghkm/StsgRule.cpp +++ b/phrase-extract/extract-ghkm/StsgRule.cpp @@ -1,11 +1,11 @@ #include "StsgRule.h" +#include + #include "Node.h" #include "Subgraph.h" -#include - -namespace Moses +namespace MosesTraining { namespace GHKM { @@ -91,4 +91,4 @@ StsgRule::StsgRule(const Subgraph &fragment) } } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/StsgRule.h b/phrase-extract/extract-ghkm/StsgRule.h index b14695c5c..a037a8d91 100644 --- a/phrase-extract/extract-ghkm/StsgRule.h +++ b/phrase-extract/extract-ghkm/StsgRule.h @@ -2,12 +2,12 @@ #ifndef EXTRACT_GHKM_STSG_RULE_H_ #define EXTRACT_GHKM_STSG_RULE_H_ +#include + #include "Rule.h" #include "Subgraph.h" -#include - -namespace Moses +namespace MosesTraining { namespace GHKM { @@ -39,6 +39,6 @@ private: }; } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining #endif diff --git a/phrase-extract/extract-ghkm/StsgRuleWriter.cpp b/phrase-extract/extract-ghkm/StsgRuleWriter.cpp index a9596b65c..32953bf68 100644 --- a/phrase-extract/extract-ghkm/StsgRuleWriter.cpp +++ b/phrase-extract/extract-ghkm/StsgRuleWriter.cpp @@ -1,9 +1,5 @@ #include "StsgRuleWriter.h" -#include "Alignment.h" -#include "Options.h" -#include "StsgRule.h" - #include #include #include @@ -11,7 +7,11 @@ 
#include #include -namespace Moses +#include "Alignment.h" +#include "Options.h" +#include "StsgRule.h" + +namespace MosesTraining { namespace GHKM { @@ -92,4 +92,4 @@ void StsgRuleWriter::Write(const StsgRule &rule) } } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/StsgRuleWriter.h b/phrase-extract/extract-ghkm/StsgRuleWriter.h index efba44d2c..3f215a5c9 100644 --- a/phrase-extract/extract-ghkm/StsgRuleWriter.h +++ b/phrase-extract/extract-ghkm/StsgRuleWriter.h @@ -2,11 +2,11 @@ #ifndef EXTRACT_GHKM_STSG_RULE_WRITER_H_ #define EXTRACT_GHKM_STSG_RULE_WRITER_H_ -#include "Subgraph.h" - #include -namespace Moses +#include "Subgraph.h" + +namespace MosesTraining { namespace GHKM { @@ -36,6 +36,6 @@ private: }; } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining #endif diff --git a/phrase-extract/extract-ghkm/Subgraph.cpp b/phrase-extract/extract-ghkm/Subgraph.cpp index 0d673edca..f04c6982c 100644 --- a/phrase-extract/extract-ghkm/Subgraph.cpp +++ b/phrase-extract/extract-ghkm/Subgraph.cpp @@ -18,10 +18,11 @@ ***********************************************************************/ #include -#include "Subgraph.h" -#include "Node.h" -namespace Moses +#include "Node.h" +#include "Subgraph.h" + +namespace MosesTraining { namespace GHKM { @@ -193,5 +194,5 @@ void Subgraph::RecursivelyGetPartsOfSpeech(const Node *n, std::vector #include -namespace Moses +#include "Node.h" + +namespace MosesTraining { namespace GHKM { @@ -137,5 +137,5 @@ private: }; } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.cpp b/phrase-extract/extract-ghkm/XmlTreeParser.cpp index 671b03a78..bbf20c765 100644 --- a/phrase-extract/extract-ghkm/XmlTreeParser.cpp +++ b/phrase-extract/extract-ghkm/XmlTreeParser.cpp @@ -19,18 +19,17 @@ #include "XmlTreeParser.h" -#include "ParseTree.h" -#include "tables-core.h" -#include "XmlException.h" -#include "XmlTree.h" -#include "util/tokenize.hh" - #include #include -using namespace MosesTraining; +#include "util/tokenize.hh" -namespace Moses +#include "SyntaxTree.h" +#include "tables-core.h" +#include "XmlException.h" +#include "XmlTree.h" + +namespace MosesTraining { namespace GHKM { @@ -42,7 +41,7 @@ XmlTreeParser::XmlTreeParser(std::set &labelSet, { } -std::auto_ptr XmlTreeParser::Parse(const std::string &line) +std::auto_ptr XmlTreeParser::Parse(const std::string &line) { m_line = line; m_tree.Clear(); @@ -61,12 +60,12 @@ std::auto_ptr XmlTreeParser::Parse(const std::string &line) return ConvertTree(*root, m_words); } -// Converts a SyntaxNode tree to a Moses::GHKM::ParseTree. -std::auto_ptr XmlTreeParser::ConvertTree( +// Converts a SyntaxNode tree to a MosesTraining::GHKM::SyntaxTree. 
+std::auto_ptr XmlTreeParser::ConvertTree( const SyntaxNode &tree, const std::vector &words) { - std::auto_ptr root(new ParseTree(tree)); + std::auto_ptr root(new SyntaxTree(tree)); const std::vector &children = tree.GetChildren(); if (children.empty()) { if (tree.GetStart() != tree.GetEnd()) { @@ -76,14 +75,14 @@ std::auto_ptr XmlTreeParser::ConvertTree( throw Exception(msg.str()); } SyntaxNode value(tree.GetStart(), tree.GetStart(), words[tree.GetStart()]); - std::auto_ptr leaf(new ParseTree(value)); + std::auto_ptr leaf(new SyntaxTree(value)); leaf->parent() = root.get(); root->children().push_back(leaf.release()); } else { for (std::vector::const_iterator p = children.begin(); p != children.end(); ++p) { assert(*p); - std::auto_ptr child = ConvertTree(**p, words); + std::auto_ptr child = ConvertTree(**p, words); child->parent() = root.get(); root->children().push_back(child.release()); } @@ -92,4 +91,4 @@ std::auto_ptr XmlTreeParser::ConvertTree( } } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.h b/phrase-extract/extract-ghkm/XmlTreeParser.h index a82862428..4e89e7167 100644 --- a/phrase-extract/extract-ghkm/XmlTreeParser.h +++ b/phrase-extract/extract-ghkm/XmlTreeParser.h @@ -21,32 +21,32 @@ #ifndef EXTRACT_GHKM_XML_TREE_PARSER_H_ #define EXTRACT_GHKM_XML_TREE_PARSER_H_ -#include "Exception.h" - #include #include #include #include #include -#include "ParseTree.h" #include "SyntaxNode.h" #include "SyntaxNodeCollection.h" +#include "SyntaxTree.h" -namespace Moses +#include "Exception.h" + +namespace MosesTraining { namespace GHKM { -// Parses a string in Moses' XML parse tree format and returns a ParseTree +// Parses a string in Moses' XML parse tree format and returns a SyntaxTree // object. class XmlTreeParser { public: XmlTreeParser(std::set &, std::map &); - std::auto_ptr Parse(const std::string &); + std::auto_ptr Parse(const std::string &); - static std::auto_ptr ConvertTree(const MosesTraining::SyntaxNode &, + static std::auto_ptr ConvertTree(const SyntaxNode &, const std::vector &); const std::vector& GetWords() { @@ -58,11 +58,11 @@ private: std::set &m_labelSet; std::map &m_topLabelSet; std::string m_line; - MosesTraining::SyntaxNodeCollection m_tree; + SyntaxNodeCollection m_tree; std::vector m_words; }; } // namespace GHKM -} // namespace Moses +} // namespace MosesTraining #endif From 8a9505d72fcf61a32e16397e46e04acc3561027b Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Mon, 1 Jun 2015 16:54:12 +0100 Subject: [PATCH 028/108] Ongoing moses/phrase-extract refactoring --- phrase-extract/SyntaxNodeCollection.cpp | 61 +++++++++++++++++++ phrase-extract/SyntaxNodeCollection.h | 6 ++ phrase-extract/extract-ghkm/XmlTreeParser.cpp | 35 +++++++++-- phrase-extract/extract-ghkm/XmlTreeParser.h | 3 +- 4 files changed, 100 insertions(+), 5 deletions(-) diff --git a/phrase-extract/SyntaxNodeCollection.cpp b/phrase-extract/SyntaxNodeCollection.cpp index 099a5697f..f67bee587 100644 --- a/phrase-extract/SyntaxNodeCollection.cpp +++ b/phrase-extract/SyntaxNodeCollection.cpp @@ -23,6 +23,8 @@ #include #include +#include + namespace MosesTraining { @@ -152,4 +154,63 @@ void SyntaxNodeCollection::ConnectNodes() } } +//boost::shared_ptr SyntaxNodeCollection::ExtractTree() +std::auto_ptr SyntaxNodeCollection::ExtractTree() +{ + std::map nodeToTree; + + // Create a SyntaxTree object for each SyntaxNode. 
+ for (std::vector::const_iterator p = m_nodes.begin(); + p != m_nodes.end(); ++p) { + nodeToTree[*p] = new SyntaxTree(**p); + } + + // Connect the SyntaxTrees. + typedef SyntaxTreeIndex2::const_reverse_iterator InnerIterator; + + SyntaxTree *root = 0; + SyntaxNode *prevNode = 0; + SyntaxTree *prevTree = 0; + // Iterate over all start indices from lowest to highest. + for (SyntaxTreeIndexIterator p = m_index.begin(); p != m_index.end(); ++p) { + const SyntaxTreeIndex2 &inner = p->second; + // Iterate over all end indices from highest to lowest. + for (InnerIterator q = inner.rbegin(); q != inner.rend(); ++q) { + const std::vector &nodes = q->second; + // Iterate over all nodes that cover the same span in order of tree + // depth, top-most first. + for (std::vector::const_reverse_iterator r = nodes.rbegin(); + r != nodes.rend(); ++r) { + SyntaxNode *node = *r; + SyntaxTree *tree = nodeToTree[node]; + if (!prevNode) { + // node is the root. + root = tree; + tree->parent() = 0; + } else if (prevNode->GetStart() == node->GetStart()) { + // prevNode is the parent of node. + assert(prevNode->GetEnd() >= node->GetEnd()); + tree->parent() = prevTree; + prevTree->children().push_back(tree); + } else { + // prevNode is a descendant of node's parent. The lowest common + // ancestor of prevNode and node will be node's parent. + SyntaxTree *ancestor = prevTree->parent(); + while (ancestor->value().GetEnd() < tree->value().GetEnd()) { + ancestor = ancestor->parent(); + } + assert(ancestor); + tree->parent() = ancestor; + ancestor->children().push_back(tree); + } + prevNode = node; + prevTree = tree; + } + } + } + + //return boost::shared_ptr(root); + return std::auto_ptr(root); +} + } // namespace MosesTraining diff --git a/phrase-extract/SyntaxNodeCollection.h b/phrase-extract/SyntaxNodeCollection.h index 70b14206d..c54400ca1 100644 --- a/phrase-extract/SyntaxNodeCollection.h +++ b/phrase-extract/SyntaxNodeCollection.h @@ -24,7 +24,10 @@ #include #include +#include + #include "SyntaxNode.h" +#include "SyntaxTree.h" namespace MosesTraining { @@ -70,6 +73,9 @@ public: } void ConnectNodes(); void Clear(); + + std::auto_ptr ExtractTree(); + //boost::shared_ptr ExtractTree(); }; } // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.cpp b/phrase-extract/extract-ghkm/XmlTreeParser.cpp index bbf20c765..83dfbd42f 100644 --- a/phrase-extract/extract-ghkm/XmlTreeParser.cpp +++ b/phrase-extract/extract-ghkm/XmlTreeParser.cpp @@ -53,11 +53,11 @@ std::auto_ptr XmlTreeParser::Parse(const std::string &line) } catch (const XmlException &e) { throw Exception(e.getMsg()); } - m_tree.ConnectNodes(); - SyntaxNode *root = m_tree.GetTop(); - assert(root); + //boost::shared_ptr root = m_tree.ExtractTree(); + std::auto_ptr root = m_tree.ExtractTree(); m_words = util::tokenize(m_line); - return ConvertTree(*root, m_words); + AttachWords(m_words, *root); + return root; } // Converts a SyntaxNode tree to a MosesTraining::GHKM::SyntaxTree. 
@@ -90,5 +90,32 @@ std::auto_ptr XmlTreeParser::ConvertTree( return root; } +void XmlTreeParser::AttachWords(const std::vector &words, + SyntaxTree &root) +{ + std::vector leaves; + leaves.reserve(words.size()); + for (SyntaxTree::LeafIterator p(root); p != SyntaxTree::LeafIterator(); ++p) { + leaves.push_back(&*p); + } + + std::vector::const_iterator q = words.begin(); + for (std::vector::iterator p = leaves.begin(); p != leaves.end(); + ++p) { + SyntaxTree *leaf = *p; + const int start = leaf->value().GetStart(); + const int end = leaf->value().GetEnd(); + if (start != end) { + std::ostringstream msg; + msg << "leaf node covers multiple words (" << start << "-" << end + << "): this is currently unsupported"; + throw Exception(msg.str()); + } + SyntaxTree *newLeaf = new SyntaxTree(SyntaxNode(start, end, *q++)); + leaf->children().push_back(newLeaf); + newLeaf->parent() = leaf; + } +} + } // namespace GHKM } // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.h b/phrase-extract/extract-ghkm/XmlTreeParser.h index 4e89e7167..2fcdd9b56 100644 --- a/phrase-extract/extract-ghkm/XmlTreeParser.h +++ b/phrase-extract/extract-ghkm/XmlTreeParser.h @@ -54,12 +54,13 @@ public: }; private: - std::set &m_labelSet; std::map &m_topLabelSet; std::string m_line; SyntaxNodeCollection m_tree; std::vector m_words; + + void AttachWords(const std::vector &, SyntaxTree &); }; } // namespace GHKM From 25f98a446e8802398a5f06bf299ca1587aad157f Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 1 Jun 2015 18:19:34 +0100 Subject: [PATCH 029/108] Bug fix in building imTtrack directly from input stream. --- moses/TranslationModel/UG/mm/ug_im_ttrack.h | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_im_ttrack.h b/moses/TranslationModel/UG/mm/ug_im_ttrack.h index 20ab653f4..503a5546c 100644 --- a/moses/TranslationModel/UG/mm/ug_im_ttrack.h +++ b/moses/TranslationModel/UG/mm/ug_im_ttrack.h @@ -57,7 +57,7 @@ namespace ugdiss public: imTtrack(boost::shared_ptr > > const& d); - imTtrack(istream& in, TokenIndex const& V, ostream* log = NULL); + imTtrack(istream& in, TokenIndex& V, ostream* log = NULL); imTtrack(size_t reserve = 0); // imTtrack(istream& in, Vocab& V); @@ -131,24 +131,30 @@ namespace ugdiss template imTtrack:: - imTtrack(istream& in, TokenIndex const& V, ostream* log) + imTtrack(istream& in, TokenIndex& V, ostream* log) : numToks(0) { myData.reset(new vector >()); string line,w; size_t linectr=0; boost::unordered_map H; - for (id_type i = 0; i < V.knownVocabSize(); ++i) - H[V[i]] = i; + // for (id_type i = 0; i < V.knownVocabSize(); ++i) + // H[V[i]] = i; while (getline(in,line)) { + // cout << line << endl; myData->push_back(vector()); if (log && ++linectr%1000000==0) *log << linectr/1000000 << "M lines of input processed" << endl; istringstream buf(line); + // cout << line << endl; while (buf>>w) - myData->back().push_back(Token(H[w])); - myData->back().resize(myData.back().size()); + { + myData->back().push_back(Token(V[w])); + // cout << w << " " << myData->back().back().id() << " " + // << V[w] << endl; + } + // myData->back().resize(myData->back().size(), Token(0)); numToks += myData->back().size(); } } From 349163f3fd915c9c61241778db3eecb36d6f526d Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 1 Jun 2015 18:21:52 +0100 Subject: [PATCH 030/108] Bug fix and in-line code documentation. 
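Since the previous patch now maps every corpus token through a dynamic TokenIndex (Token(V[w])), operator[](char const*) has to work on an index that was never open()ed from disk: the constructor now initialises startIdx and endIdx, and the lookup only binary-searches the memory-mapped entry range when that range is non-empty, falling through to the dynamic map otherwise. Below is a minimal, self-contained sketch of the patched control flow, rewritten over std::map purely for illustration; Dict, lookup and the toy id assignment are stand-ins invented for this sketch and are not part of the Moses API.

    // toy_lookup.cc -- schematic model of the patched TokenIndex lookup path
    #include <cassert>
    #include <map>
    #include <string>

    typedef std::map<std::string, unsigned> Dict;

    unsigned lookup(Dict const& fixed, Dict& extra, bool dynamic,
                    unsigned unkId, std::string const& w)
    {
      if (!fixed.empty())                  // static (memory-mapped) part present
        {
          Dict::const_iterator m = fixed.find(w);
          if (m != fixed.end()) return m->second;
          if (!dynamic) return unkId;      // static-only index: unknown word
        }
      else if (!dynamic) return (w == "NULL" ? 0 : unkId);
      Dict::iterator m = extra.find(w);    // dynamic part: register unseen words
      if (m == extra.end())
        m = extra.insert(std::make_pair(w, unsigned(fixed.size() + extra.size()))).first;
      return m->second;
    }

    int main()
    {
      Dict fixed, extra;                   // empty static index, as in the previous patch
      unsigned a = lookup(fixed, extra, true, 1, "house");
      assert(lookup(fixed, extra, true, 1, "house") == a);  // repeated lookups are stable
      return 0;
    }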
--- .../TranslationModel/UG/mm/tpt_tokenindex.cc | 32 +++++++++++++------ 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/moses/TranslationModel/UG/mm/tpt_tokenindex.cc b/moses/TranslationModel/UG/mm/tpt_tokenindex.cc index 5fc6a6acc..0be8aa082 100644 --- a/moses/TranslationModel/UG/mm/tpt_tokenindex.cc +++ b/moses/TranslationModel/UG/mm/tpt_tokenindex.cc @@ -16,7 +16,8 @@ namespace ugdiss TokenIndex:: TokenIndex(string unkToken) - : ridx(0),unkLabel(unkToken),unkId(1),numTokens(0) + : ridx(0), unkLabel(unkToken), unkId(1), numTokens(0) + , startIdx(0), endIdx(0) { lock.reset(new boost::mutex()); }; @@ -94,15 +95,25 @@ namespace ugdiss TokenIndex:: operator[](char const* p) const { - if (startIdx==endIdx && !dynamic) return strcmp(p,"NULL") && unkId; - Entry const* bla = lower_bound(startIdx,endIdx,p,comp); - if (bla != endIdx && !strcmp(comp.base+bla->offset,p)) - return bla->id; - if (!dynamic) return unkId; + if (startIdx != endIdx) + { + Entry const* bla = lower_bound(startIdx,endIdx,p,comp); + if (bla != endIdx && !strcmp(comp.base+bla->offset,p)) + return bla->id; + if (!dynamic) return unkId; + } + else if (!dynamic) return strcmp(p,"NULL") && unkId; + boost::lock_guard lk(*this->lock); - // stuff below is new as of 2011-01-30, for dynamic adding of unknown items - // IMPORTANT: numTokens is not currently not changed, it is the number of - // PRE-EXISING TOKENS, not including dynamically added Items + // stuff below is new as of 2011-01-30, for dynamic adding of + // unknown items IMPORTANT: numTokens is not currently not + // changed, it is the number of PRE-EXISING TOKENS, not including + // dynamically added Items + // if (!str2idExtra) + // { + // this->str2idExtra.reset(new map()); + // this->newWords.reset(new vector()); + // } map::value_type newItem(p,str2idExtra->size()+numTokens); pair::iterator,bool> foo = str2idExtra->insert(newItem); if (foo.second) // it actually is a new item @@ -144,10 +155,13 @@ namespace ugdiss if (!ridx.size()) { boost::lock_guard lk(*this->lock); + // Someone else (multi-threading!) may have created the + // reverse index in the meantime, so let's check again if (!ridx.size()) ridx = reverseIndex(); } if (id < ridx.size()) return ridx[id]; + boost::lock_guard lk(*this->lock); if (dynamic && id < ridx.size()+newWords->size()) return (*newWords)[id-ridx.size()].c_str(); From 99896cfd2cbbe9bdde1f16c0bf1adbbfc6579296 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 1 Jun 2015 18:22:37 +0100 Subject: [PATCH 031/108] Untangling bitext class from Moses dependencies, so that the class can be used independently of Moses again. 
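The decoder-specific pieces (ContextScope, TranslationTask, the bias/cache plumbing) move behind #ifndef NO_MOSES and into the new header ug_bitext_moses.h, while the plain prep()/lookup() overloads remain available either way, so stand-alone tools only ever see the task-free interface. The snippet below is a toy model of that compile-time split, for illustration only: the real overload pair is Bitext::lookup(ttask, phrase) versus Bitext::lookup(phrase); the types here are invented stand-ins so the example builds both with and without -DNO_MOSES.

    // split_demo.cc -- toy illustration of the NO_MOSES overload split
    #include <iostream>

    #ifndef NO_MOSES
    struct ToyTask { int id; };             // stands in for the TranslationTask handle
    void lookup(ToyTask const& t, char const* p)
    { std::cout << "task-scoped lookup (" << t.id << "): " << p << std::endl; }
    #endif

    void lookup(char const* p)              // always compiled; no decoder types needed
    { std::cout << "plain lookup: " << p << std::endl; }

    int main()
    {
    #ifndef NO_MOSES
      ToyTask t = { 42 };
      lookup(t, "maison");
    #else
      lookup("maison");
    #endif
      return 0;
    }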
--- moses/TranslationModel/UG/mm/ug_bitext.h | 186 +++--------------- .../TranslationModel/UG/mm/ug_bitext_moses.h | 88 +++++++++ .../UG/mm/ug_lexical_reordering.h | 30 ++- moses/TranslationModel/UG/mm/ug_mm_bitext.h | 5 +- moses/TranslationModel/UG/mm/ug_phrasepair.h | 6 + 5 files changed, 156 insertions(+), 159 deletions(-) create mode 100644 moses/TranslationModel/UG/mm/ug_bitext_moses.h diff --git a/moses/TranslationModel/UG/mm/ug_bitext.h b/moses/TranslationModel/UG/mm/ug_bitext.h index c1a065b0a..2d2afc3ca 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext.h +++ b/moses/TranslationModel/UG/mm/ug_bitext.h @@ -35,12 +35,18 @@ #include "moses/TranslationModel/UG/generic/sampling/Sampling.h" #include "moses/TranslationModel/UG/generic/file_io/ug_stream.h" #include "moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.h" -#include "moses/FF/LexicalReordering/LexicalReorderingState.h" +// #include "moses/FF/LexicalReordering/LexicalReorderingState.h" #include "moses/Util.h" -// #include "moses/StaticData.h" + +#ifndef NO_MOSES +// #pragma message "COMPILING WITH MOSES SUPPORT!" +#include "moses/StaticData.h" #include "moses/thread_safe_container.h" #include "moses/ContextScope.h" #include "moses/TranslationTask.h" +#else +// #pragma message "COMPILING WITHOUT MOSES SUPPORT!" +#endif #include "util/exception.hh" // #include "util/check.hh" @@ -70,6 +76,7 @@ namespace Moses { float lbop(size_t const tries, size_t const succ, float const confidence); void write_bitvector(bitvector const& v, ostream& out); +#ifndef NO_MOSES struct ContextForQuery { @@ -82,7 +89,7 @@ namespace Moses { ostream* bias_log; ContextForQuery() : bias_log(NULL) { } }; - +#endif template class Bitext @@ -140,8 +147,13 @@ namespace Moses { // prep2 launches sampling and returns immediately. 
// lookup (below) waits for the job to finish before it returns + sptr + prep2(iter const& phrase, int max_sample = -1) const; + +#ifndef NO_MOSES sptr prep2(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const; +#endif public: Bitext(size_t const max_sample = 1000, size_t const xnum_workers = 16); @@ -157,9 +169,15 @@ namespace Moses { open(string const base, string const L1, string const L2) = 0; sptr - lookup(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const; + lookup(iter const& phrase, int max_sample = -1) const; + void prep(iter const& phrase) const; +#ifndef NO_MOSES + sptr + lookup(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const; void prep(ttasksptr const& ttask, iter const& phrase) const; +#endif + void setDefaultSampleSize(size_t const max_samples); size_t getDefaultSampleSize() const; @@ -181,16 +199,7 @@ namespace Moses { void write_yawat_alignment ( id_type const sid, iter const* m1, iter const* m2, ostream& out ) const; -#if 0 - // needs to be adapted to the new API - void - lookup(std::vector const& snt, TSA& idx, - std::vector > > > >& dest, - std::vector >* pidmap = NULL, - typename PhrasePair::Scorer* scorer=NULL, - sptr const bias, - bool multithread=true) const; -#endif + string docname(id_type const sid) const; }; @@ -427,11 +436,13 @@ namespace Moses { template void Bitext:: - prep(ttasksptr const& ttask, iter const& phrase) const + prep(iter const& phrase) const { - prep2(ttask, phrase, m_default_sample_size); + prep2(phrase, m_default_sample_size); } + + // prep2 schedules a phrase for sampling, and returns immediately // the member function lookup retrieves the respective pstats instance // and waits until the sampling is finished before it returns. @@ -440,26 +451,20 @@ namespace Moses { sptr Bitext ::prep2 - ( ttasksptr const& ttask, iter const& phrase, int max_sample) const + (iter const& phrase, int max_sample) const { if (max_sample < 0) max_sample = m_default_sample_size; - sptr scope = ttask->GetScope(); - sptr context = scope->get(this); sptr bias; - if (context) bias = context->bias; sptr cache; - // - no caching for rare phrases and special requests (max_sample) // (still need to test what a good caching threshold is ...) // - use the task-specific cache when there is a sampling bias if (max_sample == int(m_default_sample_size) && phrase.approxOccurrenceCount() > m_pstats_cache_threshold) { - cache = (phrase.root == I1.get() - ? (bias ? context->cache1 : m_cache1) - : (bias ? context->cache2 : m_cache2)); - // if (bias) cerr << "Using bias." << endl; + cache = (phrase.root == I1.get() ? 
m_cache1 : m_cache2); } + sptr ret; sptr const* cached; @@ -472,9 +477,6 @@ namespace Moses { if (m_num_workers > 1) ag->add_workers(m_num_workers); } - // cerr << "NEW FREQUENT PHRASE: " - // << phrase.str(V1.get()) << " " << phrase.approxOccurrenceCount() - // << " at " << __FILE__ << ":" << __LINE__ << endl; ret = ag->add_job(this, phrase, max_sample, bias); if (cache) cache->set(phrase.getPid(),ret); UTIL_THROW_IF2(ret == NULL, "Couldn't schedule sampling job."); @@ -545,87 +547,6 @@ namespace Moses { } }; -#if 0 - template - void - Bitext:: - lookup(std::vector const& snt, TSA& idx, - std::vector > > > >& dest, - std::vector >* pidmap, - typename PhrasePair::Scorer* scorer, - sptr const& bias, bool multithread) const - { - // typedef std::vector > > > > ret_t; - - dest.clear(); - dest.resize(snt.size()); - if (pidmap) { pidmap->clear(); pidmap->resize(snt.size()); } - - // collect statistics in parallel, then build PT entries as - // the sampling finishes - bool fwd = &idx == I1.get(); - std::vector workers; // background threads doing the lookup - pplist_cache_t& C = (fwd ? m_pplist_cache1 : m_pplist_cache2); - if (C.capacity() < 100000) C.reserve(100000); - for (size_t i = 0; i < snt.size(); ++i) - { - dest[i].reserve(snt.size()-i); - typename TSA::tree_iterator m(&idx); - for (size_t k = i; k < snt.size() && m.extend(snt[k].id()); ++k) - { - uint64_t key = m.getPid(); - if (pidmap) (*pidmap)[i].push_back(key); - sptr > > pp = C.get(key); - if (pp) - dest[i].push_back(pp); - else - { - pp.reset(new std::vector >()); - C.set(key,pp); - dest[i].push_back(pp); - sptr x = prep2(m, this->default_sample_size,bias); - pstats2pplist w(m,*(fwd?T2:T1),x,*pp,scorer); - if (multithread) - { - boost::thread* t = new boost::thread(w); - workers.push_back(t); - } - else w(); - } - } - } - for (size_t w = 0; w < workers.size(); ++w) - { - workers[w]->join(); - delete workers[w]; - } - } -#endif - - template - sptr - Bitext:: - lookup(ttasksptr const& ttask, iter const& phrase, int max_sample) const - { - sptr ret = prep2(ttask, phrase, max_sample); - - UTIL_THROW_IF2(!ret, "Got NULL pointer where I expected a valid pointer."); - - // Why were we locking here? 
- if (m_num_workers <= 1) - { - boost::unique_lock guard(m_lock); - typename agenda::worker(*this->ag)(); - } - else - { - boost::unique_lock lock(ret->lock); - while (ret->in_progress) - ret->ready.wait(lock); - } - return ret; - } - template void Bitext @@ -729,27 +650,6 @@ namespace Moses { } } -#if 0 - template - sptr - Bitext:: - lookup(siter const& phrase, size_t const max_sample, - sptr const& bias) const - { - sptr ret = prep2(phrase, max_sample); - boost::unique_lock guard(m_lock); - if (this->num_workers <= 1) - typename agenda::worker(*this->ag)(); - else - { - boost::unique_lock lock(ret->lock); - while (ret->in_progress) - ret->ready.wait(lock); - } - return ret; - } -#endif - template void expand(typename Bitext::iter const& m, @@ -773,33 +673,9 @@ namespace Moses { } } -#if 0 - template - class - PStatsCache - { - typedef boost::unordered_map > my_cache_t; - boost::shared_mutex m_lock; - my_cache_t m_cache; - - public: - sptr get(Bitext::iter const& phrase) const; - - sptr - add(Bitext::iter const& phrase) const - { - uint64_t pid = phrase.getPid(); - std::pair - } - - - }; -#endif } // end of namespace bitext } // end of namespace moses #include "ug_im_bitext.h" #include "ug_mm_bitext.h" - - - +#include "ug_bitext_moses.h" diff --git a/moses/TranslationModel/UG/mm/ug_bitext_moses.h b/moses/TranslationModel/UG/mm/ug_bitext_moses.h new file mode 100644 index 000000000..539a9166d --- /dev/null +++ b/moses/TranslationModel/UG/mm/ug_bitext_moses.h @@ -0,0 +1,88 @@ +// -*- mode: c++; cc-style: moses-cc-style -*- +#pragma once +#ifndef NO_MOSES +namespace Moses { +namespace bitext { + +template +sptr +Bitext:: +lookup(ttasksptr const& ttask, iter const& phrase, int max_sample) const +{ + sptr ret = prep2(ttask, phrase, max_sample); + UTIL_THROW_IF2(!ret, "Got NULL pointer where I expected a valid pointer."); + + // Why were we locking here? + if (m_num_workers <= 1) + { + boost::unique_lock guard(m_lock); + typename agenda::worker(*this->ag)(); + } + else + { + boost::unique_lock lock(ret->lock); + while (ret->in_progress) + ret->ready.wait(lock); + } + return ret; +} + + +template +void +Bitext:: +prep(ttasksptr const& ttask, iter const& phrase) const +{ + prep2(ttask, phrase, m_default_sample_size); +} + + +// prep2 schedules a phrase for sampling, and returns immediately +// the member function lookup retrieves the respective pstats instance +// and waits until the sampling is finished before it returns. +// This allows sampling in the background +template +sptr +Bitext +::prep2 +( ttasksptr const& ttask, iter const& phrase, int max_sample) const +{ + if (max_sample < 0) max_sample = m_default_sample_size; + sptr bias; + sptr scope = ttask->GetScope(); + sptr context = scope->get(this); + if (context) bias = context->bias; + sptr cache; + // - no caching for rare phrases and special requests (max_sample) + // (still need to test what a good caching threshold is ...) + // - use the task-specific cache when there is a sampling bias + if (max_sample == int(m_default_sample_size) + && phrase.approxOccurrenceCount() > m_pstats_cache_threshold) + { + cache = (phrase.root == I1.get() + ? (bias ? context->cache1 : m_cache1) + : (bias ? 
context->cache2 : m_cache2)); + } + sptr ret; + sptr const* cached; + + if (cache && (cached = cache->get(phrase.getPid(), ret)) && *cached) + return *cached; + boost::unique_lock guard(m_lock); + if (!ag) + { + ag.reset(new agenda(*this)); + if (m_num_workers > 1) + ag->add_workers(m_num_workers); + } + ret = ag->add_job(this, phrase, max_sample, bias); + if (cache) cache->set(phrase.getPid(),ret); + UTIL_THROW_IF2(ret == NULL, "Couldn't schedule sampling job."); + return ret; +} + + + +} +} +#endif diff --git a/moses/TranslationModel/UG/mm/ug_lexical_reordering.h b/moses/TranslationModel/UG/mm/ug_lexical_reordering.h index 9004b757e..9c56e6cb5 100644 --- a/moses/TranslationModel/UG/mm/ug_lexical_reordering.h +++ b/moses/TranslationModel/UG/mm/ug_lexical_reordering.h @@ -1,9 +1,35 @@ // -*- c++ -*- #pragma once #include -#include "moses/FF/LexicalReordering/LexicalReorderingState.h" -namespace Moses { namespace bitext { +#ifndef NO_MOSES +#include "moses/FF/LexicalReordering/LexicalReorderingState.h" +#endif + +namespace Moses { +#ifdef NO_MOSES +namespace LRModel{ + + enum ModelType { Monotonic, MSD, MSLR, LeftRight, None }; + enum Direction { Forward, Backward, Bidirectional }; + + enum ReorderingType { + M = 0, // monotonic + NM = 1, // non-monotonic + S = 1, // swap + D = 2, // discontinuous + DL = 2, // discontinuous, left + DR = 3, // discontinuous, right + R = 0, // right + L = 1, // left + MAX = 3, // largest possible + NONE = 4 // largest possible + }; + +} +#endif + +namespace bitext { typedef Moses::LRModel::ReorderingType PhraseOrientation; diff --git a/moses/TranslationModel/UG/mm/ug_mm_bitext.h b/moses/TranslationModel/UG/mm/ug_mm_bitext.h index be3fdfce8..4f93d4d3c 100644 --- a/moses/TranslationModel/UG/mm/ug_mm_bitext.h +++ b/moses/TranslationModel/UG/mm/ug_mm_bitext.h @@ -45,8 +45,9 @@ namespace Moses this->m_docname2docid[docname] = docid; this->m_docname.push_back(docname); line >> b; - VERBOSE(1, "DOCUMENT MAP " << docname - << " " << a << "-" << b+a << endl); +#ifndef NO_MOSES + VERBOSE(1, "DOCUMENT MAP " << docname << " " << a << "-" << b+a << endl); +#endif for (b += a; a < b; ++a) (*this->m_sid2docid)[a] = docid; } diff --git a/moses/TranslationModel/UG/mm/ug_phrasepair.h b/moses/TranslationModel/UG/mm/ug_phrasepair.h index 53a9f761c..7e565c2df 100644 --- a/moses/TranslationModel/UG/mm/ug_phrasepair.h +++ b/moses/TranslationModel/UG/mm/ug_phrasepair.h @@ -3,7 +3,9 @@ #include #include "ug_typedefs.h" #include "ug_bitext_pstats.h" +#ifndef NO_MOSES #include "moses/FF/LexicalReordering/LexicalReorderingState.h" +#endif #include "boost/format.hpp" #include "tpt_tokenindex.h" namespace Moses @@ -52,9 +54,11 @@ namespace Moses fill_lr_vec(LRModel::Direction const& dir, LRModel::ModelType const& mdl, vector& v) const; +#ifndef NO_MOSES void print(ostream& out, TokenIndex const& V1, TokenIndex const& V2, LRModel const& LR) const; +#endif class SortByTargetIdSeq { @@ -292,6 +296,7 @@ namespace Moses } +#ifndef NO_MOSES template void PhrasePair @@ -331,5 +336,6 @@ namespace Moses } #endif } +#endif } // namespace bitext } // namespace Moses From cc800742b1163545d3c72544f5cfd6a5059eeba0 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 1 Jun 2015 18:26:27 +0100 Subject: [PATCH 032/108] Updated Makefile for local compiles. 
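For orientation, the rewritten Makefile below selects the build flavour through a `variant` variable (runtime, debug, profile, syntax), drops sources listed in `skip` that do not compile under NO_MOSES, and locates MOSES_ROOT by walking up the directory tree until it finds Jamroot. A minimal sketch of a local invocation, assuming GNU make, bash and g++ are on the PATH and that mtt-build.cc (one of the tools with a main() function) is among the scanned sources:

    # objects are written to build/debug, the binary to bin/debug/mtt-build;
    # program targets are derived from whichever .cc files define main()
    make variant=debug compiler=g++ mtt-build
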
--- moses/TranslationModel/UG/Makefile | 174 +++++++++++++++-------------- 1 file changed, 92 insertions(+), 82 deletions(-) diff --git a/moses/TranslationModel/UG/Makefile b/moses/TranslationModel/UG/Makefile index 56fad1feb..ed1dead52 100644 --- a/moses/TranslationModel/UG/Makefile +++ b/moses/TranslationModel/UG/Makefile @@ -1,7 +1,22 @@ -# Some systems apparently distinguish between shell -# variables and environment variables. The latter are -# visible to the make utility, the former apparently not, -# so we need to set them if they are not defined yet +SHELL = bash +MAKEFLAGS += --warn-undefined-variables +.DEFAULT_GOAL = all +.SUFFIXES: + +# =============================================================================== +# LOCAL ENVIRONMENT +# =============================================================================== + +# shell script snippet: +define find_moses_root +d=$$(pwd); +while [[ ! -e $$d/Jamroot && $$d != "/" ]] ; do + d=$$(dirname $$d); +done; +echo $$d +endef + +MOSES_ROOT := $(shell $(find_moses_root)) # =============================================================================== # COMPILATION PREFERENCES @@ -10,107 +25,102 @@ # OPTI: optimization level # PROF: profiler switches -CCACHE = ccache -OPTI = 3 -EXE_TAG = exe -PROF = -# PROF = -g -pg +# compiler command +compiler ?= g++ +variant ?= runtime +link ?= static -# =============================================================================== +CXX = $(shell which ccache) $(compiler) +CXXFLAGS += -DMAX_NUM_FACTORS=4 +CXXFLAGS += -DKENLM_MAX_ORDER=5 +CXXFLAGS += -DWITH_THREADS +CXXFLAGS += -DNO_MOSES +CXXFLAGS += -I${MOSES_ROOT} -I. -SHELL = bash -MAKEFLAGS += --warn-undefined-variables -.DEFAULT_GOAL = all -.SUFFIXES: +ifeq ($(variant),debug) +CXXFLAGS += -ggdb -O0 +else ifeq ($(variant),profile) +CXXFLAGS += -g -pg -O3 +else ifeq ($(variant),syntax) +CXXFLAGS += -fsyntax-only +endif + +# LDFLAGS = -L${MOSES_ROOT}/lib -L ./lib/ + +# WDIR = build/$(variant)/${HOSTTYPE}/${KERNEL} +WDIR = build/$(variant) # =============================================================================== # COMPILATION 'LOCALIZATION' -HOST ?= $(shell hostname) -HOSTTYPE ?= $(shell uname -m) -KERNEL = $(shell uname -r) +HOST ?= $(shell hostname) +HOSTTYPE ?= $(shell uname -m) +KERNEL = $(shell uname -r) -MOSES_ROOT = ${HOME}/code/mosesdecoder -WDIR = build/${HOSTTYPE}/${KERNEL}/${OPTI} -VPATH = ${HOME}/code/mosesdecoder/ -CXXFLAGS = ${PROF} -ggdb -Wall -O${OPTI} ${INCLUDES} -CXXFLAGS += -DMAX_NUM_FACTORS=4 -CXXFLAGS += -DKENLM_MAX_ORDER=5 -modirs := $(addprefix -I,$(shell find ${MOSES_ROOT}/moses ${MOSES_ROOT}/contrib -type d)) -CXXFLAGS += -I${MOSES_ROOT} -INCLUDES = -BZLIB = -BOOSTLIBTAG = +nil: -lzma = lzma -#lzma = -REQLIBS = m z pthread dl ${lzma} ${BZLIB} \ - boost_thread${BOOSTLIBTAG} \ - boost_program_options${BOOSTLIBTAG} \ - boost_system${BOOSTLIBTAG} \ - boost_filesystem${BOOSTLIBTAG} \ - boost_iostreams${BOOSTLIBTAG} z bz2 +# libraries required -# icuuc icuio icui18n \ - -LIBS = $(addprefix -l, moses ${REQLIBS}) -LIBS = $(addprefix -l, ${REQLIBS}) -LIBDIRS = -L${HOME}/code/mosesdecoder/lib -LIBDIRS += -L${HOME}/lib -PREFIX ?= . -BINDIR ?= ${PREFIX}/bin -ifeq "$(OPTI)" "0" -BINPREF = debug. 
-else -BINPREF = +LIBS = m z bz2 pthread dl ${BOOSTLIBS} +BOOSTLIBS := thread system filesystem program_options iostreams +BOOSTLIBS := $(addprefix boost_,${BOOSTLIBS}) +ifdef ($(BOOSTLIBTAG),"") +BOOSTLIBS := $(addsuffix ${BOOSTLIBTAG},${BOOSTLIBS}) endif +cc2obj = $(addsuffix .o,$(patsubst ${MOSES_ROOT}%,$(WDIR)%,\ + $(patsubst .%,$(WDIR)%,$(basename $1)))) +cc2exe = $(addprefix ./bin/$(variant)/,$(basename $(notdir $1))) +cc2trg = $(basename $(notdir $1)) -OBJ2 := +define compile -define compile - -DEP += ${WDIR}/$(basename $(notdir $1)).d -${WDIR}/$(basename $(notdir $1)).o : $1 $(wildcard $(basename $1).h) +DEP += $(basename $(call cc2obj,$1)).d +$(call cc2obj,$1): $1 @echo -e "COMPILING $1" @mkdir -p $$(@D) - ${CXX} ${CXXFLAGS} -MD -MP -c $$(abspath $$<) -o $$@ + @${CXX} ${CXXFLAGS} -MD -MP -c $$< -o $$@ endef -testprogs = test-dynamic-im-tsa try-align -programs = mtt-build mtt-dump symal2mam custom-pt mmlex-build ${testprogs} -programs += mtt-count-words +define build -all: $(addprefix ${BINDIR}/${BINPREF}, $(programs)) - @echo $^ -clean: - rm -f ${WDIR}/*.o ${WDIR}/*.d +$(call cc2trg,$1): $(call cc2exe,$1) +$(call cc2exe,$1): $(call cc2obj,$1) $(LIBOBJ) +ifneq ($(variant),syntax) + @echo -e "LINKING $$@" + @mkdir -p $${@D} + @${CXX} ${CXXFLAGS} -o $$@ $(LIBOBJ) $(addprefix -l,${LIBS}) $$< +endif -custom-pt: ${BINDIR}/${BINPREF}custom-pt - echo $^ +endef -INMOGEN = $(wildcard ${MOSES_ROOT}/moses/TranslationModel/UG/generic/*/*.cpp) -#INMOMM = $(wildcard ${MOSES_ROOT}/moses/TranslationModel/UG/mm/*.cc) -#INMOMM += $(wildcard ${MOSES_ROOT}/moses/TranslationModel/UG/mm/*.cpp) -OBJ = $(patsubst %.cc,%.o,$(wildcard $(patsubst %.h,%.cc,$(wildcard *.h)))) -OBJ += $(patsubst %.cpp,%.o,${INMOGEN}) -#OBJ += $(patsubst %.cpp,%.o,${INMOMM}) -#OBJ += $(patsubst %.cc,%.o,${INMOMM}) -EXE = $(patsubst %.cc,%.o,$(filter-out $(patsubst %.h,%.cc,$(wildcard *.h)),$(wildcard *.cc))) +# list files here that you want explicitly excluded from compilation +skip = sim-pe.cc +skip += mtt.count.cc +skip += try-align2.cc +skip += spe-check-coverage3.cc +skip += mmsapt.cpp +skip += ug_stringdist.cc +skip += ug_splice_arglist.cc +skip += ug_lexical_reordering.cc +skip += ug_sampling_bias.cc -$(foreach cpp,${INMOGEN},$(eval $(call compile,${cpp}))) -$(foreach cpp,$(wildcard *.cc),$(eval $(call compile,${cpp}))) -$(addprefix ${BINDIR}/${BINPREF}, $(programs)): $(addprefix ${WDIR}/,$(notdir ${OBJ})) -$(addprefix ${BINDIR}/${BINPREF}, $(programs)): ${MOSES_ROOT}/lib/libmoses.a -${BINDIR}/${BINPREF}%: ${WDIR}/%.o ${WDIR}/mmsapt_align.o - @mkdir -p ${BINDIR} - echo PREREQS: $^ - $(CXX) $(CXXFLAGS) -o $@ $^ ${LIBDIRS} ${LIBS} +# objects from elsewhere in the moses tree that are needed +extra = ${MOSES_ROOT}/util/exception.cc -#try-align: ${WDIR}/try-align.o ${WDIR}/tpt_tokenindex.o -# $(CXX) $(CXXFLAGS) -o $@ $^ ${LIBDIRS} +$(foreach f,$(skip),$(eval broken+=$(shell find -name $f))) -.SECONDARY: +$(info SCANNING DIRECTORY TREE FOR FILES) +find_cfiles = find -name '*.cc' -or -name '*.cpp' +CFILES = $(filter-out $(broken), $(shell $(find_cfiles))) +PROGRAMS := $(shell $(find_cfiles) | xargs grep -lP '^(int +)?main') +PROGRAMS := $(filter-out $(broken),$(PROGRAMS)) + +ALLOBJ = $(call cc2obj,$(CFILES) $(extra)) +LIBOBJ = $(call cc2obj,$(filter-out $(PROGRAMS),$(CFILES) $(extra))) + +$(foreach f,$(CFILES) $(extra),$(eval $(call compile,$f))) +$(foreach p,$(PROGRAMS),$(eval $(call build,$p))) -include $(DEP) From aa4eed93d5791f98e8fc3f51db650d2aa231cc2c Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 1 Jun 2015 
18:55:40 +0100 Subject: [PATCH 033/108] Bug fix related to getting rid of using namespace std; . --- .../UG/generic/sorting/VectorIndexSorter.h | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h b/moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h index 31132c63c..f224b3bae 100644 --- a/moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h +++ b/moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h @@ -16,24 +16,28 @@ namespace Moses { - using namespace std; + // using namespace std; + + + using std::greater; + template, typename IDX_T=size_t> class VectorIndexSorter - : public binary_function + : public std::binary_function { - vector const& m_vecref; + std::vector const& m_vecref; boost::shared_ptr m_comp; public: COMP const& Compare; - VectorIndexSorter(vector const& v, COMP const& comp) + VectorIndexSorter(std::vector const& v, COMP const& comp) : m_vecref(v), Compare(comp) { } - VectorIndexSorter(vector const& v) + VectorIndexSorter(std::vector const& v) : m_vecref(v), m_comp(new COMP()), Compare(*m_comp) { } @@ -43,20 +47,20 @@ namespace Moses return (fwd == bwd ? a < b : fwd); } - boost::shared_ptr > + boost::shared_ptr > GetOrder() const; void - GetOrder(vector & order) const; + GetOrder(std::vector & order) const; }; template - boost::shared_ptr > + boost::shared_ptr > VectorIndexSorter:: GetOrder() const { - boost::shared_ptr > ret(new vector(m_vecref.size())); + boost::shared_ptr > ret(new std::vector(m_vecref.size())); get_order(*ret); return ret; } @@ -64,7 +68,7 @@ namespace Moses template void VectorIndexSorter:: - GetOrder(vector & order) const + GetOrder(std::vector & order) const { order.resize(m_vecref.size()); for (IDX_T i = 0; i < IDX_T(m_vecref.size()); ++i) order[i] = i; From 35cf55d4d25eaff8c99a6467e7fc923f35ac7aa7 Mon Sep 17 00:00:00 2001 From: Jeroen Vermeulen Date: Tue, 2 Jun 2015 15:03:18 +0700 Subject: [PATCH 034/108] Trailing spaces. --- moses/TranslationModel/UG/Makefile | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/moses/TranslationModel/UG/Makefile b/moses/TranslationModel/UG/Makefile index ed1dead52..e595609ad 100644 --- a/moses/TranslationModel/UG/Makefile +++ b/moses/TranslationModel/UG/Makefile @@ -11,8 +11,8 @@ MAKEFLAGS += --warn-undefined-variables define find_moses_root d=$$(pwd); while [[ ! 
-e $$d/Jamroot && $$d != "/" ]] ; do - d=$$(dirname $$d); -done; + d=$$(dirname $$d); +done; echo $$d endef @@ -23,14 +23,14 @@ MOSES_ROOT := $(shell $(find_moses_root)) # =============================================================================== # CCACHE: if set to ccache, use ccache to speed up compilation # OPTI: optimization level -# PROF: profiler switches +# PROF: profiler switches # compiler command compiler ?= g++ variant ?= runtime link ?= static -CXX = $(shell which ccache) $(compiler) +CXX = $(shell which ccache) $(compiler) CXXFLAGS += -DMAX_NUM_FACTORS=4 CXXFLAGS += -DKENLM_MAX_ORDER=5 CXXFLAGS += -DWITH_THREADS @@ -70,7 +70,7 @@ endif cc2obj = $(addsuffix .o,$(patsubst ${MOSES_ROOT}%,$(WDIR)%,\ $(patsubst .%,$(WDIR)%,$(basename $1)))) cc2exe = $(addprefix ./bin/$(variant)/,$(basename $(notdir $1))) -cc2trg = $(basename $(notdir $1)) +cc2trg = $(basename $(notdir $1)) define compile @@ -95,9 +95,9 @@ endif endef # list files here that you want explicitly excluded from compilation -skip = sim-pe.cc -skip += mtt.count.cc -skip += try-align2.cc +skip = sim-pe.cc +skip += mtt.count.cc +skip += try-align2.cc skip += spe-check-coverage3.cc skip += mmsapt.cpp skip += ug_stringdist.cc From 0981d2370505672d027d0f6e17890fb36286c439 Mon Sep 17 00:00:00 2001 From: Jeroen Vermeulen Date: Tue, 2 Jun 2015 16:02:39 +0700 Subject: [PATCH 035/108] Lint-fixing binge. --- .beautify-ignore | 2 + mingw/MosesGUI/addMTModel.py | 11 ++- mingw/MosesGUI/chooseMTModel.py | 31 +++++--- mingw/MosesGUI/main.py | 4 +- mingw/MosesGUI/mainWindow.py | 71 ++++++++++++------ misc/processLexicalTableMin.cpp | 2 +- misc/processPhraseTableMin.cpp | 2 +- moses/FF/GlobalLexicalModel.cpp | 2 +- moses/FF/GlobalLexicalModelUnlimited.cpp | 2 +- moses/IOWrapper.cpp | 4 +- moses/StaticData.cpp | 13 ++-- moses/StaticData.h | 2 +- moses/WordsBitmapTest.cpp | 2 +- moses/parameters/BookkeepingOptions.cpp | 2 +- moses/parameters/NBestOptions.cpp | 14 ++-- moses/parameters/NBestOptions.h | 4 +- phrase-extract/score-main.cpp | 13 +++- .../in-decoding-transliteration.pl | 29 ++++++- .../post-decoding-transliteration.pl | 75 ++++++++++++++++--- .../prepare-transliteration-phrase-table.pl | 26 ++++++- .../train-transliteration-module.pl | 70 ++++++++++++++--- scripts/ems/example/data/weight.ini | 10 +-- scripts/ems/support/berkeley-process.sh | 10 ++- 23 files changed, 299 insertions(+), 102 deletions(-) diff --git a/.beautify-ignore b/.beautify-ignore index 9acdb5824..b05524e1d 100644 --- a/.beautify-ignore +++ b/.beautify-ignore @@ -18,6 +18,8 @@ irstlm jam-files lm mingw/MosesGUI/icons_rc.py +mingw/MosesGUI/Ui_credits.py +mingw/MosesGUI/Ui_mainWindow.py moses/TranslationModel/UG phrase-extract/pcfg-common phrase-extract/syntax-common diff --git a/mingw/MosesGUI/addMTModel.py b/mingw/MosesGUI/addMTModel.py index 8d55400d5..09e6fc542 100644 --- a/mingw/MosesGUI/addMTModel.py +++ b/mingw/MosesGUI/addMTModel.py @@ -4,14 +4,17 @@ Module implementing Dialog. """ -from PyQt4.QtGui import * -from PyQt4.QtCore import * +from PyQt4.QtGui import ( + QDialog, + QFileDialog, + ) +from PyQt4.QtCore import pyqtSignature import datetime import os from Ui_addMTModel import Ui_Dialog -from util import * +from util import doAlert class AddMTModelDialog(QDialog, Ui_Dialog): @@ -88,7 +91,7 @@ class AddMTModelDialog(QDialog, Ui_Dialog): def checkEmpty(mystr): return len(str(mystr).strip()) <= 0 - #check everything + # Check everything. 
self.modelName = self.editName.text() if checkEmpty(self.modelName): doAlert("Please provide non-empty Model Name") diff --git a/mingw/MosesGUI/chooseMTModel.py b/mingw/MosesGUI/chooseMTModel.py index 95c566f1e..5702216b8 100644 --- a/mingw/MosesGUI/chooseMTModel.py +++ b/mingw/MosesGUI/chooseMTModel.py @@ -4,11 +4,18 @@ Module implementing ChooseMTModelDialog. """ -from PyQt4.QtCore import * -from PyQt4.QtGui import * -from PyQt4.QtSql import * +import sys + +from PyQt4.QtCore import ( + pyqtSignature, + QObject, + SIGNAL, + ) +from PyQt4.QtGui import QDialog +from PyQt4.QtSql import QSqlQueryModel from Ui_chooseMTModel import Ui_Dialog +from util import doAlert class ChooseMTModelDialog(QDialog, Ui_Dialog): @@ -28,14 +35,20 @@ class ChooseMTModelDialog(QDialog, Ui_Dialog): self.selTableView.hideColumn(0) self.selTableView.hideColumn(5) self.selTableView.hideColumn(6) - #change status and keep the column - QObject.connect(datamodel, SIGNAL("modelInstalled()"), self.on_datamodel_modelInstalled) + # Change status and keep the column. + QObject.connect( + datamodel, SIGNAL("modelInstalled()"), + self.on_datamodel_modelInstalled) def updateModel(self): - self.model.setQuery('SELECT ID, name, srclang, trglang, status, path, mosesini FROM models WHERE status = "READY" AND deleted != "True"', self.database) + self.model.setQuery( + 'SELECT ID, name, srclang, trglang, status, path, mosesini ' + 'FROM models ' + 'WHERE status = "READY" AND deleted != "True"', + self.database) def on_datamodel_recordUpdated(self, bRecord): - #deal with the selection changed problem + """Deal with the selection changed problem.""" try: if bRecord: current = self.selTableView.currentIndex() @@ -44,9 +57,9 @@ class ChooseMTModelDialog(QDialog, Ui_Dialog): else: self.curSelection = None else: - if not self.curSelection is None: + if self.curSelection is not None: self.selTableView.selectRow(self.curSelection) - except Exception, e: + except Exception as e: print >> sys.stderr, str(e) def on_datamodel_modelInstalled(self): diff --git a/mingw/MosesGUI/main.py b/mingw/MosesGUI/main.py index 805a7bc0c..3bab0e617 100644 --- a/mingw/MosesGUI/main.py +++ b/mingw/MosesGUI/main.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- -from PyQt4.QtCore import * -from PyQt4.QtGui import * +from PyQt4.QtGui import QApplication import os import sys @@ -9,7 +8,6 @@ import sys from mainWindow import MainWindow from datamodel import DataModel from moses import Moses -from util import * if __name__ == "__main__": app = QApplication(sys.argv) diff --git a/mingw/MosesGUI/mainWindow.py b/mingw/MosesGUI/mainWindow.py index 5fb031c50..e92cdbb92 100644 --- a/mingw/MosesGUI/mainWindow.py +++ b/mingw/MosesGUI/mainWindow.py @@ -4,10 +4,19 @@ Module implementing MainWindow. """ -from PyQt4.QtCore import * -from PyQt4.QtGui import * -from PyQt4.QtSql import * +from PyQt4.QtCore import ( + pyqtSignature, + QObject, + Qt, + SIGNAL, + ) +from PyQt4.QtGui import ( + QMainWindow, + QMessageBox, + QProgressDialog, + ) +import sys import threading from Ui_mainWindow import Ui_MainWindow @@ -15,7 +24,7 @@ from addMTModel import AddMTModelDialog from chooseMTModel import ChooseMTModelDialog from engine import Engine from credits import DlgCredits -from util import * +from util import doAlert class MainWindow(QMainWindow, Ui_MainWindow): @@ -54,18 +63,27 @@ class MainWindow(QMainWindow, Ui_MainWindow): Slot documentation goes here. 
""" current = self.tableView.currentIndex() - if current and current.row() >= 0: - if self.engine and self.datamodel.getRowID(current.row()) == self.engine.model['ID']: - text = '''The model is still in use, do you want to stop and delete it? -It might take a while...''' - reply = QMessageBox.question(None, 'Message', text, QMessageBox.Yes, QMessageBox.No) - if reply == QMessageBox.No: - return - t = self.stopEngine(self.engine) - t.join() - self.engine = None - self.clearPanel() - self.datamodel.delModel(current.row()) + if not current or current.row() < 0: + return + model_in_use = ( + self.engine and + self.datamodel.getRowID(current.row()) == self.engine.model['ID'] + ) + if model_in_use: + text = ( + "The model is still in use, do you want to " + "stop and delete it?\n" + "It might take a while..." + ) + reply = QMessageBox.question( + None, 'Message', text, QMessageBox.Yes, QMessageBox.No) + if reply == QMessageBox.No: + return + t = self.stopEngine(self.engine) + t.join() + self.engine = None + self.clearPanel() + self.datamodel.delModel(current.row()) @pyqtSignature("") def on_newModelBtn_clicked(self): @@ -153,17 +171,24 @@ It might take a while...''' if self.progress: self.progress.close() self.progress = None - self.progress = QProgressDialog("Model: %s" % model['name'], "Cancel", 0, self.engine.countSteps(), self) + self.progress = QProgressDialog( + "Model: %s" % model['name'], "Cancel", 0, + self.engine.countSteps(), self) self.progress.setAutoReset(True) self.progress.setAutoClose(True) self.progress.setWindowModality(Qt.WindowModal) self.progress.setWindowTitle('Loading Model...') - QObject.connect(self.progress, SIGNAL("canceled()"), self.progressCancelled) + QObject.connect( + self.progress, SIGNAL("canceled()"), self.progressCancelled) self.progress.show() - #connect engine signal - QObject.connect(self.engine, SIGNAL("stepFinished(int)"), self.engineStepFinished) - QObject.connect(self.engine, SIGNAL("loaded(bool, QString)"), self.engineLoaded) + # Connect engine signal. + QObject.connect( + self.engine, SIGNAL("stepFinished(int)"), + self.engineStepFinished) + QObject.connect( + self.engine, SIGNAL("loaded(bool, QString)"), + self.engineLoaded) def startEngineThread(): self.engine.start() @@ -225,7 +250,9 @@ It might take a while...''' if text.strip() == "": trans.append(text) else: - trans.append(self.engine.translate(text.replace('\r', ' ').strip()).decode('utf8')) + trans.append( + self.engine.translate( + text.replace('\r', ' ').strip()).decode('utf8')) self.editTrg.setText('\n'.join(trans)) except Exception, e: print >> sys.stderr, str(e) diff --git a/misc/processLexicalTableMin.cpp b/misc/processLexicalTableMin.cpp index 8d309c331..8eee489ad 100644 --- a/misc/processLexicalTableMin.cpp +++ b/misc/processLexicalTableMin.cpp @@ -54,7 +54,7 @@ int main(int argc, char** argv) bool multipleScoreTrees = true; size_t quantize = 0; - size_t threads = + size_t threads = #ifdef WITH_THREADS boost::thread::hardware_concurrency() ? boost::thread::hardware_concurrency() : #endif diff --git a/misc/processPhraseTableMin.cpp b/misc/processPhraseTableMin.cpp index 92d63433e..3948a692c 100644 --- a/misc/processPhraseTableMin.cpp +++ b/misc/processPhraseTableMin.cpp @@ -67,7 +67,7 @@ int main(int argc, char **argv) bool sortScoreIndexSet = false; size_t sortScoreIndex = 2; bool warnMe = true; - size_t threads = + size_t threads = #ifdef WITH_THREADS boost::thread::hardware_concurrency() ? 
boost::thread::hardware_concurrency() : #endif diff --git a/moses/FF/GlobalLexicalModel.cpp b/moses/FF/GlobalLexicalModel.cpp index f4df403ae..b5a07b1ef 100644 --- a/moses/FF/GlobalLexicalModel.cpp +++ b/moses/FF/GlobalLexicalModel.cpp @@ -111,7 +111,7 @@ void GlobalLexicalModel::Load() void GlobalLexicalModel::InitializeForInput(ttasksptr const& ttask) { - UTIL_THROW_IF2(ttask->GetSource()->GetType() != SentenceInput, + UTIL_THROW_IF2(ttask->GetSource()->GetType() != SentenceInput, "GlobalLexicalModel works only with sentence input."); Sentence const* s = reinterpret_cast(ttask->GetSource().get()); m_local.reset(new ThreadLocalStorage); diff --git a/moses/FF/GlobalLexicalModelUnlimited.cpp b/moses/FF/GlobalLexicalModelUnlimited.cpp index 434fa7fbb..d507054c2 100644 --- a/moses/FF/GlobalLexicalModelUnlimited.cpp +++ b/moses/FF/GlobalLexicalModelUnlimited.cpp @@ -107,7 +107,7 @@ bool GlobalLexicalModelUnlimited::Load(const std::string &filePathSource, void GlobalLexicalModelUnlimited::InitializeForInput(ttasksptr const& ttask) { - UTIL_THROW_IF2(ttask->GetSource()->GetType() != SentenceInput, + UTIL_THROW_IF2(ttask->GetSource()->GetType() != SentenceInput, "GlobalLexicalModel works only with sentence input."); Sentence const* s = reinterpret_cast(ttask->GetSource().get()); m_local.reset(new ThreadLocalStorage); diff --git a/moses/IOWrapper.cpp b/moses/IOWrapper.cpp index 8cbf4f091..d1bdeb44f 100644 --- a/moses/IOWrapper.cpp +++ b/moses/IOWrapper.cpp @@ -303,10 +303,10 @@ ReadInput() boost::lock_guard lock(m_lock); #endif boost::shared_ptr source = GetBufferedInput(); - if (source) + if (source) { source->SetTranslationId(m_currentLine++); - if (m_look_ahead || m_look_back) + if (m_look_ahead || m_look_back) this->set_context_for(*source); } m_past_input.push_back(source); diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp index ac0c3c990..b41768604 100644 --- a/moses/StaticData.cpp +++ b/moses/StaticData.cpp @@ -593,7 +593,7 @@ bool StaticData::LoadData(Parameter *parameter) ini_factor_maps(); ini_input_options(); m_bookkeeping_options.init(*parameter); - m_nbest_options.init(*parameter); // if (!ini_nbest_options()) return false; + m_nbest_options.init(*parameter); // if (!ini_nbest_options()) return false; if (!ini_output_options()) return false; // threading etc. 
@@ -616,14 +616,14 @@ bool StaticData::LoadData(Parameter *parameter) ini_mira_options(); // set m_nbest_options.enabled = true if necessary: - if (m_mbr || m_useLatticeMBR || m_outputSearchGraph || m_outputSearchGraphSLF - || m_mira || m_outputSearchGraphHypergraph || m_useConsensusDecoding + if (m_mbr || m_useLatticeMBR || m_outputSearchGraph || m_outputSearchGraphSLF + || m_mira || m_outputSearchGraphHypergraph || m_useConsensusDecoding #ifdef HAVE_PROTOBUF - || m_outputSearchGraphPB + || m_outputSearchGraphPB #endif || m_latticeSamplesFilePath.size()) - { - m_nbest_options.enabled = true; + { + m_nbest_options.enabled = true; } // S2T decoder @@ -1371,4 +1371,3 @@ void StaticData::ResetWeights(const std::string &denseWeights, const std::string } } // namespace - diff --git a/moses/StaticData.h b/moses/StaticData.h index 2b1e37b83..a93e67003 100644 --- a/moses/StaticData.h +++ b/moses/StaticData.h @@ -103,7 +103,7 @@ protected: BookkeepingOptions m_bookkeeping_options; // size_t m_nBestSize; // size_t m_nBestFactor; - + size_t m_latticeSamplesSize; size_t m_maxNoTransOptPerCoverage; size_t m_maxNoPartTransOpt; diff --git a/moses/WordsBitmapTest.cpp b/moses/WordsBitmapTest.cpp index 3acd1351a..543c96bd1 100644 --- a/moses/WordsBitmapTest.cpp +++ b/moses/WordsBitmapTest.cpp @@ -40,7 +40,7 @@ BOOST_AUTO_TEST_CASE(initialise) bitvec[2] = true; bitvec[3] = true; bitvec[7] = true; - + WordsBitmap wbm2(7,bitvec); BOOST_CHECK_EQUAL(wbm2.GetSize(),7); for (size_t i = 0; i < 7; ++i) { diff --git a/moses/parameters/BookkeepingOptions.cpp b/moses/parameters/BookkeepingOptions.cpp index 875c605bf..2ab26b53c 100644 --- a/moses/parameters/BookkeepingOptions.cpp +++ b/moses/parameters/BookkeepingOptions.cpp @@ -8,7 +8,7 @@ namespace Moses { bool& x = need_alignment_info; P.SetParameter(x, "print-alignment-info", false); if (!x) P.SetParameter(x, "print-alignment-info-in-n-best", false); - if (!x) + if (!x) { PARAM_VEC const* params = P.GetParam("alignment-output-file"); x = params && params->size(); diff --git a/moses/parameters/NBestOptions.cpp b/moses/parameters/NBestOptions.cpp index 6ec97c91b..45747011a 100644 --- a/moses/parameters/NBestOptions.cpp +++ b/moses/parameters/NBestOptions.cpp @@ -10,22 +10,22 @@ init(Parameter const& P) { const PARAM_VEC *params; params = P.GetParam("n-best-list"); - if (params) + if (params) { - if (params->size() >= 2) + if (params->size() >= 2) { output_file_path = params->at(0); nbest_size = Scan( params->at(1) ); only_distinct = (params->size()>2 && params->at(2)=="distinct"); - } - else + } + else { std::cerr << "wrong format for switch -n-best-list file size [disinct]"; return false; } - } + } else nbest_size = 0; - + P.SetParameter(factor, "n-best-factor", 20); P.SetParameter(include_alignment_info, "print-alignment-info-in-n-best", false ); P.SetParameter(include_feature_labels, "labeled-n-best-list", true ); @@ -33,7 +33,7 @@ init(Parameter const& P) P.SetParameter(include_passthrough, "print-passthrough-in-n-best", false ); P.SetParameter(include_all_factors, "report-all-factors-in-n-best", false ); P.SetParameter(print_trees, "n-best-trees", false ); - + enabled = output_file_path.size(); return true; } diff --git a/moses/parameters/NBestOptions.h b/moses/parameters/NBestOptions.h index e844c1eac..6c868990c 100644 --- a/moses/parameters/NBestOptions.h +++ b/moses/parameters/NBestOptions.h @@ -19,11 +19,9 @@ namespace Moses { bool include_all_factors; std::string output_file_path; - + bool init(Parameter const& param); }; - - } diff --git 
a/phrase-extract/score-main.cpp b/phrase-extract/score-main.cpp index b65dce4ba..185c0ae9e 100644 --- a/phrase-extract/score-main.cpp +++ b/phrase-extract/score-main.cpp @@ -130,7 +130,15 @@ int main(int argc, char* argv[]) ScoreFeatureManager featureManager; if (argc < 4) { - std::cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--NoWordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--PartsOfSpeech] [--PCFG] [--TreeFragments] [--SourceLabels] [--SourceLabelCountsLHS] [--TargetPreferenceLabels] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--CrossedNonTerm]" << std::endl; + std::cerr << + "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] " + "[--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] " + "[--NoWordAlignment] [--UnalignedPenalty] " + "[--UnalignedFunctionWordPenalty function-word-file] " + "[--MinCountHierarchical count] [--PartsOfSpeech] [--PCFG] " + "[--TreeFragments] [--SourceLabels] [--SourceLabelCountsLHS] " + "[--TargetPreferenceLabels] [--UnpairedExtractFormat] " + "[--ConditionOnTargetLHS] [--CrossedNonTerm]" << std::endl; std::cerr << featureManager.usage() << std::endl; exit(1); } @@ -147,7 +155,8 @@ int main(int argc, char* argv[]) std::string fileNameLeftHandSideTargetPreferenceLabelCounts; std::string fileNameLeftHandSideRuleTargetTargetPreferenceLabelCounts; std::string fileNamePhraseOrientationPriors; - std::vector featureArgs; // all unknown args passed to feature manager + // All unknown args are passed to feature manager. + std::vector featureArgs; for(int i=4; i $TRANSLIT_MODEL/evaluation/$eval_file.op`; + `$MOSES_SRC/bin/moses \ + -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 \ + -threads 16 -drop-unknown -distortion-limit 0 \ + -n-best-list $TRANSLIT_MODEL/evaluation/$eval_file.op.nBest 100 \ + distinct -f $TRANSLIT_MODEL/evaluation/$eval_file.filtered.ini \ + < $TRANSLIT_MODEL/evaluation/$eval_file \ + > $TRANSLIT_MODEL/evaluation/$eval_file.op`; } diff --git a/scripts/Transliteration/post-decoding-transliteration.pl b/scripts/Transliteration/post-decoding-transliteration.pl index 2c7908085..df840c709 100755 --- a/scripts/Transliteration/post-decoding-transliteration.pl +++ b/scripts/Transliteration/post-decoding-transliteration.pl @@ -137,18 +137,39 @@ sub run_transliteration print "Filter Table\n"; - `$MOSES_SRC/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 -score-options '--KneserNey' -phrase-translation-table $TRANSLIT_MODEL/model/phrase-table -config $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini -lm 0:3:$TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini:8`; + `$MOSES_SRC/scripts/training/train-model.perl \ + -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \ + -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \ + -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \ + -score-options '--KneserNey' \ + -phrase-translation-table $TRANSLIT_MODEL/model/phrase-table \ + -config $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini \ + -lm 0:3:$TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini:8`; - `$MOSES_SRC/scripts/training/filter-model-given-input.pl $TRANSLIT_MODEL/evaluation/$eval_file.filtered $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini 
$TRANSLIT_MODEL/evaluation/$eval_file -Binarizer "$MOSES_SRC/bin/CreateOnDiskPt 1 1 4 100 2"`; + `$MOSES_SRC/scripts/training/filter-model-given-input.pl \ + $TRANSLIT_MODEL/evaluation/$eval_file.filtered \ + $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini \ + $TRANSLIT_MODEL/evaluation/$eval_file \ + -Binarizer "$MOSES_SRC/bin/CreateOnDiskPt 1 1 4 100 2"`; `rm $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini`; print "Apply Filter\n"; - `$MOSES_SRC/scripts/ems/support/substitute-filtered-tables-and-weights.perl $TRANSLIT_MODEL/evaluation/$eval_file.filtered/moses.ini $TRANSLIT_MODEL/model/moses.ini $TRANSLIT_MODEL/tuning/moses.tuned.ini $TRANSLIT_MODEL/evaluation/$eval_file.filtered.ini`; + `$MOSES_SRC/scripts/ems/support/substitute-filtered-tables-and-weights.perl \ + $TRANSLIT_MODEL/evaluation/$eval_file.filtered/moses.ini \ + $TRANSLIT_MODEL/model/moses.ini \ + $TRANSLIT_MODEL/tuning/moses.tuned.ini \ + $TRANSLIT_MODEL/evaluation/$eval_file.filtered.ini`; my $drop_stderr = $VERBOSE ? "" : " 2>/dev/null"; - `$DECODER -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 -threads 16 -drop-unknown -distortion-limit 0 -n-best-list $TRANSLIT_MODEL/evaluation/$eval_file.op.nBest 1000 distinct -f $TRANSLIT_MODEL/evaluation/$eval_file.filtered.ini < $TRANSLIT_MODEL/evaluation/$eval_file > $TRANSLIT_MODEL/evaluation/$eval_file.op $drop_stderr`; + `$DECODER \ + -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 \ + -threads 16 -drop-unknown -distortion-limit 0 \ + -n-best-list $TRANSLIT_MODEL/evaluation/$eval_file.op.nBest 1000 \ + distinct -f $TRANSLIT_MODEL/evaluation/$eval_file.filtered.ini \ + < $TRANSLIT_MODEL/evaluation/$eval_file \ + > $TRANSLIT_MODEL/evaluation/$eval_file.op $drop_stderr`; } @@ -294,22 +315,52 @@ sub run_decoder `mkdir $corpus_dir/evaluation`; - `$MOSES_SRC/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 -lmodel-oov-feature "yes" -post-decoding-translit "yes" -phrase-translation-table $corpus_dir/model/phrase-table -config $corpus_dir/model/moses.ini -lm 0:5:$LM_FILE:8`; + `$MOSES_SRC/scripts/training/train-model.perl \ + -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \ + -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \ + -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \ + -lmodel-oov-feature "yes" -post-decoding-translit "yes" \ + -phrase-translation-table $corpus_dir/model/phrase-table \ + -config $corpus_dir/model/moses.ini -lm 0:5:$LM_FILE:8`; `touch $corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini`; - `$MOSES_SRC/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 -lmodel-oov-feature "yes" -post-decoding-translit "yes" -phrase-translation-table $corpus_dir/model/phrase-table -config $corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini -lm 0:3:$corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini:8`; + `$MOSES_SRC/scripts/training/train-model.perl \ + -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \ + -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \ + -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \ + -lmodel-oov-feature "yes" -post-decoding-translit "yes" \ + -phrase-translation-table $corpus_dir/model/phrase-table \ + -config $corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini \ + -lm 
0:3:$corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini:8`; - `$MOSES_SRC/scripts/training/filter-model-given-input.pl $corpus_dir/evaluation/filtered $corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini $INPUT_FILE -Binarizer "$MOSES_SRC/bin/CreateOnDiskPt 1 1 4 100 2"`; + `$MOSES_SRC/scripts/training/filter-model-given-input.pl \ + $corpus_dir/evaluation/filtered \ + $corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini \ + $INPUT_FILE -Binarizer "$MOSES_SRC/bin/CreateOnDiskPt \ + 1 1 4 100 2"`; `rm $corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini`; - `$MOSES_SRC/scripts/ems/support/substitute-filtered-tables.perl $corpus_dir/evaluation/filtered/moses.ini < $corpus_dir/model/moses.ini > $corpus_dir/evaluation/moses.filtered.ini`; + `$MOSES_SRC/scripts/ems/support/substitute-filtered-tables.perl \ + $corpus_dir/evaluation/filtered/moses.ini \ + < $corpus_dir/model/moses.ini \ + > $corpus_dir/evaluation/moses.filtered.ini`; my $drop_stderr = $VERBOSE ? "" : " 2>/dev/null"; - `$DECODER -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 -threads 16 -feature-overwrite 'TranslationModel0 table-limit=100' -max-trans-opt-per-coverage 100 -f $corpus_dir/evaluation/moses.filtered.ini -distortion-limit 0 < $INPUT_FILE > $OUTPUT_FILE $drop_stderr`; + `$DECODER \ + -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 \ + -threads 16 -feature-overwrite 'TranslationModel0 table-limit=100' \ + -max-trans-opt-per-coverage 100 \ + -f $corpus_dir/evaluation/moses.filtered.ini -distortion-limit 0 \ + < $INPUT_FILE \ + > $OUTPUT_FILE $drop_stderr`; - print "$DECODER -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 -threads 16 -feature-overwrite 'TranslationModel0 table-limit=100' -max-trans-opt-per-coverage 100 -f $corpus_dir/evaluation/moses.filtered.ini -distortion-limit 0 < $INPUT_FILE > $OUTPUT_FILE $drop_stderr\n"; + print "$DECODER \ + -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 \ + -threads 16 -feature-overwrite 'TranslationModel0 table-limit=100' \ + -max-trans-opt-per-coverage 100 \ + -f $corpus_dir/evaluation/moses.filtered.ini -distortion-limit 0 \ + < $INPUT_FILE \ + > $OUTPUT_FILE $drop_stderr\n"; } - - diff --git a/scripts/Transliteration/prepare-transliteration-phrase-table.pl b/scripts/Transliteration/prepare-transliteration-phrase-table.pl index 0a9f554c5..fd8b5a978 100755 --- a/scripts/Transliteration/prepare-transliteration-phrase-table.pl +++ b/scripts/Transliteration/prepare-transliteration-phrase-table.pl @@ -103,17 +103,35 @@ sub run_transliteration print STDERR "Filter Table\n"; - `$MOSES_SRC/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 -reordering msd-bidirectional-fe -score-options '--KneserNey' -phrase-translation-table $TRANSLIT_MODEL/model/phrase-table -reordering-table $TRANSLIT_MODEL/model/reordering-table -config $eval_file.moses.table.ini -lm 0:3:$eval_file.moses.table.ini:8`; + `$MOSES_SRC/scripts/training/train-model.perl \ + -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \ + -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \ + -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \ + -reordering msd-bidirectional-fe -score-options '--KneserNey' \ + -phrase-translation-table $TRANSLIT_MODEL/model/phrase-table \ + -reordering-table $TRANSLIT_MODEL/model/reordering-table \ + -config $eval_file.moses.table.ini \ + -lm 0:3:$eval_file.moses.table.ini:8`; - 
`$MOSES_SRC/scripts/training/filter-model-given-input.pl $eval_file.filtered $eval_file.moses.table.ini $eval_file -Binarizer "$MOSES_SRC/bin/CreateOnDiskPt 1 1 4 100 2"`; + `$MOSES_SRC/scripts/training/filter-model-given-input.pl \ + $eval_file.filtered $eval_file.moses.table.ini $eval_file \ + -Binarizer "$MOSES_SRC/bin/CreateOnDiskPt 1 1 4 100 2"`; `rm $eval_file.moses.table.ini`; print STDERR "Apply Filter\n"; - `$MOSES_SRC/scripts/ems/support/substitute-filtered-tables-and-weights.perl $eval_file.filtered/moses.ini $TRANSLIT_MODEL/model/moses.ini $TRANSLIT_MODEL/tuning/moses.tuned.ini $eval_file.filtered.ini`; + `$MOSES_SRC/scripts/ems/support/substitute-filtered-tables-and-weights.perl \ + $eval_file.filtered/moses.ini $TRANSLIT_MODEL/model/moses.ini \ + $TRANSLIT_MODEL/tuning/moses.tuned.ini $eval_file.filtered.ini`; - `$MOSES_SRC/bin/moses -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 -threads 16 -drop-unknown -distortion-limit 0 -n-best-list $eval_file.op.nBest 50 -f $eval_file.filtered.ini < $eval_file > $eval_file.op`; + `$MOSES_SRC/bin/moses \ + -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 \ + -threads 16 -drop-unknown -distortion-limit 0 \ + -n-best-list $eval_file.op.nBest 50 \ + -f $eval_file.filtered.ini \ + < $eval_file \ + > $eval_file.op`; } diff --git a/scripts/Transliteration/train-transliteration-module.pl b/scripts/Transliteration/train-transliteration-module.pl index b1d4d0ff5..817e2d815 100755 --- a/scripts/Transliteration/train-transliteration-module.pl +++ b/scripts/Transliteration/train-transliteration-module.pl @@ -118,31 +118,81 @@ sub learn_transliteration_model{ print "Align Corpus\n"; - `$MOSES_SRC_DIR/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 -dont-zip -last-step 1 -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 -score-options '--KneserNey' -corpus $OUT_DIR/training/corpus$t -corpus-dir $OUT_DIR/training/prepared`; + `$MOSES_SRC_DIR/scripts/training/train-model.perl \ + -mgiza -mgiza-cpus 10 -dont-zip -last-step 1 \ + -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \ + -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \ + -score-options '--KneserNey' -corpus $OUT_DIR/training/corpus$t \ + -corpus-dir $OUT_DIR/training/prepared`; - `$MOSES_SRC_DIR/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 -dont-zip -first-step 2 -last-step 2 -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 -score-options '--KneserNey' -corpus-dir $OUT_DIR/training/prepared -giza-e2f $OUT_DIR/training/giza -direction 2`; + `$MOSES_SRC_DIR/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 \ + -dont-zip -first-step 2 -last-step 2 \ + -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \ + -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \ + -score-options '--KneserNey' -corpus-dir $OUT_DIR/training/prepared \ + -giza-e2f $OUT_DIR/training/giza -direction 2`; - `$MOSES_SRC_DIR/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 -dont-zip -first-step 2 -last-step 2 -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 -score-options '--KneserNey' -corpus-dir $OUT_DIR/training/prepared -giza-f2e $OUT_DIR/training/giza-inverse -direction 1`; + `$MOSES_SRC_DIR/scripts/training/train-model.perl \ + -mgiza -mgiza-cpus 10 -dont-zip -first-step 2 -last-step 2 \ + -external-bin-dir $EXTERNAL_BIN_DIR -f 
$INPUT_EXTENSION \ + -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \ + -score-options '--KneserNey' -corpus-dir $OUT_DIR/training/prepared \ + -giza-f2e $OUT_DIR/training/giza-inverse -direction 1`; - `$MOSES_SRC_DIR/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 -dont-zip -first-step 3 -last-step 3 -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 -score-options '--KneserNey' -giza-e2f $OUT_DIR/training/giza -giza-f2e $OUT_DIR/training/giza-inverse -alignment-file $OUT_DIR/model/aligned -alignment-stem $OUT_DIR/model/aligned -alignment grow-diag-final-and`; + `$MOSES_SRC_DIR/scripts/training/train-model.perl \ + -mgiza -mgiza-cpus 10 -dont-zip -first-step 3 -last-step 3 \ + -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \ + -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \ + -score-options '--KneserNey' -giza-e2f $OUT_DIR/training/giza \ + -giza-f2e $OUT_DIR/training/giza-inverse \ + -alignment-file $OUT_DIR/model/aligned \ + -alignment-stem $OUT_DIR/model/aligned -alignment grow-diag-final-and`; print "Train Translation Models\n"; - `$MOSES_SRC_DIR/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 -dont-zip -first-step 4 -last-step 4 -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 -score-options '--KneserNey' -lexical-file $OUT_DIR/model/lex -alignment-file $OUT_DIR/model/aligned -alignment-stem $OUT_DIR/model/aligned -corpus $OUT_DIR/training/corpus$t`; + `$MOSES_SRC_DIR/scripts/training/train-model.perl \ + -mgiza -mgiza-cpus 10 -dont-zip -first-step 4 -last-step 4 \ + -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \ + -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \ + -score-options '--KneserNey' -lexical-file $OUT_DIR/model/lex \ + -alignment-file $OUT_DIR/model/aligned \ + -alignment-stem $OUT_DIR/model/aligned \ + -corpus $OUT_DIR/training/corpus$t`; - `$MOSES_SRC_DIR/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 -dont-zip -first-step 5 -last-step 5 -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 -score-options '--KneserNey' -alignment-file $OUT_DIR/model/aligned -alignment-stem $OUT_DIR/model/aligned -extract-file $OUT_DIR/model/extract -corpus $OUT_DIR/training/corpus$t`; + `$MOSES_SRC_DIR/scripts/training/train-model.perl \ + -mgiza -mgiza-cpus 10 -dont-zip -first-step 5 -last-step 5 \ + -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \ + -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \ + -score-options '--KneserNey' -alignment-file $OUT_DIR/model/aligned \ + -alignment-stem $OUT_DIR/model/aligned -extract-file \ + $OUT_DIR/model/extract -corpus $OUT_DIR/training/corpus$t`; - `$MOSES_SRC_DIR/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 -dont-zip -first-step 6 -last-step 6 -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 -score-options '--KneserNey' -extract-file $OUT_DIR/model/extract -lexical-file $OUT_DIR/model/lex -phrase-translation-table $OUT_DIR/model/phrase-table`; + `$MOSES_SRC_DIR/scripts/training/train-model.perl \ + -mgiza -mgiza-cpus 10 -dont-zip -first-step 6 -last-step 6 \ + -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \ + -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \ + -score-options '--KneserNey' -extract-file $OUT_DIR/model/extract \ + -lexical-file 
$OUT_DIR/model/lex -phrase-translation-table \ + $OUT_DIR/model/phrase-table`; print "Train Language Models\n"; - `$SRILM_DIR/ngram-count -order 5 -interpolate -kndiscount -addsmooth1 0.0 -unk -text $OUT_DIR/lm/target -lm $OUT_DIR/lm/targetLM`; + `$SRILM_DIR/ngram-count \ + -order 5 -interpolate -kndiscount -addsmooth1 0.0 -unk \ + -text $OUT_DIR/lm/target -lm $OUT_DIR/lm/targetLM`; - `$MOSES_SRC_DIR/bin/build_binary $OUT_DIR/lm/targetLM $OUT_DIR/lm/targetLM.bin`; + `$MOSES_SRC_DIR/bin/build_binary \ + $OUT_DIR/lm/targetLM $OUT_DIR/lm/targetLM.bin`; print "Create Config File\n"; - `$MOSES_SRC_DIR/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 -score-options '--KneserNey' -phrase-translation-table $OUT_DIR/model/phrase-table -config $OUT_DIR/model/moses.ini -lm 0:5:$OUT_DIR/lm/targetLM.bin:8`; + `$MOSES_SRC_DIR/scripts/training/train-model.perl \ + -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \ + -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \ + -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \ + -score-options '--KneserNey' \ + -phrase-translation-table $OUT_DIR/model/phrase-table \ + -config $OUT_DIR/model/moses.ini -lm 0:5:$OUT_DIR/lm/targetLM.bin:8`; } diff --git a/scripts/ems/example/data/weight.ini b/scripts/ems/example/data/weight.ini index 4e941b662..e42fbb529 100644 --- a/scripts/ems/example/data/weight.ini +++ b/scripts/ems/example/data/weight.ini @@ -3,12 +3,12 @@ ######################### [weight] -Distortion0= 0.3 -UnknownWordPenalty0= 1 -WordPenalty0= -1 +Distortion0= 0.3 +UnknownWordPenalty0= 1 +WordPenalty0= -1 TranslationModel0= 0.2 0.2 0.2 0.2 PhrasePenalty0= 0.2 -LexicalReordering0= 0.3 0.3 0.3 0.3 0.3 0.3 -LM0= 0.5 +LexicalReordering0= 0.3 0.3 0.3 0.3 0.3 0.3 +LM0= 0.5 diff --git a/scripts/ems/support/berkeley-process.sh b/scripts/ems/support/berkeley-process.sh index 347ebba3c..4b23f0c16 100755 --- a/scripts/ems/support/berkeley-process.sh +++ b/scripts/ems/support/berkeley-process.sh @@ -28,7 +28,15 @@ shift shift shift -JAVA_CMD="/usr/local/share/java/bin/java $JAVA_OPTS -jar $JAR -Data.trainSources $INFILE.list -Main.loadParamsDir $PARAMDIR -exec.execDir $OUTNAME -Main.loadLexicalModelOnly false -Data.englishSuffix $SLANG -Data.foreignSuffix $TLANG -exec.create true -Main.saveParams false -Main.alignTraining true -Main.forwardModels HMM -Main.reverseModels HMM -Main.mode JOINT -Main.iters 0 -Data.testSources -EMWordAligner.posteriorDecodingThreshold $POSTERIOR $@" +JAVA_CMD="/usr/local/share/java/bin/java \ + $JAVA_OPTS -jar $JAR -Data.trainSources $INFILE.list \ + -Main.loadParamsDir $PARAMDIR -exec.execDir $OUTNAME \ + -Main.loadLexicalModelOnly false -Data.englishSuffix $SLANG \ + -Data.foreignSuffix $TLANG -exec.create true -Main.saveParams false \ + -Main.alignTraining true -Main.forwardModels HMM \ + -Main.reverseModels HMM -Main.mode JOINT -Main.iters 0 \ + -Data.testSources -EMWordAligner.posteriorDecodingThreshold $POSTERIOR \ + $@" echo "Running $JAVA_CMD" $JAVA_CMD From d3fb4a8002702685d322fcbcb9fc2e5797b4aeb8 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Tue, 2 Jun 2015 10:16:42 +0100 Subject: [PATCH 036/108] Ongoing moses/phrase-extract refactoring --- phrase-extract/SyntaxNode.h | 4 +++ phrase-extract/XmlTree.cpp | 34 +++++++++++++++++++ .../extract-ghkm/AlignmentGraph.cpp | 8 ++++- 3 files changed, 45 insertions(+), 1 deletion(-) diff --git a/phrase-extract/SyntaxNode.h 
b/phrase-extract/SyntaxNode.h index 46e0f456f..5f57e1790 100644 --- a/phrase-extract/SyntaxNode.h +++ b/phrase-extract/SyntaxNode.h @@ -36,6 +36,10 @@ protected: SyntaxNode* m_parent; float m_pcfgScore; public: + typedef std::map AttributeMap; + + AttributeMap attributes; + SyntaxNode( int startPos, int endPos, std::string label ) :m_start(startPos) ,m_end(endPos) diff --git a/phrase-extract/XmlTree.cpp b/phrase-extract/XmlTree.cpp index 0f068fca7..d3c5da900 100644 --- a/phrase-extract/XmlTree.cpp +++ b/phrase-extract/XmlTree.cpp @@ -80,6 +80,39 @@ string ParseXmlTagAttribute(const string& tag,const string& attributeName) return tag.substr(contentsStart,contentsEnd-contentsStart); } +// TODO Special handling of "label" attribute +// s should be a sequence of name=attribute pairs separated by whitespace. +// e.g. "label=\"S\" pcfg=\"-1.452\" foo=\"blah\\\"blah\"" +void ParseXmlTagAttributes(const std::string &s, + std::map &attributes) +{ + std::size_t begin = 0; + while (true) { + std::size_t pos = s.find('=', begin); + if (pos == std::string::npos) { + return; + } + std::string name = Trim(s.substr(begin, pos-begin)); + begin = s.find('"', pos+1); + if (begin == std::string::npos) { + throw XmlException("invalid tag content"); + } + pos = s.find('"', begin+1); + if (pos == std::string::npos) { + throw XmlException("invalid tag content"); + } + while (s[pos-1] == '\\') { + pos = s.find('"', pos+1); + if (pos == std::string::npos) { + throw XmlException("invalid tag content"); + } + } + // TODO unescape \" + attributes[name] = s.substr(begin+1, pos-begin-1); + begin = pos+1; + } +} + /** * Remove "<" and ">" from XML tag * @@ -377,6 +410,7 @@ bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection, } SyntaxNode *node = nodeCollection.AddNode( startPos, endPos-1, label ); node->SetPcfgScore(pcfgScore); + ParseXmlTagAttributes(tagContent, node->attributes); } } } diff --git a/phrase-extract/extract-ghkm/AlignmentGraph.cpp b/phrase-extract/extract-ghkm/AlignmentGraph.cpp index 3fa65656c..1a3c23de5 100644 --- a/phrase-extract/extract-ghkm/AlignmentGraph.cpp +++ b/phrase-extract/extract-ghkm/AlignmentGraph.cpp @@ -216,7 +216,13 @@ Node *AlignmentGraph::CopyParseTree(const SyntaxTree *root) std::auto_ptr n(new Node(root->value().GetLabel(), nodeType)); if (nodeType == TREE) { - n->SetPcfgScore(root->value().GetPcfgScore()); + float score = 0.0f; + SyntaxNode::AttributeMap::const_iterator p = + root->value().attributes.find("pcfg"); + if (p != root->value().attributes.end()) { + score = std::atof(p->second.c_str()); + } + n->SetPcfgScore(score); } const std::vector &children = root->children(); From b3e577be769ecc80257274b3af9f5f2a2490020b Mon Sep 17 00:00:00 2001 From: Jeroen Vermeulen Date: Tue, 2 Jun 2015 17:29:32 +0700 Subject: [PATCH 037/108] Fixing lint. Only 600 or so lines of errors left! 
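The running error count presumably comes from a Python style-checker pass over the scripts tree; a hedged example of such a check, assuming the flake8 tool (not part of this repository) is installed:

    # count remaining PEP 8 violations in the GUI and training scripts
    flake8 --count mingw/MosesGUI scripts/training
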
--- scripts/recaser/train-truecaser.perl | 9 +- .../bilingual-lm/averageNullEmbedding.py | 5 +- .../training/convert-moses-ini-v2-to-v1.py | 390 +++++++++--------- scripts/training/train-neurallm.py | 74 ++-- .../wrappers/mosesxml2berkeleyparsed.perl | 12 +- scripts/training/wrappers/parse-en-senna.perl | 4 +- .../training/wrappers/parse-en-stanford.py | 52 ++- scripts/training/wrappers/senna2brackets.py | 6 + 8 files changed, 301 insertions(+), 251 deletions(-) diff --git a/scripts/recaser/train-truecaser.perl b/scripts/recaser/train-truecaser.perl index 7f8909082..4f600a640 100755 --- a/scripts/recaser/train-truecaser.perl +++ b/scripts/recaser/train-truecaser.perl @@ -8,8 +8,13 @@ # # Options: # -# --possiblyUseFirstToken : boolean option; the default behaviour (when this option is not provided) is that the first token of a sentence is ignored, on the basis that the first word of a sentence is always capitalized; if this option is provided then: a) if a sentence-initial token is *not* capitalized, then it is counted, and b) if a capitalized sentence-initial token is the only token of the segment, then it is counted, but with only 10% of the weight of a normal token. -# +# --possiblyUseFirstToken : boolean option; the default behaviour (when this +# option is not provided) is that the first token of a sentence is ignored, on +# the basis that the first word of a sentence is always capitalized; if this +# option is provided then: a) if a sentence-initial token is *not* capitalized, +# then it is counted, and b) if a capitalized sentence-initial token is the +# only token of the segment, then it is counted, but with only 10% of the +# weight of a normal token. use warnings; use strict; diff --git a/scripts/training/bilingual-lm/averageNullEmbedding.py b/scripts/training/bilingual-lm/averageNullEmbedding.py index 54c9a1bc4..bf0d465f6 100755 --- a/scripts/training/bilingual-lm/averageNullEmbedding.py +++ b/scripts/training/bilingual-lm/averageNullEmbedding.py @@ -6,6 +6,7 @@ import sys import numpy import argparse + parser = argparse.ArgumentParser( description=( "Set input embedding of token to weighted average " @@ -28,6 +29,7 @@ def load_model(model_file): import nplm return nplm.NeuralLM.from_file(model_file) + def get_weights(path, length): counter = [0] * length for line in open(path): @@ -35,6 +37,7 @@ def get_weights(path, length): counter[last_context] += 1 return counter + def main(options): sys.path.append(options.nplm_python_path) @@ -49,7 +52,7 @@ def main(options): numpy.array(model.input_embeddings), weights=weights, axis=0) model.to_file(open(options.output_model, 'w')) -if __name__ == "__main__": +if __name__ == "__main__": options = parser.parse_args() main(options) diff --git a/scripts/training/convert-moses-ini-v2-to-v1.py b/scripts/training/convert-moses-ini-v2-to-v1.py index 3ef7d7c0d..4b7cfa5fa 100755 --- a/scripts/training/convert-moses-ini-v2-to-v1.py +++ b/scripts/training/convert-moses-ini-v2-to-v1.py @@ -6,10 +6,10 @@ from __future__ import ( - absolute_import, - print_function, - unicode_literals, - ) + absolute_import, + print_function, + unicode_literals, + ) __version__ = '1.0' __license__ = 'LGPL3' @@ -19,248 +19,248 @@ import errno from sys import stdout from copy import deepcopy from os.path import ( - dirname, - basename, - exists, - realpath, - ) -from os import ( - sep, - makedirs, - ) + dirname, + basename, + exists, + realpath, + ) +from os import makedirs + root_escape = '%(escape-prefix)s' class moses2_to_ini(object): + def __init__(self, inp, out, 
escape_prefix): + self.inp = inp + self.out = out + self.escape_prefix = escape_prefix + self._config = {} - def __init__(self, inp, out, escape_prefix): - self.inp = inp - self.out = out - self.escape_prefix = escape_prefix - self._config = {} + def parse(self): + key = '' + section = None + self._config = {} + counter = 0 + with open(self.inp, 'rb') as f: + contents = f.read().decode('utf8') - def parse(self): + lines = contents.splitlines() - content = '' - key = '' - section = None - self._config = {} - counter = 0 + # Known feature/functions without attributes. + attrless_ffs = [ + 'UnknownWordPenalty', + 'WordPenalty', + 'PhrasePenalty', + 'Distortion', + ] - with open(self.inp, 'rb' ) as f: - contents = f.read().decode('utf8') + # Retrieve all values except feature/functions with attributes. + for i, line in [(i, line.strip()) for i, line in enumerate(lines) + if line.strip() and not line.strip().startswith('#')]: - lines = contents.splitlines() + if line.startswith('[') and line.endswith(']'): - # retrieve all values except feature/functions with attributes - for i, line in [(i, line.strip()) for i, line in enumerate(lines) - if line.strip() and not line.strip().startswith('#')]: + section = line.strip('] [') - if line.startswith('[') and line.endswith(']'): + if section not in self._config.keys() + ['feature', 'weight']: + # New section not in config and not a reserved section. + counter = 0 + key = section + self._config[key] = {} - section = line.strip('] [') + elif section == 'feature' and line in attrless_ffs: + # Known feature/funcions without attributes. + key = '%s0' % line + if key not in self._config: + self._config[key] = {} + self._config[key]['feature'] = line - if section not in self._config.keys() + ['feature', 'weight']: - # new section not in config and not a reserved section - counter = 0 - key = section - self._config[key] = {} + elif section == 'feature': + # Skip feature/funcions with arguments. + continue - elif section == 'feature' and line in ['UnknownWordPenalty', - 'WordPenalty', 'PhrasePenalty', 'Distortion']: - # known feature/funcions without attributes - key = '%s0' % line - if key not in self._config: - self._config[key] = {} - self._config[key]['feature'] = line + elif section == 'weight': + # Add weight value to feature sections. + config_items = [ + (key.strip(), value.strip()) + for key, value in [line.split('=', 1)] + ] + for key, value in config_items: + if key not in self._config: + self._config[key] = {} + self._config[key]['weight'] = value - elif section == 'feature': - # skip feature/funcions with artuments - continue + else: + self._config[key][counter] = line + counter += 0 - elif section == 'weight': - # add weight value to feature sections - for key, value in [(key.strip(), value.strip()) - for key, value in [line.split('=', 1)]]: - if key not in self._config: - self._config[key] = {} - self._config[key]['weight'] = value + lines[i] = '' - else: - self._config[key][counter] = line - counter += 0 + # Second, match feature/functions attributes to [weight] section + # values. + stripped_lines = [line.strip() for line in lines] + nonempty_lines = [ + line + for line in stripped_lines + if line != '' and not line.startswith('#') + ] + for i, line in enumerate(nonempty_lines): + # Add "feature" to assist creating tmpdict for feature/functions. + line = 'feature=%s' % line + tmpdict = dict([key.split('=', 1) for key in line.split()]) - lines[i] = '' + # Feature/functions 'name' attribute must match an entry in + # [weight]. 
+ if tmpdict.get('name') not in self._config: + raise RuntimeError('malformed moses.ini v2 file') - # second, match feature/functions attributes to [weight] section values - for i, line in [(i, line.strip()) for i, line in enumerate(lines) - if line.strip() and not line.strip().startswith('#')]: + config_items = [ + (key.strip(), value.strip()) + for key, value in tmpdict.items() + if key.strip() != 'name' + ] + for key, value in config_items: + self._config[tmpdict['name']][key] = value - # add "feature" to assist creating tmpdict for feature/functions - line = 'feature=%s' % line - tmpdict = dict([key.split('=',1) for key in line.split()]) + return deepcopy(self._config) - # feature/functions 'name' attribute must match an entry in [weight] - if tmpdict.get('name') not in self._config: - raise RuntimeError('malformed moses.ini v2 file') + def render(self, config): + self._config = deepcopy(config) + _config = deepcopy(config) + lines = _tolines(_config, self.escape_prefix) + if self.out == '-': + stdout.write('\n'.join(lines)) + else: + contents = '\r\n'.join(lines) + makedir(dirname(self.out)) + with open(self.out, 'wb') as f: + f.write(contents.encode('utf8')) - for key, value in [(key.strip(), value.strip()) for key, value - in tmpdict.items() if key.strip() != 'name']: + def __str__(self): + return '\n'.join(_tolines(self._config, self.escape_prefix)) - self._config[tmpdict['name']][key] = value - - return deepcopy(self._config) - - - def render(self, config): - - self._config = deepcopy(config) - - _config = deepcopy(config) - - lines = _tolines(_config, self.escape_prefix) - - if self.out == '-': - - stdout.write('\n'.join(lines)) - - else: - - contents = '\r\n'.join(lines) - - makedir(dirname(self.out)) - - with open(self.out, 'wb') as f: - f.write(contents.encode('utf8')) - - - def __str__(self): - return '\n'.join(_tolines(self._config, self.escape_prefix)) - - - @property - def config(self): - return deepcopy(self._config) + @property + def config(self): + return deepcopy(self._config) def _tolines(config, escape_prefix): - lines = [] + section_names = sorted(config) + lines = [] - # group feature/functions first - for sectionname in [sectionname for sectionname in sorted(config) - if sectionname[-1] in '0123456789']: + # Group feature/functions first. 
+ group_ffs = [ + name + for name in section_names + if name[-1].isdigit() + ] + for sectionname in group_ffs: + section = config[sectionname] + lines.append('[%s]' % sectionname) + for option, value in section.items(): + if option == 'path' \ + and escape_prefix is not None \ + and value.startswith(escape_prefix): + value = value.replace(escape_prefix, root_escape, 1) + lines.append('%s=%s' % (option, value)) + lines.append('') - section = config[sectionname] + other_ffs = [ + name + for name in section_names + if not name[-1].isdigit() + ] + for sectionname in other_ffs: + section = config[sectionname] + lines.append('[%s]' % sectionname) + for option, value in section.items(): + lines.append('%s=%s' % (option, value)) + lines.append('') - lines.append('[%s]' % sectionname) - - for option, value in section.items(): - - if option == 'path' \ - and escape_prefix is not None \ - and value.startswith(escape_prefix): - - value = value.replace(escape_prefix, root_escape, 1) - - lines.append('%s=%s' % (option, value)) - - lines.append('') - - for sectionname in [sectionname for sectionname in sorted(config) - if sectionname[-1] not in '0123456789']: - - section = config[sectionname] - - lines.append('[%s]' % sectionname) - - for option, value in section.items(): - - lines.append('%s=%s' % (option, value)) - - lines.append('') - - return deepcopy(lines) + return deepcopy(lines) def makedir(path, mode=0o777): - try: - makedirs(path, mode) - except OSError as e: - if e.errno not in [errno.EEXIST, - errno.EPERM, errno.EACCES, errno.ENOENT]: - raise + try: + makedirs(path, mode) + except OSError as e: + accepted_errors = [ + errno.EEXIST, + errno.EPERM, + errno.EACCES, + errno.ENOENT, + ] + if e.errno not in accepted_errors: + raise def get_args(): - '''Parse command-line arguments + '''Parse command-line arguments - Uses the API compatibility between the legacy - argparse.OptionParser and its replacement argparse.ArgumentParser - for functional equivelancy and nearly identical help prompt. - ''' + Uses the API compatibility between the legacy + argparse.OptionParser and its replacement argparse.ArgumentParser + for functional equivelancy and nearly identical help prompt. + ''' - description = 'Convert Moses.ini v2 file to standard INI format' - usage = '%s [arguments]' % basename(__file__) + description = 'Convert Moses.ini v2 file to standard INI format' + usage = '%s [arguments]' % basename(__file__) - try: - from argparse import ArgumentParser - except ImportError: - from optparse import OptionParser - argparser = False - escape_help = ('Optional. Path of SMT model. If provided, ' - 'escapes \"escape-prefix\" with \"%(escape-prefix)s\"') - parser = OptionParser(usage=usage, description=description) - add_argument = parser.add_option - else: - argparser = True - escape_help = ('Optional. Path of SMT model. If provided, ' - 'escape \"escape-prefix\" with \"%%(escape-prefix)s\"') - parser = ArgumentParser(usage=usage, description=description) - add_argument = parser.add_argument + try: + from argparse import ArgumentParser + except ImportError: + from optparse import OptionParser + argparser = False + escape_help = ( + "Optional. Path of SMT model. If provided, " + "escapes \"escape-prefix\" with \"%(escape-prefix)s\"") + parser = OptionParser(usage=usage, description=description) + add_argument = parser.add_option + else: + argparser = True + escape_help = ( + "Optional. Path of SMT model. 
If provided, " + "escape \"escape-prefix\" with \"%%(escape-prefix)s\"") + parser = ArgumentParser(usage=usage, description=description) + add_argument = parser.add_argument - add_argument('-i','--inp', action='store', - help='moses.ini v2 file to convert (required)') + add_argument( + '-i', '--inp', action='store', + help="moses.ini v2 file to convert (required)") - add_argument('-o','--out', action='store', default='-', - help='standard INI file (default: "-" outputs to stdout)') + add_argument( + '-o', '--out', action='store', default='-', + help="standard INI file (default: '-' outputs to stdout)") - add_argument('-r','--escape-prefix', action='store', - help=escape_help) + add_argument('-r', '--escape-prefix', action='store', help=escape_help) - if argparser: + if argparser: + args = vars(parser.parse_args()) + else: + opts = parser.parse_args() + args = vars(opts[0]) - args = vars(parser.parse_args()) + if args['inp'] is None: + parser.error('argument -i/--inp required') - else: + args['inp'] = realpath(args['inp']) - opts = parser.parse_args() - args = vars(opts[0]) + if not exists(args['inp']): + parser.error( + "argument -i/--inp invalid.\n" + "reference: %s" % args['inp']) - if args['inp'] is None: - parser.error('argument -i/--inp required') + if args['out'] != '-': + args['out'] = realpath(args['out']) - args['inp'] = realpath(args['inp']) - - if not exists(args['inp']): - parser.error('argument -i/--inp invalid.\n' - 'reference: %s' % args['inp']) - - if args['out'] != '-': - args['out'] = realpath(args['out']) - - return args + return args if __name__ == '__main__': - - args = get_args() - - converter = moses2_to_ini(**args) - - config = converter.parse() - - converter.render(config) + args = get_args() + converter = moses2_to_ini(**args) + config = converter.parse() + converter.render(config) diff --git a/scripts/training/train-neurallm.py b/scripts/training/train-neurallm.py index ae77a42af..4f0e8bdaf 100755 --- a/scripts/training/train-neurallm.py +++ b/scripts/training/train-neurallm.py @@ -23,6 +23,7 @@ sys.path.append(os.path.join(sys.path[0], 'bilingual-lm')) import train_nplm import averageNullEmbedding + logging.basicConfig( format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG) @@ -30,7 +31,8 @@ parser = argparse.ArgumentParser() parser.add_argument( "--working-dir", dest="working_dir", metavar="PATH") parser.add_argument( - "--corpus", '-text', dest="corpus_stem", metavar="PATH", help="Input file.") + "--corpus", '-text', dest="corpus_stem", metavar="PATH", + help="Input file.") parser.add_argument( "--nplm-home", dest="nplm_home", metavar="PATH", required=True, help="Location of NPLM.") @@ -113,6 +115,7 @@ parser.set_defaults( words_file='vocab', vocab_size=500000) + def main(options): options.ngram_size = options.order @@ -129,14 +132,16 @@ def main(options): if options.mmap: train_file += '.mmap' - extraction_cmd = [os.path.join(options.nplm_home, 'src', 'prepareNeuralLM'), - '--train_text', options.corpus_stem, - '--ngramize', '1', - '--ngram_size', str(options.ngram_size), - '--vocab_size', str(options.vocab_size), - '--write_words_file', os.path.join(options.working_dir, options.words_file), - '--train_file', os.path.join(options.working_dir, numberized_file) - ] + extraction_cmd = [ + os.path.join(options.nplm_home, 'src', 'prepareNeuralLM'), + '--train_text', options.corpus_stem, + '--ngramize', '1', + '--ngram_size', str(options.ngram_size), + '--vocab_size', str(options.vocab_size), + '--write_words_file', 
os.path.join( + options.working_dir, options.words_file), + '--train_file', os.path.join(options.working_dir, numberized_file) + ] sys.stderr.write('extracting n-grams\n') sys.stderr.write('executing: ' + ', '.join(extraction_cmd) + '\n') @@ -149,12 +154,13 @@ def main(options): os.remove(os.path.join(options.working_dir, train_file)) except OSError: pass - mmap_cmd = [os.path.join(options.nplm_home, 'src', 'createMmap'), - '--input_file', - os.path.join(options.working_dir, numberized_file), - '--output_file', - os.path.join(options.working_dir, train_file) - ] + mmap_cmd = [ + os.path.join(options.nplm_home, 'src', 'createMmap'), + '--input_file', + os.path.join(options.working_dir, numberized_file), + '--output_file', + os.path.join(options.working_dir, train_file) + ] sys.stderr.write('creating memory-mapped file\n') sys.stderr.write('executing: ' + ', '.join(mmap_cmd) + '\n') ret = subprocess.call(mmap_cmd) @@ -163,14 +169,18 @@ def main(options): if options.validation_corpus: - extraction_cmd = [os.path.join(options.nplm_home, 'src', 'prepareNeuralLM'), - '--train_text', options.validation_corpus, - '--ngramize', '1', - '--ngram_size', str(options.ngram_size), - '--vocab_size', str(options.vocab_size), - '--words_file', os.path.join(options.working_dir, options.words_file), - '--train_file', os.path.join(options.working_dir, os.path.basename(options.validation_corpus) + '.numberized') - ] + extraction_cmd = [ + os.path.join(options.nplm_home, 'src', 'prepareNeuralLM'), + '--train_text', options.validation_corpus, + '--ngramize', '1', + '--ngram_size', str(options.ngram_size), + '--vocab_size', str(options.vocab_size), + '--words_file', os.path.join( + options.working_dir, options.words_file), + '--train_file', os.path.join( + options.working_dir, + os.path.basename(options.validation_corpus) + '.numberized') + ] sys.stderr.write('extracting n-grams (validation file)\n') sys.stderr.write('executing: ' + ', '.join(extraction_cmd) + '\n') @@ -190,11 +200,15 @@ def main(options): train_nplm.main(options) sys.stderr.write('averaging null words\n') - average_options = averageNullEmbedding.parser.parse_args( - ['-i', os.path.join(options.output_dir, options.output_model + '.model.nplm.' + str(options.epochs)), - '-o', os.path.join(options.output_dir, options.output_model + '.model.nplm'), - '-t', os.path.join(options.working_dir, numberized_file), - '-p', os.path.join(options.nplm_home, 'python')]) + average_options = averageNullEmbedding.parser.parse_args([ + '-i', os.path.join( + options.output_dir, + options.output_model + '.model.nplm.' 
+ str(options.epochs)), + '-o', os.path.join( + options.output_dir, options.output_model + '.model.nplm'), + '-t', os.path.join(options.working_dir, numberized_file), + '-p', os.path.join(options.nplm_home, 'python'), + ]) averageNullEmbedding.main(average_options) @@ -206,5 +220,7 @@ if __name__ == "__main__": options = parser.parse_known_args()[0] if parser.parse_known_args()[1]: - sys.stderr.write('Warning: unknown arguments: {0}\n'.format(parser.parse_known_args()[1])) + sys.stderr.write( + "Warning: unknown arguments: {0}\n".format( + parser.parse_known_args()[1])) main(options) diff --git a/scripts/training/wrappers/mosesxml2berkeleyparsed.perl b/scripts/training/wrappers/mosesxml2berkeleyparsed.perl index 02bc7b88e..9449e6bc4 100755 --- a/scripts/training/wrappers/mosesxml2berkeleyparsed.perl +++ b/scripts/training/wrappers/mosesxml2berkeleyparsed.perl @@ -6,8 +6,16 @@ use warnings; use strict; -#( (NP (NP (NN resumption)) (PP (IN of) (NP (DT the) (NN session)))) ) -#( (S (@S (@S (@S (S (NP (PRP I)) (VP (VB declare) (VP (@VP (VBD resumed) (NP (@NP (NP (DT the) (NN session)) (PP (IN of) (NP (@NP (DT the) (NNP European)) (NNP Parliament)))) (VP (VBN adjourned) (PP (IN on) (NP (NNP Friday) (CD 17)))))) (NP (NNP December) (CD 1999))))) (, ,)) (CC and)) (S (NP (PRP I)) (VP (MD would) (VP (VB like) (S (ADVP (RB once) (RB again)) (VP (TO to) (VP (@VP (VB wish) (NP (PRP you))) (NP (NP (@NP (@NP (DT a) (JJ happy)) (JJ new)) (NN year)) (PP (IN in) (NP (@NP (DT the) (NN hope)) (SBAR (IN that) (S (NP (PRP you)) (VP (VBD enjoyed) (NP (@NP (@NP (DT a) (JJ pleasant)) (JJ festive)) (NN period))))))))))))))) (. .)) ) +# ( (NP (NP (NN resumption)) (PP (IN of) (NP (DT the) (NN session)))) ) +# ( (S (@S (@S (@S (S (NP (PRP I)) (VP (VB declare) (VP (@VP (VBD resumed) +# (NP (@NP (NP (DT the) (NN session)) (PP (IN of) (NP (@NP (DT the) +# (NNP European)) (NNP Parliament)))) (VP (VBN adjourned) (PP (IN on) (NP +#(NNP Friday) (CD 17)))))) (NP (NNP December) (CD 1999))))) (, ,)) (CC and)) +# (S (NP (PRP I)) (VP (MD would) (VP (VB like) (S (ADVP (RB once) (RB again)) +# (VP (TO to) (VP (@VP (VB wish) (NP (PRP you))) (NP (NP (@NP (@NP (DT a) +# (JJ happy)) (JJ new)) (NN year)) (PP (IN in) (NP (@NP (DT the) (NN hope)) +#(SBAR (IN that) (S (NP (PRP you)) (VP (VBD enjoyed) (NP (@NP (@NP (DT a) +# (JJ pleasant)) (JJ festive)) (NN period))))))))))))))) (. .)) ) while() { if (/^$/) { diff --git a/scripts/training/wrappers/parse-en-senna.perl b/scripts/training/wrappers/parse-en-senna.perl index 2df46284b..9297b127f 100755 --- a/scripts/training/wrappers/parse-en-senna.perl +++ b/scripts/training/wrappers/parse-en-senna.perl @@ -1,4 +1,4 @@ -#!/usr/bin/env perl +#!/usr/bin/env perl # # This file is part of moses. Its use is licensed under the GNU Lesser General # Public License version 2.1 or, at your option, any later version. 
@@ -66,7 +66,7 @@ while() { my $num_bytes; { use bytes; - $num_bytes = length($_); + $num_bytes = length($_); } if ($num_bytes > 1023) { print TMP_PROCESSED "SENTENCE_TOO_LONG\n"; diff --git a/scripts/training/wrappers/parse-en-stanford.py b/scripts/training/wrappers/parse-en-stanford.py index 06b027e55..f77a2d92e 100755 --- a/scripts/training/wrappers/parse-en-stanford.py +++ b/scripts/training/wrappers/parse-en-stanford.py @@ -19,7 +19,6 @@ import sys import codecs import argparse -from collections import defaultdict from subprocess import Popen, PIPE # hack for python2/3 compatibility @@ -54,17 +53,25 @@ def create_parser(): return parser + def process_stanford(infile, javacmd, stanfordpath): - stanford = Popen([javacmd, - '-cp', os.path.join(stanfordpath, 'stanford-corenlp-3.5.0.jar') + ':' + os.path.join(stanfordpath, 'stanford-corenlp-3.5.0-models.jar'), - 'edu.stanford.nlp.pipeline.StanfordCoreNLP', - '-annotators', 'tokenize, ssplit, pos, depparse, lemma', - '-ssplit.eolonly', 'true', - '-tokenize.whitespace', 'true', - '-numThreads', '8', - '-textFile', '-', - 'outFile', '-'], stdin=infile, stdout = PIPE, stderr = open('/dev/null', 'w')) + corenlp_jar = os.path.join(stanfordpath, 'stanford-corenlp-3.5.0.jar') + corenlp_models_jar = os.path.join( + stanfordpath, 'stanford-corenlp-3.5.0-models.jar') + stanford = Popen( + [ + javacmd, + '-cp', "%s:%s" % (corenlp_jar, corenlp_models_jar), + 'edu.stanford.nlp.pipeline.StanfordCoreNLP', + '-annotators', 'tokenize, ssplit, pos, depparse, lemma', + '-ssplit.eolonly', 'true', + '-tokenize.whitespace', 'true', + '-numThreads', '8', + '-textFile', '-', + 'outFile', '-', + ], + stdin=infile, stdout=PIPE, stderr=open('/dev/null', 'w')) return stanford.stdout @@ -87,13 +94,14 @@ def get_sentences(instream): head, dep = remainder.split() head_int = int(head.split('-')[-1][:-1]) dep_int = int(dep.split('-')[-1][:-1]) - sentence[dep_int-1]['head'] = head_int - sentence[dep_int-1]['label'] = rel + sentence[dep_int - 1]['head'] = head_int + sentence[dep_int - 1]['label'] = rel elif expect == 2: - linesplit = line.split('[',1)[1].rsplit(']',1)[0].split('] [') + linesplit = line.split('[', 1)[1].rsplit(']', 1)[0].split('] [') if len(linesplit) != len(sentence): - sys.stderr.write('Warning: mismatch in number of words in sentence\n') + sys.stderr.write( + "Warning: mismatch in number of words in sentence\n") sys.stderr.write(' '.join(w['word'] for w in sentence)) for i in range(len(sentence)): sentence[i]['pos'] = '-' @@ -102,22 +110,27 @@ def get_sentences(instream): sentence[i]['label'] = '-' expect = 0 continue - for i,w in enumerate(linesplit): + for i, w in enumerate(linesplit): sentence[i]['pos'] = w.split(' PartOfSpeech=')[-1].split()[0] sentence[i]['lemma'] = w.split(' Lemma=')[-1] expect = 3 elif expect == 1: for w in line.split(): - sentence.append({'word':w}) + sentence.append({'word': w}) expect = 2 if sentence: yield sentence + def write(sentence, outstream): for i, w in enumerate(sentence): - outstream.write('{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\n'.format(i+1, w['word'], w['lemma'], w['pos'], w['pos'], '-', w['head'], w['label'])) + outstream.write( + '{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\n'.format( + i + 1, w['word'], w['lemma'], w['pos'], w['pos'], '-', + w['head'], w['label'])) + if __name__ == '__main__': if sys.version_info < (3, 0): @@ -125,11 +138,10 @@ if __name__ == '__main__': sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) sys.stdin = codecs.getreader('UTF-8')(sys.stdin) - parser = create_parser() options = 
parser.parse_args() stanford = process_stanford(options.input, options.java, options.stanford) for sentence in get_sentences(codecs.getreader('UTF-8')(stanford)): - write(sentence, options.output) - options.output.write('\n') + write(sentence, options.output) + options.output.write('\n') diff --git a/scripts/training/wrappers/senna2brackets.py b/scripts/training/wrappers/senna2brackets.py index a81100277..5b8495c84 100755 --- a/scripts/training/wrappers/senna2brackets.py +++ b/scripts/training/wrappers/senna2brackets.py @@ -24,6 +24,7 @@ import optparse import os import sys + def main(): usage = "usage: %prog [options]" parser = optparse.OptionParser(usage=usage) @@ -71,6 +72,7 @@ def main(): word = "-RCB-" tree += frag.replace("*", "(%s %s)" % (pos, word)) + def balanced(s): num_left = 0 num_right = 0 @@ -81,10 +83,12 @@ def balanced(s): num_right += 1 return num_left == num_right + def beautify(tree): s = tree.replace("(", " (") return s.strip() + def berkelify(tree): if tree == "": return "(())" @@ -94,9 +98,11 @@ def berkelify(tree): old_root = tree[1:pos] return tree.replace(old_root, "TOP") + def warn(msg): prog_name = os.path.basename(sys.argv[0]) sys.stderr.write("%s: warning: %s\n" % (prog_name, msg)) + if __name__ == "__main__": main() From 85c23ed7dcbbd312d1c9ea7b64177c9e80e06088 Mon Sep 17 00:00:00 2001 From: Jeroen Vermeulen Date: Tue, 2 Jun 2015 18:05:12 +0700 Subject: [PATCH 038/108] Fix some JS lint. --- scripts/ems/web/base64.js | 33 +++--- scripts/ems/web/hierarchical-segmentation.js | 115 ++++++++++--------- 2 files changed, 77 insertions(+), 71 deletions(-) diff --git a/scripts/ems/web/base64.js b/scripts/ems/web/base64.js index 67fd9ad8d..a35940c5a 100644 --- a/scripts/ems/web/base64.js +++ b/scripts/ems/web/base64.js @@ -21,16 +21,19 @@ // constants var b64chars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'; - var b64tab = function(bin) { - var t = {}; - for (var i = 0, l = bin.length; i < l; i++) t[bin.charAt(i)] = i; + var b64tab = (function(bin) { + var t = {}, i, l; + for (i = 0, l = bin.length; i < l; i++) { + t[bin.charAt(i)] = i; + } return t; - }(b64chars); + })(b64chars); var fromCharCode = String.fromCharCode; // encoder stuff var cb_utob = function(c) { + var cc; if (c.length < 2) { - var cc = c.charCodeAt(0); + cc = c.charCodeAt(0); return cc < 0x80 ? c : cc < 0x800 ? (fromCharCode(0xc0 | (cc >>> 6)) + fromCharCode(0x80 | (cc & 0x3f))) @@ -38,7 +41,7 @@ + fromCharCode(0x80 | ((cc >>> 6) & 0x3f)) + fromCharCode(0x80 | ( cc & 0x3f))); } else { - var cc = 0x10000 + cc = 0x10000 + (c.charCodeAt(0) - 0xD800) * 0x400 + (c.charCodeAt(1) - 0xDC00); return (fromCharCode(0xf0 | ((cc >>> 18) & 0x07)) @@ -70,19 +73,21 @@ return b.replace(/[\s\S]{1,3}/g, cb_encode); }; var _encode = buffer ? function (u) { - return (u.constructor === buffer.constructor ? u : new buffer(u)) - .toString('base64') + return ( + u.constructor === buffer.constructor ? u : new buffer(u) + ).toString('base64'); } - : function (u) { return btoa(utob(u)) } + : function (u) { return btoa(utob(u)); } ; var encode = function(u, urisafe) { - return !urisafe - ? _encode(String(u)) - : _encode(String(u)).replace(/[+\/]/g, function(m0) { - return m0 == '+' ? '-' : '_'; + return ( + !urisafe ? + _encode(String(u)) : + _encode(String(u)).replace(/[+\/]/g, function(m0) { + return (m0 === '+') ? 
'-' : '_'; }).replace(/=/g, ''); }; - var encodeURI = function(u) { return encode(u, true) }; + var encodeURI = function(u) { return encode(u, true); }; // decoder stuff var re_btou = new RegExp([ '[\xC0-\xDF][\x80-\xBF]', diff --git a/scripts/ems/web/hierarchical-segmentation.js b/scripts/ems/web/hierarchical-segmentation.js index 7f0df85ff..a1e16eff7 100644 --- a/scripts/ems/web/hierarchical-segmentation.js +++ b/scripts/ems/web/hierarchical-segmentation.js @@ -10,6 +10,63 @@ var span_count_in = []; var span_count_out = []; var current_depth = -1; +function highlightSingleNode( sentence, id, color ) { + var i, j, item; + for(i=nodeIn[sentence][id].start;i<=nodeIn[sentence][id].end;i++) { + for(j=nodeIn[sentence][id].depth;j<=max_depth[sentence];j++) { + item = "in-" + sentence + "-" + i + "-" + j; + if ($(item) !== null) { + $(item).setStyle({ backgroundColor: color, borderColor: 'red' }); + } + } + } + //$("debug").innerHTML = "highlight: "+id+", of "+nodeOut[sentence].size()+"
"; + for(i=nodeOut[sentence][id].start;i<=nodeOut[sentence][id].end;i++) { + for(j=nodeOut[sentence][id].depth;j<=max_depth[sentence];j++) { + item = "out-" + sentence + "-" + i + "-" + j; + //$("debug").innerHTML += item; + if ($(item) !== null) { + $(item).setStyle({ backgroundColor: color, borderColor: 'red' }); + } + } + } +} + +function lowlightAllNodes( sentence ) { + var i, j, item; + for(i=0;i"; - for(i=nodeOut[sentence][id].start;i<=nodeOut[sentence][id].end;i++) { - for(j=nodeOut[sentence][id].depth;j<=max_depth[sentence];j++) { - var item = "out-" + sentence + "-" + i + "-" + j; - //$("debug").innerHTML += item; - if ($(item) != null) { - $(item).setStyle({ backgroundColor: color, borderColor: 'red' }); - } - } - } -} - -function lowlightAllNodes( sentence ) { - var i, j; - for(i=0;i Date: Tue, 2 Jun 2015 13:56:03 +0100 Subject: [PATCH 039/108] Ongoing moses/phrase-extract refactoring --- phrase-extract/SyntaxNodeCollection.cpp | 4 -- phrase-extract/SyntaxNodeCollection.h | 3 -- phrase-extract/extract-ghkm/ExtractGHKM.cpp | 51 ++++++------------- phrase-extract/extract-ghkm/ScfgRule.cpp | 20 ++++---- phrase-extract/extract-ghkm/ScfgRule.h | 4 +- phrase-extract/extract-ghkm/XmlTreeParser.cpp | 39 ++------------ phrase-extract/extract-ghkm/XmlTreeParser.h | 12 +++-- 7 files changed, 38 insertions(+), 95 deletions(-) diff --git a/phrase-extract/SyntaxNodeCollection.cpp b/phrase-extract/SyntaxNodeCollection.cpp index f67bee587..60a2f6c2f 100644 --- a/phrase-extract/SyntaxNodeCollection.cpp +++ b/phrase-extract/SyntaxNodeCollection.cpp @@ -23,8 +23,6 @@ #include #include -#include - namespace MosesTraining { @@ -154,7 +152,6 @@ void SyntaxNodeCollection::ConnectNodes() } } -//boost::shared_ptr SyntaxNodeCollection::ExtractTree() std::auto_ptr SyntaxNodeCollection::ExtractTree() { std::map nodeToTree; @@ -209,7 +206,6 @@ std::auto_ptr SyntaxNodeCollection::ExtractTree() } } - //return boost::shared_ptr(root); return std::auto_ptr(root); } diff --git a/phrase-extract/SyntaxNodeCollection.h b/phrase-extract/SyntaxNodeCollection.h index c54400ca1..604b8d629 100644 --- a/phrase-extract/SyntaxNodeCollection.h +++ b/phrase-extract/SyntaxNodeCollection.h @@ -24,8 +24,6 @@ #include #include -#include - #include "SyntaxNode.h" #include "SyntaxTree.h" @@ -75,7 +73,6 @@ public: void Clear(); std::auto_ptr ExtractTree(); - //boost::shared_ptr ExtractTree(); }; } // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp index 0c7dadd4d..43873e804 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp +++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp @@ -139,6 +139,7 @@ int ExtractGHKM::Main(int argc, char *argv[]) std::string alignmentLine; Alignment alignment; XmlTreeParser targetXmlTreeParser(targetLabelSet, targetTopLabelSet); + XmlTreeParser sourceXmlTreeParser(sourceLabelSet, sourceTopLabelSet); ScfgRuleWriter scfgWriter(fwdExtractStream, invExtractStream, options); StsgRuleWriter stsgWriter(fwdExtractStream, invExtractStream, options); size_t lineNum = options.sentenceOffset; @@ -175,39 +176,14 @@ int ExtractGHKM::Main(int argc, char *argv[]) Error(oss.str()); } - - // Parse source tree and construct a SyntaxTree object. 
- SyntaxNodeCollection sourceSyntaxTree; - SyntaxNode *sourceSyntaxTreeRoot=NULL; - - if (options.sourceLabels) { - try { - if (!ProcessAndStripXMLTags(sourceLine, sourceSyntaxTree, sourceLabelSet, sourceTopLabelSet, false)) { - throw Exception(""); - } - sourceSyntaxTree.ConnectNodes(); - sourceSyntaxTreeRoot = sourceSyntaxTree.GetTop(); - assert(sourceSyntaxTreeRoot); - } catch (const Exception &e) { - std::ostringstream oss; - oss << "Failed to parse source XML tree at line " << lineNum; - if (!e.GetMsg().empty()) { - oss << ": " << e.GetMsg(); - } - Error(oss.str()); - } - } - - // Read source tokens. - std::vector sourceTokens(ReadTokens(sourceLine)); - - // Construct a source SyntaxTree object from the SyntaxNodeCollection - // object. + // Read source tokens (and parse tree if using source labels). + std::vector sourceTokens; std::auto_ptr sourceParseTree; - - if (options.sourceLabels) { + if (!options.sourceLabels) { + sourceTokens = ReadTokens(sourceLine); + } else { try { - sourceParseTree = XmlTreeParser::ConvertTree(*sourceSyntaxTreeRoot, sourceTokens); + sourceParseTree = sourceXmlTreeParser.Parse(sourceLine); assert(sourceParseTree.get()); } catch (const Exception &e) { std::ostringstream oss; @@ -217,9 +193,9 @@ int ExtractGHKM::Main(int argc, char *argv[]) } Error(oss.str()); } + sourceTokens = sourceXmlTreeParser.GetWords(); } - // Read word alignments. try { ReadAlignment(alignmentLine, alignment); @@ -239,12 +215,14 @@ int ExtractGHKM::Main(int argc, char *argv[]) // Record word counts. if (!options.targetUnknownWordFile.empty()) { - CollectWordLabelCounts(*targetParseTree, options, targetWordCount, targetWordLabel); + CollectWordLabelCounts(*targetParseTree, options, targetWordCount, + targetWordLabel); } // Record word counts: source side. if (options.sourceLabels && !options.sourceUnknownWordFile.empty()) { - CollectWordLabelCounts(*sourceParseTree, options, sourceWordCount, sourceWordLabel); + CollectWordLabelCounts(*sourceParseTree, options, sourceWordCount, + sourceWordLabel); } // Form an alignment graph from the target tree, source words, and @@ -260,7 +238,8 @@ int ExtractGHKM::Main(int argc, char *argv[]) } // Initialize phrase orientation scoring object - PhraseOrientation phraseOrientation( sourceTokens.size(), targetXmlTreeParser.GetWords().size(), alignment); + PhraseOrientation phraseOrientation(sourceTokens.size(), + targetXmlTreeParser.GetWords().size(), alignment); // Write the rules, subject to scope pruning. const std::vector &targetNodes = graph.GetTargetNodes(); @@ -292,7 +271,7 @@ int ExtractGHKM::Main(int argc, char *argv[]) // SCFG output. 
ScfgRule *r = 0; if (options.sourceLabels) { - r = new ScfgRule(**q, &sourceSyntaxTree); + r = new ScfgRule(**q, &sourceXmlTreeParser.GetNodeCollection()); } else { r = new ScfgRule(**q); } diff --git a/phrase-extract/extract-ghkm/ScfgRule.cpp b/phrase-extract/extract-ghkm/ScfgRule.cpp index fc960b598..a6fc19dd9 100644 --- a/phrase-extract/extract-ghkm/ScfgRule.cpp +++ b/phrase-extract/extract-ghkm/ScfgRule.cpp @@ -32,12 +32,12 @@ namespace GHKM { ScfgRule::ScfgRule(const Subgraph &fragment, - const SyntaxNodeCollection *sourceSyntaxTree) + const SyntaxNodeCollection *sourceNodeCollection) : m_graphFragment(fragment) , m_sourceLHS("X", NonTerminal) , m_targetLHS(fragment.GetRoot()->GetLabel(), NonTerminal) , m_pcfgScore(fragment.GetPcfgScore()) - , m_hasSourceLabels(sourceSyntaxTree) + , m_hasSourceLabels(sourceNodeCollection) { // Source RHS @@ -82,9 +82,9 @@ ScfgRule::ScfgRule(const Subgraph &fragment, } } } - if (sourceSyntaxTree) { + if (sourceNodeCollection) { // Source syntax label - PushSourceLabel(sourceSyntaxTree,&sinkNode,"XRHS"); + PushSourceLabel(sourceNodeCollection,&sinkNode,"XRHS"); } } @@ -125,23 +125,23 @@ ScfgRule::ScfgRule(const Subgraph &fragment, } } - if (sourceSyntaxTree) { - // Source syntax label for root node (if sourceSyntaxTree available) - PushSourceLabel(sourceSyntaxTree,fragment.GetRoot(),"XLHS"); + if (sourceNodeCollection) { + // Source syntax label for root node (if sourceNodeCollection available) + PushSourceLabel(sourceNodeCollection,fragment.GetRoot(),"XLHS"); // All non-terminal spans (including the LHS) should have obtained a label // (a source-side syntactic constituent label if the span matches, "XLHS" otherwise) // assert(m_sourceLabels.size() == m_numberOfNonTerminals+1); } } -void ScfgRule::PushSourceLabel(const SyntaxNodeCollection *sourceSyntaxTree, +void ScfgRule::PushSourceLabel(const SyntaxNodeCollection *sourceNodeCollection, const Node *node, const std::string &nonMatchingLabel) { ContiguousSpan span = Closure(node->GetSpan()); - if (sourceSyntaxTree->HasNode(span.first,span.second)) { // does a source constituent match the span? + if (sourceNodeCollection->HasNode(span.first,span.second)) { // does a source constituent match the span? 
std::vector sourceLabels = - sourceSyntaxTree->GetNodes(span.first,span.second); + sourceNodeCollection->GetNodes(span.first,span.second); if (!sourceLabels.empty()) { // store the topmost matching label from the source syntax tree m_sourceLabels.push_back(sourceLabels.back()->GetLabel()); diff --git a/phrase-extract/extract-ghkm/ScfgRule.h b/phrase-extract/extract-ghkm/ScfgRule.h index c8cdbb143..439c19fd7 100644 --- a/phrase-extract/extract-ghkm/ScfgRule.h +++ b/phrase-extract/extract-ghkm/ScfgRule.h @@ -41,7 +41,7 @@ class ScfgRule : public Rule { public: ScfgRule(const Subgraph &fragment, - const SyntaxNodeCollection *sourceSyntaxTree = 0); + const SyntaxNodeCollection *sourceNodeCollection = 0); const Subgraph &GetGraphFragment() const { return m_graphFragment; @@ -78,7 +78,7 @@ public: } private: - void PushSourceLabel(const SyntaxNodeCollection *sourceSyntaxTree, + void PushSourceLabel(const SyntaxNodeCollection *sourceNodeCollection, const Node *node, const std::string &nonMatchingLabel); const Subgraph& m_graphFragment; diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.cpp b/phrase-extract/extract-ghkm/XmlTreeParser.cpp index 83dfbd42f..17513fdd4 100644 --- a/phrase-extract/extract-ghkm/XmlTreeParser.cpp +++ b/phrase-extract/extract-ghkm/XmlTreeParser.cpp @@ -44,52 +44,21 @@ XmlTreeParser::XmlTreeParser(std::set &labelSet, std::auto_ptr XmlTreeParser::Parse(const std::string &line) { m_line = line; - m_tree.Clear(); + m_nodeCollection.Clear(); try { - if (!ProcessAndStripXMLTags(m_line, m_tree, m_labelSet, m_topLabelSet, - false)) { + if (!ProcessAndStripXMLTags(m_line, m_nodeCollection, m_labelSet, + m_topLabelSet, false)) { throw Exception(""); } } catch (const XmlException &e) { throw Exception(e.getMsg()); } - //boost::shared_ptr root = m_tree.ExtractTree(); - std::auto_ptr root = m_tree.ExtractTree(); + std::auto_ptr root = m_nodeCollection.ExtractTree(); m_words = util::tokenize(m_line); AttachWords(m_words, *root); return root; } -// Converts a SyntaxNode tree to a MosesTraining::GHKM::SyntaxTree. 
-std::auto_ptr XmlTreeParser::ConvertTree( - const SyntaxNode &tree, - const std::vector &words) -{ - std::auto_ptr root(new SyntaxTree(tree)); - const std::vector &children = tree.GetChildren(); - if (children.empty()) { - if (tree.GetStart() != tree.GetEnd()) { - std::ostringstream msg; - msg << "leaf node covers multiple words (" << tree.GetStart() - << "-" << tree.GetEnd() << "): this is currently unsupported"; - throw Exception(msg.str()); - } - SyntaxNode value(tree.GetStart(), tree.GetStart(), words[tree.GetStart()]); - std::auto_ptr leaf(new SyntaxTree(value)); - leaf->parent() = root.get(); - root->children().push_back(leaf.release()); - } else { - for (std::vector::const_iterator p = children.begin(); - p != children.end(); ++p) { - assert(*p); - std::auto_ptr child = ConvertTree(**p, words); - child->parent() = root.get(); - root->children().push_back(child.release()); - } - } - return root; -} - void XmlTreeParser::AttachWords(const std::vector &words, SyntaxTree &root) { diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.h b/phrase-extract/extract-ghkm/XmlTreeParser.h index 2fcdd9b56..339a2bd13 100644 --- a/phrase-extract/extract-ghkm/XmlTreeParser.h +++ b/phrase-extract/extract-ghkm/XmlTreeParser.h @@ -44,20 +44,22 @@ class XmlTreeParser { public: XmlTreeParser(std::set &, std::map &); - std::auto_ptr Parse(const std::string &); - static std::auto_ptr ConvertTree(const SyntaxNode &, - const std::vector &); + std::auto_ptr Parse(const std::string &); const std::vector& GetWords() { return m_words; - }; + } + + const SyntaxNodeCollection &GetNodeCollection() const { + return m_nodeCollection; + } private: std::set &m_labelSet; std::map &m_topLabelSet; std::string m_line; - SyntaxNodeCollection m_tree; + SyntaxNodeCollection m_nodeCollection; std::vector m_words; void AttachWords(const std::vector &, SyntaxTree &); From 5ece895ab4d7fafe32d76cb2dd7bd7995cd06c7c Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Tue, 2 Jun 2015 14:00:56 +0100 Subject: [PATCH 040/108] Ongoing moses/phrase-extract refactoring --- phrase-extract/SyntaxNodeCollection.h | 1 + 1 file changed, 1 insertion(+) diff --git a/phrase-extract/SyntaxNodeCollection.h b/phrase-extract/SyntaxNodeCollection.h index 604b8d629..a0d19841c 100644 --- a/phrase-extract/SyntaxNodeCollection.h +++ b/phrase-extract/SyntaxNodeCollection.h @@ -20,6 +20,7 @@ #pragma once #include +#include #include #include #include From 2f04d4a56ebab78a97b9fa9ecf4b50ef845a1bdb Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Tue, 2 Jun 2015 15:23:41 +0100 Subject: [PATCH 041/108] Ongoing moses/phrase-extract refactoring --- phrase-extract/extract-ghkm/ExtractGHKM.cpp | 7 +- phrase-extract/extract-ghkm/XmlTreeParser.cpp | 90 ------------------- phrase-extract/extract-ghkm/XmlTreeParser.h | 71 --------------- .../filter-rule-table/FilterRuleTable.cpp | 11 ++- .../filter-rule-table/FilterRuleTable.h | 6 +- .../filter-rule-table/ForestTsgFilter.h | 1 - .../filter-rule-table/TreeCfgFilter.cpp | 2 +- .../filter-rule-table/TreeCfgFilter.h | 5 +- .../filter-rule-table/TreeTsgFilter.cpp | 18 ++-- .../filter-rule-table/TreeTsgFilter.h | 9 +- .../syntax-common/xml_tree_parser.cc | 71 ++++++++------- .../syntax-common/xml_tree_parser.h | 28 ++++-- 12 files changed, 91 insertions(+), 228 deletions(-) delete mode 100644 phrase-extract/extract-ghkm/XmlTreeParser.cpp delete mode 100644 phrase-extract/extract-ghkm/XmlTreeParser.h diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp index 
43873e804..2293371ac 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp +++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp @@ -30,6 +30,8 @@ #include +#include "syntax-common/xml_tree_parser.h" + #include "InputFileStream.h" #include "OutputFileStream.h" #include "SyntaxNode.h" @@ -50,7 +52,6 @@ #include "Span.h" #include "StsgRule.h" #include "StsgRuleWriter.h" -#include "XmlTreeParser.h" namespace MosesTraining { @@ -138,8 +139,8 @@ int ExtractGHKM::Main(int argc, char *argv[]) std::string sourceLine; std::string alignmentLine; Alignment alignment; - XmlTreeParser targetXmlTreeParser(targetLabelSet, targetTopLabelSet); - XmlTreeParser sourceXmlTreeParser(sourceLabelSet, sourceTopLabelSet); + Syntax::XmlTreeParser targetXmlTreeParser(targetLabelSet, targetTopLabelSet); + Syntax::XmlTreeParser sourceXmlTreeParser(sourceLabelSet, sourceTopLabelSet); ScfgRuleWriter scfgWriter(fwdExtractStream, invExtractStream, options); StsgRuleWriter stsgWriter(fwdExtractStream, invExtractStream, options); size_t lineNum = options.sentenceOffset; diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.cpp b/phrase-extract/extract-ghkm/XmlTreeParser.cpp deleted file mode 100644 index 17513fdd4..000000000 --- a/phrase-extract/extract-ghkm/XmlTreeParser.cpp +++ /dev/null @@ -1,90 +0,0 @@ -/*********************************************************************** - Moses - statistical machine translation system - Copyright (C) 2006-2011 University of Edinburgh - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ - -#include "XmlTreeParser.h" - -#include -#include - -#include "util/tokenize.hh" - -#include "SyntaxTree.h" -#include "tables-core.h" -#include "XmlException.h" -#include "XmlTree.h" - -namespace MosesTraining -{ -namespace GHKM -{ - -XmlTreeParser::XmlTreeParser(std::set &labelSet, - std::map &topLabelSet) - : m_labelSet(labelSet) - , m_topLabelSet(topLabelSet) -{ -} - -std::auto_ptr XmlTreeParser::Parse(const std::string &line) -{ - m_line = line; - m_nodeCollection.Clear(); - try { - if (!ProcessAndStripXMLTags(m_line, m_nodeCollection, m_labelSet, - m_topLabelSet, false)) { - throw Exception(""); - } - } catch (const XmlException &e) { - throw Exception(e.getMsg()); - } - std::auto_ptr root = m_nodeCollection.ExtractTree(); - m_words = util::tokenize(m_line); - AttachWords(m_words, *root); - return root; -} - -void XmlTreeParser::AttachWords(const std::vector &words, - SyntaxTree &root) -{ - std::vector leaves; - leaves.reserve(words.size()); - for (SyntaxTree::LeafIterator p(root); p != SyntaxTree::LeafIterator(); ++p) { - leaves.push_back(&*p); - } - - std::vector::const_iterator q = words.begin(); - for (std::vector::iterator p = leaves.begin(); p != leaves.end(); - ++p) { - SyntaxTree *leaf = *p; - const int start = leaf->value().GetStart(); - const int end = leaf->value().GetEnd(); - if (start != end) { - std::ostringstream msg; - msg << "leaf node covers multiple words (" << start << "-" << end - << "): this is currently unsupported"; - throw Exception(msg.str()); - } - SyntaxTree *newLeaf = new SyntaxTree(SyntaxNode(start, end, *q++)); - leaf->children().push_back(newLeaf); - newLeaf->parent() = leaf; - } -} - -} // namespace GHKM -} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.h b/phrase-extract/extract-ghkm/XmlTreeParser.h deleted file mode 100644 index 339a2bd13..000000000 --- a/phrase-extract/extract-ghkm/XmlTreeParser.h +++ /dev/null @@ -1,71 +0,0 @@ -/*********************************************************************** - Moses - statistical machine translation system - Copyright (C) 2006-2011 University of Edinburgh - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ - -#pragma once -#ifndef EXTRACT_GHKM_XML_TREE_PARSER_H_ -#define EXTRACT_GHKM_XML_TREE_PARSER_H_ - -#include -#include -#include -#include -#include - -#include "SyntaxNode.h" -#include "SyntaxNodeCollection.h" -#include "SyntaxTree.h" - -#include "Exception.h" - -namespace MosesTraining -{ -namespace GHKM -{ - -// Parses a string in Moses' XML parse tree format and returns a SyntaxTree -// object. -class XmlTreeParser -{ -public: - XmlTreeParser(std::set &, std::map &); - - std::auto_ptr Parse(const std::string &); - - const std::vector& GetWords() { - return m_words; - } - - const SyntaxNodeCollection &GetNodeCollection() const { - return m_nodeCollection; - } - -private: - std::set &m_labelSet; - std::map &m_topLabelSet; - std::string m_line; - SyntaxNodeCollection m_nodeCollection; - std::vector m_words; - - void AttachWords(const std::vector &, SyntaxTree &); -}; - -} // namespace GHKM -} // namespace MosesTraining - -#endif diff --git a/phrase-extract/filter-rule-table/FilterRuleTable.cpp b/phrase-extract/filter-rule-table/FilterRuleTable.cpp index c42c13de6..0c6f132f8 100644 --- a/phrase-extract/filter-rule-table/FilterRuleTable.cpp +++ b/phrase-extract/filter-rule-table/FilterRuleTable.cpp @@ -82,7 +82,7 @@ int FilterRuleTable::Main(int argc, char *argv[]) StringCfgFilter filter(testStrings); filter.Filter(std::cin, std::cout); } else if (testSentenceFormat == kTree) { - std::vector > testTrees; + std::vector > testTrees; ReadTestSet(testStream, testTrees); if (sourceSideRuleFormat == kCfg) { // TODO Implement TreeCfgFilter @@ -124,9 +124,11 @@ void FilterRuleTable::ReadTestSet( } void FilterRuleTable::ReadTestSet( - std::istream &input, std::vector > &sentences) + std::istream &input, std::vector > &sentences) { - XmlTreeParser parser; + std::set labelSet; + std::map topLabelSet; + XmlTreeParser parser(labelSet, topLabelSet); int lineNum = 0; std::string line; while (std::getline(input, line)) { @@ -136,7 +138,8 @@ void FilterRuleTable::ReadTestSet( << std::endl; continue; } - sentences.push_back(boost::shared_ptr(parser.Parse(line))); + sentences.push_back( + boost::shared_ptr(parser.Parse(line).release())); } } diff --git a/phrase-extract/filter-rule-table/FilterRuleTable.h b/phrase-extract/filter-rule-table/FilterRuleTable.h index 3a9489428..3077e690d 100644 --- a/phrase-extract/filter-rule-table/FilterRuleTable.h +++ b/phrase-extract/filter-rule-table/FilterRuleTable.h @@ -5,7 +5,7 @@ #include -#include "syntax-common/string_tree.h" +#include "SyntaxTree.h" #include "StringForest.h" @@ -36,7 +36,7 @@ private: void Filter(const std::vector > &); // Filter rule table (on std::cin) for test set (parse tree version). 
- void Filter(const std::vector > &); + void Filter(const std::vector > &); void ProcessOptions(int, char *[], Options &) const; @@ -46,7 +46,7 @@ private: // Read test set (tree version) void ReadTestSet(std::istream &, - std::vector > &); + std::vector > &); // Read test set (forest version) void ReadTestSet(std::istream &, diff --git a/phrase-extract/filter-rule-table/ForestTsgFilter.h b/phrase-extract/filter-rule-table/ForestTsgFilter.h index ff48b2e22..c9fe41f57 100644 --- a/phrase-extract/filter-rule-table/ForestTsgFilter.h +++ b/phrase-extract/filter-rule-table/ForestTsgFilter.h @@ -10,7 +10,6 @@ #include #include "syntax-common/numbered_set.h" -#include "syntax-common/string_tree.h" #include "syntax-common/tree.h" #include "syntax-common/tree_fragment_tokenizer.h" diff --git a/phrase-extract/filter-rule-table/TreeCfgFilter.cpp b/phrase-extract/filter-rule-table/TreeCfgFilter.cpp index cb04dc94e..dc938ac19 100644 --- a/phrase-extract/filter-rule-table/TreeCfgFilter.cpp +++ b/phrase-extract/filter-rule-table/TreeCfgFilter.cpp @@ -12,7 +12,7 @@ namespace FilterRuleTable { TreeCfgFilter::TreeCfgFilter( - const std::vector > &sentences) + const std::vector > &sentences) { } diff --git a/phrase-extract/filter-rule-table/TreeCfgFilter.h b/phrase-extract/filter-rule-table/TreeCfgFilter.h index 7dd0fa072..3434ff200 100644 --- a/phrase-extract/filter-rule-table/TreeCfgFilter.h +++ b/phrase-extract/filter-rule-table/TreeCfgFilter.h @@ -8,8 +8,9 @@ #include #include +#include "SyntaxTree.h" + #include "syntax-common/numbered_set.h" -#include "syntax-common/string_tree.h" #include "syntax-common/tree.h" #include "syntax-common/tree_fragment_tokenizer.h" @@ -29,7 +30,7 @@ class TreeCfgFilter : public CfgFilter { public: // Initialize the filter for a given set of test sentences. - TreeCfgFilter(const std::vector > &); + TreeCfgFilter(const std::vector > &); void Filter(std::istream &in, std::ostream &out); }; diff --git a/phrase-extract/filter-rule-table/TreeTsgFilter.cpp b/phrase-extract/filter-rule-table/TreeTsgFilter.cpp index 32a59fd6c..17a8dcb22 100644 --- a/phrase-extract/filter-rule-table/TreeTsgFilter.cpp +++ b/phrase-extract/filter-rule-table/TreeTsgFilter.cpp @@ -8,13 +8,13 @@ namespace FilterRuleTable { TreeTsgFilter::TreeTsgFilter( - const std::vector > &sentences) + const std::vector > &sentences) { - // Convert each StringTree to an IdTree. + // Convert each SyntaxTree to an IdTree. 
m_sentences.reserve(sentences.size()); - for (std::vector >::const_iterator p = + for (std::vector >::const_iterator p = sentences.begin(); p != sentences.end(); ++p) { - m_sentences.push_back(boost::shared_ptr(StringTreeToIdTree(**p))); + m_sentences.push_back(boost::shared_ptr(SyntaxTreeToIdTree(**p))); } m_labelToTree.resize(m_testVocab.Size()); @@ -25,15 +25,15 @@ TreeTsgFilter::TreeTsgFilter( } } -TreeTsgFilter::IdTree *TreeTsgFilter::StringTreeToIdTree(const StringTree &s) +TreeTsgFilter::IdTree *TreeTsgFilter::SyntaxTreeToIdTree(const SyntaxTree &s) { - IdTree *t = new IdTree(m_testVocab.Insert(s.value())); - const std::vector &sChildren = s.children(); + IdTree *t = new IdTree(m_testVocab.Insert(s.value().GetLabel())); + const std::vector &sChildren = s.children(); std::vector &tChildren = t->children(); tChildren.reserve(sChildren.size()); - for (std::vector::const_iterator p = sChildren.begin(); + for (std::vector::const_iterator p = sChildren.begin(); p != sChildren.end(); ++p) { - IdTree *child = StringTreeToIdTree(**p); + IdTree *child = SyntaxTreeToIdTree(**p); child->parent() = t; tChildren.push_back(child); } diff --git a/phrase-extract/filter-rule-table/TreeTsgFilter.h b/phrase-extract/filter-rule-table/TreeTsgFilter.h index 17378b552..fa11350b6 100644 --- a/phrase-extract/filter-rule-table/TreeTsgFilter.h +++ b/phrase-extract/filter-rule-table/TreeTsgFilter.h @@ -8,8 +8,9 @@ #include #include +#include "SyntaxTree.h" + #include "syntax-common/numbered_set.h" -#include "syntax-common/string_tree.h" #include "syntax-common/tree.h" #include "syntax-common/tree_fragment_tokenizer.h" @@ -29,7 +30,7 @@ class TreeTsgFilter : public TsgFilter { public: // Initialize the filter for a given set of test sentences. - TreeTsgFilter(const std::vector > &); + TreeTsgFilter(const std::vector > &); private: // Add an entry to m_labelToTree for every subtree of the given tree. @@ -41,9 +42,9 @@ private: // Try to match a fragment against a specific subtree of a test tree. bool MatchFragment(const IdTree &, const IdTree &); - // Convert a StringTree to an IdTree (wrt m_testVocab). Inserts symbols into + // Convert a SyntaxTree to an IdTree (wrt m_testVocab). Inserts symbols into // m_testVocab. 
- IdTree *StringTreeToIdTree(const StringTree &); + IdTree *SyntaxTreeToIdTree(const SyntaxTree &); std::vector > m_sentences; std::vector > m_labelToTree; diff --git a/phrase-extract/syntax-common/xml_tree_parser.cc b/phrase-extract/syntax-common/xml_tree_parser.cc index 2f8a904fa..bf3c6d87e 100644 --- a/phrase-extract/syntax-common/xml_tree_parser.cc +++ b/phrase-extract/syntax-common/xml_tree_parser.cc @@ -1,17 +1,27 @@ #include "xml_tree_parser.h" -#include "tables-core.h" -#include "XmlException.h" -#include "XmlTree.h" -#include "util/tokenize.hh" - #include #include +#include "util/tokenize.hh" + +#include "SyntaxTree.h" +#include "tables-core.h" +#include "XmlException.h" +#include "XmlTree.h" + namespace MosesTraining { namespace Syntax { -StringTree *XmlTreeParser::Parse(const std::string &line) { +XmlTreeParser::XmlTreeParser(std::set &labelSet, + std::map &topLabelSet) + : label_set_(labelSet) + , top_label_set_(topLabelSet) +{ +} + +std::auto_ptr XmlTreeParser::Parse(const std::string &line) +{ line_ = line; node_collection_.Clear(); try { @@ -22,38 +32,37 @@ StringTree *XmlTreeParser::Parse(const std::string &line) { } catch (const XmlException &e) { throw Exception(e.getMsg()); } - node_collection_.ConnectNodes(); - SyntaxNode *root = node_collection_.GetTop(); - assert(root); + std::auto_ptr root = node_collection_.ExtractTree(); words_ = util::tokenize(line_); - return ConvertTree(*root, words_); + AttachWords(words_, *root); + return root; } -// Converts a SyntaxNode tree to a StringTree. -StringTree *XmlTreeParser::ConvertTree(const SyntaxNode &tree, - const std::vector &words) { - StringTree *root = new StringTree(tree.GetLabel()); - const std::vector &children = tree.GetChildren(); - if (children.empty()) { - if (tree.GetStart() != tree.GetEnd()) { +void XmlTreeParser::AttachWords(const std::vector &words, + SyntaxTree &root) +{ + std::vector leaves; + leaves.reserve(words.size()); + for (SyntaxTree::LeafIterator p(root); p != SyntaxTree::LeafIterator(); ++p) { + leaves.push_back(&*p); + } + + std::vector::const_iterator q = words.begin(); + for (std::vector::iterator p = leaves.begin(); p != leaves.end(); + ++p) { + SyntaxTree *leaf = *p; + const int start = leaf->value().GetStart(); + const int end = leaf->value().GetEnd(); + if (start != end) { std::ostringstream msg; - msg << "leaf node covers multiple words (" << tree.GetStart() - << "-" << tree.GetEnd() << "): this is currently unsupported"; + msg << "leaf node covers multiple words (" << start << "-" << end + << "): this is currently unsupported"; throw Exception(msg.str()); } - StringTree *leaf = new StringTree(words[tree.GetStart()]); - leaf->parent() = root; - root->children().push_back(leaf); - } else { - for (std::vector::const_iterator p = children.begin(); - p != children.end(); ++p) { - assert(*p); - StringTree *child = ConvertTree(**p, words); - child->parent() = root; - root->children().push_back(child); - } + SyntaxTree *newLeaf = new SyntaxTree(SyntaxNode(start, end, *q++)); + leaf->children().push_back(newLeaf); + newLeaf->parent() = leaf; } - return root; } } // namespace Syntax diff --git a/phrase-extract/syntax-common/xml_tree_parser.h b/phrase-extract/syntax-common/xml_tree_parser.h index c84ea25ec..e0b75c830 100644 --- a/phrase-extract/syntax-common/xml_tree_parser.h +++ b/phrase-extract/syntax-common/xml_tree_parser.h @@ -1,34 +1,44 @@ #pragma once #include +#include #include #include #include #include "SyntaxNode.h" #include "SyntaxNodeCollection.h" +#include "SyntaxTree.h" #include 
"exception.h" -#include "string_tree.h" namespace MosesTraining { namespace Syntax { -// Parses a string in Moses' XML parse tree format and returns a StringTree +// Parses a string in Moses' XML parse tree format and returns a SyntaxTree // object. This is a wrapper around the ProcessAndStripXMLTags function. class XmlTreeParser { public: - StringTree *Parse(const std::string &); + XmlTreeParser(std::set &, std::map &); + + std::auto_ptr Parse(const std::string &); + + const std::vector& GetWords() { + return words_; + } + + const SyntaxNodeCollection &GetNodeCollection() const { + return node_collection_; + } private: - static StringTree *ConvertTree(const MosesTraining::SyntaxNode &, - const std::vector &); - - std::set label_set_; - std::map top_label_set_; + std::set &label_set_; + std::map &top_label_set_; std::string line_; - MosesTraining::SyntaxNodeCollection node_collection_; + SyntaxNodeCollection node_collection_; std::vector words_; + + void AttachWords(const std::vector &, SyntaxTree &); }; } // namespace Syntax From efdb8566b17d19783aa65caf22b24e48a789fbb8 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Tue, 2 Jun 2015 21:00:32 +0400 Subject: [PATCH 042/108] delete ChangeSource(). Not used --- contrib/other-builds/all.workspace | 1 - contrib/other-builds/moses/moses.project | 8 +- moses-cmd/MainVW.cpp | 3 - moses/ExportInterface.cpp | 4 - moses/FF/Factory.cpp | 2 - moses/FF/FeatureFunction.cpp | 14 ---- moses/FF/FeatureFunction.h | 6 -- moses/FF/SkeletonChangeInput.cpp | 96 ------------------------ moses/FF/SkeletonChangeInput.h | 45 ----------- 9 files changed, 3 insertions(+), 176 deletions(-) delete mode 100644 moses/FF/SkeletonChangeInput.cpp delete mode 100644 moses/FF/SkeletonChangeInput.h diff --git a/contrib/other-builds/all.workspace b/contrib/other-builds/all.workspace index 3df758293..66dafe3d2 100644 --- a/contrib/other-builds/all.workspace +++ b/contrib/other-builds/all.workspace @@ -1,6 +1,5 @@ - diff --git a/contrib/other-builds/moses/moses.project b/contrib/other-builds/moses/moses.project index 2c2affd45..f902dd1f4 100644 --- a/contrib/other-builds/moses/moses.project +++ b/contrib/other-builds/moses/moses.project @@ -1,6 +1,9 @@ + + + - - - @@ -531,8 +531,6 @@ - - diff --git a/moses-cmd/MainVW.cpp b/moses-cmd/MainVW.cpp index ac54c1ed6..c8047c201 100644 --- a/moses-cmd/MainVW.cpp +++ b/moses-cmd/MainVW.cpp @@ -151,9 +151,6 @@ int main(int argc, char** argv) ResetUserTime(); } - InputType* foo = source.get(); - FeatureFunction::CallChangeSource(foo); - // set up task of training one sentence boost::shared_ptr task; task = TrainingTask::create(source, ioWrapper); diff --git a/moses/ExportInterface.cpp b/moses/ExportInterface.cpp index 0ceeceec1..c444e98c9 100644 --- a/moses/ExportInterface.cpp +++ b/moses/ExportInterface.cpp @@ -118,8 +118,6 @@ string SimpleTranslationInterface::translate(const string &inputString) ResetUserTime(); } - FeatureFunction::CallChangeSource(&*source); - // set up task of translating one sentence boost::shared_ptr task = TranslationTask::create(source, ioWrapper); @@ -223,8 +221,6 @@ batch_run() while ((source = ioWrapper->ReadInput()) != NULL) { IFVERBOSE(1) ResetUserTime(); - FeatureFunction::CallChangeSource(source.get()); - // set up task of translating one sentence boost::shared_ptr task = TranslationTask::create(source, ioWrapper); diff --git a/moses/FF/Factory.cpp b/moses/FF/Factory.cpp index c797381ff..167e02370 100644 --- a/moses/FF/Factory.cpp +++ b/moses/FF/Factory.cpp @@ -62,7 +62,6 @@ #include "moses/LM/SkeletonLM.h" 
#include "moses/FF/SkeletonTranslationOptionListFeature.h" #include "moses/LM/BilingualLM.h" -#include "SkeletonChangeInput.h" #include "moses/TranslationModel/SkeletonPT.h" #include "moses/Syntax/InputWeightFF.h" #include "moses/Syntax/RuleTableFF.h" @@ -268,7 +267,6 @@ FeatureRegistry::FeatureRegistry() MOSES_FNAME(SkeletonStatelessFF); MOSES_FNAME(SkeletonStatefulFF); MOSES_FNAME(SkeletonLM); - MOSES_FNAME(SkeletonChangeInput); MOSES_FNAME(SkeletonTranslationOptionListFeature); MOSES_FNAME(SkeletonPT); diff --git a/moses/FF/FeatureFunction.cpp b/moses/FF/FeatureFunction.cpp index 5eab202ae..08ad26db8 100644 --- a/moses/FF/FeatureFunction.cpp +++ b/moses/FF/FeatureFunction.cpp @@ -38,20 +38,6 @@ void FeatureFunction::Destroy() RemoveAllInColl(s_staticColl); } -// The original declaration as -// void FeatureFunction::CallChangeSource(InputType *&input) -// had me a bit perplexed. Would you really want to allow -// any feature function to replace the InputType behind the -// back of the others? And change what the vector is pointing to? - -void FeatureFunction::CallChangeSource(InputType * const&input) -{ - for (size_t i = 0; i < s_staticColl.size(); ++i) { - const FeatureFunction &ff = *s_staticColl[i]; - ff.ChangeSource(input); - } -} - void FeatureFunction::SetupAll(TranslationTask const& ttask) { BOOST_FOREACH(FeatureFunction* ff, s_staticColl) diff --git a/moses/FF/FeatureFunction.h b/moses/FF/FeatureFunction.h index d3d6ab168..c95b5eb25 100644 --- a/moses/FF/FeatureFunction.h +++ b/moses/FF/FeatureFunction.h @@ -62,9 +62,6 @@ public: static FeatureFunction &FindFeatureFunction(const std::string& name); static void Destroy(); - static void CallChangeSource(InputType * const&input); - // see my note in FeatureFunction.cpp --- UG - FeatureFunction(const std::string &line, bool initializeNow); FeatureFunction(size_t numScoreComponents, const std::string &line); virtual bool IsStateless() const = 0; @@ -156,9 +153,6 @@ public: ScoreComponentCollection& scoreBreakdown, ScoreComponentCollection& estimatedFutureScore) const = 0; - // override this method if you want to change the input before decoding - virtual void ChangeSource(InputType * const&input) const { } - // for context-dependent processing static void SetupAll(TranslationTask const& task); virtual void Setup(TranslationTask const& task) const { }; diff --git a/moses/FF/SkeletonChangeInput.cpp b/moses/FF/SkeletonChangeInput.cpp deleted file mode 100644 index 7937d7771..000000000 --- a/moses/FF/SkeletonChangeInput.cpp +++ /dev/null @@ -1,96 +0,0 @@ -#include -#include "SkeletonChangeInput.h" -#include "moses/ScoreComponentCollection.h" -#include "moses/TargetPhrase.h" -#include "moses/Sentence.h" -#include "moses/FactorCollection.h" -#include "util/exception.hh" - -using namespace std; - -namespace Moses -{ -SkeletonChangeInput::SkeletonChangeInput(const std::string &line) - :StatelessFeatureFunction(2, line) -{ - ReadParameters(); -} - -void SkeletonChangeInput::EvaluateInIsolation(const Phrase &source - , const TargetPhrase &targetPhrase - , ScoreComponentCollection &scoreBreakdown - , ScoreComponentCollection &estimatedFutureScore) const -{ - // dense scores - vector newScores(m_numScoreComponents); - newScores[0] = 1.5; - newScores[1] = 0.3; - scoreBreakdown.PlusEquals(this, newScores); - - // sparse scores - scoreBreakdown.PlusEquals(this, "sparse-name", 2.4); - -} - -void SkeletonChangeInput::EvaluateWithSourceContext(const InputType &input - , const InputPath &inputPath - , const TargetPhrase &targetPhrase - , const StackVec 
*stackVec - , ScoreComponentCollection &scoreBreakdown - , ScoreComponentCollection *estimatedFutureScore) const -{ - if (targetPhrase.GetNumNonTerminals()) { - vector newScores(m_numScoreComponents); - newScores[0] = - std::numeric_limits::infinity(); - scoreBreakdown.PlusEquals(this, newScores); - } - -} - -void SkeletonChangeInput::EvaluateTranslationOptionListWithSourceContext(const InputType &input - , const TranslationOptionList &translationOptionList) const -{} - -void SkeletonChangeInput::EvaluateWhenApplied(const Hypothesis& hypo, - ScoreComponentCollection* accumulator) const -{} - -void SkeletonChangeInput::EvaluateWhenApplied(const ChartHypothesis &hypo, - ScoreComponentCollection* accumulator) const -{} - -void SkeletonChangeInput::ChangeSource(InputType* const& input) const -{ - // add factor[1] to each word. Created from first 4 letter of factor[0] - - Sentence *sentence = dynamic_cast(input); - UTIL_THROW_IF2(sentence == NULL, "Not a sentence input"); - - FactorCollection &fc = FactorCollection::Instance(); - - size_t size = sentence->GetSize(); - for (size_t i = 0; i < size; ++i) { - Word &word = sentence->Phrase::GetWord(i); - const Factor *factor0 = word[0]; - - std::string str = factor0->GetString().as_string(); - if (str.length() > 4) { - str = str.substr(0, 4); - } - - const Factor *factor1 = fc.AddFactor(str); - word.SetFactor(1, factor1); - } -} - -void SkeletonChangeInput::SetParameter(const std::string& key, const std::string& value) -{ - if (key == "arg") { - // set value here - } else { - StatelessFeatureFunction::SetParameter(key, value); - } -} - -} - diff --git a/moses/FF/SkeletonChangeInput.h b/moses/FF/SkeletonChangeInput.h deleted file mode 100644 index f8d9010ce..000000000 --- a/moses/FF/SkeletonChangeInput.h +++ /dev/null @@ -1,45 +0,0 @@ -#pragma once - -#include -#include "StatelessFeatureFunction.h" - -namespace Moses -{ - -class SkeletonChangeInput : public StatelessFeatureFunction -{ -public: - SkeletonChangeInput(const std::string &line); - - bool IsUseable(const FactorMask &mask) const { - return true; - } - - void EvaluateInIsolation(const Phrase &source - , const TargetPhrase &targetPhrase - , ScoreComponentCollection &scoreBreakdown - , ScoreComponentCollection &estimatedFutureScore) const; - - void ChangeSource(InputType* const&input) const; - - void EvaluateWithSourceContext(const InputType &input - , const InputPath &inputPath - , const TargetPhrase &targetPhrase - , const StackVec *stackVec - , ScoreComponentCollection &scoreBreakdown - , ScoreComponentCollection *estimatedFutureScore = NULL) const; - - void EvaluateTranslationOptionListWithSourceContext(const InputType &input - , const TranslationOptionList &translationOptionList) const; - - void EvaluateWhenApplied(const Hypothesis& hypo, - ScoreComponentCollection* accumulator) const; - void EvaluateWhenApplied(const ChartHypothesis &hypo, - ScoreComponentCollection* accumulator) const; - - void SetParameter(const std::string& key, const std::string& value); - -}; - -} - From 1d7ed728eec85d916ab5331f4aa20a259b047b38 Mon Sep 17 00:00:00 2001 From: Jeroen Vermeulen Date: Wed, 3 Jun 2015 00:00:57 +0700 Subject: [PATCH 043/108] =?UTF-8?q?Rename=20=E2=80=98aux=E2=80=99=20to=20?= =?UTF-8?q?=E2=80=98auxiliary=E2=80=99=20for=20Windows'=20sake.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Any filename like “aux” or “aux.*” is special in Windows, and can't be opened, dir'ed, and so on. This was causing some people problems. 
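For reference, Windows reserves the device names CON, PRN, AUX, NUL, COM1-COM9 and LPT1-LPT9 as file basenames regardless of extension or case, which is why a file called aux.m4m cannot be opened on that platform. The standalone C++ sketch below is illustrative only and not part of this patch (the helper name IsWindowsReservedName is made up); it shows how such a collision can be detected before a file is created:

    // Flags basenames that collide with a Windows reserved device name.
    // The reserved list follows Microsoft's file-naming rules; treating
    // "aux.*" as reserved mirrors the description in the commit message above.
    #include <algorithm>
    #include <cctype>
    #include <iostream>
    #include <set>
    #include <string>

    bool IsWindowsReservedName(const std::string &filename) {
      static const std::set<std::string> reserved = {
        "CON", "PRN", "AUX", "NUL",
        "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8", "COM9",
        "LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8", "LPT9"};
      // Everything before the first dot is the basename: "aux.m4m" -> "aux".
      std::string stem = filename.substr(0, filename.find('.'));
      std::transform(stem.begin(), stem.end(), stem.begin(),
                     [](unsigned char c) { return static_cast<char>(std::toupper(c)); });
      return reserved.count(stem) != 0;
    }

    int main() {
      std::cout << IsWindowsReservedName("aux.m4m") << "\n";        // prints 1
      std::cout << IsWindowsReservedName("auxiliary.m4m") << "\n";  // prints 0
    }

Renaming the module to auxiliary.m4m, as this patch does, sidesteps the problem entirely without any platform-specific checks.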
--- contrib/m4m/examples/giza-vs-fast.m4m | 2 +- contrib/m4m/modules/{aux.m4m => auxiliary.m4m} | 0 contrib/m4m/modules/m4m.m4m | 2 +- contrib/m4m/modules/prepare-corpus.m4m | 12 ++++++------ 4 files changed, 8 insertions(+), 8 deletions(-) rename contrib/m4m/modules/{aux.m4m => auxiliary.m4m} (100%) diff --git a/contrib/m4m/examples/giza-vs-fast.m4m b/contrib/m4m/examples/giza-vs-fast.m4m index 3ce336611..e5e56dc2a 100644 --- a/contrib/m4m/examples/giza-vs-fast.m4m +++ b/contrib/m4m/examples/giza-vs-fast.m4m @@ -96,4 +96,4 @@ reset-lm: -rm -rf lm reset-all: reset-lm reset-aln -rm -rf $(wildcard crp/trn/*/[ct]* crp/dev/[ct]* crp/tst/[ct]*) - -rm -rf aux + -rm -rf auxiliary diff --git a/contrib/m4m/modules/aux.m4m b/contrib/m4m/modules/auxiliary.m4m similarity index 100% rename from contrib/m4m/modules/aux.m4m rename to contrib/m4m/modules/auxiliary.m4m diff --git a/contrib/m4m/modules/m4m.m4m b/contrib/m4m/modules/m4m.m4m index d6c597db9..1a88e80b5 100644 --- a/contrib/m4m/modules/m4m.m4m +++ b/contrib/m4m/modules/m4m.m4m @@ -8,7 +8,7 @@ m4mdir := $(patsubst %modules/,%,\ # $(info M4MDIR is ${m4mdir}) # m4m modules to be included -M4M_MODULES := aux init +M4M_MODULES := auxiliary init M4M_MODULES += tools moses-parameters prepare-corpus M4M_MODULES += mgiza fastalign mmbitext phrase-table moses-ini M4M_MODULES += tune-moses eval-system kenlm diff --git a/contrib/m4m/modules/prepare-corpus.m4m b/contrib/m4m/modules/prepare-corpus.m4m index 3c88069c3..2c064c9c7 100644 --- a/contrib/m4m/modules/prepare-corpus.m4m +++ b/contrib/m4m/modules/prepare-corpus.m4m @@ -40,8 +40,8 @@ endef define truecase $2/cased/%.$3.gz: caser = ${run-truecaser} -$2/cased/%.$3.gz: caser += -model ${WDIR}/aux/truecasing-model.$1 -$2/cased/%.$3.gz: | $2/tok/%.$3.gz ${WDIR}/aux/truecasing-model.$1 +$2/cased/%.$3.gz: caser += -model ${WDIR}/auxiliary/truecasing-model.$1 +$2/cased/%.$3.gz: | $2/tok/%.$3.gz ${WDIR}/auxiliary/truecasing-model.$1 $$(lock) zcat $$(word 1, $$|) | ${parallel} --pipe -k $${caser} | gzip > $$@_ mv $$@_ $$@ @@ -127,8 +127,8 @@ endef # .SECONDARY: $(call trn.tok-mno,${L1}) $(call trn.tok-pll,${L1}) # .SECONDARY: $(call trn.tok-mno,${L2}) $(call trn.tok-pll,${L2}) -#${WDIR}/aux/truecasing-model.${L1}: | $(call trn.tok-mno,${L1}) $(call trn.tok-pll,${L1}) -${WDIR}/aux/truecasing-model.${L1}: | $(call trn.tok-mno,${L1}) +#${WDIR}/auxiliary/truecasing-model.${L1}: | $(call trn.tok-mno,${L1}) $(call trn.tok-pll,${L1}) +${WDIR}/auxiliary/truecasing-model.${L1}: | $(call trn.tok-mno,${L1}) $(lock) $(if $|,,$(error Can't find training data for $@!))#' ${train-truecaser} -model $@_ -corpus <(echo $| | xargs zcat -f) @@ -136,8 +136,8 @@ ${WDIR}/aux/truecasing-model.${L1}: | $(call trn.tok-mno,${L1}) mv $@_ $@ $(unlock) -#${WDIR}/aux/truecasing-model.${L2}: | $(call trn.tok-mno,${L2}) $(call trn.tok-pll,${L2}) -${WDIR}/aux/truecasing-model.${L2}: | $(call trn.tok-mno,${L2}) +#${WDIR}/auxiliary/truecasing-model.${L2}: | $(call trn.tok-mno,${L2}) $(call trn.tok-pll,${L2}) +${WDIR}/auxiliary/truecasing-model.${L2}: | $(call trn.tok-mno,${L2}) $(lock) $(if $|,,$(error Can't find training data for $@!))#' ${train-truecaser} -model $@_ -corpus <(echo $| | xargs zcat -f) From 3ea5faead8bb21d93ada5553dcb37d2229394415 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Tue, 2 Jun 2015 21:44:58 +0400 Subject: [PATCH 044/108] codelite --- contrib/other-builds/moses/moses.project | 10 +++++++--- .../CYKPlusParser/ChartRuleLookupManagerMemory.h | 3 --- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git 
a/contrib/other-builds/moses/moses.project b/contrib/other-builds/moses/moses.project index f902dd1f4..66e0b9bad 100644 --- a/contrib/other-builds/moses/moses.project +++ b/contrib/other-builds/moses/moses.project @@ -798,6 +798,10 @@ + + + + @@ -812,9 +816,9 @@ - - - + + + diff --git a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.h b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.h index 84e5f085d..c8e2db2d7 100644 --- a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.h +++ b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.h @@ -18,8 +18,6 @@ ***********************************************************************/ #pragma once -#ifndef moses_ChartRuleLookupManagerMemory_h -#define moses_ChartRuleLookupManagerMemory_h #include @@ -97,4 +95,3 @@ private: } // namespace Moses -#endif From 6bea23357c1d5a9a50382330d14f4c734f94ac98 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Wed, 3 Jun 2015 09:28:38 +0100 Subject: [PATCH 045/108] Ongoing moses/phrase-extract refactoring --- phrase-extract/pcfg-common/pcfg_tree.h | 79 ---------- phrase-extract/pcfg-common/syntax_tree.h | 93 ------------ phrase-extract/pcfg-common/typedef.h | 1 - phrase-extract/pcfg-common/xml_tree_parser.cc | 89 ------------ phrase-extract/pcfg-common/xml_tree_parser.h | 59 -------- phrase-extract/pcfg-common/xml_tree_writer.h | 135 ------------------ phrase-extract/pcfg-extract/Jamfile | 2 +- phrase-extract/pcfg-extract/pcfg_extract.cc | 34 ++--- phrase-extract/pcfg-extract/rule_extractor.cc | 16 +-- phrase-extract/pcfg-extract/rule_extractor.h | 6 +- phrase-extract/pcfg-score/pcfg_score.cc | 19 +-- phrase-extract/pcfg-score/tree_scorer.cc | 66 +++++++-- phrase-extract/pcfg-score/tree_scorer.h | 10 +- .../syntax-common/xml_tree_parser.cc | 5 +- .../syntax-common/xml_tree_parser.h | 36 ++++- .../syntax-common/xml_tree_writer.cc | 82 +++++++++++ .../syntax-common/xml_tree_writer.h | 27 ++++ 17 files changed, 245 insertions(+), 514 deletions(-) delete mode 100644 phrase-extract/pcfg-common/pcfg_tree.h delete mode 100644 phrase-extract/pcfg-common/syntax_tree.h delete mode 100644 phrase-extract/pcfg-common/xml_tree_parser.cc delete mode 100644 phrase-extract/pcfg-common/xml_tree_parser.h delete mode 100644 phrase-extract/pcfg-common/xml_tree_writer.h create mode 100644 phrase-extract/syntax-common/xml_tree_writer.cc create mode 100644 phrase-extract/syntax-common/xml_tree_writer.h diff --git a/phrase-extract/pcfg-common/pcfg_tree.h b/phrase-extract/pcfg-common/pcfg_tree.h deleted file mode 100644 index ce28eb8dd..000000000 --- a/phrase-extract/pcfg-common/pcfg_tree.h +++ /dev/null @@ -1,79 +0,0 @@ -/*********************************************************************** - Moses - statistical machine translation system - Copyright (C) 2006-2012 University of Edinburgh - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ - -#pragma once -#ifndef PCFG_PCFG_TREE_H_ -#define PCFG_PCFG_TREE_H_ - -#include - -#include "syntax_tree.h" -#include "xml_tree_writer.h" - -namespace MosesTraining { -namespace Syntax { -namespace PCFG { - -template -class PcfgTreeBase : public SyntaxTreeBase { - public: - typedef std::string LabelType; - typedef SyntaxTreeBase BaseType; - - PcfgTreeBase(const LabelType &label) : BaseType(label), score_(0.0) {} - - double score() const { return score_; } - void set_score(double s) { score_ = s; } - - private: - double score_; -}; - -class PcfgTree : public PcfgTreeBase { - public: - typedef PcfgTreeBase BaseType; - PcfgTree(const BaseType::LabelType &label) : BaseType(label) {} -}; - -// Specialise XmlOutputHandler for PcfgTree. -template<> -class XmlOutputHandler { - public: - typedef std::map AttributeMap; - - void GetLabel(const PcfgTree &tree, std::string &label) const { - label = tree.label(); - } - - void GetAttributes(const PcfgTree &tree, AttributeMap &attribute_map) const { - attribute_map.clear(); - double score = tree.score(); - if (score != 0.0) { - std::ostringstream out; - out << tree.score(); - attribute_map["pcfg"] = out.str(); - } - } -}; - -} // namespace PCFG -} // namespace Syntax -} // namespace MosesTraining - -#endif diff --git a/phrase-extract/pcfg-common/syntax_tree.h b/phrase-extract/pcfg-common/syntax_tree.h deleted file mode 100644 index c0c6eaef9..000000000 --- a/phrase-extract/pcfg-common/syntax_tree.h +++ /dev/null @@ -1,93 +0,0 @@ -/*********************************************************************** - Moses - statistical machine translation system - Copyright (C) 2006-2012 University of Edinburgh - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ - -#pragma once -#ifndef PCFG_SYNTAX_TREE_H_ -#define PCFG_SYNTAX_TREE_H_ - -#include -#include - -namespace MosesTraining { -namespace Syntax { -namespace PCFG { - -// Base class for SyntaxTree, AgreementTree, and friends. 
-template -class SyntaxTreeBase { - public: - // Constructors - SyntaxTreeBase(const T &label) - : label_(label) - , children_() - , parent_(0) {} - - SyntaxTreeBase(const T &label, const std::vector &children) - : label_(label) - , children_(children) - , parent_(0) {} - - // Destructor - virtual ~SyntaxTreeBase(); - - const T &label() const { return label_; } - const DerivedType *parent() const { return parent_; } - DerivedType *parent() { return parent_; } - const std::vector &children() const { return children_; } - std::vector &children() { return children_; } - - void set_label(const T &label) { label_ = label; } - void set_parent(DerivedType *parent) { parent_ = parent; } - void set_children(const std::vector &c) { children_ = c; } - - bool IsLeaf() const { return children_.empty(); } - - bool IsPreterminal() const { - return children_.size() == 1 && children_[0]->IsLeaf(); - } - - void AddChild(DerivedType *child) { children_.push_back(child); } - - private: - T label_; - std::vector children_; - DerivedType *parent_; -}; - -template -class SyntaxTree : public SyntaxTreeBase > { - public: - typedef SyntaxTreeBase > BaseType; - SyntaxTree(const T &label) : BaseType(label) {} - SyntaxTree(const T &label, const std::vector &children) - : BaseType(label, children) {} -}; - -template -SyntaxTreeBase::~SyntaxTreeBase() { - for (std::size_t i = 0; i < children_.size(); ++i) { - delete children_[i]; - } -} - -} // namespace PCFG -} // namespace Syntax -} // namespace MosesTraining - -#endif diff --git a/phrase-extract/pcfg-common/typedef.h b/phrase-extract/pcfg-common/typedef.h index e738163df..1280b89cf 100644 --- a/phrase-extract/pcfg-common/typedef.h +++ b/phrase-extract/pcfg-common/typedef.h @@ -24,7 +24,6 @@ #include #include "syntax-common/numbered_set.h" -#include "syntax_tree.h" namespace MosesTraining { namespace Syntax { diff --git a/phrase-extract/pcfg-common/xml_tree_parser.cc b/phrase-extract/pcfg-common/xml_tree_parser.cc deleted file mode 100644 index f15a04811..000000000 --- a/phrase-extract/pcfg-common/xml_tree_parser.cc +++ /dev/null @@ -1,89 +0,0 @@ -/*********************************************************************** - Moses - statistical machine translation system - Copyright (C) 2006-2012 University of Edinburgh - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ - -#include "xml_tree_parser.h" - -#include -#include - -#include "tables-core.h" -#include "XmlException.h" -#include "XmlTree.h" -#include "util/tokenize.hh" - -#include "syntax-common/exception.h" - -namespace MosesTraining { -namespace Syntax { -namespace PCFG { - -XmlTreeParser::XmlTreeParser() { -} - -std::auto_ptr XmlTreeParser::Parse(const std::string &line) { - m_line = line; - m_tree.Clear(); - try { - if (!ProcessAndStripXMLTags(m_line, m_tree, m_labelSet, m_topLabelSet)) { - throw Exception(""); - } - } catch (const XmlException &e) { - throw Exception(e.getMsg()); - } - m_tree.ConnectNodes(); - SyntaxNode *root = m_tree.GetTop(); - if (!root) { - // There is no XML tree. - return std::auto_ptr(); - } - m_words = util::tokenize(m_line); - return ConvertTree(*root, m_words); -} - -// Converts a SyntaxNode tree to a Moses::PCFG::PcfgTree. -std::auto_ptr XmlTreeParser::ConvertTree( - const SyntaxNode &tree, - const std::vector &words) { - std::auto_ptr root(new PcfgTree(tree.GetLabel())); - const std::vector &children = tree.GetChildren(); - if (children.empty()) { - if (tree.GetStart() != tree.GetEnd()) { - std::ostringstream msg; - msg << "leaf node covers multiple words (" << tree.GetStart() - << "-" << tree.GetEnd() << "): this is currently unsupported"; - throw Exception(msg.str()); - } - std::auto_ptr leaf(new PcfgTree(words[tree.GetStart()])); - leaf->set_parent(root.get()); - root->AddChild(leaf.release()); - } else { - for (std::vector::const_iterator p = children.begin(); - p != children.end(); ++p) { - assert(*p); - std::auto_ptr child = ConvertTree(**p, words); - child->set_parent(root.get()); - root->AddChild(child.release()); - } - } - return root; -} - -} // namespace PCFG -} // namespace Syntax -} // namespace MosesTraining diff --git a/phrase-extract/pcfg-common/xml_tree_parser.h b/phrase-extract/pcfg-common/xml_tree_parser.h deleted file mode 100644 index 8605c0691..000000000 --- a/phrase-extract/pcfg-common/xml_tree_parser.h +++ /dev/null @@ -1,59 +0,0 @@ -/*********************************************************************** - Moses - statistical machine translation system - Copyright (C) 2006-2012 University of Edinburgh - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ - -#pragma once -#ifndef PCFG_XML_TREE_PARSER_H_ -#define PCFG_XML_TREE_PARSER_H_ - -#include -#include -#include -#include -#include - -#include "pcfg_tree.h" -#include "SyntaxNode.h" -#include "SyntaxNodeCollection.h" - -namespace MosesTraining { -namespace Syntax { -namespace PCFG { - -// Parses a string in Moses' XML parse tree format and returns a PcfgTree -// object. -class XmlTreeParser { - public: - XmlTreeParser(); - std::auto_ptr Parse(const std::string &); - private: - std::auto_ptr ConvertTree(const MosesTraining::SyntaxNode &, - const std::vector &); - - std::set m_labelSet; - std::map m_topLabelSet; - std::string m_line; - MosesTraining::SyntaxNodeCollection m_tree; - std::vector m_words; -}; - -} // namespace PCFG -} // namespace Syntax -} // namespace MosesTraining - -#endif diff --git a/phrase-extract/pcfg-common/xml_tree_writer.h b/phrase-extract/pcfg-common/xml_tree_writer.h deleted file mode 100644 index 8582e544f..000000000 --- a/phrase-extract/pcfg-common/xml_tree_writer.h +++ /dev/null @@ -1,135 +0,0 @@ -/*********************************************************************** - Moses - statistical machine translation system - Copyright (C) 2006-2012 University of Edinburgh - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ - -#pragma once -#ifndef PCFG_XML_TREE_WRITER_H_ -#define PCFG_XML_TREE_WRITER_H_ - -#include -#include -#include -#include -#include -#include - -#include "XmlTree.h" - -#include "syntax_tree.h" - -namespace MosesTraining { -namespace Syntax { -namespace PCFG { - -template -class XmlOutputHandler { - public: - typedef std::map AttributeMap; - - void GetLabel(const InputTree &, std::string &) const; - void GetAttributes(const InputTree &, AttributeMap &) const; -}; - -template -class XmlTreeWriter : public XmlOutputHandler { - public: - typedef XmlOutputHandler Base; - void Write(const InputTree &, std::ostream &) const; - private: - std::string Escape(const std::string &) const; -}; - -template -void XmlTreeWriter::Write(const InputTree &tree, - std::ostream &out) const { - assert(!tree.IsLeaf()); - - // Opening tag - - std::string label; - Base::GetLabel(tree, label); - out << "first << "=\"" << p->second << "\""; - } - - out << ">"; - - // Children - - const std::vector &children = tree.children(); - for (typename std::vector::const_iterator p = children.begin(); - p != children.end(); ++p) { - InputTree &child = **p; - if (child.IsLeaf()) { - Base::GetLabel(child, label); - out << " " << Escape(label); - } else { - out << " "; - Write(**p, out); - } - } - - // Closing tag - out << " "; - - if (tree.parent() == 0) { - out << std::endl; - } -} - -// Escapes XML special characters. -template -std::string XmlTreeWriter::Escape(const std::string &s) const { - std::string t; - std::size_t len = s.size(); - t.reserve(len); - for (std::size_t i = 0; i < len; ++i) { - if (s[i] == '<') { - t += "<"; - } else if (s[i] == '>') { - t += ">"; - } else if (s[i] == '[') { - t += "["; - } else if (s[i] == ']') { - t += "]"; - } else if (s[i] == '|') { - t += "|"; - } else if (s[i] == '&') { - t += "&"; - } else if (s[i] == '\'') { - t += "'"; - } else if (s[i] == '"') { - t += """; - } else { - t += s[i]; - } - } - return t; -} - -} // namespace PCFG -} // namespace Syntax -} // namespace MosesTraining - -#endif diff --git a/phrase-extract/pcfg-extract/Jamfile b/phrase-extract/pcfg-extract/Jamfile index 61f056599..2442b967a 100644 --- a/phrase-extract/pcfg-extract/Jamfile +++ b/phrase-extract/pcfg-extract/Jamfile @@ -1 +1 @@ -exe pcfg-extract : [ glob *.cc ] ..//pcfg-common ../..//boost_program_options : .. ; +exe pcfg-extract : [ glob *.cc ] ..//syntax-common ..//pcfg-common ../..//boost_program_options : .. 
; diff --git a/phrase-extract/pcfg-extract/pcfg_extract.cc b/phrase-extract/pcfg-extract/pcfg_extract.cc index 29d63b994..8e7a40e07 100644 --- a/phrase-extract/pcfg-extract/pcfg_extract.cc +++ b/phrase-extract/pcfg-extract/pcfg_extract.cc @@ -19,20 +19,6 @@ #include "pcfg_extract.h" -#include "options.h" -#include "rule_collection.h" -#include "rule_extractor.h" - -#include "syntax-common/exception.h" - -#include "pcfg-common/pcfg.h" -#include "pcfg-common/pcfg_tree.h" -#include "pcfg-common/syntax_tree.h" -#include "pcfg-common/typedef.h" -#include "pcfg-common/xml_tree_parser.h" - -#include - #include #include #include @@ -43,6 +29,20 @@ #include #include +#include + +#include "syntax-common/exception.h" +#include "syntax-common/xml_tree_parser.h" + +#include "SyntaxTree.h" + +#include "pcfg-common/pcfg.h" +#include "pcfg-common/typedef.h" + +#include "options.h" +#include "rule_collection.h" +#include "rule_extractor.h" + namespace MosesTraining { namespace Syntax @@ -60,10 +60,12 @@ int PcfgExtract::Main(int argc, char *argv[]) Vocabulary non_term_vocab; RuleExtractor rule_extractor(non_term_vocab); RuleCollection rule_collection; - XmlTreeParser parser; + std::set label_set; + std::map top_label_set; + XmlTreeParser parser(label_set, top_label_set); std::string line; std::size_t line_num = 0; - std::auto_ptr tree; + std::auto_ptr tree; while (std::getline(std::cin, line)) { ++line_num; try { diff --git a/phrase-extract/pcfg-extract/rule_extractor.cc b/phrase-extract/pcfg-extract/rule_extractor.cc index bd2c48c8a..39da54ef2 100644 --- a/phrase-extract/pcfg-extract/rule_extractor.cc +++ b/phrase-extract/pcfg-extract/rule_extractor.cc @@ -19,8 +19,6 @@ #include "rule_extractor.h" -#include "pcfg-common/pcfg_tree.h" - namespace MosesTraining { namespace Syntax @@ -33,21 +31,21 @@ RuleExtractor::RuleExtractor(Vocabulary &non_term_vocab) { } -void RuleExtractor::Extract(const PcfgTree &tree, RuleCollection &rc) const +void RuleExtractor::Extract(const SyntaxTree &tree, RuleCollection &rc) const { - if (tree.IsPreterminal() || tree.IsLeaf()) { + if (tree.IsLeaf() || tree.children()[0]->IsLeaf()) { return; } - std::size_t lhs = non_term_vocab_.Insert(tree.label()); + std::size_t lhs = non_term_vocab_.Insert(tree.value().GetLabel()); std::vector rhs; - const std::vector &children = tree.children(); + const std::vector &children = tree.children(); rhs.reserve(children.size()); - for (std::vector::const_iterator p(children.begin()); + for (std::vector::const_iterator p(children.begin()); p != children.end(); ++p) { - const PcfgTree &child = **p; - rhs.push_back(non_term_vocab_.Insert(child.label())); + const SyntaxTree &child = **p; + rhs.push_back(non_term_vocab_.Insert(child.value().GetLabel())); Extract(child, rc); } rc.Add(lhs, rhs); diff --git a/phrase-extract/pcfg-extract/rule_extractor.h b/phrase-extract/pcfg-extract/rule_extractor.h index f35460909..d32d76992 100644 --- a/phrase-extract/pcfg-extract/rule_extractor.h +++ b/phrase-extract/pcfg-extract/rule_extractor.h @@ -21,6 +21,8 @@ #ifndef PCFG_EXTRACT_RULE_EXTRACTOR_H_ #define PCFG_EXTRACT_RULE_EXTRACTOR_H_ +#include "SyntaxTree.h" + #include "pcfg-common/typedef.h" #include "rule_collection.h" @@ -32,14 +34,12 @@ namespace Syntax namespace PCFG { -class PcfgTree; - // Extracts PCFG rules from syntax trees and adds them to a RuleCollection. 
class RuleExtractor { public: RuleExtractor(Vocabulary &); - void Extract(const PcfgTree &, RuleCollection &) const; + void Extract(const MosesTraining::SyntaxTree &, RuleCollection &) const; private: Vocabulary &non_term_vocab_; }; diff --git a/phrase-extract/pcfg-score/pcfg_score.cc b/phrase-extract/pcfg-score/pcfg_score.cc index 314e0fb38..d656d2882 100644 --- a/phrase-extract/pcfg-score/pcfg_score.cc +++ b/phrase-extract/pcfg-score/pcfg_score.cc @@ -33,13 +33,14 @@ #include +#include "SyntaxTree.h" + #include "syntax-common/exception.h" +#include "syntax-common/xml_tree_parser.h" +#include "syntax-common/xml_tree_writer.h" #include "pcfg-common/pcfg.h" -#include "pcfg-common/pcfg_tree.h" -#include "pcfg-common/syntax_tree.h" #include "pcfg-common/typedef.h" -#include "pcfg-common/xml_tree_parser.h" namespace MosesTraining { @@ -65,15 +66,17 @@ int PcfgScore::Main(int argc, char *argv[]) // Score corpus according to PCFG. TreeScorer scorer(pcfg, non_term_vocab); - XmlTreeParser parser; - XmlTreeWriter writer; + std::set label_set; + std::map top_label_set; + XmlTreeParser parser(label_set, top_label_set); + XmlTreeWriter writer(std::cout); std::string line; std::size_t line_num = 0; - std::auto_ptr tree; + std::auto_ptr tree; while (std::getline(std::cin, line)) { ++line_num; try { - tree = parser.Parse(line); + tree = parser.Parse(line, true); } catch (Exception &e) { std::ostringstream msg; msg << "line " << line_num << ": " << e.msg(); @@ -93,7 +96,7 @@ int PcfgScore::Main(int argc, char *argv[]) std::cout << line << std::endl; continue; } - writer.Write(*tree, std::cout); + writer.Write(*tree); } return 0; diff --git a/phrase-extract/pcfg-score/tree_scorer.cc b/phrase-extract/pcfg-score/tree_scorer.cc index 74d6e79ef..61ae16e4c 100644 --- a/phrase-extract/pcfg-score/tree_scorer.cc +++ b/phrase-extract/pcfg-score/tree_scorer.cc @@ -20,6 +20,7 @@ #include "tree_scorer.h" #include +#include namespace MosesTraining { @@ -34,30 +35,41 @@ TreeScorer::TreeScorer(const Pcfg &pcfg, const Vocabulary &non_term_vocab) { } -bool TreeScorer::Score(PcfgTree &root) const +bool TreeScorer::Score(SyntaxTree &root) { - if (root.IsPreterminal() || root.IsLeaf()) { + scores_.clear(); + ZeroScores(root); + if (!CalcScores(root)) { + return false; + } + SetAttributes(root); + return true; +} + +bool TreeScorer::CalcScores(SyntaxTree &root) +{ + if (root.IsLeaf() || root.children()[0]->IsLeaf()) { return true; } - const std::vector &children = root.children(); + const std::vector &children = root.children(); double log_prob = 0.0; std::vector key; key.reserve(children.size()+1); - key.push_back(non_term_vocab_.Lookup(root.label())); + key.push_back(non_term_vocab_.Lookup(root.value().GetLabel())); - for (std::vector::const_iterator p(children.begin()); + for (std::vector::const_iterator p(children.begin()); p != children.end(); ++p) { - PcfgTree *child = *p; + SyntaxTree *child = *p; assert(!child->IsLeaf()); - key.push_back(non_term_vocab_.Lookup(child->label())); - if (!Score(*child)) { + key.push_back(non_term_vocab_.Lookup(child->value().GetLabel())); + if (!CalcScores(*child)) { return false; } - if (!child->IsPreterminal()) { - log_prob += child->score(); + if (!child->children()[0]->IsLeaf()) { + log_prob += scores_[child]; } } double rule_score; @@ -66,10 +78,42 @@ bool TreeScorer::Score(PcfgTree &root) const return false; } log_prob += rule_score; - root.set_score(log_prob); + scores_[&root] = log_prob; return true; } +void TreeScorer::SetAttributes(SyntaxTree &root) +{ + // Terminals don't need 
attributes. + if (root.IsLeaf()) { + return; + } + // Preterminals don't need attributes (they have the implicit score 0.0). + if (root.children()[0]->IsLeaf()) { + return; + } + double score = scores_[&root]; + if (score != 0.0) { + std::ostringstream out; + out << score; + root.value().attributes["pcfg"] = out.str(); + } + for (std::vector::const_iterator p(root.children().begin()); + p != root.children().end(); ++p) { + SetAttributes(**p); + } +} + +void TreeScorer::ZeroScores(SyntaxTree &root) +{ + scores_[&root] = 0.0f; + const std::vector &children = root.children(); + for (std::vector::const_iterator p(children.begin()); + p != children.end(); ++p) { + ZeroScores(**p); + } +} + } // namespace PCFG } // namespace Syntax } // namespace MosesTraining diff --git a/phrase-extract/pcfg-score/tree_scorer.h b/phrase-extract/pcfg-score/tree_scorer.h index 8b1afcc3a..cf9fdd1a3 100644 --- a/phrase-extract/pcfg-score/tree_scorer.h +++ b/phrase-extract/pcfg-score/tree_scorer.h @@ -21,8 +21,9 @@ #ifndef PCFG_SCORE_TREE_SCORER_H_ #define PCFG_SCORE_TREE_SCORER_H_ +#include "SyntaxTree.h" + #include "pcfg-common/pcfg.h" -#include "pcfg-common/pcfg_tree.h" #include "pcfg-common/typedef.h" namespace MosesTraining @@ -39,11 +40,16 @@ public: // Score tree according to PCFG. Returns false if unsuccessful (due to // missing rule). - bool Score(PcfgTree &) const; + bool Score(SyntaxTree &); private: const Pcfg &pcfg_; const Vocabulary &non_term_vocab_; + std::map scores_; + + bool CalcScores(SyntaxTree &); + void SetAttributes(SyntaxTree &); + void ZeroScores(SyntaxTree &); }; } // namespace PCFG diff --git a/phrase-extract/syntax-common/xml_tree_parser.cc b/phrase-extract/syntax-common/xml_tree_parser.cc index bf3c6d87e..6eeb110e9 100644 --- a/phrase-extract/syntax-common/xml_tree_parser.cc +++ b/phrase-extract/syntax-common/xml_tree_parser.cc @@ -20,13 +20,14 @@ XmlTreeParser::XmlTreeParser(std::set &labelSet, { } -std::auto_ptr XmlTreeParser::Parse(const std::string &line) +std::auto_ptr XmlTreeParser::Parse(const std::string &line, + bool unescape) { line_ = line; node_collection_.Clear(); try { if (!ProcessAndStripXMLTags(line_, node_collection_, label_set_, - top_label_set_, false)) { + top_label_set_, unescape)) { throw Exception(""); } } catch (const XmlException &e) { diff --git a/phrase-extract/syntax-common/xml_tree_parser.h b/phrase-extract/syntax-common/xml_tree_parser.h index e0b75c830..0f671c65a 100644 --- a/phrase-extract/syntax-common/xml_tree_parser.h +++ b/phrase-extract/syntax-common/xml_tree_parser.h @@ -15,18 +15,42 @@ namespace MosesTraining { namespace Syntax { -// Parses a string in Moses' XML parse tree format and returns a SyntaxTree -// object. This is a wrapper around the ProcessAndStripXMLTags function. +/** Parses string representations of parse trees in Moses' XML format and + * converts them to SyntaxTree objects. + * + * This is a thin wrapper around the ProcessAndStripXMLTags function. After + * calling Parse(), the output of the ProcessAndStripXMLTags function (the + * sentence, node collection, label set, and top label set) are available via + * accessors. + */ class XmlTreeParser { public: XmlTreeParser(std::set &, std::map &); - std::auto_ptr Parse(const std::string &); + //! Parse a single sentence and return a SyntaxTree (with words attached). + std::auto_ptr Parse(const std::string &, bool=false); - const std::vector& GetWords() { - return words_; - } + // TODO + //! 
Get the sentence string (see ProcessAndStripXMLTags) + //const std::string &sentence() const; + // FIXME + //! Get the sentence as a vector of tokens + const std::vector& GetWords() { return words_; } + + // TODO + //! Get the node collection (see ProcessAndStripXMLTags) + const SyntaxNodeCollection &node_collection() const; + + // TODO + //! Get the label set (see ProcessAndStripXMLTags) + const std::set &label_set() const; + + // TODO + //! Get the top label set (see ProcessAndStripXMLTags) + const std::map &top_label_set() const; + + // FIXME const SyntaxNodeCollection &GetNodeCollection() const { return node_collection_; } diff --git a/phrase-extract/syntax-common/xml_tree_writer.cc b/phrase-extract/syntax-common/xml_tree_writer.cc new file mode 100644 index 000000000..3c16cb2eb --- /dev/null +++ b/phrase-extract/syntax-common/xml_tree_writer.cc @@ -0,0 +1,82 @@ +#include "xml_tree_writer.h" + +#include +#include +#include +#include + +#include "SyntaxTree.h" +#include "XmlTree.h" + + +namespace MosesTraining { +namespace Syntax { + +void XmlTreeWriter::Write(const SyntaxTree &tree) const { + assert(!tree.IsLeaf()); + + // Opening tag + out_ << "first != "label") { + out_ << " " << p->first << "=\"" << p->second << "\""; + } + } + out_ << ">"; + + // Children + for (std::vector::const_iterator p = tree.children().begin(); + p != tree.children().end(); ++p) { + SyntaxTree &child = **p; + if (child.IsLeaf()) { + out_ << " " << Escape(child.value().GetLabel()); + } else { + out_ << " "; + Write(child); + } + } + + // Closing tag + out_ << " "; + + if (tree.parent() == 0) { + out_ << std::endl; + } +} + +// Escapes XML special characters. +std::string XmlTreeWriter::Escape(const std::string &s) const { + if (!escape_) { + return s; + } + std::string t; + std::size_t len = s.size(); + t.reserve(len); + for (std::size_t i = 0; i < len; ++i) { + if (s[i] == '<') { + t += "<"; + } else if (s[i] == '>') { + t += ">"; + } else if (s[i] == '[') { + t += "["; + } else if (s[i] == ']') { + t += "]"; + } else if (s[i] == '|') { + t += "|"; + } else if (s[i] == '&') { + t += "&"; + } else if (s[i] == '\'') { + t += "'"; + } else if (s[i] == '"') { + t += """; + } else { + t += s[i]; + } + } + return t; +} + +} // namespace Syntax +} // namespace MosesTraining diff --git a/phrase-extract/syntax-common/xml_tree_writer.h b/phrase-extract/syntax-common/xml_tree_writer.h new file mode 100644 index 000000000..b39d01fab --- /dev/null +++ b/phrase-extract/syntax-common/xml_tree_writer.h @@ -0,0 +1,27 @@ +#pragma once + +#include +#include + +#include "SyntaxTree.h" + +namespace MosesTraining { +namespace Syntax { + +class XmlTreeWriter { + public: + XmlTreeWriter(std::ostream &out, bool escape=true) + : out_(out) + , escape_(escape) {} + + void Write(const SyntaxTree &) const; + + private: + std::string Escape(const std::string &) const; + + std::ostream &out_; + bool escape_; +}; + +} // namespace Syntax +} // namespace MosesTraining From 2e21f051f217a6b835433cbc456bdcc841187ec0 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Wed, 3 Jun 2015 10:05:36 +0100 Subject: [PATCH 046/108] Ongoing moses/phrase-extract refactoring --- phrase-extract/extract-ghkm/ExtractGHKM.cpp | 43 +++++++-------- .../filter-rule-table/FilterRuleTable.cpp | 4 +- phrase-extract/pcfg-extract/pcfg_extract.cc | 4 +- phrase-extract/pcfg-score/pcfg_score.cc | 4 +- .../syntax-common/xml_tree_parser.cc | 15 ++---- .../syntax-common/xml_tree_parser.h | 53 ++++++++----------- 6 files changed, 49 insertions(+), 74 deletions(-) diff --git 
a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp index 2293371ac..c48a37367 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp +++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp @@ -119,14 +119,6 @@ int ExtractGHKM::Main(int argc, char *argv[]) OpenOutputFileOrDie(options.unknownWordSoftMatchesFile, unknownWordSoftMatchesStream); } - // Target label sets for producing glue grammar. - std::set targetLabelSet; - std::map targetTopLabelSet; - - // Source label sets for producing glue grammar. - std::set sourceLabelSet; - std::map sourceTopLabelSet; - // Word count statistics for producing unknown word labels. std::map targetWordCount; std::map targetWordLabel; @@ -139,8 +131,8 @@ int ExtractGHKM::Main(int argc, char *argv[]) std::string sourceLine; std::string alignmentLine; Alignment alignment; - Syntax::XmlTreeParser targetXmlTreeParser(targetLabelSet, targetTopLabelSet); - Syntax::XmlTreeParser sourceXmlTreeParser(sourceLabelSet, sourceTopLabelSet); + Syntax::XmlTreeParser targetXmlTreeParser; + Syntax::XmlTreeParser sourceXmlTreeParser; ScfgRuleWriter scfgWriter(fwdExtractStream, invExtractStream, options); StsgRuleWriter stsgWriter(fwdExtractStream, invExtractStream, options); size_t lineNum = options.sentenceOffset; @@ -194,7 +186,7 @@ int ExtractGHKM::Main(int argc, char *argv[]) } Error(oss.str()); } - sourceTokens = sourceXmlTreeParser.GetWords(); + sourceTokens = sourceXmlTreeParser.words(); } // Read word alignments. @@ -240,7 +232,7 @@ int ExtractGHKM::Main(int argc, char *argv[]) // Initialize phrase orientation scoring object PhraseOrientation phraseOrientation(sourceTokens.size(), - targetXmlTreeParser.GetWords().size(), alignment); + targetXmlTreeParser.words().size(), alignment); // Write the rules, subject to scope pruning. const std::vector &targetNodes = graph.GetTargetNodes(); @@ -272,7 +264,7 @@ int ExtractGHKM::Main(int argc, char *argv[]) // SCFG output. 
ScfgRule *r = 0; if (options.sourceLabels) { - r = new ScfgRule(**q, &sourceXmlTreeParser.GetNodeCollection()); + r = new ScfgRule(**q, &sourceXmlTreeParser.node_collection()); } else { r = new ScfgRule(**q); } @@ -315,14 +307,14 @@ int ExtractGHKM::Main(int argc, char *argv[]) std::map sourceLabels; if (options.sourceLabels && !options.sourceLabelSetFile.empty()) { - - sourceLabelSet.insert("XLHS"); // non-matching label (left-hand side) - sourceLabelSet.insert("XRHS"); // non-matching label (right-hand side) - sourceLabelSet.insert("TOPLABEL"); // as used in the glue grammar - sourceLabelSet.insert("SOMELABEL"); // as used in the glue grammar + std::set extendedLabelSet = sourceXmlTreeParser.label_set(); + extendedLabelSet.insert("XLHS"); // non-matching label (left-hand side) + extendedLabelSet.insert("XRHS"); // non-matching label (right-hand side) + extendedLabelSet.insert("TOPLABEL"); // as used in the glue grammar + extendedLabelSet.insert("SOMELABEL"); // as used in the glue grammar size_t index = 0; - for (std::set::const_iterator iter=sourceLabelSet.begin(); - iter!=sourceLabelSet.end(); ++iter, ++index) { + for (std::set::const_iterator iter=extendedLabelSet.begin(); + iter!=extendedLabelSet.end(); ++iter, ++index) { sourceLabels.insert(std::pair(*iter,index)); } WriteSourceLabelSet(sourceLabels, sourceLabelSetStream); @@ -332,14 +324,18 @@ int ExtractGHKM::Main(int argc, char *argv[]) std::map strippedTargetTopLabelSet; if (options.stripBitParLabels && (!options.glueGrammarFile.empty() || !options.unknownWordSoftMatchesFile.empty())) { - StripBitParLabels(targetLabelSet, targetTopLabelSet, strippedTargetLabelSet, strippedTargetTopLabelSet); + StripBitParLabels(targetXmlTreeParser.label_set(), + targetXmlTreeParser.top_label_set(), + strippedTargetLabelSet, strippedTargetTopLabelSet); } if (!options.glueGrammarFile.empty()) { if (options.stripBitParLabels) { WriteGlueGrammar(strippedTargetLabelSet, strippedTargetTopLabelSet, sourceLabels, options, glueGrammarStream); } else { - WriteGlueGrammar(targetLabelSet, targetTopLabelSet, sourceLabels, options, glueGrammarStream); + WriteGlueGrammar(targetXmlTreeParser.label_set(), + targetXmlTreeParser.top_label_set(), + sourceLabels, options, glueGrammarStream); } } @@ -355,7 +351,8 @@ int ExtractGHKM::Main(int argc, char *argv[]) if (options.stripBitParLabels) { WriteUnknownWordSoftMatches(strippedTargetLabelSet, unknownWordSoftMatchesStream); } else { - WriteUnknownWordSoftMatches(targetLabelSet, unknownWordSoftMatchesStream); + WriteUnknownWordSoftMatches(targetXmlTreeParser.label_set(), + unknownWordSoftMatchesStream); } } diff --git a/phrase-extract/filter-rule-table/FilterRuleTable.cpp b/phrase-extract/filter-rule-table/FilterRuleTable.cpp index 0c6f132f8..32d2019cf 100644 --- a/phrase-extract/filter-rule-table/FilterRuleTable.cpp +++ b/phrase-extract/filter-rule-table/FilterRuleTable.cpp @@ -126,9 +126,7 @@ void FilterRuleTable::ReadTestSet( void FilterRuleTable::ReadTestSet( std::istream &input, std::vector > &sentences) { - std::set labelSet; - std::map topLabelSet; - XmlTreeParser parser(labelSet, topLabelSet); + XmlTreeParser parser; int lineNum = 0; std::string line; while (std::getline(input, line)) { diff --git a/phrase-extract/pcfg-extract/pcfg_extract.cc b/phrase-extract/pcfg-extract/pcfg_extract.cc index 8e7a40e07..87419edb7 100644 --- a/phrase-extract/pcfg-extract/pcfg_extract.cc +++ b/phrase-extract/pcfg-extract/pcfg_extract.cc @@ -60,9 +60,7 @@ int PcfgExtract::Main(int argc, char *argv[]) Vocabulary non_term_vocab; 
RuleExtractor rule_extractor(non_term_vocab); RuleCollection rule_collection; - std::set label_set; - std::map top_label_set; - XmlTreeParser parser(label_set, top_label_set); + XmlTreeParser parser; std::string line; std::size_t line_num = 0; std::auto_ptr tree; diff --git a/phrase-extract/pcfg-score/pcfg_score.cc b/phrase-extract/pcfg-score/pcfg_score.cc index d656d2882..e11f73f70 100644 --- a/phrase-extract/pcfg-score/pcfg_score.cc +++ b/phrase-extract/pcfg-score/pcfg_score.cc @@ -66,9 +66,7 @@ int PcfgScore::Main(int argc, char *argv[]) // Score corpus according to PCFG. TreeScorer scorer(pcfg, non_term_vocab); - std::set label_set; - std::map top_label_set; - XmlTreeParser parser(label_set, top_label_set); + XmlTreeParser parser; XmlTreeWriter writer(std::cout); std::string line; std::size_t line_num = 0; diff --git a/phrase-extract/syntax-common/xml_tree_parser.cc b/phrase-extract/syntax-common/xml_tree_parser.cc index 6eeb110e9..34f566a03 100644 --- a/phrase-extract/syntax-common/xml_tree_parser.cc +++ b/phrase-extract/syntax-common/xml_tree_parser.cc @@ -10,23 +10,18 @@ #include "XmlException.h" #include "XmlTree.h" +#include "exception.h" + namespace MosesTraining { namespace Syntax { -XmlTreeParser::XmlTreeParser(std::set &labelSet, - std::map &topLabelSet) - : label_set_(labelSet) - , top_label_set_(topLabelSet) -{ -} - std::auto_ptr XmlTreeParser::Parse(const std::string &line, bool unescape) { - line_ = line; + sentence_ = line; node_collection_.Clear(); try { - if (!ProcessAndStripXMLTags(line_, node_collection_, label_set_, + if (!ProcessAndStripXMLTags(sentence_, node_collection_, label_set_, top_label_set_, unescape)) { throw Exception(""); } @@ -34,7 +29,7 @@ std::auto_ptr XmlTreeParser::Parse(const std::string &line, throw Exception(e.getMsg()); } std::auto_ptr root = node_collection_.ExtractTree(); - words_ = util::tokenize(line_); + words_ = util::tokenize(sentence_); AttachWords(words_, *root); return root; } diff --git a/phrase-extract/syntax-common/xml_tree_parser.h b/phrase-extract/syntax-common/xml_tree_parser.h index 0f671c65a..48ea056b8 100644 --- a/phrase-extract/syntax-common/xml_tree_parser.h +++ b/phrase-extract/syntax-common/xml_tree_parser.h @@ -6,12 +6,9 @@ #include #include -#include "SyntaxNode.h" #include "SyntaxNodeCollection.h" #include "SyntaxTree.h" -#include "exception.h" - namespace MosesTraining { namespace Syntax { @@ -25,44 +22,36 @@ namespace Syntax { */ class XmlTreeParser { public: - XmlTreeParser(std::set &, std::map &); - //! Parse a single sentence and return a SyntaxTree (with words attached). - std::auto_ptr Parse(const std::string &, bool=false); + std::auto_ptr Parse(const std::string &, bool unescape=false); - // TODO - //! Get the sentence string (see ProcessAndStripXMLTags) - //const std::string &sentence() const; + //! Get the sentence string (as returned by ProcessAndStripXMLTags). + const std::string &sentence() const { return sentence_; } - // FIXME - //! Get the sentence as a vector of tokens - const std::vector& GetWords() { return words_; } + //! Get the sentence as a vector of words. + const std::vector &words() const { return words_; } - // TODO - //! Get the node collection (see ProcessAndStripXMLTags) - const SyntaxNodeCollection &node_collection() const; - - // TODO - //! Get the label set (see ProcessAndStripXMLTags) - const std::set &label_set() const; - - // TODO - //! 
Get the top label set (see ProcessAndStripXMLTags) - const std::map &top_label_set() const; - - // FIXME - const SyntaxNodeCollection &GetNodeCollection() const { + //! Get the node collection (as returned by ProcessAndStripXMLTags). + const SyntaxNodeCollection &node_collection() const { return node_collection_; } - private: - std::set &label_set_; - std::map &top_label_set_; - std::string line_; - SyntaxNodeCollection node_collection_; - std::vector words_; + //! Get the label set (as returned by ProcessAndStripXMLTags). + const std::set &label_set() const { return label_set_; } + //! Get the top label set (as returned by ProcessAndStripXMLTags). + const std::map &top_label_set() const { + return top_label_set_; + } + + private: void AttachWords(const std::vector &, SyntaxTree &); + + std::string sentence_; + SyntaxNodeCollection node_collection_; + std::set label_set_; + std::map top_label_set_; + std::vector words_; }; } // namespace Syntax From 5e09d3dc71ab8391c651418c01aa5c324e53683b Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Wed, 3 Jun 2015 10:33:46 +0100 Subject: [PATCH 047/108] Ongoing moses/phrase-extract refactoring --- phrase-extract/SyntaxNode.h | 25 +------------- phrase-extract/SyntaxNodeCollection.cpp | 43 ------------------------- phrase-extract/SyntaxNodeCollection.h | 10 +----- phrase-extract/XmlTree.cpp | 5 --- phrase-extract/extract-rules-main.cpp | 19 ++++++++--- 5 files changed, 17 insertions(+), 85 deletions(-) diff --git a/phrase-extract/SyntaxNode.h b/phrase-extract/SyntaxNode.h index 5f57e1790..883f9724f 100644 --- a/phrase-extract/SyntaxNode.h +++ b/phrase-extract/SyntaxNode.h @@ -32,9 +32,6 @@ class SyntaxNode protected: int m_start, m_end; std::string m_label; - std::vector< SyntaxNode* > m_children; - SyntaxNode* m_parent; - float m_pcfgScore; public: typedef std::map AttributeMap; @@ -43,9 +40,7 @@ public: SyntaxNode( int startPos, int endPos, std::string label ) :m_start(startPos) ,m_end(endPos) - ,m_label(label) - ,m_parent(0) - ,m_pcfgScore(0.0f) { + ,m_label(label) { } int GetStart() const { return m_start; @@ -56,24 +51,6 @@ public: std::string GetLabel() const { return m_label; } - float GetPcfgScore() const { - return m_pcfgScore; - } - void SetPcfgScore(float score) { - m_pcfgScore = score; - } - SyntaxNode *GetParent() { - return m_parent; - } - void SetParent(SyntaxNode *parent) { - m_parent = parent; - } - void AddChild(SyntaxNode* child) { - m_children.push_back(child); - } - const std::vector< SyntaxNode* > &GetChildren() const { - return m_children; - } }; } // namespace MosesTraining diff --git a/phrase-extract/SyntaxNodeCollection.cpp b/phrase-extract/SyntaxNodeCollection.cpp index 60a2f6c2f..e1c9c44e1 100644 --- a/phrase-extract/SyntaxNodeCollection.cpp +++ b/phrase-extract/SyntaxNodeCollection.cpp @@ -33,7 +33,6 @@ SyntaxNodeCollection::~SyntaxNodeCollection() void SyntaxNodeCollection::Clear() { - m_top = 0; // loop through all m_nodes, delete them for(size_t i=0; i& SyntaxNodeCollection::GetNodes( int startPos, return endIndex->second; } -void SyntaxNodeCollection::ConnectNodes() -{ - typedef SyntaxTreeIndex2::const_reverse_iterator InnerIterator; - - SyntaxNode *prev = 0; - // Iterate over all start indices from lowest to highest. - for (SyntaxTreeIndexIterator p = m_index.begin(); p != m_index.end(); ++p) { - const SyntaxTreeIndex2 &inner = p->second; - // Iterate over all end indices from highest to lowest. 
- for (InnerIterator q = inner.rbegin(); q != inner.rend(); ++q) { - const std::vector &nodes = q->second; - // Iterate over all nodes that cover the same span in order of tree - // depth, top-most first. - for (std::vector::const_reverse_iterator r = nodes.rbegin(); - r != nodes.rend(); ++r) { - SyntaxNode *node = *r; - if (!prev) { - // node is the root. - m_top = node; - node->SetParent(0); - } else if (prev->GetStart() == node->GetStart()) { - // prev is the parent of node. - assert(prev->GetEnd() >= node->GetEnd()); - node->SetParent(prev); - prev->AddChild(node); - } else { - // prev is a descendant of node's parent. The lowest common - // ancestor of prev and node will be node's parent. - SyntaxNode *ancestor = prev->GetParent(); - while (ancestor->GetEnd() < node->GetEnd()) { - ancestor = ancestor->GetParent(); - } - assert(ancestor); - node->SetParent(ancestor); - ancestor->AddChild(node); - } - prev = node; - } - } - } -} - std::auto_ptr SyntaxNodeCollection::ExtractTree() { std::map nodeToTree; diff --git a/phrase-extract/SyntaxNodeCollection.h b/phrase-extract/SyntaxNodeCollection.h index a0d19841c..c8ca67d3d 100644 --- a/phrase-extract/SyntaxNodeCollection.h +++ b/phrase-extract/SyntaxNodeCollection.h @@ -38,7 +38,6 @@ class SyntaxNodeCollection { protected: std::vector< SyntaxNode* > m_nodes; - SyntaxNode* m_top; typedef std::map< int, std::vector< SyntaxNode* > > SyntaxTreeIndex2; typedef SyntaxTreeIndex2::const_iterator SyntaxTreeIndexIterator2; @@ -49,18 +48,12 @@ protected: std::vector< SyntaxNode* > m_emptyNode; public: - SyntaxNodeCollection() - : m_top(0) // m_top doesn't get set unless ConnectNodes is called. - , m_size(0) {} + SyntaxNodeCollection() : m_size(0) {} ~SyntaxNodeCollection(); SyntaxNode *AddNode( int startPos, int endPos, const std::string &label ); - SyntaxNode *GetTop() { - return m_top; - } - ParentNodes Parse(); bool HasNode( int startPos, int endPos ) const; const std::vector< SyntaxNode* >& GetNodes( int startPos, int endPos ) const; @@ -70,7 +63,6 @@ public: size_t GetNumWords() const { return m_size; } - void ConnectNodes(); void Clear(); std::auto_ptr ExtractTree(); diff --git a/phrase-extract/XmlTree.cpp b/phrase-extract/XmlTree.cpp index d3c5da900..ffbbd453a 100644 --- a/phrase-extract/XmlTree.cpp +++ b/phrase-extract/XmlTree.cpp @@ -398,10 +398,6 @@ bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection, string label = ParseXmlTagAttribute(tagContent,"label"); labelCollection.insert( label ); - string pcfgString = ParseXmlTagAttribute(tagContent,"pcfg"); - float pcfgScore = pcfgString == "" ? 
0.0f - : std::atof(pcfgString.c_str()); - // report what we have processed so far if (0) { cerr << "XML TAG NAME IS: '" << tagName << "'" << endl; @@ -409,7 +405,6 @@ bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection, cerr << "XML SPAN IS: " << startPos << "-" << (endPos-1) << endl; } SyntaxNode *node = nodeCollection.AddNode( startPos, endPos-1, label ); - node->SetPcfgScore(pcfgScore); ParseXmlTagAttributes(tagContent, node->attributes); } } diff --git a/phrase-extract/extract-rules-main.cpp b/phrase-extract/extract-rules-main.cpp index 825f12d89..8f1ff758b 100644 --- a/phrase-extract/extract-rules-main.cpp +++ b/phrase-extract/extract-rules-main.cpp @@ -110,6 +110,8 @@ void collectWordLabelCounts(SentenceAlignmentWithSyntax &sentence ); void writeGlueGrammar(const string &, RuleExtractionOptions &options, set< string > &targetLabelCollection, map< string, int > &targetTopLabelCollection); void writeUnknownWordLabel(const string &); +double getPcfgScore(const SyntaxNode &); + int main(int argc, char* argv[]) { @@ -564,8 +566,7 @@ string ExtractTask::saveTargetHieroPhrase( int startT, int endT, int startS, int } if (m_options.pcfgScore) { - double score = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetPcfgScore(); - logPCFGScore -= score; + logPCFGScore -= getPcfgScore(*m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]); } currPos = hole.GetEnd(1); @@ -689,7 +690,7 @@ void ExtractTask::saveHieroPhrase( int startT, int endT, int startS, int endS // target if (m_options.pcfgScore) { - double logPCFGScore = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0]]->GetPcfgScore(); + double logPCFGScore = getPcfgScore(*m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0]]); rule.target = saveTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore, countS) + " [" + targetLabel + "]"; rule.pcfgScore = std::exp(logPCFGScore); @@ -973,7 +974,7 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, int count rule.target += "[" + targetLabel + "]"; if (m_options.pcfgScore) { - double logPCFGScore = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetPcfgScore(); + double logPCFGScore = getPcfgScore(*m_sentence.targetTree.GetNodes(startT,endT)[0]); rule.pcfgScore = std::exp(logPCFGScore); } @@ -1194,3 +1195,13 @@ void writeUnknownWordLabel(const string & fileName) outFile.close(); } + +double getPcfgScore(const SyntaxNode &node) +{ + double score = 0.0f; + SyntaxNode::AttributeMap::const_iterator p = node.attributes.find("pcfg"); + if (p != node.attributes.end()) { + score = std::atof(p->second.c_str()); + } + return score; +} From ed321791a75c6177b218a0098d184c308bc9c561 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Wed, 3 Jun 2015 11:10:45 +0100 Subject: [PATCH 048/108] Ongoing moses/phrase-extract refactoring --- phrase-extract/SyntaxNode.h | 36 +++++---------- phrase-extract/SyntaxNodeCollection.cpp | 8 ++-- phrase-extract/XmlTree.cpp | 2 +- .../extract-ghkm/AlignmentGraph.cpp | 3 +- phrase-extract/extract-ghkm/ExtractGHKM.cpp | 6 +-- phrase-extract/extract-ghkm/ScfgRule.cpp | 2 +- phrase-extract/extract-rules-main.cpp | 16 +++---- .../filter-rule-table/TreeTsgFilter.cpp | 2 +- phrase-extract/pcfg-extract/rule_extractor.cc | 4 +- phrase-extract/pcfg-score/tree_scorer.cc | 4 +- phrase-extract/relax-parse-main.cpp | 44 +++++++++---------- .../syntax-common/xml_tree_parser.cc | 6 +-- .../syntax-common/xml_tree_writer.cc | 4 +- 13 files changed, 62 insertions(+), 75 
deletions(-) diff --git a/phrase-extract/SyntaxNode.h b/phrase-extract/SyntaxNode.h index 883f9724f..f38e94713 100644 --- a/phrase-extract/SyntaxNode.h +++ b/phrase-extract/SyntaxNode.h @@ -20,37 +20,23 @@ #pragma once #include -#include #include -#include -namespace MosesTraining -{ +namespace MosesTraining { -class SyntaxNode -{ -protected: - int m_start, m_end; - std::string m_label; -public: +struct SyntaxNode { typedef std::map AttributeMap; - AttributeMap attributes; + SyntaxNode(const std::string &label_, int start_, int end_) + : label(label_) + , start(start_) + , end(end_) { + } - SyntaxNode( int startPos, int endPos, std::string label ) - :m_start(startPos) - ,m_end(endPos) - ,m_label(label) { - } - int GetStart() const { - return m_start; - } - int GetEnd() const { - return m_end; - } - std::string GetLabel() const { - return m_label; - } + std::string label; + int start; + int end; + AttributeMap attributes; }; } // namespace MosesTraining diff --git a/phrase-extract/SyntaxNodeCollection.cpp b/phrase-extract/SyntaxNodeCollection.cpp index e1c9c44e1..7421cc0ed 100644 --- a/phrase-extract/SyntaxNodeCollection.cpp +++ b/phrase-extract/SyntaxNodeCollection.cpp @@ -44,7 +44,7 @@ void SyntaxNodeCollection::Clear() SyntaxNode *SyntaxNodeCollection::AddNode(int startPos, int endPos, const std::string &label) { - SyntaxNode* newNode = new SyntaxNode( startPos, endPos, label ); + SyntaxNode* newNode = new SyntaxNode(label, startPos, endPos); m_nodes.push_back( newNode ); m_index[ startPos ][ endPos ].push_back( newNode ); m_size = std::max(endPos+1, m_size); @@ -141,16 +141,16 @@ std::auto_ptr SyntaxNodeCollection::ExtractTree() // node is the root. root = tree; tree->parent() = 0; - } else if (prevNode->GetStart() == node->GetStart()) { + } else if (prevNode->start == node->start) { // prevNode is the parent of node. - assert(prevNode->GetEnd() >= node->GetEnd()); + assert(prevNode->end >= node->end); tree->parent() = prevTree; prevTree->children().push_back(tree); } else { // prevNode is a descendant of node's parent. The lowest common // ancestor of prevNode and node will be node's parent. SyntaxTree *ancestor = prevTree->parent(); - while (ancestor->value().GetEnd() < tree->value().GetEnd()) { + while (ancestor->value().end < tree->value().end) { ancestor = ancestor->parent(); } assert(ancestor); diff --git a/phrase-extract/XmlTree.cpp b/phrase-extract/XmlTree.cpp index ffbbd453a..d8b77b6e6 100644 --- a/phrase-extract/XmlTree.cpp +++ b/phrase-extract/XmlTree.cpp @@ -419,7 +419,7 @@ bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection, const vector< SyntaxNode* >& topNodes = nodeCollection.GetNodes( 0, wordPos-1 ); for( vector< SyntaxNode* >::const_iterator node = topNodes.begin(); node != topNodes.end(); node++ ) { SyntaxNode *n = *node; - const string &label = n->GetLabel(); + const string &label = n->label; if (topLabelCollection.find( label ) == topLabelCollection.end()) topLabelCollection[ label ] = 0; topLabelCollection[ label ]++; diff --git a/phrase-extract/extract-ghkm/AlignmentGraph.cpp b/phrase-extract/extract-ghkm/AlignmentGraph.cpp index 1a3c23de5..7c179295f 100644 --- a/phrase-extract/extract-ghkm/AlignmentGraph.cpp +++ b/phrase-extract/extract-ghkm/AlignmentGraph.cpp @@ -21,6 +21,7 @@ #include #include +#include #include #include @@ -213,7 +214,7 @@ Node *AlignmentGraph::CopyParseTree(const SyntaxTree *root) { NodeType nodeType = (root->IsLeaf()) ? 
TARGET : TREE; - std::auto_ptr n(new Node(root->value().GetLabel(), nodeType)); + std::auto_ptr n(new Node(root->value().label, nodeType)); if (nodeType == TREE) { float score = 0.0f; diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp index c48a37367..c96cda146 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp +++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp @@ -813,7 +813,7 @@ void ExtractGHKM::CollectWordLabelCounts( for (SyntaxTree::ConstLeafIterator p(root); p != SyntaxTree::ConstLeafIterator(); ++p) { const SyntaxTree &leaf = *p; - const std::string &word = leaf.value().GetLabel(); + const std::string &word = leaf.value().label; const SyntaxTree *ancestor = leaf.parent(); // If unary rule elimination is enabled and this word is at the end of a // chain of unary rewrites, e.g. @@ -825,7 +825,7 @@ void ExtractGHKM::CollectWordLabelCounts( ancestor->parent()->children().size() == 1) { ancestor = ancestor->parent(); } - const std::string &label = ancestor->value().GetLabel(); + const std::string &label = ancestor->value().label; ++wordCount[word]; wordLabel[word] = label; } @@ -837,7 +837,7 @@ std::vector ExtractGHKM::ReadTokens(const SyntaxTree &root) const for (SyntaxTree::ConstLeafIterator p(root); p != SyntaxTree::ConstLeafIterator(); ++p) { const SyntaxTree &leaf = *p; - const std::string &word = leaf.value().GetLabel(); + const std::string &word = leaf.value().label; tokens.push_back(word); } return tokens; diff --git a/phrase-extract/extract-ghkm/ScfgRule.cpp b/phrase-extract/extract-ghkm/ScfgRule.cpp index a6fc19dd9..1a49c862e 100644 --- a/phrase-extract/extract-ghkm/ScfgRule.cpp +++ b/phrase-extract/extract-ghkm/ScfgRule.cpp @@ -144,7 +144,7 @@ void ScfgRule::PushSourceLabel(const SyntaxNodeCollection *sourceNodeCollection, sourceNodeCollection->GetNodes(span.first,span.second); if (!sourceLabels.empty()) { // store the topmost matching label from the source syntax tree - m_sourceLabels.push_back(sourceLabels.back()->GetLabel()); + m_sourceLabels.push_back(sourceLabels.back()->label); } } else { // no matching source-side syntactic constituent: store nonMatchingLabel diff --git a/phrase-extract/extract-rules-main.cpp b/phrase-extract/extract-rules-main.cpp index 8f1ff758b..e6fff965d 100644 --- a/phrase-extract/extract-rules-main.cpp +++ b/phrase-extract/extract-rules-main.cpp @@ -507,7 +507,7 @@ void ExtractTask::preprocessSourceHieroPhrase( int startT, int endT, int startS, int labelI = labelIndex[ 2+holeCount+holeTotal ]; string label = m_options.sourceSyntax ? 
- m_sentence.sourceTree.GetNodes(currPos,hole.GetEnd(0))[ labelI ]->GetLabel() : "X"; + m_sentence.sourceTree.GetNodes(currPos,hole.GetEnd(0))[ labelI ]->label : "X"; hole.SetLabel(label, 0); currPos = hole.GetEnd(0); @@ -550,7 +550,7 @@ string ExtractTask::saveTargetHieroPhrase( int startT, int endT, int startS, int int labelI = labelIndex[ 2+holeCount ]; string targetLabel; if (m_options.targetSyntax) { - targetLabel = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetLabel(); + targetLabel = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->label; } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) { targetLabel = "S"; } else { @@ -675,7 +675,7 @@ void ExtractTask::saveHieroPhrase( int startT, int endT, int startS, int endS // phrase labels string targetLabel; if (m_options.targetSyntax) { - targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0] ]->GetLabel(); + targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0] ]->label; } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) { targetLabel = "S"; } else { @@ -683,7 +683,7 @@ void ExtractTask::saveHieroPhrase( int startT, int endT, int startS, int endS } string sourceLabel = m_options.sourceSyntax ? - m_sentence.sourceTree.GetNodes(startS,endS)[ labelIndex[1] ]->GetLabel() : "X"; + m_sentence.sourceTree.GetNodes(startS,endS)[ labelIndex[1] ]->label : "X"; // create non-terms on the source side preprocessSourceHieroPhrase(startT, endT, startS, endS, indexS, holeColl, labelIndex); @@ -947,13 +947,13 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, int count // phrase labels string targetLabel,sourceLabel; if (m_options.targetSyntax && m_options.conditionOnTargetLhs) { - sourceLabel = targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetLabel(); + sourceLabel = targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->label; } else { sourceLabel = m_options.sourceSyntax ? 
- m_sentence.sourceTree.GetNodes(startS,endS)[0]->GetLabel() : "X"; + m_sentence.sourceTree.GetNodes(startS,endS)[0]->label : "X"; if (m_options.targetSyntax) { - targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetLabel(); + targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->label; } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) { targetLabel = "S"; } else { @@ -1166,7 +1166,7 @@ void collectWordLabelCounts( SentenceAlignmentWithSyntax &sentence ) const vector< SyntaxNode* >& labels = sentence.targetTree.GetNodes(ti,ti); if (labels.size() > 0) { wordCount[ word ]++; - wordLabel[ word ] = labels[0]->GetLabel(); + wordLabel[ word ] = labels[0]->label; } } } diff --git a/phrase-extract/filter-rule-table/TreeTsgFilter.cpp b/phrase-extract/filter-rule-table/TreeTsgFilter.cpp index 17a8dcb22..b9c58228d 100644 --- a/phrase-extract/filter-rule-table/TreeTsgFilter.cpp +++ b/phrase-extract/filter-rule-table/TreeTsgFilter.cpp @@ -27,7 +27,7 @@ TreeTsgFilter::TreeTsgFilter( TreeTsgFilter::IdTree *TreeTsgFilter::SyntaxTreeToIdTree(const SyntaxTree &s) { - IdTree *t = new IdTree(m_testVocab.Insert(s.value().GetLabel())); + IdTree *t = new IdTree(m_testVocab.Insert(s.value().label)); const std::vector &sChildren = s.children(); std::vector &tChildren = t->children(); tChildren.reserve(sChildren.size()); diff --git a/phrase-extract/pcfg-extract/rule_extractor.cc b/phrase-extract/pcfg-extract/rule_extractor.cc index 39da54ef2..f20f2d978 100644 --- a/phrase-extract/pcfg-extract/rule_extractor.cc +++ b/phrase-extract/pcfg-extract/rule_extractor.cc @@ -37,7 +37,7 @@ void RuleExtractor::Extract(const SyntaxTree &tree, RuleCollection &rc) const return; } - std::size_t lhs = non_term_vocab_.Insert(tree.value().GetLabel()); + std::size_t lhs = non_term_vocab_.Insert(tree.value().label); std::vector rhs; const std::vector &children = tree.children(); @@ -45,7 +45,7 @@ void RuleExtractor::Extract(const SyntaxTree &tree, RuleCollection &rc) const for (std::vector::const_iterator p(children.begin()); p != children.end(); ++p) { const SyntaxTree &child = **p; - rhs.push_back(non_term_vocab_.Insert(child.value().GetLabel())); + rhs.push_back(non_term_vocab_.Insert(child.value().label)); Extract(child, rc); } rc.Add(lhs, rhs); diff --git a/phrase-extract/pcfg-score/tree_scorer.cc b/phrase-extract/pcfg-score/tree_scorer.cc index 61ae16e4c..3c6b6b0c8 100644 --- a/phrase-extract/pcfg-score/tree_scorer.cc +++ b/phrase-extract/pcfg-score/tree_scorer.cc @@ -58,13 +58,13 @@ bool TreeScorer::CalcScores(SyntaxTree &root) std::vector key; key.reserve(children.size()+1); - key.push_back(non_term_vocab_.Lookup(root.value().GetLabel())); + key.push_back(non_term_vocab_.Lookup(root.value().label)); for (std::vector::const_iterator p(children.begin()); p != children.end(); ++p) { SyntaxTree *child = *p; assert(!child->IsLeaf()); - key.push_back(non_term_vocab_.Lookup(child->value().GetLabel())); + key.push_back(non_term_vocab_.Lookup(child->value().label)); if (!CalcScores(*child)) { return false; } diff --git a/phrase-extract/relax-parse-main.cpp b/phrase-extract/relax-parse-main.cpp index 5bca886bf..4b5c2d573 100644 --- a/phrase-extract/relax-parse-main.cpp +++ b/phrase-extract/relax-parse-main.cpp @@ -118,9 +118,9 @@ void store( SyntaxNodeCollection &tree, const vector< string > &words ) // output tree nodes vector< SyntaxNode* > nodes = tree.GetAllNodes(); for( size_t i=0; iGetStart() - << "-" << nodes[i]->GetEnd() - << "\" label=\"" << nodes[i]->GetLabel() + cout << " start + << 
"-" << nodes[i]->end + << "\" label=\"" << nodes[i]->label << "\"/>"; } cout << endl; @@ -133,7 +133,7 @@ void LeftBinarize( SyntaxNodeCollection &tree, ParentNodes &parents ) if (point.size() > 3) { const vector< SyntaxNode* >& topNodes = tree.GetNodes( point[0], point[point.size()-1]-1); - string topLabel = topNodes[0]->GetLabel(); + string topLabel = topNodes[0]->label; for(size_t i=2; i& topNodes = tree.GetNodes( point[0], endPoint); - string topLabel = topNodes[0]->GetLabel(); + string topLabel = topNodes[0]->label; for(size_t i=1; iGetLabel() << "+" << tree.GetNodes(point[i+1],point[i+2]-1)[0]->GetLabel() << endl; + // cerr << "\tadding " << point[i] << ";" << point[i+1] << ";" << (point[i+2]-1) << ": " << tree.GetNodes(point[i ],point[i+1]-1)[0]->label << "+" << tree.GetNodes(point[i+1],point[i+2]-1)[0]->label << endl; newTree.AddNode( point[i],point[i+2]-1, - tree.GetNodes(point[i ],point[i+1]-1)[0]->GetLabel() + tree.GetNodes(point[i ],point[i+1]-1)[0]->label + "+" + - tree.GetNodes(point[i+1],point[i+2]-1)[0]->GetLabel() ); + tree.GetNodes(point[i+1],point[i+2]-1)[0]->label); } } if (point.size() >= 4) { int ps = point.size(); - string topLabel = tree.GetNodes(point[0],point[ps-1]-1)[0]->GetLabel(); + string topLabel = tree.GetNodes(point[0],point[ps-1]-1)[0]->label; - // cerr << "\tadding " << topLabel + "\\" + tree.GetNodes(point[0],point[1]-1)[0]->GetLabel() << endl; + // cerr << "\tadding " << topLabel + "\\" + tree.GetNodes(point[0],point[1]-1)[0]->label << endl; newTree.AddNode( point[1],point[ps-1]-1, topLabel + "\\" + - tree.GetNodes(point[0],point[1]-1)[0]->GetLabel() ); + tree.GetNodes(point[0],point[1]-1)[0]->label ); - // cerr << "\tadding " << topLabel + "/" + tree.GetNodes(point[ps-2],point[ps-1]-1)[0]->GetLabel() << endl; + // cerr << "\tadding " << topLabel + "/" + tree.GetNodes(point[ps-2],point[ps-1]-1)[0]->label << endl; newTree.AddNode( point[0],point[ps-2]-1, topLabel + "/" + - tree.GetNodes(point[ps-2],point[ps-1]-1)[0]->GetLabel() ); + tree.GetNodes(point[ps-2],point[ps-1]-1)[0]->label ); } } @@ -219,12 +219,12 @@ void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents ) for(int mid=start+1; mid<=end && !done; mid++) { if (tree.HasNode(start,mid-1) && tree.HasNode(mid,end)) { - // cerr << "\tadding " << tree.GetNodes(start,mid-1)[0]->GetLabel() << "++" << tree.GetNodes(mid, end )[0]->GetLabel() << endl; + // cerr << "\tadding " << tree.GetNodes(start,mid-1)[0]->label << "++" << tree.GetNodes(mid, end )[0]->label << endl; newTree.AddNode( start, end, - tree.GetNodes(start,mid-1)[0]->GetLabel() + tree.GetNodes(start,mid-1)[0]->label + "++" + - tree.GetNodes(mid, end )[0]->GetLabel() ); + tree.GetNodes(mid, end )[0]->label ); done = true; } } @@ -234,9 +234,9 @@ void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents ) for(int postEnd=end+1; postEndGetLabel() + tree.GetNodes(start,postEnd)[0]->label + "//" + - tree.GetNodes(end+1,postEnd)[0]->GetLabel() ); + tree.GetNodes(end+1,postEnd)[0]->label ); done = true; } } @@ -245,11 +245,11 @@ void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents ) // if matching a constituent A left-minus constituent B: use A\\B for(int preStart=start-1; preStart>=0; preStart--) { if (tree.HasNode(preStart,end) && tree.HasNode(preStart,start-1)) { - // cerr << "\tadding " << tree.GetNodes(preStart,end )[0]->GetLabel() << "\\\\" <GetLabel() << endl; + // cerr << "\tadding " << tree.GetNodes(preStart,end )[0]->label << "\\\\" <label << endl; newTree.AddNode( start, end, - tree.GetNodes(preStart,end )[0]->GetLabel() + 
tree.GetNodes(preStart,end )[0]->label + "\\\\" + - tree.GetNodes(preStart,start-1)[0]->GetLabel() ); + tree.GetNodes(preStart,start-1)[0]->label ); done = true; } } @@ -268,6 +268,6 @@ void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents ) // adding all new nodes vector< SyntaxNode* > nodes = newTree.GetAllNodes(); for( size_t i=0; iGetStart(), nodes[i]->GetEnd(), nodes[i]->GetLabel()); + tree.AddNode( nodes[i]->start, nodes[i]->end, nodes[i]->label); } } diff --git a/phrase-extract/syntax-common/xml_tree_parser.cc b/phrase-extract/syntax-common/xml_tree_parser.cc index 34f566a03..8bd511522 100644 --- a/phrase-extract/syntax-common/xml_tree_parser.cc +++ b/phrase-extract/syntax-common/xml_tree_parser.cc @@ -47,15 +47,15 @@ void XmlTreeParser::AttachWords(const std::vector &words, for (std::vector::iterator p = leaves.begin(); p != leaves.end(); ++p) { SyntaxTree *leaf = *p; - const int start = leaf->value().GetStart(); - const int end = leaf->value().GetEnd(); + const int start = leaf->value().start; + const int end = leaf->value().end; if (start != end) { std::ostringstream msg; msg << "leaf node covers multiple words (" << start << "-" << end << "): this is currently unsupported"; throw Exception(msg.str()); } - SyntaxTree *newLeaf = new SyntaxTree(SyntaxNode(start, end, *q++)); + SyntaxTree *newLeaf = new SyntaxTree(SyntaxNode(*q++, start, end)); leaf->children().push_back(newLeaf); newLeaf->parent() = leaf; } diff --git a/phrase-extract/syntax-common/xml_tree_writer.cc b/phrase-extract/syntax-common/xml_tree_writer.cc index 3c16cb2eb..d17937fa8 100644 --- a/phrase-extract/syntax-common/xml_tree_writer.cc +++ b/phrase-extract/syntax-common/xml_tree_writer.cc @@ -16,7 +16,7 @@ void XmlTreeWriter::Write(const SyntaxTree &tree) const { assert(!tree.IsLeaf()); // Opening tag - out_ << " Date: Wed, 3 Jun 2015 14:09:49 +0100 Subject: [PATCH 049/108] Ongoing moses/phrase-extract refactoring --- phrase-extract/SyntaxNodeCollection.cpp | 20 ++++++----- phrase-extract/SyntaxNodeCollection.h | 44 ++++++++++++++++--------- 2 files changed, 39 insertions(+), 25 deletions(-) diff --git a/phrase-extract/SyntaxNodeCollection.cpp b/phrase-extract/SyntaxNodeCollection.cpp index 7421cc0ed..356c49bf4 100644 --- a/phrase-extract/SyntaxNodeCollection.cpp +++ b/phrase-extract/SyntaxNodeCollection.cpp @@ -47,7 +47,7 @@ SyntaxNode *SyntaxNodeCollection::AddNode(int startPos, int endPos, SyntaxNode* newNode = new SyntaxNode(label, startPos, endPos); m_nodes.push_back( newNode ); m_index[ startPos ][ endPos ].push_back( newNode ); - m_size = std::max(endPos+1, m_size); + m_numWords = std::max(endPos+1, m_numWords); return newNode; } @@ -56,8 +56,8 @@ ParentNodes SyntaxNodeCollection::Parse() ParentNodes parents; // looping through all spans of size >= 2 - for( int length=2; length<=m_size; length++ ) { - for( int startPos = 0; startPos <= m_size-length; startPos++ ) { + for( int length=2; length<=m_numWords; length++ ) { + for( int startPos = 0; startPos <= m_numWords-length; startPos++ ) { if (HasNode( startPos, startPos+length-1 )) { // processing one (parent) span @@ -96,13 +96,14 @@ bool SyntaxNodeCollection::HasNode( int startPos, int endPos ) const return GetNodes( startPos, endPos).size() > 0; } -const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodes( int startPos, int endPos ) const +const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodes( + int startPos, int endPos ) const { - SyntaxTreeIndexIterator startIndex = m_index.find( startPos ); + NodeIndex::const_iterator startIndex 
= m_index.find( startPos ); if (startIndex == m_index.end() ) return m_emptyNode; - SyntaxTreeIndexIterator2 endIndex = startIndex->second.find( endPos ); + InnerNodeIndex::const_iterator endIndex = startIndex->second.find( endPos ); if (endIndex == startIndex->second.end()) return m_emptyNode; @@ -120,14 +121,15 @@ std::auto_ptr SyntaxNodeCollection::ExtractTree() } // Connect the SyntaxTrees. - typedef SyntaxTreeIndex2::const_reverse_iterator InnerIterator; + typedef NodeIndex::const_iterator OuterIterator; + typedef InnerNodeIndex::const_reverse_iterator InnerIterator; SyntaxTree *root = 0; SyntaxNode *prevNode = 0; SyntaxTree *prevTree = 0; // Iterate over all start indices from lowest to highest. - for (SyntaxTreeIndexIterator p = m_index.begin(); p != m_index.end(); ++p) { - const SyntaxTreeIndex2 &inner = p->second; + for (OuterIterator p = m_index.begin(); p != m_index.end(); ++p) { + const InnerNodeIndex &inner = p->second; // Iterate over all end indices from highest to lowest. for (InnerIterator q = inner.rbegin(); q != inner.rend(); ++q) { const std::vector &nodes = q->second; diff --git a/phrase-extract/SyntaxNodeCollection.h b/phrase-extract/SyntaxNodeCollection.h index c8ca67d3d..060192980 100644 --- a/phrase-extract/SyntaxNodeCollection.h +++ b/phrase-extract/SyntaxNodeCollection.h @@ -34,38 +34,50 @@ namespace MosesTraining typedef std::vector< int > SplitPoints; typedef std::vector< SplitPoints > ParentNodes; +/** A collection of SyntaxNodes organized by start and end position. + * + */ class SyntaxNodeCollection { -protected: - std::vector< SyntaxNode* > m_nodes; - - typedef std::map< int, std::vector< SyntaxNode* > > SyntaxTreeIndex2; - typedef SyntaxTreeIndex2::const_iterator SyntaxTreeIndexIterator2; - typedef std::map< int, SyntaxTreeIndex2 > SyntaxTreeIndex; - typedef SyntaxTreeIndex::const_iterator SyntaxTreeIndexIterator; - SyntaxTreeIndex m_index; - int m_size; - std::vector< SyntaxNode* > m_emptyNode; - public: - SyntaxNodeCollection() : m_size(0) {} + SyntaxNodeCollection() : m_numWords(0) {} ~SyntaxNodeCollection(); + //! Construct and insert a new SyntaxNode. SyntaxNode *AddNode( int startPos, int endPos, const std::string &label ); + // TODO Rename (and move?) ParentNodes Parse(); + + //! Return true iff there are one or more SyntaxNodes with the given span. bool HasNode( int startPos, int endPos ) const; + + //! Lookup the SyntaxNodes for a given span. const std::vector< SyntaxNode* >& GetNodes( int startPos, int endPos ) const; - const std::vector< SyntaxNode* >& GetAllNodes() { - return m_nodes; - }; + + //! Get a vector of pointers to all SyntaxNodes (unordered). + const std::vector< SyntaxNode* >& GetAllNodes() { return m_nodes; }; + size_t GetNumWords() const { - return m_size; + return m_numWords; } void Clear(); std::auto_ptr ExtractTree(); + +private: + typedef std::map< int, std::vector< SyntaxNode* > > InnerNodeIndex; + typedef std::map< int, InnerNodeIndex > NodeIndex; + + // Not copyable. 
+ SyntaxNodeCollection(const SyntaxNodeCollection &); + SyntaxNodeCollection &operator=(const SyntaxNodeCollection &); + + std::vector< SyntaxNode* > m_nodes; + NodeIndex m_index; + int m_numWords; + std::vector< SyntaxNode* > m_emptyNode; }; } // namespace MosesTraining From 8653bd81590d1f9f658d9560458dc72d9556e197 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Wed, 3 Jun 2015 14:20:00 +0100 Subject: [PATCH 050/108] Ongoing moses/phrase-extract refactoring --- phrase-extract/SyntaxNodeCollection.cpp | 40 ---------------------- phrase-extract/SyntaxNodeCollection.h | 6 ---- phrase-extract/relax-parse-main.cpp | 44 ++++++++++++++++++++++++- phrase-extract/relax-parse.h | 10 ++++-- 4 files changed, 50 insertions(+), 50 deletions(-) diff --git a/phrase-extract/SyntaxNodeCollection.cpp b/phrase-extract/SyntaxNodeCollection.cpp index 356c49bf4..0a344fcd7 100644 --- a/phrase-extract/SyntaxNodeCollection.cpp +++ b/phrase-extract/SyntaxNodeCollection.cpp @@ -51,46 +51,6 @@ SyntaxNode *SyntaxNodeCollection::AddNode(int startPos, int endPos, return newNode; } -ParentNodes SyntaxNodeCollection::Parse() -{ - ParentNodes parents; - - // looping through all spans of size >= 2 - for( int length=2; length<=m_numWords; length++ ) { - for( int startPos = 0; startPos <= m_numWords-length; startPos++ ) { - if (HasNode( startPos, startPos+length-1 )) { - // processing one (parent) span - - //std::cerr << "# " << startPos << "-" << (startPos+length-1) << ":"; - SplitPoints splitPoints; - splitPoints.push_back( startPos ); - //std::cerr << " " << startPos; - - int first = 1; - int covered = 0; - int found_somehing = 1; // break loop if nothing found - while( covered < length && found_somehing ) { - // find largest covering subspan (child) - // starting at last covered position - found_somehing = 0; - for( int midPos=length-first; midPos>covered; midPos-- ) { - if( HasNode( startPos+covered, startPos+midPos-1 ) ) { - covered = midPos; - splitPoints.push_back( startPos+covered ); - // std::cerr << " " << ( startPos+covered ); - first = 0; - found_somehing = 1; - } - } - } - // std::cerr << std::endl; - parents.push_back( splitPoints ); - } - } - } - return parents; -} - bool SyntaxNodeCollection::HasNode( int startPos, int endPos ) const { return GetNodes( startPos, endPos).size() > 0; diff --git a/phrase-extract/SyntaxNodeCollection.h b/phrase-extract/SyntaxNodeCollection.h index 060192980..8de151c55 100644 --- a/phrase-extract/SyntaxNodeCollection.h +++ b/phrase-extract/SyntaxNodeCollection.h @@ -31,9 +31,6 @@ namespace MosesTraining { -typedef std::vector< int > SplitPoints; -typedef std::vector< SplitPoints > ParentNodes; - /** A collection of SyntaxNodes organized by start and end position. * */ @@ -47,9 +44,6 @@ public: //! Construct and insert a new SyntaxNode. SyntaxNode *AddNode( int startPos, int endPos, const std::string &label ); - // TODO Rename (and move?) - ParentNodes Parse(); - //! Return true iff there are one or more SyntaxNodes with the given span. 
bool HasNode( int startPos, int endPos ) const; diff --git a/phrase-extract/relax-parse-main.cpp b/phrase-extract/relax-parse-main.cpp index 4b5c2d573..f7a2a271b 100644 --- a/phrase-extract/relax-parse-main.cpp +++ b/phrase-extract/relax-parse-main.cpp @@ -50,7 +50,7 @@ int main(int argc, char* argv[]) // output tree // cerr << "BEFORE:" << endl << tree; - ParentNodes parents = tree.Parse(); + ParentNodes parents = determineSplitPoints(tree); // execute selected grammar relaxation schemes if (leftBinarizeFlag) @@ -271,3 +271,45 @@ void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents ) tree.AddNode( nodes[i]->start, nodes[i]->end, nodes[i]->label); } } + +ParentNodes determineSplitPoints(const SyntaxNodeCollection &nodeColl) +{ + ParentNodes parents; + + const std::size_t numWords = nodeColl.GetNumWords(); + + // looping through all spans of size >= 2 + for( int length=2; length<=numWords; length++ ) { + for( int startPos = 0; startPos <= numWords-length; startPos++ ) { + if (nodeColl.HasNode( startPos, startPos+length-1 )) { + // processing one (parent) span + + //std::cerr << "# " << startPos << "-" << (startPos+length-1) << ":"; + SplitPoints splitPoints; + splitPoints.push_back( startPos ); + //std::cerr << " " << startPos; + + int first = 1; + int covered = 0; + int found_somehing = 1; // break loop if nothing found + while( covered < length && found_somehing ) { + // find largest covering subspan (child) + // starting at last covered position + found_somehing = 0; + for( int midPos=length-first; midPos>covered; midPos-- ) { + if( nodeColl.HasNode( startPos+covered, startPos+midPos-1 ) ) { + covered = midPos; + splitPoints.push_back( startPos+covered ); + // std::cerr << " " << ( startPos+covered ); + first = 0; + found_somehing = 1; + } + } + } + // std::cerr << std::endl; + parents.push_back( splitPoints ); + } + } + } + return parents; +} diff --git a/phrase-extract/relax-parse.h b/phrase-extract/relax-parse.h index a00aa6deb..7c412646a 100644 --- a/phrase-extract/relax-parse.h +++ b/phrase-extract/relax-parse.h @@ -37,10 +37,14 @@ bool leftBinarizeFlag = false; bool rightBinarizeFlag = false; char SAMTLevel = 0; +typedef std::vector< int > SplitPoints; +typedef std::vector< SplitPoints > ParentNodes; + // functions void init(int argc, char* argv[]); +ParentNodes determineSplitPoints(const MosesTraining::SyntaxNodeCollection &); void store( MosesTraining::SyntaxNodeCollection &tree, const std::vector &words ); -void LeftBinarize( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents ); -void RightBinarize( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents ); -void SAMT( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents ); +void LeftBinarize( MosesTraining::SyntaxNodeCollection &tree, ParentNodes &parents ); +void RightBinarize( MosesTraining::SyntaxNodeCollection &tree, ParentNodes &parents ); +void SAMT( MosesTraining::SyntaxNodeCollection &tree, ParentNodes &parents ); From ca82e9a244773d834b7ffdef548f1966f040a4d5 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 4 Jun 2015 16:34:02 +0400 Subject: [PATCH 051/108] don't run beautify from cruise control. 
Not master, a particular commit --- cruise-control/test_all_new_commits.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/cruise-control/test_all_new_commits.sh b/cruise-control/test_all_new_commits.sh index bb9305768..433857eb5 100755 --- a/cruise-control/test_all_new_commits.sh +++ b/cruise-control/test_all_new_commits.sh @@ -107,8 +107,6 @@ function run_single_test () { #regtest_dir=$PWD/$(basename $regtest_file .tgz) cd .. - ./scripts/other/beautify.py --format --skip-perltidy - echo "## ./bjam clean" >> $longlog ./bjam clean $MCC_CONFIGURE_ARGS --with-regtest=$regtest_dir >> $longlog 2>&1 || warn "bjam clean failed, suspicious" @@ -154,7 +152,6 @@ function run_single_test () { date >> $longlog if [ -z "$err" ]; then - git commit -am "automatic daily beautifier" status="OK" else git reset --hard HEAD From 5696a59ae46862221901226cfd232b18ddf74357 Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Thu, 4 Jun 2015 13:41:46 +0100 Subject: [PATCH 052/108] daily automatic beautifier --- mert/Fdstream.h | 4 ++- misc/processLexicalTableMin.cpp | 8 ++--- misc/processPhraseTableMin.cpp | 8 ++--- moses/FF/GlobalLexicalModel.cpp | 2 +- moses/FF/GlobalLexicalModelUnlimited.cpp | 2 +- moses/IOWrapper.cpp | 11 +++---- moses/StaticData.cpp | 11 +++---- moses/parameters/BookkeepingOptions.cpp | 28 ++++++++-------- moses/parameters/BookkeepingOptions.h | 12 +++---- moses/parameters/NBestOptions.cpp | 26 +++++++-------- moses/parameters/NBestOptions.h | 32 +++++++++---------- phrase-extract/SyntaxNode.h | 3 +- phrase-extract/SyntaxNodeCollection.cpp | 4 +-- phrase-extract/SyntaxNodeCollection.h | 4 ++- phrase-extract/consolidate-main.cpp | 22 ++++++------- .../extract-ghkm/AlignmentGraph.cpp | 2 +- phrase-extract/extract-ghkm/ExtractGHKM.cpp | 24 +++++++------- phrase-extract/extract-main.cpp | 14 ++++---- .../filter-rule-table/FilterRuleTable.cpp | 2 +- phrase-extract/score-main.cpp | 16 +++++----- 20 files changed, 117 insertions(+), 118 deletions(-) diff --git a/mert/Fdstream.h b/mert/Fdstream.h index f6d4f039e..61529db6f 100644 --- a/mert/Fdstream.h +++ b/mert/Fdstream.h @@ -67,7 +67,9 @@ private: protected: /// For child classes only: retrieve filebuf. - __gnu_cxx::stdio_filebuf *get_filebuf() { return _filebuf; } + __gnu_cxx::stdio_filebuf *get_filebuf() { + return _filebuf; + } }; class ifdstream : public _fdstream diff --git a/misc/processLexicalTableMin.cpp b/misc/processLexicalTableMin.cpp index 8eee489ad..fac3d632c 100644 --- a/misc/processLexicalTableMin.cpp +++ b/misc/processLexicalTableMin.cpp @@ -55,10 +55,10 @@ int main(int argc, char** argv) size_t quantize = 0; size_t threads = - #ifdef WITH_THREADS - boost::thread::hardware_concurrency() ? boost::thread::hardware_concurrency() : - #endif - 1; +#ifdef WITH_THREADS + boost::thread::hardware_concurrency() ? boost::thread::hardware_concurrency() : +#endif + 1; if(1 >= argc) { printHelp(argv); diff --git a/misc/processPhraseTableMin.cpp b/misc/processPhraseTableMin.cpp index 3948a692c..a124d25df 100644 --- a/misc/processPhraseTableMin.cpp +++ b/misc/processPhraseTableMin.cpp @@ -68,10 +68,10 @@ int main(int argc, char **argv) size_t sortScoreIndex = 2; bool warnMe = true; size_t threads = - #ifdef WITH_THREADS - boost::thread::hardware_concurrency() ? boost::thread::hardware_concurrency() : - #endif - 1; +#ifdef WITH_THREADS + boost::thread::hardware_concurrency() ? 
boost::thread::hardware_concurrency() : +#endif + 1; if(1 >= argc) { printHelp(argv); diff --git a/moses/FF/GlobalLexicalModel.cpp b/moses/FF/GlobalLexicalModel.cpp index b5a07b1ef..ef3fa4691 100644 --- a/moses/FF/GlobalLexicalModel.cpp +++ b/moses/FF/GlobalLexicalModel.cpp @@ -112,7 +112,7 @@ void GlobalLexicalModel::Load() void GlobalLexicalModel::InitializeForInput(ttasksptr const& ttask) { UTIL_THROW_IF2(ttask->GetSource()->GetType() != SentenceInput, - "GlobalLexicalModel works only with sentence input."); + "GlobalLexicalModel works only with sentence input."); Sentence const* s = reinterpret_cast(ttask->GetSource().get()); m_local.reset(new ThreadLocalStorage); m_local->input = s; diff --git a/moses/FF/GlobalLexicalModelUnlimited.cpp b/moses/FF/GlobalLexicalModelUnlimited.cpp index d507054c2..675af2b6b 100644 --- a/moses/FF/GlobalLexicalModelUnlimited.cpp +++ b/moses/FF/GlobalLexicalModelUnlimited.cpp @@ -108,7 +108,7 @@ bool GlobalLexicalModelUnlimited::Load(const std::string &filePathSource, void GlobalLexicalModelUnlimited::InitializeForInput(ttasksptr const& ttask) { UTIL_THROW_IF2(ttask->GetSource()->GetType() != SentenceInput, - "GlobalLexicalModel works only with sentence input."); + "GlobalLexicalModel works only with sentence input."); Sentence const* s = reinterpret_cast(ttask->GetSource().get()); m_local.reset(new ThreadLocalStorage); m_local->input = s; diff --git a/moses/IOWrapper.cpp b/moses/IOWrapper.cpp index d1bdeb44f..94287dd0b 100644 --- a/moses/IOWrapper.cpp +++ b/moses/IOWrapper.cpp @@ -303,12 +303,11 @@ ReadInput() boost::lock_guard lock(m_lock); #endif boost::shared_ptr source = GetBufferedInput(); - if (source) - { - source->SetTranslationId(m_currentLine++); - if (m_look_ahead || m_look_back) - this->set_context_for(*source); - } + if (source) { + source->SetTranslationId(m_currentLine++); + if (m_look_ahead || m_look_back) + this->set_context_for(*source); + } m_past_input.push_back(source); return source; } diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp index b41768604..6fd5ced57 100644 --- a/moses/StaticData.cpp +++ b/moses/StaticData.cpp @@ -63,8 +63,8 @@ StaticData::StaticData() : m_sourceStartPosMattersForRecombination(false) , m_requireSortingAfterSourceContext(false) , m_inputType(SentenceInput) - // , m_onlyDistinctNBest(false) - // , m_needAlignmentInfo(false) + // , m_onlyDistinctNBest(false) + // , m_needAlignmentInfo(false) , m_lmEnableOOVFeature(false) , m_isAlwaysCreateDirectTranslationOption(false) , m_currentWeightSetting("default") @@ -621,10 +621,9 @@ bool StaticData::LoadData(Parameter *parameter) #ifdef HAVE_PROTOBUF || m_outputSearchGraphPB #endif - || m_latticeSamplesFilePath.size()) - { - m_nbest_options.enabled = true; - } + || m_latticeSamplesFilePath.size()) { + m_nbest_options.enabled = true; + } // S2T decoder m_parameter->SetParameter(m_s2tParsingAlgorithm, "s2t-parsing-algorithm", diff --git a/moses/parameters/BookkeepingOptions.cpp b/moses/parameters/BookkeepingOptions.cpp index 2ab26b53c..db8fbd909 100644 --- a/moses/parameters/BookkeepingOptions.cpp +++ b/moses/parameters/BookkeepingOptions.cpp @@ -1,18 +1,18 @@ #include "BookkeepingOptions.h" -namespace Moses { - bool - BookkeepingOptions:: - init(Parameter const& P) - { - bool& x = need_alignment_info; - P.SetParameter(x, "print-alignment-info", false); - if (!x) P.SetParameter(x, "print-alignment-info-in-n-best", false); - if (!x) - { - PARAM_VEC const* params = P.GetParam("alignment-output-file"); - x = params && params->size(); - } - return true; +namespace 
Moses +{ +bool +BookkeepingOptions:: +init(Parameter const& P) +{ + bool& x = need_alignment_info; + P.SetParameter(x, "print-alignment-info", false); + if (!x) P.SetParameter(x, "print-alignment-info-in-n-best", false); + if (!x) { + PARAM_VEC const* params = P.GetParam("alignment-output-file"); + x = params && params->size(); } + return true; +} } diff --git a/moses/parameters/BookkeepingOptions.h b/moses/parameters/BookkeepingOptions.h index 8e800c587..08bc1d59d 100644 --- a/moses/parameters/BookkeepingOptions.h +++ b/moses/parameters/BookkeepingOptions.h @@ -2,13 +2,13 @@ #include "moses/Parameter.h" // #include -namespace Moses { +namespace Moses +{ - struct BookkeepingOptions - { - bool need_alignment_info; - bool init(Parameter const& param); - }; +struct BookkeepingOptions { + bool need_alignment_info; + bool init(Parameter const& param); +}; diff --git a/moses/parameters/NBestOptions.cpp b/moses/parameters/NBestOptions.cpp index 45747011a..d61a67c2f 100644 --- a/moses/parameters/NBestOptions.cpp +++ b/moses/parameters/NBestOptions.cpp @@ -2,7 +2,8 @@ #include "moses/Parameter.h" #include "NBestOptions.h" -namespace Moses { +namespace Moses +{ bool NBestOptions:: @@ -10,21 +11,16 @@ init(Parameter const& P) { const PARAM_VEC *params; params = P.GetParam("n-best-list"); - if (params) - { - if (params->size() >= 2) - { - output_file_path = params->at(0); - nbest_size = Scan( params->at(1) ); - only_distinct = (params->size()>2 && params->at(2)=="distinct"); - } - else - { - std::cerr << "wrong format for switch -n-best-list file size [disinct]"; - return false; - } + if (params) { + if (params->size() >= 2) { + output_file_path = params->at(0); + nbest_size = Scan( params->at(1) ); + only_distinct = (params->size()>2 && params->at(2)=="distinct"); + } else { + std::cerr << "wrong format for switch -n-best-list file size [disinct]"; + return false; } - else nbest_size = 0; + } else nbest_size = 0; P.SetParameter(factor, "n-best-factor", 20); P.SetParameter(include_alignment_info, "print-alignment-info-in-n-best", false ); diff --git a/moses/parameters/NBestOptions.h b/moses/parameters/NBestOptions.h index 6c868990c..bc125c2b6 100644 --- a/moses/parameters/NBestOptions.h +++ b/moses/parameters/NBestOptions.h @@ -1,27 +1,27 @@ // -*- mode: c++; cc-style: gnu -*- #include -namespace Moses { +namespace Moses +{ - struct NBestOptions - { - size_t nbest_size; - size_t factor; - bool enabled; - bool print_trees; - bool only_distinct; +struct NBestOptions { + size_t nbest_size; + size_t factor; + bool enabled; + bool print_trees; + bool only_distinct; - bool include_alignment_info; - bool include_segmentation; - bool include_feature_labels; - bool include_passthrough; + bool include_alignment_info; + bool include_segmentation; + bool include_feature_labels; + bool include_passthrough; - bool include_all_factors; + bool include_all_factors; - std::string output_file_path; + std::string output_file_path; - bool init(Parameter const& param); + bool init(Parameter const& param); - }; +}; } diff --git a/phrase-extract/SyntaxNode.h b/phrase-extract/SyntaxNode.h index f38e94713..49e2eb695 100644 --- a/phrase-extract/SyntaxNode.h +++ b/phrase-extract/SyntaxNode.h @@ -22,7 +22,8 @@ #include #include -namespace MosesTraining { +namespace MosesTraining +{ struct SyntaxNode { typedef std::map AttributeMap; diff --git a/phrase-extract/SyntaxNodeCollection.cpp b/phrase-extract/SyntaxNodeCollection.cpp index 0a344fcd7..70f52317e 100644 --- a/phrase-extract/SyntaxNodeCollection.cpp +++ 
b/phrase-extract/SyntaxNodeCollection.cpp @@ -42,7 +42,7 @@ void SyntaxNodeCollection::Clear() } SyntaxNode *SyntaxNodeCollection::AddNode(int startPos, int endPos, - const std::string &label) + const std::string &label) { SyntaxNode* newNode = new SyntaxNode(label, startPos, endPos); m_nodes.push_back( newNode ); @@ -57,7 +57,7 @@ bool SyntaxNodeCollection::HasNode( int startPos, int endPos ) const } const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodes( - int startPos, int endPos ) const + int startPos, int endPos ) const { NodeIndex::const_iterator startIndex = m_index.find( startPos ); if (startIndex == m_index.end() ) diff --git a/phrase-extract/SyntaxNodeCollection.h b/phrase-extract/SyntaxNodeCollection.h index 8de151c55..405a77c5f 100644 --- a/phrase-extract/SyntaxNodeCollection.h +++ b/phrase-extract/SyntaxNodeCollection.h @@ -51,7 +51,9 @@ public: const std::vector< SyntaxNode* >& GetNodes( int startPos, int endPos ) const; //! Get a vector of pointers to all SyntaxNodes (unordered). - const std::vector< SyntaxNode* >& GetAllNodes() { return m_nodes; }; + const std::vector< SyntaxNode* >& GetAllNodes() { + return m_nodes; + }; size_t GetNumWords() const { return m_numWords; diff --git a/phrase-extract/consolidate-main.cpp b/phrase-extract/consolidate-main.cpp index d52e8797b..5964bf686 100644 --- a/phrase-extract/consolidate-main.cpp +++ b/phrase-extract/consolidate-main.cpp @@ -73,17 +73,17 @@ int main(int argc, char* argv[]) if (argc < 4) { std::cerr << - "syntax: " - "consolidate phrase-table.direct " - "phrase-table.indirect " - "phrase-table.consolidated " - "[--Hierarchical] [--OnlyDirect] [--PhraseCount] " - "[--GoodTuring counts-of-counts-file] " - "[--KneserNey counts-of-counts-file] [--LowCountFeature] " - "[--SourceLabels source-labels-file] " - "[--PartsOfSpeech parts-of-speech-file] " - "[--MinScore id:threshold[,id:threshold]*]" - << std::endl; + "syntax: " + "consolidate phrase-table.direct " + "phrase-table.indirect " + "phrase-table.consolidated " + "[--Hierarchical] [--OnlyDirect] [--PhraseCount] " + "[--GoodTuring counts-of-counts-file] " + "[--KneserNey counts-of-counts-file] [--LowCountFeature] " + "[--SourceLabels source-labels-file] " + "[--PartsOfSpeech parts-of-speech-file] " + "[--MinScore id:threshold[,id:threshold]*]" + << std::endl; exit(1); } const std::string fileNameDirect = argv[1]; diff --git a/phrase-extract/extract-ghkm/AlignmentGraph.cpp b/phrase-extract/extract-ghkm/AlignmentGraph.cpp index 7c179295f..9dba71331 100644 --- a/phrase-extract/extract-ghkm/AlignmentGraph.cpp +++ b/phrase-extract/extract-ghkm/AlignmentGraph.cpp @@ -219,7 +219,7 @@ Node *AlignmentGraph::CopyParseTree(const SyntaxTree *root) if (nodeType == TREE) { float score = 0.0f; SyntaxNode::AttributeMap::const_iterator p = - root->value().attributes.find("pcfg"); + root->value().attributes.find("pcfg"); if (p != root->value().attributes.end()) { score = std::atof(p->second.c_str()); } diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp index c96cda146..777e56f52 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp +++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp @@ -232,7 +232,7 @@ int ExtractGHKM::Main(int argc, char *argv[]) // Initialize phrase orientation scoring object PhraseOrientation phraseOrientation(sourceTokens.size(), - targetXmlTreeParser.words().size(), alignment); + targetXmlTreeParser.words().size(), alignment); // Write the rules, subject to scope pruning. 
const std::vector &targetNodes = graph.GetTargetNodes(); @@ -413,21 +413,21 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[], << "\nThe parse tree is assumed to contain part-of-speech preterminal nodes.\n" << "\n" << "For the composed rule constraints: rule depth is the " - "maximum distance from the\nrule's root node to a sink " - "node, not counting preterminal expansions or word\n" - "alignments. Rule size is the measure defined in DeNeefe " - "et al (2007): the\nnumber of non-part-of-speech, non-leaf " - "constituent labels in the target tree.\nNode count is the " - "number of target tree nodes (excluding target words).\n" + "maximum distance from the\nrule's root node to a sink " + "node, not counting preterminal expansions or word\n" + "alignments. Rule size is the measure defined in DeNeefe " + "et al (2007): the\nnumber of non-part-of-speech, non-leaf " + "constituent labels in the target tree.\nNode count is the " + "number of target tree nodes (excluding target words).\n" << "\n" << "Scope pruning (Hopkins and Langmead, 2010) is applied to both minimal and\ncomposed rules.\n" << "\n" << "Unaligned source words are attached to the tree using the " - "following heuristic:\nif there are aligned source words to " - "both the left and the right of an unaligned\nsource word " - "then it is attached to the lowest common ancestor of its " - "nearest\nsuch left and right neighbours. Otherwise, it is " - "attached to the root of the\nparse tree.\n" + "following heuristic:\nif there are aligned source words to " + "both the left and the right of an unaligned\nsource word " + "then it is attached to the lowest common ancestor of its " + "nearest\nsuch left and right neighbours. Otherwise, it is " + "attached to the root of the\nparse tree.\n" << "\n" << "Unless the --AllowUnary option is given, unary rules containing no lexical\nsource items are eliminated using the method described in Chung et al. 
(2011).\nThe parsing algorithm used in Moses is unable to handle such rules.\n" << "\n" diff --git a/phrase-extract/extract-main.cpp b/phrase-extract/extract-main.cpp index eb44b83d1..70d4cad35 100644 --- a/phrase-extract/extract-main.cpp +++ b/phrase-extract/extract-main.cpp @@ -87,13 +87,13 @@ class ExtractTask { public: ExtractTask( - size_t id, SentenceAlignment &sentence, - PhraseExtractionOptions &initoptions, - Moses::OutputFileStream &extractFile, - Moses::OutputFileStream &extractFileInv, - Moses::OutputFileStream &extractFileOrientation, - Moses::OutputFileStream &extractFileContext, - Moses::OutputFileStream &extractFileContextInv): + size_t id, SentenceAlignment &sentence, + PhraseExtractionOptions &initoptions, + Moses::OutputFileStream &extractFile, + Moses::OutputFileStream &extractFileInv, + Moses::OutputFileStream &extractFileOrientation, + Moses::OutputFileStream &extractFileContext, + Moses::OutputFileStream &extractFileContextInv): m_sentence(sentence), m_options(initoptions), m_extractFile(extractFile), diff --git a/phrase-extract/filter-rule-table/FilterRuleTable.cpp b/phrase-extract/filter-rule-table/FilterRuleTable.cpp index 32d2019cf..89e59b3e9 100644 --- a/phrase-extract/filter-rule-table/FilterRuleTable.cpp +++ b/phrase-extract/filter-rule-table/FilterRuleTable.cpp @@ -137,7 +137,7 @@ void FilterRuleTable::ReadTestSet( continue; } sentences.push_back( - boost::shared_ptr(parser.Parse(line).release())); + boost::shared_ptr(parser.Parse(line).release())); } } diff --git a/phrase-extract/score-main.cpp b/phrase-extract/score-main.cpp index 185c0ae9e..cf28f90b9 100644 --- a/phrase-extract/score-main.cpp +++ b/phrase-extract/score-main.cpp @@ -131,14 +131,14 @@ int main(int argc, char* argv[]) ScoreFeatureManager featureManager; if (argc < 4) { std::cerr << - "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] " - "[--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] " - "[--NoWordAlignment] [--UnalignedPenalty] " - "[--UnalignedFunctionWordPenalty function-word-file] " - "[--MinCountHierarchical count] [--PartsOfSpeech] [--PCFG] " - "[--TreeFragments] [--SourceLabels] [--SourceLabelCountsLHS] " - "[--TargetPreferenceLabels] [--UnpairedExtractFormat] " - "[--ConditionOnTargetLHS] [--CrossedNonTerm]" << std::endl; + "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] " + "[--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] " + "[--NoWordAlignment] [--UnalignedPenalty] " + "[--UnalignedFunctionWordPenalty function-word-file] " + "[--MinCountHierarchical count] [--PartsOfSpeech] [--PCFG] " + "[--TreeFragments] [--SourceLabels] [--SourceLabelCountsLHS] " + "[--TargetPreferenceLabels] [--UnpairedExtractFormat] " + "[--ConditionOnTargetLHS] [--CrossedNonTerm]" << std::endl; std::cerr << featureManager.usage() << std::endl; exit(1); } From 7047c591c8414864ef13020e256e841aa217c4d2 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 4 Jun 2015 16:46:56 +0400 Subject: [PATCH 053/108] Beautify before cruise control test. Push using MosesAdmin account, must be set up with ssh. 
--- cruise-control/test_all_new_commits.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cruise-control/test_all_new_commits.sh b/cruise-control/test_all_new_commits.sh index 433857eb5..79d44d3a3 100755 --- a/cruise-control/test_all_new_commits.sh +++ b/cruise-control/test_all_new_commits.sh @@ -17,6 +17,14 @@ configname=$(basename $configf | sed 's/\.config$//') source "$configf" +# beautifier +git clone git@github.com:moses-smt/mosesdecoder.git /tmp/moses +cd /tmp/moses +./scripts/other/beautify.py --format --skip-perltidy +git commit -am "daily automatic beautifier" +git push +rm -rf /tmp/moses + [ -z "$MCC_SCAN_BRANCHES" ] \ && die "Bad config $configf; does not define MCC_SCAN_BRANCHES" From 6a09042e6abd5dab3d0cf0d358804f7a0ef9ca9a Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 4 Jun 2015 16:51:22 +0400 Subject: [PATCH 054/108] Beautify before cruise control test. Push using MosesAdmin account, must be set up with ssh. --- cruise-control/test_all_new_commits.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/cruise-control/test_all_new_commits.sh b/cruise-control/test_all_new_commits.sh index 79d44d3a3..c30afa0d1 100755 --- a/cruise-control/test_all_new_commits.sh +++ b/cruise-control/test_all_new_commits.sh @@ -24,6 +24,7 @@ cd /tmp/moses git commit -am "daily automatic beautifier" git push rm -rf /tmp/moses +cd - [ -z "$MCC_SCAN_BRANCHES" ] \ && die "Bad config $configf; does not define MCC_SCAN_BRANCHES" From f6ddc452241755733c947723a8618aab7245c8f1 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Thu, 4 Jun 2015 14:36:39 +0100 Subject: [PATCH 055/108] Ongoing moses/phrase-extract refactoring --- phrase-extract/pcfg-common/Jamfile | 1 - phrase-extract/pcfg-common/pcfg.h | 63 ------------------- phrase-extract/pcfg-common/typedef.h | 38 ----------- phrase-extract/pcfg-extract/Jamfile | 2 +- phrase-extract/pcfg-extract/pcfg_extract.cc | 5 +- phrase-extract/pcfg-extract/pcfg_extract.h | 2 +- .../pcfg-extract/rule_collection.cc | 2 +- phrase-extract/pcfg-extract/rule_collection.h | 2 +- phrase-extract/pcfg-extract/rule_extractor.h | 4 +- phrase-extract/pcfg-score/Jamfile | 2 +- phrase-extract/pcfg-score/pcfg_score.cc | 5 +- phrase-extract/pcfg-score/pcfg_score.h | 2 +- phrase-extract/pcfg-score/tree_scorer.h | 4 +- .../{pcfg-common => syntax-common}/pcfg.cc | 21 ------- phrase-extract/syntax-common/pcfg.h | 38 +++++++++++ .../{pcfg-common => syntax-common}/tool.cc | 0 .../{pcfg-common => syntax-common}/tool.h | 0 phrase-extract/syntax-common/vocabulary.h | 13 ++++ 18 files changed, 65 insertions(+), 139 deletions(-) delete mode 100644 phrase-extract/pcfg-common/Jamfile delete mode 100644 phrase-extract/pcfg-common/pcfg.h delete mode 100644 phrase-extract/pcfg-common/typedef.h rename phrase-extract/{pcfg-common => syntax-common}/pcfg.cc (69%) create mode 100644 phrase-extract/syntax-common/pcfg.h rename phrase-extract/{pcfg-common => syntax-common}/tool.cc (100%) rename phrase-extract/{pcfg-common => syntax-common}/tool.h (100%) create mode 100644 phrase-extract/syntax-common/vocabulary.h diff --git a/phrase-extract/pcfg-common/Jamfile b/phrase-extract/pcfg-common/Jamfile deleted file mode 100644 index 5669b443e..000000000 --- a/phrase-extract/pcfg-common/Jamfile +++ /dev/null @@ -1 +0,0 @@ -lib pcfg_common : [ glob *.cc ] ..//syntax-common ..//deps : .. 
; diff --git a/phrase-extract/pcfg-common/pcfg.h b/phrase-extract/pcfg-common/pcfg.h deleted file mode 100644 index c5c04cba4..000000000 --- a/phrase-extract/pcfg-common/pcfg.h +++ /dev/null @@ -1,63 +0,0 @@ -/*********************************************************************** - Moses - statistical machine translation system - Copyright (C) 2006-2012 University of Edinburgh - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ - -#pragma once -#ifndef PCFG_PCFG_H_ -#define PCFG_PCFG_H_ - -#include -#include -#include -#include - -#include "typedef.h" - -namespace MosesTraining { -namespace Syntax { -namespace PCFG { - -class Pcfg { - public: - typedef std::vector Key; - typedef std::map Map; - typedef Map::iterator iterator; - typedef Map::const_iterator const_iterator; - - Pcfg() {} - - iterator begin() { return rules_.begin(); } - const_iterator begin() const { return rules_.begin(); } - - iterator end() { return rules_.end(); } - const_iterator end() const { return rules_.end(); } - - void Add(const Key &, double); - bool Lookup(const Key &, double &) const; - void Read(std::istream &, Vocabulary &); - void Write(const Vocabulary &, std::ostream &) const; - - private: - Map rules_; -}; - -} // namespace PCFG -} // namespace Syntax -} // namespace MosesTraining - -#endif diff --git a/phrase-extract/pcfg-common/typedef.h b/phrase-extract/pcfg-common/typedef.h deleted file mode 100644 index 1280b89cf..000000000 --- a/phrase-extract/pcfg-common/typedef.h +++ /dev/null @@ -1,38 +0,0 @@ -/*********************************************************************** - Moses - statistical machine translation system - Copyright (C) 2006-2012 University of Edinburgh - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ - -#pragma once -#ifndef PCFG_TYPEDEF_H_ -#define PCFG_TYPEDEF_H_ - -#include - -#include "syntax-common/numbered_set.h" - -namespace MosesTraining { -namespace Syntax { -namespace PCFG { - -typedef NumberedSet Vocabulary; - -} // namespace PCFG -} // namespace Syntax -} // namespace MosesTraining - -#endif diff --git a/phrase-extract/pcfg-extract/Jamfile b/phrase-extract/pcfg-extract/Jamfile index 2442b967a..2f4ae1e7d 100644 --- a/phrase-extract/pcfg-extract/Jamfile +++ b/phrase-extract/pcfg-extract/Jamfile @@ -1 +1 @@ -exe pcfg-extract : [ glob *.cc ] ..//syntax-common ..//pcfg-common ../..//boost_program_options : .. ; +exe pcfg-extract : [ glob *.cc ] ..//syntax-common ../..//boost_program_options : .. ; diff --git a/phrase-extract/pcfg-extract/pcfg_extract.cc b/phrase-extract/pcfg-extract/pcfg_extract.cc index 87419edb7..45eb9ff3d 100644 --- a/phrase-extract/pcfg-extract/pcfg_extract.cc +++ b/phrase-extract/pcfg-extract/pcfg_extract.cc @@ -32,13 +32,12 @@ #include #include "syntax-common/exception.h" +#include "syntax-common/pcfg.h" +#include "syntax-common/vocabulary.h" #include "syntax-common/xml_tree_parser.h" #include "SyntaxTree.h" -#include "pcfg-common/pcfg.h" -#include "pcfg-common/typedef.h" - #include "options.h" #include "rule_collection.h" #include "rule_extractor.h" diff --git a/phrase-extract/pcfg-extract/pcfg_extract.h b/phrase-extract/pcfg-extract/pcfg_extract.h index 5882e45da..3b084acbe 100644 --- a/phrase-extract/pcfg-extract/pcfg_extract.h +++ b/phrase-extract/pcfg-extract/pcfg_extract.h @@ -21,7 +21,7 @@ #ifndef PCFG_EXTRACT_PCFG_EXTRACT_H_ #define PCFG_EXTRACT_PCFG_EXTRACT_H_ -#include "pcfg-common/tool.h" +#include "syntax-common/tool.h" namespace MosesTraining { diff --git a/phrase-extract/pcfg-extract/rule_collection.cc b/phrase-extract/pcfg-extract/rule_collection.cc index 9db0ce9bf..a814f82d6 100644 --- a/phrase-extract/pcfg-extract/rule_collection.cc +++ b/phrase-extract/pcfg-extract/rule_collection.cc @@ -19,7 +19,7 @@ #include "rule_collection.h" -#include "pcfg-common/pcfg.h" +#include "syntax-common/pcfg.h" #include diff --git a/phrase-extract/pcfg-extract/rule_collection.h b/phrase-extract/pcfg-extract/rule_collection.h index 3d9a9f98b..3bbc32721 100644 --- a/phrase-extract/pcfg-extract/rule_collection.h +++ b/phrase-extract/pcfg-extract/rule_collection.h @@ -25,7 +25,7 @@ #include -#include "pcfg-common/pcfg.h" +#include "syntax-common/pcfg.h" namespace MosesTraining { diff --git a/phrase-extract/pcfg-extract/rule_extractor.h b/phrase-extract/pcfg-extract/rule_extractor.h index d32d76992..91014747c 100644 --- a/phrase-extract/pcfg-extract/rule_extractor.h +++ b/phrase-extract/pcfg-extract/rule_extractor.h @@ -23,7 +23,7 @@ #include "SyntaxTree.h" -#include "pcfg-common/typedef.h" +#include "syntax-common/vocabulary.h" #include "rule_collection.h" @@ -39,7 +39,7 @@ class RuleExtractor { public: RuleExtractor(Vocabulary &); - void Extract(const MosesTraining::SyntaxTree &, RuleCollection &) const; + void Extract(const SyntaxTree &, RuleCollection &) const; private: Vocabulary &non_term_vocab_; }; diff --git a/phrase-extract/pcfg-score/Jamfile b/phrase-extract/pcfg-score/Jamfile index 45d46492a..ca321d04c 100644 --- a/phrase-extract/pcfg-score/Jamfile +++ 
b/phrase-extract/pcfg-score/Jamfile @@ -1 +1 @@ -exe pcfg-score : [ glob *.cc ] ..//pcfg-common ../..//boost_program_options : .. ; +exe pcfg-score : [ glob *.cc ] ..//syntax-common ../..//boost_program_options : .. ; diff --git a/phrase-extract/pcfg-score/pcfg_score.cc b/phrase-extract/pcfg-score/pcfg_score.cc index e11f73f70..cec84211a 100644 --- a/phrase-extract/pcfg-score/pcfg_score.cc +++ b/phrase-extract/pcfg-score/pcfg_score.cc @@ -36,12 +36,11 @@ #include "SyntaxTree.h" #include "syntax-common/exception.h" +#include "syntax-common/pcfg.h" +#include "syntax-common/vocabulary.h" #include "syntax-common/xml_tree_parser.h" #include "syntax-common/xml_tree_writer.h" -#include "pcfg-common/pcfg.h" -#include "pcfg-common/typedef.h" - namespace MosesTraining { namespace Syntax diff --git a/phrase-extract/pcfg-score/pcfg_score.h b/phrase-extract/pcfg-score/pcfg_score.h index b0b4a77cd..b691b107f 100644 --- a/phrase-extract/pcfg-score/pcfg_score.h +++ b/phrase-extract/pcfg-score/pcfg_score.h @@ -21,7 +21,7 @@ #ifndef PCFG_SCORE_PCFG_SCORE_H_ #define PCFG_SCORE_PCFG_SCORE_H_ -#include "pcfg-common/tool.h" +#include "syntax-common/tool.h" namespace MosesTraining { diff --git a/phrase-extract/pcfg-score/tree_scorer.h b/phrase-extract/pcfg-score/tree_scorer.h index cf9fdd1a3..b95d13ddb 100644 --- a/phrase-extract/pcfg-score/tree_scorer.h +++ b/phrase-extract/pcfg-score/tree_scorer.h @@ -23,8 +23,8 @@ #include "SyntaxTree.h" -#include "pcfg-common/pcfg.h" -#include "pcfg-common/typedef.h" +#include "syntax-common/vocabulary.h" +#include "syntax-common/pcfg.h" namespace MosesTraining { diff --git a/phrase-extract/pcfg-common/pcfg.cc b/phrase-extract/syntax-common/pcfg.cc similarity index 69% rename from phrase-extract/pcfg-common/pcfg.cc rename to phrase-extract/syntax-common/pcfg.cc index 988367c9b..3efe04218 100644 --- a/phrase-extract/pcfg-common/pcfg.cc +++ b/phrase-extract/syntax-common/pcfg.cc @@ -1,22 +1,3 @@ -/*********************************************************************** - Moses - statistical machine translation system - Copyright (C) 2006-2012 University of Edinburgh - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ - #include "pcfg.h" #include @@ -28,7 +9,6 @@ namespace MosesTraining { namespace Syntax { -namespace PCFG { void Pcfg::Add(const Key &key, double score) { rules_[key] = score; @@ -103,6 +83,5 @@ void Pcfg::Write(const Vocabulary &vocab, std::ostream &output) const { } } -} // namespace PCFG } // namespace Syntax } // namespace MosesTraining diff --git a/phrase-extract/syntax-common/pcfg.h b/phrase-extract/syntax-common/pcfg.h new file mode 100644 index 000000000..0a731cc7a --- /dev/null +++ b/phrase-extract/syntax-common/pcfg.h @@ -0,0 +1,38 @@ +#pragma once + +#include +#include +#include +#include + +#include "vocabulary.h" + +namespace MosesTraining { +namespace Syntax { + +class Pcfg { + public: + typedef std::vector Key; + typedef std::map Map; + typedef Map::iterator iterator; + typedef Map::const_iterator const_iterator; + + Pcfg() {} + + iterator begin() { return rules_.begin(); } + const_iterator begin() const { return rules_.begin(); } + + iterator end() { return rules_.end(); } + const_iterator end() const { return rules_.end(); } + + void Add(const Key &, double); + bool Lookup(const Key &, double &) const; + void Read(std::istream &, Vocabulary &); + void Write(const Vocabulary &, std::ostream &) const; + + private: + Map rules_; +}; + +} // namespace Syntax +} // namespace MosesTraining diff --git a/phrase-extract/pcfg-common/tool.cc b/phrase-extract/syntax-common/tool.cc similarity index 100% rename from phrase-extract/pcfg-common/tool.cc rename to phrase-extract/syntax-common/tool.cc diff --git a/phrase-extract/pcfg-common/tool.h b/phrase-extract/syntax-common/tool.h similarity index 100% rename from phrase-extract/pcfg-common/tool.h rename to phrase-extract/syntax-common/tool.h diff --git a/phrase-extract/syntax-common/vocabulary.h b/phrase-extract/syntax-common/vocabulary.h new file mode 100644 index 000000000..119767245 --- /dev/null +++ b/phrase-extract/syntax-common/vocabulary.h @@ -0,0 +1,13 @@ +#pragma once + +#include + +#include "numbered_set.h" + +namespace MosesTraining { +namespace Syntax { + +typedef NumberedSet Vocabulary; + +} // namespace Syntax +} // namespace MosesTraining From 721bfe823bd4142f73499be371036c775887f9d5 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Thu, 4 Jun 2015 16:07:22 +0100 Subject: [PATCH 056/108] Bug fix: m_nbestSize wasn't initialized in class TranslationRequest. --- moses/server/TranslationRequest.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/moses/server/TranslationRequest.cpp b/moses/server/TranslationRequest.cpp index 5c87eb1a7..3762fbd96 100644 --- a/moses/server/TranslationRequest.cpp +++ b/moses/server/TranslationRequest.cpp @@ -221,6 +221,7 @@ TranslationRequest:: TranslationRequest(xmlrpc_c::paramList const& paramList, boost::condition_variable& cond, boost::mutex& mut) : m_cond(cond), m_mutex(mut), m_done(false), m_paramList(paramList) + , m_nbestSize(0) { } void @@ -264,7 +265,12 @@ parse_request(std::map const& params) pdmm->SetTemporaryMultiModelWeightsVector(w); } } - + + si = params.find("nbest"); + if (si != params.end()) + m_nbestSize = xmlrpc_c::value_int(si->second); + + // // biased sampling for suffix-array-based sampling phrase table? 
// if ((si = params.find("bias")) != params.end()) // { From c6a3d8e54aa84933875160873d7bf837a6210b25 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Thu, 4 Jun 2015 16:54:31 +0100 Subject: [PATCH 057/108] Ongoing moses/phrase-extract refactoring --- phrase-extract/extract-ghkm/ExtractGHKM.cpp | 46 +---------- phrase-extract/extract-ghkm/ExtractGHKM.h | 18 ++-- .../filter-rule-table/FilterRuleTable.cpp | 18 +--- .../filter-rule-table/FilterRuleTable.h | 18 ++-- phrase-extract/pcfg-extract/pcfg_extract.cc | 2 +- phrase-extract/pcfg-score/pcfg_score.cc | 4 +- .../postprocess-egret-forests/Main.cpp | 6 +- .../PostprocessEgretForests.cpp | 50 ++++------- .../PostprocessEgretForests.h | 18 ++-- phrase-extract/score-stsg/ScoreStsg.cpp | 26 +----- phrase-extract/score-stsg/ScoreStsg.h | 15 +--- phrase-extract/syntax-common/tool.cc | 81 +++++++----------- phrase-extract/syntax-common/tool.h | 82 +++++-------------- 13 files changed, 96 insertions(+), 288 deletions(-) diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp index 777e56f52..c2ee43767 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp +++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp @@ -359,39 +359,6 @@ int ExtractGHKM::Main(int argc, char *argv[]) return 0; } -void ExtractGHKM::OpenInputFileOrDie(const std::string &filename, - std::ifstream &stream) -{ - stream.open(filename.c_str()); - if (!stream) { - std::ostringstream msg; - msg << "failed to open input file: " << filename; - Error(msg.str()); - } -} - -void ExtractGHKM::OpenOutputFileOrDie(const std::string &filename, - std::ofstream &stream) -{ - stream.open(filename.c_str()); - if (!stream) { - std::ostringstream msg; - msg << "failed to open output file: " << filename; - Error(msg.str()); - } -} - -void ExtractGHKM::OpenOutputFileOrDie(const std::string &filename, - Moses::OutputFileStream &stream) -{ - bool ret = stream.Open(filename); - if (!ret) { - std::ostringstream msg; - msg << "failed to open output file: " << filename; - Error(msg.str()); - } -} - void ExtractGHKM::ProcessOptions(int argc, char *argv[], Options &options) const { @@ -401,7 +368,7 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[], // Construct the 'top' of the usage message: the bit that comes before the // options list. std::ostringstream usageTop; - usageTop << "Usage: " << GetName() + usageTop << "Usage: " << name() << " [OPTION]... TARGET SOURCE ALIGNMENT EXTRACT\n\n" << "SCFG rule extractor based on the GHKM algorithm described in\n" << "Galley et al. (2004).\n\n" @@ -547,11 +514,8 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[], // Process the command-line. po::variables_map vm; - const int optionStyle = cls::allow_long - | cls::long_allow_adjacent - | cls::long_allow_next; try { - po::store(po::command_line_parser(argc, argv).style(optionStyle). + po::store(po::command_line_parser(argc, argv).style(MosesOptionStyle()). 
options(cmdLineOptions).positional(p).run(), vm); po::notify(vm); } catch (const std::exception &e) { @@ -635,12 +599,6 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[], } } -void ExtractGHKM::Error(const std::string &msg) const -{ - std::cerr << GetName() << ": " << msg << std::endl; - std::exit(1); -} - std::vector ExtractGHKM::ReadTokens(const std::string &s) const { std::vector tokens; diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.h b/phrase-extract/extract-ghkm/ExtractGHKM.h index 66c4c55f8..0d0fa8bf1 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.h +++ b/phrase-extract/extract-ghkm/ExtractGHKM.h @@ -28,6 +28,8 @@ #include "OutputFileStream.h" #include "SyntaxTree.h" +#include "syntax-common/tool.h" + namespace MosesTraining { namespace GHKM @@ -35,22 +37,14 @@ namespace GHKM struct Options; -class ExtractGHKM +class ExtractGHKM : public Syntax::Tool { public: + ExtractGHKM() : Tool("extract-ghkm") {} - ExtractGHKM() : m_name("extract-ghkm") {} - const std::string &GetName() const { - return m_name; - } - int Main(int argc, char *argv[]); + virtual int Main(int argc, char *argv[]); private: - - void Error(const std::string &) const; - void OpenInputFileOrDie(const std::string &, std::ifstream &); - void OpenOutputFileOrDie(const std::string &, std::ofstream &); - void OpenOutputFileOrDie(const std::string &, Moses::OutputFileStream &); void RecordTreeLabels(const SyntaxTree &, std::set &); void CollectWordLabelCounts(SyntaxTree &, const Options &, @@ -79,8 +73,6 @@ private: std::vector ReadTokens(const SyntaxTree &root) const; void ProcessOptions(int, char *[], Options &) const; - - std::string m_name; }; } // namespace GHKM diff --git a/phrase-extract/filter-rule-table/FilterRuleTable.cpp b/phrase-extract/filter-rule-table/FilterRuleTable.cpp index 89e59b3e9..24c2803a7 100644 --- a/phrase-extract/filter-rule-table/FilterRuleTable.cpp +++ b/phrase-extract/filter-rule-table/FilterRuleTable.cpp @@ -167,7 +167,7 @@ void FilterRuleTable::ProcessOptions(int argc, char *argv[], // Construct the 'top' of the usage message: the bit that comes before the // options list. std::ostringstream usageTop; - usageTop << "Usage: " << GetName() + usageTop << "Usage: " << name() << " [OPTION]... MODEL TEST\n\n" << "Filter for SCFG/STSG rule tables.\n\n" << "Options"; @@ -203,11 +203,8 @@ void FilterRuleTable::ProcessOptions(int argc, char *argv[], // Process the command-line. po::variables_map vm; - const int optionStyle = cls::allow_long - | cls::long_allow_adjacent - | cls::long_allow_next; try { - po::store(po::command_line_parser(argc, argv).style(optionStyle). + po::store(po::command_line_parser(argc, argv).style(MosesOptionStyle()). 
options(cmdLineOptions).positional(p).run(), vm); po::notify(vm); } catch (const std::exception &e) { @@ -229,17 +226,6 @@ void FilterRuleTable::ProcessOptions(int argc, char *argv[], } } -void FilterRuleTable::Error(const std::string &msg) const -{ - std::cerr << GetName() << ": error: " << msg << std::endl; - std::exit(1); -} - -void FilterRuleTable::Warn(const std::string &msg) const -{ - std::cerr << GetName() << ": warning: " << msg << std::endl; -} - } // namespace FilterRuleTable } // namespace Syntax } // namespace MosesTraining diff --git a/phrase-extract/filter-rule-table/FilterRuleTable.h b/phrase-extract/filter-rule-table/FilterRuleTable.h index 3077e690d..7b51bb8fa 100644 --- a/phrase-extract/filter-rule-table/FilterRuleTable.h +++ b/phrase-extract/filter-rule-table/FilterRuleTable.h @@ -7,6 +7,8 @@ #include "SyntaxTree.h" +#include "syntax-common/tool.h" + #include "StringForest.h" namespace MosesTraining @@ -18,20 +20,14 @@ namespace FilterRuleTable struct Options; -class FilterRuleTable +class FilterRuleTable : public Tool { public: - FilterRuleTable() : m_name("filter-rule-table") {} + FilterRuleTable() : Tool("filter-rule-table") {} - const std::string &GetName() const { - return m_name; - } - - int Main(int argc, char *argv[]); + virtual int Main(int argc, char *argv[]); private: - void Error(const std::string &) const; - // Filter rule table (on std::cin) for test set (string version). void Filter(const std::vector > &); @@ -51,10 +47,6 @@ private: // Read test set (forest version) void ReadTestSet(std::istream &, std::vector > &); - - void Warn(const std::string &) const; - - std::string m_name; }; } // namespace FilterRuleTable diff --git a/phrase-extract/pcfg-extract/pcfg_extract.cc b/phrase-extract/pcfg-extract/pcfg_extract.cc index 45eb9ff3d..0e89e26be 100644 --- a/phrase-extract/pcfg-extract/pcfg_extract.cc +++ b/phrase-extract/pcfg-extract/pcfg_extract.cc @@ -118,7 +118,7 @@ void PcfgExtract::ProcessOptions(int argc, char *argv[], // Process the command-line. po::variables_map vm; try { - po::store(po::command_line_parser(argc, argv).style(CommonOptionStyle()). + po::store(po::command_line_parser(argc, argv).style(MosesOptionStyle()). options(cmd_line_options).positional(p).run(), vm); po::notify(vm); } catch (const std::exception &e) { diff --git a/phrase-extract/pcfg-score/pcfg_score.cc b/phrase-extract/pcfg-score/pcfg_score.cc index cec84211a..bdbb761f9 100644 --- a/phrase-extract/pcfg-score/pcfg_score.cc +++ b/phrase-extract/pcfg-score/pcfg_score.cc @@ -56,7 +56,7 @@ int PcfgScore::Main(int argc, char *argv[]) // Open PCFG stream. std::ifstream pcfg_stream; - OpenNamedInputOrDie(options.pcfg_file, pcfg_stream); + OpenInputFileOrDie(options.pcfg_file, pcfg_stream); // Read PCFG. Pcfg pcfg; @@ -131,7 +131,7 @@ void PcfgScore::ProcessOptions(int argc, char *argv[], Options &options) const // Process the command-line. po::variables_map vm; try { - po::store(po::command_line_parser(argc, argv).style(CommonOptionStyle()). + po::store(po::command_line_parser(argc, argv).style(MosesOptionStyle()). 
options(cmd_line_options).positional(p).run(), vm); po::notify(vm); } catch (const std::exception &e) { diff --git a/phrase-extract/postprocess-egret-forests/Main.cpp b/phrase-extract/postprocess-egret-forests/Main.cpp index ec2bab185..fead94652 100644 --- a/phrase-extract/postprocess-egret-forests/Main.cpp +++ b/phrase-extract/postprocess-egret-forests/Main.cpp @@ -5,9 +5,5 @@ int main(int argc, char *argv[]) { MosesTraining::Syntax::PostprocessEgretForests::PostprocessEgretForests tool; - try { - return tool.Main(argc, argv); - } catch (const MosesTraining::Syntax::Exception &e) { - tool.Error(e.msg()); - } + return tool.Main(argc, argv); } diff --git a/phrase-extract/postprocess-egret-forests/PostprocessEgretForests.cpp b/phrase-extract/postprocess-egret-forests/PostprocessEgretForests.cpp index d87e082dc..4911d4913 100644 --- a/phrase-extract/postprocess-egret-forests/PostprocessEgretForests.cpp +++ b/phrase-extract/postprocess-egret-forests/PostprocessEgretForests.cpp @@ -30,19 +30,23 @@ namespace PostprocessEgretForests int PostprocessEgretForests::Main(int argc, char *argv[]) { - // Process command-line options. - Options options; - ProcessOptions(argc, argv, options); + try { + // Process command-line options. + Options options; + ProcessOptions(argc, argv, options); - // Open input files. - boost::scoped_ptr splitPointParser; - std::ifstream splitPointFileStream; - if (!options.splitPointsFile.empty()) { - OpenInputFileOrDie(options.splitPointsFile, splitPointFileStream); - splitPointParser.reset(new SplitPointFileParser(splitPointFileStream)); + // Open input files. + boost::scoped_ptr splitPointParser; + std::ifstream splitPointFileStream; + if (!options.splitPointsFile.empty()) { + OpenInputFileOrDie(options.splitPointsFile, splitPointFileStream); + splitPointParser.reset(new SplitPointFileParser(splitPointFileStream)); + } + + ProcessForest(std::cin, std::cout, splitPointParser.get(), options); + } catch (const MosesTraining::Syntax::Exception &e) { + Error(e.msg()); } - - ProcessForest(std::cin, std::cout, splitPointParser.get(), options); return 0; } @@ -76,17 +80,6 @@ void PostprocessEgretForests::ProcessForest( } } -void PostprocessEgretForests::OpenInputFileOrDie(const std::string &filename, - std::ifstream &stream) -{ - stream.open(filename.c_str()); - if (!stream) { - std::ostringstream msg; - msg << "failed to open input file: " << filename; - Error(msg.str()); - } -} - void PostprocessEgretForests::ProcessOptions(int argc, char *argv[], Options &options) const { @@ -96,7 +89,7 @@ void PostprocessEgretForests::ProcessOptions(int argc, char *argv[], // Construct the 'top' of the usage message: the bit that comes before the // options list. std::ostringstream usageTop; - usageTop << "Usage: " << GetName() + usageTop << "Usage: " << name() << " [OPTION]...\n\n" << "TODO\n\n" << "Options"; @@ -132,11 +125,8 @@ void PostprocessEgretForests::ProcessOptions(int argc, char *argv[], // Process the command-line. po::variables_map vm; - const int optionStyle = cls::allow_long - | cls::long_allow_adjacent - | cls::long_allow_next; try { - po::store(po::command_line_parser(argc, argv).style(optionStyle). + po::store(po::command_line_parser(argc, argv).style(MosesOptionStyle()). 
options(cmdLineOptions).positional(p).run(), vm); po::notify(vm); } catch (const std::exception &e) { @@ -156,12 +146,6 @@ void PostprocessEgretForests::ProcessOptions(int argc, char *argv[], } } -void PostprocessEgretForests::Error(const std::string &msg) const -{ - std::cerr << GetName() << ": " << msg << std::endl; - std::exit(1); -} - } // namespace PostprocessEgretForests } // namespace Syntax } // namespace MosesTraining diff --git a/phrase-extract/postprocess-egret-forests/PostprocessEgretForests.h b/phrase-extract/postprocess-egret-forests/PostprocessEgretForests.h index 95da24c71..51970084e 100644 --- a/phrase-extract/postprocess-egret-forests/PostprocessEgretForests.h +++ b/phrase-extract/postprocess-egret-forests/PostprocessEgretForests.h @@ -4,6 +4,8 @@ #include #include +#include "syntax-common/tool.h" + namespace MosesTraining { namespace Syntax @@ -14,31 +16,21 @@ namespace PostprocessEgretForests struct Options; class SplitPointFileParser; -class PostprocessEgretForests +class PostprocessEgretForests : public Tool { public: - PostprocessEgretForests() : m_name("postprocess-egret-forests") {} + PostprocessEgretForests() : Tool("postprocess-egret-forests") {} - void Error(const std::string &) const; - - const std::string &GetName() const { - return m_name; - } - - int Main(int argc, char *argv[]); + virtual int Main(int argc, char *argv[]); private: void OneBestTree(std::istream &, std::ostream &, SplitPointFileParser *, const Options &); - void OpenInputFileOrDie(const std::string &, std::ifstream &); - void ProcessForest(std::istream &, std::ostream &, SplitPointFileParser *, const Options &); void ProcessOptions(int, char *[], Options &) const; - - std::string m_name; }; } // namespace PostprocessEgretForests diff --git a/phrase-extract/score-stsg/ScoreStsg.cpp b/phrase-extract/score-stsg/ScoreStsg.cpp index 09395e21e..f6df0d0da 100644 --- a/phrase-extract/score-stsg/ScoreStsg.cpp +++ b/phrase-extract/score-stsg/ScoreStsg.cpp @@ -35,7 +35,7 @@ namespace ScoreStsg const int ScoreStsg::kCountOfCountsMax = 10; ScoreStsg::ScoreStsg() - : m_name("score-stsg") + : Tool("score-stsg") , m_lexTable(m_srcVocab, m_tgtVocab) , m_countOfCounts(kCountOfCountsMax, 0) , m_totalDistinct(0) @@ -300,17 +300,6 @@ double ScoreStsg::ComputeLexProb(const std::vector &sourceFrontier, return lexScore; } -void ScoreStsg::OpenOutputFileOrDie(const std::string &filename, - Moses::OutputFileStream &stream) -{ - bool ret = stream.Open(filename); - if (!ret) { - std::ostringstream msg; - msg << "failed to open output file: " << filename; - Error(msg.str()); - } -} - void ScoreStsg::ProcessOptions(int argc, char *argv[], Options &options) const { namespace po = boost::program_options; @@ -319,7 +308,7 @@ void ScoreStsg::ProcessOptions(int argc, char *argv[], Options &options) const // Construct the 'top' of the usage message: the bit that comes before the // options list. std::ostringstream usageTop; - usageTop << "Usage: " << GetName() + usageTop << "Usage: " << name() << " [OPTION]... EXTRACT LEX TABLE\n\n" << "STSG rule scorer\n\n" << "Options"; @@ -386,11 +375,8 @@ void ScoreStsg::ProcessOptions(int argc, char *argv[], Options &options) const // Process the command-line. po::variables_map vm; - const int optionStyle = cls::allow_long - | cls::long_allow_adjacent - | cls::long_allow_next; try { - po::store(po::command_line_parser(argc, argv).style(optionStyle). + po::store(po::command_line_parser(argc, argv).style(MosesOptionStyle()). 
options(cmdLineOptions).positional(p).run(), vm); po::notify(vm); } catch (const std::exception &e) { @@ -440,12 +426,6 @@ void ScoreStsg::ProcessOptions(int argc, char *argv[], Options &options) const } } -void ScoreStsg::Error(const std::string &msg) const -{ - std::cerr << GetName() << ": " << msg << std::endl; - std::exit(1); -} - } // namespace ScoreStsg } // namespace Syntax } // namespace MosesTraining diff --git a/phrase-extract/score-stsg/ScoreStsg.h b/phrase-extract/score-stsg/ScoreStsg.h index 628c0080e..1757e181b 100644 --- a/phrase-extract/score-stsg/ScoreStsg.h +++ b/phrase-extract/score-stsg/ScoreStsg.h @@ -9,6 +9,8 @@ #include "ExtractionPhrasePair.h" #include "OutputFileStream.h" +#include "syntax-common/tool.h" + #include "LexicalTable.h" #include "Options.h" #include "RuleSymbol.h" @@ -25,16 +27,12 @@ namespace ScoreStsg class RuleGroup; class RuleTableWriter; -class ScoreStsg +class ScoreStsg : public Tool { public: ScoreStsg(); - const std::string &GetName() const { - return m_name; - } - - int Main(int argc, char *argv[]); + virtual int Main(int argc, char *argv[]); private: static const int kCountOfCountsMax; @@ -43,10 +41,6 @@ private: const std::vector &, const ALIGNMENT &); - void Error(const std::string &) const; - - void OpenOutputFileOrDie(const std::string &, Moses::OutputFileStream &); - void ParseAlignmentString(const std::string &, int, ALIGNMENT &); @@ -59,7 +53,6 @@ private: void TokenizeRuleHalf(const std::string &, TokenizedRuleHalf &); - std::string m_name; Options m_options; Vocabulary m_srcVocab; Vocabulary m_tgtVocab; diff --git a/phrase-extract/syntax-common/tool.cc b/phrase-extract/syntax-common/tool.cc index c41eaf9bd..e145b78be 100644 --- a/phrase-extract/syntax-common/tool.cc +++ b/phrase-extract/syntax-common/tool.cc @@ -1,64 +1,30 @@ -/*********************************************************************** - Moses - statistical machine translation system - Copyright (C) 2006-2012 University of Edinburgh - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ - #include "tool.h" +#include +#include #include +#include + namespace MosesTraining { namespace Syntax { -namespace PCFG { -std::istream &Tool::OpenInputOrDie(const std::string &filename) { - // TODO Check that function is only called once? 
- if (filename.empty() || filename == "-") { - input_ptr_ = &(std::cin); - } else { - input_file_stream_.open(filename.c_str()); - if (!input_file_stream_) { - std::ostringstream msg; - msg << "failed to open input file: " << filename; - Error(msg.str()); - } - input_ptr_ = &input_file_stream_; - } - return *input_ptr_; +int Tool::MosesOptionStyle() { + namespace cls = boost::program_options::command_line_style; + return cls::allow_long | cls::long_allow_adjacent | cls::long_allow_next; } -std::ostream &Tool::OpenOutputOrDie(const std::string &filename) { - // TODO Check that function is only called once? - if (filename.empty() || filename == "-") { - output_ptr_ = &(std::cout); - } else { - output_file_stream_.open(filename.c_str()); - if (!output_file_stream_) { - std::ostringstream msg; - msg << "failed to open output file: " << filename; - Error(msg.str()); - } - output_ptr_ = &output_file_stream_; - } - return *output_ptr_; +void Tool::Warn(const std::string &msg) const { + std::cerr << name_ << ": warning: " << msg << std::endl; } -void Tool::OpenNamedInputOrDie(const std::string &filename, - std::ifstream &stream) { +void Tool::Error(const std::string &msg) const { + std::cerr << name_ << ": error: " << msg << std::endl; + std::exit(1); +} + +void Tool::OpenInputFileOrDie(const std::string &filename, + std::ifstream &stream) { stream.open(filename.c_str()); if (!stream) { std::ostringstream msg; @@ -67,8 +33,8 @@ void Tool::OpenNamedInputOrDie(const std::string &filename, } } -void Tool::OpenNamedOutputOrDie(const std::string &filename, - std::ofstream &stream) { +void Tool::OpenOutputFileOrDie(const std::string &filename, + std::ofstream &stream) { stream.open(filename.c_str()); if (!stream) { std::ostringstream msg; @@ -77,6 +43,15 @@ void Tool::OpenNamedOutputOrDie(const std::string &filename, } } -} // namespace PCFG +void Tool::OpenOutputFileOrDie(const std::string &filename, + Moses::OutputFileStream &stream) { + bool ret = stream.Open(filename); + if (!ret) { + std::ostringstream msg; + msg << "failed to open output file: " << filename; + Error(msg.str()); + } +} + } // namespace Syntax } // namespace MosesTraining diff --git a/phrase-extract/syntax-common/tool.h b/phrase-extract/syntax-common/tool.h index 2c903a11e..e1df8025f 100644 --- a/phrase-extract/syntax-common/tool.h +++ b/phrase-extract/syntax-common/tool.h @@ -1,93 +1,53 @@ -/*********************************************************************** - Moses - statistical machine translation system - Copyright (C) 2006-2012 University of Edinburgh - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ - #pragma once -#ifndef PCFG_TOOL_H_ -#define PCFG_TOOL_H_ -#include #include -#include #include -#include +#include "OutputFileStream.h" namespace MosesTraining { namespace Syntax { -namespace PCFG { +/*! Base class for command-line based tools. + */ class Tool { public: virtual ~Tool() {} + //! Get the name of the tool. const std::string &name() const { return name_; } + //! Virtual main function to be provided by subclass. virtual int Main(int argc, char *argv[]) = 0; protected: Tool(const std::string &name) : name_(name) {} - // Returns the boost::program_options style that should be used by all tools. - static int CommonOptionStyle() { - namespace cls = boost::program_options::command_line_style; - return cls::default_style & (~cls::allow_guessing); - } + //! Returns a boost::program_options style that is consistent with other + //! Moses tools (extract-rules, score, etc.). + static int MosesOptionStyle(); - void Warn(const std::string &msg) const { - std::cerr << name_ << ": warning: " << msg << std::endl; - } + //! Write a formatted warning message to standard error. + void Warn(const std::string &) const; - void Error(const std::string &msg) const { - std::cerr << name_ << ": error: " << msg << std::endl; - std::exit(1); - } + //! Write a formatted error message to standard error and call exit(1). + void Error(const std::string &msg) const; - // Initialises the tool's main input stream and returns a reference that is - // valid for the remainder of the tool's lifetime. If filename is empty or - // "-" then input is standard input; otherwise it is the named file. Calls - // Error() if the file cannot be opened for reading. - std::istream &OpenInputOrDie(const std::string &filename); + //! Opens the named input file using the supplied ifstream. Calls Error() if + //! the file cannot be opened for reading. + void OpenInputFileOrDie(const std::string &, std::ifstream &); - // Initialises the tool's main output stream and returns a reference that is - // valid for the remainder of the tool's lifetime. If filename is empty or - // "-" then output is standard output; otherwise it is the named file. Calls - // Error() if the file cannot be opened for writing. - std::ostream &OpenOutputOrDie(const std::string &filename); + //! Opens the named output file using the supplied ofstream. Calls Error() if + //! the file cannot be opened for writing. + void OpenOutputFileOrDie(const std::string &, std::ofstream &); - // Opens the named input file using the supplied ifstream. Calls Error() if - // the file cannot be opened for reading. - void OpenNamedInputOrDie(const std::string &, std::ifstream &); - - // Opens the named output file using the supplied ofstream. Calls Error() if - // the file cannot be opened for writing. - void OpenNamedOutputOrDie(const std::string &, std::ofstream &); + //! Opens the named output file using the supplied OutputFileStream. Calls + //! Error() if the file cannot be opened for writing. 
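To illustrate the base class that the tools above now share: a minimal sketch of a hypothetical subclass. The name WordCount, the "word-count" tool name and its behaviour are invented; the Tool API is the one declared in this header. It assumes it is built inside phrase-extract and linked against syntax-common/tool.cc, like extract-ghkm or score-stsg; a real tool would also parse its options with boost::program_options using MosesOptionStyle(), as those tools do.

  #include <fstream>
  #include <iostream>
  #include <string>

  #include "syntax-common/tool.h"

  namespace MosesTraining {
  namespace Syntax {

  class WordCount : public Tool {
   public:
    // The name passed to Tool() is what Warn()/Error() print as a prefix.
    WordCount() : Tool("word-count") {}

    virtual int Main(int argc, char *argv[]) {
      if (argc != 2) {
        Error("usage: word-count FILE");  // writes "word-count: error: ..." and exits
      }
      std::ifstream in;
      OpenInputFileOrDie(argv[1], in);    // calls Error() if the file cannot be read
      std::size_t count = 0;
      std::string token;
      while (in >> token) {
        ++count;
      }
      std::cout << count << std::endl;
      return 0;
    }
  };

  }  // namespace Syntax
  }  // namespace MosesTraining

  int main(int argc, char *argv[]) {
    MosesTraining::Syntax::WordCount tool;
    return tool.Main(argc, argv);
  }
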
+ void OpenOutputFileOrDie(const std::string &, Moses::OutputFileStream &); private: std::string name_; - std::istream *input_ptr_; - std::ifstream input_file_stream_; - std::ostream *output_ptr_; - std::ofstream output_file_stream_; }; -} // namespace PCFG } // namespace Syntax } // namespace MosesTraining - -#endif From 42b53b7a3939e9c5b0875a2eeb53010774527a95 Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Fri, 5 Jun 2015 00:00:42 +0100 Subject: [PATCH 058/108] daily automatic beautifier --- moses/server/TranslationRequest.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/moses/server/TranslationRequest.cpp b/moses/server/TranslationRequest.cpp index 3762fbd96..cad3696d1 100644 --- a/moses/server/TranslationRequest.cpp +++ b/moses/server/TranslationRequest.cpp @@ -265,12 +265,12 @@ parse_request(std::map const& params) pdmm->SetTemporaryMultiModelWeightsVector(w); } } - + si = params.find("nbest"); if (si != params.end()) m_nbestSize = xmlrpc_c::value_int(si->second); - - + + // // biased sampling for suffix-array-based sampling phrase table? // if ((si = params.find("bias")) != params.end()) // { From c306715e828f23ffceefebde8e227fc1bd7ff4d0 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Mon, 8 Jun 2015 14:35:36 +0400 Subject: [PATCH 059/108] add back arg -always-create-direct-transoption. Seemed to have dropped out a while ago --- moses/Parameter.cpp | 1 + moses/StaticData.cpp | 1 + 2 files changed, 2 insertions(+) diff --git a/moses/Parameter.cpp b/moses/Parameter.cpp index 33441570f..cf8737e3b 100644 --- a/moses/Parameter.cpp +++ b/moses/Parameter.cpp @@ -143,6 +143,7 @@ Parameter::Parameter() AddParam(oov_opts,"mark-unknown", "mu", "mark unknown words in output"); AddParam(oov_opts,"lmodel-oov-feature", "add language model oov feature, one per model"); AddParam(oov_opts,"output-unknowns", "Output the unknown (OOV) words to the given file, one line per sentence"); + AddParam(oov_opts,"always-create-direct-transopt", "Always create a translation that translates the source word ad-verbatim"); /////////////////////////////////////////////////////////////////////////////////////// // input options diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp index 6fd5ced57..8fb88c257 100644 --- a/moses/StaticData.cpp +++ b/moses/StaticData.cpp @@ -444,6 +444,7 @@ StaticData //source word deletion m_parameter->SetParameter(m_wordDeletionEnabled, "phrase-drop-allowed", false ); + m_parameter->SetParameter(m_isAlwaysCreateDirectTranslationOption, "always-create-direct-transopt", false ); } void From 501c51947b192e8559fa35d820ebd951566bebba Mon Sep 17 00:00:00 2001 From: Lexi Birch Date: Mon, 8 Jun 2015 16:58:50 +0100 Subject: [PATCH 060/108] Allowing the truecaser to work on uncased ASR input, pass the -a flag --- scripts/recaser/truecase.perl | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/scripts/recaser/truecase.perl b/scripts/recaser/truecase.perl index 0a4d366e0..7b3dc20fb 100755 --- a/scripts/recaser/truecase.perl +++ b/scripts/recaser/truecase.perl @@ -8,11 +8,14 @@ binmode(STDIN, ":utf8"); binmode(STDOUT, ":utf8"); # apply switches -my ($MODEL, $UNBUFFERED); -die("truecase.perl --model MODEL [-b] < in > out") - unless &GetOptions('model=s' => \$MODEL,'b|unbuffered' => \$UNBUFFERED) +# ASR input has no case, make sure it is lowercase, and make sure known are cased eg. 
'i' to be uppercased even if i is known +my ($MODEL, $UNBUFFERED, $ASR); +die("truecase.perl --model MODEL [-b] [-a] < in > out") + unless &GetOptions('model=s' => \$MODEL,'b|unbuffered' => \$UNBUFFERED, 'a|asr' => \$ASR) && defined($MODEL); if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; } +my $asr = 0; +if (defined($ASR) && $ASR) { $asr = 1; } my (%BEST,%KNOWN); open(MODEL,$MODEL) || die("ERROR: could not open '$MODEL'"); @@ -20,9 +23,11 @@ binmode(MODEL, ":utf8"); while() { my ($word,@OPTIONS) = split; $BEST{ lc($word) } = $word; - $KNOWN{ $word } = 1; - for(my $i=1;$i<$#OPTIONS;$i+=2) { - $KNOWN{ $OPTIONS[$i] } = 1; + if ($asr == 0) { + $KNOWN{ $word } = 1; + for(my $i=1;$i<$#OPTIONS;$i+=2) { + $KNOWN{ $OPTIONS[$i] } = 1; + } } } close(MODEL); @@ -49,6 +54,9 @@ while() { $word = $$WORD[$i]; $otherfactors = ""; } + if ($asr){ + $word = lc($word); #make sure ASR output is not uc + } if ($sentence_start && defined($BEST{lc($word)})) { print $BEST{lc($word)}; # truecase sentence start From fa51da28c5f21881b716026b69b07b0fd2e3a015 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Tue, 9 Jun 2015 16:50:27 +0100 Subject: [PATCH 061/108] moses/phrase-extract refactoring Final commit in this round of refactoring (which started with commit 2f735998...). The main changes are: - a general storage mechanism for attribute/value pairs in XML-style tree / lattice input. E.g. the "pcfg-score" and "semantic-role" attributes in: I - consolidation of the various near-duplicate Tree / XmlTreeParser classes that have accumulated over the years (my fault) - miscellaneous de-crufting --- phrase-extract/SyntaxNode.h | 3 +++ phrase-extract/SyntaxNodeCollection.h | 8 +++--- phrase-extract/XmlTree.cpp | 27 +++++++++---------- .../syntax-common/xml_tree_parser.h | 2 +- 4 files changed, 21 insertions(+), 19 deletions(-) diff --git a/phrase-extract/SyntaxNode.h b/phrase-extract/SyntaxNode.h index 49e2eb695..25a75b784 100644 --- a/phrase-extract/SyntaxNode.h +++ b/phrase-extract/SyntaxNode.h @@ -25,6 +25,9 @@ namespace MosesTraining { +/*! A node in a syntactic structure (tree, lattice, etc.). SyntaxNodes have a + * label and a span plus an arbitrary set of name/value attributes. + */ struct SyntaxNode { typedef std::map AttributeMap; diff --git a/phrase-extract/SyntaxNodeCollection.h b/phrase-extract/SyntaxNodeCollection.h index 405a77c5f..da0e1eca3 100644 --- a/phrase-extract/SyntaxNodeCollection.h +++ b/phrase-extract/SyntaxNodeCollection.h @@ -55,11 +55,13 @@ public: return m_nodes; }; - size_t GetNumWords() const { - return m_numWords; - } + //! Get the number of words (defined as 1 + the max end pos of any node). + std::size_t GetNumWords() const { return m_numWords; } + + //! Clear the container (this deletes the SyntaxNodes). void Clear(); + //! Extract a SyntaxTree (assuming the collection's nodes constitute a tree). std::auto_ptr ExtractTree(); private: diff --git a/phrase-extract/XmlTree.cpp b/phrase-extract/XmlTree.cpp index d8b77b6e6..d88c78c0b 100644 --- a/phrase-extract/XmlTree.cpp +++ b/phrase-extract/XmlTree.cpp @@ -80,7 +80,6 @@ string ParseXmlTagAttribute(const string& tag,const string& attributeName) return tag.substr(contentsStart,contentsEnd-contentsStart); } -// TODO Special handling of "label" attribute // s should be a sequence of name=attribute pairs separated by whitespace. // e.g. 
"label=\"S\" pcfg=\"-1.452\" foo=\"blah\\\"blah\"" void ParseXmlTagAttributes(const std::string &s, @@ -107,8 +106,9 @@ void ParseXmlTagAttributes(const std::string &s, throw XmlException("invalid tag content"); } } - // TODO unescape \" - attributes[name] = s.substr(begin+1, pos-begin-1); + if (name != "label" && name != "span") { + attributes[name] = s.substr(begin+1, pos-begin-1); + } begin = pos+1; } } @@ -245,20 +245,17 @@ vector TokenizeXml(const string& str) } /** - * Process a sentence with xml annotation - * Xml tags may specifiy additional/replacing translation options - * and reordering constraints + * Process a sentence with XML-style annotation of syntactic nodes. * - * \param line in: sentence, out: sentence without the xml - * \param res vector with translation options specified by xml - * \param reorderingConstraint reordering constraint zones specified by xml - * \param walls reordering constraint walls specified by xml + * \param line[in,out] in: sentence, out: sentence without the XML + * \param nodeCollection[out] the collection of SyntaxNode objects for this + * sentence + * \param labelCollection[out] label values are inserted into this set + * \param topLabelCollection[out] top labels (key) and their counts (value) + * are inserted into this map + * \param unescapeSpecialChars flag indicating whether XML special characters + * should be unescaped */ -/*TODO: we'd only have to return a vector of XML options if we dropped linking. 2-d vector - is so we can link things up afterwards. We can't create TranslationOptions as we - parse because we don't have the completed source parsed until after this function - removes all the markup from it (CreateFromString in Sentence::Read). -*/ bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection, set< string > &labelCollection, map< string, int > &topLabelCollection, diff --git a/phrase-extract/syntax-common/xml_tree_parser.h b/phrase-extract/syntax-common/xml_tree_parser.h index 48ea056b8..04ad74e24 100644 --- a/phrase-extract/syntax-common/xml_tree_parser.h +++ b/phrase-extract/syntax-common/xml_tree_parser.h @@ -16,7 +16,7 @@ namespace Syntax { * converts them to SyntaxTree objects. * * This is a thin wrapper around the ProcessAndStripXMLTags function. After - * calling Parse(), the output of the ProcessAndStripXMLTags function (the + * calling Parse(), the output from the ProcessAndStripXMLTags call (the * sentence, node collection, label set, and top label set) are available via * accessors. */ From dbcc264506ca26b471b821ab8fc0b78d88457185 Mon Sep 17 00:00:00 2001 From: Jeroen Vermeulen Date: Tue, 9 Jun 2015 23:10:27 +0700 Subject: [PATCH 062/108] Remove unneeded script. Tom Hoar, the author of this script, asked me to remove it because it doesn't actually do what the current name says, and can't work without an additional script which isn't in the repository. --- .../training/convert-moses-ini-v2-to-v1.py | 266 ------------------ 1 file changed, 266 deletions(-) delete mode 100755 scripts/training/convert-moses-ini-v2-to-v1.py diff --git a/scripts/training/convert-moses-ini-v2-to-v1.py b/scripts/training/convert-moses-ini-v2-to-v1.py deleted file mode 100755 index 4b7cfa5fa..000000000 --- a/scripts/training/convert-moses-ini-v2-to-v1.py +++ /dev/null @@ -1,266 +0,0 @@ -#! /usr/bin/env python -# -*- coding: utf8 -*- -# -# This file is part of moses. Its use is licensed under the GNU Lesser General -# Public License version 3 or, at your option, any later version. 
- - -from __future__ import ( - absolute_import, - print_function, - unicode_literals, - ) - -__version__ = '1.0' -__license__ = 'LGPL3' -__source__ = 'Precision Translation Tools Pte Lte' - -import errno -from sys import stdout -from copy import deepcopy -from os.path import ( - dirname, - basename, - exists, - realpath, - ) -from os import makedirs - - -root_escape = '%(escape-prefix)s' - - -class moses2_to_ini(object): - - def __init__(self, inp, out, escape_prefix): - self.inp = inp - self.out = out - self.escape_prefix = escape_prefix - self._config = {} - - def parse(self): - key = '' - section = None - self._config = {} - counter = 0 - - with open(self.inp, 'rb') as f: - contents = f.read().decode('utf8') - - lines = contents.splitlines() - - # Known feature/functions without attributes. - attrless_ffs = [ - 'UnknownWordPenalty', - 'WordPenalty', - 'PhrasePenalty', - 'Distortion', - ] - - # Retrieve all values except feature/functions with attributes. - for i, line in [(i, line.strip()) for i, line in enumerate(lines) - if line.strip() and not line.strip().startswith('#')]: - - if line.startswith('[') and line.endswith(']'): - - section = line.strip('] [') - - if section not in self._config.keys() + ['feature', 'weight']: - # New section not in config and not a reserved section. - counter = 0 - key = section - self._config[key] = {} - - elif section == 'feature' and line in attrless_ffs: - # Known feature/funcions without attributes. - key = '%s0' % line - if key not in self._config: - self._config[key] = {} - self._config[key]['feature'] = line - - elif section == 'feature': - # Skip feature/funcions with arguments. - continue - - elif section == 'weight': - # Add weight value to feature sections. - config_items = [ - (key.strip(), value.strip()) - for key, value in [line.split('=', 1)] - ] - for key, value in config_items: - if key not in self._config: - self._config[key] = {} - self._config[key]['weight'] = value - - else: - self._config[key][counter] = line - counter += 0 - - lines[i] = '' - - # Second, match feature/functions attributes to [weight] section - # values. - stripped_lines = [line.strip() for line in lines] - nonempty_lines = [ - line - for line in stripped_lines - if line != '' and not line.startswith('#') - ] - for i, line in enumerate(nonempty_lines): - # Add "feature" to assist creating tmpdict for feature/functions. - line = 'feature=%s' % line - tmpdict = dict([key.split('=', 1) for key in line.split()]) - - # Feature/functions 'name' attribute must match an entry in - # [weight]. - if tmpdict.get('name') not in self._config: - raise RuntimeError('malformed moses.ini v2 file') - - config_items = [ - (key.strip(), value.strip()) - for key, value in tmpdict.items() - if key.strip() != 'name' - ] - for key, value in config_items: - self._config[tmpdict['name']][key] = value - - return deepcopy(self._config) - - def render(self, config): - self._config = deepcopy(config) - _config = deepcopy(config) - lines = _tolines(_config, self.escape_prefix) - if self.out == '-': - stdout.write('\n'.join(lines)) - else: - contents = '\r\n'.join(lines) - makedir(dirname(self.out)) - with open(self.out, 'wb') as f: - f.write(contents.encode('utf8')) - - def __str__(self): - return '\n'.join(_tolines(self._config, self.escape_prefix)) - - @property - def config(self): - return deepcopy(self._config) - - -def _tolines(config, escape_prefix): - - section_names = sorted(config) - lines = [] - - # Group feature/functions first. 
- group_ffs = [ - name - for name in section_names - if name[-1].isdigit() - ] - for sectionname in group_ffs: - section = config[sectionname] - lines.append('[%s]' % sectionname) - for option, value in section.items(): - if option == 'path' \ - and escape_prefix is not None \ - and value.startswith(escape_prefix): - value = value.replace(escape_prefix, root_escape, 1) - lines.append('%s=%s' % (option, value)) - lines.append('') - - other_ffs = [ - name - for name in section_names - if not name[-1].isdigit() - ] - for sectionname in other_ffs: - section = config[sectionname] - lines.append('[%s]' % sectionname) - for option, value in section.items(): - lines.append('%s=%s' % (option, value)) - lines.append('') - - return deepcopy(lines) - - -def makedir(path, mode=0o777): - try: - makedirs(path, mode) - except OSError as e: - accepted_errors = [ - errno.EEXIST, - errno.EPERM, - errno.EACCES, - errno.ENOENT, - ] - if e.errno not in accepted_errors: - raise - - -def get_args(): - '''Parse command-line arguments - - Uses the API compatibility between the legacy - argparse.OptionParser and its replacement argparse.ArgumentParser - for functional equivelancy and nearly identical help prompt. - ''' - - description = 'Convert Moses.ini v2 file to standard INI format' - usage = '%s [arguments]' % basename(__file__) - - try: - from argparse import ArgumentParser - except ImportError: - from optparse import OptionParser - argparser = False - escape_help = ( - "Optional. Path of SMT model. If provided, " - "escapes \"escape-prefix\" with \"%(escape-prefix)s\"") - parser = OptionParser(usage=usage, description=description) - add_argument = parser.add_option - else: - argparser = True - escape_help = ( - "Optional. Path of SMT model. If provided, " - "escape \"escape-prefix\" with \"%%(escape-prefix)s\"") - parser = ArgumentParser(usage=usage, description=description) - add_argument = parser.add_argument - - add_argument( - '-i', '--inp', action='store', - help="moses.ini v2 file to convert (required)") - - add_argument( - '-o', '--out', action='store', default='-', - help="standard INI file (default: '-' outputs to stdout)") - - add_argument('-r', '--escape-prefix', action='store', help=escape_help) - - if argparser: - args = vars(parser.parse_args()) - else: - opts = parser.parse_args() - args = vars(opts[0]) - - if args['inp'] is None: - parser.error('argument -i/--inp required') - - args['inp'] = realpath(args['inp']) - - if not exists(args['inp']): - parser.error( - "argument -i/--inp invalid.\n" - "reference: %s" % args['inp']) - - if args['out'] != '-': - args['out'] = realpath(args['out']) - - return args - - -if __name__ == '__main__': - args = get_args() - converter = moses2_to_ini(**args) - config = converter.parse() - converter.render(config) From 47c793ca4647a6868fabc041cbd63e5114994e3f Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Wed, 10 Jun 2015 00:00:40 +0100 Subject: [PATCH 063/108] daily automatic beautifier --- phrase-extract/SyntaxNodeCollection.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/phrase-extract/SyntaxNodeCollection.h b/phrase-extract/SyntaxNodeCollection.h index da0e1eca3..ef0989cd0 100644 --- a/phrase-extract/SyntaxNodeCollection.h +++ b/phrase-extract/SyntaxNodeCollection.h @@ -56,7 +56,9 @@ public: }; //! Get the number of words (defined as 1 + the max end pos of any node). - std::size_t GetNumWords() const { return m_numWords; } + std::size_t GetNumWords() const { + return m_numWords; + } //! 
Clear the container (this deletes the SyntaxNodes). void Clear(); From 0d54286d3f11dda748de91d0a8a2977551066826 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Thu, 11 Jun 2015 14:43:10 -0400 Subject: [PATCH 064/108] Require __SSE2__ for i386 to use SSE2 --- util/integer_to_string.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/util/integer_to_string.cc b/util/integer_to_string.cc index 32047291d..6b8766119 100644 --- a/util/integer_to_string.cc +++ b/util/integer_to_string.cc @@ -7,6 +7,7 @@ Local modifications: 4. Remove test hook 5. Non-x86 support from the branch_lut code 6. Rename functions +7. Require __SSE2__ on i386 Copyright (C) 2014 Milo Yip @@ -66,7 +67,7 @@ const char gDigitsLut[200] = { // SSE2 implementation according to http://0x80.pl/articles/sse-itoa.html // Modifications: (1) fix incorrect digits (2) accept all ranges (3) write to user provided buffer. -#if defined(i386) || defined(__amd64) || defined(_M_IX86) || defined(_M_X64) +#if defined(__amd64) || defined(_M_X64) || (defined(__SSE2__) && (defined(_M_IX86) || defined(i386))) #include From 924710f53e959f7214f740db691ebe4f7778dfd7 Mon Sep 17 00:00:00 2001 From: Jeroen Vermeulen Date: Fri, 12 Jun 2015 15:11:57 +0700 Subject: [PATCH 065/108] On MinGW use Windows _chsize_t, not ftruncate. This works around a problem when building against MinGW and then running the resulting Windows binary on WINE. (Perverse, I know.) For some reason the ftruncate() to 0 bytes succeeds, but the subsequent one to a larger size fails. Even if the size is just 1 byte. This happened where GenericModel::InitializeFromARPA called BinaryFormat::SetupJustVocab, which called MapZeroedWrite, which calls ResizeOrThrow twice; the second one failed. --- util/file.cc | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/util/file.cc b/util/file.cc index 414d0471c..046b9ff90 100644 --- a/util/file.cc +++ b/util/file.cc @@ -111,10 +111,7 @@ uint64_t SizeOrThrow(int fd) { } void ResizeOrThrow(int fd, uint64_t to) { -#if defined __MINGW32__ - // Does this handle 64-bit? - int ret = ftruncate -#elif defined(_WIN32) || defined(_WIN64) +#if defined(_WIN32) || defined(_WIN64) errno_t ret = _chsize_s #elif defined(OS_ANDROID) int ret = ftruncate64 From ffd3f2bb6e34c9eb0fba01f7a76d573c2d7105d9 Mon Sep 17 00:00:00 2001 From: XapaJIaMnu Date: Fri, 12 Jun 2015 16:21:24 +0100 Subject: [PATCH 066/108] Added basic BilingualNPLM support to EMS and an example config. 
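A quick worked example of the window arithmetic behind the order, source-window and train_order settings in the example config added below. On my reading of the BilingualNPLM feature, the NPLM is trained on n-grams that concatenate order target-side tokens with source-window tokens on either side of the aligned source token plus that token itself, so with the values used in the config:

  train_order = order + 2 * source-window + 1
              = 5 + 2 * 4 + 1
              = 14

which matches the train_order value the config sets.
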
--- scripts/ems/example/config.toy.bilinguallm | 682 ++++++++++++++++++ .../ems/example/data/weight_bilinguallm.ini | 14 + scripts/ems/experiment.meta | 17 +- 3 files changed, 712 insertions(+), 1 deletion(-) create mode 100644 scripts/ems/example/config.toy.bilinguallm create mode 100644 scripts/ems/example/data/weight_bilinguallm.ini diff --git a/scripts/ems/example/config.toy.bilinguallm b/scripts/ems/example/config.toy.bilinguallm new file mode 100644 index 000000000..37a34b70b --- /dev/null +++ b/scripts/ems/example/config.toy.bilinguallm @@ -0,0 +1,682 @@ +################################################ +### CONFIGURATION FILE FOR AN SMT EXPERIMENT ### +################################################ + +[GENERAL] + +### directory in which experiment is run +# +working-dir = /mnt/gna0/nbogoych/ems_work + +# specification of the language pair +input-extension = fr +output-extension = en +pair-extension = fr-en + +### directories that contain tools and data +# +# moses +moses-src-dir = /mnt/gna0/nbogoych/mosesdecoder +# +# moses binaries +moses-bin-dir = $moses-src-dir/bin +# +# moses scripts +moses-script-dir = $moses-src-dir/scripts +# +# directory where GIZA++/MGIZA programs resides +external-bin-dir = /home/pkoehn/statmt/bin +# +# srilm +#srilm-dir = $moses-src-dir/srilm/bin/i686 +# +# irstlm +#irstlm-dir = $moses-src-dir/irstlm/bin +# +# randlm +#randlm-dir = $moses-src-dir/randlm/bin +# +# data +toy-data = $moses-script-dir/ems/example/data + +### basic tools +# +# moses decoder +decoder = $moses-bin-dir/moses + +# conversion of rule table into binary on-disk format +ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 4 100 2" + +# tokenizers - comment out if all your data is already tokenized +input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension" +output-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $output-extension" + +# truecasers - comment out if you do not use the truecaser +input-truecaser = $moses-script-dir/recaser/truecase.perl +output-truecaser = $moses-script-dir/recaser/truecase.perl +detruecaser = $moses-script-dir/recaser/detruecase.perl + +# lowercaser - comment out if you use truecasing +#input-lowercaser = $moses-script-dir/tokenizer/lowercase.perl +#output-lowercaser = $moses-script-dir/tokenizer/lowercase.perl + +### generic parallelizer for cluster and multi-core machines +# you may specify a script that allows the parallel execution +# parallizable steps (see meta file). you also need specify +# the number of jobs (cluster) or cores (multicore) +# +#generic-parallelizer = $moses-script-dir/ems/support/generic-parallelizer.perl +#generic-parallelizer = $moses-script-dir/ems/support/generic-multicore-parallelizer.perl + +### cluster settings (if run on a cluster machine) +# number of jobs to be submitted in parallel +# +#jobs = 10 + +# arguments to qsub when scheduling a job +#qsub-settings = "" + +# project for priviledges and usage accounting +#qsub-project = iccs_smt + +# memory and time +#qsub-memory = 4 +#qsub-hours = 48 + +### multi-core settings +# when the generic parallelizer is used, the number of cores +# specified here +cores = 8 + +################################################################# +# PARALLEL CORPUS PREPARATION: +# create a tokenized, sentence-aligned corpus, ready for training + +[CORPUS] + +### long sentences are filtered out, since they slow down GIZA++ +# and are a less reliable source of data. 
set here the maximum +# length of a sentence +# +max-sentence-length = 80 + +[CORPUS:toy] + +### command to run to get raw corpus files +# +# get-corpus-script = + +### raw corpus files (untokenized, but sentence aligned) +# +raw-stem = $toy-data/nc-5k + +### tokenized corpus files (may contain long sentences) +# +#tokenized-stem = + +### if sentence filtering should be skipped, +# point to the clean training data +# +#clean-stem = + +### if corpus preparation should be skipped, +# point to the prepared training data +# +#lowercased-stem = + +################################################################# +# LANGUAGE MODEL TRAINING + +[LM] + +### tool to be used for language model training +# kenlm training +lm-training = "$moses-script-dir/ems/support/lmplz-wrapper.perl -bin $moses-bin-dir/lmplz" +settings = "--prune '0 0 1' -T $working-dir/lm -S 20%" + +# srilm +#lm-training = $srilm-dir/ngram-count +#settings = "-interpolate -kndiscount -unk" + +# irstlm training +# msb = modified kneser ney; p=0 no singleton pruning +#lm-training = "$moses-script-dir/generic/trainlm-irst2.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/tmp" +#settings = "-s msb -p 0" + +# order of the language model +order = 5 + +### tool to be used for training randomized language model from scratch +# (more commonly, a SRILM is trained) +# +#rlm-training = "$randlm-dir/buildlm -falsepos 8 -values 8" + +### script to use for binary table format for irstlm or kenlm +# (default: no binarization) + +# irstlm +#lm-binarizer = $irstlm-dir/compile-lm + +# kenlm, also set type to 8 +lm-binarizer = $moses-bin-dir/build_binary +type = 8 + +### script to create quantized language model format (irstlm) +# (default: no quantization) +# +#lm-quantizer = $irstlm-dir/quantize-lm + +### script to use for converting into randomized table format +# (default: no randomization) +# +#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8" + +### each language model to be used has its own section here + +[LM:toy] + +### command to run to get raw corpus files +# +#get-corpus-script = "" + +### raw corpus (untokenized) +# +raw-corpus = $toy-data/nc-5k.$output-extension + +### tokenized corpus files (may contain long sentences) +# +#tokenized-corpus = + +### if corpus preparation should be skipped, +# point to the prepared language model +# +#lm = + +[LM:bilingual-lm] +#bilingual-lm +exclude-from-interpolation = true +bilingual-lm = "yes" +bilingual-lm-workdir = "bilingual" +bilingual-lm-settings = "" +order = "5" +source-window = "4" + +#actual training +train_order = "14" #this is equal to order + 2*source-window + 1 +nplm-output-dir = "nplm_out" +nplm-settings = "-l /mnt/gna0/rsennrich/tools/nplm-0.3-gpu-experimental/" + +#Config file generation: +config-feature-line = "BilingualNPLM order=$order source_window=$source-window path=$working-dir/$nplm-output-dir/train.10k.model.nplm.10 source_vocab=$working-dir/$bilingual-lm-workdir/vocab.source target_vocab=$working-dir/$bilingual-lm-workdir/vocab.target" +config-weight-line = "BilingualNPLM0= 0.1" + +################################################################# +# INTERPOLATING LANGUAGE MODELS + +[INTERPOLATED-LM] + +# if multiple language models are used, these may be combined +# by optimizing perplexity on a tuning set +# see, for instance [Koehn and Schwenk, IJCNLP 2008] + +### script to interpolate language models +# if commented out, no interpolation is performed +# +# script = $moses-script-dir/ems/support/interpolate-lm.perl + +### tuning set +# you may use 
the same set that is used for mert tuning (reference set) +# +#tuning-sgm = +#raw-tuning = +#tokenized-tuning = +#factored-tuning = +#lowercased-tuning = +#split-tuning = + +### group language models for hierarchical interpolation +# (flat interpolation is limited to 10 language models) +#group = "first,second fourth,fifth" + +### script to use for binary table format for irstlm or kenlm +# (default: no binarization) + +# irstlm +#lm-binarizer = $irstlm-dir/compile-lm + +# kenlm, also set type to 8 +lm-binarizer = $moses-bin-dir/build_binary +type = 8 + +### script to create quantized language model format (irstlm) +# (default: no quantization) +# +#lm-quantizer = $irstlm-dir/quantize-lm + +### script to use for converting into randomized table format +# (default: no randomization) +# +#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8" + +################################################################# +# MODIFIED MOORE LEWIS FILTERING + +[MML] IGNORE + +### specifications for language models to be trained +# +#lm-training = $srilm-dir/ngram-count +#lm-settings = "-interpolate -kndiscount -unk" +#lm-binarizer = $moses-src-dir/bin/build_binary +#lm-query = $moses-src-dir/bin/query +#order = 5 + +### in-/out-of-domain source/target corpora to train the 4 language model +# +# in-domain: point either to a parallel corpus +#outdomain-stem = [CORPUS:toy:clean-split-stem] + +# ... or to two separate monolingual corpora +#indomain-target = [LM:toy:lowercased-corpus] +#raw-indomain-source = $toy-data/nc-5k.$input-extension + +# point to out-of-domain parallel corpus +#outdomain-stem = [CORPUS:giga:clean-split-stem] + +# settings: number of lines sampled from the corpora to train each language model on +# (if used at all, should be small as a percentage of corpus) +#settings = "--line-count 100000" + +################################################################# +# TRANSLATION MODEL TRAINING + +[TRAINING] + +### training script to be used: either a legacy script or +# current moses training script (default) +# +script = $moses-script-dir/training/train-model.perl + +### general options +# these are options that are passed on to train-model.perl, for instance +# * "-mgiza -mgiza-cpus 8" to use mgiza instead of giza +# * "-sort-buffer-size 8G -sort-compress gzip" to reduce on-disk sorting +# * "-sort-parallel 8 -cores 8" to speed up phrase table building +# * "-parallel" for parallel execution of mkcls and giza +# +#training-options = "" + +### factored training: specify here which factors used +# if none specified, single factor training is assumed +# (one translation step, surface to surface) +# +#input-factors = word lemma pos morph +#output-factors = word lemma pos +#alignment-factors = "word -> word" +#translation-factors = "word -> word" +#reordering-factors = "word -> word" +#generation-factors = "word -> pos" +#decoding-steps = "t0, g0" + +### parallelization of data preparation step +# the two directions of the data preparation can be run in parallel +# comment out if not needed +# +parallel = yes + +### pre-computation for giza++ +# giza++ has a more efficient data structure that needs to be +# initialized with snt2cooc. if run in parallel, this may reduces +# memory requirements. 
set here the number of parts +# +#run-giza-in-parts = 5 + +### symmetrization method to obtain word alignments from giza output +# (commonly used: grow-diag-final-and) +# +alignment-symmetrization-method = grow-diag-final-and + +### use of Chris Dyer's fast align for word alignment +# +#fast-align-settings = "-d -o -v" + +### use of berkeley aligner for word alignment +# +#use-berkeley = true +#alignment-symmetrization-method = berkeley +#berkeley-train = $moses-script-dir/ems/support/berkeley-train.sh +#berkeley-process = $moses-script-dir/ems/support/berkeley-process.sh +#berkeley-jar = /your/path/to/berkeleyaligner-1.1/berkeleyaligner.jar +#berkeley-java-options = "-server -mx30000m -ea" +#berkeley-training-options = "-Main.iters 5 5 -EMWordAligner.numThreads 8" +#berkeley-process-options = "-EMWordAligner.numThreads 8" +#berkeley-posterior = 0.5 + +### use of baseline alignment model (incremental training) +# +#baseline = 68 +#baseline-alignment-model = "$working-dir/training/prepared.$baseline/$input-extension.vcb \ +# $working-dir/training/prepared.$baseline/$output-extension.vcb \ +# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.cooc \ +# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.cooc \ +# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.thmm.5 \ +# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.hhmm.5 \ +# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.thmm.5 \ +# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.hhmm.5" + +### if word alignment should be skipped, +# point to word alignment files +# +#word-alignment = $working-dir/model/aligned.1 + +### filtering some corpora with modified Moore-Lewis +# specify corpora to be filtered and ratio to be kept, either before or after word alignment +#mml-filter-corpora = toy +#mml-before-wa = "-proportion 0.9" +#mml-after-wa = "-proportion 0.9" + +### build memory mapped suffix array phrase table +# (binarizing the reordering table is a good idea, since filtering makes little sense) +#mmsapt = "num-features=9 pfwd=g+ pbwd=g+ smooth=0 sample=1000 workers=1" +#binarize-all = $moses-script-dir/training/binarize-model.perl + +### create a bilingual concordancer for the model +# +#biconcor = $moses-bin-dir/biconcor + +## Operation Sequence Model (OSM) +# Durrani, Schmid and Fraser. (2011): +# "A Joint Sequence Translation Model with Integrated Reordering" +# compile Moses with --max-kenlm-order=9 if higher order is required +# +#operation-sequence-model = "yes" +#operation-sequence-model-order = 5 +#operation-sequence-model-settings = "" +# +# if OSM training should be skipped, point to OSM Model +#osm-model = + +### unsupervised transliteration module +# Durrani, Sajjad, Hoang and Koehn (EACL, 2014). +# "Integrating an Unsupervised Transliteration Model +# into Statistical Machine Translation." 
+# +#transliteration-module = "yes" +#post-decoding-transliteration = "yes" + +### lexicalized reordering: specify orientation type +# (default: only distance-based reordering model) +# +lexicalized-reordering = msd-bidirectional-fe + +### hierarchical rule set +# +#hierarchical-rule-set = true + +### settings for rule extraction +# +#extract-settings = "" +max-phrase-length = 5 + +### add extracted phrases from baseline model +# +#baseline-extract = $working-dir/model/extract.$baseline +# +# requires aligned parallel corpus for re-estimating lexical translation probabilities +#baseline-corpus = $working-dir/training/corpus.$baseline +#baseline-alignment = $working-dir/model/aligned.$baseline.$alignment-symmetrization-method + +### unknown word labels (target syntax only) +# enables use of unknown word labels during decoding +# label file is generated during rule extraction +# +#use-unknown-word-labels = true + +### if phrase extraction should be skipped, +# point to stem for extract files +# +# extracted-phrases = + +### settings for rule scoring +# +score-settings = "--GoodTuring --MinScore 2:0.0001" + +### include word alignment in phrase table +# +#include-word-alignment-in-rules = yes + +### sparse lexical features +# +#sparse-features = "target-word-insertion top 50, source-word-deletion top 50, word-translation top 50 50, phrase-length" + +### domain adaptation settings +# options: sparse, any of: indicator, subset, ratio +#domain-features = "subset" + +### if phrase table training should be skipped, +# point to phrase translation table +# +# phrase-translation-table = + +### if reordering table training should be skipped, +# point to reordering table +# +# reordering-table = + +### filtering the phrase table based on significance tests +# Johnson, Martin, Foster and Kuhn. 
(2007): "Improving Translation Quality by Discarding Most of the Phrasetable" +# options: -n number of translations; -l 'a+e', 'a-e', or a positive real value -log prob threshold +#salm-index = /path/to/project/salm/Bin/Linux/Index/IndexSA.O64 +#sigtest-filter = "-l a+e -n 50" + +### if training should be skipped, +# point to a configuration file that contains +# pointers to all relevant model files +# +#config-with-reused-weights = + +##################################################### +### TUNING: finding good weights for model components + +[TUNING] + +### instead of tuning with this setting, old weights may be recycled +# specify here an old configuration file with matching weights +# +weight-config = $toy-data/weight.ini + +### tuning script to be used +# +tuning-script = $moses-script-dir/training/mert-moses.pl +tuning-settings = "-mertdir $moses-bin-dir" + +### specify the corpus used for tuning +# it should contain 1000s of sentences +# +#input-sgm = +#raw-input = +#tokenized-input = +#factorized-input = +#input = +# +#reference-sgm = +#raw-reference = +#tokenized-reference = +#factorized-reference = +#reference = + +### size of n-best list used (typically 100) +# +nbest = 100 + +### ranges for weights for random initialization +# if not specified, the tuning script will use generic ranges +# it is not clear, if this matters +# +# lambda = + +### additional flags for the filter script +# +filter-settings = "" + +### additional flags for the decoder +# +decoder-settings = "" + +### if tuning should be skipped, specify this here +# and also point to a configuration file that contains +# pointers to all relevant model files +# +#config-with-reused-weights = + +######################################################### +## RECASER: restore case, this part only trains the model + +[RECASING] IGNORE + +### training data +# raw input needs to be still tokenized, +# also also tokenized input may be specified +# +#tokenized = [LM:europarl:tokenized-corpus] + +### additinal settings +# +recasing-settings = "" +#lm-training = $srilm-dir/ngram-count +decoder = $moses-bin-dir/moses + +# already a trained recaser? 
point to config file +#recase-config = + +####################################################### +## TRUECASER: train model to truecase corpora and input + +[TRUECASER] + +### script to train truecaser models +# +trainer = $moses-script-dir/recaser/train-truecaser.perl + +### training data +# data on which truecaser is trained +# if no training data is specified, parallel corpus is used +# +# raw-stem = +# tokenized-stem = + +### trained model +# +# truecase-model = + +###################################################################### +## EVALUATION: translating a test set using the tuned system and scoring it + +[EVALUATION] + +### additional flags for the filter script +# +#filter-settings = "" + +### additional decoder settings +# switches for the Moses decoder +# common choices: +# "-threads N" for multi-threading +# "-mbr" for MBR decoding +# "-drop-unknown" for dropping unknown source words +# "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning +# +decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" + +### specify size of n-best list, if produced +# +#nbest = 100 + +### multiple reference translations +# +#multiref = yes + +### prepare system output for scoring +# this may include detokenization and wrapping output in sgm +# (needed for nist-bleu, ter, meteor) +# +detokenizer = "$moses-script-dir/tokenizer/detokenizer.perl -l $output-extension" +#recaser = $moses-script-dir/recaser/recase.perl +wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension" +#output-sgm = + +### BLEU +# +nist-bleu = $moses-script-dir/generic/mteval-v13a.pl +nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c" +#multi-bleu = "$moses-script-dir/generic/multi-bleu.perl -lc" +#multi-bleu-c = $moses-script-dir/generic/multi-bleu.perl +#ibm-bleu = + +### TER: translation error rate (BBN metric) based on edit distance +# not yet integrated +# +# ter = + +### METEOR: gives credit to stem / wordnet synonym matches +# not yet integrated +# +# meteor = + +### Analysis: carry out various forms of analysis on the output +# +analysis = $moses-script-dir/ems/support/analysis.perl +# +# also report on input coverage +analyze-coverage = yes +# +# also report on phrase mappings used +report-segmentation = yes +# +# report precision of translations for each input word, broken down by +# count of input word in corpus and model +#report-precision-by-coverage = yes +# +# further precision breakdown by factor +#precision-by-coverage-factor = pos +# +# visualization of the search graph in tree-based models +#analyze-search-graph = yes + +[EVALUATION:test] + +### input data +# +input-sgm = $toy-data/test-src.$input-extension.sgm +# raw-input = +# tokenized-input = +# factorized-input = +# input = + +### reference data +# +reference-sgm = $toy-data/test-ref.$output-extension.sgm +# raw-reference = +# tokenized-reference = +# reference = + +### analysis settings +# may contain any of the general evaluation analysis settings +# specific setting: base coverage statistics on earlier run +# +#precision-by-coverage-base = $working-dir/evaluation/test.analysis.5 + +### wrapping frame +# for nist-bleu and other scoring scripts, the output needs to be wrapped +# in sgm markup (typically like the input sgm) +# +wrapping-frame = $input-sgm + +########################################## +### REPORTING: summarize evaluation scores + +[REPORTING] + +### currently no parameters for reporting section + + diff --git a/scripts/ems/example/data/weight_bilinguallm.ini 
b/scripts/ems/example/data/weight_bilinguallm.ini new file mode 100644 index 000000000..fbe26fc03 --- /dev/null +++ b/scripts/ems/example/data/weight_bilinguallm.ini @@ -0,0 +1,14 @@ +######################### +### MOSES CONFIG FILE ### +######################### + +[weight] +Distortion0= 0.3 +UnknownWordPenalty0= 1 +WordPenalty0= -1 +TranslationModel0= 0.2 0.2 0.2 0.2 +PhrasePenalty0= 0.2 +LexicalReordering0= 0.3 0.3 0.3 0.3 0.3 0.3 +LM0= 0.5 +BilingualNPLM0= 0.1 + diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index dafbe4a42..62e38128c 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -160,6 +160,20 @@ train ignore-if: no-splitter-training [LM] multiple +prepare-bilingual-nplm + in: TRAINING:corpus TRAINING:word-alignment + out: numberized_ngrams + ignore-unless: bilingual-lm + rerun-on-change: TRAINING:corpus TRAINING:word-alignment + template: $moses-script-dir/training/bilingual-lm/extract_training.py -c IN0 -e $output-extension -f $input-extension -a IN1.$TRAINING:alignment-symmetrization-method -w $working-dir/$bilingual-lm-workdir -n $order -m $source-window $bilingual-lm-settings + default-name: LM/bilingualLM_prep +train-bilingual-lm + in: numberized_ngrams TRAINING:corpus + out: binlm + ignore-unless: bilingual-lm + rerun-on-change: numberized_ngrams + template: $moses-script-dir/training/bilingual-lm/train_nplm.py -w $working-dir/$bilingual-lm-workdir -c IN1 -r $working-dir/$nplm-output-dir -n $train_order $nplm-settings + default-name: LM/BilingualLM get-corpus in: get-corpus-script out: raw-corpus @@ -247,7 +261,7 @@ train in: stripped-corpus out: lm default-name: lm/lm - ignore-if: rlm-training custom-training + ignore-if: rlm-training custom-training bilingual-lm rerun-on-change: lm-training order settings template: $lm-training -order $order $settings -text IN -lm OUT error: cannot execute binary file @@ -293,6 +307,7 @@ binarize in: qlm out: binlm pass-unless: lm-binarizer + ignore-if: bilingual-lm rerun-on-change: lm default-name: lm/binlm template: $lm-binarizer IN OUT From 166bf7365f02cf573265d7aded822f0d08215de0 Mon Sep 17 00:00:00 2001 From: XapaJIaMnu Date: Fri, 12 Jun 2015 16:56:36 +0100 Subject: [PATCH 067/108] Forgot to update the weight config path --- scripts/ems/example/config.toy.bilinguallm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ems/example/config.toy.bilinguallm b/scripts/ems/example/config.toy.bilinguallm index 37a34b70b..cd6880f32 100644 --- a/scripts/ems/example/config.toy.bilinguallm +++ b/scripts/ems/example/config.toy.bilinguallm @@ -483,7 +483,7 @@ score-settings = "--GoodTuring --MinScore 2:0.0001" ### instead of tuning with this setting, old weights may be recycled # specify here an old configuration file with matching weights # -weight-config = $toy-data/weight.ini +weight-config = $toy-data/weight_bilinguallm.ini ### tuning script to be used # From bd86ceffbe6d748a863e15def2443150ca360b38 Mon Sep 17 00:00:00 2001 From: Jeroen Vermeulen Date: Sat, 13 Jun 2015 21:31:53 +0700 Subject: [PATCH 068/108] Check for error when opening gzfilebuf. This replaces a segfault when a file can't be found with an exception. Not as helpful as it could be yet, but certainly better than just crashing. Also, make InputFileStream constructor from path "explicit" to avoid mistakes. 
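For illustration, a minimal self-contained sketch (not Moses code; the class and file names below are made up) of the two ideas in this patch: failing fast with an exception when an open fails, and marking the converting constructor explicit so a path string is never silently converted into a stream object.

#include <cstdio>
#include <iostream>
#include <stdexcept>
#include <string>

class FileReader
{
public:
  // 'explicit' stops silent conversions, e.g. a std::string being turned
  // into a FileReader where a function merely expects one.
  explicit FileReader(const std::string &path) : m_path(path) {
    std::FILE *f = std::fopen(path.c_str(), "rb");
    if (!f)
      // Mirror the gzfilebuf change: fail fast with an exception
      // instead of segfaulting later on a null handle.
      throw std::runtime_error("Could not open " + path + ".");
    std::fclose(f);
  }
private:
  std::string m_path;
};

void Consume(const FileReader &reader) { (void) reader; }

int main()
{
  // Consume(std::string("corpus.gz"));  // no longer compiles: the constructor is explicit
  try {
    FileReader reader(std::string("corpus.gz")); // hypothetical file name
    Consume(reader);
  } catch (const std::runtime_error &e) {
    std::cerr << e.what() << std::endl;          // clean diagnostic instead of a crash
  }
  return 0;
}
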
--- moses/InputFileStream.h | 2 +- moses/gzfilebuf.h | 3 +++ phrase-extract/InputFileStream.h | 2 +- phrase-extract/extract-mixed-syntax/InputFileStream.h | 2 +- phrase-extract/extract-mixed-syntax/gzfilebuf.h | 4 +++- phrase-extract/gzfilebuf.h | 4 +++- phrase-extract/lexical-reordering/InputFileStream.h | 2 +- phrase-extract/lexical-reordering/gzfilebuf.h | 4 +++- 8 files changed, 16 insertions(+), 7 deletions(-) diff --git a/moses/InputFileStream.h b/moses/InputFileStream.h index d53abfc23..313ddfed7 100644 --- a/moses/InputFileStream.h +++ b/moses/InputFileStream.h @@ -37,7 +37,7 @@ protected: std::streambuf *m_streambuf; public: - InputFileStream(const std::string &filePath); + explicit InputFileStream(const std::string &filePath); ~InputFileStream(); void Close(); diff --git a/moses/gzfilebuf.h b/moses/gzfilebuf.h index 2376c2875..c82092933 100644 --- a/moses/gzfilebuf.h +++ b/moses/gzfilebuf.h @@ -1,6 +1,7 @@ #ifndef moses_gzfile_buf_h #define moses_gzfile_buf_h +#include #include #include #include @@ -13,6 +14,8 @@ class gzfilebuf : public std::streambuf public: gzfilebuf(const char *filename) { _gzf = gzopen(filename, "rb"); + if (!_gzf) + throw std::runtime_error("Could not open " + std::string(filename) + "."); setg (_buff+sizeof(int), // beginning of putback area _buff+sizeof(int), // read position _buff+sizeof(int)); // end position diff --git a/phrase-extract/InputFileStream.h b/phrase-extract/InputFileStream.h index e2a31bc82..5de416237 100644 --- a/phrase-extract/InputFileStream.h +++ b/phrase-extract/InputFileStream.h @@ -37,7 +37,7 @@ protected: std::streambuf *m_streambuf; public: - InputFileStream(const std::string &filePath); + explicit InputFileStream(const std::string &filePath); ~InputFileStream(); void Close(); diff --git a/phrase-extract/extract-mixed-syntax/InputFileStream.h b/phrase-extract/extract-mixed-syntax/InputFileStream.h index e2a31bc82..5de416237 100644 --- a/phrase-extract/extract-mixed-syntax/InputFileStream.h +++ b/phrase-extract/extract-mixed-syntax/InputFileStream.h @@ -37,7 +37,7 @@ protected: std::streambuf *m_streambuf; public: - InputFileStream(const std::string &filePath); + explicit InputFileStream(const std::string &filePath); ~InputFileStream(); void Close(); diff --git a/phrase-extract/extract-mixed-syntax/gzfilebuf.h b/phrase-extract/extract-mixed-syntax/gzfilebuf.h index b5b0ce87f..4c818ddbb 100644 --- a/phrase-extract/extract-mixed-syntax/gzfilebuf.h +++ b/phrase-extract/extract-mixed-syntax/gzfilebuf.h @@ -1,6 +1,7 @@ #ifndef moses_gzfile_buf_h #define moses_gzfile_buf_h +#include #include #include #include @@ -10,7 +11,8 @@ class gzfilebuf : public std::streambuf public: gzfilebuf(const char *filename) { _gzf = gzopen(filename, "rb"); - setg (_buff+sizeof(int), // beginning of putback area + if (!_gzf) + throw std::runtime_error("Could not open " + std::string(filename) + "."); setg (_buff+sizeof(int), // beginning of putback area _buff+sizeof(int), // read position _buff+sizeof(int)); // end position } diff --git a/phrase-extract/gzfilebuf.h b/phrase-extract/gzfilebuf.h index b5b0ce87f..4c818ddbb 100644 --- a/phrase-extract/gzfilebuf.h +++ b/phrase-extract/gzfilebuf.h @@ -1,6 +1,7 @@ #ifndef moses_gzfile_buf_h #define moses_gzfile_buf_h +#include #include #include #include @@ -10,7 +11,8 @@ class gzfilebuf : public std::streambuf public: gzfilebuf(const char *filename) { _gzf = gzopen(filename, "rb"); - setg (_buff+sizeof(int), // beginning of putback area + if (!_gzf) + throw std::runtime_error("Could not open " + 
std::string(filename) + "."); setg (_buff+sizeof(int), // beginning of putback area _buff+sizeof(int), // read position _buff+sizeof(int)); // end position } diff --git a/phrase-extract/lexical-reordering/InputFileStream.h b/phrase-extract/lexical-reordering/InputFileStream.h index 1f37715fd..dcc28a60c 100755 --- a/phrase-extract/lexical-reordering/InputFileStream.h +++ b/phrase-extract/lexical-reordering/InputFileStream.h @@ -37,7 +37,7 @@ protected: std::streambuf *m_streambuf; public: - InputFileStream(const std::string &filePath); + explicit InputFileStream(const std::string &filePath); ~InputFileStream(); void Open(const std::string &filePath); diff --git a/phrase-extract/lexical-reordering/gzfilebuf.h b/phrase-extract/lexical-reordering/gzfilebuf.h index b5b0ce87f..4c818ddbb 100755 --- a/phrase-extract/lexical-reordering/gzfilebuf.h +++ b/phrase-extract/lexical-reordering/gzfilebuf.h @@ -1,6 +1,7 @@ #ifndef moses_gzfile_buf_h #define moses_gzfile_buf_h +#include #include #include #include @@ -10,7 +11,8 @@ class gzfilebuf : public std::streambuf public: gzfilebuf(const char *filename) { _gzf = gzopen(filename, "rb"); - setg (_buff+sizeof(int), // beginning of putback area + if (!_gzf) + throw std::runtime_error("Could not open " + std::string(filename) + "."); setg (_buff+sizeof(int), // beginning of putback area _buff+sizeof(int), // read position _buff+sizeof(int)); // end position } From 89c2df558ca3533369586d8d4e2c7451b2d2732e Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Sun, 14 Jun 2015 00:00:44 +0100 Subject: [PATCH 069/108] daily automatic beautifier --- phrase-extract/extract-mixed-syntax/gzfilebuf.h | 3 ++- phrase-extract/gzfilebuf.h | 3 ++- phrase-extract/lexical-reordering/gzfilebuf.h | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) mode change 100755 => 100644 phrase-extract/lexical-reordering/gzfilebuf.h diff --git a/phrase-extract/extract-mixed-syntax/gzfilebuf.h b/phrase-extract/extract-mixed-syntax/gzfilebuf.h index 4c818ddbb..e070da306 100644 --- a/phrase-extract/extract-mixed-syntax/gzfilebuf.h +++ b/phrase-extract/extract-mixed-syntax/gzfilebuf.h @@ -12,7 +12,8 @@ public: gzfilebuf(const char *filename) { _gzf = gzopen(filename, "rb"); if (!_gzf) - throw std::runtime_error("Could not open " + std::string(filename) + "."); setg (_buff+sizeof(int), // beginning of putback area + throw std::runtime_error("Could not open " + std::string(filename) + "."); + setg (_buff+sizeof(int), // beginning of putback area _buff+sizeof(int), // read position _buff+sizeof(int)); // end position } diff --git a/phrase-extract/gzfilebuf.h b/phrase-extract/gzfilebuf.h index 4c818ddbb..e070da306 100644 --- a/phrase-extract/gzfilebuf.h +++ b/phrase-extract/gzfilebuf.h @@ -12,7 +12,8 @@ public: gzfilebuf(const char *filename) { _gzf = gzopen(filename, "rb"); if (!_gzf) - throw std::runtime_error("Could not open " + std::string(filename) + "."); setg (_buff+sizeof(int), // beginning of putback area + throw std::runtime_error("Could not open " + std::string(filename) + "."); + setg (_buff+sizeof(int), // beginning of putback area _buff+sizeof(int), // read position _buff+sizeof(int)); // end position } diff --git a/phrase-extract/lexical-reordering/gzfilebuf.h b/phrase-extract/lexical-reordering/gzfilebuf.h old mode 100755 new mode 100644 index 4c818ddbb..e070da306 --- a/phrase-extract/lexical-reordering/gzfilebuf.h +++ b/phrase-extract/lexical-reordering/gzfilebuf.h @@ -12,7 +12,8 @@ public: gzfilebuf(const char *filename) { _gzf = gzopen(filename, "rb"); if (!_gzf) - throw 
std::runtime_error("Could not open " + std::string(filename) + "."); setg (_buff+sizeof(int), // beginning of putback area + throw std::runtime_error("Could not open " + std::string(filename) + "."); + setg (_buff+sizeof(int), // beginning of putback area _buff+sizeof(int), // read position _buff+sizeof(int)); // end position } From ad8114ddb0b7af26c74680a9657c5fa6f82cf1eb Mon Sep 17 00:00:00 2001 From: Barry Haddow Date: Mon, 15 Jun 2015 16:23:12 +0100 Subject: [PATCH 070/108] capitalisation --- scripts/ems/experiment.meta | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index 62e38128c..9edeec460 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -166,14 +166,14 @@ prepare-bilingual-nplm ignore-unless: bilingual-lm rerun-on-change: TRAINING:corpus TRAINING:word-alignment template: $moses-script-dir/training/bilingual-lm/extract_training.py -c IN0 -e $output-extension -f $input-extension -a IN1.$TRAINING:alignment-symmetrization-method -w $working-dir/$bilingual-lm-workdir -n $order -m $source-window $bilingual-lm-settings - default-name: LM/bilingualLM_prep + default-name: lm/bilingualLM_prep train-bilingual-lm in: numberized_ngrams TRAINING:corpus out: binlm ignore-unless: bilingual-lm rerun-on-change: numberized_ngrams template: $moses-script-dir/training/bilingual-lm/train_nplm.py -w $working-dir/$bilingual-lm-workdir -c IN1 -r $working-dir/$nplm-output-dir -n $train_order $nplm-settings - default-name: LM/BilingualLM + default-name: lm/bilingualLM get-corpus in: get-corpus-script out: raw-corpus From 6c0f875385ffaf827139c9a9220a2c5bfd195178 Mon Sep 17 00:00:00 2001 From: Rico Sennrich Date: Tue, 16 Jun 2015 16:19:41 +0100 Subject: [PATCH 071/108] testing the waters for c++11 please adjust your compiler options or complain if you rely on a compiler that doesn't support c++11 yet. --- Jamroot | 2 +- moses/StaticData.cpp | 3 +-- phrase-extract/ScoreFeatureTest.cpp | 20 ++++++++++---------- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/Jamroot b/Jamroot index 119c6183e..a4957dfa2 100644 --- a/Jamroot +++ b/Jamroot @@ -108,7 +108,7 @@ external-lib z ; #lib dl : : static:static shared:shared ; #requirements += dl ; - +requirements += gcc:-std=c++0x ; if ! 
[ option.get "without-tcmalloc" : : "yes" ] && [ test_library "tcmalloc_minimal" ] { if [ option.get "full-tcmalloc" : : "yes" ] { diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp index 8fb88c257..28d9f7831 100644 --- a/moses/StaticData.cpp +++ b/moses/StaticData.cpp @@ -1115,8 +1115,7 @@ void StaticData::LoadSparseWeightsFromConfig() } std::map > weights = m_parameter->GetAllWeights(); - std::map >::iterator iter; - for (iter = weights.begin(); iter != weights.end(); ++iter) { + for (auto iter = weights.begin(); iter != weights.end(); ++iter) { // this indicates that it is sparse feature if (featureNames.find(iter->first) == featureNames.end()) { UTIL_THROW_IF2(iter->second.size() != 1, "ERROR: only one weight per sparse feature allowed: " << iter->first); diff --git a/phrase-extract/ScoreFeatureTest.cpp b/phrase-extract/ScoreFeatureTest.cpp index 93a452dad..2863122dd 100644 --- a/phrase-extract/ScoreFeatureTest.cpp +++ b/phrase-extract/ScoreFeatureTest.cpp @@ -53,16 +53,16 @@ BOOST_AUTO_TEST_CASE(manager_configure_domain_except) //Check that configure rejects illegal domain arg combinations ScoreFeatureManager manager; BOOST_CHECK_THROW( - manager.configure(boost::assign::list_of("--DomainRatio")("/dev/null")("--DomainIndicator")("/dev/null")), + manager.configure({"--DomainRatio","/dev/null","--DomainIndicator","/dev/null"}), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure(boost::assign::list_of("--SparseDomainSubset")("/dev/null")("--SparseDomainRatio")("/dev/null")), + manager.configure({"--SparseDomainSubset","/dev/null","--SparseDomainRatio","/dev/null"}), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure(boost::assign::list_of("--SparseDomainBlah")("/dev/null")), + manager.configure({"--SparseDomainBlah","/dev/null"}), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure(boost::assign::list_of("--DomainSubset")), + manager.configure({"--DomainSubset"}), ScoreFeatureArgumentException); } @@ -84,16 +84,16 @@ static void checkDomainConfigured( BOOST_AUTO_TEST_CASE(manager_config_domain) { checkDomainConfigured - (boost::assign::list_of ("--DomainRatio")("/dev/null")); + ({"--DomainRatio","/dev/null"}); checkDomainConfigured - (boost::assign::list_of("--DomainIndicator")("/dev/null")); + ({"--DomainIndicator","/dev/null"}); checkDomainConfigured - (boost::assign::list_of("--DomainSubset")("/dev/null")); + ({"--DomainSubset","/dev/null"}); checkDomainConfigured - (boost::assign::list_of("--SparseDomainRatio")("/dev/null")); + ({"--SparseDomainRatio","/dev/null"}); checkDomainConfigured - (boost::assign::list_of("--SparseDomainIndicator")("/dev/null")); + ({"--SparseDomainIndicator","/dev/null"}); checkDomainConfigured - (boost::assign::list_of("--SparseDomainSubset")("/dev/null")); + ({"--SparseDomainSubset","/dev/null"}); } From 2a798c0b9f19e44c1a63c7c75f657ae15968c8d0 Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Wed, 17 Jun 2015 00:00:42 +0100 Subject: [PATCH 072/108] daily automatic beautifier --- phrase-extract/ScoreFeatureTest.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/phrase-extract/ScoreFeatureTest.cpp b/phrase-extract/ScoreFeatureTest.cpp index 2863122dd..51d4e1297 100644 --- a/phrase-extract/ScoreFeatureTest.cpp +++ b/phrase-extract/ScoreFeatureTest.cpp @@ -53,16 +53,16 @@ BOOST_AUTO_TEST_CASE(manager_configure_domain_except) //Check that configure rejects illegal domain arg combinations ScoreFeatureManager manager; BOOST_CHECK_THROW( - 
manager.configure({"--DomainRatio","/dev/null","--DomainIndicator","/dev/null"}), + manager.configure( {"--DomainRatio","/dev/null","--DomainIndicator","/dev/null"}), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure({"--SparseDomainSubset","/dev/null","--SparseDomainRatio","/dev/null"}), + manager.configure( {"--SparseDomainSubset","/dev/null","--SparseDomainRatio","/dev/null"}), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure({"--SparseDomainBlah","/dev/null"}), + manager.configure( {"--SparseDomainBlah","/dev/null"}), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure({"--DomainSubset"}), + manager.configure( {"--DomainSubset"}), ScoreFeatureArgumentException); } @@ -84,16 +84,16 @@ static void checkDomainConfigured( BOOST_AUTO_TEST_CASE(manager_config_domain) { checkDomainConfigured - ({"--DomainRatio","/dev/null"}); + ( {"--DomainRatio","/dev/null"}); checkDomainConfigured - ({"--DomainIndicator","/dev/null"}); + ( {"--DomainIndicator","/dev/null"}); checkDomainConfigured - ({"--DomainSubset","/dev/null"}); + ( {"--DomainSubset","/dev/null"}); checkDomainConfigured - ({"--SparseDomainRatio","/dev/null"}); + ( {"--SparseDomainRatio","/dev/null"}); checkDomainConfigured - ({"--SparseDomainIndicator","/dev/null"}); + ( {"--SparseDomainIndicator","/dev/null"}); checkDomainConfigured - ({"--SparseDomainSubset","/dev/null"}); + ( {"--SparseDomainSubset","/dev/null"}); } From 42c5424c86bc2f7f79b70821169dc24433e04b28 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 17 Jun 2015 10:58:47 +0400 Subject: [PATCH 073/108] 1st casualty of c++11. clang 2.6 (latest c++ compiler on osx) doesn't support list of object init --- phrase-extract/ScoreFeatureTest.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/phrase-extract/ScoreFeatureTest.cpp b/phrase-extract/ScoreFeatureTest.cpp index 51d4e1297..93a452dad 100644 --- a/phrase-extract/ScoreFeatureTest.cpp +++ b/phrase-extract/ScoreFeatureTest.cpp @@ -53,16 +53,16 @@ BOOST_AUTO_TEST_CASE(manager_configure_domain_except) //Check that configure rejects illegal domain arg combinations ScoreFeatureManager manager; BOOST_CHECK_THROW( - manager.configure( {"--DomainRatio","/dev/null","--DomainIndicator","/dev/null"}), + manager.configure(boost::assign::list_of("--DomainRatio")("/dev/null")("--DomainIndicator")("/dev/null")), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure( {"--SparseDomainSubset","/dev/null","--SparseDomainRatio","/dev/null"}), + manager.configure(boost::assign::list_of("--SparseDomainSubset")("/dev/null")("--SparseDomainRatio")("/dev/null")), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure( {"--SparseDomainBlah","/dev/null"}), + manager.configure(boost::assign::list_of("--SparseDomainBlah")("/dev/null")), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure( {"--DomainSubset"}), + manager.configure(boost::assign::list_of("--DomainSubset")), ScoreFeatureArgumentException); } @@ -84,16 +84,16 @@ static void checkDomainConfigured( BOOST_AUTO_TEST_CASE(manager_config_domain) { checkDomainConfigured - ( {"--DomainRatio","/dev/null"}); + (boost::assign::list_of ("--DomainRatio")("/dev/null")); checkDomainConfigured - ( {"--DomainIndicator","/dev/null"}); + (boost::assign::list_of("--DomainIndicator")("/dev/null")); checkDomainConfigured - ( {"--DomainSubset","/dev/null"}); + (boost::assign::list_of("--DomainSubset")("/dev/null")); checkDomainConfigured - ( 
{"--SparseDomainRatio","/dev/null"}); + (boost::assign::list_of("--SparseDomainRatio")("/dev/null")); checkDomainConfigured - ( {"--SparseDomainIndicator","/dev/null"}); + (boost::assign::list_of("--SparseDomainIndicator")("/dev/null")); checkDomainConfigured - ( {"--SparseDomainSubset","/dev/null"}); + (boost::assign::list_of("--SparseDomainSubset")("/dev/null")); } From 80f0f71d03b0348649835e674692938dc6862840 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 17 Jun 2015 11:25:27 +0400 Subject: [PATCH 074/108] Revert "1st casualty of c++11. clang 2.6 (latest c++ compiler on osx) doesn't support list of object init" This reverts commit 42c5424c86bc2f7f79b70821169dc24433e04b28. --- phrase-extract/ScoreFeatureTest.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/phrase-extract/ScoreFeatureTest.cpp b/phrase-extract/ScoreFeatureTest.cpp index 93a452dad..51d4e1297 100644 --- a/phrase-extract/ScoreFeatureTest.cpp +++ b/phrase-extract/ScoreFeatureTest.cpp @@ -53,16 +53,16 @@ BOOST_AUTO_TEST_CASE(manager_configure_domain_except) //Check that configure rejects illegal domain arg combinations ScoreFeatureManager manager; BOOST_CHECK_THROW( - manager.configure(boost::assign::list_of("--DomainRatio")("/dev/null")("--DomainIndicator")("/dev/null")), + manager.configure( {"--DomainRatio","/dev/null","--DomainIndicator","/dev/null"}), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure(boost::assign::list_of("--SparseDomainSubset")("/dev/null")("--SparseDomainRatio")("/dev/null")), + manager.configure( {"--SparseDomainSubset","/dev/null","--SparseDomainRatio","/dev/null"}), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure(boost::assign::list_of("--SparseDomainBlah")("/dev/null")), + manager.configure( {"--SparseDomainBlah","/dev/null"}), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure(boost::assign::list_of("--DomainSubset")), + manager.configure( {"--DomainSubset"}), ScoreFeatureArgumentException); } @@ -84,16 +84,16 @@ static void checkDomainConfigured( BOOST_AUTO_TEST_CASE(manager_config_domain) { checkDomainConfigured - (boost::assign::list_of ("--DomainRatio")("/dev/null")); + ( {"--DomainRatio","/dev/null"}); checkDomainConfigured - (boost::assign::list_of("--DomainIndicator")("/dev/null")); + ( {"--DomainIndicator","/dev/null"}); checkDomainConfigured - (boost::assign::list_of("--DomainSubset")("/dev/null")); + ( {"--DomainSubset","/dev/null"}); checkDomainConfigured - (boost::assign::list_of("--SparseDomainRatio")("/dev/null")); + ( {"--SparseDomainRatio","/dev/null"}); checkDomainConfigured - (boost::assign::list_of("--SparseDomainIndicator")("/dev/null")); + ( {"--SparseDomainIndicator","/dev/null"}); checkDomainConfigured - (boost::assign::list_of("--SparseDomainSubset")("/dev/null")); + ( {"--SparseDomainSubset","/dev/null"}); } From 127b860c6a7b54daa9b8808006835410510241aa Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 17 Jun 2015 11:27:50 +0400 Subject: [PATCH 075/108] false alarm. clang does support object list init. Needed to enable c++11 for all toolsets --- Jamroot | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jamroot b/Jamroot index a4957dfa2..4f76ec3ba 100644 --- a/Jamroot +++ b/Jamroot @@ -108,7 +108,7 @@ external-lib z ; #lib dl : : static:static shared:shared ; #requirements += dl ; -requirements += gcc:-std=c++0x ; +requirements += -std=c++0x ; if ! 
[ option.get "without-tcmalloc" : : "yes" ] && [ test_library "tcmalloc_minimal" ] { if [ option.get "full-tcmalloc" : : "yes" ] { From 7031992caa2bd850d2442ae99b697f01194046db Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 17 Jun 2015 11:42:46 +0400 Subject: [PATCH 076/108] use c++11 unordered set code --- phrase-extract/ScoreFeatureTest.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/phrase-extract/ScoreFeatureTest.cpp b/phrase-extract/ScoreFeatureTest.cpp index 51d4e1297..9497414be 100644 --- a/phrase-extract/ScoreFeatureTest.cpp +++ b/phrase-extract/ScoreFeatureTest.cpp @@ -25,7 +25,7 @@ #include #include -#include +#include using namespace MosesTraining; using namespace std; @@ -95,5 +95,15 @@ BOOST_AUTO_TEST_CASE(manager_config_domain) ( {"--SparseDomainIndicator","/dev/null"}); checkDomainConfigured ( {"--SparseDomainSubset","/dev/null"}); + + unordered_set s; + s.insert(4); + s.insert(7); + s.insert(4); + s.insert(1); + + for (auto i: s) { + cerr << i << " "; + } } From 425118aa5d794a43a1aff6e692c4e90c7e0f800e Mon Sep 17 00:00:00 2001 From: Barry Haddow Date: Wed, 17 Jun 2015 09:32:29 +0100 Subject: [PATCH 077/108] bugfixes - working directory --- scripts/training/train-neurallm.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/training/train-neurallm.py b/scripts/training/train-neurallm.py index 4f0e8bdaf..00da64986 100755 --- a/scripts/training/train-neurallm.py +++ b/scripts/training/train-neurallm.py @@ -187,12 +187,14 @@ def main(options): ret = subprocess.call(extraction_cmd) if ret: raise Exception("preparing neural LM failed") + options.validation_file = os.path.join( + options.working_dir, os.path.basename(options.validation_corpus)) else: options.validation_file = None - options.input_words_file = options.words_file - options.output_words_file = options.words_file + options.input_words_file = os.path.join(options.working_dir, options.words_file) + options.output_words_file = os.path.join(options.working_dir, options.words_file) options.input_vocab_size = options.vocab_size options.output_vocab_size = options.vocab_size From f29f67710e980db7f965b9b2e849b7c14dcf338d Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Thu, 18 Jun 2015 00:00:39 +0100 Subject: [PATCH 078/108] daily automatic beautifier --- phrase-extract/ScoreFeatureTest.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/phrase-extract/ScoreFeatureTest.cpp b/phrase-extract/ScoreFeatureTest.cpp index 9497414be..cc22f8630 100644 --- a/phrase-extract/ScoreFeatureTest.cpp +++ b/phrase-extract/ScoreFeatureTest.cpp @@ -95,15 +95,15 @@ BOOST_AUTO_TEST_CASE(manager_config_domain) ( {"--SparseDomainIndicator","/dev/null"}); checkDomainConfigured ( {"--SparseDomainSubset","/dev/null"}); - + unordered_set s; s.insert(4); s.insert(7); s.insert(4); s.insert(1); - - for (auto i: s) { - cerr << i << " "; + +for (auto i: s) { + cerr << i << " "; } } From 90470e878d7ee150baafbb718ee6a402f641c9a5 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Fri, 19 Jun 2015 15:58:14 +0100 Subject: [PATCH 079/108] Fix some C++11-related compilation errors (clang) --- biconcor/Vocabulary.cpp | 4 ++-- moses/TranslationModel/RuleTable/LoaderFactory.cpp | 3 +-- phrase-extract/extract-mixed-syntax/Main.cpp | 7 ++----- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/biconcor/Vocabulary.cpp b/biconcor/Vocabulary.cpp index f0f07c97d..3879b451d 100644 --- a/biconcor/Vocabulary.cpp +++ b/biconcor/Vocabulary.cpp @@ -62,7 +62,7 @@ void 
Vocabulary::Save(const string& fileName ) const vcbFile.open( fileName.c_str(), ios::out | ios::ate | ios::trunc); if (!vcbFile) { - cerr << "Failed to open " << vcbFile << endl; + cerr << "Failed to open " << fileName << endl; exit(1); } @@ -81,7 +81,7 @@ void Vocabulary::Load(const string& fileName ) vcbFile.open(fileName.c_str()); if (!vcbFile) { - cerr << "no such file or directory: " << vcbFile << endl; + cerr << "no such file or directory: " << fileName << endl; exit(1); } diff --git a/moses/TranslationModel/RuleTable/LoaderFactory.cpp b/moses/TranslationModel/RuleTable/LoaderFactory.cpp index 66a39e3bd..5569f952c 100644 --- a/moses/TranslationModel/RuleTable/LoaderFactory.cpp +++ b/moses/TranslationModel/RuleTable/LoaderFactory.cpp @@ -40,9 +40,8 @@ std::auto_ptr RuleTableLoaderFactory::Create( { InputFileStream input(path); std::string line; - bool cont = std::getline(input, line); - if (cont) { + if (std::getline(input, line)) { std::vector tokens; Tokenize(tokens, line); if (tokens.size() == 1) { diff --git a/phrase-extract/extract-mixed-syntax/Main.cpp b/phrase-extract/extract-mixed-syntax/Main.cpp index 5d1b3e7f5..f011e6e8d 100644 --- a/phrase-extract/extract-mixed-syntax/Main.cpp +++ b/phrase-extract/extract-mixed-syntax/Main.cpp @@ -148,13 +148,10 @@ int main(int argc, char** argv) cerr << lineNum << " "; } - bool success; - success = getline(strmSource, lineSource); - if (!success) { + if (!getline(strmSource, lineSource)) { throw "Couldn't read source"; } - success = getline(strmAlignment, lineAlignment); - if (!success) { + if (!getline(strmAlignment, lineAlignment)) { throw "Couldn't read alignment"; } From 1bd10e104ce5a8e51e7336ad5bbf1c61b56a0883 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Sun, 21 Jun 2015 18:27:56 +0200 Subject: [PATCH 080/108] workaround/cleaning for weird copy-constructor behaviour with C++11 --- .../CompactPT/BlockHashIndex.cpp | 2 +- .../LexicalReorderingTableCreator.cpp | 9 +++-- .../CompactPT/MmapAllocator.h | 12 ++++--- .../CompactPT/PhraseTableCreator.cpp | 6 ++-- .../TranslationModel/CompactPT/StringVector.h | 35 +++++++++---------- 5 files changed, 33 insertions(+), 31 deletions(-) diff --git a/moses/TranslationModel/CompactPT/BlockHashIndex.cpp b/moses/TranslationModel/CompactPT/BlockHashIndex.cpp index c90dcd6d9..27209f5bc 100644 --- a/moses/TranslationModel/CompactPT/BlockHashIndex.cpp +++ b/moses/TranslationModel/CompactPT/BlockHashIndex.cpp @@ -34,7 +34,7 @@ namespace Moses BlockHashIndex::BlockHashIndex(size_t orderBits, size_t fingerPrintBits, size_t threadsNum) : m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits), - m_fileHandle(0), m_fileHandleStart(0), m_size(0), + m_fileHandle(0), m_fileHandleStart(0), m_landmarks(true), m_size(0), m_lastSaved(-1), m_lastDropped(-1), m_numLoadedRanges(0), m_threadPool(threadsNum) { diff --git a/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp b/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp index 9fe9eec30..8e9f4fa0a 100644 --- a/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp +++ b/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp @@ -52,13 +52,12 @@ LexicalReorderingTableCreator::LexicalReorderingTableCreator( std::cerr << "Pass 1/2: Creating phrase index + Counting scores" << std::endl; m_hash.BeginSave(m_outFile); - - + if(tempfilePath.size()) { MmapAllocator allocEncoded(util::FMakeTemp(tempfilePath)); m_encodedScores = new StringVector(allocEncoded); } else { - m_encodedScores = new 
StringVector(); + m_encodedScores = new StringVector(true); } EncodeScores(); @@ -68,12 +67,12 @@ LexicalReorderingTableCreator::LexicalReorderingTableCreator( std::cerr << "Pass 2/2: Compressing scores" << std::endl; - + if(tempfilePath.size()) { MmapAllocator allocCompressed(util::FMakeTemp(tempfilePath)); m_compressedScores = new StringVector(allocCompressed); } else { - m_compressedScores = new StringVector(); + m_compressedScores = new StringVector(true); } CompressScores(); diff --git a/moses/TranslationModel/CompactPT/MmapAllocator.h b/moses/TranslationModel/CompactPT/MmapAllocator.h index 78084f883..0e04890bd 100644 --- a/moses/TranslationModel/CompactPT/MmapAllocator.h +++ b/moses/TranslationModel/CompactPT/MmapAllocator.h @@ -62,6 +62,9 @@ public: typedef std::size_t size_type; typedef std::ptrdiff_t difference_type; + MmapAllocator(MmapAllocator &&) = delete; + MmapAllocator(const MmapAllocator &&) = delete; + MmapAllocator() throw() : m_file_ptr(std::tmpfile()), m_file_desc(fileno(m_file_ptr)), m_page_size(util::SizePage()), m_map_size(0), m_data_ptr(0), @@ -151,11 +154,12 @@ public: if(!m_fixed) { util::UnmapOrThrow(p, num * sizeof(T)); } else { - size_t map_offset = (m_data_offset / m_page_size) * m_page_size; - size_t relative_offset = m_data_offset - map_offset; - util::UnmapOrThrow((pointer)((char*)p - relative_offset), num * sizeof(T)); + const size_t map_offset = (m_data_offset / m_page_size) * m_page_size; + const size_t relative_offset = m_data_offset - map_offset; + const size_t adjusted_map_size = m_map_size + relative_offset; + + util::UnmapOrThrow((pointer)((char*)p - relative_offset), adjusted_map_size); } - } void construct (pointer p, const T& value) { diff --git a/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp b/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp index ba1dfc578..d590ef9b3 100644 --- a/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp +++ b/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp @@ -130,7 +130,7 @@ PhraseTableCreator::PhraseTableCreator(std::string inPath, MmapAllocator allocCompressed(util::FMakeTemp(tempfilePath)); m_compressedTargetPhrases = new StringVector(allocCompressed); } else { - m_compressedTargetPhrases = new StringVector(); + m_compressedTargetPhrases = new StringVector(true); } CompressTargetPhrases(); @@ -203,7 +203,7 @@ void PhraseTableCreator::Save() = m_sourceSymbolsMap.begin(); it != m_sourceSymbolsMap.end(); it++) temp1[it->second] = it->first; std::sort(temp1.begin(), temp1.end()); - StringVector sourceSymbols; + StringVector sourceSymbols(true); for(std::vector::iterator it = temp1.begin(); it != temp1.end(); it++) sourceSymbols.push_back(*it); @@ -224,7 +224,7 @@ void PhraseTableCreator::Save() for(boost::unordered_map::iterator it = m_targetSymbolsMap.begin(); it != m_targetSymbolsMap.end(); it++) temp2[it->second] = it->first; - StringVector targetSymbols; + StringVector targetSymbols(true); for(std::vector::iterator it = temp2.begin(); it != temp2.end(); it++) targetSymbols.push_back(*it); diff --git a/moses/TranslationModel/CompactPT/StringVector.h b/moses/TranslationModel/CompactPT/StringVector.h index bb2bc11ef..3af970c41 100644 --- a/moses/TranslationModel/CompactPT/StringVector.h +++ b/moses/TranslationModel/CompactPT/StringVector.h @@ -147,8 +147,8 @@ public: typedef RangeIterator iterator; typedef StringIterator string_iterator; - StringVector(); - StringVector(Allocator alloc); + StringVector(bool allocate = false); + StringVector(Allocator& alloc); virtual ~StringVector() { 
delete m_charArray; @@ -203,13 +203,13 @@ public: m_memoryMapped = memoryMapped; size += std::fread(&m_sorted, sizeof(bool), 1, in) * sizeof(bool); - size += m_positions.load(in, m_memoryMapped); + size += m_positions.load(in, false); - size += loadCharArray(*m_charArray, in, m_memoryMapped); + size += loadCharArray(m_charArray, in, m_memoryMapped); return size; } - size_t loadCharArray(std::vector >& c, + size_t loadCharArray(std::vector >*& c, std::FILE* in, bool map = false) { // Can only be read into memory. Mapping not possible with std:allocator. assert(map == false); @@ -219,13 +219,13 @@ public: size_t valSize; byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t); - c.resize(valSize, 0); - byteSize += std::fread(&c[0], sizeof(ValueT), valSize, in) * sizeof(ValueT); + c = new std::vector >(valSize, 0); + byteSize += std::fread(&(*c)[0], sizeof(ValueT), valSize, in) * sizeof(ValueT); return byteSize; } - size_t loadCharArray(std::vector >& c, + size_t loadCharArray(std::vector >*& c, std::FILE* in, bool map = false) { size_t byteSize = 0; @@ -235,19 +235,17 @@ public: if(map == false) { // Read data into temporary file (default constructor of MmapAllocator) // and map memory onto temporary file. Can be resized. - - c.resize(valSize, 0); - byteSize += std::fread(&c[0], sizeof(ValueT), valSize, in) * sizeof(ValueT); + c = new std::vector >(valSize, 0); + byteSize += std::fread(&(*c)[0], sizeof(ValueT), valSize, in) * sizeof(ValueT); } else { // Map it directly on specified region of file "in" starting at valPos // with length valSize * sizeof(ValueT). Mapped region cannot be resized. size_t valPos = std::ftell(in); Allocator alloc(in, valPos); - std::vector > charArrayTemp(alloc); - charArrayTemp.resize(valSize, 0); - c.swap(charArrayTemp); - + c = new std::vector >(alloc); + c->resize(valSize, 0); + byteSize += valSize * sizeof(ValueT); } @@ -369,11 +367,12 @@ OStream& operator<<(OStream &os, ValueIteratorRange cr) // StringVector template class Allocator> -StringVector::StringVector() - : m_sorted(true), m_memoryMapped(false), m_charArray(new std::vector >()) { } +StringVector::StringVector(bool allocate) + : m_sorted(true), m_memoryMapped(false), + m_charArray(allocate ? 
new std::vector >() : 0) { } template class Allocator> -StringVector::StringVector(Allocator alloc) +StringVector::StringVector(Allocator &alloc) : m_sorted(true), m_memoryMapped(false), m_charArray(new std::vector >(alloc)) { } template class Allocator> From 0f943dd9c10acf4ac0cae5b642175d763594e4b1 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sun, 21 Jun 2015 21:16:12 +0400 Subject: [PATCH 081/108] clang compile errors --- contrib/other-builds/all.workspace | 4 ++-- contrib/other-builds/moses/moses.project | 2 +- .../CompactPT/LexicalReorderingTableCompact.cpp | 4 ++-- moses/TranslationModel/CompactPT/PhraseDecoder.cpp | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/contrib/other-builds/all.workspace b/contrib/other-builds/all.workspace index 66dafe3d2..621bafdc2 100644 --- a/contrib/other-builds/all.workspace +++ b/contrib/other-builds/all.workspace @@ -6,10 +6,10 @@ - + - + diff --git a/contrib/other-builds/moses/moses.project b/contrib/other-builds/moses/moses.project index 66e0b9bad..81072d667 100644 --- a/contrib/other-builds/moses/moses.project +++ b/contrib/other-builds/moses/moses.project @@ -814,7 +814,7 @@ - + diff --git a/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp b/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp index fe475507c..cd71b1776 100644 --- a/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp +++ b/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp @@ -78,9 +78,9 @@ GetScore(const Phrase& f, const Phrase& e, const Phrase& c) if(m_hash.GetSize() != index) { std::string scoresString; if(m_inMemory) - scoresString = m_scoresMemory[index]; + scoresString = m_scoresMemory[index].str(); else - scoresString = m_scoresMapped[index]; + scoresString = m_scoresMapped[index].str(); BitWrapper<> bitStream(scoresString); for(size_t i = 0; i < m_numScoreComponent; i++) diff --git a/moses/TranslationModel/CompactPT/PhraseDecoder.cpp b/moses/TranslationModel/CompactPT/PhraseDecoder.cpp index 3cf2f010e..54e6815a1 100644 --- a/moses/TranslationModel/CompactPT/PhraseDecoder.cpp +++ b/moses/TranslationModel/CompactPT/PhraseDecoder.cpp @@ -224,9 +224,9 @@ TargetPhraseVectorPtr PhraseDecoder::CreateTargetPhraseCollection(const Phrase & // Retrieve compressed and encoded target phrase collection std::string encodedPhraseCollection; if(m_phraseDictionary.m_inMemory) - encodedPhraseCollection = m_phraseDictionary.m_targetPhrasesMemory[sourcePhraseId]; + encodedPhraseCollection = m_phraseDictionary.m_targetPhrasesMemory[sourcePhraseId].str(); else - encodedPhraseCollection = m_phraseDictionary.m_targetPhrasesMapped[sourcePhraseId]; + encodedPhraseCollection = m_phraseDictionary.m_targetPhrasesMapped[sourcePhraseId].str(); BitWrapper<> encodedBitStream(encodedPhraseCollection); if(m_coding == PREnc && bitsLeft) From 6151003c1362f7ba12e769c3dd69bf21992ac48e Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Sun, 21 Jun 2015 19:24:43 +0200 Subject: [PATCH 082/108] Remove C++11 oddities --- moses/TranslationModel/CompactPT/MmapAllocator.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/moses/TranslationModel/CompactPT/MmapAllocator.h b/moses/TranslationModel/CompactPT/MmapAllocator.h index 0e04890bd..1d0d06f77 100644 --- a/moses/TranslationModel/CompactPT/MmapAllocator.h +++ b/moses/TranslationModel/CompactPT/MmapAllocator.h @@ -61,9 +61,6 @@ public: typedef const T& const_reference; typedef std::size_t size_type; typedef std::ptrdiff_t difference_type; - - MmapAllocator(MmapAllocator 
&&) = delete; - MmapAllocator(const MmapAllocator &&) = delete; MmapAllocator() throw() : m_file_ptr(std::tmpfile()), m_file_desc(fileno(m_file_ptr)), From e57ca5ec34c8723a73122b3e0963a1e8ff719a45 Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Mon, 22 Jun 2015 00:00:43 +0100 Subject: [PATCH 083/108] daily automatic beautifier --- .../CompactPT/LexicalReorderingTableCreator.cpp | 4 ++-- moses/TranslationModel/CompactPT/MmapAllocator.h | 4 ++-- moses/TranslationModel/CompactPT/StringVector.h | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp b/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp index 8e9f4fa0a..4941d32ec 100644 --- a/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp +++ b/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp @@ -52,7 +52,7 @@ LexicalReorderingTableCreator::LexicalReorderingTableCreator( std::cerr << "Pass 1/2: Creating phrase index + Counting scores" << std::endl; m_hash.BeginSave(m_outFile); - + if(tempfilePath.size()) { MmapAllocator allocEncoded(util::FMakeTemp(tempfilePath)); m_encodedScores = new StringVector(allocEncoded); @@ -67,7 +67,7 @@ LexicalReorderingTableCreator::LexicalReorderingTableCreator( std::cerr << "Pass 2/2: Compressing scores" << std::endl; - + if(tempfilePath.size()) { MmapAllocator allocCompressed(util::FMakeTemp(tempfilePath)); m_compressedScores = new StringVector(allocCompressed); diff --git a/moses/TranslationModel/CompactPT/MmapAllocator.h b/moses/TranslationModel/CompactPT/MmapAllocator.h index 1d0d06f77..72d0c1663 100644 --- a/moses/TranslationModel/CompactPT/MmapAllocator.h +++ b/moses/TranslationModel/CompactPT/MmapAllocator.h @@ -61,7 +61,7 @@ public: typedef const T& const_reference; typedef std::size_t size_type; typedef std::ptrdiff_t difference_type; - + MmapAllocator() throw() : m_file_ptr(std::tmpfile()), m_file_desc(fileno(m_file_ptr)), m_page_size(util::SizePage()), m_map_size(0), m_data_ptr(0), @@ -154,7 +154,7 @@ public: const size_t map_offset = (m_data_offset / m_page_size) * m_page_size; const size_t relative_offset = m_data_offset - map_offset; const size_t adjusted_map_size = m_map_size + relative_offset; - + util::UnmapOrThrow((pointer)((char*)p - relative_offset), adjusted_map_size); } } diff --git a/moses/TranslationModel/CompactPT/StringVector.h b/moses/TranslationModel/CompactPT/StringVector.h index 3af970c41..aaec500f0 100644 --- a/moses/TranslationModel/CompactPT/StringVector.h +++ b/moses/TranslationModel/CompactPT/StringVector.h @@ -235,7 +235,7 @@ public: if(map == false) { // Read data into temporary file (default constructor of MmapAllocator) // and map memory onto temporary file. Can be resized. 
- c = new std::vector >(valSize, 0); + c = new std::vector >(valSize, 0); byteSize += std::fread(&(*c)[0], sizeof(ValueT), valSize, in) * sizeof(ValueT); } else { // Map it directly on specified region of file "in" starting at valPos @@ -245,7 +245,7 @@ public: Allocator alloc(in, valPos); c = new std::vector >(alloc); c->resize(valSize, 0); - + byteSize += valSize * sizeof(ValueT); } From 2a242afa346b70a6c8dc22522349300b6d28e563 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 22 Jun 2015 10:46:12 -0400 Subject: [PATCH 084/108] Didn't need header --- moses/IOWrapper.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/moses/IOWrapper.h b/moses/IOWrapper.h index f1bcefa92..e3057794f 100644 --- a/moses/IOWrapper.h +++ b/moses/IOWrapper.h @@ -61,8 +61,6 @@ POSSIBILITY OF SUCH DAMAGE. #include "moses/ChartKBestExtractor.h" #include "moses/Syntax/KBestExtractor.h" -#include "search/applied.hh" - #include namespace Moses From 0d34023aad0dbf28c28bcc17876b4016b5b1b3ea Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 24 Jun 2015 14:56:37 +0400 Subject: [PATCH 085/108] prune generation table --- misc/Jamfile | 4 ++- misc/pruneGeneration.cpp | 55 ++++++++++++++++++++++++++++++++++++++++ misc/pruneGeneration.h | 45 ++++++++++++++++++++++++++++++++ 3 files changed, 103 insertions(+), 1 deletion(-) create mode 100644 misc/pruneGeneration.cpp create mode 100644 misc/pruneGeneration.h diff --git a/misc/Jamfile b/misc/Jamfile index bfea14d58..46a18e253 100644 --- a/misc/Jamfile +++ b/misc/Jamfile @@ -14,6 +14,8 @@ exe 1-1-Extraction : 1-1-Extraction.cpp ..//boost_filesystem ../moses//moses ; exe prunePhraseTable : prunePhraseTable.cpp ..//boost_filesystem ../moses//moses ..//boost_program_options ; +exe pruneGeneration : pruneGeneration.cpp ..//boost_filesystem ../moses//moses ..//boost_program_options ; + local with-cmph = [ option.get "with-cmph" ] ; if $(with-cmph) { exe processPhraseTableMin : processPhraseTableMin.cpp ..//boost_filesystem ../moses//moses ; @@ -46,6 +48,6 @@ $(TOP)//boost_iostreams $(TOP)//boost_program_options ; -alias programs : 1-1-Extraction TMining generateSequences processLexicalTable queryLexicalTable programsMin programsProbing merge-sorted prunePhraseTable ; +alias programs : 1-1-Extraction TMining generateSequences processLexicalTable queryLexicalTable programsMin programsProbing merge-sorted prunePhraseTable pruneGeneration ; #processPhraseTable queryPhraseTable diff --git a/misc/pruneGeneration.cpp b/misc/pruneGeneration.cpp new file mode 100644 index 000000000..45873a4ac --- /dev/null +++ b/misc/pruneGeneration.cpp @@ -0,0 +1,55 @@ +#include +#include +#include +#include +#include "pruneGeneration.h" + +using namespace std; + +int main(int argc, char **argv) +{ + cerr << "Starting" << endl; + int limit = atoi(argv[1]); + + vector records; + string prevInWord; + string line; + while (getline(cin, line)) { + vector toks; + Tokenize(toks, line); + assert(toks.size() == 4); + + if (prevInWord != toks[0]) { + Output(limit, records); + records.clear(); + } + + // add new record + float prob = atof(toks[2].c_str()); + records.push_back(Rec(prob, line)); + + prevInWord = toks[0]; + } + + // last + Output(limit, records); + records.clear(); + + cerr << "Finished" << endl; +} + +void Output(int limit, vector &records) +{ + Prune(limit, records); + + for (size_t i = 0; i < limit && i < records.size(); ++i) { + const Rec &rec = records[i]; + cout << rec.line << endl; + } +} + +void Prune(int limit, std::vector &records) +{ + std::sort(records.rbegin(), records.rend()); 
+ +} diff --git a/misc/pruneGeneration.h b/misc/pruneGeneration.h new file mode 100644 index 000000000..693c5f149 --- /dev/null +++ b/misc/pruneGeneration.h @@ -0,0 +1,45 @@ +#pragma once +#include +#include + +class Rec +{ +public: + float prob; + std::string line; + + Rec(float aprob, const std::string &aline) + :prob(aprob) + ,line(aline) + {} + + inline bool operator< (const Rec &compare) const { + return prob < compare.prob; + } +}; + +//////////////////////////////////////////////////////////// + +void Output(int limit, std::vector &records); +void Prune(int limit, std::vector &records); + +//////////////////////////////////////////////////////////// +inline void Tokenize(std::vector &output + , const std::string& str + , const std::string& delimiters = " \t") +{ + // Skip delimiters at beginning. + std::string::size_type lastPos = str.find_first_not_of(delimiters, 0); + // Find first "non-delimiter". + std::string::size_type pos = str.find_first_of(delimiters, lastPos); + + while (std::string::npos != pos || std::string::npos != lastPos) { + // Found a token, add it to the vector. + output.push_back(str.substr(lastPos, pos - lastPos)); + // Skip delimiters. Note the "not_of" + lastPos = str.find_first_not_of(delimiters, pos); + // Find next "non-delimiter" + pos = str.find_first_of(delimiters, lastPos); + } +} + From bac5c2e55c1b2454328bf18207b6d9633d2b9adf Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 24 Jun 2015 16:24:12 +0400 Subject: [PATCH 086/108] compile error with gcc --- misc/pruneGeneration.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/misc/pruneGeneration.cpp b/misc/pruneGeneration.cpp index 45873a4ac..275d599df 100644 --- a/misc/pruneGeneration.cpp +++ b/misc/pruneGeneration.cpp @@ -2,6 +2,8 @@ #include #include #include +#include +#include #include "pruneGeneration.h" using namespace std; From 9936c9f264f95c02e47a6e987bea0e2026b78727 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 22 Jun 2015 10:46:12 -0400 Subject: [PATCH 087/108] Didn't need header --- moses/IOWrapper.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/moses/IOWrapper.h b/moses/IOWrapper.h index f1bcefa92..e3057794f 100644 --- a/moses/IOWrapper.h +++ b/moses/IOWrapper.h @@ -61,8 +61,6 @@ POSSIBILITY OF SUCH DAMAGE. #include "moses/ChartKBestExtractor.h" #include "moses/Syntax/KBestExtractor.h" -#include "search/applied.hh" - #include namespace Moses From d928340cd4a0a07fb8058a3a586cba2d4633c416 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Wed, 24 Jun 2015 14:34:27 +0100 Subject: [PATCH 088/108] Added context handling to TranslationRequest for moses server. 
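For illustration, a hedged client-side sketch of passing the new parameter over XML-RPC. Only the "context" key comes from this patch; the method name "translate", the "text" key, the server URL, and the xmlrpc-c clientSimple calls are assumptions about the mosesserver interface rather than part of the change itself.

#include <iostream>
#include <map>
#include <string>
#include <xmlrpc-c/base.hpp>
#include <xmlrpc-c/client_simple.hpp>

int main()
{
  // Build the single struct parameter that the server's parse_request() reads.
  std::map<std::string, xmlrpc_c::value> request;
  request["text"]    = xmlrpc_c::value_string("das ist ein kleines haus"); // assumed key
  // New in this patch: an optional context string for context-sensitive decoding.
  request["context"] = xmlrpc_c::value_string("ein haus steht am fluss .");

  xmlrpc_c::paramList params;
  params.add(xmlrpc_c::value_struct(request));

  xmlrpc_c::clientSimple client;
  xmlrpc_c::value result;
  // URL and method name are assumptions about a locally running mosesserver.
  client.call("http://localhost:8080/RPC2", "translate", params, &result);

  // 'result' now holds the server's reply struct (translation, optional n-best, ...).
  std::cout << "received a reply of XML-RPC type " << result.type() << std::endl;
  return 0;
}
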
--- moses/server/TranslationRequest.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/moses/server/TranslationRequest.cpp b/moses/server/TranslationRequest.cpp index cad3696d1..3848f81ba 100644 --- a/moses/server/TranslationRequest.cpp +++ b/moses/server/TranslationRequest.cpp @@ -1,4 +1,5 @@ #include "TranslationRequest.h" +#include "moses/ContextScope.h" #include namespace MosesServer @@ -30,6 +31,7 @@ create(xmlrpc_c::paramList const& paramList, boost::shared_ptr ret; ret.reset(new TranslationRequest(paramList,cond, mut)); ret->m_self = ret; + ret->m_scope.reset(new Moses::ContextScope); return ret; } @@ -270,7 +272,10 @@ parse_request(std::map const& params) if (si != params.end()) m_nbestSize = xmlrpc_c::value_int(si->second); - + si = params.find("context"); + if (si != params.end()) { + m_context_string = xmlrpc_c::value_string(si->second); + } // // biased sampling for suffix-array-based sampling phrase table? // if ((si = params.find("bias")) != params.end()) // { From 555f91eb7ec79cc69e1b18889fd17217d3425389 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 24 Jun 2015 18:31:05 +0400 Subject: [PATCH 089/108] codelite --- contrib/other-builds/all.workspace | 5 +- .../other-builds/moses-cmd/moses-cmd.project | 14 +-- contrib/other-builds/moses/moses.project | 4 +- .../pruneGeneration/pruneGeneration.project | 97 +++++++++++++++++++ misc/pruneGeneration.cpp | 7 +- misc/pruneGeneration.h | 1 - 6 files changed, 111 insertions(+), 17 deletions(-) create mode 100644 contrib/other-builds/pruneGeneration/pruneGeneration.project diff --git a/contrib/other-builds/all.workspace b/contrib/other-builds/all.workspace index 621bafdc2..5a7eaf114 100644 --- a/contrib/other-builds/all.workspace +++ b/contrib/other-builds/all.workspace @@ -9,7 +9,8 @@ - + + @@ -23,6 +24,7 @@ + @@ -36,6 +38,7 @@ + diff --git a/contrib/other-builds/moses-cmd/moses-cmd.project b/contrib/other-builds/moses-cmd/moses-cmd.project index ac567ffce..44a0d621f 100644 --- a/contrib/other-builds/moses-cmd/moses-cmd.project +++ b/contrib/other-builds/moses-cmd/moses-cmd.project @@ -26,13 +26,6 @@ - - - - - - - @@ -150,4 +143,11 @@
+ + + + + + + diff --git a/contrib/other-builds/moses/moses.project b/contrib/other-builds/moses/moses.project index 81072d667..0fbd942c6 100644 --- a/contrib/other-builds/moses/moses.project +++ b/contrib/other-builds/moses/moses.project @@ -793,8 +793,6 @@ - - @@ -897,4 +895,6 @@
+ + diff --git a/contrib/other-builds/pruneGeneration/pruneGeneration.project b/contrib/other-builds/pruneGeneration/pruneGeneration.project new file mode 100644 index 000000000..7060d55ea --- /dev/null +++ b/contrib/other-builds/pruneGeneration/pruneGeneration.project @@ -0,0 +1,97 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + None + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + None + + + + + + + + + + + + + + + + + + + diff --git a/misc/pruneGeneration.cpp b/misc/pruneGeneration.cpp index 275d599df..19ae2184f 100644 --- a/misc/pruneGeneration.cpp +++ b/misc/pruneGeneration.cpp @@ -42,7 +42,7 @@ int main(int argc, char **argv) void Output(int limit, vector &records) { - Prune(limit, records); + std::sort(records.rbegin(), records.rend()); for (size_t i = 0; i < limit && i < records.size(); ++i) { const Rec &rec = records[i]; @@ -50,8 +50,3 @@ void Output(int limit, vector &records) } } -void Prune(int limit, std::vector &records) -{ - std::sort(records.rbegin(), records.rend()); - -} diff --git a/misc/pruneGeneration.h b/misc/pruneGeneration.h index 693c5f149..470e607d4 100644 --- a/misc/pruneGeneration.h +++ b/misc/pruneGeneration.h @@ -21,7 +21,6 @@ public: //////////////////////////////////////////////////////////// void Output(int limit, std::vector &records); -void Prune(int limit, std::vector &records); //////////////////////////////////////////////////////////// inline void Tokenize(std::vector &output From dce0f33270bd6e169850a9337141c5af39f3f765 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 24 Jun 2015 18:35:59 +0400 Subject: [PATCH 090/108] prune generation table in ems --- scripts/ems/experiment.meta | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index 9edeec460..4177f967e 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -739,6 +739,14 @@ build-generation-custom ignore-unless: AND generation-factors generation-corpus default-name: model/generation-table final-model: yes +generation-prune + in: generation-table + out: generation-table-pruned + rerun-on-change: TRAINING:prune-generation + pass-unless: AND TRAINING:prune-generation + default-name: model/generation-table-pruned + final-model: yes + template: $TRAINING:prune-generation < IN > OUT build-sparse in: corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus out: sparse @@ -747,7 +755,7 @@ build-sparse default-name: model/sparse-features template: $moses-script-dir/ems/support/build-sparse-features.perl IN $input-extension $output-extension OUT "$sparse-features" create-config - in: sigtest-filter-reordering-table sigtest-filter-phrase-translation-table transliteration-table generation-table sparse corpus-mml-prefilter=OR=corpus-mml-postfilter=OR=domains osm-model INTERPOLATED-LM:binlm LM:binlm + in: sigtest-filter-reordering-table sigtest-filter-phrase-translation-table transliteration-table generation-table-pruned sparse corpus-mml-prefilter=OR=corpus-mml-postfilter=OR=domains osm-model INTERPOLATED-LM:binlm LM:binlm out: config ignore-if: use-hiero thot rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors lexicalized-reordering training-options script decoding-graph-backoff score-settings additional-ini mmsapt no-glue-grammar dont-tune-glue-grammar use-syntax-input-weight-feature From 78b2810cfe52d0a7246c4c376e32e4f1bc321577 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Wed, 
24 Jun 2015 18:09:22 +0100 Subject: [PATCH 091/108] Allow context server to use ports other than 80. --- .../TranslationModel/UG/mm/ug_http_client.cc | 34 +++++++++++++------ moses/TranslationModel/UG/mm/ug_http_client.h | 10 ++++-- 2 files changed, 31 insertions(+), 13 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_http_client.cc b/moses/TranslationModel/UG/mm/ug_http_client.cc index 1d6d70edb..1bbb93b23 100644 --- a/moses/TranslationModel/UG/mm/ug_http_client.cc +++ b/moses/TranslationModel/UG/mm/ug_http_client.cc @@ -7,28 +7,40 @@ std::string http_client::content() const { return m_content.str(); } http_client:: http_client(boost::asio::io_service& io_service, - const std::string& server, const std::string& path) + std::string const& server, + std::string const& port, + std::string const& path) : resolver_(io_service), socket_(io_service) { - init(server,path); + init(server, port, path); } - + http_client:: http_client(boost::asio::io_service& io_service, std::string url) : resolver_(io_service), socket_(io_service) { - size_t p = url.find("://"); - if (p < url.size()) url.erase(0,p+3); - p = url.find("/"); + std::string server; + std::string path = "/"; + std::string port = "http"; + size_t p = url.find("://"), q; if (p < url.size()) - init(url.substr(0,p),url.substr(p)); - else - init(url,"/"); + { + port = url.substr(0,p); + url.erase(0, p+3); + } + p = std::min(url.find_first_of(":/"), url.size()); + q = std::min(url.find("/"), url.size()); + if (p < url.size() && url[p] == ':') + port = url.substr(p,q-p); + server = url.substr(0,p); + if (q < url.size()) + path = url.substr(q); + init(server, port, path); } void http_client:: -init(std::string const& server, std::string const& path) +init(std::string const& server, std::string const& port, std::string const& path) { // Form the request. We specify the "Connection: close" header so // that the server will close the socket after transmitting the @@ -43,7 +55,7 @@ init(std::string const& server, std::string const& path) // Start an asynchronous resolve to translate the server and service names // into a list of endpoints. 
- tcp::resolver::query query(server, "http"); + tcp::resolver::query query(server, port); resolver_.async_resolve(query, boost::bind(&http_client::handle_resolve, this, boost::asio::placeholders::error, diff --git a/moses/TranslationModel/UG/mm/ug_http_client.h b/moses/TranslationModel/UG/mm/ug_http_client.h index 53ee258f9..825c0c37e 100644 --- a/moses/TranslationModel/UG/mm/ug_http_client.h +++ b/moses/TranslationModel/UG/mm/ug_http_client.h @@ -35,9 +35,15 @@ class http_client public: http_client(boost::asio::io_service& io_service, std::string url); http_client(boost::asio::io_service& io_service, - const std::string& server, const std::string& path); + std::string const& server, + std::string const& port, + std::string const& path); private: - void init(std::string const& server, std::string const& path); + + void init(std::string const& server, + std::string const& port, + std::string const& path); + void handle_resolve(const boost::system::error_code& err, tcp::resolver::iterator endpoint_iterator); void handle_connect(const boost::system::error_code& err, From 4ec69fbfdff104218db16c9c1ba8c8c381c331c3 Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Thu, 25 Jun 2015 00:00:42 +0100 Subject: [PATCH 092/108] daily automatic beautifier --- misc/pruneGeneration.cpp | 10 +++++----- misc/pruneGeneration.h | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/misc/pruneGeneration.cpp b/misc/pruneGeneration.cpp index 19ae2184f..8207e287f 100644 --- a/misc/pruneGeneration.cpp +++ b/misc/pruneGeneration.cpp @@ -12,7 +12,7 @@ int main(int argc, char **argv) { cerr << "Starting" << endl; int limit = atoi(argv[1]); - + vector records; string prevInWord; string line; @@ -20,12 +20,12 @@ int main(int argc, char **argv) vector toks; Tokenize(toks, line); assert(toks.size() == 4); - + if (prevInWord != toks[0]) { Output(limit, records); records.clear(); } - + // add new record float prob = atof(toks[2].c_str()); records.push_back(Rec(prob, line)); @@ -37,13 +37,13 @@ int main(int argc, char **argv) Output(limit, records); records.clear(); - cerr << "Finished" << endl; + cerr << "Finished" << endl; } void Output(int limit, vector &records) { std::sort(records.rbegin(), records.rend()); - + for (size_t i = 0; i < limit && i < records.size(); ++i) { const Rec &rec = records[i]; cout << rec.line << endl; diff --git a/misc/pruneGeneration.h b/misc/pruneGeneration.h index 470e607d4..dae5958f8 100644 --- a/misc/pruneGeneration.h +++ b/misc/pruneGeneration.h @@ -7,12 +7,12 @@ class Rec public: float prob; std::string line; - + Rec(float aprob, const std::string &aline) - :prob(aprob) - ,line(aline) + :prob(aprob) + ,line(aline) {} - + inline bool operator< (const Rec &compare) const { return prob < compare.prob; } From c80df1212ede1c8db39fbd5fe21f11d8f2ea60f7 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 25 Jun 2015 10:48:35 +0400 Subject: [PATCH 093/108] prune multiple files at once. 
Make up for failure in ems to give the full path of the gen table --- .../other-builds/OnDiskPt/OnDiskPt.project | 31 ++++++++++---- .../extract-mixed-syntax.project | 40 ++++++++++++++----- contrib/other-builds/extract/extract.project | 31 ++++++++++---- contrib/other-builds/lm/lm.project | 31 ++++++++++---- .../other-builds/moses-cmd/moses-cmd.project | 32 +++++++-------- .../pruneGeneration/pruneGeneration.project | 9 +++-- contrib/other-builds/score/score.project | 30 +++++++------- contrib/other-builds/search/search.project | 14 +++---- contrib/other-builds/util/util.project | 12 +++--- misc/pruneGeneration.cpp | 18 ++++++--- misc/pruneGeneration.h | 4 +- 11 files changed, 165 insertions(+), 87 deletions(-) diff --git a/contrib/other-builds/OnDiskPt/OnDiskPt.project b/contrib/other-builds/OnDiskPt/OnDiskPt.project index 06f80d233..3a89ec832 100644 --- a/contrib/other-builds/OnDiskPt/OnDiskPt.project +++ b/contrib/other-builds/OnDiskPt/OnDiskPt.project @@ -1,5 +1,22 @@ + + + + + + + + @@ -27,6 +44,8 @@ + + @@ -40,9 +59,9 @@ - - - + + + @@ -72,7 +91,7 @@ - + @@ -110,7 +129,7 @@ - + @@ -118,6 +137,4 @@ - - diff --git a/contrib/other-builds/extract-mixed-syntax/extract-mixed-syntax.project b/contrib/other-builds/extract-mixed-syntax/extract-mixed-syntax.project index 83c652f8c..87d76689a 100644 --- a/contrib/other-builds/extract-mixed-syntax/extract-mixed-syntax.project +++ b/contrib/other-builds/extract-mixed-syntax/extract-mixed-syntax.project @@ -1,5 +1,22 @@ + + + + + + + + @@ -43,6 +60,10 @@ + + + + @@ -56,13 +77,14 @@ - - - + + + - - + + + @@ -94,7 +116,7 @@ - + @@ -133,7 +155,7 @@ - + @@ -141,8 +163,4 @@ - - - - diff --git a/contrib/other-builds/extract/extract.project b/contrib/other-builds/extract/extract.project index ac74607f2..d86e89035 100644 --- a/contrib/other-builds/extract/extract.project +++ b/contrib/other-builds/extract/extract.project @@ -1,5 +1,22 @@ + + + + + + + + @@ -13,6 +30,8 @@ + + @@ -26,11 +45,11 @@ - - + + - + @@ -60,7 +79,7 @@ - + @@ -99,7 +118,7 @@ - + @@ -107,6 +126,4 @@ - - diff --git a/contrib/other-builds/lm/lm.project b/contrib/other-builds/lm/lm.project index a184fe3d1..c30ebe533 100644 --- a/contrib/other-builds/lm/lm.project +++ b/contrib/other-builds/lm/lm.project @@ -1,5 +1,22 @@ + + + + + + + + @@ -27,6 +44,8 @@ + + @@ -40,9 +59,9 @@ - - - + + + @@ -72,7 +91,7 @@ - + @@ -110,7 +129,7 @@ - + @@ -118,6 +137,4 @@ - - diff --git a/contrib/other-builds/moses-cmd/moses-cmd.project b/contrib/other-builds/moses-cmd/moses-cmd.project index 44a0d621f..5303ba7c7 100644 --- a/contrib/other-builds/moses-cmd/moses-cmd.project +++ b/contrib/other-builds/moses-cmd/moses-cmd.project @@ -26,6 +26,13 @@ + + + + + + + @@ -39,20 +46,20 @@ - - - + + + - - - - - - + + + + + + @@ -143,11 +150,4 @@ - - - - - - - diff --git a/contrib/other-builds/pruneGeneration/pruneGeneration.project b/contrib/other-builds/pruneGeneration/pruneGeneration.project index 7060d55ea..39109197a 100644 --- a/contrib/other-builds/pruneGeneration/pruneGeneration.project +++ b/contrib/other-builds/pruneGeneration/pruneGeneration.project @@ -2,6 +2,10 @@ + + + + @@ -15,6 +19,7 @@ + @@ -90,8 +95,4 @@ - - - - diff --git a/contrib/other-builds/score/score.project b/contrib/other-builds/score/score.project index c88df0e78..08e0b9414 100644 --- a/contrib/other-builds/score/score.project +++ b/contrib/other-builds/score/score.project @@ -19,6 +19,10 @@ + + + + @@ -32,17 +36,17 @@ - - - + + + - - - - - - + + + + + + @@ -86,7 +90,7 @@ - + @@ -125,7 +129,7 @@ - + @@ -133,8 +137,4 @@ - - - - diff --git 
a/contrib/other-builds/search/search.project b/contrib/other-builds/search/search.project index d96252a89..8be29fd1d 100644 --- a/contrib/other-builds/search/search.project +++ b/contrib/other-builds/search/search.project @@ -10,6 +10,8 @@ + + @@ -23,9 +25,9 @@ - - - + + + @@ -55,7 +57,7 @@ - + @@ -93,7 +95,7 @@ - + @@ -101,6 +103,4 @@ - - diff --git a/contrib/other-builds/util/util.project b/contrib/other-builds/util/util.project index 1006ddb52..4bb27306e 100644 --- a/contrib/other-builds/util/util.project +++ b/contrib/other-builds/util/util.project @@ -62,6 +62,8 @@ + + @@ -75,8 +77,8 @@ - - + + @@ -105,7 +107,7 @@ - + @@ -143,7 +145,7 @@ - + @@ -151,6 +153,4 @@ - - diff --git a/misc/pruneGeneration.cpp b/misc/pruneGeneration.cpp index 8207e287f..98b21530c 100644 --- a/misc/pruneGeneration.cpp +++ b/misc/pruneGeneration.cpp @@ -1,10 +1,10 @@ #include #include -#include #include #include #include #include "pruneGeneration.h" +#include "moses/InputFileStream.h" using namespace std; @@ -13,16 +13,23 @@ int main(int argc, char **argv) cerr << "Starting" << endl; int limit = atoi(argv[1]); + Process(limit, cin, cout); + + cerr << "Finished" << endl; +} + +void Process(int limit, istream &inStrme, ostream &outStrme) +{ vector records; string prevInWord; string line; - while (getline(cin, line)) { + while (getline(inStrme, line)) { vector toks; Tokenize(toks, line); assert(toks.size() == 4); if (prevInWord != toks[0]) { - Output(limit, records); + Output(outStrme, records, limit); records.clear(); } @@ -34,13 +41,12 @@ int main(int argc, char **argv) } // last - Output(limit, records); + Output(outStrme, records, limit); records.clear(); - cerr << "Finished" << endl; } -void Output(int limit, vector &records) +void Output(ostream &outStrme, vector &records, int limit) { std::sort(records.rbegin(), records.rend()); diff --git a/misc/pruneGeneration.h b/misc/pruneGeneration.h index dae5958f8..b22d09869 100644 --- a/misc/pruneGeneration.h +++ b/misc/pruneGeneration.h @@ -1,6 +1,7 @@ #pragma once #include #include +#include class Rec { @@ -20,7 +21,8 @@ public: //////////////////////////////////////////////////////////// -void Output(int limit, std::vector &records); +void Process(int limit, std::istream &inStrme, std::ostream &outStrme); +void Output(std::ostream &outStrme, std::vector &records, int limit); //////////////////////////////////////////////////////////// inline void Tokenize(std::vector &output From 930dce10bff821431213441fa1c07c1195d916b9 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 25 Jun 2015 13:02:29 +0400 Subject: [PATCH 094/108] prune multiple files at once. 
Make up for failure in ems to give the full path of the gen table --- contrib/other-builds/moses/moses.project | 2 + .../pruneGeneration/pruneGeneration.project | 27 +++++- misc/pruneGeneration.cpp | 44 ++++++++- moses/OutputFileStream.cpp | 90 +++++++++++++++++++ moses/OutputFileStream.h | 81 +++++++++++++++++ 5 files changed, 241 insertions(+), 3 deletions(-) create mode 100644 moses/OutputFileStream.cpp create mode 100644 moses/OutputFileStream.h diff --git a/contrib/other-builds/moses/moses.project b/contrib/other-builds/moses/moses.project index 0fbd942c6..0ceb40723 100644 --- a/contrib/other-builds/moses/moses.project +++ b/contrib/other-builds/moses/moses.project @@ -775,6 +775,8 @@ + + diff --git a/contrib/other-builds/pruneGeneration/pruneGeneration.project b/contrib/other-builds/pruneGeneration/pruneGeneration.project index 39109197a..6f8a6adf5 100644 --- a/contrib/other-builds/pruneGeneration/pruneGeneration.project +++ b/contrib/other-builds/pruneGeneration/pruneGeneration.project @@ -1,5 +1,22 @@ + + + + + + + + @@ -20,8 +37,16 @@ + - + + + + + + + + diff --git a/misc/pruneGeneration.cpp b/misc/pruneGeneration.cpp index 98b21530c..e436263e9 100644 --- a/misc/pruneGeneration.cpp +++ b/misc/pruneGeneration.cpp @@ -3,8 +3,10 @@ #include #include #include +#include #include "pruneGeneration.h" #include "moses/InputFileStream.h" +#include "moses/OutputFileStream.h" using namespace std; @@ -12,8 +14,46 @@ int main(int argc, char **argv) { cerr << "Starting" << endl; int limit = atoi(argv[1]); + string inPathStem = argv[2]; + string outPathStem = argv[3]; - Process(limit, cin, cout); + namespace fs = boost::filesystem; + + //cerr << "inPathStem=" << inPathStem << endl; + fs::path p(inPathStem); + fs::path dir = p.parent_path(); + //cerr << "dir=" << dir << endl; + + fs::path fileStem = p.filename(); + string fileStemStr = fileStem.native(); + size_t fileStemStrSize = fileStemStr.size(); + //cerr << "fileStem=" << fileStemStr << endl; + + // loop thru each file in directory + fs::directory_iterator end_iter; + for( fs::directory_iterator dir_iter(dir) ; dir_iter != end_iter ; ++dir_iter) { + if (fs::is_regular_file(dir_iter->status())) { + fs::path currPath = *dir_iter; + string currPathStr = currPath.native(); + //cerr << "currPathStr=" << currPathStr << endl; + + fs::path currFile = currPath.filename(); + string currFileStr = currFile.native(); + + if (currFileStr.find(fileStemStr) == 0) { + // found gen table we need + //cerr << "found=" << currPathStr << endl; + string suffix = currFileStr.substr(fileStemStrSize, currFileStr.size() - fileStemStrSize); + string outPath = outPathStem + suffix; + cerr << "PRUNING " << currPathStr << " TO " << outPath << endl; + + Moses::InputFileStream inStrme(currPathStr); + Moses::OutputFileStream outStrme(outPath); + Process(limit, inStrme, outStrme); + + } + } + } cerr << "Finished" << endl; } @@ -52,7 +92,7 @@ void Output(ostream &outStrme, vector &records, int limit) for (size_t i = 0; i < limit && i < records.size(); ++i) { const Rec &rec = records[i]; - cout << rec.line << endl; + outStrme << rec.line << endl; } } diff --git a/moses/OutputFileStream.cpp b/moses/OutputFileStream.cpp new file mode 100644 index 000000000..d7874b06f --- /dev/null +++ b/moses/OutputFileStream.cpp @@ -0,0 +1,90 @@ +// $Id: OutputFileStream.cpp 2780 2010-01-29 17:11:17Z bojar $ + +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is 
free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#include +#include +#include +#include "OutputFileStream.h" +#include "gzfilebuf.h" + +using namespace std; +using namespace boost::algorithm; + +namespace Moses +{ +OutputFileStream::OutputFileStream() + :boost::iostreams::filtering_ostream() + ,m_outFile(NULL) + ,m_open(false) +{ +} + +OutputFileStream::OutputFileStream(const std::string &filePath) + :m_outFile(NULL) + ,m_open(false) +{ + Open(filePath); +} + +OutputFileStream::~OutputFileStream() +{ + Close(); +} + +bool OutputFileStream::Open(const std::string &filePath) +{ + assert(!m_open); + if (filePath == std::string("-")) { + // Write to standard output. Leave m_outFile null. + this->push(std::cout); + } else { + m_outFile = new ofstream(filePath.c_str(), ios_base::out | ios_base::binary); + if (m_outFile->fail()) { + return false; + } + + if (ends_with(filePath, ".gz")) { + this->push(boost::iostreams::gzip_compressor()); + } + this->push(*m_outFile); + } + + m_open = true; + return true; +} + +void OutputFileStream::Close() +{ + if (!m_open) return; + this->flush(); + if (m_outFile) { + this->pop(); // file + + m_outFile->close(); + delete m_outFile; + m_outFile = NULL; + } + m_open = false; +} + + +} + diff --git a/moses/OutputFileStream.h b/moses/OutputFileStream.h new file mode 100644 index 000000000..b77741a73 --- /dev/null +++ b/moses/OutputFileStream.h @@ -0,0 +1,81 @@ +// $Id: InputFileStream.h 2939 2010-02-24 11:15:44Z jfouet $ + +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#pragma once + +#include +#include +#include +#include +#include + +namespace Moses +{ + +/** Version of std::ostream with transparent compression. + * + * Transparently compresses output when writing to a file whose name ends in + * ".gz". Or, writes to stdout instead of a file when given a filename + * consisting of just a dash ("-"). 
+ */ +class OutputFileStream : public boost::iostreams::filtering_ostream +{ +private: + /** File that needs flushing & closing when we close this stream. + * + * Is NULL when no file is opened, e.g. when writing to standard output. + */ + std::ofstream *m_outFile; + + /// Is this stream open? + bool m_open; + +public: + /** Create an unopened OutputFileStream. + * + * Until it's been opened, nothing can be done with this stream. + */ + OutputFileStream(); + + /// Create an OutputFileStream, and open it by calling Open(). + OutputFileStream(const std::string &filePath); + virtual ~OutputFileStream(); + + // TODO: Can we please just always throw an exception when this fails? + /** Open stream. + * + * If filePath is "-" (just a dash), this opens the stream for writing to + * standard output. Otherwise, it opens the given file. If the filename + * has the ".gz" suffix, output will be transparently compressed. + * + * Call Close() to close the file. + * + * Returns whether opening the file was successful. It may also throw an + * exception on failure. + */ + bool Open(const std::string &filePath); + + /// Flush and close stream. After this, the stream can be opened again. + void Close(); +}; + +} + From b83803203e94535aa4405df244ccbd32ab80ed34 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 25 Jun 2015 18:10:31 +0400 Subject: [PATCH 095/108] prune generation table in ems --- scripts/ems/experiment.meta | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index 4177f967e..110ab39b7 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -746,7 +746,7 @@ generation-prune pass-unless: AND TRAINING:prune-generation default-name: model/generation-table-pruned final-model: yes - template: $TRAINING:prune-generation < IN > OUT + template: $TRAINING:prune-generation IN OUT build-sparse in: corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus out: sparse From 22cc22064c3cfcd6a762ebf8e597a3ed13642814 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Thu, 25 Jun 2015 15:17:26 +0100 Subject: [PATCH 096/108] Changed implementation of indocs (to keep track of which documents phrases come from) from vector to map. --- .../UG/mm/ug_bitext_agenda_job.h | 17 +++++++++++++---- .../TranslationModel/UG/mm/ug_bitext_jstats.cc | 2 +- moses/TranslationModel/UG/mm/ug_bitext_jstats.h | 3 ++- .../TranslationModel/UG/mm/ug_bitext_pstats.cc | 2 +- moses/TranslationModel/UG/mm/ug_bitext_pstats.h | 4 ++-- moses/TranslationModel/UG/mm/ug_phrasepair.h | 11 +++++++---- 6 files changed, 26 insertions(+), 13 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h index 0e0624351..36b9873e0 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h @@ -137,7 +137,10 @@ int Bitext::agenda::job float p = (*m_bias)[sid]; id_type docid = m_bias->GetClass(sid); - uint32_t k = docid < stats->indoc.size() ? stats->indoc[docid] : 0; + + // uint32_t k = docid < stats->indoc.size() ? stats->indoc[docid] : 0; + std::map::const_iterator m = stats->indoc.find(docid); + uint32_t k = m != stats->indoc.end() ? 
m->second : 0 ; // always consider candidates from dominating documents and // from documents that have not been considered at all yet @@ -159,11 +162,17 @@ int Bitext::agenda::job e = root->getCorpus()->sntEnd(sid); *log << docid << ":" << sid << " " << size_t(k) << "/" << N << " @" << p << " => " << d << " ["; - for (size_t i = 0; i < stats->indoc.size(); ++i) + for (std::map::const_iterator m = stats->indoc.begin(); + m != stats->indoc.end(); ++m) { - if (i) *log << " "; - *log << stats->indoc[i]; + if (m != stats->indoc.begin()) *log << " "; + *log << m->first << ":" << m->second; } + // for (size_t i = 0; i < stats->indoc.size(); ++i) + // { + // if (i) *log << " "; + // *log << stats->indoc[i]; + // } *log << "] "; for (; x < e; ++x) *log << (*m_bitext->V1)[x->id()] << " "; if (!ret) *log << "SKIP"; diff --git a/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc b/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc index bcda9ebf3..517caf783 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc +++ b/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc @@ -76,7 +76,7 @@ namespace Moses ++obwd[bwd_orient]; if (docid >= 0) { - while (int(indoc.size()) <= docid) indoc.push_back(0); + // while (int(indoc.size()) <= docid) indoc.push_back(0); ++indoc[docid]; } } diff --git a/moses/TranslationModel/UG/mm/ug_bitext_jstats.h b/moses/TranslationModel/UG/mm/ug_bitext_jstats.h index dade27649..03b231487 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_jstats.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_jstats.h @@ -27,7 +27,8 @@ namespace Moses uint32_t obwd[Moses::LRModel::NONE+1]; // backward distortion type counts public: - vector indoc; // counts origin of samples (for biased sampling) + std::map indoc; + // vector indoc; // counts origin of samples (for biased sampling) jstats(); jstats(jstats const& other); uint32_t rcnt() const; // raw joint counts diff --git a/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc b/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc index 580d7669b..8702d9c50 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc +++ b/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc @@ -58,7 +58,7 @@ namespace Moses ++obwd[po_bwd]; if (docid >= 0) { - while (int(indoc.size()) <= docid) indoc.push_back(0); + // while (int(indoc.size()) <= docid) indoc.push_back(0); ++indoc[docid]; } } diff --git a/moses/TranslationModel/UG/mm/ug_bitext_pstats.h b/moses/TranslationModel/UG/mm/ug_bitext_pstats.h index 9a14e378b..e5cf4ab26 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_pstats.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_pstats.h @@ -33,8 +33,8 @@ namespace Moses uint32_t ofwd[Moses::LRModel::NONE+1]; // distribution of fwd phrase orientations uint32_t obwd[Moses::LRModel::NONE+1]; // distribution of bwd phrase orientations - std::vector indoc; // distribution over where samples came from - + // std::vector indoc; // distribution over where samples came from + std::map indoc; typedef std::map trg_map_t; trg_map_t trg; pstats(); diff --git a/moses/TranslationModel/UG/mm/ug_phrasepair.h b/moses/TranslationModel/UG/mm/ug_phrasepair.h index 7e565c2df..7f03d89df 100644 --- a/moses/TranslationModel/UG/mm/ug_phrasepair.h +++ b/moses/TranslationModel/UG/mm/ug_phrasepair.h @@ -30,7 +30,8 @@ namespace Moses std::vector aln; float score; bool inverse; - std::vector indoc; + // std::vector indoc; + std::map indoc; PhrasePair() { }; PhrasePair(PhrasePair const& o); @@ -306,10 +307,12 @@ namespace Moses out << toString (V1, this->start1, this->len1) << " ::: " << toString (V2, 
this->start2, this->len2) << " " << this->joint << " ["; - for (size_t i = 0; i < this->indoc.size(); ++i) + // for (size_t i = 0; i < this->indoc.size(); ++i) + for (std::map::const_iterator m = indoc.begin(); + m != indoc.end(); ++m) { - if (i) out << " "; - out << this->indoc[i]; + if (m != indoc.begin()) out << " "; + out << m->first << ":" << m->second; } out << "] ["; vector lrscores; From 41a11dfe8ac9e7d01e950607afdd13492113e9d5 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Thu, 25 Jun 2015 18:20:03 +0100 Subject: [PATCH 097/108] Allow ports other than 80 as the server ports for the context bias server. --- .../TranslationModel/UG/mm/ug_http_client.cc | 11 ++- .../UG/mm/ug_sampling_bias.cc | 77 ++++++++++++++++--- .../TranslationModel/UG/mm/ug_sampling_bias.h | 1 + 3 files changed, 78 insertions(+), 11 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_http_client.cc b/moses/TranslationModel/UG/mm/ug_http_client.cc index 1bbb93b23..da8537910 100644 --- a/moses/TranslationModel/UG/mm/ug_http_client.cc +++ b/moses/TranslationModel/UG/mm/ug_http_client.cc @@ -1,4 +1,5 @@ #include "ug_http_client.h" +#include "moses/Util.h" namespace Moses { using boost::asio::ip::tcp; @@ -31,10 +32,16 @@ http_client(boost::asio::io_service& io_service, std::string url) p = std::min(url.find_first_of(":/"), url.size()); q = std::min(url.find("/"), url.size()); if (p < url.size() && url[p] == ':') - port = url.substr(p,q-p); + port = url.substr(p+1,q-p-1); server = url.substr(0,p); if (q < url.size()) path = url.substr(q); +#if 0 + std::cerr << HERE << std::endl; + std::cerr << "SERVER " << server << std::endl; + std::cerr << "PORT |" << port << "|" << std::endl; + std::cerr << "PATH " << path << std::endl; +#endif init(server, port, path); } @@ -55,7 +62,7 @@ init(std::string const& server, std::string const& port, std::string const& path // Start an asynchronous resolve to translate the server and service names // into a list of endpoints. 
- tcp::resolver::query query(server, port); + tcp::resolver::query query(server, port.c_str()); resolver_.async_resolve(query, boost::bind(&http_client::handle_resolve, this, boost::asio::placeholders::error, diff --git a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc index da408dfb3..d54305997 100644 --- a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc +++ b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc @@ -2,7 +2,7 @@ #include #include #include "moses/Timer.h" - +// #include // #ifdef HAVE_CURLPP // #include // #include @@ -19,19 +19,77 @@ namespace Moses { using ugdiss::id_type; - // #ifdef WITH_MMT_BIAS_CLIENT - std::string - query_bias_server(std::string const& url, std::string const& text) + size_t ca_write_callback(void *ptr, size_t size, size_t nmemb, + std::string* response) { - std::string query = url+uri_encode(text); + char const* c = reinterpret_cast(ptr); + *response += std::string(c, size * nmemb); + return size * nmemb; + } + + std::string + query_bias_server(std::string const& server, std::string const& context) + { +#if 0 + std::string query = server + uri_encode(context); + std::string response; + + CURL* curl = curl_easy_init(); + UTIL_THROW_IF2(!curl, "Could not init curl."); + curl_easy_setopt(curl, CURLOPT_URL, query.c_str()); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, ca_write_callback); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response); + CURLcode res = curl_easy_perform(curl); + curl_easy_cleanup(curl); + return response; +#else + std::string query = server+uri_encode(context); boost::asio::io_service io_service; Moses::http_client c(io_service, query); io_service.run(); - return c.content(); - } - // #endif - DocumentBias + std::string response = c.content(); + std::cerr << "SERVER RESPONSE: " << response << std::endl; + + return c.content(); +#endif + } + +// // #ifdef WITH_MMT_BIAS_CLIENT +// std::string +// query_bias_server(std::string const& url, std::string const& text) +// { +// #if 1 +// std::string query = url+uri_encode(text); +// boost::asio::io_service io_service; +// Moses::http_client c(io_service, query); +// io_service.run(); + +// std::string response = c.content(); +// std::cerr << "SERVER RESPONSE: " << response << std::endl; + +// return c.content(); +// #else +// return ""; +// #endif +// } +// // #endif + + + // std::string + // query_bias_server(std::string const& url, int const port, + // std::string const& context, + // std::string const& src_lang) + // { + // char* response + // = ca_get_context(url.c_str(), port, context.c_str(), src_lang.c_str()); + // UTIL_THROW_IF2(!response, "No response from server"); + // std::string json = response; + // free(response); + // return json; + // } + + DocumentBias ::DocumentBias ( std::vector const& sid2doc, std::map const& docname2docid, @@ -44,6 +102,7 @@ namespace Moses Timer timer; if (log) timer.start(NULL); std::string json = query_bias_server(server_url, text); + std::cerr << "SERVER RESPONSE " << json << std::endl; init_from_json(json, docname2docid, log); if (log) *log << "Bias query took " << timer << " seconds." 
<< std::endl; // #endif diff --git a/moses/TranslationModel/UG/mm/ug_sampling_bias.h b/moses/TranslationModel/UG/mm/ug_sampling_bias.h index f540ddc76..24d39689e 100644 --- a/moses/TranslationModel/UG/mm/ug_sampling_bias.h +++ b/moses/TranslationModel/UG/mm/ug_sampling_bias.h @@ -37,6 +37,7 @@ namespace Moses { std::vector const& m_sid2docid; std::vector m_bias; + // std::map m_bias; public: From faf7b51fb7ad8e382c751c832de74fda745a2f57 Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Fri, 26 Jun 2015 00:01:00 +0100 Subject: [PATCH 098/108] daily automatic beautifier --- misc/pruneGeneration.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/misc/pruneGeneration.cpp b/misc/pruneGeneration.cpp index e436263e9..d58c10ebd 100644 --- a/misc/pruneGeneration.cpp +++ b/misc/pruneGeneration.cpp @@ -31,7 +31,7 @@ int main(int argc, char **argv) // loop thru each file in directory fs::directory_iterator end_iter; - for( fs::directory_iterator dir_iter(dir) ; dir_iter != end_iter ; ++dir_iter) { + for( fs::directory_iterator dir_iter(dir) ; dir_iter != end_iter ; ++dir_iter) { if (fs::is_regular_file(dir_iter->status())) { fs::path currPath = *dir_iter; string currPathStr = currPath.native(); @@ -46,15 +46,15 @@ int main(int argc, char **argv) string suffix = currFileStr.substr(fileStemStrSize, currFileStr.size() - fileStemStrSize); string outPath = outPathStem + suffix; cerr << "PRUNING " << currPathStr << " TO " << outPath << endl; - + Moses::InputFileStream inStrme(currPathStr); Moses::OutputFileStream outStrme(outPath); Process(limit, inStrme, outStrme); - + } } } - + cerr << "Finished" << endl; } From ca5485264196fbc79e4f478e1937e95c170645e8 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 26 Jun 2015 11:37:35 +0400 Subject: [PATCH 099/108] tighten up extract-parallel on osx. Can now use gsplit and bsd split --- scripts/generic/extract-parallel.perl | 38 ++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/scripts/generic/extract-parallel.perl b/scripts/generic/extract-parallel.perl index 3240f24eb..226dbeb6e 100755 --- a/scripts/generic/extract-parallel.perl +++ b/scripts/generic/extract-parallel.perl @@ -1,4 +1,4 @@ -#!/usr/bin/env perl +#!/usr/bin/env perl # # This file is part of moses. Its use is licensed under the GNU Lesser General # Public License version 2.1 or, at your option, any later version. @@ -15,8 +15,7 @@ sub systemCheck($); sub NumStr($); sub DigitStr($); sub CharStr($); - -my $is_osx = ($^O eq "darwin"); +sub GetSplitVersion($); my $alph = "abcdefghijklmnopqrstuvwxyz"; my @alph = (split(//,$alph)); @@ -42,7 +41,7 @@ my $baselineExtract; my $glueFile; my $phraseOrientation = 0; my $phraseOrientationPriorsFile; -my $splitCmdOption="-d"; +my $splitCmdOption = ""; my $GZIP_EXEC; if(`which pigz`) { @@ -53,6 +52,15 @@ else { } print STDERR "using $GZIP_EXEC \n"; +my $isBSDSplit = GetSplitVersion($splitCmd); +print STDERR "isBSDSplit=$isBSDSplit \n"; + +if ($isBSDSplit == 0) { + $splitCmdOption .= "-d"; +} + +my $gzOut = 0; + for (my $i = 8; $i < $#ARGV + 1; ++$i) { $makeTTable = 0 if $ARGV[$i] eq "--NoTTable"; @@ -73,11 +81,15 @@ for (my $i = 8; $i < $#ARGV + 1; ++$i) $phraseOrientationPriorsFile = $ARGV[++$i]; next; } - $splitCmdOption="",next if $ARGV[$i] eq "--NoNumericSuffix"; + if ($ARGV[$i] eq '--GZOutput') { + $gzOut = 1; + } $otherExtractArgs .= $ARGV[$i] ." 
"; } +die("Need to specify --GZOutput for parallel extract") if ($gzOut == 0); + my $cmd; my $TMPDIR=dirname($extract) ."/tmp.$$"; $cmd = "mkdir -p $TMPDIR; ls -l $TMPDIR"; @@ -272,7 +284,7 @@ if ($phraseOrientation && defined($phraseOrientationPriorsFile)) { # delete temporary files $cmd = "rm -rf $TMPDIR \n"; -`$cmd`; +systemCheck($cmd); print STDERR "Finished ".localtime() ."\n"; @@ -352,10 +364,22 @@ sub CharStr($) sub NumStr($) { my $i = shift; - if ($is_osx){ + if ($isBSDSplit){ return CharStr($i); }else{ return DigitStr($i); } } +sub GetSplitVersion($) +{ + my $splitCmd = shift; + my $retVal = system("$splitCmd -h"); + if ($retVal != 0) { + return 1; + } + else { + return 0; + } +} + From 57e213ed190a15ebfbc193e9eeb525813e92cc1a Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 26 Jun 2015 12:18:21 +0400 Subject: [PATCH 100/108] tighten up extract-parallel on osx. Can now use gsplit and bsd split --- scripts/generic/extract-parallel.perl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/generic/extract-parallel.perl b/scripts/generic/extract-parallel.perl index 226dbeb6e..2424c1bd2 100755 --- a/scripts/generic/extract-parallel.perl +++ b/scripts/generic/extract-parallel.perl @@ -374,7 +374,7 @@ sub NumStr($) sub GetSplitVersion($) { my $splitCmd = shift; - my $retVal = system("$splitCmd -h"); + my $retVal = system("$splitCmd --help"); if ($retVal != 0) { return 1; } From 82edbb98a7aa9186287f8f00dfcbbeb2906e7a5a Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sun, 28 Jun 2015 10:40:43 +0400 Subject: [PATCH 101/108] comments in ini file about default weights --- scripts/training/train-model.perl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl index b693d774d..4d73ef4ee 100755 --- a/scripts/training/train-model.perl +++ b/scripts/training/train-model.perl @@ -2358,6 +2358,8 @@ sub create_ini { print INI "\n# dense weights for feature functions\n"; print INI "[weight]\n"; + print INI "# The default weights are NOT optimized for translation quality. You MUST tune the weights.\n"; + print INI "# Documentation for tuning is here: http://www.statmt.org/moses/?n=FactoredTraining.Tuning \n"; print INI "UnknownWordPenalty0= 1\n"; print INI "WordPenalty0= -1\n"; print INI "PhrasePenalty0= 0.2\n"; From f66beabf4f0dca33a6bbcc37072811e9017e19b5 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sun, 28 Jun 2015 14:03:54 +0400 Subject: [PATCH 102/108] Generation error in EMS due to pruning. Lets see if this works. 
--- scripts/ems/experiment.meta | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index 110ab39b7..ee6b188e8 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -743,7 +743,7 @@ generation-prune in: generation-table out: generation-table-pruned rerun-on-change: TRAINING:prune-generation - pass-unless: AND TRAINING:prune-generation + ignore-unless: AND TRAINING:prune-generation default-name: model/generation-table-pruned final-model: yes template: $TRAINING:prune-generation IN OUT From f7c3d00824e1664ba0cbfbc80ff94a82f3eb7561 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sun, 28 Jun 2015 22:20:42 +0400 Subject: [PATCH 103/108] more testing of c++11 waters --- phrase-extract/ScoreFeatureTest.cpp | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/phrase-extract/ScoreFeatureTest.cpp b/phrase-extract/ScoreFeatureTest.cpp index cc22f8630..0ed2f71e6 100644 --- a/phrase-extract/ScoreFeatureTest.cpp +++ b/phrase-extract/ScoreFeatureTest.cpp @@ -26,6 +26,7 @@ #include #include +#include using namespace MosesTraining; using namespace std; @@ -81,6 +82,16 @@ static void checkDomainConfigured( BOOST_CHECK(manager.includeSentenceId()); } +template +T adder(T v) { + return v; +} + +template +T adder(T first, Args... args) { + return first + adder(args...); +} + BOOST_AUTO_TEST_CASE(manager_config_domain) { checkDomainConfigured @@ -102,8 +113,23 @@ BOOST_AUTO_TEST_CASE(manager_config_domain) s.insert(4); s.insert(1); -for (auto i: s) { + for (auto i: s) { cerr << i << " "; } + + unordered_map m; + m["a"] = 4; + m["ba"] = 6; + m["aabc"] = 7; + + for (auto i: m) { + cerr << i.first << "=" << i.second << " "; + } + + long sum = adder(1, 2, 3, 8, 7); + + std::string s1 = "x", s2 = "aa", s3 = "bb", s4 = "yy"; + std::string ssum = adder(s1, s2, s3, s4); + } From fba4a3e24da01a01088c95c8c85f71d551ba4634 Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Mon, 29 Jun 2015 00:00:54 +0100 Subject: [PATCH 104/108] daily automatic beautifier --- phrase-extract/ScoreFeatureTest.cpp | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/phrase-extract/ScoreFeatureTest.cpp b/phrase-extract/ScoreFeatureTest.cpp index 0ed2f71e6..94a5a0480 100644 --- a/phrase-extract/ScoreFeatureTest.cpp +++ b/phrase-extract/ScoreFeatureTest.cpp @@ -83,12 +83,14 @@ static void checkDomainConfigured( } template -T adder(T v) { +T adder(T v) +{ return v; } template -T adder(T first, Args... args) { +T adder(T first, Args... args) +{ return first + adder(args...); } @@ -113,7 +115,7 @@ BOOST_AUTO_TEST_CASE(manager_config_domain) s.insert(4); s.insert(1); - for (auto i: s) { +for (auto i: s) { cerr << i << " "; } @@ -121,15 +123,15 @@ BOOST_AUTO_TEST_CASE(manager_config_domain) m["a"] = 4; m["ba"] = 6; m["aabc"] = 7; - - for (auto i: m) { + +for (auto i: m) { cerr << i.first << "=" << i.second << " "; } - + long sum = adder(1, 2, 3, 8, 7); - std::string s1 = "x", s2 = "aa", s3 = "bb", s4 = "yy"; - std::string ssum = adder(s1, s2, s3, s4); + std::string s1 = "x", s2 = "aa", s3 = "bb", s4 = "yy"; + std::string ssum = adder(s1, s2, s3, s4); } From 5e81e4b9c37bcfe4f7828ca16bb03c28cbf4f491 Mon Sep 17 00:00:00 2001 From: Jeroen Vermeulen Date: Mon, 29 Jun 2015 12:23:53 +0700 Subject: [PATCH 105/108] Simplify unnecessarily complicated condition. 
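Note before the diff: the simplification below is behavior-preserving. The old code returned ret < 0 whenever RecombineCompare gave a non-zero result and false otherwise, and 0 < 0 is itself false, so both branches collapse into a single comparison. A tiny standalone check of that equivalence (illustrative functions, not Moses code):

#include <cassert>

// originalForm mirrors the old branching, simplifiedForm the new one-liner.
bool originalForm(int ret) {
  if (ret != 0)
    return ret < 0;
  return false;
}

bool simplifiedForm(int ret) { return ret < 0; }

int main() {
  for (int ret = -3; ret <= 3; ++ret)
    assert(originalForm(ret) == simplifiedForm(ret));  // identical for every sign of ret
  return 0;
}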
--- moses/ChartHypothesisCollection.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/moses/ChartHypothesisCollection.h b/moses/ChartHypothesisCollection.h index 37cd907d9..b2464e151 100644 --- a/moses/ChartHypothesisCollection.h +++ b/moses/ChartHypothesisCollection.h @@ -52,11 +52,7 @@ public: // shouldn't be mixing hypos with different lhs assert(hypoA->GetTargetLHS() == hypoB->GetTargetLHS()); - int ret = hypoA->RecombineCompare(*hypoB); - if (ret != 0) - return (ret < 0); - - return false; + return (hypoA->RecombineCompare(*hypoB) < 0); } }; From a374706bd4a995aa810b748f122b2d6279866088 Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Wed, 1 Jul 2015 00:00:59 +0100 Subject: [PATCH 106/108] daily automatic beautifier --- moses/server/TranslationRequest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moses/server/TranslationRequest.cpp b/moses/server/TranslationRequest.cpp index 3848f81ba..bc2b5032b 100644 --- a/moses/server/TranslationRequest.cpp +++ b/moses/server/TranslationRequest.cpp @@ -1,5 +1,5 @@ #include "TranslationRequest.h" -#include "moses/ContextScope.h" +#include "moses/ContextScope.h" #include namespace MosesServer From 81f337bcd838a69bf0e275c8138b173427a17d02 Mon Sep 17 00:00:00 2001 From: hieu Date: Wed, 1 Jul 2015 09:42:07 +0400 Subject: [PATCH 107/108] rollback c++11 for now --- Jamroot | 2 +- moses/StaticData.cpp | 3 ++- phrase-extract/ScoreFeatureTest.cpp | 33 ++++++++++++++++------------- 3 files changed, 21 insertions(+), 17 deletions(-) diff --git a/Jamroot b/Jamroot index 4f76ec3ba..b3544274b 100644 --- a/Jamroot +++ b/Jamroot @@ -108,7 +108,7 @@ external-lib z ; #lib dl : : static:static shared:shared ; #requirements += dl ; -requirements += -std=c++0x ; +#requirements += -std=c++0x ; if ! 
[ option.get "without-tcmalloc" : : "yes" ] && [ test_library "tcmalloc_minimal" ] { if [ option.get "full-tcmalloc" : : "yes" ] { diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp index 28d9f7831..281129a2e 100644 --- a/moses/StaticData.cpp +++ b/moses/StaticData.cpp @@ -1115,7 +1115,8 @@ void StaticData::LoadSparseWeightsFromConfig() } std::map > weights = m_parameter->GetAllWeights(); - for (auto iter = weights.begin(); iter != weights.end(); ++iter) { + std::map >::iterator iter; + for (iter = weights.begin(); iter != weights.end(); ++iter) { // this indicates that it is sparse feature if (featureNames.find(iter->first) == featureNames.end()) { UTIL_THROW_IF2(iter->second.size() != 1, "ERROR: only one weight per sparse feature allowed: " << iter->first); diff --git a/phrase-extract/ScoreFeatureTest.cpp b/phrase-extract/ScoreFeatureTest.cpp index 94a5a0480..9537b970f 100644 --- a/phrase-extract/ScoreFeatureTest.cpp +++ b/phrase-extract/ScoreFeatureTest.cpp @@ -24,9 +24,10 @@ #define BOOST_TEST_MODULE MosesTrainingScoreFeature #include #include +#include -#include -#include +//#include +//#include using namespace MosesTraining; using namespace std; @@ -54,16 +55,16 @@ BOOST_AUTO_TEST_CASE(manager_configure_domain_except) //Check that configure rejects illegal domain arg combinations ScoreFeatureManager manager; BOOST_CHECK_THROW( - manager.configure( {"--DomainRatio","/dev/null","--DomainIndicator","/dev/null"}), + manager.configure(boost::assign::list_of("--DomainRatio")("/dev/null")("--DomainIndicator")("/dev/null")), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure( {"--SparseDomainSubset","/dev/null","--SparseDomainRatio","/dev/null"}), + manager.configure(boost::assign::list_of("--SparseDomainSubset")("/dev/null")("--SparseDomainRatio")("/dev/null")), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure( {"--SparseDomainBlah","/dev/null"}), + manager.configure(boost::assign::list_of("--SparseDomainBlah")("/dev/null")), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure( {"--DomainSubset"}), + manager.configure(boost::assign::list_of("--DomainSubset")), ScoreFeatureArgumentException); } @@ -97,25 +98,27 @@ T adder(T first, Args... 
args) BOOST_AUTO_TEST_CASE(manager_config_domain) { checkDomainConfigured - ( {"--DomainRatio","/dev/null"}); + (boost::assign::list_of("--DomainRatio")("/dev/null")); checkDomainConfigured - ( {"--DomainIndicator","/dev/null"}); + (boost::assign::list_of("--DomainIndicator")("/dev/null")); checkDomainConfigured - ( {"--DomainSubset","/dev/null"}); + (boost::assign::list_of("--DomainSubset")("/dev/null")); checkDomainConfigured - ( {"--SparseDomainRatio","/dev/null"}); + (boost::assign::list_of("--SparseDomainRatio")("/dev/null")); checkDomainConfigured - ( {"--SparseDomainIndicator","/dev/null"}); + (boost::assign::list_of("--SparseDomainIndicator")("/dev/null")); checkDomainConfigured - ( {"--SparseDomainSubset","/dev/null"}); + (boost::assign::list_of("--SparseDomainSubset")("/dev/null")); + /* + // C++11 testing unordered_set s; s.insert(4); s.insert(7); s.insert(4); s.insert(1); -for (auto i: s) { + for (auto i: s) { cerr << i << " "; } @@ -124,7 +127,7 @@ for (auto i: s) { m["ba"] = 6; m["aabc"] = 7; -for (auto i: m) { + for (auto i: m) { cerr << i.first << "=" << i.second << " "; } @@ -132,6 +135,6 @@ for (auto i: m) { std::string s1 = "x", s2 = "aa", s3 = "bb", s4 = "yy"; std::string ssum = adder(s1, s2, s3, s4); - + */ } From 86292f2ce332013c187afd8046a9eeec2770561e Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Thu, 2 Jul 2015 00:01:16 +0100 Subject: [PATCH 108/108] daily automatic beautifier --- moses/StaticData.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp index 281129a2e..8fb88c257 100644 --- a/moses/StaticData.cpp +++ b/moses/StaticData.cpp @@ -1115,7 +1115,7 @@ void StaticData::LoadSparseWeightsFromConfig() } std::map > weights = m_parameter->GetAllWeights(); - std::map >::iterator iter; + std::map >::iterator iter; for (iter = weights.begin(); iter != weights.end(); ++iter) { // this indicates that it is sparse feature if (featureNames.find(iter->first) == featureNames.end()) {
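Closing note: the rollback in the last patches swaps the C++11 idioms tried out in patch 103 for their C++03-compatible counterparts, boost::assign::list_of in place of brace-initializer lists and spelled-out iterator types in place of auto. A small standalone sketch of those two substitutions (variable names are illustrative, not Moses code):

#include <boost/assign/list_of.hpp>
#include <map>
#include <string>
#include <vector>

int main() {
  // C++03-compatible list construction; the C++11 form would be
  //   std::vector<std::string> args = {"--DomainRatio", "/dev/null"};
  std::vector<std::string> args =
      boost::assign::list_of<std::string>("--DomainRatio")("/dev/null");

  // Spelled-out iterator type instead of `auto`, as in the StaticData.cpp hunk above.
  std::map<std::string, std::vector<float> > weights;
  weights["sparse-feature"].push_back(0.5f);
  size_t total = 0;
  std::map<std::string, std::vector<float> >::iterator iter;
  for (iter = weights.begin(); iter != weights.end(); ++iter)
    total += iter->second.size();  // count weights across all sparse features

  return (args.size() == 2 && total == 1) ? 0 : 1;
}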