From ab2d396781e57d8d7e3526d102db24597e289fab Mon Sep 17 00:00:00 2001 From: Barry Haddow Date: Thu, 28 May 2015 17:10:21 +0100 Subject: [PATCH 1/7] Min score parameter --- scripts/training/binarize-model.perl | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/training/binarize-model.perl b/scripts/training/binarize-model.perl index 0239f5fc8..cca74f1ab 100755 --- a/scripts/training/binarize-model.perl +++ b/scripts/training/binarize-model.perl @@ -17,12 +17,14 @@ if ($SCRIPTS_ROOTDIR eq '') { } $SCRIPTS_ROOTDIR =~ s/\/training$//; -my ($binarizer, $input_config, $output_config); +my ($binarizer, $input_config, $output_config, $min_score); my $opt_hierarchical = 0; -$binarizer = "$SCRIPTS_ROOTDIR/../bin/processPhraseTable"; +$binarizer = "$SCRIPTS_ROOTDIR/../bin/processPhraseTableMin"; +$min_score = "0"; GetOptions( "Hierarchical" => \$opt_hierarchical, - "Binarizer=s" => \$binarizer + "Binarizer=s" => \$binarizer, + "MinScore=s" => \$min_score ) or exit(1); $input_config = shift; @@ -37,7 +39,9 @@ my $hierarchical = ""; $hierarchical = "-Hierarchical" if $opt_hierarchical; my $targetdir = "$output_config.tables"; -safesystem("$RealBin/filter-model-given-input.pl $targetdir $input_config /dev/null $hierarchical -nofilter -Binarizer $binarizer") || die "binarising failed"; +my $cmd = "$RealBin/filter-model-given-input.pl $targetdir $input_config /dev/null $hierarchical -nofilter -Binarizer $binarizer"; +$cmd .= "--MinScore $min_score" if (defined $min_score); +safesystem($cmd) || die "binarising failed"; safesystem("rm -f $output_config; ln -s $targetdir/moses.ini $output_config") || die "failed to link new ini file"; #FIXME: Why isn't this in a module? From c27aa193eaa3c73754c8d90dea0cd32dd5a22e7d Mon Sep 17 00:00:00 2001 From: Barry Haddow Date: Thu, 28 May 2015 17:44:26 +0100 Subject: [PATCH 2/7] Revert "Min score parameter". Doesn't work without filter. This reverts commit ab2d396781e57d8d7e3526d102db24597e289fab. --- scripts/training/binarize-model.perl | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/scripts/training/binarize-model.perl b/scripts/training/binarize-model.perl index cca74f1ab..0239f5fc8 100755 --- a/scripts/training/binarize-model.perl +++ b/scripts/training/binarize-model.perl @@ -17,14 +17,12 @@ if ($SCRIPTS_ROOTDIR eq '') { } $SCRIPTS_ROOTDIR =~ s/\/training$//; -my ($binarizer, $input_config, $output_config, $min_score); +my ($binarizer, $input_config, $output_config); my $opt_hierarchical = 0; -$binarizer = "$SCRIPTS_ROOTDIR/../bin/processPhraseTableMin"; -$min_score = "0"; +$binarizer = "$SCRIPTS_ROOTDIR/../bin/processPhraseTable"; GetOptions( "Hierarchical" => \$opt_hierarchical, - "Binarizer=s" => \$binarizer, - "MinScore=s" => \$min_score + "Binarizer=s" => \$binarizer ) or exit(1); $input_config = shift; @@ -39,9 +37,7 @@ my $hierarchical = ""; $hierarchical = "-Hierarchical" if $opt_hierarchical; my $targetdir = "$output_config.tables"; -my $cmd = "$RealBin/filter-model-given-input.pl $targetdir $input_config /dev/null $hierarchical -nofilter -Binarizer $binarizer"; -$cmd .= "--MinScore $min_score" if (defined $min_score); -safesystem($cmd) || die "binarising failed"; +safesystem("$RealBin/filter-model-given-input.pl $targetdir $input_config /dev/null $hierarchical -nofilter -Binarizer $binarizer") || die "binarising failed"; safesystem("rm -f $output_config; ln -s $targetdir/moses.ini $output_config") || die "failed to link new ini file"; #FIXME: Why isn't this in a module? From 26170a41790bc1dfbc01c90dbcbf2699a0fe3cd0 Mon Sep 17 00:00:00 2001 From: Jeroen Vermeulen Date: Fri, 29 May 2015 09:37:37 +0700 Subject: [PATCH 3/7] Friendlier error reporting in beautify.py. --- scripts/other/beautify.py | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/scripts/other/beautify.py b/scripts/other/beautify.py index f03a58ce7..0caa6b162 100755 --- a/scripts/other/beautify.py +++ b/scripts/other/beautify.py @@ -38,6 +38,17 @@ BEAUTIFY_IGNORE = '.beautify-ignore' class LintCheckFailure(Exception): """Lint was found, or the lint checker otherwise returned failure.""" + exit_code = 1 + + +class ProgramFailure(Exception): + """The program failed, but it's not a bug. No traceback.""" + exit_code = 2 + + +class CommandLineError(Exception): + """Something wrong with the command-line arguments.""" + exit_code = 3 def read_ignore_file(root_dir): @@ -52,7 +63,7 @@ def read_ignore_file(root_dir): ignore_contents = ignore_file.read() except IOError as error: if error.errno == ENOENT: - raise Exception( + raise ProgramFailure( "No .gitignore file found in %s. " "Is it really the project's root directory?" % root_dir) @@ -200,7 +211,7 @@ def check_astyle_version(verbose=False): ['astyle', '--version'], verbose=verbose, env={'LC_ALL': 'C'}) version = version.strip() if version != EXPECTED_ASTYLE_VERSION: - raise Exception( + raise ProgramFailure( "Wrong astyle version. " "Expected '%s', but got version string '%s'." % (EXPECTED_ASTYLE_VERSION, version)) @@ -226,8 +237,15 @@ def run_perltidy(source_files, verbose=False, dry_run=False): # Write "} else {", with 'else' on the same line as the braces. '--cuddled-else', ] - _, stderr = run_command( - command_line + source_files, verbose=verbose, dry_run=dry_run) + try: + _, stderr = run_command( + command_line + source_files, verbose=verbose, dry_run=dry_run) + except OSError as error: + if error.errno == ENOENT: + raise ProgramFailure( + "Could not run 'perltidy'. Make sure that it is installed.") + else: + raise if stderr != '': sys.stderr.write(stderr) @@ -386,7 +404,7 @@ def main(): """Find and format source files.""" args = parse_arguments() if not args.format and not args.lint: - raise Exception("Select action: --format, --lint, or both.") + raise CommandLineError("Select action: --format, --lint, or both.") ignore = read_ignore_file(args.root_dir) @@ -409,8 +427,8 @@ def main(): if __name__ == '__main__': try: main() - except LintCheckFailure as error: + except (CommandLineError, LintCheckFailure, ProgramFailure) as error: # This is a failure, but not a bug. Print a friendly error # message, not a traceback. sys.stderr.write('%s\n' % error) - sys.exit(1) + sys.exit(error.exit_code) From ef028446f3640e007215b4576a4dc52a9c9de6db Mon Sep 17 00:00:00 2001 From: Jeroen Vermeulen Date: Fri, 29 May 2015 18:30:26 +0700 Subject: [PATCH 4/7] Add license notices to scripts. This is not pleasant to read (and much, much less pleasant to write!) but sort of necessary in an open project. Right now it's quite hard to figure out what is licensed how, which doesn't matter much to most people but can suddenly become very important when people want to know what they're being allowed to do. I kept the notices as short as I could. As far as I could see, everything without a clear license notice is LGPL v2.1 or later. --- scripts/OSM/OSM-Train.perl | 3 ++ scripts/OSM/extract-singletons.perl | 3 ++ scripts/OSM/flipAlignment.perl | 3 ++ scripts/Transliteration/clean.pl | 5 ++- scripts/Transliteration/corpusCreator.pl | 3 ++ .../in-decoding-transliteration.pl | 3 ++ .../post-decoding-transliteration.pl | 3 ++ .../prepare-transliteration-phrase-table.pl | 3 ++ scripts/Transliteration/threshold.pl | 3 ++ .../train-transliteration-module.pl | 3 ++ ...trap-hypothesis-difference-significance.pl | 3 ++ scripts/analysis/extract-target-trees.py | 12 ++++-- scripts/analysis/nontranslated_words.pl | 3 ++ scripts/analysis/oov.pl | 3 ++ scripts/analysis/sentence-by-sentence.pl | 3 ++ scripts/analysis/sg2dot.perl | 2 + scripts/analysis/show-phrases-used.pl | 3 ++ scripts/analysis/smtgui/Corpus.pm | 3 ++ .../analysis/smtgui/filter-phrase-table.pl | 3 ++ scripts/analysis/smtgui/newsmtgui.cgi | 3 ++ scripts/analysis/suspicious_tokenization.pl | 3 ++ scripts/analysis/weight-scan-summarize.sh | 4 ++ scripts/analysis/weight-scan.pl | 4 ++ scripts/ems/experiment.perl | 3 ++ scripts/ems/fix-info.perl | 3 ++ scripts/ems/support/analysis.perl | 3 ++ scripts/ems/support/berkeley-process.sh | 3 ++ scripts/ems/support/berkeley-train.sh | 3 ++ .../build-domain-file-from-subcorpora.perl | 3 ++ .../ems/support/build-sparse-features.perl | 3 ++ .../support/consolidate-training-data.perl | 3 ++ scripts/ems/support/defaultconfig.py | 3 ++ scripts/ems/support/fast-align-in-parts.perl | 3 ++ .../generic-multicore-parallelizer.perl | 3 ++ scripts/ems/support/generic-parallelizer.perl | 3 ++ scripts/ems/support/input-from-sgm.perl | 3 ++ scripts/ems/support/interpolate-lm.perl | 3 ++ scripts/ems/support/lmplz-wrapper.perl | 3 ++ scripts/ems/support/mml-filter.perl | 3 ++ scripts/ems/support/mml-score.perl | 3 ++ scripts/ems/support/mml-train.perl | 3 ++ scripts/ems/support/prepare-fast-align.perl | 3 ++ scripts/ems/support/reference-from-sgm.perl | 3 ++ .../support/remove-segmentation-markup.perl | 3 ++ .../ems/support/report-experiment-scores.perl | 3 ++ .../run-command-on-multiple-refsets.perl | 3 ++ scripts/ems/support/run-wade.perl | 3 ++ scripts/ems/support/split-sentences.perl | 3 ++ scripts/ems/support/submit-grid.perl | 3 ++ ...ubstitute-filtered-tables-and-weights.perl | 3 ++ .../support/substitute-filtered-tables.perl | 3 ++ scripts/ems/support/substitute-weights.perl | 3 ++ .../ems/support/symmetrize-fast-align.perl | 3 ++ scripts/ems/support/thot-lm-wrapper.perl | 3 ++ .../ems/support/tree-converter-wrapper.perl | 3 ++ scripts/ems/support/wrap-xml.perl | 3 ++ scripts/ems/web/analysis.php | 5 +++ scripts/ems/web/analysis_diff.php | 4 ++ scripts/ems/web/diff.php | 5 +++ scripts/ems/web/hierarchical-segmentation.js | 4 ++ scripts/ems/web/index.php | 5 +++ scripts/ems/web/lib.php | 5 +++ scripts/ems/web/overview.php | 4 ++ scripts/ems/web/progress.perl | 3 ++ scripts/ems/web/sgviz.js | 4 ++ scripts/ems/web/sgviz.php | 6 +++ scripts/fuzzy-match/create_xml.perl | 3 ++ scripts/generic/bsbleu.py | 3 ++ scripts/generic/compound-splitter.perl | 3 ++ scripts/generic/extract-factors.pl | 3 ++ scripts/generic/extract-parallel.perl | 3 ++ scripts/generic/fsa2fsal.pl | 3 ++ scripts/generic/fsa2plf.pl | 3 ++ scripts/generic/fsal2fsa.pl | 3 ++ scripts/generic/generic-parallel.perl | 3 ++ scripts/generic/giza-parallel.perl | 3 ++ scripts/generic/lopar2pos.pl | 3 ++ scripts/generic/moses-parallel.pl | 3 ++ scripts/generic/moses_sim_pe.py | 29 ++++++++------ scripts/generic/mteval-v12.pl | 3 ++ scripts/generic/mteval-v13a.pl | 3 ++ scripts/generic/multi-bleu.perl | 3 ++ scripts/generic/ph_numbers.perl | 3 ++ scripts/generic/qsub-wrapper.pl | 3 ++ scripts/generic/reverse-alignment.perl | 3 ++ scripts/generic/score-parallel.perl | 3 ++ scripts/generic/strip-xml.perl | 3 ++ scripts/generic/trainlm-irst2.perl | 3 ++ scripts/other/beautify.py | 5 +++ scripts/other/convert-pt.perl | 3 ++ scripts/other/delete-scores.perl | 3 ++ scripts/other/gacha_filter.py | 3 ++ .../get_many_translations_from_google.perl | 3 ++ scripts/other/retain-lines.perl | 3 ++ .../other/translate_by_microsoft_bing.perl | 3 ++ scripts/recaser/detruecase.perl | 3 ++ scripts/recaser/recase.perl | 3 ++ scripts/recaser/train-recaser.perl | 3 ++ scripts/recaser/train-truecaser.perl | 3 ++ scripts/recaser/truecase.perl | 3 ++ .../MosesScriptsRegressionTesting.pm | 3 ++ scripts/regression-testing/compare-results.pl | 3 ++ .../create_localized_moses_ini.pl | 3 ++ scripts/regression-testing/modify-pars.pl | 3 ++ scripts/regression-testing/moses-virtual.pl | 3 ++ scripts/regression-testing/run-single-test.pl | 3 ++ scripts/regression-testing/run-test-suite.pl | 3 ++ scripts/server/moses.py | 3 ++ scripts/server/sim-pe.py | 10 ++++- .../tokenizer/deescape-special-chars-PTB.perl | 3 ++ scripts/tokenizer/deescape-special-chars.perl | 3 ++ scripts/tokenizer/detokenizer.perl | 3 ++ scripts/tokenizer/escape-special-chars.perl | 3 ++ scripts/tokenizer/lowercase.perl | 3 ++ scripts/tokenizer/normalize-punctuation.perl | 3 ++ scripts/tokenizer/pre-tok-clean.perl | 3 ++ scripts/tokenizer/pre-tokenizer.perl | 3 ++ scripts/tokenizer/pre_tokenize_cleaning.py | 3 ++ .../tokenizer/remove-non-printing-char.perl | 3 ++ .../replace-unicode-punctuation.perl | 3 ++ scripts/tokenizer/tokenizer.perl | 3 ++ scripts/tokenizer/tokenizer_PTB.perl | 3 ++ scripts/training/LexicalTranslationModel.pm | 3 ++ scripts/training/absolutize_moses_model.pl | 3 ++ scripts/training/analyse_moses_model.pl | 3 ++ .../bilingual-lm/averageNullEmbedding.py | 3 ++ scripts/training/bilingual-lm/extract.py | 3 ++ scripts/training/bilingual-lm/extract_test.py | 3 ++ .../training/bilingual-lm/extract_training.py | 3 ++ .../training/bilingual-lm/reduce_ngrams.py | 5 ++- scripts/training/bilingual-lm/test_nplm.py | 3 ++ scripts/training/bilingual-lm/train_nplm.py | 3 ++ scripts/training/binarize-model.perl | 3 ++ scripts/training/build-generation-table.perl | 3 ++ scripts/training/build-mmsapt.perl | 3 ++ scripts/training/clean-corpus-n.perl | 3 ++ scripts/training/clone_moses_model.pl | 3 ++ scripts/training/combine_factors.pl | 3 ++ scripts/training/convert-moses-ini-to-v2.perl | 3 ++ .../training/convert-moses-ini-v2-to-v1.py | 3 ++ scripts/training/corpus-sizes.perl | 3 ++ scripts/training/create_count_tables.py | 3 ++ scripts/training/exodus.perl | 3 ++ scripts/training/filter-model-given-input.pl | 3 ++ scripts/training/filter-rule-table.py | 40 ++++++++++--------- scripts/training/flexibility_score.py | 3 ++ scripts/training/giza2bal.pl | 3 ++ scripts/training/mert-moses.pl | 4 ++ scripts/training/postprocess-lopar.perl | 3 ++ .../training/rdlm/average_null_embedding.py | 3 ++ .../training/rdlm/extract_syntactic_ngrams.py | 3 ++ scripts/training/rdlm/extract_vocab.py | 3 ++ scripts/training/rdlm/train_rdlm.py | 3 ++ scripts/training/reduce-factors.perl | 3 ++ scripts/training/reduce-topt-count.pl | 3 ++ scripts/training/reduce_combine.pl | 3 ++ ...an-phrase-pairs-from-reordering-table.perl | 3 ++ scripts/training/threshold-filter.perl | 3 ++ .../training/train-global-lexicon-model.perl | 3 ++ scripts/training/train-model.perl | 3 ++ scripts/training/train-neurallm.py | 8 +++- .../adam-suffix-array/suffix-array-create.sh | 3 ++ .../adam-suffix-array/suffix-array-extract.sh | 3 ++ .../wrappers/berkeleyparsed2mosesxml.perl | 3 ++ .../wrappers/berkeleyparsed2mosesxml_PTB.perl | 3 ++ scripts/training/wrappers/conll2mosesxml.py | 3 ++ .../wrappers/filter-excluded-lines.perl | 3 ++ .../training/wrappers/find-unparseable.perl | 3 ++ scripts/training/wrappers/mada-wrapper.perl | 3 ++ scripts/training/wrappers/madamira-tok.perl | 3 ++ .../training/wrappers/madamira-wrapper.perl | 3 ++ .../make-factor-brown-cluster-mkcls.perl | 3 ++ .../wrappers/make-factor-de-lemma.perl | 3 ++ .../wrappers/make-factor-de-morph.perl | 3 ++ .../training/wrappers/make-factor-de-pos.perl | 3 ++ .../wrappers/make-factor-en-porter.perl | 3 ++ .../wrappers/make-factor-en-pos.mxpost.perl | 3 ++ .../wrappers/make-factor-pos.tree-tagger.perl | 3 ++ .../training/wrappers/make-factor-stem.perl | 3 ++ .../training/wrappers/make-factor-suffix.perl | 3 ++ .../training/wrappers/morfessor-wrapper.perl | 3 ++ .../wrappers/mosesxml2berkeleyparsed.perl | 3 ++ .../training/wrappers/mosesxml2brackets.py | 5 ++- .../training/wrappers/parse-de-berkeley.perl | 3 ++ .../training/wrappers/parse-de-bitpar.perl | 3 ++ .../training/wrappers/parse-en-collins.perl | 3 ++ scripts/training/wrappers/parse-en-egret.perl | 3 ++ scripts/training/wrappers/parse-en-senna.perl | 3 ++ .../training/wrappers/parse-en-stanford.py | 12 ++++-- scripts/training/wrappers/senna2brackets.py | 33 ++++++++------- .../wrappers/syntax-hyphen-splitting.perl | 3 ++ .../wrappers/tagger-german-chunk.perl | 3 ++ 192 files changed, 666 insertions(+), 58 deletions(-) diff --git a/scripts/OSM/OSM-Train.perl b/scripts/OSM/OSM-Train.perl index 895a821db..07ad71f68 100755 --- a/scripts/OSM/OSM-Train.perl +++ b/scripts/OSM/OSM-Train.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/OSM/extract-singletons.perl b/scripts/OSM/extract-singletons.perl index 5a1665a8c..6295edfad 100755 --- a/scripts/OSM/extract-singletons.perl +++ b/scripts/OSM/extract-singletons.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. #use strict; use warnings; diff --git a/scripts/OSM/flipAlignment.perl b/scripts/OSM/flipAlignment.perl index b896c0a23..57a1e9bb0 100755 --- a/scripts/OSM/flipAlignment.perl +++ b/scripts/OSM/flipAlignment.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/Transliteration/clean.pl b/scripts/Transliteration/clean.pl index ccc364fc9..7a08271da 100755 --- a/scripts/Transliteration/clean.pl +++ b/scripts/Transliteration/clean.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. #input hindi word urdu word, delete all those entries that have number on any side use warnings; @@ -314,4 +317,4 @@ sub charFreqFilter{ } } } -} \ No newline at end of file +} diff --git a/scripts/Transliteration/corpusCreator.pl b/scripts/Transliteration/corpusCreator.pl index 4c62449df..ac67f5d74 100755 --- a/scripts/Transliteration/corpusCreator.pl +++ b/scripts/Transliteration/corpusCreator.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/Transliteration/in-decoding-transliteration.pl b/scripts/Transliteration/in-decoding-transliteration.pl index c3cc31f26..e8130db02 100755 --- a/scripts/Transliteration/in-decoding-transliteration.pl +++ b/scripts/Transliteration/in-decoding-transliteration.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/Transliteration/post-decoding-transliteration.pl b/scripts/Transliteration/post-decoding-transliteration.pl index 60c3200f6..2c7908085 100755 --- a/scripts/Transliteration/post-decoding-transliteration.pl +++ b/scripts/Transliteration/post-decoding-transliteration.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/Transliteration/prepare-transliteration-phrase-table.pl b/scripts/Transliteration/prepare-transliteration-phrase-table.pl index df3b1ceca..0a9f554c5 100755 --- a/scripts/Transliteration/prepare-transliteration-phrase-table.pl +++ b/scripts/Transliteration/prepare-transliteration-phrase-table.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/Transliteration/threshold.pl b/scripts/Transliteration/threshold.pl index bf6657742..3baa8e0a7 100755 --- a/scripts/Transliteration/threshold.pl +++ b/scripts/Transliteration/threshold.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use utf8; diff --git a/scripts/Transliteration/train-transliteration-module.pl b/scripts/Transliteration/train-transliteration-module.pl index 35e4ee396..b1d4d0ff5 100755 --- a/scripts/Transliteration/train-transliteration-module.pl +++ b/scripts/Transliteration/train-transliteration-module.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use utf8; diff --git a/scripts/analysis/bootstrap-hypothesis-difference-significance.pl b/scripts/analysis/bootstrap-hypothesis-difference-significance.pl index 8e6a6255a..9a3f63d69 100755 --- a/scripts/analysis/bootstrap-hypothesis-difference-significance.pl +++ b/scripts/analysis/bootstrap-hypothesis-difference-significance.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use utf8; ############################################### diff --git a/scripts/analysis/extract-target-trees.py b/scripts/analysis/extract-target-trees.py index 3a92fdc4d..7166211d9 100755 --- a/scripts/analysis/extract-target-trees.py +++ b/scripts/analysis/extract-target-trees.py @@ -1,9 +1,13 @@ #!/usr/bin/env python - -# Usage: extract-target-trees.py [FILE] # -# Reads moses-chart's -T output from FILE or standard input and writes trees to -# standard output in Moses' XML tree format. +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + +"""Usage: extract-target-trees.py [FILE] + +Reads moses-chart's -T output from FILE or standard input and writes trees to +standard output in Moses' XML tree format. +""" import re import sys diff --git a/scripts/analysis/nontranslated_words.pl b/scripts/analysis/nontranslated_words.pl index 51a4f9d20..7213deb76 100755 --- a/scripts/analysis/nontranslated_words.pl +++ b/scripts/analysis/nontranslated_words.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ # Reads a source and hypothesis file and counts equal tokens. Some of these diff --git a/scripts/analysis/oov.pl b/scripts/analysis/oov.pl index 052c9994d..9756887c9 100755 --- a/scripts/analysis/oov.pl +++ b/scripts/analysis/oov.pl @@ -1,6 +1,9 @@ #!/usr/bin/env perl # Display OOV rate of a test set against a training corpus or a phrase table. # Ondrej Bojar +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use strict; use warnings; diff --git a/scripts/analysis/sentence-by-sentence.pl b/scripts/analysis/sentence-by-sentence.pl index 72b70dc72..b9eb6e56d 100755 --- a/scripts/analysis/sentence-by-sentence.pl +++ b/scripts/analysis/sentence-by-sentence.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ #sentence-by-sentence: take in a system output, with any number of factors, and a reference translation, also maybe with factors, and show each sentence and its errors diff --git a/scripts/analysis/sg2dot.perl b/scripts/analysis/sg2dot.perl index e9c1639ed..5f9a5ea1d 100755 --- a/scripts/analysis/sg2dot.perl +++ b/scripts/analysis/sg2dot.perl @@ -3,6 +3,8 @@ # Author : Loic BARRAULT # Script to convert MOSES searchgraph to DOT format # +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/analysis/show-phrases-used.pl b/scripts/analysis/show-phrases-used.pl index 522e6d3ff..9428ea9b8 100755 --- a/scripts/analysis/show-phrases-used.pl +++ b/scripts/analysis/show-phrases-used.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ #show-phrases-used: display all source and target phrases for each sentence in a corpus, and give average phrase length used diff --git a/scripts/analysis/smtgui/Corpus.pm b/scripts/analysis/smtgui/Corpus.pm index f050a9f6d..2391a6c15 100644 --- a/scripts/analysis/smtgui/Corpus.pm +++ b/scripts/analysis/smtgui/Corpus.pm @@ -1,5 +1,8 @@ #package Corpus: hold a bunch of sentences in any language, with translation factors and stats about individual sentences and the corpus as a whole #Evan Herbst, 7 / 25 / 06 +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. package Corpus; BEGIN diff --git a/scripts/analysis/smtgui/filter-phrase-table.pl b/scripts/analysis/smtgui/filter-phrase-table.pl index 55f2619c0..cd0f6b91b 100755 --- a/scripts/analysis/smtgui/filter-phrase-table.pl +++ b/scripts/analysis/smtgui/filter-phrase-table.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ #by Philipp Koehn, de-augmented by Evan Herbst diff --git a/scripts/analysis/smtgui/newsmtgui.cgi b/scripts/analysis/smtgui/newsmtgui.cgi index 32ad3a948..034ee265e 100755 --- a/scripts/analysis/smtgui/newsmtgui.cgi +++ b/scripts/analysis/smtgui/newsmtgui.cgi @@ -1,4 +1,7 @@ #!/usr/bin/perl -w +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ use strict; diff --git a/scripts/analysis/suspicious_tokenization.pl b/scripts/analysis/suspicious_tokenization.pl index 3ea15154e..f807153d9 100755 --- a/scripts/analysis/suspicious_tokenization.pl +++ b/scripts/analysis/suspicious_tokenization.pl @@ -2,6 +2,9 @@ # Collects and prints all n-grams that appear in the given corpus both # tokenized as well as untokenized. # Ondrej Bojar +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use strict; use warnings; diff --git a/scripts/analysis/weight-scan-summarize.sh b/scripts/analysis/weight-scan-summarize.sh index 237182736..2fccb6470 100755 --- a/scripts/analysis/weight-scan-summarize.sh +++ b/scripts/analysis/weight-scan-summarize.sh @@ -1,4 +1,8 @@ #!/bin/bash +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + # Hackish summarization of weight-scan.pl results, heavily relies on tools by # Ondrej Bojar (bojar@ufal.mff.cuni.cz), some of which need Mercury; beware. diff --git a/scripts/analysis/weight-scan.pl b/scripts/analysis/weight-scan.pl index b33360694..b51a6bcd1 100755 --- a/scripts/analysis/weight-scan.pl +++ b/scripts/analysis/weight-scan.pl @@ -1,4 +1,8 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + # runs Moses many times changing the values of one weight, all others fixed # nbest lists are always produced to allow for comparison of real and # 'projected' BLEU (BLEU estimated from n-best lists collected at a neighouring diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl index ece110fbc..a3f5310a5 100755 --- a/scripts/ems/experiment.perl +++ b/scripts/ems/experiment.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # Experiment Management System # Documentation at http://www.statmt.org/moses/?n=FactoredTraining.EMS diff --git a/scripts/ems/fix-info.perl b/scripts/ems/fix-info.perl index abe58fe83..6659027b2 100755 --- a/scripts/ems/fix-info.perl +++ b/scripts/ems/fix-info.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/analysis.perl b/scripts/ems/support/analysis.perl index f4d5a55b4..01bb21773 100755 --- a/scripts/ems/support/analysis.perl +++ b/scripts/ems/support/analysis.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/berkeley-process.sh b/scripts/ems/support/berkeley-process.sh index e68056c96..347ebba3c 100755 --- a/scripts/ems/support/berkeley-process.sh +++ b/scripts/ems/support/berkeley-process.sh @@ -1,4 +1,7 @@ #!/bin/sh +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. if [ $# -lt 8 ] then diff --git a/scripts/ems/support/berkeley-train.sh b/scripts/ems/support/berkeley-train.sh index 96f6b648c..530cf978f 100755 --- a/scripts/ems/support/berkeley-train.sh +++ b/scripts/ems/support/berkeley-train.sh @@ -1,4 +1,7 @@ #!/bin/sh +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. if [ $# -lt 6 ] then diff --git a/scripts/ems/support/build-domain-file-from-subcorpora.perl b/scripts/ems/support/build-domain-file-from-subcorpora.perl index 085fd2629..f45b5ba2a 100755 --- a/scripts/ems/support/build-domain-file-from-subcorpora.perl +++ b/scripts/ems/support/build-domain-file-from-subcorpora.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/build-sparse-features.perl b/scripts/ems/support/build-sparse-features.perl index 79fc1e394..b134cee69 100755 --- a/scripts/ems/support/build-sparse-features.perl +++ b/scripts/ems/support/build-sparse-features.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/consolidate-training-data.perl b/scripts/ems/support/consolidate-training-data.perl index 4ab7f82cf..2a732be77 100755 --- a/scripts/ems/support/consolidate-training-data.perl +++ b/scripts/ems/support/consolidate-training-data.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id: consolidate-training-data.perl 928 2009-09-02 02:58:01Z philipp $ diff --git a/scripts/ems/support/defaultconfig.py b/scripts/ems/support/defaultconfig.py index a118e96b3..53913da08 100644 --- a/scripts/ems/support/defaultconfig.py +++ b/scripts/ems/support/defaultconfig.py @@ -1,4 +1,7 @@ #!/usr/bin/env python2 +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. """Version of ConfigParser which accepts default values.""" diff --git a/scripts/ems/support/fast-align-in-parts.perl b/scripts/ems/support/fast-align-in-parts.perl index f777d7e52..bc340a50f 100755 --- a/scripts/ems/support/fast-align-in-parts.perl +++ b/scripts/ems/support/fast-align-in-parts.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. ####################### # Revision history diff --git a/scripts/ems/support/generic-multicore-parallelizer.perl b/scripts/ems/support/generic-multicore-parallelizer.perl index 0f7910603..d821aa114 100755 --- a/scripts/ems/support/generic-multicore-parallelizer.perl +++ b/scripts/ems/support/generic-multicore-parallelizer.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/generic-parallelizer.perl b/scripts/ems/support/generic-parallelizer.perl index 811a99bde..087498ccf 100755 --- a/scripts/ems/support/generic-parallelizer.perl +++ b/scripts/ems/support/generic-parallelizer.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/input-from-sgm.perl b/scripts/ems/support/input-from-sgm.perl index 18000581a..eb6a2e3a1 100755 --- a/scripts/ems/support/input-from-sgm.perl +++ b/scripts/ems/support/input-from-sgm.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/interpolate-lm.perl b/scripts/ems/support/interpolate-lm.perl index 7d52fd877..4d9a513f6 100755 --- a/scripts/ems/support/interpolate-lm.perl +++ b/scripts/ems/support/interpolate-lm.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/lmplz-wrapper.perl b/scripts/ems/support/lmplz-wrapper.perl index df503754f..89b2847d6 100755 --- a/scripts/ems/support/lmplz-wrapper.perl +++ b/scripts/ems/support/lmplz-wrapper.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/mml-filter.perl b/scripts/ems/support/mml-filter.perl index 51bc4cda5..32bca335b 100755 --- a/scripts/ems/support/mml-filter.perl +++ b/scripts/ems/support/mml-filter.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/mml-score.perl b/scripts/ems/support/mml-score.perl index 6f7b724ea..f88021818 100755 --- a/scripts/ems/support/mml-score.perl +++ b/scripts/ems/support/mml-score.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/mml-train.perl b/scripts/ems/support/mml-train.perl index dcc998711..bdf6c1c1a 100755 --- a/scripts/ems/support/mml-train.perl +++ b/scripts/ems/support/mml-train.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/prepare-fast-align.perl b/scripts/ems/support/prepare-fast-align.perl index 80fec36b2..68b1f0189 100755 --- a/scripts/ems/support/prepare-fast-align.perl +++ b/scripts/ems/support/prepare-fast-align.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/reference-from-sgm.perl b/scripts/ems/support/reference-from-sgm.perl index ebb9ae4ae..b8e1d108d 100755 --- a/scripts/ems/support/reference-from-sgm.perl +++ b/scripts/ems/support/reference-from-sgm.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/remove-segmentation-markup.perl b/scripts/ems/support/remove-segmentation-markup.perl index a0bd61fff..3b02bceaf 100755 --- a/scripts/ems/support/remove-segmentation-markup.perl +++ b/scripts/ems/support/remove-segmentation-markup.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/report-experiment-scores.perl b/scripts/ems/support/report-experiment-scores.perl index b649951ce..c859508cb 100755 --- a/scripts/ems/support/report-experiment-scores.perl +++ b/scripts/ems/support/report-experiment-scores.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id: report-experiment-scores.perl 407 2008-11-10 14:43:31Z philipp $ diff --git a/scripts/ems/support/run-command-on-multiple-refsets.perl b/scripts/ems/support/run-command-on-multiple-refsets.perl index 1e914b44b..41823b4ee 100755 --- a/scripts/ems/support/run-command-on-multiple-refsets.perl +++ b/scripts/ems/support/run-command-on-multiple-refsets.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/run-wade.perl b/scripts/ems/support/run-wade.perl index 175948b98..dfdb8e59d 100755 --- a/scripts/ems/support/run-wade.perl +++ b/scripts/ems/support/run-wade.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/split-sentences.perl b/scripts/ems/support/split-sentences.perl index 02a1e2315..f72767054 100755 --- a/scripts/ems/support/split-sentences.perl +++ b/scripts/ems/support/split-sentences.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # Based on Preprocessor written by Philipp Koehn diff --git a/scripts/ems/support/submit-grid.perl b/scripts/ems/support/submit-grid.perl index a0967f9a5..ff43cd123 100755 --- a/scripts/ems/support/submit-grid.perl +++ b/scripts/ems/support/submit-grid.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/substitute-filtered-tables-and-weights.perl b/scripts/ems/support/substitute-filtered-tables-and-weights.perl index 13be52c6b..2e6908ab4 100755 --- a/scripts/ems/support/substitute-filtered-tables-and-weights.perl +++ b/scripts/ems/support/substitute-filtered-tables-and-weights.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/substitute-filtered-tables.perl b/scripts/ems/support/substitute-filtered-tables.perl index c5ebabded..548982592 100755 --- a/scripts/ems/support/substitute-filtered-tables.perl +++ b/scripts/ems/support/substitute-filtered-tables.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; diff --git a/scripts/ems/support/substitute-weights.perl b/scripts/ems/support/substitute-weights.perl index b692f3f85..efa9338ca 100755 --- a/scripts/ems/support/substitute-weights.perl +++ b/scripts/ems/support/substitute-weights.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; diff --git a/scripts/ems/support/symmetrize-fast-align.perl b/scripts/ems/support/symmetrize-fast-align.perl index 9f7fec248..4ed3e087d 100755 --- a/scripts/ems/support/symmetrize-fast-align.perl +++ b/scripts/ems/support/symmetrize-fast-align.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/thot-lm-wrapper.perl b/scripts/ems/support/thot-lm-wrapper.perl index 59d483e65..ffbcb50e2 100755 --- a/scripts/ems/support/thot-lm-wrapper.perl +++ b/scripts/ems/support/thot-lm-wrapper.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/tree-converter-wrapper.perl b/scripts/ems/support/tree-converter-wrapper.perl index aae55991a..ae7e2c5a6 100755 --- a/scripts/ems/support/tree-converter-wrapper.perl +++ b/scripts/ems/support/tree-converter-wrapper.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/support/wrap-xml.perl b/scripts/ems/support/wrap-xml.perl index 52190309a..09ea2a2f8 100755 --- a/scripts/ems/support/wrap-xml.perl +++ b/scripts/ems/support/wrap-xml.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/ems/web/analysis.php b/scripts/ems/web/analysis.php index 57776dd22..5e5f707f6 100644 --- a/scripts/ems/web/analysis.php +++ b/scripts/ems/web/analysis.php @@ -1,5 +1,10 @@ Search Graph Visualization, Sentence <?php $sentence ?> diff --git a/scripts/fuzzy-match/create_xml.perl b/scripts/fuzzy-match/create_xml.perl index 4ab281eae..97025d62a 100755 --- a/scripts/fuzzy-match/create_xml.perl +++ b/scripts/fuzzy-match/create_xml.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. binmode( STDIN, ":utf8" ); binmode( STDOUT, ":utf8" ); diff --git a/scripts/generic/bsbleu.py b/scripts/generic/bsbleu.py index 12d2201de..296900b18 100755 --- a/scripts/generic/bsbleu.py +++ b/scripts/generic/bsbleu.py @@ -1,6 +1,9 @@ #!/usr/bin/env python # compute Bleu scores with confidence intervals via boostrap resampling # written by Ulrich Germann +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. from argparse import ArgumentParser import math diff --git a/scripts/generic/compound-splitter.perl b/scripts/generic/compound-splitter.perl index b39d4d660..2ece80a60 100755 --- a/scripts/generic/compound-splitter.perl +++ b/scripts/generic/compound-splitter.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/generic/extract-factors.pl b/scripts/generic/extract-factors.pl index 38cf97bd4..2b1c51cd1 100755 --- a/scripts/generic/extract-factors.pl +++ b/scripts/generic/extract-factors.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ #extract-factors.pl: extract only the desired factors from a factored corpus diff --git a/scripts/generic/extract-parallel.perl b/scripts/generic/extract-parallel.perl index be30ff652..3240f24eb 100755 --- a/scripts/generic/extract-parallel.perl +++ b/scripts/generic/extract-parallel.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # example # ./extract-parallel.perl 8 ./coreutils-8.9/src/split "./coreutils-8.9/src/sort --batch-size=253" ./extract ./corpus.5.en ./corpus.5.ar ./align.ar-en.grow-diag-final-and ./extracted 7 --NoFileLimit orientation --GZOutput diff --git a/scripts/generic/fsa2fsal.pl b/scripts/generic/fsa2fsal.pl index 7dc7751ee..28ec28a26 100755 --- a/scripts/generic/fsa2fsal.pl +++ b/scripts/generic/fsa2fsal.pl @@ -4,6 +4,9 @@ # ' ' to delimit nodes (i.e. original lines). # Some rudimentary sanity checks are done on the fly. # Ondrej Bojar, bojar@ufal.mff.cuni.cz +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/generic/fsa2plf.pl b/scripts/generic/fsa2plf.pl index 07c8a4cc1..4b9474d5a 100755 --- a/scripts/generic/fsa2plf.pl +++ b/scripts/generic/fsa2plf.pl @@ -7,6 +7,9 @@ # final nodes. # Note that the output format may not contain any spaces. # Ondrej Bojar, bojar@ufal.mff.cuni.cz +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/generic/fsal2fsa.pl b/scripts/generic/fsal2fsa.pl index a21305dad..158dab5b3 100755 --- a/scripts/generic/fsal2fsa.pl +++ b/scripts/generic/fsal2fsa.pl @@ -1,6 +1,9 @@ #!/usr/bin/env perl # A very simple script that converts fsal back to fsa format (openfst lattices) # Ondrej Bojar, bojar@ufal.mff.cuni.cz +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/generic/generic-parallel.perl b/scripts/generic/generic-parallel.perl index a9bc73d85..07f6a210a 100755 --- a/scripts/generic/generic-parallel.perl +++ b/scripts/generic/generic-parallel.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/generic/giza-parallel.perl b/scripts/generic/giza-parallel.perl index 9a6516a8f..a9921a992 100755 --- a/scripts/generic/giza-parallel.perl +++ b/scripts/generic/giza-parallel.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # example # ~/giza-parallel.perl 10 split ~/workspace/sourceforge/trunk/scripts/training/train-model.perl ar en train align diff --git a/scripts/generic/lopar2pos.pl b/scripts/generic/lopar2pos.pl index 2b9245e0f..fc2c35c7f 100755 --- a/scripts/generic/lopar2pos.pl +++ b/scripts/generic/lopar2pos.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ #lopar2pos: extract POSs from LOPAR output diff --git a/scripts/generic/moses-parallel.pl b/scripts/generic/moses-parallel.pl index eb51daa98..144b7d6b2 100755 --- a/scripts/generic/moses-parallel.pl +++ b/scripts/generic/moses-parallel.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ ####################### diff --git a/scripts/generic/moses_sim_pe.py b/scripts/generic/moses_sim_pe.py index 32f785961..3497ca558 100755 --- a/scripts/generic/moses_sim_pe.py +++ b/scripts/generic/moses_sim_pe.py @@ -1,20 +1,25 @@ #!/usr/bin/env python # Written by Michael Denkowski +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. -# This script parallelizes decoding with simulated post-editing via moses XML -# input (XML entities need to be escaped in tokenization). Memory mapped -# dynamic phrase tables (Ulrich Germann, -# www.statmt.org/moses/?n=Moses.AdvancedFeatures#ntoc40) and language models -# (Kenneth Heafield, -# http://www.statmt.org/moses/?n=FactoredTraining.BuildingLanguageModel#ntoc19) -# facilitate memory efficient multi process decoding. Input is divided into -# batches, each of which is decoded sequentially. Each batch pre-loads the -# data from previous batches. +"""Parallelize decoding with simulated post-editing via moses XML input. -# To use in tuning, run mert-moses.pl with --sim-pe=SYMAL where SYMAL is the -# alignment from input to references. Specify the number of jobs with -# --decoder-flags="-threads N". +(XML entities need to be escaped in tokenization). Memory mapped +dynamic phrase tables (Ulrich Germann, +www.statmt.org/moses/?n=Moses.AdvancedFeatures#ntoc40) and language models +(Kenneth Heafield, +http://www.statmt.org/moses/?n=FactoredTraining.BuildingLanguageModel#ntoc19) +facilitate memory efficient multi process decoding. Input is divided into +batches, each of which is decoded sequentially. Each batch pre-loads the +data from previous batches. + +To use in tuning, run mert-moses.pl with --sim-pe=SYMAL where SYMAL is the +alignment from input to references. Specify the number of jobs with +--decoder-flags="-threads N". +""" import gzip import itertools diff --git a/scripts/generic/mteval-v12.pl b/scripts/generic/mteval-v12.pl index 2666c8012..b4dfbf83a 100755 --- a/scripts/generic/mteval-v12.pl +++ b/scripts/generic/mteval-v12.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/generic/mteval-v13a.pl b/scripts/generic/mteval-v13a.pl index 41a88800a..bdc2d9479 100755 --- a/scripts/generic/mteval-v13a.pl +++ b/scripts/generic/mteval-v13a.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/generic/multi-bleu.perl b/scripts/generic/multi-bleu.perl index 344f58c6f..61de10d45 100755 --- a/scripts/generic/multi-bleu.perl +++ b/scripts/generic/multi-bleu.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ use warnings; diff --git a/scripts/generic/ph_numbers.perl b/scripts/generic/ph_numbers.perl index 612263249..f0ae1f851 100755 --- a/scripts/generic/ph_numbers.perl +++ b/scripts/generic/ph_numbers.perl @@ -6,6 +6,9 @@ package ph_numbers; # and decoder input # # (c) 2013 TAUS +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/generic/qsub-wrapper.pl b/scripts/generic/qsub-wrapper.pl index ac3d0900a..ef9938e07 100755 --- a/scripts/generic/qsub-wrapper.pl +++ b/scripts/generic/qsub-wrapper.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ use warnings; diff --git a/scripts/generic/reverse-alignment.perl b/scripts/generic/reverse-alignment.perl index 681b3221e..f01acf5b0 100755 --- a/scripts/generic/reverse-alignment.perl +++ b/scripts/generic/reverse-alignment.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/generic/score-parallel.perl b/scripts/generic/score-parallel.perl index 81bc6f7d0..625b449c0 100755 --- a/scripts/generic/score-parallel.perl +++ b/scripts/generic/score-parallel.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # example # ./score-parallel.perl 8 "gsort --batch-size=253" ./score ./extract.2.sorted.gz ./lex.2.f2e ./phrase-table.2.half.f2e --GoodTuring ./phrase-table.2.coc 0 diff --git a/scripts/generic/strip-xml.perl b/scripts/generic/strip-xml.perl index c993421f0..a5dbbaa37 100755 --- a/scripts/generic/strip-xml.perl +++ b/scripts/generic/strip-xml.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/generic/trainlm-irst2.perl b/scripts/generic/trainlm-irst2.perl index f664e96ee..8af372fac 100755 --- a/scripts/generic/trainlm-irst2.perl +++ b/scripts/generic/trainlm-irst2.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # Compatible with sri LM-creating script, eg. # ngram-count -order 5 -interpolate -wbdiscount -unk -text corpus.txt -lm lm.txt diff --git a/scripts/other/beautify.py b/scripts/other/beautify.py index 0caa6b162..56df24bc8 100755 --- a/scripts/other/beautify.py +++ b/scripts/other/beautify.py @@ -1,4 +1,9 @@ #! /usr/bin/env python +# +# Originally written in 2015 by Jeroen Vermeulen (Precision Translation Tools). +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. """Reformat project source code, and/or check for style errors ("lint"). diff --git a/scripts/other/convert-pt.perl b/scripts/other/convert-pt.perl index e087126f1..60c8cbdb2 100755 --- a/scripts/other/convert-pt.perl +++ b/scripts/other/convert-pt.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ # convert a phrase-table with alignment in Moses' dead-end format diff --git a/scripts/other/delete-scores.perl b/scripts/other/delete-scores.perl index ffb788867..ebaf277fa 100755 --- a/scripts/other/delete-scores.perl +++ b/scripts/other/delete-scores.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/other/gacha_filter.py b/scripts/other/gacha_filter.py index 0deb45761..af5921d41 100644 --- a/scripts/other/gacha_filter.py +++ b/scripts/other/gacha_filter.py @@ -1,4 +1,7 @@ #!/usr/bin/env python3 -*- coding: utf-8 -*- +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. """ The Gacha filter cleans out sentence pairs that have global character mean diff --git a/scripts/other/get_many_translations_from_google.perl b/scripts/other/get_many_translations_from_google.perl index 0b1436c20..ac2933296 100755 --- a/scripts/other/get_many_translations_from_google.perl +++ b/scripts/other/get_many_translations_from_google.perl @@ -5,6 +5,9 @@ # Expects one sentence per line, not tokenized! # # Ondrej Bojar, bojar@ufal.mff.cuni.cz +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/other/retain-lines.perl b/scripts/other/retain-lines.perl index f04a8ebad..c789f96c7 100755 --- a/scripts/other/retain-lines.perl +++ b/scripts/other/retain-lines.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. #retain lines in clean.lines-retained.1 use strict; diff --git a/scripts/other/translate_by_microsoft_bing.perl b/scripts/other/translate_by_microsoft_bing.perl index c9b1b31de..d4222878e 100755 --- a/scripts/other/translate_by_microsoft_bing.perl +++ b/scripts/other/translate_by_microsoft_bing.perl @@ -2,6 +2,9 @@ # Script implemented by Pranava Swaroop Madhyastha (a student at Charles # University, UFAL) +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use strict; use warnings; diff --git a/scripts/recaser/detruecase.perl b/scripts/recaser/detruecase.perl index b882852a0..66ca24fa2 100755 --- a/scripts/recaser/detruecase.perl +++ b/scripts/recaser/detruecase.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/recaser/recase.perl b/scripts/recaser/recase.perl index 52cec36ea..b951ca764 100755 --- a/scripts/recaser/recase.perl +++ b/scripts/recaser/recase.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ use warnings; diff --git a/scripts/recaser/train-recaser.perl b/scripts/recaser/train-recaser.perl index dce388bca..cb3388c38 100755 --- a/scripts/recaser/train-recaser.perl +++ b/scripts/recaser/train-recaser.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ use warnings; diff --git a/scripts/recaser/train-truecaser.perl b/scripts/recaser/train-truecaser.perl index 753183324..7f8909082 100755 --- a/scripts/recaser/train-truecaser.perl +++ b/scripts/recaser/train-truecaser.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $ diff --git a/scripts/recaser/truecase.perl b/scripts/recaser/truecase.perl index 544b79c47..aab185ce9 100755 --- a/scripts/recaser/truecase.perl +++ b/scripts/recaser/truecase.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $ diff --git a/scripts/regression-testing/MosesScriptsRegressionTesting.pm b/scripts/regression-testing/MosesScriptsRegressionTesting.pm index d8b0590c8..acc134d70 100644 --- a/scripts/regression-testing/MosesScriptsRegressionTesting.pm +++ b/scripts/regression-testing/MosesScriptsRegressionTesting.pm @@ -1,3 +1,6 @@ +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + package MosesScriptsRegressionTesting; use strict; diff --git a/scripts/regression-testing/compare-results.pl b/scripts/regression-testing/compare-results.pl index 572431951..8f1461cec 100755 --- a/scripts/regression-testing/compare-results.pl +++ b/scripts/regression-testing/compare-results.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/regression-testing/create_localized_moses_ini.pl b/scripts/regression-testing/create_localized_moses_ini.pl index 1d03e5ab8..3e2b6f37f 100755 --- a/scripts/regression-testing/create_localized_moses_ini.pl +++ b/scripts/regression-testing/create_localized_moses_ini.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/regression-testing/modify-pars.pl b/scripts/regression-testing/modify-pars.pl index de2df2919..7726af9e6 100755 --- a/scripts/regression-testing/modify-pars.pl +++ b/scripts/regression-testing/modify-pars.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/regression-testing/moses-virtual.pl b/scripts/regression-testing/moses-virtual.pl index 3af3c79e4..3b23b525a 100755 --- a/scripts/regression-testing/moses-virtual.pl +++ b/scripts/regression-testing/moses-virtual.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/regression-testing/run-single-test.pl b/scripts/regression-testing/run-single-test.pl index e8307da36..037de8285 100755 --- a/scripts/regression-testing/run-single-test.pl +++ b/scripts/regression-testing/run-single-test.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/regression-testing/run-test-suite.pl b/scripts/regression-testing/run-test-suite.pl index b384f8b98..a12938e61 100755 --- a/scripts/regression-testing/run-test-suite.pl +++ b/scripts/regression-testing/run-test-suite.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/server/moses.py b/scripts/server/moses.py index 7cf152187..e825ab39e 100644 --- a/scripts/server/moses.py +++ b/scripts/server/moses.py @@ -1,5 +1,8 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. """ Python utilities for moses diff --git a/scripts/server/sim-pe.py b/scripts/server/sim-pe.py index 5f1407524..6f76bf46d 100755 --- a/scripts/server/sim-pe.py +++ b/scripts/server/sim-pe.py @@ -2,8 +2,14 @@ # -*- coding: utf-8 -*- # Written by Ulrich Germann on the basis of contrib/server/client.py. -# This script simulates post-editing of MT output and incrementally -# updates the dynamic phrase tables in the moses server. +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + +"""Simulate post-editing of MT output. + +Incrementally updates the dynamic phrase tables in the moses server. +""" import argparse import os diff --git a/scripts/tokenizer/deescape-special-chars-PTB.perl b/scripts/tokenizer/deescape-special-chars-PTB.perl index f9601924f..ad2529b21 100755 --- a/scripts/tokenizer/deescape-special-chars-PTB.perl +++ b/scripts/tokenizer/deescape-special-chars-PTB.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/tokenizer/deescape-special-chars.perl b/scripts/tokenizer/deescape-special-chars.perl index 002955e62..b9d1ad74c 100755 --- a/scripts/tokenizer/deescape-special-chars.perl +++ b/scripts/tokenizer/deescape-special-chars.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/tokenizer/detokenizer.perl b/scripts/tokenizer/detokenizer.perl index 3a92bd024..881b93dd1 100755 --- a/scripts/tokenizer/detokenizer.perl +++ b/scripts/tokenizer/detokenizer.perl @@ -4,6 +4,9 @@ # Sample De-Tokenizer # written by Josh Schroeder, based on code by Philipp Koehn # further modifications by Ondrej Bojar +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. binmode(STDIN, ":utf8"); binmode(STDOUT, ":utf8"); diff --git a/scripts/tokenizer/escape-special-chars.perl b/scripts/tokenizer/escape-special-chars.perl index fbbbae292..143e85490 100755 --- a/scripts/tokenizer/escape-special-chars.perl +++ b/scripts/tokenizer/escape-special-chars.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/tokenizer/lowercase.perl b/scripts/tokenizer/lowercase.perl index e5c41bbed..bc75e5e5c 100755 --- a/scripts/tokenizer/lowercase.perl +++ b/scripts/tokenizer/lowercase.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/tokenizer/normalize-punctuation.perl b/scripts/tokenizer/normalize-punctuation.perl index 13e9fd3fc..7dab7543a 100755 --- a/scripts/tokenizer/normalize-punctuation.perl +++ b/scripts/tokenizer/normalize-punctuation.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/tokenizer/pre-tok-clean.perl b/scripts/tokenizer/pre-tok-clean.perl index 900e992ee..064f7b187 100755 --- a/scripts/tokenizer/pre-tok-clean.perl +++ b/scripts/tokenizer/pre-tok-clean.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use strict; diff --git a/scripts/tokenizer/pre-tokenizer.perl b/scripts/tokenizer/pre-tokenizer.perl index 514d8da8d..541ce77fb 100755 --- a/scripts/tokenizer/pre-tokenizer.perl +++ b/scripts/tokenizer/pre-tokenizer.perl @@ -3,6 +3,9 @@ # script for preprocessing language data prior to tokenization # Start by Ulrich Germann, after noticing systematic preprocessing errors # in some of the English Europarl data. +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/tokenizer/pre_tokenize_cleaning.py b/scripts/tokenizer/pre_tokenize_cleaning.py index 096a45dc4..c03af8f66 100644 --- a/scripts/tokenizer/pre_tokenize_cleaning.py +++ b/scripts/tokenizer/pre_tokenize_cleaning.py @@ -1,4 +1,7 @@ #!/usr/bin/env python -*- coding: utf-8 -*- +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. """ The Gacha filter cleans out sentence pairs that have global character mean diff --git a/scripts/tokenizer/remove-non-printing-char.perl b/scripts/tokenizer/remove-non-printing-char.perl index 9125b7691..92f6ade16 100755 --- a/scripts/tokenizer/remove-non-printing-char.perl +++ b/scripts/tokenizer/remove-non-printing-char.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use utf8; diff --git a/scripts/tokenizer/replace-unicode-punctuation.perl b/scripts/tokenizer/replace-unicode-punctuation.perl index cda69ddf7..c2c7088d6 100755 --- a/scripts/tokenizer/replace-unicode-punctuation.perl +++ b/scripts/tokenizer/replace-unicode-punctuation.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/tokenizer/tokenizer.perl b/scripts/tokenizer/tokenizer.perl index a5d4fadd3..e08bac941 100755 --- a/scripts/tokenizer/tokenizer.perl +++ b/scripts/tokenizer/tokenizer.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; diff --git a/scripts/tokenizer/tokenizer_PTB.perl b/scripts/tokenizer/tokenizer_PTB.perl index 6fff8d7f7..46b14775c 100755 --- a/scripts/tokenizer/tokenizer_PTB.perl +++ b/scripts/tokenizer/tokenizer_PTB.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # Sample Tokenizer ### Version 1.1 diff --git a/scripts/training/LexicalTranslationModel.pm b/scripts/training/LexicalTranslationModel.pm index c5dad60fb..3adc45f5e 100644 --- a/scripts/training/LexicalTranslationModel.pm +++ b/scripts/training/LexicalTranslationModel.pm @@ -1,3 +1,6 @@ +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + package LexicalTranslationModel; use strict; diff --git a/scripts/training/absolutize_moses_model.pl b/scripts/training/absolutize_moses_model.pl index bb7085895..27eccd8c7 100755 --- a/scripts/training/absolutize_moses_model.pl +++ b/scripts/training/absolutize_moses_model.pl @@ -5,6 +5,9 @@ # paths with absolute paths. # # Ondrej Bojar. +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; diff --git a/scripts/training/analyse_moses_model.pl b/scripts/training/analyse_moses_model.pl index 656f4a59b..7a5c2e701 100755 --- a/scripts/training/analyse_moses_model.pl +++ b/scripts/training/analyse_moses_model.pl @@ -4,6 +4,9 @@ # given a moses.ini file, checks the translation and generation tables and reports # statistics on ambiguity # Ondrej Bojar +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/bilingual-lm/averageNullEmbedding.py b/scripts/training/bilingual-lm/averageNullEmbedding.py index 891595aff..54c9a1bc4 100755 --- a/scripts/training/bilingual-lm/averageNullEmbedding.py +++ b/scripts/training/bilingual-lm/averageNullEmbedding.py @@ -1,4 +1,7 @@ #!/usr/bin/env python2 +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. import sys import numpy import argparse diff --git a/scripts/training/bilingual-lm/extract.py b/scripts/training/bilingual-lm/extract.py index f620edb5d..876fba9ee 100755 --- a/scripts/training/bilingual-lm/extract.py +++ b/scripts/training/bilingual-lm/extract.py @@ -1,4 +1,7 @@ #!/usr/bin/env python +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. from collections import Counter import logging diff --git a/scripts/training/bilingual-lm/extract_test.py b/scripts/training/bilingual-lm/extract_test.py index 3c9a03b85..8cade1e04 100755 --- a/scripts/training/bilingual-lm/extract_test.py +++ b/scripts/training/bilingual-lm/extract_test.py @@ -1,4 +1,7 @@ #!/usr/bin/env python +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. """Create a test corpus, using a previously pruned vocabulary.""" diff --git a/scripts/training/bilingual-lm/extract_training.py b/scripts/training/bilingual-lm/extract_training.py index bd3538188..e39a70318 100755 --- a/scripts/training/bilingual-lm/extract_training.py +++ b/scripts/training/bilingual-lm/extract_training.py @@ -1,4 +1,7 @@ #!/usr/bin/env python +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. from collections import Counter import logging diff --git a/scripts/training/bilingual-lm/reduce_ngrams.py b/scripts/training/bilingual-lm/reduce_ngrams.py index 3442fb302..4db41378d 100755 --- a/scripts/training/bilingual-lm/reduce_ngrams.py +++ b/scripts/training/bilingual-lm/reduce_ngrams.py @@ -1,6 +1,9 @@ #!/usr/bin/env python3 +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. -"""Reduces an ngrams file for training nplm to a smaller version of it. +"""Reduce an ngrams file for training nplm to a smaller version of it. The smaller version will have fewer ngrams. """ diff --git a/scripts/training/bilingual-lm/test_nplm.py b/scripts/training/bilingual-lm/test_nplm.py index 737266bc3..3a59fd344 100755 --- a/scripts/training/bilingual-lm/test_nplm.py +++ b/scripts/training/bilingual-lm/test_nplm.py @@ -1,4 +1,7 @@ #!/usr/bin/env python +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. import logging import optparse diff --git a/scripts/training/bilingual-lm/train_nplm.py b/scripts/training/bilingual-lm/train_nplm.py index 7bc74429e..cb5980a91 100755 --- a/scripts/training/bilingual-lm/train_nplm.py +++ b/scripts/training/bilingual-lm/train_nplm.py @@ -1,4 +1,7 @@ #!/usr/bin/env python +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. from __future__ import print_function, unicode_literals diff --git a/scripts/training/binarize-model.perl b/scripts/training/binarize-model.perl index 0239f5fc8..0131d2222 100755 --- a/scripts/training/binarize-model.perl +++ b/scripts/training/binarize-model.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # # Binarize a Moses model diff --git a/scripts/training/build-generation-table.perl b/scripts/training/build-generation-table.perl index 435f7f58e..14176908a 100755 --- a/scripts/training/build-generation-table.perl +++ b/scripts/training/build-generation-table.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ use warnings; diff --git a/scripts/training/build-mmsapt.perl b/scripts/training/build-mmsapt.perl index 00cbd09d6..d0c5b818e 100755 --- a/scripts/training/build-mmsapt.perl +++ b/scripts/training/build-mmsapt.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/clean-corpus-n.perl b/scripts/training/clean-corpus-n.perl index cee4c76a2..76a09e539 100755 --- a/scripts/training/clean-corpus-n.perl +++ b/scripts/training/clean-corpus-n.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id: clean-corpus-n.perl 3633 2010-10-21 09:49:27Z phkoehn $ use warnings; diff --git a/scripts/training/clone_moses_model.pl b/scripts/training/clone_moses_model.pl index bf6708fca..18dc4aa41 100755 --- a/scripts/training/clone_moses_model.pl +++ b/scripts/training/clone_moses_model.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ # given a moses.ini file, creates a fresh version of it diff --git a/scripts/training/combine_factors.pl b/scripts/training/combine_factors.pl index fa6f15db2..fcc9ab3f5 100755 --- a/scripts/training/combine_factors.pl +++ b/scripts/training/combine_factors.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ # given a list of files, combines them to a single corpus (sent to stdout) diff --git a/scripts/training/convert-moses-ini-to-v2.perl b/scripts/training/convert-moses-ini-to-v2.perl index e091a710d..3fdfa53a6 100755 --- a/scripts/training/convert-moses-ini-to-v2.perl +++ b/scripts/training/convert-moses-ini-to-v2.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/convert-moses-ini-v2-to-v1.py b/scripts/training/convert-moses-ini-v2-to-v1.py index 44f192efe..3ef7d7c0d 100755 --- a/scripts/training/convert-moses-ini-v2-to-v1.py +++ b/scripts/training/convert-moses-ini-v2-to-v1.py @@ -1,5 +1,8 @@ #! /usr/bin/env python # -*- coding: utf8 -*- +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 3 or, at your option, any later version. from __future__ import ( diff --git a/scripts/training/corpus-sizes.perl b/scripts/training/corpus-sizes.perl index 30ae67ebb..1a6db669b 100755 --- a/scripts/training/corpus-sizes.perl +++ b/scripts/training/corpus-sizes.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id: consolidate-training-data.perl 928 2009-09-02 02:58:01Z philipp $ diff --git a/scripts/training/create_count_tables.py b/scripts/training/create_count_tables.py index 2288c034a..12499b1d7 100755 --- a/scripts/training/create_count_tables.py +++ b/scripts/training/create_count_tables.py @@ -1,6 +1,9 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # Author: Rico Sennrich +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # This script creates tables that store phrase pair frequencies rather than # probabilities. diff --git a/scripts/training/exodus.perl b/scripts/training/exodus.perl index bb8616007..579056ff0 100755 --- a/scripts/training/exodus.perl +++ b/scripts/training/exodus.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ diff --git a/scripts/training/filter-model-given-input.pl b/scripts/training/filter-model-given-input.pl index e3a34c40b..a44d9c193 100755 --- a/scripts/training/filter-model-given-input.pl +++ b/scripts/training/filter-model-given-input.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ # Given a moses.ini file and an input text prepare minimized translation diff --git a/scripts/training/filter-rule-table.py b/scripts/training/filter-rule-table.py index 14736fe1f..d28fa0c89 100755 --- a/scripts/training/filter-rule-table.py +++ b/scripts/training/filter-rule-table.py @@ -1,25 +1,29 @@ #!/usr/bin/env python # Author: Phil Williams +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. -# Usage: filter-rule-table.py [--min-non-initial-rule-count=N] INPUT -# -# Given a rule table (on stdin) and an input text, filter out rules that -# couldn't be used in parsing the input and write the resulting rule table -# to stdout. The input text is assumed to contain the same factors as -# the rule table and is assumed to be small (not more than a few thousand -# sentences): the current algorithm won't scale well to large input sets. -# -# The filtering algorithm considers a source RHS to be a sequence of -# words and gaps, which must match a sequence of words in one of the -# input sentences, with at least one input word per gap. The NT labels -# are ignored, so for example a rule with the source RHS "the JJ dog" -# would be allowed if the sequence "the slobbering dog" occurs in one of -# the input sentences, even if there's no rule to derive a JJ from -# "slobbering." (If "slobbering" were an unknown word, the 'unknown-lhs' -# decoder option would allow it to take a number of NT labels, likely -# including JJ, with varying probabilities, so removing the rule would -# be a bad idea.) +"""Usage: filter-rule-table.py [--min-non-initial-rule-count=N] INPUT + +Given a rule table (on stdin) and an input text, filter out rules that +couldn't be used in parsing the input and write the resulting rule table +to stdout. The input text is assumed to contain the same factors as +the rule table and is assumed to be small (not more than a few thousand +sentences): the current algorithm won't scale well to large input sets. + +The filtering algorithm considers a source RHS to be a sequence of +words and gaps, which must match a sequence of words in one of the +input sentences, with at least one input word per gap. The NT labels +are ignored, so for example a rule with the source RHS "the JJ dog" +would be allowed if the sequence "the slobbering dog" occurs in one of +the input sentences, even if there's no rule to derive a JJ from +"slobbering." (If "slobbering" were an unknown word, the 'unknown-lhs' +decoder option would allow it to take a number of NT labels, likely +including JJ, with varying probabilities, so removing the rule would +be a bad idea.) +""" import optparse import sys diff --git a/scripts/training/flexibility_score.py b/scripts/training/flexibility_score.py index 496184616..56d4f9425 100755 --- a/scripts/training/flexibility_score.py +++ b/scripts/training/flexibility_score.py @@ -2,6 +2,9 @@ # -*- coding: utf-8 -*- # author: Rico Sennrich +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. """Add flexibility scores to a phrase table half. diff --git a/scripts/training/giza2bal.pl b/scripts/training/giza2bal.pl index 27ba9d659..ad9edb584 100755 --- a/scripts/training/giza2bal.pl +++ b/scripts/training/giza2bal.pl @@ -6,6 +6,9 @@ #produced by giza containing the frequency of each traning sentence. #Copyright Marcello Federico, November 2004 +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. #use warnings; diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl index 92e1a79ff..c73e75a87 100755 --- a/scripts/training/mert-moses.pl +++ b/scripts/training/mert-moses.pl @@ -1,4 +1,8 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + # $Id$ # Usage: # mert-moses.pl diff --git a/scripts/training/postprocess-lopar.perl b/scripts/training/postprocess-lopar.perl index 44be9c26c..05a56a3b5 100755 --- a/scripts/training/postprocess-lopar.perl +++ b/scripts/training/postprocess-lopar.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ diff --git a/scripts/training/rdlm/average_null_embedding.py b/scripts/training/rdlm/average_null_embedding.py index 28abc9508..899b402c1 100755 --- a/scripts/training/rdlm/average_null_embedding.py +++ b/scripts/training/rdlm/average_null_embedding.py @@ -1,6 +1,9 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # Author: Rico Sennrich +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. """Average embeddings of special null words for RDLM. diff --git a/scripts/training/rdlm/extract_syntactic_ngrams.py b/scripts/training/rdlm/extract_syntactic_ngrams.py index 1292e90f2..be4ed2335 100755 --- a/scripts/training/rdlm/extract_syntactic_ngrams.py +++ b/scripts/training/rdlm/extract_syntactic_ngrams.py @@ -1,6 +1,9 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # Author: Rico Sennrich +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. """ Extract syntactic n-grams from dependency treebank in Moses XML format for diff --git a/scripts/training/rdlm/extract_vocab.py b/scripts/training/rdlm/extract_vocab.py index ed9266fd9..48e5215c3 100755 --- a/scripts/training/rdlm/extract_vocab.py +++ b/scripts/training/rdlm/extract_vocab.py @@ -1,6 +1,9 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # Author: Rico Sennrich +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # extract 5 vocabulary files from parsed corpus in moses XML format diff --git a/scripts/training/rdlm/train_rdlm.py b/scripts/training/rdlm/train_rdlm.py index 639c1b32c..a7edbab36 100755 --- a/scripts/training/rdlm/train_rdlm.py +++ b/scripts/training/rdlm/train_rdlm.py @@ -1,5 +1,8 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. from __future__ import print_function, unicode_literals diff --git a/scripts/training/reduce-factors.perl b/scripts/training/reduce-factors.perl index 09f9c7f2b..82aed4355 100755 --- a/scripts/training/reduce-factors.perl +++ b/scripts/training/reduce-factors.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/reduce-topt-count.pl b/scripts/training/reduce-topt-count.pl index f760051c4..85ce0d6d9 100755 --- a/scripts/training/reduce-topt-count.pl +++ b/scripts/training/reduce-topt-count.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # given a moses.ini, filter the phrase tables to contain # only ttable-limit options per source phrase diff --git a/scripts/training/reduce_combine.pl b/scripts/training/reduce_combine.pl index a7614f73e..2055bed5b 100755 --- a/scripts/training/reduce_combine.pl +++ b/scripts/training/reduce_combine.pl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # $Id$ # given a pathname to a factored corpus, a list of (numeric) factors to keep diff --git a/scripts/training/remove-orphan-phrase-pairs-from-reordering-table.perl b/scripts/training/remove-orphan-phrase-pairs-from-reordering-table.perl index eda529393..25c5cc028 100755 --- a/scripts/training/remove-orphan-phrase-pairs-from-reordering-table.perl +++ b/scripts/training/remove-orphan-phrase-pairs-from-reordering-table.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/threshold-filter.perl b/scripts/training/threshold-filter.perl index 3e42ca795..0aed67d25 100755 --- a/scripts/training/threshold-filter.perl +++ b/scripts/training/threshold-filter.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/train-global-lexicon-model.perl b/scripts/training/train-global-lexicon-model.perl index d3c55789d..528bfbd72 100755 --- a/scripts/training/train-global-lexicon-model.perl +++ b/scripts/training/train-global-lexicon-model.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl index 5a304c2f9..b693d774d 100755 --- a/scripts/training/train-model.perl +++ b/scripts/training/train-model.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/train-neurallm.py b/scripts/training/train-neurallm.py index 2d2f12015..fec859611 100755 --- a/scripts/training/train-neurallm.py +++ b/scripts/training/train-neurallm.py @@ -1,8 +1,12 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. -""" train feed-forward neural network LM with NPLM tool -resulting model can be used in Moses as feature function NeuralLM +"""Train feed-forward neural network LM with NPLM tool. + +The resulting model can be used in Moses as feature function NeuralLM. """ from __future__ import print_function, unicode_literals diff --git a/scripts/training/wrappers/adam-suffix-array/suffix-array-create.sh b/scripts/training/wrappers/adam-suffix-array/suffix-array-create.sh index 238a53349..5db5e9aa9 100755 --- a/scripts/training/wrappers/adam-suffix-array/suffix-array-create.sh +++ b/scripts/training/wrappers/adam-suffix-array/suffix-array-create.sh @@ -1,4 +1,7 @@ #!/bin/bash +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # execute: ~/workspace/bin/moses-smt/scripts/training/wrappers/suffix-array-create.sh $SA_EXEC_DIR $SOURCE_CORPUS $TARGET_CORPUS $ALIGNMENT $SA_OUTPUT diff --git a/scripts/training/wrappers/adam-suffix-array/suffix-array-extract.sh b/scripts/training/wrappers/adam-suffix-array/suffix-array-extract.sh index 8c255b1b6..128ccaa9e 100755 --- a/scripts/training/wrappers/adam-suffix-array/suffix-array-extract.sh +++ b/scripts/training/wrappers/adam-suffix-array/suffix-array-extract.sh @@ -1,4 +1,7 @@ #!/bin/bash +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. # execute: ~/workspace/bin/moses-smt/scripts/training/wrappers/adam-suffix-array/suffix-array-extract.sh $SA_EXEC_DIR $MODEL_DIR $INPUT_FILE $OUTPUT_DIR diff --git a/scripts/training/wrappers/berkeleyparsed2mosesxml.perl b/scripts/training/wrappers/berkeleyparsed2mosesxml.perl index 232cfefab..9c376200c 100755 --- a/scripts/training/wrappers/berkeleyparsed2mosesxml.perl +++ b/scripts/training/wrappers/berkeleyparsed2mosesxml.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl b/scripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl index 9e8c30d42..b8ba146c9 100755 --- a/scripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl +++ b/scripts/training/wrappers/berkeleyparsed2mosesxml_PTB.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/conll2mosesxml.py b/scripts/training/wrappers/conll2mosesxml.py index 761037488..6473166d9 100755 --- a/scripts/training/wrappers/conll2mosesxml.py +++ b/scripts/training/wrappers/conll2mosesxml.py @@ -1,6 +1,9 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # Author: Rico Sennrich +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. """ Takes a file in the CoNLL dependency format (from the CoNLL-X shared task on diff --git a/scripts/training/wrappers/filter-excluded-lines.perl b/scripts/training/wrappers/filter-excluded-lines.perl index dff104dba..508ab8a06 100755 --- a/scripts/training/wrappers/filter-excluded-lines.perl +++ b/scripts/training/wrappers/filter-excluded-lines.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/find-unparseable.perl b/scripts/training/wrappers/find-unparseable.perl index 00009e2e9..fd0664f1d 100755 --- a/scripts/training/wrappers/find-unparseable.perl +++ b/scripts/training/wrappers/find-unparseable.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/mada-wrapper.perl b/scripts/training/wrappers/mada-wrapper.perl index f2cf14f40..d4124e34c 100755 --- a/scripts/training/wrappers/mada-wrapper.perl +++ b/scripts/training/wrappers/mada-wrapper.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/madamira-tok.perl b/scripts/training/wrappers/madamira-tok.perl index 37e70079e..e9f19d53a 100755 --- a/scripts/training/wrappers/madamira-tok.perl +++ b/scripts/training/wrappers/madamira-tok.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/madamira-wrapper.perl b/scripts/training/wrappers/madamira-wrapper.perl index 6535b6187..05ec44d7d 100755 --- a/scripts/training/wrappers/madamira-wrapper.perl +++ b/scripts/training/wrappers/madamira-wrapper.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl b/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl index 1e3a1ce3f..a8ce5f24e 100755 --- a/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl +++ b/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/make-factor-de-lemma.perl b/scripts/training/wrappers/make-factor-de-lemma.perl index db978317e..0b93002a9 100755 --- a/scripts/training/wrappers/make-factor-de-lemma.perl +++ b/scripts/training/wrappers/make-factor-de-lemma.perl @@ -1,4 +1,7 @@ #!/usr/bin/perl -w +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use strict; use Encode; diff --git a/scripts/training/wrappers/make-factor-de-morph.perl b/scripts/training/wrappers/make-factor-de-morph.perl index 366a5a76d..d09196745 100755 --- a/scripts/training/wrappers/make-factor-de-morph.perl +++ b/scripts/training/wrappers/make-factor-de-morph.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/make-factor-de-pos.perl b/scripts/training/wrappers/make-factor-de-pos.perl index 495517352..585323bd4 100755 --- a/scripts/training/wrappers/make-factor-de-pos.perl +++ b/scripts/training/wrappers/make-factor-de-pos.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/make-factor-en-porter.perl b/scripts/training/wrappers/make-factor-en-porter.perl index 749dc1318..7ae5fd0b3 100755 --- a/scripts/training/wrappers/make-factor-en-porter.perl +++ b/scripts/training/wrappers/make-factor-en-porter.perl @@ -1,4 +1,7 @@ #!/usr/bin/perl -w +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use strict; use FindBin qw($RealBin); diff --git a/scripts/training/wrappers/make-factor-en-pos.mxpost.perl b/scripts/training/wrappers/make-factor-en-pos.mxpost.perl index 4aa66bac6..2bff8e329 100755 --- a/scripts/training/wrappers/make-factor-en-pos.mxpost.perl +++ b/scripts/training/wrappers/make-factor-en-pos.mxpost.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/make-factor-pos.tree-tagger.perl b/scripts/training/wrappers/make-factor-pos.tree-tagger.perl index 0ad04d4de..1e8ccd0ee 100755 --- a/scripts/training/wrappers/make-factor-pos.tree-tagger.perl +++ b/scripts/training/wrappers/make-factor-pos.tree-tagger.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/make-factor-stem.perl b/scripts/training/wrappers/make-factor-stem.perl index 662f1d882..9bde7648f 100755 --- a/scripts/training/wrappers/make-factor-stem.perl +++ b/scripts/training/wrappers/make-factor-stem.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/make-factor-suffix.perl b/scripts/training/wrappers/make-factor-suffix.perl index 6a59254e4..015df3874 100755 --- a/scripts/training/wrappers/make-factor-suffix.perl +++ b/scripts/training/wrappers/make-factor-suffix.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/morfessor-wrapper.perl b/scripts/training/wrappers/morfessor-wrapper.perl index c65a2cebc..0269045a0 100755 --- a/scripts/training/wrappers/morfessor-wrapper.perl +++ b/scripts/training/wrappers/morfessor-wrapper.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/mosesxml2berkeleyparsed.perl b/scripts/training/wrappers/mosesxml2berkeleyparsed.perl index e929658ff..02bc7b88e 100755 --- a/scripts/training/wrappers/mosesxml2berkeleyparsed.perl +++ b/scripts/training/wrappers/mosesxml2berkeleyparsed.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/mosesxml2brackets.py b/scripts/training/wrappers/mosesxml2brackets.py index 6ff1d20c9..6b90aa256 100755 --- a/scripts/training/wrappers/mosesxml2brackets.py +++ b/scripts/training/wrappers/mosesxml2brackets.py @@ -1,8 +1,11 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # Author: Rico Sennrich +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. -# convert trees in moses XML format to PTB-style bracketed format +"""Convert trees in moses XML format to PTB-style bracketed format.""" from __future__ import print_function, unicode_literals import sys diff --git a/scripts/training/wrappers/parse-de-berkeley.perl b/scripts/training/wrappers/parse-de-berkeley.perl index 596fb3eff..f605a37ae 100755 --- a/scripts/training/wrappers/parse-de-berkeley.perl +++ b/scripts/training/wrappers/parse-de-berkeley.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/parse-de-bitpar.perl b/scripts/training/wrappers/parse-de-bitpar.perl index 1bbcf5329..0d5346058 100755 --- a/scripts/training/wrappers/parse-de-bitpar.perl +++ b/scripts/training/wrappers/parse-de-bitpar.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/parse-en-collins.perl b/scripts/training/wrappers/parse-en-collins.perl index 252d3d2b7..c9a960912 100755 --- a/scripts/training/wrappers/parse-en-collins.perl +++ b/scripts/training/wrappers/parse-en-collins.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/parse-en-egret.perl b/scripts/training/wrappers/parse-en-egret.perl index 9f434063b..e97bc1ae0 100755 --- a/scripts/training/wrappers/parse-en-egret.perl +++ b/scripts/training/wrappers/parse-en-egret.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/parse-en-senna.perl b/scripts/training/wrappers/parse-en-senna.perl index f271633ea..2df46284b 100755 --- a/scripts/training/wrappers/parse-en-senna.perl +++ b/scripts/training/wrappers/parse-en-senna.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use strict; use warnings; diff --git a/scripts/training/wrappers/parse-en-stanford.py b/scripts/training/wrappers/parse-en-stanford.py index 7d8be4bcf..06b027e55 100755 --- a/scripts/training/wrappers/parse-en-stanford.py +++ b/scripts/training/wrappers/parse-en-stanford.py @@ -1,11 +1,17 @@ #!/usr/bin/python # -*- coding: utf-8 -*- # Author: Rico Sennrich +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. -# (hacky) wrapper around Stanford CoreNLP to produce CoNLL dependency format. -# assumes tokenized and sentence-split text. +""" +(Hacky) wrapper around Stanford CoreNLP to produce CoNLL dependency format. +Assumes tokenized and sentence-split text. -# to get Moses XML format, first projectivize the trees, then use conll2mosesxml.py. +To get Moses XML format, first projectivize the trees, then use +conll2mosesxml.py. +""" from __future__ import print_function, unicode_literals import os diff --git a/scripts/training/wrappers/senna2brackets.py b/scripts/training/wrappers/senna2brackets.py index 4fc71ed44..a81100277 100755 --- a/scripts/training/wrappers/senna2brackets.py +++ b/scripts/training/wrappers/senna2brackets.py @@ -1,19 +1,24 @@ #!/usr/bin/env python +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. -# Read SENNA output (from stdin), extract the parse trees, and write them in -# PTB-style bracketed format (to stdout). -# -# The SENNA output is assumed to contain tokens in the first column, POS tags -# in the second column, and PSG fragments in the final column. -# -# It is also assumed that SENNA was run through the parse-en-senna.perl wrapper, -# which: -# -# - Substitutes the special "SENTENCE_TOO_LONG" token for sentences that -# exceed SENNA's hardcoded limit. -# -# - Replaces the bracket-like tokens "-LRB-", "-RRB-", etc. with "(", ")", -# etc. +""" +Read SENNA output (from stdin), extract the parse trees, and write them in +PTB-style bracketed format (to stdout). + +The SENNA output is assumed to contain tokens in the first column, POS tags +in the second column, and PSG fragments in the final column. + +It is also assumed that SENNA was run through the parse-en-senna.perl wrapper, +which: + + - Substitutes the special "SENTENCE_TOO_LONG" token for sentences that + exceed SENNA's hardcoded limit. + + - Replaces the bracket-like tokens "-LRB-", "-RRB-", etc. with "(", ")", + etc. +""" import optparse import os diff --git a/scripts/training/wrappers/syntax-hyphen-splitting.perl b/scripts/training/wrappers/syntax-hyphen-splitting.perl index 653b410d0..1a260df10 100755 --- a/scripts/training/wrappers/syntax-hyphen-splitting.perl +++ b/scripts/training/wrappers/syntax-hyphen-splitting.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; diff --git a/scripts/training/wrappers/tagger-german-chunk.perl b/scripts/training/wrappers/tagger-german-chunk.perl index c57031889..0b707a579 100755 --- a/scripts/training/wrappers/tagger-german-chunk.perl +++ b/scripts/training/wrappers/tagger-german-chunk.perl @@ -1,4 +1,7 @@ #!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. use warnings; use strict; From 5d8af9c2896d86785c5db2fd3a8029ae9b741e26 Mon Sep 17 00:00:00 2001 From: Rico Sennrich Date: Fri, 29 May 2015 16:07:26 +0100 Subject: [PATCH 5/7] support memory-mapped files for NPLM training --- scripts/training/bilingual-lm/train_nplm.py | 14 ++++++--- scripts/training/rdlm/train_rdlm.py | 33 +++++++++++++++++---- scripts/training/train-neurallm.py | 33 +++++++++++++++++++-- 3 files changed, 68 insertions(+), 12 deletions(-) diff --git a/scripts/training/bilingual-lm/train_nplm.py b/scripts/training/bilingual-lm/train_nplm.py index cb5980a91..572076006 100755 --- a/scripts/training/bilingual-lm/train_nplm.py +++ b/scripts/training/bilingual-lm/train_nplm.py @@ -39,7 +39,8 @@ parser.add_argument("--input-words-file", dest="input_words_file") parser.add_argument("--output-words-file", dest="output_words_file") parser.add_argument("--input_vocab_size", dest="input_vocab_size", type=int) parser.add_argument("--output_vocab_size", dest="output_vocab_size", type=int) - +parser.add_argument("--mmap", dest="mmap", action="store_true", + help="Use memory-mapped file (for lower memory consumption).") parser.set_defaults( working_dir="working", @@ -113,6 +114,11 @@ def main(options): options.working_dir, os.path.basename(options.corpus_stem) + ".numberized") + mmap_command = [] + if options.mmap: + in_file += '.mmap' + mmap_command = ['--mmap_file', '1'] + model_prefix = os.path.join( options.output_dir, options.output_model + ".model.nplm") train_args = [ @@ -127,9 +133,9 @@ def main(options): "--input_embedding_dimension", str(options.input_embedding), "--output_embedding_dimension", str(options.output_embedding), "--num_threads", str(options.threads), - "--activation_function", - options.activation_fn, - ] + validations_command + vocab_command + "--activation_function", options.activation_fn, + "--ngram_size", str(options.ngram_size), + ] + validations_command + vocab_command + mmap_command print("Train model command: ") print(', '.join(train_args)) diff --git a/scripts/training/rdlm/train_rdlm.py b/scripts/training/rdlm/train_rdlm.py index a7edbab36..289ab405c 100755 --- a/scripts/training/rdlm/train_rdlm.py +++ b/scripts/training/rdlm/train_rdlm.py @@ -94,11 +94,14 @@ parser.add_argument( "--output-words-file", dest="output_words_file", metavar="PATH", help="Output vocabulary (default: %(default)s).") parser.add_argument( - "--input_vocab_size", dest="input_vocab_size", type=int, metavar="INT", + "--input-vocab-size", dest="input_vocab_size", type=int, metavar="INT", help="Input vocabulary size (default: %(default)s).") parser.add_argument( "--output-vocab-size", dest="output_vocab_size", type=int, metavar="INT", help="Output vocabulary size (default: %(default)s).") +parser.add_argument( + "--mmap", dest="mmap", action="store_true", + help="Use memory-mapped file (for lower memory consumption).") parser.set_defaults( @@ -195,11 +198,14 @@ def main(options): "extracting vocabulary from training text.\n") prepare_vocabulary(options) + numberized_file = os.path.basename(options.corpus_stem) + '.numberized' + train_file = numberized_file + if options.mmap: + train_file += '.mmap' + extract_options = extract_syntactic_ngrams.create_parser().parse_args([ '--input', options.corpus_stem, - '--output', os.path.join( - options.working_dir, - os.path.basename(options.corpus_stem) + '.numberized'), + '--output', os.path.join(options.working_dir, numberized_file), '--vocab', options.input_words_file, '--output_vocab', options.output_words_file, '--right_context', str(options.right_context_size), @@ -222,6 +228,23 @@ def main(options): else: options.validation_file = None + if options.mmap: + try: + os.remove(os.path.join(options.working_dir, train_file)) + except OSError: + pass + mmap_cmd = [os.path.join(options.nplm_home, 'src', 'createMmap'), + '--input_file', + os.path.join(options.working_dir, numberized_file), + '--output_file', + os.path.join(options.working_dir, train_file) + ] + sys.stderr.write('creating memory-mapped file\n') + sys.stderr.write('executing: ' + ', '.join(mmap_cmd) + '\n') + ret = subprocess.call(mmap_cmd) + if ret: + raise Exception("creating memory-mapped file failed") + sys.stderr.write('training neural network\n') train_nplm.main(options) @@ -234,7 +257,7 @@ def main(options): options.output_model + '.model.nplm.' + str(options.epochs)), os.path.join( options.working_dir, - os.path.basename(options.corpus_stem) + '.numberized'), + numberized_file), os.path.join(options.output_dir, options.output_model + '.model.nplm') ]) if ret: diff --git a/scripts/training/train-neurallm.py b/scripts/training/train-neurallm.py index fec859611..ae77a42af 100755 --- a/scripts/training/train-neurallm.py +++ b/scripts/training/train-neurallm.py @@ -87,6 +87,9 @@ parser.add_argument( parser.add_argument( "--vocab-size", dest="vocab_size", type=int, metavar="INT", help="Vocabulary size (default: %(default)s).") +parser.add_argument( + "--mmap", dest="mmap", action="store_true", + help="Use memory-mapped file (for lower memory consumption).") parser.set_defaults( working_dir="working", @@ -121,20 +124,43 @@ def main(options): if not os.path.exists(options.output_dir): os.makedirs(options.output_dir) + numberized_file = os.path.basename(options.corpus_stem) + '.numberized' + train_file = numberized_file + if options.mmap: + train_file += '.mmap' + extraction_cmd = [os.path.join(options.nplm_home, 'src', 'prepareNeuralLM'), '--train_text', options.corpus_stem, '--ngramize', '1', '--ngram_size', str(options.ngram_size), '--vocab_size', str(options.vocab_size), '--write_words_file', os.path.join(options.working_dir, options.words_file), - '--train_file', os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + '.numberized') + '--train_file', os.path.join(options.working_dir, numberized_file) ] sys.stderr.write('extracting n-grams\n') + sys.stderr.write('executing: ' + ', '.join(extraction_cmd) + '\n') ret = subprocess.call(extraction_cmd) if ret: raise Exception("preparing neural LM failed") - + + if options.mmap: + try: + os.remove(os.path.join(options.working_dir, train_file)) + except OSError: + pass + mmap_cmd = [os.path.join(options.nplm_home, 'src', 'createMmap'), + '--input_file', + os.path.join(options.working_dir, numberized_file), + '--output_file', + os.path.join(options.working_dir, train_file) + ] + sys.stderr.write('creating memory-mapped file\n') + sys.stderr.write('executing: ' + ', '.join(mmap_cmd) + '\n') + ret = subprocess.call(mmap_cmd) + if ret: + raise Exception("creating memory-mapped file failed") + if options.validation_corpus: extraction_cmd = [os.path.join(options.nplm_home, 'src', 'prepareNeuralLM'), @@ -147,6 +173,7 @@ def main(options): ] sys.stderr.write('extracting n-grams (validation file)\n') + sys.stderr.write('executing: ' + ', '.join(extraction_cmd) + '\n') ret = subprocess.call(extraction_cmd) if ret: raise Exception("preparing neural LM failed") @@ -166,7 +193,7 @@ def main(options): average_options = averageNullEmbedding.parser.parse_args( ['-i', os.path.join(options.output_dir, options.output_model + '.model.nplm.' + str(options.epochs)), '-o', os.path.join(options.output_dir, options.output_model + '.model.nplm'), - '-t', os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + '.numberized'), + '-t', os.path.join(options.working_dir, numberized_file), '-p', os.path.join(options.nplm_home, 'python')]) averageNullEmbedding.main(average_options) From 2f735998ca8755263ec8dcc30303358988519091 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Fri, 29 May 2015 18:46:02 +0100 Subject: [PATCH 6/7] Rename MosesTraining::SyntaxTree to MosesTraining::SyntaxNodeCollection This is the first step in a small-scale refactoring effort that will touch a lot of the syntax-related code in moses/phrase-extract. The end goals are: - a storage mechanism for general attribute/value pairs in XML-style tree / lattice input. E.g. the "pcfg-score" and "semantic-role" attributes in: I - consolidation of the various near-duplicate Tree / XmlTreeParser classes that have accumulated over the years (my fault) - general de-crufting --- phrase-extract/SentenceAlignmentWithSyntax.h | 4 +- phrase-extract/SyntaxTree.cpp | 48 +++---------------- phrase-extract/SyntaxTree.h | 23 +++------ phrase-extract/XmlTree.cpp | 12 ++--- phrase-extract/XmlTree.h | 2 +- phrase-extract/extract-ghkm/ExtractGHKM.cpp | 4 +- phrase-extract/extract-ghkm/ScfgRule.cpp | 8 ++-- phrase-extract/extract-ghkm/ScfgRule.h | 8 ++-- phrase-extract/extract-ghkm/XmlTreeParser.h | 2 +- phrase-extract/pcfg-common/xml_tree_parser.h | 2 +- phrase-extract/relax-parse-main.cpp | 12 ++--- phrase-extract/relax-parse.h | 8 ++-- .../syntax-common/xml_tree_parser.cc | 10 ++-- .../syntax-common/xml_tree_parser.h | 2 +- 14 files changed, 51 insertions(+), 94 deletions(-) diff --git a/phrase-extract/SentenceAlignmentWithSyntax.h b/phrase-extract/SentenceAlignmentWithSyntax.h index 8b9088770..a603f7722 100644 --- a/phrase-extract/SentenceAlignmentWithSyntax.h +++ b/phrase-extract/SentenceAlignmentWithSyntax.h @@ -36,8 +36,8 @@ namespace MosesTraining class SentenceAlignmentWithSyntax : public SentenceAlignment { public: - SyntaxTree targetTree; - SyntaxTree sourceTree; + SyntaxNodeCollection targetTree; + SyntaxNodeCollection sourceTree; std::set & m_targetLabelCollection; std::set & m_sourceLabelCollection; std::map & m_targetTopLabelCollection; diff --git a/phrase-extract/SyntaxTree.cpp b/phrase-extract/SyntaxTree.cpp index c50693e0d..7f641125e 100644 --- a/phrase-extract/SyntaxTree.cpp +++ b/phrase-extract/SyntaxTree.cpp @@ -1,6 +1,3 @@ -// $Id: SyntaxTree.cpp 1960 2008-12-15 12:52:38Z phkoehn $ -// vim:tabstop=2 - /*********************************************************************** Moses - factored phrase-based language decoder Copyright (C) 2009 University of Edinburgh @@ -29,12 +26,12 @@ namespace MosesTraining { -SyntaxTree::~SyntaxTree() +SyntaxNodeCollection::~SyntaxNodeCollection() { Clear(); } -void SyntaxTree::Clear() +void SyntaxNodeCollection::Clear() { m_top = 0; // loop through all m_nodes, delete them @@ -45,7 +42,7 @@ void SyntaxTree::Clear() m_index.clear(); } -SyntaxNode *SyntaxTree::AddNode( int startPos, int endPos, std::string label ) +SyntaxNode *SyntaxNodeCollection::AddNode( int startPos, int endPos, std::string label ) { SyntaxNode* newNode = new SyntaxNode( startPos, endPos, label ); m_nodes.push_back( newNode ); @@ -54,7 +51,7 @@ SyntaxNode *SyntaxTree::AddNode( int startPos, int endPos, std::string label ) return newNode; } -ParentNodes SyntaxTree::Parse() +ParentNodes SyntaxNodeCollection::Parse() { ParentNodes parents; @@ -94,12 +91,12 @@ ParentNodes SyntaxTree::Parse() return parents; } -bool SyntaxTree::HasNode( int startPos, int endPos ) const +bool SyntaxNodeCollection::HasNode( int startPos, int endPos ) const { return GetNodes( startPos, endPos).size() > 0; } -const std::vector< SyntaxNode* >& SyntaxTree::GetNodes( int startPos, int endPos ) const +const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodes( int startPos, int endPos ) const { SyntaxTreeIndexIterator startIndex = m_index.find( startPos ); if (startIndex == m_index.end() ) @@ -112,15 +109,7 @@ const std::vector< SyntaxNode* >& SyntaxTree::GetNodes( int startPos, int endPos return endIndex->second; } -// for printing out tree -std::string SyntaxTree::ToString() const -{ - std::stringstream out; - out << *this; - return out.str(); -} - -void SyntaxTree::ConnectNodes() +void SyntaxNodeCollection::ConnectNodes() { typedef SyntaxTreeIndex2::const_reverse_iterator InnerIterator; @@ -162,27 +151,4 @@ void SyntaxTree::ConnectNodes() } } -std::ostream& operator<<(std::ostream& os, const SyntaxTree& t) -{ - size_t size = t.m_index.size(); - for(size_t length=1; length<=size; length++) { - for(size_t space=0; spaceGetLabel() + "#######"; - - os << label.substr(0,7) << " "; - } else { - os << "------- "; - } - } - os << std::endl; - } - return os; } - -} - diff --git a/phrase-extract/SyntaxTree.h b/phrase-extract/SyntaxTree.h index 6ffb5da34..649a6197b 100644 --- a/phrase-extract/SyntaxTree.h +++ b/phrase-extract/SyntaxTree.h @@ -1,6 +1,3 @@ -// $Id: SyntaxTree.h 1960 2008-12-15 12:52:38Z phkoehn $ -// vim:tabstop=2 - /*********************************************************************** Moses - factored phrase-based language decoder Copyright (C) 2009 University of Edinburgh @@ -20,12 +17,12 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ***********************************************************************/ - #pragma once -#include -#include + #include #include +#include +#include namespace MosesTraining { @@ -79,7 +76,7 @@ public: typedef std::vector< int > SplitPoints; typedef std::vector< SplitPoints > ParentNodes; -class SyntaxTree +class SyntaxNodeCollection { protected: std::vector< SyntaxNode* > m_nodes; @@ -93,14 +90,12 @@ protected: int m_size; std::vector< SyntaxNode* > m_emptyNode; - friend std::ostream& operator<<(std::ostream&, const SyntaxTree&); - public: - SyntaxTree() + SyntaxNodeCollection() : m_top(0) // m_top doesn't get set unless ConnectNodes is called. , m_size(0) {} - ~SyntaxTree(); + ~SyntaxNodeCollection(); SyntaxNode *AddNode( int startPos, int endPos, std::string label ); @@ -119,10 +114,6 @@ public: } void ConnectNodes(); void Clear(); - std::string ToString() const; }; -std::ostream& operator<<(std::ostream&, const SyntaxTree&); - -} - +} // namespace MosesTraining diff --git a/phrase-extract/XmlTree.cpp b/phrase-extract/XmlTree.cpp index 6efa1bf5c..d45fd99eb 100644 --- a/phrase-extract/XmlTree.cpp +++ b/phrase-extract/XmlTree.cpp @@ -1,6 +1,3 @@ -// $Id: XmlOption.cpp 1960 2008-12-15 12:52:38Z phkoehn $ -// vim:tabstop=2 - /*********************************************************************** Moses - factored phrase-based language decoder Copyright (C) 2006 University of Edinburgh @@ -228,7 +225,10 @@ vector TokenizeXml(const string& str) parse because we don't have the completed source parsed until after this function removes all the markup from it (CreateFromString in Sentence::Read). */ -bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &labelCollection, map< string, int > &topLabelCollection, bool unescapeSpecialChars ) +bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection, + set< string > &labelCollection, + map< string, int > &topLabelCollection, + bool unescapeSpecialChars ) { //parse XML markup in translation line @@ -374,7 +374,7 @@ bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &label cerr << "XML TAG LABEL IS: '" << label << "'" << endl; cerr << "XML SPAN IS: " << startPos << "-" << (endPos-1) << endl; } - SyntaxNode *node = tree.AddNode( startPos, endPos-1, label ); + SyntaxNode *node = nodeCollection.AddNode( startPos, endPos-1, label ); node->SetPcfgScore(pcfgScore); } } @@ -386,7 +386,7 @@ bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &label } // collect top labels - const vector< SyntaxNode* >& topNodes = tree.GetNodes( 0, wordPos-1 ); + const vector< SyntaxNode* >& topNodes = nodeCollection.GetNodes( 0, wordPos-1 ); for( vector< SyntaxNode* >::const_iterator node = topNodes.begin(); node != topNodes.end(); node++ ) { SyntaxNode *n = *node; const string &label = n->GetLabel(); diff --git a/phrase-extract/XmlTree.h b/phrase-extract/XmlTree.h index 50b1c0acc..392192ae6 100644 --- a/phrase-extract/XmlTree.h +++ b/phrase-extract/XmlTree.h @@ -35,7 +35,7 @@ std::string Trim(const std::string& str, const std::string dropChars = " \t\n\r" std::string TrimXml(const std::string& str); bool isXmlTag(const std::string& tag); std::vector TokenizeXml(const std::string& str); -bool ProcessAndStripXMLTags(std::string &line, SyntaxTree &tree, std::set< std::string > &labelCollection, std::map< std::string, int > &topLabelCollection, bool unescape = true); +bool ProcessAndStripXMLTags(std::string &line, SyntaxNodeCollection &tree, std::set< std::string > &labelCollection, std::map< std::string, int > &topLabelCollection, bool unescape = true); std::string unescape(const std::string &str); diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp index bc687ec6b..9e6aacc20 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp +++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp @@ -172,7 +172,7 @@ int ExtractGHKM::Main(int argc, char *argv[]) // Parse source tree and construct a SyntaxTree object. - MosesTraining::SyntaxTree sourceSyntaxTree; + MosesTraining::SyntaxNodeCollection sourceSyntaxTree; MosesTraining::SyntaxNode *sourceSyntaxTreeRoot=NULL; if (options.sourceLabels) { @@ -196,7 +196,7 @@ int ExtractGHKM::Main(int argc, char *argv[]) // Read source tokens. std::vector sourceTokens(ReadTokens(sourceLine)); - // Construct a source ParseTree object from the SyntaxTree object. + // Construct a source ParseTree object from the SyntaxNodeCollection object. std::auto_ptr sourceParseTree; if (options.sourceLabels) { diff --git a/phrase-extract/extract-ghkm/ScfgRule.cpp b/phrase-extract/extract-ghkm/ScfgRule.cpp index 01178b72c..94ff3c605 100644 --- a/phrase-extract/extract-ghkm/ScfgRule.cpp +++ b/phrase-extract/extract-ghkm/ScfgRule.cpp @@ -31,7 +31,7 @@ namespace GHKM { ScfgRule::ScfgRule(const Subgraph &fragment, - const MosesTraining::SyntaxTree *sourceSyntaxTree) + const MosesTraining::SyntaxNodeCollection *sourceSyntaxTree) : m_graphFragment(fragment) , m_sourceLHS("X", NonTerminal) , m_targetLHS(fragment.GetRoot()->GetLabel(), NonTerminal) @@ -133,9 +133,9 @@ ScfgRule::ScfgRule(const Subgraph &fragment, } } -void ScfgRule::PushSourceLabel(const MosesTraining::SyntaxTree *sourceSyntaxTree, - const Node *node, - const std::string &nonMatchingLabel) +void ScfgRule::PushSourceLabel( + const MosesTraining::SyntaxNodeCollection *sourceSyntaxTree, + const Node *node, const std::string &nonMatchingLabel) { ContiguousSpan span = Closure(node->GetSpan()); if (sourceSyntaxTree->HasNode(span.first,span.second)) { // does a source constituent match the span? diff --git a/phrase-extract/extract-ghkm/ScfgRule.h b/phrase-extract/extract-ghkm/ScfgRule.h index 94ee7b82e..b3d8ad017 100644 --- a/phrase-extract/extract-ghkm/ScfgRule.h +++ b/phrase-extract/extract-ghkm/ScfgRule.h @@ -41,7 +41,7 @@ class ScfgRule : public Rule { public: ScfgRule(const Subgraph &fragment, - const MosesTraining::SyntaxTree *sourceSyntaxTree = 0); + const MosesTraining::SyntaxNodeCollection *sourceSyntaxTree = 0); const Subgraph &GetGraphFragment() const { return m_graphFragment; @@ -78,9 +78,9 @@ public: } private: - void PushSourceLabel(const MosesTraining::SyntaxTree *sourceSyntaxTree, - const Node *node, - const std::string &nonMatchingLabel); + void PushSourceLabel( + const MosesTraining::SyntaxNodeCollection *sourceSyntaxTree, + const Node *node, const std::string &nonMatchingLabel); const Subgraph& m_graphFragment; Symbol m_sourceLHS; diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.h b/phrase-extract/extract-ghkm/XmlTreeParser.h index ff0baeace..03450383a 100644 --- a/phrase-extract/extract-ghkm/XmlTreeParser.h +++ b/phrase-extract/extract-ghkm/XmlTreeParser.h @@ -58,7 +58,7 @@ private: std::set &m_labelSet; std::map &m_topLabelSet; std::string m_line; - MosesTraining::SyntaxTree m_tree; + MosesTraining::SyntaxNodeCollection m_tree; std::vector m_words; }; diff --git a/phrase-extract/pcfg-common/xml_tree_parser.h b/phrase-extract/pcfg-common/xml_tree_parser.h index 675a112d8..69754bb56 100644 --- a/phrase-extract/pcfg-common/xml_tree_parser.h +++ b/phrase-extract/pcfg-common/xml_tree_parser.h @@ -47,7 +47,7 @@ class XmlTreeParser { std::set m_labelSet; std::map m_topLabelSet; std::string m_line; - MosesTraining::SyntaxTree m_tree; + MosesTraining::SyntaxNodeCollection m_tree; std::vector m_words; }; diff --git a/phrase-extract/relax-parse-main.cpp b/phrase-extract/relax-parse-main.cpp index 5c9daa7ae..5bca886bf 100644 --- a/phrase-extract/relax-parse-main.cpp +++ b/phrase-extract/relax-parse-main.cpp @@ -43,7 +43,7 @@ int main(int argc, char* argv[]) // process into syntax tree representation set< string > labelCollection; // set of labels, not used map< string, int > topLabelCollection; // count of top labels, not used - SyntaxTree tree; + SyntaxNodeCollection tree; ProcessAndStripXMLTags( inBufferString, tree, labelCollection, topLabelCollection, false ); const vector< string > inWords = util::tokenize( inBufferString ); @@ -105,7 +105,7 @@ void init(int argc, char* argv[]) } } -void store( SyntaxTree &tree, const vector< string > &words ) +void store( SyntaxNodeCollection &tree, const vector< string > &words ) { // output words for( size_t i=0; i &words ) cout << endl; } -void LeftBinarize( SyntaxTree &tree, ParentNodes &parents ) +void LeftBinarize( SyntaxNodeCollection &tree, ParentNodes &parents ) { for(ParentNodes::const_iterator p = parents.begin(); p != parents.end(); p++) { const SplitPoints &point = *p; @@ -143,7 +143,7 @@ void LeftBinarize( SyntaxTree &tree, ParentNodes &parents ) } } -void RightBinarize( SyntaxTree &tree, ParentNodes &parents ) +void RightBinarize( SyntaxNodeCollection &tree, ParentNodes &parents ) { for(ParentNodes::const_iterator p = parents.begin(); p != parents.end(); p++) { const SplitPoints &point = *p; @@ -161,11 +161,11 @@ void RightBinarize( SyntaxTree &tree, ParentNodes &parents ) } } -void SAMT( SyntaxTree &tree, ParentNodes &parents ) +void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents ) { int numWords = tree.GetNumWords(); - SyntaxTree newTree; // to store new nodes + SyntaxNodeCollection newTree; // to store new nodes // look through parents to combine children for(ParentNodes::const_iterator p = parents.begin(); p != parents.end(); p++) { diff --git a/phrase-extract/relax-parse.h b/phrase-extract/relax-parse.h index 9bd0bfb23..af41b0945 100644 --- a/phrase-extract/relax-parse.h +++ b/phrase-extract/relax-parse.h @@ -39,8 +39,8 @@ char SAMTLevel = 0; // functions void init(int argc, char* argv[]); -void store( MosesTraining::SyntaxTree &tree, const std::vector &words ); -void LeftBinarize( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents ); -void RightBinarize( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents ); -void SAMT( MosesTraining::SyntaxTree &tree, MosesTraining::ParentNodes &parents ); +void store( MosesTraining::SyntaxNodeCollection &tree, const std::vector &words ); +void LeftBinarize( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents ); +void RightBinarize( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents ); +void SAMT( MosesTraining::SyntaxNodeCollection &tree, MosesTraining::ParentNodes &parents ); diff --git a/phrase-extract/syntax-common/xml_tree_parser.cc b/phrase-extract/syntax-common/xml_tree_parser.cc index c6e3cd3c3..2f8a904fa 100644 --- a/phrase-extract/syntax-common/xml_tree_parser.cc +++ b/phrase-extract/syntax-common/xml_tree_parser.cc @@ -13,17 +13,17 @@ namespace Syntax { StringTree *XmlTreeParser::Parse(const std::string &line) { line_ = line; - tree_.Clear(); + node_collection_.Clear(); try { - if (!ProcessAndStripXMLTags(line_, tree_, label_set_, top_label_set_, - false)) { + if (!ProcessAndStripXMLTags(line_, node_collection_, label_set_, + top_label_set_, false)) { throw Exception(""); } } catch (const XmlException &e) { throw Exception(e.getMsg()); } - tree_.ConnectNodes(); - SyntaxNode *root = tree_.GetTop(); + node_collection_.ConnectNodes(); + SyntaxNode *root = node_collection_.GetTop(); assert(root); words_ = util::tokenize(line_); return ConvertTree(*root, words_); diff --git a/phrase-extract/syntax-common/xml_tree_parser.h b/phrase-extract/syntax-common/xml_tree_parser.h index a5563f63a..e530b84ef 100644 --- a/phrase-extract/syntax-common/xml_tree_parser.h +++ b/phrase-extract/syntax-common/xml_tree_parser.h @@ -26,7 +26,7 @@ class XmlTreeParser { std::set label_set_; std::map top_label_set_; std::string line_; - MosesTraining::SyntaxTree tree_; + MosesTraining::SyntaxNodeCollection node_collection_; std::vector words_; }; From 985e7bbfc30c6f124c546e769948caf22eacfc66 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Fri, 29 May 2015 20:57:25 +0100 Subject: [PATCH 7/7] Ongoing moses/phrase-extract refactoring --- phrase-extract/SentenceAlignmentWithSyntax.h | 2 +- phrase-extract/SyntaxNode.h | 75 +++++++++++++++++++ ...yntaxTree.cpp => SyntaxNodeCollection.cpp} | 7 +- .../{SyntaxTree.h => SyntaxNodeCollection.h} | 50 +------------ phrase-extract/XmlTree.cpp | 3 +- phrase-extract/XmlTree.h | 10 +-- phrase-extract/extract-ghkm/ExtractGHKM.cpp | 3 +- phrase-extract/extract-ghkm/ScfgRule.cpp | 7 +- phrase-extract/extract-ghkm/ScfgRule.h | 9 +-- phrase-extract/extract-ghkm/XmlTreeParser.h | 5 +- phrase-extract/extract-rules-main.cpp | 2 +- phrase-extract/pcfg-common/xml_tree_parser.h | 3 +- phrase-extract/relax-parse.h | 2 +- .../syntax-common/xml_tree_parser.h | 3 +- 14 files changed, 108 insertions(+), 73 deletions(-) create mode 100644 phrase-extract/SyntaxNode.h rename phrase-extract/{SyntaxTree.cpp => SyntaxNodeCollection.cpp} (96%) rename phrase-extract/{SyntaxTree.h => SyntaxNodeCollection.h} (69%) diff --git a/phrase-extract/SentenceAlignmentWithSyntax.h b/phrase-extract/SentenceAlignmentWithSyntax.h index a603f7722..604b6d0e2 100644 --- a/phrase-extract/SentenceAlignmentWithSyntax.h +++ b/phrase-extract/SentenceAlignmentWithSyntax.h @@ -28,7 +28,7 @@ #include "RuleExtractionOptions.h" #include "SentenceAlignment.h" -#include "SyntaxTree.h" +#include "SyntaxNodeCollection.h" namespace MosesTraining { diff --git a/phrase-extract/SyntaxNode.h b/phrase-extract/SyntaxNode.h new file mode 100644 index 000000000..46e0f456f --- /dev/null +++ b/phrase-extract/SyntaxNode.h @@ -0,0 +1,75 @@ +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2009 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#pragma once + +#include +#include +#include +#include + +namespace MosesTraining +{ + +class SyntaxNode +{ +protected: + int m_start, m_end; + std::string m_label; + std::vector< SyntaxNode* > m_children; + SyntaxNode* m_parent; + float m_pcfgScore; +public: + SyntaxNode( int startPos, int endPos, std::string label ) + :m_start(startPos) + ,m_end(endPos) + ,m_label(label) + ,m_parent(0) + ,m_pcfgScore(0.0f) { + } + int GetStart() const { + return m_start; + } + int GetEnd() const { + return m_end; + } + std::string GetLabel() const { + return m_label; + } + float GetPcfgScore() const { + return m_pcfgScore; + } + void SetPcfgScore(float score) { + m_pcfgScore = score; + } + SyntaxNode *GetParent() { + return m_parent; + } + void SetParent(SyntaxNode *parent) { + m_parent = parent; + } + void AddChild(SyntaxNode* child) { + m_children.push_back(child); + } + const std::vector< SyntaxNode* > &GetChildren() const { + return m_children; + } +}; + +} // namespace MosesTraining diff --git a/phrase-extract/SyntaxTree.cpp b/phrase-extract/SyntaxNodeCollection.cpp similarity index 96% rename from phrase-extract/SyntaxTree.cpp rename to phrase-extract/SyntaxNodeCollection.cpp index 7f641125e..099a5697f 100644 --- a/phrase-extract/SyntaxTree.cpp +++ b/phrase-extract/SyntaxNodeCollection.cpp @@ -18,7 +18,7 @@ ***********************************************************************/ -#include "SyntaxTree.h" +#include "SyntaxNodeCollection.h" #include #include @@ -42,7 +42,8 @@ void SyntaxNodeCollection::Clear() m_index.clear(); } -SyntaxNode *SyntaxNodeCollection::AddNode( int startPos, int endPos, std::string label ) +SyntaxNode *SyntaxNodeCollection::AddNode(int startPos, int endPos, + const std::string &label) { SyntaxNode* newNode = new SyntaxNode( startPos, endPos, label ); m_nodes.push_back( newNode ); @@ -151,4 +152,4 @@ void SyntaxNodeCollection::ConnectNodes() } } -} +} // namespace MosesTraining diff --git a/phrase-extract/SyntaxTree.h b/phrase-extract/SyntaxNodeCollection.h similarity index 69% rename from phrase-extract/SyntaxTree.h rename to phrase-extract/SyntaxNodeCollection.h index 649a6197b..70b14206d 100644 --- a/phrase-extract/SyntaxTree.h +++ b/phrase-extract/SyntaxNodeCollection.h @@ -24,55 +24,11 @@ #include #include +#include "SyntaxNode.h" + namespace MosesTraining { -class SyntaxNode -{ -protected: - int m_start, m_end; - std::string m_label; - std::vector< SyntaxNode* > m_children; - SyntaxNode* m_parent; - float m_pcfgScore; -public: - SyntaxNode( int startPos, int endPos, std::string label ) - :m_start(startPos) - ,m_end(endPos) - ,m_label(label) - ,m_parent(0) - ,m_pcfgScore(0.0f) { - } - int GetStart() const { - return m_start; - } - int GetEnd() const { - return m_end; - } - std::string GetLabel() const { - return m_label; - } - float GetPcfgScore() const { - return m_pcfgScore; - } - void SetPcfgScore(float score) { - m_pcfgScore = score; - } - SyntaxNode *GetParent() { - return m_parent; - } - void SetParent(SyntaxNode *parent) { - m_parent = parent; - } - void AddChild(SyntaxNode* child) { - m_children.push_back(child); - } - const std::vector< SyntaxNode* > &GetChildren() const { - return m_children; - } -}; - - typedef std::vector< int > SplitPoints; typedef std::vector< SplitPoints > ParentNodes; @@ -97,7 +53,7 @@ public: ~SyntaxNodeCollection(); - SyntaxNode *AddNode( int startPos, int endPos, std::string label ); + SyntaxNode *AddNode( int startPos, int endPos, const std::string &label ); SyntaxNode *GetTop() { return m_top; diff --git a/phrase-extract/XmlTree.cpp b/phrase-extract/XmlTree.cpp index d45fd99eb..0f068fca7 100644 --- a/phrase-extract/XmlTree.cpp +++ b/phrase-extract/XmlTree.cpp @@ -24,7 +24,8 @@ #include #include #include -#include "SyntaxTree.h" + +#include "SyntaxNodeCollection.h" #include "XmlException.h" using namespace std; diff --git a/phrase-extract/XmlTree.h b/phrase-extract/XmlTree.h index 392192ae6..3b5afd4dd 100644 --- a/phrase-extract/XmlTree.h +++ b/phrase-extract/XmlTree.h @@ -1,6 +1,3 @@ -// $Id: XmlOption.cpp 1960 2008-12-15 12:52:38Z phkoehn $ -// vim:tabstop=2 - /*********************************************************************** Moses - factored phrase-based language decoder Copyright (C) 2006 University of Edinburgh @@ -21,11 +18,13 @@ ***********************************************************************/ #pragma once + #include #include #include #include -#include "SyntaxTree.h" + +#include "SyntaxNodeCollection.h" namespace MosesTraining { @@ -39,5 +38,4 @@ bool ProcessAndStripXMLTags(std::string &line, SyntaxNodeCollection &tree, std:: std::string unescape(const std::string &str); -} // namespace - +} // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp index 9e6aacc20..937d88030 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp +++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp @@ -33,7 +33,8 @@ #include "Span.h" #include "StsgRule.h" #include "StsgRuleWriter.h" -#include "SyntaxTree.h" +#include "SyntaxNode.h" +#include "SyntaxNodeCollection.h" #include "tables-core.h" #include "XmlException.h" #include "XmlTree.h" diff --git a/phrase-extract/extract-ghkm/ScfgRule.cpp b/phrase-extract/extract-ghkm/ScfgRule.cpp index 94ff3c605..918c88eeb 100644 --- a/phrase-extract/extract-ghkm/ScfgRule.cpp +++ b/phrase-extract/extract-ghkm/ScfgRule.cpp @@ -19,11 +19,12 @@ #include "ScfgRule.h" +#include + #include "Node.h" #include "Subgraph.h" -#include "SyntaxTree.h" - -#include +#include "SyntaxNode.h" +#include "SyntaxNodeCollection.h" namespace Moses { diff --git a/phrase-extract/extract-ghkm/ScfgRule.h b/phrase-extract/extract-ghkm/ScfgRule.h index b3d8ad017..c8b76114a 100644 --- a/phrase-extract/extract-ghkm/ScfgRule.h +++ b/phrase-extract/extract-ghkm/ScfgRule.h @@ -19,16 +19,16 @@ #pragma once -#include "Alignment.h" -#include "Rule.h" -#include "SyntaxTree.h" - #include #include #include #include #include +#include "Alignment.h" +#include "Rule.h" +#include "SyntaxNodeCollection.h" + namespace Moses { namespace GHKM @@ -95,4 +95,3 @@ private: } // namespace GHKM } // namespace Moses - diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.h b/phrase-extract/extract-ghkm/XmlTreeParser.h index 03450383a..db9fa8bf2 100644 --- a/phrase-extract/extract-ghkm/XmlTreeParser.h +++ b/phrase-extract/extract-ghkm/XmlTreeParser.h @@ -23,14 +23,15 @@ #include "Exception.h" -#include "SyntaxTree.h" - #include #include #include #include #include +#include "SyntaxNode.h" +#include "SyntaxNodeCollection.h" + namespace Moses { namespace GHKM diff --git a/phrase-extract/extract-rules-main.cpp b/phrase-extract/extract-rules-main.cpp index 50baa4e0d..825f12d89 100644 --- a/phrase-extract/extract-rules-main.cpp +++ b/phrase-extract/extract-rules-main.cpp @@ -41,7 +41,7 @@ #include "HoleCollection.h" #include "RuleExist.h" #include "SentenceAlignmentWithSyntax.h" -#include "SyntaxTree.h" +#include "SyntaxNode.h" #include "tables-core.h" #include "XmlTree.h" #include "InputFileStream.h" diff --git a/phrase-extract/pcfg-common/xml_tree_parser.h b/phrase-extract/pcfg-common/xml_tree_parser.h index 69754bb56..8605c0691 100644 --- a/phrase-extract/pcfg-common/xml_tree_parser.h +++ b/phrase-extract/pcfg-common/xml_tree_parser.h @@ -28,7 +28,8 @@ #include #include "pcfg_tree.h" -#include "SyntaxTree.h" +#include "SyntaxNode.h" +#include "SyntaxNodeCollection.h" namespace MosesTraining { namespace Syntax { diff --git a/phrase-extract/relax-parse.h b/phrase-extract/relax-parse.h index af41b0945..a00aa6deb 100644 --- a/phrase-extract/relax-parse.h +++ b/phrase-extract/relax-parse.h @@ -28,7 +28,7 @@ #include #include -#include "SyntaxTree.h" +#include "SyntaxNodeCollection.h" #include "XmlTree.h" #define LINE_MAX_LENGTH 1000000 diff --git a/phrase-extract/syntax-common/xml_tree_parser.h b/phrase-extract/syntax-common/xml_tree_parser.h index e530b84ef..c84ea25ec 100644 --- a/phrase-extract/syntax-common/xml_tree_parser.h +++ b/phrase-extract/syntax-common/xml_tree_parser.h @@ -5,7 +5,8 @@ #include #include -#include "SyntaxTree.h" +#include "SyntaxNode.h" +#include "SyntaxNodeCollection.h" #include "exception.h" #include "string_tree.h"