From d4642a34c1550564b59f852af76426574bfd774f Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sun, 1 Jan 2017 22:54:48 +0000 Subject: [PATCH 01/12] add completed-hypo to Distortion FF --- contrib/moses2/FF/Distortion.cpp | 31 ++++++++++++++++++++++++++----- contrib/moses2/FF/Distortion.h | 11 ++++++++--- 2 files changed, 34 insertions(+), 8 deletions(-) diff --git a/contrib/moses2/FF/Distortion.cpp b/contrib/moses2/FF/Distortion.cpp index 343e1d21f..9e55ff798 100644 --- a/contrib/moses2/FF/Distortion.cpp +++ b/contrib/moses2/FF/Distortion.cpp @@ -57,6 +57,7 @@ struct DistortionState_traditional: public FFState Distortion::Distortion(size_t startInd, const std::string &line) : StatefulFeatureFunction(startInd, line) { + m_completedHypo = false; ReadParameters(); } @@ -65,6 +66,16 @@ Distortion::~Distortion() // TODO Auto-generated destructor stub } +void Distortion::SetParameter(const std::string& key, const std::string& value) +{ + if (key == "completed-hypo") { + m_completedHypo = Scan(value); + } + else { + StatefulFeatureFunction::SetParameter(key, value); + } +} + FFState* Distortion::BlankState(MemPool &pool, const System &sys) const { return new (pool.Allocate()) DistortionState_traditional(); @@ -110,7 +121,7 @@ void Distortion::EvaluateWhenApplied(const ManagerBase &mgr, const DistortionState_traditional &prev = static_cast(prevState); SCORE distortionScore = CalculateDistortionScore(prev.range, - hypo.GetInputPath().range, prev.first_gap); + hypo.GetInputPath().range, prev.first_gap, hypo.GetBitmap()); //cerr << "distortionScore=" << distortionScore << endl; scores.PlusEquals(mgr.system, *this, distortionScore); @@ -123,11 +134,11 @@ void Distortion::EvaluateWhenApplied(const ManagerBase &mgr, } SCORE Distortion::CalculateDistortionScore(const Range &prev, const Range &curr, - const int FirstGap) const + const int FirstGap, const Bitmap &coverage) const { bool useEarlyDistortionCost = false; if (!useEarlyDistortionCost) { - return -(SCORE) 
ComputeDistortionDistance(prev, curr); + return -(SCORE) ComputeDistortionDistance(prev, curr, coverage); } else { /* Pay distortion score as soon as possible, from Moore and Quirk MT Summit 2007 @@ -168,7 +179,7 @@ SCORE Distortion::CalculateDistortionScore(const Range &prev, const Range &curr, } int Distortion::ComputeDistortionDistance(const Range& prev, - const Range& current) const + const Range& current, const Bitmap &coverage) const { int dist = 0; if (prev.GetNumWordsCovered() == 0) { @@ -176,8 +187,18 @@ int Distortion::ComputeDistortionDistance(const Range& prev, } else { dist = (int) prev.GetEndPos() - (int) current.GetStartPos() + 1; + dist = abs(dist); + + if (m_completedHypo && coverage.IsComplete()) { + dist += coverage.GetSize() - current.GetEndPos() - 1; + /* + cerr << "completed=" << coverage << " " << coverage.GetSize() << " " + << prev << " " + << current << " " << dist << endl; + */ + } } - return abs(dist); + return dist; } void Distortion::EvaluateWhenApplied(const SCFG::Manager &mgr, diff --git a/contrib/moses2/FF/Distortion.h b/contrib/moses2/FF/Distortion.h index 45577d1c3..bc843fe54 100644 --- a/contrib/moses2/FF/Distortion.h +++ b/contrib/moses2/FF/Distortion.h @@ -14,6 +14,7 @@ namespace Moses2 { +class Bitmap; class Distortion: public StatefulFeatureFunction { @@ -21,6 +22,8 @@ public: Distortion(size_t startInd, const std::string &line); virtual ~Distortion(); + virtual void SetParameter(const std::string& key, const std::string& value); + virtual FFState* BlankState(MemPool &pool, const System &sys) const; virtual void EmptyHypothesisState(FFState &state, const ManagerBase &mgr, const InputType &input, const Hypothesis &hypo) const; @@ -48,10 +51,12 @@ public: FFState &state) const; protected: - SCORE CalculateDistortionScore(const Range &prev, const Range &curr, - const int FirstGap) const; + bool m_completedHypo; - int ComputeDistortionDistance(const Range& prev, const Range& current) const; + SCORE CalculateDistortionScore(const 
Range &prev, const Range &curr, + const int FirstGap, const Bitmap &coverage) const; + + int ComputeDistortionDistance(const Range& prev, const Range& current, const Bitmap &coverage) const; }; From 29b0072edac3312a82564ea614a5c64030997061 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Mon, 2 Jan 2017 06:02:54 -0500 Subject: [PATCH 02/12] CreateProbingPT2 -> CreateProbingPT --- scripts/training/filter-model-given-input.pl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/training/filter-model-given-input.pl b/scripts/training/filter-model-given-input.pl index a16aeac4a..65b2e3502 100755 --- a/scripts/training/filter-model-given-input.pl +++ b/scripts/training/filter-model-given-input.pl @@ -228,7 +228,7 @@ while ( my $line = ) { $phrase_table_impl = "PhraseDictionaryOnDisk"; @toks = set_value( \@toks, "path", "$new_name.bin$table_flag" ); } - elsif ( $binarizer =~ /CreateProbingPT2/ ) { + elsif ( $binarizer =~ /CreateProbingPT/ ) { $phrase_table_impl = "ProbingPT"; @toks = set_value( \@toks, "path", "$new_name.probing$table_flag" ); } @@ -488,7 +488,7 @@ for ( my $i = 0 ; $i <= $#TABLE ; $i++ ) { my $cmd = "$binarizer $mid_file $new_file.bin"; safesystem($cmd) or die "Can't binarize"; } - elsif ( $binarizer =~ /CreateProbingPT2/ ) { + elsif ( $binarizer =~ /CreateProbingPT/ ) { my $cmd = "$binarizer --input-pt $mid_file --output-dir $new_file.probing"; if ($opt_hierarchical) { $cmd .= " --scfg"; @@ -509,8 +509,8 @@ for ( my $i = 0 ; $i <= $#TABLE ; $i++ ) { if ( $binarizer =~ /CreateOnDiskPt/ ) { $lexbin =~ s/CreateOnDiskPt/processLexicalTable/; } - elsif ( $binarizer =~ /CreateProbingPT2/ ) { - $lexbin =~ s/CreateProbingPT2/processLexicalTableMin/; + elsif ( $binarizer =~ /CreateProbingPT/ ) { + $lexbin =~ s/CreateProbingPT/processLexicalTableMin/; } $lexbin =~ s/PhraseTable/LexicalTable/; From cf93594af98e35329be7120d01255a98d0ad1fa4 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Mon, 2 Jan 2017 12:44:08 +0000 Subject: [PATCH 
03/12] re-implement -feature-overwrite --- contrib/moses2/FF/FeatureFunction.h | 2 +- contrib/moses2/FF/FeatureFunctions.cpp | 42 +++++++++++++++++++++++++- contrib/moses2/FF/FeatureFunctions.h | 5 ++- contrib/moses2/legacy/Parameter.cpp | 4 +-- 4 files changed, 48 insertions(+), 5 deletions(-) diff --git a/contrib/moses2/FF/FeatureFunction.h b/contrib/moses2/FF/FeatureFunction.h index d38c72b89..1e25fce39 100644 --- a/contrib/moses2/FF/FeatureFunction.h +++ b/contrib/moses2/FF/FeatureFunction.h @@ -80,6 +80,7 @@ public: return m_tuneable; } + virtual void SetParameter(const std::string& key, const std::string& value); // may have more factors than actually need, but not guaranteed. virtual void @@ -118,7 +119,6 @@ protected: std::vector > m_args; bool m_tuneable; - virtual void SetParameter(const std::string& key, const std::string& value); virtual void ReadParameters(); void ParseLine(const std::string &line); }; diff --git a/contrib/moses2/FF/FeatureFunctions.cpp b/contrib/moses2/FF/FeatureFunctions.cpp index 8ca145060..49a0ace67 100644 --- a/contrib/moses2/FF/FeatureFunctions.cpp +++ b/contrib/moses2/FF/FeatureFunctions.cpp @@ -103,8 +103,9 @@ void FeatureFunctions::Create() unkWP->SetParameter("suffix", m_system.options.unk.suffix); } } - } + + OverrideFeatures(); } FeatureFunction *FeatureFunctions::Create(const std::string &line) @@ -150,6 +151,17 @@ const FeatureFunction *FeatureFunctions::FindFeatureFunction( return NULL; } +FeatureFunction *FeatureFunctions::FindFeatureFunction( + const std::string &name) +{ + BOOST_FOREACH(const FeatureFunction *ff, m_featureFunctions){ + if (ff->GetName() == name) { + return const_cast(ff); + } + } + return NULL; +} + const PhraseTable *FeatureFunctions::GetPhraseTableExcludeUnknownWordPenalty(size_t ptInd) { // assume only 1 unk wp @@ -243,5 +255,33 @@ void FeatureFunctions::ShowWeights(const Weights &allWeights) } } +void FeatureFunctions::OverrideFeatures() +{ + const Parameter ¶meter = m_system.params; + + const 
PARAM_VEC *params = parameter.GetParam("feature-overwrite"); + for (size_t i = 0; params && i < params->size(); ++i) { + const string &str = params->at(i); + vector toks = Tokenize(str); + UTIL_THROW_IF2(toks.size() <= 1, "Incorrect format for feature override: " << str); + + FeatureFunction *ff = FindFeatureFunction(toks[0]); + UTIL_THROW_IF2(ff == NULL, "Feature function not found: " << toks[0]); + + for (size_t j = 1; j < toks.size(); ++j) { + const string &keyValStr = toks[j]; + vector keyVal = Tokenize(keyValStr, "="); + UTIL_THROW_IF2(keyVal.size() != 2, "Incorrect format for parameter override: " << keyValStr); + + cerr << "Override " << ff->GetName() << " " + << keyVal[0] << "=" << keyVal[1] << endl; + + ff->SetParameter(keyVal[0], keyVal[1]); + + } + } + +} + } diff --git a/contrib/moses2/FF/FeatureFunctions.h b/contrib/moses2/FF/FeatureFunctions.h index 74c77c7e6..2232e2a97 100644 --- a/contrib/moses2/FF/FeatureFunctions.h +++ b/contrib/moses2/FF/FeatureFunctions.h @@ -95,10 +95,13 @@ protected: System &m_system; size_t m_ffStartInd; + FeatureRegistry m_registry; + FeatureFunction *Create(const std::string &line); std::string GetDefaultName(const std::string &stub); + void OverrideFeatures(); + FeatureFunction *FindFeatureFunction(const std::string &name); - FeatureRegistry m_registry; }; } diff --git a/contrib/moses2/legacy/Parameter.cpp b/contrib/moses2/legacy/Parameter.cpp index 666eb0e98..bd2cd4676 100644 --- a/contrib/moses2/legacy/Parameter.cpp +++ b/contrib/moses2/legacy/Parameter.cpp @@ -94,8 +94,8 @@ Parameter::Parameter() AddParam(search_opts, "weight", "weights for ALL models, 1 per line 'WeightName value'. Weight names can be repeated"); - //AddParam(search_opts, "feature-overwrite", - // "Override arguments in a particular feature function with a particular key. Format: -feature-overwrite \"FeatureName key=value\""); + AddParam(search_opts, "feature-overwrite", + "Override arguments in a particular feature function with a particular key. 
Format: -feature-overwrite \"FeatureName key=value\""); po::options_description tune_opts("Options used in tuning."); AddParam(tune_opts, "weight-overwrite", From ab2e48415fa50faa41106f9f69339ff4ab01de73 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Mon, 2 Jan 2017 15:55:33 -0500 Subject: [PATCH 04/12] add back -text-type for EMS --- contrib/moses2/legacy/Parameter.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/contrib/moses2/legacy/Parameter.cpp b/contrib/moses2/legacy/Parameter.cpp index bd2cd4676..870a49f2a 100644 --- a/contrib/moses2/legacy/Parameter.cpp +++ b/contrib/moses2/legacy/Parameter.cpp @@ -373,6 +373,9 @@ Parameter::Parameter() /////////////////////////////////////////////////////////////////////////////////////// // DEPRECATED options po::options_description deprec_opts("Deprecated Options"); + AddParam(deprec_opts, "text-type", + "DEPRECATED. DO NOT USE. should be one of dev/devtest/test, used for domain adaptation features"); + /* AddParam(deprec_opts, "link-param-count", "DEPRECATED. DO NOT USE. Number of parameters on word links when using confusion networks or lattices (default = 1)"); @@ -412,8 +415,6 @@ Parameter::Parameter() "DEPRECATED. DO NOT USE. weight for unknown word penalty"); AddParam(deprec_opts, "weight-e", "e", "DEPRECATED. DO NOT USE. weight for word deletion"); - AddParam(deprec_opts, "text-type", - "DEPRECATED. DO NOT USE. should be one of dev/devtest/test, used for domain adaptation features"); AddParam(deprec_opts, "input-scores", "DEPRECATED. DO NOT USE. 2 numbers on 2 lines - [1] of scores on each edge of a confusion network or lattice input (default=1). [2] Number of 'real' word scores (0 or 1. default=0)"); AddParam(deprec_opts, "dlm-model", From ff12a13eaaef2e6272123d5865f516ed4513bc07 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Mon, 2 Jan 2017 16:37:56 -0500 Subject: [PATCH 05/12] re-tune if decoder changed. 
eg moses -> moses2 --- scripts/ems/experiment.meta | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index 16fc20336..d6e6dc133 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -1196,7 +1196,7 @@ tune default-name: tuning/moses.ini tmp-name: tuning/tmp final-model: yes - rerun-on-change: decoder-settings tuning-settings nbest lambda async + rerun-on-change: decoder decoder-settings tuning-settings nbest lambda async not-error: trans: No such file or directory thot-tune in: TRAINING:config input reference From 80bd5597578cddc72244c8f53d18a2aabdca27b9 Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Tue, 3 Jan 2017 00:00:36 +0000 Subject: [PATCH 06/12] daily automatic beautifier --- moses/LM/InMemoryPerSentenceOnDemandLM.cpp | 10 ++++++---- moses/LM/InMemoryPerSentenceOnDemandLM.h | 4 ++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/moses/LM/InMemoryPerSentenceOnDemandLM.cpp b/moses/LM/InMemoryPerSentenceOnDemandLM.cpp index 12ef78f4e..364aebe42 100644 --- a/moses/LM/InMemoryPerSentenceOnDemandLM.cpp +++ b/moses/LM/InMemoryPerSentenceOnDemandLM.cpp @@ -17,7 +17,7 @@ using namespace std; namespace Moses { - InMemoryPerSentenceOnDemandLM::InMemoryPerSentenceOnDemandLM(const std::string &line) : LanguageModel(line), initialized(false) +InMemoryPerSentenceOnDemandLM::InMemoryPerSentenceOnDemandLM(const std::string &line) : LanguageModel(line), initialized(false) { ReadParameters(); } @@ -26,7 +26,8 @@ InMemoryPerSentenceOnDemandLM::~InMemoryPerSentenceOnDemandLM() { } -void InMemoryPerSentenceOnDemandLM::InitializeForInput(ttasksptr const& ttask) { +void InMemoryPerSentenceOnDemandLM::InitializeForInput(ttasksptr const& ttask) +{ // The context scope object for this translation task // contains a map of translation task-specific data @@ -63,14 +64,15 @@ void InMemoryPerSentenceOnDemandLM::InitializeForInput(ttasksptr const& ttask) { VERBOSE(1, 
filename); if (initialized) { - VERBOSE(1, "\tLM initialized\n"); + VERBOSE(1, "\tLM initialized\n"); } // std::remove(filename); } -LanguageModelKen& InMemoryPerSentenceOnDemandLM::GetPerThreadLM() const { +LanguageModelKen& InMemoryPerSentenceOnDemandLM::GetPerThreadLM() const +{ LanguageModelKen *lm; lm = m_perThreadLM.get(); diff --git a/moses/LM/InMemoryPerSentenceOnDemandLM.h b/moses/LM/InMemoryPerSentenceOnDemandLM.h index f0c1effa7..022ba9289 100644 --- a/moses/LM/InMemoryPerSentenceOnDemandLM.h +++ b/moses/LM/InMemoryPerSentenceOnDemandLM.h @@ -89,7 +89,7 @@ public: virtual void sync() { GetPerThreadLM().sync(); } - + virtual void SetFFStateIdx(int state_idx) { if (initialized) { GetPerThreadLM().SetFFStateIdx(state_idx); @@ -107,7 +107,7 @@ public: GetPerThreadLM().ReportHistoryOrder(out, phrase); } } - + virtual void EvaluateInIsolation(const Phrase &source , const TargetPhrase &targetPhrase , ScoreComponentCollection &scoreBreakdown From 02772c07dec22acb1d50397651ad189b0f97e1e6 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Tue, 3 Jan 2017 11:06:26 +0000 Subject: [PATCH 07/12] revert changes to Distortion FF --- contrib/moses2/FF/Distortion.cpp | 32 +++++--------------------------- contrib/moses2/FF/Distortion.h | 9 ++------- 2 files changed, 7 insertions(+), 34 deletions(-) diff --git a/contrib/moses2/FF/Distortion.cpp b/contrib/moses2/FF/Distortion.cpp index 9e55ff798..1d7b7246d 100644 --- a/contrib/moses2/FF/Distortion.cpp +++ b/contrib/moses2/FF/Distortion.cpp @@ -57,7 +57,6 @@ struct DistortionState_traditional: public FFState Distortion::Distortion(size_t startInd, const std::string &line) : StatefulFeatureFunction(startInd, line) { - m_completedHypo = false; ReadParameters(); } @@ -66,16 +65,6 @@ Distortion::~Distortion() // TODO Auto-generated destructor stub } -void Distortion::SetParameter(const std::string& key, const std::string& value) -{ - if (key == "completed-hypo") { - m_completedHypo = Scan(value); - } - else { - 
StatefulFeatureFunction::SetParameter(key, value); - } -} - FFState* Distortion::BlankState(MemPool &pool, const System &sys) const { return new (pool.Allocate()) DistortionState_traditional(); @@ -121,7 +110,7 @@ void Distortion::EvaluateWhenApplied(const ManagerBase &mgr, const DistortionState_traditional &prev = static_cast(prevState); SCORE distortionScore = CalculateDistortionScore(prev.range, - hypo.GetInputPath().range, prev.first_gap, hypo.GetBitmap()); + hypo.GetInputPath().range, prev.first_gap); //cerr << "distortionScore=" << distortionScore << endl; scores.PlusEquals(mgr.system, *this, distortionScore); @@ -134,11 +123,11 @@ void Distortion::EvaluateWhenApplied(const ManagerBase &mgr, } SCORE Distortion::CalculateDistortionScore(const Range &prev, const Range &curr, - const int FirstGap, const Bitmap &coverage) const + const int FirstGap) const { bool useEarlyDistortionCost = false; if (!useEarlyDistortionCost) { - return -(SCORE) ComputeDistortionDistance(prev, curr, coverage); + return -(SCORE) ComputeDistortionDistance(prev, curr); } else { /* Pay distortion score as soon as possible, from Moore and Quirk MT Summit 2007 @@ -179,7 +168,7 @@ SCORE Distortion::CalculateDistortionScore(const Range &prev, const Range &curr, } int Distortion::ComputeDistortionDistance(const Range& prev, - const Range& current, const Bitmap &coverage) const + const Range& current) const { int dist = 0; if (prev.GetNumWordsCovered() == 0) { @@ -187,18 +176,8 @@ int Distortion::ComputeDistortionDistance(const Range& prev, } else { dist = (int) prev.GetEndPos() - (int) current.GetStartPos() + 1; - dist = abs(dist); - - if (m_completedHypo && coverage.IsComplete()) { - dist += coverage.GetSize() - current.GetEndPos() - 1; - /* - cerr << "completed=" << coverage << " " << coverage.GetSize() << " " - << prev << " " - << current << " " << dist << endl; - */ - } } - return dist; + return abs(dist); } void Distortion::EvaluateWhenApplied(const SCFG::Manager &mgr, @@ -209,4 +188,3 
@@ void Distortion::EvaluateWhenApplied(const SCFG::Manager &mgr, } } - diff --git a/contrib/moses2/FF/Distortion.h b/contrib/moses2/FF/Distortion.h index bc843fe54..45577d1c3 100644 --- a/contrib/moses2/FF/Distortion.h +++ b/contrib/moses2/FF/Distortion.h @@ -14,7 +14,6 @@ namespace Moses2 { -class Bitmap; class Distortion: public StatefulFeatureFunction { @@ -22,8 +21,6 @@ public: Distortion(size_t startInd, const std::string &line); virtual ~Distortion(); - virtual void SetParameter(const std::string& key, const std::string& value); - virtual FFState* BlankState(MemPool &pool, const System &sys) const; virtual void EmptyHypothesisState(FFState &state, const ManagerBase &mgr, const InputType &input, const Hypothesis &hypo) const; @@ -51,12 +48,10 @@ public: FFState &state) const; protected: - bool m_completedHypo; - SCORE CalculateDistortionScore(const Range &prev, const Range &curr, - const int FirstGap, const Bitmap &coverage) const; + const int FirstGap) const; - int ComputeDistortionDistance(const Range& prev, const Range& current, const Bitmap &coverage) const; + int ComputeDistortionDistance(const Range& prev, const Range& current) const; }; From 2a5e40ed60d351f05ca58ad3be6ec0865d08373f Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Wed, 4 Jan 2017 22:01:45 -0600 Subject: [PATCH 08/12] New file: Lithuanian --- .../nonbreaking_prefix.lt | 110 ++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 scripts/share/nonbreaking_prefixes/nonbreaking_prefix.lt diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.lt b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.lt new file mode 100644 index 000000000..d7829e3c0 --- /dev/null +++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.lt @@ -0,0 +1,110 @@ +# Anything in this file, followed by a period (and an upper-case word), +# does NOT indicate an end-of-sentence marker. +# Special cases are included for prefixes that ONLY appear before 0-9 numbers. 
+ +# Any single upper case letter followed by a period is not a sentence ender +# (excluding I occasionally, but we leave it in) +# usually upper case letters are initials in a name +A +Ā +B +C +Č +D +E +Ē +F +G +Ģ +H +I +Ī +J +K +Ķ +L +Ļ +M +N +Ņ +O +P +Q +R +S +Š +T +U +Ū +V +W +X +Y +Z +Ž + +# Abbreviations m. menesis d. diena g. gimes +m +d +g + +# Day and month abbreviations +# Pirmadienis Penktadienis +Pr +Pn +Pirm +Antr +Treč +Ketv +Penkt +Šešt +Sekm +Saus +Vas +Kov +Bal +Geg +Birž +Liep +Rugpj +Rugs +Spal +Lapkr +Gruod + +# List of titles. These are often followed by upper-case names, but do +# not indicate sentence breaks +# +# Gerbiamasis +Gerb + +# XXX TODO .. Below are not quite correct, copied from latvian +dr +Dr +med +prof +Prof +inž +Inž +ist.loc +Ist.loc +kor.loc +Kor.loc +v.i +vietn +Vietn + +# misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT +# fall into this category - it sometimes ends a sentence) +# angl angliskai +# dab dabartine +angl +dab + + +#Numbers only. These should only induce breaks when followed by a numeric sequence +# add NUMERIC_ONLY after the word for this function +#This case is mostly for the english "No." which can either be a sentence of its own, or +#if followed by a number, a non-breaking prefix +No #NUMERIC_ONLY# +Nr #NUMERIC_ONLY# From 3ef84b133cc8bf64862b3c2dad254e7043439fb7 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Wed, 4 Jan 2017 22:30:53 -0600 Subject: [PATCH 09/12] More abbreviations --- .../nonbreaking_prefix.lt | 277 +++++++++++++++++- 1 file changed, 274 insertions(+), 3 deletions(-) diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.lt b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.lt index d7829e3c0..4e2f6677e 100644 --- a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.lt +++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.lt @@ -43,12 +43,11 @@ Y Z Ž -# Abbreviations m. menesis d. diena g. gimes +# Day and month abbreviations +# m. 
menesis d. diena g. gimes m d g - -# Day and month abbreviations # Pirmadienis Penktadienis Pr Pn @@ -72,6 +71,278 @@ Spal Lapkr Gruod +# Technical terms, abbreviations used in guidebooks, advertisments, etc. +# Generally lower-case. +air +# airiškai +amer +# amerikanizmas +anat +# anatomija +arab +# arabų +archeol +archit +asm +# asmuo +astr +# astronomija +austral +# australiškai +aut +# automobilis +av +# aviacija +bažn +bdv +# būdvardis +bibl +# Biblija +biol +# biologija +bot +# botanika +buh +# buhalterija +chem +# chemija +d +# didysis +dgs +# daugiskaita +dial +# dialektizmas +dipl +dktv +# daiktavardis +džn +# dažnai +ekon +el +# elektra +esam +# esamasis laikas +euf +# eufemizmas +fam +# familiariai +farm +# farmacija +filos +# filosofija +fin +# finansai +fiz +# fizika +fiziol +flk +# folkloras +fon +# fonetika +fot +# fotografija +geod +# geodezija +geogr +geol +# geologija +geom +# geometrija +glžk +gr +# graikų +gram +her +# heraldika +hidr +# hidrotechnika +ind +# Indų +iron +# ironiškai +isp +# ispanų +ist +# istorija +it +# italų +įv +reikšm +įv.reikšm +# įvairiomis reikšmėmis +jap +# japonų +juok +# juokaujamai +jūr +# jūrininkystė +kalb +# kalbotyra +kar +# karyba +kas +# kasyba +kin +# kinematografija +klaus +# klausiamasis +knyg +# knyginis +kom +# komercija +komp +# kompiuteris +kosm +# kosmonautika +kt +# kitas +kul +# kulinarija +kuop +# kuopine +l +# laikas +lit +# literatūrinis +log +# logika +lot +# lotynų +mat +# matematika +maž +# mažybinis +med +# medicina +medž +# medžioklė +men +# menas +menk +# menkinamai +metal +# metalurgija +meteor +min +# mineralogija +mit +# mitologija +mok +# mokyklinis +muz +# muzikinis +n +# naujasis +neig +# neigiamasis +niek +# niekinamai +ofic +# oficialus +opt +# optika +p +# pietūs +pan +# panašiai +parl +# parlamentas +pat +# patarlė +paž +# pažodžiui +plg +# palygink +poet +# poetizmas +poligr +# poligrafija +polit +# politika +ppr +# paprastai +pr +# prancūzų +prk +# perkeltine +psn +# pasenęs žodis 
+psich +# psichologija +pvz +# pavyzdžiui +r +# rytai +rad +# radiotechnika +rel +# religija +ret +# retai +rus +# rusų +sen +# senasis +sl +# slengas +spec +# specialus +sport +stat +# statyba +sudurt +# sudurtinis +sutr +# sutrumpintas +š +# šiaurė +šach +# šachmatai +šiaur +škot +# škotiškai +šnek +# šnekamoji +teatr +tech +# technika +teig +# teigiamas +teis +# teisė +tekst +# tekstilė +tel +v +# tik vyriškosios, vakarai +t.p +t +p +# taip pat +vaik +# vaikų +vart +# vartojama +vet +# veterinarija +vid +# vidurinis +vksm +# veiksmažodis +vns +# vienaskaita +vok +# vokiečių +vulg +# vulgariai +zool +žr +# žiūrėk +ž.ū +ž +ū +# žemės ūkis + # List of titles. These are often followed by upper-case names, but do # not indicate sentence breaks # From d10ba6f049d8dc08d95a6a6e6934adf808160320 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Wed, 4 Jan 2017 23:52:28 -0600 Subject: [PATCH 10/12] More abbreviations for LLithuanian. --- .../nonbreaking_prefix.lt | 369 ++++++++++++++++-- 1 file changed, 343 insertions(+), 26 deletions(-) diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.lt b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.lt index 4e2f6677e..fa72196d9 100644 --- a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.lt +++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.lt @@ -43,11 +43,18 @@ Y Z Ž +# Initialis -- Džonas +Dz +Dž +Just + # Day and month abbreviations # m. menesis d. diena g. 
gimes m +mėn d g +gim # Pirmadienis Penktadienis Pr Pn @@ -71,6 +78,279 @@ Spal Lapkr Gruod +# Business, governmental, geographical terms +a +# aikštė +adv +# advokatas +akad +# akademikas +aklg +# akligatvis +akt +# aktorius +al +# alėja +A.V +# antspaudo vieta +aps +apskr +# apskritis +apyg +# apygarda +aps +apskr +# apskritis +asist +# asistentas +asmv +avd +# asmenvardis +a.k +asm +asm.k +# asmens kodas +atsak +# atsakingasis +atsisk +sąsk +# atsiskaitomoji sąskaita +aut +# autorius +b +k +b.k +# banko kodas +bkl +# bakalauras +bt +# butas +buv +# buvęs, -usi +dail +# dailininkas +dek +# dekanas +dėst +# dėstytojas +dir +# direktorius +dirig +# dirigentas +doc +# docentas +drp +# durpynas +dš +# dešinysis +egz +# egzempliorius +eil +# eilutė +ekon +# ekonomika +el +# elektroninis +etc +ež +# ežeras +faks +# faksas +fak +# fakultetas +gen +# generolas +gyd +# gydytojas +gv +# gyvenvietė +įl +# įlanka +Įn +# įnagininkas +insp +# inspektorius +pan +# ir panašiai +t.t +# ir taip toliau +k.a +# kaip antai +kand +# kandidatas +kat +# katedra +kyš +# kyšulys +kl +# klasė +kln +# kalnas +kn +# knyga +koresp +# korespondentas +kpt +# kapitonas +kr +# kairysis +kt +# kitas +kun +# kunigas +l +e +p +l.e.p +# laikinai einantis pareigas +ltn +# leitenantas +m +mst +# miestas +m.e +# mūsų eros +m.m +# mokslo metai +mot +# moteris +mstl +# miestelis +mgr +# magistras +mgnt +# magistrantas +mjr +# majoras +mln +# milijonas +mlrd +# milijardas +mok +# mokinys +mokyt +# mokytojas +moksl +# mokslinis +nkt +# nekaitomas +ntk +# neteiktinas +Nr +nr +# numeris +p +# ponas +p.d +a.d +# pašto dėžutė, abonentinė dėžutė +p.m.e +# prieš mūsų erą +pan +# ir panašiai +pav +# paveikslas +pavad +# pavaduotojas +pirm +# pirmininkas +pl +# plentas +plg +# palygink +plk +# pulkininkas; pelkė +pr +# prospektas +Kr +pr.Kr +# prieš Kristų +prok +# prokuroras +prot +# protokolas +pss +# pusiasalis +pšt +# paštas +pvz +# pavyzdžiui +r +# rajonas +red +# redaktorius +rš +# raštų kalbos +sąs +# 
sąsiuvinis +saviv +sav +# savivaldybė +sekr +# sekretorius +sen +# seniūnija, seniūnas +sk +# skaityk; skyrius +skg +# skersgatvis +skyr +sk +# skyrius +skv +# skveras +sp +# spauda; spaustuvė +spec +# specialistas +sr +# sritis +st +# stotis +str +# straipsnis +stud +# studentas +š +š.m +# šių metų +šnek +# šnekamosios +tir +# tiražas +tūkst +# tūkstantis +up +# upė +upl +# upelis +vad +# vadinamasis, -oji +vlsč +# valsčius +ved +# vedėjas +vet +# veterinarija +virš +# viršininkas, viršaitis +vyr +# vyriausiasis, -ioji; vyras +vyresn +# vyresnysis +vlsč +# valsčius +vs +# viensėdis +Vt +vt +# vietininkas +vtv +vv +# vietovardis +žml +# žemėlapis + # Technical terms, abbreviations used in guidebooks, advertisments, etc. # Generally lower-case. air @@ -79,6 +359,8 @@ amer # amerikanizmas anat # anatomija +angl +# angl. angliskai arab # arabų archeol @@ -102,12 +384,21 @@ biol # biologija bot # botanika +brt +# burtai, burtažodis. +brus +# baltarusių buh # buhalterija chem # chemija -d -# didysis +col +# collectivum +con +conj +# conjunctivus, jungtukas +dab +# dab. 
dabartine dgs # daugiskaita dial @@ -128,6 +419,8 @@ fam # familiariai farm # farmacija +filol +# filologija filos # filosofija fin @@ -135,6 +428,7 @@ fin fiz # fizika fiziol +# fiziologija flk # folkloras fon @@ -163,6 +457,7 @@ iron isp # ispanų ist +istor # istorija it # italų @@ -204,6 +499,8 @@ l # laikas lit # literatūrinis +lingv +# lingvistika log # logika lot @@ -229,18 +526,24 @@ mit # mitologija mok # mokyklinis +ms +# mįslė muz # muzikinis n # naujasis neig # neigiamasis +neol +# neologizmas niek # niekinamai ofic # oficialus opt # optika +orig +# original p # pietūs pan @@ -255,16 +558,25 @@ plg # palygink poet # poetizmas +poez +# poezija poligr # poligrafija polit # politika ppr # paprastai +pranc pr -# prancūzų +# prancūzų, prūsų +priet +# prietaras +prek +# prekyba prk # perkeltine +prs +# persona, asmuo psn # pasenęs žodis psich @@ -284,7 +596,9 @@ rus sen # senasis sl -# slengas +# slengas, slavų +sov +# sovietinis spec # specialus sport @@ -294,6 +608,8 @@ sudurt # sudurtinis sutr # sutrumpintas +suv +# suvalkiečių š # šiaurė šach @@ -305,6 +621,7 @@ sutr # šnekamoji teatr tech +techn # technika teig # teigiamas @@ -313,12 +630,19 @@ teis tekst # tekstilė tel +# telefonas +teol +# teologija v # tik vyriškosios, vakarai t.p t p -# taip pat +# ir taip pat +t.t +# ir taip toliau +t.y +# tai yra vaik # vaikų vart @@ -336,6 +660,7 @@ vok vulg # vulgariai zool +# zoologija žr # žiūrėk ž.ū @@ -346,31 +671,24 @@ zool # List of titles. These are often followed by upper-case names, but do # not indicate sentence breaks # +# Jo Eminencija +Em. # Gerbiamasis Gerb - -# XXX TODO .. Below are not quite correct, copied from latvian -dr -Dr -med -prof +gerb +# malonus +malon +# profesorius Prof +prof +# daktaras (mokslų) +Dr +dr +habil +med +# inž inžinierius inž Inž -ist.loc -Ist.loc -kor.loc -Kor.loc -v.i -vietn -Vietn - -# misc - odd period-ending items that NEVER indicate breaks (p.m. 
does NOT -# fall into this category - it sometimes ends a sentence) -# angl angliskai -# dab dabartine -angl -dab #Numbers only. These should only induce breaks when followed by a numeric sequence @@ -378,4 +696,3 @@ dab #This case is mostly for the english "No." which can either be a sentence of its own, or #if followed by a number, a non-breaking prefix No #NUMERIC_ONLY# -Nr #NUMERIC_ONLY# From ab6816f9a755f37de00090829f62848372e8222e Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Thu, 5 Jan 2017 10:08:06 -0600 Subject: [PATCH 11/12] Purely cosmetic cleanup. Use same indentation style throughout; wrap long lines; capitalize sentences; add punctuation; remove trailing whitespace. --- scripts/ems/support/split-sentences.perl | 45 ++++++++++++------------ 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/scripts/ems/support/split-sentences.perl b/scripts/ems/support/split-sentences.perl index f72767054..7bad038a1 100755 --- a/scripts/ems/support/split-sentences.perl +++ b/scripts/ems/support/split-sentences.perl @@ -29,10 +29,10 @@ while (@ARGV) { } if ($HELP) { - print "Usage ./split-sentences.perl (-l [en|de|...]) [-q] [-b] < textfile > splitfile\n"; - print "-q: quiet mode\n"; - print "-b: no output buffering (for use in bidirectional pipes)\n"; - exit; + print "Usage ./split-sentences.pl (-l [en|de|...]) [-q] [-b] < textfile > splitfile\n"; + print "-q: quiet mode\n"; + print "-b: no output buffering (for use in bidirectional pipes)\n"; + exit; } if (!$QUIET) { print STDERR "Sentence Splitter v3\n"; @@ -64,9 +64,9 @@ if (-e "$prefixfile") { close(PREFIX); } -##loop text, add lines together until we get a blank line or a

+## Loop over text, add lines together until we get a blank line or a <p>

 my $text = ""; -while() { +while () { chop; if (/^<.+>$/ || /^\s*$/) { #time to process this block, we've hit a blank or <p>

@@ -79,7 +79,7 @@ while() { $text .= $_. " "; } } -#do the leftover text +# Do the leftover text. &do_it_for($text,"") if $text; @@ -91,28 +91,32 @@ sub do_it_for { } sub preprocess { - #this is one paragraph + # This is one paragraph. my($text) = @_; - # clean up spaces at head and tail of each line as well as any double-spacing + # Clean up spaces at head and tail of each line, as well as + # any double-spacing. $text =~ s/ +/ /g; $text =~ s/\n /\n/g; $text =~ s/ \n/\n/g; $text =~ s/^ //g; $text =~ s/ $//g; - #####add sentence breaks as needed##### + ##### Add sentence breaks as needed ##### - #non-period end of sentence markers (?!) followed by sentence starters. + # Non-period end of sentence markers (?!) followed by sentence starters. $text =~ s/([?!]) +([\'\"\(\[\¿\¡\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g; - #multi-dots followed by sentence starters + # Multi-dots followed by sentence starters. $text =~ s/(\.[\.]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g; - # add breaks for sentences that end with some sort of punctuation inside a quote or parenthetical and are followed by a possible sentence starter punctuation and upper case + # Add breaks for sentences that end with some sort of punctuation + # inside a quote or parenthetical and are followed by a possible + # sentence starter punctuation and upper case. $text =~ s/([?!\.][\ ]*[\'\"\)\]\p{IsPf}]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\ ]*[\p{IsUpper}])/$1\n$2/g; - # add breaks for sentences that end with some sort of punctuation are followed by a sentence starter punctuation and upper case + # Add breaks for sentences that end with some sort of punctuation, + # and are followed by a sentence starter punctuation and upper case. $text =~ s/([?!\.]) +([\'\"\(\[\¿\¡\p{IsPi}]+[\ ]*[\p{IsUpper}])/$1\n$2/g; # special punctuation cases are covered. Check all remaining periods. 
@@ -130,30 +134,27 @@ sub preprocess { } elsif ($words[$i] =~ /(\.)[\p{IsUpper}\-]+(\.+)$/) { #not breaking - upper case acronym } elsif($words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[\p{IsUpper}0-9])/) { - #the next word has a bunch of initial quotes, maybe a space, then either upper case or a number + # The next word has a bunch of initial quotes, maybe a + # space, then either upper case or a number $words[$i] = $words[$i]."\n" unless ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 2 && !$starting_punct && ($words[$i+1] =~ /^[0-9]+/)); #we always add a return for these unless we have a numeric non-breaker and a number start } - } $text = $text.$words[$i]." "; } - #we stopped one token from the end to allow for easy look-ahead. Append it now. + # We stopped one token from the end to allow for easy look-ahead. Append it now. $text = $text.$words[$i]; - # clean up spaces at head and tail of each line as well as any double-spacing + # Clean up spaces at head and tail of each line as well as any double-spacing $text =~ s/ +/ /g; $text =~ s/\n /\n/g; $text =~ s/ \n/\n/g; $text =~ s/^ //g; $text =~ s/ $//g; - #add trailing break + # Add trailing break. $text .= "\n" unless $text =~ /\n$/; return $text; - } - - From 9f5500a3a8df10ff0a99238c8c81679c9b9420a2 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Thu, 5 Jan 2017 10:09:34 -0600 Subject: [PATCH 12/12] oops. 
--- scripts/ems/support/split-sentences.perl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ems/support/split-sentences.perl b/scripts/ems/support/split-sentences.perl index 7bad038a1..7f2fb3ced 100755 --- a/scripts/ems/support/split-sentences.perl +++ b/scripts/ems/support/split-sentences.perl @@ -29,7 +29,7 @@ while (@ARGV) { } if ($HELP) { - print "Usage ./split-sentences.pl (-l [en|de|...]) [-q] [-b] < textfile > splitfile\n"; + print "Usage ./split-sentences.perl (-l [en|de|...]) [-q] [-b] < textfile > splitfile\n"; print "-q: quiet mode\n"; print "-b: no output buffering (for use in bidirectional pipes)\n"; exit;