This commit is contained in:
Lane Schwartz 2017-01-05 15:53:04 -06:00
commit 171edca393
11 changed files with 784 additions and 40 deletions

View File

@ -188,4 +188,3 @@ void Distortion::EvaluateWhenApplied(const SCFG::Manager &mgr,
}
}

View File

@ -80,6 +80,7 @@ public:
return m_tuneable;
}
virtual void SetParameter(const std::string& key, const std::string& value);
// may have more factors than actually need, but not guaranteed.
virtual void
@ -118,7 +119,6 @@ protected:
std::vector<std::vector<std::string> > m_args;
bool m_tuneable;
virtual void SetParameter(const std::string& key, const std::string& value);
virtual void ReadParameters();
void ParseLine(const std::string &line);
};

View File

@ -103,8 +103,9 @@ void FeatureFunctions::Create()
unkWP->SetParameter("suffix", m_system.options.unk.suffix);
}
}
}
OverrideFeatures();
}
FeatureFunction *FeatureFunctions::Create(const std::string &line)
@ -150,6 +151,17 @@ const FeatureFunction *FeatureFunctions::FindFeatureFunction(
return NULL;
}
/**
 * Look up a registered feature function by name and hand back a mutable
 * pointer to it.
 *
 * @param name  name to compare against each feature function's GetName()
 * @return the first feature function in m_featureFunctions whose name equals
 *         `name`, or NULL when none matches
 */
FeatureFunction *FeatureFunctions::FindFeatureFunction(
  const std::string &name)
{
  FeatureFunction *match = NULL;

  BOOST_FOREACH(const FeatureFunction *candidate, m_featureFunctions) {
    if (candidate->GetName() == name) {
      // The collection is iterated through const pointers; drop the
      // constness so the caller can modify the feature function.
      match = const_cast<FeatureFunction *>(candidate);
      break;
    }
  }

  return match;
}
const PhraseTable *FeatureFunctions::GetPhraseTableExcludeUnknownWordPenalty(size_t ptInd)
{
// assume only 1 unk wp
@ -243,5 +255,33 @@ void FeatureFunctions::ShowWeights(const Weights &allWeights)
}
}
void FeatureFunctions::OverrideFeatures()
{
const Parameter &parameter = m_system.params;
const PARAM_VEC *params = parameter.GetParam("feature-overwrite");
for (size_t i = 0; params && i < params->size(); ++i) {
const string &str = params->at(i);
vector<string> toks = Tokenize(str);
UTIL_THROW_IF2(toks.size() <= 1, "Incorrect format for feature override: " << str);
FeatureFunction *ff = FindFeatureFunction(toks[0]);
UTIL_THROW_IF2(ff == NULL, "Feature function not found: " << toks[0]);
for (size_t j = 1; j < toks.size(); ++j) {
const string &keyValStr = toks[j];
vector<string> keyVal = Tokenize(keyValStr, "=");
UTIL_THROW_IF2(keyVal.size() != 2, "Incorrect format for parameter override: " << keyValStr);
cerr << "Override " << ff->GetName() << " "
<< keyVal[0] << "=" << keyVal[1] << endl;
ff->SetParameter(keyVal[0], keyVal[1]);
}
}
}
}

View File

@ -95,10 +95,13 @@ protected:
System &m_system;
size_t m_ffStartInd;
FeatureRegistry m_registry;
FeatureFunction *Create(const std::string &line);
std::string GetDefaultName(const std::string &stub);
void OverrideFeatures();
FeatureFunction *FindFeatureFunction(const std::string &name);
FeatureRegistry m_registry;
};
}

View File

@ -94,8 +94,8 @@ Parameter::Parameter()
AddParam(search_opts, "weight",
"weights for ALL models, 1 per line 'WeightName value'. Weight names can be repeated");
//AddParam(search_opts, "feature-overwrite",
// "Override arguments in a particular feature function with a particular key. Format: -feature-overwrite \"FeatureName key=value\"");
AddParam(search_opts, "feature-overwrite",
"Override arguments in a particular feature function with a particular key. Format: -feature-overwrite \"FeatureName key=value\"");
po::options_description tune_opts("Options used in tuning.");
AddParam(tune_opts, "weight-overwrite",
@ -373,6 +373,9 @@ Parameter::Parameter()
///////////////////////////////////////////////////////////////////////////////////////
// DEPRECATED options
po::options_description deprec_opts("Deprecated Options");
AddParam(deprec_opts, "text-type",
"DEPRECATED. DO NOT USE. should be one of dev/devtest/test, used for domain adaptation features");
/*
AddParam(deprec_opts, "link-param-count",
"DEPRECATED. DO NOT USE. Number of parameters on word links when using confusion networks or lattices (default = 1)");
@ -412,8 +415,6 @@ Parameter::Parameter()
"DEPRECATED. DO NOT USE. weight for unknown word penalty");
AddParam(deprec_opts, "weight-e", "e",
"DEPRECATED. DO NOT USE. weight for word deletion");
AddParam(deprec_opts, "text-type",
"DEPRECATED. DO NOT USE. should be one of dev/devtest/test, used for domain adaptation features");
AddParam(deprec_opts, "input-scores",
"DEPRECATED. DO NOT USE. 2 numbers on 2 lines - [1] of scores on each edge of a confusion network or lattice input (default=1). [2] Number of 'real' word scores (0 or 1. default=0)");
AddParam(deprec_opts, "dlm-model",

View File

@ -17,7 +17,7 @@ using namespace std;
namespace Moses
{
InMemoryPerSentenceOnDemandLM::InMemoryPerSentenceOnDemandLM(const std::string &line) : LanguageModel(line), m_factorType(0)
InMemoryPerSentenceOnDemandLM::InMemoryPerSentenceOnDemandLM(const std::string &line) : LanguageModel(line), initialized(false)
{
ReadParameters();
}
@ -26,7 +26,8 @@ InMemoryPerSentenceOnDemandLM::~InMemoryPerSentenceOnDemandLM()
{
}
void InMemoryPerSentenceOnDemandLM::InitializeForInput(ttasksptr const& ttask) {
void InMemoryPerSentenceOnDemandLM::InitializeForInput(ttasksptr const& ttask)
{
// The context scope object for this translation task
// contains a map of translation task-specific data
@ -66,7 +67,8 @@ void InMemoryPerSentenceOnDemandLM::InitializeForInput(ttasksptr const& ttask) {
}
LanguageModelKen<lm::ngram::ProbingModel>& InMemoryPerSentenceOnDemandLM::GetPerThreadLM() const {
LanguageModelKen<lm::ngram::ProbingModel>& InMemoryPerSentenceOnDemandLM::GetPerThreadLM() const
{
LanguageModelKen<lm::ngram::ProbingModel> *lm;
lm = m_perThreadLM.get();

View File

@ -102,7 +102,7 @@ public:
UTIL_THROW(util::Exception, "WARNING: InMemoryPerSentenceOnDemand::sync called prior to being initialized");
}
}
virtual void SetFFStateIdx(int state_idx) {
if (isInitialized()) {
GetPerThreadLM().SetFFStateIdx(state_idx);
@ -126,7 +126,7 @@ public:
UTIL_THROW(util::Exception, "WARNING: InMemoryPerSentenceOnDemand::ReportHistoryOrder called prior to being initialized");
}
}
virtual void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown

View File

@ -1196,7 +1196,7 @@ tune
default-name: tuning/moses.ini
tmp-name: tuning/tmp
final-model: yes
rerun-on-change: decoder-settings tuning-settings nbest lambda async
rerun-on-change: decoder decoder-settings tuning-settings nbest lambda async
not-error: trans: No such file or directory
thot-tune
in: TRAINING:config input reference

View File

@ -29,10 +29,10 @@ while (@ARGV) {
}
if ($HELP) {
print "Usage ./split-sentences.perl (-l [en|de|...]) [-q] [-b] < textfile > splitfile\n";
print "-q: quiet mode\n";
print "-b: no output buffering (for use in bidirectional pipes)\n";
exit;
print "Usage ./split-sentences.perl (-l [en|de|...]) [-q] [-b] < textfile > splitfile\n";
print "-q: quiet mode\n";
print "-b: no output buffering (for use in bidirectional pipes)\n";
exit;
}
if (!$QUIET) {
print STDERR "Sentence Splitter v3\n";
@ -64,9 +64,9 @@ if (-e "$prefixfile") {
close(PREFIX);
}
##loop text, add lines together until we get a blank line or a <p>
## Loop over text, add lines together until we get a blank line or a <p>
my $text = "";
while(<STDIN>) {
while (<STDIN>) {
chop;
if (/^<.+>$/ || /^\s*$/) {
#time to process this block, we've hit a blank or <p>
@ -79,7 +79,7 @@ while(<STDIN>) {
$text .= $_. " ";
}
}
#do the leftover text
# Do the leftover text.
&do_it_for($text,"") if $text;
@ -91,28 +91,32 @@ sub do_it_for {
}
sub preprocess {
#this is one paragraph
# This is one paragraph.
my($text) = @_;
# clean up spaces at head and tail of each line as well as any double-spacing
# Clean up spaces at head and tail of each line, as well as
# any double-spacing.
$text =~ s/ +/ /g;
$text =~ s/\n /\n/g;
$text =~ s/ \n/\n/g;
$text =~ s/^ //g;
$text =~ s/ $//g;
#####add sentence breaks as needed#####
##### Add sentence breaks as needed #####
#non-period end of sentence markers (?!) followed by sentence starters.
# Non-period end of sentence markers (?!) followed by sentence starters.
$text =~ s/([?!]) +([\'\"\(\[\¿\¡\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g;
#multi-dots followed by sentence starters
# Multi-dots followed by sentence starters.
$text =~ s/(\.[\.]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g;
# add breaks for sentences that end with some sort of punctuation inside a quote or parenthetical and are followed by a possible sentence starter punctuation and upper case
# Add breaks for sentences that end with some sort of punctuation
# inside a quote or parenthetical and are followed by a possible
# sentence starter punctuation and upper case.
$text =~ s/([?!\.][\ ]*[\'\"\)\]\p{IsPf}]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\ ]*[\p{IsUpper}])/$1\n$2/g;
# add breaks for sentences that end with some sort of punctuation are followed by a sentence starter punctuation and upper case
# Add breaks for sentences that end with some sort of punctuation,
# and are followed by a sentence starter punctuation and upper case.
$text =~ s/([?!\.]) +([\'\"\(\[\¿\¡\p{IsPi}]+[\ ]*[\p{IsUpper}])/$1\n$2/g;
# special punctuation cases are covered. Check all remaining periods.
@ -130,30 +134,27 @@ sub preprocess {
} elsif ($words[$i] =~ /(\.)[\p{IsUpper}\-]+(\.+)$/) {
#not breaking - upper case acronym
} elsif($words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[\p{IsUpper}0-9])/) {
#the next word has a bunch of initial quotes, maybe a space, then either upper case or a number
# The next word has a bunch of initial quotes, maybe a
# space, then either upper case or a number
$words[$i] = $words[$i]."\n" unless ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 2 && !$starting_punct && ($words[$i+1] =~ /^[0-9]+/));
#we always add a return for these unless we have a numeric non-breaker and a number start
}
}
$text = $text.$words[$i]." ";
}
#we stopped one token from the end to allow for easy look-ahead. Append it now.
# We stopped one token from the end to allow for easy look-ahead. Append it now.
$text = $text.$words[$i];
# clean up spaces at head and tail of each line as well as any double-spacing
# Clean up spaces at head and tail of each line as well as any double-spacing
$text =~ s/ +/ /g;
$text =~ s/\n /\n/g;
$text =~ s/ \n/\n/g;
$text =~ s/^ //g;
$text =~ s/ $//g;
#add trailing break
# Add trailing break.
$text .= "\n" unless $text =~ /\n$/;
return $text;
}

View File

@ -0,0 +1,698 @@
# Anything in this file, followed by a period (and an upper-case word),
# does NOT indicate an end-of-sentence marker.
# Special cases are included for prefixes that ONLY appear before 0-9 numbers.
# Any single upper case letter followed by a period is not a sentence ender
# (excluding I occasionally, but we leave it in)
# usually upper case letters are initials in a name
A
Ā
B
C
Č
D
E
Ē
F
G
Ģ
H
I
Ī
J
K
Ķ
L
Ļ
M
N
Ņ
O
P
Q
R
S
Š
T
U
Ū
V
W
X
Y
Z
Ž
# Initialis -- Džonas
Dz
Just
# Day and month abbreviations
# m. menesis d. diena g. gimes
m
mėn
d
g
gim
# Pirmadienis Penktadienis
Pr
Pn
Pirm
Antr
Treč
Ketv
Penkt
Šešt
Sekm
Saus
Vas
Kov
Bal
Geg
Birž
Liep
Rugpj
Rugs
Spal
Lapkr
Gruod
# Business, governmental, geographical terms
a
# aikštė
adv
# advokatas
akad
# akademikas
aklg
# akligatvis
akt
# aktorius
al
# alėja
A.V
# antspaudo vieta
aps
apskr
# apskritis
apyg
# apygarda
aps
apskr
# apskritis
asist
# asistentas
asmv
avd
# asmenvardis
a.k
asm
asm.k
# asmens kodas
atsak
# atsakingasis
atsisk
sąsk
# atsiskaitomoji sąskaita
aut
# autorius
b
k
b.k
# banko kodas
bkl
# bakalauras
bt
# butas
buv
# buvęs, -usi
dail
# dailininkas
dek
# dekanas
dėst
# dėstytojas
dir
# direktorius
dirig
# dirigentas
doc
# docentas
drp
# durpynas
# dešinysis
egz
# egzempliorius
eil
# eilutė
ekon
# ekonomika
el
# elektroninis
etc
# ežeras
faks
# faksas
fak
# fakultetas
gen
# generolas
gyd
# gydytojas
gv
# gyvenvietė
įl
# įlanka
Įn
# įnagininkas
insp
# inspektorius
pan
# ir panašiai
t.t
# ir taip toliau
k.a
# kaip antai
kand
# kandidatas
kat
# katedra
kyš
# kyšulys
kl
# klasė
kln
# kalnas
kn
# knyga
koresp
# korespondentas
kpt
# kapitonas
kr
# kairysis
kt
# kitas
kun
# kunigas
l
e
p
l.e.p
# laikinai einantis pareigas
ltn
# leitenantas
m
mst
# miestas
m.e
# mūsų eros
m.m
# mokslo metai
mot
# moteris
mstl
# miestelis
mgr
# magistras
mgnt
# magistrantas
mjr
# majoras
mln
# milijonas
mlrd
# milijardas
mok
# mokinys
mokyt
# mokytojas
moksl
# mokslinis
nkt
# nekaitomas
ntk
# neteiktinas
Nr
nr
# numeris
p
# ponas
p.d
a.d
# pašto dėžutė, abonentinė dėžutė
p.m.e
# prieš mūsų erą
pan
# ir panašiai
pav
# paveikslas
pavad
# pavaduotojas
pirm
# pirmininkas
pl
# plentas
plg
# palygink
plk
# pulkininkas; pelkė
pr
# prospektas
Kr
pr.Kr
# prieš Kristų
prok
# prokuroras
prot
# protokolas
pss
# pusiasalis
pšt
# paštas
pvz
# pavyzdžiui
r
# rajonas
red
# redaktorius
# raštų kalbos
sąs
# sąsiuvinis
saviv
sav
# savivaldybė
sekr
# sekretorius
sen
# seniūnija, seniūnas
sk
# skaityk; skyrius
skg
# skersgatvis
skyr
sk
# skyrius
skv
# skveras
sp
# spauda; spaustuvė
spec
# specialistas
sr
# sritis
st
# stotis
str
# straipsnis
stud
# studentas
š
š.m
# šių metų
šnek
# šnekamosios
tir
# tiražas
tūkst
# tūkstantis
up
# upė
upl
# upelis
vad
# vadinamasis, -oji
vlsč
# valsčius
ved
# vedėjas
vet
# veterinarija
virš
# viršininkas, viršaitis
vyr
# vyriausiasis, -ioji; vyras
vyresn
# vyresnysis
vlsč
# valsčius
vs
# viensėdis
Vt
vt
# vietininkas
vtv
vv
# vietovardis
žml
# žemėlapis
# Technical terms, abbreviations used in guidebooks, advertisements, etc.
# Generally lower-case.
air
# airiškai
amer
# amerikanizmas
anat
# anatomija
angl
# angl. angliskai
arab
# arabų
archeol
archit
asm
# asmuo
astr
# astronomija
austral
# australiškai
aut
# automobilis
av
# aviacija
bažn
bdv
# būdvardis
bibl
# Biblija
biol
# biologija
bot
# botanika
brt
# burtai, burtažodis.
brus
# baltarusių
buh
# buhalterija
chem
# chemija
col
# collectivum
con
conj
# conjunctivus, jungtukas
dab
# dab. dabartine
dgs
# daugiskaita
dial
# dialektizmas
dipl
dktv
# daiktavardis
džn
# dažnai
ekon
el
# elektra
esam
# esamasis laikas
euf
# eufemizmas
fam
# familiariai
farm
# farmacija
filol
# filologija
filos
# filosofija
fin
# finansai
fiz
# fizika
fiziol
# fiziologija
flk
# folkloras
fon
# fonetika
fot
# fotografija
geod
# geodezija
geogr
geol
# geologija
geom
# geometrija
glžk
gr
# graikų
gram
her
# heraldika
hidr
# hidrotechnika
ind
# Indų
iron
# ironiškai
isp
# ispanų
ist
istor
# istorija
it
# italų
įv
reikšm
įv.reikšm
# įvairiomis reikšmėmis
jap
# japonų
juok
# juokaujamai
jūr
# jūrininkystė
kalb
# kalbotyra
kar
# karyba
kas
# kasyba
kin
# kinematografija
klaus
# klausiamasis
knyg
# knyginis
kom
# komercija
komp
# kompiuteris
kosm
# kosmonautika
kt
# kitas
kul
# kulinarija
kuop
# kuopine
l
# laikas
lit
# literatūrinis
lingv
# lingvistika
log
# logika
lot
# lotynų
mat
# matematika
maž
# mažybinis
med
# medicina
medž
# medžioklė
men
# menas
menk
# menkinamai
metal
# metalurgija
meteor
min
# mineralogija
mit
# mitologija
mok
# mokyklinis
ms
# mįslė
muz
# muzikinis
n
# naujasis
neig
# neigiamasis
neol
# neologizmas
niek
# niekinamai
ofic
# oficialus
opt
# optika
orig
# original
p
# pietūs
pan
# panašiai
parl
# parlamentas
pat
# patarlė
paž
# pažodžiui
plg
# palygink
poet
# poetizmas
poez
# poezija
poligr
# poligrafija
polit
# politika
ppr
# paprastai
pranc
pr
# prancūzų, prūsų
priet
# prietaras
prek
# prekyba
prk
# perkeltine
prs
# persona, asmuo
psn
# pasenęs žodis
psich
# psichologija
pvz
# pavyzdžiui
r
# rytai
rad
# radiotechnika
rel
# religija
ret
# retai
rus
# rusų
sen
# senasis
sl
# slengas, slavų
sov
# sovietinis
spec
# specialus
sport
stat
# statyba
sudurt
# sudurtinis
sutr
# sutrumpintas
suv
# suvalkiečių
š
# šiaurė
šach
# šachmatai
šiaur
škot
# škotiškai
šnek
# šnekamoji
teatr
tech
techn
# technika
teig
# teigiamas
teis
# teisė
tekst
# tekstilė
tel
# telefonas
teol
# teologija
v
# tik vyriškosios, vakarai
t.p
t
p
# ir taip pat
t.t
# ir taip toliau
t.y
# tai yra
vaik
# vaikų
vart
# vartojama
vet
# veterinarija
vid
# vidurinis
vksm
# veiksmažodis
vns
# vienaskaita
vok
# vokiečių
vulg
# vulgariai
zool
# zoologija
žr
# žiūrėk
ž.ū
ž
ū
# žemės ūkis
# List of titles. These are often followed by upper-case names, but do
# not indicate sentence breaks
#
# Jo Eminencija
Em.
# Gerbiamasis
Gerb
gerb
# malonus
malon
# profesorius
Prof
prof
# daktaras (mokslų)
Dr
dr
habil
med
# inž inžinierius
inž
Inž
# Numbers only. These act as non-breaking prefixes only when followed by a numeric sequence.
# Add #NUMERIC_ONLY# after the word to enable this behavior.
# This case is mostly for the English "No.", which can either be a sentence of its own or,
# if followed by a number, a non-breaking prefix.
No #NUMERIC_ONLY#

View File

@ -228,7 +228,7 @@ while ( my $line = <INI> ) {
$phrase_table_impl = "PhraseDictionaryOnDisk";
@toks = set_value( \@toks, "path", "$new_name.bin$table_flag" );
}
elsif ( $binarizer =~ /CreateProbingPT2/ ) {
elsif ( $binarizer =~ /CreateProbingPT/ ) {
$phrase_table_impl = "ProbingPT";
@toks = set_value( \@toks, "path", "$new_name.probing$table_flag" );
}
@ -488,7 +488,7 @@ for ( my $i = 0 ; $i <= $#TABLE ; $i++ ) {
my $cmd = "$binarizer $mid_file $new_file.bin";
safesystem($cmd) or die "Can't binarize";
}
elsif ( $binarizer =~ /CreateProbingPT2/ ) {
elsif ( $binarizer =~ /CreateProbingPT/ ) {
my $cmd = "$binarizer --input-pt $mid_file --output-dir $new_file.probing";
if ($opt_hierarchical) {
$cmd .= " --scfg";
@ -509,8 +509,8 @@ for ( my $i = 0 ; $i <= $#TABLE ; $i++ ) {
if ( $binarizer =~ /CreateOnDiskPt/ ) {
$lexbin =~ s/CreateOnDiskPt/processLexicalTable/;
}
elsif ( $binarizer =~ /CreateProbingPT2/ ) {
$lexbin =~ s/CreateProbingPT2/processLexicalTableMin/;
elsif ( $binarizer =~ /CreateProbingPT/ ) {
$lexbin =~ s/CreateProbingPT/processLexicalTableMin/;
}
$lexbin =~ s/PhraseTable/LexicalTable/;