mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-26 13:23:25 +03:00
Merge pull request #31 from achimr/master
Hungarian and Latvian non-breaking prefix files
This commit is contained in:
commit
656b70c829
103
scripts/share/nonbreaking_prefixes/nonbreaking_prefix.hu
Normal file
103
scripts/share/nonbreaking_prefixes/nonbreaking_prefix.hu
Normal file
@ -0,0 +1,103 @@
|
||||
#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
|
||||
#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
|
||||
|
||||
#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
|
||||
#usually upper case letters are initials in a name
|
||||
A
|
||||
B
|
||||
C
|
||||
D
|
||||
E
|
||||
F
|
||||
G
|
||||
H
|
||||
I
|
||||
J
|
||||
K
|
||||
L
|
||||
M
|
||||
N
|
||||
O
|
||||
P
|
||||
Q
|
||||
R
|
||||
S
|
||||
T
|
||||
U
|
||||
V
|
||||
W
|
||||
X
|
||||
Y
|
||||
Z
|
||||
Á
|
||||
É
|
||||
Í
|
||||
Ó
|
||||
Ö
|
||||
Ő
|
||||
Ú
|
||||
Ü
|
||||
Ű
|
||||
|
||||
#List of titles and other common abbreviations. These are often followed by upper-case names, but do not indicate sentence breaks
|
||||
Dr
|
||||
dr
|
||||
kb
|
||||
Kb
|
||||
vö
|
||||
Vö
|
||||
pl
|
||||
Pl
|
||||
ca
|
||||
Ca
|
||||
min
|
||||
Min
|
||||
max
|
||||
Max
|
||||
ún
|
||||
Ún
|
||||
prof
|
||||
Prof
|
||||
de
|
||||
De
|
||||
du
|
||||
Du
|
||||
Szt
|
||||
St
|
||||
|
||||
#Numbers only. These should NOT induce breaks when followed by a numeric sequence (they are non-breaking prefixes only before a number)
|
||||
# add #NUMERIC_ONLY# after the word for this function
|
||||
#This case is mostly for the English "No." which can either be a sentence of its own, or
|
||||
#if followed by a number, a non-breaking prefix
|
||||
|
||||
# Month name abbreviations
|
||||
jan #NUMERIC_ONLY#
|
||||
Jan #NUMERIC_ONLY#
|
||||
Feb #NUMERIC_ONLY#
|
||||
feb #NUMERIC_ONLY#
|
||||
márc #NUMERIC_ONLY#
|
||||
Márc #NUMERIC_ONLY#
|
||||
ápr #NUMERIC_ONLY#
|
||||
Ápr #NUMERIC_ONLY#
|
||||
máj #NUMERIC_ONLY#
|
||||
Máj #NUMERIC_ONLY#
|
||||
jún #NUMERIC_ONLY#
|
||||
Jún #NUMERIC_ONLY#
|
||||
Júl #NUMERIC_ONLY#
|
||||
júl #NUMERIC_ONLY#
|
||||
aug #NUMERIC_ONLY#
|
||||
Aug #NUMERIC_ONLY#
|
||||
Szept #NUMERIC_ONLY#
|
||||
szept #NUMERIC_ONLY#
|
||||
okt #NUMERIC_ONLY#
|
||||
Okt #NUMERIC_ONLY#
|
||||
nov #NUMERIC_ONLY#
|
||||
Nov #NUMERIC_ONLY#
|
||||
dec #NUMERIC_ONLY#
|
||||
Dec #NUMERIC_ONLY#
|
||||
|
||||
# Other abbreviations
|
||||
tel #NUMERIC_ONLY#
|
||||
Tel #NUMERIC_ONLY#
|
||||
Fax #NUMERIC_ONLY#
|
||||
fax #NUMERIC_ONLY#
|
100
scripts/share/nonbreaking_prefixes/nonbreaking_prefix.lv
Normal file
100
scripts/share/nonbreaking_prefixes/nonbreaking_prefix.lv
Normal file
@ -0,0 +1,100 @@
|
||||
#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
|
||||
#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
|
||||
|
||||
#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
|
||||
#usually upper case letters are initials in a name
|
||||
A
|
||||
Ā
|
||||
B
|
||||
C
|
||||
Č
|
||||
D
|
||||
E
|
||||
Ē
|
||||
F
|
||||
G
|
||||
Ģ
|
||||
H
|
||||
I
|
||||
Ī
|
||||
J
|
||||
K
|
||||
Ķ
|
||||
L
|
||||
Ļ
|
||||
M
|
||||
N
|
||||
Ņ
|
||||
O
|
||||
P
|
||||
Q
|
||||
R
|
||||
S
|
||||
Š
|
||||
T
|
||||
U
|
||||
Ū
|
||||
V
|
||||
W
|
||||
X
|
||||
Y
|
||||
Z
|
||||
Ž
|
||||
|
||||
#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
|
||||
dr
|
||||
Dr
|
||||
med
|
||||
prof
|
||||
Prof
|
||||
inž
|
||||
Inž
|
||||
ist.loc
|
||||
Ist.loc
|
||||
kor.loc
|
||||
Kor.loc
|
||||
v.i
|
||||
vietn
|
||||
Vietn
|
||||
|
||||
#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
|
||||
a.l
|
||||
t.p
|
||||
pārb
|
||||
Pārb
|
||||
vec
|
||||
Vec
|
||||
inv
|
||||
Inv
|
||||
sk
|
||||
Sk
|
||||
spec
|
||||
Spec
|
||||
vienk
|
||||
Vienk
|
||||
virz
|
||||
Virz
|
||||
māksl
|
||||
Māksl
|
||||
mūz
|
||||
Mūz
|
||||
akad
|
||||
Akad
|
||||
soc
|
||||
Soc
|
||||
galv
|
||||
Galv
|
||||
vad
|
||||
Vad
|
||||
sertif
|
||||
Sertif
|
||||
folkl
|
||||
Folkl
|
||||
hum
|
||||
Hum
|
||||
|
||||
#Numbers only. These should NOT induce breaks when followed by a numeric sequence (they are non-breaking prefixes only before a number)
|
||||
# add #NUMERIC_ONLY# after the word for this function
|
||||
#This case is mostly for the English "No." which can either be a sentence of its own, or
|
||||
#if followed by a number, a non-breaking prefix
|
||||
Nr #NUMERIC_ONLY#
|
Loading…
Reference in New Issue
Block a user