mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-07-14 23:00:29 +03:00
Add tokenisation support for the Tetun language
This commit is contained in:
parent
7dd812180e
commit
75d4c672e8
210
scripts/share/nonbreaking_prefixes/nonbreaking_prefix.tdt
Normal file
210
scripts/share/nonbreaking_prefixes/nonbreaking_prefix.tdt
Normal file
@ -0,0 +1,210 @@
|
||||
#File adapted for TDT from PT by Raphael Merx. Last update: 10.11.2009.
|
||||
#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
|
||||
#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
|
||||
|
||||
#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
|
||||
#usually upper case letters are initials in a name
|
||||
A
|
||||
B
|
||||
C
|
||||
D
|
||||
E
|
||||
F
|
||||
G
|
||||
H
|
||||
I
|
||||
J
|
||||
K
|
||||
L
|
||||
M
|
||||
N
|
||||
O
|
||||
P
|
||||
Q
|
||||
R
|
||||
S
|
||||
T
|
||||
U
|
||||
V
|
||||
W
|
||||
X
|
||||
Y
|
||||
Z
|
||||
a
|
||||
b
|
||||
c
|
||||
d
|
||||
e
|
||||
f
|
||||
g
|
||||
h
|
||||
i
|
||||
j
|
||||
k
|
||||
l
|
||||
m
|
||||
n
|
||||
o
|
||||
p
|
||||
q
|
||||
r
|
||||
s
|
||||
t
|
||||
u
|
||||
v
|
||||
w
|
||||
x
|
||||
y
|
||||
z
|
||||
|
||||
|
||||
#Roman Numerals. A dot after one of these is not a sentence break (list inherited from the Portuguese file).
|
||||
I
|
||||
II
|
||||
III
|
||||
IV
|
||||
V
|
||||
VI
|
||||
VII
|
||||
VIII
|
||||
IX
|
||||
X
|
||||
XI
|
||||
XII
|
||||
XIII
|
||||
XIV
|
||||
XV
|
||||
XVI
|
||||
XVII
|
||||
XVIII
|
||||
XIX
|
||||
XX
|
||||
i
|
||||
ii
|
||||
iii
|
||||
iv
|
||||
v
|
||||
vi
|
||||
vii
|
||||
viii
|
||||
ix
|
||||
x
|
||||
xi
|
||||
xii
|
||||
xiii
|
||||
xiv
|
||||
xv
|
||||
xvi
|
||||
xvii
|
||||
xviii
|
||||
xix
|
||||
xx
|
||||
|
||||
#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
|
||||
Adj
|
||||
Adm
|
||||
Adv
|
||||
Art
|
||||
Ca
|
||||
Capt
|
||||
Cmdr
|
||||
Col
|
||||
Comdr
|
||||
Con
|
||||
Corp
|
||||
Cpl
|
||||
DR
|
||||
DRA
|
||||
Dr
|
||||
Dra
|
||||
Dras
|
||||
Drs
|
||||
Eng
|
||||
Enga
|
||||
Engas
|
||||
Engos
|
||||
Ex
|
||||
Exo
|
||||
Exmo
|
||||
Fig
|
||||
Gen
|
||||
Hosp
|
||||
Insp
|
||||
Lda
|
||||
MM
|
||||
MR
|
||||
MRS
|
||||
MS
|
||||
Maj
|
||||
Mrs
|
||||
Ms
|
||||
Msgr
|
||||
Op
|
||||
Ord
|
||||
Pfc
|
||||
Ph
|
||||
Prof
|
||||
Pvt
|
||||
Rep
|
||||
Reps
|
||||
Res
|
||||
Rev
|
||||
Rt
|
||||
Sen
|
||||
Sens
|
||||
Sfc
|
||||
Sgt
|
||||
Sr
|
||||
Sra
|
||||
Sras
|
||||
Srs
|
||||
Sto
|
||||
Supt
|
||||
Surg
|
||||
adj
|
||||
adm
|
||||
adv
|
||||
art
|
||||
cit
|
||||
col
|
||||
con
|
||||
corp
|
||||
cpl
|
||||
dr
|
||||
dra
|
||||
dras
|
||||
drs
|
||||
eng
|
||||
enga
|
||||
engas
|
||||
engos
|
||||
ex
|
||||
exo
|
||||
exmo
|
||||
fig
|
||||
op
|
||||
prof
|
||||
sr
|
||||
sra
|
||||
sras
|
||||
srs
|
||||
sto
|
||||
|
||||
#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
|
||||
v
|
||||
vs
|
||||
i.e
|
||||
rev
|
||||
e.g
|
||||
|
||||
#Numbers only. These should only be treated as non-breaking prefixes when followed by a numeric sequence
|
||||
# add #NUMERIC_ONLY# after the word to enable this behaviour
|
||||
#This case is mostly for the English "No." which can either be a sentence of its own, or
|
||||
#if followed by a number, a non-breaking prefix
|
||||
No #NUMERIC_ONLY#
|
||||
Nos
|
||||
Art #NUMERIC_ONLY#
|
||||
Nr
|
||||
p #NUMERIC_ONLY#
|
||||
pp #NUMERIC_ONLY#
|
||||
|
@ -265,6 +265,12 @@ sub tokenize
|
||||
# if a colon is not immediately followed by lower-case characters, separate it out anyway
|
||||
$text =~ s/(:)(?=$|[^\p{Ll}])/ $1 /g;
|
||||
}
|
||||
elsif ($language eq "tdt") {
|
||||
# in Tetun, the apostrophe can occur inside words (it marks a glottal stop), so it is not split off here:
|
||||
$text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
|
||||
# if an apostrophe is not immediately followed by lower-case characters, separate it out anyway
|
||||
$text =~ s/(\')(?=$|[^\p{Ll}])/ $1 /g;
|
||||
}
|
||||
elsif (($language eq "ca")) {
|
||||
# in Catalan, the middle dot can be used inside words:
|
||||
# il·lusio
|
||||
@ -332,7 +338,7 @@ sub tokenize
|
||||
$text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
|
||||
$text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g;
|
||||
}
|
||||
elsif ($language eq "so")
|
||||
elsif (($language eq "so") or ($language eq "tdt"))
|
||||
{
|
||||
# Don't split glottals
|
||||
$text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
|
||||
|
Loading…
Reference in New Issue
Block a user