mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2025-01-06 19:49:41 +03:00
190 lines
4.5 KiB
Plaintext
190 lines
4.5 KiB
Plaintext
# Copyright 2002 Dave Abrahams
|
|
# Copyright 2002, 2003 Rene Rivera
|
|
# Distributed under the Boost Software License, Version 1.0.
|
|
# (See accompanying file LICENSE_1_0.txt or http://www.boost.org/LICENSE_1_0.txt)
|
|
|
|
import regex ;
|
|
|
|
|
|
# Characters considered whitespace, as a list with one character per element:
# space, horizontal tab and newline. Each one is written as a literal character
# between the quotes, so the third literal deliberately spans a line break and
# must contain nothing but that line break.
# NOTE(review): the second entry is expected to be a literal TAB -- confirm it
# has not been turned into a space by an editor or by copy/paste.
.whitespace-chars = " " "	" "
" ;

# Characters considered whitespace, as a single string. Built by joining the
# list above with an empty separator so that it can be dropped straight into a
# regular-expression character class.
.whitespace = $(.whitespace-chars:J="") ;
|
|
|
|
|
|
# Accessor for the module's canonical whitespace characters. The result is a
# list holding one whitespace character per element, exactly as stored in the
# module variable .whitespace-chars.
#
rule whitespace-chars ( )
{
    local canonical = $(.whitespace-chars) ;
    return $(canonical) ;
}
|
|
|
|
|
|
# Accessor for the module's canonical whitespace characters packed into one
# string, exactly as stored in the module variable .whitespace.
#
rule whitespace ( )
{
    local canonical = $(.whitespace) ;
    return $(canonical) ;
}
|
|
|
|
|
|
# Breaks a string up into its individual characters and returns them, in their
# original order, as a list with one character per element. An empty string
# produces an empty list.
#
rule chars (
    string  # The string to split.
    )
{
    local pieces ;
    local remaining = $(string) ;
    while $(remaining)
    {
        # Peel off up to eight leading characters in one pass; the ninth
        # capture holds whatever is left over for the next pass.
        local m = [ MATCH (.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.*) : $(remaining) ] ;
        pieces += $(m[1-8]) ;
        remaining = $(m[9]) ;
    }

    # The last pass yields empty captures when fewer than eight characters
    # were left; strip that padding from the tail of the list.
    while $(pieces[1]) && ! $(pieces[-1])
    {
        pieces = $(pieces[1--2]) ;
    }

    return $(pieces) ;
}
|
|
|
|
|
|
# Apply a set of standard transformations to string to produce an abbreviation
# no more than 5 characters long.
#
# Strings shorter than 4 characters are returned unchanged. Longer strings keep
# their first character; from the remainder a trailing "ing" is dropped, runs
# of a repeated character are collapsed to one, vowels are removed and at most
# 4 of the surviving characters are kept.
#
# Results are cached in module variables named .abbreviated-<string>, so each
# distinct input is only computed once.
#
rule abbreviate ( string )
{
    local r = $(.abbreviated-$(string)) ;
    if $(r)
    {
        # Already computed for this input.
        return $(r) ;
    }
    # Anything less than 4 characters gets no abbreviation.
    else if ! [ MATCH (....) : $(string) ]
    {
        .abbreviated-$(string) = $(string) ;
        return $(string) ;
    }
    else
    {
        # Separate the initial letter in case it's a vowel.
        local s1 = [ MATCH ^(.)(.*) : $(string) ] ;

        # Drop trailing "ing", but only when a non-empty stem is left. For an
        # input such as "ring" the remainder is exactly "ing"; stripping it
        # would leave no characters at all, and the steps below would then be
        # handed an empty list instead of a string.
        local s2 = [ MATCH ^(.+)ing$ : $(s1[2]) ] ;
        s2 ?= $(s1[2]) ;

        # Reduce all doubled characters to one.
        local squeezed ;
        local last = "" ;
        for local c in [ chars $(s2) ]
        {
            if $(c) != $(last)
            {
                squeezed += $(c) ;
                last = $(c) ;
            }
        }
        s2 = $(squeezed:J="") ;

        # Chop all vowels out of the remainder.
        s2 = [ regex.replace $(s2) [AEIOUaeiou] "" ] ;

        # Shorten remaining consonants to 4 characters.
        s2 = [ MATCH ^(.?.?.?.?) : $(s2) ] ;

        # Glue the initial character back on to the front.
        s2 = $(s1[1])$(s2) ;

        .abbreviated-$(string) = $(s2) ;
        return $(s2) ;
    }
}
|
|
|
|
|
|
# Glues a list of strings together into one string, placing the separator
# between each pair of neighbouring elements. When no separator is supplied the
# strings are simply concatenated.
#
rule join (
    strings *  # The strings to join.
    : separator ?  # The optional separator.
    )
{
    # Fall back to an empty glue string unless the caller provided one.
    local glue = "" ;
    if $(separator)
    {
        glue = $(separator) ;
    }
    return $(strings:J=$(glue)) ;
}
|
|
|
|
|
|
# Breaks a string into the words it contains, where a word is a maximal run of
# characters that are not whitespace. Leading, trailing and repeated whitespace
# never produces empty words.
#
rule words (
    string  # The string to split.
    : whitespace *  # Optional, characters to consider as whitespace.
    )
{
    # Collapse the caller's separators into one string usable inside a
    # character class, defaulting to the module's canonical whitespace.
    local separators = $(whitespace:J="") ;
    separators ?= $(.whitespace) ;

    local found = ;
    local rest = $(string) ;
    while $(rest)
    {
        # First capture: the next word (possibly empty); second capture:
        # everything that follows it.
        local parts = [ MATCH "^[$(separators)]*([^$(separators)]*)(.*)" : $(rest) ] ;
        if $(parts[1])
        {
            found += $(parts[1]) ;
        }
        rest = $(parts[2]) ;
    }
    return $(found) ;
}
|
|
|
|
|
|
# Tells whether a string consists of nothing but whitespace characters.
# Returns "true" when it does, including when the string is empty or omitted,
# and returns nothing otherwise.
#
rule is-whitespace (
    string ?  # The string to test.
    )
{
    local verdict ;
    if ! $(string)
    {
        # Missing or empty: there is no non-whitespace character to find.
        verdict = true ;
    }
    else if [ MATCH "^([$(.whitespace)]+)$" : $(string) ]
    {
        verdict = true ;
    }
    return $(verdict) ;
}
|
|
|
|
# Self-test for this module: each assertion calls one of the rules above and
# compares its result with the expected value.
#
rule __test__ ( )
{
    import assert ;

    # chars: ordinary input and boundary cases around the eight-character
    # chunk size used internally.
    assert.result : chars "" ;
    assert.result a : chars a ;
    assert.result a b c : chars abc ;
    assert.result a b c d e f g h : chars abcdefgh ;
    assert.result a b c d e f g h i : chars abcdefghi ;
    assert.result a b c d e f g h i j : chars abcdefghij ;
    assert.result a b c d e f g h i j k : chars abcdefghijk ;

    # abbreviate: short inputs pass through unchanged, longer ones are reduced.
    assert.result pop : abbreviate pop ;
    assert.result aaa : abbreviate aaa ;
    assert.result rntm : abbreviate runtime ;
    assert.result ovrld : abbreviate overload ;
    assert.result dbg : abbreviate debugging ;
    assert.result async : abbreviate asynchronous ;
    assert.result qck : abbreviate quack ;
    assert.result sttc : abbreviate static ;

    # join: with and without an explicit separator.
    assert.result a//b/c/d : join a "" b c d : / ;
    assert.result abcd : join a "" b c d ;

    # words: default whitespace.
    assert.result a b c : words "a b c" ;

    # is-whitespace: blank, non-blank, empty and omitted arguments.
    assert.true is-whitespace " " ;
    assert.false is-whitespace " a b c " ;
    assert.true is-whitespace "" ;
    assert.true is-whitespace ;
}
|