mirror of
https://github.com/facebook/duckling.git
synced 2024-12-11 06:46:26 +03:00
83ea150d94
Summary: We noticed that using UTF-8 characters directly in regexes works. Hence we are converting the escaped characters back to literal UTF-8 for readability and maintainability. Reviewed By: blandinw Differential Revision: D5787146 fbshipit-source-id: e5a4b9a
390 lines
10 KiB
Haskell
390 lines
10 KiB
Haskell
-- Copyright (c) 2016-present, Facebook, Inc.
|
|
-- All rights reserved.
|
|
--
|
|
-- This source code is licensed under the BSD-style license found in the
|
|
-- LICENSE file in the root directory of this source tree. An additional grant
|
|
-- of patent rights can be found in the PATENTS file in the same directory.
|
|
|
|
|
|
{-# LANGUAGE GADTs #-}
|
|
{-# LANGUAGE OverloadedStrings #-}
|
|
|
|
module Duckling.Numeral.DE.Rules
|
|
( rules ) where
|
|
|
|
import Control.Monad (join)
|
|
import Data.HashMap.Strict (HashMap)
|
|
import qualified Data.HashMap.Strict as HashMap
|
|
import Data.Maybe
|
|
import Data.Text (Text)
|
|
import qualified Data.Text as Text
|
|
import Prelude
|
|
import Data.String
|
|
|
|
import Duckling.Dimensions.Types
|
|
import Duckling.Numeral.Helpers
|
|
import Duckling.Numeral.Types (NumeralData (..))
|
|
import qualified Duckling.Numeral.Types as TNumeral
|
|
import Duckling.Regex.Types
|
|
import Duckling.Types
|
|
|
|
-- | Flips the sign of a numeral that directly follows @-@, "minus" or
-- "negativ".
ruleNumeralsPrefixWithNegativeOrMinus :: Rule
ruleNumeralsPrefixWithNegativeOrMinus = Rule
  { name = "numbers prefix with -, negative or minus"
  , pattern =
    [ regex "-|minus|negativ"
    , dimension Numeral
    ]
  , prod = \tokens -> case tokens of
      -- The sign token carries no data of its own; only the numeral matters.
      (_ : Token Numeral nd : _) -> double (negate (TNumeral.value nd))
      _ -> Nothing
  }
|
|
|
|
-- | Turns a run of 1 to 18 decimal digits into an integer numeral.
ruleIntegerNumeric :: Rule
ruleIntegerNumeric = Rule
  { name = "integer (numeric)"
  , pattern =
    [ regex "(\\d{1,18})"
    ]
  , prod = \tokens -> case tokens of
      (Token RegexMatch (GroupMatch (digits:_)):_) ->
        -- A failed 'parseInt' short-circuits the whole production to Nothing.
        parseInt digits >>= integer . toInteger
      _ -> Nothing
  }
|
|
|
|
-- | Maps the vague quantifier "mehrere" to the fixed value 3.
ruleFew :: Rule
ruleFew = Rule
  { name = "few"
  , pattern = [regex "mehrere"]
  , prod = const (integer 3)
  }
|
|
|
|
-- | Recognizes "zehn" as the integer 10 and tags the result with grain 1.
ruleTen :: Rule
ruleTen = Rule
  { name = "ten"
  , pattern = [regex "zehn"]
  , prod = const (withGrain 1 =<< integer 10)
  }
|
|
|
|
-- | Parses decimals written with @.@ between groups of three digits and @,@
-- before the fractional part, e.g. @1.234,56@.
ruleDecimalWithThousandsSeparator :: Rule
ruleDecimalWithThousandsSeparator = Rule
  { name = "decimal with thousands separator"
  , pattern =
    [ regex "(\\d+(\\.\\d\\d\\d)+\\,\\d+)"
    ]
  , prod = \tokens -> case tokens of
      (Token RegexMatch (GroupMatch (raw:_)):_) ->
        parseDouble (normalize raw) >>= double
      _ -> Nothing
  }
  where
    -- Drop the grouping dots first, then turn the decimal comma into a dot
    -- before the text is handed to 'parseDouble'.
    normalize :: Text -> Text
    normalize = Text.replace "," "." . Text.replace "." Text.empty
|
|
|
|
-- | Parses a decimal written with a comma, such as @3,14@ or @,5@ (the
-- integer part may be empty).
ruleDecimalNumeral :: Rule
ruleDecimalNumeral = Rule
  { name = "decimal number"
  , pattern =
    [ regex "(\\d*,\\d+)"
    ]
  , prod = \tokens -> case tokens of
      -- NOTE(review): the 'False' flag presumably tells 'parseDecimal' that
      -- the separator is a comma rather than a dot; confirm against
      -- Duckling.Numeral.Helpers.
      (Token RegexMatch (GroupMatch (digits:_)):_) -> parseDecimal False digits
      _ -> Nothing
  }
|
|
|
|
-- | Parses compound numbers of the form @<unit>und<tens>@, e.g.
-- "einundzwanzig" (21) or "neunundneunzig" (99).
--
-- Both spellings of thirty are accepted: "dreissig" and the standard
-- orthography "dreißig". Previously only "dreissig" matched, so inputs such
-- as "einunddreißig" were not recognized.
ruleInteger3 :: Rule
ruleInteger3 = Rule
  { name = "integer ([2-9][1-9])"
  , pattern =
    [ regex "(ein|zwei|drei|vier|fünf|sechs|sieben|acht|neun)und(zwanzig|dreissig|dreißig|vierzig|fünfzig|sechzig|siebzig|achtzig|neunzig)"
    ]
  , prod = \tokens -> case tokens of
      (Token RegexMatch (GroupMatch (m1:m2:_)):_) -> do
        units <- HashMap.lookup (Text.toLower m1) unitsByWord
        tens <- HashMap.lookup (Text.toLower m2) tensByWord
        integer $ units + tens
      _ -> Nothing
  }
  where
    -- Value of the first capture group (the part before "und").
    unitsByWord :: HashMap Text Integer
    unitsByWord = HashMap.fromList
      [ ("ein", 1)
      , ("zwei", 2)
      , ("drei", 3)
      , ("vier", 4)
      , ("fünf", 5)
      , ("sechs", 6)
      , ("sieben", 7)
      , ("acht", 8)
      , ("neun", 9)
      ]

    -- Value of the second capture group (the part after "und").
    tensByWord :: HashMap Text Integer
    tensByWord = HashMap.fromList
      [ ("zwanzig", 20)
      , ("dreissig", 30)
      , ("dreißig", 30)
      , ("vierzig", 40)
      , ("fünfzig", 50)
      , ("sechzig", 60)
      , ("siebzig", 70)
      , ("achtzig", 80)
      , ("neunzig", 90)
      ]
|
|
|
|
-- | Adds a small numeral and a multiple of ten joined by a separate "und"
-- token, e.g. "drei und zwanzig".
ruleNumeralsUnd :: Rule
ruleNumeralsUnd = Rule
  { name = "numbers und"
  , pattern =
    [ numberBetween 1 10
    , regex "und"
    , oneOf [20, 30 .. 90]
    ]
  , prod = \tokens -> case tokens of
      -- The middle token is the literal "und" and contributes nothing.
      (Token Numeral units : _ : Token Numeral tens : _) ->
        double (TNumeral.value units + TNumeral.value tens)
      _ -> Nothing
  }
|
|
|
|
-- | Combines any numeral with a following numeral flagged as multipliable by
-- handing both tokens to 'multiply'.
ruleMultiply :: Rule
ruleMultiply = Rule
  { name = "compose by multiplication"
  , pattern =
    [ dimension Numeral
    , numberWith TNumeral.multipliable id
    ]
  , prod = combine
  }
  where
    -- Only the first two matched tokens take part in the product.
    combine (lhs:rhs:_) = multiply lhs rhs
    combine _ = Nothing
|
|
|
|
-- | Adds a numeral whose grain is greater than 1 to the numeral that follows
-- it, as long as the second value is below @10 ^ grain@ of the first.
ruleIntersect :: Rule
ruleIntersect = Rule
  { name = "intersect"
  , pattern =
    [ numberWith (fromMaybe 0 . TNumeral.grain) (>1)
    , dimension Numeral
    ]
  , prod = \tokens -> case tokens of
      (Token Numeral big : Token Numeral small : _)
        -- The trailing numeral must be strictly below the leading numeral's
        -- power of ten; otherwise the rule produces nothing.
        | Just g <- TNumeral.grain big
        , TNumeral.value small < 10 ** fromIntegral g ->
            double (TNumeral.value big + TNumeral.value small)
      _ -> Nothing
  }
|
|
|
|
-- | Scales a numeral by a trailing magnitude suffix: @k@ (1e3), @m@ (1e6) or
-- @g@ (1e9). The lookahead in the regex requires the suffix to be followed by
-- a non-word character, @$@, @€@ or the end of input.
ruleNumeralsSuffixesKMG :: Rule
ruleNumeralsSuffixesKMG = Rule
  { name = "numbers suffixes (K, M, G)"
  , pattern =
    [ dimension Numeral
    , regex "([kmg])(?=[\\W\\$€]|$)"
    ]
  , prod = \tokens -> case tokens of
      (Token Numeral nd : Token RegexMatch (GroupMatch (suffix:_)) : _) ->
        lookup (Text.toLower suffix) multipliers
          >>= double . (TNumeral.value nd *)
      _ -> Nothing
  }
  where
    -- Suffix (lower-cased) to multiplier.
    multipliers =
      [ ("k", 1e3)
      , ("m", 1e6)
      , ("g", 1e9)
      ]
|
|
|
|
-- | Maps "paar", optionally preceded by "ein ", to the fixed value 2.
ruleCouple :: Rule
ruleCouple = Rule
  { name = "couple"
  , pattern = [regex "(ein )?paar"]
  , prod = const (integer 2)
  }
|
|
|
|
-- | Recognizes "dutzend" as 12, tagged with grain 1 and marked multipliable
-- so that a preceding numeral can scale it.
ruleDozen :: Rule
ruleDozen = Rule
  { name = "dozen"
  , pattern = [regex "dutzend"]
  , prod = const (withMultipliable =<< withGrain 1 =<< integer 12)
  }
|
|
|
|
-- | Recognizes the words for hundred, thousand and million (singular and
-- plural forms) and yields the matching power of ten, tagged with its grain
-- and marked multipliable.
rulePowersOfTen :: Rule
rulePowersOfTen = Rule
  { name = "powers of tens"
  , pattern =
    [ regex "(hunderte?|tausende?|million(en)?)"
    ]
  , prod = \tokens -> case tokens of
      (Token RegexMatch (GroupMatch (word:_)):_) ->
        powerOfTen (Text.toLower word)
      _ -> Nothing
  }
  where
    -- Singular and plural spellings share one value.
    powerOfTen w
      | w `elem` ["hundert", "hunderte"] = scaled 1e2 2
      | w `elem` ["tausend", "tausende"] = scaled 1e3 3
      | w `elem` ["million", "millionen"] = scaled 1e6 6
      | otherwise = Nothing

    -- Build the numeral, then attach the grain and the multipliable flag.
    scaled v g = double v >>= withGrain g >>= withMultipliable
|
|
|
|
-- | Lookup table from lower-case German number words to their values in the
-- range 0 to 19. Several inflected forms map to 0 and to 1.
zeroNineteenMap :: HashMap Text Integer
zeroNineteenMap = HashMap.fromList $
  zip zeroWords (repeat 0)
    ++ zip oneWords (repeat 1)
    ++ zip twoToNineteen [2 ..]
  where
    zeroWords =
      [ "keine", "null", "nichts", "keiner"
      , "kein", "keins", "keinen", "keines"
      ]
    oneWords = ["einer", "eins", "ein", "eine", "einser"]
    -- Listed in ascending order so they can be zipped with [2 ..].
    twoToNineteen =
      [ "zwei", "drei", "vier", "fünf", "sechs", "sieben", "acht", "neun"
      , "zehn", "elf", "zwölf", "dreizehn", "vierzehn", "fünfzehn"
      , "sechzehn", "siebzehn", "achtzehn", "neunzehn"
      ]
|
|
|
|
-- | Parses the German number words for 0 to 19 and resolves their value via
-- 'zeroNineteenMap'. ("zehn" is not in the pattern here; 'ruleTen' matches it.)
ruleToNineteen :: Rule
ruleToNineteen = Rule
  { name = "integer (0..19)"
  -- Regex alternation is ordered: the first alternative that matches wins.
  -- Longer words must therefore come before their prefixes, e.g. "fünfzehn"
  -- before "fünf" and "keine[rns]?" before "keins?"; otherwise the shorter
  -- form shadows the longer one and the full word is never consumed.
  --
  -- Fixes relative to the previous pattern:
  --   * "füfzehn" was a typo for "fünfzehn" and was listed after "fünf".
  --   * "keine?" came first and shadowed "keins", "keines", "keiner" and
  --     "keinen".
  , pattern =
    [ regex "(keine[rns]?|keins?|null|nichts|eins?(er)?|zwei|dreizehn|drei|vierzehn|vier|fünfzehn|fünf|sechzehn|sechs|siebzehn|sieben|achtzehn|acht|neunzehn|neun|elf|zwölf)"
    ]
  , prod = \tokens -> case tokens of
      (Token RegexMatch (GroupMatch (match:_)):_) ->
        HashMap.lookup (Text.toLower match) zeroNineteenMap >>= integer
      _ -> Nothing
  }
|
|
|
|
-- | Parses the German number words for 0 to 19.
--
-- This rule shares its name and its pattern with 'ruleToNineteen'. It now
-- resolves values through the shared 'zeroNineteenMap' instead of a
-- hand-written table, which had no entry for "einser" (matched by the
-- pattern, so the production returned Nothing).
ruleInteger :: Rule
ruleInteger = Rule
  { name = "integer (0..19)"
  -- Regex alternation is ordered: the first alternative that matches wins.
  -- Longer words must come before their prefixes ("fünfzehn" before "fünf",
  -- "keine[rns]?" before "keins?"). The previous pattern contained the typo
  -- "füfzehn" and listed "keine?" ahead of the longer "kein…" forms, so those
  -- words were never matched in full.
  , pattern =
    [ regex "(keine[rns]?|keins?|null|nichts|eins?(er)?|zwei|dreizehn|drei|vierzehn|vier|fünfzehn|fünf|sechzehn|sechs|siebzehn|sieben|achtzehn|acht|neunzehn|neun|elf|zwölf)"
    ]
  , prod = \tokens -> case tokens of
      (Token RegexMatch (GroupMatch (match:_)):_) ->
        HashMap.lookup (Text.toLower match) zeroNineteenMap >>= integer
      _ -> Nothing
  }
|
|
|
|
-- | Parses the German words for the multiples of ten from 20 to 90.
--
-- Both spellings of thirty are accepted: "dreissig" and the standard
-- orthography "dreißig". Previously only "dreissig" matched.
ruleInteger2 :: Rule
ruleInteger2 = Rule
  { name = "integer (20..90)"
  , pattern =
    [ regex "(zwanzig|dreissig|dreißig|vierzig|fünfzig|sechzig|siebzig|achtzig|neunzig)"
    ]
  , prod = \tokens -> case tokens of
      (Token RegexMatch (GroupMatch (match:_)):_) ->
        HashMap.lookup (Text.toLower match) tensByWord >>= integer
      _ -> Nothing
  }
  where
    -- Lower-case word to value; both spellings of 30 are listed.
    tensByWord :: HashMap Text Integer
    tensByWord = HashMap.fromList
      [ ("zwanzig", 20)
      , ("dreissig", 30)
      , ("dreißig", 30)
      , ("vierzig", 40)
      , ("fünfzig", 50)
      , ("sechzig", 60)
      , ("siebzig", 70)
      , ("achtzig", 80)
      , ("neunzig", 90)
      ]
|
|
|
|
-- | Parses a spoken decimal of the form @<numeral> komma <numeral>@, where
-- the numeral after "komma" must have no grain.
ruleNumeralDotNumeral :: Rule
ruleNumeralDotNumeral = Rule
  { name = "number dot number"
  , pattern =
    [ dimension Numeral
    , regex "komma"
    , numberWith TNumeral.grain isNothing
    ]
  , prod = \tokens -> case tokens of
      -- The middle token is the literal "komma". The trailing numeral is
      -- passed through 'decimalsToDouble' before being added.
      (Token Numeral whole : _ : Token Numeral frac : _) ->
        double (TNumeral.value whole + decimalsToDouble (TNumeral.value frac))
      _ -> Nothing
  }
|
|
|
|
-- | Parses integers written with @.@ between groups of three digits, e.g.
-- @1.000@ or @12.345.678@ (one to five groups after the leading digits).
ruleIntegerWithThousandsSeparator :: Rule
ruleIntegerWithThousandsSeparator = Rule
  { name = "integer with thousands separator ."
  , pattern =
    [ regex "(\\d{1,3}(\\.\\d\\d\\d){1,5})"
    ]
  , prod = \tokens -> case tokens of
      (Token RegexMatch (GroupMatch (grouped:_)):_) ->
        -- Strip the grouping dots so only the digits reach 'parseDouble'.
        double =<< parseDouble (Text.filter (/= '.') grouped)
      _ -> Nothing
  }
|
|
|
|
-- | All German numeral rules defined in this module; this list is the
-- module's only export.
rules :: [Rule]
rules =
  [ ruleCouple
  , ruleDecimalNumeral
  , ruleDecimalWithThousandsSeparator
  , ruleDozen
  , ruleFew
  , ruleInteger
  , ruleInteger2
  , ruleInteger3
  , ruleIntegerNumeric
  , ruleIntegerWithThousandsSeparator
  , ruleIntersect
  , ruleMultiply
  , ruleNumeralDotNumeral
  , ruleNumeralsPrefixWithNegativeOrMinus
  , ruleNumeralsSuffixesKMG
  , ruleNumeralsUnd
  , rulePowersOfTen
  , ruleTen
  , ruleToNineteen
  ]
|