Add a new Arabic locale (EG) (#554)

Summary:
Egyptian Arabic is a dialect of Arabic that is mostly a spoken language that is used in everyday communications.
This PR adds new locale to Arabic to support the differences between Modern Standard Arabic (MSA) and Egyptian Arabic (EG).
I have mainly depended on the different locales of Spanish that are supported by Duckling to create the new Egyptian Arabic locale.
New modifications are added to the `Numeral` dimension since I didn't spot differences in other dimensions.

Pull Request resolved: https://github.com/facebook/duckling/pull/554

Reviewed By: patapizza

Differential Revision: D25543502

Pulled By: chessai

fbshipit-source-id: 4cbb7be78a52071c8681380077f0b4dc033a60de
This commit is contained in:
Amr Keleg 2020-12-15 11:19:08 -08:00 committed by Facebook GitHub Bot
parent 181037e469
commit 703ff13210
8 changed files with 702 additions and 1 deletions

View File

@ -19,6 +19,7 @@ module Duckling.Locale
, CL
, CN
, CO
, EG
, GB
, HK
, IE
@ -138,7 +139,8 @@ makeLocale lang (Just region)
allLocales :: HashMap Lang (HashSet Region)
allLocales =
HashMap.fromList
[ (EN, HashSet.fromList [AU, BZ, CA, GB, IN, IE, JM, NZ, PH, ZA, TT, US])
[ (AR, HashSet.fromList [EG])
, (EN, HashSet.fromList [AU, BZ, CA, GB, IN, IE, JM, NZ, PH, ZA, TT, US])
, (ES, HashSet.fromList [R.AR, CL, CO, R.ES, MX, PE, VE])
, (NL, HashSet.fromList [BE, R.NL])
, (ZH, HashSet.fromList [CN, HK, MO, TW])

View File

@ -0,0 +1,257 @@
-- Copyright (c) 2016-present, Facebook, Inc.
-- All rights reserved.
--
-- This source code is licensed under the BSD-style license found in the
-- LICENSE file in the root directory of this source tree.
{-# LANGUAGE OverloadedStrings #-}
module Duckling.Numeral.AR.EG.Corpus (allExamples) where
import Data.String
import Prelude
import Duckling.Numeral.Types
import Duckling.Testing.Types
allExamples :: [Example]
allExamples = concat
[ examples (NumeralValue 0)
[ "0"
, "صفر"
, "٠"
, "٠٫٠"
]
, examples (NumeralValue 1)
[ "1"
, "واحد"
, "١"
, "١٫٠"
]
, examples (NumeralValue 4)
[ "4"
, "أربع"
, "اربعه"
, "٤"
, "٤٫٠"
]
, examples (NumeralValue 6)
[ "6"
, "ستة"
, "ست"
, "سته"
, "٦"
, "٦٫٠"
]
, examples (NumeralValue 10)
[ "10"
, "عشرة"
, "عشره"
, "١٠"
, "١٠٫٠"
]
, examples (NumeralValue 11)
[ "11"
, "حداشر"
, "١١"
, "١١٫٠"
]
, examples (NumeralValue 12)
[ "12"
, "إتناشر"
, "١٢"
, "١٢٫٠"
]
, examples (NumeralValue 14)
[ "14"
, "اربعتاشر"
, "١٤"
, "١٤٫٠"
]
, examples (NumeralValue 16)
[ "16"
, "ستاشر"
, "١٦"
, "١٦٫٠"
]
, examples (NumeralValue 17)
[ "17"
, "سبعتاشر"
, "١٧"
, "١٧٫٠"
]
, examples (NumeralValue 18)
[ "18"
, "تمنتاشر"
, "١٨"
, "١٨٫٠"
]
, examples (NumeralValue 20)
[ "عشرين"
, "٢٠"
, "٢٠٫٠"
]
, examples (NumeralValue 21)
[ "واحد و عشرين"
, "21"
, "٢١"
, "٢١٫٠"
]
, examples (NumeralValue 24)
[ "أربعة و عشرين"
, "24"
, "٢٤"
, "٢٤٫٠"
]
, examples (NumeralValue 26)
[ "ستة و عشرين"
, "26"
, "٢٦"
, "٢٦٫٠"
]
, examples (NumeralValue 20)
[ "عشرين"
, "٢٠"
]
, examples (NumeralValue 30)
[ "ثلاثين"
, "٣٠"
, "٣٠٫٠"
]
, examples (NumeralValue 33)
[ "33"
, "تلاتة و تلاتين"
, "٣٣"
, "٣٣٫٠"
]
, examples (NumeralValue 40)
[ "أربعين"
, "٤٠"
, "٤٠٫٠"
]
, examples (NumeralValue 200)
[ "متين"
, "٢٠٠"
, "٢٠٠٫٠"
]
, examples (NumeralValue 210)
[ "متين وعشرة"
, "٢١٠"
, "٢١٠٫٠"
]
, examples (NumeralValue 221)
[ "متين واحد وعشرين"
, "٢٢١"
, "٢٢١٫٠"
]
, examples (NumeralValue 263)
[ "متين تلاتة وستين"
, "٢٦٣"
, "٢٦٣٫٠"
]
, examples (NumeralValue 300)
[ "تلتمية"
, "٣٠٠"
, "٣٠٠٫٠"
]
, examples (NumeralValue 350)
[ "تلتمية وخمسين"
, "٣٥٠"
, "٣٥٠٫٠"
]
, examples (NumeralValue 500)
[ "خمسمية"
, "٥٠٠"
, "٥٠٠٫٠"
]
, examples (NumeralValue 525)
[ "خمسمية خمسة و عشرين"
, "525"
, "٥٢٥"
, "٥٢٥٫٠"
]
, examples (NumeralValue 700)
[ "سبعمية"
, "٧٠٠"
, "٧٠٠٫٠"
]
, examples (NumeralValue 1.1)
[ "1.1"
, "1.10"
, "01.10"
, "1 فاصلة 1"
, "واحد فاصلة واحد"
, "١١/١٠"
, "١٫١"
, "١٫١٠"
]
, examples (NumeralValue 0.77)
[ "0.77"
, ".77"
, "٧٧/١٠٠"
, "٫٧٧"
, "٠٫٧٧"
, "٠٫٧٧٠"
]
, examples (NumeralValue 2000)
[ "2000"
, "٢٠٠٠"
, "الفين"
]
, examples (NumeralValue 100000)
[ "100000"
, "100 الف"
, "١٠٠٠٠٠"
, "١٠٠٠٠٠٫٠٠"
, "١٠٠٬٠٠٠"
, "١٠٠٬٠٠٠٫٠"
]
, examples (NumeralValue 10000)
[ "10000"
, "10 آلاف"
, "١٠٠٠٠"
, "١٠٠٠٠٫٠٠"
, "١٠٬٠٠٠"
, "١٠٬٠٠٠٫٠٠"
]
, examples (NumeralValue 1000000)
[ "1000000"
, "مليون"
, "١٠٠٠٠٠٠"
, "١٠٠٠٠٠٠٫٠٠"
, "١٬٠٠٠٬٠٠٠"
, "١٬٠٠٠٬٠٠٠٫٠٠"
]
, examples (NumeralValue 2000000)
[ "2000000"
, "2 مليون"
, "اتنين مليون"
, "٢٠٠٠٠٠٠"
, "٢٠٠٠٠٠٠٫٠٠"
, "٢٬٠٠٠٬٠٠٠"
, "٢٬٠٠٠٬٠٠٠٫٠٠"
]
, examples (NumeralValue 3000000)
[ "3 ملايين"
, "3000000"
, "3 مليون"
, "٣٠٠٠٠٠٠"
, "٣٠٠٠٠٠٠٫٠٠"
, "٣٬٠٠٠٬٠٠٠"
, "٣٬٠٠٠٬٠٠٠٫٠٠"
]
, examples (NumeralValue (-1200000))
[ "-1200000"
, "-١٢٠٠٠٠٠"
, "-١٢٠٠٠٠٠٫٠٠"
, "-١٬٢٠٠٬٠٠٠"
, "-١٬٢٠٠٬٠٠٠٫٠٠"
]
, examples (NumeralValue (-1.2))
[ "-١٢/١٠"
, "-1.2"
, "-١٫٢"
, "-١٫٢٠"
]
]

View File

@ -0,0 +1,80 @@
-- Copyright (c) 2016-present, Facebook, Inc.
-- All rights reserved.
--
-- This source code is licensed under the BSD-style license found in the
-- LICENSE file in the root directory of this source tree.
{-# LANGUAGE GADTs #-}
{-# LANGUAGE OverloadedStrings #-}
module Duckling.Numeral.AR.EG.Helpers
( digitsMap
, numeralToStringMap
, parseArabicDoubleAsText
, parseArabicDoubleFromText
, parseArabicIntAsText
, parseArabicIntegerFromText
) where
import Control.Monad (join)
import Data.HashMap.Strict (HashMap)
import Data.Maybe (mapMaybe)
import Data.String
import Data.Text (Text)
import Duckling.Numeral.Helpers
( parseDouble
, parseInteger
)
import Prelude
import qualified Data.HashMap.Strict as HashMap
import qualified Data.Text as Text
numeralToStringMap :: HashMap Char String
numeralToStringMap =
HashMap.fromList
[ ('٠', "0")
, ('١', "1")
, ('٢', "2")
, ('٣', "3")
, ('٤', "4")
, ('٥', "5")
, ('٦', "6")
, ('٧', "7")
, ('٨', "8")
, ('٩', "9")
]
digitsMap :: HashMap Text Integer
digitsMap =
HashMap.fromList
[ ("اتنين", 2)
, ("تلات", 3)
, ("اربع", 4)
, ("أربع", 4)
, ("خمس", 5)
, ("ست", 6)
, ("سبع", 7)
, ("تمان", 8)
, ("تسع", 9)
]
parseArabicIntAsText :: Text -> Text
parseArabicIntAsText =
Text.pack
. join
. mapMaybe (`HashMap.lookup` numeralToStringMap)
. Text.unpack
parseArabicIntegerFromText :: Text -> Maybe Integer
parseArabicIntegerFromText = parseInteger . parseArabicIntAsText
parseArabicDoubleAsText :: Text -> Text
parseArabicDoubleAsText =
Text.pack
. join
. mapMaybe (`HashMap.lookup` HashMap.insert '٫' "." numeralToStringMap)
. Text.unpack
parseArabicDoubleFromText :: Text -> Maybe Double
parseArabicDoubleFromText = parseDouble . parseArabicDoubleAsText

View File

@ -0,0 +1,343 @@
-- Copyright (c) 2016-present, Facebook, Inc.
-- All rights reserved.
--
-- This source code is licensed under the BSD-style license found in the
-- LICENSE file in the root directory of this source tree.
{-# LANGUAGE GADTs #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE NoRebindableSyntax #-}
module Duckling.Numeral.AR.EG.Rules (rules) where
import Data.Maybe
import Prelude
import qualified Data.HashMap.Strict as HashMap
import Duckling.Dimensions.Types
import Duckling.Numeral.AR.EG.Helpers
( digitsMap
, parseArabicDoubleFromText
, parseArabicIntegerFromText
)
import Duckling.Numeral.Helpers
import Duckling.Numeral.Types (NumeralData (..))
import Duckling.Regex.Types
import Duckling.Types
import qualified Duckling.Numeral.Types as TNumeral
ruleInteger2 :: Rule
ruleInteger2 = Rule
{ name = "integer 2"
, pattern =
[ regex "[إأا]?تني[ي]?ن"
]
, prod = \_ -> integer 2
}
ruleInteger3 :: Rule
ruleInteger3 = Rule
{ name = "integer 3"
, pattern =
[ regex "تلات[هة]?"
]
, prod = \_ -> integer 3
}
ruleInteger8 :: Rule
ruleInteger8 = Rule
{ name = "integer 8"
, pattern =
[ regex "تمان(ي[هة])?"
]
, prod = \_ -> integer 8
}
ruleInteger11 :: Rule
ruleInteger11 = Rule
{ name = "integer 11"
, pattern =
[ regex "(إأا)?حداشر"
]
, prod = \_ -> integer 11
}
ruleInteger12 :: Rule
ruleInteger12 = Rule
{ name = "integer 12"
, pattern =
[ regex "[إأا]?[تط]ناشر"
]
, prod = \_ -> integer 12
}
ruleInteger13 :: Rule
ruleInteger13 = Rule
{ name = "integer 13"
, pattern =
[ regex "[تط]ل(ا)?[تط]اشر"
]
, prod = \_ -> integer 13
}
ruleInteger14 :: Rule
ruleInteger14 = Rule
{ name = "integer 14"
, pattern =
[ regex "[إأا]ربع[تط]اشر"
]
, prod = \_ -> integer 14
}
ruleInteger15 :: Rule
ruleInteger15 = Rule
{ name = "integer 15"
, pattern =
[ regex "خمس[تط]اشر"
]
, prod = \_ -> integer 15
}
ruleInteger16 :: Rule
ruleInteger16 = Rule
{ name = "integer 16"
, pattern =
[ regex "س[تط]اشر"
]
, prod = \_ -> integer 16
}
ruleInteger17 :: Rule
ruleInteger17 = Rule
{ name = "integer 17"
, pattern =
[ regex "سبع[تط]اشر"
]
, prod = \_ -> integer 17
}
ruleInteger18 :: Rule
ruleInteger18 = Rule
{ name = "integer 18"
, pattern =
[ regex "[تط]من[تط]اشر"
]
, prod = \_ -> integer 18
}
ruleInteger19 :: Rule
ruleInteger19 = Rule
{ name = "integer 19"
, pattern =
[ regex "تسع[تط]اشر"
]
, prod = \_ -> integer 19
}
ruleInteger30_80 :: Rule
ruleInteger30_80 = Rule
{ name = "integer (30, 80)"
, pattern =
[ regex "([تط]لا[تط]|[تط]مان)(ين)"
]
, prod = \tokens -> case tokens of
Token RegexMatch (GroupMatch (match:_)):_ ->
(* 10) <$> HashMap.lookup match digitsMap >>= integer
_ -> Nothing
}
ruleInteger100 :: Rule
ruleInteger100 = Rule
{ name = "integer (100)"
, pattern =
[ regex "مي[هةت]"
]
, prod = const $ integer 100
}
ruleInteger200 :: Rule
ruleInteger200 = Rule
{ name = "integer (200)"
, pattern =
[ regex "م(ي)?تين"
]
, prod = const $ integer 200
}
ruleInteger300 :: Rule
ruleInteger300 = Rule
{ name = "integer (300)"
, pattern =
[ regex "([تط]و?لا?[تط]و?)مي[هةت]"
]
, prod = const $ integer 300
}
ruleInteger400 :: Rule
ruleInteger400 = Rule
{ name = "integer (400)"
, pattern =
[ regex "(رو?بعو?)مي[هةت]"
]
, prod = const $ integer 400
}
ruleInteger500 :: Rule
ruleInteger500 = Rule
{ name = "integer (500)"
, pattern =
[ regex "(خو?مسو?)مي[هةت]"
]
, prod = const $ integer 500
}
ruleInteger600 :: Rule
ruleInteger600 = Rule
{ name = "integer (600)"
, pattern =
[ regex "(سو?تو?)مي[هةت]"
]
, prod = const $ integer 600
}
ruleInteger700 :: Rule
ruleInteger700 = Rule
{ name = "integer (700)"
, pattern =
[ regex "(سو?بعو?)مي[هةت]"
]
, prod = const $ integer 700
}
ruleInteger800 :: Rule
ruleInteger800 = Rule
{ name = "integer (800)"
, pattern =
[ regex "([تط]و?منو?)مي[هةت]"
]
, prod = const $ integer 800
}
ruleInteger900 :: Rule
ruleInteger900 = Rule
{ name = "integer (900)"
, pattern =
[ regex "([تط]و?سعو?)مي[هةت]"
]
, prod = const $ integer 900
}
ruleInteger101_999 :: Rule
ruleInteger101_999 = Rule
{ name = "integer 101..999"
, pattern =
[ oneOf [100, 200 .. 900]
, regex "\\s"
, oneOf $ map (+) [10, 20 .. 90] <*> [1..9]
]
, prod = \tokens -> case tokens of
(Token Numeral NumeralData{TNumeral.value = v1}:
_:
Token Numeral NumeralData{TNumeral.value = v2}:
_) -> double $ v1 + v2
_ -> Nothing
}
ruleInteger3000 :: Rule
ruleInteger3000 = Rule
{ name = "integer (3000)"
, pattern =
[ regex "([تط]لا?[تط])( ?[اآ])?لاف"
]
, prod = const $ integer 3000
}
ruleInteger4000 :: Rule
ruleInteger4000 = Rule
{ name = "integer (4000)"
, pattern =
[ regex "[أا]ربع(ت| ?[اآ])لاف"
]
, prod = const $ integer 4000
}
ruleInteger5000 :: Rule
ruleInteger5000 = Rule
{ name = "integer (5000)"
, pattern =
[ regex "خمس(ت| ?[اآ])لاف"
]
, prod = const $ integer 5000
}
ruleInteger6000 :: Rule
ruleInteger6000 = Rule
{ name = "integer (6000)"
, pattern =
[ regex "س(ت| ?[اآ])لاف"
]
, prod = const $ integer 6000
}
ruleInteger7000 :: Rule
ruleInteger7000 = Rule
{ name = "integer (7000)"
, pattern =
[ regex "سبع(ت| ?[اآ])لاف"
]
, prod = const $ integer 7000
}
ruleInteger8000 :: Rule
ruleInteger8000 = Rule
{ name = "integer (8000)"
, pattern =
[ regex "[تط]م[ا]?ن(ت| ?[اآ])لاف"
]
, prod = const $ integer 8000
}
ruleInteger9000 :: Rule
ruleInteger9000 = Rule
{ name = "integer (9000)"
, pattern =
[ regex "([تط]سع)(ت| ?[اآ])لاف"
]
, prod = const $ integer 9000
}
rules :: [Rule]
rules =
[ ruleInteger2
, ruleInteger3
, ruleInteger8
, ruleInteger11
, ruleInteger12
, ruleInteger13
, ruleInteger14
, ruleInteger15
, ruleInteger16
, ruleInteger17
, ruleInteger18
, ruleInteger19
, ruleInteger30_80
, ruleInteger100
, ruleInteger200
, ruleInteger300
, ruleInteger400
, ruleInteger500
, ruleInteger600
, ruleInteger700
, ruleInteger800
, ruleInteger900
, ruleInteger101_999
, ruleInteger3000
, ruleInteger4000
, ruleInteger5000
, ruleInteger6000
, ruleInteger7000
, ruleInteger8000
, ruleInteger9000
]

View File

@ -30,6 +30,7 @@ data Region
| CL
| CN
| CO
| EG
| ES
| GB
| HK

View File

@ -20,6 +20,7 @@ import Duckling.Types
import qualified Duckling.AmountOfMoney.AR.Rules as AmountOfMoney
import qualified Duckling.Duration.AR.Rules as Duration
import qualified Duckling.Numeral.AR.Rules as Numeral
import qualified Duckling.Numeral.AR.EG.Rules as NumeralEG
import qualified Duckling.Ordinal.AR.Rules as Ordinal
import qualified Duckling.PhoneNumber.AR.Rules as PhoneNumber
import qualified Duckling.Quantity.AR.Rules as Quantity
@ -32,6 +33,7 @@ defaultRules :: Seal Dimension -> [Rule]
defaultRules = langRules
localeRules :: Region -> Seal Dimension -> [Rule]
localeRules EG (Seal Numeral) = NumeralEG.rules
localeRules region (Seal (CustomDimension dim)) = dimLocaleRules region dim
localeRules _ _ = []

View File

@ -400,6 +400,9 @@ library
, Duckling.Numeral.AR.Corpus
, Duckling.Numeral.AR.Helpers
, Duckling.Numeral.AR.Rules
, Duckling.Numeral.AR.EG.Corpus
, Duckling.Numeral.AR.EG.Helpers
, Duckling.Numeral.AR.EG.Rules
, Duckling.Numeral.BG.Corpus
, Duckling.Numeral.BG.Rules
, Duckling.Numeral.BN.Corpus

View File

@ -13,10 +13,23 @@ import Data.String
import Test.Tasty
import Duckling.Dimensions.Types
import Duckling.Locale
import Duckling.Numeral.AR.Corpus
import Duckling.Testing.Asserts
import Duckling.Testing.Types hiding (examples)
import qualified Duckling.Numeral.AR.EG.Corpus as EG
tests :: TestTree
tests = testGroup "AR Tests"
[ makeCorpusTest [Seal Numeral] corpus
, localeTests
]
localeTests :: TestTree
localeTests = testGroup "Locale Tests"
[ testGroup "AR_EG Tests"
[ makeCorpusTest [Seal Numeral] $ withLocale corpus localeEG EG.allExamples
]
]
where
localeEG = makeLocale AR $ Just EG