From 41f140992d88d0d32da28b7801c5f9af6e74dabf Mon Sep 17 00:00:00 2001 From: Jens Persson Date: Thu, 23 May 2019 15:58:12 -0700 Subject: [PATCH] Add support for larger spelled-out Danish ordinals (#346) Summary: Adds support for larger spelled-out Danish ordinal number expressions, like treoghalvfemsindstyvende (93rd) or tohundrede og femogfyrrende (245th) Pull Request resolved: https://github.com/facebook/duckling/pull/346 Reviewed By: patapizza Differential Revision: D14476918 Pulled By: chinmay87 fbshipit-source-id: eb20ee8d304f291ff4ab2b28c4e272a9d447396e --- Duckling/Numeral/DA/Rules.hs | 4 +- Duckling/Ordinal/DA/Corpus.hs | 27 ++++- Duckling/Ordinal/DA/Rules.hs | 161 ++++++++++++++++++++------ Duckling/Ranking/Classifiers/DA_XX.hs | 53 ++++----- Duckling/Ranking/Classifiers/IT_XX.hs | 4 +- 5 files changed, 176 insertions(+), 73 deletions(-) diff --git a/Duckling/Numeral/DA/Rules.hs b/Duckling/Numeral/DA/Rules.hs index 22438bd4..e60dd886 100644 --- a/Duckling/Numeral/DA/Rules.hs +++ b/Duckling/Numeral/DA/Rules.hs @@ -141,14 +141,16 @@ rulePowersOfTen :: Rule rulePowersOfTen = Rule { name = "powers of tens" , pattern = - [ regex "(hundrede?|tusinde?|million(er)?)" + [ regex "(hundrede?|tohundrede|tusinde?|totusinde|million(er)?)" ] , prod = \tokens -> case tokens of (Token RegexMatch (GroupMatch (match:_)):_) -> case Text.toLower match of "hundred" -> double 1e2 >>= withGrain 2 >>= withMultipliable "hundrede" -> double 1e2 >>= withGrain 2 >>= withMultipliable + "tohundrede" -> double (2 * 1e2) >>= withGrain 2 >>= withMultipliable "tusind" -> double 1e3 >>= withGrain 3 >>= withMultipliable "tusinde" -> double 1e3 >>= withGrain 3 >>= withMultipliable + "totusinde" -> double (2 * 1e3) >>= withGrain 3 >>= withMultipliable "million" -> double 1e6 >>= withGrain 6 >>= withMultipliable "millioner" -> double 1e6 >>= withGrain 6 >>= withMultipliable _ -> Nothing diff --git a/Duckling/Ordinal/DA/Corpus.hs b/Duckling/Ordinal/DA/Corpus.hs index d63c6cd1..8368fb05 100644 --- 
a/Duckling/Ordinal/DA/Corpus.hs +++ b/Duckling/Ordinal/DA/Corpus.hs @@ -22,9 +22,24 @@ corpus :: Corpus corpus = (testContext {locale = makeLocale DA Nothing}, testOptions, allExamples) allExamples :: [Example] -allExamples = - examples (OrdinalData 4) - [ "4." - , "fjerde" - , "Fjerde" - ] +allExamples = concat + [ examples (OrdinalData 4) + [ "4." + , "fjerde" + , "Fjerde" + ] + , examples (OrdinalData 41) + [ "enogfyrrende" + ] + , examples (OrdinalData 78) + [ "otteoghalvfjerdsindstyvende" + ] + , examples (OrdinalData 263) + [ "to hundrede og treogtresindstyvende" + , "tohundrede og treogtresindstyvende" + ] + , examples (OrdinalData 70) + [ "halvfjerdsende" + , "halvfjerdsindstyvende" + ] + ] diff --git a/Duckling/Ordinal/DA/Rules.hs b/Duckling/Ordinal/DA/Rules.hs index 4fe9678f..e700a6e6 100644 --- a/Duckling/Ordinal/DA/Rules.hs +++ b/Duckling/Ordinal/DA/Rules.hs @@ -7,60 +7,143 @@ {-# LANGUAGE GADTs #-} {-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE NoRebindableSyntax #-} module Duckling.Ordinal.DA.Rules ( rules ) where -import qualified Data.Text as Text + +import Data.HashMap.Strict (HashMap) +import Data.Text (Text) import Prelude -import Data.String +import qualified Data.HashMap.Strict as HashMap +import qualified Data.Text as Text import Duckling.Dimensions.Types -import Duckling.Numeral.Helpers (parseInt) +import Duckling.Numeral.Helpers (parseInt, numberWith) +import Duckling.Numeral.Types (NumeralData (..), getIntValue) import Duckling.Ordinal.Helpers +import Duckling.Ordinal.Types (OrdinalData (..)) import Duckling.Regex.Types import Duckling.Types +import qualified Duckling.Numeral.Types as TNumeral + +ordinalsMap :: HashMap Text Int +ordinalsMap = HashMap.fromList + [ ( "første", 1 ) + , ( "anden", 2 ) + , ( "tredje", 3 ) + , ( "fjerde", 4 ) + , ( "femte", 5 ) + , ( "sjette", 6 ) + , ( "syvende", 7 ) + , ( "ottende", 8 ) + , ( "niende", 9 ) + , ( "tiende", 10 ) + , ( "elfte", 11 ) + , ( "tolvte", 12 ) + , ( 
"trettende", 13 ) + , ( "fjortende", 14 ) + , ( "femtende", 15 ) + , ( "sekstende", 16 ) + , ( "syttende", 17 ) + , ( "attende", 18 ) + , ( "nittende", 19 ) + , ( "tyvende", 20 ) + , ( "enogtyvende", 21 ) + , ( "toogtyvende", 22 ) + , ( "treogtyvende", 23 ) + , ( "fireogtyvende", 24 ) + , ( "femogtyvende", 25 ) + , ( "seksogtyvende", 26 ) + , ( "syvogtyvende", 27 ) + , ( "otteogtyvende", 28 ) + , ( "niogtyvende", 29 ) + , ( "tredivte", 30 ) + , ( "enogtredivte", 31 ) + ] + +cardinalsMap :: HashMap Text Int +cardinalsMap = HashMap.fromList + [ ( "tyvende", 20 ) + , ( "tredivte", 30 ) + , ( "fyrrende", 40 ) + , ( "fyrretyvende", 40 ) + , ( "halvtredsende", 50 ) + , ( "halvtredsindstyvende", 50 ) + , ( "tressende", 60 ) + , ( "tresindstyvende", 60 ) + , ( "halvfjerdsende", 70 ) + , ( "halvfjerdsindstyvende", 70 ) + , ( "firsende", 80 ) + , ( "firsindstyvende", 80 ) + , ( "halvfemsende", 90 ) + , ( "halvfemsindstyvende", 90 ) + ] + +oneValMap :: HashMap Text Int +oneValMap = HashMap.fromList + [ ( "", 0 ) + , ( "enog", 1 ) + , ( "toog", 2 ) + , ( "treog", 3 ) + , ( "fireog", 4 ) + , ( "femog", 5 ) + , ( "seksog", 6 ) + , ( "syvog", 7 ) + , ( "otteog", 8 ) + , ( "niog", 9 ) + ] ruleOrdinalsFirstst :: Rule ruleOrdinalsFirstst = Rule - { name = "ordinals (first..31st)" + { name = "ordinals (first..19st)" , pattern = - [ regex "(første|anden|tredje|fjerde|femte|sjette|syvende|ottende|niende|tiende|elfte|tolvte|trettende|fjortende|femtende|sekstende|syttende|attende|nittende|tyvende|tenogtyvende|toogtyvende|treogtyvende|fireogtyvende|femogtyvende|seksogtyvende|syvogtyvende|otteogtyvende|niogtyvende|tredivte|enogtredivte)" + [ regex "(første|anden|tredje|fjerde|femte|sjette|syvende|ottende|niende|tiende|elfte|tolvte|trettende|fjortende|femtende|sekstende|syttende|attende|nittende)" ] , prod = \tokens -> case tokens of - (Token RegexMatch (GroupMatch (match:_)):_) -> case Text.toLower match of - "første" -> Just $ ordinal 1 - "anden" -> Just $ ordinal 2 - "tredje" -> Just $ 
ordinal 3 - "fjerde" -> Just $ ordinal 4 - "femte" -> Just $ ordinal 5 - "sjette" -> Just $ ordinal 6 - "syvende" -> Just $ ordinal 7 - "ottende" -> Just $ ordinal 8 - "niende" -> Just $ ordinal 9 - "tiende" -> Just $ ordinal 10 - "elfte" -> Just $ ordinal 11 - "tolvte" -> Just $ ordinal 12 - "trettende" -> Just $ ordinal 13 - "fjortende" -> Just $ ordinal 14 - "femtende" -> Just $ ordinal 15 - "sekstende" -> Just $ ordinal 16 - "syttende" -> Just $ ordinal 17 - "attende" -> Just $ ordinal 18 - "nittende" -> Just $ ordinal 19 - "tyvende" -> Just $ ordinal 20 - "tenogtyvende" -> Just $ ordinal 21 - "toogtyvende" -> Just $ ordinal 22 - "treogtyvende" -> Just $ ordinal 23 - "fireogtyvende" -> Just $ ordinal 24 - "femogtyvende" -> Just $ ordinal 25 - "seksogtyvende" -> Just $ ordinal 26 - "syvogtyvende" -> Just $ ordinal 27 - "otteogtyvende" -> Just $ ordinal 28 - "niogtyvende" -> Just $ ordinal 29 - "tredivte" -> Just $ ordinal 30 - "enogtredivte" -> Just $ ordinal 31 - _ -> Nothing + (Token RegexMatch (GroupMatch (match:_)):_) -> + ordinal <$> HashMap.lookup (Text.toLower match) ordinalsMap + _ -> Nothing + } + +ruleSpelledOutOrdinals :: Rule +ruleSpelledOutOrdinals = Rule + { name = "ordinals, 20 to 99, spelled-out" + , pattern = + [ regex (concat ["((?:en|to|tre|fire|fem|seks|syv|otte|ni)og)?", + "(tyvende", + "|tredivte", + "|fyrr(?:etyv)?ende", + "|halvtreds(?:indstyv)?ende", + "|tres(?:indstyv|s)?ende", + "|halvfjerds(?:indstyv)?ende", + "|firs(?:indstyv)?ende", + "|halvfems(?:indstyv)?ende)"]) + ] + , prod = \case + (Token RegexMatch (GroupMatch (ones:tens:_)):_) -> do + oneVal <- HashMap.lookup (Text.toLower ones) oneValMap + tenVal <- HashMap.lookup (Text.toLower tens) cardinalsMap + Just $ ordinal (oneVal + tenVal) + _ -> Nothing + + } + +ruleSpelledOutBigOrdinals :: Rule +ruleSpelledOutBigOrdinals = Rule + { name = "ordinals, above 99, spelled out" + , pattern = + [ numberWith TNumeral.value (> 99) + , regex "og" + , dimension Ordinal + ] + , prod = \case + 
Token Numeral NumeralData {TNumeral.value=maybenumnum}:_:Token Ordinal (OrdinalData ordnum):_ -> + case getIntValue maybenumnum of + Just numnum -> Just $ ordinal (numnum + ordnum) + Nothing -> Nothing _ -> Nothing } @@ -81,4 +164,6 @@ rules :: [Rule] rules = [ ruleOrdinalDigits , ruleOrdinalsFirstst + , ruleSpelledOutOrdinals + , ruleSpelledOutBigOrdinals ] diff --git a/Duckling/Ranking/Classifiers/DA_XX.hs b/Duckling/Ranking/Classifiers/DA_XX.hs index 89c9547d..4ae6c932 100644 --- a/Duckling/Ranking/Classifiers/DA_XX.hs +++ b/Duckling/Ranking/Classifiers/DA_XX.hs @@ -2,7 +2,8 @@ -- All rights reserved. -- -- This source code is licensed under the BSD-style license found in the --- LICENSE file in the root directory of this source tree. +-- LICENSE file in the root directory of this source tree. An additional grant +-- of patent rights can be found in the PATENTS file in the same directory. ----------------------------------------------------------------- -- Auto-generated by regenClassifiers @@ -276,7 +277,7 @@ classifiers HashMap.fromList [("ordinal (digits)quarter (grain)", -1.252762968495368), ("quarter", -0.8472978603872037), - ("ordinals (first..31st)quarter (grain)", -1.252762968495368)], + ("ordinals (first..19st)quarter (grain)", -1.252762968495368)], n = 2}, koData = ClassData{prior = -0.6931471805599453, @@ -285,7 +286,7 @@ classifiers HashMap.fromList [("ordinal (digits)quarter (grain)", -1.252762968495368), ("quarter", -0.8472978603872037), - ("ordinals (first..31st)quarter (grain)", -1.252762968495368)], + ("ordinals (first..19st)quarter (grain)", -1.252762968495368)], n = 2}}), ("intersect", Classifier{okData = @@ -419,12 +420,12 @@ classifiers likelihoods = HashMap.fromList [("daymonth", -1.7346010553881064), - ("ordinals (first..31st)week (grain)October", + ("ordinals (first..19st)week (grain)intersect", -1.7346010553881064), - ("ordinals (first..31st)week (grain)intersect", + ("ordinals (first..19st)week (grain)October", -1.7346010553881064), 
("weekmonth", -1.2237754316221157), - ("ordinals (first..31st)day (grain)October", + ("ordinals (first..19st)day (grain)October", -1.7346010553881064)], n = 6}, koData = @@ -566,7 +567,7 @@ classifiers ClassData{prior = 0.0, unseen = -2.0794415416798357, likelihoods = HashMap.fromList - [("ordinals (first..31st)quarter (grain)year", + [("ordinals (first..19st)quarter (grain)year", -1.252762968495368), ("quarteryear", -0.8472978603872037), ("ordinal (digits)quarter (grain)year", -1.252762968495368)], @@ -624,9 +625,9 @@ classifiers likelihoods = HashMap.fromList [("daymonth", -0.8938178760220964), - ("ordinals (first..31st)TuesdayOctober", -1.9924301646902063), - ("ordinals (first..31st)Tuesdayintersect", -1.9924301646902063), - ("ordinals (first..31st)Wednesdayintersect", + ("ordinals (first..19st)Tuesdayintersect", -1.9924301646902063), + ("ordinals (first..19st)TuesdayOctober", -1.9924301646902063), + ("ordinals (first..19st)Wednesdayintersect", -1.4816045409242156)], n = 8}, koData = @@ -635,8 +636,8 @@ classifiers likelihoods = HashMap.fromList [("daymonth", -0.9444616088408514), - ("ordinals (first..31st)WednesdayOctober", -1.2809338454620642), - ("ordinals (first..31st)TuesdaySeptember", -1.791759469228055)], + ("ordinals (first..19st)WednesdayOctober", -1.2809338454620642), + ("ordinals (first..19st)TuesdaySeptember", -1.791759469228055)], n = 6}}), ("the (non ordinal)", Classifier{okData = @@ -646,15 +647,6 @@ classifiers koData = ClassData{prior = -infinity, unseen = -0.6931471805599453, likelihoods = HashMap.fromList [], n = 0}}), - ("ordinals (first..31st)", - Classifier{okData = - ClassData{prior = -5.406722127027582e-2, - unseen = -2.995732273553991, - likelihoods = HashMap.fromList [("", 0.0)], n = 18}, - koData = - ClassData{prior = -2.9444389791664407, - unseen = -1.0986122886681098, - likelihoods = HashMap.fromList [("", 0.0)], n = 1}}), ("April", Classifier{okData = ClassData{prior = 0.0, unseen = -1.6094379124341003, @@ -707,7 +699,7 @@ 
classifiers unseen = -3.258096538021482, likelihoods = HashMap.fromList - [("ordinals (first..31st)March", -1.8325814637483102), + [("ordinals (first..19st)March", -1.8325814637483102), ("ordinal (digits)February", -1.8325814637483102), ("month", -0.8209805520698302), ("ordinal (digits)March", -1.6094379124341003)], @@ -717,7 +709,7 @@ classifiers unseen = -2.0794415416798357, likelihoods = HashMap.fromList - [("ordinals (first..31st)April", -1.252762968495368), + [("ordinals (first..19st)April", -1.252762968495368), ("month", -1.252762968495368)], n = 1}}), ("numbers prefix with -, negative or minus", @@ -802,7 +794,7 @@ classifiers ClassData{prior = 0.0, unseen = -2.3978952727983707, likelihoods = HashMap.fromList - [("ordinals (first..31st)", -1.2039728043259361), + [("ordinals (first..19st)", -1.2039728043259361), ("ordinal (digits)", -0.35667494393873245)], n = 8}, koData = @@ -878,6 +870,15 @@ classifiers koData = ClassData{prior = -infinity, unseen = -0.6931471805599453, likelihoods = HashMap.fromList [], n = 0}}), + ("ordinals (first..19st)", + Classifier{okData = + ClassData{prior = -5.406722127027582e-2, + unseen = -2.995732273553991, + likelihoods = HashMap.fromList [("", 0.0)], n = 18}, + koData = + ClassData{prior = -2.9444389791664407, + unseen = -1.0986122886681098, + likelihoods = HashMap.fromList [("", 0.0)], n = 1}}), ("about ", Classifier{okData = ClassData{prior = -0.6931471805599453, @@ -1544,8 +1545,8 @@ classifiers ClassData{prior = 0.0, unseen = -2.3978952727983707, likelihoods = HashMap.fromList - [("ordinals (first..31st)April", -1.6094379124341003), - ("ordinals (first..31st)March", -1.6094379124341003), + [("ordinals (first..19st)April", -1.6094379124341003), + ("ordinals (first..19st)March", -1.6094379124341003), ("month", -0.916290731874155), ("ordinal (digits)March", -1.6094379124341003)], n = 3}, diff --git a/Duckling/Ranking/Classifiers/IT_XX.hs b/Duckling/Ranking/Classifiers/IT_XX.hs index 139753f8..7aa8a9e5 100644 --- 
a/Duckling/Ranking/Classifiers/IT_XX.hs +++ b/Duckling/Ranking/Classifiers/IT_XX.hs @@ -319,10 +319,10 @@ classifiers unseen = -4.31748811353631, likelihoods = HashMap.fromList - [(" (latent time-of-day)", -0.9718605830289658), + [(" (latent time-of-day)", -0.9718605830289657), ("intersect by \"di\", \"della\", \"del\"", -3.20545280453606), ("day", -2.3581549441488563), ("Lunedi", -3.6109179126442243), - ("hour", -0.9718605830289658), + ("hour", -0.9718605830289657), ("two time tokens separated by `di`", -3.20545280453606), ("Domenica", -3.6109179126442243)], n = 33}}),