From 30b3d29e86030f2cb71291f8670b0df5d8c22aba Mon Sep 17 00:00:00 2001 From: Alex Kapranoff Date: Tue, 30 Nov 2021 09:42:37 -0800 Subject: [PATCH] (Time/TimeGrain/Ordinal)/RU_XX: several extra Time forms for Russian Summary: Some changes were originally suggested by me during the review of https://github.com/facebook/duckling/pull/474. Others are new. 1. "Day after tomorrow/before yesterday" 2. Ordinals in the form of number+suffix like "8th of March" 3. Tuesdays require a special preposition. 4. Support "Yo" (U+0451) in "fourth" and "during daylight". 5. Support special preposition for "next week". 6. Support "one before last" adjective for time grains. 7. Proper suffixes for "quarter" grain. 8. Support "at midnight". 9. Support alternative flag for "afternoon". Changes in Ordinal and TimeGrain are all driven by the new examples in the corpus for Time. There are also a couple of bugfixes: 1. A hidden Latin "e" was present in an otherwise Cyrillic regex. 2. Wrong order of options in a regex separated with "|" prevented some matches. 
Reviewed By: haoxuany Differential Revision: D32311714 fbshipit-source-id: 084f6c3893eb5bfd767c267f558b910c6854eb59 --- Duckling/Ordinal/RU/Corpus.hs | 77 ++ Duckling/Ordinal/RU/Rules.hs | 18 +- Duckling/Ranking/Classifiers/RU_XX.hs | 1256 ++++++++++++++++--------- Duckling/Time/RU/Corpus.hs | 118 ++- Duckling/Time/RU/Rules.hs | 57 +- Duckling/TimeGrain/RU/Rules.hs | 8 +- 6 files changed, 1033 insertions(+), 501 deletions(-) diff --git a/Duckling/Ordinal/RU/Corpus.hs b/Duckling/Ordinal/RU/Corpus.hs index 33e4d300..fea86b12 100644 --- a/Duckling/Ordinal/RU/Corpus.hs +++ b/Duckling/Ordinal/RU/Corpus.hs @@ -30,6 +30,7 @@ allExamples = concat , "первое" , "первой" , "первого" + , "первые" , "1ая" , "1-ая" , "1ый" @@ -40,6 +41,11 @@ allExamples = concat , "1-й" , "1го" , "1-го" + , "1-ого" + , "первой" + , "первым" + , "первому" + , "первым" ] , examples (OrdinalData 3) [ "третий" @@ -47,6 +53,7 @@ allExamples = concat , "третье" , "третьей" , "третьего" + , "третьи" , "3й" , "3ий" , "3я" @@ -61,6 +68,11 @@ allExamples = concat , "3-е" , "3-ье" , "3-го" + , "3-его" + , "третьей" + , "третьим" + , "третьему" + , "третьим" ] , examples (OrdinalData 4) [ "четвертый" @@ -68,6 +80,8 @@ allExamples = concat , "четвертое" , "четвертой" , "четвертого" + , "четвёртый" + , "четвертые" , "4й" , "4ый" , "4ая" @@ -80,39 +94,102 @@ allExamples = concat , "4-ое" , "4-ой" , "4-го" + , "4-ого" + , "четвертой" + , "четвертым" + , "четвёртому" + , "четвёртым" ] , examples (OrdinalData 15) [ "пятнадцатый" , "15й" , "15-й" + , "15-ый" + , "пятнадцатая" + , "15я" + , "15-я" + , "15-ая" + , "пятнадцатое" + , "15е" + , "15-е" + , "15-ое" + , "пятнадцатому" + , "пятнадцатые" + , "пятнадцатой" + , "пятнадцатым" + , "пятнадцатому" + , "пятнадцатым" + ] + , examples (OrdinalData 10) + [ "десятый" + , "10й" + , "10-й" + , "10-ый" + , "десятая" + , "десятой" ] , examples (OrdinalData 21) [ "21й" , "21-й" , "21-го" + , "21-ого" , "Двадцать первый" , "двадцать первый" , "двадцать первого" + , 
"двадцать первой" ] , examples (OrdinalData 23) [ "23й" , "23-й" , "двадцать третий" , "двадцать третьего" + , "двадцать третьей" + , "23-го" + , "23-его" ] , examples (OrdinalData 31) [ "31ый" , "31-ый" , "тридцать первый" ] + , examples (OrdinalData 30) + [ "30ый" + , "30-ый" + , "тридцатый" + ] , examples (OrdinalData 48) [ "48ое" , "48-ое" , "сорок восьмое" ] + , examples (OrdinalData 40) + [ "40ое" + , "40-ое" + , "сороковой" + , "сороковое" + , "сороковая" + ] , examples (OrdinalData 99) [ "99ый" , "99-й" + , "99-ый" , "девяносто девятый" + , "девяносто девятая" + ] + , examples (OrdinalData 90) + [ "90ый" + , "90-й" + , "90-ый" + , "девяностый" + , "девяностая" + ] + , examples (OrdinalData 100) + [ "сотое" + , "сотая" + , "сотый" + , "100-ая" + , "100-я" + , "100-й" + , "100-е" ] ] diff --git a/Duckling/Ordinal/RU/Rules.hs b/Duckling/Ordinal/RU/Rules.hs index 3a4f846d..50594fd7 100644 --- a/Duckling/Ordinal/RU/Rules.hs +++ b/Duckling/Ordinal/RU/Rules.hs @@ -31,6 +31,7 @@ ordinalsFirstthMap = HashMap.fromList , ( "втор", 2 ) , ( "трет", 3 ) , ( "четверт", 4 ) + , ( "четвёрт", 4 ) , ( "пят", 5 ) , ( "шест", 6 ) , ( "седьм", 7 ) @@ -47,6 +48,14 @@ ordinalsFirstthMap = HashMap.fromList , ( "восемнадцат", 18 ) , ( "девятнадцат", 19 ) , ( "двадцат", 20 ) + , ( "тридцат", 30 ) + , ( "сороков", 40 ) + , ( "пятидесят", 50 ) + , ( "шестидесят", 60 ) + , ( "семидесят", 70 ) + , ( "восьмидесят", 80 ) + , ( "девяност", 90 ) + , ( "сот", 100 ) ] cardinalsMap :: HashMap Text.Text Int @@ -59,13 +68,14 @@ cardinalsMap = HashMap.fromList , ( "семьдесят", 70 ) , ( "восемьдесят", 80 ) , ( "девяносто", 90 ) + , ( "сто", 100 ) ] ruleOrdinalsFirstth :: Rule ruleOrdinalsFirstth = Rule - { name = "ordinals (first..19th)" + { name = "ordinals (first..20th, then 30th, 40th, ..., 100th)" , pattern = - [ regex 
"(перв|втор|трет|четверт|пят|шест|седьм|восьм|девят|десят|одиннадцат|двенадцат|тринадцат|четырнадцат|пятнадцат|шестнадцат|семнадцат|восемнадцат|девятнадцат|двадцат)(ье(го|й)?|ого|ый|ой|ий|ая|ое|ья)" + [ regex "(перв|втор|трет|четв[её]рт|пят|шест|седьм|восьм|девят|десят|одиннадцат|двенадцат|тринадцат|четырнадцат|пятнадцат|шестнадцат|семнадцат|восемнадцат|девятнадцат|двадцат|тридцат|сороков|пятидесят|шестидесят|семидесят|восьмидесят|девяност|сот)(ь(его|ему|ей|ем|им|их|и|е)|ого|ому|ый|ой|ий|ая|ое|ья|ом|ые|ым|ых)" ] , prod = \tokens -> case tokens of (Token RegexMatch (GroupMatch (match:_)):_) -> @@ -78,7 +88,7 @@ ruleOrdinal = Rule { name = "ordinal 21..99" , pattern = [ regex "(двадцать|тридцать|сорок|пятьдесят|шестьдесят|семьдесят|восемьдесят|девяносто)" - , regex "(перв|втор|трет|четверт|пят|шест|седьм|восьм|девят)(ье(го|й)?|ого|ый|ой|ий|ая|ое|ья)" + , regex "(перв|втор|трет|четв[её]рт|пят|шест|седьм|восьм|девят)(ье(го|й)?|ого|ому|ый|ой|ий|ая|ое|ья|ые|ым|ых)" ] , prod = \tokens -> case tokens of (Token RegexMatch (GroupMatch (m1:_)): @@ -94,7 +104,7 @@ ruleOrdinalDigits :: Rule ruleOrdinalDigits = Rule { name = "ordinal (digits)" , pattern = - [ regex "0*(\\d+)-?((ы|о|и|а|e|ь)?(ее|й|я|е|го))" + [ regex "0*(\\d+)-?((ы|о|и|а|е|ь)?(ее|й|я|е|го|му?))" ] , prod = \tokens -> case tokens of (Token RegexMatch (GroupMatch (match:_)):_) -> ordinal <$> parseInt match diff --git a/Duckling/Ranking/Classifiers/RU_XX.hs b/Duckling/Ranking/Classifiers/RU_XX.hs index e986d1de..9cd5837a 100644 --- a/Duckling/Ranking/Classifiers/RU_XX.hs +++ b/Duckling/Ranking/Classifiers/RU_XX.hs @@ -30,11 +30,12 @@ classifiers likelihoods = HashMap.fromList [], n = 0}}), ("\1055\1086\1085\1077\1076\1077\1083\1100\1085\1080\1082", Classifier{okData = - ClassData{prior = 0.0, unseen = -1.791759469228055, - likelihoods = HashMap.fromList [("", 0.0)], n = 4}, + ClassData{prior = -0.1823215567939546, + unseen = -1.9459101490553135, + likelihoods = HashMap.fromList [("", 0.0)], n = 5}, koData = - 
ClassData{prior = -infinity, unseen = -0.6931471805599453, - likelihoods = HashMap.fromList [], n = 0}}), + ClassData{prior = -1.791759469228055, unseen = -1.0986122886681098, + likelihoods = HashMap.fromList [("", 0.0)], n = 1}}), ("