From fe4f77bdc0f873549150ca311a9783332b7b6776 Mon Sep 17 00:00:00 2001 From: Filipe Pereira <4221949+fil090302@users.noreply.github.com> Date: Wed, 4 Aug 2021 17:12:58 -0700 Subject: [PATCH] PT time improvements (#633) Summary: New rules for PT time expressions like "5 Maio", "Maio 5", "5 Maio 2022". Pull Request resolved: https://github.com/facebook/duckling/pull/633 Reviewed By: stroxler Differential Revision: D30114330 Pulled By: chessai fbshipit-source-id: f56418d95efa1d7488957b8b8083daec3193949b --- Duckling/Ranking/Classifiers/PT_XX.hs | 742 ++++++++++++++------------ Duckling/Time/PT/Corpus.hs | 13 + Duckling/Time/PT/Rules.hs | 50 +- 3 files changed, 457 insertions(+), 348 deletions(-) diff --git a/Duckling/Ranking/Classifiers/PT_XX.hs b/Duckling/Ranking/Classifiers/PT_XX.hs index 8ee478e5..dd06d1af 100644 --- a/Duckling/Ranking/Classifiers/PT_XX.hs +++ b/Duckling/Ranking/Classifiers/PT_XX.hs @@ -83,9 +83,9 @@ classifiers n = 7}}), ("de ", Classifier{okData = - ClassData{prior = 0.0, unseen = -4.430816798843313, + ClassData{prior = 0.0, unseen = -4.442651256490317, likelihoods = HashMap.fromList [("integer (numeric)", 0.0)], - n = 82}, + n = 83}, koData = ClassData{prior = -infinity, unseen = -0.6931471805599453, likelihoods = HashMap.fromList [], n = 0}}), @@ -103,11 +103,10 @@ classifiers likelihoods = HashMap.fromList [], n = 0}}), ("integer (numeric)", Classifier{okData = - ClassData{prior = -0.23299584775722143, - unseen = -5.318119993844216, - likelihoods = HashMap.fromList [("", 0.0)], n = 202}, + ClassData{prior = -0.2212726433783981, unseen = -5.375278407684165, + likelihoods = HashMap.fromList [("", 0.0)], n = 214}, koData = - ClassData{prior = -1.5709716316063043, unseen = -4.007333185232471, + ClassData{prior = -1.6169567448481277, unseen = -4.007333185232471, likelihoods = HashMap.fromList [("", 0.0)], n = 53}}), ("the day before yesterday", Classifier{okData = @@ -210,119 +209,123 @@ classifiers likelihoods = HashMap.fromList [], n = 0}}), ("Julho", Classifier{okData = - ClassData{prior = 0.0, unseen = -2.890371757896165, - likelihoods = HashMap.fromList [("", 0.0)], n = 16}, + ClassData{prior = 0.0, unseen = -3.1354942159291497, + likelihoods = HashMap.fromList [("", 0.0)], n = 21}, koData = ClassData{prior = -infinity, unseen = -0.6931471805599453, likelihoods = HashMap.fromList [], n = 0}}), ("intersect by `da` or `de`", Classifier{okData = - ClassData{prior = -1.9568392195875037, - unseen = -4.8283137373023015, + ClassData{prior = -1.9564365620423008, unseen = -4.852030263919617, likelihoods = HashMap.fromList - [("daymonth", -3.028522096376982), - (" de two time tokens separated by \",\"2", - -4.127134385045092), - (" de intersect", - -4.127134385045092), - ("dd-dd de (interval)year", -4.127134385045092), - (" de two time tokens separated by \",\"", - -4.127134385045092), - ("dayday", -3.4339872044851463), - ("Quart-feira (que vem)", -4.127134385045092), - ("dayyear", -2.1122313645028266), - ("quarteryear", -4.127134385045092), - ("dd-dd (interval)year", -4.127134385045092), - (" trimestreyear", -4.127134385045092), - ("day of month (1st)Mar\231o", -3.4339872044851463), - ("two time tokens separated by \",\"2year", -3.721669276936927), - ("intersectyear", -4.127134385045092), - (" de year", -2.740840023925201), + [("daymonth", -3.0524276172305362), + ("dd-dd de (interval)year", -4.151039905898646), + ("dayday", -3.457892725338701), + ("Quart-feira (que vem)", -4.151039905898646), + ("dayyear", -2.07159836421881), + ("quarteryear", -4.151039905898646), + ("dd-dd (interval)year", -4.151039905898646), + (" trimestreyear", -4.151039905898646), + (" (ordinal or number) de year", + -2.7647455447787554), + ("day of month (1st)Mar\231o", -3.457892725338701), + ("two time tokens separated by \",\"2year", + -3.7455747977904816), + ("intersectyear", -4.151039905898646), ("dia (non ordinal)Fevereiro", - -3.721669276936927), - ("two time tokens separated by \",\"year", -3.721669276936927), - ("Domingo passado", -4.127134385045092), - ("dayweek", -3.4339872044851463), - ("Quart-feiraproximo ", -4.127134385045092)], - n = 26}, + -3.7455747977904816), + (" (ordinal or number) year", + -4.151039905898646), + ("two time tokens separated by \",\"year", -3.7455747977904816), + ("Domingo passado", -4.151039905898646), + ("dayweek", -3.457892725338701), + (" (ordinal or number) de two time tokens separated by \",\"2", + -4.151039905898646), + (" (ordinal or number) de intersect", + -4.151039905898646), + ("Quart-feiraproximo ", -4.151039905898646), + (" (ordinal or number) de two time tokens separated by \",\"", + -4.151039905898646)], + n = 27}, koData = - ClassData{prior = -0.15234072458201878, - unseen = -5.963579343618447, + ClassData{prior = -0.15240700022243142, + unseen = -5.996452088619021, likelihoods = HashMap.fromList [("time-of-day (latent) - (interval)", - -4.862393050955164), - ("hourday", -3.1277919955670574), - ("year (latent)Abril", -4.862393050955164), - ("year (latent)Julho", -3.396055982161737), - ("time-of-day (latent)Fevereiro", -4.01509519056796), - ("monthday", -4.574710978503383), - ("monthyear", -3.6584202466292277), - ("yearhour", -3.8815637979434374), + -4.895349138638459), + ("hourday", -3.160748083250353), + ("year (latent)Abril", -4.895349138638459), + ("year (latent)Julho", -3.3549040976913105), + ("time-of-day (latent)Fevereiro", -4.048051278251256), + ("monthday", -4.6076670661866785), + ("monthyear", -3.5960661545081987), + ("yearhour", -3.914519885626733), ("entre e (interval)Julho", - -3.6584202466292277), - ("houryear", -3.3219480100080148), - ("year (latent)Maio", -4.351567427189173), + -3.6913763343125234), + ("houryear", -3.3549040976913105), + ("year (latent)Maio", -4.202201958078514), ("time-of-day (latent)two time tokens separated by \",\"2", - -4.862393050955164), - ("Setembroyear", -4.169245870395218), - ("time-of-day (latent)Setembro", -4.169245870395218), - ("time-of-day (latent)intersect", -3.3219480100080148), + -4.895349138638459), + ("Setembroyear", -4.202201958078514), + ("time-of-day (latent)Setembro", -4.202201958078514), + ("time-of-day (latent)intersect", -3.2859112262043593), ("year (latent) ", - -5.267858159063328), - ("\224s Julho", -4.574710978503383), - ("year (latent)intersect", -4.574710978503383), - ("Setembrointersect", -5.267858159063328), + -5.300814246746624), + ("\224s Julho", -4.6076670661866785), + ("year (latent)intersect", -4.6076670661866785), + ("Setembrointersect", -5.300814246746624), ("intersect by `da` or `de`two time tokens separated by \",\"", - -5.267858159063328), + -5.300814246746624), ("year (latent)amanh\227 pela ", - -5.267858159063328), - ("hourmonth", -1.7713505975968478), + -5.300814246746624), + ("hourmonth", -1.7598549227093099), ("time-of-day (latent)intersect by `da` or `de`", - -3.4760986898352733), - ("monthmonth", -5.267858159063328), + -3.509054777518569), + ("monthmonth", -5.300814246746624), ("Setembrotwo time tokens separated by \",\"2", - -5.267858159063328), - ("year (latent)Fevereiro", -4.01509519056796), - ("time-of-day (latent)Dezembro", -5.267858159063328), - ("dayyear", -5.267858159063328), - ("time-of-day (latent)Julho", -3.563110066824903), - ("time-of-day (latent)Abril", -4.862393050955164), - ("passado year", -5.267858159063328), - ("year (latent) da manha", -5.267858159063328), - ("time-of-day (latent)Mar\231o", -4.169245870395218), - ("year (latent)Janeiro", -5.267858159063328), - ("year (latent)Dezembro", -5.267858159063328), + -5.300814246746624), + ("year (latent)Fevereiro", -4.048051278251256), + ("time-of-day (latent)Dezembro", -5.300814246746624), + ("dayyear", -5.300814246746624), + ("time-of-day (latent)Julho", -3.509054777518569), + ("time-of-day (latent)Abril", -4.895349138638459), + ("passado year", -5.300814246746624), + ("year (latent) da manha", -5.300814246746624), + (" (ordinal or number) de year", + -5.300814246746624), + ("time-of-day (latent)Mar\231o", -4.202201958078514), + ("year (latent)Janeiro", -5.300814246746624), + ("year (latent)Dezembro", -5.300814246746624), ("entre e (interval)Janeiro", - -5.267858159063328), - ("yearmonth", -2.1998052239297112), - ("two time tokens separated by \",\"2year", -4.862393050955164), + -5.300814246746624), + ("yearmonth", -2.1872989375362493), + ("two time tokens separated by \",\"2year", -4.895349138638459), ("Setembrotwo time tokens separated by \",\"", - -5.267858159063328), - ("intersect by `da` or `de`intersect", -5.267858159063328), - ("intersectyear", -5.267858159063328), + -5.300814246746624), + ("intersect by `da` or `de`intersect", -5.300814246746624), + ("intersectyear", -5.300814246746624), ("intersect by `da` or `de`two time tokens separated by \",\"2", - -5.267858159063328), - ("time-of-day (latent)Maio", -4.351567427189173), - (" de year", -5.267858159063328), - ("antes das Julho", -4.01509519056796), - ("Maioyear", -4.862393050955164), - ("year (latent)Setembro", -4.169245870395218), - ("\224s Janeiro", -5.267858159063328), - ("two time tokens separated by \",\"year", -4.862393050955164), + -5.300814246746624), + ("time-of-day (latent)Maio", -4.202201958078514), + ("antes das Julho", -4.048051278251256), + ("Maioyear", -4.6076670661866785), + ("year (latent)Setembro", -4.202201958078514), + ("\224s Janeiro", -5.300814246746624), + ("two time tokens separated by \",\"year", -4.895349138638459), ("time-of-day (latent)two time tokens separated by \",\"", - -4.862393050955164), - ("intersect by `da` or `de`year", -3.763780762287054), + -4.895349138638459), + ("intersect by `da` or `de`year", -3.7967368499703498), ("year (latent) - (interval)", - -4.862393050955164), - ("yearday", -5.267858159063328), - ("time-of-day (latent)Janeiro", -5.267858159063328), - ("Julhoyear", -5.267858159063328), + -4.895349138638459), + ("yearday", -5.300814246746624), + ("time-of-day (latent)Janeiro", -5.300814246746624), + ("Julhoyear", -5.300814246746624), ("de - (interval)Julho", - -5.267858159063328), - ("year (latent)Mar\231o", -4.169245870395218)], - n = 158}}), + -5.300814246746624), + ("year (latent)Mar\231o", -4.202201958078514)], + n = 164}}), (" and half", Classifier{okData = ClassData{prior = 0.0, unseen = -3.1354942159291497, @@ -335,6 +338,24 @@ classifiers koData = ClassData{prior = -infinity, unseen = -1.6094379124341003, likelihoods = HashMap.fromList [], n = 0}}), + (" (ordinal or number) ", + Classifier{okData = + ClassData{prior = -0.6190392084062235, unseen = -2.995732273553991, + likelihoods = + HashMap.fromList + [("integer (0..19)Maio", -2.2512917986064953), + ("integer (numeric)Julho", -2.2512917986064953), + ("month", -0.8649974374866046), + ("integer (0..19)Julho", -2.2512917986064953), + ("integer (numeric)Maio", -1.3350010667323402)], + n = 7}, + koData = + ClassData{prior = -0.7731898882334817, unseen = -2.890371757896165, + likelihoods = + HashMap.fromList + [("integer (numeric)Julho", -0.8873031950009028), + ("month", -0.8873031950009028)], + n = 6}}), ("semana (grain)", Classifier{okData = ClassData{prior = 0.0, unseen = -2.995732273553991, @@ -348,18 +369,18 @@ classifiers unseen = -3.6375861597263857, likelihoods = HashMap.fromList - [("yearSexta-feira", -2.917770732084279), + [("Segunda-feira (ordinal or number) de ", + -2.917770732084279), + ("yearSexta-feira", -2.917770732084279), ("dayday", -1.3083328196501787), ("de Sexta-feira", -2.917770732084279), + ("Sexta-feira (ordinal or number) de ", + -2.512305623976115), ("Sexta-feiraintersect", -2.512305623976115), ("intersectSexta-feira", -2.917770732084279), - ("Sexta-feira de ", - -2.512305623976115), ("Sexta-feiraintersect by `da` or `de`", -2.512305623976115), ("intersect by `da` or `de`Sexta-feira", -2.917770732084279), - ("yearday", -2.512305623976115), - ("Segunda-feira de ", - -2.917770732084279)], + ("yearday", -2.512305623976115)], n = 11}, koData = ClassData{prior = -0.4382549309311553, unseen = -4.02535169073515, @@ -384,11 +405,11 @@ classifiers likelihoods = HashMap.fromList [], n = 0}}), ("integer (0..19)", Classifier{okData = - ClassData{prior = -1.9048194970694474e-2, - unseen = -3.9889840465642745, - likelihoods = HashMap.fromList [("", 0.0)], n = 52}, + ClassData{prior = -1.7699577099400975e-2, + unseen = -4.060443010546419, + likelihoods = HashMap.fromList [("", 0.0)], n = 56}, koData = - ClassData{prior = -3.970291913552122, unseen = -1.0986122886681098, + ClassData{prior = -4.04305126783455, unseen = -1.0986122886681098, likelihoods = HashMap.fromList [("", 0.0)], n = 1}}), ("desde dd-dd (interval)", Classifier{okData = @@ -402,40 +423,12 @@ classifiers koData = ClassData{prior = -infinity, unseen = -1.0986122886681098, likelihoods = HashMap.fromList [], n = 0}}), - (" de ", - Classifier{okData = - ClassData{prior = -0.2682639865946794, unseen = -4.189654742026425, - likelihoods = - HashMap.fromList - [("integer (numeric)Fevereiro", -2.228477120840324), - ("integer (numeric)Mar\231o", -3.0757749812275272), - ("integer (0..19)Maio", -3.0757749812275272), - ("integer (numeric)Abril", -3.481240089335692), - ("integer (numeric)Julho", -3.481240089335692), - ("integer (numeric)Dezembro", -3.481240089335692), - ("integer (numeric)Janeiro", -3.481240089335692), - ("month", -0.878550403891308), - ("integer (0..19)Julho", -3.481240089335692), - ("integer (0..19)Abril", -3.481240089335692), - ("integer (numeric)Maio", -3.0757749812275272), - ("integer (0..19)Mar\231o", -2.7880929087757464), - ("integer (numeric)Setembro", -2.382627800667582)], - n = 26}, - koData = - ClassData{prior = -1.4469189829363254, - unseen = -3.4011973816621555, - likelihoods = - HashMap.fromList - [("integer (numeric)Julho", -1.1700712526502546), - ("month", -1.1700712526502546)], - n = 8}}), (" ", Classifier{okData = ClassData{prior = -0.3437715391028245, unseen = -4.61512051684126, likelihoods = HashMap.fromList [(" and quinzeafternoon", -2.995732273553991), - (" de morning", -3.912023005428146), ("dayhour", -3.2188758248682006), ("\224s morning", -3.2188758248682006), ("hourhour", -1.8325814637483102), @@ -444,6 +437,8 @@ classifiers ("time-of-day (latent)morning", -3.2188758248682006), ("time-of-day (latent)evening", -3.506557897319982), ("\224s afternoon", -2.3025850929940455), + (" (ordinal or number) de morning", + -3.912023005428146), ("dia de morning", -3.912023005428146), ("time-of-day (latent)afternoon", -3.506557897319982), @@ -482,8 +477,8 @@ classifiers n = 2}}), ("Maio", Classifier{okData = - ClassData{prior = 0.0, unseen = -1.791759469228055, - likelihoods = HashMap.fromList [("", 0.0)], n = 4}, + ClassData{prior = 0.0, unseen = -2.639057329615259, + likelihoods = HashMap.fromList [("", 0.0)], n = 12}, koData = ClassData{prior = -infinity, unseen = -0.6931471805599453, likelihoods = HashMap.fromList [], n = 0}}), @@ -545,136 +540,147 @@ classifiers n = 7}}), ("intersect", Classifier{okData = - ClassData{prior = -1.281981514494508, unseen = -5.225746673713201, + ClassData{prior = -1.2729656758128873, unseen = -5.288267030694535, likelihoods = HashMap.fromList - [(" de in the ", - -4.121743536410215), - ("dayhour", -2.4477671028385437), - (" de two time tokens separated by \",\"2", - -4.527208644518379), + [("dayhour", -2.5106150064982073), ("nowquinze para as (as relative minutes)", - -4.527208644518379), - ("Quart-feiraamanh\227 pela ", -4.527208644518379), - ("now\224s ", -4.121743536410215), - (" de intersect", - -4.527208644518379), - ("now and 3/4", -4.527208644518379), - (" de two time tokens separated by \",\"", - -4.527208644518379), - ("Segunda-feirain the ", -4.527208644518379), - ("Quart-feira ", -4.527208644518379), - ("yearSexta-feira", -4.527208644518379), - ("dayday", -3.0231312477421053), - ("hourhour", -4.527208644518379), - ("dayyear", -2.512305623976115), - ("de Sexta-feira", -4.527208644518379), - ("quarteryear", -4.527208644518379), - ("minutehour", -3.2744456760230114), + -4.590056548178043), + ("Quart-feiraamanh\227 pela ", -4.590056548178043), + ("now\224s ", -4.1845914400698785), + ("now and 3/4", -4.590056548178043), + ("Segunda-feirain the ", -4.590056548178043), + ("Quart-feira ", -4.590056548178043), + ("yearSexta-feira", -4.590056548178043), + ("dayday", -3.0859791514017694), + ("hourhour", -4.590056548178043), + ("dayyear", -2.392831970841824), + ("de Sexta-feira", -4.590056548178043), + ("quarteryear", -4.590056548178043), + ("minutehour", -3.337293579682675), (" and quinzein the ", - -4.121743536410215), + -4.1845914400698785), + ("Sexta-feira (ordinal or number) de ", + -4.590056548178043), ("intersect by `da` or `de`in the ", - -4.121743536410215), - ("now and ", -4.527208644518379), - ("tomorrow horas", -4.121743536410215), - ("dd-dd de (interval)de ", -4.527208644518379), + -4.1845914400698785), + (" (ordinal or number) de year", + -4.590056548178043), + ("now and ", -4.590056548178043), + ("tomorrow horas", -4.1845914400698785), + ("dd-dd de (interval)de ", -4.590056548178043), ("now para as (as relative minutes)", - -4.527208644518379), - ("\224s in the ", -3.834061463958434), - ("Sexta-feiraintersect", -4.527208644518379), - ("dayminute", -3.2744456760230114), - (" trimestrede ", -4.527208644518379), - ("intersectSexta-feira", -4.527208644518379), - ("Sexta-feira de ", - -4.527208644518379), - ("dd-dd (interval)de ", -4.527208644518379), - ("Sexta-feiraintersect by `da` or `de`", -4.527208644518379), - ("intersect by `da` or `de`Sexta-feira", -4.527208644518379), + -4.590056548178043), + ("\224s in the ", -3.896909367618098), + ("Sexta-feiraintersect", -4.590056548178043), + ("dayminute", -3.337293579682675), + (" trimestrede ", -4.590056548178043), + (" (ordinal or number) de de ", + -3.2037621870581527), + ("intersectSexta-feira", -4.590056548178043), + ("dd-dd (interval)de ", -4.590056548178043), + ("Sexta-feiraintersect by `da` or `de`", -4.590056548178043), + (" (ordinal or number) year", + -4.590056548178043), + ("intersect by `da` or `de`Sexta-feira", -4.590056548178043), + (" (ordinal or number) de in the ", + -4.1845914400698785), ("dia de in the ", - -4.121743536410215), - ("yearday", -4.121743536410215), + -4.1845914400698785), + (" (ordinal or number) de ", + -4.590056548178043), + ("yearday", -4.1845914400698785), + (" (ordinal or number) de two time tokens separated by \",\"2", + -4.590056548178043), + (" (ordinal or number) de intersect", + -4.590056548178043), ("two time tokens separated by \",\"de ", - -4.121743536410215), - ("Quart-feira\224s ", -4.527208644518379), - (" de de ", - -3.1409142833984887), - ("Quart-feira da manha", -4.527208644518379), - ("tomorrowdepois das ", -4.121743536410215), + -4.1845914400698785), + ("Quart-feira\224s ", -4.590056548178043), + ("Quart-feira da manha", -4.590056548178043), + (" (ordinal or number) de two time tokens separated by \",\"", + -4.590056548178043), + ("tomorrowdepois das ", -4.1845914400698785), (" and in the ", - -4.121743536410215), + -4.1845914400698785), ("two time tokens separated by \",\"2de ", - -4.121743536410215), - ("intersectde ", -4.527208644518379)], - n = 53}, + -4.1845914400698785), + ("intersectde ", -4.590056548178043)], + n = 56}, koData = - ClassData{prior = -0.3250197428894252, unseen = -5.87493073085203, + ClassData{prior = -0.3285040669720361, unseen = -5.924255797414532, likelihoods = HashMap.fromList - [("hourday", -3.4742225166770453), - ("dayhour", -3.7926762477955798), - ("daymonth", -5.178970608915471), - ("monthday", -4.080358320247361), - ("monthyear", -3.4742225166770453), - ("now\224s ", -4.773505500807306), + [("hourday", -3.3566290621822787), + ("dayhour", -3.8421368779639797), + ("daymonth", -5.2284312390838705), + ("monthday", -4.1298189504157605), + ("monthyear", -3.282521090028557), + (" (ordinal or number) de \224s ", + -5.2284312390838705), + ("now\224s ", -4.822966130975706), ("entre e (interval)Julho", - -3.56953269648137), - ("houryear", -3.233060459860157), - ("Dezembro\224s ", -5.178970608915471), - ("\224s Julho", -4.773505500807306), - (" am|pm de ", - -4.485823428355525), - ("Fevereiroin the ", -4.773505500807306), - ("de antes das ", -2.4381305849902697), - ("monthhour", -3.9262076404201025), - ("Setembrointersect", -5.178970608915471), + -3.61899332664977), + ("houryear", -3.2135282185416054), + ("Dezembro\224s ", -5.2284312390838705), + ("\224s Julho", -4.822966130975706), + ("Fevereiroin the ", -4.822966130975706), + ("de antes das ", -2.487591215158669), + ("monthhour", -3.9756682705885025), + ("Setembrointersect", -5.2284312390838705), ("intersect by `da` or `de`two time tokens separated by \",\"", - -5.178970608915471), - ("hourmonth", -3.3071684320138792), - ("Julhode ", -5.178970608915471), + -5.2284312390838705), + ("hourmonth", -3.3566290621822787), + ("Julhode ", -5.2284312390838705), (" am|pmintersect by `da` or `de`", - -4.485823428355525), + -4.535284058523925), ("Setembrotwo time tokens separated by \",\"2", - -5.178970608915471), - ("dayyear", -5.178970608915471), + -5.2284312390838705), + ("dayyear", -5.2284312390838705), ("intersect by `da` or `de`\224s ", - -5.178970608915471), - ("passado year", -5.178970608915471), - ("de o de