diff --git a/Duckling/Url/Corpus.hs b/Duckling/Url/Corpus.hs index da1e7368..c4c90d77 100644 --- a/Duckling/Url/Corpus.hs +++ b/Duckling/Url/Corpus.hs @@ -34,34 +34,40 @@ negativeCorpus = (testContext, examples) allExamples :: [Example] allExamples = concat - [ examples (UrlValue "http://www.bla.com") + [ examples (UrlData "http://www.bla.com" "bla.com") [ "http://www.bla.com" ] - , examples (UrlValue "www.bla.com:8080/path") + , examples (UrlData "www.bla.com:8080/path" "bla.com") [ "www.bla.com:8080/path" ] - , examples (UrlValue "https://myserver?foo=bar") + , examples (UrlData "https://myserver?foo=bar" "myserver") [ "https://myserver?foo=bar" ] - , examples (UrlValue "cnn.com/info") + , examples (UrlData "cnn.com/info" "cnn.com") [ "cnn.com/info" ] - , examples (UrlValue "bla.com/path/path?ext=%23&foo=bla") + , examples (UrlData "bla.com/path/path?ext=%23&foo=bla" "bla.com") [ "bla.com/path/path?ext=%23&foo=bla" ] - , examples (UrlValue "localhost") + , examples (UrlData "localhost" "localhost") [ "localhost" ] - , examples (UrlValue "localhost:8000") + , examples (UrlData "localhost:8000" "localhost") [ "localhost:8000" ] - , examples (UrlValue "http://kimchi") + , examples (UrlData "http://kimchi" "kimchi") [ "http://kimchi" ] - , examples (UrlValue "https://500px.com:443/about") + , examples (UrlData "https://500px.com:443/about" "500px.com") [ "https://500px.com:443/about" ] - , examples (UrlValue "www2.foo-bar.net?foo=bar") + , examples (UrlData "www2.foo-bar.net?foo=bar" "foo-bar.net") [ "www2.foo-bar.net?foo=bar" ] + , examples (UrlData "https://api.wit.ai/message?q=hi" "api.wit.ai") + [ "https://api.wit.ai/message?q=hi" + ] + , examples (UrlData "aMaZon.co.uk/?page=home" "amazon.co.uk") + [ "aMaZon.co.uk/?page=home" + ] ] diff --git a/Duckling/Url/Helpers.hs b/Duckling/Url/Helpers.hs index 7ab6868a..0b500c4e 100644 --- a/Duckling/Url/Helpers.hs +++ b/Duckling/Url/Helpers.hs @@ -14,25 +14,18 @@ module Duckling.Url.Helpers import Data.String import Data.Text (Text) +import qualified Data.Text as Text import Prelude import qualified Duckling.Url.Types as TUrl import Duckling.Url.Types (UrlData(..)) -parse :: Text -> Maybe Text -parse "" = Nothing -parse x = Just x - -- ----------------------------------------------------------------- -- Patterns -- ----------------------------------------------------------------- -- Production -url :: Text -> Text -> Text -> Text -> UrlData -url protocol domain path query = UrlData - { TUrl.protocol = parse protocol - , TUrl.domain = domain - , TUrl.path = parse path - , TUrl.query = parse query - } +url :: Text -> Text -> UrlData +url value domain = UrlData + {TUrl.value = value, TUrl.domain = Text.toLower domain} diff --git a/Duckling/Url/Rules.hs b/Duckling/Url/Rules.hs index e3f5dc76..8ff810f3 100644 --- a/Duckling/Url/Rules.hs +++ b/Duckling/Url/Rules.hs @@ -23,32 +23,42 @@ import Duckling.Url.Helpers ruleURL :: Rule ruleURL = Rule { name = "url" - , pattern = [ regex "(([a-zA-Z]+)://)?(([\\w_-]+\\.)+[a-zA-Z]{2,4}(:\\d+)?)(/[^?\\s#]*)?(\\?[^\\s#]+)?" ] + , pattern = + [ regex "((([a-zA-Z]+)://)?(w{2,3}[0-9]*\\.)?(([\\w_-]+\\.)+[a-z]{2,4})(:(\\d+))?(/[^?\\s#]*)?(\\?[^\\s#]+)?)" + ] , prod = \tokens -> case tokens of - (Token RegexMatch (GroupMatch (_:protocol:domain:_:_:path:query:_)):_) -> - Just . Token Url $ url protocol domain path query + (Token RegexMatch (GroupMatch (m:_:_protocol:_:domain:_:_:_port:_path:_query:_)): + _) -> Just . Token Url $ url m domain _ -> Nothing } ruleLocalhost :: Rule ruleLocalhost = Rule { name = "localhost" - , pattern = [ regex "(([a-zA-Z]+)://)?(localhost(:\\d+)?)(/[^?\\s#]*)?(\\?[^\\s#]+)?" ] + , pattern = + [ regex "((([a-zA-Z]+)://)?localhost(:(\\d+))?(/[^?\\s#]*)?(\\?[^\\s#]+)?)" + ] , prod = \tokens -> case tokens of - (Token RegexMatch (GroupMatch (_:protocol:domain:_:path:query:_)):_) -> - Just . Token Url $ url protocol domain path query + (Token RegexMatch (GroupMatch (m:_:_protocol:_:_port:_path:_query:_)):_) -> + Just . Token Url $ url m "localhost" _ -> Nothing } ruleLocalURL :: Rule ruleLocalURL = Rule { name = "local url" - , pattern = [ regex "([a-zA-Z]+)://([\\w_-]+(:\\d+)?)(/[^?\\s#]*)?(\\?[^\\s#]+)?" ] + , pattern = + [ regex "(([a-zA-Z]+)://([\\w_-]+)(:(\\d+))?(/[^?\\s#]*)?(\\?[^\\s#]+)?)" + ] , prod = \tokens -> case tokens of - (Token RegexMatch (GroupMatch (protocol:domain:_:path:query:_)):_) -> - Just . Token Url $ url protocol domain path query + (Token RegexMatch (GroupMatch (m:_protocol:domain:_:_port:_path:_query:_)): + _) -> Just . Token Url $ url m domain _ -> Nothing } rules :: [Rule] -rules = [ ruleURL, ruleLocalhost, ruleLocalURL ] +rules = + [ ruleURL + , ruleLocalhost + , ruleLocalURL + ] diff --git a/Duckling/Url/Tests.hs b/Duckling/Url/Tests.hs index 3e0a9d9d..bbd9c4cc 100644 --- a/Duckling/Url/Tests.hs +++ b/Duckling/Url/Tests.hs @@ -34,6 +34,6 @@ surroundTests :: TestTree surroundTests = testCase "Surround Tests" $ mapM_ (analyzedFirstTest testContext . withTargets [Some Url]) xs where - xs = examples (UrlValue "www.fuck-comment-spammers-they-just-wont-quit.com/episode-7") - [ "phishing link: www.fuck-comment-spammers-they-just-wont-quit.com/episode-7 If you want my job" + xs = examples (UrlData "www.lets-try-this-one.co.uk/episode-7" "lets-try-this-one.co.uk") + [ "phishing link: www.lets-try-this-one.co.uk/episode-7 If you want my job" ] diff --git a/Duckling/Url/Types.hs b/Duckling/Url/Types.hs index 110bcb40..dcb05980 100644 --- a/Duckling/Url/Types.hs +++ b/Duckling/Url/Types.hs @@ -21,32 +21,23 @@ import Data.Hashable import Data.Maybe import Data.Monoid import Data.Text (Text) -import qualified Data.Text as Text import GHC.Generics import Prelude import Duckling.Resolve (Resolve(..)) data UrlData = UrlData - { protocol :: Maybe Text + { value :: Text , domain :: Text - , path :: Maybe Text - , query :: Maybe Text } deriving (Eq, Generic, Hashable, Ord, Show, NFData) instance Resolve UrlData where - type ResolvedValue UrlData = UrlValue - resolve _ UrlData {protocol, domain, path, query} = Just UrlValue - {value = p <> domain <> rest} - where - p = case protocol of - Just p -> p <> "://" - Nothing -> "" - rest = Text.concat $ catMaybes [path, query] + type ResolvedValue UrlData = UrlData + resolve _ x = Just x -data UrlValue = UrlValue { value :: Text } - deriving (Eq, Ord, Show) - -instance ToJSON UrlValue where - toJSON (UrlValue value) = object [ "value" .= value ] +instance ToJSON UrlData where + toJSON (UrlData value domain) = object + [ "value" .= value + , "domain" .= domain + ]