mirror of
https://github.com/facebook/duckling.git
synced 2024-12-01 08:19:36 +03:00
Refactoring + return domain
Summary: * Simplified `Url` to only keep track of what we need (we can change back later) * Normalize domain: remove subdomains like `www`, `www2` and lower case * Return the full domain in the JSON value field * Updated offensive url example Reviewed By: JonCoens Differential Revision: D4705403 fbshipit-source-id: e5d11ee
This commit is contained in:
parent
1b91b70c58
commit
cc016bb178
@ -34,34 +34,40 @@ negativeCorpus = (testContext, examples)
|
||||
|
||||
allExamples :: [Example]
|
||||
allExamples = concat
|
||||
[ examples (UrlValue "http://www.bla.com")
|
||||
[ examples (UrlData "http://www.bla.com" "bla.com")
|
||||
[ "http://www.bla.com"
|
||||
]
|
||||
, examples (UrlValue "www.bla.com:8080/path")
|
||||
, examples (UrlData "www.bla.com:8080/path" "bla.com")
|
||||
[ "www.bla.com:8080/path"
|
||||
]
|
||||
, examples (UrlValue "https://myserver?foo=bar")
|
||||
, examples (UrlData "https://myserver?foo=bar" "myserver")
|
||||
[ "https://myserver?foo=bar"
|
||||
]
|
||||
, examples (UrlValue "cnn.com/info")
|
||||
, examples (UrlData "cnn.com/info" "cnn.com")
|
||||
[ "cnn.com/info"
|
||||
]
|
||||
, examples (UrlValue "bla.com/path/path?ext=%23&foo=bla")
|
||||
, examples (UrlData "bla.com/path/path?ext=%23&foo=bla" "bla.com")
|
||||
[ "bla.com/path/path?ext=%23&foo=bla"
|
||||
]
|
||||
, examples (UrlValue "localhost")
|
||||
, examples (UrlData "localhost" "localhost")
|
||||
[ "localhost"
|
||||
]
|
||||
, examples (UrlValue "localhost:8000")
|
||||
, examples (UrlData "localhost:8000" "localhost")
|
||||
[ "localhost:8000"
|
||||
]
|
||||
, examples (UrlValue "http://kimchi")
|
||||
, examples (UrlData "http://kimchi" "kimchi")
|
||||
[ "http://kimchi"
|
||||
]
|
||||
, examples (UrlValue "https://500px.com:443/about")
|
||||
, examples (UrlData "https://500px.com:443/about" "500px.com")
|
||||
[ "https://500px.com:443/about"
|
||||
]
|
||||
, examples (UrlValue "www2.foo-bar.net?foo=bar")
|
||||
, examples (UrlData "www2.foo-bar.net?foo=bar" "foo-bar.net")
|
||||
[ "www2.foo-bar.net?foo=bar"
|
||||
]
|
||||
, examples (UrlData "https://api.wit.ai/message?q=hi" "api.wit.ai")
|
||||
[ "https://api.wit.ai/message?q=hi"
|
||||
]
|
||||
, examples (UrlData "aMaZon.co.uk/?page=home" "amazon.co.uk")
|
||||
[ "aMaZon.co.uk/?page=home"
|
||||
]
|
||||
]
|
||||
|
@ -14,25 +14,18 @@ module Duckling.Url.Helpers
|
||||
|
||||
import Data.String
|
||||
import Data.Text (Text)
|
||||
import qualified Data.Text as Text
|
||||
import Prelude
|
||||
|
||||
import qualified Duckling.Url.Types as TUrl
|
||||
import Duckling.Url.Types (UrlData(..))
|
||||
|
||||
parse :: Text -> Maybe Text
|
||||
parse "" = Nothing
|
||||
parse x = Just x
|
||||
|
||||
-- -----------------------------------------------------------------
|
||||
-- Patterns
|
||||
|
||||
-- -----------------------------------------------------------------
|
||||
-- Production
|
||||
|
||||
url :: Text -> Text -> Text -> Text -> UrlData
|
||||
url protocol domain path query = UrlData
|
||||
{ TUrl.protocol = parse protocol
|
||||
, TUrl.domain = domain
|
||||
, TUrl.path = parse path
|
||||
, TUrl.query = parse query
|
||||
}
|
||||
url :: Text -> Text -> UrlData
|
||||
url value domain = UrlData
|
||||
{TUrl.value = value, TUrl.domain = Text.toLower domain}
|
||||
|
@ -23,32 +23,42 @@ import Duckling.Url.Helpers
|
||||
ruleURL :: Rule
|
||||
ruleURL = Rule
|
||||
{ name = "url"
|
||||
, pattern = [ regex "(([a-zA-Z]+)://)?(([\\w_-]+\\.)+[a-zA-Z]{2,4}(:\\d+)?)(/[^?\\s#]*)?(\\?[^\\s#]+)?" ]
|
||||
, pattern =
|
||||
[ regex "((([a-zA-Z]+)://)?(w{2,3}[0-9]*\\.)?(([\\w_-]+\\.)+[a-z]{2,4})(:(\\d+))?(/[^?\\s#]*)?(\\?[^\\s#]+)?)"
|
||||
]
|
||||
, prod = \tokens -> case tokens of
|
||||
(Token RegexMatch (GroupMatch (_:protocol:domain:_:_:path:query:_)):_) ->
|
||||
Just . Token Url $ url protocol domain path query
|
||||
(Token RegexMatch (GroupMatch (m:_:_protocol:_:domain:_:_:_port:_path:_query:_)):
|
||||
_) -> Just . Token Url $ url m domain
|
||||
_ -> Nothing
|
||||
}
|
||||
|
||||
ruleLocalhost :: Rule
|
||||
ruleLocalhost = Rule
|
||||
{ name = "localhost"
|
||||
, pattern = [ regex "(([a-zA-Z]+)://)?(localhost(:\\d+)?)(/[^?\\s#]*)?(\\?[^\\s#]+)?" ]
|
||||
, pattern =
|
||||
[ regex "((([a-zA-Z]+)://)?localhost(:(\\d+))?(/[^?\\s#]*)?(\\?[^\\s#]+)?)"
|
||||
]
|
||||
, prod = \tokens -> case tokens of
|
||||
(Token RegexMatch (GroupMatch (_:protocol:domain:_:path:query:_)):_) ->
|
||||
Just . Token Url $ url protocol domain path query
|
||||
(Token RegexMatch (GroupMatch (m:_:_protocol:_:_port:_path:_query:_)):_) ->
|
||||
Just . Token Url $ url m "localhost"
|
||||
_ -> Nothing
|
||||
}
|
||||
|
||||
ruleLocalURL :: Rule
|
||||
ruleLocalURL = Rule
|
||||
{ name = "local url"
|
||||
, pattern = [ regex "([a-zA-Z]+)://([\\w_-]+(:\\d+)?)(/[^?\\s#]*)?(\\?[^\\s#]+)?" ]
|
||||
, pattern =
|
||||
[ regex "(([a-zA-Z]+)://([\\w_-]+)(:(\\d+))?(/[^?\\s#]*)?(\\?[^\\s#]+)?)"
|
||||
]
|
||||
, prod = \tokens -> case tokens of
|
||||
(Token RegexMatch (GroupMatch (protocol:domain:_:path:query:_)):_) ->
|
||||
Just . Token Url $ url protocol domain path query
|
||||
(Token RegexMatch (GroupMatch (m:_protocol:domain:_:_port:_path:_query:_)):
|
||||
_) -> Just . Token Url $ url m domain
|
||||
_ -> Nothing
|
||||
}
|
||||
|
||||
rules :: [Rule]
|
||||
rules = [ ruleURL, ruleLocalhost, ruleLocalURL ]
|
||||
rules =
|
||||
[ ruleURL
|
||||
, ruleLocalhost
|
||||
, ruleLocalURL
|
||||
]
|
||||
|
@ -34,6 +34,6 @@ surroundTests :: TestTree
|
||||
surroundTests = testCase "Surround Tests" $
|
||||
mapM_ (analyzedFirstTest testContext . withTargets [Some Url]) xs
|
||||
where
|
||||
xs = examples (UrlValue "www.fuck-comment-spammers-they-just-wont-quit.com/episode-7")
|
||||
[ "phishing link: www.fuck-comment-spammers-they-just-wont-quit.com/episode-7 If you want my job"
|
||||
xs = examples (UrlData "www.lets-try-this-one.co.uk/episode-7" "lets-try-this-one.co.uk")
|
||||
[ "phishing link: www.lets-try-this-one.co.uk/episode-7 If you want my job"
|
||||
]
|
||||
|
@ -21,32 +21,23 @@ import Data.Hashable
|
||||
import Data.Maybe
|
||||
import Data.Monoid
|
||||
import Data.Text (Text)
|
||||
import qualified Data.Text as Text
|
||||
import GHC.Generics
|
||||
import Prelude
|
||||
|
||||
import Duckling.Resolve (Resolve(..))
|
||||
|
||||
data UrlData = UrlData
|
||||
{ protocol :: Maybe Text
|
||||
{ value :: Text
|
||||
, domain :: Text
|
||||
, path :: Maybe Text
|
||||
, query :: Maybe Text
|
||||
}
|
||||
deriving (Eq, Generic, Hashable, Ord, Show, NFData)
|
||||
|
||||
instance Resolve UrlData where
|
||||
type ResolvedValue UrlData = UrlValue
|
||||
resolve _ UrlData {protocol, domain, path, query} = Just UrlValue
|
||||
{value = p <> domain <> rest}
|
||||
where
|
||||
p = case protocol of
|
||||
Just p -> p <> "://"
|
||||
Nothing -> ""
|
||||
rest = Text.concat $ catMaybes [path, query]
|
||||
type ResolvedValue UrlData = UrlData
|
||||
resolve _ x = Just x
|
||||
|
||||
data UrlValue = UrlValue { value :: Text }
|
||||
deriving (Eq, Ord, Show)
|
||||
|
||||
instance ToJSON UrlValue where
|
||||
toJSON (UrlValue value) = object [ "value" .= value ]
|
||||
instance ToJSON UrlData where
|
||||
toJSON (UrlData value domain) = object
|
||||
[ "value" .= value
|
||||
, "domain" .= domain
|
||||
]
|
||||
|
Loading…
Reference in New Issue
Block a user