Refactoring + return domain

Summary:
* Simplified `Url` to only keep track of what we need (we can change back later)
* Normalize domain: remove subdomains like `www`, `www2` and lower case
* Return the full domain in the JSON value field
* Updated offensive url example

Reviewed By: JonCoens

Differential Revision: D4705403

fbshipit-source-id: e5d11ee
This commit is contained in:
Julien Odent 2017-03-14 13:31:44 -07:00 committed by Facebook Github Bot
parent 1b91b70c58
commit cc016bb178
5 changed files with 50 additions and 50 deletions

View File

@ -34,34 +34,40 @@ negativeCorpus = (testContext, examples)
allExamples :: [Example]
allExamples = concat
[ examples (UrlValue "http://www.bla.com")
[ examples (UrlData "http://www.bla.com" "bla.com")
[ "http://www.bla.com"
]
, examples (UrlValue "www.bla.com:8080/path")
, examples (UrlData "www.bla.com:8080/path" "bla.com")
[ "www.bla.com:8080/path"
]
, examples (UrlValue "https://myserver?foo=bar")
, examples (UrlData "https://myserver?foo=bar" "myserver")
[ "https://myserver?foo=bar"
]
, examples (UrlValue "cnn.com/info")
, examples (UrlData "cnn.com/info" "cnn.com")
[ "cnn.com/info"
]
, examples (UrlValue "bla.com/path/path?ext=%23&foo=bla")
, examples (UrlData "bla.com/path/path?ext=%23&foo=bla" "bla.com")
[ "bla.com/path/path?ext=%23&foo=bla"
]
, examples (UrlValue "localhost")
, examples (UrlData "localhost" "localhost")
[ "localhost"
]
, examples (UrlValue "localhost:8000")
, examples (UrlData "localhost:8000" "localhost")
[ "localhost:8000"
]
, examples (UrlValue "http://kimchi")
, examples (UrlData "http://kimchi" "kimchi")
[ "http://kimchi"
]
, examples (UrlValue "https://500px.com:443/about")
, examples (UrlData "https://500px.com:443/about" "500px.com")
[ "https://500px.com:443/about"
]
, examples (UrlValue "www2.foo-bar.net?foo=bar")
, examples (UrlData "www2.foo-bar.net?foo=bar" "foo-bar.net")
[ "www2.foo-bar.net?foo=bar"
]
, examples (UrlData "https://api.wit.ai/message?q=hi" "api.wit.ai")
[ "https://api.wit.ai/message?q=hi"
]
, examples (UrlData "aMaZon.co.uk/?page=home" "amazon.co.uk")
[ "aMaZon.co.uk/?page=home"
]
]

View File

@ -14,25 +14,18 @@ module Duckling.Url.Helpers
import Data.String
import Data.Text (Text)
import qualified Data.Text as Text
import Prelude
import qualified Duckling.Url.Types as TUrl
import Duckling.Url.Types (UrlData(..))
parse :: Text -> Maybe Text
parse "" = Nothing
parse x = Just x
-- -----------------------------------------------------------------
-- Patterns
-- -----------------------------------------------------------------
-- Production
url :: Text -> Text -> Text -> Text -> UrlData
url protocol domain path query = UrlData
{ TUrl.protocol = parse protocol
, TUrl.domain = domain
, TUrl.path = parse path
, TUrl.query = parse query
}
url :: Text -> Text -> UrlData
url value domain = UrlData
{TUrl.value = value, TUrl.domain = Text.toLower domain}

View File

@ -23,32 +23,42 @@ import Duckling.Url.Helpers
ruleURL :: Rule
ruleURL = Rule
{ name = "url"
, pattern = [ regex "(([a-zA-Z]+)://)?(([\\w_-]+\\.)+[a-zA-Z]{2,4}(:\\d+)?)(/[^?\\s#]*)?(\\?[^\\s#]+)?" ]
, pattern =
[ regex "((([a-zA-Z]+)://)?(w{2,3}[0-9]*\\.)?(([\\w_-]+\\.)+[a-z]{2,4})(:(\\d+))?(/[^?\\s#]*)?(\\?[^\\s#]+)?)"
]
, prod = \tokens -> case tokens of
(Token RegexMatch (GroupMatch (_:protocol:domain:_:_:path:query:_)):_) ->
Just . Token Url $ url protocol domain path query
(Token RegexMatch (GroupMatch (m:_:_protocol:_:domain:_:_:_port:_path:_query:_)):
_) -> Just . Token Url $ url m domain
_ -> Nothing
}
ruleLocalhost :: Rule
ruleLocalhost = Rule
{ name = "localhost"
, pattern = [ regex "(([a-zA-Z]+)://)?(localhost(:\\d+)?)(/[^?\\s#]*)?(\\?[^\\s#]+)?" ]
, pattern =
[ regex "((([a-zA-Z]+)://)?localhost(:(\\d+))?(/[^?\\s#]*)?(\\?[^\\s#]+)?)"
]
, prod = \tokens -> case tokens of
(Token RegexMatch (GroupMatch (_:protocol:domain:_:path:query:_)):_) ->
Just . Token Url $ url protocol domain path query
(Token RegexMatch (GroupMatch (m:_:_protocol:_:_port:_path:_query:_)):_) ->
Just . Token Url $ url m "localhost"
_ -> Nothing
}
ruleLocalURL :: Rule
ruleLocalURL = Rule
{ name = "local url"
, pattern = [ regex "([a-zA-Z]+)://([\\w_-]+(:\\d+)?)(/[^?\\s#]*)?(\\?[^\\s#]+)?" ]
, pattern =
[ regex "(([a-zA-Z]+)://([\\w_-]+)(:(\\d+))?(/[^?\\s#]*)?(\\?[^\\s#]+)?)"
]
, prod = \tokens -> case tokens of
(Token RegexMatch (GroupMatch (protocol:domain:_:path:query:_)):_) ->
Just . Token Url $ url protocol domain path query
(Token RegexMatch (GroupMatch (m:_protocol:domain:_:_port:_path:_query:_)):
_) -> Just . Token Url $ url m domain
_ -> Nothing
}
rules :: [Rule]
rules = [ ruleURL, ruleLocalhost, ruleLocalURL ]
rules =
[ ruleURL
, ruleLocalhost
, ruleLocalURL
]

View File

@ -34,6 +34,6 @@ surroundTests :: TestTree
surroundTests = testCase "Surround Tests" $
mapM_ (analyzedFirstTest testContext . withTargets [Some Url]) xs
where
xs = examples (UrlValue "www.fuck-comment-spammers-they-just-wont-quit.com/episode-7")
[ "phishing link: www.fuck-comment-spammers-they-just-wont-quit.com/episode-7 If you want my job"
xs = examples (UrlData "www.lets-try-this-one.co.uk/episode-7" "lets-try-this-one.co.uk")
[ "phishing link: www.lets-try-this-one.co.uk/episode-7 If you want my job"
]

View File

@ -21,32 +21,23 @@ import Data.Hashable
import Data.Maybe
import Data.Monoid
import Data.Text (Text)
import qualified Data.Text as Text
import GHC.Generics
import Prelude
import Duckling.Resolve (Resolve(..))
data UrlData = UrlData
{ protocol :: Maybe Text
{ value :: Text
, domain :: Text
, path :: Maybe Text
, query :: Maybe Text
}
deriving (Eq, Generic, Hashable, Ord, Show, NFData)
instance Resolve UrlData where
type ResolvedValue UrlData = UrlValue
resolve _ UrlData {protocol, domain, path, query} = Just UrlValue
{value = p <> domain <> rest}
where
p = case protocol of
Just p -> p <> "://"
Nothing -> ""
rest = Text.concat $ catMaybes [path, query]
type ResolvedValue UrlData = UrlData
resolve _ x = Just x
data UrlValue = UrlValue { value :: Text }
deriving (Eq, Ord, Show)
instance ToJSON UrlValue where
toJSON (UrlValue value) = object [ "value" .= value ]
instance ToJSON UrlData where
toJSON (UrlData value domain) = object
[ "value" .= value
, "domain" .= domain
]