Add support for token filters. The supported filters include:
lowercase, uppercase, shingle, snowball, apostrophe, and reverse.
This commit also adds tests for all of these.
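
As a minimal sketch of how the new pieces fit together (the names my_analyzer, my_shingle, and myIndex below are hypothetical, not part of this commit):

    let analysis = Analysis
          (M.singleton "my_analyzer"
            (AnalyzerDefinition
              (Just (Tokenizer "standard"))   -- a built-in tokenizer, referenced by name
              [TokenFilter "my_shingle"]))    -- refers to the filter defined below
          M.empty
          (M.singleton "my_shingle"
            (TokenFilterDefinitionShingle (Shingle 2 2 True False " " "_")))
    in createIndexWith [AnalysisSetting analysis, NumberOfReplicas (ReplicaCount 0)] 1 myIndex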
Andrew Martin 2017-12-19 19:32:54 -05:00
parent 1e14794f5c
commit 7fce32ab3b
2 changed files with 251 additions and 4 deletions


@@ -203,6 +203,7 @@ module Database.V5.Bloodhound.Types
, CutoffFrequency(..)
, Analyzer(..)
, Tokenizer(..)
, TokenFilter(..)
, MaxExpansions(..)
, Lenient(..)
, MatchQueryType(..)
@@ -381,8 +382,11 @@ module Database.V5.Bloodhound.Types
, Analysis(..)
, AnalyzerDefinition(..)
, TokenizerDefinition(..)
, TokenFilterDefinition(..)
, Ngram(..)
, TokenChar(..)
, Shingle(..)
, Language(..)
) where
import Control.Applicative as A
@@ -597,32 +601,244 @@ data UpdatableIndexSetting = NumberOfReplicas ReplicaCount
data Analysis = Analysis
{ analysisAnalyzer :: M.Map Text AnalyzerDefinition
, analysisTokenizer :: M.Map Text TokenizerDefinition
, analysisTokenFilter :: M.Map Text TokenFilterDefinition
} deriving (Eq,Show,Generic,Typeable)
instance ToJSON Analysis where
toJSON (Analysis analyzer tokenizer tokenFilter) = object
[ "analyzer" .= analyzer
, "tokenizer" .= tokenizer
, "filter" .= tokenFilter
]
instance FromJSON Analysis where
parseJSON = withObject "Analysis" $ \m -> Analysis
<$> m .: "analyzer"
<*> m .: "tokenizer"
<*> m .:? "tokenizer" .!= M.empty
<*> m .:? "filter" .!= M.empty
data AnalyzerDefinition = AnalyzerDefinition
{ analyzerDefinitionTokenizer :: Maybe Tokenizer
, analyzerDefinitionFilter :: [TokenFilter]
} deriving (Eq,Show,Generic,Typeable)
instance ToJSON AnalyzerDefinition where
toJSON (AnalyzerDefinition tokenizer tokenFilter) = object $ catMaybes
[ fmap ("tokenizer" .=) tokenizer
, Just $ "filter" .= tokenFilter
]
instance FromJSON AnalyzerDefinition where
parseJSON = withObject "AnalyzerDefinition" $ \m -> AnalyzerDefinition
<$> m .:? "tokenizer"
<*> m .:? "filter" .!= []
-- | Token filters are used to create custom analyzers.
data TokenFilterDefinition
= TokenFilterDefinitionLowercase Language
| TokenFilterDefinitionUppercase Language
| TokenFilterDefinitionApostrophe
| TokenFilterDefinitionReverse
| TokenFilterDefinitionSnowball Language
| TokenFilterDefinitionShingle Shingle
deriving (Eq,Show,Generic)
instance ToJSON TokenFilterDefinition where
toJSON x = case x of
TokenFilterDefinitionLowercase lang -> object
[ "type" .= ("lowercase" :: Text)
, "language" .= languageToText lang
]
TokenFilterDefinitionUppercase lang -> object
[ "type" .= ("uppercase" :: Text)
, "language" .= languageToText lang
]
TokenFilterDefinitionApostrophe -> object
[ "type" .= ("apostrophe" :: Text)
]
TokenFilterDefinitionReverse -> object
[ "type" .= ("reverse" :: Text)
]
TokenFilterDefinitionSnowball lang -> object
[ "type" .= ("snowball" :: Text)
, "language" .= languageToText lang
]
TokenFilterDefinitionShingle s -> object
[ "type" .= ("shingle" :: Text)
, "max_shingle_size" .= shingleMaxSize s
, "min_shingle_size" .= shingleMinSize s
, "output_unigrams" .= shingleOutputUnigrams s
, "output_unigrams_if_no_shingles" .= shingleOutputUnigramsIfNoShingles s
, "token_separator" .= shingleTokenSeparator s
, "filler_token" .= shingleFillerToken s
]
instance FromJSON TokenFilterDefinition where
parseJSON = withObject "TokenFilterDefinition" $ \m -> do
t <- m .: "type"
case (t :: Text) of
"reverse" -> return TokenFilterDefinitionReverse
"apostrophe" -> return TokenFilterDefinitionApostrophe
"lowercase" -> TokenFilterDefinitionLowercase
<$> m .:? "language" .!= English
"uppercase" -> TokenFilterDefinitionUppercase
<$> m .:? "language" .!= English
"snowball" -> TokenFilterDefinitionSnowball
<$> m .:? "language" .!= English
"shingle" -> fmap TokenFilterDefinitionShingle $ Shingle
<$> (fmap.fmap) unStringlyTypedInt (m .:? "max_shingle_size") .!= 2
<*> (fmap.fmap) unStringlyTypedInt (m .:? "min_shingle_size") .!= 2
<*> (fmap.fmap) unStringlyTypedBool (m .:? "output_unigrams") .!= True
<*> (fmap.fmap) unStringlyTypedBool (m .:? "output_unigrams_if_no_shingles") .!= False
<*> m .:? "token_separator" .!= " "
<*> m .:? "filler_token" .!= "_"
_ -> fail ("unrecognized token filter type: " ++ T.unpack t)
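-- As an illustrative example (aeson does not guarantee key order), a filter
-- defined as @TokenFilterDefinitionShingle (Shingle 2 2 True False " " "_")@
-- serializes to:
--
-- > {"type":"shingle","max_shingle_size":2,"min_shingle_size":2,
-- >  "output_unigrams":true,"output_unigrams_if_no_shingles":false,
-- >  "token_separator":" ","filler_token":"_"}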
-- | The set of languages that can be passed to various analyzers,
-- filters, etc. in ElasticSearch. Most data types in this module
-- that have a 'Language' field actually only handle a subset of
-- these languages. Consult the official ElasticSearch documentation
-- to see what is actually supported.
data Language
= Arabic
| Armenian
| Basque
| Bengali
| Brazilian
| Bulgarian
| Catalan
| Cjk
| Czech
| Danish
| Dutch
| English
| Finnish
| French
| Galician
| German
| German2
| Greek
| Hindi
| Hungarian
| Indonesian
| Irish
| Italian
| Kp
| Latvian
| Lithuanian
| Lovins
| Norwegian
| Persian
| Porter
| Portuguese
| Romanian
| Russian
| Sorani
| Spanish
| Swedish
| Thai
| Turkish
deriving (Show,Eq,Generic)
instance ToJSON Language where
toJSON = Data.Aeson.String . languageToText
instance FromJSON Language where
parseJSON = withText "Language" $ \t -> case languageFromText t of
Nothing -> fail "not a supported ElasticSearch language"
Just lang -> return lang
languageToText :: Language -> Text
languageToText x = case x of
Arabic -> "arabic"
Armenian -> "armenian"
Basque -> "basque"
Bengali -> "bengali"
Brazilian -> "brazilian"
Bulgarian -> "bulgarian"
Catalan -> "catalan"
Cjk -> "cjk"
Czech -> "czech"
Danish -> "danish"
Dutch -> "dutch"
English -> "english"
Finnish -> "finnish"
French -> "french"
Galician -> "galician"
German -> "german"
German2 -> "german2"
Greek -> "greek"
Hindi -> "hindi"
Hungarian -> "hungarian"
Indonesian -> "indonesian"
Irish -> "irish"
Italian -> "italian"
Kp -> "kp"
Latvian -> "latvian"
Lithuanian -> "lithuanian"
Lovins -> "lovins"
Norwegian -> "norwegian"
Persian -> "persian"
Porter -> "porter"
Portuguese -> "portuguese"
Romanian -> "romanian"
Russian -> "russian"
Sorani -> "sorani"
Spanish -> "spanish"
Swedish -> "swedish"
Thai -> "thai"
Turkish -> "turkish"
languageFromText :: Text -> Maybe Language
languageFromText x = case x of
"arabic" -> Just Arabic
"armenian" -> Just Armenian
"basque" -> Just Basque
"bengali" -> Just Bengali
"brazilian" -> Just Brazilian
"bulgarian" -> Just Bulgarian
"catalan" -> Just Catalan
"cjk" -> Just Cjk
"czech" -> Just Czech
"danish" -> Just Danish
"dutch" -> Just Dutch
"english" -> Just English
"finnish" -> Just Finnish
"french" -> Just French
"galician" -> Just Galician
"german" -> Just German
"german2" -> Just German2
"greek" -> Just Greek
"hindi" -> Just Hindi
"hungarian" -> Just Hungarian
"indonesian" -> Just Indonesian
"irish" -> Just Irish
"italian" -> Just Italian
"kp" -> Just Kp
"latvian" -> Just Latvian
"lithuanian" -> Just Lithuanian
"lovins" -> Just Lovins
"norwegian" -> Just Norwegian
"persian" -> Just Persian
"porter" -> Just Porter
"portuguese" -> Just Portuguese
"romanian" -> Just Romanian
"russian" -> Just Russian
"sorani" -> Just Sorani
"spanish" -> Just Spanish
"swedish" -> Just Swedish
"thai" -> Just Thai
"turkish" -> Just Turkish
_ -> Nothing
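-- Note that 'languageFromText' is a partial inverse of 'languageToText':
-- for every @lang :: Language@, @languageFromText (languageToText lang) == Just lang@.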
data Shingle = Shingle
{ shingleMaxSize :: Int
, shingleMinSize :: Int
, shingleOutputUnigrams :: Bool
, shingleOutputUnigramsIfNoShingles :: Bool
, shingleTokenSeparator :: Text
, shingleFillerToken :: Text
} deriving (Eq,Show,Generic,Typeable)
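-- As an illustration of the behavior these settings control in ElasticSearch:
-- with 'shingleMinSize' = 2, 'shingleMaxSize' = 2, and 'shingleOutputUnigrams' = True,
-- the token stream ["foo","bar","baz"] is expected to come out as
-- ["foo","foo bar","bar","bar baz","baz"].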
data TokenizerDefinition
= TokenizerDefinitionNgram Ngram
@@ -1110,6 +1326,8 @@ newtype Analyzer =
Analyzer Text deriving (Eq, Read, Show, Generic, ToJSON, FromJSON, Typeable)
newtype Tokenizer =
Tokenizer Text deriving (Eq, Read, Show, Generic, ToJSON, FromJSON, Typeable)
newtype TokenFilter =
TokenFilter Text deriving (Eq, Read, Show, Generic, ToJSON, FromJSON, Typeable)
newtype MaxExpansions =
MaxExpansions Int deriving (Eq, Read, Show, Generic, ToJSON, FromJSON, Typeable)
@@ -4751,6 +4969,11 @@ newtype StringlyTypedInt = StringlyTypedInt { unStringlyTypedInt :: Int }
instance FromJSON StringlyTypedInt where
parseJSON = fmap StringlyTypedInt . parseJSON . unStringlyTypeJSON
newtype StringlyTypedBool = StringlyTypedBool { unStringlyTypedBool :: Bool }
instance FromJSON StringlyTypedBool where
parseJSON = fmap StringlyTypedBool . parseJSON . unStringlyTypeJSON
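-- ElasticSearch sometimes reports numeric and boolean settings back as JSON
-- strings (e.g. "2" or "true"), so these wrappers accept either representation;
-- the 'TokenFilterDefinition' parser above uses them for the shingle settings.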
instance FromJSON NodeFSTotalStats where
parseJSON = withObject "NodeFSTotalStats" parse
where


@@ -911,8 +911,12 @@ instance Arbitrary TokenChar where arbitrary = sopArbitrary; shrink = genericShrink
instance Arbitrary Ngram where arbitrary = sopArbitrary; shrink = genericShrink
instance Arbitrary TokenizerDefinition where arbitrary = sopArbitrary; shrink = genericShrink
instance Arbitrary AnalyzerDefinition where arbitrary = sopArbitrary; shrink = genericShrink
instance Arbitrary TokenFilterDefinition where arbitrary = sopArbitrary; shrink = genericShrink
instance Arbitrary Shingle where arbitrary = sopArbitrary; shrink = genericShrink
instance Arbitrary Language where arbitrary = sopArbitrary; shrink = genericShrink
instance Arbitrary Analysis where arbitrary = sopArbitrary; shrink = genericShrink
instance Arbitrary Tokenizer where arbitrary = sopArbitrary; shrink = genericShrink
instance Arbitrary TokenFilter where arbitrary = sopArbitrary; shrink = genericShrink
instance Arbitrary UpdatableIndexSetting where arbitrary = sopArbitrary; shrink = genericShrink
instance Arbitrary Bytes where arbitrary = sopArbitrary; shrink = genericShrink
instance Arbitrary AllocationPolicy where arbitrary = sopArbitrary; shrink = genericShrink
@@ -943,6 +947,7 @@ instance Arbitrary UpdatableIndexSetting' where
where
dropDuplicateAttrNames = NE.fromList . L.nubBy sameAttrName . NE.toList
sameAttrName a b = nodeAttrFilterName a == nodeAttrFilterName b
shrink (UpdatableIndexSetting' x) = map UpdatableIndexSetting' (genericShrink x)
main :: IO ()
main = hspec $ do
@@ -1631,12 +1636,31 @@ main = hspec $ do
it "accepts customer analyzers" $ when' (atleast es50) $ withTestEnv $ do
_ <- deleteExampleIndex
let analysis = Analysis
(M.singleton "ex_analyzer" (AnalyzerDefinition (Just (Tokenizer "ex_tokenizer"))))
(M.singleton "ex_analyzer"
( AnalyzerDefinition
(Just (Tokenizer "ex_tokenizer"))
(map TokenFilter
[ "ex_filter_lowercase","ex_filter_uppercase","ex_filter_apostrophe"
, "ex_filter_reverse","ex_filter_snowball"
, "ex_filter_shingle"
]
)
)
)
(M.singleton "ex_tokenizer"
( TokenizerDefinitionNgram
( Ngram 3 4 [TokenLetter,TokenDigit])
)
)
(M.fromList
[ ("ex_filter_lowercase",TokenFilterDefinitionLowercase Greek)
, ("ex_filter_uppercase",TokenFilterDefinitionUppercase English)
, ("ex_filter_apostrophe",TokenFilterDefinitionApostrophe)
, ("ex_filter_reverse",TokenFilterDefinitionReverse)
, ("ex_filter_snowball",TokenFilterDefinitionSnowball English)
, ("ex_filter_shingle",TokenFilterDefinitionShingle (Shingle 3 3 True False " " "_"))
]
)
updates = [AnalysisSetting analysis]
createResp <- createIndexWith (updates ++ [NumberOfReplicas (ReplicaCount 0)]) 1 testIndex
liftIO $ validateStatus createResp 200