Mirror of https://github.com/typeable/bloodhound.git
Merge pull request #219 from andrewthad/basic_token_filter

Add support for token filters

Commit b161cadd2e
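For orientation before the diff: a minimal sketch of how the new token filter types fit together when creating an index with custom analysis settings. It follows the shape of the test added at the end of this commit; the import, index name, map keys, and the MonadBH/Reply signature are illustrative assumptions, not part of the change.

{-# LANGUAGE OverloadedStrings #-}
import qualified Data.Map.Strict as M
import Database.V5.Bloodhound

-- Sketch only: the analyzer refers, by name, to a tokenizer and to token
-- filters defined alongside it in the same Analysis value.
createMyIndex :: MonadBH m => m Reply
createMyIndex =
  createIndexWith [AnalysisSetting analysis, NumberOfReplicas (ReplicaCount 0)] 1 (IndexName "my-index")
  where
    analysis = Analysis
      (M.singleton "my_analyzer"
        (AnalyzerDefinition
          (Just (Tokenizer "my_tokenizer"))
          (map TokenFilter ["my_lowercase", "my_snowball", "my_shingle"])))
      (M.singleton "my_tokenizer"
        (TokenizerDefinitionNgram (Ngram 3 4 [TokenLetter, TokenDigit])))
      (M.fromList
        [ ("my_lowercase", TokenFilterDefinitionLowercase (Just Greek))
        , ("my_snowball", TokenFilterDefinitionSnowball English)
        , ("my_shingle", TokenFilterDefinitionShingle (Shingle 2 2 True False " " "_"))
        ])

The filter names listed in the AnalyzerDefinition must match the keys of the filter map so Elasticsearch can resolve them.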
@@ -204,6 +204,7 @@ module Database.V5.Bloodhound.Types
    , CutoffFrequency(..)
    , Analyzer(..)
    , Tokenizer(..)
    , TokenFilter(..)
    , MaxExpansions(..)
    , Lenient(..)
    , MatchQueryType(..)
@@ -385,8 +386,11 @@ module Database.V5.Bloodhound.Types
    , Analysis(..)
    , AnalyzerDefinition(..)
    , TokenizerDefinition(..)
    , TokenFilterDefinition(..)
    , Ngram(..)
    , TokenChar(..)
    , Shingle(..)
    , Language(..)
    ) where

import Control.Applicative as A
@@ -602,32 +606,244 @@ data UpdatableIndexSetting = NumberOfReplicas ReplicaCount
data Analysis = Analysis
  { analysisAnalyzer :: M.Map Text AnalyzerDefinition
  , analysisTokenizer :: M.Map Text TokenizerDefinition
  , analysisTokenFilter :: M.Map Text TokenFilterDefinition
  } deriving (Eq,Show,Generic,Typeable)

instance ToJSON Analysis where
  toJSON (Analysis analyzer tokenizer) = object
  toJSON (Analysis analyzer tokenizer tokenFilter) = object
    [ "analyzer" .= analyzer
    , "tokenizer" .= tokenizer
    , "filter" .= tokenFilter
    ]

instance FromJSON Analysis where
  parseJSON = withObject "Analysis" $ \m -> Analysis
    <$> m .: "analyzer"
    <*> m .: "tokenizer"
    <*> m .:? "tokenizer" .!= M.empty
    <*> m .:? "filter" .!= M.empty
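-- Editor's note (illustrative, not part of the diff): with the instances
-- above, an Analysis value renders to the JSON shape Elasticsearch expects
-- for the "analysis" section of the index settings, roughly
--
--   { "analyzer":  { "<name>": { "tokenizer": "...", "filter": ["..."] } }
--   , "tokenizer": { "<name>": { ... } }
--   , "filter":    { "<name>": { "type": "...", ... } }
--   }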

data AnalyzerDefinition = AnalyzerDefinition
  { analyzerDefinitionTokenizer :: Maybe Tokenizer
  , analyzerDefinitionFilter :: [TokenFilter]
  } deriving (Eq,Show,Generic,Typeable)

instance ToJSON AnalyzerDefinition where
  toJSON (AnalyzerDefinition tokenizer) = object $ catMaybes
  toJSON (AnalyzerDefinition tokenizer tokenFilter) = object $ catMaybes
    [ fmap ("tokenizer" .=) tokenizer
    , Just $ "filter" .= tokenFilter
    ]

instance FromJSON AnalyzerDefinition where
  parseJSON = withObject "AnalyzerDefinition" $ \m -> AnalyzerDefinition
    <$> m .:? "tokenizer"
    <*> m .:? "filter" .!= []

-- | Token filters are used to create custom analyzers.
data TokenFilterDefinition
  = TokenFilterDefinitionLowercase (Maybe Language)
  | TokenFilterDefinitionUppercase (Maybe Language)
  | TokenFilterDefinitionApostrophe
  | TokenFilterDefinitionReverse
  | TokenFilterDefinitionSnowball Language
  | TokenFilterDefinitionShingle Shingle
  deriving (Eq,Show,Generic)

instance ToJSON TokenFilterDefinition where
  toJSON x = case x of
    TokenFilterDefinitionLowercase mlang -> object $ catMaybes
      [ Just $ "type" .= ("lowercase" :: Text)
      , fmap (\lang -> "language" .= languageToText lang) mlang
      ]
    TokenFilterDefinitionUppercase mlang -> object $ catMaybes
      [ Just $ "type" .= ("uppercase" :: Text)
      , fmap (\lang -> "language" .= languageToText lang) mlang
      ]
    TokenFilterDefinitionApostrophe -> object
      [ "type" .= ("apostrophe" :: Text)
      ]
    TokenFilterDefinitionReverse -> object
      [ "type" .= ("reverse" :: Text)
      ]
    TokenFilterDefinitionSnowball lang -> object
      [ "type" .= ("snowball" :: Text)
      , "language" .= languageToText lang
      ]
    TokenFilterDefinitionShingle s -> object
      [ "type" .= ("shingle" :: Text)
      , "max_shingle_size" .= shingleMaxSize s
      , "min_shingle_size" .= shingleMinSize s
      , "output_unigrams" .= shingleOutputUnigrams s
      , "output_unigrams_if_no_shingles" .= shingleOutputUnigramsIfNoShingles s
      , "token_separator" .= shingleTokenSeparator s
      , "filler_token" .= shingleFillerToken s
      ]

instance FromJSON TokenFilterDefinition where
  parseJSON = withObject "TokenFilterDefinition" $ \m -> do
    t <- m .: "type"
    case (t :: Text) of
      "reverse" -> return TokenFilterDefinitionReverse
      "apostrophe" -> return TokenFilterDefinitionApostrophe
      "lowercase" -> TokenFilterDefinitionLowercase
        <$> m .:? "language"
      "uppercase" -> TokenFilterDefinitionUppercase
        <$> m .:? "language"
      "snowball" -> TokenFilterDefinitionSnowball
        <$> m .: "language"
      "shingle" -> fmap TokenFilterDefinitionShingle $ Shingle
        <$> (fmap.fmap) unStringlyTypedInt (m .:? "max_shingle_size") .!= 2
        <*> (fmap.fmap) unStringlyTypedInt (m .:? "min_shingle_size") .!= 2
        <*> (fmap.fmap) unStringlyTypedBool (m .:? "output_unigrams") .!= True
        <*> (fmap.fmap) unStringlyTypedBool (m .:? "output_unigrams_if_no_shingles") .!= False
        <*> m .:? "token_separator" .!= " "
        <*> m .:? "filler_token" .!= "_"
      _ -> fail ("unrecognized token filter type: " ++ T.unpack t)
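-- Editor's note (illustrative, not part of the diff): a filter definition
-- round-trips through its JSON form under the two instances above, e.g.
--
--   decode "{\"type\":\"snowball\",\"language\":\"english\"}"
--     == Just (TokenFilterDefinitionSnowball English)
--
-- and a "shingle" object may omit any of the optional keys, which then fall
-- back to the (.!=) defaults.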

-- | The set of languages that can be passed to various analyzers,
-- filters, etc. in ElasticSearch. Most data types in this module
-- that have a 'Language' field are actually only able to handle a
-- subset of these languages. Consult the official ElasticSearch
-- documentation to see what is actually supported.
data Language
  = Arabic
  | Armenian
  | Basque
  | Bengali
  | Brazilian
  | Bulgarian
  | Catalan
  | Cjk
  | Czech
  | Danish
  | Dutch
  | English
  | Finnish
  | French
  | Galician
  | German
  | German2
  | Greek
  | Hindi
  | Hungarian
  | Indonesian
  | Irish
  | Italian
  | Kp
  | Latvian
  | Lithuanian
  | Lovins
  | Norwegian
  | Persian
  | Porter
  | Portuguese
  | Romanian
  | Russian
  | Sorani
  | Spanish
  | Swedish
  | Thai
  | Turkish
  deriving (Show,Eq,Generic)

instance ToJSON Language where
  toJSON = Data.Aeson.String . languageToText

instance FromJSON Language where
  parseJSON = withText "Language" $ \t -> case languageFromText t of
    Nothing -> fail "not a supported ElasticSearch language"
    Just lang -> return lang

languageToText :: Language -> Text
languageToText x = case x of
  Arabic -> "arabic"
  Armenian -> "armenian"
  Basque -> "basque"
  Bengali -> "bengali"
  Brazilian -> "brazilian"
  Bulgarian -> "bulgarian"
  Catalan -> "catalan"
  Cjk -> "cjk"
  Czech -> "czech"
  Danish -> "danish"
  Dutch -> "dutch"
  English -> "english"
  Finnish -> "finnish"
  French -> "french"
  Galician -> "galician"
  German -> "german"
  German2 -> "german2"
  Greek -> "greek"
  Hindi -> "hindi"
  Hungarian -> "hungarian"
  Indonesian -> "indonesian"
  Irish -> "irish"
  Italian -> "italian"
  Kp -> "kp"
  Latvian -> "latvian"
  Lithuanian -> "lithuanian"
  Lovins -> "lovins"
  Norwegian -> "norwegian"
  Persian -> "persian"
  Porter -> "porter"
  Portuguese -> "portuguese"
  Romanian -> "romanian"
  Russian -> "russian"
  Sorani -> "sorani"
  Spanish -> "spanish"
  Swedish -> "swedish"
  Thai -> "thai"
  Turkish -> "turkish"

languageFromText :: Text -> Maybe Language
languageFromText x = case x of
  "arabic" -> Just Arabic
  "armenian" -> Just Armenian
  "basque" -> Just Basque
  "bengali" -> Just Bengali
  "brazilian" -> Just Brazilian
  "bulgarian" -> Just Bulgarian
  "catalan" -> Just Catalan
  "cjk" -> Just Cjk
  "czech" -> Just Czech
  "danish" -> Just Danish
  "dutch" -> Just Dutch
  "english" -> Just English
  "finnish" -> Just Finnish
  "french" -> Just French
  "galician" -> Just Galician
  "german" -> Just German
  "german2" -> Just German2
  "greek" -> Just Greek
  "hindi" -> Just Hindi
  "hungarian" -> Just Hungarian
  "indonesian" -> Just Indonesian
  "irish" -> Just Irish
  "italian" -> Just Italian
  "kp" -> Just Kp
  "latvian" -> Just Latvian
  "lithuanian" -> Just Lithuanian
  "lovins" -> Just Lovins
  "norwegian" -> Just Norwegian
  "persian" -> Just Persian
  "porter" -> Just Porter
  "portuguese" -> Just Portuguese
  "romanian" -> Just Romanian
  "russian" -> Just Russian
  "sorani" -> Just Sorani
  "spanish" -> Just Spanish
  "swedish" -> Just Swedish
  "thai" -> Just Thai
  "turkish" -> Just Turkish
  _ -> Nothing
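-- Editor's note (illustrative, not part of the diff): languageToText and
-- languageFromText are intended to be mutual inverses, i.e. for every
-- Language constructor
--
--   languageFromText (languageToText lang) == Just lang
--
-- which is what makes the ToJSON/FromJSON Language instances above round-trip.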

data Shingle = Shingle
  { shingleMaxSize :: Int
  , shingleMinSize :: Int
  , shingleOutputUnigrams :: Bool
  , shingleOutputUnigramsIfNoShingles :: Bool
  , shingleTokenSeparator :: Text
  , shingleFillerToken :: Text
  } deriving (Eq,Show,Generic,Typeable)
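-- Editor's note (illustrative, not part of the diff): the (.!=) fallbacks in
-- the FromJSON TokenFilterDefinition instance above amount to this default
-- value:
--
--   Shingle { shingleMaxSize = 2, shingleMinSize = 2
--           , shingleOutputUnigrams = True, shingleOutputUnigramsIfNoShingles = False
--           , shingleTokenSeparator = " ", shingleFillerToken = "_" }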

data TokenizerDefinition
  = TokenizerDefinitionNgram Ngram
@@ -1135,6 +1351,8 @@ newtype Analyzer =
  Analyzer Text deriving (Eq, Read, Show, Generic, ToJSON, FromJSON, Typeable)
newtype Tokenizer =
  Tokenizer Text deriving (Eq, Read, Show, Generic, ToJSON, FromJSON, Typeable)
newtype TokenFilter =
  TokenFilter Text deriving (Eq, Read, Show, Generic, ToJSON, FromJSON, Typeable)
newtype MaxExpansions =
  MaxExpansions Int deriving (Eq, Read, Show, Generic, ToJSON, FromJSON, Typeable)

@@ -4779,6 +4997,11 @@ newtype StringlyTypedInt = StringlyTypedInt { unStringlyTypedInt :: Int }
instance FromJSON StringlyTypedInt where
  parseJSON = fmap StringlyTypedInt . parseJSON . unStringlyTypeJSON

newtype StringlyTypedBool = StringlyTypedBool { unStringlyTypedBool :: Bool }

instance FromJSON StringlyTypedBool where
  parseJSON = fmap StringlyTypedBool . parseJSON . unStringlyTypeJSON

instance FromJSON NodeFSTotalStats where
  parseJSON = withObject "NodeFSTotalStats" parse
    where

@@ -925,8 +925,12 @@ instance Arbitrary TokenChar where arbitrary = sopArbitrary; shrink = genericShrink
instance Arbitrary Ngram where arbitrary = sopArbitrary; shrink = genericShrink
instance Arbitrary TokenizerDefinition where arbitrary = sopArbitrary; shrink = genericShrink
instance Arbitrary AnalyzerDefinition where arbitrary = sopArbitrary; shrink = genericShrink
instance Arbitrary TokenFilterDefinition where arbitrary = sopArbitrary; shrink = genericShrink
instance Arbitrary Shingle where arbitrary = sopArbitrary; shrink = genericShrink
instance Arbitrary Language where arbitrary = sopArbitrary; shrink = genericShrink
instance Arbitrary Analysis where arbitrary = sopArbitrary; shrink = genericShrink
instance Arbitrary Tokenizer where arbitrary = sopArbitrary; shrink = genericShrink
instance Arbitrary TokenFilter where arbitrary = sopArbitrary; shrink = genericShrink
instance Arbitrary UpdatableIndexSetting where arbitrary = sopArbitrary; shrink = genericShrink
instance Arbitrary Compression where arbitrary = sopArbitrary; shrink = genericShrink
instance Arbitrary Bytes where arbitrary = sopArbitrary; shrink = genericShrink
@@ -960,6 +964,7 @@ instance Arbitrary UpdatableIndexSetting' where
    where
      dropDuplicateAttrNames = NE.fromList . L.nubBy sameAttrName . NE.toList
      sameAttrName a b = nodeAttrFilterName a == nodeAttrFilterName b
  shrink (UpdatableIndexSetting' x) = map UpdatableIndexSetting' (genericShrink x)

main :: IO ()
main = hspec $ do
@@ -1648,12 +1653,31 @@ main = hspec $ do
    it "accepts custom analyzers" $ when' (atleast es50) $ withTestEnv $ do
      _ <- deleteExampleIndex
      let analysis = Analysis
            (M.singleton "ex_analyzer" (AnalyzerDefinition (Just (Tokenizer "ex_tokenizer"))))
            (M.singleton "ex_analyzer"
              ( AnalyzerDefinition
                (Just (Tokenizer "ex_tokenizer"))
                (map TokenFilter
                  [ "ex_filter_lowercase","ex_filter_uppercase","ex_filter_apostrophe"
                  , "ex_filter_reverse","ex_filter_snowball"
                  , "ex_filter_shingle"
                  ]
                )
              )
            )
            (M.singleton "ex_tokenizer"
              ( TokenizerDefinitionNgram
                ( Ngram 3 4 [TokenLetter,TokenDigit])
              )
            )
            (M.fromList
              [ ("ex_filter_lowercase",TokenFilterDefinitionLowercase (Just Greek))
              , ("ex_filter_uppercase",TokenFilterDefinitionUppercase Nothing)
              , ("ex_filter_apostrophe",TokenFilterDefinitionApostrophe)
              , ("ex_filter_reverse",TokenFilterDefinitionReverse)
              , ("ex_filter_snowball",TokenFilterDefinitionSnowball English)
              , ("ex_filter_shingle",TokenFilterDefinitionShingle (Shingle 3 3 True False " " "_"))
              ]
            )
          updates = [AnalysisSetting analysis]
      createResp <- createIndexWith (updates ++ [NumberOfReplicas (ReplicaCount 0)]) 1 testIndex
      liftIO $ validateStatus createResp 200