Merged token filters work

commit c64536c39c
@@ -21,7 +21,8 @@ env:
- GHCVER=8.2 ESVER=1.7.6 STACK_YAML=stack.yaml ESFLAG=ES1 DLINK=https://download.elastic.co/elasticsearch/elasticsearch/elasticsearch
- GHCVER=8.2 ESVER=5.0.2 STACK_YAML=stack.yaml ESFLAG=ES5 DLINK=https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch ES_JAVA_OPTS="-Xms500m -Xmx750m"
- GHCVER=8.2 ESVER=5.5.0 STACK_YAML=stack.yaml ESFLAG=ES5 DLINK=https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch ES_JAVA_OPTS="-Xms500m -Xmx750m"
- GHCVER=8.2 ESVER=6.1.3 STACK_YAML=stack.yaml ESFLAG=ES5 DLINK=https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch ES_JAVA_OPTS="-Xms500m -Xmx750m"
allow_failures:
- GHCVER=8.2 ESVER=6.1.3 STACK_YAML=stack.yaml ESFLAG=ES5 DLINK=https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch ES_JAVA_OPTS="-Xms500m -Xmx750m"

install:
# stack
@@ -54,6 +54,7 @@ module Database.V5.Bloodhound.Client
     -- ** Searching
     , searchAll
     , searchByIndex
+    , searchByIndices
     , searchByType
     , scanSearch
     , getInitialScroll
@@ -995,6 +996,15 @@ searchByIndex :: MonadBH m => IndexName -> Search -> m Reply
 searchByIndex (IndexName indexName) = bindM2 dispatchSearch url . return
   where url = joinPath [indexName, "_search"]

+-- | 'searchByIndices' is a variant of 'searchByIndex' that executes a
+-- 'Search' over many indices. This is much faster than using
+-- 'mapM' to 'searchByIndex' over a collection since it only
+-- causes a single HTTP request to be emitted.
+searchByIndices :: MonadBH m => NonEmpty IndexName -> Search -> m Reply
+searchByIndices ixs = bindM2 dispatchSearch url . return
+  where url = joinPath [renderedIxs, "_search"]
+        renderedIxs = T.intercalate (T.singleton ',') (map (\(IndexName t) -> t) (toList ixs))
+
 -- | 'searchByType', given a 'Search', 'IndexName', and 'MappingName', will perform that
 -- search against a specific mapping within an index on an Elasticsearch server.
 --
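Below, a minimal usage sketch (not part of the commit; it assumes a locally running Elasticsearch node, and the index names are illustrative) of the newly added searchByIndices:

  {-# LANGUAGE OverloadedStrings #-}
  import Data.List.NonEmpty (NonEmpty(..))
  import Network.HTTP.Client (defaultManagerSettings)
  import Database.V5.Bloodhound

  -- One search over two indices: a single HTTP round trip,
  -- unlike mapM-ing searchByIndex over each index.
  searchTwoIndices :: IO Reply
  searchTwoIndices =
    withBH defaultManagerSettings (Server "http://localhost:9200") $
      searchByIndices (IndexName "users" :| [IndexName "events"]) (mkSearch Nothing Nothing)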
@@ -8,24 +8,29 @@ import Data.Aeson.Types (Parser)
 import qualified Data.Map.Strict as M
 import Data.Maybe (catMaybes)
 import Data.Text (Text)
+import qualified Data.Text as T

 import Database.V5.Bloodhound.Internal.Newtypes
+import Database.V5.Bloodhound.Internal.StringlyTyped

 data Analysis = Analysis
   { analysisAnalyzer :: M.Map Text AnalyzerDefinition
   , analysisTokenizer :: M.Map Text TokenizerDefinition
+  , analysisTokenFilter :: M.Map Text TokenFilterDefinition
   } deriving (Eq, Show)

 instance ToJSON Analysis where
-  toJSON (Analysis analyzer tokenizer) = object
+  toJSON (Analysis analyzer tokenizer tokenFilter) = object
     [ "analyzer" .= analyzer
     , "tokenizer" .= tokenizer
+    , "filter" .= tokenFilter
     ]

 instance FromJSON Analysis where
   parseJSON = withObject "Analysis" $ \m -> Analysis
     <$> m .: "analyzer"
-    <*> m .: "tokenizer"
+    <*> m .:? "tokenizer" .!= M.empty
+    <*> m .:? "filter" .!= M.empty

 newtype Tokenizer =
   Tokenizer Text
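For orientation, an illustration (not part of the diff; assumes a GHCi session with Data.Aeson loaded and OverloadedStrings on) of the JSON the extended ToJSON instance emits; key order may differ:

  ghci> encode (Analysis M.empty M.empty (M.singleton "my_reverse" TokenFilterDefinitionReverse))
  "{\"analyzer\":{},\"tokenizer\":{},\"filter\":{\"my_reverse\":{\"type\":\"reverse\"}}}"

Note that the FromJSON side now tolerates settings payloads that omit "tokenizer" or "filter" entirely, defaulting both maps to empty, while "analyzer" remains required.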
@@ -33,17 +38,19 @@ newtype Tokenizer =

 data AnalyzerDefinition = AnalyzerDefinition
   { analyzerDefinitionTokenizer :: Maybe Tokenizer
+  , analyzerDefinitionFilter :: [TokenFilter]
   } deriving (Eq,Show)

 instance ToJSON AnalyzerDefinition where
-  toJSON (AnalyzerDefinition tokenizer) = object $ catMaybes
+  toJSON (AnalyzerDefinition tokenizer tokenFilter) = object $ catMaybes
     [ fmap ("tokenizer" .=) tokenizer
+    , Just $ "filter" .= tokenFilter
     ]

 instance FromJSON AnalyzerDefinition where
   parseJSON = withObject "AnalyzerDefinition" $ \m -> AnalyzerDefinition
     <$> m .:? "tokenizer"
-
+    <*> m .:? "filter" .!= []
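An illustration (not part of the diff; it assumes the string encodings of the Tokenizer and TokenFilter newtypes) of the JSON an AnalyzerDefinition now produces:

  ghci> encode (AnalyzerDefinition (Just (Tokenizer "standard")) [TokenFilter "lowercase"])
  "{\"tokenizer\":\"standard\",\"filter\":[\"lowercase\"]}"

A Nothing tokenizer is dropped by the catMaybes, while the filter list is always emitted, possibly as [].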
 data TokenizerDefinition
   = TokenizerDefinitionNgram Ngram

@@ -98,3 +105,210 @@ instance FromJSON TokenChar where
       "punctuation" -> return TokenPunctuation
       "symbol" -> return TokenSymbol
       _ -> fail "invalid TokenChar"
+
+-- | Token filters are used to create custom analyzers.
+data TokenFilterDefinition
+  = TokenFilterDefinitionLowercase (Maybe Language)
+  | TokenFilterDefinitionUppercase (Maybe Language)
+  | TokenFilterDefinitionApostrophe
+  | TokenFilterDefinitionReverse
+  | TokenFilterDefinitionSnowball Language
+  | TokenFilterDefinitionShingle Shingle
+  deriving (Eq, Show)
+
+instance ToJSON TokenFilterDefinition where
+  toJSON x = case x of
+    TokenFilterDefinitionLowercase mlang -> object $ catMaybes
+      [ Just $ "type" .= ("lowercase" :: Text)
+      , fmap (\lang -> "language" .= languageToText lang) mlang
+      ]
+    TokenFilterDefinitionUppercase mlang -> object $ catMaybes
+      [ Just $ "type" .= ("uppercase" :: Text)
+      , fmap (\lang -> "language" .= languageToText lang) mlang
+      ]
+    TokenFilterDefinitionApostrophe -> object
+      [ "type" .= ("apostrophe" :: Text)
+      ]
+    TokenFilterDefinitionReverse -> object
+      [ "type" .= ("reverse" :: Text)
+      ]
+    TokenFilterDefinitionSnowball lang -> object
+      [ "type" .= ("snowball" :: Text)
+      , "language" .= languageToText lang
+      ]
+    TokenFilterDefinitionShingle s -> object
+      [ "type" .= ("shingle" :: Text)
+      , "max_shingle_size" .= shingleMaxSize s
+      , "min_shingle_size" .= shingleMinSize s
+      , "output_unigrams" .= shingleOutputUnigrams s
+      , "output_unigrams_if_no_shingles" .= shingleOutputUnigramsIfNoShingles s
+      , "token_separator" .= shingleTokenSeparator s
+      , "filler_token" .= shingleFillerToken s
+      ]
+
+instance FromJSON TokenFilterDefinition where
+  parseJSON = withObject "TokenFilterDefinition" $ \m -> do
+    t <- m .: "type"
+    case (t :: Text) of
+      "reverse" -> return TokenFilterDefinitionReverse
+      "apostrophe" -> return TokenFilterDefinitionApostrophe
+      "lowercase" -> TokenFilterDefinitionLowercase
+        <$> m .:? "language"
+      "uppercase" -> TokenFilterDefinitionUppercase
+        <$> m .:? "language"
+      "snowball" -> TokenFilterDefinitionSnowball
+        <$> m .: "language"
+      "shingle" -> fmap TokenFilterDefinitionShingle $ Shingle
+        <$> (fmap.fmap) unStringlyTypedInt (m .:? "max_shingle_size") .!= 2
+        <*> (fmap.fmap) unStringlyTypedInt (m .:? "min_shingle_size") .!= 2
+        <*> (fmap.fmap) unStringlyTypedBool (m .:? "output_unigrams") .!= True
+        <*> (fmap.fmap) unStringlyTypedBool (m .:? "output_unigrams_if_no_shingles") .!= False
+        <*> m .:? "token_separator" .!= " "
+        <*> m .:? "filler_token" .!= "_"
+      _ -> fail ("unrecognized token filter type: " ++ T.unpack t)
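Hedged examples (illustrative, not in the diff; key order may vary, and the decode call assumes OverloadedStrings) of what these instances emit and accept:

  ghci> encode (TokenFilterDefinitionSnowball English)
  "{\"type\":\"snowball\",\"language\":\"english\"}"
  ghci> decode "{\"type\":\"shingle\"}" :: Maybe TokenFilterDefinition
  Just (TokenFilterDefinitionShingle (Shingle 2 2 True False " " "_"))

The second call exercises the parser defaults: shingle sizes of 2, output_unigrams True, output_unigrams_if_no_shingles False, a space separator, and "_" as the filler token.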
+
+-- | The set of languages that can be passed to various analyzers,
+-- filters, etc. in ElasticSearch. Most data types in this module
+-- that have a 'Language' field are actually only used to
+-- handle a subset of these languages. Consult the official
+-- ElasticSearch documentation to see what is actually supported.
+data Language
+  = Arabic
+  | Armenian
+  | Basque
+  | Bengali
+  | Brazilian
+  | Bulgarian
+  | Catalan
+  | Cjk
+  | Czech
+  | Danish
+  | Dutch
+  | English
+  | Finnish
+  | French
+  | Galician
+  | German
+  | German2
+  | Greek
+  | Hindi
+  | Hungarian
+  | Indonesian
+  | Irish
+  | Italian
+  | Kp
+  | Latvian
+  | Lithuanian
+  | Lovins
+  | Norwegian
+  | Persian
+  | Porter
+  | Portuguese
+  | Romanian
+  | Russian
+  | Sorani
+  | Spanish
+  | Swedish
+  | Thai
+  | Turkish
+  deriving (Eq, Show)
+
+instance ToJSON Language where
+  toJSON = Data.Aeson.String . languageToText
+
+instance FromJSON Language where
+  parseJSON = withText "Language" $ \t -> case languageFromText t of
+    Nothing -> fail "not a supported ElasticSearch language"
+    Just lang -> return lang
+
+languageToText :: Language -> Text
+languageToText x = case x of
+  Arabic -> "arabic"
+  Armenian -> "armenian"
+  Basque -> "basque"
+  Bengali -> "bengali"
+  Brazilian -> "brazilian"
+  Bulgarian -> "bulgarian"
+  Catalan -> "catalan"
+  Cjk -> "cjk"
+  Czech -> "czech"
+  Danish -> "danish"
+  Dutch -> "dutch"
+  English -> "english"
+  Finnish -> "finnish"
+  French -> "french"
+  Galician -> "galician"
+  German -> "german"
+  German2 -> "german2"
+  Greek -> "greek"
+  Hindi -> "hindi"
+  Hungarian -> "hungarian"
+  Indonesian -> "indonesian"
+  Irish -> "irish"
+  Italian -> "italian"
+  Kp -> "kp"
+  Latvian -> "latvian"
+  Lithuanian -> "lithuanian"
+  Lovins -> "lovins"
+  Norwegian -> "norwegian"
+  Persian -> "persian"
+  Porter -> "porter"
+  Portuguese -> "portuguese"
+  Romanian -> "romanian"
+  Russian -> "russian"
+  Sorani -> "sorani"
+  Spanish -> "spanish"
+  Swedish -> "swedish"
+  Thai -> "thai"
+  Turkish -> "turkish"
+
+languageFromText :: Text -> Maybe Language
+languageFromText x = case x of
+  "arabic" -> Just Arabic
+  "armenian" -> Just Armenian
+  "basque" -> Just Basque
+  "bengali" -> Just Bengali
+  "brazilian" -> Just Brazilian
+  "bulgarian" -> Just Bulgarian
+  "catalan" -> Just Catalan
+  "cjk" -> Just Cjk
+  "czech" -> Just Czech
+  "danish" -> Just Danish
+  "dutch" -> Just Dutch
+  "english" -> Just English
+  "finnish" -> Just Finnish
+  "french" -> Just French
+  "galician" -> Just Galician
+  "german" -> Just German
+  "german2" -> Just German2
+  "greek" -> Just Greek
+  "hindi" -> Just Hindi
+  "hungarian" -> Just Hungarian
+  "indonesian" -> Just Indonesian
+  "irish" -> Just Irish
+  "italian" -> Just Italian
+  "kp" -> Just Kp
+  "latvian" -> Just Latvian
+  "lithuanian" -> Just Lithuanian
+  "lovins" -> Just Lovins
+  "norwegian" -> Just Norwegian
+  "persian" -> Just Persian
+  "porter" -> Just Porter
+  "portuguese" -> Just Portuguese
+  "romanian" -> Just Romanian
+  "russian" -> Just Russian
+  "sorani" -> Just Sorani
+  "spanish" -> Just Spanish
+  "swedish" -> Just Swedish
+  "thai" -> Just Thai
+  "turkish" -> Just Turkish
+  _ -> Nothing
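By construction, languageFromText is a left inverse of languageToText, so a round-trip property (a sketch one might add to the test suite) holds for every constructor:

  -- prop> \lang -> languageFromText (languageToText lang) == Just lang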
+
+data Shingle = Shingle
+  { shingleMaxSize :: Int
+  , shingleMinSize :: Int
+  , shingleOutputUnigrams :: Bool
+  , shingleOutputUnigramsIfNoShingles :: Bool
+  , shingleTokenSeparator :: Text
+  , shingleFillerToken :: Text
+  } deriving (Eq, Show)
@@ -196,3 +196,6 @@ instance FromJSON MS where
   parseJSON = withScientific "MS" (return . MS . parse)
     where
       parse n = fromInteger ((truncate n) * 1000)
+
+newtype TokenFilter =
+  TokenFilter Text deriving (Eq, Show, FromJSON, ToJSON)
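The new TokenFilter newtype is just a filter name: inside an AnalyzerDefinition it refers either to a key of the analysisTokenFilter map or to a builtin filter. A hedged sketch tying the pieces together (names are illustrative):

  {-# LANGUAGE OverloadedStrings #-}
  import qualified Data.Map.Strict as M

  analysisSketch :: Analysis
  analysisSketch = Analysis
    (M.singleton "my_analyzer"
      (AnalyzerDefinition (Just (Tokenizer "standard")) [TokenFilter "my_reverse"]))
    M.empty  -- no custom tokenizers; "standard" is an Elasticsearch builtin
    (M.singleton "my_reverse" TokenFilterDefinitionReverse)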
@@ -26,8 +26,16 @@ instance FromJSON StringlyTypedInt where
     . parseJSON
     . unStringlyTypeJSON

+newtype StringlyTypedBool = StringlyTypedBool { unStringlyTypedBool :: Bool }
+
+instance FromJSON StringlyTypedBool where
+  parseJSON =
+    fmap StringlyTypedBool
+    . parseJSON
+    . unStringlyTypeJSON
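A doctest-style illustration (not in the diff) of why the unStringlyTypeJSON indirection matters: the settings API may return booleans as strings, and both forms now parse:

  ghci> fmap unStringlyTypedBool (decode "true")
  Just True
  ghci> fmap unStringlyTypedBool (decode "\"true\"")
  Just True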
+
 -- | For some reason in several settings APIs, all leaf values get returned
--- as strings. This function attepmts to recover from this for all
+-- as strings. This function attempts to recover from this for all
 -- non-recursive JSON types. If nothing can be done, the value is left alone.
 unStringlyTypeJSON :: Value -> Value
 unStringlyTypeJSON (String "true") =
@@ -223,6 +223,7 @@ module Database.V5.Bloodhound.Types
     , CutoffFrequency(..)
     , Analyzer(..)
     , Tokenizer(..)
+    , TokenFilter(..)
     , MaxExpansions(..)
     , Lenient(..)
     , MatchQueryType(..)
@@ -404,8 +405,11 @@ module Database.V5.Bloodhound.Types
     , Analysis(..)
     , AnalyzerDefinition(..)
     , TokenizerDefinition(..)
+    , TokenFilterDefinition(..)
     , Ngram(..)
     , TokenChar(..)
+    , Shingle(..)
+    , Language(..)
     ) where

 import Bloodhound.Import
@@ -1575,7 +1579,6 @@ instance FromJSON DocVersion where
     i <- parseJSON v
     maybe (fail "DocVersion out of range") return $ mkDocVersion i

-
 data Suggest = Suggest { suggestText :: Text
                        , suggestName :: Text
                        , suggestType :: SuggestType
@@ -925,6 +925,14 @@ makeArbitrary ''Ngram
 instance Arbitrary Ngram where arbitrary = arbitraryNgram
 makeArbitrary ''TokenizerDefinition
 instance Arbitrary TokenizerDefinition where arbitrary = arbitraryTokenizerDefinition
+makeArbitrary ''TokenFilter
+instance Arbitrary TokenFilter where arbitrary = arbitraryTokenFilter
+makeArbitrary ''TokenFilterDefinition
+instance Arbitrary TokenFilterDefinition where arbitrary = arbitraryTokenFilterDefinition
+makeArbitrary ''Language
+instance Arbitrary Language where arbitrary = arbitraryLanguage
+makeArbitrary ''Shingle
+instance Arbitrary Shingle where arbitrary = arbitraryShingle
 makeArbitrary ''AnalyzerDefinition
 instance Arbitrary AnalyzerDefinition where arbitrary = arbitraryAnalyzerDefinition
 makeArbitrary ''Analysis
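With Arbitrary instances for the new types in place, the JSON instances can be property-tested for round-tripping; a sketch in the suite's style, assuming a round-trip helper (the name propJSON is hypothetical here):

  -- propJSON (Proxy :: Proxy TokenFilterDefinition)
  -- propJSON (Proxy :: Proxy Language)
  -- propJSON (Proxy :: Proxy Analysis)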
@@ -1023,6 +1031,7 @@ instance Arbitrary UpdatableIndexSetting' where
       NE.fromList . L.nubBy sameAttrName . NE.toList
     sameAttrName a b =
       nodeAttrFilterName a == nodeAttrFilterName b
  -- shrink (UpdatableIndexSetting' x) = map UpdatableIndexSetting' (shrink x)

 -- | Due to the way nodeattrfilters get serialized here, they may come
 -- out in a different order, but they are morally equivalent
@@ -1773,12 +1782,31 @@ main = hspec $ do
   it "accepts custom analyzers" $ when' (atleast es50) $ withTestEnv $ do
     _ <- deleteExampleIndex
     let analysis = Analysis
-          (M.singleton "ex_analyzer" (AnalyzerDefinition (Just (Tokenizer "ex_tokenizer"))))
+          (M.singleton "ex_analyzer"
+            ( AnalyzerDefinition
+              (Just (Tokenizer "ex_tokenizer"))
+              (map TokenFilter
+                [ "ex_filter_lowercase","ex_filter_uppercase","ex_filter_apostrophe"
+                , "ex_filter_reverse","ex_filter_snowball"
+                , "ex_filter_shingle"
+                ]
+              )
+            )
+          )
           (M.singleton "ex_tokenizer"
             ( TokenizerDefinitionNgram
               ( Ngram 3 4 [TokenLetter,TokenDigit])
             )
           )
+          (M.fromList
+            [ ("ex_filter_lowercase",TokenFilterDefinitionLowercase (Just Greek))
+            , ("ex_filter_uppercase",TokenFilterDefinitionUppercase Nothing)
+            , ("ex_filter_apostrophe",TokenFilterDefinitionApostrophe)
+            , ("ex_filter_reverse",TokenFilterDefinitionReverse)
+            , ("ex_filter_snowball",TokenFilterDefinitionSnowball English)
+            , ("ex_filter_shingle",TokenFilterDefinitionShingle (Shingle 3 3 True False " " "_"))
+            ]
+          )
         updates = [AnalysisSetting analysis]
     createResp <- createIndexWith (updates ++ [NumberOfReplicas (ReplicaCount 0)]) 1 testIndex
     liftIO $ validateStatus createResp 200