Merged token filters work

This commit is contained in:
Chris Allen 2018-02-08 18:18:47 -06:00
commit c64536c39c
7 changed files with 275 additions and 8 deletions

View File

@@ -21,7 +21,8 @@ env:
- GHCVER=8.2 ESVER=1.7.6 STACK_YAML=stack.yaml ESFLAG=ES1 DLINK=https://download.elastic.co/elasticsearch/elasticsearch/elasticsearch
- GHCVER=8.2 ESVER=5.0.2 STACK_YAML=stack.yaml ESFLAG=ES5 DLINK=https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch ES_JAVA_OPTS="-Xms500m -Xmx750m"
- GHCVER=8.2 ESVER=5.5.0 STACK_YAML=stack.yaml ESFLAG=ES5 DLINK=https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch ES_JAVA_OPTS="-Xms500m -Xmx750m"
- GHCVER=8.2 ESVER=6.1.3 STACK_YAML=stack.yaml ESFLAG=ES5 DLINK=https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch ES_JAVA_OPTS="-Xms500m -Xmx750m"
allow_failures:
- GHCVER=8.2 ESVER=6.1.3 STACK_YAML=stack.yaml ESFLAG=ES5 DLINK=https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch ES_JAVA_OPTS="-Xms500m -Xmx750m"
install:
# stack

View File

@@ -54,6 +54,7 @@ module Database.V5.Bloodhound.Client
-- ** Searching
, searchAll
, searchByIndex
, searchByIndices
, searchByType
, scanSearch
, getInitialScroll
@@ -995,6 +996,15 @@ searchByIndex :: MonadBH m => IndexName -> Search -> m Reply
searchByIndex (IndexName indexName) = bindM2 dispatchSearch url . return
where url = joinPath [indexName, "_search"]
-- | 'searchByIndices' is a variant of 'searchByIndex' that executes a
-- 'Search' over many indices. This is more efficient than mapping
-- 'searchByIndex' over a collection, since it issues only a single
-- HTTP request.
searchByIndices :: MonadBH m => NonEmpty IndexName -> Search -> m Reply
searchByIndices ixs = bindM2 dispatchSearch url . return
where url = joinPath [renderedIxs, "_search"]
renderedIxs = T.intercalate (T.singleton ',') (map (\(IndexName t) -> t) (toList ixs))
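-- A rough usage sketch for 'searchByIndices'; the environment wiring and
-- index names here are hypothetical, only 'searchByIndices' itself is real:
--
-- > searchTwoIndices :: BHEnv -> Search -> IO Reply
-- > searchTwoIndices env query =
-- >   runBH env $
-- >     searchByIndices (IndexName "logs-a" :| [IndexName "logs-b"]) query
--
-- The ':|' constructor comes from "Data.List.NonEmpty".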
-- | 'searchByType', given a 'Search', 'IndexName', and 'MappingName', will perform that
-- search against a specific mapping within an index on an Elasticsearch server.
--

View File

@@ -8,24 +8,29 @@ import Data.Aeson.Types (Parser)
import qualified Data.Map.Strict as M
import Data.Maybe (catMaybes)
import Data.Text (Text)
import qualified Data.Text as T
import Database.V5.Bloodhound.Internal.Newtypes
import Database.V5.Bloodhound.Internal.StringlyTyped
data Analysis = Analysis
{ analysisAnalyzer :: M.Map Text AnalyzerDefinition
, analysisTokenizer :: M.Map Text TokenizerDefinition
, analysisTokenFilter :: M.Map Text TokenFilterDefinition
} deriving (Eq, Show)
instance ToJSON Analysis where
toJSON (Analysis analyzer tokenizer) = object
toJSON (Analysis analyzer tokenizer tokenFilter) = object
[ "analyzer" .= analyzer
, "tokenizer" .= tokenizer
, "filter" .= tokenFilter
]
instance FromJSON Analysis where
parseJSON = withObject "Analysis" $ \m -> Analysis
<$> m .: "analyzer"
<*> m .: "tokenizer"
<*> m .:? "tokenizer" .!= M.empty
<*> m .:? "filter" .!= M.empty
newtype Tokenizer =
Tokenizer Text
@@ -33,17 +38,19 @@ newtype Tokenizer =
data AnalyzerDefinition = AnalyzerDefinition
{ analyzerDefinitionTokenizer :: Maybe Tokenizer
, analyzerDefinitionFilter :: [TokenFilter]
} deriving (Eq,Show)
instance ToJSON AnalyzerDefinition where
toJSON (AnalyzerDefinition tokenizer) = object $ catMaybes
toJSON (AnalyzerDefinition tokenizer tokenFilter) = object $ catMaybes
[ fmap ("tokenizer" .=) tokenizer
, Just $ "filter" .= tokenFilter
]
instance FromJSON AnalyzerDefinition where
parseJSON = withObject "AnalyzerDefinition" $ \m -> AnalyzerDefinition
<$> m .:? "tokenizer"
<*> m .:? "filter" .!= []
data TokenizerDefinition
= TokenizerDefinitionNgram Ngram
@@ -98,3 +105,210 @@ instance FromJSON TokenChar where
"punctuation" -> return TokenPunctuation
"symbol" -> return TokenSymbol
_ -> fail "invalid TokenChar"
-- | Token filters are used to create custom analyzers.
data TokenFilterDefinition
= TokenFilterDefinitionLowercase (Maybe Language)
| TokenFilterDefinitionUppercase (Maybe Language)
| TokenFilterDefinitionApostrophe
| TokenFilterDefinitionReverse
| TokenFilterDefinitionSnowball Language
| TokenFilterDefinitionShingle Shingle
deriving (Eq, Show)
instance ToJSON TokenFilterDefinition where
toJSON x = case x of
TokenFilterDefinitionLowercase mlang -> object $ catMaybes
[ Just $ "type" .= ("lowercase" :: Text)
, fmap (\lang -> "language" .= languageToText lang) mlang
]
TokenFilterDefinitionUppercase mlang -> object $ catMaybes
[ Just $ "type" .= ("uppercase" :: Text)
, fmap (\lang -> "language" .= languageToText lang) mlang
]
TokenFilterDefinitionApostrophe -> object
[ "type" .= ("apostrophe" :: Text)
]
TokenFilterDefinitionReverse -> object
[ "type" .= ("reverse" :: Text)
]
TokenFilterDefinitionSnowball lang -> object
[ "type" .= ("snowball" :: Text)
, "language" .= languageToText lang
]
TokenFilterDefinitionShingle s -> object
[ "type" .= ("shingle" :: Text)
, "max_shingle_size" .= shingleMaxSize s
, "min_shingle_size" .= shingleMinSize s
, "output_unigrams" .= shingleOutputUnigrams s
, "output_unigrams_if_no_shingles" .= shingleOutputUnigramsIfNoShingles s
, "token_separator" .= shingleTokenSeparator s
, "filler_token" .= shingleFillerToken s
]
instance FromJSON TokenFilterDefinition where
parseJSON = withObject "TokenFilterDefinition" $ \m -> do
t <- m .: "type"
case (t :: Text) of
"reverse" -> return TokenFilterDefinitionReverse
"apostrophe" -> return TokenFilterDefinitionApostrophe
"lowercase" -> TokenFilterDefinitionLowercase
<$> m .:? "language"
"uppercase" -> TokenFilterDefinitionUppercase
<$> m .:? "language"
"snowball" -> TokenFilterDefinitionSnowball
<$> m .: "language"
"shingle" -> fmap TokenFilterDefinitionShingle $ Shingle
<$> (fmap.fmap) unStringlyTypedInt (m .:? "max_shingle_size") .!= 2
<*> (fmap.fmap) unStringlyTypedInt (m .:? "min_shingle_size") .!= 2
<*> (fmap.fmap) unStringlyTypedBool (m .:? "output_unigrams") .!= True
<*> (fmap.fmap) unStringlyTypedBool (m .:? "output_unigrams_if_no_shingles") .!= False
<*> m .:? "token_separator" .!= " "
<*> m .:? "filler_token" .!= "_"
_ -> fail ("unrecognized token filter type: " ++ T.unpack t)
-- | The set of languages that can be passed to various analyzers,
-- filters, etc. in Elasticsearch. Most data types in this module
-- that have a 'Language' field actually only handle a subset of
-- these languages. Consult the official Elasticsearch documentation
-- to see what is supported in each case.
data Language
= Arabic
| Armenian
| Basque
| Bengali
| Brazilian
| Bulgarian
| Catalan
| Cjk
| Czech
| Danish
| Dutch
| English
| Finnish
| French
| Galician
| German
| German2
| Greek
| Hindi
| Hungarian
| Indonesian
| Irish
| Italian
| Kp
| Latvian
| Lithuanian
| Lovins
| Norwegian
| Persian
| Porter
| Portuguese
| Romanian
| Russian
| Sorani
| Spanish
| Swedish
| Thai
| Turkish
deriving (Eq, Show)
instance ToJSON Language where
toJSON = Data.Aeson.String . languageToText
instance FromJSON Language where
parseJSON = withText "Language" $ \t -> case languageFromText t of
Nothing -> fail "not a supported ElasticSearch language"
Just lang -> return lang
languageToText :: Language -> Text
languageToText x = case x of
Arabic -> "arabic"
Armenian -> "armenian"
Basque -> "basque"
Bengali -> "bengali"
Brazilian -> "brazilian"
Bulgarian -> "bulgarian"
Catalan -> "catalan"
Cjk -> "cjk"
Czech -> "czech"
Danish -> "danish"
Dutch -> "dutch"
English -> "english"
Finnish -> "finnish"
French -> "french"
Galician -> "galician"
German -> "german"
German2 -> "german2"
Greek -> "greek"
Hindi -> "hindi"
Hungarian -> "hungarian"
Indonesian -> "indonesian"
Irish -> "irish"
Italian -> "italian"
Kp -> "kp"
Latvian -> "latvian"
Lithuanian -> "lithuanian"
Lovins -> "lovins"
Norwegian -> "norwegian"
Persian -> "persian"
Porter -> "porter"
Portuguese -> "portuguese"
Romanian -> "romanian"
Russian -> "russian"
Sorani -> "sorani"
Spanish -> "spanish"
Swedish -> "swedish"
Thai -> "thai"
Turkish -> "turkish"
languageFromText :: Text -> Maybe Language
languageFromText x = case x of
"arabic" -> Just Arabic
"armenian" -> Just Armenian
"basque" -> Just Basque
"bengali" -> Just Bengali
"brazilian" -> Just Brazilian
"bulgarian" -> Just Bulgarian
"catalan" -> Just Catalan
"cjk" -> Just Cjk
"czech" -> Just Czech
"danish" -> Just Danish
"dutch" -> Just Dutch
"english" -> Just English
"finnish" -> Just Finnish
"french" -> Just French
"galician" -> Just Galician
"german" -> Just German
"german2" -> Just German2
"greek" -> Just Greek
"hindi" -> Just Hindi
"hungarian" -> Just Hungarian
"indonesian" -> Just Indonesian
"irish" -> Just Irish
"italian" -> Just Italian
"kp" -> Just Kp
"latvian" -> Just Latvian
"lithuanian" -> Just Lithuanian
"lovins" -> Just Lovins
"norwegian" -> Just Norwegian
"persian" -> Just Persian
"porter" -> Just Porter
"portuguese" -> Just Portuguese
"romanian" -> Just Romanian
"russian" -> Just Russian
"sorani" -> Just Sorani
"spanish" -> Just Spanish
"swedish" -> Just Swedish
"thai" -> Just Thai
"turkish" -> Just Turkish
_ -> Nothing
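-- A quick sanity sketch: 'languageFromText' is intended to invert
-- 'languageToText' for every constructor, e.g.
--
-- > all (\lang -> languageFromText (languageToText lang) == Just lang)
-- >     [Arabic, English, German2, Sorani, Turkish]
--
-- evaluates to True.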
data Shingle = Shingle
{ shingleMaxSize :: Int
, shingleMinSize :: Int
, shingleOutputUnigrams :: Bool
, shingleOutputUnigramsIfNoShingles :: Bool
, shingleTokenSeparator :: Text
, shingleFillerToken :: Text
} deriving (Eq, Show)

View File

@@ -196,3 +196,6 @@ instance FromJSON MS where
parseJSON = withScientific "MS" (return . MS . parse)
where
parse n = fromInteger ((truncate n) * 1000)
newtype TokenFilter =
TokenFilter Text deriving (Eq, Show, FromJSON, ToJSON)

View File

@@ -26,8 +26,16 @@ instance FromJSON StringlyTypedInt where
. parseJSON
. unStringlyTypeJSON
newtype StringlyTypedBool = StringlyTypedBool { unStringlyTypedBool :: Bool }
instance FromJSON StringlyTypedBool where
parseJSON =
fmap StringlyTypedBool
. parseJSON
. unStringlyTypeJSON
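-- A small usage sketch ('parseMaybe' is from Data.Aeson.Types and 'asBool'
-- is a hypothetical helper): settings that come back as the string "true"
-- should decode the same way as a real JSON boolean.
--
-- > asBool :: Value -> Maybe Bool
-- > asBool v = unStringlyTypedBool <$> parseMaybe parseJSON v
-- >
-- > -- asBool (String "true") == Just True
-- > -- asBool (Bool True)     == Just True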
-- | For some reason in several settings APIs, all leaf values get returned
-- as strings. This function attepmts to recover from this for all
-- as strings. This function attempts to recover from this for all
-- non-recursive JSON types. If nothing can be done, the value is left alone.
unStringlyTypeJSON :: Value -> Value
unStringlyTypeJSON (String "true") =

View File

@@ -223,6 +223,7 @@ module Database.V5.Bloodhound.Types
, CutoffFrequency(..)
, Analyzer(..)
, Tokenizer(..)
, TokenFilter(..)
, MaxExpansions(..)
, Lenient(..)
, MatchQueryType(..)
@@ -404,8 +405,11 @@ module Database.V5.Bloodhound.Types
, Analysis(..)
, AnalyzerDefinition(..)
, TokenizerDefinition(..)
, TokenFilterDefinition(..)
, Ngram(..)
, TokenChar(..)
, Shingle(..)
, Language(..)
) where
import Bloodhound.Import
@@ -1575,7 +1579,6 @@ instance FromJSON DocVersion where
i <- parseJSON v
maybe (fail "DocVersion out of range") return $ mkDocVersion i
data Suggest = Suggest { suggestText :: Text
, suggestName :: Text
, suggestType :: SuggestType

View File

@@ -925,6 +925,14 @@ makeArbitrary ''Ngram
instance Arbitrary Ngram where arbitrary = arbitraryNgram
makeArbitrary ''TokenizerDefinition
instance Arbitrary TokenizerDefinition where arbitrary = arbitraryTokenizerDefinition
makeArbitrary ''TokenFilter
instance Arbitrary TokenFilter where arbitrary = arbitraryTokenFilter
makeArbitrary ''TokenFilterDefinition
instance Arbitrary TokenFilterDefinition where arbitrary = arbitraryTokenFilterDefinition
makeArbitrary ''Language
instance Arbitrary Language where arbitrary = arbitraryLanguage
makeArbitrary ''Shingle
instance Arbitrary Shingle where arbitrary = arbitraryShingle
makeArbitrary ''AnalyzerDefinition
instance Arbitrary AnalyzerDefinition where arbitrary = arbitraryAnalyzerDefinition
makeArbitrary ''Analysis
@@ -1023,6 +1031,7 @@ instance Arbitrary UpdatableIndexSetting' where
NE.fromList . L.nubBy sameAttrName . NE.toList
sameAttrName a b =
nodeAttrFilterName a == nodeAttrFilterName b
-- shrink (UpdatableIndexSetting' x) = map UpdatableIndexSetting' (shrink x)
-- | Due to the way nodeattrfilters get serialized here, they may come
-- out in a different order, but they are morally equivalent
@@ -1773,12 +1782,31 @@ main = hspec $ do
it "accepts customer analyzers" $ when' (atleast es50) $ withTestEnv $ do
_ <- deleteExampleIndex
let analysis = Analysis
(M.singleton "ex_analyzer" (AnalyzerDefinition (Just (Tokenizer "ex_tokenizer"))))
(M.singleton "ex_analyzer"
( AnalyzerDefinition
(Just (Tokenizer "ex_tokenizer"))
(map TokenFilter
[ "ex_filter_lowercase","ex_filter_uppercase","ex_filter_apostrophe"
, "ex_filter_reverse","ex_filter_snowball"
, "ex_filter_shingle"
]
)
)
)
(M.singleton "ex_tokenizer"
( TokenizerDefinitionNgram
( Ngram 3 4 [TokenLetter,TokenDigit])
)
)
(M.fromList
[ ("ex_filter_lowercase",TokenFilterDefinitionLowercase (Just Greek))
, ("ex_filter_uppercase",TokenFilterDefinitionUppercase Nothing)
, ("ex_filter_apostrophe",TokenFilterDefinitionApostrophe)
, ("ex_filter_reverse",TokenFilterDefinitionReverse)
, ("ex_filter_snowball",TokenFilterDefinitionSnowball English)
, ("ex_filter_shingle",TokenFilterDefinitionShingle (Shingle 3 3 True False " " "_"))
]
)
updates = [AnalysisSetting analysis]
createResp <- createIndexWith (updates ++ [NumberOfReplicas (ReplicaCount 0)]) 1 testIndex
liftIO $ validateStatus createResp 200