Chris Allen 2018-04-07 17:16:56 -05:00
parent 380da47cda
commit 2ddaedd05e
7 changed files with 127 additions and 8 deletions

View File

@@ -1,8 +1,10 @@
0.16.0.0
========
- @bitemyapp
- Reorganized modules internally, ripped out Generic,
rewrote part of the test suite
- Reorganized V5 modules internally, ripped out Generic,
rewrote part of the test suite.
- @mxavier
- Reorganized V1 modules, mirroring the work done on V5.
- @andrewthad
- Added support for autogenerated Elasticsearch IDs in the bulk API
- Added support for token filters
@@ -11,6 +13,12 @@
- Added support for script fields and function score queries
- @bermanjosh
- Added support for direct generators
- @rvl
- Added [character filters][] to the `AnalyzerDefinition` type.
- Added the "stemmer" and "stop" [token filters][] to `TokenFilterDefinition` (see the sketch below).
[Character Filters]: https://www.elastic.co/guide/en/elasticsearch/reference/5.6/analysis-charfilters.html
[Token Filters]: https://www.elastic.co/guide/en/elasticsearch/reference/5.6/analysis-tokenfilters.html
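
A minimal sketch of how the pieces added here fit together, building an `Analysis` value that wires an analyzer to a custom character filter plus the new "stemmer" and "stop" token filters. Constructor and field names are taken from the diffs below; the top-level `Database.V5.Bloodhound` import and the illustrative `"my_*"` names are assumptions.

```haskell
{-# LANGUAGE OverloadedStrings #-}

import qualified Data.Map.Strict as M
import Database.V5.Bloodhound  -- assumed to re-export the V5 analysis types

exampleAnalysis :: Analysis
exampleAnalysis = Analysis
  -- analyzers: name -> definition
  (M.singleton "my_analyzer" $
     AnalyzerDefinition
       (Just (Tokenizer "standard"))
       [TokenFilter "my_stemmer", TokenFilter "my_stop"]
       [CharFilter "my_mapping"])
  -- no custom tokenizers
  M.empty
  -- token filters referenced by name above
  (M.fromList
     [ ("my_stemmer", TokenFilterDefinitionStemmer English)
     , ("my_stop", TokenFilterDefinitionStop (Left English))
     ])
  -- character filters referenced by name above
  (M.singleton "my_mapping" $
     CharFilterDefinitionMapping (M.singleton "&" "and"))
```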
0.15.0.2
========

View File

@@ -6,6 +6,7 @@ module Database.V5.Bloodhound.Internal.Analysis where
import Bloodhound.Import
import qualified Data.Map.Strict as M
import Data.String
import qualified Data.Text as T
import Database.V5.Bloodhound.Internal.Newtypes
@@ -15,13 +16,15 @@ data Analysis = Analysis
{ analysisAnalyzer :: M.Map Text AnalyzerDefinition
, analysisTokenizer :: M.Map Text TokenizerDefinition
, analysisTokenFilter :: M.Map Text TokenFilterDefinition
, analysisCharFilter :: M.Map Text CharFilterDefinition
} deriving (Eq, Show)
instance ToJSON Analysis where
toJSON (Analysis analyzer tokenizer tokenFilter) = object
toJSON (Analysis analyzer tokenizer tokenFilter charFilter) = object
[ "analyzer" .= analyzer
, "tokenizer" .= tokenizer
, "filter" .= tokenFilter
, "char_filter" .= charFilter
]
instance FromJSON Analysis where
@@ -29,6 +32,7 @@ instance FromJSON Analysis where
<$> m .: "analyzer"
<*> m .:? "tokenizer" .!= M.empty
<*> m .:? "filter" .!= M.empty
<*> m .:? "char_filter" .!= M.empty
newtype Tokenizer =
Tokenizer Text
@@ -37,18 +41,57 @@ newtype Tokenizer =
data AnalyzerDefinition = AnalyzerDefinition
{ analyzerDefinitionTokenizer :: Maybe Tokenizer
, analyzerDefinitionFilter :: [TokenFilter]
, analyzerDefinitionCharFilter :: [CharFilter]
} deriving (Eq,Show)
instance ToJSON AnalyzerDefinition where
toJSON (AnalyzerDefinition tokenizer tokenFilter) = object $ catMaybes
toJSON (AnalyzerDefinition tokenizer tokenFilter charFilter) =
object $ catMaybes
[ fmap ("tokenizer" .=) tokenizer
, Just $ "filter" .= tokenFilter
, Just $ "char_filter" .= charFilter
]
instance FromJSON AnalyzerDefinition where
parseJSON = withObject "AnalyzerDefinition" $ \m -> AnalyzerDefinition
<$> m .:? "tokenizer"
<*> m .:? "filter" .!= []
<*> m .:? "char_filter" .!= []
-- | Character filters are used to preprocess the stream of characters
-- before it is passed to the tokenizer.
data CharFilterDefinition
= CharFilterDefinitionMapping (M.Map Text Text)
| CharFilterDefinitionPatternReplace
{ charFilterDefinitionPatternReplacePattern :: Text
, charFilterDefinitionPatternReplaceReplacement :: Text
, charFilterDefinitionPatternReplaceFlags :: Maybe Text
}
deriving (Eq, Show)
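
Concretely, the two variants might be built like this; a sketch whose values mirror the test fixtures at the bottom of this commit:

```haskell
{-# LANGUAGE OverloadedStrings #-}

import qualified Data.Map.Strict as M

-- "mapping": replace each occurrence of a key with its value.
arabicDigits :: CharFilterDefinition
arabicDigits = CharFilterDefinitionMapping (M.singleton "١" "1")

-- "pattern_replace": a Java regex, its replacement, and optional flags.
digitGroups :: CharFilterDefinition
digitGroups = CharFilterDefinitionPatternReplace
  "(\\d+)-(?=\\d)"  -- pattern
  "$1_"             -- replacement
  Nothing           -- flags, e.g. Just "CASE_INSENSITIVE"
```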
instance ToJSON CharFilterDefinition where
toJSON (CharFilterDefinitionMapping ms) = object
[ "type" .= ("mapping" :: Text)
, "mappings" .= [a <> " => " <> b | (a, b) <- M.toList ms] ]
toJSON (CharFilterDefinitionPatternReplace pat repl flags) = object $
[ "type" .= ("pattern_replace" :: Text)
, "pattern" .= pat
, "replacement" .= repl
] ++ maybe [] (\f -> ["flags" .= f]) flags
instance FromJSON CharFilterDefinition where
parseJSON = withObject "CharFilterDefinition" $ \m -> do
t <- m .: "type"
case (t :: Text) of
"mapping" -> CharFilterDefinitionMapping . M.fromList <$> ms
where
ms = m .: "mappings" >>= mapM parseMapping
parseMapping kv = case T.splitOn "=>" kv of
(k:vs) -> pure (T.strip k, T.strip $ T.concat vs)
_ -> fail "mapping is not of the format key => value"
"pattern_replace" -> CharFilterDefinitionPatternReplace
<$> m .: "pattern" <*> m .: "replacement" <*> m .:? "flags"
_ -> fail ("unrecognized character filter type: " ++ T.unpack t)
newtype TokenizerDefinition =
TokenizerDefinitionNgram Ngram
@@ -112,6 +155,8 @@ data TokenFilterDefinition
| TokenFilterDefinitionReverse
| TokenFilterDefinitionSnowball Language
| TokenFilterDefinitionShingle Shingle
| TokenFilterDefinitionStemmer Language
| TokenFilterDefinitionStop (Either Language [StopWord])
deriving (Eq, Show)
instance ToJSON TokenFilterDefinition where
@@ -143,6 +188,16 @@ instance ToJSON TokenFilterDefinition where
, "token_separator" .= shingleTokenSeparator s
, "filler_token" .= shingleFillerToken s
]
TokenFilterDefinitionStemmer lang -> object
[ "type" .= ("stemmer" :: Text)
, "language" .= languageToText lang
]
TokenFilterDefinitionStop stop -> object
[ "type" .= ("stop" :: Text)
, "stopwords" .= case stop of
Left lang -> String $ "_" <> languageToText lang <> "_"
Right stops -> toJSON stops
]
instance FromJSON TokenFilterDefinition where
parseJSON = withObject "TokenFilterDefinition" $ \m -> do
@@ -163,6 +218,19 @@ instance FromJSON TokenFilterDefinition where
<*> (fmap.fmap) unStringlyTypedBool (m .:? "output_unigrams_if_no_shingles") .!= False
<*> m .:? "token_separator" .!= " "
<*> m .:? "filler_token" .!= "_"
"stemmer" -> TokenFilterDefinitionStemmer
<$> m .: "language"
"stop" -> do
stop <- m .: "stopwords"
stop' <- case stop of
String lang ->
fmap Left
. parseJSON
. String
. T.drop 1
. T.dropEnd 1 $ lang
_ -> Right <$> parseJSON stop
return (TokenFilterDefinitionStop stop')
_ -> fail ("unrecognized token filter type: " ++ T.unpack t)
-- | The set of languages that can be passed to various analyzers,

View File

@@ -231,4 +231,9 @@ instance FromJSON MS where
parse n = fromInteger (truncate n * 1000)
newtype TokenFilter =
TokenFilter Text deriving (Eq, Show, FromJSON, ToJSON)
TokenFilter Text
deriving (Eq, Show, FromJSON, ToJSON)
newtype CharFilter =
CharFilter Text
deriving (Eq, Show, FromJSON, ToJSON)

View File

@@ -262,4 +262,9 @@ instance FromJSON DirectGenerators where
<*> o .:? "post_filter"
mkDirectGenerators :: FieldName -> DirectGenerators
mkDirectGenerators fn = DirectGenerators fn Nothing DirectGeneratorSuggestModeMissing Nothing Nothing Nothing Nothing Nothing Nothing Nothing Nothing
mkDirectGenerators fn =
DirectGenerators
fn
Nothing
DirectGeneratorSuggestModeMissing
Nothing Nothing Nothing Nothing Nothing Nothing Nothing Nothing
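
A usage sketch; `FieldName` is assumed from Bloodhound's shared types, and every optional setting starts out as `Nothing`, with the suggest mode defaulting to "missing":

```haskell
{-# LANGUAGE OverloadedStrings #-}

-- Build direct generators for the "body" field, tweaking nothing else.
bodyGenerators :: DirectGenerators
bodyGenerators = mkDirectGenerators (FieldName "body")
```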

View File

@@ -224,6 +224,7 @@ module Database.V5.Bloodhound.Types
, Analyzer(..)
, Tokenizer(..)
, TokenFilter(..)
, CharFilter(..)
, MaxExpansions(..)
, Lenient(..)
, MatchQueryType(..)
@@ -406,6 +407,7 @@ module Database.V5.Bloodhound.Types
, AnalyzerDefinition(..)
, TokenizerDefinition(..)
, TokenFilterDefinition(..)
, CharFilterDefinition(..)
, Ngram(..)
, TokenChar(..)
, Shingle(..)

View File

@@ -435,8 +435,25 @@ makeArbitrary ''Language
instance Arbitrary Language where arbitrary = arbitraryLanguage
makeArbitrary ''Shingle
instance Arbitrary Shingle where arbitrary = arbitraryShingle
makeArbitrary ''CharFilter
instance Arbitrary CharFilter where arbitrary = arbitraryCharFilter
makeArbitrary ''AnalyzerDefinition
instance Arbitrary AnalyzerDefinition where arbitrary = arbitraryAnalyzerDefinition
-- TODO: This should have a proper generator that doesn't
-- create garbage that has to be filtered out.
instance Arbitrary CharFilterDefinition where
arbitrary =
oneof [ CharFilterDefinitionMapping
. chomp <$> arbitrary
, CharFilterDefinitionPatternReplace
<$> arbitrary <*> arbitrary <*> arbitrary
]
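-- chomp strips surrounding whitespace from keys and values and
-- removes "=>" from keys: mapping entries round-trip through
-- "key => value" strings, so those characters would not otherwise
-- survive an encode/decode cycle.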
where chomp =
M.map T.strip
. M.mapKeys (T.replace "=>" "" . T.strip)
makeArbitrary ''Analysis
instance Arbitrary Analysis where arbitrary = arbitraryAnalysis
makeArbitrary ''Tokenizer
@@ -533,4 +550,4 @@ instance Arbitrary UpdatableIndexSetting' where
NE.fromList . L.nubBy sameAttrName . NE.toList
sameAttrName a b =
nodeAttrFilterName a == nodeAttrFilterName b
-- shrink (UpdatableIndexSetting' x) = map UpdatableIndexSetting' (shrink x)
shrink (UpdatableIndexSetting' x) = map UpdatableIndexSetting' (shrink x)

View File

@@ -99,6 +99,9 @@ spec = do
, "ex_filter_shingle"
]
)
(map CharFilter
["html_strip", "ex_mapping", "ex_pattern_replace"]
)
)
)
(M.singleton "ex_tokenizer"
@@ -113,7 +116,18 @@
, ("ex_filter_reverse",TokenFilterDefinitionReverse)
, ("ex_filter_snowball",TokenFilterDefinitionSnowball English)
, ("ex_filter_shingle",TokenFilterDefinitionShingle (Shingle 3 3 True False " " "_"))
]
, ("ex_filter_stemmer",TokenFilterDefinitionStemmer German)
, ("ex_filter_stop1", TokenFilterDefinitionStop (Left French))
, ("ex_filter_stop2",
TokenFilterDefinitionStop
(Right
$ map StopWord ["a", "is", "the"]))
]
)
(M.fromList
[ ("ex_mapping", CharFilterDefinitionMapping (M.singleton "١" "1"))
, ("ex_pattern_replace", CharFilterDefinitionPatternReplace "(\\d+)-(?=\\d)" "$1_" Nothing)
]
)
updates = [AnalysisSetting analysis]
createResp <- createIndexWith (updates ++ [NumberOfReplicas (ReplicaCount 0)]) 1 testIndex