mirror of
https://github.com/typeable/bloodhound.git
synced 2024-08-16 11:50:34 +03:00
Merged #230
This commit is contained in:
parent
380da47cda
commit
2ddaedd05e
12
changelog.md
12
changelog.md
@ -1,8 +1,10 @@
|
||||
0.16.0.0
|
||||
========
|
||||
- @bitemyapp
|
||||
- Reorganized modules internally, ripped out Generic,
|
||||
rewrote part of the test suite
|
||||
- Reorganized V5 modules internally, ripped out Generic,
|
||||
rewrote part of the test suite.
|
||||
- @mxavier
|
||||
- Reorganized V1 modules, mirroring the work done on V5.
|
||||
- @andrewthad
|
||||
- Added support for autogenerated elasticsearch ids in the bulk API
|
||||
- Added support for token filters
|
||||
@ -11,6 +13,12 @@
|
||||
- Added support for scripts fields and function score queries
|
||||
- @bermanjosh
|
||||
- Added support for direct generators
|
||||
- @rvl
|
||||
- Add [character filters][] to the `AnalyzerDefinition` type.
|
||||
- Add the "stemmer" and "stop" [token filters][] to `TokenFilterDefinition`.
|
||||
|
||||
[Character Filters]: https://www.elastic.co/guide/en/elasticsearch/reference/5.6/analysis-charfilters.html
|
||||
[Token Filters]: https://www.elastic.co/guide/en/elasticsearch/reference/5.6/analysis-tokenfilters.html
|
||||
|
||||
0.15.0.2
|
||||
========
|
||||
|
@ -6,6 +6,7 @@ module Database.V5.Bloodhound.Internal.Analysis where
|
||||
import Bloodhound.Import
|
||||
|
||||
import qualified Data.Map.Strict as M
|
||||
import Data.String
|
||||
import qualified Data.Text as T
|
||||
|
||||
import Database.V5.Bloodhound.Internal.Newtypes
|
||||
@ -15,13 +16,15 @@ data Analysis = Analysis
|
||||
{ analysisAnalyzer :: M.Map Text AnalyzerDefinition
|
||||
, analysisTokenizer :: M.Map Text TokenizerDefinition
|
||||
, analysisTokenFilter :: M.Map Text TokenFilterDefinition
|
||||
, analysisCharFilter :: M.Map Text CharFilterDefinition
|
||||
} deriving (Eq, Show)
|
||||
|
||||
-- | Encodes an 'Analysis' as a JSON object with one key per component:
-- @analyzer@, @tokenizer@, @filter@ and @char_filter@.
instance ToJSON Analysis where
  toJSON (Analysis analyzer tokenizer tokenFilter charFilter) = object
    [ "analyzer" .= analyzer
    , "tokenizer" .= tokenizer
    , "filter" .= tokenFilter
    , "char_filter" .= charFilter
    ]
|
||||
|
||||
instance FromJSON Analysis where
|
||||
@ -29,6 +32,7 @@ instance FromJSON Analysis where
|
||||
<$> m .: "analyzer"
|
||||
<*> m .:? "tokenizer" .!= M.empty
|
||||
<*> m .:? "filter" .!= M.empty
|
||||
<*> m .:? "char_filter" .!= M.empty
|
||||
|
||||
newtype Tokenizer =
|
||||
Tokenizer Text
|
||||
@ -37,18 +41,57 @@ newtype Tokenizer =
|
||||
-- | A custom analyzer: an optional tokenizer plus the token filters and
-- character filters it refers to.
data AnalyzerDefinition = AnalyzerDefinition
  { analyzerDefinitionTokenizer  :: Maybe Tokenizer
    -- ^ tokenizer to use, if any
  , analyzerDefinitionFilter     :: [TokenFilter]
    -- ^ token filters, serialised under the @filter@ key
  , analyzerDefinitionCharFilter :: [CharFilter]
    -- ^ character filters, serialised under the @char_filter@ key
  } deriving (Eq, Show)
|
||||
|
||||
-- | Encodes an 'AnalyzerDefinition' as a JSON object.
--
-- The @tokenizer@ key is emitted only when a tokenizer is present;
-- @filter@ and @char_filter@ are always emitted, even when empty.
instance ToJSON AnalyzerDefinition where
  toJSON (AnalyzerDefinition tokenizer tokenFilter charFilter) =
    object $ catMaybes
      [ fmap ("tokenizer" .=) tokenizer
      , Just $ "filter" .= tokenFilter
      , Just $ "char_filter" .= charFilter
      ]
|
||||
|
||||
-- | Decodes an 'AnalyzerDefinition' from a JSON object.
--
-- All three keys are optional: a missing @tokenizer@ becomes 'Nothing',
-- and missing @filter@ / @char_filter@ keys default to the empty list.
instance FromJSON AnalyzerDefinition where
  parseJSON = withObject "AnalyzerDefinition" parseDefinition
    where
      parseDefinition o = do
        tokenizer <- o .:? "tokenizer"
        tokenFilters <- o .:? "filter" .!= []
        charFilters <- o .:? "char_filter" .!= []
        pure (AnalyzerDefinition tokenizer tokenFilters charFilters)
|
||||
|
||||
-- | A character filter preprocesses the stream of characters before it
-- is handed to the tokenizer.
data CharFilterDefinition
  = CharFilterDefinitionMapping (M.Map Text Text)
    -- ^ replacement rules, keyed by the text to be replaced
  | CharFilterDefinitionPatternReplace
      { charFilterDefinitionPatternReplacePattern     :: Text
      , charFilterDefinitionPatternReplaceReplacement :: Text
      , charFilterDefinitionPatternReplaceFlags       :: Maybe Text
      }
  deriving (Eq, Show)
|
||||
|
||||
-- | Encodes a character filter definition as a JSON object tagged by @type@.
--
-- * A mapping filter has type @mapping@ and lists each map entry as a
--   @key => value@ string under @mappings@, in ascending key order.
-- * A pattern-replace filter has type @pattern_replace@ with @pattern@ and
--   @replacement@; @flags@ is emitted only when present.
instance ToJSON CharFilterDefinition where
  toJSON definition = object $
    case definition of
      CharFilterDefinitionMapping rules ->
        [ "type" .= ("mapping" :: Text)
        , "mappings" .= M.foldrWithKey renderRule [] rules
        ]
      CharFilterDefinitionPatternReplace regex substitute mFlags ->
        [ "type" .= ("pattern_replace" :: Text)
        , "pattern" .= regex
        , "replacement" .= substitute
        ] ++ [ "flags" .= flags | Just flags <- [mFlags] ]
    where
      -- Prepend one rendered rule; folding from the right keeps key order.
      renderRule source target acc = (source <> " => " <> target) : acc
|
||||
|
||||
-- | Decodes a character filter definition, dispatching on the @type@ key.
--
-- * @mapping@: every entry of @mappings@ must look like @key => value@.
--   An entry without a @=>@ separator is rejected.
-- * @pattern_replace@: requires @pattern@ and @replacement@; @flags@ is
--   optional.
--
-- Any other @type@ fails with an \"unrecognized character filter type\"
-- error.
instance FromJSON CharFilterDefinition where
  parseJSON = withObject "CharFilterDefinition" $ \m -> do
    t <- m .: "type"
    case (t :: Text) of
      "mapping" ->
        CharFilterDefinitionMapping . M.fromList
          <$> (m .: "mappings" >>= mapM parseMapping)
      "pattern_replace" ->
        CharFilterDefinitionPatternReplace
          <$> m .: "pattern"
          <*> m .: "replacement"
          <*> m .:? "flags"
      _ -> fail ("unrecognized character filter type: " ++ T.unpack t)
    where
      -- Split on the first "=>" only, so a value that itself contains
      -- "=>" is kept intact. 'T.breakOn' returns an empty remainder when
      -- the separator is absent, which is how malformed rules are detected.
      parseMapping kv
        | T.null rest = fail "mapping is not of the format key => value"
        | otherwise = pure (T.strip key, T.strip (T.drop 2 rest))
        where
          (key, rest) = T.breakOn "=>" kv
|
||||
|
||||
newtype TokenizerDefinition =
|
||||
TokenizerDefinitionNgram Ngram
|
||||
@ -112,6 +155,8 @@ data TokenFilterDefinition
|
||||
| TokenFilterDefinitionReverse
|
||||
| TokenFilterDefinitionSnowball Language
|
||||
| TokenFilterDefinitionShingle Shingle
|
||||
| TokenFilterDefinitionStemmer Language
|
||||
| TokenFilterDefinitionStop (Either Language [StopWord])
|
||||
deriving (Eq, Show)
|
||||
|
||||
instance ToJSON TokenFilterDefinition where
|
||||
@ -143,6 +188,16 @@ instance ToJSON TokenFilterDefinition where
|
||||
, "token_separator" .= shingleTokenSeparator s
|
||||
, "filler_token" .= shingleFillerToken s
|
||||
]
|
||||
TokenFilterDefinitionStemmer lang -> object
|
||||
[ "type" .= ("stemmer" :: Text)
|
||||
, "language" .= languageToText lang
|
||||
]
|
||||
TokenFilterDefinitionStop stop -> object
|
||||
[ "type" .= ("stop" :: Text)
|
||||
, "stopwords" .= case stop of
|
||||
Left lang -> String $ "_" <> languageToText lang <> "_"
|
||||
Right stops -> toJSON stops
|
||||
]
|
||||
|
||||
instance FromJSON TokenFilterDefinition where
|
||||
parseJSON = withObject "TokenFilterDefinition" $ \m -> do
|
||||
@ -163,6 +218,19 @@ instance FromJSON TokenFilterDefinition where
|
||||
<*> (fmap.fmap) unStringlyTypedBool (m .:? "output_unigrams_if_no_shingles") .!= False
|
||||
<*> m .:? "token_separator" .!= " "
|
||||
<*> m .:? "filler_token" .!= "_"
|
||||
"stemmer" -> TokenFilterDefinitionStemmer
|
||||
<$> m .: "language"
|
||||
"stop" -> do
|
||||
stop <- m .: "stopwords"
|
||||
stop' <- case stop of
|
||||
String lang ->
|
||||
fmap Left
|
||||
. parseJSON
|
||||
. String
|
||||
. T.drop 1
|
||||
. T.dropEnd 1 $ lang
|
||||
_ -> Right <$> parseJSON stop
|
||||
return (TokenFilterDefinitionStop stop')
|
||||
_ -> fail ("unrecognized token filter type: " ++ T.unpack t)
|
||||
|
||||
-- | The set of languages that can be passed to various analyzers,
|
||||
|
@ -231,4 +231,9 @@ instance FromJSON MS where
|
||||
parse n = fromInteger (truncate n * 1000)
|
||||
|
||||
-- | Name of a token filter, as listed in an analyzer definition.
-- Its JSON instances are derived from the wrapped 'Text'.
newtype TokenFilter =
  TokenFilter Text
  deriving (Eq, Show, FromJSON, ToJSON)
|
||||
|
||||
-- | Name of a character filter, as listed in an analyzer definition.
-- Its JSON instances are derived from the wrapped 'Text'.
newtype CharFilter = CharFilter Text
  deriving (Eq, Show, FromJSON, ToJSON)
|
||||
|
@ -262,4 +262,9 @@ instance FromJSON DirectGenerators where
|
||||
<*> o .:? "post_filter"
|
||||
|
||||
-- | Builds a 'DirectGenerators' for the given field using
-- 'DirectGeneratorSuggestModeMissing' as the suggest mode and 'Nothing'
-- for every other argument.
mkDirectGenerators :: FieldName -> DirectGenerators
mkDirectGenerators fn =
  DirectGenerators
    fn
    Nothing
    DirectGeneratorSuggestModeMissing
    Nothing Nothing Nothing Nothing Nothing Nothing Nothing Nothing
|
||||
|
@ -224,6 +224,7 @@ module Database.V5.Bloodhound.Types
|
||||
, Analyzer(..)
|
||||
, Tokenizer(..)
|
||||
, TokenFilter(..)
|
||||
, CharFilter(..)
|
||||
, MaxExpansions(..)
|
||||
, Lenient(..)
|
||||
, MatchQueryType(..)
|
||||
@ -406,6 +407,7 @@ module Database.V5.Bloodhound.Types
|
||||
, AnalyzerDefinition(..)
|
||||
, TokenizerDefinition(..)
|
||||
, TokenFilterDefinition(..)
|
||||
, CharFilterDefinition(..)
|
||||
, Ngram(..)
|
||||
, TokenChar(..)
|
||||
, Shingle(..)
|
||||
|
@ -435,8 +435,25 @@ makeArbitrary ''Language
|
||||
instance Arbitrary Language where arbitrary = arbitraryLanguage
|
||||
makeArbitrary ''Shingle
|
||||
instance Arbitrary Shingle where arbitrary = arbitraryShingle
|
||||
|
||||
makeArbitrary ''CharFilter
|
||||
instance Arbitrary CharFilter where arbitrary = arbitraryCharFilter
|
||||
makeArbitrary ''AnalyzerDefinition
|
||||
instance Arbitrary AnalyzerDefinition where arbitrary = arbitraryAnalyzerDefinition
|
||||
|
||||
-- | Generator for 'CharFilterDefinition'.
--
-- Mapping rules are serialised as @key => value@ strings, so generated
-- keys and values are sanitised to survive a JSON round trip: the
-- characters @=@ and @>@ are removed (which guarantees no @=>@ separator
-- can remain or be re-created), and surrounding whitespace is trimmed
-- afterwards.
--
-- TODO: a purpose-built generator would avoid post-processing arbitrary
-- text.
instance Arbitrary CharFilterDefinition where
  arbitrary =
    oneof [ CharFilterDefinitionMapping . chomp <$> arbitrary
          , CharFilterDefinitionPatternReplace
              <$> arbitrary <*> arbitrary <*> arbitrary
          ]
    where
      chomp = M.map sanitize . M.mapKeys sanitize
      -- Filter first, then strip: removing characters can expose new
      -- leading or trailing whitespace.
      sanitize = T.strip . T.filter (`notElem` ("=>" :: String))
|
||||
|
||||
makeArbitrary ''Analysis
|
||||
instance Arbitrary Analysis where arbitrary = arbitraryAnalysis
|
||||
makeArbitrary ''Tokenizer
|
||||
@ -533,4 +550,4 @@ instance Arbitrary UpdatableIndexSetting' where
|
||||
NE.fromList . L.nubBy sameAttrName . NE.toList
|
||||
sameAttrName a b =
|
||||
nodeAttrFilterName a == nodeAttrFilterName b
|
||||
-- shrink (UpdatableIndexSetting' x) = map UpdatableIndexSetting' (shrink x)
|
||||
shrink (UpdatableIndexSetting' x) = map UpdatableIndexSetting' (shrink x)
|
||||
|
@ -99,6 +99,9 @@ spec = do
|
||||
, "ex_filter_shingle"
|
||||
]
|
||||
)
|
||||
(map CharFilter
|
||||
["html_strip", "ex_mapping", "ex_pattern_replace"]
|
||||
)
|
||||
)
|
||||
)
|
||||
(M.singleton "ex_tokenizer"
|
||||
@ -113,7 +116,18 @@ spec = do
|
||||
, ("ex_filter_reverse",TokenFilterDefinitionReverse)
|
||||
, ("ex_filter_snowball",TokenFilterDefinitionSnowball English)
|
||||
, ("ex_filter_shingle",TokenFilterDefinitionShingle (Shingle 3 3 True False " " "_"))
|
||||
]
|
||||
, ("ex_filter_stemmer",TokenFilterDefinitionStemmer German)
|
||||
, ("ex_filter_stop1", TokenFilterDefinitionStop (Left French))
|
||||
, ("ex_filter_stop2",
|
||||
TokenFilterDefinitionStop
|
||||
(Right
|
||||
$ map StopWord ["a", "is", "the"]))
|
||||
]
|
||||
)
|
||||
(M.fromList
|
||||
[ ("ex_mapping", CharFilterDefinitionMapping (M.singleton "١" "1"))
|
||||
, ("ex_pattern_replace", CharFilterDefinitionPatternReplace "(\\d+)-(?=\\d)" "$1_" Nothing)
|
||||
]
|
||||
)
|
||||
updates = [AnalysisSetting analysis]
|
||||
createResp <- createIndexWith (updates ++ [NumberOfReplicas (ReplicaCount 0)]) 1 testIndex
|
||||
|
Loading…
Reference in New Issue
Block a user