diff --git a/changelog.md b/changelog.md
index ef62ca0..9849a64 100644
--- a/changelog.md
+++ b/changelog.md
@@ -1,8 +1,10 @@
 0.16.0.0
 ========
 - @bitemyapp
-  - Reorganized modules internally, ripped out Generic,
-    rewrote part of the test suite
+  - Reorganized V5 modules internally, ripped out Generic,
+    rewrote part of the test suite.
+- @mxavier
+  - Reorganized V1 modules, mirroring the work done on V5.
 - @andrewthad
   - Added support for autogenerated elasticsearch ids in the bulk API
   - Added support for token filters
@@ -11,6 +13,12 @@
   - Added support for scripts fields and function score queries
 - @bermanjosh
   - Added support for direct generators
+- @rvl
+  - Added [character filters][] to the `AnalyzerDefinition` type.
+  - Added the "stemmer" and "stop" [token filters][] to `TokenFilterDefinition`.
+
+[Character Filters]: https://www.elastic.co/guide/en/elasticsearch/reference/5.6/analysis-charfilters.html
+[Token Filters]: https://www.elastic.co/guide/en/elasticsearch/reference/5.6/analysis-tokenfilters.html
 
 0.15.0.2
 ========
diff --git a/src/Database/V5/Bloodhound/Internal/Analysis.hs b/src/Database/V5/Bloodhound/Internal/Analysis.hs
index 2b363ec..29961ee 100644
--- a/src/Database/V5/Bloodhound/Internal/Analysis.hs
+++ b/src/Database/V5/Bloodhound/Internal/Analysis.hs
@@ -6,6 +6,7 @@ module Database.V5.Bloodhound.Internal.Analysis where
 import           Bloodhound.Import
 
 import qualified Data.Map.Strict as M
+import           Data.String
 import qualified Data.Text       as T
 
 import           Database.V5.Bloodhound.Internal.Newtypes
@@ -15,13 +16,15 @@ data Analysis = Analysis
   { analysisAnalyzer    :: M.Map Text AnalyzerDefinition
   , analysisTokenizer   :: M.Map Text TokenizerDefinition
   , analysisTokenFilter :: M.Map Text TokenFilterDefinition
+  , analysisCharFilter  :: M.Map Text CharFilterDefinition
   } deriving (Eq, Show)
 
 instance ToJSON Analysis where
-  toJSON (Analysis analyzer tokenizer tokenFilter) = object
+  toJSON (Analysis analyzer tokenizer tokenFilter charFilter) = object
     [ "analyzer" .= analyzer
     , "tokenizer" .= tokenizer
     , "filter" .= tokenFilter
+    , "char_filter" .= charFilter
     ]
 
 instance FromJSON Analysis where
@@ -29,6 +32,7 @@ instance FromJSON Analysis where
     <$> m .: "analyzer"
     <*> m .:? "tokenizer" .!= M.empty
     <*> m .:? "filter" .!= M.empty
+    <*> m .:? "char_filter" .!= M.empty
 
 newtype Tokenizer =
   Tokenizer Text
@@ -37,18 +41,66 @@ newtype Tokenizer =
 data AnalyzerDefinition = AnalyzerDefinition
   { analyzerDefinitionTokenizer  :: Maybe Tokenizer
   , analyzerDefinitionFilter     :: [TokenFilter]
+  , analyzerDefinitionCharFilter :: [CharFilter]
   } deriving (Eq,Show)
 
 instance ToJSON AnalyzerDefinition where
-  toJSON (AnalyzerDefinition tokenizer tokenFilter) = object $ catMaybes
+  toJSON (AnalyzerDefinition tokenizer tokenFilter charFilter) =
+    object $ catMaybes
     [ fmap ("tokenizer" .=) tokenizer
     , Just $ "filter" .= tokenFilter
+    , Just $ "char_filter" .= charFilter
     ]
 
 instance FromJSON AnalyzerDefinition where
   parseJSON = withObject "AnalyzerDefinition" $ \m -> AnalyzerDefinition
     <$> m .:? "tokenizer"
     <*> m .:? "filter" .!= []
+    <*> m .:? "char_filter" .!= []
+
+-- | Character filters are used to preprocess the stream of characters
+-- before it is passed to the tokenizer.
+--
+-- In JSON, 'CharFilterDefinitionMapping' has @"type": "mapping"@ and
+-- renders every map entry as a @"key => value"@ string, while
+-- 'CharFilterDefinitionPatternReplace' has @"type": "pattern_replace"@
+-- and only emits @"flags"@ when it is present.
+data CharFilterDefinition
+  = CharFilterDefinitionMapping (M.Map Text Text)
+  | CharFilterDefinitionPatternReplace
+    { charFilterDefinitionPatternReplacePattern     :: Text
+    , charFilterDefinitionPatternReplaceReplacement :: Text
+    , charFilterDefinitionPatternReplaceFlags       :: Maybe Text
+    }
+  deriving (Eq, Show)
+
+instance ToJSON CharFilterDefinition where
+  toJSON (CharFilterDefinitionMapping ms) = object
+    [ "type" .= ("mapping" :: Text)
+    , "mappings" .= [a <> " => " <> b | (a, b) <- M.toList ms] ]
+  toJSON (CharFilterDefinitionPatternReplace pat repl flags) = object $
+    [ "type" .= ("pattern_replace" :: Text)
+    , "pattern" .= pat
+    , "replacement" .= repl
+    ] ++ maybe [] (\f -> ["flags" .= f]) flags
+
+instance FromJSON CharFilterDefinition where
+  parseJSON = withObject "CharFilterDefinition" $ \m -> do
+    t <- m .: "type"
+    case (t :: Text) of
+      "mapping" -> CharFilterDefinitionMapping . M.fromList <$> ms
+        where
+          ms = m .: "mappings" >>= mapM parseMapping
+          -- Split on the first "=>" only, so that a value which itself
+          -- contains "=>" is kept intact, and reject entries that have
+          -- no separator at all.
+          parseMapping kv = case T.breakOn "=>" kv of
+            (k, rest) | Just v <- T.stripPrefix "=>" rest ->
+              pure (T.strip k, T.strip v)
+            _ -> fail "mapping is not of the format key => value"
+      "pattern_replace" -> CharFilterDefinitionPatternReplace
+        <$> m .: "pattern" <*> m .: "replacement" <*> m .:? "flags"
+      _ -> fail ("unrecognized character filter type: " ++ T.unpack t)
 
 newtype TokenizerDefinition
   = TokenizerDefinitionNgram Ngram
@@ -112,6 +164,8 @@ data TokenFilterDefinition
   | TokenFilterDefinitionReverse
   | TokenFilterDefinitionSnowball Language
   | TokenFilterDefinitionShingle Shingle
+  | TokenFilterDefinitionStemmer Language
+  | TokenFilterDefinitionStop (Either Language [StopWord])
   deriving (Eq, Show)
 
 instance ToJSON TokenFilterDefinition where
@@ -143,6 +197,16 @@ instance ToJSON TokenFilterDefinition where
       , "token_separator" .= shingleTokenSeparator s
       , "filler_token" .= shingleFillerToken s
       ]
+    TokenFilterDefinitionStemmer lang -> object
+      [ "type" .= ("stemmer" :: Text)
+      , "language" .= languageToText lang
+      ]
+    TokenFilterDefinitionStop stop -> object
+      [ "type" .= ("stop" :: Text)
+      , "stopwords" .= case stop of
+          Left lang -> String $ "_" <> languageToText lang <> "_"
+          Right stops -> toJSON stops
+      ]
 
 instance FromJSON TokenFilterDefinition where
   parseJSON = withObject "TokenFilterDefinition" $ \m -> do
@@ -163,6 +227,20 @@ instance FromJSON TokenFilterDefinition where
         <*> (fmap.fmap) unStringlyTypedBool (m .:? "output_unigrams_if_no_shingles") .!= False
         <*> m .:? "token_separator" .!= " "
         <*> m .:? "filler_token" .!= "_"
+      "stemmer" -> TokenFilterDefinitionStemmer
+        <$> m .: "language"
+      "stop" -> do
+        stop <- m .: "stopwords"
+        stop' <- case stop of
+          -- A predefined list is encoded as the language name wrapped in
+          -- underscores; any other string is rejected, not truncated.
+          String lang ->
+            case T.stripPrefix "_" lang >>= T.stripSuffix "_" of
+              Just name -> Left <$> parseJSON (String name)
+              Nothing -> fail
+                ("stop filter language is not of the format _language_: " ++ T.unpack lang)
+          _ -> Right <$> parseJSON stop
+        return (TokenFilterDefinitionStop stop')
       _ -> fail ("unrecognized token filter type: " ++ T.unpack t)
 
 -- | The set of languages that can be passed to various analyzers,
diff --git a/src/Database/V5/Bloodhound/Internal/Newtypes.hs b/src/Database/V5/Bloodhound/Internal/Newtypes.hs
index d58690e..95669c8 100644
--- a/src/Database/V5/Bloodhound/Internal/Newtypes.hs
+++ b/src/Database/V5/Bloodhound/Internal/Newtypes.hs
@@ -231,4 +231,9 @@ instance FromJSON MS where
       parse n = fromInteger (truncate n * 1000)
 
 newtype TokenFilter =
-  TokenFilter Text deriving (Eq, Show, FromJSON, ToJSON)
+  TokenFilter Text
+  deriving (Eq, Show, FromJSON, ToJSON)
+
+newtype CharFilter =
+  CharFilter Text
+  deriving (Eq, Show, FromJSON, ToJSON)
diff --git a/src/Database/V5/Bloodhound/Internal/Suggest.hs b/src/Database/V5/Bloodhound/Internal/Suggest.hs
index b002091..43beccc 100644
--- a/src/Database/V5/Bloodhound/Internal/Suggest.hs
+++ b/src/Database/V5/Bloodhound/Internal/Suggest.hs
@@ -262,4 +262,9 @@ instance FromJSON DirectGenerators where
       <*> o .:? "post_filter"
 
 mkDirectGenerators :: FieldName -> DirectGenerators
-mkDirectGenerators fn = DirectGenerators fn Nothing DirectGeneratorSuggestModeMissing Nothing Nothing Nothing Nothing Nothing Nothing Nothing Nothing
+mkDirectGenerators fn =
+  DirectGenerators
+  fn
+  Nothing
+  DirectGeneratorSuggestModeMissing
+  Nothing Nothing Nothing Nothing Nothing Nothing Nothing Nothing
diff --git a/src/Database/V5/Bloodhound/Types.hs b/src/Database/V5/Bloodhound/Types.hs
index cd48760..403f771 100644
--- a/src/Database/V5/Bloodhound/Types.hs
+++ b/src/Database/V5/Bloodhound/Types.hs
@@ -224,6 +224,7 @@ module Database.V5.Bloodhound.Types
        , Analyzer(..)
        , Tokenizer(..)
        , TokenFilter(..)
+       , CharFilter(..)
        , MaxExpansions(..)
        , Lenient(..)
        , MatchQueryType(..)
@@ -406,6 +407,7 @@ module Database.V5.Bloodhound.Types
        , AnalyzerDefinition(..)
        , TokenizerDefinition(..)
        , TokenFilterDefinition(..)
+       , CharFilterDefinition(..)
        , Ngram(..)
        , TokenChar(..)
        , Shingle(..)
diff --git a/tests/V5/Test/Generators.hs b/tests/V5/Test/Generators.hs
index ea758af..be15afd 100644
--- a/tests/V5/Test/Generators.hs
+++ b/tests/V5/Test/Generators.hs
@@ -435,8 +435,30 @@ makeArbitrary ''Language
 instance Arbitrary Language where arbitrary = arbitraryLanguage
 makeArbitrary ''Shingle
 instance Arbitrary Shingle where arbitrary = arbitraryShingle
+
+makeArbitrary ''CharFilter
+instance Arbitrary CharFilter where arbitrary = arbitraryCharFilter
 makeArbitrary ''AnalyzerDefinition
 instance Arbitrary AnalyzerDefinition where arbitrary = arbitraryAnalyzerDefinition
+
+-- TODO: This should have a proper generator that doesn't
+-- create garbage that has to be filtered out.
+instance Arbitrary CharFilterDefinition where
+  arbitrary =
+    oneof [ CharFilterDefinitionMapping
+            . chomp <$> arbitrary
+          , CharFilterDefinitionPatternReplace
+            <$> arbitrary <*> arbitrary <*> arbitrary
+          ]
+    where chomp =
+            M.map sanitize
+            . M.mapKeys sanitize
+          -- Mapping entries are serialised as "key => value" and both
+          -- sides are trimmed when decoding. Removing every '>' rules
+          -- out a "=>" separator on either side, and stripping last
+          -- guarantees no surrounding whitespace is left behind.
+          sanitize = T.strip . T.filter (/= '>')
+
 makeArbitrary ''Analysis
 instance Arbitrary Analysis where arbitrary = arbitraryAnalysis
 makeArbitrary ''Tokenizer
@@ -533,4 +555,4 @@ instance Arbitrary UpdatableIndexSetting' where
             NE.fromList . L.nubBy sameAttrName . NE.toList
           sameAttrName a b =
             nodeAttrFilterName a == nodeAttrFilterName b
-  -- shrink (UpdatableIndexSetting' x) = map UpdatableIndexSetting' (shrink x)
+  shrink (UpdatableIndexSetting' x) = map UpdatableIndexSetting' (shrink x)
diff --git a/tests/V5/Test/Indices.hs b/tests/V5/Test/Indices.hs
index 4422a8d..676e013 100644
--- a/tests/V5/Test/Indices.hs
+++ b/tests/V5/Test/Indices.hs
@@ -99,6 +99,9 @@ spec = do
                   , "ex_filter_shingle"
                   ]
                 )
+                (map CharFilter
+                  ["html_strip", "ex_mapping", "ex_pattern_replace"]
+                )
               )
             )
             (M.singleton "ex_tokenizer"
@@ -113,7 +116,18 @@ spec = do
               , ("ex_filter_reverse",TokenFilterDefinitionReverse)
               , ("ex_filter_snowball",TokenFilterDefinitionSnowball English)
               , ("ex_filter_shingle",TokenFilterDefinitionShingle (Shingle 3 3 True False " " "_"))
-              ]
+              , ("ex_filter_stemmer",TokenFilterDefinitionStemmer German)
+              , ("ex_filter_stop1", TokenFilterDefinitionStop (Left French))
+              , ("ex_filter_stop2",
+                 TokenFilterDefinitionStop
+                 (Right
+                 $ map StopWord ["a", "is", "the"]))
+              ]
+            )
+            (M.fromList
+              [ ("ex_mapping", CharFilterDefinitionMapping (M.singleton "١" "1"))
+              , ("ex_pattern_replace", CharFilterDefinitionPatternReplace "(\\d+)-(?=\\d)" "$1_" Nothing)
+              ]
             )
         updates = [AnalysisSetting analysis]
         createResp <- createIndexWith (updates ++ [NumberOfReplicas (ReplicaCount 0)]) 1 testIndex