Initial commit

This commit is contained in:
Rúnar 2021-03-15 22:18:22 -04:00
commit 0f4f543f6b
3 changed files with 288 additions and 0 deletions

235
Fzf.hs Normal file
View File

@ -0,0 +1,235 @@
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE OverloadedLists #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE ViewPatterns #-}
import Control.Monad (join)
import Data.Array
( Array,
(!),
)
import qualified Data.Array as Array
import Data.Char (isAlphaNum, isLower, isUpper, toLower)
import Data.Foldable (maximumBy, toList, foldl')
import Data.Function (on)
import Data.Sequence
( Seq (..),
ViewL (..),
ViewR (..),
viewl,
viewr,
(<|)
)
import Data.List (sortOn)
import qualified Data.Sequence as Seq
import GHC.Generics (Generic)
data ResultSegment = Gap (Seq Char) | Match (Seq Char)
deriving (Eq, Ord, Show, Generic)
newtype Result = Result { segments :: Seq ResultSegment }
deriving (Eq, Ord, Show, Generic)
match :: Char -> Result
match a = Result [Match [a]]
gap :: Char -> Result
gap a = Result [Gap [a]]
reverseResult :: Result -> Result
reverseResult (Result xs) = Result . Seq.reverse $ reverseSegment <$> xs
reverseSegment :: ResultSegment -> ResultSegment
reverseSegment (Gap xs) = Gap (Seq.reverse xs)
reverseSegment (Match xs) = Match (Seq.reverse xs)
instance Monoid Result where
mempty = Result []
instance Semigroup Result where
Result Empty <> as = as
as <> Result Empty = as
Result (viewr -> h :> Gap []) <> as = Result h <> as
as <> Result (viewl -> Gap [] :< t) = as <> Result t
Result (viewr -> h :> Match []) <> as = Result h <> as
as <> Result (viewl -> Match [] :< t) = as <> Result t
Result (viewr -> i :> Gap l) <> Result (viewl -> Gap h :< t) =
Result (i <> [Gap (l <> h)] <> t)
Result (viewr -> i :> Match l) <> Result (viewl -> Match h :< t) =
Result (i <> [Match (l <> h)] <> t)
Result a <> Result b = Result (a <> b)
mergeResults :: Result -> Result -> Result
mergeResults as bs = merge as bs
where
drop' :: Int -> Result -> Result
drop' n m | n < 1 = m
drop' n (Result (viewl -> Gap g :< t)) =
Result [Gap (Seq.drop n g)] <> drop' (n - Seq.length g) (Result t)
drop' n (Result (viewl -> Match g :< t)) =
Result [Match (Seq.drop n g)] <> drop' (n - Seq.length g) (Result t)
merge :: Result -> Result -> Result
merge (Result Seq.Empty) ys = ys
merge xs (Result Seq.Empty) = xs
merge (Result xs) (Result ys ) = case (viewl xs, viewl ys) of
(Gap g :< t, Gap g' :< t')
| Seq.length g <= Seq.length g' -> Result [Gap g]
<> merge (Result t) (drop' (Seq.length g) (Result ys))
| otherwise -> Result [Gap g']
<> merge (drop' (Seq.length g') (Result xs)) (Result t')
(Match m :< t, Match m' :< t')
| Seq.length m >= Seq.length m' -> Result [Match m]
<> merge (Result t) (drop' (Seq.length m) (Result ys))
| otherwise -> Result [Match m']
<> merge (drop' (Seq.length m') (Result xs)) (Result t')
(Gap g :< t, Match m' :< t') ->
Result [Match m'] <> merge (drop' (Seq.length m') (Result xs)) (Result t')
(Match m :< t, Gap g' :< t') ->
Result [Match m] <> merge (Result t) (drop' (Seq.length m) (Result ys))
instance Semigroup Alignment where
Alignment n r <> Alignment m s = Alignment (n + m) (mergeResults r s)
instance Monoid Alignment where
mempty = Alignment 0 mempty
type Score = Int
data Alignment
= Alignment { score :: Score, result :: Result }
deriving (Eq, Ord, Show, Generic)
-- The base score given to a matching character
defaultMatchScore :: Int
defaultMatchScore = 16
-- The base score given to a mismatched character
defaultMismatchScore :: Int
defaultMismatchScore = 0
-- Bonus points given to characters matching at the beginning of words
defaultBoundaryBonus :: Int
defaultBoundaryBonus = defaultMatchScore `div` 2
-- Bonus points given to characters matching a hump of a CamelCase word.
-- We subtract a point from the word boundary score, since a word boundary will
-- incur a gap penalty.
defaultCamelCaseBonus :: Int
defaultCamelCaseBonus = defaultBoundaryBonus - 1
-- Double any bonus points for matching the first pattern of the character.
-- This way we strongly prefer starting the match at the beginning of a word.
defaultFirstCharBonusMultiplier :: Int
defaultFirstCharBonusMultiplier = 2
-- We prefer consecutive runs of matched characters in the pattern, so we
-- impose a penalty for any gaps, proportional to the size of the gap.
defaultGapPenalty :: Int -> Int
defaultGapPenalty 1 = 3
defaultGapPenalty n = max 0 (3 + n)
-- We give a bonus to consecutive matching characters.
-- A number about the same as the `boundaryBonus` will strongly prefer
-- runs of consecutive characters vs finding acronyms.
defaultConsecutiveBonus :: Int
defaultConsecutiveBonus = defaultGapPenalty 8
gaps :: String -> Result
gaps s = Result [Gap . Seq.fromList $ reverse s]
highlight :: Alignment -> String
highlight (Alignment s (Result segments)) =
foldMap prettySegment segments <> "\n" <> foldMap showGaps segments
where
prettySegment (Gap xs) = toList xs
prettySegment (Match xs) = toList xs
showGaps (Gap xs) = replicate (length xs) ' '
showGaps (Match xs) = replicate (length xs) '*'
bestMatch :: String -> String -> Maybe Alignment
bestMatch = bestMatch' defaultMatchScore
defaultMismatchScore
defaultGapPenalty
defaultBoundaryBonus
defaultCamelCaseBonus
defaultFirstCharBonusMultiplier
defaultConsecutiveBonus
fuzzyFind :: [String] -> [String] -> [Alignment]
fuzzyFind query strings =
sortOn score
$ strings
>>= (\s -> toList
$ foldl' (\a q -> (<>) <$> a <*> bestMatch q s) (Just mempty) query
)
bestMatch'
:: Int
-> Int
-> (Int -> Int)
-> Int
-> Int
-> Int
-> Int
-> String
-> String
-> Maybe Alignment
bestMatch' matchScore mismatchScore gapPenalty boundaryBonus camelCaseBonus firstCharBonusMultiplier consecutiveBonus query str
= Alignment (totalScore m nx) . reverseResult <$> traceback
where
totalScore i j = if i > m then 0 else hs ! (i, j) + bonuses ! (i, j)
table = unlines
[ unwords
$ (if y > 0 then show $ b' ! y else " ")
: [ show (totalScore x y) | x <- [0 .. m] ]
| y <- [0 .. n]
]
similarity a b =
if a == b || a == toLower b then matchScore else mismatchScore
traceback = (gaps (drop nx str) <>) <$> go (m, nx)
go (0, j) = pure $ gaps (take j str)
go (i, 0) = Nothing
go (i, j) = if similarity (a' ! i) (b' ! j) > 0
then (match (b' ! j) <>) <$> go (i - 1, j - 1)
else (gap (b' ! j) <>) <$> go (i, j - 1)
nx = localMax m n
localMax m n = maximumBy
(\b d -> compare (totalScore m b) (totalScore m d))
[ j | j <- [1 .. n] ]
m = length query
n = length str
a' = Array.listArray (1, m) query
b' = Array.listArray (1, n) str
hs = Array.listArray bounds [ h i j | (i, j) <- Array.range bounds ]
bonuses = Array.listArray bounds [ bonus i j | (i, j) <- Array.range bounds ]
bounds = ((0, 0), (m, n))
bonus 0 j = 0
bonus i 0 = 0
bonus i j = if similarity (a' ! i) (b' ! j) > 0
then multiplier * (boundary + camel + consecutive)
else 0
where
boundary =
if j < 2 || isAlphaNum (b' ! j) && not (isAlphaNum (b' ! (j - 1)))
then boundaryBonus
else 0
camel = if j > 1 && isLower (b' ! (j - 1)) && isUpper (b' ! j)
then camelCaseBonus
else 0
multiplier = if i == 1 then firstCharBonusMultiplier else 1
consecutive =
let
similar = i > 0 && j > 0 && similarity (a' ! i) (b' ! j) > 0
afterMatch =
i > 1 && j > 1 && similarity (a' ! (i - 1)) (b' ! (j - 1)) > 0
beforeMatch =
i < m && j < n && similarity (a' ! (i + 1)) (b' ! (j + 1)) > 0
in
if similar && (afterMatch || beforeMatch) then consecutiveBonus else 0
h 0 _ = 0
h _ 0 = 0
h i j = scoreMatch `max` scoreGap `max` 0
where
scoreMatch =
hs ! (i - 1, j - 1) + similarity (a' ! i) (b' ! j) + bonuses ! (i, j)
scoreGap = maximum [ hs ! (i, j - l) - gapPenalty l | l <- [1 .. j] ]

7
LICENSE Normal file
View File

@ -0,0 +1,7 @@
Copyright 2021 Unison Computing
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

46
README.markdown Normal file
View File

@ -0,0 +1,46 @@
# fuzzyfind
A package that provides an API for fuzzy text search in Haskell, using a modified version of the [Smith-Waterman algorithm](https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm). The search is intended to behave similarly to the excellent [`fzf` tool by Junegunn Choi](https://github.com/junegunn/fzf).
The core functionality of the library is provided by the `bestMatch` function:
```haskell
bestMatch :: String -> String -> Maybe Alignment
```
Calling `bestMatch query string` will return `Nothing` if `query` is not a subsequence of `string`. Otherwise, it will return the "best" way to line the characters in `query` up with the characters in `string`. Lower-case characters in the query are assumed to be case-insensitive, and upper-case characters are assumed to be case-sensitive.
For example:
```
> bestMatch "ff" "FuzzyFind"
Just (Alignment {score = 25, result = Result {[Match "F", Gap "uzzy", Match "F", Gap "ind"]}})
```
The `score` indicates how "good" the match is. Better matches have higher scores. There's no maximum score (except for the upper limit of the `Int` datatype), but the lowest score is `0`.
A substring from the query will generate a `Match`, and any characters from the input that don't result in a `Match` will generate a `Gap`. Concatenating all the `Match` and `Gap` results should yield the original input string.
Note that the matched characters in the input always occur in the same order as they do in the query pattern.
The algorithm prefers (and will generate higher scores for) the following kinds of matches:
1. Contiguous characters from the query string. For example, `bestMatch "pp"` will find the last two `p`s in `"pickled pepper"`.
2. Characters at the beginnings of words. For example, `bestMatch "pp"` will find the first two `P`s in `"Peter Piper"`.
3. A character in the input that matches the first character of the query pattern is strongly preferred. For example, `bestMatch "mn" "Bat Man"` will score higher than `bestMatch "mn" "Batman"`.
All else being equal, matches that occur later in the input string are preferred.
The `fuzzyFind` function finds input strings that match all the given input patterns. For each input that matches it returns one `Alignment`. The output is sorted by `score`, ascending.
```haskell
fuzzyFind :: [String] -> [String] -> [Alignment]
```
For example:
```
> fuzzyFind ["dad", "mac", "dam"] ["tinned macadamia"]
[Alignment {score = 296, result = Result [Gap "tinne", Match "d", Gap " ", Match "macadam", Gap "ia"]}]
```