mirror of
https://github.com/simonmichael/hledger.git
synced 2024-11-08 15:14:49 +03:00
add: fix a bug impairing similar txn matching
In some cases (when the same letter pair was repeated) it was not picking the post similar past transaction. This was due to the similarity algorithm using the order-dependent Data.List.intersect, when it should use Set.intersection. Also added a reference for the algorithm, which is known as the Sørensen–Dice coefficient.
This commit is contained in:
parent
74512814ec
commit
a73c455570
@ -15,6 +15,7 @@ import Control.Monad
|
||||
import Control.Monad.Trans (liftIO)
|
||||
import Data.Char (toUpper, toLower)
|
||||
import Data.List.Compat
|
||||
import qualified Data.Set as S
|
||||
import Data.Maybe
|
||||
import Data.Time.Calendar (Day)
|
||||
import Data.Typeable (Typeable)
|
||||
@ -384,8 +385,9 @@ capitalize :: String -> String
|
||||
capitalize "" = ""
|
||||
capitalize (c:cs) = toUpper c : cs
|
||||
|
||||
-- Find the most similar and recent transactions matching the given transaction description and report query.
|
||||
-- Transactions are listed with their "relevancy" score, most relevant first.
|
||||
-- | Find the most similar and recent transactions matching the given
|
||||
-- transaction description and report query. Transactions are listed
|
||||
-- with their "relevancy" score, most relevant first.
|
||||
transactionsSimilarTo :: Journal -> Query -> String -> [(Double,Transaction)]
|
||||
transactionsSimilarTo j q desc =
|
||||
sortBy compareRelevanceAndRecency
|
||||
@ -396,27 +398,32 @@ transactionsSimilarTo j q desc =
|
||||
ts = filter (q `matchesTransaction`) $ jtxns j
|
||||
threshold = 0
|
||||
|
||||
compareDescriptions :: [Char] -> [Char] -> Double
|
||||
-- | Return a similarity measure, from 0 to 1, for two transaction
|
||||
-- descriptions. This is like compareStrings, but first strips out
|
||||
-- numbers, which often appear in transaction descriptions without
|
||||
-- being helpful for checking similarity.
|
||||
compareDescriptions :: String -> String -> Double
|
||||
compareDescriptions s t = compareStrings s' t'
|
||||
where s' = simplify s
|
||||
t' = simplify t
|
||||
simplify = filter (not . (`elem` "0123456789"))
|
||||
|
||||
-- | Return a similarity measure, from 0 to 1, for two strings.
|
||||
-- This is Simon White's letter pairs algorithm from
|
||||
-- http://www.catalysoft.com/articles/StrikeAMatch.html
|
||||
-- with a modification for short strings.
|
||||
-- | Return a similarity measure, from 0 to 1, for two strings. This
|
||||
-- was based on Simon White's string similarity algorithm
|
||||
-- (http://www.catalysoft.com/articles/StrikeAMatch.html), later found
|
||||
-- to be https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient,
|
||||
-- modified to handle short strings better.
|
||||
compareStrings :: String -> String -> Double
|
||||
compareStrings "" "" = 1
|
||||
compareStrings (_:[]) "" = 0
|
||||
compareStrings "" (_:[]) = 0
|
||||
compareStrings (a:[]) (b:[]) = if toUpper a == toUpper b then 1 else 0
|
||||
compareStrings s1 s2 = 2.0 * fromIntegral i / fromIntegral u
|
||||
compareStrings s1 s2 = 2 * commonpairs / totalpairs
|
||||
where
|
||||
i = length $ intersect pairs1 pairs2
|
||||
u = length pairs1 + length pairs2
|
||||
pairs1 = wordLetterPairs $ uppercase s1
|
||||
pairs2 = wordLetterPairs $ uppercase s2
|
||||
pairs1 = S.fromList $ wordLetterPairs $ uppercase s1
|
||||
pairs2 = S.fromList $ wordLetterPairs $ uppercase s2
|
||||
commonpairs = fromIntegral $ S.size $ S.intersection pairs1 pairs2
|
||||
totalpairs = fromIntegral $ S.size pairs1 + S.size pairs2
|
||||
|
||||
wordLetterPairs = concatMap letterPairs . words
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user