add: fix a bug impairing similar txn matching

In some cases (when the same letter pair was repeated) it was not picking the post similar past transaction. This was due to the similarity algorithm using the order-dependent Data.List.intersect, when it should use Set.intersection. Also added a reference for the algorithm, which is known as the Sørensen–Dice coefficient.
2024-11-08 15:14:49 +03:00 · 2015-08-28 13:54:51 -07:00 · 2015-08-28 13:54:51 -07:00 · a73c455570
commit a73c455570
parent 74512814ec
1 changed files with 19 additions and 12 deletions
--- a/hledger/Hledger/Cli/Add.hs
+++ b/hledger/Hledger/Cli/Add.hs
@ -15,6 +15,7 @@ import Control.Monad
 import Control.Monad.Trans (liftIO)
 import Data.Char (toUpper, toLower)
 import Data.List.Compat
 import qualified Data.Set as S
 import Data.Maybe
 import Data.Time.Calendar (Day)
 import Data.Typeable (Typeable)
@ -384,8 +385,9 @@ capitalize :: String -> String
 capitalize "" = ""
 capitalize (c:cs) = toUpper c : cs
-- Find the most similar and recent transactions matching the given transaction description and report query.
+-- | Find the most similar and recent transactions matching the given
-- Transactions are listed with their "relevancy" score, most relevant first.
+-- transaction description and report query.  Transactions are listed
 -- with their "relevancy" score, most relevant first.
 transactionsSimilarTo :: Journal -> Query -> String -> [(Double,Transaction)]
 transactionsSimilarTo j q desc =
    sortBy compareRelevanceAndRecency
@ -396,27 +398,32 @@ transactionsSimilarTo j q desc =
      ts = filter (q `matchesTransaction`) $ jtxns j
      threshold = 0
-compareDescriptions :: [Char] -> [Char] -> Double
+-- | Return a similarity measure, from 0 to 1, for two transaction
 -- descriptions.  This is like compareStrings, but first strips out
 -- numbers, which often appear in transaction descriptions without
 -- being helpful for checking similarity.
 compareDescriptions :: String -> String -> Double
 compareDescriptions s t = compareStrings s' t'
    where s' = simplify s
          t' = simplify t
          simplify = filter (not . (`elem` "0123456789"))
-- | Return a similarity measure, from 0 to 1, for two strings.
+-- | Return a similarity measure, from 0 to 1, for two strings.  This
-- This is Simon White's letter pairs algorithm from
+-- was based on Simon White's string similarity algorithm
-- http://www.catalysoft.com/articles/StrikeAMatch.html
+-- (http://www.catalysoft.com/articles/StrikeAMatch.html), later found
-- with a modification for short strings.
+-- to be https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient,
 -- modified to handle short strings better.
 compareStrings :: String -> String -> Double
 compareStrings "" "" = 1
 compareStrings (_:[]) "" = 0
 compareStrings "" (_:[]) = 0
 compareStrings (a:[]) (b:[]) = if toUpper a == toUpper b then 1 else 0
-compareStrings s1 s2 = 2.0 * fromIntegral i / fromIntegral u
+compareStrings s1 s2 = 2 * commonpairs / totalpairs
    where
-      i = length $ intersect pairs1 pairs2
+      pairs1      = S.fromList $ wordLetterPairs $ uppercase s1
-      u = length pairs1 + length pairs2
+      pairs2      = S.fromList $ wordLetterPairs $ uppercase s2
-      pairs1 = wordLetterPairs $ uppercase s1
+      commonpairs = fromIntegral $ S.size $ S.intersection pairs1 pairs2
-      pairs2 = wordLetterPairs $ uppercase s2
+      totalpairs  = fromIntegral $ S.size pairs1 + S.size pairs2
 wordLetterPairs = concatMap letterPairs . words