mirror of
https://github.com/simonmichael/hledger.git
synced 2024-09-19 10:17:35 +03:00
imp: print: --match makes better choices
Previously, similarity completely outweighed recency, so a slightly-more-similar transaction would always be selected no matter how old it was. Now similarity and recency are more balanced, and it should produce the desired transaction more often. There is also new debug output (at debug level 1) for troubleshooting.
This commit is contained in:
parent
6bf2afe80c
commit
c03d6b1123
@ -3,6 +3,7 @@
|
||||
{-# LANGUAGE Rank2Types #-}
|
||||
{-# LANGUAGE ScopedTypeVariables #-}
|
||||
{-# LANGUAGE TupleSections #-}
|
||||
{-# LANGUAGE RecordWildCards #-}
|
||||
|
||||
{-|
|
||||
|
||||
@ -122,8 +123,8 @@ import Data.Maybe (catMaybes, fromMaybe, mapMaybe, maybeToList)
|
||||
import qualified Data.Set as S
|
||||
import Data.Text (Text)
|
||||
import qualified Data.Text as T
|
||||
import Safe (headMay, headDef, maximumMay, minimumMay)
|
||||
import Data.Time.Calendar (Day, addDays, fromGregorian)
|
||||
import Safe (headMay, headDef, maximumMay, minimumMay, lastDef)
|
||||
import Data.Time.Calendar (Day, addDays, fromGregorian, diffDays)
|
||||
import Data.Time.Clock.POSIX (POSIXTime)
|
||||
import Data.Tree (Tree(..), flatten)
|
||||
import Text.Printf (printf)
|
||||
@ -140,6 +141,9 @@ import Hledger.Data.TransactionModifier
|
||||
import Hledger.Data.Valuation
|
||||
import Hledger.Query
|
||||
import System.FilePath (takeFileName)
|
||||
import Data.Ord (comparing)
|
||||
import Hledger.Data.Dates (nulldate)
|
||||
import Data.List (sort)
|
||||
|
||||
|
||||
-- | A parser of text that runs in some monad, keeping a Journal as state.
|
||||
@ -429,20 +433,38 @@ journalInheritedAccountTags j a =
|
||||
as = a : parentAccountNames a
|
||||
-- PERF: cache in journal ?
|
||||
|
||||
type DateWeightedSimilarityScore = Double
|
||||
type SimilarityScore = Double
|
||||
type Age = Integer
|
||||
|
||||
-- | Find up to N most similar and most recent transactions matching
|
||||
-- the given transaction description and query. Transactions are
|
||||
-- listed with their description's similarity score (see
|
||||
-- compareDescriptions), sorted by highest score and then by date.
|
||||
-- Only transactions with a similarity score greater than a minimum
|
||||
-- threshold (currently 0) are returned.
|
||||
journalTransactionsSimilarTo :: Journal -> Query -> Text -> Int -> [(Double,Transaction)]
|
||||
journalTransactionsSimilarTo Journal{jtxns} q desc n =
|
||||
-- the given transaction description and query and exceeding the given
|
||||
-- description similarity score (0 to 1, see compareDescriptions).
|
||||
-- Returns transactions along with
|
||||
-- their age in days compared to the latest transaction date,
|
||||
-- their description similarity score,
|
||||
-- and a heuristically date-weighted variant of this that favours more recent transactions.
|
||||
journalTransactionsSimilarTo :: Journal -> Text -> Query -> SimilarityScore -> Int
|
||||
-> [(DateWeightedSimilarityScore, Age, SimilarityScore, Transaction)]
|
||||
journalTransactionsSimilarTo Journal{jtxns} desc q similaritythreshold n =
|
||||
take n $
|
||||
sortBy (\(s1,t1) (s2,t2) -> compare (s2,tdate t2) (s1,tdate t1)) $
|
||||
filter ((> threshold).fst)
|
||||
dbg1With (
|
||||
unlines .
|
||||
("up to 30 transactions above description similarity threshold "<>show similaritythreshold<>" ordered by recency-weighted similarity:":) .
|
||||
take 30 .
|
||||
map ( \(w,a,s,Transaction{..}) -> printf "weighted:%8.3f age:%4d similarity:%5.3f %s %s" w a s (show tdate) tdescription )) $
|
||||
sortBy (comparing (negate.first4)) $
|
||||
map (\(s,t) -> (weightedScore (s,t), age t, s, t)) $
|
||||
filter ((> similaritythreshold).fst)
|
||||
[(compareDescriptions desc $ tdescription t, t) | t <- jtxns, q `matchesTransaction` t]
|
||||
where
|
||||
threshold = 0
|
||||
latest = lastDef nulldate $ sort $ map tdate jtxns
|
||||
age = diffDays latest . tdate
|
||||
-- Combine similarity and recency heuristically. This gave decent results
|
||||
-- in my "find most recent invoice" use case in 2023-03,
|
||||
-- but will probably need more attention.
|
||||
weightedScore :: (Double, Transaction) -> Double
|
||||
weightedScore (s, t) = 100 * s - fromIntegral (age t) / 4
|
||||
|
||||
-- | Return a similarity score from 0 to 1.5 for two transaction descriptions.
|
||||
-- This is based on compareStrings, with the following modifications:
|
||||
|
@ -251,12 +251,9 @@ backupNumber f g = case g =~ ("^" ++ f ++ "\\.([0-9]+)$") of
|
||||
-- Identify the closest recent match for this description in past transactions.
|
||||
-- If the options specify a query, only matched transactions are considered.
|
||||
journalSimilarTransaction :: CliOpts -> Journal -> T.Text -> Maybe Transaction
|
||||
journalSimilarTransaction cliopts j desc = mbestmatch
|
||||
journalSimilarTransaction cliopts j desc =
|
||||
fmap fourth4 $ headMay $ journalTransactionsSimilarTo j desc q 0 1
|
||||
where
|
||||
mbestmatch = snd <$> headMay bestmatches
|
||||
bestmatches =
|
||||
dbg1With (unlines . ("similar transactions:":) . map (\(score,Transaction{..}) -> printf "%0.3f %s %s" score (show tdate) tdescription)) $
|
||||
journalTransactionsSimilarTo j q desc 10
|
||||
q = queryFromFlags $ _rsReportOpts $ reportspec_ cliopts
|
||||
|
||||
-- | Render a 'PostingsReport' or 'AccountTransactionsReport' as Text,
|
||||
|
Loading…
Reference in New Issue
Block a user