swarm/test/unit/TestScoring.hs

{-# LANGUAGE OverloadedStrings #-}

-- | High score records
module TestScoring where

import Data.Text.IO qualified as TIO
import Data.Time.Calendar.OrdinalDate
import Data.Time.LocalTime
import Swarm.Game.Scenario.Scoring.Best
import Swarm.Game.Scenario.Scoring.CodeSize
import Swarm.Game.Scenario.Scoring.ConcreteMetrics
import Swarm.Game.Scenario.Scoring.GenericMetrics
import Swarm.Language.Module
import Swarm.Language.Pipeline
import Swarm.Language.Syntax
import System.FilePath ((</>))
import Test.Tasty
import Test.Tasty.HUnit

-- | Directory containing the code snippets that 'compareAstSize' reads.
-- The path is relative, so it is resolved against the working directory
-- from which the tests are run.
baseTestPath :: FilePath
baseTestPath = "data/test/language-snippets/code-size"

-- | Top-level tree of scoring tests. It has two groups: AST size
-- measurements of sample programs, and precedence rules for picking the
-- better of two metrics (individually, and grouped into best records).
testHighScores :: TestTree
testHighScores = testGroup "Scoring" [codeSizeTests, precedenceTests]
 where
  -- Each entry pairs the expected AST size with the snippet file to measure.
  codeSizeTests =
    testGroup "Code size" $
      map
        (uncurry compareAstSize)
        [ (1, "single-move-bare.sw")
        , (2, "single-move-def.sw")
        , (3, "single-move-let-with-invocation.sw")
        , (5, "double-move-let-with-invocation.sw")
        , (6, "single-move-def-with-invocation.sw")
        , (8, "double-move-def-with-invocation.sw")
        , (28, "single-def-two-args-recursive.sw")
        , (36, "single-def-two-args-recursive-with-invocation.sw")
        ]

  precedenceTests =
    testGroup "Precedence" [singleMetricTests, groupedMetricTests]

  singleMetricTests =
    testGroup
      "Single metrics"
      [ testCase "Attempted long > Attempted short" $
          assertEqual
            "Longer is better for incomplete games"
            (Metric Attempted (5 :: Int))
            (chooseBetter pure (Metric Attempted 3) (Metric Attempted 5))
      , testCase "Completed short > Completed long" $
          assertEqual
            "Shorter is better for completed games"
            (Metric Completed (3 :: Int))
            (chooseBetter pure (Metric Completed 3) (Metric Completed 5))
      ]

  groupedMetricTests =
    testGroup
      "Grouped metrics"
      [ betterReplTimeAfterCodeSizeRecord
      , betterCodeWorseTime
      ]

-- | Build a test case that reads a snippet from 'baseTestPath', runs it
-- through 'processTermEither', and checks that 'measureAstSize' of the
-- resulting syntax equals the expected size.
--
-- The test fails with the processing error if the snippet cannot be
-- processed.
compareAstSize :: Int -> FilePath -> TestTree
compareAstSize expectedSize path = testCase caseName checkSize
 where
  caseName = unwords ["size of", path]

  checkSize = do
    source <- TIO.readFile (baseTestPath </> path)
    -- A 'Left' from the pipeline becomes a test failure carrying its message.
    ProcessedTerm (Module ast _) _ _ <-
      either assertFailure pure (processTermEither source)
    assertEqual "incorrect size" expectedSize (measureAstSize ast)

-- | Checks 'updateBest' when a completed run that carries no code metrics
-- (with smaller duration values) arrives after a completed run that does
-- carry code metrics held every best record.
--
-- The expected outcome replaces the first two 'BestRecords' slots with the
-- new run and leaves the last two slots holding the old run.
betterReplTimeAfterCodeSizeRecord :: TestTree
betterReplTimeAfterCodeSizeRecord =
  testCase "new repl win after code size record" $
    assertEqual "incorrect" expectedRecords (updateBest replRun priorRecords)
 where
  -- A UTC timestamp on the first day of 2023, at the given minute past
  -- midnight.
  startedAt minute =
    ZonedTime (LocalTime (fromOrdinalDate 2023 1) (TimeOfDay 0 minute 0)) utc

  -- A completed attempt that started at the given minute.
  completedRun minute = Metric Completed . ProgressStats (startedAt minute)

  -- The newer run: smaller duration values, but no code metrics at all.
  replRun =
    completedRun 1 $
      AttemptMetrics (DurationMetrics 1 1) Nothing

  -- The older run: larger duration values, with code metrics recorded.
  scriptedRun =
    completedRun 0 $
      AttemptMetrics (DurationMetrics 2 2) (Just (ScenarioCodeMetrics 1 1))

  priorRecords =
    BestRecords scriptedRun scriptedRun scriptedRun scriptedRun

  expectedRecords =
    BestRecords replRun replRun scriptedRun scriptedRun

-- | Checks 'updateBest' when a completed run with smaller code metrics but
-- larger duration values arrives after a run with the opposite trade-off
-- held every best record.
--
-- The expected outcome keeps the old run in the first two 'BestRecords'
-- slots and puts the new run in the last two.
betterCodeWorseTime :: TestTree
betterCodeWorseTime =
  testCase "improvement upon code size with a worse time" $
    assertEqual "incorrect" expectedRecords (updateBest compactRun priorRecords)
 where
  -- A UTC timestamp on the first day of 2023, at the given minute past
  -- midnight.
  startedAt minute =
    ZonedTime (LocalTime (fromOrdinalDate 2023 1) (TimeOfDay 0 minute 0)) utc

  -- A completed attempt that started at the given minute.
  completedRun minute = Metric Completed . ProgressStats (startedAt minute)

  -- The newer run: larger duration values, smaller code metrics.
  compactRun =
    completedRun 1 $
      AttemptMetrics (DurationMetrics 2 2) (Just (ScenarioCodeMetrics 1 1))

  -- The older run: smaller duration values, larger code metrics.
  quickRun =
    completedRun 0 $
      AttemptMetrics (DurationMetrics 1 1) (Just (ScenarioCodeMetrics 2 2))

  priorRecords =
    BestRecords quickRun quickRun quickRun quickRun

  expectedRecords =
    BestRecords quickRun quickRun compactRun compactRun
Record best code size (#974) towards #866 NOTE: #1116 should be merged first so that the schema change of save files is less disruptive. ## Examples Different criteria can have their own best score: ![image](https://user-images.githubusercontent.com/261693/219904496-fcd23ca0-b208-43e1-afc6-188acfe327cf.png) All criteria share the same single best score: ![image](https://user-images.githubusercontent.com/261693/219904553-abe3011c-41b0-469c-b34d-95d84b91697a.png) ## Behavior notes * As currently designed, the code size will only be scored if the player has specified their code before the scenario begins. Furthermore, any input into the REPL will invalidate code size scoring for the current game. * Because of this, the only way to score code so far is with a command-line argument of `--run` or `--autoplay`. However, #1010 shall implement code size scoring when a scenario is launched from the UI. * In the "best scores" display, if multiple "best score" criteria were all from the same game, they will be consolidated. If all criteria are for the same game, the criteria labels will be omitted. * The code size metrics will not be displayed if a "best score" was not obtained via `--run`. ## Caveats ### `run` command Currently, the code entailed in a `run "somescript.sw"` command is not transitively included, so using `run` makes the code size score meaningless. 
## Testing ### Unit tests Run the subset of unit tests: scripts/run-tests.sh --test-arguments '--pattern "Tests.Precedence"' ### Manual integration tests First, reset the score: rm -f ~/.local/share/swarm/saves/Tutorials_grab.yaml Saving the following to `grab-soln.sw`: ``` move; move; grab; turn back; move; turn back; move; move; grab; turn back; move; turn back; move; move; grab; turn back; move; turn back; move; move; grab; turn back; move; turn back; move; move; grab; turn back; move; turn back; move; move; grab; ``` Run as follows: scripts/play.sh --scenario Tutorials/grab.yaml --run grab-soln.sw This should establish a record for code size. Then, play the Grab tutorial and immediately paste and run this in the REPL: move; move; grab; move; grab; move; grab; move; grab; move; grab; move; grab; This solution is faster in terms of time, but should not displace the code-length record, since no code length should be recorded from a REPL solution. 2023-05-02 10:06:01 +03:00			`{-# LANGUAGE OverloadedStrings #-}`

			`-- \| High score records`
			`module TestScoring where`

			`import Data.Text.IO qualified as TIO`
			`import Data.Time.Calendar.OrdinalDate`
			`import Data.Time.LocalTime`
			`import Swarm.Game.Scenario.Scoring.Best`
			`import Swarm.Game.Scenario.Scoring.CodeSize`
			`import Swarm.Game.Scenario.Scoring.ConcreteMetrics`
			`import Swarm.Game.Scenario.Scoring.GenericMetrics`
			`import Swarm.Language.Module`
			`import Swarm.Language.Pipeline`
			`import Swarm.Language.Syntax`
			`import System.FilePath ((</>))`
			`import Test.Tasty`
			`import Test.Tasty.HUnit`

			`baseTestPath :: FilePath`
			`baseTestPath = "data/test/language-snippets/code-size"`

			`testHighScores :: TestTree`
			`testHighScores =`
			`testGroup`
			`"Scoring"`
			`[ testGroup`
			`"Code size"`
			`[ compareAstSize 1 "single-move-bare.sw"`
			`, compareAstSize 2 "single-move-def.sw"`
			`, compareAstSize 3 "single-move-let-with-invocation.sw"`
			`, compareAstSize 5 "double-move-let-with-invocation.sw"`
			`, compareAstSize 6 "single-move-def-with-invocation.sw"`
			`, compareAstSize 8 "double-move-def-with-invocation.sw"`
			`, compareAstSize 28 "single-def-two-args-recursive.sw"`
			`, compareAstSize 36 "single-def-two-args-recursive-with-invocation.sw"`
			`]`
			`, testGroup`
			`"Precedence"`
			`[ testGroup`
			`"Single metrics"`
			`[ testCase`
			`"Attempted long > Attempted short"`
			`$ assertEqual "Longer is better for incomplete games" (Metric Attempted (5 :: Int))`
			`$ chooseBetter pure (Metric Attempted 3) (Metric Attempted 5)`
			`, testCase`
			`"Completed short > Completed long"`
			`$ assertEqual "Shorter is better for completed games" (Metric Completed (3 :: Int))`
			`$ chooseBetter pure (Metric Completed 3) (Metric Completed 5)`
			`]`
			`, testGroup`
			`"Grouped metrics"`
			`[ betterReplTimeAfterCodeSizeRecord`
			`, betterCodeWorseTime`
			`]`
			`]`
			`]`

			`compareAstSize :: Int -> FilePath -> TestTree`
			`compareAstSize expectedSize path = testCase (unwords ["size of", path]) $ do`
			`contents <- TIO.readFile $ baseTestPath </> path`
			`ProcessedTerm (Module stx _) _ _ <- case processTermEither contents of`
			`Right x -> return x`
			`Left y -> assertFailure y`
			`let actualSize = measureAstSize stx`
			`assertEqual "incorrect size" expectedSize actualSize`

			`betterReplTimeAfterCodeSizeRecord :: TestTree`
			`betterReplTimeAfterCodeSizeRecord =`
			`testCase`
			`"new repl win after code size record"`
			`$ assertEqual "incorrect" newExpectedBest newActualBest`
			`where`
			`mkZonedTime t = ZonedTime (LocalTime (fromOrdinalDate 2023 1) (TimeOfDay 0 t 0)) utc`
			`newRunWithoutCodeSize =`
			`Metric Completed $`
			`ProgressStats (mkZonedTime 1) $`
			`AttemptMetrics`
			`(DurationMetrics 1 1)`
			`Nothing`

			`oldCompletedRunWithCodeSize =`
			`Metric Completed $`
			`ProgressStats (mkZonedTime 0) $`
			`AttemptMetrics`
			`(DurationMetrics 2 2)`
			`(Just $ ScenarioCodeMetrics 1 1)`

			`oldBestWithCodeSize =`
			`BestRecords`
			`oldCompletedRunWithCodeSize`
			`oldCompletedRunWithCodeSize`
			`oldCompletedRunWithCodeSize`
			`oldCompletedRunWithCodeSize`

			`newExpectedBest =`
			`BestRecords`
			`newRunWithoutCodeSize`
			`newRunWithoutCodeSize`
			`oldCompletedRunWithCodeSize`
			`oldCompletedRunWithCodeSize`

			`newActualBest =`
			`updateBest`
			`newRunWithoutCodeSize`
			`oldBestWithCodeSize`

			`betterCodeWorseTime :: TestTree`
			`betterCodeWorseTime =`
			`testCase`
			`"improvement upon code size with a worse time"`
			`$ assertEqual "incorrect" newExpectedBests newActualBests`
			`where`
			`mkZonedTime t = ZonedTime (LocalTime (fromOrdinalDate 2023 1) (TimeOfDay 0 t 0)) utc`
			`newRunBetterCodeSize =`
			`Metric Completed $`
			`ProgressStats (mkZonedTime 1) $`
			`AttemptMetrics`
			`(DurationMetrics 2 2)`
			`(Just $ ScenarioCodeMetrics 1 1)`

			`oldRunPoorCodeSize =`
			`Metric Completed $`
			`ProgressStats (mkZonedTime 0) $`
			`AttemptMetrics`
			`(DurationMetrics 1 1)`
			`(Just $ ScenarioCodeMetrics 2 2)`

			`oldBests =`
			`BestRecords`
			`oldRunPoorCodeSize`
			`oldRunPoorCodeSize`
			`oldRunPoorCodeSize`
			`oldRunPoorCodeSize`

			`newExpectedBests =`
			`BestRecords`
			`oldRunPoorCodeSize`
			`oldRunPoorCodeSize`
			`newRunBetterCodeSize`
			`newRunBetterCodeSize`

			`newActualBests =`
			`updateBest`
			`newRunBetterCodeSize`
			`oldBests`