macaw/refinement/tests/RefinementTests.hs

-- This test harness uses local files to run Macaw discovery both with
-- and without refinement and checks the results of that validation
-- against expected outputs (aka. golden testing).
--
-- test/samples/
--    X[.A].[F-]expected       -- expected (golden) output [the primary file that enables this test]
--    X.A.exe                  -- input ELF binary file
--    X.c                      -- C source for generating binary
--
-- This methodology allows a binary for each architecture (A) to be
-- produced from a single C source (X), and then one or more
-- refinement test forms (F) to be created to run on that binary,
-- where F is one of:
--
--    "base"    -- standard Macaw discovery
--    "refined" -- additional discovery from this macaw-refinement package
--
-- If there is no "F-" portion, then the same expected file is applied
-- for all possible values of F.
--
-- If there is an expected file but it fails to Parse on a Haskell
-- Read (e.g. "echo garbage > blah.ppc.base-expected") then the test
-- will write the actual output to a separate file with the
-- "last-actual" suffix (e.g. "blah.ppc.base-last-actual"); verify the
-- contents of that file and then simply copy them to the expected
-- file to get the correct file contents.
--
-- There is also a README in the test/samples directory where the C
-- source is described, along with various tests.

{-# LANGUAGE AllowAmbiguousTypes #-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE TypeOperators #-}
{-# OPTIONS_GHC -Wno-missing-signatures #-}

module Main ( main ) where

import           GHC.TypeLits

import           Control.Lens hiding ( (<.>) )
import           Control.Monad
import qualified Control.Monad.Catch as X
import           Control.Monad.IO.Class
import qualified Data.ByteString as BS
import qualified Data.ElfEdit as E
import           Data.Foldable
import           Data.Function ( on )
import           Data.List ( intercalate, sortBy )
import qualified Data.Macaw.Architecture.Info as AI
import           Data.Macaw.BinaryLoader as MBL
import           Data.Macaw.BinaryLoader.X86 ()
import           Data.Macaw.CFG as MC
import qualified Data.Macaw.Discovery as MD
import qualified Data.Macaw.Memory.ElfLoader as ML
import           Data.Macaw.PPC
import           Data.Macaw.PPC.Symbolic ()
import           Data.Macaw.Refinement
import qualified Data.Macaw.Symbolic as MS
import qualified Data.Macaw.X86 as MX86
import           Data.Macaw.X86.Symbolic ()
import qualified Data.Map as M
import           Data.Maybe ( catMaybes )
import           Data.Parameterized.Some
import           Data.Proxy
import           Data.Semigroup ( (<>) )
import qualified Data.Set as Set
import           Data.Tagged
import           Data.Typeable ( Typeable )
import           Options.Applicative
import qualified SemMC.Architecture.PPC32 as PPC32
import qualified SemMC.Architecture.PPC64 as PPC64
import           System.Directory ( doesFileExist )
import           System.FilePath ( (</>), (<.>) )
import qualified System.FilePath as FP
import qualified System.FilePath.Glob as FPG
import qualified Test.Tasty as TT
import qualified Test.Tasty.HUnit as TTH
import           Test.Tasty.Ingredients
import           Test.Tasty.Options
import           Text.PrettyPrint.ANSI.Leijen hiding ( (<$>), (<>), (</>) )
import           Text.Read ( readEither )

import           Prelude


datadir =  "tests/samples"
supportedArch = [ "x86"
                , "ppc"
                ]


data ShowSearch =  ShowSearch Bool deriving (Eq, Ord, Typeable)

instance IsOption ShowSearch where
  defaultValue = ShowSearch False
  parseValue = fmap ShowSearch . safeRead
  optionName = pure $ "showsearch"
  optionHelp = pure $ "Show details of the search for the set of\n\
                      \refinement tests that would be performed\n\
                      \based on the contents of the " <> datadir <> " directory"
  optionCLParser = ShowSearch <$> switch
                      ( long (untag (optionName :: Tagged ShowSearch String))
                      <> help (untag (optionHelp :: Tagged ShowSearch String))
                      )

data VerboseLogging = VerboseLogging Bool
  deriving (Eq, Ord)

instance IsOption VerboseLogging where
  defaultValue = VerboseLogging False
  parseValue = fmap VerboseLogging . safeRead
  optionName = pure "verbose-logging"
  optionHelp = pure "Turn on verbose logging output"
  optionCLParser = VerboseLogging <$> switch ( long (untag (optionName :: Tagged VerboseLogging String))
                                             <> help (untag (optionHelp :: Tagged VerboseLogging String))
                                             )


-- | This is a Tasty "Ingredient" (aka test runner) that can be used
-- to display the search process and results for generating the tests.
searchResultsReport = TestManager [] $ \opts _tests ->
  if lookupOption opts == ShowSearch True
  then Just $ do searchlist <- getTestList datadir True
                 putStrLn ""
                 putStrLn $ "Final set of tests [" ++ show (length searchlist) ++ "]:"
                 mapM_ (putStrLn . show) searchlist
                 putStrLn ""
                 putStrLn ("These search results were based on the contents of the " <>
                           datadir <> " directory\n\
                           \Syntax:  BASENAME.c\n\
                           \         BASENAME.ARCH.EXE\n\
                           \         BASENAME.ARCH.expected  OR BASENAME.ARCH.base-expected\n\
                           \                                AND BASENAME.ARCH.refined-expected\n\
                           \Note that if any .expected file fails a Haskell Read\n\
                           \parse, a \".....last-actual\" file is written; it can\n\
                           \be verified and renamed to be the new expected contents.\n")

                 return True
  else Nothing


ingredients = TT.includingOptions [ Option (Proxy :: Proxy ShowSearch)
                                  , Option (Proxy :: Proxy VerboseLogging)
                                  ]
              : searchResultsReport
              : TT.defaultIngredients

main :: IO ()
main = do
  -- Note: dynamic test generation should be done before the
  -- TT.defaultMain call, but test arguments/options are not yet
  -- available for this; see:
  -- https://stackoverflow.com/questions/33040722
  -- https://github.com/feuerbach/tasty/issues/228
  testInputs <- getTestList datadir False
  let testNames = Set.fromList $ map name testInputs
  TT.defaultMainWithIngredients ingredients $
    TT.testGroup "macaw-refinement" $
    mkNameGroup testInputs <$> toList testNames
  where mkNameGroup inps nm =
          TT.testGroup nm $ map mkTest $ filter ((==) nm . name) inps


data TestInput = TestInput { name :: String
                           , arch :: String
                           , expectFileBase :: Maybe FilePath
                           , expectFileRefined :: Maybe FilePath
                           , binaryFile :: FilePath
                           }
  deriving (Eq,Show)

-- | Returns a list of the TestInputs that should be run.  These are
-- driven by the existence of a .[F-]expected file, for which there
-- should be a corresponding .exe.  The exe and expected files are
-- sub-named by the architecture (A) to which they apply.
getTestList :: FilePath -> Bool -> IO [TestInput]
getTestList basedir explain = do
  when explain $ putStrLn $ "Checking for test inputs in " <> (basedir </> "*.exe")
  let exeGlob = "*.(" <> (intercalate "|" supportedArch) <> ").exe"
  exeNames <- FPG.namesMatching $ basedir </> exeGlob
  postproc <$> mapM mkTI exeNames
  where
    postproc = sorted . catMaybes
    sorted = sortBy (compare `on` name)
    mkTI exe = do when explain $ putStrLn $ "Checking exe " ++ exe
                  -- exe is path/to/srcfile.A.exe
                  let exeRmvd = fst $ FP.splitExtension exe
                      (srcPathAndName, testArch') = FP.splitExtension exeRmvd
                      testName = FP.takeFileName srcPathAndName
                  if length testArch' < 2
                    then do when explain $ putStrLn $ "  no arch, skipping"
                            return Nothing
                    else mkTIAN exe (drop 1 testArch') testName
    mkTIAN exe testArch testName = do
          when explain $ putStrLn $ "  arch = " <> testArch
          when explain $ putStrLn $ "  name = " <> testName
          let expRoot = basedir </> testName <.> testArch
              expBaseName = expRoot <.> "base-expected"
              expRefinedName = expRoot <.> "refined-expected"
              expName = expRoot <.> "expected"
          base <- doesFileExist expBaseName
          refn <- doesFileExist expRefinedName
          expt <- doesFileExist expName
          let expn = if expt then Just expName else Nothing
          mkTIANE exe testArch testName
            (if base then Just expBaseName else expn)
            (if refn then Just expRefinedName else expn)
    mkTIANE exe testArch testName expBase expRefn = do
      when explain $ case expBase of
                       Just f -> putStrLn $ "  expected base = " <> f
                       Nothing -> return ()
      when explain $ case expRefn of
                       Just f -> putStrLn $ "  expected refined = " <> f
                       Nothing -> return ()
      case (expBase, expRefn) of
        (Nothing, Nothing) -> return Nothing
        _ -> return $ Just $ TestInput { name = testName
                                       , arch = testArch
                                       , binaryFile = exe
                                       , expectFileBase = expBase
                                       , expectFileRefined = expRefn
                                       }


mkTest :: TestInput -> TT.TestTree
mkTest testinp =
  let readbin = BS.readFile $ binaryFile testinp
      cleanup = return . const ()
      formName ref = if ref then "refined" else "base"
      tests = catMaybes [ mkT False <$> expectFileBase testinp
                        , mkT True <$> expectFileRefined testinp
                        ]
      mkT ref fn = \v r -> TTH.testCase (formName ref) $
                           testExpected ref fn testinp v r
  in TT.askOption $ \(VerboseLogging beVerbose) ->
     TT.withResource readbin cleanup $
     \readBinary ->
       TT.testGroup (arch testinp) $ map (\t -> t beVerbose readBinary) tests


testExpected useRefinement expFile testinp beVerbose readBinary = do
  bs <- readBinary
  case E.parseElf bs of
      E.Elf64Res warnings elf -> mapM_ print warnings >> withElf64 elf
      E.Elf32Res warnings elf -> mapM_ print warnings >> withElf32 elf
      _ -> let badMsg = binaryFile testinp <> " is not a 64-bit ELF file"
           in do when beVerbose $ putStrLn badMsg
                 TTH.assertBool badMsg False
  where
    withElf64 elf =
      case E.elfMachine elf of
        E.EM_PPC64 -> do
          bin <- MBL.loadBinary @PPC64.PPC ML.defaultLoadOptions elf
          let pli = ppc64_linux_info bin
          withBinaryDiscoveredInfo testinp useRefinement expFile pli bin
        E.EM_X86_64 ->
          withBinaryDiscoveredInfo testinp useRefinement expFile MX86.x86_64_linux_info =<<
            MBL.loadBinary @MX86.X86_64 ML.defaultLoadOptions elf
        m -> error $ "no 64-bit ELF support for " ++ show m
    withElf32 elf =
      case E.elfMachine elf of
        E.EM_PPC -> do
          bin <- MBL.loadBinary @PPC32.PPC ML.defaultLoadOptions elf
          let pli = ppc32_linux_info bin
          withBinaryDiscoveredInfo testinp useRefinement expFile pli bin
        m -> error $ "no 32-bit ELF support for " ++ show m

withBinaryDiscoveredInfo :: ( X.MonadThrow m
                            , MS.SymArchConstraints arch
                            , 16 <= MC.ArchAddrWidth arch
                            , MBL.BinaryLoader arch binFmt
                            , MonadIO m) =>
                            TestInput
                         -> Bool
                         -> FilePath
                         -> AI.ArchitectureInfo arch
                         -> MBL.LoadedBinary arch binFmt
                         -> m ()
withBinaryDiscoveredInfo testinp useRefinement expFile arch_info bin = do
  entries <- toList <$> entryPoints bin
  let baseCFG = MD.cfgFromAddrs arch_info (memoryImage bin) M.empty entries []
      actualBase = cfgToExpected testinp bin (Just baseCFG) Nothing
  if useRefinement
    then do refinedCFG <- refineDiscovery bin baseCFG
            let refinedBase = cfgToExpected testinp bin Nothing (Just refinedCFG)
            compareToExpected "refined" refinedBase expFile
    else compareToExpected "base" actualBase expFile


compareToExpected formName actual fn =
  let cfgName = formName <> " CFG" in
  do expectedData <- liftIO $ readFile fn
     case readEither expectedData of
       Right expInfo ->
         -- KWQ: TODO: refine this down to individual components if it fails
         liftIO $ TTH.assertEqual ("discovered " <> cfgName) expInfo actual
       Left e ->
         let badMsg = "error parsing expected " <> cfgName <> " data in " <> fn <> ": " <> e
             outFileName = take (length fn - length "expected") fn <> "last-actual"
         in liftIO $ do writeFile outFileName $ show actual
                        putStrLn $ "Generated actual output to: " <> outFileName
                        TTH.assertBool badMsg False

----------------------------------------------------------------------

-- | The ExpectedInfo is the format of information stored in the
-- .expected files.  Ideally this would be a 'Show' output so that a
-- 'Read' could import native data structures for a more refined
-- comparison, but unfortunately the 'read . show == id' intent is not
-- held for Macaw/Flexdis86, so the actual stored and compared format
-- is generally the 'pretty' output of the structures.
data ExpectedInfo arch = Expected
  { expBinaryName  :: String
  , expEntryPoints :: [EntryPoint arch]
  , expFunctions   :: [Function arch]
  }
  deriving (Show, Read, Eq)

data EntryPoint arch = EntryPoint (Address arch)
  deriving (Show, Read, Eq)

data Function arch = Function (Address arch) [Block arch]
  deriving (Show, Read, Eq)

data Block arch = Block (Address arch) StatementList
  deriving (Show, Read, Eq)

data Address arch = Address { addrSegmentBase :: Int
                            , addrSegmentOffset :: Int
                            , addrSegoffOffset :: Int
                            , addrPretty :: String
                            }
  deriving (Show, Read, Eq)

mkAddress :: (MemWidth (RegAddrWidth (ArchReg arch))) =>
             MC.MemSegmentOff (MC.ArchAddrWidth arch) -> Address arch
mkAddress addr = Address { addrSegmentBase = fromEnum $ segmentBase $ segoffSegment addr
                         , addrSegmentOffset = fromEnum $ memWordToUnsigned $ segmentOffset $ segoffSegment addr
                         , addrSegoffOffset = fromEnum $ memWordToUnsigned $ segoffOffset addr
                         , addrPretty = show $ pretty addr
                         }

type StatementList = String  -- no Read or Eq for Macaw.Discovery.StatementList, so just use String format

cfgToExpected :: (MBL.BinaryLoader arch binFmt) =>
                 TestInput
              -> MBL.LoadedBinary arch binFmt
              -> Maybe (MD.DiscoveryState arch)
              -> Maybe (MD.DiscoveryState arch)
              -> ExpectedInfo arch
cfgToExpected testinp bin mbCFG mbRefCFG =
  let eps = case entryPoints bin of
              Left _ -> []
              Right epl -> toList epl
      fns = case (mbCFG, mbRefCFG) of
              (Nothing, Nothing) -> error "must specify a discovered Macaw CFG"
              (Just _, Just _) -> error "must specify only one discovered Macaw CFG"
              (Just di, Nothing) -> getFunctions di
              (Nothing, Just di) -> getFunctions di
  in Expected { expBinaryName = binaryFile testinp
              , expEntryPoints = (EntryPoint . mkAddress) <$> eps
              , expFunctions = fns
              }

getFunctions :: MD.DiscoveryState arch -> [Function arch]
getFunctions di =
  AI.withArchConstraints (MD.archInfo di) $
  fmap (\(funAddr, Some dfi) ->
           Function
           (mkAddress funAddr)
           (fmap (\(blkAddr, pb) ->
                    Block (mkAddress blkAddr) (show $ MD.blockStatementList pb))
            (dfi ^. MD.parsedBlocks . to M.toList)))
  (di ^. MD.funInfo . to M.toList)