Added ARM Macaw disassembly functionality.

This commit is contained in:
Kevin Quick 2018-01-04 17:18:22 -08:00
parent 716ae2a28f
commit b533f4a92a
No known key found for this signature in database
GPG Key ID: E6D7733599CC0A21
4 changed files with 302 additions and 2 deletions

View File

@ -16,6 +16,7 @@ library
exposed-modules: Data.Macaw.ARM
, Data.Macaw.ARM.Arch
, Data.Macaw.ARM.ARMReg
, Data.Macaw.ARM.Disassemble
, Data.Macaw.ARM.Eval
, Data.Macaw.ARM.BinaryFormat.ELF
, Data.Macaw.ARM.Semantics.ARMSemantics
@ -31,8 +32,10 @@ library
, lens
, macaw-base
, macaw-semmc
, mtl
, parameterized-utils
, semmc-arm
, text
, vector
-- build-tools: arm-none-eabi-gcc
hs-source-dirs: src

View File

@ -12,7 +12,10 @@ module Data.Macaw.ARM
)
where
import Data.Macaw.ARM.Disassemble ( disassembleFn )
import Data.Macaw.ARM.Eval
import qualified Data.Macaw.ARM.Semantics.ARMSemantics as ARMSem
import qualified Data.Macaw.Architecture.Info as MI
import qualified Data.Macaw.Memory as MM
import Data.Proxy ( Proxy(..) )
@ -29,7 +32,7 @@ arm_linux_info =
, MI.archAddrWidth = MM.Addr32
, MI.archEndianness = MM.LittleEndian
, MI.jumpTableEntrySize = 0 -- undefined -- jumpTableEntrySize proxy
, MI.disassembleFn = undefined -- disassembleFn proxy ARMSem.execInstruction
, MI.disassembleFn = disassembleFn proxy ARMSem.execInstruction
, MI.mkInitialAbsState = mkInitialAbsState proxy
, MI.absEvalArchFn = undefined -- absEvalArchFn proxy
, MI.absEvalArchStmt = undefined -- absEvalArchStmt proxy

View File

@ -0,0 +1,292 @@
{-# LANGUAGE DeriveFunctor #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE UndecidableInstances #-}
module Data.Macaw.ARM.Disassemble
( disassembleFn
)
where
import Control.Lens ( (&), (^.), (%~), (.~) )
import Control.Monad ( unless )
import qualified Control.Monad.Except as ET
import Control.Monad.ST ( ST )
import Control.Monad.Trans ( lift )
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as LBS
import qualified Data.Foldable as F
import Data.Macaw.ARM.ARMReg
import Data.Macaw.ARM.Arch ( ARMArchConstraints )
import Data.Macaw.AbsDomain.AbsState as MA
import Data.Macaw.CFG
import Data.Macaw.CFG.Block
import qualified Data.Macaw.CFG.Core as MC
import qualified Data.Macaw.Memory as MM
import qualified Data.Macaw.Memory.Permissions as MMP
import Data.Macaw.SemMC.Generator
import Data.Macaw.SemMC.Simplify ( simplifyValue )
import Data.Macaw.Types -- ( BVType, BoolType )
import Data.Maybe ( fromMaybe )
import qualified Data.Parameterized.Nonce as NC
import Data.Proxy ( Proxy(..) )
import qualified Data.Sequence as Seq
import qualified Data.Text as T
import Data.Word ( Word64 )
import qualified Dismantle.ARM as D
import Text.Printf ( printf )
-- | Disassemble a block from the given start address (which points into the
-- 'MM.Memory').
--
-- Return a list of disassembled blocks as well as the total number of bytes
-- occupied by those blocks.
disassembleFn :: (ARMArchConstraints arm)
=> proxy arm
-> (Value arm ids (BVType (ArchAddrWidth arm)) -> D.Instruction -> Maybe (Generator arm ids s ()))
-- ^ A function to look up the semantics for an instruction. The
-- lookup is provided with the value of the IP in case IP-relative
-- addressing is necessary.
-> MM.Memory (ArchAddrWidth arm)
-- ^ The mapped memory space
-> NC.NonceGenerator (ST s) ids
-- ^ A generator of unique IDs used for assignments
-> ArchSegmentOff arm
-- ^ The address to disassemble from
-> ArchAddrWord arm
-- ^ Maximum size of the block (a safeguard)
-> MA.AbsBlockState (ArchReg arm)
-- ^ Abstract state of the processor at the start of the block
-> ST s ([Block arm ids], MM.MemWord (ArchAddrWidth arm), Maybe String)
disassembleFn _ lookupSemantics mem nonceGen startAddr maxSize _ = do
mr <- ET.runExceptT (unDisM (tryDisassembleBlock lookupSemantics mem nonceGen startAddr maxSize))
case mr of
Left (blocks, off, exn) -> return (blocks, off, Just (show exn))
Right (blocks, bytes) -> return (blocks, bytes, Nothing)
tryDisassembleBlock :: (ARMArchConstraints arm)
=> (Value arm ids (BVType (ArchAddrWidth arm)) -> D.Instruction -> Maybe (Generator arm ids s ()))
-> MM.Memory (ArchAddrWidth arm)
-> NC.NonceGenerator (ST s) ids
-> ArchSegmentOff arm
-> ArchAddrWord arm
-> DisM arm ids s ([Block arm ids], MM.MemWord (ArchAddrWidth arm))
tryDisassembleBlock lookupSemantics mem nonceGen startAddr maxSize = do
let gs0 = initGenState nonceGen mem startAddr (initRegState startAddr)
let startOffset = MM.msegOffset startAddr
(nextIPOffset, blocks) <- disassembleBlock lookupSemantics mem gs0 startAddr (startOffset + maxSize)
unless (nextIPOffset > startOffset) $ do
let reason = InvalidNextIP (fromIntegral nextIPOffset) (fromIntegral startOffset)
failAt gs0 nextIPOffset startAddr reason
return (F.toList (blocks ^. frontierBlocks), nextIPOffset - startOffset)
-- | Disassemble an instruction and terminate the current block if we run out of
-- instructions to disassemble. We can run out if:
--
-- 1) We exceed the offset that macaw has told us to disassemble to
--
-- 2) We can't decode the IP (i.e., it isn't a constant)
--
-- 3) The IP after executing the semantics transformer is not equal to the
-- expected next IP value, which indicates a jump to another block or
-- function
--
-- In most of those cases, we end the block with a simple terminator. If the IP
-- becomes a mux, we split execution using 'conditionalBranch'.
disassembleBlock :: forall arm ids s
. ARMArchConstraints arm
=> (Value arm ids (BVType (ArchAddrWidth arm)) -> D.Instruction -> Maybe (Generator arm ids s ()))
-- ^ A function to look up the semantics for an instruction that we disassemble
-> MM.Memory (ArchAddrWidth arm)
-> GenState arm ids s
-> MM.MemSegmentOff (ArchAddrWidth arm)
-- ^ The current instruction pointer
-> MM.MemWord (ArchAddrWidth arm)
-- ^ The maximum offset into the bytestring that we should
-- disassemble to; in principle, macaw can tell us to limit our
-- search with this.
-> DisM arm ids s (MM.MemWord (ArchAddrWidth arm), BlockSeq arm ids)
disassembleBlock lookupSemantics mem gs curIPAddr maxOffset = do
let seg = MM.msegSegment curIPAddr
let off = MM.msegOffset curIPAddr
case readInstruction mem curIPAddr of
Left err -> failAt gs off curIPAddr (DecodeError err)
Right (i, bytesRead) -> do
-- traceM ("II: " ++ show i)
let nextIPOffset = off + bytesRead
nextIP = MM.relativeAddr seg nextIPOffset
-- nextIPVal = MC.RelocatableValue (pointerNatRepr (Proxy @arm)) nextIP
-- nextIPVal = MC.RelocatableValue (knownNat :: (BVType (ArchAddrWidth arm))) nextIP
nextIPVal = MC.RelocatableValue (knownNat :: NatRepr (ArchAddrWidth arm)) nextIP
-- Note: In ARM, the IP is incremented *after* an instruction
-- executes; pass in the -- physical address of the instruction here.
ipVal <- case MM.asAbsoluteAddr (MM.relativeSegmentAddr curIPAddr) of
Nothing -> failAt gs off curIPAddr (InstructionAtUnmappedAddr i)
Just addr -> return (BVValue (knownNat :: NatRepr (ArchAddrWidth arm)) (fromIntegral addr))
case lookupSemantics ipVal i of
Nothing -> failAt gs off curIPAddr (UnsupportedInstruction i)
Just transformer -> do
-- Once we have the semantics for the instruction (represented by a
-- state transformer), we apply the state transformer and then extract
-- a result from the state of the 'Generator'.
egs1 <- liftST $ ET.runExceptT (runGenerator genResult gs $ do
let lineStr = printf "%s: %s" (show curIPAddr) (show (D.ppInstruction i))
addStmt (Comment (T.pack lineStr))
transformer
-- Check to see if the IP has become conditionally-defined (by e.g.,
-- a mux). If it has, we need to split execution using a primitive
-- provided by the Generator monad.
nextIPExpr <- getRegValue ARM_IP
case matchConditionalBranch nextIPExpr of
Just (cond, t_ip, f_ip) ->
conditionalBranch cond (setRegVal ARM_IP t_ip) (setRegVal ARM_IP f_ip)
Nothing -> return ())
case egs1 of
Left genErr -> failAt gs off curIPAddr (GenerationError i genErr)
Right gs1 -> do
case resState gs1 of
Just preBlock
| Seq.null (resBlockSeq gs1 ^. frontierBlocks)
, v <- preBlock ^. (pBlockState . curIP)
, Just simplifiedIP <- simplifyValue v
, simplifiedIP == nextIPVal
, nextIPOffset < maxOffset
, Just nextIPSegAddr <- MM.asSegmentOff mem nextIP -> do
let preBlock' = (pBlockState . curIP .~ simplifiedIP) preBlock
let gs2 = GenState { assignIdGen = assignIdGen gs
, _blockSeq = resBlockSeq gs1
, _blockState = preBlock'
, genAddr = nextIPSegAddr
, genMemory = mem
}
disassembleBlock lookupSemantics mem gs2 nextIPSegAddr maxOffset
_ -> return (nextIPOffset, finishBlock FetchAndExecute gs1)
-- | Read one instruction from the 'MM.Memory' at the given segmented offset.
--
-- Returns the instruction and number of bytes consumed /or/ an error.
--
-- This code assumes that the 'MM.ByteRegion' is maximal; that is, that there
-- are no byte regions that could be coalesced.
readInstruction :: MM.Memory w
-> MM.MemSegmentOff w
-> Either (MM.MemoryError w) (D.Instruction, MM.MemWord w)
readInstruction mem addr = MM.addrWidthClass (MM.memAddrWidth mem) $ do
let seg = MM.msegSegment addr
case MM.segmentFlags seg `MMP.hasPerm` MMP.execute of
False -> ET.throwError (MM.PermissionsError (MM.relativeSegmentAddr addr))
True -> do
contents <- MM.addrContentsAfter mem (MM.relativeSegmentAddr addr)
case contents of
[] -> ET.throwError (MM.AccessViolation (MM.relativeSegmentAddr addr))
MM.SymbolicRef {} : _ ->
ET.throwError (MM.UnexpectedRelocation (MM.relativeSegmentAddr addr))
MM.ByteRegion bs : _rest
| BS.null bs -> ET.throwError (MM.AccessViolation (MM.relativeSegmentAddr addr))
| otherwise -> do
-- FIXME: Having to wrap the bytestring in a lazy wrapper is
-- unpleasant. We could alter the disassembler to consume strict
-- bytestrings, at the cost of possibly making it less efficient for
-- other clients.
let (bytesRead, minsn) = D.disassembleInstruction (LBS.fromStrict bs)
case minsn of
Just insn -> return (insn, fromIntegral bytesRead)
Nothing -> ET.throwError (MM.InvalidInstruction (MM.relativeSegmentAddr addr) contents)
-- | Examine a value and see if it is a mux; if it is, break the mux up and
-- return its component values (the condition and two alternatives)
matchConditionalBranch :: (ARMArchConstraints arch)
=> Value arch ids tp
-> Maybe (Value arch ids BoolType, Value arch ids tp, Value arch ids tp)
matchConditionalBranch v =
case v of
AssignedValue (Assignment { assignRhs = EvalApp a }) ->
case a of
Mux _rep cond t f -> Just (cond, fromMaybe t (simplifyValue t), fromMaybe f (simplifyValue f))
_ -> Nothing
_ -> Nothing
type LocatedError ppc ids = ([Block ppc ids], MM.MemWord (ArchAddrWidth ppc), TranslationError (ArchAddrWidth ppc))
-- | This is a monad for error handling during disassembly
--
-- It allows for early failure that reports progress (in the form of blocks
-- discovered and the latest address examined) along with a reason for failure
-- (a 'TranslationError').
newtype DisM ppc ids s a = DisM { unDisM :: ET.ExceptT (LocatedError ppc ids) (ST s) a }
deriving (Functor,
Applicative,
Monad)
-- | This funny instance is required because GHC doesn't allow type function
-- applications in instance heads, so we factor the type functions out into a
-- constraint on a fresh variable. See
--
-- > https://stackoverflow.com/questions/45360959/illegal-type-synonym-family-application-in-instance-with-functional-dependency
--
-- We also can't derive this instance because of that restriction (but deriving
-- silently fails).
instance (w ~ ArchAddrWidth ppc) => ET.MonadError ([Block ppc ids], MM.MemWord w, TranslationError w) (DisM ppc ids s) where
throwError e = DisM (ET.throwError e)
catchError a hdlr = do
r <- liftST $ ET.runExceptT (unDisM a)
case r of
Left l -> do
r' <- liftST $ ET.runExceptT (unDisM (hdlr l))
case r' of
Left e -> DisM (ET.throwError e)
Right res -> return res
Right res -> return res
data TranslationError w = TranslationError { transErrorAddr :: MM.MemSegmentOff w
, transErrorReason :: TranslationErrorReason w
}
data TranslationErrorReason w = InvalidNextIP Word64 Word64
| DecodeError (MM.MemoryError w)
| UnsupportedInstruction D.Instruction
| InstructionAtUnmappedAddr D.Instruction
| GenerationError D.Instruction GeneratorError
deriving (Show)
deriving instance (MM.MemWidth w) => Show (TranslationError w)
liftST :: ST s a -> DisM arm ids s a
liftST = DisM . lift
-- | Early failure for 'DisM'. This is a wrapper around 'ET.throwError' that
-- computes the current progress alongside the reason for the failure.
--
-- Note that the 'TranslateError' below is a block terminator marker that notes
-- that translation of the basic block resulted in an error (with the exception
-- string stored as its argument). This allows macaw to carry errors in the
-- instruction stream, which is useful for debugging and diagnostics.
failAt :: forall arm ids s a
. (ArchReg arm ~ ARMReg, MM.MemWidth (ArchAddrWidth arm))
=> GenState arm ids s
-> MM.MemWord (ArchAddrWidth arm)
-> MM.MemSegmentOff (ArchAddrWidth arm)
-> TranslationErrorReason (ArchAddrWidth arm)
-> DisM arm ids s a
failAt gs offset curIPAddr reason = do
let exn = TranslationError { transErrorAddr = curIPAddr
, transErrorReason = reason
}
let term = (`TranslateError` T.pack (show exn))
let b = finishBlock' (gs ^. blockState) term
let res = _blockSeq gs & frontierBlocks %~ (Seq.|> b)
let res' = F.toList (res ^. frontierBlocks)
ET.throwError (res', offset, exn)

View File

@ -10,7 +10,9 @@ import qualified Data.Macaw.Types as MT
import SemMC.ARM ( ARM, Instruction )
import Data.Macaw.SemMC.Generator ( Generator )
-- import SemMC.Architecture.ARM.Opcodes ( allSemantics, allOpcodeInfo )
import qualified Dismantle.ARM as D
execInstruction :: MC.Value ARM ids (MT.BVType 32) -> Instruction -> Maybe (Generator ARM ids s ())
execInstruction :: MC.Value ARM ids (MT.BVType 32) -> D.Instruction -> Maybe (Generator ARM ids s ())
execInstruction = undefined
-- execInstruction = $(genExecInstruction (Proxy @ARM) (locToRegTH (Proxy @ARM)) armNonceAppEval armAppEvaluator 'armInstructionMatcher allSemantics allOpcodeInfo)