Added ARM Macaw disassembly functionality.

This commit is contained in:
Kevin Quick 2018-01-04 17:18:22 -08:00
parent 716ae2a28f
commit b533f4a92a
No known key found for this signature in database
GPG Key ID: E6D7733599CC0A21
4 changed files with 302 additions and 2 deletions

View File

@ -16,6 +16,7 @@ library
exposed-modules: Data.Macaw.ARM exposed-modules: Data.Macaw.ARM
, Data.Macaw.ARM.Arch , Data.Macaw.ARM.Arch
, Data.Macaw.ARM.ARMReg , Data.Macaw.ARM.ARMReg
, Data.Macaw.ARM.Disassemble
, Data.Macaw.ARM.Eval , Data.Macaw.ARM.Eval
, Data.Macaw.ARM.BinaryFormat.ELF , Data.Macaw.ARM.BinaryFormat.ELF
, Data.Macaw.ARM.Semantics.ARMSemantics , Data.Macaw.ARM.Semantics.ARMSemantics
@ -31,8 +32,10 @@ library
, lens , lens
, macaw-base , macaw-base
, macaw-semmc , macaw-semmc
, mtl
, parameterized-utils , parameterized-utils
, semmc-arm , semmc-arm
, text
, vector , vector
-- build-tools: arm-none-eabi-gcc -- build-tools: arm-none-eabi-gcc
hs-source-dirs: src hs-source-dirs: src

View File

@ -12,7 +12,10 @@ module Data.Macaw.ARM
) )
where where
import Data.Macaw.ARM.Disassemble ( disassembleFn )
import Data.Macaw.ARM.Eval import Data.Macaw.ARM.Eval
import qualified Data.Macaw.ARM.Semantics.ARMSemantics as ARMSem
import qualified Data.Macaw.Architecture.Info as MI import qualified Data.Macaw.Architecture.Info as MI
import qualified Data.Macaw.Memory as MM import qualified Data.Macaw.Memory as MM
import Data.Proxy ( Proxy(..) ) import Data.Proxy ( Proxy(..) )
@ -29,7 +32,7 @@ arm_linux_info =
, MI.archAddrWidth = MM.Addr32 , MI.archAddrWidth = MM.Addr32
, MI.archEndianness = MM.LittleEndian , MI.archEndianness = MM.LittleEndian
, MI.jumpTableEntrySize = 0 -- undefined -- jumpTableEntrySize proxy , MI.jumpTableEntrySize = 0 -- undefined -- jumpTableEntrySize proxy
, MI.disassembleFn = undefined -- disassembleFn proxy ARMSem.execInstruction , MI.disassembleFn = disassembleFn proxy ARMSem.execInstruction
, MI.mkInitialAbsState = mkInitialAbsState proxy , MI.mkInitialAbsState = mkInitialAbsState proxy
, MI.absEvalArchFn = undefined -- absEvalArchFn proxy , MI.absEvalArchFn = undefined -- absEvalArchFn proxy
, MI.absEvalArchStmt = undefined -- absEvalArchStmt proxy , MI.absEvalArchStmt = undefined -- absEvalArchStmt proxy

View File

@ -0,0 +1,292 @@
{-# LANGUAGE DeriveFunctor #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE UndecidableInstances #-}
module Data.Macaw.ARM.Disassemble
( disassembleFn
)
where
import Control.Lens ( (&), (^.), (%~), (.~) )
import Control.Monad ( unless )
import qualified Control.Monad.Except as ET
import Control.Monad.ST ( ST )
import Control.Monad.Trans ( lift )
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as LBS
import qualified Data.Foldable as F
import Data.Macaw.ARM.ARMReg
import Data.Macaw.ARM.Arch ( ARMArchConstraints )
import Data.Macaw.AbsDomain.AbsState as MA
import Data.Macaw.CFG
import Data.Macaw.CFG.Block
import qualified Data.Macaw.CFG.Core as MC
import qualified Data.Macaw.Memory as MM
import qualified Data.Macaw.Memory.Permissions as MMP
import Data.Macaw.SemMC.Generator
import Data.Macaw.SemMC.Simplify ( simplifyValue )
import Data.Macaw.Types -- ( BVType, BoolType )
import Data.Maybe ( fromMaybe )
import qualified Data.Parameterized.Nonce as NC
import Data.Proxy ( Proxy(..) )
import qualified Data.Sequence as Seq
import qualified Data.Text as T
import Data.Word ( Word64 )
import qualified Dismantle.ARM as D
import Text.Printf ( printf )
-- | Disassemble a block from the given start address (which points into the
-- 'MM.Memory').
--
-- Return a list of disassembled blocks as well as the total number of bytes
-- occupied by those blocks.
disassembleFn :: (ARMArchConstraints arm)
=> proxy arm
-> (Value arm ids (BVType (ArchAddrWidth arm)) -> D.Instruction -> Maybe (Generator arm ids s ()))
-- ^ A function to look up the semantics for an instruction. The
-- lookup is provided with the value of the IP in case IP-relative
-- addressing is necessary.
-> MM.Memory (ArchAddrWidth arm)
-- ^ The mapped memory space
-> NC.NonceGenerator (ST s) ids
-- ^ A generator of unique IDs used for assignments
-> ArchSegmentOff arm
-- ^ The address to disassemble from
-> ArchAddrWord arm
-- ^ Maximum size of the block (a safeguard)
-> MA.AbsBlockState (ArchReg arm)
-- ^ Abstract state of the processor at the start of the block
-> ST s ([Block arm ids], MM.MemWord (ArchAddrWidth arm), Maybe String)
disassembleFn _ lookupSemantics mem nonceGen startAddr maxSize _ = do
mr <- ET.runExceptT (unDisM (tryDisassembleBlock lookupSemantics mem nonceGen startAddr maxSize))
case mr of
Left (blocks, off, exn) -> return (blocks, off, Just (show exn))
Right (blocks, bytes) -> return (blocks, bytes, Nothing)
tryDisassembleBlock :: (ARMArchConstraints arm)
=> (Value arm ids (BVType (ArchAddrWidth arm)) -> D.Instruction -> Maybe (Generator arm ids s ()))
-> MM.Memory (ArchAddrWidth arm)
-> NC.NonceGenerator (ST s) ids
-> ArchSegmentOff arm
-> ArchAddrWord arm
-> DisM arm ids s ([Block arm ids], MM.MemWord (ArchAddrWidth arm))
tryDisassembleBlock lookupSemantics mem nonceGen startAddr maxSize = do
let gs0 = initGenState nonceGen mem startAddr (initRegState startAddr)
let startOffset = MM.msegOffset startAddr
(nextIPOffset, blocks) <- disassembleBlock lookupSemantics mem gs0 startAddr (startOffset + maxSize)
unless (nextIPOffset > startOffset) $ do
let reason = InvalidNextIP (fromIntegral nextIPOffset) (fromIntegral startOffset)
failAt gs0 nextIPOffset startAddr reason
return (F.toList (blocks ^. frontierBlocks), nextIPOffset - startOffset)
-- | Disassemble an instruction and terminate the current block if we run out of
-- instructions to disassemble. We can run out if:
--
-- 1) We exceed the offset that macaw has told us to disassemble to
--
-- 2) We can't decode the IP (i.e., it isn't a constant)
--
-- 3) The IP after executing the semantics transformer is not equal to the
-- expected next IP value, which indicates a jump to another block or
-- function
--
-- In most of those cases, we end the block with a simple terminator. If the IP
-- becomes a mux, we split execution using 'conditionalBranch'.
disassembleBlock :: forall arm ids s
. ARMArchConstraints arm
=> (Value arm ids (BVType (ArchAddrWidth arm)) -> D.Instruction -> Maybe (Generator arm ids s ()))
-- ^ A function to look up the semantics for an instruction that we disassemble
-> MM.Memory (ArchAddrWidth arm)
-> GenState arm ids s
-> MM.MemSegmentOff (ArchAddrWidth arm)
-- ^ The current instruction pointer
-> MM.MemWord (ArchAddrWidth arm)
-- ^ The maximum offset into the bytestring that we should
-- disassemble to; in principle, macaw can tell us to limit our
-- search with this.
-> DisM arm ids s (MM.MemWord (ArchAddrWidth arm), BlockSeq arm ids)
disassembleBlock lookupSemantics mem gs curIPAddr maxOffset = do
let seg = MM.msegSegment curIPAddr
let off = MM.msegOffset curIPAddr
case readInstruction mem curIPAddr of
Left err -> failAt gs off curIPAddr (DecodeError err)
Right (i, bytesRead) -> do
-- traceM ("II: " ++ show i)
let nextIPOffset = off + bytesRead
nextIP = MM.relativeAddr seg nextIPOffset
-- nextIPVal = MC.RelocatableValue (pointerNatRepr (Proxy @arm)) nextIP
-- nextIPVal = MC.RelocatableValue (knownNat :: (BVType (ArchAddrWidth arm))) nextIP
nextIPVal = MC.RelocatableValue (knownNat :: NatRepr (ArchAddrWidth arm)) nextIP
-- Note: In ARM, the IP is incremented *after* an instruction
-- executes; pass in the -- physical address of the instruction here.
ipVal <- case MM.asAbsoluteAddr (MM.relativeSegmentAddr curIPAddr) of
Nothing -> failAt gs off curIPAddr (InstructionAtUnmappedAddr i)
Just addr -> return (BVValue (knownNat :: NatRepr (ArchAddrWidth arm)) (fromIntegral addr))
case lookupSemantics ipVal i of
Nothing -> failAt gs off curIPAddr (UnsupportedInstruction i)
Just transformer -> do
-- Once we have the semantics for the instruction (represented by a
-- state transformer), we apply the state transformer and then extract
-- a result from the state of the 'Generator'.
egs1 <- liftST $ ET.runExceptT (runGenerator genResult gs $ do
let lineStr = printf "%s: %s" (show curIPAddr) (show (D.ppInstruction i))
addStmt (Comment (T.pack lineStr))
transformer
-- Check to see if the IP has become conditionally-defined (by e.g.,
-- a mux). If it has, we need to split execution using a primitive
-- provided by the Generator monad.
nextIPExpr <- getRegValue ARM_IP
case matchConditionalBranch nextIPExpr of
Just (cond, t_ip, f_ip) ->
conditionalBranch cond (setRegVal ARM_IP t_ip) (setRegVal ARM_IP f_ip)
Nothing -> return ())
case egs1 of
Left genErr -> failAt gs off curIPAddr (GenerationError i genErr)
Right gs1 -> do
case resState gs1 of
Just preBlock
| Seq.null (resBlockSeq gs1 ^. frontierBlocks)
, v <- preBlock ^. (pBlockState . curIP)
, Just simplifiedIP <- simplifyValue v
, simplifiedIP == nextIPVal
, nextIPOffset < maxOffset
, Just nextIPSegAddr <- MM.asSegmentOff mem nextIP -> do
let preBlock' = (pBlockState . curIP .~ simplifiedIP) preBlock
let gs2 = GenState { assignIdGen = assignIdGen gs
, _blockSeq = resBlockSeq gs1
, _blockState = preBlock'
, genAddr = nextIPSegAddr
, genMemory = mem
}
disassembleBlock lookupSemantics mem gs2 nextIPSegAddr maxOffset
_ -> return (nextIPOffset, finishBlock FetchAndExecute gs1)
-- | Read one instruction from the 'MM.Memory' at the given segmented offset.
--
-- Returns the instruction and number of bytes consumed /or/ an error.
--
-- This code assumes that the 'MM.ByteRegion' is maximal; that is, that there
-- are no byte regions that could be coalesced.
readInstruction :: MM.Memory w
-> MM.MemSegmentOff w
-> Either (MM.MemoryError w) (D.Instruction, MM.MemWord w)
readInstruction mem addr = MM.addrWidthClass (MM.memAddrWidth mem) $ do
let seg = MM.msegSegment addr
case MM.segmentFlags seg `MMP.hasPerm` MMP.execute of
False -> ET.throwError (MM.PermissionsError (MM.relativeSegmentAddr addr))
True -> do
contents <- MM.addrContentsAfter mem (MM.relativeSegmentAddr addr)
case contents of
[] -> ET.throwError (MM.AccessViolation (MM.relativeSegmentAddr addr))
MM.SymbolicRef {} : _ ->
ET.throwError (MM.UnexpectedRelocation (MM.relativeSegmentAddr addr))
MM.ByteRegion bs : _rest
| BS.null bs -> ET.throwError (MM.AccessViolation (MM.relativeSegmentAddr addr))
| otherwise -> do
-- FIXME: Having to wrap the bytestring in a lazy wrapper is
-- unpleasant. We could alter the disassembler to consume strict
-- bytestrings, at the cost of possibly making it less efficient for
-- other clients.
let (bytesRead, minsn) = D.disassembleInstruction (LBS.fromStrict bs)
case minsn of
Just insn -> return (insn, fromIntegral bytesRead)
Nothing -> ET.throwError (MM.InvalidInstruction (MM.relativeSegmentAddr addr) contents)
-- | Examine a value and see if it is a mux; if it is, break the mux up and
-- return its component values (the condition and two alternatives)
matchConditionalBranch :: (ARMArchConstraints arch)
=> Value arch ids tp
-> Maybe (Value arch ids BoolType, Value arch ids tp, Value arch ids tp)
matchConditionalBranch v =
case v of
AssignedValue (Assignment { assignRhs = EvalApp a }) ->
case a of
Mux _rep cond t f -> Just (cond, fromMaybe t (simplifyValue t), fromMaybe f (simplifyValue f))
_ -> Nothing
_ -> Nothing
type LocatedError ppc ids = ([Block ppc ids], MM.MemWord (ArchAddrWidth ppc), TranslationError (ArchAddrWidth ppc))
-- | This is a monad for error handling during disassembly
--
-- It allows for early failure that reports progress (in the form of blocks
-- discovered and the latest address examined) along with a reason for failure
-- (a 'TranslationError').
newtype DisM ppc ids s a = DisM { unDisM :: ET.ExceptT (LocatedError ppc ids) (ST s) a }
deriving (Functor,
Applicative,
Monad)
-- | This funny instance is required because GHC doesn't allow type function
-- applications in instance heads, so we factor the type functions out into a
-- constraint on a fresh variable. See
--
-- > https://stackoverflow.com/questions/45360959/illegal-type-synonym-family-application-in-instance-with-functional-dependency
--
-- We also can't derive this instance because of that restriction (but deriving
-- silently fails).
instance (w ~ ArchAddrWidth ppc) => ET.MonadError ([Block ppc ids], MM.MemWord w, TranslationError w) (DisM ppc ids s) where
throwError e = DisM (ET.throwError e)
catchError a hdlr = do
r <- liftST $ ET.runExceptT (unDisM a)
case r of
Left l -> do
r' <- liftST $ ET.runExceptT (unDisM (hdlr l))
case r' of
Left e -> DisM (ET.throwError e)
Right res -> return res
Right res -> return res
data TranslationError w = TranslationError { transErrorAddr :: MM.MemSegmentOff w
, transErrorReason :: TranslationErrorReason w
}
data TranslationErrorReason w = InvalidNextIP Word64 Word64
| DecodeError (MM.MemoryError w)
| UnsupportedInstruction D.Instruction
| InstructionAtUnmappedAddr D.Instruction
| GenerationError D.Instruction GeneratorError
deriving (Show)
deriving instance (MM.MemWidth w) => Show (TranslationError w)
liftST :: ST s a -> DisM arm ids s a
liftST = DisM . lift
-- | Early failure for 'DisM'. This is a wrapper around 'ET.throwError' that
-- computes the current progress alongside the reason for the failure.
--
-- Note that the 'TranslateError' below is a block terminator marker that notes
-- that translation of the basic block resulted in an error (with the exception
-- string stored as its argument). This allows macaw to carry errors in the
-- instruction stream, which is useful for debugging and diagnostics.
failAt :: forall arm ids s a
. (ArchReg arm ~ ARMReg, MM.MemWidth (ArchAddrWidth arm))
=> GenState arm ids s
-> MM.MemWord (ArchAddrWidth arm)
-> MM.MemSegmentOff (ArchAddrWidth arm)
-> TranslationErrorReason (ArchAddrWidth arm)
-> DisM arm ids s a
failAt gs offset curIPAddr reason = do
let exn = TranslationError { transErrorAddr = curIPAddr
, transErrorReason = reason
}
let term = (`TranslateError` T.pack (show exn))
let b = finishBlock' (gs ^. blockState) term
let res = _blockSeq gs & frontierBlocks %~ (Seq.|> b)
let res' = F.toList (res ^. frontierBlocks)
ET.throwError (res', offset, exn)

View File

@ -10,7 +10,9 @@ import qualified Data.Macaw.Types as MT
import SemMC.ARM ( ARM, Instruction ) import SemMC.ARM ( ARM, Instruction )
import Data.Macaw.SemMC.Generator ( Generator ) import Data.Macaw.SemMC.Generator ( Generator )
-- import SemMC.Architecture.ARM.Opcodes ( allSemantics, allOpcodeInfo ) -- import SemMC.Architecture.ARM.Opcodes ( allSemantics, allOpcodeInfo )
import qualified Dismantle.ARM as D
execInstruction :: MC.Value ARM ids (MT.BVType 32) -> Instruction -> Maybe (Generator ARM ids s ())
execInstruction :: MC.Value ARM ids (MT.BVType 32) -> D.Instruction -> Maybe (Generator ARM ids s ())
execInstruction = undefined execInstruction = undefined
-- execInstruction = $(genExecInstruction (Proxy @ARM) (locToRegTH (Proxy @ARM)) armNonceAppEval armAppEvaluator 'armInstructionMatcher allSemantics allOpcodeInfo) -- execInstruction = $(genExecInstruction (Proxy @ARM) (locToRegTH (Proxy @ARM)) armNonceAppEval armAppEvaluator 'armInstructionMatcher allSemantics allOpcodeInfo)