From b533f4a92a3ed89aafe82ce4f0f48fd355a089b2 Mon Sep 17 00:00:00 2001 From: Kevin Quick Date: Thu, 4 Jan 2018 17:18:22 -0800 Subject: [PATCH] Added ARM Macaw disassembly functionality. --- macaw-arm/macaw-arm.cabal | 3 + macaw-arm/src/Data/Macaw/ARM.hs | 5 +- macaw-arm/src/Data/Macaw/ARM/Disassemble.hs | 292 ++++++++++++++++++ .../Data/Macaw/ARM/Semantics/ARMSemantics.hs | 4 +- 4 files changed, 302 insertions(+), 2 deletions(-) create mode 100644 macaw-arm/src/Data/Macaw/ARM/Disassemble.hs diff --git a/macaw-arm/macaw-arm.cabal b/macaw-arm/macaw-arm.cabal index fe6978e4..378ebc3e 100644 --- a/macaw-arm/macaw-arm.cabal +++ b/macaw-arm/macaw-arm.cabal @@ -16,6 +16,7 @@ library exposed-modules: Data.Macaw.ARM , Data.Macaw.ARM.Arch , Data.Macaw.ARM.ARMReg + , Data.Macaw.ARM.Disassemble , Data.Macaw.ARM.Eval , Data.Macaw.ARM.BinaryFormat.ELF , Data.Macaw.ARM.Semantics.ARMSemantics @@ -31,8 +32,10 @@ library , lens , macaw-base , macaw-semmc + , mtl , parameterized-utils , semmc-arm + , text , vector -- build-tools: arm-none-eabi-gcc hs-source-dirs: src diff --git a/macaw-arm/src/Data/Macaw/ARM.hs b/macaw-arm/src/Data/Macaw/ARM.hs index ba431ef9..18b3bb41 100644 --- a/macaw-arm/src/Data/Macaw/ARM.hs +++ b/macaw-arm/src/Data/Macaw/ARM.hs @@ -12,7 +12,10 @@ module Data.Macaw.ARM ) where + +import Data.Macaw.ARM.Disassemble ( disassembleFn ) import Data.Macaw.ARM.Eval +import qualified Data.Macaw.ARM.Semantics.ARMSemantics as ARMSem import qualified Data.Macaw.Architecture.Info as MI import qualified Data.Macaw.Memory as MM import Data.Proxy ( Proxy(..) ) @@ -29,7 +32,7 @@ arm_linux_info = , MI.archAddrWidth = MM.Addr32 , MI.archEndianness = MM.LittleEndian , MI.jumpTableEntrySize = 0 -- undefined -- jumpTableEntrySize proxy - , MI.disassembleFn = undefined -- disassembleFn proxy ARMSem.execInstruction + , MI.disassembleFn = disassembleFn proxy ARMSem.execInstruction , MI.mkInitialAbsState = mkInitialAbsState proxy , MI.absEvalArchFn = undefined -- absEvalArchFn proxy , MI.absEvalArchStmt = undefined -- absEvalArchStmt proxy diff --git a/macaw-arm/src/Data/Macaw/ARM/Disassemble.hs b/macaw-arm/src/Data/Macaw/ARM/Disassemble.hs new file mode 100644 index 00000000..a84b465d --- /dev/null +++ b/macaw-arm/src/Data/Macaw/ARM/Disassemble.hs @@ -0,0 +1,292 @@ +{-# LANGUAGE DeriveFunctor #-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE GeneralizedNewtypeDeriving #-} +{-# LANGUAGE MultiParamTypeClasses #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE StandaloneDeriving #-} +{-# LANGUAGE TypeFamilies #-} +{-# LANGUAGE TypeApplications #-} +{-# LANGUAGE UndecidableInstances #-} + +module Data.Macaw.ARM.Disassemble + ( disassembleFn + ) + where + +import Control.Lens ( (&), (^.), (%~), (.~) ) +import Control.Monad ( unless ) +import qualified Control.Monad.Except as ET +import Control.Monad.ST ( ST ) +import Control.Monad.Trans ( lift ) +import qualified Data.ByteString as BS +import qualified Data.ByteString.Lazy as LBS +import qualified Data.Foldable as F +import Data.Macaw.ARM.ARMReg +import Data.Macaw.ARM.Arch ( ARMArchConstraints ) +import Data.Macaw.AbsDomain.AbsState as MA +import Data.Macaw.CFG +import Data.Macaw.CFG.Block +import qualified Data.Macaw.CFG.Core as MC +import qualified Data.Macaw.Memory as MM +import qualified Data.Macaw.Memory.Permissions as MMP +import Data.Macaw.SemMC.Generator +import Data.Macaw.SemMC.Simplify ( simplifyValue ) +import Data.Macaw.Types -- ( BVType, BoolType ) +import Data.Maybe ( fromMaybe ) +import qualified Data.Parameterized.Nonce as NC +import Data.Proxy ( Proxy(..) ) +import qualified Data.Sequence as Seq +import qualified Data.Text as T +import Data.Word ( Word64 ) +import qualified Dismantle.ARM as D +import Text.Printf ( printf ) + + +-- | Disassemble a block from the given start address (which points into the +-- 'MM.Memory'). +-- +-- Return a list of disassembled blocks as well as the total number of bytes +-- occupied by those blocks. +disassembleFn :: (ARMArchConstraints arm) + => proxy arm + -> (Value arm ids (BVType (ArchAddrWidth arm)) -> D.Instruction -> Maybe (Generator arm ids s ())) + -- ^ A function to look up the semantics for an instruction. The + -- lookup is provided with the value of the IP in case IP-relative + -- addressing is necessary. + -> MM.Memory (ArchAddrWidth arm) + -- ^ The mapped memory space + -> NC.NonceGenerator (ST s) ids + -- ^ A generator of unique IDs used for assignments + -> ArchSegmentOff arm + -- ^ The address to disassemble from + -> ArchAddrWord arm + -- ^ Maximum size of the block (a safeguard) + -> MA.AbsBlockState (ArchReg arm) + -- ^ Abstract state of the processor at the start of the block + -> ST s ([Block arm ids], MM.MemWord (ArchAddrWidth arm), Maybe String) +disassembleFn _ lookupSemantics mem nonceGen startAddr maxSize _ = do + mr <- ET.runExceptT (unDisM (tryDisassembleBlock lookupSemantics mem nonceGen startAddr maxSize)) + case mr of + Left (blocks, off, exn) -> return (blocks, off, Just (show exn)) + Right (blocks, bytes) -> return (blocks, bytes, Nothing) + +tryDisassembleBlock :: (ARMArchConstraints arm) + => (Value arm ids (BVType (ArchAddrWidth arm)) -> D.Instruction -> Maybe (Generator arm ids s ())) + -> MM.Memory (ArchAddrWidth arm) + -> NC.NonceGenerator (ST s) ids + -> ArchSegmentOff arm + -> ArchAddrWord arm + -> DisM arm ids s ([Block arm ids], MM.MemWord (ArchAddrWidth arm)) +tryDisassembleBlock lookupSemantics mem nonceGen startAddr maxSize = do + let gs0 = initGenState nonceGen mem startAddr (initRegState startAddr) + let startOffset = MM.msegOffset startAddr + (nextIPOffset, blocks) <- disassembleBlock lookupSemantics mem gs0 startAddr (startOffset + maxSize) + unless (nextIPOffset > startOffset) $ do + let reason = InvalidNextIP (fromIntegral nextIPOffset) (fromIntegral startOffset) + failAt gs0 nextIPOffset startAddr reason + return (F.toList (blocks ^. frontierBlocks), nextIPOffset - startOffset) + + + +-- | Disassemble an instruction and terminate the current block if we run out of +-- instructions to disassemble. We can run out if: +-- +-- 1) We exceed the offset that macaw has told us to disassemble to +-- +-- 2) We can't decode the IP (i.e., it isn't a constant) +-- +-- 3) The IP after executing the semantics transformer is not equal to the +-- expected next IP value, which indicates a jump to another block or +-- function +-- +-- In most of those cases, we end the block with a simple terminator. If the IP +-- becomes a mux, we split execution using 'conditionalBranch'. +disassembleBlock :: forall arm ids s + . ARMArchConstraints arm + => (Value arm ids (BVType (ArchAddrWidth arm)) -> D.Instruction -> Maybe (Generator arm ids s ())) + -- ^ A function to look up the semantics for an instruction that we disassemble + -> MM.Memory (ArchAddrWidth arm) + -> GenState arm ids s + -> MM.MemSegmentOff (ArchAddrWidth arm) + -- ^ The current instruction pointer + -> MM.MemWord (ArchAddrWidth arm) + -- ^ The maximum offset into the bytestring that we should + -- disassemble to; in principle, macaw can tell us to limit our + -- search with this. + -> DisM arm ids s (MM.MemWord (ArchAddrWidth arm), BlockSeq arm ids) +disassembleBlock lookupSemantics mem gs curIPAddr maxOffset = do + let seg = MM.msegSegment curIPAddr + let off = MM.msegOffset curIPAddr + case readInstruction mem curIPAddr of + Left err -> failAt gs off curIPAddr (DecodeError err) + Right (i, bytesRead) -> do +-- traceM ("II: " ++ show i) + let nextIPOffset = off + bytesRead + nextIP = MM.relativeAddr seg nextIPOffset + -- nextIPVal = MC.RelocatableValue (pointerNatRepr (Proxy @arm)) nextIP + -- nextIPVal = MC.RelocatableValue (knownNat :: (BVType (ArchAddrWidth arm))) nextIP + nextIPVal = MC.RelocatableValue (knownNat :: NatRepr (ArchAddrWidth arm)) nextIP + -- Note: In ARM, the IP is incremented *after* an instruction + -- executes; pass in the -- physical address of the instruction here. + ipVal <- case MM.asAbsoluteAddr (MM.relativeSegmentAddr curIPAddr) of + Nothing -> failAt gs off curIPAddr (InstructionAtUnmappedAddr i) + Just addr -> return (BVValue (knownNat :: NatRepr (ArchAddrWidth arm)) (fromIntegral addr)) + case lookupSemantics ipVal i of + Nothing -> failAt gs off curIPAddr (UnsupportedInstruction i) + Just transformer -> do + -- Once we have the semantics for the instruction (represented by a + -- state transformer), we apply the state transformer and then extract + -- a result from the state of the 'Generator'. + egs1 <- liftST $ ET.runExceptT (runGenerator genResult gs $ do + let lineStr = printf "%s: %s" (show curIPAddr) (show (D.ppInstruction i)) + addStmt (Comment (T.pack lineStr)) + transformer + + -- Check to see if the IP has become conditionally-defined (by e.g., + -- a mux). If it has, we need to split execution using a primitive + -- provided by the Generator monad. + nextIPExpr <- getRegValue ARM_IP + case matchConditionalBranch nextIPExpr of + Just (cond, t_ip, f_ip) -> + conditionalBranch cond (setRegVal ARM_IP t_ip) (setRegVal ARM_IP f_ip) + Nothing -> return ()) + case egs1 of + Left genErr -> failAt gs off curIPAddr (GenerationError i genErr) + Right gs1 -> do + case resState gs1 of + Just preBlock + | Seq.null (resBlockSeq gs1 ^. frontierBlocks) + , v <- preBlock ^. (pBlockState . curIP) + , Just simplifiedIP <- simplifyValue v + , simplifiedIP == nextIPVal + , nextIPOffset < maxOffset + , Just nextIPSegAddr <- MM.asSegmentOff mem nextIP -> do + let preBlock' = (pBlockState . curIP .~ simplifiedIP) preBlock + let gs2 = GenState { assignIdGen = assignIdGen gs + , _blockSeq = resBlockSeq gs1 + , _blockState = preBlock' + , genAddr = nextIPSegAddr + , genMemory = mem + } + disassembleBlock lookupSemantics mem gs2 nextIPSegAddr maxOffset + + _ -> return (nextIPOffset, finishBlock FetchAndExecute gs1) + +-- | Read one instruction from the 'MM.Memory' at the given segmented offset. +-- +-- Returns the instruction and number of bytes consumed /or/ an error. +-- +-- This code assumes that the 'MM.ByteRegion' is maximal; that is, that there +-- are no byte regions that could be coalesced. +readInstruction :: MM.Memory w + -> MM.MemSegmentOff w + -> Either (MM.MemoryError w) (D.Instruction, MM.MemWord w) +readInstruction mem addr = MM.addrWidthClass (MM.memAddrWidth mem) $ do + let seg = MM.msegSegment addr + case MM.segmentFlags seg `MMP.hasPerm` MMP.execute of + False -> ET.throwError (MM.PermissionsError (MM.relativeSegmentAddr addr)) + True -> do + contents <- MM.addrContentsAfter mem (MM.relativeSegmentAddr addr) + case contents of + [] -> ET.throwError (MM.AccessViolation (MM.relativeSegmentAddr addr)) + MM.SymbolicRef {} : _ -> + ET.throwError (MM.UnexpectedRelocation (MM.relativeSegmentAddr addr)) + MM.ByteRegion bs : _rest + | BS.null bs -> ET.throwError (MM.AccessViolation (MM.relativeSegmentAddr addr)) + | otherwise -> do + -- FIXME: Having to wrap the bytestring in a lazy wrapper is + -- unpleasant. We could alter the disassembler to consume strict + -- bytestrings, at the cost of possibly making it less efficient for + -- other clients. + let (bytesRead, minsn) = D.disassembleInstruction (LBS.fromStrict bs) + case minsn of + Just insn -> return (insn, fromIntegral bytesRead) + Nothing -> ET.throwError (MM.InvalidInstruction (MM.relativeSegmentAddr addr) contents) + +-- | Examine a value and see if it is a mux; if it is, break the mux up and +-- return its component values (the condition and two alternatives) +matchConditionalBranch :: (ARMArchConstraints arch) + => Value arch ids tp + -> Maybe (Value arch ids BoolType, Value arch ids tp, Value arch ids tp) +matchConditionalBranch v = + case v of + AssignedValue (Assignment { assignRhs = EvalApp a }) -> + case a of + Mux _rep cond t f -> Just (cond, fromMaybe t (simplifyValue t), fromMaybe f (simplifyValue f)) + _ -> Nothing + _ -> Nothing + + +type LocatedError ppc ids = ([Block ppc ids], MM.MemWord (ArchAddrWidth ppc), TranslationError (ArchAddrWidth ppc)) +-- | This is a monad for error handling during disassembly +-- +-- It allows for early failure that reports progress (in the form of blocks +-- discovered and the latest address examined) along with a reason for failure +-- (a 'TranslationError'). +newtype DisM ppc ids s a = DisM { unDisM :: ET.ExceptT (LocatedError ppc ids) (ST s) a } + deriving (Functor, + Applicative, + Monad) + +-- | This funny instance is required because GHC doesn't allow type function +-- applications in instance heads, so we factor the type functions out into a +-- constraint on a fresh variable. See +-- +-- > https://stackoverflow.com/questions/45360959/illegal-type-synonym-family-application-in-instance-with-functional-dependency +-- +-- We also can't derive this instance because of that restriction (but deriving +-- silently fails). +instance (w ~ ArchAddrWidth ppc) => ET.MonadError ([Block ppc ids], MM.MemWord w, TranslationError w) (DisM ppc ids s) where + throwError e = DisM (ET.throwError e) + catchError a hdlr = do + r <- liftST $ ET.runExceptT (unDisM a) + case r of + Left l -> do + r' <- liftST $ ET.runExceptT (unDisM (hdlr l)) + case r' of + Left e -> DisM (ET.throwError e) + Right res -> return res + Right res -> return res + + +data TranslationError w = TranslationError { transErrorAddr :: MM.MemSegmentOff w + , transErrorReason :: TranslationErrorReason w + } + +data TranslationErrorReason w = InvalidNextIP Word64 Word64 + | DecodeError (MM.MemoryError w) + | UnsupportedInstruction D.Instruction + | InstructionAtUnmappedAddr D.Instruction + | GenerationError D.Instruction GeneratorError + deriving (Show) + +deriving instance (MM.MemWidth w) => Show (TranslationError w) + +liftST :: ST s a -> DisM arm ids s a +liftST = DisM . lift + + +-- | Early failure for 'DisM'. This is a wrapper around 'ET.throwError' that +-- computes the current progress alongside the reason for the failure. +-- +-- Note that the 'TranslateError' below is a block terminator marker that notes +-- that translation of the basic block resulted in an error (with the exception +-- string stored as its argument). This allows macaw to carry errors in the +-- instruction stream, which is useful for debugging and diagnostics. +failAt :: forall arm ids s a + . (ArchReg arm ~ ARMReg, MM.MemWidth (ArchAddrWidth arm)) + => GenState arm ids s + -> MM.MemWord (ArchAddrWidth arm) + -> MM.MemSegmentOff (ArchAddrWidth arm) + -> TranslationErrorReason (ArchAddrWidth arm) + -> DisM arm ids s a +failAt gs offset curIPAddr reason = do + let exn = TranslationError { transErrorAddr = curIPAddr + , transErrorReason = reason + } + let term = (`TranslateError` T.pack (show exn)) + let b = finishBlock' (gs ^. blockState) term + let res = _blockSeq gs & frontierBlocks %~ (Seq.|> b) + let res' = F.toList (res ^. frontierBlocks) + ET.throwError (res', offset, exn) diff --git a/macaw-arm/src/Data/Macaw/ARM/Semantics/ARMSemantics.hs b/macaw-arm/src/Data/Macaw/ARM/Semantics/ARMSemantics.hs index 6697a6c6..4ded2e34 100644 --- a/macaw-arm/src/Data/Macaw/ARM/Semantics/ARMSemantics.hs +++ b/macaw-arm/src/Data/Macaw/ARM/Semantics/ARMSemantics.hs @@ -10,7 +10,9 @@ import qualified Data.Macaw.Types as MT import SemMC.ARM ( ARM, Instruction ) import Data.Macaw.SemMC.Generator ( Generator ) -- import SemMC.Architecture.ARM.Opcodes ( allSemantics, allOpcodeInfo ) +import qualified Dismantle.ARM as D -execInstruction :: MC.Value ARM ids (MT.BVType 32) -> Instruction -> Maybe (Generator ARM ids s ()) + +execInstruction :: MC.Value ARM ids (MT.BVType 32) -> D.Instruction -> Maybe (Generator ARM ids s ()) execInstruction = undefined -- execInstruction = $(genExecInstruction (Proxy @ARM) (locToRegTH (Proxy @ARM)) armNonceAppEval armAppEvaluator 'armInstructionMatcher allSemantics allOpcodeInfo)