Jam: much smarter preJam pass.

2024-12-20 17:32:11 +03:00 · 2019-06-30 19:30:23 -07:00 · 2019-06-30 19:30:23 -07:00 · 89b2cccae7
commit 89b2cccae7
parent d445c1cbb1
3 changed files with 218 additions and 56 deletions
--- a/pkg/hs-urbit/lib/Data/Noun/Jam.hs
+++ b/pkg/hs-urbit/lib/Data/Noun/Jam.hs
@ -7,8 +7,9 @@ import Data.Bits
 import Control.Lens
 import Text.Printf
 import Data.Map      (Map)
 import Control.Monad (guard)
 import Data.Map      (Map)
 import Text.Printf   (printf)
 import Test.Tasty
 import Test.Tasty.TH
@ -169,11 +170,11 @@ cue buf = view _2 <$> go mempty 0
      case (bitIdx i buf, bitIdx (i+1) buf) of
        (False, _     ) -> do Buf wid at <- rub (Cursor (i+1) buf)
                              let r = Atom at
-                              pure (1+wid, r, insertMap i r tbl)
+                              pure (1+wid, r, trace (show ('c', i, r)) $ insertMap i r tbl)
        (True,  False ) -> do (lSz,lef,tbl) <- go tbl (i+2)
                              (rSz,rit,tbl) <- go tbl (i+2+fromIntegral lSz)
                              let r = Cell lef rit
-                              pure (2+lSz+rSz, r, insertMap i r tbl)
+                              pure (2+lSz+rSz, r, trace (show ('c', i, r)) $ insertMap i r tbl)
        (True,  True  ) -> do Buf wid at <- rub (Cursor (i+2) buf)
                              r <- lookup (fromIntegral at) tbl & \case
                                     Nothing -> error ("bad-ref-" <> show at)
@ -203,38 +204,45 @@ pills = [ 0x2, 0xc, 0x48, 0x29, 0xc9, 0x299
        , 0x3170_c7c1, 0x93_c7c1, 0xa_72e0, 0x1bd5_b7dd_e080
        ]
-cueTest :: Maybe [Noun]
+-- cueTest :: Maybe [Noun]
-cueTest = traverse cue pills
+-- cueTest = traverse cue pills
-jamTest :: Maybe [Atom]
+-- jamTest :: Maybe [Atom]
-jamTest = fmap jam <$> cueTest
+-- jamTest = fmap jam <$> cueTest
-prop_fastMatSlow :: Atom -> Bool
+-- prop_fastMatSlow :: Atom -> Bool
-prop_fastMatSlow a = jam (Atom a) == Fast.jam (Atom a)
+-- prop_fastMatSlow a = jam (Atom a) == Fast.jam (Atom a)
-prop_fastJamSlow :: Noun -> Bool
+-- prop_fastJamSlow :: Noun -> Bool
-prop_fastJamSlow n = jam n == Fast.jam n
+-- prop_fastJamSlow n = jam n == Fast.jam n
 prop_fastJam :: Noun -> Bool
 prop_fastJam n = Just n == cue (Fast.jam n)
-prop_jamCue :: Noun -> Bool
+-- prop_jamCue :: Noun -> Bool
-prop_jamCue n = Just n == cue (jam n)
+-- prop_jamCue n = Just n == cue (jam n)
-prop_matRub :: Atom -> Bool
+-- prop_matRub :: Atom -> Bool
-prop_matRub atm = matSz==rubSz && rubRes==atm
+-- prop_matRub atm = matSz==rubSz && rubRes==atm
-  where
+  -- where
-    Buf matSz matBuf = mat atm
+    -- Buf matSz matBuf = mat atm
-    Buf rubSz rubRes = fromMaybe mempty (rub $ Cursor 0 matBuf)
+    -- Buf rubSz rubRes = fromMaybe mempty (rub $ Cursor 0 matBuf)
-prop_jamCue' :: Noun -> Bool
+-- prop_jamCue' :: Noun -> Bool
-prop_jamCue' n = Just n == cue' (jam' n)
+-- prop_jamCue' n = Just n == cue' (jam' n)
-prop_matRub' :: Atom -> Bool
+-- prop_matRub' :: Atom -> Bool
-prop_matRub' atm = matSz==rubSz && rubRes==atm
+-- prop_matRub' atm = matSz==rubSz && rubRes==atm
-  where
+  -- where
-    Buf matSz matBuf = mat' atm
+    -- Buf matSz matBuf = mat' atm
-    Buf rubSz rubRes = fromMaybe mempty (rub' $ Cursor 0 matBuf)
+    -- Buf rubSz rubRes = fromMaybe mempty (rub' $ Cursor 0 matBuf)
 main :: IO ()
 main = $(defaultMainGenerator)
 matSz' :: Atom -> Int
 matSz' a = length s - 1
  where
    s :: String
    s = printf "%b" $ fromIntegral @Atom @Integer $ jam $ Atom a
--- a/pkg/hs-urbit/lib/Data/Noun/Jam/Put.hs
+++ b/pkg/hs-urbit/lib/Data/Noun/Jam/Put.hs
@ -9,7 +9,7 @@ import GHC.Integer.GMP.Internals
 import Control.Lens          (view, to, from, (&))
 import Control.Monad         (guard)
-import Data.Bits             (shiftL, shiftR, setBit, clearBit, (.|.), (.&.))
+import Data.Bits             (shiftL, shiftR, setBit, clearBit, xor, (.|.), (.&.))
 import Data.Map              (Map)
 import Data.Noun.Atom        ( Atom(MkAtom), wordBitWidth, wordBitWidth#
                             , atomBitWidth#, takeBitsWord )
@ -24,6 +24,8 @@ import GHC.Int               (Int(I#))
 import GHC.Word              (Word(W#))
 import System.IO.Unsafe      (unsafePerformIO)
 import qualified Data.Hashable          as Hash
 import qualified Data.Map               as M
 import qualified Data.ByteString.Unsafe as BS
 import qualified Data.HashTable.IO      as H
 import qualified Data.Vector.Primitive  as VP
@ -57,13 +59,15 @@ newtype Put a = Put
 --------------------------------------------------------------------------------
 {-# INLINE insRef #-}
 insRef :: Noun -> Word -> Put ()
 insRef n w = Put \tbl s -> PutResult s <$> H.insert tbl n w
 {-# INLINE getRef #-}
 getRef :: Noun -> Put (Maybe Word)
-getRef n = Put \tbl s -> PutResult s <$> H.lookup tbl n
+getRef n = Put \tbl s -> do
    pos <- pure (pos s)
    traceM ("getRef: " <> show n <> " @" <> show pos)
    res <- H.lookup tbl n
    pure $ PutResult s $ case res of
                           Just w | w<pos -> Just w
                           _              -> Nothing
 {-
  1. Write the register to the output, and increment the output pointer.
@ -236,10 +240,10 @@ instance Monad Put where
 --------------------------------------------------------------------------------
-doPut :: Word -> Put () -> ByteString
+doPut :: Map Noun Word -> Word -> Put () -> ByteString
-doPut sz m =
+doPut tbl sz m =
    unsafePerformIO $ do
-        tbl <- H.new
+        tbl <- H.fromListWithSizeHint (M.size tbl) (mapToList tbl)
        buf <- callocBytes (fromIntegral $ 4 * wordSz*8)
        _   <- runPut (m >> mbFlush) tbl (S buf 0 0 0)
        BS.unsafePackCStringFinalizer (castPtr buf) (2*byteSz) (free buf)
@ -263,7 +267,6 @@ writeNoun :: Noun -> Put ()
 writeNoun n = do
    -- traceM "writeNoun"
    p    <- pos <$> getS
    mRef <- getRef n
    case (mRef, n) of
@ -272,9 +275,6 @@ writeNoun n = do
        (Just bk, Atom a) | bitWidth a <= wordBitWidth bk -> writeAtom a
        (Just bk, _)                                      -> writeBackRef bk
    when (mRef == Nothing) $
        insRef n p
 {-# INLINE writeMat #-}
 writeMat :: Atom -> Put ()
 writeMat 0 = do
@ -308,7 +308,8 @@ writeAtom a = do
 {-# INLINE writeBackRef #-}
 writeBackRef :: Word -> Put ()
 writeBackRef a = do
-    -- traceM ("writeBackRef: " <> show a)
+    p <- pos <$> getS
    traceM ("writeBackRef: " <> show a <> " @" <> show p)
    writeBit True
    writeBit True
    writeMat (toAtom a)
@ -316,13 +317,31 @@ writeBackRef a = do
 --------------------------------------------------------------------------------
 jamBS :: Noun -> ByteString
-jamBS n = doPut (fst $ preJam n) (writeNoun n)
+jamBS n = trace (show $ sort $ swap <$> mapToList tbl)
        $ doPut tbl sz (writeNoun n)
  where (sz, tbl) = preJam n
 jam :: Noun -> Atom
 jam = view (to jamBS . from atomBS)
 --------------------------------------------------------------------------------
 {-# INLINE matSz #-}
 matSz :: Atom -> Word
 matSz a = W# (matSz# a)
 {-# INLINE matSz# #-}
 matSz# :: Atom -> Word#
 matSz# 0 = 1##
 matSz# a = preW `plusWord#` preW `plusWord#` atmW
  where
    atmW = atomBitWidth# a
    preW = wordBitWidth# atmW
 {-# INLINE refSz# #-}
 refSz# :: Word# -> Word#
 refSz# w = 2## `plusWord#` (matSz# (MkAtom (NatS# w)))
 preJam :: Noun -> (Word, Map Noun Word)
 preJam = go 0 mempty
  where
@ -332,12 +351,12 @@ preJam = go 0 mempty
    go :: Word -> Map Noun Word -> Noun -> (Word, Map Noun Word)
    go off oldTbl noun =
      let tbl = insertNoun noun off oldTbl in
      case lookup noun oldTbl of
        Nothing ->
          let tbl = insertNoun noun off oldTbl in
          case noun of
            Atom atm ->
-              (1 + W# (matSz# atm), tbl)
+              (1 + matSz atm, tbl)
            Cell l r ->
              let (lSz, tbl')  = go (2+off)     tbl  l in
              let (rSz, tbl'') = go (2+off+lSz) tbl' r in
@ -346,23 +365,157 @@ preJam = go 0 mempty
          let refSz = W# (wordBitWidth# ref) in
          case noun of
            Atom atm ->
-              let worSz = W# (matSz# atm) in
+              let worSz = matSz atm in
              if worSz > refSz
              then (2 + refSz, oldTbl)
-              else (1 + worSz, tbl)
+              else (1 + worSz, oldTbl)
            Cell _ _ ->
              (2 + refSz, oldTbl)
    matSz# :: Atom -> Word#
    matSz# 0 = 1##
    matSz# a = preW `plusWord#` preW `plusWord#` atmW
      where
        atmW = atomBitWidth# a
        preW = wordBitWidth# atmW
-    refSz# :: Word# -> Word#
+-- Fast Pre-Jam ----------------------------------------------------------------
    refSz# w = 2## `plusWord#` (matSz# (MkAtom (NatS# w)))
-    nounSz# :: Noun -> Word#
+{-
-    nounSz# (Atom a)   = 1## `plusWord#` (matSz# a)
+    An `SHN` is a noun and some pre-computed information.
-    nounSz# (Cell l r) = 2## `plusWord#` (nounSz# l) `plusWord#` (nounSz# r)
+
    - `size` is the serialized size without backreferences, we use this
      for fast equality checks.
    - `jmSz` is the serialized size, we use this to allocate a buffer
      at the end.
    - `hash` is a precomputed noun hash. We use this to get better,
      cheaper hashes for our hashtable.
    - `noun` is the actual noun.
 -}
 data SHN = SHN
    { size :: {-# UNPACK #-} !Word
    , jmSz :: {-# UNPACK #-} !Word
    , hash :: {-# UNPACK #-} !Int
    , noun :: {-# UNPACK #-} !Noun
    }
  deriving (Show)
 instance Hashable SHN where
  hash (SHN _ _ h _) = h
  {-# INLINE hash #-}
  hashWithSalt     = defaultHashWithSalt
  {-# INLINE hashWithSalt #-}
 instance Eq SHN where
  x == y = (size x == size y) && (noun x == noun y)
 preJam' :: Noun -> IO (SHN, H.LinearHashTable Word Word)
 preJam' top = do
    nodes :: H.LinearHashTable SHN  Word <- H.new
    backs :: H.LinearHashTable Word Word <- H.new
    let goAtom :: Word -> Atom -> IO SHN
        goAtom pos a@(MkAtom nat) = do
            let atmSz = matSz a
            let res   = SHN (1+atmSz) (1+atmSz) (Hash.hash nat) (Atom a)
            H.lookup nodes res >>= \case
                Nothing -> do
                    H.insert nodes res pos
                    pure (traceShowId res)
                Just bak -> do
                    let refSz = matSz (toAtom bak)
                    if refSz < atmSz
                    then do H.insert backs pos bak
                            pure (traceShowId (res{jmSz=2+refSz}))
                    else pure (traceShowId res)
        goCell :: Word -> Noun -> Noun -> IO SHN
        goCell pos h t = do
            SHN hSz hJmSz hHash _ <- go (pos+2) h
            SHN tSz tJmSz tHash _ <- go (pos+2+hSz) t
            let sz   = 2+hSz+tSz
            let jmSz = 2+hJmSz+tJmSz
            let res  = SHN sz jmSz (combine hHash tHash) (Cell h t)
            H.lookup nodes res >>= \case
                Nothing -> do
                    H.insert nodes res pos
                    pure (traceShowId res)
                Just bak -> do
                    let refSz = matSz (toAtom bak)
                    H.insert backs pos bak
                    pure (traceShowId (res{jmSz=2+refSz}))
        go :: Word -> Noun -> IO SHN
        go p (Atom a)   = goAtom p a
        go p (Cell h t) = goCell p h t
    res <- go 0 top
    pure (res, backs)
 -- Stolen from Hashable Library ------------------------------------------------
 {-# INLINE combine #-}
 combine :: Int -> Int -> Int
 combine h1 h2 = (h1 * 16777619) `xor` h2
 {-# INLINE defaultHashWithSalt #-}
 defaultHashWithSalt :: Hashable a => Int -> a -> Int
 defaultHashWithSalt salt x = salt `combine` Hash.hash x
 {-
    I suspect that hashing these big atoms recursively is going to be the bottleneck:
      Unless you have a good hashing system.
        Which we totally do in the nock runtime.
      Checking the hash for the top-level node precomputes the hashes for
        everything else, recursively.
      This is really smart.
        Maybe I could implement this as well?
          But hashing traverses the whole structure.
        So, now we have
          1. precompute hashes.
          2. precompute size and backref table.
          3. serialize
        This seems excessive.
          We insert into the backref table right away, but actually:
            Backreferences can't exist until the whole node is processed.
          Which implies a smarter algorithm:
            - Setup a atom dup table
                atoms :: Hashtable BigNum Word
            - Setup a cell dup table
                cells :: Hashtable (Noun, Noun) Word
            - Setup a backref table (map from dup. pos to orig. pos)
                backs :: Hashtable Word Word
            - go :: Noun -> ST s (Hash, Word)
              - If atom,
                - Compute size and hash
                - Check atom table for backref
                - If atom in `atoms` table:
                  - If backref smaller than atom
                    - Insert (pos, bak) into `backs` table.
                    - Return (backref size, atom hash)
                  - If backref not smaller than atom
                    - Return (atom size, atom hash)
                - Otherwise:
                  - Insert atom into `atoms` table.
                  - Return (atom size, atom hash)
              - If cell
                - process head
                - process tail
                - produce size+hash from results
                - Check cell table for backref
                - If backref exists
                  - Insert `(pos, bak)` into `backs` table
                  - Return (backref size, cell hash)
                - Else
                  - Return (cell size, cell hash)
          Then, to serialize:
            - Allocate a buffer of `size` bits
            - If current pos in `backs` table:
              - Write `11`
              - Write backref (mat)
            - Otherwise:
              - If Atom:
                - Write `0`
                - Write atom (mat)
              - If Cell
                - Write `10`
                - Write head
                - Write tail
 -}
--- a/pkg/hs-urbit/package.yaml
+++ b/pkg/hs-urbit/package.yaml
@ -21,6 +21,7 @@ dependencies:
  - flat
  - ghc-prim
  - hashable
  - hashable
  - hashtables
  - http-client
  - http-types