mirror of
https://github.com/urbit/shrub.git
synced 2024-12-30 15:44:03 +03:00
235 lines
6.1 KiB
Haskell
235 lines
6.1 KiB
Haskell
{-
|
|
Can de-duplication be orthogonal to serialization?
|
|
-}
|
|
|
|
module Data.Noun.Zip where
|
|
|
|
import ClassyPrelude hiding (zip, unzip)
|
|
|
|
import Control.Lens
|
|
import Text.Printf
|
|
import Control.Applicative
|
|
import Data.Noun
|
|
import Data.Noun.Atom
|
|
import Data.Noun.Jam
|
|
import Data.Bits
|
|
import GHC.Generics
|
|
import Test.QuickCheck.Arbitrary
|
|
import Test.QuickCheck.Gen
|
|
import Data.Flat
|
|
import Data.Flat.Bits
|
|
import Data.Either.Extra
|
|
import GHC.Natural
|
|
import Data.Flat
|
|
|
|
import Data.Maybe (fromJust)
|
|
import Data.List (intercalate)
|
|
import Data.Typeable (Typeable)
|
|
|
|
import Control.Monad.State.Strict hiding (forM_, replicateM)
|
|
import Control.Monad.Trans.Maybe
|
|
|
|
import qualified ClassyPrelude
|
|
import qualified Data.Vector as V
|
|
import qualified Data.List as L
|
|
import qualified Data.Vector.Unboxed as UV
|
|
|
|
import Test.Tasty
|
|
import Test.Tasty.TH
|
|
import Test.Tasty.QuickCheck as QC
|
|
import Test.QuickCheck
|
|
|
|
-- Atoms Optimized For Small Values --------------------------------------------
|
|
|
|
data Unary = Z | O Unary
|
|
deriving stock (Eq, Ord, Show, Generic)
|
|
deriving anyclass Flat
|
|
|
|
instance IsAtom Unary where
|
|
toAtom Z = 0
|
|
toAtom (O u) = 1+toAtom u
|
|
fromAtom 0 = Z
|
|
fromAtom n = O (fromAtom (pred n))
|
|
|
|
data ZipAtom
|
|
= ZATiny Unary
|
|
| ZAWide Natural
|
|
deriving stock (Eq, Ord, Generic)
|
|
deriving anyclass Flat
|
|
|
|
instance Show ZipAtom where
|
|
show = show . toAtom
|
|
|
|
instance IsAtom ZipAtom where
|
|
toAtom (ZATiny u) = toAtom u
|
|
toAtom (ZAWide n) = toAtom n + 8
|
|
fromAtom a | a <= 7 = ZATiny (fromAtom a)
|
|
fromAtom (MkAtom n) = ZAWide (n-8)
|
|
|
|
|
|
-- External Types --------------------------------------------------------------
|
|
|
|
data ZipNode
|
|
= ZipAtom !ZipAtom
|
|
| ZipCell !ZipRef !ZipRef
|
|
deriving stock (Eq, Ord, Show, Generic)
|
|
deriving anyclass Flat
|
|
|
|
data ZipRef
|
|
= ZRInline !ZipNode
|
|
| ZRIndex !ZipAtom
|
|
deriving stock (Eq, Ord, Show, Generic)
|
|
deriving anyclass Flat
|
|
|
|
type Zip = ([ZipNode], ZipNode)
|
|
|
|
-- Zip and UnZip ---------------------------------------------------------------
|
|
|
|
refCount :: Noun -> Map Noun Word
|
|
refCount = go mempty
|
|
where
|
|
ins :: Noun -> Map Noun Word -> Map Noun Word
|
|
ins = alterMap (Just . maybe 1 (+1))
|
|
|
|
go :: Map Noun Word -> Noun -> Map Noun Word
|
|
go acc a@(Atom _) = ins a acc
|
|
go acc c@(Cell l r) = go (go (ins c acc) l) r
|
|
|
|
zipTable :: Noun -> (Vector Noun, Map Noun Int)
|
|
zipTable top = (V.fromList tbl, keys)
|
|
where
|
|
keys = mapFromList (ClassyPrelude.zip tbl [0..])
|
|
big = \case Atom a -> a >= 127+8
|
|
_ -> True
|
|
tbl = fmap fst
|
|
$ sortBy (comparing snd)
|
|
$ filter (\(k,v) -> big k && v>1)
|
|
$ mapToList
|
|
$ refCount top
|
|
|
|
zip :: Noun -> Zip
|
|
zip top = (V.toList dups, cvtNode top)
|
|
where
|
|
(tbl, keys) = zipTable top
|
|
dups = cvtNode <$> tbl
|
|
cvtRef n = lookup n keys & \case Nothing -> ZRInline (cvtNode n)
|
|
Just a -> ZRIndex (fromAtom $ toAtom a)
|
|
cvtNode = \case Atom a -> ZipAtom (fromAtom a)
|
|
Cell l r -> ZipCell (cvtRef l) (cvtRef r)
|
|
|
|
unzip :: Zip -> Maybe Noun
|
|
unzip (V.fromList -> dups, top) = recover top
|
|
where
|
|
recover :: ZipNode -> Maybe Noun
|
|
recover (ZipAtom a) = pure (Atom $ toAtom a)
|
|
recover (ZipCell l r) = Cell <$> getRef l <*> getRef r
|
|
|
|
getRef :: ZipRef -> Maybe Noun
|
|
getRef (ZRInline n) = recover n
|
|
getRef (ZRIndex ix) = dups V.!? fromAtom (toAtom ix) >>= recover
|
|
|
|
|
|
-- Tests -----------------------------------------------------------------------
|
|
|
|
compareSize :: Noun -> Int
|
|
compareSize n = flatSz - jamSz
|
|
where
|
|
Buf jamSz _ = fromAtom (jam' n)
|
|
flatSz = length (bits (zip n))
|
|
|
|
compareZipCompression :: Noun -> Int
|
|
compareZipCompression n = zipSz - rawSz
|
|
where
|
|
rawSz = length (bits n)
|
|
zipSz = length (bits (zip n))
|
|
|
|
compareRawToJam :: Noun -> Int
|
|
compareRawToJam n = rawSz - jamSz
|
|
where
|
|
rawSz = length (bits n)
|
|
Buf jamSz _ = fromAtom (jam' n)
|
|
|
|
prop_zipUnzip :: Noun -> Bool
|
|
prop_zipUnzip n = Just n == unzip (zip n)
|
|
|
|
zipFlat :: Noun -> ByteString
|
|
zipFlat = flat . zip
|
|
|
|
unZipFlat :: ByteString -> Maybe Noun
|
|
unZipFlat = (>>= unzip) . eitherToMaybe . unflat
|
|
|
|
prop_zipFlatRoundTrip :: Noun -> Bool
|
|
prop_zipFlatRoundTrip n = Just n == (unZipFlat . zipFlat) n
|
|
|
|
main :: IO ()
|
|
main = $(defaultMainGenerator)
|
|
|
|
dub :: Noun -> Noun
|
|
dub x = Cell x x
|
|
|
|
allAtoms :: Int -> [Noun]
|
|
allAtoms n = Atom . (\n -> 2^n - 1) <$> [0..toAtom n]
|
|
|
|
allCells :: Int -> [Noun]
|
|
allCells 0 = allAtoms 1
|
|
allCells n = do
|
|
a <- Atom <$> [0, (2 ^ toAtom n) - 1]
|
|
c <- allCells (n-1)
|
|
[Cell c a, Cell a c, Cell c c]
|
|
|
|
allNouns :: Int -> [Noun]
|
|
allNouns sz = ordNub (allCells sz)
|
|
|
|
nounSizes :: (Noun -> Int) -> Int -> [(Int, Noun)]
|
|
nounSizes f sz = sort (allNouns sz <&> \n -> (f n, n))
|
|
|
|
jamSz :: Noun -> Int
|
|
jamSz = (\(Buf sz _) -> sz) . fromAtom . jam'
|
|
|
|
showFlatZipSizes :: Int -> IO ()
|
|
showFlatZipSizes dep = traverse_ print (nounSizes (length . bits . zip) dep)
|
|
|
|
showJamSizes :: Int -> IO ()
|
|
showJamSizes dep = traverse_ print (nounSizes jamSz dep)
|
|
|
|
--------------------------------------------------------------------------------
|
|
|
|
sumJamSizes :: Int -> Int
|
|
sumJamSizes dep = sum $ map fst (nounSizes jamSz dep)
|
|
|
|
sumFlatSizes :: Int -> Int
|
|
sumFlatSizes dep = sum $ map fst (nounSizes (length . bits) dep)
|
|
|
|
sumFlatZipSizes :: Int -> Int
|
|
sumFlatZipSizes dep = sum $ map fst (nounSizes (length . bits . zip) dep)
|
|
|
|
--------------------------------------------------------------------------------
|
|
|
|
compareSizes :: (Noun -> Int) -> IO ()
|
|
compareSizes f = do
|
|
nouns <- join <$> (replicateM 100 (sample' (arbitrary :: Gen Noun)) :: IO [[Noun]])
|
|
traverse_ print $ reverse
|
|
$ ordNub
|
|
$ sort
|
|
$ fmap ((`div` 64) . f)
|
|
$ nouns
|
|
-- traverse_ print $ filter ((> 1000) . abs . f) nouns
|
|
|
|
testSizes :: IO ()
|
|
testSizes = compareSizes compareSize
|
|
|
|
testZipCompression :: IO ()
|
|
testZipCompression = compareSizes compareZipCompression
|
|
|
|
testRawToJamSizes :: IO ()
|
|
testRawToJamSizes = compareSizes compareRawToJam
|
|
|
|
allSizeTests :: IO ()
|
|
allSizeTests = do
|
|
putStrLn "zipFlat - jam"
|
|
testSizes
|
|
putStrLn "\nzipFlat - flat"
|
|
testZipCompression
|
|
putStrLn "\nflat - jam"
|
|
testRawToJamSizes
|