urbit/pkg/hs-urbit/lib/Data/Noun/Zip.hs

235 lines
6.1 KiB
Haskell
Raw Normal View History

2019-05-15 04:30:44 +03:00
{-
Can de-duplication be orthogonal to serialization?
-}
module Data.Noun.Zip where
2019-05-15 08:09:53 +03:00
import ClassyPrelude hiding (zip, unzip)
2019-05-15 04:30:44 +03:00
2019-05-15 08:09:53 +03:00
import Control.Lens
import Text.Printf
2019-05-15 04:30:44 +03:00
import Control.Applicative
import Data.Noun
import Data.Noun.Atom
2019-05-15 08:09:53 +03:00
import Data.Noun.Jam
2019-05-15 04:30:44 +03:00
import Data.Bits
import GHC.Generics
import Test.QuickCheck.Arbitrary
import Test.QuickCheck.Gen
import Data.Flat
import Data.Flat.Bits
2019-05-17 09:39:07 +03:00
import Data.Either.Extra
2019-05-18 02:02:39 +03:00
import GHC.Natural
import Data.Flat
2019-05-15 04:30:44 +03:00
import Data.Maybe (fromJust)
2019-05-15 04:30:44 +03:00
import Data.List (intercalate)
import Data.Typeable (Typeable)
2019-05-17 09:39:07 +03:00
import Control.Monad.State.Strict hiding (forM_, replicateM)
2019-05-15 08:09:53 +03:00
import Control.Monad.Trans.Maybe
import qualified ClassyPrelude
import qualified Data.Vector as V
import qualified Data.List as L
import qualified Data.Vector.Unboxed as UV
2019-05-15 08:09:53 +03:00
import Test.Tasty
import Test.Tasty.TH
import Test.Tasty.QuickCheck as QC
import Test.QuickCheck
2019-05-15 04:30:44 +03:00
2019-05-18 02:02:39 +03:00
-- Atoms Optimized For Small Values --------------------------------------------
data Unary = Z | O Unary
deriving stock (Eq, Ord, Show, Generic)
deriving anyclass Flat
instance IsAtom Unary where
toAtom Z = 0
toAtom (O u) = 1+toAtom u
fromAtom 0 = Z
fromAtom n = O (fromAtom (pred n))
data ZipAtom
= ZATiny Unary
| ZAWide Natural
deriving stock (Eq, Ord, Generic)
2019-05-18 02:02:39 +03:00
deriving anyclass Flat
instance Show ZipAtom where
show = show . toAtom
2019-05-18 02:02:39 +03:00
instance IsAtom ZipAtom where
toAtom (ZATiny u) = toAtom u
toAtom (ZAWide n) = toAtom n + 8
fromAtom a | a <= 7 = ZATiny (fromAtom a)
fromAtom (MkAtom n) = ZAWide (n-8)
2019-05-15 04:30:44 +03:00
-- External Types --------------------------------------------------------------
data ZipNode
2019-05-18 02:02:39 +03:00
= ZipAtom !ZipAtom
| ZipCell !ZipRef !ZipRef
deriving stock (Eq, Ord, Show, Generic)
deriving anyclass Flat
2019-05-15 04:30:44 +03:00
data ZipRef
= ZRInline !ZipNode
2019-05-18 02:02:39 +03:00
| ZRIndex !ZipAtom
deriving stock (Eq, Ord, Show, Generic)
deriving anyclass Flat
2019-05-15 04:30:44 +03:00
2019-05-18 02:02:39 +03:00
type Zip = ([ZipNode], ZipNode)
2019-05-15 04:30:44 +03:00
-- Zip and UnZip ---------------------------------------------------------------
2019-05-15 08:09:53 +03:00
refCount :: Noun -> Map Noun Word
refCount = go mempty
2019-05-15 08:09:53 +03:00
where
ins :: Noun -> Map Noun Word -> Map Noun Word
ins = alterMap (Just . maybe 1 (+1))
2019-05-15 08:09:53 +03:00
go :: Map Noun Word -> Noun -> Map Noun Word
go acc a@(Atom _) = ins a acc
go acc c@(Cell l r) = go (go (ins c acc) l) r
2019-05-15 08:09:53 +03:00
zipTable :: Noun -> (Vector Noun, Map Noun Int)
zipTable top = (V.fromList tbl, keys)
where
keys = mapFromList (ClassyPrelude.zip tbl [0..])
big = \case Atom a -> a >= 127+8
_ -> True
tbl = fmap fst
$ sortBy (comparing snd)
$ filter (\(k,v) -> big k && v>1)
$ mapToList
$ refCount top
2019-05-18 02:02:39 +03:00
2019-05-15 04:30:44 +03:00
zip :: Noun -> Zip
zip top = (V.toList dups, cvtNode top)
2019-05-15 04:30:44 +03:00
where
(tbl, keys) = zipTable top
dups = cvtNode <$> tbl
cvtRef n = lookup n keys & \case Nothing -> ZRInline (cvtNode n)
Just a -> ZRIndex (fromAtom $ toAtom a)
cvtNode = \case Atom a -> ZipAtom (fromAtom a)
Cell l r -> ZipCell (cvtRef l) (cvtRef r)
2019-05-15 08:09:53 +03:00
2019-05-15 04:30:44 +03:00
unzip :: Zip -> Maybe Noun
unzip (V.fromList -> dups, top) = recover top
2019-05-15 08:09:53 +03:00
where
recover :: ZipNode -> Maybe Noun
recover (ZipAtom a) = pure (Atom $ toAtom a)
recover (ZipCell l r) = Cell <$> getRef l <*> getRef r
2019-05-15 08:09:53 +03:00
getRef :: ZipRef -> Maybe Noun
getRef (ZRInline n) = recover n
getRef (ZRIndex ix) = dups V.!? fromAtom (toAtom ix) >>= recover
2019-05-15 08:09:53 +03:00
-- Tests -----------------------------------------------------------------------
compareSize :: Noun -> Int
compareSize n = flatSz - jamSz
2019-05-15 08:09:53 +03:00
where
2019-05-20 06:20:03 +03:00
Buf jamSz _ = fromAtom (jam' n)
2019-05-18 02:02:39 +03:00
flatSz = length (bits (zip n))
compareZipCompression :: Noun -> Int
compareZipCompression n = zipSz - rawSz
where
rawSz = length (bits n)
zipSz = length (bits (zip n))
compareRawToJam :: Noun -> Int
compareRawToJam n = rawSz - jamSz
where
rawSz = length (bits n)
2019-05-20 06:20:03 +03:00
Buf jamSz _ = fromAtom (jam' n)
2019-05-15 08:09:53 +03:00
prop_zipUnzip :: Noun -> Bool
prop_zipUnzip n = Just n == unzip (zip n)
2019-05-17 09:39:07 +03:00
zipFlat :: Noun -> ByteString
zipFlat = flat . zip
unZipFlat :: ByteString -> Maybe Noun
unZipFlat = (>>= unzip) . eitherToMaybe . unflat
prop_zipFlatRoundTrip :: Noun -> Bool
prop_zipFlatRoundTrip n = Just n == (unZipFlat . zipFlat) n
2019-05-15 08:09:53 +03:00
main :: IO ()
main = $(defaultMainGenerator)
dub :: Noun -> Noun
dub x = Cell x x
2019-05-18 02:02:39 +03:00
allAtoms :: Int -> [Noun]
allAtoms n = Atom . (\n -> 2^n - 1) <$> [0..toAtom n]
2019-05-18 02:02:39 +03:00
allCells :: Int -> [Noun]
allCells 0 = allAtoms 1
allCells n = do
a <- Atom <$> [0, (2 ^ toAtom n) - 1]
2019-05-18 02:02:39 +03:00
c <- allCells (n-1)
[Cell c a, Cell a c, Cell c c]
allNouns :: Int -> [Noun]
allNouns sz = ordNub (allCells sz)
2019-05-18 02:02:39 +03:00
nounSizes :: (Noun -> Int) -> Int -> [(Int, Noun)]
nounSizes f sz = sort (allNouns sz <&> \n -> (f n, n))
jamSz :: Noun -> Int
2019-05-20 06:20:03 +03:00
jamSz = (\(Buf sz _) -> sz) . fromAtom . jam'
2019-05-18 02:02:39 +03:00
showFlatZipSizes :: Int -> IO ()
showFlatZipSizes dep = traverse_ print (nounSizes (length . bits . zip) dep)
showJamSizes :: Int -> IO ()
showJamSizes dep = traverse_ print (nounSizes jamSz dep)
2019-05-20 06:20:03 +03:00
--------------------------------------------------------------------------------
2019-05-18 02:02:39 +03:00
sumJamSizes :: Int -> Int
sumJamSizes dep = sum $ map fst (nounSizes jamSz dep)
2019-05-20 06:20:03 +03:00
sumFlatSizes :: Int -> Int
sumFlatSizes dep = sum $ map fst (nounSizes (length . bits) dep)
sumFlatZipSizes :: Int -> Int
sumFlatZipSizes dep = sum $ map fst (nounSizes (length . bits . zip) dep)
--------------------------------------------------------------------------------
2019-05-18 02:02:39 +03:00
compareSizes :: (Noun -> Int) -> IO ()
compareSizes f = do
2019-05-20 06:20:03 +03:00
nouns <- join <$> (replicateM 100 (sample' (arbitrary :: Gen Noun)) :: IO [[Noun]])
2019-05-17 09:39:07 +03:00
traverse_ print $ reverse
$ ordNub
$ sort
2019-05-18 02:02:39 +03:00
$ fmap ((`div` 64) . f)
2019-05-17 09:39:07 +03:00
$ nouns
2019-05-18 02:02:39 +03:00
-- traverse_ print $ filter ((> 1000) . abs . f) nouns
testSizes :: IO ()
testSizes = compareSizes compareSize
testZipCompression :: IO ()
testZipCompression = compareSizes compareZipCompression
testRawToJamSizes :: IO ()
testRawToJamSizes = compareSizes compareRawToJam
allSizeTests :: IO ()
allSizeTests = do
putStrLn "zipFlat - jam"
testSizes
putStrLn "\nzipFlat - flat"
testZipCompression
putStrLn "\nflat - jam"
testRawToJamSizes