urbit/pkg/hs-urbit/lib/Noun/Zip.hs

{-
  Can de-duplication be orthogonal to serialization?
-}

module Noun.Zip where

import ClassyPrelude hiding (zip, unzip)

import Control.Applicative
import Control.Lens
import Data.Bits
import Data.Either.Extra
import Data.Flat
import Data.Flat
import Data.Flat.Bits
import GHC.Generics
import GHC.Natural
import Noun
import Noun.Atom
import Noun.Jam
import Test.QuickCheck.Arbitrary
import Test.QuickCheck.Gen
import Text.Printf

import Data.Maybe    (fromJust)
import Data.List     (intercalate)
import Data.Typeable (Typeable)

import Control.Monad.State.Strict hiding (forM_, replicateM)
import Control.Monad.Trans.Maybe

import qualified ClassyPrelude
import qualified Data.Vector         as V
import qualified Data.List           as L
import qualified Data.Vector.Unboxed as UV

import Test.Tasty
import Test.Tasty.TH
import Test.Tasty.QuickCheck as QC
import Test.QuickCheck

-- Atoms Optimized For Small Values --------------------------------------------

data Unary = Z | O Unary
  deriving stock    (Eq, Ord, Show, Generic)
  deriving anyclass Flat

instance IsAtom Unary where
  toAtom Z     = 0
  toAtom (O u) = 1+toAtom u
  fromAtom 0 = Z
  fromAtom n = O (fromAtom (pred n))

data ZipAtom
    = ZATiny Unary
    | ZAWide Natural
  deriving stock    (Eq, Ord, Generic)
  deriving anyclass Flat

instance Show ZipAtom where
  show = show . toAtom

instance IsAtom ZipAtom where
  toAtom (ZATiny u) = toAtom u
  toAtom (ZAWide n) = toAtom n + 8
  fromAtom a | a <= 7 = ZATiny (fromAtom a)
  fromAtom (MkAtom n) = ZAWide (n-8)


-- External Types --------------------------------------------------------------

data ZipNode
    = ZipAtom !ZipAtom
    | ZipCell !ZipRef !ZipRef
  deriving stock    (Eq, Ord, Show, Generic)
  deriving anyclass Flat

data ZipRef
    = ZRInline !ZipNode
    | ZRIndex  !ZipAtom
  deriving stock    (Eq, Ord, Show, Generic)
  deriving anyclass Flat

type Zip = ([ZipNode], ZipNode)

-- Zip and UnZip ---------------------------------------------------------------

refCount :: Noun -> Map Noun Word
refCount = go mempty
  where
    ins :: Noun -> Map Noun Word -> Map Noun Word
    ins = alterMap (Just . maybe 1 (+1))

    go :: Map Noun Word -> Noun -> Map Noun Word
    go acc a@(Atom _)   = ins a acc
    go acc c@(Cell l r) = go (go (ins c acc) l) r

zipTable :: Noun -> (Vector Noun, Map Noun Int)
zipTable top = (V.fromList tbl, keys)
  where
    keys = mapFromList (ClassyPrelude.zip tbl [0..])
    big  = \case Atom a -> a >= 127+8
                 _      -> True
    tbl  = fmap fst
         $ sortBy (comparing snd)
         $ filter (\(k,v) -> big k && v>1)
         $ mapToList
         $ refCount top

zip :: Noun -> Zip
zip top = (V.toList dups, cvtNode top)
  where
    (tbl, keys) = zipTable top
    dups        = cvtNode <$> tbl
    cvtRef n    = lookup n keys & \case Nothing -> ZRInline (cvtNode n)
                                        Just a  -> ZRIndex (fromAtom $ toAtom a)
    cvtNode     = \case Atom a   -> ZipAtom (fromAtom a)
                        Cell l r -> ZipCell (cvtRef l) (cvtRef r)

unzip :: Zip -> Maybe Noun
unzip (V.fromList -> dups, top) = recover top
  where
    recover :: ZipNode -> Maybe Noun
    recover (ZipAtom a)   = pure (Atom $ toAtom a)
    recover (ZipCell l r) = Cell <$> getRef l <*> getRef r

    getRef :: ZipRef -> Maybe Noun
    getRef (ZRInline n) = recover n
    getRef (ZRIndex ix) = dups V.!? fromAtom (toAtom ix) >>= recover


-- Tests -----------------------------------------------------------------------

compareSize :: Noun -> Int
compareSize n = flatSz - jamSz
  where
    Buf jamSz _ = fromAtom (jam' n)
    flatSz      = length (bits (zip n))

compareZipCompression :: Noun -> Int
compareZipCompression n = zipSz - rawSz
  where
    rawSz = length (bits n)
    zipSz = length (bits (zip n))

compareRawToJam :: Noun -> Int
compareRawToJam n = rawSz - jamSz
  where
    rawSz       = length (bits n)
    Buf jamSz _ = fromAtom (jam' n)

prop_zipUnzip :: Noun -> Bool
prop_zipUnzip n = Just n == unzip (zip n)

zipFlat :: Noun -> ByteString
zipFlat = flat . zip

unZipFlat :: ByteString -> Maybe Noun
unZipFlat = (>>= unzip) . eitherToMaybe . unflat

prop_zipFlatRoundTrip :: Noun -> Bool
prop_zipFlatRoundTrip n = Just n == (unZipFlat . zipFlat) n

main :: IO ()
main = $(defaultMainGenerator)

dub :: Noun -> Noun
dub x = Cell x x

allAtoms :: Int -> [Noun]
allAtoms n = Atom . (\n -> 2^n - 1) <$> [0..toAtom n]

allCells :: Int -> [Noun]
allCells 0 = allAtoms 1
allCells n = do
  a <- Atom <$> [0, (2 ^ toAtom n) - 1]
  c <- allCells (n-1)
  [Cell c a, Cell a c, Cell c c]

allNouns :: Int -> [Noun]
allNouns sz = ordNub (allCells sz)

nounSizes :: (Noun -> Int) -> Int -> [(Int, Noun)]
nounSizes f sz = sort (allNouns sz <&> \n -> (f n, n))

jamSz :: Noun -> Int
jamSz = (\(Buf sz _) -> sz) . fromAtom . jam'

showFlatZipSizes :: Int -> IO ()
showFlatZipSizes dep = traverse_ print (nounSizes (length . bits . zip) dep)

showJamSizes :: Int -> IO ()
showJamSizes dep = traverse_ print (nounSizes jamSz dep)

--------------------------------------------------------------------------------

sumJamSizes :: Int -> Int
sumJamSizes dep = sum $ map fst (nounSizes jamSz dep)

sumFlatSizes :: Int -> Int
sumFlatSizes dep = sum $ map fst (nounSizes (length . bits) dep)

sumFlatZipSizes :: Int -> Int
sumFlatZipSizes dep = sum $ map fst (nounSizes (length . bits . zip) dep)

--------------------------------------------------------------------------------

compareSizes :: (Noun -> Int) -> IO ()
compareSizes f = do
  nouns <- join <$> (replicateM 100 (sample' (arbitrary :: Gen Noun)) :: IO [[Noun]])
  traverse_ print $ reverse
                  $ ordNub
                  $ sort
                  $ fmap ((`div` 64) . f)
                  $ nouns
  -- traverse_ print $ filter ((> 1000) . abs . f) nouns

testSizes :: IO ()
testSizes = compareSizes compareSize

testZipCompression :: IO ()
testZipCompression = compareSizes compareZipCompression

testRawToJamSizes :: IO ()
testRawToJamSizes = compareSizes compareRawToJam

allSizeTests :: IO ()
allSizeTests = do
  putStrLn "zipFlat - jam"
  testSizes
  putStrLn "\nzipFlat - flat"
  testZipCompression
  putStrLn "\nflat - jam"
  testRawToJamSizes