shrub/pkg/hair/lib/Data/Noun/Zip.hs

178 lines
4.6 KiB
Haskell
Raw Normal View History

2019-05-15 04:30:44 +03:00
{-
Can de-duplication be orthogonal to serialization?
-}
module Data.Noun.Zip where
2019-05-15 08:09:53 +03:00
import ClassyPrelude hiding (zip, unzip)
2019-05-15 04:30:44 +03:00
2019-05-15 08:09:53 +03:00
import Control.Lens
import Text.Printf
2019-05-15 04:30:44 +03:00
import Control.Applicative
import Data.Noun
import Data.Noun.Atom
2019-05-15 08:09:53 +03:00
import Data.Noun.Jam
2019-05-15 04:30:44 +03:00
import Data.Bits
import GHC.Generics
import Test.QuickCheck.Arbitrary
import Test.QuickCheck.Gen
import Data.Flat
import Data.Flat.Bits
2019-05-17 09:39:07 +03:00
import Data.Either.Extra
2019-05-15 04:30:44 +03:00
import Data.List (intercalate)
import Data.Typeable (Typeable)
import Data.Word
2019-05-15 04:30:44 +03:00
2019-05-17 09:39:07 +03:00
import Control.Monad.State.Strict hiding (forM_, replicateM)
2019-05-15 08:09:53 +03:00
import Control.Monad.Trans.Maybe
import qualified Data.Vector as V
import qualified Data.List as L
import qualified Data.Vector.Unboxed as UV
2019-05-15 08:09:53 +03:00
import Test.Tasty
import Test.Tasty.TH
import Test.Tasty.QuickCheck as QC
import Test.QuickCheck
2019-05-15 04:30:44 +03:00
-- External Types --------------------------------------------------------------
data ZipNode
= ZipAtom !Atom
| ZipCell !ZipRef !ZipRef
deriving stock (Eq, Ord, Show, Generic)
deriving anyclass Flat
2019-05-15 04:30:44 +03:00
data ZipRef
= ZRInline !ZipNode
| ZRIndex !Word
deriving stock (Eq, Ord, Show, Generic)
deriving anyclass Flat
2019-05-15 04:30:44 +03:00
2019-05-17 09:39:07 +03:00
type Zip = [ZipNode]
2019-05-15 04:30:44 +03:00
-- Zip -------------------------------------------------------------------------
2019-05-15 08:09:53 +03:00
type ZipM a = State ([ZipNode], Word, Map Noun Word) a
2019-05-15 08:09:53 +03:00
findDups :: Noun -> Set Noun
2019-05-17 09:39:07 +03:00
findDups = keysSet . filterMap (> 1) . go mempty
2019-05-15 08:09:53 +03:00
where
ins :: Noun -> Map Noun Word -> Map Noun Word
2019-05-17 09:39:07 +03:00
ins = alterMap (Just . maybe 1 (+1))
2019-05-15 08:09:53 +03:00
go :: Map Noun Word -> Noun -> Map Noun Word
go acc a@(Atom _) = ins a acc
go acc c@(Cell l r) = go (go (ins c acc) l) r
2019-05-15 08:09:53 +03:00
2019-05-15 04:30:44 +03:00
zip :: Noun -> Zip
zip top = evalState (go top >> end) ([], 0, mempty)
2019-05-15 04:30:44 +03:00
where
dups :: Set Noun
dups = findDups top
2019-05-15 04:30:44 +03:00
end :: ZipM Zip
end = do
(acc, _, _) <- get
2019-05-17 09:39:07 +03:00
pure (reverse acc)
2019-05-15 04:30:44 +03:00
ins :: Noun -> ZipNode -> ZipM ZipRef
2019-05-15 04:30:44 +03:00
ins noun node = do
(acc, nex, tbl) <- get
put (node:acc, nex+1, insertMap noun nex tbl)
pure (ZRIndex nex)
doAtom :: Atom -> ZipM ZipRef
doAtom a = do
if a >= 128 && member (Atom a) dups
then ins (Atom a) (ZipAtom a)
else pure (ZRInline (ZipAtom a))
doCell :: (Noun, Noun) -> ZipM ZipRef
doCell (l,r) = do
lRef <- loop l
rRef <- loop r
let res = ZipCell lRef rRef
if member (Cell l r) dups
then ins (Cell l r) res
else pure (ZRInline res)
loop :: Noun -> ZipM ZipRef
loop noun = do
2019-05-15 04:30:44 +03:00
(acc, nex, tbl) <- get
case (lookup noun tbl, noun) of
(Just w, _) -> pure (ZRIndex w)
(Nothing, Atom atm) -> doAtom atm
(Nothing, Cell l r) -> doCell (l,r)
2019-05-15 04:30:44 +03:00
go :: Noun -> ZipM ZipRef
go noun = do
loop noun >>= \case
ZRInline x -> ins noun x
ZRIndex _ -> error "Impossible -- duplicate top-level node"
2019-05-15 08:09:53 +03:00
-- Unzip -----------------------------------------------------------------------
type UnZipM a = MaybeT (State (Word, Map Word Noun)) a
2019-05-15 04:30:44 +03:00
unzip :: Zip -> Maybe Noun
2019-05-17 09:39:07 +03:00
unzip = \case [] -> Nothing
zs -> L.last <$> cvt zs
2019-05-15 08:09:53 +03:00
where
cvt :: [ZipNode] -> Maybe [Noun]
cvt nodes = evalState (runMaybeT $ go nodes) (0, mempty)
ins :: Noun -> UnZipM Noun
ins noun = do
modify $ \(nex, tbl) -> (nex+1, insertMap nex noun tbl)
pure noun
find :: ZipRef -> UnZipM Noun
find (ZRInline (ZipAtom a)) = pure (Atom a)
find (ZRInline (ZipCell l r)) = Cell <$> find l <*> find r
find (ZRIndex idx) = do (nex, tbl) <- get
2019-05-17 09:39:07 +03:00
(MaybeT . pure) $ lookup idx tbl
2019-05-15 08:09:53 +03:00
go :: [ZipNode] -> UnZipM [Noun]
go = mapM $ \case ZipAtom a -> ins (Atom a)
ZipCell l r -> ins =<< Cell <$> find l <*> find r
-- Tests -----------------------------------------------------------------------
compareSize :: Noun -> Int
compareSize n = flatSz - jamSz
2019-05-15 08:09:53 +03:00
where
Buf jamSz _ = fromAtom (jam n)
flatSz = UV.length (bits (zip n))
2019-05-15 08:09:53 +03:00
prop_zipUnzip :: Noun -> Bool
prop_zipUnzip n = Just n == unzip (zip n)
2019-05-17 09:39:07 +03:00
zipFlat :: Noun -> ByteString
zipFlat = flat . zip
unZipFlat :: ByteString -> Maybe Noun
unZipFlat = (>>= unzip) . eitherToMaybe . unflat
prop_zipFlatRoundTrip :: Noun -> Bool
prop_zipFlatRoundTrip n = Just n == (unZipFlat . zipFlat) n
2019-05-15 08:09:53 +03:00
main :: IO ()
main = $(defaultMainGenerator)
dub :: Noun -> Noun
dub x = Cell x x
testSizes :: IO ()
testSizes = do
2019-05-17 09:39:07 +03:00
nouns <- join <$> (replicateM 50 (sample' (arbitrary :: Gen Noun)) :: IO [[Noun]])
traverse_ print $ reverse
$ ordNub
$ sort
$ fmap ((`div` 64) . compareSize)
$ nouns
-- traverse_ print $ filter ((> 1000) . abs . compareSize) nouns