2019-05-15 04:30:44 +03:00
|
|
|
{-
|
|
|
|
Can de-duplication be orthogonal to serialization?
|
|
|
|
-}
|
|
|
|
|
|
|
|
module Data.Noun.Zip where
|
|
|
|
|
2019-05-15 08:09:53 +03:00
|
|
|
import ClassyPrelude hiding (zip, unzip)
|
2019-05-15 04:30:44 +03:00
|
|
|
|
2019-05-15 08:09:53 +03:00
|
|
|
import Control.Lens
|
|
|
|
import Text.Printf
|
2019-05-15 04:30:44 +03:00
|
|
|
import Control.Applicative
|
|
|
|
import Data.Noun
|
|
|
|
import Data.Noun.Atom
|
2019-05-15 08:09:53 +03:00
|
|
|
import Data.Noun.Jam
|
2019-05-15 04:30:44 +03:00
|
|
|
import Data.Bits
|
|
|
|
import GHC.Generics
|
|
|
|
import Test.QuickCheck.Arbitrary
|
|
|
|
import Test.QuickCheck.Gen
|
2019-05-17 05:56:06 +03:00
|
|
|
import Data.Flat
|
|
|
|
import Data.Flat.Bits
|
2019-05-17 09:39:07 +03:00
|
|
|
import Data.Either.Extra
|
2019-05-15 04:30:44 +03:00
|
|
|
|
|
|
|
import Data.List (intercalate)
|
|
|
|
import Data.Typeable (Typeable)
|
2019-05-17 05:56:06 +03:00
|
|
|
import Data.Word
|
2019-05-15 04:30:44 +03:00
|
|
|
|
2019-05-17 09:39:07 +03:00
|
|
|
import Control.Monad.State.Strict hiding (forM_, replicateM)
|
2019-05-15 08:09:53 +03:00
|
|
|
import Control.Monad.Trans.Maybe
|
|
|
|
|
2019-05-17 05:56:06 +03:00
|
|
|
import qualified Data.Vector as V
|
|
|
|
import qualified Data.List as L
|
|
|
|
import qualified Data.Vector.Unboxed as UV
|
2019-05-15 08:09:53 +03:00
|
|
|
|
|
|
|
import Test.Tasty
|
|
|
|
import Test.Tasty.TH
|
|
|
|
import Test.Tasty.QuickCheck as QC
|
|
|
|
import Test.QuickCheck
|
2019-05-15 04:30:44 +03:00
|
|
|
|
|
|
|
|
|
|
|
-- External Types --------------------------------------------------------------
|
|
|
|
|
|
|
|
data ZipNode
|
|
|
|
= ZipAtom !Atom
|
2019-05-17 05:56:06 +03:00
|
|
|
| ZipCell !ZipRef !ZipRef
|
|
|
|
deriving stock (Eq, Ord, Show, Generic)
|
|
|
|
deriving anyclass Flat
|
2019-05-15 04:30:44 +03:00
|
|
|
|
2019-05-17 05:56:06 +03:00
|
|
|
data ZipRef
|
|
|
|
= ZRInline !ZipNode
|
|
|
|
| ZRIndex !Word
|
|
|
|
deriving stock (Eq, Ord, Show, Generic)
|
|
|
|
deriving anyclass Flat
|
2019-05-15 04:30:44 +03:00
|
|
|
|
2019-05-17 09:39:07 +03:00
|
|
|
type Zip = [ZipNode]
|
2019-05-15 04:30:44 +03:00
|
|
|
|
2019-05-17 05:56:06 +03:00
|
|
|
-- Zip -------------------------------------------------------------------------
|
2019-05-15 08:09:53 +03:00
|
|
|
|
2019-05-17 05:56:06 +03:00
|
|
|
type ZipM a = State ([ZipNode], Word, Map Noun Word) a
|
2019-05-15 08:09:53 +03:00
|
|
|
|
2019-05-17 05:56:06 +03:00
|
|
|
findDups :: Noun -> Set Noun
|
2019-05-17 09:39:07 +03:00
|
|
|
findDups = keysSet . filterMap (> 1) . go mempty
|
2019-05-15 08:09:53 +03:00
|
|
|
where
|
2019-05-17 05:56:06 +03:00
|
|
|
ins :: Noun -> Map Noun Word -> Map Noun Word
|
2019-05-17 09:39:07 +03:00
|
|
|
ins = alterMap (Just . maybe 1 (+1))
|
2019-05-15 08:09:53 +03:00
|
|
|
|
2019-05-17 05:56:06 +03:00
|
|
|
go :: Map Noun Word -> Noun -> Map Noun Word
|
|
|
|
go acc a@(Atom _) = ins a acc
|
|
|
|
go acc c@(Cell l r) = go (go (ins c acc) l) r
|
2019-05-15 08:09:53 +03:00
|
|
|
|
2019-05-15 04:30:44 +03:00
|
|
|
zip :: Noun -> Zip
|
2019-05-17 05:56:06 +03:00
|
|
|
zip top = evalState (go top >> end) ([], 0, mempty)
|
2019-05-15 04:30:44 +03:00
|
|
|
where
|
2019-05-17 05:56:06 +03:00
|
|
|
dups :: Set Noun
|
|
|
|
dups = findDups top
|
|
|
|
|
2019-05-15 04:30:44 +03:00
|
|
|
end :: ZipM Zip
|
|
|
|
end = do
|
|
|
|
(acc, _, _) <- get
|
2019-05-17 09:39:07 +03:00
|
|
|
pure (reverse acc)
|
2019-05-15 04:30:44 +03:00
|
|
|
|
2019-05-17 05:56:06 +03:00
|
|
|
ins :: Noun -> ZipNode -> ZipM ZipRef
|
2019-05-15 04:30:44 +03:00
|
|
|
ins noun node = do
|
|
|
|
(acc, nex, tbl) <- get
|
|
|
|
put (node:acc, nex+1, insertMap noun nex tbl)
|
2019-05-17 05:56:06 +03:00
|
|
|
pure (ZRIndex nex)
|
|
|
|
|
|
|
|
doAtom :: Atom -> ZipM ZipRef
|
|
|
|
doAtom a = do
|
|
|
|
if a >= 128 && member (Atom a) dups
|
|
|
|
then ins (Atom a) (ZipAtom a)
|
|
|
|
else pure (ZRInline (ZipAtom a))
|
|
|
|
|
|
|
|
doCell :: (Noun, Noun) -> ZipM ZipRef
|
|
|
|
doCell (l,r) = do
|
|
|
|
lRef <- loop l
|
|
|
|
rRef <- loop r
|
|
|
|
let res = ZipCell lRef rRef
|
|
|
|
if member (Cell l r) dups
|
|
|
|
then ins (Cell l r) res
|
|
|
|
else pure (ZRInline res)
|
|
|
|
|
|
|
|
loop :: Noun -> ZipM ZipRef
|
|
|
|
loop noun = do
|
2019-05-15 04:30:44 +03:00
|
|
|
(acc, nex, tbl) <- get
|
|
|
|
case (lookup noun tbl, noun) of
|
2019-05-17 05:56:06 +03:00
|
|
|
(Just w, _) -> pure (ZRIndex w)
|
|
|
|
(Nothing, Atom atm) -> doAtom atm
|
|
|
|
(Nothing, Cell l r) -> doCell (l,r)
|
2019-05-15 04:30:44 +03:00
|
|
|
|
2019-05-17 05:56:06 +03:00
|
|
|
go :: Noun -> ZipM ZipRef
|
|
|
|
go noun = do
|
|
|
|
loop noun >>= \case
|
|
|
|
ZRInline x -> ins noun x
|
|
|
|
ZRIndex _ -> error "Impossible -- duplicate top-level node"
|
2019-05-15 08:09:53 +03:00
|
|
|
|
|
|
|
-- Unzip -----------------------------------------------------------------------
|
|
|
|
|
|
|
|
type UnZipM a = MaybeT (State (Word, Map Word Noun)) a
|
|
|
|
|
2019-05-15 04:30:44 +03:00
|
|
|
unzip :: Zip -> Maybe Noun
|
2019-05-17 09:39:07 +03:00
|
|
|
unzip = \case [] -> Nothing
|
|
|
|
zs -> L.last <$> cvt zs
|
2019-05-15 08:09:53 +03:00
|
|
|
where
|
|
|
|
cvt :: [ZipNode] -> Maybe [Noun]
|
|
|
|
cvt nodes = evalState (runMaybeT $ go nodes) (0, mempty)
|
|
|
|
|
|
|
|
ins :: Noun -> UnZipM Noun
|
|
|
|
ins noun = do
|
|
|
|
modify $ \(nex, tbl) -> (nex+1, insertMap nex noun tbl)
|
|
|
|
pure noun
|
|
|
|
|
2019-05-17 05:56:06 +03:00
|
|
|
find :: ZipRef -> UnZipM Noun
|
|
|
|
find (ZRInline (ZipAtom a)) = pure (Atom a)
|
|
|
|
find (ZRInline (ZipCell l r)) = Cell <$> find l <*> find r
|
|
|
|
find (ZRIndex idx) = do (nex, tbl) <- get
|
2019-05-17 09:39:07 +03:00
|
|
|
(MaybeT . pure) $ lookup idx tbl
|
2019-05-15 08:09:53 +03:00
|
|
|
|
|
|
|
go :: [ZipNode] -> UnZipM [Noun]
|
|
|
|
go = mapM $ \case ZipAtom a -> ins (Atom a)
|
|
|
|
ZipCell l r -> ins =<< Cell <$> find l <*> find r
|
|
|
|
|
|
|
|
|
|
|
|
-- Tests -----------------------------------------------------------------------
|
|
|
|
|
2019-05-17 05:56:06 +03:00
|
|
|
compareSize :: Noun -> Int
|
|
|
|
compareSize n = flatSz - jamSz
|
2019-05-15 08:09:53 +03:00
|
|
|
where
|
2019-05-17 05:56:06 +03:00
|
|
|
Buf jamSz _ = fromAtom (jam n)
|
|
|
|
flatSz = UV.length (bits (zip n))
|
2019-05-15 08:09:53 +03:00
|
|
|
|
|
|
|
prop_zipUnzip :: Noun -> Bool
|
|
|
|
prop_zipUnzip n = Just n == unzip (zip n)
|
|
|
|
|
2019-05-17 09:39:07 +03:00
|
|
|
zipFlat :: Noun -> ByteString
|
|
|
|
zipFlat = flat . zip
|
|
|
|
|
|
|
|
unZipFlat :: ByteString -> Maybe Noun
|
|
|
|
unZipFlat = (>>= unzip) . eitherToMaybe . unflat
|
|
|
|
|
|
|
|
prop_zipFlatRoundTrip :: Noun -> Bool
|
|
|
|
prop_zipFlatRoundTrip n = Just n == (unZipFlat . zipFlat) n
|
|
|
|
|
2019-05-15 08:09:53 +03:00
|
|
|
main :: IO ()
|
|
|
|
main = $(defaultMainGenerator)
|
2019-05-17 05:56:06 +03:00
|
|
|
|
|
|
|
dub :: Noun -> Noun
|
|
|
|
dub x = Cell x x
|
|
|
|
|
|
|
|
testSizes :: IO ()
|
|
|
|
testSizes = do
|
2019-05-17 09:39:07 +03:00
|
|
|
nouns <- join <$> (replicateM 50 (sample' (arbitrary :: Gen Noun)) :: IO [[Noun]])
|
|
|
|
traverse_ print $ reverse
|
|
|
|
$ ordNub
|
|
|
|
$ sort
|
|
|
|
$ fmap ((`div` 64) . compareSize)
|
|
|
|
$ nouns
|
|
|
|
-- traverse_ print $ filter ((> 1000) . abs . compareSize) nouns
|