From dcd15f8c735e199abb63a0f9f68317533a73844c Mon Sep 17 00:00:00 2001 From: Arya Irani Date: Mon, 11 Jun 2018 18:19:22 -0400 Subject: [PATCH 1/3] use VarInt encoding for backrefs and sequence lengths --- parser-typechecker/src/Unison/Codecs.hs | 14 +++++++++---- .../main/src/main/scala/util/Sink.scala | 21 +++++++++++++++++-- .../main/src/main/scala/util/Source.scala | 19 +++++++++++++++-- 3 files changed, 46 insertions(+), 8 deletions(-) diff --git a/parser-typechecker/src/Unison/Codecs.hs b/parser-typechecker/src/Unison/Codecs.hs index 1f8c2a496..af6597faf 100644 --- a/parser-typechecker/src/Unison/Codecs.hs +++ b/parser-typechecker/src/Unison/Codecs.hs @@ -5,6 +5,10 @@ module Unison.Codecs where import Data.Text (Text) import Control.Arrow (second) import Control.Monad.State +import Data.Bits (Bits) +import qualified Data.Bytes.Serial as BS +import Data.Bytes.Signed (Unsigned) +import Data.Bytes.VarInt (VarInt(..)) import qualified Data.ByteString as B import Data.ByteString.Builder (doubleBE, int64BE, toLazyByteString) import qualified Data.ByteString.Lazy as BL @@ -257,10 +261,12 @@ serializeCase1 (MatchCase p guard body) = do pure $ MatchCase p posg posb putBackref :: MonadPut m => Pos -> m () -putBackref = putWord64be +putBackref = BS.serialize . VarInt -putLength :: (MonadPut m, Integral n) => n -> m () -putLength = putWord64be . fromIntegral +putLength :: (MonadPut m, Integral n, Integral (Unsigned n), + Bits n, Bits (Unsigned n)) + => n -> m () +putLength = BS.serialize . 
VarInt serializeMaybe :: (MonadPut m) => (a -> m ()) -> Maybe a -> m () serializeMaybe f b = case b of @@ -302,4 +308,4 @@ serializeFile (UnisonFile dataDecls effectDecls body) = do serializeFoldable (uncurry serializeConstructorArities) effectDecls' pos <- serializeTerm body putWord8 0 - putWord64be pos + putBackref pos diff --git a/runtime-jvm/main/src/main/scala/util/Sink.scala b/runtime-jvm/main/src/main/scala/util/Sink.scala index 553221abc..8e9756063 100644 --- a/runtime-jvm/main/src/main/scala/util/Sink.scala +++ b/runtime-jvm/main/src/main/scala/util/Sink.scala @@ -1,6 +1,7 @@ package org.unisonweb.util import java.nio.{ByteBuffer,BufferOverflowException} +import java.lang.Long.{compareUnsigned} import Text.Text /** @@ -15,8 +16,24 @@ trait Sink { def putLong(n: Long): Unit // todo: the UTF-8 of Long encoding, use a single byte if possible - def putVarLong(n: Long): Unit = - putLong(n) + // Uses the little-endian variable length encoding of unsigned integers: + // https://developers.google.com/protocol-buffers/docs/encoding#varints + def putVarLong(n: Long): Unit = { + val lsb = n.toShort & 0xff + if (compareUnsigned(n, 0x80) < 0) putByte(lsb.toByte) + else { + putByte((lsb | 0x80).toByte) + putVarLong(n >>> 7) + } + } + + // Uses the zigzag encoding for variable-length signed numbers, described at: + // https://developers.google.com/protocol-buffers/docs/encoding#signed-integers + // https://github.com/google/protobuf/blob/0400cca/java/core/src/main/java/com/google/protobuf/CodedOutputStream.java#L949-L952 + def putVarSignedLong(n: Long): Unit = { + putVarLong((n << 1) ^ (n >> 63)) + } + def putDouble(n: Double): Unit def putString(s: String): Unit def putText(txt: Text): Unit diff --git a/runtime-jvm/main/src/main/scala/util/Source.scala b/runtime-jvm/main/src/main/scala/util/Source.scala index 4b638e15a..aa24d6942 100644 --- a/runtime-jvm/main/src/main/scala/util/Source.scala +++ b/runtime-jvm/main/src/main/scala/util/Source.scala @@ -17,8 +17,23 @@ 
trait Source { self => def getByte: Byte def getInt: Int def getLong: Long - // todo: The UTF-8 of Long encodings, uses a single byte where possible - def getVarLong: Long = getLong + + // Uses the little-endian variable length encoding of unsigned integers: + // https://developers.google.com/protocol-buffers/docs/encoding#varints + def getVarLong: Long = { + val b = getByte + if ((b & 0x80) == 0) b + else (getVarLong << 7) | (b & 0x7f) + } + + // Uses the zigzag encoding for variable-length signed numbers, described at: + // https://developers.google.com/protocol-buffers/docs/encoding#signed-integers + // https://github.com/google/protobuf/blob/0400cca/java/core/src/main/java/com/google/protobuf/CodedInputStream.java#L557-L568 + def getVarSignedLong: Long = { + val n = getVarLong + (n >>> 1) ^ -(n & 1) + } + def getDouble: Double def position: Long def getFramed: Array[Byte] = get(getInt) From 5b601a424804633b4a31422c83cd4a5fe81cfb24 Mon Sep 17 00:00:00 2001 From: Arya Irani Date: Mon, 11 Jun 2018 18:54:39 -0400 Subject: [PATCH 2/3] update codecs to use VarInt encoding for Text and Array lengths Note that the max sizes for these on the JVM are <= 31 bits --- parser-typechecker/src/Unison/Codecs.hs | 4 ++-- runtime-jvm/main/src/main/scala/util/Sink.scala | 2 +- runtime-jvm/main/src/main/scala/util/Source.scala | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/parser-typechecker/src/Unison/Codecs.hs b/parser-typechecker/src/Unison/Codecs.hs index af6597faf..e59693693 100644 --- a/parser-typechecker/src/Unison/Codecs.hs +++ b/parser-typechecker/src/Unison/Codecs.hs @@ -276,7 +276,7 @@ serializeMaybe f b = case b of lengthEncode :: MonadPut m => Text -> m () lengthEncode text = do let bs = encodeUtf8 text - putWord32be . 
fromIntegral $ B.length bs + putLength $ B.length bs putByteString bs serializeFoldable :: (MonadPut m, Foldable f) => (a -> m ()) -> f a -> m () @@ -292,7 +292,7 @@ serializeReference ref = case ref of Derived hash -> do putWord8 1 let bs = Hash.toBytes hash - putWord32be . fromIntegral $ B.length bs + putLength $ B.length bs putByteString bs serializeConstructorArities :: MonadPut m => Reference -> [Int] -> m () diff --git a/runtime-jvm/main/src/main/scala/util/Sink.scala b/runtime-jvm/main/src/main/scala/util/Sink.scala index 8e9756063..26b1338a7 100644 --- a/runtime-jvm/main/src/main/scala/util/Sink.scala +++ b/runtime-jvm/main/src/main/scala/util/Sink.scala @@ -39,7 +39,7 @@ trait Sink { def putText(txt: Text): Unit def position: Long def putFramed(bs: Array[Byte]): Unit = { - putInt(bs.length) + putVarLong(bs.length) put(bs) } def putFramedSeq[A](seq: Seq[A])(f: (Sink,A) => Unit): Unit = diff --git a/runtime-jvm/main/src/main/scala/util/Source.scala b/runtime-jvm/main/src/main/scala/util/Source.scala index aa24d6942..d8c3f8d7c 100644 --- a/runtime-jvm/main/src/main/scala/util/Source.scala +++ b/runtime-jvm/main/src/main/scala/util/Source.scala @@ -36,7 +36,7 @@ trait Source { self => def getDouble: Double def position: Long - def getFramed: Array[Byte] = get(getInt) + def getFramed: Array[Byte] = get(getVarLong.toInt) final def getString: String = { val bytes = getFramed From 4b6de9c97f14674513c09b8dcd07dc475fd21027 Mon Sep 17 00:00:00 2001 From: Arya Irani Date: Tue, 12 Jun 2018 14:36:00 -0400 Subject: [PATCH 3/3] add todos re 32-bit length limitations --- .../main/src/main/scala/util/Sink.scala | 15 +++++++++------ .../main/src/main/scala/util/Source.scala | 18 +++++++++++++----- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/runtime-jvm/main/src/main/scala/util/Sink.scala b/runtime-jvm/main/src/main/scala/util/Sink.scala index 26b1338a7..8f6c6d281 100644 --- a/runtime-jvm/main/src/main/scala/util/Sink.scala +++ 
b/runtime-jvm/main/src/main/scala/util/Sink.scala @@ -15,9 +15,10 @@ trait Sink { def putInt(n: Int): Unit def putLong(n: Long): Unit - // todo: the UTF-8 of Long encoding, use a single byte if possible - // Uses the little-endian variable length encoding of unsigned integers: - // https://developers.google.com/protocol-buffers/docs/encoding#varints + /** + * Uses the little-endian variable length encoding of unsigned integers: + * https://developers.google.com/protocol-buffers/docs/encoding#varints + */ def putVarLong(n: Long): Unit = { val lsb = n.toShort & 0xff if (compareUnsigned(n, 0x80) < 0) putByte(lsb.toByte) @@ -27,9 +28,11 @@ trait Sink { } } - // Uses the zigzag encoding for variable-length signed numbers, described at: - // https://developers.google.com/protocol-buffers/docs/encoding#signed-integers - // https://github.com/google/protobuf/blob/0400cca/java/core/src/main/java/com/google/protobuf/CodedOutputStream.java#L949-L952 + /** + * Uses the zigzag encoding for variable-length signed numbers, described at: + * https://developers.google.com/protocol-buffers/docs/encoding#signed-integers + * https://github.com/google/protobuf/blob/0400cca/java/core/src/main/java/com/google/protobuf/CodedOutputStream.java#L949-L952 + */ def putVarSignedLong(n: Long): Unit = { putVarLong((n << 1) ^ (n >> 63)) } diff --git a/runtime-jvm/main/src/main/scala/util/Source.scala b/runtime-jvm/main/src/main/scala/util/Source.scala index d8c3f8d7c..7e493bad8 100644 --- a/runtime-jvm/main/src/main/scala/util/Source.scala +++ b/runtime-jvm/main/src/main/scala/util/Source.scala @@ -12,23 +12,28 @@ import scala.reflect.ClassTag * The cursor position can be accessed via the `position` method. 
*/ trait Source { self => + // todo: use a representation that supports 64-bit lengths, unlike Array def get(n: Int): Array[Byte] def getBoolean: Boolean = getByte != 0 def getByte: Byte def getInt: Int def getLong: Long - // Uses the little-endian variable length encoding of unsigned integers: - // https://developers.google.com/protocol-buffers/docs/encoding#varints + /** + * Uses the little-endian variable length encoding of unsigned integers: + * https://developers.google.com/protocol-buffers/docs/encoding#varints + */ def getVarLong: Long = { val b = getByte if ((b & 0x80) == 0) b else (getVarLong << 7) | (b & 0x7f) } - // Uses the zigzag encoding for variable-length signed numbers, described at: - // https://developers.google.com/protocol-buffers/docs/encoding#signed-integers - // https://github.com/google/protobuf/blob/0400cca/java/core/src/main/java/com/google/protobuf/CodedInputStream.java#L557-L568 + /** + * Uses the zigzag encoding for variable-length signed numbers, described at: + * https://developers.google.com/protocol-buffers/docs/encoding#signed-integers + * https://github.com/google/protobuf/blob/0400cca/java/core/src/main/java/com/google/protobuf/CodedInputStream.java#L557-L568 + */ def getVarSignedLong: Long = { val n = getVarLong (n >>> 1) ^ -(n & 1) @@ -36,8 +41,11 @@ trait Source { self => def getDouble: Double def position: Long + + // todo: use a representation that supports 64-bit lengths, unlike Array def getFramed: Array[Byte] = get(getVarLong.toInt) + // todo: use a representation that supports 64-bit lengths, unlike String final def getString: String = { val bytes = getFramed new String(bytes, java.nio.charset.StandardCharsets.UTF_8)