add version of wyhash to roc lib

2024-09-22 00:09:33 +03:00 · 2022-10-12 12:08:46 -07:00 · 2022-10-12 12:08:46 -07:00 · 49c3fe1643
commit 49c3fe1643
parent 89a4522faa
1 changed files with 249 additions and 1 deletions
--- a/crates/compiler/builtins/roc/Dict.roc
+++ b/crates/compiler/builtins/roc/Dict.roc
@ -21,7 +21,8 @@ interface Dict
        Bool.{ Bool, Eq },
        Result.{ Result },
        List,
-        Num.{ Nat },
+        Num.{ Nat, U64, U8 },
+        Hash.{ Hasher },
    ]

 ## A [dictionary](https://en.wikipedia.org/wiki/Associative_array) that lets you can associate keys with values.
@ -206,3 +207,250 @@ insertIfVacant = \dict, key, value ->
        dict
    else
        Dict.insert dict key value
+
+# We have decided not to expose the standard roc hashing algorithm.
+# This is to avoid external dependence and the need for versioning.
+# For now, it will just live within the Dict file.
+# If we add non-exposed roc standard library files, this should be moved to it's own file.
+# The current implementation is a form of [Wyhash final3](https://github.com/wangyi-fudan/wyhash/blob/a5995b98ebfa7bd38bfadc0919326d2e7aabb805/wyhash.h).
+# It is 64bit and little endian specific currently.
+LowLevelHasher := { originalSeed : U64, state : U64 } has [
+         Hasher {
+             addBytes,
+             addU8,
+             addU16,
+             addU32,
+             addU64,
+             addU128,
+             addI8,
+             addI16,
+             addI32,
+             addI64,
+             addI128,
+             complete,
+         },
+     ]
+
+createLowLevelHasher : { seed ?U64 } -> LowLevelHasher
+createLowLevelHasher = \{ seed ? 0x526F_6352_616E_643F } ->
+    @LowLevelHasher { originalSeed: seed, state: seed }
+
+combineState : LowLevelHasher, { a : U64, b : U64, seed : U64, length : U64 } -> LowLevelHasher
+combineState = \@LowLevelHasher { originalSeed, state }, { a, b, seed, length } ->
+    tmp = wymix (Num.bitwiseXor wyp1 a) (Num.bitwiseXor seed b)
+    hash = wymix (Num.bitwiseXor wyp1 length) tmp
+
+    @LowLevelHasher { originalSeed, state: wymix state hash }
+
+complete = \@LowLevelHasher { state } -> state
+
+addI8 = \hasher, i8 ->
+    addU8 hasher (Num.toU8 i8)
+addI16 = \hasher, i16 ->
+    addU16 hasher (Num.toU16 i16)
+addI32 = \hasher, i32 ->
+    addU32 hasher (Num.toU32 i32)
+addI64 = \hasher, i64 ->
+    addU64 hasher (Num.toU64 i64)
+addI128 = \hasher, i128 ->
+    addU128 hasher (Num.toU128 i128)
+
+# These implementations hash each value individually with the seed and then mix
+# the resulting hash with the state. There are other options that may be faster
+# like using the output of the last hash as the seed to the current hash.
+# I am simply not sure the tradeoffs here. Theoretically this method is more sound.
+# Either way, the performance will be similar and we can change this later.
+addU8 = \@LowLevelHasher { originalSeed, state }, u8 ->
+    seed = Num.bitwiseXor originalSeed wyp0
+    p0 = Num.toU64 u8
+    a =
+        Num.shiftLeftBy p0 16
+        |> Num.bitwiseOr (Num.shiftLeftBy p0 8)
+        |> Num.bitwiseOr p0
+    b = 0
+
+    combineState (@LowLevelHasher { originalSeed, state }) { a, b, seed, length: 8 }
+
+addU16 = \@LowLevelHasher { originalSeed, state }, u16 ->
+    seed = Num.bitwiseXor originalSeed wyp0
+    p0 = Num.bitwiseAnd u16 0xFF |> Num.toU64
+    p1 = Num.shiftRightZfBy u16 8 |> Num.toU64
+    a =
+        Num.shiftLeftBy p0 16
+        |> Num.bitwiseOr (Num.shiftLeftBy p1 8)
+        |> Num.bitwiseOr p1
+    b = 0
+
+    combineState (@LowLevelHasher { originalSeed, state }) { a, b, seed, length: 8 }
+
+addU32 = \@LowLevelHasher { originalSeed, state }, u32 ->
+    seed = Num.bitwiseXor originalSeed wyp0
+    p0 = Num.toU64 u32
+    p1 = Num.toU64 u32
+    a = Num.shiftLeftBy p0 32 |> Num.bitwiseOr p1
+    b = Num.shiftLeftBy p1 32 |> Num.bitwiseOr p0
+
+    combineState (@LowLevelHasher { originalSeed, state }) { a, b, seed, length: 8 }
+
+addU64 = \@LowLevelHasher { originalSeed, state }, u64 ->
+    seed = Num.bitwiseXor originalSeed wyp0
+    p0 = Num.bitwiseAnd 0xFFFF_FFFF u64
+    p1 = Num.shiftRightZfBy u64 32
+    a = Num.shiftLeftBy p0 32 |> Num.bitwiseOr p1
+    b = Num.shiftLeftBy p1 32 |> Num.bitwiseOr p0
+
+    combineState (@LowLevelHasher { originalSeed, state }) { a, b, seed, length: 8 }
+
+addU128 = \@LowLevelHasher { originalSeed, state }, u128 ->
+    seed = Num.bitwiseXor originalSeed wyp0
+    lower = u128 |> Num.toU64
+    upper = Num.shiftRightZfBy u128 64 |> Num.toU64
+    p0 = Num.bitwiseAnd 0xFFFF_FFFF lower
+    p1 = Num.shiftRightZfBy lower 32 |> Num.bitwiseAnd 0xFFFF_FFFF
+    p2 = Num.bitwiseAnd 0xFFFF_FFFF upper
+    p3 = Num.shiftRightZfBy upper 32 |> Num.bitwiseAnd 0xFFFF_FFFF
+    a = Num.shiftLeftBy p0 32 |> Num.bitwiseOr p2
+    b = Num.shiftLeftBy p3 32 |> Num.bitwiseOr p1
+
+    combineState (@LowLevelHasher { originalSeed, state }) { a, b, seed, length: 8 }
+
+addBytes : LowLevelHasher, List U8 -> LowLevelHasher
+addBytes = \@LowLevelHasher { originalSeed, state }, list ->
+    length = List.len list
+    seed = Num.bitwiseXor originalSeed wyp0
+    abs =
+        if length <= 16 then
+            if length >= 4 then
+                x = Num.shiftRightZfBy length 3 |> Num.shiftLeftBy 2
+                a = Num.bitwiseOr (wyr4 list 0 |> Num.shiftLeftBy 32) (wyr4 list x)
+                b =
+                    (wyr4 list (Num.subWrap length 4) |> Num.shiftLeftBy 32)
+                    |> Num.bitwiseOr (wyr4 list (Num.subWrap length 4 |> Num.subWrap x))
+
+                { a, b, seed }
+            else if length > 0 then
+                { a: wyr3 list 0 length, b: 0, seed }
+            else
+                { a: 0, b: 0, seed }
+        else if length <= 48 then
+            hashBytesHelper16 seed list 0 length
+        else
+            hashBytesHelper48 seed seed seed list 0 length
+
+    combineState (@LowLevelHasher { originalSeed, state }) { a: abs.a, b: abs.b, seed: abs.seed, length: Num.toU64 length }
+
+hashBytesHelper48 : U64, U64, U64, List U8, Nat, Nat -> { a : U64, b : U64, seed : U64 }
+hashBytesHelper48 = \seed, see1, see2, list, index, remaining ->
+    newSeed = wymix (Num.bitwiseXor (wyr8 list index) wyp1) (Num.bitwiseXor (wyr8 list (Num.addWrap index 8)) seed)
+    newSee1 = wymix (Num.bitwiseXor (wyr8 list (Num.addWrap index 16)) wyp2) (Num.bitwiseXor (wyr8 list (Num.addWrap index 24)) see1)
+    newSee2 = wymix (Num.bitwiseXor (wyr8 list (Num.addWrap index 32)) wyp3) (Num.bitwiseXor (wyr8 list (Num.addWrap index 40)) see2)
+    newRemaining = Num.subWrap remaining 48
+    newIndex = Num.addWrap index 48
+
+    if newRemaining > 48 then
+        hashBytesHelper48 newSeed newSee1 newSee2 list newIndex newRemaining
+    else if newRemaining > 16 then
+        finalSeed = Num.bitwiseXor newSee2 (Num.bitwiseXor newSee1 newSeed)
+
+        hashBytesHelper16 finalSeed list newIndex newRemaining
+    else
+        finalSeed = Num.bitwiseXor newSee2 (Num.bitwiseXor newSee1 newSeed)
+
+        { a: wyr8 list (Num.subWrap newRemaining 16 |> Num.addWrap newIndex), b: wyr8 list (Num.subWrap newRemaining 8 |> Num.addWrap newIndex), seed: finalSeed }
+
+hashBytesHelper16 : U64, List U8, Nat, Nat -> { a : U64, b : U64, seed : U64 }
+hashBytesHelper16 = \seed, list, index, remaining ->
+    newSeed = wymix (Num.bitwiseXor (wyr8 list index) wyp1) (Num.bitwiseXor (wyr8 list (Num.addWrap index 8)) seed)
+    newRemaining = Num.subWrap remaining 16
+    newIndex = Num.addWrap index 16
+
+    if newRemaining <= 16 then
+        { a: wyr8 list (Num.subWrap newRemaining 16 |> Num.addWrap newIndex), b: wyr8 list (Num.subWrap newRemaining 8 |> Num.addWrap newIndex), seed: newSeed }
+    else
+        hashBytesHelper16 newSeed list newIndex newRemaining
+
+wyp0 : U64
+wyp0 = 0xa0761d6478bd642f
+wyp1 : U64
+wyp1 = 0xe7037ed1a0b428db
+wyp2 : U64
+wyp2 = 0x8ebc6af09c88c6e3
+wyp3 : U64
+wyp3 = 0x589965cc75374cc3
+
+wymix : U64, U64 -> U64
+wymix = \a, b ->
+    { lower, upper } = wymum a b
+
+    Num.bitwiseXor lower upper
+
+wymum : U64, U64 -> { lower : U64, upper : U64 }
+wymum = \a, b ->
+    r = Num.toU128 a * Num.toU128 b
+    lower = Num.toU64 r
+    upper = Num.shiftRightZfBy r 64 |> Num.toU64
+
+    # This is the more robust form.
+    # { lower: Num.bitwiseXor a lower, upper: Num.bitwiseXor b upper }
+    { lower, upper }
+
+# Get the next 8 bytes as a U64
+wyr8 : List U8, Nat -> U64
+wyr8 = \list, index ->
+    # With seamless slices and Num.fromBytes, this should be possible to make faster and nicer.
+    # It would also deal with the fact that on big endian systems we want to invert the order here.
+    # Without seamless slices, we would need fromBytes to take an index.
+    p1 = List.getUnsafe list index |> Num.toU64
+    p2 = List.getUnsafe list (Num.addWrap index 1) |> Num.toU64
+    p3 = List.getUnsafe list (Num.addWrap index 2) |> Num.toU64
+    p4 = List.getUnsafe list (Num.addWrap index 3) |> Num.toU64
+    p5 = List.getUnsafe list (Num.addWrap index 4) |> Num.toU64
+    p6 = List.getUnsafe list (Num.addWrap index 5) |> Num.toU64
+    p7 = List.getUnsafe list (Num.addWrap index 6) |> Num.toU64
+    p8 = List.getUnsafe list (Num.addWrap index 7) |> Num.toU64
+    a = Num.bitwiseOr p1 (Num.shiftLeftBy p2 8)
+    b = Num.bitwiseOr (Num.shiftLeftBy p3 16) (Num.shiftLeftBy p4 24)
+    c = Num.bitwiseOr (Num.shiftLeftBy p5 32) (Num.shiftLeftBy p6 40)
+    d = Num.bitwiseOr (Num.shiftLeftBy p7 48) (Num.shiftLeftBy p8 56)
+
+    Num.bitwiseOr (Num.bitwiseOr a b) (Num.bitwiseOr c d)
+
+# Get the next 4 bytes as a U64 with some shifting.
+wyr4 : List U8, Nat -> U64
+wyr4 = \list, index ->
+    p1 = List.getUnsafe list index |> Num.toU64
+    p2 = List.getUnsafe list (Num.addWrap index 1) |> Num.toU64
+    p3 = List.getUnsafe list (Num.addWrap index 2) |> Num.toU64
+    p4 = List.getUnsafe list (Num.addWrap index 3) |> Num.toU64
+    a = Num.bitwiseOr p1 (Num.shiftLeftBy p2 8)
+    b = Num.bitwiseOr (Num.shiftLeftBy p3 16) (Num.shiftLeftBy p4 24)
+
+    Num.bitwiseOr a b
+
+# Get the next K bytes with some shifting.
+# K must be 3 or less.
+wyr3 : List U8, Nat, Nat -> U64
+wyr3 = \list, index, k ->
+    # ((uint64_t)p[0])<<16)|(((uint64_t)p[k>>1])<<8)|p[k-1]
+    p1 = List.getUnsafe list index |> Num.toU64
+    p2 = List.getUnsafe list (Num.shiftRightZfBy k 1 |> Num.addWrap index) |> Num.toU64
+    p3 = List.getUnsafe list (Num.subWrap k 1 |> Num.addWrap index) |> Num.toU64
+    a = Num.bitwiseOr (Num.shiftLeftBy p1 16) (Num.shiftLeftBy p2 8)
+
+    Num.bitwiseOr a p3
+
+expect
+    hash =
+        createLowLevelHasher {}
+        |> addU64 0xFEDC_BA98_7654_3210
+        |> complete
+
+    hash == 0xA133_4345_21E3_2C10
+
+expect
+    hash =
+        createLowLevelHasher {}
+        |> addBytes [0x10, 0x32, 0x54, 0x76, 0x98, 0xBA, 0xDC, 0xFE]
+        |> complete
+
+    hash == 0xA133_4345_21E3_2C10