Add benchmarks related to add_row_number performance investigation (#8091)

- Follow-up to #8055. - Adds a benchmark comparing the performance of the Enso Map and the Java HashMap in two scenarios: _incremental-only_ updates (like `Vector.distinct`) and _replacing_ updates (like keeping a counter for each key). These benchmarks can be used as a metric for #8090.
2024-12-22 23:31:42 +03:00 · 2023-10-18 19:21:59 +02:00 · 2023-10-18 19:21:59 +02:00 · 93a31fcc8b
commit 93a31fcc8b
parent cec115d25b
7 changed files with 271 additions and 13 deletions
--- a/build.sbt
+++ b/build.sbt
@ -308,7 +308,8 @@ lazy val enso = (project in file("."))
    `std-aws`,
    `simple-httpbin`,
    `enso-test-java-helpers`,
-    `exploratory-benchmark-java-helpers`
+    `exploratory-benchmark-java-helpers`,
+    `benchmark-java-helpers`
  )
  .settings(Global / concurrentRestrictions += Tags.exclusive(Exclusive))
  .settings(
@ -1386,6 +1387,7 @@ lazy val runtime = (project in file("engine/runtime"))
    (Runtime / compile) := (Runtime / compile)
      .dependsOn(`std-base` / Compile / packageBin)
      .dependsOn(`enso-test-java-helpers` / Compile / packageBin)
+      .dependsOn(`benchmark-java-helpers` / Compile / packageBin)
      .dependsOn(`exploratory-benchmark-java-helpers` / Compile / packageBin)
      .dependsOn(`std-image` / Compile / packageBin)
      .dependsOn(`std-database` / Compile / packageBin)
@ -2200,6 +2202,26 @@ lazy val `exploratory-benchmark-java-helpers` = project
  .dependsOn(`std-base` % "provided")
  .dependsOn(`std-table` % "provided")

+lazy val `benchmark-java-helpers` = project
+  .in(
+    file(
+      "test/Benchmarks/polyglot-sources/benchmark-java-helpers"
+    )
+  )
+  .settings(
+    frgaalJavaCompilerSetting,
+    autoScalaLibrary := false,
+    Compile / packageBin / artifactPath :=
+      file(
+        "test/Benchmarks/polyglot/java/benchmark-java-helpers.jar"
+      ),
+    libraryDependencies ++= Seq(
+      "org.graalvm.sdk" % "graal-sdk" % graalMavenPackagesVersion % "provided"
+    )
+  )
+  .dependsOn(`std-base` % "provided")
+  .dependsOn(`std-table` % "provided")
+
 lazy val `std-table` = project
  .in(file("std-bits") / "table")
  .enablePlugins(Antlr4Plugin)
@ -2531,12 +2553,14 @@ pkgStdLibInternal := Def.inputTask {
    case "TestHelpers" =>
      (`enso-test-java-helpers` / Compile / packageBin).value
      (`exploratory-benchmark-java-helpers` / Compile / packageBin).value
+      (`benchmark-java-helpers` / Compile / packageBin).value
    case "AWS" =>
      (`std-aws` / Compile / packageBin).value
    case _ if buildAllCmd =>
      (`std-base` / Compile / packageBin).value
      (`enso-test-java-helpers` / Compile / packageBin).value
      (`exploratory-benchmark-java-helpers` / Compile / packageBin).value
+      (`benchmark-java-helpers` / Compile / packageBin).value
      (`std-table` / Compile / packageBin).value
      (`std-database` / Compile / packageBin).value
      (`std-image` / Compile / packageBin).value
--- a/distribution/lib/Standard/Test/0.0.0-dev/src/Bench.enso
+++ b/distribution/lib/Standard/Test/0.0.0-dev/src/Bench.enso
@ -159,10 +159,10 @@ type Bench
        count = self.total_specs
        IO.println <| "Found " + count.to_text + " cases to execute (ETA " + self.estimated_runtime.to_display_text + ")"

-        case Environment.get "ENSO_BENCHMARK_REPORT_PATH" of
+        case get_benchmark_report_path of
            Nothing -> Nothing
            path ->
-                line = 'Label,Phase,"Invocations count","Average time (ms)"'
+                line = 'Label,Phase,"Invocations count","Average time (ms)","Time Stdev"'
                line.write path on_existing_file=Existing_File_Behavior.Backup

        self.fold Nothing _-> g-> s->
@ -211,8 +211,10 @@ type Bench
       computation.
    single_call ~act =
        start = System.nano_time
-        Runtime.no_inline act
+        r = Runtime.no_inline act
        end = System.nano_time
+        # If the computation returned a dataflow error, we raise it to a panic - we do not want silent failures in benchmarks.
+        Panic.rethrow r
        end - start

    ## Run a single phase of the benchmark.
@ -237,13 +239,15 @@ type Bench
                durations_builder.append dur
                @Tail_Call go (cur_ns + dur)
        go phase_start
-        durations = durations_builder.to_vector
-        sum = durations.reduce (_ + _)
+        nanos_in_ms = 1000000
+        durations = durations_builder.to_vector.map (x-> x / nanos_in_ms)
+        stats = durations.compute_bulk [Statistic.Mean, Statistic.Standard_Deviation]
+        avg = stats.first
+        stddev = stats.second
        run_iters = durations.length
-        avg = (sum / run_iters) / 1000000
        phase_end = System.nano_time
        phase_duration = Duration.new nanoseconds=(phase_end - phase_start)
-        Bench.summarize_phase label phase_name run_iters avg phase_duration
+        Bench.summarize_phase label phase_name run_iters avg stddev phase_duration

    ## PRIVATE
       This is a very simple implementation of summarizing the benchmark
@ -251,16 +255,17 @@ type Bench

       We may want to improve it later, but it gets the job done to give us
       simple summary that can be analysed more easily than logs.
-    summarize_phase (label:Text) (phase_name:Text) (invocations:Integer) (average_time:Float) (phase_duration:Duration) =
-        fmt = average_time.format "#.###"
+    summarize_phase (label:Text) (phase_name:Text) (invocations:Integer) (average_time:Float) (time_stddev:Float) (phase_duration:Duration) =
+        avg_fmt = average_time.format "#.###"
+        stddev_fmt = time_stddev.format "#.###"
        IO.println <| phase_name + " duration:    " + (phase_duration.total_milliseconds.format "#.##") + " ms"
        IO.println <| phase_name + " invocations: " + invocations.to_text
-        IO.println <| phase_name + " avg time:    " + fmt + " ms"
+        IO.println <| phase_name + " avg time:    " + avg_fmt + " ms (+-" + stddev_fmt + ")"

-        case Environment.get "ENSO_BENCHMARK_REPORT_PATH" of
+        case get_benchmark_report_path of
            Nothing -> Nothing
            path ->
-                line = '\n"'+label+'","'+phase_name+'",'+invocations.to_text+','+fmt
+                line = '\n"'+label+'","'+phase_name+'",'+invocations.to_text+','+avg_fmt+','+stddev_fmt
                line.write path on_existing_file=Existing_File_Behavior.Append

 ## PRIVATE
@ -274,3 +279,7 @@ validate_name name =
    valid_java_identifier_regex = Regex.compile "[A-Za-z_$][a-zA-Z0-9_$]*"
    if valid_java_identifier_regex.matches name then Nothing else
        Panic.throw (Illegal_Argument.Error ("Invalid benchmark name: '" + name + "'"))
+
+## PRIVATE
+get_benchmark_report_path : Text | Nothing
+get_benchmark_report_path = Environment.get "ENSO_BENCHMARK_REPORT_PATH"
--- a/std-bits/table/src/main/java/org/enso/table/data/index/OrderedMultiValueKey.java
+++ b/std-bits/table/src/main/java/org/enso/table/data/index/OrderedMultiValueKey.java
@ -73,4 +73,9 @@ public class OrderedMultiValueKey extends MultiValueKeyBase
        "Currently no hash_code implementation consistent with the ObjectComparator is exposed, so"
            + " OrderedMultiValueKey is not hashable.");
  }
+
+  @Override
+  public String toString() {
+    return "OrderedMultiValueKey{row=" + rowIndex + "}";
+  }
 }
--- a/test/Benchmarks/polyglot-sources/benchmark-java-helpers/src/main/java/org/enso/benchmark_helpers/JavaHashMapWrapper.java
+++ b/test/Benchmarks/polyglot-sources/benchmark-java-helpers/src/main/java/org/enso/benchmark_helpers/JavaHashMapWrapper.java
@ -0,0 +1,36 @@
+package org.enso.benchmark_helpers;
+
+import java.util.HashMap;
+
+/**
+ * Wraps a Java HashMap into an interface hiding it, to ensure that we are calling the raw HashMap
+ * and are not using the Enso conversions that may be applied automatically. This allows us to
+ * compare the raw HashMap performance with other variants.
+ */
+public class JavaHashMapWrapper {
+  private final HashMap<Object, Object> map = new HashMap<>();
+
+  public JavaHashMapWrapper insert(Object key, Object value) {
+    map.put(key, value);
+    return this;
+  }
+
+  public Object get(Object key) {
+    return map.get(key);
+  }
+
+  public long size() {
+    return map.size();
+  }
+
+  public Object[][] to_vector() {
+    Object[][] result = new Object[map.size()][2];
+    int i = 0;
+    for (var entry : map.entrySet()) {
+      result[i][0] = entry.getKey();
+      result[i][1] = entry.getValue();
+      i++;
+    }
+    return result;
+  }
+}
--- a/test/Benchmarks/src/Main.enso
+++ b/test/Benchmarks/src/Main.enso
@ -11,6 +11,7 @@ import project.Table.Arithmetic
 import project.Table.Column_From_Vector
 import project.Table.Cross_Tab
 import project.Table.Sorting
+import project.Table.Internal.Multi_Value_Key
 import project.Text.Build
 import project.Text.Compare
 import project.Text.Contains
@ -22,6 +23,7 @@ import project.Collections
 import project.Column_Numeric
 import project.Equality
 import project.Json_Bench
+import project.Map.Hash_Map
 import project.Natural_Order_Sort
 import project.Number_Parse
 import project.Numeric
@ -39,6 +41,9 @@ all_benchmarks =
    builder.append Operations.collect_benches
    builder.append Sort.collect_benches

+    # Map
+    builder.append Hash_Map.collect_benches
+
    # Statistics
    builder.append Count_Min_Max.collect_benches

@ -49,6 +54,7 @@ all_benchmarks =
    builder.append Column_From_Vector.collect_benches
    builder.append Cross_Tab.collect_benches
    builder.append Sorting.collect_benches
+    builder.append Multi_Value_Key.collect_benches

    # Text
    builder.append Build.collect_benches
--- a/test/Benchmarks/src/Map/Hash_Map.enso
+++ b/test/Benchmarks/src/Map/Hash_Map.enso
@ -0,0 +1,59 @@
+from Standard.Base import all
+
+from Standard.Table import Column, Value_Type, Auto
+import Standard.Table.Data.Type.Value_Type.Bits
+
+from Standard.Test import Bench
+
+polyglot java import org.enso.benchmark_helpers.JavaHashMapWrapper
+
+options = Bench.options . set_warmup (Bench.phase_conf 2 2) . set_measure (Bench.phase_conf 2 3)
+
+type Data
+    Value ~ints
+
+    create n =
+        create_ints =
+            # Fixed seed, so that every run benchmarks the same input data.
+            rng = Random.new 42
+            Vector.new n _-> rng.integer 0 (n.div 100)
+        Data.Value create_ints
+
+type Scenario
+    Instance map_constructor
+
+    # Counts distinct values in a vector
+    run_distinct self ints =
+        new_map = ints.fold (self.map_constructor Nothing) acc-> x->
+            if acc.get x . is_nothing . not then acc else
+                acc.insert x True
+        new_map.size
+
+    # Finds the most frequent value in a vector
+    run_count_keys self ints =
+        new_map = ints.fold (self.map_constructor Nothing) acc-> x->
+            current_count = (acc.get x . if_nothing 0) + 1
+            acc.insert x current_count
+        max_key = new_map.to_vector.fold (Pair.new Nothing 0) acc-> entry->
+            freq = entry.second
+            if freq > acc.second then Pair.new entry.first freq else acc
+        max_key
+
+collect_benches = Bench.build builder->
+    n = 100000
+    data = Data.create n
+
+    builder.group ("Enso_Hash_Map_" + n.to_text) options group_builder->
+        # Scenario similar to what is done in distinct
+        group_builder.specify "Enso_Incremental" <|
+            Scenario.Instance (_ -> Map.empty) . run_distinct data.ints
+        group_builder.specify "Java_Incremental" <|
+            Scenario.Instance (_ -> JavaHashMapWrapper.new) . run_distinct data.ints
+
+        # A scenario similar to what is done in add_row_number with grouping
+        group_builder.specify "Enso_Replacement" <|
+            Scenario.Instance (_ -> Map.empty) . run_count_keys data.ints
+        group_builder.specify "Java_Replacement" <|
+            Scenario.Instance (_ -> JavaHashMapWrapper.new) . run_count_keys data.ints
+
+main = collect_benches . run_main
--- a/test/Benchmarks/src/Table/Internal/Multi_Value_Key.enso
+++ b/test/Benchmarks/src/Table/Internal/Multi_Value_Key.enso
@ -0,0 +1,119 @@
+from Standard.Base import all
+
+from Standard.Table import Table, Value_Type, Aggregate_Column
+import Standard.Table.Internal.Multi_Value_Key.Ordered_Multi_Value_Key
+import Standard.Table.Internal.Multi_Value_Key.Unordered_Multi_Value_Key
+from Standard.Test import Bench
+
+polyglot java import org.enso.table.data.index.OrderedMultiValueKey
+polyglot java import org.enso.table.data.index.UnorderedMultiValueKey
+polyglot java import org.enso.base.text.TextFoldingStrategy
+
+options = Bench.options . set_warmup (Bench.phase_conf 2 3) . set_measure (Bench.phase_conf 2 2)
+
+type My_Pair
+    Value x1 x2
+
+type My_Pair_Comparator
+    compare x y =
+        Ordering.compare x.x2 y.x2 . and_then <|
+            Ordering.compare x.x1 y.x1
+
+    hash x = x.x1.bit_xor x.x2
+
+Comparable.from (_:My_Pair) = My_Pair_Comparator
+
+create_table : Integer -> Table
+create_table num_rows =
+    rng = Random.new 42
+    x = Vector.new num_rows _-> rng.integer min=0 max=100
+    y = Vector.new num_rows _-> rng.integer min=0 max=20 . to_text
+    z = Vector.new num_rows _->
+        a = rng.integer min=0 max=100
+        b = rng.integer min=0 max=100
+        My_Pair.Value a b
+    t = Table.new [["X", x], ["Y", y], ["Z", z]]
+
+    assert condition =
+        if condition.not then Panic.throw "Assertion failed"
+
+    assert ((t.at "X" . value_type) == Value_Type.Integer)
+    assert ((t.at "Y" . value_type) == Value_Type.Char)
+    assert ((t.at "Z" . value_type) == Value_Type.Mixed)
+    t
+
+
+type Data
+    Value ~table
+
+    create num_rows = Data.Value (create_table num_rows)
+
+compare_ordered_keys make_key table compare_keys =
+    n = table.row_count
+    keys = 0.up_to n . map ix-> make_key ix
+    blackhole = 1.up_to n . fold 0 acc-> ix->
+        current = keys.at ix
+        previous = keys.at (ix - 1)
+        if compare_keys current previous then acc+1 else acc-1
+    blackhole
+
+compute_hashcodes make_key table get_hash =
+    n = table.row_count
+    keys = 0.up_to n . map ix-> make_key ix
+    blackhole = keys.fold 0 acc-> key->
+        h = get_hash key
+        (acc + h) % 1997
+    blackhole
+
+collect_benches = Bench.build builder->
+    num_rows = 100000
+    data = Data.create num_rows
+
+    builder.group ("Ordered_Multi_Value_Key" + num_rows.to_text) options group_builder->
+        run_enso table =
+            key_columns = table.columns
+            directions = Vector.fill key_columns.length False
+            make_key row_ix = Ordered_Multi_Value_Key.from_row key_columns directions row_ix
+            compare_keys key1 key2 = key1 < key2
+            compare_ordered_keys make_key table compare_keys
+
+        run_java table =
+            key_storages = table.columns.map c-> c.java_column.getStorage
+            directions = Vector.fill key_storages.length 1
+            make_key row_ix = OrderedMultiValueKey.new key_storages row_ix directions
+            compare_keys key1 key2 = key1.compareTo key2 < 0
+            compare_ordered_keys make_key table compare_keys
+
+        group_builder.specify "Primitive_Enso" <|
+            run_enso (data.table.select_columns ["X", "Y"])
+        group_builder.specify "Primitive_Java" <|
+            run_java (data.table.select_columns ["X", "Y"])
+        group_builder.specify "Custom_Object_Enso" <|
+            run_enso (data.table.select_columns ["X", "Z"])
+        group_builder.specify "Custom_Object_Java" <|
+            run_java (data.table.select_columns ["X", "Z"])
+
+    builder.group ("Unordered_Multi_Value_Key" + num_rows.to_text) options group_builder->
+        run_enso table =
+            key_columns = table.columns
+            make_key row_ix = Unordered_Multi_Value_Key.from_row key_columns row_ix
+            get_hash key = key.hash_code
+            compute_hashcodes make_key table get_hash
+
+        run_java table =
+            key_storages = table.columns.map c-> c.java_column.getStorage
+            text_folding_strategies = Vector.fill key_storages.length TextFoldingStrategy.unicodeNormalizedFold
+            make_key row_ix = UnorderedMultiValueKey.new key_storages row_ix text_folding_strategies
+            get_hash key = key.hashCode
+            compute_hashcodes make_key table get_hash
+
+        group_builder.specify "Primitive_Enso" <|
+            run_enso (data.table.select_columns ["X", "Y"])
+        group_builder.specify "Primitive_Java" <|
+            run_java (data.table.select_columns ["X", "Y"])
+        group_builder.specify "Custom_Object_Enso" <|
+            run_enso (data.table.select_columns ["X", "Z"])
+        group_builder.specify "Custom_Object_Java" <|
+            run_java (data.table.select_columns ["X", "Z"])
+
+main = collect_benches . run_main