Add benchmarks related to add_row_number performance investigation (#8091)

- Follow-up of #8055
- Adds a benchmark comparing performance of Enso Map and Java HashMap in two scenarios - _only incremental_ updates (like `Vector.distinct`) and _replacing_ updates (like keeping a counter for each key). These benchmarks can be used as a metric for #8090
This commit is contained in:
Radosław Waśko 2023-10-18 19:21:59 +02:00 committed by GitHub
parent cec115d25b
commit 93a31fcc8b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 271 additions and 13 deletions

View File

@ -308,7 +308,8 @@ lazy val enso = (project in file("."))
`std-aws`,
`simple-httpbin`,
`enso-test-java-helpers`,
`exploratory-benchmark-java-helpers`
`exploratory-benchmark-java-helpers`,
`benchmark-java-helpers`
)
.settings(Global / concurrentRestrictions += Tags.exclusive(Exclusive))
.settings(
@ -1386,6 +1387,7 @@ lazy val runtime = (project in file("engine/runtime"))
(Runtime / compile) := (Runtime / compile)
.dependsOn(`std-base` / Compile / packageBin)
.dependsOn(`enso-test-java-helpers` / Compile / packageBin)
.dependsOn(`benchmark-java-helpers` / Compile / packageBin)
.dependsOn(`exploratory-benchmark-java-helpers` / Compile / packageBin)
.dependsOn(`std-image` / Compile / packageBin)
.dependsOn(`std-database` / Compile / packageBin)
@ -2200,6 +2202,26 @@ lazy val `exploratory-benchmark-java-helpers` = project
.dependsOn(`std-base` % "provided")
.dependsOn(`std-table` % "provided")
// Java helper classes for the Enso project under `test/Benchmarks`.
// The packaged jar is written directly into that project's `polyglot/java`
// directory (see `artifactPath` below).
lazy val `benchmark-java-helpers` = project
  .in(file("test/Benchmarks/polyglot-sources/benchmark-java-helpers"))
  .settings(
    frgaalJavaCompilerSetting,
    autoScalaLibrary := false,
    Compile / packageBin / artifactPath := file(
      "test/Benchmarks/polyglot/java/benchmark-java-helpers.jar"
    ),
    libraryDependencies += "org.graalvm.sdk" % "graal-sdk" % graalMavenPackagesVersion % "provided"
  )
  .dependsOn(`std-base` % "provided", `std-table` % "provided")
lazy val `std-table` = project
.in(file("std-bits") / "table")
.enablePlugins(Antlr4Plugin)
@ -2531,12 +2553,14 @@ pkgStdLibInternal := Def.inputTask {
case "TestHelpers" =>
(`enso-test-java-helpers` / Compile / packageBin).value
(`exploratory-benchmark-java-helpers` / Compile / packageBin).value
(`benchmark-java-helpers` / Compile / packageBin).value
case "AWS" =>
(`std-aws` / Compile / packageBin).value
case _ if buildAllCmd =>
(`std-base` / Compile / packageBin).value
(`enso-test-java-helpers` / Compile / packageBin).value
(`exploratory-benchmark-java-helpers` / Compile / packageBin).value
(`benchmark-java-helpers` / Compile / packageBin).value
(`std-table` / Compile / packageBin).value
(`std-database` / Compile / packageBin).value
(`std-image` / Compile / packageBin).value

View File

@ -159,10 +159,10 @@ type Bench
count = self.total_specs
IO.println <| "Found " + count.to_text + " cases to execute (ETA " + self.estimated_runtime.to_display_text + ")"
case Environment.get "ENSO_BENCHMARK_REPORT_PATH" of
case get_benchmark_report_path of
Nothing -> Nothing
path ->
line = 'Label,Phase,"Invocations count","Average time (ms)"'
line = 'Label,Phase,"Invocations count","Average time (ms)","Time Stdev"'
line.write path on_existing_file=Existing_File_Behavior.Backup
self.fold Nothing _-> g-> s->
@ -211,8 +211,10 @@ type Bench
computation.
single_call ~act =
    # `act` is a suspended argument; it must be evaluated exactly once, between
    # the two clock readings, so that the measured interval covers one run only.
    start = System.nano_time
    r = Runtime.no_inline act
    end = System.nano_time
    # If the computation returned a dataflow error, we raise it to a panic - we do not want silent failures in benchmarks.
    Panic.rethrow r
    # Elapsed time of the single invocation, in nanoseconds.
    end - start
## Run a single phase of the benchmark.
@ -237,13 +239,15 @@ type Bench
durations_builder.append dur
@Tail_Call go (cur_ns + dur)
go phase_start
durations = durations_builder.to_vector
sum = durations.reduce (_ + _)
nanos_in_ms = 1000000
durations = durations_builder.to_vector.map (x-> x / nanos_in_ms)
stats = durations.compute_bulk [Statistic.Mean, Statistic.Standard_Deviation]
avg = stats.first
stddev = stats.second
run_iters = durations.length
avg = (sum / run_iters) / 1000000
phase_end = System.nano_time
phase_duration = Duration.new nanoseconds=(phase_end - phase_start)
Bench.summarize_phase label phase_name run_iters avg phase_duration
Bench.summarize_phase label phase_name run_iters avg stddev phase_duration
## PRIVATE
This is a very simple implementation of summarizing the benchmark
@ -251,16 +255,17 @@ type Bench
We may want to improve it later, but it gets the job done to give us
simple summary that can be analysed more easily than logs.
summarize_phase (label:Text) (phase_name:Text) (invocations:Integer) (average_time:Float) (time_stddev:Float) (phase_duration:Duration) =
    # Both statistics are rendered once and reused for the console output and
    # for the CSV report, so the two always show the same numbers.
    avg_fmt = average_time.format "#.###"
    stddev_fmt = time_stddev.format "#.###"
    IO.println <| phase_name + " duration: " + (phase_duration.total_milliseconds.format "#.##") + " ms"
    IO.println <| phase_name + " invocations: " + invocations.to_text
    IO.println <| phase_name + " avg time: " + avg_fmt + " ms (+-" + stddev_fmt + ")"

    # A report row is appended only when a report path is configured.
    # The column order matches the header: label, phase, invocations, average, stdev.
    case get_benchmark_report_path of
        Nothing -> Nothing
        path ->
            line = '\n"'+label+'","'+phase_name+'",'+invocations.to_text+','+avg_fmt+','+stddev_fmt
            line.write path on_existing_file=Existing_File_Behavior.Append
## PRIVATE
@ -274,3 +279,7 @@ validate_name name =
valid_java_identifier_regex = Regex.compile "[A-Za-z_$][a-zA-Z0-9_$]*"
if valid_java_identifier_regex.matches name then Nothing else
Panic.throw (Illegal_Argument.Error ("Invalid benchmark name: '" + name + "'"))
## PRIVATE
   Reads the `ENSO_BENCHMARK_REPORT_PATH` environment variable.

   Within this module the result is used as the path of the CSV report file:
   callers treat `Nothing` as "no report requested" and otherwise write to the
   returned path.
get_benchmark_report_path : Text | Nothing
get_benchmark_report_path = Environment.get "ENSO_BENCHMARK_REPORT_PATH"

View File

@ -73,4 +73,9 @@ public class OrderedMultiValueKey extends MultiValueKeyBase
"Currently no hash_code implementation consistent with the ObjectComparator is exposed, so"
+ " OrderedMultiValueKey is not hashable.");
}
/** Returns a short debug representation that identifies the key by its row index. */
@Override
public String toString() {
  StringBuilder text = new StringBuilder("OrderedMultiValueKey{row=");
  text.append(rowIndex);
  text.append("}");
  return text.toString();
}
}

View File

@ -0,0 +1,36 @@
package org.enso.benchmark_helpers;
import java.util.HashMap;
/**
 * Opaque facade over a plain {@link HashMap}.
 *
 * <p>The map itself is never exposed, so callers can only go through the methods below. The intent
 * is to exercise the raw Java HashMap without any automatic Enso conversions being applied to it,
 * which makes it usable as a baseline when comparing against other map implementations.
 */
public class JavaHashMapWrapper {
  private final HashMap<Object, Object> backing = new HashMap<>();

  /**
   * Associates {@code value} with {@code key}, replacing any previous association.
   *
   * @return this wrapper, so that calls can be chained or used as a fold accumulator
   */
  public JavaHashMapWrapper insert(Object key, Object value) {
    backing.put(key, value);
    return this;
  }

  /** Returns the value stored under {@code key}, or {@code null} when there is none. */
  public Object get(Object key) {
    return backing.get(key);
  }

  /** Returns the number of entries currently stored. */
  public long size() {
    return backing.size();
  }

  /**
   * Copies all entries into an array of two-element rows.
   *
   * @return one {@code [key, value]} row per entry, in the iteration order of the underlying map
   */
  public Object[][] to_vector() {
    Object[][] pairs = new Object[backing.size()][];
    int next = 0;
    for (var entry : backing.entrySet()) {
      pairs[next] = new Object[] {entry.getKey(), entry.getValue()};
      next++;
    }
    return pairs;
  }
}

View File

@ -11,6 +11,7 @@ import project.Table.Arithmetic
import project.Table.Column_From_Vector
import project.Table.Cross_Tab
import project.Table.Sorting
import project.Table.Internal.Multi_Value_Key
import project.Text.Build
import project.Text.Compare
import project.Text.Contains
@ -22,6 +23,7 @@ import project.Collections
import project.Column_Numeric
import project.Equality
import project.Json_Bench
import project.Map.Hash_Map
import project.Natural_Order_Sort
import project.Number_Parse
import project.Numeric
@ -39,6 +41,9 @@ all_benchmarks =
builder.append Operations.collect_benches
builder.append Sort.collect_benches
# Map
builder.append Hash_Map.collect_benches
# Statistics
builder.append Count_Min_Max.collect_benches
@ -49,6 +54,7 @@ all_benchmarks =
builder.append Column_From_Vector.collect_benches
builder.append Cross_Tab.collect_benches
builder.append Sorting.collect_benches
builder.append Multi_Value_Key.collect_benches
# Text
builder.append Build.collect_benches

View File

@ -0,0 +1,59 @@
from Standard.Base import all
from Standard.Table import Column, Value_Type, Auto
import Standard.Table.Data.Type.Value_Type.Bits
from Standard.Test import Bench
polyglot java import org.enso.benchmark_helpers.JavaHashMapWrapper
# Phase configuration shared by every benchmark group defined in this file.
options =
    warmup_conf = Bench.phase_conf 2 2
    measure_conf = Bench.phase_conf 2 3
    Bench.options . set_warmup warmup_conf . set_measure measure_conf
# Input data shared by the benchmarks in this file.
type Data
    Value ~ints

    # Builds a vector of `n` random integers drawn from the range passed to
    # `rng.integer`, i.e. `0` and `n.div 100`, so keys repeat frequently.
    create n =
        create_ints =
            # A fixed seed keeps the generated input identical between runs, so
            # that results of separate benchmark runs remain comparable.
            rng = Random.new 42
            Vector.new n _->
                rng.integer 0 (n.div 100)
        Data.Value create_ints
# A benchmark scenario parametrized by the map implementation under test.
# `map_constructor` is called with `Nothing` and must return an empty map that
# supports `get`, `insert`, `size` and `to_vector`.
type Scenario
    Instance map_constructor

    # Counts distinct values in a vector.
    # A key is inserted only the first time it is seen, so entries are never replaced.
    run_distinct self ints =
        empty_map = self.map_constructor Nothing
        distinct_map = ints.fold empty_map seen-> value->
            already_present = (seen.get value).is_nothing.not
            if already_present then seen else seen.insert value True
        distinct_map.size

    # Finds the most frequent value in a vector.
    # Every element replaces the counter stored under its key; the result is a
    # `Pair` holding the winning key and its count.
    run_count_keys self ints =
        counts = ints.fold (self.map_constructor Nothing) counters-> key->
            updated_count = (counters.get key . if_nothing 0) + 1
            counters.insert key updated_count
        initial_best = Pair.new Nothing 0
        counts.to_vector.fold initial_best best-> entry->
            frequency = entry.second
            if frequency > best.second then Pair.new entry.first frequency else best
# Registers the map benchmarks: the same two scenarios are run once against the
# Enso `Map` and once against the raw Java HashMap wrapper.
collect_benches = Bench.build builder->
    size = 100000
    data = Data.create size
    group_name = "Enso_Hash_Map_" + size.to_text

    builder.group group_name options group_builder->
        # Scenario similar to what is done in distinct
        group_builder.specify "Enso_Incremental" <|
            (Scenario.Instance (_ -> Map.empty)).run_distinct data.ints
        group_builder.specify "Java_Incremental" <|
            (Scenario.Instance (_ -> JavaHashMapWrapper.new)).run_distinct data.ints

        # A scenario similar to what is done in add_row_number with grouping
        group_builder.specify "Enso_Replacement" <|
            (Scenario.Instance (_ -> Map.empty)).run_count_keys data.ints
        group_builder.specify "Java_Replacement" <|
            (Scenario.Instance (_ -> JavaHashMapWrapper.new)).run_count_keys data.ints

# Entry point used when this file is run directly.
main = collect_benches.run_main

View File

@ -0,0 +1,119 @@
from Standard.Base import all
from Standard.Table import Table, Value_Type, Aggregate_Column
import Standard.Table.Internal.Multi_Value_Key.Ordered_Multi_Value_Key
import Standard.Table.Internal.Multi_Value_Key.Unordered_Multi_Value_Key
from Standard.Test import Bench
polyglot java import org.enso.table.data.index.OrderedMultiValueKey
polyglot java import org.enso.table.data.index.UnorderedMultiValueKey
polyglot java import org.enso.base.text.TextFoldingStrategy
# Phase configuration shared by every benchmark group defined in this file.
options =
    warmup_conf = Bench.phase_conf 2 3
    measure_conf = Bench.phase_conf 2 2
    Bench.options . set_warmup warmup_conf . set_measure measure_conf
# A custom two-field value, used to benchmark keys that go through a
# user-defined comparator.
type My_Pair
    Value x1 x2

# Comparator for `My_Pair`: orders by `x2` first and falls back to `x1`.
type My_Pair_Comparator
    compare x y =
        by_second_field = Ordering.compare x.x2 y.x2
        by_second_field.and_then (Ordering.compare x.x1 y.x1)

    # Combines both fields, so pairs that compare as equal hash identically.
    hash x =
        first_field = x.x1
        first_field.bit_xor x.x2

# Registers the comparator above for all `My_Pair` values.
Comparable.from (_:My_Pair) = My_Pair_Comparator
# Builds the benchmark table with `num_rows` rows and three columns:
# "X" (integers), "Y" (integers rendered as text) and "Z" (`My_Pair` values).
# The column value types are verified before the table is returned.
create_table : Integer -> Table
create_table num_rows =
    rng = Random.new 42
    # The three columns draw from the same seeded generator, so they are
    # generated strictly in this order.
    ints = Vector.new num_rows _->
        rng.integer min=0 max=100
    texts = Vector.new num_rows _->
        (rng.integer min=0 max=20).to_text
    pairs = Vector.new num_rows _->
        first = rng.integer min=0 max=100
        second = rng.integer min=0 max=100
        My_Pair.Value first second
    table = Table.new [["X", ints], ["Y", texts], ["Z", pairs]]

    assert condition =
        if condition then Nothing else Panic.throw "Assertion failed"

    assert ((table.at "X" . value_type) == Value_Type.Integer)
    assert ((table.at "Y" . value_type) == Value_Type.Char)
    assert ((table.at "Z" . value_type) == Value_Type.Mixed)
    table
# Holder for the benchmark table; the `table` field is suspended (`~`).
type Data
    Value ~table

    # The table expression is passed directly as the suspended constructor
    # argument rather than being bound to a local first.
    create num_rows =
        Data.Value (create_table num_rows)
# Creates one key per table row using `make_key`, then compares every key with
# its predecessor using `compare_keys`.
# The returned counter goes up by one for each comparison that holds and down
# by one otherwise; it only serves as a result that depends on every comparison.
compare_ordered_keys make_key table compare_keys =
    row_count = table.row_count
    all_keys = (0.up_to row_count).map row-> make_key row
    (1.up_to row_count).fold 0 balance-> ix->
        current = all_keys.at ix
        previous = all_keys.at (ix - 1)
        holds = compare_keys current previous
        if holds then balance + 1 else balance - 1
# Creates one key per table row using `make_key` and folds the hash of every
# key (obtained through `get_hash`) into a running checksum modulo 1997.
# The checksum only serves as a result that depends on every computed hash.
compute_hashcodes make_key table get_hash =
    row_count = table.row_count
    all_keys = (0.up_to row_count).map row-> make_key row
    all_keys.fold 0 checksum-> key->
        key_hash = get_hash key
        (checksum + key_hash) % 1997
# Registers the multi-value key benchmarks. Each group runs the Enso key
# implementation and its Java counterpart on a primitive column pair ("X", "Y")
# and on a pair that includes the custom-object column ("X", "Z").
collect_benches = Bench.build builder->
    num_rows = 100000
    data = Data.create num_rows

    # NOTE(review): the group names below have no separator before the row
    # count (e.g. "Ordered_Multi_Value_Key100000") - confirm this is intended.
    ordered_group_name = "Ordered_Multi_Value_Key" + num_rows.to_text
    unordered_group_name = "Unordered_Multi_Value_Key" + num_rows.to_text

    builder.group ordered_group_name options group_builder->
        # Ordered keys built through the Enso wrapper, compared with `<`.
        run_enso table =
            key_columns = table.columns
            directions = Vector.fill key_columns.length False
            make_key row_ix =
                Ordered_Multi_Value_Key.from_row key_columns directions row_ix
            compare_keys key1 key2 =
                key1 < key2
            compare_ordered_keys make_key table compare_keys

        # Ordered keys built directly from the Java class, compared with `compareTo`.
        run_java table =
            key_storages = table.columns.map column-> column.java_column.getStorage
            directions = Vector.fill key_storages.length 1
            make_key row_ix =
                OrderedMultiValueKey.new key_storages row_ix directions
            compare_keys key1 key2 =
                (key1.compareTo key2) < 0
            compare_ordered_keys make_key table compare_keys

        group_builder.specify "Primitive_Enso" <|
            run_enso (data.table.select_columns ["X", "Y"])
        group_builder.specify "Primitive_Java" <|
            run_java (data.table.select_columns ["X", "Y"])
        group_builder.specify "Custom_Object_Enso" <|
            run_enso (data.table.select_columns ["X", "Z"])
        group_builder.specify "Custom_Object_Java" <|
            run_java (data.table.select_columns ["X", "Z"])

    builder.group unordered_group_name options group_builder->
        # Unordered keys built through the Enso wrapper, hashed with `hash_code`.
        run_enso table =
            key_columns = table.columns
            make_key row_ix =
                Unordered_Multi_Value_Key.from_row key_columns row_ix
            get_hash key =
                key.hash_code
            compute_hashcodes make_key table get_hash

        # Unordered keys built directly from the Java class, hashed with `hashCode`.
        run_java table =
            key_storages = table.columns.map column-> column.java_column.getStorage
            text_folding_strategies = Vector.fill key_storages.length TextFoldingStrategy.unicodeNormalizedFold
            make_key row_ix =
                UnorderedMultiValueKey.new key_storages row_ix text_folding_strategies
            get_hash key =
                key.hashCode
            compute_hashcodes make_key table get_hash

        group_builder.specify "Primitive_Enso" <|
            run_enso (data.table.select_columns ["X", "Y"])
        group_builder.specify "Primitive_Java" <|
            run_java (data.table.select_columns ["X", "Y"])
        group_builder.specify "Custom_Object_Enso" <|
            run_enso (data.table.select_columns ["X", "Z"])
        group_builder.specify "Custom_Object_Java" <|
            run_java (data.table.select_columns ["X", "Z"])

# Entry point used when this file is run directly.
main = collect_benches.run_main