Fix the CSV file generation of bench_download script (#9421)

One can now once more create CSV files from benchmark results with something like:

```
./bench_download.py -v -s stdlib --since 2024-01-01 --create-csv
```

The generated CSV is ready to be read by the Enso IDE.

# Important Notes

- Fix `--create-csv` functionality of the `bench_download.py` script.
- Remove an outdated Enso project from `tools/performance/engine-benchmarks/Engine_Benchs`
- This is now done by book clubs.
2024-12-23 00:52:09 +03:00 · 2024-03-14 12:59:58 +01:00 · 2024-03-14 12:59:58 +01:00 · 0801fcb4a0
commit 0801fcb4a0
parent 81c73a9866
4 changed files with 62 additions and 86 deletions
--- a/tools/performance/engine-benchmarks/Engine_Benchs/package.yaml
+++ b/tools/performance/engine-benchmarks/Engine_Benchs/package.yaml
@ -1,7 +0,0 @@
-name: Engine_Benchs
-namespace: local
-version: 0.0.1
-license: ""
-authors: []
-maintainers: []
-prefer-local-libraries: true
--- a/tools/performance/engine-benchmarks/Engine_Benchs/src/Main.enso
+++ b/tools/performance/engine-benchmarks/Engine_Benchs/src/Main.enso
--- a/tools/performance/engine-benchmarks/README.md
+++ b/tools/performance/engine-benchmarks/README.md
@ -24,6 +24,5 @@ Check `bench_download -h` for documentation and usage. Ensure that your
 creates `generated_site` directory with HTML files for visualizing the benchmark
 results.

-One can also analyze the benchmarks in Enso IDE by running
-`bench_download.py --create-csv` and then running `Engine_Benchs` project. The
-created CSV is pasted into `Engine_Benchs/data` directory by default.
+One can also download only a CSV file representing all the selected benchmark
+results with `bench_download.py --create-csv`.
--- a/tools/performance/engine-benchmarks/bench_download.py
+++ b/tools/performance/engine-benchmarks/bench_download.py
@ -46,6 +46,7 @@ Dependencies for the script:
 """

 import sys
+from dataclasses import dataclass

 from bench_tool.bench_results import get_bench_runs, fetch_job_reports
 from bench_tool.remote_cache import ReadonlyRemoteCache
@ -67,7 +68,8 @@ from datetime import datetime, timedelta
 from os import path
 from typing import List, Dict, Optional, Set

-from bench_tool import DATE_FORMAT, GENERATED_SITE_DIR, GH_ARTIFACT_RETENTION_PERIOD, TEMPLATES_DIR, \
+from bench_tool import DATE_FORMAT, GENERATED_SITE_DIR, \
+    GH_ARTIFACT_RETENTION_PERIOD, TEMPLATES_DIR, \
    JINJA_TEMPLATE, JobRun, JobReport, \
    TemplateBenchData, JinjaData, Source
 from bench_tool.gh import ensure_gh_installed
@ -78,49 +80,62 @@ try:
    import numpy as np
    import jinja2
 except ModuleNotFoundError as err:
-    print("ERROR: One of pandas, numpy, or jinja2 packages not installed", file=sys.stderr)
+    print("ERROR: One of pandas, numpy, or jinja2 packages not installed",
+          file=sys.stderr)
    print("Install either with `pip install pandas numpy jinja2` or "
-          "with `apt-get install python3-pandas python3-numpy python3-jinja2`", file=sys.stderr)
+          "with `apt-get install python3-pandas python3-numpy python3-jinja2`",
+          file=sys.stderr)
    exit(1)

-CSV_FIELDNAMES = [
-    "label",
-    "score",
-    "commit_id",
-    "commit_author",
-    "commit_timestamp",
-    "bench_run_url",
-    "bench_run_event"
-]
+
+@dataclass
+class CsvRow:
+    label: str
+    score: str
+    commit_id: str
+    commit_title: str
+    commit_timestamp: str
+    commit_author: str
+    bench_run_id: str
+    bench_run_url: str
+    bench_run_event: str


-def write_bench_reports_to_csv(bench_reports: List[JobReport], csv_fname: str) -> None:
+def write_bench_reports_to_csv(bench_reports: List[JobReport],
+    csv_fname: str) -> None:
    logging.info(
        f"Writing {len(bench_reports)} benchmark reports to {csv_fname}")
+    csv_fieldnames = CsvRow.__annotations__.keys()
    assert len(bench_reports) > 0
-    if not path.exists(path.dirname(csv_fname)):
-        logging.debug(f"Creating directory {path.dirname(csv_fname)}")
-        os.mkdir(path.dirname(csv_fname))
    with open(csv_fname, "w") as csv_file:
-        csv_writer = DictWriter(csv_file, CSV_FIELDNAMES)
+        csv_writer = DictWriter(csv_file, csv_fieldnames)
        csv_writer.writeheader()
        for bench_report in bench_reports:
            for label, score in bench_report.label_score_dict.items():
-                csv_writer.writerow({
-                    "label": label,
-                    "score": score,
-                    "commit_id": bench_report.bench_run.head_commit.id,
-                    "commit_author": bench_report.bench_run.head_commit.author.name,
-                    "commit_timestamp": bench_report.bench_run.head_commit.timestamp,
-                    "bench_run_url": bench_report.bench_run.html_url,
-                    "bench_run_event": bench_report.bench_run.event
-                })
+                commit_title = \
+                  bench_report.bench_run.head_commit.message.splitlines()[0]
+                commit_title = commit_title.replace(",", " ")
+                # Ensure that score is not printed with exponential notation,
+                # Enso cannot easily parse that by default now.
+                score_formatted = f"{score:.9f}"
+                row = CsvRow(
+                    label=label,
+                    score=score_formatted,
+                    commit_id=bench_report.bench_run.head_commit.id,
+                    commit_title=commit_title,
+                    commit_author=bench_report.bench_run.head_commit.author.name,
+                    commit_timestamp=bench_report.bench_run.head_commit.timestamp,
+                    bench_run_id=bench_report.bench_run.id,
+                    bench_run_url=bench_report.bench_run.html_url,
+                    bench_run_event=bench_report.bench_run.event
+                )
+                csv_writer.writerow(row.__dict__)


 async def main():
    default_since: datetime = (datetime.now() - timedelta(days=14))
    default_until: datetime = datetime.now()
-    default_csv_out = "Engine_Benchs/data/benchs.csv"
+    default_csv_out = "benchs.csv"
    date_format_help = DATE_FORMAT.replace("%", "%%")

    def _parse_bench_source(_bench_source: str) -> Source:
@ -128,7 +143,8 @@ async def main():
            return Source(_bench_source)
        except ValueError:
            print(f"Invalid benchmark source {_bench_source}.", file=sys.stderr)
-            print(f"Available sources: {[source.value for source in Source]}", file=sys.stderr)
+            print(f"Available sources: {[source.value for source in Source]}",
+                  file=sys.stderr)
            exit(1)

    arg_parser = ArgumentParser(description=__doc__,
@ -196,10 +212,10 @@ async def main():
    branches: List[str] = args.branches
    labels_override: Set[str] = args.labels
    logging.debug(f"parsed args: since={since}, until={until}, "
-                 f"temp_dir={temp_dir}, bench_source={bench_source}, "
-                 f"csv_output={csv_output}, "
-                 f"create_csv={create_csv}, branches={branches}, "
-                 f"labels_override={labels_override}")
+                  f"temp_dir={temp_dir}, bench_source={bench_source}, "
+                  f"csv_output={csv_output}, "
+                  f"create_csv={create_csv}, branches={branches}, "
+                  f"labels_override={labels_override}")

    ensure_gh_installed()

@ -208,11 +224,11 @@ async def main():
    min_since_without_cache = datetime.today() - GH_ARTIFACT_RETENTION_PERIOD
    if since < min_since_without_cache:
        logging.info(f"The default GH artifact retention period is "
-                        f"{GH_ARTIFACT_RETENTION_PERIOD.days} days. "
-                        f"This means that all the artifacts older than "
-                        f"{min_since_without_cache.date()} are expired."
-                        f"The since date was set to {since}, so the remote cache is enabled, "
-                        f"and the older artifacts will be fetched from the cache.")
+                     f"{GH_ARTIFACT_RETENTION_PERIOD.days} days. "
+                     f"This means that all the artifacts older than "
+                     f"{min_since_without_cache.date()} are expired."
+                     f"The since date was set to {since}, so the remote cache is enabled, "
+                     f"and the older artifacts will be fetched from the cache.")

    remote_cache = ReadonlyRemoteCache()

@ -254,8 +270,9 @@ async def main():
            if len(labels_override) > 0:
                logging.info(f"Subset of labels specified: {labels_override}")
                if not set(labels_override).issubset(all_bench_labels):
-                    print(f"Specified bench labels {labels_override} are not a subset of "
-                          f"all bench labels {all_bench_labels}")
+                    print(
+                        f"Specified bench labels {labels_override} are not a subset of "
+                        f"all bench labels {all_bench_labels}")
                    exit(1)
                bench_labels = labels_override
            else:
@ -282,13 +299,15 @@ async def main():
    if not path.exists(GENERATED_SITE_DIR):
        os.mkdir(GENERATED_SITE_DIR)

-    logging.debug(f"Rendering HTML from {JINJA_TEMPLATE} to {GENERATED_SITE_DIR}")
+    logging.debug(
+        f"Rendering HTML from {JINJA_TEMPLATE} to {GENERATED_SITE_DIR}")
    site_path = GENERATED_SITE_DIR.joinpath(bench_source.value + "-benchs.html")
    render_html(
        jinja_data,
        site_path
    )
-    logging.debug(f"Copying static site content from {TEMPLATES_DIR} to {GENERATED_SITE_DIR}")
+    logging.debug(
+        f"Copying static site content from {TEMPLATES_DIR} to {GENERATED_SITE_DIR}")
    shutil.copy(
        path.join(TEMPLATES_DIR, "styles.css"),
        path.join(GENERATED_SITE_DIR, "styles.css")