# daml/infra/es_cluster.tf

# Copyright (c) 2022 Digital Asset (Switzerland) GmbH and/or its affiliates. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Dedicated VPC network for everything in this file: the firewall rules below
# and both instance templates (ES nodes and the es-feed machine) attach to it.
resource "google_compute_network" "es" {
  name = "es-network"
}

/*
Instruct ES to move all data out of blue nodes:
PUT _cluster/settings
{
  "transient" : {
    "cluster.routing.allocation.exclude._name" : "es-blue-*"
  }
}
To reset, send the same request with the value set to null.
*/

locals {
  # Used as the `count` of the es-ssh firewall rule: 0 leaves SSH closed,
  # 1 creates the rule.
  es_ssh  = 0
  # Target size of the es-feed managed instance group.
  es_feed = 1
  # One instance template and one managed instance group are created per entry,
  # addressed by list position (count.index), so do not reorder entries.
  #   suffix         - appended to "es" to form template/group/instance names
  #   ubuntu_version - selects the ubuntu-<version>-lts boot image
  #   size           - target size of the instance group
  #   init           - value written to cluster.initial_master_nodes in es.yml
  #   type           - machine type
  #   xmx            - JVM heap size, passed as both -Xmx and -Xms
  #   disk_size      - boot disk size in GB
  es_clusters = [
    {
      suffix         = "-blue",
      ubuntu_version = "2004",
      size           = 5,
      init           = "[]",
      type           = "n2-highmem-2",
      xmx            = "12g",
      disk_size      = 800,
    },
    {
      suffix         = "-green",
      ubuntu_version = "2004",
      size           = 0,
      init           = "[]",
      type           = "n2-highmem-2",
      xmx            = "12g",
      disk_size      = 800,
    },
    {
      suffix         = "-init",
      ubuntu_version = "2004",
      size           = 0,
      init           = "[\"$(hostname)\"]",
      type           = "e2-standard-2",
      xmx            = "6g",
      disk_size      = 200,
    },
  ]

  # Services exposed through the load balancer. Each entry gets its own
  # address, health check, backend service, URL map, proxy and forwarding rule.
  # Order matters: the es-feed startup script reads google_compute_address.es[0]
  # as the ES endpoint and google_compute_address.es[1] as the Kibana endpoint.
  es_ports = [
    { name = "es", port = "9200" },
    { name = "kibana", port = "5601" },
    { name = "cerebro", port = "9000" },
  ]
}

# Optional SSH access (TCP 22) from the VPN addresses listed below. The rule
# only exists when local.es_ssh is set to 1; it sets no target_tags and logs
# with full metadata.
resource "google_compute_firewall" "es-ssh" {
  ## Disabled by default
  count   = local.es_ssh
  name    = "es-ssh"
  network = google_compute_network.es.name
  log_config {
    metadata = "INCLUDE_ALL_METADATA"
  }
  allow {
    protocol = "tcp"
    ports    = ["22"]
  }
  # Keep this list in sync with the allow rule of google_compute_security_policy.es.
  source_ranges = [      # VPNs
    "35.194.81.56/32",   # North Virginia
    "35.189.40.124/32",  # Sydney
    "35.198.147.95/32",  # Frankfurt
    "18.210.210.130/32", # consultant
  ]
}

# Lets the Google load balancer ranges reach every port listed in
# local.es_ports on instances tagged "es".
resource "google_compute_firewall" "es-http" {
  name        = "es-http"
  network     = google_compute_network.es.name
  target_tags = ["es"]

  source_ranges = [
    ## Google Load Balancer
    "130.211.0.0/22",
    "35.191.0.0/16",
  ]

  allow {
    protocol = "tcp"
    # Derived from local.es_ports so a new service only has to be added there.
    ports    = [for p in local.es_ports : p.port]
  }
}

# Allows TCP 9300 (published by the ES container in the es instance template)
# from internal addresses to instances tagged "es".
resource "google_compute_firewall" "es-internal" {
  name        = "es-internal"
  network     = google_compute_network.es.name
  target_tags = ["es"]

  source_ranges = [
    ## Internal
    "10.128.0.0/9",
  ]

  allow {
    protocol = "tcp"
    ports    = ["9300"]
  }
}

# Service account the ES nodes run as (attached in the es instance template).
resource "google_service_account" "es-discovery" {
  # account_id allows - but not _
  account_id   = "es-discovery"
  display_name = "es-discovery"
}

# Custom role for the ES nodes: write log entries and read/list compute
# instances (needed by the GCE discovery plugin configured in es.yml).
resource "google_project_iam_custom_role" "es-discovery" {
  # role_id allows _ but not -
  role_id     = "es_discovery"
  title       = "es-discovery"
  description = "es-discovery"
  permissions = [
    # Cloud logging
    "logging.logEntries.create",
    # ES discovery
    "compute.instances.get",
    "compute.instances.list",
  ]
}

# Grants the es-discovery custom role to the es-discovery service account at
# project level.
resource "google_project_iam_member" "es-discovery" {
  project = local.project
  role    = google_project_iam_custom_role.es-discovery.id
  member  = "serviceAccount:${google_service_account.es-discovery.email}"
}

# Instance template for the ES nodes, one per entry of local.es_clusters.
#
# The startup script installs the logging agent and Docker, builds an ES image
# with the discovery-gce plugin and the generated es.yml, then runs the ES,
# Kibana and cerebro containers and streams their logs to the script's stdout.
#
# NOTE(review): the startup script is part of the template, so changing even a
# comment inside the heredoc produces a new template. The instance group below
# uses a PROACTIVE / REPLACE update policy with max_unavailable_percent = 100,
# and ES data lives on the boot disk under /root/es-data, so such a change
# appears to replace every node of every non-empty cluster at once. Drain the
# affected cluster first (see the note at the top of this file) - confirm
# before editing the script.
resource "google_compute_instance_template" "es" {
  count        = length(local.es_clusters)
  name_prefix  = "es${local.es_clusters[count.index].suffix}-"
  machine_type = local.es_clusters[count.index].type
  tags         = ["es"]
  labels       = local.machine-labels

  disk {
    boot         = true
    disk_size_gb = local.es_clusters[count.index].disk_size
    source_image = "ubuntu-os-cloud/ubuntu-${local.es_clusters[count.index].ubuntu_version}-lts"
  }

  # Terraform interpolates the per-cluster init list, project, zone and heap
  # size into the script; "$(...)" sequences are left for bash to evaluate on
  # the instance.
  metadata_startup_script = <<STARTUP
#! /bin/bash
set -euo pipefail

export DEBIAN_FRONTEND=noninteractive

apt-get update
apt-get -y upgrade
### stackdriver
curl -sSL https://dl.google.com/cloudagents/install-logging-agent.sh | bash

## Install Docker

mkdir -p /etc/docker
cat <<CONFIG > /etc/docker/daemon.json
{
  "log-driver": "json-file",
  "log-opts": {"max-size": "10m", "max-file": "3"}
}
CONFIG

apt-get install -y \
  apt-transport-https \
  ca-certificates \
  gnupg \
  lsb-release
curl -fsSL https://download.docker.com/linux/ubuntu/gpg \
  | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg
echo \
  "deb [arch=amd64 signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu \
  $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
apt-get update
apt-get install -y docker-ce docker-ce-cli containerd.io

## Set up ES

sysctl -w vm.max_map_count=262144

mkdir -p /root/es-docker
cd /root/es-docker

cat <<EOF > es.yml
cluster.name: es
node.name: $(hostname)
cluster.initial_master_nodes: ${local.es_clusters[count.index].init}
discovery.seed_providers: gce
discovery.gce.tags: es
cloud.gce.project_id: ${local.project}
cloud.gce.zone: ${local.zone}
network.host: 0.0.0.0
network.publish_host: _gce_
http.max_content_length: 500mb
EOF

cat <<EOF > Dockerfile
FROM docker.elastic.co/elasticsearch/elasticsearch:7.13.2

RUN bin/elasticsearch-plugin install --batch discovery-gce
COPY es.yml /usr/share/elasticsearch/config/elasticsearch.yml
EOF

mkdir -p /root/es-data
chown 1000:0 /root/es-data

docker build -t es .
docker run -d \
           --restart on-failure \
           --name es \
           -p 9200:9200 \
           -p 9300:9300 \
           -e ES_JAVA_OPTS="-Xmx${local.es_clusters[count.index].xmx} -Xms${local.es_clusters[count.index].xmx} -Dlog4j2.formatMsgNoLookups=true" \
           -v /root/es-data:/usr/share/elasticsearch/data \
           es

docker run -d \
           --restart on-failure \
           --name kibana \
           -p 5601:5601 \
           --link es:elasticsearch \
           -e TELEMETRY_ENABLED=false \
           docker.elastic.co/kibana/kibana:7.13.2

docker run -d \
           -p 9000:9000 \
           --link es \
           --name cerebro \
           lmenezes/cerebro:0.9.4

## Getting container output directly to the GCP console

( exec 1> >(while IFS= read -r line; do echo "elastic: $line"; done); docker logs -f es ) &
( exec 1> >(while IFS= read -r line; do echo "kibana: $line"; done); docker logs -f kibana ) &
( exec 1> >(while IFS= read -r line; do echo "cerebro: $line"; done); docker logs -f cerebro ) &

for job in $(jobs -p); do
    wait $job
done

STARTUP


  network_interface {
    network = google_compute_network.es.name
    # NOTE(review): an empty access_config presumably requests an ephemeral
    # external address (see the provider documentation) - confirm.
    access_config {}
  }

  service_account {
    email = google_service_account.es-discovery.email
    scopes = [
      # Required for cloud logging
      "logging-write",
      # Required per ES documentation
      "compute-rw",
    ]
  }

  # The instance group manager references this template, so the replacement
  # must exist before the old template is destroyed.
  lifecycle {
    create_before_destroy = true
  }
}

# One zonal managed instance group per entry of local.es_clusters, running the
# template with the same list index. The named ports mirror local.es_ports so
# that the backend services can address them by name.
resource "google_compute_instance_group_manager" "es" {
  provider = google-beta
  count    = length(local.es_clusters)

  name               = "es${local.es_clusters[count.index].suffix}"
  base_instance_name = "es${local.es_clusters[count.index].suffix}"
  zone               = local.zone
  target_size        = local.es_clusters[count.index].size

  dynamic "named_port" {
    for_each = local.es_ports
    content {
      name = named_port.value.name
      port = named_port.value.port
    }
  }

  version {
    name              = "es${local.es_clusters[count.index].suffix}"
    instance_template = google_compute_instance_template.es[count.index].self_link
  }

  # A template change is rolled out immediately by replacing instances, with no
  # lower bound on how many may be unavailable at the same time.
  update_policy {
    type                    = "PROACTIVE"
    minimal_action          = "REPLACE"
    max_unavailable_percent = 100
  }
}

# One STANDARD-tier address per entry of local.es_ports. Each is used by the
# forwarding rule with the same index and exported through the es_addresses
# output; the es-feed startup script reads indices 0 and 1.
resource "google_compute_address" "es" {
  count        = length(local.es_ports)
  name         = "es-${local.es_ports[count.index].name}"
  network_tier = "STANDARD"
}

# TCP health check for each exposed port: probes every 10 seconds with a
# 1 second timeout.
resource "google_compute_health_check" "es-http" {
  count              = length(local.es_ports)
  name               = "es-http-${local.es_ports[count.index].name}"
  check_interval_sec = 10
  timeout_sec        = 1

  tcp_health_check {
    port = local.es_ports[count.index].port
  }
}

# One external backend service per entry of local.es_ports. Every cluster's
# instance group is registered as a backend, so traffic reaches whichever
# groups currently have instances. Access is restricted by the "es" security
# policy.
resource "google_compute_backend_service" "es-http" {
  count = length(local.es_ports)

  name                  = "es-http-${local.es_ports[count.index].name}"
  port_name             = local.es_ports[count.index].name
  load_balancing_scheme = "EXTERNAL"
  security_policy       = google_compute_security_policy.es.self_link
  health_checks         = [google_compute_health_check.es-http[count.index].self_link]

  dynamic "backend" {
    # backend.key is the list index, which matches the index of the instance
    # group manager created for that cluster.
    for_each = local.es_clusters
    content {
      group           = google_compute_instance_group_manager.es[backend.key].instance_group
      balancing_mode  = "UTILIZATION"
      capacity_scaler = 1
    }
  }
}

# URL map per exposed port; it only sets a default service, so every request
# goes to the backend service with the same index.
resource "google_compute_url_map" "es-http" {
  count           = length(local.es_ports)
  name            = "es-http-${local.es_ports[count.index].name}"
  default_service = google_compute_backend_service.es-http[count.index].self_link
}

# HTTP proxy per exposed port, in front of the URL map with the same index.
resource "google_compute_target_http_proxy" "es-http" {
  count   = length(local.es_ports)
  name    = "es-http-${local.es_ports[count.index].name}"
  url_map = google_compute_url_map.es-http[count.index].self_link
}

# Publishes each proxy on port 80 of the address with the same index. This is
# why clients (e.g. the es-feed scripts) use the bare address without a port.
resource "google_compute_forwarding_rule" "es-http" {
  count        = length(local.es_ports)
  name         = "es-http-${local.es_ports[count.index].name}"
  target       = google_compute_target_http_proxy.es-http[count.index].self_link
  ip_address   = google_compute_address.es[count.index].address
  port_range   = "80"
  network_tier = "STANDARD"
}

## The load balancer proxy created by the forwarding rules lives outside our
## network, so the network firewall cannot filter its clients. This policy,
## attached to the backend services, restricts access to the VPNs and to the
## es-feed machine instead.
resource "google_compute_security_policy" "es" {
  name = "es"

  # Allow-list, evaluated before the default rule because of its lower
  # priority number.
  rule {
    description = "Allow VPNs"
    action      = "allow"
    priority    = 1000
    match {
      versioned_expr = "SRC_IPS_V1"
      config {
        src_ip_ranges = [      # VPNs
          "35.194.81.56/32",   # North Virginia
          "35.189.40.124/32",  # Sydney
          "35.198.147.95/32",  # Frankfurt
          "18.210.210.130/32", # consultant
          # Static address of the es-feed machine
          "${google_compute_address.es-feed.address}/32"
        ]
      }
    }
  }

  # Catch-all: everything not matched above is rejected with HTTP 403.
  rule {
    description = "Default: deny all"
    action      = "deny(403)"
    priority    = 2147483647
    match {
      versioned_expr = "SRC_IPS_V1"
      config {
        src_ip_ranges = ["*"]
      }
    }
  }
}

# Maps each service name from local.es_ports ("es", "kibana", "cerebro") to
# the public address of its load balancer.
output "es_addresses" {
  value = zipmap(local.es_ports[*].name, google_compute_address.es[*].address)
}

# Static external address of the es-feed machine. It is used as the NAT
# address in the es-feed instance template and is allow-listed in the "es"
# security policy so the machine can reach ES through the load balancer.
resource "google_compute_address" "es-feed" {
  name = "es-feed"
}

# Service account the es-feed machine runs as (attached in its instance
# template).
resource "google_service_account" "es-feed" {
  account_id   = "es-feed"
  display_name = "es-feed"
}

# Custom role for es-feed: write log entries and read/list GCS objects (the
# ingestion script rsyncs its input from a bucket).
resource "google_project_iam_custom_role" "es-feed" {
  role_id     = "es_feed"
  title       = "es-feed"
  description = "es-feed"
  permissions = [
    # Cloud logging
    "logging.logEntries.create",
    # Access GCS bucket
    "storage.objects.get",
    "storage.objects.list",
  ]
}

# Grants the es-feed custom role to the es-feed service account at project
# level, without conditions.
resource "google_project_iam_member" "es-feed" {
  project = local.project
  role    = google_project_iam_custom_role.es-feed.id
  member  = "serviceAccount:${google_service_account.es-feed.email}"
}

# Separate role holding only the object-creation permission, so that it can be
# granted under a condition (see the es-feed-write member below).
resource "google_project_iam_custom_role" "es-feed-write" {
  role_id     = "es_feed_write"
  title       = "es-feed-write"
  description = "es-feed-write"
  permissions = [
    "storage.objects.create"
  ]
}

# Grants object creation to the es-feed service account, restricted by an IAM
# condition to object names starting with "kibana-export" in the data bucket
# (where the hourly Kibana export is uploaded).
resource "google_project_iam_member" "es-feed-write" {
  project = local.project
  role    = google_project_iam_custom_role.es-feed-write.id
  member  = "serviceAccount:${google_service_account.es-feed.email}"

  condition {
    title       = "es_feed_write"
    description = "es_feed_write"
    expression  = "resource.name.startsWith(\"projects/_/buckets/${google_storage_bucket.data.name}/objects/kibana-export\")"
  }
}

# Managed instance group running the es-feed machine; its size is controlled
# by local.es_feed.
resource "google_compute_instance_group_manager" "es-feed" {
  provider           = google-beta
  name               = "es-feed"
  base_instance_name = "es-feed"
  zone               = local.zone
  target_size        = local.es_feed

  version {
    name              = "es-feed"
    instance_template = google_compute_instance_template.es-feed.self_link
  }

  # Template changes are rolled out immediately by replacing the instance.
  update_policy {
    type                    = "PROACTIVE"
    minimal_action          = "REPLACE"
    max_unavailable_percent = 100
  }
}

# Instance template for the es-feed machine.
#
# The startup script writes two scripts and registers them with cron:
#   /root/cron.sh   - every minute: rsync the Bazel metrics archives from GCS
#                     and push the ones not yet ingested into ES.
#   /root/hourly.sh - at minute 1 of every hour: export the Kibana saved
#                     objects and upload the archive to GCS.
# Before registering them it rebuilds the local "done" markers from the state
# of the cluster, then tails /root/log.
#
# NOTE(review): this machine carries the "es" tag too, so the es-http and
# es-internal firewall rules and the "discovery.gce.tags: es" setting of the
# ES nodes all match it as well - confirm this is intended.
resource "google_compute_instance_template" "es-feed" {
  name_prefix  = "es-feed"
  machine_type = "e2-standard-2"
  tags         = ["es"]
  labels       = local.machine-labels

  disk {
    boot         = true
    source_image = "ubuntu-os-cloud/ubuntu-2004-lts"
    disk_size_gb = "200"
  }

  # Unlike the generated cron.sh and hourly.sh, the outer startup script does
  # not enable "set -e", so a failing command in it does not stop the script.
  metadata_startup_script = <<STARTUP
apt-get update
export DEBIAN_FRONTEND=noninteractive
apt-get upgrade -y
apt-get install -y jq

### stackdriver
curl -sSL https://dl.google.com/cloudagents/install-logging-agent.sh | bash

cat <<'CRON' >/root/cron.sh
#!/usr/bin/env bash
set -euo pipefail

# Print, as one compact JSON document on stdout, the request body used to
# create an events index: explicit field mappings for the job, command,
# buildEvent and traceEvent parts of each document, followed by the index
# settings. Takes no arguments and reads no input (jq runs with -n).
#
# traceEvent.args used to be declared twice (once as a bare object, once with
# properties). jq keeps only the last value of a duplicated key, so only the
# declaration with properties ever took effect; the dead one has been removed.
emit_mappings() {
  jq -nc '{
      mappings: {
        properties: {
          job: {
            properties: {
              timestamp: { type: "date" },
              id: { type: "keyword" },
              agent_id: { type: "keyword" },
              agent_job_name: { type: "keyword" },
              agent_machine_name: { type: "keyword" },
              agent_name: { type: "keyword" },
              agent_os: { type: "keyword" },
              agent_os_architecture: { type: "keyword" },
              build_build_id: { type: "keyword" },
              build_build_number: { type: "keyword" },
              build_definition_name: { type: "keyword" },
              build_source_branch: { type: "keyword" },
              build_source_branch_name: { type: "keyword" },
              build_source_version: { type: "keyword" },
              system_job_attempt: { type: "keyword" },
              system_job_display_name: { type: "keyword" },
              system_job_id: { type: "keyword" },
              system_job_name: { type: "keyword" },
              system_pullRequest_pullRequestId: { type: "keyword" },
              system_pullRequest_pullRequestNumber: { type: "keyword" },
              system_pullRequest_mergedAt: { type: "keyword" },
              system_pullRequest_sourceBranch: { type: "keyword" },
              system_pullRequest_targetBranch: { type: "keyword" },
              system_pullRequest_sourceRepositoryUri: { type: "keyword" },
              system_pullRequest_sourceCommitId: { type: "keyword" },
              git_branch_sha: { type: "keyword" },
              git_main_sha: { type: "keyword" },
              git_fork_point: { type: "keyword" },
              git_current_branch: { type: "keyword" },
              git_current_commit: { type: "keyword" },
              git_current_tree: { type: "keyword" },
            }
          },
          command: {
            properties: {
              name: { type: "keyword" }
            }
          },
          buildEvent: {
            properties: {
              id: { type: "object" },
              children: { type: "nested" },
              lastMessage: { type: "boolean" },
              progress: {
                properties: {
                  stdout: { type: "text" },
                  stderr: { type: "text" },
                }
              },
              aborted: {
                properties: {
                  reason: { type: "keyword" },
                  description: { type: "text" },
                }
              },
              started: {
                properties: {
                  uuid: { type: "keyword" },
                  startTimeMillis: {
                    type: "date",
                    format: "epoch_millis"
                  },
                  buildToolVersion: { type: "keyword" },
                  optionsDescription: { type: "text" },
                  command: { type: "keyword" },
                  workingDirectory: { type: "keyword" },
                  workspaceDirectory: { type: "keyword" },
                  serverPid: { type: "keyword" },
                }
              },
              unstructuredCommandLine: {
                properties: {
                  args: { type: "keyword" },
                }
              },
              structuredCommandLine: {
                properties: {
                  sections: { type: "nested" },
                },
              },
              optionsParsed: { type: "object" },
              workspaceStatus: {
                properties: {
                  item: {
                    type: "nested",
                    properties: {
                      key: { type: "keyword" },
                      value: { type: "text" },
                    },
                  },
                },
              },
              fetch: { type: "object" },
              configuration: { type: "object" },
              expanded: { type: "object" },
              configured: { type: "object" },
              action: {
                properties: {
                  actionMetadataLogs: { type: "nested" },
                },
              },
              namedSetOfFiles: { type: "object" },
              completed: {
                properties: {
                  success: { type: "boolean" },
                  outputGroup: { type: "nested" },
                  importantOutput: { type: "nested" },
                  directoryOutput: { type: "nested" },
                  testTimeoutSeconds: { type: "long" },
                },
              },
              testResult: {
                properties: {
                  cachedLocally: { type: "boolean" },
                  testAttemptStartMillisEpoch: {
                    type: "date",
                    format: "epoch_millis",
                  },
                  testAttemptDurationMillis: { type: "long" },
                  testActionOutput: { type: "nested" },
                  executionInfo: {
                    properties: {
                      timeoutSeconds: { type: "integer" },
                      cachedRemotely: { type: "boolean" },
                      exitCode: { type: "integer" },
                      timingBreakdown: { type: "nested" },
                      resourceUsage: { type: "nested" },
                    },
                  },
                },
              },
              testSummary: {
                properties: {
                  overallStatus: { type: "keyword" },
                  totalRunCount: { type: "integer" },
                  runCount: { type: "integer" },
                  shardCount: { type: "integer" },
                  passed: { type: "nested" },
                  failed: { type: "nested" },
                  totalNumCached: { type: "integer" },
                  firstStartTimeMillis: {
                    type: "date",
                    format: "epoch_millis",
                  },
                  lastStopTimeMillis: {
                    type: "date",
                    format: "epoch_millis",
                  },
                  totalRunDurationMillis: { type: "long" },
                },
              },
              finished: {
                properties: {
                  overallSuccess: { type: "boolean" },
                  exitCode: {
                    properties: {
                      name: { type: "keyword" },
                      code: { type: "integer" },
                    },
                  },
                  finishTimeMillis: {
                    type: "date",
                    format: "epoch_millis",
                  },
                  anomalyReport: {
                    properties: {
                      wasSuspended: { type: "boolean" },
                    },
                  },
                },
              },
              buildToolLogs: {
                properties: {
                  log: { type: "nested" },
                },
              },
              buildMetrics: {
                properties: {
                  actionSummary: {
                    properties: {
                      actionsCreated: { type: "long" },
                      actionsExecuted: { type: "long" },
                    },
                  },
                  memoryMetrics: {
                    properties: {
                      usedHeapSizePostBuild: { type: "long" },
                      peakPostGcHeapSize: { type: "long" },
                    },
                  },
                  targetMetrics: {
                    properties: {
                      targetsLoaded: { type: "long" },
                      targetsConfigured: { type: "long" },
                    },
                  },
                  packageMetrics: {
                    properties: {
                      packagesLoaded: { type: "long" },
                    },
                  },
                  timingMetrics: {
                    properties: {
                      cpuTimeInMs: { type: "long" },
                      wallTimeInMs: { type: "long" },
                    },
                  },
                },
              },
              workspaceConfig: { type: "object" },
              buildMetadata: { type: "object" },
              convenienceSymlinksIdentified: {
                properties: {
                  convenienceSymlinks: { type: "nested" },
                },
              },
            }
          },
          traceEvent: {
            properties: {
              cat: { type: "keyword" },
              name: { type: "keyword" },
              ph: { type: "keyword" },
              pid: { type: "integer" },
              tid: { type: "integer" },
              ts: { type: "long" },
              dur: { type: "long" },
              args: {
                properties: {
                  name: { type: "keyword" },
                  target: { type: "keyword" },
                },
              },
            },
          },
        }
      },
      settings: {
        number_of_replicas: 1,
        number_of_shards: 3,
        "mapping.nested_objects.limit": 100000
      }
    }'
}

# Make sure the ES index exists before documents are pushed to it.
#
#   $1 - job name, used only as a prefix for log messages
#   $2 - index name
#
# Uses the DONE directory (marker files) and ES_IP from the environment. An
# existing marker file means the index was already checked and nothing is
# done. Otherwise the index is probed with a HEAD request and, if that fails,
# created with the mappings from emit_mappings; the marker is then written.
# Because the script runs with "set -e" and pipefail, a failed creation aborts
# before the marker is written.
ensure_index() {
  local job index
  job="$1"
  index="$2"
  if [ -f "$DONE/$index" ]; then
    return 0
  fi
  if curl -s --fail -I "http://$ES_IP/$index" >/dev/null; then
    echo "$job: index $index already exists"
  else
    echo "$job: creating index $index"
    emit_mappings | curl -XPUT "http://$ES_IP/$index" \
                         -s \
                         -H 'Content-Type: application/json' \
                         --fail \
                         --data-binary @- >/dev/null
  fi
  touch "$DONE/$index"
}

# Turn a build-events file into ES bulk-API input on stdout.
#
#   $1 - job name (also the directory holding job-md.json)
#   $2 - command name ("build" or "test" at the call sites in push)
#   $3 - path of the events file, read by jq from stdin
#
# For every JSON value in the file two compact lines are printed: the bulk
# "index" action, whose _id is built from the job, the command and jq's
# input_line_number, and the document itself, which wraps the event together
# with the job metadata and the command name.
emit_build_events() {
  local job cmd file program
  job="$1"
  cmd="$2"
  file="$3"
  program='
     { index: { _index: $index, _id: ($job + "-" + $cmd + "-events-" + (input_line_number | tostring)) } },
     { job: $job_md[0],
       command: { name: $cmd },
       buildEvent: .
     }
     '
  jq -c \
     --slurpfile job_md "$job/job-md.json" \
     --arg cmd "$cmd" \
     --arg index "$(index "$job")" \
     --arg job "$job" \
     "$program" \
     < "$file"
}

# Turn a profile file into ES bulk-API input on stdout.
#
#   $1 - job name (also the directory holding job-md.json)
#   $2 - command name ("build" or "test" at the call sites in push)
#   $3 - path of the profile file, read by jq from stdin
#
# The file is expected to hold an object with a traceEvents array. For each
# element two compact lines are printed: the bulk "index" action, whose _id is
# built from the job, the command and the element's position in the array, and
# the document wrapping the element with the job metadata and command name.
emit_trace_events() {
  local job cmd file program
  job="$1"
  cmd="$2"
  file="$3"
  program='
     .traceEvents
     | to_entries[]
     | { index: { _index: $index, _id: ($job + "-" + $cmd + "-profile-" + (.key | tostring)) } },
       { job: $job_md[0],
         command: { name: $cmd },
         traceEvent: .value
       }
     '
  jq -c \
     --slurpfile job_md "$job/job-md.json" \
     --arg cmd "$cmd" \
     --arg index "$(index "$job")" \
     --arg job "$job" \
     "$program" \
     < "$file"
}

bulk_upload() (

  ## Reads ES bulk-API input on stdin and uploads it to http://$ES_IP/_bulk,
  ## subject to these constraints:
  ##
  ## 1. The input has one JSON object per line. We cannot break lines, as
  ##    that would result in incomplete JSON objects.
  ## 2. JSON objects go in pairs: the first line is metadata for how ES should
  ##    ingest the second line. So we can't split in the middle of a pair
  ##    either.
  ## 3. The maximum size for a single upload is 500mb (http.max_content_length
  ##    in the ES node configuration), so large inputs need to be split,
  ##    respecting constraints 1 and 2. We stay well below that and cap each
  ##    chunk at about 50MB.
  ##
  ## Strategy: start with the whole input as a single chunk. While the chunk is
  ## too big, halve the number of lines (keeping it even); after each
  ## successful upload, double it again. For each upload, the count of every
  ## item status returned by ES is printed.
  ##
  ## Because this function is defined with () rather than the usual {}, it runs
  ## in a subshell and can define its own scoped inner functions, as well as
  ## its own traps. Also, all variables are local.

  tmp=$(mktemp)
  chunk=$(mktemp)
  trap 'rm -f "$tmp" "$chunk"' EXIT
  cat - > "$tmp"
  lines_to_process=$(wc -l < "$tmp")
  processed_lines=0
  lines_per_chunk=$lines_to_process

  push_chunk() {
    curl -X POST "http://$ES_IP/_bulk?filter_path=errors,items.*.status" \
         -H 'Content-Type: application/json' \
         --fail \
         -s \
         --data-binary "@$chunk" \
         | jq -r '.items[].index.status' | sort | uniq -c
    # May overshoot on the last chunk; only the >= comparison matters.
    processed_lines=$(( processed_lines + lines_per_chunk ))
    lines_per_chunk=$(( lines_per_chunk * 2 ))
  }

  get_next_chunk() {
    (
    # tail -n +N drops the first N-1 lines
    # tail is expected to fail with 141 (pipe closed) on intermediate
    # iterations
    tail -n +$(( processed_lines + 1 )) "$tmp" || (( $? == 141 ))
    ) \
      | head -n "$lines_per_chunk" \
      > "$chunk"
  }

  all_lines_have_been_processed() (( processed_lines >= lines_to_process ))

  # limit chunk size to 50MB (du reports 1K blocks)
  chunk_is_too_big() (( $(du "$chunk" | awk '{print $1}') > 50000 ))

  reduce_chunk_size() {
    # divide by two, but keep an even number
    lines_per_chunk=$(( lines_per_chunk / 4 * 2 ))
    # A single metadata/document pair over the limit cannot be split any
    # further. Fail with an explicit message rather than posting an empty
    # chunk and reporting an unrelated HTTP error.
    if (( lines_per_chunk < 2 )); then
      echo "bulk_upload: a single metadata/document pair exceeds the 50MB chunk limit" >&2
      exit 1
    fi
  }

  until all_lines_have_been_processed; do
    get_next_chunk
    if chunk_is_too_big; then
      reduce_chunk_size
    else
      push_chunk
    fi
  done
)

# Rewrite the event and profile files of a job in place, replacing shortened
# Scala test names by their long names.
# See //bazel_tools:scala.bzl%da_scala_test_short_name_aspect.
#
#   $1 - job name (directory containing the extracted files)
#
# The mapping is read from scala-test-suite-name-map.json in the job directory
# (an object of short name -> long name). When the file is absent, only a
# message is printed. All working variables are local so that nothing leaks
# into the calling script.
patch() {
  local job map file f jq_command sed_command
  job="$1"
  map="scala-test-suite-name-map.json"
  if ! [[ -f "$job/$map" ]]; then
    echo "$job: no $map"
    return 0
  fi
  echo "$job: applying $map"
  # Generates a sed command to replace short labels by long labels: one
  # "s|short\b|long|g" expression per entry, joined with ";".
  jq_command='to_entries | map("s|\(.key)\\b|\(.value)|g") | join(";")'
  sed_command="$(jq -r "$jq_command" <"$job/$map")"
  for f in build-events build-profile test-events test-profile; do
    file="$job/$f.json"
    if [ -f "$file" ]; then
      sed -i "$sed_command" "$file"
    fi
  done
}

# Upload the build/test event and profile files of one job to ES.
#
#   $1 - job name (directory containing the extracted files)
#
# For each of build-events, build-profile, test-events and test-profile (in
# that order): a missing file or one that jq cannot parse is reported and
# skipped; otherwise the file is converted to bulk input and uploaded in a
# background subshell. The function then waits for every upload it started;
# under "set -e" a failed upload makes the wait, and hence the script, fail.
#
# cmd, kind and pid are declared local: previously cmd and pid were assigned
# in global scope, where pid clobbered the script-level variable of that name.
push() {
  local job cmd kind f pid pids
  job="$1"
  pids=""
  for cmd in "build" "test"; do
    for kind in "events" "profile"; do
      f="$job/$cmd-$kind.json"
      if ! [[ -f "$f" ]]; then
        echo "$job: no $cmd-$kind.json"
      elif ! jq . >/dev/null 2>&1 < "$f"; then
        echo "$job: $cmd-$kind.json exists but is not valid json, skipping"
      else
        echo "$job: pushing $cmd-$kind.json"
        if [[ "$kind" == "events" ]]; then
          (emit_build_events "$job" "$cmd" "$f" | bulk_upload) &
        else
          (emit_trace_events "$job" "$cmd" "$f" | bulk_upload) &
        fi
        pids="$pids $!"
      fi
    done
  done
  for pid in $pids; do
    wait $pid
  done
}

# Print the name of the ES index a job belongs to: "events-" followed by the
# first 10 characters of the job name (presumably its date prefix - confirm
# against the archive naming scheme).
#
#   $1 - job name
index() {
  local job head
  job="$1"
  head="$(echo $job | cut -c1-10)"
  echo "events-$head"
}

# --- Main body of cron.sh ---
# Expects GSUTIL, DATA, DONE and ES_IP in the environment (set by the crontab
# entry written by the startup script).

# Prefix every line of stderr/stdout with a UTC timestamp and this run's PID,
# so that interleaved runs can be told apart in the shared log file.
pid=$$
exec 2> >(while IFS= read -r line; do echo "$(date -uIs) [ingest] [$pid] [err]: $line"; done)
exec 1> >(while IFS= read -r line; do echo "$(date -uIs) [ingest] [$pid] [out]: $line"; done)

# Simple lock file so that a run started while the previous one is still busy
# exits immediately.
# NOTE(review): the lock is only removed by the EXIT trap; if the script is
# killed without running the trap (e.g. SIGKILL) the lock stays behind and
# every later run will skip - confirm this is acceptable.
LOCK=/root/lock

if [ -f $LOCK ]; then
  echo "Already running; skipping."
  exit 0
else
  touch $LOCK
  trap "rm $LOCK; echo exited" EXIT
  echo "Starting..."
fi

# Mirror the metrics archives from the bucket into the local data directory.
echo "Running rsync..."
$GSUTIL -q -m rsync -r gs://daml-data/bazel-metrics/ $DATA/
echo "Total data size: $(du -hs $DATA | awk '{print $1}')."

# todo: every archive found locally. comm: archive names without a marker in
# the done directory; it is only used for the progress message below.
# The doubled dollar sign in front of the parameter expansions is Terraform
# heredoc escaping; the script on disk contains a single one.
todo=$(find $DATA -type f -name \*.tar.gz | sort)
comm=$(comm -23 <(for f in $todo; do basename $${f%.tar.gz}; done | sort) <(ls $DONE | sort))

echo "Need to push $(echo "$comm" | sed '/^$/d' | wc -l) files out of $(echo "$todo" | sed '/^$/d' | wc -l)."

# Process each archive that has no done marker yet: make sure its index
# exists, extract it next to the archive, fix up Scala test names, upload,
# clean up, then record completion both in the ES "done" index and locally.
for tar in $todo; do
  job=$(basename $${tar%.tar.gz})
  cd $(dirname $tar)
  if ! [ -f $DONE/$job ]; then
    ensure_index "$job" "$(index "$job")"
    tar --force-local -x -z -f "$(basename "$tar")"
    patch "$job"
    push "$job"
    rm -rf $job
    # The document in the "done" index lets a fresh machine rebuild its local
    # markers (see the synchronization step of the startup script).
    r=$(curl -H 'Content-Type: application/json' \
             --fail \
             -s \
             "http://$ES_IP/done/_doc/$job" \
             -d '{}')
    echo "$job: $(echo $r | jq '.result')"
    touch "$DONE/$job"
  fi
done
CRON

cat <<'HOURLY' >/root/hourly.sh
#!/usr/bin/env bash
# Export the Kibana saved objects and upload them, gzipped, to
# gs://daml-data/kibana-export/<UTC date and hour>.gz.
# Expects KIBANA_IP and GSUTIL in the environment (set by the crontab entry).
set -euo pipefail

# Prefix every line of stderr/stdout with a UTC timestamp and this run's PID.
pid=$$
exec 2> >(while IFS= read -r line; do echo "$(date -uIs) [kibex] [$pid] [err]: $line"; done)
exec 1> >(while IFS= read -r line; do echo "$(date -uIs) [kibex] [$pid] [out]: $line"; done)

# First 13 characters of the ISO timestamp, i.e. date plus hour.
HOUR="$(date -u -Is | cut -c 1-13)"
TMP=$(mktemp)
# Remove the temporary archive on every exit path. Without this, one file was
# left behind in the temporary directory per run, including failed runs.
trap 'rm -f "$TMP"' EXIT
TARGET="gs://daml-data/kibana-export/$HOUR.gz"

echo "Starting Kibana export..."

# Kibana export API does not support wildcard, so we list all of the object
# types that exist as of Kibana 7.13.
curl http://$KIBANA_IP/api/saved_objects/_export \
     -XPOST \
     -d'{"excludeExportDetails": true,
         "type": ["visualization", "dashboard", "search", "index-pattern",
                  "config", "timelion-sheet"]}' \
     -H 'kbn-xsrf: true' \
     -H 'Content-Type: application/json' \
     --fail \
     --silent \
     | gzip -9 > "$TMP"


echo "Pushing $TARGET"

$GSUTIL -q cp "$TMP" "$TARGET"

echo "Done."
HOURLY

chmod +x /root/cron.sh
chmod +x /root/hourly.sh

# Load balancer addresses, filled in by Terraform. The indices follow the
# order of local.es_ports: 0 is "es", 1 is "kibana".
ES_IP=${google_compute_address.es[0].address}
KIB_IP=${google_compute_address.es[1].address}

# Local copy of the archives from the bucket.
DATA=/root/data
mkdir -p $DATA

# Marker files: one per index already checked and one per job already pushed.
DONE=/root/done
mkdir -p $DONE

# A fresh machine has no markers, so rebuild them from the cluster: first one
# marker per existing jobs-* / events-* index ...
echo "Synchronizing with cluster state..."
found=0
for prefix in jobs events; do
  for idx in $(curl --fail "http://$ES_IP/_cat/indices/$prefix-*?format=json" -s | jq -r '.[] | .index'); do
    found=$((found + 1))
    touch $DONE/$idx;
  done
done
echo "Found $found indices."

# ... then one marker per document in the "done" index, paging through it with
# the scroll API (1000 ids at a time) until a page comes back empty. If the
# index does not exist yet, create it.
if curl -s --fail -I "http://$ES_IP/done" >/dev/null; then
  found=0
  res=$(curl --fail "http://$ES_IP/done/_search?_source=false&size=1000&scroll=5m" -s)
  while (echo $res | jq -e '.hits.hits != []' >/dev/null); do
    for id in $(echo $res | jq -r '.hits.hits[]._id'); do
      found=$((found + 1))
      touch $DONE/$id
    done
    scroll_id=$(echo $res | jq -r '._scroll_id')
    res=$(curl "http://$ES_IP/_search/scroll" \
               -s \
               --fail \
               -d "$(jq --arg id "$scroll_id" \
                        -n \
                        '{scroll: "5m", scroll_id: $id}')" \
               -H 'Content-Type: application/json')
  done
  echo "Found $found jobs."
else
  echo "No done index; creating..."
  r=$(curl -XPUT "http://$ES_IP/done" \
           -d '{"settings": {"number_of_replicas": 2}}' \
           --fail \
           -s \
           -H 'Content-Type: application/json')
  echo $r
fi

# Register the two jobs in a dedicated cron.d file, overwritten on each boot.
# Appending to /etc/crontab added a duplicate pair of entries every time the
# startup script ran again on the same disk (e.g. after a reboot), which
# started several ingestion runs per minute. The heredoc is unquoted on
# purpose: paths and addresses are resolved now and written literally.
cat <<CRONTAB > /etc/cron.d/es-feed
* * * * * root GSUTIL="$(which gsutil)" DONE="$DONE" DATA="$DATA" ES_IP="$ES_IP" /root/cron.sh >> /root/log 2>&1
1 * * * * root GSUTIL="$(which gsutil)" KIBANA_IP="$KIB_IP" /root/hourly.sh >> /root/log 2>&1
CRONTAB

# Keep the startup script alive and stream the cron jobs' log to its output.
echo "Waiting for first run..." > /root/log
tail -f /root/log

STARTUP

  network_interface {
    network = google_compute_network.es.name
    access_config {
      # Fixed external address; it is allow-listed in the "es" security policy
      # so the machine can reach ES and Kibana through the load balancer.
      nat_ip = google_compute_address.es-feed.address
    }
  }

  service_account {
    email = google_service_account.es-feed.email
    scopes = [
      # Required for cloud logging
      "logging-write",
      # Read/write access to storage: the metrics archives are read from the
      # bucket and the hourly Kibana export is written back to it.
      "storage-rw",
    ]
  }

  # Preemptible machine: the instance is terminated rather than migrated on
  # host maintenance and is not restarted automatically.
  scheduling {
    automatic_restart   = false
    on_host_maintenance = "TERMINATE"
    preemptible         = true
  }

  # The instance group manager references this template, so the replacement
  # must exist before the old template is destroyed.
  lifecycle {
    create_before_destroy = true
  }

}