daml/azure-pipelines.yml
Gary Verhaegen 11efaebe8e
tell Gary when Azure fails to fetch commit (#4611)
We have seen a number of failures recently where the collect_build_data
and notify_user steps failed to fetch their branch commit from GitHub.
Trying to reproduce locally doesn't work (i.e. fetching the same sha
succeeds), so we're currently assuming transient network errors.

This PR adds a retry mechanism as well as a Slack message to help me
keep tabs on the issue.

CHANGELOG_BEGIN
CHANGELOG_END
2020-02-20 13:07:53 +01:00

595 lines
25 KiB
YAML

# Copyright (c) 2020 The DAML Authors. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Azure Pipelines file, see https://aka.ms/yaml
# Enable builds on all branches
trigger:
# Build every commit as our release process relies on
# the release process being built alone.
batch: false
branches:
include:
- master
# Enable PR triggers that target the master branch
pr:
autoCancel: true # cancel previous builds on push
branches:
include:
- master
jobs:
- job: git_sha
pool:
name: 'linux-pool'
steps:
- bash: |
set -euo pipefail
if [ "$(Build.Reason)" == "PullRequest" ]; then
echo "##vso[task.setvariable variable=branch;isOutput=true]$(git rev-parse 'HEAD^2')"
echo "##vso[task.setvariable variable=master;isOutput=true]$(git rev-parse 'HEAD^1')"
echo "##vso[task.setvariable variable=fork_point;isOutput=true]$(git merge-base --fork-point 'HEAD^1' 'HEAD^2')"
else
echo "##vso[task.setvariable variable=branch;isOutput=true]$(git rev-parse HEAD)"
echo "##vso[task.setvariable variable=master;isOutput=true]$(git rev-parse HEAD)"
echo "##vso[task.setvariable variable=fork_point;isOutput=true]$(git rev-parse HEAD)"
fi
name: out
- job: check_standard_change_label
condition: eq(variables['Build.Reason'], 'PullRequest')
pool:
name: 'linux-pool'
steps:
- checkout: self
- bash: |
set -euo pipefail
has_changed_infra_folder () {
git diff origin/master --name-only | grep -q '^infra/'
}
fail_if_missing_std_change_label () {
curl https://api.github.com/repos/digital-asset/daml/pulls/$PR -s | jq -r '.labels[].name' | grep -q '^Standard-Change$'
}
if has_changed_infra_folder; then
fail_if_missing_std_change_label
fi
env:
PR: $(System.PullRequest.PullRequestNumber)
- job: check_changelog_entry
condition: eq(variables['Build.Reason'], 'PullRequest')
pool:
name: 'linux-pool'
steps:
- checkout: self
- bash: ci/check-changelog.sh
- job: Linux
timeoutInMinutes: 360
pool:
name: 'linux-pool'
steps:
- template: ci/report-start.yml
- checkout: self
- template: ci/build-unix.yml
parameters:
name: linux
- template: ci/tell-slack-failed.yml
- template: ci/report-end.yml
- job: macOS
timeoutInMinutes: 360
pool:
vmImage: 'macOS-10.14'
variables:
nix-cache-key: $(Build.StagingDirectory)/macos-nix-key
nix-cache-path: /tmp/nix-cache/
bazel-repo-cache-key: $(Build.StagingDirectory)/bazel-repo-cache-key
bazel-repo-cache-path: $(Agent.BuildDirectory)/.bazel-cache/repo
steps:
- template: ci/report-start.yml
- checkout: self
- bash: echo $(git log -n1 --pretty=format:%H dev-env nix azure-pipelines.yml) >> $(nix-cache-key)
displayName: nix cache key
- task: CacheBeta@0
inputs:
key: $(nix-cache-key) | v2
path: $(nix-cache-path)
- bash: |
set -euo pipefail
if [[ -e $(nix-cache-path) ]]; then
DIR=$(pwd)
sudo mkdir /nix && sudo chown $USER /nix
cd /nix
tar xzf $(nix-cache-path)/nix.tar.gz
cd $DIR
curl -sfL https://nixos.org/releases/nix/nix-2.3.2/install | bash
fi
displayName: restore cache
- bash: echo $(git log -n1 --pretty=format:%H azure-pipelines.yml $(find . -name \*.bazel -or -name \*.bzl -or -name WORKSPACE -or -name BUILD)) >> $(bazel-repo-cache-key)
displayName: bazel repo cache key
- task: CacheBeta@0
inputs:
key: $(bazel-repo-cache-key)
path: $(bazel-repo-cache-path)
- template: ci/build-unix.yml
parameters:
name: macos
- bash: |
set -euo pipefail
if [[ ! -e $(nix-cache-path) ]]; then
mkdir -p $(nix-cache-path)
cd /nix
GZIP=-9 tar czf $(nix-cache-path)/nix.tar.gz store var
fi
displayName: create nix cache
- bash: mkdir -p $(bazel-repo-cache-path)
displayName: ensure bazel repo cache exists
- template: ci/tell-slack-failed.yml
- template: ci/report-end.yml
- job: Windows
timeoutInMinutes: 360
pool:
name: 'windows-pool'
steps:
- template: ci/report-start.yml
- template: ci/build-windows.yml
- template: ci/tell-slack-failed.yml
- template: ci/report-end.yml
- job: perf
timeoutInMinutes: 60
pool:
name: 'linux-pool'
steps:
- template: ci/report-start.yml
- checkout: self
- bash: ci/dev-env-install.sh
displayName: 'Build/Install the Developer Environment'
- bash: ci/configure-bazel.sh
displayName: 'Configure Bazel'
env:
IS_FORK: $(System.PullRequest.IsFork)
# to upload to the bazel cache
GOOGLE_APPLICATION_CREDENTIALS_CONTENT: $(GOOGLE_APPLICATION_CREDENTIALS_CONTENT)
- bash: |
set -euo pipefail
eval "$(./dev-env/bin/dade-assist)"
bazel run -- //ledger/sandbox-perf -foe true -i1 -f1 -wi 1 -bm avgt -rf csv -rff "$(Build.StagingDirectory)/sandbox-perf.csv"
- task: PublishBuildArtifacts@1
condition: succeededOrFailed()
inputs:
pathtoPublish: '$(Build.StagingDirectory)'
artifactName: 'Perf test logs'
- template: ci/tell-slack-failed.yml
- template: ci/report-end.yml
- job: Windows_signing
# Signing is a separate job so that we can make sure that we only sign on releases.
# Since the release check is run on Linux, we do not have access to that information
# in the regular Windows step.
dependsOn: [ "Windows", "Linux" ]
pool:
name: 'windows-pool'
condition: and(succeeded(), eq(dependencies.Linux.outputs['release.has_released'], 'true'))
variables:
unsigned-installer: $[ dependencies.Windows.outputs['publish.artifact-unsigned-windows-installer'] ]
steps:
- template: ci/report-start.yml
- checkout: self
persistCredentials: true
- task: DownloadPipelineArtifact@0
inputs:
artifactName: $(unsigned-installer)
targetPath: $(Build.StagingDirectory)/
- bash: |
set -euo pipefail
INSTALLER=daml-sdk-$(cat VERSION)-windows.exe
mv "$(Build.StagingDirectory)/$(unsigned-installer)" "$(Build.StagingDirectory)/$INSTALLER"
chmod +x "$(Build.StagingDirectory)/$INSTALLER"
cleanup () {
rm -f signing_key.pfx
}
trap cleanup EXIT
echo "$SIGNING_KEY" | base64 -d > signing_key.pfx
MSYS_NO_PATHCONV=1 signtool.exe sign '/f' signing_key.pfx '/fd' sha256 '/tr' "http://timestamp.digicert.com" '/v' "$(Build.StagingDirectory)/$INSTALLER"
rm signing_key.pfx
echo "##vso[task.setvariable variable=artifact-windows-installer;isOutput=true]$INSTALLER"
echo "##vso[task.setvariable variable=has_released;isOutput=true]true"
name: signing
env:
SIGNING_KEY: $(microsoft-code-signing)
- task: PublishPipelineArtifact@0
inputs:
targetPath: $(Build.StagingDirectory)/$(signing.artifact-windows-installer)
artifactName: $(signing.artifact-windows-installer)
- template: ci/tell-slack-failed.yml
- template: ci/report-end.yml
- job: release
dependsOn: [ "Linux", "macOS", "Windows", "Windows_signing", "perf"]
pool:
vmImage: "Ubuntu-16.04"
condition: and(succeeded(),
eq( dependencies.Linux.outputs['release.has_released'], 'true' ),
eq( dependencies.Windows.outputs['release.has_released'], 'true' ),
eq( dependencies.Windows_signing.outputs['signing.has_released'], 'true' ),
eq( dependencies.macOS.outputs['release.has_released'], 'true' ))
variables:
artifact-linux: $[ dependencies.Linux.outputs['publish.artifact'] ]
artifact-macos: $[ dependencies.macOS.outputs['publish.artifact'] ]
artifact-windows: $[ dependencies.Windows.outputs['publish.artifact'] ]
artifact-windows-installer: $[ dependencies.Windows_signing.outputs['signing.artifact-windows-installer'] ]
steps:
- template: ci/report-start.yml
- checkout: self
persistCredentials: true
- bash: |
set -euxo pipefail
if git tag v$(cat VERSION); then
git push origin v$(cat VERSION)
mkdir $(Build.StagingDirectory)/release
else
echo "##vso[task.setvariable variable=skip-github]TRUE"
fi
- task: DownloadPipelineArtifact@0
inputs:
artifactName: $(artifact-linux)
targetPath: $(Build.StagingDirectory)/release
condition: not(eq(variables['skip-github'], 'TRUE'))
- task: DownloadPipelineArtifact@0
inputs:
artifactName: $(artifact-macos)
targetPath: $(Build.StagingDirectory)/release
condition: not(eq(variables['skip-github'], 'TRUE'))
- task: DownloadPipelineArtifact@0
inputs:
artifactName: $(artifact-windows)
targetPath: $(Build.StagingDirectory)/release
condition: not(eq(variables['skip-github'], 'TRUE'))
- task: DownloadPipelineArtifact@0
inputs:
artifactName: $(artifact-windows-installer)
targetPath: $(Build.StagingDirectory)/release
condition: not(eq(variables['skip-github'], 'TRUE'))
- bash: |
set -euo pipefail
KEY_FILE=$(mktemp)
GPG_DIR=$(mktemp -d)
cleanup() {
rm -rf $KEY_FILE $GPG_DIR
}
trap cleanup EXIT
echo "$GPG_KEY" | base64 -d > $KEY_FILE
gpg --homedir $GPG_DIR --no-tty --quiet --import $KEY_FILE
cd $(Build.StagingDirectory)/release
# Note: relies on our release artifacts not having spaces in their
# names. Creates a ${f}.asc with the signature for each $f.
for f in *; do
gpg --homedir $GPG_DIR -ab $f
done
env:
GPG_KEY: $(gpg-code-signing)
- task: GitHubRelease@0
inputs:
gitHubConnection: 'garyverhaegen-da'
repositoryName: '$(Build.Repository.Name)'
action: 'create'
target: '$(Build.SourceVersion)'
tagSource: 'auto'
assets: $(Build.StagingDirectory)/release/*
assetUploadMode: 'replace'
addChangeLog: false
isPrerelease: true
condition: not(eq(variables['skip-github'], 'TRUE'))
- template: ci/tell-slack-failed.yml
- template: ci/report-end.yml
- job: write_ledger_dump
dependsOn: [ "Linux", "macOS", "Windows", "Windows_signing", "perf"]
pool:
vmImage: "Ubuntu-16.04"
condition: and(succeeded(),
eq( dependencies.Linux.outputs['release.has_released'], 'true' ),
eq( dependencies.Windows.outputs['release.has_released'], 'true' ),
eq( dependencies.Windows_signing.outputs['signing.has_released'], 'true' ),
eq( dependencies.macOS.outputs['release.has_released'], 'true' ))
steps:
- checkout: self
- bash: |
set -euo pipefail
sudo mkdir -p /nix
sudo chown $USER /nix
curl -sfL https://nixos.org/releases/nix/nix-2.3.2/install | bash
eval "$(dev-env/bin/dade-assist)"
GCS_KEY=$(mktemp)
cleanup () {
rm -f $GCS_KEY
}
trap cleanup EXIT
echo "$GOOGLE_APPLICATION_CREDENTIALS_CONTENT" > $GCS_KEY
gcloud auth activate-service-account --key-file=$GCS_KEY
export BOTO_CONFIG=/dev/null
bazel build //ledger/api-server-damlonx/reference-v2:reference-ledger-dump
gsutil cp bazel-bin/ledger/api-server-damlonx/reference-v2/reference-ledger-dump.out \
gs://daml-dumps/release/ledger/api-server-damlonx/reference-v2/reference-ledger-dump-$(cat VERSION)
env:
GOOGLE_APPLICATION_CREDENTIALS_CONTENT: $(GOOGLE_APPLICATION_CREDENTIALS_CONTENT)
- template: ci/tell-slack-failed.yml
- job: collect_build_data
condition: always()
dependsOn:
- Linux
- macOS
- Windows
- Windows_signing
- perf
- release
- check_standard_change_label
- write_ledger_dump
- git_sha
pool:
name: "linux-pool"
variables:
Linux.start: $[ dependencies.Linux.outputs['start.time'] ]
Linux.machine: $[ dependencies.Linux.outputs['start.machine'] ]
Linux.end: $[ dependencies.Linux.outputs['end.time'] ]
Linux.status: $[ dependencies.Linux.result ]
macOS.start: $[ dependencies.macOS.outputs['start.time'] ]
macOS.machine: $[ dependencies.macOS.outputs['start.machine'] ]
macOS.end: $[ dependencies.macOS.outputs['end.time'] ]
macOS.status: $[ dependencies.macOS.result ]
Windows.start: $[ dependencies.Windows.outputs['start.time'] ]
Windows.machine: $[ dependencies.Windows.outputs['start.machine'] ]
Windows.end: $[ dependencies.Windows.outputs['end.time'] ]
Windows.status: $[ dependencies.Windows.result ]
Windows_signing.start: $[ dependencies.Windows_signing.outputs['start.time'] ]
Windows_signing.machine: $[ dependencies.Windows_signing.outputs['start.machine'] ]
Windows_signing.end: $[ dependencies.Windows_signing.outputs['end.time'] ]
Windows_signing.status: $[ dependencies.Windows_signing.result ]
perf.start: $[ dependencies.perf.outputs['start.time'] ]
perf.machine: $[ dependencies.perf.outputs['start.machine'] ]
perf.end: $[ dependencies.perf.outputs['end.time'] ]
perf.status: $[ dependencies.perf.result ]
release.start: $[ dependencies.release.outputs['start.time'] ]
release.machine: $[ dependencies.release.outputs['start.machine'] ]
release.end: $[ dependencies.release.outputs['end.time'] ]
release.status: $[ dependencies.release.result ]
std_change.start: $[ dependencies.check_standard_change_label.outputs['start.time'] ]
std_change.machine: $[ dependencies.check_standard_change_label.outputs['start.machine'] ]
std_change.end: $[ dependencies.check_standard_change_label.outputs['end.time'] ]
std_change.status: $[ dependencies.check_standard_change_label.result ]
dump.start: $[ dependencies.write_ledger_dump.outputs['start.time'] ]
dump.machine: $[ dependencies.write_ledger_dump.outputs['start.machine'] ]
dump.end: $[ dependencies.write_ledger_dump.outputs['end.time'] ]
dump.status: $[ dependencies.write_ledger_dump.result ]
branch_sha: $[ dependencies.git_sha.outputs['out.branch'] ]
master_sha: $[ dependencies.git_sha.outputs['out.master'] ]
fork_sha: $[ dependencies.git_sha.outputs['out.fork_point'] ]
# Using expression syntax so we get an empty string if not set, rather
# than the raw $(VarName) string. Expression syntax works on the
# variables key, but not on the env one, so we need an extra indirection.
# Note: These Azure variables are only set for PR builds.
pr.num: $[ variables['System.PullRequest.PullRequestNumber'] ]
pr.branch: $[ variables['System.PullRequest.SourceBranch'] ]
steps:
# some change in Azure configuration makes this fail recently (2020-01).
# Azure runs PR builds not on the PR commit, but on the GitHub-provided
# commit that would be the result of merging the PR. Recently, it looks
# like when it reaches the point of running this job (which has to run
# after the macOS one, which sometimes takes up to an hour), if master
# has changed in the meantime, Azure cannot find the commit it wants to
# build anymore. Therefore, we tell it not to checkout anything, and
# manually checkout the PR commit.
- checkout: none
- bash: |
set -euo pipefail
# Note: this is going to get the PR branch commit, not the
# result of the merge (i.e. this is not using the same commit as the
# other jobs in this build).
#
# We have seen errors getting this commit from GitHub, which we
# suspect are transient network errors, so adding simple exponential
# backoff to mitigate.
BACKOFF=1
tell_gary() {
curl -XPOST \
-i \
-H 'Content-Type: application/json' \
--data "{\"text\":\"<@UEHSF89AQ> <https://dev.azure.com/digitalasset/daml/_build/results?buildId=$(Build.BuildId)|Build $(Build.BuildId)> for <https://github.com/digital-asset/daml/pull/$(pr.num)|PR $(pr.num)> has failed to fetch its commit $(branch_sha) up to BACKOFF=$BACKOFF.\"}" \
$(Slack.team-daml-ci)
}
trap tell_gary EXIT
while ! git fetch origin $(branch_sha); do
if (( $BACKOFF > 500 )); then
echo "Could not get commit $(branch_sha); something is very wrong."
exit 1
else
sleep $BACKOFF
let "BACKOFF = $BACKOFF * 2"
fi
done
if (( $BACKOFF > 1 )); then
tell_gary
fi
trap - EXIT
git checkout $(branch_sha)
eval "$(./dev-env/bin/dade-assist)"
REPORT=$(mktemp)
cat >$REPORT <<END
{"jobs": {"Linux": {"start": "$(Linux.start)",
"machine": "$(Linux.machine)",
"end": "$(Linux.end)",
"status": "$(Linux.status)"},
"macOS": {"start": "$(macOS.start)",
"machine": "$(macOS.machine)",
"end": "$(macOS.end)",
"status": "$(macOS.status)"},
"Windows": {"start": "$(Windows.start)",
"machine": "$(Windows.machine)",
"end": "$(Windows.end)",
"status": "$(Windows.status)"},
"Windows_signing": {"start": "$(Windows_signing.start)",
"machine": "$(Windows_signing.machine)",
"end": "$(Windows_signing.end)",
"status": "$(Windows_signing.status)"},
"perf": {"start": "$(perf.start)",
"machine": "$(perf.machine)",
"end": "$(perf.end)",
"status": "$(perf.status)"},
"check_standard_change_label": {"start": "$(std_change.start)",
"machine": "$(std_change.machine)",
"end": "$(std_change.end)",
"status": "$(std_change.status)"},
"write_ledger_dump": {"start": "$(dump.start)",
"machine": "$(dump.machine)",
"end": "$(dump.end)",
"status": "$(dump.status)"},
"release": {"start": "$(release.start)",
"machine": "$(release.machine)",
"end": "$(release.end)",
"status": "$(release.status)"}},
"id": "$(Build.BuildId)",
"url": "https://dev.azure.com/digitalasset/daml/_build/results?buildId=$(Build.BuildId)",
"name": "$(Build.DefinitionName)",
"version": "$(Build.DefinitionVersion)",
"queued_by": "$(Build.QueuedBy)",
"reason": "$(Build.Reason)",
"branch": "$(Build.SourceBranch)",
"merge_commit": "$(Build.SourceVersion)",
"branch_commit": "$(branch_sha)",
"master_commit": "$(master_sha)",
"fork_point_commit": "$(fork_sha)",
"commit_message": $(echo -n "$COMMIT_MSG" | jq -sR),
"is_fork": "$(System.PullRequest.IsFork)",
"pr": "$PR_NUM",
"pr_url": "https://github.com/digital-asset/daml/pull/$PR_NUM",
"pr_source_branch": "$PR_BRANCH"}
END
# Test above JSON is well formed
cat $REPORT | jq '.'
REPORT_GZ=$(mktemp)
cat $REPORT | gzip -9 > $REPORT_GZ
GCS_KEY=$(mktemp)
cleanup() {
rm -rf $GCS_KEY
}
trap cleanup EXIT
# Application credentials will not be set for forks. We give up on
# tracking those for now. "Not set" in Azure world means set to the
# expression Azure would otherwise substitute, i.e. the literal value
# of the string in the `env:` block below.
if [[ "${GOOGLE_APPLICATION_CREDENTIALS_CONTENT:1:${#GOOGLE_APPLICATION_CREDENTIALS_CONTENT}-1}" != '(GOOGLE_APPLICATION_CREDENTIALS_CONTENT)' ]]; then
echo "$GOOGLE_APPLICATION_CREDENTIALS_CONTENT" > $GCS_KEY
gcloud auth activate-service-account --key-file=$GCS_KEY
BOTO_CONFIG=/dev/null gsutil cp $REPORT_GZ gs://daml-data/builds/$(Build.BuildId)_$(date -u +%Y%m%d_%H%M%SZ).json.gz
else
echo "Could not save build data: no credentials. Data was:"
cat $REPORT
fi
# Linux, macOS, perf, check_std_change_label and Windows are always
# required and should always succeed.
#
# windows_signing, release and write_ledger_dump only run on releases
# and are skipped otherwise.
if [[ "$(Linux.status)" != "Succeeded"
|| "$(macOS.status)" != "Succeeded"
|| "$(Windows.status)" != "Succeeded"
|| "$(perf.status)" != "Succeeded"
|| "$(dump.status)" == "Canceled"
|| "$(Windows_signing.status)" == "Canceled"
|| "$(release.status)" == "Canceled" ]]; then
exit 1
fi
env:
GOOGLE_APPLICATION_CREDENTIALS_CONTENT: $(GOOGLE_APPLICATION_CREDENTIALS_CONTENT)
# Commit message is always set
COMMIT_MSG: $(Build.SourceVersionMessage)
# Because these variables are always set (in the variables block),
# hopefully these should be set as expected (i.e. either correct
# value or empty string, but not $(Azure.Variable.Name)).
PR_NUM: $(pr.num)
PR_BRANCH: $(pr.branch)
- job: notify_user
condition: eq(variables['Build.Reason'], 'PullRequest')
dependsOn:
- git_sha
- collect_build_data
pool:
name: 'linux-pool'
variables:
pr.num: $[ variables['System.PullRequest.PullRequestNumber'] ]
branch_sha: $[ dependencies.git_sha.outputs['out.branch'] ]
status: $[ dependencies.collect_build_data.result ]
steps:
- checkout: none
- bash: |
set -euo pipefail
# We have seen errors getting this commit from GitHub, which we
# suspect are transient network errors, so adding simple exponential
# backoff to mitigate.
BACKOFF=1
tell_gary() {
curl -XPOST \
-i \
-H 'Content-Type: application/json' \
--data "{\"text\":\"<@UEHSF89AQ> <https://dev.azure.com/digitalasset/daml/_build/results?buildId=$(Build.BuildId)|Build $(Build.BuildId)> for <https://github.com/digital-asset/daml/pull/$(pr.num)|PR $(pr.num)> has failed to fetch its commit $(branch_sha) up to BACKOFF=$BACKOFF.\"}" \
$(Slack.team-daml-ci)
}
trap tell_gary EXIT
while ! git fetch origin $(branch_sha); do
if (( $BACKOFF > 500 )); then
echo "Could not get commit $(branch_sha); something is very wrong."
exit 1
else
sleep $BACKOFF
let "BACKOFF = $BACKOFF * 2"
fi
done
if (( $BACKOFF > 1 )); then
tell_gary
fi
trap - EXIT
git checkout $(branch_sha)
tell_slack() {
local MESSAGE=$1
local USER_ID=$2
curl -XPOST \
-i \
-H 'Content-Type: application/json' \
--data "{\"text\":\"<@${USER_ID}> <https://dev.azure.com/digitalasset/daml/_build/results?buildId=$(Build.BuildId)|Build $(Build.BuildId)> for <https://github.com/digital-asset/daml/pull/$(pr.num)|PR $(pr.num)> has completed with status ${MESSAGE}.\"}" \
$(Slack.team-daml-ci)
}
EMAIL=$(git log -n 1 --format=%ae)
user_registered() {
cat ci/slack_user_ids | grep $EMAIL
}
user_id() {
echo $(cat ci/slack_user_ids | grep $EMAIL | awk '{print $2}')
}
if user_registered; then
tell_slack "$(status)" "$(user_id)"
else
echo "User $(user_id) did not opt in for notifications."
fi