From 4a6ab84b692df801926ed448f1e7d04af498d1ce Mon Sep 17 00:00:00 2001 From: Gary Verhaegen Date: Sat, 9 May 2020 18:21:42 +0200 Subject: [PATCH] add default machine capability (#5912) add default machine capability We semi-regularly need to do work that has the potential to disrupt a machine's local cache, rendering it broken for other streams of work. This can include upgrading nix, upgrading Bazel, debugging caching issues, or anything related to Windows. Right now we do not have any good solution for these situations. We can either not do those streams of work, or we can proceed with them and just accept that all other builds may be affected depending on which machine they get assigned to. Debugging broken nodes is particularly tricky as we do not have any way to force a build to run on a given node. This PR aims to provide a better alternative by (ab)using an Azure Pipelines feature called [capabilities](https://docs.microsoft.com/en-us/azure/devops/pipelines/agents/agents?view=azure-devops&tabs=browser#capabilities). The idea behind capabilities is that you assign a set of tags to a machine, and then a job can express its [demands](https://docs.microsoft.com/en-us/azure/devops/pipelines/process/demands?view=azure-devops&tabs=yaml), i.e. specify a set of tags machines need to have in order to run it. Support for this is rather poorly documented. We can gather from the documentation that a job can specify two things about a capability (through its `demands`): that a given tag exists, and that a given tag has an exact specified value. In particular, a job cannot specify that a capability should _not_ be present, meaning we cannot rely on, say, adding a "broken" tag to broken machines. 
Documentation on how to set capabilities for an agent is basically nonexistent, but [looking at the code](https://github.com/microsoft/azure-pipelines-agent/blob/master/src/Microsoft.VisualStudio.Services.Agent/Capabilities/UserCapabilitiesProvider.cs) indicates that they can be set by using a simple `key=value`-formatted text file, provided we can find the right place to put this file. This PR adds this file to our Linux, macOS and Windows node init scripts to define an `assignment` capability and adds a demand for a `default` value on each job. From then on, when we hit a case where we want a PR to run on a specific node, and to prevent other PRs from running on that node, we can manually override the capability from the Azure UI and update the demand in the relevant YAML file in the PR. CHANGELOG_BEGIN CHANGELOG_END --- azure-cron.yml | 4 ++++ azure-pipelines.yml | 11 +++++++++++ ci/cron/daily-compat.yml | 2 ++ ci/cron/monthly.yaml | 1 + infra/vsts_agent_linux_startup.sh | 1 + infra/vsts_agent_windows.tf | 3 +++ 6 files changed, 22 insertions(+) diff --git a/azure-cron.yml b/azure-cron.yml index 81ffe100742..3b7e2be0505 100644 --- a/azure-cron.yml +++ b/azure-cron.yml @@ -32,6 +32,7 @@ jobs: timeoutInMinutes: 120 pool: name: 'linux-pool' + demands: assignment -equals default steps: - checkout: self - bash: ci/dev-env-install.sh @@ -55,6 +56,7 @@ jobs: timeoutInMinutes: 60 pool: name: 'linux-pool' + demands: assignment -equals default steps: - checkout: self - bash: | @@ -95,6 +97,7 @@ jobs: timeoutInMinutes: 10 pool: name: 'linux-pool' + demands: assignment -equals default steps: - checkout: self - bash: | @@ -161,6 +164,7 @@ jobs: timeoutInMinutes: 10 pool: name: "linux-pool" + demands: assignment -equals default steps: - checkout: self - bash: | diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 26b1904e879..eb319a49aeb 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -25,6 +25,7 @@ jobs: - job: git_sha pool: name: 'linux-pool' + 
demands: assignment -equals default steps: - bash: | set -euo pipefail @@ -48,6 +49,7 @@ jobs: condition: eq(variables['Build.Reason'], 'PullRequest') pool: name: 'linux-pool' + demands: assignment -equals default steps: - checkout: self - bash: | @@ -75,6 +77,7 @@ jobs: condition: eq(variables['Build.Reason'], 'PullRequest') pool: name: 'linux-pool' + demands: assignment -equals default steps: - checkout: self - bash: ci/check-changelog.sh $(fork_sha) @@ -90,6 +93,7 @@ jobs: timeoutInMinutes: 360 pool: name: 'linux-pool' + demands: assignment -equals default steps: - template: ci/report-start.yml - checkout: self @@ -156,6 +160,7 @@ jobs: timeoutInMinutes: 360 pool: name: 'windows-pool' + demands: assignment -equals default steps: - template: ci/report-start.yml - checkout: self @@ -185,6 +190,7 @@ jobs: timeoutInMinutes: 60 pool: name: linux-pool + demands: assignment -equals default steps: - template: ci/report-start.yml - checkout: self @@ -215,6 +221,7 @@ jobs: timeoutInMinutes: 60 pool: name: 'windows-pool' + demands: assignment -equals default steps: - template: ci/report-start.yml - checkout: self @@ -237,6 +244,7 @@ jobs: fork_sha: $[ dependencies.git_sha.outputs['out.fork_point'] ] pool: name: "linux-pool" + demands: assignment -equals default steps: - bash: | set -euo pipefail @@ -263,6 +271,7 @@ jobs: - job: check_perf_test pool: name: linux-pool + demands: assignment -equals default condition: eq(variables['Build.Reason'], 'IndividualCI') steps: - bash: | @@ -430,6 +439,7 @@ jobs: - compatibility_windows pool: name: "linux-pool" + demands: assignment -equals default variables: Linux.start: $[ dependencies.Linux.outputs['start.time'] ] Linux.machine: $[ dependencies.Linux.outputs['start.machine'] ] @@ -585,6 +595,7 @@ jobs: - collect_build_data pool: name: 'linux-pool' + demands: assignment -equals default variables: pr.num: $[ variables['System.PullRequest.PullRequestNumber'] ] branch_sha: $[ dependencies.git_sha.outputs['out.branch'] ] diff --git 
a/ci/cron/daily-compat.yml b/ci/cron/daily-compat.yml index 903213f75c2..66d296b2d66 100644 --- a/ci/cron/daily-compat.yml +++ b/ci/cron/daily-compat.yml @@ -56,6 +56,7 @@ jobs: timeoutInMinutes: 240 pool: name: windows-pool + demands: assignment -equals default steps: - checkout: self - template: ../compatibility-windows.yml @@ -84,6 +85,7 @@ jobs: timeoutInMinutes: 120 pool: name: "linux-pool" + demands: assignment -equals default steps: - checkout: self - bash: ci/dev-env-install.sh diff --git a/ci/cron/monthly.yaml b/ci/cron/monthly.yaml index d211fb3293d..5ad1d05ff4c 100644 --- a/ci/cron/monthly.yaml +++ b/ci/cron/monthly.yaml @@ -23,6 +23,7 @@ jobs: timeoutInMinutes: 20 pool: name: 'linux-pool' + demands: assignment -equals default steps: - checkout: self - bash: | diff --git a/infra/vsts_agent_linux_startup.sh b/infra/vsts_agent_linux_startup.sh index 34f499d62c0..6b42f6d75df 100644 --- a/infra/vsts_agent_linux_startup.sh +++ b/infra/vsts_agent_linux_startup.sh @@ -104,6 +104,7 @@ VSTS_TOKEN=${vsts_token} mkdir -p ~/agent cd ~/agent +echo 'assignment=default' > .capabilities echo Determining matching VSTS agent... VSTS_AGENT_RESPONSE=$(curl -sSfL \ diff --git a/infra/vsts_agent_windows.tf b/infra/vsts_agent_windows.tf index c7c44291c30..54f8a7643f8 100644 --- a/infra/vsts_agent_windows.tf +++ b/infra/vsts_agent_windows.tf @@ -128,6 +128,9 @@ net start winrm echo "== Installing the VSTS agent" +New-Item -ItemType Directory -Path 'C:\agent' +Set-Content -Path 'C:\agent\.capabilities' -Value 'assignment=default' + $MachineName = Get-CimInstance -ClassName Win32_OperatingSystem | Select-Object CSName | ForEach{ $_.CSName } choco install azure-pipelines-agent --no-progress --yes --params "'/Token:${local.vsts_token} /Pool:${local.vsts_pool} /Url:https://dev.azure.com/${local.vsts_account}/ /LogonAccount:$Account /LogonPassword:$Password /Work:D:\a /AgentName:$MachineName /Replace'" echo OK