daml/infra/vsts_agent_windows.tf

185 lines
5.7 KiB
Terraform
Raw Normal View History

# Copyright (c) 2022 Digital Asset (Switzerland) GmbH and/or its affiliates. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
locals {
vsts_token = secret_resource.vsts-token.value
vsts_account = "digitalasset"
vsts_pool = "windows-pool"
}
locals {
w = [
{
name = "ci-w1",
size = 0,
assignment = "default",
disk_size = 200,
},
{
name = "ci-w2"
size = 6,
assignment = "default",
disk_size = 400
},
]
}
resource "google_compute_region_instance_group_manager" "vsts-agent-windows" {
count = length(local.w)
provider = google-beta
name = local.w[count.index].name
# keep the name short. windows hostnames are limited to 12(?) chars.
# -5 for the random postfix:
base_instance_name = local.w[count.index].name
region = "us-east1"
target_size = local.w[count.index].size
version {
name = local.w[count.index].name
instance_template = google_compute_instance_template.vsts-agent-windows[count.index].self_link
}
# uncomment when we get a provider >3.55
#distribution_policy_target_shape = "ANY"
update_policy {
type = "PROACTIVE"
minimal_action = "REPLACE"
# minimum is the number of availability zones (3)
max_surge_fixed = 3
# calculated with: serial console last timestamp after boot - VM start
# 09:54:28 - 09:45:55 = 513 seconds
min_ready_sec = 520
instance_redistribution_type = "NONE"
}
}
resource "google_compute_instance_template" "vsts-agent-windows" {
count = length(local.w)
name_prefix = "${local.w[count.index].name}-"
machine_type = "c2-standard-8"
labels = local.machine-labels
disk {
disk_size_gb = local.w[count.index].disk_size
disk_type = "pd-ssd"
# find the image name with `gcloud compute images list`
source_image = "windows-cloud/windows-2016"
}
# Drive D:\ for the agent work folder
disk {
disk_size_gb = local.w[count.index].disk_size
disk_type = "pd-ssd"
}
lifecycle {
create_before_destroy = true
}
metadata = {
// Prepare the machine
windows-startup-script-ps1 = <<SYSPREP_SPECIALIZE
Set-StrictMode -Version latest
$ErrorActionPreference = 'Stop'
# Disable Windows Defender to speed up disk access
Set-MpPreference -DisableRealtimeMonitoring $true
# Disable Print Spooler service (security)
Stop-Service -Name Spooler -Force
Set-Service -Name Spooler -StartupType Disabled
# Enable long paths
Set-ItemProperty -Path 'HKLM:\SYSTEM\CurrentControlSet\Control\FileSystem' -Name LongPathsEnabled -Type DWord -Value 1
# Disable UAC
New-ItemProperty -Path HKLM:Software\Microsoft\Windows\CurrentVersion\policies\system -Name EnableLUA -PropertyType DWord -Value 0 -Force
# Redirect logs to SumoLogic
cd $env:UserProfile;
Invoke-WebRequest https://dl.google.com/cloudagents/windows/StackdriverLogging-v1-9.exe -OutFile StackdriverLogging-v1-9.exe;
.\StackdriverLogging-v1-9.exe /S /D="C:\Stackdriver\Logging\"
# Install chocolatey
[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12
iex (New-Object System.Net.WebClient).DownloadString('https://chocolatey.org/install.ps1')
# Install git, bash
& choco install git --no-progress --yes 2>&1 | %%{ "$_" }
& choco install windows-sdk-10.1 --no-progress --yes 2>&1 | %%{ "$_" }
# Add tools to the PATH
$OldPath = (Get-ItemProperty -Path 'Registry::HKEY_LOCAL_MACHINE\System\CurrentControlSet\Control\Session Manager\Environment' -Name PATH).path
$NewPath = "$OldPath;C:\Program Files\Git\bin;C:\Program Files (x86)\Windows Kits\10\App Certification Kit"
Set-ItemProperty -Path 'Registry::HKEY_LOCAL_MACHINE\System\CurrentControlSet\Control\Session Manager\Environment' -Name PATH -Value $NewPath
echo "== Prepare the D:\ drive"
$partition = @"
select disk 1
clean
convert gpt
create partition primary
format fs=ntfs quick
assign letter="D"
"@
$partition | Set-Content C:\diskpart.txt
& diskpart /s C:\diskpart.txt 2>&1 | %%{ "$_" }
# Create a temporary and random password for the VSTS user, forget about it once this script has finished running
$Username = "u"
$Account = "$env:COMPUTERNAME\$Username"
Add-Type -AssemblyName System.Web
$Password = [System.Web.Security.Membership]::GeneratePassword(24, 0)
echo "== Creating the VSTS user"
#New-LocalUser $Username -Password $SecurePassword -FullName $Username
net user $Username $Password /add /y
# net localgroup administrators $Username /add
Add-LocalGroupMember -Group "Administrators" -Member $Username
winrm set winrm/config/winrs '@{MaxMemoryPerShellMB="2048"}'
winrm set winrm/config '@{MaxTimeoutms="1800000"}'
winrm set winrm/config/service/auth '@{Basic="true"}'
net stop winrm
sc.exe config winrm start=auto
net start winrm
& choco install dotnetcore-3.1-sdk --no-progress --yes 2>&1 | %%{ "$_" }
echo "== Installing the VSTS agent"
add default machine capability (#5912) add default machine capability We semi-regularly need to do work that has the potential to disrupt a machine's local cache, rendering it broken for other streams of work. This can include upgrading nix, upgrading Bazel, debugging caching issues, or anything related to Windows. Right now we do not have any good solution for these situations. We can either not do those streams of work, or we can proceed with them and just accept that all other builds may get affected depending on which machine they get assigned to. Debugging broken nodes is particularly tricky as we do not have any way to force a build to run on a given node. This PR aims at providing a better alternative by (ab)using an Azure Pipelines feature called [capabilities](https://docs.microsoft.com/en-us/azure/devops/pipelines/agents/agents?view=azure-devops&tabs=browser#capabilities). The idea behind capabilities is that you assign a set of tags to a machine, and then a job can express its [demands](https://docs.microsoft.com/en-us/azure/devops/pipelines/process/demands?view=azure-devops&tabs=yaml), i.e. specify a set of tags machines need to have in order to run it. Support for this is fairly badly documented. We can gather from the documentation that a job can specify two things about a capability (through its `demands`): that a given tag exists, and that a given tag has an exact specified value. In particular, a job cannot specify that a capability should _not_ be present, meaning we cannot rely on, say, adding a "broken" tag to broken machines. Documentation on how to set capabilities for an agent is basically nonexistent, but [looking at the code](https://github.com/microsoft/azure-pipelines-agent/blob/master/src/Microsoft.VisualStudio.Services.Agent/Capabilities/UserCapabilitiesProvider.cs) indicates that they can be set by using a simple `key=value`-formatted text file, provided we can find the right place to put this file. This PR adds this file to our Linux, macOS and Windows node init scripts to define an `assignment` capability and adds a demand for a `default` value on each job. From then on, when we hit a case where we want a PR to run on a specific node, and to prevent other PRs from running on that node, we can manually override the capability from the Azure UI and update the demand in the relevant YAML file in the PR. CHANGELOG_BEGIN CHANGELOG_END
2020-05-09 19:21:42 +03:00
New-Item -ItemType Directory -Path 'C:\agent'
Set-Content -Path 'C:\agent\.capabilities' -Value 'assignment=${local.w[count.index].assignment}'
add default machine capability (#5912) add default machine capability We semi-regularly need to do work that has the potential to disrupt a machine's local cache, rendering it broken for other streams of work. This can include upgrading nix, upgrading Bazel, debugging caching issues, or anything related to Windows. Right now we do not have any good solution for these situations. We can either not do those streams of work, or we can proceed with them and just accept that all other builds may get affected depending on which machine they get assigned to. Debugging broken nodes is particularly tricky as we do not have any way to force a build to run on a given node. This PR aims at providing a better alternative by (ab)using an Azure Pipelines feature called [capabilities](https://docs.microsoft.com/en-us/azure/devops/pipelines/agents/agents?view=azure-devops&tabs=browser#capabilities). The idea behind capabilities is that you assign a set of tags to a machine, and then a job can express its [demands](https://docs.microsoft.com/en-us/azure/devops/pipelines/process/demands?view=azure-devops&tabs=yaml), i.e. specify a set of tags machines need to have in order to run it. Support for this is fairly badly documented. We can gather from the documentation that a job can specify two things about a capability (through its `demands`): that a given tag exists, and that a given tag has an exact specified value. In particular, a job cannot specify that a capability should _not_ be present, meaning we cannot rely on, say, adding a "broken" tag to broken machines. Documentation on how to set capabilities for an agent is basically nonexistent, but [looking at the code](https://github.com/microsoft/azure-pipelines-agent/blob/master/src/Microsoft.VisualStudio.Services.Agent/Capabilities/UserCapabilitiesProvider.cs) indicates that they can be set by using a simple `key=value`-formatted text file, provided we can find the right place to put this file. This PR adds this file to our Linux, macOS and Windows node init scripts to define an `assignment` capability and adds a demand for a `default` value on each job. From then on, when we hit a case where we want a PR to run on a specific node, and to prevent other PRs from running on that node, we can manually override the capability from the Azure UI and update the demand in the relevant YAML file in the PR. CHANGELOG_BEGIN CHANGELOG_END
2020-05-09 19:21:42 +03:00
$MachineName = Get-CimInstance -ClassName Win32_OperatingSystem | Select-Object CSName | ForEach{ $_.CSName }
choco install azure-pipelines-agent --no-progress --yes --params "'/Token:${local.vsts_token} /Pool:${local.vsts_pool} /Url:https://dev.azure.com/${local.vsts_account}/ /LogonAccount:$Account /LogonPassword:$Password /Work:D:\a /AgentName:$MachineName /Replace'"
echo OK
SYSPREP_SPECIALIZE
windows-shutdown-script-ps1 = "c://agent/config remove --unattended --auth PAT --token '${secret_resource.vsts-token.value}'"
}
network_interface {
network = "default"
// Ephemeral IP to get access to the Internet
access_config {}
}
scheduling {
automatic_restart = false
on_host_maintenance = "TERMINATE"
preemptible = false
}
}