diff --git a/nixos/doc/manual/release-notes/rl-2311.section.md b/nixos/doc/manual/release-notes/rl-2311.section.md
index 307aeee6020a..a59dccfbc42b 100644
--- a/nixos/doc/manual/release-notes/rl-2311.section.md
+++ b/nixos/doc/manual/release-notes/rl-2311.section.md
@@ -265,6 +265,8 @@ The module update takes care of the new config syntax and the data itself (user
 
 - The `cawbird` package is dropped from nixpkgs, as it got broken by the Twitter API closing down and has been abandoned upstream.
 
+- `hardware.nvidia` gained `datacenter` options for enabling the NVIDIA Data Center drivers and for configuring NVLink/NVSwitch topologies through `nv-fabricmanager`.
+
 - Certificate generation via the `security.acme` module now limits the number of concurrently running certificate renewal and generation jobs, to avoid spiking resource usage when processing many certificates at once. The limit defaults to *5* and can be adjusted via `maxConcurrentRenewals`. Setting it to *0* disables the limit altogether.
 
 - The new `boot.bcache.enable` option (enabled by default) allows completely removing `bcache` mount support.
 
diff --git a/nixos/modules/hardware/video/nvidia.nix b/nixos/modules/hardware/video/nvidia.nix
index 67c3afcf320a..0b1238dd888a 100644
--- a/nixos/modules/hardware/video/nvidia.nix
+++ b/nixos/modules/hardware/video/nvidia.nix
@@ -4,8 +4,10 @@
   pkgs,
   ...
 }: let
+  x11Enabled = config.services.xserver.enable
+    && (lib.elem "nvidia" config.services.xserver.videoDrivers);
   nvidia_x11 =
-    if (lib.elem "nvidia" config.services.xserver.videoDrivers)
+    if x11Enabled || cfg.datacenter.enable
     then cfg.package
     else null;
 
@@ -18,9 +20,64 @@
   primeEnabled = syncCfg.enable || reverseSyncCfg.enable || offloadCfg.enable;
   busIDType = lib.types.strMatching "([[:print:]]+[\:\@][0-9]{1,3}\:[0-9]{1,2}\:[0-9])?";
   ibtSupport = cfg.open || (nvidia_x11.ibtSupport or false);
+  settingsFormat = pkgs.formats.keyValue {};
 in {
   options = {
     hardware.nvidia = {
+      datacenter.enable = lib.mkEnableOption (lib.mdDoc ''
+        Data Center drivers for NVIDIA cards on an NVLink topology.
+      '');
+      datacenter.settings = lib.mkOption {
+        type = settingsFormat.type;
+        default = {
+          LOG_LEVEL=4;
+          LOG_FILE_NAME="/var/log/fabricmanager.log";
+          LOG_APPEND_TO_LOG=1;
+          LOG_FILE_MAX_SIZE=1024;
+          LOG_USE_SYSLOG=0;
+          DAEMONIZE=1;
+          BIND_INTERFACE_IP="127.0.0.1";
+          STARTING_TCP_PORT=16000;
+          FABRIC_MODE=0;
+          FABRIC_MODE_RESTART=0;
+          STATE_FILE_NAME="/var/tmp/fabricmanager.state";
+          FM_CMD_BIND_INTERFACE="127.0.0.1";
+          FM_CMD_PORT_NUMBER=6666;
+          FM_STAY_RESIDENT_ON_FAILURES=0;
+          ACCESS_LINK_FAILURE_MODE=0;
+          TRUNK_LINK_FAILURE_MODE=0;
+          NVSWITCH_FAILURE_MODE=0;
+          ABORT_CUDA_JOBS_ON_FM_EXIT=1;
+          TOPOLOGY_FILE_PATH=nvidia_x11.fabricmanager + "/share/nvidia-fabricmanager/nvidia/nvswitch";
+        };
+        defaultText = lib.literalExpression ''
+          {
+            LOG_LEVEL=4;
+            LOG_FILE_NAME="/var/log/fabricmanager.log";
+            LOG_APPEND_TO_LOG=1;
+            LOG_FILE_MAX_SIZE=1024;
+            LOG_USE_SYSLOG=0;
+            DAEMONIZE=1;
+            BIND_INTERFACE_IP="127.0.0.1";
+            STARTING_TCP_PORT=16000;
+            FABRIC_MODE=0;
+            FABRIC_MODE_RESTART=0;
+            STATE_FILE_NAME="/var/tmp/fabricmanager.state";
+            FM_CMD_BIND_INTERFACE="127.0.0.1";
+            FM_CMD_PORT_NUMBER=6666;
+            FM_STAY_RESIDENT_ON_FAILURES=0;
+            ACCESS_LINK_FAILURE_MODE=0;
+            TRUNK_LINK_FAILURE_MODE=0;
+            NVSWITCH_FAILURE_MODE=0;
+            ABORT_CUDA_JOBS_ON_FM_EXIT=1;
+            TOPOLOGY_FILE_PATH=nvidia_x11.fabricmanager + "/share/nvidia-fabricmanager/nvidia/nvswitch";
+          }
+        '';
+        description = lib.mdDoc ''
+          Additional configuration options for fabricmanager.
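+
+          Note that this option is not merged with the defaults above:
+          defining any value here replaces the whole default set, so
+          restate whichever defaults you still need. A minimal sketch
+          (the raised log level is only an illustrative value):
+
+          ```
+          hardware.nvidia.datacenter.settings = {
+            LOG_LEVEL = 5;
+            # ...restate the remaining defaults you want to keep...
+          };
+          ```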
+        '';
+      };
+
       powerManagement.enable = lib.mkEnableOption (lib.mdDoc ''
         experimental power management through systemd. For more information, see
         the NVIDIA docs, on Chapter 21. Configuring Power Management Support.
       '');
 
@@ -167,9 +224,15 @@ in {
         It also drastically increases the time the driver needs to clock down after load.
       '');
 
-      package = lib.mkPackageOptionMD config.boot.kernelPackages.nvidiaPackages "nvidia_x11" {
-        default = "stable";
+      package = lib.mkOption {
+        default = config.boot.kernelPackages.nvidiaPackages."${if cfg.datacenter.enable then "dc" else "stable"}";
+        defaultText = lib.literalExpression ''
+          config.boot.kernelPackages.nvidiaPackages."''${if cfg.datacenter.enable then "dc" else "stable"}"
+        '';
         example = lib.mdDoc "config.boot.kernelPackages.nvidiaPackages.legacy_470";
+        description = lib.mdDoc ''
+          The NVIDIA driver package to use.
+        '';
       };
 
       open = lib.mkEnableOption (lib.mdDoc ''
@@ -188,8 +251,46 @@ in {
       then pCfg.intelBusId
       else pCfg.amdgpuBusId;
   in
-    lib.mkIf (nvidia_x11 != null) {
-      assertions = [
+    lib.mkIf (nvidia_x11 != null) (lib.mkMerge [
+      # Common
+      ({
+        assertions = [
+          {
+            assertion = !(x11Enabled && cfg.datacenter.enable);
+            message = "You cannot configure both X11 and Data Center drivers at the same time.";
+          }
+        ];
+        boot = {
+          blacklistedKernelModules = ["nouveau" "nvidiafb"];
+          kernelModules = [ "nvidia-uvm" ];
+        };
+        systemd.tmpfiles.rules =
+          lib.optional config.virtualisation.docker.enableNvidia
+          "L+ /run/nvidia-docker/bin - - - - ${nvidia_x11.bin}/origBin";
+        services.udev.extraRules =
+          ''
+            # Create /dev/nvidia-uvm when the nvidia-uvm module is loaded.
+            KERNEL=="nvidia", RUN+="${pkgs.runtimeShell} -c 'mknod -m 666 /dev/nvidiactl c $$(grep nvidia-frontend /proc/devices | cut -d \ -f 1) 255'"
+            KERNEL=="nvidia", RUN+="${pkgs.runtimeShell} -c 'for i in $$(cat /proc/driver/nvidia/gpus/*/information | grep Minor | cut -d \ -f 4); do mknod -m 666 /dev/nvidia$${i} c $$(grep nvidia-frontend /proc/devices | cut -d \ -f 1) $${i}; done'"
+            KERNEL=="nvidia_modeset", RUN+="${pkgs.runtimeShell} -c 'mknod -m 666 /dev/nvidia-modeset c $$(grep nvidia-frontend /proc/devices | cut -d \ -f 1) 254'"
+            KERNEL=="nvidia_uvm", RUN+="${pkgs.runtimeShell} -c 'mknod -m 666 /dev/nvidia-uvm c $$(grep nvidia-uvm /proc/devices | cut -d \ -f 1) 0'"
+            KERNEL=="nvidia_uvm", RUN+="${pkgs.runtimeShell} -c 'mknod -m 666 /dev/nvidia-uvm-tools c $$(grep nvidia-uvm /proc/devices | cut -d \ -f 1) 1'"
+          '';
+        hardware.opengl = {
+          extraPackages = [
+            nvidia_x11.out
+          ];
+          extraPackages32 = [
+            nvidia_x11.lib32
+          ];
+        };
+        environment.systemPackages = [
+          nvidia_x11.bin
+        ];
+      })
+      # X11
+      (lib.mkIf x11Enabled {
+        assertions = [
           {
             assertion = primeEnabled -> pCfg.intelBusId == "" || pCfg.amdgpuBusId == "";
             message = "You cannot configure both an Intel iGPU and an AMD APU. Pick the one corresponding to your processor.";
           }
 
@@ -248,227 +349,207 @@ in {
           {
             assertion = cfg.dynamicBoost.enable -> lib.versionAtLeast nvidia_x11.version "510.39.01";
             message = "NVIDIA's Dynamic Boost feature only exists on versions >= 510.39.01";
-          }
-        ];
+          }];
 
-      # If Optimus/PRIME is enabled, we:
-      # - Specify the configured NVIDIA GPU bus ID in the Device section for the
-      #   "nvidia" driver.
-      # - Add the AllowEmptyInitialConfiguration option to the Screen section for the
-      #   "nvidia" driver, in order to allow the X server to start without any outputs.
-      # - Add a separate Device section for the Intel GPU, using the "modesetting"
-      #   driver and with the configured BusID.
-      # - OR add a separate Device section for the AMD APU, using the "amdgpu"
-      #   driver and with the configures BusID.
-      # - Reference that Device section from the ServerLayout section as an inactive
-      #   device.
-      # - Configure the display manager to run specific `xrandr` commands which will
-      #   configure/enable displays connected to the Intel iGPU / AMD APU.
+        # If Optimus/PRIME is enabled, we:
+        # - Specify the configured NVIDIA GPU bus ID in the Device section for the
+        #   "nvidia" driver.
+        # - Add the AllowEmptyInitialConfiguration option to the Screen section for the
+        #   "nvidia" driver, in order to allow the X server to start without any outputs.
+        # - Add a separate Device section for the Intel GPU, using the "modesetting"
+        #   driver and with the configured BusID.
+        # - OR add a separate Device section for the AMD APU, using the "amdgpu"
+        #   driver and with the configured BusID.
+        # - Reference that Device section from the ServerLayout section as an inactive
+        #   device.
+        # - Configure the display manager to run specific `xrandr` commands which will
+        #   configure/enable displays connected to the Intel iGPU / AMD APU.
 
-      # reverse sync implies offloading
-      hardware.nvidia.prime.offload.enable = lib.mkDefault reverseSyncCfg.enable;
+        # reverse sync implies offloading
+        hardware.nvidia.prime.offload.enable = lib.mkDefault reverseSyncCfg.enable;
 
-      services.xserver.drivers =
-        lib.optional primeEnabled {
-          name = igpuDriver;
-          display = offloadCfg.enable;
-          modules = lib.optional (igpuDriver == "amdgpu") pkgs.xorg.xf86videoamdgpu;
-          deviceSection =
-            ''
-              BusID "${igpuBusId}"
-            ''
-            + lib.optionalString (syncCfg.enable && igpuDriver != "amdgpu") ''
-              Option "AccelMethod" "none"
-            '';
-        }
-        ++ lib.singleton {
-          name = "nvidia";
-          modules = [nvidia_x11.bin];
-          display = !offloadCfg.enable;
-          deviceSection =
-            lib.optionalString primeEnabled
-            ''
-              BusID "${pCfg.nvidiaBusId}"
-            ''
-            + lib.optionalString pCfg.allowExternalGpu ''
-              Option "AllowExternalGpus"
-            '';
-          screenSection =
-            ''
-              Option "RandRRotation" "on"
-            ''
-            + lib.optionalString syncCfg.enable ''
-              Option "AllowEmptyInitialConfiguration"
-            ''
-            + lib.optionalString cfg.forceFullCompositionPipeline ''
-              Option "metamodes" "nvidia-auto-select +0+0 {ForceFullCompositionPipeline=On}"
-              Option "AllowIndirectGLXProtocol" "off"
-              Option "TripleBuffer" "on"
-            '';
-        };
-
-      services.xserver.serverLayoutSection =
-        lib.optionalString syncCfg.enable ''
-          Inactive "Device-${igpuDriver}[0]"
-        ''
-        + lib.optionalString reverseSyncCfg.enable ''
-          Inactive "Device-nvidia[0]"
-        ''
-        + lib.optionalString offloadCfg.enable ''
-          Option "AllowNVIDIAGPUScreens"
-        '';
-
-      services.xserver.displayManager.setupCommands = let
-        gpuProviderName =
-          if igpuDriver == "amdgpu"
-          then
-            # find the name of the provider if amdgpu
-            "`${lib.getExe pkgs.xorg.xrandr} --listproviders | ${lib.getExe pkgs.gnugrep} -i AMD | ${lib.getExe pkgs.gnused} -n 's/^.*name://p'`"
-          else igpuDriver;
-        providerCmdParams =
-          if syncCfg.enable
-          then "\"${gpuProviderName}\" NVIDIA-0"
-          else "NVIDIA-G0 \"${gpuProviderName}\"";
-      in
-        lib.optionalString (syncCfg.enable || reverseSyncCfg.enable) ''
-          # Added by nvidia configuration module for Optimus/PRIME.
-          ${lib.getExe pkgs.xorg.xrandr} --setprovideroutputsource ${providerCmdParams}
-          ${lib.getExe pkgs.xorg.xrandr} --auto
-        '';
-
-      environment.etc = {
-        "nvidia/nvidia-application-profiles-rc" = lib.mkIf nvidia_x11.useProfiles {source = "${nvidia_x11.bin}/share/nvidia/nvidia-application-profiles-rc";};
-
-        # 'nvidia_x11' installs it's files to /run/opengl-driver/...
-        "egl/egl_external_platform.d".source = "/run/opengl-driver/share/egl/egl_external_platform.d/";
-      };
-
-      hardware.opengl = {
-        extraPackages = [
-          nvidia_x11.out
-          pkgs.nvidia-vaapi-driver
-        ];
-        extraPackages32 = [
-          nvidia_x11.lib32
-          pkgs.pkgsi686Linux.nvidia-vaapi-driver
-        ];
-      };
-      environment.systemPackages =
-        [nvidia_x11.bin]
-        ++ lib.optional cfg.nvidiaSettings nvidia_x11.settings
-        ++ lib.optional cfg.nvidiaPersistenced nvidia_x11.persistenced
-        ++ lib.optional offloadCfg.enableOffloadCmd
-        (pkgs.writeShellScriptBin "nvidia-offload" ''
-          export __NV_PRIME_RENDER_OFFLOAD=1
-          export __NV_PRIME_RENDER_OFFLOAD_PROVIDER=NVIDIA-G0
-          export __GLX_VENDOR_LIBRARY_NAME=nvidia
-          export __VK_LAYER_NV_optimus=NVIDIA_only
-          exec "$@"
-        '');
-
-      systemd.packages = lib.optional cfg.powerManagement.enable nvidia_x11.out;
-
-      systemd.services = let
-        nvidiaService = state: {
-          description = "NVIDIA system ${state} actions";
-          path = [pkgs.kbd];
-          serviceConfig = {
-            Type = "oneshot";
-            ExecStart = "${nvidia_x11.out}/bin/nvidia-sleep.sh '${state}'";
+        services.xserver.drivers =
+          lib.optional primeEnabled {
+            name = igpuDriver;
+            display = offloadCfg.enable;
+            modules = lib.optional (igpuDriver == "amdgpu") pkgs.xorg.xf86videoamdgpu;
+            deviceSection =
+              ''
+                BusID "${igpuBusId}"
+              ''
+              + lib.optionalString (syncCfg.enable && igpuDriver != "amdgpu") ''
+                Option "AccelMethod" "none"
+              '';
+          }
+          ++ lib.singleton {
+            name = "nvidia";
+            modules = [nvidia_x11.bin];
+            display = !offloadCfg.enable;
+            deviceSection =
+              lib.optionalString primeEnabled
+              ''
+                BusID "${pCfg.nvidiaBusId}"
+              ''
+              + lib.optionalString pCfg.allowExternalGpu ''
+                Option "AllowExternalGpus"
+              '';
+            screenSection =
+              ''
+                Option "RandRRotation" "on"
+              ''
+              + lib.optionalString syncCfg.enable ''
+                Option "AllowEmptyInitialConfiguration"
+              ''
+              + lib.optionalString cfg.forceFullCompositionPipeline ''
+                Option "metamodes" "nvidia-auto-select +0+0 {ForceFullCompositionPipeline=On}"
+                Option "AllowIndirectGLXProtocol" "off"
+                Option "TripleBuffer" "on"
+              '';
           };
-          before = ["systemd-${state}.service"];
-          requiredBy = ["systemd-${state}.service"];
+
+        services.xserver.serverLayoutSection =
+          lib.optionalString syncCfg.enable ''
+            Inactive "Device-${igpuDriver}[0]"
+          ''
+          + lib.optionalString reverseSyncCfg.enable ''
+            Inactive "Device-nvidia[0]"
+          ''
+          + lib.optionalString offloadCfg.enable ''
+            Option "AllowNVIDIAGPUScreens"
+          '';
+
+        services.xserver.displayManager.setupCommands = let
+          gpuProviderName =
+            if igpuDriver == "amdgpu"
+            then
+              # find the name of the provider if amdgpu
+              "`${lib.getExe pkgs.xorg.xrandr} --listproviders | ${lib.getExe pkgs.gnugrep} -i AMD | ${lib.getExe pkgs.gnused} -n 's/^.*name://p'`"
+            else igpuDriver;
+          providerCmdParams =
+            if syncCfg.enable
+            then "\"${gpuProviderName}\" NVIDIA-0"
+            else "NVIDIA-G0 \"${gpuProviderName}\"";
+        in
+          lib.optionalString (syncCfg.enable || reverseSyncCfg.enable) ''
+            # Added by nvidia configuration module for Optimus/PRIME.
+            ${lib.getExe pkgs.xorg.xrandr} --setprovideroutputsource ${providerCmdParams}
+            ${lib.getExe pkgs.xorg.xrandr} --auto
+          '';
+
+        environment.etc = {
+          "nvidia/nvidia-application-profiles-rc" = lib.mkIf nvidia_x11.useProfiles {source = "${nvidia_x11.bin}/share/nvidia/nvidia-application-profiles-rc";};
+
+          # 'nvidia_x11' installs its files to /run/opengl-driver/...
+          "egl/egl_external_platform.d".source = "/run/opengl-driver/share/egl/egl_external_platform.d/";
+        };
+
+        hardware.opengl = {
+          extraPackages = [
+            pkgs.nvidia-vaapi-driver
+          ];
+          extraPackages32 = [
+            pkgs.pkgsi686Linux.nvidia-vaapi-driver
+          ];
+        };
+        environment.systemPackages =
+          lib.optional cfg.nvidiaSettings nvidia_x11.settings
+          ++ lib.optional cfg.nvidiaPersistenced nvidia_x11.persistenced
+          ++ lib.optional offloadCfg.enableOffloadCmd
+          (pkgs.writeShellScriptBin "nvidia-offload" ''
+            export __NV_PRIME_RENDER_OFFLOAD=1
+            export __NV_PRIME_RENDER_OFFLOAD_PROVIDER=NVIDIA-G0
+            export __GLX_VENDOR_LIBRARY_NAME=nvidia
+            export __VK_LAYER_NV_optimus=NVIDIA_only
+            exec "$@"
+          '');
+
+        systemd.packages = lib.optional cfg.powerManagement.enable nvidia_x11.out;
+
+        systemd.services = let
+          nvidiaService = state: {
+            description = "NVIDIA system ${state} actions";
+            path = [pkgs.kbd];
+            serviceConfig = {
+              Type = "oneshot";
+              ExecStart = "${nvidia_x11.out}/bin/nvidia-sleep.sh '${state}'";
+            };
+            before = ["systemd-${state}.service"];
+            requiredBy = ["systemd-${state}.service"];
+          };
+        in
+          lib.mkMerge [
+            (lib.mkIf cfg.powerManagement.enable {
+              nvidia-suspend = nvidiaService "suspend";
+              nvidia-hibernate = nvidiaService "hibernate";
+              nvidia-resume =
+                (nvidiaService "resume")
+                // {
+                  before = [];
+                  after = ["systemd-suspend.service" "systemd-hibernate.service"];
+                  requiredBy = ["systemd-suspend.service" "systemd-hibernate.service"];
+                };
+            })
+            (lib.mkIf cfg.nvidiaPersistenced {
+              "nvidia-persistenced" = {
+                description = "NVIDIA Persistence Daemon";
+                wantedBy = ["multi-user.target"];
+                serviceConfig = {
+                  Type = "forking";
+                  Restart = "always";
+                  PIDFile = "/var/run/nvidia-persistenced/nvidia-persistenced.pid";
+                  ExecStart = "${lib.getExe nvidia_x11.persistenced} --verbose";
+                  ExecStopPost = "${pkgs.coreutils}/bin/rm -rf /var/run/nvidia-persistenced";
+                };
+              };
+            })
+            (lib.mkIf cfg.dynamicBoost.enable {
+              "nvidia-powerd" = {
+                description = "nvidia-powerd service";
+                path = [
+                  pkgs.util-linux # nvidia-powerd wants lscpu
+                ];
+                wantedBy = ["multi-user.target"];
+                serviceConfig = {
+                  Type = "dbus";
+                  BusName = "nvidia.powerd.server";
+                  ExecStart = "${nvidia_x11.bin}/bin/nvidia-powerd";
+                };
+              };
+            })
+          ];
+        services.acpid.enable = true;
 
-      services.acpid.enable = true;
+        services.dbus.packages = lib.optional cfg.dynamicBoost.enable nvidia_x11.bin;
 
-      services.dbus.packages = lib.optional cfg.dynamicBoost.enable nvidia_x11.bin;
+        hardware.firmware = lib.optional cfg.open nvidia_x11.firmware;
 
-      hardware.firmware = lib.optional cfg.open nvidia_x11.firmware;
+        systemd.tmpfiles.rules =
+          lib.optional (nvidia_x11.persistenced != null && config.virtualisation.docker.enableNvidia)
+          "L+ /run/nvidia-docker/extras/bin/nvidia-persistenced - - - - ${nvidia_x11.persistenced}/origBin/nvidia-persistenced";
 
-      systemd.tmpfiles.rules =
-        lib.optional config.virtualisation.docker.enableNvidia
-        "L+ /run/nvidia-docker/bin - - - - ${nvidia_x11.bin}/origBin"
-        ++ lib.optional (nvidia_x11.persistenced != null && config.virtualisation.docker.enableNvidia)
-        "L+ /run/nvidia-docker/extras/bin/nvidia-persistenced - - - - ${nvidia_x11.persistenced}/origBin/nvidia-persistenced";
+        boot = {
+          extraModulePackages =
+            if cfg.open
+            then [nvidia_x11.open]
+            else [nvidia_x11.bin];
+          # nvidia-uvm is required by CUDA applications.
+          kernelModules =
+            lib.optionals config.services.xserver.enable ["nvidia" "nvidia_modeset" "nvidia_drm"];
 
-      boot = {
-        blacklistedKernelModules = ["nouveau" "nvidiafb"];
+          # If requested, enable modesetting via kernel parameter.
+          kernelParams =
+            lib.optional (offloadCfg.enable || cfg.modesetting.enable) "nvidia-drm.modeset=1"
+            ++ lib.optional cfg.powerManagement.enable "nvidia.NVreg_PreserveVideoMemoryAllocations=1"
+            ++ lib.optional cfg.open "nvidia.NVreg_OpenRmEnableUnsupportedGpus=1"
+            ++ lib.optional (config.boot.kernelPackages.kernel.kernelAtLeast "6.2" && !ibtSupport) "ibt=off";
 
-        extraModulePackages =
-          if cfg.open
-          then [nvidia_x11.open]
-          else [nvidia_x11.bin];
-
-        # nvidia-uvm is required by CUDA applications.
-        kernelModules =
-          ["nvidia-uvm"]
-          ++ lib.optionals config.services.xserver.enable ["nvidia" "nvidia_modeset" "nvidia_drm"];
-
-        # If requested enable modesetting via kernel parameter.
-        kernelParams =
-          lib.optional (offloadCfg.enable || cfg.modesetting.enable) "nvidia-drm.modeset=1"
-          ++ lib.optional cfg.powerManagement.enable "nvidia.NVreg_PreserveVideoMemoryAllocations=1"
-          ++ lib.optional cfg.open "nvidia.NVreg_OpenRmEnableUnsupportedGpus=1"
-          ++ lib.optional (config.boot.kernelPackages.kernel.kernelAtLeast "6.2" && !ibtSupport) "ibt=off";
-
-        # enable finegrained power management
-        extraModprobeConfig = lib.optionalString cfg.powerManagement.finegrained ''
-          options nvidia "NVreg_DynamicPowerManagement=0x02"
-        '';
-      };
-
-      services.udev.extraRules =
-        ''
-          # Create /dev/nvidia-uvm when the nvidia-uvm module is loaded.
-          KERNEL=="nvidia", RUN+="${pkgs.runtimeShell} -c 'mknod -m 666 /dev/nvidiactl c $$(grep nvidia-frontend /proc/devices | cut -d \ -f 1) 255'"
-          KERNEL=="nvidia", RUN+="${pkgs.runtimeShell} -c 'for i in $$(cat /proc/driver/nvidia/gpus/*/information | grep Minor | cut -d \ -f 4); do mknod -m 666 /dev/nvidia$${i} c $$(grep nvidia-frontend /proc/devices | cut -d \ -f 1) $${i}; done'"
-          KERNEL=="nvidia_modeset", RUN+="${pkgs.runtimeShell} -c 'mknod -m 666 /dev/nvidia-modeset c $$(grep nvidia-frontend /proc/devices | cut -d \ -f 1) 254'"
-          KERNEL=="nvidia_uvm", RUN+="${pkgs.runtimeShell} -c 'mknod -m 666 /dev/nvidia-uvm c $$(grep nvidia-uvm /proc/devices | cut -d \ -f 1) 0'"
-          KERNEL=="nvidia_uvm", RUN+="${pkgs.runtimeShell} -c 'mknod -m 666 /dev/nvidia-uvm-tools c $$(grep nvidia-uvm /proc/devices | cut -d \ -f 1) 1'"
-        ''
-        + lib.optionalString cfg.powerManagement.finegrained (
+          # enable finegrained power management
+          extraModprobeConfig = lib.optionalString cfg.powerManagement.finegrained ''
+            options nvidia "NVreg_DynamicPowerManagement=0x02"
+          '';
+        };
+        services.udev.extraRules =
+          lib.optionalString cfg.powerManagement.finegrained (
           lib.optionalString (lib.versionOlder config.boot.kernelPackages.kernel.version "5.5") ''
             # Remove NVIDIA USB xHCI Host Controller devices, if present
             ACTION=="add", SUBSYSTEM=="pci", ATTR{vendor}=="0x10de", ATTR{class}=="0x0c0330", ATTR{remove}="1"
@@ -489,5 +570,30 @@ in {
             ACTION=="unbind", SUBSYSTEM=="pci", ATTR{vendor}=="0x10de", ATTR{class}=="0x030200", TEST=="power/control", ATTR{power/control}="on"
           ''
         );
-  };
+      })
+      # Data Center
+      (lib.mkIf (cfg.datacenter.enable) {
+        boot.extraModulePackages = [
+          nvidia_x11.bin
+        ];
+        systemd.services.nvidia-fabricmanager = {
+          enable = true;
+          description = "NVIDIA NVLink Management";
+          wantedBy = [ "multi-user.target" ];
+          unitConfig.After = [ "network-online.target" ];
+          unitConfig.Requires = [ "network-online.target" ];
+          serviceConfig = {
+            Type = "forking";
+            TimeoutStartSec = 240;
+            ExecStart = let
+              nv-fab-conf = settingsFormat.generate "fabricmanager.conf" cfg.datacenter.settings;
+            in
+              nvidia_x11.fabricmanager + "/bin/nv-fabricmanager -c " + nv-fab-conf;
+            LimitCORE="infinity";
+          };
+        };
+        environment.systemPackages =
+          lib.optional cfg.datacenter.enable nvidia_x11.fabricmanager;
+      })
+    ]);
 }
diff --git a/pkgs/os-specific/linux/nvidia-x11/default.nix b/pkgs/os-specific/linux/nvidia-x11/default.nix
index 9595de407cb4..24e0ed5adbb1 100644
--- a/pkgs/os-specific/linux/nvidia-x11/default.nix
+++ b/pkgs/os-specific/linux/nvidia-x11/default.nix
@@ -75,6 +75,18 @@ rec {
     url = "https://developer.nvidia.com/downloads/vulkan-beta-${lib.concatStrings (lib.splitString "." version)}-linux";
   };
 
+  # data center driver compatible with current default cudaPackages
+  dc = dc_520;
+  dc_520 = generic rec {
+    version = "520.61.05";
+    url = "https://us.download.nvidia.com/tesla/${version}/NVIDIA-Linux-x86_64-${version}.run";
+    sha256_64bit = "sha256-EPYWZwOur/6iN/otDMrNDpNXr1mzu8cIqQl8lXhQlzU=";
+    fabricmanagerSha256 = "sha256-o8Kbmkg7qczKQclaGvEyXNzEOWq9ZpQZn9syeffnEiE=";
+    useSettings = false;
+    usePersistenced = false;
+    useFabricmanager = true;
+  };
+
   # Update note:
   # If you add a legacy driver here, also update `top-level/linux-kernels.nix`,
   # adding to the `nvidia_x11_legacy*` entries.
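Taken together, the module and packaging changes above are driven from a host
configuration roughly as follows. This is a sketch, not part of the patch; it
assumes an NVSwitch-based machine and uses the license knob introduced in
generic.nix below:

    { config, ... }: {
      # Both knobs are required: the driver is unfree, and the new
      # generic.nix check additionally wants explicit license acceptance.
      nixpkgs.config.allowUnfree = true;
      nixpkgs.config.nvidia.acceptLicense = true;

      # Enables nv-fabricmanager and selects nvidiaPackages.dc by default;
      # the module asserts that the X11 driver is not enabled at the same time.
      hardware.nvidia.datacenter.enable = true;
    }
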
diff --git a/pkgs/os-specific/linux/nvidia-x11/fabricmanager.nix b/pkgs/os-specific/linux/nvidia-x11/fabricmanager.nix
new file mode 100644
index 000000000000..58cf8c0e3557
--- /dev/null
+++ b/pkgs/os-specific/linux/nvidia-x11/fabricmanager.nix
@@ -0,0 +1,47 @@
+nvidia_x11: sha256:
+
+{ stdenv, lib, fetchurl, patchelf }:
+
+let
+  sys = with lib; concatStringsSep "-" (reverseList (splitString "-" stdenv.system));
+  bsys = builtins.replaceStrings ["_"] ["-"] sys;
+  fmver = nvidia_x11.version;
+in
+
+stdenv.mkDerivation rec {
+  pname = "fabricmanager";
+  version = fmver;
+  src = fetchurl {
+    url = "https://developer.download.nvidia.com/compute/cuda/redist/fabricmanager/"
+        + "${sys}/${pname}-${sys}-${fmver}-archive.tar.xz";
+    inherit sha256;
+  };
+
+  phases = [ "unpackPhase" "installPhase" ];
+
+  installPhase = ''
+    find .
+    mkdir -p $out/{bin,share/nvidia-fabricmanager}
+    for bin in nv{-fabricmanager,switch-audit};do
+      ${patchelf}/bin/patchelf \
+        --set-interpreter ${stdenv.cc.libc}/lib/ld-${bsys}.so.2 \
+        --set-rpath ${lib.makeLibraryPath [ stdenv.cc.libc ]} \
+        bin/$bin
+    done
+    mv bin/nv{-fabricmanager,switch-audit} $out/bin/.
+    for d in etc systemd share/nvidia;do
+      mv $d $out/share/nvidia-fabricmanager/.
+    done
+    for d in include lib;do
+      mv $d $out/.
+    done
+  '';
+
+  meta = with lib; {
+    homepage = "https://www.nvidia.com/object/unix.html";
+    description = "Fabricmanager daemon for NVLink initialization and control";
+    license = licenses.unfreeRedistributable;
+    platforms = nvidia_x11.meta.platforms;
+    mainProgram = "nv-fabricmanager";
+    maintainers = with maintainers; [ edwtjo ];
+  };
+}
diff --git a/pkgs/os-specific/linux/nvidia-x11/generic.nix b/pkgs/os-specific/linux/nvidia-x11/generic.nix
index 792fda42ca9c..8ec292f27251 100644
--- a/pkgs/os-specific/linux/nvidia-x11/generic.nix
+++ b/pkgs/os-specific/linux/nvidia-x11/generic.nix
@@ -4,14 +4,19 @@
 , sha256_64bit
 , sha256_aarch64 ? null
 , openSha256 ? null
-, settingsSha256
+, settingsSha256 ? null
 , settingsVersion ? version
-, persistencedSha256
+, persistencedSha256 ? null
 , persistencedVersion ? version
+, fabricmanagerSha256 ? null
+, fabricmanagerVersion ? version
 , useGLVND ? true
 , useProfiles ? true
 , preferGtk2 ? false
 , settings32Bit ? false
+, useSettings ? true
+, usePersistenced ? true
+, useFabricmanager ? false
 , ibtSupport ? false
 
 , prePatch ? ""
@@ -33,14 +38,21 @@
   disable32Bit ? stdenv.hostPlatform.system == "aarch64-linux"
   # 32 bit libs only version of this package
 , lib32 ? null
-  # Whether to extract the GSP firmware
-, firmware ? openSha256 != null
+  # Whether to extract the GSP firmware; datacenter drivers need the
+  # firmware extracted
+, firmware ? openSha256 != null || useFabricmanager
+  # Whether the user accepts the NVIDIA Software License
+, config, acceptLicense ? config.nvidia.acceptLicense or false
 }:
 
 with lib;
 
 assert !libsOnly -> kernel != null;
 assert versionOlder version "391" -> sha256_32bit != null;
+assert useSettings -> settingsSha256 != null;
+assert usePersistenced -> persistencedSha256 != null;
+assert useFabricmanager -> fabricmanagerSha256 != null;
+assert useFabricmanager -> !(useSettings || usePersistenced);
 
 let
   nameSuffix = optionalString (!libsOnly) "-${kernel.version}";
@@ -54,12 +66,33 @@
       dbus # for nvidia-powerd
     ]);
 
+  # Note: previously the license was accepted implicitly, via allowUnfree alone.
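+  # Raised from the src attribute below whenever the proprietary blob is
+  # selected without nixpkgs.config.nvidia.acceptLicense = true.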
+  throwLicense = throw ''
+    Use of NVIDIA Software requires acceptance of the license:
+
+      - License For Customer Use of NVIDIA Software [1]
+
+    You can express acceptance by setting acceptLicense to true in your nixpkgs.config.
+    Example:
+
+      configuration.nix:
+        nixpkgs.config.allowUnfree = true;
+        nixpkgs.config.nvidia.acceptLicense = true;
+
+      config.nix:
+        allowUnfree = true;
+        nvidia.acceptLicense = true;
+
+    [1]: https://www.nvidia.com/content/DriverDownloads/licence.php?lang=us
+  '';
+
   self = stdenv.mkDerivation {
-    name = "nvidia-x11-${version}${nameSuffix}";
+    name = "nvidia-${if useFabricmanager then "dc" else "x11"}-${version}${nameSuffix}";
 
     builder = ./builder.sh;
 
     src =
+      if !acceptLicense && (openSha256 == null)
+      then throwLicense
+      else
       if stdenv.hostPlatform.system == "x86_64-linux"
       then
         fetchurl {
           urls =
             if args ? url
             then [ args.url ]
             else [
@@ -127,11 +160,17 @@
       nvidia_x11 = self;
       broken = brokenOpen;
     }) openSha256;
-    settings = (if settings32Bit then pkgsi686Linux.callPackage else callPackage) (import ./settings.nix self settingsSha256) {
-      withGtk2 = preferGtk2;
-      withGtk3 = !preferGtk2;
-    };
-    persistenced = mapNullable (hash: callPackage (import ./persistenced.nix self hash) { }) persistencedSha256;
+    settings = if useSettings then
+      (if settings32Bit then pkgsi686Linux.callPackage else callPackage) (import ./settings.nix self settingsSha256) {
+        withGtk2 = preferGtk2;
+        withGtk3 = !preferGtk2;
+      } else {};
+    persistenced = if usePersistenced then
+      mapNullable (hash: callPackage (import ./persistenced.nix self hash) { }) persistencedSha256
+    else {};
+    fabricmanager = if useFabricmanager then
+      mapNullable (hash: callPackage (import ./fabricmanager.nix self hash) { }) fabricmanagerSha256
+    else {};
     inherit persistencedVersion settingsVersion;
     compressFirmware = false;
     ibtSupport = ibtSupport || (lib.versionAtLeast version "530");
@@ -141,12 +180,12 @@
 
     meta = with lib; {
       homepage = "https://www.nvidia.com/object/unix.html";
-      description = "X.org driver and kernel module for NVIDIA graphics cards";
+      description = "${if useFabricmanager then "Data Center" else "X.org"} driver and kernel module for NVIDIA cards";
       license = licenses.unfreeRedistributable;
       platforms = [ "x86_64-linux" ]
         ++ optionals (sha256_32bit != null) [ "i686-linux" ]
         ++ optionals (sha256_aarch64 != null) [ "aarch64-linux" ];
-      maintainers = with maintainers; [ jonringer kiskae ];
+      maintainers = with maintainers; [ jonringer kiskae edwtjo ];
       priority = 4; # resolves collision with xorg-server's "lib/xorg/modules/extensions/libglx.so"
       inherit broken;
     };
diff --git a/pkgs/top-level/linux-kernels.nix b/pkgs/top-level/linux-kernels.nix
index 96c95c819f6a..07429cee853a 100644
--- a/pkgs/top-level/linux-kernels.nix
+++ b/pkgs/top-level/linux-kernels.nix
@@ -410,6 +410,8 @@ in {
     nvidia_x11_legacy470 = nvidiaPackages.legacy_470;
     nvidia_x11_production = nvidiaPackages.production;
     nvidia_x11_vulkan_beta = nvidiaPackages.vulkan_beta;
+    nvidia_dc = nvidiaPackages.dc;
+    nvidia_dc_520 = nvidiaPackages.dc_520;
 
     # this is not a replacement for nvidia_x11*
     # only the opensource kernel driver exposed for hydra to build
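
With `hardware.nvidia.datacenter.enable` set, the `package` option already
defaults to `nvidiaPackages.dc`, so pinning is only needed to deviate from
that alias. A sketch of pinning the specific release added above:

    { config, ... }: {
      hardware.nvidia.datacenter.enable = true;
      hardware.nvidia.package = config.boot.kernelPackages.nvidiaPackages.dc_520;
    }

The `nvidia_dc` and `nvidia_dc_520` aliases registered in linux-kernels.nix
expose the same packages as `linuxPackages.*` attributes, so they can also be
built directly (for example by Hydra), analogous to the existing
`nvidia_x11_*` entries.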