Skip to content

Commit 9b0d64d

Browse files
committed
Define a new EDPM role for installing nvidia driver on nodes
In order to do nvidia vGPU testing, we need to deploy a specific package on EDPM nodes and do some other actions. This is a multi-stage role requiring a reboot between the two stages, hence the two defined phases.
1 parent a1221a4 commit 9b0d64d

File tree

18 files changed

+543
-0
lines changed

18 files changed

+543
-0
lines changed

playbooks/nvidia-mdev-phase1.yml

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
---
2+
# Copyright 2024 Red Hat, Inc.
3+
# All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License"); you may
6+
# not use this file except in compliance with the License. You may obtain
7+
# a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13+
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14+
# License for the specific language governing permissions and limitations
15+
# under the License.
16+
17+
18+
# Build a dynamic inventory of the provisioned EDPM nodes from the
# OpenStackBaremetalSet resources, then wait until every node accepts
# connections.
- name: Gather the list of EDPM computes
  hosts: "{{ cifmw_target_hook_host | default('localhost') }}"
  gather_facts: false
  tasks:
    - name: Fetch OSP BMO nodesets
      environment:
        KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
        PATH: "{{ cifmw_path }}"
      ansible.builtin.command:
        cmd: >-
          oc get OpenStackBaremetalSet
          -n "{{ namespace|default('openstack') }}"
          -o yaml
      register: _osp_bmo_nodsets_oc_out
      # Read-only query: it must never be reported as a change.
      changed_when: false

    # Flatten every nodeset's baremetalHosts mapping into a list of
    # {name, ip, user, group} dictionaries. The group is the first
    # dash-separated token of the host name with an "s" appended.
    - name: Craft the BM hosts list
      ansible.builtin.set_fact:
        _bmo_provisioned_hosts: >-
          {% set hosts = [] -%}
          {% set nodesets = (_osp_bmo_nodsets_oc_out.stdout | from_yaml)['items'] | default([]) -%}
          {% for spec in nodesets | map(attribute='spec') -%}
          {% for host_key, host_val in spec.baremetalHosts.items() -%}
          {% set _ = hosts.append(
            {
              'name': host_key,
              'ip': host_val['ctlPlaneIP'] | ansible.utils.ipaddr('address'),
              'user': spec.cloudUserName,
              'group': host_key | split('-') | first + 's'
            }) -%}
          {% endfor -%}
          {% endfor -%}
          {{ hosts }}

    - name: Add OSP BMO nodesets to Ansible
      ansible.builtin.add_host:
        name: "{{ item.name }}"
        groups: "{{ item.group }}"
        ansible_ssh_user: "{{ item.user }}"
        ansible_host: "{{ item.ip }}"
        # Facts are not gathered in this play, so ansible_user_dir is only
        # defined when injected from outside; fall back to the controller's
        # HOME in that case.
        ansible_ssh_private_key_file: >-
          {{ ansible_user_dir | default(lookup('env', 'HOME')) }}/.ssh/id_cifw
        ansible_ssh_extra_args: '-o StrictHostKeyChecking=no'
      loop: "{{ _bmo_provisioned_hosts }}"

    # Poll each freshly added host every 2 seconds, for up to 600 seconds.
    - name: Wait for the instance to boot
      delegate_to: "{{ item.name }}"
      ansible.builtin.wait_for_connection:
        sleep: 2
        timeout: 600
      loop: "{{ _bmo_provisioned_hosts }}"
65+
66+
# Apply the first stage of edpm_nvidia_mdev_prepare on the "edpms" group.
- name: Run the Nvidia phase 1 role
  hosts: edpms
  tasks:
    # As a reminder, at the end of phase1, the compute will reboot
    - name: Run phase1
      ansible.builtin.import_role:
        tasks_from: phase1
        name: edpm_nvidia_mdev_prepare

playbooks/nvidia-mdev-phase2.yml

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
---
2+
# Copyright 2024 Red Hat, Inc.
3+
# All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License"); you may
6+
# not use this file except in compliance with the License. You may obtain
7+
# a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13+
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14+
# License for the specific language governing permissions and limitations
15+
# under the License.
16+
17+
18+
# Build a dynamic inventory of the provisioned EDPM nodes from the
# OpenStackBaremetalSet resources, then wait until every node accepts
# connections.
- name: Gather the list of EDPM computes
  hosts: "{{ cifmw_target_hook_host | default('localhost') }}"
  gather_facts: false
  tasks:
    - name: Fetch OSP BMO nodesets
      environment:
        KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
        PATH: "{{ cifmw_path }}"
      ansible.builtin.command:
        cmd: >-
          oc get OpenStackBaremetalSet
          -n "{{ namespace|default('openstack') }}"
          -o yaml
      register: _osp_bmo_nodsets_oc_out
      # Read-only query: it must never be reported as a change.
      changed_when: false

    # Flatten every nodeset's baremetalHosts mapping into a list of
    # {name, ip, user, group} dictionaries. The group is the first
    # dash-separated token of the host name with an "s" appended.
    - name: Craft the BM hosts list
      ansible.builtin.set_fact:
        _bmo_provisioned_hosts: >-
          {% set hosts = [] -%}
          {% set nodesets = (_osp_bmo_nodsets_oc_out.stdout | from_yaml)['items'] | default([]) -%}
          {% for spec in nodesets | map(attribute='spec') -%}
          {% for host_key, host_val in spec.baremetalHosts.items() -%}
          {% set _ = hosts.append(
            {
              'name': host_key,
              'ip': host_val['ctlPlaneIP'] | ansible.utils.ipaddr('address'),
              'user': spec.cloudUserName,
              'group': host_key | split('-') | first + 's'
            }) -%}
          {% endfor -%}
          {% endfor -%}
          {{ hosts }}

    - name: Add OSP BMO nodesets to Ansible
      ansible.builtin.add_host:
        name: "{{ item.name }}"
        groups: "{{ item.group }}"
        ansible_ssh_user: "{{ item.user }}"
        ansible_host: "{{ item.ip }}"
        # Facts are not gathered in this play, so ansible_user_dir is only
        # defined when injected from outside; fall back to the controller's
        # HOME in that case.
        ansible_ssh_private_key_file: >-
          {{ ansible_user_dir | default(lookup('env', 'HOME')) }}/.ssh/id_cifw
        ansible_ssh_extra_args: '-o StrictHostKeyChecking=no'
      loop: "{{ _bmo_provisioned_hosts }}"

    # Poll each freshly added host every 2 seconds, for up to 600 seconds.
    - name: Wait for the instance to boot
      delegate_to: "{{ item.name }}"
      ansible.builtin.wait_for_connection:
        sleep: 2
        timeout: 600
      loop: "{{ _bmo_provisioned_hosts }}"
65+
66+
# Apply the second stage of edpm_nvidia_mdev_prepare on the "edpms" group.
- name: Run the Nvidia phase 2 role
  hosts: edpms
  tasks:
    - name: Run phase 2
      ansible.builtin.import_role:
        tasks_from: phase2
        name: edpm_nvidia_mdev_prepare
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# edpm_nvidia_mdev_prepare
2+
3+
This Ansible role prepares an EDPM node by configuring it for nvidia mediated
4+
devices usage by installing, from an external location, an RPM package containing
5+
the nvidia driver.
6+
7+
## Privilege escalation
8+
`become` will be set to True during the phase1 task in order to blacklist
9+
the nouveau driver, regenerate the initramfs and then install the nvidia RPM
10+
package. During phase2, we'll also create system files.
11+
12+
## Parameters
13+
14+
* `cifmw_edpm_nvidia_mdev_prepare_disable_nouveau`: (boolean) Whether to disable the `nouveau` kernel driver. Default: `true`.
15+
16+
* `cifmw_edpm_nvidia_mdev_prepare_driver_url`: (string) The location of the proprietary nvidia driver RPM package so that we can install it. Can be any URI supported by DNF.
17+
18+
* `cifmw_edpm_nvidia_mdev_prepare_package_name`: (string) The name under which the package is installed, which can't be inferred from the package file name. Default: `NVIDIA-vGPU-rhel`.
19+
20+
* `cifmw_edpm_nvidia_mdev_prepare_sriov_devices`: (list) List of PCI addresses corresponding to the nvidia physical SR-IOV GPUs that require virtual functions creation. Leave it defaulted to ALL if you want to enable SR-IOV VFs for all your nvidia GPUs.
21+
22+
# Should we start the service for automatically creating the VFs ?
23+
* `cifmw_edpm_nvidia_mdev_prepare_sriov_manage_start`: (boolean) Whether you want the virtual functions to be created upon reboot. Default: `true`.
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
---
2+
# Copyright Red Hat, Inc.
3+
# All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License"); you may
6+
# not use this file except in compliance with the License. You may obtain
7+
# a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13+
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14+
# License for the specific language governing permissions and limitations
15+
# under the License.
16+
17+
# Whether the nouveau kernel driver has to be disabled on the node.
cifmw_edpm_nvidia_mdev_prepare_disable_nouveau: true

# Name of the nvidia package.
cifmw_edpm_nvidia_mdev_prepare_package_name: 'NVIDIA-vGPU-rhel'

# SR-IOV GPU devices for which virtual functions (VFs) should be created.
cifmw_edpm_nvidia_mdev_prepare_sriov_devices: [ALL]

# Whether to start the service that automatically creates the VFs.
cifmw_edpm_nvidia_mdev_prepare_sriov_manage_start: true

roles/edpm_nvidia_mdev_prepare/files/.gitkeep

Whitespace-only changes.
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# Oneshot unit that runs the nvidia sriov-manage helper, passing the unit's
# instance name (%i) as the argument of its -e option.
[Unit]
Description = Enable Nvidia GPU virtual functions
# Only start once the nvidia vGPU services have been started.
After = nvidia-vgpu-mgr.service nvidia-vgpud.service

[Service]
Type = oneshot
User = root
Group = root
# NOTE(review): presumably gives the nvidia services time to settle before
# the virtual functions are created - confirm the 30s value is still needed.
ExecStartPre = /usr/bin/sleep 30
ExecStart = /usr/lib/nvidia/sriov-manage -e %i
# Keep the unit reported as active once the oneshot command has exited.
RemainAfterExit = True
# Give a reasonable amount of time for the command to start up/shut down
TimeoutSec = 120
# Run inside the system slice. The accounting options below give us the
# ability to see resource usage through the `systemd-cgtop` command.
Slice = system.slice
# Set Accounting
CPUAccounting = True
# IOAccounting= supersedes the deprecated BlockIOAccounting= setting.
IOAccounting = True
MemoryAccounting = True
TasksAccounting = True

[Install]
WantedBy = multi-user.target
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
---
2+
# Copyright Red Hat, Inc.
3+
# All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License"); you may
6+
# not use this file except in compliance with the License. You may obtain
7+
# a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13+
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14+
# License for the specific language governing permissions and limitations
15+
# under the License.
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
---
2+
# Copyright Red Hat, Inc.
3+
# All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License"); you may
6+
# not use this file except in compliance with the License. You may obtain
7+
# a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13+
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14+
# License for the specific language governing permissions and limitations
15+
# under the License.
16+
17+
18+
# Galaxy metadata describing the role.
galaxy_info:
  namespace: cifmw
  author: CI Framework
  company: Red Hat
  description: CI Framework Role -- edpm_nvidia_mdev_prepare
  license: Apache-2.0
  min_ansible_version: '2.14'
  galaxy_tags: [cifmw]

# Role dependencies go in the list below, one per line. Remember to drop the
# '[]' placeholder when adding the first entry.
dependencies: []
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
---
2+
# Copyright Red Hat, Inc.
3+
# All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License"); you may
6+
# not use this file except in compliance with the License. You may obtain
7+
# a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13+
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14+
# License for the specific language governing permissions and limitations
15+
# under the License.
16+
17+
18+
# Molecule converge play: run both phases of the role and verify the files
# each phase is expected to leave behind.
- name: Converge
  hosts: all
  vars:
    # tmux is used as a stand-in package; its presence is asserted after
    # phase 1 instead of a real nvidia driver.
    cifmw_edpm_nvidia_mdev_prepare_driver_url: 'tmux'
    cifmw_edpm_nvidia_mdev_prepare_sriov_manage_start: false
  tasks:
    - name: Run phase1
      ansible.builtin.import_role:
        name: edpm_nvidia_mdev_prepare
        tasks_from: phase1

    - name: Check expected files in phase 1
      ansible.builtin.stat:
        path: "{{ item }}"
      loop:
        - "/etc/modprobe.d/blacklist-nouveau.conf"
      register: phase1_files

    - name: Check if expected phase 1 files were created
      ansible.builtin.assert:
        that: item.stat.exists
      loop: "{{ phase1_files.results }}"

    - name: Check that tmux was installed
      ansible.builtin.command:
        cmd: tmux lscm
      register: tmux_id
      # Pure verification command: never report it as a change.
      changed_when: false
      failed_when: tmux_id.rc != 0

    - name: Run phase 2
      ansible.builtin.import_role:
        name: edpm_nvidia_mdev_prepare
        tasks_from: phase2

    - name: Check expected files in phase 2
      ansible.builtin.stat:
        path: "{{ item }}"
      loop:
        # NOTE(review): this path was mangled by e-mail obfuscation in the
        # rendered page ("[email protected]"); confirm the template unit
        # file name against the role's phase2 tasks.
        - "/etc/systemd/system/nvidia-sriov-manage@.service"
      register: phase2_files

    - name: Check if expected phase 2 files were created
      ansible.builtin.assert:
        that: item.stat.exists
      loop: "{{ phase2_files.results }}"
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
---
2+
# Scenario-level overrides for the defaults set in .config/molecule/.
# "config_podman.yml" is used by default, while CI uses "config_local.yml".
log: true

provisioner:
  name: ansible
  log: true
  env:
    ANSIBLE_STDOUT_CALLBACK: yaml

0 commit comments

Comments
 (0)