Merge pull request intel#798 from eero-t/media-wa

mythi · web-flow · commit 6bbc40ee969b · 2021-12-16T10:23:24.000+02:00
Provide workaround for the media issue and document it
diff --git a/cmd/gpu_plugin/README.md b/cmd/gpu_plugin/README.md
@@ -16,6 +16,9 @@ Table of Contents
         * [Run the plugin as administrator](#run-the-plugin-as-administrator)
     * [Verify plugin registration](#verify-plugin-registration)
     * [Testing the plugin](#testing-the-plugin)
+* [Issues with media workloads on multi-GPU setups](#issues-with-media-workloads-on-multi-gpu-setups)
+    * [Workaround for QSV and VA-API](#workaround-for-qsv-and-va-api)
+
 
 ## Introduction
 
@@ -242,3 +245,64 @@ We can test the plugin is working by deploying an OpenCL image and running `clin
       ----     ------            ----       ----               -------
       Warning  FailedScheduling  <unknown>  default-scheduler  0/1 nodes are available: 1 Insufficient gpu.intel.com/i915.
     ```
+
+
+## Issues with media workloads on multi-GPU setups
+
+Unlike with 3D & compute, and OneVPL media API, QSV (MediaSDK) & VA-API
+media APIs do not offer device discovery functionality for applications.
+There is nothing (e.g. environment variable) with which the default
+device could be overridden either.
+
+As a result, most (all?) media applications using VA-API or QSV fail to
+locate the correct GPU device file unless it is the first ("renderD128")
+one, or the device file name is explicitly specified with an application option.
+
+Kubernetes device plugins expose only the requested number of device
+files, and their naming matches host device file names (for several
+reasons unrelated to media).  Therefore, on multi-GPU hosts, the only
+GPU device file mapped to the media container can be some other one
+than "renderD128", and media applications using VA-API or QSV need to
+be explicitly told which one to use.
+
+These options differ from application to application.  Relevant FFmpeg
+options are documented here:
+* VA-API: https://trac.ffmpeg.org/wiki/Hardware/VAAPI
+* QSV: https://github.com/Intel-Media-SDK/MediaSDK/wiki/FFmpeg-QSV-Multi-GPU-Selection-on-Linux
+
+
+### Workaround for QSV and VA-API
+
+The [render device](render-device.sh) shell script locates and outputs the
+correct device file name.  It can be added to the container and used
+to provide the device file name to the application.
+
+Use it either from another script invoking the application, or
+directly from the Pod YAML command line.  In the latter case, it can be
+used either to add the device file name to the end of the given command
+line, like this:
+
+```bash
+command: ["render-device.sh", "vainfo", "--display", "drm", "--device"]
+
+=> /usr/bin/vainfo --display drm --device /dev/dri/renderDXXX
+```
+
+Or inline, like this:
+
+```bash
+command: ["/bin/sh", "-c",
+          "vainfo --device $(render-device.sh 1) --display drm"
+         ]
+```
+
+If the device file name is needed for multiple commands, one can use a shell variable:
+
+```bash
+command: ["/bin/sh", "-c",
+          "dev=$(render-device.sh 1) && vainfo --device $dev && <more commands>"
+         ]
+```
+
+With argument N, the script outputs the name of the Nth suitable GPU device
+file, which can be used when more than one GPU resource was requested.
diff --git a/cmd/gpu_plugin/render-device.sh b/cmd/gpu_plugin/render-device.sh
@@ -0,0 +1,99 @@
+#!/bin/sh
+#
+# Copyright 2021 Intel Corporation.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+#
+# Some GPU workloads are unable to find the (Intel) GPU provisioned for
+# them by Kubernetes. This script checks and tells which device to use.
+#
+# For example (all?) media applications using VA-API or QSV media APIs [1],
+# fail when /dev/dri/renderD128 is not present, or happens to be of
+# a type not supported by the media driver.
+#
+# Happily (all?) media applications have an option to specify a suitable
+# render device name, which can be used with this script.
+#
+# [1] Compute, 3D, and OneVPL APIs do not suffer from this issue.
+#
+#
+# Running the script requires only a few tools, which should be present in
+# all distro base images.  The required tools, and the packages providing
+# them in Debian-based distros, are:
+# - dash: 'sh' (minimal bourne shell)
+# - coreutils: 'seq', 'cat', 'echo'
+# - sed: 'sed'
+#
+# But they are also provided by 'busybox' and 'toybox' tool sets.
+
+
+# Print the help text followed by the error message given as $1, then
+# exit with status 1.
+#
+# Everything is written to standard error, so that on failure the help
+# text does not end up being used as the "device name" when the script
+# is invoked through command substitution, for example:
+#   vainfo --device $(render-device.sh 1)
+usage ()
+{
+	name=${0##*/}
+	{
+		echo "Provides (Intel GPU) render device name application can use, either"
+		echo "on standard output, or added to given command line. If device index"
+		echo "N is given, provides name of Nth available (Intel GPU) render device."
+		echo
+		echo "Usage:"
+		echo "  $name <device index>"
+		echo "  $name [device index] <media program> [other options] <GPU selection option>"
+		echo
+		echo "Examples:"
+		echo "  \$ vainfo --display drm --device \$($name 1)"
+		echo "  \$ $name vainfo --display drm --device"
+		echo "  Running: vainfo --display drm --device /dev/dri/renderD140"
+		echo
+		echo "ERROR: $1!"
+	} >&2
+	exit 1
+}
+
+if [ $# -eq 0 ]; then
+	usage "no arguments given"
+fi
+
+# Determine required GPU index: a first argument consisting only of
+# digits is the index, anything else (including an empty argument) is
+# taken as the start of the media program command line.
+#
+# Shell pattern matching is used instead of 'echo | sed', because 'echo'
+# may swallow option-like arguments (e.g. "-n"), and '\+' is not part
+# of POSIX basic regular expressions.
+case "$1" in
+	'' | *[!0-9]*)
+		required=1
+		;;
+	*)
+		required=$1
+		# Index with leading zeros removed.  More than 3 significant
+		# digits is always out of range, and checking that first keeps
+		# huge values away from 'test', which fails (instead of
+		# returning true/false) on numbers it cannot represent.
+		significant=${required#"${required%%[!0]*}"}
+		if [ "${#significant}" -gt 3 ] ||
+		   [ "$required" -lt 1 ] || [ "$required" -gt 127 ]; then
+			usage "GPU index $required not in range 1-127"
+		fi
+		shift
+		;;
+esac
+visible=0
+
+vendor=""
+intel="0x8086"
+# find host index "i" for Nth visible Intel GPU device
+for i in $(seq 128 255); do
+	# only device files writable by this process are counted
+	if [ -w "/dev/dri/renderD$i" ]; then
+		# PCI vendor ID of the device, as reported by sysfs
+		vendor=$(cat "/sys/class/drm/renderD$i/device/vendor")
+		if [ "$vendor" = "$intel" ]; then
+			visible=$((visible+1))
+			if [ "$visible" -eq "$required" ]; then
+				break
+			fi
+		fi
+	fi
+done
+
+# loop ran to its end without reaching the requested device count
+if [ "$visible" -ne "$required" ]; then
+	usage "$visible Intel GPU(s) found, not $required as requested"
+fi
+device="/dev/dri/renderD$i"
+
+# no command line given => just output the device file name
+if [ $# -eq 0 ]; then
+	echo "$device"
+	exit 0
+fi
+
+# at least program name and its GPU selection option are needed
+if [ $# -lt 2 ]; then
+	usage "media program and/or GPU selection option missing"
+fi
+
+# run given media workload with GPU device name appended to end
+echo "Running: $* $device"
+exec "$@" "$device"