From 452291cb9a25850289d3f65aa1c314edc8b7e78b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9mentin=20Boittiaux?= <boittiauxclementin@gmail.com>
Date: Wed, 26 Jun 2024 11:38:55 +0200
Subject: [PATCH] Add depth map support for camera sensor

---
 .../Editor/CameraSensorComponentEditor.cs     |  1 +
 com.unity.ml-agents/Runtime/Resources.meta    |  8 +++
 .../Runtime/Resources/DepthShader.shader      | 64 +++++++++++++++++++
 .../Runtime/Resources/DepthShader.shader.meta |  9 +++
 .../Runtime/Sensors/CameraSensor.cs           | 37 ++++++++---
 .../Runtime/Sensors/CameraSensorComponent.cs  | 48 +++++++++++++-
 .../Runtime/Sensors/CompressionSpec.cs        |  7 +-
 .../Runtime/Sensors/ObservationWriter.cs      | 22 ++++---
 ml-agents-envs/mlagents_envs/rpc_utils.py     | 34 +++++++---
 ml-agents-envs/setup.py                       |  1 +
 ml-agents-envs/tests/test_rpc_utils.py        |  6 +-
 11 files changed, 202 insertions(+), 35 deletions(-)
 create mode 100644 com.unity.ml-agents/Runtime/Resources.meta
 create mode 100644 com.unity.ml-agents/Runtime/Resources/DepthShader.shader
 create mode 100644 com.unity.ml-agents/Runtime/Resources/DepthShader.shader.meta

diff --git a/com.unity.ml-agents/Editor/CameraSensorComponentEditor.cs b/com.unity.ml-agents/Editor/CameraSensorComponentEditor.cs
index 1df66ee3c9..15c6fd446f 100644
--- a/com.unity.ml-agents/Editor/CameraSensorComponentEditor.cs
+++ b/com.unity.ml-agents/Editor/CameraSensorComponentEditor.cs
@@ -24,6 +24,7 @@ public override void OnInspectorGUI()
                 EditorGUILayout.PropertyField(so.FindProperty("m_Width"), true);
                 EditorGUILayout.PropertyField(so.FindProperty("m_Height"), true);
                 EditorGUILayout.PropertyField(so.FindProperty("m_Grayscale"), true);
+                EditorGUILayout.PropertyField(so.FindProperty("m_RGBD"), true);
                 EditorGUILayout.PropertyField(so.FindProperty("m_ObservationStacks"), true);
                 EditorGUILayout.PropertyField(so.FindProperty("m_ObservationType"), true);
             }
diff --git a/com.unity.ml-agents/Runtime/Resources.meta b/com.unity.ml-agents/Runtime/Resources.meta
new file mode 100644
index 0000000000..ef98dc1a06
--- /dev/null
+++ b/com.unity.ml-agents/Runtime/Resources.meta
@@ -0,0 +1,8 @@
+fileFormatVersion: 2
+guid: ca0aab04b837598dc99f548d13baf0c6
+folderAsset: yes
+DefaultImporter:
+  externalObjects: {}
+  userData: 
+  assetBundleName: 
+  assetBundleVariant: 
diff --git a/com.unity.ml-agents/Runtime/Resources/DepthShader.shader b/com.unity.ml-agents/Runtime/Resources/DepthShader.shader
new file mode 100644
index 0000000000..a6b429f916
--- /dev/null
+++ b/com.unity.ml-agents/Runtime/Resources/DepthShader.shader
@@ -0,0 +1,64 @@
+// Full-screen pass applied by CameraSensorComponent.OnRenderImage for RGBD sensors:
+// writes gamma-encoded color to RGB and linear eye-space depth to alpha.
+Shader "Custom/DepthShader"
+{
+    Properties
+    {
+        _MainTex ("Texture", 2D) = "white" {}
+    }
+    SubShader
+    {
+        Pass
+        {
+            CGPROGRAM
+            #pragma vertex vert
+            #pragma fragment frag
+            #include "UnityCG.cginc"
+
+            struct appdata
+            {
+                float4 vertex : POSITION;
+                float2 uv : TEXCOORD0;
+            };
+
+            struct v2f
+            {
+                float2 uv : TEXCOORD0;
+                float4 vertex : SV_POSITION;
+                float4 screenPos : TEXCOORD1;
+            };
+
+            v2f vert (appdata v)
+            {
+                v2f o;
+                o.vertex = UnityObjectToClipPos(v.vertex);
+                o.screenPos = ComputeScreenPos(o.vertex);
+                o.uv = v.uv;
+                return o;
+            }
+
+            sampler2D _MainTex;
+            UNITY_DECLARE_DEPTH_TEXTURE(_CameraDepthTexture);
+
+            float4 frag (v2f i) : SV_Target
+            {
+                // Source color from the image being blitted.
+                float4 color = tex2D(_MainTex, i.uv);
+
+                // ComputeScreenPos yields homogeneous coordinates, so divide by w before sampling;
+                // SAMPLE_DEPTH_TEXTURE reads the depth channel rather than truncating a float4.
+                float2 depthUV = i.screenPos.xy / i.screenPos.w;
+                float depth = LinearEyeDepth(SAMPLE_DEPTH_TEXTURE(_CameraDepthTexture, depthUV));
+
+                // _ProjectionParams.z is the camera far plane; depth beyond it is reported as 0.
+                if (depth > _ProjectionParams.z) depth = 0;
+
+                // Convert color from linear to sRGB and store depth in the alpha channel.
+                color.rgb = LinearToGammaSpace(saturate(color.rgb));
+                color.a = depth;
+                return color;
+            }
+            ENDCG
+        }
+    }
+}
diff --git a/com.unity.ml-agents/Runtime/Resources/DepthShader.shader.meta b/com.unity.ml-agents/Runtime/Resources/DepthShader.shader.meta
new file mode 100644
index 0000000000..1b967c47fd
--- /dev/null
+++ b/com.unity.ml-agents/Runtime/Resources/DepthShader.shader.meta
@@ -0,0 +1,9 @@
+fileFormatVersion: 2
+guid: 8c36e1786391089c18743562d1d2de06
+ShaderImporter:
+  externalObjects: {}
+  defaultTextures: []
+  nonModifiableTextures: []
+  userData: 
+  assetBundleName: 
+  assetBundleVariant: 
diff --git a/com.unity.ml-agents/Runtime/Sensors/CameraSensor.cs b/com.unity.ml-agents/Runtime/Sensors/CameraSensor.cs
index 12dc651387..be05f8ed54 100644
--- a/com.unity.ml-agents/Runtime/Sensors/CameraSensor.cs
+++ b/com.unity.ml-agents/Runtime/Sensors/CameraSensor.cs
@@ -13,11 +13,19 @@ public class CameraSensor : ISensor, IBuiltInSensor, IDisposable
         int m_Width;
         int m_Height;
         bool m_Grayscale;
+        bool m_RGBD;
         string m_Name;
         private ObservationSpec m_ObservationSpec;
         SensorCompressionType m_CompressionType;
         Texture2D m_Texture;
 
+        /// <summary>
+        /// Indicates whether ObservationToTexture is currently calling Camera.Render().
+        /// CameraSensorComponent.OnRenderImage checks this flag to avoid applying the
+        /// depth shader to renders that were not triggered by the camera sensor.
+        /// </summary>
+        public bool m_InCameraSensorRender { get; private set; }
+
         /// <summary>
         /// The Camera used for rendering the sensor observations.
         /// </summary>
@@ -47,17 +55,19 @@ public SensorCompressionType CompressionType
         /// <param name="compression">The compression to apply to the generated image.</param>
         /// <param name="observationType">The type of observation.</param>
         public CameraSensor(
-            Camera camera, int width, int height, bool grayscale, string name, SensorCompressionType compression, ObservationType observationType = ObservationType.Default)
+            Camera camera, int width, int height, bool grayscale, bool rgbd, string name, SensorCompressionType compression, ObservationType observationType = ObservationType.Default)
         {
             m_Camera = camera;
             m_Width = width;
             m_Height = height;
             m_Grayscale = grayscale;
+            m_RGBD = rgbd;
             m_Name = name;
-            var channels = grayscale ? 1 : 3;
+            var channels = rgbd ? 4 : grayscale ? 1 : 3;  // RGBD has priority over Grayscale
             m_ObservationSpec = ObservationSpec.Visual(channels, height, width, observationType);
             m_CompressionType = compression;
-            m_Texture = new Texture2D(width, height, TextureFormat.RGB24, false);
+            m_Texture = new Texture2D(width, height, rgbd ? TextureFormat.RGBAFloat : TextureFormat.RGB24, false);
+            m_InCameraSensorRender = false;
         }
 
         /// <summary>
@@ -90,8 +100,11 @@ public byte[] GetCompressedObservation()
             using (TimerStack.Instance.Scoped("CameraSensor.GetCompressedObservation"))
             {
                 // TODO support more types here, e.g. JPG
-                var compressed = m_Texture.EncodeToPNG();
-                return compressed;
+                if (m_CompressionType == SensorCompressionType.OPENEXR)
+                {
+                    return m_Texture.EncodeToEXR();
+                }
+                return m_Texture.EncodeToPNG();
             }
         }
 
@@ -104,7 +117,7 @@ public int Write(ObservationWriter writer)
         {
             using (TimerStack.Instance.Scoped("CameraSensor.WriteToTensor"))
             {
-                var numWritten = writer.WriteTexture(m_Texture, m_Grayscale);
+                var numWritten = writer.WriteTexture(m_Texture, m_Grayscale, m_RGBD);
                 return numWritten;
             }
         }
@@ -131,7 +144,7 @@ public CompressionSpec GetCompressionSpec()
         /// <param name="texture2D">Texture2D to render to.</param>
         /// <param name="width">Width of resulting 2D texture.</param>
         /// <param name="height">Height of resulting 2D texture.</param>
-        public static void ObservationToTexture(Camera obsCamera, Texture2D texture2D, int width, int height)
+        public void ObservationToTexture(Camera obsCamera, Texture2D texture2D, int width, int height)
         {
             if (SystemInfo.graphicsDeviceType == GraphicsDeviceType.Null)
             {
@@ -140,9 +153,9 @@ public static void ObservationToTexture(Camera obsCamera, Texture2D texture2D, i
 
             var oldRec = obsCamera.rect;
             obsCamera.rect = new Rect(0f, 0f, 1f, 1f);
-            var depth = 24;
-            var format = RenderTextureFormat.Default;
-            var readWrite = RenderTextureReadWrite.Default;
+            var depth = m_RGBD ? 32 : 24;
+            var format = m_RGBD ? RenderTextureFormat.ARGBFloat : RenderTextureFormat.Default;
+            var readWrite = m_RGBD ? RenderTextureReadWrite.Linear : RenderTextureReadWrite.Default;
 
             var tempRt =
                 RenderTexture.GetTemporary(width, height, depth, format, readWrite);
@@ -154,8 +167,12 @@ public static void ObservationToTexture(Camera obsCamera, Texture2D texture2D, i
             RenderTexture.active = tempRt;
             obsCamera.targetTexture = tempRt;
 
+            m_InCameraSensorRender = true;
+
             obsCamera.Render();
 
+            m_InCameraSensorRender = false;
+
             texture2D.ReadPixels(new Rect(0, 0, texture2D.width, texture2D.height), 0, 0);
 
             obsCamera.targetTexture = prevCameraRt;
diff --git a/com.unity.ml-agents/Runtime/Sensors/CameraSensorComponent.cs b/com.unity.ml-agents/Runtime/Sensors/CameraSensorComponent.cs
index f6b53f087e..d70fa4d1e8 100644
--- a/com.unity.ml-agents/Runtime/Sensors/CameraSensorComponent.cs
+++ b/com.unity.ml-agents/Runtime/Sensors/CameraSensorComponent.cs
@@ -67,13 +67,26 @@ public int Height
         bool m_Grayscale;
 
         /// <summary>
-        /// Whether to generate grayscale images or color.
+        /// Whether to generate grayscale images or color. Disable RGBD to use it.
         /// Note that changing this after the sensor is created has no effect.
         /// </summary>
         public bool Grayscale
         {
             get { return m_Grayscale; }
-            set { m_Grayscale = value; }
+            set { m_Grayscale = value; UpdateSensor(); }
+        }
+
+        [HideInInspector, SerializeField, FormerlySerializedAs("rgbd")]
+        bool m_RGBD;
+
+        /// <summary>
+        /// Whether to generate color+depth images. RGBD has priority over Grayscale.
+        /// Note that changing this after the sensor is created has no effect.
+        /// </summary>
+        public bool RGBD
+        {
+            get { return m_RGBD; }
+            set { m_RGBD = value; UpdateSensor(); }
         }
 
         [HideInInspector, SerializeField]
@@ -130,9 +143,15 @@ public int ObservationStacks
             set { m_ObservationStacks = value; }
         }
 
+        /// <summary>
+        /// The material used to render the depth image.
+        /// </summary>
+        private Material m_DepthMaterial;
+
         void Start()
         {
             UpdateSensor();
+            m_DepthMaterial = new Material(Shader.Find("Custom/DepthShader"));
         }
 
         /// <summary>
@@ -142,7 +161,7 @@ void Start()
         public override ISensor[] CreateSensors()
         {
             Dispose();
-            m_Sensor = new CameraSensor(m_Camera, m_Width, m_Height, Grayscale, m_SensorName, m_Compression, m_ObservationType);
+            m_Sensor = new CameraSensor(m_Camera, m_Width, m_Height, Grayscale, RGBD, m_SensorName, m_Compression, m_ObservationType);
 
             if (ObservationStacks != 1)
             {
@@ -158,6 +177,14 @@ internal void UpdateSensor()
         {
             if (m_Sensor != null)
             {
+                // Update depth settings before camera settings because m_Compression might change
+                if (m_RGBD)
+                {
+                    m_Grayscale = false;
+                    m_Compression = SensorCompressionType.OPENEXR;
+                }
+
+                // Update camera settings
                 m_Sensor.Camera = m_Camera;
                 m_Sensor.CompressionType = m_Compression;
                 m_Sensor.Camera.enabled = m_RuntimeCameraEnable;
@@ -175,5 +202,20 @@ public void Dispose()
                 m_Sensor = null;
             }
         }
+
+        /// <summary>
+        /// Image-effect hook: blits through the depth material only for RGBD sensor renders.
+        /// </summary>
+        void OnRenderImage(RenderTexture src, RenderTexture dest)
+        {
+            var isSensorDepthRender = m_RGBD && m_Sensor != null && m_Sensor.m_InCameraSensorRender;
+            if (!isSensorDepthRender)
+            {
+                // Any other render gets a plain copy so the image is left untouched.
+                Graphics.Blit(src, dest);
+                return;
+            }
+            Graphics.Blit(src, dest, m_DepthMaterial);
+        }
     }
 }
diff --git a/com.unity.ml-agents/Runtime/Sensors/CompressionSpec.cs b/com.unity.ml-agents/Runtime/Sensors/CompressionSpec.cs
index 76e283a14a..74c0c2b362 100644
--- a/com.unity.ml-agents/Runtime/Sensors/CompressionSpec.cs
+++ b/com.unity.ml-agents/Runtime/Sensors/CompressionSpec.cs
@@ -14,7 +14,12 @@ public enum SensorCompressionType
         /// <summary>
         /// PNG format. Data will be stored in binary format.
         /// </summary>
-        PNG
+        PNG,
+
+        /// <summary>
+        /// OpenEXR format.
+        /// </summary>
+        OPENEXR
     }
 
     /// <summary>
diff --git a/com.unity.ml-agents/Runtime/Sensors/ObservationWriter.cs b/com.unity.ml-agents/Runtime/Sensors/ObservationWriter.cs
index 24ed9fa5ba..d3074f1a4f 100644
--- a/com.unity.ml-agents/Runtime/Sensors/ObservationWriter.cs
+++ b/com.unity.ml-agents/Runtime/Sensors/ObservationWriter.cs
@@ -296,7 +296,8 @@ public static class ObservationWriterExtension
         public static int WriteTexture(
             this ObservationWriter obsWriter,
             Texture2D texture,
-            bool grayScale)
+            bool grayScale,
+            bool rgbd = false)
         {
             if (texture.format == TextureFormat.RGB24)
             {
@@ -306,7 +307,7 @@ public static int WriteTexture(
             var width = texture.width;
             var height = texture.height;
 
-            var texturePixels = texture.GetPixels32();
+            var texturePixels = texture.GetPixels();
 
             // During training, we convert from Texture to PNG before sending to the trainer, which has the
             // effect of flipping the image. We need another flip here at inference time to match this.
@@ -316,22 +317,25 @@ public static int WriteTexture(
                 {
                     var currentPixel = texturePixels[(height - h - 1) * width + w];
 
-                    if (grayScale)
+                    if (grayScale && !rgbd)
                     {
                         obsWriter[0, h, w] =
-                            (currentPixel.r + currentPixel.g + currentPixel.b) / 3f / 255.0f;
+                            (currentPixel.r + currentPixel.g + currentPixel.b) / 3f;
                     }
                     else
                     {
-                        // For Color32, the r, g and b values are between 0 and 255.
-                        obsWriter[0, h, w] = currentPixel.r / 255.0f;
-                        obsWriter[1, h, w] = currentPixel.g / 255.0f;
-                        obsWriter[2, h, w] = currentPixel.b / 255.0f;
+                        obsWriter[0, h, w] = currentPixel.r;
+                        obsWriter[1, h, w] = currentPixel.g;
+                        obsWriter[2, h, w] = currentPixel.b;
+                        if (rgbd)
+                        {
+                            obsWriter[3, h, w] = currentPixel.a;
+                        }
                     }
                 }
             }
 
-            return height * width * (grayScale ? 1 : 3);
+            return height * width * (rgbd ? 4 : grayScale ? 1 : 3);
         }
 
         internal static int WriteTextureRGB24(
diff --git a/ml-agents-envs/mlagents_envs/rpc_utils.py b/ml-agents-envs/mlagents_envs/rpc_utils.py
index f8df94896a..cca2911880 100644
--- a/ml-agents-envs/mlagents_envs/rpc_utils.py
+++ b/ml-agents-envs/mlagents_envs/rpc_utils.py
@@ -13,10 +13,13 @@
 from mlagents_envs.communicator_objects.observation_pb2 import (
     ObservationProto,
     NONE as COMPRESSION_TYPE_NONE,
+    PNG as COMPRESSION_TYPE_PNG,
 )
 from mlagents_envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto
 import numpy as np
+import OpenEXR as exr
 import io
+import Imath
 from typing import cast, List, Tuple, Collection, Optional, Iterable
 from PIL import Image
 
@@ -104,7 +107,7 @@ def original_tell(self) -> int:
 
 @timed
 def process_pixels(
-    image_bytes: bytes, expected_channels: int, mappings: Optional[List[int]] = None
+    image_bytes: bytes, compression_type: int, expected_channels: int, mappings: Optional[List[int]] = None
 ) -> np.ndarray:
     """
     Converts byte array observation image into numpy array, re-sizes it,
@@ -118,13 +121,26 @@ def process_pixels(
     image_arrays = []
     # Read the images back from the bytes (without knowing the sizes).
     while True:
-        with hierarchical_timer("image_decompress"):
-            image = Image.open(image_fp)
-            # Normally Image loads lazily, load() forces it to do loading in the timer scope.
-            image.load()
-        image_arrays.append(
-            np.moveaxis(np.array(image, dtype=np.float32) / 255.0, -1, 0)
-        )
+        if compression_type == COMPRESSION_TYPE_PNG:
+            with hierarchical_timer("image_decompress"):
+                image = Image.open(image_fp)
+                # Normally Image loads lazily, load() forces it to do loading in the timer scope.
+                image.load()
+            image_arrays.append(
+                np.moveaxis(np.array(image, dtype=np.float32) / 255.0, -1, 0)
+            )
+        else:
+            with hierarchical_timer("image_decompress"):
+                file = exr.InputFile(image_fp)
+                header = file.header()
+                dw = header["dataWindow"]
+                channels = "RGBA" if "A" in header["channels"] else "RGB"
+                image_size = (dw.max.y - dw.min.y + 1, dw.max.x - dw.min.x + 1)
+                image_data = file.channels(channels, Imath.PixelType(Imath.PixelType.FLOAT))
+                image = np.stack([
+                    np.frombuffer(channel, dtype=np.float32) for channel in image_data
+                ]).reshape(-1, *image_size)
+            image_arrays.append(image)
 
         # Look for the next header, starting from the current stream location
         try:
@@ -234,7 +250,7 @@ def _observation_to_np_array(
         return img
     else:
         img = process_pixels(
-            obs.compressed_data, expected_channels, list(obs.compressed_channel_mapping)
+            obs.compressed_data, obs.compression_type, expected_channels, list(obs.compressed_channel_mapping)
         )
         # Compare decompressed image size to observation shape and make sure they match
         if list(obs.shape) != list(img.shape):
diff --git a/ml-agents-envs/setup.py b/ml-agents-envs/setup.py
index fcbee96151..7b2937dc25 100644
--- a/ml-agents-envs/setup.py
+++ b/ml-agents-envs/setup.py
@@ -62,6 +62,7 @@ def run(self):
         "pettingzoo==1.15.0",
         "numpy>=1.23.5,<1.24.0",
         "filelock>=3.4.0",
+        "OpenEXR==3.2.4",
     ],
     python_requires=">=3.10.1,<=3.10.12",
     # TODO: Remove this once mypy stops having spurious setuptools issues.
diff --git a/ml-agents-envs/tests/test_rpc_utils.py b/ml-agents-envs/tests/test_rpc_utils.py
index 8440d6586a..ad193a4278 100644
--- a/ml-agents-envs/tests/test_rpc_utils.py
+++ b/ml-agents-envs/tests/test_rpc_utils.py
@@ -236,7 +236,7 @@ def proto_from_steps_and_action(
 def test_process_pixels():
     in_array = np.random.rand(3, 128, 64)
     byte_arr = generate_compressed_data(in_array)
-    out_array = process_pixels(byte_arr, 3)
+    out_array = process_pixels(byte_arr, PNG, 3)
     assert out_array.shape == (3, 128, 64)
     assert np.sum(in_array - out_array) / np.prod(in_array.shape) < 0.01
     assert np.allclose(in_array, out_array, atol=0.01)
@@ -248,7 +248,7 @@ def test_process_pixels_multi_png():
     num_channels = 7
     in_array = np.random.rand(num_channels, height, width)
     byte_arr = generate_compressed_data(in_array)
-    out_array = process_pixels(byte_arr, num_channels)
+    out_array = process_pixels(byte_arr, PNG, num_channels)
     assert out_array.shape == (num_channels, height, width)
     assert np.sum(in_array - out_array) / np.prod(in_array.shape) < 0.01
     assert np.allclose(in_array, out_array, atol=0.01)
@@ -257,7 +257,7 @@ def test_process_pixels_multi_png():
 def test_process_pixels_gray():
     in_array = np.random.rand(3, 128, 64)
     byte_arr = generate_compressed_data(in_array)
-    out_array = process_pixels(byte_arr, 1)
+    out_array = process_pixels(byte_arr, PNG, 1)
     assert out_array.shape == (1, 128, 64)
     assert np.mean(in_array.mean(axis=0, keepdims=True) - out_array) < 0.01
     assert np.allclose(in_array.mean(axis=0, keepdims=True), out_array, atol=0.01)