Add subscript operator support in cuda (#6830)

mkeshavaNV · slangbot · expipiplus1 · web-flow · commit b0e150511a6a · 2025-04-30T10:37:02.000Z
* cuda: Add support for subscript operator This CL adds support for the subscript operator for Read Only textures in cuda. Also adds a test for this. Fixes #6781 * format code * fix review comments * format code --------- Co-authored-by: slangbot <186143334+slangbot@users.noreply.github.com> Co-authored-by: Ellie Hermaszewska <ellieh@nvidia.com>
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
@@ -3366,3 +3366,70 @@ struct TensorView
         *reinterpret_cast<T*>(data + offset) = val;
     }
 };
+
+// Implementations for texture fetch/load functions using tex PTX intrinsics
+// These are used for read-only texture access with integer coordinates
+// See #6781 for details.
+
+// 1D is not supported via PTX. Keeping this placeholder in case it ever gets
+// supported.
+//
+// Integer-coordinate fetch from a 1D texture using the PTX `tex.1d` instruction.
+// The instruction produces four f32 components; only the first one is returned,
+// the other three are written to a throwaway local.
+// NOTE(review): the returned value is bound with the "f" constraint, so T is
+// presumably expected to be float -- confirm against the call sites.
+template<typename T>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex1Dfetch_int(CUtexObject texObj, int x)
+{
+    float unused;
+    T texel;
+    asm("tex.1d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5}];"
+        : "=f"(texel), "=f"(unused), "=f"(unused), "=f"(unused)
+        : "l"(texObj), "r"(x));
+    return texel;
+}
+
+// Integer-coordinate fetch from a 2D texture using the PTX `tex.2d` instruction.
+// The instruction produces four f32 components; only the first one is returned,
+// the other three are written to a throwaway local.
+// NOTE(review): the returned value is bound with the "f" constraint, so T is
+// presumably expected to be float -- confirm against the call sites.
+template<typename T>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex2Dfetch_int(CUtexObject texObj, int x, int y)
+{
+    float unused;
+    T texel;
+    asm("tex.2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
+        : "=f"(texel), "=f"(unused), "=f"(unused), "=f"(unused)
+        : "l"(texObj), "r"(x), "r"(y));
+    return texel;
+}
+
+// Integer-coordinate fetch from a 3D texture using the PTX `tex.3d` instruction.
+// The instruction produces four f32 components; only the first one is returned,
+// the other three are written to a throwaway local.
+// NOTE(review): the returned value is bound with the "f" constraint, so T is
+// presumably expected to be float -- confirm against the call sites.
+template<typename T>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex3Dfetch_int(CUtexObject texObj, int x, int y, int z)
+{
+    float unused;
+    T texel;
+    // Note: z is passed twice; the second copy is only filler for the fourth
+    // coordinate slot that ptx requires. From the docs:
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tex
+    // Operand c is a scalar or singleton tuple for 1d textures; is a two-element vector for 2d
+    // textures; and is a four-element vector for 3d textures.
+    asm("tex.3d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
+        : "=f"(texel), "=f"(unused), "=f"(unused), "=f"(unused)
+        : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z));
+    return texel;
+}
+
+// Integer-coordinate fetch from a 1D array texture using the PTX `tex.a1d`
+// instruction. Returns the first of the four f32 components produced; the other
+// three are written to a throwaway local.
+//
+// For array geometries PTX takes the array index as the FIRST element of the
+// coordinate vector, followed by the texel coordinate(s):
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tex
+// The operands are therefore passed as {layer, x}; passing {x, layer} would select
+// layer `x` and read texel `layer`.
+// NOTE(review): the returned value is bound with the "f" constraint, so T is
+// presumably expected to be float -- confirm against the call sites.
+template<typename T>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex1DArrayfetch_int(CUtexObject texObj, int x, int layer)
+{
+    T result;
+    float dummy;
+    asm("tex.a1d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
+        : "=f"(result), "=f"(dummy), "=f"(dummy), "=f"(dummy)
+        : "l"(texObj), "r"(layer), "r"(x));
+    return result;
+}
+
+// Integer-coordinate fetch from a 2D array texture using the PTX `tex.a2d`
+// instruction. Returns the first of the four f32 components produced; the other
+// three are written to a throwaway local.
+//
+// For array geometries PTX takes the array index as the FIRST element of the
+// four-element coordinate vector, followed by the two texel coordinates; the
+// fourth element is ignored:
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tex
+// The operands are therefore passed as {layer, x, y, y}, where the repeated y is
+// only filler for the ignored fourth slot. Passing {x, y, layer, layer} would
+// select layer `x` and read texel (y, layer).
+// NOTE(review): the returned value is bound with the "f" constraint, so T is
+// presumably expected to be float -- confirm against the call sites.
+template<typename T>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL T
+tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer)
+{
+    T result;
+    float dummy;
+    asm("tex.a2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
+        : "=f"(result), "=f"(dummy), "=f"(dummy), "=f"(dummy)
+        : "l"(texObj), "r"(layer), "r"(x), "r"(y), "r"(y));
+    return result;
+}
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
@@ -3610,14 +3610,42 @@ extension _Texture<T,Shape,isArray,0,sampleCount,0,isShadow,isCombined,format>
 //@public:
     [__readNone]
     [ForceInline]
-    [require(cpp_glsl_hlsl_metal_spirv_wgsl, texture_sm_4_1_samplerless)]
+    [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, texture_sm_4_1_samplerless)]
     T Load(vector<int, Shape.dimensions+isArray+1> location)
     {
         __target_switch
         {
         case cpp:
         case hlsl:
             __intrinsic_asm ".Load";
+		case cuda:
+            if (isArray != 0)
+                {
+     				static_assert(Shape.flavor == $(SLANG_TEXTURE_2D) || Shape.flavor == $(SLANG_TEXTURE_3D),
+                              "Integer coordinates are supported for texture reads only for 2D and 3D textures and 2D array textures.");
+
+                    if (Shape.flavor == $(SLANG_TEXTURE_2D))
+                    {
+						__intrinsic_asm "tex2DArrayfetch_int<$T0>($0, ($1).x, ($1).y, ($1).z)";
+				    }
+                    else
+					{
+                        __intrinsic_asm "<invalid intrinsic>";
+                    }
+                }
+                else
+                {
+                    switch(Shape.flavor)
+                    {
+                    case $(SLANG_TEXTURE_2D):
+                        __intrinsic_asm "tex2Dfetch_int<$T0>($0, ($1).x, ($1).y)";
+                    case $(SLANG_TEXTURE_3D):
+                        __intrinsic_asm "tex3Dfetch_int<$T0>($0, ($1).x, ($1).y, ($1).z)";
+					case $(SLANG_TEXTURE_CUBE):
+                    default:
+                        __intrinsic_asm "<invalid intrinsic>";
+                    }
+                }
         case metal:
             switch (Shape.flavor)
             {
@@ -3824,7 +3852,7 @@ extension _Texture<T,Shape,isArray,0,sampleCount,0,isShadow,isCombined,format>
     {
         [__readNone]
         [ForceInline]
-        [require(cpp_glsl_hlsl_metal_spirv_wgsl, texture_sm_4_1_samplerless)]
+        [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, texture_sm_4_1_samplerless)]
         get
         {
             __target_switch
@@ -3833,6 +3861,7 @@ extension _Texture<T,Shape,isArray,0,sampleCount,0,isShadow,isCombined,format>
                 case hlsl:
                     __intrinsic_asm ".operator[]";
                 case metal:
+                case cuda:
                     return Load(__makeVector(location, 0));
                 case glsl:
                     if (isCombined == 0)
diff --git a/tests/compute/texture-subscript-cuda.slang b/tests/compute/texture-subscript-cuda.slang
@@ -0,0 +1,61 @@
+// Test for verifying subscript operator support in cuda.
+
+//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute -shaderobj
+//TEST_INPUT: Texture1D(size=4, content = one):name cudaT1D
+Texture1D<float> cudaT1D;
+//TEST_INPUT: Texture2D(size=8, content = one):name cudaT2D
+Texture2D<float> cudaT2D;
+//TEST_INPUT: Texture3D(size=8, content = one):name cudaT3D
+Texture3D<float> cudaT3D;
+//TEST_INPUT: TextureCube(size=16, content = one):name cudaTCube
+TextureCube<float> cudaTCube;
+//TEST_INPUT: Texture2D(size=16, content = one, arrayLength=3):name cudaT2DArray
+Texture2DArray<float> cudaT2DArray;
+//TEST_INPUT: TextureCube(size=16, content = one, arrayLength=1):name cudaTCubeArray
+TextureCubeArray<float> cudaTCubeArray;
+
+//TEST_INPUT: ubuffer(data=[0 0 0 0 0 0 0], stride=4):out,name cudaOutputBuffer
+RWStructuredBuffer<float> cudaOutputBuffer;
+
+// Each thread reads one texture through the subscript operator and stores the
+// value at its own index in cudaOutputBuffer. Thread indices without a case
+// below (and case 1, which stores nothing) leave the buffer's initial 0 in place.
+[numthreads(7, 1, 1)]
+[shader("compute")]
+void computeMain(int3 dispatchThreadID : SV_DispatchThreadID)
+{
+	int idx = dispatchThreadID.x;
+
+	switch (idx)
+	{
+	case 1:
+		{
+			// The 1D read is written out but its result is never stored:
+			// this is not supported in PTX.
+			float result = cudaT1D[0];
+			//cudaOutputBuffer[idx] = result;
+		}
+	break;
+
+	case 2:
+		{
+			// 2D texture, int2 coordinate.
+			int2 var = int2(1, 2);
+			float result = cudaT2D[var];
+			cudaOutputBuffer[idx] = result;
+		}
+	break;
+
+	case 3:
+		{
+			// 3D texture, int3 coordinate.
+			int3 var = int3(1, 1, 1);
+			float result = cudaT3D[var];
+			cudaOutputBuffer[idx] = result;
+		}
+	break;
+
+	case 4:
+		{
+			// 2D array texture: xy is the texel, z is the array layer.
+			// NOTE(review): every input texture is declared with `content = one`,
+			// so this read presumably cannot detect a swapped layer/coordinate
+			// order -- consider non-uniform content to cover that.
+			int3 var = int3(0, 0, 1);
+			float result = cudaT2DArray[var];
+			cudaOutputBuffer[idx] = result;
+		}
+	break;
+	}
+}
diff --git a/tests/compute/texture-subscript-cuda.slang.expected.txt b/tests/compute/texture-subscript-cuda.slang.expected.txt
@@ -0,0 +1,7 @@
+0
+0
+3F800000
+3F800000
+3F800000
+0
+0
diff --git a/tests/language-feature/capability/capability-invalid-fragment-in-compute.slang b/tests/language-feature/capability/capability-invalid-fragment-in-compute.slang
@@ -1,7 +1,7 @@
-//TEST:SIMPLE(filecheck=CHECK): -target hlsl -emit-spirv-directly -entry computeMain -stage compute -allow-glsl -DPRE
-//TEST:SIMPLE(filecheck=CHECK): -target hlsl -emit-spirv-directly -entry computeMain -stage compute -allow-glsl -DPOST
-//TEST:SIMPLE(filecheck=CHECK_IGNORE_CAPS): -target hlsl -emit-spirv-directly -entry computeMain -stage compute -allow-glsl -ignore-capabilities -DPRE
-//TEST:SIMPLE(filecheck=CHECK_IGNORE_CAPS): -target hlsl -emit-spirv-directly -entry computeMain -stage compute -allow-glsl -ignore-capabilities -DPOST
+//TEST:SIMPLE(filecheck=CHECK): -target hlsl -emit-spirv-directly -entry computeMain -stage compute -allow-glsl
+//TEST:SIMPLE(filecheck=CHECK): -target hlsl -emit-spirv-directly -entry computeMain -stage compute -allow-glsl
+//TEST:SIMPLE(filecheck=CHECK_IGNORE_CAPS): -target hlsl -emit-spirv-directly -entry computeMain -stage compute -allow-glsl -ignore-capabilities
+//TEST:SIMPLE(filecheck=CHECK_IGNORE_CAPS): -target hlsl -emit-spirv-directly -entry computeMain -stage compute -allow-glsl -ignore-capabilities
 
 // CHECK_IGNORE_CAPS-NOT: error 36107
 // CHECK: error 36107
@@ -11,11 +11,5 @@ Texture2D<int> rw;
 [numthreads(1,1,1)]
 void computeMain()
 {
-#ifdef PRE
-    rw.Load(0);
-#endif
-    clip(0.0f);
-#ifdef POST
-    rw.Load(0);
-#endif
+    clip(0.0f); // clip is not supported in a compute shader, so this is expected to report error 36107.
 }