Skip to content

Commit b0e1505

Browse files
mkeshavaNVslangbotexpipiplus1
authored
Add subscript operator support in cuda (#6830)
* cuda: Add support for subscript operator This CL adds support for the subscript operator for Read Only textures in cuda. Also adds a test for this. Fixes #6781 * format code * fix review comments * format code --------- Co-authored-by: slangbot <186143334+slangbot@users.noreply.github.com> Co-authored-by: Ellie Hermaszewska <ellieh@nvidia.com>
1 parent 41ac7a0 commit b0e1505

File tree

5 files changed

+171
-13
lines changed

5 files changed

+171
-13
lines changed

prelude/slang-cuda-prelude.h

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3366,3 +3366,70 @@ struct TensorView
33663366
*reinterpret_cast<T*>(data + offset) = val;
33673367
}
33683368
};
3369+
3370+
// Implementations for texture fetch/load functions using tex PTX intrinsics
3371+
// These are used for read-only texture access with integer coordinates
3372+
// See #6781 for details.
3373+
3374+
// Placeholder for integer-coordinate loads from 1D textures.
// 1D is not supported via PTX; this definition is kept only in case that
// ever changes.
//
//   texObj - texture object handle, handed to the instruction as a 64-bit operand.
//   x      - integer texel coordinate.
//
// The instruction writes four 32-bit float components; only the first one is
// returned, the remaining three are written to a scratch variable.
// NOTE(review): the "=f" constraint binds the returned value as a 32-bit
// float, so this presumably only works for T = float -- confirm.
template<typename T>
SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex1Dfetch_int(CUtexObject texObj, int x)
{
    T texel;
    float discarded; // sink for the three components that are not returned
    asm("tex.1d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5}];"
        : "=f"(texel), "=f"(discarded), "=f"(discarded), "=f"(discarded)
        : "l"(texObj), "r"(x));
    return texel;
}
3386+
3387+
// Loads one texel from a 2D texture using integer coordinates through the
// PTX `tex.2d` instruction.
//
//   texObj - texture object handle, handed to the instruction as a 64-bit operand.
//   x, y   - integer texel coordinates.
//
// The instruction writes four 32-bit float components; only the first one is
// returned, the remaining three are written to a scratch variable.
// NOTE(review): the "=f" constraint binds the returned value as a 32-bit
// float, so this presumably only works for T = float -- confirm.
template<typename T>
SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex2Dfetch_int(CUtexObject texObj, int x, int y)
{
    T texel;
    float discarded; // sink for the three components that are not returned
    asm("tex.2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
        : "=f"(texel), "=f"(discarded), "=f"(discarded), "=f"(discarded)
        : "l"(texObj), "r"(x), "r"(y));
    return texel;
}
3397+
3398+
// Loads one texel from a 3D texture using integer coordinates through the
// PTX `tex.3d` instruction.
//
//   texObj  - texture object handle, handed to the instruction as a 64-bit operand.
//   x, y, z - integer texel coordinates.
//
// The instruction writes four 32-bit float components; only the first one is
// returned, the remaining three are written to a scratch variable.
// NOTE(review): the "=f" constraint binds the returned value as a 32-bit
// float, so this presumably only works for T = float -- confirm.
template<typename T>
SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex3Dfetch_int(CUtexObject texObj, int x, int y, int z)
{
    T texel;
    float discarded; // sink for the three components that are not returned

    // The coordinate operand of a 3D fetch must be a four-element vector, so
    // z is passed a second time purely as padding for the fourth slot.
    // From the docs:
    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tex
    // Operand c is a scalar or singleton tuple for 1d textures; is a two-element vector for 2d
    // textures; and is a four-element vector for 3d textures.
    asm("tex.3d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
        : "=f"(texel), "=f"(discarded), "=f"(discarded), "=f"(discarded)
        : "l"(texObj), "r"(x), "r"(y), "r"(z), "r"(z));
    return texel;
}
3413+
3414+
// Loads one texel from a 1D array texture using integer coordinates through
// the PTX `tex.a1d` instruction.
//
//   texObj - texture object handle, handed to the instruction as a 64-bit operand.
//   x      - integer texel coordinate within the selected layer.
//   layer  - index of the array layer to read from.
//
// The instruction writes four 32-bit float components; only the first one is
// returned, the remaining three are written to a scratch variable.
// NOTE(review): the "=f" constraint binds the returned value as a 32-bit
// float, so this presumably only works for T = float -- confirm.
template<typename T>
SLANG_FORCE_INLINE SLANG_CUDA_CALL T tex1DArrayfetch_int(CUtexObject texObj, int x, int layer)
{
    T result;
    float dummy;
    // For array textures PTX takes the array index as the FIRST element of
    // the coordinate vector, followed by the texel coordinate: {layer, x}.
    // Passing {x, layer} would select layer `x` and read texel `layer`.
    // See:
    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tex
    asm("tex.a1d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
        : "=f"(result), "=f"(dummy), "=f"(dummy), "=f"(dummy)
        : "l"(texObj), "r"(layer), "r"(x));
    return result;
}
3424+
3425+
// Loads one texel from a 2D array texture using integer coordinates through
// the PTX `tex.a2d` instruction.
//
//   texObj - texture object handle, handed to the instruction as a 64-bit operand.
//   x, y   - integer texel coordinates within the selected layer.
//   layer  - index of the array layer to read from.
//
// The instruction writes four 32-bit float components; only the first one is
// returned, the remaining three are written to a scratch variable.
// NOTE(review): the "=f" constraint binds the returned value as a 32-bit
// float, so this presumably only works for T = float -- confirm.
template<typename T>
SLANG_FORCE_INLINE SLANG_CUDA_CALL T
tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer)
{
    T result;
    float dummy;
    // For 2D array textures PTX takes a four-element coordinate vector laid
    // out as {layer, x, y, ignored}: the array index comes FIRST and the
    // fourth element is unused. Passing {x, y, layer, ...} would select layer
    // `x` and read texel (y, layer). `y` is repeated only to pad the vector.
    // See:
    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tex
    asm("tex.a2d.v4.f32.s32 {%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
        : "=f"(result), "=f"(dummy), "=f"(dummy), "=f"(dummy)
        : "l"(texObj), "r"(layer), "r"(x), "r"(y), "r"(y));
    return result;
}

source/slang/hlsl.meta.slang

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3610,14 +3610,42 @@ extension _Texture<T,Shape,isArray,0,sampleCount,0,isShadow,isCombined,format>
36103610
//@public:
36113611
[__readNone]
36123612
[ForceInline]
3613-
[require(cpp_glsl_hlsl_metal_spirv_wgsl, texture_sm_4_1_samplerless)]
3613+
[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, texture_sm_4_1_samplerless)]
36143614
T Load(vector<int, Shape.dimensions+isArray+1> location)
36153615
{
36163616
__target_switch
36173617
{
36183618
case cpp:
36193619
case hlsl:
36203620
__intrinsic_asm ".Load";
3621+
case cuda:
3622+
if (isArray != 0)
3623+
{
3624+
static_assert(Shape.flavor == $(SLANG_TEXTURE_2D) || Shape.flavor == $(SLANG_TEXTURE_3D),
3625+
"Integer coordinates are supported for texture reads only for 2D and 3D textures and 2D array textures.");
3626+
3627+
if (Shape.flavor == $(SLANG_TEXTURE_2D))
3628+
{
3629+
__intrinsic_asm "tex2DArrayfetch_int<$T0>($0, ($1).x, ($1).y, ($1).z)";
3630+
}
3631+
else
3632+
{
3633+
__intrinsic_asm "<invalid intrinsic>";
3634+
}
3635+
}
3636+
else
3637+
{
3638+
switch(Shape.flavor)
3639+
{
3640+
case $(SLANG_TEXTURE_2D):
3641+
__intrinsic_asm "tex2Dfetch_int<$T0>($0, ($1).x, ($1).y)";
3642+
case $(SLANG_TEXTURE_3D):
3643+
__intrinsic_asm "tex3Dfetch_int<$T0>($0, ($1).x, ($1).y, ($1).z)";
3644+
case $(SLANG_TEXTURE_CUBE):
3645+
default:
3646+
__intrinsic_asm "<invalid intrinsic>";
3647+
}
3648+
}
36213649
case metal:
36223650
switch (Shape.flavor)
36233651
{
@@ -3824,7 +3852,7 @@ extension _Texture<T,Shape,isArray,0,sampleCount,0,isShadow,isCombined,format>
38243852
{
38253853
[__readNone]
38263854
[ForceInline]
3827-
[require(cpp_glsl_hlsl_metal_spirv_wgsl, texture_sm_4_1_samplerless)]
3855+
[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, texture_sm_4_1_samplerless)]
38283856
get
38293857
{
38303858
__target_switch
@@ -3833,6 +3861,7 @@ extension _Texture<T,Shape,isArray,0,sampleCount,0,isShadow,isCombined,format>
38333861
case hlsl:
38343862
__intrinsic_asm ".operator[]";
38353863
case metal:
3864+
case cuda:
38363865
return Load(__makeVector(location, 0));
38373866
case glsl:
38383867
if (isCombined == 0)
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
// Test for verifying subscript operator support in cuda.
2+
3+
//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute -shaderobj
4+
//TEST_INPUT: Texture1D(size=4, content = one):name cudaT1D
5+
Texture1D<float> cudaT1D;
6+
//TEST_INPUT: Texture2D(size=8, content = one):name cudaT2D
7+
Texture2D<float> cudaT2D;
8+
//TEST_INPUT: Texture3D(size=8, content = one):name cudaT3D
9+
Texture3D<float> cudaT3D;
10+
//TEST_INPUT: TextureCube(size=16, content = one):name cudaTCube
11+
TextureCube<float> cudaTCube;
12+
//TEST_INPUT: Texture2D(size=16, content = one, arrayLength=3):name cudaT2DArray
13+
Texture2DArray<float> cudaT2DArray;
14+
//TEST_INPUT: TextureCube(size=16, content = one, arrayLength=1):name cudaTCubeArray
15+
TextureCubeArray<float> cudaTCubeArray;
16+
17+
//TEST_INPUT: ubuffer(data=[0 0 0 0 0 0 0], stride=4):out,name cudaOutputBuffer
18+
RWStructuredBuffer<float> cudaOutputBuffer;
19+
20+
[numthreads(7, 1, 1)]
[shader("compute")]
void computeMain(int3 dispatchThreadID : SV_DispatchThreadID)
{
    // The thread's x index selects which texture shape is exercised and is
    // also the output slot that thread writes. Indices with no matching case
    // below write nothing.
    int slot = dispatchThreadID.x;

    switch (slot)
    {
    case 1:
        {
            // 1D texture: the subscript is evaluated but its value is never
            // stored.
            int unusedCoord = 0;
            float texel = cudaT1D[0];
            // This is not supported in PTX.
            //cudaOutputBuffer[slot] = texel;
        }
        break;

    case 2:
        {
            // 2D texture, subscripted with an int2.
            int2 coord = int2(1, 2);
            float texel = cudaT2D[coord];
            cudaOutputBuffer[slot] = texel;
        }
        break;

    case 3:
        {
            // 3D texture, subscripted with an int3.
            int3 coord = int3(1, 1, 1);
            float texel = cudaT3D[coord];
            cudaOutputBuffer[slot] = texel;
        }
        break;

    case 4:
        {
            // 2D array texture, subscripted with an int3.
            int3 coord = int3(0, 0, 1);
            float texel = cudaT2DArray[coord];
            cudaOutputBuffer[slot] = texel;
        }
        break;
    }
}
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
0
2+
0
3+
3F800000
4+
3F800000
5+
3F800000
6+
0
7+
0
Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
//TEST:SIMPLE(filecheck=CHECK): -target hlsl -emit-spirv-directly -entry computeMain -stage compute -allow-glsl -DPRE
2-
//TEST:SIMPLE(filecheck=CHECK): -target hlsl -emit-spirv-directly -entry computeMain -stage compute -allow-glsl -DPOST
3-
//TEST:SIMPLE(filecheck=CHECK_IGNORE_CAPS): -target hlsl -emit-spirv-directly -entry computeMain -stage compute -allow-glsl -ignore-capabilities -DPRE
4-
//TEST:SIMPLE(filecheck=CHECK_IGNORE_CAPS): -target hlsl -emit-spirv-directly -entry computeMain -stage compute -allow-glsl -ignore-capabilities -DPOST
1+
//TEST:SIMPLE(filecheck=CHECK): -target hlsl -emit-spirv-directly -entry computeMain -stage compute -allow-glsl
2+
//TEST:SIMPLE(filecheck=CHECK): -target hlsl -emit-spirv-directly -entry computeMain -stage compute -allow-glsl
3+
//TEST:SIMPLE(filecheck=CHECK_IGNORE_CAPS): -target hlsl -emit-spirv-directly -entry computeMain -stage compute -allow-glsl -ignore-capabilities
4+
//TEST:SIMPLE(filecheck=CHECK_IGNORE_CAPS): -target hlsl -emit-spirv-directly -entry computeMain -stage compute -allow-glsl -ignore-capabilities
55

66
// CHECK_IGNORE_CAPS-NOT: error 36107
77
// CHECK: error 36107
@@ -11,11 +11,5 @@ Texture2D<int> rw;
1111
[numthreads(1,1,1)]
1212
void computeMain()
1313
{
14-
#ifdef PRE
15-
rw.Load(0);
16-
#endif
17-
clip(0.0f);
18-
#ifdef POST
19-
rw.Load(0);
20-
#endif
14+
clip(0.0f); // clip is not supported in compute shader, so this throws an error.
2115
}

0 commit comments

Comments
 (0)