Autoexposure example restoration #728

devshgraphicsprogramming · 2024-08-13T21:57:04Z

Description

Testing

TODO list:

…into autoexposue_ex

include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl

include/nbl/builtin/hlsl/math/morton.hlsl

devshgraphicsprogramming · 2024-08-16T14:12:49Z

include/nbl/builtin/hlsl/math/morton.hlsl

+template <typename T>
+T morton2d_mask(uint16_t _n)
+{
+    const static uint64_t mask[5] =
+    {
+        0x5555555555555555ull,
+        0x3333333333333333ull,
+        0x0F0F0F0F0F0F0F0Full,
+        0x00FF00FF00FF00FFull,
+        0x0000FFFF0000FFFFull
+    };
+    return (T)mask[_n];
+}
+
+template <typename T>
+T morton3d_mask(uint16_t _n)
+{
+    const static uint64_t mask[5] =
+    {
+        0x1249249249249249ull,
+        0x10C30C30C30C30C3ull,
+        0x010F00F00F00F00Full,
+        0x001F0000FF0000FFull,
+        0x001F00000000FFFFull
+    };
+    return (T)mask[_n];
+}
+template <typename T>
+T morton4d_mask(uint16_t _n)
+{
+    const static uint64_t mask[4] =
+    {
+        0x1111111111111111ull,
+        0x0303030303030303ull,
+        0x000F000F000F000Full,
+        0x000000FF000000FFull
+    };
+    return (T)mask[_n];
+}


couldn't you make them into

namespace morton { namespace impl { template<uint16_t Dims, typename T> struct mask; // now the partial specializations } }

with the masks as NBL_CONSTEXPR member variables

This way you can have

template<uint16_t Dims, typename T, uint16_t BitDepth> enable_if_t<is_scalar_v<T>&&is_integral_v<T>,T> decode(T x) { static_assert(BitDepth <= sizeof(T)*8); x = x & mask<Dims,T>::value[0]; .. rest of if-else statements }

@Fletterio will take over

devshgraphicsprogramming · 2024-08-16T14:21:37Z

include/nbl/builtin/hlsl/math/morton.hlsl

+template<typename T, uint32_t bitDepth = sizeof(T) * 8u>
+T morton2d_decode_x(T _morton) { return impl::morton2d_decode<T, bitDepth>(_morton); }
+template<typename T, uint32_t bitDepth = sizeof(T) * 8u>
+T morton2d_decode_y(T _morton) { return impl::morton2d_decode<T, bitDepth>(_morton >> 1); }
+
+template<typename T, uint32_t bitDepth = sizeof(T) * 8u>
+T morton2d_encode(T x, T y) { return impl::separate_bits_2d<T, bitDepth>(x) | (impl::separate_bits_2d<T, bitDepth>(y) << 1); }
+template<typename T, uint32_t bitDepth = sizeof(T) * 8u>
+T morton3d_encode(T x, T y, T z) { return impl::separate_bits_3d<T, bitDepth>(x) | (impl::separate_bits_3d<T, bitDepth>(y) << 1) | (impl::separate_bits_3d<T, bitDepth>(z) << 2); }
+template<typename T, uint32_t bitDepth = sizeof(T) * 8u>
+T morton4d_encode(T x, T y, T z, T w) { return impl::separate_bits_4d<T, bitDepth>(x) | (impl::separate_bits_4d<T, bitDepth>(y) << 1) | (impl::separate_bits_4d<T, bitDepth>(z) << 2) | (impl::separate_bits_4d<T, bitDepth>(w) << 3); }


imho these should be template<uint16_t Dims, typename T, uint16_t BitDepth=sizeof(T)*8>

because that way you only need to write the encode and decode once

template<uint16_t Dims, typename T, uint16_t BitDepth=sizeof(T)*8> vector<T,Dims> decode(const T _morton) { vector<T,Dims> retval; for (uint16_t i=0; i<Dims; i++) retval[i] = impl::decode<Dims,T,BitDepth>(_morton>>i); return retval; } template<uint16_t Dims, typename T, uint16_t BitDepth=sizeof(T)*8> T encode(const vector<T,Dims> coord) { T retval = impl::separate_bits<Dims,T,BitDepth>(coord[0]); for (uint16_t i=1; i<Dims; i++) retval |= impl::separate_bits<Dims,T,BitDepth>(coord[1])<<i; return retval; }

@Fletterio will take over

include/nbl/builtin/hlsl/tonemapper/operators.hlsl

include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl

devshgraphicsprogramming · 2024-08-16T14:41:33Z

include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl

+        retval.minLuma = lumaMinimum;
+        retval.maxLuma = lumaMaximum;


typo, you've set the min and max equal to each other

P.S. its also more useful to take a precomputed minLumaLog2 and lumaLog2Range (diff between log of max and log of min)

still outstanding for the geom meter

include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl

devshgraphicsprogramming · 2025-03-17T14:18:42Z

include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl

+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
 [[vk::ext_instruction(spv::OpAtomicIAdd)]]
 enable_if_t<is_same_v<T,uint32_t> || is_same_v<T,int32_t>, T> atomicIAdd([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);

 template<typename T, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
 [[vk::ext_instruction(spv::OpAtomicIAdd)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T,uint32_t> || is_same_v<T,int32_t>), T> atomicIAdd(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);

 template<typename T> // integers operate on 2s complement so same op for signed and unsigned
+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
 [[vk::ext_capability(spv::CapabilityInt64Atomics)]]
 [[vk::ext_instruction(spv::OpAtomicIAdd)]]
 enable_if_t<is_same_v<T,uint64_t> || is_same_v<T,int64_t>, T> atomicIAdd([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);

 template<typename T, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
 [[vk::ext_capability(spv::CapabilityInt64Atomics)]]
 [[vk::ext_instruction(spv::OpAtomicIAdd)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T,uint64_t> || is_same_v<T,int64_t>), T> atomicIAdd(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);

 template<typename T> // integers operate on 2s complement so same op for signed and unsigned
+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
 [[vk::ext_instruction(spv::OpAtomicISub)]]
 enable_if_t<is_same_v<T,uint32_t> || is_same_v<T,int32_t>, T> atomicISub([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);

 template<typename T, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
 [[vk::ext_instruction(spv::OpAtomicISub)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T,uint32_t> || is_same_v<T,int32_t>), T> atomicISub(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);

 template<typename T> // integers operate on 2s complement so same op for signed and unsigned
+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
 [[vk::ext_capability(spv::CapabilityInt64Atomics)]]
 [[vk::ext_instruction(spv::OpAtomicISub)]]
 enable_if_t<is_same_v<T,uint64_t> || is_same_v<T,int64_t>, T> atomicISub([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);

 template<typename T, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
 [[vk::ext_capability(spv::CapabilityInt64Atomics)]]
 [[vk::ext_instruction(spv::OpAtomicISub)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T,uint64_t> || is_same_v<T,int64_t>), T> atomicISub(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);


no, not every pointer is a BDA pointer, see the master changes with is_spirv_type
https://github.com/Devsh-Graphics-Programming/Nabla/blob/0ecda6a8bcfaf8bba023d9cefcdc51be5d6a19b5/include/nbl/builtin/hlsl/type_traits.hlsl#L341C34-L341C50

you can now even check for Ptr_T being a pointer

Nabla/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl

Line 101 in 0ecda6a

NBL_CONSTEXPR_STATIC_INLINE bool is_pointer_v = is_pointer<T>::value;

include/nbl/builtin/hlsl/tonemapper/operators.hlsl

devshgraphicsprogramming · 2025-03-17T14:22:59Z

include/nbl/builtin/hlsl/tonemapper/operators.hlsl

+template<typename T = float32_t>
+struct Reinhard
+{
+	using float_t = enable_if_t<is_floating_point<T>::value, T>;


with new concepts, the constraint should be applied via NBL_PRIMARY_REQUIRES

include/nbl/builtin/hlsl/tonemapper/operators.hlsl

devshgraphicsprogramming · 2025-03-17T14:33:57Z

src/nbl/builtin/CMakeLists.txt

+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/luma_meter/common.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/luma_meter/luma_meter.hlsl")
+# tonemapper
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tonemapper/operators.hlsl")


I'd create tonemapper/operators/reinhardt and tonemapper/operators/aces instead of slapping everything into one file

devshgraphicsprogramming · 2025-03-17T14:34:47Z

src/nbl/builtin/CMakeLists.txt

@@ -34,6 +34,11 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/barycentric/utils.glsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/__ref.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/__ptr.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/bda_accessor.hlsl")
+# luma metering
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/luma_meter/common.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/luma_meter/luma_meter.hlsl")


similarly, luma_meter/luma_meter is a tautology, a file each like luma_meter/histogram and luma_meter/geom_avg would be nice

devshgraphicsprogramming · 2025-03-17T14:36:45Z

include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl

+template<uint32_t GroupSize, typename ValueAccessor, typename SharedAccessor, typename TexAccessor>
+struct geom_meter {
+    using float_t = typename SharedAccessor::type;
+    using float_t2 = typename conditional<is_same_v<float_t, float32_t>, float32_t2, float16_t2>::type;


even if doing color computation in float16_t this doesn't free you from doing texture coordinate calc in float32_t

devshgraphicsprogramming · 2025-03-17T14:38:05Z

include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl

+    }
+
+    float_t sampleCount;
+    float_t2 lumaMinMax;


don't do weird things we used to do in GLSL (due to no scalar layout), have a separate variable for min and max

also you should have the min and max precomputed with log2 already applied

devshgraphicsprogramming · 2025-03-17T14:40:28Z

include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl

+        return (luma / (1 << fixedPointBitsLeft)) / sampleCount;
+    }
+
+    float_t sampleCount;


you want to compute and store the reciprocal of sampleCount and the 1<<fixedPointBitsLeft

that was the purpose of the rcpFirstPassWGCount variable in the old GLSL

include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl

devshgraphicsprogramming · 2025-03-17T14:45:14Z

include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl

+        uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
+        uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();


you're supposed to normalize by the number of samples you took during the sampling step, your workGroupCount here is NOT that value, its the number of workgroups you're exposing with

You must precompute the fixedPointsBitsLeft in the create method (and it needs to know how many invocations you'll be running the sample step)

include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl

devshgraphicsprogramming · 2025-03-17T14:58:56Z

include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl

+    float_t computeLumaLog2(
+        NBL_CONST_REF_ARG(MeteringWindow) window,
+        NBL_REF_ARG(TexAccessor) tex,
+        float_t2 shiftedCoord
+    )
+    {
+        float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset;


precompute a scale and offset from the Metering Window + the number of workgroups + the workgroup size to apply to a uint16_t2 unnormalized coordinate

right now you have waaay too many variables:

tile Offset

viewport size

metering window
which are being manipulated per-pixel

devshgraphicsprogramming · 2025-03-17T15:00:18Z

include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl

+        NBL_REF_ARG(ValueAccessor) val,
+        NBL_REF_ARG(TexAccessor) tex,
+        NBL_REF_ARG(SharedAccessor) sdata,
+        float_t2 tileOffset,


why is tile Offset being provided from the outside? its a byproduct of your workgroupID, and workgroupSize-1 decoded as morton +1 in each dimension

devshgraphicsprogramming · 2025-03-17T15:02:01Z

include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl

+        luma = clamp(luma, lumaMinMax.x, lumaMinMax.y);
+
+        return max(log2(luma), log2(lumaMinMax.x));


why max you already clamped!

btw if you have the log2 already precomputed then you can do

return min(spirv::nMax(log2(luma),lumaMinLog2),lumaMaxLog2);

nMin is a special SPIR-V version of min that will return the other operand if another is NaN (which happens on log of negative value or 0)

devshgraphicsprogramming · 2025-03-17T15:03:06Z

include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl

+        float_t luma = 0.0f;
+        float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
+        luma = computeLumaLog2(window, tex, shiftedCoord);


luma should be called lumaLog2

devshgraphicsprogramming · 2025-03-17T15:04:38Z

include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl

+        luma = computeLumaLog2(window, tex, shiftedCoord);
+        float_t lumaSum = reduction(luma, sdata);
+
+        if (tid == GroupSize - 1) {


its somewhat semantically cleaner to pick the first, instead of last, esp since its a reduction you performed before

devshgraphicsprogramming · 2025-03-17T15:10:10Z

include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl

+
+    void uploadFloat(
+        NBL_REF_ARG(ValueAccessor) val_accessor,
+        uint32_t index,


don't give bad names to variables, this needs to be called workGroupIndex

this variable shouldn't even exist, because the MEAN meter didn't output to different address per X Y workgroup coordinate

devshgraphicsprogramming · 2025-03-17T15:12:18Z

include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl

+
+        if (tid == GroupSize - 1) {
+            uint32_t3 workgroupCount = glsl::gl_NumWorkGroups();
+            uint32_t workgroupIndex = (workgroupCount.x * workgroupCount.y * workgroupCount.z) / 64;


you're computing the wrong thing, every workgroup gets the same index 🤦

Also the original code was touching the same address with every for the MEAN meter mode

devshgraphicsprogramming · 2025-03-17T15:14:52Z

include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl

+        float_t rangeLog2
+    )
+    {
+        uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();


take the workgroup count and workgroup XYZ coordinate (or workgroup index) from outside (as function arguments) otherwise in the presence of solutions such as virtual workgroups or persistent threads, this whole thing will fall apart

devshgraphicsprogramming · 2025-03-17T15:15:09Z

include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl

+        float_t lumaSum = reduction(luma, sdata);
+
+        if (tid == GroupSize - 1) {
+            uint32_t3 workgroupCount = glsl::gl_NumWorkGroups();


take the workgroup count and workgroup XYZ coordinate (or workgroup index) from outside (as function arguments) otherwise in the presence of solutions such as virtual workgroups or persistent threads, this whole thing will fall apart

devshgraphicsprogramming · 2025-03-17T15:19:41Z

include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl

+        float_t minLog2,
+        float_t rangeLog2


should already be precomputed as members

devshgraphicsprogramming · 2025-03-17T15:35:16Z

include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl

+        uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
+
+        uint32_t lumaSumBitPattern = uint32_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1)));


lets write some docs for this....

The val was produced by a workgroup reduction is performed of values in the [MinLog2,MaxLog2] range

Which makes the scaledLogLuma (the variable that should hold (val-minLog2)*rangeLog2) is between 0 and WorkGroupSize

This value is atomic added by N workgroups

You now want to represent it in Fixed Point during the atomic add, but not be vulnerable to overflow, this means the worst case is adding N times WorkGroupSize.

This means that we need to multiply the by (2^32-1)/N precomputed as a float or if you must round up N to PoT and see how many bits are left (512 workgroups, means 9 bits, so 23 are left). To avoid rounding precision errors, the PoT method is chosen.

I have no clue where you're getting +SubgroupSizeLog2 from.

Also the value of (1<<fixedPointBitsLeft)-1 must be precomputed in create and stored as a member

IT should be as easy as

const uint32_t scaledLumaLog2BitPattern = uint32_t((val-lumaMinLog2)*maxIncrement_over_lumaRangeLog2+float_t(0.5));

where maxIncrement = (0x1u<<(32u-uint32_t(ceil(log2(WorkGroupCount*WorkGroupSize)))))-1;

devshgraphicsprogramming · 2025-03-17T15:35:46Z

include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl

+
+        uint32_t lumaSumBitPattern = uint32_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1)));
+
+        val_accessor.atomicAdd(index & ((1 << glsl::gl_SubgroupSizeLog2()) - 1), lumaSumBitPattern);


no, always the same address should be added to, if you wanted to stagger, then you should stagger based on modulo of the workgroup index

devshgraphicsprogramming · 2025-03-17T15:41:18Z

include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl

+        float_t luma = (float_t)val_accessor.get(index & ((1 << glsl::gl_SubgroupSizeLog2()) - 1));
+        return luma / rangeLog2 + minLog2;


again, you're getting random floats based on workgroup index which thankfully was always the same (rare case of two wrongs making a right)

Again if you wanted to stagger, you should use entire subgroup to load the values, then subgroup reduce them

just converting to float_t is not the correct way to decode, you should divide by the maxIncrement

nipunG314 added 14 commits July 20, 2024 00:33

Add luma_meter and tonemapper

096e09d

Update submodule pointer

4fd700f

Merge branch 'master' of github.com:Devsh-Graphics-Programming/Nabla …

f93bb0f

…into autoexposue_ex

Merge branch 'master' of github.com:Devsh-Graphics-Programming/Nabla …

6152f96

…into autoexposue_ex

Merge branch 'master' of github.com:Devsh-Graphics-Programming/Nabla …

2311521

…into autoexposue_ex

Convert morton.h to hlsl

52e7ab2

Fix HLSL morton code

1cc26bd

Create geom_luma_meter and computeLuma

6922d0c

Merge branch 'master' of github.com:Devsh-Graphics-Programming/Nabla …

6e6eb64

…into autoexposue_ex

Add gatherLuma method

603a92f

Add getGatheredLuma()

810a6ac

Add reinhard and aces hlsl operators

69a73c1

Merge branch 'master' of github.com:Devsh-Graphics-Programming/Nabla …

72e0bc5

…into autoexposue_ex

cast mask values to correct type

4c70cf5

devshgraphicsprogramming commented Aug 13, 2024

View reviewed changes

include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl Outdated Show resolved Hide resolved

nipunG314 added 2 commits August 16, 2024 16:35

Add create methods to tonemapper params

d9d6dd8

Remove getGatheredLuma from luma_meter

305f7e7

devshgraphicsprogramming commented Aug 16, 2024

View reviewed changes

include/nbl/builtin/hlsl/math/morton.hlsl Outdated Show resolved Hide resolved

devshgraphicsprogramming commented Aug 16, 2024

View reviewed changes

include/nbl/builtin/hlsl/tonemapper/operators.hlsl Outdated Show resolved Hide resolved