Autoexposure example restoration #728
base: master
Conversation
template <typename T>
T morton2d_mask(uint16_t _n)
{
	const static uint64_t mask[5] =
	{
		0x5555555555555555ull,
		0x3333333333333333ull,
		0x0F0F0F0F0F0F0F0Full,
		0x00FF00FF00FF00FFull,
		0x0000FFFF0000FFFFull
	};
	return (T)mask[_n];
}

template <typename T>
T morton3d_mask(uint16_t _n)
{
	const static uint64_t mask[5] =
	{
		0x1249249249249249ull,
		0x10C30C30C30C30C3ull,
		0x010F00F00F00F00Full,
		0x001F0000FF0000FFull,
		0x001F00000000FFFFull
	};
	return (T)mask[_n];
}

template <typename T>
T morton4d_mask(uint16_t _n)
{
	const static uint64_t mask[4] =
	{
		0x1111111111111111ull,
		0x0303030303030303ull,
		0x000F000F000F000Full,
		0x000000FF000000FFull
	};
	return (T)mask[_n];
}
couldn't you make them into

namespace morton
{
namespace impl
{
template<uint16_t Dims, typename T>
struct mask;
// now the partial specializations
}
}

with the masks as NBL_CONSTEXPR member variables
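A sketch of one such specialization, reusing the 2D masks from the diff above (the exact NBL_CONSTEXPR member spelling is an assumption):

// hypothetical partial specialization for Dims == 2
template<typename T>
struct mask<2, T>
{
	NBL_CONSTEXPR_STATIC_INLINE uint64_t value[5] = {
		0x5555555555555555ull,
		0x3333333333333333ull,
		0x0F0F0F0F0F0F0F0Full,
		0x00FF00FF00FF00FFull,
		0x0000FFFF0000FFFFull
	};
};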
This way you can have

template<uint16_t Dims, typename T, uint16_t BitDepth>
enable_if_t<is_scalar_v<T>&&is_integral_v<T>,T> decode(T x)
{
	static_assert(BitDepth <= sizeof(T)*8);
	x = x & mask<Dims,T>::value[0];
	// ... rest of the if-else statements
}
@Fletterio will take over
template<typename T, uint32_t bitDepth = sizeof(T) * 8u>
T morton2d_decode_x(T _morton) { return impl::morton2d_decode<T, bitDepth>(_morton); }
template<typename T, uint32_t bitDepth = sizeof(T) * 8u>
T morton2d_decode_y(T _morton) { return impl::morton2d_decode<T, bitDepth>(_morton >> 1); }

template<typename T, uint32_t bitDepth = sizeof(T) * 8u>
T morton2d_encode(T x, T y) { return impl::separate_bits_2d<T, bitDepth>(x) | (impl::separate_bits_2d<T, bitDepth>(y) << 1); }
template<typename T, uint32_t bitDepth = sizeof(T) * 8u>
T morton3d_encode(T x, T y, T z) { return impl::separate_bits_3d<T, bitDepth>(x) | (impl::separate_bits_3d<T, bitDepth>(y) << 1) | (impl::separate_bits_3d<T, bitDepth>(z) << 2); }
template<typename T, uint32_t bitDepth = sizeof(T) * 8u>
T morton4d_encode(T x, T y, T z, T w) { return impl::separate_bits_4d<T, bitDepth>(x) | (impl::separate_bits_4d<T, bitDepth>(y) << 1) | (impl::separate_bits_4d<T, bitDepth>(z) << 2) | (impl::separate_bits_4d<T, bitDepth>(w) << 3); }
imho these should be template<uint16_t Dims, typename T, uint16_t BitDepth=sizeof(T)*8> because that way you only need to write the encode and decode once
template<uint16_t Dims, typename T, uint16_t BitDepth=sizeof(T)*8>
vector<T,Dims> decode(const T _morton)
{
	vector<T,Dims> retval;
	for (uint16_t i=0; i<Dims; i++)
		retval[i] = impl::decode<Dims,T,BitDepth>(_morton>>i);
	return retval;
}
template<uint16_t Dims, typename T, uint16_t BitDepth=sizeof(T)*8>
T encode(const vector<T,Dims> coord)
{
	T retval = impl::separate_bits<Dims,T,BitDepth>(coord[0]);
	for (uint16_t i=1; i<Dims; i++)
		retval |= impl::separate_bits<Dims,T,BitDepth>(coord[i])<<i;
	return retval;
}
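For illustration, call sites would then collapse to something like this (hypothetical usage, not code from the PR):

// decode a 2D morton code into coordinates and re-encode it
uint32_t2 xy = morton::decode<2, uint32_t>(mortonCode);
uint32_t  m  = morton::encode<2, uint32_t>(xy);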
@Fletterio will take over
retval.minLuma = lumaMinimum;
retval.maxLuma = lumaMaximum;
typo, you've set the min and max equal to each other
P.S. it's also more useful to take a precomputed minLumaLog2 and lumaLog2Range (the difference between the log of the max and the log of the min)
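A sketch of that precomputation in create (member names are assumptions):

// store log2-space values once, instead of recomputing them per invocation
retval.minLumaLog2 = log2(lumaMinimum);
retval.lumaLog2Range = log2(lumaMaximum) - retval.minLumaLog2;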
still outstanding for the geom meter
[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
[[vk::ext_instruction(spv::OpAtomicIAdd)]]
enable_if_t<is_same_v<T,uint32_t> || is_same_v<T,int32_t>, T> atomicIAdd([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);

template<typename T, typename Ptr_T> // DXC Workaround
[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
[[vk::ext_instruction(spv::OpAtomicIAdd)]]
enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T,uint32_t> || is_same_v<T,int32_t>), T> atomicIAdd(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);

template<typename T> // integers operate on 2s complement so same op for signed and unsigned
[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
[[vk::ext_instruction(spv::OpAtomicIAdd)]]
enable_if_t<is_same_v<T,uint64_t> || is_same_v<T,int64_t>, T> atomicIAdd([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);

template<typename T, typename Ptr_T> // DXC Workaround
[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
[[vk::ext_instruction(spv::OpAtomicIAdd)]]
enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T,uint64_t> || is_same_v<T,int64_t>), T> atomicIAdd(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);

template<typename T> // integers operate on 2s complement so same op for signed and unsigned
[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
[[vk::ext_instruction(spv::OpAtomicISub)]]
enable_if_t<is_same_v<T,uint32_t> || is_same_v<T,int32_t>, T> atomicISub([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);

template<typename T, typename Ptr_T> // DXC Workaround
[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
[[vk::ext_instruction(spv::OpAtomicISub)]]
enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T,uint32_t> || is_same_v<T,int32_t>), T> atomicISub(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);

template<typename T> // integers operate on 2s complement so same op for signed and unsigned
[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
[[vk::ext_instruction(spv::OpAtomicISub)]]
enable_if_t<is_same_v<T,uint64_t> || is_same_v<T,int64_t>, T> atomicISub([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);

template<typename T, typename Ptr_T> // DXC Workaround
[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
[[vk::ext_capability(spv::CapabilityInt64Atomics)]]
[[vk::ext_instruction(spv::OpAtomicISub)]]
enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T,uint64_t> || is_same_v<T,int64_t>), T> atomicISub(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
no, not every pointer is a BDA pointer, see the master changes with is_spirv_type:
https://github.com/Devsh-Graphics-Programming/Nabla/blob/0ecda6a8bcfaf8bba023d9cefcdc51be5d6a19b5/include/nbl/builtin/hlsl/type_traits.hlsl#L341C34-L341C50
you can now even check for Ptr_T being a pointer:
NBL_CONSTEXPR_STATIC_INLINE bool is_pointer_v = is_pointer<T>::value;
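Applied to the DXC-workaround overloads above, the constraint could then read (a sketch; whether is_pointer_v accepts these Ptr_T types is an assumption):

template<typename T, typename Ptr_T> // DXC Workaround
[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
[[vk::ext_instruction(spv::OpAtomicIAdd)]]
enable_if_t<is_pointer_v<Ptr_T> && (is_same_v<T,uint32_t> || is_same_v<T,int32_t>), T> atomicIAdd(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);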
template<typename T = float32_t>
struct Reinhard
{
	using float_t = enable_if_t<is_floating_point<T>::value, T>;
with new concepts, the constraint should be applied via NBL_PRIMARY_REQUIRES
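Roughly like this (a sketch; the exact macro placement is an assumption):

template<typename T = float32_t> NBL_PRIMARY_REQUIRES(is_floating_point<T>::value)
struct Reinhard
{
	using float_t = T; // the enable_if_t constraint moves onto the template itself
	// ... rest of the operator unchanged
};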
LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/luma_meter/common.hlsl")
LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/luma_meter/luma_meter.hlsl")
# tonemapper
LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tonemapper/operators.hlsl")
I'd create tonemapper/operators/reinhard and tonemapper/operators/aces instead of slapping everything into one file
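i.e. something like (paths hypothetical):

LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tonemapper/operators/reinhard.hlsl")
LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tonemapper/operators/aces.hlsl")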
@@ -34,6 +34,11 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/barycentric/utils.glsl")
LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/__ref.hlsl")
LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/__ptr.hlsl")
LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/bda_accessor.hlsl")
# luma metering
LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/luma_meter/common.hlsl")
LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/luma_meter/luma_meter.hlsl")
similarly, luma_meter/luma_meter is a tautology, a file each like luma_meter/histogram and luma_meter/geom_avg would be nice
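e.g. (paths hypothetical):

LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/luma_meter/histogram.hlsl")
LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/luma_meter/geom_avg.hlsl")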
template<uint32_t GroupSize, typename ValueAccessor, typename SharedAccessor, typename TexAccessor>
struct geom_meter {
	using float_t = typename SharedAccessor::type;
	using float_t2 = typename conditional<is_same_v<float_t, float32_t>, float32_t2, float16_t2>::type;
even if you're doing the color computation in float16_t, this doesn't free you from doing the texture coordinate calculation in float32_t
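i.e. keep a dedicated alias for coordinate math (a sketch; the alias name is hypothetical):

using float_t = typename SharedAccessor::type; // may be float16_t for luma math
using uv_float_t2 = float32_t2; // texture coordinate math always stays in float32_t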
}

float_t sampleCount;
float_t2 lumaMinMax;
don't do weird things we used to do in GLSL (due to no scalar layout), have a separate variable for min and max
also you should have the min and max precomputed with log2 already applied
	return (luma / (1 << fixedPointBitsLeft)) / sampleCount;
}

float_t sampleCount;
you want to compute and store the reciprocal of sampleCount and the 1<<fixedPointBitsLeft
that was the purpose of the rcpFirstPassWGCount variable in the old GLSL
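A sketch of that precomputation (member names hypothetical):

// in create(): store reciprocals so the hot path multiplies instead of divides
retval.rcpSampleCount = float_t(1.0) / float_t(sampleCount);
retval.rcpFixedPointScale = float_t(1.0) / float_t(1u << fixedPointBitsLeft);
// the return above then becomes:
// return luma * rcpFixedPointScale * rcpSampleCount;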
uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
you're supposed to normalize by the number of samples you took during the sampling step; your workGroupCount here is NOT that value, it's the number of workgroups you're exposing with
You must precompute the fixedPointBitsLeft in the create method (and it needs to know how many invocations you'll be running in the sampling step)
float_t computeLumaLog2(
	NBL_CONST_REF_ARG(MeteringWindow) window,
	NBL_REF_ARG(TexAccessor) tex,
	float_t2 shiftedCoord
)
{
	float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset;
precompute a scale and offset from the metering window + the number of workgroups + the workgroup size, to apply to a uint16_t2 unnormalized coordinate
right now you have waaay too many variables:
- tileOffset
- viewport size
- metering window
which are being manipulated per-pixel
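Folded into a single affine transform at create time, the per-pixel work could shrink to one fused multiply-add (a sketch; names hypothetical):

// precomputed once: uvScale = meteringWindowScale / viewportSize
//                   uvOffset = meteringWindowOffset + tileOffset * uvScale
float_t2 uvPos = float_t2(coord) * uvScale + uvOffset;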
NBL_REF_ARG(ValueAccessor) val,
NBL_REF_ARG(TexAccessor) tex,
NBL_REF_ARG(SharedAccessor) sdata,
float_t2 tileOffset,
why is tileOffset being provided from the outside? it's a byproduct of your workgroupID, and of workgroupSize-1 decoded as morton, +1 in each dimension
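A sketch of deriving it in-shader with the morton helpers from this PR (assumes a glsl::gl_WorkGroupID() wrapper exists alongside gl_NumWorkGroups()):

// tile extent: decode the highest in-tile morton code, then +1 in each dimension
const uint32_t2 tileSize = uint32_t2(
	morton2d_decode_x<uint32_t>(GroupSize - 1) + 1,
	morton2d_decode_y<uint32_t>(GroupSize - 1) + 1);
const uint32_t2 tileOffset = glsl::gl_WorkGroupID().xy * tileSize;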
luma = clamp(luma, lumaMinMax.x, lumaMinMax.y);

return max(log2(luma), log2(lumaMinMax.x));
why max? you already clamped!
btw if you have the log2 already precomputed then you can do
return min(spirv::nMax(log2(luma),lumaMinLog2),lumaMaxLog2);
nMax is a special SPIR-V version of max that will return the other operand if one is NaN (which happens on log of a negative value or 0)
Fixed
float_t luma = 0.0f;
float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
luma = computeLumaLog2(window, tex, shiftedCoord);
luma should be called lumaLog2
Fixed
luma = computeLumaLog2(window, tex, shiftedCoord);
float_t lumaSum = reduction(luma, sdata);

if (tid == GroupSize - 1) {
it's somewhat semantically cleaner to pick the first, instead of the last, especially since it's a reduction you performed before
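i.e. (sketch):

if (tid == 0u)
{
	// the first invocation publishes the reduced lumaSum
}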
Fixed
void uploadFloat(
	NBL_REF_ARG(ValueAccessor) val_accessor,
	uint32_t index,
don't give bad names to variables, this needs to be called workGroupIndex
this variable shouldn't even exist, because the MEAN meter didn't output to a different address per X,Y workgroup coordinate
Fixed
if (tid == GroupSize - 1) {
	uint32_t3 workgroupCount = glsl::gl_NumWorkGroups();
	uint32_t workgroupIndex = (workgroupCount.x * workgroupCount.y * workgroupCount.z) / 64;
you're computing the wrong thing, every workgroup gets the same index 🤦
Also the original code was touching the same address with every workgroup for the MEAN meter mode
	float_t rangeLog2
)
{
	uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
take the workgroup count and workgroup XYZ coordinate (or workgroup index) from outside (as function arguments), otherwise in the presence of solutions such as virtual workgroups or persistent threads, this whole thing will fall apart
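e.g. a signature along these lines (function and parameter names hypothetical):

float_t gatherLuma(
	NBL_REF_ARG(ValueAccessor) val,
	const uint32_t3 workGroupCount, // supplied by the dispatcher
	const uint32_t workGroupIndex // survives virtual-workgroup / persistent-thread schemes
)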
float_t lumaSum = reduction(luma, sdata);

if (tid == GroupSize - 1) {
	uint32_t3 workgroupCount = glsl::gl_NumWorkGroups();
take the workgroup count and workgroup XYZ coordinate (or workgroup index) from outside (as function arguments), otherwise in the presence of solutions such as virtual workgroups or persistent threads, this whole thing will fall apart
float_t minLog2,
float_t rangeLog2
should already be precomputed as members
uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();

uint32_t lumaSumBitPattern = uint32_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1)));
let's write some docs for this...
The val was produced by a workgroup reduction performed over values in the [MinLog2,MaxLog2] range.
Which makes the scaledLogLuma (the variable that should hold (val-minLog2)*rangeLog2) fall between 0 and WorkGroupSize.
This value is atomically added by N workgroups.
You now want to represent it in Fixed Point during the atomic add, but not be vulnerable to overflow; this means the worst case is adding N times WorkGroupSize.
This means that we need to multiply the value by (2^32-1)/N precomputed as a float, or if you must, round up N to PoT and see how many bits are left (512 workgroups means 9 bits, so 23 are left). To avoid rounding precision errors, the PoT method is chosen.
I have no clue where you're getting the +SubgroupSizeLog2 from.
Also the value of (1<<fixedPointBitsLeft)-1 must be precomputed in create and stored as a member.
It should be as easy as
const uint32_t scaledLumaLog2BitPattern = uint32_t((val-lumaMinLog2)*maxIncrement_over_lumaRangeLog2+float_t(0.5));
where maxIncrement = (0x1u<<(32u-uint32_t(ceil(log2(WorkGroupCount*WorkGroupSize)))))-1;
uint32_t lumaSumBitPattern = uint32_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1)));

val_accessor.atomicAdd(index & ((1 << glsl::gl_SubgroupSizeLog2()) - 1), lumaSumBitPattern);
no, always the same address should be added to; if you wanted to stagger, then you should stagger based on modulo of the workgroup index
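i.e. something like (a sketch; AddressCount is hypothetical):

// stagger across a few accumulators keyed off the workgroup index, not the subgroup size
val_accessor.atomicAdd(workGroupIndex % AddressCount, lumaSumBitPattern);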
float_t luma = (float_t)val_accessor.get(index & ((1 << glsl::gl_SubgroupSizeLog2()) - 1));
return luma / rangeLog2 + minLog2;
again, you're getting random floats based on workgroup index, which thankfully was always the same (rare case of two wrongs making a right)
Again, if you wanted to stagger, you should use the entire subgroup to load the values, then subgroup-reduce them
just converting to float_t is not the correct way to decode, you should divide by the maxIncrement
Description
Testing
TODO list: