-
Notifications
You must be signed in to change notification settings - Fork 64
Autoexposure example restoration #728
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 45 commits
096e09d
4fd700f
f93bb0f
6152f96
2311521
52e7ab2
1cc26bd
6922d0c
6e6eb64
603a92f
810a6ac
69a73c1
72e0bc5
4c70cf5
d9d6dd8
305f7e7
3f4f6e9
515512a
77f5756
1919e53
4c58238
3c3f8b8
b0e0750
e8e46c9
ee5affe
56389f4
23771d1
ac39039
49a8049
8a10ae2
6b01b6d
4129afe
f95f1c1
1a58273
b6e1f57
5239c29
0df9ba6
06c915e
90d20c4
26a4ed2
4edd38c
f1e3e98
ce2ca41
f1b7d17
83ac633
2b5e502
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. | ||
// This file is part of the "Nabla Engine". | ||
// For conditions of distribution and use, see copyright notice in nabla.h | ||
|
||
#ifndef _NBL_BUILTIN_HLSL_LUMA_METER_COMMON_INCLUDED_ | ||
#define _NBL_BUILTIN_HLSL_LUMA_METER_COMMON_INCLUDED_ | ||
|
||
#include "nbl/builtin/hlsl/cpp_compat.hlsl" | ||
|
||
namespace nbl | ||
{ | ||
namespace hlsl | ||
{ | ||
namespace luma_meter | ||
{ | ||
|
||
// Scale/offset pair describing a metering window. The luma meters in this
// change remap a coordinate as `coord * meteringWindowScale + meteringWindowOffset`.
struct MeteringWindow
{
    using this_t = MeteringWindow;

    float32_t2 meteringWindowScale;
    float32_t2 meteringWindowOffset;

    // Factory: builds a window from an explicit scale and offset.
    static this_t create(float32_t2 scale, float32_t2 offset)
    {
        this_t window;
        window.meteringWindowOffset = offset;
        window.meteringWindowScale = scale;
        return window;
    }
};
|
||
} | ||
} | ||
} | ||
|
||
#endif |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,293 @@ | ||
// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. | ||
// This file is part of the "Nabla Engine". | ||
// For conditions of distribution and use, see copyright notice in nabla.h | ||
|
||
#ifndef _NBL_BUILTIN_HLSL_LUMA_METER_INCLUDED_ | ||
#define _NBL_BUILTIN_HLSL_LUMA_METER_INCLUDED_ | ||
|
||
#include "nbl/builtin/hlsl/glsl_compat/core.hlsl" | ||
#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl" | ||
#include "nbl/builtin/hlsl/glsl_compat/subgroup_arithmetic.hlsl" | ||
#include "nbl/builtin/hlsl/workgroup/basic.hlsl" | ||
#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl" | ||
#include "nbl/builtin/hlsl/type_traits.hlsl" | ||
#include "nbl/builtin/hlsl/math/morton.hlsl" | ||
#include "nbl/builtin/hlsl/luma_meter/common.hlsl" | ||
|
||
namespace nbl | ||
{ | ||
namespace hlsl | ||
{ | ||
namespace luma_meter | ||
{ | ||
|
||
template<uint32_t GroupSize, typename ValueAccessor, typename SharedAccessor, typename TexAccessor> | ||
struct geom_meter { | ||
using float_t = typename SharedAccessor::type; | ||
using float_t2 = typename conditional<is_same_v<float_t, float32_t>, float32_t2, float16_t2>::type; | ||
using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type; | ||
using this_t = geom_meter<GroupSize, ValueAccessor, SharedAccessor, TexAccessor>; | ||
|
||
// Factory: returns a meter whose `lumaMinMax` and `sampleCount` members are
// copied from the arguments.
static this_t create(float_t2 lumaMinMax, float_t sampleCount)
{
    this_t meter;
    meter.sampleCount = sampleCount;
    meter.lumaMinMax = lumaMinMax;
    return meter;
}
|
||
// Forwards `value` to `workgroup::reduction<plus<float_t>, GroupSize>` and
// returns its result; `sdata` is the accessor handed to the reduction.
float_t reduction(float_t value, NBL_REF_ARG(SharedAccessor) sdata)
{
    const float_t workgroupSum = workgroup::reduction<plus<float_t>, GroupSize>::template __call<SharedAccessor>(value, sdata);
    return workgroupSum;
}
|
||
// Fetches the colour at the window-remapped coordinate, converts it to luma
// through `TexAccessor::toXYZ`, clamps the luma to [lumaMinMax.x, lumaMinMax.y]
// and returns its log2.
//
// window       - scale/offset applied to `shiftedCoord` before the fetch
// tex          - accessor providing `get(uv)`
// shiftedCoord - coordinate to remap (the caller in this struct passes a value
//                already divided by the viewport size)
float_t computeLumaLog2(
    NBL_CONST_REF_ARG(MeteringWindow) window,
    NBL_REF_ARG(TexAccessor) tex,
    float_t2 shiftedCoord
)
{
    // Keep the coordinate math in 32-bit floats even when float_t is float16_t:
    // the window members are float32_t2, and narrowing the result into float_t2
    // throws away texture-coordinate precision.
    // NOTE(review): the reviewer additionally asked for a single scale and offset
    // to be precomputed from the metering window, the number of workgroups and the
    // workgroup size, instead of carrying this many variables around.
    const float32_t2 uvPos = float32_t2(shiftedCoord) * window.meteringWindowScale + window.meteringWindowOffset;

    const float_t3 color = tex.get(uvPos);
    const float_t luma = clamp((float_t)TexAccessor::toXYZ(color), lumaMinMax.x, lumaMinMax.y);

    // NOTE(review): the reviewer questioned this `max` (the luma is already clamped
    // above) and suggested clamping in log space against precomputed bounds:
    // `min(spirv::nMax(log2(luma),lumaMinLog2),lumaMaxLog2)`. Left as-is here;
    // presumably it only matters for NaN inputs - TODO confirm.
    return max(log2(luma), log2(lumaMinMax.x));
}
|
||
// Converts `val` to an unsigned integer and atomically adds it to one slot of
// `val_accessor`.
//
// val_accessor - accessor exposing `atomicAdd(slot, value)`
// index        - slot selector; only its low gl_SubgroupSizeLog2() bits are used
// val          - value to upload (the caller in this struct passes a workgroup sum of log2 luma)
// minLog2      - subtracted from `val` before scaling
// rangeLog2    - factor the shifted value is multiplied by
//
// NOTE(review): the reviewer objected to the `index` parameter (the MEAN meter never
// wrote to a different address per workgroup coordinate) and asked for
// `minLog2`/`rangeLog2` to be precomputed members rather than arguments.
void uploadFloat(
    NBL_REF_ARG(ValueAccessor) val_accessor,
    uint32_t index,
    float_t val,
    float_t minLog2,
    float_t rangeLog2
)
{
    // NOTE(review): the reviewer asked for the workgroup count and the workgroup
    // coordinate/index to come in as function arguments, otherwise virtual
    // workgroups or persistent threads break this.
    const uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
    const uint32_t subgroupSizeLog2 = glsl::gl_SubgroupSizeLog2();

    // The formula below reaches 32 or more whenever
    // ceil(log2(workgroup count)) <= subgroupSizeLog2. Shifting a 32-bit integer by
    // that much is not well defined and can collapse the clamp range, so the bit
    // count is capped at 31 and the shift is done on an unsigned literal.
    const uint32_t fixedPointBitsLeft = min(32u - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + subgroupSizeLog2, 31u);
    const float32_t maxBitPattern = float32_t((1u << fixedPointBitsLeft) - 1u);

    // NOTE(review): the reviewer proposed replacing this conversion with
    // `uint32_t((val-lumaMinLog2)*maxIncrement_over_lumaRangeLog2+float_t(0.5))`,
    // with the scale chosen so that N workgroups adding their worst case cannot overflow.
    const uint32_t lumaSumBitPattern = uint32_t(clamp((val - minLog2) * rangeLog2, 0.f, maxBitPattern));

    // NOTE(review): the reviewer states the same address should always be added to;
    // any staggering should be based on the workgroup index modulo.
    val_accessor.atomicAdd(index & ((1u << subgroupSizeLog2) - 1u), lumaSumBitPattern);
}
|
||
// Reads one slot of `val_accessor` and maps it back with
// `stored / rangeLog2 + minLog2`.
//
// val_accessor - accessor exposing `get(slot)`
// index        - slot selector; only its low gl_SubgroupSizeLog2() bits are used
// minLog2      - offset added after the division
// rangeLog2    - divisor applied to the stored value
float_t downloadFloat(
    NBL_REF_ARG(ValueAccessor) val_accessor,
    uint32_t index,
    float_t minLog2,
    float_t rangeLog2
)
{
    const uint32_t slotMask = (1 << glsl::gl_SubgroupSizeLog2()) - 1;
    const float_t storedValue = (float_t)val_accessor.get(index & slotMask);

    // NOTE(review): the reviewer pointed out that the slot read here only matches the
    // slot written because the upload index happened to be constant; if staggering is
    // wanted, the whole subgroup should load the values and subgroup-reduce them.
    return storedValue / rangeLog2 + minLog2;
}
devshgraphicsprogramming marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
// Per-invocation sampling step: decodes a 2D coordinate from the invocation
// index, computes the clamped log2 luma at that texel, reduces it over the
// workgroup and has one invocation upload the workgroup sum.
//
// window       - metering window forwarded to computeLumaLog2
// val          - accessor receiving the uploaded sum
// tex          - texture accessor forwarded to computeLumaLog2
// sdata        - accessor used by the workgroup reduction
// tileOffset   - offset added to the decoded coordinate
// viewportSize - divisor applied to the offset coordinate
//
// NOTE(review): the reviewer asked why `tileOffset` comes from outside, describing it
// as a byproduct of the workgroup ID and the morton-decoded (workgroupSize-1)+1.
void sampleLuma(
    NBL_CONST_REF_ARG(MeteringWindow) window,
    NBL_REF_ARG(ValueAccessor) val,
    NBL_REF_ARG(TexAccessor) tex,
    NBL_REF_ARG(SharedAccessor) sdata,
    float_t2 tileOffset,
    float_t2 viewportSize
)
{
    const uint32_t tid = workgroup::SubgroupContiguousIndex();
    const uint32_t2 coord = {
        morton2d_decode_x(tid),
        morton2d_decode_y(tid)
    };

    const float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
    const float_t lumaLog2 = computeLumaLog2(window, tex, shiftedCoord);
    const float_t lumaSum = reduction(lumaLog2, sdata);

    // NOTE(review): the reviewer finds picking the first invocation semantically
    // cleaner than the last, since the value comes from a reduction.
    if (tid == GroupSize - 1)
    {
        // NOTE(review): the reviewer asked for the workgroup count and workgroup
        // coordinate/index to be function arguments (virtual workgroups,
        // persistent threads).
        const uint32_t3 workgroupCount = glsl::gl_NumWorkGroups();

        // NOTE(review): this value does not depend on the workgroup ID, so every
        // workgroup computes the same index (flagged by the reviewer).
        const uint32_t workgroupIndex = (workgroupCount.x * workgroupCount.y * workgroupCount.z) / 64;

        const float_t minLog2 = log2(lumaMinMax.x);
        const float_t rangeLog2 = log2(lumaMinMax.y / lumaMinMax.x);
        uploadFloat(val, workgroupIndex, lumaSum, minLog2, rangeLog2);
    }
}
|
||
// Gather step: every subgroup invocation downloads the slot selected by its
// invocation ID, the values are summed with `glsl::subgroupAdd`, and the sum is
// divided by 2^fixedPointBitsLeft and by `sampleCount`.
//
// val - accessor the sampling step uploaded into
float_t gatherLuma(
    NBL_REF_ARG(ValueAccessor) val
)
{
    const uint32_t tid = glsl::gl_SubgroupInvocationID();
    const float_t slotValue = downloadFloat(
        val,
        tid,
        log2(lumaMinMax.x),
        log2(lumaMinMax.y / lumaMinMax.x)
    );
    const float_t luma = glsl::subgroupAdd(slotValue);

    // NOTE(review): the reviewer says the normalisation must use the number of samples
    // taken during the sampling step and that the factor should be precomputed.
    const uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();

    // The formula reaches 32 or more whenever
    // ceil(log2(workgroup count)) <= gl_SubgroupSizeLog2(), and a signed `1 << 31`
    // is negative; cap the bit count at 31 and shift an unsigned literal so the
    // divisor is always a positive power of two.
    const uint32_t fixedPointBitsLeft = min(32u - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2(), 31u);

    return (luma / (1u << fixedPointBitsLeft)) / sampleCount;
}
|
||
float_t sampleCount; | ||
Comment on lines
+134
to
+137
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you want to compute and store the reciprocal of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. that was the purpose of the |
||
float_t2 lumaMinMax; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. don't do weird things we used to do in GLSL (due to no scalar layout), have a separate variable for min and max There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. also you should have the min and max precomputed with |
||
}; | ||
|
||
template<uint32_t GroupSize, uint16_t BinCount, typename HistogramAccessor, typename SharedAccessor, typename TexAccessor> | ||
struct median_meter { | ||
using int_t = typename SharedAccessor::type; | ||
using float_t = float32_t; | ||
using float_t2 = typename conditional<is_same_v<float_t, float32_t>, float32_t2, float16_t2>::type; | ||
using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type; | ||
using this_t = median_meter<GroupSize, BinCount, HistogramAccessor, SharedAccessor, TexAccessor>; | ||
|
||
// Factory: returns a meter whose `lumaMinMax` and `sampleCount` members are
// copied from the arguments.
static this_t create(float_t2 lumaMinMax, float_t sampleCount)
{
    this_t meter;
    meter.sampleCount = sampleCount;
    meter.lumaMinMax = lumaMinMax;
    return meter;
}
|
||
// Forwards `value` to `workgroup::inclusive_scan<plus<int_t>, GroupSize>` and
// returns its result; `sdata` is the accessor handed to the scan.
// NOTE(review): `value` is a float_t while the scan is instantiated for int_t, so
// any conversion happens implicitly at the call - confirm this is intended.
int_t inclusive_scan(float_t value, NBL_REF_ARG(SharedAccessor) sdata)
{
    const int_t scanned = workgroup::inclusive_scan<plus<int_t>, GroupSize>::template __call<SharedAccessor>(value, sdata);
    return scanned;
}
|
||
// Fetches the colour at the window-remapped coordinate, converts it to luma
// through `TexAccessor::toXYZ` and returns it clamped to
// [lumaMinMax.x, lumaMinMax.y].
//
// window       - scale/offset applied to `shiftedCoord` before the fetch
// tex          - accessor providing `get(uv)`
// shiftedCoord - coordinate to remap
float_t computeLuma(
    NBL_CONST_REF_ARG(MeteringWindow) window,
    NBL_REF_ARG(TexAccessor) tex,
    float_t2 shiftedCoord
)
{
    const float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset;
    const float_t3 color = tex.get(uvPos);
    const float_t unclampedLuma = (float_t)TexAccessor::toXYZ(color);

    return clamp(unclampedLuma, lumaMinMax.x, lumaMinMax.y);
}
|
||
// Maps `val` to an integer: `(val - minLog2) * rangeLog2`, clamped to
// [0, 2^fixedPointBitsLeft - 1] and converted to int_t.
//
// val       - value to convert
// minLog2   - subtracted from `val` before scaling
// rangeLog2 - factor the shifted value is multiplied by
int_t float2Int(
    float_t val,
    float_t minLog2,
    float_t rangeLog2
)
{
    const uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();

    // The formula reaches 32 or more whenever
    // ceil(log2(workgroup count)) <= gl_SubgroupSizeLog2(). Shifting a 32-bit integer
    // by that much is not well defined and can collapse the clamp range, so the bit
    // count is capped at 31 and the shift is done on an unsigned literal.
    const uint32_t fixedPointBitsLeft = min(32u - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2(), 31u);
    const float32_t maxBitPattern = float32_t((1u << fixedPointBitsLeft) - 1u);

    return int_t(clamp((val - minLog2) * rangeLog2, 0.f, maxBitPattern));
}
|
||
// Maps an integer back to a float as `val / rangeLog2 + minLog2`.
//
// val       - integer value to convert
// minLog2   - offset added after the division
// rangeLog2 - divisor applied to `val`
float_t int2Float(
    int_t val,
    float_t minLog2,
    float_t rangeLog2
)
{
    const float_t unscaled = val / rangeLog2;
    return unscaled + minLog2;
}
|
||
// Per-invocation sampling step of the histogram meter:
//  1. zeroes the first BinCount entries of `sdata`,
//  2. computes the clamped luma of this invocation's texel and atomically adds
//     its integer encoding to the matching bin in `sdata`,
//  3. runs an inclusive scan over the bins, GroupSize entries at a time, and
//     atomically adds each scanned value to `histo`.
//
// window       - metering window forwarded to computeLuma
// histo        - accessor receiving the scanned histogram via `atomicAdd`
// tex          - texture accessor forwarded to computeLuma
// sdata        - accessor used for the bins and by the scan
// tileOffset   - offset added to the decoded coordinate
// viewportSize - divisor applied to the offset coordinate
void sampleLuma(
    NBL_CONST_REF_ARG(MeteringWindow) window,
    NBL_REF_ARG(HistogramAccessor) histo,
    NBL_REF_ARG(TexAccessor) tex,
    NBL_REF_ARG(SharedAccessor) sdata,
    float_t2 tileOffset,
    float_t2 viewportSize
)
{
    const uint32_t tid = workgroup::SubgroupContiguousIndex();

    for (uint32_t vid = tid; vid < BinCount; vid += GroupSize) {
        sdata.set(vid, 0);
    }

    sdata.workgroupExecutionAndMemoryBarrier();

    const uint32_t2 coord = {
        morton2d_decode_x(tid),
        morton2d_decode_y(tid)
    };

    const float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
    const float_t luma = computeLuma(window, tex, shiftedCoord);

    const float_t lumaRange = lumaMinMax.y - lumaMinMax.x;
    const float_t binSize = lumaRange / BinCount;

    // A luma equal to lumaMinMax.y lands exactly on BinCount; clamp it into the
    // last bin so the atomic below stays within the BinCount entries cleared above.
    const uint32_t binIndex = min((uint32_t)((luma - lumaMinMax.x) / binSize), uint32_t(BinCount) - 1u);

    sdata.atomicAdd(binIndex, float2Int(luma, lumaMinMax.x, lumaRange));

    sdata.workgroupExecutionAndMemoryBarrier();

    float_t histogram_value;
    sdata.get(tid, histogram_value);

    sdata.workgroupExecutionAndMemoryBarrier();

    float_t sum = inclusive_scan(histogram_value, sdata);
    // Same bound as the later iterations: only invocations that map to a real bin
    // contribute to the output histogram.
    if (tid < BinCount) {
        histo.atomicAdd(tid, float2Int(sum, lumaMinMax.x, lumaRange));
    }

    const bool is_last_wg_invocation = tid == (GroupSize - 1);
    const static uint32_t RoundedBinCount = 1 + (BinCount - 1) / GroupSize;

    for (uint32_t i = 1; i < RoundedBinCount; i++) {
        const uint32_t keyBucketStart = GroupSize * i;
        const uint32_t vid = tid + keyBucketStart;

        // no if statement about the last iteration needed
        if (is_last_wg_invocation) {
            float_t beforeSum;
            sdata.get(keyBucketStart, beforeSum);
            sdata.set(keyBucketStart, beforeSum + sum);
        }

        // propagate last block tail to next block head and protect against subsequent scans stepping on each other's toes
        sdata.workgroupExecutionAndMemoryBarrier();

        // no aliasing anymore
        float_t atVid;
        sdata.get(vid, atVid);
        sum = inclusive_scan(atVid, sdata);
        if (vid < BinCount) {
            histo.atomicAdd(vid, float2Int(sum, lumaMinMax.x, lumaRange));
        }
    }
}
|
||
// Gather step of the histogram meter: copies the first BinCount entries of
// `histo` into `sdata`, then returns the average of the decoded entries found
// at 40% and 60% of BinCount.
//
// histo - accessor holding the values accumulated by the sampling step
// sdata - accessor used as the staging copy of the histogram
float_t gatherLuma(
    NBL_REF_ARG(HistogramAccessor) histo,
    NBL_REF_ARG(SharedAccessor) sdata
)
{
    const uint32_t tid = workgroup::SubgroupContiguousIndex();

    // `vid < BinCount` already bounds the index. The previous `vid & (BinCount - 1)`
    // mask was a no-op for power-of-two bin counts and read the wrong bin otherwise.
    for (uint32_t vid = tid; vid < BinCount; vid += GroupSize) {
        sdata.set(vid, histo.get(vid));
    }

    sdata.workgroupExecutionAndMemoryBarrier();

    // Truncate the fractional positions to integer bin indices explicitly.
    const uint32_t index40 = uint32_t(BinCount * 0.4);
    const uint32_t index60 = uint32_t(BinCount * 0.6);

    uint32_t percentile40, percentile60;
    sdata.get(index40, percentile40);
    sdata.get(index60, percentile60);

    const float_t lumaRange = lumaMinMax.y - lumaMinMax.x;
    return (int2Float(percentile40, lumaMinMax.x, lumaRange) + int2Float(percentile60, lumaMinMax.x, lumaRange)) / 2;
}
|
||
float_t sampleCount; | ||
devshgraphicsprogramming marked this conversation as resolved.
Show resolved
Hide resolved
|
||
float_t2 lumaMinMax; | ||
}; | ||
|
||
} | ||
} | ||
} | ||
|
||
#endif |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Even if the color computation is done in `float16_t`, that does not free you from doing the texture coordinate calculation in `float32_t`.