Autoexposure example restoration #728

Open: wants to merge 46 commits into base: master
Changes from 45 commits

Commits (46)
096e09d
Add luma_meter and tonemapper
nipunG314 Jul 19, 2024
4fd700f
Update submodule pointer
nipunG314 Jul 19, 2024
f93bb0f
Merge branch 'master' of github.com:Devsh-Graphics-Programming/Nabla …
nipunG314 Jul 20, 2024
6152f96
Merge branch 'master' of github.com:Devsh-Graphics-Programming/Nabla …
nipunG314 Jul 24, 2024
2311521
Merge branch 'master' of github.com:Devsh-Graphics-Programming/Nabla …
nipunG314 Jul 26, 2024
52e7ab2
Convert morton.h to hlsl
nipunG314 Aug 1, 2024
1cc26bd
Fix HLSL morton code
nipunG314 Aug 2, 2024
6922d0c
Create geom_luma_meter and computeLuma
nipunG314 Aug 5, 2024
6e6eb64
Merge branch 'master' of github.com:Devsh-Graphics-Programming/Nabla …
nipunG314 Aug 5, 2024
603a92f
Add gatherLuma method
nipunG314 Aug 7, 2024
810a6ac
Add getGatheredLuma()
nipunG314 Aug 8, 2024
69a73c1
Add reinhard and aces hlsl operators
nipunG314 Aug 8, 2024
72e0bc5
Merge branch 'master' of github.com:Devsh-Graphics-Programming/Nabla …
nipunG314 Aug 13, 2024
4c70cf5
cast mask values to correct type
nipunG314 Aug 13, 2024
d9d6dd8
Add create methods to tonemapper params
nipunG314 Aug 16, 2024
305f7e7
Remove getGatheredLuma from luma_meter
nipunG314 Aug 16, 2024
3f4f6e9
Separate LumaMeteringWindow into a common header
nipunG314 Aug 20, 2024
515512a
Simplify luma_meter naming
nipunG314 Aug 20, 2024
77f5756
Merge branch 'master' of github.com:Devsh-Graphics-Programming/Nabla …
nipunG314 Aug 20, 2024
1919e53
Simplify morton code
nipunG314 Aug 20, 2024
4c58238
Add missing comment
nipunG314 Aug 20, 2024
3c3f8b8
Refactor tonemapping operators
nipunG314 Aug 20, 2024
b0e0750
Small fixes
nipunG314 Aug 20, 2024
e8e46c9
Use promote to simplify code
nipunG314 Aug 21, 2024
ee5affe
Add static create to MeteringWindow
nipunG314 Aug 21, 2024
56389f4
Infer sample count from viewportSize
nipunG314 Aug 21, 2024
23771d1
Rename gatherLuma, add toXYZ method and templatize the float type
nipunG314 Aug 22, 2024
ac39039
Add uploadFloat, downloadFloat and gatherLuma
nipunG314 Aug 26, 2024
49a8049
Normalize tileOffset and coord to uv before computing Luma
nipunG314 Aug 27, 2024
8a10ae2
Simplify return statement
nipunG314 Sep 29, 2024
6b01b6d
Update submodule pointers
nipunG314 Dec 11, 2024
4129afe
Merge branch 'master' of github.com:Devsh-Graphics-Programming/Nabla …
nipunG314 Dec 11, 2024
f95f1c1
Update submodule pointer
nipunG314 Dec 11, 2024
1a58273
Update submodule pointer
nipunG314 Dec 13, 2024
b6e1f57
Update submodule pointer
nipunG314 Dec 13, 2024
5239c29
Update submodule pointer
nipunG314 Jan 14, 2025
0df9ba6
Merge branch 'master' of github.com:Devsh-Graphics-Programming/Nabla …
nipunG314 Jan 14, 2025
06c915e
stop rolling back my modules!
Jan 21, 2025
90d20c4
point submodule at head
Jan 21, 2025
26a4ed2
Merge branch 'master' of github.com:Devsh-Graphics-Programming/Nabla …
nipunG314 Feb 22, 2025
4edd38c
Add capabilities for atomic ops
nipunG314 Mar 13, 2025
f1e3e98
Fix luma_meter
nipunG314 Mar 13, 2025
ce2ca41
Merge branch 'autoexposue_ex' of github.com:Devsh-Graphics-Programmin…
nipunG314 Mar 13, 2025
f1b7d17
Add median_luma_meter
nipunG314 Mar 16, 2025
83ac633
Update submodule pointer
nipunG314 Mar 16, 2025
2b5e502
Make changes to luma_meter
nipunG314 Mar 17, 2025
2 changes: 1 addition & 1 deletion examples_tests
Submodule examples_tests updated 52 files
+188 −0 11_FFT/README.md
+1 −0 11_FFT/app_resources/common.hlsl
+10 −6 11_FFT/app_resources/shader.comp.hlsl
+4 −1 11_FFT/main.cpp
+0 −12 23_Autoexposure/CMakeLists.txt
+0 −177 23_Autoexposure/main.cpp
+3 −3 25_FilterTest/main.cpp
+2 −1 26_Autoexposure/CMakeLists.txt
+68 −0 26_Autoexposure/app_resources/avg_luma_meter.comp.hlsl
+88 −0 26_Autoexposure/app_resources/avg_luma_tonemap.comp.hlsl
+28 −0 26_Autoexposure/app_resources/common.hlsl
+72 −0 26_Autoexposure/app_resources/median_luma_meter.comp.hlsl
+93 −0 26_Autoexposure/app_resources/median_luma_tonemap.comp.hlsl
+20 −0 26_Autoexposure/app_resources/present.frag.hlsl
+0 −0 26_Autoexposure/config.json.template
+1,134 −0 26_Autoexposure/main.cpp
+0 −0 26_Autoexposure/pipeline.groovy
+0 −11 26_Blur/app_resources/common.hlsl
+0 −160 26_Blur/app_resources/shader.comp.hlsl
+0 −782 26_Blur/main.cpp
+8 −7 28_FFTBloom/app_resources/common.hlsl
+9 −8 28_FFTBloom/app_resources/fft_common.hlsl
+76 −58 28_FFTBloom/app_resources/fft_convolve_ifft.hlsl
+0 −7 28_FFTBloom/app_resources/image_fft_first_axis.hlsl
+0 −7 28_FFTBloom/app_resources/image_ifft_first_axis.hlsl
+0 −6 28_FFTBloom/app_resources/kernel_fft_first_axis.hlsl
+2 −11 28_FFTBloom/app_resources/kernel_fft_second_axis.hlsl
+2 −3 28_FFTBloom/app_resources/kernel_spectrum_normalize.hlsl
+55 −78 28_FFTBloom/main.cpp
+9 −9 30_ComputeShaderPathTracer/main.cpp
+0 −4 62_CAD/DrawResourcesFiller.cpp
+1 −11 62_CAD/DrawResourcesFiller.h
+1 −2 62_CAD/Hatch.cpp
+1 −3 62_CAD/Hatch.h
+1 −4 62_CAD/Polyline.cpp
+5 −44 62_CAD/Polyline.h
+7 −7 62_CAD/SingleLineText.cpp
+2 −3 62_CAD/SingleLineText.h
+33 −48 62_CAD/main.cpp
+0 −4 62_CAD/shaders/globals.hlsl
+0 −2 62_CAD/shaders/main_pipeline/common.hlsl
+0 −4 62_CAD/shaders/main_pipeline/fragment.hlsl
+23 −33 62_CAD/shaders/main_pipeline/fragment_shader.hlsl
+5 −3 62_CAD/shaders/main_pipeline/fragment_shader_debug.hlsl
+10 −22 62_CAD/shaders/main_pipeline/resolve_alphas.hlsl
+0 −5 62_CAD/shaders/main_pipeline/vertex_shader.hlsl
+1 −11 67_RayQueryGeometry/main.cpp
+0 −27 68_JpegLoading/CMakeLists.txt
+0 −193 68_JpegLoading/main.cpp
+0 −50 68_JpegLoading/pipeline.groovy
+10 −10 70_FLIPFluids/main.cpp
+2 −3 CMakeLists.txt
2 changes: 1 addition & 1 deletion include/nbl/asset/utils/IMeshPacker.h
@@ -6,7 +6,7 @@
#define __NBL_ASSET_I_MESH_PACKER_H_INCLUDED__

#include "nbl/asset/utils/IMeshManipulator.h"
#include "nbl/core/math/morton.h"
#include "nbl/builtin/hlsl/math/morton.hlsl"

namespace nbl
{
35 changes: 35 additions & 0 deletions include/nbl/builtin/hlsl/luma_meter/common.hlsl
@@ -0,0 +1,35 @@
// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
// This file is part of the "Nabla Engine".
// For conditions of distribution and use, see copyright notice in nabla.h

#ifndef _NBL_BUILTIN_HLSL_LUMA_METER_COMMON_INCLUDED_
#define _NBL_BUILTIN_HLSL_LUMA_METER_COMMON_INCLUDED_

#include "nbl/builtin/hlsl/cpp_compat.hlsl"

namespace nbl
{
namespace hlsl
{
namespace luma_meter
{

struct MeteringWindow
{
using this_t = MeteringWindow;
float32_t2 meteringWindowScale;
float32_t2 meteringWindowOffset;

static this_t create(float32_t2 scale, float32_t2 offset) {
this_t retval;
retval.meteringWindowScale = scale;
retval.meteringWindowOffset = offset;
return retval;
}
};

}
}
}

#endif
293 changes: 293 additions & 0 deletions include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
@@ -0,0 +1,293 @@
// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
// This file is part of the "Nabla Engine".
// For conditions of distribution and use, see copyright notice in nabla.h

#ifndef _NBL_BUILTIN_HLSL_LUMA_METER_INCLUDED_
#define _NBL_BUILTIN_HLSL_LUMA_METER_INCLUDED_

#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl"
#include "nbl/builtin/hlsl/glsl_compat/subgroup_arithmetic.hlsl"
#include "nbl/builtin/hlsl/workgroup/basic.hlsl"
#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl"
#include "nbl/builtin/hlsl/type_traits.hlsl"
#include "nbl/builtin/hlsl/math/morton.hlsl"
#include "nbl/builtin/hlsl/luma_meter/common.hlsl"

namespace nbl
{
namespace hlsl
{
namespace luma_meter
{

template<uint32_t GroupSize, typename ValueAccessor, typename SharedAccessor, typename TexAccessor>
struct geom_meter {
using float_t = typename SharedAccessor::type;
using float_t2 = typename conditional<is_same_v<float_t, float32_t>, float32_t2, float16_t2>::type;
Review comment (Member Author): even if doing color computation in float16_t, this doesn't free you from doing the texture coordinate calc in float32_t
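
A minimal illustration of that point (hypothetical, not part of the PR): only the luma math follows float_t, anything feeding texture coordinates stays 32-bit.

// illustrative only: coordinate math keeps full precision regardless of float_t
using coord_t2 = float32_t2; // used for uvPos / shiftedCoord style computations
// float_t and float_t3 stay conditional on SharedAccessor::type for the luma math itself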

using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
using this_t = geom_meter<GroupSize, ValueAccessor, SharedAccessor, TexAccessor>;

static this_t create(float_t2 lumaMinMax, float_t sampleCount)
{
this_t retval;
retval.lumaMinMax = lumaMinMax;
retval.sampleCount = sampleCount;
return retval;
}

float_t reduction(float_t value, NBL_REF_ARG(SharedAccessor) sdata)
{
return workgroup::reduction < plus < float_t >, GroupSize >::
template __call <SharedAccessor>(value, sdata);
}

float_t computeLumaLog2(
NBL_CONST_REF_ARG(MeteringWindow) window,
NBL_REF_ARG(TexAccessor) tex,
float_t2 shiftedCoord
)
{
float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset;
Review comment (Member Author): precompute a scale and offset from the Metering Window + the number of workgroups + the workgroup size, to apply to a uint16_t2 unnormalized coordinate.

Right now you have way too many variables being manipulated per-pixel (sketch below):

  • tile offset
  • viewport size
  • metering window
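
A hypothetical sketch of that consolidation (uvScale, uvBias and toUV are illustrative names, not from the PR): fold the metering window and viewport into one scale and bias computed once, so the per-pixel work is a single multiply-add on an unnormalized integer coordinate.

// precomputed once from the metering window, workgroup count and workgroup size, not per pixel
float32_t2 uvScale; // e.g. meteringWindowScale / viewportSize
float32_t2 uvBias;  // e.g. meteringWindowOffset

// global unnormalized pixel coordinate (tile offset already folded into unnormCoord)
float32_t2 toUV(uint16_t2 unnormCoord)
{
    // a single multiply-add per pixel
    return float32_t2(unnormCoord) * uvScale + uvBias;
}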

float_t3 color = tex.get(uvPos);
float_t luma = (float_t)TexAccessor::toXYZ(color);

luma = clamp(luma, lumaMinMax.x, lumaMinMax.y);

return max(log2(luma), log2(lumaMinMax.x));
Review comment (Member Author): why max? you already clamped!

Review comment (Member Author): btw, if you have the log2 already precomputed then you can do

return min(spirv::nMax(log2(luma), lumaMinLog2), lumaMaxLog2);

nMax is the special SPIR-V version of max that will return the other operand if one of them is NaN (which happens on log of a negative value or 0)

Review comment (Contributor): Fixed

}

void uploadFloat(
NBL_REF_ARG(ValueAccessor) val_accessor,
uint32_t index,
Review comment (Member Author): don't give bad names to variables, this needs to be called workGroupIndex

Review comment (Member Author): this variable shouldn't even exist, because the MEAN meter didn't output to a different address per X/Y workgroup coordinate

Review comment (Contributor): Fixed

float_t val,
float_t minLog2,
float_t rangeLog2
Review comment on lines +63 to +64 (Member Author): should already be precomputed as members

)
{
uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
Review comment (Member Author): take the workgroup count and workgroup XYZ coordinate (or workgroup index) from outside (as function arguments), otherwise in the presence of solutions such as virtual workgroups or persistent threads this whole thing will fall apart
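
A minimal sketch of the suggested signature change (parameter names are illustrative): the caller supplies the workgroup count and index, so virtual-workgroup or persistent-thread dispatchers keep working without the function reading glsl::gl_NumWorkGroups() itself.

void uploadFloat(
    NBL_REF_ARG(ValueAccessor) val_accessor,
    uint32_t workGroupCount, // total workgroups in the metering dispatch, passed in by the caller
    uint32_t workGroupIndex, // flattened workgroup index, passed in by the caller
    float_t val
)
{
    // encode and atomically add as before, just without any gl_NumWorkGroups() reads in here
}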

uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();

uint32_t lumaSumBitPattern = uint32_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1)));
Review comment on lines +69 to +71 (Member Author): let's write some docs for this...

The val was produced by a workgroup reduction performed over values in the [MinLog2, MaxLog2] range, which means the scaledLogLuma (the variable that should hold (val - minLog2) * rangeLog2) is between 0 and WorkGroupSize.

This value is atomically added by N workgroups.

You now want to represent it in fixed point during the atomic add, but not be vulnerable to overflow; this means the worst case is adding N times WorkGroupSize.

This means we need to multiply by (2^32 - 1)/N precomputed as a float, or alternatively round N up to a PoT and see how many bits are left (512 workgroups means 9 bits, so 23 are left). To avoid rounding precision errors, the PoT method is chosen.

I have no clue where you're getting +SubgroupSizeLog2 from.

Also the value of (1 << fixedPointBitsLeft) - 1 must be precomputed in create and stored as a member.

It should be as easy as

const uint32_t scaledLumaLog2BitPattern = uint32_t((val - lumaMinLog2) * maxIncrement_over_lumaRangeLog2 + float_t(0.5));

where maxIncrement = (0x1u << (32u - uint32_t(ceil(log2(WorkGroupCount * WorkGroupSize))))) - 1;
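
A hypothetical sketch of that precomputation (member and parameter names are illustrative, not from the PR): the fixed-point scale is derived once in create() from the planned number of sampling invocations, so the per-workgroup encode becomes a single multiply-add.

static this_t create(float_t lumaMinLog2, float_t lumaMaxLog2, uint32_t samplingInvocationCount)
{
    this_t retval;
    // round the invocation count up to a PoT and give the remaining bits of the 32-bit counter to the fixed-point sum
    const uint32_t usedBits = uint32_t(ceil(log2(float32_t(samplingInvocationCount))));
    retval.maxIncrement = (0x1u << (32u - usedBits)) - 1u;
    retval.maxIncrement_over_lumaRangeLog2 = float_t(retval.maxIncrement) / (lumaMaxLog2 - lumaMinLog2);
    retval.lumaMinLog2 = lumaMinLog2;
    retval.lumaMaxLog2 = lumaMaxLog2;
    return retval;
}

// then, per workgroup, exactly the reviewer's one-liner:
// const uint32_t scaledLumaLog2BitPattern = uint32_t((val - lumaMinLog2) * maxIncrement_over_lumaRangeLog2 + float_t(0.5));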


val_accessor.atomicAdd(index & ((1 << glsl::gl_SubgroupSizeLog2()) - 1), lumaSumBitPattern);
Review comment (Member Author): no, always the same address should be added to; if you wanted to stagger, then you should stagger based on a modulo of the workgroup index
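
A small illustration of the two options (StaggerCount and workGroupIndex are hypothetical names):

// MEAN meter: every workgroup adds to the same address
val_accessor.atomicAdd(0u, lumaSumBitPattern);
// or, if staggering contention is wanted, key the slot off the workgroup index (not the thread index)
val_accessor.atomicAdd(workGroupIndex & (StaggerCount - 1u), lumaSumBitPattern);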

}

float_t downloadFloat(
NBL_REF_ARG(ValueAccessor) val_accessor,
uint32_t index,
float_t minLog2,
float_t rangeLog2
)
{
float_t luma = (float_t)val_accessor.get(index & ((1 << glsl::gl_SubgroupSizeLog2()) - 1));
return luma / rangeLog2 + minLog2;
Review comment on lines +83 to +84 (Member Author): again, you're getting random floats based on a workgroup index which thankfully was always the same (rare case of two wrongs making a right).

Again, if you wanted to stagger, you should use the entire subgroup to load the values, then subgroup-reduce them.

Just converting to float_t is not the correct way to decode; you should divide by the maxIncrement.
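
A hypothetical decode sketch along those lines (illustrative; assumes the slot count equals the subgroup size, that the accessor stores uint32_t fixed-point values, and that maxIncrement is kept as a member as suggested above):

float_t downloadFloat(NBL_REF_ARG(ValueAccessor) val_accessor)
{
    // each subgroup invocation loads one slot, then the whole subgroup reduces them
    const uint32_t fixedPointSum = glsl::subgroupAdd(val_accessor.get(glsl::gl_SubgroupInvocationID()));
    // decode by dividing by maxIncrement (the fixed-point scale), not by a bare cast to float
    return float_t(fixedPointSum) / float_t(maxIncrement);
}

The caller would still remap the result into the [lumaMinLog2, lumaMaxLog2] range and normalize by the sample count, as the later comments point out.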

}

void sampleLuma(
NBL_CONST_REF_ARG(MeteringWindow) window,
NBL_REF_ARG(ValueAccessor) val,
NBL_REF_ARG(TexAccessor) tex,
NBL_REF_ARG(SharedAccessor) sdata,
float_t2 tileOffset,
Review comment (Member Author): why is tileOffset being provided from the outside? it's a byproduct of your workgroupID, and of workgroupSize-1 decoded as morton, +1 in each dimension
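
A hypothetical sketch of deriving it in place (assumes glsl::gl_WorkGroupID() is exposed by the same glsl_compat header as gl_NumWorkGroups()):

// tile size = morton decode of the last invocation index, +1 in each dimension
const uint32_t2 tileSize = uint32_t2(morton2d_decode_x(GroupSize - 1), morton2d_decode_y(GroupSize - 1)) + uint32_t2(1, 1);
// the tile offset is then just a byproduct of the workgroup's XY coordinate
const float32_t2 tileOffset = float32_t2(glsl::gl_WorkGroupID().xy * tileSize);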

float_t2 viewportSize
)
{
uint32_t tid = workgroup::SubgroupContiguousIndex();
uint32_t2 coord = {
morton2d_decode_x(tid),
morton2d_decode_y(tid)
};

float_t luma = 0.0f;
float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
luma = computeLumaLog2(window, tex, shiftedCoord);
Review comment (Member Author): luma should be called lumaLog2

Review comment (Contributor): Fixed

float_t lumaSum = reduction(luma, sdata);

if (tid == GroupSize - 1) {
Review comment (Member Author): it's somewhat semantically cleaner to pick the first invocation instead of the last, especially since it's a reduction you performed before

Review comment (Contributor): Fixed

uint32_t3 workgroupCount = glsl::gl_NumWorkGroups();
Review comment (Member Author): take the workgroup count and workgroup XYZ coordinate (or workgroup index) from outside (as function arguments), otherwise in the presence of solutions such as virtual workgroups or persistent threads this whole thing will fall apart

uint32_t workgroupIndex = (workgroupCount.x * workgroupCount.y * workgroupCount.z) / 64;
Review comment (Member Author): you're computing the wrong thing, every workgroup gets the same index 🤦

Also the original code was touching the same address with every workgroup for the MEAN meter mode
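
For reference, a flattened per-workgroup index would look like this (hypothetical sketch; gl_WorkGroupID() assumed available), though for the MEAN meter the comment above suggests a single shared address anyway:

const uint32_t3 wgID = glsl::gl_WorkGroupID();
const uint32_t workgroupIndex = (wgID.z * workgroupCount.y + wgID.y) * workgroupCount.x + wgID.x;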


uploadFloat(
val,
workgroupIndex,
lumaSum,
log2(lumaMinMax.x),
log2(lumaMinMax.y / lumaMinMax.x)
);
}
}

float_t gatherLuma(
NBL_REF_ARG(ValueAccessor) val
)
{
uint32_t tid = glsl::gl_SubgroupInvocationID();
float_t luma = glsl::subgroupAdd(
downloadFloat(
val,
tid,
log2(lumaMinMax.x),
log2(lumaMinMax.y / lumaMinMax.x)
)
);

uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
Review comment on lines +131 to +132 (Member Author): you're supposed to normalize by the number of samples you took during the sampling step; your workGroupCount here is NOT that value, it's the number of workgroups you're exposing with

Review comment (Member Author): you must precompute the fixedPointBitsLeft in the create method (and it needs to know how many invocations you'll be running in the sample step)


return (luma / (1 << fixedPointBitsLeft)) / sampleCount;
}

float_t sampleCount;
Review comment on lines +134 to +137 (Member Author): you want to compute and store the reciprocal of sampleCount and of 1 << fixedPointBitsLeft

Review comment (Member Author): that was the purpose of the rcpFirstPassWGCount variable in the old GLSL
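
A minimal sketch of what those precomputed members could look like (illustrative names, assuming create() also learns the sample count):

float_t rcpSampleCount;     // 1.0 / sampleCount, precomputed in create()
float_t rcpFixedPointScale; // 1.0 / float_t(1u << fixedPointBitsLeft), precomputed in create()
// gatherLuma() then becomes two multiplies instead of two divisions:
// return luma * rcpFixedPointScale * rcpSampleCount;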

float_t2 lumaMinMax;
Review comment (Member Author): don't do weird things we used to do in GLSL (due to no scalar layout), have a separate variable for min and max

Review comment (Member Author): also you should have the min and max precomputed with log2 already applied
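
A tiny sketch of the member layout being asked for (hypothetical, not in the PR):

float_t lumaMinLog2; // log2 of the minimum luma, precomputed in create()
float_t lumaMaxLog2; // log2 of the maximum luma, precomputed in create()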

};

template<uint32_t GroupSize, uint16_t BinCount, typename HistogramAccessor, typename SharedAccessor, typename TexAccessor>
struct median_meter {
using int_t = typename SharedAccessor::type;
using float_t = float32_t;
using float_t2 = typename conditional<is_same_v<float_t, float32_t>, float32_t2, float16_t2>::type;
using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
using this_t = median_meter<GroupSize, BinCount, HistogramAccessor, SharedAccessor, TexAccessor>;

static this_t create(float_t2 lumaMinMax, float_t sampleCount) {
this_t retval;
retval.lumaMinMax = lumaMinMax;
retval.sampleCount = sampleCount;
return retval;
}

int_t inclusive_scan(float_t value, NBL_REF_ARG(SharedAccessor) sdata) {
return workgroup::inclusive_scan < plus < int_t >, GroupSize >::
template __call <SharedAccessor>(value, sdata);
}

float_t computeLuma(
NBL_CONST_REF_ARG(MeteringWindow) window,
NBL_REF_ARG(TexAccessor) tex,
float_t2 shiftedCoord
) {
float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset;
float_t3 color = tex.get(uvPos);
float_t luma = (float_t)TexAccessor::toXYZ(color);

return clamp(luma, lumaMinMax.x, lumaMinMax.y);
}

int_t float2Int(
float_t val,
float_t minLog2,
float_t rangeLog2
) {
uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();

return int_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1)));
}

float_t int2Float(
int_t val,
float_t minLog2,
float_t rangeLog2
) {
return val / rangeLog2 + minLog2;
}

void sampleLuma(
NBL_CONST_REF_ARG(MeteringWindow) window,
NBL_REF_ARG(HistogramAccessor) histo,
NBL_REF_ARG(TexAccessor) tex,
NBL_REF_ARG(SharedAccessor) sdata,
float_t2 tileOffset,
float_t2 viewportSize
) {
uint32_t tid = workgroup::SubgroupContiguousIndex();

for (uint32_t vid = tid; vid < BinCount; vid += GroupSize) {
sdata.set(vid, 0);
}

sdata.workgroupExecutionAndMemoryBarrier();

uint32_t2 coord = {
morton2d_decode_x(tid),
morton2d_decode_y(tid)
};

float_t luma = 0.0f;
float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
luma = computeLuma(window, tex, shiftedCoord);

float_t binSize = (lumaMinMax.y - lumaMinMax.x) / BinCount;
uint32_t binIndex = (uint32_t)((luma - lumaMinMax.x) / binSize);

sdata.atomicAdd(binIndex, float2Int(luma, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x));

sdata.workgroupExecutionAndMemoryBarrier();

float_t histogram_value;
sdata.get(tid, histogram_value);

sdata.workgroupExecutionAndMemoryBarrier();

float_t sum = inclusive_scan(histogram_value, sdata);
histo.atomicAdd(tid, float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x));

const bool is_last_wg_invocation = tid == (GroupSize - 1);
const static uint32_t RoundedBinCount = 1 + (BinCount - 1) / GroupSize;

for (int i = 1; i < RoundedBinCount; i++) {
uint32_t keyBucketStart = GroupSize * i;
uint32_t vid = tid + keyBucketStart;

// no if statement about the last iteration needed
if (is_last_wg_invocation) {
float_t beforeSum;
sdata.get(keyBucketStart, beforeSum);
sdata.set(keyBucketStart, beforeSum + sum);
}

// propagate last block tail to next block head and protect against subsequent scans stepping on each other's toes
sdata.workgroupExecutionAndMemoryBarrier();

// no aliasing anymore
float_t atVid;
sdata.get(vid, atVid);
sum = inclusive_scan(atVid, sdata);
if (vid < BinCount) {
histo.atomicAdd(vid, float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x));
}
}
}

float_t gatherLuma(
NBL_REF_ARG(HistogramAccessor) histo,
NBL_REF_ARG(SharedAccessor) sdata
) {
uint32_t tid = workgroup::SubgroupContiguousIndex();

for (uint32_t vid = tid; vid < BinCount; vid += GroupSize) {
sdata.set(
vid,
histo.get(vid & (BinCount - 1))
);
}

sdata.workgroupExecutionAndMemoryBarrier();

uint32_t percentile40, percentile60;
sdata.get(BinCount * 0.4, percentile40);
sdata.get(BinCount * 0.6, percentile60);

return (int2Float(percentile40, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x) + int2Float(percentile60, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)) / 2;
}

float_t sampleCount;
float_t2 lumaMinMax;
};

}
}
}

#endif