Improved Scan #855

Open · wants to merge 22 commits into master · changes shown from 11 commits
@@ -9,6 +9,7 @@

#include "nbl/builtin/hlsl/subgroup/basic.hlsl"
#include "nbl/builtin/hlsl/subgroup/arithmetic_portability_impl.hlsl"
#include "nbl/builtin/hlsl/concepts.hlsl"


namespace nbl
17 changes: 17 additions & 0 deletions include/nbl/builtin/hlsl/subgroup/ballot.hlsl
@@ -37,6 +37,23 @@ uint32_t ElectedSubgroupInvocationID() {
return glsl::subgroupBroadcastFirst<uint32_t>(glsl::gl_SubgroupInvocationID());
}

template<uint32_t SubgroupSizeLog2>
struct Configuration
{
using mask_t = conditional_t<SubgroupSizeLog2 < 7, conditional_t<SubgroupSizeLog2 < 6, uint32_t1, uint32_t2>, uint32_t4>;

NBL_CONSTEXPR_STATIC_INLINE uint16_t Size = 0x1u << SubgroupSizeLog2;
};

template<class T>
struct is_configuration : bool_constant<false> {};

template<uint32_t N>
struct is_configuration<Configuration<N> > : bool_constant<true> {};

template<typename T>
NBL_CONSTEXPR bool is_configuration_v = is_configuration<T>::value;

}
}
}
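For orientation, a hedged usage sketch (not part of the diff, alias names are illustrative) of how the new Configuration and is_configuration_v trait are meant to be consumed:

// 32-wide subgroup: SubgroupSizeLog2 = 5, so Size == 32 and mask_t == uint32_t1 (one word covers the ballot)
using config32_t = nbl::hlsl::subgroup::Configuration<5>;
// 64-wide subgroup: SubgroupSizeLog2 = 6, so Size == 64 and mask_t == uint32_t2
using config64_t = nbl::hlsl::subgroup::Configuration<6>;
// the trait exists so templated code can constrain itself to real configurations,
// e.g. via NBL_PRIMARY_REQUIRES(subgroup::is_configuration_v<Config>) as the next file does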
45 changes: 45 additions & 0 deletions include/nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl
@@ -0,0 +1,45 @@
// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O.
// This file is part of the "Nabla Engine".
// For conditions of distribution and use, see copyright notice in nabla.h
#ifndef _NBL_BUILTIN_HLSL_SUBGROUP2_ARITHMETIC_PORTABILITY_INCLUDED_
#define _NBL_BUILTIN_HLSL_SUBGROUP2_ARITHMETIC_PORTABILITY_INCLUDED_


#include "nbl/builtin/hlsl/device_capabilities_traits.hlsl"

#include "nbl/builtin/hlsl/subgroup/basic.hlsl"
#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability_impl.hlsl"
#include "nbl/builtin/hlsl/concepts.hlsl"


namespace nbl
{
namespace hlsl
{
namespace subgroup2
{

template<typename Config, class BinOp, int32_t _ItemsPerInvocation=1, class device_capabilities=void NBL_PRIMARY_REQUIRES(subgroup::is_configuration_v<Config>)
struct ArithmeticParams
{
using config_t = Config;
using binop_t = BinOp;
using scalar_t = typename BinOp::type_t; // BinOp is expected to operate on the scalar type
using type_t = conditional_t<_ItemsPerInvocation<2, scalar_t, vector<scalar_t, _ItemsPerInvocation> >;

NBL_CONSTEXPR_STATIC_INLINE int32_t ItemsPerInvocation = _ItemsPerInvocation;
NBL_CONSTEXPR_STATIC_INLINE bool UseNativeIntrinsics = device_capabilities_traits<device_capabilities>::shaderSubgroupArithmetic /*&& some heuristic for when it's faster*/;
};

template<typename Params>
struct reduction : impl::reduction<typename Params::binop_t,typename Params::type_t,Params::ItemsPerInvocation,Params::UseNativeIntrinsics> {};
template<typename Params>
struct inclusive_scan : impl::inclusive_scan<typename Params::binop_t,typename Params::type_t,Params::ItemsPerInvocation,Params::UseNativeIntrinsics> {};
template<typename Params>
struct exclusive_scan : impl::exclusive_scan<typename Params::binop_t,typename Params::type_t,Params::ItemsPerInvocation,Params::UseNativeIntrinsics> {};

}
}
}

#endif
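A hedged usage sketch (illustrative only, not from the PR) of how ArithmeticParams composes with a binop; the plus functor is assumed to come from nbl/builtin/hlsl/functional.hlsl and the function name is hypothetical:

using config_t = nbl::hlsl::subgroup::Configuration<5>;          // 32-wide subgroup
using binop_t  = nbl::hlsl::plus<float32_t>;                     // scalar binop, type_t == float32_t
using params_t = nbl::hlsl::subgroup2::ArithmeticParams<config_t, binop_t, 2, void>; // 2 items per invocation

// params_t::type_t is float32_t2 because ItemsPerInvocation == 2
float32_t reduceTwoItems(float32_t2 value)
{
    nbl::hlsl::subgroup2::reduction<params_t> r;
    return r(value); // combines both items per invocation, then reduces across the subgroup
}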
144 changes: 144 additions & 0 deletions include/nbl/builtin/hlsl/subgroup2/arithmetic_portability_impl.hlsl
@@ -0,0 +1,144 @@
// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O.
// This file is part of the "Nabla Engine".
// For conditions of distribution and use, see copyright notice in nabla.h
#ifndef _NBL_BUILTIN_HLSL_SUBGROUP2_ARITHMETIC_PORTABILITY_IMPL_INCLUDED_
#define _NBL_BUILTIN_HLSL_SUBGROUP2_ARITHMETIC_PORTABILITY_IMPL_INCLUDED_

#include "nbl/builtin/hlsl/subgroup/arithmetic_portability_impl.hlsl"

namespace nbl
{
namespace hlsl
{
namespace subgroup2
{

namespace impl
{

template<class Binop, typename T, uint32_t ItemsPerInvocation, bool native>
struct inclusive_scan
{
using type_t = T;
using scalar_t = typename Binop::type_t;
using binop_t = Binop;
using exclusive_scan_op_t = subgroup::impl::exclusive_scan<binop_t, native>;

// NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation = vector_traits<T>::Dimension;

type_t operator()(NBL_CONST_REF_ARG(type_t) value)
{
binop_t binop;
type_t retval;
retval[0] = value[0];
//[unroll(ItemsPerInvocation-1)]
for (uint32_t i = 1; i < ItemsPerInvocation; i++)
retval[i] = binop(retval[i-1], value[i]);

exclusive_scan_op_t op;
scalar_t exclusive = op(retval[ItemsPerInvocation-1]);

//[unroll(ItemsPerInvocation)]
for (uint32_t i = 0; i < ItemsPerInvocation; i++)
retval[i] = binop(retval[i], exclusive);
Review comment (Member Author):
this only works if the subgroup invocations are not coalesced
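To make the comment concrete, a hedged illustration (not from the PR, helper names are hypothetical) of the two item layouts being contrasted:

// "blocked" layout assumed by the scan above: each invocation owns ItemsPerInvocation consecutive elements
uint32_t blockedIndex(uint32_t invocationID, uint32_t item, uint32_t itemsPerInvocation)
{
    return invocationID * itemsPerInvocation + item;
}
// "coalesced"/strided layout: consecutive invocations own consecutive elements, an invocation's items are SubgroupSize apart
uint32_t coalescedIndex(uint32_t invocationID, uint32_t item, uint32_t subgroupSize)
{
    return item * subgroupSize + invocationID;
}
// adding the exclusive total of the previous invocations to every item yields a correct scan
// only under the blocked layout, which is what the comment above points out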

return retval;
}
};

template<class Binop, typename T, uint32_t ItemsPerInvocation, bool native>
struct exclusive_scan
{
using type_t = T;
using scalar_t = typename Binop::type_t;
using binop_t = Binop;
using inclusive_scan_op_t = subgroup2::impl::inclusive_scan<binop_t, T, ItemsPerInvocation, native>;

// NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation = vector_traits<T>::Dimension;

type_t operator()(type_t value)
{
inclusive_scan_op_t op;
value = op(value);

type_t left = glsl::subgroupShuffleUp<type_t>(value,1);
Review comment (Member Author):
yeah, if each invocation holds consecutive input and output elements, this shift becomes a mess (see that loop you have at the end)

also there was never a need to shuffle the entire vector, because you only ever used the last component

Review comment (Member Author):
if you do coalesced, then a plain subgroup shuffle on the vector and then a conditional set of the first element (a literal vectorized version of the old code) will achieve what you want:

const uint32_t invocationID = glsl::gl_SubgroupInvocationID();
// cyclic/modulo shuffle instead of relative needed
const type_t left = ItemsPerInvocation ? glsl::subgroupShuffle<type_t>(value,(invocationID-1)&SubgroupMask):glsl::subgroupShuffleUp<type_t>(value,1);
type_t newFirst; newFirst[0] = binop_t::identity;
[unroll]
for (uint32_t i=1; i<ItemsPerInvocation; i++)
   newFirst[i] = left[i-1];
return mix(newFirst,left,bool(glsl::gl_SubgroupInvocationID()));

P.S. also use mix(T,T,bool) instead of ?: because HLSL short circuiting turns ternaries into branches.

Review comment (Member Author):
btw the subgroupShuffle with a modulo SubgroupSize can be replaced with the new intrinsic from SPV_KHR_subgroup_rotate if you extend device_limits.json and so on (so that device_capabilities_traits gets it)
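A hedged sketch of that suggestion; it assumes a glsl::subgroupRotate wrapper over OpGroupNonUniformRotateKHR gets added (it does not exist in the tree as of this diff):

// subgroupRotate(value, delta) reads from invocation (gl_SubgroupInvocationID + delta) % SubgroupSize,
// so rotating by SubgroupSize-1 is the same (invocationID-1) & SubgroupMask access as the shuffle above,
// but maps to a single OpGroupNonUniformRotateKHR instead of a generic shuffle
const type_t left = glsl::subgroupRotate<type_t>(value, glsl::gl_SubgroupSize() - 1u);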


type_t retval;
retval[0] = bool(glsl::gl_SubgroupInvocationID()) ? left[ItemsPerInvocation-1] : binop_t::identity;
//[unroll(ItemsPerInvocation-1)]
for (uint32_t i = 1; i < ItemsPerInvocation; i++)
retval[i] = value[i-1];
return retval;
}
};

template<class Binop, typename T, uint32_t ItemsPerInvocation, bool native>
struct reduction
{
using type_t = T; // TODO? assert scalar_type<T> == scalar_t
using scalar_t = typename Binop::type_t;
using binop_t = Binop;
using op_t = subgroup::impl::reduction<binop_t, native>;

// NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation = vector_traits<T>::Dimension;

scalar_t operator()(NBL_CONST_REF_ARG(type_t) value)
{
binop_t binop;
op_t op;
scalar_t retval = value[0];
//[unroll(ItemsPerInvocation-1)]
for (uint32_t i = 1; i < ItemsPerInvocation; i++)
retval = binop(retval, value[i]);
return op(retval);
}
};


// spec for N=1 uses subgroup funcs
template<class Binop, typename T, bool native>
struct inclusive_scan<Binop, T, 1, native>
{
using binop_t = Binop;
using op_t = subgroup::impl::inclusive_scan<binop_t, native>;
// assert T == scalar type, binop::type == T

T operator()(NBL_CONST_REF_ARG(T) value)
{
op_t op;
return op(value);
}
};

template<class Binop, typename T, bool native>
struct exclusive_scan<Binop, T, 1, native>
{
using binop_t = Binop;
using op_t = subgroup::impl::exclusive_scan<binop_t, native>;

T operator()(NBL_CONST_REF_ARG(T) value)
{
op_t op;
return op(value);
}
};

template<class Binop, typename T, bool native>
struct reduction<Binop, T, 1, native>
{
using binop_t = Binop;
using op_t = subgroup::impl::reduction<binop_t, native>;
Review comment (Member Author):
benchmark is invalid if you do stuff in terms of subgroup functions, because you are supposed to use the Params::Configuration::SizeLog2 to make sure your loops unroll, as opposed to the subgroup v1 loops which can't unroll because the loop bound depends on gl_SubgroupSize, which is a uniform and not a compile time constant (you can only hope that the IHV compiler is not dumb and actually uses the subgroup size you provide in pipeline creation parameters when lowering SPIR-V to ISA)

TL;DR there can be no dependency between subgroup2 and the subgroup namespace, copy the code over
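A hedged sketch of the kind of unrollable loop the comment asks for (illustrative only; it assumes Configuration exposes a SizeLog2 constant as the comment references, and uses a Hillis-Steele shuffle scan as the example):

// with a compile-time SizeLog2 the trip count is a constant, so [unroll] can actually unroll
template<class Config, class Binop, typename T>
T inclusiveScanUnrolled(T value)
{
    Binop binop;
    [unroll]
    for (uint32_t i = 0; i < Config::SizeLog2; i++)
    {
        const uint32_t step = 0x1u << i;
        const T other = glsl::subgroupShuffleUp<T>(value, step);
        if (glsl::gl_SubgroupInvocationID() >= step)
            value = binop(value, other);
    }
    return value;
}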


T operator()(NBL_CONST_REF_ARG(T) value)
{
op_t op;
return op(value);
}
};

}

}
}
}

#endif