Skip to content

Commit 8087b1b

Browse files
committed
less aggressive threading onramp
1 parent a7f9e1b commit 8087b1b

File tree

3 files changed

+9
-20
lines changed

3 files changed

+9
-20
lines changed

Project.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "LoopVectorization"
22
uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
33
authors = ["Chris Elrod <elrodc@gmail.com>"]
4-
version = "0.12.119"
4+
version = "0.12.120"
55

66
[deps]
77
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"

src/codegen/lower_threads.jl

+7-19
Original file line numberDiff line numberDiff line change
@@ -154,22 +154,14 @@ end
154154
@inline choose_num_blocks(nt, ::StaticInt{NC} = lv_max_num_threads()) where {NC} =
155155
@inbounds choose_num_block_table(StaticInt{NC}())[nt]
156156

157-
if Sys.ARCH === :x86_64
158-
@inline function choose_num_threads(
157+
scale_cost(c) = @fastmath c * (Sys.ARCH === :x86_64 ? 0.0225 : 0.005625)
158+
scale_cost(c, looplen) = scale_cost(@fastmath c / looplen)
159+
@inline function choose_num_threads(
159160
C::T,
160161
NT::UInt,
161162
x::Base.BitInteger,
162163
) where {T<:Union{Float32,Float64}}
163-
_choose_num_threads(Base.mul_float_fast(T(C), T(0.0225)), NT, x)
164-
end
165-
else
166-
@inline function choose_num_threads(
167-
C::T,
168-
NT::UInt,
169-
x::Base.BitInteger,
170-
) where {T<:Union{Float32,Float64}}
171-
_choose_num_threads(Base.mul_float_fast(C, T(0.0225) * T(0.25)), NT, x)
172-
end
164+
_choose_num_threads(scale_cost(T(C)), NT, x)
173165
end
174166
@inline function _choose_num_threads(
175167
C::T,
@@ -422,13 +414,6 @@ function define_block_size(threadedloop, vloop, tn, W)
422414
end
423415
end
424416
end
425-
function scale_cost(c, looplen)
426-
c = 0.05 * c / looplen
427-
if Sys.ARCH !== :x86_64
428-
c *= 0.25
429-
end
430-
c
431-
end
432417
function thread_one_loops_expr(
433418
ls::LoopSet,
434419
ua::UnrollArgs,
@@ -868,17 +853,20 @@ function valid_thread_loops(ls::LoopSet)
868853
u₂loop = _u₂loop === nothing ? u₁loop : getloop_from_id(ls, _u₂loop)
869854
ua = UnrollArgs(u₁loop, u₂loop, getloop(ls, vectorized), u₁, u₂, u₂)
870855
valid_thread_loop = fill(true, length(order))
856+
has_reduced_deps = false
871857
for op ∈ operations(ls)
872858
if isstore(op) && (length(reduceddependencies(op)) > 0)
873859
for reduceddep ∈ reduceddependencies(op)
874860
for (i, o) ∈ enumerate(order)
875861
if o === reduceddep
862+
has_reduced_deps = true
876863
valid_thread_loop[i] = false
877864
end
878865
end
879866
end
880867
end
881868
end
869+
c *= (1.0 + 0.5has_reduced_deps)
882870
for (i, o) ∈ enumerate(order)
883871
loop = getloop(ls, o)
884872
if isstaticloop(loop) & (length(loop) ≤ 1)

test/Project.toml

+1
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
55
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
66
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
77
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
8+
Static = "aedffcd0-7271-4cad-89d0-dc628f76c6d3"
89
StrideArraysCore = "7792a7ef-975c-4747-a70f-980b88e8d1da"
910
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
1011
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

0 commit comments

Comments
 (0)