Skip to content

Commit d1fdc1b

Browse files
committed
Limit specified unrolls for statically sized loops with known vector width, this allows not emitting masks unnecessarilly in some cases.
1 parent 09115a8 commit d1fdc1b

File tree

6 files changed

+54
-18
lines changed

6 files changed

+54
-18
lines changed

src/costs.jl

+3
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,9 @@ const COST = Dict{Symbol,InstructionCost}(
166166
:(~) => InstructionCost(1, 0.5),
167167
:(&) => InstructionCost(1, 0.5),
168168
:(|) => InstructionCost(1, 0.5),
169+
:() => InstructionCost(1, 0.5),
170+
:(%) => InstructionCost(13, 4.0, -2.0),
171+
:(rem) => InstructionCost(13, 4.0, -2.0),
169172
:(>) => InstructionCost(1, 0.5),
170173
:(<) => InstructionCost(1, 0.5),
171174
:(>=) => InstructionCost(1, 0.5),

src/determinestrategy.jl

+19-3
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,6 @@ function lsvecwidthshift(ls::LoopSet, vectorized::Symbol, size_T = nothing)
128128
end
129129

130130
# evaluates cost of evaluating loop in given order
131-
# heuristically, could simplify analysis by just unrolling outer loop?
132131
function evaluate_cost_unroll(
133132
ls::LoopSet, order::Vector{Symbol}, vectorized::Symbol, max_cost = typemax(Float64)
134133
)
@@ -166,7 +165,7 @@ function evaluate_cost_unroll(
166165
total_cost > max_cost && return total_cost # abort if more expensive; we only want to know the cheapest
167166
end
168167
end
169-
total_cost + stride_penalty(ls, order) - 1.0 # -1.0 to place finger on scale in its favor
168+
0.999total_cost + stride_penalty(ls, order) # 0.999 to place finger on scale in its favor
170169
end
171170

172171
# only covers vectorized ops; everything else considered lifted?
@@ -240,6 +239,9 @@ function unroll_no_reductions(ls, order, vectorized)
240239
# isstore(op) && dependson(op, unrolled)
241240
# end
242241
# end
242+
if unrolled === vectorized
243+
u = demote_unroll_factor(ls, u, vectorized)
244+
end
243245
u, unrolled
244246
# rt = max(compute_rt, load_rt + store_rt)
245247
# # (iszero(rt) ? 4 : max(1, roundpow2( min( 4, round(Int, 16 / rt) ) ))), unrolled
@@ -295,6 +297,16 @@ function count_reductions(ls::LoopSet)
295297
num_reductions
296298
end
297299

300+
demote_unroll_factor(ls::LoopSet, UF, loop::Symbol) = demote_unroll_factor(ls, UF, getloop(ls, loop))
301+
function demote_unroll_factor(ls::LoopSet, UF, loop::Loop)
302+
W = ls.vector_width[]
303+
if !iszero(W) && isstaticloop(loop)
304+
UFW = maybedemotesize(UF*W, length(loop))
305+
UF = cld(UFW, W)
306+
end
307+
UF
308+
end
309+
298310
function determine_unroll_factor(ls::LoopSet, order::Vector{Symbol}, vectorized::Symbol)
299311
num_reductions = count_reductions(ls)
300312
# The strategy is to use an unroll factor of 1, unless there appears to be loop carried dependencies (ie, num_reductions > 0)
@@ -317,7 +329,11 @@ function determine_unroll_factor(ls::LoopSet, order::Vector{Symbol}, vectorized:
317329
end
318330
end
319331
# min(8, roundpow2(max(1, round(Int, latency / (rt * num_reductions) ) ))), best_unrolled
320-
min(8, VectorizationBase.nextpow2(max(1, round(Int, latency / (rt * num_reductions) ) ))), best_unrolled
332+
UF = min(8, VectorizationBase.nextpow2(max(1, round(Int, latency / (rt * num_reductions) ) )))
333+
if best_unrolled === vectorized
334+
UF = demote_unroll_factor(ls, UF, vectorized)
335+
end
336+
UF, best_unrolled
321337
end
322338

323339
function unroll_cost(X, u₁, u₂, u₁L, u₂L)

src/lowering.jl

+9-4
Original file line numberDiff line numberDiff line change
@@ -333,16 +333,22 @@ function lower_unrolled_dynamic(ls::LoopSet, us::UnrollSpecification, n::Int, in
333333
W = nisvectorized ? ls.vector_width[] : 1
334334
loopisstatic = isstaticloop(loop) & (!iszero(W))
335335
UFW = UF * W
336-
336+
looplength = length(loop)
337+
if loopisstatic & (UFW > looplength)
338+
UFWnew = cld(looplength, cld(looplength, UFW))
339+
UF = cld(UFWnew, W)
340+
UFW = UF * W
341+
us = nisunrolled ? UnrollSpecification(us, UF, u₂) : UnrollSpecification(us, u₁, UF)
342+
end
337343
remmask = inclmask | nisvectorized
338344
Ureduct = (n == num_loops(ls) && (u₂ == -1)) ? calc_Ureduct(ls, us) : -1
339345
# sl = startloop(loop, nisvectorized, loopsym)
340346
sl = startloop(ls, us, n)
341-
UFt = loopisstatic ? cld(length(loop) % UFW, W) : 1
347+
UFt = loopisstatic ? cld(looplength % UFW, W) : 1
342348
# Don't place remainder first if we're going to have to mask this loop (i.e., if this loop is vectorized)
343349
remfirst = loopisstatic & (!nisvectorized) & (UFt > 0) & !(unsigned(Ureduct) < unsigned(UF))
344350
tc = terminatecondition(ls, us, n, inclmask, remfirst ? 1 : UF)
345-
usorig = ls.unrollspecification[]
351+
# usorig = ls.unrollspecification[]
346352
# tc = (usorig.u₁ == us.u₁) && (usorig.u₂ == us.u₂) && !loopisstatic && !inclmask && !ls.loadelimination[] ? expect(tc) : tc
347353
body = lower_block(ls, us, n, inclmask, UF)
348354
if loopisstatic
@@ -359,7 +365,6 @@ function lower_unrolled_dynamic(ls::LoopSet, us::UnrollSpecification, n::Int, in
359365
remblock = init_remblock(loop, ls.lssm[], n)#loopsym)
360366
q = Expr(:while, tc, body)
361367
end
362-
# @show loopsym, loopisstatic, UFW
363368
q = if unsigned(Ureduct) < unsigned(UF) # unsigned(-1) == typemax(UInt); is logic relying on twos-complement bad?
364369
UF_cleanup = UF - Ureduct
365370
us_cleanup = nisunrolled ? UnrollSpecification(us, UF_cleanup, u₂) : UnrollSpecification(us, u₁, UF_cleanup)

src/split_loops.jl

+1
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ function lower_and_split_loops(ls::LoopSet, inline::Int)
101101
# U_1 = T_1 = U_2 = T_2 = 2
102102
# @show cost_1 + cost_2 ≤ cost_fused, cost_1, cost_2, cost_fused
103103
if cost_1 + cost_2 cost_fused
104+
# @show cost_1, cost_2, cost_fused
104105
ls_2_lowered = if length(remaining_ops) > 1
105106
inline = iszero(inline) ? (shouldinline_1 % Int) : inline
106107
lower_and_split_loops(ls_2, inline)

test/copy.jl

+12
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,15 @@ using LoopVectorization, OffsetArrays, Test
115115
end
116116
B
117117
end
118+
function copyselfdot!(s, x)
119+
m = zero(eltype(x))
120+
@avx for i 1:2
121+
sᵢ = x[i]
122+
s[i] = sᵢ
123+
m += sᵢ * sᵢ
124+
end
125+
m
126+
end
118127

119128
for T (Float32, Float64, Int32, Int64)
120129
@show T, @__LINE__
@@ -188,5 +197,8 @@ using LoopVectorization, OffsetArrays, Test
188197

189198
x = rand(R, 3); y = similar(x);
190199
@test copy3!(y, x) == x
200+
fill!(y,0);
201+
@test copyselfdot!(y, x) x[1]^2 + x[2]^2
202+
@test view(x, 1:2) == y
191203
end
192204
end

test/miscellaneous.jl

+10-11
Original file line numberDiff line numberDiff line change
@@ -628,7 +628,7 @@ using Test
628628
function MatCalcWtDW!(m)
629629
l, n = size(m.Wt)
630630
fill!(m.Wt_D_W, 0)
631-
@avx for k in 1: n
631+
@avx for k in 1:n
632632
for j in 1:l
633633
for i in 1:l
634634
m.Wt_D_W[i, j] += m.Wt[i, k] * m.Wt[j, k] * m.d[k]
@@ -930,15 +930,15 @@ end
930930
@test X1 X2
931931
@test Y1 Y2
932932

933-
a_re, a_im = rand(T, 2, 2, 2), rand(T, 2, 2, 2);
934-
b_re, b_im = rand(T, 2, 2), rand(T, 2, 2);
935-
c_re_1 = ones(T, 2, 2); c_re_2 = ones(T, 2, 2);
936-
multiple_unrolls_split_depchains!(c_re_1, a_re, b_re, a_im, b_im, true) # [1 1; 1 1]
937-
multiple_unrolls_split_depchains_avx!(c_re_2, a_re, b_re, a_im, b_im, true) # [1 1; 1 1]
938-
@test c_re_1 c_re_2
939-
multiple_unrolls_split_depchains!(c_re_1, a_re, b_re, a_im, b_im) # [1 1; 1 1]
940-
multiple_unrolls_split_depchains_avx!(c_re_2, a_re, b_re, a_im, b_im) # [1 1; 1 1]
941-
@test c_re_1 c_re_2
933+
# a_re, a_im = rand(T, 2, 2, 2), rand(T, 2, 2, 2);
934+
# b_re, b_im = rand(T, 2, 2), rand(T, 2, 2);
935+
# c_re_1 = ones(T, 2, 2); c_re_2 = ones(T, 2, 2);
936+
# multiple_unrolls_split_depchains!(c_re_1, a_re, b_re, a_im, b_im, true) # [1 1; 1 1]
937+
# multiple_unrolls_split_depchains_avx!(c_re_2, a_re, b_re, a_im, b_im, true) # [1 1; 1 1]
938+
# @test c_re_1 ≈ c_re_2
939+
# multiple_unrolls_split_depchains!(c_re_1, a_re, b_re, a_im, b_im) # [1 1; 1 1]
940+
# multiple_unrolls_split_depchains_avx!(c_re_2, a_re, b_re, a_im, b_im) # [1 1; 1 1]
941+
# @test c_re_1 ≈ c_re_2
942942

943943
@test loopinductvardivision(X1) loopinductvardivisionavx(X2)
944944

@@ -947,7 +947,6 @@ end
947947
Wt = rand(T, 181, 191),
948948
d = rand(T, 191)
949949
);
950-
Wt_D_W = similar(mh.Wt_D_W);
951950

952951
MatCalcWtDW!(mh)
953952
@test mh.Wt_D_W mh.Wt * Diagonal(mh.d) * mh.Wt'

0 commit comments

Comments
 (0)