
Commit e88e478

Merge pull request #147 from theabhirath/master
2 parents 8ba79d5 + c6e26ac

File tree

2 files changed (+5 / -5 lines)


Project.toml

Lines changed: 2 additions & 2 deletions
@@ -17,9 +17,9 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 BSON = "0.3.2"
 Flux = "0.13"
 Functors = "0.2"
-MLUtils = "0.1.2, 0.2"
+MLUtils = "0.2"
 NNlib = "0.7.34, 0.8"
-julia = "1.4"
+julia = "1.6"
 NeuralAttentionlib = "0.0"

 [extras]

src/layers/attention.jl

Lines changed: 3 additions & 3 deletions
@@ -40,9 +40,9 @@ end
 @functor MHAttention

 function (m::MHAttention)(x::AbstractArray{T, 3}) where T
-    B, C, N = size(x)
-    q, k, v = chunk(reshape(m.qkv_layer(x), B ÷ m.nheads, m.nheads, C, 3 * N), 3; dims = 4)
+    features, len_seq, batch_size = size(x)
+    q, k, v = chunk(reshape(m.qkv_layer(x), features ÷ m.nheads, m.nheads, len_seq, 3 * batch_size), 3; dims = 4)
     scale = convert(T, sqrt(size(q, 1) / m.nheads))
     attn = m.attn_drop(softmax(NeuralAttentionlib.matmul(q, permutedims(k, (2, 1, 3, 4))) * scale))
-    x = m.projection(reshape(NeuralAttentionlib.matmul(attn, v), (B, C, N)))
+    x = m.projection(reshape(NeuralAttentionlib.matmul(attn, v), (features, len_seq, batch_size)))
 end
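The renamed variables spell out the input layout the layer assumes: x is (features, len_seq, batch_size). Below is a minimal shape check of the reshape-and-chunk step, assuming MLUtils 0.2's chunk and a qkv projection that triples the feature dimension; the concrete sizes (features = 64, nheads = 8, len_seq = 16, batch_size = 4) are illustrative only, not taken from the package.

using MLUtils: chunk

features, len_seq, batch_size = 64, 16, 4   # illustrative sizes, not from the package
nheads = 8

# Stand-in for m.qkv_layer(x): assumed to project `features` up to `3 * features`.
qkv = rand(Float32, 3 * features, len_seq, batch_size)

# Same reshape-and-chunk as the new code: split the last dimension into q, k, v.
q, k, v = chunk(reshape(qkv, features ÷ nheads, nheads, len_seq, 3 * batch_size), 3; dims = 4)

size(q)   # (8, 8, 16, 4) == (features ÷ nheads, nheads, len_seq, batch_size)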
