
Commit dbd57da ("update")
1 parent: b0f174d

10 files changed: +137, -40 lines

Extension.cs
Lines changed: 1 addition & 1 deletion

```diff
@@ -48,7 +48,7 @@ public static string Peek(this Tensor tensor, string id, int n = 10)
         avg = avg.round(4);
         var str = $"{id}: sum: {avg.ToSingle()} dtype: {dtype} shape: [{shapeString}]";
 
-        //Console.WriteLine(str);
+        Console.WriteLine(str);
 
         return str;
     }
```
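
With the commented-out line restored, `Peek` now always echoes a one-line tensor summary, which is handy for diffing intermediate activations against a reference run. A minimal usage sketch, assuming the namespace hosting the extension (not shown in this diff) is imported:

```csharp
using TorchSharp;
using static TorchSharp.torch;

var t = torch.randn(2, 3);
// Prints and returns something like "t: sum: 0.1234 dtype: Float32 shape: [2,3]".
var summary = t.Peek("t");
Console.WriteLine(summary);
```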

Module/Phi2Attention.cs
Lines changed: 2 additions & 2 deletions

```diff
@@ -129,8 +129,8 @@ public override (Tensor, Tensor?, Tensor?) forward(
         this.cache_v[..batchSize, .., pastKeyValueLength..kvSeqLen, ..] = valueStates;
         keyStates = this.cache_k[..batchSize, .., ..kvSeqLen, ..];
         valueStates = this.cache_v[..batchSize, .., ..kvSeqLen, ..];
-        var keyStates2 = Utils.RepeatKV(keyStates, this.numKeyValueGroups).transpose(2, 3);
-        var valueStates2 = Utils.RepeatKV(valueStates, this.numKeyValueGroups);
+        var keyStates2 = Utils.Phi2RepeatKV(keyStates, this.numKeyValueGroups).transpose(2, 3);
+        var valueStates2 = Utils.Phi2RepeatKV(valueStates, this.numKeyValueGroups);
         // Queries and keys upcast to fp32 is required by Phi-2 to avoid overflow
         var attnWeights = torch.matmul(queryStates.to_type(float32), keyStates2.to_type(float32));
         attnWeights = attnWeights / Math.Sqrt(this.headDim);
```

Module/Phi3Attention.cs
Lines changed: 12 additions & 9 deletions

```diff
@@ -123,10 +123,9 @@ public override Phi3AttentionOutput forward(Phi3AttentionInput input)
 
         var qkv = this.qkv_proj.forward(hidden_states);
         var query_pos = this.num_heads * this.head_dim;
-        var query_states = qkv[.., ..query_pos];
-        var key_states = qkv[.., query_pos .. (query_pos + this.num_key_value_heads * this.head_dim)];
-        var value_states = qkv[.., (query_pos + this.num_key_value_heads * this.head_dim)..];
-
+        var query_states = qkv[.., .., ..query_pos];
+        var key_states = qkv[.., .., query_pos .. (query_pos + this.num_key_value_heads * this.head_dim)];
+        var value_states = qkv[.., .., (query_pos + this.num_key_value_heads * this.head_dim)..];
         query_states = query_states.view(bsz, q_len, this.num_heads, this.head_dim).transpose(1, 2);
         key_states = key_states.view(bsz, q_len, this.num_key_value_heads, this.head_dim).transpose(1, 2);
         value_states = value_states.view(bsz, q_len, this.num_key_value_heads, this.head_dim).transpose(1, 2);
@@ -138,19 +137,23 @@ public override Phi3AttentionOutput forward(Phi3AttentionInput input)
             kv_seq_len += past_key_value.GetUsableLength(kv_seq_len, this.layer_idx);
         }
 
-        var embOutput = this.rotary_emb.forward(new Phi3RotaryEmbeddingInput(q_len, kv_seq_len, this.layer_idx));
+        var embOutput = this.rotary_emb.forward(new Phi3RotaryEmbeddingInput(value_states, positionIds, kv_seq_len));
         (var cos, var sin) = (embOutput.Cos, embOutput.Sin);
 
-        (query_states, key_states) = Utils.ApplyRotaryPosEmb(query_states, key_states, cos, sin, positionIds);
+        query_states.Peek("query_states");
+        key_states.Peek("key_states");
+        cos.Peek("cos");
+        sin.Peek("sin");
+        (query_states, key_states) = Utils.ApplyRotaryPosEmb(query_states, key_states, cos, sin);
 
         if (past_key_value is not null)
         {
             (key_states, value_states) = past_key_value.UpdateKVCache(key_states, value_states, this.layer_idx);
         }
 
         // repeat k/v heads if n_kv_heads < n_heads
-        key_states = Utils.RepeatKV(key_states, this.num_key_value_heads);
-        value_states = Utils.RepeatKV(value_states, this.num_key_value_heads);
+        key_states = Utils.Phi3RepeatKV(key_states, this.num_key_value_groups);
+        value_states = Utils.Phi3RepeatKV(value_states, this.num_key_value_groups);
 
         var attn_weights = torch.matmul(query_states, key_states.transpose(2, 3));
         attn_weights = attn_weights / Math.Sqrt(this.head_dim);
@@ -160,7 +163,7 @@ public override Phi3AttentionOutput forward(Phi3AttentionInput input)
         var attention_mask = input.attention_mask;
         if (attention_mask is not null)
        {
-            attention_mask.shape.Should().BeEquivalentTo(new long[] { bsz, 1, 1, kv_seq_len });
+            attention_mask.shape.Should().BeEquivalentTo(new long[] { bsz, 1, q_len, kv_seq_len });
            attn_weights = attn_weights + attention_mask;
         }
 
```
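
The first hunk fixes the slicing of the fused QKV projection: `qkv_proj` runs on `hidden_states` of shape `[bsz, q_len, hidden]`, so its output is 3D and the slices need two leading `..` to keep the batch and sequence axes. The later hunks also pass `num_key_value_groups` (the query-heads to KV-heads ratio) to the repeat-KV helper rather than `num_key_value_heads`. A standalone shape sketch of the slicing, with toy sizes and hypothetical variable names (not Phi-3's real config values):

```csharp
using TorchSharp;
using static TorchSharp.torch;

// Toy sizes for illustration only.
int bsz = 2, qLen = 5, numHeads = 4, numKvHeads = 2, headDim = 8;

// Fused projection output: [bsz, qLen, (numHeads + 2 * numKvHeads) * headDim].
var qkv = torch.randn(bsz, qLen, (numHeads + 2 * numKvHeads) * headDim);

int queryPos = numHeads * headDim;
var queryStates = qkv[.., .., ..queryPos];
var keyStates = qkv[.., .., queryPos..(queryPos + numKvHeads * headDim)];
var valueStates = qkv[.., .., (queryPos + numKvHeads * headDim)..];

// Prints "2,5,32 | 2,5,16 | 2,5,16": query, key and value each keep batch and sequence dims.
Console.WriteLine($"{string.Join(",", queryStates.shape)} | {string.Join(",", keyStates.shape)} | {string.Join(",", valueStates.shape)}");
```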

Module/Phi3MLP.cs
Lines changed: 5 additions & 2 deletions

```diff
@@ -23,7 +23,10 @@ public Phi3MLP(Phi3Config config)
     public override Tensor forward(Tensor input)
     {
         using var input1 = this.gate_up_proj.forward(input);
-        using var input2 = this.activation_fn.forward(input1);
-        return this.down_proj.forward(input2);
+        var chunks = input1.chunk(2, dim: -1);
+        var gate = chunks[0];
+        var up_status = chunks[1];
+        up_status = up_status * this.activation_fn.forward(gate);
+        return this.down_proj.forward(up_status);
     }
 }
```
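
This rewrites the MLP as the gated (SwiGLU-style) feed-forward Phi-3 uses: `gate_up_proj` emits the gate and up projections concatenated on the last dimension, the activation is applied to the gate half only, and the result scales the up half before `down_proj`. A self-contained sketch of the same pattern, with toy sizes for illustration:

```csharp
using TorchSharp;
using static TorchSharp.torch;

// Toy sizes; the real model's hidden and intermediate sizes are much larger.
int hiddenSize = 16, intermediateSize = 32;

// One linear layer produces [gate | up] concatenated on the last dimension.
var gateUpProj = nn.Linear(hiddenSize, 2 * intermediateSize);
var downProj = nn.Linear(intermediateSize, hiddenSize);
var activation = nn.SiLU();

var input = torch.randn(2, 5, hiddenSize);
var gateUp = gateUpProj.forward(input);                  // [2, 5, 2 * intermediateSize]
var chunks = gateUp.chunk(2, dim: -1);                   // split into gate and up halves
var hidden = chunks[1] * activation.forward(chunks[0]);  // up * SiLU(gate)
var output = downProj.forward(hidden);                   // back to [2, 5, hiddenSize]
Console.WriteLine(string.Join(",", output.shape));       // 2,5,16
```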

Module/Phi3Model.cs
Lines changed: 2 additions & 1 deletion

```diff
@@ -151,10 +151,11 @@ public override CasualLMModelOutput forward(CasualLMModelInput input)
         }
         else
         {
-            attention_mask = AttentionMaskConverter.Create4DCasualAttentionMask(attention_mask, [batch_size, seq_length], inputs_embeds.dtype, device, past_key_values_length, this.config.SlidingWindow);
+            attention_mask = AttentionMaskConverter.Create4DCausalAttentionMask(attention_mask, [batch_size, seq_length], inputs_embeds.dtype, device, past_key_values_length, this.config.SlidingWindow);
         }
 
         var hidden_states = inputs_embeds;
+        hidden_states.Peek("hidden_states");
 
         var all_hidden_states = new List<Tensor>();
         var all_attentions = new List<Tensor>();
```

Module/Phi3RotaryEmbedding.cs
Lines changed: 7 additions & 2 deletions

```diff
@@ -59,10 +59,15 @@ public override Phi3RotaryEmbeddingOutput forward(Phi3RotaryEmbeddingInput input
         // TODO
         // can be calculated once and cached
         var inv_freq = this.get_buffer("inv_freq").to(x.device);
-        var inv_freq_expanded = inv_freq.unsqueeze(0).unsqueeze(-1).expand(position_ids.shape[0], -1, 1);
-        //position_ids_expanded = position_ids[:, None, :].float()
+        var inv_freq_expanded = inv_freq.unsqueeze(0).unsqueeze(-1);
+        inv_freq_expanded.Peek("inv_freq_expanded");
+        position_ids.Peek("position_ids");
+        inv_freq_expanded.Peek("inv_freq_expanded");
+        inv_freq_expanded = inv_freq_expanded.expand(new long[] { position_ids.shape[0], -1, 1 });
+
         var position_ids_expanded = position_ids.unsqueeze(1).to(torch.float32);
         var freqs = inv_freq_expanded * position_ids_expanded;
+        freqs = freqs.transpose(1, 2);
         var emb = torch.cat([freqs, freqs], dim: -1);
 
         var cos = torch.cos(emb);
```
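
The shape bookkeeping here is the point of the change: `inv_freq` of shape `[head_dim/2]` is broadcast against `position_ids` of shape `[bsz, seq]`, and the added `transpose(1, 2)` turns the frequencies into `[bsz, seq, head_dim/2]` so that cos/sin come out as `[bsz, seq, head_dim]`, which is what the new positionIds-free `ApplyRotaryPosEmb` path expects (its `unsqueezeDim = 1` then adds the head axis). A standalone sketch of the broadcast, with toy sizes and `inv_freq` built inline rather than read from a buffer:

```csharp
using TorchSharp;
using static TorchSharp.torch;

// Toy sizes for illustration only.
int bsz = 2, seqLen = 5, headDim = 8;

// inv_freq: [headDim / 2], the standard RoPE frequency schedule.
var invFreq = torch.tensor(10000.0f)
    .pow(torch.arange(0, headDim, 2).to_type(float32) / (float)headDim)
    .reciprocal();

var positionIds = torch.arange(seqLen).unsqueeze(0).expand(bsz, -1);  // [bsz, seq]

var invFreqExpanded = invFreq.unsqueeze(0).unsqueeze(-1)
    .expand(bsz, -1, 1);                                              // [bsz, headDim/2, 1]
var positionIdsExpanded = positionIds.unsqueeze(1).to(float32);       // [bsz, 1, seq]

var freqs = (invFreqExpanded * positionIdsExpanded).transpose(1, 2);  // [bsz, seq, headDim/2]
var emb = torch.cat([freqs, freqs], dim: -1);                         // [bsz, seq, headDim]
var cos = torch.cos(emb);
var sin = torch.sin(emb);
Console.WriteLine(string.Join(",", cos.shape));                       // 2,5,8
```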

Phi3/Phi3ForCasualLM.cs
Lines changed: 2 additions & 2 deletions

```diff
@@ -40,7 +40,7 @@ public override CasualLMModelOutput forward(CasualLMModelInput input)
     public static Phi3ForCasualLM FromPretrained(
         string modelFolder,
         string configName = "config.json",
-        string modelWeightName = "model.safetensors.index.json",
+        string checkPointName = "model.safetensors.index.json",
         ScalarType torchDtype = ScalarType.BFloat16,
         string device = "cpu")
     {
@@ -49,7 +49,7 @@ public static Phi3ForCasualLM FromPretrained(
         modelConfig.DType = torchDtype;
         var phi = new Phi3ForCasualLM(modelConfig);
         var loadedParameters = new Dictionary<string, bool>();
-        phi.load_checkpoint(path: modelFolder, checkpointName: modelWeightName, strict: false, loadedParameters: loadedParameters);
+        phi.load_checkpoint(path: modelFolder, checkpointName: checkPointName, strict: false, loadedParameters: loadedParameters);
         phi = phi.to(device);
         phi.eval();
 
```

Program.cs
Lines changed: 7 additions & 8 deletions

```diff
@@ -1,13 +1,14 @@
 using System.Runtime.InteropServices;
 using FluentAssertions;
+using Phi;
 using Phi.Pipeline;
 using TorchSharp;
 using static TorchSharp.torch;
 
 // Dynamic loading libtorch because Cuda 12 only support GPU driver >= 520
 // And I can't upgrade GPU driver because it's a cloud machine.
 
-var phi2Folder = @"C:\Users\xiaoyuz\source\repos\phi-2";
+var phi2Folder = @"C:\Users\xiaoyuz\source\repos\Phi-3-mini-4k-instruct";
 var device = "cuda";
 
 if (device == "cuda")
@@ -16,15 +17,14 @@
     torch.cuda.is_available().Should().BeTrue();
 }
 
-var defaultType = ScalarType.Float16;
-torch.set_default_dtype(defaultType);
+var defaultType = ScalarType.BFloat16;
 torch.manual_seed(1);
 
 Console.WriteLine("Loading Phi2 from huggingface model weight folder");
 var timer = System.Diagnostics.Stopwatch.StartNew();
-var phi2 = Phi2ForCasualLM.FromPretrained(phi2Folder, device: device, torchDtype: defaultType, checkPointName: "model.safetensors.index.json");
-var tokenizer = Phi2Tokenizer.FromPretrained(phi2Folder);
-var pipeline = new CasualLMPipeline(tokenizer, phi2, device);
+var model = Phi3ForCasualLM.FromPretrained(phi2Folder, device: device, torchDtype: defaultType, checkPointName: "model.safetensors.index.json");
+var tokenizer = LLama2Tokenizer.FromPretrained(phi2Folder);
+var pipeline = new CasualLMPipeline(tokenizer, model, device);
 
 
 timer.Stop();
@@ -34,11 +34,10 @@
 int maxLen = 512;
 float temperature = 0.0f;
 Console.WriteLine($"QA Format: maxLen: {maxLen} temperature: {temperature}");
-var prompt = "Instruct: A skier slides down a frictionless slope of height 40m and length 80m, what's the skier's speed at the bottom, think step by step.\nOutput:";
+var prompt = "Can you provide ways to eat combinations of bananas and dragonfruits?";
 // wait for user to press enter
 Console.WriteLine($"Prompt: {prompt}");
 Console.WriteLine("Press enter to continue inferencing QA format");
-Console.ReadLine();
 
 Console.WriteLine(prompt);
 pipeline.Generate(prompt, maxLen: maxLen, temperature: temperature, device: device);
```

Utils.cs
Lines changed: 30 additions & 5 deletions

```diff
@@ -133,7 +133,7 @@ public static Tensor RotateHalf(Tensor x)
     // k_embed = (k * cos) + (rotate_half(k) * sin)
     // return q_embed, k_embed
 
-    public static (Tensor, Tensor) ApplyRotaryPosEmb(Tensor q, Tensor k, Tensor cos, Tensor sin, Tensor positionIds, int unsqueezeDim = 1)
+    public static (Tensor, Tensor) ApplyRotaryPosEmb(Tensor q, Tensor k, Tensor cos, Tensor sin, Tensor? positionIds = null, int unsqueezeDim = 1)
     {
         // The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
         // sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
@@ -142,8 +142,17 @@ public static (Tensor, Tensor) ApplyRotaryPosEmb(Tensor q, Tensor k, Tensor cos,
         // cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
         // the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
 
-        cos = cos[positionIds].unsqueeze(unsqueezeDim);
-        sin = sin[positionIds].unsqueeze(unsqueezeDim);
+        if (positionIds is not null)
+        {
+            cos = cos[positionIds!].unsqueeze(unsqueezeDim);
+            sin = sin[positionIds!].unsqueeze(unsqueezeDim);
+        }
+        else
+        {
+            cos = cos.unsqueeze(unsqueezeDim);
+            sin = sin.unsqueeze(unsqueezeDim);
+        }
+
         var qEmbed = q * cos;
         qEmbed += RotateHalf(q) * sin;
 
@@ -162,12 +171,12 @@ public static Module<Tensor, Tensor> GetActivation(string act_fn)
             "gelu" => nn.GELU(),
             "tanh" => nn.Tanh(),
             "swish" => nn.SiLU(),
-            _ => throw new ArgumentException("Invalid activation function", nameof(act_fn)),
+            _ => throw new ArgumentException("Invalid activation function", act_fn),
         };
     }
 
 
-    public static Tensor RepeatKV(Tensor x, int nRep)
+    public static Tensor Phi2RepeatKV(Tensor x, int nRep)
     {
         var batchSize = x.shape[0];
         var seqLen = x.shape[1];
@@ -183,4 +192,20 @@ public static Tensor RepeatKV(Tensor x, int nRep)
             .view(batchSize, seqLen, nKVHeads * nRep, headDim);
     }
 
+    public static Tensor Phi3RepeatKV(Tensor x, int nRep)
+    {
+        var batchSize = x.shape[0];
+        var nKVHeads = x.shape[1];
+        var seqLen = x.shape[2];
+        var headDim = x.shape[3];
+        if (nRep == 1)
+        {
+            return x;
+        }
+
+        return x.unsqueeze(3)
+            .expand(batchSize, nKVHeads, nRep, seqLen, headDim)
+            .view(batchSize, nKVHeads * nRep, seqLen, headDim);
+    }
+
 }
```
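
The split into two helpers reflects two tensor layouts: `Phi2RepeatKV` expects `[batch, seq, n_kv_heads, head_dim]` (heads on dim 2), while `Phi3RepeatKV` expects `[batch, n_kv_heads, seq, head_dim]` (heads on dim 1), matching where each model applies it. A minimal shape check of the grouped-query-attention repeat for the Phi-3 layout; note this sketch follows the Hugging Face reference `repeat_kv` (insert the repeat axis at dim 2, then `reshape`, since `view` is not valid on an expanded, non-contiguous tensor):

```csharp
using TorchSharp;
using static TorchSharp.torch;

// Grouped-query attention: 8 query heads share 2 KV heads, so nRep = 4.
int batch = 1, nKvHeads = 2, seqLen = 5, headDim = 8, nRep = 4;

var kv = torch.randn(batch, nKvHeads, seqLen, headDim);   // Phi-3 layout: heads on dim 1

var repeated = kv.unsqueeze(2)                            // [batch, nKvHeads, 1, seq, dim]
    .expand(batch, nKvHeads, nRep, seqLen, headDim)       // broadcast the repeat axis
    .reshape(batch, nKvHeads * nRep, seqLen, headDim);    // merge it into the head axis

Console.WriteLine(string.Join(",", repeated.shape));      // 1,8,5,8
```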

Utils/AttentionMaskConverter.cs
Lines changed: 69 additions & 8 deletions

```diff
@@ -2,6 +2,7 @@
 using static TorchSharp.torch;
 using TorchSharp.Modules;
 using TorchSharp;
+using System.Threading.Tasks;
 
 namespace Phi;
 
@@ -10,12 +11,65 @@ public class AttentionMaskConverter
     private readonly bool is_casual;
     private readonly int? sliding_window;
 
-    public AttentionMaskConverter(bool is_casual, int? sliding_window)
+    public AttentionMaskConverter(bool is_causal, int? sliding_window)
     {
-        this.is_casual = is_casual;
+        this.is_casual = is_causal;
         this.sliding_window = sliding_window;
     }
 
+    /// <summary>
+    /// Converts 2D attention mask to 4D attention mask by expanding mask to (bsz, head_dim=1, query_length,
+    /// key_value_length) shape and by adding a large negative bias to not-attended positions. If attention_mask is
+    /// causal, a causal mask will be added.
+    /// </summary>
+    /// <param name="attention_mask_2d"></param>
+    /// <param name="query_length"></param>
+    /// <param name="dtype"></param>
+    /// <param name="key_value_length"></param>
+    /// <returns></returns>
+    public Tensor To4D(
+        Tensor attention_mask_2d,
+        int query_length,
+        ScalarType dtype,
+        int? key_value_length = null)
+    {
+        long[] input_shape = [attention_mask_2d.shape[0], query_length];
+
+        // create causal mask
+        // [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+        Tensor? casual_4d_mask = null;
+        if ((input_shape[^1] > 1 || this.sliding_window is not null) && this.is_casual)
+        {
+            if (key_value_length is null)
+            {
+                throw new ArgumentException("key_value_length should be provided when attention_mask is causal");
+            }
+
+            var past_key_values_length = key_value_length.Value - query_length;
+            casual_4d_mask = MakeCasualMask(input_shape, dtype, attention_mask_2d.device, past_key_values_length, this.sliding_window);
+        }
+        else if (this.sliding_window is not null)
+        {
+            throw new NotImplementedException("Sliding window is not supported for non-causal masks");
+        }
+
+        var expanded_attn_mask = ExpandMask(attention_mask_2d, dtype, query_length).to(attention_mask_2d.device);
+        if (casual_4d_mask is not null)
+        {
+            var min = dtype switch
+            {
+                ScalarType.Float32 => torch.finfo(dtype).min,
+                ScalarType.Float64 => torch.finfo(dtype).min,
+                ScalarType.Float16 => -65504.0,
+                ScalarType.BFloat16 => -65504.0,
+                _ => throw new ArgumentException("Invalid dtype"),
+            };
+            expanded_attn_mask = casual_4d_mask.masked_fill(expanded_attn_mask.to(ScalarType.Bool), min);
+        }
+
+        return expanded_attn_mask;
+    }
+
     public Tensor? ToCasual4D(
         int batch_size,
         int query_length,
@@ -57,6 +111,7 @@ public static Tensor MakeCasualMask(
             ScalarType.Float32 => torch.finfo(dtype).min,
             ScalarType.Float64 => torch.finfo(dtype).min,
             ScalarType.Float16 => -65504.0,
+            ScalarType.BFloat16 => -65504.0,
             _ => throw new ArgumentException("Invalid dtype"),
         };
         var mask = torch.full([tgt_len, tgt_len], min, dtype: dtype, device: device);
@@ -86,23 +141,28 @@ public static Tensor MakeCasualMask(
     /// Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)`
     /// </summary>
     /// <param name="input_shape">The input shape should be a tuple that defines `(batch_size, query_length)`.</param>
-    public static Tensor? Create4DCasualAttentionMask(
+    public static Tensor? Create4DCausalAttentionMask(
         Tensor? attention_mask,
         long[] input_shape,
         ScalarType dtype,
         Device device,
         int past_key_values_length = 0,
         int? sliding_window = null)
     {
+        var converter = new AttentionMaskConverter(is_causal: true, sliding_window: sliding_window);
+        var batch_size = (int)input_shape[0];
+        var query_length = (int)input_shape[1];
+        var key_value_length = past_key_values_length + query_length;
         if (attention_mask is not null)
         {
-            throw new ArgumentException("This is not a casual mask");
+            if (attention_mask.ndim != 2)
+            {
+                throw new ArgumentException("Attention mask should be 2D");
+            }
+            return converter.To4D(attention_mask, (int)input_shape[1], dtype, key_value_length);
         }
 
-        var batch_size = (int)input_shape[0];
-        var query_length = (int)input_shape[1];
-        var converter = new AttentionMaskConverter(is_casual: true, sliding_window: sliding_window);
-        var key_value_length = past_key_values_length + query_length;
+
         return converter.ToCasual4D(batch_size, query_length, key_value_length, dtype, device);
     }
 
@@ -122,6 +182,7 @@ public static Tensor ExpandMask(
             ScalarType.Float32 => torch.finfo(dtype).min,
             ScalarType.Float64 => torch.finfo(dtype).min,
             ScalarType.Float16 => -65504.0,
+            ScalarType.BFloat16 => -65504.0,
             _ => throw new ArgumentException("Invalid dtype"),
         };
 
```
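
Before this change, passing an explicit 2D padding mask threw; now it is routed through the new `To4D`, which combines the causal bias with the padding mask into the `[bsz, 1, q_len, kv_seq_len]` shape asserted in Phi3Attention. A usage sketch under those assumptions (shapes as the diff's assertions suggest, not independently verified):

```csharp
using TorchSharp;
using static TorchSharp.torch;
using Phi;

// A batch of one, five tokens, with the last key position padded out.
var mask2d = torch.tensor(new float[] { 1, 1, 1, 1, 0 }).reshape(1, 5);

// 2D mask -> should come back as [1, 1, 5, 5]: causal bias plus a large
// negative value on the padded key column.
var mask4d = AttentionMaskConverter.Create4DCausalAttentionMask(
    mask2d, [1, 5], ScalarType.Float32, torch.CPU);

// With no mask the call falls through to ToCasual4D and builds a plain causal mask.
var causalOnly = AttentionMaskConverter.Create4DCausalAttentionMask(
    null, [1, 5], ScalarType.Float32, torch.CPU);
```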
