
Commit e6edcad

linear:kaiming Init
1 parent 7c05fc1 commit e6edcad

6 files changed: +179 −20 lines changed

datasetloader/mnist/train/mnist_train.go

Lines changed: 7 additions & 7 deletions
@@ -19,19 +19,19 @@ func main() {
 	}
 	// set the hyperparameters

-	numClasses := 10      // number of classes
-	batchSize := 32       // batch size
-	learningRate := 0.001 // learning rate
-	epochs := 30          // number of training epochs
+	numClasses := 10     // number of classes
+	batchSize := 32      // batch size
+	learningRate := 0.01 // learning rate
+	epochs := 30         // number of training epochs
 	// create the model
 	m := &model.Model{
 		Optimizer: optimizer.NewSGD(learningRate), // learning rate set to 0.01
 	}
-	m.Layer(layer.Linear(mnist.TRAIN_MNIST.ImageSize, 128)).
+	m.Layer(layer.Linear(mnist.TRAIN_MNIST.ImageSize, 128, true)).
 		Layer(layer.Activation(layer.Relu, layer.ReluDerivative)).
-		Layer(layer.Linear(128, 64)).
+		Layer(layer.Linear(128, 64, true)).
 		Layer(layer.Activation(layer.Relu, layer.ReluDerivative)).
-		Layer(layer.Linear(64, numClasses)) // add each layer to the model
+		Layer(layer.Linear(64, numClasses, true)) // add each layer to the model

 	// define the forward-pass function
 	m.ForwardFunc = func(input *dl.Tensor) (output *dl.Tensor) {
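
Two changes land in this trainer: the learning rate moves from 0.001 to 0.01, which brings it in line with the existing comment on optimizer.NewSGD, and every layer.Linear call gains the new biasInit argument introduced in dl/layer/linear.go below.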
Lines changed: 101 additions & 0 deletions
@@ -0,0 +1,101 @@
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torchvision import datasets, transforms
+from torch.utils.data import DataLoader
+
+# select the device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# hyperparameters
+num_classes = 10
+batch_size = 32
+learning_rate = 0.001
+epochs = 30
+
+# load the MNIST dataset
+transform = transforms.Compose([
+    transforms.ToTensor(),
+])
+
+
+train_dataset = datasets.MNIST(root='data', train=True, download=False, transform=transform)
+train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
+
+# define the validation set
+val_dataset = datasets.MNIST(root='data', train=False, download=False, transform=transform)
+val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
+
+# early-stopping parameters
+patience = 5  # number of epochs to tolerate without improvement
+best_loss = float('inf')
+trigger_times = 0
+
+# define the model
+class Net(nn.Module):
+    def __init__(self):
+        super(Net, self).__init__()
+        self.flatten = nn.Flatten()
+        self.fc1 = nn.Linear(28 * 28, 128)
+        self.relu1 = nn.ReLU()
+        self.fc2 = nn.Linear(128, 64)
+        self.relu2 = nn.ReLU()
+        self.fc3 = nn.Linear(64, num_classes)
+
+    def forward(self, x):
+        x = self.flatten(x)
+        x = self.relu1(self.fc1(x))
+        x = self.relu2(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+model = Net().to(device)
+
+# define the loss function and optimizer
+criterion = nn.CrossEntropyLoss()
+optimizer = optim.SGD(model.parameters(), lr=learning_rate)
+
+# training loop
+while True:  # no fixed epoch limit; loop until early stopping triggers
+    model.train()  # set the model to training mode
+    running_loss = 0.0
+    for batch_idx, (inputs, labels) in enumerate(train_loader):
+        inputs, labels = inputs.to(device), labels.to(device)
+
+        # forward pass
+        outputs = model(inputs)
+        loss = criterion(outputs, labels)
+
+        # backward pass and optimization
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+
+        running_loss += loss.item()
+
+    # print the average training loss for each epoch
+    average_loss = running_loss / len(train_loader)
+    print(f"Average Training Loss: {average_loss:.4f}", end="")
+
+    # evaluate on the validation set
+    model.eval()  # set the model to evaluation mode
+    val_loss = 0.0
+    with torch.no_grad():
+        for inputs, labels in val_loader:
+            inputs, labels = inputs.to(device), labels.to(device)
+            outputs = model(inputs)
+            loss = criterion(outputs, labels)
+            val_loss += loss.item()
+    average_val_loss = val_loss / len(val_loader)
+    print(f" Validation Loss: {average_val_loss:.4f}")
+
+    # early-stopping logic
+    if average_val_loss < best_loss:
+        best_loss = average_val_loss
+        trigger_times = 0  # reset the trigger count
+    else:
+        trigger_times += 1
+        if trigger_times >= patience:
+            print("Early stopping triggered")
+            break
+print("Training complete")

dl/layer/linear.go

Lines changed: 16 additions & 7 deletions

@@ -2,10 +2,11 @@ package layer

 import (
 	"deepgo/dl"
+	"math"
 )

 // Linear creates a new linear layer that supports batching
-func Linear(in_features, out_features int) (l *ComputeGraphNode) {
+func Linear(in_features, out_features int, biasInit bool) (l *ComputeGraphNode) {
 	l = NewNode(nil, nil)

 	l.SetAttr("in_features", in_features)
@@ -14,10 +15,18 @@ func Linear(in_features, out_features int) (l *ComputeGraphNode) {
 	weight := dl.NewTensor([]int{out_features, in_features})
-	bias := dl.NewTensor([]int{out_features})

-	// use He initialization
-	weight.He(in_features)
-	bias.He(in_features)
+	// initialize the weights

+	weight.KaimingUniform(math.Sqrt(5))
+	l.RegisterParameter("weight", weight)
+
+	if biasInit {
+		// initialize the bias
+		biasT := dl.NewTensor([]int{out_features})
+		fanIn, _ := dl.CalculateFanInAndFanOut(weight)
+		bound := 1 / math.Sqrt(float64(fanIn))
+		biasT.Uniform(-bound, bound)
+		l.RegisterParameter("bias", biasT)
+	} else {
+		l.RegisterParameter("bias", nil)
+	}
-	l.RegisterParameter("weight", weight)
-	l.RegisterParameter("bias", bias)

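When biasInit is set, the bias bound is 1/√fan_in computed from the weight's shape; for the 784-input MNIST layer that is 1/√784 = 1/28 ≈ 0.0357. This matches PyTorch's nn.Linear.reset_parameters, which draws the bias from U(−1/√fan_in, 1/√fan_in).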

dl/layer/linear.py

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
+import torch
+import torch.nn as nn
+import json
+import sys
+
+def create_linear_layer(in_features, out_features):
+    # create the linear layer
+    linear = nn.Linear(in_features, out_features)
+
+    # get the weight and bias
+    weight = linear.weight.data.tolist()
+    bias = linear.bias.data.tolist()
+
+    return {
+        "weight": weight,
+        "bias": bias
+    }
+
+if __name__ == "__main__":
+    # read the input and output feature counts from command-line arguments
+    in_features = int(sys.argv[1])
+    out_features = int(sys.argv[2])
+
+    # create the linear layer and get its parameters
+    params = create_linear_layer(in_features, out_features)
+
+    # print the weight and bias
+    print(json.dumps(params))
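
The script dumps the parameters of a freshly constructed PyTorch nn.Linear, presumably for cross-checking the Go initializer against PyTorch's defaults. For example, python dl/layer/linear.py 784 128 prints a JSON object whose "weight" is a 128×784 nested list and whose "bias" holds 128 values.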

dl/layer/linear_test.go

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@ func TestLinear(t *testing.T) {
 	// create a linear layer with 2 input features and 3 output features
 	inFeatures := 2
 	outFeatures := 3
-	linearLayer := Linear(inFeatures, outFeatures)
+	linearLayer := Linear(inFeatures, outFeatures, true)

 	// create an input tensor of shape [batchSize, inFeatures]
 	inputTensor := dl.NewTensor([]int{1, inFeatures}, 1.0, 2.0) // input is [1, 2]

dl/tensor_initializer.go

Lines changed: 26 additions & 5 deletions
@@ -13,11 +13,32 @@ func (t *Tensor) Xavier(inFeatures int) {
 	t.Uniform(-stdv, stdv)
 }

-// He initializes the tensor using He initialization
-func (t *Tensor) He(inFeatures int) {
-	stdv := math.Sqrt(2.0 / float64(inFeatures))
-	// draw uniform random numbers in the range [-stdv, stdv]
-	t.Uniform(-stdv, stdv)
+// KaimingUniform initializes the tensor with Kaiming uniform initialization; note a is used directly as the gain (PyTorch instead derives gain = sqrt(2/(1+a^2)) from a)
+func (t *Tensor) KaimingUniform(a float64) {
+	fanIn, _ := CalculateFanInAndFanOut(t)
+	std := a / math.Sqrt(float64(fanIn))
+	bound := math.Sqrt(3.0) * std
+	t.Uniform(-bound, bound)
+}
+
+// CalculateFanInAndFanOut computes fan_in and fan_out from the tensor's shape
+func CalculateFanInAndFanOut(t *Tensor) (fanIn, fanOut int) {
+	dimensions := len(t.Shape)
+	if dimensions < 2 {
+		return 1, 1
+	}
+
+	numInputFmaps := t.Shape[1]
+	numOutputFmaps := t.Shape[0]
+	receptiveFieldSize := 1
+	if dimensions > 2 {
+		for _, s := range t.Shape[2:] {
+			receptiveFieldSize *= s
+		}
+	}
+	fanIn = numInputFmaps * receptiveFieldSize
+	fanOut = numOutputFmaps * receptiveFieldSize
+	return fanIn, fanOut
 }

 // Normal initializes the tensor with a normal distribution
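
A minimal usage sketch of the new initializer on the first MNIST layer's weight, assuming the deepgo/dl API exactly as added in this commit:

package main

import (
	"deepgo/dl"
	"fmt"
	"math"
)

func main() {
	// weight shape is [out_features, in_features], as in layer.Linear
	weight := dl.NewTensor([]int{128, 784})

	// fan_in = Shape[1] × receptive field size (1 for a 2-D tensor) = 784
	fanIn, fanOut := dl.CalculateFanInAndFanOut(weight)

	// bound used by KaimingUniform(√5): √3 · √5/√fan_in = √15/28 ≈ 0.1383
	a := math.Sqrt(5)
	bound := math.Sqrt(3.0) * a / math.Sqrt(float64(fanIn))
	fmt.Printf("fanIn=%d fanOut=%d bound=%.4f\n", fanIn, fanOut, bound)

	// every element of weight is now drawn from U(-bound, bound)
	weight.KaimingUniform(a)
}

Because a is applied directly as the gain, this bound is √15 ≈ 3.87× wider than the 1/√fan_in that PyTorch's kaiming_uniform_ produces for the same a = √5, since PyTorch first converts a to gain √(2/(1+a²)) = √(1/3).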
