"""
|
|
|
|
|
文件名: train_gpu_blance_10features.py
|
|
|
|
|
|
|
|
|
|
训练部分代码
|
|
|
|
|
|
|
|
|
|
作者: 王春林
|
|
|
|
|
创建日期: 2023年10月18日
|
|
|
|
|
最后修改日期: 2023年10月20日
|
|
|
|
|
版本号: 1.0.0
|
|
|
|
|
|
|
|
|
|
"""

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import StratifiedKFold
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.utils.class_weight import compute_class_weight

# Training-set Excel filename
train_excel = r'train_fold0.xlsx'

# Validation-set Excel filename
val_excel = r'val_fold0.xlsx'

# Use the GPU if one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define the MLP network
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(10, 32),   # input layer: 10 features
            nn.ReLU(),           # activation
            nn.Linear(32, 128),  # hidden layer
            nn.ReLU(),           # activation
            nn.Linear(128, 32),  # hidden layer
            nn.ReLU(),           # activation
            nn.Linear(32, 4),    # output layer: 4 classes
        )

    def forward(self, x):
        return self.model(x)
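
# Sanity check (illustrative addition): a batch of 16 ten-feature rows should
# map to 16 four-class logit vectors.
assert MLP()(torch.randn(16, 10)).shape == (16, 4)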

# Load features and labels
train_data = pd.read_excel(train_excel)
val_data = pd.read_excel(val_excel)

# Feature column names
feature_names = ["强迫症状数字化", "人际关系敏感数字化", "抑郁数字化", "多因子症状", "母亲教养方式数字化", "父亲教养方式数字化", "自评家庭经济条件数字化", "有无心理治疗(咨询)史数字化", "学业情况数字化", "出勤情况数字化"]

# Split features and labels. Note: despite the scaler imports above, no
# normalization is applied here; an optional sketch follows below.
X_train = train_data[feature_names].values
y_train = train_data['类别'].values
X_val = val_data[feature_names].values
y_val = val_data['类别'].values
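
# Optional normalization sketch (an assumption, matching the StandardScaler
# import): fit on the training split only and reuse the fitted scaler for
# validation to avoid leakage. Left commented out to preserve the original
# behavior.
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_val = scaler.transform(X_val)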

train_dataset = TensorDataset(torch.from_numpy(X_train).float().to(device), torch.from_numpy(y_train).long().to(device))
val_dataset = TensorDataset(torch.from_numpy(X_val).float().to(device), torch.from_numpy(y_val).long().to(device))

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

model = MLP().to(device)
# criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(model.parameters())
# optimizer = torch.optim.Adam(model.parameters(), lr=0.0005, weight_decay=1e-4)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

n_epochs = 150  # increased to 150 epochs
train_losses, val_losses, train_accs, val_accs, val_class_precisions, val_class_recalls, val_class_f1_scores = [], [], [], [], [], [], []

# Class balancing: weight each class by the inverse of its sample count
# (np.bincount assumes the labels are the integers 0..3)
class_sample_counts = np.bincount(y_train)
class_weights = 1.0 / torch.tensor(class_sample_counts, dtype=torch.float32)
class_weights = class_weights.to(device)
print(class_sample_counts)
print(class_weights)

# Alternative: compute class weights with sklearn's 'balanced' heuristic
# class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
# class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)
# print("class weights: ", class_weights)
# additional_weights = torch.tensor([1.0, 1.0, 1.0, 1.0], dtype=torch.float32).to(device)
# class_weights *= additional_weights
# print("Updated class weights: ", class_weights)

# Weighted cross-entropy loss
criterion = nn.CrossEntropyLoss(weight=class_weights)
# criterion = nn.CrossEntropyLoss()
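
# Alternative balancing sketch (an assumption, not used here): oversample rare
# classes with a WeightedRandomSampler instead of weighting the loss. If
# enabled, the train DataLoader must use sampler= instead of shuffle=True.
# sample_weights = class_weights.cpu()[torch.from_numpy(y_train).long()]
# sampler = torch.utils.data.WeightedRandomSampler(sample_weights, num_samples=len(sample_weights), replacement=True)
# train_loader = DataLoader(train_dataset, batch_size=16, sampler=sampler)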

# Track the best model by its mean validation F1 across classes
best_val_f1 = 0.0
best_model = None

for epoch in range(n_epochs):
    model.train()
    running_loss, corrects = 0, 0
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)
        _, preds = torch.max(outputs, 1)
        corrects += torch.sum(preds == targets.data)

    epoch_loss = running_loss / len(train_loader.dataset)
    epoch_acc = corrects.item() / len(train_loader.dataset)
    train_losses.append(epoch_loss)
    train_accs.append(epoch_acc)

    print(f'Epoch {epoch+1} | Train Loss: {epoch_loss:.4f} | Train Accuracy: {epoch_acc:.4f}')

    model.eval()
    all_preds, all_targets = [], []
    running_loss, corrects = 0, 0
    with torch.no_grad():
        for inputs, targets in val_loader:
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            running_loss += loss.item() * inputs.size(0)
            _, preds = torch.max(outputs, 1)
            corrects += torch.sum(preds == targets.data)
            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())

    # Per-class metrics; zero_division=0 silences warnings when a class is never predicted
    class_precisions = precision_score(all_targets, all_preds, average=None, zero_division=0)
    class_recalls = recall_score(all_targets, all_preds, average=None, zero_division=0)
    class_f1_scores = f1_score(all_targets, all_preds, average=None, zero_division=0)

    for i, (precision, recall, f1) in enumerate(zip(class_precisions, class_recalls, class_f1_scores)):
        print(f'Epoch {epoch+1} | Class {i+1} Metrics: Precision={precision:.4f}, Recall={recall:.4f}, F1 Score={f1:.4f}')

    epoch_loss = running_loss / len(val_loader.dataset)
    epoch_acc = corrects.item() / len(val_loader.dataset)
    val_losses.append(epoch_loss)
    val_accs.append(epoch_acc)
    val_class_precisions.append(np.mean(class_precisions))
    val_class_recalls.append(np.mean(class_recalls))
    val_class_f1_scores.append(np.mean(class_f1_scores))

    print(f'Epoch {epoch+1} | Validation Loss: {epoch_loss:.4f} | Validation Accuracy: {epoch_acc:.4f}')

    # Keep the weights with the best mean validation F1; clone the tensors so
    # later epochs do not overwrite the saved snapshot
    if np.mean(class_f1_scores) > best_val_f1:
        best_val_f1 = np.mean(class_f1_scores)
        best_model = {k: v.clone() for k, v in model.state_dict().items()}

# Save the best model for this fold, named after the training file
torch.save(best_model, train_excel + '.pth')
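
# Reload sketch (illustrative): restoring the saved weights for later inference.
# map_location lets a GPU-trained checkpoint load on a CPU-only machine.
# restored = MLP()
# restored.load_state_dict(torch.load(train_excel + '.pth', map_location='cpu'))
# restored.eval()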

# Collect the losses and accuracies of every fold (this script handles a
# single fold, so each list holds one entry)
all_train_losses, all_val_losses, all_train_accs, all_val_accs, all_class_precisions, all_class_f1_scores, all_class_recalls = [], [], [], [], [], [], []
all_train_losses.append(train_losses)
all_val_losses.append(val_losses)
all_train_accs.append(train_accs)
all_val_accs.append(val_accs)
all_class_precisions.append(val_class_precisions)
all_class_recalls.append(val_class_recalls)
all_class_f1_scores.append(val_class_f1_scores)

print(f'All Fold Average | Train Loss: {np.mean(all_train_losses, axis=0)[-1].item():.4f} | Train Accuracy: {np.mean(all_train_accs, axis=0)[-1].item():.4f} | Validation Loss: {np.mean(all_val_losses, axis=0)[-1].item():.4f} | Validation Accuracy: {np.mean(all_val_accs, axis=0)[-1].item():.4f} | Validation Precision: {np.mean(all_class_precisions, axis=0)[-1].item():.4f} | Validation Recall: {np.mean(all_class_recalls, axis=0)[-1].item():.4f} | Validation F1_score: {np.mean(all_class_f1_scores, axis=0)[-1].item():.4f}')

# Earlier single-list variant, kept for reference:
# all_train_losses = train_losses
# all_val_losses = val_losses
# all_train_accs = train_accs
# all_val_accs = val_accs
# all_class_precisions = val_class_precisions
# all_class_recalls = val_class_recalls
# all_class_f1_scores = val_class_f1_scores
# print(f'All Fold Average | Train Loss: {all_train_losses:.4f} | Train Accuracy: {all_train_accs:.4f} | Validation Loss: {all_val_losses:.4f} | Validation Accuracy: {all_val_accs:.4f} | Validation Precision: {all_class_precisions:.4f} | Validation Recall: {all_class_recalls:.4f} | Validation F1_score: {all_class_f1_scores:.4f}')

# Plot the averaged loss, accuracy, and per-class metric curves across folds
plt.figure(figsize=(12, 10))

plt.subplot(3, 2, 1)
plt.plot(range(n_epochs), np.mean(all_train_losses, axis=0), label='Train Loss')
plt.plot(range(n_epochs), np.mean(all_val_losses, axis=0), label='Validation Loss')
plt.legend()
plt.title('Loss')

plt.subplot(3, 2, 2)
plt.plot(range(n_epochs), np.mean(all_train_accs, axis=0), label='Train Accuracy')
plt.plot(range(n_epochs), np.mean(all_val_accs, axis=0), label='Validation Accuracy')
plt.legend()
plt.title('Accuracy')

plt.subplot(3, 2, 3)
plt.plot(range(n_epochs), np.mean(all_class_precisions, axis=0), label='Validation Precision')
plt.legend()
plt.title('Precision')

plt.subplot(3, 2, 4)
plt.plot(range(n_epochs), np.mean(all_class_recalls, axis=0), label='Validation Recall')
plt.legend()
plt.title('Recall')

plt.subplot(3, 2, 5)
plt.plot(range(n_epochs), np.mean(all_class_f1_scores, axis=0), label='Validation F1_score')
plt.legend()
plt.title('F1_score')

plt.tight_layout()
plt.show()