"""
文件名: train_gpu_blance_10features.py
训练部分代码
作者: 王春林
创建日期: 2023年10月18日
最后修改日期: 2023年10月20日
版本号: 1.0.0
"""
import copy  # needed to deep-copy the best model state during training
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.utils.class_weight import compute_class_weight
# Training-set Excel file name
train_excel = r'train_fold0.xlsx'
# Validation-set Excel file name
val_excel = r'val_fold0.xlsx'
# Use the GPU if available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Define the MLP network
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(10, 32),   # input layer (10 features)
            nn.ReLU(),           # activation
            nn.Linear(32, 128),  # hidden layer
            nn.ReLU(),           # activation
            nn.Linear(128, 32),  # hidden layer
            nn.ReLU(),           # activation
            nn.Linear(32, 4),    # output layer: 4 classes
        )

    def forward(self, x):
        return self.model(x)
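# A throwaway sanity check of the architecture (assumes nothing beyond the
# class above): a batch of 8 ten-feature rows should yield 8 rows of 4 logits.
# _probe = MLP()(torch.randn(8, 10))
# assert _probe.shape == (8, 4)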
# Read features and labels from the Excel files
train_data = pd.read_excel(train_excel)
val_data = pd.read_excel(val_excel)
# Feature column names -- these must match the Excel headers exactly, so the
# Chinese strings are kept as-is. Roughly: OCD symptoms, interpersonal
# sensitivity, depression, multi-factor symptoms, maternal parenting style,
# paternal parenting style, self-rated family finances, psychotherapy/counseling
# history, academic performance, attendance (each encoded numerically).
feature_names = ["强迫症状数字化", "人际关系敏感数字化", "抑郁数字化", "多因子症状", "母亲教养方式数字化", "父亲教养方式数字化", "自评家庭经济条件数字化", "有无心理治疗(咨询)史数字化", "学业情况数字化", "出勤情况数字化"]
# Split features and labels ('类别' is the class-label column). Note: despite
# the scaler imports above, no scaling is applied here; see the sketch below.
X_train = train_data[feature_names].values
y_train = train_data['类别'].values
X_val = val_data[feature_names].values
y_val = val_data['类别'].values
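# If scaling was intended (as the comment above suggests), a minimal sketch
# using the imported StandardScaler would fit on the training split only and
# reuse the fitted scaler for validation:
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_val = scaler.transform(X_val)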
train_dataset = TensorDataset(torch.from_numpy(X_train).float().to(device), torch.from_numpy(y_train).long().to(device))
val_dataset = TensorDataset(torch.from_numpy(X_val).float().to(device), torch.from_numpy(y_val).long().to(device))
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
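# Note: the full arrays were moved to `device` when the TensorDatasets were
# built, so batches from these loaders are already on the GPU and need no
# per-batch .to(device) call in the training loop.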
model = MLP().to(device)
# Earlier loss/optimizer variants, kept for reference:
# criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(model.parameters())
# optimizer = torch.optim.Adam(model.parameters(), lr=0.0005, weight_decay=1e-4)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
n_epochs = 150  # number of training epochs (increased to 150)
train_losses, val_losses, train_accs, val_accs = [], [], [], []
val_class_precisions, val_class_recalls, val_class_f1_scores = [], [], []
# Class balancing: weight each class inversely to its training-sample count
class_sample_counts = np.bincount(y_train)
class_weights = 1.0 / torch.tensor(class_sample_counts, dtype=torch.float32)
class_weights = class_weights.to(device)
print(class_sample_counts)
print(class_weights)
# Alternative: sklearn's 'balanced' class weights, kept for reference
# class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
# class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)
# print("class weights: ", class_weights)
# additional_weights = torch.tensor([1.0, 1.0, 1.0, 1.0], dtype=torch.float32).to(device)
# class_weights *= additional_weights
# print("Updated class weights: ", class_weights)
# Weighted cross-entropy loss using the inverse-frequency weights above
criterion = nn.CrossEntropyLoss(weight=class_weights)
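# An alternative balancing mechanism (not used by this script) would be to
# oversample minority classes with torch's WeightedRandomSampler instead of
# weighting the loss -- a sketch, deriving per-sample weights from the counts
# above (the sampler replaces shuffle=True in the DataLoader):
# from torch.utils.data import WeightedRandomSampler
# sample_weights = (1.0 / class_sample_counts)[y_train]
# sampler = WeightedRandomSampler(torch.from_numpy(sample_weights).double(),
#                                 num_samples=len(y_train), replacement=True)
# train_loader = DataLoader(train_dataset, batch_size=16, sampler=sampler)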
# Track the best model state, selected by mean validation F1
best_val_f1 = 0.0
best_model = None
for epoch in range(n_epochs):
    model.train()
    running_loss, corrects = 0, 0
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)
        _, preds = torch.max(outputs, 1)
        corrects += torch.sum(preds == targets.data)
    epoch_loss = running_loss / len(train_loader.dataset)
    epoch_acc = (corrects.double() / len(train_loader.dataset)).cpu().item()
    train_losses.append(epoch_loss)
    train_accs.append(epoch_acc)
    print(f'Epoch {epoch+1} | Train Loss: {epoch_loss:.4f} | Train Accuracy: {epoch_acc:.4f}')
    # Validation pass
    model.eval()
    all_preds, all_targets = [], []
    running_loss, corrects = 0, 0
    with torch.no_grad():
        for inputs, targets in val_loader:
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            running_loss += loss.item() * inputs.size(0)
            _, preds = torch.max(outputs, 1)
            corrects += torch.sum(preds == targets.data)
            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())
    # Per-class metrics (zero_division=0 silences warnings for classes that
    # are never predicted in early epochs)
    class_precisions = precision_score(all_targets, all_preds, average=None, zero_division=0)
    class_recalls = recall_score(all_targets, all_preds, average=None, zero_division=0)
    class_f1_scores = f1_score(all_targets, all_preds, average=None, zero_division=0)
    for i, (precision, recall, f1) in enumerate(zip(class_precisions, class_recalls, class_f1_scores)):
        print(f'Epoch {epoch+1} | Class {i+1} Metrics: Precision={precision:.4f}, Recall={recall:.4f}, F1 Score={f1:.4f}')
    epoch_loss = running_loss / len(val_loader.dataset)
    epoch_acc = (corrects.double() / len(val_loader.dataset)).cpu().item()
    val_losses.append(epoch_loss)
    val_accs.append(epoch_acc)
    val_class_precisions.append(np.mean(class_precisions))
    val_class_recalls.append(np.mean(class_recalls))
    val_class_f1_scores.append(np.mean(class_f1_scores))
    print(f'Epoch {epoch+1} | Validation Loss: {epoch_loss:.4f} | Validation Accuracy: {epoch_acc:.4f}')
    # Keep a deep copy of the best state; state_dict() alone returns live
    # references that later training steps would overwrite
    if np.mean(class_f1_scores) > best_val_f1:
        best_val_f1 = np.mean(class_f1_scores)
        best_model = copy.deepcopy(model.state_dict())
# Save the best model state after training finishes
torch.save(best_model, train_excel + '.pth')
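# To reuse the saved weights later (a minimal sketch; the .pth file holds only
# a state_dict, so the architecture must be rebuilt first):
# inference_model = MLP().to(device)
# inference_model.load_state_dict(torch.load(train_excel + '.pth', map_location=device))
# inference_model.eval()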
# Store per-fold losses and metrics (this script trains a single fold)
all_train_losses, all_val_losses, all_train_accs, all_val_accs = [], [], [], []
all_class_precisions, all_class_f1_scores, all_class_recalls = [], [], []
all_train_losses.append(train_losses)
all_val_losses.append(val_losses)
all_train_accs.append(train_accs)
all_val_accs.append(val_accs)
all_class_precisions.append(val_class_precisions)
all_class_recalls.append(val_class_recalls)
all_class_f1_scores.append(val_class_f1_scores)
print(f'All Fold Average | Train Loss: {np.mean(all_train_losses, axis=0)[-1].item():.4f} | Train Accuracy: {np.mean(all_train_accs, axis=0)[-1].item():.4f} | Validation Loss: {np.mean(all_val_losses, axis=0)[-1].item():.4f} | Validation Accuracy: {np.mean(all_val_accs, axis=0)[-1].item():.4f} | Validation Precision: {np.mean(all_class_precisions, axis=0)[-1].item():.4f} | Validation Recall: {np.mean(all_class_recalls, axis=0)[-1].item():.4f} | Validation F1_score: {np.mean(all_class_f1_scores, axis=0)[-1].item():.4f}')
# Plot the averaged loss, accuracy, and per-class-metric curves across folds
plt.figure(figsize=(12, 10))  # taller figure so the 3x2 subplot grid is readable
plt.subplot(3, 2, 1)
plt.plot(range(n_epochs), np.mean(all_train_losses, axis=0), label='Train Loss')
plt.plot(range(n_epochs), np.mean(all_val_losses, axis=0), label='Validation Loss')
plt.legend()
plt.title('Loss')
plt.subplot(3, 2, 2)
plt.plot(range(n_epochs), np.mean(all_train_accs, axis=0), label='Train Accuracy')
plt.plot(range(n_epochs), np.mean(all_val_accs, axis=0), label='Validation Accuracy')
plt.legend()
plt.title('Accuracy')
plt.subplot(3, 2, 3)
plt.plot(range(n_epochs), np.mean(all_class_precisions, axis=0), label='Validation Precision')
plt.legend()
plt.title('Precision')
plt.subplot(3, 2, 4)
plt.plot(range(n_epochs), np.mean(all_class_recalls, axis=0), label='Validation Recall')
plt.legend()
plt.title('Recall')
plt.subplot(3, 2, 5)
plt.plot(range(n_epochs), np.mean(all_class_f1_scores, axis=0), label='Validation F1_score')
plt.legend()
plt.title('F1_score')
plt.tight_layout()
plt.show()
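# To persist the curves, a save call could be added just before plt.show()
# above (hypothetical output filename):
# plt.savefig(train_excel + '_curves.png', dpi=150)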