diff --git a/Psychological_Classification_4Classes.pth b/Psychological_Classification_4Classes.pth
new file mode 100644
index 0000000..857abcb
Binary files /dev/null and b/Psychological_Classification_4Classes.pth differ
diff --git a/detect.py b/detect.py
index 678be40..f86bb60 100644
--- a/detect.py
+++ b/detect.py
@@ -40,7 +40,7 @@ class MLP(nn.Module):
 
 # Load the model
 model = MLP().to(device)
-model.load_state_dict(torch.load('model_5.pth'))
+model.load_state_dict(torch.load('Psychological_Classification_4Classes.pth'))
 model.eval()
 
 # Create the data loader
diff --git a/detect_num.py b/detect_num.py
new file mode 100644
index 0000000..7fa49d1
--- /dev/null
+++ b/detect_num.py
@@ -0,0 +1,126 @@
+"""
+File: detect_num.py
+
+Inference code.
+
+Author: 王春林
+Created: 2023-07-14
+Last modified: 2023-07-18
+Version: 1.0.0
+"""
+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import MinMaxScaler
+import torch
+from torch import nn
+from torch.utils.data import DataLoader, TensorDataset
+
+# Use the GPU if one is available
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# device = torch.device("cpu")  # uncomment to force CPU
+
+# Read the features and labels
+data = pd.read_excel('feature_label.xlsx')
+
+# Sample-ID column ('编号')
+sample_ids = data['编号'].values
+
+# Feature columns (the 17 column names in feature_label.xlsx)
+feature_names = ["躯体化", "强迫症状", "人际关系敏感", "抑郁", "焦虑", "敌对", "恐怖", "偏执", "精神病性", "其他", "父亲教养方式数字化", "母亲教养方式数字化", "自评家庭经济条件数字化", "有无心理治疗(咨询)史数字化", "出勤情况数字化", "学业情况数字化", "权重数字化值"]
+
+# Split features from labels and normalize the features
+X = data[feature_names].values
+y = data['label'].values - 1  # shift labels from 1-4 to 0-3
+
+scaler = MinMaxScaler()
+X = scaler.fit_transform(X)
+
+# Define the MLP network
+class MLP(nn.Module):
+    def __init__(self):
+        super(MLP, self).__init__()
+        self.model = nn.Sequential(
+            nn.Linear(17, 32),   # input layer
+            nn.ReLU(),           # activation
+            nn.Linear(32, 128),  # hidden layer
+            nn.ReLU(),           # activation
+            nn.Linear(128, 32),  # hidden layer
+            nn.ReLU(),           # activation
+            nn.Linear(32, 4),    # output layer, 4 classes
+        )
+
+    def forward(self, x):
+        return self.model(x)
+
+# Load the model
+model = MLP().to(device)
+model.load_state_dict(torch.load('Psychological_Classification_4Classes.pth', map_location=device))
+model.eval()
+
+# Create the data loader
+dataset = TensorDataset(torch.from_numpy(X).float().to(device), torch.from_numpy(y).long().to(device))
+loader = DataLoader(dataset, batch_size=32)
+
+# First pass: per-sample predictions and overall accuracy
+corrects = 0
+sample_index = 0
+for inputs, targets in loader:
+    outputs = model(inputs)
+    _, preds = torch.max(outputs, 1)
+    corrects += torch.sum(preds == targets.data)
+
+    # Print the prediction for every sample (printed labels are the Excel values minus 1)
+    for i in range(len(inputs)):
+        print(f'Sample ID: {sample_ids[sample_index]} | Target: {targets[i].item()} | Prediction: {preds[i].item()} (Excel label - 1)')
+        sample_index += 1
+
+# Overall accuracy
+accuracy = corrects.double().cpu() / len(loader.dataset)
+print(f'Overall Accuracy: {accuracy:.4f}')
+
+# Second pass: write the warning lists and collect per-class statistics
+
+# Files for the sample IDs predicted as class 0 and class 1
+file_1st_warning = open("一级预警.txt", "w", encoding="utf-8")  # first-level warning
+file_2nd_warning = open("二级预警.txt", "w", encoding="utf-8")  # second-level warning
+
+# Per-class counters
+class_counts = {0: 0, 1: 0, 2: 0, 3: 0}
+class_corrects = {0: 0, 1: 0, 2: 0, 3: 0}
+
+# Run inference again
+corrects = 0
+sample_index = 0
+for inputs, targets in loader:
+    outputs = model(inputs)
+    _, preds = torch.max(outputs, 1)
+    corrects += torch.sum(preds == targets.data)
+
+    # Record the sample IDs predicted as class 0 or class 1
+    for i in range(len(inputs)):
+        if preds[i] == 0:
+            file_1st_warning.write(f"{sample_ids[sample_index]}\n")
+        elif preds[i] == 1:
+            file_2nd_warning.write(f"{sample_ids[sample_index]}\n")
+        sample_index += 1
+
+    # Update the per-class counters
+    for i in range(len(targets)):
+        class_counts[targets[i].item()] += 1
+        if preds[i] == targets[i]:
+            class_corrects[targets[i].item()] += 1
+
+# Close the files
+file_1st_warning.close()
+file_2nd_warning.close()
+
+# Overall accuracy
+accuracy = corrects.double().cpu() / len(loader.dataset)
+print(f'Overall accuracy: {accuracy:.4f}')
+
+# Per-class statistics: accuracy per class is corrects_c / count_c, where
+# count_c is the number of true samples of class c in the dataset
+for class_idx in range(4):
+    class_accuracy = class_corrects[class_idx] / max(class_counts[class_idx], 1)  # guard against empty classes
+    print(f'Class {class_idx + 1} | Samples: {class_counts[class_idx]} | Accuracy: {class_accuracy:.4f}')
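+
+# Optional extension (a sketch, not in the original script; `all_preds` is a
+# hypothetical list that would have to be collected during the loop above):
+# save the per-sample predictions next to the IDs for manual review.
+# pd.DataFrame({"编号": sample_ids, "prediction": all_preds}).to_csv(
+#     "predictions.csv", index=False, encoding="utf-8-sig")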
diff --git a/record.txt b/record.txt
index 28e7b7f..244ff28 100644
--- a/record.txt
+++ b/record.txt
@@ -23,14 +23,32 @@
 Finished the academic-warning statistics and removed erroneous data; one error:
 name collision: 扎西尼玛 / 扎西尼马
 
-2023-07-08
+2023-07-03
 Finished feature scaling and selection (17 features chosen, including the weighting values my uncle (幺爸) provided; correlations ignored for now, no good method found). Finished data cleaning; missing-value handling was already done earlier.
 
-2023-07-12
+2023-07-04
 Final screening of the required features.
 
-2023-07-13
+2023-07-07
 Quantified all features except academics and attendance per the feature-quantization table (handled in Feature_Processed_2.py); ran further feature statistics, but the academic and attendance features are not yet included.
 
+2023-07-08
+The average leave count is a bit ambiguous: unclear whether to compute it over all 26,111 samples or the 15,342 in the table. I used all of them, defaulting to 0 for samples with no record.
+
+2023-07-09
+Finished all feature selection and quantization; Feature_Process_5.py performs the full quantization. Still need to check whether the remaining features are fully collected, and to work out how to normalize the features. Tentatively using 17 features.
+
+2023-07-11
+Removed the 23 people with severely incomplete information; total samples 26,088 (of 26,111).
+
+2023-07-13
+Finished training.
+
+2023-07-14
+Finished the inference part.
+
+2023-07-16
+Added a class-balancing mechanism, switched to GPU training, improved the code structure, and added the necessary print/log output.
+
diff --git a/train_gpu_blance.py b/train_gpu_blance.py
new file mode 100644
index 0000000..81efea9
--- /dev/null
+++ b/train_gpu_blance.py
@@ -0,0 +1,155 @@
+"""
+File: train_gpu_blance.py
+
+Training code.
+
+Author: 王春林
+Created: 2023-07-13
+Last modified: 2023-07-18
+Version: 1.0.0
+"""
+import copy
+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.model_selection import StratifiedKFold
+import torch
+from torch import nn
+from torch.utils.data import DataLoader, TensorDataset
+import matplotlib.pyplot as plt
+
+# Use the GPU if one is available
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# Read the features and labels
+data = pd.read_excel('feature_label.xlsx')
+
+# Feature columns (the 17 column names in feature_label.xlsx)
+feature_names = ["躯体化", "强迫症状", "人际关系敏感", "抑郁", "焦虑", "敌对", "恐怖", "偏执", "精神病性", "其他", "父亲教养方式数字化", "母亲教养方式数字化", "自评家庭经济条件数字化", "有无心理治疗(咨询)史数字化", "出勤情况数字化", "学业情况数字化", "权重数字化值"]
+
+# Split features from labels and normalize the features
+X = data[feature_names].values
+y = data['label'].values - 1  # shift labels from 1-4 to 0-3
+
+scaler = MinMaxScaler()
+X = scaler.fit_transform(X)
+
+# Define the MLP network
+class MLP(nn.Module):
+    def __init__(self):
+        super(MLP, self).__init__()
+        self.model = nn.Sequential(
+            nn.Linear(17, 32),   # input layer
+            nn.ReLU(),           # activation
+            nn.Linear(32, 128),  # hidden layer
+            nn.ReLU(),           # activation
+            nn.Linear(128, 32),  # hidden layer
+            nn.ReLU(),           # activation
+            nn.Linear(32, 4),    # output layer, 4 classes
+        )
+
+    def forward(self, x):
+        return self.model(x)
+
+# 5-fold stratified cross-validation
+skf = StratifiedKFold(n_splits=5, shuffle=True)
+
+# Losses and accuracies across all folds
+all_train_losses, all_val_losses, all_train_accs, all_val_accs = [], [], [], []
+
+for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
+    X_train, X_val = X[train_index], X[test_index]
+    y_train, y_val = y[train_index], y[test_index]
+
+    train_dataset = TensorDataset(torch.from_numpy(X_train).float().to(device), torch.from_numpy(y_train).long().to(device))
+    val_dataset = TensorDataset(torch.from_numpy(X_val).float().to(device), torch.from_numpy(y_val).long().to(device))
+
+    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
+    val_loader = DataLoader(val_dataset, batch_size=32)
+
+    # Class-balancing mechanism: weight each class inversely to its frequency
+    # in the training fold
+    class_sample_counts = np.bincount(y_train)
+    class_weights = 1.0 / torch.tensor(class_sample_counts, dtype=torch.float32)
+    class_weights = class_weights.to(device)
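+
+    # An alternative balancing mechanism (a sketch, not used here): oversample
+    # minority classes with a WeightedRandomSampler instead of weighting the loss.
+    # sampler = torch.utils.data.WeightedRandomSampler(
+    #     weights=class_weights[torch.from_numpy(y_train)].cpu(),
+    #     num_samples=len(y_train), replacement=True)
+    # train_loader = DataLoader(train_dataset, batch_size=32, sampler=sampler)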
+
+    model = MLP().to(device)
+    # The class weights are applied through CrossEntropyLoss's built-in
+    # `weight` argument, which scales each sample's loss by its class weight
+    criterion = nn.CrossEntropyLoss(weight=class_weights)
+    optimizer = torch.optim.Adam(model.parameters())
+
+    n_epochs = 120  # number of training epochs
+    train_losses, val_losses, train_accs, val_accs = [], [], [], []
+
+    # Track the best model of this fold by validation accuracy
+    best_val_acc = 0.0
+    best_model = None
+
+    for epoch in range(n_epochs):
+        model.train()
+        running_loss, corrects = 0.0, 0
+        for inputs, targets in train_loader:
+            optimizer.zero_grad()
+            outputs = model(inputs)
+            loss = criterion(outputs, targets)  # already class-weighted
+            loss.backward()
+            optimizer.step()
+            running_loss += loss.item() * inputs.size(0)
+            _, preds = torch.max(outputs, 1)
+            corrects += torch.sum(preds == targets.data)
+
+        epoch_loss = running_loss / len(train_loader.dataset)
+        epoch_acc = (corrects.double() / len(train_loader.dataset)).item()
+        train_losses.append(epoch_loss)
+        train_accs.append(epoch_acc)
+
+        print(f'Fold {fold+1}, Epoch {epoch+1} | Train Loss: {epoch_loss:.4f} | Train Accuracy: {epoch_acc:.4f}')
+
+        model.eval()
+        running_loss, corrects = 0.0, 0
+        with torch.no_grad():
+            for inputs, targets in val_loader:
+                outputs = model(inputs)
+                loss = criterion(outputs, targets)
+                running_loss += loss.item() * inputs.size(0)
+                _, preds = torch.max(outputs, 1)
+                corrects += torch.sum(preds == targets.data)
+
+        epoch_loss = running_loss / len(val_loader.dataset)
+        epoch_acc = (corrects.double() / len(val_loader.dataset)).item()
+        val_losses.append(epoch_loss)
+        val_accs.append(epoch_acc)
+
+        print(f'Fold {fold+1}, Epoch {epoch+1} | Validation Loss: {epoch_loss:.4f} | Validation Accuracy: {epoch_acc:.4f}')
+
+        # Keep a snapshot of the best weights (deepcopy, so that later epochs
+        # do not overwrite the saved state dict in place)
+        if epoch_acc > best_val_acc:
+            best_val_acc = epoch_acc
+            best_model = copy.deepcopy(model.state_dict())
+
+    # Save the best model of this fold
+    torch.save(best_model, f'model_fold{fold+1}.pth')
+
+    all_train_losses.append(train_losses)
+    all_val_losses.append(val_losses)
+    all_train_accs.append(train_accs)
+    all_val_accs.append(val_accs)
+
+# Plot the average loss and accuracy curves over all folds
+plt.figure(figsize=(12, 4))
+plt.subplot(1, 2, 1)
+plt.plot(range(n_epochs), np.mean(all_train_losses, axis=0), label='Train Loss')
+plt.plot(range(n_epochs), np.mean(all_val_losses, axis=0), label='Validation Loss')
+plt.legend()
+plt.title('Loss')
+
+plt.subplot(1, 2, 2)
+plt.plot(range(n_epochs), np.mean(all_train_accs, axis=0), label='Train Accuracy')
+plt.plot(range(n_epochs), np.mean(all_val_accs, axis=0), label='Validation Accuracy')
+plt.legend()
+plt.title('Accuracy')
+
+print(f'All Fold Average | Train Loss: {np.mean(all_train_losses, axis=0)[-1]:.4f} | Train Accuracy: {np.mean(all_train_accs, axis=0)[-1]:.4f} | Validation Loss: {np.mean(all_val_losses, axis=0)[-1]:.4f} | Validation Accuracy: {np.mean(all_val_accs, axis=0)[-1]:.4f}')
+
+plt.show()
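+
+# Optional follow-up (a sketch, not part of the original script): promote the
+# checkpoint of the fold with the highest best validation accuracy to the file
+# name the inference scripts expect.
+# import shutil
+# best_fold = int(np.argmax([max(accs) for accs in all_val_accs])) + 1
+# shutil.copy(f'model_fold{best_fold}.pth', 'Psychological_Classification_4Classes.pth')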