# Commit 9fd0931, wangchunlin, 2023-05-05 (translated):
#   Implements yes/no binary classification with cross-validation support;
#   a softmax multi-class path exists but is not fully debugged. The binary
#   path works, with detection accuracy ~0.9.
#
# NOTE(review): recovered from a mangled `git show` dump that introduced two
# scripts (main.py and val.py) plus binary artifacts (data/data_src.xlsx,
# psychology_model.pth). The two script sections are marked below.

# ============================ main.py ============================
# Train a single-hidden-layer MLP binary classifier on the Excel survey data.

import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import Dataset, DataLoader


class MLP(nn.Module):
    """One-hidden-layer perceptron with a sigmoid output (binary classification)."""

    def __init__(self, input_size, hidden_size, output_size):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu1(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out


class PsychologyDataset(Dataset):
    """Dataset over an Excel sheet.

    Columns 36:43 are the float features; the last column holds the label,
    where the string "是" (yes) maps to 1.0 and anything else to 0.0.
    """

    def __init__(self, data_file):
        # Reads the whole sheet into memory once; assumes it fits in RAM.
        self.data = pd.read_excel(data_file)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        features = self.data.iloc[idx, 36:43].values.astype(np.float32)
        # BUG FIX: the original bound this cell to a variable named `str`,
        # shadowing the builtin; the if/else also collapses to one expression.
        raw_label = self.data.iloc[idx, -1]
        label = np.float32(1.0 if raw_label == "是" else 0.0)
        return features, label


# Hyperparameters.
input_size = 7
hidden_size = 16
output_size = 1
lr = 0.01
num_epochs = 100

# Load data.
dataset = PsychologyDataset("data/data_src.xlsx")
dataloader = DataLoader(dataset, batch_size=1, shuffle=False)

# Model, loss, optimizer. BCELoss matches the sigmoid output head.
model = MLP(input_size, hidden_size, output_size)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

# Train.
for epoch in range(num_epochs):
    running_loss = 0.0
    for i, data in enumerate(dataloader):
        inputs, labels = data
        optimizer.zero_grad()
        outputs = model(inputs)
        # labels is shape (B,); unsqueeze to (B, 1) to match the model output.
        loss = criterion(outputs, labels.unsqueeze(1))
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print('Epoch [%d/%d], Loss: %.4f' % (epoch + 1, num_epochs, running_loss / len(dataloader)))

# Save trained weights.
torch.save(model.state_dict(), 'psychology_model.pth')

# ============================ val.py ============================
# Models, train/test helpers, and data loading for K-fold cross-validation.

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader, Dataset


class MLP(nn.Module):
    """One-hidden-layer perceptron with a sigmoid output (binary classification)."""

    def __init__(self, input_size, hidden_size, output_size):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out


class MMLP(nn.Module):
    """Multi-layer perceptron with a configurable stack of hidden layers.

    Ends in a sigmoid; for the (not yet debugged) multi-class path the
    commented Softmax would replace it.
    """

    def __init__(self, input_size, hidden_sizes, output_size):
        super(MMLP, self).__init__()
        self.layers = nn.ModuleList()
        for h in hidden_sizes:
            self.layers.append(nn.Linear(input_size, h))
            self.layers.append(nn.ReLU())
            input_size = h
        self.layers.append(nn.Linear(input_size, output_size))
        self.layers.append(nn.Sigmoid())
        # For multi-class use: nn.Softmax(dim=1) instead of Sigmoid.

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x


class TensorDataset(Dataset):
    """Minimal (features, labels) tensor-pair dataset."""

    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, index):
        return self.features[index], self.labels[index]


def _hard_predictions(outputs):
    """Turn model outputs into hard class predictions.

    BUG FIX: the original used `torch.max(outputs, 1)` even for a 1-unit
    sigmoid head, so the prediction was always class 0 and the reported
    accuracy was just the negative-class frequency. For a single sigmoid
    unit, threshold at 0.5; for a multi-class head, take the argmax.
    """
    if outputs.size(1) == 1:
        return (outputs >= 0.5).float().squeeze(1)
    return torch.max(outputs, 1)[1]


def train_model(model, train_loader, criterion, optimizer, num_epochs):
    """Train `model` for `num_epochs`, printing per-epoch loss and accuracy."""
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.train()

    for epoch in range(num_epochs):
        train_loss = 0.0
        train_corrects = 0

        for inputs, labels in train_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()

            outputs = model(inputs)
            preds = _hard_predictions(outputs)
            loss = criterion(outputs, labels.unsqueeze(1))
            loss.backward()
            optimizer.step()

            # Weight by batch size so the epoch average is per-sample.
            train_loss += loss.item() * inputs.size(0)
            train_corrects += torch.sum(preds == labels.data)

        train_loss = train_loss / len(train_loader.dataset)
        train_acc = train_corrects.double() / len(train_loader.dataset)

        print('Epoch [{}/{}], Loss: {:.4f}, Acc: {:.4f}'
              .format(epoch + 1, num_epochs, train_loss, train_acc))


def test(model, dataloader, criterion):
    """Evaluate `model` on `dataloader`; returns (mean loss, accuracy)."""
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.eval()
    running_loss = 0.0
    running_corrects = 0

    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels.unsqueeze(1))

            # BUG FIX: same thresholding fix as in train_model — argmax over a
            # single sigmoid column always returned 0.
            preds = _hard_predictions(outputs)

            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(preds == labels.data)

    test_loss = running_loss / len(dataloader.dataset)
    test_acc = running_corrects.double() / len(dataloader.dataset)

    print('Test Loss: {:.4f} Acc: {:.4f}'.format(test_loss, test_acc))
    return test_loss, test_acc


# Load data: feature columns 36:43, label "是" -> 1 else 0.
df = pd.read_excel("data/data_src.xlsx")
src_features = df.iloc[:, 36:43].values.astype(np.float32)
# BUG FIX: the comprehension variable was named `str`, shadowing the builtin.
src_labels = np.array([1 if cell == "是" else 0 for cell in df.iloc[:, -1].values]).astype(np.float32)
print(src_labels)
print(len(src_labels))

# Report which device will be used (models are moved inside train_model/test).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

# Stratified K-fold keeps the class ratio in every fold; fixed seed for
# reproducible splits.
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Model / training hyperparameters.
input_size = src_features.shape[1]
print(input_size)
hidden_size = 32
output_size = 1
lr = 0.01
num_epochs = 50
# BUG FIX: the original assigned batch_size = 32, then 16, and then hardcoded
# 32 in both DataLoaders anyway. Keep the value actually used (32) in one
# variable and pass it through.
batch_size = 32

fold_accuracy = []

# Cross-validation: train and evaluate one fresh model per fold.
for fold, (train_idx, test_idx) in enumerate(skf.split(src_features, src_labels)):
    print(f"Fold [{fold+1}/{skf.n_splits}]")
    print("train_idx:", train_idx)
    print("test_idx:", test_idx)
    train_features = src_features[train_idx]
    train_labels = src_labels[train_idx]
    test_features = src_features[test_idx]
    test_labels = src_labels[test_idx]

    # numpy arrays -> PyTorch tensors.
    train_features_tensor = torch.tensor(train_features, dtype=torch.float)
    train_labels_tensor = torch.tensor(train_labels, dtype=torch.float)
    test_features_tensor = torch.tensor(test_features, dtype=torch.float)
    test_labels_tensor = torch.tensor(test_labels, dtype=torch.float)

    # Datasets and loaders for this fold.
    train_dataset = TensorDataset(train_features_tensor, train_labels_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_dataset = TensorDataset(test_features_tensor, test_labels_tensor)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    print('--------------------------------')

    # Fresh model per fold; BCELoss matches the sigmoid output head.
    # (For the multi-class experiment: MLP(input_size, 32, 2) with
    # nn.CrossEntropyLoss — not yet debugged per the commit message.)
    model = MMLP(input_size, [32], 1)
    criterion = nn.BCELoss()
    # Use the lr variable instead of a hardcoded 0.01.
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Train, then evaluate on the held-out fold. (The original called
    # model.train() again after training, which had no effect — removed.)
    train_model(model, train_loader, criterion, optimizer, num_epochs)
    test_loss, test_acc = test(model, test_loader, criterion)

    fold_accuracy.append(test_acc.item())
    print(f'Accuracy for fold {fold}: {fold_accuracy[fold]*100} %')
    print('--------------------------------')

print('K-FOLD CROSS VALIDATION RESULTS')
print(f'Fold accuracies: {fold_accuracy}')
print(f'Mean accuracy: {np.mean(fold_accuracy)}')