不会再改算法了, 完备

master
wangchunlin 2 years ago
parent e903ba49d0
commit 36b93fcbf0

@ -45,7 +45,7 @@ def classify_features(features_data_list):
返回: 返回:
dict: 包含分类结果和模型文件信息的字典 dict: 包含分类结果和模型文件信息的字典
""" """
response = requests.post("http://127.0.0.1:3397/evaluate/", json=features_data_list) response = requests.post("http://p2p1.melulu.top:3397/evaluate/", json=features_data_list)
if response.status_code == 200: if response.status_code == 200:
results = response.json() results = response.json()
print("Precision:", results["classification_result"]["precision"]) print("Precision:", results["classification_result"]["precision"])

@ -87,7 +87,7 @@ if __name__ == "__main__":
features_data_list = [features_data4, features_data2, features_data3, features_data1] features_data_list = [features_data4, features_data2, features_data3, features_data1]
# 发送 POST 请求 # 发送 POST 请求
response = requests.post("http://127.0.0.1:3397/inference/", json=features_data_list) response = requests.post("http://p2p1.melulu.top:3397/inference/", json=features_data_list)
if response.status_code == 200: if response.status_code == 200:
# 获取分类结果列表 # 获取分类结果列表

@ -1,5 +1,5 @@
#---设备配置---# #---设备配置---#
#device: cpu # device: cpu
device: cuda device: cuda
#---训练配置---# #---训练配置---#
@ -8,11 +8,11 @@ batch_size: 16
learning_rate: 0.001 learning_rate: 0.001
nc: 4 nc: 4
#data_train: train_val # train: 只用train训练val做验证 infer做测试train_val: 用train和val做训练infer做验证 infer做测试all: 全部训练全部验证全部测试数据先1/5作为infer剩下的再1/5作为val剩下的4/5作为训练 #data_train: train_val # train: 只用train训练val做验证 infer做测试train_val: 用train和val做训练infer做验证 infer做测试all: 全部训练全部验证全部测试数据先1/5作为infer剩下的再1/5作为val剩下的4/5作为训练
data_train: train data_train: train_val
early_stop_patience: 50 early_stop_patience: 50
gamma: 0.98 gamma: 0.98
step_size: 10 step_size: 10
experiments_count: 5 experiments_count: 1
#---检测和推理配置---# #---检测和推理配置---#
# 检测和推理使用模型路径 # 检测和推理使用模型路径
@ -46,3 +46,79 @@ feature_weights:
- 0.08 - 0.08
- 0.12 - 0.12
#---网络结构---#
# MLP configuration
mlp:
input_dim: 10 # Number of input features
layers:
- output_dim: 32
activation: relu
- output_dim: 128
activation: relu
- output_dim: 32
activation: relu
output_dim: 4 # Number of classes
# Transformer configuration
transformer:
d_model: 32 # Reduced embedding dimension
nhead: 4 # Reduced number of attention heads
num_encoder_layers: 2 # Reduced number of encoder layers
num_decoder_layers: 2 # Reduced number of decoder layers
dim_feedforward: 128 # Reduced feedforward network dimension
dropout: 0.1 # Dropout probability
input_dim: 10 # Number of input features
output_dim: 4 # Number of classes
#---训练配置备份---#
# MLP good train param 1
# #---训练配置---#
# n_epochs: 150
# batch_size: 16
# learning_rate: 0.001
# nc: 4
# #data_train: train_val # train: 只用train训练val做验证 infer做测试train_val: 用train和val做训练infer做验证 infer做测试all: 全部训练全部验证全部测试数据先1/5作为infer剩下的再1/5作为val剩下的4/5作为训练
# data_train: train_val
# early_stop_patience: 50
# gamma: 0.98
# step_size: 10
# experiments_count: 1
# MLP good train param 2
# #---训练配置---#
# n_epochs: 300
# batch_size: 8
# learning_rate: 0.0005
# nc: 4
# #data_train: train_val # train: 只用train训练val做验证 infer做测试train_val: 用train和val做训练infer做验证 infer做测试all: 全部训练全部验证全部测试数据先1/5作为infer剩下的再1/5作为val剩下的4/5作为训练
# data_train: train_val
# early_stop_patience: 50
# gamma: 0.98
# step_size: 10
# experiments_count: 1
# Transformer good train param 1
# #---训练配置---#
# n_epochs: 150
# batch_size: 64
# learning_rate: 0.001
# nc: 4
# #data_train: train_val # train: 只用train训练val做验证 infer做测试train_val: 用train和val做训练infer做验证 infer做测试all: 全部训练全部验证全部测试数据先1/5作为infer剩下的再1/5作为val剩下的4/5作为训练
# data_train: train_val
# early_stop_patience: 50
# gamma: 0.98
# step_size: 10
# experiments_count: 1
# Transformer good train param 2
# #---训练配置---#
# n_epochs: 300
# batch_size: 8
# learning_rate: 0.0005
# nc: 4
# #data_train: train_val # train: 只用train训练val做验证 infer做测试train_val: 用train和val做训练infer做验证 infer做测试all: 全部训练全部验证全部测试数据先1/5作为infer剩下的再1/5作为val剩下的4/5作为训练
# data_train: train_val
# early_stop_patience: 50
# gamma: 0.98
# step_size: 10
# experiments_count: 1

@ -1,31 +1,34 @@
import os import os
import sys
root_path = os.getcwd()
sys.path.append(root_path)
import time import time
import datetime import datetime
import signal import logging
import uvicorn import uvicorn
import pandas as pd import yaml
import numpy as np
from fastapi import FastAPI, Request from fastapi import FastAPI, Request
from pydantic import BaseModel from pydantic import BaseModel
from typing import List from typing import List
from utils.common import train_detect, evaluate_model, inference_model import atexit
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
import logging
import matplotlib.pyplot as plt
import argparse
import numpy as np
import yaml
import threading
import pickle
from fastapi.responses import FileResponse
from fastapi.staticfiles import StaticFiles from fastapi.staticfiles import StaticFiles
from utils.feature_process import create_feature_df, apply_feature_weights, Features from utils.feature_process import create_feature_df, apply_feature_weights, Features
from utils.common import MLModel
app = FastAPI() app = FastAPI()
# 控制是否打印的宏定义
PRINT_LOG = True
def log_print(message):
logging.info(message)
if PRINT_LOG:
print(message)
# 保证日志写到文件
def flush_log():
for handler in logging.getLogger().handlers:
handler.flush()
# 定义fastapi返回类 inference # 定义fastapi返回类 inference
class PredictionResult(BaseModel): class PredictionResult(BaseModel):
predictions: list predictions: list
@ -46,14 +49,16 @@ app.add_middleware(
allow_headers=["*"], allow_headers=["*"],
) )
# 定义接口 # 初始化配置文件
config_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "config/config.yaml"))
# 定义训练接口
@app.post("/train/") @app.post("/train/")
async def classify_features(request: Request, features_list: List[Features]): async def train_model(request: Request, features_list: List[Features]):
# 遍历每个特征对象,并将其添加到 all_features 中 # 遍历每个特征对象,并将其添加到 all_features 中
all_features = create_feature_df(features_list) all_features = create_feature_df(features_list)
# 读取 YAML 配置文件 # 读取 YAML 配置文件
config_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "config/config.yaml"))
with open(config_path, 'r') as f: with open(config_path, 'r') as f:
config = yaml.load(f, Loader=yaml.FullLoader) config = yaml.load(f, Loader=yaml.FullLoader)
feature_names = config['feature_names'] feature_names = config['feature_names']
@ -70,54 +75,53 @@ async def classify_features(request: Request, features_list: List[Features]):
# 训练前设置 # 训练前设置
now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
data_path = os.path.abspath(os.path.join(os.path.dirname(__file__), static_dir, f"train_feature_label_weighted_{now}.xlsx")) data_path = os.path.abspath(os.path.join(static_dir, f"train_feature_label_weighted_{now}.xlsx"))
config['data_path'] = data_path config['data_path'] = data_path
feature_label_weighted.to_excel(data_path, index=False) feature_label_weighted.to_excel(data_path, index=False)
# 添加模型保存路径 # 添加模型保存路径
model_path = os.path.abspath(os.path.join(os.path.dirname(__file__), static_dir, f"train_model_{now}.pth")) model_path = os.path.abspath(os.path.join(static_dir, f"train_model_{now}.pth"))
config['model_path'] = model_path config['model_path'] = model_path
# 配置日志 # 配置日志
log_path = os.path.abspath(os.path.join(os.path.dirname(__file__), static_dir, f"train_log_{now}.log")) log_path = os.path.abspath(os.path.join(static_dir, f"train_log_{now}.log"))
logging.basicConfig(filename=log_path, level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') logging.basicConfig(filename=log_path, level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
# 配置训练和验证结果图片路径 # 配置训练和验证结果图片路径
train_process_path = os.path.abspath(os.path.join(os.path.dirname(__file__), static_dir, f"train_progress_img_{now}.png")) train_process_path = os.path.abspath(os.path.join(static_dir, f"train_progress_img_{now}.png"))
config['train_process_path'] = train_process_path config['train_process_path'] = train_process_path
evaluate_result_path = os.path.abspath(os.path.join(os.path.dirname(__file__), static_dir, f"evaluate_result_img_{now}.png")) evaluate_result_path = os.path.abspath(os.path.join(static_dir, f"evaluate_result_img_{now}.png"))
config['evaluate_result_path'] = evaluate_result_path config['evaluate_result_path'] = evaluate_result_path
print("config: ", config) log_print("config: " + str(config))
logging.info("config: ", config)
# 开始训练 # 开始训练
# 初始化 MLModel 实例
ml_model = MLModel(config)
list_avg_f1 = [] list_avg_f1 = []
list_wrong_percentage = [] list_wrong_percentage = []
list_precision = [] list_precision = []
list_recall = [] list_recall = []
list_f1 = [] list_f1 = []
train_times = 1 if config['data_train']==r'all' else config["experiments_count"] train_times = 1 if config['data_train'] == 'all' else config["experiments_count"]
for i in range(train_times): for _ in range(train_times):
avg_f1, wrong_percentage, precision, recall, f1 = train_detect(config) avg_f1, wrong_percentage, precision, recall, f1 = ml_model.train_detect()
list_avg_f1.append(avg_f1) list_avg_f1.append(avg_f1)
list_wrong_percentage.append(wrong_percentage) list_wrong_percentage.append(wrong_percentage)
list_precision.append(precision) list_precision.append(precision)
list_recall.append(recall) list_recall.append(recall)
list_f1.append(f1) list_f1.append(f1)
logging.info(f"Result: Avg F1: {sum(list_avg_f1) / len(list_avg_f1):.4f} Avg Wrong Percentage: {sum(list_wrong_percentage) / len(list_wrong_percentage):.2f}%") log_print(f"Result: Avg F1: {sum(list_avg_f1) / len(list_avg_f1):.4f} Avg Wrong Percentage: {sum(list_wrong_percentage) / len(list_wrong_percentage):.2f}%")
logging.info(f"Result: Avg Precision: {[sum(p[i] for p in list_precision) / len(list_precision) for i in range(len(list_precision[0]))]} | {np.mean(list_precision)}") log_print(f"Result: Avg Precision: {[sum(p[i] for p in list_precision) / len(list_precision) for i in range(len(list_precision[0]))]} | {np.mean(list_precision)}")
logging.info(f"Result: Avg Recall: {[sum(r[i] for r in list_recall) / len(list_recall) for i in range(len(list_recall[0]))]} | {np.mean(list_recall)}") log_print(f"Result: Avg Recall: {[sum(r[i] for r in list_recall) / len(list_recall) for i in range(len(list_recall[0]))]} | {np.mean(list_recall)}")
logging.info(f"Result: Avg F1: {[sum(f1[i] for f1 in list_f1) / len(list_f1) for i in range(len(list_f1[0]))]} | {np.mean(list_f1)}") log_print(f"Result: Avg F1: {[sum(f1[i] for f1 in list_f1) / len(list_f1) for i in range(len(list_f1[0]))]} | {np.mean(list_f1)}")
print(f"Result: Avg F1: {sum(list_avg_f1) / len(list_avg_f1):.4f} Avg Wrong Percentage: {sum(list_wrong_percentage) / len(list_wrong_percentage):.2f}%")
print(f"Result: Avg Precision: {[sum(p[i] for p in list_precision) / len(list_precision) for i in range(len(list_precision[0]))]} | {np.mean(list_precision)}")
print(f"Result: Avg Recall: {[sum(r[i] for r in list_recall) / len(list_recall) for i in range(len(list_recall[0]))]} | {np.mean(list_recall)}")
print(f"Result: Avg F1: {[sum(f1[i] for f1 in list_f1) / len(list_f1) for i in range(len(list_f1[0]))]} | {np.mean(list_f1)}")
end_time = time.time() # 记录结束时间 end_time = time.time() # 记录结束时间
# 训练结束 log_print("预测耗时: " + str(end_time - start_time) + "") # 打印执行时间
print("预测耗时:", end_time - start_time, "") # 打印执行时间
# 保证日志写到文件
atexit.register(flush_log)
# 返回分类结果和模型文件下载 URLstatic不是程序执行路径而是app.mount的静态文件夹 # 返回分类结果和模型文件下载 URLstatic不是程序执行路径而是app.mount的静态文件夹
model_file_url = f"{request.base_url}train_api/train_model_{now}.pth" model_file_url = f"{request.base_url}train_api/train_model_{now}.pth"
@ -139,14 +143,13 @@ async def classify_features(request: Request, features_list: List[Features]):
} }
} }
# 定义接口 # 定义验证接口
@app.post("/evaluate/") @app.post("/evaluate/")
async def classify_features(request: Request, features_list: List[Features]): async def evaluate_model(request: Request, features_list: List[Features]):
# 遍历每个特征对象,并将其添加到 all_features 中 # 遍历每个特征对象,并将其添加到 all_features 中
all_features = create_feature_df(features_list) all_features = create_feature_df(features_list)
# 读取 YAML 配置文件 # 读取 YAML 配置文件
config_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "config/config.yaml"))
with open(config_path, 'r') as f: with open(config_path, 'r') as f:
config = yaml.load(f, Loader=yaml.FullLoader) config = yaml.load(f, Loader=yaml.FullLoader)
feature_names = config['feature_names'] feature_names = config['feature_names']
@ -163,50 +166,35 @@ async def classify_features(request: Request, features_list: List[Features]):
# 训练前设置 # 训练前设置
now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
data_path = os.path.abspath(os.path.join(os.path.dirname(__file__), static_dir, f"evaluate_feature_label_weighted_{now}.xlsx")) data_path = os.path.abspath(os.path.join(static_dir, f"evaluate_feature_label_weighted_{now}.xlsx"))
config['data_path'] = data_path config['data_path'] = data_path
feature_label_weighted.to_excel(data_path, index=False) feature_label_weighted.to_excel(data_path, index=False)
# 配置验证结果图片路径 # 配置验证结果图片路径
evaluate_result_path = os.path.abspath(os.path.join(os.path.dirname(__file__), static_dir, f"evaluate_result_img_{now}.png")) evaluate_result_path = os.path.abspath(os.path.join(static_dir, f"evaluate_result_img_{now}.png"))
config['evaluate_result_path'] = evaluate_result_path config['evaluate_result_path'] = evaluate_result_path
# 配置日志 # 配置日志
log_path = os.path.abspath(os.path.join(os.path.dirname(__file__), static_dir, f"evaluate_log_{now}.log")) log_path = os.path.abspath(os.path.join(static_dir, f"evaluate_log_{now}.log"))
logging.basicConfig(filename=log_path, level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') logging.basicConfig(filename=log_path, level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
# 开始验证
list_avg_f1 = []
list_wrong_percentage = []
list_precision = []
list_recall = []
list_f1 = []
# 特征和标签 # 特征和标签
X = feature_label_weighted[config['feature_names']].values X = feature_label_weighted[config['feature_names']].values
y = feature_label_weighted[config['label_name']].values y = feature_label_weighted[config['label_name']].values
print("config: ", config) # 初始化 MLModel 实例
logging.info("config: ", config) ml_model = MLModel(config)
avg_f1, wrong_percentage, precision, recall, f1 = evaluate_model(config["model_path"], X, y, config)
list_avg_f1.append(avg_f1) # 加载模型
list_wrong_percentage.append(wrong_percentage) ml_model.load_model()
list_precision.append(precision)
list_recall.append(recall)
list_f1.append(f1)
logging.info(f"Result: Avg F1: {sum(list_avg_f1) / len(list_avg_f1):.4f} Avg Wrong Percentage: {sum(list_wrong_percentage) / len(list_wrong_percentage):.2f}%") avg_f1, wrong_percentage, precision, recall, f1 = ml_model.evaluate_model(X, y)
logging.info(f"Result: Avg Precision: {[sum(p[i] for p in list_precision) / len(list_precision) for i in range(len(list_precision[0]))]} | {np.mean(list_precision)}")
logging.info(f"Result: Avg Recall: {[sum(r[i] for r in list_recall) / len(list_recall) for i in range(len(list_recall[0]))]} | {np.mean(list_recall)}")
logging.info(f"Result: Avg F1: {[sum(f1[i] for f1 in list_f1) / len(list_f1) for i in range(len(list_f1[0]))]} | {np.mean(list_f1)}")
print(f"Result: Avg F1: {sum(list_avg_f1) / len(list_avg_f1):.4f} Avg Wrong Percentage: {sum(list_wrong_percentage) / len(list_wrong_percentage):.2f}%")
print(f"Result: Avg Precision: {[sum(p[i] for p in list_precision) / len(list_precision) for i in range(len(list_precision[0]))]} | {np.mean(list_precision)}")
print(f"Result: Avg Recall: {[sum(r[i] for r in list_recall) / len(list_recall) for i in range(len(list_recall[0]))]} | {np.mean(list_recall)}")
print(f"Result: Avg F1: {[sum(f1[i] for f1 in list_f1) / len(list_f1) for i in range(len(list_f1[0]))]} | {np.mean(list_f1)}")
end_time = time.time() # 记录结束时间 end_time = time.time() # 记录结束时间
# 训练结束 log_print("预测耗时: " + str(end_time - start_time) + "") # 打印执行时间
print("预测耗时:", end_time - start_time, "") # 打印执行时间
# 保证日志写到文件
atexit.register(flush_log)
# 返回分类结果和模型文件下载 URLstatic不是程序执行路径而是app.mount的静态文件夹 # 返回分类结果和模型文件下载 URLstatic不是程序执行路径而是app.mount的静态文件夹
log_file_url = f"{request.base_url}evaluate_api/evaluate_log_{now}.log" log_file_url = f"{request.base_url}evaluate_api/evaluate_log_{now}.log"
@ -226,14 +214,13 @@ async def classify_features(request: Request, features_list: List[Features]):
} }
} }
# 定义接口 # 定义推理接口
@app.post("/inference/") @app.post("/inference/")
async def classify_features(request: Request, features_list: List[Features]): async def inference_model(request: Request, features_list: List[Features]):
# 遍历每个特征对象,并将其添加到 all_features 中 # 遍历每个特征对象,并将其添加到 all_features 中
all_features = create_feature_df(features_list) all_features = create_feature_df(features_list)
# 读取 YAML 配置文件 # 读取 YAML 配置文件
config_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "config/config.yaml"))
with open(config_path, 'r') as f: with open(config_path, 'r') as f:
config = yaml.load(f, Loader=yaml.FullLoader) config = yaml.load(f, Loader=yaml.FullLoader)
feature_names = config['feature_names'] feature_names = config['feature_names']
@ -250,23 +237,32 @@ async def classify_features(request: Request, features_list: List[Features]):
# 训练前设置 # 训练前设置
now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
data_path = os.path.abspath(os.path.join(os.path.dirname(__file__), static_dir, f"inference_feature_label_weighted_{now}.xlsx")) data_path = os.path.abspath(os.path.join(static_dir, f"inference_feature_label_weighted_{now}.xlsx"))
config['data_path'] = data_path config['data_path'] = data_path
feature_label_weighted.to_excel(data_path, index=False) feature_label_weighted.to_excel(data_path, index=False)
# 配置日志 # 配置日志
log_path = os.path.abspath(os.path.join(os.path.dirname(__file__), static_dir, f"inference_log_{now}.log")) log_path = os.path.abspath(os.path.join(static_dir, f"inference_log_{now}.log"))
logging.basicConfig(filename=log_path, level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') logging.basicConfig(filename=log_path, level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
# 特征和标签 # 特征和标签
X = feature_label_weighted[config['feature_names']].values X = feature_label_weighted[config['feature_names']].values
y = feature_label_weighted[config['label_name']].values
predictions = inference_model(config["model_path"], X, y, config) # 初始化 MLModel 实例
ml_model = MLModel(config)
# 加载模型
ml_model.load_model()
predictions = ml_model.inference_model(X)
end_time = time.time() # 记录结束时间 end_time = time.time() # 记录结束时间
print("预测耗时:", end_time - start_time, "") # 打印执行时间 log_print("预测耗时: " + str(end_time - start_time) + "") # 打印执行时间
log_print("预测结果: " + str(predictions))
print("预测结果:", predictions) # 保证日志写到文件
atexit.register(flush_log)
# 返回预测结果 # 返回预测结果
return PredictionResult(predictions=predictions) return PredictionResult(predictions=predictions)
@ -299,7 +295,5 @@ if __name__ == "__main__":
# 同级目录下的static文件夹 # 同级目录下的static文件夹
app.mount("/train_api", StaticFiles(directory=static_dir_train), name="static_dir_train") app.mount("/train_api", StaticFiles(directory=static_dir_train), name="static_dir_train")
app.mount("/evaluate_api", StaticFiles(directory=static_dir_evaluate), name="static_dir_evaluate") app.mount("/evaluate_api", StaticFiles(directory=static_dir_evaluate), name="static_dir_evaluate")
app.mount("/inference_api", StaticFiles(directory=static_dir_evaluate), name="static_dir_inference") app.mount("/inference_api", StaticFiles(directory=static_dir_inference), name="static_dir_inference")
uvicorn.run(app, host="0.0.0.0", port=3397, reload=False) uvicorn.run(app, host="0.0.0.0", port=3397, reload=False)
## train evl 功能OK了 差infer就可以了还有就是做一个模型上传机制目前为止最好的模型就是model下面那个

@ -10,33 +10,36 @@ from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.utils.class_weight import compute_class_weight from sklearn.utils.class_weight import compute_class_weight
import logging import logging
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import argparse
# 控制是否打印的宏定义
PRINT_LOG = True
class MLP(nn.Module): def log_print(message):
def __init__(self, config): logging.info(message)
super(MLP, self).__init__() if PRINT_LOG:
self.model = nn.Sequential( print(message)
nn.Linear(len(config['feature_names']), 32),
nn.ReLU(),
nn.Linear(32, 128),
nn.ReLU(),
nn.Linear(128, 32),
nn.ReLU(),
nn.Linear(32, config['nc']),
)
def forward(self, x): class MLModel:
return self.model(x) def __init__(self, model_config):
self.config = model_config
self.model = None
def create_model(self):
self.model = MLP(self.config).to(self.config['device'])
# self.model = TransformerModel(self.config).to(self.config['device'])
def load_model(self):
self.create_model()
self.model.load_state_dict(torch.load(self.config['model_path'], map_location=self.config['device']))
def load_and_split_data(config): def load_and_split_data(self):
parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
file_path = os.path.join(parent_dir, config['data_path']) file_path = os.path.join(parent_dir, self.config['data_path'])
data = pd.read_excel(file_path) data = pd.read_excel(file_path)
X = data[config['feature_names']].values X = data[self.config['feature_names']].values
y = data[config['label_name']].values y = data[self.config['label_name']].values
skf_outer = StratifiedKFold(n_splits=5, shuffle=True) skf_outer = StratifiedKFold(n_splits=5, shuffle=True)
train_index_outer, test_index_outer = next(skf_outer.split(X, y)) train_index_outer, test_index_outer = next(skf_outer.split(X, y))
@ -50,62 +53,42 @@ def load_and_split_data(config):
return X, y, X_train_val, y_train_val, X_train, y_train, X_val, y_val, X_infer, y_infer return X, y, X_train_val, y_train_val, X_train, y_train, X_val, y_val, X_infer, y_infer
def save_model(model_path, best_model): def save_model(self, model_path):
torch.save(best_model, model_path) torch.save(self.model.state_dict(), model_path)
def evaluate_model(model_path, X_infer, y_infer, config):
# 如果传入的是模型文件路径,则从该路径加载模型
if isinstance(model_path, str):
model = MLP(config).to(config['device'])
model.load_state_dict(torch.load(model_path, map_location=config['device'])) # 加载训练好的模型参数
else:
model = model_path
# infer_data = pd.DataFrame(X_infer, columns=config['feature_names'])
# infer_data[config['label_name']] = y_infer
# infer_data.to_excel(os.path.join(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")), config['infer_path']), index=False)
model.eval()
def evaluate_model(self, X_infer, y_infer):
self.model.eval()
with torch.no_grad(): with torch.no_grad():
outputs = model(torch.from_numpy(X_infer).float().to(config['device'])) outputs = self.model(torch.from_numpy(X_infer).float().to(self.config['device']))
_, predictions = torch.max(outputs, 1) _, predictions = torch.max(outputs, 1)
precision = precision_score(y_infer, predictions.cpu().numpy(), average=None)
wrong_indices = np.where(y_infer != predictions.cpu().numpy())[0] recall = recall_score(y_infer, predictions.cpu().numpy(), average=None)
f1 = f1_score(y_infer, predictions.cpu().numpy(), average=None)
wrong_count = len(wrong_indices) wrong_count = len(np.where(y_infer != predictions.cpu().numpy())[0])
total_count = len(y_infer) total_count = len(y_infer)
wrong_percentage = (wrong_count / total_count) * 100 wrong_percentage = (wrong_count / total_count) * 100
print("Infer Result: ") log_print("Evaluate Result: ")
logging.info("Infer Result: ")
print("预测错误数量:", wrong_count)
print("预测错误占总数量的百分比:", wrong_percentage, "%")
print("总数量:", total_count)
logging.info(f"Prediction errors: {wrong_count}") log_print(f"Prediction errors: {wrong_count}")
logging.info(f"Prediction error percentage: {wrong_percentage:.2f}%") log_print(f"Prediction error percentage: {wrong_percentage:.2f}%")
logging.info(f"Total samples: {total_count}") log_print(f"Total samples: {total_count}")
precision = precision_score(y_infer, predictions.cpu().numpy(), average=None)
recall = recall_score(y_infer, predictions.cpu().numpy(), average=None)
f1 = f1_score(y_infer, predictions.cpu().numpy(), average=None)
avg_precision = np.mean(precision) avg_precision = np.mean(precision)
avg_recall = np.mean(recall) avg_recall = np.mean(recall)
avg_f1 = np.mean(f1) avg_f1 = np.mean(f1)
for i in range(len(precision)): for i in range(len(precision)):
print(f"Class {i} Precision: {precision[i]:.4f}, Recall: {recall[i]:.4f}, F1: {f1[i]:.4f}") log_print(f"Class {i} Precision: {precision[i]:.4f}, Recall: {recall[i]:.4f}, F1: {f1[i]:.4f}")
print("精确率:", precision) log_print("精确率:" + str(precision))
print("召回率:", recall) log_print("召回率:" + str(recall))
print("F1得分:", f1) log_print("F1得分:" + str(f1))
print("平均精确率:", avg_precision) log_print("平均精确率:" + str(avg_precision))
print("平均召回率:", avg_recall) log_print("平均召回率:" + str(avg_recall))
print("平均F1得分:", avg_f1) log_print("平均F1得分:" + str(avg_f1))
print("Infer Result End: ") log_print("Evaluate Result End: ")
logging.info("Infer Result End: ")
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5)) fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))
ax1.bar(np.arange(len(precision)), precision) ax1.bar(np.arange(len(precision)), precision)
@ -116,120 +99,40 @@ def evaluate_model(model_path, X_infer, y_infer, config):
ax3.set_title('F1 Score') ax3.set_title('F1 Score')
# 保存图片 # 保存图片
parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
evaluate_result_path = os.path.join(parent_dir, config['evaluate_result_path']) evaluate_result_path = os.path.join(parent_dir, self.config['evaluate_result_path'])
plt.savefig(evaluate_result_path) plt.savefig(evaluate_result_path)
return avg_f1, wrong_percentage, precision, recall, f1 return np.mean(f1), wrong_percentage, precision, recall, f1
def inference_model(model_path, X_infer, y_infer, config):
# 如果传入的是模型文件路径,则从该路径加载模型
if isinstance(model_path, str):
model = MLP(config).to(config['device'])
model.load_state_dict(torch.load(model_path, map_location=config['device'])) # 加载训练好的模型参数
else:
model = model_path
# infer_data = pd.DataFrame(X_infer, columns=config['feature_names'])
# infer_data[config['label_name']] = y_infer
# infer_data.to_excel(os.path.join(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")), config['infer_path']), index=False)
model.eval()
# 推理 def inference_model(self, X_infer):
self.model.eval()
with torch.no_grad(): with torch.no_grad():
outputs = model(torch.from_numpy(X_infer).float().to(config['device'])) outputs = self.model(torch.from_numpy(X_infer).float().to(self.config['device']))
# 获取预测结果
_, predictions = torch.max(outputs, 1) _, predictions = torch.max(outputs, 1)
# 实际类别从1开始程序类别从0开始
predictions += 1
# 打印预测结果
# print("预测结果:", predictions.cpu().numpy())
# 返回预测结果
return predictions.cpu().numpy().tolist() return predictions.cpu().numpy().tolist()
def train_detect(config): def train_model(self, train_loader, val_loader, criterion, optimizer, scheduler):
X, y, X_train_val, y_train_val, X_train, y_train, X_val, y_val, X_infer, y_infer = load_and_split_data(config) n_epochs = self.config['n_epochs']
if config['data_train'] == r'train_val':
train_dataset = TensorDataset(torch.from_numpy(X_train_val).float().to(config['device']), torch.from_numpy(y_train_val).long().to(config['device']))
val_dataset = TensorDataset(torch.from_numpy(X_infer).float().to(config['device']), torch.from_numpy(y_infer).long().to(config['device']))
class_weights = torch.tensor(compute_class_weight('balanced', classes=np.unique(y_train_val), y=y_train_val), dtype=torch.float32).to(config['device'])
logging.info(f"Class weights: {class_weights}")
elif config['data_train'] == r'train':
train_dataset = TensorDataset(torch.from_numpy(X_train).float().to(config['device']), torch.from_numpy(y_train).long().to(config['device']))
val_dataset = TensorDataset(torch.from_numpy(X_val).float().to(config['device']), torch.from_numpy(y_val).long().to(config['device']))
class_weights = torch.tensor(compute_class_weight('balanced', classes=np.unique(y_train), y=y_train), dtype=torch.float32).to(config['device'])
logging.info(f"Class weights: {class_weights}")
elif config['data_train'] == r'all':
train_dataset = TensorDataset(torch.from_numpy(X).float().to(config['device']), torch.from_numpy(y).long().to(config['device']))
val_dataset = TensorDataset(torch.from_numpy(X).float().to(config['device']), torch.from_numpy(y).long().to(config['device']))
X_infer = X
y_infer = y
class_weights = torch.tensor(compute_class_weight('balanced', classes=np.unique(y), y=y), dtype=torch.float32).to(config['device'])
logging.info(f"Class weights: {class_weights}")
else:
print("Error: Set data_train first in yaml!")
logging.error("Error: Set data_train first in yaml!")
train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=config['batch_size'])
model = MLP(config).to(config['device'])
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate'])
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, config['step_size'], config['gamma'])
best_val_f1, best_val_recall, best_val_precision, best_epoch, best_model = train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, config)
# 保存模型
parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
model_path = os.path.join(parent_dir, config['model_path'])
save_model(model_path, best_model)
logging.info(f"Best Validation F1 Score (Macro): {best_val_f1:.4f}")
logging.info(f"Best Validation Recall (Macro): {best_val_recall:.4f}")
logging.info(f"Best Validation Precision (Macro): {best_val_precision:.4f}")
logging.info(f"Best Epoch: {best_epoch + 1}")
print(f"Best Validation F1 Score (Macro): {best_val_f1:.4f}")
print(f"Best Validation Recall (Macro): {best_val_recall:.4f}")
print(f"Best Validation Precision (Macro): {best_val_precision:.4f}")
print(f"Best Epoch: {best_epoch + 1}")
avg_f1, wrong_percentage, precision, recall, f1 = evaluate_model(model, X_infer, y_infer, config)
return avg_f1, wrong_percentage, precision, recall, f1
def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, config):
n_epochs = config['n_epochs']
best_val_f1 = 0.0 best_val_f1 = 0.0
best_val_recall = 0.0 best_val_recall = 0.0
best_val_precision = 0.0 best_val_precision = 0.0
best_epoch = -1 best_epoch = -1
best_model = None best_model = None
patience = config['early_stop_patience'] patience = self.config['early_stop_patience']
trigger_times = 0 trigger_times = 0
train_loss_history = [] train_loss_history, train_acc_history, val_loss_history, val_acc_history, val_f1_history, val_precision_history, val_recall_history = [[] for _ in range(7)]
train_acc_history = []
val_loss_history = []
val_acc_history = []
val_f1_history = []
val_precision_history = []
val_recall_history = []
plt.rcParams['figure.max_open_warning'] = 50 plt.rcParams['figure.max_open_warning'] = 50
for epoch in range(n_epochs): for epoch in range(n_epochs):
# 训练阶段 # Training phase
model.train() self.model.train()
train_loss, train_acc = 0, 0 train_loss, train_acc = 0, 0
for inputs, targets in train_loader: for inputs, targets in train_loader:
optimizer.zero_grad() optimizer.zero_grad()
outputs = model(inputs) outputs = self.model(inputs)
loss = criterion(outputs, targets) loss = criterion(outputs, targets)
loss.backward() loss.backward()
optimizer.step() optimizer.step()
@ -243,12 +146,12 @@ def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler
# 更新学习率 # 更新学习率
scheduler.step() scheduler.step()
# 验证阶段 # Validation phase
model.eval()
val_loss, val_acc, all_preds, all_targets = 0, 0, [], [] val_loss, val_acc, all_preds, all_targets = 0, 0, [], []
self.model.eval()
with torch.no_grad(): with torch.no_grad():
for inputs, targets in val_loader: for inputs, targets in val_loader:
outputs = model(inputs) outputs = self.model(inputs)
loss = criterion(outputs, targets) loss = criterion(outputs, targets)
val_loss += loss.item() * inputs.size(0) val_loss += loss.item() * inputs.size(0)
_, preds = torch.max(outputs, 1) _, preds = torch.max(outputs, 1)
@ -263,8 +166,7 @@ def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler
class_recalls_m = recall_score(all_targets, all_preds, average='macro') class_recalls_m = recall_score(all_targets, all_preds, average='macro')
class_f1_scores_m = f1_score(all_targets, all_preds, average='macro') class_f1_scores_m = f1_score(all_targets, all_preds, average='macro')
logging.info(f'Epoch {epoch+1:0{3}d} | Train Loss: {train_loss:.4f} | Train Accuracy: {train_acc:.4f} | Validation Loss: {val_loss:.4f} | Validation Accuracy: {val_acc:.4f} | Validation Mean Precision: {class_precisions_m:.4f} | Validation Mean Recall: {class_recalls_m:.4f} | Validation Mean F1_score: {class_f1_scores_m:.4f}') log_print(f'Epoch {epoch+1:0{3}d} | Train Loss: {train_loss:.4f} | Train Accuracy: {train_acc:.4f} | Validation Loss: {val_loss:.4f} | Validation Accuracy: {val_acc:.4f} | Validation Mean Precision: {class_precisions_m:.4f} | Validation Mean Recall: {class_recalls_m:.4f} | Validation Mean F1_score: {class_f1_scores_m:.4f}')
print(f'Epoch {epoch+1:0{3}d} | Train Loss: {train_loss:.4f} | Train Accuracy: {train_acc:.4f} | Validation Loss: {val_loss:.4f} | Validation Accuracy: {val_acc:.4f} | Validation Mean Precision: {class_precisions_m:.4f} | Validation Mean Recall: {class_recalls_m:.4f} | Validation Mean F1_score: {class_f1_scores_m:.4f}')
train_loss_history.append(train_loss) train_loss_history.append(train_loss)
train_acc_history.append(train_acc) train_acc_history.append(train_acc)
@ -292,7 +194,7 @@ def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler
ax3.legend() ax3.legend()
# 保存图片 # 保存图片
parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
train_process_path = os.path.join(parent_dir, config['train_process_path']) train_process_path = os.path.join(parent_dir, self.config['train_process_path'])
plt.savefig(train_process_path) plt.savefig(train_process_path)
if class_f1_scores_m > best_val_f1: if class_f1_scores_m > best_val_f1:
@ -300,49 +202,112 @@ def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler
best_val_recall = class_recalls_m best_val_recall = class_recalls_m
best_val_precision = class_precisions_m best_val_precision = class_precisions_m
best_epoch = epoch best_epoch = epoch
best_model = model.state_dict() best_model = self.model.state_dict()
trigger_times = 0 trigger_times = 0
else: else:
trigger_times += 1 trigger_times += 1
if trigger_times >= patience: if trigger_times >= patience:
logging.info(f'Early stopping at epoch {epoch} | Best epoch : {best_epoch + 1}') log_print(f'Early stopping at epoch {epoch} | Best epoch : {best_epoch + 1}')
print(f'Early stopping at epoch {epoch} | Best epoch : {best_epoch + 1}')
break break
return best_val_f1, best_val_recall, best_val_precision, best_epoch, best_model return best_val_f1, best_val_recall, best_val_precision, best_epoch, best_model
if __name__ == "__main__": def train_detect(self):
parser = argparse.ArgumentParser() X, y, X_train_val, y_train_val, X_train, y_train, X_val, y_val, X_infer, y_infer = self.load_and_split_data()
parser.add_argument('--config', type=str, default='config.yaml', help='Path to the configuration file')
args = parser.parse_args() if self.config['data_train'] == 'train_val':
with open(args.config, 'r') as f: train_dataset = TensorDataset(torch.from_numpy(X_train_val).float().to(self.config['device']), torch.from_numpy(y_train_val).long().to(self.config['device']))
config = yaml.load(f, Loader=yaml.FullLoader) val_dataset = TensorDataset(torch.from_numpy(X_infer).float().to(self.config['device']), torch.from_numpy(y_infer).long().to(self.config['device']))
class_weights = torch.tensor(compute_class_weight('balanced', classes=np.unique(y_train_val), y=y_train_val), dtype=torch.float32).to(self.config['device'])
elif self.config['data_train'] == 'train':
train_dataset = TensorDataset(torch.from_numpy(X_train).float().to(self.config['device']), torch.from_numpy(y_train).long().to(self.config['device']))
val_dataset = TensorDataset(torch.from_numpy(X_val).float().to(self.config['device']), torch.from_numpy(y_val).long().to(self.config['device']))
class_weights = torch.tensor(compute_class_weight('balanced', classes=np.unique(y_train), y=y_train), dtype=torch.float32).to(self.config['device'])
elif self.config['data_train'] == 'all':
train_dataset = TensorDataset(torch.from_numpy(X).float().to(self.config['device']), torch.from_numpy(y).long().to(self.config['device']))
val_dataset = TensorDataset(torch.from_numpy(X).float().to(self.config['device']), torch.from_numpy(y).long().to(self.config['device']))
X_infer = X
y_infer = y
class_weights = torch.tensor(compute_class_weight('balanced', classes=np.unique(y), y=y), dtype=torch.float32).to(self.config['device'])
else:
logging.error("Error: Set data_train first in yaml!")
raise ValueError("Error: Set data_train first in yaml!")
log_print(f"Class weights: {class_weights}")
# 配置日志 train_loader = DataLoader(train_dataset, batch_size=self.config['batch_size'], shuffle=True)
parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) val_loader = DataLoader(val_dataset, batch_size=self.config['batch_size'])
log_path = os.path.join(parent_dir, config['log_path'])
logging.basicConfig(filename=log_path, level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') self.create_model()
list_avg_f1 = [] criterion = nn.CrossEntropyLoss(weight=class_weights)
list_wrong_percentage = [] optimizer = torch.optim.Adam(self.model.parameters(), lr=self.config['learning_rate'])
list_precision = [] scheduler = torch.optim.lr_scheduler.StepLR(optimizer, self.config['step_size'], self.config['gamma'])
list_recall = []
list_f1 = [] best_val_f1, best_val_recall, best_val_precision, best_epoch, best_model = self.train_model(train_loader, val_loader, criterion, optimizer, scheduler)
train_times = 1 if config['data_train']==r'all' else config["experiments_count"]
for i in range(train_times): # Save the best model
avg_f1, wrong_percentage, precision, recall, f1 = train_detect(config) self.save_model(self.config['model_path'])
list_avg_f1.append(avg_f1)
list_wrong_percentage.append(wrong_percentage) log_print(f"Best Validation F1 Score (Macro): {best_val_f1:.4f}")
list_precision.append(precision) log_print(f"Best Validation Recall (Macro): {best_val_recall:.4f}")
list_recall.append(recall) log_print(f"Best Validation Precision (Macro): {best_val_precision:.4f}")
list_f1.append(f1) log_print(f"Best Epoch: {best_epoch + 1}")
logging.info(f"Result: Avg F1: {sum(list_avg_f1) / len(list_avg_f1):.4f} Avg Wrong Percentage: {sum(list_wrong_percentage) / len(list_wrong_percentage):.2f}%") avg_f1, wrong_percentage, precision, recall, f1 = self.evaluate_model(X_infer, y_infer)
logging.info(f"Result: Avg Precision: {[sum(p[i] for p in list_precision) / len(list_precision) for i in range(len(list_precision[0]))]} | {np.mean(list_precision)}")
logging.info(f"Result: Avg Recall: {[sum(r[i] for r in list_recall) / len(list_recall) for i in range(len(list_recall[0]))]} | {np.mean(list_recall)}") return avg_f1, wrong_percentage, precision, recall, f1
logging.info(f"Result: Avg F1: {[sum(f1[i] for f1 in list_f1) / len(list_f1) for i in range(len(list_f1[0]))]} | {np.mean(list_f1)}")
print(f"Result: Avg F1: {sum(list_avg_f1) / len(list_avg_f1):.4f} Avg Wrong Percentage: {sum(list_wrong_percentage) / len(list_wrong_percentage):.2f}%") # class MLP(nn.Module):
print(f"Result: Avg Precision: {[sum(p[i] for p in list_precision) / len(list_precision) for i in range(len(list_precision[0]))]} | {np.mean(list_precision)}") # def __init__(self, config):
print(f"Result: Avg Recall: {[sum(r[i] for r in list_recall) / len(list_recall) for i in range(len(list_recall[0]))]} | {np.mean(list_recall)}") # super(MLP, self).__init__()
print(f"Result: Avg F1: {[sum(f1[i] for f1 in list_f1) / len(list_f1) for i in range(len(list_f1[0]))]} | {np.mean(list_f1)}") # self.model = nn.Sequential(
# nn.Linear(len(config['feature_names']), 32),
# nn.ReLU(),
# nn.Linear(32, 128),
# nn.ReLU(),
# nn.Linear(128, 32),
# nn.ReLU(),
# nn.Linear(32, config['nc']),
# )
# def forward(self, x):
# return self.model(x)
# 20260605
class MLP(nn.Module):
    """Multilayer perceptron assembled from the ``config['mlp']`` section.

    The hidden stack is driven by ``config['mlp']['layers']``: each entry
    supplies an ``output_dim`` and, optionally, ``activation``. Only the
    string ``'relu'`` inserts an activation (``nn.ReLU``); any other value,
    or no value, inserts nothing after that linear layer. A final
    ``nn.Linear`` maps the last hidden width onto
    ``config['mlp']['output_dim']``.
    """

    def __init__(self, config):
        super(MLP, self).__init__()
        mlp_cfg = config['mlp']
        modules = []
        prev_dim = mlp_cfg['input_dim']
        for spec in mlp_cfg['layers']:
            width = spec['output_dim']
            modules.append(nn.Linear(prev_dim, width))
            # Only 'relu' is recognized; unknown activation names are ignored.
            if spec.get('activation') == 'relu':
                modules.append(nn.ReLU())
            prev_dim = width
        # Output head: last hidden width -> configured output dimension.
        modules.append(nn.Linear(prev_dim, mlp_cfg['output_dim']))
        self.model = nn.Sequential(*modules)

    def forward(self, x):
        """Run the input batch through the sequential stack."""
        return self.model(x)
class TransformerModel(nn.Module):
    """Transformer classifier over flat per-sample feature vectors.

    Each input row is embedded to ``d_model`` and treated as a length-1
    sequence, so the transformer operates on every sample independently.

    Expected config keys (under ``config['transformer']``): ``input_dim``,
    ``d_model``, ``nhead``, ``num_encoder_layers``, ``num_decoder_layers``,
    ``dim_feedforward``, ``dropout``, ``output_dim``.
    """

    def __init__(self, config):
        super(TransformerModel, self).__init__()
        # Project raw features into the transformer's model dimension.
        self.embedding = nn.Linear(config['transformer']['input_dim'], config['transformer']['d_model'])
        self.transformer = nn.Transformer(
            d_model=config['transformer']['d_model'],
            nhead=config['transformer']['nhead'],
            num_encoder_layers=config['transformer']['num_encoder_layers'],
            num_decoder_layers=config['transformer']['num_decoder_layers'],
            dim_feedforward=config['transformer']['dim_feedforward'],
            dropout=config['transformer']['dropout']
        )
        # Classification head: d_model -> number of output classes.
        self.fc = nn.Linear(config['transformer']['d_model'], config['transformer']['output_dim'])

    def forward(self, x):
        # nn.Transformer defaults to batch_first=False, i.e. (seq, batch, d_model).
        # BUG FIX: the previous unsqueeze(1) produced (batch, 1, d_model), which the
        # transformer read as (seq=batch, batch=1, ...), so self-attention mixed
        # unrelated samples across the batch. unsqueeze(0) yields (1, batch, d_model):
        # a length-1 sequence per sample, keeping every sample independent.
        x = self.embedding(x).unsqueeze(0)
        transformer_output = self.transformer(x, x)
        # Drop the length-1 sequence dimension before the classifier head.
        output = self.fc(transformer_output.squeeze(0))
        return output

Loading…
Cancel
Save