import os

import pandas as pd
import requests
from pydantic import BaseModel

# The ten SCL-90 factor columns as named in the student sheet, paired
# positionally with the English feature names written to the output frame.
_SCL90_COLUMNS = [
    "躯体化", "强迫症状", "人际关系敏感", "抑郁", "焦虑",
    "敌对", "恐怖", "偏执", "精神病性", "其他",
]
_SCL90_FEATURES = [
    "somatization", "obsessive_compulsive", "interpersonal_sensitivity",
    "depression", "anxiety", "hostility", "terror", "paranoia",
    "psychoticism", "other",
]

# Keys (in order) of every record produced by convert_to_list().
_RECORD_COLUMNS = _SCL90_FEATURES + [
    "father_parenting_style",
    "mother_parenting_style",
    "self_assessed_family_economic_condition",
    "history_of_psychological_counseling",
    "absenteeism_above_average",
    "academic_warning",
    "label",
]


def _warning_level(scores):
    """Return the warning level (1-4) for each row of SCL-90 factor scores.

    Level 1: any factor >= 4, or at least eight factors >= 3.
    Level 2: any factor >= 3.
    Level 3: any factor >= 2.
    Level 4: everything else.
    """
    at_least_4 = scores.ge(4).sum(axis=1)
    at_least_3 = scores.ge(3).sum(axis=1)
    at_least_2 = scores.ge(2).sum(axis=1)

    # Start from the fallback level and apply the rules from lowest to
    # highest priority, so a later rule overrides an earlier one.
    level = pd.Series(4, index=scores.index, dtype="int64")
    level = level.mask(at_least_2 >= 1, 3)
    level = level.mask(at_least_3 >= 1, 2)
    level = level.mask((at_least_4 >= 1) | (at_least_3 >= 8), 1)
    return level


def preprocess_data(df_src, df_leave, df_dropout_warning):
    """Build the model features and label from the raw tables.

    New columns are added to ``df_src`` in place and the same object is
    returned. ``df_leave`` and ``df_dropout_warning`` are only read.

    Args:
        df_src (pandas.DataFrame): Student information table. Must contain
            ``编号``, the ten SCL-90 factor columns, ``父亲教养方式``,
            ``母亲教养方式``, ``自评家庭经济条件`` and ``心理治疗(咨询)史``.
        df_leave (pandas.DataFrame): Leave records with ``学号`` and
            ``请假次数``. A student may appear on several rows.
        df_dropout_warning (pandas.DataFrame): Academic warning records with
            ``学号`` and ``是否受过退学预警``. A student may appear on
            several rows.

    Returns:
        pandas.DataFrame: ``df_src`` with the intermediate columns
        (``请假次数``, ``是否受过退学预警``, ``类别``), the English feature
        columns and ``label`` (warning level minus one) added.

    Raises:
        KeyError: If a required column is missing from any input table.
    """
    # Aggregate per student first. The grouped result has a unique index, so
    # it can be used as a lookup table even when a student has several rows
    # (Series.map rejects a mapper Series with a non-unique index).
    leave_totals = df_leave.groupby("学号")["请假次数"].sum()
    df_src["请假次数"] = df_src["编号"].map(leave_totals).fillna(0)

    # Average leave count over the students that have leave records.
    leave_mean = leave_totals.mean()

    warning_totals = (
        df_dropout_warning.groupby("学号")["是否受过退学预警"].sum()
    )
    df_src["是否受过退学预警"] = (
        df_src["编号"].map(warning_totals).fillna(0)
    )

    # Warning level, computed column-wise so empty input is handled too.
    df_src["类别"] = _warning_level(df_src[_SCL90_COLUMNS])

    # Expose the factor scores under their English feature names.
    for source_column, feature_name in zip(_SCL90_COLUMNS, _SCL90_FEATURES):
        df_src[feature_name] = df_src[source_column]
    df_src["label"] = df_src["类别"] - 1

    # Encode the categorical survey answers as small integers.
    df_src["father_parenting_style"] = (
        df_src["父亲教养方式"].eq("温暖与理解").astype(int)
    )
    df_src["mother_parenting_style"] = (
        df_src["母亲教养方式"].eq("温暖与理解").astype(int)
    )
    economic = df_src["自评家庭经济条件"]
    df_src["self_assessed_family_economic_condition"] = (
        economic.eq("贫困").astype(int) * 2 + economic.eq("较差").astype(int)
    )
    df_src["history_of_psychological_counseling"] = (
        ~df_src["心理治疗(咨询)史"].isin(["无", "没有"])
    ).astype(int)

    df_src["absenteeism_above_average"] = (
        df_src["请假次数"].ge(leave_mean).astype(int)
    )
    # The value is a per-student sum, so any count >= 1 means the student
    # has been warned (an equality test against 1 would miss repeats).
    df_src["academic_warning"] = (
        df_src["是否受过退学预警"].ge(1).astype(int)
    )

    return df_src


def convert_to_list(df):
    """Convert a preprocessed DataFrame into a list of feature records.

    Args:
        df (pandas.DataFrame): Frame returned by ``preprocess_data``.

    Returns:
        list: One dict per row holding the sixteen feature values and
        ``label``, in row order. Empty input yields an empty list.

    Raises:
        KeyError: If ``df`` has rows but lacks one of the feature columns.
    """
    if len(df) == 0:
        return []
    # Select the columns explicitly so each record has exactly these keys in
    # this order; to_dict keeps each column's own dtype, unlike iterrows().
    return df[_RECORD_COLUMNS].to_dict(orient="records")