"""Build the per-student feature table and save it to ``feature.xlsx``.

Inputs (Excel workbooks):
    * ``data/data_src.xlsx``                      - one row per student.
    * ``data_processed/Leave_Record_RES.xlsx``    - leave records keyed by 学号.
    * ``data_processed/Dropout_Warning_RES.xlsx`` - dropout warnings keyed by 学号.

Run as a script to execute the whole pipeline; importing the module only
defines the helpers and does not touch any file.
"""
import pandas as pd

SRC_PATH = "data/data_src.xlsx"
LEAVE_PATH = "data_processed/Leave_Record_RES.xlsx"
DROPOUT_WARNING_PATH = "data_processed/Dropout_Warning_RES.xlsx"
OUTPUT_PATH = "feature.xlsx"

# The ten symptom factor columns; also used for the multi-factor ratio.
SYMPTOM_COLUMNS = ['躯体化', '强迫症状', '人际关系敏感', '抑郁', '焦虑', '敌对', '恐怖', '偏执', '精神病性', '其他']

# Columns copied verbatim from the source table into the feature table.
COLUMNS_TO_COPY = ['编号', '性别', '年龄', '父亲教养方式', '母亲教养方式', '自评家庭经济条件', '心理治疗(咨询)史'] + SYMPTOM_COLUMNS


def _binary_score(mask: pd.Series, if_true: float, if_false: float) -> pd.Series:
    """Return ``if_true`` where ``mask`` is True and ``if_false`` elsewhere."""
    return pd.Series(if_true, index=mask.index, dtype=float).where(mask, if_false)


def _sum_by_student(records: pd.DataFrame, value_column: str) -> pd.Series:
    """Total ``value_column`` per 学号; the result has one entry per student."""
    return records.groupby('学号')[value_column].sum()


def _lookup_per_student(student_ids: pd.Series, per_student: pd.Series) -> pd.Series:
    """Look up each id in ``per_student``; ids without an entry get 0."""
    return student_ids.map(per_student).fillna(0)


def build_features(
    df_src: pd.DataFrame,
    df_leave: pd.DataFrame,
    df_dropout_warning: pd.DataFrame,
) -> pd.DataFrame:
    """Assemble the feature table from the three input tables.

    Args:
        df_src: Student table; must contain every column in COLUMNS_TO_COPY.
        df_leave: Leave records with columns 学号 and 请假次数. A student may
            appear on several rows; the counts are summed.
        df_dropout_warning: Warning records with columns 学号,
            是否受过退学预警 and 受过退学预警次数. A student may appear on
            several rows.

    Returns:
        A new DataFrame with the copied columns followed by the derived
        score columns. The input frames are not modified.

    Raises:
        KeyError: If a required column is missing from an input table.
    """
    df_feature = df_src[COLUMNS_TO_COPY].copy()

    # Categorical answers -> fixed scores. The score constants are carried
    # over unchanged from the original script; their derivation is not
    # recorded here.
    df_feature['父亲教养方式数字化'] = _binary_score(df_feature['父亲教养方式'].eq('温暖与理解'), 0.59, 0.46)
    df_feature['母亲教养方式数字化'] = _binary_score(df_feature['母亲教养方式'].eq('温暖与理解'), 0.69, 0.56)
    df_feature['自评家庭经济条件数字化'] = _binary_score(df_feature['自评家庭经济条件'].isin(['贫困', '较差']), 0.54, 0.47)
    df_feature['有无心理治疗(咨询)史数字化'] = _binary_score(df_feature['心理治疗(咨询)史'].isin(['无', '没有']), 0.09, 0.21)

    # Scale three symptom scores by dividing by 4.
    df_feature['强迫症状数字化'] = df_feature['强迫症状'] / 4
    df_feature['人际关系敏感数字化'] = df_feature['人际关系敏感'] / 4
    df_feature['抑郁数字化'] = df_feature['抑郁'] / 4

    # Fraction of symptom factors whose score exceeds 3.0.
    df_feature['多因子症状'] = (df_feature[SYMPTOM_COLUMNS] > 3.0).sum(axis=1) / len(SYMPTOM_COLUMNS)

    # Leave count per student. Mapping through the grouped totals (unique
    # index) rather than the raw record table avoids the InvalidIndexError
    # that Series.map raises when a student has more than one record row.
    student_ids = df_feature['编号']
    df_feature['请假次数'] = _lookup_per_student(student_ids, _sum_by_student(df_leave, '请假次数'))

    # Students at or above the mean leave count get the higher score.
    leave_mean = df_feature['请假次数'].mean()
    df_feature['出勤情况数字化'] = _binary_score(df_feature['请假次数'].ge(leave_mean), 0.74, 0.67)

    # Warning flag per student. Summing a flag over several rows can exceed
    # 1, which would then fail the ``== 1`` test below, so cap it at 1.
    # NOTE(review): assumes the flag column is numeric 0/1 - confirm.
    warned = _sum_by_student(df_dropout_warning, '是否受过退学预警').clip(upper=1)
    df_feature['是否受过退学预警'] = _lookup_per_student(student_ids, warned)

    # Total number of warnings per student.
    warning_counts = _sum_by_student(df_dropout_warning, '受过退学预警次数')
    df_feature['受过退学预警次数'] = _lookup_per_student(student_ids, warning_counts)

    # Academic score derived from the warning flag.
    df_feature['学业情况数字化'] = _binary_score(df_feature['是否受过退学预警'].eq(1), 0.59, 0.50)

    return df_feature


def main() -> None:
    """Read the three workbooks, build the features and write the result."""
    df_src = pd.read_excel(SRC_PATH)
    df_leave = pd.read_excel(LEAVE_PATH)
    df_dropout_warning = pd.read_excel(DROPOUT_WARNING_PATH)

    df_feature = build_features(df_src, df_leave, df_dropout_warning)

    # Same diagnostic the original script printed: mean leave count.
    print(df_feature['请假次数'].mean())

    df_feature.to_excel(OUTPUT_PATH, index=False)


if __name__ == "__main__":
    main()