"""Build the per-student feature table from the raw survey and record sheets.

Reads the source survey workbook plus the processed leave-record and
dropout-warning workbooks, derives the numeric ("数字化") feature columns,
and writes the result to ``data_processed/feature.xlsx``.
"""

import pandas as pd

# Input / output workbooks, relative to the current working directory.
SRC_PATH = "data/data_src.xlsx"
LEAVE_PATH = "data_processed/Leave_Record_RES.xlsx"
DROPOUT_WARNING_PATH = "data_processed/Dropout_Warning_RES.xlsx"
FEATURE_PATH = "data_processed/feature.xlsx"

# Columns copied verbatim from the source sheet into the feature table.
COLUMNS_TO_COPY = ['编号', '性别', '年龄', '父亲教养方式', '母亲教养方式', '自评家庭经济条件', '心理治疗(咨询)史', '躯体化', '强迫症状', '人际关系敏感', '抑郁', '焦虑', '敌对', '恐怖', '偏执', '精神病性', '其他']

# Symptom factor columns counted for the multi-factor symptom score.
SYMPTOM_COLUMNS = ['躯体化', '强迫症状', '人际关系敏感', '抑郁', '焦虑', '敌对', '恐怖', '偏执', '精神病性', '其他']


def _encode(series: pd.Series, matches: list, hit: float, miss: float) -> pd.Series:
    """Return ``hit`` where ``series`` is one of ``matches`` and ``miss`` elsewhere."""
    return series.isin(matches).map({True: hit, False: miss})


def _total_per_student(records: pd.DataFrame, value_column: str) -> pd.Series:
    """Sum ``value_column`` per 学号.

    The result is indexed by unique 学号 values, so it is always a valid
    lookup table for ``Series.map`` even when a student has several rows.
    """
    return records.groupby('学号')[value_column].sum()


def build_features(
    df_src: pd.DataFrame,
    df_leave: pd.DataFrame,
    df_dropout_warning: pd.DataFrame,
) -> pd.DataFrame:
    """Derive the feature table from the three input tables.

    Args:
        df_src: Source sheet; must contain every column in ``COLUMNS_TO_COPY``.
        df_leave: Leave records with columns '学号' and '请假次数'.
        df_dropout_warning: Dropout-warning records with columns '学号',
            '是否受过退学预警' and '受过退学预警次数'.

    Returns:
        A new DataFrame holding the copied source columns followed by the
        derived columns. None of the input frames is modified.

    Raises:
        KeyError: If a required column is missing from an input frame.
    """
    # Copy the selected columns into a new table.
    df_feature = df_src[COLUMNS_TO_COPY].copy()

    # Categorical answers -> fixed numeric scores.
    # NOTE(review): the score constants are taken as-is from the original
    # script; their derivation is not visible in this file.
    df_feature['父亲教养方式数字化'] = _encode(df_feature['父亲教养方式'], ['温暖与理解'], 0.59, 0.46)
    df_feature['母亲教养方式数字化'] = _encode(df_feature['母亲教养方式'], ['温暖与理解'], 0.69, 0.56)
    df_feature['自评家庭经济条件数字化'] = _encode(df_feature['自评家庭经济条件'], ['贫困', '较差'], 0.54, 0.47)
    df_feature['有无心理治疗(咨询)史数字化'] = _encode(df_feature['心理治疗(咨询)史'], ['无', '没有'], 0.09, 0.21)

    # Scale the obsessive-compulsive, interpersonal-sensitivity and
    # depression factor scores by dividing by 4.
    df_feature['强迫症状数字化'] = df_feature['强迫症状'] / 4
    df_feature['人际关系敏感数字化'] = df_feature['人际关系敏感'] / 4
    df_feature['抑郁数字化'] = df_feature['抑郁'] / 4

    # Multi-factor symptom score: how many of the ten factors exceed 3.0,
    # divided by 10.
    df_feature['多因子症状'] = (df_feature[SYMPTOM_COLUMNS] > 3.0).sum(axis=1) / 10

    # Fill in the total leave count per student; students with no leave
    # record get 0.
    leave_totals = _total_per_student(df_leave, '请假次数')
    df_feature['请假次数'] = df_feature['编号'].map(leave_totals).fillna(0)

    # Average leave count across all students in the feature table.
    leave_mean = df_feature['请假次数'].mean()
    print(leave_mean)

    # Attendance score: students at or above the average leave count vs. below.
    df_feature['出勤情况数字化'] = (df_feature['请假次数'] >= leave_mean).map({True: 0.74, False: 0.67})

    # Fill in whether the student ever received a dropout warning (summed per
    # student, so a student with several warning rows may exceed 1).
    warned_totals = _total_per_student(df_dropout_warning, '是否受过退学预警')
    df_feature['是否受过退学预警'] = df_feature['编号'].map(warned_totals).fillna(0)

    # Fill in the number of dropout warnings received.
    warning_counts = _total_per_student(df_dropout_warning, '受过退学预警次数')
    df_feature['受过退学预警次数'] = df_feature['编号'].map(warning_counts).fillna(0)

    # Academic score. Use >= 1 rather than == 1 because the flag above is a
    # per-student sum and is greater than 1 for students with several rows.
    df_feature['学业情况数字化'] = (df_feature['是否受过退学预警'] >= 1).map({True: 0.59, False: 0.50})

    return df_feature


def main() -> None:
    """Read the input workbooks, build the feature table and save it."""
    # Read the raw data tables.
    df_src = pd.read_excel(SRC_PATH)
    df_leave = pd.read_excel(LEAVE_PATH)
    # Read the dropout-warning table.
    df_dropout_warning = pd.read_excel(DROPOUT_WARNING_PATH)

    df_feature = build_features(df_src, df_leave, df_dropout_warning)

    # Save the new table.
    df_feature.to_excel(FEATURE_PATH, index=False)


if __name__ == "__main__":
    main()