You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
55 lines
3.2 KiB
Python
55 lines
3.2 KiB
Python
import pandas as pd

# Input workbooks and the output workbook, relative to the working directory.
SRC_PATH = "data/data_src.xlsx"
LEAVE_PATH = "data_processed/Leave_Record_RES.xlsx"
DROPOUT_WARNING_PATH = "data_processed/Dropout_Warning_RES.xlsx"
FEATURE_PATH = "data_processed/feature.xlsx"

# Columns copied unchanged from the source sheet into the feature table.
COLUMNS_TO_COPY = ['编号', '性别', '年龄', '父亲教养方式', '母亲教养方式', '自评家庭经济条件', '心理治疗(咨询)史', '躯体化', '强迫症状', '人际关系敏感', '抑郁', '焦虑', '敌对', '恐怖', '偏执', '精神病性', '其他']

# Symptom-factor columns that are counted for the multi-factor score.
SYMPTOM_COLUMNS = ['躯体化', '强迫症状', '人际关系敏感', '抑郁', '焦虑', '敌对', '恐怖', '偏执', '精神病性', '其他']


def _binary_score(flags: pd.Series, if_true: float, if_false: float) -> pd.Series:
    """Return ``if_true`` where *flags* is True and ``if_false`` elsewhere."""
    return flags.map({True: if_true, False: if_false})


def _per_student(student_ids: pd.Series, records: pd.DataFrame,
                 value_column: str, how: str) -> pd.Series:
    """Aggregate ``records[value_column]`` per '学号' and align it to *student_ids*.

    Mapping against the grouped result (whose index is unique by construction)
    avoids the ``InvalidIndexError`` that ``Series.map`` raises when the lookup
    Series has repeated index values, i.e. when a student has several rows.
    Students without any record get 0.
    """
    per_student = records.groupby('学号')[value_column].agg(how)
    return student_ids.map(per_student).fillna(0)


def build_features(df_src: pd.DataFrame, df_leave: pd.DataFrame,
                   df_dropout_warning: pd.DataFrame) -> pd.DataFrame:
    """Build the feature table from the source, leave and dropout-warning sheets.

    Args:
        df_src: Source sheet; must contain every column in ``COLUMNS_TO_COPY``.
        df_leave: Leave records with columns '学号' and '请假次数'.
        df_dropout_warning: Warning records with columns '学号',
            '是否受过退学预警' and '受过退学预警次数'.

    Returns:
        A new DataFrame with the copied columns followed by the derived ones.
        None of the input frames is modified.

    Raises:
        KeyError: If a required column is missing from an input frame.

    Side effects:
        Prints the mean leave count, which is the attendance threshold.
    """
    df_feature = df_src[COLUMNS_TO_COPY].copy()

    # Categorical answers -> fixed numeric scores. Missing values fall into
    # the "else" score of each rule, exactly as a non-matching answer does.
    df_feature['父亲教养方式数字化'] = _binary_score(
        df_feature['父亲教养方式'].eq('温暖与理解'), 0.59, 0.46)
    df_feature['母亲教养方式数字化'] = _binary_score(
        df_feature['母亲教养方式'].eq('温暖与理解'), 0.69, 0.56)
    df_feature['自评家庭经济条件数字化'] = _binary_score(
        df_feature['自评家庭经济条件'].isin(['贫困', '较差']), 0.54, 0.47)
    df_feature['有无心理治疗(咨询)史数字化'] = _binary_score(
        df_feature['心理治疗(咨询)史'].isin(['无', '没有']), 0.09, 0.21)

    # Scale three symptom factors by dividing by 4.
    # NOTE(review): presumably 4 is the top of the rating scale - confirm.
    for factor in ('强迫症状', '人际关系敏感', '抑郁'):
        df_feature[factor + '数字化'] = df_feature[factor] / 4

    # Multi-factor score: number of symptom factors strictly above 3.0,
    # divided by 10 (the number of factors listed in SYMPTOM_COLUMNS).
    df_feature['多因子症状'] = df_feature[SYMPTOM_COLUMNS].gt(3.0).sum(axis=1) / 10

    # Total leave count per student, matched on 编号 == 学号.
    # NOTE(review): assumes both ID columns share a dtype; a str/int mismatch
    # would silently yield 0 for everyone - verify against the real sheets.
    df_feature['请假次数'] = _per_student(
        df_feature['编号'], df_leave, '请假次数', 'sum')

    # Attendance score: at or above the mean leave count vs. below it.
    leave_mean = df_feature['请假次数'].mean()
    print(leave_mean)
    df_feature['出勤情况数字化'] = _binary_score(
        df_feature['请假次数'].ge(leave_mean), 0.74, 0.67)

    # Warning flag: 'max' keeps a 0/1 flag at 1 when a student has several
    # warning rows (summing would give 2+ and break the == 1 test below).
    # With one row per student this equals the original sum.
    df_feature['是否受过退学预警'] = _per_student(
        df_feature['编号'], df_dropout_warning, '是否受过退学预警', 'max')

    # Total number of warnings per student.
    df_feature['受过退学预警次数'] = _per_student(
        df_feature['编号'], df_dropout_warning, '受过退学预警次数', 'sum')

    # Academic score derived from the warning flag.
    df_feature['学业情况数字化'] = _binary_score(
        df_feature['是否受过退学预警'].eq(1), 0.59, 0.50)

    return df_feature


def main() -> None:
    """Read the three input workbooks, build the features and save them."""
    df_src = pd.read_excel(SRC_PATH)
    df_leave = pd.read_excel(LEAVE_PATH)
    df_dropout_warning = pd.read_excel(DROPOUT_WARNING_PATH)

    df_feature = build_features(df_src, df_leave, df_dropout_warning)
    df_feature.to_excel(FEATURE_PATH, index=False)


if __name__ == "__main__":
    main()