加入完整的excel处理,处理基础是:data/下文件;已经验证和最终现在实验的feature_label.xlsx一致;data/下的文件都是最原始的没有用python文件处理过,但是可能用手改过
parent
78c7027ec5
commit
e9de92962c
@ -0,0 +1,27 @@
|
||||
import pandas as pd
|
||||
|
||||
# Load the feature table written to data_processed/feature.xlsx.
features = pd.read_excel('data_processed/feature.xlsx')

# Weight of each digitised feature column in the composite score.
# The terms are accumulated in exactly the order listed here.
score_weights = {
    '强迫症状数字化': 0.135,
    '人际关系敏感数字化': 0.085,
    '抑郁数字化': 0.08,
    '多因子症状': 0.2,
    '母亲教养方式数字化': 0.09,
    '父亲教养方式数字化': 0.09,
    '自评家庭经济条件数字化': 0.06,
    '有无心理治疗(咨询)史数字化': 0.06,
    '学业情况数字化': 0.08,
    '出勤情况数字化': 0.12,
}

# Composite score: weighted sum of the columns above, added left to right.
weighted_terms = [
    features[column] * weight for column, weight in score_weights.items()
]
composite = weighted_terms[0]
for term in weighted_terms[1:]:
    composite = composite + term
features['权重数字化值'] = composite
|
||||
|
||||
# The ten SCL-90 factor columns inspected when assigning a warning level.
scl_90_factors = ["躯体化", "强迫症状", "人际关系敏感", "抑郁", "焦虑", "敌对", "恐怖", "偏执", "精神病性", "其他"]


def calculate_warning_level(row):
    """Return the warning level (1-4) for one row of factor scores.

    Only the columns named in ``scl_90_factors`` are read; any other
    entries in ``row`` are ignored.

    Rules, checked in this order:
      * 1 -- at least one factor >= 4, or at least eight factors >= 3
      * 2 -- at least one factor >= 3
      * 3 -- at least one factor >= 2
      * 4 -- none of the above

    Args:
        row: A mapping-like row (e.g. the ``pandas.Series`` passed by
            ``DataFrame.apply(..., axis=1)``) containing every column in
            ``scl_90_factors``.

    Returns:
        int: The warning level, from 1 to 4.

    Raises:
        KeyError: If ``row`` lacks one of the ``scl_90_factors`` columns.
    """
    factors = row[scl_90_factors]

    # Count each threshold once instead of recomputing it per branch.
    at_least_4 = (factors >= 4).sum()
    at_least_3 = (factors >= 3).sum()
    at_least_2 = (factors >= 2).sum()

    if at_least_4 >= 1 or at_least_3 >= 8:
        return 1
    if at_least_3 >= 1:
        return 2
    if at_least_2 >= 1:
        return 3
    return 4
|
||||
|
||||
# Compute the warning level row by row and store it as the label column.
warning_levels = features.apply(calculate_warning_level, axis="columns")
features['label'] = warning_levels

# Write the labelled table, without the DataFrame index, to
# data_processed/feature_label.xlsx.
features.to_excel('data_processed/feature_label.xlsx', index=False)
|
||||
@ -0,0 +1,54 @@
|
||||
import pandas as pd
|
||||
|
||||
# Load the source data table and the leave-record summary table.
df_src = pd.read_excel("data/data_src.xlsx")
df_leave = pd.read_excel("data_processed/Leave_Record_RES.xlsx")

# Columns carried over from the source table into the feature table.
columns_to_copy = [
    '编号', '性别', '年龄',
    '父亲教养方式', '母亲教养方式',
    '自评家庭经济条件', '心理治疗(咨询)史',
    '躯体化', '强迫症状', '人际关系敏感', '抑郁', '焦虑',
    '敌对', '恐怖', '偏执', '精神病性', '其他',
]

# Take an independent copy so later column additions do not touch df_src.
df_feature = df_src.loc[:, columns_to_copy].copy()
|
||||
|
||||
def _encode_category(values, matches, hit, miss):
    """Map each cell to ``hit`` if it is one of ``matches``, else ``miss``."""
    return values.apply(lambda cell: hit if cell in matches else miss)


# Add the digitised versions of the categorical columns. Any cell that does
# not match the listed labels (including a missing cell) gets the second value.
df_feature['父亲教养方式数字化'] = _encode_category(
    df_feature['父亲教养方式'], ('温暖与理解',), 0.59, 0.46)
df_feature['母亲教养方式数字化'] = _encode_category(
    df_feature['母亲教养方式'], ('温暖与理解',), 0.69, 0.56)
df_feature['自评家庭经济条件数字化'] = _encode_category(
    df_feature['自评家庭经济条件'], ('贫困', '较差'), 0.54, 0.47)
df_feature['有无心理治疗(咨询)史数字化'] = _encode_category(
    df_feature['心理治疗(咨询)史'], ('无', '没有'), 0.09, 0.21)
|
||||
|
||||
# Digitised obsessive-compulsive, interpersonal-sensitivity and depression
# values: the factor score divided by 4.
for factor_name in ('强迫症状', '人际关系敏感', '抑郁'):
    df_feature[factor_name + '数字化'] = df_feature[factor_name] / 4

# Multi-factor symptom value: the number of factor columns whose score is
# strictly greater than 3.0, divided by 10.
symptom_columns = ['躯体化', '强迫症状', '人际关系敏感', '抑郁', '焦虑', '敌对', '恐怖', '偏执', '精神病性', '其他']
elevated_count = (df_feature[symptom_columns] > 3.0).sum(axis=1)
df_feature['多因子症状'] = elevated_count / 10
|
||||
|
||||
# Total leave count per student ID. The result has exactly one entry per
# distinct ID, so it can be used as a lookup table even when the leave table
# holds several rows for the same student.
leave_totals = df_leave.groupby('学号')['请假次数'].sum()

# Keep the leave table itself consistent with the per-student totals.
df_leave['请假次数'] = df_leave['学号'].map(leave_totals)

# Look up each feature row's total by its ID; IDs without any leave record
# get 0. Mapping from the grouped totals (rather than from the leave table
# re-indexed by ID) avoids the error pandas raises when the lookup index
# contains duplicate IDs.
# NOTE(review): assumes '编号' and '学号' hold the same kind of value (same
# dtype and format); otherwise nothing matches and every row gets 0.
df_feature['请假次数'] = df_feature['编号'].map(leave_totals).fillna(0)

# Mean leave count over all feature rows, printed for inspection.
leave_mean = df_feature['请假次数'].mean()
print(leave_mean)

# Digitised attendance: 0.74 when the leave count is at or above the mean,
# otherwise 0.67.
df_feature['出勤情况数字化'] = df_feature['请假次数'].apply(lambda x: 0.74 if x >= leave_mean else 0.67)
|
||||
|
||||
# Load the dropout-warning table.
df_dropout_warning = pd.read_excel("data_processed/Dropout_Warning_RES.xlsx")

# Collapse the table to one value per student ID so it can be used as a
# lookup table even when a student appears on several rows.
warnings_by_student = df_dropout_warning.groupby('学号')
# The flag is collapsed with max so it stays 0/1 and the `== 1` test below
# still matches a student who has several rows.
# NOTE(review): assumes the flag column only holds 0 or 1 -- confirm.
ever_warned = warnings_by_student['是否受过退学预警'].max()
# The warning counts are added up across a student's rows.
warning_totals = warnings_by_student['受过退学预警次数'].sum()

# Keep the warning table itself consistent with the per-student values.
df_dropout_warning['是否受过退学预警'] = df_dropout_warning['学号'].map(ever_warned)
df_dropout_warning['受过退学预警次数'] = df_dropout_warning['学号'].map(warning_totals)

# Look up each feature row by its ID; IDs absent from the warning table get 0.
# Mapping from the grouped values avoids the error pandas raises when the
# lookup index contains duplicate IDs.
df_feature['是否受过退学预警'] = df_feature['编号'].map(ever_warned).fillna(0)
df_feature['受过退学预警次数'] = df_feature['编号'].map(warning_totals).fillna(0)

# Digitised academic status: 0.59 when the student has been warned,
# otherwise 0.50.
df_feature['学业情况数字化'] = df_feature['是否受过退学预警'].apply(lambda x: 0.59 if x == 1 else 0.50)

# Write the finished feature table, without the DataFrame index.
df_feature.to_excel("data_processed/feature.xlsx", index=False)
|
||||
@ -0,0 +1,16 @@
|
||||
import pandas as pd
|
||||
|
||||
# Read every sheet of the raw leave workbook and stack them into one table.
# The context manager closes the workbook handle once all sheets are parsed.
with pd.ExcelFile('data/LeaveRecord.xlsx') as excel_file:
    df = pd.concat([excel_file.parse(sheet_name) for sheet_name in excel_file.sheet_names])

# One row per student ID with the number of leave records for that ID.
records_by_student = df.groupby('学号')
student_counts = records_by_student.size().reset_index(name='请假次数')

# Distinct names recorded under each ID, in the same (sorted-by-ID) order as
# student_counts. Missing names are skipped and the rest are converted to
# text, so a blank or non-text name cell cannot break the join below.
distinct_names = records_by_student['姓名'].unique().reset_index(drop=True)
known_names = distinct_names.apply(
    lambda names: [str(name) for name in names if pd.notna(name)]
)
student_counts['姓名'] = known_names.apply(lambda names: ','.join(names))

# Flag IDs recorded under more than one distinct name (1) versus not (0).
# Counting the names directly, instead of re-splitting the joined string,
# keeps a name that itself contains a comma from being counted twice.
student_counts['是否错误'] = known_names.apply(lambda names: 1 if len(names) > 1 else 0)

# Write the per-student summary, without the DataFrame index.
student_counts.to_excel('data_processed/Leave_Record_RES.xlsx', index=False)
|
||||
Loading…
Reference in New Issue