You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
93 lines
4.2 KiB
Python
93 lines
4.2 KiB
Python
from pydantic import BaseModel
|
|
import requests
|
|
import pandas as pd
|
|
import os
|
|
|
|
def _scl90_warning_level(scores):
    """Return the per-row warning level (1-4) for a frame of SCL-90 factor scores.

    Rules, evaluated in this order for every row:

    * level 1 -- any factor >= 4, or at least eight factors >= 3
    * level 2 -- any factor >= 3
    * level 3 -- any factor >= 2
    * level 4 -- none of the above

    Args:
        scores: DataFrame holding only the factor columns.

    Returns:
        An int64 Series aligned with ``scores.index``.
    """
    factors_at_least_3 = (scores >= 3).sum(axis=1)
    reaches_level_1 = (scores >= 4).any(axis=1) | (factors_at_least_3 >= 8)
    reaches_level_2 = factors_at_least_3 >= 1
    reaches_level_3 = (scores >= 2).any(axis=1)
    # The conditions are nested (level 1 implies level 2 implies level 3), so
    # every condition that holds moves the row exactly one step down from 4.
    return (
        4
        - reaches_level_3.astype("int64")
        - reaches_level_2.astype("int64")
        - reaches_level_1.astype("int64")
    )


def preprocess_data(df_src, df_leave, df_dropout_warning):
    """Preprocess the raw tables and return the feature-augmented student frame.

    New columns are written onto ``df_src`` itself and the same object is
    returned. ``df_leave`` and ``df_dropout_warning`` are only read.

    Students may appear on several rows of ``df_leave`` or
    ``df_dropout_warning``: leave counts are summed per student, and the
    warning flag is collapsed with ``max`` so a student with several flagged
    rows still compares equal to 1. With one row per student the result is
    the same as mapping the raw column directly.

    Args:
        df_src (pandas.DataFrame): student information table. Must contain
            '编号', the ten SCL-90 factor columns, '父亲教养方式',
            '母亲教养方式', '自评家庭经济条件' and '心理治疗(咨询)史'.
        df_leave (pandas.DataFrame): leave records with '学号' and '请假次数'.
        df_dropout_warning (pandas.DataFrame): academic warning table with
            '学号' and '是否受过退学预警'.

    Returns:
        pandas.DataFrame: ``df_src`` with the added feature and label columns.
    """
    # The ten SCL-90 factor columns and the English names they are copied to.
    scl_90_factors = ["躯体化", "强迫症状", "人际关系敏感", "抑郁", "焦虑", "敌对", "恐怖", "偏执", "精神病性", "其他"]
    new_factor_names = ["somatization", "obsessive_compulsive", "interpersonal_sensitivity", "depression", "anxiety", "hostility", "terror", "paranoia", "psychoticism", "other"]

    # Leave count per student. The groupby result has a unique index, which
    # Series.map requires; students without any record get 0.
    leave_totals = df_leave.groupby('学号')['请假次数'].sum()
    df_src['请假次数'] = df_src['编号'].map(leave_totals).fillna(0)
    # Mean over students that have leave records (not over all of df_src).
    leave_mean = leave_totals.mean()

    # Dropout-warning flag per student; students not listed get 0.
    warning_flags = df_dropout_warning.groupby('学号')['是否受过退学预警'].max()
    df_src['是否受过退学预警'] = df_src['编号'].map(warning_flags).fillna(0)

    # Warning level from the factor scores.
    df_src['类别'] = _scl90_warning_level(df_src[scl_90_factors])

    # Copy each factor under its English column name; label is the level
    # shifted to start at 0.
    for source_name, english_name in zip(scl_90_factors, new_factor_names):
        df_src[english_name] = df_src[source_name]
    df_src['label'] = df_src['类别'] - 1

    # Encode the categorical columns as integers.
    df_src['father_parenting_style'] = (df_src['父亲教养方式'] == "温暖与理解").astype("int64")
    df_src['mother_parenting_style'] = (df_src['母亲教养方式'] == "温暖与理解").astype("int64")
    # "贫困" -> 2, "较差" -> 1, anything else (including missing) -> 0.
    df_src['self_assessed_family_economic_condition'] = (
        df_src['自评家庭经济条件'].map({"贫困": 2, "较差": 1}).fillna(0).astype("int64")
    )
    # Only the explicit "none" answers count as 0; every other value is 1.
    df_src['history_of_psychological_counseling'] = (
        ~df_src['心理治疗(咨询)史'].isin(['无', '没有'])
    ).astype("int64")
    df_src['absenteeism_above_average'] = (df_src['请假次数'] >= leave_mean).astype("int64")
    df_src['academic_warning'] = (df_src['是否受过退学预警'] == 1).astype("int64")

    return df_src
|
|
|
|
def convert_to_list(df):
    """Turn the processed DataFrame into a list of per-row feature dicts.

    Args:
        df (pandas.DataFrame): processed frame containing every feature
            column named below plus ``label``.

    Returns:
        list: one dict per row, holding the features and the label in the
        order given by ``wanted_keys``. Other columns of ``df`` are ignored.
    """
    wanted_keys = (
        "somatization",
        "obsessive_compulsive",
        "interpersonal_sensitivity",
        "depression",
        "anxiety",
        "hostility",
        "terror",
        "paranoia",
        "psychoticism",
        "other",
        "father_parenting_style",
        "mother_parenting_style",
        "self_assessed_family_economic_condition",
        "history_of_psychological_counseling",
        "absenteeism_above_average",
        "academic_warning",
        "label",
    )
    return [
        {key: record[key] for key in wanted_keys}
        for _, record in df.iterrows()
    ]