You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

93 lines
4.2 KiB
Python

from pydantic import BaseModel
import requests
import pandas as pd
import os
def preprocess_data(df_src, df_leave, df_dropout_warning):
    """
    Add the derived feature and label columns to the student table.

    ``df_src`` is modified in place (new columns are added) and the same
    object is returned. ``df_leave`` and ``df_dropout_warning`` are only
    read; they are never modified.

    Args:
        df_src (pandas.DataFrame): Student information table. Must contain
            the id column '编号', the ten SCL-90 factor columns and the
            questionnaire columns read below.
        df_leave (pandas.DataFrame): Leave records with columns '学号' and
            '请假次数'. A student may appear on several rows.
        df_dropout_warning (pandas.DataFrame): Academic warning records with
            columns '学号' and '是否受过退学预警'. A student may appear on
            several rows.

    Returns:
        pandas.DataFrame: ``df_src`` with the added columns.
    """
    # The ten SCL-90 factor columns and the English names they are copied to.
    scl_90_factors = ["躯体化", "强迫症状", "人际关系敏感", "抑郁", "焦虑", "敌对", "恐怖", "偏执", "精神病性", "其他"]
    new_factor_names = ["somatization", "obsessive_compulsive", "interpersonal_sensitivity", "depression", "anxiety", "hostility", "terror", "paranoia", "psychoticism", "other"]

    # Total leave count per student. Aggregating into a separate Series gives
    # a unique index, which Series.map requires of its mapper, and leaves the
    # caller's df_leave untouched. Students without any record get 0.
    leave_totals = df_leave.groupby('学号')['请假次数'].sum()
    df_src['请假次数'] = df_src['编号'].map(leave_totals).fillna(0)

    # Mean of the per-student totals; only students that appear in df_leave
    # contribute. It is NaN when df_leave is empty, in which case nobody is
    # flagged as above average below.
    leave_mean = leave_totals.mean()

    # Summed warning flag per student, looked up the same way.
    warning_totals = df_dropout_warning.groupby('学号')['是否受过退学预警'].sum()
    df_src['是否受过退学预警'] = df_src['编号'].map(warning_totals).fillna(0)

    # Warning level from the factor scores; the first matching rule wins:
    #   1: any factor >= 4, or at least eight factors >= 3
    #   2: any factor >= 3
    #   3: any factor >= 2
    #   4: otherwise
    # The masks are applied from the weakest rule to the strongest so that a
    # stronger rule overrides a weaker one.
    scores = df_src[scl_90_factors]
    count_ge3 = (scores >= 3).sum(axis=1)
    level = pd.Series(4, index=df_src.index, dtype="int64")
    level = level.mask((scores >= 2).any(axis=1), 3)
    level = level.mask(count_ge3 >= 1, 2)
    level = level.mask((scores >= 4).any(axis=1) | (count_ge3 >= 8), 1)
    df_src['类别'] = level

    # Copy the factor scores under their English column names.
    for factor, english_name in zip(scl_90_factors, new_factor_names):
        df_src[english_name] = df_src[factor]

    # Zero-based class label derived from the warning level.
    df_src['label'] = df_src['类别'] - 1

    # Encode the questionnaire answers as small integers.
    df_src['father_parenting_style'] = df_src['父亲教养方式'].eq("温暖与理解").astype(int)
    df_src['mother_parenting_style'] = df_src['母亲教养方式'].eq("温暖与理解").astype(int)
    # "贫困" -> 2, "较差" -> 1, every other answer (including missing) -> 0.
    df_src['self_assessed_family_economic_condition'] = (
        df_src['自评家庭经济条件'].map({"贫困": 2, "较差": 1}).fillna(0).astype(int)
    )
    # Anything other than an empty answer or "没有" counts as having a history.
    df_src['history_of_psychological_counseling'] = (
        ~df_src['心理治疗(咨询)史'].isin(['', '没有'])
    ).astype(int)
    df_src['absenteeism_above_average'] = (df_src['请假次数'] >= leave_mean).astype(int)
    # The flag was summed per student, so any total of at least 1 means the
    # student has been warned.
    df_src['academic_warning'] = (df_src['是否受过退学预警'] >= 1).astype(int)
    return df_src
def convert_to_list(df):
    """
    Turn every row of a preprocessed DataFrame into a feature/label dict.

    Args:
        df (pandas.DataFrame): Preprocessed data; must contain every column
            named in ``field_names`` below. Other columns are ignored.

    Returns:
        list: One dict per row, in row order, holding the sixteen feature
        values and the "label" value keyed by column name.
    """
    # Keys are emitted in exactly this order for every record.
    field_names = (
        "somatization",
        "obsessive_compulsive",
        "interpersonal_sensitivity",
        "depression",
        "anxiety",
        "hostility",
        "terror",
        "paranoia",
        "psychoticism",
        "other",
        "father_parenting_style",
        "mother_parenting_style",
        "self_assessed_family_economic_condition",
        "history_of_psychological_counseling",
        "absenteeism_above_average",
        "academic_warning",
        "label",
    )
    return [
        {name: record[name] for name in field_names}
        for _, record in df.iterrows()
    ]