From e9de92962c83a48512d5dcdb16178cc90ffcabc0 Mon Sep 17 00:00:00 2001
From: wangchunlin <wangchunin666@gmail.com>
Date: Wed, 3 Apr 2024 01:37:05 +0800
Subject: [PATCH] =?UTF-8?q?=E5=8A=A0=E5=85=A5=E5=AE=8C=E6=95=B4=E7=9A=84ex?=
 =?UTF-8?q?cel=E5=A4=84=E7=90=86=EF=BC=8C=E5=A4=84=E7=90=86=E5=9F=BA?=
 =?UTF-8?q?=E7=A1=80=E6=98=AF=EF=BC=9Adata/=E4=B8=8B=E6=96=87=E4=BB=B6?=
 =?UTF-8?q?=EF=BC=9B=E5=B7=B2=E7=BB=8F=E9=AA=8C=E8=AF=81=E5=92=8C=E6=9C=80?=
 =?UTF-8?q?=E7=BB=88=E7=8E=B0=E5=9C=A8=E5=AE=9E=E9=AA=8C=E7=9A=84feature?=
 =?UTF-8?q?=5Flabel.xlsx=E4=B8=80=E8=87=B4=EF=BC=9Bdata/=E4=B8=8B=E7=9A=84?=
 =?UTF-8?q?=E6=96=87=E4=BB=B6=E9=83=BD=E6=98=AF=E6=9C=80=E5=8E=9F=E5=A7=8B?=
 =?UTF-8?q?=E7=9A=84=E6=B2=A1=E6=9C=89=E7=94=A8python=E6=96=87=E4=BB=B6?=
 =?UTF-8?q?=E5=A4=84=E7=90=86=E8=BF=87=EF=BC=8C=E4=BD=86=E6=98=AF=E5=8F=AF?=
 =?UTF-8?q?=E8=83=BD=E7=94=A8=E6=89=8B=E6=94=B9=E8=BF=87?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 Academic_Data_Processing.py     |  4 +--
 Concat_Feature_Label.py         | 27 +++++++++++++++++
 Feature_Processing.py           | 54 +++++++++++++++++++++++++++++++++
 Leave_Record_Data_Processing.py | 16 ++++++++++
 4 files changed, 99 insertions(+), 2 deletions(-)
 create mode 100644 Concat_Feature_Label.py
 create mode 100644 Feature_Processing.py
 create mode 100644 Leave_Record_Data_Processing.py

diff --git a/Academic_Data_Processing.py b/Academic_Data_Processing.py
index cc5cbc1..76fba71 100644
--- a/Academic_Data_Processing.py
+++ b/Academic_Data_Processing.py
@@ -1,7 +1,7 @@
 import pandas as pd
 
 # 定义文件列表和学期数量
-files = ["2020_2021_1.xlsx", "2020_2021_2.xlsx", "2021_2022_1.xlsx", "2021_2022_2.xlsx"]
+files = ["data/2020_2021_1.xlsx", "data/2020_2021_2.xlsx", "data/2021_2022_1.xlsx", "data/2021_2022_2.xlsx"]
 semester_count = len(files)
 
 # 初始化空字典用于存储学生信息和学期数量
@@ -57,4 +57,4 @@ for data in student_data.values():
 # 创建DataFrame并保存结果到新的Excel文件
 output_df = pd.DataFrame(student_data.values())
 output_df = output_df[["学号", "姓名", "不合格课程门数", "不合格课程门数组成", "是否错误", "是否受过退学预警", "受过退学预警次数"]]
-output_df.to_excel("output_Study.xlsx", index=False)
+output_df.to_excel("data_processed/Dropout_Warning_RES.xlsx", index=False)
diff --git a/Concat_Feature_Label.py b/Concat_Feature_Label.py
new file mode 100644
index 0000000..bf1592c
--- /dev/null
+++ b/Concat_Feature_Label.py
@@ -0,0 +1,27 @@
+import pandas as pd
+
+# Load the feature table
+features = pd.read_excel('data_processed/feature.xlsx')
+
+# Compute the combined weighted score from the digitized feature columns (the ten weights sum to 1.0)
+features['权重数字化值'] = features['强迫症状数字化'] * 0.135 + features['人际关系敏感数字化'] * 0.085 + features['抑郁数字化'] * 0.08 + features['多因子症状'] * 0.2 + features['母亲教养方式数字化'] * 0.09 + features['父亲教养方式数字化'] * 0.09 + features['自评家庭经济条件数字化'] * 0.06 + features['有无心理治疗（咨询）史数字化'] * 0.06 + features['学业情况数字化'] * 0.08 + features['出勤情况数字化'] * 0.12
+
+# Column names of the 10 SCL-90 factors
+scl_90_factors = ["躯体化", "强迫症状", "人际关系敏感", "抑郁", "焦虑", "敌对", "恐怖", "偏执", "精神病性", "其他"]
+
+# Derive the warning level (1-4) from a row's SCL-90 factor scores
+def calculate_warning_level(row):
+    """Return 1 if any factor is >= 4 or at least 8 are >= 3; 2 if any is >= 3; 3 if any is >= 2; else 4."""
+    scores = row[scl_90_factors]
+    # Counted once up front because both the level-1 and the level-2 test depend on it.
+    n_at_least_3 = (scores >= 3).sum()
+    if (scores >= 4).any() or n_at_least_3 >= 8:
+        return 1
+    if n_at_least_3 >= 1:
+        return 2
+    return 3 if (scores >= 2).any() else 4
+
+features['label'] = features.apply(calculate_warning_level, axis=1)
+
+# Save the feature table together with the warning-level label
+features.to_excel('data_processed/feature_label.xlsx', index=False)
diff --git a/Feature_Processing.py b/Feature_Processing.py
new file mode 100644
index 0000000..ef2fc72
--- /dev/null
+++ b/Feature_Processing.py
@@ -0,0 +1,54 @@
+import pandas as pd
+
+# Load the raw source table and the processed leave-record table
+df_src = pd.read_excel("data/data_src.xlsx")
+df_leave = pd.read_excel("data_processed/Leave_Record_RES.xlsx")
+
+# Columns to copy from the raw table
+columns_to_copy = ['编号', '性别', '年龄', '父亲教养方式', '母亲教养方式', '自评家庭经济条件', '心理治疗（咨询）史', '躯体化', '强迫症状', '人际关系敏感', '抑郁', '焦虑', '敌对', '恐怖', '偏执', '精神病性', '其他']
+
+# Copy the selected columns into a new table
+df_feature = df_src[columns_to_copy].copy()
+
+# Add digitized (numeric) versions of the categorical columns
+df_feature['父亲教养方式数字化'] = df_feature['父亲教养方式'].apply(lambda x: 0.59 if x == '温暖与理解' else 0.46)
+df_feature['母亲教养方式数字化'] = df_feature['母亲教养方式'].apply(lambda x: 0.69 if x == '温暖与理解' else 0.56)
+df_feature['自评家庭经济条件数字化'] = df_feature['自评家庭经济条件'].apply(lambda x: 0.54 if x in ['贫困', '较差'] else 0.47)
+df_feature['有无心理治疗（咨询）史数字化'] = df_feature['心理治疗（咨询）史'].apply(lambda x: 0.09 if x in ['无', '没有'] else 0.21)
+
+# Digitize the obsessive-compulsive, interpersonal-sensitivity and depression scores by dividing by 4
+df_feature['强迫症状数字化'] = df_feature['强迫症状'] / 4
+df_feature['人际关系敏感数字化'] = df_feature['人际关系敏感'] / 4
+df_feature['抑郁数字化'] = df_feature['抑郁'] / 4
+
+# Multi-factor symptom score: number of the 10 factors scoring above 3.0, divided by 10 (vectorized count)
+symptom_columns = ['躯体化', '强迫症状', '人际关系敏感', '抑郁', '焦虑', '敌对', '恐怖', '偏执', '精神病性', '其他']
+df_feature['多因子症状'] = (df_feature[symptom_columns] > 3.0).sum(axis=1) / 10
+
+# Total leave count per student ID; the grouped Series has a unique index, so map() works even if an ID repeats
+leave_totals = df_leave.groupby('学号')['请假次数'].sum()
+df_feature['请假次数'] = df_feature['编号'].map(leave_totals).fillna(0)
+
+# Mean leave count over all rows of the feature table
+leave_mean = df_feature['请假次数'].mean()
+print(leave_mean)
+
+# Digitized attendance: 0.74 when the leave count is at or above the mean, otherwise 0.67
+df_feature['出勤情况数字化'] = df_feature['请假次数'].apply(lambda x: 0.74 if x >= leave_mean else 0.67)
+
+# Load the dropout-warning table
+df_dropout_warning = pd.read_excel("data_processed/Dropout_Warning_RES.xlsx")
+
+# Dropout-warning flag per student ID, summed per ID as before; IDs missing from the table get 0
+warned_flag = df_dropout_warning.groupby('学号')['是否受过退学预警'].sum()
+df_feature['是否受过退学预警'] = df_feature['编号'].map(warned_flag).fillna(0)
+
+# Number of dropout warnings per student ID; IDs missing from the table get 0
+warning_totals = df_dropout_warning.groupby('学号')['受过退学预警次数'].sum()
+df_feature['受过退学预警次数'] = df_feature['编号'].map(warning_totals).fillna(0)
+
+# Digitized academic status: 0.59 when the flag equals 1, else 0.50. NOTE(review): a repeated ID would sum the flag past 1 - confirm IDs are unique
+df_feature['学业情况数字化'] = df_feature['是否受过退学预警'].apply(lambda x: 0.59 if x == 1 else 0.50)
+
+# Save the feature table
+df_feature.to_excel("data_processed/feature.xlsx", index=False)
diff --git a/Leave_Record_Data_Processing.py b/Leave_Record_Data_Processing.py
new file mode 100644
index 0000000..90ee512
--- /dev/null
+++ b/Leave_Record_Data_Processing.py
@@ -0,0 +1,16 @@
+import pandas as pd
+
+# Load every sheet of the raw leave-record workbook into one table; the with-block closes the file handle
+with pd.ExcelFile('data/LeaveRecord.xlsx') as excel_file:
+    df = pd.concat([excel_file.parse(sheet_name) for sheet_name in excel_file.sheet_names])
+
+# Count leave records per student ID and join the distinct names seen for each ID (str() tolerates non-text names)
+student_counts = df.groupby('学号').size().reset_index(name='请假次数')
+names_per_id = df.groupby('学号')['姓名'].unique().reset_index(drop=True)
+student_counts['姓名'] = names_per_id.apply(lambda names: ','.join(map(str, names)))
+
+# Flag IDs with more than one distinct name; counting names instead of commas stays correct if a name contains ','
+student_counts['是否错误'] = names_per_id.apply(lambda names: 1 if len(names) > 1 else 0)
+
+# Save the result to a new Excel file
+student_counts.to_excel('data_processed/Leave_Record_RES.xlsx', index=False)