import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE

# 从Excel文件中读取数据
df_base = pd.read_excel('financial_fraud_data.xlsx')

# 检查数据
print(df_base.head())
print(df_base.info())

# 删除不需要的列
df = df_base.drop(columns=['索引', '年份'])

           索引    年份          货币资金  交易性金融资产  应收票据          应收账款          预付账款  \
0  2010000001  2010           NaN      NaN   NaN           NaN           NaN   
1  2010000002  2010  3.781693e+10      NaN   NaN  1.594025e+09  1.783800e+10   
2  2010000004  2010  4.508343e+07      NaN   NaN  4.296550e+06  2.981632e+06   
3  2010000005  2010  1.891043e+07      NaN   NaN  1.082415e+07  2.128812e+07   
4  2010000006  2010  1.664538e+09      NaN   NaN           NaN  7.601087e+07   

             存货      可供出售金融资产       持有至到期投资  ...     折旧率指数      应计指数  \
0           NaN  3.153418e+10  6.137984e+10  ...  1.156866 -0.021251   
1  1.333335e+11  4.047636e+08           NaN  ...  0.903096  0.030618   
2  4.310033e+07           NaN           NaN  ...  0.978994  0.173374   
3  5.394083e+07           NaN           NaN  ...  0.916015 -0.008157   
4  5.876571e+09  2.992880e+08           NaN  ...  0.973408 -0.068900   

   净利润是否大于0   毛利率指数是否大于1  资产质量指数是否大于1   营业收入指数是否大于1  销售管理费用指数是否大于1  \
0         1            0            1             1              0   
1         1            0            1             1              0   
2         1            1            0             1              1   
3         0            0            1             1              0   
4         1            0            1             1              0   

    财务杠杆指数是否大于1   折旧率指数是否大于1  是否舞弊  
0             0            1     0  
1             1            0     0  
2             0            0     0  
3             1            0     0  
4             1            0     0  

[5 rows x 156 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29603 entries, 0 to 29602
Columns: 156 entries, 索引 to 是否舞弊
dtypes: float64(145), int64(10), object(1)
memory usage: 35.2+ MB
None

# 清理数值中的逗号和空格，并将所有列转换为数值类型
for col in df.columns:
    df[col] = df[col].astype(str).str.replace(',', '').str.strip()
    df[col] = pd.to_numeric(df[col], errors='coerce')

# 删除全为NaN的列
df = df.dropna(axis=1, how='all')

# 缺失数据用中位数填充
df.fillna(df.median(), inplace=True)

# 无穷大值处理，用最大值填充
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.fillna(df.max(), inplace=True)

# 确保没有NaN值
assert df.isnull().sum().sum() == 0, "数据中仍有NaN值，请检查预处理步骤"

print(df.head())

           货币资金      交易性金融资产         应收票据          应收账款          预付账款  \
0  5.227256e+08  24623748.47  48307693.46  2.844320e+08  4.323133e+07   
1  3.781693e+10  24623748.47  48307693.46  1.594025e+09  1.783800e+10   
2  4.508343e+07  24623748.47  48307693.46  4.296550e+06  2.981632e+06   
3  1.891043e+07  24623748.47  48307693.46  1.082415e+07  2.128812e+07   
4  1.664538e+09  24623748.47  48307693.46  2.844320e+08  7.601087e+07   

             存货      可供出售金融资产       持有至到期投资        长期股权投资          固定资产  ...  \
0  3.521421e+08  3.153418e+10  6.137984e+10  4.043900e+08  2.470051e+09  ...   
1  1.333335e+11  4.047636e+08  4.073097e+06  4.493752e+09  1.219582e+09  ...   
2  4.310033e+07  5.091286e+07  4.073097e+06  2.426915e+07  2.436679e+07  ...   
3  5.394083e+07  5.091286e+07  4.073097e+06  7.353624e+07  7.780635e+07  ...   
4  5.876571e+09  2.992880e+08  4.073097e+06  3.583000e+06  7.429962e+06  ...   

      折旧率指数      应计指数  净利润是否大于0   毛利率指数是否大于1  资产质量指数是否大于1   营业收入指数是否大于1  \
0  1.156866 -0.021251         1            0            1             1   
1  0.903096  0.030618         1            0            1             1   
2  0.978994  0.173374         1            1            0             1   
3  0.916015 -0.008157         0            0            1             1   
4  0.973408 -0.068900         1            0            1             1   

   销售管理费用指数是否大于1   财务杠杆指数是否大于1   折旧率指数是否大于1  是否舞弊  
0              0             0            1     0  
1              0             1            0     0  
2              1             0            0     0  
3              0             1            0     0  
4              0             1            0     0  

[5 rows x 152 columns]

# 提取特征变量和目标变量
X = df.drop(columns=['是否舞弊'])
y = df['是否舞弊']

# 数据分割为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=76)

# 使用SMOTE进行过采样
smote = SMOTE(random_state=76)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# 查看合成后的数据集中多数类和少数类的数量
print("合成后的数据集中的类分布：")
print(y_train_resampled.value_counts())

# 数据标准化（可选，根据模型要求）
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_resampled = scaler.fit_transform(X_train_resampled)
X_test = scaler.transform(X_test)

合成后的数据集中的类分布：
是否舞弊
0    23562
1    23562
Name: count, dtype: int64

# 初始化LightGBM分类器
model = lgb.LGBMClassifier(random_state=76)

# 训练模型
#model.fit(X_train, y_train)
model.fit(X_train_resampled, y_train_resampled)

[LightGBM] [Info] Number of positive: 23562, number of negative: 23562
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011053 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 36569
[LightGBM] [Info] Number of data points in the train set: 47124, number of used features: 151
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

LGBMClassifier(random_state=76)

LGBMClassifier(random_state=76)

# 预测测试集
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# 评估模型性能
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("准确率：", accuracy)
print("AUC：", roc_auc)
print("混淆矩阵：\n", conf_matrix)
print("分类报告：\n", class_report)

准确率： 0.9944266171254855
AUC： 0.9274502436304575
混淆矩阵：
 [[5883   10]
 [  23    5]]
分类报告：
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      5893
           1       0.33      0.18      0.23        28

    accuracy                           0.99      5921
   macro avg       0.66      0.59      0.61      5921
weighted avg       0.99      0.99      0.99      5921

# 绘制混淆矩阵
plt.figure(figsize=(10, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.xlabel('predicted value')
plt.ylabel('actual value')
plt.title('Confusion Matrix')
plt.show()

# 绘制ROC曲线
from sklearn.metrics import roc_curve

fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc='lower right')
plt.show()

# 定义参数网格
# param_grid = {
#     'num_leaves': [31, 50, 70],
#     'max_depth': [-1, 10, 20, 30],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'n_estimators': [100, 200, 500],
#     'min_child_samples': [20, 30, 40],
#     'subsample': [0.6, 0.8, 1.0],
#     'colsample_bytree': [0.6, 0.8, 1.0]
# }
# 定义参数网格
param_grid = {
    'num_leaves': [31, 50],
    'learning_rate': [0.01, 0.05],
    'n_estimators': [100, 200]
}

# 初始化LightGBM分类器
model = lgb.LGBMClassifier(
    random_state=76,
    device='cuda',
    max_bin=63, # 减少bin数量可降低内存占用
    gpu_use_dp=False, # 使用单精度浮点数可降低内存占
    gpu_device_id=3,
    verbose=-1,
    )

# 使用GridSearchCV进行参数调优
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train_resampled, y_train_resampled)

# 输出最佳参数
print("最佳参数：", grid_search.best_params_)

# 使用最佳参数训练模型
best_model = grid_search.best_estimator_
best_model.fit(X_train_resampled, y_train_resampled)

[LightGBM] [Fatal] [CUDA] out of memory /tmp/pip-install-owi0_rx_/lightgbm_d96532b272d64fb89e2a0e320827ef32/src/boosting/cuda/cuda_score_updater.cpp 28

[LightGBM] [Fatal] [CUDA] out of memory /tmp/pip-install-owi0_rx_/lightgbm_d96532b272d64fb89e2a0e320827ef32/src/boosting/cuda/cuda_score_updater.cpp 28

[LightGBM] [Fatal] [CUDA] out of memory /tmp/pip-install-owi0_rx_/lightgbm_d96532b272d64fb89e2a0e320827ef32/src/boosting/cuda/cuda_score_updater.cpp 28

[LightGBM] [Fatal] [CUDA] out of memory /tmp/pip-install-owi0_rx_/lightgbm_d96532b272d64fb89e2a0e320827ef32/src/io/cuda/cuda_tree.cpp 124

/opt/anaconda3/envs/marl/lib/python3.8/site-packages/sklearn/model_selection/_validation.py:425: FitFailedWarning: 
4 fits failed out of a total of 24.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/envs/marl/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/envs/marl/lib/python3.8/site-packages/lightgbm/sklearn.py", line 1187, in fit
    super().fit(
  File "/opt/anaconda3/envs/marl/lib/python3.8/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/anaconda3/envs/marl/lib/python3.8/site-packages/lightgbm/engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/anaconda3/envs/marl/lib/python3.8/site-packages/lightgbm/basic.py", line 3437, in __init__
    _safe_call(_LIB.LGBM_BoosterCreate(
  File "/opt/anaconda3/envs/marl/lib/python3.8/site-packages/lightgbm/basic.py", line 263, in _safe_call
    raise LightGBMError(_LIB.LGBM_GetLastError().decode('utf-8'))
lightgbm.basic.LightGBMError: [CUDA] out of memory /tmp/pip-install-owi0_rx_/lightgbm_d96532b272d64fb89e2a0e320827ef32/src/boosting/cuda/cuda_score_updater.cpp 28


--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/envs/marl/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/envs/marl/lib/python3.8/site-packages/lightgbm/sklearn.py", line 1187, in fit
    super().fit(
  File "/opt/anaconda3/envs/marl/lib/python3.8/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/anaconda3/envs/marl/lib/python3.8/site-packages/lightgbm/engine.py", line 276, in train
    booster.update(fobj=fobj)
  File "/opt/anaconda3/envs/marl/lib/python3.8/site-packages/lightgbm/basic.py", line 3891, in update
    _safe_call(_LIB.LGBM_BoosterUpdateOneIter(
  File "/opt/anaconda3/envs/marl/lib/python3.8/site-packages/lightgbm/basic.py", line 263, in _safe_call
    raise LightGBMError(_LIB.LGBM_GetLastError().decode('utf-8'))
lightgbm.basic.LightGBMError: [CUDA] out of memory /tmp/pip-install-owi0_rx_/lightgbm_d96532b272d64fb89e2a0e320827ef32/src/io/cuda/cuda_tree.cpp 124


  warnings.warn(some_fits_failed_message, FitFailedWarning)
/opt/anaconda3/envs/marl/lib/python3.8/site-packages/sklearn/model_selection/_search.py:979: UserWarning: One or more of the test scores are non-finite: [       nan 0.96154638 0.88680405 0.99323972        nan 0.9998272
 0.92893804        nan]
  warnings.warn(

最佳参数： {'learning_rate': 0.05, 'n_estimators': 100, 'num_leaves': 50}

LGBMClassifier(device='cuda', gpu_device_id=3, gpu_use_dp=False,
               learning_rate=0.05, max_bin=63, num_leaves=50, random_state=76,
               verbose=-1)

LGBMClassifier(device='cuda', gpu_device_id=3, gpu_use_dp=False,
               learning_rate=0.05, max_bin=63, num_leaves=50, random_state=76,
               verbose=-1)

# 定义参数网格
# param_grid = {
#     'num_leaves': [31, 50, 70],
#     'max_depth': [-1, 10, 20, 30],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'n_estimators': [100, 200, 500],
#     'min_child_samples': [20, 30, 40],
#     'subsample': [0.6, 0.8, 1.0],
#     'colsample_bytree': [0.6, 0.8, 1.0]
# }
# 定义参数网格
param_grid = {
    'max_depth': [-1, 10],
    'min_child_samples': [20, 30],
    'subsample': [0.6, 0.8],
}

# 初始化LightGBM分类器
model = lgb.LGBMClassifier(
    random_state=76,
    device='cuda',
    max_bin=63, # 减少bin数量可降低内存占用
    gpu_use_dp=False, # 使用单精度浮点数可降低内存占
    gpu_device_id=3,
    verbose=-1,

    learning_rate=0.05, 
    n_estimators=100,
    num_leaves=50
    )

# 使用GridSearchCV进行参数调优
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train_resampled, y_train_resampled)

# 输出最佳参数
print("最佳参数：", grid_search.best_params_)

# 使用最佳参数训练模型
best_model = grid_search.best_estimator_
best_model.fit(X_train_resampled, y_train_resampled)

最佳参数： {'max_depth': -1, 'min_child_samples': 30, 'subsample': 0.8}

LGBMClassifier(device='cuda', gpu_device_id=3, gpu_use_dp=False,
               learning_rate=0.05, max_bin=63, min_child_samples=30,
               num_leaves=50, random_state=76, subsample=0.8, verbose=-1)

LGBMClassifier(device='cuda', gpu_device_id=3, gpu_use_dp=False,
               learning_rate=0.05, max_bin=63, min_child_samples=30,
               num_leaves=50, random_state=76, subsample=0.8, verbose=-1)

# 预测测试集
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# 评估模型性能
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("准确率：", accuracy)
print("AUC：", roc_auc)
print("混淆矩阵：\n", conf_matrix)
print("分类报告：\n", class_report)

准确率： 0.993244384394528
AUC： 0.9506011975467261
混淆矩阵：
 [[5873   20]
 [  20    8]]
分类报告：
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      5893
           1       0.29      0.29      0.29        28

    accuracy                           0.99      5921
   macro avg       0.64      0.64      0.64      5921
weighted avg       0.99      0.99      0.99      5921

# 绘制混淆矩阵
plt.figure(figsize=(10, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.xlabel('predicted value')
plt.ylabel('actual value')
plt.title('Confusion Matrix')
plt.show()

# 绘制ROC曲线
from sklearn.metrics import roc_curve

fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc='lower right')
plt.show()

财务舞弊模型¶

1. 导入必要的库¶

2. 从Excel文件中读取数据并预处理¶

为了处理不平衡数据集，我们可以使用一些技术，例如：¶

.过采样（Oversampling）：增加少数类的样本数量。¶

.欠采样（Undersampling）：减少多数类的样本数量。¶

.合成少数类样本（Synthetic Minority Over-sampling Technique, SMOTE）：生成少数类的合成样本。¶

.调整分类阈值：在分类模型中调整预测的阈值以平衡精度和召回率。¶

.使用惩罚机制的模型：如带有类权重的模型，增加对少数类的惩罚力度。¶

在这里，我们可以尝试使用SMOTE进行过采样，以增加少数类的样本数量，从而平衡数据集。¶

3. 训练LightGBM模型¶

4. 模型预测和评估¶

5. 可视化结果¶

6. LightGBM参数调优¶

再次优化¶

7. 再次预测测试集并可视化¶