A Brief Comparison of Random Forest and GBDT, with Code Samples

This post briefly compares the strengths and weaknesses of the Random Forest and GBDT models, and walks through a worked example of each to show how it performs.

Random Forest

  • A random forest is a classifier that trains many decision trees on the samples and aggregates their predictions; the individual trees are built independently of one another (see the sketch below)
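To make the "many independent trees" idea concrete, here is a minimal sketch of the bagging loop behind a random forest. It is illustration only, not the model used later in this post (that one is sklearn's RandomForestClassifier); the helper names fit_simple_forest / predict_simple_forest are made up for this sketch, and X, y are assumed to be NumPy arrays.

# Minimal bagging sketch (illustration only): each tree sees a bootstrap sample
# of the rows, and DecisionTreeClassifier's max_features handles the per-split
# feature subsampling. The forest prediction averages per-tree probabilities.
import numpy as np
from sklearn.tree import DecisionTreeClassifier

def fit_simple_forest(X, y, n_trees=100, max_features='sqrt', random_state=0):
    rng = np.random.RandomState(random_state)
    trees = []
    n = len(X)
    for _ in range(n_trees):
        idx = rng.randint(0, n, n)  # bootstrap sample, drawn with replacement
        tree = DecisionTreeClassifier(max_features=max_features,
                                      random_state=rng.randint(1 << 30))
        tree.fit(X[idx], y[idx])
        trees.append(tree)
    return trees

def predict_simple_forest(trees, X):
    # average the per-tree class-1 probabilities, then threshold at 0.5
    prob = np.mean([t.predict_proba(X)[:, 1] for t in trees], axis=0)
    return (prob >= 0.5).astype(int)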

Pros and cons

Pros

  • Performs well on many datasets
  • Both variance and bias are relatively low, so it generalizes well
  • While the forest is being built, the out-of-bag samples provide an unbiased estimate of the generalization error
  • It can handle very high-dimensional data (many features) without explicit feature selection
  • After training it can report how important each feature is, which is very practical
  • Highly parallelizable and easy to run in a distributed fashion, so training is fast
  • During training it can detect interactions between features
  • As a tree model it needs no feature normalization and can be applied directly, so it is simple to use

Cons

  • It can overfit on classification or regression problems with a lot of noise
  • Splits are biased toward features with many distinct values
  • It ignores correlations between attributes

Code sample

import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from pandas import DataFrame
model_data = pd.read_csv('model_data.csv')

Inspect the data (the data-cleaning steps are omitted here)

model_data.head()
   Var1  Var2  Var3  Var4  Var5  Var6  Var7  Var8  Var9  Var10  ...  Var36  Var37  Var38  Var39  Var40  Var41  Var42  Var43  Var44  tag_y
0     1    -1    -1    -1     3   703    -1    -1    -1      1  ...      0      0      1      0      0      1     11     37      0      0
1     1    -1    -1    -1     3    57     2    51    91      2  ...      0      1      1      0      0      0     11     46      1      0
2     1     3     0    -1     3   815     2    51    -1      1  ...      0      1      0      0      0      1     11     40      1      0
3     1    -1    -1     5     3   354     2    51    90      1  ...      0      1      0      0      0      1     11     39      0      0
4     3    -1    -1    -1     3   391     1    61    90      1  ...      0      0      0      0      0      1     11     37      1      0

5 rows × 45 columns

model_data.describe()
(Descriptive statistics table omitted.)

Correlation check between each variable and the target

key = 'tag_y'
coef_df = []
for col in model_data.columns:
    # Pearson correlation between this variable and the target
    corrcoef = np.corrcoef(model_data[col], model_data[key])[0, 1]
    coef_df.append({'变量': col, '相关系数': corrcoef, '相关系数绝对值': abs(corrcoef)})
DataFrame(coef_df).sort_values('相关系数绝对值', ascending=False)
变量 相关系数 相关系数绝对值
44 tag_y 1.000000 1.000000
23 Var24 -0.387025 0.387025
16 Var17 0.210136 0.210136
27 Var28 -0.157304 0.157304
22 Var23 -0.150241 0.150241
15 Var16 -0.095761 0.095761
20 Var21 0.083199 0.083199
2 Var3 -0.082515 0.082515
17 Var18 -0.081642 0.081642
25 Var26 -0.080799 0.080799
24 Var25 -0.080647 0.080647
1 Var2 -0.068508 0.068508
12 Var13 -0.066173 0.066173
30 Var31 -0.063315 0.063315
0 Var1 0.062513 0.062513
13 Var14 -0.058684 0.058684
18 Var19 0.055990 0.055990
7 Var8 -0.055442 0.055442
40 Var41 0.051801 0.051801
39 Var40 0.050829 0.050829
31 Var32 0.044908 0.044908
36 Var37 -0.042666 0.042666
28 Var29 0.042470 0.042470
42 Var43 0.042357 0.042357
33 Var34 -0.033433 0.033433
34 Var35 -0.031154 0.031154
32 Var33 0.029990 0.029990
11 Var12 -0.025125 0.025125
14 Var15 0.024673 0.024673
9 Var10 0.023796 0.023796
3 Var4 -0.017289 0.017289
5 Var6 -0.016744 0.016744
37 Var38 -0.016197 0.016197
10 Var11 0.015352 0.015352
4 Var5 0.014266 0.014266
6 Var7 -0.013188 0.013188
19 Var20 -0.013174 0.013174
41 Var42 -0.012301 0.012301
21 Var22 -0.010746 0.010746
8 Var9 0.010663 0.010663
35 Var36 0.009791 0.009791
43 Var44 -0.006633 0.006633
26 Var27 -0.004636 0.004636
29 Var30 0.002331 0.002331
38 Var39 -0.000359 0.000359

Random Forest modeling

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from numpy import array
Y = array(model_data[key])
X = model_data.drop(key, axis=1)
## Split the data into training and test sets (an 80/20 split here)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
## Model parameters
model_config = {
    'model': RandomForestClassifier,
    'kargs': {
        'n_estimators': 1000,
        'class_weight': 'balanced',
        'max_features': 'auto',   # 'auto' means sqrt(n_features) for classifiers; newer sklearn spells this 'sqrt'
        'max_depth': 5,
        'min_samples_leaf': 100,
        'random_state': 33,
        'bootstrap': True,
        'oob_score': True
    }
}
model = model_config['model']
other_args = model_config['kargs']
# instantiate the model with the parameters above
clf = model(**other_args)
clf
## Fit the model on the training data
clf = clf.fit(X_train, y_train)
clf.oob_score_  # out-of-bag accuracy (estimated on held-out bootstrap samples, not the training or test set)
0.67633816908454225
## Feature importance output
importance = pd.DataFrame({'dummy_variable': X_train.columns, 'importance': clf.feature_importances_})\
    .sort_values('importance', ascending=False)
importance['importance'] = importance['importance'].apply(lambda x: round(x, 3))
importance
dummy_variable importance
23 Var24 0.435
16 Var17 0.127
15 Var16 0.090
27 Var28 0.076
22 Var23 0.053
13 Var14 0.024
17 Var18 0.021
25 Var26 0.020
20 Var21 0.019
5 Var6 0.016
26 Var27 0.014
42 Var43 0.012
30 Var31 0.010
2 Var3 0.009
41 Var42 0.007
8 Var9 0.006
0 Var1 0.006
7 Var8 0.006
1 Var2 0.005
18 Var19 0.004
28 Var29 0.004
3 Var4 0.004
12 Var13 0.004
11 Var12 0.003
39 Var40 0.003
24 Var25 0.003
6 Var7 0.003
14 Var15 0.003
10 Var11 0.003
21 Var22 0.003
40 Var41 0.002
36 Var37 0.002
33 Var34 0.001
9 Var10 0.001
29 Var30 0.000
43 Var44 0.000
31 Var32 0.000
37 Var38 0.000
34 Var35 0.000
32 Var33 0.000
35 Var36 0.000
38 Var39 0.000
4 Var5 0.000
19 Var20 0.000
## Predicted probabilities for the test set
y_pred_prob = clf.predict_proba(X_test)[:, 1]
from sklearn.metrics import roc_curve, auc, roc_auc_score, precision_recall_curve, f1_score
fpr, tpr, threshold = roc_curve(y_test, y_pred_prob)  # fpr: false positive rate; tpr: true positive rate

AUC and the ROC curve

roc_auc = auc(fpr, tpr)
roc_auc
0.79153905777374467
%matplotlib inline
import seaborn as sns
sns.set(style="whitegrid")
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = "Microsoft YaHei"  # a CJK-capable font so the Chinese plot titles render (alternative: "Droid Sans Fallback")
plt.rcParams['xtick.labelsize'] = 16
plt.rcParams['ytick.labelsize'] = 16
plt.rcParams['axes.titlesize'] = 16
plt.rcParams['figure.titlesize'] = 20
plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC曲线')
plt.legend(loc="lower right")
plt.savefig('随机森林ROC曲线.png')  # save before show(), otherwise the saved file can end up blank
plt.show()
(Figure: Random Forest ROC curve)

K-S statistic and K-S curve

ks = tpr - fpr  # the K-S statistic is the largest gap between TPR and FPR across thresholds
max(ks)
0.47769207501512401
plt.figure()
lw = 2
plt.plot(threshold, ks, color='darkorange',
         lw=lw, label='KS curve (max = %0.2f)' % max(ks))
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Threshold')
plt.ylabel('K-S value')
plt.title('K-S曲线')
plt.legend(loc="lower right")
plt.savefig('随机森林KS曲线.png')  # save before show()
plt.show()
(Figure: Random Forest K-S curve)

PR curve and F1-score curve

precision, recall, threshold_pr = precision_recall_curve(y_test, y_pred_prob)
f1_score_ = 2 * recall * precision / (precision + recall)  # F1 at every point on the PR curve
plt.figure()
lw = 2
plt.plot(recall, precision, color='darkorange',
         lw=lw, label='PRC curve')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall Rate')
plt.ylabel('Precision Rate')
plt.title('PRC曲线')
plt.legend(loc="lower right")
plt.savefig('随机森林PRC曲线.png')  # save before show()
plt.show()
(Figure: Random Forest PR curve)
plt.figure()
lw = 2
plt.plot(threshold_pr, f1_score_[:-1], color='darkorange',
         lw=lw, label='f1_score curve')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('threshold')
plt.ylabel('f1_score')
plt.title('f1_score曲线')
plt.legend(loc="center right")  # "middle right" is not a valid matplotlib legend location
plt.savefig('随机森林f1_score曲线.png')  # save before show()
plt.show()
(Figure: Random Forest F1-score curve)
max(f1_score_)
0.63074901445466491

Threshold vs. retention

# jfdata provides internal helper utilities used in the original analysis;
# a pandas-only sketch of the same idea follows the figure below
from jfdata.utilities.model_evaluation import evaluate_binary_classifer
from jfdata.utilities.visualization import line_plot
score_df = pd.DataFrame({'y': pd.Series(y_test).map({1: True, 0: False}), 'prob': y_pred_prob})
# the four (bool, bool) columns encode outcome scores in the format the internal
# evaluate_binary_classifer helper expects
score_df[(True, True)] = 0
score_df[(True, False)] = 1
score_df[(False, False)] = 1
score_df[(False, True)] = 0
s = evaluate_binary_classifer(score_df, 20)
annotations = ['{0:0.2f}%'.format(100 * rate) for rate in s['target_rate']]
plot_x = s.threshold
plot_y = s.score
x_label = '风险度阈值'
y_label = '留存数量'
title = '不同的风险阈值对留存的正常用户和bad占比的影响'
f1 = line_plot(plot_x, plot_y, title, x_label, y_label, annotations)
plt.savefig('随机森林阈值.png')
(Figure: Random Forest threshold vs. retention)
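Because evaluate_binary_classifer and line_plot are internal jfdata utilities, here is a rough pandas-only sketch of the same kind of threshold-vs-retention analysis. It is an approximation of the idea (for each candidate risk threshold, how many users are retained and what fraction of them are bad), not a reimplementation of the jfdata helpers; threshold_retention_table is a made-up name for this sketch.

import numpy as np
import pandas as pd

def threshold_retention_table(y_true, y_prob, n_bins=20):
    # For each threshold on the risk score, keep the users whose predicted risk
    # is below it and record how many are kept and the bad rate among them.
    rows = []
    for thr in np.linspace(0, 1, n_bins + 1):
        kept = y_prob < thr
        rows.append({
            'threshold': thr,
            'retained': int(kept.sum()),
            'bad_rate': float(y_true[kept].mean()) if kept.any() else float('nan'),
        })
    return pd.DataFrame(rows)

# Example: retention_df = threshold_retention_table(np.asarray(y_test), y_pred_prob)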

Gradient Boosted Decision Trees (GBDT)

Overview / notes

  • The trees in GBDT are regression trees rather than classification trees; the GBDT prediction is the sum of the outputs of all the trees
  • In gradient boosting, each new tree is built so that the residual of the current ensemble is reduced along the (negative) gradient direction (see the sketch after this list)
  • The GBDT objective consists of two parts, a loss term and a regularization term:
    • the loss term should be as small as possible, so that the model fits the samples well
    • the regularization term keeps the model from overfitting
    • training seeks a balance between the two
  • Feature split rule:
    • enumerate all features; for each feature find the split point with the largest gain and record that gain
    • take the feature whose best split point has the largest gain as the next split
    • repeat this greedy procedure to grow a complete decision tree
  • Every split aims to add information gain; if the gain after a split would be negative, splitting stops
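A minimal sketch of the boosting loop for squared-error regression, where the negative gradient is simply the residual. The helper names fit_simple_gbdt / predict_simple_gbdt are invented for this sketch, and this is not the implementation scikit-learn uses; it only illustrates the residual-fitting idea from the notes above.

import numpy as np
from sklearn.tree import DecisionTreeRegressor

def fit_simple_gbdt(X, y, n_trees=100, learning_rate=0.1, max_depth=3):
    init = y.mean()                       # start from a constant prediction
    F = np.full(len(y), init)
    trees = []
    for _ in range(n_trees):
        residual = y - F                  # negative gradient of 1/2*(y - F)^2
        tree = DecisionTreeRegressor(max_depth=max_depth)
        tree.fit(X, residual)             # splits inside the tree are chosen greedily by gain
        F = F + learning_rate * tree.predict(X)
        trees.append(tree)
    return init, trees

def predict_simple_gbdt(init, trees, X, learning_rate=0.1):
    # the ensemble prediction is the initial value plus the sum of shrunken tree outputs
    return init + learning_rate * np.sum([t.predict(X) for t in trees], axis=0)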

Pros and cons

Pros

  • High prediction accuracy
  • Handles non-linear relationships and multiple feature types
  • Well suited to low-dimensional, dense data
  • Handles all kinds of data flexibly, both continuous and discrete values, and needs no feature normalization
  • Can reach fairly high accuracy with relatively little tuning time
  • With a robust loss function it is very resilient to outliers

Cons

  • The weak learners depend on one another, so training is a sequential process that is hard to parallelize
  • Computationally expensive
  • Not well suited to high-dimensional sparse features

Which loss functions does GBDT use? For example, Huber loss and quantile loss for regression, as well as squared error and log loss, etc.; the sketch below shows how these are selected in scikit-learn.
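In scikit-learn the loss is chosen through the loss parameter of the gradient boosting estimators. A brief sketch (the option names follow the older scikit-learn release used in this post, where the classifier loss is called 'deviance'; newer releases rename some options):

from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier

# Regression with a robust Huber loss (less sensitive to outliers than squared error)
gbr_huber = GradientBoostingRegressor(loss='huber', n_estimators=100)

# Quantile regression: predict e.g. the 90th percentile instead of the conditional mean
gbr_q90 = GradientBoostingRegressor(loss='quantile', alpha=0.9, n_estimators=100)

# Classification uses log loss, called 'deviance' in this scikit-learn version
gbc = GradientBoostingClassifier(loss='deviance', n_estimators=100)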

Code sample

from sklearn.ensemble import GradientBoostingClassifier
model_config = {
    'model': GradientBoostingClassifier,
    'kargs': {
        'n_estimators': 100,
        'learning_rate': 0.1,
        'max_features': 5,
        'max_depth': 4,
        'min_samples_leaf': 100,
        'random_state': 33,
    }
}
model = model_config['model']
other_args = model_config['kargs']

GBDT modeling

from sklearn.ensemble import GradientBoostingClassifier
gbdt = GradientBoostingClassifier(**other_args)
gbdt.fit(X_train,y_train)
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=4,
              max_features=5, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=100,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=33,
              subsample=1.0, verbose=0, warm_start=False)

Evaluation metrics

pred_y = gbdt.predict(X_test)  # the default classification threshold is 0.5
pd.crosstab(pd.Series(y_test, name='actual'), pd.Series(pred_y, name='predicted'))
predicted    0    1
actual
0          629   67
1          147  157
from sklearn.metrics import classification_report
print(classification_report(y_test, pred_y, digits=4))
             precision    recall  f1-score   support

          0     0.8106    0.9037    0.8546       696
          1     0.7009    0.5164    0.5947       304

avg / total     0.7772    0.7860    0.7756      1000

ROC curve

y_pred_prob = gbdt.predict_proba(X_test)[:,1]
fpr, tpr, threshold = roc_curve(y_test, y_pred_prob)# fpr:false positive rate;tpr:true positive rate
roc_auc = auc(fpr, tpr)
roc_auc
0.81887571839080453
plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC曲线')
plt.legend(loc="lower right")
plt.savefig('GBDT_ROC曲线.png')  # save before show()
plt.show()
(Figure: GBDT ROC curve)

K-S statistic and K-S curve

ks = tpr-fpr
max(ks)
0.50457501512401692
plt.figure()
lw = 2
plt.plot(threshold, ks, color='darkorange',
         lw=lw, label='KS curve (max = %0.2f)' % max(ks))
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Threshold')
plt.ylabel('K-S value')
plt.title('K-S曲线')
plt.legend(loc="lower right")
plt.savefig('GBDT_KS曲线.png')  # save before show()
plt.show()
(Figure: GBDT K-S curve)

PR curve and F1-score curve

precision, recall, threshold_pr = precision_recall_curve(y_test, y_pred_prob)
f1_score_ = 2*recall*precision/(precision + recall)
plt.figure()
lw = 2
plt.plot(recall, precision, color='darkorange',
         lw=lw, label='PRC curve')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall Rate')
plt.ylabel('Precision Rate')
plt.title('PRC曲线')
plt.legend(loc="lower right")
plt.savefig('GBDT_PRC曲线.png')  # save before show()
plt.show()
(Figure: GBDT PR curve)
plt.figure()
lw = 2
plt.plot(threshold_pr, f1_score_[:-1], color='darkorange',
         lw=lw, label='f1_score curve')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('threshold')
plt.ylabel('f1_score')
plt.title('f1_score曲线')
plt.legend(loc="center right")
plt.savefig('GBDT_f1_score曲线.png')  # save before show()
plt.show()
(Figure: GBDT F1-score curve)

Choosing a threshold

score_df = pd.DataFrame({'y': pd.Series(y_test).map({1: True, 0: False}), 'prob': y_pred_prob})
score_df[(True, True)] = 0
score_df[(True, False)] = 1
score_df[(False, False)] = 1
score_df[(False, True)] = 0
s = evaluate_binary_classifer(score_df, 20)
annotations = ['{0:0.2f}%'.format(100 * rate) for rate in s['target_rate']]
plot_x = s.threshold
plot_y = s.score
# x_label, y_label and title are reused from the Random Forest threshold plot above
f1 = line_plot(plot_x, plot_y, title, x_label, y_label, annotations)
plt.savefig('GBDT阈值.png')
(Figure: GBDT threshold vs. retention)

Automatic hyperparameter tuning

# sklearn.grid_search and sklearn.cross_validation are the pre-0.18 module names;
# newer releases expose the same functionality in sklearn.model_selection (see the sketch at the end)
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold
gbdt = GradientBoostingClassifier()
cross_validation = StratifiedKFold(pd.Series(y_train), n_folds=10)
parameter_grid = {'max_depth': [2, 3, 4, 5],
                  'max_features': [1, 3, 5, 7, 9],
                  'n_estimators': [100, 300, 500, 1000]}
grid_search = GridSearchCV(gbdt, param_grid=parameter_grid, cv=cross_validation, scoring='accuracy')
grid_search.fit(X_train, pd.Series(y_train))
GridSearchCV(cv=sklearn.cross_validation.StratifiedKFold(labels=[0 1 ..., 1 0], n_folds=10, shuffle=False, random_state=None),
       error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [100, 300, 500, 1000], 'max_depth': [2, 3, 4, 5], 'max_features': [1, 3, 5, 7, 9]},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)
# best cross-validated score
grid_search.best_score_
0.7763881940970485
# best parameter combination
grid_search.best_params_
{'max_depth': 5, 'max_features': 9, 'n_estimators': 100}
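For newer scikit-learn releases, where grid_search and cross_validation were merged into model_selection and StratifiedKFold no longer takes the labels as a constructor argument, the same search would look roughly like this (a hedged sketch, not run against the data above):

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold

parameter_grid = {'max_depth': [2, 3, 4, 5],
                  'max_features': [1, 3, 5, 7, 9],
                  'n_estimators': [100, 300, 500, 1000]}

# StratifiedKFold is configured with n_splits only; the labels are passed in fit()
cv = StratifiedKFold(n_splits=10)
grid_search = GridSearchCV(GradientBoostingClassifier(), param_grid=parameter_grid,
                           cv=cv, scoring='accuracy')
# grid_search.fit(X_train, y_train)
# grid_search.best_score_, grid_search.best_params_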