Python) Catboost 모델링 및 Shap으로 분석해보기
2022. 11. 27. 23:07ㆍML(머신러닝)/Tree Based Model
In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container {width:90% !important;}</style>"))
/tmp/ipykernel_2227596/3510566465.py:1: DeprecationWarning: Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display
from IPython.core.display import display, HTML
In [2]:
import shap
# train XGBoost model
X,y = shap.datasets.adult()
In [5]:
import pandas as pd
import sys
ROOT_DIR = "./../"
sys.path.append(ROOT_DIR)
import shap
import matplotlib.pyplot as plt ## 시각화
import matplotlib.font_manager as fm## 시각화
import matplotlib
import catboost as cb
import numpy as np
import warnings ## 에러 방지
warnings.filterwarnings(action='ignore')
## 한글 폰트 (나눔) 적용하는 코드
plt.rc('axes',unicode_minus=False)
# plt.rcParams["font.family"] = 'NanumGothic'
# matplotlib.rcParams['axes.unicode_minus'] = False
# plt.rc('font', family='NanumGothic')
# path = '/usr/share/fonts/truetype/nanum/NanumGothicCoding.ttf'
plt.rc('font', family='NanumBarunGothic') # 폰트 설정
# plt.rc("font", family = "Malgun Gothic")
plt.rc('axes', unicode_minus=False) # 마이너스 폰트 설정
from IPython.display import set_matplotlib_formats
set_matplotlib_formats("retina")
In [18]:
X,y = shap.datasets.adult(display=True)
cat_features = list(X.select_dtypes("category"))
train_dataset = cb.Pool(X, y,cat_features=cat_features)
model = cb.CatBoostClassifier(verbose=False)
grid = {'iterations': [100, 150, 200],
'learning_rate': [0.03, 0.05,0.07,0.1],
'depth': [6, 8,10],
'l2_leaf_reg': [0.2, 0.5, 1, 3]}
# model.grid_search(grid, train_dataset)
model_result = model.randomized_search(grid, train_dataset,n_iter=2,
plot=False,
verbose=False,partition_random_seed=0)
bestTest = 0.2842787463
bestIteration = 99
bestTest = 0.2856006001
bestIteration = 149
Training on fold [0/3]
bestTest = 0.2923580878
bestIteration = 98
Training on fold [1/3]
bestTest = 0.2938665719
bestIteration = 99
Training on fold [2/3]
bestTest = 0.2807298121
bestIteration = 97
In [19]:
pd.DataFrame([model.get_params()])
Out[19]:
verbose | depth | l2_leaf_reg | iterations | learning_rate | |
---|---|---|---|---|---|
0 | False | 8 | 1 | 100 | 0.07 |
In [20]:
sorted_feature_importance = model.feature_importances_.argsort()
plt.barh(np.array(list(X))[sorted_feature_importance],
model.feature_importances_[sorted_feature_importance],
color='turquoise')
plt.xlabel("CatBoost Feature Importance")
Out[20]:
Text(0.5, 0, 'CatBoost Feature Importance')
In [21]:
approximate = model.get_feature_importance(train_dataset, type="ShapValues", shap_calc_type="Approximate")[:, :-1]
shap.summary_plot(approximate, X,plot_type="bar")
In [22]:
approximate = model.get_feature_importance(train_dataset, type="ShapValues", shap_calc_type="Exact")[:, :-1]
shap.summary_plot(approximate, X,plot_type="bar")
In [23]:
approximate = model.get_feature_importance(train_dataset, type="ShapValues", shap_calc_type="Regular")[:, :-1]
shap.summary_plot(approximate, X,plot_type="bar")
In [24]:
shap.dependence_plot("Relationship", approximate, X)
In [25]:
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(cb.Pool(X, y, cat_features=cat_features))
In [26]:
shap.initjs()
sample_idxs = [0,1,2]
shap.force_plot(explainer.expected_value, shap_values[sample_idxs,], X.iloc[sample_idxs,:])
Out[26]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [27]:
shap.summary_plot(shap_values, X, plot_type="bar")
In [28]:
shap.summary_plot(shap_values, X)
In [29]:
cols = list(X.select_dtypes("category"))
import category_encoders as ce
encoder = ce.OrdinalEncoder(cols=cols)
In [30]:
shap_values_v2 = explainer(
cb.Pool(X, y, cat_features=cat_features))
exp =shap.Explanation(
values=shap_values_v2.values ,
base_values= shap_values_v2.base_values,
data=encoder.fit_transform(X).values,
feature_names=list(X),
display_data=X.values
)
In [31]:
for col in cols :
shap.plots.scatter(exp[:,col])
In [32]:
shap_values_v2 = explainer(cb.Pool(X, y, cat_features=cat_features))
exp =shap.Explanation(
values=shap_values_v2.values ,
base_values= shap_values_v2.base_values,
data=X.values,
feature_names=list(X),
display_data=X.values
)
In [34]:
import matplotlib.pyplot as plt
from shap.plots import waterfall
waterfall(exp[2],show=True,max_display=22)
In [35]:
shap.plots.heatmap(exp[0:500],max_display=22)
In [ ]:
728x90
'ML(머신러닝) > Tree Based Model' 카테고리의 다른 글
CatBoost란? unbiased boosting with categorical features - 2 (0) | 2019.05.21 |
---|---|
CatBoost란? unbiased boosting with categorical features - 1 (2) | 2019.05.21 |
지도학습 결정트리 앙상블(Randomforest, Gradient Boosting) (0) | 2018.01.05 |