[ Python ] Scikit-Learn Pipeline + RandomizedSearchCV + shap,eli5

2019. 12. 28. 18:06분석 Python/Scikit Learn (싸이킷런)


이번 글에서는 전체 모델링을 하고 나서 모델 해석을 위해 eli5 , shap을 사용하려고 한다.
핵심 포인트는 Pipeline과 Shap , Eli5를 보시면 될 것 같다.
모델 해석 도구로는 lime, shap, eli5가 있는데, 다 좋지만 개인적으로 shap을 선호하므로, 좀 더 잘 알기 위해 추후에 따로 정리해보려고 한다.

import numpy  as np
import pandas as pd
import matplotlib.pyplot as plt
import eli5
import shap
import seaborn as sns
# load the dataset
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.preprocessing import StandardScaler , RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier

### data 
# Read the income table from disk, then strip surrounding whitespace from
# every header name so columns can be addressed by their bare names.
income = pd.read_csv("./../Data/income_evaluation.csv")
newcol = [header.strip() for header in income.columns]
income.columns = newcol

### PipeLine Class 만들기 ###

# Custom Transformer that extracts columns passed as argument
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps only the requested columns.

    Parameters
    ----------
    feature_names
        Column label(s) used to index the input in ``transform``.
    """

    def __init__(self, feature_names):
        self.feature_names = feature_names

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer unchanged."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` indexed by the configured ``feature_names``."""
        wanted = self.feature_names
        return X[wanted]
class MissingTransformer(BaseEstimator, TransformerMixin):
    """Impute missing values with a wrapped imputer and return a DataFrame.

    The imputer is fitted once in ``fit`` and only applied in ``transform``,
    so held-out data is filled with the statistics learned from the data the
    transformer was fitted on (instead of being re-fitted on every call).

    Parameters
    ----------
    MissingImputer
        Object exposing ``fit`` and ``transform``; the pipelines in this
        file pass a ``SimpleImputer``.
    """

    def __init__(self, MissingImputer):
        self.MissingImputer = MissingImputer

    def fit(self, X, y=None):
        """Fit a private copy of the imputer on ``X`` and return ``self``."""
        from sklearn.base import clone

        # Work on a clone so the constructor argument itself stays unfitted;
        # safe=False falls back to a deep copy for non-sklearn imputers.
        self.imputer_ = clone(self.MissingImputer, safe=False).fit(X)
        return self

    def transform(self, X, y=None):
        """Return ``X`` with missing values filled, as a DataFrame.

        Column labels and the row index of ``X`` are carried over to the
        result.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called yet.
        """
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, "imputer_")
        filled = self.imputer_.transform(X)
        return pd.DataFrame(filled, columns=X.columns.tolist(), index=X.index)
# scales the numerical features
class NumericalTransformer(BaseEstimator, TransformerMixin):
    """Scale every column of a numerical DataFrame.

    The column named ``"age"`` is scaled with ``RobustScaler``; every other
    column with ``StandardScaler``. The scalers are fitted in ``fit`` and
    reused in ``transform``, so held-out data is scaled with the statistics
    of the data the transformer was fitted on.

    Parameters
    ----------
    new_features : bool, default=True
        Kept for interface compatibility; this transformer does not read it.
    """

    def __init__(self, new_features=True):
        self.new_features = new_features

    @staticmethod
    def _as_column(series):
        """Return the values of ``series`` as an ``(n, 1)`` array."""
        return series.values.reshape(-1, 1)

    def fit(self, X, y=None):
        """Fit one scaler per column of ``X`` and return ``self``."""
        self.scalers_ = {}
        for name in X.columns.to_list():
            scaler = RobustScaler() if name == "age" else StandardScaler()
            self.scalers_[name] = scaler.fit(self._as_column(X[name]))
        return self

    def transform(self, X, y=None):
        """Return a scaled copy of ``X`` (a DataFrame with the same columns).

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called yet.
        KeyError
            If ``X`` has a column that was not present during ``fit``.
        """
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, "scalers_")
        df = X.copy()
        for name in df.columns.to_list():
            scaled = self.scalers_[name].transform(self._as_column(df[name]))
            # Flatten the (n, 1) scaler output before storing it as a column.
            df[name] = scaled.ravel()
        return df
# encodes the categorical features as integer codes
class CategoricalTransformer(BaseEstimator, TransformerMixin):
    """Integer-encode every column of a categorical DataFrame.

    The set of categories per column is learned in ``fit`` and reused in
    ``transform``, so a given label always maps to the same code regardless
    of which rows are being transformed. Labels not seen during ``fit`` are
    encoded as ``-1``.

    Parameters
    ----------
    new_features : bool, default=True
        When true, ``workclass`` values equal to ``?`` become ``Unknown`` and
        ``native-country`` is collapsed to ``usa`` / ``non_usa`` before
        encoding. Either column may be absent.
    """

    def __init__(self, new_features=True):
        self.new_features = new_features

    def _engineer(self, X):
        """Return a copy of ``X`` with the optional recoding applied."""
        df = X.copy()
        if not self.new_features:
            return df
        if 'workclass' in df.columns:
            # Treat ? workclass as unknown. Compare on the stripped text: the
            # country check in the original code matched ' United-States'
            # with a leading space, so '?' may be stored as ' ?' as well.
            unknown = df['workclass'].astype(str).str.strip() == '?'
            df.loc[unknown, 'workclass'] = 'Unknown'
        if 'native-country' in df.columns:
            # Too many category levels, convert to just US and Non-US
            is_usa = df['native-country'].astype(str).str.strip() == 'United-States'
            df.loc[~is_usa, 'native-country'] = 'non_usa'
            df.loc[is_usa, 'native-country'] = 'usa'
        return df

    def fit(self, X, y=None):
        """Learn the category set of every column and return ``self``."""
        df = self._engineer(X)
        self.categories_ = {
            name: pd.Categorical(df[name]).categories
            for name in df.columns.to_list()
        }
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with every column replaced by its codes.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called yet.
        KeyError
            If ``X`` has a column that was not present during ``fit``.
        """
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, "categories_")
        df = self._engineer(X)
        for name in df.columns.to_list():
            encoded = pd.Categorical(df[name], categories=self.categories_[name])
            df[name] = encoded.codes
        return df

### Pipeline 구축하기 ###

# NOTE(review): ``X``, ``y`` and ``seed`` are used below but are not defined
# anywhere in the visible source — presumably the feature frame / target
# taken from ``income`` and an integer seed. Define them before this section.

# get the categorical feature names
categorical_features = X.select_dtypes("object").columns.to_list()
# get the numerical feature names
# NOTE(review): only float columns are picked up here; columns of any other
# dtype (e.g. integers) end up in neither pipeline — confirm this is intended.
numerical_features = X.select_dtypes("float").columns.to_list()
# create the steps for the categorical pipeline
categorical_steps = [
    ('cat_selector', FeatureSelector(categorical_features)),
    # NOTE(review): the original line was cut off after ``strategy='constant',``;
    # the call is closed here without the lost argument(s) — confirm the
    # intended ``fill_value``.
    ('imputer', MissingTransformer(SimpleImputer(strategy='constant'))),
    ('cat_transformer', CategoricalTransformer()),
]
# create the steps for the numerical pipeline
numerical_steps = [
    ('num_selector', FeatureSelector(numerical_features)),
    ('imputer', MissingTransformer(SimpleImputer(strategy='median'))),
    ('std_scaler', NumericalTransformer()),
]
# create the 2 pipelines with the respective steps
categorical_pipeline = Pipeline(categorical_steps)
numerical_pipeline = Pipeline(numerical_steps)

pipeline_list = [
    ('categorical_pipeline', categorical_pipeline),
    ('numerical_pipeline', numerical_pipeline),
]
# Combining the 2 pipelines horizontally into one full pipeline
preprocessing_pipeline = FeatureUnion(transformer_list=pipeline_list)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,
                                                    random_state=seed, shuffle=True, stratify=y)

### PipeLine + RandomForest + RandomSearchCV ###

# we pass the preprocessing pipeline as a step to the full pipeline
full_pipeline_steps = [
    ('preprocessing_pipeline', preprocessing_pipeline),
    ('model', RandomForestClassifier(random_state=seed)),
]
# create the full pipeline object
full_pipeline = Pipeline(steps=full_pipeline_steps)
# Create the search space and scoring functions
param_grid = {
    "model": [RandomForestClassifier(random_state=seed)],
    # integer depths 1..32 (np.linspace produced floats, which the forest
    # does not accept as max_depth)
    "model__max_depth": np.arange(1, 33),
    "model__n_estimators": np.arange(100, 1000, 100),
    "model__criterion": ["gini", "entropy"],
    "model__max_leaf_nodes": [16, 64, 128, 256],
    "model__oob_score": [True],
}
scoring = {
    'AUC': 'roc_auc',
    'Accuracy': make_scorer(accuracy_score),
}
# create the Kfold object
num_folds = 3
# random_state only takes effect (and is only accepted) together with shuffle
kfold = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=seed)
# create the randomized search object with the full pipeline as estimator
# NOTE(review): the arguments of this call were lost in the original text and
# are reconstructed from the names defined above. With several scorers,
# ``refit`` must name one of them for ``best_score_`` / ``best_params_`` /
# ``predict`` to exist; 'AUC' is an assumption — confirm.
grid = RandomizedSearchCV(
    estimator=full_pipeline,
    param_distributions=param_grid,
    scoring=scoring,
    refit='AUC',
    cv=kfold,
    random_state=seed,
)
# fit the search
best_rf = grid.fit(X_train, y_train)
print(f'Best score: {best_rf.best_score_}')
print(f'Best model: {best_rf.best_params_}')

pred_test = best_rf.predict(X_test)
pred_train = best_rf.predict(X_train)
print('Train Accuracy: ', accuracy_score(y_train, pred_train))
print('Test Accuraccy: ', accuracy_score(y_test, pred_test))
# Read the OOB score from the refitted pipeline: the estimator stored in
# best_params_ is a candidate setting, not necessarily the fitted model.
print("Out-of-Bag Accuracy: ", best_rf.best_estimator_.named_steps['model'].oob_score_)
print('\nConfusion Matrix:')
print(confusion_matrix(y_test, pred_test))
print('\nClassification Report:')
print(classification_report(y_test, pred_test))

# lets get the fitted random forest and the feature names
# (taken from the refitted best pipeline, not from best_params_)
rf_model = best_rf.best_estimator_.named_steps['model']
# FeatureUnion concatenates the categorical block first, then the numerical
# block, so the names must follow that order rather than X_train.columns.
features = np.array(categorical_features + numerical_features)
# Apply the preprocessing that was fitted on the training data; re-fitting it
# on X_test would feed the model differently prepared inputs.
new_X_test = best_rf.best_estimator_.named_steps['preprocessing_pipeline'].transform(X_test)
new_X_test = pd.DataFrame(new_X_test, columns=features)

### Feature Importance Plot ###

# get the predictions from the random forest object
# (y_pred is not used by the plot below)
y_pred = rf_model.predict(new_X_test)
# get the feature importances
importances = rf_model.feature_importances_
# indexes that sort the importances in ascending order
sorted_index = np.argsort(importances)
sorted_importances = importances[sorted_index]
sorted_features = features[sorted_index]
# plot the feature importances as a horizontal bar chart
fig, ax = plt.subplots()
ax.barh(sorted_features , sorted_importances)

### eli5 ###

eli5에 대해서 좀 알아보니 catboost까지 지원을 한다!  keras도 지원하는데 이미지 관련된 것만 가능하고 
lime도 되는데, lime은 text만 가능하다.  한번 처음 써봤는데 굉장히 직관적이고 좋은 것 같다.

# Use the forest from the refitted best pipeline; the estimator stored in
# best_params_ is a candidate setting, not necessarily the fitted model.
rf_model = best_rf.best_estimator_.named_steps['model']
# Names in the order the FeatureUnion emits them: categorical, then numerical.
features = np.array(categorical_features + numerical_features)
# global feature weights of the forest
eli5.show_weights(rf_model, feature_names=features)
# explanation for one randomly sampled test row
eli5.show_prediction(rf_model , new_X_test.sample(1))
# explanation for the first test row
eli5.explain_prediction_df(rf_model , new_X_test.iloc[0])

from eli5.permutation_importance import get_score_importances
from sklearn.metrics import accuracy_score
def score(X, y):
    """Return the accuracy of the module-level ``rf_model`` on ``X`` vs ``y``."""
    return accuracy_score(y, rf_model.predict(X))


# Permutation importance: score once on the data as-is, then 10 more rounds
# with the requested number of iterations handled by eli5.
base_score, score_decreas = get_score_importances(
    score, new_X_test.values, y_test, n_iter=10
)

### shap ###

많은 함수를 제공해서 사용하기에는 좋은 것 같다. 아래 글에 참조한 블로그도 있으니 참고하면 될 것 같다!

import shap
# Create the tree explainer for the fitted random forest
explainer = shap.TreeExplainer(rf_model)
# NOTE(review): for a classifier this is presumably one base value per class
# (the figure captions below refer to target 0 and target 1) — confirm
# against the installed shap version.
print('Expected Value:', explainer.expected_value)
# get the shap values for the preprocessed test set from the explainer
shap_values = explainer.shap_values(new_X_test)


참고하시면 될 것 같다 (https://data-newbie.tistory.com/254)

 target = 0 , idx = 4
target = 1 , idx = 4
target : 0 . 0 ~ 1000 개
target : 1 . 0 ~ 1000 개



target : 0 , index : 2
target : 0 , index 0 ~ 100


age 변수 / 왼쪽 : target 0 오른쪽 target : 1


age 변수 / 왼쪽 : target 0 오른쪽 target : 1


하다 보니, 본래 하려던 것보다 eli5와 shap의 함수를 찾아보는 과정이 재미있어서 더 보게 되었다.
아직 해석하는 방법에 대해서는 추후에 알아보기로 하고 관심 있는 것을 사용하면 될 것 같다.
암튼 pipeline을 이용해서 전처리부터 모델링까지 한번에 구축할 수 있다는 큰 장점이 있고, scikit-learn 같은 경우 모델 해석하는 라이브러리와 붙여서 쓸 수 있어서 참 좋은 것 같다.
scikit-learn은 보면 볼수록 엄청나게 기능이 많은 패키지라는 것을 다시 느끼게 된다...ㄷㄷ




