sklearn Pipeline을 이용해 다양한 Classification모델들 모델링하기

2019. 6. 15. 18:48분석 Python/Scikit Learn (싸이킷런)

728x90

sklearn을 사용해서 sklearn에서 제공하는 다양한 분류 모델들을 비교하는 코드를 만들려고 한다.

 

원래는 한 Pipeline에 다 하려고 했는데, 뭔가 자꾸 꼬여서 그냥 여러 개를 만드는 방향으로 바꿨다.

 

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline , make_pipeline
from sklearn.preprocessing import StandardScaler  ,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.multiclass import OneVsRestClassifier

from sklearn.impute import SimpleImputer
##
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score , ShuffleSplit
from sklearn import metrics
from sklearn.metrics import roc_curve , roc_auc_score , f1_score , classification_report
## plot
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets

##Model classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.ensemble import GradientBoostingClassifier as GBM
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.svm import SVC
from sklearn.linear_model import RidgeClassifier 


def rocvis(true, prob, label):
    """Plot the ROC curve for one model on the current matplotlib figure.

    Parameters
    ----------
    true : array-like of shape (n_samples,)
        Ground-truth binary labels; string labels are label-encoded first.
    prob : array-like of shape (n_samples,)
        Predicted scores/probabilities for the positive class.
    label : str
        Model name shown in the legend next to its AUC.
    """
    # Encode string labels to integers BEFORE any metric computation
    # (the original computed its score on the raw labels first).
    if isinstance(true[0], str):
        from sklearn.preprocessing import LabelEncoder
        true = LabelEncoder().fit_transform(true)
    # The original displayed the accuracy of rounded probabilities but
    # labelled it "AUC"; compute the actual ROC-AUC instead.
    auc = roc_auc_score(true, prob)
    fpr, tpr, _ = roc_curve(true, prob)
    plt.plot(fpr, tpr, marker='.', label="AUC : {:.2f} , {}".format(auc, label))
    
    
# Breast-cancer dataset: a standard binary-classification benchmark
# (569 samples, 30 numeric features).
cancer = datasets.load_breast_cancer()

# Hold out 40% of the data for the final test-set evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, test_size=0.4, random_state=0)

import warnings
warnings.filterwarnings("ignore")  # silence convergence/deprecation noise during grid search
# Explicit imports instead of the original `from pycm import *` wildcard:
# only these two names are used below.
from pycm import ConfusionMatrix, Compare

# Five random 70/30 splits used as the CV strategy inside every GridSearchCV.
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=42)

# One Pipeline per model: standardize the features, then fit the classifier.
pipe_lr = Pipeline([('scl', StandardScaler()),
                    ('clf', LogisticRegression(random_state=42))])

pipe_rf = Pipeline([('scl', StandardScaler()),
                    ('clf', RandomForestClassifier(random_state=42))])

# probability=True enables predict_proba for the ROC curve (slower fit).
pipe_svm = Pipeline([('scl', StandardScaler()),
                     ('clf', SVC(random_state=42, probability=True))])

# LDA's n_components must be < n_classes; this is a binary problem, so the
# original n_components=2 raises ValueError in sklearn >= 0.24. Use 1
# (it only affects transform(), not classification).
pipe_lda = Pipeline([('scl', StandardScaler()),
                     ('clf', LDA(n_components=1))])

pipe_qda = Pipeline([('scl', StandardScaler()),
                     ('clf', QDA())])

pipe_gbm = Pipeline([('scl', StandardScaler()),
                     ('clf', GBM(n_estimators=1000, random_state=42))])

pipe_knn = Pipeline([('scl', StandardScaler()),
                     ('clf', KNN())])

pipe_ridge = Pipeline([('scl', StandardScaler()),
                       ('clf', RidgeClassifier())])


# Hyper-parameter grids, one per pipeline. Keys use the "clf__" prefix to
# address the 'clf' step inside each Pipeline.
# (A dead, typo-named `aram_range` list from the original was removed.)
depth_range = [7, 8, 9]
min_samples_split_range = [0.5, 0.7, 0.9]  # floats = fraction of samples
param_range = [0.5, 0.1]
param_range_fl = np.logspace(0, -5, 5)     # 1.0 down to 1e-5, log-spaced

# liblinear supports both l1 and l2 penalties.
grid_params_lr = [{'clf__penalty': ['l1', 'l2'],
                   'clf__C': param_range_fl,
                   'clf__solver': ['liblinear']}]

grid_params_rf = [{'clf__criterion': ['gini', 'entropy'],
                   'clf__min_samples_leaf': param_range,
                   'clf__max_depth': depth_range,
                   'clf__min_samples_split': min_samples_split_range}]

grid_params_svm = [{'clf__kernel': ['linear', 'rbf'],
                    'clf__C': param_range}]

grid_params_lda = [{"clf__tol": param_range_fl,
                    "clf__solver": ["svd", "lsqr", "eigen"]}]

grid_params_qda = [{"clf__tol": param_range_fl}]

# NOTE(review): the "deviance" loss was renamed "log_loss" in sklearn 1.1
# and removed in 1.3 — update these strings if running on a newer sklearn.
grid_params_gbm = [{"clf__tol": param_range_fl,
                    'clf__max_depth': depth_range,
                    'clf__min_samples_leaf': param_range,
                    'clf__loss': ["deviance", "exponential"],
                    }]

grid_params_knn = [{"clf__n_neighbors": [2, 4, 6],
                    "clf__weights": ["uniform", "distance"],
                    'clf__algorithm': ["ball_tree", "kd_tree", "brute"],
                    }]

grid_params_ridge = [
    {"clf__solver": ["svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"],
     "clf__tol": param_range_fl,
     "clf__alpha": param_range_fl,
     }
]


# Index-aligned lists: pipe[i] is tuned with params[i] and named grid_dict[i].
pipe = [
    pipe_lr, pipe_rf, pipe_svm,
    pipe_lda, pipe_qda, pipe_gbm,
    pipe_knn, pipe_ridge,
]
params = [
    grid_params_lr, grid_params_rf, grid_params_svm,
    grid_params_lda, grid_params_qda, grid_params_gbm,
    grid_params_knn, grid_params_ridge,
]

jobs = 20  # parallel workers passed to GridSearchCV's n_jobs

grid_dict = {0: 'Logistic Regression',
             1: 'Random Forest',
             2: 'Support Vector Machine',
             3: "Linear Discriminant Analysis",
             4: "Quadratic Discriminant Analysis",
             5: "GradientBoostingClassifier",  # spelling fixed ("Classifer")
             6: "KNNClassifier",
             7: "RidgeClassifier",
             }

# Per-model results collected by the training loop below.
model_prob = {}          # predicted positive-class scores on the test set
model_result = {}        # ROC-AUC per model
model_best_params = {}   # best hyper-parameters found by the grid search
model_confusion = {}     # pycm ConfusionMatrix per model

plt.style.use('ggplot')
fig, ax = plt.subplots(figsize=(20, 10))
plt.plot([0, 1], [0, 1], linestyle='--')  # chance-level diagonal


# Tune each pipeline with GridSearchCV, evaluate on the held-out test set,
# and draw its ROC curve on the shared figure.
for idx, (param, model) in enumerate(zip(params, pipe)):
    # NOTE(review): the original passed iid=True, which was deprecated in
    # sklearn 0.22 and removed in 0.24; dropped here for compatibility.
    search = GridSearchCV(model, param, cv=cv, n_jobs=jobs, verbose=0)
    search.fit(X_train, y_train)
    y_pred = search.predict(X_test)
    # RidgeClassifier has no predict_proba; fall back to decision_function
    # scores instead of silently reusing the PREVIOUS model's probabilities
    # (the original's bare `except: pass` left y_prob stale).
    try:
        y_prob = search.predict_proba(X_test)[:, 1]
    except AttributeError:
        y_prob = search.decision_function(X_test)
    name = grid_dict.get(idx)
    rocvis(true=y_test, prob=y_prob, label=name)
    model_result[name] = roc_auc_score(y_test, y_pred)
    model_prob[name] = y_prob
    model_best_params[name] = search.best_params_
    model_confusion[name] = ConfusionMatrix(y_test, y_pred)

plt.legend(fontsize=20, loc='center', shadow=True)
plt.title("Models Roc Curve", fontsize=25)
plt.savefig("./Model_Result.png")
plt.show()

# pycm's Compare ranks the models from their confusion matrices.
cp = Compare(model_confusion)

# model_result maps model name -> ROC-AUC on the held-out test set.
# (The original labelled this column "r2", but the stored metric is AUC.)
output = pd.DataFrame([model_result.keys(), model_result.values()],
                      index=["algo", "auc"]).T
output.sort_values(["auc"], ascending=False, inplace=True)

fig, ax = plt.subplots(figsize=(20, 10))
sns.set(font_scale=2)
sns.barplot(y="algo", x="auc", data=output)
plt.show()

# Overall comparison report plus pycm's pick of the best model.
print(cp)
print(cp.best_name, "\n", cp.best)

 

 

https://github.com/sungreong/TIL/blob/master/Machine_Learning/SKlearn%20Pipeline%20Classification%20%26%20Regression.ipynb

 

sungreong/TIL

Today I Learned. Contribute to sungreong/TIL development by creating an account on GitHub.

github.com

 

728x90