Modeling Various Classification Models with an sklearn Pipeline
The goal here is to write code that compares the various classification models scikit-learn provides. Originally I meant to fit everything into a single Pipeline, but things kept getting tangled, so I switched to building a separate pipeline for each model.
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.impute import SimpleImputer
##
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score, ShuffleSplit
from sklearn import metrics
from sklearn.metrics import roc_curve, roc_auc_score, f1_score, classification_report
## plot
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
## Model classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.ensemble import GradientBoostingClassifier as GBM
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.svm import SVC
from sklearn.linear_model import RidgeClassifier
def rocvis(true, prob, label):
    # encode string labels to integers before computing the curve
    if isinstance(true[0], str):
        from sklearn.preprocessing import LabelEncoder
        le = LabelEncoder()
        true = le.fit_transform(true)
    # use the actual ROC AUC; the original rounded the scores into an accuracy
    auc = roc_auc_score(true, prob)
    fpr, tpr, thresholds = roc_curve(true, prob)
    plt.plot(fpr, tpr, marker='.', label="AUC : {:.2f} , {}".format(auc, label))
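For reference, a minimal, self-contained smoke test of rocvis; the labels and scores below are synthetic and purely illustrative:

# Synthetic sanity check for rocvis (illustrative data, not from the post)
rng = np.random.RandomState(0)
y_true_demo = rng.randint(0, 2, size=200)                # random binary labels
y_score_demo = 0.6 * y_true_demo + 0.4 * rng.rand(200)   # scores correlated with the labels

plt.plot([0, 1], [0, 1], linestyle='--')                 # chance diagonal
rocvis(true=y_true_demo, prob=y_score_demo, label="dummy model")
plt.legend()
plt.show()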
cancer = datasets.load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
cancer.data, cancer.target, test_size=0.4, random_state=0)
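As a quick sanity check (not in the original post), the class balance of the split can be inspected; with test_size=0.4, about 60% of the rows stay in training:

# The breast cancer target is 0/1, so bincount shows the class balance per split
print("train:", np.bincount(y_train), "test:", np.bincount(y_test))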
import warnings
warnings.filterwarnings("ignore")
from pycm import *
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=42)
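ShuffleSplit draws 5 independent random 70/30 splits of whatever data GridSearchCV hands it (unlike KFold, the test folds may overlap). A quick way to see the resampling plan:

# Peek at the split sizes GridSearchCV will receive from this cv object
for i, (tr_idx, te_idx) in enumerate(cv.split(X_train)):
    print("split", i, "-> train:", len(tr_idx), "test:", len(te_idx))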
pipe_lr = Pipeline([('scl', StandardScaler()),
                    ('clf', LogisticRegression(random_state=42))])
pipe_rf = Pipeline([('scl', StandardScaler()),
                    ('clf', RandomForestClassifier(random_state=42))])
pipe_svm = Pipeline([('scl', StandardScaler()),
                     ('clf', SVC(random_state=42, probability=True))])
# n_components is dropped for LDA: with a binary target it allows at most one
# component, and it does not affect classification anyway
pipe_lda = Pipeline([('scl', StandardScaler()),
                     ('clf', LDA())])
pipe_qda = Pipeline([('scl', StandardScaler()),
                     ('clf', QDA())])
pipe_gbm = Pipeline([('scl', StandardScaler()),
                     ('clf', GBM(n_estimators=1000, random_state=42))])
pipe_knn = Pipeline([('scl', StandardScaler()),
                     ('clf', KNN())])
pipe_ridge = Pipeline([('scl', StandardScaler()),
                       ('clf', RidgeClassifier())])
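As an aside, make_pipeline (imported above but otherwise unused) would build the same objects with auto-generated step names; the explicit 'clf' name is kept here because the parameter grids below depend on it:

# Equivalent construction with make_pipeline: step names come from the class
# names, so grid keys would be 'logisticregression__C' instead of 'clf__C'
pipe_lr_alt = make_pipeline(StandardScaler(), LogisticRegression(random_state=42))
print(pipe_lr_alt.steps)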
param_range = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]  # fixed typo (was 'aram_range'); overwritten two lines down
depth_range = [7, 8, 9]
min_samples_split_range = [0.5, 0.7, 0.9]
param_range = [0.5, 0.1]  # fractional values: valid for min_samples_leaf and for SVC's C below
param_range_fl = np.logspace(0, -5, 5)
grid_params_lr = [{'clf__penalty': ['l1', 'l2'],
                   'clf__C': param_range_fl,
                   'clf__solver': ['liblinear']}]
grid_params_rf = [{'clf__criterion': ['gini', 'entropy'],
                   'clf__min_samples_leaf': param_range,
                   'clf__max_depth': depth_range,
                   'clf__min_samples_split': min_samples_split_range}]
grid_params_svm = [{'clf__kernel': ['linear', 'rbf'],
                    'clf__C': param_range}]
grid_params_lda = [{'clf__tol': param_range_fl,
                    'clf__solver': ['svd', 'lsqr', 'eigen']}]
grid_params_qda = [{'clf__tol': param_range_fl}]
grid_params_gbm = [{'clf__tol': param_range_fl,
                    'clf__max_depth': depth_range,
                    'clf__min_samples_leaf': param_range,
                    'clf__loss': ['deviance', 'exponential']}]
grid_params_knn = [{'clf__n_neighbors': [2, 4, 6],
                    'clf__weights': ['uniform', 'distance'],
                    'clf__algorithm': ['ball_tree', 'kd_tree', 'brute']}]
grid_params_ridge = [{'clf__solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
                      'clf__tol': param_range_fl,
                      'clf__alpha': param_range_fl}]
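Every key in these grids follows the `<step name>__<parameter>` convention, which is how GridSearchCV routes values to the pipeline step named 'clf'. The valid keys can always be listed from the pipeline itself:

# List the grid-searchable parameter names exposed for the 'clf' step
print([k for k in pipe_lr.get_params().keys() if k.startswith("clf__")])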
pipe = [pipe_lr, pipe_rf, pipe_svm,
        pipe_lda, pipe_qda, pipe_gbm,
        pipe_knn, pipe_ridge]
params = [grid_params_lr, grid_params_rf, grid_params_svm,
          grid_params_lda, grid_params_qda, grid_params_gbm,
          grid_params_knn, grid_params_ridge]
jobs = 20
grid_dict = {0: 'Logistic Regression',
             1: 'Random Forest',
             2: 'Support Vector Machine',
             3: 'Linear Discriminant Analysis',
             4: 'Quadratic Discriminant Analysis',
             5: 'GradientBoostingClassifier',
             6: 'KNNClassifier',
             7: 'RidgeClassifier'}
model_prob = {}
model_result = {}
model_best_params = {}
model_confusion = {}
plt.style.use('ggplot')
fig, ax = plt.subplots(figsize=(20, 10))
plt.plot([0, 1], [0, 1], linestyle='--')
for idx, (param, model) in enumerate(zip(params, pipe)):
    # iid= was deprecated and later removed from scikit-learn, so it is dropped here
    search = GridSearchCV(model, param, cv=cv, n_jobs=jobs, verbose=0)
    search.fit(X_train, y_train)
    y_pred = search.predict(X_test)
    # RidgeClassifier has no predict_proba, so fall back to decision_function scores;
    # both work with roc_curve/roc_auc_score. (The original try/except silently
    # reused the previous model's probabilities when predict_proba was missing.)
    if hasattr(search, "predict_proba"):
        y_prob = search.predict_proba(X_test)[:, 1]
    else:
        y_prob = search.decision_function(X_test)
    rocvis(true=y_test, prob=y_prob, label=grid_dict.get(idx))
    # score AUC from the continuous scores rather than hard 0/1 predictions
    model_result[grid_dict.get(idx)] = roc_auc_score(y_test, y_prob)
    model_prob[grid_dict.get(idx)] = y_prob
    model_best_params[grid_dict.get(idx)] = search.best_params_
    model_confusion[grid_dict.get(idx)] = ConfusionMatrix(y_test, y_pred)
plt.legend(fontsize=20, loc='center', shadow=True)
plt.title("Models ROC Curve", fontsize=25)
plt.savefig("./Model_Result.png")
plt.show()
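Since the loop also stored each model's best hyperparameters, they can be dumped before the pycm comparison; a small sketch using nothing beyond the dictionaries built above:

# Print the tuned hyperparameters collected during the grid searches
for name, best in model_best_params.items():
    print("{:35s} {}".format(name, best))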
cp = Compare(model_confusion)
# the values stored in model_result are AUC scores, so label the column accordingly
output = pd.DataFrame(list(model_result.items()), columns=["algo", "auc"])
output.sort_values(["auc"], ascending=False, inplace=True)
fig, ax = plt.subplots(figsize=(20, 10))
sns.set(font_scale=2)
sns.barplot(y="algo", x="auc", data=output)
plt.show()
print(cp)
print(cp.best_name, "\n", cp.best)
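Beyond Compare, each stored pycm ConfusionMatrix can be inspected on its own; a sketch, assuming the standard pycm API (print_matrix and per-class attributes such as TPR):

# Drill into a single model's confusion matrix (the key names come from grid_dict)
model_confusion["Random Forest"].print_matrix()
print(model_confusion["Random Forest"].TPR)  # per-class recall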