[변수 처리] 파이썬 결측치 대체 알고리즘 비교 예시
2019. 9. 10. 19:57ㆍ분석 Python/Data Preprocessing
Class로 만들어서 비교해보기
import pickle

import impyute as impy
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from autoimpute import imputations
from missingpy import MissForest
from tqdm import tqdm_notebook
# Synthetic benchmark data: an n x n matrix of uniform draws in [0, 6) whose
# first five columns are truncated to whole numbers.
n = 30
arr = np.random.uniform(high=6, size=(n, n))
arr[:, :5] = np.trunc(arr[:, :5])

# Keep an untouched copy as the ground truth before knocking values out.
true = np.array(arr)

# Blank out 100 distinct cells (picked by flat index), then record where
# the holes are as a 0/1 integer mask.
arr.flat[np.random.choice(arr.size, 100, replace=False)] = np.nan
mask = np.isnan(arr).astype(int)
class Evaluate:
    """Benchmark missing-value imputation algorithms against known truth.

    Every selected algorithm imputes ``X``; its output is compared with ``T``
    only at the positions flagged in ``mask``. The score is the squared error
    averaged per column and then across columns (an MSE, not an RMSE).

    Parameters
    ----------
    X : array-like
        Data with missing entries, handed unchanged to each imputer.
    T : array-like
        Ground-truth values, same shape as ``X``.
    mask : array-like
        Same shape as ``X``; non-zero where ``X`` is missing.
    algo : list of str
        Names of the algorithms to run. Valid names are the keys of
        ``self.method``: "MissForest", "mean", "median", "knn", "MICE", "EM".

    Attributes
    ----------
    method : dict
        Registry mapping an algorithm name to a callable ``x -> imputed``.
    algo2 : dict
        The subset of ``method`` that was selected and has not failed.
    MSE : dict
        Algorithm name -> score.
    result : dict
        Algorithm name -> imputed array.
    """

    def __init__(self, X, T, mask, algo):
        self.X = X
        self.T = T
        self.mask = mask
        self.algo = algo
        # Created up front so append_algo/plot/save_result also work before
        # evaluate() has been called.
        self.algo2 = {}
        self.MSE = {}
        self.result = {}
        # NOTE: autoimpute's MultipleImputer is intentionally not registered.
        self.method = {
            "MissForest": lambda x: MissForest().fit_transform(x),
            "mean": lambda x: impy.mean(x),
            "median": lambda x: impy.median(x),
            "knn": lambda x: impy.fast_knn(x),
            "MICE": lambda x: impy.mice(x),
            "EM": lambda x: impy.em(x),
        }

    def select_algo(self):
        """Fill ``self.algo2`` with the registry entries named in ``self.algo``.

        Raises
        ------
        KeyError
            If a requested name is not a key of ``self.method``.
        """
        unknown = [name for name in self.algo if name not in self.method]
        if unknown:
            raise KeyError(
                "unknown imputation algorithm(s) {}; choose from {}".format(
                    unknown, sorted(self.method)
                )
            )
        self.algo2 = {name: self.method[name] for name in self.algo}

    def _masked_mse(self, imputed):
        """Return the mean over columns of the per-column MSE at masked cells.

        Columns without any masked cell are left out of the average; dividing
        by their zero count would otherwise turn the whole score into NaN.
        Returns NaN when the mask flags nothing at all.
        """
        missing = np.asarray(self.mask).astype(bool)
        squared_error = np.where(missing, np.asarray(imputed - self.T) ** 2, 0.0)
        column_sums = np.atleast_1d(squared_error.sum(axis=0))
        counts = np.atleast_1d(missing.sum(axis=0))
        scored = counts > 0
        if not scored.any():
            return float("nan")
        return (column_sums[scored] / counts[scored]).mean()

    def evaluate(self):
        """Run every selected algorithm and store its score and imputed data.

        Results go to ``self.MSE`` and ``self.result``. An algorithm that
        raises is reported via ``print`` and removed from ``self.algo2``; the
        remaining algorithms still run.
        """
        self.select_algo()
        self.MSE = {}
        self.result = {}
        # Iterate over a snapshot of the names: failed algorithms are deleted
        # from ``algo2`` inside the loop, and a dict must not change size
        # while it is being iterated.
        for name in tqdm_notebook(list(self.algo2)):
            try:
                print("{} 진행중....".format(name))
                imputed = self.algo2[name](self.X)
                score = self._masked_mse(imputed)
            except Exception as e:
                # Best-effort benchmark: one broken imputer must not abort
                # the others.
                print(e)
                del self.algo2[name]
            else:
                self.MSE[name] = score
                self.result[name] = imputed

    def append_algo(self, name, value, imputed):
        """Register an externally computed score and imputed array.

        Parameters
        ----------
        name : str
            Label under which the entry is stored.
        value : float
            Score to store in ``self.MSE``.
        imputed : array-like
            Imputed data to store in ``self.result``.
        """
        self.MSE[name] = value
        self.result[name] = imputed
        print("{} 업데이트".format(name))

    def save_result(self, filename):
        """Pickle ``self.MSE`` to ``<filename>.pkl``."""
        with open("{}.pkl".format(filename), "wb") as w:
            pickle.dump(self.MSE, w)

    def load_result(self, filename):
        """Load ``self.MSE`` from ``<filename>.pkl`` and print it.

        SECURITY: unpickling can execute arbitrary code; only load files that
        were written by ``save_result`` from a trusted source.
        """
        with open("{}.pkl".format(filename), "rb") as r:
            self.MSE = pickle.load(r)
        print(">> Result : \n", self.MSE)

    def plot(self, figsize=(9, 9), img_name=None):
        """Draw a horizontal bar chart of the scores in ``self.MSE``.

        Parameters
        ----------
        figsize : tuple
            Figure size passed to ``plt.subplots``.
        img_name : str or None
            When given, the figure is also saved to this path before being
            shown.
        """
        _, ax = plt.subplots(figsize=figsize)
        out = pd.DataFrame({"ALGO": self.MSE}).reset_index()
        positions = range(len(out))
        plt.barh(positions, out["ALGO"], color=plt.cm.Paired(np.arange(len(out))))
        plt.yticks(positions, tuple(out["index"]))
        # Write each score as text at the left edge of its bar.
        for i, v in enumerate(out["ALGO"].values):
            ax.text(0, i - 0.1, str(v), color="black", fontweight="bold")
        # The plotted metric is a mean squared error; no square root is taken.
        plt.title("Imputation MSE BenchMark")
        if img_name is not None:
            plt.savefig(img_name)
        plt.show()
# Benchmark all six registered imputers on the synthetic data, then chart
# their scores.
EV = Evaluate(
    X=arr,
    T=true,
    mask=mask,
    algo=["MissForest", "mean", "median", "knn", "MICE", "EM"],
)
EV.evaluate()
EV.plot()
728x90
'분석 Python > Data Preprocessing' 카테고리의 다른 글
[변수 처리] 데이터에서 결측치 잘 만들어보기 (0) | 2019.09.17 |
---|---|
[변수 처리] Python에서 범주형 변수(Categorical) 다루기 (0) | 2019.09.13 |
[변수 처리] 011011 같은 값을 multiple label encoding 으로 만들어주기 (0) | 2019.07.16 |
[변수 생성] AutoEncoder로 파생변수 만들기 -2 (모델링 파트) Catboost (0) | 2019.06.02 |
[변수 생성] AutoEncoder로 파생변수 만들기 (0) | 2019.06.02 |