[변수 처리] 파이썬 결측치 대체 알고리즘 비교 예시

2019. 9. 10. 19:57 · 분석 Python/Data Preprocessing

 

Class로 만들어서 비교해보기

from autoimpute import imputations
import impyute as impy
import numpy as np
from missingpy import MissForest
from tqdm import tqdm_notebook
# Toy benchmark data: an n x n matrix of uniform draws below 6 whose first
# five columns are truncated to whole numbers.
n = 30
arr = np.random.uniform(low=0.0, high=6, size=(n, n))
arr[:, :5] = np.trunc(arr[:, :5])

# Keep the complete matrix as ground truth before any value is removed.
true = np.array(arr, copy=True)

# Blank out 100 distinct cells, addressed by their flattened position.
drop_idx = np.random.choice(arr.size, size=100, replace=False)
arr.flat[drop_idx] = np.nan

# Integer indicator: 1 where a value was removed, 0 where it is observed.
mask = np.isnan(arr).astype(int)
class Evaluate:
    """Benchmark missing-value imputation algorithms against known truth.

    Every selected algorithm imputes ``X``; its squared error against ``T`` is
    measured only at the positions flagged in ``mask``.

    Parameters
    ----------
    X : data containing missing values (NaN); handed to every imputer.
    T : ground-truth data, aligned element-wise with ``X``.
    mask : indicator aligned element-wise with ``X``; non-zero where a value
        is missing.
    algo : names of the algorithms to run. Each must be a key of
        ``self.method``: "MissForest", "mean", "median", "knn", "MICE", "EM".

    Attributes
    ----------
    method : dict mapping every known algorithm name to a one-argument
        callable that returns the imputed data.
    algo2 : the subset of ``method`` selected by ``algo``; algorithms that
        fail during ``evaluate`` are removed from it.
    MSE : dict mapping algorithm name to its score.
    result : dict mapping algorithm name to its imputed data.

    NOTE(review): ``plot`` relies on ``plt`` and ``pd`` and
    ``save_result``/``load_result`` rely on ``dill``; none of these names is
    imported in the visible part of the file -- confirm they are in scope.
    """

    def __init__(self, X, T, mask, algo):
        self.X = X
        self.T = T
        self.mask = mask
        self.algo = algo
        self.method = {
            "MissForest": lambda x: MissForest().fit_transform(x),
            "mean": lambda x: impy.mean(x),
            "median": lambda x: impy.median(x),
            "knn": lambda x: impy.fast_knn(x),
            "MICE": lambda x: impy.mice(x),
            "EM": lambda x: impy.em(x),
            # A "MultipleImputer" entry existed here but is disabled.
        }
        # Created up front so append_algo / plot work before evaluate().
        self.algo2 = {}
        self.MSE = {}
        self.result = {}

    def select_algo(self):
        """Populate ``self.algo2`` with the callables named in ``self.algo``.

        Raises
        ------
        KeyError
            If a requested name is not a key of ``self.method``.
        """
        self.algo2 = {name: self.method[name] for name in self.algo}

    def _masked_mse(self, imputed):
        """Return the mean over columns of the per-column masked MSE.

        Columns without any masked cell are left out of the average; dividing
        by their zero count would turn the whole score into NaN. NaN is
        returned only when no cell is masked at all.
        """
        mask = np.asarray(self.mask)
        squared_error = ((imputed - self.T) * mask) ** 2
        column_error = np.asarray(np.sum(squared_error, axis=0))
        column_count = np.asarray(np.sum(mask, axis=0))
        has_missing = column_count > 0
        if not np.any(has_missing):
            return float("nan")
        return np.mean(column_error[has_missing] / column_count[has_missing])

    def evaluate(self):
        """Run every selected algorithm and record its score and output.

        Resets ``self.MSE`` and ``self.result``. An algorithm that raises is
        reported with ``print`` and removed from ``self.algo2``; the remaining
        algorithms still run.
        """
        self.select_algo()
        self.MSE = {}
        self.result = {}
        # Iterate over a snapshot of the names: failed algorithms are deleted
        # from self.algo2 inside the loop, and deleting from a dict while
        # iterating over it raises RuntimeError.
        for name in tqdm_notebook(list(self.algo2)):
            try:
                print("{} 진행중....".format(name))
                imputed = self.algo2[name](self.X)
                self.MSE[name] = self._masked_mse(imputed)
                self.result[name] = imputed
            except Exception as e:
                print(e)
                del self.algo2[name]

    def append_algo(self, name, value, imputed):
        """Register an externally computed score and imputed data under ``name``."""
        self.MSE[name] = value
        self.result[name] = imputed
        print("{} 업데이트".format(name))

    def save_result(self, filename):
        """Serialize ``self.MSE`` to ``<filename>.pkl`` (scores only)."""
        with open("{}.pkl".format(filename), "wb") as w:
            dill.dump(self.MSE, w)

    def load_result(self, filename):
        """Load scores from ``<filename>.pkl`` into ``self.MSE`` and print them."""
        # SECURITY: unpickling can execute arbitrary code; only load files
        # that come from a trusted source.
        with open("{}.pkl".format(filename), "rb") as r:
            self.MSE = dill.load(r)
        print(">> Result : \n", self.MSE)

    def plot(self, figsize=(9, 9), img_name=None):
        """Draw a horizontal bar chart of the scores in ``self.MSE``.

        Parameters
        ----------
        figsize : figure size forwarded to ``plt.subplots``.
        img_name : when given, the figure is also saved to this path before
            it is shown.
        """
        fig, ax = plt.subplots(figsize=figsize)
        out = pd.DataFrame({"ALGO": self.MSE}).reset_index()
        positions = range(len(out))
        plt.barh(positions, out["ALGO"], color=plt.cm.Paired(np.arange(len(out))))
        plt.yticks(positions, tuple(out["index"]))
        # Print each score next to its bar.
        for i, v in enumerate(list(out["ALGO"].values)):
            ax.text(0, i - 0.1, str(v), color='black', fontweight='bold')
        # The plotted values are mean squared errors (no square root is
        # taken), so the title says MSE rather than RMSE.
        plt.title("Imputation MSE BenchMark")
        if img_name is not None:
            plt.savefig(img_name)
        plt.show()
# Benchmark every available imputer on the toy data, then chart the scores.
algorithms = ["MissForest", "mean", "median", "knn", "MICE", "EM"]
EV = Evaluate(X=arr, T=true, mask=mask, algo=algorithms)
EV.evaluate()

EV.plot()

728x90