Python) most frequent speed test

2021. 12. 24. 22:44분석 Python/구현 및 자료

파이썬에서 최빈값(가장 자주 나오는 값)을 계산하는 여러 가지 방법의 속도를 비교해 봤다.

 

목차

    functions

    import numpy as np 
    import collections
    from scipy.stats import mode
    import concurrent.futures
    from joblib import Parallel, delayed
    
    def numpy_most_frequent(x):
        """Return the most frequent element of ``x`` using ``np.unique``.

        ``np.unique`` yields the distinct values in sorted order together with
        their counts; ``argmax`` then picks the first maximum, so a tie is
        resolved in favour of the smallest value.
        """
        distinct, freq = np.unique(x, return_counts=True)
        return distinct[freq.argmax()]
    
    def counter_most_frequent(x):
        """Return the most frequent element of ``x`` using ``collections.Counter``.

        Args:
            x: An iterable of hashable items.

        Returns:
            The item with the highest count; on a tie, the item that was
            encountered first.

        Raises:
            IndexError: If ``x`` is empty.
        """
        # most_common(1) selects only the top entry instead of sorting every
        # distinct value the way a bare most_common() does.
        return collections.Counter(x).most_common(1)[0][0]
    
    def stats_most_frequent(x):
        """Return the most frequent element of ``x`` using ``scipy.stats.mode``.

        Args:
            x: A one-dimensional sequence of values.
                NOTE(review): newer SciPy releases reject non-numeric input
                (see the scipy.stats.mode documentation), so numeric data is
                the portable case -- confirm against the installed version.

        Returns:
            The modal value as a NumPy scalar.
        """
        # Element 0 of the result is the mode: a length-1 array on older SciPy,
        # a bare scalar once keepdims defaults to False (SciPy >= 1.11).
        # np.ravel normalises both shapes so the [0] lookup always works.
        return np.ravel(mode(x)[0])[0]
    
    def max_most_freqeunt(x):
        """Return the most frequent element of ``x`` via ``x.count`` and ``max``.

        Every distinct value is paired with its occurrence count as
        ``(count, value)``; the largest pair wins, so a tie on the count is
        resolved in favour of the larger value. ``x`` must provide ``.count``
        (e.g. a list).
        """
        scored = ((x.count(candidate), candidate) for candidate in set(x))
        _, winner = max(scored)
        return winner
    
    def check_arr(arr, func):
        """Apply ``func`` to every column of ``arr`` and collect the results.

        Each column ``arr[:, j]`` is converted to a plain Python list before
        being handed to ``func``. Results are returned in column order.
        """
        results = []
        for col in range(arr.shape[1]):
            column_values = arr[:, col].tolist()
            results.append(func(column_values))
        return results
            
    def np_arr_most_frequent(arr, axis=0):
        """Return the most frequent value along ``axis`` without a Python loop.

        The array is first encoded as integer codes into its sorted unique
        values, the codes are tallied with ``np.bincount`` along ``axis``, and
        the code with the highest tally is mapped back to the original value.
        ``argmax`` keeps the first maximum, so ties go to the smallest value.
        """
        uniques, codes = np.unique(arr, return_inverse=True)
        code_grid = codes.reshape(arr.shape)
        # One bin per code so every bincount result has the same length.
        n_bins = np.max(codes) + 1
        tallies = np.apply_along_axis(np.bincount, axis, code_grid, None, n_bins)
        winners = np.argmax(tallies, axis=axis)
        return uniques[winners]
                                        
    def np_arr_stat_mode(arr, axis=0):
        """Compute the mode of ``arr`` along ``axis`` with ``scipy.stats.mode``.

        The object produced by ``mode`` is handed back unchanged, i.e. the
        caller receives the full result, not only the modal values.
        """
        mode_result = mode(arr, axis=axis)
        return mode_result
        
    
    
    def check_arr_parallel(arr, func, n_jobs=5):
        """Apply ``func`` to every column of ``arr`` through joblib ``Parallel``.

        Args:
            arr: Array indexed as ``arr[:, j]``; one task is created per column.
            func: Callable receiving one column as a plain Python list.
            n_jobs: Forwarded to ``Parallel(n_jobs=...)``. Defaults to 5, the
                value that was previously hard-coded.

        Returns:
            Whatever ``Parallel`` returns for the submitted tasks.
        """
        columns = (arr[:, col].tolist() for col in range(arr.shape[1]))
        return Parallel(n_jobs=n_jobs)(delayed(func)(column) for column in columns)
    
    def check_arr_thread(arr, func):
        """Apply ``func`` to every column of ``arr`` on a thread pool.

        Args:
            arr: Array indexed as ``arr[:, j]``; one task is submitted per column.
            func: Callable receiving one column as a plain Python list.

        Returns:
            A list with one result per column, in column order.

        Raises:
            Any exception raised by ``func`` is re-raised by ``future.result()``.
        """
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(func, arr[:, col].tolist())
                for col in range(arr.shape[1])
            ]
            # Read the futures in submission order: as_completed() yields them
            # in completion order, which would scramble the column-to-result
            # mapping.
            return [future.result() for future in futures]

    test

    # Draw 100 random letters and keep them as a plain Python list.
    # NOTE(review): `sample` is not used by the timings below, and `arr` is not
    # defined in this snippet -- presumably it is the 2-D array built in the
    # "loop / array" section further down; confirm before re-running.
    sample = np.random.choice(list("ABCDEFGHIJKMLNOPQR"),100).tolist()
    
    
    # Time each implementation on `arr` (IPython %timeit magic).
    %timeit  check_arr(arr , max_most_freqeunt)
    %timeit  check_arr(arr , stats_most_frequent)
    %timeit  check_arr(arr , counter_most_frequent)
    %timeit  check_arr(arr , numpy_most_frequent)
    %timeit  np_arr_most_frequent(arr,axis=0)
    %timeit  np_arr_stat_mode(arr, axis=0)
    
    98.9 ms ± 14 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
    199 ms ± 14.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
    156 ms ± 4.13 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
    161 ms ± 22.9 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
    10.6 ms ± 118 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
    64.4 ms ± 1.74 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

     


    vector

    # One flat Python list of 100,000 random letters for the single-vector benchmark.
    sample = np.random.choice(list("ABCDEFGHIJKMLNOPQR"),100000).tolist()
    
    # Time each single-vector implementation on that same list (IPython %timeit magic).
    %timeit max_most_freqeunt(sample)
    %timeit stats_most_frequent(sample)
    %timeit counter_most_frequent(sample)
    %timeit numpy_most_frequent(sample)
    
    # 54.4 ms ± 2.67 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
    # 88.1 ms ± 2.91 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
    # 6.27 ms ± 164 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
    # 37.2 ms ± 1.32 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

     

    array

     

    loop / array

    import string
    # 100,000 random lowercase letters, reshaped to 100 columns (so 1,000 rows).
    sample = np.random.choice(list(string.ascii_lowercase),100000)
    arr = sample.reshape(-1,100)
    
    
    # Column-by-column Python loop (check_arr) around each single-vector function.
    %timeit  check_arr(arr , max_most_freqeunt)
    %timeit  check_arr(arr , stats_most_frequent)
    %timeit  check_arr(arr , counter_most_frequent)
    %timeit  check_arr(arr , numpy_most_frequent)
    # Whole-array implementations working along axis 0.
    %timeit  np_arr_most_frequent(arr,axis=0)
    %timeit  np_arr_stat_mode(arr,axis=0)
    
    # 80.2 ms ± 2.09 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
    # 106 ms ± 2.04 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
    # 9.49 ms ± 216 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
    # 45.4 ms ± 1.42 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
    # 10.6 ms ± 241 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
    # 61.4 ms ± 2.03 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

    parallel

    # Same single-vector functions, one joblib task per column (check_arr_parallel).
    # NOTE(review): `arr` is presumably still the array from the "loop / array" section -- confirm.
    %timeit  check_arr_parallel(arr , max_most_freqeunt)
    %timeit  check_arr_parallel(arr , stats_most_frequent)
    %timeit  check_arr_parallel(arr , counter_most_frequent)
    %timeit  check_arr_parallel(arr , numpy_most_frequent)
    
    93.5 ms ± 8.22 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
    185 ms ± 5.94 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
    145 ms ± 10.5 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
    158 ms ± 19.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

    thread

    # Same single-vector functions, one thread-pool task per column (check_arr_thread).
    # NOTE(review): `arr` is presumably still the array from the "loop / array" section -- confirm.
    %timeit  check_arr_thread(arr , max_most_freqeunt)
    %timeit  check_arr_thread(arr , stats_most_frequent)
    %timeit  check_arr_thread(arr , counter_most_frequent)
    %timeit  check_arr_thread(arr , numpy_most_frequent)
    
    
    297 ms ± 8.87 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
    417 ms ± 8.58 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
    52.2 ms ± 797 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
    193 ms ± 7.98 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

     

     

     

    결론적으로는 counter_most_frequent나 np_arr_most_frequent를 사용하는 게 안정적이지 않을까 싶다.

    728x90