Python) most frequent speed test
2021. 12. 24. 22:44ㆍ분석 Python/구현 및 자료
파이썬에서 최빈값을 계산하는 여러 가지 방법의 속도를 비교하기 위해 테스트를 수행해 봤다.
목차
functions
import numpy as np
import collections
from scipy.stats import mode
import concurrent.futures
from joblib import Parallel, delayed
def numpy_most_frequent(x):
    """Return the most frequent value in ``x`` using ``np.unique``.

    ``np.unique`` yields the distinct values in sorted order together with
    their occurrence counts; the value at the position of the largest count
    is returned. When several values share the top count, ``argmax`` picks
    the first one, i.e. the smallest value in sorted order.
    """
    uniq, freq = np.unique(x, return_counts=True)
    top = np.argmax(freq)
    return uniq[top]
def counter_most_frequent(x):
    """Return the most frequent element of ``x`` using ``collections.Counter``.

    ``x`` is any iterable of hashable items. When several items share the
    top count, the one that was encountered first is returned.

    Raises:
        IndexError: if ``x`` is empty (there is no most common entry).
    """
    # most_common(1) picks the single top entry directly; calling
    # most_common() with no argument would sort every distinct value first.
    return collections.Counter(x).most_common(1)[0][0]
def stats_most_frequent(x):
    """Return the most frequent value of ``x`` using ``scipy.stats.mode``.

    ``x`` is a one-dimensional sequence. The first field of the result
    returned by ``mode`` holds the modal value(s).

    NOTE(review): recent SciPy releases appear to reject non-numeric input
    (e.g. lists of letters) in ``scipy.stats.mode`` -- confirm against the
    installed version before benchmarking string data with this function.
    """
    modal_values = mode(x)[0]
    # Depending on the SciPy version, the mode field for 1-D input is either
    # a length-1 array or a scalar. Flattening first lets both be indexed
    # the same way; the former ``[0][0]`` fails on a scalar.
    return np.ravel(modal_values)[0]
def max_most_freqeunt(x):
    """Return the most frequent element of ``x`` using ``max`` over counts.

    Each distinct element is paired with ``x.count(element)`` and the
    largest ``(count, element)`` tuple wins, so ties on count are broken in
    favour of the larger element. ``x`` must provide a ``count`` method
    (for example a list).
    """
    distinct = set(x)
    count_value_pairs = ((x.count(item), item) for item in distinct)
    _, winner = max(count_value_pairs)
    return winner
def check_arr(arr, func):
    """Apply ``func`` to every column of ``arr`` and collect the results.

    Each column ``arr[:, i]`` is converted to a plain Python list before it
    is handed to ``func``. The returned list holds one result per column,
    in column order.
    """
    outputs = []
    n_columns = arr.shape[1]
    for col in range(n_columns):
        column_values = arr[:, col].tolist()
        outputs.append(func(column_values))
    return outputs
def np_arr_most_frequent(arr, axis=0):
    """Return the most frequent value along ``axis`` of ``arr``.

    The array is first encoded as integer codes (positions in the sorted
    unique values), ``np.bincount`` is applied along ``axis`` to count each
    code, and the code with the highest count is mapped back to its value.
    Ties resolve to the smallest value because ``argmax`` returns the first
    maximum.
    """
    uniques, inverse = np.unique(arr, return_inverse=True)
    # Restore the input's shape so the codes can be counted along ``axis``.
    codes = inverse.reshape(arr.shape)
    # Fixed bin count so every 1-D slice yields a histogram of equal length.
    n_bins = np.max(inverse) + 1
    histograms = np.apply_along_axis(np.bincount, axis, codes, None, n_bins)
    winners = np.argmax(histograms, axis=axis)
    return uniques[winners]
def np_arr_stat_mode(arr, axis=0):
    """Compute the mode of ``arr`` along ``axis`` with ``scipy.stats.mode``.

    The result object from ``scipy.stats.mode`` is returned unchanged; its
    first field holds the modal values and its second field their counts.
    """
    mode_result = mode(arr, axis=axis)
    return mode_result
def check_arr_parallel(arr, func, n_jobs=5):
    """Apply ``func`` to every column of ``arr`` using joblib workers.

    Args:
        arr: two-dimensional array; each column ``arr[:, i]`` is converted
            to a plain Python list before being passed to ``func``.
        func: callable taking one column (a list) and returning a result.
        n_jobs: number of joblib workers. Defaults to 5, the value that was
            previously hard-coded.

    Returns:
        The list produced by ``joblib.Parallel``, one result per column.
    """
    columns = (arr[:, col].tolist() for col in range(arr.shape[1]))
    return Parallel(n_jobs=n_jobs)(delayed(func)(column) for column in columns)
def check_arr_thread(arr, func):
    """Apply ``func`` to every column of ``arr`` using a thread pool.

    Each column ``arr[:, i]`` is converted to a plain Python list and
    submitted to a ``ThreadPoolExecutor``.

    Returns:
        A list with one result per column, in column order. An exception
        raised by ``func`` propagates to the caller.
    """
    columns = (arr[:, col].tolist() for col in range(arr.shape[1]))
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # executor.map yields results in submission order. Collecting with
        # as_completed returned them in completion order, so result[i] did
        # not necessarily belong to column i.
        return list(executor.map(func, columns))
test
# NOTE(review): `sample` is created here but not used by the timings below,
# and `arr` is not defined in this cell -- presumably it is the array built
# in the "loop / array" section further down; confirm before running.
sample = np.random.choice(list("ABCDEFGHIJKMLNOPQR"),100).tolist()
# Per-column strategies, each driven through check_arr.
%timeit check_arr(arr , max_most_freqeunt)
%timeit check_arr(arr , stats_most_frequent)
%timeit check_arr(arr , counter_most_frequent)
%timeit check_arr(arr , numpy_most_frequent)
# Whole-array strategies that take the axis directly.
%timeit np_arr_most_frequent(arr,axis=0)
%timeit np_arr_stat_mode(arr, axis=0)
98.9 ms ± 14 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
199 ms ± 14.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
156 ms ± 4.13 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
161 ms ± 22.9 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
10.6 ms ± 118 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
64.4 ms ± 1.74 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
vector
# Single-vector benchmark: a Python list of 100,000 letters drawn at random
# from an 18-letter alphabet.
sample = np.random.choice(list("ABCDEFGHIJKMLNOPQR"),100000).tolist()
# Time each 1-D strategy on the same list.
%timeit max_most_freqeunt(sample)
%timeit stats_most_frequent(sample)
%timeit counter_most_frequent(sample)
%timeit numpy_most_frequent(sample)
# 54.4 ms ± 2.67 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# 88.1 ms ± 2.91 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# 6.27 ms ± 164 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# 37.2 ms ± 1.32 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
array
loop / array
import string
# 100,000 random lowercase letters, reshaped into rows of 100 columns
# (1000 x 100), so the per-column strategies run once per column.
sample = np.random.choice(list(string.ascii_lowercase),100000)
arr = sample.reshape(-1,100)
# Per-column strategies, each driven through check_arr.
%timeit check_arr(arr , max_most_freqeunt)
%timeit check_arr(arr , stats_most_frequent)
%timeit check_arr(arr , counter_most_frequent)
%timeit check_arr(arr , numpy_most_frequent)
# Whole-array strategies that take the axis directly.
%timeit np_arr_most_frequent(arr,axis=0)
%timeit np_arr_stat_mode(arr,axis=0)
# 80.2 ms ± 2.09 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# 106 ms ± 2.04 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# 9.49 ms ± 216 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# 45.4 ms ± 1.42 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# 10.6 ms ± 241 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# 61.4 ms ± 2.03 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
parallel
# Same per-column strategies, dispatched through check_arr_parallel.
# `arr` is presumably the array built in the "loop / array" section.
%timeit check_arr_parallel(arr , max_most_freqeunt)
%timeit check_arr_parallel(arr , stats_most_frequent)
%timeit check_arr_parallel(arr , counter_most_frequent)
%timeit check_arr_parallel(arr , numpy_most_frequent)
93.5 ms ± 8.22 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
185 ms ± 5.94 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
145 ms ± 10.5 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
158 ms ± 19.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
thread
# Same per-column strategies, dispatched through check_arr_thread.
# `arr` is presumably the array built in the "loop / array" section.
%timeit check_arr_thread(arr , max_most_freqeunt)
%timeit check_arr_thread(arr , stats_most_frequent)
%timeit check_arr_thread(arr , counter_most_frequent)
%timeit check_arr_thread(arr , numpy_most_frequent)
297 ms ± 8.87 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
417 ms ± 8.58 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
52.2 ms ± 797 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
193 ms ± 7.98 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
결론적으로는 counter_most_frequent나 np_arr_most_frequent를 사용하는 게 안정적이지 않을까 싶다.
728x90
'분석 Python > 구현 및 자료' 카테고리의 다른 글
Python) Sphinx를 사용하여 문서화하기 + Github Pages + Gitlab (2) | 2022.01.26 |
---|---|
Python) featuretools를 사용한 자동 변수 생성 (0) | 2022.01.22 |
선형 Kalman Filtering 알아보기 (0) | 2021.10.08 |
Python) 회귀 분석 기본 사용법 정리(scikit-learn, statsmodels) (2) | 2021.08.11 |
notion-py를 사용하여 캘린더 만들기 (0) | 2021.08.08 |