[Python] H2O로 Random Forest 해보기

2020. 8. 10. 20:22 분석 Python/구현 및 자료

알고리즘 설정하기

import h2o
from h2o.estimators import H2ORandomForestEstimator

# Connect to a local H2O server (h2o.init starts one if none is reachable).
h2o.init()

# Import the cars dataset into H2O:
cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")

# Target variable (classification): stored as a factor (categorical) column.
cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
# Target variable (regression): stored as a numeric column.
cars["economy"] = cars["economy"].asnumeric()

predictors = ["displacement", "power", "weight", "acceleration", "year"]

# The response names must agree with the conversions above:
# "economy_20mpg" is the factor column, "economy" is the numeric one.
factor_response = "economy_20mpg"
numeric_response = "economy"

# 80/20 split into training and validation frames; the seed is fixed so the
# split is the same on every run.
train, valid = cars.split_frame(ratios=[.8], seed=1234)

Classification

# Build and train the classification model.
# calibrate_model=True asks H2O to calibrate the predicted class
# probabilities, which requires an explicit calibration_frame; the held-out
# `valid` frame is used for that here.
# NOTE(review): calibration applies to binomial classification, so
# `factor_response` is expected to name a two-level factor column - confirm.
cars_drf = H2ORandomForestEstimator(ntrees=10,
                                    max_depth=5,
                                    min_rows=10,
                                    calibrate_model=True,
                                    calibration_frame=valid,
                                    binomial_double_trees=True)

# Fit on the training frame; metrics are also computed on the validation frame.
cars_drf.train(x=predictors,
               y=factor_response,
               training_frame=train,
               validation_frame=valid)

Regression

# Hyper-parameters for this section's forest, collected in one place.
# Calibration and double trees are switched off explicitly.
drf_params = {
    "ntrees": 10,
    "max_depth": 5,
    "min_rows": 10,
    "calibrate_model": False,
    "binomial_double_trees": False,
}

# Construct the estimator; note this rebinds `cars_drf`.
cars_drf = H2ORandomForestEstimator(**drf_params)

# Fit against the column named by `numeric_response`, scoring on `valid` too.
cars_drf.train(
    x=predictors,
    y=numeric_response,
    training_frame=train,
    validation_frame=valid,
)

Check Performance

# Performance metrics for the most recently trained `cars_drf`.
# No frame is passed, so H2O reports the metrics stored on the model itself.
perf = cars_drf.model_performance()

# Score the held-out validation frame with the trained model (optional step).
pred = cars_drf.predict(test_data=valid)

모델 결과 확인 

# A previously built model can be fetched by its id (a string):
# model1 = h2o.get_model(model_id)

# Variable importances as a pandas DataFrame, then as a plot.
cars_drf.varimp(use_pandas=True)
cars_drf.varimp_plot()

# Partial dependence plot for the first predictor column.
# `cols` takes a list of column names, so index into `predictors`.
cars_drf.partial_plot(train, cols=[predictors[0]])

728x90