Below are the solutions to these exercises on H2O and machine learning.
###############
#             #
# Exercise 1  #
#             #
###############

# Work from the exercise directory and start a local H2O cluster
setwd("H2O/")
library(h2o)
cluster.h <- h2o.init()

# Import the bank marketing data; define the response and predictor columns
bank_data <- h2o.importFile("data/bank.csv")
response <- "y"
predictors <- c("age", "job", "marital", "education", "default", "balance", "housing", "loan")

# Split the data 80/10/10 into training, validation and test frames
splits <- h2o.splitFrame(bank_data, c(0.8, 0.1))
train <- splits[[1]]
valid <- splits[[2]]
test <- splits[[3]]

# GBM with mostly default settings, evaluated with 10-fold cross-validation
gbm.m <- h2o.gbm(x = predictors, y = response, training_frame = train,
                 nfolds = 10, model_id = "GBM_defaults", seed = 1042)

# GBM with class balancing; cross-validation predictions are kept
b <- h2o.gbm(x = predictors, y = response, training_frame = train,
             validation_frame = valid, balance_classes = TRUE, seed = 1234,
             nfolds = 5, keep_cross_validation_predictions = TRUE, ntrees = 10)
h2o.auc(h2o.performance(b, newdata = valid))
h2o.auc(h2o.performance(b, newdata = test))

###############
#             #
# Exercise 2  #
#             #
###############

# Undersample the majority class while keeping all of the minority class
class_sampling_factor <- c(0.2, 1.0)
b.1 <- h2o.gbm(x = predictors, y = response, training_frame = train,
               validation_frame = valid, balance_classes = TRUE, seed = 1234,
               nfolds = 10, ntrees = 15,
               class_sampling_factors = class_sampling_factor,
               fold_assignment = "Stratified", distribution = "bernoulli",
               stopping_metric = "AUC", stopping_tolerance = 1e-2,
               stopping_rounds = 4)

# The same model with the sampling factors reversed, for comparison
b.2 <- h2o.gbm(x = predictors, y = response, training_frame = train,
               validation_frame = valid, balance_classes = TRUE, seed = 1234,
               nfolds = 10, ntrees = 10,
               class_sampling_factors = c(1.0, 0.2),
               fold_assignment = "Stratified", distribution = "bernoulli",
               stopping_metric = "AUC", stopping_tolerance = 1e-2,
               stopping_rounds = 4)

h2o.auc(h2o.performance(b.1, newdata = valid))
h2o.auc(h2o.performance(b.1, newdata = test))
h2o.auc(h2o.performance(b.2, newdata = valid))
h2o.auc(h2o.performance(b.2, newdata = test))

###############
#             #
# Exercise 3  #
#             #
###############

# Hyperparameter grid for the GBM
hyper_params <- list(nbins_cats = c(2, 4, 8, 16),
                     learn_rate = seq(0.1, 1, 0.1),
                     col_sample_rate_per_tree = c(0.2, 0.6, 0.8, 1))

# Cartesian grid search over all combinations, with early stopping on AUC
grid <- h2o.grid(x = predictors, y = response, training_frame = train,
                 validation_frame = valid, algorithm = "gbm",
                 distribution = "bernoulli", grid_id = "air_grid",
                 hyper_params = hyper_params,
                 stopping_rounds = 5, stopping_tolerance = 1e-4,
                 stopping_metric = "AUC",
                 search_criteria = list(strategy = "Cartesian"),
                 seed = 1234, balance_classes = TRUE,
                 class_sampling_factors = c(0.2, 1.0))

###############
#             #
# Exercise 4  #
#             #
###############

# Sort the grid models by validation AUC and retrieve the best one
sorted_grid <- h2o.getGrid("air_grid", sort_by = "auc", decreasing = TRUE)
sorted_grid
best.m <- h2o.getModel(sorted_grid@model_ids[[1]])
h2o.auc(h2o.performance(best.m, newdata = test))

###############
#             #
# Exercise 5  #
#             #
###############

# Random forest baseline with deep trees
rfHex <- h2o.randomForest(x = predictors, y = response, training_frame = train,
                          ntrees = 100, max_depth = 30, nbins_cats = 16)
h2o.auc(h2o.performance(rfHex, newdata = valid))
h2o.auc(h2o.performance(rfHex, newdata = test))

###############
#             #
# Exercise 6  #
#             #
###############

# Candidate hyperparameter values
ntrees_opts <- c(10000)  # early stopping will stop earlier
max_depth_opts <- seq(1, 20)
min_rows_opts <- c(1, 5, 10, 20, 50, 100)
learn_rate_opts <- seq(0.001, 0.01, 0.001)
sample_rate_opts <- seq(0.3, 1, 0.05)
col_sample_rate_opts <- seq(0.3, 1, 0.05)       # defined but not used below
col_sample_rate_per_tree_opts <- seq(0.3, 1, 0.05)

hyper_params.rf <- list(ntrees = ntrees_opts,
                        max_depth = max_depth_opts,
                        min_rows = min_rows_opts,
                        # learn_rate = learn_rate_opts,  # not applicable to random forests
                        sample_rate = sample_rate_opts)

# Cartesian grid search over the random forest hyperparameters
# (note: "distribution" is a GBM parameter, so it is not passed here)
grid.rf <- h2o.grid(x = predictors, y = response, training_frame = train,
                    validation_frame = valid, algorithm = "randomForest",
                    grid_id = "rf_grid_1", hyper_params = hyper_params.rf,
                    stopping_rounds = 5, stopping_tolerance = 1e-4,
                    stopping_metric = "AUC",
                    search_criteria = list(strategy = "Cartesian"),
                    seed = 1234, balance_classes = TRUE)
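Exercise 6 builds the grid but, unlike Exercise 4, stops before inspecting it. A minimal optional sketch of that follow-up, reusing the h2o.getGrid pattern from Exercise 4 (the names sorted_grid.rf and best.rf are illustrative):

# Sort the random-forest grid by validation AUC and evaluate the top model
sorted_grid.rf <- h2o.getGrid("rf_grid_1", sort_by = "auc", decreasing = TRUE)
best.rf <- h2o.getModel(sorted_grid.rf@model_ids[[1]])
h2o.auc(h2o.performance(best.rf, newdata = test))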
###############
#             #
# Exercise 7  #
#             #
###############

# Base learners must share the fold assignment and keep their
# cross-validation predictions so that they can be stacked
gbm.en <- h2o.gbm(x = predictors, y = response, training_frame = train,
                  distribution = "bernoulli", ntrees = 10, max_depth = 3,
                  min_rows = 2, learn_rate = 0.2, nfolds = 5,
                  fold_assignment = "Modulo",
                  keep_cross_validation_predictions = TRUE, seed = 1)

rf.en <- h2o.randomForest(x = predictors, y = response, training_frame = train,
                          ntrees = 50, nfolds = 5, fold_assignment = "Modulo",
                          keep_cross_validation_predictions = TRUE, seed = 1)

# Train a stacked ensemble using the GBM and RF above
ensemble.rf.gbm <- h2o.stackedEnsemble(x = predictors, y = response,
                                       training_frame = train,
                                       model_id = "en.rf.gbm",
                                       base_models = list(gbm.en@model_id,
                                                          rf.en@model_id))
perf <- h2o.performance(ensemble.rf.gbm, newdata = test)
h2o.auc(perf)

###############
#             #
# Exercise 8  #
#             #
###############

# k-means clustering of the prostate data on five columns
prostate <- h2o.importFile("data/pstate.csv")
prostate.km <- h2o.kmeans(prostate, k = 10,
                          x = c("AGE", "RACE", "GLEASON", "CAPSULE", "DCAPS"))
print(prostate.km)

###############
#             #
# Exercise 9  #
#             #
###############

# Same clustering, but initialised with the "Furthest" method
prostate.km.2 <- h2o.kmeans(prostate, k = 10,
                            x = c("AGE", "RACE", "GLEASON", "CAPSULE", "DCAPS"),
                            init = "Furthest")

# Plot the cluster centers of the new model side by side
par(mfrow = c(1, 2))
prostate.ctrs <- as.data.frame(prostate.km.2@model$centers)
plot(prostate.ctrs[, 1:2])
plot(prostate.ctrs[, 3:4])

# Assign each observation to a cluster
h2o.predict(prostate.km.2, newdata = prostate)

###############
#             #
# Exercise 10 #
#             #
###############

# Import the data for PCA and inspect its structure
pca_data <- h2o.importFile("data/Train_UWu5bXk.csv")
h2o.str(pca_data)

# 80/20 train/test split
splits.pca <- h2o.splitFrame(pca_data, c(0.8))
train.pca <- splits.pca[[1]]
test.pca <- splits.pca[[2]]

# Principal component analysis with 8 components
mkj <- h2o.prcomp(train.pca, k = 8, use_all_factor_levels = TRUE)
screeplot(mkj)
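To complement the scree plot, the variance explained per component can be read off numerically, and the cluster can be shut down at the end of the session. A short optional sketch, assuming the PCA model stores its components-importance table in mkj@model$importance (as H2O dimension-reduction models normally do):

# Standard deviation and (cumulative) proportion of variance per component
mkj@model$importance

# Shut down the local H2O cluster once finished
h2o.shutdown(prompt = FALSE)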