Below are the solutions to these exercises on parallel computing in R with the snow and Rmpi packages.
####################
#   Exercise 1     #
####################

# Load the full data set from disk.
data_large <- read.csv("InstEval_reduced.csv")

####################
#   Exercise 2     #
####################

# Create a smaller data set: draw 10% of the rows, without replacement.
# seq_len() is safer than 1:nrow() (it yields integer(0) for an empty frame).
set.seed(1234)
selected_rows <- sample(x = seq_len(nrow(data_large)),
                        size = nrow(data_large) / 10)
data_small <- data_large[selected_rows, ]

# Print the number of rows in both data sets.
print(nrow(data_large))
## [1] 10000
# Confirm the reduced data set holds one tenth of the rows of data_large.
print(nrow(data_small))
## [1] 1000
####################
#   Exercise 3     #
####################

# Fit a linear regression of y on all other columns, using a bootstrap
# resample of `data` (rows drawn with replacement, same size as `data`).
#
# @param data A data frame containing a numeric column `y` and the
#   predictor columns.
# @return The named vector of fitted coefficients.
regress_resampled_data <- function(data) {
  # Bootstrap: draw nrow(data) row indices with replacement.
  rows_resampled <- sample(x = seq_len(nrow(data)),
                           size = nrow(data),
                           replace = TRUE)
  data_resampled <- data[rows_resampled, ]
  # Regress y on every other column of the resampled data.
  fit <- lm(y ~ ., data = data_resampled)
  fit$coefficients
}

####################
#   Exercise 4     #
####################

library(foreach)

# Run the bootstrap regression 10 times on the large data set, sequentially
# (%do% executes the iterations in the current process).
system.time(
  result_large <- foreach(1:10, .combine = rbind) %do%
    regress_resampled_data(data_large)
)
## user system elapsed ## 6.297 0.149 6.447
# Run the function 100 times on the SMALL data set, sequentially.
# (The original comment wrongly said "10 times with the large dataset".)
system.time(
  result_small <- foreach(1:100, .combine = rbind) %do%
    regress_resampled_data(data_small)
)
## user system elapsed ## 7.500 0.011 7.513
# The elapsed times were:
#   6.447 s for the large data set (10 runs), and
#   7.513 s for the small data set (100 runs).
# The difference may be attributed to the foreach loop overhead.
# (You will very likely see other figures due to differences in hardware
# and software configuration.)

####################
#   Exercise 5     #
####################

library(snow)
library(doSNOW)

# Start a 2-worker socket cluster and register it as the %dopar% backend.
cluster_snow <- makeCluster(2, type = "SOCK")
registerDoSNOW(cluster_snow)

####################
#   Exercise 6     #
####################

library(foreach)
library(snow)

# Run the task 10 times on the large data set, in parallel on the
# registered snow cluster.
system.time(
  result_large <- foreach(1:10, .combine = rbind) %dopar%
    regress_resampled_data(data_large)
)
## user system elapsed ## 0.054 0.015 3.814
# Shut the workers down as soon as the task is done.
stopCluster(cluster_snow)

# The execution time is 3.814 s:
# about 41% faster compared to the task run sequentially.

####################
#   Exercise 7     #
####################

library(snow)
library(doSNOW)
library(foreach)

# Prepare a fresh 2-worker cluster.
cluster_snow <- makeCluster(2, type = "SOCK")
registerDoSNOW(cluster_snow)

# Run the task 100 times on the small data set, in parallel.
system.time(
  result_small <- foreach(1:100, .combine = rbind) %dopar%
    regress_resampled_data(data_small)
)
## user system elapsed ## 0.114 0.014 6.253
# Stop the cluster.
stopCluster(cluster_snow)

# The execution time is 6.253 s: only about 17% faster compared to the
# task run sequentially. This reflects the fact that the overhead related
# to inter-process communication can be significant (the shorter the
# repeated task, the more noticeable this overhead is). In some cases, a
# computation in parallel can even take more time than a sequential one.

####################
#   Exercise 8     #
####################

library(doMPI)

# Spawn a 2-worker MPI cluster.
cluster_mpi <- startMPIcluster(2)
## 2 slaves are spawned successfully. 0 failed.
# Register the MPI cluster as the %dopar% backend.
registerDoMPI(cluster_mpi)

####################
#   Exercise 9     #
####################

library(doMPI)

# Release the MPI workers.
closeCluster(cluster_mpi)

####################
#   Exercise 10    #
####################

library(doMPI)
library(foreach)

# Run the task 10 times with the data_large data set.
cluster_mpi <- startMPIcluster(count = 2)
## 2 slaves are spawned successfully. 0 failed.
# Point %dopar% at the freshly spawned MPI cluster, then time 10
# bootstrap regressions on the large data set.
registerDoMPI(cluster_mpi)
system.time(
  result_large <- foreach(1:10, .combine = rbind) %dopar%
    regress_resampled_data(data_large)
)
## user system elapsed ## 2.018 2.001 4.472
# Release the current MPI workers (no space before the call parentheses).
closeCluster(cluster_mpi)

# Run the task 100 times with the data_small data set.
cluster_mpi <- startMPIcluster(count = 2)
## 2 slaves are spawned successfully. 0 failed.
# Register the new cluster and time 100 bootstrap regressions on the
# small data set, in parallel via MPI.
registerDoMPI(cluster_mpi)
system.time(
  result_small <- foreach(1:100, .combine = rbind) %dopar%
    regress_resampled_data(data_small)
)
## user system elapsed ## 5.227 0.835 6.139
# Release the MPI workers.
closeCluster(cluster_mpi)

# When you finish work, shut down the MPI execution environment
# (mpi.finalize() comes from Rmpi; after this no further MPI calls
# are possible in the session).
mpi.finalize()
## [1] 1
# For the large data set, parallel execution with Rmpi is about 31% faster
# than the sequential one, but 17% slower than parallel execution with snow
# (note that in other cases the situation may be different, and
# computations with snow may be slower).
#
# For the small data set, parallel execution with Rmpi is 18% faster than
# the sequential one, and 2% faster than parallel execution with snow.
Leave a Reply