Below are the solutions to these exercises on parallel computing.

####################
#                  #
#    Exercise 1    #
#                  #
####################

# library() stops with an error if the package is missing, whereas require()
# only returns FALSE and lets the script fail later.
library(parallel)

# Number of CPU cores detected on this machine.
detectCores()
## [1] 4

# NOTE(review): logical = TRUE is already the default of detectCores(), so this
# call repeats the one above; use logical = FALSE to count physical cores.
detectCores(logical = TRUE)
## [1] 4

####################
#                  #
#    Exercise 2    #
#                  #
####################

# Load the data set used throughout the remaining exercises.
df <- read.csv("data_snowfall.csv")

####################
#                  #
#    Exercise 3    #
#                  #
####################

# Sequential baseline: k-means with 3 clusters and 30 random starts, timed.
system.time(fit_30 <- kmeans(df, centers = 3, nstart = 30))
##    user  system elapsed
##   5.006   0.005   5.054

####################
#                  #
#    Exercise 4    #
#                  #
####################

index <- sample(seq_len(nrow(df)), 3)        # select 3 rows randomly
initial_points <- df[index, ]                # subset the data set
initial_points <- as.matrix(initial_points)  # transform the subset into a matrix
fit <- kmeans(df, centers = initial_points)  # run kmeans

####################
#                  #
#    Exercise 5    #
#                  #
####################

# Run k-means once on `df`, starting from 3 randomly chosen rows.
#
# `arg` is not used inside the function (see the explanation below).
# Returns the fitted "kmeans" object.
wrapper <- function(arg) { # line added
  index <- sample(seq_len(nrow(df)), 3)
  initial_points <- df[index, ]
  initial_points <- as.matrix(initial_points)
  fit <- kmeans(df, centers = initial_points)
  return(fit) # line added
}

# The reason why an argument (arg) is used in the wrapper() function
# definition is that this function is expected to be passed as an argument
# to the sfLapply() function (a parallelized version of lapply()).
#
# Both sfLapply() and lapply() take a function as an argument, and that
# function is required to be a function of at least one argument.
# In the present wrapper() function, the argument can be thought of as
# a serial number of the wrapper() function call.
#
# There is an alternative way of writing the wrapper() function in this case,
# which makes use of an argument. One can create a list of matrices,
# each of which represents a set of randomly chosen initial points.
# That list can be passed to sfLapply() as the first argument (X).
# If this is the case, the wrapper() function will use one set at a time,
# and the argument will represent that set (a matrix).
#
# The approach proposed in this set of exercises is probably easier to
# understand for people who implement the sfLapply() function for the first
# time. This approach may also be less error-prone.

####################
#                  #
#    Exercise 6    #
#                  #
####################

library(snowfall)
sfInit(parallel = TRUE, cpus = 3)
## R Version: R version 3.1.2 (2014-10-31)

# wrapper() reads `df` from its environment, so the data set is exported
# to the cluster before any parallel call.
sfExport("df")
sfClusterSetupRNG(seed = 1234)
## [1] "RNGstream"

####################
#                  #
#    Exercise 7    #
#                  #
####################

library(snowfall)

# Call wrapper() 30 times through sfLapply(); the values 1:30 only serve as
# call numbers. The list of fits is stored in `result`.
system.time(result <- sfLapply(1:30, wrapper))
##    user  system elapsed
##   0.017   0.002   1.017

####################
#                  #
#    Exercise 8    #
#                  #
####################

library(snowfall)
sfStop()

####################
#                  #
#    Exercise 9    #
#                  #
####################

class(result)
## [1] "list"

length(result)
## [1] 30

str(result[[1]])
## List of 9
##  $ cluster     : int [1:57403] 2 2 1 2 2 3 2 2 2 3 ...
##  $ centers     : num [1:3, 1:4] -1.415 0.296 0.421 -0.754 0.762 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:3] "1" "2" "3"
##   .. ..$ : chr [1:4] "v1" "v2" "v3" "v4"
##  $ totss       : num 229608
##  $ withinss    : num [1:3] 14835 48755 25606
##  $ tot.withinss: num 89195
##  $ betweenss   : num 140413
##  $ size        : int [1:3] 11600 23050 22753
##  $ iter        : int 4
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"

result[[1]]$tot.withinss
## [1] 89194.77

####################
#                  #
#    Exercise 10   #
#                  #
####################

# Collect the total within-cluster sum of squares of every fit in `result`.
tot_withinss <- vapply(result, function(fit) fit$tot.withinss, numeric(1))

# which.min() returns the index of the first minimum, i.e. the same fit a
# sequential scan with a strict `<` comparison would keep.
best_result <- result[[which.min(tot_withinss)]]
min_ss <- best_result$tot.withinss

print(best_result$tot.withinss)
## [1] 79635.16

print(fit_30$tot.withinss)
## [1] 79635.16

# The total sums of squared distances are equal, which suggests that the
# results are identical.

## Leave a Reply