Below are the solutions to these exercises on “Sharpening The Knives in The data.table Toolbox.”
# Exercise 1 ----
# Load the gapminder data, turn it into a data.table and count how many
# distinct countries it contains.
library(gapminder)
library(data.table)

# Bind the data under a short name, then convert that binding to a data.table
# by reference with setDT().
gp <- gapminder
setDT(gp)

# Number of distinct countries.
uniqueN(gp$country)
## [1] 142
# Exercise 2 ----
# Add the previous observation's GDP per capita as a new column.
# Grouping by country keeps the lag from crossing country boundaries, so the
# first row of every country is NA. This relies on the rows being in year
# order within each country (the printed head shows that they are).
gp[, gdpPercap_l1 := shift(gdpPercap, n = 1L, type = "lag"), by = "country"]
head(gp)
##        country continent year lifeExp      pop gdpPercap gdpPercap_l1
## 1: Afghanistan      Asia 1952  28.801  8425333  779.4453           NA
## 2: Afghanistan      Asia 1957  30.332  9240934  820.8530     779.4453
## 3: Afghanistan      Asia 1962  31.997 10267083  853.1007     820.8530
## 4: Afghanistan      Asia 1967  34.020 11537966  836.1971     853.1007
## 5: Afghanistan      Asia 1972  36.088 13079460  739.9811     836.1971
## 6: Afghanistan      Asia 1977  38.438 14880372  786.1134     739.9811
# Exercise 3 ----
# For each continent, find the country with the highest GDP-per-capita growth
# between the previous observation and 2007.
# Step 1: keep the 2007 rows and compute growth relative to the lagged value.
# Step 2: sort ascending by growth, so the last row of each continent group is
#         the fastest grower, and pick it with last().
gp[
  year == 2007,
  .(country, continent, growth07 = gdpPercap / gdpPercap_l1 - 1)
][
  order(growth07),
  .(country = last(country), growth07 = last(growth07)),
  by = continent
]
##    continent             country  growth07
## 1:      Asia            Cambodia 0.9122171
## 2:    Africa              Angola 0.7297996
## 3:  Americas Trinidad and Tobago 0.5713408
## 4:    Europe          Montenegro 0.4112585
## 5:   Oceania           Australia 0.1221208
# Alternatively you can extract the last observation with .N:
# inside a grouped j expression, .N is the number of rows in the current
# group, so x[.N] is the group's last element.
gp[
  year == 2007,
  .(country, continent, growth07 = gdpPercap / gdpPercap_l1 - 1)
][
  order(growth07),
  .(country = country[.N], growth07 = growth07[.N]),
  by = continent
]
##    continent             country  growth07
## 1:      Asia            Cambodia 0.9122171
## 2:    Africa              Angola 0.7297996
## 3:  Americas Trinidad and Tobago 0.5713408
## 4:    Europe          Montenegro 0.4112585
## 5:   Oceania           Australia 0.1221208
# Exercise 4 ----
# Save the column names, rename a column by reference with setnames(), and
# then inspect the saved vector: it shows the new name "anno" as well.
temp <- names(gp)
setnames(gp, old = "year", new = "anno")
temp
## [1] "country"      "continent"    "anno"         "lifeExp"
## [5] "pop"          "gdpPercap"    "gdpPercap_l1"

# The two memory addresses are identical.
address(temp)
## [1] "0000000015951F98"
address(names(gp))
## [1] "0000000015951F98"
# Both are actually just referring to the same object: "<-" passed the names
# only by reference.
# Being aware of this is the price of the speed data.table gives.
# No such thing as a free lunch.

# Exercise 5 ----
# Reload the data and repeat the experiment, but this time take an explicit
# copy() of the names before renaming, so `temp` keeps the old name "year".
data(gapminder)
gp <- gapminder
setDT(gp)
temp <- copy(names(gp))
setnames(gp, old = "year", new = "anno")
temp
## [1] "country"   "continent" "year"      "lifeExp"   "pop"       "gdpPercap"
names(gp)
## [1] "country"   "continent" "anno"      "lifeExp"   "pop"       "gdpPercap"

# With the copy, the two memory addresses differ.
address(temp)
## [1] "000000001BA9BC30"
address(names(gp))
## [1] "000000001BAB3A80"
# Convert factors to characters
# vapply() (instead of sapply()) guarantees a logical vector whatever the
# input, and names(which(...)) yields the names of the factor columns directly.
factcols <- names(which(vapply(gp, is.factor, logical(1))))
gp[, (factcols) := lapply(.SD, as.character), .SDcols = factcols]
# Actually there should be a cleaner way to do this without losing
# generalizability
# Please comment if you think you have the answer
# NOTE(review): newer data.table releases appear to accept a predicate in
# .SDcols (e.g. `.SDcols = is.factor`) -- confirm against the installed version
# before relying on it.

# Exercise 6 ----
# Build a small table of goals and attach each country's 2007 population in
# millions, looked up in gp by matching on the country name.
gA_2014 <- data.table(
  country = c("Brazil", "Mexico", "Croatia", "Cameroon"),
  goals2014 = c(7, 4, 6, 1)
)

# Inside the inner gp[...] call, `country` is gp's column, while
# gA_2014$country is the lookup key; chmatch() returns the matching row
# positions within the 2007 subset.
gA_2014[
  ,
  pop_mill := gp[anno == 2007][
    chmatch(gA_2014$country, country),
    round(pop / 1e6)
  ]
]
gA_2014
##     country goals2014 pop_mill
## 1:   Brazil         7      190
## 2:   Mexico         4      109
## 3:  Croatia         6        4
## 4: Cameroon         1       18
# Exercise 7 ----
# First make sure data is ordered by country and year
gp <- gp[order(country, anno)]

# Years from first 8k: for every country, the number of years elapsed since
# its first observation with gdpPercap >= 8000. match(TRUE, ...) gives the
# position of that first observation, or NA when the country never reaches
# the threshold, in which case the whole column is NA for that country.
gp[, years_from8k := anno - anno[match(TRUE, gdpPercap >= 8e3)], by = country]

# Observations before the threshold was first reached are set to missing.
gp[years_from8k < 0, years_from8k := NA]
head(gp)
##        country continent anno lifeExp      pop gdpPercap years_from8k
## 1: Afghanistan      Asia 1952  28.801  8425333  779.4453           NA
## 2: Afghanistan      Asia 1957  30.332  9240934  820.8530           NA
## 3: Afghanistan      Asia 1962  31.997 10267083  853.1007           NA
## 4: Afghanistan      Asia 1967  34.020 11537966  836.1971           NA
## 5: Afghanistan      Asia 1972  36.088 13079460  739.9811           NA
## 6: Afghanistan      Asia 1977  38.438 14880372  786.1134           NA
# Exercise 8 ----
# Number, within each country, the observations whose gdpPercap is at least
# 8000 (1 for the first such observation, 2 for the second, ...). Rows below
# the threshold are left as NA.
gp[gdpPercap >= 8e3, obs8k_numb := rowid(country)]

# This is not the same kind of variable because countries could fall below 8k
# again

# Among the 2007 rows that have a count, list for each continent the
# country (or countries, on ties) with the highest count.
gp[anno == 2007 & !is.na(obs8k_numb)][
  order(obs8k_numb),
  {
    is_top <- obs8k_numb == max(obs8k_numb)
    list(country[is_top], obs8k_numb[is_top])
  },
  by = continent
]
##     continent             V1 V2
##  1:  Americas         Canada 12
##  2:  Americas  United States 12
##  3:    Africa          Gabon  9
##  4:    Africa          Libya  9
##  5:    Europe        Belgium 12
##  6:    Europe        Denmark 12
##  7:    Europe    Netherlands 12
##  8:    Europe         Norway 12
##  9:    Europe         Sweden 12
## 10:    Europe    Switzerland 12
## 11:    Europe United Kingdom 12
## 12:      Asia        Bahrain 12
## 13:      Asia         Kuwait 12
## 14:   Oceania      Australia 12
## 15:   Oceania    New Zealand 12
# Exercise 9 ----
# Select the 2002 rows whose life expectancy lies in either of two intervals.
# %inrange% takes a list of (lower bounds, upper bounds); pairing them
# element-wise gives the intervals [0, 40] and [80, Inf].
gp[
  anno == 2002 &
    lifeExp %inrange% list(lower = c(0, 80), upper = c(40, Inf))
]
##             country continent anno lifeExp       pop  gdpPercap
## 1:        Australia   Oceania 2002  80.370  19546792 30687.7547
## 2: Hong Kong, China      Asia 2002  81.495   6762476 30209.0152
## 3:          Iceland    Europe 2002  80.500    288030 31163.2020
## 4:            Italy    Europe 2002  80.240  57926999 27968.0982
## 5:            Japan      Asia 2002  82.000 127065841 28604.5919
## 6:           Sweden    Europe 2002  80.040   8954175 29341.6309
## 7:      Switzerland    Europe 2002  80.620   7361757 34480.9577
## 8:           Zambia    Africa 2002  39.193  10595811  1071.6139
## 9:         Zimbabwe    Africa 2002  39.989  11926563   672.0386
##    years_from8k obs8k_numb
## 1:           50         11
## 2:           30          7
## 3:           45         10
## 4:           40          9
## 5:           35          8
## 6:           50         11
## 7:           50         11
## 8:           NA         NA
## 9:           NA         NA
# Exercise 10 ----
# Split a "for-against" score string into two separate columns and drop the
# original combined column.
gA_2014b <- data.table(
  country = c("Brazil", "Mexico", "Croatia", "Mexico"),
  goals2014 = c("7-2", "4-1", "6-6", "1-9")
)

# tstrsplit() splits each string on "-" and returns one vector per piece,
# which := assigns to the two new columns.
gA_2014b[
  ,
  c("goals_for", "goals_against") := tstrsplit(goals2014, "-", fixed = TRUE)
]
gA_2014b[, goals2014 := NULL]
gA_2014b
##    country goals_for goals_against
## 1:  Brazil         7             2
## 2:  Mexico         4             1
## 3: Croatia         6             6
## 4:  Mexico         1             9
Leave a Reply