map
Functions with purrr
The following code has been adapted from a great presentation Brandon gave at the D-RUG group in Oct 2017. See the original post and content here
tidyverse
suppressPackageStartupMessages(library(tidyverse))
For a nice demonstration of what vectors of different data types look like, try the code below.
# initiate blank vectors of a specific type
# this can be useful when capturing data from for loops
a <- vector("logical", 5)
a
## [1] FALSE FALSE FALSE FALSE FALSE
b <- vector("integer", 4)
b
## [1] 0 0 0 0
c <- vector("double", 2) # can also use "numeric"
c
## [1] 0 0
d <- vector("character", 10)
d
## [1] "" "" "" "" "" "" "" "" "" ""
e <- vector("complex", 7)
e
## [1] 0+0i 0+0i 0+0i 0+0i 0+0i 0+0i 0+0i
f <- vector("raw", 3)
f
## [1] 00 00 00
g <- list()
g
## list()
NA
’s in RNA
’s are a common issue in dealing with data from various places (remember spreadsheet lesson?). R deals well with NA’s and even provides different NA’s for different data types.
#what do the NAs look like?
list(NA, NA_integer_, NA_real_, NA_character_, NA_complex_)
## [[1]]
## [1] NA
##
## [[2]]
## [1] NA
##
## [[3]]
## [1] NA
##
## [[4]]
## [1] NA
##
## [[5]]
## [1] NA
NULL
element# NULL is the absence of a vector
a <- c(TRUE, FALSE, TRUE, FALSE, FALSE, NA) # logical vector
a
## [1] TRUE FALSE TRUE FALSE FALSE NA
a <- NULL
a
## NULL
# let's use mtcars as example of making a column NULL
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
## wt_scaled
## Mazda RX4 0.2830478
## Mazda RX4 Wag 0.3482485
## Datsun 710 0.2063411
## Hornet 4 Drive 0.4351828
## Hornet Sportabout 0.4927129
## Valiant 0.4978266
dim(mtcars) # how many rows, cols
## [1] 32 12
# assign NULL to a given column to remove it
mtcars[1] <- NULL #this gets rid of the mpg column completely
head(mtcars)
## cyl disp hp drat wt qsec vs am gear carb wt_scaled
## Mazda RX4 6 160 110 3.90 2.620 16.46 0 1 4 4 0.2830478
## Mazda RX4 Wag 6 160 110 3.90 2.875 17.02 0 1 4 4 0.3482485
## Datsun 710 4 108 93 3.85 2.320 18.61 1 1 4 1 0.2063411
## Hornet 4 Drive 6 258 110 3.08 3.215 19.44 1 0 3 1 0.4351828
## Hornet Sportabout 8 360 175 3.15 3.440 17.02 0 0 3 2 0.4927129
## Valiant 6 225 105 2.76 3.460 20.22 1 0 3 1 0.4978266
dim(mtcars) # how many rows, cols
## [1] 32 11
data(mtcars) # restore full mtcars dataset
dim(mtcars) # how many rows, cols
## [1] 32 11
Now that we have an idea what different vectors are, let’s talk about how to make them into lists.
Lists can contain anything of any length, and data.frames
are a special type of list.
# let's put actual values into these
a <- c(TRUE, FALSE, TRUE, FALSE, FALSE, NA) # a logical vector 6 items long
b <- c(1:7, NA_integer_) # Integers
c <- c(rnorm(8), NA_real_) # pull doubles from normal distribution
d <- c(LETTERS[1:3], NA_character_) # first 3 capital letters
e <- list(a, b, c, d) #make a list
e # check it
## [[1]]
## [1] TRUE FALSE TRUE FALSE FALSE NA
##
## [[2]]
## [1] 1 2 3 4 5 6 7 NA
##
## [[3]]
## [1] 0.54604321 -0.27976609 0.11066213 -0.03871764 -2.51287248 0.47283240
## [7] -0.35411707 1.30559401 NA
##
## [[4]]
## [1] "A" "B" "C" NA
# Lists can contain anything of any length
## data.frame is a special type of list
## data.frame rows imply association so they need to line up
data.frame(a, b) # uh oh
## Error in data.frame(a, b): arguments imply differing number of rows: 6, 8
a <- c(TRUE, FALSE, TRUE, FALSE, FALSE, NA, NA, NA) # make it 8 like b
df <- data.frame(a = a, b = b)
df
## a b
## 1 TRUE 1
## 2 FALSE 2
## 3 TRUE 3
## 4 FALSE 4
## 5 FALSE 5
## 6 NA 6
## 7 NA 7
## 8 NA NA
tibbles
are data.frames with some stricter rules: - slightly more informative printing (not shown well here) - strict behavior around subsetting [
-> tibble [[
vector - does not partially match column names (b/c this can get you into trouble)
as_tibble(df)
## # A tibble: 8 x 2
## a b
## <lgl> <int>
## 1 TRUE 1
## 2 FALSE 2
## 3 TRUE 3
## 4 FALSE 4
## 5 FALSE 5
## 6 NA 6
## 7 NA 7
## 8 NA NA
See presentation, SLIDE:
#atomic vectors
a <- c(TRUE, FALSE, TRUE, FALSE, FALSE, NA) #logic vector
a
a[1]
a[[1]] # does the same thing
a[[[1]]] # nope just stop adding brackets!
# lists
list_example <- list(a = a, b = b, c = c, d = d, e = list(1, "A")) #make a named list
list_example
# single brackets returns a list with that element in it
# can reference by location or by name
list_example[1]
list_example["a"]
# double brackets return a vector
list_example[[2]]
list_example[["b"]]
#stack [['s to peel the onion back
list_example[["e"]] # remove one layer
list_example[["e"]][[2]] # remove the second layer (access e list, and 2 item in that list)
#dolla dolla bills y'all (works like [[, with named vectors)
list_example$c
#same for dataframes
mtcars[1]
mtcars[[1]]
# and for tibbles
tbbl <- tibble(x = 1:3, y = LETTERS[1:3])
tbbl[1]
tbbl["x"]
tbbl[[2]]
tbbl[["y"]]
## Error: <text>:7:4: unexpected '['
## 6: a[[1]] # does the same thing
## 7: a[[[
## ^
See presentation, SLIDE:
for
LoopsWhen should we make a function in R?
#Simple dataframe
df <- data.frame(
a = rnorm(10),
b = rnorm(10),
c = rnorm(10),
d = rnorm(10)
)
# What are we doing here?
df$a <- (df$a - min(df$a, na.rm = TRUE)) /
(max(df$a, na.rm = TRUE) - min(df$a, na.rm = TRUE))
df$b <- (df$b - min(df$b, na.rm = TRUE)) /
(max(df$b, na.rm = TRUE) - min(df$b, na.rm = TRUE))
df$c <- (df$c - min(df$a, na.rm = TRUE)) /
(max(df$c, na.rm = TRUE) - min(df$c, na.rm = TRUE))
df$d <- (df$d - min(df$d, na.rm = TRUE)) /
(max(df$d, na.rm = TRUE) - min(df$d, na.rm = TRUE))
Anytime we need to do the same thing 3 or more times, it would probably be a good idea to think about creating a function. Stepping through these pieces, here’s what we’d want to do:
First what is the calculation/task you want to do?
df$d <- (df$d - min(df$d, na.rm = TRUE)) /
(max(df$d, na.rm = TRUE) - min(df$d, na.rm = TRUE))
# exactly as above
Now pull out the common components:
# pull out what's common
x <- (x - min(x, na.rm = TRUE)) /
(max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
Problem is the above isn’t a function yet, it’s just the “anonymous” calculation we want our function to follow. We can simplify even more if we want:
# simplify more and refactor if that makes sense
x <- (x - min(x, na.rm = TRUE)) /
diff(range(x, na.rm = TRUE))
And now add the actual function()
call!
rescale_0_1 <- function(x) {
(x - min(x, na.rm = TRUE)) /
diff(range(x, na.rm = TRUE))
}
Let’s test this function and see what happens. We can test a single vector, and then actually apply our function to each column in our dataset. Use a test dataset that’s simple and we now what it should look like.
rescale_0_1 <- function(x) {
(x - min(x, na.rm = TRUE)) /
diff(range(x, na.rm = TRUE))
}
testvec <- 10:20 # so this should get rescaled from 0 to 1
testvec
## [1] 10 11 12 13 14 15 16 17 18 19 20
rescale_0_1(testvec) # looks like it worked, it rescaled all values from 0 to 1!
## [1] 0.0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0
df <- data.frame(
a = rnorm(10),
b = rnorm(10),
c = rnorm(10),
d = rnorm(10)
)
rescale_0_1 <- function(x) {
(x - min(x, na.rm = TRUE)) /
diff(range(x, na.rm = TRUE))
}
out <- df # rename df so we can always come back
out$a <- rescale_0_1(df$a)
out$b <- rescale_0_1(df$b)
out$c <- rescale_0_1(df$c)
out$d <- rescale_0_1(df$d)
out
## a b c d
## 1 0.210583367 0.58944212 0.50976037 0.8524446
## 2 0.456383992 0.41784664 0.50200281 0.5580446
## 3 0.671156870 0.33566716 0.77986187 0.4537088
## 4 0.849874511 1.00000000 0.39146476 0.8358137
## 5 0.002931114 0.17498379 0.00000000 1.0000000
## 6 0.806134726 0.02530868 0.28712257 0.3631367
## 7 0.655715321 0.24926608 0.02283683 0.9162297
## 8 0.643930985 0.00000000 0.18657748 0.4917923
## 9 1.000000000 0.55112356 0.68806649 0.0000000
## 10 0.000000000 0.09840771 1.00000000 0.8234917
for
Loops to apply a functionReduce iteration with a for
loop, which takes a vector or list, and interates or steps through each piece of that vector/list, does something and returns values/data.
out <- df #make a copy to store scaled data
for (i in 1:ncol(df)) { # loop through each element
out[i] <- rescale_0_1(df[[i]]) #apply function and store it to out
}
out
## a b c d
## 1 0.210583367 0.58944212 0.50976037 0.8524446
## 2 0.456383992 0.41784664 0.50200281 0.5580446
## 3 0.671156870 0.33566716 0.77986187 0.4537088
## 4 0.849874511 1.00000000 0.39146476 0.8358137
## 5 0.002931114 0.17498379 0.00000000 1.0000000
## 6 0.806134726 0.02530868 0.28712257 0.3631367
## 7 0.655715321 0.24926608 0.02283683 0.9162297
## 8 0.643930985 0.00000000 0.18657748 0.4917923
## 9 1.000000000 0.55112356 0.68806649 0.0000000
## 10 0.000000000 0.09840771 1.00000000 0.8234917
seq_along
Handy command that sequences along a list for you, so you don’t have to specify columns or rows, or series of numbers.
out <- df #make a copy to store scaled data
for (i in seq_along(df)) {
out[i] <- rescale_0_1(df[[i]])
}
out
## a b c d
## 1 0.210583367 0.58944212 0.50976037 0.8524446
## 2 0.456383992 0.41784664 0.50200281 0.5580446
## 3 0.671156870 0.33566716 0.77986187 0.4537088
## 4 0.849874511 1.00000000 0.39146476 0.8358137
## 5 0.002931114 0.17498379 0.00000000 1.0000000
## 6 0.806134726 0.02530868 0.28712257 0.3631367
## 7 0.655715321 0.24926608 0.02283683 0.9162297
## 8 0.643930985 0.00000000 0.18657748 0.4917923
## 9 1.000000000 0.55112356 0.68806649 0.0000000
## 10 0.000000000 0.09840771 1.00000000 0.8234917
map
and apply
FunctionsThere are a family of functions called apply
(mapply
, sapply
, lapply
, vapply
) which essentially take a list of something and apply a function or operation on each item in that list. These are base functions in R, but a newer set of functions (which do the same thing & more) have been written in R, called the map
functions. Let’s see how they migth work.
apply
df_app1 <- apply(df, MARGIN = 2, function(x) rescale_0_1(x)) # iterate over the columns in df
class(df_app1) # returns a matrix
## [1] "matrix"
df_app2 <- sapply(df, function(x) rescale_0_1(x)) # same as above but defaults to cols
Okay so apply
returns a matrix. What about map
?
map
library(purrr)
df_map1<- map(df, function(x) rescale_0_1(x)) #map over the columns in df
class(df_map1) # returns a list! Keeps same format that was input
df_map2 <- map_df(df, rescale_0_1) # returns a dataframe!
Let’s look at another more simple example.
map
Another set of examples below will help us understand how map
makes it easy to get different data classes (matrix
,list
, dataframe
) back out. The default map
function takes a list and returns a list, but there are various options that allow you to specify what you want.
This example creates a list containing two integer vectors. Let’s say we want the mean of each one:
l = list(a=1:10, b = 10:100)
map(l, function(x) mean(x, na.rm = TRUE)) # and this is returning a list (list in, list out)
double
type backSpecify and you get a vector back instead of a list
l = list(a=1:10, b = 10:100)
map_dbl(l, function(x) mean(x, na.rm = TRUE)) # this is a numeric vector
character
backMaybe not so useful, but demonstrates the point.
map_chr(l, function(x) mean(x, na.rm = TRUE))
So if you have or try to use the wrong type of data, the function won’t run.
l = list(a=1:10, b = 10:100)
map_lgl(l, function(x) mean(x, na.rm = TRUE))
map
Examplesmap
Use Case #1: Read in filesBased upon this tutorial http://data.library.virginia.edu/getting-started-with-the-purrr-package-in-r/
# this will work if you clone the github repository
fileloc <- file.path(getwd(), "stocks")
#create a list of files to read in that end in csv
files <- list.files(fileloc, pattern = "csv$", full.names = TRUE)
#this reads in each csv independently into a list
dat2 <- map(files, read_csv)
dat2
fileloc <- file.path(getwd(), "stocks")
read_folder <- function(folder) {
# get files that end with csv or CSV
files <- list.files(folder, pattern = "(csv|CSV)$", full.names = TRUE)
#this reads in each csv independently into a list
map(files, read_csv) # last thing is returned
}
read_folder(fileloc)
fileloc <- file.path(getwd(), "stocks")
read_folder <- function(folder, pattern, read_fun) {
# get files that end with csv or CSV
files <- list.files(folder, pattern = pattern, full.names = TRUE)
#this reads in each csv independently into a list
map(files, read_fun)
}
read_folder(folder = fileloc, pattern = "(csv|CSV)$", read_fun = read_csv)
# OR more succinctly
# read_folder(fileloc, "(csv|CSV)$", read_csv)
If there are multiple files but all have same structure (same number of columns/column types), we can read in separately and then combine. But add the filename of each dataset as a new column variable.
fileloc <- file.path(getwd(), "stocks")
# This function reads a file and appends a column called "name" for ID
read_fun <- function(file, FUN) {
dat <- FUN(file) # read in file
dat$name <- tools::file_path_sans_ext(basename(file)) #record the filename for later grouping
dat #bring data back
}
# This function reads a folder for files of given pattern
# then applies the given read function to read them in read_fun
read_files_bind <- function(folder, pattern, FUN) {
# get files that end with csv or CSV
files <- list.files(folder, pattern = pattern, full.names = TRUE)
map_dfr(files, read_fun, FUN)
}
# Read everything in at once and bind it all together
read_files_bind(folder = fileloc, pattern = "(csv|CSV)$", FUN = read_csv)
Ok, so we can get data into our environment in a list, but what then? There are a number of ways to deal with these data, or to deal with lists within dataframes…called list columns.
tidyr::nest()
embeds tibbles in a columnSee how to group by something and then create a list-columns that hold each dataframe for that group.
cars_nested <-
mtcars %>% # motortrend car dataset
group_by(cyl) %>% # group by cylinder
nest(.key = "data") # nest data by cyl, default key is "data"
## Warning: `.key` is deprecated
cars_nested
## # A tibble: 3 x 2
## # Groups: cyl [3]
## cyl data
## <dbl> <list<df[,10]>>
## 1 6 [7 × 10]
## 2 4 [11 × 10]
## 3 8 [14 × 10]
map
Use map
!. Let’s make a linear model for each subset, pull out the coefficients with broom
, and plot them.
library(broom)
library(purrr)
library(dplyr)
library(tidyr)
mtc_modeled <-
mtcars %>% # motortrend car dataset
group_by(cyl) %>% # group by cylinder
nest(.key = "data") %>% # nest data by cyl, default key is "data"
mutate(model = map(data, ~ lm(mpg ~ disp, data = .x))) %>% # make models
mutate(coeff = map(model, tidy)) %>% #pull out coefficients
mutate(slope = map_dbl(coeff, ~ pull(.x, estimate)[2])) %>% # pull slope from coefficients
mutate(intercept = map_dbl(coeff, ~ pull(.x, estimate)[1])) # pull intercept from coefficients
ggplot() +
# plot the original data
geom_point(data = mtcars, aes(x = disp, y = mpg, color = as.factor(cyl))) +
# plot the models
geom_abline(data = mtc_modeled, aes(slope = slope, intercept = intercept, color = as.factor(cyl)))
See presentation, SLIDE:
map2
and pmap
n = list(5,10,20)
mu = list(50,30,10)
sd = list(10,5,1)
# how does it look as we go from map to map2 to pmap?
map(n, rnorm)
map2(n, mu, rnorm)
pmap(list(n, mu, sd), rnorm)
#technically, pmap can do it all
pmap(list(n), rnorm)
pmap(list(n, mu), rnorm)
# formula interface
map(n, ~rnorm(.))
map2(n, mu, ~ rnorm(.x, .y))
pmap(list(n, mu, sd), ~rnorm(..1, ..2, ..3)) # ..4 and so on for as many as you need
# CAUTION: position matters with map2 & pmap
map2(n, mu, rnorm)
map2(mu, n, rnorm)
pmap(list(n, mu, sd), rnorm)
pmap(list(mu, n, sd), rnorm)
# if you need non-default ordering, specify with full function call
map2(mu, n, function(x, y) rnorm(y, x))
# or using the formula interface
map2(mu, n, ~ rnorm(.y, .x))
# pmap you can name your list to get around positional ordering
pmap(list(mean=mu, n=n, sd=sd), rnorm)
# OR you can specify with a full function call
pmap(list(mu, n, sd), function(mu, n, sd) rnorm(n, mu, sd))
# OR with the formula interface
pmap(list(mu, n, sd), ~ rnorm(..2, ..1, ..3))
See presentation, SLIDE:
walk/walk2/pwalk
are used to output side-effectsasdf <- list(a = LETTERS[1:10], b = 1:15, c = FALSE)
# we just want to print each element of this list
walk(asdf, print)
map
Functions# make a simple plotting function
plot_points <- function(t) ggplot(t, aes(mpg, wt)) + geom_point()
mtcars %>% # motortrend car dataset
group_by(cyl) %>% # group by cylinder
nest(.key = "data") %>% # nest by cylinder
mutate(plot = map(data, plot_points)) %>% # we made a graph for each subset
pull(plot) %>% #pull the plots out as a list
walk(., print) # print them
walk
with walk2/pwalk
# make a simple plotting function
plot_points <- function(t) ggplot(t, aes(mpg, wt)) + geom_point()
mtc_plot_n_save <-
mtcars %>% # motortrend car dataset
group_by(cyl) %>% # group by cylinder
nest(.key = "data") %>% # nest by cylinder
mutate(plot = map(data, plot_points)) %>% # we made a graph for each subset
mutate(filepath = map_chr(cyl, ~ file.path("figures", paste0(.x, ".png")))) #make the save paths
walk2(mtc_plot_n_save$plot, mtc_plot_n_save$filepath, ~ ggsave(.y, .x)) # save them out
That is great! Ok, let’s make this a bit fancier. With pwalk
we can have a bunch of different lists of things that get mapped…not just 1 or 2.
# make a simple plotting function
plot_points <- function(t) ggplot(t, aes(mpg, wt)) + geom_point()
#function that prints cylinder info, ggplot, and table
print_func <-
function(info, plot, table, filepath) {
print(paste0("This is for ", info, " cylinders./n"))
print(plot)
ggsave(filepath)
print(table)
}
mtc_plot_n_save <-
mtcars %>% # motortrend car dataset
group_by(cyl) %>% # group by cylinder
nest(.key = "data") %>% # nest by cylinder
mutate(plot = map(data, plot_points)) %>% # we made a graph for each subset
mutate(filepath = map_chr(cyl, ~ file.path("figures", paste0(.x, ".png")))) # make the save paths
# print all the info
pwalk(list(info = mtc_plot_n_save$cyl, plot = mtc_plot_n_save$plot, table = mtc_plot_n_save$data, filepath = mtc_plot_n_save$filepath), print_func)
This lesson is adapted from Brandon Hurr’s D-RUG presentation materials.