问题
Let us say I want to fit a linear regression model on the mtcars dataset several times, each time on a different sample. The idea is to store, for each iteration of a for loop, the result of predict() for the model fitted on that iteration's sample. A small example for a single run follows:
## Fit the model once on a random sample, then apply it to the full dataset:
Sample_Size <- 10
Sample <- mtcars[sample.int(nrow(mtcars), size = Sample_Size), ]
Model <- lm(formula = mpg ~ wt, data = Sample)
## One-row matrix of predicted mpg, with one named column per car:
Predictions <- t(predict(Model, newdata = mtcars))
This yields
> Predictions
Mazda RX4 Mazda RX4 Wag Datsun 710 Hornet 4 Drive Hornet Sportabout
[1,] 25.80494 23.89161 28.05592 21.34051 19.65228
Valiant Duster 360 Merc 240D Merc 230 Merc 280 Merc 280C Merc 450SE
[1,] 19.50221 18.67685 21.52809 21.82822 19.65228 19.65228 14.92523
Merc 450SL Merc 450SLC Cadillac Fleetwood Lincoln Continental
[1,] 17.47633 17.10117 6.071394 4.765828
.... and so on for other cars
I would like to perform this procedure several times inside a for loop, each time choosing a different sample and getting the corresponding Predictions vector, and store all the Predictions results row by row in a data frame.
Let's say I run the model for two different samples. Each row of the resulting dataframe should be the outcome above for that sample, like:
Mazda RX4 Mazda RX4 Wag Datsun 710 Hornet 4 Drive Hornet Sportabout
[1,] 25.80494 23.89161 28.05592 21.34051 19.65228
[2,] 22.80492 22.89147 28.05532 21.34231 20.65290
Valiant Duster 360 Merc 240D Merc 230 Merc 280 Merc 280C Merc 450SE
[1,] 19.50221 18.67685 21.52809 21.82822 19.65228 19.65228 14.92523
[2,] 21.83492 23.84147 29.02532 21.34231 20.35290 18.45228 13.92523
... and so on for other cars.
Any idea on how to go about doing this? I have developed something but it either throws an error or only stores the last result...What am I missing here?
Here is what I have so far:
### Attempt: inside a for loop, collect the Predictions of every run.
Bootstrap_times <- 2
Sample_Size <- 10
Predictions <- list()
Results <-vector ("list",Bootstrap_times)## Intended to store the Predictions for each run; never assigned to in the loop below
for(i in 1:Bootstrap_times){
### Take a random sample of Sample_Size rows from mtcars.
### NOTE(review): Sample is not created as a list in this snippet; if it is
### still a data.frame left over from earlier code, this assignment is
### dispatched to `[[<-.data.frame` -- confirm.
Sample[[i]] <- mtcars[sample(nrow(mtcars), Sample_Size), ]
### Fit the regression mpg ~ wt on the i-th sample.
### NOTE(review): Model is likewise not initialised as a list here -- confirm.
Model[[i]] <- lm(formula = mpg ~ wt, data = Sample[[i]])
### Predict for every row of the full mtcars data (newdata=mtcars), not only the sample.
Predictions[[i]] <- predict(Model[[i]],newdata=mtcars)
### Transpose the prediction vector into a one-row matrix, stored back in Predictions[[i]].
Predictions[[i]] <- t(Predictions[[i]])
### NOTE(review): return() is called here inside the loop; no enclosing
### function is visible in this snippet.
return(Predictions)
}
However, I keep getting:
Error in `[[<-.data.frame`(`*tmp*`, i, value = list(mpg = c(13.3, 10.4, :
  replacement has 10 rows, data has 0
回答1:
I prefer to use magic_for(); however, you can also do this with base R pretty easily.
Here's an example:
## Fit mpg ~ wt on a fresh random sample in each iteration and collect the
## predictions for the full mtcars data with the magicfor package.
Bootstrap_times <- 2
Sample_Size <- 10
## Pre-allocate real lists so that `[[i]] <-` stores one whole object per run.
## Assigning with `[[i]] <-` into a data.frame or an lm object would instead
## overwrite one of its columns / components.
Sample <- vector("list", Bootstrap_times)
Model <- vector("list", Bootstrap_times)
## Activate magicfor for the following for loop (see the magicfor
## documentation for how the values given to put() are collected).
magicfor::magic_for()
## seq_len() gives an empty sequence when Bootstrap_times is 0, unlike 1:0.
for(i in seq_len(Bootstrap_times)){
### Take a random sample of Sample_Size rows
Sample[[i]] <- mtcars[sample(nrow(mtcars), Sample_Size), ]
### Do the regression on the sample
Model[[i]] <- lm(formula = mpg ~ wt, data = Sample[[i]])
### Predict for the full dataset and hand the result to magicfor
put(predict(Model[[i]],newdata=mtcars))
}
## Retrieve the stored values as a data frame and print it.
tmp<-magicfor::magic_result_as_dataframe()
tmp
   i predict(Model[[i]],newdata=mtcars)
1  1 22.858806
2  2 20.922763
3  1 25.136504
4  2 18.341372
5  1 16.633098
6  2 16.481252
7  1 15.646096
8  2 18.531180
9  1 18.834873
10 2 16.633098
11 1 16.633098
12 2 11.849933
13 1 14.431324
14 2 14.051708
15 1 2.890988
16 2 1.569924
17 1 2.169717
18 2 26.047583
19 1 30.489093
20 2 28.818782
21 1 24.035616
22 2 16.025712
23 1 16.671060
24 2 13.596168
25 1 13.558206
26 2 28.059549
27 1 26.503122
28 2 31.263511
29 1 18.683026
30 2 21.719957
31 1 15.646096
32 2 21.644034
33 1 22.978374
34 2 21.584264
35 1 24.618503
36 2 19.725450
37 1 18.495353
38 2 18.386011
39 1 17.784630
40 2 19.862128
41 1 20.080812
42 2 18.495353
43 1 18.495353
44 2 15.051081
45 1 16.909894
46 2 16.636540
47 1 8.599905
48 2 7.648629
49 1 8.080530
50 2 25.274555
51 1 28.472808
52 2 27.270046
53 1 23.825774
54 2 18.057985
55 1 18.522689
56 2 16.308514
57 1 16.281178
58 2 26.723336
59 1 25.602581
60 2 29.030452
61 1 19.971470
62 2 22.158309
63 1 17.784630
64 2 22.103638
回答2:
My version:
# Repeatedly fit mpg ~ wt on random subsamples of mtcars and store the
# predictions for the full dataset, one row per run.
# load data
data(mtcars)
N <- nrow(mtcars)
# resampling parameters
sample_size <- 10
bootstrap_times <- 20
# pre-allocate the numeric result matrix:
# one row per run, one column per car (holding the predicted mpg)
res_mat <- matrix(NA_real_, nrow = bootstrap_times, ncol = N,
                  dimnames = list(NULL, rownames(mtcars)))
# run the resampling loop; seq_len() yields an empty sequence when
# bootstrap_times is 0, whereas seq(0) would iterate over c(1, 0)
for (i in seq_len(bootstrap_times)) {
  # draw sample_size distinct row indices (sampling without replacement)
  this_sample <- sample(N, sample_size, replace = FALSE)
  reg_result <- lm(mpg ~ wt, data = mtcars[this_sample, ])
  # predict for every car using the model fitted on the subsample
  res_mat[i, ] <- predict(reg_result, newdata = mtcars)
}
回答3:
Here is a tidyverse approach using nested data.frames:
library(tidyverse)
Bootstrap_times <- 2
Sample_Size <- 10
# One row per run (SampleID). Each run draws its own random sample of mtcars,
# fits mpg ~ wt on it, and predicts for the rows of that same sample
# (newdata = .y is the sampled data, not the full mtcars).
Predictions <- data.frame(SampleID = seq_len(Bootstrap_times)) %>%
  group_by(SampleID) %>%
  nest() %>%
  mutate(data = data %>% map(~mtcars[sample(nrow(mtcars), Sample_Size), ]),
         Model = data %>% map(~lm(formula = mpg ~ wt, data = .)),
         Predictions = map2(Model, data, ~predict(.x, newdata = .y))) %>%
  # drop the grouping so the result is a plain tibble
  ungroup() %>%
  select(SampleID, Predictions) %>%
  # name the list-column explicitly: unnest() without `cols` is deprecated
  unnest(cols = Predictions)
Result:
# A tibble: 20 x 2
SampleID Predictions
<int> <dbl>
1 1 22.7
2 1 16.2
3 1 19.7
4 1 21.5
5 1 18.7
6 1 17.4
7 1 23.3
8 1 10.7
9 1 18.8
10 1 19.8
11 2 11.4
12 2 19.6
13 2 11.7
14 2 18.1
15 2 21.1
16 2 18.6
17 2 16.2
18 2 23.5
19 2 19.7
20 2 20.7
The advantage of this method is that it is very easy to extract other information from the model (using broom) and combine as one single data.frame output:
library(broom)
# Same nested-tibble pattern, but the Model column holds the output of
# augment() for each fitted model instead of the model itself.
# Relies on Bootstrap_times, Sample_Size and the tidyverse functions being
# available from the previous snippet.
data.frame(SampleID = seq_len(Bootstrap_times)) %>%
  group_by(SampleID) %>%
  nest() %>%
  mutate(data = data %>% map(~mtcars[sample(nrow(mtcars), Sample_Size), ]),
         Model = data %>% map(~lm(formula = mpg ~ wt, data = .) %>% augment())) %>%
  # drop the grouping so the result is a plain tibble
  ungroup() %>%
  select(-data) %>%
  # name the list-column explicitly: unnest() without `cols` is deprecated
  unnest(cols = Model)
Result:
# A tibble: 20 x 11
SampleID .rownames mpg wt .fitted .se.fit .resid .hat .sigma .cooksd .std.resid
<int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 Dodge Challenger 15.5 3.52 17.2 0.689 -1.72 0.106 2.15 0.0442 -0.862
2 1 Datsun 710 22.8 2.32 23.5 0.940 -0.655 0.198 2.24 0.0148 -0.346
3 1 Cadillac Fleetwood 10.4 5.25 8.24 1.52 2.16 0.515 1.93 1.15 1.47
4 1 Merc 450SE 16.4 4.07 14.4 0.863 2.04 0.167 2.10 0.112 1.06
5 1 Ford Pantera L 15.8 3.17 19.0 0.672 -3.24 0.101 1.85 0.147 -1.62
6 1 Lotus Europa 30.4 1.51 27.6 1.39 2.75 0.432 1.79 1.14 1.73
7 1 Volvo 142E 21.4 2.78 21.1 0.751 0.334 0.126 2.26 0.00207 0.169
8 1 Merc 280C 17.8 3.44 17.6 0.678 0.163 0.103 2.26 0.000378 0.0812
9 1 Mazda RX4 Wag 21 2.88 20.6 0.724 0.428 0.117 2.25 0.00308 0.215
10 1 Camaro Z28 13.3 3.84 15.6 0.773 -2.26 0.134 2.06 0.102 -1.15
11 2 Merc 280 19.2 3.44 19.7 1.09 -0.470 0.108 3.53 0.00138 -0.151
12 2 Toyota Corolla 33.9 1.84 28.2 1.65 5.66 0.251 2.52 0.658 1.98
13 2 Hornet Sportabout 18.7 3.44 19.7 1.09 -0.970 0.108 3.51 0.00588 -0.311
14 2 Mazda RX4 Wag 21 2.88 22.7 1.07 -1.69 0.106 3.47 0.0173 -0.540
15 2 Chrysler Imperial 14.7 5.34 9.50 2.42 5.20 0.539 2.02 3.15 2.32
16 2 Camaro Z28 13.3 3.84 17.5 1.26 -4.23 0.145 3.08 0.163 -1.39
17 2 Valiant 18.1 3.46 19.6 1.09 -1.46 0.110 3.48 0.0136 -0.469
18 2 Porsche 914-2 26 2.14 26.6 1.43 -0.611 0.188 3.52 0.00490 -0.205
19 2 Merc 280C 17.8 3.44 19.7 1.09 -1.87 0.108 3.45 0.0219 -0.600
20 2 Lotus Europa 30.4 1.51 30.0 1.91 0.441 0.335 3.52 0.00677 0.164
Note:
Using this method, you don't even need the prediction step (unless you are using new data), since you have the .fitted values from augment.
The predictions are different between the first and second output because no seed was set.
来源:https://stackoverflow.com/questions/51429205/place-results-of-predict-in-a-for-loop-inside-a-list