Jingyi Jessica Li
June 9, 2020
Use R Markdown
library(glmnet)
set.seed(1010)
n = 1000
p = 100
nzc = trunc(p/10)
x = matrix(rnorm(n * p), n, p)
beta = rnorm(nzc)
fx = x[, seq(nzc)] %*% beta
px = exp(fx)
px = px/(1 + px)
y = rbinom(n = length(px), prob = px, size = 1)
# use cross validation to choose lambda in penalized logistic reg.
# first time
cv.glmnet(x, y, family = "binomial")$lambda.min
[1] 0.0102121
# second time
cv.glmnet(x, y, family = "binomial")$lambda.min
[1] 0.009304889
# same seed; use the same data
set.seed(1010)
# first time
cv.glmnet(x, y, family = "binomial")$lambda.min
[1] 0.008478269
# second time
cv.glmnet(x, y, family = "binomial")$lambda.min
[1] 0.009304889
Never use the result produced by a local code chunk in terminal.
Run the script from the beginning in a clean R workspace.
set.seed(1010)
n = 1000
p = 100
nzc = trunc(p/10)
x = matrix(rnorm(n * p), n, p)
beta = rnorm(nzc)
fx = x[, seq(nzc)] %*% beta
px = exp(fx)
px = px/(1 + px)
y = rbinom(n = length(px), prob = px, size = 1)
# use cross validation to choose lambda in penalized logistic reg.
# first time
cv.glmnet(x, y, family = "binomial")$lambda.min
[1] 0.0102121
# number of folds
K = 10
# divide data into K folds
set.seed(1010)
foldid = sample(1:K, n, replace=T)
# you may save the foldid to avoid the unforseen difference between operating systems
## saveRDS(foldid, file="foldid.rds")
## foldid <- readRDS(file="foldid.rds")
# first time
cv.glmnet(x, y, family = "binomial", foldid=foldid)$lambda.min
[1] 0.009304889
# second time
cv.glmnet(x, y, family = "binomial", foldid=foldid)$lambda.min
[1] 0.009304889
library(randomForest)
library(PRROC)
# split data into two halves for training and testing
x_train = x[1:(n/2),]
x_test = x[(n/2+1):n,]
y_train = as.factor(y[1:(n/2)]) #required by randomForest
y_test = y[(n/2+1):n] #pr.curve requires the labels to be numeric
# first time
rf_model <- randomForest(x=x_train, y=y_train, xtest=x_test)
rf_pred <- rf_model$test$votes[,2]
pr.curve(scores.class0=rf_pred, weights.class0=y_test)$auc.integral
[1] 0.8389693
# second time
rf_model <- randomForest(x=x_train, y=y_train, xtest=x_test)
rf_pred <- rf_model$test$votes[,2]
pr.curve(scores.class0=rf_pred, weights.class0=y_test)$auc.integral
[1] 0.84573
If a function has internal randomness (which requires understanding of the algorithm) and there is no way to make it deterministic, set a random seed right before running the function.
# first time
set.seed(1010)
rf_model <- randomForest(x=x_train, y=y_train, xtest=x_test)
rf_pred <- rf_model$test$votes[,2]
pr.curve(scores.class0=rf_pred, weights.class0=y_test)$auc.integral
[1] 0.8456932
# second time
set.seed(1010)
rf_model <- randomForest(x=x_train, y=y_train, xtest=x_test)
rf_pred <- rf_model$test$votes[,2]
pr.curve(scores.class0=rf_pred, weights.class0=y_test)$auc.integral
[1] 0.8456932
library(parallel)
# first time
set.seed(1010)
mclapply(1:2, FUN=function(i) sample(1:5), mc.cores=2)
[[1]]
[1] 2 3 4 1 5
[[2]]
[1] 4 2 1 5 3
# second time
set.seed(1010)
mclapply(1:2, FUN=function(i) sample(1:5), mc.cores=2)
[[1]]
[1] 4 1 2 5 3
[[2]]
[1] 1 3 5 4 2
# first time
mclapply(1:2, FUN=function(i) {
set.seed(1010); sample(1:5)
}, mc.cores=2)
[[1]]
[1] 4 5 1 2 3
[[2]]
[1] 4 5 1 2 3
# second time
mclapply(1:2, FUN=function(i) {
set.seed(1010); sample(1:5)
}, mc.cores=2)
[[1]]
[1] 4 5 1 2 3
[[2]]
[1] 4 5 1 2 3
RNGkind("L'Ecuyer-CMRG")
# first time
set.seed(1010)
mclapply(1:2, FUN=function(i) sample(1:5), mc.cores=2)
[[1]]
[1] 1 2 4 5 3
[[2]]
[1] 4 3 2 1 5
# second time
set.seed(1010)
mclapply(1:2, FUN=function(i) sample(1:5), mc.cores=2)
[[1]]
[1] 1 2 4 5 3
[[2]]
[1] 4 3 2 1 5
# obtain the random number generator information
RNGkind()
[1] "L'Ecuyer-CMRG" "Inversion" "Rejection"
# first time
set.seed(1010)
sample(1:5)
[1] 1 2 4 3 5
# second time
set.seed(1010)
sample(1:5)
[1] 1 2 4 3 5
RNGkind("Mersenne-Twister")
# first time
set.seed(1010)
sample(1:5)
[1] 4 5 1 2 3
# first time
set.seed(1010)
sample(1:5)
[1] 4 5 1 2 3
If you use parallel computing, be more careful about the reproducibility.
For code chunks that take long time to run on a laptop, set the code chunk option eval=FALSE and save the results to .rdata files
load("intermediate.rdata")
http://adv-r.had.co.nz/Reproducibility.html
Include the output of sessionInfo() as a comment. This summarises your R environment and makes it easy to check if you are using an out-of-date package.
sessionInfo()
R version 3.6.1 (2019-07-05)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS Catalina 10.15.5
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] parallel stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] PRROC_1.3.1 randomForest_4.6-14 glmnet_4.0
[4] Matrix_1.2-17 knitr_1.28
loaded via a namespace (and not attached):
[1] lattice_0.20-38 codetools_0.2-16 foreach_1.5.0 grid_3.6.1
[5] magrittr_1.5 evaluate_0.14 stringi_1.4.3 iterators_1.0.12
[9] tools_3.6.1 stringr_1.4.0 xfun_0.14 compiler_3.6.1
[13] shape_1.4.4