# '#' is the comment character in R; lines beginning with '#' below are
# either comments or output from R; lines not beginning with '#' are R commands

# Windows users: i suggest that, at the beginning of each R session,
# you unbuffer the output by unchecking the 'Buffered output' option
# in the 'Misc' pull-down menu

s <- 72
n <- 400

print( theta.hat <- s / n )

# [1] 0.18

# read up on the str (structure) command

str( theta.hat )

# num 0.18

print( se.hat.theta.hat <- sqrt( theta.hat * ( 1 - theta.hat ) / n ) )

# [1] 0.01920937

# simulate the repeated-sampling distribution of the sample mean of
# n IID Bernoulli( theta ) draws, repeated M times with a reproducible
# seed; returns a numeric vector of length M holding the M sample means

clt.simulation <- function( theta, n, M, seed ) {

  set.seed( seed )

  # preallocate the result vector rather than growing it in the loop

  y.bar.star <- rep( NA, M )

  # seq_len( M ) is safer than 1:M (it yields an empty loop when M = 0)

  for ( i in seq_len( M ) ) {

    y.star <- rbinom( n, 1, theta )

    y.bar.star[ i ] <- mean( y.star )

  }

  return( y.bar.star )

}

seed <- 1
M <- 100000

system.time( y.bar.star.1 <- clt.simulation( theta.hat, n, M, seed ) )

#    user  system elapsed
#    2.34    0.00    2.34

str( y.bar.star.1 )

# num [1:100000] 0.168 0.198 0.172 0.205 0.18 ...

c( mean( y.bar.star.1 ), sd( y.bar.star.1 ) )

# [1] 0.18007560 0.01919307

c( theta.hat, se.hat.theta.hat )

# [1] 0.18000000 0.01920937

c( mean( y.bar.star.1 ), sd( y.bar.star.1 ) ) -
  c( theta.hat, se.hat.theta.hat )

# [1] 7.560000e-05 -1.630187e-05

# you can see that the Monte Carlo (simulation) noise
# is quite small with M = 100,000: on the order of 0.00001 to 0.0001

# prob = TRUE (not the reassignable shorthand T) puts the histogram
# on the density scale so the normal curve can be overlaid

hist( y.bar.star.1, main = 'n = 400, theta = 0.18', prob = TRUE,
  xlab = 'y bar star' )

y.bar.star.grid <- seq( 0.1, 0.25, length = 500 )

lines( y.bar.star.grid, dnorm( y.bar.star.grid, theta.hat,
  se.hat.theta.hat ), lwd = 2, lty = 2, col = 'red' )

# clearly the CLT is working for us with n = 400
# IID draws from a binary population with success probability
# around 0.18

qqnorm( y.bar.star.1 )

# read about normal quantile-quantile (qq) plots on the web

# a qq plot of a sample against the N( 0, 1 ) quantiles should fall
# close to the line with intercept mean and slope SD if the sample
# is approximately normal

abline( mean( y.bar.star.1 ), sd( y.bar.star.1 ), lwd = 2, col = 'red' )

# this is an even clearer demonstration that
# the repeated-sampling distribution of the mean
# of n = 400 IID Bernoulli( theta ) random draws
# with theta around 0.18
# is extremely close to normal

# optional stuff below this line, for serious R programmers

# vectorized alternative: draw all M * n Bernoulli variables in one
# call, arrange them as an ( M x n ) matrix, and take the mean of
# each row; NOTE(review): rowMeans( ) would be faster than
# apply( ..., 1, mean ), but apply is kept here because the timing
# comparison below is the point of this section

possibly.faster.clt.simulation <- function( theta, n, M, seed ) {

  set.seed( seed )

  return( apply( matrix( rbinom( M * n, 1, theta ), M, n ), 1, mean ) )

}

seed <- 1
M <- 100000

system.time( y.bar.star.2 <- possibly.faster.clt.simulation( theta.hat,
  n, M, seed ) )

#    user  system elapsed
#    2.65    0.08    2.73

# so not faster after all, at least with M = 100,000

hist( y.bar.star.2, main = 'n = 400, theta = 0.18', prob = TRUE,
  xlab = 'y bar star' )

# bug fix: the original qqplot call was missing the comma
# between the xlab and ylab arguments

qqplot( y.bar.star.1, y.bar.star.2, xlab = 'y bar star 1',
  ylab = 'y bar star 2' )

abline( 0, 1, lwd = 2, col = 'red' )

M <- 1000000

system.time( y.bar.star.3 <- clt.simulation( theta.hat, n, M, seed ) )

#    user  system elapsed
#   23.25    0.02   23.26

system.time( y.bar.star.4 <- possibly.faster.clt.simulation( theta.hat,
  n, M, seed ) )

#    user  system elapsed
#   27.28    0.33   27.62

# the second CLT simulation function is 'more elegant'
# but is actually slower for large M

hist( y.bar.star.3, main = 'n = 400, theta = 0.18', prob = TRUE,
  xlab = 'y bar star' )

lines( y.bar.star.grid, dnorm( y.bar.star.grid, theta.hat,
  se.hat.theta.hat ), lwd = 2, lty = 2, col = 'red' )

hist( y.bar.star.4, main = 'n = 400, theta = 0.18', prob = TRUE,
  xlab = 'y bar star' )

lines( y.bar.star.grid, dnorm( y.bar.star.grid, theta.hat,
  se.hat.theta.hat ), lwd = 2, lty = 2, col = 'red' )

qqplot( y.bar.star.3, y.bar.star.4, xlab = 'y bar star 3',
  ylab = 'y bar star 4' )

abline( 0, 1, lwd = 2, col = 'red' )

#########################################################################

# large-sample (normal-theory) 100 * ( 1 - alpha ) % confidence
# interval for theta: theta.hat +/- z * SE, with z the standard
# normal quantile at 1 - alpha / 2

alpha <- 0.05

qnorm( 1 - alpha / 2 )

# [1] 1.959964

print( confidence.interval <- c( theta.hat - qnorm( 1 - alpha / 2 ) *
  se.hat.theta.hat, theta.hat + qnorm( 1 - alpha / 2 ) *
  se.hat.theta.hat ) )

# [1] 0.1423503 0.2176497