# '#' is the comment character in R; lines beginning with '#' below are
# either comments or output from R; lines not beginning with '#' are R commands

# Windows users: i suggest that, at the beginning of each R session,
# you unbuffer the output by unchecking the 'Buffered output' option
# in the 'Misc' pull-down menu

s <- 72
n <- 400

print( theta.hat <- s / n )

# [1] 0.18

# read up on the str (structure) command

str( theta.hat )

# num 0.18

print( se.hat.theta.hat <- sqrt( theta.hat * ( 1 - theta.hat ) / n ) )

# [1] 0.01920937

# simulate the repeated-sampling distribution of the sample mean of
# n IID Bernoulli( theta ) draws, repeated M times with a reproducible
# seed; returns a numeric vector of length M holding the M sample means

clt.simulation <- function( theta, n, M, seed ) {

  set.seed( seed )

  # preallocate the result vector rather than growing it in the loop

  y.bar.star <- rep( NA, M )

  # seq_len( M ) is safer than 1:M (it yields an empty loop when M = 0)

  for ( i in seq_len( M ) ) {

    y.star <- rbinom( n, 1, theta )

    y.bar.star[ i ] <- mean( y.star )

  }

  return( y.bar.star )

}

seed <- 1
M <- 100000

system.time( y.bar.star.1 <- clt.simulation( theta.hat, n, M, seed ) )

#    user  system elapsed
#    2.34    0.00    2.34

str( y.bar.star.1 )

# num [1:100000] 0.168 0.198 0.172 0.205 0.18 ...

c( mean( y.bar.star.1 ), sd( y.bar.star.1 ) )

# [1] 0.18007560 0.01919307

c( theta.hat, se.hat.theta.hat )

# [1] 0.18000000 0.01920937

c( mean( y.bar.star.1 ), sd( y.bar.star.1 ) ) -
  c( theta.hat, se.hat.theta.hat )

# [1] 7.560000e-05 -1.630187e-05

# you can see that the Monte Carlo (simulation) noise
# is quite small with M = 100,000: on the order of 0.00001 to 0.0001

# prob = TRUE (not the reassignable shorthand T) puts the histogram
# on the density scale so the normal curve can be overlaid

hist( y.bar.star.1, main = 'n = 400, theta = 0.18', prob = TRUE,
  xlab = 'y bar star' )

y.bar.star.grid <- seq( 0.1, 0.25, length = 500 )

lines( y.bar.star.grid, dnorm( y.bar.star.grid, theta.hat,
  se.hat.theta.hat ), lwd = 2, lty = 2, col = 'red' )

# clearly the CLT is working for us with n = 400
# IID draws from a binary population with success probability
# around 0.18

qqnorm( y.bar.star.1 )

# read about normal quantile-quantile (qq) plots on the web

# a qq plot of a sample against the N( 0, 1 ) quantiles should fall
# close to the line with intercept mean and slope SD if the sample
# is approximately normal

abline( mean( y.bar.star.1 ), sd( y.bar.star.1 ), lwd = 2, col = 'red' )

# this is an even clearer demonstration that
# the repeated-sampling distribution of the mean
# of n = 400 IID Bernoulli( theta ) random draws
# with theta around 0.18
# is extremely close to normal

# optional stuff below this line, for serious R programmers

# vectorized alternative: draw all M * n Bernoulli variables in one
# call, arrange them as an ( M x n ) matrix, and take the mean of
# each row; NOTE(review): rowMeans( ) would be faster than
# apply( ..., 1, mean ), but apply is kept here because the timing
# comparison below is the point of this section

possibly.faster.clt.simulation <- function( theta, n, M, seed ) {

  set.seed( seed )

  return( apply( matrix( rbinom( M * n, 1, theta ), M, n ), 1, mean ) )

}

seed <- 1
M <- 100000

system.time( y.bar.star.2 <- possibly.faster.clt.simulation( theta.hat,
  n, M, seed ) )

#    user  system elapsed
#    2.65    0.08    2.73

# so not faster after all, at least with M = 100,000

hist( y.bar.star.2, main = 'n = 400, theta = 0.18', prob = TRUE,
  xlab = 'y bar star' )

# bug fix: the original qqplot call was missing the comma
# between the xlab and ylab arguments

qqplot( y.bar.star.1, y.bar.star.2, xlab = 'y bar star 1',
  ylab = 'y bar star 2' )

abline( 0, 1, lwd = 2, col = 'red' )

M <- 1000000

system.time( y.bar.star.3 <- clt.simulation( theta.hat, n, M, seed ) )

#    user  system elapsed
#   23.25    0.02   23.26

system.time( y.bar.star.4 <- possibly.faster.clt.simulation( theta.hat,
  n, M, seed ) )

#    user  system elapsed
#   27.28    0.33   27.62

# the second CLT simulation function is 'more elegant'
# but is actually slower for large M

hist( y.bar.star.3, main = 'n = 400, theta = 0.18', prob = TRUE,
  xlab = 'y bar star' )

lines( y.bar.star.grid, dnorm( y.bar.star.grid, theta.hat,
  se.hat.theta.hat ), lwd = 2, lty = 2, col = 'red' )

hist( y.bar.star.4, main = 'n = 400, theta = 0.18', prob = TRUE,
  xlab = 'y bar star' )

lines( y.bar.star.grid, dnorm( y.bar.star.grid, theta.hat,
  se.hat.theta.hat ), lwd = 2, lty = 2, col = 'red' )

qqplot( y.bar.star.3, y.bar.star.4, xlab = 'y bar star 3',
  ylab = 'y bar star 4' )

abline( 0, 1, lwd = 2, col = 'red' )

#########################################################################

# large-sample (normal-theory) 100 * ( 1 - alpha ) % confidence
# interval for theta: theta.hat +/- z * SE, with z the standard
# normal quantile at 1 - alpha / 2

alpha <- 0.05

qnorm( 1 - alpha / 2 )

# [1] 1.959964

print( confidence.interval <- c( theta.hat - qnorm( 1 - alpha / 2 ) *
  se.hat.theta.hat, theta.hat + qnorm( 1 - alpha / 2 ) *
  se.hat.theta.hat ) )

# [1] 0.1423503 0.2176497