# R code for a partial analysis of the mushroom data set

################## the first block of code starts here ##################

# unbuffer the output

# change directory to the location to which you downloaded
# the file 'mushroom-data.txt'

raw.mushroom.data <- read.csv( "mushroom-data.txt", header = T )

str( raw.mushroom.data )

# 'data.frame':   8124 obs. of  23 variables:

#  $ poisonous               : Factor w/ 2 levels "e","p": 2 1 1 2 1 1 1 1 2 1 ...
#  $ cap.shape               : Factor w/ 6 levels "b","c","f","k",..: 6 6 1 6 6 6 1 1 6 1 ...
#  $ cap.surface             : Factor w/ 4 levels "f","g","s","y": 3 3 3 4 3 4 3 4 4 3 ...
#  $ cap.color               : Factor w/ 10 levels "b","c","e","g",..: 5 10 9 9 4 10 9 9 9 10 ...
#  $ bruises                 : Factor w/ 2 levels "f","t": 2 2 2 2 1 2 2 2 2 2 ...
#  $ odor                    : Factor w/ 9 levels "a","c","f","l",..: 7 1 4 7 6 1 1 4 7 1 ...
#  $ gill.atachment          : Factor w/ 2 levels "a","f": 2 2 2 2 2 2 2 2 2 2 ...
#  $ gill.spacing            : Factor w/ 2 levels "c","w": 1 1 1 1 2 1 1 1 1 1 ...
#  $ gill.size               : Factor w/ 2 levels "b","n": 2 1 1 2 1 1 1 1 2 1 ...
#  $ gill.color              : Factor w/ 12 levels "b","e","g","h",..: 5 5 6 6 5 6 3 6 8 3 ...
#  $ stalk.shape             : Factor w/ 2 levels "e","t": 1 1 1 1 2 1 1 1 1 1 ...
#  $ stalk.root              : Factor w/ 5 levels "?","b","c","e",..: 4 3 3 4 4 3 3 3 4 3 ...
#  $ stalk.surface.above.ring: Factor w/ 4 levels "f","k","s","y": 3 3 3 3 3 3 3 3 3 3 ...
#  $ stalk.surface.below.ring: Factor w/ 4 levels "f","k","s","y": 3 3 3 3 3 3 3 3 3 3 ...
#  $ stalk.color.above.ring  : Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 8 8 8 8 8 8 ...
#  $ stalk.color.below.ring  : Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 8 8 8 8 8 8 ...
#  $ veil.type               : Factor w/ 1 level "p": 1 1 1 1 1 1 1 1 1 1 ...
#  $ veil.color              : Factor w/ 4 levels "n","o","w","y": 3 3 3 3 3 3 3 3 3 3 ...
#  $ ring.number             : Factor w/ 3 levels "n","o","t": 2 2 2 2 2 2 2 2 2 2 ...
#  $ ring.type               : Factor w/ 5 levels "e","f","l","n",..: 5 5 5 5 1 5 5 5 5 5 ...
#  $ spore.print.color       : Factor w/ 9 levels "b","h","k","n",..: 3 4 4 3 4 3 3 4 3 3 ...
#  $ population              : Factor w/ 6 levels "a","c","n","s",..: 4 3 3 4 1 3 3 4 5 4 ...
#  $ habitat                 : Factor w/ 7 levels "d","g","l","m",..: 6 2 4 6 2 2 4 4 2 4 ...

print( n <- dim( raw.mushroom.data )[ 1 ] )

# [1] 8124          the sample size in this data set

# the next command converts the categorical variable 'poisonous'
# into a quantitative dichotomous binary variable, in which 1 = poisonous
# and 0 = edible

raw.mushroom.data$poisonous <- ifelse( raw.mushroom.data$poisonous == 'p',
  1, 0 )

# in the table arising from the str(  ) command above, notice that
# the variable 'veil.type' is constant (it's a factor with only 1 level);
# such variables obviously have no predictive power, so the next command
# drops this variable from the data set 

raw.mushroom.data <- subset( raw.mushroom.data, select = - veil.type )

str( raw.mushroom.data )

# 'data.frame':   8124 obs. of  22 variables:

#  $ poisonous               : Factor w/ 2 levels "e","p": 2 1 1 2 1 1 1 1 2 1 ...
#  $ cap.shape               : Factor w/ 6 levels "b","c","f","k",..: 6 6 1 6 6 6 1 1 6 1 ...
#  $ cap.surface             : Factor w/ 4 levels "f","g","s","y": 3 3 3 4 3 4 3 4 4 3 ...
#  $ cap.color               : Factor w/ 10 levels "b","c","e","g",..: 5 10 9 9 4 10 9 9 9 10 ...
#  $ bruises                 : Factor w/ 2 levels "f","t": 2 2 2 2 1 2 2 2 2 2 ...
#  $ odor                    : Factor w/ 9 levels "a","c","f","l",..: 7 1 4 7 6 1 1 4 7 1 ...
#  $ gill.atachment          : Factor w/ 2 levels "a","f": 2 2 2 2 2 2 2 2 2 2 ...
#  $ gill.spacing            : Factor w/ 2 levels "c","w": 1 1 1 1 2 1 1 1 1 1 ...
#  $ gill.size               : Factor w/ 2 levels "b","n": 2 1 1 2 1 1 1 1 2 1 ...
#  $ gill.color              : Factor w/ 12 levels "b","e","g","h",..: 5 5 6 6 5 6 3 6 8 3 ...
#  $ stalk.shape             : Factor w/ 2 levels "e","t": 1 1 1 1 2 1 1 1 1 1 ...
#  $ stalk.root              : Factor w/ 5 levels "?","b","c","e",..: 4 3 3 4 4 3 3 3 4 3 ...
#  $ stalk.surface.above.ring: Factor w/ 4 levels "f","k","s","y": 3 3 3 3 3 3 3 3 3 3 ...
#  $ stalk.surface.below.ring: Factor w/ 4 levels "f","k","s","y": 3 3 3 3 3 3 3 3 3 3 ...
#  $ stalk.color.above.ring  : Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 8 8 8 8 8 8 ...
#  $ stalk.color.below.ring  : Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 8 8 8 8 8 8 ...
#  $ veil.color              : Factor w/ 4 levels "n","o","w","y": 3 3 3 3 3 3 3 3 3 3 ...
#  $ ring.number             : Factor w/ 3 levels "n","o","t": 2 2 2 2 2 2 2 2 2 2 ...
#  $ ring.type               : Factor w/ 5 levels "e","f","l","n",..: 5 5 5 5 1 5 5 5 5 5 ...
#  $ spore.print.color       : Factor w/ 9 levels "b","h","k","n",..: 3 4 4 3 4 3 3 4 3 3 ...
#  $ population              : Factor w/ 6 levels "a","c","n","s",..: 4 3 3 4 1 3 3 4 5 4 ...
#  $ habitat                 : Factor w/ 7 levels "d","g","l","m",..: 6 2 4 6 2 2 4 4 2 4 ...

with( raw.mushroom.data, mean( poisonous ) )

# [1] 0.4820286

# the proportion of poisonous mushrooms in the data set is about 48%;
# prediction of a dichotomous (binary) outcome is easiest when the data set
# contains 50% yes/1/poisonous and 50% no/0/edible, so this
# is close to optimal

# the next function enables us to examine the relationship
# between each of the predictor variables (one at a time)
# and the dichotomous outcome 

tab.sum <- function( x1, y ) {

  # written by Daniel Helkey, edited by David Draper

  # summarize outcome variable y by levels of the categorical variable x1

  # make sure x and y are same length, etc...

  stopifnot( length( x1 ) == length( y ) )

  # two helper functions

  summaryFun <- function( x ) {

    return( list( n = length( x ), mean = mean( x, na.rm = TRUE ), 
      sd = sd( x, na.rm = TRUE ) ) )

  }

  mapFun <- function( level ) {

    indices <- ( x1 == level )

    summaryFun( y[ indices ] )

  }
  
  levels <- sort( unique( x1 ) )

  out_mat <- do.call( rbind, Map( mapFun, levels ) )

  out_mat <- cbind( levels, out_mat )

  out_mat <- rbind( out_mat, c( 'Total', summaryFun( y ) ) )

  colnames( out_mat )[ 1 ] <- as.character( substitute( x1 ) )

  return( out_mat )

}

# the next command computes the relative frequency distribution
# of the variable 'cap.shape'

with( raw.mushroom.data, 
  signif( table( cap.shape, useNA = 'always' ) / n, 4 ) )

# cap.shape
#         b         c         f         k         s         x      <NA> 
# 0.0556400 0.0004924 0.3880000 0.1019000 0.0039390 0.4500000 0.0000000

# consulting the text file containing contextual information
# about this data set, you can see that most of the mushrooms
# have a cap shape that's convex (category 'x'), followed by
# flat ('f')

# you can also see that this variable has no missing values ('NA')

with( raw.mushroom.data, tab.sum( cap.shape, poisonous ) )

#      cap.shape n    mean      sd       

# [1,] 1         452  0.1061947 0.308428 
# [2,] 2         4    1         0        
# [3,] 3         3152 0.4936548 0.5000391
# [4,] 4         828  0.7246377 0.4469667
# [5,] 5         32   0         0        
# [6,] 6         3656 0.4671772 0.4989898

# [7,] "Total"   8124 0.4820286 0.4997077

# the table above this line summarizes the relative frequency 
# of poisonous mushrooms for each cap shape

# if all of the cap shape categories had the same proportion
# of poisonous mushrooms (namely, the overall poisonous rate of 48%),
# that would tell us that cap.shape has *no* predictive usefulness

# instead we see that there's a bit of signal here: for example,
# 828 of the mushrooms (the ones with cap.shape category 4,
# which the relative-frequency table above shows is 'k' and
# the context file says is knobbed) have a poisonous rate of about
# 72%, which is far above average, and 452 of the mushrooms
# category 1 ('b': bell) have a poisonous rate of only 10.6%,
# which is well below average

# so cap.shape looks like a decent candidate to put into
# our predictive models

################## the first block of code ends here ##################

################ the second block of code starts here #################

# the next command fits a linear regression model
# in which the binary outcome variable 'poisonous'
# is regressed on the categorical predictor 'cap.shape',
# and then summarizes the fit

summary( linear.model.1 <- lm( poisonous ~ cap.shape, 
  data = raw.mushroom.data ) )

# Call:
# lm(formula = poisonous ~ cap.shape, data = raw.mushroom.data)

# Residuals:
#     Min      1Q  Median      3Q     Max 
# -0.7246 -0.4672 -0.1062  0.5063  0.8938 

# Coefficients:
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)  0.10619    0.02279   4.659 3.22e-06 ***
# cap.shapec   0.89381    0.24335   3.673 0.000241 ***
# cap.shapef   0.38746    0.02437  15.898  < 2e-16 ***
# cap.shapek   0.61844    0.02834  21.824  < 2e-16 ***
# cap.shapes  -0.10619    0.08864  -1.198 0.230926    
# cap.shapex   0.36098    0.02416  14.942  < 2e-16 ***
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# Residual standard error: 0.4846 on 8118 degrees of freedom
# Multiple R-squared:  0.06031,   Adjusted R-squared:  0.05973 
# F-statistic: 104.2 on 5 and 8118 DF,  p-value: < 2.2e-16

# this factor (x.1 = cap.shape) has k.1 = 6 levels; R has created 
# ( k.1 - 1 ) = 5 temporary dummy variables called cap.shapec,
# cap.shapef, ..., cap.shapex corresponding to the levels 'c',
# 'f', ..., 'x' of x.1 and included these dummies as predictors

# this means that it has chosen the omitted group to be 
# the first (in alphabetical order) level 'b' of x.1 = cap.shape;
# R always does this (i.e., it always omits the dummy variable
# corresponding to the alphabetically first level of a factor)

# the next command creates a vector of p.hat values from linear.model.1

p.hat.linear.model.1 <- predict( linear.model.1, type = 'response' )

# the next four commands make a diagnostic plot from the p.hat values

par( mfrow = c( 2, 1 ) )

hist( p.hat.linear.model.1[ raw.mushroom.data$poisonous == 0 ],
  nclass = 100, main = 'y = 0', xlab = 'p.hat', prob = T,
  xlim = c( 0, 1 ) )

hist( p.hat.linear.model.1[ raw.mushroom.data$poisonous == 1 ],
  nclass = 100, main = 'y = 1', xlab = 'p.hat', prob = T,
  xlim = c( 0, 1 ) )

par( mfrow = c( 1, 1 ) )

# the next command computes a diagnostic i call the
#  predictive separation index (psi), the last number
# in the vector of three numbers

c( mean( p.hat.linear.model.1[ raw.mushroom.data$poisonous == 0 ] ),
  mean( p.hat.linear.model.1[ raw.mushroom.data$poisonous == 1 ] ),
  mean( p.hat.linear.model.1[ raw.mushroom.data$poisonous == 1 ] ) -
  mean( p.hat.linear.model.1[ raw.mushroom.data$poisonous == 0 ] ) )

# [1] 0.45295970 0.51326496 0.06030526

################ the second block of code ends here #################

################ the third block of code begins here ################

univariate.exploration <- function( new.variable ) {

  print( with( raw.mushroom.data, 
    signif( table( new.variable, useNA = 'always' ) / n, 4 ) ) )

  print( with( raw.mushroom.data, tab.sum( new.variable, poisonous ) ) )

  print( summary( temp.linear.model <- lm( poisonous ~ new.variable, 
    data = raw.mushroom.data ) ) )

  p.hat.temp.linear.model <- predict( temp.linear.model, type = 'response' )

  par( mfrow = c( 2, 1 ) )

  hist( p.hat.temp.linear.model[ raw.mushroom.data$poisonous == 0 ],
    nclass = 100, main = 'y = 0', xlab = 'p.hat', prob = T,
    xlim = c( 0, 1 ) )

  hist( p.hat.temp.linear.model[ raw.mushroom.data$poisonous == 1 ],
    nclass = 100, main = 'y = 1', xlab = 'p.hat', prob = T,
    xlim = c( 0, 1 ) )

  par( mfrow = c( 1, 1 ) )

  print( c( 
    mean( p.hat.temp.linear.model[ raw.mushroom.data$poisonous == 0 ] ),
    mean( p.hat.temp.linear.model[ raw.mushroom.data$poisonous == 1 ] ),
    mean( p.hat.temp.linear.model[ raw.mushroom.data$poisonous == 1 ] ) -
    mean( p.hat.temp.linear.model[ raw.mushroom.data$poisonous == 0 ] ) ) )

  return( )

}

univariate.exploration( raw.mushroom.data$cap.surface )

# new.variable
#         f         g         s         y      <NA> 
# 0.2856000 0.0004924 0.3146000 0.3993000 0.0000000 
#      new.variable n    mean      sd       
# [1,] 1            2320 0.3275862 0.4694342
# [2,] 2            4    1         0        
# [3,] 3            2556 0.5524257 0.4973413
# [4,] 4            3244 0.5363748 0.498752 
# [5,] "Total"      8124 0.4820286 0.4997077

# Call:
# lm(formula = poisonous ~ new.variable, data = raw.mushroom.data)

# Residuals:
#     Min      1Q  Median      3Q     Max 
# -0.5524 -0.5364 -0.3276  0.4636  0.6724 

# Coefficients:
#               Estimate Std. Error t value Pr(>|t|)    
# (Intercept)    0.32759    0.01017  32.200  < 2e-16 ***
# new.variableg  0.67241    0.24522   2.742  0.00612 ** 
# new.variables  0.22484    0.01405  16.001  < 2e-16 ***
# new.variabley  0.20879    0.01332  15.671  < 2e-16 ***
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# Residual standard error: 0.49 on 8120 degrees of freedom
# Multiple R-squared:  0.03878,   Adjusted R-squared:  0.03842 
# F-statistic: 109.2 on 3 and 8120 DF,  p-value: < 2.2e-16

# [1] 0.46333584 0.50211511 0.03877928
# NULL

# to call this function, just replace 

#   cap.surface 

# in

#   univariate.exploration( raw.mushroom.data$cap.surface )

# with the name of the predictor variable you want to explore
# (e.g., bruises, ...)

################ the third block of code ends here #################

############## the fourth block of code begins here ################

# the next command fits a linear model using all predictors

summary( linear.model.all.predictors <- lm( poisonous ~ ., 
  data = raw.mushroom.data ) )

# Call:
# lm(formula = poisonous ~ ., data = raw.mushroom.data)

# Residuals:
#        Min         1Q     Median         3Q        Max 
# -1.863e-12 -9.900e-15 -7.000e-16  7.800e-15  2.636e-11 

# Coefficients: (10 not defined because of singularities)
#                             Estimate Std. Error    t value Pr(>|t|)    
# (Intercept)                4.509e-13  2.843e-13  1.586e+00 0.112793    
# cap.shapec                 4.351e-13  1.898e-13  2.293e+00 0.021893 *  
# cap.shapef                 3.121e-14  2.157e-14  1.446e+00 0.148077    
# cap.shapek                 3.524e-13  2.345e-14  1.503e+01  < 2e-16 ***
# cap.shapes                -1.149e-14  7.394e-14 -1.550e-01 0.876466    
# cap.shapex                 2.952e-13  2.067e-14  1.428e+01  < 2e-16 ***
# cap.surfaceg              -3.231e-13  2.320e-13 -1.393e+00 0.163785    
# cap.surfaces              -2.751e-14  1.235e-14 -2.228e+00 0.025937 *  
# cap.surfacey              -3.800e-14  1.035e-14 -3.672e+00 0.000243 ***
# cap.colorc                 5.543e-14  7.168e-14  7.730e-01 0.439323    
# cap.colore                -1.709e-15  3.337e-14 -5.100e-02 0.959145    
# cap.colorg                 4.570e-14  3.197e-14  1.429e+00 0.152914    
# cap.colorn                 5.296e-15  3.262e-14  1.620e-01 0.871038    
# cap.colorp                -3.893e-14  4.105e-14 -9.480e-01 0.343043    
# cap.colorr                 1.660e-13  1.203e-13  1.380e+00 0.167615    
# cap.coloru                 1.293e-13  1.203e-13  1.075e+00 0.282396    
# cap.colorw                 4.347e-14  3.217e-14  1.351e+00 0.176600    
# cap.colory                 3.094e-14  3.423e-14  9.040e-01 0.366158    
# bruisest                   5.000e-01  1.102e-13  4.538e+12  < 2e-16 ***
# odorc                      5.000e-01  1.318e-13  3.792e+12  < 2e-16 ***
# odorf                     -5.000e-01  3.300e-13 -1.515e+12  < 2e-16 ***
# odorl                     -2.673e-15  2.318e-14 -1.150e-01 0.908213    
# odorm                      2.500e+00  5.191e-13  4.816e+12  < 2e-16 ***
# odorn                     -1.500e+00  3.184e-13 -4.712e+12  < 2e-16 ***
# odorp                     -1.000e+00  3.964e-13 -2.523e+12  < 2e-16 ***
# odors                     -5.000e-01  3.306e-13 -1.512e+12  < 2e-16 ***
# odory                     -5.000e-01  3.306e-13 -1.512e+12  < 2e-16 ***
# gill.attachmentf           5.811e-14  1.093e-13  5.320e-01 0.594898    
# gill.spacingw             -3.494e-14  4.732e-14 -7.380e-01 0.460285    
# gill.sizen                -1.500e+00  2.984e-13 -5.027e+12  < 2e-16 ***
# gill.colore               -5.000e-01  1.976e-13 -2.530e+12  < 2e-16 ***
# gill.colorg               -5.000e-01  1.957e-13 -2.555e+12  < 2e-16 ***
# gill.colorh               -5.000e-01  1.953e-13 -2.560e+12  < 2e-16 ***
# gill.colork               -5.000e-01  1.960e-13 -2.552e+12  < 2e-16 ***
# gill.colorn               -5.000e-01  1.956e-13 -2.557e+12  < 2e-16 ***
# gill.coloro               -5.000e-01  2.033e-13 -2.460e+12  < 2e-16 ***
# gill.colorp               -5.000e-01  1.952e-13 -2.562e+12  < 2e-16 ***
# gill.colorr               -5.000e-01  2.115e-13 -2.364e+12  < 2e-16 ***
# gill.coloru               -5.000e-01  1.961e-13 -2.549e+12  < 2e-16 ***
# gill.colorw               -5.000e-01  1.948e-13 -2.567e+12  < 2e-16 ***
# gill.colory               -5.000e-01  2.015e-13 -2.481e+12  < 2e-16 ***
# stalk.shapet              -1.000e+00  1.944e-13 -5.145e+12  < 2e-16 ***
# stalk.rootb               -5.000e-01  1.287e-13 -3.886e+12  < 2e-16 ***
# stalk.rootc               -3.000e+00  6.028e-13 -4.977e+12  < 2e-16 ***
# stalk.roote                5.000e-01  1.184e-13  4.222e+12  < 2e-16 ***
# stalk.rootr               -3.500e+00  5.190e-13 -6.744e+12  < 2e-16 ***
# stalk.surface.above.ringk  5.733e-15  2.455e-14  2.340e-01 0.815332    
# stalk.surface.above.rings  6.957e-15  1.973e-14  3.530e-01 0.724415    
# stalk.surface.above.ringy -7.417e-15  2.696e-13 -2.800e-02 0.978056    
# stalk.surface.below.ringk  3.675e-15  2.455e-14  1.500e-01 0.880992    
# stalk.surface.below.rings  3.989e-15  1.973e-14  2.020e-01 0.839786    
# stalk.surface.below.ringy  5.000e-01  1.647e-13  3.035e+12  < 2e-16 ***
# stalk.color.above.ringc           NA         NA         NA       NA    
# stalk.color.above.ringe   -9.794e-15  5.372e-14 -1.820e-01 0.855337    
# stalk.color.above.ringg   -7.479e-16  2.854e-14 -2.600e-02 0.979098    
# stalk.color.above.ringn    2.426e-16  2.231e-14  1.100e-02 0.991321    
# stalk.color.above.ringo   -1.000e+00  2.487e-13 -4.021e+12  < 2e-16 ***
# stalk.color.above.ringp   -7.090e-16  2.231e-14 -3.200e-02 0.974644    
# stalk.color.above.ringw   -6.699e-16  2.543e-14 -2.600e-02 0.978987    
# stalk.color.above.ringy    3.000e+00  4.990e-13  6.013e+12  < 2e-16 ***
# stalk.color.below.ringc           NA         NA         NA       NA    
# stalk.color.below.ringe   -3.626e-15  5.372e-14 -6.800e-02 0.946181    
# stalk.color.below.ringg    8.068e-16  2.854e-14  2.800e-02 0.977452    
# stalk.color.below.ringn    6.493e-16  2.231e-14  2.900e-02 0.976776    
# stalk.color.below.ringo           NA         NA         NA       NA    
# stalk.color.below.ringp    6.740e-16  2.231e-14  3.000e-02 0.975895    
# stalk.color.below.ringw    6.205e-16  2.543e-14  2.400e-02 0.980537    
# stalk.color.below.ringy    1.474e-16  1.180e-13  1.000e-03 0.999004    
# veil.coloro                1.224e-16  4.732e-14  3.000e-03 0.997937    
# veil.colorw                       NA         NA         NA       NA    
# veil.colory                       NA         NA         NA       NA    
# ring.numbero               2.500e+00  4.776e-13  5.235e+12  < 2e-16 ***
# ring.numbert                      NA         NA         NA       NA    
# ring.typef                 1.000e+00  2.390e-13  4.185e+12  < 2e-16 ***
# ring.typel                        NA         NA         NA       NA    
# ring.typen                        NA         NA         NA       NA    
# ring.typep                 5.000e-01  1.002e-13  4.989e+12  < 2e-16 ***
# spore.print.colorh                NA         NA         NA       NA    
# spore.print.colork         4.758e-15  6.777e-14  7.000e-02 0.944034    
# spore.print.colorn        -1.446e-17  6.692e-14  0.000e+00 0.999828    
# spore.print.coloro        -1.100e-16  6.692e-14 -2.000e-03 0.998688    
# spore.print.colorr         2.500e+00  3.196e-13  7.822e+12  < 2e-16 ***
# spore.print.coloru        -8.504e-15  9.463e-14 -9.000e-02 0.928401    
# spore.print.colorw         1.500e+00  3.066e-13  4.893e+12  < 2e-16 ***
# spore.print.colory         2.863e-24  6.692e-14  0.000e+00 1.000000    
# populationc               -6.210e-14  5.716e-14 -1.086e+00 0.277296    
# populationn                3.537e-14  3.312e-14  1.068e+00 0.285547    
# populations               -8.241e-15  2.366e-14 -3.480e-01 0.727585    
# populationv               -6.210e-14  3.207e-14 -1.937e+00 0.052833 .  
# populationy               -5.922e-14  3.322e-14 -1.783e+00 0.074703 .  
# habitatg                   3.339e-16  1.969e-14  1.700e-02 0.986468    
# habitatl                  -5.514e-17  1.821e-14 -3.000e-03 0.997584    
# habitatm                   7.764e-16  3.352e-14  2.300e-02 0.981521    
# habitatp                  -1.103e-16  1.440e-14 -8.000e-03 0.993889    
# habitatu                   9.875e-14  3.432e-14  2.878e+00 0.004019 ** 
# habitatw                          NA         NA         NA       NA    
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# Residual standard error: 3.278e-13 on 8038 degrees of freedom
# Multiple R-squared:      1,     Adjusted R-squared:      1 
# F-statistic: 2.221e+26 on 85 and 8038 DF,  p-value: < 2.2e-16

# the NA rows are telling us that, because of the way the mushrooms
# are distributed, some of the dummy variables are redundant
# in the presence of the other dummies; this turns out not to present
# a problem in the prediction process

p.hat.linear.model.all.predictors <- 
  predict( linear.model.all.predictors, type = 'response' )

par( mfrow = c( 2, 1 ) )

hist( p.hat.linear.model.all.predictors[ raw.mushroom.data$poisonous == 0 ],
  nclass = 100, main = 'y = 0', xlab = 'p.hat', prob = T,
  xlim = c( 0, 1 ) )

hist( p.hat.linear.model.all.predictors[ raw.mushroom.data$poisonous == 1 ],
  nclass = 100, main = 'y = 1', xlab = 'p.hat', prob = T,
  xlim = c( 0, 1 ) )

par( mfrow = c( 1, 1 ) )

c( 

 mean( p.hat.linear.model.all.predictors[ raw.mushroom.data$poisonous == 0 ] ),
 mean( p.hat.linear.model.all.predictors[ raw.mushroom.data$poisonous == 1 ] ),
 mean( p.hat.linear.model.all.predictors[ raw.mushroom.data$poisonous == 1 ] ) -
 mean( p.hat.linear.model.all.predictors[ raw.mushroom.data$poisonous == 0 ] )

 )

# [1] 2.981711e-13 1.000000e+00 1.000000e+00

############## the fourth block of code ends here #################

############## the fifth block of code begins here ################

install.packages( 'DAAG' )

library( DAAG )

cross.validation.all.predictors.10.fold <- cv.lm( 
  data = raw.mushroom.data, form.lm = formula( poisonous ~ . ), 
  m = 10, printit = F ) 

warnings( )

# Warning messages:
# 1: In predict.lm(subs.lm, newdata = data[rows.out, ]) :
#   prediction from a rank-deficient fit may be misleading
# 2: In predict.lm(subs.lm, newdata = data[rows.out, ]) :
#   prediction from a rank-deficient fit may be misleading
# 3: In predict.lm(subs.lm, newdata = data[rows.out, ]) :
#   prediction from a rank-deficient fit may be misleading
# 4: In predict.lm(subs.lm, newdata = data[rows.out, ]) :
#   prediction from a rank-deficient fit may be misleading
# 5: In predict.lm(subs.lm, newdata = data[rows.out, ]) :
#   prediction from a rank-deficient fit may be misleading
# 6: In predict.lm(subs.lm, newdata = data[rows.out, ]) :
#   prediction from a rank-deficient fit may be misleading
# 7: In predict.lm(subs.lm, newdata = data[rows.out, ]) :
#   prediction from a rank-deficient fit may be misleading
# 8: In predict.lm(subs.lm, newdata = data[rows.out, ]) :
#   prediction from a rank-deficient fit may be misleading
# 9: In predict.lm(subs.lm, newdata = data[rows.out, ]) :
#   prediction from a rank-deficient fit may be misleading
# 10: In predict.lm(subs.lm, newdata = data[rows.out, ]) :
#   prediction from a rank-deficient fit may be misleading
# 11: In cv.lm(data = raw.mushroom.data, form.lm = formula(poisonous ~  ... : 

#  As there is >1 explanatory variable, cross-validation
#  predicted values for a fold are not a linear function
#  of corresponding overall predicted values.  Lines that
#  are shown for the different folds are approximate

# the next command computes the cross-validated
# root-mean-squared error of the regression predictions,
# which is essentially the same as the value
# from the regression with the full data set

sqrt( attr( cross.validation.all.predictors.10.fold, 'ms' ) )

# [1] 2.2e-13

############## the fifth block of code ends here ##################

############## the sixth block of code begins here ################

# the next command investigates which predictors can be dropped
# from the model with little or no predictive accuracy loss

null.model <- lm( poisonous ~ 1, data = raw.mushroom.data )

full.model <- lm( poisonous ~ ., data = raw.mushroom.data )

variable.selection.with.bic <- step( null.model,
  scope = list( lower = null.model, upper = full.model ), 
  direction = 'both', trace = 1, steps = 1000, k = log( n ) )

# the output of this command is in the file

#   ams-206-mushroom-analysis-variable-selection-with-bic.txt

############## the sixth block of code ends here ################

############ the seventh block of code begins here ##############

summary( linear.model.just.odor <- 
  lm( poisonous ~ odor, data = raw.mushroom.data ) )

# Call:
# lm(formula = poisonous ~ odor, data = raw.mushroom.data)

# Residuals:
#    Min     1Q Median     3Q    Max 
# -0.034 -0.034  0.000  0.000  0.966 

# Coefficients:
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 1.37e-14   5.98e-03    0.00        1    
# odorc       1.00e+00   1.05e-02   95.30  < 2e-16 ***
# odorf       1.00e+00   6.51e-03  153.71  < 2e-16 ***
# odorl       9.37e-14   8.45e-03    0.00        1    
# odorm       1.00e+00   2.08e-02   48.08  < 2e-16 ***
# odorn       3.40e-02   6.31e-03    5.39  7.1e-08 ***
# odorp       1.00e+00   9.57e-03  104.54  < 2e-16 ***
# odors       1.00e+00   7.78e-03  128.55  < 2e-16 ***
# odory       1.00e+00   7.78e-03  128.55  < 2e-16 ***
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# Residual standard error: 0.12 on 8115 degrees of freedom
# Multiple R-squared:  0.943,     Adjusted R-squared:  0.943 
# F-statistic: 1.67e+04 on 8 and 8115 DF,  p-value: <2e-16

p.hat.linear.model.just.odor <- 
  predict( linear.model.just.odor, type = 'response' )

par( mfrow = c( 2, 1 ) )

hist( p.hat.linear.model.just.odor[ raw.mushroom.data$poisonous == 0 ],
  nclass = 100, main = 'y = 0', xlab = 'p.hat', prob = T,
  xlim = c( 0, 1 ) )

hist( p.hat.linear.model.just.odor[ raw.mushroom.data$poisonous == 1 ],
  nclass = 100, main = 'y = 1', xlab = 'p.hat', prob = T,
  xlim = c( 0, 1 ) )

par( mfrow = c( 1, 1 ) )

table( p.hat.linear.model.just.odor, useNA = 'always' )

# p.hat.linear.model.just.odor
# 1.36725100485403e-14 1.07412118907621e-13   0.0340136054422548    0.999999999999688 
#                  400                  400                 3528                  192 
#    0.999999999999851     1.00000000000006     1.00000000000009     1.00000000000016 
#                 2160                   36                  576                  576 
#     1.00000000000018                 <NA> 
#                  256                    0 

app.says.poisonous.0.05 <- ifelse( p.hat.linear.model.just.odor > 0.05, 1, 0 )

with( raw.mushroom.data, table( app.says.poisonous.0.05, poisonous ) )

#                        poisonous
# app.says.poisonous.0.05    0    1
#                       0 4208  120
#                       1    0 3796

app.says.poisonous.0.01 <- ifelse( p.hat.linear.model.just.odor > 0.01, 1, 0 )

with( raw.mushroom.data, table( app.says.poisonous.0.01, poisonous ) )

#                        poisonous
# app.says.poisonous.0.01    0    1
#                       0  800    0
#                       1 3408 3916

############## the seventh block of code ends here ################