# R code for a partial analysis of the mushroom data set ################## the first block of code starts here ################## # unbuffer the output # change directory to the location to which you downloaded # the file 'mushroom-data.txt' raw.mushroom.data <- read.csv( "mushroom-data.txt", header = T ) str( raw.mushroom.data ) # 'data.frame': 8124 obs. of 23 variables: # $ poisonous : Factor w/ 2 levels "e","p": 2 1 1 2 1 1 1 1 2 1 ... # $ cap.shape : Factor w/ 6 levels "b","c","f","k",..: 6 6 1 6 6 6 1 1 6 1 ... # $ cap.surface : Factor w/ 4 levels "f","g","s","y": 3 3 3 4 3 4 3 4 4 3 ... # $ cap.color : Factor w/ 10 levels "b","c","e","g",..: 5 10 9 9 4 10 9 9 9 10 ... # $ bruises : Factor w/ 2 levels "f","t": 2 2 2 2 1 2 2 2 2 2 ... # $ odor : Factor w/ 9 levels "a","c","f","l",..: 7 1 4 7 6 1 1 4 7 1 ... # $ gill.atachment : Factor w/ 2 levels "a","f": 2 2 2 2 2 2 2 2 2 2 ... # $ gill.spacing : Factor w/ 2 levels "c","w": 1 1 1 1 2 1 1 1 1 1 ... # $ gill.size : Factor w/ 2 levels "b","n": 2 1 1 2 1 1 1 1 2 1 ... # $ gill.color : Factor w/ 12 levels "b","e","g","h",..: 5 5 6 6 5 6 3 6 8 3 ... # $ stalk.shape : Factor w/ 2 levels "e","t": 1 1 1 1 2 1 1 1 1 1 ... # $ stalk.root : Factor w/ 5 levels "?","b","c","e",..: 4 3 3 4 4 3 3 3 4 3 ... # $ stalk.surface.above.ring: Factor w/ 4 levels "f","k","s","y": 3 3 3 3 3 3 3 3 3 3 ... # $ stalk.surface.below.ring: Factor w/ 4 levels "f","k","s","y": 3 3 3 3 3 3 3 3 3 3 ... # $ stalk.color.above.ring : Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 8 8 8 8 8 8 ... # $ stalk.color.below.ring : Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 8 8 8 8 8 8 ... # $ veil.type : Factor w/ 1 level "p": 1 1 1 1 1 1 1 1 1 1 ... # $ veil.color : Factor w/ 4 levels "n","o","w","y": 3 3 3 3 3 3 3 3 3 3 ... # $ ring.number : Factor w/ 3 levels "n","o","t": 2 2 2 2 2 2 2 2 2 2 ... # $ ring.type : Factor w/ 5 levels "e","f","l","n",..: 5 5 5 5 1 5 5 5 5 5 ... # $ spore.print.color : Factor w/ 9 levels "b","h","k","n",..: 3 4 4 3 4 3 3 4 3 3 ... # $ population : Factor w/ 6 levels "a","c","n","s",..: 4 3 3 4 1 3 3 4 5 4 ... # $ habitat : Factor w/ 7 levels "d","g","l","m",..: 6 2 4 6 2 2 4 4 2 4 ... print( n <- dim( raw.mushroom.data )[ 1 ] ) # [1] 8124 the sample size in this data set # the next command converts the categorical variable 'poisonous' # into a quantitative dichotomous binary variable, in which 1 = poisonous # and 0 = edible raw.mushroom.data$poisonous <- ifelse( raw.mushroom.data$poisonous == 'p', 1, 0 ) # in the table arising from the str( ) command above, notice that # the variable 'veil.type' is constant (it's a factor with only 1 level); # such variables obviously have no predictive power, so the next command # drops this variable from the data set raw.mushroom.data <- subset( raw.mushroom.data, select = - veil.type ) str( raw.mushroom.data ) # 'data.frame': 8124 obs. of 22 variables: # $ poisonous : Factor w/ 2 levels "e","p": 2 1 1 2 1 1 1 1 2 1 ... # $ cap.shape : Factor w/ 6 levels "b","c","f","k",..: 6 6 1 6 6 6 1 1 6 1 ... # $ cap.surface : Factor w/ 4 levels "f","g","s","y": 3 3 3 4 3 4 3 4 4 3 ... # $ cap.color : Factor w/ 10 levels "b","c","e","g",..: 5 10 9 9 4 10 9 9 9 10 ... # $ bruises : Factor w/ 2 levels "f","t": 2 2 2 2 1 2 2 2 2 2 ... # $ odor : Factor w/ 9 levels "a","c","f","l",..: 7 1 4 7 6 1 1 4 7 1 ... # $ gill.atachment : Factor w/ 2 levels "a","f": 2 2 2 2 2 2 2 2 2 2 ... # $ gill.spacing : Factor w/ 2 levels "c","w": 1 1 1 1 2 1 1 1 1 1 ... # $ gill.size : Factor w/ 2 levels "b","n": 2 1 1 2 1 1 1 1 2 1 ... # $ gill.color : Factor w/ 12 levels "b","e","g","h",..: 5 5 6 6 5 6 3 6 8 3 ... # $ stalk.shape : Factor w/ 2 levels "e","t": 1 1 1 1 2 1 1 1 1 1 ... # $ stalk.root : Factor w/ 5 levels "?","b","c","e",..: 4 3 3 4 4 3 3 3 4 3 ... # $ stalk.surface.above.ring: Factor w/ 4 levels "f","k","s","y": 3 3 3 3 3 3 3 3 3 3 ... # $ stalk.surface.below.ring: Factor w/ 4 levels "f","k","s","y": 3 3 3 3 3 3 3 3 3 3 ... # $ stalk.color.above.ring : Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 8 8 8 8 8 8 ... # $ stalk.color.below.ring : Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 8 8 8 8 8 8 ... # $ veil.color : Factor w/ 4 levels "n","o","w","y": 3 3 3 3 3 3 3 3 3 3 ... # $ ring.number : Factor w/ 3 levels "n","o","t": 2 2 2 2 2 2 2 2 2 2 ... # $ ring.type : Factor w/ 5 levels "e","f","l","n",..: 5 5 5 5 1 5 5 5 5 5 ... # $ spore.print.color : Factor w/ 9 levels "b","h","k","n",..: 3 4 4 3 4 3 3 4 3 3 ... # $ population : Factor w/ 6 levels "a","c","n","s",..: 4 3 3 4 1 3 3 4 5 4 ... # $ habitat : Factor w/ 7 levels "d","g","l","m",..: 6 2 4 6 2 2 4 4 2 4 ... with( raw.mushroom.data, mean( poisonous ) ) # [1] 0.4820286 # the proportion of poisonous mushrooms in the data set is about 48%; # prediction of a dichotomous (binary) outcome is easiest when the data set # contains 50% yes/1/poisonous and 50% no/0/edible, so this # is close to optimal # the next function enables us to examine the relationship # between each of the predictor variables (one at a time) # and the dichotomous outcome tab.sum <- function( x1, y ) { # written by Daniel Helkey, edited by David Draper # summarize outcome variable y by levels of the categorical variable x1 # make sure x and y are same length, etc... stopifnot( length( x1 ) == length( y ) ) # two helper functions summaryFun <- function( x ) { return( list( n = length( x ), mean = mean( x, na.rm = TRUE ), sd = sd( x, na.rm = TRUE ) ) ) } mapFun <- function( level ) { indices <- ( x1 == level ) summaryFun( y[ indices ] ) } levels <- sort( unique( x1 ) ) out_mat <- do.call( rbind, Map( mapFun, levels ) ) out_mat <- cbind( levels, out_mat ) out_mat <- rbind( out_mat, c( 'Total', summaryFun( y ) ) ) colnames( out_mat )[ 1 ] <- as.character( substitute( x1 ) ) return( out_mat ) } # the next command computes the relative frequency distribution # of the variable 'cap.shape' with( raw.mushroom.data, signif( table( cap.shape, useNA = 'always' ) / n, 4 ) ) # cap.shape # b c f k s x # 0.0556400 0.0004924 0.3880000 0.1019000 0.0039390 0.4500000 0.0000000 # consulting the text file containing contextual information # about this data set, you can see that most of the mushrooms # have a cap shape that's convex (category 'x'), followed by # flat ('f') # you can also see that this variable has no missing values ('NA') with( raw.mushroom.data, tab.sum( cap.shape, poisonous ) ) # cap.shape n mean sd # [1,] 1 452 0.1061947 0.308428 # [2,] 2 4 1 0 # [3,] 3 3152 0.4936548 0.5000391 # [4,] 4 828 0.7246377 0.4469667 # [5,] 5 32 0 0 # [6,] 6 3656 0.4671772 0.4989898 # [7,] "Total" 8124 0.4820286 0.4997077 # the table above this line summarizes the relative frequency # of poisonous mushrooms for each cap shape # if all of the cap shape categories had the same proportion # of poisonous mushrooms (namely, the overall poisonous rate of 48%), # that would tell us that cap.shape has *no* predictive usefulness # instead we see that there's a bit of signal here: for example, # 828 of the mushrooms (the ones with cap.shape category 4, # which the relative-frequency table above shows is 'k' and # the context file says is knobbed) have a poisonous rate of about # 72%, which is far above average, and 452 of the mushrooms # category 1 ('b': bell) have a poisonous rate of only 10.6%, # which is well below average # so cap.shape looks like a decent candidate to put into # our predictive models ################## the first block of code ends here ################## ################ the second block of code starts here ################# # the next command fits a linear regression model # in which the binary outcome variable 'poisonous' # is regressed on the categorical predictor 'cap.shape', # and then summarizes the fit summary( linear.model.1 <- lm( poisonous ~ cap.shape, data = raw.mushroom.data ) ) # Call: # lm(formula = poisonous ~ cap.shape, data = raw.mushroom.data) # Residuals: # Min 1Q Median 3Q Max # -0.7246 -0.4672 -0.1062 0.5063 0.8938 # Coefficients: # Estimate Std. Error t value Pr(>|t|) # (Intercept) 0.10619 0.02279 4.659 3.22e-06 *** # cap.shapec 0.89381 0.24335 3.673 0.000241 *** # cap.shapef 0.38746 0.02437 15.898 < 2e-16 *** # cap.shapek 0.61844 0.02834 21.824 < 2e-16 *** # cap.shapes -0.10619 0.08864 -1.198 0.230926 # cap.shapex 0.36098 0.02416 14.942 < 2e-16 *** # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # Residual standard error: 0.4846 on 8118 degrees of freedom # Multiple R-squared: 0.06031, Adjusted R-squared: 0.05973 # F-statistic: 104.2 on 5 and 8118 DF, p-value: < 2.2e-16 # this factor (x.1 = cap.shape) has k.1 = 6 levels; R has created # ( k.1 - 1 ) = 5 temporary dummy variables called cap.shapec, # cap.shapef, ..., cap.shapex corresponding to the levels 'c', # 'f', ..., 'x' of x.1 and included these dummies as predictors # this means that it has chosen the omitted group to be # the first (in alphabetical order) level 'b' of x.1 = cap.shape; # R always does this (i.e., it always omits the dummy variable # corresponding to the alphabetically first level of a factor) # the next command creates a vector of p.hat values from linear.model.1 p.hat.linear.model.1 <- predict( linear.model.1, type = 'response' ) # the next four commands make a diagnostic plot from the p.hat values par( mfrow = c( 2, 1 ) ) hist( p.hat.linear.model.1[ raw.mushroom.data$poisonous == 0 ], nclass = 100, main = 'y = 0', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) hist( p.hat.linear.model.1[ raw.mushroom.data$poisonous == 1 ], nclass = 100, main = 'y = 1', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) par( mfrow = c( 1, 1 ) ) # the next command computes a diagnostic i call the # predictive separation index (psi), the last number # in the vector of three numbers c( mean( p.hat.linear.model.1[ raw.mushroom.data$poisonous == 0 ] ), mean( p.hat.linear.model.1[ raw.mushroom.data$poisonous == 1 ] ), mean( p.hat.linear.model.1[ raw.mushroom.data$poisonous == 1 ] ) - mean( p.hat.linear.model.1[ raw.mushroom.data$poisonous == 0 ] ) ) # [1] 0.45295970 0.51326496 0.06030526 ################ the second block of code ends here ################# ################ the third block of code begins here ################ univariate.exploration <- function( new.variable ) { print( with( raw.mushroom.data, signif( table( new.variable, useNA = 'always' ) / n, 4 ) ) ) print( with( raw.mushroom.data, tab.sum( new.variable, poisonous ) ) ) print( summary( temp.linear.model <- lm( poisonous ~ new.variable, data = raw.mushroom.data ) ) ) p.hat.temp.linear.model <- predict( temp.linear.model, type = 'response' ) par( mfrow = c( 2, 1 ) ) hist( p.hat.temp.linear.model[ raw.mushroom.data$poisonous == 0 ], nclass = 100, main = 'y = 0', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) hist( p.hat.temp.linear.model[ raw.mushroom.data$poisonous == 1 ], nclass = 100, main = 'y = 1', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) par( mfrow = c( 1, 1 ) ) print( c( mean( p.hat.temp.linear.model[ raw.mushroom.data$poisonous == 0 ] ), mean( p.hat.temp.linear.model[ raw.mushroom.data$poisonous == 1 ] ), mean( p.hat.temp.linear.model[ raw.mushroom.data$poisonous == 1 ] ) - mean( p.hat.temp.linear.model[ raw.mushroom.data$poisonous == 0 ] ) ) ) return( ) } univariate.exploration( raw.mushroom.data$cap.surface ) # new.variable # f g s y # 0.2856000 0.0004924 0.3146000 0.3993000 0.0000000 # new.variable n mean sd # [1,] 1 2320 0.3275862 0.4694342 # [2,] 2 4 1 0 # [3,] 3 2556 0.5524257 0.4973413 # [4,] 4 3244 0.5363748 0.498752 # [5,] "Total" 8124 0.4820286 0.4997077 # Call: # lm(formula = poisonous ~ new.variable, data = raw.mushroom.data) # Residuals: # Min 1Q Median 3Q Max # -0.5524 -0.5364 -0.3276 0.4636 0.6724 # Coefficients: # Estimate Std. Error t value Pr(>|t|) # (Intercept) 0.32759 0.01017 32.200 < 2e-16 *** # new.variableg 0.67241 0.24522 2.742 0.00612 ** # new.variables 0.22484 0.01405 16.001 < 2e-16 *** # new.variabley 0.20879 0.01332 15.671 < 2e-16 *** # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # Residual standard error: 0.49 on 8120 degrees of freedom # Multiple R-squared: 0.03878, Adjusted R-squared: 0.03842 # F-statistic: 109.2 on 3 and 8120 DF, p-value: < 2.2e-16 # [1] 0.46333584 0.50211511 0.03877928 # NULL # to call this function, just replace # cap.surface # in # univariate.exploration( raw.mushroom.data$cap.surface ) # with the name of the predictor variable you want to explore # (e.g., bruises, ...) ################ the third block of code ends here ################# ############## the fourth block of code begins here ################ # the next command fits a linear model using all predictors summary( linear.model.all.predictors <- lm( poisonous ~ ., data = raw.mushroom.data ) ) # Call: # lm(formula = poisonous ~ ., data = raw.mushroom.data) # Residuals: # Min 1Q Median 3Q Max # -1.863e-12 -9.900e-15 -7.000e-16 7.800e-15 2.636e-11 # Coefficients: (10 not defined because of singularities) # Estimate Std. Error t value Pr(>|t|) # (Intercept) 4.509e-13 2.843e-13 1.586e+00 0.112793 # cap.shapec 4.351e-13 1.898e-13 2.293e+00 0.021893 * # cap.shapef 3.121e-14 2.157e-14 1.446e+00 0.148077 # cap.shapek 3.524e-13 2.345e-14 1.503e+01 < 2e-16 *** # cap.shapes -1.149e-14 7.394e-14 -1.550e-01 0.876466 # cap.shapex 2.952e-13 2.067e-14 1.428e+01 < 2e-16 *** # cap.surfaceg -3.231e-13 2.320e-13 -1.393e+00 0.163785 # cap.surfaces -2.751e-14 1.235e-14 -2.228e+00 0.025937 * # cap.surfacey -3.800e-14 1.035e-14 -3.672e+00 0.000243 *** # cap.colorc 5.543e-14 7.168e-14 7.730e-01 0.439323 # cap.colore -1.709e-15 3.337e-14 -5.100e-02 0.959145 # cap.colorg 4.570e-14 3.197e-14 1.429e+00 0.152914 # cap.colorn 5.296e-15 3.262e-14 1.620e-01 0.871038 # cap.colorp -3.893e-14 4.105e-14 -9.480e-01 0.343043 # cap.colorr 1.660e-13 1.203e-13 1.380e+00 0.167615 # cap.coloru 1.293e-13 1.203e-13 1.075e+00 0.282396 # cap.colorw 4.347e-14 3.217e-14 1.351e+00 0.176600 # cap.colory 3.094e-14 3.423e-14 9.040e-01 0.366158 # bruisest 5.000e-01 1.102e-13 4.538e+12 < 2e-16 *** # odorc 5.000e-01 1.318e-13 3.792e+12 < 2e-16 *** # odorf -5.000e-01 3.300e-13 -1.515e+12 < 2e-16 *** # odorl -2.673e-15 2.318e-14 -1.150e-01 0.908213 # odorm 2.500e+00 5.191e-13 4.816e+12 < 2e-16 *** # odorn -1.500e+00 3.184e-13 -4.712e+12 < 2e-16 *** # odorp -1.000e+00 3.964e-13 -2.523e+12 < 2e-16 *** # odors -5.000e-01 3.306e-13 -1.512e+12 < 2e-16 *** # odory -5.000e-01 3.306e-13 -1.512e+12 < 2e-16 *** # gill.attachmentf 5.811e-14 1.093e-13 5.320e-01 0.594898 # gill.spacingw -3.494e-14 4.732e-14 -7.380e-01 0.460285 # gill.sizen -1.500e+00 2.984e-13 -5.027e+12 < 2e-16 *** # gill.colore -5.000e-01 1.976e-13 -2.530e+12 < 2e-16 *** # gill.colorg -5.000e-01 1.957e-13 -2.555e+12 < 2e-16 *** # gill.colorh -5.000e-01 1.953e-13 -2.560e+12 < 2e-16 *** # gill.colork -5.000e-01 1.960e-13 -2.552e+12 < 2e-16 *** # gill.colorn -5.000e-01 1.956e-13 -2.557e+12 < 2e-16 *** # gill.coloro -5.000e-01 2.033e-13 -2.460e+12 < 2e-16 *** # gill.colorp -5.000e-01 1.952e-13 -2.562e+12 < 2e-16 *** # gill.colorr -5.000e-01 2.115e-13 -2.364e+12 < 2e-16 *** # gill.coloru -5.000e-01 1.961e-13 -2.549e+12 < 2e-16 *** # gill.colorw -5.000e-01 1.948e-13 -2.567e+12 < 2e-16 *** # gill.colory -5.000e-01 2.015e-13 -2.481e+12 < 2e-16 *** # stalk.shapet -1.000e+00 1.944e-13 -5.145e+12 < 2e-16 *** # stalk.rootb -5.000e-01 1.287e-13 -3.886e+12 < 2e-16 *** # stalk.rootc -3.000e+00 6.028e-13 -4.977e+12 < 2e-16 *** # stalk.roote 5.000e-01 1.184e-13 4.222e+12 < 2e-16 *** # stalk.rootr -3.500e+00 5.190e-13 -6.744e+12 < 2e-16 *** # stalk.surface.above.ringk 5.733e-15 2.455e-14 2.340e-01 0.815332 # stalk.surface.above.rings 6.957e-15 1.973e-14 3.530e-01 0.724415 # stalk.surface.above.ringy -7.417e-15 2.696e-13 -2.800e-02 0.978056 # stalk.surface.below.ringk 3.675e-15 2.455e-14 1.500e-01 0.880992 # stalk.surface.below.rings 3.989e-15 1.973e-14 2.020e-01 0.839786 # stalk.surface.below.ringy 5.000e-01 1.647e-13 3.035e+12 < 2e-16 *** # stalk.color.above.ringc NA NA NA NA # stalk.color.above.ringe -9.794e-15 5.372e-14 -1.820e-01 0.855337 # stalk.color.above.ringg -7.479e-16 2.854e-14 -2.600e-02 0.979098 # stalk.color.above.ringn 2.426e-16 2.231e-14 1.100e-02 0.991321 # stalk.color.above.ringo -1.000e+00 2.487e-13 -4.021e+12 < 2e-16 *** # stalk.color.above.ringp -7.090e-16 2.231e-14 -3.200e-02 0.974644 # stalk.color.above.ringw -6.699e-16 2.543e-14 -2.600e-02 0.978987 # stalk.color.above.ringy 3.000e+00 4.990e-13 6.013e+12 < 2e-16 *** # stalk.color.below.ringc NA NA NA NA # stalk.color.below.ringe -3.626e-15 5.372e-14 -6.800e-02 0.946181 # stalk.color.below.ringg 8.068e-16 2.854e-14 2.800e-02 0.977452 # stalk.color.below.ringn 6.493e-16 2.231e-14 2.900e-02 0.976776 # stalk.color.below.ringo NA NA NA NA # stalk.color.below.ringp 6.740e-16 2.231e-14 3.000e-02 0.975895 # stalk.color.below.ringw 6.205e-16 2.543e-14 2.400e-02 0.980537 # stalk.color.below.ringy 1.474e-16 1.180e-13 1.000e-03 0.999004 # veil.coloro 1.224e-16 4.732e-14 3.000e-03 0.997937 # veil.colorw NA NA NA NA # veil.colory NA NA NA NA # ring.numbero 2.500e+00 4.776e-13 5.235e+12 < 2e-16 *** # ring.numbert NA NA NA NA # ring.typef 1.000e+00 2.390e-13 4.185e+12 < 2e-16 *** # ring.typel NA NA NA NA # ring.typen NA NA NA NA # ring.typep 5.000e-01 1.002e-13 4.989e+12 < 2e-16 *** # spore.print.colorh NA NA NA NA # spore.print.colork 4.758e-15 6.777e-14 7.000e-02 0.944034 # spore.print.colorn -1.446e-17 6.692e-14 0.000e+00 0.999828 # spore.print.coloro -1.100e-16 6.692e-14 -2.000e-03 0.998688 # spore.print.colorr 2.500e+00 3.196e-13 7.822e+12 < 2e-16 *** # spore.print.coloru -8.504e-15 9.463e-14 -9.000e-02 0.928401 # spore.print.colorw 1.500e+00 3.066e-13 4.893e+12 < 2e-16 *** # spore.print.colory 2.863e-24 6.692e-14 0.000e+00 1.000000 # populationc -6.210e-14 5.716e-14 -1.086e+00 0.277296 # populationn 3.537e-14 3.312e-14 1.068e+00 0.285547 # populations -8.241e-15 2.366e-14 -3.480e-01 0.727585 # populationv -6.210e-14 3.207e-14 -1.937e+00 0.052833 . # populationy -5.922e-14 3.322e-14 -1.783e+00 0.074703 . # habitatg 3.339e-16 1.969e-14 1.700e-02 0.986468 # habitatl -5.514e-17 1.821e-14 -3.000e-03 0.997584 # habitatm 7.764e-16 3.352e-14 2.300e-02 0.981521 # habitatp -1.103e-16 1.440e-14 -8.000e-03 0.993889 # habitatu 9.875e-14 3.432e-14 2.878e+00 0.004019 ** # habitatw NA NA NA NA # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # Residual standard error: 3.278e-13 on 8038 degrees of freedom # Multiple R-squared: 1, Adjusted R-squared: 1 # F-statistic: 2.221e+26 on 85 and 8038 DF, p-value: < 2.2e-16 # the NA rows are telling us that, because of the way the mushrooms # are distributed, some of the dummy variables are redundant # in the presence of the other dummies; this turns out not to present # a problem in the prediction process p.hat.linear.model.all.predictors <- predict( linear.model.all.predictors, type = 'response' ) par( mfrow = c( 2, 1 ) ) hist( p.hat.linear.model.all.predictors[ raw.mushroom.data$poisonous == 0 ], nclass = 100, main = 'y = 0', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) hist( p.hat.linear.model.all.predictors[ raw.mushroom.data$poisonous == 1 ], nclass = 100, main = 'y = 1', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) par( mfrow = c( 1, 1 ) ) c( mean( p.hat.linear.model.all.predictors[ raw.mushroom.data$poisonous == 0 ] ), mean( p.hat.linear.model.all.predictors[ raw.mushroom.data$poisonous == 1 ] ), mean( p.hat.linear.model.all.predictors[ raw.mushroom.data$poisonous == 1 ] ) - mean( p.hat.linear.model.all.predictors[ raw.mushroom.data$poisonous == 0 ] ) ) # [1] 2.981711e-13 1.000000e+00 1.000000e+00 ############## the fourth block of code ends here ################# ############## the fifth block of code begins here ################ install.packages( 'DAAG' ) library( DAAG ) cross.validation.all.predictors.10.fold <- cv.lm( data = raw.mushroom.data, form.lm = formula( poisonous ~ . ), m = 10, printit = F ) warnings( ) # Warning messages: # 1: In predict.lm(subs.lm, newdata = data[rows.out, ]) : # prediction from a rank-deficient fit may be misleading # 2: In predict.lm(subs.lm, newdata = data[rows.out, ]) : # prediction from a rank-deficient fit may be misleading # 3: In predict.lm(subs.lm, newdata = data[rows.out, ]) : # prediction from a rank-deficient fit may be misleading # 4: In predict.lm(subs.lm, newdata = data[rows.out, ]) : # prediction from a rank-deficient fit may be misleading # 5: In predict.lm(subs.lm, newdata = data[rows.out, ]) : # prediction from a rank-deficient fit may be misleading # 6: In predict.lm(subs.lm, newdata = data[rows.out, ]) : # prediction from a rank-deficient fit may be misleading # 7: In predict.lm(subs.lm, newdata = data[rows.out, ]) : # prediction from a rank-deficient fit may be misleading # 8: In predict.lm(subs.lm, newdata = data[rows.out, ]) : # prediction from a rank-deficient fit may be misleading # 9: In predict.lm(subs.lm, newdata = data[rows.out, ]) : # prediction from a rank-deficient fit may be misleading # 10: In predict.lm(subs.lm, newdata = data[rows.out, ]) : # prediction from a rank-deficient fit may be misleading # 11: In cv.lm(data = raw.mushroom.data, form.lm = formula(poisonous ~ ... : # As there is >1 explanatory variable, cross-validation # predicted values for a fold are not a linear function # of corresponding overall predicted values. Lines that # are shown for the different folds are approximate # the next command computes the cross-validated # root-mean-squared error of the regression predictions, # which is essentially the same as the value # from the regression with the full data set sqrt( attr( cross.validation.all.predictors.10.fold, 'ms' ) ) # [1] 2.2e-13 ############## the fifth block of code ends here ################## ############## the sixth block of code begins here ################ # the next command investigates which predictors can be dropped # from the model with little or no predictive accuracy loss null.model <- lm( poisonous ~ 1, data = raw.mushroom.data ) full.model <- lm( poisonous ~ ., data = raw.mushroom.data ) variable.selection.with.bic <- step( null.model, scope = list( lower = null.model, upper = full.model ), direction = 'both', trace = 1, steps = 1000, k = log( n ) ) # the output of this command is in the file # ams-206-mushroom-analysis-variable-selection-with-bic.txt ############## the sixth block of code ends here ################ ############ the seventh block of code begins here ############## summary( linear.model.just.odor <- lm( poisonous ~ odor, data = raw.mushroom.data ) ) # Call: # lm(formula = poisonous ~ odor, data = raw.mushroom.data) # Residuals: # Min 1Q Median 3Q Max # -0.034 -0.034 0.000 0.000 0.966 # Coefficients: # Estimate Std. Error t value Pr(>|t|) # (Intercept) 1.37e-14 5.98e-03 0.00 1 # odorc 1.00e+00 1.05e-02 95.30 < 2e-16 *** # odorf 1.00e+00 6.51e-03 153.71 < 2e-16 *** # odorl 9.37e-14 8.45e-03 0.00 1 # odorm 1.00e+00 2.08e-02 48.08 < 2e-16 *** # odorn 3.40e-02 6.31e-03 5.39 7.1e-08 *** # odorp 1.00e+00 9.57e-03 104.54 < 2e-16 *** # odors 1.00e+00 7.78e-03 128.55 < 2e-16 *** # odory 1.00e+00 7.78e-03 128.55 < 2e-16 *** # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # Residual standard error: 0.12 on 8115 degrees of freedom # Multiple R-squared: 0.943, Adjusted R-squared: 0.943 # F-statistic: 1.67e+04 on 8 and 8115 DF, p-value: <2e-16 p.hat.linear.model.just.odor <- predict( linear.model.just.odor, type = 'response' ) par( mfrow = c( 2, 1 ) ) hist( p.hat.linear.model.just.odor[ raw.mushroom.data$poisonous == 0 ], nclass = 100, main = 'y = 0', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) hist( p.hat.linear.model.just.odor[ raw.mushroom.data$poisonous == 1 ], nclass = 100, main = 'y = 1', xlab = 'p.hat', prob = T, xlim = c( 0, 1 ) ) par( mfrow = c( 1, 1 ) ) table( p.hat.linear.model.just.odor, useNA = 'always' ) # p.hat.linear.model.just.odor # 1.36725100485403e-14 1.07412118907621e-13 0.0340136054422548 0.999999999999688 # 400 400 3528 192 # 0.999999999999851 1.00000000000006 1.00000000000009 1.00000000000016 # 2160 36 576 576 # 1.00000000000018 # 256 0 app.says.poisonous.0.05 <- ifelse( p.hat.linear.model.just.odor > 0.05, 1, 0 ) with( raw.mushroom.data, table( app.says.poisonous.0.05, poisonous ) ) # poisonous # app.says.poisonous.0.05 0 1 # 0 4208 120 # 1 0 3796 app.says.poisonous.0.01 <- ifelse( p.hat.linear.model.just.odor > 0.01, 1, 0 ) with( raw.mushroom.data, table( app.says.poisonous.0.01, poisonous ) ) # poisonous # app.says.poisonous.0.01 0 1 # 0 800 0 # 1 3408 3916 ############## the seventh block of code ends here ################