Student Solution

The following proposed solution uses the “Heart.csv” file to predict the incidence of heart disease among patients (i.e. the “AHD” variable). As this is a binary outcome, several of the methods below are classification models.

Loading packages & data

rm(list = ls())
library(tidyverse)
library(glmnet)
library(tree)
library(rpart)
library(rpart.plot)
library(randomForest)

# read in csv
heart_raw <- read_csv("seminar-material/Heart.csv")

Clean data & prepare training + testing files

Drop rows with missing values, convert categorical variables to factors, recode AHD as 0/1, and drop the row-index column

heart_cleaned = heart_raw %>%
  drop_na() %>%
  mutate(across(c(ChestPain, Thal), factor)) %>%
  mutate(AHD = ifelse(AHD == "No",
                      0,
                      1)) %>%
  select(-...1)

Create training and test data

set.seed(1)
# randomly assign 75% of rows to the training set
training_list = sample(1:nrow(heart_cleaned), floor(3*nrow(heart_cleaned)/4))
training_set = heart_cleaned %>%
  filter(row_number() %in% training_list)

# covariates in training set
training_set_x = training_set %>%
  select(-AHD)

# y var in training set
training_set_y = training_set %>%
  select(AHD)

test_set = heart_cleaned %>%
  filter(!row_number() %in% training_list)

# covariates in test set
test_set_x = test_set %>%
  select(-AHD)

# y var in test set
test_set_y = test_set %>%
  select(AHD)
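
As a quick optional check, we can confirm that the proportion of AHD = 1 cases is similar in the training and test splits (a small extra sketch; nothing below depends on it):

# share of AHD = 1 in each split (optional sanity check; AHD is coded 0/1)
mean(training_set$AHD)
mean(test_set$AHD)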

Linear Probability Model

Estimate LPM

lm_heart = lm(AHD ~., data = training_set)
summary(lm_heart)

Call:
lm(formula = AHD ~ ., data = training_set)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.95305 -0.22780 -0.03695  0.18969  0.93653 

Coefficients:
                      Estimate Std. Error t value Pr(>|t|)    
(Intercept)          0.2421555  0.3864105   0.627 0.531567    
Age                 -0.0027566  0.0033881  -0.814 0.416812    
Sex                  0.1790580  0.0584518   3.063 0.002483 ** 
ChestPainnonanginal -0.2186487  0.0646517  -3.382 0.000862 ***
ChestPainnontypical -0.1978742  0.0783460  -2.526 0.012305 *  
ChestPaintypical    -0.2671775  0.0967247  -2.762 0.006262 ** 
RestBP               0.0027032  0.0014410   1.876 0.062088 .  
Chol                 0.0004496  0.0004828   0.931 0.352899    
Fbs                 -0.0769553  0.0731226  -1.052 0.293848    
RestECG              0.0317033  0.0254639   1.245 0.214543    
MaxHR               -0.0026952  0.0014136  -1.907 0.057968 .  
ExAng                0.0912519  0.0627836   1.453 0.147632    
Oldpeak              0.0177934  0.0302302   0.589 0.556778    
Slope                0.0854202  0.0547350   1.561 0.120157    
Ca                   0.1528525  0.0300224   5.091 8.03e-07 ***
Thalnormal          -0.0864777  0.1100964  -0.785 0.433083    
Thalreversable       0.1085007  0.1029830   1.054 0.293316    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.3535 on 205 degrees of freedom
Multiple R-squared:  0.5346,    Adjusted R-squared:  0.4982 
F-statistic: 14.72 on 16 and 205 DF,  p-value: < 2.2e-16

Compute and plot predicted values

lm_pred = predict(lm_heart, newdata = test_set_x)

# add to a predictions dataset
predictions_dataset = test_set_y %>%
  add_column(lm_pred)

# plotting predictions against actual values of AHD
# plus regression line (with standard errors)
ggplot(predictions_dataset, aes(x = lm_pred, y = AHD)) + 
  geom_point() +
  geom_smooth(method = "lm", se = TRUE)
`geom_smooth()` using formula = 'y ~ x'

Compute and store MSE

mse_lm = mean((predictions_dataset$lm_pred - predictions_dataset$AHD)^2)

Logit

Estimate logit model

logit_heart = glm(AHD ~., data = training_set,
                  family = binomial(link = "logit"))
summary(logit_heart)

Call:
glm(formula = AHD ~ ., family = binomial(link = "logit"), data = training_set)

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)         -4.150966   3.435225  -1.208  0.22691    
Age                 -0.023463   0.029751  -0.789  0.43033    
Sex                  1.804318   0.603192   2.991  0.00278 ** 
ChestPainnonanginal -1.753466   0.569798  -3.077  0.00209 ** 
ChestPainnontypical -1.186858   0.659783  -1.799  0.07204 .  
ChestPaintypical    -2.058178   0.739613  -2.783  0.00539 ** 
RestBP               0.027662   0.012578   2.199  0.02786 *  
Chol                 0.006939   0.004460   1.556  0.11969    
Fbs                 -0.580343   0.692135  -0.838  0.40176    
RestECG              0.215295   0.217382   0.990  0.32198    
MaxHR               -0.023207   0.013539  -1.714  0.08651 .  
ExAng                0.622182   0.511284   1.217  0.22364    
Oldpeak              0.144425   0.283812   0.509  0.61084    
Slope                0.873239   0.458705   1.904  0.05695 .  
Ca                   1.346442   0.315736   4.264    2e-05 ***
Thalnormal          -0.093183   0.984062  -0.095  0.92456    
Thalreversable       0.942034   0.949106   0.993  0.32093    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 305.95  on 221  degrees of freedom
Residual deviance: 150.14  on 205  degrees of freedom
AIC: 184.14

Number of Fisher Scoring iterations: 6

Compute and plot the predicted values. Note that the logit model is linear in the log-odds: without specifying type = "response", predict() returns the predicted log-odds. Specifying type = "response" converts these log-odds into probabilities using the logistic function.
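
As a small optional sanity check (assuming the standard predict() behaviour for glm objects), the link-scale predictions can be converted by hand with plogis(), the logistic CDF, and should match the type = "response" output:

# optional check: converting log-odds by hand reproduces type = "response"
logit_link_pred = predict(logit_heart, newdata = test_set_x, type = "link")
all.equal(plogis(logit_link_pred),
          predict(logit_heart, newdata = test_set_x, type = "response"))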

logit_pred = predict(logit_heart, newdata = test_set_x,
                     type = "response")

# add to a predictions dataset
predictions_dataset = predictions_dataset %>%
  add_column(logit_pred)

# plotting predictions against actual values of AHD
# plus regression line (with standard errors)
ggplot(predictions_dataset, aes(x = logit_pred, y = AHD)) + 
  geom_point() +
  geom_smooth(method = "glm",
              method.args = list(family = "binomial"),
              se = TRUE)
`geom_smooth()` using formula = 'y ~ x'

Compute and store the MSE

mse_logit = mean((predictions_dataset$logit_pred - predictions_dataset$AHD)^2)

LPM with penalisation (Ridge, LASSO)

Create matrices of the training-data covariates and y values for these glmnet regressions. Note, the -1 removes the intercept column.

X = model.matrix(~. - 1, data = training_set_x)
y = model.matrix(~. -1, data = training_set_y)
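
Optionally, we can check that model.matrix has expanded the factor variables (ChestPain, Thal) into dummy columns:

# optional check on the dummy-expanded design matrix
dim(X)
colnames(X)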

Ridge regression: in glmnet, alpha is the elastic-net mixing parameter (alpha = 1 gives the LASSO penalty, alpha = 0 gives the ridge penalty) and lambda is the penalty strength, so ridge uses alpha = 0. If lambda is not explicitly chosen, glmnet fits the model over a sequence of (100) lambda values and returns a fit for each.

# Ridge (alpha=0)
ridge_lm = glmnet(X, y, alpha = 0)

# LASSO (alpha = 1)
lasso_lm = glmnet(X, y, alpha = 1)
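
To visualise the whole sequence of fits, the coefficient paths can be plotted against log(lambda); this is an optional sketch, nothing below depends on it:

# coefficient paths across the lambda sequence (optional)
plot(ridge_lm, xvar = "lambda", label = TRUE)
plot(lasso_lm, xvar = "lambda", label = TRUE)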

Find optimal lambda using cross-validation

cv_ridge = cv.glmnet(X, y, alpha = 0)
plot(cv_ridge)

cv_lasso = cv.glmnet(X, y, alpha = 1)
plot(cv_lasso)

We can use either the lambda that minimises the cross-validated error (lambda.min) or the largest lambda within one standard error of that minimum (lambda.1se); the latter is statistically indistinguishable from the minimum but encourages the most parsimony.

best_lambda_ridge <- cv_ridge$lambda.min
best_lambda_lasso <- cv_lasso$lambda.min
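
If the more parsimonious choice is preferred, cv.glmnet also stores the one-standard-error lambdas; the names below are illustrative only, as the rest of the solution keeps lambda.min:

# largest lambda within 1 SE of the minimum CV error (optional alternative)
lambda_1se_ridge <- cv_ridge$lambda.1se
lambda_1se_lasso <- cv_lasso$lambda.1se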

Re-estimate using cross-validated lambdas

ridge_lm = glmnet(X, y, alpha = 0, lambda = best_lambda_ridge)
coef(ridge_lm)
18 x 1 sparse Matrix of class "dgCMatrix"
                                 s0
(Intercept)            0.1949964465
Age                   -0.0001486077
Sex                    0.1318820062
ChestPainasymptomatic  0.1239429970
ChestPainnonanginal   -0.0757550405
ChestPainnontypical   -0.0561441225
ChestPaintypical      -0.0974488394
RestBP                 0.0016269983
Chol                   0.0002823434
Fbs                   -0.0479978216
RestECG                0.0275318571
MaxHR                 -0.0023282165
ExAng                  0.0943465699
Oldpeak                0.0318760159
Slope                  0.0596369624
Ca                     0.1075913668
Thalnormal            -0.1027598013
Thalreversable         0.0933925932
lasso_lm = glmnet(X, y, alpha = 1, lambda = best_lambda_lasso)
coef(lasso_lm)
18 x 1 sparse Matrix of class "dgCMatrix"
                                 s0
(Intercept)            0.5394031490
Age                    .           
Sex                    .           
ChestPainasymptomatic  0.0311738079
ChestPainnonanginal    .           
ChestPainnontypical    .           
ChestPaintypical       .           
RestBP                 .           
Chol                   .           
Fbs                    .           
RestECG                .           
MaxHR                 -0.0004762322
ExAng                  .           
Oldpeak                .           
Slope                  .           
Ca                     0.0056972778
Thalnormal            -0.0555254843
Thalreversable         .           

Make predictions on test set

X_test = model.matrix(~. - 1, data = test_set_x)
ridge_lm_pred = predict(ridge_lm, newx = X_test, s = best_lambda_ridge)
lasso_lm_pred = predict(lasso_lm, newx = X_test, s = best_lambda_lasso)

Add the ridge and LASSO LPM predictions to the predictions dataset

predictions_dataset = predictions_dataset %>% 
  bind_cols(ridge_lm_pred,
            lasso_lm_pred) %>%
  rename(ridge_lm_pred = last_col(offset = 1),
         lasso_lm_pred = last_col())
New names:
• `s1` -> `s1...4`
• `s1` -> `s1...5`

Plotting predictions against actual values of AHD with regression line (with standard errors) for Ridge,

ggplot(predictions_dataset, aes(x = ridge_lm_pred, y = AHD)) + 
  geom_point() +
  geom_smooth(method = "lm", se = TRUE)
`geom_smooth()` using formula = 'y ~ x'

and then LASSO.

ggplot(predictions_dataset, aes(x = lasso_lm_pred, y = AHD)) + 
  geom_point() +
  geom_smooth(method = "lm", se = TRUE)
`geom_smooth()` using formula = 'y ~ x'

Compute and store MSEs

mse_lm_ridge = mean((predictions_dataset$ridge_lm_pred - predictions_dataset$AHD)^2)
mse_lm_lasso = mean((predictions_dataset$lasso_lm_pred - predictions_dataset$AHD)^2)

Logit with penalisation (Ridge, LASSO)

Estimate models

# ridge regression (alpha = 0)
ridge_logit = glmnet(X, y, family = "binomial", alpha = 0)

# LASSO (alpha = 1)
lasso_logit = glmnet(X, y, family = "binomial", alpha = 1)

Find optimal lambda using cross-validation

cv_ridge <- cv.glmnet(X, y, family = "binomial", alpha = 0)
plot(cv_ridge)

cv_lasso <- cv.glmnet(X, y, family = "binomial", alpha = 1)
plot(cv_lasso)

# Best lambda values
best_lambda_ridge <- cv_ridge$lambda.min
best_lambda_lasso <- cv_lasso$lambda.min
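
By default cv.glmnet cross-validates binomial models on the deviance; misclassification error can be used instead via type.measure = "class". This is an optional variant (the object name is illustrative) and is not used in what follows:

# cross-validate on misclassification error instead of deviance (optional)
cv_lasso_class = cv.glmnet(X, y, family = "binomial", alpha = 1,
                           type.measure = "class")
plot(cv_lasso_class)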

Re-estimate using cross-validated lambdas

ridge_logit = glmnet(X, y, family = "binomial",
                     alpha = 0, lambda = best_lambda_ridge)
coef(ridge_logit)
18 x 1 sparse Matrix of class "dgCMatrix"
                                s0
(Intercept)           -2.661402839
Age                   -0.002610642
Sex                    0.957860963
ChestPainasymptomatic  0.769543615
ChestPainnonanginal   -0.526442599
ChestPainnontypical   -0.259377844
ChestPaintypical      -0.613834638
RestBP                 0.012988845
Chol                   0.002782430
Fbs                   -0.292254411
RestECG                0.182781847
MaxHR                 -0.014358323
ExAng                  0.540745355
Oldpeak                0.221090359
Slope                  0.407296108
Ca                     0.746793790
Thalnormal            -0.495527722
Thalreversable         0.564571575
lasso_logit = glmnet(X, y, family = "binomial",alpha = 1,
                     lambda = best_lambda_lasso)
coef(lasso_logit)
18 x 1 sparse Matrix of class "dgCMatrix"
                                s0
(Intercept)           -3.162812284
Age                    .          
Sex                    1.042538876
ChestPainasymptomatic  1.353767797
ChestPainnonanginal    .          
ChestPainnontypical    .          
ChestPaintypical       .          
RestBP                 0.011537202
Chol                   0.002119158
Fbs                   -0.070976617
RestECG                0.114921995
MaxHR                 -0.014420841
ExAng                  0.481805396
Oldpeak                0.144953650
Slope                  0.429642894
Ca                     0.888321133
Thalnormal            -0.308037617
Thalreversable         0.684159842

Compute and plot out-of-sample predictions

ridge_logit_pred = predict(ridge_logit, newx = X_test, s = best_lambda_ridge, type = "response")
lasso_logit_pred = predict(lasso_logit, newx = X_test, s = best_lambda_lasso, type = "response")

# add ridge and lasso logit predictions to predictions dataset
predictions_dataset = predictions_dataset %>% 
  bind_cols(ridge_logit_pred,
            lasso_logit_pred) %>%
  rename(ridge_logit_pred = last_col(offset = 1),
         lasso_logit_pred = last_col())
New names:
• `s1` -> `s1...6`
• `s1` -> `s1...7`
# plotting predictions against actual values of AHD
# plus regression line (with standard errors)
ggplot(predictions_dataset, aes(x = ridge_logit_pred, y = AHD)) + 
  geom_point() +
  geom_smooth(method = "glm",
              method.args = list(family = "binomial"),
              se = TRUE)
`geom_smooth()` using formula = 'y ~ x'

And for LASSO

ggplot(predictions_dataset, aes(x = lasso_logit_pred, y = AHD)) + 
  geom_point() +
  geom_smooth(method = "glm",
              method.args = list(family = "binomial"),
              se = TRUE)
`geom_smooth()` using formula = 'y ~ x'

Compute and store MSE

mse_logit_ridge = mean((predictions_dataset$ridge_logit_pred - predictions_dataset$AHD)^2)
mse_logit_lasso = mean((predictions_dataset$lasso_logit_pred - predictions_dataset$AHD)^2)

Classification tree (with tree)

Begin by converting AHD to a factor variable

training_set_AHD_fact = training_set %>%
  mutate(AHD = as.factor(AHD))

Estimate and plot the tree using the tree package

tree_heart = tree(AHD ~., data = training_set_AHD_fact)
summary(tree_heart)

Classification tree:
tree(formula = AHD ~ ., data = training_set_AHD_fact)
Variables actually used in tree construction:
[1] "Thal"      "Ca"        "RestBP"    "Chol"      "MaxHR"     "Oldpeak"  
[7] "Slope"     "Sex"       "ChestPain"
Number of terminal nodes:  17 
Residual mean deviance:  0.4809 = 98.57 / 205 
Misclassification error rate: 0.1126 = 25 / 222 
tree_heart
node), split, n, deviance, yval, (yprob)
      * denotes terminal node

  1) root 222 306.000 0 ( 0.54505 0.45495 )  
    2) Thal: normal 125 137.800 0 ( 0.76000 0.24000 )  
      4) Ca < 0.5 86  61.820 0 ( 0.88372 0.11628 )  
        8) RestBP < 156.5 81  42.780 0 ( 0.92593 0.07407 )  
         16) Chol < 228.5 32   0.000 0 ( 1.00000 0.00000 ) *
         17) Chol > 228.5 49  36.430 0 ( 0.87755 0.12245 )  
           34) MaxHR < 169.5 30  30.020 0 ( 0.80000 0.20000 )  
             68) Oldpeak < 1.25 25  18.350 0 ( 0.88000 0.12000 )  
              136) Oldpeak < 0.1 13  14.050 0 ( 0.76923 0.23077 ) *
              137) Oldpeak > 0.1 12   0.000 0 ( 1.00000 0.00000 ) *
             69) Oldpeak > 1.25 5   6.730 1 ( 0.40000 0.60000 ) *
           35) MaxHR > 169.5 19   0.000 0 ( 1.00000 0.00000 ) *
        9) RestBP > 156.5 5   5.004 1 ( 0.20000 0.80000 ) *
      5) Ca > 0.5 39  54.040 1 ( 0.48718 0.51282 )  
       10) Slope < 1.5 26  32.100 0 ( 0.69231 0.30769 )  
         20) Sex < 0.5 13   0.000 0 ( 1.00000 0.00000 ) *
         21) Sex > 0.5 13  17.320 1 ( 0.38462 0.61538 )  
           42) ChestPain: nonanginal,nontypical,typical 8  10.590 0 ( 0.62500 0.37500 ) *
           43) ChestPain: asymptomatic 5   0.000 1 ( 0.00000 1.00000 ) *
       11) Slope > 1.5 13   7.051 1 ( 0.07692 0.92308 ) *
    3) Thal: fixed,reversable 97 112.800 1 ( 0.26804 0.73196 )  
      6) ChestPain: nonanginal,nontypical,typical 37  51.050 0 ( 0.54054 0.45946 )  
       12) Ca < 0.5 23  26.400 0 ( 0.73913 0.26087 )  
         24) Slope < 1.5 7   0.000 0 ( 1.00000 0.00000 ) *
         25) Slope > 1.5 16  21.170 0 ( 0.62500 0.37500 ) *
       13) Ca > 0.5 14  14.550 1 ( 0.21429 0.78571 )  
         26) Chol < 232 7   9.561 1 ( 0.42857 0.57143 ) *
         27) Chol > 232 7   0.000 1 ( 0.00000 1.00000 ) *
      7) ChestPain: asymptomatic 60  39.010 1 ( 0.10000 0.90000 )  
       14) Oldpeak < 0.55 11  14.420 1 ( 0.36364 0.63636 ) *
       15) Oldpeak > 0.55 49  16.710 1 ( 0.04082 0.95918 )  
         30) Thal: fixed 10  10.010 1 ( 0.20000 0.80000 ) *
         31) Thal: reversable 39   0.000 1 ( 0.00000 1.00000 ) *
# plot tree
plot(tree_heart)
text(tree_heart, pretty = 1)

Generate predicted values using type = "vector", which returns the predicted class probabilities as 2 columns: the first is the probability that AHD = 0 and the second the probability that AHD = 1 (the one we’re interested in). Alternatively, type = "class" returns the predicted class directly.

tree_pred = predict(tree_heart, newdata = test_set_x, type = "vector")

# add predictions to predictions dataset
# note we only keep the 2nd column of `tree_pred` as explained above
predictions_dataset = predictions_dataset %>% 
  bind_cols(tree_pred[,2]) %>%
  rename(tree_pred = last_col())
New names:
• `` -> `...8`
# plot predicted values against actual values
ggplot(predictions_dataset, aes(x = tree_pred, y = AHD)) + 
  geom_point() +
  geom_smooth(method = "lm",
              se = TRUE)
`geom_smooth()` using formula = 'y ~ x'

Compute and store the MSE. Note, because these predictions are probabilities rather than hard 0/1 classes, this is a Brier score; for 0/1 predictions it would equal the misclassification rate. Trees are very sensitive to the sample (they overfit, giving high variance); this tendency to overfit comes from the sequential, greedy way the tree is grown.

mse_tree = mean((predictions_dataset$AHD - predictions_dataset$tree_pred)^2)
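
Because tree_pred is a probability, a hard-classification error rate can also be computed by thresholding at 0.5; this is an optional extra (the object name is illustrative):

# optional: misclassification rate after thresholding predicted probabilities at 0.5
tree_class_error = mean((predictions_dataset$tree_pred > 0.5) != predictions_dataset$AHD)
tree_class_error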

Pruned tree

set.seed(789)
cvtree_heart = cv.tree(tree_heart, FUN = prune.tree)
names(cvtree_heart)
[1] "size"   "dev"    "k"      "method"
cvtree_heart
$size
 [1] 17 16 15 14 13 11 10  9  8  7  6  5  4  3  2  1

$dev
 [1] 407.5569 385.4016 366.2669 367.3015 343.6143 327.9664 328.5242 328.5242
 [9] 325.9152 267.8555 259.0320 260.1638 260.1638 277.7223 278.2690 309.3478

$k
 [1]      -Inf  4.300942  4.947779  4.987522  5.232343  6.376143  6.703877
 [8]  6.738228  7.877432 10.098779 14.044109 14.773333 14.892341 21.905580
[15] 22.713030 55.410752

$method
[1] "deviance"

attr(,"class")
[1] "prune"         "tree.sequence"

Plot tree size and the cost-complexity parameter against the cross-validated deviance. We can see visually that a tree size of 6 (6 terminal nodes) gives minimal deviance; increasing the tree size beyond that results in overfitting.

par(mfrow = c(1,2))
plot(cvtree_heart$size, cvtree_heart$dev, type = "b")
plot(cvtree_heart$k, cvtree_heart$dev, type = "b")

#returning plots back to 1 plot per figure
par(mfrow = c(1,1))

# the tree size that attains the minimal CV deviance (here 6)
optimal_size = cvtree_heart$size[which.min(cvtree_heart$dev)]
optimal_size
[1] 6
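
An alternative sketch (not used below) is to cross-validate on the misclassification count directly rather than the deviance, via FUN = prune.misclass:

# optional: cross-validate on misclassifications instead of deviance
cvtree_misclass = cv.tree(tree_heart, FUN = prune.misclass)
cvtree_misclass$size[which.min(cvtree_misclass$dev)]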

Prune tree and generate new predicted values

prune_heart = prune.tree(tree_heart, best = optimal_size)
plot(prune_heart)
text(prune_heart, pretty = 1)

# generate predicted values from pruned tree
prune_tree_pred = predict(prune_heart, newdata = test_set_x, type = "vector")

# add predictions to predictions dataset
predictions_dataset = predictions_dataset %>% 
  bind_cols(prune_tree_pred[,2]) %>%
  rename(prune_tree_pred = last_col())
New names:
• `` -> `...9`

Compute and store the MSE. Note, as these are again predicted probabilities, this is a Brier score; with hard 0/1 predictions it would equal the misclassification rate.

mse_prune_tree = mean((predictions_dataset$AHD - predictions_dataset$prune_tree_pred)^2)

Classification tree (with rpart)

AHD is a factor in this training set, so rpart would default to the class method; we specify method = "class" explicitly

tree_heart_rpart = rpart(AHD ~., data = training_set_AHD_fact, method = "class")
summary(tree_heart_rpart)
Call:
rpart(formula = AHD ~ ., data = training_set_AHD_fact, method = "class")
  n= 222 

          CP nsplit rel error    xerror       xstd
1 0.44554455      0 1.0000000 1.0000000 0.07346078
2 0.05940594      1 0.5544554 0.7128713 0.06905800
3 0.05445545      3 0.4356436 0.6237624 0.06650755
4 0.01000000      5 0.3267327 0.5049505 0.06205621

Variable importance
     Thal     MaxHR        Ca ChestPain       Sex   Oldpeak     ExAng     Slope 
       22        12        11        10        10         9         8         7 
      Age    RestBP   RestECG      Chol 
        4         2         2         2 

Node number 1: 222 observations,    complexity param=0.4455446
  predicted class=0  expected loss=0.454955  P(node) =1
    class counts:   121   101
   probabilities: 0.545 0.455 
  left son=2 (125 obs) right son=3 (97 obs)
  Primary splits:
      Thal      splits as  RLR,       improve=26.43724, (0 missing)
      Ca        < 0.5   to the left,  improve=26.35143, (0 missing)
      MaxHR     < 150.5 to the right, improve=25.49278, (0 missing)
      ChestPain splits as  RLLL,      improve=24.75130, (0 missing)
      Oldpeak   < 1.7   to the left,  improve=19.60541, (0 missing)
  Surrogate splits:
      MaxHR   < 150.5 to the right, agree=0.707, adj=0.330, (0 split)
      Slope   < 1.5   to the left,  agree=0.698, adj=0.309, (0 split)
      Oldpeak < 1.55  to the left,  agree=0.694, adj=0.299, (0 split)
      ExAng   < 0.5   to the left,  agree=0.685, adj=0.278, (0 split)
      Sex     < 0.5   to the left,  agree=0.649, adj=0.196, (0 split)

Node number 2: 125 observations,    complexity param=0.05940594
  predicted class=0  expected loss=0.24  P(node) =0.5630631
    class counts:    95    30
   probabilities: 0.760 0.240 
  left son=4 (86 obs) right son=5 (39 obs)
  Primary splits:
      Ca        < 0.5   to the left,  improve=8.438402, (0 missing)
      ChestPain splits as  RLLR,      improve=6.923375, (0 missing)
      MaxHR     < 119.5 to the right, improve=5.609579, (0 missing)
      Oldpeak   < 1.7   to the left,  improve=4.833038, (0 missing)
      ExAng     < 0.5   to the left,  improve=4.474680, (0 missing)
  Surrogate splits:
      Age       < 62.5  to the left,  agree=0.760, adj=0.231, (0 split)
      MaxHR     < 130.5 to the right, agree=0.736, adj=0.154, (0 split)
      ChestPain splits as  LLLR,      agree=0.704, adj=0.051, (0 split)
      Oldpeak   < 1.7   to the left,  agree=0.704, adj=0.051, (0 split)

Node number 3: 97 observations,    complexity param=0.05445545
  predicted class=1  expected loss=0.2680412  P(node) =0.4369369
    class counts:    26    71
   probabilities: 0.268 0.732 
  left son=6 (37 obs) right son=7 (60 obs)
  Primary splits:
      ChestPain splits as  RLLL,      improve=8.883477, (0 missing)
      Ca        < 0.5   to the left,  improve=8.188351, (0 missing)
      MaxHR     < 144.5 to the right, improve=6.623394, (0 missing)
      Oldpeak   < 0.7   to the left,  improve=6.113597, (0 missing)
      Slope     < 1.5   to the left,  improve=3.431719, (0 missing)
  Surrogate splits:
      MaxHR   < 144.5 to the right, agree=0.732, adj=0.297, (0 split)
      ExAng   < 0.5   to the left,  agree=0.732, adj=0.297, (0 split)
      Oldpeak < 0.7   to the left,  agree=0.680, adj=0.162, (0 split)
      RestBP  < 109   to the left,  agree=0.660, adj=0.108, (0 split)
      Age     < 63.5  to the right, agree=0.649, adj=0.081, (0 split)

Node number 4: 86 observations
  predicted class=0  expected loss=0.1162791  P(node) =0.3873874
    class counts:    76    10
   probabilities: 0.884 0.116 

Node number 5: 39 observations,    complexity param=0.05940594
  predicted class=1  expected loss=0.4871795  P(node) =0.1756757
    class counts:    19    20
   probabilities: 0.487 0.513 
  left son=10 (17 obs) right son=11 (22 obs)
  Primary splits:
      Sex       < 0.5   to the left,  improve=6.818730, (0 missing)
      Slope     < 1.5   to the left,  improve=6.564103, (0 missing)
      ChestPain splits as  RLLL,      improve=5.818730, (0 missing)
      ExAng     < 0.5   to the left,  improve=2.857309, (0 missing)
      RestBP    < 139   to the right, improve=2.632007, (0 missing)
  Surrogate splits:
      ChestPain splits as  RLLR,      agree=0.718, adj=0.353, (0 split)
      RestECG   < 1     to the left,  agree=0.718, adj=0.353, (0 split)
      Age       < 56.5  to the right, agree=0.692, adj=0.294, (0 split)
      RestBP    < 136   to the right, agree=0.667, adj=0.235, (0 split)
      Chol      < 292   to the right, agree=0.667, adj=0.235, (0 split)

Node number 6: 37 observations,    complexity param=0.05445545
  predicted class=0  expected loss=0.4594595  P(node) =0.1666667
    class counts:    20    17
   probabilities: 0.541 0.459 
  left son=12 (23 obs) right son=13 (14 obs)
  Primary splits:
      Ca     < 0.5   to the left,  improve=4.794527, (0 missing)
      MaxHR  < 143.5 to the right, improve=4.386315, (0 missing)
      Slope  < 1.5   to the left,  improve=2.413343, (0 missing)
      Chol   < 205.5 to the left,  improve=1.730759, (0 missing)
      RestBP < 122.5 to the left,  improve=1.359745, (0 missing)
  Surrogate splits:
      MaxHR     < 146.5 to the right, agree=0.730, adj=0.286, (0 split)
      Oldpeak   < 1.95  to the left,  agree=0.703, adj=0.214, (0 split)
      Age       < 68.5  to the left,  agree=0.676, adj=0.143, (0 split)
      ChestPain splits as  -RLL,      agree=0.676, adj=0.143, (0 split)
      Chol      < 190.5 to the right, agree=0.649, adj=0.071, (0 split)

Node number 7: 60 observations
  predicted class=1  expected loss=0.1  P(node) =0.2702703
    class counts:     6    54
   probabilities: 0.100 0.900 

Node number 10: 17 observations
  predicted class=0  expected loss=0.1764706  P(node) =0.07657658
    class counts:    14     3
   probabilities: 0.824 0.176 

Node number 11: 22 observations
  predicted class=1  expected loss=0.2272727  P(node) =0.0990991
    class counts:     5    17
   probabilities: 0.227 0.773 

Node number 12: 23 observations
  predicted class=0  expected loss=0.2608696  P(node) =0.1036036
    class counts:    17     6
   probabilities: 0.739 0.261 

Node number 13: 14 observations
  predicted class=1  expected loss=0.2142857  P(node) =0.06306306
    class counts:     3    11
   probabilities: 0.214 0.786 
tree_heart_rpart
n= 222 

node), split, n, loss, yval, (yprob)
      * denotes terminal node

 1) root 222 101 0 (0.5450450 0.4549550)  
   2) Thal=normal 125  30 0 (0.7600000 0.2400000)  
     4) Ca< 0.5 86  10 0 (0.8837209 0.1162791) *
     5) Ca>=0.5 39  19 1 (0.4871795 0.5128205)  
      10) Sex< 0.5 17   3 0 (0.8235294 0.1764706) *
      11) Sex>=0.5 22   5 1 (0.2272727 0.7727273) *
   3) Thal=fixed,reversable 97  26 1 (0.2680412 0.7319588)  
     6) ChestPain=nonanginal,nontypical,typical 37  17 0 (0.5405405 0.4594595)  
      12) Ca< 0.5 23   6 0 (0.7391304 0.2608696) *
      13) Ca>=0.5 14   3 1 (0.2142857 0.7857143) *
     7) ChestPain=asymptomatic 60   6 1 (0.1000000 0.9000000) *
# plot tree
rpart.plot(tree_heart_rpart)

Generate and plot predicted values

tree_pred_rpart = predict(tree_heart_rpart, newdata = test_set_x, type = "class")

# add predictions to predictions dataset
predictions_dataset = predictions_dataset %>% 
  bind_cols(as.numeric(tree_pred_rpart)-1) %>%
  rename(tree_pred_rpart = last_col())
New names:
• `` -> `...10`
# plot predicted values against actual values
ggplot(predictions_dataset, aes(x = tree_pred_rpart, y = AHD)) + 
  geom_point() +
  geom_smooth(method = "lm",
              se = TRUE)
`geom_smooth()` using formula = 'y ~ x'

Compute and store the MSE. Note, because these predictions are 0/1 classes, this is the same as the misclassification rate. Trees are very sensitive to the sample (they overfit, giving high variance); this tendency to overfit comes from the sequential way trees are grown.

mse_tree_rpart = mean((predictions_dataset$AHD - predictions_dataset$tree_pred_rpart)^2)

Plot cost-complexity parameter of tree

plotcp(tree_heart_rpart)

Use CV to pick optimal cp parameter

optimal_cp = tree_heart_rpart$cptable[which.min(tree_heart_rpart$cptable[,"xerror"]), "CP"]
optimal_cp
[1] 0.01
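
Analogous to lambda.1se for glmnet above, a one-standard-error rule can be applied to the cp table to favour a smaller tree; this is an optional sketch (object names are illustrative) and the pruning below keeps optimal_cp:

# optional: smallest tree whose CV error is within 1 SE of the minimum
cptab = tree_heart_rpart$cptable
best_row = which.min(cptab[, "xerror"])
cp_1se = cptab[which(cptab[, "xerror"] <=
                       cptab[best_row, "xerror"] + cptab[best_row, "xstd"])[1], "CP"]
cp_1se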

Prune the tree

prune_heart_rpart = prune(tree_heart_rpart, cp = optimal_cp)

# plot pruned tree
rpart.plot(prune_heart_rpart)

Generate predicted values from pruned tree. Note, these predictions are saved as factors.

prune_tree_rpart_pred = predict(prune_heart_rpart, newdata = test_set_x, type = "class")

# add predictions to predictions dataset
predictions_dataset = predictions_dataset %>% 
  bind_cols(as.numeric(prune_tree_rpart_pred)-1) %>%
  rename(prune_tree_rpart_pred = last_col())
New names:
• `` -> `...11`

Compute and store MSE. Note, this is the same as the misclassification rate for a binary (0/1) variable.

mse_prune_tree_rpart = mean((predictions_dataset$AHD - predictions_dataset$prune_tree_rpart_pred)^2)

Bagging

set.seed(8)
# bagging: the number of variables tried at each split (mtry) equals the total number of predictors
bag_heart = randomForest(AHD ~., data = training_set_AHD_fact, mtry = ncol(training_set_AHD_fact)-1,
                         importance = TRUE)
bag_heart

Call:
 randomForest(formula = AHD ~ ., data = training_set_AHD_fact,      mtry = ncol(training_set_AHD_fact) - 1, importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 13

        OOB estimate of  error rate: 20.72%
Confusion matrix:
   0  1 class.error
0 98 23   0.1900826
1 23 78   0.2277228
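
Optionally, the out-of-bag error as a function of the number of trees can be inspected:

# optional: OOB and per-class error rates versus number of trees
plot(bag_heart, main = "Bagging: error rate vs number of trees")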

Generate predictions

bag_pred = predict(bag_heart, newdata = test_set_x)

# add predictions to predictions dataset
predictions_dataset = predictions_dataset %>% 
  bind_cols(as.numeric(bag_pred)-1) %>%
  rename(bag_pred = last_col())
New names:
• `` -> `...12`

Compute and store MSE. Note, this is the same as the misclassification rate for a binary (0/1) variable.

mse_bag = mean((predictions_dataset$bag_pred - predictions_dataset$AHD)^2)

Random Forest

set.seed(9)
# random forest: the number of predictors sampled at each split (mtry) is smaller than the total number of predictors; I chose 5
forest_heart = randomForest(AHD ~., data = training_set_AHD_fact, mtry = 5,
                         importance = TRUE)
forest_heart

Call:
 randomForest(formula = AHD ~ ., data = training_set_AHD_fact,      mtry = 5, importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 5

        OOB estimate of  error rate: 18.47%
Confusion matrix:
    0  1 class.error
0 106 15   0.1239669
1  26 75   0.2574257
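
The value mtry = 5 is a judgment call; a small optional sketch (assuming the OOB error is a reasonable selection criterion, with illustrative names and seed) loops over candidate values:

# optional: OOB error for a range of mtry values
set.seed(10)
oob_by_mtry = sapply(2:8, function(m) {
  rf = randomForest(AHD ~ ., data = training_set_AHD_fact, mtry = m)
  rf$err.rate[nrow(rf$err.rate), "OOB"]  # final OOB error rate
})
names(oob_by_mtry) = 2:8
oob_by_mtry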

Generate predictions

forest_pred = predict(forest_heart, newdata = test_set_x)

# add predictions to predictions dataset
predictions_dataset = predictions_dataset %>% 
  bind_cols(as.numeric(forest_pred)-1) %>%
  rename(forest_pred = last_col())
New names:
• `` -> `...13`

Compute and store MSE. Note, this is the same as the misclassification rate for a binary (0/1) variable.

mse_forest = mean((predictions_dataset$forest_pred - predictions_dataset$AHD)^2)

View importance of each variable

importance(forest_heart)
                   0          1 MeanDecreaseAccuracy MeanDecreaseGini
Age        7.4813332  1.5625272           6.72565563        8.4384225
Sex       14.8645298  7.7057242          15.96165461        4.8548739
ChestPain  9.3111531 11.5074587          14.94983417       12.3559486
RestBP     1.5848401  2.0435154           2.35415853        8.4844240
Chol       2.1655147 -2.8478400          -0.09698318        8.5918130
Fbs        1.5604704 -0.6417825           0.90755919        0.7394629
RestECG    0.2401243  0.8172705           0.71520690        1.7564494
MaxHR     11.0172530  5.9234611          12.57651338       16.8039954
ExAng      4.6298814  4.9564873           6.90975700        4.9899581
Oldpeak   10.1551432 10.9200106          15.31150069       10.0643034
Slope      8.7171783  8.7564891          12.41001509        6.3931459
Ca        21.8972697 19.0532580          26.53962079       15.5191479
Thal      12.4311268  8.8729300          14.76064804       10.5493398
varImpPlot(forest_heart)

Comparison

# Get variables starting with "mse_"
mse_vars = ls(pattern = "^mse_")

# Create dataframe with variable names and values
mse_df = data.frame(
  method = sub("^mse_", "", mse_vars),
  mse = sapply(mse_vars, get),
  stringsAsFactors = FALSE)

mse_df = mse_df %>%
  arrange(mse)

view(mse_df)
mse_df
                               method        mse
mse_logit_ridge           logit_ridge 0.09556880
mse_logit_lasso           logit_lasso 0.09775592
mse_logit                       logit 0.10001662
mse_lm_ridge                 lm_ridge 0.10308230
mse_lm                             lm 0.10644605
mse_prune_tree             prune_tree 0.17054945
mse_forest                     forest 0.17333333
mse_tree                         tree 0.18526094
mse_bag                           bag 0.20000000
mse_prune_tree_rpart prune_tree_rpart 0.20000000
mse_tree_rpart             tree_rpart 0.20000000
mse_lm_lasso                 lm_lasso 0.21966165
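
Because the tree-based models contribute hard 0/1 classes while the regression-style models contribute probabilities, a complementary comparison (an optional sketch with illustrative names) is the misclassification rate after thresholding every prediction at 0.5:

# optional: misclassification rate per model after thresholding at 0.5
pred_cols = setdiff(names(predictions_dataset), "AHD")
error_rates = sapply(predictions_dataset[pred_cols],
                     function(p) mean((p > 0.5) != predictions_dataset$AHD))
sort(error_rates)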

To inspect the probabilistic or 0/1 class predictions stored for each model in predictions_dataset, run

view(predictions_dataset)