This chunk performs binarization. Note that it is set to fullRank = FALSE. This creates binarized variables for each factor level. If set to TRUE it will leave one out. Note the new variables are placed in a new dataframe. It can attached to an existing dataframe via old.df <- cbind(old.df, binarized_vars)

factor_names <- c("ER","Age") #insert the column names of the variables to be binarized
factor_vars <- readmission[,factor_names]
for (var in factor_names) {
  factor_vars[, var] <- as.character(factor_vars[, var])
}

binarizer <- caret::dummyVars(paste("~", paste(factor_names, collapse = "+")) , data = factor_vars, fullRank = TRUE)
binarized_vars <- data.frame(predict(binarizer, newdata = factor_vars))
head(binarized_vars)

This chunk creates training and testing sets.

#Create train and test sets
set.seed(4321)
partition <- createDataPartition(readmission[,1], list = FALSE, p = .75) #The partition will stratify using variable 1 from the dataframe
train <- readmission[partition, ]
test <- readmission[-partition, ]

print("TRAIN")
mean(train$Readmission.Status)

print("TEST")
mean(test$Readmission.Status)

The following chunk provides code that can be used to combine factor levels. It also relevels in case the new level has the highest frequency.

#USED fct-infreq method instead

#This example combines levels other than White of Race into a new level called NonWhite.
#Execute the function levels(readmission$Race) to identify the levels. Be sure the variable is a factor variable before doing this. This code assumes the variable has previously been releveled so that "White" is the first level.

#readmission2<-readmission #The results are in a new data frame called readmission2. This is done so that the results can be checked without losing the original data frame. When done, consider executing readmission <- readmission2

#Warning: If you run this code and import the plyr library, it will cause errors later on.
#To fix these errors, there are two options
#1. Restart R and RStudio and load tidyverse AFTER plyr
#2. Tell R to use the dplyr library for functions that share the same name in both libraries: dplyr::summarise, dplyr::group_by, etc.
#library(plyr)
#var <- "Race"
#var.levels <- levels(readmission2[,var])
#readmission2[,var] <- mapvalues(readmission2[,var],var.levels,c("White","NonWhite","NonWhite","NonWhite"))
#Relevel
#table <- as.data.frame(table(readmission2[,var]))
#  max <- which.max(table[,2])
#  level.name <- as.character(table[max,1])
#  readmission2[,var] <- relevel(readmission2[,var], ref = level.name)

#table(readmission2[,var])

This chunk reads in the data, relevels factors, and prints a summary.

#Set factor levels to those with the most observations
readmission <- readmission %>% 
  mutate_if(is.character, fct_infreq)

summary(readmission) 
##  Readmission.Status Gender          Race             ER           DRG.Class    
##  Min.   :0.0000     F:38011   White   :56124   Min.   :0.0000   MED    :35771  
##  1st Qu.:0.0000     M:28771   Black   : 7099   1st Qu.:0.0000   SURG   :30447  
##  Median :0.0000               Others  : 2273   Median :0.0000   UNGROUP:  564  
##  Mean   :0.1259               Hispanic: 1286   Mean   :0.5083                  
##  3rd Qu.:0.0000                                3rd Qu.:1.0000                  
##  Max.   :1.0000                                Max.   :9.0000                  
##       LOS              Age         HCC.Riskscore         DRG.Complication
##  Min.   : 1.000   Min.   : 24.00   Min.   : 0.079   MedicalMCC.CC:18110  
##  1st Qu.: 3.000   1st Qu.: 67.00   1st Qu.: 1.107   SurgMCC.CC   :15468  
##  Median : 5.000   Median : 75.00   Median : 1.865   MedicalNoC   :12310  
##  Mean   : 6.693   Mean   : 73.64   Mean   : 2.345   SurgNoC      :11549  
##  3rd Qu.: 8.000   3rd Qu.: 83.00   3rd Qu.: 3.173   Other        : 9345  
##  Max.   :36.000   Max.   :101.00   Max.   :12.307
glimpse(readmission)
## Rows: 66,782
## Columns: 9
## $ Readmission.Status <dbl> 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, …
## $ Gender             <fct> M, M, M, M, M, M, F, F, F, F, F, M, M, M, M, M, M, …
## $ Race               <fct> White, White, White, White, White, White, White, Wh…
## $ ER                 <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, …
## $ DRG.Class          <fct> MED, SURG, SURG, MED, MED, SURG, MED, MED, MED, SUR…
## $ LOS                <dbl> 29, 5, 9, 5, 13, 10, 4, 10, 14, 4, 3, 9, 8, 22, 10,…
## $ Age                <dbl> 73, 73, 73, 73, 73, 73, 71, 71, 71, 71, 71, 67, 67,…
## $ HCC.Riskscore      <dbl> 12.307, 12.307, 12.307, 12.307, 12.307, 12.307, 11.…
## $ DRG.Complication   <fct> Other, SurgNoC, SurgNoC, MedicalNoC, MedicalNoC, Su…

Task 1 - Perform univariate exploration of the four non-factor variables (6 points)

Code is provided to create a histogram for one of the variables.

library(ggplot2)
readmission %>% ggplot(aes(ER)) + geom_histogram(bins = 9) #Set the number of bins equal to 9 because there are 9 unique values

readmission %>% ggplot(aes(LOS)) + geom_histogram(bins = 36) #Set the number of bins equal to 36 because there are 36 unique values

readmission %>% ggplot(aes(log(LOS))) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

readmission %>% ggplot(aes(Age)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

readmission %>% ggplot(aes(HCC.Riskscore)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

readmission <- readmission %>% mutate(log_LOS = log(LOS),
                                      log_riskscore = log(HCC.Riskscore)) %>% 
  select(-LOS, -HCC.Riskscore)

Task 2 - Examine relationships between DRG.Class and DRG.Complication (5 points)

Code is provided to create a tabular view of the two variables.

readmission %>% 
  count(DRG.Class, DRG.Complication) %>% 
  spread(DRG.Class,n) %>% 
  make_table()
DRG.Complication MED SURG UNGROUP
MedicalMCC.CC 18104 6 NA
SurgMCC.CC NA 15468 NA
MedicalNoC 12310 NA NA
SurgNoC NA 11549 NA
Other 5357 3424 564
#Using base R
table(readmission$DRG.Class,readmission$DRG.Complication) 
##          
##           MedicalMCC.CC SurgMCC.CC MedicalNoC SurgNoC Other
##   MED             18104          0      12310       0  5357
##   SURG                6      15468          0   11549  3424
##   UNGROUP             0          0          0       0   564
#Using base R
#rows_to_drop <- (readmission$DRG.Complication  == "MedicalMCC.CC") & (readmission$DRG.Class == "SURG")
#readmission_temp <- readmission[!rows_to_drop,]

#removes the 6 patients 
readmission <- readmission %>% 
  filter(!(DRG.Complication == "MedicalMCC.CC" & DRG.Class == "SURG"))

readmission <- readmission %>% 
  mutate(DRG.Class = case_when(
    DRG.Complication == "MedicalMCC.CC" & DRG.Class == "MED" ~ "MED",
    DRG.Complication == "SurgMCC.CC" & DRG.Class == "SURG" ~ "SURG",
    DRG.Complication == "MedicalNoC" & DRG.Class == "MED" ~ "MED",
    DRG.Complication == "SurgNoC" & DRG.Class == "SURG" ~ "SURG",
    TRUE ~ "OTHER"
  )) 

#Using Base R,
#readmission_temp$DRG.Class <- case_when(
#    readmission$DRG.Complication == "MedicalMCC.CC" & readmission$DRG.Class == "MED" ~ "MED",
#    readmission$DRG.Complication == "SurgMCC.CC" & readmission$DRG.Class == "SURG" ~ "SURG",
#    readmission$DRG.Complication == "MedicalNoC" & readmission$DRG.Class == "MED" ~ "MED",
#    readmission$DRG.Complication == "SurgNoC" & readmission$DRG.Class == "SURG" ~ "SURG",
#    TRUE ~ "OTHER"
#  ) 


readmission %>% count(DRG.Class) %>% make_table()
DRG.Class n
MED 30414
OTHER 9345
SURG 27017
#table(readmission_temp$DRG.Class)

#relevel the new variable
readmission$DRG.Class <- fct_infreq(readmission$DRG.Class)

Task 3 - Use observations from cluster analysis to consider a new feature (9 points)

Code is provided to perform cluster analysis for from 1 to 12 clusters, construct an elbow plot and create a new variable based on a selected number of clusters. That variable will need to be retained for potentially being added tot he dataframe.

nstart.val <- 20
cluster_vars <- readmission[c('log_LOS','Age')]
for(i in 1:ncol(cluster_vars)){
  cluster_vars[,i] <- scale(cluster_vars[,i])
}
km1 <- kmeans(cluster_vars,centers=1,nstart=nstart.val)
km2 <- kmeans(cluster_vars,centers=2,nstart=nstart.val)
km3 <- kmeans(cluster_vars,centers=3,nstart=nstart.val)
km4 <- kmeans(cluster_vars,centers=4,nstart=nstart.val)
km5 <- kmeans(cluster_vars,centers=5,nstart=nstart.val)
km6 <- kmeans(cluster_vars,centers=6,nstart=nstart.val)
km7 <- kmeans(cluster_vars,centers=7,nstart=nstart.val)
km8 <- kmeans(cluster_vars,centers=8,nstart=nstart.val)
km9 <- kmeans(cluster_vars,centers=9,nstart=nstart.val)
km10 <- kmeans(cluster_vars,centers=10,nstart=nstart.val)
km11 <- kmeans(cluster_vars,centers=11,nstart=nstart.val)
km12 <- kmeans(cluster_vars,centers=12,nstart=nstart.val)

var.exp <- data.frame(k = c(1:12),
                      bss_tss = c(km1$betweenss/km1$totss,
                                  km2$betweenss/km2$totss,
                                  km3$betweenss/km3$totss,
                                  km4$betweenss/km4$totss,
                                  km5$betweenss/km5$totss,
                                  km6$betweenss/km6$totss,
                                  km7$betweenss/km7$totss,
                                  km8$betweenss/km8$totss,
                                  km9$betweenss/km9$totss,
                                  km10$betweenss/km10$totss,
                                  km11$betweenss/km11$totss,
                                  km12$betweenss/km12$totss))

ggplot(var.exp,aes(x=k,y=bss_tss))+geom_point()

LOS_Age_Clust <- as.factor(km4$cluster) #This creates a new variable based on having 8 clusters.
cluster_vars$LOS_Age_Clust <- LOS_Age_Clust
ggplot(data = cluster_vars, aes(x = Age, y = log_LOS, col = LOS_Age_Clust)) + geom_point() + theme(axis.text = element_blank(), legend.title = element_blank()) +ggtitle("Clustering with 4 groups")

readmission <- readmission %>% mutate(los_age_clust = LOS_Age_Clust)

Task 4 - Select an interaction (5 points)

The following code may help determine if interactions are present. It is best to treat ER as a factor variable for this purpose.

#Both variables are factor variables
ggplot(readmission,aes(Gender,fill=factor(Readmission.Status))) + geom_bar(position = "fill") +
   facet_wrap(~Race,ncol=2,scales="free")+scale_y_continuous()

#Look at the change in the percentage of readmitted patients
readmission %>% 
  dplyr::group_by(Race, Gender) %>% 
  dplyr::summarise(percent_readmission = mean(Readmission.Status)) %>% 
  tidyr::spread(Gender, percent_readmission) %>% 
  mutate_if(is.numeric, ~round(.x,2)) %>% 
  make_table()
## `summarise()` has grouped output by 'Race'. You can override using the
## `.groups` argument.
## `mutate_if()` ignored the following grouping variables:
Race F M
White 0.12 0.13
Black 0.13 0.14
Others 0.11 0.12
Hispanic 0.15 0.12
#One factor variable and one continuous numeric variable
ggplot(readmission,aes(x=factor(Readmission.Status),y=log_riskscore)) + geom_boxplot() +facet_wrap(~factor(ER))

Task 6 - Decide on the factor variable from Task 3 (5 points)

No code is provided.

model <- glm(Readmission.Status ~ . + Gender*Race - log_LOS - Age, data=train, family = binomial(link="logit"))

summary(model)
## 
## Call:
## glm(formula = Readmission.Status ~ . + Gender * Race - log_LOS - 
##     Age, family = binomial(link = "logit"), data = train)
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -3.159845   0.042191 -74.894  < 2e-16 ***
## GenderM               0.008457   0.031214   0.271  0.78646    
## RaceBlack             0.000908   0.060084   0.015  0.98794    
## RaceOthers           -0.096542   0.111857  -0.863  0.38809    
## RaceHispanic          0.083876   0.132874   0.631  0.52788    
## ER                    0.004442   0.017267   0.257  0.79701    
## DRG.ClassSURG         0.016665   0.030672   0.543  0.58690    
## DRG.ClassOTHER        0.119978   0.042068   2.852  0.00434 ** 
## log_riskscore         1.297614   0.023429  55.385  < 2e-16 ***
## los_age_clust2        0.078253   0.038017   2.058  0.03955 *  
## los_age_clust3        0.147635   0.036731   4.019 5.83e-05 ***
## los_age_clust4        0.197225   0.049776   3.962 7.42e-05 ***
## GenderM:RaceBlack     0.110899   0.090263   1.229  0.21921    
## GenderM:RaceOthers    0.152623   0.158979   0.960  0.33704    
## GenderM:RaceHispanic  0.067235   0.205509   0.327  0.74354    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 37785  on 50081  degrees of freedom
## Residual deviance: 33838  on 50067  degrees of freedom
## AIC: 33868
## 
## Number of Fisher Scoring iterations: 5
preds <- predict(model, newdat=test,type="response")

roc_model <- roc(test$Readmission.Status,preds)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
#confusionMatrix(factor(1*(preds>.8)),factor(test$Readmission.Status))
#plot(roc_model)
auc(roc_model)
## Area under the curve: 0.7447

Task 7 - Select features (15 points)

No code is provided.

library(caret)

factor_names <- c("DRG.Class","Race") #insert the column names of the variables to be binarized
factor_vars <- readmission[,factor_names]

binarizer <- caret::dummyVars(paste("~", paste(factor_names, collapse = "+")) , data = factor_vars, fullRank = TRUE)
binarized_vars <- data.frame(predict(binarizer, newdata = factor_vars))
head(binarized_vars)
##   DRG.Class.SURG DRG.Class.OTHER Race.Black Race.Others Race.Hispanic
## 1              0               1          0           0             0
## 2              1               0          0           0             0
## 3              1               0          0           0             0
## 4              0               0          0           0             0
## 5              0               0          0           0             0
## 6              1               0          0           0             0
readmission_binarized <- readmission %>% 
  select(-DRG.Class, -Race) %>% 
  mutate(DRG.Class.MED = binarized_vars$DRG.Class.MED,
         DRG.Class.OTHER = binarized_vars$DRG.Class.OTHER,
         DRG.Class.SURG = binarized_vars$DRG.Class.SURG,
         Race.Black = binarized_vars$Race.Black,
         Race.Hispanic = binarized_vars$Race.Hispanic,
         Race.Others = binarized_vars$Race.White)

train_bin <- readmission_binarized[partition,]
test_bin <- readmission_binarized[-partition,]
glimpse(train_bin)
## Rows: 50,082
## Columns: 11
## $ Readmission.Status <dbl> 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, …
## $ Gender             <fct> M, M, M, M, F, F, F, F, M, M, M, M, M, M, M, M, M, …
## $ ER                 <dbl> 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, …
## $ Age                <dbl> 73, 73, 73, 73, 71, 71, 71, 71, 67, 67, 67, 67, 67,…
## $ log_LOS            <dbl> 3.3672958, 1.6094379, 2.5649494, 2.3025851, 2.30258…
## $ log_riskscore      <dbl> 2.510168, 2.510168, 2.510168, 2.510168, 2.431154, 2…
## $ los_age_clust      <fct> 3, 2, 3, 3, 3, 3, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, …
## $ DRG.Class.OTHER    <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, …
## $ DRG.Class.SURG     <dbl> 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, …
## $ Race.Black         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Race.Hispanic      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
model <- glm(Readmission.Status ~ . - los_age_clust, data=train_bin, family = binomial(link="logit"))

summary(model)
## 
## Call:
## glm(formula = Readmission.Status ~ . - los_age_clust, family = binomial(link = "logit"), 
##     data = train_bin)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -2.745388   0.092682 -29.621  < 2e-16 ***
## GenderM          0.022245   0.028545   0.779  0.43579    
## ER               0.004367   0.017270   0.253  0.80036    
## Age             -0.005712   0.001061  -5.385 7.26e-08 ***
## log_LOS          0.052813   0.020469   2.580  0.00988 ** 
## log_riskscore    1.300556   0.023604  55.100  < 2e-16 ***
## DRG.Class.OTHER  0.117939   0.042134   2.799  0.00512 ** 
## DRG.Class.SURG   0.016673   0.030673   0.544  0.58675    
## Race.Black       0.035221   0.045198   0.779  0.43582    
## Race.Hispanic    0.102097   0.101446   1.006  0.31422    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 37785  on 50081  degrees of freedom
## Residual deviance: 33828  on 50072  degrees of freedom
## AIC: 33848
## 
## Number of Fisher Scoring iterations: 5
MASS::stepAIC(model)
## Start:  AIC=33847.6
## Readmission.Status ~ (Gender + ER + Age + log_LOS + log_riskscore + 
##     los_age_clust + DRG.Class.OTHER + DRG.Class.SURG + Race.Black + 
##     Race.Hispanic) - los_age_clust
## 
##                   Df Deviance   AIC
## - ER               1    33828 33846
## - DRG.Class.SURG   1    33828 33846
## - Race.Black       1    33828 33846
## - Gender           1    33828 33846
## - Race.Hispanic    1    33829 33847
## <none>                  33828 33848
## - log_LOS          1    33834 33852
## - DRG.Class.OTHER  1    33835 33853
## - Age              1    33856 33874
## - log_riskscore    1    37459 37477
## 
## Step:  AIC=33845.66
## Readmission.Status ~ Gender + Age + log_LOS + log_riskscore + 
##     DRG.Class.OTHER + DRG.Class.SURG + Race.Black + Race.Hispanic
## 
##                   Df Deviance   AIC
## - DRG.Class.SURG   1    33828 33844
## - Gender           1    33828 33844
## - Race.Black       1    33828 33844
## - Race.Hispanic    1    33829 33845
## <none>                  33828 33846
## - log_LOS          1    33834 33850
## - DRG.Class.OTHER  1    33835 33851
## - Age              1    33856 33872
## - log_riskscore    1    37459 37475
## 
## Step:  AIC=33843.96
## Readmission.Status ~ Gender + Age + log_LOS + log_riskscore + 
##     DRG.Class.OTHER + Race.Black + Race.Hispanic
## 
##                   Df Deviance   AIC
## - Gender           1    33829 33843
## - Race.Black       1    33829 33843
## - Race.Hispanic    1    33829 33843
## <none>                  33828 33844
## - log_LOS          1    33835 33849
## - DRG.Class.OTHER  1    33836 33850
## - Age              1    33857 33871
## - log_riskscore    1    37459 37473
## 
## Step:  AIC=33842.55
## Readmission.Status ~ Age + log_LOS + log_riskscore + DRG.Class.OTHER + 
##     Race.Black + Race.Hispanic
## 
##                   Df Deviance   AIC
## - Race.Black       1    33829 33841
## - Race.Hispanic    1    33830 33842
## <none>                  33829 33843
## - log_LOS          1    33835 33847
## - DRG.Class.OTHER  1    33836 33848
## - Age              1    33858 33870
## - log_riskscore    1    37462 37474
## 
## Step:  AIC=33841.13
## Readmission.Status ~ Age + log_LOS + log_riskscore + DRG.Class.OTHER + 
##     Race.Hispanic
## 
##                   Df Deviance   AIC
## - Race.Hispanic    1    33830 33840
## <none>                  33829 33841
## - log_LOS          1    33836 33846
## - DRG.Class.OTHER  1    33837 33847
## - Age              1    33861 33871
## - log_riskscore    1    37466 37476
## 
## Step:  AIC=33840.02
## Readmission.Status ~ Age + log_LOS + log_riskscore + DRG.Class.OTHER
## 
##                   Df Deviance   AIC
## <none>                  33830 33840
## - log_LOS          1    33837 33845
## - DRG.Class.OTHER  1    33838 33846
## - Age              1    33862 33870
## - log_riskscore    1    37467 37475
## 
## Call:  glm(formula = Readmission.Status ~ Age + log_LOS + log_riskscore + 
##     DRG.Class.OTHER, family = binomial(link = "logit"), data = train_bin)
## 
## Coefficients:
##     (Intercept)              Age          log_LOS    log_riskscore  
##       -2.703682        -0.005935         0.052674         1.301097  
## DRG.Class.OTHER  
##        0.110212  
## 
## Degrees of Freedom: 50081 Total (i.e. Null);  50077 Residual
## Null Deviance:       37780 
## Residual Deviance: 33830     AIC: 33840
preds <- predict(model, newdat=test_bin,type="response")

roc_model <- roc(test$Readmission.Status,preds)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
#confusionMatrix(factor(1*(preds>.8)),factor(test$Readmission.Status))

plot(roc_model)

auc(roc_model)
## Area under the curve: 0.7451
model <- glm(Readmission.Status ~ Age + log_LOS + log_riskscore + DRG.Class.OTHER + 
    Race.Black + Race.Hispanic,
    data=train_bin, family = binomial(link="logit"))

summary(model)
## 
## Call:
## glm(formula = Readmission.Status ~ Age + log_LOS + log_riskscore + 
##     DRG.Class.OTHER + Race.Black + Race.Hispanic, family = binomial(link = "logit"), 
##     data = train_bin)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -2.720301   0.088953 -30.581  < 2e-16 ***
## Age             -0.005783   0.001056  -5.474 4.39e-08 ***
## log_LOS          0.052715   0.020466   2.576   0.0100 *  
## log_riskscore    1.300703   0.023601  55.113  < 2e-16 ***
## DRG.Class.OTHER  0.110062   0.039645   2.776   0.0055 ** 
## Race.Black       0.034392   0.045184   0.761   0.4466    
## Race.Hispanic    0.100989   0.101443   0.996   0.3195    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 37785  on 50081  degrees of freedom
## Residual deviance: 33829  on 50075  degrees of freedom
## AIC: 33843
## 
## Number of Fisher Scoring iterations: 5
preds <- predict(model, newdat=test_bin,type="response")

roc_model <- roc(test$Readmission.Status,preds)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_model)

auc(roc_model)
## Area under the curve: 0.7453

Task 8 - Interpret the model (6 points)

No code is provided.

model <- glm(Readmission.Status ~ Age + log_LOS + log_riskscore + DRG.Class.OTHER + 
    Race.Black + Race.Hispanic, 
    data = readmission_binarized, 
    family = binomial(link="probit"))

summary(model)
## 
## Call:
## glm(formula = Readmission.Status ~ Age + log_LOS + log_riskscore + 
##     DRG.Class.OTHER + Race.Black + Race.Hispanic, family = binomial(link = "probit"), 
##     data = readmission_binarized)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -1.511383   0.041502 -36.417  < 2e-16 ***
## Age             -0.003620   0.000497  -7.284 3.25e-13 ***
## log_LOS          0.033235   0.009729   3.416 0.000635 ***
## log_riskscore    0.700327   0.010727  65.289  < 2e-16 ***
## DRG.Class.OTHER  0.059406   0.018642   3.187 0.001439 ** 
## Race.Black       0.018695   0.021326   0.877 0.380676    
## Race.Hispanic    0.048057   0.047324   1.015 0.309880    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 50559  on 66775  degrees of freedom
## Residual deviance: 45116  on 66769  degrees of freedom
## AIC: 45130
## 
## Number of Fisher Scoring iterations: 5
readmission_binarized %>% summary()
##  Readmission.Status Gender          ER              Age            log_LOS     
##  Min.   :0.0000     F:38005   Min.   :0.0000   Min.   : 24.00   Min.   :0.000  
##  1st Qu.:0.0000     M:28771   1st Qu.:0.0000   1st Qu.: 67.00   1st Qu.:1.099  
##  Median :0.0000               Median :0.0000   Median : 75.00   Median :1.609  
##  Mean   :0.1259               Mean   :0.5083   Mean   : 73.64   Mean   :1.653  
##  3rd Qu.:0.0000               3rd Qu.:1.0000   3rd Qu.: 83.00   3rd Qu.:2.079  
##  Max.   :1.0000               Max.   :9.0000   Max.   :101.00   Max.   :3.584  
##  log_riskscore     los_age_clust DRG.Class.OTHER  DRG.Class.SURG  
##  Min.   :-2.5383   1:19188       Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 0.1017   2:22226       1st Qu.:0.0000   1st Qu.:0.0000  
##  Median : 0.6238   3:17466       Median :0.0000   Median :0.0000  
##  Mean   : 0.6000   4: 7896       Mean   :0.1399   Mean   :0.4046  
##  3rd Qu.: 1.1547                 3rd Qu.:0.0000   3rd Qu.:1.0000  
##  Max.   : 2.5102                 Max.   :1.0000   Max.   :1.0000  
##    Race.Black     Race.Hispanic    
##  Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :0.0000   Median :0.00000  
##  Mean   :0.1063   Mean   :0.01926  
##  3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :1.00000
example1 <- tibble(
  Gender  = "F",
  ER = 0,
  Age = 74,
  log_LOS = 1.65,
  log_riskscore = 0.6,
  los_age_cluster = 2,
  DRG.Class.OTHER = 0,
  DRG.Class.SURG = 1,
  Race.Black = 0,
  Race.Hispanic = 0
)

example2 <- example1 %>% mutate(log_riskscore = 2)
example3 <- example1 %>% mutate(log_LOS = 2)
example4 <- example1 %>% mutate(Age = 80)
example5 <- example1 %>% mutate(Gender = "M")
example6 <- example1 %>% mutate(Race.Black  = 1)
example7 <- example1 %>% mutate(Race.Hispanic  = 1)

preds1 <- predict(model, newdat=example1,type="response")
preds2 <- predict(model, newdat=example2,type="response")
preds3 <- predict(model, newdat=example3,type="response")
preds4 <- predict(model, newdat=example4,type="response")
preds5 <- predict(model, newdat=example5,type="response")
preds6 <- predict(model, newdat=example6,type="response")
preds7 <- predict(model, newdat=example7,type="response")

example1 %>% 
  rbind(example2) %>% 
  rbind(example3) %>% 
  rbind(example4) %>% 
  rbind(example5) %>% 
  rbind(example6) %>% 
  rbind(example7) %>% 
  mutate(y = c(preds1, preds2, preds3, preds4,preds5, preds6, preds7)) %>% 
  mutate(patientid = row_number(), y = round(y,3)) %>% 
  select(patientid, y)
## # A tibble: 7 × 2
##   patientid     y
##       <int> <dbl>
## 1         1 0.096
## 2         2 0.373
## 3         3 0.098
## 4         4 0.092
## 5         5 0.096
## 6         6 0.099
## 7         7 0.105

Task 9 - Set the cutoff (9 points)

The following code calculates the cost using a cutoff of 0.075. It assumes the final model constructed on the full dataset is called glm_full and the final dataset is readmit.

readmission$Readmission.Status %>% sum()*25
## [1] 210225
pred_full <- predict(model,type="response")

cutoff <- 0.1

pred_readmit <- 1*(pred_full > cutoff)

no_intervention_cost <- 25*sum(readmission$Readmission.Status == 1)
full_intervention_cost <- 2*nrow(readmission)
no_intervention_cost
## [1] 210225
full_intervention_cost
## [1] 133552
library(e1071)
cm <- confusionMatrix(factor(1*(pred_full>cutoff)),factor(readmission$Readmission.Status))$table
cm
##           Reference
## Prediction     0     1
##          0 31709  1648
##          1 26658  6761
modified_cost <- cm[2,1]*2+cm[2,2]*2+cm[1,2]*25
modified_cost
## [1] 108038
cutoff_values <- c(1, 0.4, 0.3, 0.2, 0.1, 0.09, 0.08, 0.075, 0.07, 0.05, 0)

get_modified_cost <- function(input_cutoff){
  pred_readmit <- 1*(pred_full > input_cutoff)
  cm <- confusionMatrix(factor(pred_readmit),factor(readmission$Readmission.Status))
  modified_cost <- cm$table[2,1]*2+cm$table[2,2]*2+cm$table[1,2]*25
  modified_cost
}
get_modified_cost(0.2)
## [1] 141967
tibble(cutoff = cutoff_values,
       cost = sapply(cutoff_values, get_modified_cost)) 
## Warning in confusionMatrix.default(factor(pred_readmit),
## factor(readmission$Readmission.Status)): Levels are not in the same order for
## reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(factor(pred_readmit),
## factor(readmission$Readmission.Status)): Levels are not in the same order for
## reference and data. Refactoring data to match.
## # A tibble: 11 × 2
##    cutoff   cost
##     <dbl>  <dbl>
##  1  1     210225
##  2  0.4   203799
##  3  0.3   180869
##  4  0.2   141967
##  5  0.1   108038
##  6  0.09  106426
##  7  0.08  105952
##  8  0.075 106227
##  9  0.07  106855
## 10  0.05  110386
## 11  0     133552
cutoff <- 0.08
pred_readmit <- 1*(pred_full > cutoff)
cm <- confusionMatrix(factor(pred_readmit),factor(readmission$Readmission.Status))
cm$table
##           Reference
## Prediction     0     1
##          0 26381  1094
##          1 31986  7315

Task 10 – Consider alternative models and model construction techniques (12 points)

Task 11 – Executive summary (20 points)