Hospital Readmissions

This chunk performs binarization. Note that it is set to fullRank = FALSE. This creates binarized variables for each factor level. If set to TRUE it will leave one out. Note the new variables are placed in a new dataframe. It can attached to an existing dataframe via old.df <- cbind(old.df, binarized_vars)

factor_names <- c("ER","Age") #insert the column names of the variables to be binarized
factor_vars <- readmission[,factor_names]
for (var in factor_names) {
  factor_vars[, var] <- as.character(factor_vars[, var])
}

binarizer <- caret::dummyVars(paste("~", paste(factor_names, collapse = "+")) , data = factor_vars, fullRank = TRUE)
binarized_vars <- data.frame(predict(binarizer, newdata = factor_vars))
head(binarized_vars)

This chunk creates training and testing sets.

#Create train and test sets
set.seed(4321)
partition <- createDataPartition(readmission[,1], list = FALSE, p = .75) #The partition will stratify using variable 1 from the dataframe
train <- readmission[partition, ]
test <- readmission[-partition, ]

print("TRAIN")
mean(train$Readmission.Status)

print("TEST")
mean(test$Readmission.Status)

The following chunk provides code that can be used to combine factor levels. It also relevels in case the new level has the highest frequency.

#USED fct-infreq method instead

#This example combines levels other than White of Race into a new level called NonWhite.
#Execute the function levels(readmission$Race) to identify the levels. Be sure the variable is a factor variable before doing this. This code assumes the variable has previously been releveled so that "White" is the first level.

#readmission2<-readmission #The results are in a new data frame called readmission2. This is done so that the results can be checked without losing the original data frame. When done, consider executing readmission <- readmission2

#Warning: If you run this code and import the plyr library, it will cause errors later on.
#To fix these errors, there are two options
#1. Restart R and RStudio and load tidyverse AFTER plyr
#2. Tell R to use the dplyr library for functions that share the same name in both libraries: dplyr::summarise, dplyr::group_by, etc.
#library(plyr)
#var <- "Race"
#var.levels <- levels(readmission2[,var])
#readmission2[,var] <- mapvalues(readmission2[,var],var.levels,c("White","NonWhite","NonWhite","NonWhite"))
#Relevel
#table <- as.data.frame(table(readmission2[,var]))
#  max <- which.max(table[,2])
#  level.name <- as.character(table[max,1])
#  readmission2[,var] <- relevel(readmission2[,var], ref = level.name)

#table(readmission2[,var])

This chunk reads in the data, relevels factors, and prints a summary.

#Set factor levels to those with the most observations
readmission <- readmission %>% 
  mutate_if(is.character, fct_infreq)

summary(readmission)

##  Readmission.Status Gender          Race             ER           DRG.Class    
##  Min.   :0.0000     F:38011   White   :56124   Min.   :0.0000   MED    :35771  
##  1st Qu.:0.0000     M:28771   Black   : 7099   1st Qu.:0.0000   SURG   :30447  
##  Median :0.0000               Others  : 2273   Median :0.0000   UNGROUP:  564  
##  Mean   :0.1259               Hispanic: 1286   Mean   :0.5083                  
##  3rd Qu.:0.0000                                3rd Qu.:1.0000                  
##  Max.   :1.0000                                Max.   :9.0000                  
##       LOS              Age         HCC.Riskscore         DRG.Complication
##  Min.   : 1.000   Min.   : 24.00   Min.   : 0.079   MedicalMCC.CC:18110  
##  1st Qu.: 3.000   1st Qu.: 67.00   1st Qu.: 1.107   SurgMCC.CC   :15468  
##  Median : 5.000   Median : 75.00   Median : 1.865   MedicalNoC   :12310  
##  Mean   : 6.693   Mean   : 73.64   Mean   : 2.345   SurgNoC      :11549  
##  3rd Qu.: 8.000   3rd Qu.: 83.00   3rd Qu.: 3.173   Other        : 9345  
##  Max.   :36.000   Max.   :101.00   Max.   :12.307

glimpse(readmission)

## Rows: 66,782
## Columns: 9
## $ Readmission.Status <dbl> 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, …
## $ Gender             <fct> M, M, M, M, M, M, F, F, F, F, F, M, M, M, M, M, M, …
## $ Race               <fct> White, White, White, White, White, White, White, Wh…
## $ ER                 <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, …
## $ DRG.Class          <fct> MED, SURG, SURG, MED, MED, SURG, MED, MED, MED, SUR…
## $ LOS                <dbl> 29, 5, 9, 5, 13, 10, 4, 10, 14, 4, 3, 9, 8, 22, 10,…
## $ Age                <dbl> 73, 73, 73, 73, 73, 73, 71, 71, 71, 71, 71, 67, 67,…
## $ HCC.Riskscore      <dbl> 12.307, 12.307, 12.307, 12.307, 12.307, 12.307, 11.…
## $ DRG.Complication   <fct> Other, SurgNoC, SurgNoC, MedicalNoC, MedicalNoC, Su…

Task 1 - Perform univariate exploration of the four non-factor variables (6 points)

Code is provided to create a histogram for one of the variables.

library(ggplot2)
readmission %>% ggplot(aes(ER)) + geom_histogram(bins = 9) #Set the number of bins equal to 9 because there are 9 unique values

readmission %>% ggplot(aes(LOS)) + geom_histogram(bins = 36) #Set the number of bins equal to 36 because there are 36 unique values

readmission %>% ggplot(aes(log(LOS))) + geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

readmission %>% ggplot(aes(Age)) + geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

readmission %>% ggplot(aes(HCC.Riskscore)) + geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

readmission <- readmission %>% mutate(log_LOS = log(LOS),
                                      log_riskscore = log(HCC.Riskscore)) %>% 
  select(-LOS, -HCC.Riskscore)

Task 2 - Examine relationships between DRG.Class and DRG.Complication (5 points)

Code is provided to create a tabular view of the two variables.

readmission %>% 
  count(DRG.Class, DRG.Complication) %>% 
  spread(DRG.Class,n) %>% 
  make_table()


DRG.Complication	MED	SURG	UNGROUP
MedicalMCC.CC	18104	6	NA
SurgMCC.CC	NA	15468	NA
MedicalNoC	12310	NA	NA
SurgNoC	NA	11549	NA
Other	5357	3424	564

#Using base R
table(readmission$DRG.Class,readmission$DRG.Complication)

##          
##           MedicalMCC.CC SurgMCC.CC MedicalNoC SurgNoC Other
##   MED             18104          0      12310       0  5357
##   SURG                6      15468          0   11549  3424
##   UNGROUP             0          0          0       0   564

#Using base R
#rows_to_drop <- (readmission$DRG.Complication  == "MedicalMCC.CC") & (readmission$DRG.Class == "SURG")
#readmission_temp <- readmission[!rows_to_drop,]

#removes the 6 patients 
readmission <- readmission %>% 
  filter(!(DRG.Complication == "MedicalMCC.CC" & DRG.Class == "SURG"))

readmission <- readmission %>% 
  mutate(DRG.Class = case_when(
    DRG.Complication == "MedicalMCC.CC" & DRG.Class == "MED" ~ "MED",
    DRG.Complication == "SurgMCC.CC" & DRG.Class == "SURG" ~ "SURG",
    DRG.Complication == "MedicalNoC" & DRG.Class == "MED" ~ "MED",
    DRG.Complication == "SurgNoC" & DRG.Class == "SURG" ~ "SURG",
    TRUE ~ "OTHER"
  )) 

#Using Base R,
#readmission_temp$DRG.Class <- case_when(
#    readmission$DRG.Complication == "MedicalMCC.CC" & readmission$DRG.Class == "MED" ~ "MED",
#    readmission$DRG.Complication == "SurgMCC.CC" & readmission$DRG.Class == "SURG" ~ "SURG",
#    readmission$DRG.Complication == "MedicalNoC" & readmission$DRG.Class == "MED" ~ "MED",
#    readmission$DRG.Complication == "SurgNoC" & readmission$DRG.Class == "SURG" ~ "SURG",
#    TRUE ~ "OTHER"
#  ) 


readmission %>% count(DRG.Class) %>% make_table()


DRG.Class	n
MED	30414
OTHER	9345
SURG	27017

#table(readmission_temp$DRG.Class)

#relevel the new variable
readmission$DRG.Class <- fct_infreq(readmission$DRG.Class)

Task 3 - Use observations from cluster analysis to consider a new feature (9 points)

Code is provided to perform cluster analysis for from 1 to 12 clusters, construct an elbow plot and create a new variable based on a selected number of clusters. That variable will need to be retained for potentially being added tot he dataframe.

nstart.val <- 20
cluster_vars <- readmission[c('log_LOS','Age')]
for(i in 1:ncol(cluster_vars)){
  cluster_vars[,i] <- scale(cluster_vars[,i])
}
km1 <- kmeans(cluster_vars,centers=1,nstart=nstart.val)
km2 <- kmeans(cluster_vars,centers=2,nstart=nstart.val)
km3 <- kmeans(cluster_vars,centers=3,nstart=nstart.val)
km4 <- kmeans(cluster_vars,centers=4,nstart=nstart.val)
km5 <- kmeans(cluster_vars,centers=5,nstart=nstart.val)
km6 <- kmeans(cluster_vars,centers=6,nstart=nstart.val)
km7 <- kmeans(cluster_vars,centers=7,nstart=nstart.val)
km8 <- kmeans(cluster_vars,centers=8,nstart=nstart.val)
km9 <- kmeans(cluster_vars,centers=9,nstart=nstart.val)
km10 <- kmeans(cluster_vars,centers=10,nstart=nstart.val)
km11 <- kmeans(cluster_vars,centers=11,nstart=nstart.val)
km12 <- kmeans(cluster_vars,centers=12,nstart=nstart.val)

var.exp <- data.frame(k = c(1:12),
                      bss_tss = c(km1$betweenss/km1$totss,
                                  km2$betweenss/km2$totss,
                                  km3$betweenss/km3$totss,
                                  km4$betweenss/km4$totss,
                                  km5$betweenss/km5$totss,
                                  km6$betweenss/km6$totss,
                                  km7$betweenss/km7$totss,
                                  km8$betweenss/km8$totss,
                                  km9$betweenss/km9$totss,
                                  km10$betweenss/km10$totss,
                                  km11$betweenss/km11$totss,
                                  km12$betweenss/km12$totss))

ggplot(var.exp,aes(x=k,y=bss_tss))+geom_point()

LOS_Age_Clust <- as.factor(km4$cluster) #This creates a new variable based on having 8 clusters.
cluster_vars$LOS_Age_Clust <- LOS_Age_Clust
ggplot(data = cluster_vars, aes(x = Age, y = log_LOS, col = LOS_Age_Clust)) + geom_point() + theme(axis.text = element_blank(), legend.title = element_blank()) +ggtitle("Clustering with 4 groups")

readmission <- readmission %>% mutate(los_age_clust = LOS_Age_Clust)

Task 4 - Select an interaction (5 points)

The following code may help determine if interactions are present. It is best to treat ER as a factor variable for this purpose.

#Both variables are factor variables
ggplot(readmission,aes(Gender,fill=factor(Readmission.Status))) + geom_bar(position = "fill") +
   facet_wrap(~Race,ncol=2,scales="free")+scale_y_continuous()

#Look at the change in the percentage of readmitted patients
readmission %>% 
  dplyr::group_by(Race, Gender) %>% 
  dplyr::summarise(percent_readmission = mean(Readmission.Status)) %>% 
  tidyr::spread(Gender, percent_readmission) %>% 
  mutate_if(is.numeric, ~round(.x,2)) %>% 
  make_table()

## `summarise()` has grouped output by 'Race'. You can override using the
## `.groups` argument.
## `mutate_if()` ignored the following grouping variables:


Race	F	M
White	0.12	0.13
Black	0.13	0.14
Others	0.11	0.12
Hispanic	0.15	0.12

#One factor variable and one continuous numeric variable
ggplot(readmission,aes(x=factor(Readmission.Status),y=log_riskscore)) + geom_boxplot() +facet_wrap(~factor(ER))

Task 5 - Select a link function (8 points)

The following code runs a GLM using the logit link and all available variables. It assumes that train and test sets have been constructed. Adding an interaction of Gender and Race is included in the code. That is for illustration purposes. The code also produces an ROC curve, a confusion matrix, and calculates AUC.

readmission <- readmission %>% select(-DRG.Complication)

#In base R,
#readmission$DRG.Complication <- NULL
#readmission$DRG.Class <- NULL


#Create train and test sets
library(caret)
set.seed(4321)
partition <- createDataPartition(readmission$Readmission.Status, list = FALSE, p = .75) #The partition will stratify using variable 1 from the dataframe
train <- readmission[partition, ]
test <- readmission[-partition, ]

print("TRAIN")

## [1] "TRAIN"

mean(train$Readmission.Status)

## [1] 0.1252346

print("TEST")

## [1] "TEST"

mean(test$Readmission.Status)

## [1] 0.1280101

library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

model <- glm(Readmission.Status ~ . + Gender*Race - los_age_clust, data=train, family = binomial(link="logit"))

summary(model)

## 
## Call:
## glm(formula = Readmission.Status ~ . + Gender * Race - los_age_clust, 
##     family = binomial(link = "logit"), data = train)
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -2.735001   0.093088 -29.381  < 2e-16 ***
## GenderM               0.003635   0.031228   0.116  0.90732    
## RaceBlack            -0.013293   0.060167  -0.221  0.82514    
## RaceOthers           -0.107187   0.111979  -0.957  0.33846    
## RaceHispanic          0.074722   0.132926   0.562  0.57403    
## ER                    0.004250   0.017273   0.246  0.80565    
## DRG.ClassSURG         0.017028   0.030677   0.555  0.57886    
## DRG.ClassOTHER        0.117817   0.042141   2.796  0.00518 ** 
## Age                  -0.005727   0.001062  -5.391    7e-08 ***
## log_LOS               0.052907   0.020472   2.584  0.00976 ** 
## log_riskscore         1.300394   0.023607  55.085  < 2e-16 ***
## GenderM:RaceBlack     0.109082   0.090315   1.208  0.22713    
## GenderM:RaceOthers    0.158532   0.159044   0.997  0.31887    
## GenderM:RaceHispanic  0.062118   0.205592   0.302  0.76254    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 37785  on 50081  degrees of freedom
## Residual deviance: 33825  on 50068  degrees of freedom
## AIC: 33853
## 
## Number of Fisher Scoring iterations: 5

preds <- predict(model, newdat=test,type="response")

roc_model <- roc(test$Readmission.Status,preds)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

#confusionMatrix(factor(1*(preds>.8)),factor(test$Readmission.Status))
#plot(roc_model)
auc(roc_model)

## Area under the curve: 0.7451

model <- glm(Readmission.Status ~ . + Gender*Race - los_age_clust, data=train, family = binomial(link="probit"))

#summary(model)

preds <- predict(model, newdat=test,type="response")

roc_model <- roc(test$Readmission.Status,preds)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

#confusionMatrix(factor(1*(preds>.8)),factor(test$Readmission.Status))
#plot(roc_model)
auc(roc_model)

## Area under the curve: 0.745

model <- glm(Readmission.Status ~ . + Gender*Race  - los_age_clust, data=train, family = binomial(link="cauchit"))

#summary(model)

preds <- predict(model, newdat=test,type="response")

roc_model <- roc(test$Readmission.Status,preds)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

#confusionMatrix(factor(1*(preds>.8)),factor(test$Readmission.Status))
#plot(roc_model)
auc(roc_model)

## Area under the curve: 0.7449

model <- glm(Readmission.Status ~ . + Gender*Race - los_age_clust, data=train, family = binomial(link="cloglog"))

#summary(model)

preds <- predict(model, newdat=test,type="response")

roc_model <- roc(test$Readmission.Status,preds)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

#confusionMatrix(factor(1*(preds>.8)),factor(test$Readmission.Status))
#plot(roc_model)
auc(roc_model)

## Area under the curve: 0.745

Task 6 - Decide on the factor variable from Task 3 (5 points)

No code is provided.

model <- glm(Readmission.Status ~ . + Gender*Race - log_LOS - Age, data=train, family = binomial(link="logit"))

summary(model)

## 
## Call:
## glm(formula = Readmission.Status ~ . + Gender * Race - log_LOS - 
##     Age, family = binomial(link = "logit"), data = train)
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -3.159845   0.042191 -74.894  < 2e-16 ***
## GenderM               0.008457   0.031214   0.271  0.78646    
## RaceBlack             0.000908   0.060084   0.015  0.98794    
## RaceOthers           -0.096542   0.111857  -0.863  0.38809    
## RaceHispanic          0.083876   0.132874   0.631  0.52788    
## ER                    0.004442   0.017267   0.257  0.79701    
## DRG.ClassSURG         0.016665   0.030672   0.543  0.58690    
## DRG.ClassOTHER        0.119978   0.042068   2.852  0.00434 ** 
## log_riskscore         1.297614   0.023429  55.385  < 2e-16 ***
## los_age_clust2        0.078253   0.038017   2.058  0.03955 *  
## los_age_clust3        0.147635   0.036731   4.019 5.83e-05 ***
## los_age_clust4        0.197225   0.049776   3.962 7.42e-05 ***
## GenderM:RaceBlack     0.110899   0.090263   1.229  0.21921    
## GenderM:RaceOthers    0.152623   0.158979   0.960  0.33704    
## GenderM:RaceHispanic  0.067235   0.205509   0.327  0.74354    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 37785  on 50081  degrees of freedom
## Residual deviance: 33838  on 50067  degrees of freedom
## AIC: 33868
## 
## Number of Fisher Scoring iterations: 5

preds <- predict(model, newdat=test,type="response")

roc_model <- roc(test$Readmission.Status,preds)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

#confusionMatrix(factor(1*(preds>.8)),factor(test$Readmission.Status))
#plot(roc_model)
auc(roc_model)

## Area under the curve: 0.7447

Task 7 - Select features (15 points)

No code is provided.

library(caret)

factor_names <- c("DRG.Class","Race") #insert the column names of the variables to be binarized
factor_vars <- readmission[,factor_names]

binarizer <- caret::dummyVars(paste("~", paste(factor_names, collapse = "+")) , data = factor_vars, fullRank = TRUE)
binarized_vars <- data.frame(predict(binarizer, newdata = factor_vars))
head(binarized_vars)

##   DRG.Class.SURG DRG.Class.OTHER Race.Black Race.Others Race.Hispanic
## 1              0               1          0           0             0
## 2              1               0          0           0             0
## 3              1               0          0           0             0
## 4              0               0          0           0             0
## 5              0               0          0           0             0
## 6              1               0          0           0             0

readmission_binarized <- readmission %>% 
  select(-DRG.Class, -Race) %>% 
  mutate(DRG.Class.MED = binarized_vars$DRG.Class.MED,
         DRG.Class.OTHER = binarized_vars$DRG.Class.OTHER,
         DRG.Class.SURG = binarized_vars$DRG.Class.SURG,
         Race.Black = binarized_vars$Race.Black,
         Race.Hispanic = binarized_vars$Race.Hispanic,
         Race.Others = binarized_vars$Race.White)

train_bin <- readmission_binarized[partition,]
test_bin <- readmission_binarized[-partition,]
glimpse(train_bin)

## Rows: 50,082
## Columns: 11
## $ Readmission.Status <dbl> 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, …
## $ Gender             <fct> M, M, M, M, F, F, F, F, M, M, M, M, M, M, M, M, M, …
## $ ER                 <dbl> 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, …
## $ Age                <dbl> 73, 73, 73, 73, 71, 71, 71, 71, 67, 67, 67, 67, 67,…
## $ log_LOS            <dbl> 3.3672958, 1.6094379, 2.5649494, 2.3025851, 2.30258…
## $ log_riskscore      <dbl> 2.510168, 2.510168, 2.510168, 2.510168, 2.431154, 2…
## $ los_age_clust      <fct> 3, 2, 3, 3, 3, 3, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, …
## $ DRG.Class.OTHER    <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, …
## $ DRG.Class.SURG     <dbl> 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, …
## $ Race.Black         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Race.Hispanic      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …

model <- glm(Readmission.Status ~ . - los_age_clust, data=train_bin, family = binomial(link="logit"))

summary(model)

## 
## Call:
## glm(formula = Readmission.Status ~ . - los_age_clust, family = binomial(link = "logit"), 
##     data = train_bin)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -2.745388   0.092682 -29.621  < 2e-16 ***
## GenderM          0.022245   0.028545   0.779  0.43579    
## ER               0.004367   0.017270   0.253  0.80036    
## Age             -0.005712   0.001061  -5.385 7.26e-08 ***
## log_LOS          0.052813   0.020469   2.580  0.00988 ** 
## log_riskscore    1.300556   0.023604  55.100  < 2e-16 ***
## DRG.Class.OTHER  0.117939   0.042134   2.799  0.00512 ** 
## DRG.Class.SURG   0.016673   0.030673   0.544  0.58675    
## Race.Black       0.035221   0.045198   0.779  0.43582    
## Race.Hispanic    0.102097   0.101446   1.006  0.31422    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 37785  on 50081  degrees of freedom
## Residual deviance: 33828  on 50072  degrees of freedom
## AIC: 33848
## 
## Number of Fisher Scoring iterations: 5

MASS::stepAIC(model)

## Start:  AIC=33847.6
## Readmission.Status ~ (Gender + ER + Age + log_LOS + log_riskscore + 
##     los_age_clust + DRG.Class.OTHER + DRG.Class.SURG + Race.Black + 
##     Race.Hispanic) - los_age_clust
## 
##                   Df Deviance   AIC
## - ER               1    33828 33846
## - DRG.Class.SURG   1    33828 33846
## - Race.Black       1    33828 33846
## - Gender           1    33828 33846
## - Race.Hispanic    1    33829 33847
## <none>                  33828 33848
## - log_LOS          1    33834 33852
## - DRG.Class.OTHER  1    33835 33853
## - Age              1    33856 33874
## - log_riskscore    1    37459 37477
## 
## Step:  AIC=33845.66
## Readmission.Status ~ Gender + Age + log_LOS + log_riskscore + 
##     DRG.Class.OTHER + DRG.Class.SURG + Race.Black + Race.Hispanic
## 
##                   Df Deviance   AIC
## - DRG.Class.SURG   1    33828 33844
## - Gender           1    33828 33844
## - Race.Black       1    33828 33844
## - Race.Hispanic    1    33829 33845
## <none>                  33828 33846
## - log_LOS          1    33834 33850
## - DRG.Class.OTHER  1    33835 33851
## - Age              1    33856 33872
## - log_riskscore    1    37459 37475
## 
## Step:  AIC=33843.96
## Readmission.Status ~ Gender + Age + log_LOS + log_riskscore + 
##     DRG.Class.OTHER + Race.Black + Race.Hispanic
## 
##                   Df Deviance   AIC
## - Gender           1    33829 33843
## - Race.Black       1    33829 33843
## - Race.Hispanic    1    33829 33843
## <none>                  33828 33844
## - log_LOS          1    33835 33849
## - DRG.Class.OTHER  1    33836 33850
## - Age              1    33857 33871
## - log_riskscore    1    37459 37473
## 
## Step:  AIC=33842.55
## Readmission.Status ~ Age + log_LOS + log_riskscore + DRG.Class.OTHER + 
##     Race.Black + Race.Hispanic
## 
##                   Df Deviance   AIC
## - Race.Black       1    33829 33841
## - Race.Hispanic    1    33830 33842
## <none>                  33829 33843
## - log_LOS          1    33835 33847
## - DRG.Class.OTHER  1    33836 33848
## - Age              1    33858 33870
## - log_riskscore    1    37462 37474
## 
## Step:  AIC=33841.13
## Readmission.Status ~ Age + log_LOS + log_riskscore + DRG.Class.OTHER + 
##     Race.Hispanic
## 
##                   Df Deviance   AIC
## - Race.Hispanic    1    33830 33840
## <none>                  33829 33841
## - log_LOS          1    33836 33846
## - DRG.Class.OTHER  1    33837 33847
## - Age              1    33861 33871
## - log_riskscore    1    37466 37476
## 
## Step:  AIC=33840.02
## Readmission.Status ~ Age + log_LOS + log_riskscore + DRG.Class.OTHER
## 
##                   Df Deviance   AIC
## <none>                  33830 33840
## - log_LOS          1    33837 33845
## - DRG.Class.OTHER  1    33838 33846
## - Age              1    33862 33870
## - log_riskscore    1    37467 37475

## 
## Call:  glm(formula = Readmission.Status ~ Age + log_LOS + log_riskscore + 
##     DRG.Class.OTHER, family = binomial(link = "logit"), data = train_bin)
## 
## Coefficients:
##     (Intercept)              Age          log_LOS    log_riskscore  
##       -2.703682        -0.005935         0.052674         1.301097  
## DRG.Class.OTHER  
##        0.110212  
## 
## Degrees of Freedom: 50081 Total (i.e. Null);  50077 Residual
## Null Deviance:       37780 
## Residual Deviance: 33830     AIC: 33840

preds <- predict(model, newdat=test_bin,type="response")

roc_model <- roc(test$Readmission.Status,preds)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

#confusionMatrix(factor(1*(preds>.8)),factor(test$Readmission.Status))

plot(roc_model)

auc(roc_model)

## Area under the curve: 0.7451

model <- glm(Readmission.Status ~ Age + log_LOS + log_riskscore + DRG.Class.OTHER + 
    Race.Black + Race.Hispanic,
    data=train_bin, family = binomial(link="logit"))

summary(model)

## 
## Call:
## glm(formula = Readmission.Status ~ Age + log_LOS + log_riskscore + 
##     DRG.Class.OTHER + Race.Black + Race.Hispanic, family = binomial(link = "logit"), 
##     data = train_bin)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -2.720301   0.088953 -30.581  < 2e-16 ***
## Age             -0.005783   0.001056  -5.474 4.39e-08 ***
## log_LOS          0.052715   0.020466   2.576   0.0100 *  
## log_riskscore    1.300703   0.023601  55.113  < 2e-16 ***
## DRG.Class.OTHER  0.110062   0.039645   2.776   0.0055 ** 
## Race.Black       0.034392   0.045184   0.761   0.4466    
## Race.Hispanic    0.100989   0.101443   0.996   0.3195    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 37785  on 50081  degrees of freedom
## Residual deviance: 33829  on 50075  degrees of freedom
## AIC: 33843
## 
## Number of Fisher Scoring iterations: 5

preds <- predict(model, newdat=test_bin,type="response")

roc_model <- roc(test$Readmission.Status,preds)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

plot(roc_model)

auc(roc_model)

## Area under the curve: 0.7453

Task 8 - Interpret the model (6 points)

No code is provided.

model <- glm(Readmission.Status ~ Age + log_LOS + log_riskscore + DRG.Class.OTHER + 
    Race.Black + Race.Hispanic, 
    data = readmission_binarized, 
    family = binomial(link="probit"))

summary(model)

## 
## Call:
## glm(formula = Readmission.Status ~ Age + log_LOS + log_riskscore + 
##     DRG.Class.OTHER + Race.Black + Race.Hispanic, family = binomial(link = "probit"), 
##     data = readmission_binarized)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -1.511383   0.041502 -36.417  < 2e-16 ***
## Age             -0.003620   0.000497  -7.284 3.25e-13 ***
## log_LOS          0.033235   0.009729   3.416 0.000635 ***
## log_riskscore    0.700327   0.010727  65.289  < 2e-16 ***
## DRG.Class.OTHER  0.059406   0.018642   3.187 0.001439 ** 
## Race.Black       0.018695   0.021326   0.877 0.380676    
## Race.Hispanic    0.048057   0.047324   1.015 0.309880    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 50559  on 66775  degrees of freedom
## Residual deviance: 45116  on 66769  degrees of freedom
## AIC: 45130
## 
## Number of Fisher Scoring iterations: 5

readmission_binarized %>% summary()

##  Readmission.Status Gender          ER              Age            log_LOS     
##  Min.   :0.0000     F:38005   Min.   :0.0000   Min.   : 24.00   Min.   :0.000  
##  1st Qu.:0.0000     M:28771   1st Qu.:0.0000   1st Qu.: 67.00   1st Qu.:1.099  
##  Median :0.0000               Median :0.0000   Median : 75.00   Median :1.609  
##  Mean   :0.1259               Mean   :0.5083   Mean   : 73.64   Mean   :1.653  
##  3rd Qu.:0.0000               3rd Qu.:1.0000   3rd Qu.: 83.00   3rd Qu.:2.079  
##  Max.   :1.0000               Max.   :9.0000   Max.   :101.00   Max.   :3.584  
##  log_riskscore     los_age_clust DRG.Class.OTHER  DRG.Class.SURG  
##  Min.   :-2.5383   1:19188       Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 0.1017   2:22226       1st Qu.:0.0000   1st Qu.:0.0000  
##  Median : 0.6238   3:17466       Median :0.0000   Median :0.0000  
##  Mean   : 0.6000   4: 7896       Mean   :0.1399   Mean   :0.4046  
##  3rd Qu.: 1.1547                 3rd Qu.:0.0000   3rd Qu.:1.0000  
##  Max.   : 2.5102                 Max.   :1.0000   Max.   :1.0000  
##    Race.Black     Race.Hispanic    
##  Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :0.0000   Median :0.00000  
##  Mean   :0.1063   Mean   :0.01926  
##  3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :1.00000

example1 <- tibble(
  Gender  = "F",
  ER = 0,
  Age = 74,
  log_LOS = 1.65,
  log_riskscore = 0.6,
  los_age_cluster = 2,
  DRG.Class.OTHER = 0,
  DRG.Class.SURG = 1,
  Race.Black = 0,
  Race.Hispanic = 0
)

example2 <- example1 %>% mutate(log_riskscore = 2)
example3 <- example1 %>% mutate(log_LOS = 2)
example4 <- example1 %>% mutate(Age = 80)
example5 <- example1 %>% mutate(Gender = "M")
example6 <- example1 %>% mutate(Race.Black  = 1)
example7 <- example1 %>% mutate(Race.Hispanic  = 1)

preds1 <- predict(model, newdat=example1,type="response")
preds2 <- predict(model, newdat=example2,type="response")
preds3 <- predict(model, newdat=example3,type="response")
preds4 <- predict(model, newdat=example4,type="response")
preds5 <- predict(model, newdat=example5,type="response")
preds6 <- predict(model, newdat=example6,type="response")
preds7 <- predict(model, newdat=example7,type="response")

example1 %>% 
  rbind(example2) %>% 
  rbind(example3) %>% 
  rbind(example4) %>% 
  rbind(example5) %>% 
  rbind(example6) %>% 
  rbind(example7) %>% 
  mutate(y = c(preds1, preds2, preds3, preds4,preds5, preds6, preds7)) %>% 
  mutate(patientid = row_number(), y = round(y,3)) %>% 
  select(patientid, y)

## # A tibble: 7 × 2
##   patientid     y
##       <int> <dbl>
## 1         1 0.096
## 2         2 0.373
## 3         3 0.098
## 4         4 0.092
## 5         5 0.096
## 6         6 0.099
## 7         7 0.105

Task 9 - Set the cutoff (9 points)

The following code calculates the cost using a cutoff of 0.075. It assumes the final model constructed on the full dataset is called glm_full and the final dataset is readmit.

readmission$Readmission.Status %>% sum()*25

## [1] 210225

pred_full <- predict(model,type="response")

cutoff <- 0.1

pred_readmit <- 1*(pred_full > cutoff)

no_intervention_cost <- 25*sum(readmission$Readmission.Status == 1)
full_intervention_cost <- 2*nrow(readmission)
no_intervention_cost

## [1] 210225

full_intervention_cost

## [1] 133552

library(e1071)
cm <- confusionMatrix(factor(1*(pred_full>cutoff)),factor(readmission$Readmission.Status))$table
cm

##           Reference
## Prediction     0     1
##          0 31709  1648
##          1 26658  6761

modified_cost <- cm[2,1]*2+cm[2,2]*2+cm[1,2]*25
modified_cost

## [1] 108038

cutoff_values <- c(1, 0.4, 0.3, 0.2, 0.1, 0.09, 0.08, 0.075, 0.07, 0.05, 0)

get_modified_cost <- function(input_cutoff){
  pred_readmit <- 1*(pred_full > input_cutoff)
  cm <- confusionMatrix(factor(pred_readmit),factor(readmission$Readmission.Status))
  modified_cost <- cm$table[2,1]*2+cm$table[2,2]*2+cm$table[1,2]*25
  modified_cost
}
get_modified_cost(0.2)

## [1] 141967

tibble(cutoff = cutoff_values,
       cost = sapply(cutoff_values, get_modified_cost))

## Warning in confusionMatrix.default(factor(pred_readmit),
## factor(readmission$Readmission.Status)): Levels are not in the same order for
## reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(factor(pred_readmit),
## factor(readmission$Readmission.Status)): Levels are not in the same order for
## reference and data. Refactoring data to match.

## # A tibble: 11 × 2
##    cutoff   cost
##     <dbl>  <dbl>
##  1  1     210225
##  2  0.4   203799
##  3  0.3   180869
##  4  0.2   141967
##  5  0.1   108038
##  6  0.09  106426
##  7  0.08  105952
##  8  0.075 106227
##  9  0.07  106855
## 10  0.05  110386
## 11  0     133552

cutoff <- 0.08
pred_readmit <- 1*(pred_full > cutoff)
cm <- confusionMatrix(factor(pred_readmit),factor(readmission$Readmission.Status))
cm$table

##           Reference
## Prediction     0     1
##          0 26381  1094
##          1 31986  7315

Hospital Readmissions - solution

Futuro Insight Model

06 June, 2024

Task 1 - Perform univariate exploration of the four non-factor variables (6 points)

Task 2 - Examine relationships between DRG.Class and DRG.Complication (5 points)

Task 3 - Use observations from cluster analysis to consider a new feature (9 points)

Task 4 - Select an interaction (5 points)

Task 5 - Select a link function (8 points)

Task 6 - Decide on the factor variable from Task 3 (5 points)

Task 7 - Select features (15 points)

Task 8 - Interpret the model (6 points)

Task 9 - Set the cutoff (9 points)

Task 10 – Consider alternative models and model construction techniques (12 points)

Task 11 – Executive summary (20 points)