This chunk performs binarization. Note that it is set to fullRank = FALSE. This creates binarized variables for each factor level. If set to TRUE it will leave one out. Note the new variables are placed in a new dataframe. It can attached to an existing dataframe via old.df <- cbind(old.df, binarized_vars)
factor_names <- c("ER","Age") #insert the column names of the variables to be binarized
factor_vars <- readmission[,factor_names]
for (var in factor_names) {
factor_vars[, var] <- as.character(factor_vars[, var])
}
binarizer <- caret::dummyVars(paste("~", paste(factor_names, collapse = "+")) , data = factor_vars, fullRank = TRUE)
binarized_vars <- data.frame(predict(binarizer, newdata = factor_vars))
head(binarized_vars)
This chunk creates training and testing sets.
#Create train and test sets
set.seed(4321)
partition <- createDataPartition(readmission[,1], list = FALSE, p = .75) #The partition will stratify using variable 1 from the dataframe
train <- readmission[partition, ]
test <- readmission[-partition, ]
print("TRAIN")
mean(train$Readmission.Status)
print("TEST")
mean(test$Readmission.Status)
The following chunk provides code that can be used to combine factor levels. It also relevels in case the new level has the highest frequency.
#USED fct-infreq method instead
#This example combines levels other than White of Race into a new level called NonWhite.
#Execute the function levels(readmission$Race) to identify the levels. Be sure the variable is a factor variable before doing this. This code assumes the variable has previously been releveled so that "White" is the first level.
#readmission2<-readmission #The results are in a new data frame called readmission2. This is done so that the results can be checked without losing the original data frame. When done, consider executing readmission <- readmission2
#Warning: If you run this code and import the plyr library, it will cause errors later on.
#To fix these errors, there are two options
#1. Restart R and RStudio and load tidyverse AFTER plyr
#2. Tell R to use the dplyr library for functions that share the same name in both libraries: dplyr::summarise, dplyr::group_by, etc.
#library(plyr)
#var <- "Race"
#var.levels <- levels(readmission2[,var])
#readmission2[,var] <- mapvalues(readmission2[,var],var.levels,c("White","NonWhite","NonWhite","NonWhite"))
#Relevel
#table <- as.data.frame(table(readmission2[,var]))
# max <- which.max(table[,2])
# level.name <- as.character(table[max,1])
# readmission2[,var] <- relevel(readmission2[,var], ref = level.name)
#table(readmission2[,var])
This chunk reads in the data, relevels factors, and prints a summary.
#Set factor levels to those with the most observations
readmission <- readmission %>%
mutate_if(is.character, fct_infreq)
summary(readmission)
## Readmission.Status Gender Race ER DRG.Class
## Min. :0.0000 F:38011 White :56124 Min. :0.0000 MED :35771
## 1st Qu.:0.0000 M:28771 Black : 7099 1st Qu.:0.0000 SURG :30447
## Median :0.0000 Others : 2273 Median :0.0000 UNGROUP: 564
## Mean :0.1259 Hispanic: 1286 Mean :0.5083
## 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :9.0000
## LOS Age HCC.Riskscore DRG.Complication
## Min. : 1.000 Min. : 24.00 Min. : 0.079 MedicalMCC.CC:18110
## 1st Qu.: 3.000 1st Qu.: 67.00 1st Qu.: 1.107 SurgMCC.CC :15468
## Median : 5.000 Median : 75.00 Median : 1.865 MedicalNoC :12310
## Mean : 6.693 Mean : 73.64 Mean : 2.345 SurgNoC :11549
## 3rd Qu.: 8.000 3rd Qu.: 83.00 3rd Qu.: 3.173 Other : 9345
## Max. :36.000 Max. :101.00 Max. :12.307
glimpse(readmission)
## Rows: 66,782
## Columns: 9
## $ Readmission.Status <dbl> 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, …
## $ Gender <fct> M, M, M, M, M, M, F, F, F, F, F, M, M, M, M, M, M, …
## $ Race <fct> White, White, White, White, White, White, White, Wh…
## $ ER <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, …
## $ DRG.Class <fct> MED, SURG, SURG, MED, MED, SURG, MED, MED, MED, SUR…
## $ LOS <dbl> 29, 5, 9, 5, 13, 10, 4, 10, 14, 4, 3, 9, 8, 22, 10,…
## $ Age <dbl> 73, 73, 73, 73, 73, 73, 71, 71, 71, 71, 71, 67, 67,…
## $ HCC.Riskscore <dbl> 12.307, 12.307, 12.307, 12.307, 12.307, 12.307, 11.…
## $ DRG.Complication <fct> Other, SurgNoC, SurgNoC, MedicalNoC, MedicalNoC, Su…
Code is provided to create a histogram for one of the variables.
library(ggplot2)
readmission %>% ggplot(aes(ER)) + geom_histogram(bins = 9) #Set the number of bins equal to 9 because there are 9 unique values
readmission %>% ggplot(aes(LOS)) + geom_histogram(bins = 36) #Set the number of bins equal to 36 because there are 36 unique values
readmission %>% ggplot(aes(log(LOS))) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
readmission %>% ggplot(aes(Age)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
readmission %>% ggplot(aes(HCC.Riskscore)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
readmission <- readmission %>% mutate(log_LOS = log(LOS),
log_riskscore = log(HCC.Riskscore)) %>%
select(-LOS, -HCC.Riskscore)
Code is provided to create a tabular view of the two variables.
readmission %>%
count(DRG.Class, DRG.Complication) %>%
spread(DRG.Class,n) %>%
make_table()
| DRG.Complication | MED | SURG | UNGROUP |
|---|---|---|---|
| MedicalMCC.CC | 18104 | 6 | NA |
| SurgMCC.CC | NA | 15468 | NA |
| MedicalNoC | 12310 | NA | NA |
| SurgNoC | NA | 11549 | NA |
| Other | 5357 | 3424 | 564 |
#Using base R
table(readmission$DRG.Class,readmission$DRG.Complication)
##
## MedicalMCC.CC SurgMCC.CC MedicalNoC SurgNoC Other
## MED 18104 0 12310 0 5357
## SURG 6 15468 0 11549 3424
## UNGROUP 0 0 0 0 564
#Using base R
#rows_to_drop <- (readmission$DRG.Complication == "MedicalMCC.CC") & (readmission$DRG.Class == "SURG")
#readmission_temp <- readmission[!rows_to_drop,]
#removes the 6 patients
readmission <- readmission %>%
filter(!(DRG.Complication == "MedicalMCC.CC" & DRG.Class == "SURG"))
readmission <- readmission %>%
mutate(DRG.Class = case_when(
DRG.Complication == "MedicalMCC.CC" & DRG.Class == "MED" ~ "MED",
DRG.Complication == "SurgMCC.CC" & DRG.Class == "SURG" ~ "SURG",
DRG.Complication == "MedicalNoC" & DRG.Class == "MED" ~ "MED",
DRG.Complication == "SurgNoC" & DRG.Class == "SURG" ~ "SURG",
TRUE ~ "OTHER"
))
#Using Base R,
#readmission_temp$DRG.Class <- case_when(
# readmission$DRG.Complication == "MedicalMCC.CC" & readmission$DRG.Class == "MED" ~ "MED",
# readmission$DRG.Complication == "SurgMCC.CC" & readmission$DRG.Class == "SURG" ~ "SURG",
# readmission$DRG.Complication == "MedicalNoC" & readmission$DRG.Class == "MED" ~ "MED",
# readmission$DRG.Complication == "SurgNoC" & readmission$DRG.Class == "SURG" ~ "SURG",
# TRUE ~ "OTHER"
# )
readmission %>% count(DRG.Class) %>% make_table()
| DRG.Class | n |
|---|---|
| MED | 30414 |
| OTHER | 9345 |
| SURG | 27017 |
#table(readmission_temp$DRG.Class)
#relevel the new variable
readmission$DRG.Class <- fct_infreq(readmission$DRG.Class)
Code is provided to perform cluster analysis for from 1 to 12 clusters, construct an elbow plot and create a new variable based on a selected number of clusters. That variable will need to be retained for potentially being added tot he dataframe.
nstart.val <- 20
cluster_vars <- readmission[c('log_LOS','Age')]
for(i in 1:ncol(cluster_vars)){
cluster_vars[,i] <- scale(cluster_vars[,i])
}
km1 <- kmeans(cluster_vars,centers=1,nstart=nstart.val)
km2 <- kmeans(cluster_vars,centers=2,nstart=nstart.val)
km3 <- kmeans(cluster_vars,centers=3,nstart=nstart.val)
km4 <- kmeans(cluster_vars,centers=4,nstart=nstart.val)
km5 <- kmeans(cluster_vars,centers=5,nstart=nstart.val)
km6 <- kmeans(cluster_vars,centers=6,nstart=nstart.val)
km7 <- kmeans(cluster_vars,centers=7,nstart=nstart.val)
km8 <- kmeans(cluster_vars,centers=8,nstart=nstart.val)
km9 <- kmeans(cluster_vars,centers=9,nstart=nstart.val)
km10 <- kmeans(cluster_vars,centers=10,nstart=nstart.val)
km11 <- kmeans(cluster_vars,centers=11,nstart=nstart.val)
km12 <- kmeans(cluster_vars,centers=12,nstart=nstart.val)
var.exp <- data.frame(k = c(1:12),
bss_tss = c(km1$betweenss/km1$totss,
km2$betweenss/km2$totss,
km3$betweenss/km3$totss,
km4$betweenss/km4$totss,
km5$betweenss/km5$totss,
km6$betweenss/km6$totss,
km7$betweenss/km7$totss,
km8$betweenss/km8$totss,
km9$betweenss/km9$totss,
km10$betweenss/km10$totss,
km11$betweenss/km11$totss,
km12$betweenss/km12$totss))
ggplot(var.exp,aes(x=k,y=bss_tss))+geom_point()
LOS_Age_Clust <- as.factor(km4$cluster) #This creates a new variable based on having 8 clusters.
cluster_vars$LOS_Age_Clust <- LOS_Age_Clust
ggplot(data = cluster_vars, aes(x = Age, y = log_LOS, col = LOS_Age_Clust)) + geom_point() + theme(axis.text = element_blank(), legend.title = element_blank()) +ggtitle("Clustering with 4 groups")
readmission <- readmission %>% mutate(los_age_clust = LOS_Age_Clust)
The following code may help determine if interactions are present. It is best to treat ER as a factor variable for this purpose.
#Both variables are factor variables
ggplot(readmission,aes(Gender,fill=factor(Readmission.Status))) + geom_bar(position = "fill") +
facet_wrap(~Race,ncol=2,scales="free")+scale_y_continuous()
#Look at the change in the percentage of readmitted patients
readmission %>%
dplyr::group_by(Race, Gender) %>%
dplyr::summarise(percent_readmission = mean(Readmission.Status)) %>%
tidyr::spread(Gender, percent_readmission) %>%
mutate_if(is.numeric, ~round(.x,2)) %>%
make_table()
## `summarise()` has grouped output by 'Race'. You can override using the
## `.groups` argument.
## `mutate_if()` ignored the following grouping variables:
| Race | F | M |
|---|---|---|
| White | 0.12 | 0.13 |
| Black | 0.13 | 0.14 |
| Others | 0.11 | 0.12 |
| Hispanic | 0.15 | 0.12 |
#One factor variable and one continuous numeric variable
ggplot(readmission,aes(x=factor(Readmission.Status),y=log_riskscore)) + geom_boxplot() +facet_wrap(~factor(ER))
The following code runs a GLM using the logit link and all available variables. It assumes that train and test sets have been constructed. Adding an interaction of Gender and Race is included in the code. That is for illustration purposes. The code also produces an ROC curve, a confusion matrix, and calculates AUC.
readmission <- readmission %>% select(-DRG.Complication)
#In base R,
#readmission$DRG.Complication <- NULL
#readmission$DRG.Class <- NULL
#Create train and test sets
library(caret)
set.seed(4321)
partition <- createDataPartition(readmission$Readmission.Status, list = FALSE, p = .75) #The partition will stratify using variable 1 from the dataframe
train <- readmission[partition, ]
test <- readmission[-partition, ]
print("TRAIN")
## [1] "TRAIN"
mean(train$Readmission.Status)
## [1] 0.1252346
print("TEST")
## [1] "TEST"
mean(test$Readmission.Status)
## [1] 0.1280101
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
model <- glm(Readmission.Status ~ . + Gender*Race - los_age_clust, data=train, family = binomial(link="logit"))
summary(model)
##
## Call:
## glm(formula = Readmission.Status ~ . + Gender * Race - los_age_clust,
## family = binomial(link = "logit"), data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.735001 0.093088 -29.381 < 2e-16 ***
## GenderM 0.003635 0.031228 0.116 0.90732
## RaceBlack -0.013293 0.060167 -0.221 0.82514
## RaceOthers -0.107187 0.111979 -0.957 0.33846
## RaceHispanic 0.074722 0.132926 0.562 0.57403
## ER 0.004250 0.017273 0.246 0.80565
## DRG.ClassSURG 0.017028 0.030677 0.555 0.57886
## DRG.ClassOTHER 0.117817 0.042141 2.796 0.00518 **
## Age -0.005727 0.001062 -5.391 7e-08 ***
## log_LOS 0.052907 0.020472 2.584 0.00976 **
## log_riskscore 1.300394 0.023607 55.085 < 2e-16 ***
## GenderM:RaceBlack 0.109082 0.090315 1.208 0.22713
## GenderM:RaceOthers 0.158532 0.159044 0.997 0.31887
## GenderM:RaceHispanic 0.062118 0.205592 0.302 0.76254
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 37785 on 50081 degrees of freedom
## Residual deviance: 33825 on 50068 degrees of freedom
## AIC: 33853
##
## Number of Fisher Scoring iterations: 5
preds <- predict(model, newdat=test,type="response")
roc_model <- roc(test$Readmission.Status,preds)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
#confusionMatrix(factor(1*(preds>.8)),factor(test$Readmission.Status))
#plot(roc_model)
auc(roc_model)
## Area under the curve: 0.7451
model <- glm(Readmission.Status ~ . + Gender*Race - los_age_clust, data=train, family = binomial(link="probit"))
#summary(model)
preds <- predict(model, newdat=test,type="response")
roc_model <- roc(test$Readmission.Status,preds)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
#confusionMatrix(factor(1*(preds>.8)),factor(test$Readmission.Status))
#plot(roc_model)
auc(roc_model)
## Area under the curve: 0.745
model <- glm(Readmission.Status ~ . + Gender*Race - los_age_clust, data=train, family = binomial(link="cauchit"))
#summary(model)
preds <- predict(model, newdat=test,type="response")
roc_model <- roc(test$Readmission.Status,preds)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
#confusionMatrix(factor(1*(preds>.8)),factor(test$Readmission.Status))
#plot(roc_model)
auc(roc_model)
## Area under the curve: 0.7449
model <- glm(Readmission.Status ~ . + Gender*Race - los_age_clust, data=train, family = binomial(link="cloglog"))
#summary(model)
preds <- predict(model, newdat=test,type="response")
roc_model <- roc(test$Readmission.Status,preds)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
#confusionMatrix(factor(1*(preds>.8)),factor(test$Readmission.Status))
#plot(roc_model)
auc(roc_model)
## Area under the curve: 0.745
No code is provided.
model <- glm(Readmission.Status ~ . + Gender*Race - log_LOS - Age, data=train, family = binomial(link="logit"))
summary(model)
##
## Call:
## glm(formula = Readmission.Status ~ . + Gender * Race - log_LOS -
## Age, family = binomial(link = "logit"), data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.159845 0.042191 -74.894 < 2e-16 ***
## GenderM 0.008457 0.031214 0.271 0.78646
## RaceBlack 0.000908 0.060084 0.015 0.98794
## RaceOthers -0.096542 0.111857 -0.863 0.38809
## RaceHispanic 0.083876 0.132874 0.631 0.52788
## ER 0.004442 0.017267 0.257 0.79701
## DRG.ClassSURG 0.016665 0.030672 0.543 0.58690
## DRG.ClassOTHER 0.119978 0.042068 2.852 0.00434 **
## log_riskscore 1.297614 0.023429 55.385 < 2e-16 ***
## los_age_clust2 0.078253 0.038017 2.058 0.03955 *
## los_age_clust3 0.147635 0.036731 4.019 5.83e-05 ***
## los_age_clust4 0.197225 0.049776 3.962 7.42e-05 ***
## GenderM:RaceBlack 0.110899 0.090263 1.229 0.21921
## GenderM:RaceOthers 0.152623 0.158979 0.960 0.33704
## GenderM:RaceHispanic 0.067235 0.205509 0.327 0.74354
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 37785 on 50081 degrees of freedom
## Residual deviance: 33838 on 50067 degrees of freedom
## AIC: 33868
##
## Number of Fisher Scoring iterations: 5
preds <- predict(model, newdat=test,type="response")
roc_model <- roc(test$Readmission.Status,preds)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
#confusionMatrix(factor(1*(preds>.8)),factor(test$Readmission.Status))
#plot(roc_model)
auc(roc_model)
## Area under the curve: 0.7447
No code is provided.
library(caret)
factor_names <- c("DRG.Class","Race") #insert the column names of the variables to be binarized
factor_vars <- readmission[,factor_names]
binarizer <- caret::dummyVars(paste("~", paste(factor_names, collapse = "+")) , data = factor_vars, fullRank = TRUE)
binarized_vars <- data.frame(predict(binarizer, newdata = factor_vars))
head(binarized_vars)
## DRG.Class.SURG DRG.Class.OTHER Race.Black Race.Others Race.Hispanic
## 1 0 1 0 0 0
## 2 1 0 0 0 0
## 3 1 0 0 0 0
## 4 0 0 0 0 0
## 5 0 0 0 0 0
## 6 1 0 0 0 0
readmission_binarized <- readmission %>%
select(-DRG.Class, -Race) %>%
mutate(DRG.Class.MED = binarized_vars$DRG.Class.MED,
DRG.Class.OTHER = binarized_vars$DRG.Class.OTHER,
DRG.Class.SURG = binarized_vars$DRG.Class.SURG,
Race.Black = binarized_vars$Race.Black,
Race.Hispanic = binarized_vars$Race.Hispanic,
Race.Others = binarized_vars$Race.White)
train_bin <- readmission_binarized[partition,]
test_bin <- readmission_binarized[-partition,]
glimpse(train_bin)
## Rows: 50,082
## Columns: 11
## $ Readmission.Status <dbl> 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, …
## $ Gender <fct> M, M, M, M, F, F, F, F, M, M, M, M, M, M, M, M, M, …
## $ ER <dbl> 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, …
## $ Age <dbl> 73, 73, 73, 73, 71, 71, 71, 71, 67, 67, 67, 67, 67,…
## $ log_LOS <dbl> 3.3672958, 1.6094379, 2.5649494, 2.3025851, 2.30258…
## $ log_riskscore <dbl> 2.510168, 2.510168, 2.510168, 2.510168, 2.431154, 2…
## $ los_age_clust <fct> 3, 2, 3, 3, 3, 3, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, …
## $ DRG.Class.OTHER <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, …
## $ DRG.Class.SURG <dbl> 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, …
## $ Race.Black <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Race.Hispanic <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
model <- glm(Readmission.Status ~ . - los_age_clust, data=train_bin, family = binomial(link="logit"))
summary(model)
##
## Call:
## glm(formula = Readmission.Status ~ . - los_age_clust, family = binomial(link = "logit"),
## data = train_bin)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.745388 0.092682 -29.621 < 2e-16 ***
## GenderM 0.022245 0.028545 0.779 0.43579
## ER 0.004367 0.017270 0.253 0.80036
## Age -0.005712 0.001061 -5.385 7.26e-08 ***
## log_LOS 0.052813 0.020469 2.580 0.00988 **
## log_riskscore 1.300556 0.023604 55.100 < 2e-16 ***
## DRG.Class.OTHER 0.117939 0.042134 2.799 0.00512 **
## DRG.Class.SURG 0.016673 0.030673 0.544 0.58675
## Race.Black 0.035221 0.045198 0.779 0.43582
## Race.Hispanic 0.102097 0.101446 1.006 0.31422
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 37785 on 50081 degrees of freedom
## Residual deviance: 33828 on 50072 degrees of freedom
## AIC: 33848
##
## Number of Fisher Scoring iterations: 5
MASS::stepAIC(model)
## Start: AIC=33847.6
## Readmission.Status ~ (Gender + ER + Age + log_LOS + log_riskscore +
## los_age_clust + DRG.Class.OTHER + DRG.Class.SURG + Race.Black +
## Race.Hispanic) - los_age_clust
##
## Df Deviance AIC
## - ER 1 33828 33846
## - DRG.Class.SURG 1 33828 33846
## - Race.Black 1 33828 33846
## - Gender 1 33828 33846
## - Race.Hispanic 1 33829 33847
## <none> 33828 33848
## - log_LOS 1 33834 33852
## - DRG.Class.OTHER 1 33835 33853
## - Age 1 33856 33874
## - log_riskscore 1 37459 37477
##
## Step: AIC=33845.66
## Readmission.Status ~ Gender + Age + log_LOS + log_riskscore +
## DRG.Class.OTHER + DRG.Class.SURG + Race.Black + Race.Hispanic
##
## Df Deviance AIC
## - DRG.Class.SURG 1 33828 33844
## - Gender 1 33828 33844
## - Race.Black 1 33828 33844
## - Race.Hispanic 1 33829 33845
## <none> 33828 33846
## - log_LOS 1 33834 33850
## - DRG.Class.OTHER 1 33835 33851
## - Age 1 33856 33872
## - log_riskscore 1 37459 37475
##
## Step: AIC=33843.96
## Readmission.Status ~ Gender + Age + log_LOS + log_riskscore +
## DRG.Class.OTHER + Race.Black + Race.Hispanic
##
## Df Deviance AIC
## - Gender 1 33829 33843
## - Race.Black 1 33829 33843
## - Race.Hispanic 1 33829 33843
## <none> 33828 33844
## - log_LOS 1 33835 33849
## - DRG.Class.OTHER 1 33836 33850
## - Age 1 33857 33871
## - log_riskscore 1 37459 37473
##
## Step: AIC=33842.55
## Readmission.Status ~ Age + log_LOS + log_riskscore + DRG.Class.OTHER +
## Race.Black + Race.Hispanic
##
## Df Deviance AIC
## - Race.Black 1 33829 33841
## - Race.Hispanic 1 33830 33842
## <none> 33829 33843
## - log_LOS 1 33835 33847
## - DRG.Class.OTHER 1 33836 33848
## - Age 1 33858 33870
## - log_riskscore 1 37462 37474
##
## Step: AIC=33841.13
## Readmission.Status ~ Age + log_LOS + log_riskscore + DRG.Class.OTHER +
## Race.Hispanic
##
## Df Deviance AIC
## - Race.Hispanic 1 33830 33840
## <none> 33829 33841
## - log_LOS 1 33836 33846
## - DRG.Class.OTHER 1 33837 33847
## - Age 1 33861 33871
## - log_riskscore 1 37466 37476
##
## Step: AIC=33840.02
## Readmission.Status ~ Age + log_LOS + log_riskscore + DRG.Class.OTHER
##
## Df Deviance AIC
## <none> 33830 33840
## - log_LOS 1 33837 33845
## - DRG.Class.OTHER 1 33838 33846
## - Age 1 33862 33870
## - log_riskscore 1 37467 37475
##
## Call: glm(formula = Readmission.Status ~ Age + log_LOS + log_riskscore +
## DRG.Class.OTHER, family = binomial(link = "logit"), data = train_bin)
##
## Coefficients:
## (Intercept) Age log_LOS log_riskscore
## -2.703682 -0.005935 0.052674 1.301097
## DRG.Class.OTHER
## 0.110212
##
## Degrees of Freedom: 50081 Total (i.e. Null); 50077 Residual
## Null Deviance: 37780
## Residual Deviance: 33830 AIC: 33840
preds <- predict(model, newdat=test_bin,type="response")
roc_model <- roc(test$Readmission.Status,preds)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
#confusionMatrix(factor(1*(preds>.8)),factor(test$Readmission.Status))
plot(roc_model)
auc(roc_model)
## Area under the curve: 0.7451
model <- glm(Readmission.Status ~ Age + log_LOS + log_riskscore + DRG.Class.OTHER +
Race.Black + Race.Hispanic,
data=train_bin, family = binomial(link="logit"))
summary(model)
##
## Call:
## glm(formula = Readmission.Status ~ Age + log_LOS + log_riskscore +
## DRG.Class.OTHER + Race.Black + Race.Hispanic, family = binomial(link = "logit"),
## data = train_bin)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.720301 0.088953 -30.581 < 2e-16 ***
## Age -0.005783 0.001056 -5.474 4.39e-08 ***
## log_LOS 0.052715 0.020466 2.576 0.0100 *
## log_riskscore 1.300703 0.023601 55.113 < 2e-16 ***
## DRG.Class.OTHER 0.110062 0.039645 2.776 0.0055 **
## Race.Black 0.034392 0.045184 0.761 0.4466
## Race.Hispanic 0.100989 0.101443 0.996 0.3195
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 37785 on 50081 degrees of freedom
## Residual deviance: 33829 on 50075 degrees of freedom
## AIC: 33843
##
## Number of Fisher Scoring iterations: 5
preds <- predict(model, newdat=test_bin,type="response")
roc_model <- roc(test$Readmission.Status,preds)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_model)
auc(roc_model)
## Area under the curve: 0.7453
No code is provided.
model <- glm(Readmission.Status ~ Age + log_LOS + log_riskscore + DRG.Class.OTHER +
Race.Black + Race.Hispanic,
data = readmission_binarized,
family = binomial(link="probit"))
summary(model)
##
## Call:
## glm(formula = Readmission.Status ~ Age + log_LOS + log_riskscore +
## DRG.Class.OTHER + Race.Black + Race.Hispanic, family = binomial(link = "probit"),
## data = readmission_binarized)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.511383 0.041502 -36.417 < 2e-16 ***
## Age -0.003620 0.000497 -7.284 3.25e-13 ***
## log_LOS 0.033235 0.009729 3.416 0.000635 ***
## log_riskscore 0.700327 0.010727 65.289 < 2e-16 ***
## DRG.Class.OTHER 0.059406 0.018642 3.187 0.001439 **
## Race.Black 0.018695 0.021326 0.877 0.380676
## Race.Hispanic 0.048057 0.047324 1.015 0.309880
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 50559 on 66775 degrees of freedom
## Residual deviance: 45116 on 66769 degrees of freedom
## AIC: 45130
##
## Number of Fisher Scoring iterations: 5
readmission_binarized %>% summary()
## Readmission.Status Gender ER Age log_LOS
## Min. :0.0000 F:38005 Min. :0.0000 Min. : 24.00 Min. :0.000
## 1st Qu.:0.0000 M:28771 1st Qu.:0.0000 1st Qu.: 67.00 1st Qu.:1.099
## Median :0.0000 Median :0.0000 Median : 75.00 Median :1.609
## Mean :0.1259 Mean :0.5083 Mean : 73.64 Mean :1.653
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.: 83.00 3rd Qu.:2.079
## Max. :1.0000 Max. :9.0000 Max. :101.00 Max. :3.584
## log_riskscore los_age_clust DRG.Class.OTHER DRG.Class.SURG
## Min. :-2.5383 1:19188 Min. :0.0000 Min. :0.0000
## 1st Qu.: 0.1017 2:22226 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 0.6238 3:17466 Median :0.0000 Median :0.0000
## Mean : 0.6000 4: 7896 Mean :0.1399 Mean :0.4046
## 3rd Qu.: 1.1547 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. : 2.5102 Max. :1.0000 Max. :1.0000
## Race.Black Race.Hispanic
## Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median :0.00000
## Mean :0.1063 Mean :0.01926
## 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.00000
example1 <- tibble(
Gender = "F",
ER = 0,
Age = 74,
log_LOS = 1.65,
log_riskscore = 0.6,
los_age_cluster = 2,
DRG.Class.OTHER = 0,
DRG.Class.SURG = 1,
Race.Black = 0,
Race.Hispanic = 0
)
example2 <- example1 %>% mutate(log_riskscore = 2)
example3 <- example1 %>% mutate(log_LOS = 2)
example4 <- example1 %>% mutate(Age = 80)
example5 <- example1 %>% mutate(Gender = "M")
example6 <- example1 %>% mutate(Race.Black = 1)
example7 <- example1 %>% mutate(Race.Hispanic = 1)
preds1 <- predict(model, newdat=example1,type="response")
preds2 <- predict(model, newdat=example2,type="response")
preds3 <- predict(model, newdat=example3,type="response")
preds4 <- predict(model, newdat=example4,type="response")
preds5 <- predict(model, newdat=example5,type="response")
preds6 <- predict(model, newdat=example6,type="response")
preds7 <- predict(model, newdat=example7,type="response")
example1 %>%
rbind(example2) %>%
rbind(example3) %>%
rbind(example4) %>%
rbind(example5) %>%
rbind(example6) %>%
rbind(example7) %>%
mutate(y = c(preds1, preds2, preds3, preds4,preds5, preds6, preds7)) %>%
mutate(patientid = row_number(), y = round(y,3)) %>%
select(patientid, y)
## # A tibble: 7 × 2
## patientid y
## <int> <dbl>
## 1 1 0.096
## 2 2 0.373
## 3 3 0.098
## 4 4 0.092
## 5 5 0.096
## 6 6 0.099
## 7 7 0.105
The following code calculates the cost using a cutoff of 0.075. It assumes the final model constructed on the full dataset is called glm_full and the final dataset is readmit.
readmission$Readmission.Status %>% sum()*25
## [1] 210225
pred_full <- predict(model,type="response")
cutoff <- 0.1
pred_readmit <- 1*(pred_full > cutoff)
no_intervention_cost <- 25*sum(readmission$Readmission.Status == 1)
full_intervention_cost <- 2*nrow(readmission)
no_intervention_cost
## [1] 210225
full_intervention_cost
## [1] 133552
library(e1071)
cm <- confusionMatrix(factor(1*(pred_full>cutoff)),factor(readmission$Readmission.Status))$table
cm
## Reference
## Prediction 0 1
## 0 31709 1648
## 1 26658 6761
modified_cost <- cm[2,1]*2+cm[2,2]*2+cm[1,2]*25
modified_cost
## [1] 108038
cutoff_values <- c(1, 0.4, 0.3, 0.2, 0.1, 0.09, 0.08, 0.075, 0.07, 0.05, 0)
get_modified_cost <- function(input_cutoff){
pred_readmit <- 1*(pred_full > input_cutoff)
cm <- confusionMatrix(factor(pred_readmit),factor(readmission$Readmission.Status))
modified_cost <- cm$table[2,1]*2+cm$table[2,2]*2+cm$table[1,2]*25
modified_cost
}
get_modified_cost(0.2)
## [1] 141967
tibble(cutoff = cutoff_values,
cost = sapply(cutoff_values, get_modified_cost))
## Warning in confusionMatrix.default(factor(pred_readmit),
## factor(readmission$Readmission.Status)): Levels are not in the same order for
## reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(factor(pred_readmit),
## factor(readmission$Readmission.Status)): Levels are not in the same order for
## reference and data. Refactoring data to match.
## # A tibble: 11 × 2
## cutoff cost
## <dbl> <dbl>
## 1 1 210225
## 2 0.4 203799
## 3 0.3 180869
## 4 0.2 141967
## 5 0.1 108038
## 6 0.09 106426
## 7 0.08 105952
## 8 0.075 106227
## 9 0.07 106855
## 10 0.05 110386
## 11 0 133552
cutoff <- 0.08
pred_readmit <- 1*(pred_full > cutoff)
cm <- confusionMatrix(factor(pred_readmit),factor(readmission$Readmission.Status))
cm$table
## Reference
## Prediction 0 1
## 0 26381 1094
## 1 31986 7315