# Setup ----
options(scipen=10000000)
#install.packages("gtsummary")
library(gtsummary) #for table cause broom always f's everything up
library(lubridate)
library(tidyverse)
library(kableExtra)
library(knitr)
library(pscl)
library(plotROC)
library(pROC)
library(tidyr)
library(caret)
library(dplyr)
library(gridExtra)
<- c("#648FFF","#FFB000","#163c5b","#BA5A31","#823038")
palette5 <- c("#648FFF","#FFB000","#163c5b","#BA5A31")
palette4 <- c("#648FFF","#FFB000")
palette2
# Functions ----
#plotTheme FOR CHAPTER 1, 6, 7 = title_size = 16
#plotTheme FOR CHAPTER 2 = title_size = 24
#plotTheme FOR CHAPTER 3,4,5,8 = title_size = 14
plotTheme <- function(base_size = 12, title_size = 16) {
  theme(
text = element_text( color = "black"),
plot.title = element_text(size = title_size, colour = "black"),
plot.subtitle = element_text(face="italic"),
plot.caption = element_text(hjust=0),
axis.ticks = element_blank(),
panel.background = element_blank(),
panel.grid.major = element_line("grey80", size = 0.1),
panel.grid.minor = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=2),
strip.background = element_rect(fill = "grey80", color = "white"),
strip.text = element_text(size=12),
axis.title = element_text(size=12),
axis.text = element_text(size=10),
plot.background = element_blank(),
legend.background = element_blank(),
legend.title = element_text(colour = "black", face = "italic"),
legend.text = element_text(colour = "black", face = "italic"),
strip.text.x = element_text(size = 14)
)
}
# Iterate Thresholds Chapter 6, 7 (left in Chapters)
iterateThresholds <- function(data, observedClass, predictedProbs, group) {
  # This function takes as its inputs a data frame with an observed binomial class (1 or 0),
  # a vector of predicted probabilities, and optionally a group indicator like race.
  # It returns accuracy plus counts and rates of confusion matrix outcomes.
  # It's a bit verbose because of the if (missing(group)); I don't know another way
  # to make an optional parameter.
  observedClass <- enquo(observedClass)
  predictedProbs <- enquo(predictedProbs)
  group <- enquo(group)
  x = .01
  all_prediction <- data.frame()

  if (missing(group)) {
    while (x <= 1) {
      this_prediction <- data.frame()

      this_prediction <-
        data %>%
        mutate(predclass = ifelse(!!predictedProbs > x, 1, 0)) %>%
        count(predclass, !!observedClass) %>%
        summarize(Count_TN = sum(n[predclass==0 & !!observedClass==0]),
                  Count_TP = sum(n[predclass==1 & !!observedClass==1]),
                  Count_FN = sum(n[predclass==0 & !!observedClass==1]),
                  Count_FP = sum(n[predclass==1 & !!observedClass==0]),
                  Rate_TP = Count_TP / (Count_TP + Count_FN),
                  Rate_FP = Count_FP / (Count_FP + Count_TN),
                  Rate_FN = Count_FN / (Count_FN + Count_TP),
                  Rate_TN = Count_TN / (Count_TN + Count_FP),
                  Accuracy = (Count_TP + Count_TN) /
                             (Count_TP + Count_TN + Count_FN + Count_FP)) %>%
        mutate(Threshold = round(x, 2))

      all_prediction <- rbind(all_prediction, this_prediction)
      x <- x + .01
    }
    return(all_prediction)
  }
  else if (!missing(group)) {
    while (x <= 1) {
      this_prediction <- data.frame()

      this_prediction <-
        data %>%
        mutate(predclass = ifelse(!!predictedProbs > x, 1, 0)) %>%
        group_by(!!group) %>%
        count(predclass, !!observedClass) %>%
        summarize(Count_TN = sum(n[predclass==0 & !!observedClass==0]),
                  Count_TP = sum(n[predclass==1 & !!observedClass==1]),
                  Count_FN = sum(n[predclass==0 & !!observedClass==1]),
                  Count_FP = sum(n[predclass==1 & !!observedClass==0]),
                  Rate_TP = Count_TP / (Count_TP + Count_FN),
                  Rate_FP = Count_FP / (Count_FP + Count_TN),
                  Rate_FN = Count_FN / (Count_FN + Count_TP),
                  Rate_TN = Count_TN / (Count_TN + Count_FP),
                  Accuracy = (Count_TP + Count_TN) /
                             (Count_TP + Count_TN + Count_FN + Count_FP)) %>%
        mutate(Threshold = round(x, 2))

      all_prediction <- rbind(all_prediction, this_prediction)
      x <- x + .01
    }
    return(all_prediction)
  }
}
The Department of Housing and Community Development (HCD) in Emil City has no doubt that their home repair tax credit program does good and is of social benefit. But they're not so sure about their tactics for recruiting eligible homeowners. This report explores the usefulness of employing an algorithm to proactively identify eligible homeowners who are more likely to accept a home repair tax credit. In doing so, HCD will know where to direct their limited marketing resources for the most gain.
##### Load Data ----
<- read.csv("https://raw.githubusercontent.com/bri-ne/Public-Policy-Analytics-Landing/master/DATA/Chapter6/housingSubsidy.csv")
data <-data%>%mutate(pdays = ifelse(pdays>900, 0, pdays))
data
#### Data Prodding Exploration ----
<- data[["cons.conf.idx"]]%>%unique()
cons.conf
%>%length()
cons.conf
<- data[["cons.price.idx"]]%>%unique()
cons.price %>%length()
cons.price<- data[["unemploy_rate"]]%>%unique()
unemp %>%length()
unemp
##seems like 26 months of cons confidence and price data, assuming no same values in a month
##seems like 24 months of unemployment data, assuming no same values in a month
#### Statistical Exploration ----
#### Simplified Cats and New Cats Added
data <- data %>% mutate(edu.redux = ifelse(grepl("basic|illi|unk", education), 'belowHS',
                                    ifelse(grepl("high", education), 'high.school', 'above.hs')),
                        atleast.bach = ifelse(grepl("uni|prof", education), 'yes', 'no'))
#inspect datatypes
sapply(data, class)
#Create num data and cat data dfs
catdata <- data %>% dplyr::select(where(is.character), y)
numdata <- data %>% dplyr::select(where(is.numeric), where(is.integer), y, -y_numeric)
#### Chi-squared (for TWO CAT VARS) ----
library(gmodels)
#Create empty list to hold looped results
cattestvals <- list()
#forloop over columns
for (i in 1:ncol(catdata)) {
  cattestvals[[i]] <- CrossTable(catdata$y, catdata[, i],
                                 prop.c = TRUE,
                                 prop.r = FALSE,
                                 prop.t = FALSE,
                                 prop.chisq = FALSE,
                                 chisq = TRUE)
}
#print results
for(i in 1:length(cattestvals)){
print(i)
print(cattestvals[[i]]$chisq)
}
#looks like these are the significant cat vars, getting rid of other old edu var
#b/c it is less sig
catdata.sig <- catdata %>% dplyr::select(1, 2, 4, 6, 7, 8, 10, 11, 13)

catcol <- colnames(catdata.sig)
#just double checking ----
cattestvals.2 <- list()
for (i in 1:ncol(catdata.sig)) {
  cattestvals.2[[i]] <- CrossTable(catdata.sig$y, catdata.sig[, i],
                                   prop.c = TRUE,
                                   prop.r = FALSE,
                                   prop.t = FALSE,
                                   prop.chisq = FALSE,
                                   chisq = TRUE)
}
#print results
for(i in 1:length(cattestvals.2)){
print(i)
print(cattestvals.2[[i]]$chisq)
}
#### Tapply (Binary + Continuous VARS) -----
#tapply(data$spent_on_repairs, data$y, mean)
#We can also get standard deviations
#tapply(data$spent_on_repairs, data$y, sd)
#Create empty list to hold looped results
#numtestvals<- list()
#forloop over columns
#for(i in 1:ncol(numdata)) { # for-loop over columns
# numtestvals[[i]] <- t.test(numdata[ ,i]~numdata$y)
#}
#print results
#for(i in 1:length(numtestvals)){
# print(i)
# print(numtestvals[[i]]$p.value)
#
#}
#looks like all are significant except the index,
#but not all of these make sense to me
#for example the negative inflation
#and the
As of now only 11% of eligible homeowners that are contacted accept the credit, but with the use of a predictive algorithm, HCD hopes to increase the percentage of homeowners that accept the credit to 25%. This algorithm will use various variables, both categorical and numeric, to predict which eligible homeowners are most likely to accept home repair tax credits. The nature of the question we are trying to answer, being either 'yes, accept the credit' or 'no, do not accept the credit', means that we will be using logit regression.
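To make the modeling choice concrete, here is a minimal, hypothetical sketch of a binomial glm with a logit link; the three predictors are placeholders only, and the models reported below use the full feature sets developed in this section.

# placeholder predictors for illustration; the real models are built further down
logit_sketch <- glm(y_numeric ~ age + campaign + unemploy_rate,
                    data = data, family = binomial(link = "logit"))
summary(logit_sketch)   # coefficients are on the log-odds scale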
In Fig. 1a, all the available categorical variables are evaluated for use, though some have been combined beforehand. In Fig. 1b, the categorical variables that may have some predictive power are pictured. If a variable is substantially different for `yes` and `no` outcomes, then it might have some predictive power. This is shown by the count of each outcome associated with each variable class.
Of the variables, or features, shown in Fig.1b, only a few were kept for the resulting two algorithms. Some features were not included either because they did not prove to be substantially different for `yes` and `no` outcomes, or because the nature of the data made the feature difficult to interpret. For example, whether a homeowner had a lien on their house (`taxLien`) was not included due to the large number of unknown entries.
A similar process was followed for identifying numeric variables, but in addition to looking at counts we visualized the distribution of features with `yes` and `no` results. Fig.2a – Fig.2d show the results for numeric variables.
In addition to visualizing the data in this way, we ran a quick statistical analysis using chi-squared tests for the categorical variables and t-tests for the numeric variables. (You can see this in the first hidden code chunk under the Features section. Note: the t-test section threw an error when knitting, so it's commented out.)
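As a hedged sketch of how the commented-out t-test loop could be made to knit, one option is to restrict the loop to the numeric columns of numdata (the assumption being that the original error came from non-numeric columns such as y):

# run a two-sample t-test of each numeric column against the yes/no outcome
num_cols <- names(numdata)[sapply(numdata, is.numeric)]
numtestvals <- lapply(num_cols, function(col) t.test(numdata[[col]] ~ numdata$y))
names(numtestvals) <- num_cols
# print the p-value for each numeric variable
for (col in num_cols) {
  print(col)
  print(numtestvals[[col]]$p.value)
}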
There were some interesting conflicts between the two kinds of plots for the numeric variables. The quick and dirty statistical analysis we ran, as well as Fig.2b, show `spent_on_repairs`, `cons.conf.idx`, and `cons.price.idx` to have some interesting differences between `yes` and `no` outcomes. But the summary mean bar graph in Fig.2a does not show this difference for those variables. The opposite is true for the `campaign` variable.
Below are the results for the first model, which incorporated almost all of the original features. As we can see from the summary table, not all of the variables are significant. For the next model, some features will be kept, and new ones will be created.
## Final Data Set
data.sig <- numdata %>% dplyr::select(X, contvars) %>% cbind(catdata.sig %>% dplyr::select(catvars))
data.sig <- data.sig %>% cbind(data %>% dplyr::select(y, y_numeric))
## Partition
set.seed(2121)
trainIndex <- createDataPartition(data.sig$y, p = .65,
                                  list = FALSE,
                                  times = 1)
dataTrain <- data.sig[ trainIndex,]
dataTest  <- data.sig[-trainIndex,]
## First REG
Model1 <- glm(y_numeric ~ .,
              data = dataTrain %>%
                dplyr::select(-X, -y),   #### Just getting rid of index
              family = binomial(link = "logit"))

#summary(Model1)
summary1.table <- tbl_regression(Model1, exponentiate = TRUE)
## TEST PROBS
testProbs1 <- data.frame(Outcome = as.factor(dataTest$y_numeric),
                         Probs = predict(Model1, dataTest, type = "response"))
## CONFUSION MATRIX: SENS & SPECS
#first set threshold
whichThreshold1 <-
  iterateThresholds(data = testProbs1, observedClass = Outcome, predictedProbs = Probs)

testProbs1 <-
  testProbs1 %>%
  mutate(predOutcome = as.factor(ifelse(testProbs1$Probs > 0.5, 1, 0)))
#it doesn't really matter where I set it b/c the model is bad
#Accuracy
#(True Positives + True Negatives) / Total Observations
#Sensitivity (True Positive Rate)
#True Positives / Observed Positives
#Specificity (True Negative Rate)
#True Negatives / Observed Negatives
## fitting null model for pseudo-r2
## llh llhNull G2 McFadden r2ML r2CU
## -725.3366731 -926.8710680 403.0687898 0.2174352 0.1396835 0.2797012
Characteristic | OR¹ | 95% CI¹ | p-value |
---|---|---|---|
pdays | 1.13 | 1.04, 1.23 | 0.007 |
previous | 1.03 | 0.83, 1.29 | 0.8 |
campaign | 0.95 | 0.87, 1.02 | 0.2 |
spent_on_repairs | 1.01 | 1.00, 1.02 | 0.037 |
cons.conf.idx | 1.10 | 1.06, 1.15 | <0.001 |
cons.price.idx | 11.2 | 5.35, 23.5 | <0.001 |
unemploy_rate | 0.28 | 0.17, 0.44 | <0.001 |
inflation_rate | 0.82 | 0.46, 1.44 | 0.5 |
age | 1.02 | 1.00, 1.03 | 0.049 |
marital | |||
divorced | — | — | |
married | 1.19 | 0.74, 1.98 | 0.5 |
single | 1.49 | 0.86, 2.64 | 0.2 |
unknown | 0.00 | | >0.9 |
contact | |||
cellular | — | — | |
telephone | 0.25 | 0.16, 0.39 | <0.001 |
atleast.bach | |||
no | — | — | |
yes | 0.85 | 0.61, 1.18 | 0.3 |
job | |||
admin. | — | — | |
blue-collar | 0.62 | 0.39, 0.97 | 0.039 |
entrepreneur | 0.59 | 0.21, 1.39 | 0.3 |
housemaid | 0.43 | 0.13, 1.15 | 0.12 |
management | 0.65 | 0.36, 1.12 | 0.13 |
retired | 0.30 | 0.13, 0.66 | 0.004 |
self-employed | 0.59 | 0.23, 1.32 | 0.2 |
services | 0.71 | 0.40, 1.23 | 0.2 |
student | 0.52 | 0.21, 1.21 | 0.14 |
technician | 1.00 | 0.65, 1.52 | >0.9 |
unemployed | 1.15 | 0.52, 2.38 | 0.7 |
unknown | 0.58 | 0.15, 1.84 | 0.4 |
taxbill_in_phl | |||
no | — | — | |
yes | 1.08 | 0.76, 1.57 | 0.7 |
¹ OR = Odds Ratio, CI = Confidence Interval
The plot below shows the distribution of our predicted values compared to the actual values. The first model seems to predict well for negative results, but not well at all for positive results.
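A sketch of how that plot can be produced, assuming the testProbs1 data frame and palette2 defined earlier:

# density of predicted probabilities, split by the observed outcome, for Model 1
ggplot(testProbs1, aes(x = Probs, fill = Outcome)) +
  geom_density(alpha = 0.5) +
  scale_fill_manual(values = palette2, name = "Observed outcome") +
  labs(title = "Distribution of Predicted Probabilities by Observed Outcome, Model 1",
       x = "Predicted probability", y = "Density") +
  plotTheme()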
#### NEW FEATS & SECOND REG ----
#### DON'T FORGET TO REPARTITION
## doing this now so I don't overfit my model; will partition again using the
## same seed, so that I can incorporate my new feats into the cv
data.sig2 <- data.sig

set.seed(2121)
trainIndex2 <- createDataPartition(data.sig2$y, p = .65,
                                   list = FALSE,
                                   times = 1)
dataTrain2 <- data.sig2[ trainIndex2,]
dataTest2  <- data.sig2[-trainIndex2,]
## make sure to interpret
## Feature 1: Telling month: were they last contacted in the summer? Y/N **
"month"]]%>%unique()
data[[
#getting just the month an dbinding with datasig
<-data%>%dplyr::select(month)%>%cbind(data.sig)
data.sig2
<- data.sig2%>%mutate(telling_mo = ifelse(grepl("j|aug|may",data.sig2$month), 'yes', 'no'))%>%dplyr::select(-month)
data.sig2
# **while month data wasn't incorporated into the first model, b/c it didn't prove
# to be sig, a more reduced version of the data will be incorporated into the new
# model and it is created based on what we saw in the visual of the bar graphs.
# the summer months (may-aug) seem to have the biggest difference in y or n
## Feature 2: Contacted: were they contacted at all before? Y/N *
# getting just the poutcome and cbinding with data.sig2
data.sig2 <- data %>% dplyr::select(poutcome) %>% cbind(data.sig2)

data.sig2 <- data.sig2 %>%
  mutate(contacted = ifelse(grepl("non", data.sig2$poutcome), 'no', 'yes')) %>%
  dplyr::select(-poutcome)
# *this seems helpful b/c there's not already an explicit variable saying this
## Feature 3: Optimal_contact: were they contacted the optimal amount of times for this campaign?
#Finding the optimal amount of times in the training set?
#finding the median and mode number of contacts made to those who accepted
#first filter for only accepted
accepted <- dataTrain2 %>% filter(y_numeric == 1)

median.camp <- accepted[["campaign"]] %>% median()

modefn <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}

mode.camp <- accepted[["campaign"]] %>% modefn()
#it seems like most people that accepted the credit were contacted 1 time
#and that the median was being contacted twice, so it might be helpful to say
#if you were contacted between 1 and 2 times, you are more likely to accept the
#credit compared to if you weren't contacted at all or contacted more than twice
data.sig2 <- data.sig2 %>%
  mutate(optimal_contact = ifelse(campaign < 3 & campaign > 0, 'yes', 'no')) %>%
  dplyr::select(-campaign)
## Feature 4: Optimal_spending: Spent more on repairs than the average spent by those that accepted aka Optimal Spending
#looking into the average spent on repairs
<- accepted[["spent_on_repairs"]]%>%mean()
mean.spent <- accepted[["spent_on_repairs"]]%>%median()
median.spent
<- data.sig2%>%mutate(optimal_spending = ifelse(spent_on_repairs < 5092 & spent_on_repairs > 5075, 'yes', 'no'))
data.sig2
## Feature 5: Optimal Age
#it looked like from the density graph of the numeric vars
#that there was difference in outcome in people aged between 35 - 45, 54-56,
#and 61 - 68. the other age ranges seemed to be more similar
# dens plot for age for train2 data
data.sig2 <- data.sig2 %>%
  mutate(suboptimal_age = ifelse(age > 34 & age < 46 | age > 53 & age < 57 | age > 60 & age < 69,
                                 'yes', 'no')) %>%
  dplyr::select(-age)
## Feature 6: Retired or not
data.sig2 <- data.sig2 %>%
  mutate(retired = ifelse(grepl("reti", data$job), 'yes', 'no')) %>%
  dplyr::select(-job)
Given the results of the first model, it was clear that a better model might need new features. Here's a breakdown of what was created. Figures 4 and 5 show the potential these features have.
While month data wasn’t incorporated into the first model, because it didn’t prove to be significant, a more reduced version of the data will be incorporated into the new model and it is created based on what we saw in the visual of the bar graphs in Figure 1.a. The results of yes
or no
seem to be substantially different between Summer months (May – August) and non-summer months. This feature is categorical and established whether a homeowner was last contacted in a Summer month or a non=summer month. I think this might prove useful due to the nature of the product: a tax credit. Depending on how close to tax season a homeowner was last contacted, there may be in difference in the action they take.
While there are variables showing what the outcome of a homeowner's actions was in a previous campaign, there is no explicit feature that shows whether or not they were contacted during, or were part of, a previous campaign. This feature is categorical and defines whether or not a homeowner was contacted before the current campaign.
The number of times that a homeowner was contacted during the campaign may have an effect on whether or not they followed through with accepting the credit. If they were contacted an extraordinary number of times, they might be fed up with hearing from the city and have declined to take the process any further. Or the opposite might be true. In trying to determine the optimal number of contacts, we investigated the number of times each `yes` observation was contacted within our training data. Only training data was used to inform this variable, in the hope that the model would not become over-fit.
How we found the optimal number of contacts in the training set:

- first, filter for only accepted observations, using only the training data set
- then, find the median and mode number of contacts made to those who accepted

It appears that most people who accepted the credit were contacted once, and that the median number of contacts was two. So this feature categorizes those contacted one or two times as a `yes`, and sets any other number of contacts as a `no`.
The same process as above was followed for determining the Optimal Spending feature. The optimal amount spent on home repairs ranges from \$5,075 to \$5,092. If a homeowner spent within that range they were marked as a `yes`, and if they didn't they were marked as a `no`.
Instead of looking at the ages for the `yes` observations, the density graph showing the age variable (from Fig.2a) was used. The graph below shows clear differences in outcome between certain age ranges.
The optimal age ranges were found to be:

- between 35 and 45
- between 54 and 56
- between 61 and 68
This feature was created after iterating through a few more models. The results showed that retirement status was a significant feature in identifying eligible households that would accept the credit. If the homeowner was listed as retired, they were marked as a `yes`, and if not they were marked as a `no`.
All of the new features seem to show a difference between `yes` and `no` outcomes, but through iterating over many models, only a handful of the new features were kept.
Below are the results for the second model, which incorporated some of the original features and only a few of the new ones. This model was arrived at by iterating over several models. As we can see from the summary table, all of the variables are significant. But as the resulting plot shows, this new model still does not yield extremely clear predictions.
## fitting null model for pseudo-r2
## llh llhNull G2 McFadden r2ML r2CU
## -734.6400782 -926.8710680 384.4619795 0.2073978 0.1336874 0.2676947
Characteristic | OR¹ | 95% CI¹ | p-value |
---|---|---|---|
optimal_spending | |||
no | — | — | |
yes | 1.79 | 1.08, 2.96 | 0.022 |
pdays | 1.13 | 1.05, 1.23 | 0.002 |
unemploy_rate | 0.44 | 0.39, 0.50 | <0.001 |
cons.price.idx | 5.14 | 3.73, 7.12 | <0.001 |
cons.conf.idx | 1.07 | 1.05, 1.10 | <0.001 |
contact | |||
cellular | — | — | |
telephone | 0.30 | 0.20, 0.44 | <0.001 |
retired | |||
no | — | — | |
yes | 0.54 | 0.27, 1.01 | 0.064 |
¹ OR = Odds Ratio, CI = Confidence Interval
The plot below shows the distribution of our predicted values compared to the actual values. Similar to the first model, the second model seems to predict well for negative results, but not well at all for positive results.
Below, the ROC curves for both the initial model (the kitchen sink model, Fig.7) and the second, engineered model (Fig.8) are shown. While the curves look similar, the thresholds each is set at are drastically different: the first model uses the default 0.50 threshold, while the second is set at a 0.07 threshold. Both curves fall into the "useful fit" range, meaning they're better than tossing a coin. For the first model, accepting a false positive rate of ~27% only yields a true positive rate of less than 60%. The result is a little better for the engineered model, where a ~27% false positive rate yields a true positive rate of more than 60%. As we get into the cost-benefit analysis, we'll see how these small differences play out.
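A sketch of how the ROC curves could be drawn with plotROC, shown here for the kitchen sink model only; an analogous testProbs data frame is assumed for the engineered model:

# ROC curve for Model 1; the dashed line is the coin-flip reference
ggplot(testProbs1, aes(d = as.numeric(as.character(Outcome)), m = Probs)) +
  geom_roc(n.cuts = 50, labels = FALSE) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey40") +
  labs(title = "ROC Curve, Kitchen Sink Model") +
  plotTheme()
# area under the curve via pROC
pROC::auc(testProbs1$Outcome, testProbs1$Probs)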
Predicted \ Observed | 0 | 1 |
---|---|---|
0 | 1236 | 118 |
1 | 47 | 39 |
Predicted \ Observed | 0 | 1 |
---|---|---|
0 | 940 | 52 |
1 | 343 | 105 |
Below, Fig. 9 and 10 show the cross-validation results for each model. Both have strong predictive power for the `no` outcome (Sensitivity, since `no` is treated as the reference class here) but are much weaker at predicting `yes` (Specificity). The engineered model does appear to have a slightly tighter distribution across folds, which is also reflected in the Sensitivity difference between the engineered model (0.988018) and the kitchen sink model (0.9847523).
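For reference, a minimal sketch of the confusion-matrix summary for the kitchen sink model at the 0.50 threshold, using the predOutcome column created above; note that caret reports Sensitivity for whichever class is passed as positive (here the accepted-credit class, "1"):

caret::confusionMatrix(testProbs1$predOutcome, testProbs1$Outcome, positive = "1")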
## Cross Validation
#define the variables we want
Model1.vars <- colnames(dplyr::select(data.sig, everything(), -y, -X, -y_numeric))

ctrl <- trainControl(method = "cv", number = 100, classProbs = TRUE, summaryFunction = twoClassSummary)

cvFit1 <- caret::train(y ~ .,
                       data = data.sig %>% dplyr::select(all_of(Model1.vars), y),
                       method = "glm", family = "binomial",
                       metric = "ROC", trControl = ctrl)

## Fourth CV **fourth** ----
ctrl <- trainControl(method = "cv", number = 100, classProbs = TRUE, summaryFunction = twoClassSummary)

cvFit4 <- caret::train(y ~ .,
                       data = data.sig2 %>% dplyr::select(all_of(Model4.vars), y),
                       method = "glm", family = "binomial",
                       metric = "ROC", trControl = ctrl)
## Generalized Linear Model
##
## 4119 samples
## 14 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: Cross-Validated (100 fold)
## Summary of sample sizes: 4078, 4077, 4078, 4078, 4078, 4078, ...
## Resampling results:
##
## ROC Sens Spec
## 0.7635931 0.9841967 0.1885
## Generalized Linear Model
##
## 4119 samples
## 7 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: Cross-Validated (100 fold)
## Summary of sample sizes: 4078, 4078, 4078, 4078, 4078, 4079, ...
## Resampling results:
##
## ROC Sens Spec
## 0.7704142 0.9880255 0.1735
In every policy decision there is a trade-off, and that is the case with the two models we've built. Here, the only outcomes that cost HCD funds are True Positives and False Positives. Both require marketing spending, and the True Positives additionally require the cost of the tax credit. The plot below explores how cost differs depending on which threshold is chosen in Models 1 and 2.
In both, False Positives cost the most at lower thresholds, but at a certain point costs start to even out as the numbers of false and true positives equalize.
Below is the breakdown of what a correct or incorrect prediction means and its associated cost. We also included the social benefit of a True Positive result.
A True Positive, or correctly predicted acceptance of the credit, means HCD incurs marketing costs (\$ 2,850) plus the cost of providing the credit for the 25% of homeowners that do in fact accept it (\$5,000 * 0.25). But the social benefits of those 25% True Positives accepting the credit are felt by the homeowner, who will likely see an increase in home value of \$10,000, and by the surrounding homeowners, whose cumulative benefit is \$56,000.
A False Positive, or incorrectly predicted acceptance of the credit, means that HCD incurs the marketing cost only (\$ 2,850) and no social benefit. A False Negative, incorrectly predicting no acceptance of the credit, and a True Negative, correctly predicting no acceptance, mean a \$0 cost for HCD and no social benefit.
Result | Description | Cost | Social Benefit |
---|---|---|---|
True Positive | We predicted the credit was accepted and it was accepted | 2,850 + (5,000 * 0.25) | (10,000 + 56,000) * 0.25 |
True Negative | We predicted the credit was not accepted and it was not | 0 | 0 |
False Positive | We predicted the credit was accepted and it was not | 2,850 | 0 |
False Negative | We predicted the credit was not accepted and it was accepted | 0 | 0 |
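A hedged sketch of how the cost and benefit rules in the table above can be applied to the iterateThresholds() output; whichThreshold4 is assumed to hold that output for the engineered model, and defining Revenue as social benefit net of cost is an assumption of this sketch:

# apply the per-outcome cost/benefit rules at every threshold
whichThreshold4_edit <-
  whichThreshold4 %>%
  mutate(Cost           = Count_TP * (2850 + 5000 * 0.25) + Count_FP * 2850,
         Social_Benefit = Count_TP * 0.25 * (10000 + 56000),
         Revenue        = Social_Benefit - Cost)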
Here the resulting cost from the kitchen sink model (model 1) at the default threshold is shown. At the default threshold, 0.50, there are more False Positives than True Positives. When accounting for the difference in cost to HCD and the social benefit, there is a net loss of \$349,650.00.
Variable | Count | Cost | Social_Benefit | Description |
---|---|---|---|---|
True_Positive | 39 | $ 159900.00 | $ 643500.00 | We correctly predicted credit accepted |
True_Negative | 1236 | $ 0.00 | $ 0.00 | We correctly predicted credit not accepted |
False_Negative | 118 | $ 0.00 | $ 0.00 | We predicted not accepted and customer accepted |
False_Positive | 47 | $ 133950.00 | $ 0.00 | We predicted accepted and customer did not accept |
Subtotal | | $ 293850.00 | $ 643500.00 | |
Total (Cost - Social Benefit) | | $ -349650.00 | | |
Here the resulting cost from the kitchen sink model (model 1) at the optimal threshold is shown. At the optimal threshold, 0.07, there are still more False Positives than True Positives, but more True Positives are identified than at the default threshold. This does mean that the initial costs are higher, but the social benefit is greater. When accounting for the difference in cost to HCD and the social benefit, there is a smaller net loss of \$324,450.00.
The resulting financial differences between the models are:

- Cost to HCD: $1.1 M more is spent with use of Model 2, instead of Model 1
- Social Benefit: $1.09 M more social benefit with use of Model 2, instead of Model 1
- Net Total: $25,200 increase with use of Model 2, instead of Model 1
Variable | Count | Cost | Social_Benefit | Description |
---|---|---|---|---|
True_Positive | 105 | $ 430500.00 | $ 1732500.00 | We correctly predicted credit accepted |
True_Negative | 940 | $ 0.00 | $ 0.00 | We correctly predicted credit not accepted |
False_Negative | 52 | $ 0.00 | $ 0.00 | We predicted not accepted and customer accepted |
False_Positive | 343 | $ 977550.00 | $ 0.00 | We predicted accepted and customer did not accept |
Subtotal | | $ 1408050.00 | $ 1732500.00 | |
Total (Cost - Social Benefit) | | $ -324450.00 | | |
Below, the counts of credits are shown for two threshold settings for Model 2: one at the default 0.50 and the other at the optimal threshold 0.07. More credits are given out at the optimal threshold than at the default, but that does not come without a trade-off.
At the default threshold, the False Positive and True Positive rates are 0.02416 and 0.02416, respectively. That relationship is different at the optimal threshold, with 0.26734 and 0.66879 as the False Positive and True Positive rates, respectively. While the ratio between the False and True Positive rates is not as good at the 0.07 threshold, it is chosen as the optimal threshold because it captures a greater percentage of the True Positives. Given that the goal of the program is to reach more eligible homeowners who are likely to take the credit, I think this is a worthwhile trade-off.
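The rates quoted above can be pulled straight from the iterateThresholds() output; a quick sketch, again assuming whichThreshold4 holds the engineered model's results:

# False/True Positive rates at the default and optimal thresholds
whichThreshold4 %>%
  filter(Threshold %in% c(0.07, 0.50)) %>%
  dplyr::select(Threshold, Rate_FP, Rate_TP)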
# - Create two small multiple plots that show Threshold ----
# as a function of Total_Revenue and Total_Count_of_Credits.
# Interpret this.
##REVENUE
whichThreshold_revenue4 <-
  whichThreshold4_edit %>%
  group_by(Threshold) %>%
  summarize(Revenue = sum(Revenue))

WT_r450.plot <- ggplot(whichThreshold_revenue4) +
  geom_line(aes(x = Threshold, y = Revenue)) +
  geom_vline(xintercept = pull(whichThreshold_revenue4[50, 1])) +
  labs(title = "Model Revenues By Threshold For Engineered Model",
       subtitle = "Vertical Line Denotes Default Threshold",
       caption = paste("Threshold", pull(whichThreshold_revenue4[50, 1]),
                       ", Revenue: $",
                       pull(whichThreshold_revenue4[50, 2])))

WT_r4opt.plot <- ggplot(whichThreshold_revenue4) +
  geom_line(aes(x = Threshold, y = Revenue)) +
  geom_vline(xintercept = pull(whichThreshold_revenue4[7, 1])) +
  labs(title = "Model Revenues By Threshold For Engineered Model",
       subtitle = "Vertical Line Denotes Optimal Threshold",
       caption = paste("Threshold", pull(whichThreshold_revenue4[7, 1]),
                       ", Revenue: $",
                       pull(whichThreshold_revenue4[7, 2])))
##COUNT ----
whichThreshold_count4 <-
  whichThreshold4 %>%
  mutate(Credits = (Count_TP * 0.25)) %>%
  group_by(Threshold) %>%
  summarize(Credits = sum(Credits))

WT_c450.plot <- ggplot(whichThreshold_count4) +
  geom_line(aes(x = Threshold, y = Credits)) +
  geom_vline(xintercept = pull(whichThreshold_count4[50, 1])) +
  labs(title = "Model Credits By Threshold For Engineered Model",
       subtitle = "Vertical Line Denotes Default Threshold",
       caption = paste("Threshold", pull(whichThreshold_count4[50, 1]),
                       ", # of Credits:",
                       round(pull(whichThreshold_count4[50, 2]))))

WT_c4opt.plot <- ggplot(whichThreshold_count4) +
  geom_line(aes(x = Threshold, y = Credits)) +
  geom_vline(xintercept = pull(whichThreshold_count4[7, 1])) +
  labs(title = "Model Credits By Threshold For Engineered Model",
       subtitle = "Vertical Line Denotes Optimal Threshold",
       caption = paste("Threshold", pull(whichThreshold_count4[7, 1]),
                       ", # of Credits:",
                       round(pull(whichThreshold_count4[7, 2]))))
Below, the cost is shown for two threshold settings for Model 2: one at the default 0.50 and the other at the optimal threshold 0.07. The cost is greater at the optimal threshold, but only because more eligible homeowners who are likely to take the credit have been identified, which, after all, is the point of the program. While it's assumed that only 25% of True Positives will take the credit, this seems to be worth the marketing expense, especially when social benefit is considered.
Threshold | Credits | Cost |
---|---|---|
DefaultThreshold (0.50) | 8 | $ 141150.00 |
OptimalThreshold (0.07) | 26 | $ 1145550.00 |
Clearly, this model is not perfect; Fig.6, illustrating the distribution of predictions versus outcomes, shows as much. The model would likely benefit from different data sources, such as whether homeowners have taken other credits, the language spoken at home, and surely others. Filing taxes is not easy for everyone; it takes knowledge of the system and language fluency. Those who have taken other tax credits are probably more likely to take additional credits, purely because they know such credits exist and they may feel more comfortable taking the necessary steps than those who have never taken a credit. Similarly, if you are not fluent in the language of the State, you might not be comfortable enough to make financial decisions based on information in that language.
However, working on the assumption that at least 25% of the identified homeowners will take the credit, this model is worth employing. While at the current threshold (0.07) this model will cost HCD a little more than 1.14 million dollars, the number of homeowners accepting the credit will be greater, and that is the goal. There is a cost associated with reaching more people, but this model is still useful at other thresholds. If HCD faces other limitations, such as funding, the threshold can be adjusted so that both the limitations and the goals are met. This is, of course, in relation to the status quo: if HCD reaches more eligible homeowners likely to take the credit with this model set to a different threshold than they would otherwise, this is still a win.
There might even be a use for another model built around A/B testing of marketing materials. This could be used to answer the question: "what kind of information do people respond to?" It could even be used to target specific kinds of homeowners who might otherwise be left out due to the design of marketing materials, for example by providing materials in several languages. There is plenty of room for statistics in policy, especially in program delivery; this is just one case that would benefit.