# https://cran.r-project.org/web/packages/scorecard/scorecard.pdf
# https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)

rm(list = ls())
while (!is.null(dev.list())) dev.off()

# install.packages("scorecard")
library(scorecard)

#Q: What are the scorecards used for?
#Q: What regression/classification methods can be used in credit scoring?

data(germancredit)
summary(germancredit)
#Q: What does the dataset contain? Which variables will be the most predictive?
#Q: Are there any concerns regarding use of available data in the model?
#Q: Is the sample representative of the application portfolio?
#Q: What are potential data limitations?

# Let's mess up the dataset a bit to make it look more realistic #################################################
# Copy of the checking-account status with 30 values set to NA:
germancredit$status.of.existing.checking.account_1 <- germancredit$status.of.existing.checking.account
set.seed(1)
rind <- which(germancredit$status.of.existing.checking.account == 'no checking account')
rind <- rind[sort(sample(1:length(rind), 30))]
germancredit$status.of.existing.checking.account_1[rind] <- NA
# Copy of age with 50 values set to NA and three outliers added:
germancredit$age.in.years_1 <- germancredit$age.in.years
rind <- sort(sample(1:1000, 50))
germancredit$age.in.years_1[rind] <- NA
germancredit$age.in.years_1[c(1, 2, 3)] <- c(200, 300, 400)
# Reorder columns so each "messed up" copy sits next to its original:
germancredit <- germancredit[, c(1, 22, 2:13, 23, 14:21)]
summary(germancredit)

# Divide sample into train (80%) and test (20%) ##################################################################
dt_split <- split_df(germancredit, y = NULL, ratio = 0.80, seed = 1)
train <- dt_split$train
test <- dt_split$test
summary(train$creditability)
#Q: Is the split into train and test sample necessary?

# Check if the distributions are comparable across the samples:
par(mfrow = c(2, 1))
barplot(table(train$property), main = "train")
barplot(table(test$property), main = "test")
hist(train$duration.in.month, main = "train")
hist(test$duration.in.month, main = "test")
par(mfrow = c(1, 1))

# Variable selection #############################################################################################
par(mfrow = c(2, 1))
barplot(table(train$property[train$creditability == "bad"]), main = "bad")
barplot(table(train$property[train$creditability == "good"]), main = "good")
hist(train$duration.in.month[train$creditability == "bad"], main = "bad")
hist(train$duration.in.month[train$creditability == "good"], main = "good")
par(mfrow = c(1, 1))

# Calculate Information Value manually:
var <- train$credit.history          # variable we consider
IV_calc <- data.frame(summary(var))  # total number of loans per variable category
colnames(IV_calc) <- c('total_per_type')
IV_calc
IV_calc$dist_loans <- IV_calc$total_per_type / sum(IV_calc$total_per_type)  # (NOT USED IN CALCULATION) distribution of categories
IV_calc
IV_calc$goods <- summary(var[train$creditability == "good"])  # number of good loans per credit-history category
IV_calc
IV_calc$bads <- summary(var[train$creditability == "bad"])    # number of bad loans per credit-history category
IV_calc
IV_calc$perc_bads <- IV_calc$bads / IV_calc$total_per_type    # (NOT USED IN CALCULATION) % of bads in category
IV_calc
IV_calc$dist_bads <- IV_calc$bads / sum(IV_calc$bads)         # distribution of bads across categories
IV_calc
IV_calc$dist_goods <- IV_calc$goods / sum(IV_calc$goods)      # distribution of goods across categories
IV_calc
IV_calc$woe_per_type <- log(IV_calc$dist_goods / IV_calc$dist_bads)  # WoE per category
IV_calc
IV_calc$iv_per_type <- (IV_calc$dist_goods - IV_calc$dist_bads) * IV_calc$woe_per_type  # IV component
IV_calc
sum(IV_calc$iv_per_type)  # IV for the variable
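# Cross-check: the manual calculation above can be wrapped into a small helper and compared
# with the package output below. A minimal sketch; iv_manual is a hypothetical name (not part
# of the scorecard package) and it assumes a factor variable and the "good"/"bad" labels used above:
iv_manual <- function(var, target) {
  dist_goods <- table(var[target == "good"]) / sum(target == "good")  # distribution of goods
  dist_bads  <- table(var[target == "bad"])  / sum(target == "bad")   # distribution of bads
  woe <- log(dist_goods / dist_bads)                                  # WoE per category
  sum((dist_goods - dist_bads) * woe)                                 # sum of IV components
}
iv_manual(train$credit.history, train$creditability)  # should equal sum(IV_calc$iv_per_type)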
# Calculate Information Value for each variable:
iv(train, y = "creditability")
#Q: What does IV mean? When is it considered too low? Can it be too high?
#Q: What is the logic of the IV formula?
# IV = sum( (distBad_i - distGood_i) * log(distBad_i / distGood_i) )
# Worked example (diff = %bads - %goods, rel_diff = %bads / %goods):
#  ID  %bads  %goods   diff  rel_diff    log    IV
#   1     1%     11%  -0.10      0.09  -2.40  0.24
#   2     5%     15%  -0.10      0.33  -1.10  0.11
#   3    10%     30%  -0.20      0.33  -1.10  0.22
#   4    20%     30%  -0.10      0.67  -0.41  0.04
#   5    11%      1%   0.10     11.00   2.40  0.24
#   6    53%     13%   0.40      4.08   1.41  0.56
#Q: What variables do we want to exclude from model development?

# Select variables based on standard selection criteria. Creditability becomes numeric:
train_sel <- var_filter(train, "creditability", return_rm_reason = TRUE)
train_sel$rm
train_sel <- var_filter(train, "creditability", return_rm_reason = FALSE)  # rerun so that woebin() works on the filtered data

# WoE, Fine & Coarse classing ####################################################################################
# Do binning for the variables you want to use:
bins <- woebin(train_sel, "creditability")

# Investigate if the bins make sense:
dev.off()
woebin_plot(bins)
# Plots with WoE profile:
for (i in 1:length(bins)) {
  barplot(bins[[i]]$woe, main = bins[[i]]$variable[1], names.arg = bins[[i]]$bin)
}

# Revise the proposed WoE interactively:
breaks_adj <- woebin_adj(train_sel, y = "creditability", bins)
# new bins (to be entered manually for age.in.years, age.in.years_1): 24,27,30,32,37,40,42,50,55
#Q: Why is WoE used instead of raw values? What are pros and cons of this approach?
#Q: What is the impact of the change of binning on IV?

# Replace your data with the final WoE:
bins_final <- woebin(train_sel, y = "creditability", breaks_list = breaks_adj)
train_woe <- woebin_ply(train_sel, bins_final)

# Fitting logistic regression ####################################################################################
# Run glm and fit all of the variables transformed to WoE:
m <- glm(creditability ~ ., family = binomial(), data = train_woe)
summary(m)

# Select a model by AIC:
m_step <- step(m, direction = "both", trace = FALSE)
m <- eval(m_step$call)
summary(m)

# Select an alternative model (commented-out terms are excluded):
m <- glm(creditability ~ status.of.existing.checking.account_woe
         + duration.in.month_woe
         + credit.history_woe
         + purpose_woe
         + credit.amount_woe
         + savings.account.and.bonds_woe
         + present.employment.since_woe
         + installment.rate.in.percentage.of.disposable.income_woe
         #+ personal.status.and.sex_woe
         + other.debtors.or.guarantors_woe
         + property_woe
         + age.in.years_woe
         + other.installment.plans_woe
         #+ housing_woe
         , family = binomial(), data = train_woe)
summary(m)

m <- glm(creditability ~ status.of.existing.checking.account_woe
         + duration.in.month_woe
         + credit.history_woe
         + purpose_woe
         + credit.amount_woe
         + savings.account.and.bonds_woe
         #+ present.employment.since_woe
         #+ installment.rate.in.percentage.of.disposable.income_woe
         #+ personal.status.and.sex_woe
         #+ other.debtors.or.guarantors_woe
         #+ property_woe
         + age.in.years_woe
         + other.installment.plans_woe
         #+ housing_woe
         , family = binomial(), data = train_woe)
summary(m)
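# To make the link function explicit, glm's fitted probability can be reproduced by hand
# from the linear predictor. A minimal sketch using only the model m fitted above:
eta <- predict(m, train_woe, type = "link")  # linear predictor: intercept + sum(coef * WoE)
p_manual <- 1 / (1 + exp(-eta))              # inverse logit
all.equal(as.numeric(p_manual),
          as.numeric(predict(m, train_woe, type = "response")))  # should be TRUE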
#Q: Can only linear and monotonic relationships be modelled?

# Additional check - what happens with a U-shaped trend ##########################################################
# With seed = 1, train has 802 rows: 234 bads and 568 goods. Bads get values at both ends
# of the range, goods in the middle, plus 100 random values as noise:
ushape_var <- vector(mode = "numeric", length = 802)
ushape_var[train$creditability == "bad"] <- cbind(runif(117, min = 18, max = 28), runif(117, min = 80, max = 90))
ushape_var[train$creditability == "good"] <- runif(568, min = 28, max = 80)
N_rnd <- 100
ushape_var[sample(seq_along(ushape_var), N_rnd)] <- runif(N_rnd, min = 18, max = 90)
temp <- cbind(train, ushape_var)[, 23:24]  # keep only creditability and the simulated variable
hist(temp$ushape_var[temp$creditability == "bad"])
hist(temp$ushape_var[temp$creditability == "good"])
temp_bins <- woebin(temp, "creditability")
woebin_plot(temp_bins)
barplot(temp_bins$ushape_var$woe, names.arg = temp_bins$ushape_var$bin)
temp_woe <- woebin_ply(temp, temp_bins)
temp <- cbind(temp, temp_woe[, -1])
# Raw variable: the U-shape is invisible to a linear logit...
mt <- glm(creditability ~ ushape_var, family = binomial(), data = temp)
summary(mt)
# ...but the WoE-transformed variable captures it:
mt <- glm(creditability ~ ushape_var_woe, family = binomial(), data = temp)
summary(mt)
#Q: How to compare the scoring models? What model is considered good?

# Metrics for assessment of discriminatory power #################################################################
# ROC curve logic - example
indicator <- c(1,2,2,2,3,3,3,4,4,4,4,4,5,5,5,5,6,6,6,6,6,7,7,7,8,8,8,9,9,10)
bad_flag  <- c(1,1,1,1,1,1,1,1,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0)
temp <- table(indicator, bad_flag)
temp <- as.data.frame(cbind(temp[, 1], temp[, 2]))
colnames(temp) <- c('Ng', 'Nb')     # number of goods / bads per indicator value
temp$fp <- cumsum(temp$Ng)          # false positives when cutting off at each indicator value
temp$tp <- cumsum(temp$Nb)          # true positives
temp$fpr <- temp$fp / sum(temp$Ng)  # false positive rate
temp$tpr <- temp$tp / sum(temp$Nb)  # true positive rate
View(temp)
par(mfrow = c(2, 1))
barplot(temp$Ng, names.arg = c(1:10), ylim = c(0, 5), col = "lightblue")
barplot(temp$Nb, names.arg = c(1:10), ylim = c(0, 5), col = "pink")
par(mfrow = c(1, 1))
plot(temp$fpr, temp$tpr, type = "b")
perf_eva(indicator, bad_flag, show_plot = 'roc')

# KS logic - example
plot(1:10, temp$fpr, type = "l", col = "blue")
lines(1:10, temp$tpr, col = "red")
perf_eva(indicator, bad_flag, show_plot = 'ks')

# Gini logic - example
perfm <- perf_eva(indicator, bad_flag, show_plot = 'lz')
perfm$binomial_metric$dat$Gini
2 * perfm$binomial_metric$dat$AUC - 1  # Gini = 2 * AUC - 1

# Performance assessment #########################################################################################
# predicted probability on the training sample
train_pred <- predict(m, train_woe, type = 'response')
perf_eva(train_pred, train_woe$creditability, show_plot = c('roc', 'ks', 'lz'))
# filter variables in the test sample
test_sel <- subset(test, select = colnames(train_sel))
# create WoE for the test sample
test_woe <- woebin_ply(test_sel, bins_final)
# predicted probability on the test sample
test_pred <- predict(m, test_woe, type = 'response')
# model validation on the test sample
perf_eva(test_pred, test_woe$creditability, show_plot = c('roc', 'ks', 'lz'))
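# Putting train and test metrics side by side makes the overfitting check explicit.
# A minimal sketch reusing the binomial_metric$dat structure from the Gini example above:
perf_train <- perf_eva(train_pred, train_woe$creditability, show_plot = 'roc')
perf_test  <- perf_eva(test_pred, test_woe$creditability, show_plot = 'roc')
perf_train$binomial_metric$dat  # train KS / AUC / Gini
perf_test$binomial_metric$dat   # test metrics; a large drop vs train suggests overfitting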
#Q: Why is the model called scorecard? Where are the scores?

# Create a scorecard #############################################################################################
# Create a scorecard (a score of 200 corresponds to odds of 10:1; pdo = 20 points to double the odds):
card <- scorecard(bins_final, m, points0 = 200, odds0 = 1/10, pdo = 20, basepoints_eq0 = FALSE)
cal_factor <- 20 / log(2)                 # 20 points to double the odds
cal_offset <- 200 - cal_factor * log(10)  # score of 200 gives 10:1 odds

# Calculate manually the increase in score between categories:
slope <- -m$coefficients["status.of.existing.checking.account_woe"]  # minus, since we want the score to rise with the probability of a good customer
cats <- sort(unique(train_woe$status.of.existing.checking.account_woe))
score <- cal_factor * (cats * slope) - cal_offset
score
score <- score - min(score)  # the offset cancels; only differences between categories matter
score
# Compare with the scorecard-generated points:
score_card <- card$status.of.existing.checking.account$points - min(card$status.of.existing.checking.account$points)
score_card <- sort(score_card, decreasing = TRUE)
score_card

# Score accounts from the given dataset:
score1 <- scorecard_ply(train, card, only_total_score = FALSE)
View(score1)
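# A common follow-up check: stability of the score distribution between train and test (PSI).
# A minimal sketch using the package's perf_psi; it assumes the card, train and test objects
# created above. Labels are omitted here; they can be supplied via the label argument to split
# the good/bad distributions. A PSI below roughly 0.1 is usually read as a stable population:
score_list <- list(train = scorecard_ply(train, card),
                   test  = scorecard_ply(test, card))
psi <- perf_psi(score = score_list)
psi$psi  # PSI per sample comparison
psi$pic  # score distribution plots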