# ETHICS FUNCTIONS # by Philippe De Brouwer # 2022-10-23 #' return a data.frame of counts and percentages and rownames #' predicated on an outcome variable (y) #' #' @description #' return a table of counts and percentages, given two categorical variables. #' #' @returns a table with counts and percentages #' #' @param x the rows of the output table #' @param y the columns of the output table #' # Note: we will depreciate this and use summarytools pct_table <- function(x, y, round_digits = 2) { x <- table(x, y) %>% as.array() col_names <- colnames(x) x <- cbind(x, apply(x, 1, sum)) %>% as.data.frame() colnames(x)[3] <- "sum" x[[paste0("pct_", col_names[1])]] <- x[[col_names[1]]] / x$sum x[[paste0("pct_", col_names[2])]] <- x[[col_names[2]]] / x$sum return(round(x,2)) } dollars_to_numeric = function(input) { out = sub("\\$", "", input) out = as.numeric(sub(",", "", out)) return(out) } # Replace spaces with underscores space_to_underscore = function(input) { out = sub(" ", "_", input) return(out) } make_WOE_table <- function(df, y) { WOE_tbl <- data.frame(varName = character(0), IV = numeric(0)) for (col in colnames(df)) { if (!(col == y) & (is.factor(df[[col]]))) { wt <- WOETable(df[[col]], df[[y]], valueOfGood = 1) tmp <- data.frame(varName = col, IV = sum(wt$IV)) WOE_tbl <- rbind(WOE_tbl, tmp) } } WOE_tbl } make_dependency_plot <- function(data, x, y, title, method = "loess", q_ymax = 1, q_xmax = 1) { qplot(data = data, get(x), get(y), xlab = "", ylab = "", alpha = 0.001) + geom_smooth(method = method, size = 1.5, span = 0.85) + ggtitle(title) + ylim(c(0, quantile(data[[y]], probs = c(q_ymax)))) + xlim(c(min(data[[x]]), quantile(data[[x]], probs = c(q_xmax)))) + theme_bw() + theme(legend.position = "none") } make_dependency_hist_plot <- function(data, x, y, title, method = "loess", q_ymax = 1, q_xmax = 1) { p1 <- qplot(data = data, get(x), get(y), xlab = "", ylab = "", alpha = 0.001) + geom_smooth(method = method, size = 1.5, span = 0.85) + ggtitle(title) + ylim(c(0, quantile(data[[y]], probs = c(q_ymax)))) + xlim(c(min(data[[x]]), quantile(data[[x]], probs = c(q_xmax)))) + theme_bw() + theme(legend.position = "none") p2 <- ggplot(data = data, aes(x=get(x))) + geom_histogram(color="darkblue", fill="lightblue") + # ggtitle(title) + xlim(c(min(data[[x]]), quantile(data[[x]], probs = c(q_xmax)))) + theme_bw() + theme(legend.position = "none") grid.arrange(p1, p2, ncol = 2) } # Expands to a factor column binary factor columns # expand_factor2bin <- function(data, col){ col = as.name(col) paste0('~ ',col,' -1', collapse = '') %>% as.formula -> formulae current.na.action <- options('na.action') options(na.action='na.pass') expanded <- model.matrix(data = data, object = formulae) options(na.action=current.na.action) colnames(expanded) <- gsub(replacement = paste0(col,'.'),x = colnames(expanded),pattern=col) expanded %>% dplyr::tbl_df() %>% mutate_each(funs(as.integer)) -> expanded #return(bind_cols(data,expanded)) return(expanded) } ##---- # function to get AUC # USES: library(ROCR) fAUC <- function(model, data, ...) { y <- all.vars(formula(model))[1] pred1 <- predict(model, newdata = data, ...) pred <- ROCR::prediction(pred1, data[[y]]) perf <- performance(pred, "auc") AUC <- attr(perf, "y.values")[[1]] AUC } # The optimal cutoff # We introduce cost.fp to be understood as a the cost of a # false positive, expressed as a multiple of the cost of a # false negative. # get_best_cutoff # Finds a cutoff for the score so that sensitivity and specificity # are optimal. # Arguments # fpr -- numeric vector -- false positive rate # tpr -- numeric vector -- true positive rate # cutoff -- numeric vector -- the associated cutoff values # cost.fp -- numeric -- cost of false positive divided # by the cost of a false negative # (default = 1) # Returns: # the cutoff value (numeric) get_best_cutoff <- function(fpr, tpr, cutoff, cost.fp = 1){ cst <- (cost.fp * fpr - 0)^2 + (tpr - 1)^2 idx = which(cst == min(cst)) c(sensitivity = tpr[[idx]], specificity = 1 - fpr[[idx]], cutoff = cutoff[[idx]]) } # opt_cut_off # Wrapper for get_best_cutoff. Finds a cutof for the score so that # sensitivity and specificity are optimal. # Arguments: # perf -- performance object (ROCR package) # pred -- prediction object (ROCR package) # cost.fp -- numeric -- cost of false positive divided by the # cost of a false negative (default = 1) # Returns: # The optimal cutoff value (numeric) opt_cut_off = function(perf, pred, cost.fp = 1){ mapply(FUN=get_best_cutoff, perf@x.values, perf@y.values, pred@cutoffs, cost.fp) } # clean a string from special characters # specific for our use case though str_clean <- function(x) { x <- gsub("<=",".le.", x) x <- gsub(">=",".ge.", x) x <- gsub(">", ".gt.", x) x <- gsub("<", ".lt.", x) x <- gsub("[.]+", ".", x) x <- gsub("[-]+", ".to.", x) x <- gsub(" ", "", x) x <- gsub("/", ".or.", x) x }