# ETHICS FUNCTIONS # by Philippe De Brouwer # created: 2022-10-23 # last update: 2025-11-06 #' return a data.frame of counts and percentages and rownames #' predicated on an outcome variable (y) #' #' @description #' return a table of counts and percentages, given two categorical variables. #' #' @returns a table with counts and percentages #' #' @param x the rows of the output table #' @param y the columns of the output table #' # Note: we will depreciate this and use summarytools pct_table <- function(x, y, round_digits = 2) { x <- table(x, y) %>% as.array() col_names <- colnames(x) x <- cbind(x, apply(x, 1, sum)) %>% as.data.frame() colnames(x)[3] <- "sum" x[[paste0("pct_", col_names[1])]] <- x[[col_names[1]]] / x$sum x[[paste0("pct_", col_names[2])]] <- x[[col_names[2]]] / x$sum return(round(x,2)) } #' Convert dollar amounts stored as strings to numeric values #' #' @description #' This function takes a character vector with dollar amounts and converts them #' to numeric values by removing dollar signs and commas. #' #' @param input A character vector containing dollar amounts, e.g. "$1,234.56" #' #' @returns A numeric vector representing the dollar values. dollars_to_numeric = function(input) { out = sub("\\$", "", input) out = as.numeric(sub(",", "", out)) return(out) } #' Replace first space in strings with underscore #' #' @description #' This function replaces the first space character in each string of the input vector with an underscore. #' #' @param input A character vector. #' #' @returns A character vector with the first space replaced by an underscore. space_to_underscore = function(input) { out = sub(" ", "_", input) return(out) } #' Create Weight of Evidence (WOE) table for categorical variables in a dataframe #' #' @description #' This function calculates the Information Value (IV) for each factor variable in the dataframe (excluding #' the outcome variable y) using Weight of Evidence encoding, returning a summary table of IVs per variable. #' #' @param df A dataframe containing factor predictor variables and outcome variable. #' @param y A string specifying the name of the binary outcome variable in df. #' #' @returns A dataframe with variable names and their corresponding summed IV. make_WOE_table <- function(df, y) { WOE_tbl <- data.frame(varName = character(0), IV = numeric(0)) for (col in colnames(df)) { if (!(col == y) & (is.factor(df[[col]]))) { wt <- WOETable(df[[col]], df[[y]], valueOfGood = 1) tmp <- data.frame(varName = col, IV = sum(wt$IV)) WOE_tbl <- rbind(WOE_tbl, tmp) } } WOE_tbl } #' Generate dependency plot with smoothing curve for variables x and y in a dataset #' #' @description #' This function creates a scatter plot of x vs. y with a smooth trend line (default loess), #' supporting quantile-based axis limits for improved visualization. #' #' @param data A dataframe containing the variables to plot. #' @param x A string specifying the x-variable column name. #' @param y A string specifying the y-variable column name. #' @param title A string for the plot title. #' @param method The smoothing method to use in geom_smooth (default: "loess"). #' @param q_ymax Numeric quantile to set maximum y-axis limit (default: 1). #' @param q_xmax Numeric quantile to set maximum x-axis limit (default: 1). #' #' @returns A ggplot object representing the dependency plot. make_dependency_plot <- function(data, x, y, title, method = "loess", q_ymax = 1, q_xmax = 1) { qplot(data = data, get(x), get(y), xlab = "", ylab = "", alpha = 0.001) + geom_smooth(method = method, size = 1.5, span = 0.85) + ggtitle(title) + ylim(c(0, quantile(data[[y]], probs = c(q_ymax)))) + xlim(c(min(data[[x]]), quantile(data[[x]], probs = c(q_xmax)))) + theme_bw() + theme(legend.position = "none") } #' Generate combined dependency scatter plot and histogram for a variable x against y #' #' @description #' Generates a two-panel plot: a scatter plot with a smooth curve for x vs y, and a histogram for x distribution. #' Supports axis limits via quantiles. #' #' @param data A dataframe containing the variables. #' @param x A string specifying the x-variable column name. #' @param y A string specifying the y-variable column name. #' @param title A string for the smooth plot title. #' @param method The smoothing method for geom_smooth (default: "loess"). #' @param q_ymax Numeric quantile to set max y-axis limit (default: 1). #' @param q_xmax Numeric quantile to set max x-axis limit (default: 1). #' #' @returns A grid arranged plot combining scatter and histogram. make_dependency_hist_plot <- function(data, x, y, title, method = "loess", q_ymax = 1, q_xmax = 1) { p1 <- qplot(data = data, get(x), get(y), xlab = "", ylab = "", alpha = 0.001) + geom_smooth(method = method, size = 1.5, span = 0.85) + ggtitle(title) + ylim(c(0, quantile(data[[y]], probs = c(q_ymax)))) + xlim(c(min(data[[x]]), quantile(data[[x]], probs = c(q_xmax)))) + theme_bw() + theme(legend.position = "none") p2 <- ggplot(data = data, aes(x=get(x))) + geom_histogram(color="darkblue", fill="lightblue") + # ggtitle(title) + xlim(c(min(data[[x]]), quantile(data[[x]], probs = c(q_xmax)))) + theme_bw() + theme(legend.position = "none") grid.arrange(p1, p2, ncol = 2) } #' Expand factor column to binary (one-hot encoded) columns #' #' @description #' This function takes a dataset and a factor column name, returning a dataframe with one-hot encoded columns #' for each level of the factor. The result contains integer 0/1 indicators. #' #' @param data A dataframe containing the factor column. #' @param col A string specifying the name of the factor column to expand. #' #' @returns A tibble dataframe with binary one-hot encoded columns. expand_factor2bin <- function(data, col){ col = as.name(col) paste0('~ ',col,' -1', collapse = '') %>% as.formula -> formulae current.na.action <- options('na.action') options(na.action='na.pass') expanded <- model.matrix(data = data, object = formulae) options(na.action=current.na.action) colnames(expanded) <- gsub(replacement = paste0(col,'.'),x = colnames(expanded),pattern=col) expanded %>% dplyr::tbl_df() %>% mutate_each(funs(as.integer)) -> expanded #return(bind_cols(data,expanded)) return(expanded) } #' Calculate the Area Under the Curve (AUC) for a binary classification model #' #' @description #' This function computes the AUC metric, which summarizes the ROC curve performance, #' for a set of predicted probabilities and true binary outcomes. #' It utilizes the \code{ROCR} package to create prediction objects and calculate AUC. #' #' @param pred A numeric vector of predicted probabilities or scores for the positive class. #' @param true_outcome A binary vector (factor or numeric) of true class labels, where positive #' class is typically coded as 1. #' #' @returns A numeric scalar representing the AUC value, between 0 and 1. #' #' @examples #' \dontrun{ #' library(ROCR) #' pred <- c(0.1, 0.4, 0.35, 0.8) #' true <- c(0, 0, 1, 1) #' auc <- get_auc(pred, true) #' print(auc) #' } fAUC <- function(model, data, ...) { y <- all.vars(formula(model))[1] pred1 <- predict(model, newdata = data, ...) # Convert the response variable to numeric if it's a factor actual <- data[[y]] if (is.factor(actual)) { actual <- as.numeric(as.character(actual)) } else if (is.character(actual)) { actual <- as.numeric(actual) } pred <- ROCR::prediction(pred1, actual) perf <- ROCR::performance(pred, "auc") AUC <- attr(perf, "y.values")[[1]] return(as.numeric(AUC)) } #' Find the optimal cutoff value balancing sensitivity and specificity #' #' @description #' Calculates an optimal threshold for a classifier score by minimizing the #' squared distance to perfect sensitivity (1) and specificity (1), #' while incorporating a cost ratio for false positives relative to false negatives. #' #' @param fpr Numeric vector of false positive rates corresponding to cutoff values. #' @param tpr Numeric vector of true positive rates corresponding to cutoff values. #' @param cutoff Numeric vector of cutoff values associated with FPR and TPR. #' @param cost.fp Numeric scalar representing the cost of a false positive #' relative to the cost of a false negative (default is 1). #' #' @returns A named numeric vector with: #' \describe{ #' \item{sensitivity}{The true positive rate at optimal cutoff} #' \item{specificity}{The true negative rate (1 - false positive rate) at optimal cutoff} #' \item{cutoff}{The cutoff value that optimizes the balance given the cost ratio} #' } get_best_cutoff <- function(fpr, tpr, cutoff, cost.fp = 1){ cst <- (cost.fp * fpr - 0)^2 + (tpr - 1)^2 idx = which(cst == min(cst)) c(sensitivity = tpr[[idx]], specificity = 1 - fpr[[idx]], cutoff = cutoff[[idx]]) } #' Wrapper function to find optimal cutoff given ROCR performance and prediction objects #' #' @description #' Uses ROCR package performance and prediction objects to extract FPR, TPR, and cutoff values, #' then applies \code{get_best_cutoff} to find the score threshold optimizing sensitivity and specificity #' under the specified false positive cost ratio. #' #' @param perf A ROCR performance object (typically obtained using \code{performance(pred, "tpr", "fpr")}). #' @param pred A ROCR prediction object. #' @param cost.fp Numeric scalar giving the cost multiplier of a false positive relative to a false negative (default 1). #' #' @returns A numeric vector of optimal cutoff values for each curve in the performance object. opt_cut_off = function(perf, pred, cost.fp = 1){ mapply(FUN=get_best_cutoff, perf@x.values, perf@y.values, pred@cutoffs, cost.fp) } #' Clean strings by replacing specific special characters with standardized tokens #' #' @description #' Cleans input strings by substituting common special characters such as inequalities, spaces, #' dashes, and slashes with descriptive tokens or removing spaces, to produce standardized, machine-friendly strings. #' #' This function is tailored to the use cases where variable or feature names must be sanitized. #' #' @param x A character vector containing strings to be cleaned. #' #' @returns A character vector with special characters replaced or removed. str_clean <- function(x) { x <- gsub("<=",".le.", x) x <- gsub(">=",".ge.", x) x <- gsub(">", ".gt.", x) x <- gsub("<", ".lt.", x) x <- gsub("[.]+", ".", x) x <- gsub("[-]+", ".to.", x) x <- gsub(" ", "", x) x <- gsub("/", ".or.", x) x } #' Generate interpretative information value (IV) comment for a variable #' #' @description #' Calculates the Information Value (IV) of a specified variable in a given dataframe #' using the \code{scorecard::iv()} function. Based on the IV value, returns a textual #' interpretation of the variable's predictive power. #' #' @param df A dataframe containing the data. #' @param x The name of the predictor variable (string) for which IV is calculated. #' @param y The name of the binary target variable (string). #' #' @return A character string describing the IV value and the corresponding predictability category: f_comment_iv <- function (df, x, y) { theIV <- scorecard::iv(df, x = x, y = y)[1,2] theIV <- round(theIV,3) if (theIV < 0.02) { txt <- "not predictable" } else if (theIV >= 0.02 && theIV < 0.1) { txt <- "weakly predictable" } else if (theIV >= 0.1 && theIV < 0.3) { txt <- "medium predictable" } else if (theIV >= 0.3 && theIV < 0.5) { txt <- "strongly predictable" } else if (theIV >= 0.5) { txt <- "very strong predictdable (but suspiciously high)" } txt <- paste0("The information value for `", x, "` is: ", theIV, " (this is ", txt, ").") return(txt) }