# --------------------------------------------------------------------- # +----------------------------------------------------------------------------+ # | the R code for the | # | Big R-book | # | by Philippe J.S. De Brouwer | # +----------------------------------------------------------------------------+ # | Copyright: (C) Philippe De Brouwer, 2020 | # | Owners of the book can use this code for private and commercial | # | use, except re-publishing. | # +----------------------------------------------------------------------------+ # | This document is a true copy of all the R code that appears in the book. | # | It is provided for you so that when you work with the book, you can copy | # | and paste any code segment that you want to test without typing everything | # | again. | # | No warranties or promises are given. | # | Have an intersting time and learn faster than me! | # | Philippe J.S. De Brouwer | # +----------------------------------------------------------------------------+ # | For your convenience, blocks are separated by: '# ----------------------'. | # +----------------------------------------------------------------------------+ # # --------------------------------------------------------------------- install.packages("devtools") install.packages("roxygen2") # --------------------------------------------------------------------- library(devtools) # --------------------------------------------------------------------- library(roxygen2) # --------------------------------------------------------------------- setwd(".") # replace this with your desired directory # Step 1: define the function(s) # -- below we repeat the function as defined earlier. # diversity # Calculates the entropy of a system with discrete states. # Arguments: # x -- numeric vector -- observed probabilities of classes # prior -- numeric vector -- prior probabilities of the classes # Returns: # numeric -- the entropy / diversity measure diversity <- function(x, prior = NULL) { if (min(x) <= 0) {return(0);} # the log will fail for 0 # if the numbers are higher than 1, then not probabilities but # populations are given, so we rescale to probabilities: if (sum(x) != 1) {x <- x / sum(x)} N <- length(x) if(!is.null(prior)) { for (i in (1:N)) { a <- (1 - 1 / (N * prior[i])) / (1 - prior[i]) b <- (1 - N * prior[i]^2) / (N * prior[i] * (1 - prior[i])) x[i] <- a * x[i]^2 + b * x[i] } } f <- function(x) x * log(x) x1 <- mapply(FUN = f, x) - sum(x1) / log(N) } # --------------------------------------------------------------------- # Step 2: create the empty package directory: package.skeleton(list = c("diversity"), # list all the functions # from the current # environment that we want # in the package. name = "div" # name of the package ) # --------------------------------------------------------------------- list.files(path = "div", all.files = TRUE) # --------------------------------------------------------------------- ?diversity # --------------------------------------------------------------------- #' Function to calculate a diversity index based on entropy. #' #' This function returns entropy of a system with discrete states #' @param x numeric vector, observed probabilities #' @param prior numeric vector, the prior probabilities #' @keywords diversity, entropy #' @return the entropy or diversity measure #' @examples #' x <- c(0.4, 0.6) #' diversity(x) # --------------------------------------------------------------------- setwd("./div") # or your choice of package directory document() # --------------------------------------------------------------------- Warning: The existing 'NAMESPACE' file was not generated by roxygen2, and will not be overwritten. Warning: The existing 'diversity.Rd' file was not generated by roxygen2, and will not be overwritten. # --------------------------------------------------------------------- setwd("~/Documents/science/books/r-book") # --------------------------------------------------------------------- setwd(".") # the directory in which ./div is a subdirectory install("div") # --------------------------------------------------------------------- install("../../div") # --------------------------------------------------------------------- # Step 1: define the function(s) # -- below we repeat the function as defined earlier. # diversity # Calculates the entropy of a system with discrete states. # Arguments: # x -- numeric vector -- observed probabilities of classes # prior -- numeric vector -- prior probabilities of the classes # Returns: # numeric -- the entropy / diversity measure diversity <- function(x, prior = NULL) { if (min(x) <= 0) {return(0);} # the log will fail for 0 # if the numbers are higher than 1, then not probabilities but # populations are given, so we rescale to probabilities: if (sum(x) != 1) {x <- x / sum(x)} N <- length(x) if(!is.null(prior)) { for (i in (1:N)) { a <- (1 - 1 / (N * prior[i])) / (1 - prior[i]) b <- (1 - N * prior[i]^2) / (N * prior[i] * (1 - prior[i])) x[i] <- a * x[i]^2 + b * x[i] } } f <- function(x) x * log(x) x1 <- mapply(FUN = f, x) - sum(x1) / log(N) } # --------------------------------------------------------------------- # First define the population percentages and prior probabilities: x <- c(0.3, 0.2, 0.5) pr <- c(0.33, 0.33, 0.34) # Then test the package div: diversity(x, prior = pr) # --------------------------------------------------------------------- ?diversity # --------------------------------------------------------------------- install_github('div','your_github_username') # --------------------------------------------------------------------- # When you use a R-package, you should cite it in your work. # The information can be found as follows: citation('base') # --------------------------------------------------------------------- # You can even extract the BibTex information: toBibtex(citation('base')) # --------------------------------------------------------------------- # -- remind the data fabrication --- set.seed(1865) age <- rlnorm(1000, meanlog = log(40), sdlog = log(1.3)) y <- rep(NA, length(age)) for(n in 1:length(age)) { y[n] <- max(0, dnorm(age[n], mean= 40, sd=10) + rnorm(1, mean = 0, sd = 10 * dnorm(age[n], mean= 40, sd=15)) * 0.075) } y <- y / max(y) dt <- tibble (age = age, spending_ratio = y) # -- plotting loesss and histogram with ggplot2 --- library(ggplot2) library(gridExtra) p1 <- ggplot(dt, mapping = aes(x = age, y = spending_ratio)) + geom_point(alpha=0.2, size = 2) + geom_smooth(method = "loess") p2 <- qplot(dt$age, geom="histogram", binwidth=5) grid.arrange(p1, p2, ncol=2) # --------------------------------------------------------------------- # Sets up an empty plotting field: plot(x = c(0, 4.5), y = c(0, 5), main = "Some pch arguments", xaxt = "n", yaxt = "n", xlab = "", ylab = "", cex.main = 2.6, col = "white" ) # Plot all of the standard pch arguments: y = rep(5:0, each=5) for (i in 0:25) { points(x = i %% 5, y = y[i+1], pch = i,cex = 2, col="blue", bg="khaki3") text(0.3 + i %% 5, y = y[i+1], i, cex = 2) } for (i in 1:2) { ch <- LETTERS[i] points(x = i, y = 0, pch = ch,cex = 2, col="red") text(0.3 + i, y = 0, ch, cex = 2) } for (i in 1:2) { ch <- letters[i] points(x = i + 2, y = 0, pch = ch,cex = 2, col="red") text(0.3 + i + 2, y = 0, ch, cex = 2) } # --------------------------------------------------------------------- # Using model m and data frame t2: predicScore <- predict(object=m,type="response") d0 <- data.frame( score = as.vector(predicScore)[t2$Survived == 0], true_result = 'not survived') d1 <- data.frame( score = as.vector(predicScore)[t2$Survived == 1], true_result = 'survived') d <- rbind(d0, d1) d <- d[complete.cases(d),] cumDf0 <- ecdf(d0$score) cumDf1 <- ecdf(d1$score) x <- sort(d$score) cumD0 <- cumDf0(x) cumD1 <- cumDf1(x) diff <- cumD0 - cumD1 y1 <- gdata::first(cumD0[diff == max(diff)]) y2 <- gdata::first(cumD1[diff == max(diff)]) x1 <- x2 <- quantile(d0$score, probs=y1, na.rm=TRUE) # Plot this with ggplot2: p <- ggplot(d, aes(x = score)) + stat_ecdf(geom = "step", aes(col = true_result), lwd=2) + ggtitle('Cummulative distributions and KS') + geom_segment(aes(x = x1, y = y1, xend = x2, yend = y2), color='navy', lwd = 3) + ggplot2::annotate("text", label = paste0("KS=",round((y1-y2)*100,2),"%"), x = x1 + 0.15, y = y2+(y1-y2)/2, color = "navy") p # --------------------------------------------------------------------- library(ggplot2) library(latex2exp) d <- seq(from = -3, to = +3, length.out = 100) ## error function family: erf <- function(x) 2 * pnorm(x * sqrt(2)) - 1 # (see Abramowitz and Stegun 29.2.29) erfc <- function(x) 2 * pnorm(x * sqrt(2), lower = FALSE) erfinv <- function (x) qnorm((1 + x)/2)/sqrt(2) erfcinv <- function (x) qnorm(x/2, lower = FALSE)/sqrt(2) ## Gudermannian function gd <- function(x) asin(tanh(x)) f1 <- function(x) erf( sqrt(pi) / 2 * x) f2 <- function(x) tanh(x) f3 <- function(x) 2 / pi * gd(pi / 2 * x) f4 <- function(x) x / sqrt(1 + x^2) f5 <- function(x) 2 / pi * atan(pi / 2 * x) f6 <- function(x) x / (1 + abs(x)) df <- data.frame(d = d, y = f1(d), Function = "erf( sqrt(pi) / 2 * d)") df <- rbind(df, data.frame(d = d, y = f2(d), Function = "tanh(d)")) df <- rbind(df, data.frame(d = d, y = f3(d), Function = "2 / pi * gd(pi / 2 * d)")) df <- rbind(df, data.frame(d = d, y = f4(d), Function = "d / (1 + d^2)")) df <- rbind(df, data.frame(d = d, y = f5(d), Function = "2 / pi * atan(pi / 2 * d)")) df <- rbind(df, data.frame(d = d, y = f6(d), Function = "x / (1 + abs(d))")) fn <- "" fn[1] <- "erf \\left(\\frac{\\sqrt{\\pi} d}{2}\\right)" fn[2] <- "tanh(x)" fn[3] <- "\\frac{2}{\\pi} gd\\left( \\frac{\\pi d}{2} \\right)" fn[4] <- "\\frac{d}{1 + d^2}" fn[5] <- "\\frac{2}{\\pi} atan\\left(\\frac{\\pi d}{2}\\right)" fn[6] <- "\\frac{x}{1+ |x|}" ggplot(data = df, aes(x = d, y = y, color = Function)) + geom_line(aes(col = Function), lwd=2) + guides(color=guide_legend(title=NULL)) + scale_color_discrete(labels=lapply(sprintf('$\\pi(d) = %s$', fn), TeX)) + theme(legend.justification = c(1, 0), legend.position = c(1, 0), # south east legend.box.margin=ggplot2::margin(rep(20, times=4)), # increase vertical space between legend items: legend.key.size = unit(1.5, "cm") ) + ylab(TeX('Preference --- $\\pi$')) # --------------------------------------------------------------------- f <- function(x) x^2 + 1 f Vectorize(f) # --------------------------------------------------------------------- f_curve <- function(f) { g <- Vectorize(f) s <- deparse(f)[2] curve(g, xlab = '', ylab = '', col = 'red', lwd = 3, from = -1, to = +1, #-1- main = bquote(bold(.(s)))) main = s ) } gaus <- function(x) exp (-(x-0)^2 / 0.5) f1 <- function(x) - 3/2 * x^5 + 5/2 * x^3 f2 <- function(x) sin(pi * x / 2) f3 <- function(x) min(1, max(2*x, -1)) f4 <- function(x) x f5 <- function(x) ifelse(x < 0 , gaus(x) - 1, 1 - gaus(x)) f6 <- function(x) tanh(3 * x) par(mfrow=c(3,2)) f_curve(f1) f_curve(f2) f_curve(f3) f_curve(f4) f_curve(f5) f_curve(f6) par(mfrow=c(1,1)) # --------------------------------------------------------------------- nottemC <- 5/9 * nottem - 32 # --------------------------------------------------------------------- dotproduct <- function (m1, m2) { if(!is.matrix(m1) || !is.matrix(m2)) { print("ERROR 1: m1 and m2 must be matrices."); return(-1); } if (ncol(m1) != nrow(m2)) { print("ERROR 2: ncol(m1) must match nrow(m2).") return(-2); } # Note that we do not check for values NA or Inf. In R they work # as one would expect. m <- matrix(nrow=nrow(m1), ncol=ncol(m2)) for (k in 1:nrow(m1)) { for (l in 1:ncol(m1)){ m[k,l] <- 0 for (n in 1:ncol(m1)) { m[k,l] <- m[k,l] + m1[k,n] * m2[n,l] } } } return(m); } M1 <- matrix(1:6, ncol=2) M2 <- matrix(5:8, nrow=2) # Try our function dotproduct(M1, M2) # Compare with M1 %*% M2 # The following must fail: dotproduct("apple", M2) # This must fail too: dotproduct(as.matrix(pi),M1) # --------------------------------------------------------------------- # Loading mtcars library(MASS) # However, this is probably already loaded # Exploring the number of gears summary(mtcars$gear) ## show the histogram # since the only valuse are 3,4 and 5 we format the # breaks around those values. hist(mtcars$gear, col="khaki3", breaks=c(2.5, 2.9,3.1,3.9,4.1,4.9,5.1, 5.5)) # Study the correlation between gears and transmission # (see section on correlations) cor(mtcars$gear, mtcars$am) cor(rank(mtcars$gear), rank(mtcars$am)) # We can also show the correlation between gears and transmission # Since the data is mainly overlapping we use the function # jitter() to add some noise so we can see individual points. plot(jitter(mtcars$am,0.125), jitter(mtcars$gear,0.3), pch=21, col="blue", bg="red", xlab = "Transmission (0 = automatic, 1 = manual)", ylab = "Number of forward gears", main = "Jittered view of gears and transmission" ) # Add a blue line (linear fit -- see linear regression) abline(lm(mtcars$gear ~ mtcars$am), col='blue',lwd = 3) # --------------------------------------------------------------------- # show the distribution hist(mtcars$hp, col="khaki3") # Try a possible cut: table(cut(mtcars$hp, breaks=c(50,100,175,350))) # Assume we like the cut and use this one: c <- cut(mtcars$hp, breaks=c(50,100,175,350)) # add the cut l <- unique(c) # find levels l # check levels order # Note that we provide the labels in the order of levels: f <- factor(c, levels=l, labels=c("M", "L", "H")) plot(f, col="khaki3", main="Horsepower as a factor", xlab="bins: L=low, M=Medium, H=high", ylab="number of cars in that bin") # --------------------------------------------------------------------- M <- matrix(c(1:9),nrow=3); D <- data.frame(M); rownames(D) <- c("2016","2017","2018"); colnames(D) <- c("Belgium", "France", "Poland"); cbind(D,rowSums(D)); D <- D[,-2] # --------------------------------------------------------------------- # Start a standard input t <- read_csv(readr_example("challenge.csv")) # Then, to see the issues, do: problems(t) # Notice that the problems start in row 1001, so # the first 1000 rows are special cases. The first improvement # can be obtained by increasing guess_max ## Compare this: spec_csv(readr_example("challenge.csv")) ## with this: spec_csv(readr_example("challenge.csv"), guess_max = 1001) # second step: t <- read_csv(readr_example("challenge.csv"), guess_max = 1001) # Inspect the result (first and last rows): head(t) tail(t) # --------------------------------------------------------------------- # We use the same seed so our results will be comparable. set.seed(1865) # We use the function mutate() from dplyr: library(dplyr) # For completeness sake, we generate the same data again. N <- 500 age_f <- rlnorm(N, meanlog = log(40), sdlog = log(1.3)) x_f <- abs(age_f + rnorm(N, 0, 20)) # Add noise & keep positive x_f <- 1 - (x_f - min(x_f)) / max(x_f) # Scale between 0 and 1 x_f <- 0.5 * x_f / mean(x_f) # Coerce mean to 0.5 # This last step will produce some outliers above 1 x_f[x_f > 1] <- 1 # Coerce those > 1 to 1 age_m <- rlnorm(N, meanlog = log(40), sdlog = log(1.3)) x_m <- abs(age_m + rnorm(N, 0, 20)) # Add noise & keep positive x_m <- 1 - (x_m - min(x_m)) / max(x_m) # Scale between 0 and 1 x_m <- 0.5 * x_m / mean(x_m) # Coerce mean to 0.5 # This last step will produce some outliers above 1 x_m[x_m > 1] <- 1 # Coerce those > 1 to 1 x_m <- 1 - x_m # Make the relation increasing p_f <- x_f p_m <- x_m tf <- tibble("age" = age_f, "sex" = "F", "is_good" = p_f) tm <- tibble("age" = age_m, "sex" = "M", "is_good" = p_m) t <- full_join(tf, tm, by = c("age", "sex", "is_good")) %>% mutate("is_good" = if_else(is_good >= 0.5, 1L, 0L))%>% mutate("sexM" = if_else(sex == "F", 0, 1)) ########### # Model 1 # ########### regr1 <- glm(formula = is_good ~ age + sexM, family = binomial, data = t) # assess the model: summary(regr1) pred1 <- 1 / (1+ exp(-(coef(regr1)[1] + t$age * coef(regr1)[2] + t$sexM * coef(regr1)[3]))) SE1 <- (pred1 - t$is_good)^2 MSE1 <- sum(SE1) / length(SE1) ########### # Model 2 # ########### # make the same cut t <- as_tibble(t) %>% mutate(is_LF = if_else((age <= 35) & (sex == "F"), 1L, 0L)) %>% mutate(is_HF = if_else((age > 50) & (sex == "F"), 1L, 0L)) %>% mutate(is_LM = if_else((age <= 35) & (sex == "M"), 1L, 0L)) %>% mutate(is_HM = if_else((age > 50) & (sex == "M"), 1L, 0L)) regr2 <- glm(formula = is_good ~ is_LF + is_HF + is_LM + is_HM, family = binomial, data = t) # assess the model: summary(regr2) pred2 <- 1 / (1 + exp(-(coef(regr2)[1] + + t$is_LF * coef(regr2)[2] + t$is_HF * coef(regr2)[3] + t$is_LM * coef(regr2)[4] + t$is_HM * coef(regr2)[5] ))) SE2 <- (pred2 - t$is_good)^2 MSE2 <- sum(SE2) / length(SE2) # Finally, we also note that the MSE has improved too. MSE1 MSE2 # --------------------------------------------------------------------- library(InformationValue) IV(X = factor(mtcars$vs), Y = factor(mtcars$am)) WOETable(X = factor(mtcars$vs), Y = factor(mtcars$am)) # --------------------------------------------------------------------- # mcda_promethee_list # delivers the preference flow matrices for the Promethee method # Arguments: # M -- decision matrix # w -- weights # piFUNs -- a list of preference functions, # if not provided min(1,max(0,d)) is assumed. # Returns (as side effect) # phi_plus <<- rowSums(PI.plus) # phi_min <<- rowSums(PI.min) # phi_ <<- phi_plus - phi_min # mcda_promethee_list <- function(M, w, piFUNs='x') { if (piFUNs == 'x') { # create a factory function: makeFUN <- function(x) {x; function(x) max(0,x) } P <- list() for (k in 1:ncol(M)) P[[k]] <- makeFUN(k) } # else, we assume a vector of functions is provided # initializations PI.plus <<- matrix(data=0, nrow=nrow(M), ncol=nrow(M)) PI.min <<- matrix(data=0, nrow=nrow(M), ncol=nrow(M)) # calculate the preference matrix for (i in 1:nrow(M)){ for (j in 1:nrow(M)) { for (k in 1:ncol(M)) { if (M[i,k] > M[j,k]) { PI.plus[i,j] = PI.plus[i,j] + w[k] * P[[k]](M[i,k] - M[j,k]) } if (M[j,k] > M[i,k]) { PI.min[i,j] = PI.min[i,j] + w[k] * P[[k]](M[j,k] - M[i,k]) } } } } # Till this line the code was exactly the same. # In the following line we create the list to return. L <- list("phi_plus" <- rowSums(PI.plus), "phi_min" <- rowSums(PI.min), "phi_" <- phi_plus - phi_min ) return(L) } # --------------------------------------------------------------------- set.seed(1492) M <- matrix (runif(9), nrow = 3) w <- c(runif(3)) L <- mcda_promethee_list(M, w)