# ---------------------------------------------------------------------


 # +----------------------------------------------------------------------------+
 # |                               the R code for the                           |
 # |                                   Big R-book                               |
 # |                           by Philippe J.S. De Brouwer                      |
 # +----------------------------------------------------------------------------+
 # | Copyright: (C) Philippe De Brouwer, 2020                                   |
 # |            Owners of the book can use this code for private and commercial |
 # |            use, except re-publishing.                                      |
 # +----------------------------------------------------------------------------+
 # | This document is a true copy of all the R code that appears in the book.   |
 # | It is provided for you so that when you work with the book, you can copy   | 
 # | and paste any code segment that you want to test without typing everything |
 # | again.                                                                     |
 # | No warranties or promises are given.                                       |
 # | Have an intersting time and learn faster than me!                          |
 # | Philippe J.S. De Brouwer                                                   |
 # +----------------------------------------------------------------------------+
 # | For your convenience, blocks are separated by: '# ----------------------'. |
 # +----------------------------------------------------------------------------+
 #
# ---------------------------------------------------------------------


install.packages("devtools")
install.packages("roxygen2")
# ---------------------------------------------------------------------


library(devtools)
# ---------------------------------------------------------------------


library(roxygen2)
# ---------------------------------------------------------------------


setwd(".")  # replace this with your desired directory

# Step 1: define the function(s)
# -- below we repeat the function as defined earlier.
# diversity
# Calculates the entropy of a system with discrete states.
# Arguments:
#   x     -- numeric vector -- observed probabilities of classes
#   prior -- numeric vector -- prior probabilities of the classes
# Returns:
#    numeric -- the entropy / diversity measure
diversity <- function(x, prior = NULL) {
  if (min(x) <= 0) {return(0);} # the log will fail for 0
  # if the numbers are higher than 1, then not probabilities but 
  # populations are given, so we rescale to probabilities:
  if (sum(x) != 1) {x <- x / sum(x)}
  N <- length(x)
  if(!is.null(prior)) {
    for (i in (1:N)) {
      a <- (1 - 1 / (N * prior[i])) / (1 - prior[i])
      b <- (1 - N * prior[i]^2) / (N * prior[i] * (1 - prior[i]))
      x[i] <- a * x[i]^2 + b * x[i]
    }
   }
  f <- function(x) x * log(x)
  x1 <- mapply(FUN = f, x)
  - sum(x1) / log(N)
  }
# ---------------------------------------------------------------------


# Step 2: create the empty package directory:
package.skeleton(list = c("diversity"), # list all the functions 
                                        # from the current 
					# environment that we want 
					# in the package.
                name = "div"            # name of the package
		)  
# ---------------------------------------------------------------------


list.files(path = "div", all.files = TRUE)
# ---------------------------------------------------------------------


?diversity
# ---------------------------------------------------------------------


#' Function to calculate a diversity index based on entropy.
#'
#' This function returns entropy of a system with discrete states
#' @param x numeric vector, observed probabilities
#' @param prior numeric vector, the prior probabilities
#' @keywords diversity, entropy
#' @return the entropy or diversity measure
#' @examples
#' x <- c(0.4, 0.6)
#' diversity(x)
# ---------------------------------------------------------------------


setwd("./div")  # or your choice of package directory
document()
# ---------------------------------------------------------------------


Warning: The existing 'NAMESPACE' file was not generated by roxygen2, 
and will not be overwritten.
Warning: The existing 'diversity.Rd' file was not generated by 
roxygen2, and will not be overwritten.
# ---------------------------------------------------------------------


setwd("~/Documents/science/books/r-book")
# ---------------------------------------------------------------------


setwd(".") # the directory in which ./div is a subdirectory
install("div")
# ---------------------------------------------------------------------


install("../../div")
# ---------------------------------------------------------------------


# Step 1: define the function(s)
# -- below we repeat the function as defined earlier.
# diversity
# Calculates the entropy of a system with discrete states.
# Arguments:
# x -- numeric vector -- observed probabilities of classes
# prior -- numeric vector -- prior probabilities of the classes
# Returns:
# numeric -- the entropy / diversity measure
diversity <- function(x, prior = NULL) {
  if (min(x) <= 0) {return(0);} # the log will fail for 0
  # if the numbers are higher than 1, then not probabilities but
  # populations are given, so we rescale to probabilities:
  if (sum(x) != 1) {x <- x / sum(x)}
  N <- length(x)
  if(!is.null(prior)) {
    for (i in (1:N)) {
      a <- (1 - 1 / (N * prior[i])) / (1 - prior[i])
      b <- (1 - N * prior[i]^2) / (N * prior[i] * (1 - prior[i]))
      x[i] <- a * x[i]^2 + b * x[i]
      }
    }
  f <- function(x) x * log(x)
  x1 <- mapply(FUN = f, x)
  - sum(x1) / log(N)
}
# ---------------------------------------------------------------------


# First define the population percentages and prior probabilities:
x  <- c(0.3, 0.2, 0.5)
pr <- c(0.33, 0.33, 0.34)

# Then test the package div:
diversity(x, prior = pr)
# ---------------------------------------------------------------------


?diversity
# ---------------------------------------------------------------------


install_github('div','your_github_username')
# ---------------------------------------------------------------------


# When you use a R-package, you should cite it in your work.
# The information can be found as follows:
citation('base')
# ---------------------------------------------------------------------


# You can even extract the BibTex information:
toBibtex(citation('base'))
# ---------------------------------------------------------------------


# -- remind the data fabrication --- 
set.seed(1865)
age <- rlnorm(1000, meanlog = log(40), sdlog = log(1.3))
y <- rep(NA, length(age))
for(n in 1:length(age)) {
  y[n] <- max(0, 
              dnorm(age[n], mean= 40, sd=10) 
	         + rnorm(1, mean = 0, sd = 10 * dnorm(age[n], 
		   mean= 40, sd=15)) * 0.075)
}
y <- y / max(y)
dt <- tibble (age = age, spending_ratio = y)

# -- plotting loesss and histogram with ggplot2 ---
library(ggplot2)
library(gridExtra)
p1 <- ggplot(dt, mapping = aes(x = age, y = spending_ratio)) +
     geom_point(alpha=0.2, size = 2) +
     geom_smooth(method = "loess")
p2 <- qplot(dt$age, geom="histogram", binwidth=5) 
grid.arrange(p1, p2, ncol=2)
# ---------------------------------------------------------------------


# Sets up an empty plotting field:
plot(x = c(0, 4.5),
     y = c(0, 5),
     main = "Some pch arguments",
     xaxt = "n",
     yaxt = "n",
     xlab = "",
     ylab = "",
     cex.main = 2.6, 
     col = "white"
)

# Plot all of the standard pch arguments:
y = rep(5:0, each=5)
for (i in 0:25) { 
  points(x = i %% 5, y = y[i+1], pch = i,cex = 2, col="blue", bg="khaki3")
  text(0.3 + i %% 5, y = y[i+1], i, cex = 2)
}
for (i in 1:2) {
  ch <- LETTERS[i]
  points(x = i, y = 0, pch = ch,cex = 2, col="red")
  text(0.3 + i, y = 0, ch, cex = 2)
}
for (i in 1:2) {
  ch <- letters[i]
  points(x = i + 2, y = 0, pch = ch,cex = 2, col="red")
  text(0.3 + i + 2, y = 0, ch, cex = 2)
}
# ---------------------------------------------------------------------


# Using model m and data frame t2:
predicScore <- predict(object=m,type="response")
d0 <- data.frame(
       score = as.vector(predicScore)[t2$Survived == 0],
       true_result = 'not survived')
d1 <- data.frame(
       score = as.vector(predicScore)[t2$Survived == 1],
       true_result = 'survived')
d  <- rbind(d0, d1)
d  <- d[complete.cases(d),]

cumDf0 <- ecdf(d0$score)
cumDf1 <- ecdf(d1$score)
x      <- sort(d$score)
cumD0  <- cumDf0(x)
cumD1  <- cumDf1(x)
diff   <- cumD0 - cumD1
y1     <- gdata::first(cumD0[diff == max(diff)])
y2     <- gdata::first(cumD1[diff == max(diff)])
x1     <- x2 <- quantile(d0$score, probs=y1, na.rm=TRUE)

# Plot this with ggplot2:
p <- ggplot(d, aes(x = score)) +
     stat_ecdf(geom = "step", aes(col = true_result), lwd=2) +
     ggtitle('Cummulative distributions and KS') +
     geom_segment(aes(x = x1, y = y1, xend = x2, yend = y2), 
                  color='navy', lwd = 3) + 
     ggplot2::annotate("text", 
              label = paste0("KS=",round((y1-y2)*100,2),"%"), 
              x = x1 + 0.15, y = y2+(y1-y2)/2, color = "navy")
p
# ---------------------------------------------------------------------


library(ggplot2)
library(latex2exp)
d <- seq(from = -3, to = +3, length.out = 100)

## error function family:
erf     <- function(x) 2 * pnorm(x * sqrt(2)) - 1
# (see Abramowitz and Stegun 29.2.29)
erfc    <- function(x) 2 * pnorm(x * sqrt(2), lower = FALSE)
erfinv  <- function (x) qnorm((1 + x)/2)/sqrt(2)
erfcinv <- function (x) qnorm(x/2, lower = FALSE)/sqrt(2)
 
## Gudermannian function
gd <- function(x) asin(tanh(x))

f1 <- function(x) erf( sqrt(pi) / 2 * x)
f2 <- function(x) tanh(x)
f3 <- function(x) 2 / pi * gd(pi / 2 * x)
f4 <- function(x) x / sqrt(1 + x^2)
f5 <- function(x) 2 / pi * atan(pi / 2 * x)
f6 <- function(x) x / (1 + abs(x))

df <- data.frame(d = d, y = f1(d), 
                 Function = "erf( sqrt(pi) / 2 * d)")
df <- rbind(df, data.frame(d = d, y = f2(d), Function = "tanh(d)"))
df <- rbind(df, data.frame(d = d, y = f3(d), 
                           Function = "2 / pi * gd(pi / 2 * d)"))
df <- rbind(df, data.frame(d = d, y = f4(d), 
                           Function = "d / (1 + d^2)"))
df <- rbind(df, data.frame(d = d, y = f5(d), 
                           Function = "2 / pi * atan(pi / 2 * d)"))
df <- rbind(df, data.frame(d = d, y = f6(d), 
                           Function = "x / (1 + abs(d))"))

fn <- ""
fn[1] <- "erf \\left(\\frac{\\sqrt{\\pi} d}{2}\\right)"
fn[2] <- "tanh(x)"
fn[3] <- "\\frac{2}{\\pi} gd\\left( \\frac{\\pi d}{2} \\right)"
fn[4] <- "\\frac{d}{1 + d^2}"
fn[5] <- "\\frac{2}{\\pi} atan\\left(\\frac{\\pi d}{2}\\right)"
fn[6] <- "\\frac{x}{1+ |x|}"


ggplot(data = df, aes(x = d, y = y, color = Function)) +
   geom_line(aes(col = Function), lwd=2) +
   guides(color=guide_legend(title=NULL)) +
   scale_color_discrete(labels=lapply(sprintf('$\\pi(d) = %s$', fn), 
                        TeX)) + 
   theme(legend.justification = c(1, 0), 
        legend.position = c(1, 0),   # south east
        legend.box.margin=ggplot2::margin(rep(20, times=4)),
	# increase vertical space between legend items:
        legend.key.size = unit(1.5, "cm")  
	) +
   ylab(TeX('Preference --- $\\pi$'))
# ---------------------------------------------------------------------


f <- function(x) x^2 + 1
f
Vectorize(f)
# ---------------------------------------------------------------------


f_curve <- function(f) {
  g <- Vectorize(f)
  s <- deparse(f)[2]
   curve(g, xlab = '', ylab = '', col = 'red', lwd = 3, 
         from = -1, to = +1,
#-1-         main = bquote(bold(.(s))))
          main = s
	 )
   }

gaus <- function(x) exp (-(x-0)^2 / 0.5)
f1 <- function(x) - 3/2 * x^5 + 5/2 * x^3
f2 <- function(x) sin(pi * x / 2)
f3 <- function(x) min(1, max(2*x, -1))
f4 <- function(x) x
f5 <- function(x) ifelse(x < 0 , gaus(x) - 1, 1 - gaus(x))
f6 <- function(x) tanh(3 * x)

par(mfrow=c(3,2))
f_curve(f1)
f_curve(f2)
f_curve(f3)
f_curve(f4)
f_curve(f5)
f_curve(f6)
par(mfrow=c(1,1))
# ---------------------------------------------------------------------


nottemC <- 5/9 * nottem - 32
# ---------------------------------------------------------------------


dotproduct <- function (m1, m2) {
  if(!is.matrix(m1) || !is.matrix(m2)) {
    print("ERROR 1: m1 and m2 must be matrices.");
    return(-1);
  }
  if (ncol(m1) != nrow(m2)) {
    print("ERROR 2: ncol(m1) must match nrow(m2).")
    return(-2);
  }
  # Note that we do not check for values NA or Inf. In R they work
  # as one would expect.
  m <- matrix(nrow=nrow(m1), ncol=ncol(m2))
  for (k in 1:nrow(m1)) {
    for (l in 1:ncol(m1)){
      m[k,l] <- 0
      for (n in 1:ncol(m1)) {
        m[k,l] <- m[k,l] + m1[k,n] * m2[n,l]
        }
      }
  }
  return(m);
}
  
M1 <- matrix(1:6, ncol=2)
M2 <- matrix(5:8, nrow=2)

# Try our function
dotproduct(M1, M2)

# Compare with
M1 %*% M2  

# The following must fail:
dotproduct("apple", M2)

# This must fail too:
dotproduct(as.matrix(pi),M1)
# ---------------------------------------------------------------------


# Loading mtcars  
library(MASS)     # However, this is probably already loaded

# Exploring the number of gears
summary(mtcars$gear)

## show the histogram
# since the only valuse are 3,4 and 5 we format the 
# breaks around those values.
hist(mtcars$gear, col="khaki3", 
     breaks=c(2.5, 2.9,3.1,3.9,4.1,4.9,5.1, 5.5))

# Study the correlation between gears and transmission
# (see section on correlations)
cor(mtcars$gear, mtcars$am)
cor(rank(mtcars$gear), rank(mtcars$am))

# We can also show the correlation between gears and transmission
# Since the data is mainly overlapping we use the function 
# jitter() to add some noise so we can see individual points.
plot(jitter(mtcars$am,0.125), jitter(mtcars$gear,0.3), 
            pch=21, col="blue", bg="red",
    xlab = "Transmission (0 = automatic, 1 = manual)",
    ylab = "Number of forward gears",
    main = "Jittered view of gears and transmission"
    )
# Add a blue line (linear fit -- see linear regression)
abline(lm(mtcars$gear ~ mtcars$am),
       col='blue',lwd = 3)
# ---------------------------------------------------------------------


# show the distribution
hist(mtcars$hp, col="khaki3")

# Try a possible cut:
table(cut(mtcars$hp, breaks=c(50,100,175,350)))

# Assume we like the cut and use this one:
c <- cut(mtcars$hp, breaks=c(50,100,175,350)) # add the cut
l <- unique(c)                                # find levels
l                                             # check levels order

# Note that we provide the labels in the order of levels:
f <- factor(c, levels=l, labels=c("M", "L", "H"))
plot(f, col="khaki3", 
     main="Horsepower as a factor",
     xlab="bins: L=low, M=Medium, H=high",
     ylab="number of cars in that bin")
# ---------------------------------------------------------------------


M <- matrix(c(1:9),nrow=3); 
D <- data.frame(M); 
rownames(D) <- c("2016","2017","2018"); 
colnames(D) <- c("Belgium", "France", "Poland"); 
cbind(D,rowSums(D)); 
D <- D[,-2]
# ---------------------------------------------------------------------


# Start a standard input
t <- read_csv(readr_example("challenge.csv"))

# Then, to see the issues, do:
problems(t)

# Notice that the problems start in row 1001, so
# the first 1000 rows are special cases. The first improvement
# can be obtained by increasing guess_max
## Compare this:
spec_csv(readr_example("challenge.csv"))

## with this:
spec_csv(readr_example("challenge.csv"), guess_max = 1001)

# second step:
t <- read_csv(readr_example("challenge.csv"), guess_max = 1001)

# Inspect the result (first and last rows):
head(t)
tail(t)
# ---------------------------------------------------------------------


# We use the same seed so our results will be comparable.
set.seed(1865)

# We use the function mutate() from dplyr:
library(dplyr)

# For completeness sake, we generate the same data again.
N <- 500
age_f   <- rlnorm(N, meanlog = log(40), sdlog = log(1.3))
x_f <- abs(age_f + rnorm(N, 0, 20))    # Add noise & keep positive
x_f <- 1 - (x_f - min(x_f)) / max(x_f) # Scale between 0 and 1
x_f <- 0.5 * x_f / mean(x_f)           # Coerce mean to 0.5
# This last step will produce some outliers above 1
x_f[x_f > 1] <- 1                      # Coerce those > 1 to 1

age_m   <- rlnorm(N, meanlog = log(40), sdlog = log(1.3))
x_m <- abs(age_m + rnorm(N, 0, 20))    # Add noise & keep positive
x_m <- 1 - (x_m - min(x_m)) / max(x_m) # Scale between 0 and 1
x_m <- 0.5 * x_m / mean(x_m)           # Coerce mean to 0.5
# This last step will produce some outliers above 1
x_m[x_m > 1] <- 1                      # Coerce those > 1 to 1
x_m <- 1 - x_m                         # Make the relation increasing

p_f <- x_f
p_m <- x_m

tf <- tibble("age" = age_f, "sex" = "F", "is_good" = p_f) 
tm <- tibble("age" = age_m, "sex" = "M", "is_good" = p_m)
t  <- full_join(tf, tm, by = c("age", "sex", "is_good")) %>%
      mutate("is_good" = if_else(is_good >= 0.5, 1L, 0L))%>%  
      mutate("sexM"    = if_else(sex == "F", 0, 1)) 

###########
# Model 1 #
###########
regr1 <- glm(formula = is_good ~ age + sexM, 
             family = binomial,
             data = t)

# assess the model:
summary(regr1)

pred1 <- 1 / (1+ exp(-(coef(regr1)[1] + t$age * coef(regr1)[2] 
                     + t$sexM * coef(regr1)[3])))
SE1 <-  (pred1 - t$is_good)^2
MSE1 <- sum(SE1) / length(SE1)

###########
# Model 2 #
###########
# make the same cut
t <- as_tibble(t)                                               %>%
    mutate(is_LF = if_else((age <= 35) & (sex == "F"), 1L, 0L)) %>%
    mutate(is_HF = if_else((age >  50) & (sex == "F"), 1L, 0L)) %>%
    mutate(is_LM = if_else((age <= 35) & (sex == "M"), 1L, 0L)) %>%
    mutate(is_HM = if_else((age >  50) & (sex == "M"), 1L, 0L))

regr2 <- glm(formula = is_good ~ is_LF + is_HF + is_LM + is_HM,
             family = binomial,
             data = t)

# assess the model:
summary(regr2)

pred2 <- 1 / (1 + exp(-(coef(regr2)[1] + 
                     + t$is_LF * coef(regr2)[2] 
		     + t$is_HF * coef(regr2)[3] 
                     + t$is_LM * coef(regr2)[4] 
		     + t$is_HM * coef(regr2)[5] 
                      )))
SE2 <-  (pred2 - t$is_good)^2
MSE2 <- sum(SE2) / length(SE2)

# Finally, we also note that the MSE has improved too.
MSE1
MSE2
# ---------------------------------------------------------------------


library(InformationValue)
IV(X = factor(mtcars$vs), Y = factor(mtcars$am))
WOETable(X = factor(mtcars$vs), Y = factor(mtcars$am))
# ---------------------------------------------------------------------


# mcda_promethee_list
# delivers the preference flow matrices for the Promethee method
# Arguments:
#    M      -- decision matrix
#    w      -- weights
#    piFUNs -- a list of preference functions, 
#              if not provided min(1,max(0,d)) is assumed.
# Returns (as side effect)
# phi_plus <<- rowSums(PI.plus)
# phi_min  <<- rowSums(PI.min)
# phi_     <<- phi_plus - phi_min
#
mcda_promethee_list <- function(M, w, piFUNs='x')
{
  if (piFUNs == 'x') {
       # create a factory function:
       makeFUN <- function(x) {x; function(x) max(0,x) }
       P <- list()
       for (k in 1:ncol(M)) P[[k]] <- makeFUN(k)
       } # else, we assume a vector of functions is provided
# initializations
PI.plus  <<- matrix(data=0, nrow=nrow(M), ncol=nrow(M))
PI.min   <<- matrix(data=0, nrow=nrow(M), ncol=nrow(M))
# calculate the preference matrix
for (i in 1:nrow(M)){
  for (j in 1:nrow(M)) {
    for (k in 1:ncol(M)) {
      if (M[i,k] > M[j,k]) {
        PI.plus[i,j] = PI.plus[i,j] + w[k] * P[[k]](M[i,k] - M[j,k])
      }
      if (M[j,k] > M[i,k]) {
        PI.min[i,j] = PI.min[i,j] + w[k] * P[[k]](M[j,k] - M[i,k])
      }
    }
  }
}
# Till this line the code was exactly the same.
# In the following line we create the list to return.
L <- list("phi_plus" <- rowSums(PI.plus), 
          "phi_min"  <- rowSums(PI.min),
          "phi_"     <- phi_plus - phi_min
	  )
return(L)
}
# ---------------------------------------------------------------------


set.seed(1492)
M <- matrix (runif(9), nrow = 3)
w <- c(runif(3))
L <- mcda_promethee_list(M, w)