# ---------------------------------------------------------------------
# +----------------------------------------------------------------------------+
# |                          the R code for the                                 |
# |                             Big R-book                                      |
# |                      by Philippe J.S. De Brouwer                            |
# +----------------------------------------------------------------------------+
# | Copyright: (C) Philippe De Brouwer, 2020                                    |
# | Owners of the book can use this code for private and commercial             |
# | use, except re-publishing.                                                  |
# +----------------------------------------------------------------------------+
# | This document is a true copy of all the R code that appears in the book.    |
# | It is provided for you so that when you work with the book, you can copy    |
# | and paste any code segment that you want to test without typing everything  |
# | again.                                                                      |
# | No warranties or promises are given.                                        |
# | Have an interesting time and learn faster than me!                          |
# |                                        Philippe J.S. De Brouwer             |
# +----------------------------------------------------------------------------+
# | For your convenience, blocks are separated by: '# ----------------------'.  |
# +----------------------------------------------------------------------------+
#
# ---------------------------------------------------------------------
install.packages('sodium')  # do only once
                            # fails if you do not have libsodium-dev
library(sodium)

# Create the SHA256 key based on a secret password:
key <- sha256(charToRaw("My sweet secret"))

# Serialize the data to be encrypted:
msg <- serialize("Philippe J.S. De Brouwer", NULL)

# Encrypt:
msg_encr <- data_encrypt(msg, key)
orig     <- data_decrypt(msg_encr, key)
stopifnot(identical(msg, orig))

# Tag the message with your key (HMAC):
tag <- data_tag(msg, key)

# ---------------------------------------------------------------------
# --
library(RMySQL)

# -- The functions as mentioned earlier:

# db_get_data
# Get data from a MySQL database
# Arguments:
#    con_info -- MySQLConnection object -- containing the connection
#                info to the MySQL database
#    sSQL     -- character string -- the SQL statement that selects
#                the records
# Returns:
#    data.frame, containing the selected records
db_get_data <- function(con_info, sSQL){
  con <- dbConnect(MySQL(),
                   user     = con_info$user,
                   password = con_info$password,
                   dbname   = con_info$dbname,
                   host     = con_info$host
                   )
  df <- dbGetQuery(con, sSQL)
  dbDisconnect(con)
  df
}

# ---------------------------------------------------------------------
# db_run_sql
# Run a query that returns no data in a MySQL database
# Arguments:
#    con_info -- MySQLConnection object -- containing the connection
#                info to the MySQL database
#    sSQL     -- character string -- the SQL statement to be run
db_run_sql <- function(con_info, sSQL) {
  con <- dbConnect(MySQL(),
                   user     = con_info$user,
                   password = con_info$password,
                   dbname   = con_info$dbname,
                   host     = con_info$host
                   )
  rs <- dbSendQuery(con, sSQL)
  dbDisconnect(con)
}

# ---------------------------------------------------------------------
# Load dplyr via tidyverse:
library(tidyverse)

# Define the wrapper functions:
# Step 1: define the connection info.
my_con_info <- list()
my_con_info$user     <- "librarian"
my_con_info$password <- "librarianPWD"
my_con_info$dbname   <- "library"
my_con_info$host     <- "localhost"
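# A minimal side sketch (not taken from the book): rather than
# hard-coding the password, it can be read from an environment
# variable. The variable name MYSQL_PWD below is an assumption;
# the override only happens when it is actually set:
if (nzchar(Sys.getenv("MYSQL_PWD"))) {
  my_con_info$password <- Sys.getenv("MYSQL_PWD")
}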
# -- The data import was similar to what we had done previously.
# -- However, now we import all tables separately.

# Step 2: get the data
my_tables   <- c("tbl_authors", "tbl_author_book", "tbl_books", "tbl_genres")
my_db_names <- c("authors", "author_book", "books", "genres")

# Loop over the four tables and download their data:
for (n in 1:length(my_tables)) {
  my_sql <- paste("SELECT * FROM `", my_tables[n], "`;", sep = "")
  df <- db_get_data(my_con_info, my_sql)
  # The next line stores the data as a tibble (from the tidyverse):
  assign(my_db_names[n], as_tibble(df))
}

# Step 3: do something with the data
# -- This will follow in the remainder of the section.

# ---------------------------------------------------------------------
str(authors)

# ---------------------------------------------------------------------
# Example to illustrate the parser functions:
v     <- c("1.0", "2.3", "2.7", ".")
nbrs  <- c("$100,00.00", "12.4%")
s_dte <- "2018-05-03"

# The parser functions can generate from these strings
# more specialized types.

# The following will generate an error:
parse_double(v)            # reason: "." is not a number
parse_double(v, na = ".")  # tell R what the encoding of NA is
parse_number(nbrs)
parse_date(s_dte)

# ---------------------------------------------------------------------
parse_guess(v)
parse_guess(v, na = ".")
parse_guess(s_dte)
guess_parser(v)
guess_parser(v[1:3])
guess_parser(s_dte)
guess_parser(nbrs)

# ---------------------------------------------------------------------
library(tidyverse)
s_csv = "'a','b','c'\n001,2.34,.\n2,3.14,55\n3,.,43"
read_csv(s_csv)
read_csv(s_csv, na = '.')               # tell R how to understand the '.'
read_csv(s_csv, na = '.', quote = "'")  # tell how a string is quoted

# ---------------------------------------------------------------------
# Method 1: before the actual import:
spec_csv(s_csv, na = '.', quote = "'")

# Method 2: check after the fact:
t <- read_csv(s_csv, na = '.', quote = "'")
spec(t)

# ---------------------------------------------------------------------
read_csv(s_csv, na = '.', quote = "'",
         col_names = TRUE,
         cols(
              a = col_character(),
              b = col_double(),
              c = col_double()  # coerce to double
              )
         )

# ---------------------------------------------------------------------
# Start with:
t <- read_csv(readr_example("challenge.csv"))
# Then, to see the issues, do:
problems(t)
# Notice that the problems start in row 1001, so the first 1000 rows
# are special cases. A first improvement can be obtained by increasing
# the number of rows that readr uses to guess the column types:
## compare
spec_csv(readr_example("challenge.csv"))
## with
spec_csv(readr_example("challenge.csv"), guess_max = 1001)

# ---------------------------------------------------------------------
# Load readr:
library(readr)
# Or load the tidyverse with library(tidyverse); it includes readr.
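# ---------------------------------------------------------------------
# Alternatively -- a minimal sketch -- we can spell out the column
# types instead of relying on guessing. The column names x and y are
# those reported by spec_csv() for challenge.csv; adapt them if your
# file differs:
t <- read_csv(readr_example("challenge.csv"),
              col_types = cols(
                   x = col_double(),
                   y = col_date()
                   )
              )
problems(t)  # should report no parsing failures any more
tail(t)      # the dates at the end of the file are now parsed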
# ---------------------------------------------------------------------
# Make a string that looks like a fixed-width table (shortened):
txt <- "book_id year  title                                           genre
1       1896  Les plaisirs et les jour                        LITmod
2       1927  Albertine disparue                              LITmod
3       1954  Contre Sainte-Beuve                             LITmod
8       1687  Philosophiæ Naturalis Principia Mathematica     SCIphy
9       -300  Elements (translated )                          SCImat
10      2014  Big Data World                                  SCIdat
11      2016  Key Business Analytics                          SCIdat
12      2011  Maslowian Portfolio Theory                      FINinv
13      2016  R for Data Science                              SCIdat"

# ---------------------------------------------------------------------
fileConn <- file("books.txt")
writeLines(txt, fileConn)
close(fileConn)
my_headers <- c("book_id", "year", "title", "genre")

# ---------------------------------------------------------------------
# Reading the fixed-width file
# --> by indicating the widths of the columns:
t <- read_fwf(
       file = "./books.txt",
       skip = 1,  # skip one line with headers
       fwf_widths(c(8, 6, 48, 8), my_headers)
       )
# Inspect the input:
print(t)

# ---------------------------------------------------------------------
print(t3)

# ---------------------------------------------------------------------
head(mtcars)
mtcars[1,1]           # mpg is in the first column
rownames(mtcars[1,])  # the name of the car is not a column

# ---------------------------------------------------------------------
# --
# -- Load dplyr via tidyverse:
library(tidyverse)
library(RMySQL)

# -- The functions as mentioned earlier:

# db_get_data
# Get data from a MySQL database
# Arguments:
#    con_info -- MySQLConnection object -- the connection info
#                to the MySQL database
#    sSQL     -- character string -- the SQL statement that
#                selects the records
# Returns:
#    data.frame, containing the selected records
db_get_data <- function(con_info, sSQL){
  con <- dbConnect(MySQL(),
                   user     = con_info$user,
                   password = con_info$password,
                   dbname   = con_info$dbname,
                   host     = con_info$host
                   )
  df <- dbGetQuery(con, sSQL)
  dbDisconnect(con)
  df
}

# db_run_sql
# Run a query that returns no data in a MySQL database
# Arguments:
#    con_info -- MySQLConnection object -- the connection info
#                to the MySQL database
#    sSQL     -- character string -- the SQL statement
#                to be run
db_run_sql <- function(con_info, sSQL) {
  con <- dbConnect(MySQL(),
                   user     = con_info$user,
                   password = con_info$password,
                   dbname    = con_info$dbname,
                   host     = con_info$host
                   )
  rs <- dbSendQuery(con, sSQL)
  dbDisconnect(con)
}

# ---------------------------------------------------------------------
# Use the wrapper functions to get data.
# Step 1: define the connection info:
my_con_info <- list()
my_con_info$user     <- "user3"
my_con_info$password <- "thebigrbook"
my_con_info$dbname   <- "library"
my_con_info$host     <- "45.76.81.41"
my_con_info$port     <- 3306

# -- Import 2 tables combined.
# Step 2: get the data
my_sql <- "SELECT * FROM tbl_authors
           JOIN tbl_author_book ON author_id = author
           JOIN tbl_books       ON book      = book_id
           JOIN tbl_genres      ON genre     = genre_id;"
t_mix <- db_get_data(my_con_info, my_sql)
t_mix <- as_tibble(t_mix)

# Show the result:
head(t_mix)

# ---------------------------------------------------------------------
# Make a table of how much each author_id occurs:
nbr_auth <- t_mix %>% count(author_id)
# Do the same and include all fields that are assumed to
# be part of the table authors.
nbr_auth2 <- t_mix %>%
  count(author_id, pen_name, full_name, birth_date, death_date, book)
nbr_auth$n - nbr_auth2$n

# ---------------------------------------------------------------------
# Try without book:
nbr_auth2 <- t_mix %>%
  count(author_id, pen_name, full_name, birth_date, death_date)
# Now these occurrences are the same:
nbr_auth$n - nbr_auth2$n

# ---------------------------------------------------------------------
my_authors <- tibble(author_id  = t_mix$author_id,
                     pen_name   = t_mix$pen_name,
                     full_name  = t_mix$full_name,
                     birth_date = t_mix$birth_date,
                     death_date = t_mix$death_date
                     ) %>%
              unique %>%
              print

# ---------------------------------------------------------------------
auth <- tibble(
          author_id  = as.integer(my_authors$author_id),
          pen_name   = my_authors$pen_name,
          full_name  = my_authors$full_name,
          birth_date = as.Date(my_authors$birth_date),
          death_date = as.Date(my_authors$death_date)
          ) %>%
        unique %>%
        print

# ---------------------------------------------------------------------
auth$full_name <- str_replace(auth$full_name, "\n", "") %>% print

# ---------------------------------------------------------------------
# First read in some data (using a flat file to remind
# how this works):
x <- "January    100  102  108
February   106  105  105
March      104  104  106
April      120  122  118
May        130  100  133
June       141  139  135
July       175  176  180
August     170  188  187
September  142  148  155
October    133  137  145
November   122  128  131
December   102  108  110"

# Read in the flat file via read_fwf from readr:
t <- read_fwf(x, fwf_empty(x, col_names = my_headers))
# Set the column names:
colnames(t) <- c("month", "Sales2017", "Sales2018", "Sales2019")
# Finally, we can show the data as it appeared in the spreadsheet
# from the sales department:
print(t)

# ---------------------------------------------------------------------
t2 <- gather(t, "year", "sales", 2:4)
t2$year <- str_sub(t2$year, 6, 9)  # remove the word "Sales"
t2$year <- as.integer(t2$year)     # convert to integer
# Show the result:
t2

# ---------------------------------------------------------------------
library(lubridate)
t2$date <- parse_date_time(paste(t2$year, t2$month), orders = "ym")
plot(x = t2$date, y = t2$sales, col = "red")
lines(t2$date, t2$sales, col = "blue")

# ---------------------------------------------------------------------
library(dplyr)
sales_info <- data.frame(
  time  = as.Date('2016-01-01') + 0:9 + rep(c(0, -1), times = 5),
  type  = rep(c("bought", "sold"), 5),
  value = round(runif(10, min = 0, max = 10001))
  )
# Show the data frame:
sales_info

# Use the function spread():
spread(sales_info, type, value)

# ---------------------------------------------------------------------
sales_info %>%
  spread(type, value) %>%
  gather(type, value, 2:3)

# ---------------------------------------------------------------------
library(tidyr)
# The original data frame:
turnover <- data.frame(
  what  = paste(as.Date('2016-01-01') + 0:9 + rep(c(0, -1), times = 5),
                rep(c("HSBC", "JPM"), 5), sep = "/"),
  value = round(runif(10, min = 0, max = 50))
  )
turnover
# Use the function separate():
separate(turnover, what, into = c("date", "counterpart"), sep = "/")
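# ---------------------------------------------------------------------
# A side sketch: recent versions of tidyr (1.0 and above) supersede
# gather() and spread() with pivot_longer() and pivot_wider(). Assuming
# such a tidyr version is installed, the gather() call on the sales
# table t above can be written as (t2b is just an illustrative name):
t2b <- pivot_longer(t, cols = 2:4,
                    names_to     = "year",
                    names_prefix = "Sales",  # strips the "Sales" prefix
                    values_to    = "sales")
head(t2b)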
# ---------------------------------------------------------------------
# Use as separator a digit followed by a forward slash
# and then a capital letter:
separate(turnover, what, into = c("date", "counterpart"), sep = "[0-9]/[A-Z]")

# ---------------------------------------------------------------------
library(tidyr)
# Define a data frame:
df <- data.frame(year = 2018, month = 0 + 1:12, day = 5)
print(df)
# Merge the columns to one variable:
unite(df, 'date', 'year', 'month', 'day', sep = '-')

# ---------------------------------------------------------------------
library(dplyr)

# ---------------------------------------------------------------------
# Using the example of the library:
dplyr::select(genres,             # the first argument is the tibble
              genre_id, location) # then a list of column names

# ---------------------------------------------------------------------
a1 <- filter(authors, birth_date > as.Date("1900-01-01"))
paste(a1$pen_name, "--", a1$birth_date)

# ---------------------------------------------------------------------
authors %>%
  count(author_id) %>%
  filter(n > 1) %>%
  nrow()

# ---------------------------------------------------------------------
author_book %>%
  count(author) %>%
  filter(n > 1)

# ---------------------------------------------------------------------
library(sqldf)
# Because we have RMySQL loaded (and we don't want to unload it), sqldf
# will default to using that engine to run the queries. If we want it
# to use the R environment and data frames, then use the following line:
options(sqldf.driver = "SQLite")

# ---------------------------------------------------------------------
# Now you can use SQL syntax on R data frames. Imagine that we need to
# find the titles of books of the authors with a name ending in 'Brouwer':
sqldf("SELECT B.title
       FROM authors AS A, author_book AS AB, books AS B
       WHERE A.author_id = AB.author
         AND AB.book = B.book_id
         AND full_name LIKE '%Brouwer';")

# ---------------------------------------------------------------------
detach("package:sqldf", unload = TRUE)

# ---------------------------------------------------------------------
a2 <- books %>% inner_join(genres, by = c("genre" = "genre_id"))
paste(a2$title, "-->", a2$location)

# ---------------------------------------------------------------------
a3 <- authors %>%
  inner_join(author_book, by = c("author_id" = "author")) %>%
  inner_join(books,       by = c("book"      = "book_id")) %>%
  inner_join(genres,      by = c("genre"     = "genre_id")) %>%
  dplyr::select(pen_name, location) %>%
  arrange(location) %>%
  unique() %>%
  print()
# Note the difference with the base-R code below!
b <- merge(authors, author_book, by.x = "author_id", by.y = "author")
b <- merge(b, books,  by.x = "book",  by.y = "book_id")
b <- merge(b, genres, by.x = "genre", by.y = "genre_id")
b <- cbind(b$pen_name, b$location)  # colnames disappear
colnames(b) <- c("pen_name", "location")
b <- as.data.frame(b)
b <- b[order(b$location), ]  # sort for data frames is order()
b <- unique(b)
print(b)

# ---------------------------------------------------------------------
arrange(a3, desc(location))

# ---------------------------------------------------------------------
a3[!duplicated(a3$location), ]

# ---------------------------------------------------------------------
# (Schematic illustration of the by-syntax for two tables A and B
# that share a column z.)
inner_join(A, B, by = c("z" = "z"))  # ambiguous, but works
inner_join(A, B, by = "z")           # shorter
inner_join(A, B)                     # shortest

# ---------------------------------------------------------------------
ab <- authors %>%
  inner_join(author_book, by = c("author_id" = "author")) %>%
  inner_join(books,       by = c("book"      = "book_id")) %>%
  add_count(author_id)
ab$n

# ---------------------------------------------------------------------
genres %>%
  mutate(genre = genre_id) %>%         # add column genre
  inner_join(books) %>%                # leave out the "by ="
  dplyr::select(c(title, location))

# ---------------------------------------------------------------------
t <- authors %>%
  mutate(short_name = str_sub(pen_name, 1, 7)) %>%
  mutate(x_name = if_else(str_length(pen_name) > 15,
                          paste(str_sub(pen_name, 1, 8), "...",
                                str_sub(pen_name, start = -3), sep = ''),
                          pen_name,
                          "pen_name is NA"
                          )
         ) %>%
  mutate(is_alive = if_else(!is.na(birth_date) & is.na(death_date),
                            "YES",
                            if_else(death_date < Sys.Date(), "no", "maybe"),
                            "NA")
         ) %>%
  dplyr::select(c(x_name, birth_date, death_date, is_alive)) %>%
  print()

# ---------------------------------------------------------------------
authors %>%
  transmute(name = full_name, my_date = as.Date(birth_date) - 5)

# ---------------------------------------------------------------------
authors %>%
  filter(!is.na(birth_date) & is.na(death_date)) %>%
  transmute(name = full_name, my_date = as.Date(birth_date) - 5)

# ---------------------------------------------------------------------
# Define two sets (with one column):
A <- tibble(col1 = c(1L:4L))
B <- tibble(col1 = c(4L, 4L, 5L))
# Study some of the set-operations:
dplyr::intersect(A, B)
union(A, B)
union_all(A, B)
setdiff(A, B)
setequal(A, B)

# The next example uses a data-frame with two columns:
A <- tibble(col1 = c(1L:4L), col2 = c('a', 'a', 'b', 'b'))
B <- tibble(col1 = c(4L, 4L, 5L), col2 = c('b', 'b', 'c'))
# Study the same set-operations:
dplyr::intersect(A, B)
union(A, B)
union_all(A, B)
setdiff(A, B)
setequal(A, B)
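# ---------------------------------------------------------------------
# A related sketch, reusing the library tables loaded earlier: besides
# the mutating joins used above, dplyr offers filtering joins.
# semi_join() keeps the authors that have at least one book, while
# anti_join() keeps the authors without any book:
semi_join(authors, author_book, by = c("author_id" = "author"))
anti_join(authors, author_book, by = c("author_id" = "author"))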
# ---------------------------------------------------------------------
library(tidyverse)
library(stringr)

# Define strings:
s1 <- "Hello"   # double quotes are fine
s2 <- 'world.'  # single quotes are also fine

# Return the length of a string:
str_length(s1)

# Concatenate strings:
str_c(s1, ", ", s2)        # str_c accepts many strings
str_c(s1, s2, sep = ", ")  # str_c also has a sep argument

# ---------------------------------------------------------------------
apropos('str_')

# ---------------------------------------------------------------------
s <- 'World'
str_c('Hello, ', s, '.')

# ---------------------------------------------------------------------
library(stringr)  # or library(tidyverse)
sVector <- c("Hello", ", ", "world", "Philippe")
str_sub(sVector, 1, 3)              # the first 3 characters
str_sub(sVector, -3, -1)            # the last 3 characters
str_to_lower(sVector[4])            # convert to lowercase
str_to_upper(sVector[4])            # convert to uppercase
str_c(sVector, collapse = " ")      # collapse into one string
str_flatten(sVector, collapse = " ")  # flatten into one string
str_length(sVector)                 # length of each string

# ---------------------------------------------------------------------
# Nest the functions:
str_c(str_to_upper(str_sub(sVector[4], 1, 4)),
      str_to_lower(str_sub(sVector[4], 5, -1))
      )
# Use pipes:
sVector[4] %>%
  str_sub(1, 4) %>%
  str_to_upper()

# ---------------------------------------------------------------------
str <- "abcde"
# Replace from 2nd to 4th character with "+":
str_sub(str, 2, 4) <- "+"
str

# ---------------------------------------------------------------------
str <- "F0"
str_dup(str, c(2, 3))  # duplicate a string

# ---------------------------------------------------------------------
str <- c(" 1 ", " abc", "Philippe De Brouwer ")
str_pad(str, 5)  # fills with white-space to 5 characters
# str_pad never makes a string shorter!
# So, to make all strings the same length, we first truncate:
str %>%
  str_trunc(10) %>%
  str_pad(10, "right") %>%
  print

# Remove trailing and leading white space:
str_trim(str)
str_trim(str, "left")

# Modify an existing string to fit a line length:
"The quick brown fox jumps over the lazy dog. " %>%
  str_dup(5) %>%
  str_c %>%         # str_flatten also removes existing \n
  str_wrap(50) %>%  # make lines of 50 characters long
  cat             # or writeLines (print shows "\n")

# ---------------------------------------------------------------------
str <- c("a", "z", "b", "c")
# str_order informs about the order of strings (rank number):
str_order(str)
# Sorting is done with str_sort:
str_sort(str)

# ---------------------------------------------------------------------
library(stringr)  # or library(tidyverse)
sV <- c("philosophy", "physiography", "phis", "Hello world",
        "Philippe", "Philosophy", "physics", "philology")
# Extracting substrings that match a regex pattern:
str_extract(sV, regex("Phi"))
str_extract(sV, "Phi")  # the same, regex assumed

# ---------------------------------------------------------------------
str_extract(sV, "(p|P)hi")
# Or do it this way:
str_extract(sV, "(phi|Phi)")

# ---------------------------------------------------------------------
# Match also i and y:
str_extract(sV, "(p|P)h(i|y)")
# This is equivalent to:
str_extract(sV, "(phi|Phi|phy|Phy)")

# ---------------------------------------------------------------------
str_extract(sV, "(p|P)h(i|y)[^lL]")

# ---------------------------------------------------------------------
str_extract(sV, "(?i)Ph(i|y)[^(?i)L]")

# ---------------------------------------------------------------------
# First create some example email addresses:
emails <- c("god@heaven.org", "philippe@de-brouwer.com",
            "falsemaail@nothingmy", "mistaken.email.@com")
# Define the regex:
regX <- "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$"
# Use it to match the sample email addresses:
str_extract(emails, regX)

# ---------------------------------------------------------------------
str_extract("Philippe", "Ph\\w*")   # is greedy
str_extract("Philippe", "Ph\\w*?")  # is lazy

# ---------------------------------------------------------------------
# Load the library rex:
library(rex)
# In this example we construct the regex to match a valid URL, and will
# define the valid characters first:
valid_chars <- rex(one_of(regex('a-z0-9\u00a1-\uffff')))

# ---------------------------------------------------------------------
# Then build the regex:
expr <- rex(
  start,  # start of the string: ^
  # Protocol identifier (optional) + //
  group(list('http', maybe('s')) %or% 'ftp', '://'),
  # User:pass authentication (optional):
  maybe(non_spaces, maybe(':', zero_or_more(non_space)), '@'),
  # Host name:
  group(zero_or_more(valid_chars, zero_or_more('-')),
        one_or_more(valid_chars)),
  # Domain name:
  zero_or_more('.', zero_or_more(valid_chars, zero_or_more('-')),
               one_or_more(valid_chars)),
  # Top Level Domain (TLD) identifier:
  group('.', valid_chars %>% at_least(2)),
  # Server port number (optional):
  maybe(':', digit %>% between(2, 5)),
  # Resource path (optional):
  maybe('/', non_space %>% zero_or_more()),
  end
  )

# ---------------------------------------------------------------------
# Print the result elegantly:
substring(expr, seq(1, nchar(expr)-1, 40), seq(41, nchar(expr), 40)) %>%
  str_c(sep = "\n")

# ---------------------------------------------------------------------
# For example:
str_extract("www.de-brouwer.com", expr)
str_extract("http://www.de-brouwer.com", expr)
str_extract("error=www.de-brouwer.com", expr)

# ---------------------------------------------------------------------
string  <- c("one:1", "NO digit", "c5c5c5", "d123d", "123", 6)
pattern <- "\\d"

# ---------------------------------------------------------------------
# grep() returns the whole string if a match is found:
grep(pattern, string, value = TRUE)
# The default for value is FALSE -> only returns indexes:
grep(pattern, string)
# grepl(): the "l" stands for returning a logical variable:
grepl(pattern, string)
# --- stringr ---
# Similar to grepl (note the order of the arguments!):
str_detect(string, pattern)

# ---------------------------------------------------------------------
# Locate the first match (the numbers are the position in the string):
regexpr(pattern, string)
# gregexpr() finds all matches and returns a list:
gregexpr(pattern, string)
# --- stringr ---
# Find the first match and return a matrix:
str_locate(string, pattern)
# Find all matches and return a list (same as gregexpr):
str_locate_all(string, pattern)

# ---------------------------------------------------------------------
# First, we additionally need a replacement (repl):
repl <- "___"
# sub() replaces the first match:
sub(pattern, repl, string)
# gsub() replaces all matches:
gsub(pattern, repl, string)
# --- stringr ---
# str_replace() replaces the first match:
str_replace(string, pattern, repl)
# str_replace_all() replaces all matches:
str_replace_all(string, pattern, repl)

# ---------------------------------------------------------------------
# regmatches() with regexpr() will extract only the first match:
regmatches(string, regexpr(pattern, string))
# regmatches() with gregexpr() will extract all matches:
regmatches(string, gregexpr(pattern, string))  # all matches
# --- stringr ---
# Extract the first match:
str_extract(string, pattern)
# Similar to str_extract, but returns a matrix:
str_match(string, pattern)
# Extract all matches (a list as return value):
str_extract_all(string, pattern)
# To get a neat matrix output, add simplify = TRUE:
str_extract_all(string, pattern, simplify = TRUE)
# Similar to str_extract_all (but returns a list of matrices):
str_match_all(string, pattern)

# ---------------------------------------------------------------------
# --- base-R ---
strsplit(string, pattern)
# --- stringr ---
str_split(string, pattern)

# ---------------------------------------------------------------------
# Load the tidyverse for its functionality such as pipes:
library(tidyverse)
# Lubridate is not part of the core-tidyverse, so we need
# to load it separately:
library(lubridate)

# ---------------------------------------------------------------------
as.numeric(Sys.time())  # the number of seconds passed since 1 January 1970
as.numeric(Sys.time()) / (60 * 60 * 24 * 365.2422)  # the same in years

# ---------------------------------------------------------------------
# There is a series of functions that convert strings to dates:
mdy("04052018")
mdy("4/5/2018")
mdy("04052018")
mdy("4052018")  # ambiguous formats are refused!
dmy("04052018") # same string, different date # --------------------------------------------------------------------- dt <- ymd(20180505) %>% print as.numeric(dt) ymd(17656) # --------------------------------------------------------------------- yq("201802") # --------------------------------------------------------------------- ymd_hms("2018-11-01T13:59:00") dmy_hms("01-11-2018T13:59:00") ymd_hm("2018-11-01T13:59") ymd_h("2018-11-01T13") hms("13:14:15") hm("13:14") # --------------------------------------------------------------------- as_date("2018-11-12") as_date(0) as_date(-365) as_date(today()) - as_date("1969-02-21") # --------------------------------------------------------------------- today() now() # --------------------------------------------------------------------- # Note it converts the system time-zone to UTC: as_datetime("2006-07-22T14:00") # Force time-zone: as_datetime("2006-07-22T14:00 UTC") as_datetime("2006-07-22 14:00 Europe/Warsaw") #Fails silently! dt <- as_datetime("2006-07-22 14:00", tz = "Europe/Warsaw") %>% print # Get the same date-time numerals in a different time-zone: force_tz(dt, "Pacific/Tahiti") # Get the same cosmic moment in a new time-zone with_tz(dt, "Pacific/Tahiti") # --------------------------------------------------------------------- today(tzone = "Pacific/Tahiti") date_decimal(2018.521, tz = "UTC") # --------------------------------------------------------------------- dt1 <- make_datetime(year = 1890, month = 12L, day = 29L, hour = 8L, tz = 'MST') dt1 # --------------------------------------------------------------------- # We will use the date from previous hint: dt1 year(dt) # extract the year month(dt) # extract the month week(dt) # extract the week day(dt) # extract the day wday(dt) # extract the day of the week as number qday(dt) # extract the day of the quarter as number yday(dt) # extract the day of the year as number hour(dt) # extract the hour minute(dt) # extract the minutes second(dt) # extract the seconds quarter(dt) # extract the quarter semester(dt) # extract the semester am(dt) # TRUE if morning pm(dt) # TRUE if afternoon leap_year(dt) # TRUE if leap-year # --------------------------------------------------------------------- # We will use the date from previous example: dt1 # Experiment changing it: update(dt, month = 5) update(dt, year = 2018) update(dt, hour = 18) # --------------------------------------------------------------------- moment1 <- as_datetime("2018-10-28 01:59:00", tz = "Europe/Warsaw") moment2 <- as_datetime("2018-10-28 02:01:00", tz = "Europe/Warsaw") moment2 - moment1 # Is it 2 minutes or 1 hour and 3 minutes? moment3 <- as_datetime("2018-10-28 03:01:00", tz = "Europe/Warsaw") # The clocks were put back in this tz from 3 to 2am. # So, there is 2 hours difference between 2am and 3am! 
moment3 - moment1

# ---------------------------------------------------------------------
# Calculate the duration in seconds:
dyears(x = 1/365)
dweeks(x = 1)
ddays(x = 1)
dhours(x = 1)
dminutes(x = 1)
dseconds(x = 1)
dmilliseconds(x = 1)
dmicroseconds(x = 1)
dnanoseconds(x = 1)
dpicoseconds(x = 1)

# ---------------------------------------------------------------------
# Note that a Duration object times a number is again a Duration
# object, and it allows arithmetic:
dpicoseconds(x = 1) * 10^12

# ---------------------------------------------------------------------
# Investigate the object type:
dur <- dnanoseconds(x = 1)
class(dur)
str(dur)
print(dur)

# ---------------------------------------------------------------------
# Useful for automation:
duration(5, unit = "years")

# Coercion and logical tests:
dur <- dyears(x = 10)
as.duration(60 * 60 * 24)
as.duration(dur)
is.duration(dur)
is.difftime(dur)
as.duration(dur)
make_difftime(60, units = "minutes")

# ---------------------------------------------------------------------
years(x = 1)
months(x = 1)
weeks(x = 1)
days(x = 1)
hours(x = 1)
minutes(x = 1)
seconds(x = 1)
milliseconds(x = 1)
microseconds(x = 1)
nanoseconds(x = 1)
picoseconds(x = 1)

# Investigate the object type:
per <- days(x = 1)
class(per)
str(per)
print(per)

# ---------------------------------------------------------------------
# For automations:
period(5, unit = "years")
# Coerce a timespan to a period:
as.period(5, unit = "years")
as.period(10)
p <- seconds_to_period(10) %>% print
period_to_seconds(p)

# ---------------------------------------------------------------------
years(1) + months(3) + days(13)

# ---------------------------------------------------------------------
d1 <- ymd_hm("1939-09-01 09:00", tz = "Europe/Warsaw")
d2 <- ymd_hm("1945-08-15 12:00", tz = "Asia/Tokyo")
interval(d1, d2)   # defines the interval
# Or use the operator %--%:
ww2 <- d1 %--% d2  # defines the same interval
ww2 / days(1)      # the period expressed in days
ww2 / ddays(1)     # the duration in terms of days
# The small difference is due to DST and equals one hour:
(ww2 / ddays(1) - ww2 / days(1)) * 24
# Allow the interval to report on its length:
int_length(ww2) / 60 / 60 / 24

# ---------------------------------------------------------------------
d_date <- ymd("19450430")
# Is a date or interval within another?
d_date %within% ww2
ph <- interval(ymd_hm("1941-12-07 07:48", tz = "US/Hawaii"),
               ymd_hm("1941-12-07 09:50", tz = "US/Hawaii")
               )
ph %within% ww2      # is ph in ww2?
int_aligns(ph, ww2)  # do ww2 and ph share start or end?
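# A related test (a small sketch): int_overlaps() checks whether two
# intervals share at least one moment -- a weaker condition than %within%:
int_overlaps(ph, ww2)  # TRUE, since ph lies inside ww2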
# Shift forward or backward:
int_shift(ww2, years(1))
int_shift(ww2, years(-1))

# Swap start and end moment:
flww2 <- int_flip(ww2)

# Coerce all to "positive" (start-date before end-date):
int_standardize(flww2)

# Modify the start or end date:
int_start(ww2) <- d_date; print(ww2)
int_end(ww2)   <- d_date; print(ww2)

# ---------------------------------------------------------------------
dts <- c(ymd("2000-01-10"), ymd("1999-12-28"),
         ymd("1492-01-01"), ymd("2100-10-15")
         )
round_date(dts, unit = "month")
floor_date(dts, unit = "month")
ceiling_date(dts, unit = "month")
# Change a date to the last day of the previous month or
# to the first day of the month with rollback():
rollback(dts, roll_to_first = FALSE, preserve_hms = TRUE)

# ---------------------------------------------------------------------
set.seed(1911)
s <- tibble(reply = runif(n = 1000, min = 0, max = 13))
hml <- function (x = 0) {
  if (x < 0)   return(NA)
  if (x <= 4)  return("L")
  if (x <= 8)  return("M")
  if (x <= 12) return("H")
  return(NA)
}
surv <- apply(s, 1, FUN = hml)  # the output is a vector
surv <- tibble(reply = surv)    # coerce back to a tibble
surv

# ---------------------------------------------------------------------
# 1. Define the factor-levels in the right order:
f_levels <- c("L", "M", "H")
# 2. Define our data as factors:
survey <- parse_factor(surv$reply, levels = f_levels)

# ---------------------------------------------------------------------
summary(survey)
plot(survey, col = "khaki3",
     main = "Customer Satisfaction",
     xlab = "Response to the last survey"
     )

# ---------------------------------------------------------------------
surv2 <- parse_factor(surv$reply, levels = unique(surv$reply))
# Note that the labels are in order of first occurrence:
summary(surv2)

# ---------------------------------------------------------------------
# Count the labels:
fct_count(survey)

# ---------------------------------------------------------------------
# Relabel factors with fct_relabel:
HML <- function (x = NULL) {
  x[x == "L"] <- "Low"
  x[x == "M"] <- "Medium/High"
  x[x == "H"] <- "Medium/High"
  x[!(x %in% c("High", "Medium/High", "Low"))] <- NA
  return(x)
}
f <- fct_relabel(survey, HML)
summary(f)
plot(f, col = "khaki3",
     main = "Only one third of customers is not happy",
     xlab = "Response to the expensive survey"
     )

# ---------------------------------------------------------------------
HMLregex <- function (x = NULL) {
  x[grepl("^L$", x)] <- "Low"
  x[grepl("^M$", x)] <- "Medium/High"
  x[grepl("^H$", x)] <- "Medium/High"
  x[!(x %in% c("High", "Medium/High", "Low"))] <- NA
  return(x)
}
# This would do exactly the same, but it is a powerful
# tool with many other possibilities.
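# ---------------------------------------------------------------------
# A shorter alternative (sketch): forcats also provides fct_collapse(),
# which maps old levels onto new ones directly, without a helper
# function:
f2 <- fct_collapse(survey, Low = "L", `Medium/High` = c("M", "H"))
summary(f2)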
# ---------------------------------------------------------------------
num_obs <- 1000  # the number of observations in the survey
# Start from a new survey: srv
srv <- tibble(reply = 1:num_obs)
srv$age <- rnorm(num_obs, mean = 50, sd = 20)
srv$age[srv$age < 15] <- NA
srv$age[srv$age > 85] <- NA
hml <- function (x = 0) {
  if (x < 0)   return(NA)
  if (x <= 4)  return("L")
  if (x <= 8)  return("M")
  if (x <= 12) return("H")
  return(NA)
}
for (n in 1:num_obs) {
  if (!is.na(srv$age[n])) {
    srv$reply[n] <- hml(rnorm(n = 1, mean = srv$age[n] / 7, sd = 2))
  } else {
    srv$reply[n] <- hml(runif(n = 1, min = 1, max = 12))
  }
}
f_levels <- c("L", "M", "H")
srv$fct <- parse_factor(srv$reply, levels = f_levels)

# ---------------------------------------------------------------------
# From most frequent to least frequent:
srv$fct %>% fct_infreq(ordered = TRUE) %>% levels()

# ---------------------------------------------------------------------
# From least frequent to most frequent:
srv$fct %>% fct_infreq %>% fct_rev %>% levels

# ---------------------------------------------------------------------
# Reorder the reply variable in function of the median age:
fct_reorder(srv$reply, srv$age) %>% levels

# ---------------------------------------------------------------------
# Add the function min() to order based on the minimum
# age in each group (instead of the default median):
fct_reorder(srv$reply, srv$age, min) %>% levels

# ---------------------------------------------------------------------
# Show the means per class of satisfaction in base-R style:
by(srv$age, srv$fct, mean, na.rm = TRUE)
# A much more accessible result with dplyr:
satisf <- srv %>%
  group_by(fct) %>%
  summarize(
    age = median(age, na.rm = TRUE),
    n   = n()
    ) %>%
  print

# ---------------------------------------------------------------------
# Show the impact of age on satisfaction visually:
par(mfrow = c(1,2))
barplot(satisf$age, horiz = TRUE, names.arg = satisf$fct,
        col = c("khaki3", "khaki3", "khaki3", "red"),
        main = "Median age per group")
barplot(satisf$n, horiz = TRUE, names.arg = satisf$fct,
        col = c("khaki3", "khaki3", "khaki3", "red"),
        main = "Frequency per group")

# ---------------------------------------------------------------------
srv %>%
  mutate("fct_ano" = fct_anon(fct)) %>%
  print

# ---------------------------------------------------------------------
set.seed(1890)
# Get the data:
d1 <- d0 <- iris
# Introduce the missing values:
i <- sample(1:nrow(d0), round(0.20 * nrow(d0)))
d1[i,1] <- NA
i <- sample(1:nrow(d0), round(0.30 * nrow(d0)))
d1[i,2] <- NA
# Show a part of the resulting dataset:
head(d1, n = 10L)

# ---------------------------------------------------------------------
#install.packages('mice')  # uncomment if necessary
library(mice)              # load the package
# mice provides the improved visualization function md.pattern():
md.pattern(d1)             # function provided by mice

# ---------------------------------------------------------------------
d2_imp <- mice(d1, m = 5, maxit = 25, method = 'pmm', seed = 1500)

# ---------------------------------------------------------------------
# Choose set number 3:
d3_complete <- complete(d2_imp, 3)

# ---------------------------------------------------------------------
# install.packages('missForest')  # only the first time
library(missForest)     # load the library
d_mf <- missForest(d1)  # using the same data as before
# Access the imputed data in the ximp attribute:
head(d_mf$ximp)
# The normalized MSE of the imputation:
d_mf$OOBerror
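# ---------------------------------------------------------------------
# Returning to the mice output d2_imp above -- a minimal sketch of the
# usual next step: fit a model on each imputed dataset and pool the
# results following Rubin's rules. The regression formula is only an
# illustration:
fit_imp <- with(d2_imp, lm(Sepal.Length ~ Sepal.Width + Petal.Length))
summary(pool(fit_imp))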
# Install the package first via:
# install.packages('Hmisc')
library(Hmisc)
# Impute using the mean:
SepLImp_mean <- with(d1, impute(Sepal.Length, mean))
# Impute a randomly chosen value:
SepLImp_rand <- with(d1, impute(Sepal.Length, 'random'))
# Impute the maximum value:
SepLImp_max <- with(d1, impute(Sepal.Length, max))
# Impute the minimum value:
SepLImp_min <- with(d1, impute(Sepal.Length, min))
# Note the '*' next to the imputed values:
head(SepLImp_min, n = 10L)

# ---------------------------------------------------------------------
aregImp <- aregImpute(~ Sepal.Length + Sepal.Width + Petal.Length +
                        Petal.Width + Species,
                      data = d1, n.impute = 4)
print(aregImp)
# n.impute = 4 produced 4 sets of imputed values.
# Access the second imputed data set as follows:
head(aregImp$imputed$Sepal.Length[,2], n = 10L)

# ---------------------------------------------------------------------
set.seed(1890)
d <- rnorm(90)
par(mfrow = c(1,2))
hist(d, breaks = 70, col = "khaki3")
hist(d, breaks = 12, col = "khaki3")

# ---------------------------------------------------------------------
# Try a possible cut:
c <- cut(d, breaks = c(-3, -1, 0, 1, 2, 3))
table(c)
# This is not good: it will not make solid predictions for the last bin.
# So, we need to use other bins:
c <- cut(d, breaks = c(-3, -0.5, 0.5, 3))
table(c)
# We now have a similar number of observations in each bin.
# Is that the only thing to think about?

# ---------------------------------------------------------------------
# install.packages('binr')  # do only once
library(binr)
b <- bins.quantiles(d, target.bins = 5, max.breaks = 10)
b$binct

# ---------------------------------------------------------------------
set.seed(1890)
age <- rlnorm(1000, meanlog = log(40), sdlog = log(1.3))
y <- rep(NA, length(age))
for(n in 1:length(age)) {
  y[n] <- max(0, dnorm(age[n], mean = 40, sd = 10) +
                 rnorm(1, mean = 0,
                       sd = 10 * dnorm(age[n], mean = 40, sd = 15)) * 0.075)
}
y <- y / max(y)
plot(age, y,
     pch = 21, col = "blue", bg = "red",
     xlab = "age", ylab = "spending ratio"
     )
# Assume this data is:
#    age            = the age of the customer
#    spending_ratio = R := S_n / (S_{n-1} + S_n)
#                     (zero if both are zero)
#    with S_n the spending in month n
dt <- tibble(age = age, spending_ratio = y)

# ---------------------------------------------------------------------
# Leave out NAs (in this example redundant):
d1 <- dt[complete.cases(dt),]
# order() returns sorted indices, so this orders the vector:
d1 <- d1[order(d1$age),]
# Fit a loess:
d1_loess <- loess(spending_ratio ~ age, d1)
# Add predictions:
d1_pred_loess <- predict(d1_loess)
# Plot the results:
par(mfrow = c(1,2))
plot(d1$age, d1$spending_ratio, pch = 16,
     xlab = 'age', ylab = 'spending ratio')
lines(d1$age, d1_pred_loess, lwd = 7, col = 'dodgerblue4')
hist(d1$age, col = 'dodgerblue4', xlab = 'age')
par(mfrow = c(1,1))

# ---------------------------------------------------------------------
# Fit the logistic regression directly on the data without binning:
lReg1 <- glm(formula = spending_ratio ~ age,
             family = quasibinomial, data = dt)

# ---------------------------------------------------------------------
# Investigate the model:
summary(lReg1)
# Calculate the predictions and the mean squared error:
pred1 <- 1 / (1 + exp(-(coef(lReg1)[1] + dt$age * coef(lReg1)[2])))
SE1  <- (pred1 - dt$spending_ratio)^2
MSE1 <- sum(SE1) / length(SE1)

# ---------------------------------------------------------------------
# Bin the variable age:
c <- cut(dt$age, breaks = c(15, 30, 55, 90))
# Check the binning:
table(c)
# We have one big bucket and two smaller ones (with the smallest
# still more than 10% of our dataset).
lvls <- unique(c)  # find the levels
lvls               # check the order of the levels
# Create the tibble (a data-frame also works):
dt <- as_tibble(dt) %>%
  mutate(is_L = if_else(age <= 30, 1, 0)) %>%
  mutate(is_H = if_else(age > 55,  1, 0))
# Fit the logistic regression with is_L and is_H:
# (is_M is not used because it is correlated with the previous two.)
lReg2 <- glm(formula = spending_ratio ~ is_L + is_H,
             family = quasibinomial, data = dt)

# ---------------------------------------------------------------------
# Investigate the logistic model:
summary(lReg2)
# Calculate the predictions for our model and calculate the MSE:
pred2 <- 1 / (1 + exp(-(coef(lReg2)[1] + dt$is_L * coef(lReg2)[2] +
                        dt$is_H * coef(lReg2)[3])))
SE2  <- (pred2 - dt$spending_ratio)^2
MSE2 <- sum(SE2) / length(SE2)

# ---------------------------------------------------------------------
# Compare the MSE of the two models:
MSE1
MSE2

# ---------------------------------------------------------------------
# Load libraries and define parameters:
library(tidyverse)  # provides tibble (only used in the next block)
set.seed(1880)      # to make the results reproducible
N <- 500            # the number of rows

# Ladies first:
# age will function as our x-value:
age_f <- rlnorm(N, meanlog = log(40), sdlog = log(1.3))
# x is a temporary variable that will become the propensity to buy:
x_f <- abs(age_f + rnorm(N, 0, 20))     # add noise & keep positive
x_f <- 1 - (x_f - min(x_f)) / max(x_f)  # scale between 0 and 1
x_f <- 0.5 * x_f / mean(x_f)            # coerce the mean to 0.5
# This last step will produce some outliers above 1:
x_f[x_f > 1] <- 1  # coerce those few that are too big to 1

# Then the gentlemen:
age_m <- rlnorm(N, meanlog = log(40), sdlog = log(1.3))
x_m <- abs(age_m + rnorm(N, 0, 20))     # add noise & keep positive
x_m <- 1 - (x_m - min(x_m)) / max(x_m)  # scale between 0 and 1
x_m <- 0.5 * x_m / mean(x_m)            # coerce the mean to 0.5
# This last step will produce some outliers above 1:
x_m[x_m > 1] <- 1  # coerce those few that are too big to 1
x_m <- 1 - x_m     # make the relation increasing

# Rename (p_x is now the gendered propensity to buy):
p_f <- x_f
p_m <- x_m

# ---------------------------------------------------------------------
# We want a double plot, so change the plot parameters & save old values:
oldparams <- par(mfrow = c(1,2))
plot(age_f, p_f,
     pch = 21, col = "blue", bg = "red",
     xlab = "Age", ylab = "Spending probability",
     main = "Females"
     )
plot(age_m, p_m,
     pch = 21, col = "blue", bg = "red",
     xlab = "Age", ylab = "Spending probability",
     main = "Males"
     )
par(oldparams)  # reset the plot parameters after plotting

# ---------------------------------------------------------------------
d1 <- t[complete.cases(t),]
d1 <- d1[order(d1$age),]
d1_age_loess <- loess(is_good ~ age, d1)
d1_age_pred_loess <- predict(d1_age_loess)
d1 <- d1[order(d1$sexM),]
d1_sex_loess <- loess(is_good ~ sexM, d1)
d1_sex_pred_loess <- predict(d1_sex_loess)
# Plot the results:
par(mfrow = c(2,2))
d1 <- d1[order(d1$age),]
plot(d1$age, d1$is_good, pch = 16,
     xlab = 'Age', ylab = 'Spending probability')
lines(d1$age, d1_age_pred_loess, lwd = 7, col = 'dodgerblue4')
hist(d1$age, col = 'khaki3', xlab = 'age')
d1 <- d1[order(d1$sexM),]
plot(d1$sexM, d1$is_good, pch = 16,
     xlab = 'Gender', ylab = 'Spending probability')
lines(d1$sexM, d1_sex_pred_loess, lwd = 7, col = 'dodgerblue4')
hist(d1$sexM, col = 'khaki3', xlab = 'gender')
par(mfrow = c(1,1))
# ---------------------------------------------------------------------
# Note that we can feed "sex" into the model and it will create
# for us a variable "sexM" (meaning the same as ours).
# To avoid this confusion, we put in our own variable.
regr1 <- glm(formula = is_good ~ age + sexM,
             family = quasibinomial, data = t)

# ---------------------------------------------------------------------
# Assess the model:
summary(regr1)
pred1 <- 1 / (1 + exp(-(coef(regr1)[1] + t$age * coef(regr1)[2] +
                        t$sexM * coef(regr1)[3])))
SE1  <- (pred1 - t$is_good)^2
MSE1 <- sum(SE1) / length(SE1)

# ---------------------------------------------------------------------
# 1. Check the potential cut:
c <- cut(t$age, breaks = c(min(t$age), 35, 55, max(t$age)))
table(c)

# ---------------------------------------------------------------------
# 2. Create the matrix variables:
t <- as_tibble(t) %>%
  mutate(is_LF = if_else((age <= 35) & (sex == "F"), 1L, 0L)) %>%
  mutate(is_HF = if_else((age >  50) & (sex == "F"), 1L, 0L)) %>%
  mutate(is_LM = if_else((age <= 35) & (sex == "M"), 1L, 0L)) %>%
  mutate(is_HM = if_else((age >  50) & (sex == "M"), 1L, 0L)) %>%
  print

# ---------------------------------------------------------------------
# 3. Check if the final bins aren't too small:
t[,5:8] %>% map_int(sum)

# ---------------------------------------------------------------------
regr2 <- glm(formula = is_good ~ is_LF + is_HF + is_LM + is_HM,
             family = quasibinomial, data = t)

# ---------------------------------------------------------------------
# Assess the model:
summary(regr2)

# ---------------------------------------------------------------------
# Calculate the MSE for model 2:
pred2 <- 1 / (1 + exp(-(coef(regr2)[1] +
                        t$is_LF * coef(regr2)[2] +
                        t$is_HF * coef(regr2)[3] +
                        t$is_LM * coef(regr2)[4] +
                        t$is_HM * coef(regr2)[5]
                        )))
SE2  <- (pred2 - t$is_good)^2
MSE2 <- sum(SE2) / length(SE2)

# ---------------------------------------------------------------------
MSE1
MSE2

# ---------------------------------------------------------------------
t <- mutate(t, "is_good" = if_else(is_good >= 0.5, 1L, 0L))

# ---------------------------------------------------------------------
# We start from the dataset used in the previous section:
print(t)

# ---------------------------------------------------------------------
#install.packages("InformationValue")
library(InformationValue)
WOETable(X = factor(t$sexM), Y = t$is_good, valueOfGood = 1) %>%
  knitr::kable(format.args = list(big.mark = " ", digits = 2))
## There are also the functions WOE() and IV(), e.g.:
# The IV of a categorical variable is the sum of the IVs of its categories:
IV(X = factor(t$sexM), Y = t$is_good, valueOfGood = 1)

# ---------------------------------------------------------------------
WOETable(X = factor(t$is_LF), Y = t$is_good, valueOfGood = 1) %>%
  knitr::kable(digits = 2)
# The package also provides the functions WOE() and IV().
# The IV of a categorical variable is the sum of the IVs of its categories.
IV(X = factor(t$is_LF), Y = t$is_good, valueOfGood = 1)

# ---------------------------------------------------------------------
loadings(fit)  # show the PC loadings

# ---------------------------------------------------------------------
head(fit$scores)  # the first principal components

# ---------------------------------------------------------------------
# Plot the loadings (output see figure):
plot(fit, type = "b", col = 'khaki3')
# Show the biplot:
biplot(fit)

# ---------------------------------------------------------------------
# Maximum Likelihood Factor Analysis
# Extracting 3 factors with varimax rotation:
fit <- factanal(mtcars, 3, rotation = "varimax")
print(fit, digits = 2, cutoff = .3, sort = TRUE)

# ---------------------------------------------------------------------
# Plot factor 1 by factor 2:
load <- fit$loadings[,1:2]
plot(load, type = "n")  # plot the loadings
text(load, labels = colnames(mtcars),
     cex = 1.75, col = 'blue')  # add variable names

# ---------------------------------------------------------------------
# Load the library nFactors:
library(nFactors)

# ---------------------------------------------------------------------
# Get the eigenvalues:
eigV <- eigen(cor(mtcars))
# Get the mean and the selected quantile of the distribution of the
# eigenvalues of a correlation or covariance matrix of standardized
# normally distributed variables:
aPar <- parallel(subject = nrow(mtcars), var = ncol(mtcars),
                 rep = 100, cent = 0.05)
# Get the optimal number of factors for the analysis:
nScr <- nScree(x = eigV$values, aparallel = aPar$eigen$qevpea)
# See the result:
nScr
# and plot it:
plotnScree(nScr)

# ---------------------------------------------------------------------
# PCA Variable Factor Map
library(FactoMineR)
# PCA will generate two plots as a side effect (not shown here):
result <- PCA(mtcars)
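# ---------------------------------------------------------------------
# A closing sketch: the object returned by PCA() can be inspected
# further. For example, the table of eigenvalues with the percentage
# of variance explained per dimension:
result$eig
# and the coordinates of the variables on the principal dimensions:
result$var$coord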