#################################################
############ 510 MLE - PROBLEM SET 1 ############
#################################################

# rm(list = ls())
# 
# setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) # To set current folder as your wd.
# print(getwd()) # Check that your wd is correct
# dir()


library(ggplot2)
library(tidyverse)

###### Problem 1: Three ways to solve probability problems:-------------------------------

# By hand 

## Binomial PMF: P(X = k) = choose(n, k) * p^k * (1 - p)^(n - k),
## where choose(n, k) = n! / (k! * (n - k)!)

p <- 0.49 # probability that a single sampled person is male

n <- 30 # sample size

## What is the probability of sampling 22 males in a random sample of size 30?

k <- 22

fact <- choose(n, k) # binomial coefficient for k = 22

probability1 <- fact * p^k * (1 - p)^(n - k) # PMF of the binomial distribution

probability1 # P(X = 22) = 0.004094946

## What is the probability of sampling 16 males in a random sample of size 30?

k <- 16 # p and n remain the same

fact <- choose(n, k) # the coefficient depends on k, so it must be recomputed

probability2 <- fact * p^k * (1 - p)^(n - k) # PMF of the binomial distribution

probability2 # P(X = 16) = 0.1293457


# Same two probabilities from R's built-in binomial PMF, dbinom(x, size, prob)

dbinom(22, 30, 0.49) # P(X = 22) = 0.004094946

dbinom(16, 30, 0.49) # P(X = 16) = 0.1293457


# Additionally, we can approximate these probabilities by simulating the data-generating process:

# Simulation

set.seed(12345) # fixed seed so the simulated frequencies are reproducible

sims <- 100000 # number of simulated samples

# Each replication draws 30 people with replacement (1 = male with
# probability 0.49, 0 = female with probability 0.51) and counts the males.
nmen <- vapply(
  seq_len(sims),
  function(rep_id) {
    draws <- sample(
      x = c(0, 1),
      size = 30,
      replace = TRUE,
      prob = c(0.51, 0.49)
    )
    sum(draws)
  },
  numeric(1)
)

# Relative frequency of each outcome approximates its probability
sum(nmen == 22) / sims # 0.00416
sum(nmen == 16) / sims # 0.1294
sum(nmen == 16) / length(nmen) # same ratio, dividing by the vector length
mean(nmen == 16) # same ratio, as the mean of a logical vector

## Plot (extra information to help your understanding):

## Note: the first plot below (line + shaded area) is deliberately a poor
## representation: a binomial distribution is discrete, so it should not be
## drawn as if it were a continuous distribution.

## In a continuous distribution, the probability of any single point is zero.

## Plot:

# The binomial distribution  is discrete and has a PMF, 
# thus a column bar or dot plot is a good representation

k <- 0:30 # every possible number of males in the sample

p <- 0.49 # probability that a single sampled person is male

n <- 30 # sample size

# dbinom() is vectorized over x, so a single call returns P(X = k) for every
# k; no index loop (and no risk of an off-by-one index) is needed.
probs <- dbinom(x = k, size = n, prob = p)


df <- data.frame(probs, k) # one row per k: its probability and the count

## example of a poor representation: the line and shaded area suggest a
## continuous surface, but the binomial only has mass at integer k

ggplot(data = df, mapping = aes(x = k, y = probs)) +
  theme_minimal() +
  scale_y_continuous(name = "Probability", expand = c(0.001, 0.001)) +
  scale_x_continuous(
    name = "Males sampling",
    limits = c(0, 30),
    breaks = seq(0, 30, 1)
  ) +
  # the (misleading) continuous curve with the area under it shaded
  geom_line(color = "blue", size = 1) +
  geom_area(fill = "lightblue", alpha = 0.5) +
  # dashed guides at k = 16 and k = 22 ...
  geom_vline(xintercept = c(16, 22), linetype = "dashed", color = "red") +
  # ... and at the probabilities of those two outcomes
  geom_hline(
    yintercept = df$probs[df$k %in% c(16, 22)],
    linetype = "dashed",
    color = "red"
  ) +
  # black markers on the two outcomes of interest
  geom_point(x = 16, y = df$probs[df$k == 16], color = "black", size = 2) +
  geom_point(x = 22, y = df$probs[df$k == 22], color = "black", size = 2) +
  # parse = TRUE renders the labels as mathematical expressions
  annotate(
    geom = "text",
    x = c(18, 24),
    y = c(.135, .01),
    label = c("prob == .129", "prob == .004"),
    color = "red",
    parse = TRUE
  ) +
  labs(title = "Q1 - Binomial distribution") +
  theme(
    legend.position = "none",
    panel.background = element_rect(fill = NA),
    axis.line.x.bottom = element_line(size = 0.5),
    axis.ticks.length = unit(0.5, "char"),
    axis.text = element_text(size = 10, color = "black")
  )

#w <- 6
# ggsave("output/binomial_pmf1.pdf",
#        width = w,
#        height = w/1.618)


## example of good representation: discrete

# Column chart of the PMF: one bar per integer k.
df %>%
  ggplot(aes(x = k,
             y = probs)) +
  geom_col(fill = "blue",
           alpha = 0.5) +
  theme_minimal() +
  scale_y_continuous(name = "Probability",
                     expand = c(0.001, 0.001)) +
  # Bars are centred on k and extend about half a unit either side, so the
  # limits must reach past 0 and 30; with limits = c(0, 30) the bars at
  # k = 0 and k = 30 fall outside the scale and are dropped with a warning.
  scale_x_continuous(limits = c(-0.5, 30.5),
                     breaks = seq(0, 30, 1),
                     name = "Males sampling") +
  # dashed guides at P(X = 16) and P(X = 22)
  geom_hline(linetype = "dashed",
             color = "red",
             yintercept = df$probs[df$k %in% c(16, 22)]) +
  # annotate() draws each marker exactly once; a geom_point() layer with
  # constant x/y would inherit df and stack one copy per row of df.
  annotate(geom = "point",
           x = c(16, 22),
           y = df$probs[df$k %in% c(16, 22)],
           color = "black",
           size = 2) +
  annotate(geom = "text",
           x = 18,
           y = .135,
           color = "red",
           parse = TRUE,          # To allow mathematical expressions
           label = "prob == .129") +
  annotate(geom = "text",
           x = 24,
           y = .01,
           color = "red",
           parse = TRUE,          # To allow mathematical expressions
           label = "prob == .004") +
  labs(title = "Q1 - Binomial distribution") +
  theme(
    panel.background = element_rect(fill = NA),
    axis.line.x.bottom = element_line(size = 0.5),
    axis.ticks.length = unit(0.5, "char"),
    legend.position = "none",
    axis.text = element_text(size = 10, color = "black")
  )



# ggsave("output/binomial_pmf2.pdf",
#        width = w,
#        height = w/1.618)



# Dot plot of the PMF: one point per integer k (another discrete-friendly view).
df %>%
  ggplot(aes(x = k,
             y = probs)) +
  # The default point shape is coloured via `color`; `fill` is ignored for
  # it, so `color` is what actually makes the points blue.
  geom_point(color = "blue",
             alpha = 0.5) +
  theme_minimal() +
  scale_y_continuous(name = "Probability",
                     expand = c(0.001, 0.001)) +
  scale_x_continuous(limits = c(0, 30),
                     breaks = seq(0, 30, 1),
                     name = "Males sampling") +
  # dashed guides at k = 16 and k = 22 ...
  geom_vline(linetype = "dashed",
             color = "red",
             xintercept = c(16, 22)) +
  # ... and at the probabilities of those two outcomes
  geom_hline(linetype = "dashed",
             color = "red",
             yintercept = df$probs[df$k %in% c(16, 22)]) +
  # annotate() draws each marker exactly once; a geom_point() layer with
  # constant x/y would inherit df and stack one copy per row of df.
  annotate(geom = "point",
           x = c(16, 22),
           y = df$probs[df$k %in% c(16, 22)],
           color = "black",
           size = 2) +
  annotate(geom = "text",
           x = 18,
           y = .135,
           color = "red",
           parse = TRUE,          # To allow mathematical expressions
           label = "prob == .129") +
  annotate(geom = "text",
           x = 24,
           y = .01,
           color = "red",
           parse = TRUE,          # To allow mathematical expressions
           label = "prob == .004") +
  labs(title = "Q1 - Binomial distribution") +
  theme(
    panel.background = element_rect(fill = NA),
    axis.line.x.bottom = element_line(size = 0.5),
    axis.ticks.length = unit(0.5, "char"),
    legend.position = "none",
    axis.text = element_text(size = 10, color = "black")
  )



# ggsave("output/binomial_pmf3.pdf",
#        width = w,
#        height = w/1.618)












###### Problem 2: Working with marginal, joint, and conditional probabilities:-------------------------------

# Marginal probabilities of X: for each value of X, add the joint
# probabilities over the four values of Y.

0.05 + 0.08 + 0.13 + 0.03 # P(X = 1) = 0.29

0.07 + 0.1 + 0.16 + 0.04 # P(X = 2) = 0.37

0.06 + 0.07 + 0.15 + 0.06 # P(X = 3) = 0.34

0.29 + 0.37 + 0.34 # check: the marginals of X add up to 1.00

# Marginal probabilities of Y: for each value of Y, add the joint
# probabilities over the three values of X.

0.05 + 0.07 + 0.06 # P(Y = 1) = 0.18

0.08 + 0.1 + 0.07 # P(Y = 2) = 0.25

0.13 + 0.16 + 0.15 # P(Y = 3) = 0.44

0.03 + 0.04 + 0.06 # P(Y = 4) = 0.13

0.18 + 0.25 + 0.44 + 0.13 # check: the marginals of Y add up to 1.00

# Joint probability

# h: one minus the sum of the other eleven joint probabilities, which leaves
#    the remaining cell, P(X = 2, Y = 2)

1-(0.05+0.08+0.13+0.03+0.07+0.16+0.04+0.06+0.07+0.15+0.06) # 0.1

# Conditional probabilities: joint probability divided by the marginal
# probability of the conditioning event

# i: P(X = 1 | Y = 2) = P(X = 1, Y = 2) / P(Y = 2)

0.08 / 0.25 # 0.32

# j: P(X = 3 | Y = 4) = P(X = 3, Y = 4) / P(Y = 4)

0.06 / 0.13 # 0.4615385

# k: P(Y = 4 | X = 3) = P(X = 3, Y = 4) / P(X = 3)

0.06 / 0.34 # 0.1764706

# l: P(Y = 2 | X = 2) = P(X = 2, Y = 2) / P(X = 2)

0.1 / 0.37 # 0.2702703


###### Alternative version:


## Joint probability table: rows are X = 1..3, columns are Y = 1..4

p <- matrix(
  c(.05, .08, .13, .03,
    .07, .10, .16, .04,
    .06, .07, .15, .06),
  nrow = 3,
  byrow = TRUE
)

p


## marginal probabilities of X: row totals of the joint table

px <- rowSums(p)
px                   ## 0.29 0.37 0.34


## marginal probabilities of Y: column totals of the joint table

py <- colSums(p)
py                   ## 0.18 0.25 0.44 0.13


## conditional probabilities: joint cell divided by the marginal of the
## conditioning variable

p_x1_y2 <- p[1, 2] / py[2]  ## P(X=1|Y=2)
p_x1_y2                     ## 0.32

p_x3_y4 <- p[3, 4] / py[4]  ## P(X=3|Y=4)
p_x3_y4                     ## 0.4615385

p_y4_x3 <- p[3, 4] / px[3]  ## P(Y=4|X=3)
p_y4_x3                     ## 0.1764706

p_y2_x2 <- p[2, 2] / px[2]  ## P(Y=2|X=2)
p_y2_x2                     ## 0.2702703

