Visualizing profile likelihoods

We learned that minimizing the sum of squared residuals (SSR) is equivalent to maximizing the log-likelihood function under the assumption of normally distributed errors with constant variance. This means that the OLS estimator is also the MLE for a linear model with a normally distributed dependent variable and homoskedastic errors.
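
To see the equivalence, write the log-likelihood of \(n\) independent normal observations with common variance \(\sigma^2\) and mean \(\mu_i = \boldsymbol{x}_i \boldsymbol{\beta}\):

\[ \log \mathcal{L}(\boldsymbol{\beta}, \sigma^{2} \mid \mathbf{y}) = -\frac{n}{2} \log (2 \pi \sigma^{2}) - \frac{1}{2 \sigma^{2}} \sum_{i=1}^{n} (y_i - \boldsymbol{x}_i \boldsymbol{\beta})^{2} = -\frac{n}{2} \log (2 \pi \sigma^{2}) - \frac{\text{SSR}(\boldsymbol{\beta})}{2 \sigma^{2}} \]

For any fixed \(\sigma^2\), the first term does not involve \(\boldsymbol{\beta}\), so maximizing the log-likelihood over \(\boldsymbol{\beta}\) is the same as minimizing the SSR.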

A profile likelihood is a method for assessing the likelihood of a specific parameter in a model by fixing that parameter at different values and maximizing the likelihood with respect to all other parameters.

Each value in the profile likelihood represents the best possible fit of the model when the parameter of interest is fixed, while the other parameters are optimized. The peak of the profile likelihood corresponds to the maximum likelihood estimate (MLE) for that parameter, and it is commonly used for constructing confidence intervals and evaluating uncertainty around the parameter of interest.
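
A profile likelihood can be written compactly: if \(\theta\) is the parameter of interest and \(\boldsymbol{\psi}\) collects the remaining (nuisance) parameters, then

\[ L_{p}(\theta) = \max_{\boldsymbol{\psi}} L(\theta, \boldsymbol{\psi} \mid \mathbf{y}) \]

In the normal example below, the parameter of interest is the mean, \(\mu\).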

Below I show a simple simulation demonstrating the equivalence of the OLS and ML estimators in a setting with only one parameter to estimate: the intercept.

# Let's simulate a normal distribution

## packages (tibble, ggplot2, and tidyr are used below)
library(tidyverse)

## sample
set.seed(123456)
n <- 100

## set simulation parameters
mu <- 67       # mean
sigma2 <- 10   # variance

## generate random normal distribution
y <- rnorm(n = n, mean = mu, sd = sqrt(sigma2))


## mean and variance of y
mean(y) ; var(y)
## [1] 67.05319
## [1] 9.870409
## histogram
hist(y)

## beta_hat from intercept only linear model
lm(y ~ 1)$coef
## (Intercept) 
##    67.05319
## define the residual sum of squares function with respect to mu

SSR <- function(mu, y) {
  # sum of squared residuals for a single value of mu
  SSR <- sum((y - mu)^2)
  
  return(SSR)
}

## Question: what is the value of mu that minimizes the SSR?
SSR(mu = 120, y = y)
## [1] 281313.7
## Use optim to find the minima
optim_mu <- optim(par=0,
                  fn=SSR,
                  method = "BFGS",
                  y=y)

optim_mu$value  # minimum SSR
## [1] 977.1705
optim_mu$par    # optimum (the optimal parameter, mu_hat)
## [1] 67.05319
# Let's visualize the profile likelihoods and the SSR


## First, provide a range of mu to plot over. 
# Note: the range must respect the parameter's boundaries (support), if any.

mu_range <- seq(from=55, 
                to=75,
                length.out= 1000)

## function that will iterate SSR for each value of mu



SSR <- function(mu, y) {
  # create empty vector
  SSR.vector <- vector(mode = "numeric", length = length(mu))
  
  for (i in 1:length(SSR.vector)){
    # estimate SSR
    SSR.vector[i] <- sum((y - mu[i])^2)
  }
  
  return(SSR.vector)
}


# for each value of mu, it returns the SSR given y

tibble(
  SSR = SSR(mu=mu_range,y=y),
  mu = mu_range
)
## # A tibble: 1,000 × 2
##       SSR    mu
##     <dbl> <dbl>
##  1 15505.  55  
##  2 15457.  55.0
##  3 15409.  55.0
##  4 15361.  55.1
##  5 15313.  55.1
##  6 15265.  55.1
##  7 15217.  55.1
##  8 15169.  55.1
##  9 15122.  55.2
## 10 15074.  55.2
## # ℹ 990 more rows
head(SSR(mu=mu_range,y=y)) # for each value of mu, it returns the SSR given y
## [1] 15505.11 15456.89 15408.74 15360.68 15312.70 15264.80
## Visualize

tibble(
  SSR = SSR(mu=mu_range,y=y),
  mu = mu_range
) |> 
  ggplot(aes(y=SSR,
             x=mu)) +
  geom_line() +
  geom_vline(xintercept = optim_mu$par,
             color="red",
             linetype="dashed")

## Now let's define the profile likelihood of the same distribution

## returns a vector of likelihood values over a given range of mu values

like_norm <- function(mu, sigma2, y) {
  # create empty vector
  likelihood.vector <- vector(mode = "numeric", length = length(mu))
  
  # for each mu, calculate the likelihood value
  for (i in 1:length(likelihood.vector)) {
    likelihood.vector[i] <- prod(dnorm(x = y, 
                                       mean = mu[i], 
                                       sd = sqrt(sigma2)))
  }
  
  # return vector of likelihood values spanning range of mu
  return(likelihood.vector)
}

#like_norm(mu=mu_range, sigma2 = sigma2, y = y)


tibble(
  like_norm =like_norm(mu=mu_range, sigma2 = sigma2, y = y),
  mu = mu_range
) %>% 
  ggplot(aes(y=like_norm,
             x=mu)) +
  geom_line() +
  geom_vline(xintercept = optim_mu$par,
             color="red",
             linetype="dashed")
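
Why switch from the likelihood to the log-likelihood? Multiplying many densities drives the product toward zero (numerical underflow once n gets large), while summing log densities stays numerically stable; and because the log is a monotone transformation, both curves peak at the same value of mu. A quick illustration (the larger resimulated sample y_big is my own addition, not part of the analysis above):

## with a much larger sample the raw likelihood underflows to 0,
## while the log-likelihood remains a finite (large negative) number
y_big <- rnorm(5000, mean = mu, sd = sqrt(sigma2))
prod(dnorm(y_big, mean = mu, sd = sqrt(sigma2)))             # underflows to 0
sum(dnorm(y_big, mean = mu, sd = sqrt(sigma2), log = TRUE))  # finite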

## returns a vector of log-likelihood values over a given range of mu values

loglike_norm <- function(mu, sigma2, y) {
  # create empty vector
  loglikelihood.vector <- vector(mode = "numeric", length = length(mu))
  
  # for each mu, calculate the log-likelihood value
  for (i in 1:length(loglikelihood.vector)) {
    loglikelihood.vector[i] <- sum(dnorm(x = y, 
                                         mean = mu[i], 
                                         sd = sqrt(sigma2), 
                                         log = TRUE))
  }
  
  # return vector of log-likelihood values spanning the range of mu
  return(loglikelihood.vector)
}

#loglike_norm(mu=mu_range, sigma2 = sigma2, y = y)



## create dataset for visualization


df_vis <-
  tibble(SSR = SSR(mu=mu_range,y=y),
         like_norm =like_norm(mu=mu_range, sigma2 = sigma2, y = y),
         loglike_norm =loglike_norm(mu=mu_range, sigma2 = sigma2, y = y),
         mu = mu_range)

df_vis |> 
  pivot_longer(cols = c("SSR":"loglike_norm"),
               names_to = "profile",
               values_to = "value") |> 
  ggplot(aes(y=value,
             x=mu)) +
  geom_line() +
  geom_vline(xintercept = optim_mu$par,
             linetype = "dashed",
             color="red") +
  facet_wrap(~profile, scales = "free") +
  theme_minimal()
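
One caveat: in the curves above, sigma2 was held fixed at its true simulated value, so the plotted likelihood is really a slice at the known variance. A genuine profile likelihood for mu would maximize over sigma2 at each fixed mu; for the normal model that inner maximization has the closed form sigma2_hat(mu) = mean((y - mu)^2). A minimal sketch (the function profile_loglike_norm is my own, not part of the code above):

## profile log-likelihood for mu: sigma2 is maximized out at each fixed mu
profile_loglike_norm <- function(mu, y) {
  sapply(mu, function(m) {
    sigma2_hat <- mean((y - m)^2)   # closed-form MLE of sigma2 given mu
    sum(dnorm(x = y, mean = m, sd = sqrt(sigma2_hat), log = TRUE))
  })
}

## the profile peaks at (essentially) the same mu_hat found by lm() and optim()
mu_range[which.max(profile_loglike_norm(mu = mu_range, y = y))]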

Heteroskedastic Normal Example

In this exercise we are going to generate heteroskedastic normal data, fit it with OLS using lm(), and then fit it by maximum likelihood using optim().

Generating heteroskedastic normal data

  • Think back to linear regression, where you model \(\text{outcome}_i\) as \(\beta_0 + x_{1, i} \beta_1 + x_{2, i} \beta_2 + \dots + x_{k, i} \beta_k\), for person \(i\) and a total of \(k\) covariates

  • An efficient way is to store all covariates in a matrix \(\mathbf{X}\) (think of your Excel spreadsheet or .csv) and all coefficients in a vector \(\boldsymbol{\beta}\).

  • Let’s say we have one constant and two covariates, i.e. three \(\beta\): \[ \mathbf{X} = \left[\begin{array}{rrr} x_{0} & x_{1, 1} & x_{2, 1} \\ x_{0} & x_{1, 2} & x_{2, 2} \\ x_{0} & x_{1, 3} & x_{2, 3} \\ \vdots & \vdots & \vdots \\ x_{0} & x_{1, i} & x_{2, i} \\ \end{array}\right] \quad \boldsymbol{\beta} = \left[\begin{array}{r} \beta_0 \\ \beta_1 \\ \beta_2 \\ \end{array}\right] \]

  • Matrix multiplication generates another matrix of systematic components easily:

\[ \mathbf{X \boldsymbol{\beta}} = \left[\begin{array}{c} x_{0} \cdot \beta_0 + x_{1, 1} \cdot \beta_1 + x_{2, 1} \cdot \beta_2 \\ x_{0} \cdot \beta_0 + x_{1, 2} \cdot \beta_1 + x_{2, 2} \cdot \beta_2 \\ x_{0} \cdot \beta_0 + x_{1, 3} \cdot \beta_1 + x_{2, 3} \cdot \beta_2 \\ \vdots \\ x_{0} \cdot \beta_0 + x_{1, i} \cdot \beta_1 + x_{2, i} \cdot \beta_2 \end{array}\right] \]

  • Fast rule: a \(\bf{4} \times \it{3}\) matrix multiplied by a \(\it{3} \times \bf{1}\) one \(\rightarrow\) a \(\bf{4} \times \bf{1}\) matrix (see the quick dimension check below)
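
A quick dimension check in R (the toy objects X_toy and beta_toy below are illustrative only, unrelated to the simulation that follows):

X_toy <- matrix(runif(12), nrow = 4, ncol = 3)   # a 4 x 3 covariate matrix
beta_toy <- c(0, 5, 15)                          # a length-3 coefficient vector
dim(X_toy %*% beta_toy)                          # 4 x 1, as the fast rule says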

Steps

  1. Set the number of observations to 1500 (\(n\))

  2. Generate two equivalent datasets (matrices \(\mathbf{x}\) and \(\mathbf{z}\)), both with one constant and two covariates

    • The constant takes the value of 1; two covariates are drawn from a uniform distribution with min = 0, max = 1
  3. Set a parameter vector (\(\boldsymbol{\beta}\)) for the mean, \(\mu_i\)

    • By “divine revelation”, we know the true values are 0, 5, 15
rm(list=ls())       # Clear memory
set.seed(123456)    # For reproducible random numbers
library(MASS)       # Load packages
#install.packages('simcf')   # note: simcf is not on CRAN; see Chris Adolph's website
library(simcf) 
library(tidyverse)

n <- 1500           # Generate 1500 observations

w0 <- rep(1, n)     # Create the constant
w1 <- runif(n)      # Create two covariates
w2 <- runif(n)

x <- cbind(w0, w1, w2)       # Create a matrix of the covariates
z <- x                       # i.e., same covariates affect mu and sigma

beta <- c(0, 5, 15) # Set a parameter vector for the mean
                    # One for constant, one for covariate 1, 
                    # one for covariate 2.

gamma <- c(1, 0, 3) # Set a parameter vector for the variance 
                    # The gamma on covariate 2 is set to 3, so the variance
                    # depends on w2 -- this creates heteroskedasticity 

  4. Set a parameter vector (\(\boldsymbol{\gamma}\)) for the variance, \(\sigma_i^2\), with heteroskedasticity

    • Again, we know the true values are 1, 0, 3
    • Pause and think: why is there heteroskedasticity?
  5. Create the systematic component for the mean (\(\boldsymbol{x}_{i}\boldsymbol{\beta}\))

  6. Create the systematic component for the variance (\(\text{exp}(\boldsymbol{z}_{i}\boldsymbol{\gamma})\))

    • Assume the same covariates affect both \(\mu_i\) and \(\sigma_i^2\), that is, \(\mathbf{x}\) and \(\mathbf{z}\) are the same
  7. Generate the outcome variable (\(y_{i}\))

  8. Save the data to a data frame

  9. Plot the data (a minimal sketch appears after the code below)

mu <- x %*% beta             # Create systematic component for the mean

sigma2 <- exp(z %*% gamma)   # Create systematic component for the variance
                             # The ith row of sigma2 = exp(1 + 0*w1_i + 3*w2_i),
                             # i.e., it is a function of w2 (heteroskedastic)

y <- rnorm(n = n,            # Create the outcome variable
           mean = mu,        # Think about the stochastic component!
           sd = sqrt(sigma2)
           ) 

data <- cbind(y, w1, w2)     # Save the data to a data frame
data <- as.data.frame(data)
names(data) <- c("y", "w1", "w2")
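
The final step, plotting the data, is not shown above; a minimal ggplot sketch that makes the heteroskedasticity visible:

## plot y against w2: the spread of y grows with w2 (heteroskedastic variance)
ggplot(data, aes(x = w2, y = y)) +
  geom_point(alpha = 0.3) +
  theme_minimal()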

The cleanest approach is to put the parameters in the correct matrix format, but %*% is smart enough to treat a vector as conformable when it is multiplied with a matrix.
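
To see this with the objects created above:

dim(x %*% beta)                      # 1500 x 1, even though beta is a plain vector
dim(x %*% matrix(beta, ncol = 1))    # same result with beta stored as a 3 x 1 matrix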

Fitting a model using OLS - lm()

Assume that, one day, the true values of the parameters become unknown to mankind, and we as scientists try to recover the “truth” by gathering and studying the data.

Motivation: if our statistical model is valid, it should be able to recover the true parameter values from the data we synthesized.

  1. Fit a model using least squares (function lm()) and regress the outcome variable on the two covariates

  2. Calculate and print the AIC (More in lecture)

ls.result <- lm(y ~ w1 + w2, data = data) # Fit a linear model
                                          # using simulated data
summary(ls.result)
## 
## Call:
## lm(formula = y ~ w1 + w2, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.5710  -2.3157  -0.0219   2.2124  21.9345 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.04867    0.28221  -0.172    0.863    
## w1           4.78421    0.37699  12.690   <2e-16 ***
## w2          15.68283    0.37112  42.258   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.17 on 1497 degrees of freedom
## Multiple R-squared:  0.5678, Adjusted R-squared:  0.5672 
## F-statistic: 983.3 on 2 and 1497 DF,  p-value: < 2.2e-16
ls.aic <- AIC(ls.result) # Calculate and print the AIC 
                         # i.e. Akaike Information Criterion
                         # to assess goodness of fit; lower AIC is better
print(ls.aic)
## [1] 8545.903
  • What do the above exercises tell us about statistical models and inference?
    • What do we mean when we say “fitting a model to the data”?
  • Data generating process (DGP): the “true, underlying” process that generates the data we observe
  • Model: an attempt to capture important aspects of, and approximate crudely, that DGP
    • We just stipulate the parameters and synthesize the data
    • It’ll be very worrying if our model can’t recover the truth…
  • Specifying a “correct” model for real-world phenomena is incredibly hard, but also humbling and, hopefully, exciting

Fitting the model using ML - optim()

  • Opening the black box: build our own regression machine using maximum likelihood

  • Lecture recap: How to derive maximum likelihood estimators in four easy steps:

    1. Express the joint probability of the data, using the chosen probability distribution

    2. Convert the joint probability to the likelihood (trivial, as they are proportional)

    3. Simplify the likelihood for easy maximization (take logs and reduce to “sufficient statistics”)

    4. Substitute in the systematic component

  • Then we find the set of parameters that maximize the likelihood

    • Using a numerical optimizer, optim(), with the quasi-Newton BFGS (gradient-based) method
  • Recap: our heteroskedastic normal model

  • Stochastic component: \[y_i \sim f_\mathcal{N}(\mu_{i}, \sigma_{i}^{2})\]

  • Systematic components: \[\mu_i = \boldsymbol{x}_{i} \boldsymbol{\beta}\] \[\sigma_{i}^{2} = \exp(\boldsymbol{z}_{i} \boldsymbol{\gamma})\]

  • MLE for a heteroskedastic normal data

\[ \begin{aligned} \mathrm{P}\left(\mathbf{y} | \boldsymbol{\mu}, \boldsymbol{\sigma}^{2}\right) &=\prod_{i=1}^{n} f_{\mathcal{N}}\left(y_{i} | \mu_{i}, \sigma_{i}^{2}\right) & [\text{Joint probability}] \\ \mathrm{P}\left(\mathbf{y} | \boldsymbol{\mu}, \boldsymbol{\sigma}^{2}\right) &=\prod_{i=1}^{n}\left(2 \pi \sigma_{i}^{2}\right)^{-1 / 2} \exp \left[\frac{-\left(y_{i}-\mu_{i}\right)^{2}}{2 \sigma_{i}^{2}}\right] & [\text{Expressed with the Normal density}]\\ \log \mathcal{L}\left(\boldsymbol{\mu}, \boldsymbol{\sigma}^{2} | \mathbf{y}\right) & \propto-\frac{1}{2} \sum_{i=1}^{n} \log \sigma_{i}^{2}-\frac{1}{2} \sum_{i=1}^{n} \frac{\left(y_{i}-\mu_{i}\right)^{2}}{\sigma_{i}^{2}} & [\text{Converted to log likelihood; simplified}] \\ \log \mathcal{L}(\boldsymbol{\beta}, \boldsymbol{\gamma} | \mathbf{y}) & \propto-\frac{1}{2} \sum_{i=1}^{n} \mathbf{z}_{i} \boldsymbol{\gamma}-\frac{1}{2} \sum_{i=1}^{n} \frac{\left(y_{i}-\mathbf{x}_{i} \boldsymbol{\beta}\right)^{2}}{\exp \left(\mathbf{z}_{i} \boldsymbol{\gamma}\right)} & [\text{Substitute in systematic components: } \log \exp(\mathbf{z}_{i} \boldsymbol{\gamma}) = \mathbf{z}_{i} \boldsymbol{\gamma}] \end{aligned} \]

###########IMPORTANT###########

# A likelihood function for ML heteroskedastic Normal
llk.hetnorm <- function(param, y, x, z) {
  x <- as.matrix(x)                      # x (some covariates) as a matrix for mu
  z <- as.matrix(z)                      # z (some covariates) as a matrix for variance 
  os <- rep(1, nrow(x))                  # Column of 1s: the constant (intercept)
  x <- cbind(os, x)                      # Add intercept to covariates x
  z <- cbind(os, z)                      # Add intercept to covariates z
  
  b <- param[1 : ncol(x)]                         # Parameters for x
  g <- param[(ncol(x) + 1) : (ncol(x) + ncol(z))] # Parameters for z
  
  xb <- x %*% b                          # Systematic components for mean
  s2 <- exp(z %*% g)                     # Systematic components for variance
  
  sum(0.5 * (log(s2) + (y - xb)^2 / s2)) # Negative log-likelihood (up to a constant)
                                         # optim() is a minimizer by default;
                                         # maximizing lnL means minimizing -lnL,
                                         # so the +/- signs are reversed
}

Think back:

\[ \begin{aligned} \log \mathcal{L}(\boldsymbol{\beta}, \boldsymbol{\gamma} | \mathbf{y}) &\propto-\frac{1}{2} \sum_{i=1}^{n} \mathbf{z}_{i} \boldsymbol{\gamma}-\frac{1}{2} \sum_{i=1}^{n} \frac{\left(y_{i}-\mathbf{x}_{i} \boldsymbol{\beta}\right)^{2}}{\exp \left(\mathbf{z}_{i} \boldsymbol{\gamma}\right)}\\ &= -1 \times \sum_{i=1}^{n} 0.5 \times \left(\log \sigma_{i}^{2} + \frac{\left(y_{i}-\mathbf{x}_{i} \boldsymbol{\beta}\right)^{2}}{\sigma_{i}^{2}}\right), \qquad \text{where } \sigma_{i}^{2} = \exp(\mathbf{z}_{i} \boldsymbol{\gamma}) \end{aligned} \]

Important: optim() is a minimizer by default, so we reverse the +/- signs (minimize -lnL) to find the maximum of lnL

###########IMPORTANT###########

# Create input matrices
xcovariates <- cbind(w1, w2)
zcovariates <- cbind(w1, w2)

# Initial guesses of beta0, beta1, ..., gamma0, gamma1, ...
# We need one entry per parameter, in order!
# Note: also include beta and gamma estimates for the constants
stval <- c(0, 0, 0, 0, 0, 0)  

# Run ML, and get the output we need
hetnorm.result <- optim(par = stval,           # Initial guesses
                        fn = llk.hetnorm,  # Objective: the negative log-likelihood
                        method = "BFGS", # Gradient method
                        hessian = TRUE,  # Return Hessian matrix 
                        y = y,           # Outcome variable
                        x = xcovariates, # Covariates x (w/o constant) - for mean
                        z = zcovariates  # Covariates z (w/o constant) - for variance 
                        ) 

Note: By default, optim() performs a minimizing procedure. You can make optim a maximizer by adding control = list(fnscale = -1)
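
As a sketch of the maximizer version (the sign-flipping wrapper loglike.hetnorm is my own addition; llk.hetnorm above returns -lnL, so it has to be negated before handing it to a maximizer):

loglike.hetnorm <- function(param, y, x, z) -llk.hetnorm(param, y, x, z)  # +lnL (up to a constant)

hetnorm.result.max <- optim(par = stval,
                            fn = loglike.hetnorm,
                            method = "BFGS",
                            control = list(fnscale = -1),  # tell optim() to maximize fn
                            y = y, x = xcovariates, z = zcovariates)

hetnorm.result.max$value   # should match -hetnorm.result$value (no sign flip needed)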

pe <- hetnorm.result$par            # Point estimates - three parameters for mean, three for variance
round(pe, 3)
## [1] -0.167  5.007 15.698  0.910  0.224  2.994
vc <- solve(hetnorm.result$hessian) # invert -> Variance-covariance matrix (for computing s.e.)
round(vc, 5)
##          [,1]     [,2]     [,3]     [,4]     [,5]     [,6]
## [1,]  0.02909 -0.03265 -0.02810  0.00059 -0.00087 -0.00032
## [2,] -0.03265  0.06787 -0.00025 -0.00091  0.00099  0.00084
## [3,] -0.02810 -0.00025  0.10331 -0.00056  0.00143 -0.00033
## [4,]  0.00059 -0.00091 -0.00056  0.00920 -0.00832 -0.00749
## [5,] -0.00087  0.00099  0.00143 -0.00832  0.01693 -0.00042
## [6,] -0.00032  0.00084 -0.00033 -0.00749 -0.00042  0.01568
se <- sqrt(diag(vc))                # To compute standard errors (s.e.),
                                    # take the diagonal of the variance-covariance
                                    # matrix (the inverted Hessian), then the square root
round(se, 3)
## [1] 0.171 0.261 0.321 0.096 0.130 0.125
-1*hetnorm.result[["value"]]       # optim minimized -lnL, so multiply by -1 to get the maximized log-likelihood
## [1] -2620.334
mle.result <- cbind(pe[1:3], se[1:3])         # Report ML results 
colnames(mle.result) <- c("Estimate", "Std.Error")
rownames(mle.result) <- c("(Intercept)", "w1", "w2")
round(mle.result, 3)
##             Estimate Std.Error
## (Intercept)   -0.167     0.171
## w1             5.007     0.261
## w2            15.698     0.321
round(coef(summary(ls.result))[, c(1, 2)], 3) # Compare with lm() results
##             Estimate Std. Error
## (Intercept)   -0.049      0.282
## w1             4.784      0.377
## w2            15.683      0.371
ll <- -hetnorm.result$value                   # Log-likelihood at the maximum 
                                              # (the negative sign would be unnecessary 
                                              # if optim were set up as a maximizer)
ll
## [1] -2620.334
# AIC is 2 * number of parameters - 2 * ll (the log-likelihood at the maximum)
hetnorm.aic <- 2 * length(stval) - 2 * ll  

# Compare AIC from ML and from lm(); lower is better
print(hetnorm.aic)
## [1] 5252.668
print(ls.aic)
## [1] 8545.903
## The ML estimates closely recover the true parameter values, and the AIC is far lower than the lm() fit -> why we use MLE here
