Prerequisite

rm(list = ls()) # Clear memory

library(tidyverse) # Load package
## Warning: package 'tidyverse' was built under R version 4.0.5
## Warning: package 'ggplot2' was built under R version 4.0.5
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## Warning: package 'forcats' was built under R version 4.0.5

Vector Practice

  1. vector1 : The numbers one through five and then the number six five times
  2. vector2 : 10 randomly drawn numbers from a normal distribution with a mean of 10 and a s.d. of 1
  3. vector3 : Results of 10 single binomial trials with a probability of 0.4
  4. vector4 : Sample 100 observations from a 5-trial binomial distribution with a probability of success of 0.4
  5. vector5 : The numbers one through three and the word apple
# 1. One through five, then the number six five times.
#    NOTE: rep() has no `n` argument — the original rep(x = 6, n = 5)
#    silently ignored `n` and produced a single 6. The argument is `times`.
vector1 <- c(1:5, rep(x = 6, times = 5))
# 2. Ten draws from Normal(mean = 10, sd = 1)
vector2 <- rnorm(n = 10, mean = 10, sd = 1)
# 3. Ten single Bernoulli trials (size = 1) with success probability 0.4
vector3 <- rbinom(n = 10, size = 1, prob = 0.4)
# 4. 100 draws from a Binomial(size = 5, prob = 0.4) distribution
vector4 <- rbinom(n = 100, size = 5, prob = 0.4)
# 5. The numbers one through three and the word "apple" (per the prompt);
#    mixing numbers with a string coerces every element to character
vector5 <- c(1:3, "apple")
  1. What type of data is vector2?
  2. Round vector2 to two decimal places
  3. What happened in vector5?
class(vector2) # rnorm() returns doubles, so the class is "numeric"
## [1] "numeric"
round(vector2, digits=2) # round (not ceiling) to two decimal places
##  [1]  9.29 10.11  8.08  9.84 10.48  9.21  9.94 10.20  8.64 10.45
vector5 # mixing a string with numbers coerced every element to character
## [1] "1"     "2"     "3"     "4"     "5"     "apple"

Matrices Practice

  1. matrix1: Create 5 by 5 matrix containing all NAs
  2. Assign matrix1 the row names (a,b,c,d,e) and the column names (1,2,3,4,5)
  3. Replace the NAs in the first column of matrix1 with “Inf”
# 1. A 5x5 matrix of NAs (an all-NA matrix is logical until a number is assigned)
matrix1 <- matrix(data = NA, nrow = 5, ncol = 5)
# 2. Row names a-e; column names 1-5 (dimnames are always stored as character)
rownames(matrix1) <- c("a", "b", "c", "d", "e")
colnames(matrix1) <- c(1, 2, 3, 4, 5)
# 3. Fill the first column with Inf.
#    NOTE: the original rep(x = Inf, n = 5) relied on a non-existent `n`
#    argument and only worked because the single Inf was recycled; the
#    correct argument is `times`.
matrix1[, 1] <- rep(x = Inf, times = 5) # or just matrix1[, 1] <- Inf

List Practice

  1. Create a list list1 that contains vector1, vector2, vector3, and matrix1
  2. Name each list component as vector1, vector2, vector3, and matrix1 respectively
  3. Locate vector2 from the list
# Build the list with its component names in one step instead of
# assigning names() afterwards — the result is identical
list1 <- list(vector1 = vector1,
              vector2 = vector2,
              vector3 = vector3,
              matrix1 = matrix1)
# Extract the vector2 component by name
list1$vector2
##  [1]  9.288917 10.107441  8.076768  9.838950 10.475953  9.205911  9.942233
##  [8] 10.203666  8.638666 10.451385

Data Frames Practice 1

Working directory

Check if your working directory is correct (where you have saved lab1_data.csv)

# Record the current working directory (should contain lab1_data.csv)
basedir <- getwd()
# file.path() is the idiomatic way to build a path (same result as
# paste(..., sep = "/")); replace the placeholder with your folder name
rowdata.folder <- file.path(basedir, "Specify if you create a folder")

1. Load lab1_data.csv in R

# Load data from the working directory; stringsAsFactors = FALSE keeps
# text columns as character (this is also the default since R 4.0)
data <- read.csv("lab1_data.csv", header = TRUE, stringsAsFactors = FALSE)
# ?read.csv

2. What is the data structure? What does that tell us about type?

# Check structure: dimensions, class, and explicit type predicates
dim(data)
## [1] 1914    4
class(data)
## [1] "data.frame"
is.data.frame(data)
## [1] TRUE
is.matrix(data) # a data frame is not a matrix (columns may differ in type)
## [1] FALSE
# Alternatively, str() shows the dimensions plus each column's type at once
str(data)
## 'data.frame':    1914 obs. of  4 variables:
##  $ country                                 : chr  "Antigua and Barbuda" "Antigua and Barbuda" "Antigua and Barbuda" "Antigua and Barbuda" ...
##  $ Year                                    : int  2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 ...
##  $ GDP.per.capita.PPP.current.international: num  12346 12655 12960 13699 14866 ...
##  $ polity2                                 : int  NA NA NA NA NA NA NA NA NA NA ...

3. Check the names and summary statistics of the data. Fix any names that are less than good.

# Check and fix names
names(data)
## [1] "country"                                 
## [2] "Year"                                    
## [3] "GDP.per.capita.PPP.current.international"
## [4] "polity2"
# Shorten the unwieldy GDP column name (position 3 per the listing above)
names(data)[3] <- "gdp.per.cap"

names(data) # Check again
## [1] "country"     "Year"        "gdp.per.cap" "polity2"
# Summary statistics — note the NA counts in gdp.per.cap and polity2
summary(data)
##    country               Year       gdp.per.cap         polity2       
##  Length:1914        Min.   :2000   Min.   :  219.2   Min.   :-10.000  
##  Class :character   1st Qu.:2002   1st Qu.: 1625.0   1st Qu.: -4.000  
##  Mode  :character   Median :2005   Median : 4299.2   Median :  5.000  
##                     Mean   :2005   Mean   : 7874.9   Mean   :  2.431  
##                     3rd Qu.:2008   3rd Qu.: 9818.6   3rd Qu.:  8.000  
##                     Max.   :2010   Max.   :91712.3   Max.   : 10.000  
##                                    NA's   :373       NA's   :542

4. Remove observations with missing values

# Remove rows with any missing value. This is listwise deletion: the whole
# observation is dropped even if only one variable is NA.
dataClean <- na.omit(data) # listwise deletion!!

dim(data)
## [1] 1914    4
dim(dataClean) # 609 rows were dropped
## [1] 1305    4

5. Calculate the average GDP per capita for Brazil for the observed period. Repeat the calculation for all countries.

# Base R: logical row subset for Brazil, then average the GDP column
mean(dataClean[dataClean$country == "Brazil", "gdp.per.cap"])
## [1] 8530.801
# Tidy way — naming the summary column avoids the awkward
# auto-generated name `mean(gdp.per.cap)`
dataClean %>%
  filter(country == "Brazil") %>%
  summarize(mean_gdp = mean(gdp.per.cap))
# Average gdp.per.cap for every country at once
dataClean %>%
  group_by(country) %>%
  summarize(mean_gdp = mean(gdp.per.cap))

6. Plot GDP per capita (on the x-axis) and polity2 (on the y-axis).

# Base graphics: raw GDP per capita is heavily right-skewed, so points bunch up
plot(x = dataClean$gdp.per.cap, 
     y = dataClean$polity2)

# Try logging GDP to spread out the x-axis
plot(x = log(dataClean$gdp.per.cap), 
     y = dataClean$polity2,
     xlab = "Logged GDP per capita",
     ylab = "Polity2")

# ggplot2 equivalent with axis labels and a minimal theme
ggplot(dataClean, aes(y = polity2, x = log(gdp.per.cap))) +
  geom_point() +
  labs(x = "Logged GDP per capita", y = "Polity2") +
  theme_classic()

7. Create a new variable called “democracy”. Assign 0 to countries with negative value or zero polity2 score, and assign 1 to countries with positive score.

# Create a variable called "democracy": 0 if polity2 <= 0, 1 if polity2 > 0
dataClean$democracy <- NA
head(dataClean)

# You can subset data based on a logical statement 
dataClean$polity2 <= 0

dataClean[dataClean$polity2 <= 0, ]

# Take advantage of this: assign values to "democracy" based on polity2 values
dataClean$democracy[dataClean$polity2 <= 0] <- 0

# Do the same for positive Polity2 scores
dataClean$democracy[dataClean$polity2 > 0] <- 1

# Tidy way. Matching polity2 > 0 explicitly (instead of a catch-all TRUE)
# keeps any NA polity2 as NA rather than silently coding it as a democracy.
# dataClean has no NAs here, so the result is identical.
dataClean %>%
  mutate(democracy = case_when(polity2 <= 0 ~ 0,
                               polity2 > 0  ~ 1))

8. Use a loop to do the same coding.

dataClean$democracy <- NA # reset the variable before re-coding

n <- nrow(dataClean) # one loop iteration per row

# Loop with if/else: code each row's democracy from its polity2 score
for (i in seq_len(n)) {

  if (dataClean$polity2[i] <= 0) {
    dataClean$democracy[i] <- 0
  } else {
    dataClean$democracy[i] <- 1
  }

}

## or with ifelse(). NOTE: the original version had two bugs —
## (1) it assigned to the WHOLE column (dataClean$democracy <- ...) each
##     iteration, so every pass overwrote all rows and only the last
##     row's polity2 value mattered, and
## (2) the outcomes were inverted (1 for polity2 <= 0, 0 otherwise).
## Assign element i and use the 0/1 coding defined in the exercise:

for (i in seq_len(n)) {

  dataClean$democracy[i] <- ifelse(dataClean$polity2[i] <= 0, 0, 1)

}

Data Frames Practice 2

1. Read in the data “lab1_survey.csv”

# Clear the workspace and load the survey data
rm(list = ls())

# NOTE(review): read.csv's `encoding` only declares how to mark input
# strings; `fileEncoding` is what actually re-encodes a file on read —
# confirm which was intended here.
survey_data <- read.csv(file = "lab1_survey.csv", 
                        encoding="UTF-8") # is this a good idea?

2. Inspect and view the data. What format are they in? What values do the data take, and how do those values correspond with the survey?

View(survey_data) # opens the spreadsheet-style data viewer (interactive/RStudio only)

2.1. How do you want to load properly and change the format into a plain data frame format?

# Re-read without header = TRUE so the item names (first row of the raw
# file) are kept as data; the raw file stores items in rows and
# respondents in columns
survey_data <- read.table(file = "lab1_survey.csv", sep=",",
                          encoding="UTF-8")

survey_data <- t(survey_data) # transpose: respondents become rows

items <- survey_data[1,] # save column names for items (the old header row)

survey_data <- survey_data[-1,] # delete the name column

n <- nrow(survey_data) # the number of total respondents
id <- 1:n # id column

rownames(survey_data) <- id
colnames(survey_data) <- items

# t() produced a character matrix, so convert back to a data frame...
survey_data <- as.data.frame(survey_data)

# ...and coerce all elements from character into numeric
survey_data <- survey_data %>% 
  mutate_all(function(x) as.numeric(as.character(x)))

Need a shorter version?

# Same reshape in one pipeline: name the columns item + ids 1..19, melt
# to long form (one row per id x item), cast back to wide with items as
# columns, order rows by numeric id, and drop the helper id column.
# NOTE(review): gather()/spread() are superseded by pivot_longer()/
# pivot_wider(); kept here because spread() sorts the item columns
# alphabetically, which this output relies on.
survey_data_2 <- read.table(file = "lab1_survey.csv", sep=",",
                          encoding="UTF-8") %>% 
  setNames(., c("item",1:19))  %>% 
  gather(key=id, value=response, 2:20) %>% 
  spread(key=item, value=response) %>% 
  arrange(as.numeric(id)) %>% 
  select(-id)

# Compare the results: respondent 13's total score matches in both versions
survey_data[13,] %>% sum()
## [1] 76
survey_data_2[13,] %>% sum()
## [1] 76

3. Generate some summary statistics.

# The first item name came through mangled on import (presumably an
# encoding/BOM artifact — verify against the raw file); set it to "logit"
# in both copies of the data
colnames(survey_data)[1] <- colnames(survey_data_2)[1] <- "logit"

summary(survey_data)
##      logit           probit      ordered probit  Bayes Theorem  
##  Min.   :2.000   Min.   :1.000   Min.   :1.000   Min.   :2.000  
##  1st Qu.:3.000   1st Qu.:2.000   1st Qu.:1.500   1st Qu.:2.000  
##  Median :3.000   Median :3.000   Median :2.000   Median :2.000  
##  Mean   :3.158   Mean   :2.526   Mean   :2.105   Mean   :2.684  
##  3rd Qu.:3.000   3rd Qu.:3.000   3rd Qu.:3.000   3rd Qu.:3.000  
##  Max.   :5.000   Max.   :4.000   Max.   :4.000   Max.   :4.000  
##  Maximum likelihood Negative Binomial Poisson distribution
##  Min.   :1.000      Min.   :1.000     Min.   :1.000       
##  1st Qu.:2.000      1st Qu.:2.000     1st Qu.:2.500       
##  Median :2.000      Median :2.000     Median :3.000       
##  Mean   :2.211      Mean   :2.211     Mean   :2.842       
##  3rd Qu.:2.500      3rd Qu.:3.000     3rd Qu.:3.000       
##  Max.   :4.000      Max.   :3.000     Max.   :4.000       
##  zero-inflated Poisson       R             LATEX       stochastic simulation
##  Min.   :1.000         Min.   :3.000   Min.   :1.000   Min.   :1.000        
##  1st Qu.:1.000         1st Qu.:3.000   1st Qu.:2.000   1st Qu.:1.000        
##  Median :2.000         Median :3.000   Median :2.000   Median :1.000        
##  Mean   :1.842         Mean   :3.474   Mean   :2.421   Mean   :1.526        
##  3rd Qu.:3.000         3rd Qu.:4.000   3rd Qu.:3.000   3rd Qu.:2.000        
##  Max.   :3.000         Max.   :4.000   Max.   :4.000   Max.   :3.000        
##       GLM        random effects  fixed effects   selection bias 
##  Min.   :1.000   Min.   :2.000   Min.   :2.000   Min.   :2.000  
##  1st Qu.:2.500   1st Qu.:2.000   1st Qu.:2.000   1st Qu.:3.000  
##  Median :3.000   Median :2.000   Median :3.000   Median :4.000  
##  Mean   :2.842   Mean   :2.526   Mean   :2.789   Mean   :3.421  
##  3rd Qu.:3.000   3rd Qu.:3.000   3rd Qu.:3.000   3rd Qu.:4.000  
##  Max.   :4.000   Max.   :4.000   Max.   :4.000   Max.   :4.000  
##  prior distribution cross-validation     tobit       heteroskedasticity
##  Min.   :1.0        Min.   :1.000    Min.   :1.000   Min.   :2.000     
##  1st Qu.:1.5        1st Qu.:1.000    1st Qu.:1.000   1st Qu.:3.000     
##  Median :2.0        Median :2.000    Median :1.000   Median :3.000     
##  Mean   :2.0        Mean   :1.947    Mean   :1.316   Mean   :3.105     
##  3rd Qu.:2.0        3rd Qu.:3.000    3rd Qu.:1.000   3rd Qu.:4.000     
##  Max.   :4.0        Max.   :4.000    Max.   :3.000   Max.   :4.000     
##  conditional logit hierarchical linear model gradient search   bootstrap    
##  Min.   :1.000     Min.   :1.000             Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.000     1st Qu.:2.000             1st Qu.:1.000   1st Qu.:2.000  
##  Median :1.000     Median :2.000             Median :1.000   Median :2.000  
##  Mean   :1.421     Mean   :1.947             Mean   :1.263   Mean   :2.474  
##  3rd Qu.:2.000     3rd Qu.:2.000             3rd Qu.:1.500   3rd Qu.:3.000  
##  Max.   :3.000     Max.   :4.000             Max.   :2.000   Max.   :4.000  
##  likelihood ratio      AIC        confidence interval first difference
##  Min.   :2.000    Min.   :1.000   Min.   :3.000       Min.   :1.000   
##  1st Qu.:2.000    1st Qu.:2.000   1st Qu.:3.500       1st Qu.:1.000   
##  Median :2.000    Median :3.000   Median :4.000       Median :1.000   
##  Mean   :2.579    Mean   :2.526   Mean   :3.737       Mean   :1.316   
##  3rd Qu.:3.000    3rd Qu.:3.000   3rd Qu.:4.000       3rd Qu.:1.500   
##  Max.   :4.000    Max.   :4.000   Max.   :4.000       Max.   :3.000   
##  relative risk   expected value  multiple imputation
##  Min.   :1.000   Min.   :2.000   Min.   :1          
##  1st Qu.:1.500   1st Qu.:2.500   1st Qu.:2          
##  Median :2.000   Median :3.000   Median :2          
##  Mean   :2.421   Mean   :2.947   Mean   :2          
##  3rd Qu.:3.000   3rd Qu.:3.000   3rd Qu.:2          
##  Max.   :4.000   Max.   :4.000   Max.   :4
# Center and spread of self-reported familiarity with R vs LaTeX
mean(survey_data$R)
## [1] 3.473684
mean(survey_data$LATEX)
## [1] 2.421053
median(survey_data$R)
## [1] 3
median(survey_data$LATEX)
## [1] 2
sd(survey_data$R)
## [1] 0.5129892
sd(survey_data$LATEX)
## [1] 0.7685332
# Tidy way: one row per (item, statistic) pair. funs() was deprecated in
# dplyr 0.8.0 — passing a named list of functions is the documented
# replacement and produces the same columns (suffixes _mean, _median,
# ...) without the deprecation warning.
survey_data %>%
  summarize_all(list(mean = mean, median = median, sd = sd,
                     min = min, max = max)) %>%
  gather(key = "stat")
# Which item has the highest and the lowest mean value?
# Comparing against min/max directly is robust to ties and to the number
# of items; the original filtered on rank(value) == 1 | rank(value) == 30,
# which hard-codes 30 items and silently breaks if any means are tied
# (tied ranks become fractional averages).
survey_data %>%
  summarize_all(mean) %>%
  gather(key = "stat") %>% 
  filter(value == min(value) | value == max(value))

5. Are there any problems with the way the data are coded? (Think about lecture yesterday.)

6. Recode the data

# Map the 1-4 familiarity scale onto its survey labels, column by column
survey_data %>%
  mutate(# Recode R into categories 
         R_cat = case_when(R == 1 ~ "What's that?",
                           R == 2 ~ "I've heard of it",
                           R == 3 ~ "I can use it or apply it",
                           TRUE ~ "I understand it well"),
         # Recode latex into categories 
         LATEX_cat = case_when(LATEX == 1 ~ "What's that?",
                               LATEX == 2 ~ "I've heard of it",
                               LATEX == 3 ~ "I can use it or apply it",
                               TRUE ~ "I understand it well"))
# We're repeating ourselves... apply one recoding function to both columns.
# mutate_at() is superseded — mutate(across(...)) is the current idiom
# and yields the same result.
survey_data <- 
  survey_data %>%
  mutate(across(c(R, LATEX),
                function(x) case_when(x == 1 ~ "What's that?",
                                      x == 2 ~ "I've heard of it",
                                      x == 3 ~ "I can use it or apply it",
                                      TRUE ~ "I understand it well")))

7. Why is this coding method better?

8. Generate some plots of the data: bar charts are good here, scatterplots even better.

LaTeX in R Markdown

\[ 1 + 1 = 2 \]

\[ 11 \times 11 = 121 \]

\[ E = mc^2 \]

I think it’s Einstein who proposed \(E = mc^2\).

\[ x = \frac{-b \pm \sqrt{b^2 - 4ac}}{2a} \]

\[ \begin{split} X & = (x+a)(x-b) \\ & = x(x-b) + a(x-b) \\ & = x^2 + x(a-b) - ab \end{split} \] Chris recommends using LaTeX functions rather than dollar signs, such as:

\[\begin{equation} x = \frac{-b \pm \sqrt{b^2 - 4ac}}{2a} \end{equation}\]

\[\begin{equation} \textrm{response}_i = \alpha_i + \beta \mathrm{covariate}_i + \varepsilon_i \end{equation}\]

\[\begin{eqnarray} \textrm{response}_i & = & \alpha_i \\ && + \beta \mathrm{covariate}_i + \varepsilon_i \end{eqnarray}\]