Exploring data on Halloween candy preferences
This project uses candy ranking data from the fivethirtyeight package to perform exploratory data analysis, then applies linear and logistic regression to find insights in the data.
# Load all the packages
library(dplyr)
library(tidyr)
library(ggplot2)
library(broom)
library(corrplot)
library(fivethirtyeight)
library(car)
# Load the candy_rankings dataset from the fivethirtyeight package
data(candy_rankings)
# Take a glimpse() at the dataset
glimpse(candy_rankings)
Rows: 85
Columns: 13
$ competitorname   <chr> "100 Grand", "3 Musketeers", "One dime", "One quarter…
$ chocolate        <lgl> TRUE, TRUE, FALSE, FALSE, FALSE, TRUE, TRUE, FALSE, F…
$ fruity           <lgl> FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE…
$ caramel          <lgl> TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE,…
$ peanutyalmondy   <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, …
$ nougat           <lgl> FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE,…
$ crispedricewafer <lgl> TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
$ hard             <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALS…
$ bar              <lgl> TRUE, TRUE, FALSE, FALSE, FALSE, TRUE, TRUE, FALSE, F…
$ pluribus         <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE…
$ sugarpercent     <dbl> 0.732, 0.604, 0.011, 0.011, 0.906, 0.465, 0.604, 0.31…
$ pricepercent     <dbl> 0.860, 0.511, 0.116, 0.511, 0.511, 0.767, 0.767, 0.51…
$ winpercent       <dbl> 66.97173, 67.60294, 32.26109, 46.11650, 52.34146, 50.…
The competitorname variable holds the name of each type of candy. There are 85 different candies represented. The first several are listed below.
head(candy_rankings$competitorname, 9)
- '100 Grand'
- '3 Musketeers'
- 'One dime'
- 'One quarter'
- 'Air Heads'
- 'Almond Joy'
- 'Baby Ruth'
- 'Boston Baked Beans'
- 'Candy Corn'
The dataset has several categorical variables representing descriptive characteristics of the candy, encoded as TRUE/FALSE values. The chart below shows their distributions.
# Gather the categorical variables to make them easier to plot
candy_rankings_long <- candy_rankings %>% gather("feature", "value", chocolate:pluribus)
# Make a bar plot showing the distribution of each variable
ggplot(candy_rankings_long, aes(value)) + geom_bar() + facet_wrap(~feature)
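Note: gather() still works but is superseded in tidyr. Assuming a tidyr version of 1.0.0 or later, the same reshape can be written with pivot_longer(), as in this sketch:
# Equivalent reshape with pivot_longer() (tidyr >= 1.0.0)
candy_rankings_long <- candy_rankings %>%
  pivot_longer(chocolate:pluribus, names_to = "feature", values_to = "value")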
The pricepercent variable records the percentile rank of each candy's price against all the other candies in the dataset. The chart below ranks the candies from most expensive to least expensive.
# Make a lollipop chart of pricepercent
options(repr.plot.width= 12, repr.plot.height= 15)
ggplot(candy_rankings, aes(reorder(competitorname, pricepercent), pricepercent)) +
geom_segment(aes(xend = reorder(competitorname, pricepercent), yend = 0)) +
geom_point() + coord_flip() + xlab("Candy Name") + ylab("Price Percentile")
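The extremes can also be read off the data directly rather than from the chart. This is a supplementary check, not part of the original analysis:
# The most expensive candies by price percentile
candy_rankings %>%
  arrange(desc(pricepercent)) %>%
  select(competitorname, pricepercent) %>%
  head(3)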
The winpercent variable is the percentage of head-to-head matchups in which people preferred this candy over another randomly chosen candy from the dataset. Below is a histogram of winpercent showing a roughly symmetrical distribution centered at around 50%.
# Plot a histogram of winpercent
options(repr.plot.width= 6, repr.plot.height= 6)
ggplot(candy_rankings, aes(winpercent)) + geom_histogram(bins= 10)
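A quick numeric check (a supplement to the histogram) pins down the center of the distribution:
# Numeric summary of winpercent to locate the center of the distribution
summary(candy_rankings$winpercent)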
The chart below shows the candy preferences, as measured by winpercent, ranked from highest to lowest.
# Make a lollipop chart of winpercent
options(repr.plot.width= 12, repr.plot.height= 15)
ggplot(candy_rankings, aes(reorder(competitorname, winpercent), winpercent)) +
geom_segment(aes(xend = reorder(competitorname, winpercent), yend = 0)) +
geom_point() + coord_flip() + xlab("Candy Name") + ylab("Win Percent")
A correlation matrix can provide insight into how the variables relate to one another. If any of the explanatory variables are highly correlated, they may cause problems for linear and logistic regression models due to multicollinearity.
# Plot the correlation matrix
options(repr.plot.width= 8, repr.plot.height= 8)
candy_rankings %>% select(-competitorname) %>% cor() %>% corrplot()
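As a supplementary check not in the original analysis, the strongest pairwise correlations among the explanatory variables can also be read off numerically. The sketch below flattens the correlation matrix (dropping winpercent, the response) and lists the largest values; each pair appears twice, once per ordering:
# Flatten the correlation matrix and rank pairs by absolute correlation
cor_mat <- candy_rankings %>% select(-competitorname, -winpercent) %>% cor()
as.data.frame(as.table(cor_mat)) %>%
  filter(Var1 != Var2) %>%
  arrange(desc(abs(Freq))) %>%
  head(4)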
Linear Regression
# Fit a linear model of winpercent explained by all variables except competitorname
win_mod <- lm(winpercent ~ . -competitorname, data= candy_rankings)
As can be seen from the output below, only three coefficients are significant at the 0.05 level or better. These indicate the characteristics that most affect winpercent; in other words, the characteristics that drive candy preferences.
summary(win_mod)
Call:
lm(formula = winpercent ~ . - competitorname, data = candy_rankings)

Residuals:
     Min       1Q   Median       3Q      Max 
-20.2244  -6.6247   0.1986   6.8420  23.8680 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)           34.5340     4.3199   7.994 1.44e-11 ***
chocolateTRUE         19.7481     3.8987   5.065 2.96e-06 ***
fruityTRUE             9.4223     3.7630   2.504  0.01452 *  
caramelTRUE            2.2245     3.6574   0.608  0.54493    
peanutyalmondyTRUE    10.0707     3.6158   2.785  0.00681 ** 
nougatTRUE             0.8043     5.7164   0.141  0.88849    
crispedricewaferTRUE   8.9190     5.2679   1.693  0.09470 .  
hardTRUE              -6.1653     3.4551  -1.784  0.07852 .  
barTRUE                0.4415     5.0611   0.087  0.93072    
pluribusTRUE          -0.8545     3.0401  -0.281  0.77945    
sugarpercent           9.0868     4.6595   1.950  0.05500 .  
pricepercent          -5.9284     5.5132  -1.075  0.28578    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 10.7 on 73 degrees of freedom
Multiple R-squared:  0.5402,	Adjusted R-squared:  0.4709 
F-statistic: 7.797 on 11 and 73 DF,  p-value: 9.504e-09
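Since broom is already loaded, one convenient way to isolate the significant terms is tidy() plus a filter. This sketch uses the 0.05 cutoff from the text and should return the three significant characteristics:
# Pull out the terms significant at the 0.05 level
tidy(win_mod) %>% filter(term != "(Intercept)", p.value < 0.05)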
The vif() function from the car package can be used to check for multicollinearity. A common rule of thumb is that values below 5 indicate no serious problem.
round(vif(win_mod), 2)
       chocolate           fruity          caramel   peanutyalmondy 
            2.77             2.60             1.37             1.33 
          nougat crispedricewafer             hard              bar 
            1.83             1.56             1.29             3.54 
        pluribus     sugarpercent     pricepercent 
            1.71             1.27             1.82 
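A one-line check confirms that every value sits under the rule-of-thumb threshold:
# TRUE if no explanatory variable has a VIF of 5 or more
all(vif(win_mod) < 5)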
If the model is a good fit, the residuals should look like "white noise": balanced evenly around the zero line with no visible pattern. Based on the plot below, the model appears to fit well.
# Plot the residuals vs the fitted values
ggplot(augment(win_mod), aes(.fitted, .resid)) + geom_point() + geom_hline(yintercept = 0)
Logistic Regression
Below, logistic regression is used to predict whether a candy is chocolate based on all the other variables (except competitorname).
Fitting the model produces a warning because a few of the features (like crispedricewafer) are only ever true when a candy is also chocolate, so those features perfectly separate the outcome. This means we cannot draw reliable conclusions from the coefficients, but we can still use the model to make predictions.
The model makes predictions with an accuracy of 96%.
# Fit a glm() of chocolate
choc_mod <- glm(chocolate ~ . - competitorname, family= "binomial", data= candy_rankings)
Warning message: "glm.fit: fitted probabilities numerically 0 or 1 occurred"
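A quick cross-tabulation illustrates the separation described above (a supplementary check, not part of the original analysis):
# Cross-tabulate crispedricewafer against chocolate: per the text, every
# crispedricewafer candy is also chocolate, which is what triggers the warning
candy_rankings %>% count(crispedricewafer, chocolate)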
Only two of the coefficients are significant. Note the negative value for fruityTRUE: chocolate and fruity are negatively correlated, which is also visible in the correlation matrix above.
summary(choc_mod)
Call:
glm(formula = chocolate ~ . - competitorname, family = "binomial", 
    data = candy_rankings)

Coefficients:
                       Estimate Std. Error z value Pr(>|z|)   
(Intercept)           -10.29370    4.12040  -2.498  0.01248 * 
fruityTRUE             -6.75305    2.20462  -3.063  0.00219 **
caramelTRUE            -1.85093    1.66750  -1.110  0.26700   
peanutyalmondyTRUE     -4.11907    2.98175  -1.381  0.16715   
nougatTRUE            -16.74818 3520.13323  -0.005  0.99620   
crispedricewaferTRUE   14.98331 4725.35051   0.003  0.99747   
hardTRUE                1.83504    1.80742   1.015  0.30997   
barTRUE                19.06799 3520.13379   0.005  0.99568   
pluribusTRUE            0.22804    1.45457   0.157  0.87542   
sugarpercent            0.12168    2.07707   0.059  0.95329   
pricepercent            1.76626    2.24816   0.786  0.43208   
winpercent              0.23019    0.08593   2.679  0.00739 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 116.407  on 84  degrees of freedom
Residual deviance:  25.802  on 73  degrees of freedom
AIC: 49.802

Number of Fisher Scoring iterations: 19
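To see that negative relationship as a single number (a supplementary check), the correlation between the two indicator variables can be computed directly:
# Correlation between the chocolate and fruity indicators
cor(as.numeric(candy_rankings$chocolate), as.numeric(candy_rankings$fruity))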
# Make a data frame of predictions
preds <- augment(choc_mod, type.predict= "response") %>% mutate(prediction= .fitted > .5)
# Create the confusion matrix
conf_mat <- preds %>% select(chocolate, prediction) %>% table()
# Calculate the accuracy
accuracy <- sum(diag(conf_mat))/sum(conf_mat)
round(accuracy, 2)
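For reference, printing the conf_mat object computed above shows how the correct and incorrect predictions break down across the two classes:
# Display the confusion matrix behind the accuracy figure
conf_mat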