# Problem 1

library(ggplot2)
library(dplyr)
library(car)

# a)

# Preview the first six rows of the Salaries data.
head(Salaries, n = 6L)
#    rank discipline yrs.since.phd yrs.service sex salary
# 1    Prof       B          19       18 Male 139750
# 2    Prof       B          20       16 Male 173200
# 3 AsstProf       B          4       3 Male 79750
# 4    Prof       B          45       39 Male 115000
# 5    Prof       B          40       41 Male 141500
# 6 AssocProf       B          6       6 Male 97000

# Column-wise summary: level counts for the factors and
# min/quartiles/mean/max for the numeric columns.
summary(object = Salaries)
#    rank discipline yrs.since.phd yrs.service    sex       salary
# AsstProf : 67 A:181    Min. : 1.00 Min. : 0.00 Female: 39 Min. : 57800
# AssocProf: 64 B:216    1st Qu.:12.00 1st Qu.: 7.00 Male :358 1st Qu.: 91000
# Prof :266          Median :21.00 Median :16.00             Median :107300
#                         Mean :22.31 Mean :17.61             Mean :113706
#                         3rd Qu.:32.00 3rd Qu.:27.00             3rd Qu.:134185
#                         Max. :56.00 Max. :60.00             Max. :231545

# The Salaries data has three categorical variables (rank, discipline, and sex),
# two continuous year variables (yrs.since.phd and yrs.service), and the
# continuous variable salary. The categorical variables have only 2 or 3 levels.
# There are far more male faculty (358) than female faculty (39).
# The sample size is 397.

# Pairwise correlation matrix of the three numeric columns.
Salaries %>%
  select(yrs.since.phd, yrs.service, salary) %>%
  cor()
#             yrs.since.phd yrs.service salary
# yrs.since.phd 1.0000000 0.9096491 0.4192311
# yrs.service    0.9096491 1.0000000 0.3347447
# salary          0.4192311 0.3347447 1.0000000

# The two continuous year variables are correlated with salary in a similar,
# moderately positive way (0.42 and 0.33), and they are highly correlated with
# each other (0.91). We may want to include only one of them in the model.

# Side-by-side boxplots of salary for each sex.
# Built with ggplot() + geom_boxplot() because qplot() is deprecated
# (since ggplot2 3.4.0); the resulting plot is the same.
ggplot(Salaries, aes(x = sex, y = salary)) +
  geom_boxplot()

# In the boxplot, the central salaries of the two sexes look fairly similar
# (note that a boxplot draws medians, not means), but salaries for male faculty
# are overall higher than those for female faculty.

# Scatterplot of salary against years since PhD, with points coloured by sex.
ggplot(
  data = Salaries,
  mapping = aes(x = yrs.since.phd, y = salary, colour = sex)
) +
  geom_point()

# In the scatterplot, salary and years since PhD appear to be positively correlated.
# The positive slope looks slightly smaller for female faculty members, but
# this may be due to the small sample of female faculty.

# Exercise 1. Consider the Hidalgo data set. Load the data, and calc...