### load the ELECTIONS sheet from the Excel workbook
elections <- readxl::read_excel(path = "state-data.xlsx", sheet = "elections")

### recode the categorical columns as factors
## unordered factors
elections[["candidate"]] <- factor(elections[["candidate"]])
elections[["gender"]] <- factor(elections[["gender"]])
## ordered factor: levels are listed from lowest to highest
elections[["educcat"]] <- ordered(
  elections[["educcat"]],
  levels = c(
    "lt high school", "high school", "junior college",
    "bachelor", "graduate degree"
  )
)
## put the columns on the search path so they can be used by bare name
attach(elections)

### absolute frequency of every candidate
cand_counts <- table(candidate)
cand_counts

### measures of central tendency
## mode: the highest count, then the category (or categories) that reach it
max(cand_counts)
Modal <- which(cand_counts == max(cand_counts))
Modal

## median (needs at least ordinal data)
# candidate is an unordered factor, so median() stops with "need numeric data";
# try() still shows that message but lets a sourced script carry on
try(median(candidate))
# educcat is an ordered factor: the type-1 quantile (inverse of the empirical
# distribution function) at 0.5 is a median that is one of its categories
quantile(educcat, probs = 0.5, type = 1, na.rm = TRUE)
table(age)
median(age)
# NOTE(review): the type of agecat is not visible in this script; median() is
# only meaningful here if agecat is numeric -- confirm
table(agecat)
median(agecat)

### measures of dispersion
## proportion table: first by hand, then with the built-in helper
cand_n <- length(candidate)
table(candidate) / cand_n
prop.table(table(candidate))

## relative frequency of the modal category
table(candidate)[Modal] / cand_n

## squared sum of relative frequencies
cand_share <- prop.table(table(candidate))
sum(cand_share^2)

## variation ratio: share of observations that are NOT in the modal category,
## i.e. one minus the relative frequency of the modal category
1 - prop.table(table(candidate))[Modal]

## nominal variance (Gini impurity): one minus the sum of squared proportions
cand_gini <- 1 - sum(prop.table(table(candidate))^2)
cand_gini
# rescaled by K / (K - 1), with K the number of categories, so the maximum is 1
nlevels(candidate) / (nlevels(candidate) - 1) * cand_gini

## entropy (natural logarithm)
cand_p <- prop.table(table(candidate))
- sum(cand_p * log(cand_p))
# divided by log(K), the entropy of K equally frequent categories
- sum(cand_p * log(cand_p)) / log(nlevels(candidate))



### frequency tables and frequency distributions
table(age)
## grouped with hand-picked class limits: right-closed intervals first
## (the default of cut()), then left-closed ones
age_breaks <- c(0, 12, 18, 25, 35, 45, 100)
table(cut(age, breaks = age_breaks))
table(cut(age, breaks = age_breaks, right = FALSE))
## Sturges' rule: number of classes = 1 + log2(n), rounded up
n_classes <- ceiling(1 + log(length(age), base = 2))
table(cut(age, breaks = n_classes))
## relative, cumulative, and relative cumulative frequencies
age.FreqTable <- table(cut(age, breaks = age_breaks))
prop.table(age.FreqTable)
cumsum(age.FreqTable)
cumsum(prop.table(age.FreqTable))

### frequency charts
## bar chart and pie chart of the grouped age frequencies, six colours
class_colours <- rainbow(6)
barplot(age.FreqTable, col = class_colours)
pie(age.FreqTable, col = class_colours)
# NOTE(review): this draws a histogram of the class counts themselves, not of
# age -- presumably kept as a contrast to the histograms below; confirm
hist(age.FreqTable)
## histogram
hist(age, breaks = c(0, 12, 18, 25, 35, 45, 100))  # hand-picked class limits
hist(age, breaks = "Sturges")  # default
hist(age, breaks = "Freedman-Diaconis")

### boxplot
boxplot(age)
boxplot(educ)
## positions of the values above 20
which(educ > 20)
## drop those values with a logical mask; unlike educ[-which(educ > 20)] this
## does not return an empty vector when no value exceeds 20, and missing values
## are still kept
educ.corrected <- educ[is.na(educ) | educ <= 20]
boxplot(educ.corrected)

### empirical distribution function
## simulated ecdf (20 uniform draws on [0, 10], rounded to the integers 0:10)
set.seed(1234)  # fix the RNG state so the simulation gives the same result
xx <- round(runif(20, min = 0, max = 10))
xx
# compare with the cumulative relative frequency table
sim_share <- prop.table(table(xx))
cumsum(sim_share)
sim_ecdf <- ecdf(xx)
sim_ecdf
plot(sim_ecdf)

## ecdf on the real data: cumulative relative frequencies, then the step function
age_share <- prop.table(table(age))
cumsum(age_share)
age_ecdf <- ecdf(age)
age_ecdf
plot(age_ecdf)