### Descriptive statistics walkthrough on the ELECTIONS data set:
### frequency tables, measures of central tendency and dispersion for
### categorical data, binned frequency distributions, charts, boxplots and
### empirical distribution functions.

### import the dataset [ELECTIONS]
elections <- readxl::read_excel("state-data.xlsx", sheet = "elections")

### convert the variables into factors
elections$candidate <- factor(elections$candidate)
elections$gender <- factor(elections$gender)
elections$educcat <- factor(
  elections$educcat,
  ordered = TRUE,
  levels = c(
    "lt high school", "high school", "junior college",
    "bachelor", "graduate degree"
  )
)

## Bind the analysed columns to plain vectors rather than attach()-ing the
## data frame: the bare names used below keep working, but nothing is put on
## the search path and later edits to `elections` cannot silently diverge
## from a stale attached copy.
candidate <- elections$candidate
educcat <- elections$educcat
age <- elections$age
agecat <- elections$agecat
educ <- elections$educ

### frequency table
candidate.FreqTable <- table(candidate)
candidate.PropTable <- prop.table(candidate.FreqTable)
candidate.FreqTable

### measures of central tendency
## mode (every category tied for the highest count is returned)
max(candidate.FreqTable)
Modal <- which(candidate.FreqTable == max(candidate.FreqTable))
Modal

## median (for ordinal data)
# `candidate` is nominal, so it has no median: median() stops with
# "need numeric data" for any factor. try() keeps the demonstration without
# aborting the script when it is sourced.
try(median(candidate))
# For an ordered factor, a type-1 quantile returns an observed level.
quantile(educcat, probs = 0.5, type = 1, na.rm = TRUE)
table(age)
median(age)
# NOTE(review): the type of `agecat` is not visible here; if it holds text
# labels, median() orders them alphabetically -- confirm it is numeric/coded.
table(agecat)
median(agecat)

### measures of dispersion
## proportion table
candidate.FreqTable / length(candidate)
candidate.PropTable

## relative frequency of the modal category
candidate.FreqTable[Modal] / length(candidate)

## squared sum of relative frequencies
sum(candidate.PropTable^2)

## variation ratio (share of observations outside the modal category)
1 - candidate.PropTable[Modal]

## nominal variance (Gini impurity), raw and standardised by k / (k - 1)
n.categories <- nlevels(candidate)
1 - sum(candidate.PropTable^2)
n.categories / (n.categories - 1) * (1 - sum(candidate.PropTable^2))

## entropy, raw and normalised by its maximum log(k)
-sum(candidate.PropTable * log(candidate.PropTable))
-sum(candidate.PropTable * log(candidate.PropTable)) / log(n.categories)

### frequency tables and frequency distributions
age.breaks <- c(0, 12, 18, 25, 35, 45, 100)
table(age)
table(cut(age, breaks = age.breaks)) # intervals closed on the right
table(cut(age, breaks = age.breaks, right = FALSE)) # closed on the left

## Sturges' rule: number of classes = ceiling(1 + log2(n))
table(cut(age, breaks = ceiling(1 + log2(length(age)))))

## relative, cumulative, and relative cumulative frequencies
age.FreqTable <- table(cut(age, breaks = age.breaks))
prop.table(age.FreqTable)
cumsum(age.FreqTable)
cumsum(prop.table(age.FreqTable))

### frequency charts
# one colour per interval, whatever the number of breaks
barplot(age.FreqTable, col = rainbow(length(age.FreqTable)))
pie(age.FreqTable, col = rainbow(length(age.FreqTable)))
# NOTE(review): this bins the interval *counts*, not the ages; kept from the
# original for comparison with the histograms of `age` below.
hist(age.FreqTable)

## histogram
hist(age, breaks = age.breaks)
hist(age, breaks = "Sturges") # default
hist(age, breaks = "Freedman-Diaconis")

### boxplot
boxplot(age)
boxplot(educ)
which(educ > 20)
# Drop implausible values (> 20) with a logical mask. Negative indexing via
# -which(...) would return an empty vector when no value exceeds 20.
# Missing values are retained.
educ.corrected <- educ[is.na(educ) | educ <= 20]
boxplot(educ.corrected)

### empirical distribution function
## simulated ecdf (20 observations rounded to the integers 0:10)
set.seed(1234) # fix the RNG state so the simulation is reproducible
xx <- round(runif(n = 20, min = 0, max = 10), digits = 0)
xx
# compare with the cumulative relative frequency table
cumsum(prop.table(table(xx)))
ecdf(xx)
plot(ecdf(xx))

## ecdf on the real data
cumsum(prop.table(table(age)))
ecdf(age)
plot(ecdf(age))