Baseball Salary Analysis SQL/R Code

For this analysis, I take the data of baseball players and analyze the variation in salary level. This demonstrate my background in working in SQL with R.

For the result of this analysis, please go to Baseball Salary Analysis in the Projects Section.

ssh s133an@springer.berkeley.edu
mysql -u stat133 -p
T0pSecr3t
SHOW DATABASES;
USE baseball;
SHOW TABLES IN baseball;
DESCRIBE Teams;
SELECT teamID FROM Teams;
SELECT name FROM Teams;

1)
DESCRIBE Teams;

SELECT name  FROM Teams GROUP BY name;
SELECT COUNT(DISTINCT name) FROM Teams;

#Plus one year because team playing from 1884 to 1884 is consider one year
SELECT name, teamID, min(yearID), max(yearID), max(yearID)-min(yearID)+1 AS numyears FROM Teams GROUP BY name;

#Washington Nationals played 136 Years in row 132

SELECT name, attendance, max(yearID) FROM Teams GROUP by name ORDER BY (attendance) DESC;

2)
DESCRIBE Salaries;
SELECT yearID, max(salary), min(salary), avg(salary) from Salaries GROUP BY yearID;

SELECT * FROM Master LIMIT 5;

SELECT Master.nameLast, Master.nameFirst, Master.nameNick, Salaries.salary FROM Master, Salaries WHERE Master.playerID = Salaries.playerID AND Salaries.yearID = “2007”;

3)
library(RMySQL)
drv <- dbDriver(“MySQL”)
con <- dbConnect(drv, dbname = “baseball”, user = “stat133”, password = “T0pSecr3t”, host = “springer”)
salary.table <- dbGetQuery(con, statement = “SELECT yearID, max(salary), min(salary), avg(salary) from Salaries GROUP BY yearID”)
player <- dbGetQuery(con, statement = “SELECT Master.nameLast, Master.nameFirst, Master.nameNick, Salaries.salary FROM Master, Salaries WHERE Master.playerID = Salaries.playerID AND Salaries.yearID = 2007″)

#plotting max, min, and avg salary
plot(salary.table[,1], salary.table[,2], type=”l”, xlab = “Year”, ylab = “Salary (in Dollars)”, main = “Baseball Salary vs. Year”) # Max Salary
lines(salary.table[,1], salary.table[,3], lty=2)
lines(salary.table[,1], salary.table[,4], lty=3)
legend(“topleft”, lty = 1:3, col = “black”, bg = “white”, legend = c(“Max Salary”, “Min Salary”, “Avg Salary”))
dev.print(jpeg, file = “Salary.jpeg”, height = 700, width = 700)

#plot of two boxplots separately
par(mfrow = c(1, 2))
boxplot(player$salary[is.na(player$nameNick) == FALSE], ylab = “Salary (in Dollars)”) # No nicknames
title(main = “Player Salary with no Nickname”)
boxplot(player$salary[is.na(player$nameNick) == TRUE], ylab = “Salary (in Dollars)”) # With nicknames
title(main = “Player Salary with Nickname”)
dev.print(jpeg, file = “Player1.jpeg”, height = 700, width = 700)

#plot of two boxplots together in one frame for easier comparison; finalized version of graph of nickname comparison
boxplot(split(player$salary, is.na(player$nameNick)), ylab = “Salary (in Dollars)”) # No nicknames
title(main = “Baseball Player Salary”)
legend(“topright”, legend = c(“FALSE: Player with no nickname”, ”  TRUE: Player with nickname”))
dev.print(jpeg, file = “Player.jpeg”, height = 700, width = 700)

%d bloggers like this: