State of the Union Addresses Analysis Code

This is code for analyzing the number of sentences per address, the average number of words per sentence, the total number of words per address, and the word stems that are used most frequently across all United States Presidents' State of the Union Addresses. The code demonstrates not only the R language but also frequent use of regular expressions.

For the results produced by this code, see State of the Union Addresses Analysis under the Projects tab.

#source("~/Desktop/SOU.R")             # path when using the lab computers
#SOU <- readLines("~/Desktop/SOU.txt") # SOU.txt is a compilation of the State of the Union Address texts

source("C:/Users/Kazuki/Desktop/SOU.R")
SOU <- readLines("SOU.txt")
pattern <- '\\*\\*\\*\\*\\*'             # each speech is preceded by a "*****" delimiter line
index <- grep(pattern, SOU)
index <- index[1:(length(index)-1)] + 2  # last "*****" marks the end of the txt file instead of a new speech; the name is 2 lines below the delimiter
Pres <- SOU[index]
pp <- '(.+)[[:blank:]]$'                 # strip a trailing blank from the presidents' names
Pres <- gsub(pp, '\\1', Pres)
index <- index + 2                       # the date appears 2 lines below the name
Date <- SOU[index]
Date[170] <- " December 1, 1843"         # manually fix the date for speech 170
Date <- strptime(Date, " %B %e, %Y")
speech.sentences <- speech.words <- speech.stems <- vector("list", length(Pres))
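
For reference, the offsets above (and the +6/-3 offsets used below) imply that SOU.txt is organized in blocks like the sketch below. The intervening header lines are an assumption; only the delimiter, name, date, and speech positions are actually used by the code.

# *****                      <- delimiter line found by grep()
# ...
# George Washington          <- president's name, 2 lines after the delimiter
# ...
# January 8, 1790            <- date, 4 lines after the delimiter
# ...
# <text of the speech>       <- runs from 6 lines after the delimiter
#                               to 3 lines before the next delimiter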

#Take the period out of "Mr.", "Mrs.", and "Ms." so it is not later mistaken for the end of a sentence
abbrev <- '(Mr|Mrs|Ms)\\.'
SOU <- gsub(abbrev, '\\1', SOU)
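
As a quick illustration of the substitution above (on a made-up string, not the actual text), the pattern keeps the abbreviation and drops only its period:

gsub('(Mr|Mrs|Ms)\\.', '\\1', "Mr. Smith greeted Mrs. Jones.")
# [1] "Mr Smith greeted Mrs Jones."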

#Speech always starts on Index+6 and ends at next Index-3
pattern <- '\\*\\*\\*\\*\\*'
index <- grep(pattern,SOU)
index.new1 <- index+6
index.new2 <- index-3

#Take out any unnecessary punctuation and spaces, and remove all blank word elements
clean <- '([[:alpha:]]+|[[:digit:]]+)[[:punct:]]'
word1 <- "([[:alpha:]])([[:punct:]]+)"   # punctuation attached after a letter
word2 <- "([[:punct:]]+)([[:alpha:]])"   # punctuation attached before a letter
word3 <- "([[:blank:]])[[:blank:]]+"     # runs of blanks
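
On a small made-up example, the three patterns behave like this: punctuation attached to words is dropped and repeated blanks are collapsed.

s <- "We, the people of the (United)  States."
s <- gsub(word1, '\\1', s)   # "We the people of the (United  States"
s <- gsub(word2, '\\2', s)   # "We the people of the United  States"
s <- gsub(word3, '\\1', s)   # "We the people of the United States"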

meanwords <- nsentences <- rep(NA, length(Pres))
for(i in 1:length(Pres)){
  speech <- SOU[index.new1[i]:index.new2[i+1]]
  speech <- paste(speech, collapse=" ")   # join the lines with a space so words at line breaks are not run together
  speech.sentences[[i]] <- unlist(strsplit(speech, "\\."))
  speech.sentences[[i]] <- gsub(clean, '\\1', speech.sentences[[i]])
  total.words <- gsub(word1, '\\1', speech)
  total.words <- gsub(word2, '\\2', total.words)
  total.words <- gsub(word3, '\\1', total.words)
  speech.words[[i]] <- unlist(strsplit(total.words, "[[:blank:]]+|-+"))
  speech.words[[i]] <- speech.words[[i]][nchar(speech.words[[i]])>0]
  nsentences[i] <- length(speech.sentences[[i]])
  meanwords[i] <- length(speech.words[[i]])/nsentences[i]
}

#Putting the Date, Pres, speech.sentences, and speech.words in chronological order
Pres <- Pres[order(Date)]
new.speech.words <- new.speech.sentences <- vector("list", length(Pres))
for(i in 1:length(Pres)){
  new.speech.sentences[[i]] <- speech.sentences[[(order(Date))[i]]]
  new.speech.words[[i]] <- speech.words[[(order(Date))[i]]]
}
speech.sentences <- new.speech.sentences
speech.words <- new.speech.words
Date <- Date[order(Date)]
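
An equivalent, more compact way to do this reordering (not what the script above uses) is to index the lists directly with the same permutation, computed once:

ord <- order(Date)   # compute the permutation once, before Date itself is reordered
Pres <- Pres[ord]
speech.sentences <- speech.sentences[ord]
speech.words <- speech.words[ord]
Date <- Date[ord]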

#Part 2
#Take the word stem of all lower case words in each speech and put them in speech.stems
# wordStem() is assumed to come from a stemming package (e.g. SnowballC or Rstem) loaded by the source()'d SOU.R
for(i in 1:length(Pres)){
  speech.stems[[i]] <- wordStem(tolower(speech.words[[i]]))
}
#Do the same process with lapply
speech.stems <- lapply(speech.words, function(w) wordStem(tolower(w)))
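
As a quick sanity check of the stemming step (assuming a Porter-style stemmer such as the one in the SnowballC package), inflected forms of the same word collapse to one stem:

wordStem(c("governed", "governing", "governs"))
# all three map to the stem "govern"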

#Collect all unique word stems from all speeches
all.stems <- unique(unlist(speech.stems))

#Convert to factors
factor.stems <- vector("list", length(Pres))
for(i in 1:length(Pres)){
  factor.stems[[i]] <- factor(speech.stems[[i]], levels = all.stems)
}

#Count the number of times each unique word stem appears in each speech
stem.count <- matrix(NA, nrow = length(Pres), ncol = length(all.stems))
for(i in 1:length(Pres)){
  tab <- tabulate(factor.stems[[i]], nbins = length(all.stems))
  stem.count[i,] <- tab
}
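
The factor/tabulate combination works because factor() recodes each stem as its position in all.stems, and tabulate() then counts how often each position occurs. A small illustration with a made-up vocabulary:

vocab <- c("govern", "nation", "state")
f <- factor(c("state", "govern", "state"), levels = vocab)
tabulate(f, nbins = length(vocab))
# [1] 1 0 2   (one "govern", no "nation", two "state")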

#Tally up total number of times each unique word has appeared throughout all speeches
total.counts <- rep(NA, ncol(stem.count))  # one total per stem
for(i in 1:ncol(stem.count)){
  total.counts[i] <- sum(stem.count[,i])
}
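
The same totals can be computed without a loop; colSums() sums each column of the matrix and gives an identical result:

total.counts <- colSums(stem.count)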

#Top 50 most frequent words in all speeches
all.stems[order(total.counts, decreasing = TRUE)[1:50]]

#Number of occurrences of the top 50 most frequent words
total.counts[order(total.counts, decreasing = TRUE)[1:50]]
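
To see the stems and their counts side by side, the two results above can be combined, for example:

top50 <- order(total.counts, decreasing = TRUE)[1:50]
data.frame(stem = all.stems[top50], count = total.counts[top50])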

#Graphs (From HW3.R)
## Use this after you compute nsentences and meanwords

pdf(file = "SoUplot.pdf", height = 8, width = 10)
par(mfrow = c(2, 1), mar = c(0, 4, 1, 1), oma = c(12, 0, 2, 0), xpd = NA)
plot(Date, nsentences, type = "l", col = "grey",
     xaxt = "n", ylab = "Number of Sentences")
title(main = "Summary Statistics for State of the Union Addresses", line = 1)
points(Date, nsentences, pch = 16, col = match(Pres, unique(Pres)))
plot(Date, meanwords, type = "l",
     col = "grey", ylab = "Avg Words per Sentence")
points(Date, meanwords, pch = 16, col = match(Pres, unique(Pres)))
text(Date[!duplicated(Pres)], 5, unique(Pres),
     srt = -90, col = 1:length(Pres), adj = 0)
dev.off()

## Use this after you've created stem.count

source("C:/Users/Kazuki/Desktop/SJD.R")

presfreq <- function(president, stem.count, Pres){
  # stem.count has one row per speech and one column per stem, so keep the rows
  # for this president and sum each column to get his total count for every stem
  submat <- stem.count[Pres == president, , drop = FALSE]
  wordcounts <- apply(submat, 2, sum)
  return(wordcounts/sum(wordcounts))
}
word.freq <- sapply(unique(Pres), presfreq, stem.count, Pres)
SJ.mat <- computeSJDistance(terms = all.stems,
                            df = apply(word.freq > 0, 1, sum),
                            tf = word.freq)
mds <- cmdscale(SJ.mat)   # classical multidimensional scaling down to two dimensions

library(fields)
## if you get an error, use this to install from CRAN first
## install.packages("fields")

ratio <- diff(range(mds[,2]))/diff(range(mds[,1]))
pdf(file = "mds.pdf", height = 10 * ratio, width = 10)
xlim <- range(mds[,1]) + c(-1, 1) * 0.1 * diff(range(mds[,1])) # make room for names
par(oma = c(3, 0, 0, 0))
plot(mds, type = "n", bty = "l", xlim = xlim, xlab = "", ylab = "")
text(mds, labels = unique(Pres), col = tim.colors(length(unique(Pres))))
title(main = "Distances Between Presidential Word Frequency Distributions")
image.plot(zlim = c(1790, 2008), col = tim.colors(40), legend.only = TRUE,
horizontal = TRUE, legend.mar = 0, legend.shrink = 0.7)
dev.off()
