# Reading and text mining a PDF file in R.
#
# Downloads a PDF, converts it to plain text with xpdf's pdftotext.exe,
# builds a term-frequency table, merges words that share a stem and draws
# a word cloud. Windows only: it calls pdftotext.exe and shell.exec().
#
# Prerequisite: download pdftotext from
# ftp://ftp.foolabs.com/pub/xpdf/xpdfbin-win-3.03.zip
# and extract it to your program files folder.
library(tm)
library(wordcloud)
library(Rstem)

# here is a pdf for mining
url <- "http://www.noisyroom.net/blog/RomneySpeech072912.pdf"
dest <- tempfile(fileext = ".pdf")
download.file(url, dest, mode = "wb")

# set path to pdftotext.exe and convert pdf to text
exe <- "C:\\Program Files\\xpdfbin-win-3.03\\bin32\\pdftotext.exe"
# Wait for the conversion to finish: with wait = FALSE the text file may not
# exist yet when it is opened below, which made the first open attempt fail.
status <- system(
  paste(shQuote(exe, type = "cmd"), shQuote(dest, type = "cmd")),
  wait = TRUE
)
if (status != 0) {
  stop("pdftotext failed with exit status ", status, call. = FALSE)
}

# get txt-file name and open it
# (anchor the pattern so only the trailing ".pdf" extension is replaced)
filetxt <- sub("\\.pdf$", ".txt", dest)
if (!file.exists(filetxt)) {
  stop("pdftotext did not produce ", filetxt, call. = FALSE)
}
shell.exec(filetxt)

# do something with it, i.e. a simple word cloud
# warn = FALSE silences the "incomplete final line" warning from readLines()
txt <- readLines(filetxt, warn = FALSE)
txt <- tolower(txt)
# drop form feeds (page breaks written by pdftotext) and English stop words
txt <- removeWords(txt, c("\\f", stopwords()))
corpus <- Corpus(VectorSource(txt))
corpus <- tm_map(corpus, removePunctuation)
tdm <- TermDocumentMatrix(corpus)
m <- as.matrix(tdm)
# one row per term, with its total count over all lines of the text
d <- data.frame(freq = sort(rowSums(m), decreasing = TRUE))

# Stem words
d$stem <- wordStem(row.names(d), language = "english")
# and put words to column, otherwise they would be lost when aggregating
d$word <- row.names(d)
# remove web address (very long string):
d <- d[nchar(row.names(d)) < 20, ]

# aggregate frequency by word stem and keep the first word of each stem;
# both aggregates group by the same `stem` column of `d`, so their rows line up
agg_freq <- aggregate(freq ~ stem, data = d, sum)
agg_word <- aggregate(word ~ stem, data = d, function(x) x[1])
d <- cbind(freq = agg_freq[, 2], agg_word)

# sort by frequency
d <- d[order(d$freq, decreasing = TRUE), ]

# print wordcloud:
wordcloud(d$word, d$freq)

# remove only the files this script created, not everything in tempdir()
file.remove(c(dest, filetxt))
I am born with potential, I am born with goodness, I am born with ideas and dreams, I am born with greatness, I have wings, I have two wings, I am meant for creativity because I have wings, I will fly, I will fly, I will fly !!! -- DR. A. P. J. Abdul Kalam
Search This Blog
Saturday, August 24, 2013
Reading and Text Mining a PDF File in R
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment
Thank you