Key words : TextMining, Elections, France, Debate, 2nd Round
We use the packages qdap (by Tyler Rinker) and tm to perform the text-mining analysis, and classic packages such as ggplot2 and RColorBrewer to make our graphics look pretty.
For Hollande
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 18/05/2013
# Key words: TextMining, Elections, France, Debate, 2nd Round
# We use the packages qdap (by Tyler Rinker) and
# tm to perform the text-mining analysis, and classic
# packages such as ggplot2 and RColorBrewer to make the graphics pretty.
suppressPackageStartupMessages(require(twitteR)) | |
suppressPackageStartupMessages(require(XML)) | |
suppressPackageStartupMessages(require(tm)) | |
suppressPackageStartupMessages(require(rgdal)) | |
suppressPackageStartupMessages(require(ggplot2)) | |
suppressPackageStartupMessages(require(qdap)) | |
suppressPackageStartupMessages(require(rJava)) | |
suppressPackageStartupMessages(library(wordcloud)) | |
library(Rstem) | |
setwd("D:/PERSO/R_Working/Tutoriels/TextMining")

# Hollande
# Read the debate transcript into a two-column data frame (speaker, text)
debate <- read.transcript(
  "./Data/debat2tours.docx",
  col.names = c("person", "dialogue")
)
htruncdf(debate, 5, 50)

# Keep only the turns spoken by Hollande
Hollande <- subset(debate, person == "HOLLANDE")

# Stop words: tm's French list plus contractions, single letters and fillers
sw <- c(
  "a", "ou", tm::stopwords("fr"),
  "c'est", "n'est", "s'y", "qu'on", "s'il", "ah",
  letters,
  "ca", "n'y", "d'un", "monsieur"
)
# Build a cleaned tm corpus from French text.
#
# Args:
#   df: text handed to VectorSource(); the script passes the data frame
#     holding one speaker's turns.
#   my.stopwords: character vector of extra words to drop in addition to
#     tm's French stop-word list.
#
# Returns:
#   The corpus after lower-casing and removal of stop words, punctuation
#   and numbers.
generateCorpus <- function(df, my.stopwords = c()) {
  # NOTE(review): df goes to VectorSource() unchanged; for a data frame that
  # presumably yields one document per column -- confirm this is intended.
  text2.corpus <- Corpus(VectorSource(df), readerControl = list(language = "fr"))

  # content_transformer() keeps the elements as tm text documents; a bare
  # tolower() hands back plain character vectors on recent tm versions.
  text2.corpus <- tm_map(text2.corpus, content_transformer(tolower))

  # Stop words are removed while apostrophes are still present, otherwise
  # entries such as "c'est" or "qu'on" can never match.
  text2.corpus <- tm_map(text2.corpus, removeWords, stopwords("fr"))
  if (length(my.stopwords) > 0) {
    text2.corpus <- tm_map(text2.corpus, removeWords, my.stopwords)
  }

  text2.corpus <- tm_map(text2.corpus, removePunctuation)
  text2.corpus <- tm_map(text2.corpus, removeNumbers)
  # text2.corpus <- tm_map(text2.corpus, stemDocument, language = "french")

  # Explicit result instead of relying on the value of the last assignment
  text2.corpus
}
HollandeCorpus <- generateCorpus(Hollande, sw)

# Term-document matrix, then the total count of every term over all documents
H.tdm <- TermDocumentMatrix(HollandeCorpus)
H.m <- as.matrix(H.tdm)
H.v <- sort(rowSums(H.m), decreasing = TRUE)

# Words are stored as character data from the start, so they survive the
# aggregation below without being re-read from the row names
H.d <- data.frame(word = names(H.v), freq = H.v, stringsAsFactors = FALSE)

# Keep the terms counted between 3 and 90 times
H.d <- subset(H.d, freq >= 3 & freq <= 90)

# Group inflected forms under their French stem
H.d$stem <- wordStem(H.d$word, language = "french")
agg_freq <- stats::aggregate(freq ~ stem, data = H.d, sum)
# One display word per stem: the first row of each group
agg_word <- stats::aggregate(word ~ stem, data = H.d, function(x) x[1])
forW <- cbind(freq = agg_freq[, 2], agg_word)

# Sort by frequency
forW <- forW[order(forW$freq, decreasing = TRUE), ]

# Wordcloud
col <- brewer.pal(8, "Dark2")
png("wordcloud_Hollande.png", width = 1280, height = 800)
wordcloud(forW$word, forW$freq, scale = c(8, .2), min.freq = 5,
          max.words = Inf, random.order = FALSE, rot.per = .20, colors = col)
dev.off()
![]() |
Top words from Hollande
![]() |
Top words from Sarkozy |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Keep the two candidates only and fix the order of the speaker levels
debate2 <- subset(debate, person %in% c("SARKOZY", "HOLLANDE"))
speaker_levels <- qcv(terms = "SARKOZY HOLLANDE")
debate2$person <- factor(debate2$person, levels = speaker_levels)

# Gantt chart of the speaking turns, written to a PNG file
png("Gant.png", width = 700, height = 500)
with(
  debate2,
  gantt_plot(dialogue, person, xlab = "duration(words)", scale = "free")
)
dev.off()

No comments:
Post a Comment