问题
I have been trying to follow this example by Norbert Ryciak, whom I havent been able to get in touch with.
Since this article was written in 2014, some things in R have changed so I have been able to update some of those things in the code, but I got stuck in the last part.
Here is my Working code so far:
library(tm)
library(stringi)
library(proxy)
wiki <- "https://en.wikipedia.org/wiki/"
titles <- c("Integral", "Riemann_integral", "Riemann-Stieltjes_integral", "Derivative",
"Limit_of_a_sequence", "Edvard_Munch", "Vincent_van_Gogh", "Jan_Matejko",
"Lev_Tolstoj", "Franz_Kafka", "J._R._R._Tolkien")
articles <- character(length(titles))
for (i in 1:length(titles)) {
articles[i] <- stri_flatten(readLines(stri_paste(wiki, titles[i])), col = " ")
}
docs <- Corpus(VectorSource(articles))
docs[[1]]
docs2 <- tm_map(docs, function(x) stri_replace_all_regex(x, "<.+?>", " "))
docs3 <- tm_map(docs2, function(x) stri_replace_all_fixed(x, "\t", " "))
docs4 <- tm_map(docs3, PlainTextDocument)
docs5 <- tm_map(docs4, stripWhitespace)
docs6 <- tm_map(docs5, removeWords, stopwords("english"))
docs7 <- tm_map(docs6, removePunctuation)
docs8 <- tm_map(docs7, content_transformer(tolower))
docs8[[1]]
docsTDM <- TermDocumentMatrix(docs8)
docsTDM2 <- as.matrix(docsTDM)
docsdissim <- dist(docsTDM2, method = "cosine")
But I havent been able to get pass this part:
docsdissim2 <- as.matrix(docsdissim)
rownames(docsdissim2) <- titles
colnames(docsdissim2) <- titles
docsdissim2
h <- hclust(docsdissim, method = "ward.D")
plot(h, labels = titles, sub = "")
I tried to run the "hclust" directly, and then I was able to Plot, but nothing readable came out of it.
This are the errors Im getting:
rownames(docsdissim2) <- titles
Error in `rownames<-`(`*tmp*`, value = c("Integral", "Riemann_integral", :
length of 'dimnames' [1] not equal to array extent
Another:
plot(h, labels = titles, sub = "")
Error in graphics:::plotHclust(n1, merge, height, order(x$order), hang, :
invalid dendrogram input
Is there anyone that could give me a hand to finish this example?
Best Regards,
回答1:
I was able to solve this problem thanks to Norbert Ryciak (the author of the tutorial).
Since he used an older version of "tm" (which was probably the latest at the time) it was not compatible with the one I used.
The solution was to replace "docsTDM <- TermDocumentMatrix(docs8)" with "docsTDM <- DocumentTermMatrix(docs8)".
So the final code:
library(tm)
library(stringi)
library(proxy)
wiki <- "https://en.wikipedia.org/wiki/"
titles <- c("Integral", "Riemann_integral", "Riemann-Stieltjes_integral", "Derivative",
"Limit_of_a_sequence", "Edvard_Munch", "Vincent_van_Gogh", "Jan_Matejko",
"Lev_Tolstoj", "Franz_Kafka", "J._R._R._Tolkien")
articles <- character(length(titles))
for (i in 1:length(titles)) {
articles[i] <- stri_flatten(readLines(stri_paste(wiki, titles[i])), col = " ")
}
docs <- Corpus(VectorSource(articles))
docs[[1]]
docs2 <- tm_map(docs, function(x) stri_replace_all_regex(x, "<.+?>", " "))
docs3 <- tm_map(docs2, function(x) stri_replace_all_fixed(x, "\t", " "))
docs4 <- tm_map(docs3, PlainTextDocument)
docs5 <- tm_map(docs4, stripWhitespace)
docs6 <- tm_map(docs5, removeWords, stopwords("english"))
docs7 <- tm_map(docs6, removePunctuation)
docs8 <- tm_map(docs7, content_transformer(tolower))
docs8[[1]]
docsTDM <- DocumentTermMatrix(docs8)
docsTDM2 <- as.matrix(docsTDM)
docsdissim <- dist(docsTDM2, method = "cosine")
docsdissim2 <- as.matrix(docsdissim)
rownames(docsdissim2) <- titles
colnames(docsdissim2) <- titles
docsdissim2
h <- hclust(docsdissim, method = "ward")
plot(h, labels = titles, sub = "")
来源:https://stackoverflow.com/questions/34423823/r-automatic-categorization-of-wikipedia-articles