Last active
August 29, 2015 14:04
-
-
Save shawngraham/e51e8bb7c6b322caf204 to your computer and use it in GitHub Desktop.
Mimno's Mallet Wrapper — example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This script is adapted from Ben Marwick's Day of Archaeology 2013 analysis.
# It demonstrates David Mimno's R wrapper for MALLET topic modeling, using
# the sample data that comes bundled when you download MALLET from
# http://mallet.cs.umass.edu/download.php
#
# We assume that, on Mac, you have unzipped MALLET into a folder under your
# home directory, e.g. "shawngraham/mallet-2.0.7".
# On Windows, use the full path, e.g. "C:\\mallet-2.0.7\\", and be sure to
# use \\ instead of a single \ everywhere a path appears in this script.
#
# If you have not already installed the mallet wrapper for R, uncomment and
# run this line:
# install.packages('mallet')
# (On Mavericks OS there can be a problem in installation - see chapter four
# for the solution.)

# library() stops with an error if the package is missing; require() only
# returns FALSE and would let the script fail later with a confusing message.
library(mallet)

# Import the documents from the folder; each document is its own text file.
documents <- mallet.read.dir("mallet-2.0.7/sample-data/web/en/")
## Windows users, remember: use the full path, e.g.
## "C:\\mallet-2.0.7\\sample-data\\web\\", and so on throughout this script.

# Tokenize the documents, remove stopwords, and build a MALLET instance list.
# The token regexp keeps words of three or more characters that begin and end
# with a letter and may contain punctuation (hyphens, apostrophes) inside.
mallet.instances <- mallet.import(documents$id, documents$text, "mallet-2.0.7/stoplists/en.txt", token.regexp = "\\p{L}[\\p{L}\\p{P}]+\\p{L}")

# Create the topic-model trainer object with the desired number of topics.
n.topics <- 30
topic.model <- MalletLDA(n.topics)

# Load the documents into the trainer.
topic.model$loadDocuments(mallet.instances)
## Get the vocabulary, and some statistics about word frequencies.
## These may be useful in further curating the stopword list.
vocabulary <- topic.model$getVocabulary()
word.freqs <- mallet.word.freqs(topic.model)

## Optimize hyperparameters every 20 iterations,
## after 50 burn-in iterations.
topic.model$setAlphaOptimization(20, 50)

## Now train a model. Note that hyperparameter optimization is on, by default.
## We can specify the number of iterations. Here we'll use a large-ish
## round number.
topic.model$train(200)

## Run through a few extra iterations in which we pick the single best topic
## for each token, rather than sampling from the posterior distribution.
topic.model$maximize(10)
## Get the probability of topics in documents and the probability of words
## in topics. By default, these functions return raw word counts; here we
## want probabilities, so we normalize, and add "smoothing" so that nothing
## has exactly 0 probability. (TRUE/FALSE spelled out: T and F are ordinary
## variables in R and can be reassigned.)
doc.topics <- mallet.doc.topics(topic.model, smoothed = TRUE, normalized = TRUE)
topic.words <- mallet.topic.words(topic.model, smoothed = TRUE, normalized = TRUE)

# From http://www.cs.princeton.edu/~mimno/R/clustertrees.R
## Transpose and normalize the doc-topics matrix so that each row is one
## topic's distribution over documents, summing to 1.
topic.docs <- t(doc.topics)
topic.docs <- topic.docs / rowSums(topic.docs)
write.csv(topic.docs, "topics-docs.csv")  ## Windows: "C:\\Mallet-2.0.7\\topic-docs.csv"

## Build a vector of short names for the topics: the top five words of each.
## seq_len() is safe even if n.topics were 0, unlike 1:n.topics.
topics.labels <- rep("", n.topics)
for (topic in seq_len(n.topics)) {
  topics.labels[topic] <- paste(mallet.top.words(topic.model, topic.words[topic, ], num.top.words = 5)$words, collapse = " ")
}

# Have a look at the keywords for each topic.
topics.labels
write.csv(topics.labels, "topics-labels.csv")  ## Windows: "C:\\Mallet-2.0.7\\topics-labels.csv"

### Now that you've got your topic model, you can do all sorts of other
### visualizations. See line 67 onwards at
# https://github.com/shawngraham/R/blob/master/topicmodel.R
# for some ideas;
# see the original worked-through piece by Ben Marwick,
# "Distant Reading a Day of Archaeology 2013", at
# https://github.com/benmarwick/dayofarchaeology
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment