So, for this week, I modified the POSVisualizer code to export a text in which each word is color-coded by its part-of-speech tag. Here are the results for the first few paragraphs of Alice in Wonderland:
Since the Brown Corpus I am using has a total of 78 different part-of-speech tags, defining a unique color for each of these tags would have been overkill. The way I found around this is to simplify things by examining just the first letter of the tag. So, for instance, I know that a tag beginning with the letter “v” is a verb of some kind, “n” a noun, and so on. Altogether, I color-coded four different parts of speech: nouns, verbs, pronouns, and adverbs; every other word is drawn in gray.
And here is the code:
package class5.pos;
/*
* Visualize a text by color-coding each word according to its part-of-speech tag,
* as assigned by a trained POS tagger.
* This code extends the tagging code in TagNewText.
*/
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Random;
import java.util.Set;
import processing.core.PApplet;
import processing.core.PFont;
import com.aliasi.chunk.Chunk;
import com.aliasi.chunk.Chunking;
import com.aliasi.hmm.HiddenMarkovModel;
import com.aliasi.hmm.HmmDecoder;
import com.aliasi.sentences.IndoEuropeanSentenceModel;
import com.aliasi.sentences.SentenceChunker;
import com.aliasi.sentences.SentenceModel;
import com.aliasi.tag.Tagging;
import com.aliasi.tokenizer.RegExTokenizerFactory;
import com.aliasi.tokenizer.Tokenizer;
import com.aliasi.tokenizer.TokenizerFactory;
import com.aliasi.util.Files;
import com.aliasi.util.Streams;
/**
 * Processing sketch that draws a text file word by word, coloring each word
 * by the first letter of the part-of-speech tag assigned to it by a trained
 * LingPipe hidden Markov model tagger.
 *
 * <p>Tags starting with {@code n}, {@code v}, {@code p} and {@code r} get their
 * own colors (labelled nouns, verbs, pronouns and adverbs respectively); every
 * other tag is drawn in gray.
 */
public class VisualizePOS2 extends PApplet {

    /** Serialized {@link HiddenMarkovModel} used for tagging. */
    private static final String MODEL_FILE = "/Users/kalicious/Documents/workspace/PartsOfSpeech/brownCKmodel.model";
    /** Text to tag and draw. */
    private static final String TEXT_FILE = "/texts/p.txt";
    /** Character encoding used to read {@link #TEXT_FILE}. */
    private static final String TEXT_ENCODING = "ISO-8859-1";

    /** Left/top margin in pixels, also the x position a wrapped line restarts at. */
    private static final int MARGIN = 10;
    /** Vertical distance in pixels between successive lines of text. */
    private static final int LINE_HEIGHT = 30;
    /** Pause after a token that is directly followed by punctuation. */
    private static final int PUNCTUATION_DELAY_MS = 10;
    /**
     * Pause after every other token.
     * NOTE(review): 5 seconds per word makes setup() very slow for anything but a
     * short text; value kept from the original, confirm it is intended.
     */
    private static final int WORD_DELAY_MS = 5000;
    /** Punctuation marks that are drawn flush against the preceding word. */
    private static final String TIGHT_PUNCTUATION = ".,?!";

    // Final because SENTENCE_CHUNKER below captures this instance once at class-init time.
    static final TokenizerFactory TOKENIZER_FACTORY = new RegExTokenizerFactory("[a-zA-Z']+|[0-9]+|[.!?,\"-]");
    static final SentenceModel SENTENCE_MODEL = new IndoEuropeanSentenceModel();
    static final SentenceChunker SENTENCE_CHUNKER
        = new SentenceChunker(TOKENIZER_FACTORY, SENTENCE_MODEL);

    /**
     * Configures the canvas and font, then loads the model and text, tags the
     * text sentence by sentence and draws the colored tokens.
     *
     * <p>Any failure while loading or tagging is reported on stderr; the sketch
     * then stays on the blank background.
     */
    public void setup() {
        size(1280, 650);
        background(50);
        smooth();
        textAlign(LEFT, TOP);
        PFont font = createFont("FFScala", 20, true);
        textFont(font);
        try {
            HiddenMarkovModel hmm = readModel(MODEL_FILE);
            HmmDecoder decoder = new HmmDecoder(hmm);

            String text = Files.readFromFile(new File(TEXT_FILE), TEXT_ENCODING);

            // Chunk the text into sentences; chunk offsets index into 'slice'.
            Chunking chunking = getSentences(text);
            Set<Chunk> sentences = chunking.chunkSet();
            String slice = chunking.charSequence().toString();

            List<String> tokens = getTokens(sentences, slice, decoder);
            drawTokens(tokens);
        } catch (Exception e) {
            // Top-level boundary of the sketch: report and leave the canvas as is.
            e.printStackTrace();
        }
    }

    /**
     * Draws the tokens left to right, wrapping at the right edge of the canvas
     * and stopping once the next line would start below the bottom edge.
     *
     * @param tokens flat list of alternating token and tag entries, as produced
     *               by {@link #getTokens}
     */
    private void drawTokens(List<String> tokens) {
        // float, not int: textWidth() returns fractional widths and truncating
        // each one makes the words drift together over a long line.
        float x = MARGIN;
        int y = MARGIN;

        // "i + 1 <" guards against a dangling token without a tag.
        for (int i = 0; i + 1 < tokens.size(); i += 2) {
            String token = tokens.get(i);
            String tag = tokens.get(i + 1);

            applyTagColor(tag);

            if (x + textWidth(token + " ") > width) {
                x = MARGIN;
                y += LINE_HEIGHT;
            }
            // Check before drawing so nothing is rendered off-screen.
            if (y > height) {
                System.out.println("end of screen reached");
                break;
            }

            System.out.println("tag: " + tag);
            text(token + " ", x, y);

            // Look ahead to the next token (two slots on, past this token's tag).
            // The bounds check matters for the last token, which has no successor.
            boolean punctuationFollows =
                i + 2 < tokens.size() && isTightPunctuation(tokens.get(i + 2));
            if (punctuationFollows) {
                delay(PUNCTUATION_DELAY_MS);
                // No trailing space, so the punctuation mark hugs the word.
                x += textWidth(token);
            } else {
                delay(WORD_DELAY_MS);
                x += textWidth(token + " ");
            }
        }
    }

    /**
     * Sets the fill color according to the first letter of the given tag.
     * Empty or missing tags, and tags with any other first letter, get gray.
     */
    private void applyTagColor(String tag) {
        if (tag == null || tag.length() == 0) {
            fill(150);
            return;
        }
        char initial = tag.charAt(0);
        System.out.println("c: " + initial);
        switch (initial) {
            case 'n': // nouns
                fill(0, 150, 230);
                break;
            case 'v': // verbs
                fill(255, 176, 82);
                break;
            case 'p': // pronouns
                fill(152, 255, 239);
                break;
            case 'r': // adverbs
                fill(236, 187, 255);
                break;
            default:
                fill(150);
                break;
        }
    }

    /** Returns true if the token is exactly one of the characters {@code . , ? !}. */
    private static boolean isTightPunctuation(String token) {
        return token != null
            && token.length() == 1
            && TIGHT_PUNCTUATION.indexOf(token.charAt(0)) >= 0;
    }

    /**
     * Deserializes a hidden Markov model from the given file.
     *
     * <p>Uses Java native deserialization, so the file must come from a trusted source.
     *
     * @param modelFile path of the serialized model
     * @return the deserialized model
     * @throws IOException if the file cannot be opened or read
     * @throws ClassNotFoundException if a class of the serialized object graph is missing
     */
    private static HiddenMarkovModel readModel(String modelFile) throws IOException, ClassNotFoundException {
        System.out.println("Reading model from file=" + modelFile);
        FileInputStream fileIn = null;
        ObjectInputStream objIn = null;
        try {
            fileIn = new FileInputStream(modelFile);
            objIn = new ObjectInputStream(fileIn);
            return (HiddenMarkovModel) objIn.readObject();
        } finally {
            // Close on every path, including failures while reading.
            // Closing the object stream also closes the underlying file stream.
            if (objIn != null) {
                Streams.closeQuietly(objIn);
            } else if (fileIn != null) {
                Streams.closeQuietly(fileIn);
            }
        }
    }

    /**
     * Splits the text into sentence chunks using {@link #SENTENCE_CHUNKER}.
     *
     * @param text the full text
     * @return the chunking whose chunk set holds one chunk per detected sentence
     */
    private static Chunking getSentences(String text) {
        System.out.println("chunking text into sentences");
        Chunking chunking = SENTENCE_CHUNKER.chunk(text.toCharArray(), 0, text.length());
        System.out.println(chunking);
        return chunking;
    }

    /**
     * Tokenizes and tags every sentence and returns all tokens with their tags.
     *
     * @param sentences sentence chunks whose start/end offsets index into {@code slice}
     * @param slice     the text the chunks refer to
     * @param decoder   tagger applied to each sentence's token list
     * @return a flat list of alternating entries: token at even indices, its tag
     *         at the following odd index, in the iteration order of {@code sentences}
     */
    private static List<String> getTokens(Set<Chunk> sentences, String slice, HmmDecoder decoder) {
        System.out.println("tokenizing and tagging text");
        List<String> tokensAndTags = new ArrayList<String>();
        for (Iterator<Chunk> it = sentences.iterator(); it.hasNext(); ) {
            Chunk sentence = it.next();
            char[] cs = slice.substring(sentence.start(), sentence.end()).toCharArray();

            List<String> tokenList = tokenize(cs);
            Tagging<String> tagging = decoder.tag(tokenList);
            for (int j = 0; j < tagging.size(); j++) {
                tokensAndTags.add(tagging.token(j));
                tokensAndTags.add(tagging.tag(j));
                System.out.println("tagging.token(j): " + tagging.token(j));
                System.out.println("tagging.tag(j): " + tagging.tag(j));
            }
        }
        return tokensAndTags;
    }

    /** Tokenizes the characters with {@link #TOKENIZER_FACTORY} and returns the tokens in order. */
    private static List<String> tokenize(char[] cs) {
        Tokenizer tokenizer = TOKENIZER_FACTORY.tokenizer(cs, 0, cs.length);
        String[] tokens = tokenizer.tokenize();
        return Arrays.asList(tokens);
    }

    /** Launches this sketch through Processing. */
    public static void main(String[] args) {
        // Derive the name from the class itself so it cannot fall out of sync
        // with the class or package name.
        PApplet.main(new String[] { VisualizePOS2.class.getName() });
    }
}
