Parts of Speech Visualizer

So, for this week, I modified the POSVisualizer code to display a text and color-code each word by its part-of-speech tag. Here are the results for the first few paragraphs of Alice in Wonderland:

Since the Brown Corpus I am using has a total of 78 different part-of-speech tags, defining a unique color for each of these tags would have been overkill. The way I found around this is to examine just the first letter of the tag, to simplify things. So, for instance, I know that a tag beginning with the letter “v” is a verb of some kind, “n” a noun, and so on. Altogether, I color-coded four different parts of speech: nouns, verbs, pronouns, and adverbs.

And here is the code:

package class5.pos;
/*
 * Draw a text with every word color-coded by the first letter of its
 * part-of-speech tag (for example, nouns), using a trained POS tagger.
 * This code extends the tagging code in TagNewText.
 */

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Random;
import java.util.Set;

import processing.core.PApplet;
import processing.core.PFont;

import com.aliasi.chunk.Chunk;
import com.aliasi.chunk.Chunking;
import com.aliasi.hmm.HiddenMarkovModel;
import com.aliasi.hmm.HmmDecoder;
import com.aliasi.sentences.IndoEuropeanSentenceModel;
import com.aliasi.sentences.SentenceChunker;
import com.aliasi.sentences.SentenceModel;
import com.aliasi.tag.Tagging;
import com.aliasi.tokenizer.RegExTokenizerFactory;
import com.aliasi.tokenizer.Tokenizer;
import com.aliasi.tokenizer.TokenizerFactory;
import com.aliasi.util.Files;
import com.aliasi.util.Streams;

/**
 * Processing sketch that tags a text file with a trained HMM part-of-speech
 * model and draws every token on screen, colored by the first letter of its tag
 * (n = noun, v = verb, p = pronoun, r = adverb, anything else gray).
 */
public class VisualizePOS2 extends PApplet{

	private static final String MODEL_FILE = "/Users/kalicious/Documents/workspace/PartsOfSpeech/brownCKmodel.model";
	private static final String TEXT_FILE = "/texts/p.txt";

	// Left/top margin and the vertical distance between two lines of text, in pixels.
	private static final int MARGIN = 10;
	private static final int LINE_HEIGHT = 30;

	// Pauses (in milliseconds) after drawing a token.
	// NOTE(review): these delays run inside setup(); confirm that this pacing is intended.
	private static final int DELAY_BEFORE_PUNCTUATION_MS = 10;
	private static final int DELAY_AFTER_WORD_MS = 5000;

	// Tokens are runs of letters/apostrophes, runs of digits, or one of a few punctuation marks.
	static TokenizerFactory TOKENIZER_FACTORY = new RegExTokenizerFactory("[a-zA-Z']+|[0-9]+|[.!?,\"-]");
	static final SentenceModel SENTENCE_MODEL  = new IndoEuropeanSentenceModel();
	static final SentenceChunker SENTENCE_CHUNKER
	= new SentenceChunker(TOKENIZER_FACTORY,SENTENCE_MODEL);

	/**
	 * Configures the canvas, then reads the model and the text, tags the text,
	 * and draws the tagged tokens left to right, wrapping at the right edge and
	 * stopping once the bottom of the window has been passed.
	 */
	public void setup(){
		size(1280,650);
		background(50);
		smooth();
		textAlign(LEFT, TOP);
		PFont font = createFont("FFScala", 20, true);
		textFont(font);

		try {
			HiddenMarkovModel hmm = readModel(MODEL_FILE);
			HmmDecoder decoder = new HmmDecoder(hmm);

			//read in text file
			File textFile = new File(TEXT_FILE);
			String text = Files.readFromFile(textFile, "ISO-8859-1");

			//chunk text file into sentences
			Chunking chunking = getSentences(text);
			Set<Chunk> sentences = chunking.chunkSet();
			String slice = chunking.charSequence().toString();

			//flat list of alternating token, tag, token, tag, ...
			List<String> tokens = getTokens(sentences, slice, decoder);

			int x = MARGIN;
			int y = MARGIN;

			// i indexes the token, i+1 its tag; the bound guarantees both exist.
			for (int i = 0; i + 1 < tokens.size(); i += 2) {
				String token = tokens.get(i);
				String tag = tokens.get(i + 1);

				applyFillForTag(tag);

				//wrap to the next line when the token would cross the right edge
				if (x + textWidth(token + " ") > width) {
					x = MARGIN;
					y += LINE_HEIGHT;
				}

				System.out.println("tag: "+tag);
				text(token + " ", x, y);

				// Look ahead to the next token (two slots on, past this token's tag).
				// The bounds check matters for the last token, which has no successor.
				boolean punctuationFollows = i + 2 < tokens.size()
						&& tokens.get(i + 2).matches("[.,?!]");

				if (punctuationFollows) {
					//no space before punctuation, so advance by the bare token width
					delay(DELAY_BEFORE_PUNCTUATION_MS);
					x += textWidth(token);
				} else {
					delay(DELAY_AFTER_WORD_MS);
					x += textWidth(token + " ");
				}

				if (y > height) {
					System.out.println("end of screen reached");
					break; //end of screen reached
				}
			}

		}
		catch(Exception e){
			e.printStackTrace();
		}
	}

	/**
	 * Sets the current fill color from the first letter of a tag.
	 * A null or empty tag gets the same gray as any other uncategorized tag.
	 */
	private void applyFillForTag(String tag) {
		if (tag == null || tag.length() == 0) {
			fill(150);
			return;
		}

		String c = tag.substring(0, 1);
		System.out.println("c: "+c);

		switch (tag.charAt(0)) {
		case 'n':
			//nouns
			fill(0,150,230);
			break;
		case 'v':
			//verbs
			fill(255,176,82);
			break;
		case 'p':
			//pronouns
			fill(152,255,239);
			break;
		case 'r':
			//adverbs
			fill(236,187,255);
			break;
		default:
			fill(150);
			break;
		}
	}

	/**
	 * Deserializes a HiddenMarkovModel from the given file.
	 *
	 * @param modelFile path of the serialized model
	 * @return the model read from the file
	 * @throws IOException if the file cannot be opened or read
	 * @throws ClassNotFoundException if the serialized class cannot be resolved
	 */
	private static HiddenMarkovModel readModel(String modelFile) throws IOException, ClassNotFoundException {
		System.out.println("Reading model from file=" + modelFile);
		FileInputStream fileIn = new FileInputStream(modelFile);
		try {
			ObjectInputStream objIn = new ObjectInputStream(fileIn);
			return (HiddenMarkovModel) objIn.readObject();
		} finally {
			// Closing the underlying file stream releases the handle even when
			// constructing the object stream or reading the object fails.
			Streams.closeQuietly(fileIn);
		}
	}

	/**
	 * Splits the text into sentence chunks with SENTENCE_CHUNKER and prints the result.
	 *
	 * @param text the full text to split
	 * @return the chunking produced by the sentence chunker
	 */
	private static Chunking getSentences(String text) {
		//split text into sentences
		System.out.println("chunking text into sentences");
		Chunking chunking = SENTENCE_CHUNKER.chunk(text.toCharArray(),0,text.length());
		System.out.println(chunking);
		return chunking;
	}

	/**
	 * Tokenizes and tags every sentence and returns all tokens with their tags.
	 *
	 * @param sentences sentence chunks whose start/end offsets index into slice
	 * @param slice the text the sentence offsets refer to
	 * @param decoder the tagger applied to each sentence's tokens
	 * @return a flat list of alternating entries: token, tag, token, tag, ...
	 *         (every token is included, not only those of one part of speech)
	 */
	private static List<String> getTokens(Set<Chunk> sentences, String slice, HmmDecoder decoder) {
		System.out.println("tokenizing and tagging text");

		List<String> returnTokenList = new ArrayList<String>();

		//iterate over sentences
		for (Chunk sentence : sentences) {
			char[] cs = slice.substring(sentence.start(), sentence.end()).toCharArray();

			//tokenize sentence
			List<String> tokenList = tokenize(cs);

			//feed tokens to decoder
			Tagging<String> tagging = decoder.tag(tokenList);
			for (int j = 0; j < tagging.size(); j++) {
				String token = tagging.token(j);
				String tag = tagging.tag(j);
				returnTokenList.add(token);
				returnTokenList.add(tag);
				System.out.println("tagging.token(j): "+token);
				System.out.println("tagging.tag(j): "+tag);
			}
		}

		return returnTokenList;
	}

	/**
	 * Tokenizes a character array with TOKENIZER_FACTORY.
	 *
	 * @param cs the characters of one sentence
	 * @return the tokens as a fixed-size list
	 */
	private static List<String> tokenize(char[] cs) {
		Tokenizer tokenizer = TOKENIZER_FACTORY.tokenizer(cs ,0,cs.length);
		String[] tokens = tokenizer.tokenize();
		return Arrays.asList(tokens);
	}

	/**
	 * Launches this sketch. The class name is taken from the class itself so it
	 * cannot drift out of sync with the declaration.
	 */
	public static void main(String[] args) {

		PApplet.main(new String[] { VisualizePOS2.class.getName() });

	}

}
This entry was posted in Learning Bit by Bit. Bookmark the permalink.

Leave a Reply

Your email address will not be published. Required fields are marked *

*

You may use these HTML tags and attributes: <a href="" title=""> <abbr title=""> <acronym title=""> <b> <blockquote cite=""> <cite> <code> <del datetime=""> <em> <i> <q cite=""> <strike> <strong>