Python – NLTK – POS Tagging - WhatsTheBusiness.org

#part of speech tagging

import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

# PunktSentenceTokenizer is an unsupervised, trainable sentence tokenizer,
# so it is given a body of text to learn from before it is used.
# Load the 2005 address from the state_union corpus as the training text.
train_text = state_union.raw("2005-GWBush.txt")
# Load the 2006 address as the sample text that will be split and tagged.
sample_text = state_union.raw("2006-GWBush.txt")

# Create the PunktSentenceTokenizer; passing train_text to the constructor
# trains it on that text.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

# Split the sample text into sentences; process_content() iterates over these.
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    """Print the part-of-speech tagged words of each sentence in ``tokenized``.

    Every sentence is split into words, tagged, and printed in turn.
    If anything raises along the way, the error message is printed
    and processing stops; nothing is re-raised to the caller.
    """
    try:
        for sentence in tokenized:
            # Tokenize into words, tag them, and show the result right away.
            print(nltk.pos_tag(nltk.word_tokenize(sentence)))
    except Exception as err:
        print(str(err))

# Run the tagging demo over the sample text as soon as the module is loaded.
process_content()

#part of speech tagging creates (word, tag) tuples, one per token,
#e.g. ('desk', 'NN')
# POS tag list:

# CC	coordinating conjunction
# CD	cardinal number
# DT	determiner
# EX	existential there (like: "there is" ... think of it like "there exists")
# FW	foreign word
# IN	preposition/subordinating conjunction
# JJ	adjective	'big'
# JJR	adjective, comparative	'bigger'
# JJS	adjective, superlative	'biggest'
# LS	list marker	1)
# MD	modal	could, will
# NN	noun, singular 'desk'
# NNS	noun plural	'desks'
# NNP	proper noun, singular	'Harrison'
# NNPS	proper noun, plural	'Americans'
# PDT	predeterminer	'all the kids'
# POS	possessive ending	parent's
# PRP	personal pronoun	I, he, she
# PRP$	possessive pronoun	my, his, hers
# RB	adverb	very, silently,
# RBR	adverb, comparative	better
# RBS	adverb, superlative	best
# RP	particle	give up
# TO	to	go 'to' the store.
# UH	interjection	 errrrrrrrm
# VB	verb, base form	take
# VBD	verb, past tense	took
# VBG	verb, gerund/present participle	taking
# VBN	verb, past participle	taken
# VBP	verb, non-3rd person sing. present	take
# VBZ	verb, 3rd person sing. present	takes
# WDT	wh-determiner	which
# WP	wh-pronoun	who, what
# WP$	possessive wh-pronoun	whose
# WRB	wh-adverb	where, when
Python – NLTK – POS Tagging

Share this:

Like this: