# Python – NLTK – Text Classification


# a text classifier can label text as spam or not spam, for instance,
# or as some more general category
# we will do positive vs. negative for sentiment analysis
# we can create our own tagged list, and as long as each item is labeled with one of two labels we can use it to train a classifier
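# for illustration, a tiny hand-labeled dataset (made-up data, not from any corpus)
# in the same (list_of_words, label) shape we build below would look like:
# my_data = [(["great", "fun", "film"], "pos"),
#            (["boring", "waste", "of", "time"], "neg")]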

import nltk
import random
from nltk.corpus import movie_reviews

#features are what characterize an element of something
#we will have a list of tuples
#each feature will record the presence or absence of a word

#list of movie reviews

documents = [(list(movie_reviews.words(fileid)), category)
			for category in movie_reviews.categories()
			for fileid in movie_reviews.fileids(category)]

#if you train and test on the same data, that would introduce extreme bias
#therefore the training data can't be the same as the testing data
random.shuffle(documents)

print(documents[1])

#documents will later be split into training and testing sets

#we will take every word in every review, find the most popular words, and determine which are positive and which are negative
#if a piece of text contains mostly positive or mostly negative words, then that piece of text is classified as positive or negative

#this is just the words; this is how we compile the word list
all_words = []
for w in movie_reviews.words(): 
	all_words.append(w.lower())

#later on we use these word features to compare documents
#all_words is a plain list; we need to convert it to a frequency distribution

all_words = nltk.FreqDist(all_words)
# print(all_words.most_common(15))


# the top 15 may be sort of useless

# we can also find the frequency of a specific word
# print(all_words["stupid"])

#we can use the Naive Bayes algorithm for classification
#this is a basic algorithm for classifying text as positive or negative

#a frequency distribution maps each word to how often it occurs
#it contains a huge number of words
#we could train against all of them, but most would be useless
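#quick sanity check of how FreqDist behaves, on toy data (not the review corpus):
#it works like a Counter, mapping each word to its count
toy_fd = nltk.FreqDist(["fun", "fun", "boring"])
print(toy_fd.most_common(2)) # [('fun', 2), ('boring', 1)]
print(toy_fd["fun"]) # 2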

#take the 3000 most common words as our feature vocabulary
#(most_common() keeps them sorted by count; plain keys() would not be)
word_features = [w for (w, count) in all_words.most_common(3000)]
#the top 15 didn't matter much because they were periods, commas, and dashes
#this will be good enough to find commonly used words and judge which words are positive and negative
#now we want some sort of quick function to find these features in the document we are using
#define find_features

def find_features(document):
	#convert the document to a set of words for fast membership checks
	features = {}
	words = set(document)
	for w in word_features:
		features[w] = (w in words) #this creates a boolean: True if the word appears, False otherwise

	return features

print(find_features(movie_reviews.words('neg/cv000_29416.txt')))
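#to see the shape of find_features output, imagine word_features were only
#four words (hypothetical list; the real one has 3000 entries):
#	word_features = ["great", "boring", "plot", "acting"]
#	find_features(["the", "plot", "was", "great"])
#	-> {"great": True, "boring": False, "plot": True, "acting": False}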

#we'll call these the feature sets
featuresets = [(find_features(rev), category) for (rev, category) in documents]
#each feature set pairs the find_features output with the review's category
#each document is reduced to just its words and its category
#the words are converted to True/False: a dictionary of the top words, plus the category
#using the top 3000 words we check which of them exist in each review and use that to assess positive vs. negative

#we'll continue to build on the algorithm 
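#a minimal sketch of where this is heading, commented out so it doesn't run yet;
#the 1900/100 split point is an assumption (movie_reviews has 2000 documents)
#training_set = featuresets[:1900]
#testing_set = featuresets[1900:]
#classifier = nltk.NaiveBayesClassifier.train(training_set)
#print("accuracy:", nltk.classify.accuracy(classifier, testing_set))
#classifier.show_most_informative_features(15)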


#my challenge is that with my CPU the processing takes a really long time. I need a GPU.