Python – NLTK – Stop Words


#stop words

#nltk will not generate insights for you; it helps you analyze and pull apart text
#stop words are filler words that you filter out because they are not useful for data analysis

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

example_sentence = "This is an example showing off stop word filtration"

# NLTK ships a pre-compiled list of common English stop words. The list is
# not comprehensive, and you can add your own words to the set if needed.
# Converting it to a set keeps the membership checks below cheap.
stop_words = set(stopwords.words("english"))

# Split the example sentence into individual word tokens.
words = word_tokenize(example_sentence)

# Keep only the tokens that are not stop words. The NLTK stop word list is
# lowercase, so each token is lowercased for the comparison; otherwise a
# capitalized stop word such as "This" would slip through the filter.
# The original casing of the tokens that are kept is preserved.
filtered_sentence = [w for w in words if w.lower() not in stop_words]

print(filtered_sentence)

#result:
#['example', 'showing', 'stop', 'word', 'filtration']
# you will see that words in the stop words list have been removed from the list

%d bloggers like this: