# Python – NLTK – Tokenizing

from nltk.tokenize import sent_tokenize, word_tokenize

# Tokenizing: splitting text into sentences (sentence tokenizers) or
# words (word tokenizers).
# Terminology:
#   corpora - a body of text
#   lexicon - words and their meanings
example_text = "Hello there, how are you. The sky is pinkish blue yay!"

# sent_tokenize returns a list with one element per sentence.
# word_tokenize returns a list of words, with each punctuation mark
# kept as its own element.
for tokenizer in (sent_tokenize, word_tokenize):
    print(tokenizer(example_text))

# This is preprocessing rather than analysis; part-of-speech tagging
# is discussed later.

# (scraping artifact from the original blog page) %d bloggers like this: