- Example task: Clean and preprocess a sample text.
- Step 1: Import necessary libraries.
- Step 2: Tokenize text.
- Step 3: Remove stop words.
- Step 4: Apply stemming and lemmatization (each is applied independently to the filtered tokens so the results can be compared).
- Python Code Example:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download the NLTK data this pipeline needs (each call is a no-op if the
# resource is already present). NOTE: NLTK >= 3.8.2 loads the sentence
# tokenizer from 'punkt_tab' instead of 'punkt' — download both so the
# example works on old and new NLTK releases alike.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# Sample text
text = "Natural Language Processing is an exciting field of Artificial Intelligence!"

# Tokenization: splits the sentence into word and punctuation tokens
# (punctuation such as "!" survives as its own token).
tokens = word_tokenize(text)

# Removing stop words: build the set once for O(1) membership tests, and
# lowercase each token so capitalized words ("Natural") still match the
# all-lowercase stop-word list.
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

# Stemming: rule-based suffix stripping; may yield non-dictionary forms
# (e.g. "Processing" -> "process", "Natural" -> "natur").
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(word) for word in filtered_tokens]

# Lemmatization: dictionary (WordNet) lookup. With no POS argument it
# treats every token as a noun, so verbs/adjectives may pass through
# unchanged unless a POS tag is supplied.
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_tokens]

print("Original Text:", text)
print("Tokenized Words:", tokens)
print("Filtered Tokens:", filtered_tokens)
print("Stemmed Words:", stemmed_words)
print("Lemmatized Words:", lemmatized_words)