In this code we perform sentiment analysis using a neural network¶
Import libraries¶
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from wordcloud import STOPWORDS
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn import model_selection
Read-in data¶
# Load the three labelled review files. Each file is tab-separated with no
# header row: the review text first, then its sentiment label.
# quoting=3 is csv.QUOTE_NONE, so quote characters inside a review stay literal.
column_names = ['Review', 'Sentiment']
yelp = pd.read_csv('yelp_labelled.txt', sep='\t', header=None, names=column_names, quoting=3)
amazon = pd.read_csv('amazon_cells_labelled.txt', sep='\t', header=None, names=column_names, quoting=3)
imdb = pd.read_csv('imdb_labelled.txt', sep='\t', header=None, names=column_names, quoting=3)

# Stack the three sources into a single frame with a fresh 0..n-1 index
reviews = pd.concat([yelp, amazon, imdb], ignore_index=True)
Perform Exploratory Data Analysis¶
The data consist of 3,000 reviews of products (Amazon), movies (IMDb) and services (Yelp). Each review is labeled as positive (1) or negative (0), and there are 1,500 reviews for each of the two sentiment labels.
#Display the frame's (rows, columns) shape; expecting 3000 reviews (rows)
#and 2 columns (Review, Sentiment)
reviews.shape
(3000, 2)
# Count how many reviews carry each label; expecting 1500 positive (1)
# and 1500 negative (0)
reviews['Sentiment'].value_counts()
Sentiment 1 1500 0 1500 Name: count, dtype: int64
# Count missing values per column (isnull is pandas' alias for isna)
reviews.isnull().sum()
Review 0 Sentiment 0 dtype: int64
# Fit a tokenizer on the raw reviews purely to count the distinct words
# present before any cleaning is applied
tokenizer = Tokenizer()
tokenizer.fit_on_texts(reviews['Review'])
raw_word_index = tokenizer.word_index
print("Vocab size before text cleaning: ", len(raw_word_index))
Vocab size before text cleaning: 5271
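Here we list every distinct character that appears in the reviews, so we can see which non-letter characters (punctuation, digits, symbols) are present before cleaning.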
# Collect every distinct character in the reviews, in order of first appearance
text = reviews['Review']
# dict.fromkeys keeps only the first occurrence of each character and
# preserves insertion order, so the result matches a manual "seen" list
char_list = list(dict.fromkeys(char for rev in text for char in rev))
print(char_list)
Perform text normalization¶
Because training neural networks is computationally expensive, it is advantageous to limit the input size while retaining the meaningful information in the text.
# Make all reviews lower case
reviews.Review = reviews.Review.str.lower()

# Replace everything that is not a lowercase letter (punctuation, digits,
# symbols) with a space (Sadli, 2020)
reviews.Review = reviews.Review.apply(lambda x: re.sub(r'[^a-z]', ' ', x))

# Remove stray single letters (e.g. the "s" left behind by "it's").
# \b[a-z]\b matches a lone letter anywhere in the text. A pattern that
# requires whitespace on both sides misses letters at the very start or end
# of a review, and skips every second one in a run of adjacent single
# letters, because each match consumes the space the next match would need.
reviews.Review = reviews.Review.apply(lambda x: re.sub(r'\b[a-z]\b', ' ', x))

# Remove stopwords (detro, n.d.); split() followed by join() also collapses
# the runs of spaces produced by the substitutions above
reviews.Review = reviews.Review.apply(
    lambda x: ' '.join([word for word in x.split() if word not in STOPWORDS])
)
# Perform lemmatization (Prabhakaran, n.d.)
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant to use when lemmatizing *word*.

    The word is tagged on its own with ``nltk.pos_tag`` and the first letter
    of the resulting tag selects the WordNet category: J -> adjective,
    N -> noun, V -> verb, R -> adverb. Any other tag falls back to noun.
    """
    # pos_tag returns a list of (word, tag) pairs; only the tag's initial matters
    _, pos_tag = nltk.pos_tag([word])[0]
    initial = pos_tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognized tag are both treated as nouns
    return wordnet.NOUN
lemmatizer = WordNetLemmatizer()


def _lemmatize_review(review):
    """Lemmatize each token of one review and re-join the tokens with spaces."""
    return ' '.join(
        lemmatizer.lemmatize(token, get_wordnet_pos(token))
        for token in word_tokenize(review)
    )


# Replace every review with its lemmatized form
reviews.Review = reviews.Review.apply(_lemmatize_review)
# Record how many whitespace-separated words each cleaned review contains
reviews['Word_Count'] = reviews.Review.str.split().str.len()

# Remove entries with 3 words or fewer (keep reviews with more than 3 words)
has_enough_words = reviews['Word_Count'] > 3
reviews = reviews[has_enough_words]
# Refit a fresh tokenizer on the cleaned reviews to count the remaining
# distinct words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(reviews['Review'])
# One extra slot on top of the distinct words: the padded sequences use
# index 0 as filler, so it cannot belong to a word
vocab_size = 1 + len(tokenizer.word_index)
print("Vocab size after text cleaning: ", vocab_size)
Vocab size after text cleaning: 3998
Transform data for modeling¶
# The word count of the longest cleaned review sets the padded sequence length
# (cast to a plain Python int, which is what the built-in max() yields here)
max_length = int(reviews['Word_Count'].max())
print("Number of words in longest review: ", max_length)
Number of words in longest review: 44
# Build the tokenizer used for modeling and turn each cleaned review into a
# sequence of integer word ids
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(reviews['Review'])
encoded_revs = tokenizer.texts_to_sequences(reviews['Review'])

# Left-pad every sequence with zeros up to max_length
# ('pre' is pad_sequences' default, spelled out here for clarity)
padded_revs = pad_sequences(encoded_revs, maxlen=max_length, padding='pre')
#Look at a padded review: row 1582 (by position) of the padded array.
#Leading zeros are padding; the non-zero values are word ids.
padded_revs[1582]
array([ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 500, 32, 296, 355, 41, 3198, 3199, 190, 3200])
# Train/test split: hold out 10% of the reviews for testing.
# X holds the features (padded review sequences), y the labels (Sentiment);
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    padded_revs,
    reviews['Sentiment'],
    test_size=0.10,
    random_state=42,
)
# Needed for input size: the model consumes one padded review at a time, so
# its input size is the sequence length (axis 1 of X_train), not the number
# of training reviews (axis 0).
train_shape = X_train.shape[1]
print("Input shape will be: ", train_shape)
Input shape will be: 1854
##################
#Modeling
#Remainder of code adapted from (Sewell, n.d.)
Model specification. This defines the architecture.¶
model = Sequential()
# Each sample is one padded review, i.e. a vector of max_length word ids.
# The input shape is the per-sample shape, so it must be the sequence length
# rather than the number of training samples.
model.add(Input(shape=(max_length,)))
# Map each word id to a 100-dimensional embedding vector
model.add(Embedding(vocab_size, 100))
model.add(LSTM(128, dropout=0.5, recurrent_dropout=0.5, activation='tanh'))
# Single sigmoid unit: output is the probability of the positive class
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.0001), metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output
model.summary()
Model: "sequential"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓ ┃ Layer (type) ┃ Output Shape ┃ Param # ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩ │ embedding (Embedding) │ (None, 1854, 100) │ 399,800 │ ├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤ │ lstm (LSTM) │ (None, 128) │ 117,248 │ ├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤ │ dense (Dense) │ (None, 1) │ 129 │ └──────────────────────────────────────┴─────────────────────────────┴─────────────────┘
Total params: 517,177 (1.97 MB)
Trainable params: 517,177 (1.97 MB)
Non-trainable params: 0 (0.00 B)
None
Create early stopping monitor to use during model fit¶
# Stop training once the validation loss has failed to improve for 2
# consecutive epochs. restore_best_weights rolls the model back to the epoch
# with the lowest validation loss; without it the model would keep the
# weights from the final (already worse) epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
Fit the model on the training set using the test set for validation¶
#Train for up to 12 epochs; the early-stopping callback may end training sooner.
#history.history holds the per-epoch loss/accuracy used for the plots.
#NOTE(review): validation_data is the held-out test split, and the
#early-stopping callback reads the validation metrics (val_loss by Keras
#default), so the test set influences when training stops. The accuracy later
#reported on X_test is therefore likely optimistic; consider carving a separate
#validation set out of the training data (e.g. validation_split).
history= model.fit(X_train, y_train, epochs= 12, validation_data= (X_test, y_test), callbacks= [early_stopping])
Epoch 1/12 58/58 ━━━━━━━━━━━━━━━━━━━━ 4s 32ms/step - accuracy: 0.5035 - loss: 0.6932 - val_accuracy: 0.5728 - val_loss: 0.6921 Epoch 2/12 58/58 ━━━━━━━━━━━━━━━━━━━━ 1s 24ms/step - accuracy: 0.5795 - loss: 0.6911 - val_accuracy: 0.5777 - val_loss: 0.6909 Epoch 3/12 58/58 ━━━━━━━━━━━━━━━━━━━━ 1s 23ms/step - accuracy: 0.6182 - loss: 0.6894 - val_accuracy: 0.6505 - val_loss: 0.6892 Epoch 4/12 58/58 ━━━━━━━━━━━━━━━━━━━━ 1s 21ms/step - accuracy: 0.6591 - loss: 0.6864 - val_accuracy: 0.7087 - val_loss: 0.6864 Epoch 5/12 58/58 ━━━━━━━━━━━━━━━━━━━━ 1s 22ms/step - accuracy: 0.7058 - loss: 0.6807 - val_accuracy: 0.6748 - val_loss: 0.6807 Epoch 6/12 58/58 ━━━━━━━━━━━━━━━━━━━━ 1s 25ms/step - accuracy: 0.7274 - loss: 0.6690 - val_accuracy: 0.6845 - val_loss: 0.6702 Epoch 7/12 58/58 ━━━━━━━━━━━━━━━━━━━━ 2s 27ms/step - accuracy: 0.7548 - loss: 0.6479 - val_accuracy: 0.7573 - val_loss: 0.6503 Epoch 8/12 58/58 ━━━━━━━━━━━━━━━━━━━━ 2s 26ms/step - accuracy: 0.7854 - loss: 0.6169 - val_accuracy: 0.7427 - val_loss: 0.6237 Epoch 9/12 58/58 ━━━━━━━━━━━━━━━━━━━━ 1s 22ms/step - accuracy: 0.8121 - loss: 0.5706 - val_accuracy: 0.7330 - val_loss: 0.5842 Epoch 10/12 58/58 ━━━━━━━━━━━━━━━━━━━━ 2s 25ms/step - accuracy: 0.8156 - loss: 0.5216 - val_accuracy: 0.8155 - val_loss: 0.5510 Epoch 11/12 58/58 ━━━━━━━━━━━━━━━━━━━━ 2s 26ms/step - accuracy: 0.8766 - loss: 0.4452 - val_accuracy: 0.8204 - val_loss: 0.5165 Epoch 12/12 58/58 ━━━━━━━━━━━━━━━━━━━━ 2s 29ms/step - accuracy: 0.8929 - loss: 0.3925 - val_accuracy: 0.8204 - val_loss: 0.4923
Create visualizations for model accuracy and loss¶
# Draw one figure per metric, each showing the training curve next to the
# validation curve across epochs
for metric, label, title in (
    ('accuracy', 'Accuracy', "Model Accuracy"),
    ('loss', 'Loss', "Model Loss"),
):
    plt.plot(history.history[metric], label=label)
    plt.plot(history.history['val_' + metric], label='Validation ' + label)
    plt.title(title)
    plt.xlabel("Epoch")
    plt.legend()
    plt.show()
Evaluate the model using the test set¶
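The model predicts sentiment on the test set with an accuracy of about 82% (`model.evaluate` returns 0.8204). Note that this same test set was also used as the validation data during training, so this figure is likely somewhat optimistic.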
#Score the trained model on the held-out test split; returns [loss, accuracy]
#(binary cross-entropy loss and the accuracy metric set in model.compile)
model.evaluate(X_test, y_test)
7/7 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step - accuracy: 0.8104 - loss: 0.5035
[0.4923301339149475, 0.8203883767127991]
Make predictions using unseen data¶
# Function to run a prediction on a given text input
def predict_sentiment(text):
    """Predict and print the sentiment label for a raw piece of text.

    The text goes through the same cleaning steps as the training reviews
    (lower-casing, dropping non-letters and single letters, stopword removal,
    lemmatization) so its words line up with the tokenizer's vocabulary.
    It is then encoded and padded to the length the model was trained on.

    Parameters:
        text: the raw review text (str).

    Returns:
        The predicted label as an int: 1 for positive, 0 for negative.
        The label is also printed.
    """
    # Mirror the training-time normalization; otherwise inflected words and
    # stopwords would be encoded differently from what the model has seen
    letters_only = re.sub(r'[^a-z]', ' ', text.lower())
    kept_words = [
        word for word in letters_only.split()
        if len(word) > 1 and word not in STOPWORDS
    ]
    cleaned = ' '.join(
        lemmatizer.lemmatize(token, get_wordnet_pos(token))
        for token in word_tokenize(' '.join(kept_words))
    )

    tw = tokenizer.texts_to_sequences([cleaned])
    # Pad to the training sequence length; this also keeps the input well
    # formed when none of the words are in the vocabulary (empty sequence)
    tw = pad_sequences(tw, maxlen=max_length)

    # The sigmoid output is a probability; rounding gives the 0/1 label
    prediction = int(model.predict(tw).round().item())
    print("Predicted label: ", prediction)
    return prediction
#Make predictions on one positive and one negative sentence that are not
#part of the review data sets
#Positive example: expecting a prediction of 1
test_sentence1 = "I love my new gadget! I would recommend it to others."
predict_sentiment(test_sentence1)
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 234ms/step Predicted label: 1
#Negative example: expecting a prediction of 0
test_sentence2 = "I hate this phone. It's the worst!"
predict_sentiment(test_sentence2)
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 218ms/step Predicted label: 0
References¶
detro. (n.d.). Remove Stopwords From Text in Dataframe Column. https://www.datasnips.com/58/remove-stop-words-from-text-in-dataframe-column/
Prabhakaran, S. (n.d.). Lemmatization Approaches with Examples in Python. https://www.machinelearningplus.com/nlp/lemmatization-examples-python/
Sadli, R. (2020). Sentiment Analysis Using Keras Embedding Layer. https://machinelearningspace.com/sentiment-analysis-tensorflow/
Sewell, W. (n.d.). Sentiment Analysis I Presentation. Webinars - OneDrive (sharepoint.com)
Sewell, W. (n.d.). Sentiment Analysis # 3 Airline Tweets. Webinars - OneDrive (sharepoint.com)