In this notebook we perform sentiment analysis using a neural network¶

Import libraries¶
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from wordcloud import STOPWORDS
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn import model_selection
Read-in data¶
In [2]:
#Read in text files
yelp= pd.read_csv('yelp_labelled.txt', header= None, sep= '\t', names= ['Review', 'Sentiment'], quoting= 3)
amazon= pd.read_csv('amazon_cells_labelled.txt', header= None, sep= '\t', names= ['Review', 'Sentiment'], quoting= 3)
imdb= pd.read_csv('imdb_labelled.txt', header= None, sep= '\t', names= ['Review', 'Sentiment'], quoting= 3)
In [3]:
#Put them all together
reviews= pd.concat([yelp, amazon, imdb], ignore_index= True)
Perform Exploratory Data Analysis¶

The data consist of 3,000 reviews of products (Amazon), movies (IMDB), and services (Yelp). Each review is labeled as positive (1) or negative (0), with 1,500 reviews of each sentiment.

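As an optional check of the per-source composition, a minimal sketch; it assumes the yelp, amazon, and imdb dataframes from the read-in step are still in memory, and the Source column is added only for this check:

In [ ]:
#Optional sketch: count the reviews contributed by each source file
by_source = pd.concat([yelp.assign(Source='yelp'),
                       amazon.assign(Source='amazon'),
                       imdb.assign(Source='imdb')], ignore_index=True)
print(by_source.Source.value_counts())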
In [4]:
#Take a look at shape; expecting 3000 reviews
reviews.shape
Out[4]:
(3000, 2)
In [5]:
#Look at positive and negative sentiment counts; expecting 1500 of each, positive (1) and negative (0)
reviews.Sentiment.value_counts()
Out[5]:
Sentiment
1    1500
0    1500
Name: count, dtype: int64
In [6]:
#Check for missing values
reviews.isna().sum()
Out[6]:
Review       0
Sentiment    0
dtype: int64
In [7]:
#Tokenize to get number of unique words across all reviews
tokenizer= Tokenizer()
tokenizer.fit_on_texts(reviews.Review)
print("Vocab size before text cleaning: ", len(tokenizer.word_index))
Vocab size before text cleaning:  5271


Here we check which unique characters appear across all of the reviews.

In [ ]:
#Create list of characters
text = reviews['Review']
char_list = []
for rev in text:
    for char in rev:
        if char not in char_list:
            char_list.append(char)
print(char_list)
Perform text normalization¶

Because training neural networks is computationally expensive, it is advantageous to limit the input size while retaining the meaningful information in the text.

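As an illustration of the cleaning steps that follow, a minimal sketch applying the same lowercasing, character filtering, and stopword removal to one made-up review (the sample sentence is hypothetical, not taken from the data):

In [ ]:
#Illustration only: apply the cleaning steps below to a single sample sentence
sample = "The staff was SO friendly, I'd go back in a heartbeat!"
cleaned = sample.lower()
cleaned = re.sub(r'[^a-z]', ' ', cleaned)       #keep letters only
cleaned = re.sub(r'\s+[a-z]\s+', ' ', cleaned)  #drop stand-alone single characters
cleaned = ' '.join(w for w in cleaned.split() if w not in STOPWORDS)
print(cleaned)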
In [8]:
#Make all reviews lower case
reviews.Review = reviews.Review.str.lower()
In [9]:
#Remove non-letter characters and stand-alone single characters (Sadli, 2020)
reviews.Review = reviews.Review.apply(lambda x: re.sub(r'[^a-z]', ' ', x))
reviews.Review = reviews.Review.apply(lambda x: re.sub(r'\s+[a-z]\s+', ' ', x))
In [10]:
#Remove stopwords (detro, n.d.)
reviews.Review = reviews.Review.apply(lambda x: ' '.join([word for word in x.split() if word not in (STOPWORDS)]))
In [11]:
#Perform lemmatization (Prabhakaran, n.d.)
def get_wordnet_pos(word):
    #Map POS tag to first character lemmatize() accepts
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)
lemmatizer= WordNetLemmatizer()
reviews.Review = \
    reviews.Review.apply(lambda x: ' '.join([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in word_tokenize(x)]))
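
A quick optional sketch of what the POS-aware lemmatization above does to a few sample tokens (the tokens are illustrative, not drawn from the data):

In [ ]:
#Illustration only: lemmatize sample tokens using their inferred part of speech
for w in ['batteries', 'running', 'friendliest']:
    print(w, '->', lemmatizer.lemmatize(w, get_wordnet_pos(w)))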
In [12]:
#Create a column for word count
reviews['Word_Count'] = reviews.Review.apply(lambda x: len(x.split()))
In [13]:
#Remove entries with 3 words or fewer
reviews = reviews[reviews['Word_Count'] > 3]
In [14]:
#Tokenize to get number of unique words across all reviews
tokenizer= Tokenizer()
tokenizer.fit_on_texts(reviews.Review)
vocab_size= len(tokenizer.word_index) + 1
print("Vocab size after text cleaning: ", vocab_size)
Vocab size after text cleaning:  3998
Transform data for modeling¶
In [15]:
#Set max length based on word count of longest review
max_length = max(reviews['Word_Count'])
print("Number of words in longest review: ", max_length)
Number of words in longest review:  44
In [16]:
#Tokenize and encode reviews to numeric values
tokenizer= Tokenizer(num_words= vocab_size)
tokenizer.fit_on_texts(reviews.Review)
encoded_revs= tokenizer.texts_to_sequences(reviews.Review)
padded_revs= pad_sequences(encoded_revs, maxlen= max_length)
In [17]:
#Look at a padded review
padded_revs[1582]
Out[17]:
array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,  500,   32,  296,  355,   41, 3198, 3199,  190, 3200])
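
To see which words the ids in this padded review correspond to, an optional sketch that inverts the tokenizer's word index (zeros are padding):

In [ ]:
#Optional sketch: map the non-zero ids of the padded review above back to words
index_word = {i: w for w, i in tokenizer.word_index.items()}
print(' '.join(index_word[int(i)] for i in padded_revs[1582] if i != 0))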
In [18]:
#Train/test split
#X refers to the features (Reviews), Y to the labels (Sentiment)
X_train, X_test, y_train, y_test= \
model_selection.train_test_split(padded_revs, reviews.Sentiment, random_state= 42, test_size= 0.10)
In [19]:
#Number of training examples, used below for the input size
train_shape = X_train.shape[0]
print("Input shape will be: ", train_shape)
Input shape will be:  1854
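
For reference, a brief sketch contrasting the number of training rows with the length of each padded review (the second dimension of X_train, equal to max_length):

In [ ]:
#Sketch only: training rows vs. length of each padded review
print("Training rows: ", X_train.shape[0])
print("Padded review length: ", X_train.shape[1])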
In [ ]:
##################
#Modeling
#Remainder of code adapted from (Sewell, n.d.)
Model specification. This defines the architecture.¶
In [20]:
model= Sequential() 
model.add(Input(shape= (train_shape, )))
model.add(Embedding(vocab_size, 100))
model.add(LSTM(128, dropout= 0.5, recurrent_dropout= 0.5, activation= 'tanh'))
model.add(Dense(1, activation= 'sigmoid')) 
model.compile(loss= 'binary_crossentropy', optimizer= Adam(learning_rate= 0.0001), metrics= ['accuracy'])  
print(model.summary()) 
Model: "sequential"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓
┃ Layer (type)                         ┃ Output Shape                ┃         Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩
│ embedding (Embedding)                │ (None, 1854, 100)           │         399,800 │
├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
│ lstm (LSTM)                          │ (None, 128)                 │         117,248 │
├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
│ dense (Dense)                        │ (None, 1)                   │             129 │
└──────────────────────────────────────┴─────────────────────────────┴─────────────────┘
 Total params: 517,177 (1.97 MB)
 Trainable params: 517,177 (1.97 MB)
 Non-trainable params: 0 (0.00 B)
None
Create early stopping monitor to use during model fit¶
In [21]:
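#Stop training once the monitored metric (validation loss by default) fails to improve for 2 consecutive epochs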
early_stopping= EarlyStopping(patience= 2)
Fit the model on the training set using the test set for validation¶
In [22]:
history= model.fit(X_train, y_train, epochs= 12, validation_data= (X_test, y_test), callbacks= [early_stopping])
Epoch 1/12
58/58 ━━━━━━━━━━━━━━━━━━━━ 4s 32ms/step - accuracy: 0.5035 - loss: 0.6932 - val_accuracy: 0.5728 - val_loss: 0.6921
Epoch 2/12
58/58 ━━━━━━━━━━━━━━━━━━━━ 1s 24ms/step - accuracy: 0.5795 - loss: 0.6911 - val_accuracy: 0.5777 - val_loss: 0.6909
Epoch 3/12
58/58 ━━━━━━━━━━━━━━━━━━━━ 1s 23ms/step - accuracy: 0.6182 - loss: 0.6894 - val_accuracy: 0.6505 - val_loss: 0.6892
Epoch 4/12
58/58 ━━━━━━━━━━━━━━━━━━━━ 1s 21ms/step - accuracy: 0.6591 - loss: 0.6864 - val_accuracy: 0.7087 - val_loss: 0.6864
Epoch 5/12
58/58 ━━━━━━━━━━━━━━━━━━━━ 1s 22ms/step - accuracy: 0.7058 - loss: 0.6807 - val_accuracy: 0.6748 - val_loss: 0.6807
Epoch 6/12
58/58 ━━━━━━━━━━━━━━━━━━━━ 1s 25ms/step - accuracy: 0.7274 - loss: 0.6690 - val_accuracy: 0.6845 - val_loss: 0.6702
Epoch 7/12
58/58 ━━━━━━━━━━━━━━━━━━━━ 2s 27ms/step - accuracy: 0.7548 - loss: 0.6479 - val_accuracy: 0.7573 - val_loss: 0.6503
Epoch 8/12
58/58 ━━━━━━━━━━━━━━━━━━━━ 2s 26ms/step - accuracy: 0.7854 - loss: 0.6169 - val_accuracy: 0.7427 - val_loss: 0.6237
Epoch 9/12
58/58 ━━━━━━━━━━━━━━━━━━━━ 1s 22ms/step - accuracy: 0.8121 - loss: 0.5706 - val_accuracy: 0.7330 - val_loss: 0.5842
Epoch 10/12
58/58 ━━━━━━━━━━━━━━━━━━━━ 2s 25ms/step - accuracy: 0.8156 - loss: 0.5216 - val_accuracy: 0.8155 - val_loss: 0.5510
Epoch 11/12
58/58 ━━━━━━━━━━━━━━━━━━━━ 2s 26ms/step - accuracy: 0.8766 - loss: 0.4452 - val_accuracy: 0.8204 - val_loss: 0.5165
Epoch 12/12
58/58 ━━━━━━━━━━━━━━━━━━━━ 2s 29ms/step - accuracy: 0.8929 - loss: 0.3925 - val_accuracy: 0.8204 - val_loss: 0.4923
Create visualizations for model accuracy and loss¶
In [23]:
plt.plot(history.history['accuracy'], label='Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title("Model Accuracy")
plt.xlabel("Epoch")
plt.legend()
plt.show()
[Figure: Model Accuracy (training and validation accuracy by epoch)]
In [24]:
plt.plot(history.history['loss'], label='Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title("Model Loss")
plt.xlabel("Epoch")
plt.legend()
plt.show()
[Figure: Model Loss (training and validation loss by epoch)]
Evaluate the model using the test set¶

The model can predict sentiment with an accuracy of approximately 82% on the held-out test set.

In [25]:
model.evaluate(X_test, y_test)
7/7 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step - accuracy: 0.8104 - loss: 0.5035
Out[25]:
[0.4923301339149475, 0.8203883767127991]
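
For a fuller per-class breakdown than accuracy alone, an optional sketch using scikit-learn's classification_report, thresholding the sigmoid outputs at 0.5:

In [ ]:
#Optional sketch: precision, recall, and F1 per class on the test set
from sklearn.metrics import classification_report
y_pred = (model.predict(X_test) > 0.5).astype(int).ravel()
print(classification_report(y_test, y_pred, digits=3))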
Make predictions using unseen data¶
In [26]:
#Function to run a prediction on a given text input
def predict_sentiment(text):
    #Encode the text with the tokenizer fitted above
    tw= tokenizer.texts_to_sequences([text])
    #With no maxlen given, pad_sequences pads to the longest sequence in this batch
    tw= pad_sequences(tw)
    #Round the sigmoid output to a 0/1 label
    prediction= int(model.predict(tw).round().item())
    print("Predicted label: ", prediction)
In [27]:
#Make predictions on a positive and negative sentence

#Expecting a prediction of 1
test_sentence1 = "I love my new gadget! I would recommend it to others."
predict_sentiment(test_sentence1)
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 234ms/step
Predicted label:  1
In [28]:
#Expecting a prediction of 0
test_sentence2 = "I hate this phone. It's the worst!"
predict_sentiment(test_sentence2)
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 218ms/step
Predicted label:  0
References¶

detro. (n.d.). Remove Stopwords From Text in Dataframe Column. https://www.datasnips.com/58/remove-stop-words-from-text-in-dataframe-column/

Prabhakaran, S. (n.d.). Lemmatization Approaches with Examples in Python. https://www.machinelearningplus.com/nlp/lemmatization-examples-python/

Sadli, R. (2020). Sentiment Analysis Using Keras Embedding Layer. https://machinelearningspace.com/sentiment-analysis-tensorflow/

Sewell, W. (n.d.). Sentiment Analysis I Presentation. Webinars - OneDrive (sharepoint.com)

Sewell, W. (n.d.). Sentiment Analysis # 3 Airline Tweets. Webinars - OneDrive (sharepoint.com)