import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from wordcloud import STOPWORDS
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn import model_selection

#Read in the three tab-separated files; none has a header row, so column names are supplied
#quoting=3 is csv.QUOTE_NONE, so quote characters inside a review are kept as literal text
read_opts = dict(header=None, sep='\t', names=['Review', 'Sentiment'], quoting=3)
yelp = pd.read_csv('yelp_labelled.txt', **read_opts)
amazon = pd.read_csv('amazon_cells_labelled.txt', **read_opts)
imdb = pd.read_csv('imdb_labelled.txt', **read_opts)

#Stack the three sources into one frame with a fresh 0..n-1 index
reviews = pd.concat([yelp, amazon, imdb], ignore_index=True)

#Take a look at shape; expecting 3000 reviews
reviews.shape

(3000, 2)

#Class balance check: expecting 1500 positive (1) and 1500 negative (0) labels
reviews['Sentiment'].value_counts()

Sentiment
1    1500
0    1500
Name: count, dtype: int64

#Count missing values per column
reviews.isnull().sum()

Review       0
Sentiment    0
dtype: int64

#Fit a tokenizer on the raw reviews to count the distinct words before any cleaning
tokenizer = Tokenizer()
tokenizer.fit_on_texts(reviews['Review'])
raw_word_index = tokenizer.word_index
print("Vocab size before text cleaning: ", len(raw_word_index))

Vocab size before text cleaning:  5271

#Collect every distinct character used in the reviews, in first-seen order
#dict.fromkeys keeps insertion order and drops repeats
text = reviews['Review']
char_list = list(dict.fromkeys(char for rev in text for char in rev))
print(char_list)

#Make all reviews lower case
reviews.Review = reviews.Review.str.lower()

#Remove punctuation and instances of a single character (Sadli, 2020)
#Step 1: anything that is not a-z (punctuation, digits, accented letters) becomes a space
reviews.Review = reviews.Review.apply(lambda x: re.sub(r'[^a-z]', ' ', x))
#Step 2: drop one-letter tokens. \b is zero-width and also matches at the start/end of the
#string, so a leading or trailing letter ("don t" -> "don") and runs of adjacent single
#letters ("a b c") are all removed; a pattern that consumes the surrounding whitespace
#cannot match those cases
reviews.Review = reviews.Review.apply(lambda x: re.sub(r'\b[a-z]\b', ' ', x))

#Remove stopwords (detro, n.d.)
#split()/join also collapses the runs of spaces left behind by the substitutions above
reviews.Review = reviews.Review.apply(lambda x: ' '.join(word for word in x.split() if word not in STOPWORDS))

#Perform lemmatization (Prabhakaran, n.d.)
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant to use when lemmatizing ``word``.

    The word is tagged on its own with ``nltk.pos_tag`` and the first letter of
    the resulting tag selects the constant: J -> adjective, N -> noun,
    V -> verb, R -> adverb. Any other tag falls back to ``wordnet.NOUN``.
    """
    pos_label = nltk.pos_tag([word])[0][1]
    initial = pos_label[0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    #"N" and every unmapped tag both resolve to noun
    return wordnet.NOUN
lemmatizer = WordNetLemmatizer()
#Lemmatize each token using the POS guessed for that token, then rebuild the review string
reviews.Review = reviews.Review.apply(
    lambda review: ' '.join(
        lemmatizer.lemmatize(token, get_wordnet_pos(token))
        for token in word_tokenize(review)
    )
)

#Create a column holding the number of whitespace-separated words left in each cleaned review
reviews['Word_Count'] = [len(review.split()) for review in reviews.Review]

#Remove entries with 3 words or less
reviews = reviews.loc[reviews['Word_Count'] > 3]

#Refit a fresh tokenizer on the cleaned reviews to get the reduced vocabulary
tokenizer = Tokenizer()
tokenizer.fit_on_texts(reviews['Review'])
#One extra slot on top of the word count; index 0 shows up as the padding value in the
#padded sequences (presumably never assigned to a word -- confirm against the Tokenizer docs)
vocab_size = 1 + len(tokenizer.word_index)
print("Vocab size after text cleaning: ", vocab_size)

Vocab size after text cleaning:  3998

#Longest cleaned review, in words; every sequence is padded to this length
max_length = reviews['Word_Count'].max()
print("Number of words in longest review: ", max_length)

Number of words in longest review:  44

#Tokenize and encode reviews to numeric values
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(reviews['Review'])
encoded_revs = tokenizer.texts_to_sequences(reviews['Review'])
#Left-pad with zeros ('pre' is the default) so every row has exactly max_length entries
padded_revs = pad_sequences(encoded_revs, maxlen=max_length, padding='pre')

#Look at a padded review
padded_revs[1582]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,  500,   32,  296,  355,   41, 3198, 3199,  190, 3200])

#Train/test split
#X refers to the features (padded reviews), y to the labels (Sentiment); 10% is held out
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    padded_revs, reviews.Sentiment, random_state=42, test_size=0.10)

#Needed for input size
#The model consumes one padded review at a time, so its input length is the sequence
#length (axis 1), not the number of training rows (axis 0)
train_shape = X_train.shape[1]
print("Input shape will be: ", train_shape)

Input shape will be:  1854

##################
#Modeling
#Remainder of code adapted from (Sewell, n.d.)

model = Sequential()
#Each sample is one padded review, i.e. max_length token ids (batch dimension excluded)
model.add(Input(shape=(max_length,)))
#Map every token id (0..vocab_size-1) to a 100-dimensional vector
model.add(Embedding(vocab_size, 100))
model.add(LSTM(128, dropout=0.5, recurrent_dropout=0.5, activation='tanh'))
#Single sigmoid unit: output is the probability of the positive class
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.0001), metrics=['accuracy'])
#summary() prints the table itself and returns None, so it is not wrapped in print()
model.summary()

Model: "sequential"

┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓
┃ Layer (type)                         ┃ Output Shape                ┃         Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩
│ embedding (Embedding)                │ (None, 1854, 100)           │         399,800 │
├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
│ lstm (LSTM)                          │ (None, 128)                 │         117,248 │
├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
│ dense (Dense)                        │ (None, 1)                   │             129 │
└──────────────────────────────────────┴─────────────────────────────┴─────────────────┘

 Total params: 517,177 (1.97 MB)

 Trainable params: 517,177 (1.97 MB)

 Non-trainable params: 0 (0.00 B)

None

#Stop training once val_loss has gone 2 epochs without improving
early_stopping = EarlyStopping(monitor='val_loss', patience=2)

#NOTE(review): the held-out test split doubles as the validation set here, so early
#stopping is driven by the same data that is later used for the final evaluation
history = model.fit(
    X_train,
    y_train,
    epochs=12,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping],
)

Epoch 1/12
58/58 ━━━━━━━━━━━━━━━━━━━━ 4s 32ms/step - accuracy: 0.5035 - loss: 0.6932 - val_accuracy: 0.5728 - val_loss: 0.6921
Epoch 2/12
58/58 ━━━━━━━━━━━━━━━━━━━━ 1s 24ms/step - accuracy: 0.5795 - loss: 0.6911 - val_accuracy: 0.5777 - val_loss: 0.6909
Epoch 3/12
58/58 ━━━━━━━━━━━━━━━━━━━━ 1s 23ms/step - accuracy: 0.6182 - loss: 0.6894 - val_accuracy: 0.6505 - val_loss: 0.6892
Epoch 4/12
58/58 ━━━━━━━━━━━━━━━━━━━━ 1s 21ms/step - accuracy: 0.6591 - loss: 0.6864 - val_accuracy: 0.7087 - val_loss: 0.6864
Epoch 5/12
58/58 ━━━━━━━━━━━━━━━━━━━━ 1s 22ms/step - accuracy: 0.7058 - loss: 0.6807 - val_accuracy: 0.6748 - val_loss: 0.6807
Epoch 6/12
58/58 ━━━━━━━━━━━━━━━━━━━━ 1s 25ms/step - accuracy: 0.7274 - loss: 0.6690 - val_accuracy: 0.6845 - val_loss: 0.6702
Epoch 7/12
58/58 ━━━━━━━━━━━━━━━━━━━━ 2s 27ms/step - accuracy: 0.7548 - loss: 0.6479 - val_accuracy: 0.7573 - val_loss: 0.6503
Epoch 8/12
58/58 ━━━━━━━━━━━━━━━━━━━━ 2s 26ms/step - accuracy: 0.7854 - loss: 0.6169 - val_accuracy: 0.7427 - val_loss: 0.6237
Epoch 9/12
58/58 ━━━━━━━━━━━━━━━━━━━━ 1s 22ms/step - accuracy: 0.8121 - loss: 0.5706 - val_accuracy: 0.7330 - val_loss: 0.5842
Epoch 10/12
58/58 ━━━━━━━━━━━━━━━━━━━━ 2s 25ms/step - accuracy: 0.8156 - loss: 0.5216 - val_accuracy: 0.8155 - val_loss: 0.5510
Epoch 11/12
58/58 ━━━━━━━━━━━━━━━━━━━━ 2s 26ms/step - accuracy: 0.8766 - loss: 0.4452 - val_accuracy: 0.8204 - val_loss: 0.5165
Epoch 12/12
58/58 ━━━━━━━━━━━━━━━━━━━━ 2s 29ms/step - accuracy: 0.8929 - loss: 0.3925 - val_accuracy: 0.8204 - val_loss: 0.4923

#Draw one figure per metric, each comparing the training curve with the validation curve
for metric_key, metric_label, figure_title in (
    ('accuracy', 'Accuracy', "Model Accuracy"),
    ('loss', 'Loss', "Model Loss"),
):
    plt.plot(history.history[metric_key], label=metric_label)
    plt.plot(history.history['val_' + metric_key], label='Validation ' + metric_label)
    plt.title(figure_title)
    plt.xlabel("Epoch")
    plt.legend()
    plt.show()

#Loss and accuracy on the held-out split, returned in the order [loss, accuracy]
model.evaluate(x=X_test, y=y_test)

7/7 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step - accuracy: 0.8104 - loss: 0.5035

[0.4923301339149475, 0.8203883767127991]

#Function to run a prediction on a given text input
def predict_sentiment(text):
    """Print the predicted sentiment label for one raw text string.

    The text is encoded with the fitted module-level ``tokenizer``, padded to
    ``max_length`` (the sequence length the model was trained on) and passed
    to ``model``. The sigmoid output is rounded to 1 (positive) or 0 (negative).

    Args:
        text: The sentence or review to classify.

    Returns:
        None. The label is printed.
    """
    tw = tokenizer.texts_to_sequences([text])
    #Pad/truncate to the training sequence length so inference inputs have the same
    #shape as the rows the model was fitted on
    tw = pad_sequences(tw, maxlen=max_length)
    #NOTE(review): the training reviews were also stripped of stopwords and lemmatized;
    #that cleaning is not repeated here, so inflected words may be missing from the
    #tokenizer vocabulary -- confirm whether that is acceptable
    prediction = int(model.predict(tw).round().item())
    print("Predicted label: ", prediction)

#Make predictions on a positive and negative sentence

#Positive example: expecting a prediction of 1
test_sentence1 = (
    "I love my new gadget! "
    "I would recommend it to others."
)
predict_sentiment(test_sentence1)

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 234ms/step
Predicted label:  1

#Negative example: expecting a prediction of 0
test_sentence2 = (
    "I hate this phone. "
    "It's the worst!"
)
predict_sentiment(test_sentence2)

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 218ms/step
Predicted label:  0

In this code we perform sentiment analysis using a neural network¶

Import libraries¶

Read-in data¶

Perform Exploratory Data Analysis¶

Perform text normalization¶

Transform data for modeling¶

Model specification. This defines the architecture.¶

Create early stopping monitor to use during model fit¶

Fit the model on the training set using the test set for validation¶

Create visualizations for model accuracy and loss¶

Evaluate the model using the test set¶

Make predictions using unseen data¶

References¶