Classification using neural networks¶
Here we'll create a model capable of predicting whether a customer will churn, using labeled data from a fictitious telecommunications company.
#Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn import model_selection
#Read in data
data= pd.read_csv('churn_clean.csv')
The data consists of 50 variables. We'll select specific variables that may have an impact on churn.
#Select variables of interest
data = data[['Income', 'Churn', 'Outage_sec_perweek', 'Email', 'Contacts', 'Yearly_equip_failure', 'Techie',
'Contract', 'StreamingTV', 'StreamingMovies', 'Tenure', 'MonthlyCharge']]
There are 10,000 observations in the data set and 12 variables in total. Churn will be the target variable; the rest will be the predictors.
data.shape
(10000, 12)
The variables of type 'object' are categorical and are stored as strings; they'll need to be converted into dummy variables before modeling.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   Income                10000 non-null  float64
 1   Churn                 10000 non-null  object
 2   Outage_sec_perweek    10000 non-null  float64
 3   Email                 10000 non-null  int64
 4   Contacts              10000 non-null  int64
 5   Yearly_equip_failure  10000 non-null  int64
 6   Techie                10000 non-null  object
 7   Contract              10000 non-null  object
 8   StreamingTV           10000 non-null  object
 9   StreamingMovies       10000 non-null  object
 10  Tenure                10000 non-null  float64
 11  MonthlyCharge         10000 non-null  float64
dtypes: float64(4), int64(3), object(5)
memory usage: 937.6+ KB
Most of the categorical variables are in the form 'yes' or 'no'. We can simply replace that with 1 or 0.
#Manually create dummy variables for the yes/no columns
data['Churn'] = data['Churn'].replace({'Yes':1, 'No':0})
data['Techie'] = data['Techie'].replace({'Yes':1, 'No':0})
data['StreamingTV'] = data['StreamingTV'].replace({'Yes':1, 'No':0})
data['StreamingMovies'] = data['StreamingMovies'].replace({'Yes':1, 'No':0})
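An equivalent, more compact way to do the same encoding (a sketch; run it instead of, not after, the four replace calls above, since it assumes the columns still contain only 'Yes' and 'No') is to map the columns in a loop:
#Sketch: equivalent alternative to the four replace calls above
yes_no_cols = ['Churn', 'Techie', 'StreamingTV', 'StreamingMovies']
for col in yes_no_cols:
    data[col] = data[col].map({'Yes': 1, 'No': 0})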
The Contract variable has three values.
pd.unique(data.Contract)
array(['One year', 'Month-to-month', 'Two Year'], dtype=object)
We'll convert Contract to dummy variables using pandas' get_dummies function.
#Get dummies for Contract (has 3 levels)
data = pd.get_dummies(data, columns=['Contract'], dtype=int)
We'll scale the continuous variables before modeling.
#Scale the numeric variables
scaler = StandardScaler()
data[['Income', 'Outage_sec_perweek', 'Email', 'Contacts', 'Yearly_equip_failure', 'Tenure', 'MonthlyCharge']] = \
scaler.fit_transform(
data[['Income', 'Outage_sec_perweek', 'Email', 'Contacts', 'Yearly_equip_failure', 'Tenure', 'MonthlyCharge']])
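One caveat: because the scaler is fit on the full data set, the test rows created by the split further down also influence the scaling parameters. A stricter workflow (a sketch only, not what was run here) splits first and fits the scaler on the training rows alone:
#Sketch (alternative, not used above): split first, then fit the scaler on the
#training rows only, so the test rows cannot leak into the scaling parameters
num_cols = ['Income', 'Outage_sec_perweek', 'Email', 'Contacts',
            'Yearly_equip_failure', 'Tenure', 'MonthlyCharge']
raw = pd.read_csv('churn_clean.csv')[num_cols]
raw_train, raw_test = model_selection.train_test_split(raw, random_state=42, test_size=0.20)
alt_scaler = StandardScaler().fit(raw_train)
raw_train_scaled = alt_scaler.transform(raw_train)
raw_test_scaled = alt_scaler.transform(raw_test)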
All variables are numeric now.
data.head()
| | Income | Churn | Outage_sec_perweek | Email | Contacts | Yearly_equip_failure | Techie | StreamingTV | StreamingMovies | Tenure | MonthlyCharge | Contract_Month-to-month | Contract_One year | Contract_Two Year |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.398778 | 0 | -0.679978 | -0.666282 | -1.005852 | 0.946658 | 0 | 0 | 1 | -1.048746 | -0.003943 | 0 | 1 | 0 |
| 1 | -0.641954 | 1 | 0.570331 | -0.005288 | -1.005852 | 0.946658 | 1 | 1 | 1 | -1.262001 | 1.630326 | 1 | 0 | 0 |
| 2 | -1.070885 | 0 | 0.252347 | -0.996779 | -1.005852 | 0.946658 | 1 | 0 | 1 | -0.709940 | -0.295225 | 0 | 0 | 1 |
| 3 | -0.740525 | 0 | 1.650506 | 0.986203 | 1.017588 | -0.625864 | 1 | 1 | 0 | -0.659524 | -1.226521 | 0 | 0 | 1 |
| 4 | 0.009478 | 1 | -0.623156 | 1.316700 | 1.017588 | 0.946658 | 0 | 1 | 0 | -1.242551 | -0.528086 | 1 | 0 | 0 |
Creating dummies for Contract resulted in the variable being represented with three columns instead of one.
data.shape
(10000, 14)
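Since the three Contract columns always sum to one, one of them carries no extra information. That redundancy is harmless for a neural network, but get_dummies can drop a level with drop_first=True; the sketch below only illustrates the option on a fresh copy of the raw column, without changing the data used for modeling:
#Sketch: drop_first=True keeps only two of the three Contract levels
contract_raw = pd.read_csv('churn_clean.csv')[['Contract']]
pd.get_dummies(contract_raw, columns=['Contract'], drop_first=True, dtype=int).head()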
We'll remove Churn to create the group of predictors.
predictors = data.drop('Churn', axis= 1)
predictors.shape
(10000, 13)
We'll isolate Churn as the target variable.
target = data['Churn']
target.shape
(10000,)
We'll split the data into training and test sets using an 80/20 ratio.
X_train, X_test, y_train, y_test= \
model_selection.train_test_split(predictors, target, random_state= 42, test_size= 0.20)
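If churners are a minority class (common in churn data), a stratified split keeps the churn rate identical in the training and test sets. A sketch of that variant (not used here):
#Sketch (alternative): a stratified split that preserves the churn ratio in both sets
X_train_s, X_test_s, y_train_s, y_test_s = model_selection.train_test_split(
    predictors, target, random_state=42, test_size=0.20, stratify=target)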
Now we're ready to create the model using the Keras interface to TensorFlow.
# Set up the model
model = Sequential()
model.add(Input(shape= (predictors.shape[1], )))
model.add(Dense(500, activation= 'relu'))
model.add(Dense(1, activation= 'sigmoid'))
# Compile the model
model.compile(loss= 'binary_crossentropy', optimizer= Adam(learning_rate= 0.0001), metrics= ['accuracy'])
#View model summary
model.summary()
Model: "sequential_1"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓
┃ Layer (type)                         ┃ Output Shape                ┃         Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩
│ dense_2 (Dense)                      │ (None, 500)                 │           7,000 │
├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
│ dense_3 (Dense)                      │ (None, 1)                   │             501 │
└──────────────────────────────────────┴─────────────────────────────┴─────────────────┘
Total params: 7,501 (29.30 KB)
Trainable params: 7,501 (29.30 KB)
Non-trainable params: 0 (0.00 B)
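Dropout was imported above but isn't used in this architecture. If the model started to overfit, one option (a sketch, not the model trained below) would be to add a Dropout layer between the hidden and output layers:
#Sketch (alternative architecture, not trained below): add dropout for regularization
model_alt = Sequential()
model_alt.add(Input(shape=(predictors.shape[1],)))
model_alt.add(Dense(500, activation='relu'))
model_alt.add(Dropout(0.3))  #randomly zeroes 30% of the hidden activations during training
model_alt.add(Dense(1, activation='sigmoid'))
model_alt.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.0001), metrics=['accuracy'])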
We set up an early stopping monitor, which will help prevent overfitting by halting training once the validation loss stops improving.
early_stopping= EarlyStopping(patience= 2)
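By default EarlyStopping monitors the validation loss. A slightly more explicit configuration (a sketch, not the callback used below) also restores the weights from the best epoch rather than keeping those from the last one:
#Sketch (alternative): monitor val_loss explicitly and keep the best weights
early_stopping_alt = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)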
And now we fit the model on the training data, using the test data for validation.
# Fit the model
history = model.fit(X_train, y_train, epochs= 20, validation_data= (X_test, y_test), callbacks= [early_stopping])
Epoch 1/20
250/250 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.6815 - loss: 0.6182 - val_accuracy: 0.8005 - val_loss: 0.4587
Epoch 2/20
250/250 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.8334 - loss: 0.4192 - val_accuracy: 0.8610 - val_loss: 0.3554
Epoch 3/20
250/250 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.8630 - loss: 0.3345 - val_accuracy: 0.8785 - val_loss: 0.3066
Epoch 4/20
250/250 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.8716 - loss: 0.2970 - val_accuracy: 0.8845 - val_loss: 0.2852
Epoch 5/20
250/250 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.8725 - loss: 0.2831 - val_accuracy: 0.8890 - val_loss: 0.2751
Epoch 6/20
250/250 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.8766 - loss: 0.2699 - val_accuracy: 0.8890 - val_loss: 0.2703
Epoch 7/20
250/250 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.8816 - loss: 0.2622 - val_accuracy: 0.8900 - val_loss: 0.2677
Epoch 8/20
250/250 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.8849 - loss: 0.2574 - val_accuracy: 0.8895 - val_loss: 0.2670
Epoch 9/20
250/250 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.8854 - loss: 0.2549 - val_accuracy: 0.8900 - val_loss: 0.2660
Epoch 10/20
250/250 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.8862 - loss: 0.2503 - val_accuracy: 0.8920 - val_loss: 0.2647
Epoch 11/20
250/250 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.8881 - loss: 0.2482 - val_accuracy: 0.8930 - val_loss: 0.2643
Epoch 12/20
250/250 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.8894 - loss: 0.2482 - val_accuracy: 0.8925 - val_loss: 0.2634
Epoch 13/20
250/250 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.8890 - loss: 0.2498 - val_accuracy: 0.8925 - val_loss: 0.2636
Epoch 14/20
250/250 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.8928 - loss: 0.2469 - val_accuracy: 0.8925 - val_loss: 0.2634
The early stopping monitor stopped training after 14 epochs, since the validation loss was no longer improving.
Next, we can visualize model performance.
plt.plot(history.history['accuracy'], label='Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title("Model Accuracy")
plt.xlabel("Epoch")
plt.legend()
plt.show()
plt.plot(history.history['loss'], label='Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title("Model Loss")
plt.xlabel("Epoch")
plt.legend()
plt.show()
And finally evaluate the model on the test data.
model.evaluate(X_test, y_test)
63/63 ━━━━━━━━━━━━━━━━━━━━ 0s 702us/step - accuracy: 0.8850 - loss: 0.2732
[0.26341530680656433, 0.8924999833106995]
The model achieves an accuracy of about 89% on the test set, meaning it correctly predicts whether or not a customer will churn roughly 89% of the time.
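Overall accuracy hides how the errors are split between churners and non-churners. A quick way to check per-class performance (a sketch using scikit-learn) is to threshold the predicted probabilities at 0.5 and print a confusion matrix and classification report:
#Sketch: per-class performance on the test set
from sklearn.metrics import confusion_matrix, classification_report
probs = model.predict(X_test)              #predicted churn probabilities
preds = (probs > 0.5).astype(int).ravel()  #threshold at 0.5 to get class labels
print(confusion_matrix(y_test, preds))
print(classification_report(y_test, preds))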