In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
In [2]:
import itertools
import os

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils

# This code was tested with TensorFlow v1.4
print("You have TensorFlow version", tf.__version__)
You have TensorFlow version 2.11.0
In [3]:
df = pd.read_csv('Consumer_Complaints.csv.zip', encoding='latin-1')
df.head()
Out[3]:
Date received Product Sub-product Issue Sub-issue Consumer Complaint Company Public Response Company State ZIP code Tags Consumer consent provided? Submitted via Date Sent to Company Company Response to Consumer Timely response? Consumer disputed? Complaint ID Unnamed: 18
0 03-12-2014 Mortgage Other mortgage Loan modification,collection,foreclosure NaN NaN NaN M&T BANK CORPORATION MI 48382 NaN NaN Referral 03/17/2014 Closed with explanation Yes No 759217 NaN
1 10-01-2016 Credit reporting NaN Incorrect information on credit report Account status I have outdated information on my credit repor... Company has responded to the consumer and the ... TRANSUNION INTERMEDIATE HOLDINGS, INC. AL 352XX NaN Consent provided Web 10-05-2016 Closed with explanation Yes No 2141773 NaN
2 10/17/2016 Consumer Loan Vehicle loan Managing the loan or lease NaN I purchased a new car on XXXX XXXX. The car de... NaN CITIZENS FINANCIAL GROUP, INC. PA 177XX Older American Consent provided Web 10/20/2016 Closed with explanation Yes No 2163100 NaN
3 06-08-2014 Credit card NaN Bankruptcy NaN NaN NaN AMERICAN EXPRESS COMPANY ID 83854 Older American NaN Web 06-10-2014 Closed with explanation Yes Yes 885638 NaN
4 09/13/2014 Debt collection Credit card Communication tactics Frequent or repeated calls NaN NaN CITIBANK, N.A. VA 23233 NaN NaN Web 09/13/2014 Closed with explanation Yes Yes 1027760 NaN
In [4]:
col = ['Consumer Complaint', 'Product']
df = df[col]
df = df[pd.notnull(df['Consumer Complaint'])]
df.head()
Out[4]:
Consumer Complaint Product
1 I have outdated information on my credit repor... Credit reporting
2 I purchased a new car on XXXX XXXX. The car de... Consumer Loan
7 An account on my credit report has a mistaken ... Credit reporting
12 This company refuses to provide me verificatio... Debt collection
16 This complaint is in regards to Square Two Fin... Debt collection
In [5]:
df.isnull().sum()
Out[5]:
Consumer Complaint    0
Product               0
dtype: int64
In [6]:
df['Product'].value_counts()
Out[6]:
Debt collection                                                                 63268
Credit reporting, credit repair services, or other personal consumer reports    49006
Mortgage                                                                        43837
Credit reporting                                                                31593
Credit card                                                                     18842
Student loan                                                                    16689
Bank account or service                                                         14887
Credit card or prepaid card                                                     10659
Consumer Loan                                                                    9474
Checking or savings account                                                      6489
Money transfer, virtual currency, or money service                               3089
Vehicle loan or lease                                                            2791
Payday loan, title loan, or personal loan                                        2186
Payday loan                                                                      1748
Money transfers                                                                  1497
Prepaid card                                                                     1450
Other financial service                                                           293
Virtual currency                                                                   16
Name: Product, dtype: int64
In [7]:
# Split data into train and test
train_size = int(len(df) * .8)
print ("Train size: %d" % train_size)
print ("Test size: %d" % (len(df) - train_size))
Train size: 222251
Test size: 55563
In [8]:
train_narrative = df['Consumer Complaint'][:train_size]
train_product = df['Product'][:train_size]

test_narrative = df['Consumer Complaint'][train_size:]
test_product = df['Product'][train_size:]
/var/folders/16/3468kndx5l1_zj5r84tsybgc0000gn/T/ipykernel_36318/39497960.py:1: FutureWarning: The behavior of `series[i:j]` with an integer-dtype index is deprecated. In a future version, this will be treated as *label-based* indexing, consistent with e.g. `series[i]` lookups. To retain the old behavior, use `series.iloc[i:j]`. To get the future behavior, use `series.loc[i:j]`.
  train_narrative = df['Consumer Complaint'][:train_size]
/var/folders/16/3468kndx5l1_zj5r84tsybgc0000gn/T/ipykernel_36318/39497960.py:2: FutureWarning: The behavior of `series[i:j]` with an integer-dtype index is deprecated. In a future version, this will be treated as *label-based* indexing, consistent with e.g. `series[i]` lookups. To retain the old behavior, use `series.iloc[i:j]`. To get the future behavior, use `series.loc[i:j]`.
  train_product = df['Product'][:train_size]
/var/folders/16/3468kndx5l1_zj5r84tsybgc0000gn/T/ipykernel_36318/39497960.py:4: FutureWarning: The behavior of `series[i:j]` with an integer-dtype index is deprecated. In a future version, this will be treated as *label-based* indexing, consistent with e.g. `series[i]` lookups. To retain the old behavior, use `series.iloc[i:j]`. To get the future behavior, use `series.loc[i:j]`.
  test_narrative = df['Consumer Complaint'][train_size:]
/var/folders/16/3468kndx5l1_zj5r84tsybgc0000gn/T/ipykernel_36318/39497960.py:5: FutureWarning: The behavior of `series[i:j]` with an integer-dtype index is deprecated. In a future version, this will be treated as *label-based* indexing, consistent with e.g. `series[i]` lookups. To retain the old behavior, use `series.iloc[i:j]`. To get the future behavior, use `series.loc[i:j]`.
  test_product = df['Product'][train_size:]
In [9]:
max_words = 1000
tokenize = text.Tokenizer(num_words=max_words, char_level=False)
In [10]:
tokenize.fit_on_texts(train_narrative) # only fit on train
x_train = tokenize.texts_to_matrix(train_narrative)
x_test = tokenize.texts_to_matrix(test_narrative)
In [11]:
# Use sklearn utility to convert label strings to numbered index
encoder = LabelEncoder()
encoder.fit(train_product)
y_train = encoder.transform(train_product)
y_test = encoder.transform(test_product)
In [12]:
# Converts the labels to a one-hot representation
num_classes = np.max(y_train) + 1
y_train = utils.to_categorical(y_train, num_classes)
y_test = utils.to_categorical(y_test, num_classes)
In [13]:
# Inspect the dimenstions of our training and test data (this is helpful to debug)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)
x_train shape: (222251, 1000)
x_test shape: (55563, 1000)
y_train shape: (222251, 18)
y_test shape: (55563, 18)
In [14]:
batch_size = 32
epochs = 5
In [15]:
# Build the model
model = Sequential()
model.add(Dense(512, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
In [16]:
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1)
Epoch 1/5
2023-01-12 00:46:54.296166: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
6251/6251 [==============================] - 15s 2ms/step - loss: 0.8670 - accuracy: 0.7269 - val_loss: 1.2860 - val_accuracy: 0.4665
Epoch 2/5
6251/6251 [==============================] - 15s 2ms/step - loss: 0.7470 - accuracy: 0.7582 - val_loss: 1.2719 - val_accuracy: 0.4960
Epoch 3/5
6251/6251 [==============================] - 15s 2ms/step - loss: 0.6891 - accuracy: 0.7762 - val_loss: 1.4310 - val_accuracy: 0.4723
Epoch 4/5
6251/6251 [==============================] - 15s 2ms/step - loss: 0.6442 - accuracy: 0.7901 - val_loss: 1.4292 - val_accuracy: 0.4808
Epoch 5/5
6251/6251 [==============================] - 15s 2ms/step - loss: 0.6013 - accuracy: 0.8029 - val_loss: 1.4415 - val_accuracy: 0.4891
In [17]:
# Evaluate the accuracy of our trained model
score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])
1737/1737 [==============================] - 1s 588us/step - loss: 1.5201 - accuracy: 0.4798
Test score: 1.5201138257980347
Test accuracy: 0.4797797203063965
In [18]:
# Here's how to generate a prediction on individual examples
text_labels = encoder.classes_ 

for i in range(10):
    prediction = model.predict(np.array([x_test[i]]))
    predicted_label = text_labels[np.argmax(prediction)]
    print(test_narrative.iloc[i][:50], "...")
    print('Actual label:' + test_product.iloc[i])
    print("Predicted label: " + predicted_label + "\n")
1/1 [==============================] - 0s 36ms/step
On XXXX/XXXX/XXXX, I was contacted by Cavalry rega ...
Actual label:Debt collection
Predicted label: Debt collection

1/1 [==============================] - 0s 9ms/step
XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX MN, X ...
Actual label:Debt collection
Predicted label: Debt collection

1/1 [==============================] - 0s 9ms/step
I 'm currently having hardship and I ca n't make r ...
Actual label:Payday loan, title loan, or personal loan
Predicted label: Debt collection

1/1 [==============================] - 0s 8ms/step
I tried to set up a credit freeze with Equifax on  ...
Actual label:Credit reporting, credit repair services, or other personal consumer reports
Predicted label: Credit reporting

1/1 [==============================] - 0s 8ms/step
XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX, CA XXXX S ...
Actual label:Debt collection
Predicted label: Debt collection

1/1 [==============================] - 0s 9ms/step
I applied for a refinance through Caliber Home Loa ...
Actual label:Mortgage
Predicted label: Mortgage

1/1 [==============================] - 0s 9ms/step
US Bank is my mortgage company. I live in XXXX XXX ...
Actual label:Mortgage
Predicted label: Mortgage

1/1 [==============================] - 0s 8ms/step
Sent first certified letter XX/XX/XXXX explanation ...
Actual label:Vehicle loan or lease
Predicted label: Consumer Loan

1/1 [==============================] - 0s 8ms/step
I tried to use Experian 's online application to p ...
Actual label:Credit reporting, credit repair services, or other personal consumer reports
Predicted label: Credit reporting

1/1 [==============================] - 0s 8ms/step
The equifax company leaked my information to peopl ...
Actual label:Credit reporting, credit repair services, or other personal consumer reports
Predicted label: Credit reporting