from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import itertools
import os
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils
# This code was tested with TensorFlow v1.4
print("You have TensorFlow version", tf.__version__)
You have TensorFlow version 2.11.0
df = pd.read_csv('Consumer_Complaints.csv.zip', encoding='latin-1')
df.head()
Date received | Product | Sub-product | Issue | Sub-issue | Consumer Complaint | Company Public Response | Company | State | ZIP code | Tags | Consumer consent provided? | Submitted via | Date Sent to Company | Company Response to Consumer | Timely response? | Consumer disputed? | Complaint ID | Unnamed: 18 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 03-12-2014 | Mortgage | Other mortgage | Loan modification,collection,foreclosure | NaN | NaN | NaN | M&T BANK CORPORATION | MI | 48382 | NaN | NaN | Referral | 03/17/2014 | Closed with explanation | Yes | No | 759217 | NaN |
1 | 10-01-2016 | Credit reporting | NaN | Incorrect information on credit report | Account status | I have outdated information on my credit repor... | Company has responded to the consumer and the ... | TRANSUNION INTERMEDIATE HOLDINGS, INC. | AL | 352XX | NaN | Consent provided | Web | 10-05-2016 | Closed with explanation | Yes | No | 2141773 | NaN |
2 | 10/17/2016 | Consumer Loan | Vehicle loan | Managing the loan or lease | NaN | I purchased a new car on XXXX XXXX. The car de... | NaN | CITIZENS FINANCIAL GROUP, INC. | PA | 177XX | Older American | Consent provided | Web | 10/20/2016 | Closed with explanation | Yes | No | 2163100 | NaN |
3 | 06-08-2014 | Credit card | NaN | Bankruptcy | NaN | NaN | NaN | AMERICAN EXPRESS COMPANY | ID | 83854 | Older American | NaN | Web | 06-10-2014 | Closed with explanation | Yes | Yes | 885638 | NaN |
4 | 09/13/2014 | Debt collection | Credit card | Communication tactics | Frequent or repeated calls | NaN | NaN | CITIBANK, N.A. | VA | 23233 | NaN | NaN | Web | 09/13/2014 | Closed with explanation | Yes | Yes | 1027760 | NaN |
col = ['Consumer Complaint', 'Product']
df = df[col]
df = df[pd.notnull(df['Consumer Complaint'])]
df.head()
Consumer Complaint | Product | |
---|---|---|
1 | I have outdated information on my credit repor... | Credit reporting |
2 | I purchased a new car on XXXX XXXX. The car de... | Consumer Loan |
7 | An account on my credit report has a mistaken ... | Credit reporting |
12 | This company refuses to provide me verificatio... | Debt collection |
16 | This complaint is in regards to Square Two Fin... | Debt collection |
df.isnull().sum()
Consumer Complaint 0 Product 0 dtype: int64
df['Product'].value_counts()
Debt collection 63268 Credit reporting, credit repair services, or other personal consumer reports 49006 Mortgage 43837 Credit reporting 31593 Credit card 18842 Student loan 16689 Bank account or service 14887 Credit card or prepaid card 10659 Consumer Loan 9474 Checking or savings account 6489 Money transfer, virtual currency, or money service 3089 Vehicle loan or lease 2791 Payday loan, title loan, or personal loan 2186 Payday loan 1748 Money transfers 1497 Prepaid card 1450 Other financial service 293 Virtual currency 16 Name: Product, dtype: int64
# Split data into train and test
train_size = int(len(df) * .8)
print ("Train size: %d" % train_size)
print ("Test size: %d" % (len(df) - train_size))
Train size: 222251 Test size: 55563
train_narrative = df['Consumer Complaint'][:train_size]
train_product = df['Product'][:train_size]
test_narrative = df['Consumer Complaint'][train_size:]
test_product = df['Product'][train_size:]
/var/folders/16/3468kndx5l1_zj5r84tsybgc0000gn/T/ipykernel_36318/39497960.py:1: FutureWarning: The behavior of `series[i:j]` with an integer-dtype index is deprecated. In a future version, this will be treated as *label-based* indexing, consistent with e.g. `series[i]` lookups. To retain the old behavior, use `series.iloc[i:j]`. To get the future behavior, use `series.loc[i:j]`. train_narrative = df['Consumer Complaint'][:train_size] /var/folders/16/3468kndx5l1_zj5r84tsybgc0000gn/T/ipykernel_36318/39497960.py:2: FutureWarning: The behavior of `series[i:j]` with an integer-dtype index is deprecated. In a future version, this will be treated as *label-based* indexing, consistent with e.g. `series[i]` lookups. To retain the old behavior, use `series.iloc[i:j]`. To get the future behavior, use `series.loc[i:j]`. train_product = df['Product'][:train_size] /var/folders/16/3468kndx5l1_zj5r84tsybgc0000gn/T/ipykernel_36318/39497960.py:4: FutureWarning: The behavior of `series[i:j]` with an integer-dtype index is deprecated. In a future version, this will be treated as *label-based* indexing, consistent with e.g. `series[i]` lookups. To retain the old behavior, use `series.iloc[i:j]`. To get the future behavior, use `series.loc[i:j]`. test_narrative = df['Consumer Complaint'][train_size:] /var/folders/16/3468kndx5l1_zj5r84tsybgc0000gn/T/ipykernel_36318/39497960.py:5: FutureWarning: The behavior of `series[i:j]` with an integer-dtype index is deprecated. In a future version, this will be treated as *label-based* indexing, consistent with e.g. `series[i]` lookups. To retain the old behavior, use `series.iloc[i:j]`. To get the future behavior, use `series.loc[i:j]`. test_product = df['Product'][train_size:]
max_words = 1000
tokenize = text.Tokenizer(num_words=max_words, char_level=False)
tokenize.fit_on_texts(train_narrative) # only fit on train
x_train = tokenize.texts_to_matrix(train_narrative)
x_test = tokenize.texts_to_matrix(test_narrative)
# Use sklearn utility to convert label strings to numbered index
encoder = LabelEncoder()
encoder.fit(train_product)
y_train = encoder.transform(train_product)
y_test = encoder.transform(test_product)
# Converts the labels to a one-hot representation
num_classes = np.max(y_train) + 1
y_train = utils.to_categorical(y_train, num_classes)
y_test = utils.to_categorical(y_test, num_classes)
# Inspect the dimenstions of our training and test data (this is helpful to debug)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)
x_train shape: (222251, 1000) x_test shape: (55563, 1000) y_train shape: (222251, 18) y_test shape: (55563, 18)
batch_size = 32
epochs = 5
# Build the model
model = Sequential()
model.add(Dense(512, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy',
optimizer='adam',
metrics=['accuracy'])
history = model.fit(x_train, y_train,
batch_size=batch_size,
epochs=epochs,
verbose=1,
validation_split=0.1)
Epoch 1/5
2023-01-12 00:46:54.296166: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
6251/6251 [==============================] - 15s 2ms/step - loss: 0.8670 - accuracy: 0.7269 - val_loss: 1.2860 - val_accuracy: 0.4665 Epoch 2/5 6251/6251 [==============================] - 15s 2ms/step - loss: 0.7470 - accuracy: 0.7582 - val_loss: 1.2719 - val_accuracy: 0.4960 Epoch 3/5 6251/6251 [==============================] - 15s 2ms/step - loss: 0.6891 - accuracy: 0.7762 - val_loss: 1.4310 - val_accuracy: 0.4723 Epoch 4/5 6251/6251 [==============================] - 15s 2ms/step - loss: 0.6442 - accuracy: 0.7901 - val_loss: 1.4292 - val_accuracy: 0.4808 Epoch 5/5 6251/6251 [==============================] - 15s 2ms/step - loss: 0.6013 - accuracy: 0.8029 - val_loss: 1.4415 - val_accuracy: 0.4891
# Evaluate the accuracy of our trained model
score = model.evaluate(x_test, y_test,
batch_size=batch_size, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])
1737/1737 [==============================] - 1s 588us/step - loss: 1.5201 - accuracy: 0.4798 Test score: 1.5201138257980347 Test accuracy: 0.4797797203063965
# Here's how to generate a prediction on individual examples
text_labels = encoder.classes_
for i in range(10):
prediction = model.predict(np.array([x_test[i]]))
predicted_label = text_labels[np.argmax(prediction)]
print(test_narrative.iloc[i][:50], "...")
print('Actual label:' + test_product.iloc[i])
print("Predicted label: " + predicted_label + "\n")
1/1 [==============================] - 0s 36ms/step On XXXX/XXXX/XXXX, I was contacted by Cavalry rega ... Actual label:Debt collection Predicted label: Debt collection 1/1 [==============================] - 0s 9ms/step XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX MN, X ... Actual label:Debt collection Predicted label: Debt collection 1/1 [==============================] - 0s 9ms/step I 'm currently having hardship and I ca n't make r ... Actual label:Payday loan, title loan, or personal loan Predicted label: Debt collection 1/1 [==============================] - 0s 8ms/step I tried to set up a credit freeze with Equifax on ... Actual label:Credit reporting, credit repair services, or other personal consumer reports Predicted label: Credit reporting 1/1 [==============================] - 0s 8ms/step XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX, CA XXXX S ... Actual label:Debt collection Predicted label: Debt collection 1/1 [==============================] - 0s 9ms/step I applied for a refinance through Caliber Home Loa ... Actual label:Mortgage Predicted label: Mortgage 1/1 [==============================] - 0s 9ms/step US Bank is my mortgage company. I live in XXXX XXX ... Actual label:Mortgage Predicted label: Mortgage 1/1 [==============================] - 0s 8ms/step Sent first certified letter XX/XX/XXXX explanation ... Actual label:Vehicle loan or lease Predicted label: Consumer Loan 1/1 [==============================] - 0s 8ms/step I tried to use Experian 's online application to p ... Actual label:Credit reporting, credit repair services, or other personal consumer reports Predicted label: Credit reporting 1/1 [==============================] - 0s 8ms/step The equifax company leaked my information to peopl ... Actual label:Credit reporting, credit repair services, or other personal consumer reports Predicted label: Credit reporting