from __future__ import absolute_import
from __future__ import division
from __future__ import print_function


import itertools
import os

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils

# Report which TensorFlow release is installed (this code was originally
# tested with TensorFlow v1.4).
print("You have TensorFlow version {}".format(tf.__version__))

You have TensorFlow version 2.11.0


# Read the zipped complaints CSV (decoded as latin-1) into a DataFrame and
# preview its first rows.
df = pd.read_csv(
    filepath_or_buffer='Consumer_Complaints.csv.zip',
    encoding='latin-1',
)
df.head()


# Keep only the complaint text and its product label, then discard the rows
# that have no complaint text.
col = ['Consumer Complaint', 'Product']
df = df[col].dropna(subset=['Consumer Complaint'])
df.head()


# Count the missing values that remain in each column.
df.isna().sum()

Consumer Complaint    0
Product               0
dtype: int64


# Show how many complaints fall under each product category.
df.loc[:, 'Product'].value_counts()

Debt collection                                                                 63268
Credit reporting, credit repair services, or other personal consumer reports    49006
Mortgage                                                                        43837
Credit reporting                                                                31593
Credit card                                                                     18842
Student loan                                                                    16689
Bank account or service                                                         14887
Credit card or prepaid card                                                     10659
Consumer Loan                                                                    9474
Checking or savings account                                                      6489
Money transfer, virtual currency, or money service                               3089
Vehicle loan or lease                                                            2791
Payday loan, title loan, or personal loan                                        2186
Payday loan                                                                      1748
Money transfers                                                                  1497
Prepaid card                                                                     1450
Other financial service                                                           293
Virtual currency                                                                   16
Name: Product, dtype: int64


# Size of the training split: 80% of the rows, truncated to an integer.
# Whatever is left over forms the test split.
train_size = int(.8 * len(df))
print("Train size: {:d}".format(train_size))
print("Test size: {:d}".format(len(df) - train_size))

Train size: 222251
Test size: 55563


# Positional split: the first `train_size` rows are used for training and the
# remaining rows for testing. `.iloc` makes the positional intent explicit;
# plain `series[i:j]` slicing on an integer index is deprecated in pandas and
# is slated to become label-based, which would select different rows here
# because the index is no longer contiguous after the null rows were dropped.
# NOTE(review): rows are taken in the DataFrame's current order without
# shuffling -- confirm that an ordered split is intended.
train_narrative = df['Consumer Complaint'].iloc[:train_size]
train_product = df['Product'].iloc[:train_size]

test_narrative = df['Consumer Complaint'].iloc[train_size:]
test_product = df['Product'].iloc[train_size:]

/var/folders/16/3468kndx5l1_zj5r84tsybgc0000gn/T/ipykernel_36318/39497960.py:1: FutureWarning: The behavior of `series[i:j]` with an integer-dtype index is deprecated. In a future version, this will be treated as *label-based* indexing, consistent with e.g. `series[i]` lookups. To retain the old behavior, use `series.iloc[i:j]`. To get the future behavior, use `series.loc[i:j]`.
  train_narrative = df['Consumer Complaint'][:train_size]
/var/folders/16/3468kndx5l1_zj5r84tsybgc0000gn/T/ipykernel_36318/39497960.py:2: FutureWarning: The behavior of `series[i:j]` with an integer-dtype index is deprecated. In a future version, this will be treated as *label-based* indexing, consistent with e.g. `series[i]` lookups. To retain the old behavior, use `series.iloc[i:j]`. To get the future behavior, use `series.loc[i:j]`.
  train_product = df['Product'][:train_size]
/var/folders/16/3468kndx5l1_zj5r84tsybgc0000gn/T/ipykernel_36318/39497960.py:4: FutureWarning: The behavior of `series[i:j]` with an integer-dtype index is deprecated. In a future version, this will be treated as *label-based* indexing, consistent with e.g. `series[i]` lookups. To retain the old behavior, use `series.iloc[i:j]`. To get the future behavior, use `series.loc[i:j]`.
  test_narrative = df['Consumer Complaint'][train_size:]
/var/folders/16/3468kndx5l1_zj5r84tsybgc0000gn/T/ipykernel_36318/39497960.py:5: FutureWarning: The behavior of `series[i:j]` with an integer-dtype index is deprecated. In a future version, this will be treated as *label-based* indexing, consistent with e.g. `series[i]` lookups. To retain the old behavior, use `series.iloc[i:j]`. To get the future behavior, use `series.loc[i:j]`.
  test_product = df['Product'][train_size:]


# Word-level tokenizer (char_level=False) whose vocabulary is capped via
# num_words=max_words; max_words is also the model's input width.
max_words = 1000
tokenize = text.Tokenizer(
    char_level=False,
    num_words=max_words,
)


# Learn the vocabulary from the training narratives only, then convert both
# splits to matrices using that same fitted tokenizer.
tokenize.fit_on_texts(train_narrative)
x_train, x_test = (
    tokenize.texts_to_matrix(narratives)
    for narratives in (train_narrative, test_narrative)
)


# Map the product label strings to integer class indices with sklearn's
# LabelEncoder. The encoder is fitted on the training labels only and then
# reused for the test labels.
encoder = LabelEncoder()
y_train = encoder.fit_transform(train_product)
y_test = encoder.transform(test_product)


# One-hot encode the integer labels. The class count is the largest training
# label index plus one.
num_classes = y_train.max() + 1
y_train, y_test = (
    utils.to_categorical(labels, num_classes)
    for labels in (y_train, y_test)
)


# Inspect the dimensions of our training and test data (this is helpful to debug)
for split_name, split_array in (
    ('x_train', x_train),
    ('x_test', x_test),
    ('y_train', y_train),
    ('y_test', y_test),
):
    print('{} shape: {}'.format(split_name, split_array.shape))

x_train shape: (222251, 1000)
x_test shape: (55563, 1000)
y_train shape: (222251, 18)
y_test shape: (55563, 18)


# Training hyperparameters: mini-batch size and number of epochs.
batch_size, epochs = 32, 5


# Build the model: a 512-unit dense layer with ReLU activation and 50%
# dropout, followed by a dense softmax output with one unit per class.
model = Sequential([
    Dense(512, input_shape=(max_words,)),
    Activation('relu'),
    Dropout(0.5),
    Dense(num_classes),
    Activation('softmax'),
])

# The labels are one-hot vectors, hence categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


# Train the network; validation_split=0.1 holds out 10% of the training
# arrays for validation. The returned History object is kept in `history`.
history = model.fit(
    x_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
    verbose=1,
)

Epoch 1/5

2023-01-12 00:46:54.296166: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz

6251/6251 [==============================] - 15s 2ms/step - loss: 0.8670 - accuracy: 0.7269 - val_loss: 1.2860 - val_accuracy: 0.4665
Epoch 2/5
6251/6251 [==============================] - 15s 2ms/step - loss: 0.7470 - accuracy: 0.7582 - val_loss: 1.2719 - val_accuracy: 0.4960
Epoch 3/5
6251/6251 [==============================] - 15s 2ms/step - loss: 0.6891 - accuracy: 0.7762 - val_loss: 1.4310 - val_accuracy: 0.4723
Epoch 4/5
6251/6251 [==============================] - 15s 2ms/step - loss: 0.6442 - accuracy: 0.7901 - val_loss: 1.4292 - val_accuracy: 0.4808
Epoch 5/5
6251/6251 [==============================] - 15s 2ms/step - loss: 0.6013 - accuracy: 0.8029 - val_loss: 1.4415 - val_accuracy: 0.4891


# Evaluate the trained model on the test split. `score` holds the loss
# followed by the accuracy metric.
score = model.evaluate(
    x_test,
    y_test,
    verbose=1,
    batch_size=batch_size,
)
for caption, value in zip(('Test score:', 'Test accuracy:'), score):
    print(caption, value)

1737/1737 [==============================] - 1s 588us/step - loss: 1.5201 - accuracy: 0.4798
Test score: 1.5201138257980347
Test accuracy: 0.4797797203063965


# Generate predictions for the first few test examples and print each one
# next to its true label.
text_labels = encoder.classes_

# Predict the whole preview batch with a single call instead of calling
# model.predict() once per example: this avoids the per-call overhead and the
# progress bar that was printed for every sample.
num_examples = 10
predictions = model.predict(x_test[:num_examples])
predicted_indices = np.argmax(predictions, axis=1)

# Iterating over the predictions (rather than range(10)) also keeps the loop
# in bounds when the test set has fewer than `num_examples` rows.
for i, class_index in enumerate(predicted_indices):
    predicted_label = text_labels[class_index]
    print(test_narrative.iloc[i][:50], "...")
    print('Actual label:' + test_product.iloc[i])
    print("Predicted label: " + predicted_label + "\n")

1/1 [==============================] - 0s 36ms/step
On XXXX/XXXX/XXXX, I was contacted by Cavalry rega ...
Actual label:Debt collection
Predicted label: Debt collection

1/1 [==============================] - 0s 9ms/step
XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX MN, X ...
Actual label:Debt collection
Predicted label: Debt collection

1/1 [==============================] - 0s 9ms/step
I 'm currently having hardship and I ca n't make r ...
Actual label:Payday loan, title loan, or personal loan
Predicted label: Debt collection

1/1 [==============================] - 0s 8ms/step
I tried to set up a credit freeze with Equifax on  ...
Actual label:Credit reporting, credit repair services, or other personal consumer reports
Predicted label: Credit reporting

1/1 [==============================] - 0s 8ms/step
XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX, CA XXXX S ...
Actual label:Debt collection
Predicted label: Debt collection

1/1 [==============================] - 0s 9ms/step
I applied for a refinance through Caliber Home Loa ...
Actual label:Mortgage
Predicted label: Mortgage

1/1 [==============================] - 0s 9ms/step
US Bank is my mortgage company. I live in XXXX XXX ...
Actual label:Mortgage
Predicted label: Mortgage

1/1 [==============================] - 0s 8ms/step
Sent first certified letter XX/XX/XXXX explanation ...
Actual label:Vehicle loan or lease
Predicted label: Consumer Loan

1/1 [==============================] - 0s 8ms/step
I tried to use Experian 's online application to p ...
Actual label:Credit reporting, credit repair services, or other personal consumer reports
Predicted label: Credit reporting

1/1 [==============================] - 0s 8ms/step
The equifax company leaked my information to peopl ...
Actual label:Credit reporting, credit repair services, or other personal consumer reports
Predicted label: Credit reporting

	Date received	Product	Sub-product	Issue	Sub-issue	Consumer Complaint	Company Public Response	Company	State	ZIP code	Tags	Consumer consent provided?	Submitted via	Date Sent to Company	Company Response to Consumer	Timely response?	Consumer disputed?	Complaint ID	Unnamed: 18
0	03-12-2014	Mortgage	Other mortgage	Loan modification,collection,foreclosure	NaN	NaN	NaN	M&T BANK CORPORATION	MI	48382	NaN	NaN	Referral	03/17/2014	Closed with explanation	Yes	No	759217	NaN
1	10-01-2016	Credit reporting	NaN	Incorrect information on credit report	Account status	I have outdated information on my credit repor...	Company has responded to the consumer and the ...	TRANSUNION INTERMEDIATE HOLDINGS, INC.	AL	352XX	NaN	Consent provided	Web	10-05-2016	Closed with explanation	Yes	No	2141773	NaN
2	10/17/2016	Consumer Loan	Vehicle loan	Managing the loan or lease	NaN	I purchased a new car on XXXX XXXX. The car de...	NaN	CITIZENS FINANCIAL GROUP, INC.	PA	177XX	Older American	Consent provided	Web	10/20/2016	Closed with explanation	Yes	No	2163100	NaN
3	06-08-2014	Credit card	NaN	Bankruptcy	NaN	NaN	NaN	AMERICAN EXPRESS COMPANY	ID	83854	Older American	NaN	Web	06-10-2014	Closed with explanation	Yes	Yes	885638	NaN
4	09/13/2014	Debt collection	Credit card	Communication tactics	Frequent or repeated calls	NaN	NaN	CITIBANK, N.A.	VA	23233	NaN	NaN	Web	09/13/2014	Closed with explanation	Yes	Yes	1027760	NaN