This is a transactional data set containing all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based, registered, non-store online retailer. The company mainly sells unique all-occasion gifts, and many of its customers are wholesalers.
InvoiceNo: Invoice number. Nominal, a 6-digit integral number uniquely assigned to each transaction. If this code starts with letter 'c', it indicates a cancellation.
StockCode: Product (item) code. Nominal, a 5-digit integral number uniquely assigned to each distinct product.
Description: Product (item) name. Nominal.
Quantity: The quantities of each product (item) per transaction. Numeric.
InvoiceDate: Invoice Date and time. Numeric, the day and time when each transaction was generated.
UnitPrice: Unit price. Numeric, Product price per unit in sterling.
CustomerID: Customer number. Nominal, a 5-digit integral number uniquely assigned to each customer.
Country: Country name. Nominal, the name of the country where each customer resides.
It is a business critical requirement to understand the value derived from a customer. RFM is a method used for analyzing customer value.
Perform customer segmentation using RFM analysis. The resulting segments can be ordered from most valuable (most recent, most frequent, and highest-spending) to least valuable (least recent, least frequent, and lowest-spending). Bear in mind that identifying the most valuable RFM segments can capitalize on chance relationships in the data used for this analysis, so the findings should be validated before acting on them.
Import python libraries from PyData stack.
# for Extracting, Transforming, Loading Data--as well as--Processing, Analyzing, and Visualizing Data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
# for Statistical Analysis
from scipy import stats
# for Machine Learning
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import feature_engine
from feature_engine.outliers import Winsorizer
# for Sane and Clear Outputs
import warnings
warnings.filterwarnings("ignore")
The following cell is commented out as it only needs to be run once to create CSV versions of the given data. Reading the Excel format with pandas is noticeably slower and more memory-hungry than reading CSV, so the converted files are used from here on.
# excel = pd.read_excel('Online Retail.xlsx')
# excel.to_csv('retail.csv')
# excel = pd.read_excel('train.xlsx')
# excel.to_csv('train.csv')
# excel = pd.read_excel('test.xlsx')
# excel.to_csv('test.csv')
The 'ISO-8859-1' encoding is used because the file contains characters that the default 'UTF-8' codec does not decode cleanly; it also maps every character to a single byte, in keeping with the space-saving theme above.
df = pd.read_csv('retail.csv', encoding='ISO-8859-1',
index_col=0, parse_dates=['InvoiceDate'])
df.head()
InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
---|---|---|---|---|---|---|---|---|
0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-12-01 08:26:00 | 2.55 | 17850.0 | United Kingdom |
1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom |
2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-12-01 08:26:00 | 2.75 | 17850.0 | United Kingdom |
3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom |
4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom |
df.tail()
InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
---|---|---|---|---|---|---|---|---|
541904 | 581587 | 22613 | PACK OF 20 SPACEBOY NAPKINS | 12 | 2011-12-09 12:50:00 | 0.85 | 12680.0 | France |
541905 | 581587 | 22899 | CHILDREN'S APRON DOLLY GIRL | 6 | 2011-12-09 12:50:00 | 2.10 | 12680.0 | France |
541906 | 581587 | 23254 | CHILDRENS CUTLERY DOLLY GIRL | 4 | 2011-12-09 12:50:00 | 4.15 | 12680.0 | France |
541907 | 581587 | 23255 | CHILDRENS CUTLERY CIRCUS PARADE | 4 | 2011-12-09 12:50:00 | 4.15 | 12680.0 | France |
541908 | 581587 | 22138 | BAKING SET 9 PIECE RETROSPOT | 3 | 2011-12-09 12:50:00 | 4.95 | 12680.0 | France |
Perform a preliminary data inspection and data cleaning
a. Check for missing data and formulate an apt strategy to treat it.
b. Are there any duplicate records? Remove them if present.
c. Perform descriptive analytics on the given data.
The following function is a personal preference carried over from past projects/assignments, as I find it more robust than `df.info()`. The `min` and `max` columns are a recent addition, prompted by initially overlooking the negative values in this dataset.
def summary(df):
    types = df.dtypes
    counts = df.apply(lambda x: x.count())
    uniques = df.apply(lambda x: x.unique().shape[0])
    nulls = df.apply(lambda x: x.isnull().sum())
    # min/max expose negative and out-of-range values that df.info() would not surface
    minimum = df.min()
    maximum = df.max()
    print('Data shape:', df.shape)
    cols = ['types', 'counts', 'uniques', 'nulls', 'min', 'max']
    summary = pd.concat([types, counts, uniques, nulls,
                         minimum, maximum], axis=1, sort=True)
    summary.columns = cols
    print('___________________________\nData types:')
    print(summary.types.value_counts())
    print('___________________________')
    return summary
details = summary(df)
display(details.sort_values(by='nulls', ascending=False))
Data shape: (541909, 8) ___________________________ Data types: object 4 float64 2 datetime64[ns] 1 int64 1 Name: types, dtype: int64 ___________________________
types | counts | uniques | nulls | min | max | |
---|---|---|---|---|---|---|
CustomerID | float64 | 406829 | 4373 | 135080 | 12346.0 | 18287.0 |
Description | object | 540455 | 4224 | 1454 | NaN | NaN |
Country | object | 541909 | 38 | 0 | Australia | Unspecified |
InvoiceDate | datetime64[ns] | 541909 | 23260 | 0 | 2010-12-01 08:26:00 | 2011-12-09 12:50:00 |
InvoiceNo | object | 541909 | 25900 | 0 | 536365 | C581569 |
Quantity | int64 | 541909 | 722 | 0 | -80995 | 80995 |
StockCode | object | 541909 | 4070 | 0 | 10002 | m |
UnitPrice | float64 | 541909 | 1630 | 0 | -11062.06 | 38970.0 |
df.describe()
Quantity | UnitPrice | CustomerID | |
---|---|---|---|
count | 541909.000000 | 541909.000000 | 406829.000000 |
mean | 9.552250 | 4.611114 | 15287.690570 |
std | 218.081158 | 96.759853 | 1713.600303 |
min | -80995.000000 | -11062.060000 | 12346.000000 |
25% | 1.000000 | 1.250000 | 13953.000000 |
50% | 3.000000 | 2.080000 | 15152.000000 |
75% | 10.000000 | 4.130000 | 16791.000000 |
max | 80995.000000 | 38970.000000 | 18287.000000 |
Almost 25% of our records lack a **CustomerID** and must be dropped, as they cannot be included in our RFM model.
df.isnull().mean()
InvoiceNo 0.000000 StockCode 0.000000 Description 0.002683 Quantity 0.000000 InvoiceDate 0.000000 UnitPrice 0.000000 CustomerID 0.249267 Country 0.000000 dtype: float64
But as we'll see later, those undocumented customers come from our primary data source--the UK--and could reflect cash payments. So no one needs to be fired... yet.
df[df.isnull().any(axis=1)].Country.value_counts(normalize=True)
United Kingdom 0.989044 EIRE 0.005264 Hong Kong 0.002132 Unspecified 0.001495 Switzerland 0.000925 France 0.000489 Israel 0.000348 Portugal 0.000289 Bahrain 0.000015 Name: Country, dtype: float64
Almost 1% of our data consists of duplicate or redundant records.
df.duplicated().mean()
0.009721189350979592
df.loc[(df.UnitPrice < 0)]
InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
---|---|---|---|---|---|---|---|---|
299983 | A563186 | B | Adjust bad debt | 1 | 2011-08-12 14:51:00 | -11062.06 | NaN | United Kingdom |
299984 | A563187 | B | Adjust bad debt | 1 | 2011-08-12 14:52:00 | -11062.06 | NaN | United Kingdom |
df.loc[(df.Quantity < 0)]
InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
---|---|---|---|---|---|---|---|---|
141 | C536379 | D | Discount | -1 | 2010-12-01 09:41:00 | 27.50 | 14527.0 | United Kingdom |
154 | C536383 | 35004C | SET OF 3 COLOURED FLYING DUCKS | -1 | 2010-12-01 09:49:00 | 4.65 | 15311.0 | United Kingdom |
235 | C536391 | 22556 | PLASTERS IN TIN CIRCUS PARADE | -12 | 2010-12-01 10:24:00 | 1.65 | 17548.0 | United Kingdom |
236 | C536391 | 21984 | PACK OF 12 PINK PAISLEY TISSUES | -24 | 2010-12-01 10:24:00 | 0.29 | 17548.0 | United Kingdom |
237 | C536391 | 21983 | PACK OF 12 BLUE PAISLEY TISSUES | -24 | 2010-12-01 10:24:00 | 0.29 | 17548.0 | United Kingdom |
... | ... | ... | ... | ... | ... | ... | ... | ... |
540449 | C581490 | 23144 | ZINC T-LIGHT HOLDER STARS SMALL | -11 | 2011-12-09 09:57:00 | 0.83 | 14397.0 | United Kingdom |
541541 | C581499 | M | Manual | -1 | 2011-12-09 10:28:00 | 224.69 | 15498.0 | United Kingdom |
541715 | C581568 | 21258 | VICTORIAN SEWING BOX LARGE | -5 | 2011-12-09 11:57:00 | 10.95 | 15311.0 | United Kingdom |
541716 | C581569 | 84978 | HANGING HEART JAR T-LIGHT HOLDER | -1 | 2011-12-09 11:58:00 | 1.25 | 17315.0 | United Kingdom |
541717 | C581569 | 20979 | 36 PENCILS TUBE RED RETROSPOT | -5 | 2011-12-09 11:58:00 | 1.25 | 17315.0 | United Kingdom |
10624 rows × 8 columns
91% of our transactions are from the UK so we may want to consider isolating it and analyzing it separately.
df.Country.value_counts(normalize=True)
United Kingdom 0.914320 Germany 0.017521 France 0.015790 EIRE 0.015124 Spain 0.004674 Netherlands 0.004375 Belgium 0.003818 Switzerland 0.003694 Portugal 0.002803 Australia 0.002323 Norway 0.002004 Italy 0.001482 Channel Islands 0.001399 Finland 0.001283 Cyprus 0.001148 Sweden 0.000853 Unspecified 0.000823 Austria 0.000740 Denmark 0.000718 Japan 0.000661 Poland 0.000629 Israel 0.000548 USA 0.000537 Hong Kong 0.000531 Singapore 0.000423 Iceland 0.000336 Canada 0.000279 Greece 0.000269 Malta 0.000234 United Arab Emirates 0.000125 European Community 0.000113 RSA 0.000107 Lebanon 0.000083 Lithuania 0.000065 Brazil 0.000059 Czech Republic 0.000055 Bahrain 0.000035 Saudi Arabia 0.000018 Name: Country, dtype: float64
Drop duplicates
df = df.drop_duplicates()
Ignore irrelevant data: cancellations and adjustments with non-positive Quantity or UnitPrice
df = df[df.Quantity > 0]
df = df[df.UnitPrice > 0]
Drop null **CustomerID**'s / Keep null **Description**'s
df = df[pd.notnull(df['CustomerID'])]
df['CustomerID'] = df['CustomerID'].astype('int')
Add a simplified **Date** column (year-month) to group by
# InvoiceDate was already parsed as datetime on read, so a single strftime is enough
df['Date'] = df['InvoiceDate'].dt.strftime('%Y-%m')
*Sanity Check*
df.duplicated().mean()
0.0
df.isnull().mean()
InvoiceNo 0.0 StockCode 0.0 Description 0.0 Quantity 0.0 InvoiceDate 0.0 UnitPrice 0.0 CustomerID 0.0 Country 0.0 Date 0.0 dtype: float64
Add **Revenue** column
df['Revenue'] = df['Quantity']*df['UnitPrice']
df.to_csv('eda.csv')
# Count unique invoices per month (a plain value_counts on Date would count line items)
monthly_invoices = df.groupby('Date')['InvoiceNo'].nunique()
plt.figure(figsize=(12, 12))
sns.barplot(x=monthly_invoices.index, y=monthly_invoices.values, palette='viridis')
plt.title("Invoices per Month")
plt.show()
top_products = df['Description'].value_counts()[:25]
plt.figure(figsize=(12, 12))
sns.barplot(y=top_products.index,
x=top_products.values,
palette='viridis_r')
plt.title("Top selling products")
plt.show()
Cohort Analysis: A cohort is a group of subjects who share a defining characteristic. We can observe how a cohort behaves across time and compare it to other cohorts.
a. Create month cohorts and analyse active customers for each cohort.
b. Also Analyse the retention rate of customers. Comment.
For cohort analysis, there are a few labels that we have to create:
# Create a copy of dataframe for cohort analysis
cohort = df.copy()
Defining a cohort is the first step to cohort analysis. We will now create monthly cohorts based on the month each customer has made their first transaction.
# Define a function that will parse the date
def get_month(x):
return dt.datetime(x.year, x.month, 1)
# Create InvoiceMonth column
cohort['InvoiceMonth'] = cohort['InvoiceDate'].apply(get_month)
# Group by CustomerID and select the InvoiceMonth value
grouping = cohort.groupby('CustomerID')['InvoiceMonth']
# Assign each customer's minimum InvoiceMonth as their CohortMonth
cohort['CohortMonth'] = grouping.transform('min')
Calculating time offset for each transaction allows you to report the metrics for each cohort in a comparable fashion.
First, we will create some variables that capture the integer values of year and month for the invoice and cohort dates, using the get_date_int() function.
def get_date_int(df, column):
year = df[column].dt.year
month = df[column].dt.month
return year, month
# Get the integers for date parts from the `InvoiceMonth` column
invoice_year, invoice_month = get_date_int(cohort, 'InvoiceMonth')
# Get the integers for date parts from the `CohortMonth` column
cohort_year, cohort_month = get_date_int(cohort, 'CohortMonth')
# Calculate difference in years
years_diff = invoice_year - cohort_year
# Calculate difference in months
months_diff = invoice_month - cohort_month
# Calculate the cohort index: months elapsed since the cohort month (+1 so the cohort month itself is index 1)
cohort['CohortIndex'] = years_diff * 12 + months_diff + 1
cohort.head()
InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | Date | Revenue | InvoiceMonth | CohortMonth | CohortIndex | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-12-01 08:26:00 | 2.55 | 17850 | United Kingdom | 2010-12 | 15.30 | 2010-12-01 | 2010-12-01 | 1 |
1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-12-01 08:26:00 | 3.39 | 17850 | United Kingdom | 2010-12 | 20.34 | 2010-12-01 | 2010-12-01 | 1 |
2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-12-01 08:26:00 | 2.75 | 17850 | United Kingdom | 2010-12 | 22.00 | 2010-12-01 | 2010-12-01 | 1 |
3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-12-01 08:26:00 | 3.39 | 17850 | United Kingdom | 2010-12 | 20.34 | 2010-12-01 | 2010-12-01 | 1 |
4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-12-01 08:26:00 | 3.39 | 17850 | United Kingdom | 2010-12 | 20.34 | 2010-12-01 | 2010-12-01 | 1 |
Customer retention is a very useful metric for understanding how many customers are still active. It gives the percentage of active customers in each period relative to the cohort's initial size.
grouping = cohort.groupby(['CohortMonth', 'CohortIndex'])
# Count the number of unique customers in each (CohortMonth, CohortIndex) group
cohort_data = grouping['CustomerID'].apply(pd.Series.nunique).reset_index()
# Create a pivot
cohort_counts = cohort_data.pivot(
index='CohortMonth', columns='CohortIndex', values='CustomerID')
# Select the first column and store it to cohort_sizes
cohort_sizes = cohort_counts.iloc[:, 0]
# Divide the cohort count by cohort sizes along the rows
retention = cohort_counts.divide(cohort_sizes, axis=0)*100
month_list = ["Dec '10", "Jan '11", "Feb '11", "Mar '11", "Apr '11",
"May '11", "Jun '11", "Jul '11", "Aug '11", "Sep '11",
"Oct '11", "Nov '11", "Dec '11"]
plt.figure(figsize=(12, 12))
sns.heatmap(data=retention,
annot=True,
vmin=0.0,
cmap='viridis',
vmax=list(retention.max().sort_values(ascending=False))[1]+3,
fmt='.1f',
linewidth=0.3,
yticklabels=month_list)
plt.show()
Now we will calculate the average price metric and analyze if there are any differences in shopping patterns across time and across cohorts.
# Create a groupby object and pass the monthly cohort and cohort index as a list
grouping = cohort.groupby(['CohortMonth', 'CohortIndex'])
# Calculate the average of the unit price column
cohort_data = grouping['UnitPrice'].mean()
# Reset the index of cohort_data
cohort_data = cohort_data.reset_index()
# Create a pivot
average_price = cohort_data.pivot(
index='CohortMonth', columns='CohortIndex', values='UnitPrice')
average_price = average_price.round(1)
average_price.index = average_price.index.date
plt.figure(figsize=(12, 12))
sns.heatmap(data=average_price,
annot=True,
vmin=0.0,
cmap='viridis',
vmax=list(average_price.max().sort_values(ascending=False))[1]+3,
fmt='.1f',
linewidth=0.3,
yticklabels=month_list)
plt.show()
Now we will calculate the average quantity metric and analyze if there are any differences in shopping patterns across time and across cohorts.
# Create a groupby object and pass the monthly cohort and cohort index as a list
grouping = cohort.groupby(['CohortMonth', 'CohortIndex'])
# Calculate the average of the Quantity column
cohort_data = grouping['Quantity'].mean()
# Reset the index of cohort_data
cohort_data = cohort_data.reset_index()
# Create a pivot
average_quantity = cohort_data.pivot(
index='CohortMonth', columns='CohortIndex', values='Quantity')
plt.figure(figsize=(12, 12))
sns.heatmap(data=average_quantity,
annot=True,
vmin=0.0,
cmap='viridis',
vmax=list(average_quantity.max().sort_values(
ascending=False))[1]+3,
fmt='.1f',
linewidth=0.3,
yticklabels=month_list)
plt.show()
dummies = pd.get_dummies(df['Date'])
df = df.join(dummies)
all_months = df.groupby('Date').agg({'CustomerID': 'nunique', 'Quantity': 'sum', 'Revenue': 'sum', '2010-12': 'sum',
'2011-01': 'sum', '2011-02': 'sum', '2011-03': 'sum', '2011-04': 'sum', '2011-05': 'sum', '2011-06': 'sum',
'2011-07': 'sum', '2011-08': 'sum', '2011-09': 'sum', '2011-10': 'sum', '2011-11': 'sum', '2011-12': 'sum'})
all_customers = df.groupby('CustomerID').agg({'Date': ['min', 'max', 'nunique'], 'Quantity': 'sum', 'Revenue': 'sum', '2010-12': 'sum',
'2011-01': 'sum', '2011-02': 'sum', '2011-03': 'sum', '2011-04': 'sum', '2011-05': 'sum', '2011-06': 'sum',
'2011-07': 'sum', '2011-08': 'sum', '2011-09': 'sum', '2011-10': 'sum', '2011-11': 'sum', '2011-12': 'sum'})
Build an RFM model – Recency, Frequency and Monetary – based on customer behaviour.
Recency is about when a customer's last order was placed: the number of days since the customer made their last purchase. For a website or an app, this could be interpreted as the last visit day or the last login time.
Frequency is the number of purchases in a given period (3 months, 6 months, 1 year, and so on), i.e. how often a customer bought from the company. The bigger the value, the more engaged the customer. Does that alone make them a VIP? Not necessarily, because we also have to consider how much they actually paid per purchase, which is the monetary value.
Monetary is the total amount of money a customer spent in that given period. This is what differentiates big spenders, such as MVPs or VIPs, from other customers.
a. Calculate RFM metrics.
i. Recency as the time in no. of days since last transaction
ii. Frequency as count of purchases done
iii. Monetary value as total amount spend
b. Build RFM Segments.
i. Give Recency, Frequency and Monetary scores individually by dividing them into quartiles.
Note: Rate "Recency" higher for customers who have been active more recently than for less recent customers, because every company wants its customers to be recent. Rate "Frequency" and "Monetary Value" higher for larger values, because we want customers to spend more money and visit more often.
ii. Combine the three ratings to get an RFM segment (as a string).
iii. Get the RFM score by adding up the three ratings.
c. Analyse the RFM Segments by summarizing them and comment on the findings.
To perform RFM analysis, we divide customers into four equal groups according to the distribution of values for recency, frequency, and monetary value. Four equal groups across three variables create 64 (4x4x4) different customer segments, which is a manageable number.
For example, a customer with R=4, F=4 and M=4 belongs to RFM segment 4-4-4 (Best Customers).
Below is a table with key RFM segments:
Segment | RFM | Description | Marketing |
---|---|---|---|
Best Customers | 444 | Customers who bought most recently, most often, and spent the most. | New products and Loyalty programs |
Loyal Customers | 344 | Customers who buy most often. | Use R and M to further segment |
Big Spenders | 334 | Customers who spent the most. | Market most expensive products |
Almost Lost | 244 | Haven't purchased recently, but did make frequent purchases at high cost. | Aggressive price incentives |
Lost Customers | 144 | Haven't purchased in some time, but used to make frequent purchases at high cost. | Aggressive price incentives |
Lost Cheap Customers | 122 | Haven't purchased in some time; did not purchase often or spend very much. | Don't spend too much to reacquire |
Recency: the number of days since each customer's last purchase, measured against the most recent invoice date in the data set.
df['InvoiceDate'].max()
Timestamp('2011-12-09 12:50:00')
# Let's set this date as 'today' for the rest of the analysis
current_date = dt.date(2011, 12, 9)
# Lets create a date column for date values only
df['Purchase_Date'] = df.InvoiceDate.dt.date
recency = df.groupby('CustomerID')['Purchase_Date'].max().reset_index()
# Create a separate column for this date.
recency = recency.assign(Current_Date=current_date)
# Compute the number of days since last purchase
recency['Recency'] = recency.Purchase_Date.apply(
lambda x: (current_date - x).days)
recency.head()
CustomerID | Purchase_Date | Current_Date | Recency | |
---|---|---|---|---|
0 | 12346 | 2011-01-18 | 2011-12-09 | 325 |
1 | 12347 | 2011-12-07 | 2011-12-09 | 2 |
2 | 12348 | 2011-09-25 | 2011-12-09 | 75 |
3 | 12349 | 2011-11-21 | 2011-12-09 | 18 |
4 | 12350 | 2011-02-02 | 2011-12-09 | 310 |
# Drop the irrelevant Date columns
recency.drop(['Purchase_Date', 'Current_Date'], axis=1, inplace=True)
Frequency: the number of purchases (unique invoices) per customer over the period.
frequency = df.groupby('CustomerID').InvoiceNo.nunique(
).reset_index().rename(columns={'InvoiceNo': 'Frequency'})
frequency.head()
CustomerID | Frequency | |
---|---|---|
0 | 12346 | 1 |
1 | 12347 | 7 |
2 | 12348 | 4 |
3 | 12349 | 1 |
4 | 12350 | 1 |
Monetary: the total amount each customer spent over the period.
# Create a column for the total cost of each line item
df['Total_cost'] = df.Quantity * df.UnitPrice
monetary = df.groupby('CustomerID').Total_cost.sum(
).reset_index().rename(columns={'Total_cost': 'Monetary'})
monetary.head()
CustomerID | Monetary | |
---|---|---|
0 | 12346 | 77183.60 |
1 | 12347 | 4310.00 |
2 | 12348 | 1797.24 |
3 | 12349 | 1757.55 |
4 | 12350 | 334.40 |
Now combine all three to form an aggregated RFM table.
temp = recency.merge(frequency, on='CustomerID')
rfm_table = temp.merge(monetary, on='CustomerID')
rfm_table.head()
CustomerID | Recency | Frequency | Monetary | |
---|---|---|---|---|
0 | 12346 | 325 | 1 | 77183.60 |
1 | 12347 | 2 | 7 | 4310.00 |
2 | 12348 | 75 | 4 | 1797.24 |
3 | 12349 | 18 | 1 | 1757.55 |
4 | 12350 | 310 | 1 | 334.40 |
The simplest way to create customer segments from the RFM model is to use quartiles. We assign a score from 1 to 4 to Recency, Frequency and Monetary; four is the best/highest value and one is the lowest/worst. A final RFM score is calculated simply by combining the individual RFM scores.
# RFM Quantiles
quantiles = rfm_table.quantile(q=[0.25, 0.5, 0.75])
# Let's convert quartile information into a dictionary so that cutoffs can be picked up.
quantiles = quantiles.to_dict()
quantiles
{'CustomerID': {0.25: 13813.25, 0.5: 15299.5, 0.75: 16778.75}, 'Recency': {0.25: 17.0, 0.5: 50.0, 0.75: 141.75}, 'Frequency': {0.25: 1.0, 0.5: 2.0, 0.75: 5.0}, 'Monetary': {0.25: 306.48249999999996, 0.5: 668.57, 0.75: 1660.5974999999999}}
We will create two scoring functions, since high recency (many days since the last purchase) is bad, while high frequency and monetary values are good.
# Arguments (x = value, p = recency, monetary_value, frequency, d = quantiles dict)
def RScore(x, p, d):
if x <= d[p][0.25]:
return 4
elif x <= d[p][0.50]:
return 3
elif x <= d[p][0.75]:
return 2
else:
return 1
# Arguments (x = value, p = recency, monetary_value, frequency, d = quantiles dict)
def FMScore(x, p, d):
if x <= d[p][0.25]:
return 1
elif x <= d[p][0.50]:
return 2
elif x <= d[p][0.75]:
return 3
else:
return 4
rfm_segment = rfm_table.copy()
rfm_segment['R_Quartile'] = rfm_segment['Recency'].apply(
RScore, args=('Recency', quantiles,))
rfm_segment['F_Quartile'] = rfm_segment['Frequency'].apply(
FMScore, args=('Frequency', quantiles,))
rfm_segment['M_Quartile'] = rfm_segment['Monetary'].apply(
FMScore, args=('Monetary', quantiles,))
rfm_segment.head()
CustomerID | Recency | Frequency | Monetary | R_Quartile | F_Quartile | M_Quartile | |
---|---|---|---|---|---|---|---|
0 | 12346 | 325 | 1 | 77183.60 | 1 | 1 | 4 |
1 | 12347 | 2 | 7 | 4310.00 | 4 | 4 | 4 |
2 | 12348 | 75 | 4 | 1797.24 | 2 | 3 | 4 |
3 | 12349 | 18 | 1 | 1757.55 | 3 | 1 | 4 |
4 | 12350 | 310 | 1 | 334.40 | 1 | 1 | 2 |
rfm_segment['RFM'] = rfm_segment.R_Quartile.map(
str) + rfm_segment.F_Quartile.map(str) + rfm_segment.M_Quartile.map(str)
rfm_segment['Score'] = rfm_segment[[
'R_Quartile', 'F_Quartile', 'M_Quartile']].sum(axis=1)
rfm_segment.head()
CustomerID | Recency | Frequency | Monetary | R_Quartile | F_Quartile | M_Quartile | RFM | Score | |
---|---|---|---|---|---|---|---|---|---|
0 | 12346 | 325 | 1 | 77183.60 | 1 | 1 | 4 | 114 | 6 |
1 | 12347 | 2 | 7 | 4310.00 | 4 | 4 | 4 | 444 | 12 |
2 | 12348 | 75 | 4 | 1797.24 | 2 | 3 | 4 | 234 | 9 |
3 | 12349 | 18 | 1 | 1757.55 | 3 | 1 | 4 | 314 | 8 |
4 | 12350 | 310 | 1 | 334.40 | 1 | 1 | 2 | 112 | 4 |
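As a side note, the same quartile scores can be built more compactly with pandas' `qcut`. Below is a minimal sketch (not the approach used above): ties in Frequency are broken by ranking first so that valid quartile edges exist, which means scores right at the quartile boundaries may differ slightly from the hand-rolled functions.
# Alternative sketch: quartile scoring with pd.qcut
alt = rfm_table.copy()
# Recency: fewer days since the last purchase is better, so reverse the labels
alt['R_Quartile'] = pd.qcut(alt['Recency'], q=4, labels=[4, 3, 2, 1]).astype(int)
# Frequency is heavily tied (many one-time buyers); rank first to obtain distinct quartile edges
alt['F_Quartile'] = pd.qcut(alt['Frequency'].rank(method='first'), q=4, labels=[1, 2, 3, 4]).astype(int)
alt['M_Quartile'] = pd.qcut(alt['Monetary'], q=4, labels=[1, 2, 3, 4]).astype(int)
alt['RFM'] = (alt['R_Quartile'].astype(str) + alt['F_Quartile'].astype(str)
              + alt['M_Quartile'].astype(str))
alt['Score'] = alt[['R_Quartile', 'F_Quartile', 'M_Quartile']].sum(axis=1)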
Let's define customer segments to the best of our knowledge based on the RFM code, and assign one to each customer.
# Create a dictionary mapping each segment to its RFM code
segment_dict = {
    # Highest frequency and monetary value, most recent purchases
    'Best Customers': '444',
    # High frequency and monetary value with good recency
    'Loyal Customers': '344',
    # High monetary value with good recency and frequency values
    'Big Spenders': '334',
    # Customers who used to shop a lot but are shopping less often now
    'Almost Lost': '244',
    # Customers who shopped long ago but used to shop a lot
    'Lost Customers': '144',
    # Customers who recently started shopping often but with lower monetary value
    'Recent Customers': '443',
    # Customers who shopped long ago, infrequently, and spent little
    'Lost Cheap Customers': '122'
}
# Swap the key and value of dictionary
dict_segment = dict(zip(segment_dict.values(), segment_dict.keys()))
# Allocate segments to each customer as per the RFM score mapping
rfm_segment['Segment'] = rfm_segment.RFM.map(lambda x: dict_segment.get(x))
# Allocate all remaining customers to others segment category
rfm_segment.Segment.fillna('others', inplace=True)
rfm_segment.sample(10)
CustomerID | Recency | Frequency | Monetary | R_Quartile | F_Quartile | M_Quartile | RFM | Score | Segment | |
---|---|---|---|---|---|---|---|---|---|---|
2643 | 15948 | 8 | 2 | 955.24 | 4 | 2 | 3 | 423 | 9 | others |
3856 | 17628 | 81 | 1 | 75.75 | 2 | 1 | 1 | 211 | 4 | others |
3572 | 17232 | 2 | 2 | 417.77 | 4 | 2 | 2 | 422 | 8 | others |
1083 | 13812 | 42 | 3 | 539.00 | 3 | 3 | 2 | 332 | 8 | others |
3867 | 17643 | 373 | 1 | 101.55 | 1 | 1 | 1 | 111 | 3 | others |
12 | 12359 | 57 | 4 | 6310.03 | 2 | 3 | 4 | 234 | 9 | others |
2138 | 15258 | 168 | 2 | 623.16 | 1 | 2 | 2 | 122 | 5 | Lost Cheap Customers |
259 | 12664 | 8 | 9 | 4881.88 | 4 | 4 | 4 | 444 | 12 | Best Customers |
699 | 13273 | 113 | 1 | 138.98 | 2 | 1 | 1 | 211 | 4 | others |
292 | 12707 | 291 | 1 | 603.42 | 1 | 1 | 2 | 112 | 4 | others |
# Best Customers, whose recency, frequency and monetary attributes are all highest
rfm_segment[rfm_segment.RFM == '444'].sort_values(
'Monetary', ascending=False).head()
CustomerID | Recency | Frequency | Monetary | R_Quartile | F_Quartile | M_Quartile | RFM | Score | Segment | |
---|---|---|---|---|---|---|---|---|---|---|
1689 | 14646 | 1 | 73 | 280206.02 | 4 | 4 | 4 | 444 | 12 | Best Customers |
4201 | 18102 | 0 | 60 | 259657.30 | 4 | 4 | 4 | 444 | 12 | Best Customers |
3728 | 17450 | 8 | 46 | 194390.79 | 4 | 4 | 4 | 444 | 12 | Best Customers |
1879 | 14911 | 1 | 201 | 143711.17 | 4 | 4 | 4 | 444 | 12 | Best Customers |
1333 | 14156 | 9 | 55 | 117210.08 | 4 | 4 | 4 | 444 | 12 | Best Customers |
# Biggest spenders
rfm_segment[rfm_segment.RFM == '334'].sort_values(
'Monetary', ascending=False).head()
CustomerID | Recency | Frequency | Monetary | R_Quartile | F_Quartile | M_Quartile | RFM | Score | Segment | |
---|---|---|---|---|---|---|---|---|---|---|
152 | 12536 | 43 | 3 | 12601.83 | 3 | 3 | 4 | 334 | 10 | Big Spenders |
2773 | 16126 | 29 | 4 | 6287.77 | 3 | 3 | 4 | 334 | 10 | Big Spenders |
729 | 13316 | 37 | 4 | 5732.93 | 3 | 3 | 4 | 334 | 10 | Big Spenders |
154 | 12539 | 22 | 4 | 5568.35 | 3 | 3 | 4 | 334 | 10 | Big Spenders |
2899 | 16303 | 25 | 3 | 5360.63 | 3 | 3 | 4 | 334 | 10 | Big Spenders |
# Almost Lost: recency score has dropped, but frequency and monetary value were high, so these are the customers to work hardest to retain
rfm_segment[rfm_segment.RFM == '244'].sort_values(
'Monetary', ascending=False).head()
CustomerID | Recency | Frequency | Monetary | R_Quartile | F_Quartile | M_Quartile | RFM | Score | Segment | |
---|---|---|---|---|---|---|---|---|---|---|
324 | 12744 | 51 | 7 | 21279.29 | 2 | 4 | 4 | 244 | 10 | Almost Lost |
459 | 12939 | 64 | 8 | 11581.80 | 2 | 4 | 4 | 244 | 10 | Almost Lost |
2814 | 16180 | 100 | 8 | 10254.18 | 2 | 4 | 4 | 244 | 10 | Almost Lost |
1903 | 14952 | 59 | 11 | 8099.49 | 2 | 4 | 4 | 244 | 10 | Almost Lost |
3222 | 16745 | 86 | 17 | 7180.70 | 2 | 4 | 4 | 244 | 10 | Almost Lost |
# Lost Cheap Customers, who don't warrant much reacquisition spend: recency, frequency and monetary values are all low
rfm_segment[rfm_segment.RFM == '122'].sort_values(
'Monetary', ascending=False).head()
CustomerID | Recency | Frequency | Monetary | R_Quartile | F_Quartile | M_Quartile | RFM | Score | Segment | |
---|---|---|---|---|---|---|---|---|---|---|
3487 | 17105 | 159 | 2 | 665.78 | 1 | 2 | 2 | 122 | 5 | Lost Cheap Customers |
4216 | 18121 | 149 | 2 | 653.30 | 1 | 2 | 2 | 122 | 5 | Lost Cheap Customers |
1621 | 14548 | 150 | 2 | 652.80 | 1 | 2 | 2 | 122 | 5 | Lost Cheap Customers |
2229 | 15384 | 169 | 2 | 642.41 | 1 | 2 | 2 | 122 | 5 | Lost Cheap Customers |
4286 | 18218 | 211 | 2 | 641.92 | 1 | 2 | 2 | 122 | 5 | Lost Cheap Customers |
# Loyal Customers, whose purchase frequency is high
rfm_segment[rfm_segment.RFM == '344'].sort_values(
'Monetary', ascending=False).head()
CustomerID | Recency | Frequency | Monetary | R_Quartile | F_Quartile | M_Quartile | RFM | Score | Segment | |
---|---|---|---|---|---|---|---|---|---|---|
55 | 12415 | 24 | 21 | 124914.53 | 3 | 4 | 4 | 344 | 11 | Loyal Customers |
2702 | 16029 | 38 | 63 | 80850.84 | 3 | 4 | 4 | 344 | 11 | Loyal Customers |
453 | 12931 | 21 | 15 | 42055.96 | 3 | 4 | 4 | 344 | 11 | Loyal Customers |
1713 | 14680 | 25 | 16 | 28754.11 | 3 | 4 | 4 | 344 | 11 | Loyal Customers |
330 | 12753 | 22 | 6 | 21429.39 | 3 | 4 | 4 | 344 | 11 | Loyal Customers |
Create clusters using k means clustering algorithm.
a. Prepare the data for the algorithm.
i. If the data is asymmetrically distributed, manage the skewness with
appropriate transformation.
ii. Standardize / scale the data.
b. Decide the optimum number of clusters to be formed
c. Analyse these clusters and comment on the results.
# Function to Check Skewness
def check_skew(df_skew, column, color):
skew = stats.skew(df_skew[column])
skewtest = stats.skewtest(df_skew[column])
plt.title('Distribution of ' + column)
sns.distplot(df_skew[column], color=color)
print("{}'s: Skew: {}, : {}".format(column, skew, skewtest))
return
# Check Skewness
plt.figure(figsize=(12, 12))
plt.subplot(3, 1, 1)
check_skew(rfm_table, 'Recency', "dodgerblue")
plt.subplot(3, 1, 2)
check_skew(rfm_table, 'Frequency', "deeppink")
plt.subplot(3, 1, 3)
check_skew(rfm_table, 'Monetary', "gold")
plt.tight_layout()
Recency's: Skew: 1.2453948317057284, : SkewtestResult(statistic=26.60351236555474, pvalue=6.181721752536432e-156) Frequency's: Skew: 12.062857869870964, : SkewtestResult(statistic=74.62743613377035, pvalue=0.0) Monetary's: Skew: 19.332680144099353, : SkewtestResult(statistic=85.01187149828888, pvalue=0.0)
# Removing Skewness
table_scaled = rfm_table.copy()
RFM_log = np.log(table_scaled+1)
plt.figure(figsize=(12, 12))
plt.subplot(3, 1, 1)
check_skew(RFM_log, 'Recency', "dodgerblue")
plt.subplot(3, 1, 2)
check_skew(RFM_log, 'Frequency', "deeppink")
plt.subplot(3, 1, 3)
check_skew(RFM_log, 'Monetary', "gold")
plt.tight_layout()
Recency's: Skew: -0.4670749364137121, : SkewtestResult(statistic=-11.982131984493975, pvalue=4.408387035293672e-33) Frequency's: Skew: 1.2082335351584435, : SkewtestResult(statistic=26.04793003945421, pvalue=1.419991761586644e-149) Monetary's: Skew: 0.3964614244871878, : SkewtestResult(statistic=10.299963600725635, pvalue=7.048796791830502e-25)
rfm_table.describe()
CustomerID | Recency | Frequency | Monetary | |
---|---|---|---|---|
count | 4338.000000 | 4338.000000 | 4338.000000 | 4338.000000 |
mean | 15300.408022 | 92.059474 | 4.272015 | 2048.688081 |
std | 1721.808492 | 100.012264 | 7.697998 | 8985.230220 |
min | 12346.000000 | 0.000000 | 1.000000 | 3.750000 |
25% | 13813.250000 | 17.000000 | 1.000000 | 306.482500 |
50% | 15299.500000 | 50.000000 | 2.000000 | 668.570000 |
75% | 16778.750000 | 141.750000 | 5.000000 | 1660.597500 |
max | 18287.000000 | 373.000000 | 209.000000 | 280206.020000 |
The Winsorized mean is a method of averaging that first replaces the smallest and largest values with the observations closest to them. This is done to limit the effect of outliers and abnormally extreme values on the calculation.
windsoriser = Winsorizer(tail='both', # cap left, right or both tails
fold=2,
variables=['Recency', 'Frequency', 'Monetary']
)
windsoriser.fit(RFM_log)
Winsorizer(fold=2, tail='both', variables=['Recency', 'Frequency', 'Monetary'])
RFM_log = windsoriser.transform(RFM_log)
scaler = StandardScaler()
scaler.fit(RFM_log)
RFM_scaled = scaler.transform(RFM_log)
RFM_scaled = pd.DataFrame(RFM_scaled, columns=RFM_log.columns)
RFM_scaled.head()
CustomerID | Recency | Frequency | Monetary | |
---|---|---|---|---|
0 | -1.833336 | 1.463293 | -1.016221 | 2.146328 |
1 | -1.832623 | -2.020149 | 1.218827 | 1.522186 |
2 | -1.831909 | 0.381349 | 0.461066 | 0.781226 |
3 | -1.831196 | -0.648681 | -1.016221 | 0.762313 |
4 | -1.830483 | 1.428294 | -1.016221 | -0.641811 |
The number of clusters (k) is the most important hyperparameter in K-Means clustering. If we already know beforehand how many clusters to group the data into, there is no need to tune k.
If the optimal value of k is not known, there are various methods to find it. Here we will explore two such methods:
The Elbow Method is an empirical method for finding the optimal number of clusters for a dataset. We pick a range of candidate values of k, apply K-Means for each, compute the average distance of each point to its cluster centroid, and plot it against k. We then pick the value of k where the average distance drops off suddenly (the 'elbow').
The Silhouette Method is another way to find the optimal number of clusters and to interpret and validate the consistency within clusters. It computes a silhouette coefficient for each point, measuring how similar that point is to its own cluster compared to other clusters, and provides a succinct graphical representation of how well each object has been classified.
The silhouette value measures how similar an object is to its own cluster (cohesion) compared to other clusters (separation). It ranges over [-1, 1], where a high value indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. If most objects have a high value, the clustering configuration is appropriate; if many points have a low or negative value, the configuration may have too many or too few clusters.
from scipy.spatial.distance import cdist
distortions = []
inertias = []
mapping1 = {}
mapping2 = {}
K = range(2, 12)
for k in K:
    # Build and fit the model
    kmeanModel = KMeans(n_clusters=k, random_state=1).fit(RFM_scaled)
    # Distortion: mean distance of each point to its nearest cluster centre
    distortions.append(sum(np.min(cdist(RFM_scaled, kmeanModel.cluster_centers_,
                                        'euclidean'), axis=1)) / RFM_scaled.shape[0])
    # Inertia: sum of squared distances to the nearest cluster centre
    inertias.append(kmeanModel.inertia_)
    mapping1[k] = distortions[-1]
    mapping2[k] = inertias[-1]
plt.figure(figsize=(12, 12))
plt.plot(K, distortions, 'bx-')
plt.xlabel('Values of K')
plt.ylabel('Distortion')
plt.title('The Elbow Method using Distortion')
plt.show()
plt.figure(figsize=(12, 12))
plt.plot(K, inertias, 'bx-')
plt.xlabel('Values of K')
plt.ylabel('Inertia')
plt.title('The Elbow Method with Inertia')
plt.show()
from sklearn.metrics import silhouette_score
wcss_silhouette = []
for k in K:
km = KMeans(n_clusters=k, random_state=1).fit(RFM_scaled)
preds = km.predict(RFM_scaled)
silhouette = silhouette_score(RFM_scaled, preds)
wcss_silhouette.append(silhouette)
print("Silhouette score for number of cluster(s) {}: {}".format(k, silhouette))
plt.figure(figsize=(12, 12))
plt.title("The silhouette coefficient method \nfor determining number of clusters\n", fontsize=16)
plt.scatter(x=[i for i in range(2, 12)],
y=wcss_silhouette, s=150, edgecolor='k')
plt.grid(True)
plt.xlabel("Number of clusters", fontsize=14)
plt.ylabel("Silhouette score", fontsize=15)
plt.xticks([i for i in range(2, 12)], fontsize=14)
plt.yticks(fontsize=15)
plt.show()
Silhouette score for number of cluster(s) 2: 0.34305376691087064 Silhouette score for number of cluster(s) 3: 0.28105352172658254 Silhouette score for number of cluster(s) 4: 0.25648293481928397 Silhouette score for number of cluster(s) 5: 0.25293679864881496 Silhouette score for number of cluster(s) 6: 0.2569378664274715 Silhouette score for number of cluster(s) 7: 0.24420527452086455 Silhouette score for number of cluster(s) 8: 0.24235651303445258 Silhouette score for number of cluster(s) 9: 0.2373907626700457 Silhouette score for number of cluster(s) 10: 0.23511719757718036 Silhouette score for number of cluster(s) 11: 0.22863163520649685
def kmeans(normalised_df_rfm, clusters_number, original_df_rfm):
kmeans = KMeans(n_clusters=clusters_number, random_state=1)
kmeans.fit(normalised_df_rfm)
# Extract cluster labels
cluster_labels = kmeans.labels_
# Create a cluster label column in original dataset
df_new = original_df_rfm.assign(Cluster=cluster_labels)
# Initialise TSNE
model = TSNE(random_state=1)
transformed = model.fit_transform(df_new)
# Plot t-SNE
plt.title('Flattened Graph of {} Clusters'.format(clusters_number))
sns.scatterplot(x=transformed[:, 0], y=transformed[:, 1],
hue=cluster_labels, style=cluster_labels, palette="Set1")
return df_new
plt.figure(figsize=(12, 12))
plt.subplot(3, 1, 1)
df_rfm_k3 = kmeans(RFM_scaled, 3, rfm_table)
plt.subplot(3, 1, 2)
df_rfm_k4 = kmeans(RFM_scaled, 4, rfm_table)
plt.subplot(3, 1, 3)
df_rfm_k5 = kmeans(RFM_scaled, 5, rfm_table)
plt.tight_layout()
To plot this we need the normalized data with all the attributes in a single column. We will use pandas `melt` on the normalized RFM dataframe to achieve that.
def snake_plot(normalised_df_rfm, df_rfm_kmeans, df_rfm_original):
normalised_df_rfm = pd.DataFrame(normalised_df_rfm,
index=df_rfm_original.index,
columns=df_rfm_original.columns)
normalised_df_rfm['Cluster'] = df_rfm_kmeans['Cluster']
# Melt data into long format
df_melt = pd.melt(normalised_df_rfm.reset_index(),
id_vars=['CustomerID', 'Cluster'],
value_vars=['Recency', 'Frequency', 'Monetary'],
var_name='Metric',
value_name='Value')
plt.xlabel('Metric')
plt.ylabel('Value')
sns.pointplot(data=df_melt, x='Metric', y='Value',
hue='Cluster', palette="Set1")
return
plt.figure(figsize=(12, 12))
plt.subplot(3, 1, 1)
snake_3 = snake_plot(RFM_scaled, df_rfm_k3, rfm_table)
plt.subplot(3, 1, 2)
snake_4 = snake_plot(RFM_scaled, df_rfm_k4, rfm_table)
plt.subplot(3, 1, 3)
snake_5 = snake_plot(RFM_scaled, df_rfm_k5, rfm_table)
plt.tight_layout()
kmeans = KMeans(n_clusters=4, random_state=1)
kmeans.fit(RFM_scaled)
cluster_labels = kmeans.labels_
kmeans
KMeans(n_clusters=4, random_state=1)
# Assign the clusters as column to each customer
Cluster_table = rfm_segment.assign(Cluster=cluster_labels)
# Save RFM Clusters to csv
Cluster_table.to_csv('clusters.csv')
# Check counts of records assigned to different clusters
Cluster_table.Cluster.value_counts()
0 1234 2 1153 3 1033 1 918 Name: Cluster, dtype: int64
The following scatter plots help us visualize the division of customers into different segments based on the RFM attributes.
# Plot two-dimensional scatter plots for each pair of RFM attributes (skip the CustomerID column)
features = ['Recency', 'Frequency', 'Monetary']
feature_idx = [RFM_scaled.columns.get_loc(c) for c in features]
X = RFM_scaled[features].values
centers = kmeans.cluster_centers_[:, feature_idx]
count = X.shape[1]
for i in range(0, count):
    for j in range(i+1, count):
        plt.figure(figsize=(12, 12))
        plt.scatter(X[cluster_labels == 0, i], X[cluster_labels == 0, j],
                    s=10, c='red', label='Cluster0')
        plt.scatter(X[cluster_labels == 1, i], X[cluster_labels == 1, j],
                    s=10, c='blue', label='Cluster1')
        plt.scatter(X[cluster_labels == 2, i], X[cluster_labels == 2, j],
                    s=10, c='green', label='Cluster2')
        plt.scatter(X[cluster_labels == 3, i], X[cluster_labels == 3, j],
                    s=10, c='purple', label='Cluster3')
        plt.scatter(centers[:, i], centers[:, j], s=50, c='black', label='Centroids')
        plt.xlabel(features[i])
        plt.ylabel(features[j])
        plt.legend()
        plt.show()
Most of the customers belong to clusters 0 and 2, although the distribution is fairly even.
Cluster_table.sample(10)
CustomerID | Recency | Frequency | Monetary | R_Quartile | F_Quartile | M_Quartile | RFM | Score | Segment | Cluster | |
---|---|---|---|---|---|---|---|---|---|---|---|
2101 | 15214 | 1 | 8 | 1661.44 | 4 | 4 | 4 | 444 | 12 | Best Customers | 1 |
2967 | 16392 | 269 | 1 | 217.89 | 1 | 1 | 1 | 111 | 3 | others | 2 |
374 | 12822 | 70 | 2 | 948.88 | 2 | 2 | 3 | 223 | 7 | others | 0 |
728 | 13314 | 1 | 3 | 775.94 | 4 | 3 | 3 | 433 | 10 | others | 1 |
2934 | 16350 | 21 | 4 | 1116.47 | 3 | 3 | 3 | 333 | 9 | others | 3 |
3731 | 17454 | 192 | 4 | 517.53 | 1 | 3 | 2 | 132 | 6 | others | 2 |
1133 | 13880 | 21 | 13 | 3219.77 | 3 | 4 | 4 | 344 | 11 | Loyal Customers | 1 |
2720 | 16054 | 145 | 1 | 783.90 | 1 | 1 | 3 | 113 | 5 | others | 2 |
3312 | 16858 | 368 | 1 | 375.69 | 1 | 1 | 2 | 112 | 4 | others | 2 |
1250 | 14044 | 26 | 4 | 646.42 | 3 | 3 | 2 | 332 | 8 | others | 0 |
The RFM score is low to mid for customers in clusters 0 and 2. Comparatively, customers in clusters 1 and 3 have high RFM scores along with above-average recency and frequency values.
Let's check out the customers in each cluster more closely...
Cluster_table[Cluster_table.Cluster == 3].sample(5)
CustomerID | Recency | Frequency | Monetary | R_Quartile | F_Quartile | M_Quartile | RFM | Score | Segment | Cluster | |
---|---|---|---|---|---|---|---|---|---|---|---|
3669 | 17373 | 40 | 4 | 646.92 | 3 | 3 | 2 | 332 | 8 | others | 3 |
4314 | 18251 | 87 | 1 | 4314.72 | 2 | 1 | 4 | 214 | 7 | others | 3 |
2547 | 15809 | 36 | 5 | 515.77 | 3 | 3 | 2 | 332 | 8 | others | 3 |
3475 | 17086 | 7 | 6 | 2050.08 | 4 | 4 | 4 | 444 | 12 | Best Customers | 3 |
3966 | 17770 | 198 | 5 | 1143.27 | 1 | 3 | 3 | 133 | 7 | others | 3 |
The RFM scores for Cluster 3 customers are collectively around average and above. These are more or less the customers who shop occasionally and may respond to targeted campaigns.
Cluster_table[Cluster_table.Cluster == 2].sample(5)
CustomerID | Recency | Frequency | Monetary | R_Quartile | F_Quartile | M_Quartile | RFM | Score | Segment | Cluster | |
---|---|---|---|---|---|---|---|---|---|---|---|
4076 | 17926 | 133 | 2 | 397.29 | 2 | 2 | 2 | 222 | 6 | others | 2 |
3494 | 17117 | 288 | 1 | 116.20 | 1 | 1 | 1 | 111 | 3 | others | 2 |
2510 | 15758 | 24 | 1 | 205.25 | 3 | 1 | 1 | 311 | 5 | others | 2 |
3684 | 17391 | 163 | 2 | 508.80 | 1 | 2 | 2 | 122 | 5 | Lost Cheap Customers | 2 |
2859 | 16239 | 56 | 2 | 414.20 | 2 | 2 | 2 | 222 | 6 | others | 2 |
Cluster 2 contains customers with relatively low recency, frequency and monetary value. Further study and segmentation is needed for this cluster.
Cluster_table[Cluster_table.Cluster == 1].sample(5)
CustomerID | Recency | Frequency | Monetary | R_Quartile | F_Quartile | M_Quartile | RFM | Score | Segment | Cluster | |
---|---|---|---|---|---|---|---|---|---|---|---|
1583 | 14502 | 19 | 11 | 2429.83 | 3 | 4 | 4 | 344 | 11 | Loyal Customers | 1 |
714 | 13297 | 7 | 4 | 2089.85 | 4 | 3 | 4 | 434 | 11 | others | 1 |
600 | 13138 | 22 | 6 | 962.39 | 3 | 4 | 3 | 343 | 10 | others | 1 |
3819 | 17576 | 8 | 19 | 3564.83 | 4 | 4 | 4 | 444 | 12 | Best Customers | 1 |
1734 | 14708 | 2 | 3 | 1126.37 | 4 | 3 | 3 | 433 | 10 | others | 1 |
Cluster 1 is the customer segment with very high monetary value along with good frequency and recency values. These are the most valuable customers to the firm and should be looked after periodically to address their concerns.
Cluster_table[Cluster_table.Cluster == 0].sample(5)
CustomerID | Recency | Frequency | Monetary | R_Quartile | F_Quartile | M_Quartile | RFM | Score | Segment | Cluster | |
---|---|---|---|---|---|---|---|---|---|---|---|
1397 | 14242 | 234 | 2 | 280.55 | 1 | 2 | 1 | 121 | 4 | others | 0 |
995 | 13692 | 24 | 3 | 1488.32 | 3 | 3 | 3 | 333 | 9 | others | 0 |
135 | 12516 | 77 | 2 | 1312.06 | 2 | 2 | 3 | 223 | 7 | others | 0 |
1556 | 14470 | 11 | 2 | 461.19 | 4 | 2 | 2 | 422 | 8 | others | 0 |
734 | 13321 | 72 | 2 | 567.36 | 2 | 2 | 2 | 222 | 6 | others | 0 |
Finally, cluster 0 contains a large number of customers who account for low-to-midrange value for the firm. Most of them are in the lost segment or on the verge of churning. More than anything, we need to differentiate clusters 0 and 2 and refine a new marketing strategy for each.
So the differing behaviour of customers in the clusters is evident from the snake plot and scatter plots above.
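As a quick sanity check on these descriptions, we can also summarize each cluster directly; below is a minimal sketch using the `Cluster_table` built above (column names as defined there).
# Average RFM values, average score, and customer count per cluster
cluster_summary = Cluster_table.groupby('Cluster').agg(
    Recency=('Recency', 'mean'),
    Frequency=('Frequency', 'mean'),
    Monetary=('Monetary', 'mean'),
    Score=('Score', 'mean'),
    Customers=('CustomerID', 'count')).round(1)
display(cluster_summary)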
We will use a heat map to visualize the relative importance of each attribute in the four clusters (customer segments). The importance score is calculated by dividing each cluster's average by the population average and subtracting 1 (so that 0 is returned when the cluster average equals the population average).
The farther a ratio is from 0, the more important that attribute is for a segment relative to the total population.
# Assign Cluster labels to RFM table
rfm_table_cluster = rfm_table.assign(Cluster=cluster_labels)
# Average attributes for each cluster
cluster_avg = rfm_table_cluster.groupby(['Cluster']).mean()
# Calculate the population average
population_avg = rfm_table.mean()
# Calculate the relative importance of attributes: cluster average / population average - 1
relative_imp = cluster_avg / population_avg - 1
plt.figure(figsize=(12, 12))
plt.title('Relative importance of attributes')
sns.heatmap(data=relative_imp, annot=True, fmt='.2f', cmap='RdYlGn')
plt.show()
Create a dashboard in Tableau by choosing appropriate chart types and metrics useful for the business. The dashboard must entail the following:
a) Country-wise analysis to demonstrate average spend. Use a bar chart to show monthly figures.
b) Bar graph of the top 15 products most ordered by users, showing the number of products sold.
c) Bar graph showing the count of orders vs. hours throughout the day. What are the peak hours per your chart?
d) Plot the distribution of RFM values using histograms and frequency charts.
e) Plot error (cost) vs. the number of clusters selected.
f) Visualize and compare the RFM values of the clusters using a heatmap.
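The dashboard itself is built in Tableau, but the source extracts can be prepared in this notebook. Below is a minimal sketch, assuming the cleaned `df` from above (with its `Date` and `Revenue` columns); the output file names are just placeholders.
# Sketch: pre-compute extracts for the Tableau dashboard
# (c) Orders per hour of the day (unique invoices)
orders_by_hour = (df.assign(Hour=df['InvoiceDate'].dt.hour)
                    .groupby('Hour')['InvoiceNo'].nunique()
                    .rename('Orders'))
# (a) Country-wise average spend per invoice, by month
invoice_revenue = df.groupby(['Country', 'Date', 'InvoiceNo'])['Revenue'].sum().reset_index()
avg_spend = (invoice_revenue.groupby(['Country', 'Date'])['Revenue']
             .mean().rename('AvgSpend'))
# Write the extracts for Tableau (placeholder file names)
orders_by_hour.to_csv('orders_by_hour.csv')
avg_spend.to_csv('avg_spend_by_country_month.csv')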