Market Segmentation - PCA & K-Means Clustering

Author

Collin Real

Published

July 26, 2024

Import Libraries

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import KMeans
# Do not truncate wide DataFrames: show every column when a frame is displayed.
pd.set_option('display.max_columns', None)

# Apply seaborn's default theme to the matplotlib figures created below.
sns.set()

Import Data

# Load the customer table; the relative path is resolved against the working directory.
df_customers = pd.read_csv('customers.csv')
# Bare expression so the notebook renders the frame as the cell output.
df_customers
ID Sex Marital status Age Education Income Occupation Settlement size
0 100000001 0 0 67 2 124670 1 2
1 100000002 1 1 22 1 150773 1 2
2 100000003 0 0 49 1 89210 0 0
3 100000004 0 0 45 1 171565 1 1
4 100000005 0 0 53 1 149031 1 1
... ... ... ... ... ... ... ... ...
1995 100001996 1 0 47 1 123525 0 0
1996 100001997 1 1 27 1 117744 1 0
1997 100001998 0 0 31 0 86400 0 0
1998 100001999 1 1 24 1 97968 0 0
1999 100002000 0 0 25 0 68416 0 0

2000 rows × 8 columns

Descriptive Statistics

# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
df_customers.describe()
ID Sex Marital status Age Education Income Occupation Settlement size
count 2.000000e+03 2000.000000 2000.000000 2000.000000 2000.00000 2000.000000 2000.000000 2000.000000
mean 1.000010e+08 0.457000 0.496500 35.909000 1.03800 120954.419000 0.810500 0.739000
std 5.774946e+02 0.498272 0.500113 11.719402 0.59978 38108.824679 0.638587 0.812533
min 1.000000e+08 0.000000 0.000000 18.000000 0.00000 35832.000000 0.000000 0.000000
25% 1.000005e+08 0.000000 0.000000 27.000000 1.00000 97663.250000 0.000000 0.000000
50% 1.000010e+08 0.000000 0.000000 33.000000 1.00000 115548.500000 1.000000 1.000000
75% 1.000015e+08 1.000000 1.000000 42.000000 1.00000 138072.250000 1.000000 1.000000
max 1.000020e+08 1.000000 1.000000 76.000000 3.00000 309364.000000 2.000000 2.000000

Variable correlation

# Pairwise correlation between all columns (pandas' default method).
df_customers.corr()
ID Sex Marital status Age Education Income Occupation Settlement size
ID 1.000000 0.328262 0.074403 -0.085246 0.012543 -0.303217 -0.291958 -0.378445
Sex 0.328262 1.000000 0.566511 -0.182885 0.244838 -0.195146 -0.202491 -0.300803
Marital status 0.074403 0.566511 1.000000 -0.213178 0.374017 -0.073528 -0.029490 -0.097041
Age -0.085246 -0.182885 -0.213178 1.000000 0.654605 0.340610 0.108388 0.119751
Education 0.012543 0.244838 0.374017 0.654605 1.000000 0.233459 0.064524 0.034732
Income -0.303217 -0.195146 -0.073528 0.340610 0.233459 1.000000 0.680357 0.490881
Occupation -0.291958 -0.202491 -0.029490 0.108388 0.064524 0.680357 1.000000 0.571795
Settlement size -0.378445 -0.300803 -0.097041 0.119751 0.034732 0.490881 0.571795 1.000000

Correlation - Heat Map

# Draw the correlation matrix as an annotated heat map.
plt.figure(figsize=(12, 9))
# vmin/vmax pin the colour scale to [-1, 1] so that 0 sits at the centre of the
# diverging 'RdBu' colormap.
s = sns.heatmap(df_customers.corr(),
                annot=True,
                cmap='RdBu',
                vmin=-1,
                vmax=1)
# Rotate the column names on the x axis by 90 degrees.
s.set_xticklabels(s.get_xticklabels(), rotation=90)
plt.title('Correlation Matrix')
plt.show()

Scatter Plot - Age vs. Income

# Raw scatter of Age against Income, before any clustering.
plt.figure(figsize=(12, 9))
# Columns are selected by name rather than by position: with 'ID' as column 0,
# positions 2 and 4 are 'Marital status' and 'Education', not Age and Income.
plt.scatter(df_customers['Age'], df_customers['Income'])
plt.xlabel('Age')
plt.ylabel('Income')
Text(0, 0.5, 'Income')

Standardize the DataFrame

# Standardize the columns so that variables measured on very different scales
# (e.g. Income vs. Sex) are comparable in the distance-based steps that follow.
#
# NOTE(review): the whole frame is passed in, so the 'ID' column is scaled and
# subsequently clustered on as if it were a customer feature (the PCA loadings
# table later in this notebook shows a non-zero weight for ID). Presumably the
# identifier should be excluded, e.g. via df_customers.drop(columns='ID') -
# confirm. Dropping it also requires adjusting the column labels used when the
# PCA loadings frame is built, and will change every downstream result.
scaler = StandardScaler()
customers_std = scaler.fit_transform(df_customers)
# Bare expression so the notebook shows the resulting NumPy array.
customers_std
array([[-1.731185  , -0.91739884, -0.99302433, ...,  0.09752361,
         0.29682303,  1.552326  ],
       [-1.72945295,  1.09003844,  1.00702467, ...,  0.78265438,
         0.29682303,  1.552326  ],
       [-1.7277209 , -0.91739884, -0.99302433, ..., -0.83320224,
        -1.26952539, -0.90972951],
       ...,
       [ 1.7277209 , -0.91739884, -0.99302433, ..., -0.90695688,
        -1.26952539, -0.90972951],
       [ 1.72945295,  1.09003844,  1.00702467, ..., -0.60332923,
        -1.26952539, -0.90972951],
       [ 1.731185  , -0.91739884, -0.99302433, ..., -1.3789866 ,
        -1.26952539, -0.90972951]])

Hierarchical Clustering

# Agglomerative (hierarchical) clustering of the standardized rows using
# Ward linkage; the result is the linkage matrix consumed by dendrogram().
h_cluster = linkage(customers_std, method='ward')

plt.figure(figsize=(12, 9))
plt.xlabel('Observations')
plt.ylabel('Distance')
# Leaf labels and leaf counts are switched off on the x axis.
dendrogram(h_cluster,
           show_leaf_counts=False,
           no_labels=True)
plt.show()

K-Means Clustering

# Elbow method: fit K-Means for k = 1..10 on the standardized data and keep
# each model's inertia_ (plotted below as 'Sum of squared distance').
results = {}

for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    # fit() returns the fitted estimator, so the inertia can be read off directly.
    results[i] = kmeans.fit(customers_std).inertia_

plt.figure(figsize=(12, 9))
plt.plot(list(results), list(results.values()), marker='o', linestyle='--')
# Two hand-placed orange guide lines drawn over the curve; their coordinates
# are hard-coded for this particular data set.
plt.plot([0.7, 3.2], [12000, 7000], color='#FF8400')
plt.plot([4.5, 9], [5700, 4000], color='#FF8400')
plt.xlabel('Number of Clusters')
plt.ylabel('Sum of squared distance')
plt.show()

K-Means Clustering - 4 Clusters

# Final K-Means model with 4 clusters, fitted on the standardized data.
kmeans = KMeans(n_clusters=4, init='k-means++', random_state=42)
kmeans.fit(customers_std)

# Work on a copy so the original frame stays free of cluster columns.
df_customers_kmeans = df_customers.copy()
# labels_ holds one integer cluster id per input row, in row order.
df_customers_kmeans['Segment'] = kmeans.labels_
df_customers_kmeans
ID Sex Marital status Age Education Income Occupation Settlement size Segment
0 100000001 0 0 67 2 124670 1 2 2
1 100000002 1 1 22 1 150773 1 2 1
2 100000003 0 0 49 1 89210 0 0 3
3 100000004 0 0 45 1 171565 1 1 0
4 100000005 0 0 53 1 149031 1 1 0
... ... ... ... ... ... ... ... ... ...
1995 100001996 1 0 47 1 123525 0 0 3
1996 100001997 1 1 27 1 117744 1 0 1
1997 100001998 0 0 31 0 86400 0 0 3
1998 100001999 1 1 24 1 97968 0 0 1
1999 100002000 0 0 25 0 68416 0 0 3

2000 rows × 9 columns

Characteristics of the people in each cluster

# Mean of every column per cluster, rounded to 3 decimals, used to profile the segments.
df_customers_analysis = df_customers_kmeans.groupby('Segment').mean().round(3)
df_customers_analysis
ID Sex Marital status Age Education Income Occupation Settlement size
Segment
0 1.000007e+08 0.032 0.180 35.637 0.738 140135.807 1.251 1.389
1 1.000011e+08 0.876 0.999 29.003 1.068 105597.536 0.630 0.418
2 1.000009e+08 0.483 0.680 55.881 2.130 155931.141 1.093 1.078
3 1.000014e+08 0.403 0.043 34.690 0.742 94407.322 0.255 0.060
# Number of customers per cluster: 'Sex' is only used as a column to count.
df_customers_analysis['Count'] = df_customers_kmeans[['Segment', 'Sex']].groupby('Segment').count()
# Share of all customers that falls into each cluster (a fraction, not a percentage).
df_customers_analysis['%'] = df_customers_analysis['Count'] / df_customers_analysis['Count'].sum()
# Replace the integer cluster ids in the index with descriptive names.
# NOTE(review): the mapping is hard-coded against the integer ids produced by
# this particular fit. In the means table shown for this run, cluster 2 has the
# highest Age, Education and Income but is named 'standard', and cluster 3 has
# the lowest Income, Occupation and Settlement size but is named
# 'career-focused'. The PCA section of this notebook also names the
# near-identical cluster 0 'standard' rather than 'well-off'. Confirm the
# intended label for each cluster.
df_customers_analysis.rename(index={
    0: 'well-off',
    1: 'fewer-opportunities',
    2: 'standard',
    3: 'career-focused'
}, inplace=True)

df_customers_analysis
ID Sex Marital status Age Education Income Occupation Settlement size Count %
Segment
well-off 1.000007e+08 0.032 0.180 35.637 0.738 140135.807 1.251 1.389 633 0.3165
fewer-opportunities 1.000011e+08 0.876 0.999 29.003 1.068 105597.536 0.630 0.418 679 0.3395
standard 1.000009e+08 0.483 0.680 55.881 2.130 155931.141 1.093 1.078 269 0.1345
career-focused 1.000014e+08 0.403 0.043 34.690 0.742 94407.322 0.255 0.060 419 0.2095
# Bar chart of the number of customers in each segment.
plt.figure(figsize=(8, 6))
# seaborn deprecated passing `palette` without `hue` (removal announced for
# v0.14.0). Mapping the x variable to `hue` and hiding the redundant legend
# keeps one colour per bar without the FutureWarning.
s = sns.barplot(
    data=df_customers_analysis,
    x=df_customers_analysis.index,
    y='Count',
    hue=df_customers_analysis.index,
    palette=['g', 'c', 'r', 'm'],
    legend=False,
)
plt.xlabel('Segment')
plt.ylabel('Population')
plt.show()
/var/folders/s3/h2qfnwzs63b1k1xft79tdnfw0000gn/T/ipykernel_94496/843998559.py:2: FutureWarning:



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

Assign Meaningful Labels to the segments

# Replace the integer cluster ids in the per-customer frame with descriptive names.
# NOTE(review): this dictionary duplicates the one used to rename the analysis
# table and must be kept in sync with it. Both are hard-coded against the ids
# of this particular fit; whether each name matches its cluster's profile
# should be confirmed against the per-segment means.
df_customers_kmeans['Segment'] = df_customers_kmeans['Segment'].map({
    0: 'well-off',
    1: 'fewer-opportunities',
    2: 'standard',
    3: 'career-focused'
})
df_customers_kmeans
ID Sex Marital status Age Education Income Occupation Settlement size Segment
0 100000001 0 0 67 2 124670 1 2 standard
1 100000002 1 1 22 1 150773 1 2 fewer-opportunities
2 100000003 0 0 49 1 89210 0 0 career-focused
3 100000004 0 0 45 1 171565 1 1 well-off
4 100000005 0 0 53 1 149031 1 1 well-off
... ... ... ... ... ... ... ... ... ...
1995 100001996 1 0 47 1 123525 0 0 career-focused
1996 100001997 1 1 27 1 117744 1 0 fewer-opportunities
1997 100001998 0 0 31 0 86400 0 0 career-focused
1998 100001999 1 1 24 1 97968 0 0 fewer-opportunities
1999 100002000 0 0 25 0 68416 0 0 career-focused

2000 rows × 9 columns

Visualize the segmented customers

# Age vs. Income, coloured by segment.
colors = ['g','r','c','m']
# Sets the session-wide default palette; the scatterplot below passes its own
# palette explicitly.
sns.set_palette(sns.color_palette("pastel"))
plt.figure(figsize=(12, 8))
sns.scatterplot(
    x=df_customers_kmeans['Age'],
    y=df_customers_kmeans['Income'],
    hue=df_customers_kmeans['Segment'],
    palette=colors
)
# One very large hollow marker used as a hand-placed highlight ring around
# (Age 62, Income 160000); position and size are hard-coded.
plt.scatter(62, 160000 , s=60000, facecolors='none', edgecolors='#FF8400' ) 
plt.title('Age vs Income in each segment')

plt.show()

Education vs. Income

# Income distribution per Education level, split by segment.
plt.figure(figsize=(16, 8))
sns.violinplot(
    x=df_customers_kmeans['Education'],
    y=df_customers_kmeans['Income'],
    hue=df_customers_kmeans['Segment'],
    palette=['g','r','c','m']
)
plt.title('Education vs Income in each segment')
plt.show()

Improve K-Means with PCA

from sklearn.decomposition import PCA

# Fit PCA with no component limit so the variance explained by every
# component can be inspected before choosing how many to keep.
pca = PCA()
pca.fit(customers_std)

# Fraction of total variance explained by each component.
pca.explained_variance_ratio_
array([0.34103573, 0.23178599, 0.16650585, 0.09955452, 0.06169548,
       0.04785186, 0.03407515, 0.01749541])

Plot the cumulative sum of variability

# Cumulative share of variance explained by the first k principal components.
cumulative_variance = pca.explained_variance_ratio_.cumsum()

plt.figure(figsize=(12, 8))
# The x axis is the NUMBER of components kept, so it starts at 1: the first
# cumulative value is the variance explained by one component, not by zero.
# The range is derived from the data instead of hard-coding the component count.
plt.plot(
    range(1, len(cumulative_variance) + 1),
    cumulative_variance,
    marker='o',
    linestyle='--',
)
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
Text(0, 0.5, 'Cumulative Explained Variance')

Pick 3 Components from the PCA model

# Refit PCA keeping only the first three components.
pca = PCA(n_components=3)
pca.fit(customers_std)

# Loadings table: one row per component, one column per original variable.
# The column labels are taken from df_customers, so they must match the
# columns (and their order) of the data PCA was fitted on.
df_pca_components = pd.DataFrame(
    data=pca.components_.round(4),
    columns=df_customers.columns.values,
    index=['component 1', 'component 2', 'component 3'])

df_pca_components
ID Sex Marital status Age Education Income Occupation Settlement size
component 1 -0.3454 -0.3286 -0.1873 0.2703 0.1045 0.4838 0.4617 0.4543
component 2 0.1072 0.4213 0.4721 0.3553 0.6528 0.1763 0.0614 -0.0308
component 3 0.1435 -0.3180 -0.4854 0.6134 0.2523 -0.1236 -0.3446 -0.2621

Correlation Matrix of the 3 Components

# Heat map of the component loadings table built from pca.components_.
# NOTE(review): the title says 'Correlation Matrix', but the plotted values
# are the rounded entries of pca.components_, not pairwise correlations -
# consider retitling.
s = sns.heatmap(
    df_pca_components,
    vmin=-1,
    vmax=1,
    cmap='RdBu',
    annot=True
)
plt.title('Correlation Matrix')
plt.show()

Implementing K-Means Clustering

# Project the standardized customers onto the three retained components.
pca_scores = pca.transform(customers_std)

# Elbow method on the component scores: inertia for k = 1..10.
results = {}

for i in range(1, 11):
    kmeans_pca = KMeans(n_clusters=i, init='k-means++', random_state=42)
    # No extra scaling step is applied here: pca_scores are projections of the
    # already standardized customers_std.
    results[i] = kmeans_pca.fit(pca_scores).inertia_

plt.figure(figsize=(12, 8))
plt.plot(list(results), list(results.values()), marker='o', linestyle='--')
plt.xlabel('Number of Clusters')
plt.ylabel('Sum of squared distance')
plt.show()

Implementing K-Means Clustering w/ 4 Clusters

# Final K-Means model with 4 clusters, fitted on the PCA scores instead of the
# standardized variables.
kmeans_pca = KMeans(n_clusters=4, init='k-means++', random_state=42)
kmeans_pca.fit(pca_scores)
KMeans(n_clusters=4, random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Combine the original customer columns, the three PCA scores and the cluster
# id assigned by the PCA-based K-Means into one frame.
#
# The score columns are named when their frame is constructed. The previous
# version renamed them afterwards by assigning into `df.columns.values`, which
# writes into the backing array of an Index (an object pandas treats as
# immutable) and can leave label lookups out of sync with the displayed names.
component_names = ['component 1', 'component 2', 'component 3']
df_pca_scores = pd.DataFrame(pca_scores, columns=component_names)

# reset_index(drop=True) aligns both frames on a fresh 0..n-1 index so that
# concat pairs the rows positionally.
df_segm_pca = pd.concat([df_customers.reset_index(drop=True), df_pca_scores], axis=1)
df_segm_pca['K-means PCA'] = kmeans_pca.labels_
# Persist the segmented table next to the notebook.
df_segm_pca.to_csv("customer_segment_pca.csv", encoding='utf-8', index=False)
df_segm_pca
ID Sex Marital status Age Education Income Occupation Settlement size component 1 component 2 component 3 K-means PCA
0 100000001 0 0 67 2 124670 1 2 2.859782 0.936676 2.036586 2
1 100000002 1 1 22 1 150773 1 2 0.944130 0.394492 -2.433785 0
2 100000003 0 0 49 1 89210 0 0 -0.023032 -0.881797 1.974083 3
3 100000004 0 0 45 1 171565 1 1 2.212422 -0.563616 0.635332 0
4 100000005 0 0 53 1 149031 1 1 2.110202 -0.425124 1.127543 0
... ... ... ... ... ... ... ... ... ... ... ... ...
1995 100001996 1 0 47 1 123525 0 0 -1.485348 0.432286 1.615196 3
1996 100001997 1 1 27 1 117744 1 0 -1.672129 0.839600 -0.923547 1
1997 100001998 0 0 31 0 86400 0 0 -1.841798 -2.158681 1.116012 3
1998 100001999 1 1 24 1 97968 0 0 -2.716832 0.561390 -0.476253 1
1999 100002000 0 0 25 0 68416 0 0 -2.209795 -2.423450 0.860709 3

2000 rows × 12 columns

Analyze Segmentation Results

# Mean of every column (original variables and component scores) per PCA-based
# cluster, rounded to 4 decimals.
df_segm_pca_analysis = df_segm_pca.groupby(['K-means PCA']).mean().round(4)
df_segm_pca_analysis
ID Sex Marital status Age Education Income Occupation Settlement size component 1 component 2 component 3
K-means PCA
0 1.000007e+08 0.0347 0.1924 35.4479 0.7382 140183.3155 1.2539 1.3912 1.4667 -0.9422 -0.1839
1 1.000012e+08 0.9190 0.9670 28.9580 1.0645 106617.4678 0.6597 0.4273 -1.2052 0.6160 -0.8333
2 1.000009e+08 0.4925 0.6842 55.8421 2.1278 157389.3872 1.1128 1.0977 1.5153 2.1581 0.8680
3 1.000013e+08 0.3418 0.1016 35.0462 0.7667 92501.5889 0.2079 0.0439 -1.2220 -0.8951 1.0196
  • Segment 0: low career and experience values with high education and lifestyle values.
  • Label: Standard
  • Segment 1: high career but low education, lifestyle and experience
  • Label: Career focused
  • Segment 2: low career, education and lifestyle, but high life experience
  • Label: Fewer opportunities
  • Segment 3: high career, education and lifestyle as well as high life experience
  • Label: Well-off
# Number of customers per cluster: 'Sex' is only used as a column to count.
df_segm_pca_analysis['Count'] = df_segm_pca[['K-means PCA', 'Sex']].groupby(['K-means PCA']).count()
# Share of all customers that falls into each cluster (a fraction, not a percentage).
df_segm_pca_analysis['%'] = df_segm_pca_analysis['Count'] / df_segm_pca_analysis['Count'].sum()
# Replace the integer cluster ids in the index with descriptive names.
# NOTE(review): this mapping does not appear to match the component means
# recorded for this run. Cluster 2 is high on all three components (the
# profile the accompanying description calls 'well-off') but is renamed
# 'fewer-opportunities'; cluster 0 is high on component 1 and low on
# component 2 (described as 'career focused') but is renamed 'standard'.
# Matching the descriptions to the means would give
# 0 -> career-focused, 1 -> standard, 2 -> well-off, 3 -> fewer-opportunities.
# Confirm, and change the description text and the later .map() call together.
df_segm_pca_analysis.rename(index={
    0: 'standard',
    1: 'career-focused',
    2: 'fewer-opportunities',
    3: 'well-off'
}, inplace=True)

df_segm_pca_analysis
ID Sex Marital status Age Education Income Occupation Settlement size component 1 component 2 component 3 Count %
K-means PCA
standard 1.000007e+08 0.0347 0.1924 35.4479 0.7382 140183.3155 1.2539 1.3912 1.4667 -0.9422 -0.1839 634 0.3170
career-focused 1.000012e+08 0.9190 0.9670 28.9580 1.0645 106617.4678 0.6597 0.4273 -1.2052 0.6160 -0.8333 667 0.3335
fewer-opportunities 1.000009e+08 0.4925 0.6842 55.8421 2.1278 157389.3872 1.1128 1.0977 1.5153 2.1581 0.8680 266 0.1330
well-off 1.000013e+08 0.3418 0.1016 35.0462 0.7667 92501.5889 0.2079 0.0439 -1.2220 -0.8951 1.0196 433 0.2165

Number of Customers per Segment

# Bar chart of the number of customers in each PCA-based segment.
plt.figure(figsize=(8, 6))
# seaborn deprecated passing `palette` without `hue` (removal announced for
# v0.14.0). Mapping the x variable to `hue` and hiding the redundant legend
# keeps one colour per bar without the FutureWarning.
s = sns.barplot(
    data=df_segm_pca_analysis,
    x=df_segm_pca_analysis.index,
    y='Count',
    hue=df_segm_pca_analysis.index,
    palette=['g', 'c', 'r', 'm'],
    legend=False,
)
plt.xlabel('Segment')
plt.ylabel('Population')
plt.show()
/var/folders/s3/h2qfnwzs63b1k1xft79tdnfw0000gn/T/ipykernel_94496/2239997931.py:2: FutureWarning:



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

Add segment labels to original dataset

# Add a human-readable segment name for every customer, derived from the
# integer cluster id in 'K-means PCA'.
# NOTE(review): this dictionary duplicates the one used to rename the analysis
# table and is hard-coded against the ids of this particular fit. Judged by the
# per-cluster component means recorded in this notebook, the names look
# permuted (e.g. cluster 2, high on all three components, becomes
# 'fewer-opportunities') - confirm and keep both mappings in sync.
df_segm_pca['Segment'] = df_segm_pca['K-means PCA'].map({
    0: 'standard',
    1: 'career-focused',
    2: 'fewer-opportunities',
    3: 'well-off'
})
df_segm_pca
ID Sex Marital status Age Education Income Occupation Settlement size component 1 component 2 component 3 K-means PCA Segment
0 100000001 0 0 67 2 124670 1 2 2.859782 0.936676 2.036586 2 fewer-opportunities
1 100000002 1 1 22 1 150773 1 2 0.944130 0.394492 -2.433785 0 standard
2 100000003 0 0 49 1 89210 0 0 -0.023032 -0.881797 1.974083 3 well-off
3 100000004 0 0 45 1 171565 1 1 2.212422 -0.563616 0.635332 0 standard
4 100000005 0 0 53 1 149031 1 1 2.110202 -0.425124 1.127543 0 standard
... ... ... ... ... ... ... ... ... ... ... ... ... ...
1995 100001996 1 0 47 1 123525 0 0 -1.485348 0.432286 1.615196 3 well-off
1996 100001997 1 1 27 1 117744 1 0 -1.672129 0.839600 -0.923547 1 career-focused
1997 100001998 0 0 31 0 86400 0 0 -1.841798 -2.158681 1.116012 3 well-off
1998 100001999 1 1 24 1 97968 0 0 -2.716832 0.561390 -0.476253 1 career-focused
1999 100002000 0 0 25 0 68416 0 0 -2.209795 -2.423450 0.860709 3 well-off

2000 rows × 13 columns

Visualize segments with respect to first two components

# Customers in the plane of the first two principal components, coloured by segment.
plt.figure(figsize=(10, 8))
# Component 2 is on the x axis and component 1 on the y axis.
sns.scatterplot(
    x=df_segm_pca['component 2'],
    y=df_segm_pca['component 1'],
    hue=df_segm_pca['Segment'],
    palette=['g','r','c','m']
)
# Four very large hollow markers used as hand-placed highlight rings; their
# centres, sizes and edge colours are hard-coded for this data set and are
# independent of the hue palette above.
plt.scatter(2.35, 2 , s=60000, facecolors='none', edgecolors='g' )
plt.scatter(-1.2, 2 , s=60000, facecolors='none', edgecolors='y' )
plt.scatter(-1, -1 , s=30000, facecolors='none', edgecolors='b' )
plt.scatter(1.2, -1 , s=40000, facecolors='none', edgecolors='r' )

plt.title('Component 1 vs Component 2')
plt.show()