Project author: atemate

Project description:
Data transformations toolkit made from the Jupyter notebook https://www.kaggle.com/fabiendaniel/customer-segmentation
Primary language: Jupyter Notebook
Repository: git://github.com/atemate/customer-segmentation-toolkit.git
Created: 2021-05-11T18:41:46Z
Project community: https://github.com/atemate/customer-segmentation-toolkit

License: Apache License 2.0



Customer segmentation toolkit

Data transformations toolkit made by Team #2 for the MLOps Engineering Lab #2 “Feature Store for ML”.

Installation

```
pip install -U customer-segmentation-toolkit
```
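The examples below use five of the toolkit's modules; if the install succeeded, importing them should resolve without errors (the module names are taken from the usage sections that follow):

```python
# Quick import check for the modules used in this README.
from customer_segmentation_toolkit import (
    load_split,
    clean_rows,
    analyse_purchases,
    analyse_customers,
    data_zoo,
)
```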

Usage

```python
import pandas as pd
from pathlib import Path
```

01. Load and split dataset

```python
import datetime
from customer_segmentation_toolkit.load_split import load_data_csv, split_by_invoice_date

ONLINEOFFLINE_DATE_SPLIT = datetime.date(2011, 10, 1)

# Loading the original dataset
df = load_data_csv('../data/data.csv')
print(f'Loaded dataset, shape: {df.shape}')

# Splitting the dataset into offline and online parts
df_offline, df_online = split_by_invoice_date(df, ONLINEOFFLINE_DATE_SPLIT)
print(f'Offline dataset shape: {df_offline.shape}')
print(f'Offline invoices: from {df_offline["InvoiceDate"].min()} to {df_offline["InvoiceDate"].max()}')
print(f'Online dataset shape: {df_online.shape}')
print(f'Online invoices: from {df_online["InvoiceDate"].min()} to {df_online["InvoiceDate"].max()}')
```

```
Loaded dataset, shape: (541909, 8)
Offline dataset shape: (370931, 8)
Offline invoices: from 2010-12-01 08:26:00 to 2011-09-30 17:22:00
Online dataset shape: (170978, 8)
Online invoices: from 2011-10-02 10:32:00 to 2011-12-09 12:50:00
```

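For intuition, `split_by_invoice_date` partitions the rows around the given date using the `InvoiceDate` column. A minimal sketch of an equivalent split, assuming `InvoiceDate` holds parseable timestamps (an illustration, not the toolkit's actual implementation):

```python
import datetime
import pandas as pd

def split_by_date_sketch(df: pd.DataFrame, split_date: datetime.date):
    # Invoices strictly before the split date form the "offline" part;
    # everything on or after it forms the "online" part.
    dates = pd.to_datetime(df['InvoiceDate'])
    mask = dates.dt.date < split_date
    return df[mask], df[~mask]
```
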
```python
# Saving the processed data
OUTPUT = Path('../data/output/01_data_split_offline_online')
OUTPUT.mkdir(exist_ok=True, parents=True)

df_offline.to_csv(f'{OUTPUT}/no_live_data.csv', index=False)
df_online.to_csv(f'{OUTPUT}/raw_live_data.csv', index=False)
Path(f'{OUTPUT}/onlineoffline_date_split.txt').write_text(str(ONLINEOFFLINE_DATE_SPLIT))

print(f'Output data saved to {OUTPUT}: {[p.name for p in OUTPUT.iterdir()]}')
```

```
Output data saved to ../data/output/01_data_split_offline_online: ['onlineoffline_date_split.txt', 'no_live_data.csv', 'raw_live_data.csv']
```

02. Clean dataset rows

```python
from customer_segmentation_toolkit.load_split import load_data_csv
from customer_segmentation_toolkit.clean_rows import clean_data_rows

# Loading the raw offline dataset
df = load_data_csv('../data/output/01_data_split_offline_online/no_live_data.csv')
print(f'Loaded raw offline dataset, shape: {df.shape}')

# Cleaning the dataset
df_cleaned = clean_data_rows(df)
print(f'Cleaned offline dataset shape: {df_cleaned.shape}')
```

```
Loaded raw offline dataset, shape: (370931, 8)
Cleaned offline dataset shape: (263815, 10)
```

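`clean_data_rows` encapsulates the row-level cleaning from the original Kaggle notebook. As a purely hypothetical sketch of the kind of rules usually applied to this dataset (the toolkit's actual rules may differ):

```python
import pandas as pd

def clean_rows_sketch(df: pd.DataFrame) -> pd.DataFrame:
    # Hypothetical cleaning rules, common for the Online Retail dataset:
    df = df.dropna(subset=['CustomerID'])                       # drop anonymous purchases
    df = df.drop_duplicates()                                   # drop exact duplicates
    df = df[~df['InvoiceNo'].astype(str).str.startswith('C')]  # drop cancelled orders
    return df
```

Note that the cleaned dataset comes out with shape (263815, 10) versus the raw (370931, 8), so the real function also derives extra columns in addition to filtering rows.
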
```python
# Saving the processed data
OUTPUT = Path('../data/output/02_data_clean_rows')
OUTPUT.mkdir(exist_ok=True, parents=True)

df_cleaned.to_csv(f'{OUTPUT}/no_live_data__cleaned.csv', index=False)
print(f'Output data saved to {OUTPUT}: {[p.name for p in OUTPUT.iterdir()]}')
```

```
Output data saved to ../data/output/02_data_clean_rows: ['no_live_data__cleaned.csv']
```

03. Analyse purchases

```python
import datetime
from customer_segmentation_toolkit.load_split import load_data_csv
from customer_segmentation_toolkit.analyse_purchases import build_product_list

N_PURCHASE_CLUSTERS = 5
TRAINTEST_DATE_SPLIT = datetime.date(2011, 8, 1)

# Loading the cleaned dataset
df_cleaned = load_data_csv('../data/output/02_data_clean_rows/no_live_data__cleaned.csv')
print(f'Loaded cleaned offline dataset, shape: {df_cleaned.shape}')

list_products = build_product_list(df_cleaned)
print('Built list of products:')
print(pd.DataFrame(list_products).head())
print('...')
```

```
Loaded cleaned offline dataset, shape: (263815, 10)
Built list of products:
         0    1
0    heart  251
1  vintage  195
2      set  194
3      bag  158
4      box  147
...
```

```python
from customer_segmentation_toolkit.analyse_purchases import build_keywords_matrix

# Building the keywords count matrix
THRESHOLD = [0, 1, 2, 3, 5, 10]
matrix = build_keywords_matrix(df_cleaned, list_products, THRESHOLD)
print(f'Built keywords count matrix (shape: {matrix.shape}):')
print(matrix.head())
```

```
Built keywords count matrix (shape: (3662, 188)):
   heart  vintage  set  bag  box  glass  christmas  design  candle  flower  \
0      1        0    0    0    0      0          0       0       0       0
1      0        0    0    0    0      0          0       0       0       0
2      1        0    0    0    0      0          0       0       0       0
3      0        0    0    0    0      0          0       0       0       0
4      1        0    0    0    0      0          0       0       0       0

   ...  medium  hen  wallet  point  0<.<1  1<.<2  2<.<3  3<.<5  5<.<10  .>10
0  ...       0    0       0      0      0      0      1      0       0     0
1  ...       0    0       0      0      0      0      0      1       0     0
2  ...       0    0       0      0      0      0      0      1       0     0
3  ...       0    0       0      0      0      0      0      1       0     0
4  ...       0    0       0      0      0      0      0      1       0     0

[5 rows x 188 columns]
```

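The trailing columns (`0<.<1` through `.>10`) are price-range indicators derived from `THRESHOLD`. A hedged reconstruction of how such bucket columns can be built (the column names match the output above; the toolkit's internals may differ):

```python
import pandas as pd

THRESHOLD = [0, 1, 2, 3, 5, 10]

def price_range_columns(prices: pd.Series, threshold=THRESHOLD) -> pd.DataFrame:
    # One indicator column per price bucket, e.g. '2<.<3' or '.>10'.
    labels = [f'{lo}<.<{hi}' for lo, hi in zip(threshold, threshold[1:])]
    labels.append(f'.>{threshold[-1]}')
    buckets = pd.cut(prices, bins=threshold + [float('inf')], labels=labels)
    return pd.get_dummies(buckets)
```
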
```python
from customer_segmentation_toolkit.analyse_purchases import compute_purchase_clusters

# Computing purchase clusters via KMeans
clusters = compute_purchase_clusters(matrix, N_PURCHASE_CLUSTERS)
print('Built purchase clusters:')
print(pd.Series(clusters).value_counts())
```

```
Built purchase clusters:
1    1117
4     911
0     638
2     566
3     430
dtype: int64
```

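As the comment says, `compute_purchase_clusters` clusters the keywords matrix with KMeans; a direct scikit-learn equivalent would look roughly like this (the hyperparameters are assumptions, not the toolkit's values):

```python
from sklearn.cluster import KMeans

# `matrix` and N_PURCHASE_CLUSTERS come from the steps above.
kmeans = KMeans(n_clusters=N_PURCHASE_CLUSTERS, n_init=10, random_state=42)
clusters = kmeans.fit_predict(matrix)
```
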
```python
from sklearn.metrics import silhouette_samples, silhouette_score
from customer_segmentation_toolkit.analyse_purchases import plot_silhouette

silhouette_avg = silhouette_score(matrix, clusters)
sample_silhouette_values = silhouette_samples(matrix, clusters)

# Plotting silhouette values
plot_silhouette(N_PURCHASE_CLUSTERS, [-0.07, 0.33], len(matrix), sample_silhouette_values, clusters)
```

*(figure: silhouette plot of the purchase clusters)*

```python
from customer_segmentation_toolkit.analyse_purchases import add_purchase_clusters_info

# Constructing the result DataFrame
df_with_clusters = add_purchase_clusters_info(df_cleaned, clusters, N_PURCHASE_CLUSTERS)
print('Added purchase clusters info to the offline cleaned dataset:')
print(f'Shape: {df_with_clusters.shape}')
print(f'Columns: {list(df_with_clusters.columns)}')
```

```
Added purchase clusters info to the offline cleaned dataset:
Shape: (13081, 9)
Columns: ['CustomerID', 'InvoiceNo', 'Basket Price', 'categ_0', 'categ_1', 'categ_2', 'categ_3', 'categ_4', 'InvoiceDate']
```

```python
from customer_segmentation_toolkit.load_split import split_by_invoice_date

# Splitting the new dataset (offline + cluster info) into train and test parts
df_offline_train, df_offline_test = split_by_invoice_date(df_with_clusters, TRAINTEST_DATE_SPLIT)
print(f'Split: train of shape {df_offline_train.shape} + test of shape {df_offline_test.shape}')
```

```
Split: train of shape (10054, 9) + test of shape (3027, 9)
```

```python
# Saving the processed data
OUTPUT = Path('../data/output/03_data_compute_description_keywords')
OUTPUT.mkdir(exist_ok=True, parents=True)

matrix.to_csv(f'{OUTPUT}/no_live_data__cleaned__keywords.csv', index=False)
df_offline_train.to_csv(f'{OUTPUT}/no_live_data__cleaned__purchase_clusters__train.csv', index=False)
df_offline_test.to_csv(f'{OUTPUT}/no_live_data__cleaned__purchase_clusters__test.csv', index=False)
pd.DataFrame(THRESHOLD, columns=['threshold']).to_csv(f'{OUTPUT}/threshold.csv', index=False)
Path(f'{OUTPUT}/n_purchase_clusters.txt').write_text(str(N_PURCHASE_CLUSTERS))
Path(f'{OUTPUT}/traintest_date_split.txt').write_text(str(TRAINTEST_DATE_SPLIT))

print(f'Output data saved to {OUTPUT}: {[p.name for p in OUTPUT.iterdir()]}')
```

```
Output data saved to ../data/output/03_data_compute_description_keywords: ['no_live_data__cleaned__keywords.csv', 'no_live_data__cleaned__purchase_clusters__test.csv', 'threshold.csv', 'n_purchase_clusters.txt', 'traintest_date_split.txt', 'no_live_data__cleaned__purchase_clusters__train.csv']
```

04. Analyse customer categories

```python
from customer_segmentation_toolkit.load_split import load_data_csv

N_CUSTOMER_CLUSTERS = 11
SELECTED_CUSTOMERS_CATEG_THRESHOLD = 40

# Loading the cleaned offline train dataset
DATA = '../data/output/03_data_compute_description_keywords'
N_PURCHASE_CLUSTERS = int(Path(f'{DATA}/n_purchase_clusters.txt').read_text().strip())

basket_price = load_data_csv(f'{DATA}/no_live_data__cleaned__purchase_clusters__train.csv')
print(f'Loaded purchase clusters data of shape: {basket_price.shape}')
print(basket_price.head())
print('...')
```

```
Loaded purchase clusters data of shape: (10054, 9)
   CustomerID InvoiceNo  Basket Price  categ_0  categ_1  categ_2  categ_3  \
0       12347    537626        711.79    83.40   187.20   293.35   124.44
1       12347    542237        475.39    53.10   168.75   169.20     0.00
2       12347    549222        636.25    71.10   369.15   115.00     0.00
3       12347    556201        382.52    78.06    74.40   168.76    19.90
4       12348    539318        892.80     0.00   414.00     0.00     0.00

   categ_4                   InvoiceDate
0    23.40 2010-12-07 14:57:00.000001024
1    84.34 2011-01-26 14:29:59.999999744
2    81.00 2011-04-07 10:42:59.999999232
3    41.40 2011-06-09 13:01:00.000000256
4   478.80 2010-12-16 19:09:00.000000000
...
```

```python
from customer_segmentation_toolkit.analyse_customers import build_transactions_per_user

# Building transactions per user
transactions_per_user = build_transactions_per_user(basket_price, n_purchase_clusters=N_PURCHASE_CLUSTERS)
print(f'Built transactions per user, shape: {transactions_per_user.shape}')
print(transactions_per_user.head())
print('...')
```

```
Built transactions per user, shape: (3143, 13)
   CustomerID  count     min     max        mean      sum    categ_0  \
0       12347      4  382.52  711.79  551.487500  2205.95  12.949523
1       12348      3  227.44  892.80  495.746667  1487.24   0.000000
2       12350      1  334.40  334.40  334.400000   334.40  27.900718
3       12352      4  144.35  840.30  360.370000  1441.48   3.683714
4       12353      1   89.00   89.00   89.000000    89.00  19.887640

     categ_1    categ_2    categ_3    categ_4  LastPurchase  FirstPurchase
0  36.242889  33.831682   6.543213  10.432693            52            236
1  54.059869   0.000000   0.000000  45.940131           117            227
2  60.406699   0.000000   0.000000  11.692584           179            179
3  77.977495   5.771846  11.859339   0.707606           131            165
4  13.033708   0.000000  67.078652   0.000000            73             73
...
```

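Conceptually, `build_transactions_per_user` is a groupby-aggregate over `CustomerID`. A simplified sketch covering only the count/min/max/mean/sum columns (the real function also adds the per-category percentages and the FirstPurchase/LastPurchase recency columns):

```python
import pandas as pd

def transactions_per_user_sketch(basket_price: pd.DataFrame) -> pd.DataFrame:
    # Aggregate each customer's basket totals.
    return (basket_price
            .groupby('CustomerID')['Basket Price']
            .agg(['count', 'min', 'max', 'mean', 'sum'])
            .reset_index())
```
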
```python
from customer_segmentation_toolkit.analyse_customers import (
    plot_customers_pca,
    convert_customers_df_to_np,
    analyse_customers_pca,
)

# Analysing the customers distribution via PCA
matrix = convert_customers_df_to_np(transactions_per_user, N_PURCHASE_CLUSTERS)
scaled_matrix, pca = analyse_customers_pca(matrix)
plot_customers_pca(matrix, pca)
```

*(figure: customers PCA plot)*

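`analyse_customers_pca` returns a scaled feature matrix together with a fitted PCA. A plain scikit-learn sketch of the same idea, assuming standard scaling (the toolkit's preprocessing may differ):

```python
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# `matrix` comes from convert_customers_df_to_np above.
scaled = StandardScaler().fit_transform(matrix)
pca = PCA().fit(scaled)
print(pca.explained_variance_ratio_[:3])  # variance share of the leading components
```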

```python
from customer_segmentation_toolkit.analyse_customers import compute_customer_clusters

# Computing customer clusters via KMeans
clusters_clients = compute_customer_clusters(scaled_matrix, N_CUSTOMER_CLUSTERS)
print('Computed customer clusters via KMeans:')
display(pd.Series(clusters_clients).value_counts())
```

```
Computed customer clusters via KMeans:
7     1186
6      475
0      305
3      276
8      239
9      235
1      226
4      152
2       32
5       10
10       7
dtype: int64
```

```python
from sklearn.metrics import silhouette_samples, silhouette_score
from customer_segmentation_toolkit.analyse_purchases import plot_silhouette

silhouette_avg = silhouette_score(scaled_matrix, clusters_clients)
sample_silhouette_values = silhouette_samples(scaled_matrix, clusters_clients)

# Plotting silhouette values
plot_silhouette(N_CUSTOMER_CLUSTERS, [-0.15, 0.55], len(scaled_matrix), sample_silhouette_values, clusters_clients)
```

*(figure: silhouette plot of the customer clusters)*

```python
from customer_segmentation_toolkit.analyse_customers import plot_customer_categories

# Plotting customer categories
plot_customer_categories(scaled_matrix, clusters_clients, N_CUSTOMER_CLUSTERS)
```

*(figure: customer categories plot)*

```python
from customer_segmentation_toolkit.analyse_customers import add_customer_clusters_info

# Constructing the result dataset
merged_df = add_customer_clusters_info(transactions_per_user, clusters_clients)
print('Constructed the result dataset:')
print(f'Shape: {merged_df.shape}')
print(f'Columns: {list(merged_df.columns)}')
```

```
Constructed the result dataset:
Shape: (3143, 14)
Columns: ['CustomerID', 'count', 'min', 'max', 'mean', 'sum', 'categ_0', 'categ_1', 'categ_2', 'categ_3', 'categ_4', 'LastPurchase', 'FirstPurchase', 'cluster']
```

```python
from customer_segmentation_toolkit.analyse_customers import compute_aggregated_customer_clusters_info

# Constructing the aggregated cluster info dataset
selected_customers_df = compute_aggregated_customer_clusters_info(
    merged_df, N_PURCHASE_CLUSTERS, N_CUSTOMER_CLUSTERS,
    categ_threshold=SELECTED_CUSTOMERS_CATEG_THRESHOLD,
)
print('Constructed the aggregated cluster info:')
print(f'Shape: {selected_customers_df.shape}')
print(f'Columns: {list(selected_customers_df.columns)}')
```

```
Constructed the aggregated cluster info:
Shape: (11, 14)
Columns: ['cluster', 'count', 'min', 'max', 'mean', 'sum', 'categ_0', 'categ_1', 'categ_2', 'categ_3', 'categ_4', 'LastPurchase', 'FirstPurchase', 'size']
```

```python
# Saving the processed data
OUTPUT = Path('../data/output/04_data_analyse_customers')
OUTPUT.mkdir(exist_ok=True, parents=True)

selected_customers_df.to_csv(f'{OUTPUT}/no_live_data__cleaned__purchase_clusters__train__selected_customers_aggregated.csv', index=False)
merged_df.to_csv(f'{OUTPUT}/no_live_data__cleaned__purchase_clusters__train__customer_clusters.csv', index=False)
Path(f'{OUTPUT}/n_customer_clusters.txt').write_text(str(N_CUSTOMER_CLUSTERS))

print(f'Output data saved to {OUTPUT}: {[p.name for p in OUTPUT.iterdir()]}')
```

```
Output data saved to ../data/output/04_data_analyse_customers: ['n_customer_clusters.txt', 'no_live_data__cleaned__purchase_clusters__train__selected_customers_aggregated.csv', 'no_live_data__cleaned__purchase_clusters__train__customer_clusters.csv']
```

05. Download dataset and use it for training

```python
from customer_segmentation_toolkit.data_zoo import download_data_csv
from sklearn.model_selection import train_test_split

# Download dataset from the data_zoo:
csv = 'no_live_data__cleaned__purchase_clusters__train__customer_clusters.csv'
selected_customers: pd.DataFrame = download_data_csv(f'data/output/04_data_analyse_customers/{csv}')

X = selected_customers[['mean', 'categ_0', 'categ_1', 'categ_2', 'categ_3', 'categ_4']]
Y = selected_customers['cluster']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.8)
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape
```

```
((2514, 6), (629, 6), (2514,), (629,))
```

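From here the split is ready for model training. For illustration only (the classifier choice is not part of the toolkit), a simple scikit-learn model predicting a customer's cluster from the selected features:

```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# X_train, Y_train, X_test, Y_test come from the split above.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, Y_train)
print(f'Test accuracy: {accuracy_score(Y_test, clf.predict(X_test)):.3f}')
```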