from pathlib import Path
from typing import Tuple
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.metrics import confusion_matrix, roc_auc_score, log_loss, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier

# Show every column when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# The data directory is resolved relative to two levels above the working directory.
PROJECT_ROOT = Path.cwd().parent.parent

# Transparent figure and axes backgrounds.
plt.rcParams["figure.facecolor"] = (1, 1, 1, 0)  # RGBA tuple with alpha=0
plt.rcParams["axes.facecolor"] = (1, 1, 1, 0)  # RGBA tuple with alpha=0
Profit Curves
Getting the data
The dataset is from the Telco Customer Churn Kaggle Competition. The target is the simple binary outcome in the `Churn` column.
# Load the Telco churn CSV from the project's data directory.
data = pd.read_csv(
    PROJECT_ROOT / "data" / "WA_Fn-UseC_-Telco-Customer-Churn.csv",
    low_memory=False,
)
data
customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7590-VHVEG | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No | Yes | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
1 | 5575-GNVDE | Male | 0 | No | No | 34 | Yes | No | DSL | Yes | No | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No |
2 | 3668-QPYBK | Male | 0 | No | No | 2 | Yes | No | DSL | Yes | Yes | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
3 | 7795-CFOCW | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes | No | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
4 | 9237-HQITU | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No | No | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
7038 | 6840-RESVB | Male | 0 | Yes | Yes | 24 | Yes | Yes | DSL | Yes | No | Yes | Yes | Yes | Yes | One year | Yes | Mailed check | 84.80 | 1990.5 | No |
7039 | 2234-XADUH | Female | 0 | Yes | Yes | 72 | Yes | Yes | Fiber optic | No | Yes | Yes | No | Yes | Yes | One year | Yes | Credit card (automatic) | 103.20 | 7362.9 | No |
7040 | 4801-JZAZL | Female | 0 | Yes | Yes | 11 | No | No phone service | DSL | Yes | No | No | No | No | No | Month-to-month | Yes | Electronic check | 29.60 | 346.45 | No |
7041 | 8361-LTMKD | Male | 1 | Yes | No | 4 | Yes | Yes | Fiber optic | No | No | No | No | No | No | Month-to-month | Yes | Mailed check | 74.40 | 306.6 | Yes |
7042 | 3186-AJIEK | Male | 0 | No | No | 66 | Yes | No | Fiber optic | Yes | No | Yes | Yes | Yes | Yes | Two year | Yes | Bank transfer (automatic) | 105.65 | 6844.5 | No |
7043 rows × 21 columns
target = "Churn"

# The label is True for customers whose Churn value is "Yes".
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(target, axis=1),
    data[target] == "Yes",
    test_size=0.2,
    random_state=0,
)

# Feature columns fed to the model.
cols_to_use = [
    "tenure",
    "PhoneService",
    "MultipleLines",
    "InternetService",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
    "Contract",
    "PaperlessBilling",
    "PaymentMethod",
    "MonthlyCharges",
]
Training a model
# One-hot encode the object-dtype columns and pass everything else through.
preprocessor = ColumnTransformer(
    transformers=[
        (
            "one_hot",
            OneHotEncoder(),
            make_column_selector(dtype_include="object"),
        ),
    ],
    remainder="passthrough",  # Leave numerical variables unchanged
)

pipeline = Pipeline([("preprocessor", preprocessor), ("classifier", XGBClassifier())])
pipeline.fit(X_train[cols_to_use], y_train)

# Probability of the positive (churn) class for each test customer.
y_pred = pipeline.predict_proba(X_test[cols_to_use])[:, 1]

# Compute each metric once and reuse it in the report below.
roc_auc = roc_auc_score(y_test, y_pred)
log_loss_val = log_loss(y_test, y_pred)

print(f"ROC AUC Score: {roc_auc}")
print(f"GINI: {2 * roc_auc - 1}")
print(f"Log loss: {log_loss_val}")
ROC AUC Score: 0.8152565467986469
GINI: 0.6305130935972938
Log loss: 0.4718305882942505
Plotting our predictions
We can plot the distribution of the model’s predictions like so
# Histogram of the predicted churn probabilities on the test set.
fig, ax = plt.subplots(figsize=(5, 3))
ax.hist(y_pred, bins=100, label="Predictions")
ax.set_xlim(0, 1)
ax.set_xlabel("Predicted probability")
ax.set_ylabel("Frequency")
ax.set_title("Histogram of predicted probabilities")
plt.show()
Picking a decision threshold from here is not always obvious. We can see the trade-offs by plotting confusion matrices for a range of thresholds.
# One confusion matrix per candidate threshold, sharing a y-axis.
fig, axes = plt.subplots(1, 3, figsize=(9, 3), sharey=True)

for i, threshold in enumerate([0.1, 0.5, 0.9]):
    # Use >= so a score equal to the threshold is flagged, matching profit_curve.
    cm = confusion_matrix(y_test, (y_pred >= threshold).astype(int))
    ConfusionMatrixDisplay(cm, display_labels=pipeline.classes_).plot(ax=axes[i], colorbar=False)
    if i != 0:
        # Only the left-most panel keeps its y-axis label.
        axes[i].set_ylabel(None)
    axes[i].set_title(f"Threshold: {threshold}")

plt.show()
The payoff matrix
We can use a payoff matrix to help us decide which threshold to use. This is a table that shows how much retaining a customer is worth to us, and how much it costs us to retain them.
For instance, say the matrix looks like the one below:
# Rows: were given a discount (False, True).
# Columns: would have churned (False, True).
payoff_matrix = np.array(
    [
        [0, 0],
        [-80, 200],
    ]
)

fig, ax = plt.subplots(figsize=(5, 3))
norm = plt.Normalize(vmin=payoff_matrix.min(), vmax=payoff_matrix.max())
colormap = plt.cm.viridis

# Reuse the same colormap object for the image and for the text-contrast check.
ax.imshow(payoff_matrix, cmap=colormap, norm=norm)
ax.set_xticks(np.arange(2), ["False", "True"])
ax.set_xlabel("Would have churned")
ax.set_yticks(np.arange(2), ["False", "True"])
ax.set_ylabel("Were given discount")
ax.set_title("Payoff Matrix (£)")

for (i, j), cell_value in np.ndenumerate(payoff_matrix):
    # Black text on light cells, white text on dark cells.
    brightness = np.array(colormap(norm(cell_value))[:3]).mean()
    text_color = "black" if brightness > 0.5 else "white"
    ax.text(j, i, cell_value, ha="center", va="center", color=text_color)

plt.show()
This matrix says that retaining a customer who was going to churn by giving them a discount is worth £200 to us. It also says that giving a discount to a customer who wasn’t going to churn costs us £80. The payoffs for the scenarios where we don’t give a discount are set to 0.
Let’s combine our confusion and payoff matrices to see how much deploying this model could be worth.
If we used the 0.5 threshold we would end up with:
\[ \begin{array}{rrcr} & 189 & \times & 200 \\ + & 179 & \times & -80 \\ \hline = & 23480 & & \\ \end{array} \]
So £23,480 of value created within our test set.
The profit curve
How do we know that the threshold we chose was the most profitable one? We can plot the profit curve to see how the profit changes as we change the threshold.
def profit_curve(y_true: np.ndarray, y_pred: np.ndarray, payoff_matrix: np.ndarray, n_points: int = 101) -> Tuple:
    """
    Calculate the profit curve for a binary classifier.

    For each of ``n_points`` evenly spaced thresholds in [0, 1], every sample
    with ``y_pred >= threshold`` is treated as a positive prediction. The
    resulting outcome counts are multiplied by the matching payoff and summed.

    The counts are computed explicitly rather than through an inferred-label
    confusion matrix, so the result stays correct when only one class is
    present (e.g. nobody is flagged and nobody churned).

    Args:
        y_true (array-like): True binary labels (bool or 0/1).
        y_pred (array-like): Predicted probabilities of the positive class.
        payoff_matrix (array-like): 2x2 payoffs indexed as
            ``[predicted, actual]``, i.e. ``[[TN, FN], [FP, TP]]``.
        n_points (int): Number of thresholds to evaluate.

    Returns:
        tuple: ``(thresholds, profits)`` where ``thresholds`` is a numpy array
        and ``profits`` is a list with one total per threshold.

    Raises:
        ValueError: If ``payoff_matrix`` is not 2x2, or ``y_true`` and
            ``y_pred`` have different shapes.
    """
    actual = np.asarray(y_true).astype(bool)
    scores = np.asarray(y_pred, dtype=float)
    payoffs = np.asarray(payoff_matrix)

    if payoffs.shape != (2, 2):
        raise ValueError(f"payoff_matrix must have shape (2, 2), got {payoffs.shape}")
    if actual.shape != scores.shape:
        raise ValueError(f"y_true and y_pred must have the same shape, got {actual.shape} and {scores.shape}")

    thresholds = np.linspace(0, 1, n_points)
    profits = []
    for threshold in thresholds:
        flagged = scores >= threshold
        # Always a full 2x2 table laid out as [actual, predicted].
        counts = np.array(
            [
                [np.count_nonzero(~actual & ~flagged), np.count_nonzero(~actual & flagged)],
                [np.count_nonzero(actual & ~flagged), np.count_nonzero(actual & flagged)],
            ]
        )
        # The payoff matrix is [predicted, actual]; transpose it to line up with counts.
        profits.append((counts * payoffs.T).sum())

    return thresholds, profits
# Profit at each threshold for the first model on the test set.
thresholds, profits = profit_curve(y_test, y_pred, payoff_matrix)
def plot_profit_curve(*data, precision: int):
    """Plot one or more profit curves on a shared axis.

    Args:
        *data: One ``(thresholds, profits, label)`` tuple per curve.
        precision (int): Spacing of the y-axis ticks. The y-limits are rounded
            outwards to multiples of this value.

    Raises:
        ValueError: If no curves are supplied.
    """
    if not data:
        raise ValueError("plot_profit_curve needs at least one (thresholds, profits, label) tuple")

    fig, ax = plt.subplots(figsize=(5, 3))

    # Track the extremes across every curve so the y-limits fit all of them,
    # not just the last one plotted.
    lowest, highest = np.inf, -np.inf
    for thresholds, profits, label in data:
        ax.plot(thresholds, profits, label=label)
        lowest = min(lowest, np.min(profits))
        highest = max(highest, np.max(profits))

    ax.set_xlim([0, 1])
    # linspace gives exactly 11 ticks from 0.0 to 1.0 without float-step drift.
    ax.set_xticks(np.linspace(0, 1, 11))
    ax.set_xlabel("Threshold")

    y_min = np.floor(lowest / precision) * precision
    y_max = np.ceil(highest / precision) * precision

    ax.set_ylim([y_min, y_max])
    ax.set_yticks(np.arange(y_min, y_max + 1, precision))
    ax.set_ylabel("Profit")

    ax.grid(True)
    ax.legend(loc="best")
    ax.set_title("Profit Curve")

    plt.show()


plot_profit_curve((thresholds, profits, "Model 1"), precision=10000)
def find_best_threshold_and_profit(thresholds, profits):
    """Find the threshold with the highest profit.

    Accepts any array-like (list, tuple, numpy array). On ties, the first
    (lowest-index) maximum is returned.

    Args:
        thresholds (array-like): Threshold values.
        profits (array-like): Profit values, aligned with ``thresholds``.

    Returns:
        tuple: Best threshold and the profit achieved at it.

    Raises:
        ValueError: If ``profits`` is empty.
    """
    profit_values = np.asarray(profits)
    if profit_values.size == 0:
        raise ValueError("profits must not be empty")

    # argmax returns the first maximum and works for lists and arrays alike.
    best_index = int(np.argmax(profit_values))
    best_threshold = np.asarray(thresholds)[best_index]
    best_profit = profit_values[best_index]
    return best_threshold, best_profit
best_thresh, best_profits = find_best_threshold_and_profit(thresholds, profits)
print(f"Ideal threshold is {best_thresh:.2f} which yields profits of £{best_profits:.0f}")
Ideal threshold is 0.22 which yields profits of £32360
Limitations
Some limitations of the profit curve:
- It depends upon us knowing the value of true and false positive predictions. If your payoff matrix is meaningless, your profit curve will be too.
- It assumes that all customers are worth roughly the same. This may not be true in practice.
- We can only use it for classification problems like the “will they churn / won’t they churn” example above.
Using the profit curve to compare models
Let’s imagine we lose a feature, so we have a new model. How will this affect our profit curve?
field_lost_in_broken_pipeline = "Contract"
new_cols_to_use = [col for col in cols_to_use if col != field_lost_in_broken_pipeline]

# NOTE: this refits the same `pipeline` object, so after this cell it holds
# the model trained without the lost field.
pipeline.fit(X_train[new_cols_to_use], y_train)
new_y_pred = pipeline.predict_proba(X_test[new_cols_to_use])[:, 1]

# Separate names so the first model's metrics are not overwritten.
new_roc_auc = roc_auc_score(y_test, new_y_pred)
new_log_loss_val = log_loss(y_test, new_y_pred)

_, new_profits = profit_curve(y_test, new_y_pred, payoff_matrix)

plot_profit_curve(
    (thresholds, profits, "First model"),
    (thresholds, new_profits, "New model"),
    precision=10000,
)

new_best_thresh, new_best_profits = find_best_threshold_and_profit(thresholds, new_profits)
print(f"Ideal threshold is {new_best_thresh:.2f} which yields profits of £{new_best_profits:.0f}")
print(f"Change in profit due to lost field: £{new_best_profits - best_profits:.0f}")
Ideal threshold is 0.24 which yields profits of £30800
Change in profit due to lost field: £-1560
Beyond Simple-Number Decision Thresholds
When we have some customers who are worth a lot more than others, it makes sense to focus our attention and discounts on them. We may therefore want to set different thresholds for different groups of customers.
fig, ax = plt.subplots(figsize=(5, 3))
data["MonthlyCharges"].hist(ax=ax, bins=20, color="lightblue")

# Round the largest charge up to the next multiple of `precision` for the x-axis.
precision = 10
x_max = ((data["MonthlyCharges"].max() + precision - 1) // precision) * precision

# Set the limits to the tick range explicitly: set_xticks/set_yticks would
# otherwise expand the view to include the outermost ticks anyway.
ax.set_xlim(0, x_max)
ax.set_xticks(np.arange(0, x_max + 1, precision))
ax.set_xlabel("Monthly Charges")

ax.set_ylim(0, 1500)
ax.set_yticks(np.arange(0, 1501, 250))
ax.set_ylabel("Count")

ax.set_title("Distribution of Monthly Charges")
plt.show()
Let’s assume that the value of keeping a customer is 3 times their monthly charge. The cost of giving a discount to someone who wasn’t going to churn can stay at £80.
Let’s group customers into 4 groups based on the amount they pay per month. Each group will have a different payoff matrix based on the average monthly charge of the group.
def group_specific_profit_curve(
    X_test: pd.DataFrame,
    y_test: pd.Series,
    y_pred: np.ndarray,
    low_thresh: float,
    high_thresh: float,
    retention_multiplier: float = 3,
    discount_cost: float = 80,
):
    """Profit curve for customers whose monthly charge lies in a given band.

    Customers with ``low_thresh < MonthlyCharges <= high_thresh`` are selected.
    Their payoff matrix values a retained churner at ``retention_multiplier``
    times the group's mean monthly charge and charges ``discount_cost`` for a
    discount given to someone who would not have churned.

    Args:
        X_test (pd.DataFrame): Test features; must contain "MonthlyCharges".
        y_test (pd.Series): True labels aligned with ``X_test``.
        y_pred (np.ndarray): Predicted probabilities, in the same row order
            as ``X_test``.
        low_thresh (float): Exclusive lower bound of the charge band.
        high_thresh (float): Inclusive upper bound of the charge band.
        retention_multiplier (float): Multiple of the mean monthly charge
            earned by retaining a churner. Defaults to 3.
        discount_cost (float): Cost of an unnecessary discount. Defaults to 80.

    Returns:
        tuple: ``(thresholds, profits)`` as returned by ``profit_curve``.
    """
    monthly_charges = X_test["MonthlyCharges"]
    in_group = (monthly_charges > low_thresh) & (monthly_charges <= high_thresh)

    y_test_in_group = y_test[in_group]
    # Index the prediction array with a plain boolean mask.
    y_pred_in_group = y_pred[in_group.to_numpy()]

    # NOTE: an empty band gives a NaN mean here, and therefore a NaN payoff.
    mean_monthly_charge = monthly_charges[in_group].mean()
    payoff_matrix = np.array(
        [
            [0, 0],
            [-discount_cost, retention_multiplier * mean_monthly_charge],
        ]
    )

    thresholds, profits = profit_curve(y_test_in_group, y_pred_in_group, payoff_matrix)
    return thresholds, profits
# One profit curve per monthly-charge band: (0, 20], (20, 60], (60, 100], (100, 200].
thresholds, low_profits = group_specific_profit_curve(X_test, y_test, y_pred, 0, 20)
_, medium_profits = group_specific_profit_curve(X_test, y_test, y_pred, 20, 60)
_, high_profits = group_specific_profit_curve(X_test, y_test, y_pred, 60, 100)
_, very_high_profits = group_specific_profit_curve(X_test, y_test, y_pred, 100, 200)

# Element-wise sum: total profit if every band used the same threshold.
total_profits = np.array([low_profits, medium_profits, high_profits, very_high_profits]).sum(axis=0)
We can see from plotting the profit curve for each group that it makes sense to use different thresholds for different groups. Generally we want a lower threshold for higher value groups since the cost of losing them is greater.
We can also see that by doing this our total profit across all customers is higher than if we had used a single threshold for all customers.
plot_profit_curve(
    (thresholds, low_profits, "Low customer profits"),
    (thresholds, medium_profits, "Med customer profits"),
    (thresholds, high_profits, "High customer profits"),
    (thresholds, very_high_profits, "V high customer profits"),
    (thresholds, total_profits, "Total profits"),
    precision=10000,
)

# Best threshold and profit for each band separately.
thresh_low, profits_low = find_best_threshold_and_profit(thresholds, low_profits)
thresh_medium, profits_medium = find_best_threshold_and_profit(thresholds, medium_profits)
thresh_high, profits_high = find_best_threshold_and_profit(thresholds, high_profits)
thresh_very_high, profits_very_high = find_best_threshold_and_profit(thresholds, very_high_profits)

print(f"Ideal threshold for low customers: {thresh_low}")
print(f"Ideal threshold for medium customers: {thresh_medium}")
print(f"Ideal threshold for high customers: {thresh_high}")
print(f"Ideal threshold for very high customers: {thresh_very_high}")
Ideal threshold for low customers: 0.45
Ideal threshold for medium customers: 0.64
Ideal threshold for high customers: 0.19
Ideal threshold for very high customers: 0.07
# Compare the best single-threshold profit with the sum of the per-band optima.
print(
    f"""Profit with a single threshold: £{best_profits:.0f}
Profit with flexible thresholds: £{(profits_low + profits_medium + profits_high + profits_very_high):.0f}"""
)
Profit with a single threshold: £32360
Profit with flexible thresholds: £44919
References
- Machine Learning for Business Decision Optimization - Dan Becker