Profit Curves

from pathlib import Path
from typing import Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.metrics import confusion_matrix, roc_auc_score, log_loss, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier

pd.set_option("display.max_columns", None)

PROJECT_ROOT = Path.cwd().parent.parent

plt.rcParams["figure.facecolor"] = (1, 1, 1, 0)  # RGBA tuple with alpha=0
plt.rcParams["axes.facecolor"] = (1, 1, 1, 0)  # RGBA tuple with alpha=0

Getting the data

The dataset is from the Telco Customer Churn Kaggle Competition. The target is the simple binary outcome in the Churn column.

data = pd.read_csv(f"{PROJECT_ROOT}/data/WA_Fn-UseC_-Telco-Customer-Churn.csv", low_memory=False)

data

	customerID	gender	SeniorCitizen	Partner	Dependents	tenure	PhoneService	MultipleLines	InternetService	OnlineSecurity	OnlineBackup	DeviceProtection	TechSupport	StreamingTV	StreamingMovies	Contract	PaperlessBilling	PaymentMethod	MonthlyCharges	TotalCharges	Churn
0	7590-VHVEG	Female	0	Yes	No	1	No	No phone service	DSL	No	Yes	No	No	No	No	Month-to-month	Yes	Electronic check	29.85	29.85	No
1	5575-GNVDE	Male	0	No	No	34	Yes	No	DSL	Yes	No	Yes	No	No	No	One year	No	Mailed check	56.95	1889.5	No
2	3668-QPYBK	Male	0	No	No	2	Yes	No	DSL	Yes	Yes	No	No	No	No	Month-to-month	Yes	Mailed check	53.85	108.15	Yes
3	7795-CFOCW	Male	0	No	No	45	No	No phone service	DSL	Yes	No	Yes	Yes	No	No	One year	No	Bank transfer (automatic)	42.30	1840.75	No
4	9237-HQITU	Female	0	No	No	2	Yes	No	Fiber optic	No	No	No	No	No	No	Month-to-month	Yes	Electronic check	70.70	151.65	Yes
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
7038	6840-RESVB	Male	0	Yes	Yes	24	Yes	Yes	DSL	Yes	No	Yes	Yes	Yes	Yes	One year	Yes	Mailed check	84.80	1990.5	No
7039	2234-XADUH	Female	0	Yes	Yes	72	Yes	Yes	Fiber optic	No	Yes	Yes	No	Yes	Yes	One year	Yes	Credit card (automatic)	103.20	7362.9	No
7040	4801-JZAZL	Female	0	Yes	Yes	11	No	No phone service	DSL	Yes	No	No	No	No	No	Month-to-month	Yes	Electronic check	29.60	346.45	No
7041	8361-LTMKD	Male	1	Yes	No	4	Yes	Yes	Fiber optic	No	No	No	No	No	No	Month-to-month	Yes	Mailed check	74.40	306.6	Yes
7042	3186-AJIEK	Male	0	No	No	66	Yes	No	Fiber optic	Yes	No	Yes	Yes	Yes	Yes	Two year	Yes	Bank transfer (automatic)	105.65	6844.5	No

7043 rows × 21 columns

target = "Churn"

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(target, axis=1), data[target] == "Yes", test_size=0.2, random_state=0
)
cols_to_use = [
    "tenure",
    "PhoneService",
    "MultipleLines",
    "InternetService",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
    "Contract",
    "PaperlessBilling",
    "PaymentMethod",
    "MonthlyCharges",
]

Training a model

preprocessor = ColumnTransformer(
    transformers=[
        (
            "one_hot",
            OneHotEncoder(),
            make_column_selector(dtype_include="object"),
        ),
    ],
    remainder="passthrough",  # Leave numerical variables unchanged
)

pipeline = Pipeline([("preprocessor", preprocessor), ("classifier", XGBClassifier())])
pipeline.fit(X_train[cols_to_use], y_train)
y_pred = pipeline.predict_proba(X_test[cols_to_use])[:, 1]
roc_auc = roc_auc_score(y_test, y_pred)
log_loss_val = log_loss(y_test, y_pred)

print(f"ROC AUC Score: {roc_auc_score(y_test, y_pred)}")
print(f"GINI: {2 * roc_auc_score(y_test, y_pred) - 1}")
print(f"Log loss: {log_loss(y_test, y_pred)}")

ROC AUC Score: 0.8152565467986469
GINI: 0.6305130935972938
Log loss: 0.4718305882942505

Plotting our predictions

We can plot the distribution of the model’s predictions like so

fig, ax = plt.subplots(figsize=(5, 3))
ax.hist(y_pred, bins=100, label="Predictions")
ax.set_xlim(0, 1)
ax.set_xlabel("Predicted probability")
ax.set_ylabel("Frequency")
ax.set_title("Histogram of predicted probabilities")
plt.show()

Picking a decision threshold from here is not always obvious. We can see the trade offs by plotting confusion matrices for a range of thresholds.

fig, axes = plt.subplots(1, 3, figsize=(9, 3), sharey=True)
for i, threshold in enumerate([0.1, 0.5, 0.9]):
    cm = confusion_matrix(y_test, (y_pred > threshold).astype(int))
    ConfusionMatrixDisplay(cm, display_labels=pipeline.classes_).plot(ax=axes[i], colorbar=False)
    if i != 0:
        axes[i].set_ylabel(None)
    axes[i].set_title(f"Threshold: {threshold}")
plt.show()

The payoff matrix

We can use a payoff matrix to help us decide which threshold to use. This is a table that shows how much retaining a customer is worth to us, and how much it costs us to retain them.

For instance, say the matrix looks like the one below:

payoff_matrix = np.array(
    [
        [0, 0],
        [-80, 200],
    ]
)

fig, ax = plt.subplots(figsize=(5, 3))
norm = plt.Normalize(vmin=payoff_matrix.min(), vmax=payoff_matrix.max())
colormap = plt.cm.viridis

ax.imshow(payoff_matrix, cmap="viridis", norm=norm)
ax.set_xticks(np.arange(2), ["False", "True"])
ax.set_xlabel("Would have churned")
ax.set_yticks(np.arange(2), ["False", "True"])
ax.set_ylabel("Were given discount")
ax.set_title("Payoff Matrix (£)")

for i in range(payoff_matrix.shape[0]):
    for j in range(payoff_matrix.shape[1]):
        cell_value = payoff_matrix[i, j]
        text_color = "black" if np.array(colormap(norm(cell_value))[:3]).mean() > 0.5 else "white"
        ax.text(j, i, cell_value, ha="center", va="center", color=text_color)

plt.show()

This matrix says that retaining a customer who was going to churn by giving them a discount is woth £200 to us. It also says that giving a customer who wasn’t going to church a discount costs us £80. The payoffs for the scenarios where we don’t give a discount are set to 0.

Let’s combine our confusion and payoff matrices to see how much deploying this model could be worth.

If we used the 0.5 threshold we would end up with:

\[ \begin{array}{cccc} & 189 & \times & 200 & \\ + & 179 & \times & -80 & \\ \hline = & 32430 & \\ \end{array} \]

So £23,480 of value created within our test set.

The profit curve

How do we know that the threshold we chose was the most profitable one? We can plot the profit curve to see how the profit changes as we change the threshold.

def profit_curve(y_true: np.ndarray, y_pred: np.ndarray, payoff_matrix: np.ndarray, n_points: int = 101) -> Tuple:
    """
    Calculate profit curve for a binary classifier.

    Args:
        y_true (array-like): True labels.
        y_pred (array-like): Predicted probabilities.
        payoff_matrix (array-like): Payoff matrix.
        n_points (int): Number of points to calculate.

    Returns:
        tuple: x and y values for the profit curve.
    """

    thresholds = np.linspace(0, 1, n_points)
    profits = []
    for threshold in thresholds:
        y_pred_binary = np.where(y_pred >= threshold, 1, 0)
        cm = confusion_matrix(y_true, y_pred_binary)
        profit = (cm * payoff_matrix.T).sum()
        profits.append(profit)
    return thresholds, profits

thresholds, profits = profit_curve(y_test, y_pred, payoff_matrix)

def plot_profit_curve(*data, precision: int):
    fig, ax = plt.subplots(figsize=(5, 3))

    for tup in data:
        thresholds, profits, label = tup

        ax.plot(thresholds, profits, label=label)

        ax.set_xlim([0, 1])
        ax.set_xticks(np.arange(0, 1.1, 0.1))
        ax.set_xlabel("Threshold")

        y_min = (min(profits) // precision) * precision
        y_max = ((max(profits) + precision - 1) // precision) * precision
        ax.set_ylim([y_min, y_max])
        ax.set_yticks(np.arange(y_min, y_max + 1, precision))
        ax.set_ylabel("Profit")

    ax.grid(True)
    ax.legend(loc="best")
    ax.set_title("Profit Curve")

    plt.show()

plot_profit_curve((thresholds, profits, "Model 1"), precision=10000)

def find_best_threshold_and_profit(thresholds, profits):
    """Find the best threshold and profit.

    Args:
        thresholds (array-like): Threshold values.
        profits (array-like): Profit values.

    Returns:
        tuple: Best threshold and profit.
    """
    best_profit = max(profits)
    best_threshold = thresholds[profits.index(best_profit)]
    return best_threshold, best_profit

best_thresh, best_profits = find_best_threshold_and_profit(thresholds, profits)
print(f"Ideal threshold is {best_thresh:.2f} which yields profits of £{best_profits:.0f}")

Ideal threshold is 0.22 which yields profits of £32360

Limitations

Some limitations of the profit curve:

It depends upon us knowing the value of true and false positive predictions. If your payoff matrix is meaningless, your profit curve will be too.
It assumes that all customers are worth roughly the same. This may not be true in practice.
We can only use it for classification problems like the “will they churn / won’t they churn” example above.

Using the profit curve to compare models

Let’s image with lose a feature so we have a new model. How will this affect our profit curve?

field_lost_in_broken_pipeline = "Contract"

new_cols_to_use = [col for col in cols_to_use if col != field_lost_in_broken_pipeline]
pipeline.fit(X_train[new_cols_to_use], y_train)
new_y_pred = pipeline.predict_proba(X_test[new_cols_to_use])[:, 1]

roc_auc = roc_auc_score(y_test, new_y_pred)
log_loss_val = log_loss(y_test, new_y_pred)

_, new_profits = profit_curve(y_test, new_y_pred, payoff_matrix)

plot_profit_curve(
    (thresholds, profits, "First model"),
    (thresholds, new_profits, "New model"),
    precision=10000,
)

new_best_thresh, new_best_profits = find_best_threshold_and_profit(thresholds, new_profits)
print(f"Ideal threshold is {new_best_thresh:.2f} which yields profits of £{new_best_profits:.0f}")
print(f"Change in profit due to lost field: £{new_best_profits - best_profits:.0f}")

Ideal threshold is 0.24 which yields profits of £30800
Change in profit due to lost field: £-1560

Beyond Simple-Number Decision Thresholds

When we have some customers who are worth a lot more than others, it makes sense to focus our attention and discounts on them. We may therefore want to set different thesholds for different groups of customers.

fig, ax = plt.subplots(figsize=(5, 3))
data["MonthlyCharges"].hist(ax=ax, bins=20, color="lightblue")
precision = 10

x_max = ((data["MonthlyCharges"].max() + precision - 1) // precision) * precision
ax.set_xlim(0, data["MonthlyCharges"].max())
ax.set_xticks(np.arange(0, x_max + 1, precision))
ax.set_xlabel("Monthly Charges")

ax.set_ylim(0, 1200)
ax.set_yticks(np.arange(0, 1501, 250))
ax.set_ylabel("Count")

ax.set_title("Distribution of Monthly Charges")
plt.show()

Let’s assume that the value of keeping a customer is 3 times their monthly charge. The cost of giving a discount to someone who wasn’t going to churn can stay at £80.

Let’s group customers in 3 groups based on the amount they pay per month. Each group will have a different payoff matrix based on the average monthly charge of the group.

def group_specific_profit_curve(
    X_test: pd.DataFrame, y_test: pd.Series, y_pred: np.ndarray, low_thresh: float, high_thresh: float
):
    in_group = (X_test["MonthlyCharges"] > low_thresh) & (X_test["MonthlyCharges"] <= high_thresh)
    y_test_in_group = y_test[in_group]
    y_pred_in_group = y_pred[in_group]
    mean_monthly_charge = X_test["MonthlyCharges"][in_group].mean()
    payoff_matrix = np.array(
        [
            [0, 0],
            [-80, 3 * mean_monthly_charge],
        ]
    )
    thresholds, profits = profit_curve(y_test_in_group, y_pred_in_group, payoff_matrix)
    return thresholds, profits

thresholds, low_profits = group_specific_profit_curve(X_test, y_test, y_pred, 0, 20)
_, medium_profits = group_specific_profit_curve(X_test, y_test, y_pred, 20, 60)
_, high_profits = group_specific_profit_curve(X_test, y_test, y_pred, 60, 100)
_, very_high_profits = group_specific_profit_curve(X_test, y_test, y_pred, 100, 200)

total_profits = np.array([low_profits, medium_profits, high_profits, very_high_profits]).sum(axis=0)

We can see from plotting the profit curve for each group that it makes sense to use different thresholds for different groups. Generally we want a lower threshold for higher value groups since the cost of losing them is greater.

We can also see that by doing this our total profit across all customers is higher than if we had used a single threshold for all customers.

plot_profit_curve(
    (thresholds, low_profits, "Low customer profits"),
    (thresholds, medium_profits, "Med customer profits"),
    (thresholds, high_profits, "High customer profits"),
    (thresholds, very_high_profits, "V high customer profits"),
    (thresholds, total_profits, "Total profits"),
    precision=10000,
)

thresh_low, profits_low = find_best_threshold_and_profit(thresholds, low_profits)
thresh_medium, profits_medium = find_best_threshold_and_profit(thresholds, medium_profits)
thresh_high, profits_high = find_best_threshold_and_profit(thresholds, high_profits)
thresh_very_high, profits_very_high = find_best_threshold_and_profit(thresholds, very_high_profits)

print(f"Ideal threshold for low customers: {thresh_low}")
print(f"Ideal threshold for medium customers: {thresh_medium}")
print(f"Ideal threshold for high customers: {thresh_high}")
print(f"Ideal threshold for very high customers: {thresh_very_high}")

Ideal threshold for low customers: 0.45
Ideal threshold for medium customers: 0.64
Ideal threshold for high customers: 0.19
Ideal threshold for very high customers: 0.07

print(
    f"""Profit with a signle threshold: £{best_profits}
Profit with flexible thresholds: £{(profits_low + profits_medium + profits_high + profits_very_high):.0f}"""
)

Profit with a signle threshold: £32360
Profit with flexible thresholds: £44919

References

Machine Learning for Business Decision Optimization - Dan Becker