XGBoost

from pathlib import Path

import pandas as pd
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# The project root is taken to be two levels above the current working directory.
PROJECT_ROOT = Path.cwd().parent.parent
# Single source of truth for the dataset folder; joined with pathlib so the
# separator is always correct for the host OS.
DATA_DIR = PROJECT_ROOT / "data" / "rossmann-store-sales"

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns.
train_df = pd.read_csv(DATA_DIR / "train.csv", low_memory=False)
test_df = pd.read_csv(DATA_DIR / "test.csv", low_memory=False)
store_df = pd.read_csv(DATA_DIR / "store.csv", low_memory=False)
sample_submission_df = pd.read_csv(DATA_DIR / "sample_submission.csv", low_memory=False)
train_df
Store DayOfWeek Date Sales Customers Open Promo StateHoliday SchoolHoliday
0 1 5 2015-07-31 5263 555 1 1 0 1
1 2 5 2015-07-31 6064 625 1 1 0 1
2 3 5 2015-07-31 8314 821 1 1 0 1
3 4 5 2015-07-31 13995 1498 1 1 0 1
4 5 5 2015-07-31 4822 559 1 1 0 1
... ... ... ... ... ... ... ... ... ...
1017204 1111 2 2013-01-01 0 0 0 0 a 1
1017205 1112 2 2013-01-01 0 0 0 0 a 1
1017206 1113 2 2013-01-01 0 0 0 0 a 1
1017207 1114 2 2013-01-01 0 0 0 0 a 1
1017208 1115 2 2013-01-01 0 0 0 0 a 1

1017209 rows × 9 columns

# Display the raw test set.
test_df
Id Store DayOfWeek Date Open Promo StateHoliday SchoolHoliday
0 1 1 4 2015-09-17 1.0 1 0 0
1 2 3 4 2015-09-17 1.0 1 0 0
2 3 7 4 2015-09-17 1.0 1 0 0
3 4 8 4 2015-09-17 1.0 1 0 0
4 5 9 4 2015-09-17 1.0 1 0 0
... ... ... ... ... ... ... ... ...
41083 41084 1111 6 2015-08-01 1.0 0 0 0
41084 41085 1112 6 2015-08-01 1.0 0 0 0
41085 41086 1113 6 2015-08-01 1.0 0 0 0
41086 41087 1114 6 2015-08-01 1.0 0 0 0
41087 41088 1115 6 2015-08-01 1.0 0 0 1

41088 rows × 8 columns

# Display the per-store attribute table.
store_df
Store StoreType Assortment CompetitionDistance CompetitionOpenSinceMonth CompetitionOpenSinceYear Promo2 Promo2SinceWeek Promo2SinceYear PromoInterval
0 1 c a 1270.0 9.0 2008.0 0 NaN NaN NaN
1 2 a a 570.0 11.0 2007.0 1 13.0 2010.0 Jan,Apr,Jul,Oct
2 3 a a 14130.0 12.0 2006.0 1 14.0 2011.0 Jan,Apr,Jul,Oct
3 4 c c 620.0 9.0 2009.0 0 NaN NaN NaN
4 5 a a 29910.0 4.0 2015.0 0 NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ...
1110 1111 a a 1900.0 6.0 2014.0 1 31.0 2013.0 Jan,Apr,Jul,Oct
1111 1112 c c 1880.0 4.0 2006.0 0 NaN NaN NaN
1112 1113 a c 9260.0 NaN NaN 0 NaN NaN NaN
1113 1114 a c 870.0 NaN NaN 0 NaN NaN NaN
1114 1115 d c 5350.0 NaN NaN 1 22.0 2012.0 Mar,Jun,Sept,Dec

1115 rows × 10 columns

# Display the sample submission file.
sample_submission_df
Id Sales
0 1 0
1 2 0
2 3 0
3 4 0
4 5 0
... ... ...
41083 41084 0
41084 41085 0
41085 41086 0
41086 41087 0
41087 41088 0

41088 rows × 2 columns

# Attach the per-store attributes to every sales row. A left join on "Store"
# keeps each row of the sales frames even if a store has no match in store_df.
train_df = train_df.merge(store_df, how="left", on="Store")
test_df = test_df.merge(store_df, how="left", on="Store")
def split_date(df):
    """Parse ``Date`` and derive calendar feature columns from it.

    The ``Date`` column is converted to datetimes, then ``Year``, ``Month``,
    ``Day`` and ``WeekOfYear`` (ISO week number) are added. The frame is
    modified in place and also returned.
    """
    df["Date"] = pd.to_datetime(df["Date"])
    calendar = df["Date"].dt
    for column, field in (("Year", "year"), ("Month", "month"), ("Day", "day")):
        df[column] = getattr(calendar, field)
    # The ISO week comes from isocalendar() rather than a plain attribute.
    df["WeekOfYear"] = calendar.isocalendar().week
    return df
# Add the Year/Month/Day/WeekOfYear columns to both the training and test frames.
train_df = split_date(train_df)
test_df = split_date(test_df)
train_df
Store DayOfWeek Date Sales Customers Open Promo StateHoliday SchoolHoliday StoreType ... CompetitionOpenSinceMonth CompetitionOpenSinceYear Promo2 Promo2SinceWeek Promo2SinceYear PromoInterval Year Month Day WeekOfYear
0 1 5 2015-07-31 5263 555 1 1 0 1 c ... 9.0 2008.0 0 NaN NaN NaN 2015 7 31 31
1 2 5 2015-07-31 6064 625 1 1 0 1 a ... 11.0 2007.0 1 13.0 2010.0 Jan,Apr,Jul,Oct 2015 7 31 31
2 3 5 2015-07-31 8314 821 1 1 0 1 a ... 12.0 2006.0 1 14.0 2011.0 Jan,Apr,Jul,Oct 2015 7 31 31
3 4 5 2015-07-31 13995 1498 1 1 0 1 c ... 9.0 2009.0 0 NaN NaN NaN 2015 7 31 31
4 5 5 2015-07-31 4822 559 1 1 0 1 a ... 4.0 2015.0 0 NaN NaN NaN 2015 7 31 31
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1017204 1111 2 2013-01-01 0 0 0 0 a 1 a ... 6.0 2014.0 1 31.0 2013.0 Jan,Apr,Jul,Oct 2013 1 1 1
1017205 1112 2 2013-01-01 0 0 0 0 a 1 c ... 4.0 2006.0 0 NaN NaN NaN 2013 1 1 1
1017206 1113 2 2013-01-01 0 0 0 0 a 1 a ... NaN NaN 0 NaN NaN NaN 2013 1 1 1
1017207 1114 2 2013-01-01 0 0 0 0 a 1 a ... NaN NaN 0 NaN NaN NaN 2013 1 1 1
1017208 1115 2 2013-01-01 0 0 0 0 a 1 d ... NaN NaN 1 22.0 2012.0 Mar,Jun,Sept,Dec 2013 1 1 1

1017209 rows × 22 columns

# Keep only the training rows where the store was open (Open == 1).
# .copy() gives an independent frame, so the column assignments made later
# do not write into a slice of the unfiltered data.
train_df = train_df.query("Open == 1").copy()
train_df
Store DayOfWeek Date Sales Customers Open Promo StateHoliday SchoolHoliday StoreType ... CompetitionOpenSinceMonth CompetitionOpenSinceYear Promo2 Promo2SinceWeek Promo2SinceYear PromoInterval Year Month Day WeekOfYear
0 1 5 2015-07-31 5263 555 1 1 0 1 c ... 9.0 2008.0 0 NaN NaN NaN 2015 7 31 31
1 2 5 2015-07-31 6064 625 1 1 0 1 a ... 11.0 2007.0 1 13.0 2010.0 Jan,Apr,Jul,Oct 2015 7 31 31
2 3 5 2015-07-31 8314 821 1 1 0 1 a ... 12.0 2006.0 1 14.0 2011.0 Jan,Apr,Jul,Oct 2015 7 31 31
3 4 5 2015-07-31 13995 1498 1 1 0 1 c ... 9.0 2009.0 0 NaN NaN NaN 2015 7 31 31
4 5 5 2015-07-31 4822 559 1 1 0 1 a ... 4.0 2015.0 0 NaN NaN NaN 2015 7 31 31
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1016776 682 2 2013-01-01 3375 566 1 0 a 1 b ... 9.0 2006.0 0 NaN NaN NaN 2013 1 1 1
1016827 733 2 2013-01-01 10765 2377 1 0 a 1 b ... 10.0 1999.0 0 NaN NaN NaN 2013 1 1 1
1016863 769 2 2013-01-01 5035 1248 1 0 a 1 b ... NaN NaN 1 48.0 2012.0 Jan,Apr,Jul,Oct 2013 1 1 1
1017042 948 2 2013-01-01 4491 1039 1 0 a 1 b ... NaN NaN 0 NaN NaN NaN 2013 1 1 1
1017190 1097 2 2013-01-01 5961 1405 1 0 a 1 b ... 3.0 2002.0 0 NaN NaN NaN 2013 1 1 1

844392 rows × 22 columns

def comp_months(df):
    """Add ``CompetitionOpen``: months since the nearest competitor opened.

    The value is ``12 * (Year - CompetitionOpenSinceYear) +
    (Month - CompetitionOpenSinceMonth)``. Negative results (the competitor
    opened after the row's date) and missing results (no opening date
    recorded) are both stored as 0.

    The frame is modified in place and also returned.
    """
    months_open = 12 * (df["Year"] - df["CompetitionOpenSinceYear"]) + (
        df["Month"] - df["CompetitionOpenSinceMonth"]
    )
    # Vectorized clamp; clip() leaves NaN untouched, so it is zero-filled afterwards.
    df["CompetitionOpen"] = months_open.clip(lower=0).fillna(0)
    return df
# Add the CompetitionOpen column to both the training and test frames.
train_df = comp_months(train_df)
test_df = comp_months(test_df)
# Month number -> abbreviation as spelled in the PromoInterval strings
# (note "Sept", not "Sep"). Built once instead of on every row.
_MONTH_ABBREVIATIONS = {
    1: "Jan",
    2: "Feb",
    3: "Mar",
    4: "Apr",
    5: "May",
    6: "Jun",
    7: "Jul",
    8: "Aug",
    9: "Sept",
    10: "Oct",
    11: "Nov",
    12: "Dec",
}


def check_promo_month(row):
    """Return 1 if the row's month is one of its Promo2 interval months, else 0.

    ``row`` is a mapping-like object (e.g. a DataFrame row) providing
    ``PromoInterval`` (comma-separated month abbreviations, or a missing
    value), ``Promo2Open`` and ``Month`` (1-12).

    0 is returned when ``PromoInterval`` is missing or not a string, when
    ``Promo2Open`` is falsy, when ``Month`` is not a known month number, or
    when a required field is absent.
    """
    try:
        interval = row["PromoInterval"]
        # A missing interval arrives as NaN, which is truthy, so an
        # ``or ""`` fallback does not catch it; test the type explicitly.
        if not isinstance(interval, str):
            return 0
        if not row["Promo2Open"]:
            return 0
        current_month = _MONTH_ABBREVIATIONS.get(row["Month"])
    except (KeyError, TypeError, ValueError):
        # Best effort: absent or unusable fields count as "not a promo month".
        return 0
    return 1 if current_month in interval.split(",") else 0


def promo_cols(df):
    """Add the ``Promo2Open`` and ``IsPromo2Month`` columns to *df*.

    ``Promo2Open`` is the number of months since Promo2 started: the year
    difference times 12 plus the week difference converted to months
    (weeks * 7 / 30.5). Missing values become 0, negative values are raised
    to 0, and the result is multiplied by ``Promo2``.

    ``IsPromo2Month`` is the per-row result of ``check_promo_month``
    multiplied by ``Promo2``.

    The frame is modified in place and also returned.
    """

    def _floor_at_zero(months):
        return 0 if months < 0 else months

    elapsed_years = df.Year - df.Promo2SinceYear
    elapsed_weeks = df.WeekOfYear - df.Promo2SinceWeek
    df["Promo2Open"] = 12 * elapsed_years + elapsed_weeks * 7 / 30.5

    months_open = df["Promo2Open"].fillna(0).map(_floor_at_zero)
    df["Promo2Open"] = months_open * df["Promo2"]

    # check_promo_month reads the final Promo2Open value, so this must run last.
    promo_month_flags = df.apply(check_promo_month, axis=1)
    df["IsPromo2Month"] = promo_month_flags * df["Promo2"]
    return df
# Add the Promo2Open and IsPromo2Month columns to both the training and test frames.
train_df = promo_cols(train_df)
test_df = promo_cols(test_df)
train_df
Store DayOfWeek Date Sales Customers Open Promo StateHoliday SchoolHoliday StoreType ... Promo2SinceWeek Promo2SinceYear PromoInterval Year Month Day WeekOfYear CompetitionOpen Promo2Open IsPromo2Month
0 1 5 2015-07-31 5263 555 1 1 0 1 c ... NaN NaN NaN 2015 7 31 31 82.0 0.000000 0
1 2 5 2015-07-31 6064 625 1 1 0 1 a ... 13.0 2010.0 Jan,Apr,Jul,Oct 2015 7 31 31 92.0 64.131148 1
2 3 5 2015-07-31 8314 821 1 1 0 1 a ... 14.0 2011.0 Jan,Apr,Jul,Oct 2015 7 31 31 103.0 51.901639 1
3 4 5 2015-07-31 13995 1498 1 1 0 1 c ... NaN NaN NaN 2015 7 31 31 70.0 0.000000 0
4 5 5 2015-07-31 4822 559 1 1 0 1 a ... NaN NaN NaN 2015 7 31 31 3.0 0.000000 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1016776 682 2 2013-01-01 3375 566 1 0 a 1 b ... NaN NaN NaN 2013 1 1 1 76.0 0.000000 0
1016827 733 2 2013-01-01 10765 2377 1 0 a 1 b ... NaN NaN NaN 2013 1 1 1 159.0 0.000000 0
1016863 769 2 2013-01-01 5035 1248 1 0 a 1 b ... 48.0 2012.0 Jan,Apr,Jul,Oct 2013 1 1 1 0.0 1.213115 1
1017042 948 2 2013-01-01 4491 1039 1 0 a 1 b ... NaN NaN NaN 2013 1 1 1 0.0 0.000000 0
1017190 1097 2 2013-01-01 5961 1405 1 0 a 1 b ... NaN NaN NaN 2013 1 1 1 130.0 0.000000 0

844392 rows × 25 columns

# Columns used as model features: raw columns plus the derived ones added above.
input_cols = [
    "Store",
    "DayOfWeek",
    "Promo",
    "StateHoliday",
    "SchoolHoliday",
    "StoreType",
    "Assortment",
    "CompetitionDistance",
    "CompetitionOpen",
    "Day",
    "Month",
    "Year",
    "WeekOfYear",
    "Promo2",
    "Promo2Open",
    "IsPromo2Month",
]
# Column the model is trained to predict.
target_col = "Sales"
# Work on copies so preprocessing does not modify train_df / test_df.
inputs = train_df[input_cols].copy()
targets = train_df[target_col].copy()
test_inputs = test_df[input_cols].copy()
# Feature columns that are min-max scaled later on.
numeric_cols = [
    "Store",
    "Promo",
    "SchoolHoliday",
    "CompetitionDistance",
    "CompetitionOpen",
    "Promo2",
    "Promo2Open",
    "IsPromo2Month",
    "Day",
    "Month",
    "Year",
    "WeekOfYear",
]
# Feature columns that are one-hot encoded later on.
categorical_cols = ["DayOfWeek", "StateHoliday", "StoreType", "Assortment"]
# Count missing values per numeric column in the training features.
inputs[numeric_cols].isna().sum()
Store                     0
Promo                     0
SchoolHoliday             0
CompetitionDistance    2186
CompetitionOpen           0
Promo2                    0
Promo2Open                0
IsPromo2Month             0
Day                       0
Month                     0
Year                      0
WeekOfYear                0
dtype: int64
# Count missing values per numeric column in the test features.
test_inputs[numeric_cols].isna().sum()
Store                   0
Promo                   0
SchoolHoliday           0
CompetitionDistance    96
CompetitionOpen         0
Promo2                  0
Promo2Open              0
IsPromo2Month           0
Day                     0
Month                   0
Year                    0
WeekOfYear              0
dtype: int64
# Fill missing CompetitionDistance with the largest distance seen in the
# training data (presumably a missing value means no nearby competitor - confirm).
# The training maximum is reused for the test set.
max_distance = inputs["CompetitionDistance"].max()
for frame in (inputs, test_inputs):
    frame["CompetitionDistance"] = frame["CompetitionDistance"].fillna(max_distance)

# Fit the scaler on the training features only, then apply it to both frames.
scaler = MinMaxScaler()
scaler.fit(inputs[numeric_cols])
for frame in (inputs, test_inputs):
    frame[numeric_cols] = scaler.transform(frame[numeric_cols])

# One-hot encode the categorical columns. Categories not seen during fitting
# are encoded as all zeros (handle_unknown="ignore").
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
encoder.fit(inputs[categorical_cols])
encoded_cols = list(encoder.get_feature_names_out(categorical_cols))
for frame in (inputs, test_inputs):
    frame[encoded_cols] = encoder.transform(frame[categorical_cols])

# Final model matrices: the scaled numeric columns followed by the one-hot columns.
feature_cols = numeric_cols + encoded_cols
X = inputs[feature_cols]
X_test = test_inputs[feature_cols]