XGBoost

from pathlib import Path

import pandas as pd
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

PROJECT_ROOT = Path.cwd().parent.parent

train_df = pd.read_csv(f"{PROJECT_ROOT}/data/rossmann-store-sales/train.csv", low_memory=False)
test_df = pd.read_csv(f"{PROJECT_ROOT}/data/rossmann-store-sales/test.csv", low_memory=False)
store_df = pd.read_csv(f"{PROJECT_ROOT}/data/rossmann-store-sales/store.csv", low_memory=False)
sample_submission_df = pd.read_csv(f"{PROJECT_ROOT}/data/rossmann-store-sales/sample_submission.csv", low_memory=False)

train_df

	Store	DayOfWeek	Date	Sales	Customers	Open	Promo	StateHoliday	SchoolHoliday
0	1	5	2015-07-31	5263	555	1	1	0	1
1	2	5	2015-07-31	6064	625	1	1	0	1
2	3	5	2015-07-31	8314	821	1	1	0	1
3	4	5	2015-07-31	13995	1498	1	1	0	1
4	5	5	2015-07-31	4822	559	1	1	0	1
...	...	...	...	...	...	...	...	...	...
1017204	1111	2	2013-01-01	0	0	0	0	a	1
1017205	1112	2	2013-01-01	0	0	0	0	a	1
1017206	1113	2	2013-01-01	0	0	0	0	a	1
1017207	1114	2	2013-01-01	0	0	0	0	a	1
1017208	1115	2	2013-01-01	0	0	0	0	a	1

1017209 rows × 9 columns

test_df

	Id	Store	DayOfWeek	Date	Open	Promo	StateHoliday	SchoolHoliday
0	1	1	4	2015-09-17	1.0	1	0	0
1	2	3	4	2015-09-17	1.0	1	0	0
2	3	7	4	2015-09-17	1.0	1	0	0
3	4	8	4	2015-09-17	1.0	1	0	0
4	5	9	4	2015-09-17	1.0	1	0	0
...	...	...	...	...	...	...	...	...
41083	41084	1111	6	2015-08-01	1.0	0	0	0
41084	41085	1112	6	2015-08-01	1.0	0	0	0
41085	41086	1113	6	2015-08-01	1.0	0	0	0
41086	41087	1114	6	2015-08-01	1.0	0	0	0
41087	41088	1115	6	2015-08-01	1.0	0	0	1

41088 rows × 8 columns

store_df

	Store	StoreType	Assortment	CompetitionDistance	CompetitionOpenSinceMonth	CompetitionOpenSinceYear	Promo2	Promo2SinceWeek	Promo2SinceYear	PromoInterval
0	1	c	a	1270.0	9.0	2008.0	0	NaN	NaN	NaN
1	2	a	a	570.0	11.0	2007.0	1	13.0	2010.0	Jan,Apr,Jul,Oct
2	3	a	a	14130.0	12.0	2006.0	1	14.0	2011.0	Jan,Apr,Jul,Oct
3	4	c	c	620.0	9.0	2009.0	0	NaN	NaN	NaN
4	5	a	a	29910.0	4.0	2015.0	0	NaN	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...
1110	1111	a	a	1900.0	6.0	2014.0	1	31.0	2013.0	Jan,Apr,Jul,Oct
1111	1112	c	c	1880.0	4.0	2006.0	0	NaN	NaN	NaN
1112	1113	a	c	9260.0	NaN	NaN	0	NaN	NaN	NaN
1113	1114	a	c	870.0	NaN	NaN	0	NaN	NaN	NaN
1114	1115	d	c	5350.0	NaN	NaN	1	22.0	2012.0	Mar,Jun,Sept,Dec

1115 rows × 10 columns

sample_submission_df

	Id	Sales
0	1	0
1	2	0
2	3	0
3	4	0
4	5	0
...	...	...
41083	41084	0
41084	41085	0
41085	41086	0
41086	41087	0
41087	41088	0

41088 rows × 2 columns

train_df = pd.merge(train_df, store_df, on="Store", how="left")
test_df = pd.merge(test_df, store_df, on="Store", how="left")

def split_date(df):
    df["Date"] = pd.to_datetime(df["Date"])
    df["Year"] = df.Date.dt.year
    df["Month"] = df.Date.dt.month
    df["Day"] = df.Date.dt.day
    df["WeekOfYear"] = df.Date.dt.isocalendar().week
    return df

train_df = split_date(train_df)
test_df = split_date(test_df)

train_df

	Store	DayOfWeek	Date	Sales	Customers	Open	Promo	StateHoliday	SchoolHoliday	StoreType	...	CompetitionOpenSinceMonth	CompetitionOpenSinceYear	Promo2	Promo2SinceWeek	Promo2SinceYear	PromoInterval	Year	Month	Day	WeekOfYear
0	1	5	2015-07-31	5263	555	1	1	0	1	c	...	9.0	2008.0	0	NaN	NaN	NaN	2015	7	31	31
1	2	5	2015-07-31	6064	625	1	1	0	1	a	...	11.0	2007.0	1	13.0	2010.0	Jan,Apr,Jul,Oct	2015	7	31	31
2	3	5	2015-07-31	8314	821	1	1	0	1	a	...	12.0	2006.0	1	14.0	2011.0	Jan,Apr,Jul,Oct	2015	7	31	31
3	4	5	2015-07-31	13995	1498	1	1	0	1	c	...	9.0	2009.0	0	NaN	NaN	NaN	2015	7	31	31
4	5	5	2015-07-31	4822	559	1	1	0	1	a	...	4.0	2015.0	0	NaN	NaN	NaN	2015	7	31	31
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1017204	1111	2	2013-01-01	0	0	0	0	a	1	a	...	6.0	2014.0	1	31.0	2013.0	Jan,Apr,Jul,Oct	2013	1	1	1
1017205	1112	2	2013-01-01	0	0	0	0	a	1	c	...	4.0	2006.0	0	NaN	NaN	NaN	2013	1	1	1
1017206	1113	2	2013-01-01	0	0	0	0	a	1	a	...	NaN	NaN	0	NaN	NaN	NaN	2013	1	1	1
1017207	1114	2	2013-01-01	0	0	0	0	a	1	a	...	NaN	NaN	0	NaN	NaN	NaN	2013	1	1	1
1017208	1115	2	2013-01-01	0	0	0	0	a	1	d	...	NaN	NaN	1	22.0	2012.0	Mar,Jun,Sept,Dec	2013	1	1	1

1017209 rows × 22 columns

train_df = train_df.query("Open == 1").copy()

train_df

	Store	DayOfWeek	Date	Sales	Customers	Open	Promo	StateHoliday	SchoolHoliday	StoreType	...	CompetitionOpenSinceMonth	CompetitionOpenSinceYear	Promo2	Promo2SinceWeek	Promo2SinceYear	PromoInterval	Year	Month	Day	WeekOfYear
0	1	5	2015-07-31	5263	555	1	1	0	1	c	...	9.0	2008.0	0	NaN	NaN	NaN	2015	7	31	31
1	2	5	2015-07-31	6064	625	1	1	0	1	a	...	11.0	2007.0	1	13.0	2010.0	Jan,Apr,Jul,Oct	2015	7	31	31
2	3	5	2015-07-31	8314	821	1	1	0	1	a	...	12.0	2006.0	1	14.0	2011.0	Jan,Apr,Jul,Oct	2015	7	31	31
3	4	5	2015-07-31	13995	1498	1	1	0	1	c	...	9.0	2009.0	0	NaN	NaN	NaN	2015	7	31	31
4	5	5	2015-07-31	4822	559	1	1	0	1	a	...	4.0	2015.0	0	NaN	NaN	NaN	2015	7	31	31
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1016776	682	2	2013-01-01	3375	566	1	0	a	1	b	...	9.0	2006.0	0	NaN	NaN	NaN	2013	1	1	1
1016827	733	2	2013-01-01	10765	2377	1	0	a	1	b	...	10.0	1999.0	0	NaN	NaN	NaN	2013	1	1	1
1016863	769	2	2013-01-01	5035	1248	1	0	a	1	b	...	NaN	NaN	1	48.0	2012.0	Jan,Apr,Jul,Oct	2013	1	1	1
1017042	948	2	2013-01-01	4491	1039	1	0	a	1	b	...	NaN	NaN	0	NaN	NaN	NaN	2013	1	1	1
1017190	1097	2	2013-01-01	5961	1405	1	0	a	1	b	...	3.0	2002.0	0	NaN	NaN	NaN	2013	1	1	1

844392 rows × 22 columns

def comp_months(df):
    df["CompetitionOpen"] = 12 * (df["Year"] - df["CompetitionOpenSinceYear"]) + (
        df["Month"] - df["CompetitionOpenSinceMonth"]
    )
    df["CompetitionOpen"] = df["CompetitionOpen"].map(lambda x: 0 if x < 0 else x).fillna(0)
    return df

train_df = comp_months(train_df)
test_df = comp_months(test_df)

def check_promo_month(row):
    month2str = {
        1: "Jan",
        2: "Feb",
        3: "Mar",
        4: "Apr",
        5: "May",
        6: "Jun",
        7: "Jul",
        8: "Aug",
        9: "Sept",
        10: "Oct",
        11: "Nov",
        12: "Dec",
    }
    try:
        # given input row the promointerval column is indexed and split by ","
        months = (row["PromoInterval"] or "").split(",")
        # if the row indexed promo2open column and the row month are in months you return 1
        if row["Promo2Open"] and month2str[row["Month"]] in months:
            return 1
        else:
            return 0
    except Exception:
        return 0


def promo_cols(df):
    # Months since Promo2 was open
    df["Promo2Open"] = 12 * (df.Year - df.Promo2SinceYear) + (df.WeekOfYear - df.Promo2SinceWeek) * 7 / 30.5
    df["Promo2Open"] = df["Promo2Open"].fillna(0).map(lambda x: 0 if x < 0 else x) * df["Promo2"]
    # Whether a new round of promotions was started in the current month
    df["IsPromo2Month"] = df.apply(check_promo_month, axis=1) * df["Promo2"]
    return df

train_df = promo_cols(train_df)
test_df = promo_cols(test_df)

train_df

	Store	DayOfWeek	Date	Sales	Customers	Open	Promo	StateHoliday	SchoolHoliday	StoreType	...	Promo2SinceWeek	Promo2SinceYear	PromoInterval	Year	Month	Day	WeekOfYear	CompetitionOpen	Promo2Open	IsPromo2Month
0	1	5	2015-07-31	5263	555	1	1	0	1	c	...	NaN	NaN	NaN	2015	7	31	31	82.0	0.000000	0
1	2	5	2015-07-31	6064	625	1	1	0	1	a	...	13.0	2010.0	Jan,Apr,Jul,Oct	2015	7	31	31	92.0	64.131148	1
2	3	5	2015-07-31	8314	821	1	1	0	1	a	...	14.0	2011.0	Jan,Apr,Jul,Oct	2015	7	31	31	103.0	51.901639	1
3	4	5	2015-07-31	13995	1498	1	1	0	1	c	...	NaN	NaN	NaN	2015	7	31	31	70.0	0.000000	0
4	5	5	2015-07-31	4822	559	1	1	0	1	a	...	NaN	NaN	NaN	2015	7	31	31	3.0	0.000000	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1016776	682	2	2013-01-01	3375	566	1	0	a	1	b	...	NaN	NaN	NaN	2013	1	1	1	76.0	0.000000	0
1016827	733	2	2013-01-01	10765	2377	1	0	a	1	b	...	NaN	NaN	NaN	2013	1	1	1	159.0	0.000000	0
1016863	769	2	2013-01-01	5035	1248	1	0	a	1	b	...	48.0	2012.0	Jan,Apr,Jul,Oct	2013	1	1	1	0.0	1.213115	1
1017042	948	2	2013-01-01	4491	1039	1	0	a	1	b	...	NaN	NaN	NaN	2013	1	1	1	0.0	0.000000	0
1017190	1097	2	2013-01-01	5961	1405	1	0	a	1	b	...	NaN	NaN	NaN	2013	1	1	1	130.0	0.000000	0

844392 rows × 25 columns

input_cols = [
    "Store",
    "DayOfWeek",
    "Promo",
    "StateHoliday",
    "SchoolHoliday",
    "StoreType",
    "Assortment",
    "CompetitionDistance",
    "CompetitionOpen",
    "Day",
    "Month",
    "Year",
    "WeekOfYear",
    "Promo2",
    "Promo2Open",
    "IsPromo2Month",
]
target_col = "Sales"

inputs = train_df[input_cols].copy()
targets = train_df[target_col].copy()

test_inputs = test_df[input_cols].copy()

numeric_cols = [
    "Store",
    "Promo",
    "SchoolHoliday",
    "CompetitionDistance",
    "CompetitionOpen",
    "Promo2",
    "Promo2Open",
    "IsPromo2Month",
    "Day",
    "Month",
    "Year",
    "WeekOfYear",
]
categorical_cols = ["DayOfWeek", "StateHoliday", "StoreType", "Assortment"]

inputs[numeric_cols].isna().sum()

Store                     0
Promo                     0
SchoolHoliday             0
CompetitionDistance    2186
CompetitionOpen           0
Promo2                    0
Promo2Open                0
IsPromo2Month             0
Day                       0
Month                     0
Year                      0
WeekOfYear                0
dtype: int64

test_inputs[numeric_cols].isna().sum()

Store                   0
Promo                   0
SchoolHoliday           0
CompetitionDistance    96
CompetitionOpen         0
Promo2                  0
Promo2Open              0
IsPromo2Month           0
Day                     0
Month                   0
Year                    0
WeekOfYear              0
dtype: int64

max_distance = inputs["CompetitionDistance"].max()

inputs["CompetitionDistance"] = inputs["CompetitionDistance"].fillna(max_distance)
test_inputs["CompetitionDistance"] = test_inputs["CompetitionDistance"].fillna(max_distance)

scaler = MinMaxScaler().fit(inputs[numeric_cols])

inputs[numeric_cols] = scaler.transform(inputs[numeric_cols])
test_inputs[numeric_cols] = scaler.transform(test_inputs[numeric_cols])

encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore").fit(inputs[categorical_cols])
encoded_cols = list(encoder.get_feature_names_out(categorical_cols))

inputs[encoded_cols] = encoder.transform(inputs[categorical_cols])
test_inputs[encoded_cols] = encoder.transform(test_inputs[categorical_cols])

X = inputs[numeric_cols + encoded_cols]
X_test = test_inputs[numeric_cols + encoded_cols]