Machine Learning Example Code
Business Problem: How to improve customer retention and predict a return customer.
Dataset: Hotel_bookings.csv
Note: This code exemplifies my pre-processing with python as well as the use of logistic regression.
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
df = pd.read_csv('hotel_bookings.csv')
In [3]:
df.head()
Out[3]:
| hotel | is_canceled | lead_time | arrival_date_year | arrival_date_month | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | ... | deposit_type | agent | company | days_in_waiting_list | customer_type | adr | required_car_parking_spaces | total_of_special_requests | reservation_status | reservation_status_date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Resort Hotel | 0 | 342 | 2015 | July | 27 | 1 | 0 | 0 | 2 | ... | No Deposit | NaN | NaN | 0 | Transient | 0.0 | 0 | 0 | Check-Out | 2015-07-01 |
| 1 | Resort Hotel | 0 | 737 | 2015 | July | 27 | 1 | 0 | 0 | 2 | ... | No Deposit | NaN | NaN | 0 | Transient | 0.0 | 0 | 0 | Check-Out | 2015-07-01 |
| 2 | Resort Hotel | 0 | 7 | 2015 | July | 27 | 1 | 0 | 1 | 1 | ... | No Deposit | NaN | NaN | 0 | Transient | 75.0 | 0 | 0 | Check-Out | 2015-07-02 |
| 3 | Resort Hotel | 0 | 13 | 2015 | July | 27 | 1 | 0 | 1 | 1 | ... | No Deposit | 304.0 | NaN | 0 | Transient | 75.0 | 0 | 0 | Check-Out | 2015-07-02 |
| 4 | Resort Hotel | 0 | 14 | 2015 | July | 27 | 1 | 0 | 2 | 2 | ... | No Deposit | 240.0 | NaN | 0 | Transient | 98.0 | 0 | 1 | Check-Out | 2015-07-03 |
5 rows × 32 columns
In [4]:
df.columns
Out[4]:
Index(['hotel', 'is_canceled', 'lead_time', 'arrival_date_year',
'arrival_date_month', 'arrival_date_week_number',
'arrival_date_day_of_month', 'stays_in_weekend_nights',
'stays_in_week_nights', 'adults', 'children', 'babies', 'meal',
'country', 'market_segment', 'distribution_channel',
'is_repeated_guest', 'previous_cancellations',
'previous_bookings_not_canceled', 'reserved_room_type',
'assigned_room_type', 'booking_changes', 'deposit_type', 'agent',
'company', 'days_in_waiting_list', 'customer_type', 'adr',
'required_car_parking_spaces', 'total_of_special_requests',
'reservation_status', 'reservation_status_date'],
dtype='object')
In [5]:
df.shape
Out[5]:
(119390, 32)
In [6]:
pd.set_option('display.max_columns', None)
In [7]:
pd.isna(df).describe()
Out[7]:
| hotel | is_canceled | lead_time | arrival_date_year | arrival_date_month | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | children | babies | meal | country | market_segment | distribution_channel | is_repeated_guest | previous_cancellations | previous_bookings_not_canceled | reserved_room_type | assigned_room_type | booking_changes | deposit_type | agent | company | days_in_waiting_list | customer_type | adr | required_car_parking_spaces | total_of_special_requests | reservation_status | reservation_status_date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 119390 | 119390 | 119390 | 119390 | 119390 | 119390 | 119390 | 119390 | 119390 | 119390 | 119390 | 119390 | 119390 | 119390 | 119390 | 119390 | 119390 | 119390 | 119390 | 119390 | 119390 | 119390 | 119390 | 119390 | 119390 | 119390 | 119390 | 119390 | 119390 | 119390 | 119390 | 119390 |
| unique | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 2 | 1 | 1 | 2 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 2 | 2 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| top | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False |
| freq | 119390 | 119390 | 119390 | 119390 | 119390 | 119390 | 119390 | 119390 | 119390 | 119390 | 119386 | 119390 | 119390 | 118902 | 119390 | 119390 | 119390 | 119390 | 119390 | 119390 | 119390 | 119390 | 119390 | 103050 | 112593 | 119390 | 119390 | 119390 | 119390 | 119390 | 119390 | 119390 |
In [8]:
df['company'].describe()
Out[8]:
count 6797.000000 mean 189.266735 std 131.655015 min 6.000000 25% 62.000000 50% 179.000000 75% 270.000000 max 543.000000 Name: company, dtype: float64
In [9]:
df['company'].fillna(189.266735, inplace = True)
In [10]:
df['hotel'].describe()
Out[10]:
count 119390 unique 2 top City Hotel freq 79330 Name: hotel, dtype: object
In [11]:
df['customer_type'].unique()
Out[11]:
array(['Transient', 'Contract', 'Transient-Party', 'Group'], dtype=object)
In [12]:
df['customer_type'].describe()
Out[12]:
count 119390 unique 4 top Transient freq 89613 Name: customer_type, dtype: object
In [13]:
df['market_segment'].unique()
Out[13]:
array(['Direct', 'Corporate', 'Online TA', 'Offline TA/TO',
'Complementary', 'Groups', 'Undefined', 'Aviation'], dtype=object)
In [14]:
df['market_segment'].describe()
Out[14]:
count 119390 unique 8 top Online TA freq 56477 Name: market_segment, dtype: object
In [15]:
df['is_repeated_guest'].unique()
Out[15]:
array([0, 1], dtype=int64)
In [16]:
df['is_repeated_guest'].describe()
Out[16]:
count 119390.000000 mean 0.031912 std 0.175767 min 0.000000 25% 0.000000 50% 0.000000 75% 0.000000 max 1.000000 Name: is_repeated_guest, dtype: float64
In [ ]:
In [17]:
df_small = pd.DataFrame(df, columns = ['company', 'hotel','customer_type','market_segment','is_repeated_guest'])
In [18]:
df_small[df_small['is_repeated_guest'] == 0].describe()
Out[18]:
| company | is_repeated_guest | |
|---|---|---|
| count | 115580.000000 | 115580.0 |
| mean | 190.081897 | 0.0 |
| std | 27.544123 | 0.0 |
| min | 6.000000 | 0.0 |
| 25% | 189.266735 | 0.0 |
| 50% | 189.266735 | 0.0 |
| 75% | 189.266735 | 0.0 |
| max | 543.000000 | 0.0 |
In [19]:
df_small[df_small['is_repeated_guest'] == 1].describe()
Out[19]:
| company | is_repeated_guest | |
|---|---|---|
| count | 3810.000000 | 3810.0 |
| mean | 164.538009 | 1.0 |
| std | 85.279604 | 0.0 |
| min | 9.000000 | 1.0 |
| 25% | 94.000000 | 1.0 |
| 50% | 189.266735 | 1.0 |
| 75% | 189.266735 | 1.0 |
| max | 543.000000 | 1.0 |
In [20]:
df_small[df_small['is_repeated_guest'] == 1]
Out[20]:
| company | hotel | customer_type | market_segment | is_repeated_guest | |
|---|---|---|---|---|---|
| 13937 | 189.266735 | Resort Hotel | Transient | Complementary | 1 |
| 14681 | 189.266735 | Resort Hotel | Contract | Offline TA/TO | 1 |
| 14777 | 189.266735 | Resort Hotel | Transient | Online TA | 1 |
| 14817 | 189.266735 | Resort Hotel | Contract | Offline TA/TO | 1 |
| 14823 | 189.266735 | Resort Hotel | Transient | Direct | 1 |
| ... | ... | ... | ... | ... | ... |
| 117701 | 189.266735 | City Hotel | Transient | Offline TA/TO | 1 |
| 117841 | 40.000000 | City Hotel | Transient | Corporate | 1 |
| 117961 | 189.266735 | City Hotel | Transient | Direct | 1 |
| 118029 | 189.266735 | City Hotel | Transient | Direct | 1 |
| 119070 | 189.266735 | City Hotel | Group | Offline TA/TO | 1 |
3810 rows × 5 columns
In [21]:
plt.scatter(df_small.company, df_small.is_repeated_guest, marker = '+', color ='blue')
plt.show()
In [22]:
plt.scatter(df_small.hotel, df_small.is_repeated_guest, marker = '+', color ='blue')
plt.show()
In [25]:
plt.scatter(df_small.customer_type, df_small.is_repeated_guest, marker = '+', color ='blue')
plt.show()
In [26]:
plt.scatter(df_small.market_segment, df_small.is_repeated_guest, marker = '+', color ='blue')
plt.show()
In [27]:
from sklearn.model_selection import train_test_split
In [30]:
x = df_small.drop(columns = ['is_repeated_guest'])
x.head()
#this test input set only contains the predictor variables
Out[30]:
| company | hotel | customer_type | market_segment | |
|---|---|---|---|---|
| 0 | 189.266735 | Resort Hotel | Transient | Direct |
| 1 | 189.266735 | Resort Hotel | Transient | Direct |
| 2 | 189.266735 | Resort Hotel | Transient | Direct |
| 3 | 189.266735 | Resort Hotel | Transient | Corporate |
| 4 | 189.266735 | Resort Hotel | Transient | Online TA |
In [31]:
y = df_small.drop(columns = ['company', 'customer_type', 'market_segment', 'hotel'])
y.head()
#this test output set contains the ouput variable
Out[31]:
| is_repeated_guest | |
|---|---|
| 0 | 0 |
| 1 | 0 |
| 2 | 0 |
| 3 | 0 |
| 4 | 0 |
In [32]:
#X_train, X_test, y_train, y_test = train_test_split(x,y, test_size = 0.2)
In [33]:
#X_test
Out[33]:
| company | hotel | customer_type | market_segment | |
|---|---|---|---|---|
| 88096 | 189.266735 | City Hotel | Transient | Online TA |
| 104709 | 189.266735 | City Hotel | Transient-Party | Groups |
| 46679 | 189.266735 | City Hotel | Transient | Direct |
| 92732 | 189.266735 | City Hotel | Transient | Direct |
| 6996 | 189.266735 | Resort Hotel | Transient | Online TA |
| ... | ... | ... | ... | ... |
| 23951 | 189.266735 | Resort Hotel | Transient | Online TA |
| 3926 | 189.266735 | Resort Hotel | Transient | Offline TA/TO |
| 87376 | 189.266735 | City Hotel | Transient | Direct |
| 26457 | 189.266735 | Resort Hotel | Transient | Online TA |
| 108387 | 189.266735 | City Hotel | Transient | Online TA |
23878 rows × 4 columns
In [35]:
#X_train
Out[35]:
| company | hotel | customer_type | market_segment | |
|---|---|---|---|---|
| 35115 | 189.266735 | Resort Hotel | Transient | Offline TA/TO |
| 104102 | 189.266735 | City Hotel | Transient | Online TA |
| 51818 | 189.266735 | City Hotel | Transient | Groups |
| 23725 | 189.266735 | Resort Hotel | Transient | Groups |
| 107314 | 189.266735 | City Hotel | Transient | Offline TA/TO |
| ... | ... | ... | ... | ... |
| 35722 | 189.266735 | Resort Hotel | Transient | Online TA |
| 51822 | 189.266735 | City Hotel | Transient | Groups |
| 12953 | 189.266735 | Resort Hotel | Contract | Offline TA/TO |
| 70866 | 189.266735 | City Hotel | Transient-Party | Groups |
| 57910 | 189.266735 | City Hotel | Transient | Online TA |
95512 rows × 4 columns
In [36]:
from sklearn.linear_model import LogisticRegression
In [37]:
model = LogisticRegression()
In [39]:
df_small['hotel'].unique()
Out[39]:
array(['Resort Hotel', 'City Hotel'], dtype=object)
In [40]:
#Turn all the strings into 0 and 1 because logistic regressions cannot use string input
df_small['hotel'] = df_small['hotel'].map({'Resort Hotel':0,
'City Hotel':1,},
na_action=None)
In [44]:
df_small.head()
Out[44]:
| company | hotel | customer_type | market_segment | is_repeated_guest | |
|---|---|---|---|---|---|
| 0 | 189.266735 | 0 | Transient | Direct | 0 |
| 1 | 189.266735 | 0 | Transient | Direct | 0 |
| 2 | 189.266735 | 0 | Transient | Direct | 0 |
| 3 | 189.266735 | 0 | Transient | Corporate | 0 |
| 4 | 189.266735 | 0 | Transient | Online TA | 0 |
In [41]:
df_small['hotel'].unique()
Out[41]:
array([0, 1], dtype=int64)
In [45]:
df_small['customer_type'].unique()
Out[45]:
array(['Transient', 'Contract', 'Transient-Party', 'Group'], dtype=object)
In [47]:
df_small['customer_type'] = df_small['customer_type'].map({
'Transient':0,
'Contract':1,
'Transient-Party':2,
'Group':3, },
na_action=None)
In [48]:
df_small['customer_type'].unique()
Out[48]:
array([0, 1, 2, 3], dtype=int64)
In [49]:
df_small['market_segment'].unique()
Out[49]:
array(['Direct', 'Corporate', 'Online TA', 'Offline TA/TO',
'Complementary', 'Groups', 'Undefined', 'Aviation'], dtype=object)
In [50]:
df_small['market_segment'] = df_small['market_segment'].map({
'Direct':0,
'Corporate':1,
'Online TA':2,
'Offline TA/TO':3,
'Complementary':4,
'Groups':5,
'Unidentified':6,
'Aviation':7, },
na_action=None)
In [51]:
df_small['market_segment'].unique()
Out[51]:
array([ 0., 1., 2., 3., 4., 5., nan, 7.])
In [52]:
df_small['market_segment'].fillna(6, inplace = True)
In [56]:
x = df_small.drop(columns = ['is_repeated_guest'])
x.head()
#this test input set only contains the predictor variables
Out[56]:
| company | hotel | customer_type | market_segment | |
|---|---|---|---|---|
| 0 | 189.266735 | 0 | 0 | 0.0 |
| 1 | 189.266735 | 0 | 0 | 0.0 |
| 2 | 189.266735 | 0 | 0 | 0.0 |
| 3 | 189.266735 | 0 | 0 | 1.0 |
| 4 | 189.266735 | 0 | 0 | 2.0 |
In [ ]:
In [53]:
df_small['market_segment'].unique()
Out[53]:
array([0., 1., 2., 3., 4., 5., 6., 7.])
In [57]:
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size = 0.2)
In [58]:
X_train
Out[58]:
| company | hotel | customer_type | market_segment | |
|---|---|---|---|---|
| 84386 | 189.266735 | 1 | 0 | 2.0 |
| 118570 | 189.266735 | 1 | 0 | 2.0 |
| 38454 | 189.266735 | 0 | 0 | 0.0 |
| 113995 | 189.266735 | 1 | 0 | 2.0 |
| 15005 | 189.266735 | 0 | 0 | 3.0 |
| ... | ... | ... | ... | ... |
| 9675 | 189.266735 | 0 | 0 | 2.0 |
| 77986 | 189.266735 | 1 | 0 | 3.0 |
| 5403 | 189.266735 | 0 | 0 | 5.0 |
| 36792 | 189.266735 | 0 | 0 | 2.0 |
| 54574 | 189.266735 | 1 | 0 | 2.0 |
95512 rows × 4 columns
In [59]:
model.fit(X_train, y_train)
C:\Users\manof\anaconda3\lib\site-packages\sklearn\utils\validation.py:72: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). return f(**kwargs)
Out[59]:
LogisticRegression()
In [61]:
model.predict(X_test.head())
Out[61]:
array([0, 0, 0, 0, 0], dtype=int64)
In [62]:
model.score(X_test,y_test)
Out[62]:
0.9677108635564118
In [ ]:
Comments
Post a Comment