Business Problem: How to improve customer retention and predict a return customer.

Note: This code exemplifies my pre-processing with python as well as the use of logistic regression.

Hotel_Predictor

In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:

df = pd.read_csv('hotel_bookings.csv')

In [3]:

df.head()

Out[3]:

	hotel	lead_time	arrival_date_year	arrival_date_month	arrival_date_week_number	arrival_date_day_of_month	stays_in_week_nights	adults	...	deposit_type	agent	company	customer_type	adr	total_of_special_requests	reservation_status	reservation_status_date
0	Resort Hotel	342	2015	July	27	1	0	2	...	No Deposit	NaN	NaN	Transient	0.0	0	Check-Out	2015-07-01
1	Resort Hotel	737	2015	July	27	1	0	2	...	No Deposit	NaN	NaN	Transient	0.0	0	Check-Out	2015-07-01
2	Resort Hotel	7	2015	July	27	1	1	1	...	No Deposit	NaN	NaN	Transient	75.0	0	Check-Out	2015-07-02
3	Resort Hotel	13	2015	July	27	1	1	1	...	No Deposit	304.0	NaN	Transient	75.0	0	Check-Out	2015-07-02
4	Resort Hotel	14	2015	July	27	1	2	2	...	No Deposit	240.0	NaN	Transient	98.0	1	Check-Out	2015-07-03

5 rows × 32 columns

In [4]:

df.columns

Out[4]:

Index(['hotel', 'is_canceled', 'lead_time', 'arrival_date_year',
       'arrival_date_month', 'arrival_date_week_number',
       'arrival_date_day_of_month', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'children', 'babies', 'meal',
       'country', 'market_segment', 'distribution_channel',
       'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'reserved_room_type',
       'assigned_room_type', 'booking_changes', 'deposit_type', 'agent',
       'company', 'days_in_waiting_list', 'customer_type', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests',
       'reservation_status', 'reservation_status_date'],
      dtype='object')

In [5]:

df.shape

Out[5]:

(119390, 32)

In [6]:

pd.set_option('display.max_columns', None)

In [7]:

pd.isna(df).describe()

Out[7]:

	hotel	is_canceled	lead_time	arrival_date_year	arrival_date_month	arrival_date_week_number	arrival_date_day_of_month	stays_in_weekend_nights	stays_in_week_nights	adults	children	babies	meal	country	market_segment	distribution_channel	is_repeated_guest	previous_cancellations	previous_bookings_not_canceled	reserved_room_type	assigned_room_type	booking_changes	deposit_type	agent	company	days_in_waiting_list	customer_type	adr	required_car_parking_spaces	total_of_special_requests	reservation_status	reservation_status_date
count	119390	119390	119390	119390	119390	119390	119390	119390	119390	119390	119390	119390	119390	119390	119390	119390	119390	119390	119390	119390	119390	119390	119390	119390	119390	119390	119390	119390	119390	119390	119390	119390
unique	1	1	1	1	1	1	1	1	1	1	2	1	1	2	1	1	1	1	1	1	1	1	1	2	2	1	1	1	1	1	1	1
top	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	True	False	False	False	False	False	False	False
freq	119390	119390	119390	119390	119390	119390	119390	119390	119390	119390	119386	119390	119390	118902	119390	119390	119390	119390	119390	119390	119390	119390	119390	103050	112593	119390	119390	119390	119390	119390	119390	119390

In [8]:

df['company'].describe()

Out[8]:

count    6797.000000
mean      189.266735
std       131.655015
min         6.000000
25%        62.000000
50%       179.000000
75%       270.000000
max       543.000000
Name: company, dtype: float64

In [9]:

df['company'].fillna(189.266735, inplace = True)

In [10]:

df['hotel'].describe()

Out[10]:

count         119390
unique             2
top       City Hotel
freq           79330
Name: hotel, dtype: object

In [11]:

df['customer_type'].unique()

Out[11]:

array(['Transient', 'Contract', 'Transient-Party', 'Group'], dtype=object)

In [12]:

df['customer_type'].describe()

Out[12]:

count        119390
unique            4
top       Transient
freq          89613
Name: customer_type, dtype: object

In [13]:

df['market_segment'].unique()

Out[13]:

array(['Direct', 'Corporate', 'Online TA', 'Offline TA/TO',
       'Complementary', 'Groups', 'Undefined', 'Aviation'], dtype=object)

In [14]:

df['market_segment'].describe()

Out[14]:

count        119390
unique            8
top       Online TA
freq          56477
Name: market_segment, dtype: object

In [15]:

df['is_repeated_guest'].unique()

Out[15]:

array([0, 1], dtype=int64)

In [16]:

df['is_repeated_guest'].describe()

Out[16]:

count    119390.000000
mean          0.031912
std           0.175767
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: is_repeated_guest, dtype: float64

In [ ]:

In [17]:

df_small = pd.DataFrame(df, columns = ['company', 'hotel','customer_type','market_segment','is_repeated_guest'])

In [18]:

df_small[df_small['is_repeated_guest'] == 0].describe()

Out[18]:

	company	is_repeated_guest
count	115580.000000	115580.0
mean	190.081897	0.0
std	27.544123	0.0
min	6.000000	0.0
25%	189.266735	0.0
50%	189.266735	0.0
75%	189.266735	0.0
max	543.000000	0.0

In [19]:

df_small[df_small['is_repeated_guest'] == 1].describe()

Out[19]:

	company	is_repeated_guest
count	3810.000000	3810.0
mean	164.538009	1.0
std	85.279604	0.0
min	9.000000	1.0
25%	94.000000	1.0
50%	189.266735	1.0
75%	189.266735	1.0
max	543.000000	1.0

In [20]:

df_small[df_small['is_repeated_guest'] == 1]

Out[20]:

	company	hotel	customer_type	market_segment	is_repeated_guest
13937	189.266735	Resort Hotel	Transient	Complementary	1
14681	189.266735	Resort Hotel	Contract	Offline TA/TO	1
14777	189.266735	Resort Hotel	Transient	Online TA	1
14817	189.266735	Resort Hotel	Contract	Offline TA/TO	1
14823	189.266735	Resort Hotel	Transient	Direct	1
...	...	...	...	...	...
117701	189.266735	City Hotel	Transient	Offline TA/TO	1
117841	40.000000	City Hotel	Transient	Corporate	1
117961	189.266735	City Hotel	Transient	Direct	1
118029	189.266735	City Hotel	Transient	Direct	1
119070	189.266735	City Hotel	Group	Offline TA/TO	1

3810 rows × 5 columns

In [21]:

plt.scatter(df_small.company, df_small.is_repeated_guest, marker = '+', color ='blue')
plt.show()

In [22]:

plt.scatter(df_small.hotel, df_small.is_repeated_guest, marker = '+', color ='blue')
plt.show()

In [25]:

plt.scatter(df_small.customer_type, df_small.is_repeated_guest, marker = '+', color ='blue')
plt.show()

In [26]:

plt.scatter(df_small.market_segment, df_small.is_repeated_guest, marker = '+', color ='blue')
plt.show()

In [27]:

from sklearn.model_selection import train_test_split

In [30]:

x = df_small.drop(columns = ['is_repeated_guest'])
x.head()
#this test input set only contains the predictor variables

Out[30]:

	company	hotel	customer_type	market_segment
0	189.266735	Resort Hotel	Transient	Direct
1	189.266735	Resort Hotel	Transient	Direct
2	189.266735	Resort Hotel	Transient	Direct
3	189.266735	Resort Hotel	Transient	Corporate
4	189.266735	Resort Hotel	Transient	Online TA

In [31]:

y = df_small.drop(columns = ['company', 'customer_type', 'market_segment', 'hotel'])
y.head()
#this test output set contains the ouput variable

Out[31]:

	is_repeated_guest
0	0
1	0
2	0
3	0
4	0

In [32]:

#X_train, X_test, y_train, y_test = train_test_split(x,y, test_size = 0.2)

In [33]:

#X_test

Out[33]:

	company	hotel	customer_type	market_segment
88096	189.266735	City Hotel	Transient	Online TA
104709	189.266735	City Hotel	Transient-Party	Groups
46679	189.266735	City Hotel	Transient	Direct
92732	189.266735	City Hotel	Transient	Direct
6996	189.266735	Resort Hotel	Transient	Online TA
...	...	...	...	...
23951	189.266735	Resort Hotel	Transient	Online TA
3926	189.266735	Resort Hotel	Transient	Offline TA/TO
87376	189.266735	City Hotel	Transient	Direct
26457	189.266735	Resort Hotel	Transient	Online TA
108387	189.266735	City Hotel	Transient	Online TA

23878 rows × 4 columns

In [35]:

#X_train

Out[35]:

	company	hotel	customer_type	market_segment
35115	189.266735	Resort Hotel	Transient	Offline TA/TO
104102	189.266735	City Hotel	Transient	Online TA
51818	189.266735	City Hotel	Transient	Groups
23725	189.266735	Resort Hotel	Transient	Groups
107314	189.266735	City Hotel	Transient	Offline TA/TO
...	...	...	...	...
35722	189.266735	Resort Hotel	Transient	Online TA
51822	189.266735	City Hotel	Transient	Groups
12953	189.266735	Resort Hotel	Contract	Offline TA/TO
70866	189.266735	City Hotel	Transient-Party	Groups
57910	189.266735	City Hotel	Transient	Online TA

95512 rows × 4 columns

In [36]:

from sklearn.linear_model import LogisticRegression

In [37]:

model = LogisticRegression()

In [39]:

df_small['hotel'].unique()

Out[39]:

array(['Resort Hotel', 'City Hotel'], dtype=object)

In [40]:

#Turn all the strings into 0 and 1 because logistic regressions cannot use string input
df_small['hotel'] = df_small['hotel'].map({'Resort Hotel':0,
                             'City Hotel':1,},
                             na_action=None)

In [44]:

df_small.head()

Out[44]:

	company	customer_type	market_segment
0	189.266735	Transient	Direct
1	189.266735	Transient	Direct
2	189.266735	Transient	Direct
3	189.266735	Transient	Corporate
4	189.266735	Transient	Online TA

In [41]:

df_small['hotel'].unique()

Out[41]:

array([0, 1], dtype=int64)

In [45]:

df_small['customer_type'].unique()

Out[45]:

array(['Transient', 'Contract', 'Transient-Party', 'Group'], dtype=object)

In [47]:

df_small['customer_type'] = df_small['customer_type'].map({
                            'Transient':0,
                            'Contract':1,
                            'Transient-Party':2,
                            'Group':3,   },
                             na_action=None)

In [48]:

df_small['customer_type'].unique()

Out[48]:

array([0, 1, 2, 3], dtype=int64)

In [49]:

df_small['market_segment'].unique()

Out[49]:

array(['Direct', 'Corporate', 'Online TA', 'Offline TA/TO',
       'Complementary', 'Groups', 'Undefined', 'Aviation'], dtype=object)

In [50]:

df_small['market_segment'] = df_small['market_segment'].map({
                            'Direct':0,
                            'Corporate':1,
                            'Online TA':2,
                            'Offline TA/TO':3,
                            'Complementary':4,
                            'Groups':5,
                            'Unidentified':6,
                            'Aviation':7, },
                             na_action=None)

In [51]:

df_small['market_segment'].unique()

Out[51]:

array([ 0.,  1.,  2.,  3.,  4.,  5., nan,  7.])

In [52]:

df_small['market_segment'].fillna(6, inplace = True)

In [56]:

x = df_small.drop(columns = ['is_repeated_guest'])
x.head()
#this test input set only contains the predictor variables

Out[56]:

	company	market_segment
0	189.266735	0.0
1	189.266735	0.0
2	189.266735	0.0
3	189.266735	1.0
4	189.266735	2.0

In [ ]:

In [53]:

df_small['market_segment'].unique()

Out[53]:

array([0., 1., 2., 3., 4., 5., 6., 7.])

In [57]:

X_train, X_test, y_train, y_test = train_test_split(x,y, test_size = 0.2)

In [58]:

X_train

Out[58]:

	company	hotel	customer_type	market_segment
84386	189.266735	1	0	2.0
118570	189.266735	1	0	2.0
38454	189.266735	0	0	0.0
113995	189.266735	1	0	2.0
15005	189.266735	0	0	3.0
...	...	...	...	...
9675	189.266735	0	0	2.0
77986	189.266735	1	0	3.0
5403	189.266735	0	0	5.0
36792	189.266735	0	0	2.0
54574	189.266735	1	0	2.0

95512 rows × 4 columns

In [59]:

model.fit(X_train, y_train)

C:\Users\manof\anaconda3\lib\site-packages\sklearn\utils\validation.py:72: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  return f(**kwargs)

Out[59]:

LogisticRegression()

In [61]:

model.predict(X_test.head())

Out[61]:

array([0, 0, 0, 0, 0], dtype=int64)

In [62]:

model.score(X_test,y_test)

Out[62]:

0.9677108635564118

In [ ]:

Search This Blog

Lone Datasaur

Machine Learning Example Code

Comments

Post a Comment

Popular posts from this blog

Kaggle Account

Tableau Example with Analytical Thinking & Business Decisions