In this exercise, I will be using various models (e.g., linear regression, decision trees, random forest, spline regression) to predict average daily rates at a resort hotel. The data were obtained from Kaggle. The dataset was modified from its original source to include 18 of the original 32 variables.
# Install packages and load libraries
library(ggplot2)
library(dplyr)
library(reshape2) #For reshaping data
library(purrr)
library(gridExtra) #For plotting multiple plots on the same display
library(knitr) #For making nice tables
library(lubridate) # For formatting dates
library(corrplot) #For correlation plot
library(PerformanceAnalytics) #For correlation plot
library(faraway) #For halfnorm function
library(leaps) #For variable selection
library(lmtest) #For checking whether residuals have constant variance using bptest
library(rpart) #For regression trees
library(rattle) #For fancy regression tree plots
library(rpart.plot) #For fancy regression tree plots
library(RColorBrewer) #For fancy regression tree plots
library(randomForest)
library(splines) #For regression splines
# Set working directory
# setwd("") #Set working directory to where the file is
# Read data
df <- read.csv("stat425_fpdata.csv", header=TRUE)
# Data overview
dim(df) #2097 x 18
## [1] 2097 18
head(df)
## hotel is_canceled lead_time arrival_date_year arrival_date_month
## 1 Resort Hotel 0 68 2015 July
## 2 Resort Hotel 0 14 2015 July
## 3 Resort Hotel 0 10 2015 July
## 4 Resort Hotel 0 9 2015 July
## 5 Resort Hotel 0 51 2015 July
## 6 Resort Hotel 0 51 2015 July
## arrival_date_week_number arrival_date_day_of_month
## 1 27 1
## 2 27 2
## 3 27 3
## 4 27 3
## 5 28 6
## 6 28 6
## stays_in_weekend_nights stays_in_week_nights adults children babies meal
## 1 0 4 2 0 0 BB
## 2 0 2 2 0 0 BB
## 3 0 2 2 2 0 BB
## 4 0 1 2 0 0 BB
## 5 1 3 2 0 0 BB
## 6 1 3 3 0 0 BB
## market_segment reserved_room_type customer_type adr
## 1 Online TA D Transient 97.00
## 2 Online TA A Transient 98.00
## 3 Online TA G Transient 153.00
## 4 Online TA C Transient 94.71
## 5 Online TA G Transient 117.81
## 6 Online TA G Transient 117.81
## total_of_special_requests
## 1 3
## 2 1
## 3 0
## 4 0
## 5 2
## 6 2
There are 18 variables: hotel, is_canceled, lead_time, arrival_date_year, arrival_date_month, arrival_date_week_number, arrival_date_day_of_month, stays_in_weekend_nights, stays_in_week_nights, adults, children, babies, meal, market_segment, reserved_room_type, customer_type, adr, and total_of_special_requests.
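Some of these columns are parsed as plain numbers or strings even though they are conceptually categorical. As a quick aside of my own, we can check how read.csv parsed each column before converting:
sapply(df, class) #Shows the class assigned to each column on import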
# Convert some of the numeric variables into factors
df$is_canceled <- as.factor(df$is_canceled)
df$arrival_date_year <- as.factor(df$arrival_date_year)
# Rearrange order of the levels of the arrival_date_month factor so they are in chronological order when plotting them
df$arrival_date_month <- factor(df$arrival_date_month, levels=c("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"))
# Select Resort data
df_resort <- subset(df, hotel=="Resort Hotel", select=is_canceled:total_of_special_requests)
dim(df_resort) #479 x 17
## [1] 479 17
# Convert the arrival_date_year, arrival_date_month and arrival_date_day_of_month variables into one column to be used as a date
df_resort$arrival_date <- with(df_resort, paste(arrival_date_year, arrival_date_month, arrival_date_day_of_month, sep="-"))
df_resort$arrival_date_formatted <- parse_date_time(df_resort$arrival_date, orders = "ymd")
head(df_resort[ , c("arrival_date_year", "arrival_date_month", "arrival_date_day_of_month", "arrival_date", "arrival_date_formatted")]) #Check
## arrival_date_year arrival_date_month arrival_date_day_of_month
## 1 2015 July 1
## 2 2015 July 2
## 3 2015 July 3
## 4 2015 July 3
## 5 2015 July 6
## 6 2015 July 6
## arrival_date arrival_date_formatted
## 1 2015-July-1 2015-07-01
## 2 2015-July-2 2015-07-02
## 3 2015-July-3 2015-07-03
## 4 2015-July-3 2015-07-03
## 5 2015-July-6 2015-07-06
## 6 2015-July-6 2015-07-06
# Descriptive stats
summary(df_resort)
## is_canceled lead_time arrival_date_year arrival_date_month
## 0:407 Min. : 0.00 2015: 64 July : 98
## 1: 72 1st Qu.: 7.00 2016:215 August : 91
## Median : 35.00 2017:200 June : 58
## Mean : 60.02 May : 47
## 3rd Qu.: 92.50 September: 35
## Max. :542.00 April : 33
## (Other) :117
## arrival_date_week_number arrival_date_day_of_month
## Min. : 1.00 Min. : 1.00
## 1st Qu.:22.00 1st Qu.: 8.00
## Median :28.00 Median :16.00
## Mean :28.04 Mean :15.94
## 3rd Qu.:35.00 3rd Qu.:24.00
## Max. :53.00 Max. :31.00
##
## stays_in_weekend_nights stays_in_week_nights adults
## Min. :0.0000 Min. : 0.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.: 1.000 1st Qu.:2.000
## Median :1.0000 Median : 2.000 Median :2.000
## Mean :0.7641 Mean : 1.952 Mean :1.967
## 3rd Qu.:1.0000 3rd Qu.: 3.000 3rd Qu.:2.000
## Max. :4.0000 Max. :10.000 Max. :4.000
##
## children babies meal market_segment
## Min. :0.0000 Min. :0.000000 BB :453 Aviation : 0
## 1st Qu.:0.0000 1st Qu.:0.000000 HB : 24 Complementary: 0
## Median :0.0000 Median :0.000000 SC : 1 Corporate : 3
## Mean :0.2651 Mean :0.008351 Undefined: 1 Direct :116
## 3rd Qu.:0.0000 3rd Qu.:0.000000 Groups : 4
## Max. :3.0000 Max. :1.000000 Offline TA/TO: 34
## Online TA :322
## reserved_room_type customer_type adr
## A :168 Contract : 2 Min. : 0.00
## E : 95 Group : 4 1st Qu.: 79.16
## D : 77 Transient :438 Median :123.00
## G : 60 Transient-Party: 35 Mean :133.87
## F : 43 3rd Qu.:184.66
## C : 22 Max. :328.33
## (Other): 14
## total_of_special_requests arrival_date
## Min. :0.0000 Length:479
## 1st Qu.:0.0000 Class :character
## Median :1.0000 Mode :character
## Mean :0.7557
## 3rd Qu.:1.0000
## Max. :4.0000
##
## arrival_date_formatted
## Min. :2015-07-01 00:00:00
## 1st Qu.:2016-06-04 00:00:00
## Median :2016-10-05 00:00:00
## Mean :2016-10-20 15:07:53
## 3rd Qu.:2017-05-28 00:00:00
## Max. :2017-08-31 00:00:00
##
From these summary statistics, we can see that most reservations were not cancelled (407 of 479), arrivals peak in the summer months (July and August together account for 189 reservations), and the vast majority of bookings have meal type BB (453), market segment Online TA (322), and customer type Transient (438). The outcome adr ranges from 0 to 328.33 with a mean of 133.87.
We will be predicting adr (average daily rate), so let's take a look at the distribution of this variable.
ggplot(df_resort, aes(x=adr)) +
geom_histogram(aes(y=..density..), binwidth=5, colour="black", fill="lightgray") +
geom_density(alpha=.1, fill="#FF6666") +
theme_light() +
ggtitle("Histogram of average daily rate (adr)") +
xlab("average daily rate (adr)")
The adr variable is roughly normally distributed, so there's a good range of values to predict. Based on the descriptive summary above, it is of note that many of the variables don't have much variability (e.g., adults, children, babies, meal, market_segment, customer_type, total_of_special_requests). The variables that have more variability and are likely to affect the average daily rate are is_canceled, lead_time, arrival_date_month, arrival_date_week_number, arrival_date_day_of_month, and reserved_room_type.
Next, let's explore associations between the different variables and adr.
# Plot correlations among numeric variables
num_df_resort <- select_if(df_resort, is.numeric) #Pick numeric variables
# dim(num_df_resort) #10 numeric variables
# Plot correlations and histograms using PerformanceAnalytics library
chart.Correlation(num_df_resort, histogram=TRUE, pch=19)
Based on the correlation plots between adr and the other numeric variables, we can see a strong nonlinear trend between adr and arrival_date_week_number: adr is much higher at medium values of arrival_date_week_number (June-August). There are significant positive correlations between adr and adults (r=0.23), children (r=0.40), and total_of_special_requests (r=0.15). There is no significant correlation between adr and lead_time, arrival_date_day_of_month, stays_in_weekend_nights, stays_in_week_nights, or babies.
Now let's take a closer look at some of the variables that showed an association with adr.
# Number of adults, number of children, total of special requests
ggplot(df_resort, aes(x=adults, y=adr)) + geom_point() + theme_light()
ggplot(df_resort, aes(x=children, y=adr)) + geom_point() + theme_light()
ggplot(df_resort, aes(x=total_of_special_requests, y=adr)) + geom_point() + theme_light()
# Explore arrival date week number trends
df_resort %>% group_by(arrival_date_week_number) %>% summarise(mean_adr=mean(adr, na.rm=TRUE)) %>% arrange(desc(mean_adr))
## # A tibble: 53 x 2
## arrival_date_week_number mean_adr
## <int> <dbl>
## 1 32 221.
## 2 34 217.
## 3 33 216.
## 4 35 194.
## 5 31 191.
## 6 30 191.
## 7 29 169.
## 8 28 164.
## 9 24 163.
## 10 26 163.
## # … with 43 more rows
week <- ggplot(df_resort, aes(x=arrival_date_week_number, y=adr)) +
geom_point() +
geom_smooth() +
theme_light() +
ggtitle("Average daily rate vs. Arrival date week number") +
xlab("Arrival date week number") +
ylab("Average daily rate (adr)")
# Explore monthly trends
df_resort %>% group_by(arrival_date_month) %>% summarise(mean_adr=mean(adr, na.rm=TRUE)) %>% arrange(desc(mean_adr))
## # A tibble: 12 x 2
## arrival_date_month mean_adr
## <fct> <dbl>
## 1 August 204.
## 2 July 171.
## 3 June 153.
## 4 September 125.
## 5 May 103.
## 6 April 87.7
## 7 October 78.3
## 8 December 72.9
## 9 March 71.6
## 10 February 63.2
## 11 November 57.8
## 12 January 44.1
month <- ggplot(df_resort, aes(x=arrival_date_month, y=adr)) +
geom_boxplot() +
theme_light() +
ggtitle("Average daily rate vs. Arrival month") +
xlab("Arrival month") +
ylab("Average daily rate (adr)")
# Explore yearly trends
df_resort %>% group_by(arrival_date_year) %>% summarise(mean_adr=mean(adr, na.rm=TRUE)) %>% arrange(desc(mean_adr))
## # A tibble: 3 x 2
## arrival_date_year mean_adr
## <fct> <dbl>
## 1 2017 147.
## 2 2016 127.
## 3 2015 114.
year <- ggplot(df_resort, aes(x=arrival_date_year, y=adr)) +
geom_boxplot() +
theme_light() +
ggtitle("Average daily rate vs. Arrival date year") +
xlab("Arrival date year") +
ylab("Average daily rate (adr)")
# Plot week, monthly, and yearly trends on the same display
grid.arrange(week, month, year, nrow=3, ncol=1)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Noteworthy from the plots above is the strong nonlinear trend between week number and adr: adr increases from January until August, then decreases from August to December. Across years, adr increases from 2015 to 2017.
Other date trend visualizations:
# Explore full arrival date trends
ggplot(df_resort, aes(x=arrival_date_formatted, y=adr)) +
geom_point() +
theme_light() +
ggtitle("Average daily rate vs. Arrival date") +
xlab("Arrival date") +
ylab("Average daily rate (adr)")
# Plot arrival_date_day_of_month for each month separately
df_resort_adr_by_month_day <- df_resort %>% group_by(arrival_date_month, arrival_date_day_of_month) %>% summarise(mean_adr = mean(adr)) %>% arrange(arrival_date_month, arrival_date_day_of_month)
ggplot(data = df_resort_adr_by_month_day, aes(x=arrival_date_day_of_month, y=mean_adr, colour=arrival_date_month)) +
geom_point() +
geom_smooth(method='lm', se=FALSE) +
facet_wrap( ~ arrival_date_month) +
ggtitle("Mean average daily rate (adr) per day of month by month") +
xlab("Day of month") + ylab("Mean average daily rate (adr)") +
scale_color_brewer(palette = "Paired") +
theme_light() +
theme(legend.position = "none") +
theme(strip.background = element_rect(color="black", size=1)) + #Facet label rectangle contour
theme(strip.text.x = element_text(color = "black")) #Facet label font colour
Within each month, prices remain relatively stable across days, except for June-July and December, when later days are more expensive than earlier ones. For September-November, the opposite is true: later days in the month are cheaper than earlier days.
Now let's explore some of the categorical variables.
# Explore categorical variables
cat_df_resort <- select_if(df_resort, negate(is.numeric)) #Pick non-numeric variables
# dim(cat_df_resort) #9 variables are not numeric
# Explore cancelled reservations
df_resort %>% group_by(is_canceled) %>% summarise(mean=mean(adr, na.rm=TRUE)) %>% arrange(desc(mean))
## # A tibble: 2 x 2
## is_canceled mean
## <fct> <dbl>
## 1 1 164.
## 2 0 129.
ggplot(df_resort, aes(x=is_canceled, y=adr, colour=is_canceled)) +
geom_boxplot() +
geom_jitter(position=position_jitter(0.2)) +
theme_light() +
ggtitle("Average daily rate vs. Cancellation status") +
xlab("Cancellation status") +
ylab("Average daily rate (adr)") +
scale_x_discrete(labels=c("Not cancelled","Cancelled")) +
theme(legend.position = "none")
Cancelled reservations seem to have higher adr than non-cancelled reservations.
# Explore market segment
df_resort %>% group_by(market_segment) %>% summarise(mean=mean(adr, na.rm=TRUE), n=n()) %>% arrange(desc(mean))
## # A tibble: 5 x 3
## market_segment mean n
## <fct> <dbl> <int>
## 1 Direct 148. 116
## 2 Online TA 135. 322
## 3 Offline TA/TO 88.0 34
## 4 Corporate 44 3
## 5 Groups 40.2 4
ggplot(df_resort, aes(x=market_segment, y=adr, colour=market_segment)) +
geom_boxplot() +
geom_jitter(position=position_jitter(0.2)) +
theme_light() +
ggtitle("Average daily rate vs. Market segment") +
xlab("Market segment") +
ylab("Average daily rate (adr)") +
theme(legend.position = "none")
Direct reservations seem to have the highest adr but also a large range (similar to Online TA). Very few reservations fall under the other market segments (Offline TA/TO, Corporate, Groups).
# Explore meal
df_resort %>% group_by(meal) %>% summarise(mean=mean(adr, na.rm=TRUE), n=n()) %>% arrange(desc(mean))
## # A tibble: 4 x 3
## meal mean n
## <fct> <dbl> <int>
## 1 Undefined 311 1
## 2 HB 165. 24
## 3 BB 132. 453
## 4 SC 22.4 1
ggplot(df_resort, aes(x=meal, y=adr, colour=meal)) +
geom_boxplot() +
geom_jitter(position=position_jitter(0.2)) +
scale_color_brewer(palette = "RdBu") +
theme_light() +
ggtitle("Average daily rate vs. Type of Meal") +
xlab("Meal") +
ylab("Average daily rate (adr)") +
theme(legend.position = "none")
Most reservations are BB, followed by HB (which seems to have slightly higher adr). These two are essentially the only meal types in the data.
# Explore reserved_room_type
df_resort %>% group_by(reserved_room_type) %>% summarise(mean=mean(adr, na.rm=TRUE), n=n()) %>% arrange(desc(mean))
## # A tibble: 7 x 3
## reserved_room_type mean n
## <fct> <dbl> <int>
## 1 H 226. 14
## 2 G 181. 60
## 3 C 176. 22
## 4 F 147. 43
## 5 E 136. 95
## 6 D 131. 77
## 7 A 100. 168
ggplot(df_resort, aes(x=reserved_room_type, y=adr, colour=reserved_room_type)) +
geom_boxplot() +
geom_jitter(position=position_jitter(0.2)) +
scale_color_brewer(palette = "Dark2") +
theme_light() +
ggtitle("Average daily rate vs. Room Type") +
xlab("Reserved Room Type") +
ylab("Average daily rate (adr)") +
theme(legend.position = "none")
Most reservations are room type A, which also seems to have the lowest adr.
# Explore customer_type
df_resort %>% group_by(customer_type) %>% summarise(mean=mean(adr, na.rm=TRUE), n=n()) %>% arrange(desc(mean))
## # A tibble: 4 x 3
## customer_type mean n
## <fct> <dbl> <int>
## 1 Transient 136. 438
## 2 Transient-Party 120. 35
## 3 Group 96.2 4
## 4 Contract 66.2 2
ggplot(df_resort, aes(x=customer_type, y=adr, colour=customer_type)) +
geom_boxplot() +
geom_jitter(position=position_jitter(0.2)) +
scale_color_brewer(palette = "Set2") +
theme_light() +
ggtitle("Average daily rate vs. Customer Type") +
xlab("Customer Type") +
ylab("Average daily rate (adr)") +
theme(legend.position = "none")
Most reservations are Transient (these have the highest adr but also the largest range). Very few reservations are of a different customer type.
So far, we’ve seen that there is a strong nonlinear trend between adr and arrival_date_week_number / arrival_date_month. There are some positive correlations between adr and adults, children, and total_of_special_requests. Meal, market_segment, and customer_type don’t have much variability; reserved_room_type does have variability.
Next, let's check for interactions between arrival_date_month and the other categorical variables. This would allow us to explore whether, for example, adr is higher for cancelled (vs. not cancelled) reservations in July but not in November.
Categorical variables: is_canceled, market_segment, meal, reserved_room_type, customer_type
par(mfrow=c(2,1))
interaction.plot(df_resort$arrival_date_month, df_resort$is_canceled, df_resort$adr, xlab="Arrival date month", ylab="Mean of adr", legend=TRUE, trace.label="Is cancelled")
interaction.plot(df_resort$arrival_date_month, df_resort$market_segment, df_resort$adr, xlab="Arrival date month", ylab="Mean of adr", legend=TRUE, trace.label="Market Segment")
interaction.plot(df_resort$arrival_date_month, df_resort$meal, df_resort$adr, xlab="Arrival date month", ylab="Mean of adr", legend=TRUE, trace.label="Meal")
interaction.plot(df_resort$arrival_date_month, df_resort$reserved_room_type, df_resort$adr, xlab="Arrival date month", ylab="Mean of adr", legend=TRUE, trace.label="Room Type")
interaction.plot(df_resort$arrival_date_month, df_resort$customer_type, df_resort$adr, xlab="Arrival date month", ylab="Mean of adr", legend=TRUE, trace.label="Customer Type")
The visual displays show no obvious interactions between arrival_date_month and the categorical variables. The lines do cross for is_canceled, reserved_room_type, and customer_type, but overall no large interactions stand out from the plots.
Now, let's run some predictive models. I will be looking at different error metrics to evaluate the models. The error is simply the difference between a predicted value and the actual/observed value. I will be focusing on MSE (mean squared error), RMSE (root mean squared error), and MAE (mean absolute error), along with AIC and BIC for the models where they are available.
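Since these metrics recur for every model below, here is a minimal sketch of how each one is computed (the helper names mse, rmse, and mae are my own; the code below computes the metrics inline instead):
# Error metric helpers (my own convenience functions; not used elsewhere in this analysis)
mse <- function(y, yhat) mean((yhat - y)^2) #Mean squared error
rmse <- function(y, yhat) sqrt(mse(y, yhat)) #Root mean squared error
mae <- function(y, yhat) mean(abs(yhat - y)) #Mean absolute error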
First, let's subset the dataset and pick the values used for training and testing.
# Choose the variables that will be used for modeling (all variables except the newly created formatted arrival dates)
df_resort_short <- subset(df_resort, select=c(is_canceled, lead_time, arrival_date_year, arrival_date_month, arrival_date_week_number, arrival_date_day_of_month, stays_in_weekend_nights, stays_in_week_nights, adults, children, babies, meal, market_segment, reserved_room_type, customer_type, total_of_special_requests, adr))
dim(df_resort_short) # 479 x 17
## [1] 479 17
# Pick rows to be used for training
set.seed(10)
train <- sample(1:nrow(df_resort_short), round(nrow(df_resort_short)*0.75, 0)) #Pick rows to be used for training (75% of the data; 359 rows)
length(train) #359
## [1] 359
# Values used for testing
y <- df_resort_short[-train, "adr"] # adr observed values used for evaluating model performance in test dataset
# Best initial guess is the average adr in the training dataset
baseline.pred <- mean(df_resort_short$adr[train])
# Make predictions on the test dataset
MSE.baseline <- mean((baseline.pred - df_resort_short$adr[-train])^2) #4960.995
RMSE.baseline <- sqrt(mean((baseline.pred - df_resort_short$adr[-train])^2)) #70.43433
MAE.baseline <- mean(abs(baseline.pred - df_resort_short$adr[-train])) #59.02402
# Train model
linear.reg <- lm(adr ~ ., data=df_resort_short, subset=train)
summary(linear.reg)
##
## Call:
## lm(formula = adr ~ ., data = df_resort_short, subset = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -186.208 -14.702 0.525 15.617 73.580
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -42.030268 33.310720 -1.262 0.207956
## is_canceled1 19.186267 4.839235 3.965 9.07e-05 ***
## lead_time -0.158799 0.029632 -5.359 1.61e-07 ***
## arrival_date_year2016 31.635535 5.350113 5.913 8.64e-09 ***
## arrival_date_year2017 35.751026 6.983525 5.119 5.31e-07 ***
## arrival_date_monthFebruary 33.747615 26.886061 1.255 0.210322
## arrival_date_monthMarch 95.056992 50.150022 1.895 0.058936 .
## arrival_date_monthApril 150.697718 73.345620 2.055 0.040730 *
## arrival_date_monthMay 206.479100 97.646370 2.115 0.035243 *
## arrival_date_monthJune 291.025199 122.121783 2.383 0.017754 *
## arrival_date_monthJuly 367.566440 146.389142 2.511 0.012537 *
## arrival_date_monthAugust 443.056559 171.814416 2.579 0.010365 *
## arrival_date_monthSeptember 431.043087 195.909820 2.200 0.028508 *
## arrival_date_monthOctober 419.833987 220.500920 1.904 0.057810 .
## arrival_date_monthNovember 444.638700 244.670691 1.817 0.070110 .
## arrival_date_monthDecember 504.146893 269.846940 1.868 0.062641 .
## arrival_date_week_number -9.834252 5.652543 -1.740 0.082860 .
## arrival_date_day_of_month 1.825549 0.821819 2.221 0.027029 *
## stays_in_weekend_nights 0.634854 2.116325 0.300 0.764388
## stays_in_week_nights 4.459792 1.243124 3.588 0.000386 ***
## adults 15.948700 4.024958 3.962 9.16e-05 ***
## children 13.155300 3.521957 3.735 0.000222 ***
## babies -10.064728 15.067148 -0.668 0.504622
## mealHB 23.177131 7.242627 3.200 0.001512 **
## mealSC -32.063292 36.756910 -0.872 0.383697
## mealUndefined 73.013162 29.871379 2.444 0.015056 *
## market_segmentDirect 15.101435 20.976480 0.720 0.472100
## market_segmentGroups -15.981393 28.436442 -0.562 0.574508
## market_segmentOffline TA/TO -18.935452 20.574600 -0.920 0.358095
## market_segmentOnline TA 8.466227 20.657738 0.410 0.682203
## reserved_room_typeC 28.946624 9.638141 3.003 0.002882 **
## reserved_room_typeD 16.402822 4.614178 3.555 0.000435 ***
## reserved_room_typeE 31.131108 4.637291 6.713 8.72e-11 ***
## reserved_room_typeF 42.483500 6.012299 7.066 1.01e-11 ***
## reserved_room_typeG 50.154157 6.471137 7.750 1.24e-13 ***
## reserved_room_typeH 59.622202 11.140273 5.352 1.67e-07 ***
## customer_typeGroup -20.688186 27.787994 -0.745 0.457122
## customer_typeTransient -0.008309 22.580566 0.000 0.999707
## customer_typeTransient-Party 16.750639 23.236736 0.721 0.471518
## total_of_special_requests 2.122942 2.302050 0.922 0.357123
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28.23 on 319 degrees of freedom
## Multiple R-squared: 0.8429, Adjusted R-squared: 0.8237
## F-statistic: 43.89 on 39 and 319 DF, p-value: < 2.2e-16
# R squared 0.8429
# Adjusted R squared 0.8237
# Make predictions on the test dataset
yhat.linear.reg <- predict(linear.reg, newdata=df_resort_short[-train, ])
MSE.linear.reg <- mean((yhat.linear.reg - y)^2) #761.2256
RMSE.linear.reg <- sqrt(mean((yhat.linear.reg - y)^2)) #27.59032
MAE.linear.reg <- mean(abs(yhat.linear.reg - y)) #21.13004
AIC.linear.reg <- AIC(linear.reg) #3456.801
BIC.linear.reg <- BIC(linear.reg) #3616.017
Next, let's run model diagnostics on the linear regression model.
# CHECK REGRESSION DIAGNOSTICS
par(mfrow=c(2,2))
plot(linear.reg)
## Warning: not plotting observations with leverage one:
## 134, 154
## Warning: not plotting observations with leverage one:
## 134, 154
bptest(linear.reg) #Constant variance (from lmtest library)
##
## studentized Breusch-Pagan test
##
## data: linear.reg
## BP = 32.989, df = 39, p-value = 0.7397
shapiro.test(residuals(linear.reg)) #p= 3.793e-11, residuals not normal
##
## Shapiro-Wilk normality test
##
## data: residuals(linear.reg)
## W = 0.93747, p-value = 3.793e-11
hist(linear.reg$res) #The residuals look roughly bell-shaped except for some extreme values in the left tail
The histogram of residuals shows a roughly bell-shaped distribution with a heavy left tail, and the Shapiro-Wilk test confirms that the residuals are not normally distributed. The plot of residuals vs. fitted values shows no systematic pattern, suggesting constant variance of the residuals (homoskedasticity); this conclusion is supported by the Breusch-Pagan test.
Let's also check for points with high leverage, outliers, and influential points.
# Check for large leverage points
n <- 359 #Number of observations
p <- 40 #Number of model parameters, including the intercept
lev <- influence(linear.reg)$hat
halfnorm(lev, nlab=10, labs=row.names(df_resort_short[train, ]), ylab="Leverages") #134 and 154 have high leverage
# Check for outliers
jack <- rstudent(linear.reg)
t <- qt(.05/(2*n), n-p-1) # Bonferroni-corrected threshold: -3.856416
jack[which(abs(jack) > abs(t))] # 445 and 173 are outliers
## 445 173
## -7.372723 -3.991591
# Check for influential points
cook = cooks.distance(linear.reg)
halfnorm(cook, nlab=2, labs=row.names(df_resort_short[train,]), ylab="Cook's distances")
max(cook, na.rm=TRUE) # No Cook's distances above 1, so no strongly influential points (173 and 445 are the largest)
## [1] 0.08191411
Data points 134, 154, 173, and 445 might have high leverage or influence. Let's take a closer look at those points.
df_resort_short[c(134, 154, 173, 445), ]
## is_canceled lead_time arrival_date_year arrival_date_month
## 134 0 5 2015 November
## 154 0 2 2016 April
## 173 0 77 2016 May
## 145 0 0 2016 January
## arrival_date_week_number arrival_date_day_of_month
## 134 48 22
## 154 15 4
## 173 20 10
## 145 2 7
## stays_in_weekend_nights stays_in_week_nights adults children babies
## 134 2 0 1 0 0
## 154 1 1 2 0 0
## 173 0 5 2 0 0
## 145 2 4 2 0 0
## meal market_segment reserved_room_type customer_type
## 134 BB Online TA A Transient
## 154 BB Online TA E Transient
## 173 HB Online TA F Transient
## 145 BB Offline TA/TO A Transient
## total_of_special_requests adr
## 134 1 40
## 154 1 87
## 173 1 34
## 145 0 38
There's nothing in particular that stands out, so for now I will leave them in the model.
We can see that many of the variables are not significant in the linear regression model. Let's refine our model by doing variable selection using a stepwise algorithm (both forward and backward selection).
# Refine regression model using stepwise algorithm (both forward and backward)
linear.reg.step <- step(linear.reg, direction="both")
summary(linear.reg.step)
##
## Call:
## lm(formula = adr ~ is_canceled + lead_time + arrival_date_year +
## arrival_date_month + arrival_date_week_number + arrival_date_day_of_month +
## stays_in_week_nights + adults + children + meal + market_segment +
## reserved_room_type + customer_type, data = df_resort_short,
## subset = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -187.954 -14.655 1.053 15.435 75.107
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -41.97580 32.77708 -1.281 0.201240
## is_canceled1 18.52301 4.70808 3.934 0.000102 ***
## lead_time -0.15294 0.02784 -5.493 8.05e-08 ***
## arrival_date_year2016 31.40158 5.30075 5.924 8.07e-09 ***
## arrival_date_year2017 36.21700 6.94694 5.213 3.32e-07 ***
## arrival_date_monthFebruary 33.68510 26.29443 1.281 0.201090
## arrival_date_monthMarch 93.57648 49.12445 1.905 0.057686 .
## arrival_date_monthApril 147.93339 71.55849 2.067 0.039504 *
## arrival_date_monthMay 203.07831 95.42697 2.128 0.034088 *
## arrival_date_monthJune 286.89931 119.24790 2.406 0.016696 *
## arrival_date_monthJuly 361.98532 142.92825 2.533 0.011795 *
## arrival_date_monthAugust 436.63748 167.77415 2.603 0.009682 **
## arrival_date_monthSeptember 422.24042 191.09717 2.210 0.027839 *
## arrival_date_monthOctober 410.87091 215.25747 1.909 0.057184 .
## arrival_date_monthNovember 435.20103 238.88231 1.822 0.069410 .
## arrival_date_monthDecember 493.82446 263.59454 1.873 0.061915 .
## arrival_date_week_number -9.59717 5.52299 -1.738 0.083224 .
## arrival_date_day_of_month 1.79063 0.80300 2.230 0.026441 *
## stays_in_week_nights 4.61083 1.21797 3.786 0.000183 ***
## adults 16.43287 3.97010 4.139 4.46e-05 ***
## children 13.06775 3.49366 3.740 0.000217 ***
## mealHB 22.86248 7.14186 3.201 0.001505 **
## mealSC -31.50296 36.49033 -0.863 0.388602
## mealUndefined 76.36955 29.55187 2.584 0.010199 *
## market_segmentDirect 16.17735 20.61177 0.785 0.433113
## market_segmentGroups -17.62962 28.09345 -0.628 0.530753
## market_segmentOffline TA/TO -18.72581 20.30431 -0.922 0.357085
## market_segmentOnline TA 10.06033 20.19339 0.498 0.618684
## reserved_room_typeC 29.71170 9.53806 3.115 0.002004 **
## reserved_room_typeD 16.45196 4.59243 3.582 0.000393 ***
## reserved_room_typeE 30.77934 4.56978 6.735 7.53e-11 ***
## reserved_room_typeF 42.13408 5.98043 7.045 1.13e-11 ***
## reserved_room_typeG 50.01913 6.42379 7.787 9.51e-14 ***
## reserved_room_typeH 59.40273 11.09936 5.352 1.66e-07 ***
## customer_typeGroup -23.04663 27.52687 -0.837 0.403078
## customer_typeTransient -1.61747 22.29687 -0.073 0.942215
## customer_typeTransient-Party 14.94656 22.85159 0.654 0.513533
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28.15 on 322 degrees of freedom
## Multiple R-squared: 0.8423, Adjusted R-squared: 0.8247
## F-statistic: 47.78 on 36 and 322 DF, p-value: < 2.2e-16
# R squared: 0.8423
# Adjusted R squared: 0.8247
The final model from the stepwise algorithm has 13 variables (is_canceled + lead_time + arrival_date_year + arrival_date_month + arrival_date_week_number + arrival_date_day_of_month + stays_in_week_nights + adults + children + meal + market_segment + reserved_room_type + customer_type).
Let's see how this model compares to the previous linear regression model.
anova(linear.reg, linear.reg.step) #p=0.7437
## Analysis of Variance Table
##
## Model 1: adr ~ is_canceled + lead_time + arrival_date_year + arrival_date_month +
## arrival_date_week_number + arrival_date_day_of_month + stays_in_weekend_nights +
## stays_in_week_nights + adults + children + babies + meal +
## market_segment + reserved_room_type + customer_type + total_of_special_requests
## Model 2: adr ~ is_canceled + lead_time + arrival_date_year + arrival_date_month +
## arrival_date_week_number + arrival_date_day_of_month + stays_in_week_nights +
## adults + children + meal + market_segment + reserved_room_type +
## customer_type
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 319 254233
## 2 322 255221 -3 -987.68 0.4131 0.7437
An ANOVA comparison shows that the original linear regression model and the model refined with the stepwise algorithm fit the data equally well (p=0.7437), suggesting that we should choose the more parsimonious model (the one from the stepwise algorithm, since it has fewer predictors).
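As a quick sanity check (my own aside, using the rounded values printed in the ANOVA table), the F statistic can be reproduced by hand:
# F = ((RSS of smaller model - RSS of larger model) / df difference) / (RSS of larger model / its residual df)
(987.68 / 3) / (254233 / 319)
## [1] 0.4130986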
# Make predictions on the test dataset
yhat.linear.reg.step <- predict(linear.reg.step, newdata=df_resort_short[-train, ])
MSE.linear.reg.step <- mean((yhat.linear.reg.step - y)^2) #789.4876
RMSE.linear.reg.step <- sqrt(mean((yhat.linear.reg.step - y)^2)) #28.09782
MAE.linear.reg.step <- mean(abs(yhat.linear.reg.step - y))#21.52217
AIC.linear.reg.step <- AIC(linear.reg.step) #3452.1931
BIC.linear.reg.step <- BIC(linear.reg.step) #3599.76
Next, given the nonlinear trends seen in the plots, let's run models that do not assume a linear relationship between the outcome and the predictors, such as decision trees and random forests.
# MODEL 2 - REGRESSION/DECISION TREE
# Train model using default regression tree from rpart
tree <- rpart(adr ~ ., df_resort_short, subset=train)
# Find optimal tree (complexity parameter) using cross validation
# Train model
tree2 <- rpart(adr ~ ., df_resort_short, subset=train, cp=0.001)
# Print cp (complexity parameter) table to find optimal cp
printcp(tree2)
##
## Regression tree:
## rpart(formula = adr ~ ., data = df_resort_short, subset = train,
## cp = 0.001)
##
## Variables actually used in tree construction:
## [1] adults arrival_date_day_of_month
## [3] arrival_date_month arrival_date_week_number
## [5] arrival_date_year children
## [7] lead_time market_segment
## [9] reserved_room_type stays_in_week_nights
##
## Root node error: 1618446/359 = 4508.2
##
## n= 359
##
## CP nsplit rel error xerror xstd
## 1 0.4526587 0 1.00000 1.00641 0.068151
## 2 0.0761419 1 0.54734 0.55240 0.043762
## 3 0.0569565 2 0.47120 0.53387 0.038243
## 4 0.0367141 3 0.41424 0.49834 0.037393
## 5 0.0261614 4 0.37753 0.49019 0.036268
## 6 0.0204292 5 0.35137 0.48779 0.041550
## 7 0.0203406 6 0.33094 0.48144 0.041236
## 8 0.0183512 7 0.31060 0.46334 0.041205
## 9 0.0149422 8 0.29225 0.44677 0.040501
## 10 0.0116729 10 0.26236 0.42205 0.041371
## 11 0.0093930 11 0.25069 0.41305 0.039387
## 12 0.0085059 12 0.24130 0.40795 0.038879
## 13 0.0078785 13 0.23279 0.41108 0.038860
## 14 0.0074394 14 0.22491 0.40972 0.038734
## 15 0.0074234 15 0.21747 0.41012 0.038735
## 16 0.0064647 16 0.21005 0.40912 0.038709
## 17 0.0040318 17 0.20358 0.39933 0.038439
## 18 0.0034912 18 0.19955 0.39211 0.037636
## 19 0.0033534 19 0.19606 0.39159 0.037602
## 20 0.0024504 20 0.19271 0.38777 0.037246
## 21 0.0023060 21 0.19026 0.39011 0.037664
## 22 0.0015002 22 0.18795 0.38396 0.035987
## 23 0.0012808 23 0.18645 0.38467 0.035893
## 24 0.0011723 25 0.18389 0.38488 0.035884
## 25 0.0010968 26 0.18272 0.38652 0.035949
## 26 0.0010000 27 0.18162 0.38577 0.035928
# Find the tree with the lowest xerror (cross-validation error) and plot the line 1 SE above its xerror (red line)
myCPtable <- tree2$cptable
id.min <- which.min(myCPtable[,'xerror']) #22, 22 splits
my1se.err <- myCPtable[id.min,'xerror'] + myCPtable[id.min, 'xstd'] # 0.4199426
plotcp(tree2)
abline(h=my1se.err, col="red")
# Pick the smallest tree below the red line, find its cp, and prune the tree with that cp value (optimal tree)
id.1se <- min(which(myCPtable[,'xerror'] < my1se.err)) #ID for tree with 11 splits
CP.1se <- myCPtable[id.1se, 'CP'] #0.009393006
tree.2.pruned <- prune.rpart(tree2, CP.1se)
# Fancy plot
fancyRpartPlot(tree.2.pruned, caption = NULL) #11 splits
# Make predictions on the test dataset
yhat.tree.2.pruned <- predict(tree.2.pruned, newdata=df_resort_short[-train, ])
MSE.tree.2.pruned <- mean((yhat.tree.2.pruned - y)^2) #1316.809
RMSE.tree.2.pruned <- sqrt(mean((yhat.tree.2.pruned - y)^2)) # 36.28787
MAE.tree.2.pruned <- mean(abs(yhat.tree.2.pruned - y)) #27.56134
Decision trees grow only one tree; with random forests, we grow many trees and average their predictions. By averaging over many trees, the model reduces the variance that comes from relying on a single tree and is less likely to overfit. To make the trees differ from one another, some randomness is introduced when building them - for example, each tree is grown on a bootstrapped sample (drawn with replacement), and only a random subset of predictors is considered at each split.
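To make the averaging idea concrete, here is a toy sketch of bagging by hand (my own illustration, not part of the modeling pipeline; unlike randomForest, it does not also sample a random subset of predictors at each split):
set.seed(1)
B <- 25 #Number of bootstrapped trees
boot.preds <- sapply(1:B, function(b) {
  idx <- sample(train, replace=TRUE) #Bootstrap sample of the training rows (with replacement)
  fit <- rpart(adr ~ ., df_resort_short[idx, ]) #Grow one tree on this sample
  predict(fit, newdata=df_resort_short[-train, ]) #Predict on the test rows
})
yhat.bagged <- rowMeans(boot.preds) #Average the trees' predictions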
First, we need to identify the optimal mtry (the number of variables randomly sampled as candidates at each split), so let's run multiple random forests with different values of mtry, compare their test MSEs, and pick the model with the lowest MSE.
# Find the best mtry parameter value for the model
set.seed(10)
n.predictors <- 16
mtry.MSE <- c()
for (i in 1:n.predictors) {
rf.mtry <- randomForest(adr ~ ., data = df_resort_short, subset=train, mtry=i, importance=TRUE, ntree=500)
yhat.rf.mtry <- predict(rf.mtry, newdata=df_resort_short[-train,])
mtry.MSE[i] <- mean((yhat.rf.mtry - y)^2) #MSE
}
plot(mtry.MSE)
min(mtry.MSE)
## [1] 739.7892
which.min(mtry.MSE) # mtry of 7
## [1] 7
# Train random forest model with mtry=7, ntree=500
set.seed(10)
rf <- randomForest(adr ~ ., data = df_resort_short, subset=train, mtry=7, importance=TRUE, ntree=500)
rf
##
## Call:
## randomForest(formula = adr ~ ., data = df_resort_short, mtry = 7, importance = TRUE, ntree = 500, subset = train)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 7
##
## Mean of squared residuals: 882.4106
## % Var explained: 80.43
#names(rf)
rf$ntree
## [1] 500
rf$mtry
## [1] 7
#rf$mse
plot(rf)
which.min(rf$mse)
## [1] 474
#sort(importance(rf)[,1], decreasing=TRUE)
varImpPlot(rf) #arrival_date_month is the most important variable for adr prediction
# Make predictions on the test dataset
yhat.rf <- predict(rf, newdata=df_resort_short[-train,])
MSE.rf <- mean((yhat.rf - y)^2) #744.0877
RMSE.rf <- sqrt(mean((yhat.rf - y)^2)) # 27.27797
MAE.rf <- mean(abs(yhat.rf - y)) #19.79513
We can see from the model that arrival_date_month is the most important variable for predicting adr. Next, let's build a regression spline using just the arrival date as the predictor.
Spline regression gives error messages when using factors as predictors, so we use the formatted version of the arrival date (combining day, month, and year): arrival_date_formatted.
For spline regression, we also need to specify the number of knots (m). Let's first find the optimal number of knots by computing AIC and BIC for models with between 3 and 30 knots, picking the model with the lowest AIC/BIC. We'll use the splines package, where we specify the degrees of freedom (df) when fitting the model; for the cubic B-spline basis used here (with an intercept), df = m + 4.
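A quick check of the df = m + 4 relationship (my own aside): for a cubic B-spline basis with intercept=TRUE, bs() places df - 4 interior knots, so df=7 should give 3 knots.
length(attr(bs(1:100, df=7, intercept=TRUE), "knots")) #Count the interior knots chosen for df=7
## [1] 3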
# MODEL 4 - REGRESSION SPLINE
# First, calculate optimal number of knots (m).
# Compute BIC and AIC for model with number of knots between 3 and 30.
# 3 knots = 7 df
# 30 knots = 34 df
df_initial <- 7
df_last <- 34
r.spline.knots.aic.bic <- data.frame(matrix(ncol = 3, nrow = df_last-df_initial + 1))
for (i in df_initial:df_last){
r.spline.fit <- lm(adr ~ bs(arrival_date_formatted, df=i, intercept=TRUE), data=df_resort, subset=train)
r.spline.knots.aic.bic[i-6, 1] <- i-4 #Number of knots is df-4
r.spline.knots.aic.bic[i-6, 2] <- AIC(r.spline.fit)
r.spline.knots.aic.bic[i-6, 3] <- BIC(r.spline.fit)
}
# r.spline.knots.aic.bic
# Rename columns
names(r.spline.knots.aic.bic)[1] <- "knots"
names(r.spline.knots.aic.bic)[2] <- "AIC"
names(r.spline.knots.aic.bic)[3] <- "BIC"
r.spline.knots.aic.bic
## knots AIC BIC
## 1 3 3842.397 3873.464
## 2 4 3789.666 3824.616
## 3 5 3754.839 3793.672
## 4 6 3775.556 3818.273
## 5 7 3737.241 3783.841
## 6 8 3750.634 3801.117
## 7 9 3721.599 3775.965
## 8 10 3719.441 3777.690
## 9 11 3717.967 3780.100
## 10 12 3721.418 3787.435
## 11 13 3722.647 3792.546
## 12 14 3724.398 3798.181
## 13 15 3724.265 3801.932
## 14 16 3728.847 3810.397
## 15 17 3726.843 3812.276
## 16 18 3723.572 3812.889
## 17 19 3728.164 3821.364
## 18 20 3726.351 3823.434
## 19 21 3729.615 3830.582
## 20 22 3723.424 3828.273
## 21 23 3727.000 3835.733
## 22 24 3722.507 3835.123
## 23 25 3727.330 3843.830
## 24 26 3723.666 3844.049
## 25 27 3728.979 3853.245
## 26 28 3730.513 3858.662
## 27 29 3731.253 3863.286
## 28 30 3734.542 3870.458
# Plot BIC and AIC vs number of knots to choose best model
plot(r.spline.knots.aic.bic$knots, r.spline.knots.aic.bic$AIC, xlab="Knots", ylab="AIC", main="Knots vs. AIC")
plot(r.spline.knots.aic.bic$knots, r.spline.knots.aic.bic$BIC, xlab="Knots", ylab="BIC", main="Knots vs. BIC")
r.spline.knots.aic.bic[which.min(r.spline.knots.aic.bic$AIC),] #11 knots
## knots AIC BIC
## 9 11 3717.967 3780.1
r.spline.knots.aic.bic[which.min(r.spline.knots.aic.bic$BIC),] #9 knots
## knots AIC BIC
## 7 9 3721.599 3775.965
AIC suggests the model with 11 knots; BIC suggests the model with 9 knots. I'd probably choose 10 knots since it's in the middle, but let's plot the fits with 9 knots (df=13), 10 knots (df=14), and 11 knots (df=15) on top of the test data.
plot(df_resort$arrival_date_formatted, df_resort$adr, xlab="Arrival date formatted", ylab="Average Daily Rate (adr)")
r.spline.fit.1 <- lm(adr ~ bs(arrival_date_formatted, df=13, intercept=TRUE), data=df_resort, subset=-train)
r.spline.fit.2 <- lm(adr ~ bs(arrival_date_formatted, df=14, intercept=TRUE), data=df_resort, subset=-train)
r.spline.fit.3 <- lm(adr ~ bs(arrival_date_formatted, df=15, intercept=TRUE), data=df_resort, subset=-train)
lines(spline(df_resort$arrival_date_formatted[-train], predict(r.spline.fit.1)), col="blue", lty=1, lwd=2)
lines(spline(df_resort$arrival_date_formatted[-train], predict(r.spline.fit.2)), col="red", lty=1, lwd=2)
lines(spline(df_resort$arrival_date_formatted[-train], predict(r.spline.fit.3)), col="forestgreen", lty=1, lwd=2)
legend("topleft", lty=rep(1,3), lwd=rep(2,3), col=c("blue", "red", "forestgreen"), legend=c("df=13, 9 knots", "df=14, 10 knots", "df=15, 11 knots"))
All three models fit similarly. Let’s make our predictions with the model with 10 knots.
summary(r.spline.fit.2)
##
## Call:
## lm(formula = adr ~ bs(arrival_date_formatted, df = 14, intercept = TRUE),
## data = df_resort, subset = -train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -93.52 -21.84 -2.04 19.61 93.97
##
## Coefficients: (1 not defined because of singularities)
## Estimate
## (Intercept) 204.08
## bs(arrival_date_formatted, df = 14, intercept = TRUE)1 -95.36
## bs(arrival_date_formatted, df = 14, intercept = TRUE)2 15.65
## bs(arrival_date_formatted, df = 14, intercept = TRUE)3 -231.50
## bs(arrival_date_formatted, df = 14, intercept = TRUE)4 -116.29
## bs(arrival_date_formatted, df = 14, intercept = TRUE)5 -102.13
## bs(arrival_date_formatted, df = 14, intercept = TRUE)6 51.72
## bs(arrival_date_formatted, df = 14, intercept = TRUE)7 -36.40
## bs(arrival_date_formatted, df = 14, intercept = TRUE)8 -213.57
## bs(arrival_date_formatted, df = 14, intercept = TRUE)9 -94.01
## bs(arrival_date_formatted, df = 14, intercept = TRUE)10 -145.05
## bs(arrival_date_formatted, df = 14, intercept = TRUE)11 -31.47
## bs(arrival_date_formatted, df = 14, intercept = TRUE)12 -31.06
## bs(arrival_date_formatted, df = 14, intercept = TRUE)13 85.56
## bs(arrival_date_formatted, df = 14, intercept = TRUE)14 NA
## Std. Error t value
## (Intercept) 27.34 7.464
## bs(arrival_date_formatted, df = 14, intercept = TRUE)1 34.01 -2.804
## bs(arrival_date_formatted, df = 14, intercept = TRUE)2 42.35 0.370
## bs(arrival_date_formatted, df = 14, intercept = TRUE)3 64.72 -3.577
## bs(arrival_date_formatted, df = 14, intercept = TRUE)4 60.72 -1.915
## bs(arrival_date_formatted, df = 14, intercept = TRUE)5 36.23 -2.819
## bs(arrival_date_formatted, df = 14, intercept = TRUE)6 33.13 1.561
## bs(arrival_date_formatted, df = 14, intercept = TRUE)7 35.72 -1.019
## bs(arrival_date_formatted, df = 14, intercept = TRUE)8 40.53 -5.270
## bs(arrival_date_formatted, df = 14, intercept = TRUE)9 45.32 -2.074
## bs(arrival_date_formatted, df = 14, intercept = TRUE)10 35.17 -4.124
## bs(arrival_date_formatted, df = 14, intercept = TRUE)11 36.94 -0.852
## bs(arrival_date_formatted, df = 14, intercept = TRUE)12 31.80 -0.977
## bs(arrival_date_formatted, df = 14, intercept = TRUE)13 50.56 1.692
## bs(arrival_date_formatted, df = 14, intercept = TRUE)14 NA NA
## Pr(>|t|)
## (Intercept) 2.46e-11 ***
## bs(arrival_date_formatted, df = 14, intercept = TRUE)1 0.006008 **
## bs(arrival_date_formatted, df = 14, intercept = TRUE)2 0.712481
## bs(arrival_date_formatted, df = 14, intercept = TRUE)3 0.000525 ***
## bs(arrival_date_formatted, df = 14, intercept = TRUE)4 0.058147 .
## bs(arrival_date_formatted, df = 14, intercept = TRUE)5 0.005750 **
## bs(arrival_date_formatted, df = 14, intercept = TRUE)6 0.121457
## bs(arrival_date_formatted, df = 14, intercept = TRUE)7 0.310604
## bs(arrival_date_formatted, df = 14, intercept = TRUE)8 7.24e-07 ***
## bs(arrival_date_formatted, df = 14, intercept = TRUE)9 0.040466 *
## bs(arrival_date_formatted, df = 14, intercept = TRUE)10 7.42e-05 ***
## bs(arrival_date_formatted, df = 14, intercept = TRUE)11 0.396152
## bs(arrival_date_formatted, df = 14, intercept = TRUE)12 0.330844
## bs(arrival_date_formatted, df = 14, intercept = TRUE)13 0.093563 .
## bs(arrival_date_formatted, df = 14, intercept = TRUE)14 NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35.55 on 106 degrees of freedom
## Multiple R-squared: 0.7677, Adjusted R-squared: 0.7392
## F-statistic: 26.94 on 13 and 106 DF, p-value: < 2.2e-16
# R squared 0.7677
# Adjusted R Squared 0.7392
# Calculate MSE, RMSE, MAE, AIC, BIC for the model with df=14, 10 knots
yhat.r.spline <- r.spline.fit.2$fitted.values
y.test <- df_resort[-train, "adr"]
MSE.r.spline <- mean((yhat.r.spline - y.test)^2) #1116.474
RMSE.r.spline <- sqrt(mean((yhat.r.spline - y.test)^2)) # 33.41367
MAE.r.spline <- mean(abs(yhat.r.spline - y.test)) #25.96699
AIC.r.spline <- AIC(r.spline.fit.2) # 1212.697
BIC.r.spline <- BIC(r.spline.fit.2) # 1254.509
We evaluated several models. Let’s create a table to summarize their results:
# Create table with MSE, RMSE, MAE, AIC, BIC (for those that are available) for each model
results <- data.frame(Model = c("Baseline", "Linear regression", "Linear regression-stepwise", "Pruned tree", "Random forest", "Regression spline"),
MSE = c(MSE.baseline, MSE.linear.reg, MSE.linear.reg.step, MSE.tree.2.pruned, MSE.rf, MSE.r.spline),
RMSE = c(RMSE.baseline, RMSE.linear.reg, RMSE.linear.reg.step, RMSE.tree.2.pruned, RMSE.rf, RMSE.r.spline),
MAE = c(MAE.baseline, MAE.linear.reg, MAE.linear.reg.step, MAE.tree.2.pruned, MAE.rf, MAE.r.spline),
AIC = c(NA, AIC.linear.reg, AIC.linear.reg.step, NA, NA, AIC.r.spline),
BIC = c(NA, BIC.linear.reg, BIC.linear.reg.step, NA, NA, BIC.r.spline))
#results
kable(results, format="pandoc", digits=2, caption = "Summary results for each model performance") #Nicer table using kable
Model                      |     MSE |  RMSE |   MAE |     AIC |     BIC
---------------------------|---------|-------|-------|---------|--------
Baseline                   | 4960.99 | 70.43 | 59.02 |      NA |      NA
Linear regression          |  761.23 | 27.59 | 21.13 | 3456.80 | 3616.02
Linear regression-stepwise |  789.49 | 28.10 | 21.52 | 3452.19 | 3599.76
Pruned tree                | 1316.81 | 36.29 | 27.56 |      NA |      NA
Random forest              |  744.09 | 27.28 | 19.80 |      NA |      NA
Regression spline          | 1116.47 | 33.41 | 25.97 | 1212.70 | 1254.51
Let's also plot the predicted vs. actual values for all models.
# All predictions
predictions <- data.frame(actual = y,
baseline = baseline.pred,
linear.reg = yhat.linear.reg,
linear.reg.step = yhat.linear.reg.step,
tree.2.pruned = yhat.tree.2.pruned,
rf = yhat.rf,
r.spline = yhat.r.spline)
# Transform from wide to long for ggplot2
predictions_long <- melt(predictions,
id.vars="actual",
measure.vars=c("baseline", "linear.reg", "linear.reg.step", "tree.2.pruned", "rf", "r.spline"),
variable.name="model",
value.name="prediction"
)
# Plot actual vs. predicted values for each model
# Create new facet label names for each model to be used for plotting
model.labs <- c("Baseline", "Linear regression", "Linear regression step", "Decision tree", "Random forest", "Regression spline")
names(model.labs) <- c("baseline", "linear.reg", "linear.reg.step", "tree.2.pruned", "rf", "r.spline")
ggplot(data = predictions_long, aes(x=actual, y=prediction, colour=model)) +
geom_point() +
geom_abline(intercept = 0, slope = 1) +
facet_wrap( ~ model, labeller = labeller(model = model.labs)) +
ggtitle("Predicted vs. Actual values for each model") +
xlab("Actual values") + ylab("Predicted values") +
scale_colour_viridis_d(option="plasma") +
theme_light() +
theme(legend.position = "none") +
theme(strip.background = element_rect(color="black", size=1)) + #Facet label rectangle contour
theme(strip.text.x = element_text(color = "black")) #Facet label font colour
The summary table indicates that the best-performing model was the random forest, followed by the linear regression. All models make better predictions than the baseline model.