#included suppressPackageStartupMessages() to suppress the startup messages when loading the packages
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(psych))

df<- read_csv("C:\\Users\\ETIABA CHAMBER'S\\Documents\\Data_Analytics\\My_portfolio\\Hotel_Booking\\hotel_bookings.csv", show_col_types=FALSE)

#Confirming that the dataset had been properly imported
head(df)
str(df)

spec_tbl_df [88,998 × 32] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ hotel                         : chr [1:88998] "Resort Hotel" "Resort Hotel" "Resort Hotel" "Resort Hotel" ...
 $ is_canceled                   : num [1:88998] 0 0 0 0 0 0 0 0 1 1 ...
 $ lead_time                     : num [1:88998] 342 737 7 13 14 14 0 9 85 75 ...
 $ arrival_date_year             : num [1:88998] 2015 2015 2015 2015 2015 ...
 $ arrival_date_month            : chr [1:88998] "July" "July" "July" "July" ...
 $ arrival_date_week_number      : num [1:88998] 27 27 27 27 27 27 27 27 27 27 ...
 $ arrival_date_day_of_month     : num [1:88998] 1 1 1 1 1 1 1 1 1 1 ...
 $ stays_in_weekend_nights       : num [1:88998] 0 0 0 0 0 0 0 0 0 0 ...
 $ stays_in_week_nights          : num [1:88998] 0 0 1 1 2 2 2 2 3 3 ...
 $ adults                        : num [1:88998] 2 2 1 1 2 2 2 2 2 2 ...
 $ children                      : num [1:88998] 0 0 0 0 0 0 0 0 0 0 ...
 $ babies                        : num [1:88998] 0 0 0 0 0 0 0 0 0 0 ...
 $ meal                          : chr [1:88998] "BB" "BB" "BB" "BB" ...
 $ country                       : chr [1:88998] "PRT" "PRT" "GBR" "GBR" ...
 $ market_segment                : chr [1:88998] "Direct" "Direct" "Direct" "Corporate" ...
 $ distribution_channel          : chr [1:88998] "Direct" "Direct" "Direct" "Corporate" ...
 $ is_repeated_guest             : num [1:88998] 0 0 0 0 0 0 0 0 0 0 ...
 $ previous_cancellations        : num [1:88998] 0 0 0 0 0 0 0 0 0 0 ...
 $ previous_bookings_not_canceled: num [1:88998] 0 0 0 0 0 0 0 0 0 0 ...
 $ reserved_room_type            : chr [1:88998] "C" "C" "A" "A" ...
 $ assigned_room_type            : chr [1:88998] "C" "C" "C" "A" ...
 $ booking_changes               : num [1:88998] 3 4 0 0 0 0 0 0 0 0 ...
 $ deposit_type                  : chr [1:88998] "No Deposit" "No Deposit" "No Deposit" "No Deposit" ...
 $ agent                         : chr [1:88998] "NULL" "NULL" "NULL" "304" ...
 $ company                       : chr [1:88998] "NULL" "NULL" "NULL" "NULL" ...
 $ days_in_waiting_list          : num [1:88998] 0 0 0 0 0 0 0 0 0 0 ...
 $ customer_type                 : chr [1:88998] "Transient" "Transient" "Transient" "Transient" ...
 $ adr                           : num [1:88998] 0 0 75 75 98 ...
 $ required_car_parking_spaces   : num [1:88998] 0 0 0 0 0 0 0 0 0 0 ...
 $ total_of_special_requests     : num [1:88998] 0 0 0 0 1 1 0 1 1 0 ...
 $ reservation_status            : chr [1:88998] "Check-Out" "Check-Out" "Check-Out" "Check-Out" ...
 $ reservation_status_date       : Date[1:88998], format: "2015-07-01" "2015-07-01" ...
 - attr(*, "spec")=
  .. cols(
  ..   hotel = col_character(),
  ..   is_canceled = col_double(),
  ..   lead_time = col_double(),
  ..   arrival_date_year = col_double(),
  ..   arrival_date_month = col_character(),
  ..   arrival_date_week_number = col_double(),
  ..   arrival_date_day_of_month = col_double(),
  ..   stays_in_weekend_nights = col_double(),
  ..   stays_in_week_nights = col_double(),
  ..   adults = col_double(),
  ..   children = col_double(),
  ..   babies = col_double(),
  ..   meal = col_character(),
  ..   country = col_character(),
  ..   market_segment = col_character(),
  ..   distribution_channel = col_character(),
  ..   is_repeated_guest = col_double(),
  ..   previous_cancellations = col_double(),
  ..   previous_bookings_not_canceled = col_double(),
  ..   reserved_room_type = col_character(),
  ..   assigned_room_type = col_character(),
  ..   booking_changes = col_double(),
  ..   deposit_type = col_character(),
  ..   agent = col_character(),
  ..   company = col_character(),
  ..   days_in_waiting_list = col_double(),
  ..   customer_type = col_character(),
  ..   adr = col_double(),
  ..   required_car_parking_spaces = col_double(),
  ..   total_of_special_requests = col_double(),
  ..   reservation_status = col_character(),
  ..   reservation_status_date = col_date(format = "")
  .. )
 - attr(*, "problems")=<externalptr>

#using the nrow() and duplicated() functions to find out the number of duplicate rows with same values across all 32 columns
nrow(df[duplicated(df[,1:32]),])

#dropping duplicates
df1 <- distinct(df, df[,1:32], .keep_all = TRUE)
nrow(df1)

#double-checking that there were no more duplicates 
df1[duplicated(df1[,1:32]),]

Warning message in cbind(parts$left, ellip_h, parts$right, deparse.level = 0L):
"number of rows of result is not a multiple of vector length (arg 2)"
Warning message in cbind(parts$left, ellip_h, parts$right, deparse.level = 0L):
"number of rows of result is not a multiple of vector length (arg 2)"
Warning message in cbind(parts$left, ellip_h, parts$right, deparse.level = 0L):
"number of rows of result is not a multiple of vector length (arg 2)"
Warning message in cbind(parts$left, ellip_h, parts$right, deparse.level = 0L):
"number of rows of result is not a multiple of vector length (arg 2)"

#dropping irrelevant columns
df2 <- select(df1, -3, -6, -7, -10:-14, -15:-27, -29:-32)
str(df2)

tibble [62,057 × 7] (S3: tbl_df/tbl/data.frame)
 $ hotel                  : chr [1:62057] "Resort Hotel" "Resort Hotel" "Resort Hotel" "Resort Hotel" ...
 $ is_canceled            : num [1:62057] 0 0 0 0 0 0 0 1 1 1 ...
 $ arrival_date_year      : num [1:62057] 2015 2015 2015 2015 2015 ...
 $ arrival_date_month     : chr [1:62057] "July" "July" "July" "July" ...
 $ stays_in_weekend_nights: num [1:62057] 0 0 0 0 0 0 0 0 0 0 ...
 $ stays_in_week_nights   : num [1:62057] 0 0 1 1 2 2 2 3 3 4 ...
 $ adr                    : num [1:62057] 0 0 75 75 98 ...

#to return sum of missing values in df2
sum(is.na(df2))

#renaming columns
df2 <- df2 %>%
  rename(arrival_year = arrival_date_year,
         arrival_month = arrival_date_month,
         average_daily_rate = adr)

#double-checking that my columns had been properly renamed
str(df2)
head(df2)

tibble [62,057 × 7] (S3: tbl_df/tbl/data.frame)
 $ hotel                  : chr [1:62057] "Resort Hotel" "Resort Hotel" "Resort Hotel" "Resort Hotel" ...
 $ is_canceled            : num [1:62057] 0 0 0 0 0 0 0 1 1 1 ...
 $ arrival_year           : num [1:62057] 2015 2015 2015 2015 2015 ...
 $ arrival_month          : chr [1:62057] "July" "July" "July" "July" ...
 $ stays_in_weekend_nights: num [1:62057] 0 0 0 0 0 0 0 0 0 0 ...
 $ stays_in_week_nights   : num [1:62057] 0 0 1 1 2 2 2 3 3 4 ...
 $ average_daily_rate     : num [1:62057] 0 0 75 75 98 ...

#selecting only enties where the booking was not cancelled, and the hotel was Resort Hotel
df3 <- df2[df2$is_canceled == 0 & df2$hotel == 'Resort Hotel',]
nrow(df3)

#dropping all entries after June 2017 (ie. dropping entries for July 2017 and August 2017)
df4 <- df3[!((df3$arrival_month == "July" & df3$arrival_year == 2017) | 
             (df3$arrival_month == "August" & df3$arrival_year == 2017)),]

str(df4)

tibble [23,856 × 7] (S3: tbl_df/tbl/data.frame)
 $ hotel                  : chr [1:23856] "Resort Hotel" "Resort Hotel" "Resort Hotel" "Resort Hotel" ...
 $ is_canceled            : num [1:23856] 0 0 0 0 0 0 0 0 0 0 ...
 $ arrival_year           : num [1:23856] 2015 2015 2015 2015 2015 ...
 $ arrival_month          : chr [1:23856] "July" "July" "July" "July" ...
 $ stays_in_weekend_nights: num [1:23856] 0 0 0 0 0 0 0 0 0 0 ...
 $ stays_in_week_nights   : num [1:23856] 0 0 1 1 2 2 2 4 4 4 ...
 $ average_daily_rate     : num [1:23856] 0 0 75 75 98 ...

#looking at the statistics for numeric columns  
describe(df4)

#checking for entries where Average Daily Rate was below 0
df4[(df4$average_daily_rate < 0),]

#dropping bad entry where Average Daily Rate was negative
df5 <- df4[!(df4$average_daily_rate < 0),]
str(df5)

tibble [23,855 × 7] (S3: tbl_df/tbl/data.frame)
 $ hotel                  : chr [1:23855] "Resort Hotel" "Resort Hotel" "Resort Hotel" "Resort Hotel" ...
 $ is_canceled            : num [1:23855] 0 0 0 0 0 0 0 0 0 0 ...
 $ arrival_year           : num [1:23855] 2015 2015 2015 2015 2015 ...
 $ arrival_month          : chr [1:23855] "July" "July" "July" "July" ...
 $ stays_in_weekend_nights: num [1:23855] 0 0 0 0 0 0 0 0 0 0 ...
 $ stays_in_week_nights   : num [1:23855] 0 0 1 1 2 2 2 4 4 4 ...
 $ average_daily_rate     : num [1:23855] 0 0 75 75 98 ...

#double-checking that there were no more Average Daily Rate entries that were less than 0 
df5[(df5$average_daily_rate < 0),]

df5 <- df5 %>%
  mutate(no_of_days_stayed = stays_in_weekend_nights + stays_in_week_nights)

df5 <- df5 %>%
  mutate(guest_spend = no_of_days_stayed * average_daily_rate)

#double-checking that my columns were successfully added
str(df5)
head(df5, 10)

tibble [23,855 × 9] (S3: tbl_df/tbl/data.frame)
 $ hotel                  : chr [1:23855] "Resort Hotel" "Resort Hotel" "Resort Hotel" "Resort Hotel" ...
 $ is_canceled            : num [1:23855] 0 0 0 0 0 0 0 0 0 0 ...
 $ arrival_year           : num [1:23855] 2015 2015 2015 2015 2015 ...
 $ arrival_month          : chr [1:23855] "July" "July" "July" "July" ...
 $ stays_in_weekend_nights: num [1:23855] 0 0 0 0 0 0 0 0 0 0 ...
 $ stays_in_week_nights   : num [1:23855] 0 0 1 1 2 2 2 4 4 4 ...
 $ average_daily_rate     : num [1:23855] 0 0 75 75 98 ...
 $ no_of_days_stayed      : num [1:23855] 0 0 1 1 2 2 2 4 4 4 ...
 $ guest_spend            : num [1:23855] 0 0 75 75 196 ...

#saving my final dataset as a CSV file called "hotel_booking_cleaned"
write.csv(df5, "C:\\Users\\ETIABA CHAMBER'S\\Documents\\Data_Analytics\\My_portfolio\\Hotel_Booking\\hotel_bookings_cleaned.csv", row.names = FALSE)

	vars	n	mean	sd	median	trimmed	mad	min	max	range	skew	kurtosis	se
	<int>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
hotel*	1	23856	1.000000	0.0000000	1	1.000000	0.0000	1.00	1	0.00	NaN	NaN	0.000000000
is_canceled	2	23856	0.000000	0.0000000	0	0.000000	0.0000	0.00	0	0.00	NaN	NaN	0.000000000
arrival_year	3	23856	2016.028630	0.6980359	2016	2016.035785	0.0000	2015.00	2017	2.00	-0.03893328	-0.9480246	0.004519381
arrival_month*	4	23856	6.512282	3.4758465	7	6.530756	4.4478	1.00	12	11.00	-0.04160526	-1.2283331	0.022504107
stays_in_weekend_nights	5	23856	1.132839	1.1630451	1	1.005921	1.4826	0.00	19	19.00	1.57992437	9.3931871	0.007530048
stays_in_week_nights	6	23856	2.987215	2.4957150	2	2.701719	1.4826	0.00	50	50.00	2.37788804	18.1145363	0.016158319
average_daily_rate	7	23856	84.532770	52.9328086	69	77.508349	38.5476	-6.38	508	514.38	1.31861336	1.9900011	0.342709486

Data Wrangling in R¶

1. Loading the packages to be used (N/B: I already had them installed)¶

2. Importing the dataset¶

3. Cleaning the dataset¶

~ Handling duplicates¶

~ Dropping irrelevant columns¶

~ Handling missing values¶

~ Renaming some columns¶

~ Dropping entries that were not relevant to my analysis¶

~ Dropping bad data¶

4. Adding additional columns¶

5. Exporting my cleaned dataset¶

hotel	is_canceled	lead_time	arrival_date_year	arrival_date_month	arrival_date_week_number	arrival_date_day_of_month	stays_in_weekend_nights	stays_in_week_nights	adults	⋯	deposit_type	agent	company	days_in_waiting_list	customer_type	adr	required_car_parking_spaces	total_of_special_requests	reservation_status	reservation_status_date
<chr>	<dbl>	<dbl>	<dbl>	<chr>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	⋯	<chr>	<chr>	<chr>	<dbl>	<chr>	<dbl>	<dbl>	<dbl>	<chr>	<date>
Resort Hotel	0	342	2015	July	27	1	0	0	2	⋯	No Deposit	NULL	NULL	0	Transient	0	0	0	Check-Out	2015-07-01
Resort Hotel	0	737	2015	July	27	1	0	0	2	⋯	No Deposit	NULL	NULL	0	Transient	0	0	0	Check-Out	2015-07-01
Resort Hotel	0	7	2015	July	27	1	0	1	1	⋯	No Deposit	NULL	NULL	0	Transient	75	0	0	Check-Out	2015-07-02
Resort Hotel	0	13	2015	July	27	1	0	1	1	⋯	No Deposit	304	NULL	0	Transient	75	0	0	Check-Out	2015-07-02
Resort Hotel	0	14	2015	July	27	1	0	2	2	⋯	No Deposit	240	NULL	0	Transient	98	0	1	Check-Out	2015-07-03
Resort Hotel	0	14	2015	July	27	1	0	2	2	⋯	No Deposit	240	NULL	0	Transient	98	0	1	Check-Out	2015-07-03