Data Wrangling in R¶
This is a notebook by Ogechi Anoliefo.
1. Loading the packages to be used (N/B: I already had them installed)¶
#included suppressPackageStartupMessages() to suppress the startup messages when loading the packages
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(psych))
2. Importing the dataset¶
df<- read_csv("C:\\Users\\ETIABA CHAMBER'S\\Documents\\Data_Analytics\\My_portfolio\\Hotel_Booking\\hotel_bookings.csv", show_col_types=FALSE)
#Confirming that the dataset had been properly imported
head(df)
str(df)
hotel | is_canceled | lead_time | arrival_date_year | arrival_date_month | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | ⋯ | deposit_type | agent | company | days_in_waiting_list | customer_type | adr | required_car_parking_spaces | total_of_special_requests | reservation_status | reservation_status_date |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> | <chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | ⋯ | <chr> | <chr> | <chr> | <dbl> | <chr> | <dbl> | <dbl> | <dbl> | <chr> | <date> |
Resort Hotel | 0 | 342 | 2015 | July | 27 | 1 | 0 | 0 | 2 | ⋯ | No Deposit | NULL | NULL | 0 | Transient | 0 | 0 | 0 | Check-Out | 2015-07-01 |
Resort Hotel | 0 | 737 | 2015 | July | 27 | 1 | 0 | 0 | 2 | ⋯ | No Deposit | NULL | NULL | 0 | Transient | 0 | 0 | 0 | Check-Out | 2015-07-01 |
Resort Hotel | 0 | 7 | 2015 | July | 27 | 1 | 0 | 1 | 1 | ⋯ | No Deposit | NULL | NULL | 0 | Transient | 75 | 0 | 0 | Check-Out | 2015-07-02 |
Resort Hotel | 0 | 13 | 2015 | July | 27 | 1 | 0 | 1 | 1 | ⋯ | No Deposit | 304 | NULL | 0 | Transient | 75 | 0 | 0 | Check-Out | 2015-07-02 |
Resort Hotel | 0 | 14 | 2015 | July | 27 | 1 | 0 | 2 | 2 | ⋯ | No Deposit | 240 | NULL | 0 | Transient | 98 | 0 | 1 | Check-Out | 2015-07-03 |
Resort Hotel | 0 | 14 | 2015 | July | 27 | 1 | 0 | 2 | 2 | ⋯ | No Deposit | 240 | NULL | 0 | Transient | 98 | 0 | 1 | Check-Out | 2015-07-03 |
spec_tbl_df [88,998 × 32] (S3: spec_tbl_df/tbl_df/tbl/data.frame) $ hotel : chr [1:88998] "Resort Hotel" "Resort Hotel" "Resort Hotel" "Resort Hotel" ... $ is_canceled : num [1:88998] 0 0 0 0 0 0 0 0 1 1 ... $ lead_time : num [1:88998] 342 737 7 13 14 14 0 9 85 75 ... $ arrival_date_year : num [1:88998] 2015 2015 2015 2015 2015 ... $ arrival_date_month : chr [1:88998] "July" "July" "July" "July" ... $ arrival_date_week_number : num [1:88998] 27 27 27 27 27 27 27 27 27 27 ... $ arrival_date_day_of_month : num [1:88998] 1 1 1 1 1 1 1 1 1 1 ... $ stays_in_weekend_nights : num [1:88998] 0 0 0 0 0 0 0 0 0 0 ... $ stays_in_week_nights : num [1:88998] 0 0 1 1 2 2 2 2 3 3 ... $ adults : num [1:88998] 2 2 1 1 2 2 2 2 2 2 ... $ children : num [1:88998] 0 0 0 0 0 0 0 0 0 0 ... $ babies : num [1:88998] 0 0 0 0 0 0 0 0 0 0 ... $ meal : chr [1:88998] "BB" "BB" "BB" "BB" ... $ country : chr [1:88998] "PRT" "PRT" "GBR" "GBR" ... $ market_segment : chr [1:88998] "Direct" "Direct" "Direct" "Corporate" ... $ distribution_channel : chr [1:88998] "Direct" "Direct" "Direct" "Corporate" ... $ is_repeated_guest : num [1:88998] 0 0 0 0 0 0 0 0 0 0 ... $ previous_cancellations : num [1:88998] 0 0 0 0 0 0 0 0 0 0 ... $ previous_bookings_not_canceled: num [1:88998] 0 0 0 0 0 0 0 0 0 0 ... $ reserved_room_type : chr [1:88998] "C" "C" "A" "A" ... $ assigned_room_type : chr [1:88998] "C" "C" "C" "A" ... $ booking_changes : num [1:88998] 3 4 0 0 0 0 0 0 0 0 ... $ deposit_type : chr [1:88998] "No Deposit" "No Deposit" "No Deposit" "No Deposit" ... $ agent : chr [1:88998] "NULL" "NULL" "NULL" "304" ... $ company : chr [1:88998] "NULL" "NULL" "NULL" "NULL" ... $ days_in_waiting_list : num [1:88998] 0 0 0 0 0 0 0 0 0 0 ... $ customer_type : chr [1:88998] "Transient" "Transient" "Transient" "Transient" ... $ adr : num [1:88998] 0 0 75 75 98 ... $ required_car_parking_spaces : num [1:88998] 0 0 0 0 0 0 0 0 0 0 ... $ total_of_special_requests : num [1:88998] 0 0 0 0 1 1 0 1 1 0 ... $ reservation_status : chr [1:88998] "Check-Out" "Check-Out" "Check-Out" "Check-Out" ... $ reservation_status_date : Date[1:88998], format: "2015-07-01" "2015-07-01" ... - attr(*, "spec")= .. cols( .. hotel = col_character(), .. is_canceled = col_double(), .. lead_time = col_double(), .. arrival_date_year = col_double(), .. arrival_date_month = col_character(), .. arrival_date_week_number = col_double(), .. arrival_date_day_of_month = col_double(), .. stays_in_weekend_nights = col_double(), .. stays_in_week_nights = col_double(), .. adults = col_double(), .. children = col_double(), .. babies = col_double(), .. meal = col_character(), .. country = col_character(), .. market_segment = col_character(), .. distribution_channel = col_character(), .. is_repeated_guest = col_double(), .. previous_cancellations = col_double(), .. previous_bookings_not_canceled = col_double(), .. reserved_room_type = col_character(), .. assigned_room_type = col_character(), .. booking_changes = col_double(), .. deposit_type = col_character(), .. agent = col_character(), .. company = col_character(), .. days_in_waiting_list = col_double(), .. customer_type = col_character(), .. adr = col_double(), .. required_car_parking_spaces = col_double(), .. total_of_special_requests = col_double(), .. reservation_status = col_character(), .. reservation_status_date = col_date(format = "") .. ) - attr(*, "problems")=<externalptr>
The dataset contained a total of 88,998 rows and 32 columns.
3. Cleaning the dataset¶
~ Handling duplicates¶
Since this was actual hotel data, all personal identifying information (such as name, phone number, email) had been removed from the data, and as such the primary key had been taken out. However, how I handled duplicates was by checking for and removing duplicate rows with the exact same values across all 32 columns. It is (near) impossible for any 2 guest bookings to have the same values across all 32 columns.
N/B: In a typical work scenario, I would use the primary key to check for duplicates.
#using the nrow() and duplicated() functions to find out the number of duplicate rows with same values across all 32 columns
nrow(df[duplicated(df[,1:32]),])
26,941 duplicate rows returned.
#dropping duplicates
df1 <- distinct(df, df[,1:32], .keep_all = TRUE)
nrow(df1)
62,057 entries left.
#double-checking that there were no more duplicates
df1[duplicated(df1[,1:32]),]
Warning message in cbind(parts$left, ellip_h, parts$right, deparse.level = 0L): "number of rows of result is not a multiple of vector length (arg 2)" Warning message in cbind(parts$left, ellip_h, parts$right, deparse.level = 0L): "number of rows of result is not a multiple of vector length (arg 2)" Warning message in cbind(parts$left, ellip_h, parts$right, deparse.level = 0L): "number of rows of result is not a multiple of vector length (arg 2)" Warning message in cbind(parts$left, ellip_h, parts$right, deparse.level = 0L): "number of rows of result is not a multiple of vector length (arg 2)"
hotel | is_canceled | lead_time | arrival_date_year | arrival_date_month | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | ⋯ | deposit_type | agent | company | days_in_waiting_list | customer_type | adr | required_car_parking_spaces | total_of_special_requests | reservation_status | reservation_status_date |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> | <chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | ⋯ | <chr> | <chr> | <chr> | <dbl> | <chr> | <dbl> | <dbl> | <dbl> | <chr> | <date> |
The last line of code above returned 0 entries, confirming that there were no duplicates in df1.
~ Dropping irrelevant columns¶
Because my analysis was focused on KPI changes over time, only the following 7 columns were useful for my analysis: hotel, is_canceelled, arrival_date_year, arrival_date_month, stays_in_weekend_nights, stays_in_week_nights, adr. I droped the remaining 25 columns.
#dropping irrelevant columns
df2 <- select(df1, -3, -6, -7, -10:-14, -15:-27, -29:-32)
str(df2)
tibble [62,057 × 7] (S3: tbl_df/tbl/data.frame) $ hotel : chr [1:62057] "Resort Hotel" "Resort Hotel" "Resort Hotel" "Resort Hotel" ... $ is_canceled : num [1:62057] 0 0 0 0 0 0 0 1 1 1 ... $ arrival_date_year : num [1:62057] 2015 2015 2015 2015 2015 ... $ arrival_date_month : chr [1:62057] "July" "July" "July" "July" ... $ stays_in_weekend_nights: num [1:62057] 0 0 0 0 0 0 0 0 0 0 ... $ stays_in_week_nights : num [1:62057] 0 0 1 1 2 2 2 3 3 4 ... $ adr : num [1:62057] 0 0 75 75 98 ...
~ Handling missing values¶
My next step was to check for missing values in the columns left.
#to return sum of missing values in df2
sum(is.na(df2))
No missing values.
~ Renaming some columns¶
I renamed some of the columns for clarity.
#renaming columns
df2 <- df2 %>%
rename(arrival_year = arrival_date_year,
arrival_month = arrival_date_month,
average_daily_rate = adr)
#double-checking that my columns had been properly renamed
str(df2)
head(df2)
tibble [62,057 × 7] (S3: tbl_df/tbl/data.frame) $ hotel : chr [1:62057] "Resort Hotel" "Resort Hotel" "Resort Hotel" "Resort Hotel" ... $ is_canceled : num [1:62057] 0 0 0 0 0 0 0 1 1 1 ... $ arrival_year : num [1:62057] 2015 2015 2015 2015 2015 ... $ arrival_month : chr [1:62057] "July" "July" "July" "July" ... $ stays_in_weekend_nights: num [1:62057] 0 0 0 0 0 0 0 0 0 0 ... $ stays_in_week_nights : num [1:62057] 0 0 1 1 2 2 2 3 3 4 ... $ average_daily_rate : num [1:62057] 0 0 75 75 98 ...
hotel | is_canceled | arrival_year | arrival_month | stays_in_weekend_nights | stays_in_week_nights | average_daily_rate |
---|---|---|---|---|---|---|
<chr> | <dbl> | <dbl> | <chr> | <dbl> | <dbl> | <dbl> |
Resort Hotel | 0 | 2015 | July | 0 | 0 | 0 |
Resort Hotel | 0 | 2015 | July | 0 | 0 | 0 |
Resort Hotel | 0 | 2015 | July | 0 | 1 | 75 |
Resort Hotel | 0 | 2015 | July | 0 | 1 | 75 |
Resort Hotel | 0 | 2015 | July | 0 | 2 | 98 |
Resort Hotel | 0 | 2015 | July | 0 | 2 | 107 |
~ Dropping entries that were not relevant to my analysis¶
I mentioned earlier that this analysis would focus on data for the period from July 2015 to June 2017 for the Resort Hotel alone, so I went ahead with entries that met the above conditions, and dropped the rest.
Also the dataset is such that the "is_cancelled" column cointains only 0 and 1 values. 0 represents when a booking was not cancelled and 1 represents when a booking was cancelled. For my analysis, I went ahead with only the cases where the booking was not cancelled (ie. I dropped the entries where the booking was cancelled)
#selecting only enties where the booking was not cancelled, and the hotel was Resort Hotel
df3 <- df2[df2$is_canceled == 0 & df2$hotel == 'Resort Hotel',]
nrow(df3)
25,992 entries.
#dropping all entries after June 2017 (ie. dropping entries for July 2017 and August 2017)
df4 <- df3[!((df3$arrival_month == "July" & df3$arrival_year == 2017) |
(df3$arrival_month == "August" & df3$arrival_year == 2017)),]
str(df4)
tibble [23,856 × 7] (S3: tbl_df/tbl/data.frame) $ hotel : chr [1:23856] "Resort Hotel" "Resort Hotel" "Resort Hotel" "Resort Hotel" ... $ is_canceled : num [1:23856] 0 0 0 0 0 0 0 0 0 0 ... $ arrival_year : num [1:23856] 2015 2015 2015 2015 2015 ... $ arrival_month : chr [1:23856] "July" "July" "July" "July" ... $ stays_in_weekend_nights: num [1:23856] 0 0 0 0 0 0 0 0 0 0 ... $ stays_in_week_nights : num [1:23856] 0 0 1 1 2 2 2 4 4 4 ... $ average_daily_rate : num [1:23856] 0 0 75 75 98 ...
Dropped 2,136 rows; 23,856 rows left
~ Dropping bad data¶
#looking at the statistics for numeric columns
describe(df4)
vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<int> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | |
hotel* | 1 | 23856 | 1.000000 | 0.0000000 | 1 | 1.000000 | 0.0000 | 1.00 | 1 | 0.00 | NaN | NaN | 0.000000000 |
is_canceled | 2 | 23856 | 0.000000 | 0.0000000 | 0 | 0.000000 | 0.0000 | 0.00 | 0 | 0.00 | NaN | NaN | 0.000000000 |
arrival_year | 3 | 23856 | 2016.028630 | 0.6980359 | 2016 | 2016.035785 | 0.0000 | 2015.00 | 2017 | 2.00 | -0.03893328 | -0.9480246 | 0.004519381 |
arrival_month* | 4 | 23856 | 6.512282 | 3.4758465 | 7 | 6.530756 | 4.4478 | 1.00 | 12 | 11.00 | -0.04160526 | -1.2283331 | 0.022504107 |
stays_in_weekend_nights | 5 | 23856 | 1.132839 | 1.1630451 | 1 | 1.005921 | 1.4826 | 0.00 | 19 | 19.00 | 1.57992437 | 9.3931871 | 0.007530048 |
stays_in_week_nights | 6 | 23856 | 2.987215 | 2.4957150 | 2 | 2.701719 | 1.4826 | 0.00 | 50 | 50.00 | 2.37788804 | 18.1145363 | 0.016158319 |
average_daily_rate | 7 | 23856 | 84.532770 | 52.9328086 | 69 | 77.508349 | 38.5476 | -6.38 | 508 | 514.38 | 1.31861336 | 1.9900011 | 0.342709486 |
From the summary statistics above, I saw that the minimum Average Daily Rate was a negaitive number. This should not be possible, so I took a closer look.
#checking for entries where Average Daily Rate was below 0
df4[(df4$average_daily_rate < 0),]
hotel | is_canceled | arrival_year | arrival_month | stays_in_weekend_nights | stays_in_week_nights | average_daily_rate |
---|---|---|---|---|---|---|
<chr> | <dbl> | <dbl> | <chr> | <dbl> | <dbl> | <dbl> |
Resort Hotel | 0 | 2017 | March | 4 | 6 | -6.38 |
There was only one entry with Average Daily Rate less than 0. I dropped this entry
#dropping bad entry where Average Daily Rate was negative
df5 <- df4[!(df4$average_daily_rate < 0),]
str(df5)
tibble [23,855 × 7] (S3: tbl_df/tbl/data.frame) $ hotel : chr [1:23855] "Resort Hotel" "Resort Hotel" "Resort Hotel" "Resort Hotel" ... $ is_canceled : num [1:23855] 0 0 0 0 0 0 0 0 0 0 ... $ arrival_year : num [1:23855] 2015 2015 2015 2015 2015 ... $ arrival_month : chr [1:23855] "July" "July" "July" "July" ... $ stays_in_weekend_nights: num [1:23855] 0 0 0 0 0 0 0 0 0 0 ... $ stays_in_week_nights : num [1:23855] 0 0 1 1 2 2 2 4 4 4 ... $ average_daily_rate : num [1:23855] 0 0 75 75 98 ...
23,855 entries left.
#double-checking that there were no more Average Daily Rate entries that were less than 0
df5[(df5$average_daily_rate < 0),]
hotel | is_canceled | arrival_year | arrival_month | stays_in_weekend_nights | stays_in_week_nights | average_daily_rate |
---|---|---|---|---|---|---|
<chr> | <dbl> | <dbl> | <chr> | <dbl> | <dbl> | <dbl> |
No more entries with Average Daily Rate less than 0.
4. Adding additional columns¶
To aid my analysis, I added 2 calculated columns:
i). "no_of days_stayed" to calculate the total number of days that each guest stayed.
ii). "guest_spend" to calculate the amount spent by each guest.
df5 <- df5 %>%
mutate(no_of_days_stayed = stays_in_weekend_nights + stays_in_week_nights)
df5 <- df5 %>%
mutate(guest_spend = no_of_days_stayed * average_daily_rate)
#double-checking that my columns were successfully added
str(df5)
head(df5, 10)
tibble [23,855 × 9] (S3: tbl_df/tbl/data.frame) $ hotel : chr [1:23855] "Resort Hotel" "Resort Hotel" "Resort Hotel" "Resort Hotel" ... $ is_canceled : num [1:23855] 0 0 0 0 0 0 0 0 0 0 ... $ arrival_year : num [1:23855] 2015 2015 2015 2015 2015 ... $ arrival_month : chr [1:23855] "July" "July" "July" "July" ... $ stays_in_weekend_nights: num [1:23855] 0 0 0 0 0 0 0 0 0 0 ... $ stays_in_week_nights : num [1:23855] 0 0 1 1 2 2 2 4 4 4 ... $ average_daily_rate : num [1:23855] 0 0 75 75 98 ... $ no_of_days_stayed : num [1:23855] 0 0 1 1 2 2 2 4 4 4 ... $ guest_spend : num [1:23855] 0 0 75 75 196 ...
hotel | is_canceled | arrival_year | arrival_month | stays_in_weekend_nights | stays_in_week_nights | average_daily_rate | no_of_days_stayed | guest_spend |
---|---|---|---|---|---|---|---|---|
<chr> | <dbl> | <dbl> | <chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> |
Resort Hotel | 0 | 2015 | July | 0 | 0 | 0.00 | 0 | 0.00 |
Resort Hotel | 0 | 2015 | July | 0 | 0 | 0.00 | 0 | 0.00 |
Resort Hotel | 0 | 2015 | July | 0 | 1 | 75.00 | 1 | 75.00 |
Resort Hotel | 0 | 2015 | July | 0 | 1 | 75.00 | 1 | 75.00 |
Resort Hotel | 0 | 2015 | July | 0 | 2 | 98.00 | 2 | 196.00 |
Resort Hotel | 0 | 2015 | July | 0 | 2 | 107.00 | 2 | 214.00 |
Resort Hotel | 0 | 2015 | July | 0 | 2 | 103.00 | 2 | 206.00 |
Resort Hotel | 0 | 2015 | July | 0 | 4 | 145.00 | 4 | 580.00 |
Resort Hotel | 0 | 2015 | July | 0 | 4 | 97.00 | 4 | 388.00 |
Resort Hotel | 0 | 2015 | July | 0 | 4 | 154.77 | 4 | 619.08 |
5. Exporting my cleaned dataset¶
#saving my final dataset as a CSV file called "hotel_booking_cleaned"
write.csv(df5, "C:\\Users\\ETIABA CHAMBER'S\\Documents\\Data_Analytics\\My_portfolio\\Hotel_Booking\\hotel_bookings_cleaned.csv", row.names = FALSE)