Anda di halaman 1dari 24

> setwd("G:\\Management-IMT\\Analytics\\code")

>
> #loading the required libraries
> library(rpart)
> library(rattle)
> library(rpart.plot)
> library(beanplot)
> library(dplyr)
> library(RColorBrewer)
> library(randomForest)
> library(magrittr)
>
> # reading the data files
> bike=read.csv("Bike Rental Data.csv", header = TRUE)
>
> # Get the feel about the data
> str(bike)
'data.frame': 17379 obs. of 17 variables:
$ instant : int 1 2 3 4 5 6 7 8 9 10 ...
$ dteday : Factor w/ 731 levels "2011-01-01","2011-01-02",..: 1 1 1 1
1 1 1 1 1 1 ...
$ season : int 1 1 1 1 1 1 1 1 1 1 ...
$ yr : int 0 0 0 0 0 0 0 0 0 0 ...
$ mnth : int 1 1 1 1 1 1 1 1 1 1 ...
$ hr : int 0 1 2 3 4 5 6 7 8 9 ...
$ holiday : int 0 0 0 0 0 0 0 0 0 0 ...
$ weekday : int 6 6 6 6 6 6 6 6 6 6 ...
$ workingday: int 0 0 0 0 0 0 0 0 0 0 ...
$ weathersit: int 1 1 1 1 1 2 1 1 1 1 ...
$ temp : num 0.24 0.22 0.22 0.24 0.24 0.24 0.22 0.2 0.24 0.32 ...
$ atemp : num 0.288 0.273 0.273 0.288 0.288 ...
$ hum : num 0.81 0.8 0.8 0.75 0.75 0.75 0.8 0.86 0.75 0.76 ...
$ windspeed : num 0 0 0 0 0 0.0896 0 0 0 0 ...
$ casual : int 3 8 5 3 0 0 2 1 1 8 ...
$ registered: int 13 32 27 10 1 1 0 2 7 6 ...
$ cnt : int 16 40 32 13 1 1 2 3 8 14 ...
> summary(bike)
instant dteday season yr
mnth
Min. : 1 2011-01-01: 24 Min. :1.000 Min. :0.0000 Min.
: 1.000
1st Qu.: 4346 2011-01-08: 24 1st Qu.:2.000 1st Qu.:0.0000 1st
Qu.: 4.000
Median : 8690 2011-01-09: 24 Median :3.000 Median :1.0000
Median : 7.000
Mean : 8690 2011-01-10: 24 Mean :2.502 Mean :0.5026 Mean
: 6.538
3rd Qu.:13034 2011-01-13: 24 3rd Qu.:3.000 3rd Qu.:1.0000 3rd
Qu.:10.000
Max. :17379 2011-01-15: 24 Max. :4.000 Max. :1.0000 Max.
:12.000
(Other) :17235
hr holiday weekday workingday
weathersit
Min. : 0.00 Min. :0.00000 Min. :0.000 Min. :0.0000 Min.
:1.000
1st Qu.: 6.00 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000 1st
Qu.:1.000
Median :12.00 Median :0.00000 Median :3.000 Median :1.0000 Median
:1.000
Mean :11.55 Mean :0.02877 Mean :3.004 Mean :0.6827 Mean
:1.425
3rd Qu.:18.00 3rd Qu.:0.00000 3rd Qu.:5.000 3rd Qu.:1.0000 3rd
Qu.:2.000
Max. :23.00 Max. :1.00000 Max. :6.000 Max. :1.0000 Max.
:4.000

temp atemp hum windspeed


casual
Min. :0.020 Min. :0.0000 Min. :0.0000 Min. :0.0000 Min.
: 0.00
1st Qu.:0.340 1st Qu.:0.3333 1st Qu.:0.4800 1st Qu.:0.1045 1st
Qu.: 4.00
Median :0.500 Median :0.4848 Median :0.6300 Median :0.1940 Median
: 17.00
Mean :0.497 Mean :0.4758 Mean :0.6272 Mean :0.1901 Mean
: 35.68
3rd Qu.:0.660 3rd Qu.:0.6212 3rd Qu.:0.7800 3rd Qu.:0.2537 3rd
Qu.: 48.00
Max. :1.000 Max. :1.0000 Max. :1.0000 Max. :0.8507 Max.
:367.00

registered cnt
Min. : 0.0 Min. : 1.0
1st Qu.: 34.0 1st Qu.: 40.0
Median :115.0 Median :142.0
Mean :153.8 Mean :189.5
3rd Qu.:220.0 3rd Qu.:281.0
Max. :886.0 Max. :977.0

> dim(bike)
[1] 17379 17
> names(bike)
[1] "instant" "dteday" "season" "yr" "mnth" "hr"
"holiday"
[8] "weekday" "workingday" "weathersit" "temp" "atemp"
"hum" "windspeed"
[15] "casual" "registered" "cnt"
>
> # are there any missing values in the data?
> table(is.na(bike))

FALSE
295443
>
> # Recode Season values 1-4 to Spring, Summer, Fall, Winter
>
> # Function to recode values
> recodev <- function(original.vector, old.values, new.values) {
+ new.vector <- original.vector
+ for (i in 1:length(old.values)) {
+ change.log <- original.vector == old.values[i] &
+ is.na(original.vector) == F
+ new.vector[change.log] <- new.values[i]
+ }
+ return(new.vector)
+ }
>
> # apply recodev function for recoding season values
> bike$season <- recodev(original.vector = data$season,
+ old.values = c(1:4),
+ new.values = c("Spring","Summer","Fall",
+ "Winter"))
Error in data$season : object of type 'closure' is not subsettable
> table(bike$season)

1 2 3 4
4242 4409 4496 4232
>
> # What is the most popular season for bike rentals
> boxplot(count ~ season,
+ data = bike,
+ xlab = "Season",
+ ylab = "Count",
+ main = "Count by Season",
+ col = "yellow3")
Error in model.frame.default(formula = count ~ season, data = bike) :
object is not a matrix
>
> # check seasons and respective months
> unique(bike$month[bike$season=="Spring"])
NULL
> unique(bike$month[bike$season=="Summer"])
NULL
> unique(bike$month[bike$season=="Fall"])
NULL
> unique(bike$month[bike$season=="Winter"])
NULL
>
> # temp : Normalized temperature in Celsius. The values are derived via
(t-t_min)/(t_max-t_min), t_min=-8, t_max=+39 (only in hourly scale)
> # atemp: Normalized feeling temperature in Celsius. The values are
derived via (t-t_min)/(t_max-t_min), t_min=-16, t_max=+50 (only in hourly
scale)
> # temp and atemp are in normalized form
> # we need to denormalize it before using
>
> # create a function for denormalise temp, atemp
> tconvert <- function(min, max, vector){
+ result <- vector * (max - min) + min
+ return (result)
+ }
>
> # apply the function and denormalise the temperature values
> bike$temp <- tconvert(-8, 39, bike$temp)
> bike$atemp <- tconvert(-16, 50, bike$atemp)
>
> # calculate mean, std.dev and median for each season
> season_agg <- bike %>%
+ group_by(season) %>%
+ summarise(
+ temp.min = min(temp),
+ temp.max = max(temp),
+ temp.med = median(temp),
+ temp.stdev = sd(temp),
+ temp.mean = mean(temp),
+ count = n())
> season_agg
# A tibble: 4 x 7
season temp.min temp.max temp.med temp.stdev temp.mean count
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 1 -7.06 25.8 5.16 5.58 6.06 4242
2 2 -0.480 36.2 18.3 6.54 17.6 4409
3 3 9.86 39 24.9 4.41 25.2 4496
4 4 -1.42 27.7 11.7 5.74 11.9 4232
>
> # create a boxplot for temperature by season
> boxplot(temp ~ season,
+ data = bike,
+ xlab = "Season",
+ ylab = "Temperature",
+ main = "Temperature by Season",
+ col = "skyblue")
>
> # rename columns yr and mnth.. just to improve readability
> names(bike)[4:5] <- c("year","month")
>
> # recode year values to 2011, 2012
> bike$year <- recodev(original.vector = bike$year,
+ old.values = c(0,1),
+ new.values = c(2011,2012))
> # check column names
> names(bike)
[1] "instant" "dteday" "season" "year" "month" "hr"
"holiday"
[8] "weekday" "workingday" "weathersit" "temp" "atemp"
"hum" "windspeed"
[15] "casual" "registered" "cnt"
>
> # Similarly rename hum and cnt to meaningful names
> names(bike)[names(bike)=="hum"] <- "humidity"
> names(bike)[names(bike)=="cnt"] <- "count"
> names(bike)
[1] "instant" "dteday" "season" "year" "month" "hr"
"holiday"
[8] "weekday" "workingday" "weathersit" "temp" "atemp"
"humidity" "windspeed"
[15] "casual" "registered" "count"
>
> # understand the distribution of numerical variables..
> # setting margins for the graphs
> par(mfrow=c(4,2))
> par(mar = rep(2, 4))
> hist(bike$weather)
> hist(bike$humidity)
> hist(bike$holiday)
> hist(bike$workingday)
> hist(bike$temp)
> hist(bike$atemp)
> hist(bike$windspeed)
>
> # For which weather condition the number of total bike rentals are the
lowest/highest?
>
> # create a beanplot for number of bike rents per each weather condition
> par(mfrow=c(1,1))
> par(mar=c(4,4,4,4))
> bean.cols <- lapply(brewer.pal(6, "Set3"),
+ function(x){return(c(x, "black", "gray", "red"))})
> beanplot(count ~ weathersit,
+ data = bike,
+ main = "Bike Rents by Weather Condition",
+ xlab = "Weather Condition",
+ ylab = "Number of rentals",
+ col = bean.cols,
+ lwd = 1,
+ what = c (1,1,1,0),
+ log = ""
+ )
>
> # What is the mean, median and standard deviation for total number of
rentals
> # (count) per season?
> # create a data frame
> df <- data.frame(Spring = rep(NA, 3),
+ Winter = rep(NA, 3),
+ Summer = rep(NA, 3),
+ Fall = rep(NA, 3))
> row.names(df) <- c("mean", "median", "sd")
>
> # fill the data frame with corresponding mean, median and sd values
> vec <- c ("mean","median","sd")
> for (n in vec){
+ for (i in unique(bike$season)) {
+ my.fun <- get(n)
+ res <- my.fun(bike$count[bike$season == i])
+ df[n,i] <- res
+ }
+ }
> df
Spring Winter Summer Fall
mean 111.1146 208.3441 236.0162 198.8689
median 76.0000 165.0000 199.0000 155.5000
sd 119.2240 188.3625 197.7116 182.9680
>
> # ANOVA of season
> summary(aov(count ~ season, data = bike))
Df Sum Sq Mean Sq F value Pr(>F)
season 1 18127040 18127040 569 <2e-16 ***
Residuals 17377 553634551 31860
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
>
> # pairwise comparison of means for seasons in order to identify any
difference between
> # two means that is greater than the expected standard error
> TukeyHSD(aov(count ~ season, data = bike))
Error in TukeyHSD.aov(aov(count ~ season, data = bike)) :
no factors in the fitted model
In addition: Warning message:
In replications(paste("~", xx), data = mf) : non-factors ignored: season
>
> # correlation test for count~atemp
> t1 <- cor.test(bike$atemp[bike$year == 2011],
+ bike$count[bike$year == 2011])
> t1

Pearson's product-moment correlation

data: bike$atemp[bike$year == 2011] and bike$count[bike$year == 2011]


t = 46.46, df = 8643, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.4300004 0.4637388
sample estimates:
cor
0.4470285

>
> t2 <- cor.test(bike$atemp[bike$year == 2012],
+ bike$count[bike$year == 2012])
> t2

Pearson's product-moment correlation

data: bike$atemp[bike$year == 2012] and bike$count[bike$year == 2012]


t = 40.346, df = 8732, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.3785679 0.4139248
sample estimates:
cor
0.3963933

>
> # plot the results in a scatterplot with regression lines
>
> # blank plot
> plot(x = 1,
+ xlab = "Temperature",
+ ylab = "Number of Rents",
+ xlim = c(-25,50),
+ ylim = c(0,1000),
+ main = "Temperature vs. Count")
>
> # draw points for 2011 year
> points(x = bike$atemp[bike$year == 2011],
+ y = bike$count[bike$year == 2011],
+ pch = 16,
+ col = "red",
+ cex = 0.5
+ )
> # draw points for 2012 year
> points(x = bike$atemp[bike$year == 2012],
+ y = bike$count[bike$year == 2012],
+ pch = 16,
+ col = "darkgreen",
+ cex = 0.5
+ )
>
> # add regression lines for two ears
> abline(lm(count~atemp, bike, subset = year == 2011),
+ col = "darkgreen",
+ lwd = 3)
>
> abline(lm(count~atemp, bike, subset = year == 2012),
+ col = "red",
+ lwd = 3)
>
> # add legend
> legend("topleft",
+ legend = c(2011, 2012),
+ col = c("darkgreen","red"),
+ pch = c(16, 16),
+ bg = "white",
+ cex = 1
+ )
>
> # create histograms for each weather condition to explore distribution
of the bike rentals by
> # weather condition
>
> # create a vector for histograms titles
> vec <- c("Clear Weather", "Cloudy Weather", "Rainy Weather",
"Thunderstorm Weather")
>
> # parameters for plots combining
> par(mfrow = c(1, 1))
>
> # create 4 histograms with a loop
> for (i in c(1:4)){
+ name.i <- vec[i]
+ hist(bike$count[bike$weathersit == i],
+ main = name.i,
+ xlab = "Number of Rents",
+ ylab = "Frequency",
+ breaks = 10,
+ col = "yellow3",
+ border = "black")
+
+ # the line indicating median value
+ abline(v = median(bike$count[bike$weathersit == i]),
+ col = "black",
+ lwd = 3,
+ lty = 2)
+
+ # the line indicating mean value
+ abline(v = mean(bike$count[bike$weathersit == i]),
+ col = "blue",
+ lwd = 3,
+ lty = 2)
+ }
>
> # Bike rents by type of the day
> beanplot(count ~ holiday,
+ data = bike,
+ main = "Bike Rents by Type of a Day",
+ xlab = "Type of Day",
+ ylab = "Number of rents",
+ col = bean.cols,
+ lwd = 1,
+ what = c(1,1,1,0),
+ log = ""
+ )
>
> # Hourly trend of count
> boxplot(bike$count ~ bike$hr, xlab="Hour", ylab="Count of users",
main="Hourly Trend of Count")
>
> # Hourly trend of casual users
> boxplot(bike$casual ~ bike$hr, xlab="Hour", ylab="Casual users",
main="Hourly Trend of Casual users")
>
> # Hourly trend of registered users
> boxplot(bike$registered ~ bike$hr, xlab="Hour", ylab="Registered users",
main="Hourly Trend of Registered users")
>
>
> #Checking dataset
> #Tests if values in a vector are integers
> is.integer(bike)
[1] FALSE
>
> #Tests if values in a vector are NA or NULL
> #is.na(day) we tested it but due to the huge output we deleted it. There
was no "NA".
> is.null(bike)
[1] FALSE
>
> #The Dataset is already recoded and correct.
> #For this question we converted "atemp" because it was devided of 50.
> bike$raw.temp <- (bike$temp*41)
> bike$raw.atemp <-(bike$atemp * 50)
>
> #Create a new column of the mean of raw.temp and raw.atemp.
> bike$raw.mean.temp.atemp <- (bike$raw.temp + bike$raw.atemp)/2
> head(bike)
instant dteday season year month hr holiday weekday workingday
weathersit temp atemp
1 1 2011-01-01 1 2011 1 0 0 6 0
1 3.28 3.0014
2 2 2011-01-01 1 2011 1 1 0 6 0
1 2.34 1.9982
3 3 2011-01-01 1 2011 1 2 0 6 0
1 2.34 1.9982
4 4 2011-01-01 1 2011 1 3 0 6 0
1 3.28 3.0014
5 5 2011-01-01 1 2011 1 4 0 6 0
1 3.28 3.0014
6 6 2011-01-01 1 2011 1 5 0 6 0
2 3.28 1.0016
humidity windspeed casual registered count raw.temp raw.atemp
raw.mean.temp.atemp
1 0.81 0.0000 3 13 16 134.48 150.07
142.275
2 0.80 0.0000 8 32 40 95.94 99.91
97.925
3 0.80 0.0000 5 27 32 95.94 99.91
97.925
4 0.75 0.0000 3 10 13 134.48 150.07
142.275
5 0.75 0.0000 0 1 1 134.48 150.07
142.275
6 0.75 0.0896 0 1 1 134.48 50.08
92.280
>
> Temperature <- bike$raw.temp
>
> #Correlation between raw.temp and the total count of bike rentals.
>
> #cor.temp <- cor.test(x = as.numeric(as.factor(bike$raw.temp)), y =
as.numeric(as.factor(bike$countt)))
>
> #cor.temp
>
>
>
>
> #Correlation between atemp and the total count of bike rentals.
> #cor.atemp <- cor.test(x = bike$raw.atemp,y = bike$cnt)
> #cor.atemp
>
> Feeled.Temperature <- bike$raw.atemp
>
>
> Feeled.Raw.Temperature <- bike$raw.mean.temp.atemp
> Amount.Rentals <- bike$cnt
>
> par(mfrow=c(2,2))
>
> plot(x = Temperature, y = Amount.Rentals, main = "Correlation", col =
"red")
> abline(lm(Amount.Rentals ~ Temperature), col = "blue")
Error in model.frame.default(formula = Amount.Rentals ~ Temperature,
drop.unused.levels = TRUE) :
invalid type (NULL) for variable 'Amount.Rentals'
> legend("topleft", legend = paste("cor = ", round(cor(Temperature,
Amount.Rentals, method="kendall"), 2), sep = ""),lty = 1, col = "blue")
Error in cor(Temperature, Amount.Rentals, method = "kendall") :
supply both 'x' and 'y' or a matrix-like 'x'
>
> plot(x = Feeled.Temperature, y = Amount.Rentals, main = "Correlation",
col = "blue")
> abline(lm(Amount.Rentals ~ Feeled.Temperature), col = "red")
Error in model.frame.default(formula = Amount.Rentals ~
Feeled.Temperature, :
invalid type (NULL) for variable 'Amount.Rentals'
> legend("topleft", legend = paste("cor = ", round(cor(Feeled.Temperature,
Amount.Rentals), 2), sep = ""),lty = 1, col = "red")
Error in cor(Feeled.Temperature, Amount.Rentals) :
supply both 'x' and 'y' or a matrix-like 'x'
>
> plot(x = Feeled.Raw.Temperature, y = Amount.Rentals, main =
"Correlation", col = "green")
> abline(lm(Amount.Rentals ~ Feeled.Raw.Temperature), col = "orange")
Error in model.frame.default(formula = Amount.Rentals ~
Feeled.Raw.Temperature, :
invalid type (NULL) for variable 'Amount.Rentals'
> legend("topleft", legend = paste("cor = ", round(cor(Temperature,
Amount.Rentals), 2), sep = ""),lty = 1, col = "orange")
Error in cor(Temperature, Amount.Rentals) :
supply both 'x' and 'y' or a matrix-like 'x'
>
>
> plot(x = 1, y = 1, xlab = "Temperature", ylab = "Amount of rentals",
xlim = c(0, 40), ylim = c(0, 10000), main = "Three correlations combined")
>
> points(Feeled.Raw.Temperature, Amount.Rentals, pch = 8, col = "green")
> points(Temperature, Amount.Rentals, pch = 8, col = "red")
> points(Feeled.Temperature, Amount.Rentals, pch = 8, col = "blue")
>
>
>
> #A Two-sample-t-test for real temperature and feeled temperature.
>
> hist(bike$raw.temp, yaxt = "n", xaxt = "n", xlab = "",
+ ylab = "", main = "Two Sample t-test", xlim = c(5, 40), col =
rgb(0, 0, 1, alpha = .1))
> text(x = 13, y = 140, paste("Mean real Temp.\n",
round(mean(bike$raw.temp), 2), sep = ""), col = "blue")
> abline(v = mean(bike$raw.temp), lty = 1,
+ col = rgb(0, 0, 1, alpha = 1), lwd = 4)
>
> par(new = T)
> hist(bike$raw.atemp, yaxt = "n", xaxt = "n", xlab = "",
+ ylab = "", main = "", xlim = c(5, 40), col = rgb(1, 0, 0, alpha =
.1))
>
> abline(v = mean(bike$raw.atemp), lty = 1,
+ col = rgb(1, 0, 0, alpha = 1), lwd = 4)
> text(x= 32, y = 131, paste("Mean feeled Temp.\n",
round(mean(bike$raw.atemp), 2), sep = ""), col = "red")
>
> mtext(text = "Alternative Hypothesis is confirmed true difference in
means is not equal to 0", line = 0, side = 3)
>
>
> # two-sample t-test for real temperature and feeled temperature in
spring.
>
> temp.spring <- subset(bike, subset = season == "1")$raw.temp
> atemp.spring <- subset(bike, subset = season == "1")$raw.atemp
> test.result.spring <- t.test(x = temp.spring, y = atemp.spring,
alternative = "two.sided")
> test.result.spring
Welch Two Sample t-test

data: temp.spring and atemp.spring


t = 9.5415, df = 6985.7, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
51.38445 77.95791
sample estimates:
mean of x mean of y
248.4556 183.7844

>
> # two-sample t-test for real temperature and feeled temperature in
summer.
>
> temp.summer <- subset(bike, subset = season == "2")$raw.temp
> atemp.summer <- subset(bike, subset = season == "2")$raw.atemp
> test.result.summer <- t.test(x = temp.summer, y = atemp.summer,
alternative = "two.sided")
> test.result.summer

Welch Two Sample t-test

data: temp.summer and atemp.summer


t = -26.724, df = 7629.1, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-210.6338 -181.8441
sample estimates:
mean of x mean of y
721.5660 917.8049

>
> # two-sample t-test for real temperature and feeled temperature in fall.
>
> temp.fall <- subset(bike, subset = season == "3")$raw.temp
> atemp.fall <- subset(bike, subset = season == "3")$raw.atemp
> test.result.fall <- t.test(x = temp.fall, y = atemp.fall, alternative =
"two.sided")
> test.result.fall

Welch Two Sample t-test

data: temp.fall and atemp.fall


t = -64.182, df = 7452.5, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-341.6860 -321.4325
sample estimates:
mean of x mean of y
1033.252 1364.812

>
> # two-sample t-test for real temperature and feeled temperature in
winter.
>
> temp.winter <- subset(bike, subset = season == "4")$raw.temp
> atemp.winter <- subset(bike, subset = season == "4")$raw.atemp
> test.result.winter <- t.test(x = temp.winter, y = atemp.winter,
alternative = "two.sided")
> test.result.winter

Welch Two Sample t-test

data: temp.winter and atemp.winter


t = -12.767, df = 7280.7, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-97.52962 -71.56560
sample estimates:
mean of x mean of y
487.3869 571.9345

>
>
> # Plotting the association:
> plot(x = 1, y = 1, xlab = "Temperature in Celcius", ylab = "Bike
rentals", type = "n", main = "Association between temperature and bike
rentals",
+ xlim = c(0, 40), ylim = c(0, 90))
>
>
> #Calculating min and max for the x-axis and y-axis:
> min(bike$raw.temp)
[1] -289.46
>
> #Adding points to the plot
> day$raw.temp <- (bike$temp*41)
Error in day$raw.temp <- (bike$temp * 41) : object 'day' not found
> points(bike$raw.temp, bike$casual, pch = 16, col = "red")
> points(bike$raw.temp, bike$registered, pch = 16, col = "skyblue")
>
> # Adding a legend to the plot
> legend("topleft",legend = c("casual", "registered"), col =
c("red","skyblue"), pch = c(16, 16), bg = "white")
>
>
>
> # Calculating the correlation between raw.temp and registered users and
between raw.temp and causal users
> cor.reg <- cor.test(x = bike$raw.temp, y = bike$registered)
> cor.reg

Pearson's product-moment correlation

data: bike$raw.temp and bike$registered


t = 46.925, df = 17377, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.3220992 0.3484909
sample estimates:
cor
0.3353608

>
> cor.cas <- cor.test(x = bike$raw.temp,
+ y = bike$casual)
> cor.cas

Pearson's product-moment correlation

data: bike$raw.temp and bike$casual


t = 68.22, df = 17377, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.4478081 0.4712629
sample estimates:
cor
0.4596156

>
> # Adding Correlation line and the correlation value to the plot
> abline(lm(bike$registered ~ bike$raw.temp), lty = 6, col = "blue")
>
> abline(lm(bike$casual ~ bike$raw.temp), lty = 6, col = "orange")
>
> reg <- paste("cor = ", round(cor(bike$registered, bike$raw.temp), 2),
sep = "")
> cas <- paste("cor = ", round(cor(bike$casual, bike$raw.temp), 2), sep =
"")
>
> legend("left",legend = c(cas, reg) , col = c('orange', 'blue'),pch =
c(16, 16), bg = "white")
>
> #Can the number of total bike rentals be predicted by holiday and
weather?
> lookup <- data.frame("numbers"=c("1","2","3","4"),
+ "weather"=c("nice","cloudy", "wet", "lousy")
+ )
>
> bike <- merge(x= bike,
+ y= lookup,
+ by.x="weathersit",
+ by.y="numbers"
+ )
>
> head(bike)
weathersit instant dteday season year month hr holiday weekday
workingday temp atemp
1 1 1 2011-01-01 1 2011 1 0 0 6
0 3.28 3.0014
2 1 2 2011-01-01 1 2011 1 1 0 6
0 2.34 1.9982
3 1 3 2011-01-01 1 2011 1 2 0 6
0 2.34 1.9982
4 1 4 2011-01-01 1 2011 1 3 0 6
0 3.28 3.0014
5 1 5 2011-01-01 1 2011 1 4 0 6
0 3.28 3.0014
6 1 3286 2011-05-21 2 2011 5 19 0 6
0 23.96 26.0024
humidity windspeed casual registered count raw.temp raw.atemp
raw.mean.temp.atemp weather
1 0.81 0.0000 3 13 16 134.48 150.07
142.275 nice
2 0.80 0.0000 8 32 40 95.94 99.91
97.925 nice
3 0.80 0.0000 5 27 32 95.94 99.91
97.925 nice
4 0.75 0.0000 3 10 13 134.48 150.07
142.275 nice
5 0.75 0.0000 0 1 1 134.48 150.07
142.275 nice
6 0.47 0.1642 117 184 301 982.36 1300.12
1141.240 nice
>
> #Using linear regression
>
> total.rentals.lm <- lm(bike$count ~ holiday + weather, data = bike)
> summary (total.rentals.lm)
Call:
lm(formula = bike$count ~ holiday + weather, data = bike)

Residuals:
Min 1Q Median 3Q Max
-204.97 -141.21 -44.97 89.03 780.72

Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 176.276 2.672 65.976 < 2e-16 ***
holiday -36.836 8.141 -4.525 6.08e-06 ***
weatherlousy -101.943 103.578 -0.984 0.325
weathernice 29.694 3.146 9.439 < 2e-16 ***
weatherwet -64.126 5.455 -11.755 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 179.3 on 17374 degrees of freedom


Multiple R-squared: 0.02264, Adjusted R-squared: 0.02241
F-statistic: 100.6 on 4 and 17374 DF, p-value: < 2.2e-16

>
> # Using a anova
> anv.weather <- anova (total.rentals.lm)
> anv.weather
Analysis of Variance Table

Response: bike$count
Df Sum Sq Mean Sq F value Pr(>F)
holiday 1 546889 546889 17.003 3.749e-05 ***
weather 3 12396684 4132228 128.474 < 2.2e-16 ***
Residuals 17374 558818018 32164
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
>
> #The F-values (holiday, weather, residuals) are:
> anv.weather$`F value`
[1] 17.00312 128.47354 NA
>
> #The p-values (holiday, weather, residuals) are:
> anv.weather$`Pr(>F)`
[1] 3.749198e-05 2.588619e-82 NA
>
> #Using Tukey-Test
> weather.aov <- aov(bike$count ~ weather, data = bike)
> summary(weather.aov)
Df Sum Sq Mean Sq F value Pr(>F)
weather 3 12285030 4095010 127.2 <2e-16 ***
Residuals 17375 559476561 32200
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
>
> TukeyHSD(weather.aov)
Tukey multiple comparisons of means
95% family-wise confidence level

Fit: aov(formula = bike$count ~ weather, data = bike)

$`weather`
diff lwr upr p adj
lousy-cloudy -100.83216 -367.10249 165.43817 0.7648878
nice-cloudy 29.70378 21.61661 37.79095 0.0000000
wet-cloudy -63.58621 -77.60667 -49.56576 0.0000000
nice-lousy 130.53594 -135.68152 396.75339 0.5886467
wet-lousy 37.24595 -229.21775 303.70965 0.9841444
wet-nice -93.28999 -106.26764 -80.31234 0.0000000

>
> # Months is coded as 1 to 12
> # Converting month with "merge"
>
> lookup.month<- data.frame("month" = c(1:12),
+ "mnth.name" = c("01Jan", "02Feb", "03March",
"04April", "05May", "06June", "07July", "08Aug", "09Sept", "10Oct",
"11Nov", "12Dec"), stringsAsFactors = FALSE)
>
> bike <- merge(x=bike, y= lookup.month, by = 'month')
>
>
> # Convert the nomalized windspeed and humidity
> bike$raw.windspeed <- (bike$windspeed*67)
> bike$raw.hum <- (bike$hum * 100)
> head(bike)
month weathersit instant dteday season year hr holiday weekday
workingday temp atemp
1 1 1 1 2011-01-01 1 2011 0 0 6
0 3.28 3.0014
2 1 1 2 2011-01-01 1 2011 1 0 6
0 2.34 1.9982
3 1 1 3 2011-01-01 1 2011 2 0 6
0 2.34 1.9982
4 1 1 4 2011-01-01 1 2011 3 0 6
0 3.28 3.0014
5 1 1 5 2011-01-01 1 2011 4 0 6
0 3.28 3.0014
6 1 2 6 2011-01-01 1 2011 5 0 6
0 3.28 1.0016
humidity windspeed casual registered count raw.temp raw.atemp
raw.mean.temp.atemp weather
1 0.81 0.0000 3 13 16 134.48 150.07
142.275 nice
2 0.80 0.0000 8 32 40 95.94 99.91
97.925 nice
3 0.80 0.0000 5 27 32 95.94 99.91
97.925 nice
4 0.75 0.0000 3 10 13 134.48 150.07
142.275 nice
5 0.75 0.0000 0 1 1 134.48 150.07
142.275 nice
6 0.75 0.0896 0 1 1 134.48 50.08
92.280 cloudy
mnth.name raw.windspeed raw.hum
1 01Jan 0.0000 81
2 01Jan 0.0000 80
3 01Jan 0.0000 80
4 01Jan 0.0000 75
5 01Jan 0.0000 75
6 01Jan 6.0032 75
>
> #descreptive statistics with the dplyr-functions:
> require(dplyr)
> month.agg <- bike %>% group_by(mnth.name) %>% summarise(
+ mean.temp = mean(raw.temp),
+ mean.hum = mean(raw.hum),
+ mean.windspeed = mean(raw.windspeed),
+ mean.rentals = mean(bike$count))
>
> month.agg
# A tibble: 12 x 5
mnth.name mean.temp mean.hum mean.windspeed mean.rentals
<chr> <dbl> <dbl> <dbl> <dbl>
1 01Jan 130. 58.1 13.9 189.
2 02Feb 251. 56.7 14.5 189.
3 03March 425. 58.9 14.9 189.
4 04April 578. 58.8 15.7 189.
5 05May 818. 68.9 12.3 189.
6 06June 990. 57.6 12.4 189.
7 07July 1128. 59.8 11.1 189.
8 08Aug 1038. 63.7 11.5 189.
9 09Sept 860. 71.4 11.1 189.
10 10Oct 611. 68.9 11.5 189.
11 11Nov 383. 62.5 12.3 189.
12 12Dec 297. 66.6 11.8 189.
>
> par(mfrow=c(2,2))
> barplot(height = month.agg$mean.rentals,
+ names.arg = month.agg$mnth.name ,col = "red", main = "Mean
rentals" )
>
> barplot(height = month.agg$mean.windspeed,
+ names.arg = month.agg$mnth.name,col = "blue", main = "Mean
Windspeed (km/h)" )
>
> barplot(height = month.agg$mean.hum,
+ names.arg = month.agg$mnth.name,col = "green", main = "Mean
Humidity" )
>
>
> barplot(height = month.agg$mean.temp,
+ names.arg = month.agg$mnth.name,col = "skyblue", main = "Mean
Temperature" )
>
>
>
> # Compute correlations
>
sub=data.frame(bike$registered,bike$casual,bike$count,bike$temp,bike$humid
ity,bike$atemp,bike$windspeed)
> cor(sub)
bike.registered bike.casual bike.count bike.temp
bike.humidity bike.atemp
bike.registered 1.00000000 0.50661770 0.97215073 0.33536085 -
0.27393312 0.33255864
bike.casual 0.50661770 1.00000000 0.69456408 0.45961565 -
0.34702809 0.45408007
bike.count 0.97215073 0.69456408 1.00000000 0.40477228 -
0.32291074 0.40092930
bike.temp 0.33536085 0.45961565 0.40477228 1.00000000 -
0.06988139 0.98767214
bike.humidity -0.27393312 -0.34702809 -0.32291074 -0.06988139
1.00000000 -0.05191770
bike.atemp 0.33255864 0.45408007 0.40092930 0.98767214 -
0.05191770 1.00000000
bike.windspeed 0.08232085 0.09028678 0.09323378 -0.02312526 -
0.29010490 -0.06233604
bike.windspeed
bike.registered 0.08232085
bike.casual 0.09028678
bike.count 0.09323378
bike.temp -0.02312526
bike.humidity -0.29010490
bike.atemp -0.06233604
bike.windspeed 1.00000000
>
> # hour buckets for registered users
> d1 = rpart(registered~hr,data=bike)
> fancyRpartPlot(d1)
>
> # hour buckets for casual users
> d2 = rpart(casual~hr,data=bike)
> fancyRpartPlot(d2)
>
> # create bis for regular and causal hours based on rpart output
> bike$dp_reg=0
> bike$dp_reg[bike$hr<8]=1
> bike$dp_reg[bike$hr>=22]=2
> bike$dp_reg[bike$hr>9 & bike$hr<18]=3
> bike$dp_reg[bike$hr==8]=4
> bike$dp_reg[bike$hr==9]=5
> bike$dp_reg[bike$hr==20 | bike$hr==21]=6
> bike$dp_reg[bike$hr==19 | bike$hr==18]=7
>
> bike$dp_cas=0
> bike$dp_cas[bike$hour<=8]=1
> bike$dp_cas[bike$hour==9]=2
> bike$dp_cas[bike$hour>=10 & bike$hour<=19]=3
> bike$dp_cas[bike$hour>19]=4
>
> # temp buckets for registered users
> f1=rpart(registered~temp,data=bike)
> fancyRpartPlot(f1)
>
> # temp buckets for casual users
> f2=rpart(casual~temp,data=bike)
> fancyRpartPlot(f2)
>
> # create buckets for temp registered users
> bike$temp_reg=0
> bike$temp_reg[bike$temp<13]=1
> bike$temp_reg[bike$temp>=13 & bike$temp<23]=2
> bike$temp_reg[bike$temp>=23 & bike$temp<30]=3
> bike$temp_reg[bike$temp>=30]=4
>
> # create buckets for temp casual users
> bike$temp_cas=0
> bike$temp_cas[bike$temp<15]=1
> bike$temp_cas[bike$temp>=15 & bike$temp<23]=2
> bike$temp_cas[bike$temp>=23 & bike$temp<30]=3
> bike$temp_cas[bike$temp>=30]=4
>
> # create paritions for year
> bike$year_part[bike$year=='2011']=1
> bike$year_part[bike$year=='2011' & bike$month>3]=2
> bike$year_part[bike$year=='2011' & bike$month>6]=3
> bike$year_part[bike$year=='2011' & bike$month>9]=4
> bike$year_part[bike$year=='2012']=5
> bike$year_part[bike$year=='2012' & bike$month>3]=6
> bike$year_part[bike$year=='2012' & bike$month>6]=7
> bike$year_part[bike$year=='2012' & bike$month>9]=8
> table(bike$year_part)

1 2 3 4 5 6 7 8
2067 2183 2192 2203 2176 2182 2208 2168
>
> # creating another variable day_type which may affect our accuracy as
weekends
> # and weekdays are important in deciding rentals
> bike$day_type=0
> bike$day_type[bike$holiday==0 & bike$workingday==0]="weekend"
> bike$day_type[bike$holiday==1]="holiday"
> bike$day_type[bike$holiday==0 & bike$workingday==1]="working day"
>
> str(bike)
'data.frame': 17379 obs. of 30 variables:
$ month : int 1 1 1 1 1 1 1 1 1 1 ...
$ weathersit : int 1 1 1 1 1 2 1 1 1 1 ...
$ instant : int 1 2 3 4 5 6 7 8 9 10 ...
$ dteday : Factor w/ 731 levels "2011-01-01","2011-01-02",..:
1 1 1 1 1 1 1 1 1 1 ...
$ season : int 1 1 1 1 1 1 1 1 1 1 ...
$ year : num 2011 2011 2011 2011 2011 ...
$ hr : int 0 1 2 3 4 5 6 7 8 9 ...
$ holiday : int 0 0 0 0 0 0 0 0 0 0 ...
$ weekday : int 6 6 6 6 6 6 6 6 6 6 ...
$ workingday : int 0 0 0 0 0 0 0 0 0 0 ...
$ temp : num 3.28 2.34 2.34 3.28 3.28 3.28 2.34 1.4 3.28
7.04 ...
$ atemp : num 3 2 2 3 3 ...
$ humidity : num 0.81 0.8 0.8 0.75 0.75 0.75 0.8 0.86 0.75
0.76 ...
$ windspeed : num 0 0 0 0 0 0.0896 0 0 0 0 ...
$ casual : int 3 8 5 3 0 0 2 1 1 8 ...
$ registered : int 13 32 27 10 1 1 0 2 7 6 ...
$ count : int 16 40 32 13 1 1 2 3 8 14 ...
$ raw.temp : num 134.5 95.9 95.9 134.5 134.5 ...
$ raw.atemp : num 150.1 99.9 99.9 150.1 150.1 ...
$ raw.mean.temp.atemp: num 142.3 97.9 97.9 142.3 142.3 ...
$ weather : Factor w/ 4 levels "cloudy","lousy",..: 3 3 3 3 3
1 3 3 3 3 ...
$ mnth.name : chr "01Jan" "01Jan" "01Jan" "01Jan" ...
$ raw.windspeed : num 0 0 0 0 0 ...
$ raw.hum : num 81 80 80 75 75 75 80 86 75 76 ...
$ dp_reg : num 1 1 1 1 1 1 1 1 4 5 ...
$ dp_cas : num 0 0 0 0 0 0 0 0 0 0 ...
$ temp_reg : num 1 1 1 1 1 1 1 1 1 1 ...
$ temp_cas : num 1 1 1 1 1 1 1 1 1 1 ...
$ year_part : num 1 1 1 1 1 1 1 1 1 1 ...
$ day_type : chr "weekend" "weekend" "weekend" "weekend" ...
>
> # dividing total data depending on windspeed to impute/predict the
missing values
> table(bike$windspeed==0)

FALSE TRUE
15199 2180
> k=bike$windspeed==0
> wind_0=subset(bike,k)
> wind_1=subset(bike,!k)
>
> # predicting missing values in windspeed using a random forest model
> # this is a different approach to impute missing values rather than just
using the mean or median or some other statistic for imputation
> set.seed(415)
> str(wind_1)
'data.frame': 15199 obs. of 30 variables:
$ month : int 1 1 1 1 1 1 1 1 1 1 ...
$ weathersit : int 2 1 1 1 1 3 1 1 1 2 ...
$ instant : int 6 11 12 13 8760 9128 8762 8763 8764 632 ...
$ dteday : Factor w/ 731 levels "2011-01-01","2011-01-02",..:
1 1 1 1 370 386 370 370 370 29 ...
$ season : int 1 1 1 1 1 1 1 1 1 1 ...
$ year : num 2011 2011 2011 2011 2012 ...
$ hr : int 5 10 11 12 19 5 21 22 23 14 ...
$ holiday : int 0 0 0 0 0 0 0 0 0 0 ...
$ weekday : int 6 6 6 6 4 6 4 4 4 6 ...
$ workingday : int 0 0 0 0 1 0 1 1 1 0 ...
$ temp : num 3.28 9.86 8.92 11.74 7.98 ...
$ atemp : num 1 10 6 12 7 ...
$ humidity : num 0.75 0.76 0.81 0.77 0.34 0.86 0.6 0.6 0.7 0.6
...
$ windspeed : num 0.0896 0.2537 0.2836 0.2836 0.0896 ...
$ casual : int 0 12 26 29 10 0 9 1 4 10 ...
$ registered : int 1 24 30 55 255 2 88 70 56 89 ...
$ count : int 1 36 56 84 265 2 97 71 60 99 ...
$ raw.temp : num 134 404 366 481 327 ...
$ raw.atemp : num 50.1 499.9 299.9 599.9 350 ...
$ raw.mean.temp.atemp: num 92.3 452.1 332.8 540.6 338.6 ...
$ weather : Factor w/ 4 levels "cloudy","lousy",..: 1 3 3 3 3
4 3 3 3 1 ...
$ mnth.name : chr "01Jan" "01Jan" "01Jan" "01Jan" ...
$ raw.windspeed : num 6 17 19 19 6 ...
$ raw.hum : num 75 76 81 77 34 86 60 60 70 60 ...
$ dp_reg : num 1 3 3 3 7 1 6 2 2 3 ...
$ dp_cas : num 0 0 0 0 0 0 0 0 0 0 ...
$ temp_reg : num 1 1 1 1 1 1 1 1 1 1 ...
$ temp_cas : num 1 1 1 1 1 1 1 1 1 1 ...
$ year_part : num 1 1 1 1 5 5 5 5 5 1 ...
$ day_type : chr "weekend" "weekend" "weekend" "weekend" ...
> wind_1$season <- as.factor(wind_1$season)
> wind_0$season <- as.factor(wind_0$season)
> fit <- randomForest(windspeed ~
season+weathersit+humidity+month+temp+year+atemp,
data=wind_1,importance=TRUE, ntree=250)
> pred=predict(fit,wind_0)
> wind_0$windspeed=pred
>
> bike=rbind(wind_0,wind_1)
> bike$weekend=0
> bike$weekend[bike$day=="Sunday" | bike$day=="Saturday"]=1
> names(bike)
[1] "month" "weathersit" "instant"
"dteday"
[5] "season" "year" "hr"
"holiday"
[9] "weekday" "workingday" "temp"
"atemp"
[13] "humidity" "windspeed" "casual"
"registered"
[17] "count" "raw.temp" "raw.atemp"
"raw.mean.temp.atemp"
[21] "weather" "mnth.name" "raw.windspeed"
"raw.hum"
[25] "dp_reg" "dp_cas" "temp_reg"
"temp_cas"
[29] "year_part" "day_type" "weekend"
> bike <- select(bike, -c(weekend))