Anda di halaman 1 dari 37

Table of Contents

Pendahuluan
Packages
library(ISLR)
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse


1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ──────────────────────────────────────────
tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()

library(splines)
library(rsample)
library(dplyr)
library(ggplot2)
library(ggpubr)

Dataset
# Attach the Auto data frame to the search path so its columns can be
# referenced by bare name.
# NOTE(review): every model and plot call visible in this report passes the
# data explicitly (data = Auto, ggplot(Auto, ...), Auto$...), so attach()
# looks unnecessary, and it masks ggplot2::mpg (see the message below).
# Confirm nothing later in the document relies on it before removing it.
attach(Auto)

## The following object is masked from package:ggplot2:


##
## mpg

# Show the column types and the first values of each variable.
str(Auto)

## 'data.frame': 392 obs. of 9 variables:


## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : num 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : num 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : num 3504 3693 3436 3433 3449 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : num 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : num 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36
231 14 161 141 54 223 241 2 ...
Dataset yang digunakan adalah data Auto dari package ISLR. Data ini terdiri dari 9 variabel
dan 392 observasi. Dengan penjelasan untuk masing-masing variabel dalam data sebagai
berikut :
• mpg: miles per gallon

• cylinders: Number of cylinders between 4 and 8

• displacement: Engine displacement (cu. inches)

• horsepower: Engine horsepower

• weight: Vehicle weight (lbs.)

• acceleration: Time to accelerate from 0 to 60 mph (sec.)

• year: Model year (modulo 100)

• origin: Origin of car (1. American, 2. European, 3. Japanese)

• name: Vehicle name

Bagian 1
Pada bagian ini akan digunakan variabel mpg dan horsepower.
Jika data tersebut digambarkan dalam bentuk scatterplot maka akan diperoleh sebagai
berikut
# Scatter plot of mpg against horsepower, points coloured by origin, with one
# least-squares line fitted to all observations.
# The stray `group = origin` that was passed to ggplot() outside of aes() has
# been dropped: it was not part of any aesthetic mapping and had no effect.
ggplot(Auto, aes(x = horsepower, y = mpg)) +
  geom_point(aes(color = factor(origin))) +
  stat_smooth(
    method = "lm", formula = y ~ x,
    lty = 1, col = "black", se = FALSE
  ) +
  theme_bw() +
  xlab("Horse Power") +
  ylab("mile per gallon")

Berdasarkan hasil tersebut, terlihat bahwa kedua variabel tersebut memiliki pola
hubungan yang tidak linear.
Selanjutnya akan dilakukan cross validation untuk menghasilkan pemodelan mpg vs
horsepower optimal menggunakan metode regresi polinomial, fungsi tangga, dan regresi
natural spline.

Regresi Polinomial
Akan dilakukan cross validation untuk mengetahui regresi polinomial derajat berapa yang
menghasilkan model terbaik.
# 10-fold cross-validation (stratified on mpg) to choose the polynomial
# degree for mpg ~ horsepower on the full Auto data.
set.seed(7)
cross_val <- vfold_cv(Auto, v = 10, strata = "mpg")

degree <- 1:4

polinomial <- map_dfr(degree, function(i) {
  # One row of hold-out RMSE/MAE per fold for degree `i`.
  metric_polinom <- map_dfr(cross_val$splits, function(x) {
    train <- Auto[x$in_id, ]
    test <- Auto[-x$in_id, ]
    # The degree is passed by its real argument name. The previous
    # `derajat = i` is not a parameter of poly(); it only worked because
    # poly() treats a single length-one extra argument as the degree.
    mod <- lm(mpg ~ poly(horsepower, degree = i), data = train)
    pred <- predict(mod, newdata = test)
    c(
      RMSE = mlr3measures::rmse(truth = test$mpg, response = pred),
      MAE = mlr3measures::mae(truth = test$mpg, response = pred)
    )
  })

  # Average the metrics over the 10 folds.
  colMeans(metric_polinom)
})

polinomial <- cbind(DERAJAT = degree, polinomial)

data.frame(polinomial)

## DERAJAT RMSE MAE


## 1 1 4.890259 3.839273
## 2 2 4.341205 3.269186
## 3 3 4.348653 3.270510
## 4 4 4.350900 3.282369

# Best degree according to RMSE
slice_min(polinomial, RMSE)

## DERAJAT RMSE MAE


## 1 2 4.341205 3.269186

# Best degree according to MAE (kept for the final model comparison)
best_p <- slice_min(polinomial, MAE)
best_p

## DERAJAT RMSE MAE


## 1 2 4.341205 3.269186

Berdasarkan hasil tersebut, diperoleh bahwa berdasarkan nilai RMSE dan MAE,
regresi polinomial berderajat 2 merupakan yang terbaik karena memiliki nilai terkecil.

Regresi Polinomial Derajat 2


# Refit the CV-selected quadratic polynomial on all observations.
# `raw = T` is left exactly as written so the printed Call and coefficient
# labels stay the same.
mod_polinom2 <- lm(
  formula = mpg ~ poly(horsepower, 2, raw = T),
  data = Auto
)
summary(mod_polinom2)
##
## Call:
## lm(formula = mpg ~ poly(horsepower, 2, raw = T), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.7135 -2.5943 -0.0859 2.2868 15.8961
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 56.9000997 1.8004268 31.60 <2e-16 ***
## poly(horsepower, 2, raw = T)1 -0.4661896 0.0311246 -14.98 <2e-16 ***
## poly(horsepower, 2, raw = T)2 0.0012305 0.0001221 10.08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.374 on 389 degrees of freedom
## Multiple R-squared: 0.6876, Adjusted R-squared: 0.686
## F-statistic: 428 on 2 and 389 DF, p-value: < 2.2e-16

# Observations with the fitted quadratic curve overlaid. The y-axis label is
# written on one line so that it no longer contains an embedded line break.
ggplot(Auto, aes(x = horsepower, y = mpg)) +
  geom_point(alpha = 0.55, color = "purple") +
  stat_smooth(
    method = "lm", formula = y ~ poly(x, 2, raw = TRUE),
    lty = 1, col = "black", se = FALSE
  ) +
  theme_bw() +
  ggtitle("Regresi Polinomial Derajat 2") +
  xlab("Horse Power") +
  ylab("mile per gallon")

Fungsi Tangga
Akan dilakukan cross validation untuk mengetahui fungsi tangga dengan knots berapa
yang menghasilkan model terbaik.
# 10-fold cross-validation (stratified on mpg) over the number of intervals
# used to cut horsepower into a step function, on the full Auto data.
set.seed(258)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")

# Candidate numbers of intervals passed to cut().
breaks <- 3:10

fungsitangga <- map_dfr(breaks, function(i){


# One row of hold-out RMSE/MAE per fold for `i` intervals.
metric_tangga <- map_dfr(cross_val$splits,
function(x){
training <- Auto[x$in_id,]
# Replace horsepower in the training fold by its interval factor.
training$horsepower <-
cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower, data=training)
# Recover the interval bounds by parsing the factor labels "(lower,upper]".
# NOTE(review): cut() labels are rounded for display, so the recovered bounds
# may differ slightly from the breaks actually used on the training fold.
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric(
sub("\\((.+),.*","\\1", labs_x) ),
upper = as.numeric(
sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
testing <- Auto[-x$in_id,]
# Bin the held-out horsepower with the bounds recovered from training.
horsepower_new <-
cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,
newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
# Rows with a missing value are dropped before scoring.
# NOTE(review): held-out values outside the recovered bounds become NA in
# cut(), so they are silently excluded from RMSE/MAE here.
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth =
data_eval$truth,response = data_eval$pred)
mae <- mlr3measures::mae(truth =
data_eval$truth,response = data_eval$pred)
metric <- c(rmse,mae)
names(metric) <- c("RMSE","MAE")
return(metric)
}
)

# Bare expression: it is neither returned nor assigned at this point.
metric_tangga

# average the metrics over the 10 folds


mean_metric_tangga <- colMeans(metric_tangga)

mean_metric_tangga
})
# Add the number of intervals as the first column and print the table.
fungsitangga <- cbind(breaks=breaks,fungsitangga)
data.frame(fungsitangga)

## breaks RMSE MAE


## 1 3 5.737872 4.515990
## 2 4 4.975666 3.789383
## 3 5 4.672683 3.568542
## 4 6 4.650409 3.563403
## 5 7 4.505560 3.383609
## 6 8 4.412809 3.380792
## 7 9 4.540796 3.489276
## 8 10 4.559803 3.449358

# Best number of intervals according to RMSE
slice_min(fungsitangga, RMSE)

## breaks RMSE MAE


## 1 8 4.412809 3.380792

# Best number of intervals according to MAE (kept for the final comparison)
best_t <- slice_min(fungsitangga, MAE)
best_t

## breaks RMSE MAE


## 1 8 4.412809 3.380792
Diperoleh hasil bahwa, berdasarkan nilai RMSE dan MAE, fungsi tangga terbaik dengan
breaks 8 (knots = 7)

Fungsi Tangga dengan knots 7


# Step function with 7 knots: horsepower is cut into 8 intervals.
mod_tangga7 <- lm(
  formula = mpg ~ cut(horsepower, 8),
  data = Auto
)
summary(mod_tangga7)

##
## Call:
## lm(formula = mpg ~ cut(horsepower, 8), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.9471 -2.6757 -0.1533 2.4015 14.5529
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.9085 0.5771 58.76 <2e-16 ***
## cut(horsepower, 8)(69,92] -6.9614 0.6910 -10.07 <2e-16 ***
## cut(horsepower, 8)(92,115] -12.7551 0.7425 -17.18 <2e-16 ***
## cut(horsepower, 8)(115,138] -15.6656 1.1264 -13.91 <2e-16 ***
## cut(horsepower, 8)(138,161] -18.7799 0.8568 -21.92 <2e-16 ***
## cut(horsepower, 8)(161,184] -19.9735 1.1470 -17.41 <2e-16 ***
## cut(horsepower, 8)(184,207] -21.1228 1.7720 -11.92 <2e-16 ***
## cut(horsepower, 8)(207,230] -21.0085 1.5159 -13.86 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.433 on 384 degrees of freedom
## Multiple R-squared: 0.6832, Adjusted R-squared: 0.6774
## F-statistic: 118.3 on 7 and 384 DF, p-value: < 2.2e-16

# Observations with the fitted 8-interval step function overlaid.
# Each `+` now ends a line (a line that starts with `+` after a complete
# expression is not part of the plot), and the x-axis label no longer
# contains an embedded line break.
ggplot(Auto, aes(x = horsepower, y = mpg)) +
  geom_point(alpha = 0.55, color = "cyan") +
  stat_smooth(
    method = "lm", formula = y ~ cut(x, 8),
    lty = 1, col = "black", se = FALSE
  ) +
  theme_bw() +
  ggtitle("Fungsi Tangga Dengan Knots 7") +
  xlab("Horse Power") +
  ylab("mile per gallon")

Regresi Natural Spline


Akan dilakukan cross validation untuk mengetahui regresi natural spline berapa yang
menghasilkan model terbaik.
# 10-fold cross-validation (stratified on mpg) over the degrees of freedom of
# a natural spline in horsepower, on the full Auto data.
set.seed(127)
cross_val <- vfold_cv(Auto, v = 10, strata = "mpg")

df <- 2:6

naturalspline <- map_dfr(df, function(i) {
  # One row of hold-out RMSE/MAE per fold for `i` degrees of freedom.
  metric_spline <- map_dfr(cross_val$splits, function(x) {
    idx <- x$in_id
    held_out <- Auto[-idx, ]
    fit <- lm(mpg ~ ns(horsepower, df = i), data = Auto[idx, ])
    fitted_mpg <- predict(fit, newdata = held_out)
    c(
      RMSE = mlr3measures::rmse(truth = held_out$mpg, response = fitted_mpg),
      MAE = mlr3measures::mae(truth = held_out$mpg, response = fitted_mpg)
    )
  })

  # Mean of each metric across the 10 folds
  colMeans(metric_spline)
})

naturalspline <- cbind(df = df, naturalspline)

data.frame(naturalspline)

## df RMSE MAE
## 1 2 4.343284 3.268135
## 2 3 4.375896 3.289081
## 3 4 4.343955 3.282771
## 4 5 4.331450 3.273342
## 5 6 4.326606 3.260358

# Best df according to RMSE
slice_min(naturalspline, RMSE)

## df RMSE MAE
## 1 6 4.326606 3.260358

# Best df according to MAE (kept for the final model comparison)
best_s <- slice_min(naturalspline, MAE)
best_s

## df RMSE MAE
## 1 6 4.326606 3.260358

Diperoleh hasil bahwa, berdasarkan nilai RMSE dan MAE regresi natural spline terbaik
dengan df = 6.
Regresi Natural Spline Dengan df = 6
Berikut adalah knots pada regresi natural spline dengan df = 6.
# Interior knots that ns() chooses for df = 6 on the full data
attr(ns(x = Auto[["horsepower"]], df = 6), which = "knots")

## 16.66667% 33.33333% 50% 66.66667% 83.33333%


## 70.0 84.0 93.5 110.0 150.0

diperoleh nilai knots yang ditentukan oleh komputer.


# Fit the natural spline selected by cross-validation (df = 6, i.e. the five
# interior knots listed above).
# ns() is used so that the fitted model matches the ns() basis used in the
# cross-validation loop and in the plot. The previous call used bs(), which
# fits an ordinary cubic B-spline with 8 coefficients for these knots.
# NOTE(review): the summary printed after this chunk in the document came
# from the bs() fit and has to be regenerated.
mod_spline <- lm(
  mpg ~ ns(horsepower, knots = c(70, 84, 93.5, 110, 150)),
  data = Auto
)
summary(mod_spline)

##
## Call:
## lm(formula = mpg ~ bs(horsepower, knots = c(70, 84, 93.5, 110,
## 150)), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.4487 -2.4796 -0.1235 2.2310 15.0368
##
## Coefficients:
## Estimate Std. Error t
value
## (Intercept) 32.1314 2.1618
14.863
## bs(horsepower, knots = c(70, 84, 93.5, 110, 150))1 6.7079 4.2793
1.568
## bs(horsepower, knots = c(70, 84, 93.5, 110, 150))2 -0.5588 2.2767 -
0.245
## bs(horsepower, knots = c(70, 84, 93.5, 110, 150))3 -4.8617 2.5606 -
1.899
## bs(horsepower, knots = c(70, 84, 93.5, 110, 150))4 -10.7913 2.2789 -
4.735
## bs(horsepower, knots = c(70, 84, 93.5, 110, 150))5 -11.7298 2.6515 -
4.424
## bs(horsepower, knots = c(70, 84, 93.5, 110, 150))6 -20.6336 3.0711 -
6.719
## bs(horsepower, knots = c(70, 84, 93.5, 110, 150))7 -19.5128 3.7788 -
5.164
## bs(horsepower, knots = c(70, 84, 93.5, 110, 150))8 -18.5552 3.1736 -
5.847
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## bs(horsepower, knots = c(70, 84, 93.5, 110, 150))1 0.1178
## bs(horsepower, knots = c(70, 84, 93.5, 110, 150))2 0.8062
## bs(horsepower, knots = c(70, 84, 93.5, 110, 150))3 0.0584 .
## bs(horsepower, knots = c(70, 84, 93.5, 110, 150))4 3.08e-06 ***
## bs(horsepower, knots = c(70, 84, 93.5, 110, 150))5 1.27e-05 ***
## bs(horsepower, knots = c(70, 84, 93.5, 110, 150))6 6.66e-11 ***
## bs(horsepower, knots = c(70, 84, 93.5, 110, 150))7 3.90e-07 ***
## bs(horsepower, knots = c(70, 84, 93.5, 110, 150))8 1.07e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.297 on 383 degrees of freedom
## Multiple R-squared: 0.7031, Adjusted R-squared: 0.6969
## F-statistic: 113.4 on 8 and 383 DF, p-value: < 2.2e-16

# Observations with the fitted natural spline (df = 6) and dashed vertical
# lines at its interior knots.
# Each `+` now ends a line (a line that starts with `+` after a complete
# expression is not part of the plot), and the x-axis label no longer
# contains an embedded line break.
ggplot(Auto, aes(x = horsepower, y = mpg)) +
  geom_point(alpha = 0.55, color = "pink") +
  stat_smooth(
    method = "lm", formula = y ~ ns(x, df = 6),
    lty = 1, col = "black", se = FALSE
  ) +
  theme_bw() +
  ggtitle("Regresi Natural Spline Dengan df=6") +
  xlab("Horse Power") +
  ylab("mile per gallon") +
  geom_vline(
    xintercept = c(70, 84, 93.5, 110, 150),
    col = "magenta", lty = 2
  )

Perbandingan Ketiga Model


# Collect the cross-validated RMSE/MAE of the best model of each method
# (the first column, which holds the tuning parameter, is dropped).
a <- best_p %>% select(-1)
b <- best_t %>% select(-1)
c <- best_s %>% select(-1)
nilai_metric <- rbind(a, b, c)

# Each label is written on a single line; the third one previously contained
# embedded line breaks.
nama_model <- c(
  "Polinomial Derajat 2",
  "Fungsi Tangga (breaks=8)",
  "Regresi Natural Spline (df=6)"
)
perbandingan.metode <- data.frame(nama_model, nilai_metric)
perbandingan.metode

## nama_model RMSE MAE


## 1 Polinomial Derajat 2 4.341205 3.269186
## 2 Fungsi Tangga (breaks=8) 4.412809 3.380792
## 3 Regresi Natural Spline (df=6) 4.326606 3.260358

# Method with the smallest MAE
perbandingan.metode %>% slice_min(MAE)

## nama_model RMSE MAE


## 1 Regresi Natural Spline (df=6) 4.326606 3.260358

Setelah melihat perolehan nilai RMSE dan MAE dari ketiga metode yang digunakan, dapat
disimpulkan bahwa model Regresi Natural Spline dengan df = 6 adalah model yang terbaik
untuk memodelkan hubungan antara variabel MPG dan HORSEPOWER karena memiliki
nilai RMSE dan MAE terkecil.

Bagian 2
Akan dilakukan pengujian berdasarkan asal negaranya (Amerika, Eropa dan Jepang).

Berdasarkan Amerika
# Subset for American cars (origin == 1), keeping only the columns used below.
amerika <- filter(
  select(Auto, mpg, horsepower, origin),
  origin == 1
)
head(amerika)

## mpg horsepower origin


## 1 18 130 1
## 2 15 165 1
## 3 18 150 1
## 4 16 150 1
## 5 17 140 1
## 6 15 198 1

Selanjutnya akan dilakukan cross validation untuk menghasilkan pemodelan mpg vs


horsepower berdasarkan amerika yang optimal menggunakan metode regresi polinomial,
fungsi tangga, dan regresi natural spline.

Regresi Polinomial
Akan dilakukan cross validation untuk mengetahui regresi polinomial derajat berapa yang
menghasilkan model terbaik.
# 10-fold cross-validation (stratified on mpg) to choose the polynomial
# degree for mpg ~ horsepower on the American subset.
set.seed(7)
cross_val <- vfold_cv(amerika, v = 10, strata = "mpg")

degree <- 1:4

polinomial <- map_dfr(degree, function(i) {
  # One row of hold-out RMSE/MAE per fold for degree `i`.
  metric_polinom <- map_dfr(cross_val$splits, function(x) {
    train <- amerika[x$in_id, ]
    test <- amerika[-x$in_id, ]
    # The degree is passed by its real argument name. The previous
    # `derajat = i` is not a parameter of poly(); it only worked because
    # poly() treats a single length-one extra argument as the degree.
    mod <- lm(mpg ~ poly(horsepower, degree = i), data = train)
    pred <- predict(mod, newdata = test)
    c(
      RMSE = mlr3measures::rmse(truth = test$mpg, response = pred),
      MAE = mlr3measures::mae(truth = test$mpg, response = pred)
    )
  })

  # Average the metrics over the 10 folds.
  colMeans(metric_polinom)
})

polinomial <- cbind(DERAJAT = degree, polinomial)

data.frame(polinomial)

## DERAJAT RMSE MAE


## 1 1 4.245769 3.350503
## 2 2 3.805157 2.909116
## 3 3 3.817234 2.918556
## 4 4 3.838211 2.917433

# Best degree according to RMSE
slice_min(polinomial, RMSE)

## DERAJAT RMSE MAE


## 1 2 3.805157 2.909116

# Best degree according to MAE (kept for the final model comparison)
best_p_1 <- slice_min(polinomial, MAE)
best_p_1

## DERAJAT RMSE MAE


## 1 2 3.805157 2.909116

Berdasarkan hasil tersebut, diperoleh bahwa berdasarkan nilai RMSE dan MAE,
regresi polinomial berderajat 2 merupakan yang terbaik karena memiliki nilai terkecil.
Regresi Polinomial Derajat 2
# Refit the CV-selected quadratic polynomial on the American subset.
# `raw = T` is left exactly as written so the printed Call and coefficient
# labels stay the same.
mod_polinom2_a <- lm(
  formula = mpg ~ poly(horsepower, 2, raw = T),
  data = amerika
)
summary(mod_polinom2_a)

##
## Call:
## lm(formula = mpg ~ poly(horsepower, 2, raw = T), data = amerika)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.9497 -2.5745 -0.0895 2.1583 13.1866
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 51.9150173 2.3893530 21.73 < 2e-16 ***
## poly(horsepower, 2, raw = T)1 -0.4104369 0.0379990 -10.80 < 2e-16 ***
## poly(horsepower, 2, raw = T)2 0.0010776 0.0001398 7.71 3.26e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.822 on 242 degrees of freedom
## Multiple R-squared: 0.6507, Adjusted R-squared: 0.6478
## F-statistic: 225.4 on 2 and 242 DF, p-value: < 2.2e-16

# American subset with the fitted quadratic curve overlaid.
# TRUE/FALSE are spelled out (T and F are ordinary variables that can be
# reassigned), and the aes() repeated in stat_smooth() was dropped because it
# is inherited from ggplot().
ggplot(amerika, aes(x = horsepower, y = mpg)) +
  geom_point(alpha = 0.55, color = "lightblue") +
  stat_smooth(
    method = "lm", formula = y ~ poly(x, 2, raw = TRUE),
    lty = 1, col = "black", se = FALSE
  ) +
  theme_bw() +
  ggtitle("Regresi Polinomial Derajat 2 (Amerika)") +
  xlab("Horse Power") +
  ylab("mile per gallon")
Fungsi Tangga
Akan dilakukan cross validation untuk mengetahui fungsi tangga dengan knots berapa
yang menghasilkan model terbaik.
# 10-fold cross-validation (stratified on mpg) over the number of intervals
# used to cut horsepower into a step function, on the American subset.
set.seed(258)
cross_val <- vfold_cv(amerika,v=10,strata = "mpg")

# Candidate numbers of intervals passed to cut().
breaks <- 3:10

fungsitangga <- map_dfr(breaks, function(i){


# One row of hold-out RMSE/MAE per fold for `i` intervals.
metric_tangga <- map_dfr(cross_val$splits,
function(x){
training <- amerika[x$in_id,]
# Replace horsepower in the training fold by its interval factor.
training$horsepower <-
cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower, data=training)
# Recover the interval bounds by parsing the factor labels "(lower,upper]".
# NOTE(review): cut() labels are rounded for display, so the recovered bounds
# may differ slightly from the breaks actually used on the training fold.
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric(
sub("\\((.+),.*","\\1", labs_x) ),
upper = as.numeric(
sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
testing <- amerika[-x$in_id,]
# Bin the held-out horsepower with the bounds recovered from training.
horsepower_new <-
cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,
newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
# Rows with a missing value are dropped before scoring.
# NOTE(review): held-out values outside the recovered bounds become NA in
# cut(), so they are silently excluded from RMSE/MAE here.
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth =
data_eval$truth,response = data_eval$pred)
mae <- mlr3measures::mae(truth =
data_eval$truth,response = data_eval$pred)
metric <- c(rmse,mae)
names(metric) <- c("RMSE","MAE")
return(metric)
}
)

# Bare expression: it is neither returned nor assigned at this point.
metric_tangga

# average the metrics over the 10 folds


mean_metric_tangga <- colMeans(metric_tangga)

mean_metric_tangga
})
# Add the number of intervals as the first column and print the table.
fungsitangga <- cbind(breaks=breaks,fungsitangga)
data.frame(fungsitangga)

## breaks RMSE MAE


## 1 3 4.679093 3.665106
## 2 4 4.106173 3.073562
## 3 5 4.043372 3.104597
## 4 6 4.267424 3.216019
## 5 7 4.071020 3.171505
## 6 8 3.866973 2.933945
## 7 9 3.741909 2.864725
## 8 10 3.866206 2.980407

# Best number of intervals according to RMSE
slice_min(fungsitangga, RMSE)

## breaks RMSE MAE


## 1 9 3.741909 2.864725

# Best number of intervals according to MAE (kept for the final comparison)
best_t_1 <- slice_min(fungsitangga, MAE)
best_t_1

## breaks RMSE MAE


## 1 9 3.741909 2.864725

Diperoleh hasil bahwa, berdasarkan nilai RMSE dan MAE, fungsi tangga terbaik dengan
breaks 9 (knots = 8)
Fungsi Tangga dengan knots 8
# Step function with 8 knots: horsepower is cut into 9 intervals.
mod_tangga8 <- lm(
  formula = mpg ~ cut(horsepower, 9),
  data = amerika
)
summary(mod_tangga8)

##
## Call:
## lm(formula = mpg ~ cut(horsepower, 9), data = amerika)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.946 -1.946 -0.375 2.077 13.054
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.5933 0.9521 34.23 < 2e-16 ***
## cut(horsepower, 9)(71.8,91.6] -7.6477 1.0519 -7.27 5.24e-12 ***
## cut(horsepower, 9)(91.6,111] -12.6916 1.0682 -11.88 < 2e-16 ***
## cut(horsepower, 9)(111,131] -13.8267 1.3465 -10.27 < 2e-16 ***
## cut(horsepower, 9)(131,151] -17.1706 1.1025 -15.57 < 2e-16 ***
## cut(horsepower, 9)(151,171] -18.3933 1.2892 -14.27 < 2e-16 ***
## cut(horsepower, 9)(171,190] -18.9010 1.3973 -13.53 < 2e-16 ***
## cut(horsepower, 9)(190,210] -21.2600 1.7813 -11.94 < 2e-16 ***
## cut(horsepower, 9)(210,230] -19.2183 1.6144 -11.90 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.688 on 236 degrees of freedom
## Multiple R-squared: 0.6829, Adjusted R-squared: 0.6722
## F-statistic: 63.53 on 8 and 236 DF, p-value: < 2.2e-16

# American subset with the fitted 9-interval step function overlaid.
# FALSE is spelled out instead of F, which is an ordinary variable that can
# be reassigned.
ggplot(amerika, aes(x = horsepower, y = mpg)) +
  geom_point(alpha = 0.55, color = "grey") +
  stat_smooth(
    method = "lm", formula = y ~ cut(x, 9),
    lty = 1, col = "black", se = FALSE
  ) +
  theme_bw() +
  ggtitle("Fungsi Tangga Dengan Knots 8") +
  xlab("Horse Power") +
  ylab("mile per gallon")
Regresi Natural Spline
Akan dilakukan cross validation untuk mengetahui regresi natural spline berapa yang
menghasilkan model terbaik.
# 10-fold cross-validation (stratified on mpg) over the degrees of freedom of
# a natural spline in horsepower, on the American subset.
set.seed(258)
cross_val <- vfold_cv(amerika, v = 10, strata = "mpg")

df <- 2:6

naturalspline <- map_dfr(df, function(i) {
  # One row of hold-out RMSE/MAE per fold for `i` degrees of freedom.
  metric_spline <- map_dfr(cross_val$splits, function(x) {
    idx <- x$in_id
    held_out <- amerika[-idx, ]
    fit <- lm(mpg ~ ns(horsepower, df = i), data = amerika[idx, ])
    fitted_mpg <- predict(fit, newdata = held_out)
    c(
      RMSE = mlr3measures::rmse(truth = held_out$mpg, response = fitted_mpg),
      MAE = mlr3measures::mae(truth = held_out$mpg, response = fitted_mpg)
    )
  })

  # Mean of each metric across the 10 folds
  colMeans(metric_spline)
})

naturalspline <- cbind(df = df, naturalspline)

data.frame(naturalspline)

## df RMSE MAE
## 1 2 3.749580 2.883103
## 2 3 3.752743 2.882432
## 3 4 3.775525 2.896623
## 4 5 3.740099 2.824090
## 5 6 3.761657 2.852196

# Best df according to RMSE
slice_min(naturalspline, RMSE)

## df RMSE MAE
## 1 5 3.740099 2.82409

# Best df according to MAE (kept for the final model comparison)
best_s_1 <- slice_min(naturalspline, MAE)
best_s_1

## df RMSE MAE
## 1 5 3.740099 2.82409

Diperoleh hasil bahwa, berdasarkan nilai RMSE dan MAE regresi natural spline terbaik
dengan df = 5.
Regresi Natural Spline Dengan df = 5
Berikut adalah knots pada regresi natural spline dengan df = 5.
# Interior knots that ns() chooses for df = 5 on the American subset
attr(ns(x = amerika[["horsepower"]], df = 5), which = "knots")

## 20% 40% 60% 80%


## 85.0 99.2 122.0 150.0

diperoleh nilai knots yang ditentukan oleh komputer.


# Fit the natural spline selected by cross-validation (df = 5, i.e. the four
# interior knots listed above) on the American subset.
# ns() is used so that the fitted model matches the ns() basis used in the
# cross-validation loop and in the plot. The previous call used bs(), which
# fits an ordinary cubic B-spline with 7 coefficients for these knots.
# NOTE(review): the summary printed after this chunk in the document came
# from the bs() fit and has to be regenerated.
mod_spline <- lm(
  mpg ~ ns(horsepower, knots = c(85, 99.2, 122, 150)),
  data = amerika
)
summary(mod_spline)

##
## Call:
## lm(formula = mpg ~ bs(horsepower, knots = c(85, 99.2, 122, 150)),
## data = amerika)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.038 -2.318 -0.280 2.072 12.920
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 31.6731 3.4622 9.148
## bs(horsepower, knots = c(85, 99.2, 122, 150))1 0.1218 5.2599 0.023
## bs(horsepower, knots = c(85, 99.2, 122, 150))2 -4.0112 3.4141 -1.175
## bs(horsepower, knots = c(85, 99.2, 122, 150))3 -12.9785 3.7707 -3.442
## bs(horsepower, knots = c(85, 99.2, 122, 150))4 -12.2301 3.7205 -3.287
## bs(horsepower, knots = c(85, 99.2, 122, 150))5 -20.7067 4.2610 -4.860
## bs(horsepower, knots = c(85, 99.2, 122, 150))6 -18.3482 4.5498 -4.033
## bs(horsepower, knots = c(85, 99.2, 122, 150))7 -18.3423 4.0867 -4.488
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## bs(horsepower, knots = c(85, 99.2, 122, 150))1 0.981545
## bs(horsepower, knots = c(85, 99.2, 122, 150))2 0.241212
## bs(horsepower, knots = c(85, 99.2, 122, 150))3 0.000683 ***
## bs(horsepower, knots = c(85, 99.2, 122, 150))4 0.001165 **
## bs(horsepower, knots = c(85, 99.2, 122, 150))5 2.14e-06 ***
## bs(horsepower, knots = c(85, 99.2, 122, 150))6 7.44e-05 ***
## bs(horsepower, knots = c(85, 99.2, 122, 150))7 1.12e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.795 on 237 degrees of freedom
## Multiple R-squared: 0.6628, Adjusted R-squared: 0.6528
## F-statistic: 66.54 on 7 and 237 DF, p-value: < 2.2e-16

# American subset with the fitted natural spline (df = 5) and dashed vertical
# lines at its interior knots.
# The geom_vline() layer is now chained with a trailing `+` (it used to start
# a new line with `+`, which detaches it from the plot), and the title no
# longer contains an embedded line break.
ggplot(amerika, aes(x = horsepower, y = mpg)) +
  geom_point(alpha = 0.55, color = "lightgreen") +
  stat_smooth(
    method = "lm", formula = y ~ ns(x, df = 5),
    lty = 1, col = "black", se = FALSE
  ) +
  theme_bw() +
  ggtitle("Regresi Natural Spline Dengan df=5 (Amerika)") +
  xlab("Horse Power") +
  ylab("mile per gallon") +
  geom_vline(xintercept = c(85, 99.2, 122, 150), col = "green", lty = 2)
Perbandingan Ketiga Metode
# Collect the cross-validated RMSE/MAE of the best model of each method for
# the American subset (the tuning-parameter column is dropped).
a1 <- best_p_1 %>% select(-1)
b1 <- best_t_1 %>% select(-1)
c1 <- best_s_1 %>% select(-1)
nilai_metric <- rbind(a1, b1, c1)

# Each label is written on a single line; the third one previously contained
# embedded line breaks.
nama_model <- c(
  "Polinomial Derajat 2",
  "Fungsi Tangga (breaks=9)",
  "Regresi Natural Spline (df=5)"
)

perbandingan.metode1 <- data.frame(nama_model, nilai_metric)
perbandingan.metode1

## nama_model RMSE MAE


## 1 Polinomial Derajat 2 3.805157 2.909116
## 2 Fungsi Tangga (breaks=9) 3.741909 2.864725
## 3 Regresi Natural Spline (df=5) 3.740099 2.824090

# Method with the smallest MAE
perbandingan.metode1 %>% slice_min(MAE)

## nama_model RMSE MAE


## 1 Regresi Natural Spline (df=5) 3.740099 2.82409

Setelah melihat perolehan nilai MAE dari ketiga metode yang digunakan, dapat
disimpulkan bahwa model Regresi Natural Spline dengan df = 5 adalah model yang terbaik
untuk memodelkan hubungan antara variabel MPG dan HORSEPOWER berdasarkan asal
negara Amerika karena memiliki nilai MAE terkecil.
Berdasarkan Eropa
# Subset for European cars (origin == 2), keeping only the columns used below.
eropa <- filter(
  select(Auto, mpg, horsepower, origin),
  origin == 2
)
head(eropa)

## mpg horsepower origin


## 20 26 46 2
## 21 25 87 2
## 22 24 90 2
## 23 25 95 2
## 24 26 113 2
## 51 28 90 2

Selanjutnya akan dilakukan cross validation untuk menghasilkan pemodelan mpg vs


horsepower berdasarkan asalnya yaitu eropa yang optimal menggunakan metode regresi
polinomial, fungsi tangga, dan regresi natural spline.

Regresi Polinomial
Akan dilakukan cross validation untuk mengetahui regresi polinomial derajat berapa yang
menghasilkan model terbaik.
# 10-fold cross-validation (stratified on mpg) to choose the polynomial
# degree for mpg ~ horsepower on the European subset.
set.seed(17)
cross_val <- vfold_cv(eropa, v = 10, strata = "mpg")

## Warning: The number of observations in each quantile is below the recommended threshold of 20.
## • Stratification will use 3 breaks instead.

# NOTE(review): the grid starts at degree 2 here, whereas the full-data and
# American analyses use 1:4, so the linear fit is never considered for this
# subset. Confirm this is intentional.
degree <- 2:4

polinomial <- map_dfr(degree, function(i) {
  # One row of hold-out RMSE/MAE per fold for degree `i`.
  metric_polinom <- map_dfr(cross_val$splits, function(x) {
    train <- eropa[x$in_id, ]
    test <- eropa[-x$in_id, ]
    # The degree is passed by its real argument name. The previous
    # `derajat = i` is not a parameter of poly(); it only worked because
    # poly() treats a single length-one extra argument as the degree.
    mod <- lm(mpg ~ poly(horsepower, degree = i), data = train)
    pred <- predict(mod, newdata = test)
    c(
      RMSE = mlr3measures::rmse(truth = test$mpg, response = pred),
      MAE = mlr3measures::mae(truth = test$mpg, response = pred)
    )
  })

  # Average the metrics over the 10 folds.
  colMeans(metric_polinom)
})

polinomial <- cbind(DERAJAT = degree, polinomial)

data.frame(polinomial)

## DERAJAT RMSE MAE


## 1 2 4.818942 3.842530
## 2 3 4.910695 3.953016
## 3 4 4.968377 4.034688

# Best degree according to RMSE
slice_min(polinomial, RMSE)

## DERAJAT RMSE MAE


## 1 2 4.818942 3.84253

# Best degree according to MAE (kept for the final model comparison)
best_p_2 <- slice_min(polinomial, MAE)
best_p_2

## DERAJAT RMSE MAE


## 1 2 4.818942 3.84253

Berdasarkan hasil tersebut, diperoleh bahwa berdasarkan nilai RMSE dan MAE,
regresi polinomial berderajat 2 merupakan yang terbaik karena memiliki nilai terkecil.
Regresi Polinomial Derajat 2
# Refit the CV-selected quadratic polynomial on the European subset.
# `raw = T` is left exactly as written so the printed Call and coefficient
# labels stay the same.
mod_polinom2_e <- lm(
  formula = mpg ~ poly(horsepower, 2, raw = T),
  data = eropa
)
summary(mod_polinom2_e)

##
## Call:
## lm(formula = mpg ~ poly(horsepower, 2, raw = T), data = eropa)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.6282 -2.7724 -0.4052 2.2773 12.9676
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 47.1311209 8.2852287 5.689 3.3e-07 ***
## poly(horsepower, 2, raw = T)1 -0.2631496 0.1994015 -1.320 0.192
## poly(horsepower, 2, raw = T)2 0.0002425 0.0011574 0.210 0.835
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.899 on 65 degrees of freedom
## Multiple R-squared: 0.4622, Adjusted R-squared: 0.4456
## F-statistic: 27.93 on 2 and 65 DF, p-value: 1.76e-09

# European subset with the fitted quadratic curve overlaid.
# Each `+` now ends a line: a line that starts with `+` after a complete
# expression is not part of the plot.
ggplot(eropa, aes(x = horsepower, y = mpg)) +
  geom_point(alpha = 0.55, color = "blue") +
  stat_smooth(
    method = "lm", formula = y ~ poly(x, 2, raw = TRUE),
    lty = 1, col = "black", se = FALSE
  ) +
  theme_bw() +
  ggtitle("Regresi Polinomial Derajat 2 (Eropa)") +
  xlab("Horse Power") +
  ylab("mile per gallon")

Fungsi Tangga
Akan dilakukan cross validation untuk mengetahui fungsi tangga dengan knots berapa
yang menghasilkan model terbaik.
# 10-fold cross-validation (stratified on mpg) over the number of intervals
# used to cut horsepower into a step function, on the European subset.
set.seed(258)
cross_val <- vfold_cv(eropa,v=10,strata = "mpg")

## Warning: The number of observations in each quantile is below the


recommended threshold of 20.
## • Stratification will use 3 breaks instead.

# Candidate numbers of intervals passed to cut().
breaks <- 3:9

fungsitangga <- map_dfr(breaks, function(i){


# One row of hold-out RMSE/MAE per fold for `i` intervals.
metric_tangga <- map_dfr(cross_val$splits,
function(x){
training <- eropa[x$in_id,]
# Replace horsepower in the training fold by its interval factor.
training$horsepower <-
cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower, data=training)
# Recover the interval bounds by parsing the factor labels "(lower,upper]".
# NOTE(review): cut() labels are rounded for display, so the recovered bounds
# may differ slightly from the breaks actually used on the training fold.
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric(
sub("\\((.+),.*","\\1", labs_x) ),
upper = as.numeric(
sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
testing <- eropa[-x$in_id,]
# Bin the held-out horsepower with the bounds recovered from training.
horsepower_new <-
cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,
newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
# Rows with a missing value are dropped before scoring.
# NOTE(review): held-out values outside the recovered bounds become NA in
# cut(), so they are silently excluded from RMSE/MAE here.
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth =
data_eval$truth,response = data_eval$pred)
mae <- mlr3measures::mae(truth =
data_eval$truth,response = data_eval$pred)
metric <- c(rmse,mae)
names(metric) <- c("RMSE","MAE")
return(metric)
}
)

# Bare expression: it is neither returned nor assigned at this point.
metric_tangga

# average the metrics over the 10 folds


mean_metric_tangga <- colMeans(metric_tangga)

mean_metric_tangga
})
# Add the number of intervals as the first column and print the table.
fungsitangga <- cbind(breaks=breaks,fungsitangga)
data.frame(fungsitangga)

## breaks RMSE MAE


## 1 3 5.488316 4.524142
## 2 4 5.113734 4.117346
## 3 5 4.863383 3.807519
## 4 6 5.076525 4.070881
## 5 7 4.957220 3.833031
## 6 8 5.055641 4.007376
## 7 9 5.347387 4.270239

# Best number of intervals according to RMSE
slice_min(fungsitangga, RMSE)

## breaks RMSE MAE


## 1 5 4.863383 3.807519
# Best number of intervals according to MAE (kept for the final comparison)
best_t_2 <- slice_min(fungsitangga, MAE)
best_t_2

## breaks RMSE MAE


## 1 5 4.863383 3.807519

Diperoleh hasil bahwa, berdasarkan nilai RMSE dan MAE, fungsi tangga terbaik dengan
breaks 5 (knots = 4)
Fungsi Tangga dengan knots 4
# Step function with 4 knots: horsepower is cut into 5 intervals, fitted on
# the European subset.
mod_tangga5 <- lm(mpg ~ cut(horsepower, 5), data = eropa)
# Summarise the model that was just fitted. The previous call summarised
# mod_tangga8 (the 9-interval model for the American subset), so the output
# printed after this chunk in the document belongs to that model and has to
# be regenerated.
summary(mod_tangga5)

##
## Call:
## lm(formula = mpg ~ cut(horsepower, 9), data = amerika)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.946 -1.946 -0.375 2.077 13.054
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.5933 0.9521 34.23 < 2e-16 ***
## cut(horsepower, 9)(71.8,91.6] -7.6477 1.0519 -7.27 5.24e-12 ***
## cut(horsepower, 9)(91.6,111] -12.6916 1.0682 -11.88 < 2e-16 ***
## cut(horsepower, 9)(111,131] -13.8267 1.3465 -10.27 < 2e-16 ***
## cut(horsepower, 9)(131,151] -17.1706 1.1025 -15.57 < 2e-16 ***
## cut(horsepower, 9)(151,171] -18.3933 1.2892 -14.27 < 2e-16 ***
## cut(horsepower, 9)(171,190] -18.9010 1.3973 -13.53 < 2e-16 ***
## cut(horsepower, 9)(190,210] -21.2600 1.7813 -11.94 < 2e-16 ***
## cut(horsepower, 9)(210,230] -19.2183 1.6144 -11.90 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.688 on 236 degrees of freedom
## Multiple R-squared: 0.6829, Adjusted R-squared: 0.6722
## F-statistic: 63.53 on 8 and 236 DF, p-value: < 2.2e-16

# European subset with the fitted 5-interval step function overlaid.
# Each `+` now ends a line (a line that starts with `+` after a complete
# expression is not part of the plot), and the x-axis label no longer
# contains an embedded line break.
ggplot(eropa, aes(x = horsepower, y = mpg)) +
  geom_point(alpha = 0.55, color = "grey") +
  stat_smooth(
    method = "lm", formula = y ~ cut(x, 5),
    lty = 1, col = "black", se = FALSE
  ) +
  theme_bw() +
  ggtitle("Fungsi Tangga Dengan Knots 4") +
  xlab("Horse Power") +
  ylab("mile per gallon")
Regresi Natural Spline
Akan dilakukan cross validation untuk mengetahui regresi natural spline dengan df
berapa yang menghasilkan model terbaik.
# 10-fold cross-validation (stratified on mpg) over the degrees of freedom of
# a natural spline in horsepower, on the European subset.
set.seed(258)
cross_val <- vfold_cv(eropa, v = 10, strata = "mpg")

## Warning: The number of observations in each quantile is below the recommended threshold of 20.
## • Stratification will use 3 breaks instead.

df <- 2:6

naturalspline <- map_dfr(df, function(i) {
  # One row of hold-out RMSE/MAE per fold for `i` degrees of freedom.
  metric_spline <- map_dfr(cross_val$splits, function(x) {
    idx <- x$in_id
    held_out <- eropa[-idx, ]
    fit <- lm(mpg ~ ns(horsepower, df = i), data = eropa[idx, ])
    fitted_mpg <- predict(fit, newdata = held_out)
    c(
      RMSE = mlr3measures::rmse(truth = held_out$mpg, response = fitted_mpg),
      MAE = mlr3measures::mae(truth = held_out$mpg, response = fitted_mpg)
    )
  })

  # Mean of each metric across the 10 folds
  colMeans(metric_spline)
})

naturalspline <- cbind(df = df, naturalspline)

data.frame(naturalspline)

## df RMSE MAE
## 1 2 4.819233 3.785310
## 2 3 4.893459 3.873495
## 3 4 4.985735 3.933838
## 4 5 5.042966 3.984156
## 5 6 5.125449 3.980996

#berdasarkan rmse
naturalspline %>% slice_min(RMSE)

## df RMSE MAE
## 1 2 4.819233 3.78531

# row with the smallest MAE; stored because the method comparison reuses it
best_s_2 <- slice_min(naturalspline, order_by = MAE)
best_s_2

## df RMSE MAE
## 1 2 4.819233 3.78531

Diperoleh hasil bahwa, berdasarkan nilai RMSE dan MAE regresi natural spline terbaik
dengan df = 2.
Regresi Natural Spline Dengan df = 2
Berikut adalah knots pada regresi natural spline dengan df = 2.
# Interior knot(s) chosen by ns() for df = 2 on the eropa horsepower values.
attr(x = ns(eropa$horsepower, df = 2), which = "knots")

## 50%
## 76.5

diperoleh nilai knots yang ditentukan oleh komputer.


# Final natural cubic spline model for `eropa` with the single interior knot
# reported by ns(df = 2). ns() is used so this is the same model family that
# was cross-validated; bs() with the same knot is a cubic B-spline with four
# basis columns, i.e. a different and more flexible model.
# NOTE(review): the rendered summary that follows was produced by the old
# bs() call (4 spline coefficients) and must be regenerated.
mod_spline <- lm(mpg ~ ns(horsepower, knots = c(76.5)), data = eropa)
summary(mod_spline)

##
## Call:
## lm(formula = mpg ~ bs(horsepower, knots = c(76.5)), data = eropa)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.0429 -2.7588 -0.5807 2.3619 13.0192
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.7840 2.3824 14.600 < 2e-16 ***
## bs(horsepower, knots = c(76.5))1 -0.1427 4.9205 -0.029 0.976959
## bs(horsepower, knots = c(76.5))2 -10.2990 4.8625 -2.118 0.038123 *
## bs(horsepower, knots = c(76.5))3 -13.7392 5.7398 -2.394 0.019672 *
## bs(horsepower, knots = c(76.5))4 -18.4194 4.7706 -3.861 0.000269 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.967 on 63 degrees of freedom
## Multiple R-squared: 0.4642, Adjusted R-squared: 0.4302
## F-statistic: 13.64 on 4 and 63 DF, p-value: 4.55e-08

# Scatterplot of mpg against horsepower for `eropa` with the natural spline
# selected for this subset (df = 2) and a dashed line at its knot (76.5).
# The smoother and the title use df = 2 / Eropa to match this section; the
# previous df = 5 / "(Amerika)" did not match the data being plotted.
# Every `+` sits at the END of a line so that geom_vline() is really added.
ggplot(eropa, aes(x = horsepower, y = mpg)) +
  geom_point(alpha = 0.55, color = "lightgreen") +
  stat_smooth(
    method = "lm", formula = y ~ ns(x, df = 2),
    lty = 1, col = "black", se = FALSE
  ) +
  theme_bw() +
  ggtitle("Regresi Natural Spline Dengan df=2 (Eropa)") +
  xlab("Horse Power") +
  ylab("mile per gallon") +
  geom_vline(xintercept = c(76.5), col = "green", lty = 2)

#### Perbandingan Ketiga Metode
# Best cross-validated result of each method for `eropa`; the first column
# (the tuning value) is dropped so the three rows share the RMSE/MAE columns.
a2 <- best_p_2 %>% select(-1)
b2 <- best_t_2 %>% select(-1)
c2 <- best_s_2 %>% select(-1)
nilai_metric <- rbind(a2, b2, c2)

# Row labels, in the same order as the rows bound above. Each label is a
# single-line string so no line breaks end up inside the model name.
nama_model <- c(
  "Polinomial Derajat 2",
  "Fungsi Tangga (breaks=5)",
  "Regresi Natural Spline (df=2)"
)

perbandingan.metode2 <- data.frame(nama_model, nilai_metric)
perbandingan.metode2

## nama_model RMSE MAE


## 1 Polinomial Derajat 2 4.818942 3.842530
## 2 Fungsi Tangga (breaks=5) 4.863383 3.807519
## 3 Regresi Natural Spline (df=2) 4.819233 3.785310

# method with the smallest MAE
slice_min(perbandingan.metode2, order_by = MAE)

## nama_model RMSE MAE


## 1 Regresi Natural Spline (df=2) 4.819233 3.78531

Setelah melihat perolehan nilai MAE dari ketiga metode yang digunakan, dapat
disimpulkan bahwa model Regresi Natural Spline dengan df = 2 adalah model yang terbaik
untuk memodelkan hubungan antara variabel MPG dan HORSEPOWER berdasarkan asal
negara Eropa karena memiliki nilai MAE terkecil.

Berdasarkan Jepang
# Rows of Auto with origin == 3, reduced to the mpg, horsepower and origin
# columns; then preview the first rows.
jepang <- Auto %>%
  filter(origin == 3) %>%
  select(mpg, horsepower, origin)
head(jepang)

## mpg horsepower origin


## 15 24 95 3
## 19 27 88 3
## 30 27 88 3
## 32 25 95 3
## 54 31 65 3
## 55 35 69 3

Selanjutnya akan dilakukan cross validation untuk menghasilkan pemodelan mpg vs


horsepower berdasarkan jepang yang optimal menggunakan metode regresi polinomial,
fungsi tangga, dan regresi natural spline.

Regresi Polinomial
Akan dilakukan cross validation untuk mengetahui regresi polinomial derajat berapa yang
menghasilkan model terbaik.
# Fix the RNG state so the fold assignment is reproducible, then split
# `jepang` into 10 cross-validation folds stratified on mpg.
set.seed(7)
cross_val <- vfold_cv(data = jepang, v = 10, strata = "mpg")
## Warning: The number of observations in each quantile is below the
recommended threshold of 20.
## • Stratification will use 3 breaks instead.

# Candidate polynomial degrees.
degree <- 1:4

# For every candidate degree: fit mpg ~ poly(horsepower, degree) on the
# in-fold rows of each split in `cross_val`, score the fit on the remaining
# rows, and keep the fold-averaged RMSE and MAE (one output row per degree).
polinomial <- map_dfr(degree, function(i) {
  metric_polinom <- map_dfr(cross_val$splits, function(x) {
    train <- jepang[x$in_id, ]
    test <- jepang[-x$in_id, ]
    # Pass the degree through poly()'s documented `degree` argument.
    # `derajat = i` is not an argument of poly(); it only worked because
    # poly() treats a single length-one value in `...` as the degree.
    mod <- lm(mpg ~ poly(horsepower, degree = i), data = train)
    pred <- predict(mod, newdata = test)
    truth <- test$mpg
    rmse <- mlr3measures::rmse(truth = truth, response = pred)
    mae <- mlr3measures::mae(truth = truth, response = pred)
    c(RMSE = rmse, MAE = mae)
  })

  # average each metric over the folds
  colMeans(metric_polinom)
})

# Attach the degree to its row of averaged metrics and print the table.
polinomial <- cbind(DERAJAT = degree, polinomial)
data.frame(polinomial)

## DERAJAT RMSE MAE


## 1 1 4.323526 3.400140
## 2 2 4.435441 3.434890
## 3 3 3.869776 2.938533
## 4 4 4.305951 3.170233

# row with the smallest RMSE
slice_min(polinomial, order_by = RMSE)

## DERAJAT RMSE MAE


## 1 3 3.869776 2.938533

# row with the smallest MAE; stored because the method comparison reuses it
best_p_3 <- slice_min(polinomial, order_by = MAE)
best_p_3

## DERAJAT RMSE MAE


## 1 3 3.869776 2.938533
Berdasarkan hasil tersebut, diperoleh bahwa menurut nilai RMSE dan MAE,
regresi polinomial berderajat 3 merupakan yang terbaik karena memiliki nilai terkecil.
Regresi Polinomial Derajat 3
# Degree-3 raw (non-orthogonal) polynomial regression of mpg on horsepower
# for `jepang`. `TRUE` is spelled out because `T` is an ordinary variable
# that can be reassigned; this only changes the coefficient labels in the
# printed summary ("raw = TRUE" instead of "raw = T"), not the estimates.
mod_polinom2_j <- lm(mpg ~ poly(horsepower, 3, raw = TRUE), data = jepang)
summary(mod_polinom2_j)

##
## Call:
## lm(formula = mpg ~ poly(horsepower, 3, raw = T), data = jepang)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.8602 -2.7115 -0.5224 2.1985 11.6985
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.163e+01 3.494e+01 -2.050 0.04385 *
## poly(horsepower, 3, raw = T)1 4.308e+00 1.230e+00 3.503 0.00078 ***
## poly(horsepower, 3, raw = T)2 -5.498e-02 1.400e-02 -3.926 0.00019 ***
## poly(horsepower, 3, raw = T)3 2.142e-04 5.170e-05 4.142 8.92e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.031 on 75 degrees of freedom
## Multiple R-squared: 0.5787, Adjusted R-squared: 0.5618
## F-statistic: 34.34 on 3 and 75 DF, p-value: 4.485e-14

# Scatterplot of mpg against horsepower for `jepang` with the degree-3 raw
# polynomial fit drawn as a black line (no confidence band).
ggplot(jepang, aes(x = horsepower, y = mpg)) +
  geom_point(alpha = 0.55, color = "lightblue") +
  stat_smooth(
    aes(x = horsepower, y = mpg),
    method = "lm",
    formula = y ~ poly(x, 3, raw = TRUE),
    lty = 1,
    col = "black",
    se = FALSE
  ) +
  theme_bw() +
  ggtitle("Regresi Polinomial Derajat 3 (Jepang)") +
  xlab("Horse Power") +
  ylab("mile per gallon")
#### Fungsi Tangga
Akan dilakukan cross validation untuk mengetahui fungsi tangga dengan jumlah knots berapa
yang menghasilkan model terbaik.
# Fix the RNG state so the fold assignment is reproducible, then split
# `jepang` into 10 cross-validation folds stratified on mpg.
set.seed(258)
cross_val <- vfold_cv(data = jepang, v = 10, strata = "mpg")

## Warning: The number of observations in each quantile is below the


recommended threshold of 20.
## • Stratification will use 3 breaks instead.

# Candidate numbers of horsepower bins to compare.
breaks <- 3:8

# For each candidate bin count i, cross-validate a step-function fit over the
# splits in `cross_val` and return the fold-averaged RMSE and MAE.
fungsitangga <- map_dfr(breaks, function(i){


metric_tangga <- map_dfr(cross_val$splits,
function(x){
# Bin the in-fold rows' horsepower into i intervals and regress mpg on the
# resulting factor (one fitted level per bin).
training <- jepang[x$in_id,]
training$horsepower <-
cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower, data=training)
# Recover numeric bin edges by parsing the factor labels of the form
# "(lower,upper]": the first sub() keeps the text after "(", the second the
# text between "," and "]".
# NOTE(review): the edges are read back from printed labels, so they carry
# whatever rounding cut() applied when formatting them -- confirm this is
# acceptable.
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric(
sub("\\((.+),.*","\\1", labs_x) ),
upper = as.numeric(
sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
# Bin the held-out rows with the training edges: the first lower bound
# followed by every upper bound.
testing <- jepang[-x$in_id,]
horsepower_new <-
cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,
newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
# Rows where truth or prediction is NA are dropped before scoring.
# NOTE(review): presumably this covers held-out horsepower values that fall
# outside the training bin range -- verify how many rows are discarded.
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth =
data_eval$truth,response = data_eval$pred)
mae <- mlr3measures::mae(truth =
data_eval$truth,response = data_eval$pred)
metric <- c(rmse,mae)
names(metric) <- c("RMSE","MAE")
return(metric)
}
)

metric_tangga

# average each metric over the 10 folds


mean_metric_tangga <- colMeans(metric_tangga)

mean_metric_tangga
})
# Attach the bin count to its row of averaged metrics and print the table.
fungsitangga <- cbind(breaks=breaks,fungsitangga)
data.frame(fungsitangga)

## breaks RMSE MAE


## 1 3 3.989153 3.274142
## 2 4 3.916601 3.201922
## 3 5 3.923824 3.255117
## 4 6 4.059525 3.275023
## 5 7 4.330790 3.510876
## 6 8 3.836695 3.156339

# row with the smallest RMSE
slice_min(fungsitangga, order_by = RMSE)

## breaks RMSE MAE


## 1 8 3.836695 3.156339

# row with the smallest MAE; stored because the method comparison reuses it
best_t_3 <- slice_min(fungsitangga, order_by = MAE)
best_t_3

## breaks RMSE MAE


## 1 8 3.836695 3.156339

Diperoleh hasil bahwa, berdasarkan nilai RMSE dan MAE, fungsi tangga terbaik dengan
breaks 8 (knots = 7)
Fungsi Tangga dengan knots 7
# Step function with 7 knots (8 equal-width horsepower bins) for `jepang`.
# The model is fitted on `jepang`, the data frame this section
# cross-validates and plots; `data = amerika` fitted a different data frame.
# NOTE(review): the rendered summary that follows (237 residual df, i.e. 245
# rows) was produced by the old `amerika` call and must be regenerated.
mod_tangga7 <- lm(mpg ~ cut(horsepower, 8), data = jepang)
summary(mod_tangga7)

##
## Call:
## lm(formula = mpg ~ cut(horsepower, 8), data = amerika)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.420 -1.900 0.065 2.159 13.159
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.4200 0.8729 33.706 < 2e-16 ***
## cut(horsepower, 8)(74.2,96.5] -4.5787 0.9824 -4.661 5.26e-06 ***
## cut(horsepower, 8)(96.5,119] -9.6780 1.0328 -9.371 < 2e-16 ***
## cut(horsepower, 8)(119,141] -12.7330 1.1935 -10.669 < 2e-16 ***
## cut(horsepower, 8)(141,163] -14.7050 1.0690 -13.756 < 2e-16 ***
## cut(horsepower, 8)(163,186] -15.4850 1.2344 -12.545 < 2e-16 ***
## cut(horsepower, 8)(186,208] -16.6343 1.7143 -9.704 < 2e-16 ***
## cut(horsepower, 8)(208,230] -16.5200 1.5118 -10.927 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.904 on 237 degrees of freedom
## Multiple R-squared: 0.6432, Adjusted R-squared: 0.6326
## F-statistic: 61.03 on 7 and 237 DF, p-value: < 2.2e-16

# Scatterplot of mpg against horsepower for `jepang` with a step-function
# fit over 8 equal-width horsepower bins (7 knots).
ggplot(jepang, aes(x = horsepower, y = mpg)) +
  geom_point(alpha = 0.55, color = "grey") +
  stat_smooth(
    method = "lm",
    formula = y ~ cut(x, 8),
    lty = 1,
    col = "black",
    se = FALSE
  ) +
  theme_bw() +
  ggtitle("Fungsi Tangga Dengan Knots 7") +
  xlab("Horse Power") +
  ylab("mile per gallon")
#### Regresi Natural Spline
Akan dilakukan cross validation untuk mengetahui regresi natural spline dengan df berapa yang
menghasilkan model terbaik.
# Fix the RNG state so the fold assignment is reproducible, then split
# `jepang` into 10 cross-validation folds stratified on mpg.
set.seed(259)
cross_val <- vfold_cv(data = jepang, v = 10, strata = "mpg")

## Warning: The number of observations in each quantile is below the


recommended threshold of 20.
## • Stratification will use 3 breaks instead.

# Candidate degrees of freedom for the natural spline basis.
df <- 2:6

# For every candidate df: fit mpg ~ ns(horsepower, df) on the in-fold rows of
# each split in `cross_val`, score the fit on the remaining rows, and keep the
# fold-averaged RMSE and MAE (one output row per df).
naturalspline <- map_dfr(df, function(n_df) {
  fold_scores <- map_dfr(cross_val$splits, function(split) {
    train_rows <- jepang[split$in_id, ]
    test_rows <- jepang[-split$in_id, ]
    fit <- lm(mpg ~ ns(horsepower, df = n_df), data = train_rows)
    fitted_test <- predict(fit, newdata = test_rows)
    setNames(
      c(
        mlr3measures::rmse(truth = test_rows$mpg, response = fitted_test),
        mlr3measures::mae(truth = test_rows$mpg, response = fitted_test)
      ),
      c("RMSE", "MAE")
    )
  })
  # average each metric over the folds
  colMeans(fold_scores)
})

# Attach the df value to its row of averaged metrics and print the table.
naturalspline <- cbind(df = df, naturalspline)
data.frame(naturalspline)

## df RMSE MAE
## 1 2 4.574264 3.493363
## 2 3 3.985625 3.030560
## 3 4 4.045443 3.044222
## 4 5 3.931023 2.965285
## 5 6 3.951896 2.985599

# row with the smallest RMSE
slice_min(naturalspline, order_by = RMSE)

## df RMSE MAE
## 1 5 3.931023 2.965285

# row with the smallest MAE; stored because the method comparison reuses it
best_s_3 <- slice_min(naturalspline, order_by = MAE)
best_s_3

## df RMSE MAE
## 1 5 3.931023 2.965285

Diperoleh hasil bahwa, berdasarkan nilai RMSE dan MAE regresi natural spline terbaik
dengan df = 5.
Regresi Natural Spline Dengan df = 5
Berikut adalah knots pada regresi natural spline dengan df = 5.
# Interior knots chosen by ns() for df = 5 on the jepang horsepower values.
attr(x = ns(jepang$horsepower, df = 5), which = "knots")

## 20% 40% 60% 80%


## 65.0 69.2 88.0 96.0

diperoleh nilai knots yang ditentukan oleh komputer.


# Final natural cubic spline model for `jepang` with the four interior knots
# reported by ns(df = 5). ns() is used so this is the same model family that
# was cross-validated; bs() with the same knots is a cubic B-spline with
# seven basis columns, i.e. a different and more flexible model.
# NOTE(review): the rendered summary that follows was produced by the old
# bs() call (7 spline coefficients) and must be regenerated.
mod_spline <- lm(
  mpg ~ ns(horsepower, knots = c(65, 69.2, 88, 96)),
  data = jepang
)
summary(mod_spline)
##
## Call:
## lm(formula = mpg ~ bs(horsepower, knots = c(65, 69.2, 88, 96)),
## data = jepang)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.7291 -2.6979 -0.2466 1.4271 10.8597
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 31.9499 2.3461 13.619
## bs(horsepower, knots = c(65, 69.2, 88, 96))1 5.3774 6.3634 0.845
## bs(horsepower, knots = c(65, 69.2, 88, 96))2 5.5408 3.1509 1.758
## bs(horsepower, knots = c(65, 69.2, 88, 96))3 -0.8369 3.6078 -0.232
## bs(horsepower, knots = c(65, 69.2, 88, 96))4 -3.8432 3.7673 -1.020
## bs(horsepower, knots = c(65, 69.2, 88, 96))5 -9.0415 4.6610 -1.940
## bs(horsepower, knots = c(65, 69.2, 88, 96))6 -15.0646 6.1037 -2.468
## bs(horsepower, knots = c(65, 69.2, 88, 96))7 0.2628 4.6155 0.057
## Pr(>|t|)
## (Intercept) <2e-16 ***
## bs(horsepower, knots = c(65, 69.2, 88, 96))1 0.4009
## bs(horsepower, knots = c(65, 69.2, 88, 96))2 0.0830 .
## bs(horsepower, knots = c(65, 69.2, 88, 96))3 0.8172
## bs(horsepower, knots = c(65, 69.2, 88, 96))4 0.3111
## bs(horsepower, knots = c(65, 69.2, 88, 96))5 0.0564 .
## bs(horsepower, knots = c(65, 69.2, 88, 96))6 0.0160 *
## bs(horsepower, knots = c(65, 69.2, 88, 96))7 0.9548
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.053 on 71 degrees of freedom
## Multiple R-squared: 0.5968, Adjusted R-squared: 0.557
## F-statistic: 15.01 on 7 and 71 DF, p-value: 7.304e-12

# Scatterplot of mpg against horsepower for `jepang` with the natural spline
# fit (df = 5) and dashed vertical lines at its four interior knots.
# Every `+` sits at the END of a line: a line that starts with `+` is parsed
# as a separate expression, so geom_vline() would not be added to the plot.
ggplot(jepang, aes(x = horsepower, y = mpg)) +
  geom_point(alpha = 0.55, color = "lightgreen") +
  stat_smooth(
    method = "lm", formula = y ~ ns(x, df = 5),
    lty = 1, col = "black", se = FALSE
  ) +
  theme_bw() +
  ggtitle("Regresi Natural Spline Dengan df=5 (Jepang)") +
  xlab("Horse Power") +
  ylab("mile per gallon") +
  geom_vline(xintercept = c(65, 69.2, 88, 96), col = "green", lty = 2)
#### Perbandingan Ketiga Metode
# Best cross-validated result of each method for `jepang`; the first column
# (the tuning value) is dropped so the three rows share the RMSE/MAE columns.
a3 <- best_p_3 %>% select(-1)
b3 <- best_t_3 %>% select(-1)
c3 <- best_s_3 %>% select(-1)
# Bind this section's own results. Binding a1, b1, c1 (objects that are not
# created in this section) filled the table with metrics that do not belong
# to the models named below.
# NOTE(review): the rendered table and the conclusion that follow were
# produced with a1/b1/c1 and must be regenerated and re-checked.
nilai_metric <- rbind(a3, b3, c3)

# Row labels, in the same order as the rows bound above. Each label is a
# single-line string so no line breaks end up inside the model name.
nama_model <- c(
  "Polinomial Derajat 3",
  "Fungsi Tangga (breaks=8)",
  "Regresi Natural Spline (df=5)"
)

perbandingan.metode3 <- data.frame(nama_model, nilai_metric)
perbandingan.metode3

## nama_model RMSE MAE


## 1 Polinomial Derajat 3 3.805157 2.909116
## 2 Fungsi Tangga (breaks=8) 3.741909 2.864725
## 3 Regresi Natural Spline (df=5) 3.740099 2.824090

# method with the smallest MAE
slice_min(perbandingan.metode3, order_by = MAE)

## nama_model RMSE MAE


## 1 Regresi Natural Spline (df=5) 3.740099 2.82409

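Setelah melihat perolehan nilai MAE dari ketiga metode yang digunakan, dapat
disimpulkan bahwa model Regresi Natural Spline dengan df = 5 adalah model yang terbaik
untuk memodelkan hubungan antara variabel MPG dan HORSEPOWER berdasarkan asal
negara Jepang karena memiliki nilai MAE terkecil pada tabel di atas.
Catatan: keluaran tabel di atas disusun dari objek a1, b1, c1, bukan a3, b3, c3. Jika
digunakan hasil cross validation data Jepang sendiri (MAE: polinomial derajat 3 = 2.938533,
fungsi tangga breaks 8 = 3.156339, natural spline df 5 = 2.965285), nilai MAE terkecil
justru dimiliki Regresi Polinomial Derajat 3, sehingga kesimpulan ini perlu ditinjau kembali.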

Anda mungkin juga menyukai