Variabel Dummy
Contoh 1. Berikut ini diberikan contoh analisis hubungan antara kapasitas paru-paru dan kelompok tinggi
badan. Tinggi badan merupakan variabel numerik; untuk menyusun kelompok tinggi badan, digunakan variabel
dummy tinggi badan pada data kapasitas paru-paru.
library(readr)
LungCap<-read_csv("E:/KUMPULAN BAHAN KUL UNY/Bahan Kuliah Pengantar Analisi Regresi/PPT Kuliah/LungCapDa
##
## -- Column specification --------------------------------------------------------
## cols(
## Age = col_double(),
## LungCap = col_double(),
## Height = col_double(),
## SqrHeight = col_double(),
## HeightT = col_double(),
## SqrHeightT = col_double(),
## SqrAge = col_double(),
## AgeT = col_double(),
## SqrAgeT = col_double(),
## AgeHeight = col_double(),
## AgeTHeightT = col_double()
## )
#Jika tinggi badan akan dibagi menjadi dua kelompok dengan interval A = (0,60], B = (60,75] #gunakan per
# Right-closed grouping: A = (0, 60], B = (60, 75]. cut() returns NA for any
# height that falls outside the supplied breaks.
CatHeight <- cut(LungCap$Height, breaks = c(0, 60, 75), labels = c("A", "B"))
# Left-closed grouping instead: A = [0, 60), B = [60, 75).
CatHeight <- cut(LungCap$Height, breaks = c(0, 60, 75), labels = c("A", "B"),
                 right = FALSE)
# Alternative call with a single number of intervals.
# NOTE(review): this is NOT equivalent to breaks = c(0, 60, 75). With
# breaks = 2, cut() splits the observed range of Height into two equal-width
# intervals, so the cut point is the midpoint of that range rather than 60.
# Because this is the last assignment, it is the grouping stored in LungCapNew.
CatHeight <- cut(LungCap$Height, breaks = 2, labels = c("A", "B"),
                 right = FALSE)
# Append the height group to the original data and inspect the result.
LungCapNew <- data.frame(LungCap, CatHeight)
str(LungCapNew)
1
# Mean lung capacity of the subjects in height group A.
mean(LungCapNew$LungCap[LungCapNew$CatHeight == "A"])
# Mean lung capacity of the subjects in height group B. The printed output
# shows a second mean right after the group-A value, so the matching command
# is included here.
mean(LungCapNew$LungCap[LungCapNew$CatHeight == "B"])
## [1] 3.576216
## [1] 7.308531
# Regress lung capacity on the height-group dummy variable.
# Group "A" is the reference category, so the reported coefficient is the
# difference of group B from group A.
model1 <- lm(formula = LungCap ~ CatHeight, data = LungCapNew)
summary(model1)
##
## Call:
## lm(formula = LungCap ~ CatHeight, data = LungCapNew)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.2265 -1.2760 -0.2379 0.8998 8.0705
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.5762 0.1196 29.91 <2e-16 ***
## CatHeightB 3.7323 0.1512 24.68 <2e-16 ***
## ---
## Signif. codes: 0 ’***’ 0.001 ’**’ 0.01 ’*’ 0.05 ’.’ 0.1 ’ ’ 1
##
## Residual standard error: 1.872 on 652 degrees of freedom
## Multiple R-squared: 0.4831, Adjusted R-squared: 0.4823
## F-statistic: 609.3 on 1 and 652 DF, p-value: < 2.2e-16
##
## Call:
## lm(formula = LungCap ~ CatHeight, data = LungCapNew)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.2265 -1.2760 -0.2379 0.8998 8.0705
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.30853 0.09255 78.97 <2e-16 ***
## CatHeightA -3.73231 0.15120 -24.68 <2e-16 ***
## ---
## Signif. codes: 0 ’***’ 0.001 ’**’ 0.01 ’*’ 0.05 ’.’ 0.1 ’ ’ 1
2
##
## Residual standard error: 1.872 on 652 degrees of freedom
## Multiple R-squared: 0.4831, Adjusted R-squared: 0.4823
## F-statistic: 609.3 on 1 and 652 DF, p-value: < 2.2e-16
15
10
0
B A
3
15
10
LungCap
0
B A
CatHeight
# Simple regression of lung capacity on age, ignoring the height group.
model1 <- lm(LungCap ~ Age, data = LungCapNew)

# Colour / plotting-symbol code per observation. Arithmetic on a factor is not
# defined in R (`CatHeight + 1` returns NA with a warning, so no points would
# be drawn); convert the factor to its integer level codes first.
height_levels <- levels(LungCapNew$CatHeight)
height_code <- as.integer(LungCapNew$CatHeight) + 1

plot(LungCap ~ Age, data = LungCapNew,
     col = height_code, pch = height_code, cex = 0.5)
# Single fitted line for all subjects.
abline(model1, lwd = 3, col = "grey")
# Build the legend from the factor's level order so that each label is paired
# with the code actually used for that group, whatever the reference level is.
legend("topright", height_levels,
       col = seq_along(height_levels) + 1,
       pch = seq_along(height_levels) + 1)
4
15
B
A
10
LungCap
5
0
5 10 15
Age
# Additive model: common Age slope, separate intercept per height group.
model2 <- lm(LungCap ~ Age + CatHeight, data = LungCapNew)

# Per-group intercepts, obtained as the fitted value at Age = 0 for each level.
# This avoids assuming which level is the reference category (hard-coding
# coef(model2)[1] as "group A" is only right when A is the reference level).
height_levels <- levels(LungCapNew$CatHeight)
group_intercepts <- setNames(
  predict(model2,
          newdata = data.frame(
            Age = 0,
            CatHeight = factor(height_levels, levels = height_levels)
          )),
  height_levels
)
height_A <- group_intercepts[["A"]]
height_B <- group_intercepts[["B"]]
# No interaction term, so both groups share the same Age slope.
slope_A <- coef(model2)[["Age"]]
slope_B <- coef(model2)[["Age"]]

# Colour / symbol code per group. A factor cannot be used in arithmetic
# (`CatHeight + 1` yields NA with a warning), so use its integer level codes.
group_code <- setNames(seq_along(height_levels) + 1, height_levels)
point_code <- as.integer(LungCapNew$CatHeight) + 1

plot(LungCap ~ Age, data = LungCapNew,
     col = point_code, pch = point_code, cex = 0.5)
# Legend labels follow the level order so they always match the codes above.
legend("topright", paste("Height", height_levels),
       col = unname(group_code), pch = unname(group_code))
# Each fitted line uses the same colour as the points of its group.
abline(height_A, slope_A, col = group_code[["A"]], lty = 1, lwd = 2) # Height A
abline(height_B, slope_B, col = group_code[["B"]], lty = 2, lwd = 2) # Height B
5
15
Height B
Height A
10
LungCap
5
0
5 10 15
Age
Contoh 2. Berikut ini diberikan contoh analisis hubungan antara berat bayi (Wgt) dengan status merokok ibu
(Smoke) dan lama kehamilan dalam satuan minggu (Gest). Perhatikan tipe data dari variabel
kategorik dalam data.
birthsmokers <- read.table("E:\\KUMPULAN BAHAN KUL UNY\\Bahan Kuliah Pengantar Analisi Regresi\\PPT Kuli
# Put the columns of birthsmokers on the search path.
# NOTE(review): the later call lm(Wgt ~ Gest, subset = Smoke == "no") has no
# data= argument and appears to rely on this attach(); do not remove it
# without also passing data = birthsmokers to those calls.
attach(birthsmokers)
# Display the structure of the data set.
str(birthsmokers)
6
3600
3200
Wgt
2800
2400
34 36 38 40 42
Gest
# NOTE(review): `model` is not defined in the visible code; the Call printed
# below shows it was fitted as lm(formula = Wgt ~ Gest + Smoke).
summary(model)
##
## Call:
## lm(formula = Wgt ~ Gest + Smoke)
##
## Residuals:
## Min 1Q Median 3Q Max
## -223.693 -92.063 -9.365 79.663 197.507
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2389.573 349.206 -6.843 1.63e-07 ***
## Gest 143.100 9.128 15.677 1.07e-15 ***
## Smokeyes -244.544 41.982 -5.825 2.58e-06 ***
## ---
## Signif. codes: 0 ’***’ 0.001 ’**’ 0.01 ’*’ 0.05 ’.’ 0.1 ’ ’ 1
##
## Residual standard error: 115.5 on 29 degrees of freedom
## Multiple R-squared: 0.8964, Adjusted R-squared: 0.8892
## F-statistic: 125.4 on 2 and 29 DF, p-value: 5.289e-15
# Confidence intervals for the coefficients of `model`; the printed columns
# (2.5 % and 97.5 %) are the bounds of the 95% intervals.
confint(model)
## 2.5 % 97.5 %
7
## (Intercept) -3103.7795 -1675.3663
## Gest 124.4312 161.7694
## Smokeyes -330.4064 -158.6817
# Fitted birth weight with a confidence interval at Gest = 38, once for a
# smoking mother and once for a non-smoking mother.
predict(
  model,
  newdata = data.frame(Gest = c(38, 38), Smoke = c("yes", "no")),
  interval = "confidence"
)
# Repeat the analysis using only the non-smoking subjects.
# Wgt, Gest and Smoke are not passed via data=; they are resolved from the
# calling environment.
model.0 <- lm(formula = Wgt ~ Gest, subset = Smoke == "no")
summary(model.0)
##
## Call:
## lm(formula = Wgt ~ Gest, subset = Smoke == "no")
##
## Residuals:
## Min 1Q Median 3Q Max
## -171.52 -101.59 23.28 83.63 139.48
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2546.14 457.29 -5.568 6.93e-05 ***
## Gest 147.21 11.97 12.294 6.85e-09 ***
## ---
## Signif. codes: 0 ’***’ 0.001 ’**’ 0.01 ’*’ 0.05 ’.’ 0.1 ’ ’ 1
##
## Residual standard error: 106.9 on 14 degrees of freedom
## Multiple R-squared: 0.9152, Adjusted R-squared: 0.9092
## F-statistic: 151.1 on 1 and 14 DF, p-value: 6.852e-09
# Fitted value and confidence interval from model.0 at Gest = 38.
predict(model.0, newdata = data.frame(Gest = 38), interval = "confidence")
##
## Call:
## lm(formula = Wgt ~ Gest, subset = Smoke == "yes")
##
## Residuals:
## Min 1Q Median 3Q Max
## -228.53 -64.86 -19.10 93.89 184.53
8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2474.56 553.97 -4.467 0.000532 ***
## Gest 139.03 14.11 9.851 1.12e-07 ***
## ---
## Signif. codes: 0 ’***’ 0.001 ’**’ 0.01 ’*’ 0.05 ’.’ 0.1 ’ ’ 1
##
## Residual standard error: 126.6 on 14 degrees of freedom
## Multiple R-squared: 0.8739, Adjusted R-squared: 0.8649
## F-statistic: 97.04 on 1 and 14 DF, p-value: 1.125e-07
# Fitted value and confidence interval from model.1 at Gest = 38.
# NOTE(review): model.1 is not defined in the visible code; presumably it is
# the fit whose printed Call uses subset = Smoke == "yes" -- confirm.
predict(model.1, newdata = data.frame(Gest = 38), interval = "confidence")
##
## Call:
## lm(formula = Wgt ~ Gest + Smoke2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -223.693 -92.063 -9.365 79.663 197.507
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2511.845 353.449 -7.107 8.07e-08 ***
## Gest 143.100 9.128 15.677 1.07e-15 ***
## Smoke2 -122.272 20.991 -5.825 2.58e-06 ***
## ---
## Signif. codes: 0 ’***’ 0.001 ’**’ 0.01 ’*’ 0.05 ’.’ 0.1 ’ ’ 1
##
## Residual standard error: 115.5 on 29 degrees of freedom
## Multiple R-squared: 0.8964, Adjusted R-squared: 0.8892
## F-statistic: 125.4 on 2 and 29 DF, p-value: 5.289e-15
##
## Call:
## lm(formula = Wgt ~ Gest + Smoke, contrasts = list(Smoke = "contr.sum"))
##
## Residuals:
9
## Min 1Q Median 3Q Max
## -223.693 -92.063 -9.365 79.663 197.507
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2511.845 353.449 -7.107 8.07e-08 ***
## Gest 143.100 9.128 15.677 1.07e-15 ***
## Smoke1 122.272 20.991 5.825 2.58e-06 ***
## ---
## Signif. codes: 0 ’***’ 0.001 ’**’ 0.01 ’*’ 0.05 ’.’ 0.1 ’ ’ 1
##
## Residual standard error: 115.5 on 29 degrees of freedom
## Multiple R-squared: 0.8964, Adjusted R-squared: 0.8892
## F-statistic: 125.4 on 2 and 29 DF, p-value: 5.289e-15
10