Anda di halaman 1dari 10

Modul 9

Variabel Dummy

Contoh 1. Berikut ini akan diberikan contoh analisis hubungan kapasitas paru-paru dengan kelompok tinggi
badan. Tinggi badan merupakan variabel numerik; untuk menyusun kelompok tinggi badan, digunakan variabel
dummy tinggi badan pada data kapasitas paru-paru.

library(readr)
LungCap<-read_csv("E:/KUMPULAN BAHAN KUL UNY/Bahan Kuliah Pengantar Analisi Regresi/PPT Kuliah/LungCapDa

##
## -- Column specification --------------------------------------------------------
## cols(
## Age = col_double(),
## LungCap = col_double(),
## Height = col_double(),
## SqrHeight = col_double(),
## HeightT = col_double(),
## SqrHeightT = col_double(),
## SqrAge = col_double(),
## AgeT = col_double(),
## SqrAgeT = col_double(),
## AgeHeight = col_double(),
## AgeTHeightT = col_double()
## )

# Split height into two groups. By default cut() builds right-closed
# intervals, so these breaks give A = (0, 60] and B = (60, 75].
CatHeight <- cut(LungCap$Height, breaks = c(0, 60, 75), labels = c("A", "B"))
# For left-closed intervals A = [0, 60) and B = [60, 75), set right = FALSE
# (spelled out: the shorthand F is an ordinary variable that can be reassigned).
CatHeight <- cut(LungCap$Height, breaks = c(0, 60, 75), labels = c("A", "B"),
                 right = FALSE)
# Alternatively, breaks = 2 asks cut() for two equal-width intervals spanning
# the observed range of Height instead of a fixed break at 60. This last
# assignment is the one that ends up in LungCapNew.
CatHeight <- cut(LungCap$Height, breaks = 2, labels = c("A", "B"),
                 right = FALSE)
# Append the grouping factor to the data and inspect the structure.
LungCapNew <- data.frame(LungCap, CatHeight)
str(LungCapNew)

## ’data.frame’: 654 obs. of 12 variables:


## $ Age : num 9 8 7 9 9 8 6 6 8 9 ...
## $ LungCap : num 3.12 3.17 3.16 2.67 3.68 ...
## $ Height : num 57 67.5 54.5 53 57 61 58 56 58.5 60 ...
## $ SqrHeight : num 3249 4556 2970 2809 3249 ...
## $ HeightT : num -4.14 6.36 -6.64 -8.14 -4.14 ...
## $ SqrHeightT : num 17.2 40.4 44.1 66.3 17.2 ...
## $ SqrAge : num 81 64 49 81 81 64 36 36 64 81 ...
## $ AgeT : num -0.931 -1.931 -2.931 -0.931 -0.931 ...
## $ SqrAgeT : num 0.867 3.73 8.592 0.867 0.867 ...
## $ AgeHeight : num 513 540 382 477 513 ...
## $ AgeTHeightT: num 3.86 -12.28 19.47 7.58 3.86 ...
## $ CatHeight : Factor w/ 2 levels "A","B": 1 2 1 1 1 2 1 1 1 2 ...

1
# menghitung rata-rata kapasitas paru-paru untuk subyek pada kelompok tinggi A
mean(LungCapNew$LungCap[LungCapNew$CatHeight=="A"])

## [1] 3.576216

# menghitung rata-rata kapasitas paru-paru untuk subyek pada kelompok tinggi B


mean(LungCapNew$LungCap[LungCapNew$CatHeight=="B"])

## [1] 7.308531

model1<-lm(LungCap~CatHeight,data=LungCapNew)
summary(model1) #reference category adalah A

##
## Call:
## lm(formula = LungCap ~ CatHeight, data = LungCapNew)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.2265 -1.2760 -0.2379 0.8998 8.0705
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.5762 0.1196 29.91 <2e-16 ***
## CatHeightB 3.7323 0.1512 24.68 <2e-16 ***
## ---
## Signif. codes: 0 ’***’ 0.001 ’**’ 0.01 ’*’ 0.05 ’.’ 0.1 ’ ’ 1
##
## Residual standard error: 1.872 on 652 degrees of freedom
## Multiple R-squared: 0.4831, Adjusted R-squared: 0.4823
## F-statistic: 609.3 on 1 and 652 DF, p-value: < 2.2e-16

# Make B the reference category by listing it as the first factor level,
# then refit the same one-factor model.
LungCapNew$CatHeight <- factor(LungCapNew$CatHeight, levels = c("B", "A"))
model2 <- lm(LungCap ~ CatHeight, data = LungCapNew)
summary(model2)

##
## Call:
## lm(formula = LungCap ~ CatHeight, data = LungCapNew)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.2265 -1.2760 -0.2379 0.8998 8.0705
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.30853 0.09255 78.97 <2e-16 ***
## CatHeightA -3.73231 0.15120 -24.68 <2e-16 ***
## ---
## Signif. codes: 0 ’***’ 0.001 ’**’ 0.01 ’*’ 0.05 ’.’ 0.1 ’ ’ 1

2
##
## Residual standard error: 1.872 on 652 degrees of freedom
## Multiple R-squared: 0.4831, Adjusted R-squared: 0.4823
## F-statistic: 609.3 on 1 and 652 DF, p-value: < 2.2e-16

#baik A sebagai reference category maupun B sebagai reference category,


#nilai dugaan LungCap untuk masing-masing kategori akan bernilai sama
library(ggplot2)
library(ggthemes)

## Warning: package ’ggthemes’ was built under R version 4.0.5

# Scatter of LungCap by height group; group = 1 puts all points in a single
# group so geom_smooth() fits one lm line across the two categories.
ggplot(LungCapNew, aes(x = CatHeight, y = LungCap, group = 1)) +
  geom_point() +
  geom_smooth(method = lm, se = FALSE) +
  theme_wsj()

## ‘geom_smooth()‘ using formula ’y ~ x’

15

10

0
B A

# Same scatter plot drawn with the classic theme.
# NOTE(review): group = 1 is not set here, so the smoother is computed per
# x category rather than across both - confirm a fitted line is expected.
ggplot(LungCapNew, aes(x = CatHeight, y = LungCap)) +
  geom_point() +
  geom_smooth(method = lm, se = FALSE) +
  theme_classic()

## ‘geom_smooth()‘ using formula ’y ~ x’

3
15

10
LungCap

0
B A
CatHeight

# Replace the factor by its integer level codes. The levels were reordered to
# c("B", "A") earlier in the script, so B becomes 1 and A becomes 2.
LungCapNew[["CatHeight"]] <- as.numeric(LungCapNew[["CatHeight"]])
str(LungCapNew)

## ’data.frame’: 654 obs. of 12 variables:


## $ Age : num 9 8 7 9 9 8 6 6 8 9 ...
## $ LungCap : num 3.12 3.17 3.16 2.67 3.68 ...
## $ Height : num 57 67.5 54.5 53 57 61 58 56 58.5 60 ...
## $ SqrHeight : num 3249 4556 2970 2809 3249 ...
## $ HeightT : num -4.14 6.36 -6.64 -8.14 -4.14 ...
## $ SqrHeightT : num 17.2 40.4 44.1 66.3 17.2 ...
## $ SqrAge : num 81 64 49 81 81 64 36 36 64 81 ...
## $ AgeT : num -0.931 -1.931 -2.931 -0.931 -0.931 ...
## $ SqrAgeT : num 0.867 3.73 8.592 0.867 0.867 ...
## $ AgeHeight : num 513 540 382 477 513 ...
## $ AgeTHeightT: num 3.86 -12.28 19.47 7.58 3.86 ...
## $ CatHeight : num 2 1 2 2 2 1 2 2 2 1 ...

# Simple regression of LungCap on Age (this overwrites the earlier model1).
model1 <- lm(LungCap ~ Age, data = LungCapNew)
# CatHeight holds the numeric codes 1 (B) and 2 (A); adding 1 maps them to
# colour/symbol 2 for B and 3 for A, matching the legend below.
plot(LungCap ~ Age, data = LungCapNew,
     col = CatHeight + 1, pch = CatHeight + 1, cex = 0.5)
abline(model1, col = "grey", lwd = 3)
legend("topright", legend = c("B", "A"), col = c(2, 3), pch = c(2, 3))

4
15

B
A
10
LungCap

5
0

5 10 15

Age

# Parallel-lines model: a common Age slope with an intercept shift by height
# group. CatHeight is numeric at this point (1 = B, 2 = A), so the fitted
# intercept of a group is b0 + b_CatHeight * code; b0 on its own would belong
# to the non-existent code 0 and must not be used as a group intercept.
model2 <- lm(LungCap ~ Age + CatHeight, data = LungCapNew)
height_B <- coef(model2)[["(Intercept)"]] + 1 * coef(model2)[["CatHeight"]]
height_A <- coef(model2)[["(Intercept)"]] + 2 * coef(model2)[["CatHeight"]]
# Both groups share the single Age coefficient.
slope_A <- coef(model2)[["Age"]]
slope_B <- coef(model2)[["Age"]]
plot(LungCap ~ Age, data = LungCapNew,
     col = CatHeight + 1, pch = CatHeight + 1, cex = 0.5)
legend("topright", c("Height B", "Height A"), col = c(2, 3), pch = c(2, 3))
# Line colours follow the point colours: 2 for B, 3 for A.
abline(height_B, slope_B, col = 2, lty = 1, lwd = 2) # fitted line, Height B
abline(height_A, slope_A, col = 3, lty = 2, lwd = 2) # fitted line, Height A

5
15

Height B
Height A
10
LungCap

5
0

5 10 15

Age

Contoh 2. Berikut ini akan diberikan contoh analisis hubungan berat bayi (Wgt), status merokok ibu
(Smoke), dan jangka waktu kehamilan dalam satuan minggu (Gest). Perhatikan tipe data dari variabel
kategorik dalam data.

birthsmokers <- read.table("E:\\KUMPULAN BAHAN KUL UNY\\Bahan Kuliah Pengantar Analisi Regresi\\PPT Kuli
attach(birthsmokers)
str(birthsmokers)

## ’data.frame’: 32 obs. of 3 variables:


## $ Wgt : int 2940 3130 2420 2450 2760 2440 3226 3301 2729 3410 ...
## $ Gest : int 38 38 36 34 39 35 40 42 37 40 ...
## $ Smoke: chr "yes" "no" "yes" "no" ...

# Fit birth weight on gestation length plus smoking status. No data argument
# is given: Wgt, Gest and Smoke are found through the earlier
# attach(birthsmokers).
model <- lm(Wgt ~ Gest + Smoke)


# Scatter plot coloured by smoking status (red = "yes", blue = "no").
# The two lines() calls passed as panel.last draw the fitted values of each
# group, ordered by Gest, in the matching colour.
plot(x=Gest, y=Wgt, ylim=c(2300, 3700),
col=ifelse(Smoke=="yes", "red", "blue"),
panel.last = c(lines(sort(Gest[Smoke=="no"]),
fitted(model)[Smoke=="no"][order(Gest[Smoke=="no"])],
col="blue"),
lines(sort(Gest[Smoke=="yes"]),
fitted(model)[Smoke=="yes"][order(Gest[Smoke=="yes"])],
col="red")))

6
3600
3200
Wgt

2800
2400

34 36 38 40 42

Gest

summary(model)

##
## Call:
## lm(formula = Wgt ~ Gest + Smoke)
##
## Residuals:
## Min 1Q Median 3Q Max
## -223.693 -92.063 -9.365 79.663 197.507
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2389.573 349.206 -6.843 1.63e-07 ***
## Gest 143.100 9.128 15.677 1.07e-15 ***
## Smokeyes -244.544 41.982 -5.825 2.58e-06 ***
## ---
## Signif. codes: 0 ’***’ 0.001 ’**’ 0.01 ’*’ 0.05 ’.’ 0.1 ’ ’ 1
##
## Residual standard error: 115.5 on 29 degrees of freedom
## Multiple R-squared: 0.8964, Adjusted R-squared: 0.8892
## F-statistic: 125.4 on 2 and 29 DF, p-value: 5.289e-15

confint(model)

## 2.5 % 97.5 %

7
## (Intercept) -3103.7795 -1675.3663
## Gest 124.4312 161.7694
## Smokeyes -330.4064 -158.6817

predict(model, interval="confidence",
newdata=data.frame(Gest=c(38, 38), Smoke=c("yes", "no")))

## fit lwr upr


## 1 2803.693 2740.599 2866.788
## 2 3048.237 2989.120 3107.355

#Ulangi proses analisis hanya untuk subyek yang tidak merokok saja
model.0 <- lm(Wgt ~ Gest, subset=Smoke=="no")
summary(model.0)

##
## Call:
## lm(formula = Wgt ~ Gest, subset = Smoke == "no")
##
## Residuals:
## Min 1Q Median 3Q Max
## -171.52 -101.59 23.28 83.63 139.48
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2546.14 457.29 -5.568 6.93e-05 ***
## Gest 147.21 11.97 12.294 6.85e-09 ***
## ---
## Signif. codes: 0 ’***’ 0.001 ’**’ 0.01 ’*’ 0.05 ’.’ 0.1 ’ ’ 1
##
## Residual standard error: 106.9 on 14 degrees of freedom
## Multiple R-squared: 0.9152, Adjusted R-squared: 0.9092
## F-statistic: 151.1 on 1 and 14 DF, p-value: 6.852e-09

predict(model.0, interval="confidence",
newdata=data.frame(Gest=38))

## fit lwr upr


## 1 3047.724 2990.298 3105.15

#Ulangi proses analisis hanya untuk subyek yang merokok saja


model.1 <- lm(Wgt ~ Gest, subset=Smoke=="yes")
summary(model.1)

##
## Call:
## lm(formula = Wgt ~ Gest, subset = Smoke == "yes")
##
## Residuals:
## Min 1Q Median 3Q Max
## -228.53 -64.86 -19.10 93.89 184.53

8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2474.56 553.97 -4.467 0.000532 ***
## Gest 139.03 14.11 9.851 1.12e-07 ***
## ---
## Signif. codes: 0 ’***’ 0.001 ’**’ 0.01 ’*’ 0.05 ’.’ 0.1 ’ ’ 1
##
## Residual standard error: 126.6 on 14 degrees of freedom
## Multiple R-squared: 0.8739, Adjusted R-squared: 0.8649
## F-statistic: 97.04 on 1 and 14 DF, p-value: 1.125e-07

predict(model.1, interval="confidence",
newdata=data.frame(Gest=38))

## fit lwr upr


## 1 2808.528 2731.726 2885.331

#Ulangi proses analisis dengan menggunakan kode 1 dan -1


Smoke2 <- ifelse(Smoke=="yes", 1, -1)
model.3 <- lm(Wgt ~ Gest + Smoke2)
summary(model.3)

##
## Call:
## lm(formula = Wgt ~ Gest + Smoke2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -223.693 -92.063 -9.365 79.663 197.507
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2511.845 353.449 -7.107 8.07e-08 ***
## Gest 143.100 9.128 15.677 1.07e-15 ***
## Smoke2 -122.272 20.991 -5.825 2.58e-06 ***
## ---
## Signif. codes: 0 ’***’ 0.001 ’**’ 0.01 ’*’ 0.05 ’.’ 0.1 ’ ’ 1
##
## Residual standard error: 115.5 on 29 degrees of freedom
## Multiple R-squared: 0.8964, Adjusted R-squared: 0.8892
## F-statistic: 125.4 on 2 and 29 DF, p-value: 5.289e-15

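#atau gunakan contr.sum; level pertama ("no") diberi kode 1 dan "yes" diberi kode -1, sehingga tanda koefisien berlawanan dengan Smoke2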


model.3 <- lm(Wgt ~ Gest + Smoke, contrasts=list(Smoke="contr.sum"))
summary(model.3)

##
## Call:
## lm(formula = Wgt ~ Gest + Smoke, contrasts = list(Smoke = "contr.sum"))
##
## Residuals:

9
## Min 1Q Median 3Q Max
## -223.693 -92.063 -9.365 79.663 197.507
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2511.845 353.449 -7.107 8.07e-08 ***
## Gest 143.100 9.128 15.677 1.07e-15 ***
## Smoke1 122.272 20.991 5.825 2.58e-06 ***
## ---
## Signif. codes: 0 ’***’ 0.001 ’**’ 0.01 ’*’ 0.05 ’.’ 0.1 ’ ’ 1
##
## Residual standard error: 115.5 on 29 degrees of freedom
## Multiple R-squared: 0.8964, Adjusted R-squared: 0.8892
## F-statistic: 125.4 on 2 and 29 DF, p-value: 5.289e-15

10

Anda mungkin juga menyukai