Anda di halaman 1dari 30

3/25/2018 Assignment3

Assignment3
Sujit
March 20, 2018

1) Identify the percentage of missing values in each column and display it
# Share of missing values in every column of `hr`, as a percentage of its rows
na_flags <- is.na(hr)
percent_NA <- colSums(na_flags) / nrow(hr) * 100
percent_NA

## Age Attrition BusinessTravel


## 0 0 0
## DailyRate Department DistanceFromHome
## 0 0 0
## Education EducationField EmployeeCount
## 0 0 0
## EmployeeNumber EnvironmentSatisfaction Gender
## 0 0 0
## HourlyRate JobInvolvement JobLevel
## 0 0 0
## JobRole JobSatisfaction MaritalStatus
## 0 0 0
## MonthlyIncome MonthlyRate NumCompaniesWorked
## 0 0 0
## Over18 OverTime PercentSalaryHike
## 0 0 0
## PerformanceRating RelationshipSatisfaction StandardHours
## 0 0 0
## StockOptionLevel TotalWorkingYears TrainingTimesLastYear
## 0 0 0
## WorkLifeBalance YearsAtCompany YearsInCurrentRole
## 0 0 0
## YearsSinceLastPromotion YearsWithCurrManager
## 0 0

2) Create a function which identifies the percentage of outliers in a numerical column and use the function to display the percentage of outliers in all numerical columns

file:///C:/Users/Administrator/Documents/Assignment3eda.html 1/30
3/25/2018 Assignment3

# Percentage of outliers in every numeric column of a data frame.
#
# A value counts as an outlier when boxplot.stats() reports it in `$out`.
# The percentage is taken over `$n`, the number of non-missing values that
# boxplot.stats() reports for the column.
#
# x: a data frame.
# Returns a named numeric vector with one entry per numeric column
# (a zero-length numeric vector when `x` has no numeric column).
out_percent <- function(x) {
  num_col <- names(x)[vapply(x, is.numeric, logical(1))]
  # x[num_col] stays a data frame even when a single column is numeric;
  # x[, num_col] would drop to a bare vector and be processed value by value.
  vapply(x[num_col], function(column) {
    stats <- boxplot.stats(column)
    length(stats$out) / stats$n * 100
  }, numeric(1))
}

out_percent(hr)

## Age Attrition DailyRate


## 0.000000 16.122449 0.000000
## DistanceFromHome Education EmployeeCount
## 0.000000 0.000000 0.000000
## EmployeeNumber EnvironmentSatisfaction HourlyRate
## 0.000000 0.000000 0.000000
## JobInvolvement JobLevel JobSatisfaction
## 0.000000 0.000000 0.000000
## MonthlyIncome MonthlyRate NumCompaniesWorked
## 7.755102 0.000000 3.537415
## PercentSalaryHike PerformanceRating RelationshipSatisfaction
## 0.000000 15.374150 0.000000
## StandardHours StockOptionLevel TotalWorkingYears
## 0.000000 5.782313 4.285714
## TrainingTimesLastYear WorkLifeBalance YearsAtCompany
## 16.190476 0.000000 7.074830
## YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
## 1.428571 7.278912 0.952381

3) Apply univariate analysis on categorical columns
# Number of missing values in each categorical column
vapply(cat_col, function(column) sum(is.na(column)), integer(1))

## BusinessTravel Department EducationField Gender JobRole


## 0 0 0 0 0
## MaritalStatus Over18 OverTime
## 0 0 0

# Frequency of each unique level, one table per categorical column
freq <- lapply(cat_col, table)
freq

file:///C:/Users/Administrator/Documents/Assignment3eda.html 2/30
3/25/2018 Assignment3

## $BusinessTravel
##
## Non-Travel Travel_Frequently Travel_Rarely
## 150 277 1043
##
## $Department
##
## Human Resources Research & Development Sales
## 63 961 446
##
## $EducationField
##
## Human Resources Life Sciences Marketing Medical
## 27 606 159 464
## Other Technical Degree
## 82 132
##
## $Gender
##
## Female Male
## 588 882
##
## $JobRole
##
## Healthcare Representative Human Resources
## 131 52
## Laboratory Technician Manager
## 259 102
## Manufacturing Director Research Director
## 145 80
## Research Scientist Sales Executive
## 292 326
## Sales Representative
## 83
##
## $MaritalStatus
##
## Divorced Married Single
## 327 673 470
##
## $Over18
##
## Y
## 1470
##
## $OverTime
##
## No Yes
## 1054 416

file:///C:/Users/Administrator/Documents/Assignment3eda.html 3/30
3/25/2018 Assignment3

# Percentage of occurrences of each level, relative to the number of rows.
# lapply() always returns one table per column; sapply() would collapse the
# result into a matrix whenever every column has the same number of levels,
# which would break any later per-column processing of `perc`.
perc <- lapply(freq, function(counts) {
  counts / nrow(cat_col) * 100
})
perc

file:///C:/Users/Administrator/Documents/Assignment3eda.html 4/30
3/25/2018 Assignment3

## $BusinessTravel
##
## Non-Travel Travel_Frequently Travel_Rarely
## 10.20408 18.84354 70.95238
##
## $Department
##
## Human Resources Research & Development Sales
## 4.285714 65.374150 30.340136
##
## $EducationField
##
## Human Resources Life Sciences Marketing Medical
## 1.836735 41.224490 10.816327 31.564626
## Other Technical Degree
## 5.578231 8.979592
##
## $Gender
##
## Female Male
## 40 60
##
## $JobRole
##
## Healthcare Representative Human Resources
## 8.911565 3.537415
## Laboratory Technician Manager
## 17.619048 6.938776
## Manufacturing Director Research Director
## 9.863946 5.442177
## Research Scientist Sales Executive
## 19.863946 22.176871
## Sales Representative
## 5.646259
##
## $MaritalStatus
##
## Divorced Married Single
## 22.24490 45.78231 31.97279
##
## $Over18
##
## Y
## 100
##
## $OverTime
##
## No Yes
## 71.70068 28.29932

file:///C:/Users/Administrator/Documents/Assignment3eda.html 5/30
3/25/2018 Assignment3

# Groups together contribute to 80 percent
# For each column: how many of its most frequent levels are needed for the
# cumulative percentage to reach 80. If the cumulative percentage never
# reaches 80, the result is one more than the number of sorted levels.
percent_80 <- vapply(perc, function(shares) {
  cum_tot <- cumsum(sort(shares, decreasing = TRUE))
  # Position of the first cumulative total at or above the 80 % mark
  match(TRUE, cum_tot >= 80, nomatch = length(cum_tot) + 1L)
}, integer(1))
percent_80

## BusinessTravel Department EducationField Gender JobRole


## 2 2 3 2 6
## MaritalStatus Over18 OverTime
## 3 1 2

# Bar graph of each column; the values returned by plot() are discarded
invisible(lapply(cat_col, plot, col = "skyblue"))

file:///C:/Users/Administrator/Documents/Assignment3eda.html 6/30
3/25/2018 Assignment3

file:///C:/Users/Administrator/Documents/Assignment3eda.html 7/30
3/25/2018 Assignment3

file:///C:/Users/Administrator/Documents/Assignment3eda.html 8/30
3/25/2018 Assignment3

file:///C:/Users/Administrator/Documents/Assignment3eda.html 9/30
3/25/2018 Assignment3

4) Apply bivariate analysis such as correlation analysis, crosstab analysis and segmented analysis, wherever necessary
file:///C:/Users/Administrator/Documents/Assignment3eda.html 10/30
3/25/2018 Assignment3

and share your inferences

# Correlation Analysis

# Pairwise correlations between all numeric columns of `hr`
corr <- cor(hr[vapply(hr, is.numeric, logical(1))])
corrplot(corr, type = "upper", method = "circle") # Plot correlation matrix

# Keep every pair once: blank the lower triangle together with the diagonal
# (each variable against itself), leaving only the strict upper triangle.
corr[lower.tri(corr, diag = TRUE)] <- 0

# Row/column positions of the pairs whose absolute correlation exceeds 0.7
index <- which(abs(corr) > 0.7, arr.ind = TRUE)

# Labels are looked up in `corr` itself for both members of the pair, so the
# table does not depend on the dimnames attached to the which() result.
df <- data.frame(
  rowname = rownames(corr)[index[, 1]],
  colname = colnames(corr)[index[, 2]],
  cor_value = corr[index]
)
df

## rowname colname cor_value


## 1 JobLevel MonthlyIncome 0.9502999
## 2 PercentSalaryHike PerformanceRating 0.7735500
## 3 JobLevel TotalWorkingYears 0.7822078
## 4 MonthlyIncome TotalWorkingYears 0.7728932
## 5 YearsAtCompany YearsInCurrentRole 0.7587537
## 6 YearsAtCompany YearsWithCurrManager 0.7692124
## 7 YearsInCurrentRole YearsWithCurrManager 0.7143648

file:///C:/Users/Administrator/Documents/Assignment3eda.html 11/30
3/25/2018 Assignment3

# From corrplot and correlation value dataframe it can be inferred:


# Columns "JobLevel" and "MonthlyIncome" have high positive correlation

# Crosstab Analysis

# Average monthly income for every Gender x MaritalStatus combination
crosstab <- hr %>%
  group_by(Gender, MaritalStatus) %>%
  summarise(Avg_Monthly_Income = mean(MonthlyIncome, na.rm = TRUE))
crosstab <- as.data.frame(crosstab)
names(crosstab) <- c("Gender", "MaritalStatus", "Avg_Monthly_Income")

# Wide layout: one row per Gender, one column per MaritalStatus
cross_table <- cast(
  crosstab,
  Gender ~ MaritalStatus,
  value = "Avg_Monthly_Income"
)
cross_table

## Gender Divorced Married Single


## 1 Female 6769.325 7156.971 5994.945
## 2 Male 6795.738 6547.244 5812.022

# Heat map of the average monthly income per Gender / MaritalStatus cell;
# cells without a value are drawn in white.
d <- ggplot(crosstab, aes(Gender, MaritalStatus)) +
  geom_tile(aes(fill = Avg_Monthly_Income)) +
  scale_fill_gradient(low = "grey", high = "black", na.value = "white") +
  theme_classic()
plot(d)

file:///C:/Users/Administrator/Documents/Assignment3eda.html 12/30
3/25/2018 Assignment3

# Average monthly income for every Department x EducationField combination
crosstab2 <- hr %>%
  group_by(Department, EducationField) %>%
  summarise(Avg_Monthly_Income = mean(MonthlyIncome, na.rm = TRUE))

# Wide layout: one row per Department, one column per EducationField
cross_table1 <- cast(
  crosstab2,
  Department ~ EducationField,
  value = "Avg_Monthly_Income"
)
cross_table1

## Department Human Resources Life Sciences Marketing Medical


## 1 Human Resources 7241.148 6914.062 NA 6594.077
## 2 Research & Development NA 6179.984 NA 6539.223
## 3 Sales NA 7246.233 7348.585 6377.227
## Other Technical Degree
## 1 5016.667 3081.250
## 2 6278.688 5760.819
## 3 5398.733 6066.294

# Heat map of the average monthly income per Department / EducationField
# cell, with white tile borders; cells without a value are drawn in white.
d1 <- ggplot(crosstab2, aes(Department, EducationField)) +
  geom_tile(aes(fill = Avg_Monthly_Income), colour = "white", size = 1) +
  scale_fill_gradient(low = "steelblue", high = "blue", na.value = "white") +
  theme_classic()
plot(d1)

file:///C:/Users/Administrator/Documents/Assignment3eda.html 13/30
3/25/2018 Assignment3

# Segmented Analysis

# For every categorical column with at least two levels, run a one-way ANOVA
# of each numeric column against it and split the numeric columns by whether
# the p-value of the grouping term is below 0.05 ("influenced") or not.
for (i in seq_along(cat_col)) {
  grouping <- cat_col[[i]]

  # Reset for every categorical column, so that a column skipped below
  # (fewer than two levels) cannot report the previous column's results.
  not_influenced <- NULL
  influenced <- NULL

  if (length(levels(grouping)) >= 2) {
    # p-value of the grouping term, one per numeric column
    p_values <- vapply(num_col, function(response) {
      anov <- summary(aov(response ~ grouping))
      anov[[1]][1, 5]
    }, numeric(1))

    # A missing p-value is treated as "no influence shown"
    significant <- !is.na(p_values) & p_values < 0.05
    if (any(significant)) {
      influenced <- names(num_col)[significant]
    }
    if (any(!significant)) {
      not_influenced <- names(num_col)[!significant]
    }
  }

  cat("\n")
  cat("Numerical columns influenced or not influenced by", names(cat_col)[i])
  cat("\n")
  print(list(not_influenced = not_influenced, influenced = influenced))
  cat("\n")
}

file:///C:/Users/Administrator/Documents/Assignment3eda.html 14/30
3/25/2018 Assignment3

##
## Numerical columns influenced or not influenced by BusinessTravel
## $not_influenced
## [1] "Age" "DailyRate"
## [3] "DistanceFromHome" "Education"
## [5] "EmployeeCount" "EmployeeNumber"
## [7] "EnvironmentSatisfaction" "HourlyRate"
## [9] "JobInvolvement" "JobLevel"
## [11] "JobSatisfaction" "MonthlyIncome"
## [13] "MonthlyRate" "NumCompaniesWorked"
## [15] "PercentSalaryHike" "PerformanceRating"
## [17] "RelationshipSatisfaction" "StandardHours"
## [19] "StockOptionLevel" "TotalWorkingYears"
## [21] "TrainingTimesLastYear" "WorkLifeBalance"
## [23] "YearsAtCompany" "YearsInCurrentRole"
## [25] "YearsSinceLastPromotion" "YearsWithCurrManager"
##
## $influenced
## [1] "Attrition"
##
##
##
## Numerical columns influenced or not influenced by Department
## $not_influenced
## [1] "Age" "DailyRate"
## [3] "DistanceFromHome" "Education"
## [5] "EmployeeCount" "EnvironmentSatisfaction"
## [7] "HourlyRate" "JobInvolvement"
## [9] "JobSatisfaction" "MonthlyRate"
## [11] "NumCompaniesWorked" "PercentSalaryHike"
## [13] "PerformanceRating" "RelationshipSatisfaction"
## [15] "StandardHours" "StockOptionLevel"
## [17] "TotalWorkingYears" "TrainingTimesLastYear"
## [19] "YearsAtCompany" "YearsInCurrentRole"
## [21] "YearsSinceLastPromotion" "YearsWithCurrManager"
##
## $influenced
## [1] "Attrition" "EmployeeNumber" "JobLevel" "MonthlyIncome"
## [5] "WorkLifeBalance"
##
##
##
## Numerical columns influenced or not influenced by EducationField
## $not_influenced
## [1] "Age" "DailyRate"
## [3] "DistanceFromHome" "EmployeeCount"
## [5] "EmployeeNumber" "EnvironmentSatisfaction"
## [7] "HourlyRate" "JobInvolvement"
## [9] "JobSatisfaction" "MonthlyIncome"
## [11] "MonthlyRate" "NumCompaniesWorked"
## [13] "PercentSalaryHike" "PerformanceRating"
## [15] "RelationshipSatisfaction" "StandardHours"
## [17] "StockOptionLevel" "TotalWorkingYears"

file:///C:/Users/Administrator/Documents/Assignment3eda.html 15/30
3/25/2018 Assignment3

## [19] "TrainingTimesLastYear" "WorkLifeBalance"


## [21] "YearsAtCompany" "YearsInCurrentRole"
## [23] "YearsSinceLastPromotion" "YearsWithCurrManager"
##
## $influenced
## [1] "Attrition" "Education" "JobLevel"
##
##
##
## Numerical columns influenced or not influenced by Gender
## $not_influenced
## [1] "Age" "Attrition"
## [3] "DailyRate" "DistanceFromHome"
## [5] "Education" "EmployeeCount"
## [7] "EmployeeNumber" "EnvironmentSatisfaction"
## [9] "HourlyRate" "JobInvolvement"
## [11] "JobLevel" "JobSatisfaction"
## [13] "MonthlyIncome" "MonthlyRate"
## [15] "NumCompaniesWorked" "PercentSalaryHike"
## [17] "PerformanceRating" "RelationshipSatisfaction"
## [19] "StandardHours" "StockOptionLevel"
## [21] "TotalWorkingYears" "TrainingTimesLastYear"
## [23] "WorkLifeBalance" "YearsAtCompany"
## [25] "YearsInCurrentRole" "YearsSinceLastPromotion"
## [27] "YearsWithCurrManager"
##
## $influenced
## NULL
##
##
##
## Numerical columns influenced or not influenced by JobRole
## $not_influenced
## [1] "DailyRate" "DistanceFromHome"
## [3] "EmployeeCount" "EmployeeNumber"
## [5] "EnvironmentSatisfaction" "HourlyRate"
## [7] "JobInvolvement" "JobSatisfaction"
## [9] "MonthlyRate" "PercentSalaryHike"
## [11] "PerformanceRating" "RelationshipSatisfaction"
## [13] "StandardHours" "StockOptionLevel"
## [15] "TrainingTimesLastYear" "WorkLifeBalance"
##
## $influenced
## [1] "Age" "Attrition"
## [3] "Education" "JobLevel"
## [5] "MonthlyIncome" "NumCompaniesWorked"
## [7] "TotalWorkingYears" "YearsAtCompany"
## [9] "YearsInCurrentRole" "YearsSinceLastPromotion"
## [11] "YearsWithCurrManager"
##
##
##
## Numerical columns influenced or not influenced by MaritalStatus
## $not_influenced

file:///C:/Users/Administrator/Documents/Assignment3eda.html 16/30
3/25/2018 Assignment3

## [1] "DistanceFromHome" "Education"


## [3] "EmployeeCount" "EmployeeNumber"
## [5] "EnvironmentSatisfaction" "HourlyRate"
## [7] "JobInvolvement" "JobSatisfaction"
## [9] "MonthlyRate" "NumCompaniesWorked"
## [11] "PercentSalaryHike" "PerformanceRating"
## [13] "RelationshipSatisfaction" "StandardHours"
## [15] "TrainingTimesLastYear" "WorkLifeBalance"
## [17] "YearsSinceLastPromotion" "YearsWithCurrManager"
##
## $influenced
## [1] "Age" "Attrition" "DailyRate"
## [4] "JobLevel" "MonthlyIncome" "StockOptionLevel"
## [7] "TotalWorkingYears" "YearsAtCompany" "YearsInCurrentRole"
##
##
##
## Numerical columns influenced or not influenced by Over18
## $not_influenced
## [1] "DistanceFromHome" "Education"
## [3] "EmployeeCount" "EmployeeNumber"
## [5] "EnvironmentSatisfaction" "HourlyRate"
## [7] "JobInvolvement" "JobSatisfaction"
## [9] "MonthlyRate" "NumCompaniesWorked"
## [11] "PercentSalaryHike" "PerformanceRating"
## [13] "RelationshipSatisfaction" "StandardHours"
## [15] "TrainingTimesLastYear" "WorkLifeBalance"
## [17] "YearsSinceLastPromotion" "YearsWithCurrManager"
##
## $influenced
## [1] "Age" "Attrition" "DailyRate"
## [4] "JobLevel" "MonthlyIncome" "StockOptionLevel"
## [7] "TotalWorkingYears" "YearsAtCompany" "YearsInCurrentRole"
##
##
##
## Numerical columns influenced or not influenced by OverTime
## $not_influenced
## [1] "Age" "DailyRate"
## [3] "DistanceFromHome" "Education"
## [5] "EmployeeCount" "EmployeeNumber"
## [7] "HourlyRate" "JobInvolvement"
## [9] "JobLevel" "JobSatisfaction"
## [11] "MonthlyIncome" "MonthlyRate"
## [13] "NumCompaniesWorked" "PercentSalaryHike"
## [15] "PerformanceRating" "RelationshipSatisfaction"
## [17] "StandardHours" "StockOptionLevel"
## [19] "TotalWorkingYears" "WorkLifeBalance"
## [21] "YearsAtCompany" "YearsInCurrentRole"
## [23] "YearsSinceLastPromotion" "YearsWithCurrManager"
##
## $influenced
## [1] "Attrition" "EnvironmentSatisfaction"
## [3] "TrainingTimesLastYear"

file:///C:/Users/Administrator/Documents/Assignment3eda.html 17/30
3/25/2018 Assignment3

# Employee count per marital status, with the bars shaded by the negated
# average monthly income of the group.
# NOTE(review): the negation presumably flips the default colour gradient so
# that higher incomes appear darker - confirm against the rendered plot.
hr %>%
  group_by(MaritalStatus) %>%
  summarise(Avg_Income = mean(MonthlyIncome, na.rm = TRUE), count = n()) %>%
  ggplot(aes(MaritalStatus, count, fill = -Avg_Income)) +
  geom_bar(stat = "identity")

# It can be concluded from the graph too that marital status influences
# Avg_Income, with Married and Divorced people having a greater average
# salary compared to singles

5) Include more variables in charts like scatter plots, multiple box plots, stacked bars etc. to identify better insights
# Scatter Plots

# Age against monthly income, coloured by gender, with a loess smoother
ggplot(hr, aes(x = Age, y = MonthlyIncome, color = Gender)) +
  geom_point() +
  geom_smooth(method = "loess")

file:///C:/Users/Administrator/Documents/Assignment3eda.html 18/30
3/25/2018 Assignment3

# Age against monthly income, coloured by marital status, with a loess smoother
ggplot(hr, aes(x = Age, y = MonthlyIncome, color = MaritalStatus)) +
  geom_point() +
  geom_smooth(method = "loess")

file:///C:/Users/Administrator/Documents/Assignment3eda.html 19/30
3/25/2018 Assignment3

# Number of companies worked at against monthly rate, coloured by job role,
# with a loess smoother
hr %>%
  ggplot(aes(x = NumCompaniesWorked, y = MonthlyRate, color = JobRole)) +
  geom_point() +
  geom_smooth(method = "loess")

file:///C:/Users/Administrator/Documents/Assignment3eda.html 20/30
3/25/2018 Assignment3

# Percent salary hike against monthly income, coloured by education field,
# with a loess smoother
hr %>%
  ggplot(aes(x = PercentSalaryHike, y = MonthlyIncome, color = EducationField)) +
  geom_point() +
  geom_smooth(method = "loess")

file:///C:/Users/Administrator/Documents/Assignment3eda.html 21/30
3/25/2018 Assignment3

# Multiple BoxPlots

# Distribution of monthly income within each business-travel category
ggplot(hr, aes(x = BusinessTravel, y = MonthlyIncome)) +
  geom_boxplot()

file:///C:/Users/Administrator/Documents/Assignment3eda.html 22/30
3/25/2018 Assignment3

# Distribution of years with the current manager for each gender
ggplot(hr, aes(x = Gender, y = YearsWithCurrManager)) +
  geom_boxplot()

file:///C:/Users/Administrator/Documents/Assignment3eda.html 23/30
3/25/2018 Assignment3

# Distribution of years at the company for each job role
ggplot(hr, aes(x = JobRole, y = YearsAtCompany)) +
  geom_boxplot()

file:///C:/Users/Administrator/Documents/Assignment3eda.html 24/30
3/25/2018 Assignment3

# Distribution of monthly income for each job role
ggplot(hr, aes(x = JobRole, y = MonthlyIncome)) +
  geom_boxplot()

file:///C:/Users/Administrator/Documents/Assignment3eda.html 25/30
3/25/2018 Assignment3

# Stacked Bars Graph

# Employee counts per business-travel category, stacked by gender
ggplot(data = hr, mapping = aes(x = BusinessTravel, fill = Gender)) +
  geom_bar()

file:///C:/Users/Administrator/Documents/Assignment3eda.html 26/30
3/25/2018 Assignment3

# Employee counts per business-travel category, stacked by marital status
ggplot(data = hr, mapping = aes(x = BusinessTravel, fill = MaritalStatus)) +
  geom_bar()

file:///C:/Users/Administrator/Documents/Assignment3eda.html 27/30
3/25/2018 Assignment3

# Employee counts per department, stacked by education field
ggplot(data = hr, mapping = aes(x = Department, fill = EducationField)) +
  geom_bar()

file:///C:/Users/Administrator/Documents/Assignment3eda.html 28/30
3/25/2018 Assignment3

# Employee counts per marital status, stacked by overtime
ggplot(data = hr, mapping = aes(x = MaritalStatus, fill = OverTime)) +
  geom_bar()

file:///C:/Users/Administrator/Documents/Assignment3eda.html 29/30
3/25/2018 Assignment3

file:///C:/Users/Administrator/Documents/Assignment3eda.html 30/30

Anda mungkin juga menyukai