Assignment3
Sujit
March 20, 2018
file:///C:/Users/Administrator/Documents/Assignment3eda.html 1/30
3/25/2018 Assignment3
#' Percentage of boxplot outliers per numeric column
#'
#' Runs `boxplot.stats()` on every numeric column of `x` and reports the
#' number of values it flags as outliers as a percentage of the non-missing
#' observations (`n`) in that column.
#'
#' @param x A data frame.
#' @return A named numeric vector with one element per numeric column of `x`
#'   (length zero when `x` has no numeric columns).
out_percent <- function(x) {
  num_col <- names(x)[vapply(x, is.numeric, logical(1))]
  # Subset with `x[num_col]` rather than `x[, num_col]`: the latter collapses
  # to a bare vector when only one column is numeric, which would make the
  # statistics run once per value instead of once per column.
  vapply(
    x[num_col],
    function(column) {
      stats <- boxplot.stats(column)
      length(stats$out) / stats$n * 100
    },
    numeric(1)
  )
}
# Percentage of boxplot outliers in each numeric column of `hr`.
out_percent(hr)
file:///C:/Users/Administrator/Documents/Assignment3eda.html 2/30
3/25/2018 Assignment3
## $BusinessTravel
##
## Non-Travel Travel_Frequently Travel_Rarely
## 150 277 1043
##
## $Department
##
## Human Resources Research & Development Sales
## 63 961 446
##
## $EducationField
##
## Human Resources Life Sciences Marketing Medical
## 27 606 159 464
## Other Technical Degree
## 82 132
##
## $Gender
##
## Female Male
## 588 882
##
## $JobRole
##
## Healthcare Representative Human Resources
## 131 52
## Laboratory Technician Manager
## 259 102
## Manufacturing Director Research Director
## 145 80
## Research Scientist Sales Executive
## 292 326
## Sales Representative
## 83
##
## $MaritalStatus
##
## Divorced Married Single
## 327 673 470
##
## $Over18
##
## Y
## 1470
##
## $OverTime
##
## No Yes
## 1054 416
file:///C:/Users/Administrator/Documents/Assignment3eda.html 3/30
3/25/2018 Assignment3
# Percentage of occurrences
# Rescale every element of `freq` from counts to a percentage of the number
# of rows in `cat_col`.
perc <- sapply(
  freq,
  function(counts) {
    counts / nrow(cat_col) * 100
  }
)
perc
file:///C:/Users/Administrator/Documents/Assignment3eda.html 4/30
3/25/2018 Assignment3
## $BusinessTravel
##
## Non-Travel Travel_Frequently Travel_Rarely
## 10.20408 18.84354 70.95238
##
## $Department
##
## Human Resources Research & Development Sales
## 4.285714 65.374150 30.340136
##
## $EducationField
##
## Human Resources Life Sciences Marketing Medical
## 1.836735 41.224490 10.816327 31.564626
## Other Technical Degree
## 5.578231 8.979592
##
## $Gender
##
## Female Male
## 40 60
##
## $JobRole
##
## Healthcare Representative Human Resources
## 8.911565 3.537415
## Laboratory Technician Manager
## 17.619048 6.938776
## Manufacturing Director Research Director
## 9.863946 5.442177
## Research Scientist Sales Executive
## 19.863946 22.176871
## Sales Representative
## 5.646259
##
## $MaritalStatus
##
## Divorced Married Single
## 22.24490 45.78231 31.97279
##
## $Over18
##
## Y
## 100
##
## $OverTime
##
## No Yes
## 71.70068 28.29932
file:///C:/Users/Administrator/Documents/Assignment3eda.html 5/30
3/25/2018 Assignment3
# Draw one plot per column of `cat_col`; the return value of the plotting
# calls is not needed, so it is kept out of the output.
invisible(lapply(cat_col, plot, col = "skyblue"))
file:///C:/Users/Administrator/Documents/Assignment3eda.html 6/30
3/25/2018 Assignment3
file:///C:/Users/Administrator/Documents/Assignment3eda.html 7/30
3/25/2018 Assignment3
file:///C:/Users/Administrator/Documents/Assignment3eda.html 8/30
3/25/2018 Assignment3
file:///C:/Users/Administrator/Documents/Assignment3eda.html 9/30
3/25/2018 Assignment3
# 4)
Apply bivariate analysis, such as correlation analysis, crosstab analysis, and segmented analysis, wherever necessary.
file:///C:/Users/Administrator/Documents/Assignment3eda.html 10/30
3/25/2018 Assignment3
# Correlation Analysis
# Pairwise correlations between all numeric columns of `hr`.
corr <- cor(hr[vapply(hr, is.numeric, logical(1))])
corrplot(corr, type = "upper", method = "circle") # Plot correlation matrix
# Keep each pair only once: zero the lower triangle and subtract the identity
# matrix so the self-correlations on the diagonal drop out of the search.
corr[lower.tri(corr)] <- 0
corr <- corr - diag(nrow(corr))
# Row/column positions of the remaining entries with |correlation| > 0.7.
index <- which(abs(corr) > 0.7, arr.ind = TRUE)
# Look the names up from `corr` by position. `rownames(index)` is NULL when
# nothing exceeds the threshold, which would silently drop the `rowname`
# column from the result.
df <- data.frame(
  rowname = rownames(corr)[index[, 1]],
  colname = colnames(corr)[index[, 2]],
  cor_value = corr[index]
)
df
file:///C:/Users/Administrator/Documents/Assignment3eda.html 11/30
3/25/2018 Assignment3
# Crosstab Analysis
# Tile plot of `crosstab`: Gender on the x axis, MaritalStatus on the y axis,
# tiles shaded from grey to black by `Avg_Monthly_Income` (missing cells are
# drawn white).
d <- ggplot(crosstab, aes(Gender, MaritalStatus)) +
  geom_tile(aes(fill = Avg_Monthly_Income)) +
  scale_fill_gradient(low = "grey", high = "black", na.value = "white") +
  theme_classic()
plot(d)
file:///C:/Users/Administrator/Documents/Assignment3eda.html 12/30
3/25/2018 Assignment3
# Tile plot of `crosstab2`: Department on the x axis, EducationField on the
# y axis, tiles shaded from steelblue to blue by `Avg_Monthly_Income` and
# outlined in white (missing cells are drawn white).
d1 <- ggplot(crosstab2, aes(Department, EducationField)) +
  geom_tile(aes(fill = Avg_Monthly_Income), colour = "white", size = 1) +
  scale_fill_gradient(low = "steelblue", high = "blue", na.value = "white") +
  theme_classic()
plot(d1)
file:///C:/Users/Administrator/Documents/Assignment3eda.html 13/30
3/25/2018 Assignment3
# Segmented Analysis
# For every column of `cat_col` that has at least two levels, fit a one-way
# ANOVA of each column of `num_col` against it and sort the numeric columns
# into "influenced" (p-value < 0.05) and "not influenced" (p-value >= 0.05).
for (i in seq_along(cat_col)) {
  grps <- levels(cat_col[[i]])
  # Start from empty results for every categorical column. Without this, a
  # column with fewer than two levels would be reported with the results of
  # the previously processed column.
  not_influenced <- c()
  influenced <- c()
  if (length(grps) >= 2) {
    for (j in seq_along(num_col)) {
      anov <- summary(aov(num_col[[j]] ~ cat_col[[i]]))
      # Row 1, column 5 of the ANOVA table is the p-value of the grouping term.
      if (anov[[1]][1, 5] >= 0.05) {
        not_influenced <- append(not_influenced, names(num_col)[j])
      } else {
        influenced <- append(influenced, names(num_col)[j])
      }
    }
  }
  cat("\n")
  cat("Numerical columns influenced or not influenced by", names(cat_col)[i])
  cat("\n")
  print(list(not_influenced = not_influenced, influenced = influenced))
  cat("\n")
}
file:///C:/Users/Administrator/Documents/Assignment3eda.html 14/30
3/25/2018 Assignment3
##
## Numerical columns influenced or not influenced by BusinessTravel
## $not_influenced
## [1] "Age" "DailyRate"
## [3] "DistanceFromHome" "Education"
## [5] "EmployeeCount" "EmployeeNumber"
## [7] "EnvironmentSatisfaction" "HourlyRate"
## [9] "JobInvolvement" "JobLevel"
## [11] "JobSatisfaction" "MonthlyIncome"
## [13] "MonthlyRate" "NumCompaniesWorked"
## [15] "PercentSalaryHike" "PerformanceRating"
## [17] "RelationshipSatisfaction" "StandardHours"
## [19] "StockOptionLevel" "TotalWorkingYears"
## [21] "TrainingTimesLastYear" "WorkLifeBalance"
## [23] "YearsAtCompany" "YearsInCurrentRole"
## [25] "YearsSinceLastPromotion" "YearsWithCurrManager"
##
## $influenced
## [1] "Attrition"
##
##
##
## Numerical columns influenced or not influenced by Department
## $not_influenced
## [1] "Age" "DailyRate"
## [3] "DistanceFromHome" "Education"
## [5] "EmployeeCount" "EnvironmentSatisfaction"
## [7] "HourlyRate" "JobInvolvement"
## [9] "JobSatisfaction" "MonthlyRate"
## [11] "NumCompaniesWorked" "PercentSalaryHike"
## [13] "PerformanceRating" "RelationshipSatisfaction"
## [15] "StandardHours" "StockOptionLevel"
## [17] "TotalWorkingYears" "TrainingTimesLastYear"
## [19] "YearsAtCompany" "YearsInCurrentRole"
## [21] "YearsSinceLastPromotion" "YearsWithCurrManager"
##
## $influenced
## [1] "Attrition" "EmployeeNumber" "JobLevel" "MonthlyIncome"
## [5] "WorkLifeBalance"
##
##
##
## Numerical columns influenced or not influenced by EducationField
## $not_influenced
## [1] "Age" "DailyRate"
## [3] "DistanceFromHome" "EmployeeCount"
## [5] "EmployeeNumber" "EnvironmentSatisfaction"
## [7] "HourlyRate" "JobInvolvement"
## [9] "JobSatisfaction" "MonthlyIncome"
## [11] "MonthlyRate" "NumCompaniesWorked"
## [13] "PercentSalaryHike" "PerformanceRating"
## [15] "RelationshipSatisfaction" "StandardHours"
## [17] "StockOptionLevel" "TotalWorkingYears"
file:///C:/Users/Administrator/Documents/Assignment3eda.html 15/30
3/25/2018 Assignment3
file:///C:/Users/Administrator/Documents/Assignment3eda.html 16/30
3/25/2018 Assignment3
file:///C:/Users/Administrator/Documents/Assignment3eda.html 17/30
3/25/2018 Assignment3
# The graph also shows that marital status influences Avg_Income: married and
# divorced people have a higher average salary than single people.
file:///C:/Users/Administrator/Documents/Assignment3eda.html 18/30
3/25/2018 Assignment3
# Scatter plot of MonthlyIncome against Age, coloured by MaritalStatus, with
# a loess smoother per colour group.
ggplot(
  data = hr,
  mapping = aes(x = Age, y = MonthlyIncome, color = MaritalStatus)
) +
  geom_point() +
  geom_smooth(method = "loess")
file:///C:/Users/Administrator/Documents/Assignment3eda.html 19/30
3/25/2018 Assignment3
file:///C:/Users/Administrator/Documents/Assignment3eda.html 20/30
3/25/2018 Assignment3
file:///C:/Users/Administrator/Documents/Assignment3eda.html 21/30
3/25/2018 Assignment3
# Multiple BoxPlots
# MonthlyIncome split by BusinessTravel.
ggplot(data = hr, mapping = aes(x = BusinessTravel, y = MonthlyIncome)) +
  geom_boxplot()
file:///C:/Users/Administrator/Documents/Assignment3eda.html 22/30
3/25/2018 Assignment3
# YearsWithCurrManager split by Gender.
ggplot(data = hr, mapping = aes(x = Gender, y = YearsWithCurrManager)) +
  geom_boxplot()
file:///C:/Users/Administrator/Documents/Assignment3eda.html 23/30
3/25/2018 Assignment3
# YearsAtCompany split by JobRole.
ggplot(data = hr, mapping = aes(x = JobRole, y = YearsAtCompany)) +
  geom_boxplot()
file:///C:/Users/Administrator/Documents/Assignment3eda.html 24/30
3/25/2018 Assignment3
# MonthlyIncome split by JobRole.
ggplot(data = hr, mapping = aes(x = JobRole, y = MonthlyIncome)) +
  geom_boxplot()
file:///C:/Users/Administrator/Documents/Assignment3eda.html 25/30
3/25/2018 Assignment3
# Row counts per BusinessTravel value, bars filled by Gender.
ggplot(data = hr, mapping = aes(x = BusinessTravel, fill = Gender)) +
  geom_bar()
file:///C:/Users/Administrator/Documents/Assignment3eda.html 26/30
3/25/2018 Assignment3
# Row counts per BusinessTravel value, bars filled by MaritalStatus.
ggplot(data = hr, mapping = aes(x = BusinessTravel, fill = MaritalStatus)) +
  geom_bar()
file:///C:/Users/Administrator/Documents/Assignment3eda.html 27/30
3/25/2018 Assignment3
# Row counts per Department value, bars filled by EducationField.
ggplot(data = hr, mapping = aes(x = Department, fill = EducationField)) +
  geom_bar()
file:///C:/Users/Administrator/Documents/Assignment3eda.html 28/30
3/25/2018 Assignment3
# Row counts per MaritalStatus value, bars filled by OverTime.
ggplot(data = hr, mapping = aes(x = MaritalStatus, fill = OverTime)) +
  geom_bar()
file:///C:/Users/Administrator/Documents/Assignment3eda.html 29/30
3/25/2018 Assignment3
file:///C:/Users/Administrator/Documents/Assignment3eda.html 30/30