Introduction
Here I work on the Kaggle data set "Human Resources Analytics", which can be found at https://www.kaggle.com/ludobenistant/hr-analytics.
We're supposed to predict which valuable employees will leave next. Fields in the data set include:
- Employee satisfaction level
- Last evaluation
- Number of projects
- Average monthly hours
- Time spent at the company
- Whether they have had a work accident
- Whether they have had a promotion in the last 5 years
- Department
- Salary
- Whether the employee has left
Data Load
I start with loading the data set and looking at its properties: dimensions, column names and types of data.

dt=read.csv("HR_comma_sep.csv", stringsAsFactors=F)
str(dt)
## 'data.frame': 14999 obs. of 10 variables: ## $ satisfaction_level : num 0.38 0.8 0.11 0.72 0.37 0.41 0.1 0.92 0.89 0.42 ... ## $ last_evaluation : num 0.53 0.86 0.88 0.87 0.52 0.5 0.77 0.85 1 0.53 ... ## $ number_project : int 2 5 7 5 2 2 6 5 5 2 ... ## $ average_montly_hours : int 157 262 272 223 159 153 247 259 224 142 ... ## $ time_spend_company : int 3 6 4 5 3 3 4 5 5 3 ... ## $ Work_accident : int 0 0 0 0 0 0 0 0 0 0 ... ## $ left : int 1 1 1 1 1 1 1 1 1 1 ... ## $ promotion_last_5years: int 0 0 0 0 0 0 0 0 0 0 ... ## $ sales : chr "sales" "sales" "sales" "sales" ... ## $ salary : chr "low" "medium" "medium" "low" ...
# Count missing values in every column. vapply() pins the return type to one
# integer per column; sapply() can silently change its return shape on odd
# input, so it is avoided in non-interactive code.
vapply(dt, function(x) sum(is.na(x)), integer(1))
## satisfaction_level last_evaluation number_project ## 0 0 0 ## average_montly_hours time_spend_company Work_accident ## 0 0 0 ## left promotion_last_5years sales ## 0 0 0 ## salary ## 0
We can check pairwise plots, correlations and densities.
library(ggplot2)
library(GGally)
# Pairwise plots, correlations and densities of the numeric fields.
# NOTE: the original `mapping = aes(color = "navy")` maps the literal string
# "navy" as a grouping aesthetic -- it creates one group labeled "navy" drawn
# in ggplot's default palette, and does NOT make anything navy-colored.
# Coloring by the outcome `left` instead makes leavers vs. stayers visible
# in every panel.
ggpairs(data = dt, columns = 1:8,
        mapping = aes(color = factor(left)),
        axisLabels = "show")
# Distinct salary categories present in the data.
unique(dt[["salary"]])
## [1] "low" "medium" "high"
# Distinct departments (the column is named `sales` in the raw file).
unique(dt[["sales"]])
## [1] "sales" "accounting" "hr" "technical" "support" ## [6] "management" "IT" "product_mng" "marketing" "RandD"
# Headcount per department.
table(dt[["sales"]])
## ## accounting hr IT management marketing product_mng ## 767 739 1227 630 858 902 ## RandD sales support technical ## 787 4140 2229 2720
# Headcount per salary band.
table(dt[["salary"]])
## ## high low medium ## 1237 7316 6446
library(dummies)
# One-hot encode the salary band (columns come out in sorted level order:
# high, low, medium).
# NOTE(review): the `dummies` package is archived on CRAN and its dummy()
# mangles column names (see the captured output below); base
# model.matrix(~ salary - 1, dt) would be a drop-in replacement.
dumdt <- dummy(dt$salary)
dumdt <- as.data.frame(dumdt)
names(dumdt)
## [1] "MachinePredictionForNorthropGrumman.Rhtmlhigh" ## [2] "MachinePredictionForNorthropGrumman.Rhtmllow" ## [3] "MachinePredictionForNorthropGrumman.Rhtmlmedium"
# Drop the second column ("low") as the reference category, then give the two
# remaining indicator columns readable names.
dumdt <- dumdt[, -2]
colnames(dumdt) <- c("high_salary", "medium_salary")
# Attach the salary indicator columns and drop the original text column,
# which they now replace.
dt <- cbind(dt, dumdt)
dt$salary <- NULL
str(dt)
## 'data.frame': 14999 obs. of 11 variables: ## $ satisfaction_level : num 0.38 0.8 0.11 0.72 0.37 0.41 0.1 0.92 0.89 0.42 ... ## $ last_evaluation : num 0.53 0.86 0.88 0.87 0.52 0.5 0.77 0.85 1 0.53 ... ## $ number_project : int 2 5 7 5 2 2 6 5 5 2 ... ## $ average_montly_hours : int 157 262 272 223 159 153 247 259 224 142 ... ## $ time_spend_company : int 3 6 4 5 3 3 4 5 5 3 ... ## $ Work_accident : int 0 0 0 0 0 0 0 0 0 0 ... ## $ left : int 1 1 1 1 1 1 1 1 1 1 ... ## $ promotion_last_5years: int 0 0 0 0 0 0 0 0 0 0 ... ## $ sales : chr "sales" "sales" "sales" "sales" ... ## $ high_salary : int 0 0 0 0 0 0 0 0 0 0 ... ## $ medium_salary : int 0 1 1 0 0 0 0 0 0 0 ...
# One-hot encode the department column with base R's model.matrix instead of
# the archived `dummies` package, whose dummy() also mangles the column names
# (see the captured output below). The factor levels are pinned explicitly so
# the column order -- which the renaming/dropping step below relies on -- is
# the same in every locale: accounting ... technical.
dept_levels <- c("accounting", "hr", "IT", "management", "marketing",
                 "product_mng", "RandD", "sales", "support", "technical")
dumdt <- as.data.frame(
  model.matrix(~ dept - 1,
               data.frame(dept = factor(dt$sales, levels = dept_levels)))
)
names(dumdt)
## [1] "MachinePredictionForNorthropGrumman.Rhtmlaccounting" ## [2] "MachinePredictionForNorthropGrumman.Rhtmlhr" ## [3] "MachinePredictionForNorthropGrumman.RhtmlIT" ## [4] "MachinePredictionForNorthropGrumman.Rhtmlmanagement" ## [5] "MachinePredictionForNorthropGrumman.Rhtmlmarketing" ## [6] "MachinePredictionForNorthropGrumman.Rhtmlproduct_mng" ## [7] "MachinePredictionForNorthropGrumman.RhtmlRandD" ## [8] "MachinePredictionForNorthropGrumman.Rhtmlsales" ## [9] "MachinePredictionForNorthropGrumman.Rhtmlsupport" ## [10] "MachinePredictionForNorthropGrumman.Rhtmltechnical"
# Drop the tenth column ("technical") as the reference department, name the
# nine remaining indicators, attach them, and remove the original text column.
dumdt <- dumdt[, -10]
colnames(dumdt) <- c("accounting", "hr", "IT", "management", "marketing",
                     "product_mng", "RandD", "sales", "support")
dt <- cbind(dt, dumdt)
dt$sales <- NULL
# Look at the new data frame:
str(dt)
## 'data.frame': 14999 obs. of 19 variables: ## $ satisfaction_level : num 0.38 0.8 0.11 0.72 0.37 0.41 0.1 0.92 0.89 0.42 ... ## $ last_evaluation : num 0.53 0.86 0.88 0.87 0.52 0.5 0.77 0.85 1 0.53 ... ## $ number_project : int 2 5 7 5 2 2 6 5 5 2 ... ## $ average_montly_hours : int 157 262 272 223 159 153 247 259 224 142 ... ## $ time_spend_company : int 3 6 4 5 3 3 4 5 5 3 ... ## $ Work_accident : int 0 0 0 0 0 0 0 0 0 0 ... ## $ left : int 1 1 1 1 1 1 1 1 1 1 ... ## $ promotion_last_5years: int 0 0 0 0 0 0 0 0 0 0 ... ## $ high_salary : int 0 0 0 0 0 0 0 0 0 0 ... ## $ medium_salary : int 0 1 1 0 0 0 0 0 0 0 ... ## $ accounting : int 0 0 0 0 0 0 0 0 0 0 ... ## $ hr : int 0 0 0 0 0 0 0 0 0 0 ... ## $ IT : int 0 0 0 0 0 0 0 0 0 0 ... ## $ management : int 0 0 0 0 0 0 0 0 0 0 ... ## $ marketing : int 0 0 0 0 0 0 0 0 0 0 ... ## $ product_mng : int 0 0 0 0 0 0 0 0 0 0 ... ## $ RandD : int 0 0 0 0 0 0 0 0 0 0 ... ## $ sales : int 1 1 1 1 1 1 1 1 1 1 ... ## $ support : int 0 0 0 0 0 0 0 0 0 0 ...
rm(dumdt) # free the temporary dummy-variable frame now that it is merged into dt
Machine Learning
Now my data is ready for prediction. I split the data set into 2 sets: for training and for testing.

# Fix the RNG seed so the 70/30 train/test split is reproducible; without it
# the sampled rows -- and every downstream number -- change on each run.
set.seed(42)
idx <- sample(seq_len(nrow(dt)), size = round(0.7 * nrow(dt)))
train <- dt[idx, ]
test <- dt[-idx, ]
str(train) # sanity-check the training split: same 19 columns, ~70% of the rows
## 'data.frame': 10499 obs. of 19 variables: ## $ satisfaction_level : num 0.71 0.73 0.74 0.41 0.1 0.88 0.71 0.76 0.85 0.93 ... ## $ last_evaluation : num 0.5 0.6 0.55 0.49 0.94 1 0.92 0.8 0.65 0.65 ... ## $ number_project : int 4 3 6 2 6 5 3 4 4 4 ... ## $ average_montly_hours : int 253 137 130 130 255 219 202 226 280 212 ... ## $ time_spend_company : int 3 3 2 3 4 6 4 5 3 4 ... ## $ Work_accident : int 0 0 0 0 0 1 0 0 1 0 ... ## $ left : int 0 0 0 1 1 1 0 0 0 0 ... ## $ promotion_last_5years: int 0 0 0 0 0 0 0 0 0 0 ... ## $ high_salary : int 0 0 0 0 0 0 0 0 0 0 ... ## $ medium_salary : int 1 1 1 0 0 0 0 0 1 1 ... ## $ accounting : int 0 0 0 0 0 0 0 0 0 0 ... ## $ hr : int 0 1 0 0 0 0 0 0 0 0 ... ## $ IT : int 0 0 0 0 0 0 0 0 1 1 ... ## $ management : int 0 0 0 0 0 0 0 0 0 0 ... ## $ marketing : int 0 0 0 1 0 0 0 0 0 0 ... ## $ product_mng : int 0 0 0 0 0 0 0 0 0 0 ... ## $ RandD : int 1 0 0 0 0 0 0 0 0 0 ... ## $ sales : int 0 0 0 0 0 0 0 0 0 0 ... ## $ support : int 0 0 1 0 0 0 0 1 0 0 ...
Logistic Regression
# Fit a logistic regression of `left` on all remaining predictors.
glm_mod <- glm(left ~ ., data = train, family = binomial)
options(width = 120)  # widen console output so the coefficient table does not wrap
summary(glm_mod)
## ## Call: ## glm(formula = left ~ ., family = binomial, data = train) ## ## Deviance Residuals: ## Min 1Q Median 3Q Max ## -2.252360 -0.669234 -0.404160 -0.118092 3.005301 ## ## Coefficients: ## Estimate Std. Error z value Pr(>|z|) ## (Intercept) 0.513671571 0.152904509 3.35943 0.00078104 *** ## satisfaction_level -4.102569262 0.116375474 -35.25287 < 2.22e-16 *** ## last_evaluation 0.636893223 0.177888069 3.58030 0.00034320 *** ## number_project -0.315622781 0.025421761 -12.41546 < 2.22e-16 *** ## average_montly_hours 0.004809019 0.000613495 7.83872 4.5515e-15 *** ## time_spend_company 0.270089407 0.018619140 14.50601 < 2.22e-16 *** ## Work_accident -1.489204860 0.104868336 -14.20071 < 2.22e-16 *** ## promotion_last_5years -1.268547473 0.277706609 -4.56794 4.9254e-06 *** ## high_salary -1.903687581 0.153808902 -12.37697 < 2.22e-16 *** ## medium_salary -0.485601493 0.054435288 -8.92071 < 2.22e-16 *** ## accounting -0.216824268 0.130317943 -1.66381 0.09615045 . ## hr 0.177414039 0.129105528 1.37418 0.16938628 ## IT -0.271998822 0.110571123 -2.45994 0.01389585 * ## management -0.523601663 0.162647148 -3.21925 0.00128527 ** ## marketing -0.098831483 0.127918381 -0.77261 0.43975108 ## product_mng -0.272539608 0.124254440 -2.19340 0.02827862 * ## RandD -0.599234520 0.144357350 -4.15105 3.3095e-05 *** ## sales -0.129873866 0.078066780 -1.66363 0.09618735 . ## support -0.048540168 0.089959935 -0.53958 0.58948988 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## (Dispersion parameter for binomial family taken to be 1) ## ## Null deviance: 11544.393 on 10498 degrees of freedom ## Residual deviance: 9035.048 on 10480 degrees of freedom ## AIC: 9073.048 ## ## Number of Fisher Scoring iterations: 5
# In-sample predicted probabilities of leaving.
p_hat <- fitted(glm_mod)
# How well are the people who stayed predicted?
hist(p_hat[train$left == 0])
# And the ones who left.
hist(p_hat[train$left == 1])
# Out-of-sample predictions. `type = "response"` returns probabilities on the
# 0-1 scale, matching the fitted() histograms above; the predict.glm default
# ("link") would return log-odds instead. predict() only looks up the model's
# predictor columns, so the extra `left` column in `test` is simply ignored.
glm_predictions <- predict(glm_mod, newdata = test, type = "response")
# How well are the people who stayed predicted? Colored red.
hist(glm_predictions[test$left == 0], breaks = 15, col = 2)
# And the ones who left, colored translucent blue, overlaid on the same axes.
# (The original comment said "stayed" here by mistake.)
hist(glm_predictions[test$left == 1], breaks = 15,
     col = rgb(0, 0.7, 0.8, 1/2), add = TRUE)