XGBoost vs. Random Forest vs. K-means Clustering

This post compares the results of three models applied to the same dataset:
1. Random Forest
2. XGBoost
3. K-means clustering
We will compare the variable-importance rankings chosen by XGBoost and Random Forest, then compare the confusion-matrix accuracy of all three models. And of course, at the very end, there will be charts that communicate the results at a glance.

Dataset:
https://www.kaggle.com/uciml/breast-cancer-wisconsin-data

The dataset concerns breast cancer: the features were measured from digitized images of cell samples obtained via fine-needle aspiration (FNA). Detailed variable names and definitions are available at the URL above. Our goal is to use these features to correctly predict the diagnosis, i.e., whether a tumor is malignant or benign.

Let's get started!


wbcd <- read.csv("data.csv")
wbcd$X <- NULL                          ## drop the empty trailing column X
wbcd <- wbcd[,-1]                       ## drop the id column
wbcd$diagnosis <- factor(ifelse(wbcd$diagnosis=="B","Benign","Malignant"))
str(wbcd)
summary(wbcd)
head(wbcd)
Read the data, drop the unused columns (X and id), and recode diagnosis as a factor with the labels Benign and Malignant.
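
Before splitting, it is worth a quick sanity check that nothing is missing (a minimal sketch; for this dataset the count should be zero once the empty X column is dropped):

sum(is.na(wbcd))                        ## total missing values, expected 0
table(wbcd$diagnosis)                   ## raw class counts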

nrows <- NROW(wbcd)
set.seed(218)                           ## fix the random seed for reproducibility
index <- sample(1:nrows, 0.7 * nrows)   ## shuffle and take a 70% sample

#train <- wbcd                          ## all 569 rows (100%)
train <- wbcd[index,]                   ## 398 training rows (70%)
test <- wbcd[-index,]                   ## 171 test rows (30%)
prop.table(table(train$diagnosis))
   Benign Malignant 
0.6180905 0.3819095 

prop.table(table(test$diagnosis))
   Benign Malignant 
0.6491228 0.3508772 
Split the data into a 70% training set and a 30% test set. The benign/malignant proportions in the two sets are similar but not identical, since the split is random rather than stratified.
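
If you want the class proportions preserved exactly in both splits, a stratified split is an alternative. A minimal sketch using caret's createDataPartition (shown for reference; the rest of the post keeps the plain sample() split above):

library(caret)
set.seed(218)
idx_strat <- createDataPartition(wbcd$diagnosis, p = 0.7, list = FALSE)
train_s <- wbcd[idx_strat,]             ## hypothetical alternative split
test_s  <- wbcd[-idx_strat,]
prop.table(table(train_s$diagnosis))    ## proportions now match the full data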

1. Random Forest
#Random Forest
library(randomForest)
library(caret)
learn_rf <- randomForest(diagnosis~., data=train, ntree=500, proximity=T, importance=T)
pre_rf   <- predict(learn_rf, test[,-1])
cm_rf    <- confusionMatrix(pre_rf, test$diagnosis)
cm_rf
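
Accuracy is not the whole story for a cancer screen; the confusionMatrix object also exposes per-class statistics directly, for example:

cm_rf$overall["Accuracy"]
cm_rf$byClass[c("Sensitivity","Specificity")]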

library(dplyr)                          ## for %>%, mutate(), dense_rank()
library(ggplot2)

importance    <- importance(learn_rf)
varImportance <- data.frame(Variables = row.names(importance), 
                            Importance = round(importance[ ,'MeanDecreaseGini'],2))

rankImportance <- varImportance %>%
  mutate(Rank = paste0('#',dense_rank(desc(Importance))))
ggplot(rankImportance, aes(x = reorder(Variables, Importance), 
                           y = Importance, fill = Importance)) +
  geom_bar(stat='identity') + 
  geom_text(aes(x = Variables, y = 0.5, label = Rank),
            hjust=0, vjust=0.55, size = 4, colour = 'red') +
  labs(x = 'Variables') +
  coord_flip()

col <- c("#ed3b3b", "#0099ff")
fourfoldplot(cm_rf$table, color = col, conf.level = 0, margin = 1, main=paste("RandomForest (",round(cm_rf$overall[1]*100),"%)",sep=""))
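
The model above uses the default mtry (number of variables tried at each split), which is often worth tuning. A quick sketch with randomForest's tuneRF helper (illustrative, not part of the original workflow):

set.seed(218)
tuneRF(train[,-1], train$diagnosis,
       ntreeTry = 500,                  ## trees grown per candidate mtry
       stepFactor = 1.5,                ## scale mtry by this factor each step
       improve = 0.01,                  ## continue while OOB error improves 1%
       trace = TRUE)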





2. XGBoost

library(xgboost)
train$diagnosis <- ifelse(train$diagnosis == "Malignant",1,0)
test$diagnosis  <- ifelse(test$diagnosis == "Malignant",1,0)
dtrain <- xgb.DMatrix(data=as.matrix(train[,-1]), label=train$diagnosis)
dtest  <- xgb.DMatrix(data=as.matrix(test[,-1]), label=test$diagnosis)
xg.model <- xgboost(data= dtrain,     #train DMatrix
                    eval.metric= 'logloss',        #model minimizes log loss
                    objective= "reg:logistic",     #logistic regression on 0/1 labels; predictions are probabilities
                    #tuning parameters
                    max.depth= 8,            #Vary btwn 3-15
                    eta= 0.1,                #Vary btwn 0.1-0.3
                    nthread = 5,             #Increase this to improve speed
                    subsample= 1,            #Vary btwn 0.8-1
                    colsample_bytree= 0.5,   #Vary btwn 0.3-0.8
                    lambda= 0.5,             #Vary between 0-3
                    alpha= 0.5,              #Vary between 0-3
                    min_child_weight= 3,     #Vary btwn 1-10
                    nrounds= 30              #Vary btwn 100-3000 based on max.depth, eta, subsample and colsample_bytree
)
xg_prediction <- predict(xg.model, dtest)              ## predicted probabilities
xg_prediction <- ifelse(xg_prediction >= 0.55, 1, 0)   ## classify at a 0.55 cutoff
## confusionMatrix() needs two factors with the same levels, and
## test$diagnosis is now numeric 0/1, so recode both back to labels.
xg_pred <- factor(ifelse(xg_prediction == 1,"Malignant","Benign"),
                  levels=c("Benign","Malignant"))
xg_true <- factor(ifelse(test$diagnosis == 1,"Malignant","Benign"),
                  levels=c("Benign","Malignant"))
cm_xgb <- confusionMatrix(xg_pred, xg_true)
cm_xgb
xgb.plot.importance(xgb.importance(model = xg.model, feature_names = colnames(dtrain)))
col <- c("#ed3b3b", "#0099ff")
fourfoldplot(cm_xgb$table, color = col, conf.level = 0, margin = 1, main=paste("xgboost (",round(cm_xgb$overall[1]*100),"%)",sep=""))
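
Rather than hand-picking nrounds, cross-validation with early stopping can choose it. A minimal sketch with xgb.cv, reusing the parameters above (illustrative; not the tuning this post actually ran):

set.seed(218)
cv <- xgb.cv(data = dtrain,
             params = list(objective = "reg:logistic", eval_metric = "logloss",
                           max_depth = 8, eta = 0.1),
             nrounds = 300, nfold = 5,
             early_stopping_rounds = 20, ## stop once test logloss stalls
             verbose = 0)
cv$best_iteration                       ## candidate value for nrounds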

3. K-means

#K-means Clustering
## kmeans() has no built-in predict method, so this helper assigns each
## new observation to the nearest learned centroid (Euclidean distance).
predict.kmeans <- function(newdata, object){
  centers <- object$centers
  n_centers <- nrow(centers)
  dist_mat <- as.matrix(dist(rbind(centers, newdata)))
  dist_mat <- dist_mat[-seq(n_centers), seq(n_centers)]
  max.col(-dist_mat)
}
set.seed(218)                           ## kmeans starting centers are random
learn_kmeans <- kmeans(train[,-1], centers=2)
pre_kmeans <- predict.kmeans(test[,-1], learn_kmeans)
## Cluster numbers are arbitrary; inspect learn_kmeans$centers and swap
## the labels below if cluster 1 does not correspond to Benign.
pre_kmeans <- ifelse(pre_kmeans == 1,"Benign","Malignant")
test$pre_kmeans <- factor(pre_kmeans, levels=c("Benign","Malignant"))
## diagnosis was recoded to 0/1 in the XGBoost section; convert it back
test$diagnosis <- factor(ifelse(test$diagnosis == 1,"Malignant","Benign"),
                         levels=c("Benign","Malignant"))
cm_kmeans <- confusionMatrix(test$pre_kmeans, test$diagnosis)
cm_kmeans

col <- c("#ed3b3b", "#0099ff")
fourfoldplot(cm_kmeans$table, color = col, conf.level = 0, margin = 1, main=paste("KMeans (",round(cm_kmeans$overall[1]*100),"%)",sep=""))
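
One caveat: k-means is distance-based, so features on large scales (e.g. area) dominate the clustering. A sketch of the same pipeline on standardized features (hypothetical variable names; whether it improves accuracy here is worth checking):

feat <- colnames(train)[-1]             ## the 30 numeric feature columns
train_scaled <- scale(train[, feat])
test_scaled  <- scale(test[, feat],
                      center = attr(train_scaled, "scaled:center"),
                      scale  = attr(train_scaled, "scaled:scale"))
learn_kmeans_s <- kmeans(train_scaled, centers = 2, nstart = 25)
pre_kmeans_s   <- predict.kmeans(test_scaled, learn_kmeans_s)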



Reference:
https://www.kaggle.com/mirichoi0218/classification-breast-cancer-or-not-with-15-ml








