xgboost vs random forest vs k-means clustering
This post compares how three different models perform on the same dataset:
1. Random Forest
2. xgboost
3. K-means clustering
We will compare the variable importance rankings produced by xgboost and Random Forest, as well as the confusion-matrix accuracy of all three models. And of course, at the very end, there will be charts that communicate the results at a glance.
Dataset:
https://www.kaggle.com/uciml/breast-cancer-wisconsin-data
The data concern breast cancer: the features were measured from digitized images obtained via fine-needle aspiration (FNA). The variable names and definitions are documented at the URL above. Our goal is to use these features to correctly predict the diagnosis, i.e., whether a tumor turns out to be malignant or benign (Malignant or Benign?).
Let's get started!
wbcd <- read.csv("data.csv")
wbcd$X <- NULL     # drop the empty trailing column "X"
wbcd <- wbcd[,-1]  # drop the id column
wbcd$diagnosis <- factor(ifelse(wbcd$diagnosis=="B","Benign","Malignant"))
str(wbcd)
summary(wbcd)
head(wbcd)
Read the data, drop the unused columns (X and id), and convert diagnosis into a factor.
nrows <- NROW(wbcd)
set.seed(218) ## fix the random seed for reproducibility
index <- sample(1:nrows, 0.7 * nrows) ## shuffle and split
#train <- wbcd ## use all 569 rows (100%)
train <- wbcd[index,]  ## 398 training rows (70%)
test <- wbcd[-index,]  ## 171 test rows (30%)
prop.table(table(train$diagnosis))
Benign Malignant
0.6180905 0.3819095
prop.table(table(test$diagnosis))
Benign Malignant
0.6491228 0.3508772
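The 70/30 random split happens to keep the benign/malignant proportions similar in the two sets. If you want to guarantee that, a stratified split is an option; here is a minimal sketch using caret's createDataPartition (shown for reference only, not what the rest of this post uses):
# Alternative (not used below): a stratified 70/30 split that preserves
# the class proportions, via caret::createDataPartition.
library(caret)
set.seed(218)
idx_strat <- createDataPartition(wbcd$diagnosis, p = 0.7, list = FALSE)
train_strat <- wbcd[idx_strat, ]
test_strat  <- wbcd[-idx_strat, ]
prop.table(table(train_strat$diagnosis))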
With the training and test sets in place, on to the first model.
1. Random Forest
#RandomForest
library(randomForest)
library(caret)
library(dplyr)   # for %>%, mutate, dense_rank
library(ggplot2) # for the importance plot below
learn_rf <- randomForest(diagnosis~., data=train, ntree=500, proximity=T, importance=T)
pre_rf <- predict(learn_rf, test[,-1])
cm_rf <- confusionMatrix(pre_rf, test$diagnosis)
cm_rf
importance <- importance(learn_rf)
varImportance <- data.frame(Variables = row.names(importance),
                            Importance = round(importance[, 'MeanDecreaseGini'], 2))
rankImportance <- varImportance %>%
  mutate(Rank = paste0('#', dense_rank(desc(Importance))))
ggplot(rankImportance, aes(x = reorder(Variables, Importance),
                           y = Importance, fill = Importance)) +
  geom_bar(stat = 'identity') +
  geom_text(aes(x = Variables, y = 0.5, label = Rank),
            hjust = 0, vjust = 0.55, size = 4, colour = 'red') +
  labs(x = 'Variables') +
  coord_flip()
col <- c("#ed3b3b", "#0099ff")
fourfoldplot(cm_rf$table, color = col, conf.level = 0, margin = 1, main=paste("RandomForest (",round(cm_rf$overall[1]*100),"%)",sep=""))
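The forest above uses ntree=500 with the default mtry. If you want to tune mtry as well, randomForest ships a helper, tuneRF; a minimal sketch (an optional tuning step, not part of the original pipeline):
# Optional: search for a better mtry using the OOB error (randomForest::tuneRF).
set.seed(218)
tuneRF(train[, -1], train$diagnosis,
       ntreeTry = 500,   # trees grown per candidate mtry
       stepFactor = 1.5, # multiply/divide mtry by this each step
       improve = 0.01)   # keep searching while OOB error improves by >1%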
2. Xgboost
library(xgboost)
train$diagnosis <- ifelse(train$diagnosis == "Malignant",1,0)
test$diagnosis <- ifelse(test$diagnosis == "Malignant",1,0)
dtrain <- xgb.DMatrix(data=as.matrix(train[,-1]), label=train$diagnosis)
dtest <- xgb.DMatrix(data=as.matrix(test[,-1]), label=test$diagnosis)
watchlist <- list(train=dtrain, test=dtest)
xg.model <- xgboost(data= dtrain, #training DMatrix
         eval_metric= 'logloss', #model minimizes log loss
         objective= "reg:logistic", #outputs probabilities in [0,1]
         #tuning parameters
         max.depth= 8, #Vary btwn 3-15
         eta= 0.1, #Vary btwn 0.1-0.3
         nthread = 5, #Increase this to improve speed
         subsample= 1, #Vary btwn 0.8-1
         colsample_bytree= 0.5, #Vary btwn 0.3-0.8
         lambda= 0.5, #Vary between 0-3
         alpha= 0.5, #Vary between 0-3
         min_child_weight= 3, #Vary btwn 1-10
         nrounds= 30 #Vary btwn 100-3000 based on max.depth, eta, subsample and colsample
         )
xg_prediction <- predict(xg.model, dtest) #predicted probabilities
xg_prediction <- ifelse(xg_prediction >= 0.55,1,0) #threshold at 0.55
xgresult <- factor(ifelse(xg_prediction == 1,"Malignant","Benign"))
# confusionMatrix needs both arguments as factors with the same levels,
# so convert the 0/1 test labels back before comparing
xgb <- confusionMatrix(xgresult,
                       factor(ifelse(test$diagnosis == 1,"Malignant","Benign")))
xgb
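The 0.55 cutoff is a judgment call. A quick scan over a few thresholds (a sketch, not in the original) shows how sensitive accuracy is to that choice:
# Sketch: accuracy of the xgboost model at several probability cutoffs.
probs <- predict(xg.model, dtest)
sapply(c(0.3, 0.4, 0.5, 0.55, 0.6, 0.7),
       function(th) mean(as.numeric(probs >= th) == test$diagnosis))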
xgb.plot.importance(xgb.importance(feature_names = colnames(dtrain), model = xg.model))
col <- c("#ed3b3b", "#0099ff")
fourfoldplot(xgb$table, color = col, conf.level = 0, margin = 1, main=paste("xgboost (",round(xgb$overall[1]*100),"%)",sep=""))
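One more note: the watchlist built earlier is never actually consumed by xgboost(). To monitor train/test logloss during boosting and stop once the test metric stalls, xgb.train accepts it directly; a minimal sketch (the round count and patience here are assumptions, not tuned values):
# Sketch: watch both sets and stop once test logloss stalls for 10 rounds.
xg.model.es <- xgb.train(params = list(objective = "binary:logistic",
                                       eval_metric = "logloss",
                                       max_depth = 8, eta = 0.1),
                         data = dtrain,
                         nrounds = 300,
                         watchlist = watchlist,
                         early_stopping_rounds = 10)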
3. K-means
#K-means clustering
# stats::kmeans has no predict() method, so this helper assigns each new
# observation to the nearest cluster centre by Euclidean distance.
predict.kmeans <- function(newdata, object){
  centers <- object$centers
  n_centers <- nrow(centers)
  # distance matrix over the centres and the new observations combined
  dist_mat <- as.matrix(dist(rbind(centers, newdata)))
  # keep only rows for new data and columns for the centres
  dist_mat <- dist_mat[-seq(n_centers), seq(n_centers)]
  max.col(-dist_mat) # index of the nearest centre for each row
}
learn_kmeans <- kmeans(train[,-1], centers=2)
pre_kmeans <- predict.kmeans(test[,-1], learn_kmeans)
# NOTE: k-means cluster IDs are arbitrary; this mapping assumes cluster 1
# corresponds to the benign cases, which may flip on a different run
pre_kmeans <- ifelse(pre_kmeans == 1,"Benign","Malignant")
test$pre_kmeans <- as.factor(pre_kmeans)
# diagnosis was recoded to 0/1 in the xgboost step, so convert it back to a
# factor before building the confusion matrix
cm_kmeans <- confusionMatrix(test$pre_kmeans,
                             factor(ifelse(test$diagnosis == 1,"Malignant","Benign")))
cm_kmeans
col <- c("#ed3b3b", "#0099ff")
fourfoldplot(cm_kmeans$table, color = col, conf.level = 0, margin = 1, main=paste("KMeans (",round(cm_kmeans$overall[1]*100),"%)",sep=""))
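Hard-coding cluster 1 as Benign is fragile, since k-means cluster numbering depends on the random initialization. A more robust mapping (a sketch; it reuses the 0/1 diagnosis coding left over from the xgboost step) gives each cluster the majority diagnosis of its training members:
# Sketch: map each cluster to the majority class among its training rows.
frac_malignant <- tapply(train$diagnosis, learn_kmeans$cluster, mean)
cluster_label <- ifelse(frac_malignant > 0.5, "Malignant", "Benign")
pre_kmeans_robust <- cluster_label[predict.kmeans(test[,-1], learn_kmeans)]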
Reference:
https://www.kaggle.com/mirichoi0218/classification-breast-cancer-or-not-with-15-ml