# Credit-default modelling script:
# load data -> remove correlated predictors -> WoE/IV feature selection ->
# bootstrap comparison of 6 classifiers (parallelised) -> plots.
rm(list = ls( ))
require(nnet); require(pROC); require(lmtest); require(caret); require(ROCR); require(mfx); require(parallel)
require(ResourceSelection); require(devtools); require(woe); require(reshape2); require(e1071); require(randomForest)
# install.packages(c("nnet", "lmtest", "caret", "ROCR", "mfx", "ResourceSelection", "devtools", "reshape2", "pROC"))
# install.packages(c("e1071", "parallel"))
# install_github("riv","tomasgreif")

data1 <- read.csv('dataset1.csv') # load the data
# NOTE(review): cor(data1) assumes every column of dataset1.csv is numeric -- confirm.
corr <- findCorrelation(cor(data1), cutoff = .6, exact = TRUE) # indices of highly correlated columns
corr <- sort(corr)
data2 <- data1[ , -corr] # drop the correlated columns

# Factorize the discrete variables; Default, Duration and Age stay numeric.
data3 <- data.frame(data2$Default, data2$Duration, data2$Age, apply(data2[ , c(1,3:11,13:19)], 2, as.factor))
names(data3)[names(data3) == 'data2.Duration'] <- 'Duration'
names(data3)[names(data3) == 'data2.Age'] <- 'Age'
names(data3)[names(data3) == 'data2.Default'] <- 'Default'

woes <- iv.mult(data3, "Default", TRUE) # compute information values (IV)
iv.plot.summary(woes) # IV summary plot
data3 <- data3[ , c(woes$Variable[woes$InformationValue > .1], "Default")] # drop variables with low IV
woes2 <- iv.mult(data3, "Default", FALSE) # compute weight of evidence (WoE)
ldata3 <- length(data3)
data3 <- iv.replace.woe(data3, woes2) # replace raw values with WoE categories
data3 <- data3[ , ldata3:length(data3)] # keep Default + the appended *_woe columns
data3 <- data.frame(data3$Default, apply(data3[ , 2:ldata3], 2, as.factor)) # factorize the WoE columns
names(data3)[names(data3) == 'data3.Default'] <- 'Default'

data4 <- data.frame(scale(data1)) # standardized data for the machine-learning models
data4$Default <- data1$Default # keep the target unscaled

# One bootstrap iteration.
#   it : iteration index, also used as the RNG seed.
# Fits 6 models (linear, logit, probit, neural net, SVM, random forest) on a
# 70% training split and returns a vector of 12 AUCs: 6 in-sample, 6 out-of-sample.
# Reads data3/data4 from the global environment (exported to workers below).
func <- function(it) {
  set.seed(it)
  intraining <- createDataPartition(data3$Default, p = .7, list = FALSE) # training-set row indices
  modellin <- lm(Default~., data3[intraining, ]) # linear probability model
  modellog <- glm(Default~., binomial('logit'), data3[intraining, ]) # logistic model
  modelpro <- glm(Default~., binomial('probit'), data3[intraining, ]) # probit model
  modelneur <- nnet(Default~., data4[intraining, ], decay = 2, size = 15, maxit = 5000, trace = FALSE) # neural network
  modelsvm <- svm(Default~., data4[intraining, ], scale = FALSE, cost = .1) # SVM model
  modelfor <- randomForest(Default~., data4[intraining, ], nodesize = 50) # random forest model
  a <- auc(data3[intraining, ]$Default, modellin$fitted.values) # in-sample (training) AUCs
  b <- auc(data3[intraining, ]$Default, modellog$fitted.values)
  c <- auc(data3[intraining, ]$Default, modelpro$fitted.values)
  d <- auc(data4[intraining, ]$Default, modelneur$fitted.values)
  e <- auc(data4[intraining, ]$Default, modelsvm$fitted)
  f <- auc(data4[intraining, ]$Default, predict(modelfor, data4[intraining, ]))
  g <- auc(data3[-intraining, ]$Default, predict(modellin, data3[-intraining, ])) # out-of-sample (test) AUCs
  h <- auc(data3[-intraining, ]$Default, predict.glm(modellog, data3[-intraining, ], "response"))
  i <- auc(data3[-intraining, ]$Default, predict.glm(modelpro, data3[-intraining, ], "response"))
  j <- auc(data4[-intraining, ]$Default, predict(modelneur, data4[-intraining, ]))
  k <- auc(data4[-intraining, ]$Default, predict(modelsvm, data4[-intraining, ]))
  l <- auc(data4[-intraining, ]$Default, predict(modelfor, data4[-intraining, ]))
  vysledek <- c(a, b, c, d, e, f, g, h, i, j, k, l)
  return(vysledek)
}

iter <- 100 # number of bootstrap iterations
vysledek <- matrix(NA, iter, 12) # preallocate the result matrix
cl <- makeCluster(4) # spin up a parallel cluster with 4 workers
clusterExport(cl, c("data3", "data4")) # ship the data to the workers
# load the required packages on each worker
clusterEvalQ(cl, c(library(nnet), library(e1071), library(ROCR), library(pROC), library(caret), library(randomForest)))
system.time(vysledek <- t(parSapply(cl, 1:iter, func))) # run the bootstrap and time it; takes several minutes
stopCluster(cl) # shut the cluster down
colnames(vysledek) <- c("Lin_in", "Log_in", "Pro_in", "Neur_in", "SVM_in", "For_in", "Lin_out", "Log_out", "Pro_out", "Neur_out", "SVM_out", "For_out") # column names
vysledek <- data.frame(vysledek)
summary(vysledek[1:6]) # summary statistics, training data
summary(vysledek[7:12]) # summary statistics, test data

# AUC density plots per model, in-sample and out-of-sample.
ins <- melt(vysledek[1:6])
outs <- melt(vysledek[7:12])
ggplot(ins, aes(x = value, fill = variable)) + geom_density(alpha = .5) + theme_minimal( ) +
  ylab("Hustota") + xlab("AUC") +
  geom_vline(xintercept = colMeans(vysledek[1:6]), colour = c("green", "red", "yellow", "blue", "brown", "grey")) +
  xlim(c(.75, .95)) +
  scale_fill_manual(values = c("green", "red", "yellow", "blue", "brown", "grey")) + ylim(c(0, 100))
ggplot(outs, aes(x = value, fill = variable)) + geom_density(alpha = .5) + theme_minimal( ) +
  ylab("Hustota") + xlab("AUC") +
  geom_vline(xintercept = colMeans(vysledek[7:12]), colour = c("green", "red", "yellow", "blue", "brown", "grey")) +
  xlim(c(.65, .9)) +
  scale_fill_manual(values = c("green", "red", "yellow", "blue", "brown", "grey")) + ylim(c(0, 20))

# train() tries all parameter combinations in the grid and picks the best one; takes a long time
train(Default~., trControl = trainControl(number = 10, allowParallel = TRUE), data4, method = 'nnet',
  maxit = 10000, MaxNWts = 100000, tuneGrid = expand.grid(.size = c(5, 10, 15), .decay = c(0, .5, 1, 2, 3)))

cil <- 1 - data4$Default # 1 - Default, for a nicer plot below
# model without validation; any model above can be copied here with "[intraining, ]" removed
model <- nnet(Default~., data4, decay = 2, size = 15, maxit = 5000) # neural network
fit <- predict(model, data4) # model predictions
ggplot(data.frame(fit, cil), aes(x = fit, fill = factor(cil, labels = c("Nesplaceno", "Splaceno")))) +
  geom_histogram(bins = 20, colour = "white") + theme_minimal( ) +
  xlab("Pravděpodobnost defaultu") + ylab("Počet") +
  guides(fill = guide_legend(title = NULL)) + scale_fill_manual(values = c("grey", "black"))
plot.roc(data4$Default, as.numeric(fit), print.auc = TRUE) # ROC curve and AUC
perf <- (performance(ROCR::prediction(fit, data4$Default), "acc")) # accuracy-vs-cutoff ("performance") curve
plot(perf)
round(max(unlist(perf@y.values)) * 100, 2) # maximum of the performance curve
# Download a helper for visualising nnet objects and plot the fitted network.
source_url('https://gist.githubusercontent.com/fawda123/7471137/raw/466c1474d0a505ff044412703516c34f1a4684a5/nnet_plot_update.r')
plot.nnet(model, circle.col = "grey") # diagram of the neural network

set.seed(2017)
k <- 10 # number of folds in k-fold cross-validation
# Fold assignment: each row gets an integer 1..k.
# NOTE(review): folds are built on Duration rather than the Default outcome -- confirm intent.
a <- createFolds(data4$Duration, k, list = FALSE)
cvvysledky <- matrix(NA, k, 13) # result matrix: fold id + 6 in-sample + 6 out-of-sample AUCs

# Cross-validation loop: train on rows with a != i, evaluate on the held-out fold a == i.
# BUGFIX: the original compared against the literal 1 (a != 1 / a == 1), so every
# iteration trained and tested on the identical split (fold 1 was always the test
# set); the loop index was only ever stored in column 1.
for (i in seq_len(k)) {
  cvvysledky[i, 1] <- i
  modelcv <- lm(Default~., data3[a != i, ]) # linear probability model
  cvvysledky[i, 2] <- auc(data3$Default[a != i], modelcv$fitted.values)
  cvvysledky[i, 8] <- auc(data3$Default[a == i], predict(modelcv, data3[a == i, ]))
  modelcv <- glm(Default~., binomial('logit'), data3[a != i, ]) # logistic model
  cvvysledky[i, 3] <- auc(data3$Default[a != i], modelcv$fitted.values)
  cvvysledky[i, 9] <- auc(data3$Default[a == i], predict.glm(modelcv, data3[a == i, ], "response"))
  modelcv <- glm(Default~., binomial('probit'), data3[a != i, ]) # probit model
  cvvysledky[i, 4] <- auc(data3$Default[a != i], modelcv$fitted.values)
  cvvysledky[i, 10] <- auc(data3$Default[a == i], predict.glm(modelcv, data3[a == i, ], "response"))
  modelcv <- nnet(Default~., data4[a != i, ], decay = 2, size = 15, maxit = 5000, trace = FALSE) # neural network
  cvvysledky[i, 5] <- auc(data4$Default[a != i], as.numeric(modelcv$fitted.values))
  cvvysledky[i, 11] <- auc(data4$Default[a == i], as.numeric(predict(modelcv, data4[a == i, ])))
  modelcv <- svm(Default~., data4[a != i, ], scale = FALSE, cost = .1) # SVM model
  cvvysledky[i, 6] <- auc(data4$Default[a != i], modelcv$fitted)
  cvvysledky[i, 12] <- auc(data4$Default[a == i], predict(modelcv, data4[a == i, ]))
  modelcv <- suppressWarnings(randomForest(Default~., data4[a != i, ], nodesize = 50)) # random forest model
  cvvysledky[i, 7] <- auc(data4$Default[a != i], predict(modelcv, data4[a != i, ]))
  cvvysledky[i, 13] <- auc(data4$Default[a == i], predict(modelcv, data4[a == i, ]))
  rm(modelcv)
}
colnames(cvvysledky) <- c("Fold", "Lin_in", "Log_in", "Pro_in", "Neur_in", "SVM_in", "For_in",
                          "Lin_out", "Log_out", "Pro_out", "Neur_out", "SVM_out", "For_out")
cvvysledky <- data.frame(cvvysledky)
cvvysledky # cross-validation results