file.name2<-'http://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data' file.name<-'http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data-numeric' credit.approval<-read.table(file.name, sep=",", dec=".", na.strings='?') credit.approval->backup credit.approval<-data.set credit.approval<-na.omit(credit.approval) #caveat! credit.approval ols<-glm(target~.+factor(V3), data=credit.approval) #Spinogram with(plot(target~V4), data=credit.approval) summary(credit.approval$V1) summary(as.factor(credit.approval$V1)) with(plot(target~V1), data=credit.approval) with(plot(as.factor(target)~V1), data=credit.approval) par(mfrow=c(2,2)) with(plot(target~V1, main="no jitter", col="orange"), data=credit.approval) grid() with(plot(jitter(target)~V1, main="jitter target", col="orange"), data=credit.approval) grid() with(plot(target~jitter(V1), main="jitter V1", col="orange" ), data=credit.approval) grid() with(plot(jitter(target)~jitter(V1), main="both jittered", col="orange"), data=credit.approval) grid() par(mfrow=c(2,1)) with(plot(jitter(target)~jitter(V1), main="both jittered", col="orange"), data=credit.approval) grid() with(plot(as.factor(target)~V1), data=credit.approval) par(mfrow=c(1,1)) plot(credit.approval$target~predict(ols), xlab="score", ylab="klasa") grid() plot(jitter(credit.approval$target)~predict(ols), xlab="score", ylab="klasa", col=ifelse(credit.approval$target==1,'red','green')) abline(a=0, b=1, col="orange", lwd=3, lty=3) grid() plot(residuals(ols)~predict(ols), xlab="score", ylab="reszty") grid() logit<-glm(target~., data=credit.approval, family=binomial('logit')) predict(logit) data.frame(predict(logit, type='response'), 1/(1+exp(-predict(logit)))) plot(predict(logit, type='response')~predict(logit), col=ifelse(credit.approval$target==1,'red','green')) grid() abline(a=.5, b=0, col="orange", lwd=3, lty=2) ols<-glm(target~., data=credit.approval) logit<-glm(target~., data=credit.approval, family=binomial("logit")) probit<-glm(target~., data=credit.approval, family=binomial("probit")) ols_step<-step(ols) logit_step<-step(logit) probit_step<-step(probit) p1<-hist(predict(ols, type="response" )[credit.approval$target==1]) p2<-hist(predict(ols, type="response" )[credit.approval$target==2]) plot(p1, col=rgb(0,1,0,1/4), xlim=c(min(predict(ols)),max(predict(ols))), ylim=c(0,150), ylab="częstość", xlab="score", main="") plot(p2, col=rgb(1,0,0,1/4), add=TRUE) koszt <- function(prog) { decyzja <- ifelse(predict(ols)>prog,1,0) decyzja<-factor(decyzja, levels=c(0,1)) pt <- prop.table(table(decyzja,realizacja=credit.approval$target)) pt[1,2] + pt[2,1] # return(pt) } koszt(.5) x <- seq(0, 1, length.out = 100) sapply(x,koszt) plot(sapply(x,koszt)~x, type='l', lwd=3, col="mediumpurple", ylab="koszt", xlab='próg odcięcia') grid() mini<-optimize(koszt, c(0,1), tol=.0000001) points(mini$minimum, mini$objective, pch=19, col="orange") abline(v=mini$minimum, lty=2, lwd=3, col="orange") koszty<-matrix(data=c(0,1,5,0), nrow=2, ncol=2) koszt_macierz <- function(prog, macierz) { decyzja <- ifelse(predict(ols)>prog,1,0) decyzja<-factor(decyzja, levels=c(0,1)) pt <- table(decyzja,realizacja=credit.approval$target)*koszty pt[1,2] + pt[2,1] } mini2<-optimize(koszt_macierz, c(0,1), tol=.0000001, macierz=koszty) ######Podział na zbiory dziel<-function(dane, frakcje) { frakcja_train<-frakcje[1] frakcja_valid<-frakcje[2] frakcja_test<-frakcje[3] losuj<-runif(dim(dane)[1]) assign<-ifelse(losuj<frakcja_train, 'train', ifelse(losuj<frakcja_train+frakcja_valid, 'valid', 'test')) assign<-factor(assign, levels=c('train', 'valid', 'test')) return(assign) } set.seed(38166) przydzial<-dziel(credit.approval, c(.6,.2,.2)) set.train<-credit.approval[which(przydzial=='train'),] set.valid<-credit.approval[which(przydzial=='valid'),] set.test<-credit.approval[which(przydzial=='test'),] ols2<-glm(target~., data=set.train) logit2<-glm(target~., data=set.train, family=binomial("logit")) probit2<-glm(target~., data=set.train, family=binomial("probit")) ols2_step<-step(ols2) logit2_step<-step(logit2) probit2_step<-step(probit2) predict(ols2, newdata=set.valid, type='response') predict(logit2, newdata=set.valid, type='response') predict(probit2, newdata=set.valid, type='response') koszt2 <- function(prog, model, data) { decyzja <- ifelse(predict(model, newdata=data, type='response')>prog,1,0) decyzja<-factor(decyzja, levels=c(0,1)) pt <- prop.table(table(decyzja,data$target)) pt[1,2] + pt[2,1] } optymalne_odciecie<-c(optimize(koszt2, c(0,1), model=ols2, data=set.train)$minimum, optimize(koszt2, c(0,1), model=ols2_step, data=set.train)$minimum, optimize(koszt2, c(0,1), model=logit2, data=set.train)$minimum, optimize(koszt2, c(0,1), model=logit2_step, data=set.train)$minimum, optimize(koszt2, c(0,1), model=probit2, data=set.train)$minimum, optimize(koszt2, c(0,1), model=probit2_step, data=set.train)$minimum) koszt_train<-c( koszt2(optymalne_odciecie[1], ols2, data=set.train), koszt2(optymalne_odciecie[2], ols2_step, data=set.train), koszt2(optymalne_odciecie[3], logit2, data=set.train), koszt2(optymalne_odciecie[4], logit2_step, data=set.train), koszt2(optymalne_odciecie[5], probit2, data=set.train), koszt2(optymalne_odciecie[6], probit2_step, data=set.train)) koszt_valid<-c( koszt2(optymalne_odciecie[1], ols2, data=set.valid), koszt2(optymalne_odciecie[2], ols2_step, data=set.valid), koszt2(optymalne_odciecie[3], logit2, data=set.valid), koszt2(optymalne_odciecie[4], logit2_step, data=set.valid), koszt2(optymalne_odciecie[5], probit2, data=set.valid), koszt2(optymalne_odciecie[6], probit2_step, data=set.valid)) podsumowanie<-data.frame(model=c("OLS","OLS Step", "Logit", "Logit Step", "Probit", "Probit Step" ),cutoff=optymalne_odciecie, KosztTrain=koszt_train, KosztValid=koszt_valid)