# Statistical learning systems - exercises based on the R package
# (original title: "Statystyczne systemy uczące się - Ćwiczenia w oparciu
# o pakiet R").
# Authors: Jan Ćwik, Jan Mielniczuk
# Publisher: Oficyna Wydawnicza Politechniki Warszawskiej
# Last updated: 16 October 2009
#
# Script outline:
#   1. Regression trees (rpart) on the fitness data, compared with lm().
#   2. Classification trees (rpart) on MASS::Cars93 with a recoded car type.
#   3. Start of a binomial simulation (n trials, success probability p).

library(rpart)
library(MASS)
library(party)

# ---- 1. Fitness data: regression trees --------------------------------

# Read the data from the working directory. The interactive file picker is
# only a fallback, so the script also runs non-interactively.
fitness_path <- "fitness.txt"
if (!file.exists(fitness_path) && interactive()) {
  fitness_path <- file.choose()
}
fitness <- read.table(fitness_path, header = TRUE)
dim(fitness)
pairs(fitness)

fitness.rpart <- rpart(Oxygen ~ ., data = fitness, cp = 0.01, minsplit = 2)
if (interactive()) {
  help("rpart")
}
fitness.rpart
plot(fitness.rpart, uniform = TRUE, margin = 0.1)
text(fitness.rpart)

# Medians of all explanatory variables; the response is excluded by name
# rather than by column position.
print("wektor median zmiennych objasniajacych")
x0 <- vapply(
  fitness[, names(fitness) != "Oxygen", drop = FALSE],
  median,
  numeric(1)
)
print(x0)
print("pedykcja dla wektora median")
predict(fitness.rpart, newdata = data.frame(t(x0)))
predict(fitness.rpart)
summary(fitness.rpart)

par(mfrow = c(1, 1))
# Prediction for an observation with Age and RunTime missing.
predict(
  fitness.rpart,
  newdata = data.frame(
    Age = NA_real_, Weight = 69, RunTime = NA_real_,
    RestPulse = 57, RunPulse = 168, MaxPulse = 172
  )
)
plot(fitness.rpart, uniform = TRUE, margin = 0.1)
text(fitness.rpart)

# Larger tree (smaller cp), its complexity table, and pruning back.
set.seed(289)
fitness2.rpart <- rpart(Oxygen ~ ., data = fitness, cp = 0.001, minsplit = 2)
fitness2.rpart
printcp(fitness2.rpart)
printcp(fitness.rpart)
plotcp(fitness2.rpart)
fitness3.rpart <- prune(fitness2.rpart, cp = 0.03)
fitness12.rpart <- rpart(Oxygen ~ ., data = fitness, cp = 0.03)
print(fitness3.rpart)
print(fitness12.rpart)

# Linear model on the same data; compare residual sums of squares.
gg <- lm(Oxygen ~ ., data = fitness)
print(summary(gg))
print("RSS dla lm")
print(sum(residuals(gg)^2))
print("RSS dla fitness.rpart")
print(sum(residuals(fitness.rpart)^2))
print("RSS dla fitness3.rpart")
print(sum(residuals(fitness3.rpart)^2))

# Two-predictor tree, so the partition can be drawn in the plane.
fitness5.rpart <- rpart(
  Oxygen ~ RunTime + Age,
  data = fitness, cp = 0.00001, minsplit = 2
)
print(fitness5.rpart)
plot(fitness5.rpart, uniform = TRUE, margin = 0.1)
text(fitness5.rpart)

plot(
  fitness$RunTime, fitness$Age,
  xlim = c(8.0, 14.0), ylim = c(37, 58),
  xlab = "RunTime", ylab = "Age"
)
# Hard-coded cut points; presumably read off the printed fitness5.rpart
# splits - verify if the data or the rpart settings change.
split_run_time <- c(8.935, 12.29, 8.64, 10.9, 11.015)
segments(split_run_time, 37, split_run_time, 58)
segments(8.935, 53, 10.9, 53)

# Fitted surface of fitness5.rpart over a len x len grid.
len <- 60
xp <- seq(8.0, 15.0, length.out = len)
yp <- seq(35, 60, length.out = len)
siatka <- expand.grid(RunTime = xp, Age = yp)
zp <- predict(fitness5.rpart, siatka)
persp(
  xp, yp, matrix(zp, len),
  theta = 150, phi = 30, col = "lightblue", zlim = c(37, 60),
  xlab = "RunTime", ylab = "Age", zlab = "Oxygen"
)

# ---- 2. Cars93: classification trees ----------------------------------

data(Cars93)
dim(Cars93)
names(Cars93)
table(Cars93$Type)

# Recode Type into four classes:
#   "SP" = Sporty, "M" = Small, "D" = Large or Van, "SR" = everything else.
typ3 <- ifelse(
  Cars93$Type == "Sporty", "SP",
  ifelse(
    Cars93$Type == "Small", "M",
    ifelse(Cars93$Type %in% c("Large", "Van"), "D", "SR")
  )
)
table(typ3)
# Store the class as a factor explicitly: since R 4.0 data.frame() no longer
# converts strings to factors, and the classifiers need a factor response.
Cars93 <- data.frame(Cars93, typ = factor(typ3))
dim(Cars93)
names(Cars93)

set.seed(222)
cars.tree <- rpart(
  typ ~ Length + Weight + EngineSize + Horsepower + RPM,
  data = Cars93, cp = 0.0001, minsplit = 5
)
# Zero margins suit the tree drawing only; restore the previous settings
# afterwards so later plots keep room for their axes.
old_par <- par(mfrow = c(1, 1), mar = c(0, 0, 0, 0), pty = "m")
plot(cars.tree, compress = TRUE, uniform = TRUE, branch = 0.4, margin = 0.2)
text(cars.tree, use.n = TRUE, cex = 0.7)
par(old_par)
print(summary(cars.tree), digits = 4)
printcp(cars.tree)
plotcp(cars.tree)

# Alternative fits that were assigned to cars2zmienne.tree and immediately
# overwritten; kept here for reference only.
# NOTE(review): the `typ ~ .` formula includes Type (which defines typ) and
# factors with very many levels, so it is likely very slow - confirm before
# re-enabling.
#   cars2zmienne.tree <- ctree(typ ~ ., data = Cars93[, -3])
#   cars2zmienne.tree <- rpart(typ ~ ., data = Cars93, cp = 0.03,
#                              minsplit = 5)
cars2zmienne.tree <- rpart(
  typ ~ Weight + EngineSize,
  data = Cars93, cp = 0.03, minsplit = 5
)
print(cars2zmienne.tree)
old_par <- par(mar = c(0, 0, 0, 0))
plot(
  cars2zmienne.tree,
  compress = TRUE, uniform = TRUE, branch = 0.4, margin = 0.2
)
text(cars2zmienne.tree, use.n = TRUE, cex = 0.7)
par(old_par)
print(summary(cars2zmienne.tree), digits = 4)

# Class labels in the (EngineSize, Weight) plane. EngineSize is on the
# x axis and Weight on the y axis, so the axis labels follow that order;
# type = "n" only sets up the frame.
plot(
  c(1, 5.70), c(1695, 4105),
  type = "n", xlab = "EngineSize", ylab = "Weight"
)
text(Cars93$EngineSize, Cars93$Weight, Cars93$typ, cex = 0.6)
# Hard-coded partition boundaries; presumably taken from the printed
# cars2zmienne.tree splits - verify if the model changes.
segments(1.95, 1695, 1.95, 4105)
segments(1.95, 3460, 5.70, 3460)
segments(3.2, 1695, 3.2, 3460)
segments(1.95, 3700, 5.70, 3700)
segments(3.25, 3460, 3.25, 3700)
segments(3.25, 3630, 5.70, 3630)

# ---- 3. Binomial simulation -------------------------------------------

# Five draws of the number of successes in n trials with probability p.
p <- 0.51
n <- 1000
rbinom(5, n, p)
library(randomForest)

# Binomial simulation, continued (n and p are defined earlier in the
# script): does the number of successes exceed 500? Five single
# experiments, then the proportion over one million repetitions.
rbinom(5, n, p) > 500
prop.table(table(rbinom(1000000, n, p) > 500))

# Random forest for the recoded car type on the complete Cars93 rows.
sam93 <- na.omit(Cars93)
# List the factor columns that have more than 32 levels.
which(vapply(sam93, function(y) nlevels(y) > 32, logical(1)))
# Predictors: every column except Model, Type, Make and the response typ
# (selected by name rather than by position).
predictor_names <- setdiff(names(sam93), c("Model", "Type", "Make", "typ"))
# The response is coerced to a factor so that a classification forest is
# grown even when typ is stored as character.
# NOTE(review): mtry is derived from the column count of sam93, not from
# the number of predictors actually passed in x - confirm this is intended.
las <- randomForest(
  y = factor(sam93$typ),
  x = sam93[, predictor_names],
  mtry = sqrt(ncol(sam93)),
  ntree = 2500
)