set.seed(38166) alpha<-12 beta<-11:20 n<-20 baza<-runif(n) X<-rep(2*baza,11) +rnorm(n*11) dim(X)<-c(n,11) symu<-1000 modele<-matrix(nrow=symu, ncol=11) for(i in 1:symu) { y<-alpha*X[,1]+X[,-1]%*%beta+rnorm(n, sd=.5) for(j in 1:11) { modele[i,j]<-glm(y~X[,1:j])$coef[2] } print(i) } ahat<-numeric(11) sdhat<-numeric(11) for(i in 1:11) { ahat[i]<-mean(modele[,i]) sdhat[i]<-sd(modele[,i]) } par(mfrow=c(2,1)) plot(ahat) abline(h=12, col='red') grid() plot(sdhat) abline(h=0, col='red') grid() ##################################### set.seed(38166, kind = "default", normal.kind = "default") library(MASS) alpha<-25 beta<-12 gamma<-23 slope_mnk<-c() slope_orr<-c() n<-25 baza<-runif(n,min=0,max=10) x<-2*baza+runif(n,min=0,max=10) z<-2*baza+runif(n,min=0,max=10) rm(baza) for(i in 1:1000) { eps<-rnorm(n, mean=0, sd=5) y<-alpha+beta*x+gamma*z+eps slope_mnk[i]<-lm(y~x+z)$coef[2] orr<-lm.ridge(y~x+z, lambda=1.5) slope_orr[i]<-orr$coef[1]/orr$scales[1] print(i) } par(mfrow=c(2,1)) hist(slope_mnk, col='orange', ylim=c(0, 300), breaks=10) hist(slope_orr, col='orange', ylim=c(0, 300), breaks=10) mean(slope_mnk) mean(slope_orr) sd(slope_mnk) sd(slope_orr) dane<-read.table('http://www-stat.stanford.edu/~tibs/ElemStatLearn/datasets/prostate.data', header=TRUE) dane_train<-dane[which(dane[,10]==TRUE), ] dane_train<-dane_train[,-10] dane_valid<-dane[which(dane[,10]==FALSE), ] dane_valid<-dane_valid[,-10] ols<-lm(lpsa~., data=dane_train) # cor(dane) pairs(dane, pch=20) ols_rsq_train<-1-sum(residuals(ols)^2)/(sum(dane_train$lpsa^2) -dim(dane_train)[1]*mean(dane_train$lpsa)^2) ols_rsq<-1-sum((dane_valid$lpsa-predict(ols, newdata=dane_valid))^2)/(sum(dane_valid$lpsa^2) -dim(dane_valid)[1]*mean(dane_valid$lpsa)^2) library(leaps) modele_ex<-regsubsets(dane_train[,-7], dane_train[,7], method='exhaustive', nvmax=10) modele_back<-regsubsets(dane_train[,-7], dane_train[,7], method='backward', nvmax=10) modele_for<-regsubsets(dane_train[,-7], dane_train[,7], method='forward', nvmax=10) plot(modele_back, scale='adjr2', col=rainbow(200)) plot(modele_for, scale='adjr2', col=rainbow(200)) plot(modele_ex, scale='adjr2', col=rainbow(200)) ols_for<-step(ols, direction='forward') ols_back<-step(ols, direction='backward') ols_both<-step(ols, direction='both') ols_rsq_for<-1-sum((dane_valid$lpsa-predict(ols_for, newdata=dane_valid))^2)/(sum(dane_valid$lpsa^2) -dim(dane_valid)[1]*mean(dane_valid$lpsa)^2) ############ #Ridge regression d.mean=apply(dane_train,2,mean) d.mean dane.s=sweep(dane_train,2,d.mean,"-") mean(dane.s) d.sd=apply(dane_train,2,sd) d.sd dane.s=sweep(dane.s,2,d.sd,"/") sd(dane.s) dane.s<-dane.s[-10] d.meanv<-apply(dane_valid,2,mean) d.meanv dane.sv=sweep(dane_valid,2,d.mean,"-") mean(dane.sv) d.sdv=apply(dane_valid,2,sd) d.sdv dane.sv=sweep(dane.sv,2,d.sd,"/") sd(dane.sv) dane.sv<-dane.sv[-10] library('MASS') ridge<-lm.ridge(lpsa~., data=dane.s, lambda=seq(0,10000, by=1)) edf=function(lambda){ dekompozycja=svd(dane.s) d=dekompozycja$d wynik=0 for (i in 1:9){ wynik=wynik+d[i]/(d[i]+lambda) } wynik } par(mfrow=c(1,1)) plot(coef(ridge)[,1]~edf(seq(0,10000, by=1)), type='l', col=1, ylim=c(-.7,.7), ylab='Współczynnik', xlab=expression(df(lambda)), xlim=c(0,8)) for(i in 2:8) { lines(coef(ridge)[,i]~edf(seq(0,10000, by=1)), type='l', col=i) } grid() df<-edf(seq(0,10000, by=1)) r2<-numeric(length(df)) X<-cbind(1,as.matrix(dane.s[,-9])) for(i in 1:length(df)){ pred<-X%*%coef(ridge)[i,] RSK<-sum((pred-dane.s[,9])^2) OSK=sum(dane.s[,9]^2)-length(dane.s[,9])*mean(dane.s[,9])^2 r2[i]<-1-RSK/OSK print(i) } plot(r2~df, type='l', col='green', lty=2);grid() abline(v=df[which.max(r2)],lty=2,col="mediumpurple") r2v<-numeric(length(df)) Xv<-cbind(1,as.matrix(dane.sv[,-9])) for(i in 1:length(df)){ pred<-Xv%*%coef(ridge)[i,] RSK<-sum((pred-dane.sv[,9])^2) OSK=sum(dane.sv[,9]^2)-length(dane.sv[,9])*mean(dane.sv[,9])^2 r2v[i]<-1-RSK/OSK print(i) } lines(r2v~df, type='l', col='green', lwd=2.5);grid() abline(v=df[which.max(r2v)],lty=2, lwd=2,col="mediumpurple") plot(ridge$coef[1,]~edf(seq(0,10000, by=1)), type='l', col='blue', ylim=c(-.7,.7), ylab='Współczynnik', xlab=expression(df(lambda)), xlim=c(0,8)) for(i in 2:8) { lines(ridge$coef[i,]~edf(seq(0,10000, by=1)), type='l', col='blue') } grid() abline(v=df[which.max(r2v)],lty=2, lwd=2,col="mediumpurple") df_gold<-df[which.max(r2v)] which(df==df_gold) l_opt<-seq(0,10000, by=1)[which(df==df_gold)] ridge_opt<-lm.ridge(lpsa~., data=dane.s, lambda=l_opt) ridge_opt_r2<-max(r2v) ###### Lasso library('lasso2') lasso<-l1ce(lpsa~. , data=dane.s, standardize=FALSE, bound=1) ols<-glm(lpsa~., data=dane.s) lasso<-l1ce(lpsa~. , data=dane.s, standardize=FALSE, bound=seq(0,1,by=.01)) plot(l1ce(lpsa~. , data=dane.s, standardize=FALSE, bound=seq(0,1, by=.01))) r2<-numeric(length(seq(0,1, by=.01))) X<-cbind(1,as.matrix(dane.s[,-9])) for(i in 1:length(seq(0,1, by=.01))){ pred<-X%*%coef(lasso)[i,] RSK<-sum((pred-dane.s[,9])^2) OSK=sum(dane.s[,9]^2)-length(dane.s[,9])*mean(dane.s[,9])^2 r2[i]<-1-RSK/OSK print(i) } plot(r2~seq(0,1,by=.01), col='blue', lty=2, type='l') r2v<-numeric(length(seq(0,1, by=.01))) Xv<-cbind(1,as.matrix(dane.sv[,-9])) for(i in 1:length(seq(0,1, by=.01))){ pred<-Xv%*%coef(lasso)[i,] RSK<-sum((pred-dane.sv[,9])^2) OSK<-sum(dane.sv[,9]^2)-length(dane.sv[,9])*mean(dane.sv[,9])^2 r2v[i]<-1-RSK/OSK print(i) } lines(r2v~seq(0,1,by=.01), col='blue', lty=1, type='l') seq(0,1,by=.1)[which.max(r2v)] abline(v=seq(0,1,by=.01)[which.max(r2v)], col='red', lwd=3) which.max(r2v) seq(0,1, by=.01)[which.max(r2v)] lasso_opt<-l1ce(lpsa~. , data=dane.s, standardize=FALSE, bound=.49)