# Author: Asia Franaszek
 
 
# ---- First look at the iris data ----
# Make the built-in data set available in the workspace.
data("iris")

# Number of rows and columns.
c(nrow(iris), ncol(iris))

# Column names.
colnames(iris)

# Per-column summaries.
summary(iris)

# Scatterplot matrix of the first four columns; the integer codes of the
# Species factor are used as colour indices, one colour per species.
pairs(iris[, -5], col = as.integer(iris$Species))
 
 
 
# ---- Distances and dissimilarities ----
# Distances between all pairs of rows, computed on the four measurement
# columns with dist()'s default method.
odl <- dist(iris[, 1:4])

# dist() is meant for numeric data; a character vector does not give a
# usable distance, which motivates daisy() below.
dist(c('A', 'B'))

# daisy() comes from the 'cluster' package (the original call was misspelled
# as libary(), so the package was never attached).
library(cluster)

# Gower dissimilarity for a single categorical variable.
# stringsAsFactors = TRUE keeps the column a factor on R >= 4.0 as well;
# NOTE(review): daisy() is documented for numeric/factor columns -- confirm
# whether your version of 'cluster' also accepts plain character columns.
daisy(as.data.frame(cbind(c('A', 'C', 'C', 'D', 'D', 'D', 'D')),
                    stringsAsFactors = TRUE),
      metric = 'gower')

# Two categorical variables; the second column is declared as 'ordratio'
# through the `type` argument.
daisy(as.data.frame(cbind(c('A', 'C', 'C', 'D', 'D', 'D'),
                          c('A', 'C', 'C', 'D', 'D', 'A')),
                    stringsAsFactors = TRUE),
      type = list(ordratio = 2))
 
 
 
# ---- k-means with three clusters on the four measurement columns ----
klastry <- kmeans(iris[, 1:4], centers = 3)

help("kmeans")

# Scatterplot matrix coloured by cluster id. All five columns are plotted
# here, so Species shows up as an extra panel.
pairs(iris[, 1:5], col = klastry[["cluster"]])

# Same kind of plot coloured by the species factor codes, for comparison.
pairs(iris[, -5], col = as.integer(iris$Species))

help("kmeans")

# Components stored in the fitted object.
names(klastry)

# Cluster id assigned to every row.
klastry[["cluster"]]

# Total sum of squares.
klastry[["totss"]]

# Within-cluster sums of squares: one value per cluster, their sum, and the
# stored total, which should agree with that sum.
klastry[["withinss"]]
sum(klastry[["withinss"]])
klastry[["tot.withinss"]]

# Between-cluster sum of squares.
klastry[["betweenss"]]

# Within + between, printed next to the total for comparison.
klastry[["tot.withinss"]] + klastry[["betweenss"]]
klastry[["totss"]]
 
 
 
 
# ---- Clusters versus species ----
# Scatterplot matrices: first coloured by k-means cluster, then by species.
pairs(iris[, -5], col = klastry[["cluster"]])
pairs(iris[, -5], col = as.integer(iris$Species))

# Back to one panel per page.
par(mfrow = c(1, 1))

# Petal length against sepal length, again by cluster and then by species.
plot(iris$Petal.Length, iris$Sepal.Length, col = klastry[["cluster"]])
plot(iris$Petal.Length, iris$Sepal.Length, col = as.integer(iris$Species))

# Add the cluster centres (columns 3 and 1 of the centre matrix) to the
# most recent plot, i.e. the species-coloured one.
srodki <- klastry[["centers"]]
points(srodki[, 3], srodki[, 1], col = "blue", pch = 19)

# Cross-tabulate species names against cluster ids.
table(as.character(iris$Species), klastry[["cluster"]])
 
 
# ---- Match cluster ids to species ----
# k-means numbers its clusters arbitrarily, so try every assignment of the
# ids 1..3 to (setosa, versicolor, virginica) and keep the one that
# disagrees with the clustering on the fewest rows.
permutacje <- list(c(1, 2, 3), c(1, 3, 2), c(2, 1, 3),
                   c(2, 3, 1), c(3, 1, 2), c(3, 2, 1))

# Position of each row's species in the fixed order used by the permutations.
species.idx <- match(iris[, 5], c("setosa", "versicolor", "virginica"))

# dopasowanie[i] = number of rows whose relabelled species differs from the
# cluster id under permutation i (smaller is better).
dopasowanie <- numeric(length(permutacje))
for (i in seq_along(permutacje)) {
  gatunki <- permutacje[[i]][species.idx]
  dopasowanie[i] <- sum(gatunki != klastry$cluster)
}

# Best permutation (the first one in case of ties).
a <- permutacje[[which.min(dopasowanie)]]
a

# Relabel every row with the best permutation. The original started from
# the length-one vector a[1], which left all setosa rows but the first as NA.
gatunki_final <- a[species.idx]

table(gatunki_final, klastry$cluster)

# In a plot: colour 3 where the cluster agrees with the relabelled species,
# colour 2 where it does not (TRUE/FALSE + 2).
pairs(iris[, 1:4], col = (klastry$cluster == gatunki_final) + 2)
 
# ---- Best agreement over all relabellings (matrix version) ----
# All six orderings of 1..3, one per row. The original first called
# permutations() after library(gregmisc) and then immediately overwrote the
# result with this matrix; that obsolete dependency is dropped here.
# NOTE(review): permutations() now lives in 'gtools' -- confirm that
# gtools::permutations(n = 3, r = 3) gives the same rows if a generated
# matrix is preferred.
perm <- matrix(c(1, 2, 3,
                 1, 3, 2,
                 2, 1, 3,
                 2, 3, 1,
                 3, 1, 2,
                 3, 2, 1), 6, byrow = TRUE)

# Integer code of each row's species.
species.code <- as.numeric(iris$Species)

# For every relabelling (row of perm) count the rows on which the relabelled
# cluster id equals the species code.
hits <- vapply(
  seq_len(nrow(perm)),
  function(i) sum(perm[i, klastry$cluster] == species.code),
  integer(1)
)

# Highest number of agreeing rows over all relabellings.
best <- max(hits)

best
 
 
 
 
#### Choosing K

# 3 x 4 grid of panels: one row per y-axis column (2..4), one column per
# number of clusters (2..5). Each panel plots column 1 against the chosen
# column, coloured by a fresh k-means fit on all four measurements.
par(mfrow = c(3, 4))

# expand.grid() varies its first factor fastest, so the panels are visited
# in the same order as a loop over columns with an inner loop over k.
panel.grid <- expand.grid(k = 2:5, column = 2:4)
for (nr in seq_len(nrow(panel.grid))) {
  km <- kmeans(iris[, 1:4], panel.grid$k[nr])
  plot(iris[, c(1, panel.grid$column[nr])], col = km$cluster)
}
 
 
 
 
 
# ---- Silhouette plots for k = 2..5 ----
library(cluster)

# Standardised measurements. They are defined here because the silhouette
# code needs them; the original only created `zbior` further down the file.
zbior <- scale(iris[, 1:4])

# Squared dissimilarities between rows. They are stored under their own name
# so that the stats::dist() function is not shadowed by a variable.
sq.dist <- daisy(zbior)^2

par(mfrow = c(2, 2))
for (i in 2:5) {
  km <- kmeans(zbior, i)
  # Spell out `cluster` instead of relying on partial matching of `$cl`.
  plot(silhouette(km$cluster, sq.dist))
}

?silhouette

# Standard deviation of each measurement column; sd() itself does not accept
# a whole data frame, so it is applied column by column.
vapply(iris[, 1:4], sd, numeric(1))
 
 
 
# ---- Effect of scaling on k-means (synthetic data) ----
set.seed(12)

# Two groups of 50 draws per variable; x has sd 0.1 around means 0 and 1,
# y has sd 10 around means 0 and 100, so y dominates unscaled distances.
group.id <- rep(c(0, 1), each = 50)
x <- rnorm(100, mean = group.id, sd = 0.1)
y <- rnorm(100, mean = 100 * group.id, sd = 10)

# Shuffle each variable independently and bind them into a two-column matrix.
x.shuffled <- sample(x)
y.shuffled <- sample(y)
A <- cbind(x.shuffled, y.shuffled, deparse.level = 0)
plot(A)

# Four clusters on the raw data.
asd <- kmeans(A, centers = 4)
plot(A, col = asd$cluster)

# Four clusters after standardising the columns.
A.s <- scale(A)
asd.s <- kmeans(A.s, centers = 4)
plot(A.s, col = asd.s$cluster)
 
 
 
# ---- Standardised iris measurements ----
# Centre and scale each of the four measurement columns.
zbior <- scale(iris[, 1:4])

# Check the scaling column by column (each value should be 1). sd() on the
# whole matrix would pool all entries into one number instead.
apply(zbior, 2, sd)
 
 
 
# ---- Partitioning around medoids, three clusters ----
medoid <- pam(iris[, 1:4], k = 3)

# Cluster id of every row.
medoid[["clustering"]]

# Scatterplot matrix coloured by PAM cluster.
pairs(iris[, -5], col = medoid[["clustering"]])

# One panel per pair of measurement columns (i < j), six panels in total,
# visited in the order (1,2), (1,3), (1,4), (2,3), (2,4), (3,4).
par(mfrow = c(2, 3))
for (i in 1:3) {
  for (j in (i + 1):4) {
    plot(iris[, i], iris[, j], col = medoid[["clustering"]])
  }
}

# Size of the medoid matrix, then the medoids themselves.
dim(medoid[["medoids"]])
medoid[["medoids"]]
 
 
 
 
 
 
 
 
# Setup for the repeated k-means runs: attach 'cluster', fix the random seed
# so the runs are repeatable, and set the number of k-means fits that
# sim.km() performs per call.
library(cluster)
set.seed(1)
reps <- 100
 
# Run k-means several times and keep the fit with the smallest total
# within-cluster sum of squares.
#
# Arguments:
#   data.set  data passed unchanged to kmeans() as its first argument.
#   num       passed to kmeans() as its second argument (number of clusters).
#   n.rep     number of k-means runs; defaults to the global `reps`, so
#             existing two-argument calls behave as before.
#
# Value: the "kmeans" object with the lowest tot.withinss among the runs
# (the earliest one in case of ties).
sim.km <- function(data.set, num, n.rep = reps) {
  stopifnot(is.numeric(n.rep), length(n.rep) == 1L, n.rep >= 1)

  # Start from NULL so that a failed search can never fall through to an
  # unrelated global variable that happens to be called `best`.
  best <- NULL
  min.tot <- Inf
  for (i in seq_len(n.rep)) {
    km <- kmeans(data.set, num)
    # Always accept the first fit; afterwards only strict improvements.
    if (is.null(best) || isTRUE(km$tot.withinss < min.tot)) {
      best <- km
      min.tot <- km$tot.withinss
    }
  }
  best
}
 
# ---- Clustering the DATA4-12 table: raw variables vs. principal components ----
# Read the table from the course server, standardise every column and attach
# the eleven variable names.
data.set <- read.table("http://shazam.econ.ubc.ca/student/ramu/DATA4-12")
data.set <- setNames(
  data.frame(scale(data.set)),
  c("MORT", "INCC", "POV", "EDU1", "EDU2", "ALCC", "TOBC", "HEXC", "PHYS", "URB", "AGED")
)

# Dissimilarities on the standardised variables, then the best of the
# repeated k-means fits for 2..5 clusters, each shown as a silhouette plot.
raw.d <- daisy(data.set)
raw.km <- vector("list", 4)
par(mfrow = c(2, 2), mar = c(5, 4, 2, 2))
for (k in 2:5) {
  fit <- sim.km(data.set, k)
  raw.km[[k - 1]] <- fit
  plot(silhouette(fit$cluster, raw.d), main = "", do.clus.stat = FALSE)
}

# Ask before drawing each new page of plots.
devAskNewPage(ask = TRUE)

# Same procedure on the scores of the first two principal components
# (correlation-based PCA on all columns).
pc <- princomp(~., data = data.set, cor = TRUE)
pcs <- pc$scores[, 1:2]
pcs.d <- daisy(pcs)
pcs.km <- vector("list", 4)
par(mfrow = c(2, 2), mar = c(5, 4, 2, 2))
for (k in 2:5) {
  fit <- sim.km(pcs, k)
  pcs.km[[k - 1]] <- fit
  plot(silhouette(fit$cluster, pcs.d), main = "", do.clus.stat = FALSE)
}

# Compare the two four-cluster solutions in the plane of the first two
# components: the plotting symbol is the digit of the PCA-based cluster
# (pch 49, 50, ... are the characters "1", "2", ...), the colour is the
# cluster found on the raw variables.
par(mfrow = c(1, 1))
plot(pcs, pch = pcs.km[[3]]$cluster + 48, col = raw.km[[3]]$cluster)