This is the code for the Pottery example as in Section 4.1 of Advances in dendrogram seriation for application to visualization, D. Earle and C. Hurley
if (!("HSAUR2" %in% rownames(installed.packages()))) install.packages("HSAUR2")
library(HSAUR2)
data(pottery)
sitename <- c("Gloucester","Llanedryn","Caldicot","IsleThorns","AshleyRails")[pottery[,"kiln"]]
pottery$Site <- factor(sitename)
reg <- c("Gloucester","Wales","Wales","NewForest","NewForest")[pottery[,"kiln"]]
pottery$Region <- factor(reg)panelMerit <- function(dats,clus,...){
gmean <- aggregate(dats,list(clus),mean)[,-1]
nv <- ncol(dats)
dv <- matrix(0,nv,nv)
for (i in 2:nv) {
for (j in 1:(i-1)){
cij <- gmean[,c(i,j)]
dv[i,j] <- sum(dist(cij))
dv[j,i] <- dv[i,j]
}
}
dv
}
# copied from PairZiz::pcp to avoid dependency
pcp <-
function (data, order = NULL, panel.colors = NULL, col = 1, lty = 1,
horizontal = TRUE, mar = NULL, scale=TRUE,axis.width=0,axis.grid.col="grey70",connect=TRUE,...)
{
if (is.null(mar))
if (horizontal == TRUE)
mar <- c(2, 2, 2, 2) + 0.1
else mar <- c(2, 5, 2, 2) + 0.1
par("mar"=mar)
if (!is.null(order)) {
axis.labels <- colnames(data)[order]
data <- data[, order]
if (is.matrix(panel.colors))
panel.colors <- panel.colors[order, order]
}
else axis.labels <- colnames(data)
if (is.matrix(panel.colors))
panel.colors <- panel.colors[col(panel.colors) == row(panel.colors) + 1]
if (is.vector(panel.colors))
if (ncol(data) - 1 != length(panel.colors))
stop("dimensions do not match")
if (scale==TRUE) x <- apply(data, 2, function(x) (x - min(x))/(max(x) - min(x)))
else x <- data
p <- ncol(x)
indx <- 1:p
if ((length(axis.width)==1) && (axis.width == 0))
bx <- x
else {
bx <- x[,rep(1:p,times=rep(2,times=p))]
if (length(axis.width)==1)
indx <- as.vector(sapply(indx,function(x)c(x-axis.width/2,x+axis.width/2)))
else indx <- as.vector(t(cbind(indx-axis.width/2,indx+axis.width/2)))
}
linesr <- range(bx)
pts <- rep(1,length.out=p)
if (horizontal == TRUE) {
matplot(indx, t(bx), xlab = "", ylab = "", axes = FALSE,
type = "n", xaxs="i",pch=pts,...)
axis(1, at = 1:p, labels = axis.labels)
if (!(is.null(panel.colors)))
for (i in 1:(p - 1)) rect(i, 0, i + 1, 1, lty = 0,
col = panel.colors[i])
if (!is.null(axis.grid.col))
for (i in 1:p) lines(c(i, i), linesr, col = axis.grid.col)
if (connect)
matpoints(indx, t(bx), type = "l", col = col, lty = lty, pch=pts,...)
else if (axis.width == 0)
matpoints(indx, t(bx), col = col, pch=pts,...)
else for (i in seq(1,length(indx),2))
matpoints(indx[i:(i+1)], t(bx[,i:(i+1)]), type = "l", col = col, lty = lty, pch=pts,...)
}
else {
matplot(t(bx), rev(indx), xlab = "", ylab = "", axes = FALSE,
type = "n",yaxs="i",pch=pts, ...)
axis(2, at = p:1, labels = axis.labels, las = 2)
if (!(is.null(panel.colors)))
for (i in 1:(p - 1)) rect(0, i, 1, i + 1, lty = 0,
col = panel.colors[p - i])
if (!is.null(axis.grid.col))
for (i in 1:p) lines(linesr, c(i, i), col = axis.grid.col)
if (connect)
matpoints(t(bx), rev(indx), type = "l", col = col, lty = lty, pch=pts, ...)
else if (axis.width == 0)
matpoints(t(bx), rev(indx), col = col, pch=pts,...)
else for (i in seq(1,length(indx),2))
matpoints(t(bx[,i:(i+1)]), rev(indx[i:(i+1)]), type = "l", col = col, lty = lty, pch=pts, ...)
}
invisible()
}The first heatmap uses the cases in the given order, which is already grouped by kiln, or equivalently site. The second heatmap is improved using DendSer with costBar.
The stars plot shows glyphs representing the pottery pieces in the same order as the second heatmap, with glyphs coloured by cluster membership in a 3-cluster solution.
cols <- hcl.colors(12,"Blues" )
par(mar=c(1,1,1,1))
par(mfrow=c(1,3))
od <- c(2,9,6,1,3,7,4,5,8) # random order of numeric variables
dat <- scale(pottery[,od])
d <- dist(dat)
plotAsColor(as.matrix(d), col=cols, main="",rank=TRUE) # the rank option ranks the distances before maping to colour
h <- hclust(d, method="average")
o <- DendSer(h, d, costBAR)
plotAsColor(as.matrix(d), o, col=cols, main="",rank=T) # Figure 9(a)
clus <- cutree(h, 3)
gcols <- c("#89b7e5","#B2F0A2","red3")
stars(dat[o,], col.stars=gcols[clus[o]], lwd=.05,labels=NULL, cex=1,radius=F) # Figure 9(b)The first ordering using the random ordering of variables given by
vector od. Colours represents the 3-cluster solution, as in
the stars plot. The second PCP uses an ordering of variables calculated
to highlight cluster separation. Each pair of variables is awarded a
merit measure representing the degree of cluster separation for those
two variables. The costLPL picks a path with high merit pairs, and also
places high merit pairs early in the ordering.
# Order vars to highlight cluster separation
dats <- apply(dat, 2, function(x) (x - min(x))/(max(x) -
min(x)))
merit <- panelMerit(dats,clus)
ov<- dser(as.dist(max(merit)-merit),cost=costLPL)
pcp(dat[,ov], col=gcols[clus], horiz = TRUE, mar=c(3,2,1,2),
main="", xaxs = "i") # Figure 10(b)The second PCP emphasizes cluster separation. Panels showing high separation of the regions (clusters) appear at the beginning and panels showing less separation are at the end. It is evident that pots from Hampshire (shown in dark red) contain the lowest levels of MgO, Fe2O3, K2O MnO and CaO. Pots from Wales (light green) contain the highest levels of MgO, K2O and MnO, and the lowest levels of Al2O3, while Gloucester pots (blue) have the highest levels of Fe2O3 and CaO, and medium levels of MgO, K2O and MnO. The first PCP which has arbitrarily ordered variables requires much closer inspection to extract this information.