"step2" = function(taxon.subset, dis.table) { original.genus <- substring(as.character(taxon.subset[, "species"]), 1, regexpr("[^a-zA-Z]", as.character(taxon.subset[, "species"])) - 1) taxon.level <- unique(original.genus) taxon.seq.new <- vector("list", length(taxon.level)) names(taxon.seq.new) <- taxon.level misidentifiedInStep2 <- NULL for(i in 1:length(unique(taxon.level))) { #At this stage I need to extract the dimnames for data sets with 2 or less #members for insertion into the taxon.seq.new list if(sum(original.genus == taxon.level[[i]]) <= 2) { # one and two samples cannot apply hierarchical clustering taxon.seq.new[[i]] <- dimnames(taxon.subset)[[1]][original.genus == taxon.level[[i]]] } if(sum(original.genus == taxon.level[[i]]) > 2) { #extract subset for each family ?? genus or family temp <- dis.table[dimnames(taxon.subset)[[1]][original.genus == taxon.level[[i]]], dimnames(taxon.subset)[[1]][original.genus == taxon.level[[i]]]] temp.bak <- temp #This statement is required to avoid problems when all of the values #are 0, which causes the clustering steps to fail if(sum(temp) == 0) { taxon.seq.new[[i]] <- as.character(dimnames(temp)[[1]]) next } temp.order <- matrix(0, nrow(temp), ncol(temp)) while (TRUE) { temp <- abs(jitter(temp, factor = 0.00001)) int <- F for (j in 1:nrow(temp)) { temp.order[j,] <- rank(as.matrix(temp[j, ])) } tf <- unique(as.integer(temp.order) == temp.order) print(tf) if (length(tf) == 1) { int <- T } if (length(tf) == 2) { int <- F } if (int) break } print("inside step2") temp <- temp.bak dimnames(temp.order)[[2]] <- dimnames(temp)[[2]] dimnames(temp.order)[[1]] <- dimnames(temp)[[1]] temp.reorder <- NULL for(j in 1:ncol(temp.order)) { temp.reorder <- c(temp.reorder, dimnames(temp.order)[[1]][temp.order[, j] == 1]) # ?? } #identify any data that would have been dropped by a tie dropped.names <- setdiff(dimnames(temp)[[1]], temp.reorder) temp <- temp[c(temp.reorder, dropped.names), c(temp.reorder, dropped.names)] #dis table #reorder the matrix using the 90% quantile of the 2nd best match to establish #membership and plotting sequence. Within closely related taxa, #this level is equivalent to a species to genus level relationship temp.order <- temp.order[c(temp.reorder, dropped.names), c(temp.reorder, dropped.names)] #rank table if(sum(1 * (temp.order == 2)) != 0) { # why == 2 neighbor <- quantile(temp[temp.order == 2], 0.9) } neighborhood <- temp >= neighbor neighborhood <- neighborhood * 1 temp.neighborhood <- temp * neighborhood neighborhood.tdist <- dist(as.matrix(t(temp.neighborhood)), met = "binary") neighborhood.tclust <- hclust(neighborhood.tdist, met = "compact") neighborhood.tclust <- clorder(neighborhood.tclust, apply(temp, 1, mean)) neighborhood <- neighborhood[neighborhood.tclust$order, neighborhood.tclust$order] neighborhood.dist <- dist(as.matrix((neighborhood)), met = "binary") neighborhood.clust <- hclust(neighborhood.dist, met = "compact") neighborhood.clust <- clorder(neighborhood.clust, apply(temp, 1, mean)) temp <- temp[neighborhood.clust$order, neighborhood.tclust$order] #NOTE that the incorporation of the following distributional test #provides and automatic way of removing outliers from genus-level groups misidentifiedInStep2 <- c(misidentifiedInStep2, dimnames(temp)[[1]][abs(apply(scale(temp), 1, max)) > 3]) misidentifiedInStep2 <- setdiff(misidentifiedInStep2, "") taxon.seq.new[[i]] <- as.character(dimnames(temp)[[1]]) } } return(taxon.seq.new, misidentifiedInStep2) }