#SOSCC algorithm #George M. Garrity and Timothy G. Lilburn #Copyright Michigan State University 2004, all rights reserved "step6" = function(id.matrix, tax.table, correct.placements) { print("inside step6") if (nrow(id.matrix) == 0) { known.errors <- vector(mode=mode(c(""))) revised.tax.table <- tax.table[correct.placements, c("species", "family", "taxon.seq", "species", "family")] #???? gamma?? dimnames(revised.tax.table)[[2]][4:5] <- c("MPI.species", "MPI.family") all.genus <- substring(as.character(revised.tax.table[, "MPI.species"]), 1, regexpr("[^a-zA-Z]", as.character(revised.tax.table[, "MPI.species"])) - 1) nearest.neighbors <- revised.tax.table[vector(mode=mode(c(""))), ] return(revised.tax.table, known.errors, nearest.neighbors, all.genus) } #create the ordered matrix based on the evolutionary distance data temp <- id.matrix temp.order <- matrix(0, nrow(temp), ncol(temp)) while (TRUE) { temp <- abs(jitter(temp, factor = 0.001)) int <- F for (j in 1:nrow(temp)) { temp.order[j,] <- rank(as.matrix(temp[j, ])) #make sure no tie there tf <- length(unique(as.integer(as.vector(temp.order[j,])) == as.vector(temp.order[j,]))) if (tf==1) { int <- T } else { int <- F } } if (int) break } dimnames(temp.order)[[2]] <- dimnames(temp)[[2]] dimnames(temp.order)[[1]] <- dimnames(temp)[[1]] temp.reorder <- NULL for(j in 1:ncol(temp.order)) { temp.reorder <- c(temp.reorder, dimnames(temp.order)[[1]][temp.order[, j] == 1]) } #identify any data that would have been dropped by a tie dropped.names <- setdiff(dimnames(temp)[[1]], temp.reorder) if (length(c(temp.reorder, dropped.names)) != 0) temp <- temp[c(temp.reorder, dropped.names), ] known.errors <- dimnames(temp)[[1]][abs(apply(scale(temp), 1, max)) > 2.5] best.match <- NULL for(i in 1:nrow(temp.order)) { best.match <- c(best.match, dimnames(temp.order)[[2]][temp.order[i, ] == 1]) #print(i) #print(dimnames(temp.order)[[2]][temp.order[i, ] == 1]) } nearest.neighbors <- cbind(dimnames(temp.order)[[1]], tax.table[best.match, c("species","family")]) nearest.neighbors <- cbind(nearest.neighbors, substring(as.character(nearest.neighbors[, "species"]), 1, regexpr("[^a-zA-Z]", as.character(nearest.neighbors[, "species"])) - 1)) dimnames(nearest.neighbors)[[2]][2:3] <- c("MPI.species", "MPI.family") nearest.neighbors <- cbind(tax.table[as.character(nearest.neighbors[, 1]), c("species", "family", "taxon.seq")], nearest.neighbors[, 2:3]) #At this point, I need to add back the sequences into the model at the #appropriate point, based on the genus id of the best match. The entire data set #would then need to be redone, until nothing is misplaced. revised.tax.table <- tax.table[correct.placements, c("species", "family", "taxon.seq", "species", "family")] #???? gamma?? dimnames(revised.tax.table)[[2]][4:5] <- c("MPI.species", "MPI.family") revised.tax.table <- rbind(revised.tax.table, nearest.neighbors) all.genus <- substring(as.character(revised.tax.table[, "MPI.species"]), 1, regexpr("[^a-zA-Z]", as.character(revised.tax.table[, "MPI.species"])) - 1) return(revised.tax.table, known.errors, nearest.neighbors, all.genus) }