Permute rows and columns to put largest values as close to diagonal as possible
05:30 25 Jan 2026

I have a confusion matrix from binning tools (MetaBAT2 vs MaxBin2) and I have an R script that puts the largest values on the diagonal by permuting rows and columns. It works pretty well, but some significant off-diagonal values remain that should ideally be positioned under the diagonal rather than scattered randomly. So, my main goal is to make the diagonal thicker with numbers that didn't moved there. Can anyone suggest any solutions? I am not a coder person and kind of stuck here for almost a week now... Changing tables by hands not an option, because I have like 20 tables on my hands. I would appreciate any help with the script.

The script I used and it need to be fixed:

data <- read_tsv("metabat2_vs_semibin_confusion_jaccard.tsv", col_names = TRUE)
data <- data[, !grepl("^\\.\\.\\.", colnames(data))]

mat <- as.matrix(data[, -1])
rownames(mat) <- data[[1]]

diagonalize_matrix <- function(mat) {
  n_rows <- nrow(mat)
  n_cols <- ncol(mat)
  
  matches <- data.frame()
  for (i in 1:n_rows) {
    for (j in 1:n_cols) {
      if (mat[i, j] > 0) {
        matches <- rbind(matches, data.frame(row = i, col = j, value = mat[i, j]))
      }
    }
  }
  
  matches <- matches[order(-matches$value), ]
 
  assigned_rows <- logical(n_rows)
  assigned_cols <- logical(n_cols)
  row_order <- integer(0)
  col_order <- integer(0)
  
  for (k in 1:nrow(matches)) {
    i <- matches$row[k]
    j <- matches$col[k]
    
    if (!assigned_rows[i] && !assigned_cols[j]) {
      row_order <- c(row_order, i)
      col_order <- c(col_order, j)
      assigned_rows[i] <- TRUE
      assigned_cols[j] <- TRUE
    }
  }
  
  if (length(row_order) < n_rows) {
    remaining_rows <- which(!assigned_rows)
    row_order <- c(row_order, remaining_rows)
  }
  
  if (length(col_order) < n_cols) {
    remaining_cols <- which(!assigned_cols)
    col_order <- c(col_order, remaining_cols)
  }
  
  reordered_mat <- mat[row_order, col_order]
  
  return(list(
    matrix = reordered_mat,
    row_order = row_order,
    col_order = col_order
  ))
}

result <- diagonalize_matrix(mat)
result_df <- as.data.frame(result$matrix)

for (col in colnames(result_df)) {
  if (is.numeric(result_df[[col]])) {
   
    result_df[[col]] <- format(result_df[[col]], scientific = FALSE, digits = 10)
    
    result_df[[col]] <- sub("\\.?0+$", "", result_df[[col]])
  }
}

result_df <- cbind(rownames(result$matrix), result_df)
colnames(result_df)[1] <- colnames(data)[1]

write_tsv(result_df, "clustered_diagonal_matrix_metabat2_vs_semibin_jaccard.tsv")

The tsv file that I get from the script:

semibin_bins.maxbin2_bins bin_001 bin_007 bin_025 bin_005 bin_028 bin_033 bin_026 bin_017 bin_004 bin_012 bin_010 bin_019
bin_58 0,955 0 0 0 0 0 0 0 0 0 0 0
bin_11 0 0,907 0,029 0,003 0 0,001 0 0 0 0 0 0
bin_10 0 0,006 0,865 0 0 0,014 0 0 0 0 0 0
bin_30 0 0 0 0,837 0 0 0 0 0 0 0 0
bin_51 0 0 0 0 0,83 0 0 0,003 0 0 0,001 0,037
bin_43 0 0 0 0 0 0,778 0 0 0 0 0 0
bin_74 0 0 0 0 0 0 0,77 0 0 0 0 0
bin_37 0 0 0 0 0 0 0 0,731 0 0 0 0
bin_42 0 0 0 0 0 0 0 0 0,712 0 0 0
bin_18 0 0 0 0 0 0 0 0 0 0,704 0 0
bin_20 0 0 0 0 0 0 0 0 0,001 0 0,682 0,207
bin_104 0 0 0 0 0 0 0 0 0 0 0 0,121
bin_105 0 0 0 0 0 0 0 0,049 0 0 0 0
bin_67 0 0 0 0 0 0 0 0 0,019 0 0 0
bin_69 0 0 0 0 0 0 0,215 0 0 0,004 0 0
bin_92 0 0 0 0,153 0 0 0 0 0 0 0 0,002
bin_86 0 0 0 0 0 0 0 0,023 0 0 0 0
data1[1:17,1:12])
structure(list(bin_001 = c(0.955, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0), bin_007 = c(0, 0.907, 0.006, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), bin_025 = c(0, 0.029, 0.865, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), bin_005 = c(0, 0.003, 
0, 0.837, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.153, 0, 0), bin_028 = c(0, 
0, 0, 0, 0.83, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), bin_033 = c(0, 
0.001, 0.014, 0, 0, 0.778, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 
    bin_026 = c(0, 0, 0, 0, 0, 0, 0.77, 0, 0, 0, 0, 0, 0, 0, 
    0, 0.215, 0), bin_017 = c(0, 0, 0, 0, 0.003, 0, 0, 0.731, 
    0, 0, 0, 0, 0, 0.049, 0, 0, 0.023), bin_004 = c(0, 0, 0, 
    0, 0, 0, 0, 0, 0.712, 0, 0.001, 0.019, 0, 0, 0, 0, 0), bin_012 = c(0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0.704, 0, 0, 0, 0, 0, 0.004, 0), 
    bin_010 = c(0, 0, 0, 0, 0.001, 0, 0, 0, 0, 0, 0.682, 0, 0, 
    0, 0, 0, 0), bin_019 = c(0, 0, 0, 0, 0.037, 0, 0, 0, 0, 0, 
    0.207, 0, 0.121, 0, 0.002, 0, 0)), row.names = c("bin_58", 
"bin_11", "bin_10", "bin_30", "bin_51", "bin_43", "bin_74", "bin_37", 
"bin_42", "bin_18", "bin_20", "bin_67", "bin_104", "bin_105", 
"bin_92", "bin_69", "bin_86"), class = "data.frame")

The file that I want to get:

semibin_bins.maxbin2_bins bin_001 bin_007 bin_025 bin_005 bin_028 bin_033 bin_026 bin_017 bin_004 bin_012 bin_010 bin_019
bin_58 0,955 0 0 0 0 0 0 0 0 0 0 0
bin_11 0 0,907 0,029 0,003 0 0,001 0 0 0 0 0 0
bin_10 0 0,006 0,865 0 0 0,014 0 0 0 0 0 0
bin_30 0 0 0 0,837 0 0 0 0 0 0 0 0
bin_92 0 0 0 0,153 0 0 0 0 0 0 0 0,002
bin_51 0 0 0 0 0,83 0 0 0,003 0 0 0,001 0,037
bin_43 0 0 0 0 0 0,778 0 0 0 0 0 0
bin_74 0 0 0 0 0 0 0,77 0 0 0 0 0
bin_69 0 0 0 0 0 0 0,215 0 0 0,004 0 0
bin_37 0 0 0 0 0 0 0 0,731 0 0 0 0
bin_105 0 0 0 0 0 0 0 0,049 0 0 0 0
bin_86 0 0 0 0 0 0 0 0,023 0 0 0 0
bin_42 0 0 0 0 0 0 0 0 0,712 0 0 0
bin_67 0 0 0 0 0 0 0 0 0,019 0 0 0
bin_18 0 0 0 0 0 0 0 0 0 0,704 0 0
bin_20 0 0 0 0 0 0 0 0 0,001 0 0,682 0,207
bin_104 0 0 0 0 0 0 0 0 0 0 0 0,121
data2[1:17,1:12]
structure(list(bin_001 = c(0.955, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0), bin_007 = c(0, 0.907, 0.006, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), bin_025 = c(0, 0.029, 0.865, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), bin_005 = c(0, 0.003, 
0, 0.837, 0.153, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), bin_028 = c(0, 
0, 0, 0, 0, 0.83, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), bin_033 = c(0, 
0.001, 0.014, 0, 0, 0, 0.778, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 
    bin_026 = c(0, 0, 0, 0, 0, 0, 0, 0.77, 0.215, 0, 0, 0, 0, 
    0, 0, 0, 0), bin_017 = c(0, 0, 0, 0, 0, 0.003, 0, 0, 0, 0.731, 
    0.049, 0.023, 0, 0, 0, 0, 0), bin_004 = c(0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0.712, 0.019, 0, 0.001, 0), bin_012 = c(0, 
    0, 0, 0, 0, 0, 0, 0, 0.004, 0, 0, 0, 0, 0, 0.704, 0, 0), 
    bin_010 = c(0, 0, 0, 0, 0, 0.001, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0.682, 0), bin_019 = c(0, 0, 0, 0, 0.002, 0.037, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0.207, 0.121)), row.names = c("bin_58", 
"bin_11", "bin_10", "bin_30", "bin_92", "bin_51", "bin_43", "bin_74", 
"bin_69", "bin_37", "bin_105", "bin_86", "bin_42", "bin_67", 
"bin_18", "bin_20", "bin_104"), class = "data.frame")
r