I have a confusion matrix from binning tools (MetaBAT2 vs MaxBin2) and I have an R script that puts the largest values on the diagonal by permuting rows and columns. It works pretty well, but some significant off-diagonal values remain that should ideally be positioned under the diagonal rather than scattered randomly. So, my main goal is to make the diagonal thicker with numbers that didn't moved there. Can anyone suggest any solutions? I am not a coder person and kind of stuck here for almost a week now... Changing tables by hands not an option, because I have like 20 tables on my hands. I would appreciate any help with the script.
The script I used and it need to be fixed:
data <- read_tsv("metabat2_vs_semibin_confusion_jaccard.tsv", col_names = TRUE)
data <- data[, !grepl("^\\.\\.\\.", colnames(data))]
mat <- as.matrix(data[, -1])
rownames(mat) <- data[[1]]
diagonalize_matrix <- function(mat) {
n_rows <- nrow(mat)
n_cols <- ncol(mat)
matches <- data.frame()
for (i in 1:n_rows) {
for (j in 1:n_cols) {
if (mat[i, j] > 0) {
matches <- rbind(matches, data.frame(row = i, col = j, value = mat[i, j]))
}
}
}
matches <- matches[order(-matches$value), ]
assigned_rows <- logical(n_rows)
assigned_cols <- logical(n_cols)
row_order <- integer(0)
col_order <- integer(0)
for (k in 1:nrow(matches)) {
i <- matches$row[k]
j <- matches$col[k]
if (!assigned_rows[i] && !assigned_cols[j]) {
row_order <- c(row_order, i)
col_order <- c(col_order, j)
assigned_rows[i] <- TRUE
assigned_cols[j] <- TRUE
}
}
if (length(row_order) < n_rows) {
remaining_rows <- which(!assigned_rows)
row_order <- c(row_order, remaining_rows)
}
if (length(col_order) < n_cols) {
remaining_cols <- which(!assigned_cols)
col_order <- c(col_order, remaining_cols)
}
reordered_mat <- mat[row_order, col_order]
return(list(
matrix = reordered_mat,
row_order = row_order,
col_order = col_order
))
}
result <- diagonalize_matrix(mat)
result_df <- as.data.frame(result$matrix)
for (col in colnames(result_df)) {
if (is.numeric(result_df[[col]])) {
result_df[[col]] <- format(result_df[[col]], scientific = FALSE, digits = 10)
result_df[[col]] <- sub("\\.?0+$", "", result_df[[col]])
}
}
result_df <- cbind(rownames(result$matrix), result_df)
colnames(result_df)[1] <- colnames(data)[1]
write_tsv(result_df, "clustered_diagonal_matrix_metabat2_vs_semibin_jaccard.tsv")
The tsv file that I get from the script:
| semibin_bins.maxbin2_bins | bin_001 | bin_007 | bin_025 | bin_005 | bin_028 | bin_033 | bin_026 | bin_017 | bin_004 | bin_012 | bin_010 | bin_019 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| bin_58 | 0,955 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| bin_11 | 0 | 0,907 | 0,029 | 0,003 | 0 | 0,001 | 0 | 0 | 0 | 0 | 0 | 0 |
| bin_10 | 0 | 0,006 | 0,865 | 0 | 0 | 0,014 | 0 | 0 | 0 | 0 | 0 | 0 |
| bin_30 | 0 | 0 | 0 | 0,837 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| bin_51 | 0 | 0 | 0 | 0 | 0,83 | 0 | 0 | 0,003 | 0 | 0 | 0,001 | 0,037 |
| bin_43 | 0 | 0 | 0 | 0 | 0 | 0,778 | 0 | 0 | 0 | 0 | 0 | 0 |
| bin_74 | 0 | 0 | 0 | 0 | 0 | 0 | 0,77 | 0 | 0 | 0 | 0 | 0 |
| bin_37 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0,731 | 0 | 0 | 0 | 0 |
| bin_42 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0,712 | 0 | 0 | 0 |
| bin_18 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0,704 | 0 | 0 |
| bin_20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0,001 | 0 | 0,682 | 0,207 |
| bin_104 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0,121 |
| bin_105 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0,049 | 0 | 0 | 0 | 0 |
| bin_67 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0,019 | 0 | 0 | 0 |
| bin_69 | 0 | 0 | 0 | 0 | 0 | 0 | 0,215 | 0 | 0 | 0,004 | 0 | 0 |
| bin_92 | 0 | 0 | 0 | 0,153 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0,002 |
| bin_86 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0,023 | 0 | 0 | 0 | 0 |
data1[1:17,1:12])
structure(list(bin_001 = c(0.955, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0), bin_007 = c(0, 0.907, 0.006, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), bin_025 = c(0, 0.029, 0.865, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), bin_005 = c(0, 0.003,
0, 0.837, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.153, 0, 0), bin_028 = c(0,
0, 0, 0, 0.83, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), bin_033 = c(0,
0.001, 0.014, 0, 0, 0.778, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
bin_026 = c(0, 0, 0, 0, 0, 0, 0.77, 0, 0, 0, 0, 0, 0, 0,
0, 0.215, 0), bin_017 = c(0, 0, 0, 0, 0.003, 0, 0, 0.731,
0, 0, 0, 0, 0, 0.049, 0, 0, 0.023), bin_004 = c(0, 0, 0,
0, 0, 0, 0, 0, 0.712, 0, 0.001, 0.019, 0, 0, 0, 0, 0), bin_012 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0.704, 0, 0, 0, 0, 0, 0.004, 0),
bin_010 = c(0, 0, 0, 0, 0.001, 0, 0, 0, 0, 0, 0.682, 0, 0,
0, 0, 0, 0), bin_019 = c(0, 0, 0, 0, 0.037, 0, 0, 0, 0, 0,
0.207, 0, 0.121, 0, 0.002, 0, 0)), row.names = c("bin_58",
"bin_11", "bin_10", "bin_30", "bin_51", "bin_43", "bin_74", "bin_37",
"bin_42", "bin_18", "bin_20", "bin_67", "bin_104", "bin_105",
"bin_92", "bin_69", "bin_86"), class = "data.frame")
The file that I want to get:
| semibin_bins.maxbin2_bins | bin_001 | bin_007 | bin_025 | bin_005 | bin_028 | bin_033 | bin_026 | bin_017 | bin_004 | bin_012 | bin_010 | bin_019 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| bin_58 | 0,955 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| bin_11 | 0 | 0,907 | 0,029 | 0,003 | 0 | 0,001 | 0 | 0 | 0 | 0 | 0 | 0 |
| bin_10 | 0 | 0,006 | 0,865 | 0 | 0 | 0,014 | 0 | 0 | 0 | 0 | 0 | 0 |
| bin_30 | 0 | 0 | 0 | 0,837 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| bin_92 | 0 | 0 | 0 | 0,153 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0,002 |
| bin_51 | 0 | 0 | 0 | 0 | 0,83 | 0 | 0 | 0,003 | 0 | 0 | 0,001 | 0,037 |
| bin_43 | 0 | 0 | 0 | 0 | 0 | 0,778 | 0 | 0 | 0 | 0 | 0 | 0 |
| bin_74 | 0 | 0 | 0 | 0 | 0 | 0 | 0,77 | 0 | 0 | 0 | 0 | 0 |
| bin_69 | 0 | 0 | 0 | 0 | 0 | 0 | 0,215 | 0 | 0 | 0,004 | 0 | 0 |
| bin_37 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0,731 | 0 | 0 | 0 | 0 |
| bin_105 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0,049 | 0 | 0 | 0 | 0 |
| bin_86 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0,023 | 0 | 0 | 0 | 0 |
| bin_42 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0,712 | 0 | 0 | 0 |
| bin_67 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0,019 | 0 | 0 | 0 |
| bin_18 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0,704 | 0 | 0 |
| bin_20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0,001 | 0 | 0,682 | 0,207 |
| bin_104 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0,121 |
data2[1:17,1:12]
structure(list(bin_001 = c(0.955, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0), bin_007 = c(0, 0.907, 0.006, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), bin_025 = c(0, 0.029, 0.865, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), bin_005 = c(0, 0.003,
0, 0.837, 0.153, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), bin_028 = c(0,
0, 0, 0, 0, 0.83, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), bin_033 = c(0,
0.001, 0.014, 0, 0, 0, 0.778, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
bin_026 = c(0, 0, 0, 0, 0, 0, 0, 0.77, 0.215, 0, 0, 0, 0,
0, 0, 0, 0), bin_017 = c(0, 0, 0, 0, 0, 0.003, 0, 0, 0, 0.731,
0.049, 0.023, 0, 0, 0, 0, 0), bin_004 = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0.712, 0.019, 0, 0.001, 0), bin_012 = c(0,
0, 0, 0, 0, 0, 0, 0, 0.004, 0, 0, 0, 0, 0, 0.704, 0, 0),
bin_010 = c(0, 0, 0, 0, 0, 0.001, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0.682, 0), bin_019 = c(0, 0, 0, 0, 0.002, 0.037, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0.207, 0.121)), row.names = c("bin_58",
"bin_11", "bin_10", "bin_30", "bin_92", "bin_51", "bin_43", "bin_74",
"bin_69", "bin_37", "bin_105", "bin_86", "bin_42", "bin_67",
"bin_18", "bin_20", "bin_104"), class = "data.frame")