Interaction fingerprint analysis of data from a molecular dynamics (MD) simulation of a protein bound to a small compound.

matriz.txt
684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807
HB 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 9 0 0 0 1 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 23 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 17 0 0 0
Hyd 0 0 0 0 80 0 98 0 0 0 0 55 2 100 21 0 0 99 19 0 0 0 39 31 0 58 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 89 96 56 75 0 0 0 0 0 76 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 21 0 0 0 0 0 0 0 3 47 0
Hyd+HB 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 35 0 0 0 0 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 9 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Ion 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Ion+HB 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Ion+Hyd 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Ion+Hyd+HB 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
import pandas as pd
import matplotlib.pyplot as plt
# Plot an interaction fingerprint as a stacked bar chart: one bar per residue,
# one stacked segment per interaction type.

# Input matrix: one row per interaction type, one column per residue number.
entrada = r"d:\bbb\matriz.txt"

# Split on any run of whitespace so tab- and space-delimited files both load,
# and always take the first field of each data row as the row label. This
# also covers a header line that lists only the residue numbers (one field
# fewer than the data rows): with a plain positional read, pandas would turn
# that first field into an implicit index and every column would be shifted
# by one.
df = pd.read_csv(entrada, sep=r"\s+", index_col=0)

# Interaction types (HB, Hyd, Ion, ...) are the row labels.
interacciones = [str(nombre) for nombre in df.index]
# Residue numbers are the column labels.
aminoacidos = df.columns
# Every remaining cell is a numeric value.
valores = df

# Keep only residues whose total over all interaction types is above zero.
totales = valores.sum(axis=0)
aminoacidos_filtrados = [aa for aa in aminoacidos if totales[aa] > 0]
valores_filtrados = valores[aminoacidos_filtrados]

# Print the filtered table to the console, tab-separated.
print("Tipo_interaccion\t" + "\t".join(map(str, aminoacidos_filtrados)))
for idx, interaccion in enumerate(interacciones):
    fila = [str(v) for v in valores_filtrados.iloc[idx].tolist()]
    print(f"{interaccion}\t" + "\t".join(fila))

# Per-residue totals, for the filtered residues only.
fila_total = [str(v) for v in valores_filtrados.sum(axis=0).tolist()]
print("TOTAL\t" + "\t".join(fila_total))

# Fixed colour per interaction type.
colores = {
    "HB": "#FF0000",
    "Hyd": "#00FF00",
    "Hyd+HB": "#FFFF00",
    "Ion": "#0000FF",
    "Ion+HB": "#FF00FF",
    "Ion+Hyd": "#00FFFF",
    "Ion+Hyd+HB": "#E0E0E0",
}
# Colours in the same order as the rows; unknown interaction types are gray.
colores_lista = [colores.get(inter, "gray") for inter in interacciones]

# Transpose so residues run along the x axis and interaction types stack.
ax = valores_filtrados.T.plot(
    kind="bar",
    stacked=True,
    figsize=(12, 6),
    color=colores_lista,
)
# Label the legend entries with the interaction names.
ax.legend(interacciones, title="Interactions")
plt.xlabel("Aminoacids number")
plt.ylabel("Occupancy, %")
plt.title("Fingerprint of the small compound bound to the protein during MD simulation")
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()
Interact 688 690 695 696 697 698 701 702 706 707 708 709 726 727 728 729 735 739 758 797 804 805 806
HB 0 0 0 0 0 0 0 9 1 3 1 0 0 0 0 3 0 0 23 0 17 0 0
Hyd 80 98 55 2 100 21 99 19 39 31 0 58 89 96 56 75 76 1 2 21 0 3 47
Hyd+HB 0 0 0 0 0 0 0 35 0 6 0 0 0 0 0 0 0 0 9 0 0 0 0
Ion 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Ion+HB 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Ion+Hyd 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Ion+Hyd+HB 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
TOTAL 80 98 55 2 100 21 99 63 40 40 1 58 89 96 56 78 76 1 34 21 17 3 47

import re
import csv
from collections import defaultdict
def _parse_residue(token):
    """Return the residue number in *token* as an int, or None if it is not numeric."""
    try:
        return int(float(token))
    except (ValueError, OverflowError):
        # ValueError: not a number (or NaN); OverflowError: infinity.
        return None


def _parse_value(token):
    """Return the numeric value in *token*, or 0 if it is missing or unusable.

    Integral tokens ("8", "8.0") come back as int; fractional tokens ("0.05")
    come back as float so that decimals are not truncated.
    """
    try:
        number = float(token)
    except ValueError:
        return 0
    # NaN and +/-infinity are treated like any other unusable value.
    if number != number or number in (float('inf'), float('-inf')):
        return 0
    return int(number) if number.is_integer() else number


def parse_file(path):
    """Merge side-by-side residue/value column pairs into one row per residue.

    The input is a whitespace-separated text file. Each data line holds
    consecutive ``residue value`` pairs, one pair per condition; the number of
    conditions is taken from the longest data line. Blank lines are ignored.
    If the first non-blank line contains a letter, ``_`` or ``-`` it is
    treated as a header and used to name the conditions.

    Values keep their fractional part: integral tokens are returned as
    ``int`` and all others as ``float``. Missing, non-numeric or non-finite
    values count as 0. Pairs whose residue token is not a number are skipped.
    If a residue appears more than once for the same condition, the last
    occurrence wins.

    Args:
        path: Path of the text file to read (UTF-8, optional BOM).

    Returns:
        A ``(cond_names, rows)`` tuple. ``cond_names`` is the list of
        condition names. ``rows`` is a list of ``[residue, v1, ..., vn]``
        lists sorted by residue number, with 0 where a condition has no entry
        for that residue.

    Raises:
        ValueError: If the file has no non-blank lines, or has a header line
            but no data lines.
    """
    with open(path, 'r', encoding='utf-8-sig') as f:
        lines = [ln for ln in f if ln.strip()]
    if not lines:
        raise ValueError("El fichero está vacío o solo contiene líneas en blanco.")

    # Heuristic: a first line containing any letter is taken to be a header.
    header_line = lines[0]
    has_header = bool(re.search(r'[A-Za-zÁÉÍÓÚáéíóúÑñ_\-]', header_line))
    data_lines = lines[1:] if has_header else lines

    tokenized = [ln.split() for ln in data_lines]
    # An odd token count means the last residue has no value; round up so
    # that it still counts as a pair.
    max_tokens = max(map(len, tokenized), default=0)
    n_pairs = (max_tokens + 1) // 2
    if n_pairs == 0:
        raise ValueError("No se han detectado pares residuo/valor en el fichero.")

    # Condition names.
    if not has_header:
        cond_names = [f"Cond{i+1}" for i in range(n_pairs)]
    else:
        h_tokens = header_line.split()
        if len(h_tokens) == 2 * n_pairs:
            # Header has one name per column: keep the value-column names.
            cond_names = h_tokens[1::2]
        elif len(h_tokens) >= n_pairs:
            cond_names = h_tokens[:n_pairs]
        else:
            # Too few names: fill the remainder with generic ones.
            cond_names = h_tokens + [f"Cond{i+1}" for i in range(len(h_tokens), n_pairs)]

    # One residue -> value mapping per condition.
    maps = [{} for _ in range(n_pairs)]
    for toks in tokenized:
        for i, res_tok in enumerate(toks[0::2]):
            resid = _parse_residue(res_tok)
            if resid is None:
                continue
            val_tok = toks[2*i + 1] if 2*i + 1 < len(toks) else ''
            maps[i][resid] = _parse_value(val_tok)

    # Every residue seen in any condition, in ascending order.
    residues = sorted(set().union(*maps))
    rows = [[r] + [cond.get(r, 0) for cond in maps] for r in residues]
    return cond_names, rows
def write_out(path_out, cond_names, rows):
    """Save the merged table as a tab-separated UTF-8 text file.

    The first line written is ``Residue`` followed by the condition names;
    each entry of *rows* then becomes one line.

    Args:
        path_out: Destination path; an existing file is overwritten.
        cond_names: List of condition names for the header line.
        rows: Iterable of row sequences, written in the order given.
    """
    first_line = ['Residue'] + cond_names
    # newline='' leaves line endings to the csv module.
    with open(path_out, 'w', encoding='utf-8', newline='') as handle:
        tsv = csv.writer(handle, delimiter='\t')
        tsv.writerow(first_line)
        tsv.writerows(rows)
# --- MANUAL CONFIGURATION ---
# File to read.
entrada = r"D:\aaa\matriz_vertical_todos_fingerprint.txt"
# File to write.
salida = r"D:\aaa\matriz_vertical_todos_fingerprint_unificada.txt"
# ----------------------------

# Parse the input, write the merged table, then report what was done.
cond_names, rows = parse_file(entrada)
write_out(salida, cond_names, rows)
print(
    f"Salida guardada en: {salida}",
    f"Columnas detectadas: {['Residue'] + cond_names}",
    f"Filas escritas: {len(rows)}",
    sep="\n",
)
D:\aaa\matriz_vertical_todos_fingerprint_unificada.txt
compound-1 compound-2 compound-3 compound-4 compound-5 compound-6 compound-7
251 0 0.05 0 0 0 0 0
252 0 1.4 0 0 0 0 0
253 0 1.25 0 0 0 0 0
256 0 0.05 0 0 0 0 0
257 0 13.29 0 0 0 0 0
258 0 0.35 0 0 0 0 0
259 0 15.19 0 0 0 0 0
260 0 0.1 0 0 0 0 0
261 0 16.54 0 0 0 0.05 0
263 0 0.2 0 0 0 0 0
266 0 0.05 0 0 0 0 0
267 0 0.1 0 0 0 0 0
268 0.05 0.7 0 0 0 3.8 0
271 0 1.05 0.6 61.12 24.49 12.64 16.64
284 0 1.6 1.1 0 0 0 0
285 0 4.75 0 0 0 0 0
286 0 1.4 0 0 0 0 0
288 0 0.95 0 0 0 0 0
290 0 1.9 0 0 0 0 0
297 0 0.75 0 0 0 0 0
302 0 0.95 0 0 0 0 0
308 0 0 0.15 0 0 0 0
309 0 3.95 19.49 0 0 0 0
311 0.4 3.4 84.71 55.52 15.99 8.8 6.8
312 0 0 0 0 0 0 0.3
325 0 1 66.27 2.9 0 0.1 0
349 0 0 83.21 27.44 0 0.2 0
351 12.54 1.5 98.7 40.28 62.12 18.74 56.72
352 0 0 0.3 0 0 4.55 8.6
365 5.15 0.9 55.27 52.72 0.3 0.15 12.29
376 0 0 0 0.1 0 0 0
378 0 0 0 0.15 0 0 0.05
390 0 0 0 0 0 1.05 8
391 0 0 0.5 0 53.02 0.25 3.7
392 8.1 0.65 79.46 25.14 37.38 0 11.69
394 17.09 1.55 92.85 12.79 44.33 29.19 76.01
395 0.4 0 2.5 0.55 2.25 16.14 20.14
408 2.95 0.5 81.06 0.6 71.81 7.3 13.34
409 0 0 0 0 0 1.4 5.95
410 0 0 0 0 0 36.98 14.34
411 0 0 0 0 0 0.7 0
425 0 0 0 0 0 2 0
427 0 0 0 0 0 4.4 0
428 0 0 0 0 0 5.3 0
430 0 0 0 0 0 8.2 0
431 7.95 0.4 0.05 0.05 29.84 57.27 60.72
432 0.35 34.83 56.32 0.4 38.68 33.83 29.84
434 71.56 43.73 90.45 24.94 94.15 29.74 70.86
435 0.5 0 0.1 0 10.59 1.15 0.05
448 28.04 42.08 58.02 1.2 85.51 47.43 73.71
449 0 0 0 0 0 0.05 0
450 1.6 0 0 0 0 0 0
451 0 0 0 0 0 0.1 0
460 0 0 0 0 0 0.05 0
461 0 0 0 0 0 0.55 0
462 0 0 0 0 0 0.1 0
465 0 0 0 0 0 0.05 0
470 0 0 0 0 0 12.44 0
471 14.54 11.54 0 3.75 0 14.94 0
472 94.2 65.07 52.47 38.48 86.11 51.92 72.81
473 0.5 0 0 0 0 0 0
474 46.08 22.24 84.46 87.16 82.96 31.73 78.11
475 0 0 0 0 27.44 0 0
488 60.32 49.83 0.4 42.33 2.7 41.08 59.77
489 0 0 0 1.25 0 12.99 0
490 0 0 0 0 0 2.05 0
491 0 0 0 0 0 0.05 0
516 0 0 0 0 0 1.35 0
519 0 0.05 0 0 0 18.24 0
520 0 0 0 1.35 0 15.54 0
521 1.9 12.04 2.25 19.14 0 29.69 29.24
523 0.05 0.3 1.8 61.62 16.74 19.59 29.14
524 0 0 0 39.68 61.17 0 0
537 0 0.1 1.5 0.1 0 2.9 0.1
538 0 0.25 0 0 0 1.65 0
539 0 0.2 0 0 0 0.75 0
540 0 10.89 0 0 0 0 0
