SDF File Splitter and Combiner (groups of 3000 molecules)
Objective
The goal was to create a Python script for Windows that processes all .sdf files in the D:\ccc directory. Within each file, molecules are separated by the delimiter $$$$, and the first line of each molecule is its name. The script should split each molecule into a separate file named after the molecule and save these files in the D:\ccc\split directory.
import os
def _iter_sdf_records(lines):
    """Yield each molecule record found in *lines* as a list of its lines.

    A record ends at a line whose stripped content is exactly ``$$$$``; the
    delimiter line itself is not part of the yielded record.  Records that
    contain nothing but whitespace are skipped.  A final record that has no
    terminating delimiter is still yielded.
    """
    record = []
    for line in lines:
        if line.strip() == "$$$$":
            if any(part.strip() for part in record):
                yield record
            record = []
        else:
            record.append(line)
    if any(part.strip() for part in record):
        yield record


def _unique_file_stem(stem, taken):
    """Return *stem*, or *stem* plus a numeric suffix, not yet in *taken*.

    *taken* holds case-folded stems already handed out and is updated in
    place.  Comparison is case-insensitive because the target file system
    (Windows) does not distinguish file names by case.
    """
    candidate = stem
    suffix = 1
    while candidate.casefold() in taken:
        suffix += 1
        candidate = f"{stem}_{suffix}"
    taken.add(candidate.casefold())
    return candidate


def split_sdf_files(input_folder, output_folder):
    """Split every .sdf file in *input_folder* into one file per molecule.

    Each molecule is written to ``<name>.sdf`` inside *output_folder*, where
    ``<name>`` is the first line of the record with every character other
    than letters, digits, space, ``_`` and ``-`` replaced by ``_``.

    Args:
        input_folder: Directory that is scanned (non-recursively) for files
            whose name ends in ``.sdf`` (case-insensitive).
        output_folder: Directory receiving the per-molecule files; created
            if it does not exist.

    Notes:
        * Records are written back verbatim (including a blank name line and
          the blank line that closes the last data item), followed by a
          ``$$$$`` terminator line.
        * A record with an empty name is saved as ``unnamed.sdf``.
        * When several molecules map to the same file name, later ones get a
          ``_2``, ``_3``, ... suffix instead of overwriting earlier ones.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Sorted so that suffix assignment for duplicate names is reproducible.
    sdf_files = sorted(
        f for f in os.listdir(input_folder)
        if f.lower().endswith('.sdf')
        and os.path.isfile(os.path.join(input_folder, f))
    )

    taken = set()  # case-folded file stems already written during this run
    for sdf_file in sdf_files:
        input_path = os.path.join(input_folder, sdf_file)
        with open(input_path, 'r') as file:
            for record in _iter_sdf_records(file):
                # The first line of a record is the molecule name; it may be
                # blank, so it must not be stripped away from the record.
                molecule_name = record[0].strip()
                valid_name = "".join(
                    c if c.isalnum() or c in (' ', '_', '-') else '_'
                    for c in molecule_name
                ) or "unnamed"
                stem = _unique_file_stem(valid_name, taken)
                output_file = os.path.join(output_folder, f"{stem}.sdf")

                text = "".join(record)
                if not text.endswith("\n"):
                    # Unterminated last record: keep the delimiter on its own line.
                    text += "\n"
                with open(output_file, 'w') as out_file:
                    out_file.write(text + "$$$$\n")

    print(f"Processing completed. The split files were saved in: {output_folder}")
# Source directory holding the multi-molecule .sdf files; the per-molecule
# files are written to its "split" subdirectory
input_folder = r"D:\ccc"
output_folder = os.path.join(input_folder, "split")
# Split every .sdf file found in the source directory
split_sdf_files(input_folder=input_folder, output_folder=output_folder)
A new version: split one large SDF file into chunks of at most 3000 molecules each.
import os
# Chunking settings: molecules per output file and the output file name prefix
max_molecules = 3000
prefix = "RNA-MP-"
# Directory receiving the chunks, and path of the input file
output_dir = r"D:\zzz"
input_file = r"D:\zzz\Molport_RNA_Binder_Set_filtrado-Lipinski.sdf"
def _write_sdf_chunk(output_dir, prefix, file_count, lines, mol_count):
    """Write *lines* to ``<prefix><file_count>.sdf`` in *output_dir* and report it."""
    output_file = os.path.join(output_dir, f"{prefix}{file_count}.sdf")
    with open(output_file, "w", encoding="utf-8") as outfile:
        outfile.writelines(lines)
    print(f"Creado: {output_file} con {mol_count} moléculas")


def split_sdf(input_file, output_dir, prefix, max_molecules):
    """Split one SDF file into numbered chunks of *max_molecules* molecules.

    The input is streamed line by line; a molecule ends at a line whose
    stripped content is ``$$$$``.  Chunks are written to
    ``<output_dir>/<prefix><n>.sdf`` with ``n`` starting at 1, and a message
    is printed for every file created.

    Args:
        input_file: Path of the SDF file to split.  It is decoded as UTF-8
            and undecodable bytes are dropped (``errors="ignore"``).
        output_dir: Directory receiving the chunks; created if missing.
        prefix: File name prefix of every chunk.
        max_molecules: Number of molecules after which a chunk is flushed.

    Notes:
        * Whitespace-only lines after the last ``$$$$`` no longer produce an
          extra, molecule-free output file.
        * A trailing record that lacks its ``$$$$`` terminator is still
          written (verbatim) and is counted as one molecule.
    """
    # Make sure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    mol_count = 0
    file_count = 1
    out_lines = []
    # True while non-blank lines have been read since the last delimiter
    has_open_record = False

    with open(input_file, "r", encoding="utf-8", errors="ignore") as infile:
        for line in infile:
            out_lines.append(line)
            stripped = line.strip()
            if stripped == "$$$$":  # end of a molecule
                mol_count += 1
                has_open_record = False
                # Flush the chunk once the limit is reached
                if mol_count >= max_molecules:
                    _write_sdf_chunk(
                        output_dir, prefix, file_count, out_lines, mol_count
                    )
                    # Reset state for the next chunk
                    file_count += 1
                    mol_count = 0
                    out_lines = []
            elif stripped:
                has_open_record = True

    # An unterminated final record still holds a molecule
    if has_open_record:
        mol_count += 1
    # Save the last chunk only if it actually contains molecules
    if mol_count:
        _write_sdf_chunk(output_dir, prefix, file_count, out_lines, mol_count)
# Run the split only when this file is executed as a script
if __name__ == "__main__":
    split_sdf(
        input_file=input_file,
        output_dir=output_dir,
        prefix=prefix,
        max_molecules=max_molecules,
    )
And now, combine the individual molecule files into groups of 3000 molecules per output file.
import os
def _read_sdf_molecules(path):
    """Yield the text of every molecule record in the SDF file at *path*.

    A record ends at a line whose stripped content is ``$$$$``.  Each yielded
    string is the record verbatim (a blank name line is kept) followed by
    exactly one ``$$$$`` terminator line, which is also added to a final
    record that lacks one.  Whitespace-only records are skipped.
    """
    def _terminated(record_lines):
        text = "".join(record_lines)
        if not text.endswith("\n"):
            text += "\n"
        return text + "$$$$\n"

    with open(path, 'r') as file:
        record = []
        for line in file:
            if line.strip() == "$$$$":
                if any(part.strip() for part in record):
                    yield _terminated(record)
                record = []
            else:
                record.append(line)
        if any(part.strip() for part in record):
            yield _terminated(record)


def _write_sdf_group(output_folder, file_counter, molecules):
    """Write *molecules* to the numbered group file and report its name."""
    output_filename = f"MP7-Q-{file_counter:04d}_ligands.sdf"
    output_path = os.path.join(output_folder, output_filename)
    with open(output_path, 'w') as out_file:
        out_file.write("".join(molecules))
    print(f"Generated: {output_filename}")


def combine_sdf_files(input_folder, output_folder, group_size=3000):
    """Merge the molecules of all .sdf files in *input_folder* into groups.

    Molecules are read from the input files in sorted file-name order and
    written to ``MP7-Q-<NNNN>_ligands.sdf`` files in *output_folder*, each
    holding *group_size* molecules (the last one may hold fewer).

    Args:
        input_folder: Directory scanned (non-recursively) for files whose
            name ends in ``.sdf`` (case-insensitive).
        output_folder: Directory receiving the combined files; created if it
            does not exist.
        group_size: Number of molecules per combined file.

    Notes:
        * Molecules are counted per record, not per input file, so inputs
          holding several molecules are grouped correctly.
        * Every record is emitted with its own ``$$$$`` terminator, so an
          input file missing the trailing delimiter cannot fuse with the
          molecule that follows it.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Sorted so that the grouping is reproducible between runs.
    sdf_files = sorted(
        os.path.join(input_folder, f)
        for f in os.listdir(input_folder)
        if f.lower().endswith('.sdf')
        and os.path.isfile(os.path.join(input_folder, f))
    )

    file_counter = 1
    group = []  # molecule texts collected for the current output file
    for sdf_file in sdf_files:
        for molecule in _read_sdf_molecules(sdf_file):
            group.append(molecule)
            # Group complete: write it out and start the next one
            if len(group) == group_size:
                _write_sdf_group(output_folder, file_counter, group)
                file_counter += 1
                group = []

    # Save remaining molecules, if any
    if group:
        _write_sdf_group(output_folder, file_counter, group)

    print(f"Processing completed. Files saved in: {output_folder}")
# Folder with the per-molecule .sdf files, and the folder that receives the
# combined group files
input_folder = r"D:\ccc\split"
output_folder = r"D:\ccc\Q-all-isomers-3000"
# Merge the files using the default group size
combine_sdf_files(input_folder=input_folder, output_folder=output_folder)