SDF File Splitter and Combiner (3000 Molecules per File)


Objective

The goal was to create a Python script for Windows that processes all .sdf files in the D:\ccc directory. Within each file, molecules are separated by a line containing the delimiter $$$$, and the first line of each molecule is its name. The script splits each molecule into a separate file named after the molecule and saves these files in the D:\ccc\split directory.

import os

def _sanitize_molecule_name(title):
    """Return a filesystem-safe file stem for an SDF title line.

    Alphanumeric characters, spaces, underscores and hyphens are kept; every
    other character is replaced by an underscore. A blank title yields
    ``"unnamed"`` so the output file is never just ``.sdf``.
    """
    cleaned = "".join(
        c if c.isalnum() or c in (' ', '_', '-') else '_'
        for c in title.strip()
    )
    return cleaned or "unnamed"


def _iter_sdf_records(lines):
    """Yield each SDF record as a list of lines, without its ``$$$$`` line.

    A record ends at a line whose stripped content is exactly ``$$$$``.
    Records that contain only blank lines are skipped. Leading blank lines
    of a record are kept, because the first line of a record is its title
    and may legitimately be empty.
    """
    record = []
    for line in lines:
        if line.strip() == "$$$$":
            if any(part.strip() for part in record):
                yield record
            record = []
        else:
            record.append(line)
    # The last record may be missing its terminating delimiter.
    if any(part.strip() for part in record):
        yield record


def split_sdf_files(input_folder, output_folder):
    """Split every ``.sdf`` file in *input_folder* into one file per molecule.

    Each molecule is written to ``<output_folder>/<name>.sdf``, where
    ``<name>`` is the sanitized first line of the record, and is terminated
    with a ``$$$$`` line.

    Args:
        input_folder: Folder that is scanned (non-recursively) for files
            whose name ends with ``.sdf``.
        output_folder: Folder that receives the per-molecule files; it is
            created when missing.

    Molecules that share a name within one run get a numeric suffix
    (``name``, ``name_2``, ``name_3``, ...) instead of overwriting each
    other. Files left over from an earlier run are still overwritten.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Sorted so that duplicate-name suffixes are assigned reproducibly.
    sdf_files = sorted(f for f in os.listdir(input_folder) if f.endswith('.sdf'))

    # Names written during this run, lower-cased so that names differing
    # only in case do not collide on case-insensitive file systems.
    used_names = set()

    for sdf_file in sdf_files:
        input_path = os.path.join(input_folder, sdf_file)
        with open(input_path, 'r') as file:
            for record in _iter_sdf_records(file):
                base_name = _sanitize_molecule_name(record[0])

                # Pick the first free name: base, base_2, base_3, ...
                valid_name = base_name
                copy_number = 1
                while valid_name.lower() in used_names:
                    copy_number += 1
                    valid_name = f"{base_name}_{copy_number}"
                used_names.add(valid_name.lower())

                output_file = os.path.join(output_folder, f"{valid_name}.sdf")

                # Only trailing whitespace is trimmed; the leading lines
                # (title and header) are written back unchanged.
                with open(output_file, 'w') as out_file:
                    out_file.write("".join(record).rstrip() + "\n$$$$\n")

    print(f"Processing completed. The split files were saved in: {output_folder}")

# Define the input and output paths
input_folder = r"D:\ccc"
output_folder = os.path.join(input_folder, "split")

# Run the split only when this file is executed as a script, so that
# importing it does not read or write any files.
if __name__ == "__main__":
    split_sdf_files(input_folder, output_folder)
    

A new version: split one large SDF file into chunks of at most 3000 molecules each.

import os

# Settings for the chunked split: maximum number of molecules per output
# file, output file-name prefix, destination folder and source SDF file.
max_molecules = 3000
prefix = "RNA-MP-"
output_dir = r"D:\zzz"
input_file = r"D:\zzz\Molport_RNA_Binder_Set_filtrado-Lipinski.sdf"

def _write_sdf_chunk(output_dir, prefix, file_count, lines, mol_count):
    """Write one chunk of SDF lines to ``<prefix><file_count>.sdf`` and report it."""
    output_file = os.path.join(output_dir, f"{prefix}{file_count}.sdf")
    with open(output_file, "w", encoding="utf-8") as outfile:
        outfile.writelines(lines)
    print(f"Creado: {output_file} con {mol_count} moléculas")


def split_sdf(input_file, output_dir, prefix, max_molecules):
    """Split one SDF file into numbered chunks of up to *max_molecules* records.

    The input is streamed line by line; a record ends at a line whose
    stripped content is exactly ``$$$$``. Chunks are written to
    ``<output_dir>/<prefix><n>.sdf`` with ``n`` starting at 1.

    Args:
        input_file: Path of the SDF file to split. It is read as UTF-8 and
            undecodable bytes are dropped (``errors="ignore"``).
        output_dir: Folder that receives the chunks; created when missing.
        prefix: Text placed before the chunk number in each file name.
        max_molecules: Number of records per chunk. Values below 1 behave
            like 1, because the limit check fires after every record.

    Trailing blank lines after the last record no longer produce an extra,
    molecule-free output file, and a final record that lacks its ``$$$$``
    terminator is included in the reported molecule count.
    """
    # Make sure the output folder exists.
    os.makedirs(output_dir, exist_ok=True)

    mol_count = 0
    file_count = 1
    out_lines = []
    # True while non-blank lines have been read that are not yet closed
    # by a "$$$$" line.
    open_record = False

    with open(input_file, "r", encoding="utf-8", errors="ignore") as infile:
        for line in infile:
            out_lines.append(line)
            stripped = line.strip()
            if stripped == "$$$$":  # end of a molecule
                mol_count += 1
                open_record = False

                # Limit reached: flush the chunk and start a new one.
                if mol_count >= max_molecules:
                    _write_sdf_chunk(output_dir, prefix, file_count, out_lines, mol_count)
                    file_count += 1
                    mol_count = 0
                    out_lines = []
            elif stripped:
                open_record = True

    # Count a final record that was never terminated by "$$$$".
    if open_record:
        mol_count += 1

    # Write the last chunk only if it actually holds molecules; leftover
    # blank lines alone are not worth a file.
    if mol_count:
        _write_sdf_chunk(output_dir, prefix, file_count, out_lines, mol_count)

# Script entry point: run the chunked split with the module-level settings.
if __name__ == "__main__":
    split_sdf(
        input_file=input_file,
        output_dir=output_dir,
        prefix=prefix,
        max_molecules=max_molecules,
    )

And now combine the single-molecule files into groups of 3000 molecules per output file.

import os

def _write_combined_group(output_folder, output_filename, parts):
    """Write the joined *parts* to *output_filename* in *output_folder* and report it."""
    output_path = os.path.join(output_folder, output_filename)
    with open(output_path, 'w') as out_file:
        out_file.write("".join(parts))
    print(f"Generated: {output_filename}")


def combine_sdf_files(input_folder, output_folder, group_size=3000,
                      prefix="MP7-Q-", suffix="_ligands.sdf"):
    """Concatenate the ``.sdf`` files of *input_folder* into grouped files.

    Input files are processed in sorted name order so that the grouping is
    reproducible. Every *group_size* non-empty input files are written to
    one output file named ``<prefix><NNNN><suffix>``, where ``NNNN`` is a
    zero-padded counter starting at 0001; a final, smaller group is written
    as well.

    Args:
        input_folder: Folder scanned (non-recursively) for ``.sdf`` files.
        output_folder: Folder that receives the combined files; created
            when missing.
        group_size: Number of non-empty input files per output file. The
            count is per file, not per molecule, so it equals the number of
            molecules only when each input file holds a single molecule.
        prefix: Text placed before the counter in each output file name.
        suffix: Text placed after the counter, including the extension.

    Only trailing whitespace of each input file is trimmed; leading blank
    lines are kept because the first line of an SDF record is its title and
    may be empty.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Find all .sdf files in the input folder, in a stable order.
    sdf_files = sorted(
        os.path.join(input_folder, f)
        for f in os.listdir(input_folder)
        if f.endswith('.sdf')
    )

    file_counter = 1
    group_parts = []  # contents of the files collected for the current group

    for sdf_file in sdf_files:
        with open(sdf_file, 'r') as file:
            content = file.read()
        if not content.strip():  # skip empty files
            continue

        group_parts.append(content.rstrip() + "\n")

        # Group is full: write it out and start the next one.
        if len(group_parts) == group_size:
            _write_combined_group(
                output_folder, f"{prefix}{file_counter:04d}{suffix}", group_parts
            )
            file_counter += 1
            group_parts = []

    # Save the remaining files, if any.
    if group_parts:
        _write_combined_group(
            output_folder, f"{prefix}{file_counter:04d}{suffix}", group_parts
        )

    print(f"Processing completed. Files saved in: {output_folder}")


# Define the input and output paths
input_folder = r"D:\ccc\split"
output_folder = r"D:\ccc\Q-all-isomers-3000"

# Run the combination only when this file is executed as a script, so that
# importing it does not read or write any files.
if __name__ == "__main__":
    combine_sdf_files(input_folder, output_folder)