SDF File Splitter and Combiner (Groups of 3000 Molecules)

Objective

The goal is a Python script for Windows that processes every .sdf file in the D:\ccc directory. Within each file, molecules are separated by the delimiter $$$$, and the first line of each molecule is its name. The script writes each molecule to its own file, named after the molecule, in the D:\ccc\split directory.

import os

def _unique_stem(stem, taken):
    """Return a file stem based on *stem* that is not yet recorded in *taken*.

    The first use of a stem is returned unchanged; later uses get ``_2``,
    ``_3``, ... appended. Comparison is done on the lower-cased stem so two
    names differing only by case do not end up targeting the same file on a
    case-insensitive file system. The chosen stem is added to *taken*.
    """
    candidate = stem
    attempt = 1
    while candidate.lower() in taken:
        attempt += 1
        candidate = f"{stem}_{attempt}"
    taken.add(candidate.lower())
    return candidate


def split_sdf_files(input_folder, output_folder):
    """Write every molecule found in the .sdf files of a folder to its own file.

    Each file in *input_folder* whose name ends with ``.sdf`` is read in full
    and cut at every ``$$$$`` delimiter. Every non-empty piece is written to
    ``<output_folder>/<name>.sdf`` followed by a ``$$$$`` line, where
    ``<name>`` is the first line of the piece with every character other than
    letters, digits, space, ``_`` and ``-`` replaced by ``_``.

    When several molecules map to the same file name within one call, the
    later ones get a numeric suffix (``name_2``, ``name_3``, ...) instead of
    overwriting the earlier file.

    Args:
        input_folder: Directory that is scanned (non-recursively) for .sdf files.
        output_folder: Directory receiving the per-molecule files; created if
            it does not exist.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Sorted so that suffixes for duplicate names are assigned in a
    # reproducible order regardless of how the OS lists the directory.
    sdf_files = sorted(f for f in os.listdir(input_folder) if f.endswith('.sdf'))

    # Lower-cased stems already written during this call.
    taken_stems = set()

    for sdf_file in sdf_files:
        input_path = os.path.join(input_folder, sdf_file)
        with open(input_path, 'r') as file:
            content = file.read()

        # Split molecules using the $$$$ delimiter
        for molecule in content.split("$$$$"):
            # NOTE: strip() also removes leading blank lines, so a record
            # whose title line is blank is named after its first non-blank line.
            record = molecule.strip()
            if not record:  # Skip empty molecules
                continue

            # Get the molecule name (first line)
            molecule_name = record.split('\n', 1)[0].strip()

            # Ensure the filename is valid
            valid_name = "".join(
                c if c.isalnum() or c in (' ', '_', '-') else '_'
                for c in molecule_name
            )
            stem = _unique_stem(valid_name, taken_stems)
            output_file = os.path.join(output_folder, f"{stem}.sdf")

            # Save the molecule into a separate file
            with open(output_file, 'w') as out_file:
                out_file.write(record + "\n$$$$\n")

    print(f"Processing completed. The split files were saved in: {output_folder}")

# Source directory with the .sdf files; results go to its "split" subdirectory
input_folder = r"D:\ccc"
output_folder = os.path.join(input_folder, "split")

# Write each molecule to its own file
split_sdf_files(input_folder=input_folder, output_folder=output_folder)
    

Next, combine the split files into larger SDF files of up to 3000 molecules each.

import os

def _iter_sdf_records(text):
    """Yield each molecule record in *text*, ending with its ``$$$$`` line.

    A record is the run of lines up to and including a line whose stripped
    content is ``$$$$``. Records containing nothing but blank lines are
    dropped. Trailing non-blank lines that have no terminator are yielded as
    a final record with a ``$$$$`` line appended, so the record cannot fuse
    with whatever is written after it.
    """
    pending = []
    for line in text.split("\n"):
        pending.append(line)
        if line.strip() != "$$$$":
            continue
        # Only emit the record if something precedes the terminator.
        if any(part.strip() for part in pending[:-1]):
            yield "\n".join(pending) + "\n"
        pending = []

    if any(part.strip() for part in pending):
        yield "\n".join(pending).rstrip() + "\n$$$$\n"


def _write_group(output_folder, file_counter, records):
    """Write *records* to the numbered output file and report its name."""
    output_filename = f"MP7-Q-{file_counter:04d}_ligands.sdf"
    output_path = os.path.join(output_folder, output_filename)

    with open(output_path, 'w') as out_file:
        out_file.writelines(records)

    print(f"Generated: {output_filename}")


def combine_sdf_files(input_folder, output_folder, group_size=3000):
    """Merge the molecules of all .sdf files in a folder into grouped files.

    Files in *input_folder* whose name ends with ``.sdf`` are read in sorted
    order. Their molecule records are collected and written to
    ``MP7-Q-0001_ligands.sdf``, ``MP7-Q-0002_ligands.sdf``, ... inside
    *output_folder*, *group_size* records per file; the last file holds
    whatever remains.

    Records are counted individually, so an input file containing several
    molecules contributes several records to the group rather than one.

    Args:
        input_folder: Directory that is scanned (non-recursively) for .sdf files.
        output_folder: Directory receiving the combined files; created if it
            does not exist.
        group_size: Number of molecules per output file. A group is written
            when its length equals this value, so a value below 1 never
            triggers a write and everything lands in a single file.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Sorted so the grouping is reproducible regardless of directory order.
    sdf_files = sorted(
        os.path.join(input_folder, f)
        for f in os.listdir(input_folder)
        if f.endswith('.sdf')
    )

    file_counter = 1  # Number used in the next output file name
    group = []        # Records waiting to be written

    for sdf_file in sdf_files:
        with open(sdf_file, 'r') as file:
            content = file.read()

        for record in _iter_sdf_records(content):
            group.append(record)

            # If we've reached the group size, save the combined file
            if len(group) == group_size:
                _write_group(output_folder, file_counter, group)
                file_counter += 1
                group = []  # Reset for the next group

    # Save remaining molecules if any
    if group:
        _write_group(output_folder, file_counter, group)

    print(f"Processing completed. Files saved in: {output_folder}")


# Read the per-molecule files from the "split" directory and write the grouped files here
input_folder = r"D:\ccc\split"
output_folder = r"D:\ccc\Q-all-isomers-3000"

# Merge the molecules using the default group size
combine_sdf_files(input_folder=input_folder, output_folder=output_folder)





Última modificación: