Objective
The goal was to create a Python script for Windows that processes all .sdf files in the D:\ccc directory.
Within each file, molecules are separated by a line containing the delimiter $$$$, and the first line of each molecule record is its name.
The script should split each molecule into a separate file named after the molecule and save these files in the D:\ccc\split directory.
import os
def split_sdf_files(input_folder, output_folder):
    """Split every .sdf file in *input_folder* into one file per molecule.

    Records are delimited by a line consisting of ``$$$$``. The first line of
    a record is its title and becomes the output file name after characters
    other than letters, digits, space, ``_`` and ``-`` are replaced by ``_``.

    Args:
        input_folder: Directory scanned (non-recursively) for ``.sdf`` files;
            the extension is matched case-insensitively.
        output_folder: Directory that receives the per-molecule files. It is
            created when missing.

    Notes:
        * Records are written back unchanged (including an empty title line),
          so the three-line molfile header keeps its layout.
        * A record with an empty title is saved as ``unnamed``.
        * When two records map to the same file name (compared
          case-insensitively, as Windows file systems do), later ones get a
          ``_2``, ``_3``, ... suffix instead of overwriting the first.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Sorted so that the numbering of duplicate names is reproducible;
    # os.listdir() returns entries in arbitrary order.
    sdf_files = sorted(
        f for f in os.listdir(input_folder) if f.lower().endswith('.sdf')
    )

    used_names = set()
    for sdf_file in sdf_files:
        input_path = os.path.join(input_folder, sdf_file)
        if not os.path.isfile(input_path):
            # A directory that merely happens to be called "*.sdf".
            continue

        with open(input_path, 'r') as file:
            for record in _iter_sdf_records(file):
                base_name = _safe_filename(record[0])
                unique_name = _claim_unique_name(base_name, used_names)
                output_file = os.path.join(output_folder, f"{unique_name}.sdf")

                text = "".join(record)
                if not text.endswith("\n"):
                    # Last record of a file that lacks a final newline.
                    text += "\n"
                with open(output_file, 'w') as out_file:
                    out_file.write(text + "$$$$\n")

    print(f"Processing completed. The split files were saved in: {output_folder}")


def _iter_sdf_records(lines):
    """Yield each non-blank SDF record as a list of its lines (delimiter excluded)."""
    record = []
    for line in lines:
        if line.rstrip() == "$$$$":
            if any(part.strip() for part in record):
                yield record
            record = []
        else:
            record.append(line)
    # A trailing record that is not terminated by $$$$ is still a molecule.
    if any(part.strip() for part in record):
        yield record


def _safe_filename(title_line):
    """Turn a record's title line into a file-system-safe base name."""
    title = title_line.strip()
    cleaned = "".join(
        c if c.isalnum() or c in (' ', '_', '-') else '_' for c in title
    )
    return cleaned or "unnamed"


def _claim_unique_name(base_name, used_names):
    """Return *base_name*, suffixed if needed, and register it in *used_names*."""
    candidate = base_name
    attempt = 1
    while candidate.lower() in used_names:
        attempt += 1
        candidate = f"{base_name}_{attempt}"
    used_names.add(candidate.lower())
    return candidate
# Run the split only when this file is executed as a script, so that importing
# split_sdf_files from another module does not start processing files.
if __name__ == "__main__":
    # Define the input and output paths
    input_folder = r"D:\ccc"
    output_folder = os.path.join(input_folder, "split")
    # Run the function
    split_sdf_files(input_folder, output_folder)
And now combine the split files into larger files of 3000 molecules each...
import os
def combine_sdf_files(input_folder, output_folder, group_size=3000, *,
                      output_prefix="MP7-Q-"):
    """Concatenate the .sdf files of *input_folder* into grouped .sdf files.

    Each non-empty input file is counted as one molecule (the layout produced
    by a one-molecule-per-file split); a file holding several records is still
    counted once. Output files are named
    ``{output_prefix}{number:04d}_ligands.sdf`` with numbering starting at 1.

    Args:
        input_folder: Directory scanned (non-recursively) for ``.sdf`` files;
            the extension is matched case-insensitively.
        output_folder: Directory that receives the combined files. It is
            created when missing.
        group_size: Number of input files written to each combined file.
        output_prefix: Leading part of every generated file name.

    Raises:
        ValueError: If *group_size* is smaller than 1.
    """
    if group_size < 1:
        raise ValueError(f"group_size must be at least 1, got {group_size}")

    os.makedirs(output_folder, exist_ok=True)

    # Sorted so that the same inputs always land in the same groups;
    # os.listdir() returns entries in arbitrary order.
    sdf_files = sorted(
        os.path.join(input_folder, f)
        for f in os.listdir(input_folder)
        if f.lower().endswith('.sdf')
    )

    file_counter = 1
    pending = []
    for sdf_file in sdf_files:
        if not os.path.isfile(sdf_file):
            continue
        with open(sdf_file, 'r') as file:
            # Only trailing whitespace is removed: a leading blank line is an
            # empty molecule title and belongs to the molfile header.
            content = file.read().rstrip()
        if not content:
            continue  # Skip empty files

        if content.rsplit("\n", 1)[-1] != "$$$$":
            # Without its terminator this record would merge with the next one.
            content += "\n$$$$"
        pending.append(content + "\n")

        if len(pending) == group_size:
            _write_group(output_folder, output_prefix, file_counter, pending)
            file_counter += 1
            pending = []

    # Save remaining molecules if any
    if pending:
        _write_group(output_folder, output_prefix, file_counter, pending)

    print(f"Processing completed. Files saved in: {output_folder}")


def _write_group(output_folder, output_prefix, file_counter, pieces):
    """Write one combined file from *pieces* and report its name."""
    output_filename = f"{output_prefix}{file_counter:04d}_ligands.sdf"
    output_path = os.path.join(output_folder, output_filename)
    with open(output_path, 'w') as out_file:
        out_file.write("".join(pieces))
    print(f"Generated: {output_filename}")
# Run the combine step only when this file is executed as a script, so that
# importing combine_sdf_files from another module does not start processing.
if __name__ == "__main__":
    # Define the input and output paths
    input_folder = r"D:\ccc\split"
    output_folder = r"D:\ccc\Q-all-isomers-3000"
    # Run the function
    combine_sdf_files(input_folder, output_folder)