Collection of Python Functions for Organic Chemistry (Chemoinformatics; Structural Bioinformatics)
Recommended posts : 【Organic Chemistry】 Organic Chemistry Index, 【Python】 Collection of Useful Python Functions
1. Nomenclature
2. Drawing
3. Amino Acid
4. Spectroscopy
1. Nomenclature
⑴ SMILES (Simplified Molecular-Input Line Entry System): A short ASCII string representation.
① Double bonds are represented by ‘=’, and triple bonds by ‘#’.
② For cyclic compounds, numbers such as 1, 2, … are assigned to indicate that the ends of a linear molecule are connected to form a ring.
○ Example: CN1C=NC2=C1C(=O)N(C(=O)N2C)C
③ ‘C’ represents a general carbon atom, whereas ‘c’ represents an aromatic carbon atom.
○ C1CCCCC1: Cyclohexane
○ c1ccccc1: Benzene
⑵ Code for converting organic compounds into SMILES
rdkit.Chem.MolToSmiles(Chem.MolFromFASTA(sequence, flavor = 1))
"""
flavor: (optional)
0 Protein, L amino acids (default)
1 Protein, D amino acids
2 RNA, no cap
3 RNA, 5’ cap
4 RNA, 3’ cap
5 RNA, both caps
6 DNA, no cap
7 DNA, 5’ cap
8 DNA, 3’ cap
9 DNA, both caps
"""
⑶ SMILES, IUPAC Interconversion Code : It uses a transformer model.
# reference: https://github.com/Kohulan/Smiles-TO-iUpac-Translator
! pip install STOUT-pypi
! pip install git+https://github.com/Kohulan/Smiles-TO-iUpac-Translator.git
from STOUT import translate_forward, translate_reverse
# SMILES to IUPAC name translation
SMILES = "CN1C=NC2=C1C(=O)N(C(=O)N2C)C"
IUPAC_name = translate_forward(SMILES)
print("IUPAC name of "+SMILES+" is: "+IUPAC_name)
# IUPAC name to SMILES translation
IUPAC_name = "1,3,7-trimethylpurine-2,6-dione"
SMILES = translate_reverse(IUPAC_name)
print("SMILES of "+IUPAC_name+" is: "+SMILES)
⑷ How to find out the IUPAC nomenclature given any chemical formula (to be updated in YouTube)
① Step 1. Try the SMILES-to-drawing function several times to find the SMILES code representing the given compound.
② Step 2. Execute the SMILES-to-IUPAC function to obtain the final nomenclature.
⑸ Code for obtaining molecular weight from SMILES
from rdkit import Chem
from rdkit.Chem import Descriptors
def calculate_molecular_weight(smiles):
molecule = Chem.MolFromSmiles(smiles)
return Descriptors.ExactMolWt(molecule)
# Example usage
smiles_code = "C1=CC=C(C=C1)O" # SMILES code for phenol
molecular_weight = calculate_molecular_weight(smiles_code)
print(f"Molecular Weight: {molecular_weight}")
⑹ Code to determine aromaticity from SMILES
from rdkit import Chem
def is_aromatic(smiles):
molecule = Chem.MolFromSmiles(smiles)
if molecule is None:
return False
return any(atom.GetIsAromatic() for atom in molecule.GetAtoms())
# Example usage
cyclohexane = "C1CCCCC1" # cyclohexane
benzene = "c1ccccc1" # benzene
imidazole = "C1=CN=CN1" # 1H-imidazole
print(f"The {cyclohexane} is {'aromatic' if is_aromatic(cyclohexane) else 'not aromatic'}")
print(f"The {benzene} is {'aromatic' if is_aromatic(benzene) else 'not aromatic'}")
print(f"The {imidazole} is {'aromatic' if is_aromatic(imidazole) else 'not aromatic'}")
⑺ Code for calculating dipole moment from SMILES
# conda install -c psi4 psi4
import psi4
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
def calculate_dipole_moment(smiles):
# Convert SMILES to molecule
mol = Chem.MolFromSmiles(smiles)
# Add Hydrogens
mol = Chem.AddHs(mol)
# Generate 3D coordinates
AllChem.EmbedMolecule(mol, AllChem.ETKDG())
# Extract coordinates
conf = mol.GetConformer()
xyz = ''
for atom in mol.GetAtoms():
pos = conf.GetAtomPosition(atom.GetIdx())
xyz += f"{atom.GetSymbol()} {pos.x} {pos.y} {pos.z}\n"
# Set up Psi4
psi4.set_memory('500 MB')
psi4.set_options({'basis': 'sto-3g'})
# Calculate dipole moment using Psi4
psi4_mol = psi4.geometry(xyz)
psi4.energy('scf')
dipole_moment = psi4.variable('SCF DIPOLE')
return dipole_moment
### Example usage
smiles_code = "CCO" # Example SMILES code for ethanol
dipole_moment = calculate_dipole_moment(smiles_code)
print(f"Dipole Moment: {dipole_moment} Debye")
# Dipole Moment: [ 0.04250251 0.20600936 -0.52850913] Debye
dipole_vector = np.array([0.04250251, 0.20600936, -0.52850913])
magnitude = np.linalg.norm(dipole_vector)
print(f"Magnitude of Dipole Moment: {magnitude} Debye")
# Magnitude of Dipole Moment: 0.5688305725409514 Debye
⑻ A function to obtain the boiling point (bp), melting point (mp), and critical temperature from SMILES
# reference: https://thermo.readthedocs.io/thermo.chemical.html
from thermo.chemical import Chemical
N2 = Chemical('Nitrogen')
print(N2.Tm, N2.Tb, N2.Tc) # melting, boiling, and critical points [K]
## 63.15 77.3549950205 126.192
molecule = Chemical('CC(C)C')
print(molecule.Tm, molecule.Tb, molecule.Tc) # melting, boiling, and critical points [K]
## 124.2 261.401014643 407.81
molecule_ = Chemical('2-methylpropane')
print(molecule_.Tm, molecule_.Tb, molecule_.Tc) # melting, boiling, and critical points [K]
## 124.2 261.401014643 407.81
① It operates on a search-based approach and not all compounds are targeted.
② Various machine learning models are being introduced to improve this.
⑼ Model for predicting ADMET (Absorption, Distribution, Metabolism, Excretion, Toxicity) from SMILES : ADMET-AI
2. Drawing
⑴ Drawing molecular formulas for organic compounds (e.g., peracetic acid)
from rdkit import Chem
from rdkit.Chem import Draw
# Define the SMILES strings for peractic acid
smiles = 'C=CC(=O)O'
# Convert the SMILES strings to RDKit molecule objects
molecule = Chem.MolFromSmiles(smiles)
# Draw the molecules without saving
Draw.MolToImage(molecule)
# Draw the molecules with saving
Draw.MolToFile(molecule, 'peractic_acid.png')
⑵ Drawing electron density maps for organic compounds (ver. 1) (e.g., peracetic acid)
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.Draw import SimilarityMaps
# Generate a 3D structure
molecule = Chem.MolFromSmiles('CC(=O)OO')
molecule_3d = Chem.AddHs(molecule)
AllChem.EmbedMolecule(molecule_3d, AllChem.ETKDG())
AllChem.MMFFOptimizeMolecule(molecule_3d)
# Calculate Gasteiger charges
AllChem.ComputeGasteigerCharges(molecule_3d)
# Function to get atom charges
def GetAtomCharges(mol):
charges = [float(mol.GetAtomWithIdx(i).GetProp('_GasteigerCharge')) for i in range(mol.GetNumAtoms())]
return charges
# Draw electrostatic potential map
fig = SimilarityMaps.GetSimilarityMapFromWeights(molecule_3d, GetAtomCharges(molecule_3d), colorMap='jet', contourLines=10)
⑶ Drawing electron density maps for organic compounds (ver. 2) (e.g., peracetic acid)
# Refernece: https://rdkit.readthedocs.io/en/latest/Cookbook.html
## STEP 1. make a random forest model
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from sklearn.ensemble import RandomForestClassifier
import numpy
# generate four molecules
m1 = Chem.MolFromSmiles('c1ccccc1')
m2 = Chem.MolFromSmiles('c1ccccc1CC')
m3 = Chem.MolFromSmiles('c1ccncc1')
m4 = Chem.MolFromSmiles('c1ccncc1CC')
mols = [m1, m2, m3, m4]
# generate fingeprints: Morgan fingerprint with radius 2
fps = [AllChem.GetMorganFingerprintAsBitVect(m, 2) for m in mols]
# convert the RDKit explicit vectors into numpy arrays
np_fps = []
for fp in fps:
arr = numpy.zeros((1,))
DataStructs.ConvertToNumpyArray(fp, arr)
np_fps.append(arr)
# get a random forest classifiert with 100 trees
rf = RandomForestClassifier(n_estimators=100, random_state=1123)
# train the random forest
# with the first two molecules being actives (class 1) and
# the last two being inactives (class 0)
ys_fit = [1, 1, 0, 0]
rf.fit(np_fps, ys_fit)
# use the random forest to predict a new molecule
m5 = Chem.MolFromSmiles('c1ccccc1O')
fp = numpy.zeros((1,))
DataStructs.ConvertToNumpyArray(AllChem.GetMorganFingerprintAsBitVect(m5, 2), fp)
print(rf.predict((fp,)))
print(rf.predict_proba((fp,)))
## STEP 2. run the random forest model for the input
from rdkit.Chem.Draw import SimilarityMaps
# helper function
def getProba(fp, predictionFunction):
return predictionFunction((fp,))[0][1]
m5 = Chem.MolFromSmiles('CC(=O)OO')
fig, maxweight = SimilarityMaps.GetSimilarityMapForModel(m5, SimilarityMaps.GetMorganFingerprint, lambda x: getProba(x, rf.predict_proba))
⑷ Drawing three-dimensional molecular structures of organic compounds.
from rdkit import Chem
from rdkit.Chem import AllChem
import py3Dmol
def draw_3d_molecule(smiles):
# Convert SMILES to RDKit molecule
mol = Chem.MolFromSmiles(smiles)
if mol is None:
print("Invalid SMILES code.")
return
# Generate 3D coordinates for the molecule
mol = Chem.AddHs(mol) # Add hydrogens
AllChem.EmbedMolecule(mol, AllChem.ETKDG()) # Embed molecule in 3D space
# Convert RDKit molecule to 3Dmol.js viewable format
mb = Chem.MolToMolBlock(mol)
# Visualization with Py3Dmol
viewer = py3Dmol.view(width=400, height=300)
viewer.addModel(mb, 'mol')
viewer.setStyle({'stick': {}})
viewer.zoomTo()
return viewer.show()
# Example usage
smiles_code = "CCO" # Ethanol
draw_3d_molecule(smiles_code)
① Review 1: In methanol, the two methyl groups are in a staggered orientation to minimize steric hindrance, whereas in ethanol, this is not the case due to intramolecular hydrogen bonding.
② Review 2: The structure is also well implemented in the following conjugated ring compounds.
③ This is the charm of chemoinformatics, where new knowledge can be created solely with machine learning models.
3. Amino Acid
⑴ A function of converting a sequence into an amino acid sequence
from Bio.Seq import Seq
# Assuming 'sequence' is your string of nucleotides:
sequence = "ACTCATTCTCCCCAGACGCCAAGGATGGTGGTCATGGCGCCCCGAACCCTCTTCCTGCTGCTCTCGGGGGCCCTGACCCTGACCGAGACCTGGGCGG" # truncated for brevity
# Create a sequence object
seq_obj = Seq(sequence)
# Translate the sequence
amino_acid_sequence = seq_obj.translate(to_stop=True)
# Print the amino acid sequence
print(amino_acid_sequence)
⑵ Function to generate PBD files regarding amino acid structure from amino acid sequence using AlphaFold2
4. Spectroscopy
⑴ SMILES-to-IR
⑵ SMILES-to-NMR
⑶ NMR-to-structure
Input : 2023.11.30 02:40