Source code for selfies.encoder

from selfies.exceptions import EncoderError, SMILESParserError
from selfies.grammar_rules import get_selfies_from_index
from selfies.utils.linked_list import SinglyLinkedList
from selfies.utils.smiles_utils import (
    atom_to_smiles,
    bond_to_smiles,
    smiles_to_mol
)


[docs]def encoder(smiles: str, strict: bool = True) -> str: """Translates a SMILES string into its corresponding SELFIES string. This translation is deterministic and does not depend on the current semantic constraints. Additionally, it preserves the atom order of the input SMILES string; thus, one could generate randomized SELFIES strings by generating randomized SMILES strings, and then translating them. By nature of SELFIES, it is impossible to represent molecules that violate the current semantic constraints as SELFIES strings. Thus, we provide the ``strict`` flag to guard against such cases. If ``strict=True``, then this function will raise a :class:`selfies.EncoderError` if the input SMILES string represents a molecule that violates the semantic constraints. If ``strict=False``, then this function will not raise any error; however, calling :func:`selfies.decoder` on a SELFIES string generated this way will *not* be guaranteed to recover a SMILES string representing the original molecule. :param smiles: the SMILES string to be translated. It is recommended to use RDKit to check that the strings passed into this function are valid SMILES strings. :param strict: if ``True``, this function will check that the input SMILES string obeys the semantic constraints. Defaults to ``True``. :return: a SELFIES string translated from the input SMILES string. :raises EncoderError: if the input SMILES string is invalid, cannot be kekulized, or violates the semantic constraints with ``strict=True``. :Example: >>> import selfies as sf >>> sf.encoder("C=CF") '[C][=C][F]' .. note:: This function does not currently support SMILES with: * The wildcard symbol ``*``. * The quadruple bond symbol ``$``. * Chirality specifications other than ``@`` and ``@@``. * Ring bonds across a dot symbol (e.g. ``c1cc([O-].[Na+])ccc1``) or ring bonds between atoms that are over 4000 atoms apart. Although SELFIES does not have aromatic symbols, this function *does* support aromatic SMILES strings by internally kekulizing them before translation. """ try: mol = smiles_to_mol(smiles) except SMILESParserError as err: err_msg = "failed to parse input\n\tSMILES: {}".format(smiles) raise EncoderError(err_msg) from err if not mol.kekulize(): err_msg = "kekulization failed\n\tSMILES: {}".format(smiles) raise EncoderError(err_msg) if strict: _check_bond_constraints(mol, smiles) # invert chirality of atoms where necessary, # such that they are restored when the SELFIES is decoded for atom in mol.get_atoms(): if ((atom.chirality is not None) and mol.has_out_ring_bond(atom.index) and _should_invert_chirality(mol, atom)): atom.invert_chirality() fragments = [] for root in mol.get_roots(): derived = list(_fragment_to_selfies(mol, None, root)) fragments.append("".join(derived)) return ".".join(fragments)
def _check_bond_constraints(mol, smiles): errors = [] for atom in mol.get_atoms(): bond_cap = atom.bonding_capacity bond_count = mol.get_bond_count(atom.index) if bond_count > bond_cap: errors.append((atom_to_smiles(atom), bond_count, bond_cap)) if errors: err_msg = "input violates the currently-set semantic constraints\n" \ "\tSMILES: {}\n" \ "\tErrors:\n".format(smiles) for e in errors: err_msg += "\t[{:} with {} bond(s) - " \ "a max. of {} bond(s) was specified]\n".format(*e) raise EncoderError(err_msg) def _should_invert_chirality(mol, atom): out_bonds = mol.get_out_dirbonds(atom.index) # 1. rings whose right number are bonded to this atom (e.g. ...1...X1) # 2. rings whose left number are bonded to this atom (e.g. X1...1...) # 3. branches and other (e.g. X(...)...) partition = [[], [], []] for i, bond in enumerate(out_bonds): if not bond.ring_bond: partition[2].append(i) elif bond.src < bond.dst: partition[1].append(i) else: partition[0].append(i) partition[1].sort(key=lambda x: out_bonds[x].dst) # construct permutation perm = partition[0] + partition[1] + partition[2] count = 0 for i in range(len(perm)): for j in range(i + 1, len(perm)): if perm[i] > perm[j]: count += 1 return count % 2 != 0 # if odd permutation, should invert chirality def _fragment_to_selfies(mol, bond_into_root, root): derived = SinglyLinkedList() bond_into_curr, curr = bond_into_root, root while True: curr_atom = mol.get_atom(curr) derived.append(_atom_to_selfies(bond_into_curr, curr_atom)) out_bonds = mol.get_out_dirbonds(curr) for i, bond in enumerate(out_bonds): if bond.ring_bond: if bond.src < bond.dst: continue rev_bond = mol.get_dirbond(src=bond.dst, dst=bond.src) ring_len = bond.src - bond.dst Q_as_symbols = get_selfies_from_index(ring_len - 1) ring_symbol = "[{}Ring{}]".format( _ring_bonds_to_selfies(rev_bond, bond), len(Q_as_symbols) ) derived.append(ring_symbol) for symbol in Q_as_symbols: derived.append(symbol) elif i == len(out_bonds) - 1: bond_into_curr, curr = bond, bond.dst else: branch = _fragment_to_selfies(mol, bond, bond.dst) Q_as_symbols = get_selfies_from_index(len(branch) - 1) branch_symbol = "[{}Branch{}]".format( _bond_to_selfies(bond, show_stereo=False), len(Q_as_symbols) ) derived.append(branch_symbol) for symbol in Q_as_symbols: derived.append(symbol) derived.extend(branch) # end of chain if (not out_bonds) or out_bonds[-1].ring_bond: break return derived def _bond_to_selfies(bond, show_stereo=True): if not show_stereo and (bond.order == 1): return "" return bond_to_smiles(bond) def _ring_bonds_to_selfies(lbond, rbond): assert lbond.order == rbond.order if (lbond.order != 1) or all(b.stereo is None for b in (lbond, rbond)): return _bond_to_selfies(lbond, show_stereo=False) else: bond_char = "-" if (lbond.stereo is None) else lbond.stereo bond_char += "-" if (rbond.stereo is None) else rbond.stereo return bond_char def _atom_to_selfies(bond, atom): assert not atom.is_aromatic bond_char = "" if (bond is None) else _bond_to_selfies(bond) return "[{}{}]".format(bond_char, atom_to_smiles(atom, brackets=False))