From 41f729e9588720e1c6cc7696b0f9881941cb526e Mon Sep 17 00:00:00 2001
From: Ludwig Schneider <ludwigschneider@uchicago.edu>
Date: Tue, 17 Dec 2024 14:26:41 -0600
Subject: [PATCH] simplify AST building

---
 src/gbigsmiles/__init__.py           | 100 ++++++++++++++--------
 src/gbigsmiles/atom.py               |   6 +-
 src/gbigsmiles/bond.py               |  25 ++++++
 src/gbigsmiles/data/g-bigsmiles.lark |  33 ++++----
 src/gbigsmiles/exception.py          |  46 +++-------
 src/gbigsmiles/molecule.py           |   5 +-
 src/gbigsmiles/parser.py             |  16 +++-
 src/gbigsmiles/system.py             |   7 +-
 src/gbigsmiles/transformer.py        | 120 +++------------------------
 src/gbigsmiles/util.py               |  12 +++
 tests/test_core.py                   |  20 -----
 11 files changed, 171 insertions(+), 219 deletions(-)
 delete mode 100644 tests/test_core.py

diff --git a/src/gbigsmiles/__init__.py b/src/gbigsmiles/__init__.py
index d5b5d1e..f46f7bd 100644
--- a/src/gbigsmiles/__init__.py
+++ b/src/gbigsmiles/__init__.py
@@ -15,49 +15,79 @@
         "Please make sure to install this module correctly via setuptools with setuptools_scm activated to generate a `_version.py` file."
     ) from exc
 
-from .atom import Atom
-from .bond import BondDescriptor
-from .core import _GLOBAL_RNG, BigSMILESbase, reaction_graph_to_dot_string
-from .distribution import Distribution, FlorySchulz, Gauss
-from .exception import (
-    GBigSMILESError,
-    GBigSMILESInitNotEnoughError,
-    GBigSMILESInitTooMuchError,
-    GBigSMILESParsingError,
+from .atom import (
+    AliphaticOrganic,
+    AromaticOrganic,
+    AromaticSymbol,
+    Atom,
+    AtomCharge,
+    AtomClass,
+    AtomSymbol,
+    BracketAtom,
+    Chiral,
+    HCount,
+    Isotope,
 )
-from .graph_generate import AtomGraph
-from .mixture import Mixture
-from .mol_prob import get_ensemble_prob
-from .molecule import Molecule
-from .stochastic import Stochastic
-from .system import System
-from .token import SmilesToken
-from .transformer import GBigSMILESTransformer
-from .util import camel_to_snake, snake_to_camel
+from .bond import (
+    BondDescriptor,
+    BondDescriptorGeneration,
+    BondDescriptorSymbol,
+    BondDescriptorSymbolIdx,
+    BondSymbol,
+    InnerBondDescriptor,
+    RingBond,
+    SimpleBondDescriptor,
+    TerminalBondDescriptor,
+)
+from .core import BigSMILESbase
+from .distribution import (
+    FlorySchulz,
+    Gauss,
+    LogNormal,
+    Poisson,
+    StochasticDistribution,
+    Uniform,
+)
+from .parser import get_global_parser
+from .transformer import GBigSMILESTransformer, get_global_transformer
+from .util import camel_to_snake, get_global_rng, snake_to_camel
+
+# from .graph_generate import AtomGraph
+# from .mixture import Mixture
+# from .mol_prob import get_ensemble_prob
+# from .molecule import Molecule
+# from .stochastic import Stochastic
+# from .system import System
+# from .token import SmilesToken
 
 __all__ = [
     "__version__",
     "version_tuple",
+    "Atom",
+    "BracketAtom",
+    "Isotope",
+    "AtomSymbol",
+    "Chiral",
+    "HCount",
+    "AtomCharge",
+    "AtomClass",
+    "AromaticSymbol",
+    "AliphaticOrganic",
+    "AromaticOrganic",
+    "BondSymbol",
+    "RingBond",
+    "BondDescriptorSymbol",
+    "BondDescriptorSymbolIdx",
+    "BondDescriptorGeneration",
+    "InnerBondDescriptor",
     "BondDescriptor",
-    "_GLOBAL_RNG",
+    "SimpleBondDescriptor",
+    "TerminalBondDescriptor",
     "BigSMILESbase",
-    "reaction_graph_to_dot_string",
-    "Distribution",
-    "FlorySchulz",
-    "Gauss",
-    "Atom",
-    "AtomGraph",
-    "Mixture",
-    "get_ensemble_prob",
-    "Molecule",
-    "Stochastic",
-    "System",
-    "SmilesToken",
-    "GBigSMILESError",
-    "GBigSMILESParsingError",
-    "GBigSMILESInitNotEnoughError",
-    "GBigSMILESInitTooMuchError",
     "camel_to_snake",
     "snake_to_camel",
+    "get_global_rng",
     "GBigSMILESTransformer",
+    "get_global_transformer",
+    "get_global_parser",
 ]
diff --git a/src/gbigsmiles/atom.py b/src/gbigsmiles/atom.py
index 35b3d83..f03ce80 100644
--- a/src/gbigsmiles/atom.py
+++ b/src/gbigsmiles/atom.py
@@ -5,7 +5,7 @@
 import lark
 
 from .core import BigSMILESbase
-from .exception import GBigSMILESParsingError, GBigSMILESTooManyTokens
+from .exception import ParsingError, TooManyTokens
 
 
 class Atom(BigSMILESbase):
@@ -33,7 +33,7 @@ def __init__(self, children: list):
         for child in self._children:
             if isinstance(child, AtomSymbol):
                 if self._symbol is not None:
-                    raise GBigSMILESTooManyTokens(self.__class__, self._symbol, child)
+                    raise TooManyTokens(self.__class__, self._symbol, child)
                 self._symbol = child
 
     def generate_string(self, extension):
@@ -294,7 +294,7 @@ def __init__(self, children: list):
         super().__init__(children)
 
         if str(self._children[0]) != "H":
-            raise GBigSMILESParsingError(self._children[0])
+            raise ParsingError(self._children[0])
 
         if len(self._children) > 1:
             self._count = int(self._children[1])
diff --git a/src/gbigsmiles/bond.py b/src/gbigsmiles/bond.py
index d71481a..9e7c3ca 100644
--- a/src/gbigsmiles/bond.py
+++ b/src/gbigsmiles/bond.py
@@ -2,8 +2,13 @@
 # Copyright (c) 2022: Ludwig Schneider
 # See LICENSE for details
 
+try:
+    from typing import Self
+except ImportError:
+    from typing_extensions import Self
 
 from .core import BigSMILESbase
+from .parser import get_global_parser
 
 
 def _create_compatible_bond_text(bond):
@@ -309,6 +314,12 @@ def transition(self):
 
 
 class BondDescriptor(BigSMILESbase):
+    @classmethod
+    def make(cls, text: str) -> Self:  # Dispatch to SimpleBondDescriptor or TerminalBondDescriptor based on `text`.
+        if "$" in text or "<" in text or ">" in text:  # any of "$", "<", ">" selects the simple descriptor
+            return SimpleBondDescriptor.make(text)
+        return TerminalBondDescriptor.make(text)  # everything else is handed to the terminal descriptor
+
     @property
     def symbol(self):
         return None
@@ -334,6 +345,13 @@ def __init__(self, children):
             if isinstance(child, InnerBondDescriptor):
                 self._inner_bond_descriptor = child
 
+    @classmethod
+    def make(cls, text: str) -> Self:
+        # We use BigSMILESbase.make.__func__ to get the underlying function of the class method,
+        # then call it with cls as the first argument to ensure child typing.
+        # We do not want to call BondDescriptor's make function, because it dispatches back to here.
+        return BigSMILESbase.make.__func__(cls, text)
+
     def generate_string(self, extension):
         return "[" + self._inner_bond_descriptor.generate_string(extension) + "]"
 
@@ -369,6 +387,13 @@ def __init__(self, children):
             if isinstance(child, BondDescriptorGeneration):
                 self._generation = child
 
+    @classmethod
+    def make(cls, text: str) -> Self:
+        # We use BigSMILESbase.make.__func__ to get the underlying function of the class method,
+        # then call it with cls as the first argument to ensure child typing.
+        # We do not want to call BondDescriptor's make function, because it dispatches back to here.
+        return BigSMILESbase.make.__func__(cls, text)
+
     @property
     def weight(self):
         return self._generation.weight
diff --git a/src/gbigsmiles/data/g-bigsmiles.lark b/src/gbigsmiles/data/g-bigsmiles.lark
index ad324dd..bdb760e 100644
--- a/src/gbigsmiles/data/g-bigsmiles.lark
+++ b/src/gbigsmiles/data/g-bigsmiles.lark
@@ -40,8 +40,8 @@ bond_symbol: "-"
   | "/"
   | "\\"
 
-ring_bond: bond_symbol? DIGIT
-  | bond_symbol? "%" DIGIT? DIGIT
+ring_bond: bond_symbol? INT  // NOTE(review): INT accepts multi-digit runs, so "C12" may be read as ring bond 12 instead of ring bonds 1 and 2 -- confirm
+  | bond_symbol? "%" INT  // NOTE(review): no longer limited to one or two digits after "%" -- confirm this is intended
 
 _branched_atom: _atom_stand_in ring_bond* branch*
 
@@ -77,12 +77,18 @@ bond_descriptor: simple_bond_descriptor | ladder_bond_descriptor | non_covalent_
 terminal_bond_descriptor: "[" bond_descriptor_symbol_idx? bond_descriptor_generation? "]"
 
 stochastic_generation: "|" stochastic_distribution "|"
-stochastic_distribution: "flory_schulz(" WS_INLINE* NUMBER WS_INLINE* ")"
-  | "schulz_zimm(" WS_INLINE* NUMBER WS_INLINE* "," WS_INLINE* NUMBER WS_INLINE* ")"
-  | "gauss(" WS_INLINE* NUMBER WS_INLINE* "," WS_INLINE* NUMBER WS_INLINE* ")"
-  | "uniform(" WS_INLINE* NUMBER WS_INLINE* "," WS_INLINE* NUMBER WS_INLINE* ")"
-  | "log_normal(" WS_INLINE* NUMBER WS_INLINE* "," WS_INLINE* NUMBER WS_INLINE* ")"
-  | "poisson(" WS_INLINE* NUMBER WS_INLINE* ")"
+flory_schulz: "flory_schulz(" WS_INLINE* NUMBER WS_INLINE* ")"
+schulz_zimm: "schulz_zimm(" WS_INLINE* NUMBER WS_INLINE* "," WS_INLINE* NUMBER WS_INLINE* ")"
+gauss: "gauss(" WS_INLINE* NUMBER WS_INLINE* "," WS_INLINE* NUMBER WS_INLINE* ")"
+uniform: "uniform(" WS_INLINE* NUMBER WS_INLINE* "," WS_INLINE* NUMBER WS_INLINE* ")"
+log_normal: "log_normal(" WS_INLINE* NUMBER WS_INLINE* "," WS_INLINE* NUMBER WS_INLINE* ")"
+poisson: "poisson(" WS_INLINE* NUMBER WS_INLINE* ")"
+stochastic_distribution: flory_schulz
+  | schulz_zimm
+  | gauss
+  | uniform
+  | log_normal
+  | poisson
 
 _unary_index_operator: "!"
 _binary_index_operator: "~" | "&"
@@ -94,10 +100,10 @@ _non_covalent_key_value_pair: WS_INLINE* "," WS_INLINE* _printable_character+ "=
 _non_covalent_context: WS_INLINE* "|" WS_INLINE* _index_expression _non_covalent_key_value_pair*
 
 h_count: "H"
-  | "H" DIGIT
+  | "H" INT
 
-atom_charge: "-" DIGIT?
-  | "+" DIGIT?
+atom_charge: "-" INT?
+  | "+" INT?
   | "--"
   | "++"
 
@@ -117,7 +123,6 @@ end_group.-1: ";" WS_INLINE* smiles _monomer_list*
 %import common.INT
 %import common.SIGNED_NUMBER
 %import common.NUMBER
-%import common.DIGIT
 %import common.WS
 %import common.WS_INLINE
 
@@ -152,8 +157,8 @@ chiral: "@"
   | "@SP1"
   | "@SP2"
   | "@SP3"
-  | "@TB" DIGIT? DIGIT
-  | "@OH" DIGIT? DIGIT
+  | "@TB" INT
+  | "@OH" INT
 
 _element_symbols: "H"
   | "He"
diff --git a/src/gbigsmiles/exception.py b/src/gbigsmiles/exception.py
index f523de2..c901532 100644
--- a/src/gbigsmiles/exception.py
+++ b/src/gbigsmiles/exception.py
@@ -11,7 +11,7 @@ class GBigSMILESError(Exception):
     pass
 
 
-class GBigSMILESParsingError(GBigSMILESError):
+class ParsingError(GBigSMILESError):
     """
     Parsing the Grammar went in an unanticipated manner.
     Please report bug with input string.
@@ -24,39 +24,7 @@ def __str__(self):
         return f"Unanticipated error while parsing. Please report and provide the input string. Token: {self.token} start: {self.token.start_pos}"
 
 
-class GBigSMILESInitNotEnoughError(GBigSMILESError):
-    """
-    GBigSMILES classes usually need to be initialized either via text,
-    or as part of parsing a different string.
-
-    If this isn't followed, this exception is raise.
-    Initialize the elements of G-BigSMILES with (part of) a G-BigSMILES string.
-    """
-
-    def __init__(self, class_name):
-        self.class_name = class_name
-
-    def __str__(self):
-        return f"Attempt to initialize {self.class_name} without sufficient arguments. Initialize objects of {self.class_name} by passing (part of) a G-BigSMILES string."
-
-
-class GBigSMILESInitTooMuchError(GBigSMILESError):
-    """
-    GBigSMILES classes usually need to be initialized either via text,
-    or as part of parsing a different string, but not both.
-
-    If this isn't followed, this exception is raise.
-    Initialize the elements of G-BigSMILES with (part of) a G-BigSMILES string.
-    """
-
-    def __init__(self, class_name):
-        self.class_name = class_name
-
-    def __str__(self):
-        return f"Attempt to initialize {self.class_name} with tree and text arguments. Initialize objects of {self.class_name} by passing (part of) a G-BigSMILES string."
-
-
-class GBigSMILESTooManyTokens(GBigSMILESError):
+class TooManyTokens(ParsingError):
     def __init__(self, class_name, existing_token, new_token):
         self.class_name = class_name
         self.existing_token = existing_token
@@ -67,3 +35,13 @@ def __str__(self):
         string += f"The existing token is {self.existing_token} which conflicts with the new "
         string += f"token {self.new_token}. Most likely in implementation error, please report."
         return string
+
+
+class UnknownDistribution(GBigSMILESError):  # Error for a distribution text that is not recognized.
+    def __init__(self, distribution_text: str):
+        self.distribution_text = distribution_text  # offending text, echoed back in the message below
+
+    def __str__(self):
+        string = f"GBigSMILES a distribution with the following text {self.distribution_text} is unknown."
+        string += " Typo or not implemented distribution."
+        return string
diff --git a/src/gbigsmiles/molecule.py b/src/gbigsmiles/molecule.py
index 23ce75b..8ff5a35 100644
--- a/src/gbigsmiles/molecule.py
+++ b/src/gbigsmiles/molecule.py
@@ -7,11 +7,12 @@
 import networkx as nx
 
 from .bond import _create_compatible_bond_text
-from .core import _GLOBAL_RNG, BigSMILESbase
+from .core import BigSMILESbase
 from .mixture import Mixture
 from .stochastic import Stochastic
 from .stochastic_atom_graph import StochasticAtomGraph
 from .token import SmilesToken
+from .util import get_global_rng
 
 
 class Molecule(BigSMILESbase):
@@ -144,7 +145,7 @@ def generate_string(self, extension):
             string += self.mixture.generate_string(extension)
         return string
 
-    def generate(self, prefix=None, rng=_GLOBAL_RNG):
+    def generate(self, prefix=None, rng=get_global_rng()):  # NOTE(review): the default is evaluated once at definition time, so the global RNG is created when this module is imported
         my_mol = prefix
         for element in self._elements:
             my_mol = element.generate(my_mol, rng)
diff --git a/src/gbigsmiles/parser.py b/src/gbigsmiles/parser.py
index d3424ca..331526d 100644
--- a/src/gbigsmiles/parser.py
+++ b/src/gbigsmiles/parser.py
@@ -30,6 +30,7 @@ def _make_parser(filename=None, start_tokens=None):
             "ladder_bond_descriptor",
             "non_covalent_bond_descriptor",
             "bond_descriptor",
+            "simple_bond_descriptor",
             "terminal_bond_descriptor",
             "stochastic_generation",
             "stochastic_distribution",
@@ -41,9 +42,22 @@ def _make_parser(filename=None, start_tokens=None):
             "atom_symbol",
             "aromatic_symbol",
             "bracket_atom",
+            "flory_schulz",
+            "uniform",
+            "schulz_zimm",
+            "log_normal",
+            "gauss",  # NOTE(review): the grammar also defines a "poisson" rule, which is not added as a start rule here -- confirm
         ]
     parser = Lark(rf"{grammar_text}", start=start_tokens, keep_all_tokens=True)
     return parser
 
 
-_GLOBAL_PARSER = _make_parser()
+_GLOBAL_PARSER: None | Lark = None
+
+
+def get_global_parser():  # Return the shared parser, building it with _make_parser() on first use.
+    global _GLOBAL_PARSER
+    if _GLOBAL_PARSER is None:  # not built yet: construct once and cache in the module global
+        _GLOBAL_PARSER = _make_parser()
+
+    return _GLOBAL_PARSER
diff --git a/src/gbigsmiles/system.py b/src/gbigsmiles/system.py
index ada2247..1211e2d 100644
--- a/src/gbigsmiles/system.py
+++ b/src/gbigsmiles/system.py
@@ -7,9 +7,10 @@
 
 import numpy as np
 
-from .core import _GLOBAL_RNG, BigSMILESbase
+from .core import BigSMILESbase
 from .mixture import Mixture
 from .molecule import Molecule
+from .util import get_global_rng
 
 
 def _estimate_system_molecular_weight(molecules, system_molweight):
@@ -154,7 +155,7 @@ def generate_string(self, extension):
         return string
 
     @property
-    def generator(self, rng=_GLOBAL_RNG):
+    def generator(self, rng=get_global_rng()):  # NOTE(review): under @property callers cannot pass rng; the default is also evaluated once at definition time
         if not self.generable:
             raise RuntimeError("Generable system required")
 
@@ -171,7 +172,7 @@ def generator(self, rng=_GLOBAL_RNG):
                 raise RuntimeError("We expect a fully generated molecule here.")
             yield mol_gen
 
-    def generate(self, prefix=None, rng=_GLOBAL_RNG):
+    def generate(self, prefix=None, rng=get_global_rng()):  # NOTE(review): the default is evaluated once at definition time, so the global RNG is created when this module is imported
 
         relative_fractions = [mol.mixture.relative_mass for mol in self._molecules]
         mol_idx = rng.choice(
diff --git a/src/gbigsmiles/transformer.py b/src/gbigsmiles/transformer.py
index 316bd12..8291868 100644
--- a/src/gbigsmiles/transformer.py
+++ b/src/gbigsmiles/transformer.py
@@ -2,111 +2,10 @@
 # Copyright (c) 2022: Ludwig Schneider
 # See LICENSE for details
 import lark
+from lark.visitors import Discard
 
 
 class GBigSMILESTransformer(lark.Transformer):
-    def atom(self, children):
-        from .atom import Atom
-
-        a = Atom(children)
-        return a
-
-    def bracket_atom(self, children):
-        from .atom import BracketAtom
-
-        return BracketAtom(children)
-
-    def chiral(self, children):
-        from .atom import Chiral
-
-        return Chiral(children)
-
-    def h_count(self, children):
-        from .atom import HCount
-
-        return HCount(children)
-
-    def atom_charge(self, children):
-        from .atom import AtomCharge
-
-        return AtomCharge(children)
-
-    def atom_class(self, children):
-        from .atom import AtomClass
-
-        return AtomClass(children)
-
-    def isotope(self, children):
-        from .atom import Isotope
-
-        return Isotope(children)
-
-    def atom_symbol(self, children):
-        from .atom import AtomSymbol
-
-        return AtomSymbol(children)
-
-    def aromatic_symbol(self, children):
-        from .atom import AromaticSymbol
-
-        return AromaticSymbol(children)
-
-    def aliphatic_organic(self, children):
-        from .atom import AliphaticOrganic
-
-        return AliphaticOrganic(children)
-
-    def aromatic_organic(self, children):
-        from .atom import AromaticOrganic
-
-        return AromaticOrganic(children)
-
-    def bond_symbol(self, children):
-        from .bond import BondSymbol
-
-        return BondSymbol(children)
-
-    def ring_bond(self, children):
-        from .bond import RingBond
-
-        return RingBond(children)
-
-    def bond_descriptor_symbol(self, children):
-        from .bond import BondDescriptorSymbol
-
-        return BondDescriptorSymbol(children)
-
-    def bond_descriptor_symbol_idx(self, children):
-        from .bond import BondDescriptorSymbolIdx
-
-        return BondDescriptorSymbolIdx(children)
-
-    def terminal_bond_descriptor(self, children):
-        from .bond import TerminalBondDescriptor
-
-        return TerminalBondDescriptor(children)
-
-    def simple_bond_descriptor(self, children):
-        from .bond import SimpleBondDescriptor
-
-        return SimpleBondDescriptor(children)
-
-    def inner_bond_descriptor(self, children):
-        from .bond import InnerBondDescriptor
-
-        return InnerBondDescriptor(children)
-
-    def bond_descriptor_generation(self, children):
-        from .bond import BondDescriptorGeneration
-
-        return BondDescriptorGeneration(children)
-
-    def bond_descriptor(self, children):
-        from .bond import BondDescriptor
-
-        assert isinstance(children[0], BondDescriptor)
-        return children[0]
-
     def NUMBER(self, children):
         return float(children)
 
@@ -114,13 +13,20 @@ def INT(self, children):
         return int(children)
 
     def WS_INLINE(self, children):
-        from lark.visitors import Discard
-
         return Discard
 
-    # def big_smiles_fragment_definition(self, children
 
+_GLOBAL_TRANSFORMER: None | GBigSMILESTransformer = None
+
+
+def get_global_transformer():  # Return the shared transformer, creating it on first use.
+    global _GLOBAL_TRANSFORMER
+    if _GLOBAL_TRANSFORMER is None:
+        import gbigsmiles  # imported here rather than at module top: the package __init__ imports this module
 
-#    "{[][<]N=Cc(cc1)ccc1C=N[13C@OH1H2+1:3]CC[Si](C)(C)O{[<][>][Si](C)(C)O[<][>]}[Si](C)(C)CCC[>][]}"
+        transformer = lark.ast_utils.create_transformer(  # extends the token-level transformer using the gbigsmiles package as the AST module
+            ast_module=gbigsmiles, transformer=GBigSMILESTransformer(visit_tokens=True)
+        )
+        _GLOBAL_TRANSFORMER = transformer
 
-_GLOBAL_TRANSFORMER = GBigSMILESTransformer(visit_tokens=True)
+    return _GLOBAL_TRANSFORMER
diff --git a/src/gbigsmiles/util.py b/src/gbigsmiles/util.py
index 0f3985c..b423b44 100644
--- a/src/gbigsmiles/util.py
+++ b/src/gbigsmiles/util.py
@@ -4,6 +4,10 @@
 
 import re
 
+import numpy as np
+
+_GLOBAL_RNG: None | np.random.Generator = None
+
 
 def snake_to_camel(snake_str):
     """
@@ -73,3 +77,11 @@ def camel_to_snake(name):
     # Finally, handle the case where an acronym is at the start of the string
     s3 = re.sub("([A-Z])([A-Z][a-z])", r"\1_\2", s2)
     return s3.lower()
+
+
+def get_global_rng(seed=None):  # Return the shared numpy Generator, creating it on first use.
+    global _GLOBAL_RNG
+    if _GLOBAL_RNG is None:  # `seed` only matters here; once the generator exists it is ignored
+        _GLOBAL_RNG = np.random.default_rng(seed)
+
+    return _GLOBAL_RNG
diff --git a/tests/test_core.py b/tests/test_core.py
deleted file mode 100644
index 974c927..0000000
--- a/tests/test_core.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import pytest
-
-from gbigsmiles import Atom, GBigSMILESInitNotEnoughError, GBigSMILESInitTooMuchError
-
-
-def test_core_errors():
-    Atom("C")
-    with pytest.raises(GBigSMILESInitTooMuchError):
-        try:
-            Atom("C", ["C"])
-        except Exception as exc:
-            print(exc)
-            raise exc
-
-    with pytest.raises(GBigSMILESInitNotEnoughError):
-        try:
-            Atom()
-        except Exception as exc:
-            print(exc)
-            raise exc