simplify AST building

InnocentBug · Dec 17, 2024 · 41f729e · 41f729e
1 parent 91c0466
commit 41f729e
Show file tree

Hide file tree

Showing 11 changed files with 171 additions and 219 deletions.
diff --git a/src/gbigsmiles/__init__.py b/src/gbigsmiles/__init__.py
@@ -15,49 +15,79 @@
         "Please make sure to install this module correctly via setuptools with setuptools_scm activated to generate a `_version.py` file."
     ) from exc
 
-from .atom import Atom
-from .bond import BondDescriptor
-from .core import _GLOBAL_RNG, BigSMILESbase, reaction_graph_to_dot_string
-from .distribution import Distribution, FlorySchulz, Gauss
-from .exception import (
-    GBigSMILESError,
-    GBigSMILESInitNotEnoughError,
-    GBigSMILESInitTooMuchError,
-    GBigSMILESParsingError,
+from .atom import (
+    AliphaticOrganic,
+    AromaticOrganic,
+    AromaticSymbol,
+    Atom,
+    AtomCharge,
+    AtomClass,
+    AtomSymbol,
+    BracketAtom,
+    Chiral,
+    HCount,
+    Isotope,
 )
-from .graph_generate import AtomGraph
-from .mixture import Mixture
-from .mol_prob import get_ensemble_prob
-from .molecule import Molecule
-from .stochastic import Stochastic
-from .system import System
-from .token import SmilesToken
-from .transformer import GBigSMILESTransformer
-from .util import camel_to_snake, snake_to_camel
+from .bond import (
+    BondDescriptor,
+    BondDescriptorGeneration,
+    BondDescriptorSymbol,
+    BondDescriptorSymbolIdx,
+    BondSymbol,
+    InnerBondDescriptor,
+    RingBond,
+    SimpleBondDescriptor,
+    TerminalBondDescriptor,
+)
+from .core import BigSMILESbase
+from .distribution import (
+    FlorySchulz,
+    Gauss,
+    LogNormal,
+    Poisson,
+    StochasticDistribution,
+    Uniform,
+)
+from .parser import get_global_parser
+from .transformer import GBigSMILESTransformer, get_global_transformer
+from .util import camel_to_snake, get_global_rng, snake_to_camel
+
+# from .graph_generate import AtomGraph
+# from .mixture import Mixture
+# from .mol_prob import get_ensemble_prob
+# from .molecule import Molecule
+# from .stochastic import Stochastic
+# from .system import System
+# from .token import SmilesToken
 
 __all__ = [
     "__version__",
     "version_tuple",
+    "Atom",
+    "BracketAtom",
+    "Isotope",
+    "AtomSymbol",
+    "Chiral",
+    "HCount",
+    "AtomCharge",
+    "AtomClass",
+    "AromaticSymbol",
+    "AliphaticOrganic",
+    "AromaticOrganic",
+    "BondSymbol",
+    "RingBond",
+    "BondDescriptorSymbol",
+    "BondDescriptorSymbolIdx",
+    "BondDescriptorGeneration",
+    "InnerBondDescriptor",
     "BondDescriptor",
-    "_GLOBAL_RNG",
+    "SimpleBondDescriptor",
+    "TerminalBondDescriptor",
     "BigSMILESbase",
-    "reaction_graph_to_dot_string",
-    "Distribution",
-    "FlorySchulz",
-    "Gauss",
-    "Atom",
-    "AtomGraph",
-    "Mixture",
-    "get_ensemble_prob",
-    "Molecule",
-    "Stochastic",
-    "System",
-    "SmilesToken",
-    "GBigSMILESError",
-    "GBigSMILESParsingError",
-    "GBigSMILESInitNotEnoughError",
-    "GBigSMILESInitTooMuchError",
     "camel_to_snake",
     "snake_to_camel",
+    "get_global_rng",
     "GBigSMILESTransformer",
+    "get_global_transformer",
+    "get_global_parser",
 ]
diff --git a/src/gbigsmiles/atom.py b/src/gbigsmiles/atom.py
@@ -5,7 +5,7 @@
 import lark
 
 from .core import BigSMILESbase
-from .exception import GBigSMILESParsingError, GBigSMILESTooManyTokens
+from .exception import ParsingError, TooManyTokens
 
 
 class Atom(BigSMILESbase):
@@ -33,7 +33,7 @@ def __init__(self, children: list):
         for child in self._children:
             if isinstance(child, AtomSymbol):
                 if self._symbol is not None:
-                    raise GBigSMILESTooManyTokens(self.__class__, self._symbol, child)
+                    raise TooManyTokens(self.__class__, self._symbol, child)
                 self._symbol = child
 
     def generate_string(self, extension):
@@ -294,7 +294,7 @@ def __init__(self, children: list):
         super().__init__(children)
 
         if str(self._children[0]) != "H":
-            raise GBigSMILESParsingError(self._children[0])
+            raise ParsingError(self._children[0])
 
         if len(self._children) > 1:
             self._count = int(self._children[1])

diff --git a/src/gbigsmiles/bond.py b/src/gbigsmiles/bond.py
@@ -2,8 +2,13 @@
 # Copyright (c) 2022: Ludwig Schneider
 # See LICENSE for details
 
+try:
+    from typing import Self
+except ImportError:
+    from typing_extensions import Self
 
 from .core import BigSMILESbase
+from .parser import get_global_parser
 
 
 def _create_compatible_bond_text(bond):
@@ -309,6 +314,12 @@ def transition(self):
 
 
 class BondDescriptor(BigSMILESbase):
+    @classmethod
+    def make(cls, text: str) -> Self:
+        if "$" in text or "<" in text or ">" in text:
+            return SimpleBondDescriptor.make(text)
+        return TerminalBondDescriptor.make(text)
+
     @property
     def symbol(self):
         return None
@@ -334,6 +345,13 @@ def __init__(self, children):
             if isinstance(child, InnerBondDescriptor):
                 self._inner_bond_descriptor = child
 
+    @classmethod
+    def make(cls, text: str) -> Self:
+        # We use BigSMILESbase.make.__func__ to get the underlying function of the class method,
+        # then call it with cls as the first argument to ensure child typing.
+        # We do not want to call BondDescriptor's make function, because it dispatches back here.
+        return BigSMILESbase.make.__func__(cls, text)
+
     def generate_string(self, extension):
         return "[" + self._inner_bond_descriptor.generate_string(extension) + "]"
 
@@ -369,6 +387,13 @@ def __init__(self, children):
             if isinstance(child, BondDescriptorGeneration):
                 self._generation = child
 
+    @classmethod
+    def make(cls, text: str) -> Self:
+        # We use BigSMILESbase.make.__func__ to get the underlying function of the class method,
+        # then call it with cls as the first argument to ensure child typing.
+        # We do not want to call BondDescriptor's make function, because it dispatches back here.
+        return BigSMILESbase.make.__func__(cls, text)
+
     @property
     def weight(self):
         return self._generation.weight

diff --git a/src/gbigsmiles/data/g-bigsmiles.lark b/src/gbigsmiles/data/g-bigsmiles.lark
@@ -40,8 +40,8 @@ bond_symbol: "-"
   | "/"
   | "\\"
 
-ring_bond: bond_symbol? DIGIT
-  | bond_symbol? "%" DIGIT? DIGIT
+ring_bond: bond_symbol? INT
+  | bond_symbol? "%" INT
 
 _branched_atom: _atom_stand_in ring_bond* branch*
 
@@ -77,12 +77,18 @@ bond_descriptor: simple_bond_descriptor | ladder_bond_descriptor | non_covalent_
 terminal_bond_descriptor: "[" bond_descriptor_symbol_idx? bond_descriptor_generation? "]"
 
 stochastic_generation: "|" stochastic_distribution "|"
-stochastic_distribution: "flory_schulz(" WS_INLINE* NUMBER WS_INLINE* ")"
-  | "schulz_zimm(" WS_INLINE* NUMBER WS_INLINE* "," WS_INLINE* NUMBER WS_INLINE* ")"
-  | "gauss(" WS_INLINE* NUMBER WS_INLINE* "," WS_INLINE* NUMBER WS_INLINE* ")"
-  | "uniform(" WS_INLINE* NUMBER WS_INLINE* "," WS_INLINE* NUMBER WS_INLINE* ")"
-  | "log_normal(" WS_INLINE* NUMBER WS_INLINE* "," WS_INLINE* NUMBER WS_INLINE* ")"
-  | "poisson(" WS_INLINE* NUMBER WS_INLINE* ")"
+flory_schulz: "flory_schulz(" WS_INLINE* NUMBER WS_INLINE* ")"
+schulz_zimm: "schulz_zimm(" WS_INLINE* NUMBER WS_INLINE* "," WS_INLINE* NUMBER WS_INLINE* ")"
+gauss: "gauss(" WS_INLINE* NUMBER WS_INLINE* "," WS_INLINE* NUMBER WS_INLINE* ")"
+uniform: "uniform(" WS_INLINE* NUMBER WS_INLINE* "," WS_INLINE* NUMBER WS_INLINE* ")"
+log_normal: "log_normal(" WS_INLINE* NUMBER WS_INLINE* "," WS_INLINE* NUMBER WS_INLINE* ")"
+poisson: "poisson(" WS_INLINE* NUMBER WS_INLINE* ")"
+stochastic_distribution: flory_schulz
+  | schulz_zimm
+  | gauss
+  | uniform
+  | log_normal
+  | poisson
 
 _unary_index_operator: "!"
 _binary_index_operator: "~" | "&"
@@ -94,10 +100,10 @@ _non_covalent_key_value_pair: WS_INLINE* "," WS_INLINE* _printable_character+ "=
 _non_covalent_context: WS_INLINE* "|" WS_INLINE* _index_expression _non_covalent_key_value_pair*
 
 h_count: "H"
-  | "H" DIGIT
+  | "H" INT
 
-atom_charge: "-" DIGIT?
-  | "+" DIGIT?
+atom_charge: "-" INT?
+  | "+" INT?
   | "--"
   | "++"
 
@@ -117,7 +123,6 @@ end_group.-1: ";" WS_INLINE* smiles _monomer_list*
 %import common.INT
 %import common.SIGNED_NUMBER
 %import common.NUMBER
-%import common.DIGIT
 %import common.WS
 %import common.WS_INLINE
 
@@ -152,8 +157,8 @@ chiral: "@"
   | "@SP1"
   | "@SP2"
   | "@SP3"
-  | "@TB" DIGIT? DIGIT
-  | "@OH" DIGIT? DIGIT
+  | "@TB" INT
+  | "@OH" INT
 
 _element_symbols: "H"
   | "He"

diff --git a/src/gbigsmiles/exception.py b/src/gbigsmiles/exception.py
@@ -11,7 +11,7 @@ class GBigSMILESError(Exception):
     pass
 
 
-class GBigSMILESParsingError(GBigSMILESError):
+class ParsingError(GBigSMILESError):
     """
     Parsing the Grammar went in an unanticipated manner.
     Please report bug with input string.
@@ -24,39 +24,7 @@ def __str__(self):
         return f"Unanticipated error while parsing. Please report and provide the input string. Token: {self.token} start: {self.token.start_pos}"
 
 
-class GBigSMILESInitNotEnoughError(GBigSMILESError):
-    """
-    GBigSMILES classes usually need to be initialized either via text,
-    or as part of parsing a different string.
-
-    If this isn't followed, this exception is raise.
-    Initialize the elements of G-BigSMILES with (part of) a G-BigSMILES string.
-    """
-
-    def __init__(self, class_name):
-        self.class_name = class_name
-
-    def __str__(self):
-        return f"Attempt to initialize {self.class_name} without sufficient arguments. Initialize objects of {self.class_name} by passing (part of) a G-BigSMILES string."
-
-
-class GBigSMILESInitTooMuchError(GBigSMILESError):
-    """
-    GBigSMILES classes usually need to be initialized either via text,
-    or as part of parsing a different string, but not both.
-
-    If this isn't followed, this exception is raise.
-    Initialize the elements of G-BigSMILES with (part of) a G-BigSMILES string.
-    """
-
-    def __init__(self, class_name):
-        self.class_name = class_name
-
-    def __str__(self):
-        return f"Attempt to initialize {self.class_name} with tree and text arguments. Initialize objects of {self.class_name} by passing (part of) a G-BigSMILES string."
-
-
-class GBigSMILESTooManyTokens(GBigSMILESError):
+class TooManyTokens(ParsingError):
     def __init__(self, class_name, existing_token, new_token):
         self.class_name = class_name
         self.existing_token = existing_token
@@ -67,3 +35,13 @@ def __str__(self):
         string += f"The existing token is {self.existing_token} which conflicts with the new "
         string += f"token {self.new_token}. Most likely in implementation error, please report."
         return string
+
+
+class UnknownDistribution(GBigSMILESError):
+    def __init__(self, distribution_text: str):
+        self.distribution_text = distribution_text
+
+    def __str__(self):
+        string = f"GBigSMILES a distribution with the following text {self.distribution_text} is unknown."
+        string += " Typo or not implemented distribution."
+        return string
diff --git a/src/gbigsmiles/molecule.py b/src/gbigsmiles/molecule.py
@@ -7,11 +7,12 @@
 import networkx as nx
 
 from .bond import _create_compatible_bond_text
-from .core import _GLOBAL_RNG, BigSMILESbase
+from .core import BigSMILESbase
 from .mixture import Mixture
 from .stochastic import Stochastic
 from .stochastic_atom_graph import StochasticAtomGraph
 from .token import SmilesToken
+from .util import get_global_rng
 
 
 class Molecule(BigSMILESbase):
@@ -144,7 +145,7 @@ def generate_string(self, extension):
             string += self.mixture.generate_string(extension)
         return string
 
-    def generate(self, prefix=None, rng=_GLOBAL_RNG):
+    def generate(self, prefix=None, rng=get_global_rng()):
         my_mol = prefix
         for element in self._elements:
             my_mol = element.generate(my_mol, rng)

diff --git a/src/gbigsmiles/parser.py b/src/gbigsmiles/parser.py
@@ -30,6 +30,7 @@ def _make_parser(filename=None, start_tokens=None):
             "ladder_bond_descriptor",
             "non_covalent_bond_descriptor",
             "bond_descriptor",
+            "simple_bond_descriptor",
             "terminal_bond_descriptor",
             "stochastic_generation",
             "stochastic_distribution",
@@ -41,9 +42,22 @@ def _make_parser(filename=None, start_tokens=None):
             "atom_symbol",
             "aromatic_symbol",
             "bracket_atom",
+            "flory_schulz",
+            "uniform",
+            "schulz_zimm",
+            "log_normal",
+            "gauss",
         ]
     parser = Lark(rf"{grammar_text}", start=start_tokens, keep_all_tokens=True)
     return parser
 
 
-_GLOBAL_PARSER = _make_parser()
+_GLOBAL_PARSER: None | Lark = None
+
+
+def get_global_parser():
+    global _GLOBAL_PARSER
+    if _GLOBAL_PARSER is None:
+        _GLOBAL_PARSER = _make_parser()
+
+    return _GLOBAL_PARSER