From 8fe4292d6b34b3fd10d2a7d749f1a7519c693eda Mon Sep 17 00:00:00 2001 From: Zackary Troop Date: Mon, 11 Apr 2022 00:56:34 -0400 Subject: [PATCH] add requested features. --- README.md | 34 ++++++------ py3grok/__init__.py | 2 +- py3grok/py3grok.py | 122 +++++++++++++++++++++++++++--------------- pyproject.toml | 2 +- setup.py | 2 +- tests/test_py3grok.py | 69 ++++++++++++++---------- 6 files changed, 139 insertions(+), 92 deletions(-) diff --git a/README.md b/README.md index eb99b79..0a97ac1 100644 --- a/README.md +++ b/README.md @@ -19,31 +19,31 @@ pip install py3grok ## Getting Started -```python -from py3grok import Grok - -grok = Grok() -text = 'gary is male, 25 years old and weighs 68.5 kilograms' -grok.set_pattern('%{WORD:name} is %{WORD:gender}, %{NUMBER:age} years old and weighs %{NUMBER:weight} kilograms') -print(grok.match(text)) +When using this library, you will want to first create an instance of `GrokEnvironment`. +That will load the default and custom grok pattern files. Whenever you want to create a new +pattern, you can run `GrokEnvironment.create(pattern)` which returns an instance of `Grok`, +where you can simply run `Grok.match(text)`. -# {'gender': 'male', 'age': '25', 'name': 'gary', 'weight': '68.5'} -``` +For flexibility, you can also modify a `Grok` instance's `pattern` property if needed. -Numbers can be converted from string to `int` or `float` if you use `%{pattern:name:type}` syntax, such as `%{NUMBER:age:int}` +### Code Example ```python -from py3grok import Grok +from py3grok import GrokEnvironment + +grok_env = GrokEnvironment() +pattern = '%{WORD:name} is %{WORD:gender}, %{NUMBER:age} years old and weighs %{NUMBER:weight} kilograms.' 
-grok = Grok() -text = 'gary is male, 25 years old and weighs 68.5 kilograms' -grok.set_pattern('%{WORD:name} is %{WORD:gender}, %{NUMBER:age:int} years old and weighs %{NUMBER:weight:float} kilograms') +# Regex flags can be used, like: grok_env.create(pattern, flags=re.IGNORECASE) +grok = grok_env.create(pattern) + +text = 'Gary is male, 25 years old and weighs 68.5 kilograms.' print(grok.match(text)) -# {'gender': 'male', 'age': 25, 'name': 'gary', 'weight': 68.5} +# {'gender': 'male', 'age': '25', 'name': 'Gary', 'weight': '68.5'} ``` -Now `age` is of type `int` and `weight` is of type `float`. +Numbers can be converted from string to `int` or `float` if you use `%{pattern:name:type}` syntax, such as `%{NUMBER:age:int}` See all available patterns [here](./py3grok/patterns)! @@ -55,4 +55,4 @@ Grok is a simple software that allows you to easily parse strings, logs and othe I recommend you to have a look at [logstash filter grok](https://www.elastic.co/guide/en/logstash/current/plugins-filters-grok.html), it explains how Grok works. -Pattern files come from [logstash filter grok's pattern files](https://github.com/logstash-plugins/logstash-patterns-core/tree/master/patterns) +Pattern files come from [logstash filter grok's pattern files](https://github.com/logstash-plugins/logstash-patterns-core/tree/master/patterns). 
\ No newline at end of file diff --git a/py3grok/__init__.py b/py3grok/__init__.py index 009a136..5681a26 100644 --- a/py3grok/__init__.py +++ b/py3grok/__init__.py @@ -1 +1 @@ -from .py3grok import Grok, GrokPattern # noqa: F401 +from .py3grok import Grok, GrokPattern, GrokEnvironment # noqa: F401 diff --git a/py3grok/py3grok.py b/py3grok/py3grok.py index 13ccfb3..e1bf62b 100644 --- a/py3grok/py3grok.py +++ b/py3grok/py3grok.py @@ -28,20 +28,14 @@ def __hash__(self) -> int: class Grok: - def __init__(self, custom_dirs: List[str] = None, full_match: bool = True) -> None: - self.available_patterns: dict = {} - self.regex_obj: Optional[regex.Pattern] = None - self.full_match = full_match + def __init__(self, pattern: str, available_patterns: dict = None, **kwargs) -> None: + self.available_patterns: dict = available_patterns if available_patterns else {} + self.compiled_pattern: Optional[regex.Pattern] = None self._pattern: str = "" self._type_mapper: dict = {} - - if custom_dirs: - DEFAULT_PATTERNS_DIR.extend(custom_dirs) - - for directory in DEFAULT_PATTERNS_DIR: - for f in os.listdir(directory): - patterns = self.load_patterns_from_file(os.path.join(directory, f)) - self.available_patterns.update(patterns) + self._extra_args = kwargs + # Set and compile given pattern. 
+ self.pattern = pattern def __str__(self) -> str: return f"Grok ({self.pattern})" @@ -54,7 +48,7 @@ def __eq__(self, __o: object) -> bool: return False return (self.pattern, frozenset(self.available_patterns.items())) == ( __o.pattern, - frozenset(self.available_patterns.items()), + frozenset(__o.available_patterns.items()), ) def __hash__(self) -> int: @@ -62,42 +56,25 @@ def __hash__(self) -> int: @property def pattern(self) -> str: + """Returns the current pattern used in this instance of ``Grok``.""" return self._pattern @pattern.setter def pattern(self, pattern) -> None: self._pattern = pattern - self._regex_compile() + self._compile_pattern() def set_pattern(self, pattern) -> None: """ - Convienence function that sets the pattern. This is equivalent to calling - ``grok.pattern = pattern``. It's preferrable to set a new pattern instead - of re-instantiating a new Grok object. + Convenience function that changes the pattern. This is equivalent to + calling ``grok.pattern = pattern``. """ self.pattern = pattern - @staticmethod - def load_patterns_from_file(file: str) -> dict: - """ - Load patterns from a given file. Instiates each line as an individual - ``GrokPattern`` object that's accessible by ``self.available_patterns``. - """ - patterns = {} - - with open(file, "r", encoding="utf-8") as f: - lines = filter(lambda l: (l.strip() != "" and l[0] != "#"), f.readlines()) - for l in lines: - sep = l.find(" ") - name = l[:sep] - patterns[name] = GrokPattern(name, l[sep:].strip()) - - return patterns - - def _regex_compile(self) -> None: + def _compile_pattern(self) -> None: """ Private function that compiles specified pattern into a ``Regex.Pattern`` - which is accessible by ``self.regex_obj`` after executing this function. + which is accessible by ``self.compiled_pattern`` after executing this function. 
""" self._type_mapper = {} pattern = copy(self.pattern) @@ -108,13 +85,13 @@ def _regex_compile(self) -> None: self._type_mapper[match[1]] = match[2] # Replace %{pattern_name:custom_name} (or %{pattern_name:custom_name:type} - # with regex pattern and group name + # with regex pattern and group name. pattern = regex.sub( r"%{(\w+):(\w+)(?::\w+)?}", lambda m: f"(?P<{m.group(2)}>{self.available_patterns[m.group(1)].regex_str})", pattern, ) - # Replace %{pattern_name} with regex pattern + # Replace %{pattern_name} with regex pattern. pattern = regex.sub( r"%{(\w+)}", lambda m: f"({self.available_patterns[m.group(1)].regex_str})", @@ -123,22 +100,22 @@ def _regex_compile(self) -> None: if regex.search(r"%{\w+(:\w+)?}", pattern) is None: break - self.regex_obj = regex.compile(pattern) + self.compiled_pattern = regex.compile(pattern, **self._extra_args) - def match(self, text: str) -> Optional[Dict[str, Any]]: + def match(self, text: str, full_match: bool = True) -> Optional[Dict[str, Any]]: """ If text is matched with pattern, return variable names specified (%{pattern:variable name}) in pattern and their corresponding values. If not matched, return None. """ - if not self.regex_obj: + if not self.compiled_pattern: return None match_object: Optional[regex.Match] = None - if self.full_match: - match_object = self.regex_obj.fullmatch(text) + if full_match: + match_object = self.compiled_pattern.fullmatch(text) else: - match_object = self.regex_obj.search(text) + match_object = self.compiled_pattern.search(text) if match_object is None: return None @@ -154,3 +131,60 @@ def match(self, text: str) -> Optional[Dict[str, Any]]: pass return matches + + +class GrokEnvironment: + def __init__(self, custom_dirs: List[str] = None): + """ + The ``GrokEnvironment`` is a factory class that loads grok pattern files + and creates new instances of ``Grok`` for pattern matching. 
You will only + need to have **one** instance of this class and use the method, + ``GrokEnvironment.create()`` to create ``Grok`` instances. + + Custom directories can be used if you want to load your own grok + pattern files. + """ + self.custom_dirs = None + self.available_patterns = {} + + if custom_dirs: + DEFAULT_PATTERNS_DIR.extend(custom_dirs) + + for directory in DEFAULT_PATTERNS_DIR: + for f in os.listdir(directory): + patterns = self.load_patterns_from_file(os.path.join(directory, f)) + self.available_patterns.update(patterns) + + def __eq__(self, __o: object) -> bool: + if not isinstance(__o, GrokEnvironment): + return False + return (frozenset(__o.available_patterns.items())) == ( + frozenset(self.available_patterns.items()) + ) + + def __hash__(self) -> int: + return hash(frozenset(self.available_patterns.items())) + + def create(self, pattern: str, **kwargs) -> Grok: + """ + Create a new instance of ``Grok`` for pattern matching. You can also pass + ``flags=re.IGNORECASE`` or other flags for regex configuration if needed. + """ + return Grok(pattern, available_patterns=self.available_patterns, **kwargs) + + @staticmethod + def load_patterns_from_file(file: str) -> dict: + """ + Load patterns from a given file. Instantiates each line as an individual + ``GrokPattern`` object that can be accessed from ``self.available_patterns``. + """ + patterns = {} + + with open(file, "r", encoding="utf-8") as f: + lines = filter(lambda l: (l.strip() != "" and l[0] != "#"), f.readlines()) + for l in lines: + sep = l.find(" ") + name = l[:sep] + patterns[name] = GrokPattern(name, l[sep:].strip()) + + return patterns diff --git a/pyproject.toml b/pyproject.toml index 44a74b8..a018539 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "py3grok" -version = "0.1.0" +version = "0.2.0" description = "Parse strings and extract information from structured or unstructured data." 
authors = ["Zackary Troop "] license = "MIT, Apache 2.0" diff --git a/setup.py b/setup.py index 4db4d0a..a276c28 100755 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ setup( author="Zackary Troop", name="py3grok", - version="0.1.0", + version="0.2.0", url="https://github.com/ztroop/py3grok", license="MIT", description="Parse strings and extract information from structured or unstructured data.", diff --git a/tests/test_py3grok.py b/tests/test_py3grok.py index 1b49f51..273efaf 100644 --- a/tests/test_py3grok.py +++ b/tests/test_py3grok.py @@ -1,50 +1,63 @@ from unittest import TestCase -from py3grok import Grok -from py3grok.py3grok import GrokPattern +from py3grok import Grok, GrokEnvironment, GrokPattern +from re import IGNORECASE class TestGrok(TestCase): def setUp(self) -> None: - return super().setUp() + self.grok_env = GrokEnvironment() def test_grok(self): - grok = Grok() - - text = "gary is male, 25 years old and weighs 68.5 kilograms" - pattern = "%{WORD:name} is %{WORD:gender}, %{NUMBER:age:int} years old and weighs %{NUMBER:weight:float} kilograms" - - grok.pattern = pattern + text = "gary is male, 25 years old." + pattern = "%{WORD:name} is %{WORD:gender}, %{NUMBER:age:int} years old." + grok = self.grok_env.create(pattern) result = grok.match(text) - expected_result = {"name": "gary", "gender": "male", "age": 25, "weight": 68.5} + expected_result = {"age": 25, "gender": "male", "name": "gary"} self.assertEqual(result, expected_result) + self.assertEqual(len(grok.available_patterns), 361) - grok.set_pattern( - "%{WORD:name} is %{WORD:gender} and %{NUMBER:age:int} years old" - ) - result = grok.match("allie is female and 32 years old") - expected_result = {"age": 32, "gender": "female", "name": "allie"} + def test_grok_modify_pattern(self): + text = "gary is male, 25 years old." + pattern = "%{WORD:name} is %{WORD:gender}, %{NUMBER:age:int} years old." 
+ grok = self.grok_env.create(pattern) + grok.set_pattern("something else") + + self.assertEqual(grok.pattern, "something else") + + def test_grok_ignore_case(self): + text = "gary is male, 25 years old." + pattern = "%{WORD:name} IS %{WORD:gender}, %{NUMBER:age:int} YEARS OLD." + grok = self.grok_env.create(pattern, flags=IGNORECASE) + result = grok.match(text) + expected_result = {"age": 25, "gender": "male", "name": "gary"} self.assertEqual(result, expected_result) - def test_grok_available_patterns(self): - grok = Grok() + def test_magic_methods(self): + grok1 = Grok("test pattern") + grok2 = Grok("test pattern") + grok3 = Grok("another pattern") - self.assertEqual(len(grok.available_patterns), 361) + grokpattern1 = GrokPattern("TEST", "regex") + grokpattern2 = GrokPattern("TEST", "regex") + grokpattern3 = GrokPattern("OTHER", "diff") - def test_grok__eq__comparison(self): - grok1 = Grok() - grok2 = Grok() - grokpattern1 = GrokPattern("test_name", "regex") - grokpattern2 = GrokPattern("test_name", "regex") - grokpattern3 = GrokPattern("other_name", "diff") + grok_set = {grok1, grok2, grok3} + grok_pattern_set = {grokpattern1, grokpattern2, grokpattern3} self.assertEqual(grok1, grok2) - self.assertNotEqual(grok1, grokpattern1) - - grok1.set_pattern("%{WORD:name}") - self.assertNotEqual(grok1, grok2) + self.assertNotEqual(grok1, grok3) self.assertEqual(grokpattern1, grokpattern2) self.assertNotEqual(grokpattern1, grokpattern3) self.assertNotEqual(grokpattern1, grok1) + + self.assertIsInstance(grok_set, set) + self.assertIsInstance(grok_pattern_set, set) + + self.assertEqual(str(grok1), "Grok (test pattern)") + self.assertEqual(repr(grok1), "test pattern") + + self.assertEqual(str(grokpattern1), "GrokPattern (TEST, regex)") + self.assertEqual(repr(grokpattern1), "TEST")