Skip to content
This repository has been archived by the owner on Mar 10, 2024. It is now read-only.

Commit

Permalink
add requested features.
Browse files Browse the repository at this point in the history
  • Loading branch information
ztroop committed Apr 11, 2022
1 parent 1bc1ce4 commit 8fe4292
Show file tree
Hide file tree
Showing 6 changed files with 139 additions and 92 deletions.
34 changes: 17 additions & 17 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,31 +19,31 @@ pip install py3grok

## Getting Started

```python
from py3grok import Grok

grok = Grok()
text = 'gary is male, 25 years old and weighs 68.5 kilograms'
grok.set_pattern('%{WORD:name} is %{WORD:gender}, %{NUMBER:age} years old and weighs %{NUMBER:weight} kilograms')
print(grok.match(text))
When using this library, you will want to first create an instance of `GrokEnvironment`.
That will load the default and custom grok pattern files. Whenever you want to create a new
pattern, you can run `GrokEnvironment.create(pattern)` which returns an instance of `Grok`,
where you can simply run `Grok.match(text)`.

# {'gender': 'male', 'age': '25', 'name': 'gary', 'weight': '68.5'}
```
For flexibility, you can also modify a `Grok` instance's `pattern` property if needed.

Numbers can be converted from string to `int` or `float` if you use `%{pattern:name:type}` syntax, such as `%{NUMBER:age:int}`
### Code Example

```python
from py3grok import Grok
from py3grok import GrokEnvironment

grok_env = GrokEnvironment()
pattern = '%{WORD:name} is %{WORD:gender}, %{NUMBER:age} years old and weighs %{NUMBER:weight} kilograms.'

grok = Grok()
text = 'gary is male, 25 years old and weighs 68.5 kilograms'
grok.set_pattern('%{WORD:name} is %{WORD:gender}, %{NUMBER:age:int} years old and weighs %{NUMBER:weight:float} kilograms')
# Regex flags can be used, like: grok_env.create(pattern, flags=re.IGNORECASE)
grok = grok_env.create(pattern)

text = 'Gary is male, 25 years old and weighs 68.5 kilograms.'
print(grok.match(text))

# {'gender': 'male', 'age': 25, 'name': 'gary', 'weight': 68.5}
# {'gender': 'male', 'age': '25', 'name': 'Gary', 'weight': '68.5'}
```

Now `age` is of type `int` and `weight` is of type `float`.
Numbers can be converted from string to `int` or `float` if you use `%{pattern:name:type}` syntax, such as `%{NUMBER:age:int}`.

See all available patterns [here](./py3grok/patterns)!

Expand All @@ -55,4 +55,4 @@ Grok is a simple software that allows you to easily parse strings, logs and othe

I recommend having a look at [logstash filter grok](https://www.elastic.co/guide/en/logstash/current/plugins-filters-grok.html); it explains how Grok works.

Pattern files come from [logstash filter grok's pattern files](https://github.com/logstash-plugins/logstash-patterns-core/tree/master/patterns)
Pattern files come from [logstash filter grok's pattern files](https://github.com/logstash-plugins/logstash-patterns-core/tree/master/patterns).
2 changes: 1 addition & 1 deletion py3grok/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from .py3grok import Grok, GrokPattern # noqa: F401
from .py3grok import Grok, GrokPattern, GrokEnvironment # noqa: F401
122 changes: 78 additions & 44 deletions py3grok/py3grok.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,20 +28,14 @@ def __hash__(self) -> int:


class Grok:
def __init__(self, custom_dirs: List[str] = None, full_match: bool = True) -> None:
self.available_patterns: dict = {}
self.regex_obj: Optional[regex.Pattern] = None
self.full_match = full_match
def __init__(self, pattern: str, available_patterns: dict = None, **kwargs) -> None:
self.available_patterns: dict = available_patterns if available_patterns else {}
self.compiled_pattern: Optional[regex.Pattern] = None
self._pattern: str = ""
self._type_mapper: dict = {}

if custom_dirs:
DEFAULT_PATTERNS_DIR.extend(custom_dirs)

for directory in DEFAULT_PATTERNS_DIR:
for f in os.listdir(directory):
patterns = self.load_patterns_from_file(os.path.join(directory, f))
self.available_patterns.update(patterns)
self._extra_args = kwargs
# Set and compile given pattern.
self.pattern = pattern

def __str__(self) -> str:
return f"Grok ({self.pattern})"
Expand All @@ -54,50 +48,33 @@ def __eq__(self, __o: object) -> bool:
return False
return (self.pattern, frozenset(self.available_patterns.items())) == (
__o.pattern,
frozenset(self.available_patterns.items()),
frozenset(__o.available_patterns.items()),
)

def __hash__(self) -> int:
return hash(frozenset(self.available_patterns.items()))

@property
def pattern(self) -> str:
"""Returns the current pattern used in this instance of ``Grok``."""
return self._pattern

@pattern.setter
def pattern(self, pattern) -> None:
self._pattern = pattern
self._regex_compile()
self._compile_pattern()

def set_pattern(self, pattern) -> None:
"""
Convienence function that sets the pattern. This is equivalent to calling
``grok.pattern = pattern``. It's preferrable to set a new pattern instead
of re-instantiating a new Grok object.
Convienence function that changes the pattern. This is equivalent to
calling ``grok.pattern = pattern``.
"""
self.pattern = pattern

@staticmethod
def load_patterns_from_file(file: str) -> dict:
"""
Load patterns from a given file. Instiates each line as an individual
``GrokPattern`` object that's accessible by ``self.available_patterns``.
"""
patterns = {}

with open(file, "r", encoding="utf-8") as f:
lines = filter(lambda l: (l.strip() != "" and l[0] != "#"), f.readlines())
for l in lines:
sep = l.find(" ")
name = l[:sep]
patterns[name] = GrokPattern(name, l[sep:].strip())

return patterns

def _regex_compile(self) -> None:
def _compile_pattern(self) -> None:
"""
Private function that compiles specified pattern into a ``Regex.Pattern``
which is accessible by ``self.regex_obj`` after executing this function.
which is accessible by ``self.compiled_pattern`` after executing this function.
"""
self._type_mapper = {}
pattern = copy(self.pattern)
Expand All @@ -108,13 +85,13 @@ def _regex_compile(self) -> None:
self._type_mapper[match[1]] = match[2]

# Replace %{pattern_name:custom_name} (or %{pattern_name:custom_name:type}
# with regex pattern and group name
# with regex pattern and group name.
pattern = regex.sub(
r"%{(\w+):(\w+)(?::\w+)?}",
lambda m: f"(?P<{m.group(2)}>{self.available_patterns[m.group(1)].regex_str})",
pattern,
)
# Replace %{pattern_name} with regex pattern
# Replace %{pattern_name} with regex pattern.
pattern = regex.sub(
r"%{(\w+)}",
lambda m: f"({self.available_patterns[m.group(1)].regex_str})",
Expand All @@ -123,22 +100,22 @@ def _regex_compile(self) -> None:
if regex.search(r"%{\w+(:\w+)?}", pattern) is None:
break

self.regex_obj = regex.compile(pattern)
self.compiled_pattern = regex.compile(pattern, **self._extra_args)

def match(self, text: str) -> Optional[Dict[str, Any]]:
def match(self, text: str, full_match: bool = True) -> Optional[Dict[str, Any]]:
"""
If text is matched with pattern, return variable names specified
(%{pattern:variable name}) in pattern and their corresponding values.
If not matched, return None.
"""
if not self.regex_obj:
if not self.compiled_pattern:
return None

match_object: Optional[regex.Match] = None
if self.full_match:
match_object = self.regex_obj.fullmatch(text)
if full_match:
match_object = self.compiled_pattern.fullmatch(text)
else:
match_object = self.regex_obj.search(text)
match_object = self.compiled_pattern.search(text)

if match_object is None:
return None
Expand All @@ -154,3 +131,60 @@ def match(self, text: str) -> Optional[Dict[str, Any]]:
pass

return matches


class GrokEnvironment:
    """
    Factory class that loads grok pattern files and creates new instances of
    ``Grok`` for pattern matching.

    You will only need **one** instance of this class; use
    ``GrokEnvironment.create()`` to create ``Grok`` instances that share the
    loaded patterns. Custom directories can be used if you want to load your
    own grok pattern files.
    """

    def __init__(self, custom_dirs: Optional[List[str]] = None) -> None:
        """
        Load every pattern file found in the default pattern directories,
        followed by those in ``custom_dirs``.

        Directories are read in order, so a pattern defined in a custom
        directory replaces a default pattern with the same name.

        :param custom_dirs: Optional extra directories containing grok
            pattern files.
        """
        # Remember what the caller asked for (a copy, so later changes to the
        # caller's list do not affect this instance).
        self.custom_dirs = list(custom_dirs) if custom_dirs else None
        self.available_patterns = {}

        # Build a per-instance directory list. Extending the module-level
        # DEFAULT_PATTERNS_DIR in place would leak custom directories into
        # every environment created afterwards.
        directories = list(DEFAULT_PATTERNS_DIR)
        if custom_dirs:
            directories.extend(custom_dirs)

        for directory in directories:
            for f in os.listdir(directory):
                patterns = self.load_patterns_from_file(os.path.join(directory, f))
                self.available_patterns.update(patterns)

    def __eq__(self, __o: object) -> bool:
        """Two environments are equal when they hold the same patterns."""
        if not isinstance(__o, GrokEnvironment):
            return False
        # Compare frozenset with frozenset. Wrapping one side in a tuple
        # (a stray trailing comma) makes the comparison always False.
        return frozenset(self.available_patterns.items()) == frozenset(
            __o.available_patterns.items()
        )

    def __hash__(self) -> int:
        """Hash on the loaded patterns, consistent with ``__eq__``."""
        return hash(frozenset(self.available_patterns.items()))

    def create(self, pattern: str, **kwargs) -> "Grok":
        """
        Create a new instance of ``Grok`` for pattern matching. You can also pass
        ``flags=re.IGNORECASE`` or other flags for regex configuration if needed.

        :param pattern: The grok pattern the new ``Grok`` instance should use.
        :param kwargs: Extra keyword arguments forwarded to ``Grok``.
        :return: A ``Grok`` instance sharing this environment's patterns.
        """
        return Grok(pattern, available_patterns=self.available_patterns, **kwargs)

    @staticmethod
    def load_patterns_from_file(file: str) -> dict:
        """
        Load patterns from a given file. Instantiates each line as an individual
        ``GrokPattern`` object, keyed by pattern name.

        Blank lines and lines starting with ``#`` are ignored. Each remaining
        line is split at its first space into a name and a regex body; lines
        without a space have no regex body and are skipped.

        :param file: Path of the pattern file to read (decoded as UTF-8).
        :return: Mapping of pattern name to ``GrokPattern``.
        """
        patterns = {}

        with open(file, "r", encoding="utf-8") as f:
            for line in f:
                if not line.strip() or line[0] == "#":
                    continue
                name, sep, regex_str = line.partition(" ")
                if not sep:
                    # No separator: slicing on find() == -1 would silently
                    # chop the last character off the name instead.
                    continue
                patterns[name] = GrokPattern(name, regex_str.strip())

        return patterns
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "py3grok"
version = "0.1.0"
version = "0.2.0"
description = "Parse strings and extract information from structured or unstructured data."
authors = ["Zackary Troop <[email protected]>"]
license = "MIT, Apache 2.0"
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
setup(
author="Zackary Troop",
name="py3grok",
version="0.1.0",
version="0.2.0",
url="https://github.com/ztroop/py3grok",
license="MIT",
description="Parse strings and extract information from structured or unstructured data.",
Expand Down
69 changes: 41 additions & 28 deletions tests/test_py3grok.py
Original file line number Diff line number Diff line change
@@ -1,50 +1,63 @@
from unittest import TestCase
from py3grok import Grok
from py3grok.py3grok import GrokPattern
from py3grok import Grok, GrokEnvironment, GrokPattern
from re import IGNORECASE


class TestGrok(TestCase):
def setUp(self) -> None:
return super().setUp()
self.grok_env = GrokEnvironment()

def test_grok(self):
grok = Grok()

text = "gary is male, 25 years old and weighs 68.5 kilograms"
pattern = "%{WORD:name} is %{WORD:gender}, %{NUMBER:age:int} years old and weighs %{NUMBER:weight:float} kilograms"

grok.pattern = pattern
text = "gary is male, 25 years old."
pattern = "%{WORD:name} is %{WORD:gender}, %{NUMBER:age:int} years old."
grok = self.grok_env.create(pattern)
result = grok.match(text)
expected_result = {"name": "gary", "gender": "male", "age": 25, "weight": 68.5}
expected_result = {"age": 25, "gender": "male", "name": "gary"}

self.assertEqual(result, expected_result)
self.assertEqual(len(grok.available_patterns), 361)

grok.set_pattern(
"%{WORD:name} is %{WORD:gender} and %{NUMBER:age:int} years old"
)
result = grok.match("allie is female and 32 years old")
expected_result = {"age": 32, "gender": "female", "name": "allie"}
def test_grok_modify_pattern(self):
text = "gary is male, 25 years old."
pattern = "%{WORD:name} is %{WORD:gender}, %{NUMBER:age:int} years old."
grok = self.grok_env.create(pattern)
grok.set_pattern("something else")

self.assertEqual(grok.pattern, "something else")

def test_grok_ignore_case(self):
text = "gary is male, 25 years old."
pattern = "%{WORD:name} IS %{WORD:gender}, %{NUMBER:age:int} YEARS OLD."
grok = self.grok_env.create(pattern, flags=IGNORECASE)
result = grok.match(text)
expected_result = {"age": 25, "gender": "male", "name": "gary"}

self.assertEqual(result, expected_result)

def test_grok_available_patterns(self):
grok = Grok()
def test_magic_methods(self):
grok1 = Grok("test pattern")
grok2 = Grok("test pattern")
grok3 = Grok("another pattern")

self.assertEqual(len(grok.available_patterns), 361)
grokpattern1 = GrokPattern("TEST", "regex")
grokpattern2 = GrokPattern("TEST", "regex")
grokpattern3 = GrokPattern("OTHER", "diff")

def test_grok__eq__comparison(self):
grok1 = Grok()
grok2 = Grok()
grokpattern1 = GrokPattern("test_name", "regex")
grokpattern2 = GrokPattern("test_name", "regex")
grokpattern3 = GrokPattern("other_name", "diff")
grok_set = {grok1, grok2, grok3}
grok_pattern_set = {grokpattern1, grokpattern2, grokpattern3}

self.assertEqual(grok1, grok2)
self.assertNotEqual(grok1, grokpattern1)

grok1.set_pattern("%{WORD:name}")
self.assertNotEqual(grok1, grok2)
self.assertNotEqual(grok1, grok3)

self.assertEqual(grokpattern1, grokpattern2)
self.assertNotEqual(grokpattern1, grokpattern3)
self.assertNotEqual(grokpattern1, grok1)

self.assertIsInstance(grok_set, set)
self.assertIsInstance(grok_pattern_set, set)

self.assertEqual(str(grok1), "Grok (test pattern)")
self.assertEqual(repr(grok1), "test pattern")

self.assertEqual(str(grokpattern1), "GrokPattern (TEST, regex)")
self.assertEqual(repr(grokpattern1), "TEST")

0 comments on commit 8fe4292

Please sign in to comment.