Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[BSE-4155] Add support for the Project relational node #37

Merged
merged 45 commits into from
Nov 14, 2024
Merged
Show file tree
Hide file tree
Changes from 27 commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
cce5e4c
Added the new base files
njriasan Nov 6, 2024
19460cc
Started added abstract classes
njriasan Nov 6, 2024
89f8e42
Added the file definitions
njriasan Nov 7, 2024
2786774
Added the sqlglot import
njriasan Nov 7, 2024
4ee70be
Added a basic test [run CI]
njriasan Nov 7, 2024
66107ec
Defined tests
njriasan Nov 8, 2024
256ecef
Defined the tests cases we want to write
njriasan Nov 8, 2024
d267275
Added the project file docstring
njriasan Nov 8, 2024
cb32b89
Added the class defintion
njriasan Nov 8, 2024
b11a50f
Started adding implementation parts
njriasan Nov 8, 2024
856e20e
Added the projection defintion
njriasan Nov 8, 2024
5ed09e0
Added a single relational base class
njriasan Nov 8, 2024
178ffd1
Updated scan
njriasan Nov 8, 2024
2e7d1a5
wrote tests for the basic scan ops
njriasan Nov 8, 2024
9b88810
Fixed the first basic unit test:
njriasan Nov 8, 2024
b844130
Added equality tests
njriasan Nov 8, 2024
24b620e
Added error test
njriasan Nov 8, 2024
63aa271
Added remaining tests [run CI]
njriasan Nov 8, 2024
d32f73e
Merged with prior PR
njriasan Nov 8, 2024
c2aef51
Updated orderings check
njriasan Nov 8, 2024
c832181
added the projection changes
njriasan Nov 8, 2024
135ab5f
added a test for project equality
njriasan Nov 9, 2024
ff3b25f
Added function definitions for the remaining unit tests
njriasan Nov 9, 2024
90cd1e5
Fixed the to_string() test
njriasan Nov 9, 2024
8bd7cfc
added a can merge test
njriasan Nov 9, 2024
0737744
added a merge test
njriasan Nov 9, 2024
95c281e
Finished adding project tests [run CI]
njriasan Nov 9, 2024
ac2a6e4
Merged with prior [run CI]
njriasan Nov 11, 2024
44f709f
Merge branch 'nick/relational_scan' into nick/projection
njriasan Nov 11, 2024
ecc1600
applied most of Kian's changes, need to test column expressions
njriasan Nov 13, 2024
25a7791
Removed unnecessary column class
njriasan Nov 13, 2024
ea9255c
Added remaining tests [run CI]
njriasan Nov 13, 2024
bf20591
Added remaining tests [run CI]
njriasan Nov 13, 2024
ab2f5db
added the literal
njriasan Nov 13, 2024
32cc1ea
added the remaining tests [run CI]
njriasan Nov 13, 2024
591b67d
Back-ported changes [run CI]
njriasan Nov 13, 2024
d96e9f6
Merged with prior PR
njriasan Nov 13, 2024
ce7b9fa
Fix a typo [run CI]
njriasan Nov 13, 2024
5376da1
Merge branch 'nick/relational_scan' into nick/projection [run CI]
njriasan Nov 13, 2024
90b4e7a
Renamed files
njriasan Nov 14, 2024
cc1a2d1
Applied actual refactoring, need to update test info
njriasan Nov 14, 2024
4a01824
Added test changes [run CI]
njriasan Nov 14, 2024
d05aa76
merged with prior PR
njriasan Nov 14, 2024
11b5d27
Added test docstrings
njriasan Nov 14, 2024
0f9329d
applied remaining feedback [run CI]
njriasan Nov 14, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 58 additions & 0 deletions pydough/pydough_ast/expressions/simple_column_reference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
"""
The representation of a simplified column for use as an expression in the Relational
tree. Since the relational step is geared towards building the actual SQL query, it doesn't
need the back references to the metadata, which can complicate the ability to determine information
like equivalence between columns.
"""

__all__ = ["SimpleColumnReference"]

from pydough.types import PyDoughType

from .expression_ast import PyDoughExpressionAST


class SimpleColumnReference(PyDoughExpressionAST):
    """
    The AST node implementation class representing a simple reference to a
    column. This is used once the AST is converted to a relational tree to
    avoid creating a duplicate Relational version of every function.

    A reference is identified purely by its column name and its data type;
    both `equals` and `__hash__` are defined in terms of those two values.

    Note: We avoid using index based accesses to columns to avoid confusion
    in cases where we may have a subset of information and to avoid needing
    to reorder columns.
    """

    def __init__(self, name: str, data_type: PyDoughType):
        """
        Args:
            name (str): The name of the referenced column.
            data_type (PyDoughType): The PyDough type of the referenced column.
        """
        self._name: str = name
        self._data_type: PyDoughType = data_type

    def __hash__(self) -> int:
        # Hash exactly the fields that `equals` compares so that two
        # references considered equal also hash identically.
        # NOTE(review): presumably the base class routes `__eq__` through
        # `equals` -- confirm against PyDoughExpressionAST.
        return hash((self._name, self._data_type))

    @property
    def name(self) -> str:
        """
        The name of the column.
        """
        return self._name

    @property
    def pydough_type(self) -> PyDoughType:
        """
        The data type supplied when the reference was constructed.
        """
        return self._data_type

    @property
    def is_aggregation(self) -> bool:
        """
        Always False for a simple column reference.
        """
        return False

    def requires_enclosing_parens(self, parent: PyDoughExpressionAST) -> bool:
        """
        Always returns False; `parent` is not inspected.
        """
        return False

    def to_string(self, tree_form: bool = False) -> str:
        """
        Render the reference as `Column(<name>)`.

        Args:
            tree_form (bool): Not used by this implementation.

        Returns:
            str: The string form of the column reference.
        """
        return f"Column({self.name})"

    def equals(self, other: object) -> bool:
        """
        Determine if `other` is a SimpleColumnReference with the same type
        and the same name as this one.

        Args:
            other (object): The object to compare against.

        Returns:
            bool: True only if `other` is a SimpleColumnReference whose
            `pydough_type` and `name` both equal this reference's.
        """
        return (
            isinstance(other, SimpleColumnReference)
            and (self.pydough_type == other.pydough_type)
            and (self.name == other.name)
        )
12 changes: 12 additions & 0 deletions pydough/relational/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
"""
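The relational package, which exposes the abstract building blocks of PyDough's
relational representation: the `Relational` node base class and the `Column`
name/expression pair.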
"""

__all__ = [
"Column",
"Relational",
]
from .abstract import (
Column,
Relational,
)
208 changes: 208 additions & 0 deletions pydough/relational/abstract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
"""
This file contains the abstract base classes for the relational
representation. This roughly maps to a Relational Algebra representation
but is not exact because it needs to maintain PyDough traits that define
ordering and other properties of the relational expression.
"""

from abc import ABC, abstractmethod
from collections.abc import MutableMapping, MutableSequence
from typing import Any, NamedTuple

from sqlglot.expressions import Expression

from pydough.pydough_ast.expressions import PyDoughExpressionAST


class Relational(ABC):
    """
    The base class for any relational node. This interface defines the basic
    structure of all relational nodes in the PyDough system.
    """

    @property
    @abstractmethod
    def inputs(self) -> MutableSequence["Relational"]:
        """
        Returns any inputs to the current relational expression.

        Returns:
            MutableSequence["Relational"]: The list of inputs, each of which must
            be a relational expression.
        """

    @property
    def traits(self) -> MutableMapping[str, Any]:
        """
        Return the traits of the relational expression.
        The traits in general may have a variable schema,
        but each entry should be strongly defined. Here are
        traits that should always be available:

        - orderings: MutableSequence[PyDoughExpressionAST]

        Returns:
            MutableMapping[str, Any]: The traits of the relational expression.
        """
        return {"orderings": self.orderings}

    @property
    @abstractmethod
    def orderings(self) -> MutableSequence["PyDoughExpressionAST"]:
        """
        Returns the PyDoughExpressionAST that the relational expression is ordered by.
        Each PyDoughExpressionAST is a result computed relative to the given set of columns.

        Returns:
            MutableSequence[PyDoughExpressionAST]: The PyDoughExpressionAST that the
            relational expression is ordered by, possibly empty.
        """

    def orderings_match(
        self, other_orderings: MutableSequence["PyDoughExpressionAST"]
    ) -> bool:
        """
        Determine if two orderings match in a way that would be considered equivalent
        for the given node. Currently this only holds when both orderings are empty.

        Args:
            other_orderings (MutableSequence[PyDoughExpressionAST]): The orderings property
            of another relational node.

        Returns:
            bool: Can the two orderings be considered equivalent and therefore safely merged.
        """
        # TODO: Allow merging compatible orderings?
        return not self.orderings and not other_orderings

    @property
    @abstractmethod
    def columns(self) -> MutableSequence["Column"]:
        """
        Returns the columns of the relational expression.

        Returns:
            MutableSequence[Column]: The columns of the relational expression.
        """

    def columns_match(self, other_columns: MutableSequence["Column"]) -> bool:
        """
        Determine if two sets of columns match in a way that would be considered compatible
        for the given node. In general we currently assume that columns are indexed by name
        and any columns with the same name must be equivalent. Names that appear on
        only one side never cause a mismatch.

        Args:
            other_columns (MutableSequence[Column]): The columns property
            of another relational node.

        Returns:
            bool: Can the two columns be considered equivalent and therefore safely merged.
        """
        # If a name is repeated within one side, the last expression wins.
        first_keys = {col.name: col.expr for col in self.columns}
        second_keys = {col.name: col.expr for col in other_columns}
        for key, value in first_keys.items():
            if key in second_keys and value != second_keys[key]:
                return False
        return True

    def merge_columns(self, other_columns: MutableSequence["Column"]) -> list["Column"]:
        """
        Merge two sets of columns together, keeping the original ordering
        of self as much as possible. This eliminates any duplicates between
        the two sets of columns and assumes that if two columns have the same name
        then they must match (which is enforced by the columns_match method).

        Columns are compared as whole (name, expression) values and must be
        hashable. The columns of self are kept exactly as they are; each
        column of `other_columns` that is not already present is appended
        once, in the order it is first encountered.

        Args:
            other_columns (MutableSequence[Column]): The columns property
            of another relational node.

        Returns:
            list["Column"]: The list of merged columns keeping the original ordering
            of self as much as possible.
        """
        merged = list(self.columns)
        seen = set(merged)
        for col in other_columns:
            if col not in seen:
                # Record the column as seen so that a repeat of it later in
                # `other_columns` is not appended a second time.
                seen.add(col)
                merged.append(col)
        return merged

    @abstractmethod
    def equals(self, other: "Relational") -> bool:
        """
        Determine if two relational nodes are exactly identical,
        including column ordering.

        Args:
            other (Relational): The other relational node to compare against.

        Returns:
            bool: Are the two relational nodes equal.
        """

    def __eq__(self, other: Any) -> bool:
        """
        Compare by delegating to `equals`; anything that is not a Relational
        is never equal.

        Note: because this class defines `__eq__` without `__hash__`, Python
        sets `__hash__` to None, so nodes are unhashable unless a subclass
        defines `__hash__` itself.
        """
        if not isinstance(other, Relational):
            return False
        return self.equals(other)

    @abstractmethod
    def to_sqlglot(self) -> "Expression":
        """Translate the given relational expression
        and its children to a SQLGlot expression.

        Returns:
            Expression: A SqlGlot expression representing the relational expression.
        """

    @abstractmethod
    def to_string(self) -> str:
        """
        Convert the relational expression to a string.

        TODO: Refactor this API to include some form of string
        builder so we can draw lines between children properly.

        Returns:
            str: A string representation of the relational tree
            with this node at the root.
        """

    def __repr__(self) -> str:
        # The repr is simply the node's string form.
        return self.to_string()

    @abstractmethod
    def can_merge(self, other: "Relational") -> bool:
        """
        Determine if two relational nodes can be merged together.

        Args:
            other (Relational): The other relational node to merge with.

        Returns:
            bool: Can the two relational nodes be merged together.
        """

    @abstractmethod
    def merge(self, other: "Relational") -> "Relational":
        """
        Merge two relational nodes together to produce one output
        relational node. This requires can_merge to return True.

        Args:
            other (Relational): The other relational node to merge with.

        Returns:
            Relational: A new relational node that is the result of merging
            the two input relational nodes together and removing any redundant
            components.
        """


class Column(NamedTuple):
    """
    A column expression: the pairing of an output name with the
    expression that defines the column.
    """

    # The name the column is known by.
    name: str
    # The expression attached to that name.
    expr: "PyDoughExpressionAST"
85 changes: 85 additions & 0 deletions pydough/relational/project.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
"""
This file contains the relational implementation for a "project". This is our
relational representation for a "calc" that involves any compute steps and can include
adding or removing columns (as well as technically reordering). In general, we seek to
avoid introducing extra nodes just to reorder or prune columns, so ideally their use
should be sparse.
"""

from collections.abc import MutableSequence

from sqlglot.expressions import Expression

from pydough.pydough_ast.expressions import PyDoughExpressionAST

from .abstract import Column, Relational
from .single_relational import SingleRelational


class Project(SingleRelational):
    """
    The Project node of the relational tree, i.e. the relational form of a
    "calc". It sits on top of a single input and carries the list of output
    columns (which may add, drop, or reorder columns) together with any
    orderings.
    """

    def __init__(
        self,
        input: Relational,
        columns: MutableSequence["Column"],
        orderings: MutableSequence["PyDoughExpressionAST"] | None = None,
    ) -> None:
        """
        Args:
            input (Relational): The single input node, handed to the parent
            class constructor.
            columns (MutableSequence[Column]): The output columns; stored as given.
            orderings (MutableSequence[PyDoughExpressionAST] | None): The
            orderings; a missing or empty value is stored as a new empty list.
        """
        super().__init__(input)
        self._columns: MutableSequence[Column] = columns
        self._orderings: MutableSequence[PyDoughExpressionAST] = orderings or []

    @property
    def orderings(self) -> MutableSequence["PyDoughExpressionAST"]:
        """
        The orderings stored on this node, possibly empty.
        """
        return self._orderings

    @property
    def columns(self) -> MutableSequence["Column"]:
        """
        The output columns stored on this node.
        """
        return self._columns

    def to_sqlglot(self) -> "Expression":
        """
        Not supported yet.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError(
            "Conversion to SQLGlot Expressions is not yet implemented."
        )

    def equals(self, other: "Relational") -> bool:
        """
        Two Project nodes are equal when their inputs are equal and their
        columns and orderings compare equal, element order included.

        Args:
            other (Relational): The node to compare against.

        Returns:
            bool: False for any non-Project node, otherwise the result of
            the comparison described above.
        """
        # TODO: Do we need a fast path for caching the inputs?
        return isinstance(other, Project) and (
            self.input.equals(other.input)
            and self.columns == other.columns
            and self.orderings == other.orderings
        )

    def to_string(self) -> str:
        """
        Render this node alone (the input is not included) as a string.
        """
        # TODO: Should we visit the input?
        return f"PROJECT(columns={self.columns}, orderings={self.orderings})"

    def can_merge(self, other: Relational) -> bool:
        """
        A Project can only merge with another Project that has an equal
        input, matching orderings, and matching columns.

        Args:
            other (Relational): The candidate node to merge with.

        Returns:
            bool: Whether `merge` may be called with `other`.
        """
        if not isinstance(other, Project):
            return False
        # TODO: Can we allow inputs to ever not merge exactly?
        return (
            self.input.equals(other.input)
            and self.orderings_match(other.orderings)
            and self.columns_match(other.columns)
        )

    def merge(self, other: Relational) -> Relational:
        """
        Build a new Project over this node's input whose columns are the
        merge of both nodes' columns and whose orderings are this node's.

        Args:
            other (Relational): The node to merge with.

        Returns:
            Relational: The merged Project node.

        Raises:
            ValueError: If `can_merge(other)` is False.
        """
        if not self.can_merge(other):
            raise ValueError(
                f"Cannot merge nodes {self.to_string()} and {other.to_string()}"
            )
        # can_merge only succeeds for Project nodes; the assert narrows the type.
        assert isinstance(other, Project)
        return Project(
            self.input,
            self.merge_columns(other.columns),
            self.orderings,
        )
Loading