Skip to content

Commit

Permalink
Add Context and Node Docs (#586)
Browse files Browse the repository at this point in the history
* add links and docs

* henry comment

* lint

---------

Co-authored-by: Karan Sampath <[email protected]>
  • Loading branch information
karansampath and karanataryn authored Jul 30, 2024
1 parent c80d83f commit eb172dd
Show file tree
Hide file tree
Showing 6 changed files with 45 additions and 2 deletions.
2 changes: 2 additions & 0 deletions docs/source/APIs/data_preparation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ These APIs are used for writing data preparation jobs and loading data into Syca
:maxdepth: 2

/APIs/data_preparation/docset.rst
/APIs/data_preparation/context.rst
/APIs/data_preparation/node.rst
/APIs/data_preparation/docsetreader.rst
/APIs/data_preparation/docsetwriter.rst
/APIs/data_preparation/document.rst
Expand Down
7 changes: 7 additions & 0 deletions docs/source/APIs/data_preparation/context.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
.. _Ref-Context:

Context
================

.. automodule:: sycamore.context
:members:
7 changes: 7 additions & 0 deletions docs/source/APIs/data_preparation/node.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
.. _Ref-Node:

Node
================

.. automodule:: sycamore.plan_nodes
:members:
15 changes: 15 additions & 0 deletions lib/sycamore/sycamore/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@ def _ray_logging_setup():


class Context:
"""
A Sycamore Context, which initializes a Ray worker and provides the ability
to read data into a DocSet.
"""

def __init__(self, ray_args: Optional[dict[str, Any]] = None):
if ray_args is None:
ray_args = {}
Expand All @@ -61,15 +66,25 @@ def read(self):
return DocSetReader(self)

def register_rule(self, rule: Rule) -> None:
"""
Register a Rule with this Context.

Rules allow for communication with the underlying Ray context and can specify additional
performance optimizations.

The rule is appended to ``extension_rules`` while ``_internal_lock`` is held. No duplicate check is
performed here, so registering the same rule twice stores it twice.

Args:
rule: The Rule to register.
"""
with self._internal_lock:
self.extension_rules.append(rule)

def get_extension_rule(self) -> list[Rule]:
"""
Return all Rules currently registered in the Context.

The result is a copy of ``extension_rules`` taken while ``_internal_lock`` is held, so modifying the
returned list does not change the set of registered rules.

Returns:
A copy of the registered Rules.
"""
with self._internal_lock:
copied = self.extension_rules.copy()
return copied

def deregister_rule(self, rule: Rule) -> None:
"""
Remove a currently registered Rule from the Context.

The removal happens while ``_internal_lock`` is held.

Args:
rule: The Rule to remove.
"""
with self._internal_lock:
# NOTE(review): assuming extension_rules is a list, remove() deletes only the first equal entry
# and raises ValueError when the rule is not registered -- confirm callers expect that.
self.extension_rules.remove(rule)

Expand Down
4 changes: 2 additions & 2 deletions lib/sycamore/sycamore/docset.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import sys
from typing import Callable, Optional, Any, Iterable, Type

from sycamore import Context
from sycamore.context import Context
from sycamore.data import Document, Element, MetadataDocument
from sycamore.functions.tokenizer import Tokenizer
from sycamore.plan_nodes import Node, Transform
Expand All @@ -26,7 +26,7 @@

class DocSet:
"""
A DocSet, short for “documentation set,” is a distributed collection of documents bundled together for processing.
A DocSet, short for “Document Set”, is a distributed collection of documents bundled together for processing.
Sycamore provides a variety of transformations on DocSets to help customers handle unstructured data easily.
"""

Expand Down
12 changes: 12 additions & 0 deletions lib/sycamore/sycamore/plan_nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@


class Node(ABC):
"""
A Node is the abstract base unit of a Sycamore Transform, which allows DocSets to transform themselves into end
results. Sycamore processes these nodes as a directed tree, which allows transforms to be linked to each other
and then executed.
"""

def __init__(self, children: list[Optional["Node"]], **resource_args):
self.children = children
self.resource_args = resource_args
Expand All @@ -17,11 +23,17 @@ def execute(self, **kwargs) -> Dataset:
pass

def traverse_down(self, f: Callable[["Node"], "Node"]) -> "Node":
"""
Apply ``f`` to this node first, and then recursively to each of its children.

``self.children`` is rebuilt from the results of the recursive calls, so any ``None`` entries are
dropped from the list as a side effect.

Args:
f: Callable invoked with each visited node.

Returns:
This node (``self``).
"""
# NOTE(review): f is annotated as returning a Node, but its return value is discarded here.
f(self)
self.children = [c.traverse_down(f) for c in self.children if c is not None]
return self

def traverse_up(self, f: Callable[["Node"], "Node"]) -> "Node":
"""
Apply ``f`` recursively to all of this node's children first, and then to the node itself.

``self.children`` is rebuilt from the results of the recursive calls, so any ``None`` entries are
dropped from the list as a side effect.

Args:
f: Callable invoked with each visited node.

Returns:
This node (``self``).
"""
self.children = [c.traverse_up(f) for c in self.children if c is not None]
# NOTE(review): f is annotated as returning a Node, but its return value is discarded here.
f(self)
return self
Expand Down

0 comments on commit eb172dd

Please sign in to comment.