ScrapeGraphAI
diff --git a/‎dataset.json
Lines changed: 0 additions & 14 deletions b/‎dataset.json
Lines changed: 0 additions & 14 deletions
diff --git a/‎general.json
Lines changed: 38 additions & 0 deletions b/‎general.json
Lines changed: 38 additions & 0 deletions
diff --git a/‎nodes.json
Lines changed: 3 additions & 0 deletions b/‎nodes.json
Lines changed: 3 additions & 0 deletions
diff --git a/‎nodes/basenode.json
Lines changed: 70 additions & 0 deletions b/‎nodes/basenode.json
Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,38 @@
+[
+    {
+        "prompt": "What is scrapegraph-ai?",
+        "answer": "ScrapegraphAI is an open-source library conceived to meet the demands of a dynamic web environment and data-centric world. Its mission is to usher in a new era of scraping tools, leveraging Large Language Models (LLMs) to offer enhanced flexibility and reduced maintenance burden for developers."
+    },
+    {
+        "prompt": "What sets Scrapegraph AI apart from the competition?",
+        "answer": "ScrapegraphAI stands apart from traditional web scraping tools by harnessing the capabilities of Large Language Models (LLMs). Unlike rigid methods that rely on predefined patterns or manual adjustments, ScrapegraphAI dynamically adapts to variations in website structures. This adaptability minimizes the need for ongoing developer involvement, ensuring continued functionality even amidst changes to website layouts."
+    },
+    {
+        "prompt": "What LLM models and providers are compatible with Scrapegraph-ai?",
+        "answer": "ScrapegraphAI supports a variety of well-known LLM models and providers to analyze web pages and extract the information requested by users. These models are specified within the graph configuration dictionary and can be used interchangeably, allowing you to define different models for LLM and embeddings as needed. You can utilize various local language models with tools like Ollama or through APIs from providers such as OpenAI, Groq, Azure, Gemini, Anthropic, DeepSeek, Mistral and others."
+    },
+    {
+        "prompt": "What functionalities does Scrapegraph AI offer?",
+        "answer": "ScrapegraphAI offers various scraping modes and other advanced features, such as extracting information from local files, single web pages, and multiple web pages. Additionally, it can transform the extracted information into audio. If you use GPT-4o, it can describe images."
+    },
+    {
+        "prompt": "Can ScrapeGraphAI handle different document formats?",
+        "answer": "Yes, ScrapeGraphAI can scrape information from various document formats such as XML, HTML, JSON, and more."
+    },
+    {
+        "prompt": "How does ScrapeGraphAI handle the context window limit of LLMs?",
+        "answer": "By splitting big websites/documents into chunks with overlaps and applying compression techniques to reduce the number of tokens. If multiple chunks are present, we will have multiple answers to the user prompt, and therefore, we merge them together in the last step of the scraping pipeline."
+    },
+    {
+        "prompt": "How can I contribute to ScrapeGraphAI?",
+        "answer": "You can contribute to ScrapeGraphAI by submitting bug reports, feature requests, or pull requests on the GitHub repository."
+    },
+    {
+        "prompt": "Give me an overview of Scrapegraph-ai.",
+        "answer": "ScrapeGraphAI is an open-source Python library designed to revolutionize scraping tools. In today’s data-intensive digital landscape, this library stands out by integrating Large Language Models (LLMs) and modular graph-based pipelines to automate the scraping of data from various sources (e.g., websites, local files etc.).\nSimply specify the information you need to extract, and ScrapeGraphAI handles the rest, providing a more flexible and low-maintenance solution compared to traditional scraping tools."
+    },
+    {
+        "prompt": "Why should I use Scrapegraph-ai?",
+        "answer": "Traditional web scraping tools often rely on fixed patterns or manual configuration to extract data from web pages. ScrapegraphAI, leveraging the power of LLMs, adapts to changes in website structures, reducing the need for constant developer intervention. This flexibility ensures that scrapers remain functional even when website layouts change."
+    }
+]
@@ -0,0 +1,3 @@
+[
+
+]
@@ -0,0 +1,70 @@
+[
+    {
+        "prompt": "What is the base node in ScrapeGraphAI?",
+        "answer": "The ScrapegraphAI `BaseNode` is an abstract base class for nodes in a graph-based workflow, designed to perform specific actions when executed."
+    },
+    {
+        "prompt": "What is the purpose of the BaseNode class in ScrapeGraphAI?",
+        "answer": "The BaseNode class is an abstract base class for nodes in a graph-based workflow. It is designed to perform specific actions when executed."
+    },
+    {
+        "prompt": "What are the attributes of the BaseNode class in ScrapeGraphAI?",
+        "answer": "The attributes of the BaseNode class are `node_name`, `input`, `output`, `min_input_len`, `node_config`, and `logger`."
+    },
+    {
+        "prompt": "What is the role of the `node_name` attribute in the `BaseNode` class of ScrapeGraphAI?",
+        "answer": "The `node_name` attribute is a unique identifier name for the node in the `BaseNode` class."
+    },
+    {
+        "prompt": "What is the role of the input attribute in the BaseNode class in ScrapeGraphAI?",
+        "answer": "The `input` attribute is a boolean expression defining the input keys needed from the state in the `BaseNode` class."
+    },
+    {
+        "prompt": "What is the role of the output attribute in the BaseNode class in ScrapeGraphAI?",
+        "answer": "The `output` attribute is a list of output keys to be updated in the state in the `BaseNode` class."
+    },
+    {
+        "prompt": "What is the role of the min_input_len attribute in the BaseNode class in ScrapeGraphAI?",
+        "answer": "The `min_input_len` attribute is the minimum required number of input keys in the `BaseNode` class."
+    },
+    {
+        "prompt": "What is the role of the node_config attribute in the BaseNode class in ScrapeGraphAI?",
+        "answer": "The `node_config` attribute is an optional additional configuration for the node in the `BaseNode` class."
+    },
+    {
+        "prompt": "What is the role of the logger attribute in the BaseNode class in ScrapeGraphAI?",
+        "answer": "The `logger` attribute is the centralized root logger in the `BaseNode` class."
+    },
+    {
+        "prompt": "What are the methods of the BaseNode class in ScrapeGraphAI?",
+        "answer": "The methods of the `BaseNode` class are `__init__`, `execute`, `update_config`, `get_input_keys`, `_validate_input_keys`, and `_parse_input_keys`."
+    },
+    {
+        "prompt": "What is the purpose of the __init__ method in the BaseNode class in ScrapeGraphAI?",
+        "answer": "The `__init__` method is the constructor method for initializing the `BaseNode` class with the required attributes."
+    },
+    {
+        "prompt": "What is the purpose of the execute method in the BaseNode class in ScrapeGraphAI?",
+        "answer": "The `execute` method is an abstract method that is responsible for executing the node's logic based on the current state and updating it accordingly."
+    },
+    {
+        "prompt": "What is the purpose of the update_config method in the BaseNode class in ScrapeGraphAI?",
+        "answer": "The `update_config` method is responsible for updating the `node_config` dictionary as well as attributes with the same key in the `BaseNode` class."
+    },
+    {
+        "prompt": "What is the purpose of the get_input_keys method in the BaseNode class in ScrapeGraphAI?",
+        "answer": "The `get_input_keys` method is responsible for determining the necessary state keys based on the input specification in the `BaseNode` class."
+    },
+    {
+        "prompt": "What is the purpose of the _validate_input_keys method in the BaseNode class in ScrapeGraphAI?",
+        "answer": "The `_validate_input_keys` method is responsible for validating if the provided input keys meet the minimum length requirement in the `BaseNode` class."
+    },
+    {
+        "prompt": "What is the purpose of the _parse_input_keys method in the BaseNode class in ScrapeGraphAI?",
+        "answer": "The `_parse_input_keys` method is responsible for parsing the input keys expression to extract relevant keys from the state based on logical conditions in the `BaseNode` class."
+    },
+    {
+        "prompt": "Show me the implementation of the BaseNode in ScrapegraphAI.",
+        "answer": "Sure, here's the implementation of the `BaseNode` in ScrapeGraphAI:\n\n```python\n\"\"\" \nBaseNode Module\n\"\"\"\n\nimport re\nfrom abc import ABC, abstractmethod\nfrom typing import List, Optional\n\nfrom ..utils import get_logger\n\n\nclass BaseNode(ABC):\n    \"\"\"\n    An abstract base class for nodes in a graph-based workflow, designed to perform specific actions when executed.\n\n    Attributes:\n        node_name (str): The unique identifier name for the node.\n        input (str): Boolean expression defining the input keys needed from the state.\n        output (List[str]): List of\n        min_input_len (int): Minimum required number of input keys.\n        node_config (Optional[dict]): Additional configuration for the node.\n        logger (logging.Logger): The centralized root logger\n\n    Args:\n        node_name (str): Name for identifying the node.\n        node_type (str): Type of the node; must be 'node' or 'conditional_node'.\n        input (str): Expression defining the input keys needed from the state.\n        output (List[str]): List of output keys to be updated in the state.\n        min_input_len (int, optional): Minimum required number of input keys; defaults to 1.\n        node_config (Optional[dict], optional): Additional configuration for the node; defaults to None.\n\n    Raises:\n        ValueError: If `node_type` is not one of the allowed types.\n\n    Example:\n        >>> class MyNode(BaseNode):\n        ...     def execute(self, state):\n        ...         # Implementation of node logic here\n        ...         
return state\n        ...\n        >>> my_node = MyNode(\"ExampleNode\", \"node\", \"input_spec\", [\"output_spec\"])\n        >>> updated_state = my_node.execute({'key': 'value'})\n        {'key': 'value'}\n    \"\"\"\n\n    def __init__(\n        self,\n        node_name: str,\n        node_type: str,\n        input: str,\n        output: List[str],\n        min_input_len: int = 1,\n        node_config: Optional[dict] = None,\n    ):\n\n        self.node_name = node_name\n        self.input = input\n        self.output = output\n        self.min_input_len = min_input_len\n        self.node_config = node_config\n        self.logger = get_logger()\n\n        if node_type not in [\"node\", \"conditional_node\"]:\n            raise ValueError(\n                f\"node_type must be 'node' or 'conditional_node', got '{node_type}'\"\n            )\n        self.node_type = node_type\n\n    @abstractmethod\n    def execute(self, state: dict) -> dict:\n        \"\"\"\n        Execute the node's logic based on the current state and update it accordingly.\n\n        Args:\n            state (dict): The current state of the graph.\n\n        Returns:\n            dict: The updated state after executing the node's logic.\n        \"\"\"\n\n        pass\n\n    def update_config(self, params: dict, overwrite: bool = False):\n        \"\"\"\n        Updates the node_config dictionary as well as attributes with same key.\n\n        Args:\n            param (dict): The dictionary to update node_config with.\n            overwrite (bool): Flag indicating if the values of node_config should be overwritten if their value is not None.\n        \"\"\"\n        if self.node_config is None:\n            self.node_config = {}\n        for key, val in params.items():\n            if hasattr(self, key) and (key not in self.node_config or overwrite):\n                self.node_config[key] = val\n                setattr(self, key, val)\n\n    def get_input_keys(self, state: dict) -> 
List[str]:\n        \"\"\"\n        Determines the necessary state keys based on the input specification.\n\n        Args:\n            state (dict): The current state of the graph used to parse input keys.\n\n        Returns:\n            List[str]: A list of input keys required for node operation.\n\n        Raises:\n            ValueError: If error occurs in parsing input keys.\n        \"\"\"\n\n        try:\n            input_keys = self._parse_input_keys(state, self.input)\n            self._validate_input_keys(input_keys)\n            return input_keys\n        except ValueError as e:\n            raise ValueError(f\"Error parsing input keys for {self.node_name}: {str(e)}\")\n\n    def _validate_input_keys(self, input_keys):\n        \"\"\"\n        Validates if the provided input keys meet the minimum length requirement.\n\n        Args:\n            input_keys (List[str]): The list of input keys to validate.\n\n        Raises:\n            ValueError: If the number of input keys is less than the minimum required.\n        \"\"\"\n\n        if len(input_keys) < self.min_input_len:\n            raise ValueError(\n                f\"\"\"{self.node_name} requires at least {self.min_input_len} input keys,\n                  got {len(input_keys)}.\"\"\"\n            )\n\n    def _parse_input_keys(self, state: dict, expression: str) -> List[str]:\n        \"\"\"\n        Parses the input keys expression to extract relevant keys from the state based on logical conditions.\n        The expression can contain AND (&), OR (|), and parentheses to group conditions.\n\n        Args:\n            state (dict): The current state of the graph.\n            expression (str): The input keys expression to parse.\n\n        Returns:\n            List[str]: A list of key names that match the input keys expression logic.\n\n        Raises:\n            ValueError: If the expression is invalid or if no state keys match the expression.\n        \"\"\"\n\n        # Check for empty 
expression\n        if not expression:\n            raise ValueError(\"Empty expression.\")\n\n        # Check for adjacent state keys without an operator between them\n        pattern = (\n            r\"\\b(\"\n            + \"|\".join(re.escape(key) for key in state.keys())\n            + r\")(\\b\\s*\\b)(\"\n            + \"|\".join(re.escape(key) for key in state.keys())\n            + r\")\\b\"\n        )\n        if re.search(pattern, expression):\n            raise ValueError(\n                \"Adjacent state keys found without an operator between them.\"\n            )\n\n        # Remove spaces\n        expression = expression.replace(\" \", \"\")\n\n        # Check for operators with empty adjacent tokens or at the start/end\n        if (\n            expression[0] in \"&|\"\n            or expression[-1] in \"&|\"\n            or \"&&\" in expression\n            or \"||\" in expression\n            or \"&|\" in expression\n            or \"|&\" in expression\n        ):\n            raise ValueError(\"Invalid operator usage.\")\n\n        # Check for balanced parentheses and valid operator placement\n        open_parentheses = close_parentheses = 0\n        for i, char in enumerate(expression):\n            if char == \"(\":\n                open_parentheses += 1\n            elif char == \")\":\n                close_parentheses += 1\n            # Check for invalid operator sequences\n            if char in \"&|\" and i + 1 < len(expression) and expression[i + 1] in \"&|\":\n                raise ValueError(\n                    \"Invalid operator placement: operators cannot be adjacent.\"\n                )\n\n        # Check for missing or balanced parentheses\n        if open_parentheses != close_parentheses:\n            raise ValueError(\"Missing or unbalanced parentheses in expression.\")\n\n        # Helper function to evaluate an expression without parentheses\n        def evaluate_simple_expression(exp: str) -> List[str]:\n            
\"\"\"Evaluate an expression without parentheses.\"\"\"\n\n            # Split the expression by the OR operator and process each segment\n            for or_segment in exp.split(\"|\"):\n\n                # Check if all elements in an AND segment are in state\n                and_segment = or_segment.split(\"&\")\n                if all(elem.strip() in state for elem in and_segment):\n                    return [\n                        elem.strip() for elem in and_segment if elem.strip() in state\n                    ]\n            return []\n\n        # Helper function to evaluate expressions with parentheses\n        def evaluate_expression(expression: str) -> List[str]:\n            \"\"\"Evaluate an expression with parentheses.\"\"\"\n\n            while \"(\" in expression:\n                start = expression.rfind(\"(\")\n                end = expression.find(\")\", start)\n                sub_exp = expression[start + 1 : end]\n\n                # Replace the evaluated part with a placeholder and then evaluate it\n                sub_result = evaluate_simple_expression(sub_exp)\n\n                # For simplicity in handling, join sub-results with OR to reprocess them later\n                expression = (\n                    expression[:start] + \"|\".join(sub_result) + expression[end + 1 :]\n                )\n            return evaluate_simple_expression(expression)\n\n        result = evaluate_expression(expression)\n\n        if not result:\n            raise ValueError(\"No state keys matched the expression.\")\n\n        # Remove redundant state keys from the result, without changing their order\n        final_result = []\n        for key in result:\n            if key not in final_result:\n                final_result.append(key)\n\n        return final_result\n```"
+    }
+]