ScrapeGraphAI
diff --git a/‎builders/graph_builder.json
+2-2 b/‎builders/graph_builder.json
+2-2
diff --git a/‎dataset.json
+1-1 b/‎dataset.json
+1-1
diff --git a/‎graphs/deep_scraper_graph.json
+1-1 b/‎graphs/deep_scraper_graph.json
+1-1
diff --git a/‎graphs/script_creator_graph.json
+1-1 b/‎graphs/script_creator_graph.json
+1-1
diff --git a/‎graphs/smart_scraper_graph.json
+1-1 b/‎graphs/smart_scraper_graph.json
+1-1
@@ -13,11 +13,11 @@
     },
     {
       "prompt": "In scrapegraph ai, what is the purpose of the `build_graph` method in the GraphBuilder class?",
-      "answer": "The `build_graph` method in the `GraphBuilder` class of ScrapeGraph-ai executes the graph creation process based on the user prompt and returns the graph configuration as a JSON representation."
+      "answer": "The `build_graph` method in the `GraphBuilder` class of scrapegraphai executes the graph creation process based on the user prompt and returns the graph configuration as a JSON representation."
     },
     {
       "prompt": "In scrapegraph ai, what is the purpose of the `convert_json_to_graphviz` method in the GraphBuilder class?",
-      "answer": "The `convert_json_to_graphviz` method in the `GraphBuilder` class of scrapegraph-ai converts a JSON graph configuration to a Graphviz object for visualization."
+      "answer": "The `convert_json_to_graphviz` method in the `GraphBuilder` class of scrapegraphai converts a JSON graph configuration to a Graphviz object for visualization."
     },
     {
       "prompt": "Show me the implementation of the GraphBuilder class in scrapegraph ai.",
 
@@ -20,7 +20,7 @@
         "answer": "The `run` method in the `DeepScraperGraph` class of scrapegraph ai executes the scraping process by providing the initial inputs to the graph and calling the graph's execute method. It then stores the final state and execution info and returns the answer to the prompt."
     },
     {
-        "prompt": "Show me the implementation for the DeepScraperGraph in scrapegraph-ai.",
+        "prompt": "Show me the implementation for the DeepScraperGraph in scrapegraphai.",
         "answer": "Sure, here's the implementation for the `DeepScraperGraph` class in ScrapeGraphAI:\n\n```python\n\"\"\"\nDeepScraperGraph Module\n\"\"\"\n\nfrom typing import Optional\n\nfrom .base_graph import BaseGraph\nfrom .abstract_graph import AbstractGraph\n\nfrom ..nodes import (\n    FetchNode,\n    SearchLinkNode,\n    ParseNode,\n    RAGNode,\n    GenerateAnswerNode,\n    GraphIteratorNode,\n    MergeAnswersNode\n)\n\n\nclass DeepScraperGraph(AbstractGraph):\n    \"\"\"\n    [WIP]\n\n    DeepScraper is a scraping pipeline that automates the process of \n    extracting information from web pages using a natural language model \n    to interpret and answer prompts.\n\n    Unlike SmartScraper, DeepScraper can navigate to the links within\n    the input webpage to fulfill the task within the prompt.\n    \n    Attributes:\n        prompt (str): The prompt for the graph.\n        source (str): The source of the graph.\n        config (dict): Configuration parameters for the graph.\n        schema (str): The schema for the graph output.\n        llm_model: An instance of a language model client, configured for generating answers.\n        embedder_model: An instance of an embedding model client, \n        configured for generating embeddings.\n        verbose (bool): A flag indicating whether to show print statements during execution.\n        headless (bool): A flag indicating whether to run the graph in headless mode.\n        \n    Args:\n        prompt (str): The prompt for the graph.\n        source (str): The source of the graph.\n        config (dict): Configuration parameters for the graph.\n        schema (str): The schema for the graph output.\n\n    Example:\n        >>> deep_scraper = DeepScraperGraph(\n        ...     \"List me all the job titles and detailed job description.\",\n        ...     \"https://www.google.com/about/careers/applications/jobs/results/?location=Bangalore%20India\",\n        ...     
{\"llm\": {\"model\": \"gpt-3.5-turbo\"}}\n        ... )\n        >>> result = deep_scraper.run()\n        )\n    \"\"\"\n\n    def __init__(self, prompt: str, source: str, config: dict, schema: Optional[str] = None):\n    \n        super().__init__(prompt, config, source, schema)\n\n        self.input_key = \"url\" if source.startswith(\"http\") else \"local_dir\"\n\n    def _create_repeated_graph(self) -> BaseGraph:\n        \"\"\"\n        Creates the graph that can be repeatedly executed to conduct search on\n        hyperlinks within the webpage.\n\n        Returns:\n            BaseGraph: A graph instance representing the web scraping workflow.\n        \"\"\"\n        fetch_node = FetchNode(\n            input=\"url | local_dir\",\n            output=[\"doc\", \"link_urls\", \"img_urls\"]\n        )\n        parse_node = ParseNode(\n            input=\"doc\",\n            output=[\"parsed_doc\"],\n            node_config={\n                \"chunk_size\": self.model_token\n            }\n        )\n        rag_node = RAGNode(\n            input=\"user_prompt & (parsed_doc | doc)\",\n            output=[\"relevant_chunks\"],\n            node_config={\n                \"llm_model\": self.llm_model,\n                \"embedder_model\": self.embedder_model\n            }\n        )\n        generate_answer_node = GenerateAnswerNode(\n            input=\"user_prompt & (relevant_chunks | parsed_doc | doc)\",\n            output=[\"answer\"],\n            node_config={\n                \"llm_model\": self.llm_model,\n                \"schema\": self.schema\n            }\n        )\n        search_node = SearchLinkNode(\n            input=\"user_prompt & relevant_chunks\",\n            output=[\"relevant_links\"],\n            node_config={\n                \"llm_model\": self.llm_model,\n                \"embedder_model\": self.embedder_model\n            }\n        )\n        graph_iterator_node = GraphIteratorNode(\n            input=\"user_prompt & 
relevant_links\",\n            output=[\"results\"],\n            node_config={\n                \"graph_instance\": None,\n                \"batchsize\": 1\n            }\n        )\n        merge_answers_node = MergeAnswersNode(\n            input=\"user_prompt & results\",\n            output=[\"answer\"],\n            node_config={\n                \"llm_model\": self.llm_model,\n                \"schema\": self.schema\n            }\n        )\n\n        return BaseGraph(\n            nodes=[\n                fetch_node,\n                parse_node,\n                rag_node,\n                generate_answer_node,\n                search_node,\n                graph_iterator_node,\n                merge_answers_node\n            ],\n            edges=[\n                (fetch_node, parse_node),\n                (parse_node, rag_node),\n                (rag_node, generate_answer_node),\n                (rag_node, search_node),\n                (search_node, graph_iterator_node),\n                (graph_iterator_node, merge_answers_node)\n            ],\n            entry_point=fetch_node\n        )\n\n\n\n    def _create_graph(self) -> BaseGraph:\n        \"\"\"\n        Creates the graph of nodes representing the workflow for web scraping\n        n-levels deep.\n\n        Returns:\n            BaseGraph: A graph instance representing the web scraping workflow.\n        \"\"\"\n\n        base_graph = self._create_repeated_graph()\n        graph_iterator_node = list(filter(lambda x: x.node_name == \"GraphIterator\", base_graph.nodes))[0]\n        # Graph iterator will repeat the same graph for multiple hyperlinks found within input webpage\n        graph_iterator_node.node_config[\"graph_instance\"] = self\n        return base_graph\n\n    def run(self) -> str:\n        \"\"\"\n        Executes the scraping process and returns the answer to the prompt.\n        Returns:\n            str: The answer to the prompt.\n        \"\"\"\n\n        inputs = 
{\"user_prompt\": self.prompt, self.input_key: self.source}\n        self.final_state, self.execution_info = self.graph.execute(inputs)\n\n        return self.final_state.get(\"answer\", \"No answer found.\")\n```"
     }
 ]
 
@@ -20,7 +20,7 @@
         "answer": "The `ScriptCreatorGraph` class in scrapegraph ai determines the `input_key` attribute based on the source. If the source starts with 'http', the `input_key` is set to 'url', and if not, it is set to 'local_dir'. This `input_key` is then used to provide the correct input to the graph's fetch node."
     },
     {
-        "prompt": "Can you show me the implementation of the ScriptCreatorGraph from Scrapegraph-AI?",
+        "prompt": "Can you show me the implementation of the ScriptCreatorGraph from scrapegraphai?",
         "answer": "Sure, here's the implementation for the `ScriptCreatorGraph` class in ScrapeGraphAI:\n\n```python\n\"\"\"\nScriptCreatorGraph Module\n\"\"\"\n\nfrom typing import Optional\n\nfrom .base_graph import BaseGraph\nfrom .abstract_graph import AbstractGraph\n\nfrom ..nodes import (\n    FetchNode,\n    ParseNode,\n    GenerateScraperNode\n)\n\n\nclass ScriptCreatorGraph(AbstractGraph):\n    \"\"\"\n    ScriptCreatorGraph defines a scraping pipeline for generating web scraping scripts.\n\n    Attributes:\n        prompt (str): The prompt for the graph.\n        source (str): The source of the graph.\n        config (dict): Configuration parameters for the graph.\n        schema (str): The schema for the graph output.\n        llm_model: An instance of a language model client, configured for generating answers.\n        embedder_model: An instance of an embedding model client, \n        configured for generating embeddings.\n        verbose (bool): A flag indicating whether to show print statements during execution.\n        headless (bool): A flag indicating whether to run the graph in headless mode.\n        model_token (int): The token limit for the language model.\n        library (str): The library used for web scraping.\n\n    Args:\n        prompt (str): The prompt for the graph.\n        source (str): The source of the graph.\n        config (dict): Configuration parameters for the graph.\n        schema (str): The schema for the graph output.\n\n    Example:\n        >>> script_creator = ScriptCreatorGraph(\n        ...     \"List me all the attractions in Chioggia.\",\n        ...     \"https://en.wikipedia.org/wiki/Chioggia\",\n        ...     {\"llm\": {\"model\": \"gpt-3.5-turbo\"}}\n        ... 
)\n        >>> result = script_creator.run()\n    \"\"\"\n\n    def __init__(self, prompt: str, source: str, config: dict, schema: Optional[str] = None):\n\n        self.library = config['library']\n\n        super().__init__(prompt, config, source, schema)\n\n        self.input_key = \"url\" if source.startswith(\"http\") else \"local_dir\"\n\n    def _create_graph(self) -> BaseGraph:\n        \"\"\"\n        Creates the graph of nodes representing the workflow for web scraping.\n\n        Returns:\n            BaseGraph: A graph instance representing the web scraping workflow.\n        \"\"\"\n\n        fetch_node = FetchNode(\n            input=\"url | local_dir\",\n            output=[\"doc\", \"link_urls\", \"img_urls\"],\n        )\n        parse_node = ParseNode(\n            input=\"doc\",\n            output=[\"parsed_doc\"],\n            node_config={\"chunk_size\": self.model_token,\n                         \"parse_html\": False\n                         }\n        )\n        generate_scraper_node = GenerateScraperNode(\n            input=\"user_prompt & (doc)\",\n            output=[\"answer\"],\n            node_config={\n                \"llm_model\": self.llm_model,\n                \"schema\": self.schema,\n            },\n            library=self.library,\n            website=self.source\n        )\n\n        return BaseGraph(\n            nodes=[\n                fetch_node,\n                parse_node,\n                generate_scraper_node,\n            ],\n            edges=[\n                (fetch_node, parse_node),\n                (parse_node, generate_scraper_node),\n            ],\n            entry_point=fetch_node\n        )\n\n    def run(self) -> str:\n        \"\"\"\n        Executes the web scraping process and returns the answer to the prompt.\n\n        Returns:\n            str: The answer to the prompt.\n        \"\"\"\n\n        inputs = {\"user_prompt\": self.prompt, self.input_key: self.source}\n        self.final_state, 
self.execution_info = self.graph.execute(inputs)\n\n        return self.final_state.get(\"answer\", \"No answer found \")\n```"
     }
 ]
@@ -20,7 +20,7 @@
         "answer": "The `SmartScraperGraph` class in scrapegraph ai determines the `input_key` attribute based on the source. If the source starts with 'http', the `input_key` is set to 'url', and if not, it is set to 'local_dir'. This `input_key` is then used to provide the correct input to the graph's fetch node."
     },
     {
-        "prompt": "Show me the implementation of the SmartScraperGraph in ScrapeGraph-AI",
+        "prompt": "Show me the implementation of the SmartScraperGraph in scrapegraphai",
         "answer": "Here's the implementation for the `SmartScraperGraph` class in ScrapeGraphAI:\n\n```python\n\"\"\"\nSmartScraperGraph Module\n\"\"\"\n\nfrom typing import Optional\n\nfrom .base_graph import BaseGraph\nfrom .abstract_graph import AbstractGraph\n\nfrom ..nodes import (\n    FetchNode,\n    ParseNode,\n    RAGNode,\n    GenerateAnswerNode\n)\n\n\nclass SmartScraperGraph(AbstractGraph):\n    \"\"\"\n    SmartScraper is a scraping pipeline that automates the process of \n    extracting information from web pages\n    using a natural language model to interpret and answer prompts.\n\n    Attributes:\n        prompt (str): The prompt for the graph.\n        source (str): The source of the graph.\n        config (dict): Configuration parameters for the graph.\n        schema (str): The schema for the graph output.\n        llm_model: An instance of a language model client, configured for generating answers.\n        embedder_model: An instance of an embedding model client, \n        configured for generating embeddings.\n        verbose (bool): A flag indicating whether to show print statements during execution.\n        headless (bool): A flag indicating whether to run the graph in headless mode.\n\n    Args:\n        prompt (str): The prompt for the graph.\n        source (str): The source of the graph.\n        config (dict): Configuration parameters for the graph.\n        schema (str): The schema for the graph output.\n\n    Example:\n        >>> smart_scraper = SmartScraperGraph(\n        ...     \"List me all the attractions in Chioggia.\",\n        ...     \"https://en.wikipedia.org/wiki/Chioggia\",\n        ...     {\"llm\": {\"model\": \"gpt-3.5-turbo\"}}\n        ... 
)\n        >>> result = smart_scraper.run()\n        )\n    \"\"\"\n\n    def __init__(self, prompt: str, source: str, config: dict, schema: Optional[str] = None):\n        super().__init__(prompt, config, source, schema)\n\n        self.input_key = \"url\" if source.startswith(\"http\") else \"local_dir\"\n\n    def _create_graph(self) -> BaseGraph:\n        \"\"\"\n        Creates the graph of nodes representing the workflow for web scraping.\n\n        Returns:\n            BaseGraph: A graph instance representing the web scraping workflow.\n        \"\"\"\n        fetch_node = FetchNode(\n            input=\"url | local_dir\",\n            output=[\"doc\", \"link_urls\", \"img_urls\"],\n            node_config={\n                \"loader_kwargs\": self.config.get(\"loader_kwargs\", {}),\n            }\n        )\n        parse_node = ParseNode(\n            input=\"doc\",\n            output=[\"parsed_doc\"],\n            node_config={\n                \"chunk_size\": self.model_token\n            }\n        )\n        rag_node = RAGNode(\n            input=\"user_prompt & (parsed_doc | doc)\",\n            output=[\"relevant_chunks\"],\n            node_config={\n                \"llm_model\": self.llm_model,\n                \"embedder_model\": self.embedder_model\n            }\n        )\n        generate_answer_node = GenerateAnswerNode(\n            input=\"user_prompt & (relevant_chunks | parsed_doc | doc)\",\n            output=[\"answer\"],\n            node_config={\n                \"llm_model\": self.llm_model,\n                \"schema\": self.schema,\n            }\n        )\n\n        return BaseGraph(\n            nodes=[\n                fetch_node,\n                parse_node,\n                rag_node,\n                generate_answer_node,\n            ],\n            edges=[\n                (fetch_node, parse_node),\n                (parse_node, rag_node),\n                (rag_node, generate_answer_node)\n            ],\n            
entry_point=fetch_node\n        )\n\n    def run(self) -> str:\n        \"\"\"\n        Executes the scraping process and returns the answer to the prompt.\n\n        Returns:\n            str: The answer to the prompt.\n        \"\"\"\n\n        inputs = {\"user_prompt\": self.prompt, self.input_key: self.source}\n        self.final_state, self.execution_info = self.graph.execute(inputs)\n\n        return self.final_state.get(\"answer\", \"No answer found.\")\n```"
     }
 ]
Original file line number	Diff line number	Diff line change
`@@ -13,11 +13,11 @@`
`13`	`13`	`},`
`14`	`14`	`{`
`15`	`15`	"prompt": "In scrapegraph ai, what is the purpose of the `build_graph` method in the GraphBuilder class?",
`16`		- "answer": "The `build_graph` method in the `GraphBuilder` class of ScrapeGraph-ai executes the graph creation process based on the user prompt and returns the graph configuration as a JSON representation."
	`16`	+ "answer": "The `build_graph` method in the `GraphBuilder` class of scrapegraphai executes the graph creation process based on the user prompt and returns the graph configuration as a JSON representation."
`17`	`17`	`},`
`18`	`18`	`{`
`19`	`19`	"prompt": "In scrapegraph ai, what is the purpose of the `convert_json_to_graphviz` method in the GraphBuilder class?",
`20`		- "answer": "The `convert_json_to_graphviz` method in the `GraphBuilder` class of scrapegraph-ai converts a JSON graph configuration to a Graphviz object for visualization."
	`20`	+ "answer": "The `convert_json_to_graphviz` method in the `GraphBuilder` class of scrapegraphai converts a JSON graph configuration to a Graphviz object for visualization."
`21`	`21`	`},`
`22`	`22`	`{`
`23`	`23`	`"prompt": "Show me the implementation of the GraphBuilder class in scrapegraph ai.",`
Original file line number	Diff line number	Diff line change
`@@ -20,7 +20,7 @@`
`20`	`20`	"answer": "The `run` method in the `DeepScraperGraph` class of scrapegraph ai executes the scraping process by providing the initial inputs to the graph and calling the graph's execute method. It then stores the final state and execution info and returns the answer to the prompt."
`21`	`21`	`},`
`22`	`22`	`{`
`23`		`- "prompt": "Show me the implementation for the DeepScraperGraph in scrapegraph-ai.",`
	`23`	`+ "prompt": "Show me the implementation for the DeepScraperGraph in scrapegraphai.",`
`24`	`24`	"answer": "Sure, here's the implementation for the `DeepScraperGraph` class in ScrapeGraphAI:\n\n```python\n\"\"\"\nDeepScraperGraph Module\n\"\"\"\n\nfrom typing import Optional\n\nfrom .base_graph import BaseGraph\nfrom .abstract_graph import AbstractGraph\n\nfrom ..nodes import (\n FetchNode,\n SearchLinkNode,\n ParseNode,\n RAGNode,\n GenerateAnswerNode,\n GraphIteratorNode,\n MergeAnswersNode\n)\n\n\nclass DeepScraperGraph(AbstractGraph):\n \"\"\"\n [WIP]\n\n DeepScraper is a scraping pipeline that automates the process of \n extracting information from web pages using a natural language model \n to interpret and answer prompts.\n\n Unlike SmartScraper, DeepScraper can navigate to the links within\n the input webpage to fulfill the task within the prompt.\n \n Attributes:\n prompt (str): The prompt for the graph.\n source (str): The source of the graph.\n config (dict): Configuration parameters for the graph.\n schema (str): The schema for the graph output.\n llm_model: An instance of a language model client, configured for generating answers.\n embedder_model: An instance of an embedding model client, \n configured for generating embeddings.\n verbose (bool): A flag indicating whether to show print statements during execution.\n headless (bool): A flag indicating whether to run the graph in headless mode.\n \n Args:\n prompt (str): The prompt for the graph.\n source (str): The source of the graph.\n config (dict): Configuration parameters for the graph.\n schema (str): The schema for the graph output.\n\n Example:\n >>> deep_scraper = DeepScraperGraph(\n ... \"List me all the job titles and detailed job description.\",\n ... \"https://www.google.com/about/careers/applications/jobs/results/?location=Bangalore%20India\",\n ... {\"llm\": {\"model\": \"gpt-3.5-turbo\"}}\n ... 
)\n >>> result = deep_scraper.run()\n )\n \"\"\"\n\n def __init__(self, prompt: str, source: str, config: dict, schema: Optional[str] = None):\n \n super().__init__(prompt, config, source, schema)\n\n self.input_key = \"url\" if source.startswith(\"http\") else \"local_dir\"\n\n def _create_repeated_graph(self) -> BaseGraph:\n \"\"\"\n Creates the graph that can be repeatedly executed to conduct search on\n hyperlinks within the webpage.\n\n Returns:\n BaseGraph: A graph instance representing the web scraping workflow.\n \"\"\"\n fetch_node = FetchNode(\n input=\"url \| local_dir\",\n output=[\"doc\", \"link_urls\", \"img_urls\"]\n )\n parse_node = ParseNode(\n input=\"doc\",\n output=[\"parsed_doc\"],\n node_config={\n \"chunk_size\": self.model_token\n }\n )\n rag_node = RAGNode(\n input=\"user_prompt & (parsed_doc \| doc)\",\n output=[\"relevant_chunks\"],\n node_config={\n \"llm_model\": self.llm_model,\n \"embedder_model\": self.embedder_model\n }\n )\n generate_answer_node = GenerateAnswerNode(\n input=\"user_prompt & (relevant_chunks \| parsed_doc \| doc)\",\n output=[\"answer\"],\n node_config={\n \"llm_model\": self.llm_model,\n \"schema\": self.schema\n }\n )\n search_node = SearchLinkNode(\n input=\"user_prompt & relevant_chunks\",\n output=[\"relevant_links\"],\n node_config={\n \"llm_model\": self.llm_model,\n \"embedder_model\": self.embedder_model\n }\n )\n graph_iterator_node = GraphIteratorNode(\n input=\"user_prompt & relevant_links\",\n output=[\"results\"],\n node_config={\n \"graph_instance\": None,\n \"batchsize\": 1\n }\n )\n merge_answers_node = MergeAnswersNode(\n input=\"user_prompt & results\",\n output=[\"answer\"],\n node_config={\n \"llm_model\": self.llm_model,\n \"schema\": self.schema\n }\n )\n\n return BaseGraph(\n nodes=[\n fetch_node,\n parse_node,\n rag_node,\n generate_answer_node,\n search_node,\n graph_iterator_node,\n merge_answers_node\n ],\n edges=[\n (fetch_node, parse_node),\n (parse_node, rag_node),\n (rag_node, 
generate_answer_node),\n (rag_node, search_node),\n (search_node, graph_iterator_node),\n (graph_iterator_node, merge_answers_node)\n ],\n entry_point=fetch_node\n )\n\n\n\n def _create_graph(self) -> BaseGraph:\n \"\"\"\n Creates the graph of nodes representing the workflow for web scraping\n n-levels deep.\n\n Returns:\n BaseGraph: A graph instance representing the web scraping workflow.\n \"\"\"\n\n base_graph = self._create_repeated_graph()\n graph_iterator_node = list(filter(lambda x: x.node_name == \"GraphIterator\", base_graph.nodes))[0]\n # Graph iterator will repeat the same graph for multiple hyperlinks found within input webpage\n graph_iterator_node.node_config[\"graph_instance\"] = self\n return base_graph\n\n def run(self) -> str:\n \"\"\"\n Executes the scraping process and returns the answer to the prompt.\n Returns:\n str: The answer to the prompt.\n \"\"\"\n\n inputs = {\"user_prompt\": self.prompt, self.input_key: self.source}\n self.final_state, self.execution_info = self.graph.execute(inputs)\n\n return self.final_state.get(\"answer\", \"No answer found.\")\n```"
`25`	`25`	`}`
`26`	`26`	`]`