"answer": "Sure, here's the implementation for the `DeepScraperGraph` class in ScrapeGraphAI:\n\n```python\n\"\"\"\nDeepScraperGraph Module\n\"\"\"\n\nfrom typing import Optional\n\nfrom .base_graph import BaseGraph\nfrom .abstract_graph import AbstractGraph\n\nfrom ..nodes import (\n FetchNode,\n SearchLinkNode,\n ParseNode,\n RAGNode,\n GenerateAnswerNode,\n GraphIteratorNode,\n MergeAnswersNode\n)\n\n\nclass DeepScraperGraph(AbstractGraph):\n \"\"\"\n [WIP]\n\n DeepScraper is a scraping pipeline that automates the process of \n extracting information from web pages using a natural language model \n to interpret and answer prompts.\n\n Unlike SmartScraper, DeepScraper can navigate to the links within,\n the input webpage to fuflfil the task within the prompt.\n \n Attributes:\n prompt (str): The prompt for the graph.\n source (str): The source of the graph.\n config (dict): Configuration parameters for the graph.\n schema (str): The schema for the graph output.\n llm_model: An instance of a language model client, configured for generating answers.\n embedder_model: An instance of an embedding model client, \n configured for generating embeddings.\n verbose (bool): A flag indicating whether to show print statements during execution.\n headless (bool): A flag indicating whether to run the graph in headless mode.\n \n Args:\n prompt (str): The prompt for the graph.\n source (str): The source of the graph.\n config (dict): Configuration parameters for the graph.\n schema (str): The schema for the graph output.\n\n Example:\n >>> deep_scraper = DeepScraperGraph(\n ... \"List me all the job titles and detailed job description.\",\n ... \"https://www.google.com/about/careers/applications/jobs/results/?location=Bangalore%20India\",\n ... {\"llm\": {\"model\": \"gpt-3.5-turbo\"}}\n ... 
)\n >>> result = deep_scraper.run()\n \"\"\"\n\n def __init__(self, prompt: str, source: str, config: dict, schema: Optional[str] = None):\n \n super().__init__(prompt, config, source, schema)\n\n self.input_key = \"url\" if source.startswith(\"http\") else \"local_dir\"\n\n def _create_repeated_graph(self) -> BaseGraph:\n \"\"\"\n Creates the graph that can be repeatedly executed to conduct search on\n hyperlinks within the webpage.\n\n Returns:\n BaseGraph: A graph instance representing the web scraping workflow.\n \"\"\"\n fetch_node = FetchNode(\n input=\"url | local_dir\",\n output=[\"doc\", \"link_urls\", \"img_urls\"]\n )\n parse_node = ParseNode(\n input=\"doc\",\n output=[\"parsed_doc\"],\n node_config={\n \"chunk_size\": self.model_token\n }\n )\n rag_node = RAGNode(\n input=\"user_prompt & (parsed_doc | doc)\",\n output=[\"relevant_chunks\"],\n node_config={\n \"llm_model\": self.llm_model,\n \"embedder_model\": self.embedder_model\n }\n )\n generate_answer_node = GenerateAnswerNode(\n input=\"user_prompt & (relevant_chunks | parsed_doc | doc)\",\n output=[\"answer\"],\n node_config={\n \"llm_model\": self.llm_model,\n \"schema\": self.schema\n }\n )\n search_node = SearchLinkNode(\n input=\"user_prompt & relevant_chunks\",\n output=[\"relevant_links\"],\n node_config={\n \"llm_model\": self.llm_model,\n \"embedder_model\": self.embedder_model\n }\n )\n graph_iterator_node = GraphIteratorNode(\n input=\"user_prompt & relevant_links\",\n output=[\"results\"],\n node_config={\n \"graph_instance\": None,\n \"batchsize\": 1\n }\n )\n merge_answers_node = MergeAnswersNode(\n input=\"user_prompt & results\",\n output=[\"answer\"],\n node_config={\n \"llm_model\": self.llm_model,\n \"schema\": self.schema\n }\n )\n\n return BaseGraph(\n nodes=[\n fetch_node,\n parse_node,\n rag_node,\n generate_answer_node,\n search_node,\n graph_iterator_node,\n merge_answers_node\n ],\n edges=[\n (fetch_node, parse_node),\n (parse_node, rag_node),\n (rag_node, 
generate_answer_node),\n (rag_node, search_node),\n (search_node, graph_iterator_node),\n (graph_iterator_node, merge_answers_node)\n ],\n entry_point=fetch_node\n )\n\n\n\n def _create_graph(self) -> BaseGraph:\n \"\"\"\n Creates the graph of nodes representing the workflow for web scraping\n n-levels deep.\n\n Returns:\n BaseGraph: A graph instance representing the web scraping workflow.\n \"\"\"\n\n base_graph = self._create_repeated_graph()\n graph_iterator_node = list(filter(lambda x: x.node_name == \"GraphIterator\", base_graph.nodes))[0]\n # Graph iterator will repeat the same graph for multiple hyperlinks found within input webpage\n graph_iterator_node.node_config[\"graph_instance\"] = self\n return base_graph\n\n def run(self) -> str:\n \"\"\"\n Executes the scraping process and returns the answer to the prompt.\n Returns:\n str: The answer to the prompt.\n \"\"\"\n\n inputs = {\"user_prompt\": self.prompt, self.input_key: self.source}\n self.final_state, self.execution_info = self.graph.execute(inputs)\n\n return self.final_state.get(\"answer\", \"No answer found.\")\n```"
0 commit comments