@@ -126,9 +126,7 @@ def smart_split_content(text: str, chunk_size: int, overlap: int) -> List[str]:
126126 best_break_rel = idx + len (pattern )
127127 break
128128
129- effective_end = (
130- search_start + best_break_rel if best_break_rel != - 1 else end
131- )
129+ effective_end = search_start + best_break_rel if best_break_rel != - 1 else end
132130
133131 # Ensure we always move forward at least by 10% of chunk size or at least 'overlap'
134132 # to avoid infinite loops if overlap is too large
@@ -215,9 +213,7 @@ def get_content_score(res: Dict[str, Any], query: str = "") -> int:
215213 "about" ,
216214 }
217215 query_words = {
218- w
219- for w in re .findall (r"\w{3,}" , query .lower ())
220- if w not in stop_words
216+ w for w in re .findall (r"\w{3,}" , query .lower ()) if w not in stop_words
221217 }
222218
223219 if query_words :
@@ -295,10 +291,7 @@ def get_prompt_with_truncation(
295291 # Calculate how many chars to KEEP (not how many to remove)
296292 keep_chars = max (len (content ) - truncate_last_num_chars , 2000 )
297293 if keep_chars < len (content ):
298- content = (
299- content [:keep_chars ]
300- + "\n [...truncated due to length limits]"
301- )
294+ content = content [:keep_chars ] + "\n [...truncated due to length limits]"
302295 return EXTRACT_INFO_PROMPT .format (info_to_extract , content )
303296
304297
@@ -555,16 +548,13 @@ async def scrape_url_with_firecrawl(
555548 "error" : "" ,
556549 "char_count" : total_char_count ,
557550 "line_count" : total_line_count ,
558- "all_content_displayed" : total_char_count
559- <= max_chars ,
551+ "all_content_displayed" : total_char_count <= max_chars ,
560552 "last_char_line" : displayed_content .count ("\n " ) + 1
561553 if displayed_content
562554 else 0 ,
563555 }
564556 else :
565- error_msg = res_data .get (
566- "error" , "Unknown Firecrawl error"
567- )
557+ error_msg = res_data .get ("error" , "Unknown Firecrawl error" )
568558 if attempt < len (retry_delays ):
569559 await asyncio .sleep (delay )
570560 continue
@@ -655,18 +645,14 @@ async def scrape_url_with_playwright(
655645 content_type = response .headers .get ("content-type" , "" ).lower ()
656646 content = ""
657647
658- if (
659- "application/pdf" in content_type
660- or url .lower ().endswith (".pdf" )
661- ):
648+ if "application/pdf" in content_type or url .lower ().endswith (".pdf" ):
662649 pdf_bytes = await response .body ()
663650 if PdfReader :
664651 with io .BytesIO (pdf_bytes ) as f :
665652 reader = PdfReader (f )
666653 pages_to_read = min (len (reader .pages ), 50 )
667654 content = "\n " .join (
668- reader .pages [i ].extract_text ()
669- for i in range (pages_to_read )
655+ reader .pages [i ].extract_text () for i in range (pages_to_read )
670656 )
671657 else :
672658 content = "PDF detected but pypdf is not installed."
@@ -837,12 +823,12 @@ async def call_robust_llm(
837823 logger .warning (
838824 f"LLM: Context limit hit (attempt { attempt + 1 } ). Retrying with gradient truncation..."
839825 )
840- payload ["messages" ][0 ][
841- "content"
842- ] = get_prompt_with_truncation (
843- info_for_truncation ,
844- original_content ,
845- truncate_last_num_chars = 40960 * ( attempt + 1 ),
826+ payload ["messages" ][0 ]["content" ] = (
827+ get_prompt_with_truncation (
828+ info_for_truncation ,
829+ original_content ,
830+ truncate_last_num_chars = 40960 * ( attempt + 1 ) ,
831+ )
846832 )
847833 await asyncio .sleep (delay )
848834 continue
@@ -866,9 +852,7 @@ async def call_robust_llm(
866852 return {
867853 "success" : True ,
868854 "extracted_info" : output ,
869- "tokens_used" : data .get ("usage" , {}).get (
870- "total_tokens" , 0
871- ),
855+ "tokens_used" : data .get ("usage" , {}).get ("total_tokens" , 0 ),
872856 }
873857
874858 except Exception as e :
@@ -951,19 +935,15 @@ async def scrape_and_extract_info(
951935
952936 # Tier 2: Firecrawl (First fallback, highly reliable)
953937 if best_score < 5 :
954- logger .info (
955- f"Jina quality low (Score: { best_score } ). Trying Firecrawl..."
956- )
938+ logger .info (f"Jina quality low (Score: { best_score } ). Trying Firecrawl..." )
957939 fc_res = await scrape_url_with_firecrawl (url )
958940 fc_score = get_content_score (fc_res , info_to_extract )
959941 if fc_score > best_score :
960942 best_res , best_method , best_score = fc_res , "Firecrawl" , fc_score
961943
962944 # Tier 3: Playwright
963945 if best_score < 5 :
964- logger .info (
965- f"Current quality low (Score: { best_score } ). Trying Playwright..."
966- )
946+ logger .info (f"Current quality low (Score: { best_score } ). Trying Playwright..." )
967947 pw_res = await scrape_url_with_playwright (url )
968948 pw_score = get_content_score (pw_res , info_to_extract )
969949 if pw_score > best_score :
@@ -983,8 +963,10 @@ async def scrape_and_extract_info(
983963 py_score ,
984964 )
985965
986- if not best_res or not best_res .get ("success" ) or (
987- best_score < 1 and best_res .get ("char_count" , 0 ) < 50
966+ if (
967+ not best_res
968+ or not best_res .get ("success" )
969+ or (best_score < 1 and best_res .get ("char_count" , 0 ) < 50 )
988970 ):
989971 return json .dumps (
990972 {
@@ -1031,9 +1013,7 @@ async def scrape_and_extract_info(
10311013
10321014 async def sem_call_robust_llm (chunk_text ):
10331015 async with semaphore :
1034- chunk_prompt = EXTRACT_INFO_PROMPT .format (
1035- info_to_extract , chunk_text
1036- )
1016+ chunk_prompt = EXTRACT_INFO_PROMPT .format (info_to_extract , chunk_text )
10371017 return await call_robust_llm (
10381018 chunk_prompt ,
10391019 temperature = 0.2 ,
@@ -1042,9 +1022,7 @@ async def sem_call_robust_llm(chunk_text):
10421022 )
10431023
10441024 # Map Phase: Parallel extraction with concurrency control
1045- chunk_results = await asyncio .gather (
1046- * (sem_call_robust_llm (c ) for c in chunks )
1047- )
1025+ chunk_results = await asyncio .gather (* (sem_call_robust_llm (c ) for c in chunks ))
10481026
10491027 # Filter successful findings
10501028 valid_partials = []
@@ -1056,9 +1034,7 @@ async def sem_call_robust_llm(chunk_text):
10561034 valid_partials .append (text )
10571035
10581036 if not valid_partials :
1059- logger .warning (
1060- "No extracted information available from any chunk."
1061- )
1037+ logger .warning ("No extracted information available from any chunk." )
10621038 final_info = (
10631039 "The requested information was not found in the provided document."
10641040 )
@@ -1071,9 +1047,7 @@ async def sem_call_robust_llm(chunk_text):
10711047 )
10721048 formatted_fragments = ""
10731049 for i , partial in enumerate (valid_partials ):
1074- formatted_fragments += (
1075- f"--- FRAGMENT { i + 1 } ---\n { partial } \n \n "
1076- )
1050+ formatted_fragments += f"--- FRAGMENT { i + 1 } ---\n { partial } \n \n "
10771051
10781052 reduce_prompt = REDUCE_PROMPT .format (
10791053 info = info_to_extract , partials = formatted_fragments
@@ -1094,10 +1068,7 @@ async def sem_call_robust_llm(chunk_text):
10941068 verification_note = ""
10951069 if final_info :
10961070 info_lower = final_info .lower ()
1097- if (
1098- "[confidence: low" in info_lower
1099- or "requires verification" in info_lower
1100- ):
1071+ if "[confidence: low" in info_lower or "requires verification" in info_lower :
11011072 verification_note = "LOW CONFIDENCE: This information has low reliability. Strongly recommend verifying with additional independent sources before using."
11021073 elif (
11031074 "[confidence: medium" in info_lower
0 commit comments