@@ -218,7 +218,14 @@ try {
218
218
```
219
219
220
220
```bash cURL
-// TODO
+curl -X 'POST' \
+  'https://api.scrapegraphai.com/v1/searchscraper' \
+  -H 'accept: application/json' \
+  -H 'SGAI-APIKEY: sgai-********************' \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "user_prompt": "Search for information"
+  }'
```
223
230
224
231
</CodeGroup >
@@ -291,31 +298,238 @@ try {
291
298
292
299
</CodeGroup >
293
300
301
+ ### Advanced Schema Usage
302
+
303
+ The schema system in SearchScraper is a powerful way to ensure you get exactly the data structure you need. Here are some advanced techniques for using schemas effectively:
304
+
305
+ #### Nested Schemas
306
+
307
+ You can create complex nested structures to capture hierarchical data:
308
+
309
+ <CodeGroup >
310
+
311
+```python Python
+from pydantic import BaseModel, Field
+from typing import List, Optional
+
+class Author(BaseModel):
+    name: str = Field(description="Author's full name")
+    bio: Optional[str] = Field(description="Author's biography")
+    expertise: List[str] = Field(description="Areas of expertise")
+
+class Article(BaseModel):
+    title: str = Field(description="Article title")
+    content: str = Field(description="Main article content")
+    author: Author = Field(description="Article author information")
+    publication_date: str = Field(description="Date of publication")
+    tags: List[str] = Field(description="Article tags or categories")
+
+response = client.searchscraper(
+    user_prompt="Find the latest AI research articles",
+    output_schema=Article
+)
+```
332
+
333
+```typescript JavaScript
+import { z } from 'zod';
+
+const Author = z.object({
+  name: z.string().describe("Author's full name"),
+  bio: z.string().optional().describe("Author's biography"),
+  expertise: z.array(z.string()).describe("Areas of expertise")
+});
+
+const Article = z.object({
+  title: z.string().describe("Article title"),
+  content: z.string().describe("Main article content"),
+  author: Author.describe("Article author information"),
+  publicationDate: z.string().describe("Date of publication"),
+  tags: z.array(z.string()).describe("Article tags or categories")
+});
+
+const response = await searchScraper(apiKey, prompt, Article);
+```
352
+
353
+ </CodeGroup >
354
+
355
+ #### Schema Validation Rules
356
+
357
+ Enhance data quality by adding validation rules to your schema:
358
+
359
+ <CodeGroup >
360
+
361
+```python Python
+from pydantic import BaseModel, Field, validator
+from typing import List
+from datetime import datetime
+
+class ProductInfo(BaseModel):
+    name: str = Field(description="Product name")
+    price: float = Field(description="Product price", gt=0)
+    currency: str = Field(description="Currency code", max_length=3)
+    release_date: str = Field(description="Product release date")
+
+    @validator('currency')
+    def validate_currency(cls, v):
+        if len(v) != 3 or not v.isupper():
+            raise ValueError('Currency must be a 3-letter uppercase code')
+        return v
+
+    @validator('release_date')
+    def validate_date(cls, v):
+        try:
+            datetime.strptime(v, '%Y-%m-%d')
+            return v
+        except ValueError:
+            raise ValueError('Date must be in YYYY-MM-DD format')
+```
386
+
387
+```typescript JavaScript
+import { z } from 'zod';
+
+const ProductInfo = z.object({
+  name: z.string().min(1).describe("Product name"),
+  price: z.number().positive().describe("Product price"),
+  currency: z.string().length(3).toUpperCase()
+    .describe("Currency code"),
+  releaseDate: z.string().regex(/^\d{4}-\d{2}-\d{2}$/)
+    .describe("Product release date")
+});
+```
399
+
400
+ </CodeGroup >
401
+
402
+ ### Quality Improvement Tips
403
+
404
+ To get the highest quality results from SearchScraper, follow these best practices:
405
+
406
+ #### 1. Detailed Field Descriptions
407
+
408
+ Always provide clear, detailed descriptions for each field in your schema:
409
+
410
+```python
+class CompanyInfo(BaseModel):
+    revenue: str = Field(
+        description="Annual revenue in USD, including the year of reporting"
+        # Good: "Annual revenue in USD, including the year of reporting"
+        # Bad: "Revenue"
+    )
+    market_position: str = Field(
+        description="Company's market position including market share percentage and rank among competitors"
+        # Good: "Company's market position including market share percentage and rank among competitors"
+        # Bad: "Position"
+    )
+```
423
+
424
+ #### 2. Structured Prompts
425
+
426
+ Combine schemas with well-structured prompts for better results:
427
+
428
+```python
+response = client.searchscraper(
+    user_prompt="""
+    Find information about Tesla's electric vehicles with specific focus on:
+    - Latest Model 3 and Model Y specifications
+    - Current pricing structure
+    - Available customization options
+    - Delivery timeframes
+    Please include only verified information from official sources.
+    """,
+    output_schema=TeslaVehicleInfo
+)
+```
441
+
442
+ #### 3. Data Validation
443
+
444
+ Implement comprehensive validation to ensure data quality:
445
+
446
+```python
+from pydantic import BaseModel, Field, validator
+from typing import List, Optional
+from datetime import datetime
+
+class MarketData(BaseModel):
+    timestamp: str = Field(description="Data timestamp in ISO format")
+    value: float = Field(description="Market value")
+    confidence_score: float = Field(description="Confidence score between 0 and 1")
+
+    @validator('timestamp')
+    def validate_timestamp(cls, v):
+        try:
+            datetime.fromisoformat(v)
+            return v
+        except ValueError:
+            raise ValueError('Invalid ISO timestamp format')
+
+    @validator('confidence_score')
+    def validate_confidence(cls, v):
+        if not 0 <= v <= 1:
+            raise ValueError('Confidence score must be between 0 and 1')
+        return v
+```
470
+
471
+ #### 4. Error Handling
472
+
473
+ Implement robust error handling for schema validation:
474
+
475
+```python
+try:
+    response = client.searchscraper(
+        user_prompt="Find market data for NASDAQ:AAPL",
+        output_schema=MarketData
+    )
+    validated_data = MarketData(**response.result)
+except ValidationError as e:
+    print(f"Data validation failed: {e.json()}")
+    # Implement fallback logic or error reporting
+except Exception as e:
+    print(f"An error occurred: {str(e)}")
+```
488
+
294
489
### Async Support
295
490
296
-For applications requiring asynchronous execution:
+Example of using the async searchscraper functionality to search for information concurrently:
297
492
298
493
```python
-from scrapegraph_py import AsyncClient
 import asyncio
+from scrapegraph_py import AsyncClient
+from scrapegraph_py.logger import sgai_logger
+
+sgai_logger.set_logging(level="INFO")

 async def main():
-    async with AsyncClient(api_key="your-api-key") as client:
-
-        response = await client.searchscraper(
-            user_prompt="Analyze the current AI chip market",
-        )
-
-        # Process the structured results
-        market_data = response.result
-        print(f"Market Size: {market_data['market_overview']['total_size']}")
-        print(f"Growth Rate: {market_data['market_overview']['growth_rate']}")
-        print("\nKey Players:")
-        for player in market_data['market_overview']['key_players']:
-            print(f"  - {player}")
-
-# Run the async function
-asyncio.run(main())
+    # Initialize async client
+    sgai_client = AsyncClient(api_key="your-api-key-here")
+
+    # List of search queries
+    queries = [
+        "What is the latest version of Python and what are its main features?",
+        "What are the key differences between Python 2 and Python 3?",
+        "What is Python's GIL and how does it work?",
+    ]
+
+    # Create tasks for concurrent execution
+    tasks = [sgai_client.searchscraper(user_prompt=query) for query in queries]
+
+    # Execute requests concurrently
+    responses = await asyncio.gather(*tasks, return_exceptions=True)
+
+    # Process results
+    for i, response in enumerate(responses):
+        if isinstance(response, Exception):
+            print(f"\nError for query {i+1}: {response}")
+        else:
+            print(f"\nSearch {i+1}:")
+            print(f"Query: {queries[i]}")
+            print(f"Result: {response['result']}")
+            print("Reference URLs:")
+            for url in response["reference_urls"]:
+                print(f"  - {url}")
+
+    await sgai_client.close()
+
+if __name__ == "__main__":
+    asyncio.run(main())
 ```
320
534
321
535
## Integration Options
0 commit comments