Commit 2205a29

Merge pull request #11 from ScrapeGraphAI/refactoring-endpoints
feat: add endpoints refactoring
2 parents: 5c4e727 + 91e73e1

File tree: 3 files changed (+432, -30 lines)


Diff for: services/markdownify.mdx

+54 -2
@@ -153,7 +153,14 @@ try {
 ```
 
 ```bash cURL
-// TODO
+curl -X 'POST' \
+  'https://api.scrapegraphai.com/v1/markdownify' \
+  -H 'accept: application/json' \
+  -H 'SGAI-APIKEY: sgai-********************' \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "website_url": "https://example.com"
+  }'
 ```
 
 </CodeGroup>
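
Outside the SDKs, the same call is easy to reproduce in plain Python; a minimal sketch using `requests`, assuming the endpoint returns JSON with the converted markdown under a `result` key (the field the SDK examples below read):

```python
import requests

# Mirrors the cURL example above (sketch; the API key is a placeholder)
resp = requests.post(
    "https://api.scrapegraphai.com/v1/markdownify",
    headers={
        "accept": "application/json",
        "SGAI-APIKEY": "sgai-********************",
        "Content-Type": "application/json",
    },
    json={"website_url": "https://example.com"},
)
resp.raise_for_status()
print(resp.json()["result"])  # assumed response shape, matching the SDK examples
```
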
@@ -168,7 +175,7 @@ try {
 
 ### Async Support
 
-For applications requiring asynchronous execution, Markdownify provides async support through the `AsyncClient`:
+For applications requiring asynchronous execution, Markdownify provides async support through the `AsyncClient`. Here's a basic example:
 
 ```python
 from scrapegraph_py import AsyncClient
@@ -185,6 +192,51 @@ async def main():
 asyncio.run(main())
 ```
 
+For more advanced concurrent processing, you can use the following example:
+
+```python
+import asyncio
+from scrapegraph_py import AsyncClient
+from scrapegraph_py.logger import sgai_logger
+
+sgai_logger.set_logging(level="INFO")
+
+async def main():
+    # Initialize async client
+    sgai_client = AsyncClient(api_key="your-api-key-here")
+
+    # Concurrent markdownify requests
+    urls = [
+        "https://scrapegraphai.com/",
+        "https://github.com/ScrapeGraphAI/Scrapegraph-ai",
+    ]
+
+    tasks = [sgai_client.markdownify(website_url=url) for url in urls]
+
+    # Execute requests concurrently
+    responses = await asyncio.gather(*tasks, return_exceptions=True)
+
+    # Process results
+    for i, response in enumerate(responses):
+        if isinstance(response, Exception):
+            print(f"\nError for {urls[i]}: {response}")
+        else:
+            print(f"\nPage {i+1} Markdown:")
+            print(f"URL: {urls[i]}")
+            print(f"Result: {response['result']}")
+
+    await sgai_client.close()
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+This advanced example demonstrates:
+- Concurrent processing of multiple URLs
+- Error handling for failed requests
+- Proper client cleanup
+- Logging configuration
+
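
If you are converting many URLs at once, an unbounded `gather` can run into rate limits; a sketch of a bounded variant using `asyncio.Semaphore` (the cap of 5 is an arbitrary assumption, not an SDK default):

```python
import asyncio
from scrapegraph_py import AsyncClient

async def bounded_markdownify(client, sem, url):
    # Hold a semaphore slot for the duration of each request
    async with sem:
        return await client.markdownify(website_url=url)

async def run(urls):
    sem = asyncio.Semaphore(5)  # at most 5 requests in flight (assumed cap)
    client = AsyncClient(api_key="your-api-key-here")
    try:
        return await asyncio.gather(
            *(bounded_markdownify(client, sem, url) for url in urls),
            return_exceptions=True,
        )
    finally:
        await client.close()  # clean up even if a task raises
```
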
 ## Integration Options
 
 ### Official SDKs

Diff for: services/searchscraper.mdx

+233 -19
@@ -218,7 +218,14 @@ try {
 ```
 
 ```bash cURL
-// TODO
+curl -X 'POST' \
+  'https://api.scrapegraphai.com/v1/searchscraper' \
+  -H 'accept: application/json' \
+  -H 'SGAI-APIKEY: sgai-********************' \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "user_prompt": "Search for information"
+  }'
 ```
 
 </CodeGroup>
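
As with markdownify, the raw JSON response can be consumed without an SDK; a sketch assuming the body carries `result` and `reference_urls` fields, the same fields the async example further down reads:

```python
import requests

# Mirrors the cURL example above (sketch; key and prompt are placeholders)
resp = requests.post(
    "https://api.scrapegraphai.com/v1/searchscraper",
    headers={
        "accept": "application/json",
        "SGAI-APIKEY": "sgai-********************",
        "Content-Type": "application/json",
    },
    json={"user_prompt": "Search for information"},
)
resp.raise_for_status()
data = resp.json()
print(data["result"])                # assumed field, as in the SDK examples
for url in data.get("reference_urls", []):
    print("-", url)                  # sources backing the answer
```
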
@@ -291,31 +298,238 @@ try {
 
 </CodeGroup>
 
+### Advanced Schema Usage
+
+The schema system in SearchScraper is a powerful way to ensure you get exactly the data structure you need. Here are some advanced techniques for using schemas effectively:
+
+#### Nested Schemas
+
+You can create complex nested structures to capture hierarchical data:
+
+<CodeGroup>
+
+```python Python
+from pydantic import BaseModel, Field
+from typing import List, Optional
+
+class Author(BaseModel):
+    name: str = Field(description="Author's full name")
+    bio: Optional[str] = Field(description="Author's biography")
+    expertise: List[str] = Field(description="Areas of expertise")
+
+class Article(BaseModel):
+    title: str = Field(description="Article title")
+    content: str = Field(description="Main article content")
+    author: Author = Field(description="Article author information")
+    publication_date: str = Field(description="Date of publication")
+    tags: List[str] = Field(description="Article tags or categories")
+
+response = client.searchscraper(
+    user_prompt="Find the latest AI research articles",
+    output_schema=Article
+)
+```
+
+```typescript JavaScript
+import { z } from 'zod';
+
+const Author = z.object({
+  name: z.string().describe("Author's full name"),
+  bio: z.string().optional().describe("Author's biography"),
+  expertise: z.array(z.string()).describe("Areas of expertise")
+});
+
+const Article = z.object({
+  title: z.string().describe("Article title"),
+  content: z.string().describe("Main article content"),
+  author: Author.describe("Article author information"),
+  publicationDate: z.string().describe("Date of publication"),
+  tags: z.array(z.string()).describe("Article tags or categories")
+});
+
+const response = await searchScraper(apiKey, prompt, Article);
+```
+
+</CodeGroup>
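
Continuing the Python example above, the returned data can also be re-validated client-side against the same model; a sketch, assuming `response['result']` is a plain dict shaped like the schema:

```python
from pydantic import ValidationError

try:
    article = Article(**response["result"])  # re-validate the raw result
    print(article.title, "by", article.author.name)
except ValidationError as e:
    print("Result did not match the Article schema:", e)
```
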
+
+#### Schema Validation Rules
+
+Enhance data quality by adding validation rules to your schema:
+
+<CodeGroup>
+
+```python Python
+from pydantic import BaseModel, Field, validator
+from typing import List
+from datetime import datetime
+
+class ProductInfo(BaseModel):
+    name: str = Field(description="Product name")
+    price: float = Field(description="Product price", gt=0)
+    currency: str = Field(description="Currency code", max_length=3)
+    release_date: str = Field(description="Product release date")
+
+    @validator('currency')
+    def validate_currency(cls, v):
+        if len(v) != 3 or not v.isupper():
+            raise ValueError('Currency must be a 3-letter uppercase code')
+        return v
+
+    @validator('release_date')
+    def validate_date(cls, v):
+        try:
+            datetime.strptime(v, '%Y-%m-%d')
+            return v
+        except ValueError:
+            raise ValueError('Date must be in YYYY-MM-DD format')
+```
+
+```typescript JavaScript
+import { z } from 'zod';
+
+const ProductInfo = z.object({
+  name: z.string().min(1).describe("Product name"),
+  price: z.number().positive().describe("Product price"),
+  currency: z.string().length(3).toUpperCase()
+    .describe("Currency code"),
+  releaseDate: z.string().regex(/^\d{4}-\d{2}-\d{2}$/)
+    .describe("Product release date")
+});
+```
+
+</CodeGroup>
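
A quick way to sanity-check these validators is to instantiate the model directly (illustrative values):

```python
from pydantic import ValidationError

# Passes every rule
ok = ProductInfo(name="Widget", price=19.99, currency="USD", release_date="2024-05-01")

# Lowercase currency trips the custom validator
try:
    ProductInfo(name="Widget", price=19.99, currency="usd", release_date="2024-05-01")
except ValidationError as e:
    print(e)  # Currency must be a 3-letter uppercase code
```
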
+
+### Quality Improvement Tips
+
+To get the highest quality results from SearchScraper, follow these best practices:
+
+#### 1. Detailed Field Descriptions
+
+Always provide clear, detailed descriptions for each field in your schema:
+
+```python
+class CompanyInfo(BaseModel):
+    revenue: str = Field(
+        description="Annual revenue in USD, including the year of reporting"
+        # Good: "Annual revenue in USD, including the year of reporting"
+        # Bad: "Revenue"
+    )
+    market_position: str = Field(
+        description="Company's market position including market share percentage and rank among competitors"
+        # Good: "Company's market position including market share percentage and rank among competitors"
+        # Bad: "Position"
+    )
+```
+
+#### 2. Structured Prompts
+
+Combine schemas with well-structured prompts for better results:
+
+```python
+response = client.searchscraper(
+    user_prompt="""
+    Find information about Tesla's electric vehicles with specific focus on:
+    - Latest Model 3 and Model Y specifications
+    - Current pricing structure
+    - Available customization options
+    - Delivery timeframes
+    Please include only verified information from official sources.
+    """,
+    output_schema=TeslaVehicleInfo
+)
+```
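
The snippet above assumes a `TeslaVehicleInfo` model already exists; a minimal hypothetical definition matching the prompt's focus areas might look like:

```python
from pydantic import BaseModel, Field
from typing import List

class TeslaVehicleInfo(BaseModel):
    # Hypothetical schema; field names mirror the prompt's bullet points
    specifications: str = Field(description="Latest Model 3 and Model Y specifications")
    pricing: str = Field(description="Current pricing structure")
    customization_options: List[str] = Field(description="Available customization options")
    delivery_timeframes: str = Field(description="Estimated delivery timeframes")
```
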
+
+#### 3. Data Validation
+
+Implement comprehensive validation to ensure data quality:
+
+```python
+from pydantic import BaseModel, Field, validator
+from typing import List, Optional
+from datetime import datetime
+
+class MarketData(BaseModel):
+    timestamp: str = Field(description="Data timestamp in ISO format")
+    value: float = Field(description="Market value")
+    confidence_score: float = Field(description="Confidence score between 0 and 1")
+
+    @validator('timestamp')
+    def validate_timestamp(cls, v):
+        try:
+            datetime.fromisoformat(v)
+            return v
+        except ValueError:
+            raise ValueError('Invalid ISO timestamp format')
+
+    @validator('confidence_score')
+    def validate_confidence(cls, v):
+        if not 0 <= v <= 1:
+            raise ValueError('Confidence score must be between 0 and 1')
+        return v
+```
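
For reference, `datetime.fromisoformat` accepts strings like the following, so any of them would pass the timestamp validator (illustrative values):

```python
from datetime import datetime

for ts in ["2024-05-01", "2024-05-01T12:30:00", "2024-05-01T12:30:00+00:00"]:
    print(datetime.fromisoformat(ts))  # parses without raising
```
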
+
+#### 4. Error Handling
+
+Implement robust error handling for schema validation:
+
+```python
+from pydantic import ValidationError
+
+try:
+    response = client.searchscraper(
+        user_prompt="Find market data for NASDAQ:AAPL",
+        output_schema=MarketData
+    )
+    validated_data = MarketData(**response.result)
+except ValidationError as e:
+    print(f"Data validation failed: {e.json()}")
+    # Implement fallback logic or error reporting
+except Exception as e:
+    print(f"An error occurred: {str(e)}")
+```
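
The fallback comment above is left open; one simple option is a bounded retry that re-validates on each attempt. A sketch (the helper below is illustrative, not part of the SDK):

```python
import time

def searchscraper_validated(client, prompt, schema, attempts=3, delay=2.0):
    # Hypothetical retry wrapper around client.searchscraper
    last_err = None
    for attempt in range(attempts):
        try:
            response = client.searchscraper(user_prompt=prompt, output_schema=schema)
            return schema(**response.result)  # re-validate before returning
        except Exception as e:  # includes pydantic.ValidationError
            last_err = e
            time.sleep(delay * (attempt + 1))  # simple linear backoff
    raise last_err

# Usage (mirrors the example above):
# data = searchscraper_validated(client, "Find market data for NASDAQ:AAPL", MarketData)
```
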
+
 ### Async Support
 
-For applications requiring asynchronous execution:
+Here's an example of using the async searchscraper functionality to run several searches concurrently:
 
 ```python
-from scrapegraph_py import AsyncClient
 import asyncio
+from scrapegraph_py import AsyncClient
+from scrapegraph_py.logger import sgai_logger
+
+sgai_logger.set_logging(level="INFO")
 
 async def main():
-    async with AsyncClient(api_key="your-api-key") as client:
-
-        response = await client.searchscraper(
-            user_prompt="Analyze the current AI chip market",
-        )
-
-        # Process the structured results
-        market_data = response.result
-        print(f"Market Size: {market_data['market_overview']['total_size']}")
-        print(f"Growth Rate: {market_data['market_overview']['growth_rate']}")
-        print("\nKey Players:")
-        for player in market_data['market_overview']['key_players']:
-            print(f"- {player}")
-
-# Run the async function
-asyncio.run(main())
+    # Initialize async client
+    sgai_client = AsyncClient(api_key="your-api-key-here")
+
+    # List of search queries
+    queries = [
+        "What is the latest version of Python and what are its main features?",
+        "What are the key differences between Python 2 and Python 3?",
+        "What is Python's GIL and how does it work?",
+    ]
+
+    # Create tasks for concurrent execution
+    tasks = [sgai_client.searchscraper(user_prompt=query) for query in queries]
+
+    # Execute requests concurrently
+    responses = await asyncio.gather(*tasks, return_exceptions=True)
+
+    # Process results
+    for i, response in enumerate(responses):
+        if isinstance(response, Exception):
+            print(f"\nError for query {i+1}: {response}")
+        else:
+            print(f"\nSearch {i+1}:")
+            print(f"Query: {queries[i]}")
+            print(f"Result: {response['result']}")
+            print("Reference URLs:")
+            for url in response["reference_urls"]:
+                print(f"- {url}")
+
+    await sgai_client.close()
+
+if __name__ == "__main__":
+    asyncio.run(main())
 ```
 
 ## Integration Options
