|
2 | 2 |
|
3 | 3 | Combined with failure retries, a custom error count and HTTP status codes are used to automatically rotate proxies for crawl targets.
|
4 | 4 |
|
5 |
| -It can be set in three places: Create crawler application instance, advanced usage, and detailed goals. |
6 |
| - |
7 |
| -Take crawlPage as an example: |
8 |
| - |
9 |
| -```js |
| 5 | +```js{8,9,10,11,12,13,14,15,16} |
10 | 6 | import { createCrawl } from 'x-crawl'
|
11 | 7 |
|
12 | 8 | const crawlApp = createCrawl()
|
13 | 9 |
|
14 | 10 | crawlApp
|
15 |
| - .crawlPage({ |
16 |
| - targets: [ |
17 |
| - 'https://www.example.com/page-1', |
18 |
| - 'https://www.example.com/page-2', |
19 |
| - 'https://www.example.com/page-3', |
20 |
| - 'https://www.example.com/page-4', |
21 |
| - // Cancel the proxy for this target |
22 |
| - { url: 'https://www.example.com/page-6', proxy: null }, |
23 |
| - // Set up a separate proxy for this target |
24 |
| - { |
25 |
| - url: 'https://www.example.com/page-6', |
26 |
| - proxy: { |
27 |
| - urls: [ |
28 |
| - 'https://www.example.com/proxy-4', |
29 |
| - 'https://www.example.com/proxy-5' |
30 |
| - ], |
31 |
| - switchByErrorCount: 3 |
32 |
| - } |
33 |
| - } |
34 |
| - ], |
35 |
| - maxRetry: 10, |
36 |
| - // Set the proxy uniformly for this target |
37 |
| - proxy: { |
38 |
| - urls: [ |
39 |
| - 'https://www.example.com/proxy-1', |
40 |
| - 'https://www.example.com/proxy-2', |
41 |
| - 'https://www.example.com/proxy-3' |
42 |
| - ], |
43 |
| - switchByErrorCount: 3, |
44 |
| - switchByHttpStatus: [401, 403] |
45 |
| - } |
46 |
| - }) |
47 |
| - .then((res) => {}) |
| 11 | + .crawlPage({ |
| 12 | + url: 'https://www.example.com', |
| 13 | + maxRetry: 10, |
| 14 | + proxy: { |
| 15 | + urls: [ |
| 16 | + 'https://www.example.com/proxy-1', |
| 17 | + 'https://www.example.com/proxy-2' |
| 18 | + ], |
| 19 | + switchByHttpStatus: [401, 403], |
| 20 | + switchByErrorCount: 3 |
| 21 | + } |
| 22 | + }) |
| 23 | + .then((res) => {}) |
48 | 24 | ```
|
49 | 25 |
|
| 26 | +In the above example, `switchByErrorCount` gives each proxy 3 attempts. Once those 3 attempts are used up, the crawler automatically switches to the next proxy. If `switchByHttpStatus` is provided, switching based on the HTTP status code takes priority. |
| 27 | + |
50 | 28 | ::: tip
|
51 |
| -This function needs to be retried upon failure to function properly. |
| 29 | +This feature only takes effect together with failure retries (`maxRetry`). `maxRetry` must be greater than the sum of `switchByErrorCount` across all proxies of the target, because `maxRetry` controls the number of retries for the target. |
52 | 30 | :::
|
| 31 | + |
| 32 | +**It can be set in three places: when creating the crawler application instance, in the advanced usage configuration, and on detailed targets.** |
| 33 | + |
| 34 | +Take crawlPage as an example: |
| 35 | + |
| 36 | +```js{13,17,18,19,20,21,22,23,26,28,29,30,31,32,33,34,35,36} |
| 37 | +import { createCrawl } from 'x-crawl' |
| 38 | +
|
| 39 | +const crawlApp = createCrawl() |
| 40 | +
|
| 41 | +crawlApp |
| 42 | + .crawlPage({ |
| 43 | + targets: [ |
| 44 | + 'https://www.example.com/page-1', |
| 45 | + 'https://www.example.com/page-2', |
| 46 | + 'https://www.example.com/page-3', |
| 47 | + 'https://www.example.com/page-4', |
| 48 | + // Cancel the proxy for this target |
| 49 | + { url: 'https://www.example.com/page-6', proxy: null }, |
| 50 | + // Set up a separate proxy for this target |
| 51 | + { |
| 52 | + url: 'https://www.example.com/page-6', |
| 53 | + proxy: { |
| 54 | + urls: [ |
| 55 | + 'https://www.example.com/proxy-4', |
| 56 | + 'https://www.example.com/proxy-5' |
| 57 | + ], |
| 58 | + switchByErrorCount: 3 |
| 59 | + } |
| 60 | + } |
| 61 | + ], |
| 62 | + maxRetry: 10, |
| 63 | + // Set the proxy uniformly for this target |
| 64 | + proxy: { |
| 65 | + urls: [ |
| 66 | + 'https://www.example.com/proxy-1', |
| 67 | + 'https://www.example.com/proxy-2', |
| 68 | + 'https://www.example.com/proxy-3' |
| 69 | + ], |
| 70 | + switchByErrorCount: 3, |
| 71 | + switchByHttpStatus: [401, 403] |
| 72 | + } |
| 73 | + }) |
| 74 | + .then((res) => {}) |
| 75 | +``` |
0 commit comments