Firecrawl - Extract actions (#16069)

michelle0927 · web-flow · commit a2bdd590b9b8 · 2025-04-01T11:20:11.000-04:00
* new extract actions

* update

* update

* version
diff --git a/components/bloomerang/bloomerang.app.mjs b/components/bloomerang/bloomerang.app.mjs
@@ -8,4 +8,4 @@ export default {
       console.log(Object.keys(this.$auth));
     },
   },
-};
+};
diff --git a/components/firecrawl/actions/crawl-url/crawl-url.mjs b/components/firecrawl/actions/crawl-url/crawl-url.mjs
@@ -5,7 +5,7 @@ export default {
   key: "firecrawl-crawl-url",
   name: "Crawl URL",
   description: "Crawls a given URL and returns the contents of sub-pages. [See the documentation](https://docs.firecrawl.dev/api-reference/endpoint/crawl-post)",
-  version: "1.0.1",
+  version: "1.0.2",
   type: "action",
   props: {
     firecrawl,
diff --git a/components/firecrawl/actions/extract-data/extract-data.mjs b/components/firecrawl/actions/extract-data/extract-data.mjs
@@ -0,0 +1,96 @@
+import firecrawl from "../../firecrawl.app.mjs";
+import { ConfigurationError } from "@pipedream/platform";
+import { parseObjectEntries } from "../../common/utils.mjs";
+
+export default {
+  key: "firecrawl-extract-data",
+  name: "Extract Data",
+  description: "Extract structured data from one or multiple URLs. [See the documentation](https://docs.firecrawl.dev/api-reference/endpoint/extract)",
+  version: "0.0.1",
+  type: "action",
+  props: {
+    firecrawl,
+    urls: {
+      type: "string[]",
+      label: "URLs",
+      description: "An array of one or more URLs. Supports wildcards (/*) for broader crawling.",
+    },
+    prompt: {
+      type: "string",
+      label: "Prompt",
+      description: "(Optional unless no schema): A natural language prompt describing the data you want or specifying how you want that data structured.",
+      optional: true,
+    },
+    schema: {
+      type: "object",
+      label: "Schema",
+      description: "(Optional unless no prompt): A more rigid structure if you already know the JSON layout.",
+      optional: true,
+    },
+    enableWebSearch: {
+      type: "boolean",
+      label: "Enable Web Search",
+      description: "When `true`, the extraction will use web search to find additional data",
+      optional: true,
+    },
+    ignoreSitemap: {
+      type: "boolean",
+      label: "Ignore Sitemap",
+      description: "When true, sitemap.xml files will be ignored during website scanning",
+      optional: true,
+    },
+    includeSubdomains: {
+      type: "boolean",
+      label: "Include Subdomains",
+      description: "When true, subdomains of the provided URLs will also be scanned",
+      optional: true,
+    },
+    showSources: {
+      type: "boolean",
+      label: "Show Sources",
+      description: "When true, the sources used to extract the data will be included in the response",
+      optional: true,
+    },
+    waitForCompletion: {
+      type: "boolean",
+      label: "Wait For Completion",
+      description: "Set to `true` to poll the API in 3-second intervals until the job is completed",
+      optional: true,
+    },
+  },
+  async run({ $ }) {
+    if (!this.prompt && !this.schema) {
+      throw new ConfigurationError("Must enter one of Prompt or Schema");
+    }
+
+    let response = await this.firecrawl.extract({
+      $,
+      data: {
+        urls: this.urls,
+        prompt: this.prompt,
+        schema: this.schema && parseObjectEntries(this.schema),
+        enableWebSearch: this.enableWebSearch,
+        ignoreSitemap: this.ignoreSitemap,
+        includeSubdomains: this.includeSubdomains,
+        showSources: this.showSources,
+      },
+    });
+
+    if (this.waitForCompletion) {
+      const id = response.id;
+      const timer = (ms) => new Promise((res) => setTimeout(res, ms));
+      do {
+        response = await this.firecrawl.getExtractStatus({
+          $,
+          id,
+        });
+        await timer(3000);
+      } while (response.status === "processing");
+    }
+
+    if (response.success) {
+      $.export("$summary", "Successfully extracted data.");
+    }
+    return response;
+  },
+};
diff --git a/components/firecrawl/actions/get-crawl-status/get-crawl-status.mjs b/components/firecrawl/actions/get-crawl-status/get-crawl-status.mjs
@@ -4,7 +4,7 @@ export default {
   key: "firecrawl-get-crawl-status",
   name: "Get Crawl Data",
   description: "Obtains the status and data from a previous crawl operation. [See the documentation](https://docs.firecrawl.dev/api-reference/endpoint/crawl-get)",
-  version: "0.0.2",
+  version: "0.0.3",
   type: "action",
   props: {
     firecrawl,
diff --git a/components/firecrawl/actions/get-extract-status/get-extract-status.mjs b/components/firecrawl/actions/get-extract-status/get-extract-status.mjs
@@ -0,0 +1,26 @@
+import firecrawl from "../../firecrawl.app.mjs";
+
+export default {
+  key: "firecrawl-get-extract-status",
+  name: "Get Extract Data",
+  description: "Obtains the status and data from a previous extract operation. [See the documentation](https://docs.firecrawl.dev/api-reference/endpoint/extract-get)",
+  version: "0.0.1",
+  type: "action",
+  props: {
+    firecrawl,
+    extractId: {
+      type: "string",
+      label: "Extract Job ID",
+      description: "The ID of the extract job",
+    },
+  },
+  async run({ $ }) {
+    const response = await this.firecrawl.getExtractStatus({
+      $,
+      id: this.extractId,
+    });
+
+    $.export("$summary", `Successfully retrieved status for extract (ID: ${this.extractId})`);
+    return response;
+  },
+};
diff --git a/components/firecrawl/actions/scrape-page/scrape-page.mjs b/components/firecrawl/actions/scrape-page/scrape-page.mjs
@@ -7,7 +7,7 @@ export default {
   name: "Scrape Page",
   description:
     "Scrapes a URL and returns content from that page. [See the documentation](https://docs.firecrawl.dev/api-reference/endpoint/scrape)",
-  version: "1.0.0",
+  version: "1.0.1",
   type: "action",
   props: {
     firecrawl,
diff --git a/components/firecrawl/firecrawl.app.mjs b/components/firecrawl/firecrawl.app.mjs
@@ -62,5 +62,20 @@ export default {
         ...opts,
       });
     },
+    extract(opts = {}) {
+      return this._makeRequest({
+        method: "POST",
+        path: "/extract",
+        ...opts,
+      });
+    },
+    getExtractStatus({
+      id, ...opts
+    }) {
+      return this._makeRequest({
+        path: `/extract/${id}`,
+        ...opts,
+      });
+    },
   },
 };
diff --git a/components/firecrawl/package.json b/components/firecrawl/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@pipedream/firecrawl",
-  "version": "1.0.1",
+  "version": "1.1.0",
   "description": "Pipedream FireCrawl Components",
   "main": "firecrawl.app.mjs",
   "keywords": [
diff --git a/components/hyperbrowser/hyperbrowser.app.mjs b/components/hyperbrowser/hyperbrowser.app.mjs
@@ -8,4 +8,4 @@ export default {
       console.log(Object.keys(this.$auth));
     },
   },
-};
+};
diff --git a/components/nutrient_workflow_automation/nutrient_workflow_automation.app.mjs b/components/nutrient_workflow_automation/nutrient_workflow_automation.app.mjs
@@ -8,4 +8,4 @@ export default {
       console.log(Object.keys(this.$auth));
     },
   },
-};
+};
diff --git a/components/oracle_cloud_infrastructure/oracle_cloud_infrastructure.app.mjs b/components/oracle_cloud_infrastructure/oracle_cloud_infrastructure.app.mjs
@@ -8,4 +8,4 @@ export default {
       console.log(Object.keys(this.$auth));
     },
   },
-};
+};

-Original file line number
+Diff line change
       console.log(Object.keys(this.$auth));
     },
   },
 -};
 +};
Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`{`
`2`	`2`	`"name": "@pipedream/firecrawl",`
`3`		`- "version": "1.0.1",`
	`3`	`+ "version": "1.1.0",`
`4`	`4`	`"description": "Pipedream FireCrawl Components",`
`5`	`5`	`"main": "firecrawl.app.mjs",`
`6`	`6`	`"keywords": [`