From fa7ca4411eed588c91af90f1169afb6d380f2650 Mon Sep 17 00:00:00 2001
From: viktor <viktorhu@foxmail.com>
Date: Wed, 14 May 2025 19:48:51 +0800
Subject: [PATCH 1/4] feat: Refactor NovitaTextToVideoTask to use the async
 API.

---
 packages/inference/README.md                  |  3 +-
 .../inference/src/lib/getProviderHelper.ts    |  1 +
 packages/inference/src/providers/novita.ts    | 89 +++++++++++++++----
 .../inference/src/tasks/cv/textToVideo.ts     |  4 +-
 4 files changed, 75 insertions(+), 22 deletions(-)

diff --git a/packages/inference/README.md b/packages/inference/README.md
index 55cff9429c..6542dd8855 100644
--- a/packages/inference/README.md
+++ b/packages/inference/README.md
@@ -52,7 +52,7 @@ Currently, we support the following providers:
 - [Fireworks AI](https://fireworks.ai)
 - [Hyperbolic](https://hyperbolic.xyz)
 - [Nebius](https://studio.nebius.ai)
-- [Novita](https://novita.ai/?utm_source=github_huggingface&utm_medium=github_readme&utm_campaign=link)
+- [Novita](https://novita.ai)
 - [Nscale](https://nscale.com)
 - [OVHcloud](https://endpoints.ai.cloud.ovh.net/)
 - [Replicate](https://replicate.com)
@@ -93,6 +93,7 @@ Only a subset of models are supported when requesting third-party providers. You
 - [Cerebras supported models](https://huggingface.co/api/partners/cerebras/models)
 - [Groq supported models](https://console.groq.com/docs/models)
 - [HF Inference API (serverless)](https://huggingface.co/models?inference=warm&sort=trending)
+- [Novita AI supported models](https://huggingface.co/api/partners/novita/models)
 
 ❗**Important note:** To be compatible, the third-party API must adhere to the "standard" shape API we expect on HF model pages for each pipeline task type.
 This is not an issue for LLMs as everyone converged on the OpenAI API anyways, but can be more tricky for other tasks like "text-to-image" or "automatic-speech-recognition" where there exists no standard API. Let us know if any help is needed or if we can make things easier for you!
diff --git a/packages/inference/src/lib/getProviderHelper.ts b/packages/inference/src/lib/getProviderHelper.ts
index 4e9e3ddbe5..8d85f43dae 100644
--- a/packages/inference/src/lib/getProviderHelper.ts
+++ b/packages/inference/src/lib/getProviderHelper.ts
@@ -120,6 +120,7 @@ export const PROVIDERS: Record<InferenceProvider, Partial<Record<InferenceTask,
 	novita: {
 		conversational: new Novita.NovitaConversationalTask(),
 		"text-generation": new Novita.NovitaTextGenerationTask(),
+		"text-to-video": new Novita.NovitaTextToVideoTask(),
 	},
 	nscale: {
 		"text-to-image": new Nscale.NscaleTextToImageTask(),
diff --git a/packages/inference/src/providers/novita.ts b/packages/inference/src/providers/novita.ts
index bc66e0936d..a014aec55d 100644
--- a/packages/inference/src/providers/novita.ts
+++ b/packages/inference/src/providers/novita.ts
@@ -17,6 +17,7 @@
 import { InferenceOutputError } from "../lib/InferenceOutputError";
 import { isUrl } from "../lib/isUrl";
 import type { BodyParams, UrlParams } from "../types";
+import { delay } from "../utils/delay";
 import { omit } from "../utils/omit";
 import {
 	BaseConversationalTask,
@@ -26,11 +27,11 @@ import {
 } from "./providerHelper";
 
 const NOVITA_API_BASE_URL = "https://api.novita.ai";
-export interface NovitaOutput {
-	video: {
-		video_url: string;
-	};
+
+export interface NovitaAsyncAPIOutput {
+	task_id: string;
 }
+
 export class NovitaTextGenerationTask extends BaseTextGenerationTask {
 	constructor() {
 		super("novita", NOVITA_API_BASE_URL);
@@ -50,38 +51,88 @@ export class NovitaConversationalTask extends BaseConversationalTask {
 		return "/v3/openai/chat/completions";
 	}
 }
+
 export class NovitaTextToVideoTask extends TaskProviderHelper implements TextToVideoTaskHelper {
 	constructor() {
 		super("novita", NOVITA_API_BASE_URL);
 	}
 
-	makeRoute(params: UrlParams): string {
-		return `/v3/hf/${params.model}`;
+	override makeRoute(params: UrlParams): string {
+		if (params.authMethod !== "provider-key") {
+			return `/v3/async/${params.model}?_subdomain=queue`;
+		}
+		return `/v3/async/${params.model}`;
 	}
 
-	preparePayload(params: BodyParams): Record<string, unknown> {
+	override preparePayload(params: BodyParams): Record<string, unknown> {
+		const { num_inference_steps, ...restParameters } = params.args.parameters as Record<string, unknown>;
 		return {
 			...omit(params.args, ["inputs", "parameters"]),
-			...(params.args.parameters as Record<string, unknown>),
+			...restParameters,
+			steps: num_inference_steps,
 			prompt: params.args.inputs,
 		};
 	}
-	override async getResponse(response: NovitaOutput): Promise<Blob> {
+
+	override async getResponse(
+		response: NovitaAsyncAPIOutput,
+		url?: string,
+		headers?: Record<string, string>
+	): Promise<Blob> {
+		if (!url || !headers) {
+			throw new InferenceOutputError("URL and headers are required for text-to-video task");
+		}
+		const taskId = response.task_id;
+		if (!taskId) {
+			throw new InferenceOutputError("No task ID found in the response");
+		}
+
+		const parsedUrl = new URL(url);
+		const baseUrl = `${parsedUrl.protocol}//${parsedUrl.host}${
+			parsedUrl.host === "router.huggingface.co" ? "/novita" : ""
+		}`;
+		const queryParams = parsedUrl.search;
+		const resultUrl = `${baseUrl}/v3/async/task-result${queryParams ? queryParams + '&' : '?'}task_id=${taskId}`;
+
+		let status = '';
+		let taskResult = undefined;
+
+		while (status !== 'TASK_STATUS_SUCCEED' && status !== 'TASK_STATUS_FAILED') {
+			await delay(500);
+			const resultResponse = await fetch(resultUrl, { headers });
+			if (!resultResponse.ok) {
+				throw new InferenceOutputError("Failed to fetch task result");
+			}
+			try {
+				taskResult = await resultResponse.json();
+				status = taskResult.task.status;
+			} catch (error) {
+				throw new InferenceOutputError("Failed to parse task result");
+			}
+		}
+
+		if (status === 'TASK_STATUS_FAILED') {
+			throw new InferenceOutputError("Task failed");
+		}
+
+		// There will be at most one video in the response.
 		const isValidOutput =
-			typeof response === "object" &&
-			!!response &&
-			"video" in response &&
-			typeof response.video === "object" &&
-			!!response.video &&
-			"video_url" in response.video &&
-			typeof response.video.video_url === "string" &&
-			isUrl(response.video.video_url);
+			typeof taskResult === "object" &&
+			!!taskResult &&
+			"videos" in taskResult &&
+			typeof taskResult.videos === "object" &&
+			!!taskResult.videos &&
+			Array.isArray(taskResult.videos) &&
+			taskResult.videos.length > 0 &&
+			"video_url" in taskResult.videos[0] &&
+			typeof taskResult.videos[0].video_url === "string" &&
+			isUrl(taskResult.videos[0].video_url);
 
 		if (!isValidOutput) {
-			throw new InferenceOutputError("Expected { video: { video_url: string } }");
+			throw new InferenceOutputError("Expected { videos: [{ video_url: string }] }");
 		}
 
-		const urlResponse = await fetch(response.video.video_url);
+		const urlResponse = await fetch(taskResult.videos[0].video_url);
 		return await urlResponse.blob();
 	}
 }
diff --git a/packages/inference/src/tasks/cv/textToVideo.ts b/packages/inference/src/tasks/cv/textToVideo.ts
index 58f12e26ad..c4a619b32c 100644
--- a/packages/inference/src/tasks/cv/textToVideo.ts
+++ b/packages/inference/src/tasks/cv/textToVideo.ts
@@ -3,7 +3,7 @@ import { resolveProvider } from "../../lib/getInferenceProviderMapping";
 import { getProviderHelper } from "../../lib/getProviderHelper";
 import { makeRequestOptions } from "../../lib/makeRequestOptions";
 import type { FalAiQueueOutput } from "../../providers/fal-ai";
-import type { NovitaOutput } from "../../providers/novita";
+import type { NovitaAsyncAPIOutput } from "../../providers/novita";
 import type { ReplicateOutput } from "../../providers/replicate";
 import type { BaseArgs, Options } from "../../types";
 import { innerRequest } from "../../utils/request";
@@ -15,7 +15,7 @@ export type TextToVideoOutput = Blob;
 export async function textToVideo(args: TextToVideoArgs, options?: Options): Promise<TextToVideoOutput> {
 	const provider = await resolveProvider(args.provider, args.model, args.endpointUrl);
 	const providerHelper = getProviderHelper(provider, "text-to-video");
-	const { data: response } = await innerRequest<FalAiQueueOutput | ReplicateOutput | NovitaOutput>(
+	const { data: response } = await innerRequest<FalAiQueueOutput | ReplicateOutput | NovitaAsyncAPIOutput>(
 		args,
 		providerHelper,
 		{

From 32a68f2ab23a9a836c76b62823beeca8ba21348e Mon Sep 17 00:00:00 2001
From: viktor <viktorhu@foxmail.com>
Date: Fri, 16 May 2025 19:03:55 +0800
Subject: [PATCH 2/4] fix: Remove an unintended README change.

---
 packages/inference/README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/packages/inference/README.md b/packages/inference/README.md
index 9018b3d5b2..21e46625be 100644
--- a/packages/inference/README.md
+++ b/packages/inference/README.md
@@ -96,7 +96,6 @@ Only a subset of models are supported when requesting third-party providers. You
 - [Cohere supported models](https://huggingface.co/api/partners/cohere/models)
 - [Cerebras supported models](https://huggingface.co/api/partners/cerebras/models)
 - [Groq supported models](https://console.groq.com/docs/models)
-- [HF Inference API (serverless)](https://huggingface.co/models?inference=warm&sort=trending)
 - [Novita AI supported models](https://huggingface.co/api/partners/novita/models)
 
 ❗**Important note:** To be compatible, the third-party API must adhere to the "standard" shape API we expect on HF model pages for each pipeline task type.

From 88b8e469e8a14075fdd122a217440db78f82800f Mon Sep 17 00:00:00 2001
From: viktor <viktorhu@foxmail.com>
Date: Tue, 20 May 2025 11:41:42 +0800
Subject: [PATCH 3/4] Refactor code based on the PR comments from @SBrandeis

---
 packages/inference/src/providers/novita.ts | 38 +++++++++++++---------
 1 file changed, 22 insertions(+), 16 deletions(-)

diff --git a/packages/inference/src/providers/novita.ts b/packages/inference/src/providers/novita.ts
index a014aec55d..6b8022a767 100644
--- a/packages/inference/src/providers/novita.ts
+++ b/packages/inference/src/providers/novita.ts
@@ -58,14 +58,11 @@ export class NovitaTextToVideoTask extends TaskProviderHelper implements TextToV
 	}
 
 	override makeRoute(params: UrlParams): string {
-		if (params.authMethod !== "provider-key") {
-			return `/v3/async/${params.model}?_subdomain=queue`;
-		}
 		return `/v3/async/${params.model}`;
 	}
 
 	override preparePayload(params: BodyParams): Record<string, unknown> {
-		const { num_inference_steps, ...restParameters } = params.args.parameters as Record<string, unknown>;
+		const { num_inference_steps, ...restParameters } = (params.args.parameters as Record<string, unknown>) ?? {};
 		return {
 			...omit(params.args, ["inputs", "parameters"]),
 			...restParameters,
@@ -91,11 +88,10 @@ export class NovitaTextToVideoTask extends TaskProviderHelper implements TextToV
 		const baseUrl = `${parsedUrl.protocol}//${parsedUrl.host}${
 			parsedUrl.host === "router.huggingface.co" ? "/novita" : ""
 		}`;
-		const queryParams = parsedUrl.search;
-		const resultUrl = `${baseUrl}/v3/async/task-result${queryParams ? queryParams + '&' : '?'}task_id=${taskId}`;
+		const resultUrl = `${baseUrl}/v3/async/task-result?task_id=${taskId}`;
 
 		let status = '';
-		let taskResult = undefined;
+		let taskResult: unknown;
 
 		while (status !== 'TASK_STATUS_SUCCEED' && status !== 'TASK_STATUS_FAILED') {
 			await delay(500);
@@ -105,7 +101,19 @@ export class NovitaTextToVideoTask extends TaskProviderHelper implements TextToV
 			}
 			try {
 				taskResult = await resultResponse.json();
-				status = taskResult.task.status;
+				if (
+					taskResult &&
+					typeof taskResult === "object" &&
+					"task" in taskResult &&
+					taskResult.task &&
+					typeof taskResult.task === "object" &&
+					"status" in taskResult.task &&
+					typeof taskResult.task.status === "string"
+				) {
+					status = taskResult.task.status;
+				} else {
+					throw new InferenceOutputError("Failed to get task status");
+				}
 			} catch (error) {
 				throw new InferenceOutputError("Failed to parse task result");
 			}
@@ -115,8 +123,7 @@ export class NovitaTextToVideoTask extends TaskProviderHelper implements TextToV
 			throw new InferenceOutputError("Task failed");
 		}
 
-		// There will be at most one video in the response.
-		const isValidOutput =
+		if (
 			typeof taskResult === "object" &&
 			!!taskResult &&
 			"videos" in taskResult &&
@@ -126,13 +133,12 @@ export class NovitaTextToVideoTask extends TaskProviderHelper implements TextToV
 			taskResult.videos.length > 0 &&
 			"video_url" in taskResult.videos[0] &&
 			typeof taskResult.videos[0].video_url === "string" &&
-			isUrl(taskResult.videos[0].video_url);
-
-		if (!isValidOutput) {
+			isUrl(taskResult.videos[0].video_url)
+		) {
+			const urlResponse = await fetch(taskResult.videos[0].video_url);
+			return await urlResponse.blob();
+		} else {
 			throw new InferenceOutputError("Expected { videos: [{ video_url: string }] }");
 		}
-
-		const urlResponse = await fetch(taskResult.videos[0].video_url);
-		return await urlResponse.blob();
 	}
 }

From 362ebe738ab53ddf61469a39723988f20b0ee837 Mon Sep 17 00:00:00 2001
From: viktor <viktorhu@foxmail.com>
Date: Thu, 22 May 2025 20:42:07 +0800
Subject: [PATCH 4/4] Refactor: type preparePayload args; use double quotes.

---
 packages/inference/src/providers/novita.ts | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/packages/inference/src/providers/novita.ts b/packages/inference/src/providers/novita.ts
index 6b8022a767..17bd4bb17c 100644
--- a/packages/inference/src/providers/novita.ts
+++ b/packages/inference/src/providers/novita.ts
@@ -16,6 +16,7 @@
  */
 import { InferenceOutputError } from "../lib/InferenceOutputError";
 import { isUrl } from "../lib/isUrl";
+import type { TextToVideoArgs } from "../tasks/cv/textToVideo";
 import type { BodyParams, UrlParams } from "../types";
 import { delay } from "../utils/delay";
 import { omit } from "../utils/omit";
@@ -61,8 +62,8 @@ export class NovitaTextToVideoTask extends TaskProviderHelper implements TextToV
 		return `/v3/async/${params.model}`;
 	}
 
-	override preparePayload(params: BodyParams): Record<string, unknown> {
-		const { num_inference_steps, ...restParameters } = (params.args.parameters as Record<string, unknown>) ?? {};
+	override preparePayload(params: BodyParams<TextToVideoArgs>): Record<string, unknown> {
+		const { num_inference_steps, ...restParameters } = params.args.parameters ?? {};
 		return {
 			...omit(params.args, ["inputs", "parameters"]),
 			...restParameters,
@@ -90,10 +91,10 @@ export class NovitaTextToVideoTask extends TaskProviderHelper implements TextToV
 		}`;
 		const resultUrl = `${baseUrl}/v3/async/task-result?task_id=${taskId}`;
 
-		let status = '';
+		let status = "";
 		let taskResult: unknown;
 
-		while (status !== 'TASK_STATUS_SUCCEED' && status !== 'TASK_STATUS_FAILED') {
+		while (status !== "TASK_STATUS_SUCCEED" && status !== "TASK_STATUS_FAILED") {
 			await delay(500);
 			const resultResponse = await fetch(resultUrl, { headers });
 			if (!resultResponse.ok) {
@@ -119,7 +120,7 @@ export class NovitaTextToVideoTask extends TaskProviderHelper implements TextToV
 			}
 		}
 
-		if (status === 'TASK_STATUS_FAILED') {
+		if (status === "TASK_STATUS_FAILED") {
 			throw new InferenceOutputError("Task failed");
 		}