diff --git a/package.json b/package.json index 0b36ffb..99e559d 100644 --- a/package.json +++ b/package.json @@ -57,10 +57,12 @@ "@oclif/plugin-help", "@oclif/plugin-plugins" ], - "topicSeparator": " ", "topics": { - "hello": { - "description": "Say hello to the world and others" + "profiles": { + "description": "Manage LLM provider profiles" + }, + "translate": { + "description": "Translate content between languages" } } }, @@ -77,4 +79,4 @@ }, "types": "dist/index.d.ts", "packageManager": "pnpm@10.20.0" -} +} \ No newline at end of file diff --git a/src/commands/markdown.ts b/src/commands/markdown.ts deleted file mode 100644 index 7fafba2..0000000 --- a/src/commands/markdown.ts +++ /dev/null @@ -1,78 +0,0 @@ -import {Args, Command, Flags} from '@oclif/core' - -import {MarkdownTranslator} from '../core/translators/markdown.js' -import {createProviderFromProfile} from '../lib/profile/factory.js' -import {loadProfile} from '../lib/profile/storage.js' - -export default class Markdown extends Command { - static args = { - input: Args.string({ - description: 'The markdown text you want to translate', - required: false, - }), - } - static description = 'Translate markdown' - static examples = [ - '<%= config.bin %> <%= command.id %> --profile default-openai --from EN --to ES "Hello"', - '<%= config.bin %> <%= command.id %> --profile default-openai --from EN --to ES --stream "Hello"', - 'cat doc.md | <%= config.bin %> <%= command.id %> --profile default-openai --from EN --to ES', - 'echo "# Hello" | <%= config.bin %> <%= command.id %> --profile default-openai --from EN --to ES', - ] - static flags = { - from: Flags.string({ - description: 'Source language', - required: true, - }), - profile: Flags.string({ - description: 'Profile to use for translation', - required: true, - }), - stream: Flags.boolean({ - default: false, - description: 'Stream the translation output', - }), - to: Flags.string({ - description: 'Target language', - required: true, - }), - } - - async run(): Promise { - const {args, flags} = await this.parse(Markdown) - - let input: string - - if (args.input) { - input = args.input - } else { - const chunks: Buffer[] = [] - for await (const chunk of process.stdin) { - chunks.push(chunk) - } - - input = Buffer.concat(chunks).toString('utf8') - } - - const profile = loadProfile(flags.profile) - const llm = createProviderFromProfile(profile) - const translator = new MarkdownTranslator(llm) - - if (flags.stream) { - for await (const chunk of translator.translateStream({ - content: input, - sourceLanguage: flags.from, - targetLanguage: flags.to, - })) { - process.stdout.write(chunk) - } - } else { - const result = await translator.translate({ - content: input, - sourceLanguage: flags.from, - targetLanguage: flags.to, - }) - - process.stdout.write(result) - } - } -} diff --git a/src/commands/translate/base.ts b/src/commands/translate/base.ts new file mode 100644 index 0000000..e25ca90 --- /dev/null +++ b/src/commands/translate/base.ts @@ -0,0 +1,90 @@ +import {BaseChatModel} from '@langchain/core/language_models/chat_models' +import {Args, Command, Flags, Interfaces} from '@oclif/core' + +import {Translator} from '../../core/translators/translator.js' +import {createProviderFromProfile} from '../../lib/profile/factory.js' +import {loadProfile} from '../../lib/profile/storage.js' + +type TranslateFlags = Interfaces.InferredFlags< + T['flags'] & typeof BaseTranslateCommand.baseFlags +> +type TranslateArgs = Interfaces.InferredArgs + +export abstract class BaseTranslateCommand extends Command { + static args = { + input: Args.string({ + description: 'The text you want to translate', + required: false, + }), + } + static baseFlags = { + from: Flags.string({ + description: 'Source language', + required: true, + }), + profile: Flags.string({ + description: 'Profile to use for translation', + required: true, + }), + stream: Flags.boolean({ + default: false, + description: 'Stream the translation output', + }), + to: Flags.string({ + description: 'Target language', + required: true, + }), + } + protected args!: TranslateArgs + protected flags!: TranslateFlags + + abstract createTranslator(llm: BaseChatModel): Translator + + public async init(): Promise { + await super.init() + const {args, flags} = await this.parse({ + args: this.ctor.args, + baseFlags: (super.ctor as typeof BaseTranslateCommand).baseFlags, + flags: this.ctor.flags, + strict: this.ctor.strict, + }) + this.flags = flags as TranslateFlags + this.args = args as TranslateArgs + } + + async run(): Promise { + let input: string + + if (this.args.input) { + input = this.args.input + } else { + const chunks: Buffer[] = [] + for await (const chunk of process.stdin) { + chunks.push(chunk) + } + + input = Buffer.concat(chunks).toString('utf8') + } + + const llm = createProviderFromProfile(loadProfile(this.flags.profile)) + const translator = this.createTranslator(llm) + + if (this.flags.stream) { + for await (const chunk of translator.translateStream({ + content: input, + sourceLanguage: this.flags.from, + targetLanguage: this.flags.to, + })) { + process.stdout.write(chunk) + } + } else { + const result = await translator.translate({ + content: input, + sourceLanguage: this.flags.from, + targetLanguage: this.flags.to, + }) + + process.stdout.write(result) + } + } +} diff --git a/src/commands/translate/markdown.ts b/src/commands/translate/markdown.ts new file mode 100644 index 0000000..6cff231 --- /dev/null +++ b/src/commands/translate/markdown.ts @@ -0,0 +1,26 @@ +import {BaseChatModel} from '@langchain/core/language_models/chat_models' + +import {MARKDOWN_SYSTEM_PROMPT} from '../../core/prompts/markdown.js' +import {MarkdownSplitter} from '../../core/splitters/markdown.js' +import {Translator} from '../../core/translators/translator.js' +import {BaseTranslateCommand} from './base.js' + +export default class TranslateMarkdown extends BaseTranslateCommand { + static args = { + ...BaseTranslateCommand.args, + } + static description = 'Translate markdown' + static examples = [ + '<%= config.bin %> <%= command.id %> --profile default-openai --from EN --to ES "Hello"', + '<%= config.bin %> <%= command.id %> --profile default-openai --from EN --to ES --stream "Hello"', + 'cat doc.md | <%= config.bin %> <%= command.id %> --profile default-openai --from EN --to ES', + 'echo "# Hello" | <%= config.bin %> <%= command.id %> --profile default-openai --from EN --to ES', + ] + static flags = { + ...BaseTranslateCommand.baseFlags, + } + + createTranslator(llm: BaseChatModel): Translator { + return new Translator(llm, new MarkdownSplitter(), MARKDOWN_SYSTEM_PROMPT) + } +} diff --git a/src/core/prompts/markdown.ts b/src/core/prompts/markdown.ts new file mode 100644 index 0000000..90b1329 --- /dev/null +++ b/src/core/prompts/markdown.ts @@ -0,0 +1,27 @@ +/** + * System prompt template for markdown translation. + * Uses {{sourceLanguage}} and {{targetLanguage}} as placeholders. + */ +export const MARKDOWN_SYSTEM_PROMPT = ` +You are a helpful assistant that accurately translates markdown document snippets from {{sourceLanguage}} to {{targetLanguage}} while preserving markdown syntax, formatting, and custom directives. +You always preserve the structure and formatting exactly as it is. +You do not add, alter or modify the text you receive in any way. + +Reminder: +- Translate only the text, preserving the structure and formatting. +- NEVER under any circumstances translate any words found inside backticks Eg. \`Text\`. +- NEVER translate custom directive like ::startApplication{...} or ::openFile{...}. +- DO translate titles inside the ::page{title=""} custom directive. +- NEVER translate keywords that appear after colons, such as \`:fa-lightbulb-o:\`. +- NEVER translate the sections "Author", "Other Contributors", and "Change Logs". +- NEVER translate any URLs. +- NEVER translate HTML tags like \`
\` and \`\`. +- Translate idiomatically, adapting expressions to sound natural in {{targetLanguage}}. +- Avoid overly literal translations; prioritize clarity and fluency in {{targetLanguage}} over word-for-word accuracy. +- Use concise and clear language that would sound natural in everyday speech or written {{targetLanguage}}. +- When technical {{sourceLanguage}} terms lack a common {{targetLanguage}} equivalent, use well-known {{targetLanguage}} alternatives or rephrase for clarity. +- Be consistent with technical terms. If an equivalent technical term is not available in {{targetLanguage}}, always use the original term. + +*IMPORTANT* +Translate without any additional information or comments. +` diff --git a/src/core/splitters/markdown.ts b/src/core/splitters/markdown.ts index d71184c..4fb73e2 100644 --- a/src/core/splitters/markdown.ts +++ b/src/core/splitters/markdown.ts @@ -1,6 +1,6 @@ import {MarkdownTextSplitter} from '@langchain/textsplitters' -import type {Chunk} from '../types.js' +import type {BaseSplitter, Chunk} from '../types.js' /** * Hybrid two-pass markdown splitter for translation purposes. @@ -13,7 +13,7 @@ import type {Chunk} from '../types.js' * - Prevents massive chunks from overwhelming translation APIs * - Uses LangChain's intelligent splitting for size management (tries headers > paragraphs > lines) */ -export class MarkdownSplitter { +export class MarkdownSplitter implements BaseSplitter { private readonly chunkSize: number private recursiveSplitter: MarkdownTextSplitter @@ -30,7 +30,7 @@ export class MarkdownSplitter { /** * Appends a chunk to an accumulator string, preserving whitespace */ - reconstructChunk(accumulator: string, chunk: Chunk): string { + reconstruct(accumulator: string, chunk: Chunk): string { return accumulator + (chunk.leadingWhitespace || '') + chunk.content + (chunk.trailingWhitespace || '') } diff --git a/src/core/translators/markdown.ts b/src/core/translators/markdown.ts deleted file mode 100644 index adf1a91..0000000 --- a/src/core/translators/markdown.ts +++ /dev/null @@ -1,116 +0,0 @@ -import type {BaseChatModel} from '@langchain/core/language_models/chat_models' - -import {BaseMessage, HumanMessage, SystemMessage} from '@langchain/core/messages' - -import type {TranslationOptions} from '../types.js' - -import {MarkdownSplitter} from '../splitters/markdown.js' - -/** - * Translates markdown content using a chat model - */ -export class MarkdownTranslator { - private splitter: MarkdownSplitter - - constructor(private chatModel: BaseChatModel) { - this.splitter = new MarkdownSplitter() - } - - /** - * Translates markdown content from source language to target language - */ - async translate(options: TranslationOptions): Promise { - const chunks = await this.splitter.split(options.content) - let response = '' - - for (const chunk of chunks) { - if (chunk.shouldTranslate) { - // eslint-disable-next-line no-await-in-loop - const translatedChunk = await this.chatModel.invoke(this.buildMessages({...options, content: chunk.content})) - - response = this.splitter.reconstructChunk(response, {...chunk, content: translatedChunk.content as string}) - } else { - response = this.splitter.reconstructChunk(response, chunk) - } - } - - return response - } - - /** - * Streams translated markdown content from source language to target language - * @yields {string} Chunks of translated content - */ - async *translateStream(options: TranslationOptions): AsyncGenerator { - const chunks = await this.splitter.split(options.content) - - for (const chunk of chunks) { - yield chunk.leadingWhitespace || '' - - if (chunk.shouldTranslate) { - // eslint-disable-next-line no-await-in-loop - for await (const streamedChunk of this.streamChunk({ - ...options, - content: chunk.content, - })) { - yield streamedChunk - } - } else { - yield chunk.content - } - - yield chunk.trailingWhitespace || '' - } - } - - private buildMessages({content, sourceLanguage, targetLanguage}: TranslationOptions): BaseMessage[] { - const systemPrompt = this.createSystemPrompt(sourceLanguage, targetLanguage) - const messages = [new SystemMessage(systemPrompt), new HumanMessage(content)] - - return messages - } - - /** - * Creates the system prompt for translation - */ - private createSystemPrompt(sourceLanguage: string, targetLanguage: string): string { - return ` -You are a helpful assistant that accurately translates markdown document snippets from ${sourceLanguage} to ${targetLanguage} while preserving markdown syntax, formatting, and custom directives. -You always preserve the structure and formatting exactly as it is. -You do not add, alter or modify the text you receive in any way. - -Reminder: -- Translate only the text, preserving the structure and formatting. -- NEVER under any circumstances translate any words found inside backticks Eg. \`Text\`. -- NEVER translate custom directive like ::startApplication{...} or ::openFile{...}. -- DO translate titles inside the ::page{title=""} custom directive. -- NEVER translate keywords that appear after colons, such as \`:fa-lightbulb-o:\`. -- NEVER translate the sections "Author", "Other Contributors", and "Change Logs". -- NEVER translate any URLs. -- NEVER translate HTML tags like \`
\` and \`\`. -- Translate idiomatically, adapting expressions to sound natural in ${targetLanguage}. -- Avoid overly literal translations; prioritize clarity and fluency in ${targetLanguage} over word-for-word accuracy. -- Use concise and clear language that would sound natural in everyday speech or written ${targetLanguage}. -- When technical ${sourceLanguage} terms lack a common ${targetLanguage} equivalent, use well-known ${targetLanguage} alternatives or rephrase for clarity. -- Be consistent with technical terms. If an equivalent technical term is not available in ${targetLanguage}, always use the original term. - -*IMPORTANT* -Translate without any additional information or comments. -` - } - - /** - * Streams a single chunk through the chat model - * @yields {string} Chunks of translated content from the model - */ - private async *streamChunk(options: TranslationOptions): AsyncGenerator { - const stream = await this.chatModel.stream(this.buildMessages(options)) - - for await (const chunk of stream) { - const {content} = chunk - if (typeof content === 'string') { - yield content - } - } - } -} diff --git a/src/core/translators/translator.ts b/src/core/translators/translator.ts new file mode 100644 index 0000000..e6fb0ca --- /dev/null +++ b/src/core/translators/translator.ts @@ -0,0 +1,90 @@ +import type {BaseChatModel} from '@langchain/core/language_models/chat_models' + +import {BaseMessage, HumanMessage, SystemMessage} from '@langchain/core/messages' + +import type {BaseSplitter, TranslationOptions} from '../types.js' + +/** + * Generic translator that orchestrates content splitting, translation, and reconstruction. + * Works with any splitter and prompt via dependency injection. + */ +export class Translator { + constructor( + private chatModel: BaseChatModel, + private splitter: BaseSplitter, + private systemPromptTemplate: string, + ) {} + + /** + * Translates content from source language to target language + */ + async translate(options: TranslationOptions): Promise { + const chunks = await this.splitter.split(options.content) + let response = '' + + for (const chunk of chunks) { + if (chunk.shouldTranslate) { + // eslint-disable-next-line no-await-in-loop + const translatedChunk = await this.chatModel.invoke(this.buildMessages({...options, content: chunk.content})) + + response = this.splitter.reconstruct(response, {...chunk, content: translatedChunk.content as string}) + } else { + response = this.splitter.reconstruct(response, chunk) + } + } + + return response + } + + /** + * Streams translated content from source language to target language + * @yields {string} Chunks of translated content + */ + async *translateStream(options: TranslationOptions): AsyncGenerator { + const chunks = await this.splitter.split(options.content) + + for (const chunk of chunks) { + yield chunk.leadingWhitespace || '' + + if (chunk.shouldTranslate) { + // eslint-disable-next-line no-await-in-loop + for await (const streamedChunk of this.streamChunk({ + ...options, + content: chunk.content, + })) { + yield streamedChunk + } + } else { + yield chunk.content + } + + yield chunk.trailingWhitespace || '' + } + } + + private buildMessages({content, sourceLanguage, targetLanguage}: TranslationOptions): BaseMessage[] { + const systemPrompt = this.interpolatePrompt(this.systemPromptTemplate, sourceLanguage, targetLanguage) + const messages = [new SystemMessage(systemPrompt), new HumanMessage(content)] + + return messages + } + + private interpolatePrompt(template: string, sourceLanguage: string, targetLanguage: string): string { + return template.replaceAll('{{sourceLanguage}}', sourceLanguage).replaceAll('{{targetLanguage}}', targetLanguage) + } + + /** + * Streams a single chunk through the chat model + * @yields {string} Chunks of translated content from the model + */ + private async *streamChunk(options: TranslationOptions): AsyncGenerator { + const stream = await this.chatModel.stream(this.buildMessages(options)) + + for await (const chunk of stream) { + const {content} = chunk + if (typeof content === 'string') { + yield content + } + } + } +} diff --git a/src/core/types.ts b/src/core/types.ts index ae45ffb..539c3c3 100644 --- a/src/core/types.ts +++ b/src/core/types.ts @@ -24,6 +24,22 @@ export interface Chunk { trailingWhitespace?: string } +/** + * Base interface for content splitters. + * All splitter implementations must provide these methods. + */ +export interface BaseSplitter { + /** + * Reconstructs content by appending a chunk to an accumulator with proper whitespace + */ + reconstruct(accumulator: string, chunk: Chunk): string + + /** + * Splits content into chunks + */ + split(content: string): Promise +} + /** * Translation configuration options */ diff --git a/test/commands/markdown.test.ts b/test/commands/translate/markdown.test.ts similarity index 77% rename from test/commands/markdown.test.ts rename to test/commands/translate/markdown.test.ts index 295c048..2749800 100644 --- a/test/commands/markdown.test.ts +++ b/test/commands/translate/markdown.test.ts @@ -1,9 +1,9 @@ import {afterAll, beforeAll, describe, expect, it} from '@jest/globals' import {runCommand} from '@oclif/test' -import {setupTestProfile, teardownTestProfile} from '../helpers/profile-setup.js' +import {setupTestProfile, teardownTestProfile} from '../../helpers/profile-setup.js' -describe('markdown command', () => { +describe('translate:markdown command', () => { beforeAll(() => { setupTestProfile() }) @@ -15,13 +15,13 @@ describe('markdown command', () => { describe('basic usage', () => { it('translates markdown', async () => { await expect( - runCommand(['markdown', '--profile', 'test-profile', '--from', 'EN', '--to', 'ES', 'Hello']), + runCommand(['translate:markdown', '--profile', 'test-profile', '--from', 'EN', '--to', 'ES', 'Hello']), ).resolves.not.toThrow() }) it('streams translated markdown', async () => { await expect( - runCommand(['markdown', '--profile', 'test-profile', '--from', 'EN', '--to', 'ES', '--stream', 'Hello']), + runCommand(['translate:markdown', '--profile', 'test-profile', '--from', 'EN', '--to', 'ES', '--stream', 'Hello']), ).resolves.not.toThrow() }) }) @@ -31,7 +31,7 @@ describe('markdown command', () => { setupTestProfile('test-profile', ['Hóla']) const {stdout} = await runCommand([ - 'markdown', + 'translate:markdown', '--profile', 'test-profile', '--from', @@ -47,7 +47,7 @@ describe('markdown command', () => { setupTestProfile('test-profile', ['Hóla']) const {stdout} = await runCommand([ - 'markdown', + 'translate:markdown', '--profile', 'test-profile', '--from', @@ -64,7 +64,7 @@ describe('markdown command', () => { setupTestProfile('test-profile', ['# Page 1\nbonjour', '# Page 2\nmonde']) const {stdout} = await runCommand([ - 'markdown', + 'translate:markdown', '--profile', 'test-profile', '--from', diff --git a/test/core/translators/markdown.test.ts b/test/core/translators/markdown.test.ts index 377c91a..cdc18d0 100644 --- a/test/core/translators/markdown.test.ts +++ b/test/core/translators/markdown.test.ts @@ -1,18 +1,20 @@ import {describe, expect, it} from '@jest/globals' import {FakeListChatModel} from '@langchain/core/utils/testing' -import {MarkdownTranslator} from '../../../src/core/translators/markdown.js' +import {MARKDOWN_SYSTEM_PROMPT} from '../../../src/core/prompts/markdown.js' +import {MarkdownSplitter} from '../../../src/core/splitters/markdown.js' +import {Translator} from '../../../src/core/translators/translator.js' describe('MarkdownTranslator', () => { let fakeChatModel: FakeListChatModel - let translator: MarkdownTranslator + let translator: Translator describe('translate', () => { it('translates markdown content and returns string', async () => { fakeChatModel = new FakeListChatModel({ responses: ['Traducido'], }) - translator = new MarkdownTranslator(fakeChatModel) + translator = new Translator(fakeChatModel, new MarkdownSplitter(), MARKDOWN_SYSTEM_PROMPT) const result = await translator.translate({ content: '# Hello World', @@ -29,7 +31,7 @@ describe('MarkdownTranslator', () => { fakeChatModel = new FakeListChatModel({ responses: ['Hello World'], }) - translator = new MarkdownTranslator(fakeChatModel) + translator = new Translator(fakeChatModel, new MarkdownSplitter(), MARKDOWN_SYSTEM_PROMPT) const chunks: string[] = [] for await (const chunk of translator.translateStream({