Skip to content

Commit a506ef8

Browse files
Basic splitter
1 parent 372ea94 commit a506ef8

File tree

7 files changed

+396
-26
lines changed

7 files changed

+396
-26
lines changed

package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
"@ibm-cloud/watsonx-ai": "^1.7.0",
1212
"@langchain/community": "^1.0.0",
1313
"@langchain/core": "^1.0.2",
14+
"@langchain/textsplitters": "^1.0.0",
1415
"@oclif/core": "^4",
1516
"@oclif/plugin-help": "^6",
1617
"@oclif/plugin-plugins": "^5"

pnpm-lock.yaml

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/core/providers/watsonx.ts

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,49 +1,49 @@
1-
import { ChatWatsonx } from '@langchain/community/chat_models/ibm';
1+
import {ChatWatsonx} from '@langchain/community/chat_models/ibm'
22

33
/**
 * Configuration for the watsonx.ai client.
 *
 * Every field may be omitted when calling `createClient`; the connection
 * settings then fall back to environment variables and the tuning settings
 * fall back to built-in defaults.
 */
export interface WatsonxConfig {
  /** IBM Cloud API key used for IAM authentication (env fallback: `WATSONX_AI_APIKEY`). */
  apiKey: string
  /** Upper bound on generated tokens; `createClient` uses 2000 when unset. */
  maxNewTokens?: number
  /** Model identifier; `createClient` uses `ibm/granite-4-h-small` when unset. */
  model?: string
  /** watsonx.ai project id (env fallback: `WATSONX_AI_PROJECT_ID`). */
  projectId: string
  /** Base URL of the watsonx.ai service (env fallback: `WATSONX_AI_SERVICE_URL`). */
  serviceUrl: string
  /** Sampling temperature; `createClient` uses 0.3 when unset. */
  temperature?: number
}
1414

1515
/**
 * Creates and returns a configured watsonx.ai chat model with IAM authentication.
 *
 * Connection settings are read from `config` first and fall back to the
 * `WATSONX_AI_SERVICE_URL`, `WATSONX_AI_PROJECT_ID` and `WATSONX_AI_APIKEY`
 * environment variables.
 *
 * @param config - Optional overrides for connection and generation settings.
 * @returns A `ChatWatsonx` instance configured with the resolved settings.
 * @throws Error when the service URL, project id or API key is missing from
 * both `config` and the environment.
 */
export function createClient(config?: Partial<WatsonxConfig>): ChatWatsonx {
  // `||` rather than `??` on purpose: an empty string is as unusable as a
  // missing value, so it must fall through to the env var and validation.
  const serviceUrl = config?.serviceUrl || process.env.WATSONX_AI_SERVICE_URL
  const projectId = config?.projectId || process.env.WATSONX_AI_PROJECT_ID
  const apiKey = config?.apiKey || process.env.WATSONX_AI_APIKEY

  // Validate required configuration
  if (!serviceUrl) {
    throw new Error('WATSONX_AI_SERVICE_URL is required')
  }

  if (!projectId) {
    throw new Error('WATSONX_AI_PROJECT_ID is required')
  }

  if (!apiKey) {
    throw new Error('WATSONX_AI_APIKEY is required')
  }

  // Create and return the chat model with IAM authentication
  return new ChatWatsonx({
    maxRetries: 3,
    maxTokens: config?.maxNewTokens || 2000,
    model: config?.model || 'ibm/granite-4-h-small',
    projectId,
    serviceUrl,
    // `??` so an explicit temperature of 0 is honoured instead of being
    // replaced by the default.
    temperature: config?.temperature ?? 0.3,
    version: '2024-05-31',
    watsonxAIApikey: apiKey,
    watsonxAIAuthType: 'iam',
  })
}

src/core/splitters/markdown.ts

Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
import {MarkdownTextSplitter} from '@langchain/textsplitters'
2+
3+
import type {Chunk} from '../types.js'
4+
5+
/**
 * Hybrid two-pass markdown splitter for translation purposes.
 *
 * Pass 1: structural split by markdown elements (frontmatter, code blocks,
 *         headers or `::page` directives).
 * Pass 2: size-based recursive split, using LangChain, of translatable chunks
 *         that exceed `chunkSize`.
 *
 * Guarantees this class aims for:
 * - Frontmatter and fenced code blocks come out as `shouldTranslate: false`.
 * - No whitespace-only chunk is ever emitted.
 * - Concatenating the returned chunks reproduces the input, except that blank
 *   lines at the very start of the document (before any content) are dropped.
 */
export class MarkdownSplitter {
  /**
   * Opening or closing code fence, optionally followed by a language
   * identifier and trailing whitespace (e.g. "```", "```ruby", "```sh-session").
   */
  private static readonly CODE_FENCE = /^```[\w-]*\s*$/

  private readonly chunkSize: number
  private readonly recursiveSplitter: MarkdownTextSplitter

  /**
   * @param chunkSize - Maximum length (in characters) of a translatable chunk
   * before it is handed to the size-based second pass.
   */
  constructor(chunkSize = 12_000) {
    this.chunkSize = chunkSize

    // Initialize LangChain's MarkdownTextSplitter for Pass 2
    this.recursiveSplitter = new MarkdownTextSplitter({
      chunkOverlap: 0, // No overlap needed for translation (avoids duplicates)
      chunkSize,
      keepSeparator: true, // Preserve markdown formatting (headers, etc.)
    })
  }

  /**
   * Default split method - picks the splitting strategy automatically.
   * If any line starts with `::page`, splits on those directives; otherwise
   * splits on headers (# ## ### etc.).
   */
  async split(markdown: string): Promise<Chunk[]> {
    return /^::page/m.test(markdown) ? this.splitByPageDirective(markdown) : this.splitByHeaders(markdown)
  }

  /**
   * Splits markdown by headers (# ## ### etc.)
   */
  async splitByHeaders(markdown: string): Promise<Chunk[]> {
    return this.secondPassSplit(this.splitInternal(markdown, /^#+\s+/m))
  }

  /**
   * Splits markdown by custom ::page directives
   */
  async splitByPageDirective(markdown: string): Promise<Chunk[]> {
    return this.secondPassSplit(this.splitInternal(markdown, /^::page/m))
  }

  /**
   * Finds the index of the closing --- for frontmatter.
   * Only meaningful when `hasFrontmatter(lines)` is true; without a closing
   * marker the result is 0.
   */
  private findFrontmatterEnd(lines: string[]): number {
    return lines.slice(1).findIndex((line) => line.trim() === '---') + 1
  }

  /**
   * Stores `chunk` in `chunks`, never emitting a whitespace-only chunk.
   *
   * Whitespace-only text (typically the blank line after a code block or the
   * frontmatter) is appended to the previous chunk so it is not lost when the
   * chunks are concatenated again. With no previous chunk it is dropped.
   */
  private flush(chunks: Chunk[], chunk: Chunk): void {
    if (chunk.content.trim() !== '') {
      chunks.push(chunk)
      return
    }

    const previous = chunks[chunks.length - 1]
    if (previous !== undefined) {
      previous.content += chunk.content
    }
  }

  /**
   * Checks if the markdown starts with YAML frontmatter (--- ... ---)
   */
  private hasFrontmatter(lines: string[]): boolean {
    return lines[0]?.trim() === '---' && lines.slice(1).some((line) => line.trim() === '---')
  }

  /**
   * Re-attaches the text the size-based splitter left out between pieces.
   *
   * NOTE(review): LangChain's splitter appears to trim every piece it
   * returns, so naive concatenation would glue paragraphs and headings
   * together - confirm against the library. Each piece is located in
   * `original` in order and extended up to the start of the next one (the
   * first also takes any leading text, the last any trailing text). If any
   * piece cannot be located verbatim, the pieces are returned untouched
   * rather than risking duplicated or reordered text.
   */
  private restoreGaps(original: string, pieces: string[]): string[] {
    const starts: number[] = []
    let cursor = 0

    for (const piece of pieces) {
      const at = original.indexOf(piece, cursor)
      if (at === -1) return pieces

      starts.push(at)
      cursor = at + piece.length
    }

    return starts.map((start, index) => original.slice(index === 0 ? 0 : start, starts[index + 1] ?? original.length))
  }

  /**
   * Pass 2: Size-based recursive split using LangChain.
   *
   * Translatable chunks longer than `chunkSize` are broken down with
   * MarkdownTextSplitter, which tries progressively finer separators
   * (headers, code blocks, horizontal rules, paragraph breaks, line breaks,
   * spaces, characters). Sub-chunks inherit the parent's `shouldTranslate`.
   *
   * Non-translatable chunks (frontmatter, code blocks) are never split: they
   * are copied verbatim rather than sent to the model, so their size does not
   * matter and splitting could only damage them.
   */
  private async secondPassSplit(chunks: Chunk[]): Promise<Chunk[]> {
    const perChunk = await Promise.all(
      chunks.map(async (chunk): Promise<Chunk[]> => {
        if (!chunk.shouldTranslate || chunk.content.length <= this.chunkSize) {
          return [chunk]
        }

        const pieces = await this.recursiveSplitter.splitText(chunk.content)

        // Never lose content if the splitter unexpectedly returns nothing
        if (pieces.length === 0) {
          return [chunk]
        }

        return this.restoreGaps(chunk.content, pieces).map((content) => ({
          content,
          shouldTranslate: chunk.shouldTranslate,
        }))
      }),
    )

    return perChunk.flat()
  }

  /**
   * Pass 1: Structural split using a small state machine.
   *
   * Handles:
   * - Frontmatter detection (YAML between --- markers) -> non-translatable
   * - Code block detection (``` fences) -> non-translatable, fences included
   * - Split pattern matching (headers or ::page directives) outside code
   *   blocks -> starts a new translatable chunk
   *
   * An unclosed code block runs to the end of the document.
   */
  private splitInternal(markdown: string, splitPattern: RegExp): Chunk[] {
    const chunks: Chunk[] = []
    const lines = this.toLines(markdown)
    let firstBodyLine = 0

    // Handle frontmatter if present
    if (this.hasFrontmatter(lines)) {
      const endIndex = this.findFrontmatterEnd(lines)
      chunks.push({
        content: lines.slice(0, endIndex + 1).join(''),
        shouldTranslate: false,
      })
      firstBodyLine = endIndex + 1
    }

    let current: Chunk = {content: '', shouldTranslate: true}
    let inCodeBlock = false

    for (const line of lines.slice(firstBodyLine)) {
      if (MarkdownSplitter.CODE_FENCE.test(line)) {
        if (inCodeBlock) {
          // Closing fence belongs to the code chunk
          current.content += line
          this.flush(chunks, current)
          current = {content: '', shouldTranslate: true}
        } else {
          // Opening fence: close the running text chunk, start a code chunk
          this.flush(chunks, current)
          current = {content: line, shouldTranslate: false}
        }

        inCodeBlock = !inCodeBlock
        continue
      }

      if (!inCodeBlock && splitPattern.test(line)) {
        // Split marker outside a code block starts a new translatable chunk
        this.flush(chunks, current)
        current = {content: line, shouldTranslate: true}
      } else {
        current.content += line
      }
    }

    this.flush(chunks, current)

    return chunks
  }

  /**
   * Breaks text into lines, each keeping its own "\n" terminator. Unlike
   * `split('\n')` followed by re-appending "\n", this never invents a
   * newline the input did not contain.
   */
  private toLines(markdown: string): string[] {
    return markdown.match(/[^\n]*\n|[^\n]+$/g) ?? []
  }
}

src/core/translators/markdown.ts

Lines changed: 45 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,33 +4,50 @@ import {BaseMessage, HumanMessage, SystemMessage} from '@langchain/core/messages
44

55
import type {TranslationOptions} from '../types.js'
66

7+
import {MarkdownSplitter} from '../splitters/markdown.js'
8+
79
/**
810
* Translates markdown content using a chat model
911
*/
1012
export class MarkdownTranslator {
11-
constructor(private chatModel: BaseChatModel) {}
13+
private splitter: MarkdownSplitter
14+
15+
constructor(private chatModel: BaseChatModel) {
16+
this.splitter = new MarkdownSplitter()
17+
}
1218

1319
/**
1420
* Translates markdown content from source language to target language
1521
*/
1622
async translate(options: TranslationOptions): Promise<string> {
17-
const response = await this.chatModel.invoke(this.buildMessages(options))
18-
const translatedContent = response.content as string
23+
const chunks = await this.splitter.split(options.content)
24+
let response = ''
25+
26+
for (const chunk of chunks) {
27+
if (chunk.shouldTranslate) {
28+
// eslint-disable-next-line no-await-in-loop
29+
const translatedChunk = await this.chatModel.invoke(this.buildMessages({...options, content: chunk.content}))
30+
response += translatedChunk.content as string
31+
} else {
32+
response += chunk.content
33+
}
34+
}
1935

20-
return translatedContent
36+
return response
2137
}
2238

2339
/**
2440
* Streams translated markdown content from source language to target language
2541
* @yields {string} Chunks of translated content
2642
*/
2743
async *translateStream(options: TranslationOptions): AsyncGenerator<string> {
28-
const stream = await this.chatModel.stream(this.buildMessages(options))
44+
const chunks = await this.splitter.split(options.content)
2945

30-
for await (const chunk of stream) {
31-
const {content} = chunk
32-
if (typeof content === 'string') {
33-
yield content
46+
for (const chunk of chunks) {
47+
if (chunk.shouldTranslate) {
48+
yield* this.streamChunk({...options, content: chunk.content})
49+
} else {
50+
yield chunk.content
3451
}
3552
}
3653
}
@@ -46,7 +63,8 @@ export class MarkdownTranslator {
4663
* Creates the system prompt for translation
4764
*/
4865
private createSystemPrompt(sourceLanguage: string, targetLanguage: string): string {
49-
return `You are a helpful assistant that accurately translates markdown document snippets from ${sourceLanguage} to ${targetLanguage} while preserving markdown syntax, formatting, and custom directives.
66+
return `
67+
You are a helpful assistant that accurately translates markdown document snippets from ${sourceLanguage} to ${targetLanguage} while preserving markdown syntax, formatting, and custom directives.
5068
You always preserve the structure and formatting exactly as it is.
5169
You do not add, alter or modify the text you receive in any way.
5270
@@ -66,6 +84,22 @@ Reminder:
6684
- Be consistent with technical terms. If an equivalent technical term is not available in ${targetLanguage}, always use the original term.
6785
6886
*IMPORTANT*
69-
Translate without any additional information or comments.`
87+
Translate without any additional information or comments.
88+
`
89+
}
90+
91+
/**
92+
* Streams a single chunk through the chat model
93+
* @yields {string} Chunks of translated content from the model
94+
*/
95+
private async *streamChunk(options: TranslationOptions): AsyncGenerator<string> {
96+
const stream = await this.chatModel.stream(this.buildMessages(options))
97+
98+
for await (const chunk of stream) {
99+
const {content} = chunk
100+
if (typeof content === 'string') {
101+
yield content
102+
}
103+
}
70104
}
71105
}

src/core/types.ts

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,12 @@
1+
/**
 * Represents a chunk of split source text.
 * Each chunk can indicate whether it should be translated or not via shouldTranslate.
 */
export interface Chunk {
  /** The chunk's text, including its original line terminators. */
  content: string
  /** `false` for text that must be copied verbatim (e.g. frontmatter, code blocks). */
  shouldTranslate: boolean
}
9+
110
/**
211
* Translation configuration options
312
*/

0 commit comments

Comments
 (0)