Skip to content

Commit d2d741a

Browse files
feat: add basic markdown splitter (#7)
* Basic splitter * Whitespace * Lint * lint
1 parent c66041a commit d2d741a

File tree

7 files changed

+589
-29
lines changed

7 files changed

+589
-29
lines changed

package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
"@ibm-cloud/watsonx-ai": "^1.7.0",
1212
"@langchain/community": "^1.0.0",
1313
"@langchain/core": "^1.0.2",
14+
"@langchain/textsplitters": "^1.0.0",
1415
"@oclif/core": "^4",
1516
"@oclif/plugin-help": "^6",
1617
"@oclif/plugin-plugins": "^5"

pnpm-lock.yaml

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/core/providers/watsonx.ts

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,49 +1,49 @@
1-
import { ChatWatsonx } from '@langchain/community/chat_models/ibm';
1+
import {ChatWatsonx} from '@langchain/community/chat_models/ibm'
22

33
/**
 * Configuration for the watsonx.ai client.
 *
 * Every field may be omitted when calling `createClient`; the three connection
 * settings then fall back to environment variables and the tuning settings to
 * built-in defaults.
 */
export interface WatsonxConfig {
  /** IBM Cloud API key used for IAM authentication (fallback: `WATSONX_AI_APIKEY`). */
  apiKey: string
  /** Upper bound on generated tokens per request (default: 2000). */
  maxNewTokens?: number
  /** Model identifier (default: `ibm/granite-4-h-small`). */
  model?: string
  /** watsonx.ai project ID (fallback: `WATSONX_AI_PROJECT_ID`). */
  projectId: string
  /** Base URL of the watsonx.ai service (fallback: `WATSONX_AI_SERVICE_URL`). */
  serviceUrl: string
  /** Sampling temperature (default: 0.3). */
  temperature?: number
}
1414

1515
/**
 * Creates and returns a configured watsonx.ai chat model with IAM authentication.
 *
 * Connection settings are read from `config` first and fall back to the
 * `WATSONX_AI_SERVICE_URL`, `WATSONX_AI_PROJECT_ID` and `WATSONX_AI_APIKEY`
 * environment variables.
 *
 * @param config - Optional overrides for connection and generation settings.
 * @returns A `ChatWatsonx` instance configured with 3 retries and IAM auth.
 * @throws Error when the service URL, project ID or API key cannot be resolved.
 */
export function createClient(config?: Partial<WatsonxConfig>): ChatWatsonx {
  // `||` is intentional for the string settings: an empty string counts as
  // "not provided" and falls through to the environment variable.
  const serviceUrl = config?.serviceUrl || process.env.WATSONX_AI_SERVICE_URL
  const projectId = config?.projectId || process.env.WATSONX_AI_PROJECT_ID
  const apiKey = config?.apiKey || process.env.WATSONX_AI_APIKEY

  // Validate required configuration
  if (!serviceUrl) {
    throw new Error('WATSONX_AI_SERVICE_URL is required')
  }

  if (!projectId) {
    throw new Error('WATSONX_AI_PROJECT_ID is required')
  }

  if (!apiKey) {
    throw new Error('WATSONX_AI_APIKEY is required')
  }

  // Create and return the chat model with IAM authentication
  return new ChatWatsonx({
    maxRetries: 3,
    // A token budget of 0 is not meaningful, so it falls back to the default.
    maxTokens: config?.maxNewTokens || 2000,
    model: config?.model || 'ibm/granite-4-h-small',
    projectId,
    serviceUrl,
    // `??` rather than `||`: a temperature of 0 is a valid, explicit request
    // and must not be replaced by the default.
    temperature: config?.temperature ?? 0.3,
    version: '2024-05-31',
    watsonxAIApikey: apiKey,
    watsonxAIAuthType: 'iam',
  })
}

src/core/splitters/markdown.ts

Lines changed: 221 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,221 @@
1+
import {MarkdownTextSplitter} from '@langchain/textsplitters'
2+
3+
import type {Chunk} from '../types.js'
4+
5+
/**
 * Hybrid two-pass markdown splitter for translation purposes.
 *
 * Pass 1: Structural split by markdown elements (frontmatter, code blocks, headers/page directives)
 * Pass 2: Size-based recursive split using LangChain for chunks exceeding chunkSize
 *
 * This approach:
 * - Respects markdown structure and translation requirements (code blocks, frontmatter = non-translatable)
 * - Prevents massive chunks from overwhelming translation APIs
 * - Uses LangChain's splitter for size management
 *
 * Each chunk stores its surrounding whitespace separately from its content so that
 * concatenating `leadingWhitespace + content + trailingWhitespace` over all chunks
 * rebuilds the document.
 */
export class MarkdownSplitter {
  /** A bare run of 3+ backticks; a closing fence may not carry an info string. */
  private static readonly CLOSING_FENCE = /^(`{3,})\s*$/
  /** 3+ backticks followed by an optional info string that contains no backtick. */
  private static readonly OPENING_FENCE = /^(`{3,})[^`]*$/

  private readonly chunkSize: number
  private readonly recursiveSplitter: MarkdownTextSplitter

  /**
   * @param chunkSize - Maximum content length (in characters) before a chunk is
   *   handed to the size-based second pass.
   */
  constructor(chunkSize = 12_000) {
    this.chunkSize = chunkSize

    this.recursiveSplitter = new MarkdownTextSplitter({
      chunkOverlap: 0,
      chunkSize,
      keepSeparator: true,
    })
  }

  /**
   * Default split method - automatically detects and uses the appropriate splitting strategy.
   * If the content contains a line starting with `::page`, splits by those directives.
   * Otherwise, splits by headers (# ## ### etc.)
   */
  async split(markdown: string): Promise<Chunk[]> {
    if (/^::page/m.test(markdown)) {
      return this.splitByPageDirective(markdown)
    }

    return this.splitByHeaders(markdown)
  }

  /**
   * Splits markdown by headers (# ## ### etc.)
   */
  async splitByHeaders(markdown: string): Promise<Chunk[]> {
    return this.secondPassSplit(this.splitInternal(markdown, /^#+\s+/m))
  }

  /**
   * Splits markdown by custom ::page directives
   */
  async splitByPageDirective(markdown: string): Promise<Chunk[]> {
    return this.secondPassSplit(this.splitInternal(markdown, /^::page/m))
  }

  /**
   * Creates a chunk with leading/trailing whitespace extracted and stored separately
   * from the trimmed content.
   */
  private createChunk(content: string, shouldTranslate: boolean): Chunk {
    return {
      content: content.trim(),
      leadingWhitespace: /^\s+/.exec(content)?.[0] ?? '',
      shouldTranslate,
      trailingWhitespace: /\s+$/.exec(content)?.[0] ?? '',
    }
  }

  /**
   * Returns the length of the backtick run captured by `pattern` on `line`,
   * or 0 when the line is not a fence of that kind.
   */
  private fenceLength(line: string, pattern: RegExp): number {
    return pattern.exec(line)?.[1]?.length ?? 0
  }

  /**
   * Finds the index of the closing `---` of YAML frontmatter.
   *
   * @returns The line index of the closing marker, or -1 when the document does
   *   not start with `---` or the frontmatter is never closed.
   */
  private findFrontmatterEnd(lines: readonly string[]): number {
    if (lines[0]?.trim() !== '---') return -1

    return lines.findIndex((line, index) => index > 0 && line.trim() === '---')
  }

  /**
   * Pass 2: Size-based recursive split using LangChain
   *
   * Chunks whose content exceeds chunkSize are broken down with
   * MarkdownTextSplitter; all other chunks pass through untouched. Order is preserved.
   */
  private async secondPassSplit(chunks: Chunk[]): Promise<Chunk[]> {
    const perChunk = await Promise.all(chunks.map((chunk) => this.splitOversizedChunk(chunk)))

    return perChunk.flat()
  }

  /**
   * Pass 1: Structural split using a line-based state machine
   *
   * Handles:
   * - Frontmatter detection (YAML between --- markers) -> non-translatable chunk
   * - Code block detection (backtick fences, with or without info string) -> non-translatable chunk
   * - Split pattern matching (headers or ::page directives) -> starts a new translatable chunk
   */
  private splitInternal(markdown: string, splitPattern: RegExp): Chunk[] {
    const chunks: Chunk[] = []
    const lines = this.toLines(markdown)

    let firstBodyLine = 0
    const frontmatterEnd = this.findFrontmatterEnd(lines)

    if (frontmatterEnd !== -1) {
      chunks.push(this.createChunk(lines.slice(0, frontmatterEnd + 1).join(''), false))
      firstBodyLine = frontmatterEnd + 1
    }

    let buffer = ''
    let bufferShouldTranslate = true
    // Length of the backtick run that opened the current code block; 0 = outside a code block.
    let openFenceLength = 0

    for (const line of lines.slice(firstBodyLine)) {
      if (openFenceLength > 0) {
        buffer += line

        // Only a bare fence at least as long as the opener closes the block, so
        // shorter fences nested inside a longer one stay part of the code chunk.
        if (this.fenceLength(line, MarkdownSplitter.CLOSING_FENCE) >= openFenceLength) {
          chunks.push(this.createChunk(buffer, bufferShouldTranslate))
          buffer = ''
          bufferShouldTranslate = true
          openFenceLength = 0
        }

        continue
      }

      const openingLength = this.fenceLength(line, MarkdownSplitter.OPENING_FENCE)
      const startsSection = openingLength === 0 && splitPattern.test(line)

      if (openingLength === 0 && !startsSection) {
        buffer += line
        continue
      }

      // A fence or section marker begins a new chunk. A whitespace-only buffer is
      // kept so it becomes the leading whitespace of the new chunk.
      if (buffer.trim()) {
        chunks.push(this.createChunk(buffer, bufferShouldTranslate))
        buffer = line
      } else {
        buffer += line
      }

      bufferShouldTranslate = startsSection
      openFenceLength = openingLength
    }

    if (buffer.trim()) {
      chunks.push(this.createChunk(buffer, bufferShouldTranslate))
    }

    return chunks.filter((chunk) => chunk.content !== '')
  }

  /**
   * Splits one oversized chunk with the recursive splitter while keeping the
   * shouldTranslate flag and the whitespace needed to rebuild the original text.
   *
   * The recursive splitter can return pieces without the whitespace that separated
   * them, so each piece is located in the original content and the whitespace gap
   * before it is stored as that piece's leading whitespace.
   */
  private async splitOversizedChunk(chunk: Chunk): Promise<Chunk[]> {
    if (chunk.content.length <= this.chunkSize) {
      return [chunk]
    }

    const pieces = await this.recursiveSplitter.splitText(chunk.content)

    // Never drop content if the splitter produced nothing.
    if (pieces.length === 0) {
      return [chunk]
    }

    const result: Chunk[] = []
    let cursor = 0

    for (const [index, piece] of pieces.entries()) {
      let gap = ''
      const foundAt = chunk.content.indexOf(piece, cursor)

      // Accept the match only when everything skipped is whitespace; otherwise
      // the piece was altered by the splitter and no gap can be attributed to it.
      if (foundAt !== -1 && chunk.content.slice(cursor, foundAt).trim() === '') {
        gap = chunk.content.slice(cursor, foundAt)
        cursor = foundAt + piece.length
      }

      const leading = index === 0 ? (chunk.leadingWhitespace ?? '') : ''
      const trailing = index === pieces.length - 1 ? (chunk.trailingWhitespace ?? '') : ''

      result.push(this.createChunk(leading + gap + piece + trailing, chunk.shouldTranslate))
    }

    return result
  }

  /**
   * Breaks text into lines that keep their terminating `\n`; a final line without
   * a newline is returned as-is. Joining the result yields the input.
   */
  private toLines(markdown: string): string[] {
    return markdown.match(/[^\n]*\n|[^\n]+$/g) ?? []
  }
}

0 commit comments

Comments
 (0)