|
| 1 | +import {MarkdownTextSplitter} from '@langchain/textsplitters' |
| 2 | + |
| 3 | +import type {Chunk} from '../types.js' |
| 4 | + |
/**
 * Hybrid two-pass markdown splitter for translation purposes.
 *
 * Pass 1: structural split by markdown elements (frontmatter, fenced code blocks,
 *         headers or `::page` directives).
 * Pass 2: size-based recursive split (LangChain `MarkdownTextSplitter`) for any
 *         chunk whose content is longer than `chunkSize` characters.
 *
 * This approach:
 * - Respects markdown structure and translation requirements
 *   (frontmatter and fenced code blocks are flagged `shouldTranslate: false`).
 * - Prevents massive chunks from overwhelming translation APIs.
 * - Keeps the whitespace around every chunk in `leadingWhitespace` /
 *   `trailingWhitespace`, so concatenating
 *   `leadingWhitespace + content + trailingWhitespace` of consecutive chunks
 *   reproduces the text they were cut from.
 */
export class MarkdownSplitter {
  private readonly chunkSize: number
  private readonly recursiveSplitter: MarkdownTextSplitter

  /**
   * @param chunkSize Maximum content length (in characters) a chunk may have
   * before the second pass breaks it down further.
   */
  constructor(chunkSize = 12_000) {
    this.chunkSize = chunkSize

    this.recursiveSplitter = new MarkdownTextSplitter({
      chunkOverlap: 0,
      chunkSize,
      keepSeparator: true,
    })
  }

  /**
   * Default split method - automatically picks the splitting strategy.
   * If any line of the input starts with `::page`, splits by page directives;
   * otherwise splits by headers (# ## ### etc.).
   *
   * @param markdown Raw markdown document.
   * @returns The chunks in document order.
   */
  async split(markdown: string): Promise<Chunk[]> {
    if (/^::page/m.test(markdown)) {
      return this.splitByPageDirective(markdown)
    }

    return this.splitByHeaders(markdown)
  }

  /**
   * Splits markdown at every header line (# ## ### etc.), then applies the size pass.
   *
   * @param markdown Raw markdown document.
   * @returns The chunks in document order.
   */
  async splitByHeaders(markdown: string): Promise<Chunk[]> {
    const firstPassChunks = this.splitInternal(markdown, /^#+\s+/m)

    return this.secondPassSplit(firstPassChunks)
  }

  /**
   * Splits markdown at every line starting with `::page`, then applies the size pass.
   *
   * @param markdown Raw markdown document.
   * @returns The chunks in document order.
   */
  async splitByPageDirective(markdown: string): Promise<Chunk[]> {
    const firstPassChunks = this.splitInternal(markdown, /^::page/m)

    return this.secondPassSplit(firstPassChunks)
  }

  /**
   * Builds a chunk whose surrounding whitespace is stored separately from its content.
   *
   * @param content Raw text of the chunk, including any surrounding whitespace.
   * @param shouldTranslate Whether the chunk is translatable text.
   */
  private createChunk(content: string, shouldTranslate: boolean): Chunk {
    const leadingWhitespace = /^\s+/.exec(content)?.[0] ?? ''
    const trailingWhitespace = /\s+$/.exec(content)?.[0] ?? ''

    return {
      content: content.trim(),
      leadingWhitespace,
      shouldTranslate,
      trailingWhitespace,
    }
  }

  /**
   * Finds the index of the line holding the closing `---` of the frontmatter.
   *
   * @param lines Document lines (line terminators may still be attached).
   * @returns Index of the first `---` line after line 0, or -1 when there is none.
   */
  private findFrontmatterEnd(lines: string[]): number {
    return lines.findIndex((line, index) => index > 0 && line.trim() === '---')
  }

  /**
   * Checks whether the document starts with YAML frontmatter (`---` ... `---`).
   *
   * @param lines Document lines (line terminators may still be attached).
   */
  private hasFrontmatter(lines: string[]): boolean {
    if (lines.length === 0) return false

    return (lines[0] ?? '').trim() === '---' && this.findFrontmatterEnd(lines) > 0
  }

  /**
   * Recognises a fenced-code-block delimiter line.
   *
   * A delimiter is a run of three or more backticks or tildes starting at
   * column 0, optionally followed by an info string (`js`, `c++`,
   * `ts {1,3}`, `js title="a.js"`, ...). An info string after a backtick run
   * must not itself contain a backtick, otherwise the line is inline code,
   * not a fence.
   *
   * @param line A single line, with or without its line terminator.
   * @returns The fence run and trimmed info string, or `undefined` when the
   * line is not a fence delimiter.
   */
  private parseFence(line: string): undefined | {info: string; run: string} {
    const match = /^(`{3,}|~{3,})([^\n]*)\n?$/.exec(line)
    if (!match) return undefined

    const run = match[1] ?? ''
    const info = (match[2] ?? '').trim()

    if (run.startsWith('`') && info.includes('`')) return undefined

    return {info, run}
  }

  /**
   * Turns the texts produced by the recursive splitter back into chunks
   * without losing the whitespace that separated them in the parent chunk.
   *
   * Each piece is located in the parent content (left to right, no overlap);
   * the text between the end of the previous piece and the end of this piece
   * becomes this chunk, so whatever the splitter stripped between pieces ends
   * up as `leadingWhitespace`. If a piece cannot be located, the pieces are
   * used as returned by the splitter.
   *
   * @param chunk The oversized first-pass chunk.
   * @param pieces Texts returned by the recursive splitter for `chunk.content`.
   */
  private rebuildSplitChunks(chunk: Chunk, pieces: string[]): Chunk[] {
    const source = chunk.content
    const lastIndex = pieces.length - 1

    // End offset of every piece inside `source`; undefined when any piece is not found.
    let ends: number[] | undefined = []
    let cursor = 0
    for (const piece of pieces) {
      const start = source.indexOf(piece, cursor)
      if (start === -1) {
        ends = undefined
        break
      }

      cursor = start + piece.length
      ends.push(cursor)
    }

    return pieces.map((piece, index) => {
      let text = piece

      if (ends) {
        const from = index === 0 ? 0 : ends[index - 1]
        const to = index === lastIndex ? source.length : ends[index]
        text = source.slice(from, to)
      }

      // The parent's own surrounding whitespace belongs to the outermost pieces.
      if (index === 0) text = (chunk.leadingWhitespace ?? '') + text
      if (index === lastIndex) text += chunk.trailingWhitespace ?? ''

      return this.createChunk(text, chunk.shouldTranslate)
    })
  }

  /**
   * Pass 2: size-based recursive split using LangChain.
   *
   * Chunks whose content fits in `chunkSize` are returned untouched. Larger
   * ones are handed to `MarkdownTextSplitter`; the resulting pieces inherit the
   * parent's `shouldTranslate` flag and keep the whitespace that separated
   * them in the parent.
   *
   * LangChain's MarkdownTextSplitter tries separators in this order:
   * 1. H2-H6 headers (## ### #### etc.)
   * 2. Code blocks with spacing
   * 3. Horizontal rules (---, ***, ___)
   * 4. Paragraph breaks (\n\n)
   * 5. Line breaks (\n)
   * 6. Spaces
   * 7. Characters (last resort)
   *
   * @param chunks First-pass chunks in document order.
   * @returns Chunks in document order, each oversized chunk replaced by its pieces.
   */
  private async secondPassSplit(chunks: Chunk[]): Promise<Chunk[]> {
    const splitResults = await Promise.all(
      chunks.map(async (chunk) => {
        if (chunk.content.length <= this.chunkSize) {
          return [chunk]
        }

        const pieces = await this.recursiveSplitter.splitText(chunk.content)

        // Never drop content: if the splitter returns nothing, keep the chunk as is.
        if (pieces.length === 0) {
          return [chunk]
        }

        return this.rebuildSplitChunks(chunk, pieces)
      }),
    )

    return splitResults.flat()
  }

  /**
   * Pass 1: structural split using a line-based state machine.
   *
   * Handles:
   * - Frontmatter (YAML between `---` markers at the very top): one
   *   non-translatable chunk.
   * - Fenced code blocks (backtick or tilde fences, any info string): one
   *   non-translatable chunk per block. A block is closed only by a fence of
   *   the same character, at least as long as the opener and without an info
   *   string; an unclosed block runs to the end of the document.
   * - Split pattern (headers or `::page` directives): each matching line
   *   outside a code block starts a new translatable chunk.
   *
   * Whitespace-only text in front of a chunk becomes its leading whitespace;
   * whitespace-only text left over at the very end of the document is dropped.
   *
   * @param markdown Raw markdown document.
   * @param splitPattern Pattern tested against each line to detect chunk
   * starts; it must not carry the `g` or `y` flag, because it is reused
   * across lines.
   */
  private splitInternal(markdown: string, splitPattern: RegExp): Chunk[] {
    const chunks: Chunk[] = []

    // Every line keeps its own "\n"; a last line without terminator is kept as is.
    const lines: string[] = markdown.match(/[^\n]*\n|[^\n]+$/g) ?? []

    let firstBodyLine = 0

    if (this.hasFrontmatter(lines)) {
      const endIndex = this.findFrontmatterEnd(lines)
      chunks.push(this.createChunk(lines.slice(0, endIndex + 1).join(''), false))
      firstBodyLine = endIndex + 1
    }

    let buffer = ''
    let shouldTranslate = true
    // Fence run ("```", "~~~~", ...) of the code block currently open, if any.
    let openFence: string | undefined

    // Starts a new chunk at `line`; a whitespace-only buffer is kept so it
    // becomes the leading whitespace of the new chunk.
    const startChunk = (line: string): void => {
      if (buffer.trim()) {
        chunks.push(this.createChunk(buffer, shouldTranslate))
        buffer = line
      } else {
        buffer += line
      }
    }

    for (const line of lines.slice(firstBodyLine)) {
      const fence = this.parseFence(line)

      if (openFence !== undefined) {
        buffer += line

        const closes =
          fence !== undefined &&
          fence.info === '' &&
          fence.run[0] === openFence[0] &&
          fence.run.length >= openFence.length

        if (closes) {
          chunks.push(this.createChunk(buffer, false))
          buffer = ''
          shouldTranslate = true
          openFence = undefined
        }

        continue
      }

      if (fence) {
        startChunk(line)
        shouldTranslate = false
        openFence = fence.run
        continue
      }

      if (splitPattern.test(line)) {
        startChunk(line)
        shouldTranslate = true
        continue
      }

      buffer += line
    }

    if (buffer.trim()) {
      chunks.push(this.createChunk(buffer, shouldTranslate))
    }

    return chunks
  }
}
0 commit comments