@@ -20,11 +20,10 @@ export class MarkdownSplitter {
constructor(chunkSize = 12_000) {
  this.chunkSize = chunkSize

  // Options for the recursive splitter that secondPassSplit applies to
  // chunks longer than chunkSize: sub-chunks do not overlap, they share the
  // same size limit, and separators are kept in the output.
  const splitterOptions = {
    chunkOverlap: 0,
    chunkSize,
    keepSeparator: true,
  }

  this.recursiveSplitter = new MarkdownTextSplitter(splitterOptions)
}
3029
@@ -59,6 +58,22 @@ export class MarkdownSplitter {
5958 return this . secondPassSplit ( firstPassChunks )
6059 }
6160
/**
 * Builds a chunk whose surrounding whitespace is stored separately from its
 * trimmed content.
 *
 * `leadingWhitespace + content + trailingWhitespace` reproduces the input
 * exactly, including for empty and whitespace-only input.
 *
 * @param content - Raw chunk text, possibly padded with whitespace
 * @param shouldTranslate - Flag copied onto the returned chunk unchanged
 * @returns The chunk with trimmed `content` plus the whitespace that was stripped
 */
private createChunk(content: string, shouldTranslate: boolean): Chunk {
  const leadingWhitespace = /^\s+/.exec(content)?.[0] ?? ''

  // Look for trailing whitespace only in what follows the leading run.
  // Otherwise a whitespace-only input would match both patterns in full and
  // the same characters would be stored twice.
  const afterLeading = content.slice(leadingWhitespace.length)
  const trailingWhitespace = /\s+$/.exec(afterLeading)?.[0] ?? ''

  return {
    content: content.trim(),
    leadingWhitespace,
    shouldTranslate,
    trailingWhitespace,
  }
}
76+
6277 /**
6378 * Finds the index of the closing --- for frontmatter
6479 */
@@ -93,26 +108,28 @@ export class MarkdownSplitter {
93108 private async secondPassSplit ( chunks : Chunk [ ] ) : Promise < Chunk [ ] > {
94109 const finalChunks : Chunk [ ] = [ ]
95110
96- // Process all chunks, splitting large ones
97111 const splitPromises = chunks . map ( async ( chunk ) => {
98- // If chunk is within size limit, keep as-is
99112 if ( chunk . content . length <= this . chunkSize ) {
100113 return [ chunk ]
101114 }
102115
103- // Chunk is too large - use LangChain's recursive splitter
104116 const splitTexts = await this . recursiveSplitter . splitText ( chunk . content )
105117
106- // Preserve the shouldTranslate flag on all sub-chunks
107- return splitTexts . map ( ( text ) => ( {
108- content : text ,
109- shouldTranslate : chunk . shouldTranslate ,
110- } ) )
118+ return splitTexts . map ( ( text , textIndex ) => {
119+ const isFirst = textIndex === 0
120+ const isLast = textIndex === splitTexts . length - 1
121+
122+ return this . createChunk (
123+ ( isFirst ? chunk . leadingWhitespace || '' : '' ) +
124+ text +
125+ ( isLast ? chunk . trailingWhitespace || '' : '' ) ,
126+ chunk . shouldTranslate ,
127+ )
128+ } )
111129 } )
112130
113131 const splitResults = await Promise . all ( splitPromises )
114132
115- // Flatten the results
116133 for ( const result of splitResults ) {
117134 finalChunks . push ( ...result )
118135 }
@@ -131,68 +148,76 @@ export class MarkdownSplitter {
131148 */
private splitInternal(markdown: string, splitPattern: RegExp): Chunk[] {
  const chunks: Chunk[] = []

  // Rebuild the input as lines that keep their '\n' terminator. Everything
  // after the last '\n' is the unterminated tail: it is '' when the input
  // ends with a newline (and is then dropped), otherwise it becomes the last
  // line without a newline appended.
  const rawLines = markdown.split('\n')
  const unterminatedTail = rawLines.pop() ?? ''
  const lines = rawLines.map((rawLine) => rawLine + '\n')

  if (!markdown.endsWith('\n')) {
    lines.push(unterminatedTail)
  }

  // Frontmatter, when present, becomes its own non-translatable chunk.
  let firstBodyLine = 0

  if (this.hasFrontmatter(lines)) {
    const endIndex = this.findFrontmatterEnd(lines)
    const frontmatterContent = lines.slice(0, endIndex + 1).join('')
    chunks.push(this.createChunk(frontmatterContent, false))
    firstBodyLine = endIndex + 1
  }

  // Code fence line: three backticks, an optional language identifier made of
  // word characters and hyphens, then optional trailing whitespace.
  const codeFence = /^```[\w-]*\s*$/

  let pending = ''
  let pendingShouldTranslate = true
  let inCodeBlock = false

  // Emits the accumulated text as a chunk and resets the accumulator.
  const flushPending = (): void => {
    chunks.push(this.createChunk(pending, pendingShouldTranslate))
    pending = ''
    pendingShouldTranslate = true
  }

  // Starts a new chunk at `line`. Accumulated text that is only whitespace is
  // not emitted on its own; it stays in front of `line` so it ends up as the
  // new chunk's leading whitespace.
  const beginChunk = (line: string, shouldTranslate: boolean): void => {
    if (pending.trim()) {
      flushPending()
    }

    pending += line
    pendingShouldTranslate = shouldTranslate
  }

  for (const line of lines.slice(firstBodyLine)) {
    if (codeFence.test(line)) {
      if (inCodeBlock) {
        // Closing fence: it belongs to the code block, which ends here.
        pending += line
        flushPending()
        inCodeBlock = false
      } else {
        // Opening fence: the code block is kept out of translation.
        beginChunk(line, false)
        inCodeBlock = true
      }

      continue
    }

    // A pattern with the `g` or `y` flag keeps `lastIndex` between calls to
    // test(); reset it so every line is matched from its start.
    splitPattern.lastIndex = 0

    if (!inCodeBlock && splitPattern.test(line)) {
      beginChunk(line, true)
    } else {
      pending += line
    }
  }

  const hasLeftoverContent = pending.trim() !== ''
  const leftoverWhitespace = hasLeftoverContent ? '' : pending

  if (hasLeftoverContent) {
    flushPending()
  }

  const result = chunks.filter((chunk) => chunk.content !== '')

  // Whitespace-only text after the final chunk (e.g. blank lines following a
  // closing fence at the end of the input) would otherwise be dropped; keep
  // it as part of the last chunk's trailing whitespace.
  const last = result[result.length - 1]

  if (leftoverWhitespace !== '' && last) {
    result[result.length - 1] = {
      ...last,
      trailingWhitespace: (last.trailingWhitespace ?? '') + leftoverWhitespace,
    }
  }

  return result
}
198223}
0 commit comments