Skip to content

Commit 9b5d9a4

Browse files
Whitespace
1 parent a506ef8 commit 9b5d9a4

File tree

4 files changed

+242
-48
lines changed

4 files changed

+242
-48
lines changed

src/core/splitters/markdown.ts

Lines changed: 66 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,10 @@ export class MarkdownSplitter {
2020
constructor(chunkSize = 12_000) {
2121
this.chunkSize = chunkSize
2222

23-
// Initialize LangChain's MarkdownTextSplitter for Pass 2
2423
this.recursiveSplitter = new MarkdownTextSplitter({
25-
chunkOverlap: 0, // No overlap needed for translation (avoids duplicates)
24+
chunkOverlap: 0,
2625
chunkSize,
27-
keepSeparator: true, // Preserve markdown formatting (headers, etc.)
26+
keepSeparator: true,
2827
})
2928
}
3029

@@ -59,6 +58,22 @@ export class MarkdownSplitter {
5958
return this.secondPassSplit(firstPassChunks)
6059
}
6160

61+
/**
62+
* Creates a chunk with whitespace extracted and stored separately
63+
*/
64+
private createChunk(content: string, shouldTranslate: boolean): Chunk {
65+
const leadingWhitespace = content.match(/^\s+/)?.[0] || ''
66+
const trailingWhitespace = content.match(/\s+$/)?.[0] || ''
67+
const trimmedContent = content.trim()
68+
69+
return {
70+
content: trimmedContent,
71+
leadingWhitespace,
72+
shouldTranslate,
73+
trailingWhitespace,
74+
}
75+
}
76+
6277
/**
6378
* Finds the index of the closing --- for frontmatter
6479
*/
@@ -93,26 +108,28 @@ export class MarkdownSplitter {
93108
private async secondPassSplit(chunks: Chunk[]): Promise<Chunk[]> {
94109
const finalChunks: Chunk[] = []
95110

96-
// Process all chunks, splitting large ones
97111
const splitPromises = chunks.map(async (chunk) => {
98-
// If chunk is within size limit, keep as-is
99112
if (chunk.content.length <= this.chunkSize) {
100113
return [chunk]
101114
}
102115

103-
// Chunk is too large - use LangChain's recursive splitter
104116
const splitTexts = await this.recursiveSplitter.splitText(chunk.content)
105117

106-
// Preserve the shouldTranslate flag on all sub-chunks
107-
return splitTexts.map((text) => ({
108-
content: text,
109-
shouldTranslate: chunk.shouldTranslate,
110-
}))
118+
return splitTexts.map((text, textIndex) => {
119+
const isFirst = textIndex === 0
120+
const isLast = textIndex === splitTexts.length - 1
121+
122+
return this.createChunk(
123+
(isFirst ? chunk.leadingWhitespace || '' : '') +
124+
text +
125+
(isLast ? chunk.trailingWhitespace || '' : ''),
126+
chunk.shouldTranslate,
127+
)
128+
})
111129
})
112130

113131
const splitResults = await Promise.all(splitPromises)
114132

115-
// Flatten the results
116133
for (const result of splitResults) {
117134
finalChunks.push(...result)
118135
}
@@ -131,68 +148,76 @@ export class MarkdownSplitter {
131148
*/
132149
private splitInternal(markdown: string, splitPattern: RegExp): Chunk[] {
133150
const chunks: Chunk[] = []
134-
const lines = markdown.split('\n').map((line) => line + '\n')
151+
const splitLines = markdown.split('\n')
152+
153+
const hasTrailingNewline = markdown.endsWith('\n')
154+
155+
if (hasTrailingNewline && splitLines.at(-1) === '') {
156+
splitLines.pop()
157+
}
158+
159+
const lines = splitLines.map(line => line + '\n')
160+
161+
if (!hasTrailingNewline && lines.length > 0) {
162+
lines[lines.length - 1] = lines.at(-1)!.slice(0, -1)
163+
}
164+
135165
let currentPosition = 0
136166

137-
// Handle frontmatter if present
138167
if (this.hasFrontmatter(lines)) {
139168
const endIndex = this.findFrontmatterEnd(lines)
140169
const frontmatterContent = lines.slice(0, endIndex + 1).join('')
141-
chunks.push({
142-
content: frontmatterContent,
143-
shouldTranslate: false,
144-
})
170+
chunks.push(this.createChunk(frontmatterContent, false))
145171
currentPosition = endIndex + 1
146172
}
147173

148-
let currentChunk: Chunk = {content: '', shouldTranslate: true}
174+
let currentChunkContent = ''
175+
let currentChunkShouldTranslate = true
149176
let inCodeBlock = false
150177

151178
for (let i = currentPosition; i < lines.length; i++) {
152179
const line = lines[i]
153180

154-
// Check for code fence (supports language identifiers and trailing whitespace)
155-
// Matches: ```, ```ruby, ```sh-session, ```python , etc.
156181
if (/^```[\w-]*\s*$/.test(line)) {
157182
if (inCodeBlock) {
158-
// End of code block
159-
currentChunk.content += line
160-
chunks.push(currentChunk)
161-
currentChunk = {content: '', shouldTranslate: true}
183+
currentChunkContent += line
184+
chunks.push(this.createChunk(currentChunkContent, currentChunkShouldTranslate))
185+
currentChunkContent = ''
186+
currentChunkShouldTranslate = true
162187
inCodeBlock = false
163188
} else {
164-
// Start of code block
165-
if (currentChunk.content.trim()) {
166-
chunks.push(currentChunk)
189+
if (currentChunkContent.trim()) {
190+
chunks.push(this.createChunk(currentChunkContent, currentChunkShouldTranslate))
191+
currentChunkContent = line
192+
} else {
193+
currentChunkContent += line
167194
}
168195

169-
currentChunk = {content: line, shouldTranslate: false}
196+
currentChunkShouldTranslate = false
170197
inCodeBlock = true
171198
}
172199

173200
continue
174201
}
175202

176-
// Handle line based on current state and patterns
177203
if (splitPattern.test(line) && !inCodeBlock) {
178-
// This line matches the split pattern and we're not in a code block
179-
if (currentChunk.content.trim()) {
180-
chunks.push(currentChunk)
204+
if (currentChunkContent.trim()) {
205+
chunks.push(this.createChunk(currentChunkContent, currentChunkShouldTranslate))
206+
currentChunkContent = line
207+
} else {
208+
currentChunkContent += line
181209
}
182210

183-
currentChunk = {content: line, shouldTranslate: true}
211+
currentChunkShouldTranslate = true
184212
} else {
185-
// Regular line - append to current chunk
186-
currentChunk.content += line
213+
currentChunkContent += line
187214
}
188215
}
189216

190-
// Add final chunk if not empty
191-
if (currentChunk.content.trim()) {
192-
chunks.push(currentChunk)
217+
if (currentChunkContent.trim()) {
218+
chunks.push(this.createChunk(currentChunkContent, currentChunkShouldTranslate))
193219
}
194220

195-
// Filter out empty chunks
196-
return chunks.filter((chunk) => chunk.content.trim() !== '')
221+
return chunks.filter((chunk) => chunk.content !== '')
197222
}
198223
}

src/core/translators/markdown.ts

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,20 @@ export class MarkdownTranslator {
2424
let response = ''
2525

2626
for (const chunk of chunks) {
27+
response += chunk.leadingWhitespace || ''
28+
2729
if (chunk.shouldTranslate) {
2830
// eslint-disable-next-line no-await-in-loop
29-
const translatedChunk = await this.chatModel.invoke(this.buildMessages({...options, content: chunk.content}))
31+
const translatedChunk = await this.chatModel.invoke(
32+
this.buildMessages({...options, content: chunk.content}),
33+
)
34+
3035
response += translatedChunk.content as string
3136
} else {
3237
response += chunk.content
3338
}
39+
40+
response += chunk.trailingWhitespace || ''
3441
}
3542

3643
return response
@@ -44,11 +51,25 @@ export class MarkdownTranslator {
4451
const chunks = await this.splitter.split(options.content)
4552

4653
for (const chunk of chunks) {
54+
if (chunk.leadingWhitespace) {
55+
yield chunk.leadingWhitespace
56+
}
57+
4758
if (chunk.shouldTranslate) {
48-
yield* this.streamChunk({...options, content: chunk.content})
59+
// eslint-disable-next-line no-await-in-loop
60+
for await (const streamedChunk of this.streamChunk({
61+
...options,
62+
content: chunk.content,
63+
})) {
64+
yield streamedChunk
65+
}
4966
} else {
5067
yield chunk.content
5168
}
69+
70+
if (chunk.trailingWhitespace) {
71+
yield chunk.trailingWhitespace
72+
}
5273
}
5374
}
5475

src/core/types.ts

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,25 @@
33
* Each chunk can indicate whether it should be translated or not via shouldTranslate.
44
*/
55
export interface Chunk {
6-
content: string;
7-
shouldTranslate: boolean;
6+
/**
7+
* The main content of the chunk (trimmed of leading/trailing whitespace)
8+
*/
9+
content: string
10+
11+
/**
12+
* Leading whitespace that was trimmed from the content
13+
*/
14+
leadingWhitespace?: string
15+
16+
/**
17+
* Whether this chunk should be translated
18+
*/
19+
shouldTranslate: boolean
20+
21+
/**
22+
* Trailing whitespace that was trimmed from the content
23+
*/
24+
trailingWhitespace?: string
825
}
926

1027
/**
@@ -14,15 +31,15 @@ export interface TranslationOptions {
1431
/**
1532
* The markdown content to translate
1633
*/
17-
content: string;
34+
content: string
1835

1936
/**
2037
* Source language (e.g., "EN", "ES")
2138
*/
22-
sourceLanguage: string;
39+
sourceLanguage: string
2340

2441
/**
2542
* Target language (e.g., "EN", "ES")
2643
*/
27-
targetLanguage: string;
44+
targetLanguage: string
2845
}

0 commit comments

Comments
 (0)