Basic splitter

michaelroudnitski · michaelroudnitski · commit a506ef841f7a · 2025-10-30T18:46:41.000-04:00
diff --git a/package.json b/package.json
@@ -11,6 +11,7 @@
     "@ibm-cloud/watsonx-ai": "^1.7.0",
     "@langchain/community": "^1.0.0",
     "@langchain/core": "^1.0.2",
+    "@langchain/textsplitters": "^1.0.0",
     "@oclif/core": "^4",
     "@oclif/plugin-help": "^6",
     "@oclif/plugin-plugins": "^5"
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
diff --git a/src/core/providers/watsonx.ts b/src/core/providers/watsonx.ts
@@ -1,49 +1,49 @@
-import { ChatWatsonx } from '@langchain/community/chat_models/ibm';
+import {ChatWatsonx} from '@langchain/community/chat_models/ibm'
 
 /**
  * Configuration for watsonx.ai client
  */
 export interface WatsonxConfig {
-  apiKey: string;
-  maxNewTokens?: number;
-  model?: string;
-  projectId: string;
-  serviceUrl: string;
-  temperature?: number;
+  apiKey: string // IAM API key; createClient falls back to WATSONX_AI_APIKEY when omitted
+  maxNewTokens?: number // Forwarded as maxTokens; createClient defaults it to 2000
+  model?: string // Model id; createClient defaults it to 'ibm/granite-4-h-small'
+  projectId: string // createClient falls back to WATSONX_AI_PROJECT_ID when omitted
+  serviceUrl: string // createClient falls back to WATSONX_AI_SERVICE_URL when omitted
+  temperature?: number // createClient defaults it to 0.3
 }
 
 /**
  * Creates and returns a configured watsonx.ai chat model with IAM authentication
  */
 export function createClient(config?: Partial<WatsonxConfig>): ChatWatsonx {
   // Get configuration from environment variables or provided config
-  const serviceUrl = config?.serviceUrl || process.env.WATSONX_AI_SERVICE_URL;
-  const projectId = config?.projectId || process.env.WATSONX_AI_PROJECT_ID;
-  const apiKey = config?.apiKey || process.env.WATSONX_AI_APIKEY;
+  const serviceUrl = config?.serviceUrl || process.env.WATSONX_AI_SERVICE_URL // Explicit config wins; an empty string falls through to the env var
+  const projectId = config?.projectId || process.env.WATSONX_AI_PROJECT_ID
+  const apiKey = config?.apiKey || process.env.WATSONX_AI_APIKEY
 
   // Validate required configuration
   if (!serviceUrl) {
-    throw new Error('WATSONX_AI_SERVICE_URL is required');
+    throw new Error('WATSONX_AI_SERVICE_URL is required')
   }
 
   if (!projectId) {
-    throw new Error('WATSONX_AI_PROJECT_ID is required');
+    throw new Error('WATSONX_AI_PROJECT_ID is required')
   }
 
   if (!apiKey) {
-    throw new Error('WATSONX_AI_APIKEY is required');
+    throw new Error('WATSONX_AI_APIKEY is required')
   }
 
   // Create and return the chat model with IAM authentication
   return new ChatWatsonx({
     maxRetries: 3,
     maxTokens: config?.maxNewTokens || 2000,
-    model: config?.model || 'ibm/granite-3-8b-instruct',
+    model: config?.model || 'ibm/granite-4-h-small', // NOTE(review): like the neighbouring maxTokens/temperature fallbacks, || treats falsy values as unset (temperature: 0 becomes 0.3); consider ??
     projectId,
     serviceUrl,
     temperature: config?.temperature || 0.3,
     version: '2024-05-31',
     watsonxAIApikey: apiKey,
     watsonxAIAuthType: 'iam',
-  });
+  })
 }
diff --git a/src/core/splitters/markdown.ts b/src/core/splitters/markdown.ts
@@ -0,0 +1,198 @@
+import {MarkdownTextSplitter} from '@langchain/textsplitters'
+
+import type {Chunk} from '../types.js'
+
+/**
+ * Two-pass markdown splitter that prepares documents for chunk-by-chunk translation.
+ *
+ * Pass 1 (splitInternal): line-based structural split on frontmatter, fenced code blocks and a split pattern (headers or ::page)
+ * Pass 2 (secondPassSplit): chunks whose content.length exceeds chunkSize are re-split with LangChain's MarkdownTextSplitter
+ *
+ * Resulting chunks:
+ * - Have shouldTranslate = false for frontmatter and fenced code blocks, true for everything else
+ * - Are returned in document order; pieces produced by pass 2 inherit their parent's shouldTranslate flag
+ * - NOTE(review): pass 1 drops whitespace-only chunks and re-appends \n to every line, so joined chunks may differ from the input
+ */
+export class MarkdownSplitter {
+  private readonly chunkSize: number // Size threshold, compared with content.length in secondPassSplit
+  private recursiveSplitter: MarkdownTextSplitter // Only invoked for chunks that exceed chunkSize
+
+  constructor(chunkSize = 12_000) {
+    this.chunkSize = chunkSize
+
+    // Pass 2 splitter, configured with the same size limit used by the pass-through check
+    this.recursiveSplitter = new MarkdownTextSplitter({
+      chunkOverlap: 0, // No overlap needed for translation (avoids duplicates)
+      chunkSize,
+      keepSeparator: true, // Preserve markdown formatting (headers, etc.)
+    })
+  }
+
+  /**
+   * Default entry point: picks the splitting strategy from the content itself.
+   * If any line starts with ::page (tested on the raw text, code blocks included), splits by ::page directives.
+   * Otherwise, splits by headers (# ## ### etc.)
+   */
+  async split(markdown: string): Promise<Chunk[]> {
+    if (/^::page/m.test(markdown)) {
+      return this.splitByPageDirective(markdown)
+    }
+
+    return this.splitByHeaders(markdown)
+  }
+
+  /**
+   * Splits before each line that starts with one or more # followed by whitespace (outside code blocks), then applies pass 2
+   */
+  async splitByHeaders(markdown: string): Promise<Chunk[]> {
+    const firstPassChunks = this.splitInternal(markdown, /^#+\s+/m)
+
+    return this.secondPassSplit(firstPassChunks)
+  }
+
+  /**
+   * Splits before each line that starts with ::page (outside code blocks), then applies pass 2
+   */
+  async splitByPageDirective(markdown: string): Promise<Chunk[]> {
+    const firstPassChunks = this.splitInternal(markdown, /^::page/m)
+
+    return this.secondPassSplit(firstPassChunks)
+  }
+
+  /**
+   * Returns the index in lines of the first --- line after line 0, or 0 when there is none (splitInternal checks hasFrontmatter first)
+   */
+  private findFrontmatterEnd(lines: string[]): number {
+    return lines.slice(1).findIndex((line) => line.trim() === '---') + 1
+  }
+
+  /**
+   * Returns true when the first line is --- and at least one later line is --- as well (lines are trimmed before comparing)
+   */
+  private hasFrontmatter(lines: string[]): boolean {
+    if (lines.length === 0) return false
+
+    return lines[0].trim() === '---' && lines.slice(1).some((line) => line.trim() === '---')
+  }
+
+  /**
+   * Pass 2: Size-based recursive split using LangChain
+   *
+   * Chunks whose content.length is <= chunkSize pass through unchanged. Larger chunks are handed to
+   * MarkdownTextSplitter.splitText and every resulting piece inherits the parent's shouldTranslate flag.
+   *
+   * Separator order as described for LangChain's MarkdownTextSplitter — TODO confirm against the installed version:
+   * 1. H2-H6 headers (## ### #### etc.)
+   * 2. Code blocks with spacing
+   * 3. Horizontal rules (---, ***, ___)
+   * 4. Paragraph breaks (\n\n)
+   * 5. Line breaks (\n)
+   * 6. Spaces
+   * 7. Characters (last resort)
+   */
+  private async secondPassSplit(chunks: Chunk[]): Promise<Chunk[]> {
+    const finalChunks: Chunk[] = []
+
+    // Each chunk maps to a promise of one or more chunks; Promise.all below keeps them in input order
+    const splitPromises = chunks.map(async (chunk) => {
+      // Within the size limit: keep as-is
+      if (chunk.content.length <= this.chunkSize) {
+        return [chunk]
+      }
+
+      // Too large: re-split. NOTE(review): this also applies to shouldTranslate = false chunks — confirm the pieces re-join losslessly
+      const splitTexts = await this.recursiveSplitter.splitText(chunk.content)
+
+      // Preserve the shouldTranslate flag on all sub-chunks
+      return splitTexts.map((text) => ({
+        content: text,
+        shouldTranslate: chunk.shouldTranslate,
+      }))
+    })
+
+    const splitResults = await Promise.all(splitPromises)
+
+    // Flatten the per-chunk arrays into a single list
+    for (const result of splitResults) {
+      finalChunks.push(...result)
+    }
+
+    return finalChunks
+  }
+
+  /**
+   * Pass 1: Structural split using state machine (ported from Ruby implementation)
+   *
+   * Line-by-line scan that:
+   * - Emits leading frontmatter (first line --- through the next --- line) as one non-translatable chunk
+   * - Emits each fenced code block, fences included, as one non-translatable chunk
+   * - Starts a new translatable chunk at every line matching splitPattern outside a code block
+   * - Drops chunks whose content is only whitespace
+   */
+  private splitInternal(markdown: string, splitPattern: RegExp): Chunk[] {
+    const chunks: Chunk[] = []
+    const lines = markdown.split('\n').map((line) => line + '\n') // NOTE(review): the last piece also gets a \n, even if the input had none
+    let currentPosition = 0
+
+    // Frontmatter, when present, becomes the first chunk; scanning resumes on the line after its closing ---
+    if (this.hasFrontmatter(lines)) {
+      const endIndex = this.findFrontmatterEnd(lines)
+      const frontmatterContent = lines.slice(0, endIndex + 1).join('')
+      chunks.push({
+        content: frontmatterContent,
+        shouldTranslate: false,
+      })
+      currentPosition = endIndex + 1
+    }
+
+    let currentChunk: Chunk = {content: '', shouldTranslate: true}
+    let inCodeBlock = false
+
+    for (let i = currentPosition; i < lines.length; i++) {
+      const line = lines[i]
+
+      // Check for code fence (supports language identifiers and trailing whitespace)
+      // Matches: ```, ```ruby, ```sh-session, ```python  , etc.
+      if (/^```[\w-]*\s*$/.test(line)) {
+        if (inCodeBlock) {
+          // End of code block
+          currentChunk.content += line
+          chunks.push(currentChunk)
+          currentChunk = {content: '', shouldTranslate: true}
+          inCodeBlock = false
+        } else {
+          // Start of code block
+          if (currentChunk.content.trim()) {
+            chunks.push(currentChunk)
+          }
+
+          currentChunk = {content: line, shouldTranslate: false}
+          inCodeBlock = true
+        }
+
+        continue
+      }
+
+      // Handle line based on current state and patterns
+      if (splitPattern.test(line) && !inCodeBlock) {
+        // This line matches the split pattern and we're not in a code block
+        if (currentChunk.content.trim()) {
+          chunks.push(currentChunk)
+        }
+
+        currentChunk = {content: line, shouldTranslate: true}
+      } else {
+        // Regular line - append to current chunk
+        currentChunk.content += line
+      }
+    }
+
+    // Add final chunk if not empty
+    if (currentChunk.content.trim()) {
+      chunks.push(currentChunk)
+    }
+
+    // Filter out empty chunks
+    return chunks.filter((chunk) => chunk.content.trim() !== '')
+  }
+}
diff --git a/src/core/translators/markdown.ts b/src/core/translators/markdown.ts
@@ -4,33 +4,50 @@ import {BaseMessage, HumanMessage, SystemMessage} from '@langchain/core/messages
 
 import type {TranslationOptions} from '../types.js'
 
+import {MarkdownSplitter} from '../splitters/markdown.js'
+
 /**
  * Translates markdown content using a chat model
  */
 export class MarkdownTranslator {
-  constructor(private chatModel: BaseChatModel) {}
+  private splitter: MarkdownSplitter // Splits the input into translatable and pass-through chunks before any model call
+
+  constructor(private chatModel: BaseChatModel) {
+    this.splitter = new MarkdownSplitter() // No argument: uses MarkdownSplitter's default chunkSize
+  }
 
   /**
    * Translates markdown content from source language to target language
    */
   async translate(options: TranslationOptions): Promise<string> {
-    const response = await this.chatModel.invoke(this.buildMessages(options))
-    const translatedContent = response.content as string
+    const chunks = await this.splitter.split(options.content)
+    let response = '' // Chunks are appended in document order; no separator is inserted between them
+
+    for (const chunk of chunks) {
+      if (chunk.shouldTranslate) {
+        // eslint-disable-next-line no-await-in-loop
+        const translatedChunk = await this.chatModel.invoke(this.buildMessages({...options, content: chunk.content}))
+        response += translatedChunk.content as string // NOTE(review): cast assumes string content; streamChunk guards with typeof instead — confirm non-string content cannot occur here
+      } else {
+        response += chunk.content // Non-translatable chunk is copied through unchanged
+      }
+    }
 
-    return translatedContent
+    return response
   }
 
   /**
    * Streams translated markdown content from source language to target language
    * @yields {string} Chunks of translated content
    */
   async *translateStream(options: TranslationOptions): AsyncGenerator<string> {
-    const stream = await this.chatModel.stream(this.buildMessages(options))
+    const chunks = await this.splitter.split(options.content) // The whole document is split up front; chunks are then processed one after another
 
-    for await (const chunk of stream) {
-      const {content} = chunk
-      if (typeof content === 'string') {
-        yield content
+    for (const chunk of chunks) {
+      if (chunk.shouldTranslate) {
+        yield* this.streamChunk({...options, content: chunk.content}) // Forward the model's stream for this chunk before moving on
+      } else {
+        yield chunk.content // Non-translatable chunk is emitted unchanged as a single yield
       }
     }
   }
@@ -46,7 +63,8 @@ export class MarkdownTranslator {
    * Creates the system prompt for translation
    */
   private createSystemPrompt(sourceLanguage: string, targetLanguage: string): string {
-    return `You are a helpful assistant that accurately translates markdown document snippets from ${sourceLanguage} to ${targetLanguage} while preserving markdown syntax, formatting, and custom directives.
+    return `
+You are a helpful assistant that accurately translates markdown document snippets from ${sourceLanguage} to ${targetLanguage} while preserving markdown syntax, formatting, and custom directives.
 You always preserve the structure and formatting exactly as it is.
 You do not add, alter or modify the text you receive in any way.
 
@@ -66,6 +84,22 @@ Reminder:
 - Be consistent with technical terms. If an equivalent technical term is not available in ${targetLanguage}, always use the original term.
 
 *IMPORTANT*
-Translate without any additional information or comments.`
+Translate without any additional information or comments.
+`
+  }
+
+  /**
+   * Streams the translation of one piece of content: builds messages from options and forwards the chat model's stream
+   * @yields {string} The content of each streamed message when it is a string; non-string content is skipped
+   */
+  private async *streamChunk(options: TranslationOptions): AsyncGenerator<string> {
+    const stream = await this.chatModel.stream(this.buildMessages(options))
+
+    for await (const chunk of stream) {
+      const {content} = chunk
+      if (typeof content === 'string') {
+        yield content
+      }
+    }
   }
 }
diff --git a/src/core/types.ts b/src/core/types.ts
@@ -1,3 +1,12 @@
+/**
+ * One piece of a split source document, produced by a splitter and consumed by a translator.
+ * shouldTranslate tells the consumer whether to send content to the model or pass it through unchanged.
+ */
+export interface Chunk {
+  content: string; // Raw text of this piece
+  shouldTranslate: boolean; // false for content that must be emitted as-is
+}
+
 /**
  * Translation configuration options
  */
diff --git a/test/core/splitters/markdown.test.ts b/test/core/splitters/markdown.test.ts