ibm-skills-network
diff --git a/‎package.json‎
Lines changed: 1 addition & 0 deletions b/‎package.json‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎pnpm-lock.yaml‎
Lines changed: 3 additions & 0 deletions b/‎pnpm-lock.yaml‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎src/core/providers/watsonx.ts‎
Lines changed: 15 additions & 15 deletions b/‎src/core/providers/watsonx.ts‎
Lines changed: 15 additions & 15 deletions
diff --git a/‎src/core/splitters/markdown.ts‎
Lines changed: 221 additions & 0 deletions b/‎src/core/splitters/markdown.ts‎
Lines changed: 221 additions & 0 deletions
@@ -11,6 +11,7 @@
     "@ibm-cloud/watsonx-ai": "^1.7.0",
     "@langchain/community": "^1.0.0",
     "@langchain/core": "^1.0.2",
+    "@langchain/textsplitters": "^1.0.0",
     "@oclif/core": "^4",
     "@oclif/plugin-help": "^6",
     "@oclif/plugin-plugins": "^5"
 
@@ -1,49 +1,49 @@
-import { ChatWatsonx } from '@langchain/community/chat_models/ibm';
+import {ChatWatsonx} from '@langchain/community/chat_models/ibm'
 
 /**
  * Configuration for watsonx.ai client
  */
 export interface WatsonxConfig {
-  apiKey: string;
-  maxNewTokens?: number;
-  model?: string;
-  projectId: string;
-  serviceUrl: string;
-  temperature?: number;
+  /** API key handed to ChatWatsonx as `watsonxAIApikey` (auth type 'iam'); createClient falls back to WATSONX_AI_APIKEY. */
+  apiKey: string
+  /** Forwarded to ChatWatsonx as `maxTokens`; createClient uses 2000 when this is not provided. */
+  maxNewTokens?: number
+  /** Model identifier; createClient uses 'ibm/granite-4-h-small' when this is not provided. */
+  model?: string
+  /** Forwarded to ChatWatsonx as `projectId`; createClient falls back to WATSONX_AI_PROJECT_ID. */
+  projectId: string
+  /** Forwarded to ChatWatsonx as `serviceUrl`; createClient falls back to WATSONX_AI_SERVICE_URL. */
+  serviceUrl: string
+  /** Sampling temperature; createClient falls back to 0.3 when this is not provided. */
+  temperature?: number
 }
 
 /**
  * Creates and returns a configured watsonx.ai chat model with IAM authentication
  */
 export function createClient(config?: Partial<WatsonxConfig>): ChatWatsonx {
   // Get configuration from environment variables or provided config
+  // (an explicitly provided config value wins; the environment variable is the fallback)
-  const serviceUrl = config?.serviceUrl || process.env.WATSONX_AI_SERVICE_URL;
-  const projectId = config?.projectId || process.env.WATSONX_AI_PROJECT_ID;
-  const apiKey = config?.apiKey || process.env.WATSONX_AI_APIKEY;
+  const serviceUrl = config?.serviceUrl || process.env.WATSONX_AI_SERVICE_URL
+  const projectId = config?.projectId || process.env.WATSONX_AI_PROJECT_ID
+  const apiKey = config?.apiKey || process.env.WATSONX_AI_APIKEY
 
   // Validate required configuration
   if (!serviceUrl) {
-    throw new Error('WATSONX_AI_SERVICE_URL is required');
+    throw new Error('WATSONX_AI_SERVICE_URL is required')
   }
 
   if (!projectId) {
-    throw new Error('WATSONX_AI_PROJECT_ID is required');
+    throw new Error('WATSONX_AI_PROJECT_ID is required')
   }
 
   if (!apiKey) {
-    throw new Error('WATSONX_AI_APIKEY is required');
+    throw new Error('WATSONX_AI_APIKEY is required')
   }
 
   // Create and return the chat model with IAM authentication
   return new ChatWatsonx({
     maxRetries: 3,
     maxTokens: config?.maxNewTokens || 2000,
-    model: config?.model || 'ibm/granite-3-8b-instruct',
+    model: config?.model || 'ibm/granite-4-h-small',
     projectId,
     serviceUrl,
-    temperature: config?.temperature || 0.3,
+    // `??` rather than `||`: an explicit temperature of 0 is a valid setting and
+    // must not be silently replaced by the 0.3 default.
+    temperature: config?.temperature ?? 0.3,
     version: '2024-05-31',
     watsonxAIApikey: apiKey,
     watsonxAIAuthType: 'iam',
-  });
+  })
 }
@@ -0,0 +1,221 @@
+import {MarkdownTextSplitter} from '@langchain/textsplitters'
+
+import type {Chunk} from '../types.js'
+
+/**
+ * Hybrid two-pass markdown splitter for translation purposes.
+ *
+ * Pass 1: Structural split by markdown elements (frontmatter, code blocks, headers/page directives)
+ * Pass 2: Size-based recursive split using LangChain for chunks exceeding chunkSize
+ *
+ * This approach:
+ * - Respects markdown structure and translation requirements (code blocks, frontmatter = non-translatable)
+ * - Prevents massive chunks from overwhelming translation APIs
+ * - Uses LangChain's intelligent splitting for size management (tries headers > paragraphs > lines)
+ */
+export class MarkdownSplitter {
+  /** Length threshold (`content.length`) above which a first-pass chunk is re-split. */
+  private readonly chunkSize: number
+  /** LangChain splitter that performs the size-based second pass. */
+  private readonly recursiveSplitter: MarkdownTextSplitter
+
+  /**
+   * @param chunkSize Chunk size given to the LangChain splitter; also the threshold
+   *   that decides whether a first-pass chunk goes through the second pass.
+   */
+  constructor(chunkSize = 12_000) {
+    this.chunkSize = chunkSize
+
+    this.recursiveSplitter = new MarkdownTextSplitter({
+      chunkOverlap: 0,
+      chunkSize,
+      keepSeparator: true,
+    })
+  }
+
+  /**
+   * Default split method - automatically detects and uses the appropriate splitting strategy.
+   * If the content contains a line starting with ::page, splits by those directives.
+   * Otherwise, splits by headers (# ## ### etc.)
+   *
+   * @param markdown Full markdown document.
+   * @returns Chunks in document order.
+   */
+  async split(markdown: string): Promise<Chunk[]> {
+    if (/^::page/m.test(markdown)) {
+      return this.splitByPageDirective(markdown)
+    }
+
+    return this.splitByHeaders(markdown)
+  }
+
+  /**
+   * Splits markdown by headers (# ## ### etc.), then size-splits oversized chunks.
+   *
+   * @param markdown Full markdown document.
+   * @returns Chunks in document order.
+   */
+  async splitByHeaders(markdown: string): Promise<Chunk[]> {
+    const firstPassChunks = this.splitInternal(markdown, /^#+\s+/m)
+
+    return this.secondPassSplit(firstPassChunks)
+  }
+
+  /**
+   * Splits markdown by custom ::page directives, then size-splits oversized chunks.
+   *
+   * @param markdown Full markdown document.
+   * @returns Chunks in document order.
+   */
+  async splitByPageDirective(markdown: string): Promise<Chunk[]> {
+    const firstPassChunks = this.splitInternal(markdown, /^::page/m)
+
+    return this.secondPassSplit(firstPassChunks)
+  }
+
+  /**
+   * Creates a chunk with whitespace extracted and stored separately.
+   *
+   * @param content Raw text of the chunk, including any surrounding whitespace.
+   * @param shouldTranslate Flag stored on the chunk unchanged.
+   * @returns Chunk whose `content` is trimmed and whose leading/trailing whitespace
+   *   runs are kept in their own fields.
+   */
+  private createChunk(content: string, shouldTranslate: boolean): Chunk {
+    const leadingWhitespace = content.match(/^\s+/)?.[0] || ''
+    const trailingWhitespace = content.match(/\s+$/)?.[0] || ''
+    const trimmedContent = content.trim()
+
+    return {
+      content: trimmedContent,
+      leadingWhitespace,
+      shouldTranslate,
+      trailingWhitespace,
+    }
+  }
+
+  /**
+   * Finds the index of the closing --- for frontmatter.
+   *
+   * The search starts at line 1, so the `+ 1` maps the result back to an index into
+   * `lines`. Returns 0 when there is no closing marker; callers check
+   * hasFrontmatter() first.
+   */
+  private findFrontmatterEnd(lines: string[]): number {
+    return lines.slice(1).findIndex((line) => line.trim() === '---') + 1
+  }
+
+  /**
+   * Checks if the markdown starts with YAML frontmatter (--- ... ---):
+   * the first line is `---` and some later line is `---` as well.
+   */
+  private hasFrontmatter(lines: string[]): boolean {
+    if (lines.length === 0) return false
+
+    return lines[0].trim() === '---' && lines.slice(1).some((line) => line.trim() === '---')
+  }
+
+  /**
+   * Turns the pieces produced by the size-based splitter back into chunks without
+   * losing the whitespace that separated them.
+   *
+   * The LangChain splitter trims every piece it returns, so the blank lines/newlines
+   * between two consecutive pieces would otherwise disappear and concatenating the
+   * chunks would no longer reproduce the document. Each piece is located in the
+   * parent's content (searching forward from the end of the previous piece) and the
+   * skipped whitespace is re-attached as that piece's leading whitespace.
+   *
+   * @param chunk First-pass chunk that was split.
+   * @param pieces Texts returned by the splitter for `chunk.content`, in order.
+   * @returns One chunk per piece, inheriting `chunk.shouldTranslate`.
+   */
+  private rebuildSubChunks(chunk: Chunk, pieces: string[]): Chunk[] {
+    const source = chunk.content
+    const lastIndex = pieces.length - 1
+    let cursor = 0
+
+    return pieces.map((piece, index) => {
+      let gap = ''
+      const foundAt = source.indexOf(piece, cursor)
+
+      if (foundAt !== -1) {
+        const skipped = source.slice(cursor, foundAt)
+
+        // Only whitespace can legitimately sit between two pieces; anything else means
+        // the piece was matched in the wrong place, so fall back to no gap.
+        if (skipped.trim() === '') {
+          gap = skipped
+          cursor = foundAt + piece.length
+        }
+      }
+
+      // The parent's own outer whitespace belongs to the first and last piece only.
+      const leading = (index === 0 ? (chunk.leadingWhitespace ?? '') : '') + gap
+      const trailing = index === lastIndex ? (chunk.trailingWhitespace ?? '') : ''
+
+      return this.createChunk(leading + piece + trailing, chunk.shouldTranslate)
+    })
+  }
+
+  /**
+   * Pass 2: Size-based recursive split using LangChain
+   *
+   * For chunks exceeding chunkSize, uses MarkdownTextSplitter to break them down
+   * intelligently while preserving the shouldTranslate flag and the whitespace
+   * between the resulting pieces. Chunks within the limit pass through untouched.
+   *
+   * LangChain's MarkdownTextSplitter tries separators in this order:
+   * 1. H2-H6 headers (## ### #### etc.)
+   * 2. Code blocks with spacing
+   * 3. Horizontal rules (---, ***, ___)
+   * 4. Paragraph breaks (\n\n)
+   * 5. Line breaks (\n)
+   * 6. Spaces
+   * 7. Characters (last resort)
+   *
+   * @param chunks First-pass chunks in document order.
+   * @returns Chunks in the same order, with oversized ones replaced by their pieces.
+   */
+  private async secondPassSplit(chunks: Chunk[]): Promise<Chunk[]> {
+    const splitResults = await Promise.all(
+      chunks.map(async (chunk) => {
+        if (chunk.content.length <= this.chunkSize) {
+          return [chunk]
+        }
+
+        const pieces = await this.recursiveSplitter.splitText(chunk.content)
+
+        // Never drop content: if the splitter yields nothing, keep the chunk as it was.
+        if (pieces.length === 0) {
+          return [chunk]
+        }
+
+        return this.rebuildSubChunks(chunk, pieces)
+      }),
+    )
+
+    return splitResults.flat()
+  }
+
+  /**
+   * Pass 1: Structural split using state machine (ported from Ruby implementation)
+   *
+   * Handles:
+   * - Frontmatter detection (YAML between --- markers)
+   * - Code block detection (``` fences)
+   * - Split pattern matching (headers or ::page directives)
+   * - shouldTranslate flag assignment
+   *
+   * @param markdown Full markdown document.
+   * @param splitPattern Tested against each line; a match outside a code block starts a new chunk.
+   * @returns Non-empty chunks in document order; frontmatter and fenced code blocks
+   *   are flagged shouldTranslate = false.
+   */
+  private splitInternal(markdown: string, splitPattern: RegExp): Chunk[] {
+    const chunks: Chunk[] = []
+    const splitLines = markdown.split('\n')
+
+    const hasTrailingNewline = markdown.endsWith('\n')
+
+    // A trailing newline makes split() produce a final empty element; drop it so it
+    // does not turn into an extra line.
+    if (hasTrailingNewline && splitLines.at(-1) === '') {
+      splitLines.pop()
+    }
+
+    // Re-attach the newline to every line so joining lines reproduces the input.
+    const lines = splitLines.map((line) => line + '\n')
+
+    // The input did not end with a newline, so the last line must not get one either.
+    if (!hasTrailingNewline && lines.length > 0) {
+      lines[lines.length - 1] = lines.at(-1)!.slice(0, -1)
+    }
+
+    let currentPosition = 0
+
+    // Frontmatter (including both --- markers) becomes one non-translatable chunk.
+    if (this.hasFrontmatter(lines)) {
+      const endIndex = this.findFrontmatterEnd(lines)
+      const frontmatterContent = lines.slice(0, endIndex + 1).join('')
+      chunks.push(this.createChunk(frontmatterContent, false))
+      currentPosition = endIndex + 1
+    }
+
+    let currentChunkContent = ''
+    let currentChunkShouldTranslate = true
+    let inCodeBlock = false
+
+    for (let i = currentPosition; i < lines.length; i++) {
+      const line = lines[i]
+
+      // NOTE(review): only fences whose info string consists of [\w-] characters are
+      // recognized (e.g. "```ts"); a fence such as "```c++" is treated as ordinary text.
+      if (/^```[\w-]*\s*$/.test(line)) {
+        if (inCodeBlock) {
+          // Closing fence: finish the code chunk and return to translatable text.
+          currentChunkContent += line
+          chunks.push(this.createChunk(currentChunkContent, currentChunkShouldTranslate))
+          currentChunkContent = ''
+          currentChunkShouldTranslate = true
+          inCodeBlock = false
+        } else {
+          // Opening fence: flush the text collected so far, then start the code chunk.
+          if (currentChunkContent.trim()) {
+            chunks.push(this.createChunk(currentChunkContent, currentChunkShouldTranslate))
+            currentChunkContent = line
+          } else {
+            currentChunkContent += line
+          }
+
+          currentChunkShouldTranslate = false
+          inCodeBlock = true
+        }
+
+        continue
+      }
+
+      if (splitPattern.test(line) && !inCodeBlock) {
+        // Split point: flush the previous chunk (if it has content) and start a new one.
+        if (currentChunkContent.trim()) {
+          chunks.push(this.createChunk(currentChunkContent, currentChunkShouldTranslate))
+          currentChunkContent = line
+        } else {
+          currentChunkContent += line
+        }
+
+        currentChunkShouldTranslate = true
+      } else {
+        currentChunkContent += line
+      }
+    }
+
+    // Flush whatever is left after the last line (including an unterminated code block).
+    if (currentChunkContent.trim()) {
+      chunks.push(this.createChunk(currentChunkContent, currentChunkShouldTranslate))
+    }
+
+    return chunks.filter((chunk) => chunk.content !== '')
+  }
+}