entropyxyz
diff --git a/‎scripts/combine_docs.sh
Lines changed: 111 additions & 0 deletions b/‎scripts/combine_docs.sh
Lines changed: 111 additions & 0 deletions
@@ -0,0 +1,111 @@
+#!/bin/bash
+# =======================================================================
+# Documentation Combiner
+# =======================================================================
+# 
+# This script combines all markdown files under the `content` directory
+# into a single text file, `static/rag.txt`. This file can then be easily
+# fed to LLMs for retrieval-augmented generation (RAG).
+# =======================================================================
+
+# Run from the directory that contains this script, so the relative paths
+# below resolve the same way no matter where the script is invoked from.
+cd -- "$(dirname "${BASH_SOURCE[0]}")" || {
+    echo "Error: cannot change to the script directory" >&2
+    exit 1
+}
+
+# Set paths (relative to the script directory).
+CONTENT_DIR="../content"
+OUTPUT_FILE="../static/rag.txt"
+
+# Fail early with a clear message instead of producing a header-only file.
+if [[ ! -d "$CONTENT_DIR" ]]; then
+    echo "Error: content directory not found: $CONTENT_DIR" >&2
+    exit 1
+fi
+
+# Make sure the destination directory exists before writing into it.
+mkdir -p "$(dirname "$OUTPUT_FILE")" || exit 1
+
+# Create (or truncate) the output file and write the header line that states
+# the purpose of the file. A single ">" redirect does both.
+echo "This file was automatically generated from the latest available content at docs.entropy.xyz. It's intended to be used by AI and LLM bots to gather data into a query for retrieval-augmented generation." > "$OUTPUT_FILE" || exit 1
+
+# Render one markdown file as plain text on stdout.
+#
+# Handling, in order of precedence:
+#   - YAML front matter (a "---" pair whose opener is the first line of the
+#     file) is emitted under a "DOCUMENT METADATA:" heading with keys,
+#     surrounding quotes and list-dash padding stripped.
+#   - Multi-line shortcodes are skipped up to the line that closes them.
+#   - Fenced code blocks are copied verbatim, so "#" comments and "*" inside
+#     code are not mistaken for markdown.
+#   - Lines consisting only of a shortcode tag are dropped; a
+#     `callout type="X"` tag is replaced by "[X note]".
+#   - ATX headers lose their "#" prefix and are upper-cased.
+#   - Every other line has links, inline shortcodes, emphasis and inline-code
+#     markers removed; lines that end up empty are not printed.
+#
+# Only sed -E, tr and bash 3.2 features are used, for macOS compatibility.
+#
+# Arguments: $1 - path to the markdown file
+# Outputs:   the converted text on stdout
+render_markdown() {
+    local src=$1
+    local line text
+    local line_no=0 in_yaml=0 in_shortcode=0 in_code=0
+    # Patterns are kept in variables so that `=~` treats them the same way
+    # on every bash version, including the bash 3.2 shipped with macOS.
+    local fence_re='^[[:space:]]*(```|~~~)'
+    local header_re='^#{1,6}[[:space:]]+(.*)$'
+    local tag_re='^[[:space:]]*\{\{[<%][^}]*[>%]\}\}[[:space:]]*$'
+    local open_re='^[[:space:]]*\{\{[<%][^}]*$'
+    local callout_re='callout type="([^"]*)"'
+
+    # The `|| [[ -n ... ]]` keeps a final line that has no trailing newline.
+    while IFS= read -r line || [[ -n "$line" ]]; do
+        line_no=$((line_no + 1))
+
+        # YAML front matter: only a "---" on line 1 may open it, so a
+        # horizontal rule later in the document is not mistaken for metadata.
+        if [[ "$line" == "---" ]]; then
+            if [[ $line_no -eq 1 ]]; then
+                in_yaml=1
+                echo "DOCUMENT METADATA:"
+                continue
+            elif [[ $in_yaml -eq 1 ]]; then
+                in_yaml=0
+                echo ""
+                continue
+            fi
+        fi
+
+        # Inside front matter: drop the key, surrounding quotes and list
+        # indentation but keep the value.
+        if [[ $in_yaml -eq 1 ]]; then
+            printf '%s\n' "$line" | sed -E \
+                -e 's/^[a-zA-Z0-9_-]+: *//' \
+                -e 's/^[ ]*"//' \
+                -e 's/"[ ]*$//' \
+                -e 's/^[ ]*- */- /'
+            continue
+        fi
+
+        # Inside a multi-line shortcode: skip up to and including the line
+        # that closes it.
+        if [[ $in_shortcode -eq 1 ]]; then
+            if [[ "$line" == *"}}"* ]]; then
+                in_shortcode=0
+            fi
+            continue
+        fi
+
+        # Fenced code blocks are passed through untouched.
+        if [[ "$line" =~ $fence_re ]]; then
+            in_code=$((1 - in_code))
+            printf '%s\n' "$line"
+            continue
+        fi
+        if [[ $in_code -eq 1 ]]; then
+            printf '%s\n' "$line"
+            continue
+        fi
+
+        # A line that is nothing but a shortcode tag (complete, or the first
+        # line of a multi-line tag). Only an unclosed tag switches to skip
+        # mode; a complete tag such as {{< /callout >}} must not, otherwise
+        # everything after the first shortcode would be discarded.
+        if [[ "$line" =~ $tag_re || "$line" =~ $open_re ]]; then
+            if [[ "$line" != *"}}"* ]]; then
+                in_shortcode=1
+            fi
+            if [[ "$line" =~ $callout_re ]]; then
+                echo "[${BASH_REMATCH[1]} note]"
+            fi
+            continue
+        fi
+
+        # Headers: remove the "#" prefix and convert to uppercase.
+        if [[ "$line" =~ $header_re ]]; then
+            printf '%s\n' "${BASH_REMATCH[1]}" | tr '[:lower:]' '[:upper:]'
+            echo ""
+            continue
+        fi
+
+        # Inline markup, in one sed process. "[^]]" is the POSIX way to say
+        # "anything but a closing bracket"; a backslash inside a bracket
+        # expression is a literal backslash. Links go first so that a relref
+        # shortcode used as a link target disappears together with the URL.
+        text=$(printf '%s\n' "$line" | sed -E \
+            -e 's/\[([^]]*)\]\([^)]*\)/\1/g' \
+            -e 's/\{\{[<%][^}]*[>%]\}\}//g' \
+            -e 's/\*\*([^*]*)\*\*/\1/g' \
+            -e 's/\*([^*]*)\*/\1/g' \
+            -e 's/`([^`]*)`/\1/g')
+
+        # Only print non-empty lines after processing.
+        if [[ -n "$text" ]]; then
+            printf '%s\n' "$text"
+        fi
+    done < "$src"
+}
+
+# Find all markdown files in the content directory, in sorted order, and
+# append each one to the output file between START/END separator lines.
+find "$CONTENT_DIR" -name "*.md" | sort | while IFS= read -r file; do
+    echo "Processing: $file"
+
+    # Only the base name is used in the separators.
+    filename=$(basename "$file")
+
+    {
+        printf '\n\n=== START OF FILE: %s ===\n\n' "$filename"
+        render_markdown "$file"
+        printf '\n=== END OF FILE: %s ===\n\n' "$filename"
+    } >> "$OUTPUT_FILE"
+done
+
+# Report what was produced. The arithmetic expansion strips the leading
+# padding that BSD/macOS `wc -l` prints in front of the number.
+file_count=$(( $(find "$CONTENT_DIR" -name "*.md" | wc -l) ))
+echo "All markdown files have been combined into $OUTPUT_FILE"
+echo "Total files processed: $file_count"
+echo "Total size of combined file: $(du -h "$OUTPUT_FILE" | cut -f1)"