|
#!/bin/bash
# =======================================================================
# Documentation Combiner
# =======================================================================
#
# Combines every markdown file found under the content directory into a
# single plain-text file (`../static/rag.txt` by default). That file can
# then be easily fed to LLMs for retrieval-augmented generation (RAG).
#
# Default paths are resolved relative to this script's own location, so
# the script can be started from any working directory.
#
# Optional environment overrides:
#   CONTENT_DIR  directory searched recursively for *.md files
#   OUTPUT_FILE  combined text file to (re)generate
# =======================================================================

# No `set -e` on purpose: every step that matters is checked explicitly
# and main() reports failure through its return status.
set -uo pipefail

# Set paths.
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd) || SCRIPT_DIR=.
CONTENT_DIR=${CONTENT_DIR:-"$SCRIPT_DIR/../content"}
OUTPUT_FILE=${OUTPUT_FILE:-"$SCRIPT_DIR/../static/rag.txt"}

#######################################
# Convert one markdown document to plain text.
#
# - A YAML frontmatter block (only recognised at the top of the document)
#   is emitted under a "DOCUMENT METADATA:" heading with keys and
#   surrounding quotes stripped.
# - Fenced code blocks are passed through verbatim.
# - Shortcode tag lines are dropped; a callout tag leaves a
#   "[<type> note]" marker. The text between tags is kept.
# - Headings lose their leading '#' marks and are upper-cased.
# - Links/images are reduced to their text; bold, italic and inline-code
#   markers are removed.
# - Empty lines outside frontmatter and code blocks are dropped.
#
# Arguments: none
# Inputs:    markdown on stdin
# Outputs:   plain text on stdout
#######################################
convert_markdown() {
  local line cleaned processed after_open
  local in_yaml=0 at_top=1 in_code=0 in_shortcode=0
  # Patterns live in variables so the backticks/quotes need no escaping
  # inside [[ =~ ]].
  local fence_re='^[[:space:]]*(```|~~~)'
  local header_re='^#+([[:space:]]+(.*))?$'
  local callout_re='callout[[:space:]]+type="([^"]*)"'

  # `|| [[ -n "$line" ]]` keeps a final line that has no trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    line=${line%$'\r'} # tolerate CRLF line endings

    # Handle YAML frontmatter delimiters. Only a `---` seen before any
    # other content opens frontmatter; later ones are horizontal rules.
    if [[ "$line" == "---" ]]; then
      if (( in_yaml )); then
        in_yaml=0
        printf '\n'
        continue
      elif (( at_top )); then
        in_yaml=1
        at_top=0
        printf 'DOCUMENT METADATA:\n'
        continue
      fi
    fi

    # Handle content in YAML block: drop the key, surrounding quotes and
    # normalise list-item indentation, but keep the values.
    if (( in_yaml )); then
      cleaned=$(printf '%s\n' "$line" | sed -E \
        -e 's/^[a-zA-Z0-9_-]+: *//' \
        -e 's/^[ ]*"//' \
        -e 's/"[ ]*$//' \
        -e 's/^[ ]*- */- /')
      printf '%s\n' "$cleaned"
      continue
    fi

    if [[ -n "$line" ]]; then
      at_top=0
    fi

    # Fenced code blocks are copied untouched so that `# comment` lines
    # are not mistaken for headings and `*`/backticks in code survive.
    if [[ "$line" =~ $fence_re ]]; then
      in_code=$(( 1 - in_code ))
      printf '%s\n' "$line"
      continue
    fi
    if (( in_code )); then
      printf '%s\n' "$line"
      continue
    fi

    # Inline ref/relref shortcodes (typically used as link targets) are
    # removed first so the surrounding sentence is not discarded below.
    if [[ "$line" == *"{{"*"ref "* ]]; then
      line=$(printf '%s\n' "$line" \
        | sed -E 's/\{\{[<%] *(rel)?ref [^}]*[>%]\}\}//g')
    fi

    # Handle shortcode tag lines.
    if [[ "$line" == *"{{<"* || "$line" == *"{{%"* ]]; then
      # Extract useful info from shortcodes.
      if [[ "$line" =~ $callout_re ]]; then
        printf '[%s note]\n' "${BASH_REMATCH[1]}"
      fi
      # Only a tag whose closing braces are missing continues on the
      # following lines; a complete tag must not hide what comes after.
      after_open=${line##*"{{"}
      if [[ "$after_open" == *"}}"* ]]; then
        in_shortcode=0
      else
        in_shortcode=1
      fi
      continue
    fi

    # Skip the remaining lines of a multi-line shortcode tag.
    if (( in_shortcode )); then
      if [[ "$line" == *"}}"* ]]; then
        in_shortcode=0
      fi
      continue
    fi

    # Handle markdown headers: strip the '#' marks, convert to uppercase
    # and follow with a blank line.
    if [[ "$line" =~ $header_re ]]; then
      printf '%s\n\n' "${BASH_REMATCH[2]:-}" | tr '[:lower:]' '[:upper:]'
      continue
    fi

    # Handle links and inline formatting. sed is only spawned when the
    # line actually contains markup characters.
    case "$line" in
      *'['* | *'*'* | *'`'*)
        # NB: inside a bracket expression a backslash is literal, hence
        # `[^]]` (not `[^\]]`) for "anything but a closing bracket".
        processed=$(printf '%s\n' "$line" | sed -E \
          -e 's/!?\[([^]]*)\]\([^)]*\)/\1/g' \
          -e 's/\*\*([^*]*)\*\*/\1/g' \
          -e 's/\*([^* ][^*]*)\*/\1/g' \
          -e 's/`([^`]*)`/\1/g')
        ;;
      *)
        processed=$line
        ;;
    esac

    # Only print non-empty lines after processing.
    if [[ -n "$processed" ]]; then
      printf '%s\n' "$processed"
    fi
  done
}

#######################################
# Regenerate OUTPUT_FILE from all markdown files below CONTENT_DIR.
#
# Globals:   CONTENT_DIR (read), OUTPUT_FILE (read)
# Arguments: none
# Outputs:   progress and a summary on stdout, errors on stderr
# Returns:   0 on success, 1 if the content directory is missing or the
#            output file cannot be written
#######################################
main() {
  local intro="This file was automatically generated from the latest available content at docs.entropy.xyz. It's intended to be used by AI and LLM bots to gather data into a query for retrieval-augmented generation."
  local out_dir path filename
  local -a files=()

  if [[ ! -d "$CONTENT_DIR" ]]; then
    printf 'Error: content directory not found: %s\n' "$CONTENT_DIR" >&2
    return 1
  fi

  out_dir=$(dirname -- "$OUTPUT_FILE")
  if ! mkdir -p -- "$out_dir"; then
    printf 'Error: cannot create output directory: %s\n' "$out_dir" >&2
    return 1
  fi

  # Find all markdown files in the content directory. NUL-delimited so
  # unusual file names survive; byte-wise sort keeps the order identical
  # on every machine regardless of locale.
  while IFS= read -r -d '' path; do
    if [[ -f "$path" ]]; then
      files+=("$path")
    fi
  done < <(find "$CONTENT_DIR" -name '*.md' -print0 | LC_ALL=C sort -z)

  # Create (or truncate) the output file with the header line indicating
  # the purpose of the file.
  if ! printf '%s\n' "$intro" > "$OUTPUT_FILE"; then
    printf 'Error: cannot write to %s\n' "$OUTPUT_FILE" >&2
    return 1
  fi

  # Guarded because expanding an empty array trips `set -u` on bash < 4.4.
  if (( ${#files[@]} > 0 )); then
    for path in "${files[@]}"; do
      printf 'Processing: %s\n' "$path"

      # File name (without directories) used in the separators.
      filename=${path##*/}

      {
        printf '\n\n=== START OF FILE: %s ===\n\n' "$filename"
        convert_markdown < "$path"
        printf '\n=== END OF FILE: %s ===\n\n' "$filename"
      } >> "$OUTPUT_FILE"
    done
  else
    printf 'Warning: no markdown files found in %s\n' "$CONTENT_DIR" >&2
  fi

  printf 'All markdown files have been combined into %s\n' "$OUTPUT_FILE"
  printf 'Total files processed: %d\n' "${#files[@]}"
  printf 'Total size of combined file: %s\n' \
    "$(du -h "$OUTPUT_FILE" | cut -f1)"
}

main "$@"
0 commit comments