Skip to content

Commit de29946

Browse files
Add script to combine doc files for LLM context (#323)
* Creates script to grab all docs content into one folder. * Moves combined content into static dir. * Adds header to generated file. * Regenerates file with header. * Renames script output file to RAG. --------- Co-authored-by: johnnymatthews <>
1 parent 261c24a commit de29946

File tree

2 files changed

+1188
-0
lines changed

2 files changed

+1188
-0
lines changed

scripts/combine_docs.sh

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
#!/bin/bash
# =======================================================================
# Documentation Combiner
# =======================================================================
#
# Combines every markdown file found under the content directory into a
# single plain-text file that can be fed to LLMs for retrieval-augmented
# generation (RAG).
#
# Usage: combine_docs.sh [CONTENT_DIR] [OUTPUT_FILE]
#   CONTENT_DIR  directory searched recursively for *.md files
#                (default: ../content, relative to this script)
#   OUTPUT_FILE  file that is (re)created with the combined text
#                (default: ../static/rag.txt, relative to this script)
#
# Stays compatible with the bash 3.2 / BSD userland shipped on macOS:
# no ${var^^}, no mapfile, only POSIX ERE in sed.
# =======================================================================

# Errors are checked and reported explicitly inside main(), so `set -e`
# is intentionally not enabled.
set -uo pipefail

#######################################
# Remove inline markdown markup from one line of prose.
# Arguments: $1 - the line to clean
# Outputs:   the line with [text](url) links reduced to their text and
#            **bold**, *italic* and `code` markers removed
#######################################
strip_inline_formatting() {
  # NOTE: inside a POSIX bracket expression a backslash is literal, so
  # "not a closing bracket" must be spelled [^]] rather than [^\]].
  printf '%s\n' "$1" | sed -E \
    -e 's/\[([^]]*)\]\([^)]*\)/\1/g' \
    -e 's/\*\*([^*]*)\*\*/\1/g' \
    -e 's/\*([^*]*)\*/\1/g' \
    -e 's/`([^`]*)`/\1/g'
}

#######################################
# Reduce one YAML front matter line to its value.
# Arguments: $1 - a line from inside the front matter block
# Outputs:   the line without its leading "key:" and without the double
#            quotes wrapping the value; list items are normalised to
#            "- item"
#######################################
clean_frontmatter_line() {
  printf '%s\n' "$1" | sed -E \
    -e 's/^[a-zA-Z0-9_-]+: *//' \
    -e 's/^[ ]*"//' \
    -e 's/"[ ]*$//' \
    -e 's/^[ ]*- */- /'
}

#######################################
# Delete every complete Hugo shortcode tag ({{< ... >}} or {{% ... %}})
# from one line, keeping the surrounding text.
# Arguments: $1 - the line to clean
# Outputs:   the line without shortcode tags
#######################################
strip_shortcode_tags() {
  printf '%s\n' "$1" | sed -E 's/\{\{[<%][^}]*[>%]\}\}//g'
}

#######################################
# Convert one markdown document to plain text.
#   * Front matter (a "---" block starting on line 1) is emitted under a
#     "DOCUMENT METADATA:" label with the keys stripped.
#   * Fenced code blocks are copied through verbatim, so comments that
#     start with "#" are not mistaken for headings and "*" / backticks
#     in code are left alone.
#   * Shortcode tags are removed; the text they wrap is kept. A callout
#     tag additionally emits a "[<type> note]" label.
#   * ATX headings are upper-cased and followed by a blank line.
#   * Other lines lose their inline markup; empty lines are dropped.
# Inputs:  markdown on stdin
# Outputs: plain text on stdout
#######################################
convert_markdown() {
  local line text
  local line_no=0 in_yaml=0 in_fence=0 in_shortcode=0
  local -r heading_re='^#{1,6}([[:space:]]+(.*))?$'
  local -r fence_re='^[[:space:]]*(```|~~~)'
  local -r callout_re='callout type="([^"]*)"'

  # `|| [[ -n "$line" ]]` keeps a final line that has no trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    line_no=$((line_no + 1))
    line=${line%$'\r'} # tolerate CRLF line endings

    # --- YAML front matter -------------------------------------------
    if [[ "$line" == "---" ]]; then
      if ((line_no == 1)); then
        in_yaml=1
        printf 'DOCUMENT METADATA:\n'
        continue
      elif ((in_yaml)); then
        in_yaml=0
        printf '\n'
        continue
      fi
      # Any other "---" is a horizontal rule and is handled as text.
    fi
    if ((in_yaml)); then
      clean_frontmatter_line "$line"
      continue
    fi

    # --- Continuation of a shortcode tag spanning several lines -------
    if ((in_shortcode)); then
      if [[ "$line" == *">}}"* || "$line" == *"%}}"* ]]; then
        in_shortcode=0
      fi
      continue
    fi

    # --- Fenced code blocks: copy through untouched -------------------
    if [[ "$line" =~ $fence_re ]]; then
      in_fence=$((1 - in_fence))
      printf '%s\n' "$line"
      continue
    fi
    if ((in_fence)); then
      printf '%s\n' "$line"
      continue
    fi

    # --- Shortcode tags -------------------------------------------------
    if [[ "$line" == *"{{<"* || "$line" == *"{{%"* ]]; then
      if [[ "$line" =~ $callout_re ]]; then
        printf '[%s note]\n' "${BASH_REMATCH[1]}"
      fi
      # A tag that is not closed on this line continues on later lines;
      # only then are the following lines skipped.
      if [[ "$line" != *">}}"* && "$line" != *"%}}"* ]]; then
        in_shortcode=1
        continue
      fi
      line=$(strip_shortcode_tags "$line")
      if [[ -z "${line//[[:space:]]/}" ]]; then
        continue
      fi
    fi

    # --- Headings ---------------------------------------------------------
    if [[ "$line" =~ $heading_re ]]; then
      # tr instead of ${var^^}: the latter needs bash 4.
      printf '%s\n' "${BASH_REMATCH[2]:-}" | tr '[:lower:]' '[:upper:]'
      printf '\n'
      continue
    fi

    # --- Regular text -----------------------------------------------------
    # Only lines that can contain markup are piped through sed; this
    # avoids a fork for the (majority of) plain lines.
    case "$line" in
      *'['* | *'*'* | *'`'*) text=$(strip_inline_formatting "$line") ;;
      *) text=$line ;;
    esac
    if [[ -n "$text" ]]; then
      printf '%s\n' "$text"
    fi
  done
}

#######################################
# Build the combined file.
# Arguments: $1 - content directory (optional, see usage above)
#            $2 - output file (optional, see usage above)
# Outputs:   progress and a summary on stdout, errors on stderr
# Returns:   0 on success, 1 when the content directory is missing or
#            the output file cannot be written
#######################################
main() {
  local script_dir content_dir output_file file
  local count=0
  local -r header="This file was automatically generated from the latest available content at docs.entropy.xyz. It's intended to be used by AI and LLM bots to gather data into a query for retrieval-augmented generation."

  # Defaults are resolved against the script's own location so that the
  # script works regardless of the caller's working directory.
  script_dir=$(cd "$(dirname -- "${BASH_SOURCE[0]}")" && pwd) || return 1
  content_dir=${1:-"$script_dir/../content"}
  output_file=${2:-"$script_dir/../static/rag.txt"}

  if [[ ! -d "$content_dir" ]]; then
    printf 'Error: content directory not found: %s\n' "$content_dir" >&2
    return 1
  fi
  if ! mkdir -p -- "$(dirname -- "$output_file")"; then
    printf 'Error: cannot create directory for %s\n' "$output_file" >&2
    return 1
  fi

  # Create (or truncate) the output file and write the header line.
  if ! printf '%s\n' "$header" >"$output_file"; then
    printf 'Error: cannot write to %s\n' "$output_file" >&2
    return 1
  fi

  # NUL-delimited paths survive any file name; LC_ALL=C makes the order
  # independent of the locale of whoever regenerates the file.
  while IFS= read -r -d '' file; do
    printf 'Processing: %s\n' "$file"
    {
      printf '\n\n=== START OF FILE: %s ===\n\n' "${file##*/}"
      convert_markdown <"$file"
      printf '\n=== END OF FILE: %s ===\n\n' "${file##*/}"
    } >>"$output_file"
    count=$((count + 1))
  done < <(find "$content_dir" -type f -name '*.md' -print0 | LC_ALL=C sort -z)

  printf 'All markdown files have been combined into %s\n' "$output_file"
  printf 'Total files processed: %d\n' "$count"
  printf 'Total size of combined file: %s\n' "$(du -h "$output_file" | cut -f1)"
}

main "$@"

0 commit comments

Comments
 (0)