Skip to content

Commit 90e4c1f

Browse files
authored
add pageinfo.py tool (n8n-io#2816)
1 parent 22e4ff8 commit 90e4c1f

File tree

2 files changed

+99
-0
lines changed

2 files changed

+99
-0
lines changed

.gitignore

+5
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,8 @@ __pycache__/
1616

1717
## Ignore pyenv configuration
1818
.python-version
19+
20+
## Ignore ephemeral doc-tool output
21+
22+
output.csv
23+
_doctools/*.csv

_doctools/pageinfo.py

+94
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
#!/usr/bin/env python3
2+
import os
3+
import yaml
4+
import argparse
5+
import re
6+
import glob
7+
import csv
8+
9+
# Default path of the CSV report written by save_to_csv(); it is relative,
# so the file is created in the current working directory.
OUTPUT_CSV = "pageinfo.csv"
10+
11+
def extract_frontmatter_and_content(filepath):
    """Extract the YAML frontmatter and the main content from a Markdown file.

    The frontmatter is the text between a leading ``---`` line and the next
    ``---`` line; everything after the closing delimiter is the main content.

    Args:
        filepath: Path of the Markdown file to read (decoded as UTF-8).

    Returns:
        A ``(frontmatter, main_content)`` tuple. ``frontmatter`` is the parsed
        YAML mapping, or ``None`` when the file has no frontmatter block, the
        YAML is invalid, or it does not parse to a mapping. ``main_content``
        is the stripped text after the frontmatter, or the whole stripped
        file when no frontmatter block is found.
    """
    # "utf-8-sig" drops a leading byte-order mark. With plain "utf-8" the BOM
    # stays in the text, so the "^---" anchor below never matches and the BOM
    # ends up glued to the first word of the content.
    with open(filepath, "r", encoding="utf-8-sig") as f:
        content = f.read()

    # The closing delimiter may be the very last line of the file, so the
    # newline after it (and the body that follows) is optional.
    match = re.match(r"^---\n(.*?)\n---(?:\n(.*))?\Z", content, re.DOTALL)
    if not match:
        return None, content.strip()

    try:
        frontmatter = yaml.safe_load(match.group(1))  # Parse YAML
    except yaml.YAMLError:
        frontmatter = None  # Invalid YAML

    # Callers look keys up in the result, so anything that is not a mapping
    # (e.g. a bare string or a list) is treated as "no frontmatter".
    if not isinstance(frontmatter, dict):
        frontmatter = None

    main_content = (match.group(2) or "").strip()
    return frontmatter, main_content
30+
31+
def count_words(text):
    """Return how many whitespace-separated words *text* contains."""
    # Deliberately naive: a rough figure is good enough for most uses.
    words = text.split()
    return len(words)
35+
36+
def find_markdown_files(directory):
    """Recursively find all Markdown (``*.md``) files below *directory*.

    Args:
        directory: Root directory to scan.

    Returns:
        A sorted list of the matching file paths; empty when nothing matches.
    """
    # Escape the root so glob metacharacters in its name ("[", "]", "*", "?")
    # are taken literally; only the "**/*.md" part is meant as a pattern.
    pattern = os.path.join(glob.escape(directory), "**", "*.md")
    # glob returns matches in arbitrary order; sort so runs are reproducible.
    return sorted(glob.glob(pattern, recursive=True))
39+
40+
def save_to_csv(data, filename=OUTPUT_CSV):
    """Save extracted data to a CSV file with dynamic contentType columns.

    The header has one ``ContentType_<n>`` column per entry of the longest
    contentType list in *data*; rows with fewer entries are padded with
    empty cells.

    Args:
        data: Sequence of ``(path, content_type, word_count)`` rows, where
            ``content_type`` is either a list of values or a single value.
        filename: Path of the CSV file to write; an existing file is
            overwritten.
    """
    # A non-list contentType occupies one column. default=1 keeps the header
    # well-formed for empty data instead of letting max() raise ValueError.
    max_types = max(
        (
            len(content_type) if isinstance(content_type, list) else 1
            for _, content_type, _ in data
        ),
        default=1,
    )
    headers = ["Filename", "WordCount"] + [
        f"ContentType_{i}" for i in range(1, max_types + 1)
    ]

    with open(filename, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(headers)  # Write CSV header

        # The row's path gets its own name so it does not shadow `filename`.
        for path, content_type, word_count in data:
            types = content_type if isinstance(content_type, list) else [content_type]
            padding = [""] * (max_types - len(types))
            writer.writerow([path, word_count, *types, *padding])
55+
56+
def main(directory, print_output):
    """Find Markdown files, extract 'contentType' and word count, then report.

    Each file's results are optionally printed, and all of them are saved to
    the CSV file named by ``OUTPUT_CSV``.

    Args:
        directory: Root directory to scan for Markdown files.
        print_output: When true, also print each file's results to the console.
    """
    extracted_data = []

    for file in find_markdown_files(directory):
        frontmatter, main_content = extract_frontmatter_and_content(file)
        word_count = count_words(main_content)

        # Only a mapping can carry a contentType key. On a string, "in" is a
        # substring test, and indexing a string or list by key raises.
        content_type = ""
        if isinstance(frontmatter, dict):
            content_type = frontmatter.get("contentType", "")
        if content_type is None:
            content_type = ""  # key present but without a value

        if print_output:
            # Convert list to comma-separated string for printing; items are
            # stringified first because list entries need not be strings.
            if isinstance(content_type, list):
                content_str = ", ".join(str(item) for item in content_type)
            else:
                content_str = str(content_type)

            print(f"File: {file}")
            print(f"Word Count: {word_count}")
            print(f"contentType: {content_str}\n")

        extracted_data.append([file, content_type, word_count])

    if extracted_data:
        save_to_csv(extracted_data)
        print(f"Results saved to {OUTPUT_CSV}")
    else:
        # Say so explicitly: a mistyped directory would otherwise exit
        # silently without writing anything.
        print(f"No Markdown files found in {directory}")
86+
87+
if __name__ == "__main__":
    # Command-line entry point: parse the options and hand them to main().
    cli = argparse.ArgumentParser(
        description="Extract contentType and word count from Markdown files."
    )
    cli.add_argument(
        "--dir",
        type=str,
        default="../docs",
        help="Directory to scan (default: '../docs')",
    )
    cli.add_argument(
        "--print",
        action="store_true",
        help="Print output to console (default: False, only CSV)",
    )

    options = cli.parse_args()
    main(options.dir, options.print)
94+

0 commit comments

Comments
 (0)