|
| 1 | +#!/usr/bin/env python3 |
| 2 | +import os |
| 3 | +import yaml |
| 4 | +import argparse |
| 5 | +import re |
| 6 | +import glob |
| 7 | +import csv |
| 8 | + |
| 9 | +OUTPUT_CSV = "pageinfo.csv" |
| 10 | + |
def extract_frontmatter_and_content(filepath):
    """Split a Markdown file into its YAML frontmatter and its body text.

    Frontmatter is the block between a ``---`` line at the very start of the
    file and the next ``---`` line. The closing delimiter may be the last
    line of the file, with or without a trailing newline.

    Args:
        filepath: Path of the Markdown file to read.

    Returns:
        A ``(frontmatter, main_content)`` tuple. ``frontmatter`` is the value
        produced by ``yaml.safe_load`` for the frontmatter block, or ``None``
        when the file has no frontmatter block or the block is not valid
        YAML. ``main_content`` is the text after the frontmatter (or the
        whole file when there is none), stripped of surrounding whitespace.
    """
    # "utf-8-sig" also reads plain UTF-8, but drops a leading byte-order mark;
    # left in place, the BOM would stop the "^---" match below and the
    # frontmatter would be counted as body text.
    with open(filepath, "r", encoding="utf-8-sig") as f:
        content = f.read()

    # The newline and body after the closing delimiter are optional so that a
    # file ending right after "---" is still recognised as having frontmatter.
    match = re.match(r"^---\n(.*?)\n---(?:\n(.*))?\Z", content, re.DOTALL)
    if not match:
        return None, content.strip()

    try:
        frontmatter = yaml.safe_load(match.group(1))  # Parse YAML
    except yaml.YAMLError:
        frontmatter = None  # Invalid YAML: still report the body text

    # group(2) is None when nothing follows the closing delimiter
    main_content = (match.group(2) or "").strip()
    return frontmatter, main_content
| 30 | + |
def count_words(text):
    """Return how many whitespace-separated tokens *text* contains."""
    # Deliberately rough: Markdown syntax and punctuation count as words too,
    # which is good enough for a general size estimate.
    tokens = text.split()
    return len(tokens)
| 35 | + |
def find_markdown_files(directory):
    """Return the paths matched by the recursive glob ``<directory>/**/*.md``."""
    # "**" only descends into subdirectories because recursive=True is passed.
    pattern = os.path.join(directory, "**", "*.md")
    return glob.glob(pattern, recursive=True)
| 39 | + |
def save_to_csv(data, filename=OUTPUT_CSV):
    """Write the extracted rows to a CSV file with one column per contentType value.

    The header is ``Filename, WordCount, ContentType_1 .. ContentType_N``,
    where N is the largest number of contentType values on any row. Rows with
    fewer values are padded with empty cells.

    Args:
        data: Iterable of ``(path, content_type, word_count)`` rows, where
            ``content_type`` is either a list of values or a single value.
            An empty iterable produces a header-only file.
        filename: Path of the CSV file to (over)write.
    """
    # Materialise once: the rows are scanned twice (column count, then output).
    rows = list(data)

    # A non-list contentType occupies exactly one column. default=1 keeps the
    # header well-formed instead of raising ValueError when there are no rows.
    max_types = max(
        (len(row[1]) if isinstance(row[1], list) else 1 for row in rows),
        default=1,
    )
    headers = ["Filename", "WordCount"] + [f"ContentType_{i+1}" for i in range(max_types)]

    with open(filename, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(headers)  # Write CSV header

        # `path` rather than `filename`, so the parameter is not shadowed.
        for path, content_type, word_count in rows:
            values = content_type if isinstance(content_type, list) else [content_type]
            padding = [""] * (max_types - len(values))
            writer.writerow([path, word_count] + values + padding)
| 55 | + |
def main(directory,print_output):
    """Scan *directory* for Markdown files and record contentType and word count.

    For each file found by ``find_markdown_files``, the frontmatter's
    ``contentType`` value (or ``""`` when absent) and the body word count are
    collected. When at least one file was found, the results are written
    with ``save_to_csv`` and a confirmation line is printed.

    Args:
        directory: Root directory to scan.
        print_output: When true, also print each file's details to the console.
    """
    md_files = find_markdown_files(directory)
    extracted_data = []

    for md_path in md_files:
        frontmatter, main_content = extract_frontmatter_and_content(md_path)
        word_count = count_words(main_content)

        # Frontmatter can parse to a non-mapping (e.g. a bare YAML scalar);
        # only a dict can carry a contentType key.
        if isinstance(frontmatter, dict) and "contentType" in frontmatter:
            content_type = frontmatter["contentType"]
        else:
            content_type = ""

        if print_output:
            # Convert list to comma-separated string for printing. Items are
            # passed through str() because YAML lists may hold non-strings.
            if isinstance(content_type, list):
                content_str = ", ".join(str(item) for item in content_type)
            else:
                content_str = str(content_type)

            print(f"File: {md_path}")
            print(f"Word Count: {word_count}")
            print(f"contentType: {content_str}\n")

        extracted_data.append([md_path, content_type, word_count])

    if extracted_data:
        save_to_csv(extracted_data)
        print(f"Results saved to {OUTPUT_CSV}")
| 86 | + |
if __name__ == "__main__":
    # Command-line entry point: parse the options, then hand them to main().
    cli = argparse.ArgumentParser(
        description="Extract contentType and word count from Markdown files."
    )
    cli.add_argument(
        "--dir",
        type=str,
        default="../docs",
        help="Directory to scan (default: '../docs')",
    )
    cli.add_argument(
        "--print",
        action="store_true",
        help="Print output to console (default: False, only CSV)",
    )

    options = cli.parse_args()
    main(options.dir, options.print)
| 94 | + |
0 commit comments