| 
 | 1 | +#!/usr/bin/env python3  | 
 | 2 | +import os  | 
 | 3 | +import yaml  | 
 | 4 | +import argparse  | 
 | 5 | +import re  | 
 | 6 | +import glob  | 
 | 7 | +import csv  | 
 | 8 | + | 
# Default path of the CSV report; a relative path resolves against the
# current working directory.
OUTPUT_CSV = "pageinfo.csv"
 | 10 | + | 
 | 11 | +def extract_frontmatter_and_content(filepath):  | 
 | 12 | +    """Extracts frontmatter and main content from a Markdown file."""  | 
 | 13 | +    with open(filepath, "r", encoding="utf-8") as f:  | 
 | 14 | +        content = f.read()  | 
 | 15 | + | 
 | 16 | +    # Match YAML frontmatter with regex (--- as delimiter)  | 
 | 17 | +    match = re.match(r"^---\n(.*?)\n---\n(.*)", content, re.DOTALL)  | 
 | 18 | +      | 
 | 19 | +    if match:  | 
 | 20 | +        try:  | 
 | 21 | +            frontmatter = yaml.safe_load(match.group(1))  # Parse YAML  | 
 | 22 | +        except yaml.YAMLError:  | 
 | 23 | +            frontmatter = None  # Invalid YAML  | 
 | 24 | +        main_content = match.group(2).strip()  | 
 | 25 | +    else:  | 
 | 26 | +        frontmatter = None  | 
 | 27 | +        main_content = content.strip()  | 
 | 28 | + | 
 | 29 | +    return frontmatter, main_content  | 
 | 30 | + | 
 | 31 | +def count_words(text):  | 
 | 32 | +    """Counts the number of words in the given text."""  | 
 | 33 | +    # yes, this is fairly simplistic, but a general idea is fine for most uses   | 
 | 34 | +    return len(text.split())  | 
 | 35 | + | 
 | 36 | +def find_markdown_files(directory):  | 
 | 37 | +    """Recursively finds all markdown files in the given directory."""  | 
 | 38 | +    return glob.glob(os.path.join(directory, "**", "*.md"), recursive=True)  | 
 | 39 | + | 
 | 40 | +def save_to_csv(data, filename=OUTPUT_CSV):  | 
 | 41 | +    """Saves extracted data to a CSV file with dynamic contentType columns."""  | 
 | 42 | +    max_types = max((len(row[1]) if isinstance(row[1], list) else 1) for row in data)  | 
 | 43 | +    headers = ["Filename", "WordCount"] + [f"ContentType_{i+1}" for i in range(max_types)]  | 
 | 44 | + | 
 | 45 | +    with open(filename, "w", newline="", encoding="utf-8") as csvfile:  | 
 | 46 | +        writer = csv.writer(csvfile)  | 
 | 47 | +        writer.writerow(headers)  # Write CSV header  | 
 | 48 | + | 
 | 49 | +        for filename, content_type, word_count in data:  | 
 | 50 | +            if isinstance(content_type, list):  | 
 | 51 | +                row = [filename, word_count] + content_type + [""] * (max_types - len(content_type))  | 
 | 52 | +            else:  | 
 | 53 | +                row = [filename, word_count, content_type] + [""] * (max_types - 1)  | 
 | 54 | +            writer.writerow(row)  | 
 | 55 | + | 
 | 56 | +def main(directory,print_output):  | 
 | 57 | +    """Finds Markdown files, extracts 'contentType' and word count, then prints and saves results."""  | 
 | 58 | +    md_files = find_markdown_files(directory)  | 
 | 59 | +    extracted_data = []  | 
 | 60 | + | 
 | 61 | +    for file in md_files:  | 
 | 62 | +        frontmatter, main_content = extract_frontmatter_and_content(file)  | 
 | 63 | +        word_count = count_words(main_content)  | 
 | 64 | + | 
 | 65 | +        if frontmatter and "contentType" in frontmatter:  | 
 | 66 | +            content_type = frontmatter["contentType"]  | 
 | 67 | +        else:  | 
 | 68 | +            content_type = ""  | 
 | 69 | + | 
 | 70 | +        # Convert list to comma-separated string for printing  | 
 | 71 | +        if isinstance(content_type, list):  | 
 | 72 | +            content_str = ", ".join(content_type)  | 
 | 73 | +        else:  | 
 | 74 | +            content_str = str(content_type)  | 
 | 75 | + | 
 | 76 | +        if print_output:  | 
 | 77 | +            print(f"File: {file}")  | 
 | 78 | +            print(f"Word Count: {word_count}")  | 
 | 79 | +            print(f"contentType: {content_str}\n")  | 
 | 80 | + | 
 | 81 | +        extracted_data.append([file, content_type, word_count])  | 
 | 82 | + | 
 | 83 | +    if extracted_data:  | 
 | 84 | +        save_to_csv(extracted_data)  | 
 | 85 | +        print(f"Results saved to {OUTPUT_CSV}")  | 
 | 86 | + | 
 | 87 | +if __name__ == "__main__":  | 
 | 88 | +    parser = argparse.ArgumentParser(description="Extract contentType and word count from Markdown files.")  | 
 | 89 | +    parser.add_argument("--dir", type=str, default="../docs", help="Directory to scan (default: '../docs')")  | 
 | 90 | +    parser.add_argument("--print", action="store_true", help="Print output to console (default: False, only CSV)")  | 
 | 91 | + | 
 | 92 | +    args = parser.parse_args()  | 
 | 93 | +    main(args.dir, args.print)  | 
 | 94 | + | 
0 commit comments