obsidian-clippings-scripts/organize_files.py at main · devandapaige/obsidian-clippings-scripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
#!/usr/bin/env python3
"""
Organize clippings (initial thoughts/intuitions) from Clippings folder into Archives.

These files contain personal reflections captured after reading/watching longer-form content,
not full articles. They are organized by category into the Archives folder structure.
"""
import os
import shutil
import sys
from pathlib import Path
from typing import Dict, List, Tuple, Optional

def parse_categorization_file(filepath: str) -> List[Tuple[str, str, Optional[str]]]:
    """Read a pipe-delimited categorization file into (filename, primary, secondary) tuples.

    Each useful line has the form ``filename | primary [| secondary ...]``.
    Blank lines and lines starting with ``#`` are ignored, as are lines with
    fewer than two fields and lines whose primary field is empty or the
    literal placeholder ``CATEGORY``. ``secondary`` is ``None`` when the line
    has no third field; fields beyond the third are ignored.
    """
    parsed: List[Tuple[str, str, Optional[str]]] = []
    with open(filepath, 'r', encoding='utf-8') as handle:
        for raw in handle:
            record = raw.strip()
            if record == '' or record.startswith('#'):
                continue

            fields = [field.strip() for field in record.split('|')]
            if len(fields) < 2:
                continue

            name, category = fields[0], fields[1]
            # An empty primary or the template placeholder means "not categorized yet".
            if category in ('', 'CATEGORY'):
                continue

            extra = fields[2] if len(fields) > 2 else None
            parsed.append((name, category, extra))
    return parsed

def normalize_filename(filename: str) -> str:
    """Return a lowercase, punctuation-free form of *filename* for comparison.

    Only alphanumeric characters, spaces and hyphens survive; runs of spaces
    are collapsed to a single space and leading/trailing spaces are dropped.
    Note that the dot of a file extension is removed too ("a.md" -> "amd").
    """
    kept = [ch for ch in filename.lower() if ch in (' ', '-') or ch.isalnum()]
    # split()/join collapses repeated spaces and trims both ends in one go.
    return ' '.join(''.join(kept).split())

def extract_insights_from_file(filepath: str) -> Tuple[str, str, Optional[str]]:
    """
    Extract title, user's thoughts, and date from a clipping file.

    The file may start with a frontmatter block delimited by ``---`` lines.
    Inside it, ``title:`` supplies the title (surrounding quotes removed) and
    ``created:`` supplies the date; ``published:`` is used for the date only
    when no date has been seen yet. Everything after the closing ``---`` line
    is returned as the thoughts. A file that does not start with ``---`` is
    treated as thoughts only.

    The closing delimiter must be a line consisting solely of ``---``, so a
    ``---`` that merely appears inside a frontmatter value (for example in a
    title) does not end the block early.

    If the title is missing, or the file cannot be read, the filename without
    its extension is used as the title. An unreadable file, or a frontmatter
    block that is never closed, yields empty thoughts and no date.

    Returns (title, thoughts, date) tuple.
    """
    fallback_title = os.path.splitext(os.path.basename(filepath))[0]

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, ValueError):
        # Best effort: unreadable or undecodable files (UnicodeDecodeError is
        # a ValueError) still get an entry, identified by their filename.
        return fallback_title, "", None

    if not content.startswith('---'):
        return fallback_title, content.strip(), None

    lines = content.split('\n')
    # rstrip() tolerates trailing spaces and the '\r' of CRLF line endings.
    closing_idx = next(
        (i for i, line in enumerate(lines[1:], 1) if line.rstrip() == '---'),
        None,
    )
    if closing_idx is None:
        # Frontmatter opened but never closed: nothing can be told apart.
        return fallback_title, "", None

    title = None
    date = None
    for line in lines[1:closing_idx]:
        if line.startswith('title:'):
            title = line.split('title:', 1)[1].strip().strip('"').strip("'")
        elif line.startswith('created:'):
            date = line.split('created:', 1)[1].strip()
        elif line.startswith('published:') and not date:
            # Use published date if created date not available
            date = line.split('published:', 1)[1].strip()

    thoughts = '\n'.join(lines[closing_idx + 1:]).strip()
    return title or fallback_title, thoughts, date

def append_to_insights_index(title: str, thoughts: str, file_path: str, date: Optional[str] = None):
    """Append an entry to the insights index file, avoiding duplicates.

    The index lives at ``Archives/INSIGHTS.md`` relative to the current
    working directory. It (and the ``Archives`` directory) is created with a
    short header if it does not exist yet.

    An entry counts as already present when some line of the index is exactly
    ``## <title>``; in that case nothing is written. Matching whole lines
    keeps deeper headings such as ``### <title>`` inside another entry's
    thoughts from being mistaken for an existing entry.

    Args:
        title: Heading text for the entry.
        thoughts: Body text; omitted from the entry when empty.
        file_path: Path of the archived file relative to ``Archives``.
        date: Optional date string, written verbatim when provided.
    """
    insights_file = os.path.join('Archives', 'INSIGHTS.md')
    heading = f"## {title}"

    # Check if entry already exists
    try:
        with open(insights_file, 'r', encoding='utf-8') as f:
            if any(line.rstrip('\n') == heading for line in f):
                return
        index_exists = True
    except FileNotFoundError:
        index_exists = False

    # Build the whole entry first so it is written in one pass
    entry = [f"## {title}\n\n"]
    if date:
        entry.append(f"**Date:** {date}\n\n")
    if thoughts:
        # Show full thoughts (these are your personal insights, so keep them complete)
        entry.append(f"{thoughts}\n\n")
    entry.append(f"📍 `Archives/{file_path}`\n\n")
    entry.append("---\n\n")

    os.makedirs(os.path.dirname(insights_file), exist_ok=True)
    with open(insights_file, 'a', encoding='utf-8') as f:
        if not index_exists:
            # Create file with header if it doesn't exist
            f.write("# Archives Insights Index\n\n")
            f.write("This file contains a chronological index of all your insights and thoughts from consumed content.\n\n")
            f.write("---\n\n")
        f.writelines(entry)

def find_matching_file(filename: str, directory: str, threshold: float = 0.8) -> Optional[str]:
    """Find the entry of *directory* whose name best matches *filename*.

    Names are compared after ``normalize_filename``. A candidate is considered
    only if one normalized name starts with the first 50 characters of the
    other. Its score is the length of the common prefix divided by the length
    of the longer normalized name, so identical names score 1.0.

    Hidden entries (names starting with ``.``) are skipped, and names that
    normalize to an empty string never match. Candidates are visited in sorted
    order so that ties are resolved the same way on every run.

    Args:
        filename: Name to look for (typically taken from the categorization file).
        directory: Directory whose entries are searched (not recursive).
        threshold: Minimum score, exclusive, that the best candidate must exceed.

    Returns:
        The actual entry name of the best candidate, or ``None`` if no
        candidate scores above *threshold*.
    """
    prefix_len = 50

    normalized_target = normalize_filename(filename)
    if not normalized_target:
        # Nothing meaningful to compare; also avoids a 0/0 score below.
        return None

    best_match = None
    best_ratio = 0.0

    for actual_file in sorted(os.listdir(directory)):
        if actual_file.startswith('.'):  # Skip hidden files
            continue

        normalized_actual = normalize_filename(actual_file)
        if not normalized_actual:
            continue

        # Cheap pre-filter: the names must agree on their leading characters.
        prefixes_agree = (
            normalized_actual.startswith(normalized_target[:prefix_len])
            or normalized_target.startswith(normalized_actual[:prefix_len])
        )
        if not prefixes_agree:
            continue

        shared = len(os.path.commonprefix([normalized_target, normalized_actual]))
        ratio = shared / max(len(normalized_target), len(normalized_actual))
        if ratio > best_ratio:
            best_ratio = ratio
            best_match = actual_file

    # Only return a match if it's a good match (by default more than 80% similar)
    return best_match if best_ratio > threshold else None

def create_directory_structure():
    """Ensure the fixed Archives category tree exists.

    Creates ``Archives/<category>/<subcategory>`` for every predefined pair,
    relative to the current working directory. Directories that already exist
    are left alone, so the call is safe to repeat.
    """
    archive_root = Path('Archives')
    layout = (
        ('AI-and-Technology',
         ('AI-Limitations', 'Tech-Competition', 'Privacy-Surveillance', 'Open-Source-AI')),
        ('Media-and-Communication',
         ('Media-Transformation', 'Social-Platforms', 'Communication-Frameworks')),
        ('Business-and-Finance',
         ('Luxury-Markets', 'Corporate-Ethics', 'Marketing-Strategy')),
        ('Society-and-Human-Understanding',
         ('Dehumanization-Propaganda', 'Impact-vs-Intent', 'Religion-Narratives')),
        ('Personal-Development',
         ('Generalist-Resources', 'Neurodiversity-Tools')),
    )

    for top_level, children in layout:
        for child in children:
            # parents=True creates Archives/ and the category folder as needed.
            archive_root.joinpath(top_level, child).mkdir(parents=True, exist_ok=True)

def _move_verified(source_path: str, dest_path: str) -> None:
    """Copy a file, verify the copy by existence and size, then delete the original."""
    shutil.copy2(source_path, dest_path)

    # Verify copy succeeded before removing original
    if not os.path.exists(dest_path):
        raise OSError(f"Copy verification failed: {dest_path} does not exist")

    # Verify file sizes match (basic integrity check)
    source_size = os.path.getsize(source_path)
    dest_size = os.path.getsize(dest_path)
    if source_size != dest_size:
        raise OSError(f"Copy verification failed: size mismatch (source: {source_size}, dest: {dest_size})")

    # Only remove original after successful copy and verification
    os.remove(source_path)


def _link_into_secondary(primary_path: str, secondary: str, actual_filename: str) -> None:
    """Create (or replace) a relative symlink to the archived file under Archives/<secondary>."""
    secondary_path = os.path.join('Archives', secondary)
    os.makedirs(secondary_path, exist_ok=True)

    # The link target is resolved relative to the directory holding the link.
    rel_path = os.path.relpath(primary_path, secondary_path)
    link_path = os.path.join(secondary_path, actual_filename)

    # lexists() also sees dangling symlinks, which exists() reports as absent
    # and which would otherwise make os.symlink fail with FileExistsError.
    if os.path.lexists(link_path):
        os.remove(link_path)
    os.symlink(rel_path, link_path)


def organize_files(categorization_file: str):
    """
    Main function to organize clippings (initial thoughts/intuitions) into Archives.

    Moves files from Clippings directory to appropriate Archive subdirectories
    based on the categorization file. Files contain personal reflections, not full articles.

    For every categorization entry the matching file in ``Clippings`` is
    copied to ``Archives/<primary>/``, verified, and only then removed from
    ``Clippings``. After a successful move the file is added to the insights
    index and, if a secondary category is given, symlinked into
    ``Archives/<secondary>/``. All paths are relative to the current working
    directory. Progress and a final summary are printed to stdout.

    Counting rules: a file is "skipped" when it cannot be found or cannot be
    moved. A failure after the move (indexing or linking) is reported but the
    file still counts as moved only.

    If ``Clippings`` has no visible ``.md`` files and at least one entry is
    already present in Archives, the function exits without doing anything.

    Args:
        categorization_file: Path to the pipe-delimited categorization file.
    """
    if not os.path.exists('Clippings'):
        print("Error: Clippings directory not found!")
        return

    entries = parse_categorization_file(categorization_file)

    # Check if Clippings directory is empty
    clippings_files = [f for f in os.listdir('Clippings') if not f.startswith('.') and f.endswith('.md')]
    if not clippings_files:
        print("Warning: Clippings directory appears to be empty!")
        print("This might mean files were already moved. Checking Archives...")
        # Count files in Archives that match our categorization
        found_count = sum(
            1
            for filename, primary, _ in entries
            if os.path.exists(os.path.join('Archives', primary, filename))
        )
        if found_count > 0:
            print(f"Found {found_count}/{len(entries)} files already in Archives.")
            print("Files may have already been organized. Exiting to prevent duplicate processing.")
            return
        print("No matching files found in Archives. Proceeding anyway...")

    create_directory_structure()

    stats = {'moved': 0, 'linked': 0, 'skipped': 0}
    total = len(entries)

    for idx, (filename, primary, secondary) in enumerate(entries, 1):
        print(f"[{idx}/{total}] Processing: {filename}")

        # Find matching file in Clippings directory
        actual_filename = find_matching_file(filename, 'Clippings')
        if not actual_filename:
            print("  ✕ File not found in Clippings directory")
            stats['skipped'] += 1
            continue

        source_path = os.path.join('Clippings', actual_filename)
        primary_path = os.path.join('Archives', primary, actual_filename)

        # Phase 1: move the file. Any failure here leaves the original in
        # Clippings, so the entry is counted as skipped.
        try:
            # Create primary category directory if it doesn't exist
            os.makedirs(os.path.dirname(primary_path), exist_ok=True)
            _move_verified(source_path, primary_path)
        except Exception as e:
            print(f"  ✕ Error processing file: {str(e)}")
            stats['skipped'] += 1
            continue

        print(f"  → Copied to Archives/{primary}/")
        stats['moved'] += 1

        # Phase 2: index and link. The file is already safely archived, so a
        # failure here is reported without also counting the file as skipped.
        try:
            # Extract insights and append to insights index
            title, thoughts, date = extract_insights_from_file(primary_path)
            archive_path = f"{primary}/{actual_filename}"
            append_to_insights_index(title, thoughts, archive_path, date)
            print("  → Added to insights index")

            # Handle secondary category if specified
            if secondary and secondary != 'CATEGORY':
                _link_into_secondary(primary_path, secondary, actual_filename)
                print(f"  → Linked to Archives/{secondary}/")
                stats['linked'] += 1
        except Exception as e:
            print(f"  ✕ Error processing file: {str(e)}")

    print("\nOrganization complete!")
    print(f"  • Files moved: {stats['moved']}")
    print(f"  • Secondary links created: {stats['linked']}")
    print(f"  • Files skipped: {stats['skipped']}")

if __name__ == '__main__':
    # The categorization file may be given as the first command-line
    # argument; otherwise fall back to 'categorize_all.txt' in the
    # current working directory.
    input_file = 'categorize_all.txt'
    if len(sys.argv) > 1:
        input_file = sys.argv[1]
    organize_files(input_file)