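"""Query my Obsidian bullet journals with LlamaIndex.

On first run this builds a vector index over the pipe-separated entries of each
journal markdown file and persists it to ./storage; later runs reload the saved
index and answer the question passed on the command line.

Usage: python 10_journal_x.py -q "what are places I ate at in March and April?"
"""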
import argparse
import logging
import sys
from pathlib import Path

from dotenv import load_dotenv
from llama_index import (
    GPTVectorStoreIndex,
    StorageContext,
    load_index_from_storage,
)

# log token counts and usage for the LLM and embedding calls
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

# read environment variables from a local .env (e.g. the OPENAI_API_KEY used by
# llama_index's default OpenAI LLM and embedding models)
load_dotenv()

OBSIDIAN_DIR = "/home/samuel/vaults/fragments/journals"


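# Each journal file is assumed to hold pipe-separated bullet rows, e.g. a line
# like "2023-03-04 | lunch | ramen place downtown" (hypothetical example); the
# helper below picks the first such paragraph and strips everything up to the
# first pipe.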
def read_journal_md(file_path):
    from bs4 import BeautifulSoup
    import markdown
    import re

    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()
    html = markdown.markdown(text)
    soup = BeautifulSoup(html, "html.parser")

    # keep only the <p> tags that contain at least two `|` characters,
    # i.e. the pipe-separated journal rows
    ps = [p for p in soup.find_all("p") if p.text.count("|") > 1]
    if not ps:
        print(f"No pipe-separated entries found in {file_path}, skipping")
        return ""

    # drop everything up to and including the first `|` (the leading column)
    result = re.sub(r'^[^|]*\|', '', ps[0].text)

    print(f"Finished processing {file_path}")
    return result


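# Build Documents by hand from the trimmed journal text using the lower-level
# Document struct, so only the pipe-separated entries end up embedded.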
def create_journal_nodes(dir_path):
    """
    Parse each journal file in `dir_path` into Documents and nodes.

    Examples: https://gpt-index.readthedocs.io/en/stable/guides/primer/usage_pattern.html
    """
    from llama_index import Document
    from llama_index.node_parser import SimpleNodeParser

    docs = []
    parser = SimpleNodeParser()

    # loop through each markdown file in the directory
    for file_path in Path(dir_path).glob("*.md"):
        md = read_journal_md(file_path)
        if not md:
            continue  # nothing usable extracted from this file
        # construct documents manually using the lower-level Document struct
        docs.append(Document(text=md))

    nodes = parser.get_nodes_from_documents(docs)
    return nodes, docs

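# Load the persisted index if ./storage already exists; otherwise build it from
# the journals and persist it so later runs skip re-parsing and re-embedding.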
if Path("./storage").exists():
    storage_context = StorageContext.from_defaults(persist_dir="./storage")
    index = load_index_from_storage(storage_context)
else:
    nodes, docs = create_journal_nodes(OBSIDIAN_DIR)
    index = GPTVectorStoreIndex(nodes)
    index.storage_context.persist(persist_dir="./storage")

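# CLI entry point: wrap the index in a query engine and answer a single
# question passed via -q/--query.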
if __name__ == "__main__":
    # Usage: python 10_journal_x.py -q "what are places I ate at in March and April?"
    query_engine = index.as_query_engine()

    # CLI argument parser
    parser = argparse.ArgumentParser(
        prog="QueryJournal",
        description="Query my bullet journals in Obsidian using Llama Index."
    )
    parser.add_argument(
        "-q",
        "--query",
        type=str,
        help="Ask a question answerable from my journals",
        required=True
    )
    args = parser.parse_args()
    query = args.query

    if query:
        res = query_engine.query(query)
        print(f"Query: {query}")
        print(f"Results:\n{res}")
    else:
        print("No query provided. Exiting...")
        sys.exit(0)