-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcheck_files.py
More file actions
22 lines (19 loc) · 873 Bytes
/
check_files.py
File metadata and controls
22 lines (19 loc) · 873 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# /home/wangxinxin/my_graphrag/check_files.py
import pandas as pd
import os
# Check text_units.parquet (stores raw text chunks)
FILE_PATH = "output/text_units.parquet"
# Regex alternation handed to Series.str.contains: matches "AHG" or
# "顶峰" (Apex/Summit). Matching is case-insensitive (case=False below).
KEYWORD_PATTERN = "AHG|顶峰"
# Maximum number of characters of the first matching chunk to echo.
PREVIEW_CHARS = 100


def preview_snippet(text: str, limit: int = PREVIEW_CHARS) -> str:
    """Return at most *limit* leading characters of *text* for display.

    An ellipsis is appended only when *text* was actually cut, so a short
    chunk is not shown as if it had been truncated.
    """
    if len(text) > limit:
        return text[:limit] + "..."
    return text


if not os.path.exists(FILE_PATH):
    print(f"File does not exist: {FILE_PATH}")
else:
    df = pd.read_parquet(FILE_PATH)
    if "text" not in df.columns:
        # Without this guard, df["text"] raises a bare KeyError; report the
        # columns that are present so the mismatch is easy to diagnose.
        print(
            f"Column 'text' not found in {FILE_PATH}; "
            f"available columns: {list(df.columns)}"
        )
    else:
        # na=False makes missing values count as "no match", so the mask is
        # purely boolean and safe to index with.
        mask = df["text"].str.contains(KEYWORD_PATTERN, case=False, na=False)
        results = df[mask]
        if not results.empty:
            print(f"Successfully found {len(results)} text chunks containing 'AHG/顶峰'!")
            print("Sample text snippet:")
            # Show only the first matching chunk as a sample.
            print(preview_snippet(results.iloc[0]["text"]))
        else:
            print("No content containing 'AHG' or '顶峰' was found in text_units.parquet.")
            print("Possible reasons: The file was not indexed, or the filename is not in the processing list.")