# test_onefilellm.py
import unittest
import os
import tempfile
import shutil
# Assuming onefilellm.py is in the same directory or accessible via PYTHONPATH
from onefilellm import (
process_github_repo,
process_arxiv_pdf,
process_local_folder,
fetch_youtube_transcript,
crawl_and_extract_text,
process_doi_or_pmid,
process_github_pull_request,
process_github_issue
)
class TestDataAggregation(unittest.TestCase):
    """End-to-end checks for the ``onefilellm`` source processors.

    Each test hands one concrete input (a repository URL, an arXiv link, a
    local directory, a DOI, ...) to the matching processor and verifies that
    the result is non-empty and carries the expected XML-style tags.

    NOTE(review): every test except ``test_local_folder`` passes a live URL or
    identifier, so these presumably need network access and the remote
    content staying available -- confirm before running them in CI.
    """

    def setUp(self):
        """Create a scratch directory for any artifacts a test may produce."""
        self.temp_dir = tempfile.mkdtemp()
        # Dummy input files could be created here if a test needs them.

    def tearDown(self):
        """Remove the scratch directory created in ``setUp``."""
        shutil.rmtree(self.temp_dir)

    def _assert_non_empty_str(self, value):
        """Assert that *value* is a ``str`` with at least one character."""
        self.assertIsInstance(value, str)
        self.assertGreater(len(value), 0)

    def test_github_repo(self):
        """A repository URL yields a ``github_repository`` source with files."""
        print("\nTesting GitHub repository processing...")
        repo_url = "https://github.com/jimmc414/onefilellm"
        repo_content = process_github_repo(repo_url)
        self._assert_non_empty_str(repo_content)
        # Root source tag.
        self.assertIn('<source type="github_repository"', repo_content)
        # At least one file entry must have been emitted.
        self.assertIn('<file path="', repo_content)
        print("GitHub repository processing test passed.")

    def test_arxiv_pdf(self):
        """An arXiv abstract URL yields an ``arxiv`` source with the paper text."""
        print("\nTesting arXiv PDF processing...")
        arxiv_url = "https://arxiv.org/abs/2401.14295"
        arxiv_content = process_arxiv_pdf(arxiv_url)
        self._assert_non_empty_str(arxiv_content)
        # The source tag must echo the exact URL that was requested.
        self.assertIn(
            '<source type="arxiv" url="https://arxiv.org/abs/2401.14295">',
            arxiv_content,
        )
        # A phrase from the paper's title shows real content was extracted.
        self.assertIn("Demystifying Chains, Trees, and Graphs", arxiv_content)
        print("arXiv PDF processing test passed.")

    def test_local_folder(self):
        """A local directory yields a ``local_folder`` source with files."""
        print("\nTesting local folder processing...")
        # The directory holding this test file is guaranteed to exist and to
        # contain at least this file.
        local_path = os.path.dirname(os.path.abspath(__file__))
        local_content = process_local_folder(local_path)
        self._assert_non_empty_str(local_content)
        self.assertIn('<source type="local_folder"', local_content)
        # At least one file entry must have been emitted.
        self.assertIn('<file path="', local_content)
        print("Local folder processing test passed.")

    def test_youtube_transcript(self):
        """A video URL yields a ``youtube_transcript`` source with spoken text."""
        print("\nTesting YouTube transcript fetching...")
        video_url = "https://www.youtube.com/watch?v=KZ_NlnmPQYk"
        transcript = fetch_youtube_transcript(video_url)
        self._assert_non_empty_str(transcript)
        self.assertIn('<source type="youtube_transcript"', transcript)
        # Look for a common word as a whole whitespace-delimited token. A
        # plain substring test for "a" is already satisfied by the
        # <source ...> tag asserted above, so it could never fail.
        words = set(transcript.lower().split())
        self.assertTrue(
            "the" in words or "a" in words,
            "Transcript content seems empty or very unusual.",
        )
        print("YouTube transcript fetching test passed.")

    def test_webpage_crawl(self):
        """A shallow crawl returns page content plus the URLs it processed."""
        print("\nTesting webpage crawling and text extraction...")
        webpage_url = "https://llm.datasette.io/en/stable/"
        max_depth = 1
        include_pdfs = False  # Keep False to speed up test
        ignore_epubs = True
        crawl_result = crawl_and_extract_text(
            webpage_url, max_depth, include_pdfs, ignore_epubs
        )
        # The crawler reports a dict with both the text and the visited URLs.
        self.assertIsInstance(crawl_result, dict)
        self.assertIn('content', crawl_result)
        self.assertIn('processed_urls', crawl_result)
        self.assertGreater(len(crawl_result['content']), 0)
        self.assertGreater(len(crawl_result['processed_urls']), 0)
        self.assertIn('<source type="web_crawl"', crawl_result['content'])
        # At least one page entry must have been emitted.
        self.assertIn('<page url="', crawl_result['content'])
        print("Webpage crawling and text extraction test passed.")

    def test_process_doi(self):
        """A DOI yields a ``sci-hub`` source tagged with that DOI."""
        print("\nTesting DOI processing...")
        doi = "10.1053/j.ajkd.2017.08.002"
        doi_content = process_doi_or_pmid(doi)
        self._assert_non_empty_str(doi_content)
        # The identifier attribute must echo the DOI that was requested.
        self.assertIn(
            '<source type="sci-hub" identifier="10.1053/j.ajkd.2017.08.002">',
            doi_content,
        )
        # A phrase expected in the paper text.
        self.assertIn("Oxalate Nephropathy", doi_content)
        print("DOI processing test passed.")

    def test_process_pmid(self):
        """A PMID yields a ``sci-hub`` source tagged with that PMID."""
        print("\nTesting PMID processing...")
        pmid = "29203127"  # This PMID corresponds to the same paper as the DOI above
        pmid_content = process_doi_or_pmid(pmid)
        self._assert_non_empty_str(pmid_content)
        # The identifier attribute must echo the PMID that was requested.
        self.assertIn('<source type="sci-hub" identifier="29203127">', pmid_content)
        # A phrase expected in the paper text.
        self.assertIn("Oxalate Nephropathy", pmid_content)
        print("PMID processing test passed.")

    def test_process_github_pull_request(self):
        """A pull-request URL yields PR metadata, its diff and the repository."""
        print("\nTesting GitHub pull request processing...")
        pull_request_url = "https://github.com/dear-github/dear-github/pull/102"
        pull_request_content = process_github_pull_request(pull_request_url)
        self._assert_non_empty_str(pull_request_content)
        self.assertIn('<source type="github_pull_request"', pull_request_content)
        # Structural elements of the pull-request section.
        self.assertIn('<title>', pull_request_content)
        self.assertIn('<description>', pull_request_content)
        self.assertIn('<details>', pull_request_content)
        self.assertIn('<diff>', pull_request_content)
        # The repository content is expected to be embedded in the output.
        self.assertIn('<source type="github_repository"', pull_request_content)
        print("GitHub pull request processing test passed.")

    def test_process_github_issue(self):
        """An issue URL yields issue metadata and the repository."""
        print("\nTesting GitHub issue processing...")
        issue_url = "https://github.com/isaacs/github/issues/1191"
        issue_content = process_github_issue(issue_url)
        self._assert_non_empty_str(issue_content)
        self.assertIn('<source type="github_issue"', issue_content)
        # Structural elements of the issue section.
        self.assertIn('<title>', issue_content)
        self.assertIn('<description>', issue_content)
        self.assertIn('<details>', issue_content)
        # The repository content is expected to be embedded in the output.
        self.assertIn('<source type="github_repository"', issue_content)
        print("GitHub issue processing test passed.")
# Run the whole suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()