Skip to content

Commit 7862e1d

Browse files
committed
Fixed an issue when querying for more than 100 documents
1 parent e7ea529 commit 7862e1d

20 files changed

+39
-22
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
.vscode
Binary file not shown.
53 Bytes
Binary file not shown.
Binary file not shown.
7.49 KB
Binary file not shown.
Binary file not shown.
7.48 KB
Binary file not shown.

industry_documents_wrapper/ucsf_api.py renamed to industryDocumentsWrapper/ucsf_api.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from dataclasses import dataclass
22
import re
33
import requests
4+
import time
45
import polars as pl
56

67

@@ -36,20 +37,25 @@ def _loop_results(self, query:str, n:int) -> None:
3637
n = float('inf')
3738

3839
while (next_cursor != current_cursor) and (len(self.results) < n):
40+
3941
if next_cursor:
40-
current_cursor = next_cursor
4142
query = self._update_cursormark(query, next_cursor)
43+
4244
r = requests.get(query).json()
4345

46+
current_cursor = r['responseHeader']['params']['cursorMark']
47+
4448
if n < len(r['response']['docs']):
4549
self.results.extend(r['response']['docs'][:n])
46-
elif len(self.results) + len(r['response']['docs']) > n:
50+
51+
elif n < (len(self.results) + len(r['response']['docs'])):
4752
self.results.extend(r['response']['docs'][:n-len(self.results)])
53+
4854
else:
4955
self.results.extend(r['response']['docs'])
50-
next_cursor = r['nextCursorMark']
56+
next_cursor = r['nextCursorMark']
5157

52-
print(f"{len(self.results)}/{n} documents collected", end='\r')
58+
print(f"{len(self.results)}/{n} documents collected")
5359

5460
return
5561

@@ -72,6 +78,7 @@ def query(self,
7278
author:str = False,
7379
source:str = False,
7480
bates:str = False,
81+
box:str = False,
7582
originalformat:str = False,
7683
wt:str ='json',
7784
cursor_mark:str='*',
@@ -91,6 +98,7 @@ def query(self,
9198
author=author,
9299
source=source,
93100
batesexpanded=bates,
101+
box=box,
94102
originalformat=originalformat,
95103
wt=wt,
96104
cursorMark=cursor_mark,
@@ -102,7 +110,6 @@ def query(self,
102110

103111
"""Queries the UCSF Industry Documents Solr Library for documents"""
104112
self._loop_results(query, n)
105-
print('Adding URLs to query results')
106113
if industry:
107114
self._create_links(industry)
108115

Binary file not shown.

0 commit comments

Comments
 (0)