Skip to content

Commit 91b47aa

Browse files
committed
Fixed issue with the regex parsing of the cursorMark and updated tests
1 parent aad5496 commit 91b47aa

File tree

3 files changed

+24
-12
lines changed

3 files changed

+24
-12
lines changed

industryDocumentsWrapper/ucsf_api.py

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
from dataclasses import dataclass
22
import re
33
import requests
4-
import time
54
import polars as pl
65

76

@@ -20,13 +19,13 @@ def _create_query(self, **kwargs) -> str:
2019
if kwargs['q']:
2120
query = f"{self.base_url}query?q=({kwargs['q']})&wt={kwargs['wt']}&cursorMark={kwargs['cursorMark']}&sort={kwargs['sort']}"
2221
else:
23-
query = f"{self.base_url}query?q=("+' AND '.join([f'{k}:{v}' for k, v in kwargs.items() if v and k != 'wt' and k != 'cursorMark' and k != 'sort'])+f")&wt={kwargs['wt']}&cursorMark={kwargs['cursorMark']}&sort={kwargs['sort']}"
22+
query = f"{self.base_url}query?q=("+' AND '.join([f'{k}:{v}' for k, v in kwargs.items() if v and k != 'wt' and k != 'cursorMark' and k != 'sort' and k != 'n'])+f")&wt={kwargs['wt']}&cursorMark={kwargs['cursorMark']}&sort={kwargs['sort']}"
2423

2524
return query
2625

2726
def _update_cursormark(self, query:str, cursor_mark: str) -> str:
2827
"""Updates cursor mark in query string"""
29-
return re.sub(r'(?<=cursorMark=)[A-Za-z0-9*]+(?=&)', cursor_mark, query)
28+
return re.sub(r'(?<=cursorMark=)[A-Za-z0-9*=]+(?=&)', cursor_mark, query)
3029

3130
def _loop_results(self, query:str, n:int) -> None:
3231
"""Iteratively retrieves documents with cursor_mark for Solr deep paging"""
@@ -39,12 +38,16 @@ def _loop_results(self, query:str, n:int) -> None:
3938
while (next_cursor != current_cursor) and (len(self.results) < n):
4039

4140
if next_cursor:
42-
query = self._update_cursormark(query, next_cursor)
43-
41+
print(current_cursor)
42+
current_cursor = next_cursor
43+
print(current_cursor)
44+
query = self._update_cursormark(query, current_cursor)
45+
print(query)
46+
47+
print(query)
4448
r = requests.get(query).json()
4549

46-
current_cursor = r['responseHeader']['params']['cursorMark']
47-
50+
print(r['response']['numFound'])
4851
if n < len(r['response']['docs']):
4952
self.results.extend(r['response']['docs'][:n])
5053

@@ -53,7 +56,10 @@ def _loop_results(self, query:str, n:int) -> None:
5356

5457
else:
5558
self.results.extend(r['response']['docs'])
56-
next_cursor = r['nextCursorMark']
59+
60+
next_cursor = r['nextCursorMark']
61+
62+
print(f'Current cursor: {current_cursor} | Next cursor: {next_cursor}')
5763

5864
print(f"{len(self.results)}/{n} documents collected")
5965

@@ -64,7 +70,6 @@ def _create_links(self, industry) -> None:
6470
for doc in self.results:
6571
doc['url'] = f"https://www.industrydocuments.ucsf.edu/{industry}/docs/#id={doc['id']}"
6672

67-
# TODO: add functionality for /select/* queries for specific fields
6873
def query(self,
6974
q:str = False,
7075
case:str = False,
@@ -110,6 +115,7 @@ def query(self,
110115

111116
"""Queries the UCSF Industry Documents Solr Library for documents"""
112117
self._loop_results(query, n)
118+
113119
if industry:
114120
self._create_links(industry)
115121

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "industryDocumentsWrapper"
3-
version = "0.1.2"
3+
version = "0.1.3"
44
description = "A simple python wrapper for the UCSF Industry Documents API."
55
authors = ["Rolando Rodriguez <[email protected]>"]
66
maintainers = ["Rolando Rodriguez <[email protected]>"]

tests/test_ucsf_api.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,12 +70,14 @@ def test_loop_results_50(indDocSearch):
7070
query = 'https://metadata.idl.ucsf.edu/solr/ltdl3/query?q=(industry:tobacco AND case:"State of North Carolina" AND collection:"JUUL labs Collection" AND type:email)&wt=json&cursorMark=*&sort=id%20asc'
7171
indDocSearch._loop_results(query, 50)
7272
assert len(indDocSearch.results) == 50
73+
assert len(set([x['id'] for x in indDocSearch.results])) == 50
7374
assert indDocSearch.results[0]['id'] == 'ffbb0284'
7475

7576
def test_loop_results_150(indDocSearch):
7677
query = 'https://metadata.idl.ucsf.edu/solr/ltdl3/query?q=(industry:tobacco AND case:"State of North Carolina" AND collection:"JUUL labs Collection" AND type:email)&wt=json&cursorMark=*&sort=id%20asc'
7778
indDocSearch._loop_results(query, 150)
7879
assert len(indDocSearch.results) == 150
80+
assert len(set([x['id'] for x in indDocSearch.results])) == 150
7981
assert indDocSearch.results[0]['id'] == 'ffbb0284'
8082

8183
def test_create_links(indDocSearch, mock_results):
@@ -87,22 +89,26 @@ def test_create_links(indDocSearch, mock_results):
8789
def test_query_with_q_100(indDocSearch):
8890
indDocSearch.query(q='industry:tobacco AND case:"State of North Carolina" AND collection:"JUUL labs Collection" AND type:email', n=100)
8991
assert len(indDocSearch.results) == 100
92+
assert len(set([x['id'] for x in indDocSearch.results])) == 100
9093
assert indDocSearch.results[0]['id'] == 'ffbb0284'
9194
assert indDocSearch.results[0]['url'] == 'https://www.industrydocuments.ucsf.edu/tobacco/docs/#id=ffbb0284'
9295

9396
def test_query_with_q_500(indDocSearch):
9497
indDocSearch.query(q='industry:tobacco AND case:"State of North Carolina" AND collection:"JUUL labs Collection" AND type:email', n=500)
9598
assert len(indDocSearch.results) == 500
99+
assert len(set([x['id'] for x in indDocSearch.results])) == 500
96100
assert indDocSearch.results[0]['id'] == 'ffbb0284'
97101
assert indDocSearch.results[0]['url'] == 'https://www.industrydocuments.ucsf.edu/tobacco/docs/#id=ffbb0284'
98102

99103
def test_query_with_no_q_50(indDocSearch):
100-
indDocSearch.query(collection='JUUL labs Collection', case='State of North Carolina', doc_type='email', n=50)
104+
indDocSearch.query(industry='tobacco', collection='JUUL labs Collection', case='State of North Carolina', doc_type='email', n=50)
101105
assert len(indDocSearch.results) == 50
106+
assert len(set([x['id'] for x in indDocSearch.results])) == 50
102107

103108
def test_query_with_no_q_1000(indDocSearch):
104-
indDocSearch.query(collection='JUUL labs Collection', case='State of North Carolina', doc_type='email', n=1000)
109+
indDocSearch.query(industry='tobacco', collection='JUUL labs Collection', case='State of North Carolina', doc_type='email', n=1000)
105110
assert len(indDocSearch.results) == 1000
111+
assert len(set([x['id'] for x in indDocSearch.results])) == 1000
106112

107113
def test_save_parquet(indDocSearch, mock_results, tmp_path):
108114
indDocSearch.results = mock_results

0 commit comments

Comments
 (0)