From 3e403e4df9a3ad0ef05bbc64777c8a4754b1dc03 Mon Sep 17 00:00:00 2001 From: raahulmona <157174139+raahulmona@users.noreply.github.com> Date: Thu, 28 Mar 2024 22:22:14 +0100 Subject: [PATCH] uploading clinical trails --- backend/.vscode/settings.json | 1 + backend/app/config.py | 12 +- .../10_ClinicalStudiesArmDetails.json | 4 + .../11_ClinicalStudiesDesignOutcomes.json | 4 + .../12_ClinicalStudiesDesigns.json | 4 + .../1_ClinicalStudiesEligibility.json | 2 +- .../2_ClinicalStudiesCollaboraions.json | 2 +- .../3_ClinicalStudiesAdverseDetails.json | 2 +- .../4_ClinicalStudiesPubmedCitation.json | 2 +- .../5_ClinicalStudiesConditions.json | 2 +- .../6_ClinicalStudiesPrimaryOutcomes.json | 4 + .../7_ClinicalStudiesSecondaryOutcomes.json | 4 + .../8_ClinicalStudiesBaselineDetails.json | 4 + .../9_ClinicalStudiesInterventions.json | 4 + .../clinical_trial_sql_query_engine.py | 126 +++++++++++++----- .../rag/retrieval/pubmed/pubmedqueryengine.py | 11 +- backend/app/rag/retrieval/web/brave_search.py | 6 +- backend/app/router/orchestrator.py | 20 +-- 18 files changed, 156 insertions(+), 58 deletions(-) create mode 100644 backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/10_ClinicalStudiesArmDetails.json create mode 100644 backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/11_ClinicalStudiesDesignOutcomes.json create mode 100644 backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/12_ClinicalStudiesDesigns.json create mode 100644 backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/6_ClinicalStudiesPrimaryOutcomes.json create mode 100644 backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/7_ClinicalStudiesSecondaryOutcomes.json create mode 100644 backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/8_ClinicalStudiesBaselineDetails.json create mode 100644 backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/9_ClinicalStudiesInterventions.json 
diff --git a/backend/.vscode/settings.json b/backend/.vscode/settings.json index 614a3fe4..c9bd45b5 100644 --- a/backend/.vscode/settings.json +++ b/backend/.vscode/settings.json @@ -9,6 +9,7 @@ "clinicaltrials", "Curieo", "cypher", + "dspy", "GROQ", "llms", "mistralai", diff --git a/backend/app/config.py b/backend/app/config.py index c9aa0a63..7d9f5473 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -125,8 +125,11 @@ else: QDRANT_API_URL = config("QDRANT_API_URL", default="https://ff1f8e90-959e-4cff-9455-03914d8a7002.europe-west3-0.gcp.cloud.qdrant.io") QDRANT_COLLECTION_NAME: str = config("QDRANT_COLLECTION_NAME", default="pubmed_hybrid_vector_db") +QDRANT_CLINICAL_TRIAL_COLLECTION_NAME: str = config("QDRANT_CLINICAL_TRIAL_COLLECTION_NAME", default="clinical_trials_vector_db") QDRANT_TOP_K: int = config("QDRANT_TOP_K", default=20) QDRANT_SPARSE_TOP_K: int = config("QDRANT_SPARSE_TOP_K", default=3) +QDRANT_TOP_CLINICAL_TRAIL_K: int = config("QDRANT_TOP_CLINICAL_TRAIL_K", default=5) +QDRANT_CLINICAL_TRIAL_METADATA_FIELD_NAME: str = config("QDRANT_CLINICAL_TRIAL_METADATA_FIELD_NAME", default="title") # LLAMA_INDEX Configuration CHAT_ENABLED: bool = config("CHAT_ENABLED", default=False) @@ -135,12 +138,13 @@ # Dspy Integration Configuration CLINICAL_TRIAL_SQL_PROGRAM: str = "app/dspy_integration/dspy_programs/clinical_trials_sql_generation.json" CLINICAL_TRIALS_RESPONSE_REFINEMENT_PROGRAM: str = "app/dspy_integration/dspy_programs/clinical_trials_response_refinement.json" -ORCHESRATOR_ROUTER_PROMPT_PROGRAM: str = "app/dspy_integration/dspy_programs/orchestrator_router_prompt.json" +ORCHESTRATOR_ROUTER_PROMPT_PROGRAM: str = "app/dspy_integration/dspy_programs/orchestrator_router_prompt.json" # Phoenix Configuration PHOENIX_API_ENDPOINT: str = config("PHOENIX_API_ENDPOINT", default="http://127.0.0.1:6007/v1/traces") #AI models -ROUTER_MODEL: str = "gpt-3.5-turbo" -SQL_GENERATION_MODEL: str = "codellama/CodeLlama-13b-Instruct-hf" 
-RESPONSE_SYNTHESIZER_MODEL: str = "NousResearch/Nous-Hermes-llama-2-7b" +ROUTER_MODEL: str = config("ROUTER_MODEL", default="gpt-3.5-turbo") +SQL_GENERATION_MODEL: str = config("SQL_GENERATION_MODEL", default="codellama/CodeLlama-13b-Instruct-hf") +CLINICAL_TRAIL_RESPONSE_SYNTHESIZER_MODEL: str = config("CLINICAL_TRAIL_RESPONSE_SYNTHESIZER_MODEL", default="NousResearch/Nous-Hermes-llama-2-7b") +PUBMED_RESPONSE_SYNTHESIZER_MODEL: str = config("PUBMED_RESPONSE_SYNTHESIZER_MODEL", default="mistralai/Mixtral-8x7B-Instruct-v0.1") diff --git a/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/10_ClinicalStudiesArmDetails.json b/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/10_ClinicalStudiesArmDetails.json new file mode 100644 index 00000000..66d79f2c --- /dev/null +++ b/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/10_ClinicalStudiesArmDetails.json @@ -0,0 +1,4 @@ +{ + "table_name": "tbl_studies_arms_details", + "table_summary": "The table defines the protocol-specified group, subgroup, or cohort of participants in a clinical trial assigned to receive the specific intervention(s) or observations according to a protocol. The table has the below columns.'nct_id' is the primary key of the table. It is the id of the document. 'title' is the title of the document. \n\n The column 'Arm_Details' is a JSON column and it has the below information in a nested way. The 'Title' field holds information about the design group. The 'Description' field holds the description of the result group. 
The 'type' field holds information about the type of the design_groups like 'Active Comparator', 'Experimental', 'No Intervention', 'Other', 'Placebo Comparator', and 'Sham Comparator'." +} \ No newline at end of file diff --git a/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/11_ClinicalStudiesDesignOutcomes.json b/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/11_ClinicalStudiesDesignOutcomes.json new file mode 100644 index 00000000..0c55ace5 --- /dev/null +++ b/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/11_ClinicalStudiesDesignOutcomes.json @@ -0,0 +1,4 @@ +{ + "table_name": "tbl_studies_design_outcomes", + "table_summary": "The table defines the Description of planned outcome measures and observations that will describe patterns of diseases and traits/associations with exposures, risk factors, or treatment. The table has the below columns.'nct_id' is the primary key of the table. It is the id of the document. 'title' is the title of the document. \n\n The column 'Design_Outcome_Measures' is a JSON column and it has the below information in a nested way. The 'OutcomeType' field holds information about the type of the design outcomes like 'other', 'primary', and 'secondary'. The 'Measure' holds the information about the measurement of the design groups. The 'Time' holds the information on the time frame of associated design groups. The 'Description' holds information about the description of the design groups." 
+} \ No newline at end of file diff --git a/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/12_ClinicalStudiesDesigns.json b/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/12_ClinicalStudiesDesigns.json new file mode 100644 index 00000000..e42d1a96 --- /dev/null +++ b/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/12_ClinicalStudiesDesigns.json @@ -0,0 +1,4 @@ +{ + "table_name": "tbl_studies_designs", + "table_summary": "The table defines the Description of how the study will be conducted, including comparison group design and strategies for masking and allocating participants. The table has the below columns.'nct_id' is the primary key of the table. It is the id of the document. 'title' is the title of the document. \n\n The column 'Design_Details' is a JSON column and it has the below information in a nested way. The 'Allocation' field holds information about the type of the allocation of design like 'Non-Randomized', or 'Randomized'. 
The 'Intervention_Model' holds information about the intervention like 'Crossover Assignment', 'Factorial Assignment', 'Parallel Assignment', 'Sequential Assignment', and 'Single Group Assignment'.The 'Masking' column holds masking information like 'Double', 'None (Open Label)', 'Quadruple', 'Single', and 'Triple'.The 'Primary_Purpose' holds information about the designs like 'Basic Science', 'Device Feasibility', 'Diagnostic', 'Educational/Counseling/Training', 'Health Services Research', 'Other', 'Prevention', 'Screening', 'Supportive Care', 'Treatment'" +} \ No newline at end of file diff --git a/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/1_ClinicalStudiesEligibility.json b/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/1_ClinicalStudiesEligibility.json index 6d3a24cf..590e6f36 100644 --- a/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/1_ClinicalStudiesEligibility.json +++ b/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/1_ClinicalStudiesEligibility.json @@ -1,4 +1,4 @@ { "table_name": "tbl_studies_eligibilities", - "table_summary": "The table contains Information about the criteria used to select participants; includes inclusion and exclusion criteria. The table has the below columns.'nct_id' is the primary key of the table. It is the id of the document.'title' is the title of the document.'description' is the description of the document. 'eligibility_details' has information in JSON format. The first field is 'Population' (field name is 'Population) which contains the information about the associated population type. The second field is the Sampling Method (field name is 'SamplingMethod') which contains the data about the method of sampling. The third field is Minimum Age (field name is 'MinimumAge') which contains data about the minimum age of the population. 
The fourth field is Maximum Age (field name is 'MaximumAge') which contains data about the maximum age of the population. Next field is 'HealthyVolunteers' that have information about the requirements of healthy volunteers. The last field is 'Criteria' which has the inclusion and exclusion criteria for the trial." + "table_summary": "The table contains Information about the criteria used to select participants; includes inclusion and exclusion criteria. The table has the below columns.'nct_id' is the primary key of the table. It is the id of the document.'title' is the title of the document. 'eligibility_details' has information in JSON format. The first field is 'Population' (field name is 'Population') which contains the information about the associated population type. The second field is the Sampling Method (field name is 'SamplingMethod') which contains the data about the method of sampling. The third field is Minimum Age (field name is 'MinimumAge') which contains data about the minimum age of the population. The fourth field is Maximum Age (field name is 'MaximumAge') which contains data about the maximum age of the population. Next field is 'HealthyVolunteers' that has information about the requirements of healthy volunteers. The last field is 'Criteria' which has the inclusion and exclusion criteria for the trial." 
} \ No newline at end of file diff --git a/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/2_ClinicalStudiesCollaboraions.json b/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/2_ClinicalStudiesCollaboraions.json index e36f1c94..49134874 100644 --- a/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/2_ClinicalStudiesCollaboraions.json +++ b/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/2_ClinicalStudiesCollaboraions.json @@ -1,4 +1,4 @@ { "table_name": "tbl_studies_sponsors", - "table_summary": "The table contains the Name of study sponsors and collaborators. The sponsor is the entity or individual initiating the study. Collaborators are other organizations providing support, including funding, design, implementation, data analysis, and reporting. The table has the below columns.'nct_id' is the primary key of the table. It is the id of the document.' title' is the title of the document.'description' is the description of the document. 'CollaboratorDetails' has information in JSON format. The first field is 'Collaborator Type' (field name is 'CollaboratorType') which contains information about the type of collaborations. The second field is the name of the collaborator entity (field name is 'CollaboratorDetails')." + "table_summary": "The table contains the Name of study sponsors and collaborators. The sponsor is the entity or individual initiating the study. Collaborators are other organizations providing support, including funding, design, implementation, data analysis, and reporting. The table has the below columns.'nct_id' is the primary key of the table. It is the id of the document.' title' is the title of the document. 'CollaboratorDetails' has information in JSON format. The first field is 'Collaborator Type' (field name is 'CollaboratorType') which contains information about the type of collaborations. 
The second field is the name of the collaborator entity (field name is 'CollaboratorDetails')." } \ No newline at end of file diff --git a/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/3_ClinicalStudiesAdverseDetails.json b/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/3_ClinicalStudiesAdverseDetails.json index acba0801..d4d38f80 100644 --- a/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/3_ClinicalStudiesAdverseDetails.json +++ b/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/3_ClinicalStudiesAdverseDetails.json @@ -1,4 +1,4 @@ { "table_name": "tbl_studies_adverse_details", - "table_summary": "The table contains Summary information about reported adverse events (any untoward or unfavorable medical occurrence to participants, including abnormal physical exams, laboratory findings, symptoms, or diseases), including serious adverse events, other adverse events, and mortality. The table has the below columns.'nct_id' is the primary key of the table. It is the id of the document.' title' is the title of the document.'description' is the description of the document. 'adverse_details' has information in JSON format. The first field is 'Event Type' (field name is 'EventType') which contains information about the type of adverse events like 'deaths' or 'serious'. The second field is about the count of subjects that are affected by the trial(field name is 'SubjestAffected'). The third field is 'Classification' which tells a detailed description of the Adverse Events. The fourth field is about the count of subjects that are risked by the trial(field name is 'SubjectsRisk')." + "table_summary": "The table contains Summary information about reported adverse events (any untoward or unfavorable medical occurrence to participants, including abnormal physical exams, laboratory findings, symptoms, or diseases), including serious adverse events, other adverse events, and mortality. 
The table has the below columns.'nct_id' is the primary key of the table. It is the id of the document.' title' is the title of the document. 'adverse_details' has information in JSON format. The first field is 'Event Type' (field name is 'EventType') which contains information about the type of adverse events like 'deaths' or 'serious'. The second field is about the count of subjects that are affected by the trial(field name is 'SubjestAffected'). The third field is 'Classification' which tells a detailed description of the Adverse Events. The fourth field is about the count of subjects that are risked by the trial(field name is 'SubjectsRisk')." } \ No newline at end of file diff --git a/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/4_ClinicalStudiesPubmedCitation.json b/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/4_ClinicalStudiesPubmedCitation.json index bb84f643..8aebf448 100644 --- a/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/4_ClinicalStudiesPubmedCitation.json +++ b/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/4_ClinicalStudiesPubmedCitation.json @@ -1,4 +1,4 @@ { "table_name": "tbl_studies_pubmed_links", - "table_summary": "The table contains Citations to publications related to the study protocol and/or results. Includes PubMed Unique Identifier (PMID) and/or full bibliographic citation. The table has the below columns.'nct_id' is the primary key of the table. It is the id of the document.' title' is the title of the document.'description' is the description of the document. 'pubmedcitation' has information in JSON format. The first field is 'Pubmed' which contains the associated article ID at the Pubmed website. The second field is about the citation details of the article of subject(field name is 'Citation')." + "table_summary": "The table contains Citations to publications related to the study protocol and/or results. 
Includes PubMed Unique Identifier (PMID) and/or full bibliographic citation. The table has the below columns.'nct_id' is the primary key of the table. It is the id of the document. 'title' is the title of the document. 'pubmedcitation' has information in JSON format. The first field is 'Pubmed' which contains the associated article ID at the Pubmed website. The second field is about the citation details of the article of subject(field name is 'Citation')." } \ No newline at end of file diff --git a/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/5_ClinicalStudiesConditions.json b/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/5_ClinicalStudiesConditions.json index a0ea06a5..600a53c2 100644 --- a/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/5_ClinicalStudiesConditions.json +++ b/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/5_ClinicalStudiesConditions.json @@ -1,4 +1,4 @@ { "table_name": "tbl_studies_conditions", - "table_summary": "The table contains the Name(s) of the disease(s) or condition(s) studied in the clinical study or the focus of the clinical study. The table has the below columns.'nct_id' is the primary key of the table. It is the id of the document.' title' is the title of the document.'description' is the description of the document. The filed 'condition_name' has the name of disease or conditions names that are mentioned in the study." + "table_summary": "The table contains the Name(s) of the disease(s) or condition(s) studied in the clinical study or the focus of the clinical study. The table has the below columns.'nct_id' is the primary key of the table. It is the id of the document. 'title' is the title of the document. The field 'condition_name' has the name of disease or conditions names that are mentioned in the study." 
} \ No newline at end of file diff --git a/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/6_ClinicalStudiesPrimaryOutcomes.json b/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/6_ClinicalStudiesPrimaryOutcomes.json new file mode 100644 index 00000000..966758e8 --- /dev/null +++ b/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/6_ClinicalStudiesPrimaryOutcomes.json @@ -0,0 +1,4 @@ +{ + "table_name": "tbl_primary_outcome_measurement", + "table_summary": "The table contains all the primary outcome measurements of a clinical study.The sample size included in the analysis for each outcome for each study group; usually participants but can represent other units of measure such as eyes 'lesions', etc.Summary data for primary and secondary outcome measures for each study group. Includes parameter estimates and measures of dispersion/precision. The table has the below columns.'nct_id' is the primary key of the table. It is the id of the document.'title' is the title of the document. \n\n The column 'Outcome_Primary_Measurement_Details' is a JSON column and it has the below information in a nested way. The 'Title' field holds the information about the definition of the measurement. The 'Description' field holds the information about the description of the measurement. The 'Time' field holds the information about the time frame duration of the measurement. The 'Population' field holds the information about the population details of the measurement. The 'Units' field unit measurement of the occurrence.\n\n Another main column 'Outcome_Primary_Measurement_Value_Details' has all the value-related information. It is also a JSON column. The 'Type' field holds the types of Measurement details like 'Count of Participants', 'Count of Units', 'Geometric Least Squares Mean', 'Geometric Mean', 'Least Squares Mean', 'Log Mean', 'Mean', 'Median', 'Number'.The 'Value' field holds the value of the measurement. 
The 'DispersionValue' holds the statistical or dispersion value. The 'DispersionType' holds the types of statistical analysis like 'Full Range', 'Geometric Coefficient of Variation', 'Inter-Quartile Range', 'Standard Deviation', 'Standard Error', 'Confidence Interval'" +} \ No newline at end of file diff --git a/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/7_ClinicalStudiesSecondaryOutcomes.json b/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/7_ClinicalStudiesSecondaryOutcomes.json new file mode 100644 index 00000000..45cb84cc --- /dev/null +++ b/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/7_ClinicalStudiesSecondaryOutcomes.json @@ -0,0 +1,4 @@ +{ + "table_name": "tbl_secondary_outcome_measurement", + "table_summary": "The table contains all the secondary outcome measurements of a clinical study.The sample size included in the analysis for each outcome for each study group; usually participants but can represent other units of measure such as eyes 'lesions', etc.Summary data for primary and secondary outcome measures for each study group. Includes parameter estimates and measures of dispersion/precision.The table has the below columns.'nct_id' is the primary key of the table. It is the id of the document.'title' is the title of the document. \n\n The column 'Outcome_Secondary_Measurement_Details' is a JSON column and it has the below information in a nested way. The 'Title' field holds the information about the definition of the measurement. The 'Description' field holds the information about the description of the measurement. The 'Time' field holds the information about the time frame duration of the measurement. The 'Population' field holds the information about the population details of the measurement. The 'Units' field unit measurement of the occurrence.\n\n Another main column 'Outcome_Secondary_Measurement_Value_Details' has all the value-related information. 
It is also a JSON column. The 'Type' field holds the types of Measurement details like 'Count of Participants', 'Count of Units', 'Geometric Least Squares Mean', 'Geometric Mean', 'Least Squares Mean', 'Log Mean', 'Mean', 'Median', 'Number'.The 'Value' field holds the value of the measurement. The 'DispersionValue' holds the statistical or dispersion value. The 'DispersionType' holds the types of statistical analysis like 'Full Range', 'Geometric Coefficient of Variation', 'Inter-Quartile Range', 'Standard Deviation', 'Standard Error', 'Confidence Interval'" +} \ No newline at end of file diff --git a/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/8_ClinicalStudiesBaselineDetails.json b/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/8_ClinicalStudiesBaselineDetails.json new file mode 100644 index 00000000..4415bac4 --- /dev/null +++ b/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/8_ClinicalStudiesBaselineDetails.json @@ -0,0 +1,4 @@ +{ + "table_name": "tbl_baseline_details", + "table_summary": "The table contains summaries of demographic & baseline measures collected by arm or comparison group and for the entire population of participants in the clinical study. all the secondary outcome measurements of a clinical study. The table has the below columns.'nct_id' is the primary key of the table. It is the id of the document. 'title' is the title of the document. \n\n The column 'Baseline_Measurement_Details' is a JSON column and it has the below information in a nested way. The 'Title' field holds the information about the definition of the baseline measurement detail. The 'Type' field holds the types of baseline measurements like 'Count of Participants', 'Count of Units', 'Geometric Least Squares Mean', 'Geometric Mean', 'Least Squares Mean', 'Log Mean', 'Mean', 'Median', 'Number'. The 'Value' field holds the value of the baseline measurement. The 'category' holds different categories of data. 
The 'Classification' holds different classifications of data. The 'DispersionValue' holds the statistical or dispersion value. The 'DispersionType' holds the types of statistical analysis like 'Full Range', 'Inter-Quartile Range', and 'Standard Deviation'.\n\n Another main column 'Baseline_Group_Details' has all the value-related information. It is also a JSON column. The 'ResultGroupTitle' field holds the information about the result group. The 'ResultGroupDesc' holds the description of the result group. The 'BaseLineCount' holds the count of the sample. The 'BaseLineCountUnits' holds the unit types." +} \ No newline at end of file diff --git a/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/9_ClinicalStudiesInterventions.json b/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/9_ClinicalStudiesInterventions.json new file mode 100644 index 00000000..47eedbf2 --- /dev/null +++ b/backend/app/rag/retrieval/clinical_trials/AACTTableQuestions_TableInfo/9_ClinicalStudiesInterventions.json @@ -0,0 +1,4 @@ +{ + "table_name": "tbl_studies_interventions", + "table_summary": "The table has the interventions or exposures (including drugs, medical devices, procedures, vaccines, and other products) of interest to the study, or associated with study arms/groups. The table has the below columns.'nct_id' is the primary key of the table. It is the id of the document. 'title' is the title of the document. \n\n The column 'Study_Intervention_Compressed_Details' is a JSON column and it has the below information in a nested way. The 'Type' field holds the information about the intervention type like 'Behavioral', 'Biological', 'Combination Product', 'Device', 'Diagnostic Test', 'Dietary Supplement', 'Drug', 'Genetic', 'Other', 'Procedure', 'Radiation'.The 'Details' field is also a JSON field that holds information about the 'name' and 'description' of the interventions." 
+} \ No newline at end of file diff --git a/backend/app/rag/retrieval/clinical_trials/clinical_trial_sql_query_engine.py b/backend/app/rag/retrieval/clinical_trials/clinical_trial_sql_query_engine.py index d1ddee5c..fc200758 100644 --- a/backend/app/rag/retrieval/clinical_trials/clinical_trial_sql_query_engine.py +++ b/backend/app/rag/retrieval/clinical_trials/clinical_trial_sql_query_engine.py @@ -10,14 +10,17 @@ ObjectIndex, SQLTableSchema, ) +from llama_index.core.retrievers import VectorIndexRetriever from llama_index.core.retrievers import SQLRetriever from llama_index.embeddings.text_embeddings_inference import TextEmbeddingsInference +from llama_index.vector_stores.qdrant import QdrantVectorStore import os from pathlib import Path from typing import List -from sqlalchemy import create_engine +from sqlalchemy import create_engine, text from pyvis.network import Network +from qdrant_client import QdrantClient from app.config import ( CLINICAL_TRIALS_TABLE_INFO_DIR, @@ -28,7 +31,13 @@ CLINICAL_TRIALS_RESPONSE_REFINEMENT_PROGRAM, TOGETHER_KEY, SQL_GENERATION_MODEL, - RESPONSE_SYNTHESIZER_MODEL + CLINICAL_TRAIL_RESPONSE_SYNTHESIZER_MODEL, + QDRANT_API_KEY, + QDRANT_API_URL, + QDRANT_API_PORT, + QDRANT_CLINICAL_TRIAL_COLLECTION_NAME, + QDRANT_TOP_CLINICAL_TRAIL_K, + QDRANT_CLINICAL_TRIAL_METADATA_FIELD_NAME, ) from app.services.search_utility import setup_logger @@ -38,7 +47,6 @@ import dspy import re - logger = setup_logger("ClinicalTrialText2SQLEngine") @@ -46,7 +54,6 @@ class TableInfo(BaseModel): """ Information regarding a structured table. 
""" - table_name: str = Field( ..., description="table name (must be underscores and NO spaces)" ) @@ -64,9 +71,15 @@ class ClinicalTrialText2SQLEngine: def __init__(self, config): self.config = config - self.nous =dspy.Together(model = str(RESPONSE_SYNTHESIZER_MODEL), api_key=str(TOGETHER_KEY)) - self.llm = dspy.Together(model = str(SQL_GENERATION_MODEL), api_key=str(TOGETHER_KEY)) - dspy.settings.configure(lm = self.llm) + self.response_llm =dspy.Together( + model = str(CLINICAL_TRAIL_RESPONSE_SYNTHESIZER_MODEL), + api_key=str(TOGETHER_KEY), + max_tokens = 500) + self.sql_llm = dspy.Together( + model = str(SQL_GENERATION_MODEL), + api_key=str(TOGETHER_KEY), + max_tokens = 500) + dspy.settings.configure(lm = self.sql_llm) self.sql_module = SQL_module() self.sql_module.load(CLINICAL_TRIAL_SQL_PROGRAM) @@ -77,22 +90,39 @@ def __init__(self, config): self.sql_database = SQLDatabase(self.engine) self.table_node_mapping = SQLTableNodeMapping(self.sql_database) self.sql_retriever = SQLRetriever(self.sql_database) + self.embed_model = TextEmbeddingsInference( + base_url=EMBEDDING_MODEL_API, model_name=EMBEDDING_MODEL_NAME + ) + + self.client = QdrantClient( + url=QDRANT_API_URL, + port=QDRANT_API_PORT, + api_key=str(QDRANT_API_KEY), + https=False + ) + + self.vector_store = QdrantVectorStore( + client=self.client, + collection_name=QDRANT_CLINICAL_TRIAL_COLLECTION_NAME + ) + + self.retriever = VectorIndexRetriever( + index=VectorStoreIndex.from_vector_store(vector_store=self.vector_store), + similarity_top_k=int(QDRANT_TOP_CLINICAL_TRAIL_K), + embed_model=TextEmbeddingsInference(base_url=EMBEDDING_MODEL_API, model_name="") + ) self.table_schema_objs = [ SQLTableSchema(table_name=t.table_name, context_str=t.table_summary) for t in self.get_all_table_info() ] - self.embed_model = TextEmbeddingsInference( - base_url=EMBEDDING_MODEL_API, model_name=EMBEDDING_MODEL_NAME - ) - self.obj_index = ObjectIndex.from_objects( objects=self.table_schema_objs, 
object_mapping=self.table_node_mapping, index_cls=VectorStoreIndex, embed_model=self.embed_model, ) - self.obj_retriever = self.obj_index.as_retriever(similarity_top_k=3) + self.obj_retriever = self.obj_index.as_retriever(similarity_top_k=1) self.qp = self.build_query_pipeline() def _get_table_info_with_index(self, idx: int) -> str: @@ -132,33 +162,55 @@ def get_table_context_str(self, table_schema_objs: List[SQLTableSchema]): context_strs.append(table_info) return "\n\n".join(context_strs) - - def extract_sql(self, llm_response: str) -> str: + def get_sql_query(self, question, context): + sql_query = self.sql_module(question = question, context = context).answer + return sql_query + + def extract_sql(self, question:str, llm_response: str) -> str: # First try to extract SQL code blocks enclosed in triple backticks - sql = re.search(r"```(?:sql\n)?(.*?)```", llm_response, re.DOTALL | re.IGNORECASE) - if sql: - extracted_sql = sql.group(1).strip() - logger.info(f"Output from LLM: {llm_response} \nExtracted SQL: {extracted_sql}") - return extracted_sql + pattern = r"```(?:sql\n)?(.*?)```|(select.*?;)|('*\n\n---\n\nQuestion:')" + sql_match = re.search(pattern, llm_response, re.DOTALL | re.IGNORECASE) - # If not found, try to extract a plain SQL query - sql = re.search(r"(select.*?;)", llm_response, re.DOTALL | re.IGNORECASE) - if sql: - extracted_sql = sql.group(1).strip() - logger.info(f"Output from LLM: {llm_response} \nExtracted SQL: {extracted_sql}") - return extracted_sql - return llm_response + if sql_match: + extracted_sql = (sql_match.group(1) or sql_match.group(2)).strip() + retrieved_sql = self.get_relevant_tiles(question, extracted_sql) - - - def get_sql_query(self, question, context): + logger.info(f"Output from LLM: {llm_response} \nExtracted SQL: {retrieved_sql}") + return retrieved_sql + else: + # Handle the case where no SQL pattern is matched + logger.info(f"No SQL pattern matched in LLM response: {llm_response}") + # Return an appropriate response, 
such as an empty list or a message indicating no SQL was found + return [] + + def replace_title_value(self, sql_command: str, title_names: list[str]): + titles_for_sql = '\', \''.join([title.replace("\'", "\"") for title in title_names]) + quoted_titles_for_sql = "'{}'".format(titles_for_sql) + # Pattern to match the WHERE clause related to the title + pattern = r"title\s*=\s*'.*?'" + # Replacement pattern using IN clause + replacement = f"title IN ({quoted_titles_for_sql})" + # Perform the substitution + updated_sql_command = re.sub(pattern, replacement, sql_command) + return updated_sql_command + + def get_relevant_tiles(self, question, sql_query) -> list[str] : + return self.replace_title_value( + sql_query, + [ + node.metadata.get(QDRANT_CLINICAL_TRIAL_METADATA_FIELD_NAME) + for node in self.retriever.retrieve(question) + ] + ) + + def retrieve_input_title_name(self, question, context): sql_query = self.sql_module(question = question, context = context).answer return sql_query def get_synthesized_response(self, question, sql, database_output): if len(database_output) > 0: database_output = database_output[0].text - with dspy.context(lm=self.nous): + with dspy.context(lm=self.response_llm): response = self.response_synthesizer(question = question, sql = sql, database_output = database_output).answer return response @@ -177,21 +229,21 @@ def build_query_pipeline(self): ) qp.add_chain(["input", "table_retriever", "table_output_parser"]) - qp.add_link("input", "text2sql_llm", dest_key="question") # FIX - qp.add_link("table_output_parser", "text2sql_llm", dest_key= "context") #FIX - qp.add_chain( - ["text2sql_llm", "sql_output_parser", "sql_retriever"] - ) + qp.add_link("input", "text2sql_llm", dest_key="question") + qp.add_link("table_output_parser", "text2sql_llm", dest_key= "context") + qp.add_link("input", "sql_output_parser", dest_key="question") qp.add_link("text2sql_llm", "sql_output_parser", dest_key = "llm_response") + + qp.add_chain(["sql_output_parser", 
"sql_retriever"]) qp.add_link("input", "response_synthesis_llm", dest_key = "question") qp.add_link("text2sql_llm", "response_synthesis_llm", dest_key = "sql") qp.add_link("sql_retriever", "response_synthesis_llm", dest_key = "database_output") + net = Network(notebook=True, cdn_resources="in_line", directed=True) net.from_nx(qp.dag) net.show("text2sql_dag.html") return qp - async def call_text2sql( self, search_text:str @@ -205,4 +257,4 @@ async def call_text2sql( logger.exception("call_text2sql Exception -", exc_info = ex, stack_info=True) raise ex - return {"result": str(response)} + return {"result": str(response)} \ No newline at end of file diff --git a/backend/app/rag/retrieval/pubmed/pubmedqueryengine.py b/backend/app/rag/retrieval/pubmed/pubmedqueryengine.py index 7e350e21..dc7390e6 100644 --- a/backend/app/rag/retrieval/pubmed/pubmedqueryengine.py +++ b/backend/app/rag/retrieval/pubmed/pubmedqueryengine.py @@ -9,7 +9,16 @@ from qdrant_client import QdrantClient from app.services.search_utility import setup_logger -from app.config import QDRANT_API_KEY, QDRANT_API_URL, QDRANT_API_PORT, QDRANT_COLLECTION_NAME, QDRANT_TOP_K, QDRANT_SPARSE_TOP_K, EMBEDDING_MODEL_API, PUBMED_RELEVANCE_CRITERIA +from app.config import ( + QDRANT_API_KEY, + QDRANT_API_URL, + QDRANT_API_PORT, + QDRANT_COLLECTION_NAME, + QDRANT_TOP_K, + QDRANT_SPARSE_TOP_K, + EMBEDDING_MODEL_API, + PUBMED_RELEVANCE_CRITERIA + ) logger = setup_logger("PubmedSearchQueryEngine") diff --git a/backend/app/rag/retrieval/web/brave_search.py b/backend/app/rag/retrieval/web/brave_search.py index e253487e..23ebcc19 100644 --- a/backend/app/rag/retrieval/web/brave_search.py +++ b/backend/app/rag/retrieval/web/brave_search.py @@ -4,7 +4,11 @@ from llama_index.core.schema import TextNode from app.services.search_utility import setup_logger -from app.config import BRAVE_RESULT_COUNT, BRAVE_SEARCH_API, BRAVE_SUBSCRIPTION_KEY +from app.config import ( + BRAVE_RESULT_COUNT, + BRAVE_SEARCH_API, + 
BRAVE_SUBSCRIPTION_KEY + ) logger = setup_logger("BraveSearchQueryEngine") diff --git a/backend/app/router/orchestrator.py b/backend/app/router/orchestrator.py index 2c2134ee..d78fa5d4 100644 --- a/backend/app/router/orchestrator.py +++ b/backend/app/router/orchestrator.py @@ -11,14 +11,18 @@ from app.rag.retrieval.pubmed.pubmedqueryengine import PubmedSearchQueryEngine from app.rag.reranker.response_reranker import TextEmbeddingInferenceRerankEngine from app.api.common.util import RouteCategory -from app.config import OPENAI_API_KEY, TOGETHER_KEY, ORCHESRATOR_ROUTER_PROMPT_PROGRAM, ROUTER_MODEL +from app.config import ( + OPENAI_API_KEY, + TOGETHER_KEY, + ORCHESTRATOR_ROUTER_PROMPT_PROGRAM, + ROUTER_MODEL + ) from app.services.search_utility import setup_logger import dspy from app.dspy_integration.router_prompt import Router_module - logger = setup_logger("Orchestrator") TAG_RE = re.compile(r'<[^>]+>') @@ -31,13 +35,10 @@ class Orchestrator: def __init__(self, config): self.config = config - - - self.llm = dspy.OpenAI(model=str(ROUTER_MODEL), api_key=str(OPENAI_API_KEY)) dspy.settings.configure(lm = self.llm) self.router = Router_module() - self.router.load(ORCHESRATOR_ROUTER_PROMPT_PROGRAM) + self.router.load(ORCHESTRATOR_ROUTER_PROMPT_PROGRAM) self.clinicalTrialSearch = ClinicalTrialText2SQLEngine(config) self.drugChemblSearch = DrugChEMBLText2CypherEngine(config) @@ -75,9 +76,9 @@ async def query_and_get_answer( try: sqlResponse = await self.clinicalTrialSearch.call_text2sql(search_text=search_text) result = str(sqlResponse) - sources = result + sources = result #TODO - logger.info(f"Orchestrator.query_and_get_answer.sqlResponse sqlResponse: {result}") + logger.info(f"Orchestrator.query_and_get_answer.sqlResponse sqlResponse: {result} and {sources}") return { "result" : result, @@ -139,5 +140,4 @@ async def query_and_get_answer( return { "result" : result, "sources": sources - } - \ No newline at end of file + } \ No newline at end of file