From 852ca0bb41d73f8e427c29e963e6970b431142ba Mon Sep 17 00:00:00 2001 From: Rohit Date: Tue, 11 Feb 2025 16:34:15 -0600 Subject: [PATCH 1/2] Moved to training --- training/README.md | 29 - training/graphs/defog.json | 1796 ----------------------------------- training/graphs/tpc.json | 201 ---- training/pydough_corpus.csv | 523 ---------- training/scrap.py | 42 - 5 files changed, 2591 deletions(-) delete mode 100644 training/README.md delete mode 100644 training/graphs/defog.json delete mode 100644 training/graphs/tpc.json delete mode 100644 training/pydough_corpus.csv delete mode 100644 training/scrap.py diff --git a/training/README.md b/training/README.md deleted file mode 100644 index 54907546..00000000 --- a/training/README.md +++ /dev/null @@ -1,29 +0,0 @@ -# PyDough Training Module - -This module contains information that can be used to aid in training an LLM for PyDough. - -## Corpus Format - -A training set of PyDough question-answer pairs is located in `pydough_corpus.csv`. The data format is as follows: -- `graph`: The name of the graph being used to explore the data. -- `question`: The text for the question being asked. -- `valid`: Whether the question has a valid PyDough answer (Y or N). -- `output`: - * If valid: the PyDough code to answer the question. The final answer should be a PyDough variable stored in a variable named `"result"`. - * If invalid: an error message explaining why the question cannot be answered by PyDough. -- `sql_text` (optional): An equivalent SQL query that solves the same question. -- `sql_dialect`: The dialect of SQL used by the code in `sql_text` (`sqlite`, `snowflake`, etc.). -- `is_benchmark`: Whether the question is a benchmark question, and therefore should NOT be used in LLM training (Y or N). - -All of the graphs used by the training set are stored one of the JSON files in the `graphs` directory. 
- -## Data Included - -The following knowledge bases and queries are included: - -- TPCH: the knowledge base used to describe the TPC-H schema. - - Specific implementations of all 21 TPC-H queries. - - Several additional custom queries using the same schema. -- Broker: the BROKER schema from the defog.ai database. - - 7/10 of the basic questions from defog.ai in this schema. - - 6/16 of the advanced questions from defog.ai in this schema. diff --git a/training/graphs/defog.json b/training/graphs/defog.json deleted file mode 100644 index 715cfd70..00000000 --- a/training/graphs/defog.json +++ /dev/null @@ -1,1796 +0,0 @@ -{ - "Broker": { - "Customers": { - "type": "simple_table", - "table_path": "main.sbCustomer", - "unique_properties": ["_id"], - "properties": { - "_id": { - "type": "table_column", - "column_name": "sbCustId", - "data_type": "string" - }, - "name": { - "type": "table_column", - "column_name": "sbCustName", - "data_type": "string" - }, - "email": { - "type": "table_column", - "column_name": "sbCustEmail", - "data_type": "string" - }, - "phone": { - "type": "table_column", - "column_name": "sbCustPhone", - "data_type": "string" - }, - "address1": { - "type": "table_column", - "column_name": "sbCustAddress1", - "data_type": "string" - }, - "address2": { - "type": "table_column", - "column_name": "sbCustAddress2", - "data_type": "string" - }, - "city": { - "type": "table_column", - "column_name": "sbCustCity", - "data_type": "string" - }, - "state": { - "type": "table_column", - "column_name": "sbCustState", - "data_type": "string" - }, - "country": { - "type": "table_column", - "column_name": "sbCustCountry", - "data_type": "string" - }, - "postal_code": { - "type": "table_column", - "column_name": "sbCustPostalCode", - "data_type": "string" - }, - "join_date": { - "type": "table_column", - "column_name": "sbCustJoinDate", - "data_type": "date" - }, - "status": { - "type": "table_column", - "column_name": "sbCustStatus", - "data_type": "string" - }, - 
"transactions_made": { - "type": "simple_join", - "other_collection_name": "Transactions", - "singular": false, - "no_collisions": true, - "keys": { - "_id": [ - "customer_id" - ] - }, - "reverse_relationship_name": "customer" - } - } - }, - "Tickers": { - "type": "simple_table", - "table_path": "main.sbTicker", - "unique_properties": ["_id"], - "properties": { - "_id": { - "type": "table_column", - "column_name": "sbTickerId", - "data_type": "string" - }, - "symbol": { - "type": "table_column", - "column_name": "sbTickerSymbol", - "data_type": "string" - }, - "name": { - "type": "table_column", - "column_name": "sbTickerName", - "data_type": "string" - }, - "ticker_type": { - "type": "table_column", - "column_name": "sbTickerType", - "data_type": "string" - }, - "exchange": { - "type": "table_column", - "column_name": "sbTickerExchange", - "data_type": "string" - }, - "currency": { - "type": "table_column", - "column_name": "sbTickerCurrency", - "data_type": "string" - }, - "db2x": { - "type": "table_column", - "column_name": "sbTickerDb2x", - "data_type": "string" - }, - "is_active": { - "type": "table_column", - "column_name": "sbTickerIsActive", - "data_type": "bool" - }, - "transactions_of": { - "type": "simple_join", - "other_collection_name": "Transactions", - "singular": false, - "no_collisions": true, - "keys": { - "_id": [ - "ticker_id" - ] - }, - "reverse_relationship_name": "ticker" - } - } - }, - "DailyPrices": { - "type": "simple_table", - "table_path": "main.sbDailyPrice", - "unique_properties": [["ticker_id", "date"]], - "properties": { - "ticker_id": { - "type": "table_column", - "column_name": "sbDpTickerId", - "data_type": "string" - }, - "date": { - "type": "table_column", - "column_name": "sbDpDate", - "data_type": "date" - }, - "open": { - "type": "table_column", - "column_name": "sbDpOpen", - "data_type": "decimal[10,2]" - }, - "high": { - "type": "table_column", - "column_name": "sbDpHigh", - "data_type": "decimal[10,2]" - }, - "low": { - 
"type": "table_column", - "column_name": "sbDpLow", - "data_type": "decimal[10,2]" - }, - "close": { - "type": "table_column", - "column_name": "sbDpClose", - "data_type": "decimal[10,2]" - }, - "volume": { - "type": "table_column", - "column_name": "sbDpVolume", - "data_type": "int64" - }, - "epoch_ms": { - "type": "table_column", - "column_name": "sbDpEpochMs", - "data_type": "int64" - }, - "source": { - "type": "table_column", - "column_name": "sbDpSource", - "data_type": "string" - }, - "ticker": { - "type": "simple_join", - "other_collection_name": "Tickers", - "singular": true, - "no_collisions": false, - "keys": { - "ticker_id": ["_id"] - }, - "reverse_relationship_name": "historical_prices" - } - } - }, - "Transactions": { - "type": "simple_table", - "table_path": "main.sbTransaction", - "unique_properties": ["transaction_id"], - "properties": { - "transaction_id": { - "type": "table_column", - "column_name": "sbTxId", - "data_type": "string" - }, - "customer_id": { - "type": "table_column", - "column_name": "sbTxCustId", - "data_type": "string" - }, - "ticker_id": { - "type": "table_column", - "column_name": "sbTxTickerId", - "data_type": "string" - }, - "date_time": { - "type": "table_column", - "column_name": "sbTxDateTime", - "data_type": "timestamp[3]" - }, - "transaction_type": { - "type": "table_column", - "column_name": "sbTxType", - "data_type": "string" - }, - "shares": { - "type": "table_column", - "column_name": "sbTxShares", - "data_type": "decimal[10,2]" - }, - "price": { - "type": "table_column", - "column_name": "sbTxPrice", - "data_type": "decimal[10,2]" - }, - "amount": { - "type": "table_column", - "column_name": "sbTxAmount", - "data_type": "decimal[10,2]" - }, - "currency": { - "type": "table_column", - "column_name": "sbTxCcy", - "data_type": "string" - }, - "tax": { - "type": "table_column", - "column_name": "sbTxTax", - "data_type": "decimal[10,2]" - }, - "commission": { - "type": "table_column", - "column_name": "sbTxCommission", - 
"data_type": "decimal[10,2]" - }, - "kpx": { - "type": "table_column", - "column_name": "sbTxKpx", - "data_type": "string" - }, - "settlement_date_str": { - "type": "table_column", - "column_name": "sbTxSettlementDateStr", - "data_type": "string" - }, - "status": { - "type": "table_column", - "column_name": "sbTxStatus", - "data_type": "string" - } - } - } - }, - "Dealership": { - "Cars": { - "type": "simple_table", - "table_path": "main.cars", - "unique_properties": ["_id"], - "properties": { - "_id": { - "type": "table_column", - "column_name": "_id", - "data_type": "int32" - }, - "make": { - "type": "table_column", - "column_name": "make", - "data_type": "string" - }, - "model": { - "type": "table_column", - "column_name": "model", - "data_type": "string" - }, - "year": { - "type": "table_column", - "column_name": "year", - "data_type": "int32" - }, - "color": { - "type": "table_column", - "column_name": "color", - "data_type": "string" - }, - "vin_number": { - "type": "table_column", - "column_name": "vin_number", - "data_type": "string" - }, - "engine_type": { - "type": "table_column", - "column_name": "engine_type", - "data_type": "string" - }, - "transmission": { - "type": "table_column", - "column_name": "transmission", - "data_type": "string" - }, - "cost": { - "type": "table_column", - "column_name": "cost", - "data_type": "decimal[10,2]" - }, - "crtd_ts": { - "type": "table_column", - "column_name": "crtd_ts", - "data_type": "timestamp[3]" - }, - "sale_records": { - "type": "simple_join", - "other_collection_name": "Sales", - "singular": false, - "no_collisions": true, - "keys": { - "_id": ["car_id"] - }, - "reverse_relationship_name": "car" - }, - "inventory_snapshots": { - "type": "simple_join", - "other_collection_name": "InventorySnapshots", - "singular": false, - "no_collisions": true, - "keys": { - "_id": ["car_id"] - }, - "reverse_relationship_name": "car" - } - } - }, - "Salespersons": { - "type": "simple_table", - "table_path": 
"main.salespersons", - "unique_properties": ["_id"], - "properties": { - "_id": { - "type": "table_column", - "column_name": "_id", - "data_type": "int32" - }, - "first_name": { - "type": "table_column", - "column_name": "first_name", - "data_type": "string" - }, - "last_name": { - "type": "table_column", - "column_name": "last_name", - "data_type": "string" - }, - "email": { - "type": "table_column", - "column_name": "email", - "data_type": "string" - }, - "phone": { - "type": "table_column", - "column_name": "phone", - "data_type": "string" - }, - "hire_date": { - "type": "table_column", - "column_name": "hire_date", - "data_type": "date" - }, - "termination_date": { - "type": "table_column", - "column_name": "termination_date", - "data_type": "date" - }, - "crtd_ts": { - "type": "table_column", - "column_name": "crtd_ts", - "data_type": "timestamp[3]" - }, - "sales_made": { - "type": "simple_join", - "other_collection_name": "Sales", - "singular": false, - "no_collisions": true, - "keys": { - "_id": ["salesperson_id"] - }, - "reverse_relationship_name": "salesperson" - } - } - }, - "Customers": { - "type": "simple_table", - "table_path": "main.customers", - "unique_properties": ["_id"], - "properties": { - "_id": { - "type": "table_column", - "column_name": "_id", - "data_type": "int32" - }, - "first_name": { - "type": "table_column", - "column_name": "first_name", - "data_type": "string" - }, - "last_name": { - "type": "table_column", - "column_name": "last_name", - "data_type": "string" - }, - "email": { - "type": "table_column", - "column_name": "email", - "data_type": "string" - }, - "phone": { - "type": "table_column", - "column_name": "phone", - "data_type": "string" - }, - "address": { - "type": "table_column", - "column_name": "address", - "data_type": "string" - }, - "city": { - "type": "table_column", - "column_name": "city", - "data_type": "string" - }, - "state": { - "type": "table_column", - "column_name": "state", - "data_type": "string" - }, - 
"zip_code": { - "type": "table_column", - "column_name": "zip_code", - "data_type": "string" - }, - "crtd_ts": { - "type": "table_column", - "column_name": "crtd_ts", - "data_type": "timestamp[3]" - }, - "car_purchases": { - "type": "simple_join", - "other_collection_name": "Sales", - "singular": false, - "no_collisions": true, - "keys": { - "_id": ["customer_id"] - }, - "reverse_relationship_name": "customer" - } - } - }, - "PaymentsReceived": { - "type": "simple_table", - "table_path": "main.payments_received", - "unique_properties": ["_id"], - "properties": { - "_id": { - "type": "table_column", - "column_name": "_id", - "data_type": "int32" - }, - "sale_id": { - "type": "table_column", - "column_name": "sale_id", - "data_type": "int32" - }, - "payment_date": { - "type": "table_column", - "column_name": "payment_date", - "data_type": "date" - }, - "payment_amount": { - "type": "table_column", - "column_name": "payment_amount", - "data_type": "decimal[10,2]" - }, - "payment_method": { - "type": "table_column", - "column_name": "payment_method", - "data_type": "string" - }, - "crtd_ts": { - "type": "table_column", - "column_name": "crtd_ts", - "data_type": "timestamp[3]" - } - } - }, - "Sales": { - "type": "simple_table", - "table_path": "main.sales", - "unique_properties": ["_id"], - "properties": { - "_id": { - "type": "table_column", - "column_name": "_id", - "data_type": "int32" - }, - "car_id": { - "type": "table_column", - "column_name": "car_id", - "data_type": "int32" - }, - "salesperson_id": { - "type": "table_column", - "column_name": "salesperson_id", - "data_type": "int32" - }, - "customer_id": { - "type": "table_column", - "column_name": "customer_id", - "data_type": "int32" - }, - "sale_price": { - "type": "table_column", - "column_name": "sale_price", - "data_type": "decimal[10,2]" - }, - "sale_date": { - "type": "table_column", - "column_name": "sale_date", - "data_type": "date" - }, - "crtd_ts": { - "type": "table_column", - "column_name": 
"crtd_ts", - "data_type": "timestamp[3]" - }, - "payment": { - "type": "simple_join", - "other_collection_name": "PaymentsReceived", - "singular": false, - "no_collisions": true, - "keys": { - "_id": ["sale_id"] - }, - "reverse_relationship_name": "sale_record" - } - } - }, - "InventorySnapshots": { - "type": "simple_table", - "table_path": "main.inventory_snapshots", - "unique_properties": ["_id"], - "properties": { - "_id": { - "type": "table_column", - "column_name": "_id", - "data_type": "int32" - }, - "snapshot_date": { - "type": "table_column", - "column_name": "snapshot_date", - "data_type": "date" - }, - "car_id": { - "type": "table_column", - "column_name": "car_id", - "data_type": "int32" - }, - "is_in_inventory": { - "type": "table_column", - "column_name": "is_in_inventory", - "data_type": "bool" - }, - "crtd_ts": { - "type": "table_column", - "column_name": "crtd_ts", - "data_type": "timestamp[3]" - } - } - } - }, - "DermTreatment": { - "Doctors": { - "type": "simple_table", - "table_path": "main.doctors", - "unique_properties": ["doc_id"], - "properties": { - "doc_id": { - "type": "table_column", - "column_name": "doc_id", - "data_type": "int32" - }, - "first_name": { - "type": "table_column", - "column_name": "first_name", - "data_type": "string" - }, - "last_name": { - "type": "table_column", - "column_name": "last_name", - "data_type": "string" - }, - "speciality": { - "type": "table_column", - "column_name": "specialty", - "data_type": "string" - }, - "year_reg": { - "type": "table_column", - "column_name": "year_reg", - "data_type": "int32" - }, - "med_school_name": { - "type": "table_column", - "column_name": "med_school_name", - "data_type": "string" - }, - "loc_city": { - "type": "table_column", - "column_name": "loc_city", - "data_type": "string" - }, - "loc_state": { - "type": "table_column", - "column_name": "loc_state", - "data_type": "string" - }, - "loc_zip": { - "type": "table_column", - "column_name": "loc_zip", - "data_type": "string" 
- }, - "bd_cert_num": { - "type": "table_column", - "column_name": "bd_cert_num", - "data_type": "string" - } - } - }, - "Patients": { - "type": "simple_table", - "table_path": "main.patients", - "unique_properties": ["patient_id"], - "properties": { - "patient_id": { - "type": "table_column", - "column_name": "patient_id", - "data_type": "int32" - }, - "first_name": { - "type": "table_column", - "column_name": "first_name", - "data_type": "string" - }, - "last_name": { - "type": "table_column", - "column_name": "last_name", - "data_type": "string" - }, - "date_of_birth": { - "type": "table_column", - "column_name": "date_of_birth", - "data_type": "date" - }, - "date_of_registration": { - "type": "table_column", - "column_name": "date_of_registration", - "data_type": "date" - }, - "gender": { - "type": "table_column", - "column_name": "gender", - "data_type": "string" - }, - "email": { - "type": "table_column", - "column_name": "email", - "data_type": "string" - }, - "phone": { - "type": "table_column", - "column_name": "phone", - "data_type": "string" - }, - "addr_city": { - "type": "table_column", - "column_name": "addr_city", - "data_type": "string" - }, - "addr_state": { - "type": "table_column", - "column_name": "addr_state", - "data_type": "string" - }, - "addr_zip": { - "type": "table_column", - "column_name": "addr_zip", - "data_type": "string" - }, - "ins_type": { - "type": "table_column", - "column_name": "ins_type", - "data_type": "string" - }, - "ins_policy_num": { - "type": "table_column", - "column_name": "ins_policy_num", - "data_type": "string" - }, - "height_cm": { - "type": "table_column", - "column_name": "height_cm", - "data_type": "float64" - }, - "weight_kg": { - "type": "table_column", - "column_name": "weight_kg", - "data_type": "float64" - } - } - }, - "Drugs": { - "type": "simple_table", - "table_path": "main.drugs", - "unique_properties": ["drug_id"], - "properties": { - "drug_id": { - "type": "table_column", - "column_name": "drug_id", - 
"data_type": "int32" - }, - "drug_name": { - "type": "table_column", - "column_name": "drug_name", - "data_type": "string" - }, - "manufacturer": { - "type": "table_column", - "column_name": "manufacturer", - "data_type": "string" - }, - "drug_type": { - "type": "table_column", - "column_name": "drug_type", - "data_type": "string" - }, - "moa": { - "type": "table_column", - "column_name": "moa", - "data_type": "string" - }, - "fda_appr_dt": { - "type": "table_column", - "column_name": "fda_appr_dt", - "data_type": "date" - }, - "admin_route": { - "type": "table_column", - "column_name": "admin_route", - "data_type": "string" - }, - "dos_amt": { - "type": "table_column", - "column_name": "dos_amt", - "data_type": "decimal[10,2]" - }, - "dos_unit": { - "type": "table_column", - "column_name": "dos_unit", - "data_type": "string" - }, - "dos_freq_hrs": { - "type": "table_column", - "column_name": "dos_freq_hrs", - "data_type": "int32" - }, - "ndc": { - "type": "table_column", - "column_name": "ndc", - "data_type": "string" - } - } - }, - "Diagnoses": { - "type": "simple_table", - "table_path": "main.diagnoses", - "unique_properties": ["diag_id"], - "properties": { - "diag_id": { - "type": "table_column", - "column_name": "diag_id", - "data_type": "int32" - }, - "diag_code": { - "type": "table_column", - "column_name": "diag_code", - "data_type": "string" - }, - "diag_name": { - "type": "table_column", - "column_name": "diag_name", - "data_type": "string" - }, - "diag_desc": { - "type": "table_column", - "column_name": "diag_desc", - "data_type": "string" - } - } - }, - "Treatments": { - "type": "simple_table", - "table_path": "main.treatments", - "unique_properties": ["treatment_id"], - "properties": { - "treatment_id": { - "type": "table_column", - "column_name": "treatment_id", - "data_type": "int32" - }, - "patient_id": { - "type": "table_column", - "column_name": "patient_id", - "data_type": "int32" - }, - "doc_id": { - "type": "table_column", - "column_name": 
"doc_id", - "data_type": "int32" - }, - "drug_id": { - "type": "table_column", - "column_name": "drug_id", - "data_type": "int32" - }, - "diag_id": { - "type": "table_column", - "column_name": "diag_id", - "data_type": "int32" - }, - "start_dt": { - "type": "table_column", - "column_name": "start_dt", - "data_type": "date" - }, - "end_dt": { - "type": "table_column", - "column_name": "end_dt", - "data_type": "date" - }, - "is_placebo": { - "type": "table_column", - "column_name": "is_placebo", - "data_type": "bool" - }, - "tot_drug_amt": { - "type": "table_column", - "column_name": "tot_drug_amt", - "data_type": "decimal[10,2]" - }, - "drug_unit": { - "type": "table_column", - "column_name": "drug_unit", - "data_type": "string" - }, - "doctor": { - "type": "simple_join", - "other_collection_name": "Doctors", - "singular": true, - "no_collisions": false, - "keys": { - "doc_id": ["doc_id"] - }, - "reverse_relationship_name": "prescribed_treatments" - }, - "patient": { - "type": "simple_join", - "other_collection_name": "Patients", - "singular": true, - "no_collisions": false, - "keys": { - "patient_id": [ - "patient_id" - ] - }, - "reverse_relationship_name": "treatments_received" - }, - "drug": { - "type": "simple_join", - "other_collection_name": "Drugs", - "singular": true, - "no_collisions": false, - "keys": { - "drug_id": [ - "drug_id" - ] - }, - "reverse_relationship_name": "treatments_used_in" - }, - "diagnosis": { - "type": "simple_join", - "other_collection_name": "Diagnoses", - "singular": true, - "no_collisions": false, - "keys": { - "diag_id": ["diag_id"] - }, - "reverse_relationship_name": "treatments_for" - }, - "outcome_records": { - "type": "simple_join", - "other_collection_name": "Outcomes", - "singular": false, - "no_collisions": true, - "keys": { - "treatment_id": ["treatment_id"] - }, - "reverse_relationship_name": "treatment" - }, - "concomitant_meds": { - "type": "simple_join", - "other_collection_name": "ConcomitantMeds", - "singular": false, 
- "no_collisions": true, - "keys": { - "treatment_id": [ - "treatment_id" - ] - }, - "reverse_relationship_name": "treatment" - }, - "adverse_events": { - "type": "simple_join", - "other_collection_name": "AdverseEvents", - "singular": false, - "no_collisions": true, - "keys": { - "treatment_id": [ - "treatment_id" - ] - }, - "reverse_relationship_name": "treatment" - } - } - }, - "Outcomes": { - "type": "simple_table", - "table_path": "main.outcomes", - "unique_properties": ["outcome_id"], - "properties": { - "outcome_id": { - "type": "table_column", - "column_name": "outcome_id", - "data_type": "int32" - }, - "treatment_id": { - "type": "table_column", - "column_name": "treatment_id", - "data_type": "int32" - }, - "assess_dt": { - "type": "table_column", - "column_name": "assess_dt", - "data_type": "date" - }, - "day7_lesion_cnt": { - "type": "table_column", - "column_name": "day7_lesion_cnt", - "data_type": "int32" - }, - "day30_lesion_cnt": { - "type": "table_column", - "column_name": "day30_lesion_cnt", - "data_type": "int32" - }, - "day100_lesion_cnt": { - "type": "table_column", - "column_name": "day100_lesion_cnt", - "data_type": "int32" - }, - "day7_pasi_score": { - "type": "table_column", - "column_name": "day7_pasi_score", - "data_type": "decimal[4,1]" - }, - "day30_pasi_score": { - "type": "table_column", - "column_name": "day30_pasi_score", - "data_type": "decimal[4,1]" - }, - "day100_pasi_score": { - "type": "table_column", - "column_name": "day100_pasi_score", - "data_type": "decimal[4,1]" - }, - "day7_tewl": { - "type": "table_column", - "column_name": "day7_tewl", - "data_type": "decimal[5,2]" - }, - "day30_tewl": { - "type": "table_column", - "column_name": "day30_tewl", - "data_type": "decimal[5,2]" - }, - "day100_tewl": { - "type": "table_column", - "column_name": "day100_tewl", - "data_type": "decimal[5,2]" - }, - "day7_itch_vas": { - "type": "table_column", - "column_name": "day7_itch_vas", - "data_type": "int32" - }, - "day30_itch_vas": { - 
"type": "table_column", - "column_name": "day30_itch_vas", - "data_type": "int32" - }, - "day100_itch_vas": { - "type": "table_column", - "column_name": "day100_itch_vas", - "data_type": "int32" - }, - "day7_hfg": { - "type": "table_column", - "column_name": "day7_hfg", - "data_type": "decimal[4,1]" - }, - "day30_hfg": { - "type": "table_column", - "column_name": "day30_hfg", - "data_type": "decimal[4,1]" - }, - "day100_hfg": { - "type": "table_column", - "column_name": "day100_hfg", - "data_type": "decimal[4,1]" - } - } - }, - "ConcomitantMeds": { - "type": "simple_table", - "table_path": "main.concomitant_meds", - "unique_properties": ["_id"], - "properties": { - "_id": { - "type": "table_column", - "column_name": "_id", - "data_type": "int32" - }, - "treatment_id": { - "type": "table_column", - "column_name": "treatment_id", - "data_type": "int32" - }, - "med_name": { - "type": "table_column", - "column_name": "med_name", - "data_type": "string" - }, - "start_dt": { - "type": "table_column", - "column_name": "start_dt", - "data_type": "string" - }, - "end_dt": { - "type": "table_column", - "column_name": "end_dt", - "data_type": "string" - }, - "dose_amt": { - "type": "table_column", - "column_name": "dose_amt", - "data_type": "decimal[10,2]" - }, - "dose_unit": { - "type": "table_column", - "column_name": "dose_unit", - "data_type": "string" - }, - "freq_hrs": { - "type": "table_column", - "column_name": "freq_hrs", - "data_type": "int32" - } - } - }, - "AdverseEvents": { - "type": "simple_table", - "table_path": "main.adverse_events", - "unique_properties": ["_id"], - "properties": { - "_id": { - "type": "table_column", - "column_name": "_id", - "data_type": "int32" - }, - "treatment_id": { - "type": "table_column", - "column_name": "treatment_id", - "data_type": "int32" - }, - "reported_dt": { - "type": "table_column", - "column_name": "reported_dt", - "data_type": "date" - }, - "description": { - "type": "table_column", - "column_name": "description", - 
"data_type": "string" - } - } - } - }, - "Ewallet": { - "Users": { - "type": "simple_table", - "table_path": "main.users", - "unique_properties": ["uid", "username"], - "properties": { - "uid": { - "type": "table_column", - "column_name": "uid", - "data_type": "int64" - }, - "username": { - "type": "table_column", - "column_name": "username", - "data_type": "string" - }, - "email": { - "type": "table_column", - "column_name": "email", - "data_type": "string" - }, - "phone_number": { - "type": "table_column", - "column_name": "phone_number", - "data_type": "string" - }, - "created_at": { - "type": "table_column", - "column_name": "created_at", - "data_type": "timestamp[3]" - }, - "last_login_at": { - "type": "table_column", - "column_name": "last_login_at", - "data_type": "timestamp[3]" - }, - "user_type": { - "type": "table_column", - "column_name": "user_type", - "data_type": "string" - }, - "status": { - "type": "table_column", - "column_name": "status", - "data_type": "string" - }, - "country": { - "type": "table_column", - "column_name": "country", - "data_type": "string" - }, - "address_billing": { - "type": "table_column", - "column_name": "address_billing", - "data_type": "string" - }, - "address_delivery": { - "type": "table_column", - "column_name": "address_delivery", - "data_type": "string" - }, - "kyc_status": { - "type": "table_column", - "column_name": "kyc_status", - "data_type": "string" - }, - "kyc_verified_at": { - "type": "table_column", - "column_name": "kyc_verified_at", - "data_type": "timestamp[3]" - }, - "transactions_sent": { - "type": "simple_join", - "other_collection_name": "Transactions", - "singular": false, - "no_collisions": true, - "keys": { - "uid": ["sender_id"] - }, - "reverse_relationship_name": "sending_user" - }, - "transactions_received": { - "type": "simple_join", - "other_collection_name": "Transactions", - "singular": false, - "no_collisions": true, - "keys": { - "uid": ["receiver_id"] - }, - "reverse_relationship_name": 
"receiving_user" - }, - "balances": { - "type": "simple_join", - "other_collection_name": "UserBalances", - "singular": false, - "no_collisions": true, - "keys": { - "uid": ["user_id"] - }, - "reverse_relationship_name": "user" - }, - "notifications": { - "type": "simple_join", - "other_collection_name": "Notifications", - "singular": false, - "no_collisions": true, - "keys": { - "uid": ["user_id"] - }, - "reverse_relationship_name": "user" - }, - "sessions": { - "type": "simple_join", - "other_collection_name": "UserSessions", - "singular": false, - "no_collisions": true, - "keys": { - "uid": ["user_id"] - }, - "reverse_relationship_name": "user" - }, - "setting_snapshots": { - "type": "simple_join", - "other_collection_name": "UserSettingSnapshots", - "singular": false, - "no_collisions": true, - "keys": { - "uid": ["user_id"] - }, - "reverse_relationship_name": "user" - } - } - }, - "Merchants": { - "type": "simple_table", - "table_path": "main.merchants", - "unique_properties": ["mid"], - "properties": { - "mid": { - "type": "table_column", - "column_name": "mid", - "data_type": "int64" - }, - "name": { - "type": "table_column", - "column_name": "name", - "data_type": "string" - }, - "description": { - "type": "table_column", - "column_name": "description", - "data_type": "string" - }, - "website_url": { - "type": "table_column", - "column_name": "website_url", - "data_type": "string" - }, - "logo_url": { - "type": "table_column", - "column_name": "logo_url", - "data_type": "string" - }, - "created_at": { - "type": "table_column", - "column_name": "created_at", - "data_type": "timestamp[3]" - }, - "country": { - "type": "table_column", - "column_name": "country", - "data_type": "string" - }, - "state": { - "type": "table_column", - "column_name": "state", - "data_type": "string" - }, - "city": { - "type": "table_column", - "column_name": "city", - "data_type": "string" - }, - "postal_code": { - "type": "table_column", - "column_name": "postal_code", - 
"data_type": "string" - }, - "address": { - "type": "table_column", - "column_name": "address", - "data_type": "string" - }, - "status": { - "type": "table_column", - "column_name": "status", - "data_type": "string" - }, - "category": { - "type": "table_column", - "column_name": "category", - "data_type": "string" - }, - "sub_category": { - "type": "table_column", - "column_name": "sub_category", - "data_type": "string" - }, - "mcc": { - "type": "table_column", - "column_name": "mcc", - "data_type": "int32" - }, - "contact_name": { - "type": "table_column", - "column_name": "contact_name", - "data_type": "string" - }, - "contact_email": { - "type": "table_column", - "column_name": "contact_email", - "data_type": "string" - }, - "contact_phone": { - "type": "table_column", - "column_name": "contact_phone", - "data_type": "string" - }, - "transactions_sent": { - "type": "simple_join", - "other_collection_name": "Transactions", - "singular": false, - "no_collisions": true, - "keys": { - "mid": ["sender_id"] - }, - "reverse_relationship_name": "sending_merchant" - }, - "transactions_received": { - "type": "simple_join", - "other_collection_name": "Transactions", - "singular": false, - "no_collisions": true, - "keys": { - "mid": ["receiver_id"] - }, - "reverse_relationship_name": "receiving_merchant" - }, - "balances": { - "type": "simple_join", - "other_collection_name": "MerchantBalances", - "singular": false, - "no_collisions": true, - "keys": { - "mid": ["merchant_id"] - }, - "reverse_relationship_name": "merchant" - }, - "coupons": { - "type": "simple_join", - "other_collection_name": "Coupons", - "singular": false, - "no_collisions": true, - "keys": { - "mid": ["merchant_id"] - }, - "reverse_relationship_name": "merchant" - } - } - }, - "Coupons": { - "type": "simple_table", - "table_path": "main.coupons", - "unique_properties": ["cid"], - "properties": { - "cid": { - "type": "table_column", - "column_name": "cid", - "data_type": "int64" - }, - "merchant_id": { - 
"type": "table_column", - "column_name": "merchant_id", - "data_type": "int64" - }, - "code": { - "type": "table_column", - "column_name": "code", - "data_type": "string" - }, - "description": { - "type": "table_column", - "column_name": "description", - "data_type": "string" - }, - "start_date": { - "type": "table_column", - "column_name": "start_date", - "data_type": "date" - }, - "end_date": { - "type": "table_column", - "column_name": "end_date", - "data_type": "date" - }, - "discount_type": { - "type": "table_column", - "column_name": "discount_type", - "data_type": "string" - }, - "discount_value": { - "type": "table_column", - "column_name": "discount_value", - "data_type": "decimal[10,2]" - }, - "min_purchase_amount": { - "type": "table_column", - "column_name": "min_purchase_amount", - "data_type": "decimal[10,2]" - }, - "max_discount_amount": { - "type": "table_column", - "column_name": "max_discount_amount", - "data_type": "decimal[10,2]" - }, - "redemption_limit": { - "type": "table_column", - "column_name": "redemption_limit", - "data_type": "int32" - }, - "status": { - "type": "table_column", - "column_name": "status", - "data_type": "string" - }, - "created_at": { - "type": "table_column", - "column_name": "created_at", - "data_type": "timestamp[3]" - }, - "updated_at": { - "type": "table_column", - "column_name": "updated_at", - "data_type": "timestamp[3]" - }, - "transaction_used_in": { - "type": "simple_join", - "other_collection_name": "Transactions", - "singular": false, - "no_collisions": true, - "keys": { - "cid": ["coupon_id"] - }, - "reverse_relationship_name": "coupon" - } - } - }, - "Transactions": { - "type": "simple_table", - "table_path": "main.wallet_transactions_daily", - "unique_properties": ["txid"], - "properties": { - "txid": { - "type": "table_column", - "column_name": "txid", - "data_type": "int32" - }, - "sender_id": { - "type": "table_column", - "column_name": "sender_id", - "data_type": "int64" - }, - "sender_type": { - 
"type": "table_column", - "column_name": "sender_type", - "data_type": "int32" - }, - "receiver_id": { - "type": "table_column", - "column_name": "receiver_id", - "data_type": "int64" - }, - "receiver_type": { - "type": "table_column", - "column_name": "receiver_type", - "data_type": "int32" - }, - "amount": { - "type": "table_column", - "column_name": "amount", - "data_type": "decimal[10,2]" - }, - "status": { - "type": "table_column", - "column_name": "status", - "data_type": "string" - }, - "transaction_type": { - "type": "table_column", - "column_name": "type", - "data_type": "string" - }, - "description": { - "type": "table_column", - "column_name": "description", - "data_type": "string" - }, - "coupon_id": { - "type": "table_column", - "column_name": "coupon_id", - "data_type": "int64" - }, - "created_at": { - "type": "table_column", - "column_name": "created_at", - "data_type": "timestamp[3]" - }, - "completed_at": { - "type": "table_column", - "column_name": "completed_at", - "data_type": "timestamp[3]" - }, - "transaction_ref": { - "type": "table_column", - "column_name": "transaction_ref", - "data_type": "string" - }, - "gateway_name": { - "type": "table_column", - "column_name": "gateway_name", - "data_type": "string" - }, - "gateway_ref": { - "type": "table_column", - "column_name": "gateway_ref", - "data_type": "string" - }, - "device_id": { - "type": "table_column", - "column_name": "device_id", - "data_type": "string" - }, - "ip_address": { - "type": "table_column", - "column_name": "ip_address", - "data_type": "string" - }, - "user_agent": { - "type": "table_column", - "column_name": "user_agent", - "data_type": "string" - } - } - }, - "UserBalances": { - "type": "simple_table", - "table_path": "main.wallet_user_balance_daily", - "unique_properties": ["user_id","updated_at"], - "properties": { - "user_id": { - "type": "table_column", - "column_name": "user_id", - "data_type": "int64" - }, - "balance": { - "type": "table_column", - "column_name": 
"balance", - "data_type": "decimal[10,2]" - }, - "updated_at": { - "type": "table_column", - "column_name": "updated_at", - "data_type": "timestamp[3]" - } - } - }, - "MerchantBalances": { - "type": "simple_table", - "table_path": "main.wallet_merchant_balance_daily", - "unique_properties": ["merchant_id","updated_at"], - "properties": { - "merchant_id": { - "type": "table_column", - "column_name": "merchant_id", - "data_type": "int64" - }, - "balance": { - "type": "table_column", - "column_name": "balance", - "data_type": "decimal[10,2]" - }, - "updated_at": { - "type": "table_column", - "column_name": "updated_at", - "data_type": "timestamp[3]" - } - } - }, - "Notifications": { - "type": "simple_table", - "table_path": "main.notifications", - "unique_properties": ["_id"], - "properties": { - "_id": { - "type": "table_column", - "column_name": "_id", - "data_type": "int32" - }, - "user_id": { - "type": "table_column", - "column_name": "user_id", - "data_type": "int64" - }, - "message": { - "type": "table_column", - "column_name": "message", - "data_type": "string" - }, - "notification_type": { - "type": "table_column", - "column_name": "type", - "data_type": "string" - }, - "status": { - "type": "table_column", - "column_name": "status", - "data_type": "string" - }, - "created_at": { - "type": "table_column", - "column_name": "created_at", - "data_type": "timestamp[3]" - }, - "read_at": { - "type": "table_column", - "column_name": "read_at", - "data_type": "timestamp[3]" - }, - "device_type": { - "type": "table_column", - "column_name": "device_type", - "data_type": "string" - }, - "device_id": { - "type": "table_column", - "column_name": "device_id", - "data_type": "string" - }, - "action_url": { - "type": "table_column", - "column_name": "action_url", - "data_type": "string" - } - } - }, - "UserSessions": { - "type": "simple_table", - "table_path": "main.user_sessions", - "unique_properties": ["user_id", "session_start_ts", "session_end_ts"], - "properties": { - 
"user_id": { - "type": "table_column", - "column_name": "user_id", - "data_type": "int64" - }, - "session_start_ts": { - "type": "table_column", - "column_name": "session_start_ts", - "data_type": "timestamp[3]" - }, - "session_end_ts": { - "type": "table_column", - "column_name": "session_end_ts", - "data_type": "timestamp[3]" - }, - "device_type": { - "type": "table_column", - "column_name": "device_type", - "data_type": "string" - }, - "device_id": { - "type": "table_column", - "column_name": "device_id", - "data_type": "string" - } - } - }, - "UserSettingSnapshots": { - "type": "simple_table", - "table_path": "main.user_setting_snapshot", - "unique_properties": ["user_id","snapshot_date"], - "properties": { - "user_id": { - "type": "table_column", - "column_name": "user_id", - "data_type": "int64" - }, - "snapshot_date": { - "type": "table_column", - "column_name": "snapshot_date", - "data_type": "date" - }, - "tx_limit_daily": { - "type": "table_column", - "column_name": "tx_limit_daily", - "data_type": "decimal[10,2]" - }, - "tx_limit_monthly": { - "type": "table_column", - "column_name": "tx_limit_monthly", - "data_type": "decimal[10,2]" - }, - "membership_status": { - "type": "table_column", - "column_name": "membership_status", - "data_type": "int32" - }, - "password_hash": { - "type": "table_column", - "column_name": "password_hash", - "data_type": "string" - }, - "api_key": { - "type": "table_column", - "column_name": "api_key", - "data_type": "string" - }, - "verified_devices": { - "type": "table_column", - "column_name": "verified_devices", - "data_type": "string" - }, - "verified_ips": { - "type": "table_column", - "column_name": "verified_ips", - "data_type": "string" - }, - "mfa_enabled": { - "type": "table_column", - "column_name": "mfa_enabled", - "data_type": "bool" - }, - "marketing_opt_in": { - "type": "table_column", - "column_name": "marketing_opt_in", - "data_type": "bool" - }, - "created_at": { - "type": "table_column", - "column_name": 
"created_at", - "data_type": "timestamp[3]" - } - } - } - } -} diff --git a/training/graphs/tpc.json b/training/graphs/tpc.json deleted file mode 100644 index 0ae6ad3e..00000000 --- a/training/graphs/tpc.json +++ /dev/null @@ -1,201 +0,0 @@ -{ - "TPCH": { - "regions": { - "type": "simple_table", - "table_path": "main.REGION", - "unique_properties": ["key"], - "properties": { - "key": {"type": "table_column", "column_name": "r_regionkey", "data_type": "int64"}, - "name": {"type": "table_column", "column_name": "r_name", "data_type": "string"}, - "comment": {"type": "table_column", "column_name": "r_comment", "data_type": "string"}, - "nations": { - "type": "simple_join", - "other_collection_name": "nations", - "singular": false, - "no_collisions": true, - "keys": {"key": ["region_key"]}, - "reverse_relationship_name": "region" - } - } - }, - "nations": { - "type": "simple_table", - "table_path": "main.NATION", - "unique_properties": ["key"], - "properties": { - "key": {"type": "table_column", "column_name": "n_nationkey", "data_type": "int64"}, - "name": {"type": "table_column", "column_name": "n_name", "data_type": "string"}, - "region_key": {"type": "table_column", "column_name": "n_regionkey", "data_type": "int64"}, - "comment": {"type": "table_column", "column_name": "n_comment", "data_type": "string"}, - "suppliers": { - "type": "simple_join", - "other_collection_name": "suppliers", - "singular": false, - "no_collisions": true, - "keys": {"key": ["nation_key"]}, - "reverse_relationship_name": "nation" - }, - "customers": { - "type": "simple_join", - "other_collection_name": "customers", - "singular": false, - "no_collisions": true, - "keys": {"key": ["nation_key"]}, - "reverse_relationship_name": "nation" - } - } - }, - "parts": { - "type": "simple_table", - "table_path": "main.PART", - "unique_properties": ["key"], - "properties": { - "key": {"type": "table_column", "column_name": "p_partkey", "data_type": "int64"}, - "name": {"type": "table_column", 
"column_name": "p_name", "data_type": "string"}, - "manufacturer": {"type": "table_column", "column_name": "p_mfgr", "data_type": "string"}, - "brand": {"type": "table_column", "column_name": "p_brand", "data_type": "string"}, - "part_type": {"type": "table_column", "column_name": "p_type", "data_type": "string"}, - "size": {"type": "table_column", "column_name": "p_size", "data_type": "int64"}, - "container": {"type": "table_column", "column_name": "p_container", "data_type": "string"}, - "retail_price": {"type": "table_column", "column_name": "p_retailprice", "data_type": "decimal[12,2]"}, - "comment": {"type": "table_column", "column_name": "p_comment", "data_type": "string"}, - "supply_records": { - "type": "simple_join", - "other_collection_name": "supply_records", - "singular": false, - "no_collisions": true, - "keys": {"key": ["part_key"]}, - "reverse_relationship_name": "part" - }, - "lines": { - "type": "simple_join", - "other_collection_name": "lines", - "singular": false, - "no_collisions": true, - "keys": {"key": ["part_key"]}, - "reverse_relationship_name": "part" - } - } - }, - "suppliers": { - "type": "simple_table", - "table_path": "main.SUPPLIER", - "unique_properties": ["key"], - "properties": { - "key": {"type": "table_column", "column_name": "s_suppkey", "data_type": "int64"}, - "name": {"type": "table_column", "column_name": "s_name", "data_type": "string"}, - "address": {"type": "table_column", "column_name": "s_address", "data_type": "string"}, - "nation_key": {"type": "table_column", "column_name": "s_nationkey", "data_type": "int64"}, - "phone": {"type": "table_column", "column_name": "s_phone", "data_type": "string"}, - "account_balance": {"type": "table_column", "column_name": "s_acctbal", "data_type": "decimal[12,2]"}, - "comment": {"type": "table_column", "column_name": "s_comment", "data_type": "string"}, - "supply_records": { - "type": "simple_join", - "other_collection_name": "supply_records", - "singular": false, - "no_collisions": 
true, - "keys": {"key": ["supplier_key"]}, - "reverse_relationship_name": "supplier" - }, - "lines": { - "type": "simple_join", - "other_collection_name": "lines", - "singular": false, - "no_collisions": true, - "keys": {"key": ["supplier_key"]}, - "reverse_relationship_name": "supplier" - } - } - }, - "lines": { - "type": "simple_table", - "table_path": "main.LINEITEM", - "unique_properties": [["order_key", "line_number"], ["part_key", "supplier_key", "order_key"]], - "properties": { - "order_key": {"type": "table_column", "column_name": "l_orderkey", "data_type": "int64"}, - "part_key": {"type": "table_column", "column_name": "l_partkey", "data_type": "int64"}, - "supplier_key": {"type": "table_column", "column_name": "l_suppkey", "data_type": "int64"}, - "line_number": {"type": "table_column", "column_name": "l_linenumber", "data_type": "int8"}, - "quantity": {"type": "table_column", "column_name": "l_quantity", "data_type": "decimal[12,2]"}, - "extended_price": {"type": "table_column", "column_name": "l_extendedprice", "data_type": "decimal[12,2]"}, - "discount": {"type": "table_column", "column_name": "l_discount", "data_type": "decimal[12,2]"}, - "tax": {"type": "table_column", "column_name": "l_tax", "data_type": "decimal[12,2]"}, - "status": {"type": "table_column", "column_name": "l_linestatus", "data_type": "string"}, - "ship_date": {"type": "table_column", "column_name": "l_shipdate", "data_type": "date"}, - "commit_date": {"type": "table_column", "column_name": "l_commitdate", "data_type": "date"}, - "receipt_date": {"type": "table_column", "column_name": "l_receiptdate", "data_type": "date"}, - "ship_instruct": {"type": "table_column", "column_name": "l_shipinstruct", "data_type": "string"}, - "ship_mode": {"type": "table_column", "column_name": "l_shipmode", "data_type": "string"}, - "return_flag": {"type": "table_column", "column_name": "l_returnflag", "data_type": "string"}, - "comment": {"type": "table_column", "column_name": "l_comment", 
"data_type": "string"}, - "part_and_supplier": { - "type": "simple_join", - "other_collection_name": "supply_records", - "singular": true, - "no_collisions": false, - "keys": {"part_key": ["part_key"], "supplier_key": ["supplier_key"]}, - "reverse_relationship_name": "lines" - }, - "order": { - "type": "simple_join", - "other_collection_name": "orders", - "singular": true, - "no_collisions": false, - "keys": {"order_key": ["key"]}, - "reverse_relationship_name": "lines" - } - } - }, - "supply_records": { - "type": "simple_table", - "table_path": "main.PARTSUPP", - "unique_properties": [["part_key", "supplier_key"]], - "properties": { - "part_key": {"type": "table_column", "column_name": "ps_partkey", "data_type": "int64"}, - "supplier_key": {"type": "table_column", "column_name": "ps_suppkey", "data_type": "int64"}, - "availqty": {"type": "table_column", "column_name": "ps_availqty", "data_type": "decimal[12,2]"}, - "supplycost": {"type": "table_column", "column_name": "ps_supplycost", "data_type": "decimal[12,2]"}, - "comment": {"type": "table_column", "column_name": "ps_comment", "data_type": "string"} - } - }, - "orders": { - "type": "simple_table", - "table_path": "main.ORDERS", - "unique_properties": ["key"], - "properties": { - "key": {"type": "table_column", "column_name": "o_orderkey", "data_type": "int64"}, - "customer_key": {"type": "table_column", "column_name": "o_custkey", "data_type": "int64"}, - "order_status": {"type": "table_column", "column_name": "o_orderstatus", "data_type": "decimal[12,2]"}, - "total_price": {"type": "table_column", "column_name": "o_totalprice", "data_type": "decimal[12,2]"}, - "order_date": {"type": "table_column", "column_name": "o_orderdate", "data_type": "date"}, - "order_priority": {"type": "table_column", "column_name": "o_orderpriority", "data_type": "int64"}, - "clerk": {"type": "table_column", "column_name": "o_clerk", "data_type": "int64"}, - "ship_priority": {"type": "table_column", "column_name": "o_shippriority", 
"data_type": "int64"}, - "comment": {"type": "table_column", "column_name": "o_comment", "data_type": "string"}, - "customer": { - "type": "simple_join", - "other_collection_name": "customers", - "singular": true, - "no_collisions": false, - "keys": {"customer_key": ["key"]}, - "reverse_relationship_name": "orders" - } - } - }, - "customers": { - "type": "simple_table", - "table_path": "main.CUSTOMER", - "unique_properties": ["key"], - "properties": { - "key": {"type": "table_column", "column_name": "c_custkey", "data_type": "int64"}, - "name": {"type": "table_column", "column_name": "c_name", "data_type": "string"}, - "address": {"type": "table_column", "column_name": "c_address", "data_type": "string"}, - "nation_key": {"type": "table_column", "column_name": "c_nationkey", "data_type": "int64"}, - "phone": {"type": "table_column", "column_name": "c_phone", "data_type": "string"}, - "acctbal": {"type": "table_column", "column_name": "c_acctbal", "data_type": "decimal[12,2]"}, - "mktsegment": {"type": "table_column", "column_name": "c_mktsegment", "data_type": "string"}, - "comment": {"type": "table_column", "column_name": "c_comment", "data_type": "string"} - } - } - } - } diff --git a/training/pydough_corpus.csv b/training/pydough_corpus.csv deleted file mode 100644 index 29b896cd..00000000 --- a/training/pydough_corpus.csv +++ /dev/null @@ -1,523 +0,0 @@ -id,graph,question,valid,output,sql_text,sql_dialect,is_benchmark -q001,TPCH,"Provide a summary pricing report for all line items shipped as of September 2nd of 1998. The summary should list totals for extended price, discounted extended price, discounted extended price plus tax, average quantity, average extended price, average discount, and number of lines, all broken down by the return flag & line status. 
This is also known as TPC-H query #1.",Y,"disc_price = l.extended_price * (1 - l.discount) -charge = disc_price * (1 + l.tax) -selected_lines = lines.WHERE((ship_date <= datetime.date(1998, 9, 2))) -partitioned_lines = PARTITION(selected_lines, name=""l"", by=(return_flag, status)) -result = partitioned_lines( - L_RETURNFLAG=return_flag, - L_LINESTATUS=status, - SUM_QTY=SUM(l.quantity), - SUM_BASE_PRICE=SUM(l.extended_price), - SUM_DISC_PRICE=SUM(disc_price), - SUM_CHARGE=SUM(charge), - AVG_QTY=AVG(l.quantity), - AVG_PRICE=AVG(l.extended_price), - AVG_DISC=AVG(l.discount), - COUNT_ORDER=COUNT(l), -).ORDER_BY(return_flag.ASC(), status.ASC())","select - l_returnflag, - l_linestatus, - sum(l_quantity) as sum_qty, - sum(l_extendedprice) as sum_base_price, - sum(l_extendedprice * (1 - l_discount)) as sum_disc_price, - sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge, - avg(l_quantity) as avg_qty, - avg(l_extendedprice) as avg_price, - avg(l_discount) as avg_disc, - count(*) as count_order -from - lineitem -where - l_shipdate <= date '1998-12-01' - interval '90' day -group by - l_returnflag, - l_linestatus -order by - l_returnflag, - l_linestatus",sqlite,N -q002,TPCH,"Find, within the European region, for each brass part of size 15, the supplier who can supply it at minimum cost. If several suppliers in that region offer the desired part type and size at the same (minimum) cost, list the parts from suppliers with the 100 highest account balances. List the supplier's account balance, name and nation; the part's number and manufacturer; the supplier's address, phone number and comment information. 
This is also known as TPC-H query #2.",Y,"selected_parts = ( - nations.WHERE(region.name == ""EUROPE"") - .suppliers.supply_records.part( - s_acctbal=BACK(2).account_balance, - s_name=BACK(2).name, - n_name=BACK(3).name, - s_address=BACK(2).address, - s_phone=BACK(2).phone, - s_comment=BACK(2).comment, - supplycost=BACK(1).supplycost, - ) - .WHERE(ENDSWITH(part_type, ""BRASS"") & (size == 15)) -) -part_groups = PARTITION(selected_parts, name=""p"", by=key)( - best_cost=MIN(p.supplycost) -) -result = part_groups.p.WHERE( - (supplycost == BACK(1).best_cost) - & ENDSWITH(part_type, ""BRASS"") - & (size == 15) -)( - S_ACCTBAL=s_acctbal, - S_NAME=s_name, - N_NAME=n_name, - P_PARTKEY=key, - P_MFGR=manufacturer, - S_ADDRESS=s_address, - S_PHONE=s_phone, - S_COMMENT=s_comment, -).TOP_K( - 100, - by=(S_ACCTBAL.DESC(), N_NAME.ASC(), S_NAME.ASC(), P_PARTKEY.ASC()), -)","SELECT - S_ACCTBAL, - S_NAME, - N_NAME, - P_PARTKEY, - P_MFGR, - S_ADDRESS, - S_PHONE, - S_COMMENT -FROM - PART, - SUPPLIER, - PARTSUPP, - NATION, - REGION -WHERE - P_PARTKEY = PS_PARTKEY - AND S_SUPPKEY = PS_SUPPKEY - AND P_SIZE = 15 - AND P_TYPE LIKE '%BRASS' - AND S_NATIONKEY = N_NATIONKEY - AND N_REGIONKEY = R_REGIONKEY - AND R_NAME = 'EUROPE' - AND PS_SUPPLYCOST = ( - SELECT MIN(PS_SUPPLYCOST) - FROM PARTSUPP, SUPPLIER, NATION, REGION - WHERE P_PARTKEY = PS_PARTKEY - AND S_SUPPKEY = PS_SUPPKEY - AND S_NATIONKEY = N_NATIONKEY - AND N_REGIONKEY = R_REGIONKEY - AND R_NAME = 'EUROPE' - ) -ORDER BY - S_ACCTBAL DESC, - N_NAME, - S_NAME, - P_PARTKEY -LIMIT 100",sqlite,N -q003,TPCH,"Find the 10 unshipped orders with the highest value. Retrieves the shipping priority and potential revenue, defined as the sum of the extended price times one minus the discount, of the orders having the largest revenue among those that had not been shipped as of March 15th, 1995. orders are listed in decreasing order of revenue. 
If more than 10 unshipped orders exist meeting the criteria, only the 10 orders with the largest revenue are listed. This is also known as TPC-H query #3.",Y,"cutoff_date = datetime.date(1995, 3, 15) -selected_orders = orders.WHERE( - (customer.mktsegment == ""BUILDING"") & (order_date < cutoff_date) -) -selected_lines = selected_orders.lines.WHERE(ship_date > cutoff_date)( - BACK(1).order_date, - BACK(1).ship_priority, -) -result = PARTITION( - selected_lines, name=""l"", by=(order_key, order_date, ship_priority) -)( - L_ORDERKEY=order_key, - REVENUE=SUM(l.extended_price * (1 - l.discount)), - O_ORDERDATE=order_date, - O_SHIPPRIORITY=ship_priority, -).TOP_K(10, by=(REVENUE.DESC(), O_ORDERDATE.ASC(), L_ORDERKEY.ASC()))","SELECT - L_ORDERKEY, - SUM(L_EXTENDEDPRICE * (1 - L_DISCOUNT)) AS REVENUE, - O_ORDERDATE, - O_SHIPPRIORITY -FROM - CUSTOMER, - ORDERS, - LINEITEM -WHERE - C_MKTSEGMENT = 'BUILDING' - AND C_CUSTKEY = O_CUSTKEY - AND L_ORDERKEY = O_ORDERKEY - AND O_ORDERDATE < DATE '1995-03-15' - AND L_SHIPDATE > DATE '1995-03-15' -GROUP BY - L_ORDERKEY, - O_ORDERDATE, - O_SHIPPRIORITY -ORDER BY - REVENUE DESC, - O_ORDERDATE -LIMIT 10",sqlite,N -q004,TPCH,"How well is the order priority system is working at ensuring that orders are delivered on time within the third quarter of 1993? Analyze this by counting the number of orders ordered in the given quarter of the given year in which at least one lineitem was received by the customer later than its committed date. List the count of such orders for each order priority sorted in ascending priority order. 
This is also known as TPC-H query #4.",Y,"selected_lines = lines.WHERE(commit_date < receipt_date) -selected_orders = orders.WHERE( - (order_date >= datetime.date(1993, 7, 1)) - & (order_date < datetime.date(1993, 10, 1)) - & HAS(selected_lines) -) -result = PARTITION(selected_orders, name=""o"", by=order_priority)( - O_ORDERPRIORITY=order_priority, - ORDER_COUNT=COUNT(o), -).ORDER_BY(O_ORDERPRIORITY.ASC())","SELECT - O_ORDERPRIORITY, - COUNT(*) AS ORDER_COUNT -FROM - ORDERS -WHERE - O_ORDERDATE >= DATE '1993-07-01' - AND O_ORDERDATE < DATE '1993-10-01' - AND EXISTS ( - SELECT 1 - FROM LINEITEM - WHERE L_ORDERKEY = O_ORDERKEY - AND L_COMMITDATE < L_RECEIPTDATE - ) -GROUP BY - O_ORDERPRIORITY -ORDER BY - O_ORDERPRIORITY",sqlite,N -q005,TPCH,"For each Asian country, calculate the total revenue generated by suppliers in that nation shipping a part to a customer from the same nation, only considering shipments ordered in 1994. Revenue volume for all qualifying lineitems in a particular nation is defined as sum(l_extendedprice * (1 - l_discount)). This is also known as TPC-H query #5.",Y,"selected_lines = customers.orders.WHERE( - (order_date >= datetime.date(1994, 1, 1)) - & (order_date < datetime.date(1995, 1, 1)) -).lines.WHERE(supplier.nation.name == BACK(3).name)( - value=extended_price * (1 - discount) -) -result = nations.WHERE(region.name == ""ASIA"")( - N_NAME=name, REVENUE=SUM(selected_lines.value) -).ORDER_BY(REVENUE.DESC())",,sqlite,N -q006,TPCH,"How was revenue impacted in 1994 due to the presence of certain discounts, specifically those between 0.05 and 0.07 (inclusive); speculate on future revenue due to a change in that discount. Considers all the lineitems shipped in the given year with discounts in thar range value. List the amount by which the total revenue would have increased if these discounts had been eliminated for lineitems with a quantity below 24. 
Note that the potential revenue increase is equal to the sum of the extended price times one minus the discount for all lineitems with discounts and quantities in the qualifying range. This is also known as TPC-H query #6.",Y,"selected_lines = lines.WHERE( - (ship_date >= datetime.date(1994, 1, 1)) - & (ship_date < datetime.date(1995, 1, 1)) - & (0.05 <= discount) - & (discount <= 0.07) - & (quantity < 24) -)(amt=extended_price * discount) -result = TPCH(REVENUE=SUM(selected_lines.amt))",,sqlite,N -q007,TPCH,"Calculate the value of goods shipped between France and Germany in 1995 and 1996 as a representation for future contract negotiations. Find, for these two nations, the gross discounted revenues derived from lineitems in which parts were shipped from a supplier in either nation to a customer in the other nation during the chosen years. Lists the supplier nation, the customer nation, the year, and the revenue from shipments that took place in that year. Order the answer by Supplier nation, Customer nation, and year (all ascending). This is also known as TPC-H query #7.",Y,"line_info = lines( - supp_nation=supplier.nation.name, - cust_nation=order.customer.nation.name, - l_year=YEAR(ship_date), - volume=extended_price * (1 - discount), -).WHERE( - (ship_date >= datetime.date(1995, 1, 1)) - & (ship_date <= datetime.date(1996, 12, 31)) - & ( - ((supp_nation == ""FRANCE"") & (cust_nation == ""GERMANY"")) - | ((supp_nation == ""GERMANY"") & (cust_nation == ""FRANCE"")) - ) -) -result = PARTITION(line_info, name=""l"", by=(supp_nation, cust_nation, l_year))( - SUPP_NATION=supp_nation, - CUST_NATION=cust_nation, - L_YEAR=l_year, - REVENUE=SUM(l.volume), -).ORDER_BY( - SUPP_NATION.ASC(), - CUST_NATION.ASC(), - L_YEAR.ASC(), -)",,sqlite,N -q008,TPCH,"How has the market share of Brazil within the American region has changed from 1995 to 1996 for parts of type 'ECONOMY ANODIZED STEEL'. 
The market share for a given nation within a given region is defined as the fraction of the revenue, defined as the sum of the extended price times one minus the discount, from the products of a specified type in that region that was supplied by suppliers from the given nation. This is also known as TPC-H query #8.",Y,"selected_orders = orders.WHERE( - (order_date >= datetime.date(1995, 1, 1)) - & (order_date <= datetime.date(1996, 12, 31)) - & (customer.nation.region.name == ""AMERICA"") -) - -volume = extended_price * (1 - discount) - -volume_data = selected_orders.lines.WHERE( - part.part_type == ""ECONOMY ANODIZED STEEL"" -)( - o_year=YEAR(BACK(1).order_date), - volume=volume, - brazil_volume=IFF(supplier.nation.name == ""BRAZIL"", volume, 0) -) - -result = PARTITION(volume_data, name=""v"", by=o_year)( - O_YEAR=o_year, - MKT_SHARE=SUM(v.brazil_volume) / SUM(v.volume), -)",,sqlite,N -q009,TPCH,"What was the profit made for a particular line of parts looking at the supplying nation and particular year in question, where the line of parts is green parts. Specifically, for each nation and each year, find the profit for all parts ordered in that year that contain 'green' in their names and that were filled by a supplier in that nation. The profit is defined as the sum of the extended price times one minus the discount, minus the product of the supply cost and quantity, for all lineitems describing parts in the specified line. List the nations in ascending alphabetical order and, for each nation, the year and profit in descending order by year (most recent first). 
This is also known as TPC-H query #9.",Y,"selected_lines = nations.suppliers.supply_records.WHERE( - CONTAINS(part.name, ""green"") -).lines( - nation=BACK(3).name, - o_year=YEAR(order.order_date), - value=extended_price * (1 - discount) - BACK(1).supplycost * quantity, -) - -result = PARTITION(selected_lines, name=""l"", by=(nation, o_year))( - NATION=nation, O_YEAR=o_year, AMOUNT=SUM(l.value) -).ORDER_BY(NATION.ASC(), O_YEAR.DESC())",,sqlite,N -q010,TPCH,"This question determines the top 20 customers based on impact in revenue due to having returned parts within a particular quarter. Revenue lost is defined as the sum of the extended price times one minus the discount for all qualifying lineitems. The chosen quarter is the last 3 months of 1993. The query lists the customer's name, address, nation, phone number, account balance, comment information and revenue lost. This is also known as TPC-H query #10.",Y,"selected_lines = orders.WHERE( - (order_date >= datetime.date(1993, 10, 1)) - & (order_date < datetime.date(1994, 1, 1)) -).lines.WHERE(return_flag == 'R')(amt=extended_price * (1 - discount)) - -result = customers( - C_CUSTKEY=key, - C_NAME=name, - REVENUE=SUM(selected_lines.amt), - C_ACCTBAL=acctbal, - N_NAME=nation.name, - C_ADDRESS=address, - C_PHONE=phone, - C_COMMENT=comment, -).TOP_K(20, by=(REVENUE.DESC(), C_CUSTKEY.ASC()))",,sqlite,N -q011,TPCH,"Identify the most import subset of available part supplies from Germany. To do so, scan the available stock of German suppliers and identify all the parts that represent a significant percentage (at least 0.01%) of the total value of all the stock in Germany. Displays the part number and the value of those parts in descending order of value. The value for a given part from a supplier is defined as the product of the supply cost of the supplier for the part and the quantity that the supplier has for that product. 
This is also known as TPC-H query #11.",Y,"is_german_supplier = supplier.nation.name == ""GERMANY"" -selected_records = supply_records.WHERE(is_german_supplier)(metric=supplycost * availqty) -result = TPCH(min_market_share=SUM(selected_records.metric) * 0.0001).PARTITION( - selected_records, name=""ps"", by=part_key -)( - PS_PARTKEY=part_key, VALUE=SUM(ps.metric) -).WHERE(VALUE > BACK(1).min_market_share).ORDER_BY(VALUE.DESC())",,sqlite,N -q012,TPCH,"Determine the impact of low cost shipping modes on delays for orders of varying priorities. Do so by counting, by ship mode, for lineitems actually received by customers in 1994, the number of lineitems belonging to orders for which the receipt date exceeds the commit date for ship modes of 'MAIL' or 'SHIP'. Only lineitems that were actually shipped before the commit date are considered. The late lineitems are partitioned into two groups, those with priority `1-URGENT` or `2-HIGH`, and those with a priority other than `1-URGENT` or `2-HIGH`. This is also known as TPC-H query #12.",Y,"selected_lines = lines.WHERE( - ((ship_mode == ""MAIL"") | (ship_mode == ""SHIP"")) - & (ship_date < commit_date) - & (commit_date < receipt_date) - & (receipt_date >= datetime.date(1994, 1, 1)) - & (receipt_date < datetime.date(1995, 1, 1)) -)( - is_high_priority=(order.order_priority == ""1-URGENT"") - | (order.order_priority == ""2-HIGH""), -) -result = PARTITION(selected_lines, ""l"", by=ship_mode)( - L_SHIPMODE=ship_mode, - HIGH_LINE_COUNT=SUM(l.is_high_priority), - LOW_LINE_COUNT=SUM(~(l.is_high_priority)), -).ORDER_BY(L_SHIPMODE.ASC())",,sqlite,N -q013,TPCH,"Calculate the distribution of customers who have made 0, 1, 2, etc. special request orders made. An order is determined to be a special request if its comment contains 'special' and later contains 'requests'. 
This is also known as TPC-H query #13.",Y,"customer_info = customers( - key, - num_non_special_orders=COUNT( - orders.WHERE(~(LIKE(comment, ""%special%requests%""))) - ), -) -result = PARTITION(customer_info, name=""custs"", by=num_non_special_orders)( - C_COUNT=num_non_special_orders, CUSTDIST=COUNT(custs) -).ORDER_BY(CUSTDIST.DESC(), C_COUNT.DESC())",,sqlite,N -q014,TPCH,"What percentage of revenue from parts shipped in September of 1995 was from promotional parts (parts whose type begins with 'PROMO'). Revenue is defined as extended price times one minus the discount. This is also known as TPC-H query #14.",Y,"value = extended_price * (1 - discount) -selected_lines = lines.WHERE( - (ship_date >= datetime.date(1995, 9, 1)) - & (ship_date < datetime.date(1995, 10, 1)) -)( - value=value, - promo_value=IFF(STARTSWITH(part.part_type, ""PROMO""), value, 0), -) -result = TPCH(PROMO_REVENUE=100.0 * SUM(selected_lines.promo_value) / SUM(selected_lines.value))",,sqlite,N -q015,TPCH,"Determine the top suppliers in the first quarter of 1996 with the highest revenue from parts shipped in that time period. In case of a tie, the query lists all suppliers whose contribution was equal to the maximum, presented in supplier number order. This is also known as TPC-H query #15.",Y,"selected_lines = lines.WHERE( - (ship_date >= datetime.date(1996, 1, 1)) - & (ship_date < datetime.date(1996, 4, 1)) -) -total = SUM(selected_lines.extended_price * (1 - selected_lines.discount)) -result = TPCH( - max_revenue=MAX(suppliers(total_revenue=total).total_revenue) -).suppliers( - S_SUPPKEY=key, - S_NAME=name, - S_ADDRESS=address, - S_PHONE=phone, - TOTAL_REVENUE=total, -).WHERE(TOTAL_REVENUE == BACK(1).max_revenue).ORDER_BY(S_SUPPKEY.ASC())",,sqlite,N -q016,TPCH,"Determine how many distinct suppliers can meet the needs of a part with a particular set of attributes. 
The customer is interested in parts of eight different sizes as long as they are not of type 'MEDIUM POLISHED', not of brand 'BRAND#45', the size is one of [49, 14, 23, 45, 19, 3, 36, 9], excluding suppliers supplier who have had complaints registered at the Better Business Bureau. Break down qualifying parts by the brand, type and size. Present the results by descending count of qualifying suppliers and ascending brand, type, and size. This is also known as TPC-H query #16.",Y,"selected_records = ( - parts.WHERE( - (brand != ""BRAND#45"") - & ~STARTSWITH(part_type, ""MEDIUM POLISHED%"") - & ISIN(size, [49, 14, 23, 45, 19, 3, 36, 9]) - ) - .supply_records( - p_brand=BACK(1).brand, - p_type=BACK(1).part_type, - p_size=BACK(1).size, - ps_suppkey=supplier_key, - ) - .WHERE(~LIKE(supplier.comment, ""%Customer%Complaints%"")) -) -result = PARTITION(selected_records, name=""ps"", by=(p_brand, p_type, p_size))( - P_BRAND=p_brand, - P_TYPE=p_type, - P_SIZE=p_size, - SUPPLIER_COUNT=NDISTINCT(ps.supplier_key), -).ORDER_BY(SUPPLIER_COUNT.DESC(), P_BRAND.ASC(), P_TYPE.ASC(), P_SIZE.ASC())",,sqlite,N -q017,TPCH,"Determine how much average yearly revenue would be lost if orders were no longer filled for small quantities (quantities below 20% of the global average) of parts from brand 'BRAND#23' whose container size is 'MED BOX'. Assume that the data is spread out across 7 years when taking the average across all years. This is also known as TPC-H query #17.",Y,"selected_lines = parts.WHERE((brand == ""Brand#23"") & (container == ""MED BOX""))( - avg_quantity=AVG(lines.quantity) -).lines.WHERE(quantity < 0.2 * BACK(1).avg_quantity) -result = TPCH(AVG_YEARLY=SUM(selected_lines.extended_price) / 7.0)",,sqlite,N -q018,TPCH,"Order customers based on their status as having placed a large quantity order. Calculate a list of the top 100 instances of a customer having placed large quantity orders by the total price of the order. 
List the customer name, customer key, the order key, date and total price and the quantity for the order. Break ties in favor of the order made first chronologically. This is also known as TPC-H query #18.",Y,"result = orders( - C_NAME=customer.name, - C_CUSTKEY=customer.key, - O_ORDERKEY=key, - O_ORDERDATE=order_date, - O_TOTALPRICE=total_price, - TOTAL_QUANTITY=SUM(lines.quantity), -).WHERE( - TOTAL_QUANTITY > 300 -).TOP_K(100, by=(O_TOTALPRICE.DESC(), O_ORDERDATE.ASC()))",,sqlite,N -q019,TPCH,"Report the gross discounted revenue resulting from select parts that were shipped by air and delivered in person. More specifically, it finds the gross discounted revenue for all orders for three different types of parts that were shipped by air and delivered in person. The selected parts are those where one of the following combinations of criteria are true: - - The brand is `'BRAND#12'`, the quantity is between 1 and 11, the size is between 1 and 5, and the container type is one of `['SM CASE', 'SM BOX', 'SM PACK', 'SM PKG']`. - - The brand is `'BRAND#23'`, the quantity is between 10 and 20, the size is between 1 and 10, and the container type is one of `['MED BAG', 'MED BOX', 'MED PKG', 'MED PACK']`. - - The brand is `'BRAND#34'`, the quantity is between 20 and 30, the size is between 1 and 15, and the container type is one of `['LG CASE', 'LG BOX', 'LG PACK', 'LG PKG']`. 
-This is also known as TPC-H query #19.",Y,"selected_lines = lines.WHERE( - (ISIN(ship_mode, (""AIR"", ""AIR REG""))) - & (ship_instruct == ""DELIVER IN PERSON"") - & (part.size >= 1) - & ( - ( - (part.size <= 5) - & (quantity >= 1) - & (quantity <= 11) - & ISIN( - part.container, - (""SM CASE"", ""SM BOX"", ""SM PACK"", ""SM PKG""), - ) - & (part.brand == ""Brand#12"") - ) - | ( - (part.size <= 10) - & (quantity >= 10) - & (quantity <= 20) - & ISIN( - part.container, - (""MED BAG"", ""MED BOX"", ""MED PACK"", ""MED PKG""), - ) - & (part.brand == ""Brand#23"") - ) - | ( - (part.size <= 15) - & (quantity >= 20) - & (quantity <= 30) - & ISIN( - part.container, - (""LG CASE"", ""LG BOX"", ""LG PACK"", ""LG PKG""), - ) - & (part.brand == ""Brand#34"") - ) - ) -) -result = TPCH( - REVENUE=SUM(selected_lines.extended_price * (1 - selected_lines.discount)) -)",,sqlite,N -q020,TPCH,"List the name and address for all Canadian suppliers that may be a candidate for discounts based on their sales in 1994. Specifically, if the supplier has had at least 1 part they supply that starts with 'forest' where the supplier's excess of that part is more than 50% of the total quantity of that part purchased in 1994. Order the qualifying suppliers alphabetically. This is also known as TPC-H query #20.",Y,"part_qty = SUM( - lines.WHERE( - (ship_date >= datetime.date(1994, 1, 1)) - & (ship_date < datetime.date(1995, 1, 1)) - ).quantity -) -selected_part_supplied = supply_records.part.WHERE( - STARTSWITH(name, ""forest"") & (BACK(1).availqty > part_qty * 0.5) -) -result = suppliers( - S_NAME=name, - S_ADDRESS=address, -).WHERE((nation.name == ""CANADA"") & COUNT(selected_part_supplied) > 0).ORDER_BY(S_NAME.ASC())",,sqlite,N -q021,TPCH,"For each Saudi Arabian supplier, count how many times their product was part of a multi-supplier order (with current status of 'F') where they were the only supplier who failed to meet the committed delivery date. 
This is also known as TPC-H query #21.",Y,"date_check = receipt_date > commit_date -different_supplier = supplier_key != BACK(2).supplier_key -waiting_entries = lines.WHERE(date_check).order.WHERE( - (order_status == ""F"") - & HAS(lines.WHERE(different_supplier)) - & HASNOT(lines.WHERE(different_supplier & date_check)) -) -result = suppliers.WHERE(nation.name == ""SAUDI ARABIA"")( - S_NAME=name, - NUMWAIT=COUNT(waiting_entries), -).ORDER_BY(NUMWAIT.DESC(), S_NAME.ASC())",,sqlite,N -q022,TPCH,"Break down how many customers, by country code of their phone number (only including customers from certain country codes), have not placed an order but have an account balance that is above the average for all positive account balances of such customers. Also include the total balance for all such customers for each country code. The country codes to include are 13, 31, 23, 29, 30, 18 and 17. This is also known as TPC-H query #22.",Y,"selected_customers = customers(cntry_code=phone[:2]).WHERE( - ISIN(cntry_code, (""13"", ""31"", ""23"", ""29"", ""30"", ""18"", ""17"")) & HASNOT(orders) -) -result = TPCH( - avg_balance=AVG(selected_customers.WHERE(acctbal > 0.0).acctbal) -).PARTITION( - selected_customers.WHERE(acctbal > BACK(1).avg_balance), - name=""custs"", - by=cntry_code, -)( - CNTRY_CODE=cntry_code, - NUM_CUSTS=COUNT(custs), - TOTACCTBAL=SUM(custs.acctbal), -)",,sqlite,N -q023,TPCH,"What is the percentage of orders that include products from multiple suppliers?",Y,"orders_with_multiple_suppliers = orders.WHERE(NDISTINCT(lines.supplier.name) > 1) -result = TPCH(percentage=100.0*COUNT(orders_with_multiple_suppliers) / COUNT(orders))",,sqlite,N -q024,TPCH,"What are the top 10 most purchased products by north american customers?",Y,"selected_lines = lines.WHERE(order.customer.nation.region.name == ""AMERICA"")",,sqlite,N -q025,TPCH,"Which 10 customers ordered the most products with turquoise in the name, by quantity, in the year 1995? 
Include their name and the total quantity.",Y,"selected_year = 1995 -selected_orders = orders.WHERE(YEAR(order_date) == selected_year) -selected_lines = selected_orders.lines.WHERE( - CONTAINS(part.name, ""turquoise"") -) -result = customers( - name, - total_quantity=SUM(selected_lines.quantity) -).TOP_K(10, by=total_quantity.DESC())",,sqlite,N -q026,TPCH,"Which products are at least 1% of all income that their producing supplier makes? Income for a supplier is the extended price of all line items that they supply, after accounting for any discounts. List the product name, supplier name, and percentage of the supplier's income that the product name accounts for.",Y,"line_revenue = extended_price * (1 - discount) -lines_income = SUM(lines(revenue=line_revenue).revenue) -result = suppliers( - total_income=lines_income -).supply_records( - supplier_name=BACK(1).name, - part_name=part.name, - income_percentage=100.0*line_income/BACK(1).lines_income -).WHERE(income_percentage >= 1.0)",,sqlite,N -q027,TPCH,"Find, for every nation, the name of the nation, the name of its region, and the number of customers in the nation.",Y,"result = nations( - nation_name=name, - region_name=region.name, - num_customers=COUNT(customers) -)",,sqlite,N -q028,TPCH,"How many customers made repeated purchases of the same product within a six-month period?",Y,"six_month_repeat_purchases = orders.WHERE( - (order_date > BACK(2).order_date) & - (order_date < DATE(BACK(2).order_date, '6 months')) -).lines.WHERE(part_key == BACK(4).part_key) -selected_customers = lines.order.customer.WHERE(HAS(six_month_repeat_purchases)) -result = TPCH( - customer_count = NDISTINCT(selected_customers.cust_key) -)",,sqlite,N -q029,TPCH,"What is the total revenue generated by each customer in 1994?",Y,"revenue_from_lines = extended_price * (1 - discount) -start_date = datetime.date(1994, 1, 1) -end_date = datetime.date(1994, 12, 31) -selected_lines = lines.WHERE( - (order.order_date >= start_date) & 
(order.order_date < end_date) -)(revenue = revenue_from_lines) -customer_revenue = customers(name, total_revenue=sum(selected_lines.revenue))",,sqlite,N -q030,TPCH,"For each region, find the suppliers in the top 0.1% by number of parts they supply, breaking ties alphabetically by name.",Y,"pct = PERCENTILE( - by=(COUNT(supply_records).ASC(), name.ASC()), levels=2, n_buckets=1000 - ) -result = regions.nations.suppliers(name).WHERE(HAS(supply_records) & (pct == 1000))",,sqlite,N -q031,TPCH,"For each region, calculate the average percentage of purchases made from suppliers in that region belonging to the most common part type shipped from the supplier region to the customer region, averaging across all customer regions. Only consider line items from June of 1992 where the container is small.",Y,"line_info = ( - parts.WHERE( - STARTSWITH(container, ""SM""), - ) - .lines.WHERE((MONTH(ship_date) == 6) & (YEAR(ship_date) == 1992))( - supp_region=supplier.nation.region.name, - ) - .order.WHERE(YEAR(order_date) == 1992)( - supp_region=BACK(1).supp_region, - part_type=BACK(2).part_type, - cust_region=customer.nation.region.name, - ) -) -rrt_combos = PARTITION( - line_info, name=""lines"", by=(supp_region, cust_region, part_type) -)(n_instances=COUNT(lines)) -rr_combos = PARTITION(rrt_combos, name=""part_types"", by=(supp_region, cust_region))( - percentage=100.0 * MAX(part_types.n_instances) / SUM(part_types.n_instances) -) -result = PARTITION( - rr_combos, - name=""cust_regions"", - by=supp_region, -)(supp_region, avg_percentage=AVG(cust_regions.percentage)).ORDER_BY( - supp_region.ASC() -)",,sqlite,N -q032,TPCH,"Find all customers whose name ends with a zero and are in the 30-lowest account balances",Y,"result = output = customers(name).WHERE(ENDSWITH(name, ""0"") & (RANKING(by=acctbal.DESC() <= 30)))",,sqlite,N -q033,TPCH,"For each of the 5 largest part sizes, find the part of that size with the largest retail price",Y,"result = PARTITION(parts, name=""p"", by=size 
-).TOP_K(5, by=size.DESC() -).p(size, name -).WHERE(RANKING(by=retail_price.DESC(), levels=1) == 1)",,sqlite,N -q034,TPCH,"What 3 product brands saw the largest positive change in quantity sold from 1995 to 1996.",Y,"selected_lines = lines.WHERE(ISIN(YEAR(order.order_date), (1995, 1996))) -line_info = selected_lines(quant=IFF(YEAR(order.order_date) == 1996, 1, -1)) -result = PARTITION(parts, name=""p"", by=brand)( - brand, - change_95_to_96=SUM(line_info.quant) -).WHERE( - change_95_to_96 > 0 -).TOP_K(5, by=change_95_to_96.DESC())",,sqlite,N -q035,Broker,"Who are the top 5 customers by total transaction amount? Return their name and total amount.",Y,"result = Customers(name, total_amount=SUM(transactions_made.amount)).TOP_K( - 5, by=total_amount.DESC() -)","WITH cust_tx AS ( - SELECT c.sbCustId, c.sbCustName, SUM(t.sbTxAmount) AS total_amount - FROM sbCustomer AS c - JOIN sbTransaction AS t ON c.sbCustId = t.sbTxCustId - GROUP BY c.sbCustId, c.sbCustName) -SELECT sbCustName, total_amount -FROM cust_tx -ORDER BY CASE WHEN total_amount IS NULL THEN 1 ELSE 0 END DESC, total_amount DESC -LIMIT 5",sqlite,Y diff --git a/training/scrap.py b/training/scrap.py deleted file mode 100644 index 02fbb868..00000000 --- a/training/scrap.py +++ /dev/null @@ -1,42 +0,0 @@ -""" -Sandbox file for testing LLM training/interactions with the sample PyDough data. -""" - -import json -import os - -import pandas as pd - - -def get_graphs() -> dict[str, dict]: - """ - Returns a mapping of each graph name within any graph file in the graphs - directory to the raw JSON metadata for that graph. - """ - result: dict[str, dict] = {} - - # Loop over every json file in the graphs folder - for file_name in os.listdir(f"{os.path.dirname(__file__)}/graphs"): - if file_name.endswith(".json"): - # Load the JSON, then dump every top-level key-value pair into - # the result. 
- fpath: str = f"{os.path.dirname(__file__)}/graphs/{file_name}" - with open(fpath) as f: - result.update(json.load(f)) - - return result - - -def run(training_data: pd.DataFrame, graphs_json: dict[str, dict]): - """ - TODO: implement logic using the training data & the available graphs - """ - pass - - -if __name__ == "__main__": - training_data: pd.DataFrame = pd.read_csv( - f"{os.path.dirname(__file__)}/pydough_corpus.csv" - ) - graphs_json: dict[str, dict] = get_graphs() - run(training_data, graphs_json) From e7a507b9e60d928a8cc8f9fb90c94435ddcab5a0 Mon Sep 17 00:00:00 2001 From: Rohit Date: Wed, 12 Feb 2025 11:59:05 -0600 Subject: [PATCH 2/2] [RUN CI]