[backend][app] Always send generic email result as best GPT on the transcript, other workflows yolo #2
Changes from all commits
First changed file (transcript / GPT extraction logic):
@@ -7,7 +7,6 @@
     DEFAULT_MODEL,
     OpenAiClient,
     gpt_response_to_json,
-    num_tokens_from_string,
 )

 # Min transcript size somewhat trims down on "hallucinations"
@@ -17,6 +16,13 @@
 MAX_TRANSCRIPT_TOKEN_COUNT = 2500  # words


+# https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
+def poor_mans_token_counter(text: str) -> int:
+    by_character = len(text) / 4
+    by_words = 3 * len(text.split()) // 4
+    return int(by_character + by_words) // 2
+
+
 # TODO(P1, devx): Historically, this query give me most of the headaches.
 # * GPT-4 suggests using Named Entity Recognition (NER) - with nodes and edges.
 # * If it remains a problem - maybe just do it one-by-one, screw token cost.

Review comment on poor_mans_token_counter:
🐛 Bug: The function uses a heuristic to estimate the number of tokens, but the logic could be more clearly documented or explained. Consider renaming the variables. Additionally, the calculation could either be simplified with a more straightforward approach or the rationale behind the chosen formula explained; that would help in understanding why dividing by 4 and multiplying by 3/4 is used.

Reply: There is a link explaining this formula.

Reply: The comment is too long, I didn't read it.
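For context, poor_mans_token_counter averages two rough estimates: one derived from the character count (about 4 characters per token) and one derived from the word count. Below is a minimal sketch of how the heuristic could be sanity-checked against an exact tokenizer; tiktoken and the cl100k_base encoding are assumptions, not dependencies introduced by this diff, and the function is repeated so the snippet is self-contained.

import tiktoken  # assumed available; not part of this PR


def poor_mans_token_counter(text: str) -> int:
    by_character = len(text) / 4             # rough estimate: ~4 characters per token
    by_words = 3 * len(text.split()) // 4    # rough estimate: word count scaled by 3/4
    return int(by_character + by_words) // 2  # average of the two estimates


if __name__ == "__main__":
    sample = "This is a voice note from a meeting where I talked to a few people. " * 10
    exact = len(tiktoken.get_encoding("cl100k_base").encode(sample))
    rough = poor_mans_token_counter(sample)
    print(f"exact={exact} heuristic={rough}")  # the two counts should land in the same ballpark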
@@ -25,7 +31,7 @@ def extract_everyone_i_have_talked_to(
     gpt_client: OpenAiClient, full_transcript: str
 ) -> List:
     # NOTE: We shorten the string by words cause easier, but we better estimate the token count by OpenAI counter.
-    token_count = num_tokens_from_string(full_transcript)
+    token_count = poor_mans_token_counter(full_transcript)
     print(f"Transcript has {token_count} words and {len(full_transcript)} characters")

     # This can happen for either super-short, or silent uploads
@@ -47,14 +53,15 @@ def extract_everyone_i_have_talked_to(
     # https://openai.com/blog/function-calling-and-other-api-updates
     # TODO(P0, ux): Still often-times it treats "Katka" and "Katka Sabo" as different people.
     query_people = """
-    This is a voice note from a meeting or event where I talked to one or multiple people.
+    This is a transcribed voice note.
     List everybody I have directly talked to, omit mentions of other people in our conversation.
     Output a valid json list of strings of the people I have directly talked to
     - sometimes I don't recall their names so use a short description.
     Voice transcript of our meeting: {}
     """.format(
         full_transcript
     )
+    # TODO(ux): Maybe worth using GPT4-32k here, also I thought I have changed these?
     raw_response = gpt_client.run_prompt(query_people)
     if raw_response is None:
         print("WARNING: Likely no people found in the input transcript")
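Since the prompt asks for "a valid json list of strings", the caller has to tolerate malformed or prose-wrapped model output. The repository appears to use gpt_response_to_json (imported above) for this, and its exact behavior is not shown in this diff; the following is only a hypothetical, minimal sketch of such defensive parsing.

import json
from typing import List


def parse_people_list(raw_response: str) -> List[str]:
    # Hypothetical helper: extract the outermost JSON list even if the model
    # wrapped it in prose or a code fence, and fall back to an empty list.
    if not raw_response:
        return []
    start, end = raw_response.find("["), raw_response.rfind("]")
    if start == -1 or end <= start:
        return []
    try:
        parsed = json.loads(raw_response[start : end + 1])
    except json.JSONDecodeError:
        return []
    return [str(item).strip() for item in parsed if str(item).strip()]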
@@ -260,7 +267,7 @@ def run_executive_assistant_to_get_drafts(
             f"WARNING: full_transcript length too short {MIN_FULL_TRANSCRIPT_CHAR_LENGTH}"
         )

-    token_count = num_tokens_from_string(full_transcript)
+    token_count = poor_mans_token_counter(full_transcript)
     print(f"extract_context_per_person on raw_transcript of {token_count} token count")

     people = extract_everyone_i_have_talked_to(gpt_client, full_transcript)
Second changed file (email sending logic):
@@ -179,7 +179,7 @@ def create_raw_email_with_attachments(params: EmailLog):
     <head></head>
     <body>
     """
-        + params.body_text
+        + (params.body_text.replace("\n", "<br />") if params.body_text else "")
         + """
     </body>
     </html>

Reply: I have accepted your suggestion to use the above instead of just params.body_text.
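The new expression covers two separate concerns: params.body_text can be None, and literal newlines collapse when rendered as HTML. A sketch of the same idea factored into a helper; the helper name is hypothetical, the diff keeps the expression inline.

from typing import Optional


def body_text_to_html(body_text: Optional[str]) -> str:
    # HTML collapses literal newlines, so convert them to <br /> tags;
    # return "" so the surrounding string concatenation stays safe when body_text is None.
    if not body_text:
        return ""
    return body_text.replace("\n", "<br />")

A fuller version might also run the text through html.escape before inserting it into markup.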
@@ -237,6 +237,7 @@ def send_email(params: EmailLog) -> bool:

     if not is_running_in_aws() or str(SKIP_SENDING_EMAILS) == "1":
         # TODO(P2, testing): Ideally we should also test the translation from params to raw email.
+        # TODO(P1, devx): How this can be printed "an contents None"? If params is None, it should fail earlier.
         print(
             f"Skipping ses.send_raw_email cause NOT in AWS or SKIP_SENDING_EMAILS={SKIP_SENDING_EMAILS} "
             f"Dumping the email {params.idempotency_id} contents {params}"
@@ -445,7 +446,7 @@ def _form_data_to_email_table_html(form_data: FormData) -> str:
     for field in form_data.form.fields:
         if field.ignore_in_display or field.ignore_in_email:
             print(
-                f"INFO: ignoring {field.name} for emails (ignore_id_display: {field.ignore_in_display}"
+                f"INFO: ignoring {field.name} for emails (ignore_id_display: {field.ignore_in_display} "
                 f"ignore_in_email: {field.ignore_in_email}"
             )
             continue
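The fix above only adds a trailing space, but it matters: adjacent f-string literals are concatenated at compile time, so without the space the two log fragments run together. A standalone illustration:

ignore_in_display, ignore_in_email = True, False

# Adjacent string literals are joined into one string, so a missing trailing
# space glues the two fragments together in the log line.
without_space = (
    f"INFO: ignoring field for emails (ignore_id_display: {ignore_in_display}"
    f"ignore_in_email: {ignore_in_email}"
)
with_space = (
    f"INFO: ignoring field for emails (ignore_id_display: {ignore_in_display} "
    f"ignore_in_email: {ignore_in_email}"
)
print(without_space)  # ... ignore_id_display: Trueignore_in_email: False
print(with_space)     # ... ignore_id_display: True ignore_in_email: False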
@@ -457,7 +458,7 @@ def _form_data_to_email_table_html(form_data: FormData) -> str:
     return "\n".join(rows)


-def _craft_result_email_body(
+def _craft_networking_person_result_email_body(
     person: PersonDataEntry, shareable_link: Optional[str]
 ) -> (str, str):
     # TODO(P1, ux): Migrate to new email template
@@ -522,12 +523,25 @@ def _craft_result_email_body(
     return subject_prefix, res_content_html


-def send_result(
+def send_generic_result(
+    account_id: UUID, idempotency_id: str, email_subject: str, email_body: str
+) -> bool:
+    email_params = EmailLog.get_email_reply_params_for_account_id(
+        account_id=account_id,
+        idempotency_id=idempotency_id,
+        subject=email_subject,
+    )
+    email_params.body_text = email_body
+
+    return send_email(params=email_params)
+
+
+def send_networking_per_person_result(
     account_id: UUID, idempotency_id_prefix: str, person: PersonDataEntry
 ) -> bool:
     person_name_safe = re.sub(r"\W", "-", person.name).lower()
     acc: Account = Account.get_by_id(account_id)
-    subject_prefix, content_html = _craft_result_email_body(
+    subject_prefix, content_html = _craft_networking_person_result_email_body(
         person, shareable_link=acc.get_shareable_spreadsheet_link()
     )
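A hypothetical call site for the new generic entry point. The UUID, idempotency id, subject, and body values below are made up for illustration, and the import of send_generic_result from its module is omitted because the file path is not shown in this diff.

from uuid import UUID

# All values here are placeholders; real callers pass them in from the workflow context.
sent = send_generic_result(
    account_id=UUID("00000000-0000-0000-0000-000000000000"),
    idempotency_id="voice-note-123-generic-result",
    email_subject="Your results are ready",
    email_body="Hi,\nhere is the summary of your voice note.",
)
if not sent:
    print("WARNING: generic result email was not sent")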
Third changed file (spreadsheet export logic):
@@ -228,6 +228,7 @@ def add_form_datas_to_spreadsheet(self, form_datas: List[FormData]):
         sheet_cache = {}

         for form_data in form_datas:
+            # TODO: AttributeError: 'str' object has no attribute 'value'
             form_name = form_data.form.form_name.value
             if form_name not in sheet_cache:
                 sheet_cache[form_name] = get_or_create_worksheet(

Review comment: 🐛 Bug, with a suggested change attached for the form_name.value access.

Reply: Correct, done in a follow-up diff.
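The TODO added above hints at the underlying issue: form_name sometimes arrives as a plain string rather than an enum member, so the .value access raises AttributeError. A minimal sketch of a defensive accessor follows; this helper is an assumption, and the actual fix is said to land in a follow-up diff.

from enum import Enum
from typing import Union


def form_name_as_str(form_name: Union[str, Enum]) -> str:
    # Hypothetical helper: tolerate form_name arriving either as an Enum member
    # (where .value holds the string) or as an already-plain string.
    return form_name.value if isinstance(form_name, Enum) else form_name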
Review comment: 🐛 Bug: The variable GPTo_MODEL used in the gpt_client.run_prompt call on line 377 seems to be a typo or undefined variable. It should likely be BEST_MODEL or another defined model variable. This will lead to a runtime error if GPTo_MODEL is not defined elsewhere in the code.

Reply: Wrong, this is defined on line 62.