Skip to content

Commit

Permalink
Generation of LLM-friendly summary of the analysis
Browse files Browse the repository at this point in the history
  • Loading branch information
EgorKraevTransferwise committed Jan 8, 2025
1 parent 08d7dca commit 69c6964
Show file tree
Hide file tree
Showing 4 changed files with 758 additions and 357 deletions.
163 changes: 6 additions & 157 deletions notebooks/Finding interesting segments in time series.ipynb

Large diffs are not rendered by default.

863 changes: 664 additions & 199 deletions notebooks/Finding interesting segments.ipynb

Large diffs are not rendered by default.

16 changes: 16 additions & 0 deletions wise_pizza/explain.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def explain_changes_in_average(
dims: List[str],
total_name: str,
size_name: str,
average_name: Optional[str] = None,
min_segments: Optional[int] = None,
max_segments: int = None,
min_depth: int = 1,
Expand Down Expand Up @@ -124,6 +125,9 @@ def explain_changes_in_average(

# And might want to relabel some plots?
sf.task = "changes in average"
sf.size_name = size_name
sf.total_name = total_name
sf.average_name = average_name
return sf


Expand All @@ -133,6 +137,7 @@ def explain_changes_in_totals(
dims: List[str],
total_name: str,
size_name: str,
average_name: Optional[str] = None,
min_segments: Optional[int] = None,
max_segments: int = None,
min_depth: int = 1,
Expand Down Expand Up @@ -274,6 +279,9 @@ def explain_changes_in_totals(
return_fig=return_fig,
)
sf.task = "changes in totals"
sf.size_name = size_name
sf.total_name = total_name
sf.average_name = average_name
return sf


Expand All @@ -282,6 +290,7 @@ def explain_levels(
dims: List[str],
total_name: str,
size_name: Optional[str] = None,
average_name: Optional[str] = None,
min_segments: int = None,
max_segments: int = None,
min_depth: int = 1,
Expand Down Expand Up @@ -358,6 +367,9 @@ def explain_levels(
cluster_value_width=cluster_value_width,
)
sf.task = "levels"
sf.size_name = size_name
sf.total_name = total_name
sf.average_name = average_name
return sf


Expand All @@ -367,6 +379,7 @@ def explain_timeseries(
total_name: str,
time_name: str,
size_name: Optional[str] = None,
average_name: Optional[str] = None,
num_segments: int = None,
max_depth: int = 2,
solver: str = "tree",
Expand Down Expand Up @@ -557,4 +570,7 @@ def explain_timeseries(
average_name=average_name,
)
sf.task = "time"
sf.size_name = size_name
sf.total_name = total_name
sf.average_name = average_name
return sf
73 changes: 72 additions & 1 deletion wise_pizza/slicer.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,8 @@ def _init_mat(
)

# do pre-filter recursively
for this_X, these_col_defs in basis_iter:
for i in basis_iter:
this_X, these_col_defs = i
if this_X is not None:
X_out, col_defs_out = sel(this_X, these_col_defs)

Expand Down Expand Up @@ -535,6 +536,40 @@ def predict(
out = SliceFinderPredictFacade(self, new_dim_df, segments)
return out

@property
def nice_summary(self, average_name: Optional[str] = None):
    """Return the analysis summary as a relabelled table plus cluster definitions.

    Passes ``self.summary()`` together with the stored display names to the
    module-level ``nice_summary`` function (inside a method body that name
    resolves to the module global, not to this property).

    Args:
        average_name: Optional override for the label of the average column.
            Plain attribute access (``obj.nice_summary``) cannot supply it, so
            it defaults to ``self.average_name``; it is only reachable when
            the getter is invoked explicitly.

    Returns:
        Whatever the module-level ``nice_summary`` returns for this summary.
    """
    # ``Optional[str]`` rather than ``str | None``: signature annotations are
    # evaluated at definition time and the rest of the file uses ``Optional``.
    if average_name is None:
        average_name = self.average_name
    return nice_summary(
        self.summary(), self.total_name, self.size_name, average_name
    )

@property
def markdown_summary(self):
    """Render ``self.nice_summary`` as text via the module-level ``markdown_summary``."""
    summary_parts = self.nice_summary
    return markdown_summary(summary_parts)

def descriptive_prompt(
    self, prompt_template: Optional["BasePromptTemplate"] = None
):
    """Build the text of a prompt asking for a brief description of the summary.

    Args:
        prompt_template: Optional object exposing ``format(**kwargs)``. When
            given, it is formatted with ``total_name``, ``size_name``,
            ``average_name`` and ``summary`` (the markdown summary), and its
            result is returned as-is.

    Returns:
        The formatted template, or the built-in default prompt with the
        stored names and the markdown summary interpolated.
    """
    # No custom template: fall back to the built-in prompt text.
    if prompt_template is None:
        return f"""
You are a helpful research assistant. You are given a summary analysis of a dataset,
highlighting the key segments that drove the change in total volume.
The logic behind choosing those segments is the following: a segment's impact equals the segment's size
({self.size_name}) multiplied by the difference between the segment' average ({self.average_name}) and
the average of the whole dataset, that describes the change in the total volume ({self.total_name}) due
to the segment's average deviation from the dataset's average. We look for the segments that have the
larges absolute impact on the total volume.
Please summarize that data BRIEFLY in a few sentences.
Here is the summary:
{self.markdown_summary}"""

    return prompt_template.format(
        total_name=self.total_name,
        size_name=self.size_name,
        average_name=self.average_name,
        summary=self.markdown_summary,
    )


def make_dummy(segment_def: Dict[str, str], dim_df: pd.DataFrame) -> np.ndarray:
"""
Expand All @@ -557,6 +592,42 @@ def make_dummy(segment_def: Dict[str, str], dim_df: pd.DataFrame) -> np.ndarray:
return dummy, Xi


def nice_summary(
    x: str,
    total_name: str,
    size_name: Optional[str] = None,
    average_name: Optional[str] = None,
) -> dict:
    """Turn a JSON summary string into a relabelled segment table.

    Args:
        x: JSON text containing a ``"segments"`` list of records and a
            ``"relevant_clusters"`` entry.
        total_name: Display label for the ``"total"`` column.
        size_name: Display label for the ``"seg_size"`` column; the column
            keeps its original name when this is ``None``.
        average_name: Display label for the ``"naive_avg"`` column; the column
            keeps its original name when this is ``None``.

    Returns:
        A dict with ``"summary"`` (a ``pandas.DataFrame`` whose ``segment``
        column is a cleaned-up string and whose other columns are cast to
        ``int``) and ``"clusters"`` (the ``"relevant_clusters"`` value).
    """
    parsed = json.loads(x)

    # Relabel only the columns for which a display name was supplied:
    # renaming to ``None`` would leave a column literally labelled None.
    labels = {
        raw: label
        for raw, label in (
            ("seg_size", size_name),
            ("total", total_name),
            ("naive_avg", average_name),
        )
        if label is not None
    }
    df = pd.DataFrame(parsed["segments"]).rename(columns=labels)

    # An empty segment list yields a frame without a "segment" column.
    if "segment" in df.columns:
        # Drop the quotes and braces of the dict repr in a single pass.
        strip_chars = str.maketrans("", "", "'{}")
        df["segment"] = df["segment"].apply(
            lambda seg: str(seg).translate(strip_chars)
        )

    # TODO: more flexible formatting
    for col in df.columns:
        if col != "segment":
            df[col] = df[col].astype(int)

    return {"summary": df, "clusters": parsed["relevant_clusters"]}


def markdown_summary(x: dict):
    """Format a summary dict as a markdown table plus optional cluster notes.

    Args:
        x: Mapping with a ``"summary"`` entry exposing
            ``to_markdown(index=False)`` and a ``"clusters"`` entry.

    Returns:
        The table under a "Key segment summary:" heading, followed by the
        cluster definitions when ``x["clusters"]`` is truthy.
    """
    rendered_table = x["summary"].to_markdown(index=False)
    pieces = [
        f"""Key segment summary:
{rendered_table}"""
    ]

    cluster_defs = x["clusters"]
    if cluster_defs:
        pieces.append(f"\n\nDefinitions of clusters: {cluster_defs}")
    return "".join(pieces)


class SlicerPair:
def __init__(self, s1: SliceFinder, s2: SliceFinder):
self.s1 = s1
Expand Down

0 comments on commit 69c6964

Please sign in to comment.