Skip to content

Commit

Permalink
Merge pull request #1 from Linh-nk/gpt
Browse files Browse the repository at this point in the history
get anomalies with chatgpt
  • Loading branch information
Linh-nk authored Mar 19, 2024
2 parents f122724 + 2375a54 commit 53bdd9c
Show file tree
Hide file tree
Showing 10 changed files with 611 additions and 46 deletions.
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
history = history_file.read()

install_requires = [
'numpy',
'numpy', 'openai', 'pandas','orion', 'matplotlib', 'scikit-learn',
'tiktoken',
]

setup_requires = [
Expand Down
132 changes: 132 additions & 0 deletions sigllm/anomalies.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
# -*- coding: utf-8 -*-

"""
Result post-processing module.
This module contains functions that help convert model responses back to indices and timestamps.
"""
import numpy as np


def str2sig(text, sep=',', decimal=0):
    """Convert a text string to a signal.

    Convert a string containing digits into an array of numbers. Within each
    element every character other than a digit or the decimal point is
    discarded before parsing, so whitespace, brackets and signs are ignored.
    Elements that are empty or cannot be parsed as a number are skipped.

    Args:
        text (str):
            A string containing signal values.
        sep (str):
            String that was used to separate each element in text. Default to `","`.
        decimal (int):
            Number of decimal points to shift each element in text to. Default to `0`.

    Returns:
        numpy.ndarray:
            A 1-dimensional float array containing parsed elements in `text`.
    """
    values = []
    # Split first and clean each element afterwards so that separators longer
    # than one character are honoured as well.
    for token in text.split(sep):
        cleaned = ''.join(ch for ch in token if ch.isdigit() or ch == '.')
        if not cleaned:
            # Empty element, e.g. produced by a trailing separator or ",,".
            continue
        try:
            values.append(float(cleaned))
        except ValueError:
            # Malformed element such as "1.2.3" or a lone ".": skip it instead
            # of dropping everything that follows it.
            continue

    return np.array(values, dtype=float) * 10**(-decimal)


def str2idx(text, len_seq, sep=','):
    """Convert a text string to indices.

    Convert a string containing digits into an array of indices. Within each
    element every non-digit character is discarded before parsing; elements
    left empty are skipped, and indices that are not smaller than `len_seq`
    are dropped.

    Args:
        text (str):
            A string containing indices values.
        len_seq (int):
            The length of processed sequence.
        sep (str):
            String that was used to separate each element in text. Default to `","`.

    Returns:
        numpy.ndarray:
            A 1-dimensional integer array containing parsed elements in `text`.
    """
    indices = []
    # Split first and clean each element afterwards so that separators longer
    # than one character are honoured as well.
    for token in text.split(sep):
        digits = ''.join(ch for ch in token if ch.isdigit())
        if not digits:
            # Empty element, e.g. produced by a trailing separator or ",,".
            continue
        try:
            idx = int(digits)
        except ValueError:
            # Digit-like characters that int() cannot interpret.
            continue

        # Compare as a Python int before building the array so that an
        # absurdly large number in the response cannot overflow the dtype.
        if idx < len_seq:
            indices.append(idx)

    return np.array(indices, dtype=int)


def get_anomaly_list_within_seq(res_list, alpha=0.5):
    """Get the final list of anomalous indices of a sequence.

    Choose the anomalous indices of a sequence by majority vote over multiple
    LLM responses. Each response contributes at most one vote per index, even
    if the index is repeated inside that response.

    Args:
        res_list (List[numpy.ndarray]):
            A list of 1-dimensional arrays containing anomalous indices output by the LLM.
        alpha (float):
            Percentage of votes needed for an index to be deemed anomalous. Default: 0.5

    Returns:
        numpy.ndarray:
            A sorted 1-dimensional array containing the final anomalous indices.
            Empty if `res_list` is empty.
    """
    # np.concatenate cannot handle an empty list; no responses means no votes.
    if len(res_list) == 0:
        return np.array([], dtype=int)

    min_vote = np.ceil(alpha * len(res_list))

    # De-duplicate every response so a repeated index counts as one vote.
    votes = np.concatenate([np.unique(res) for res in res_list])

    unique_elements, counts = np.unique(votes, return_counts=True)

    return unique_elements[counts >= min_vote]


def merge_anomaly_seq(anomalies, start_indices, window_size, step_size, beta=0.5):
    """Get the final list of anomalous indices when merging all rolling windows.

    The window-relative indices are shifted by the first index of their window
    and an index is kept when enough windows flagged it. Each window
    contributes at most one vote per index.

    The vote threshold is `ceil(beta * window_size / step_size)` for every
    index, regardless of how many windows actually contain that index.

    Args:
        anomalies (List[numpy.ndarray]):
            A list of 1-dimensional arrays containing anomalous indices of each window.
        start_indices (numpy.ndarray):
            A 1-dimensional array containing the first index of each window.
        window_size (int):
            Length of each window.
        step_size (int):
            Indicating the number of steps the window moves forward each round.
        beta (float):
            Percentage of containing windows needed for index to be deemed anomalous.
            Default: 0.5

    Returns:
        numpy.ndarray:
            A sorted 1-dimensional array containing the final anomalous indices.
            Empty if no windows are given.
    """
    # Shift to sequence-level indices; de-duplicate so that a repeated index
    # inside one window counts as a single vote.
    shifted = [
        np.unique(np.asarray(arr) + first_idx)
        for arr, first_idx in zip(anomalies, start_indices)
    ]

    # np.concatenate cannot handle an empty list; no windows means no votes.
    if not shifted:
        return np.array([], dtype=int)

    min_vote = np.ceil(beta * window_size / step_size)

    # np.unique already returns its values sorted, so no extra sort is needed.
    unique_elements, counts = np.unique(np.concatenate(shifted), return_counts=True)

    return unique_elements[counts >= min_vote]


def idx2time(sequence, idx_list):
    """Map positional indices to the timestamps stored in a signal.

    Args:
        sequence (pandas.DataFrame):
            Signal with a `timestamp` column and values.
        idx_list (numpy.ndarray):
            A 1-dimensional array of positional indices into `sequence`.

    Returns:
        numpy.ndarray:
            A 1-dimensional array with the timestamp of every requested index.
    """
    # Positional row selection first, then pick the timestamp column.
    selected_rows = sequence.iloc[idx_list]
    timestamps = selected_rows.timestamp
    return timestamps.to_numpy()
72 changes: 45 additions & 27 deletions sigllm/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,28 +3,63 @@
"""
Data preprocessing module.
This module contains functions to help parse time series into
text, preparing it for a language model.
This module contains functions that prepare timeseries for a language model.
"""

import numpy as np


def sig2str(values, sep=',', space=False, decimal=0):
def rolling_window_sequences(X, index, window_size, step_size):
    """Create rolling window sequences out of time series data.

    The function creates an array of sequences by rolling over the input sequence.
    Only complete windows are produced; if `X` is shorter than `window_size`
    both returned arrays are empty.

    Args:
        X (ndarray):
            The sequence to iterate over.
        index (ndarray):
            Array containing the index values of X.
        window_size (int):
            Length of window.
        step_size (int):
            Indicating the number of steps to move the window forward each round.

    Returns:
        ndarray, ndarray:
            * rolling window sequences.
            * first index value of each input sequence.

    Raises:
        ValueError:
            If `window_size` or `step_size` is smaller than 1.
    """
    # A non-positive step would never advance the window (endless loop).
    if step_size < 1:
        raise ValueError(f"step_size must be a positive integer, got {step_size}")
    if window_size < 1:
        raise ValueError(f"window_size must be a positive integer, got {window_size}")

    # Last admissible start is the one whose window still fits inside X.
    max_start = len(X) - window_size + 1
    starts = range(0, max_start, step_size)

    out_X = [X[start:start + window_size] for start in starts]
    X_index = [index[start] for start in starts]

    return np.asarray(out_X), np.asarray(X_index)


def sig2str(values, sep=',', space=False, decimal=0, rescale=True):
"""Convert a signal to a string.
Convert a 1-dimensional time series into text by casting it
to integer values then into a string.
Convert a 1-dimensional time series into text by casting and rescaling it
to nonnegative integer values then into a string (optional).
Args:
values (numpy.ndarray):
A sequence of signal values.
sep (str):
String to separate each element in values, Default to `","`.
String to separate each element in values. Default to `","`.
space (bool):
Whether to add space between each digit in the result. Default to `False`.
decimal (int):
Number of decimal points to keep from the float representation. Default to `0`.
rescale(bool):
Whether to rescale the time series. Default to `True`
Returns:
str:
Expand All @@ -35,29 +70,12 @@ def sig2str(values, sep=',', space=False, decimal=0):

sequence = sign * (values * 10**decimal).astype(int)

# Rescale all elements to be nonnegative
if rescale:
sequence = sequence - min(sequence)

res = sep.join([str(num) for num in sequence])
if space:
res = ' '.join(res)

return res


def str2sig(text, sep=',', decimal=0):
    """Parse a separated string of numbers back into a signal.

    Args:
        text (str):
            A string containing signal values.
        sep (str):
            String that was used to separate each element in text. Default to `","`.
        decimal (int):
            Number of decimal points to shift each element in text to. Default to `0`.

    Returns:
        numpy.ndarray:
            A 1-dimensional array containing parsed elements in `text`.
    """
    parsed = np.fromstring(text, dtype=float, sep=sep)
    # Undo the power-of-ten scaling that was applied when the text was built.
    scale = 10**(-decimal)
    return parsed * scale
61 changes: 61 additions & 0 deletions sigllm/gpt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# -*- coding: utf-8 -*-

"""
GPT model module.
This module contains functions that are specifically used for GPT models
"""
import os

from openai import OpenAI


def load_system_prompt(file_path):
    """Read a system prompt template from disk.

    Args:
        file_path (str):
            Path to the text file containing the system prompt.

    Returns:
        str:
            The full content of the file.
    """
    # Decode explicitly as UTF-8 so the prompt does not depend on the
    # platform's default encoding.
    with open(file_path, encoding='utf-8') as f:
        return f.read()


# Directory that contains this module; used to locate the prompt templates.
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))

# File names of the system prompt templates.
ZERO_SHOT_FILE = 'gpt_system_prompt_zero_shot.txt'
ONE_SHOT_FILE = 'gpt_system_prompt_one_shot.txt'

# Full paths to the template files (despite the *_DIR names these point to
# files, not directories). They live in a "template" folder one level above
# this module's directory.
ZERO_SHOT_DIR = os.path.join(CURRENT_DIR, "..", "template", ZERO_SHOT_FILE)
ONE_SHOT_DIR = os.path.join(CURRENT_DIR, "..", "template", ONE_SHOT_FILE)


# Default chat model; other names left by the author: "gpt-4-0125-preview",
# "gpt-3.5-turbo".
GPT_model = "gpt-4"
# Shared API client, created when this module is imported.
# NOTE(review): presumably this needs OpenAI credentials to be configured in
# the environment at import time -- confirm against the openai client docs.
client = OpenAI()


def get_gpt_model_response(message, gpt_model=GPT_model):
    """Query the chat completion endpoint and return the reply text.

    The request is sent through the module-level `client`.

    Args:
        message (list):
            Chat messages to send, as built by the `create_message_*` helpers.
        gpt_model (str):
            Name of the model to query. Default to `GPT_model`.

    Returns:
        The `content` of the message of the first returned choice.
    """
    request = {"model": gpt_model, "messages": message}
    completion = client.chat.completions.create(**request)
    first_choice = completion.choices[0]
    return first_choice.message.content


def create_message_zero_shot(seq_query, system_prompt_file=ZERO_SHOT_DIR):
    """Build the chat messages for a zero-shot query.

    Args:
        seq_query (str):
            Text representation of the sequence to analyse.
        system_prompt_file (str):
            Path to the system prompt file. Default to `ZERO_SHOT_DIR`.

    Returns:
        list:
            Two messages: the system prompt followed by the user query.
    """
    system_msg = {"role": "system", "content": load_system_prompt(system_prompt_file)}
    # final prompt
    query_msg = {"role": "user", "content": f"Sequence: {seq_query}"}
    return [system_msg, query_msg]


def create_message_one_shot(seq_query, seq_ex, ano_idx_ex, system_prompt_file=ONE_SHOT_DIR):
    """Build the chat messages for a one-shot query.

    Args:
        seq_query (str):
            Text representation of the sequence to analyse.
        seq_ex (str):
            Text representation of the example sequence.
        ano_idx_ex:
            Expected answer for the example sequence, used as the assistant reply.
        system_prompt_file (str):
            Path to the system prompt file. Default to `ONE_SHOT_DIR`.

    Returns:
        list:
            Four messages: system prompt, example query, example answer and
            the final user query.
    """
    system_msg = {"role": "system", "content": load_system_prompt(system_prompt_file)}

    # one shot
    example_query = {"role": "user", "content": f"Sequence: {seq_ex}"}
    example_answer = {"role": "assistant", "content": ano_idx_ex}

    # final prompt
    query_msg = {"role": "user", "content": f"Sequence: {seq_query}"}

    return [system_msg, example_query, example_answer, query_msg]
39 changes: 38 additions & 1 deletion sigllm/sigllm.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,40 @@
# -*- coding: utf-8 -*-

"""Main module."""
"""
Main module.
This module contains functions that get LLM's anomaly detection results.
"""
from anomalies import get_anomaly_list_within_seq, str2idx
from data import sig2str


def get_anomalies(seq, msg_func, model_func, num_iters=1, alpha=0.5):
    """Get LLM anomaly detection results.

    The sequence is converted to text once, the same prompt is sent to the
    model `num_iters` times, and the parsed answers are combined by voting
    into a 1D array.

    Args:
        seq (ndarray):
            The sequence to detect anomalies.
        msg_func (func):
            Function to create message prompt.
        model_func (func):
            Function to get LLM answer.
        num_iters (int):
            Number of times to run the same query.
        alpha (float):
            Percentage of total number of votes that an index needs to have to be
            considered anomalous. Default: 0.5

    Returns:
        ndarray:
            1D array containing anomalous indices of the sequence.
    """
    prompt = msg_func(sig2str(seq, space=True))

    # One parsed index array per model call.
    responses = [str2idx(model_func(prompt), len(seq)) for _ in range(num_iters)]

    return get_anomaly_list_within_seq(responses, alpha=alpha)
Loading

0 comments on commit 53bdd9c

Please sign in to comment.