-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgen-vectors.py
57 lines (45 loc) · 1.95 KB
/
gen-vectors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
from decouple import config
from datetime import datetime
import time
import pandas as pd
import os
import openai
from ratelimit import limits, sleep_and_retry
# Timestamp taken once at start-up; it is formatted into the names of the
# history/combined CSV files written further down.
now = datetime.now()

# Number of rows sliced off the input frame and embedded per batch.
batch_size = 600

# The OpenAI key is looked up under the `API_OPENAI` setting via decouple.
openai.api_key = str(config('API_OPENAI'))
@sleep_and_retry
@limits(calls=60, period=60)
def create_embedding(input_text, max_retries=30):
    """Return the embedding vector for *input_text* from the OpenAI API.

    The call is attempted up to ``max_retries`` times, pausing one second
    between attempts. Every failure is printed so progress stays visible
    during long runs.

    The ``@limits`` / ``@sleep_and_retry`` decorators throttle *invocations
    of this function* (60 per 60-second period). Retries performed inside
    the loop below are not counted by that limiter.

    Args:
        input_text: Text to embed; passed to the API as ``input``.
        max_retries: Maximum number of API attempts before giving up.

    Returns:
        The ``['data'][0]['embedding']`` entry of the API response.

    Raises:
        Exception: If no attempt succeeded. The last underlying error is
            attached as ``__cause__`` so the real failure is not lost.
    """
    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            response = openai.Embedding.create(
                input=input_text,
                engine='text-embedding-ada-002',
            )
            return response['data'][0]['embedding']
        except Exception as e:  # deliberately broad: any failure is retried
            last_error = e
            print(f"Error calling OpenAI embeddings API: {e}")
            # Only wait when another attempt will follow; sleeping right
            # before raising just delays the error.
            if attempt < max_retries:
                time.sleep(1)
    raise Exception(
        "Failed to call OpenAI embeddings API after multiple retries."
    ) from last_error
# Load the rows to embed. The first CSV column becomes the index and the
# remaining columns are renamed to `text` and `tokens`.
df = pd.read_csv('processed/short.csv', index_col=0)
df.columns = ['text', 'tokens']
print(df.head())

# DataFrame.to_csv does not create missing parent directories. Create both
# output locations up front so a missing folder cannot fail the run after
# embeddings for a batch have already been requested.
os.makedirs('processed/embed', exist_ok=True)
os.makedirs('processed/embed history', exist_ok=True)

# The run timestamp is identical for every batch, so format it once.
# NOTE(review): ':' in file names is not portable to Windows file systems;
# the names are kept unchanged because the combine step relies on them.
run_stamp = now.strftime("%m-%d-%Y %H:%M:%S")

for i in range(0, len(df), batch_size):
    # Work on a copy so adding the column does not touch `df`.
    batch = df.iloc[i:i + batch_size].copy()
    batch['embeddings'] = batch['text'].apply(create_embedding)

    # Latest result for this batch offset (overwritten on each run).
    batch.to_csv(f'processed/embed/embeddings-batch:{i}.csv')
    # Per-run history copy, tagged with row count and run timestamp.
    batch.to_csv(
        f'processed/embed history/embeddings-batch:{i}'
        f';rows:{len(batch.index)};time:{run_stamp}.csv'
    )
    print(batch.head())
csv_dir = "processed/embed"

# Only the per-batch files are inputs. The combined outputs written at the
# bottom of this step live in the same directory, so matching every '*.csv'
# would feed earlier results back in and duplicate rows on each re-run.
batch_prefix = 'embeddings-batch:'
batch_suffix = '.csv'
batch_names = [
    name for name in os.listdir(csv_dir)
    if name.startswith(batch_prefix)
    and name.endswith(batch_suffix)
    and name[len(batch_prefix):-len(batch_suffix)].isdigit()
]
# os.listdir returns entries in arbitrary order; sort by the numeric batch
# offset so the combined rows follow the original input order.
batch_names.sort(key=lambda name: int(name[len(batch_prefix):-len(batch_suffix)]))
csv_files = [os.path.join(csv_dir, name) for name in batch_names]

# Read each batch file into a dataframe.
# NOTE(review): the files are read without `index_col`, so the saved index
# shows up as an extra unnamed column; kept as-is to preserve the output
# schema for existing consumers.
dfs = [pd.read_csv(csv_file) for csv_file in csv_files]

# Concatenate all the dataframes into a single dataframe.
combined_df = pd.concat(dfs)

# Save the combined dataframe: once under a fixed name, once timestamped.
combined_df.to_csv("processed/embed/embed-comb.csv")
combined_df.to_csv("processed/embed/embed" + ";time:" + now.strftime("%m-%d-%Y %H:%M:%S") + '.csv')