Skip to content

Commit cfedc1f

Browse files
Prepare dataset
1 parent e82676d commit cfedc1f

File tree

4 files changed

+577
-0
lines changed

4 files changed

+577
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"import pandas as pd\n",
10+
"import numpy as np\n",
11+
"\n",
12+
"# Paths to the JSONL fine-tuning data files\n",
13+
"train_data_path_cg = 'finetuning_cg_gpt_v1.jsonl'\n",
14+
"train_data_path_cs = 'finetuning_cs_gpt_v1.jsonl'\n",
15+
"\n",
16+
"# Load the jsonl files\n",
17+
"train_data_cg = pd.read_json(train_data_path_cg, lines=True)\n",
18+
"train_data_cs = pd.read_json(train_data_path_cs, lines=True)\n",
19+
"\n",
20+
"# Rename the 'messages' column to 'text' in both datasets\n",
21+
"train_data_cg.rename(columns={'messages': 'text'}, inplace=True)\n",
22+
"train_data_cs.rename(columns={'messages': 'text'}, inplace=True)"
23+
]
24+
},
25+
{
26+
"cell_type": "code",
27+
"execution_count": 4,
28+
"metadata": {},
29+
"outputs": [],
30+
"source": [
31+
"# Save each dataset as train.csv in its own output directory (created if missing)\n",
32+
"import os\n",
33+
"\n",
34+
"os.makedirs('data_cg', exist_ok=True)\n",
35+
"os.makedirs('data_cs', exist_ok=True)\n",
36+
"\n",
37+
"train_data_cg.to_csv('data_cg/train.csv', index=False)\n",
38+
"train_data_cs.to_csv('data_cs/train.csv', index=False)"
39+
]
40+
},
41+
{
42+
"cell_type": "code",
43+
"execution_count": null,
44+
"metadata": {},
45+
"outputs": [],
46+
"source": []
47+
}
48+
],
49+
"metadata": {
50+
"kernelspec": {
51+
"display_name": ".venv",
52+
"language": "python",
53+
"name": "python3"
54+
},
55+
"language_info": {
56+
"codemirror_mode": {
57+
"name": "ipython",
58+
"version": 3
59+
},
60+
"file_extension": ".py",
61+
"mimetype": "text/x-python",
62+
"name": "python",
63+
"nbconvert_exporter": "python",
64+
"pygments_lexer": "ipython3",
65+
"version": "3.10.14"
66+
}
67+
},
68+
"nbformat": 4,
69+
"nbformat_minor": 2
70+
}

0 commit comments

Comments
 (0)