Q_T2.py
import torch
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
# Load the tokenizer for the pre-trained model
pretrained_model_name = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
# Load the model through AutoGPTQ with a 4-bit quantization config
quantize_config = BaseQuantizeConfig(bits=4)
quantized_model = AutoGPTQForCausalLM.from_pretrained(
    pretrained_model_name,
    quantize_config=quantize_config,
    torch_dtype=torch.float16,
)
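# A possible richer config, assuming auto_gptq's group_size and desc_act
# options (a finite group_size usually improves 4-bit accuracy at a small
# size cost); this is a sketch, not what the script below uses:
# quantize_config = BaseQuantizeConfig(bits=4, group_size=128, desc_act=False)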
# Ensure CUDA is available and set the device
device = "cuda" if torch.cuda.is_available() else "cpu"
# Prepare example data for quantization
example_texts = [
    "Write a bubble sort in C++: #include <iostream>\n"
    "using namespace std;\n"
    "\n"
    "void bubbleSort(int array[], int size) {\n"
    "    for (int step = 0; step < size - 1; ++step) {\n"
    "        bool swapped = false;\n"
    "        for (int i = 0; i < size - step - 1; ++i) {\n"
    "            if (array[i] > array[i + 1]) {\n"
    "                // Swap if the element found is greater than the next element\n"
    "                int temp = array[i];\n"
    "                array[i] = array[i + 1];\n"
    "                array[i + 1] = temp;\n"
    "                swapped = true;\n"
    "            }\n"
    "        }\n"
    "        // If no two elements were swapped by the inner loop, then break\n"
    "        if (!swapped) {\n"
    "            break;\n"
    "        }\n"
    "    }\n"
    "}\n"
    "\n"
    "void printArray(int array[], int size) {\n"
    "    for (int i = 0; i < size; i++) {\n"
    "        cout << array[i] << ' ';\n"
    "    }\n"
    "    cout << endl;\n"
    "}\n"
    "\n"
    "int main() {\n"
    "    int data[] = {-2, 45, 0, 11, -9};\n"
    "    int size = sizeof(data) / sizeof(data[0]);\n"
    "\n"
    "    bubbleSort(data, size);\n"
    "\n"
    "    cout << \"Sorted Array in Ascending Order:\\n\";\n"
    "    printArray(data, size);\n"
    "}\n"
]
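# Note: a single calibration sample is only enough for a demo run; GPTQ
# calibration typically uses on the order of 128 short texts.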
# Tokenize the calibration texts; input ids and attention masks must stay
# integer tensors, so move them to the device without casting the dtype
examples = [tokenizer(text, return_tensors="pt") for text in example_texts]
examples = [{key: tensor.to(device) for key, tensor in example.items()} for example in examples]
# Quantize the model with the calibration examples (a list of dicts holding
# input_ids and attention_mask tensors)
quantized_model.quantize(examples)
# Move the quantized model to the target device and generate an answer
quantized_model.to(device)
task_prompt = "Write a bubble sort in C++"
inputs = tokenizer(task_prompt, return_tensors="pt").to(device)
outputs = quantized_model.generate(**inputs, max_length=500)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)
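# Optional sketch: persist the quantized weights so later runs can skip
# calibration. save_quantized/from_quantized are auto_gptq APIs; the
# directory name "opt-125m-4bit-gptq" is only an example.
quantized_model.save_quantized("opt-125m-4bit-gptq")
reloaded_model = AutoGPTQForCausalLM.from_quantized("opt-125m-4bit-gptq", device=device)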