"<h1>Finetune GPT-2 with <a href=\"index.html\">LoRA</a></h1>\n<p>Here's a Colab notebook for training a feedback transformer on Tiny Shakespeare dataset.</p>\n<p><a href=\"https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/lora/experiment.ipynb\"><span translate=no>_^_0_^_</span></a></p>\n": "<h1>Finetune GPT-2 with <a href=\"index.html\">LoRA</a></h1>\n<p>Here's a Colab notebook for training a feedback transformer on Tiny Shakespeare dataset.</p>\n<p><a href=\"https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/lora/experiment.ipynb\"><span translate=no>_^_0_^_</span></a></p>\n",
"<h1>Finetune <a href=\"gpt2.html\">GPT-2</a> with <a href=\"index.html\">LoRA</a></h1>\n<p>Here's a Colab notebook for training a feedback transformer on Tiny Shakespeare dataset.</p>\n<p><a href=\"https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/lora/experiment.ipynb\"><span translate=no>_^_0_^_</span></a></p>\n": "<h1>Finetune <a href=\"gpt2.html\">GPT-2</a> with <a href=\"index.html\">LoRA</a></h1>\n<p>Here's a Colab notebook for training a feedback transformer on Tiny Shakespeare dataset.</p>\n<p><a href=\"https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/lora/experiment.ipynb\"><span translate=no>_^_0_^_</span></a></p>\n",
"<h2>Trainer configurations and the training loop</h2>\n<p>The default configs can and will be over-ridden when we start the experiment</p>\n": "<h2>Trainer configurations and the training loop</h2>\n<p>The default configs can and will be over-ridden when we start the experiment</p>\n",
"<h3>Initialize the model, optimizer and dataloader</h3>\n": "<h3>Initialize the model, optimizer and dataloader</h3>\n",
"<h3>Load pre-trained <a href=\"https://huggingface.co/openai-community/gpt2\">GPT-2 from huggingface</a></h3>\n": "<h3>Load pre-trained <a href=\"https://huggingface.co/openai-community/gpt2\">GPT-2 from huggingface</a></h3>\n",
"<h3>Tiny Shakespeare dataset</h3>\n<p>It will download from the url if not present</p>\n": "<h3>Tiny Shakespeare dataset</h3>\n<p>It will download from the url if not present</p>\n",
"<p>GPT-2 hugging face uses 1D Convolution layers. We need to transpose those weights since we use linear layers </p>\n": "<p>GPT-2 hugging face uses 1D Convolution layers. We need to transpose those weights since we use linear layers </p>\n",
"<p>Get cross entropy loss </p>\n": "<p>Get cross entropy loss </p>\n",
"<p>Initialize the <a href=\"gpt2.html\">GPT2 model</a> </p>\n": "<p>Initialize the <a href=\"gpt2.html\">GPT2 model</a> </p>\n",
"<p>Initialize the data loader </p>\n": "<p>Initialize the data loader </p>\n",
"<p>Initialize the model </p>\n": "<p>Initialize the model </p>\n",
"<p>Initialize the optimizer </p>\n": "<p>Initialize the optimizer </p>\n",
"<p>LoRA rank </p>\n": "<p>LoRA rank </p>\n",
"<p>Load out model </p>\n": "<p>Load out model </p>\n",
"<p>Load out model. We use <span translate=no>_^_0_^_</span> because the state does not have LoRA weights </p>\n": "<p>Load out model. We use <span translate=no>_^_0_^_</span> because the state does not have LoRA weights </p>\n",
"<p>Load pre-trained model weights </p>\n": "<p>Load pre-trained model weights </p>\n",
"<p>Load the huggingface model and get the parameters </p>\n": "<p>Load the huggingface model and get the parameters </p>\n",
"<p>Log the loss </p>\n": "<p>Log the loss </p>\n",
"<p>Transformer embedding and prediction layer parameter mapping (<span translate=no>_^_0_^_</span>) </p>\n": "<p>Transformer embedding and prediction layer parameter mapping (<span translate=no>_^_0_^_</span>) </p>\n",
"<p>make sure that only lora weights are not loaded </p>\n": "<p>make sure that only lora weights are not loaded </p>\n",
"Finetune GPT-2 with LoRA": "Finetune GPT-2 with LoRA",
"This is training code with notes for fine-tuning pre-trained GPT-2 model with LoRA.": "This is training code with notes for fine-tuning pre-trained GPT-2 model with LoRA."
"<p> Splits hidden_size dim into attn_head_size and num_heads</p>\n": "<p> Splits hidden_size dim into attn_head_size and num_heads</p>\n",
"<h1>GPT-2 with <a href=\"index.html\">LoRA modules</a></h1>\n<p>Here's <a href=\"experiment.html\">the training code</a> for training a GPT2 model with LoRA on Tiny Shakespeare dataset.</p>\n": "<h1>GPT-2 with <a href=\"index.html\">LoRA modules</a></h1>\n<p>Here's <a href=\"experiment.html\">the training code</a> for training a GPT2 model with LoRA on Tiny Shakespeare dataset.</p>\n",
"<p>Projection layer to logit space </p>\n": "<p>Projection layer to logit space </p>\n",
"<p>Reorder to <span translate=no>_^_0_^_</span> </p>\n": "<p>Reorder to <span translate=no>_^_0_^_</span> </p>\n",
"<p>Run through transformer blocks </p>\n": "<p>Run through transformer blocks </p>\n",
"<p>lin1 </p>\n": "<p>lin1 </p>\n",
"<p>lin2 </p>\n": "<p>lin2 </p>\n",
"<p>out </p>\n": "<p>out </p>\n",
"<p>qkv </p>\n": "<p>qkv </p>\n",
"<p>Split last dimension to <span translate=no>_^_0_^_</span> </p>\n": "<p>Split last dimension to <span translate=no>_^_0_^_</span> </p>\n",
"<p>The linear layers and the activation </p>\n": "<p>The linear layers and the activation </p>\n",
"<p>Token and absolute positional embeddings </p>\n": "<p>Token and absolute positional embeddings </p>\n",
"<p>Transform them from shape <span translate=no>_^_0_^_</span> to <span translate=no>_^_1_^_</span> </p>\n": "<p>Transform them from shape <span translate=no>_^_0_^_</span> to <span translate=no>_^_1_^_</span> </p>\n",
"<ul><li><span translate=no>_^_0_^_</span> has shape <span translate=no>_^_1_^_</span></li></ul>\n": "<ul><li><span translate=no>_^_0_^_</span> has shape <span translate=no>_^_1_^_</span></li></ul>\n",
"gpt2.py": "gpt2.py"
"<ul><li><span translate=no>_^_0_^_</span> is the embeddings tensor with shape <span translate=no>_^_1_^_</span></li></ul>\n": "<ul><li><span translate=no>_^_0_^_</span> is the embeddings tensor with shape <span translate=no>_^_1_^_</span></li></ul>\n",
"<ul><li><span translate=no>_^_0_^_</span> is the number of dimensions </li>\n<li><span translate=no>_^_1_^_</span> is the size of the hidden dimension </li>\n<li><span translate=no>_^_2_^_</span> is the lora rank</li></ul>\n": "<ul><li><span translate=no>_^_0_^_</span> is the number of dimensions </li>\n<li><span translate=no>_^_1_^_</span> is the size of the hidden dimension </li>\n<li><span translate=no>_^_2_^_</span> is the lora rank</li></ul>\n",
"<ul><li><span translate=no>_^_0_^_</span> is the number of dimensions in the embeddings </li>\n<li><span translate=no>_^_1_^_</span> is the number of attention heads </li>\n<li><span translate=no>_^_2_^_</span> is the number of decoder layers </li>\n<li><span translate=no>_^_3_^_</span> is the number of positional embeddings </li>\n<li><span translate=no>_^_4_^_</span> is the layer norm epsilon </li>\n<li><span translate=no>_^_5_^_</span> is the vocabulary size </li>\n<li><span translate=no>_^_6_^_</span> is the lora rank</li></ul>\n": "<ul><li><span translate=no>_^_0_^_</span> is the number of dimensions in the embeddings </li>\n<li><span translate=no>_^_1_^_</span> is the number of attention heads </li>\n<li><span translate=no>_^_2_^_</span> is the number of decoder layers </li>\n<li><span translate=no>_^_3_^_</span> is the number of positional embeddings </li>\n<li><span translate=no>_^_4_^_</span> is the layer norm epsilon </li>\n<li><span translate=no>_^_5_^_</span> is the vocabulary size </li>\n<li><span translate=no>_^_6_^_</span> is the lora rank</li></ul>\n",
"<ul><li><span translate=no>_^_0_^_</span> is the number of dimensions in the embeddings </li>\n<li><span translate=no>_^_1_^_</span> is the number of heads </li>\n<li><span translate=no>_^_2_^_</span> is the layer norm epsilon </li>\n<li><span translate=no>_^_3_^_</span> is the lora rank</li></ul>\n": "<ul><li><span translate=no>_^_0_^_</span> is the number of dimensions in the embeddings </li>\n<li><span translate=no>_^_1_^_</span> is the number of heads </li>\n<li><span translate=no>_^_2_^_</span> is the layer norm epsilon </li>\n<li><span translate=no>_^_3_^_</span> is the lora rank</li></ul>\n",
"<ul><li><span translate=no>_^_0_^_</span> is the number of dimensions in the embeddings </li>\n<li><span translate=no>_^_1_^_</span> is the number of heads </li>\n<li><span translate=no>_^_2_^_</span> is the lora rank</li></ul>\n": "<ul><li><span translate=no>_^_0_^_</span> is the number of dimensions in the embeddings </li>\n<li><span translate=no>_^_1_^_</span> is the number of heads </li>\n<li><span translate=no>_^_2_^_</span> is the lora rank</li></ul>\n",
"<ul><li><span translate=no>_^_0_^_</span> is the tensor with shape <span translate=no>_^_1_^_</span></li></ul>\n": "<ul><li><span translate=no>_^_0_^_</span> is the tensor with shape <span translate=no>_^_1_^_</span></li></ul>\n",
"GPT-2 implementation with LoRA modules": "GPT-2 implementation with LoRA modules",