diff --git a/configs/test_ft.yaml b/configs/test_ft.yaml
deleted file mode 100644
index 0298533..0000000
--- a/configs/test_ft.yaml
+++ /dev/null
@@ -1,54 +0,0 @@
-model_name_or_path: "NousResearch/Nous-Hermes-llama-2-7b"
-dataset: "HydraLM/corpus_1_clustered_formatted"
-split: "config0"
-dataset_format: 'input-output'
-use_auth: true
-output_dir: "./output/7b_cluster00"
-run_name: "run_name"
-logging_steps: 10
-source_max_len: 2048
-target_max_len: 2048
-per_device_train_batch_size: 4
-gradient_accumulation_steps: 4
-learning_rate: 0.0002
-save_strategy: "steps"
-data_seed: 42
-max_steps: 5000
-eval_steps: 200
-save_steps: 200
-save_total_limit: 100
-evaluation_strategy: "steps"
-eval_dataset_size: 1024
-max_eval_samples: 1000
-per_device_eval_batch_size: 4
-do_train: true
-do_eval: true
-do_mmlu_eval: true
-metric_for_best_model: eval_loss
-load_best_model_at_end: true
-greater_is_better: false
-metric_for_best_model: eval_loss
-greater_is_better: false
-dataloader_num_workers: 8
-group_by_length: false
-logging_strategy: "steps"
-remove_unused_columns: false
-lora_r: 64
-lora_alpha: 16
-lora_modules: "all"
-double_quant: true
-quant_type: "nf4"
-bf16: true
-bits: 4
-warmup_ratio: 0.03
-lr_scheduler_type: "constant"
-gradient_checkpointing: true
-adam_beta2: 0.999
-max_grad_norm: 0.3
-lora_dropout: 0.1
-weight_decay: 0.05
-seed: 0
-push_to_hub: true
-hub_strategy: "all_checkpoints"
-hub_model_id: "HydraLM"
-report_to: wandb
\ No newline at end of file
diff --git a/moe_/__init__.py b/moe_/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/moe_/alpha.py b/moe_/alpha.py
deleted file mode 100644
index a67aa40..0000000
--- a/moe_/alpha.py
+++ /dev/null
@@ -1,362 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import bitsandbytes as bnb
-
-from peft import LoraModel
-from peft.tuners.lora import LoraLayer, Linear, Linear4bit, Linear8bitLt, Embedding
-
-class AlphaLoraModel(LoraModel):
-    #Extends LoraModel to provide support for inference with multiple adapters at a time.
-
-    def __init__(self, model, config, adapter_name):
-        super().__init__()
-        self.model = model
-        self.forward = self.model.forward
-        self.peft_config = config
-        self.add_adapter(adapter_name, self.peft_config[adapter_name])
-
-        # transformers models have a .config attribute, whose presence is assumed later on
-        if not hasattr(self, "config"):
-            self.config = {"model_type": "custom"}
-
-    def _update_alphas(self, alphas_dict):
-        lora_config = list(self.peft_config.items())[0][1]
-        self._check_quantization_dependency()
-
-        key_list = [key for key, _ in self.model.named_modules()]
-        for key in key_list:
-
-            if not self._check_target_module_exists(lora_config, key):
-                continue
-
-            parent, target, target_name = _get_submodules(self.model, key)
-
-            if isinstance(target, LoraLayer):
-                target._update_alphas(alphas_dict)
-
-    def _create_new_module(self, lora_config, adapter_name, target):
-        bias = hasattr(target, "bias") and target.bias is not None
-        kwargs = {
-            "r": lora_config.r,
-            "lora_alpha": lora_config.lora_alpha,
-            "lora_dropout": lora_config.lora_dropout,
-            "fan_in_fan_out": lora_config.fan_in_fan_out,
-            "init_lora_weights": lora_config.init_lora_weights,
-        }
-        loaded_in_4bit = getattr(self.model, "is_loaded_in_4bit", False)
-        loaded_in_8bit = getattr(self.model, "is_loaded_in_8bit", False)
-
-        if loaded_in_8bit and isinstance(target, bnb.nn.Linear8bitLt):
-            eightbit_kwargs = kwargs.copy()
-            eightbit_kwargs.update(
-                {
-                    "has_fp16_weights": target.state.has_fp16_weights,
-                    "memory_efficient_backward": target.state.memory_efficient_backward,
-                    "threshold": target.state.threshold,
-                    "index": target.index,
-                }
-            )
-            new_module = AlphaLinear8bitLt(
-                adapter_name, target.in_features, target.out_features, bias=bias, **eightbit_kwargs
-            )
-        elif loaded_in_4bit and is_bnb_4bit_available() and isinstance(target, bnb.nn.Linear4bit):
-            fourbit_kwargs = kwargs.copy()
-            fourbit_kwargs.update(
-                {
-                    "compute_dtype": target.compute_dtype,
-                    "compress_statistics": target.weight.compress_statistics,
-                    "quant_type": target.weight.quant_type,
-                }
-            )
-            new_module = AlphaLinear4bit(
-                adapter_name, target.in_features, target.out_features, bias=bias, **fourbit_kwargs)
-        elif isinstance(target, torch.nn.Embedding):
-            embedding_kwargs = kwargs.copy()
-            embedding_kwargs.pop("fan_in_fan_out", None)
-            in_features, out_features = target.num_embeddings, target.embedding_dim
-            new_module = AlphaEmbedding(
-                adapter_name, in_features, out_features, **embedding_kwargs)
-        else:
-            if isinstance(target, torch.nn.Linear):
-                in_features, out_features = target.in_features, target.out_features
-                if kwargs["fan_in_fan_out"]:
-                    warnings.warn(
-                        "fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. "
-                        "Setting fan_in_fan_out to False."
-                    )
-                    kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = False
-            elif isinstance(target, Conv1D):
-                in_features, out_features = (
-                    target.weight.ds_shape if hasattr(
-                        target.weight, "ds_shape") else target.weight.shape
-                )
-                kwargs["is_target_conv_1d_layer"] = True
-                if not kwargs["fan_in_fan_out"]:
-                    warnings.warn(
-                        "fan_in_fan_out is set to False but the target module is `Conv1D`. "
-                        "Setting fan_in_fan_out to True."
-                    )
-                    kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = True
-            else:
-                raise ValueError(
-                    f"Target module {target} is not supported. "
-                    f"Currently, only `torch.nn.Linear` and `Conv1D` are supported."
-                )
-            new_module = AlphaLinear(adapter_name, in_features,
-                                     out_features, bias=bias, **kwargs)
-
-        return new_module
-
-    def enable_adapter_layers(self):
-        return NotImplemented
-
-    def _get_active_adapter(self) -> str:
-        return NotImplemented
-
-    def disable_adapter_layers(self):
-        return NotImplemented
-
-    def set_adapter(self, adapter_name):
-        return NotImplemented
-
-
-    def merge_adapter(self):
-        return NotImplemented
-
-    def unmerge_adapter(self):
-        return NotImplemented
-
-    def delete_adapter(self, adapter_name):
-        """
-        Deletes an existing adapter.
-
-        Args:
-            adapter_name (str): Name of the adapter to be deleted.
-        """
-        if adapter_name not in list(self.peft_config.keys()):
-            raise ValueError(f"Adapter {adapter_name} does not exist")
-        del self.peft_config[adapter_name]
-        key_list = [key for key, _ in self.model.named_modules()
-                    if "lora" not in key]
-        for key in key_list:
-            _, target, _ = _get_submodules(self.model, key)
-            if isinstance(target, LoraLayer):
-                for attr in [
-                    "r",
-                    "lora_alpha",
-                    "scaling",
-                    "lora_A",
-                    "lora_B",
-                    "lora_embedding_A",
-                    "lora_embedding_B",
-                    "lora_dropout",
-                ]:
-                    if adapter_name in getattr(target, attr):
-                        getattr(target, attr).pop(adapter_name)
-                if target.active_adapter == adapter_name:
-                    resetting_active_adapter = list(self.peft_config.keys())[0]
-                    warnings.warn(
-                        f"Adapter {adapter_name} was active which is now deleted. Setting active adapter to {resetting_active_adapter}. "
-                    )
-                    target.active_adapter = resetting_active_adapter
-
-    def unload(self):
-        """
-        Gets back the base model by removing all the lora modules without merging. This gives back the original base
-        model.
-        """
-        return self._unload_and_optionally_merge(merge=False)
-
-
-class AlphaLoraLayer(LoraLayer):
-
-    def __init__(self, in_features: int, out_features: int, **kwargs):
-        super(LoraLayer, self).__init__(self, in_features, out_features, **kwargs)
-
-    def _update_alphas(self, new_alphas: dict):
-        # dont update entire layer, only alphas
-        self.lora_alpha = new_alphas
-        adapters = list(self.lora_A.keys())
-        for adapter_name in adapters:
-            self.scaling[adapter_name] = self.lora_alpha[adapter_name] / self.r[adapter_name]
-
-
-class AlphaLinear(Linear):
-    # hydra-moe-alpha implemented in a linear layer.
-
-    def forward(self, x: torch.Tensor):
-        previous_dtype = x.dtype
-        result = F.linear(x, transpose(
-            self.weight, self.fan_in_fan_out), bias=self.bias)
-        adapters = list(self.lora_A.keys())
-        x = x.to(self.lora_A[adapters[0]].weight.dtype)
-        for adapter_name in adapters:
-            result += (
-                self.lora_B[adapter_name](
-                    self.lora_A[adapter_name](
-                        self.lora_dropout[adapter_name](x))
-                )
-                * self.scaling[adapter_name]
-            )
-        result = result.to(previous_dtype)
-
-        return result
-
-
-class AlphaEmbedding(Embedding):
-    # hydra-moe-alpha implemented in an embedding layer.
-
-    def forward(self, x: torch.Tensor):
-        result = nn.Embedding.forward(self, x)
-        adapters = list(self.lora_A.keys())
-        for adapter_name in adapters:
-            after_A = F.embedding(
-                x,
-                self.lora_embedding_A[adapter_name].T,
-                self.padding_idx,
-                self.max_norm,
-                self.norm_type,
-                self.scale_grad_by_freq,
-                self.sparse,
-            )
-            result += (after_A @
-                       self.lora_embedding_B[adapter_name].T) * self.scaling[adapter_name]
-
-        return result
-
-
-class AlphaLinear8bitLt(Linear8bitLt):
-    # hydra-moe-alpha implemented in a dense layer.
-
-    def forward(self, x: torch.Tensor):
-        result = super().forward(x)
-
-        if self.disable_adapters:
-            return result
-
-        else:
-            adapters = list(self.lora_A.keys())
-
-            if not torch.is_autocast_enabled():
-                expected_dtype = result.dtype
-                if x.dtype != torch.float32:
-                    x = x.float()
-
-                for adapter_name in adapters:
-                    result += self.lora_B[adapter_name](
-                        self.lora_A[adapter_name](
-                            self.lora_dropout[adapter_name](x))
-                    ).to(expected_dtype) \
-                        * self.scaling[adapter_name]
-            else:
-
-                for adapter_name in adapters:
-                    result += self.lora_B[adapter_name](
-                        self.lora_A[adapter_name](
-                            self.lora_dropout[adapter_name](x))
-                    ) * self.scaling[adapter_name]
-        return result
-
-
-class AlphaLinear4Bit(Linear4bit):
-    # hydra-moe-alpha implemented in a dense layer.
-
-    def __init__(self, **kwargs):
-        super(Linear4bit, self).__init__(kwargs)
-
-
-    def forward(self, x: torch.Tensor):
-        result = super().forward(x)
-
-        if self.disable_adapters:
-            return result
-
-        else:
-            result = result.clone()
-            adapters = list(self.lora_A.keys())
-
-            if not torch.is_autocast_enabled():
-                expected_dtype = result.dtype
-                x = x.to(self.lora_A[adapters[0]].weight.dtype)
-
-                for adapter_name in adapters:
-                    result += self.lora_B[adapter_name](
-                        self.lora_A[adapter_name](
-                            self.lora_dropout[adapter_name](x))
-                    ).to(expected_dtype) \
-                        * self.scaling[adapter_name]
-            else:
-
-                for adapter_name in adapters:
-                    result += self.lora_B[adapter_name](
-                        self.lora_A[adapter_name](
-                            self.lora_dropout[adapter_name](x))
-                    ) * self.scaling[adapter_name]
-
-            return result
-
-
-
-class Linear4bit(bnb.nn.Linear4bit, LoraLayer):
-    # Lora implemented in a dense layer
-    def __init__(
-        self,
-        adapter_name,
-        in_features,
-        out_features,
-        r: int = 0,
-        lora_alpha: int = 1,
-        lora_dropout: float = 0.0,
-        **kwargs,
-    ):
-        bnb.nn.Linear4bit.__init__(
-            self,
-            in_features,
-            out_features,
-            bias=kwargs.get("bias", True),
-            compute_dtype=kwargs.get("compute_dtype", torch.float32),
-            compress_statistics=kwargs.get(
-                "compress_statistics", True),
-            quant_type=kwargs.get("quant_type", "nf4"),
-        )
-        LoraLayer.__init__(
-            self, in_features=in_features, out_features=out_features)
-
-        # Freezing the pre-trained weight matrix
-        self.weight.requires_grad = False
-
-        init_lora_weights = kwargs.pop("init_lora_weights", True)
-        self.update_layer(adapter_name, r, lora_alpha,
-                          lora_dropout, init_lora_weights)
-        self.active_adapter = adapter_name
-
-    def forward(self, x: torch.Tensor):
-        result = super().forward(x)
-
-        if self.disable_adapters:
-            return result
-
-        else:
-            result = result.clone()
-            adapters = list(self.lora_A.keys())
-
-            if not torch.is_autocast_enabled():
-                expected_dtype = result.dtype
-                x = x.to(self.lora_A[adapters[0]].weight.dtype)
-
-                for adapter_name in adapters:
-                    result += self.lora_B[adapter_name](
-                        self.lora_A[adapter_name](
-                            self.lora_dropout[adapter_name](x))
-                    ).to(expected_dtype) \
-                        * self.scaling[adapter_name]
-            else:
-
-                for adapter_name in adapters:
-                    result += self.lora_B[adapter_name](
-                        self.lora_A[adapter_name](
-                            self.lora_dropout[adapter_name](x))
-                    ) * self.scaling[adapter_name]
-
-            return result
diff --git a/pipeline_test.ipynb b/pipeline_test.ipynb
deleted file mode 100644
index 56cc69e..0000000
--- a/pipeline_test.ipynb
+++ /dev/null
@@ -1,262 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "initial_id",
-   "metadata": {
-    "collapsed": true,
-    "is_executing": true
-   },
-   "outputs": [],
-   "source": [
-    "from data_utils import *\n",
-    "import numpy as np\n",
-    "from datasets import load_dataset, Dataset\n",
-    "import pandas as pd"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "c1ba515b9dfcb95d",
-   "metadata": {
-    "collapsed": false
-   },
-   "source": [
-    "# Step 1: Define dataset names to be used"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "a811f1c8215dffcb",
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2023-08-22T05:44:51.105605Z",
-     "start_time": "2023-08-22T05:44:51.103575Z"
-    },
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "dataset_names = [\n",
-    "    # \"HydraLM/camel_society_standardized\",\n",
-    "    # \"HydraLM/camel_code_standardized\",\n",
-    "    # \"HydraLM/GPTeacher_toolformer_standardized\",\n",
-    "    # \"HydraLM/GPTeacher_roleplay_standardized\",\n",
-    "    # \"HydraLM/GPTeacher_codegen_standardized\",\n",
-    "    # \"HydraLM/unnatural-instructions_standardized\",\n",
-    "    # \"HydraLM/physics_dataset_standardized\",\n",
-    "    # \"HydraLM/math_dataset_standardized\",\n",
-    "    # \"HydraLM/chemistry_dataset_standardized\",\n",
-    "    # \"HydraLM/biology_dataset_standardized\",\n",
-    "    # \"HydraLM/airoboros-gpt4-1.4_standardized\",\n",
-    "    # \"HydraLM/WizardLM_evol_instruct_V2_196k_standardized\",\n",
-    "    # \"HydraLM/CodeAlpaca-20k_standardized\",\n",
-    "    # \"HydraLM/GPT4-LLM-Cleaned_standardized\",\n",
-    "    # \"HydraLM/GPTeacher-General-Instruct_standardized\",\n",
-    "    # \"HydraLM/lima_standardized\",\n",
-    "    # \"HydraLM/conala_standardized\",\n",
-    "    # \"HydraLM/alpaca_data_cleaned_standardized\",\n",
-    "    # \"HydraLM/riddle_sense_standardized\",\n",
-    "    # \"HydraLM/gsm8k_standardized\",\n",
-    "    # \"HydraLM/sciq_standardized\",\n",
-    "    # \"HydraLM/share_gpt_vicuna_unfiltered_standardized\",\n",
-    "    # \"HydraLM/wizard_vicuna_dataset_unfiltered_standardized\",\n",
-    "    # \"HydraLM/science_qa_txt_only_standardized\",\n",
-    "    # \"HydraLM/glaive_function_calling_v1_standardized\",\n",
-    "    # \"HydraLM/puffin_standardized\",\n",
-    "    # \"HydraLM/gorilla_16k_standardized\",\n",
-    "    # \"HydraLM/goat_standardized\",\n",
-    "    # \"HydraLM/wizard_evolinstruct70k_k4_standardized\",\n",
-    "    # \"HydraLM/Open_Platypus_standardized\",\n",
-    "    \"HydraLM/TheoremQA_standardized\",\n",
-    "    \"HydraLM/code_instructions_122k_alpaca_style_standardized\",\n",
-    "    \"HydraLM/databricks-dolly-15k_standardized\",\n",
-    "]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f5c5d5370d3ed468",
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "markdown",
-   "id": "68ce6e7cf8593eb1",
-   "metadata": {
-    "collapsed": false
-   },
-   "source": [
-    "# Step 2: Load datasets"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "id": "b04c1b0416d300fa",
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2023-08-22T05:44:57.954359Z",
-     "start_time": "2023-08-22T05:44:54.970712Z"
-    },
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "datasets = {dataset_name: load_dataset(dataset_name) for dataset_name in dataset_names}"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "3dd45a6195a744ac",
-   "metadata": {
-    "collapsed": false
-   },
-   "source": [
-    "# Step 3: Combine datasets"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "id": "3de141c24715c6da",
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2023-08-22T05:45:06.701655Z",
-     "start_time": "2023-08-22T05:44:57.955383Z"
-    },
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "combined = combine_datasets(list(datasets.values()), list(datasets.keys()), add_unique_id=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "id": "a13f7c9fe9ab3538",
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2023-08-22T05:46:00.796041Z",
-     "start_time": "2023-08-22T05:46:00.794293Z"
-    },
-    "collapsed": false
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "137770"
-      ]
-     },
-     "execution_count": 11,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "len(set(combined[\"unique_conversation_id\"]))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "id": "6a110a3d0e8fe5",
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2023-08-22T05:45:06.706303Z",
-     "start_time": "2023-08-22T05:45:06.702076Z"
-    },
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "#TODO: filter out empty inputs and empty outputs(removing messages until previous output for multi-turn)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "e8ac7ea4ecf5b6e9",
-   "metadata": {
-    "collapsed": false
-   },
-   "source": [
-    "# Step 4: Prepare for clustering, combine the text."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "id": "d7e71053c742c59",
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2023-08-22T05:45:26.867523Z",
-     "start_time": "2023-08-22T05:45:17.872057Z"
-    },
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "clustering_dataset = format(combined, cluster_template, split=None)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 20,
-   "id": "71d8f13a37736dce",
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2023-08-22T05:59:59.028885Z",
-     "start_time": "2023-08-22T05:59:57.340648Z"
-    },
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "from embedding import BGE_Large\n",
-    "embedding_model = BGE_Large()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "66f0e72d6f398760",
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "\n",
-    "clustering_dataset = clustering_dataset.map(lambda x: {\"embedding\": embedding_model.embed_text(x[\"text\"])})"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 2
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython2",
-   "version": "2.7.6"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}