From f70571188b5f27d694edc4fdf276bc96886dcc20 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Thu, 9 May 2024 13:00:30 -0300 Subject: [PATCH] TGIS gRPC adapter for lm-eval This PoC adds a backend to lm-eval so that it can call a running TGIS or tgis-vllm server over grpc. It can run benchmarks based on the generate function for decoder and encoder-decoder models. For the logprobs function only decoder models are supported because tgis doesn't return the input logprobs for encoder-decoder models. Signed-off-by: Max de Bayser --- lm-eval/Dockerfile | 16 ++ lm-eval/Makefile | 15 ++ lm-eval/README.md | 51 ++++++ lm-eval/job.yaml | 30 +++ lm-eval/pyproject.toml | 14 ++ lm-eval/tgis_eval/__init__.py | 0 lm-eval/tgis_eval/__main__.py | 10 + lm-eval/tgis_eval/model.py | 335 ++++++++++++++++++++++++++++++++++ 8 files changed, 471 insertions(+) create mode 100644 lm-eval/Dockerfile create mode 100644 lm-eval/Makefile create mode 100644 lm-eval/README.md create mode 100644 lm-eval/job.yaml create mode 100644 lm-eval/pyproject.toml create mode 100644 lm-eval/tgis_eval/__init__.py create mode 100644 lm-eval/tgis_eval/__main__.py create mode 100644 lm-eval/tgis_eval/model.py diff --git a/lm-eval/Dockerfile b/lm-eval/Dockerfile new file mode 100644 index 00000000..6d32c382 --- /dev/null +++ b/lm-eval/Dockerfile @@ -0,0 +1,16 @@ +ARG BASE_UBI_IMAGE_TAG=9.3-1552 +FROM registry.access.redhat.com/ubi9/ubi:${BASE_UBI_IMAGE_TAG} + +RUN dnf update -y && dnf install -y python3-pip python3-wheel git && dnf clean all + +WORKDIR /app +COPY pyproject.toml pyproject.toml +COPY tgis_eval tgis_eval + +RUN pip install . + +RUN useradd app +VOLUME /cache +ENV XDG_CACHE_HOME=/cache + +USER app diff --git a/lm-eval/Makefile b/lm-eval/Makefile new file mode 100644 index 00000000..66d6edc2 --- /dev/null +++ b/lm-eval/Makefile @@ -0,0 +1,15 @@ +gen-client: + # Compile protos + pip install grpcio-tools==1.60.0 mypy-protobuf==3.5.0 'types-protobuf>=3.20.4' --no-cache-dir + mkdir tgis_eval/pb || true + python -m grpc_tools.protoc -I../proto --python_out=tgis_eval/pb \ + --grpc_python_out=tgis_eval/pb --mypy_out=tgis_eval/pb ../proto/generation.proto + find tgis_eval/pb/ -type f -name "*.py" -print0 -exec sed -i -e 's/^\(import.*pb2\)/from . \1/g' {} \; + touch tgis_eval/pb/__init__.py + +install: gen-client + pip install pip --upgrade + pip install -e . --no-cache-dir + +image: gen-client + podman build -t quay.io/wxpe/lm-eval-tgis:0.0.2 . diff --git a/lm-eval/README.md b/lm-eval/README.md new file mode 100644 index 00000000..371773e3 --- /dev/null +++ b/lm-eval/README.md @@ -0,0 +1,51 @@ +# TGIS eval framework + +This directory contains an adapter to run the [lm-eval](https://github.com/EleutherAI/lm-evaluation-harness) +framework on a TGIS server. We subclass the Model class to collect the benchmark requests and send +them to the TGIS server over gRPC. + +## Installing + +To install lm-eval with tgis support in your environment run `make install`. 
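+
+Once installed, you can smoke-test the connection to a running TGIS server before launching a
+full benchmark. The snippet below is a minimal sketch that uses the same generated gRPC stubs as
+the adapter itself; it assumes the server is reachable at `localhost:8033`, so adjust the address
+to your deployment:
+
+```
+import grpc
+from tgis_eval.pb import generation_pb2 as pb2, generation_pb2_grpc as gpb2
+
+# ModelInfo is the same RPC the adapter calls on startup; the response includes the
+# model kind (decoder-only vs. encoder-decoder).
+channel = grpc.insecure_channel("localhost:8033")  # assumed host and port
+stub = gpb2.GenerationServiceStub(channel)
+print(stub.ModelInfo(pb2.ModelInfoRequest()))
+```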
+
+
+## Running
+
+To run the benchmark, invoke the package as a Python module on the command line (adjust
+`--batch_size` to fit your GPU):
+
+```
+python3 -m tgis_eval \
+    --model_args server=<server>,port=<port> \
+    --model=tgis_eval \
+    --batch_size=16 \
+    --tasks <comma-separated list of tasks>
+```
+
+For example, to run the six benchmarks that make up the Hugging Face Open LLM Leaderboard
+on a TGIS instance running on hostname `flan-t5-inference-server`:
+
+```
+python3 -m tgis_eval \
+    --model_args server=flan-t5-inference-server,port=8033 \
+    --model=tgis_eval \
+    --batch_size=16 \
+    --tasks ai2_arc,hellaswag,mmlu,truthfulqa,winogrande,gsm8k
+```
+
+## Building the container
+
+To build the container, run `make image`.
+
+
+## Running as a job on Kubernetes
+
+You can run tgis-eval as a Kubernetes Job. Edit the `job.yaml` file in this directory to suit
+your needs: make sure that the server hostname is correct and that the listed benchmarks are the
+ones you want. Then submit the job with
+
+```
+kubectl apply -f job.yaml
+```
+
+If you're going to run several rounds of tests, it is recommended to allocate a persistent
+volume and mount it at `/cache` in the job pod (in place of the `emptyDir` volume). This avoids
+downloading the same datasets over and over.
diff --git a/lm-eval/job.yaml b/lm-eval/job.yaml
new file mode 100644
index 00000000..27675b57
--- /dev/null
+++ b/lm-eval/job.yaml
@@ -0,0 +1,30 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: lm-eval-job
+spec:
+  template:
+    spec:
+      containers:
+      - name: eval
+        image: quay.io/wxpe/lm-eval-tgis:0.0.7
+        command:
+        - python3
+        - -m
+        - tgis_eval
+        - --model_args
+        - server=flan-t5-inference-server,port=8033
+        - --model=tgis_eval
+        - --batch_size=16
+        - --tasks
+        - ai2_arc,hellaswag,mmlu,truthfulqa,winogrande,gsm8k
+        #- --limit=10
+        volumeMounts:
+        - name: cache-volume
+          mountPath: /cache
+      restartPolicy: Never
+      volumes:
+      - name: cache-volume
+        emptyDir: {}
+  backoffLimit: 0
+
diff --git a/lm-eval/pyproject.toml b/lm-eval/pyproject.toml
new file mode 100644
index 00000000..9d4bc6a4
--- /dev/null
+++ b/lm-eval/pyproject.toml
@@ -0,0 +1,14 @@
+[tool.poetry]
+name = "tgis-eval"
+version = "0.1.0"
+description = "lm-eval backend for tgis"
+authors = ["Max de Bayser"]
+
+[tool.poetry.dependencies]
+python = ">=3.9"
+grpcio-tools = "^1.62.1"
+lm-eval = "^0.4.2"
+
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
diff --git a/lm-eval/tgis_eval/__init__.py b/lm-eval/tgis_eval/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/lm-eval/tgis_eval/__main__.py b/lm-eval/tgis_eval/__main__.py
new file mode 100644
index 00000000..7cabd90a
--- /dev/null
+++ b/lm-eval/tgis_eval/__main__.py
@@ -0,0 +1,10 @@
+try:
+    from lm_eval.__main__ import cli_evaluate
+except ImportError:
+    raise ImportError("Could not import lm_eval: please install this package and its dependencies, e.g. with `make install`.")  # noqa: B904
+
+from .model import initialize_model
+
+initialize_model()
+
+cli_evaluate()
diff --git a/lm-eval/tgis_eval/model.py b/lm-eval/tgis_eval/model.py
new file mode 100644
index 00000000..0dfb0456
--- /dev/null
+++ b/lm-eval/tgis_eval/model.py
@@ -0,0 +1,335 @@
+import json
+from collections import defaultdict
+from typing import Any, Iterator, NamedTuple, Optional, Type, cast
+
+
+import lm_eval.utils
+from lm_eval.api.instance import Instance
+from lm_eval.api.model import LM
+from lm_eval.api.registry import register_model
+from lm_eval.models.utils import Grouper
+
+from tqdm import tqdm
+from .pb import generation_pb2_grpc as gpb2, generation_pb2 as pb2
+#import proto
+import grpc
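+# Note: the generation_pb2* modules under tgis_eval/pb are generated from ../proto/generation.proto
+# by `make gen-client` (see the Makefile); they are not part of this patch.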
+import numpy as np +from time import time + +SERVER = 'localhost' +PORT = 8033 +DEFAULT_BATCH_SIZE=64 +DEFAULT_MAX_NEW_TOKENS=300 + +class LogLikelihoodResult(NamedTuple): + log_likelihood: float + is_greedy: bool + + +def initialize_model(): + pass # model is registered by importing this module + +def chunks(lst, n): + """Yield successive n-sized chunks from lst.""" + for i in range(0, len(lst), n): + yield lst[i:i + n] + +def option(cls, val): + return None if val is None else cls(val) + +@register_model("tgis_eval") +class TGISLMEval(LM): + """ + Implementation of LM model interface for evaluating TGIS model with the lm_eval framework. + + See https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/model_guide.md for reference. + """ + + @classmethod + def create_from_arg_string( + cls: Type["TGISLMEval"], + arg_string: str, + additional_config: Optional[dict] = None, + ) -> "TGISLMEval": + """Allow the user to specify model parameters (TextGenerationParameters) in CLI arguments.""" + args = lm_eval.utils.simple_parse_args_string(arg_string) + print(f"LM args = {args}") + return cls(parameters=args, additional_config=additional_config) + + def __init__( + self, + parameters = None, + additional_config: Optional[dict] = None, + show_progressbar: Optional[bool] = True + ): + super().__init__() + + additional_config = {} if additional_config is None else additional_config + + self.server = parameters.get("server", SERVER) + self.port = int(parameters.get("port", PORT)) + + self.channel = grpc.insecure_channel(f"{self.server}:{self.port}") + self.stub = gpb2.GenerationServiceStub(self.channel) + + self.model_kind = self.stub.ModelInfo(pb2.ModelInfoRequest()).model_kind + + self._parameters = parameters + self._show_progressbar = show_progressbar + self.batch_size = int(additional_config.get("batch_size", DEFAULT_BATCH_SIZE)) + + self.decoding_method = parameters.get("decoding_method", "greedy") + + if self.decoding_method == "greedy": + self.decoding_method = pb2.GREEDY + elif self.decoding_method == "sample": + self.decoding_method = pb2.SAMPLE + else: + raise ValueError(f"{self.decoding_method} is not valid for parameter decoding_method") + + self.sampling_params = pb2.SamplingParameters( + temperature = option(float,parameters.get("temperature")), + top_k = option(int,parameters.get("top_k")), + top_p = option(float,parameters.get("top_p")), + typical_p = option(float,parameters.get("typical_p")), + seed = option(int,parameters.get("seed")) + ) + start_index = option(int,parameters.get("length_penalty.start_index")) + decay_factor = option(float,parameters.get("length_penalty.decay_factor")) + + if (start_index is None) != (decay_factor is None): + raise ValueError(f"length_penalty.{start_index, decay_factor} must both be set or unset") + + length_penalty = pb2.DecodingParameters.LengthPenalty( + start_index = start_index, + decay_factor = decay_factor + ) if start_index is not None else None + + self.decoding_parameters = pb2.DecodingParameters ( + repetition_penalty = option(float,parameters.get("repetition_penalty")), + length_penalty = length_penalty + ) + + def close(self): + self.channel.close() + + def _tokenize(self, inputs: list[str]) -> Iterator[list[str]]: + tokenization_request = self.get_tokenization_request(inputs) + for response in self.stub.Tokenize(tokenization_request).responses: + yield response.tokens + + def _has_stop_token(self, response_tokens: list[str], context_tokens: list[str]) -> bool: + context_length = len(context_tokens) + + # workaround 
difference in tokenization in some models + for i in range(len(context_tokens)): + if response_tokens[i] == '': + response_tokens[i] = context_tokens[i] + + if response_tokens[: context_length - 1] == context_tokens[:-1]: + return response_tokens[-1] != context_tokens[-1] # only last token differs, probably stop sequence () + raise RuntimeError( + f"There is an unexpected difference between tokenizer and model tokens:\n" + f"context_tokens={context_tokens}\n" + f"response_tokens={response_tokens[:context_length]}" + ) + + def _check_model_logprobs_support(self): + + if self.model_kind == pb2.ModelInfoResponse.ENCODER_DECODER: + raise RuntimeError(f"Encoder decoder models don't return logprobs for input tokens and are not supported") + + input_tokens = self.stub.Generate( + self.get_batch_request(["The best ice cream flavor is:"]) + ).responses[0].input_tokens + + if all(token.logprob is None or np.isnan(token.logprob) for token in input_tokens): + raise RuntimeError(f"The model is not supported: does not return logprobs for input tokens") + + + def _get_log_likelihood(self, input_tokens: list[pb2.TokenInfo], context_tokens: list[str]) -> LogLikelihoodResult: + response_tokens: list[str] = [token.text for token in input_tokens] + context_length = len(context_tokens) + + if self._has_stop_token(response_tokens, context_tokens): + context_length -= 1 + + return LogLikelihoodResult( + log_likelihood=sum(token.logprob for token in input_tokens[context_length:]), + is_greedy=all(token.rank == 1 for token in input_tokens[context_length:]), + ) + + @property + def _log_likelihood_parameters(self): + + return pb2.Parameters( + method=self.decoding_method, + sampling=self.sampling_params if self.decoding_method==pb2.SAMPLE else None, + stopping=pb2.StoppingCriteria( + min_new_tokens=1, + max_new_tokens=1, + ), + response=pb2.ResponseOptions ( + generated_tokens=True, + input_tokens=True, + token_logprobs=True, + token_ranks=True, + ), + decoding=self.decoding_parameters, + ) + + def get_batch_request(self, requests, parameters=None): + params = parameters or self._log_likelihood_parameters + return pb2.BatchedGenerationRequest( + model_id="unused", + params=params, + requests=[ + pb2.GenerationRequest(text=request) for request in requests + ], + ) + + def get_tokenization_request(self, requests): + return pb2.BatchedTokenizeRequest( + model_id="unused", + requests = [ + pb2.TokenizeRequest(text=request) for request in requests + ], + return_tokens=True, + ) + + def loglikelihood(self, requests: list[Instance]) -> list[tuple[float, bool]]: + """ + Args: + requests: Each request contains Instance.args : Tuple[str, str] containing: + 1. an input string to the LM and + 2. a target string on which the loglikelihood of the LM producing this target, + conditioned on the input, will be returned. 
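+                   For example, the pair ("The capital of France is", " Paris") asks for the
+                   log-probability of " Paris" given the context, and whether greedy decoding
+                   would have produced it.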
+ Returns: + tuple (loglikelihood, is_greedy) for each request according to the input order: + loglikelihood: probability of generating the target string conditioned on the input + is_greedy: True if and only if the target string would be generated by greedy sampling from the LM + """ + start = time() + #print(f"loglikelihood batch size = {len(requests)}") + self._check_model_logprobs_support() + + results = [] + + pb = tqdm(desc="Running text generation", total=len(requests), disable=not self._show_progressbar) + + for batch in chunks(requests, self.batch_size): + pb.update(len(batch)) + results.extend(self._loglikelihood_batch(batch)) + pb.close() + print(f"Time elapsed running the loglikelihood requests: {time()-start}s") + return results + + def _loglikelihood_batch(self, requests: list[Instance]) -> list[tuple[float, bool]]: + + #print(f"loglikelihood batch size = {len(requests)}") + + requests = [request.args for request in requests] + results: list[LogLikelihoodResult] = [] + + contexts_tokenized = list(self._tokenize([context for context, _ in requests])) + generation_inputs = [context + continuation for context, continuation in requests] + + for result, context_tokens in zip( + self.stub.Generate(self.get_batch_request(generation_inputs)).responses, + contexts_tokenized, + ): + results.append(self._get_log_likelihood(result.input_tokens, context_tokens)) + + return cast(list[tuple[float, bool]], results) + + def loglikelihood_rolling(self, requests: list[Instance]) -> list[tuple[float, bool]]: + """ + Used to evaluate perplexity on a data distribution. + + Args: + requests: Each request contains Instance.args : tuple[str] containing an input string to the model whose + entire loglikelihood, conditioned on purely the EOT token, will be calculated. + Returns: + tuple (loglikelihood,) for each request according to the input order: + loglikelihood: solely the probability of producing each piece of text given no starting input. + """ + start = time() + self._check_model_logprobs_support() + results = [] + for batch in chunks(requests, self.batch_size): + results.extend(self._loglikelihood_rolling_batch(batch)) + print(f"Time elapsed running the loglikelihood_rolling requests: {time()-start}s") + return results + + def _loglikelihood_rolling_batch(self, requests: list[Instance]) -> list[tuple[float, bool]]: + generation_inputs = [request.args[0] for request in requests] + results: list[LogLikelihoodResult] = [] + for result in self.stub.Generate(self.get_batch_request(generation_inputs)).responses: + results.append(self._get_log_likelihood(result.input_tokens, [])) + + return cast(list[tuple[float, bool]], results) + + + def generate_until(self, requests: list[Instance]) -> list[str]: + """ + From official model_guide: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/model_guide.md: + + Each request contains Instance.args : Tuple[str, dict] containing: + 1. an input string to the LM and + 2. a dictionary of keyword arguments used to control generation parameters. + Using this input and these generation parameters, text will be sampled from the language model + + ( + typically until a maximum output length or specific stopping string sequences--for example, + {"until": ["\n\n", "."], "max_gen_toks": 128} + ). + The generated input+output text from the model will then be returned. + """ + start = time() + # group requests by their args (e.g. temperature, do_sample, etc.) 
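+        # (illustration: requests whose generation kwargs serialize to the same JSON string,
+        # e.g. {"until": ["\n\n"], "max_gen_toks": 128}, end up in the same group and share one
+        # pb2.Parameters message; each group is then sent to the server in batches below)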
+ grouper = Grouper(requests, lambda request: json.dumps(request.args[1], sort_keys=True)) + results: dict[str, list[str]] = defaultdict(list) + + pb = tqdm(desc="Running text generation", total=len(requests), disable=not self._show_progressbar) + + for key, requests_group in grouper.get_grouped().items(): + generation_parameters: dict[str, Any] = requests_group[0].args[1] + inputs = [request.args[0] for request in requests_group] + + # Process parameters + do_sample = generation_parameters.pop("do_sample", False) + decoding_method = pb2.DecodingMethod.SAMPLE if do_sample else pb2.DecodingMethod.GREEDY + until = generation_parameters.pop("until") + stop_sequences = [until] if isinstance(until, str) else until + max_new_tokens = generation_parameters.pop("max_gen_toks", DEFAULT_MAX_NEW_TOKENS) + temperature = generation_parameters.pop("temperature", 0) + + sampling_params = self.sampling_params + sampling_params.temperature = temperature + + + parameters = pb2.Parameters( + method=decoding_method, + sampling = sampling_params, + stopping=pb2.StoppingCriteria( + min_new_tokens=1, + max_new_tokens=max_new_tokens, + stop_sequences=stop_sequences, + ), + response=pb2.ResponseOptions ( + generated_tokens=True, + input_tokens=True, + token_logprobs=True, + token_ranks=True, + ), + decoding=self.decoding_parameters, + ) + + for batch in chunks(inputs, self.batch_size): + for result in self.stub.Generate(self.get_batch_request(batch, parameters)).responses: + results[key].append(result.text) + pb.update(len(batch)) + + pb.close() + print(f"Time elapsed running the generate_until requests: {time()-start}s") + return grouper.get_original(results)