From f70571188b5f27d694edc4fdf276bc96886dcc20 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Thu, 9 May 2024 13:00:30 -0300 Subject: [PATCH] TGIS gRPC adapter for lm-eval This PoC adds a backend to lm-eval so that it can call a running TGIS or tgis-vllm server over grpc. It can run benchmarks based on the generate function for decoder and encoder-decoder models. For the logprobs function only decoder models are supported because tgis doesn't return the input logprobs for encoder-decoder models. Signed-off-by: Max de Bayser --- lm-eval/Dockerfile | 16 ++ lm-eval/Makefile | 15 ++ lm-eval/README.md | 51 ++++++ lm-eval/job.yaml | 30 +++ lm-eval/pyproject.toml | 14 ++ lm-eval/tgis_eval/__init__.py | 0 lm-eval/tgis_eval/__main__.py | 10 + lm-eval/tgis_eval/model.py | 335 ++++++++++++++++++++++++++++++++++ 8 files changed, 471 insertions(+) create mode 100644 lm-eval/Dockerfile create mode 100644 lm-eval/Makefile create mode 100644 lm-eval/README.md create mode 100644 lm-eval/job.yaml create mode 100644 lm-eval/pyproject.toml create mode 100644 lm-eval/tgis_eval/__init__.py create mode 100644 lm-eval/tgis_eval/__main__.py create mode 100644 lm-eval/tgis_eval/model.py diff --git a/lm-eval/Dockerfile b/lm-eval/Dockerfile new file mode 100644 index 00000000..6d32c382 --- /dev/null +++ b/lm-eval/Dockerfile @@ -0,0 +1,16 @@ +ARG BASE_UBI_IMAGE_TAG=9.3-1552 +FROM registry.access.redhat.com/ubi9/ubi:${BASE_UBI_IMAGE_TAG} + +RUN dnf update -y && dnf install -y python3-pip python3-wheel git && dnf clean all + +WORKDIR /app +COPY pyproject.toml pyproject.toml +COPY tgis_eval tgis_eval + +RUN pip install . + +RUN useradd app +VOLUME /cache +ENV XDG_CACHE_HOME=/cache + +USER app diff --git a/lm-eval/Makefile b/lm-eval/Makefile new file mode 100644 index 00000000..66d6edc2 --- /dev/null +++ b/lm-eval/Makefile @@ -0,0 +1,15 @@ +gen-client: + # Compile protos + pip install grpcio-tools==1.60.0 mypy-protobuf==3.5.0 'types-protobuf>=3.20.4' --no-cache-dir + mkdir tgis_eval/pb || true + python -m grpc_tools.protoc -I../proto --python_out=tgis_eval/pb \ + --grpc_python_out=tgis_eval/pb --mypy_out=tgis_eval/pb ../proto/generation.proto + find tgis_eval/pb/ -type f -name "*.py" -print0 -exec sed -i -e 's/^\(import.*pb2\)/from . \1/g' {} \; + touch tgis_eval/pb/__init__.py + +install: gen-client + pip install pip --upgrade + pip install -e . --no-cache-dir + +image: gen-client + podman build -t quay.io/wxpe/lm-eval-tgis:0.0.2 . diff --git a/lm-eval/README.md b/lm-eval/README.md new file mode 100644 index 00000000..371773e3 --- /dev/null +++ b/lm-eval/README.md @@ -0,0 +1,51 @@ +# TGIS eval framework + +This directory contains an adapter to run the [lm-eval](https://github.com/EleutherAI/lm-evaluation-harness) +framework on a TGIS server. We subclass the Model class to collect the benchmark requests and send +them to the TGIS server over gRPC. + +## Installing + +To install lm-eval with tgis support in your environment run `make install`. 
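+
+Once installed, you can smoke-test the connection to a running TGIS server before launching a
+full benchmark. The snippet below is a minimal sketch that uses the same generated gRPC stubs as
+the adapter itself; it assumes the server is reachable at `localhost:8033`, so adjust the address
+to your deployment:
+
+```
+import grpc
+from tgis_eval.pb import generation_pb2 as pb2, generation_pb2_grpc as gpb2
+
+# ModelInfo is the same RPC the adapter calls on startup; the response includes the
+# model kind (decoder-only vs. encoder-decoder).
+channel = grpc.insecure_channel("localhost:8033")  # assumed host and port
+stub = gpb2.GenerationServiceStub(channel)
+print(stub.ModelInfo(pb2.ModelInfoRequest()))
+```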
+
+
+## Running
+
+To run the benchmark, invoke the package as a Python module on the command line (adjust
+`--batch_size` to fit your GPU):
+
+```
+python3 -m tgis_eval \
+    --model_args server=<server>,port=<port> \
+    --model=tgis_eval \
+    --batch_size=16 \
+    --tasks <comma-separated list of tasks>
+```
+
+For example, to run the six benchmarks that make up the Hugging Face Open LLM Leaderboard
+on a TGIS instance running on hostname `flan-t5-inference-server`:
+
+```
+python3 -m tgis_eval \
+    --model_args server=flan-t5-inference-server,port=8033 \
+    --model=tgis_eval \
+    --batch_size=16 \
+    --tasks ai2_arc,hellaswag,mmlu,truthfulqa,winogrande,gsm8k
+```
+
+## Building the container
+
+To build the container, run `make image`.
+
+
+## Running as a job on Kubernetes
+
+You can run tgis-eval as a Kubernetes Job. Edit the `job.yaml` file in this directory to suit
+your needs: make sure that the server hostname is correct and that the listed benchmarks are the
+ones you want. Then submit the job with
+
+```
+kubectl apply -f job.yaml
+```
+
+If you're going to run several rounds of tests, it is recommended to allocate a persistent
+volume and mount it at `/cache` in the job pod (in place of the `emptyDir` volume). This avoids
+downloading the same datasets over and over.
diff --git a/lm-eval/job.yaml b/lm-eval/job.yaml
new file mode 100644
index 00000000..27675b57
--- /dev/null
+++ b/lm-eval/job.yaml
@@ -0,0 +1,30 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: lm-eval-job
+spec:
+  template:
+    spec:
+      containers:
+      - name: eval
+        image: quay.io/wxpe/lm-eval-tgis:0.0.7
+        command:
+        - python3
+        - -m
+        - tgis_eval
+        - --model_args
+        - server=flan-t5-inference-server,port=8033
+        - --model=tgis_eval
+        - --batch_size=16
+        - --tasks
+        - ai2_arc,hellaswag,mmlu,truthfulqa,winogrande,gsm8k
+        #- --limit=10
+        volumeMounts:
+        - name: cache-volume
+          mountPath: /cache
+      restartPolicy: Never
+      volumes:
+      - name: cache-volume
+        emptyDir: {}
+  backoffLimit: 0
+
diff --git a/lm-eval/pyproject.toml b/lm-eval/pyproject.toml
new file mode 100644
index 00000000..9d4bc6a4
--- /dev/null
+++ b/lm-eval/pyproject.toml
@@ -0,0 +1,14 @@
+[tool.poetry]
+name = "tgis-eval"
+version = "0.1.0"
+description = "lm-eval backend for tgis"
+authors = ["Max de Bayser"]
+
+[tool.poetry.dependencies]
+python = ">=3.9"
+grpcio-tools = "^1.62.1"
+lm-eval = "^0.4.2"
+
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
diff --git a/lm-eval/tgis_eval/__init__.py b/lm-eval/tgis_eval/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/lm-eval/tgis_eval/__main__.py b/lm-eval/tgis_eval/__main__.py
new file mode 100644
index 00000000..7cabd90a
--- /dev/null
+++ b/lm-eval/tgis_eval/__main__.py
@@ -0,0 +1,10 @@
+try:
+    from lm_eval.__main__ import cli_evaluate
+except ImportError:
+    raise ImportError("Could not import lm_eval: please install this package and its dependencies, e.g. with `make install`.")  # noqa: B904
+
+from .model import initialize_model
+
+initialize_model()
+
+cli_evaluate()
diff --git a/lm-eval/tgis_eval/model.py b/lm-eval/tgis_eval/model.py
new file mode 100644
index 00000000..0dfb0456
--- /dev/null
+++ b/lm-eval/tgis_eval/model.py
@@ -0,0 +1,335 @@
+import json
+from collections import defaultdict
+from typing import Any, Iterator, NamedTuple, Optional, Type, cast
+
+
+import lm_eval.utils
+from lm_eval.api.instance import Instance
+from lm_eval.api.model import LM
+from lm_eval.api.registry import register_model
+from lm_eval.models.utils import Grouper
+
+from tqdm import tqdm
+from .pb import generation_pb2_grpc as gpb2, generation_pb2 as pb2
+#import proto
+import grpc
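+# Note: the generation_pb2* modules under tgis_eval/pb are generated from ../proto/generation.proto
+# by `make gen-client` (see the Makefile); they are not part of this patch.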
+import numpy as np +from time import time + +SERVER = 'localhost' +PORT = 8033 +DEFAULT_BATCH_SIZE=64 +DEFAULT_MAX_NEW_TOKENS=300 + +class LogLikelihoodResult(NamedTuple): + log_likelihood: float + is_greedy: bool + + +def initialize_model(): + pass # model is registered by importing this module + +def chunks(lst, n): + """Yield successive n-sized chunks from lst.""" + for i in range(0, len(lst), n): + yield lst[i:i + n] + +def option(cls, val): + return None if val is None else cls(val) + +@register_model("tgis_eval") +class TGISLMEval(LM): + """ + Implementation of LM model interface for evaluating TGIS model with the lm_eval framework. + + See https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/model_guide.md for reference. + """ + + @classmethod + def create_from_arg_string( + cls: Type["TGISLMEval"], + arg_string: str, + additional_config: Optional[dict] = None, + ) -> "TGISLMEval": + """Allow the user to specify model parameters (TextGenerationParameters) in CLI arguments.""" + args = lm_eval.utils.simple_parse_args_string(arg_string) + print(f"LM args = {args}") + return cls(parameters=args, additional_config=additional_config) + + def __init__( + self, + parameters = None, + additional_config: Optional[dict] = None, + show_progressbar: Optional[bool] = True + ): + super().__init__() + + additional_config = {} if additional_config is None else additional_config + + self.server = parameters.get("server", SERVER) + self.port = int(parameters.get("port", PORT)) + + self.channel = grpc.insecure_channel(f"{self.server}:{self.port}") + self.stub = gpb2.GenerationServiceStub(self.channel) + + self.model_kind = self.stub.ModelInfo(pb2.ModelInfoRequest()).model_kind + + self._parameters = parameters + self._show_progressbar = show_progressbar + self.batch_size = int(additional_config.get("batch_size", DEFAULT_BATCH_SIZE)) + + self.decoding_method = parameters.get("decoding_method", "greedy") + + if self.decoding_method == "greedy": + self.decoding_method = pb2.GREEDY + elif self.decoding_method == "sample": + self.decoding_method = pb2.SAMPLE + else: + raise ValueError(f"{self.decoding_method} is not valid for parameter decoding_method") + + self.sampling_params = pb2.SamplingParameters( + temperature = option(float,parameters.get("temperature")), + top_k = option(int,parameters.get("top_k")), + top_p = option(float,parameters.get("top_p")), + typical_p = option(float,parameters.get("typical_p")), + seed = option(int,parameters.get("seed")) + ) + start_index = option(int,parameters.get("length_penalty.start_index")) + decay_factor = option(float,parameters.get("length_penalty.decay_factor")) + + if (start_index is None) != (decay_factor is None): + raise ValueError(f"length_penalty.{start_index, decay_factor} must both be set or unset") + + length_penalty = pb2.DecodingParameters.LengthPenalty( + start_index = start_index, + decay_factor = decay_factor + ) if start_index is not None else None + + self.decoding_parameters = pb2.DecodingParameters ( + repetition_penalty = option(float,parameters.get("repetition_penalty")), + length_penalty = length_penalty + ) + + def close(self): + self.channel.close() + + def _tokenize(self, inputs: list[str]) -> Iterator[list[str]]: + tokenization_request = self.get_tokenization_request(inputs) + for response in self.stub.Tokenize(tokenization_request).responses: + yield response.tokens + + def _has_stop_token(self, response_tokens: list[str], context_tokens: list[str]) -> bool: + context_length = len(context_tokens) + + # workaround 
difference in tokenization in some models + for i in range(len(context_tokens)): + if response_tokens[i] == '': + response_tokens[i] = context_tokens[i] + + if response_tokens[: context_length - 1] == context_tokens[:-1]: + return response_tokens[-1] != context_tokens[-1] # only last token differs, probably stop sequence () + raise RuntimeError( + f"There is an unexpected difference between tokenizer and model tokens:\n" + f"context_tokens={context_tokens}\n" + f"response_tokens={response_tokens[:context_length]}" + ) + + def _check_model_logprobs_support(self): + + if self.model_kind == pb2.ModelInfoResponse.ENCODER_DECODER: + raise RuntimeError(f"Encoder decoder models don't return logprobs for input tokens and are not supported") + + input_tokens = self.stub.Generate( + self.get_batch_request(["The best ice cream flavor is:"]) + ).responses[0].input_tokens + + if all(token.logprob is None or np.isnan(token.logprob) for token in input_tokens): + raise RuntimeError(f"The model is not supported: does not return logprobs for input tokens") + + + def _get_log_likelihood(self, input_tokens: list[pb2.TokenInfo], context_tokens: list[str]) -> LogLikelihoodResult: + response_tokens: list[str] = [token.text for token in input_tokens] + context_length = len(context_tokens) + + if self._has_stop_token(response_tokens, context_tokens): + context_length -= 1 + + return LogLikelihoodResult( + log_likelihood=sum(token.logprob for token in input_tokens[context_length:]), + is_greedy=all(token.rank == 1 for token in input_tokens[context_length:]), + ) + + @property + def _log_likelihood_parameters(self): + + return pb2.Parameters( + method=self.decoding_method, + sampling=self.sampling_params if self.decoding_method==pb2.SAMPLE else None, + stopping=pb2.StoppingCriteria( + min_new_tokens=1, + max_new_tokens=1, + ), + response=pb2.ResponseOptions ( + generated_tokens=True, + input_tokens=True, + token_logprobs=True, + token_ranks=True, + ), + decoding=self.decoding_parameters, + ) + + def get_batch_request(self, requests, parameters=None): + params = parameters or self._log_likelihood_parameters + return pb2.BatchedGenerationRequest( + model_id="unused", + params=params, + requests=[ + pb2.GenerationRequest(text=request) for request in requests + ], + ) + + def get_tokenization_request(self, requests): + return pb2.BatchedTokenizeRequest( + model_id="unused", + requests = [ + pb2.TokenizeRequest(text=request) for request in requests + ], + return_tokens=True, + ) + + def loglikelihood(self, requests: list[Instance]) -> list[tuple[float, bool]]: + """ + Args: + requests: Each request contains Instance.args : Tuple[str, str] containing: + 1. an input string to the LM and + 2. a target string on which the loglikelihood of the LM producing this target, + conditioned on the input, will be returned. 
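+                   For example, the pair ("The capital of France is", " Paris") asks for the
+                   log-probability of " Paris" given the context, and whether greedy decoding
+                   would have produced it.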
+ Returns: + tuple (loglikelihood, is_greedy) for each request according to the input order: + loglikelihood: probability of generating the target string conditioned on the input + is_greedy: True if and only if the target string would be generated by greedy sampling from the LM + """ + start = time() + #print(f"loglikelihood batch size = {len(requests)}") + self._check_model_logprobs_support() + + results = [] + + pb = tqdm(desc="Running text generation", total=len(requests), disable=not self._show_progressbar) + + for batch in chunks(requests, self.batch_size): + pb.update(len(batch)) + results.extend(self._loglikelihood_batch(batch)) + pb.close() + print(f"Time elapsed running the loglikelihood requests: {time()-start}s") + return results + + def _loglikelihood_batch(self, requests: list[Instance]) -> list[tuple[float, bool]]: + + #print(f"loglikelihood batch size = {len(requests)}") + + requests = [request.args for request in requests] + results: list[LogLikelihoodResult] = [] + + contexts_tokenized = list(self._tokenize([context for context, _ in requests])) + generation_inputs = [context + continuation for context, continuation in requests] + + for result, context_tokens in zip( + self.stub.Generate(self.get_batch_request(generation_inputs)).responses, + contexts_tokenized, + ): + results.append(self._get_log_likelihood(result.input_tokens, context_tokens)) + + return cast(list[tuple[float, bool]], results) + + def loglikelihood_rolling(self, requests: list[Instance]) -> list[tuple[float, bool]]: + """ + Used to evaluate perplexity on a data distribution. + + Args: + requests: Each request contains Instance.args : tuple[str] containing an input string to the model whose + entire loglikelihood, conditioned on purely the EOT token, will be calculated. + Returns: + tuple (loglikelihood,) for each request according to the input order: + loglikelihood: solely the probability of producing each piece of text given no starting input. + """ + start = time() + self._check_model_logprobs_support() + results = [] + for batch in chunks(requests, self.batch_size): + results.extend(self._loglikelihood_rolling_batch(batch)) + print(f"Time elapsed running the loglikelihood_rolling requests: {time()-start}s") + return results + + def _loglikelihood_rolling_batch(self, requests: list[Instance]) -> list[tuple[float, bool]]: + generation_inputs = [request.args[0] for request in requests] + results: list[LogLikelihoodResult] = [] + for result in self.stub.Generate(self.get_batch_request(generation_inputs)).responses: + results.append(self._get_log_likelihood(result.input_tokens, [])) + + return cast(list[tuple[float, bool]], results) + + + def generate_until(self, requests: list[Instance]) -> list[str]: + """ + From official model_guide: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/model_guide.md: + + Each request contains Instance.args : Tuple[str, dict] containing: + 1. an input string to the LM and + 2. a dictionary of keyword arguments used to control generation parameters. + Using this input and these generation parameters, text will be sampled from the language model + + ( + typically until a maximum output length or specific stopping string sequences--for example, + {"until": ["\n\n", "."], "max_gen_toks": 128} + ). + The generated input+output text from the model will then be returned. + """ + start = time() + # group requests by their args (e.g. temperature, do_sample, etc.) 
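+        # (illustration: requests whose generation kwargs serialize to the same JSON string,
+        # e.g. {"until": ["\n\n"], "max_gen_toks": 128}, end up in the same group and share one
+        # pb2.Parameters message; each group is then sent to the server in batches below)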
+ grouper = Grouper(requests, lambda request: json.dumps(request.args[1], sort_keys=True)) + results: dict[str, list[str]] = defaultdict(list) + + pb = tqdm(desc="Running text generation", total=len(requests), disable=not self._show_progressbar) + + for key, requests_group in grouper.get_grouped().items(): + generation_parameters: dict[str, Any] = requests_group[0].args[1] + inputs = [request.args[0] for request in requests_group] + + # Process parameters + do_sample = generation_parameters.pop("do_sample", False) + decoding_method = pb2.DecodingMethod.SAMPLE if do_sample else pb2.DecodingMethod.GREEDY + until = generation_parameters.pop("until") + stop_sequences = [until] if isinstance(until, str) else until + max_new_tokens = generation_parameters.pop("max_gen_toks", DEFAULT_MAX_NEW_TOKENS) + temperature = generation_parameters.pop("temperature", 0) + + sampling_params = self.sampling_params + sampling_params.temperature = temperature + + + parameters = pb2.Parameters( + method=decoding_method, + sampling = sampling_params, + stopping=pb2.StoppingCriteria( + min_new_tokens=1, + max_new_tokens=max_new_tokens, + stop_sequences=stop_sequences, + ), + response=pb2.ResponseOptions ( + generated_tokens=True, + input_tokens=True, + token_logprobs=True, + token_ranks=True, + ), + decoding=self.decoding_parameters, + ) + + for batch in chunks(inputs, self.batch_size): + for result in self.stub.Generate(self.get_batch_request(batch, parameters)).responses: + results[key].append(result.text) + pb.update(len(batch)) + + pb.close() + print(f"Time elapsed running the generate_until requests: {time()-start}s") + return grouper.get_original(results)