From 18449a280771468a94e231a15e5ce00dd3ba30e8 Mon Sep 17 00:00:00 2001 From: Jeremy Lewi Date: Fri, 1 May 2020 23:10:26 -0700 Subject: [PATCH] Use the AutoML Model and the universal model to generate predictions. (#134) Create an AutoML Model class to generate predictions using AutoML. * Create a new class/module automl_model to generate predictions using AutoML models * Change the function signature of predict_labels to include org and repo because we want org and repo to be features because they can be highly informative. * Also change predict_issue_labels to take in a list of strings for the body text because in follow-on PRs we will start taking into account additional comments and not just the first one. * Define github_util.build_issue_doc to construct a text document out of the various features. Testing * Dev instance successfully used AutoML model. https://github.com/kubeflow/code-intelligence/issues/131#issuecomment-622616399 * Check in hydrated configs for prod. * prod has also been updated and looks to be using the new model correctly. Related issues: * Hopefully this model is an improvement. Miscellaneous changes Add logging and monitoring instructions. Update automl notebook to use the new code to build an issue. 
--- .gitignore | 2 + ...s_v1beta1_deployment_label-bot-worker.yaml | 59 ++++++++ .../prod/~g_v1_service_label-bot-worker.yaml | 20 +++ Label_Microservice/Makefile | 10 ++ .../overlays/dev/kustomization.yaml | 4 +- .../overlays/prod/kustomization.yaml | 2 +- .../deployment/requirements.worker.txt | 1 + Label_Microservice/developer_guide.md | 12 +- .../docs/logging_and_monitoring.md | 19 +++ Label_Microservice/notebooks/automl.ipynb | 134 +++++++++++++++++- py/code_intelligence/github_util.py | 25 +++- py/code_intelligence/github_util_test.py | 29 ++++ py/label_microservice/automl_model.py | 96 +++++++++++++ py/label_microservice/automl_model_test.py | 54 +++++++ py/label_microservice/cli.py | 21 +++ py/label_microservice/combined_model.py | 5 +- .../issue_label_predictor.py | 60 +++++--- py/label_microservice/models.py | 12 +- .../universal_kind_label_model.py | 14 +- py/label_microservice/worker.py | 2 - 20 files changed, 544 insertions(+), 37 deletions(-) create mode 100644 Label_Microservice/.build/prod/extensions_v1beta1_deployment_label-bot-worker.yaml create mode 100644 Label_Microservice/.build/prod/~g_v1_service_label-bot-worker.yaml create mode 100644 Label_Microservice/Makefile create mode 100644 Label_Microservice/docs/logging_and_monitoring.md create mode 100644 py/code_intelligence/github_util_test.py create mode 100644 py/label_microservice/automl_model.py create mode 100644 py/label_microservice/automl_model_test.py diff --git a/.gitignore b/.gitignore index c003aab28f..99a8f27dbc 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,8 @@ !.gitignore !.dockerignore **/flask_session +**/.cache +**/.data build/** fairing/__pycache__/** **/__pycache__/** diff --git a/Label_Microservice/.build/prod/extensions_v1beta1_deployment_label-bot-worker.yaml b/Label_Microservice/.build/prod/extensions_v1beta1_deployment_label-bot-worker.yaml new file mode 100644 index 0000000000..3fdebf672d --- /dev/null +++ 
b/Label_Microservice/.build/prod/extensions_v1beta1_deployment_label-bot-worker.yaml @@ -0,0 +1,59 @@ +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + labels: + app: label-bot + environment: prod + service: label-bot + name: label-bot-worker + namespace: label-bot-prod +spec: + replicas: 5 + selector: + matchLabels: + app: label-bot + environment: prod + service: label-bot + template: + metadata: + labels: + app: label-bot + environment: prod + service: label-bot + spec: + containers: + - command: + - python3 + - -m + - label_microservice.worker + - subscribe_from_env + env: + - name: PORT + value: "80" + - name: ISSUE_EMBEDDING_SERVICE + value: http://issue-embedding-server + - name: PROJECT + value: issue-label-bot-dev + - name: ISSUE_EVENT_TOPIC + value: event_queue + - name: ISSUE_EVENT_SUBSCRIPTION + value: label_bot_prod + - name: GITHUB_APP_ID + value: "27079" + - name: GITHUB_APP_PEM_KEY + value: /var/secrets/github/issue-label-bot-github-app.private-key.pem + image: gcr.io/issue-label-bot-dev/bot-worker:011a589 + name: app + resources: + requests: + cpu: "4" + memory: 4Gi + volumeMounts: + - mountPath: /var/secrets/github + name: github-app + restartPolicy: Always + serviceAccountName: default-editor + volumes: + - name: github-app + secret: + secretName: github-app diff --git a/Label_Microservice/.build/prod/~g_v1_service_label-bot-worker.yaml b/Label_Microservice/.build/prod/~g_v1_service_label-bot-worker.yaml new file mode 100644 index 0000000000..ff3a0c1419 --- /dev/null +++ b/Label_Microservice/.build/prod/~g_v1_service_label-bot-worker.yaml @@ -0,0 +1,20 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + app: label-bot + environment: prod + service: label-bot + name: label-bot-worker + namespace: label-bot-prod +spec: + ports: + - name: http + port: 80 + protocol: TCP + targetPort: 80 + selector: + app: label-bot + environment: prod + service: label-bot + type: ClusterIP diff --git a/Label_Microservice/Makefile 
b/Label_Microservice/Makefile new file mode 100644 index 0000000000..613c028d07 --- /dev/null +++ b/Label_Microservice/Makefile @@ -0,0 +1,10 @@ + +CONTEXT=issue-label-bot + +hydrate-prod: + rm -rf .build/prod + mkdir -p .build/prod + kustomize build -o .build/prod deployment/overlays/prod + +apply-prod: hydrate-prod + kubectl --context=$(CONTEXT) apply -f .build/prod \ No newline at end of file diff --git a/Label_Microservice/deployment/overlays/dev/kustomization.yaml b/Label_Microservice/deployment/overlays/dev/kustomization.yaml index 418ab163ba..b1910aa697 100644 --- a/Label_Microservice/deployment/overlays/dev/kustomization.yaml +++ b/Label_Microservice/deployment/overlays/dev/kustomization.yaml @@ -3,9 +3,9 @@ kind: Kustomization bases: - ../../base images: -- digest: sha256:cb2b2e604d4056b78ecd51d7113de04ebfa60e542310265b3871e7873417e34a +- #digest: sha256:cb2b2e604d4056b78ecd51d7113de04ebfa60e542310265b3871e7873417e34a name: gcr.io/issue-label-bot-dev/bot-worker - newName: gcr.io/issue-label-bot-dev/bot-worker:3a82547 + #newName: gcr.io/issue-label-bot-dev/bot-worker:3a82547 commonLabels: environment: dev namespace: label-bot-dev diff --git a/Label_Microservice/deployment/overlays/prod/kustomization.yaml b/Label_Microservice/deployment/overlays/prod/kustomization.yaml index 19b13d3055..5c0f2ce563 100644 --- a/Label_Microservice/deployment/overlays/prod/kustomization.yaml +++ b/Label_Microservice/deployment/overlays/prod/kustomization.yaml @@ -10,4 +10,4 @@ resources: images: - name: gcr.io/issue-label-bot-dev/bot-worker newName: gcr.io/issue-label-bot-dev/bot-worker - newTag: 79cd85a-dirty + newTag: 011a589 diff --git a/Label_Microservice/deployment/requirements.worker.txt b/Label_Microservice/deployment/requirements.worker.txt index 87a850067a..1ea4a012f5 100644 --- a/Label_Microservice/deployment/requirements.worker.txt +++ b/Label_Microservice/deployment/requirements.worker.txt @@ -11,6 +11,7 @@ google-api-core==1.14.2 google-api-python-client==1.7.10 
google-auth==1.6.3 google-auth-httplib2==0.0.3 +google-cloud-automl==0.10.0 #google-cloud-bigquery==1.17.0 google-cloud-core==1.0.3 google-cloud-pubsub==0.45.0 diff --git a/Label_Microservice/developer_guide.md b/Label_Microservice/developer_guide.md index 82ead87088..e8b59177fa 100644 --- a/Label_Microservice/developer_guide.md +++ b/Label_Microservice/developer_guide.md @@ -68,19 +68,29 @@ Setup a namespace for your development 1. Send a prediction request using pubsub ``` - python -m label_microservice.py --issue=kubeflow/kubeflow#4602 + python -m label_microservice.cli label-issue --issue=kubeflow/kubeflow#4602 --topic=projects/issue-label-bot-dev/topics/TEST_event_queue ``` * Look at the logs of the pod to see the prediction * Ensure that you don't have other pods using the same pubsub subscription; otherwise your item might not get handled by the pod you are looking at +1. Get pod logs + + ``` + python -m label_microservice.cli pod-logs --pod= + ``` + + * This will pretty print the json logs which is easier to read. + 1. Ensure your kubeconfig context sets the namespace to the namespace skaffold is deploying in; otherwise file sync and log streaming doesn't seem to work. 
## Unresolved Issues * skaffold continuous mode (`skaffold dev` ) doesn't appear to detect changes in the python files and retrigger the build and deployment +* skaffold doesn't appear to substitute the newly built image into the kustomize package + ### Kaniko Image Caching diff --git a/Label_Microservice/docs/logging_and_monitoring.md b/Label_Microservice/docs/logging_and_monitoring.md new file mode 100644 index 0000000000..c7e0003031 --- /dev/null +++ b/Label_Microservice/docs/logging_and_monitoring.md @@ -0,0 +1,19 @@ +# Logging and Monitoring + + +## Stackdriver logs + +* Label bot workers use structured json logs +* You can search the logs in Stackdriver; some examples are below +* There is also a BigQuery sink for the stackdriver logs to facilitate analysis and querying + + +Use a filter like the following to see messages for +a specific issue + +``` +jsonPayload.repo_owner = "kubeflow" +jsonPayload.repo_name = "code-intelligence" +jsonPayload.issue_num = "132" +resource.labels.namespace_name = "label-bot-prod" +``` \ No newline at end of file diff --git a/Label_Microservice/notebooks/automl.ipynb b/Label_Microservice/notebooks/automl.ipynb index 7e720cbba9..61d5f7c8b8 100644 --- a/Label_Microservice/notebooks/automl.ipynb +++ b/Label_Microservice/notebooks/automl.ipynb @@ -8351,7 +8351,8 @@ " blob = bucket.blob(obj_path)\n", " \n", " # Include the owner and repo in the text body because it is predictive\n", - " blob.upload_from_string(issue[\"title\"] + \"\\n\" + owner_repo + \"\\n\" + issue[\"body\"])\n", + " doc = github_util.build_issue_doc(owner, repo, issue[\"title\"], [issue[\"body\"]])\n", + " blob.upload_from_string(doc)\n", " logging.info(f\"Created {target}\")\n", "\n", " info.iloc[i][\"url\"] = target \n", @@ -8674,6 +8675,26 @@ "model_name = result.name" ] }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'projects/976279526634/locations/us-central1/models/TCN654213816573231104'" 
+ ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model_name" + ] + }, { "cell_type": "code", "execution_count": 39, @@ -8790,6 +8811,117 @@ " )\n", " )" ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "google.protobuf.pyext._message.RepeatedCompositeContainer" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "response.payload.__class__" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "automl.types" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [], + "source": [ + "from google.cloud.automl import types as automl_types" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [], + "source": [ + "predict_response = automl_types.PredictResponse()" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [], + "source": [ + "predict_response.payload.append(annotation)" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[classification {\n", + " score: 0.8999999761581421\n", + "}\n", + "display_name: \"area-jupyter\"\n", + "]" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "predict_response.payload" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "google.cloud.automl_v1.types.AnnotationPayload" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "annotation_payload.__class__" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [], + "source": [ + "annotation = 
automl_types.AnnotationPayload()\n", + "annotation.display_name = \"area-jupyter\"\n", + "annotation.classification.score = .9" + ] } ], "metadata": { diff --git a/py/code_intelligence/github_util.py b/py/code_intelligence/github_util.py index 7511c214ca..2774ccbbe2 100644 --- a/py/code_intelligence/github_util.py +++ b/py/code_intelligence/github_util.py @@ -1,6 +1,7 @@ import os import logging from code_intelligence import github_app +import typing import yaml def get_issue_handle(installation_id, username, repository, number): @@ -28,7 +29,29 @@ def get_yaml(owner, repo, ghapp=None): # get the repo handle, which allows you got get the file contents repo = inst.repository(owner=owner, repository=repo) results = repo.file_contents('.github/issue_label_bot.yaml').decoded - except: + # TODO(jlewi): We should probably catching more narrow exceptions and + # not swallowing all exceptions. The exceptions we should swallow are + # the ones related to the configuration file not existing. + except Exception as e: + logging.info(f"Exception occured getting .github/issue_label_bot.yaml: {e}") return None return yaml.safe_load(results) + +def build_issue_doc(org:str, repo:str, title:str, text:typing.List[str]): + """Build a document string out of various github features. + + Args: + org: The organization the issue belongs in + repo: The repository. + title: Issue title + text: List of contents of the comments on the issue + + Returns: + content: The document to classify + """ + pieces = [title] + pieces.append(f"{org.lower()}_{repo.lower()}") + pieces.extend(text) + content = "\n".join(pieces) + return content diff --git a/py/code_intelligence/github_util_test.py b/py/code_intelligence/github_util_test.py new file mode 100644 index 0000000000..3f15d7d826 --- /dev/null +++ b/py/code_intelligence/github_util_test.py @@ -0,0 +1,29 @@ +"""Unittest for github_util. 
""" +import logging +import pytest + +from code_intelligence import github_util + +def test_build_issue_doc(): + result = github_util.build_issue_doc("someOrg", "someRepo", "issue title", + ["line1", "line2"]) + + expected = """issue title +someorg_somerepo +line1 +line2""" + assert result == expected + +if __name__ == "__main__": + logging.basicConfig( + level=logging.INFO, + format=('%(levelname)s|%(asctime)s' + '|%(pathname)s|%(lineno)d| %(message)s'), + datefmt='%Y-%m-%dT%H:%M:%S', + ) + logging.getLogger().setLevel(logging.INFO) + + pytest.main() + + + diff --git a/py/label_microservice/automl_model.py b/py/label_microservice/automl_model.py new file mode 100644 index 0000000000..551486acce --- /dev/null +++ b/py/label_microservice/automl_model.py @@ -0,0 +1,96 @@ +"""Use a model trained on automl. +""" +import hashlib +import logging +import numpy as np +import os +import requests +import retrying +import typing +import yaml + +from code_intelligence import github_util +from google.cloud import automl +from label_microservice import models + +# TODO(jlewi): Do we want to hardcode this? +CONFIDENCE_THRESHOLD = .5 + +class AutoMLModel(models.IssueLabelModel): + """Use a model training with GCP AutoML.""" + + def __init__(self, model_name = None, prediction_client=None): + self.config = None + # The model name + # e.g. a value like value like + # "projects/976279526634/locations/us-central1/models/TCN654213816573231104'" + self.model_name = model_name + + if not prediction_client: + prediction_client = automl.PredictionServiceClient() + + self._prediction_client = prediction_client + + def predict_issue_labels(self, org:str, repo:str, title:str, + text:typing.List[str], context=None): + """Return a dictionary of label probabilities. + + Args: + org: The organization the issue belongs in + repo: The repository. + title: Issue title + text: List of contents of the comments on the issue + + context: (Optional) dictionary of information like the issue. 
Used + for logging + Return + ------ + dict: Dictionary of label to probability of that label for the + the issue str -> float + """ + if not context: + context = {} + + content = github_util.build_issue_doc(org, repo, title, text) + text_snippet = automl.types.TextSnippet(content=content) + payload = automl.types.ExamplePayload(text_snippet=text_snippet) + + # TODO(jlewi): Retry longer? Distinguish permanent vs. retryable errors + #@retrying.retry(stop_max_delay=60*1000) + def _predict(): + response = self._prediction_client.predict(self.model_name, payload) + return response + + response = _predict() + + predictions = {} + + for annotation_payload in response.payload: + # TODO(jlewi): Can we do this in a more principled way? + # AutoML doesn't allow "/" in the label names so during training + # we convert them from "/" to "-". So here we need to convert them + # back to "/" + # Only replace the first occurence of the "-". In principle I think + # we might have sub areas as well and we should fix this. + label = annotation_payload.display_name.replace("-", "/", 1) + predictions[label] = annotation_payload.classification.score + + # TODO(https://github.com/kubeflow/code-intelligence/issues/79): + # We should use some sort of context to pass along information + # about the issue so we can log what issue these predictions pertain + # to. 
+ extra = {} + extra.update(context) + extra["predictions"] = predictions + logging.info(f"Unfiltered predictions: {predictions}", extra=extra) + + labels_to_remove = [] + for label, probability in predictions.items(): + if probability < CONFIDENCE_THRESHOLD: + labels_to_remove.append(label) + + for l in labels_to_remove: + del predictions[l] + logging.info(f"Labels below precision and recall {labels_to_remove}", + extra=context) + return predictions diff --git a/py/label_microservice/automl_model_test.py b/py/label_microservice/automl_model_test.py new file mode 100644 index 0000000000..38a4e511bb --- /dev/null +++ b/py/label_microservice/automl_model_test.py @@ -0,0 +1,54 @@ +"""Unittest for repo_specific_model. """ +import logging +from unittest import mock +import pytest + +from google.cloud.automl import types as automl_types +from label_microservice import automl_model +from label_microservice import test_util + +def test_predict_labels(): + """A unittest for predict labels. + + This function mocks out AutoML. 
+ """ + mock_client = mock.MagicMock() + payload = [] + + predict_response = automl_types.PredictResponse() + + annotation = automl_types.AnnotationPayload() + annotation.display_name = "area-jupyter" + annotation.classification.score = 1 + predict_response.payload.append(annotation) + + annotation = automl_types.AnnotationPayload() + annotation.display_name = "area-operator" + annotation.classification.score = .4 + predict_response.payload.append(annotation) + + mock_client.predict.return_value = predict_response + + model = automl_model.AutoMLModel(model_name="some/model", prediction_client=mock_client) + + results = model.predict_issue_labels("kubeflow", "docs", "some title", + ["some text"]) + + expected = { + # Use an integer and not float to avoid numeric issues in evalueation + "area/jupyter": 1, + } + test_util.assert_dict_equal(expected, results) + +if __name__ == "__main__": + logging.basicConfig( + level=logging.INFO, + format=('%(levelname)s|%(asctime)s' + '|%(pathname)s|%(lineno)d| %(message)s'), + datefmt='%Y-%m-%dT%H:%M:%S', + ) + logging.getLogger().setLevel(logging.INFO) + + pytest.main() + + diff --git a/py/label_microservice/cli.py b/py/label_microservice/cli.py index fcd8c4804d..0343458ed5 100644 --- a/py/label_microservice/cli.py +++ b/py/label_microservice/cli.py @@ -4,9 +4,11 @@ to be picked up by the backends. 
""" import logging +import json import fire from code_intelligence import util from google.cloud import pubsub +import subprocess DEFAULT_TOPIC = "projects/issue-label-bot-dev/topics/TEST_event_queue" class Cli: @@ -37,6 +39,25 @@ def label_issue(issue, pubsub_topic=DEFAULT_TOPIC): repo_name=repo_name, issue_num=str(issue_num)) + @staticmethod + def pod_logs(pod): + """Pretty print pod logs + + Args: + pod: Name of the pod + """ + output = subprocess.check_output(["kubectl", "logs", pod]) + + for l in output.splitlines(): + try: + entry = json.loads(l) + filename = entry.get("filename") + line = entry.get("line") + message = entry.get("message") + print(f"{filename}:{line}: {message}") + except json.JSONDecodeError: + print(l) + continue if __name__ == "__main__": logging.basicConfig(level=logging.INFO, format=('%(levelname)s|%(asctime)s' diff --git a/py/label_microservice/combined_model.py b/py/label_microservice/combined_model.py index 4811a11f3b..42baa6e88b 100644 --- a/py/label_microservice/combined_model.py +++ b/py/label_microservice/combined_model.py @@ -12,7 +12,8 @@ def __init__(self, models=None): # A list of models to generate predictions self._models = models - def predict_issue_labels(self, title:str , text:str, context=None): + def predict_issue_labels(self, org:str, repo:str, + title:str , text:str, context=None): """Return a dictionary of label probabilities. 
Args: @@ -31,7 +32,7 @@ def predict_issue_labels(self, title:str , text:str, context=None): for i, m in enumerate(self._models): logging.info(f"Generating predictions with model {i}") - latest = m.predict_issue_labels(title, text, context=context) + latest = m.predict_issue_labels(org, repo, title, text, context=context) predictions = self._combine_predictions(predictions, latest) diff --git a/py/label_microservice/issue_label_predictor.py b/py/label_microservice/issue_label_predictor.py index 0f22f3ed4e..7a785f3173 100644 --- a/py/label_microservice/issue_label_predictor.py +++ b/py/label_microservice/issue_label_predictor.py @@ -2,15 +2,31 @@ import os from code_intelligence import embeddings +from label_microservice import automl_model from label_microservice import combined_model from label_microservice import repo_specific_model from label_microservice import universal_kind_label_model as universal_model UNIVERSAL_MODEL_NAME = "universal" -def _combined_model_name(org, repo): - """Return the name of the combined model for a repo""" - return f"{org}/{repo}_combined" +# TODO(jlewi): Lets not hardcode this. +KUBEFLOW_AUTOML_MODEL = "projects/976279526634/locations/us-central1/models/TCN654213816573231104" + +def _combined_model_name(org, repo=None): + """Return the name of the combined model for a repo or organization. + + If repo is specified looks for a repo specific model. If repo is + none we return an org wide model. + + Args: + org: Name of the org. + repo: (Optional) The name of the repo + """ + + if repo: + return f"{org}/{repo}_combined" + else: + return f"{org}_combined" def _dict_has_keys(d, keys): for k in keys: @@ -30,36 +46,38 @@ class IssueLabelPredictor: def __init__(self): + # A dictionary mapping keys to individual models. 
self._models = {} self._load_models() def _load_models(self): + """Load the models.""" logging.info("Loading the universal model") self._models[UNIVERSAL_MODEL_NAME] = universal_model.UniversalKindLabelModel() # TODO(jlewi): How should we get a list of all models for which we - # have repo specific models. mlbot is doing this based on a config + # have repo or org specific models. mlbot is doing this based on a config # file; https://github.com/machine-learning-apps/Issue-Label-Bot/blob/26d8fb65be3b39de244c4be9e32b2838111dac10/flask_app/forward_utils.py#L5 - for org_and_repo in [("kubeflow", "kubeflow")]: - org = org_and_repo[0] - repo = org_and_repo[1] - logging.info(f"Loading model for repo {org}/{repo}") + for org in ["kubeflow"]: + logging.info(f"Loading AutoML model for org: {org}; model: {KUBEFLOW_AUTOML_MODEL}") - repo_model = repo_specific_model.RepoSpecificLabelModel.from_repo( - org, repo, - embedding_api_endpoint=os.environ.get("ISSUE_EMBEDDING_SERVICE")) + org_model = automl_model.AutoMLModel(model_name=KUBEFLOW_AUTOML_MODEL) - self._models[f"{org}/{repo}"] = repo_model + self._models[f"{org}"] = org_model combined = combined_model.CombinedLabelModels( - models=[self._models["universal"], repo_model]) - self._models[_combined_model_name(org, repo)] = combined + models=[self._models["universal"], org_model]) + self._models[_combined_model_name(org)] = combined + - def predict_labels_for_data(self, model_name, title, body, context=None): + def predict_labels_for_data(self, model_name, org, repo, title, body, + context=None): """Generate label predictions for the specified data. 
Args: model_name: Which model to use + org: org + repo: Repo name title: Title for the issue body: body of the issue @@ -70,8 +88,10 @@ def predict_labels_for_data(self, model_name, title, body, context=None): raise ValueError(f"No model named {model_name}") model = self._models[model_name] - logging.info(f"Generating predictions for title={title} text={body}") - predictions = model.predict_issue_labels(title, body, context=context) + logging.info(f"Generating predictions for title={title} text={body} using" + f"model: {model_name} class:{model.__class__}") + predictions = model.predict_issue_labels(org, repo, title, body, + context=context) return predictions @@ -91,10 +111,13 @@ def predict_labels_for_issue(self, org, repo, issue_number, model_name=None): dict: str -> float; dictionary mapping labels to their probability """ if not model_name: + org_model = _combined_model_name(org) repo_model = _combined_model_name(org, repo) if repo_model in self._models: model_name = repo_model + elif org_model in self._models: + model_name = org_model else: model_name = UNIVERSAL_MODEL_NAME @@ -117,7 +140,8 @@ def predict_labels_for_issue(self, org, repo, issue_number, model_name=None): } predictions = self.predict_labels_for_data( - model_name, data.get("title"), data.get("body"), context=context) + model_name, org, repo, data.get("title"), [data.get("body")], + context=context) return predictions diff --git a/py/label_microservice/models.py b/py/label_microservice/models.py index 238894c167..8b550dd620 100644 --- a/py/label_microservice/models.py +++ b/py/label_microservice/models.py @@ -1,5 +1,6 @@ """The models packages defines wrappers around different models.""" import abc +import typing class IssueLabelModel: """A base class for all Issue label models. 
@@ -8,13 +9,18 @@ class IssueLabelModel: """ @abc.abstractmethod - def predict_issue_labels(self, title:str , text:str, context=None): + def predict_issue_labels(self, org:str, repo:str, title:str, + text:typing.List[str], context=None): """Return a dictionary of label probabilities. Args: - title: The title for the issue - text: The text for the issue + org: The organization the issue belongs in + repo: The repository. + title: Issue title + text: List of contents of the comments on the issue + context: (Optional) Dictionary of additional context information + Return ------ dict: Dictionary of label to probability of that label for the diff --git a/py/label_microservice/universal_kind_label_model.py b/py/label_microservice/universal_kind_label_model.py index 46c5d90ac6..0ee5315bcf 100644 --- a/py/label_microservice/universal_kind_label_model.py +++ b/py/label_microservice/universal_kind_label_model.py @@ -9,6 +9,7 @@ from urllib.request import urlopen from label_microservice import models +import typing class UniversalKindLabelModel(models.IssueLabelModel): """UniversalKindLabelModel is a universal model that is trained across all repos. @@ -49,16 +50,17 @@ def __init__(self, class_names=['bug', 'feature', 'question']): self._prediction_threshold = defaultdict(lambda: .52) self._prediction_threshold["question"] = .60 - def predict_issue_labels(self, title:str, body:str, context=None): + def predict_issue_labels(self, org:str, repo:str, title:str, + text:typing.List[str], context=None): """ Get probabilities for the each class. Parameters ---------- - title: str - the issue title - body: str - the issue body + org: The organization the issue belongs in. Ignored by model. + repo: The repository. 
Ignored by model + title: Issue title + text: List of contents of the comments on the issue Returns ------ @@ -75,7 +77,7 @@ def predict_issue_labels(self, title:str, body:str, context=None): if not context: context = {} #transform raw text into array of ints - vec_body = self.body_pp.transform([body]) + vec_body = self.body_pp.transform(["\n".join(text)]) vec_title = self.title_pp.transform([title]) # make predictions with the model diff --git a/py/label_microservice/worker.py b/py/label_microservice/worker.py index 7b566c9219..f3d1ae962b 100644 --- a/py/label_microservice/worker.py +++ b/py/label_microservice/worker.py @@ -20,8 +20,6 @@ DEFAULT_APP_URL = "https://github.com/marketplace/issue-label-bot" -DEFAULT_APP_URL = "https://github.com/marketplace/issue-label-bot" - class Worker: """ The worker class aims to do label prediction for issues from github repos.