jlewi
diff --git a/‎Label_Microservice/.build/prod/extensions_v1beta1_deployment_label-bot-worker.yaml
Lines changed: 1 addition & 1 deletion b/‎Label_Microservice/.build/prod/extensions_v1beta1_deployment_label-bot-worker.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/‎Label_Microservice/README.md
Lines changed: 8 additions & 4 deletions b/‎Label_Microservice/README.md
Lines changed: 8 additions & 4 deletions
diff --git a/‎Label_Microservice/deployment/overlays/prod/kustomization.yaml
Lines changed: 1 addition & 1 deletion b/‎Label_Microservice/deployment/overlays/prod/kustomization.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/‎py/code_intelligence/embeddings.py
Lines changed: 3 additions & 56 deletions b/‎py/code_intelligence/embeddings.py
Lines changed: 3 additions & 56 deletions
diff --git a/‎py/code_intelligence/github_util.py
Lines changed: 201 additions & 46 deletions b/‎py/code_intelligence/github_util.py
Lines changed: 201 additions & 46 deletions
@@ -42,7 +42,7 @@ spec:
           value: "27079"
         - name: GITHUB_APP_PEM_KEY
           value: /var/secrets/github/issue-label-bot-github-app.private-key.pem
-        image: gcr.io/issue-label-bot-dev/bot-worker:011a589
+        image: gcr.io/issue-label-bot-dev/bot-worker:6848ad6
         name: app
         resources:
           requests:
 
@@ -62,7 +62,7 @@ The following describes the GCP projects and clusters where the two services are
     - **repository**: [machine-learning-apps/Issue-Label-Bot](https://github.com/machine-learning-apps/Issue-Label-Bot)
     - **GCP project**: github-probots
     - **cluster**: kf-ci-ml
-    - **namespace**: mlapp
+    - **namespace**: label-bot-prod
     - **yaml files**: [deployment](https://github.com/machine-learning-apps/Issue-Label-Bot/tree/master/deployment)
 
 1. Repo-specific label microservice
@@ -76,9 +76,9 @@ The following describes the GCP projects and clusters where the two services are
 
 1. The flask app
     - **repository**: [machine-learning-apps/Issue-Label-Bot](https://github.com/machine-learning-apps/Issue-Label-Bot)
-    - **GCP project**: issue-label-bot-dev
-    - **cluster**: github-mlapp-test
-    - **namespace**: mlapp
+    - **GCP project**: github-probots
+    - **cluster**: kf-ci-ml
+    - **namespace**: label-bot-dev
     - **yaml files**: [deployment](https://github.com/machine-learning-apps/Issue-Label-Bot/tree/master/deployment)
 
 1. Repo-specific label microservice
@@ -88,6 +88,10 @@ The following describes the GCP projects and clusters where the two services are
     - **namespace**: default
     - **yaml files**: [Label\_Microservice/deployment](https://github.com/kubeflow/code-intelligence/tree/master/Label_Microservice/deployment)
 
+1. GitHub bot - **kf-label-bot-dev**
+
+     - see [kubeflow/code-intelligence#84](https://github.com/kubeflow/code-intelligence/issues/84) for information on the setup
+     - see [machine-learning-apps/Issue-Label-Bot#57](https://github.com/machine-learning-apps/Issue-Label-Bot/issues/57)
 
 ## Instructions
 
 
@@ -10,4 +10,4 @@ resources:
 images:
 - name: gcr.io/issue-label-bot-dev/bot-worker
   newName: gcr.io/issue-label-bot-dev/bot-worker
-  newTag: 011a589
+  newTag: 6848ad6
@@ -42,6 +42,7 @@ def get_issue_text(num, idx, owner, repo, skip_issue=True):
     dict
         {'title':str, 'body':str}
     """
+    logging.warning("get_issue_text is deprecated; use github_util.get_issue")
     url = f'https://github.com/{owner}/{repo}/issues/{num}'
     status_code = requests.head(url).status_code
     if status_code != 200:
@@ -73,60 +74,6 @@ def get_issue_text(num, idx, owner, repo, skip_issue=True):
             'labels': labels,
             'num': num}
 
-# TODO(https://github.com/kubeflow/code-intelligence/issues/126): This function should replace
-# get_issue_text
-def get_issue(url, gh_client):
-  """Fetch the issue data using GraphQL
-  
-  Args:
-    url: Url of the GitHub isue to fetch
-    gh_client: GitHub GraphQl client.
-    
-  Returns
-    ------
-    dict
-        {'title':str, 'body':str}
-  """
-  issue_query = """query getIssue($url: URI!) {
-  resource(url: $url) {
-    __typename
-    ... on Issue {
-      author {
-        __typename
-        ... on User {
-          login
-        }
-        ... on Bot {
-          login
-        }
-      }
-      id
-      title
-      body
-      url
-      state
-      labels(first: 30) {
-        totalCount
-        edges {
-          node {
-            name
-          }
-        }
-      }
-    }
-  }
-}"""
-
-  variables = {
-          "url": url,
-  }
-  issue_results = gh_client.run_query(issue_query, variables)
-  
-  if "errors" in issue_results:
-    logging.error(f"There was a problem running the github query; {issue_results['errors']}")
-    raise ValueError(f"There was a problem running the github query: {issue_results['errors']}")
-  return issue_results["data"]["resource"]
-  
 def get_all_issue_text(owner, repo, inf_wrapper, workers=64):
     """
     Prepare embedding features of all issues in a given repository.
@@ -191,9 +138,9 @@ def load_model_artifact(model_url, local_dir=None):
     if not local_dir:
       home = str(Path.home())
       local_dir = os.path.join(home, "model_files")
-      
+
     full_path = os.path.join(local_dir, 'model.pkl')
-    
+
     if not full_path.exists():
         logging.info('Loading model.')
         path.mkdir(exist_ok=True)
 
@@ -1,57 +1,212 @@
+import fire
 import os
 import logging
 from code_intelligence import github_app
 import typing
 import yaml
 
 def get_issue_handle(installation_id, username, repository, number):
-    "get an issue object."
-    ghapp = github_app.GitHubApp.create_from_env()
-    install = ghapp.get_installation(installation_id)
-    return install.issue(username, repository, number)
+  "get an issue object."
+  ghapp = github_app.GitHubApp.create_from_env()
+  install = ghapp.get_installation(installation_id)
+  return install.issue(username, repository, number)
 
 def get_yaml(owner, repo, ghapp=None):
-    """
-    Looks for the yaml file in a /.github directory.
-
-    yaml file must be named issue_label_bot.yaml
-    """
-
-    if not ghapp:
-        # TODO(jlewi): Should we deprecate this code path and always pass
-        # in the github app?
-        ghapp = github_app.GitHubApp.create_from_env()
-
-    try:
-        # get the app installation handle
-        inst_id = ghapp.get_installation_id(owner=owner, repo=repo)
-        inst = ghapp.get_installation(installation_id=inst_id)
-        # get the repo handle, which allows you got get the file contents
-        repo = inst.repository(owner=owner, repository=repo)
-        results = repo.file_contents('.github/issue_label_bot.yaml').decoded
-    # TODO(jlewi): We should probably catching more narrow exceptions and
-    # not swallowing all exceptions. The exceptions we should swallow are
-    # the ones related to the configuration file not existing.
-    except Exception as e:
-        logging.info(f"Exception occured getting .github/issue_label_bot.yaml: {e}")
-        return None
-
-    return yaml.safe_load(results)
+  """
+  Looks for the yaml file in a /.github directory.
+
+  yaml file must be named issue_label_bot.yaml
+  """
+
+  if not ghapp:
+    # TODO(jlewi): Should we deprecate this code path and always pass
+    # in the github app?
+    ghapp = github_app.GitHubApp.create_from_env()
+
+  try:
+    # get the app installation handle
+    inst_id = ghapp.get_installation_id(owner=owner, repo=repo)
+    inst = ghapp.get_installation(installation_id=inst_id)
+    # get the repo handle, which allows you got get the file contents
+    repo = inst.repository(owner=owner, repository=repo)
+    results = repo.file_contents('.github/issue_label_bot.yaml').decoded
+  # TODO(jlewi): We should probably catching more narrow exceptions and
+  # not swallowing all exceptions. The exceptions we should swallow are
+  # the ones related to the configuration file not existing.
+  except Exception as e:
+    logging.info(f"Exception occured getting .github/issue_label_bot.yaml: {e}")
+    return None
+
+  return yaml.safe_load(results)
 
 def build_issue_doc(org:str, repo:str, title:str, text:typing.List[str]):
-    """Build a document string out of various github features.
-
-    Args:
-     org: The organization the issue belongs in
-     repo: The repository.
-     title: Issue title
-     text: List of contents of the comments on the issue
-
-    Returns:
-     content: The document to classify
-    """
-    pieces = [title]
-    pieces.append(f"{org.lower()}_{repo.lower()}")
-    pieces.extend(text)
-    content = "\n".join(pieces)
-    return content
+  """Build a document string out of various github features.
+
+  Args:
+   org: The organization the issue belongs in
+   repo: The repository.
+   title: Issue title
+   text: List of contents of the comments on the issue
+
+  Returns:
+   content: The document to classify
+  """
+  pieces = [title]
+  pieces.append(f"{org.lower()}_{repo.lower()}")
+  pieces.extend(text)
+  content = "\n".join(pieces)
+  return content
+
+# TODO(https://github.com/kubeflow/code-intelligence/issues/126): This function should replace
+# get_issue_text
+def get_issue(url, gh_client):
+  """Fetch the issue data using GraphQL.
+
+  Pages through the comments, labels and unlabeled events of the issue
+  until every one of those connections reports that it has no further pages.
+
+  Args:
+    url: Url of the GitHub issue to fetch
+    gh_client: GitHub GraphQl client. It is called as
+      run_query(query, variables) and the result is read as a dict.
+
+  Returns
+    ------
+    dict
+        {'title':str,
+         'comments':List[str]
+         'comment_authors':List[str]
+         'labels': List[str]
+         'removed_labels': List[str]}
+
+    comments is a list of comments. The first one will be the body of the issue.
+
+    comment_authors: Logins of the authors, parallel to comments. An entry
+      is None when the response carries no author for that item.
+    labels: Labels currently on the issue (sorted)
+    removed_labels: Labels that have been removed and are not currently
+      on the issue (sorted)
+
+  Raises:
+    ValueError: If the query reports errors or the url does not resolve to
+      an issue.
+  """
+
+  # The "!" means the variable can't be null. We allow the cursors
+  # to be null so that on the first call we fetch the first couple items.
+  issue_query = """query getIssue($url: URI!, $labelCursor: String, $timelineCursor: String, $commentsCursor: String) {
+  resource(url: $url) {
+    __typename
+    ... on Issue {
+      author {
+        __typename
+        ... on User {
+          login
+        }
+        ... on Bot {
+          login
+        }
+      }
+      id
+      title
+      body
+      url
+      state
+      comments(first: 100, after: $commentsCursor) {
+        totalCount
+        edges {
+          node {
+            author {
+              login
+            }
+            body
+          }
+        }
+        pageInfo {
+          hasNextPage
+          endCursor
+        }
+      }
+      timelineItems(first: 100, itemTypes: [UNLABELED_EVENT], after: $timelineCursor) {
+        totalCount
+        edges {
+          node {
+            __typename
+             ... on UnlabeledEvent {
+                  createdAt
+                  label {
+                    name
+                  }
+                }
+          }
+        }
+        pageInfo {
+          hasNextPage
+          endCursor
+        }
+      }
+      labels(first: 100, after: $labelCursor) {
+        totalCount
+        pageInfo {
+          hasNextPage
+          endCursor
+        }
+        edges {
+          node {
+            name
+          }
+        }
+      }
+    }
+  }
+}"""
+
+  # Maps each paginated connection of the query to the query variable that
+  # carries its cursor. The variable names must match the query signature.
+  cursor_vars = {
+    "comments": "commentsCursor",
+    "labels": "labelCursor",
+    "timelineItems": "timelineCursor",
+  }
+
+  variables = {"url": url}
+  for cursor_var in cursor_vars.values():
+    variables[cursor_var] = None
+
+  result = {
+    "title": None,
+    "comments": [],
+    "comment_authors": [],
+    "labels": set(),
+    "removed_labels": set(),
+  }
+
+  def _login(author):
+    # The author object may be null in the response; report None
+    # instead of failing on the subscript.
+    return author["login"] if author else None
+
+  # Connections whose final page has already been consumed. Every request
+  # re-queries all three connections, so the edges of a finished connection
+  # must be ignored on later iterations to avoid recording them twice.
+  exhausted = set()
+  first_page = True
+  has_more = True
+
+  while has_more:
+    issue_results = gh_client.run_query(issue_query, variables)
+
+    if "errors" in issue_results:
+      logging.error(f"There was a problem running the github query; {issue_results['errors']}")
+      raise ValueError(f"There was a problem running the github query: {issue_results['errors']}")
+
+    issue = issue_results["data"]["resource"]
+
+    # resource is null when the url resolves to nothing, and only carries
+    # __typename when it resolves to something other than an issue.
+    if not issue or issue.get("__typename", "Issue") != "Issue":
+      raise ValueError(f"{url} does not refer to a GitHub issue")
+
+    # The title and body are identical on every page; record them once.
+    if first_page:
+      result["title"] = issue["title"]
+      result["comments"].append(issue["body"])
+      result["comment_authors"].append(_login(issue["author"]))
+      first_page = False
+
+    if "comments" not in exhausted:
+      for e in issue["comments"]["edges"]:
+        node = e["node"]
+        result["comments"].append(node["body"])
+        result["comment_authors"].append(_login(node["author"]))
+
+    if "labels" not in exhausted:
+      for e in issue["labels"]["edges"]:
+        result["labels"].add(e["node"]["name"])
+
+    if "timelineItems" not in exhausted:
+      for e in issue["timelineItems"]["edges"]:
+        result["removed_labels"].add(e["node"]["label"]["name"])
+
+    has_more = False
+
+    for field, cursor_var in cursor_vars.items():
+      if field in exhausted:
+        continue
+
+      page_info = issue[field]["pageInfo"]
+
+      # Never overwrite a cursor with a null endCursor: that would restart
+      # the connection from its first page on the next request.
+      if page_info["endCursor"] is not None:
+        variables[cursor_var] = page_info["endCursor"]
+
+      if page_info["hasNextPage"]:
+        has_more = True
+      else:
+        exhausted.add(field)
+
+  # For removed_labels we only want labels that were permanently removed
+  result["removed_labels"] = result["removed_labels"] - result["labels"]
+
+  # Sort so that the output does not depend on set iteration order.
+  result["labels"] = sorted(result["labels"])
+  result["removed_labels"] = sorted(result["removed_labels"])
+  return result