
Commit 2e59857

Update repo_mlp.ipynb to train an org wide model (kubeflow#128)
* Related to kubeflow#110
* Uses the newly computed embeddings for all issues in the org to train the model
* Clean up .gitignore
* Create a kustomize package to start a notebook with appropriate settings for running the example
  * Warning: the kpt setters aren't fully configured yet
* Evaluate the model qualitatively by fetching recent issues from BigQuery and computing predictions for those issues
  * Very few issues get an area or platform label. It looks like the model falls far short of our goal of labeling 25% of issues.
* Move code for fetching issues from BigQuery into github_bigquery.py
  * Still need to update Get-GitHub-Issues.ipynb to use this
* When computing ROC curves, only look at issues with an area, kind, or platform label (see the sketch below)
  * Do this because we have lots of examples with no labels. We shouldn't treat those as "true negatives" because it's likely a human never looked at them, and if they did, some labels might apply.
1 parent 073d2d2 commit 2e59857
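As a minimal, hypothetical sketch of the ROC filtering described in the last bullet above (it assumes a pandas DataFrame with a parsed_labels column like the one produced by github_bigquery.get_issues; the function name, prefixes, and score input are illustrative, not code from this commit):

```python
# Hypothetical sketch: compute an ROC curve for one label while skipping issues that
# carry no area/kind/platform label at all, so unlabeled issues are not treated as
# true negatives. Not the notebook's actual code.
import numpy as np
from sklearn.metrics import roc_curve

def roc_for_label(issues, label, scores):
    """issues: DataFrame with a 'parsed_labels' column (list of label strings).
    scores: model probabilities for `label`, aligned with the rows of `issues`."""
    triage_prefixes = ("area/", "kind/", "platform/")
    has_triage_label = issues["parsed_labels"].apply(
        lambda labels: any(l.startswith(triage_prefixes) for l in labels))
    y_true = issues.loc[has_triage_label, "parsed_labels"].apply(
        lambda labels: int(label in labels))
    y_score = np.asarray(scores)[has_triage_label.to_numpy()]
    return roc_curve(y_true, y_score)
```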

File tree

16 files changed: +6144 -135 lines changed

.gitignore

Lines changed: 4 additions & 1 deletion
@@ -1,4 +1,3 @@
-.*
 *.pem
 *.hdf5
 *.pkl
@@ -13,5 +12,9 @@ fairing/__pycache__/**
 *.pyc
 py/code_intelligence/.data/**
 
+# ignore coredumps
+**/core.*
+# ignore checkpoints
+**/.ipynb_checkpoints/
 # TODO(jlewi): Is this a remote module? Why is the fairing src getting cloned here?
 Label_Microservice/src/**

Issue_Embeddings/notebooks/Get-GitHub-Issues.ipynb

Lines changed: 1 addition & 0 deletions
@@ -397,6 +397,7 @@
     }
    ],
    "source": [
+    "# TODO(jlewi): This code should now be a function in embeddings/github_bigquery.py\n",
     "query = \"\"\"SELECT \n",
     "  JSON_EXTRACT(payload, '$.issue.html_url') as html_url,\n",
     "  JSON_EXTRACT(payload, '$.issue.title') as title,\n",
Issue_Triage/notebooks/triage.ipynb

Lines changed: 928 additions & 116 deletions
Large diffs are not rendered by default.

Label_Microservice/notebooks/Label_k8s_issues_with_MLP.ipynb

Lines changed: 7 additions & 2 deletions
@@ -7,6 +7,10 @@
     "## Background\n",
     "In this notebook, we show how to feed the embeddings from the language model into the MLP classifier. Then, we take the github repo, `kubernetes/kubernetes`, as an example. We do transfer learning and show the results.\n",
     "\n",
+    "* TODO(jlewi): This notebook is duplicative with repo_mlp.ipynb. It looks like this might have contained\n",
+    "the original training code which has been refactored in repo_mlp.ipynb. But unlike repo_mlp.ipynb this\n",
+    "notebook contains code for model evaluation. We should probably combine them and remove duplication.\n",
+    "\n",
     "## Data\n",
     "**combined_sig_df.pkl**\n",
     "https://storage.googleapis.com/issue_label_bot/notebook_files/combined_sig_df.pkl\n",
@@ -451,6 +455,7 @@
    "source": [
     "from sklearn.metrics import roc_auc_score\n",
     "\n",
+    "# TODO(jlewi): I moved this into mlp.py\n",
     "def calculate_auc(predictions):\n",
     "    auc_scores = []\n",
     "    counts = []\n",
@@ -5654,9 +5659,9 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.6"
+   "version": "3.6.9"
   }
  },
  "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }
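
The hunk above only shows the first lines of calculate_auc, which the TODO says now lives in mlp.py. As a rough, hypothetical sketch of what such a per-label AUC computation typically looks like (the input format below is an assumption for illustration, not taken from this commit):

```python
# Hypothetical per-label AUC sketch; not the code from mlp.py.
from sklearn.metrics import roc_auc_score

def calculate_auc_sketch(predictions):
    """predictions: assumed to be an iterable of (y_true, y_score) pairs, one per label."""
    auc_scores = []
    counts = []
    for y_true, y_score in predictions:
        counts.append(int(sum(y_true)))   # number of positive examples for this label
        if len(set(y_true)) < 2:          # AUC is undefined with only one class present
            auc_scores.append(float("nan"))
            continue
        auc_scores.append(roc_auc_score(y_true, y_score))
    return auc_scores, counts
```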

Label_Microservice/notebooks/repo_mlp.ipynb

Lines changed: 4918 additions & 10 deletions
Large diffs are not rendered by default.

k8s-notebooks/Kptfile

Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
+apiVersion: kpt.dev/v1alpha1
+kind: Kptfile
+metadata:
+  name: .
+packageMetadata:
+  shortDescription: sample description
+openAPI:
+  definitions:
+    io.k8s.cli.setters.namespace:
+      x-k8s-cli:
+        setter:
+          name: namespace
+          value: kubeflow-jlewi
+    io.k8s.cli.substitutions.namespace:
+      x-k8s-cli:
+        substitution:
+          name: namespace
+          pattern: NAMESPACE
+          values:
+          - marker: NAMESPACE
+            ref: '#/definitions/io.k8s.cli.setters.namespace'
+    io.k8s.cli.setters.name:
+      x-k8s-cli:
+        setter:
+          name: name
+          value: mnist
+    io.k8s.cli.substitutions.name:
+      x-k8s-cli:
+        substitution:
+          name: name
+          pattern: NAME
+          values:
+          - marker: NAME
+            ref: '#/definitions/io.k8s.cli.setters.name'

k8s-notebooks/README.md

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+# Notebook Manifests
+
+TODO(jlewi): kpt setters aren't properly configured yet
+* volumes need to be properly set
+
+This directory contains a kustomize package for spinning up
+a notebook on Kubeflow to run the example.
+
+Create a secret with the GITHUB_TOKEN
+
+```
+kubectl -n kubeflow-jlewi create secret generic github-token --from-literal=github_token=${GITHUB_TOKEN}
+```
+

k8s-notebooks/kustomization.yaml

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+namespace: kubeflow-jlewi # {"$ref":"#/definitions/io.k8s.cli.substitutions.namespace"}
+resources:
+- notebook.yaml
+- pvc.yaml
+- service.yaml
+- virtual_service.yaml

k8s-notebooks/notebook.yaml

Lines changed: 58 additions & 0 deletions
@@ -0,0 +1,58 @@
+apiVersion: kubeflow.org/v1
+kind: Notebook
+metadata:
+  labels:
+    app: mnist # {"$ref":"#/definitions/io.k8s.cli.substitutions.name"}
+  name: mnist # {"$ref":"#/definitions/io.k8s.cli.substitutions.name"}
+spec:
+  template:
+    spec:
+      containers:
+      - env:
+        - name: JUPYTERLAB_DIR # Set the JUPYTERLAB_DIR so we can install extensions
+          value: /home/jovyan/.jupyterlab_dir
+        - name: GITHUB_TOKEN
+          valueFrom:
+            secretKeyRef:
+              name: github-token
+              key: github_token
+        image: gcr.io/kubeflow-images-public/tensorflow-1.15.2-notebook-gpu:1.0.0
+        name: mnist # {"$ref":"#/definitions/io.k8s.cli.substitutions.name"}
+        # Bump the resources to include a GPU
+        resources:
+          limits:
+            nvidia.com/gpu: 1
+          requests:
+            cpu: "15"
+            memory: 32.0Gi
+        volumeMounts:
+        - mountPath: /home/jovyan
+          name: workspace-mnist
+        - mountPath: /dev/shm
+          name: dshm
+      # Start a container running theia which is an IDE
+      - env:
+        - name: GITHUB_TOKEN
+          valueFrom:
+            secretKeyRef:
+              name: github-token
+              key: github_token
+        # TODO(jlewi): Should we use an image which actually includes an appropriate toolchain like python?
+        image: theiaide/theia:next
+        name: theia
+        resources:
+          requests:
+            cpu: "4"
+            memory: 1.0Gi
+        volumeMounts:
+        - mountPath: /mount/jovyan
+          name: workspace-mnist
+      serviceAccountName: default-editor
+      ttlSecondsAfterFinished: 300
+      volumes:
+      - name: workspace-mnist
+        persistentVolumeClaim:
+          claimName: workspace-mnist
+      - emptyDir:
+          medium: Memory
+        name: dshm

k8s-notebooks/pvc.yaml

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  # TODO(jlewi): Need to create a kpt setter for this.
+  name: workspace-mnist
+spec:
+  accessModes:
+  - ReadWriteOnce
+  resources:
+    requests:
+      # We need more storage.
+      storage: 100Gi
+  storageClassName: standard
+  volumeMode: Filesystem

k8s-notebooks/service.yaml

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
+# Define a service for theia
+# TODO(jlewi): This needs to be adjusted based on kpt setters
+apiVersion: v1
+kind: Service
+metadata:
+  name: mnist-theia
+spec:
+  ports:
+  - name: http-theia
+    port: 3000
+    protocol: TCP
+    targetPort: 3000
+  selector:
+    notebook-name: mnist
+  type: ClusterIP

k8s-notebooks/virtual_service.yaml

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+apiVersion: networking.istio.io/v1alpha3
+kind: VirtualService
+metadata:
+  name: notebook-kubeflow-jlewi-mnist-theia
+  namespace: kubeflow-jlewi
+spec:
+  gateways:
+  - kubeflow/kubeflow-gateway
+  hosts:
+  - '*'
+  http:
+  - match:
+    - uri:
+        # The prefix must have a trailing slash
+        # And when you navigate to the URL you must include the trailing slash.
+        prefix: /notebook/kubeflow-jlewi/mnist-theia/
+    rewrite:
+      uri: /
+    route:
+    - destination:
+        host: mnist-theia.kubeflow-jlewi.svc.cluster.local
+        port:
+          number: 3000
+    timeout: 300s

py/code_intelligence/embeddings.py

Lines changed: 14 additions & 6 deletions
@@ -17,8 +17,8 @@ def find_max_issue_num(owner, repo):
 
     Returns
     -------
-    int
-    the highest issue number associated with this repo.
+    int
+        the highest issue number associated with this repo.
     """
     url = f'https://github.com/{owner}/{repo}/issues'
     r = requests.get(url)
@@ -174,24 +174,32 @@ def pass_through(x):
     """Avoid messages when the model is deserialized in fastai library."""
     return x
 
-def load_model_artifact(model_url):
+# TODO(jlewi): I think we should just get rid of this method.
+# Callers should use gcs_util and then call inference_wrapper
+def load_model_artifact(model_url, local_dir=None):
     """
     Download the pretrained language model from URL
     Args:
       model_url: URL to store the pretrained model
+      local_dir: (Optional) Directory where model files are stored
 
     Returns
     ------
     InferenceWrapper
       a wrapper for a Learner object in fastai.
     """
-    path = Path('./model_files')
-    full_path = path/'model.pkl'
-
+    if not local_dir:
+        home = str(Path.home())
+        local_dir = os.path.join(home, "model_files")
+
+    full_path = os.path.join(local_dir, 'model.pkl')
+
     if not full_path.exists():
         logging.info('Loading model.')
         path.mkdir(exist_ok=True)
         request_url.urlretrieve(model_url, path/'model.pkl')
+    else:
+        logging.info(f"Model {full_path} exists")
     return InferenceWrapper(model_path=path, model_file_name='model.pkl')
 
 
py/code_intelligence/gcs_util.py

Lines changed: 12 additions & 0 deletions
@@ -71,6 +71,18 @@ def upload_file_to_gcs(bucket_name, gcs_filename, local_filename, storage_client
     blob = bucket.blob(gcs_filename)
     blob.upload_from_filename(local_filename)
 
+
+def copy_from_gcs(gcs_path, local_filename, storage_client=None):
+    """
+    Download a file in GCS to the local.
+    Args:
+      gcs_path: gcs path
+      local_filename: the new local file, str
+      storage_client: client to bundle configuration needed for API requests
+    """
+    bucket_name, gcs_file_name = split_gcs_uri(gcs_path)
+    return download_file_from_gcs(bucket_name, gcs_file_name, local_filename, storage_client=storage_client)
+
 def download_file_from_gcs(bucket_name, gcs_filename, local_filename, storage_client=None):
     """
     Download a file in GCS to the local.
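
A short usage sketch for the new copy_from_gcs helper; the import path and the GCS URI below are illustrative assumptions, not part of this commit:

```python
# Hypothetical usage of copy_from_gcs; bucket and object names are placeholders.
from code_intelligence import gcs_util  # import path assumed

gcs_util.copy_from_gcs("gs://some-bucket/models/model.pkl", "/tmp/model.pkl")
```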
py/code_intelligence/github_bigquery.py

Lines changed: 66 additions & 0 deletions

@@ -0,0 +1,66 @@
+"""This module contains code to get issue data from BigQuery."""
+
+import dateutil
+import json
+from pandas.io import gbq
+import re
+
+def get_issues(login, project, max_age_days=None):
+  """Get issue data from bigquery.
+
+  Args:
+    login: Which GitHub organization to query for
+    project: GCP project to charge BigQuery to
+    max_age_days: (Optional) If present only fetch issues which were created
+      less than max_age_days ago
+  """
+  query = f"""SELECT
+          JSON_EXTRACT(payload, '$.issue.html_url') as html_url,
+          JSON_EXTRACT(payload, '$.issue.title') as title,
+          JSON_EXTRACT(payload, '$.issue.body') as body,
+          JSON_EXTRACT(payload, "$.issue.labels") as labels,
+          JSON_EXTRACT(payload, "$.issue.created_at") as created_at,
+          JSON_EXTRACT(payload, "$.issue.updated_at") as updated_at,
+          org.login,
+          type,
+        FROM `githubarchive.month.20*`
+        WHERE (type="IssuesEvent" or type="IssueCommentEvent") and org.login = '{login}'"""
+
+  if max_age_days:
+    # We need to convert the created_at field to a timestamp.
+    # JSON_EXTRACT returns a json string meaning it is quoted and we need
+    # to remove the quotes
+    query += f""" and DATETIME_DIFF(CURRENT_DATETIME(), PARSE_DATETIME(
+        "\\"%Y-%m-%dT%TZ\\"", JSON_EXTRACT(payload,
+        "$.issue.created_at")), DAY)
+        <= {max_age_days} """
+
+  issues_and_pulls=gbq.read_gbq(query, dialect='standard', project_id=project)
+
+  # pull request comments also get included so we need to filter those out
+  pattern = re.compile(".*issues/[\d]+")
+
+  issues_index = issues_and_pulls["html_url"].apply(lambda x: pattern.match(x) is not None)
+  issues = issues_and_pulls[issues_index]
+
+  # We need to group the events by issue and then select the most recent event for each
+  # issue as that should have the most up to date labels for each issue.
+  # TODO(jlewi): Should we be converting updated_at to a datetime before doing the sort?
+  latest_issues = issues.groupby("html_url", as_index=False).apply(lambda x: x.sort_values(["updated_at"]).iloc[-1])
+
+  # we need to deserialize the json strings to remove escaping
+  for f in ["html_url", "title", "body", "created_at", "updated_at"]:
+    latest_issues[f] = latest_issues[f].apply(lambda x : json.loads(x))
+
+  # Parse timestamps
+  for f in ["created_at", "updated_at"]:
+    latest_issues[f] = latest_issues[f].apply(lambda x : dateutil.parser.parse(x))
+
+  # Parse labels
+  def get_labels(x):
+    d = json.loads(x)
+    return [i["name"] for i in d]
+
+  latest_issues["parsed_labels"] = latest_issues["labels"].apply(get_labels)
+
+  return latest_issues
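
A brief usage sketch for get_issues; the module import path and the GCP project id are placeholders, not part of this commit:

```python
# Hypothetical usage of get_issues; "my-gcp-project" is a placeholder project id.
from code_intelligence import github_bigquery  # import path assumed

recent = github_bigquery.get_issues("kubeflow", project="my-gcp-project", max_age_days=14)
print(recent[["html_url", "parsed_labels"]].head())
```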
