from datasets import load_dataset
from torch.utils.data import IterableDataset
from transformers import AutoTokenizer
from typing import Iterator

from preprocess import clean_comments, include, keep_only_content

from chunk import chunk
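# NOTE: `preprocess` and `chunk` are local helper modules of this repo. From their use
# below, `chunk` is assumed to return a tokenizer-style dict whose "input_ids" and
# "attention_mask" entries are tensors of shape (num_chunks, chunk_size).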

class CleanDataset(IterableDataset):
    TRAIN_SPLIT_NAME = "codeparrot/codeparrot-clean-train"
    VAL_SPLIT_NAME = "codeparrot/codeparrot-clean-valid"

    def __init__(self, train_split: bool, max_size: int = float("inf")):
        # Select the training or validation dataset
        SPLIT_NAME = CleanDataset.TRAIN_SPLIT_NAME if train_split else CleanDataset.VAL_SPLIT_NAME

        # Set max size
        self.max_size = max_size

        # Load dataset
        ds = load_dataset(SPLIT_NAME,
                          streaming=True,
                          split="train")  # Invariant for BOTH train and val sets

        # Preprocessing
        ds = ds.filter(lambda x: x["path"].endswith(".py"))               # Python only
        ds = ds.filter(lambda x: include(x["content"]))                   # DS imports only
        ds = ds.map(lambda x: {"content": clean_comments(x["content"])})  # Reformat code
        ds = ds.map(keep_only_content)                                    # Smaller samples

        # Prepare for torch DataLoader
        ds = ds.with_format("torch")

        # Enforce max size (take() expects an integer, so skip it when max_size is unbounded)
        if max_size != float("inf"):
            ds = ds.take(max_size)

        self.ds = ds

    def generate(self) -> Iterator[str]:
        i = 0  # Tracks attempt number for exception reporting

        for code_file in self.ds:
            i += 1

            # Yield when possible, skip and log when not
            try:
                yield code_file["content"]
            except StopIteration:
                break
            except Exception as e:
                print(f"[WARNING] Exception while loading sample {i}/{self.max_size}: {e}. Skipped item")
                continue

    def __iter__(self) -> Iterator[dict]:
        return self.generate()

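# ChunkedDataset tokenizes each cleaned code file and splits it into overlapping
# fixed-length chunks via the `chunk` helper, yielding ready-to-batch dicts of
# input_ids, attention_mask, and labels.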
class ChunkedDataset(CleanDataset):
    def __init__(self, train_split: bool, max_size: int, tokenizer: AutoTokenizer,
                 chunk_size: int = 256, chunk_overlap_len: int = 3, max_chunks: int = 128):

        super().__init__(train_split, max_size)

        self.tokenizer = tokenizer
        self.chunk_size = chunk_size
        self.overlapping_len = chunk_overlap_len
        self.max_chunks = max_chunks

    def generate(self) -> Iterator[dict]:
        count = 0

        for text in super().generate():
            # Attempt to chunk each code sample
            chunks = None
            try:
                chunks = chunk(inp=text,
                               tokenizer=self.tokenizer,
                               chunk_size=self.chunk_size,
                               overlapping_len=self.overlapping_len,
                               max_chunks=self.max_chunks)
            except Exception as e:
                print(f"[WARNING] Exception while chunking after {count}/{self.max_size} chunks: {e}. Skipped item")
                continue

            # Extract input ids and attention masks
            ids, mask = chunks["input_ids"], chunks["attention_mask"]

            # Yield each chunk, stopping if max_size is reached
            for i in range(ids.size(0)):
                # Stop yielding if max_size is reached
                if count >= self.max_size:
                    break

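                # "labels" is a copy of "input_ids": Hugging Face causal-LM models shift
                # labels internally when computing the loss, so no manual shift is needed.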
                # Yield
                yield {
                    "input_ids": ids[i],
                    "attention_mask": mask[i],
                    "labels": ids[i].clone()
                }
                count += 1

            # Stop generating new chunks if max_size is reached
            if count >= self.max_size:
                break


# SAMPLE USAGE
if __name__ == "__main__":
    try:
        tokenizer = AutoTokenizer.from_pretrained("./tokenizer_10M")
    except OSError:
        print("[WARNING] tokenizer_10M folder was not found, defaulting to GPT2")
        tokenizer = AutoTokenizer.from_pretrained("gpt2")

    ds = ChunkedDataset(
        train_split=True,     # Use training split
        max_size=1_000_000,   # Provide up to 1 million samples (chunks, not files)
        tokenizer=tokenizer,  # Set tokenizer
        chunk_size=256,       # Max length of id/mask sequences is 256
        chunk_overlap_len=3,  # Chunks share 3 ids with the previous chunk
        max_chunks=128,       # Max chunks per file
    )

    # ChunkedDataset is iterable, so it can be passed directly to a DataLoader
    from torch.utils.data import DataLoader

    loader = DataLoader(
        dataset=ds,
        batch_size=16,
        # shuffle must NOT be set: DataLoader raises an error when shuffle is used with an IterableDataset
    )

    # Inspect a single batch
    for batch in loader:
        print(batch)
        break
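    # Assuming `chunk` pads every chunk to exactly chunk_size tokens (so the default
    # collate function can stack them), each batch is a dict of (batch_size, chunk_size)
    # tensors, e.g. batch["input_ids"].shape == torch.Size([16, 256]).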