sample_website_and_truncate_github_version.py
# -*- coding: utf-8 -*-
import os
import sys
import tiktoken
import pandas as pd
import json
# Define the path to the dataset folder containing unpacked phishing data
ws_path = "/workspace/dataset/unpacked_folder_phishing"
# List all files/folders inside the dataset path
files = os.listdir(ws_path)
# Dictionary to store HTML content with folder names as keys
all_html1 = {}
# Iterate over each folder/file in the dataset path
for filename in files:
    # Construct the full path of the current item
    p2 = os.path.join(ws_path, filename).replace("\\", "/")
    # List files within the current folder
    wss = os.listdir(p2)
    # Iterate through files in the folder
    for f1 in wss:
        # Process only files ending with ".html"
        if f1.endswith(".html"):
            # Construct full path of the HTML file
            file_path = os.path.join(p2, f1).replace("\\", "/")
            # Open the HTML file in read mode with UTF-8 encoding
            with open(file_path, "r", encoding="utf-8") as f:
                # Store the content in a dictionary with the folder name as key
                all_html1[filename] = f.read()
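# Note (assumption): the loop above treats every entry under ws_path as a folder; if the
# dataset directory can also contain plain files, adding "if not os.path.isdir(p2): continue"
# right after p2 is built would skip them safely.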
# Step 1: Initialize GPT-4 tokenizer using tiktoken
model_name = "gpt-4"
encoding = tiktoken.encoding_for_model(model_name)
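# encoding_for_model("gpt-4") resolves to tiktoken's cl100k_base encoding, so the token
# counts below match how GPT-4 would tokenize each HTML document.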
# Step 2: Count tokens for each HTML document
doc_token_counts = {name: len(encoding.encode(doc)) for name, doc in all_html1.items()}
# Step 3: Convert token counts into a pandas DataFrame
df = pd.DataFrame(list(doc_token_counts.items()), columns=["doc_name", "token_count"])
# Step 4: Split documents into 10 bins based on token counts (quantile-based binning)
df["token_bin"] = pd.qcut(df["token_count"], q=10, labels=False)
# Step 5: Sample 500 documents in total, proportionally distributed across bins (50 per bin)
sampled_df = df.groupby("token_bin", group_keys=False).apply(
    lambda x: x.sample(n=int(500 / 10), random_state=42)  # random_state ensures reproducibility
)
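# Note: x.sample(n=50) raises a ValueError if any bin contains fewer than 50 documents,
# so this sampling scheme assumes every decile holds at least 50 HTML files.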
# Step 6: Extract only the sampled document names
sampled_doc_names = sampled_df["doc_name"].tolist()
print(f"Number of sampled document names: {len(sampled_doc_names)}")
# Add custom script path for importing truncate functions
ROOT = "/workspace/scripts"
sys.path.append(ROOT)
# Import helper functions to truncate HTML by token length
from truncate_html_functions_github_version import *
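# Assumption: truncate_html_to_tokens_merged(html, max_tokens) comes from this module and
# returns the HTML truncated to roughly max_tokens tokens; the calls below rely on that shape.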
# Dictionaries to store truncated versions of documents
p5 = {} # Stores 5% of tokens
p50 = {} # Stores 50% of tokens
# Iterate over all documents
for name, doc in all_html1.items():
    if name in sampled_doc_names:  # Only process sampled documents
        doc_len = len(encoding.encode(doc))  # Get token length of document
        # Truncate to 5% of original length (cast to int so the token budget is a whole number)
        p5[name] = truncate_html_to_tokens_merged(doc, int(doc_len * 0.05))
        # Truncate to 50% of original length
        p50[name] = truncate_html_to_tokens_merged(doc, int(doc_len * 0.50))
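# At this point p5 and p50 map each sampled folder name to its 5% and 50% truncations;
# documents that were not sampled are simply skipped.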
# Save the truncated 5% documents into JSON
with open('/workspace/dataset/temp/phish_5.json', 'w') as fp:
    json.dump(p5, fp)
# Save the truncated 50% documents into JSON
with open('/workspace/dataset/temp/phish_50.json', 'w') as fp:
    json.dump(p50, fp)
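# Note (assumption): /workspace/dataset/temp must already exist for the two json.dump calls
# above to succeed; os.makedirs("/workspace/dataset/temp", exist_ok=True) before writing
# would create it if needed.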