Skip to content

Commit b84ca7a

Browse files
committed
update clean_sharegpt.py
1 parent 74aa8ae commit b84ca7a

File tree

1 file changed

+15
-0
lines changed

1 file changed

+15
-0
lines changed

fastchat/data/clean_sharegpt.py

+15
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,18 @@ def contain_blocked_words(val: str) -> bool:
7171
return False
7272

7373

74+
def contain_blocked_responses(role: str, val: str) -> bool:
    """Return True if a "gpt" turn begins with a blocked canned response.

    Args:
        role: Speaker of the turn. Only the value "gpt" is ever checked;
            any other role always yields False.
        val: Text of the turn.

    Returns:
        True when ``role`` is "gpt" and ``val`` starts with one of the
        blocked response prefixes, otherwise False.
    """
    if role != "gpt":
        return False

    # Prefix match (not equality): text following the message is ignored.
    # The second entry covers the variant with a leading "!".
    blocked_responses = (
        "Too many requests in 1 hour. Try again later.",
        "!Too many requests in 1 hour. Try again later.",
    )
    # str.startswith accepts a tuple and returns True if any prefix matches.
    return val.startswith(blocked_responses)
84+
85+
7486
def clean_html_one_sample(sample):
7587
roles = ["human", "gpt"]
7688

@@ -102,6 +114,9 @@ def clean_html_one_sample(sample):
102114
except (bs4.builder.ParserRejectedMarkup, AssertionError):
103115
return (sample, 4)
104116

117+
if contain_blocked_responses(c["from"], new_val):
118+
return (sample, 3)
119+
105120
# Filter empty answers like https://sharegpt.com/c/mrllZ6u
106121
if not new_val or not new_val[0].isprintable():
107122
break

0 commit comments

Comments
 (0)