@@ -65,6 +65,36 @@ public function containsStopWords(): bool
65
65
return Str::of ($ this ->message )->contains ($ this ->stopWords , true );
66
66
}
67
67
68
/**
 * Checks whether the message contains an excessive proportion of special characters.
 *
 * A "special" character is any character that is not a Unicode letter (\p{L}),
 * number (\p{N}), separator (\p{Z}) or punctuation mark (\p{P}), e.g. emoji and
 * other symbols. Runs of special characters at the very start and the very end
 * of the message are ignored.
 *
 * @param float $threshold Maximum tolerated share of special characters (0.02 means 2%).
 *
 * @return bool True when the share of special characters is strictly greater than $threshold;
 *              false otherwise, including when nothing is left to measure.
 */
public function hasExcessiveUnicodeCharacters(float $threshold = 0.02): bool
{
    // Matches a single character that is NOT a letter, number, separator or punctuation mark.
    $special = '[^\p{L}\p{N}\p{Z}\p{P}]';

    // Length of the message with leading and trailing special characters (emoji, etc.) stripped.
    $withUnicode = Str::of($this->message)
        ->replaceMatches('/^' . $special . '+|' . $special . '+$/u', '')
        ->length();

    // Nothing left to measure (empty message, or a message made only of special
    // characters): return early instead of dividing by zero below.
    if ($withUnicode <= 0) {
        return false;
    }

    // Length of the message with every special character removed.
    $withOutUnicode = Str::of($this->message)
        ->replaceMatches('/' . $special . '/u', '')
        ->length();

    // Number of special characters remaining inside the message.
    $unicodeLength = $withUnicode - $withOutUnicode;

    // Proportion of special characters relative to the stripped message length.
    $unicodePercentage = $unicodeLength / $withUnicode;

    // Strict comparison: a share exactly equal to the threshold is still acceptable.
    return $unicodePercentage > $threshold;
}
97
+
68
98
/**
69
99
* Check if the message is spam using a Naive Bayes classifier.
70
100
*
@@ -114,6 +144,10 @@ private function trainClassifier(Classifier $classifier, string $fileName, strin
114
144
*/
115
145
public function isSpam ()
116
146
{
147
+ if ($ this ->hasExcessiveUnicodeCharacters ()) {
148
+ return true ;
149
+ }
150
+
117
151
if ($ this ->containsStopWords ()) {
118
152
return true ;
119
153
}
0 commit comments