Skip to content

Commit 5888cb6

Browse files
committed
Added new rule on anti-spam for unicode characters
1 parent ef94d9c commit 5888cb6

File tree

2 files changed

+54
-0
lines changed

2 files changed

+54
-0
lines changed

app/Services/SpamDetector.php

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,36 @@ public function containsStopWords(): bool
6565
return Str::of($this->message)->contains($this->stopWords, true);
6666
}
6767

68+
/**
 * Checks whether the message contains an excessive share of special characters.
 *
 * A "special" character is any character that is not a Unicode letter (\p{L}),
 * number (\p{N}), separator (\p{Z}) or punctuation mark (\p{P}) - for example
 * emoji, currency and math symbols, and control characters such as line breaks.
 * Runs of special characters at the very start and at the very end of the
 * message are ignored, so decorative leading/trailing emoji do not count.
 *
 * @param float $threshold Maximum allowed proportion of special characters (0.02 means 2%).
 *
 * @return bool True when the proportion is strictly greater than $threshold; false otherwise,
 *              including when the message is empty or consists only of special characters.
 */
public function hasExcessiveUnicodeCharacters(float $threshold = 0.02): bool
{
    // Length of the message after stripping the leading and trailing runs of
    // special characters (emoji, etc.); special characters in the middle still count.
    $withUnicode = Str::of($this->message)
        ->replaceMatches('/^[^\p{L}\p{N}\p{Z}\p{P}]+|[^\p{L}\p{N}\p{Z}\p{P}]+$/u', '')
        ->length();

    // Nothing left to measure (empty or special-characters-only message): there is no
    // meaningful ratio, and dividing by zero would throw DivisionByZeroError on PHP 8.
    if ($withUnicode === 0) {
        return false;
    }

    // Length of the message with every special character removed
    $withOutUnicode = Str::of($this->message)
        ->replaceMatches('/[^\p{L}\p{N}\p{Z}\p{P}]/u', '')
        ->length();

    // Number of special characters located inside the message
    $unicodeLength = $withUnicode - $withOutUnicode;

    // Proportion of special characters in the message
    $unicodePercentage = $unicodeLength / $withUnicode;

    // Check if the proportion of special characters exceeds the given threshold
    return $unicodePercentage > $threshold;
}
97+
6898
/**
6999
* Check if the message is spam using a Naive Bayes classifier.
70100
*
@@ -114,6 +144,10 @@ private function trainClassifier(Classifier $classifier, string $fileName, strin
114144
*/
115145
public function isSpam()
116146
{
147+
if ($this->hasExcessiveUnicodeCharacters()) {
148+
return true;
149+
}
150+
117151
if ($this->containsStopWords()) {
118152
return true;
119153
}

tests/Unit/SpamDetectorTest.php

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,26 @@ public static function messageProvider()
2929
['Стабильный доход от 100$ Нужен только телефон', true],
3030
['блокчейн в ЛС', true],
3131
['Крипто инвестиции', true],
32+
['18+', true],
33+
['hamsterkombat', true],
34+
['hamster', true],
35+
['Прuвет', true],
3236
];
3337
}
38+
/**
 * Verifies hasExcessiveUnicodeCharacters() on two sample messages:
 * a multi-line spam text that must be flagged, and a short message
 * decorated with emoji at its edges that must not be flagged.
 */
public function testUnicodeRules():void
39+
{
40+
// Spam sample: Cyrillic text in which some letters are replaced by Latin look-alikes,
// spread over several lines, with `$` and `+` inside the text. Expected to be flagged.
// NOTE(review): the detector's pattern treats Latin letters as ordinary letters, so what
// pushes this sample over the threshold is presumably the line breaks plus `$` and `+` - confirm.
41+
$spamDetector = new SpamDetector('Прuвет всем, хoчу предлoжuть реaльный дoпoлнuтельный зaрaбoтoк!
42+
- От 50$ в/зa день гaрaнтuрoвaнo
43+
- Чaс в день твoегo временu
44+
- Честнo u легaльнo, НЕ НАРКОТИКИ!!
45+
46+
Еслu ты действuтельнo зauнтересoвaн в быстрoм u честнoм зaрaбoтке , пuшu + в ЛС!!!!');
47+
48+
$this->assertTrue($spamDetector->hasExcessiveUnicodeCharacters());
49+
50+
// Emoji only at the very start and the very end of the message: expected NOT to be flagged.
51+
$spamDetector = new SpamDetector('🍕 Прикольно, что ты тут делаешь? 🍣🍰');
52+
$this->assertFalse($spamDetector->hasExcessiveUnicodeCharacters());
53+
}
3454
}

0 commit comments

Comments
 (0)