using System.Collections.Frozen;
using System.Linq;
using System.Text;
using System.Text.Unicode;
namespace Content.Shared.Chat.V2.Moderation;
///
/// A basic censor. Not bullet-proof.
///
public sealed class SimpleCensor : IChatCensor
{
// Common substitution symbols are replaced with one of the characters they commonly substitute.
private bool _shouldSanitizeLeetspeak;
private FrozenDictionary _leetspeakReplacements = FrozenDictionary.Empty;
// Special characters are replaced with spaces.
private bool _shouldSanitizeSpecialCharacters;
private HashSet _specialCharacterReplacements = [];
// Censored words are removed unless they're a false positive (e.g. Scunthorpe)
private string[] _censoredWords = Array.Empty();
private string[] _falsePositives = Array.Empty();
// False negatives are censored words that contain a false positives.
private string[] _falseNegatives = Array.Empty();
// What unicode ranges are allowed? If this array is empty, don't filter by range.
private UnicodeRange[] _allowedUnicodeRanges= Array.Empty();
///
/// Censors the input string.
///
/// The input string
/// The output string
/// The character to replace with
/// If output is valid
public bool Censor(string input, out string output, char replaceWith = '*')
{
output = Censor(input, replaceWith);
return !string.Equals(input, output);
}
public string Censor(string input, char replaceWith = '*')
{
// We flat-out ban anything not in the allowed unicode ranges, stripping them
input = SanitizeOutBlockedUnicode(input);
var originalInput = input.ToCharArray();
input = SanitizeInput(input);
var censored = input.ToList();
// Remove false negatives
input = CheckProfanity(input, censored, _falseNegatives, replaceWith);
// Get false positives
var falsePositives = FindFalsePositives(censored, replaceWith);
// Remove censored words
CheckProfanity(input, censored, _censoredWords, replaceWith);
// Reconstruct
// Reconstruct false positives
for (var i = 0; i < falsePositives.Length; i++)
{
if (falsePositives[i] != replaceWith)
{
censored[i] = falsePositives[i];
}
}
for (var i = 0; i < originalInput.Length; i++)
{
if (originalInput[i] == ' ')
{
censored.Insert(i, ' ');
continue;
}
if (_shouldSanitizeSpecialCharacters && _specialCharacterReplacements.Contains(originalInput[i]))
{
censored.Insert(i, originalInput[i]);
continue;
}
if (_shouldSanitizeLeetspeak || _shouldSanitizeSpecialCharacters)
{
// detect "()"
if (originalInput[i] == '(' && i != originalInput.Length - 1 && originalInput[i+1] == ')')
{
// censored has now had "o" replaced with "o) so both strings line up again..."
censored.Insert(i+1, censored[i] != replaceWith ? ')' : replaceWith);
}
}
if (censored[i] != replaceWith)
{
censored[i] = originalInput[i];
}
}
// SO says this is fast...
return string.Concat(censored);
}
///
/// Adds a l33tsp34k sanitization rule
///
/// The censor for further configuration
public SimpleCensor WithSanitizeLeetSpeak()
{
_shouldSanitizeLeetspeak = true;
return BuildCharacterReplacements();
}
///
/// Adds a l33tsp34k sanitization rule
///
/// The censor for further configuration
public SimpleCensor WithSanitizeSpecialCharacters()
{
_shouldSanitizeSpecialCharacters = true;
return BuildCharacterReplacements();
}
public SimpleCensor WithRanges(UnicodeRange[] ranges)
{
_allowedUnicodeRanges = ranges;
return this;
}
public SimpleCensor WithCustomDictionary(string[] naughtyWords)
{
_censoredWords = naughtyWords;
return this;
}
public SimpleCensor WithFalsePositives(string[] falsePositives)
{
_falsePositives = falsePositives;
return this;
}
public SimpleCensor WithFalseNegatives(string[] falseNegatives)
{
_falseNegatives = falseNegatives;
return this;
}
public SimpleCensor WithLeetspeakReplacements(Dictionary replacements)
{
_leetspeakReplacements = replacements.ToFrozenDictionary();
return this;
}
public SimpleCensor WithSpecialCharacterReplacements(Dictionary replacements)
{
_leetspeakReplacements = replacements.ToFrozenDictionary();
return this;
}
private string CheckProfanity(string input, List censored, string[] words, char replaceWith = '*')
{
foreach (var word in words)
{
var wordLength = word.Length;
var endOfFoundWord = 0;
var foundIndex = input.IndexOf(word, endOfFoundWord, StringComparison.OrdinalIgnoreCase);
while(foundIndex > -1)
{
endOfFoundWord = foundIndex + wordLength;
for (var i = 0; i < wordLength; i++)
{
censored[foundIndex+i] = replaceWith;
}
foundIndex = input.IndexOf(word, endOfFoundWord, StringComparison.OrdinalIgnoreCase);
}
}
return input;
}
private char[] FindFalsePositives(List chars, char replaceWith = '*')
{
var input = string.Concat(chars);
var output = Enumerable.Repeat(replaceWith, input.Length).ToArray();
var inputAsARr = input.ToArray();
foreach (var word in _falsePositives)
{
var wordLength = word.Length;
var endOfFoundWord = 0;
var foundIndex = input.IndexOf(word, endOfFoundWord, StringComparison.OrdinalIgnoreCase);
while(foundIndex > -1)
{
endOfFoundWord = foundIndex + wordLength;
for (var i = foundIndex; i < endOfFoundWord; i++)
{
output[i] = inputAsARr[i];
}
foundIndex = input.IndexOf(word, endOfFoundWord, StringComparison.OrdinalIgnoreCase);
}
}
return output;
}
private string SanitizeInput(string input)
{
// "()" is a broad enough trick to beat censors that we we should check for it broadly.
if (_shouldSanitizeLeetspeak || _shouldSanitizeSpecialCharacters)
{
input = input.Replace("()", "o");
}
var sb = new StringBuilder();
// ReSharper disable once ForeachCanBePartlyConvertedToQueryUsingAnotherGetEnumerator
foreach (var character in input)
{
if (character == ' ' || _shouldSanitizeSpecialCharacters && _specialCharacterReplacements.Contains(character))
{
continue;
}
if (_shouldSanitizeLeetspeak && _leetspeakReplacements.TryGetValue(character, out var leetRepl))
{
sb.Append(leetRepl);
continue;
}
sb.Append(character);
}
return sb.ToString();
}
///
/// Returns a string with all characters not in ISO-8851-1 replaced with question marks
///
private string SanitizeOutBlockedUnicode(string input)
{
if (_allowedUnicodeRanges.Length <= 0)
{
return input;
}
var sb = new StringBuilder();
foreach (var symbol in input.EnumerateRunes())
{
// ReSharper disable once LoopCanBeConvertedToQuery
foreach (var range in _allowedUnicodeRanges)
{
if (symbol.Value < range.FirstCodePoint || symbol.Value >= range.FirstCodePoint + range.Length)
continue;
sb.Append(symbol);
break;
}
}
return sb.ToString();
}
private SimpleCensor BuildCharacterReplacements()
{
if (_shouldSanitizeSpecialCharacters)
{
_specialCharacterReplacements =
[
'-',
'_',
'|',
'.',
',',
'(',
')',
'<',
'>',
'"',
'`',
'~',
'*',
'&',
'%',
'$',
'#',
'@',
'!',
'?',
'+'
];
}
if (_shouldSanitizeLeetspeak)
{
_leetspeakReplacements = new Dictionary
{
['4'] = 'a',
['$'] = 's',
['!'] = 'i',
['+'] = 't',
['#'] = 'h',
['@'] = 'a',
['0'] = 'o',
['1'] = 'i', // also obviously can be l; gamer-words need i's more though.
['7'] = 'l',
['3'] = 'e',
['5'] = 's',
['9'] = 'g',
['<'] = 'c'
}.ToFrozenDictionary();
}
return this;
}
}