1
0

SimpleCensor.cs 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340
  1. using System.Collections.Frozen;
  2. using System.Linq;
  3. using System.Text;
  4. using System.Text.Unicode;
  5. namespace Content.Shared.Chat.V2.Moderation;
  6. /// <summary>
  7. /// A basic censor. Not bullet-proof.
  8. /// </summary>
  9. public sealed class SimpleCensor : IChatCensor
  10. {
  11. // Leetspeak: common substitution symbols are replaced with one of the characters they commonly substitute.
  // The flag is switched on by WithSanitizeLeetSpeak(); the map is consulted by SanitizeInput().
  12. private bool _shouldSanitizeLeetspeak;
  13. private FrozenDictionary<char, char> _leetspeakReplacements = FrozenDictionary<char, char>.Empty;
  14. // Special characters are stripped from the text before matching (SanitizeInput) and put back in place
  // when Censor() rebuilds its output. The flag is switched on by WithSanitizeSpecialCharacters().
  15. private bool _shouldSanitizeSpecialCharacters;
  16. private HashSet<char> _specialCharacterReplacements = [];
  17. // Censored words are masked with the replacement character unless they're part of a false positive (e.g. Scunthorpe)
  18. private string[] _censoredWords = Array.Empty<string>();
  19. private string[] _falsePositives = Array.Empty<string>();
  20. // False negatives are censored words that contain a false positive. Censor() masks them before it looks
  // for false positives, so they are never restored.
  21. private string[] _falseNegatives = Array.Empty<string>();
  22. // What unicode ranges are allowed? If this array is empty, don't filter by range.
  23. private UnicodeRange[] _allowedUnicodeRanges= Array.Empty<UnicodeRange>();
  /// <summary>
  /// Censors the input string and reports whether the result differs from the input.
  /// </summary>
  /// <param name="input">The input string</param>
  /// <param name="output">The censored string</param>
  /// <param name="replaceWith">The character to replace with</param>
  /// <returns>True if <paramref name="output"/> differs from <paramref name="input"/>, otherwise false</returns>
  public bool Censor(string input, out string output, char replaceWith = '*')
  {
      var censored = Censor(input, replaceWith);
      output = censored;

      // Any difference counts, whether it comes from masking or from stripped unicode.
      return input != censored;
  }
  /// <summary>
  /// Censors the input string and returns the result.
  /// </summary>
  /// <remarks>
  /// Characters outside the allowed unicode ranges are stripped first. Matching then runs against a sanitized
  /// copy of the text (see <see cref="SanitizeInput"/>), and the result is rebuilt from the un-sanitized text:
  /// characters that were part of a censored match become <paramref name="replaceWith"/>, everything else keeps
  /// its original spelling.
  /// </remarks>
  /// <param name="input">The input string</param>
  /// <param name="replaceWith">The character to replace with</param>
  /// <returns>The censored string</returns>
  public string Censor(string input, char replaceWith = '*')
  {
      // We flat-out ban anything not in the allowed unicode ranges, stripping them
      input = SanitizeOutBlockedUnicode(input);

      var originalInput = input.ToCharArray();

      input = SanitizeInput(input);

      // One slot per sanitized character; slots are overwritten with replaceWith as matches are found.
      var censored = input.ToList();

      // Remove false negatives first so they can't be rescued as false positives below.
      CheckProfanity(input, censored, _falseNegatives, replaceWith);

      // Get false positives (taken before the censored words are masked, so their characters are still intact).
      var falsePositives = FindFalsePositives(censored, replaceWith);

      // Remove censored words
      CheckProfanity(input, censored, _censoredWords, replaceWith);

      // Reconstruct false positives
      for (var i = 0; i < falsePositives.Length; i++)
      {
          if (falsePositives[i] != replaceWith)
          {
              censored[i] = falsePositives[i];
          }
      }

      // SanitizeInput collapses "()" into a single "o" whenever either sanitizer is enabled.
      var collapsesParentheses = _shouldSanitizeLeetspeak || _shouldSanitizeSpecialCharacters;

      // Walk the original text and re-align the censored list with it, one original character at a time.
      for (var i = 0; i < originalInput.Length; i++)
      {
          var original = originalInput[i];

          // Spaces were dropped by SanitizeInput; put them back.
          if (original == ' ')
          {
              censored.Insert(i, ' ');
              continue;
          }

          // "()" occupies a single slot (the "o") in the censored list. This has to be handled before the
          // special-character check: '(' and ')' are special characters themselves, and re-inserting them
          // without consuming the "o" slot would leave a stray character and misalign everything after it.
          if (collapsesParentheses
              && original == '('
              && i + 1 < originalInput.Length
              && originalInput[i + 1] == ')')
          {
              if (censored[i] == replaceWith)
              {
                  censored.Insert(i + 1, replaceWith);
              }
              else
              {
                  censored[i] = '(';
                  censored.Insert(i + 1, ')');
              }

              // Both characters of the pair are now in place.
              i++;
              continue;
          }

          // Special characters were dropped by SanitizeInput; put them back.
          if (_shouldSanitizeSpecialCharacters && _specialCharacterReplacements.Contains(original))
          {
              censored.Insert(i, original);
              continue;
          }

          // Everything else maps one-to-one: restore the original spelling unless the slot was censored.
          if (censored[i] != replaceWith)
          {
              censored[i] = original;
          }
      }

      // SO says this is fast...
      return string.Concat(censored);
  }
  /// <summary>
  /// Enables l33tsp34k sanitization and (re)builds the built-in character replacement tables.
  /// </summary>
  /// <returns>The censor for further configuration</returns>
  public SimpleCensor WithSanitizeLeetSpeak()
  {
      _shouldSanitizeLeetspeak = true;

      var configured = BuildCharacterReplacements();

      return configured;
  }
  /// <summary>
  /// Enables special character sanitization and (re)builds the built-in character replacement tables.
  /// </summary>
  /// <returns>The censor for further configuration</returns>
  public SimpleCensor WithSanitizeSpecialCharacters()
  {
      _shouldSanitizeSpecialCharacters = true;

      var configured = BuildCharacterReplacements();

      return configured;
  }
  /// <summary>
  /// Sets the unicode ranges that are allowed through the censor. An empty array disables range filtering.
  /// </summary>
  /// <param name="ranges">The allowed unicode ranges</param>
  /// <returns>The censor for further configuration</returns>
  public SimpleCensor WithRanges(UnicodeRange[] ranges)
  {
      var censor = this;
      censor._allowedUnicodeRanges = ranges;

      return censor;
  }
  /// <summary>
  /// Sets the list of words to censor.
  /// </summary>
  /// <param name="naughtyWords">The words to censor</param>
  /// <returns>The censor for further configuration</returns>
  public SimpleCensor WithCustomDictionary(string[] naughtyWords)
  {
      var censor = this;
      censor._censoredWords = naughtyWords;

      return censor;
  }
  /// <summary>
  /// Sets the list of false positives: text that contains a censored word but should be left alone.
  /// </summary>
  /// <param name="falsePositives">The false positives</param>
  /// <returns>The censor for further configuration</returns>
  public SimpleCensor WithFalsePositives(string[] falsePositives)
  {
      var censor = this;
      censor._falsePositives = falsePositives;

      return censor;
  }
  /// <summary>
  /// Sets the list of false negatives: text that is masked before false positives are looked up.
  /// </summary>
  /// <param name="falseNegatives">The false negatives</param>
  /// <returns>The censor for further configuration</returns>
  public SimpleCensor WithFalseNegatives(string[] falseNegatives)
  {
      var censor = this;
      censor._falseNegatives = falseNegatives;

      return censor;
  }
  /// <summary>
  /// Replaces the leetspeak substitution map with a frozen copy of <paramref name="replacements"/>.
  /// </summary>
  /// <remarks>
  /// This only stores the map. Leetspeak sanitization itself is switched on by
  /// <see cref="WithSanitizeLeetSpeak"/>, which also rebuilds the built-in map and so overwrites a map set earlier.
  /// </remarks>
  /// <param name="replacements">Symbol to letter substitutions</param>
  /// <returns>The censor for further configuration</returns>
  public SimpleCensor WithLeetspeakReplacements(Dictionary<char, char> replacements)
  {
      var frozen = replacements.ToFrozenDictionary();
      _leetspeakReplacements = frozen;

      return this;
  }
  /// <summary>
  /// Replaces the set of special characters that are stripped from the input before matching.
  /// </summary>
  /// <remarks>
  /// Special characters are dropped rather than substituted, so only the keys of
  /// <paramref name="replacements"/> are used; the values are ignored. This only stores the set. Stripping itself
  /// is switched on by <see cref="WithSanitizeSpecialCharacters"/>, which also rebuilds the built-in set and so
  /// overwrites a set stored earlier.
  /// </remarks>
  /// <param name="replacements">Dictionary whose keys are the special characters to strip</param>
  /// <returns>The censor for further configuration</returns>
  public SimpleCensor WithSpecialCharacterReplacements(Dictionary<char, char> replacements)
  {
      // Previously this assigned _leetspeakReplacements (copy-paste from WithLeetspeakReplacements), which
      // clobbered the leetspeak map and never configured the special characters at all.
      _specialCharacterReplacements = new HashSet<char>(replacements.Keys);

      return this;
  }
  /// <summary>
  /// Masks every case-insensitive occurrence of each word in <paramref name="words"/>.
  /// </summary>
  /// <param name="input">The text to search. It is not modified.</param>
  /// <param name="censored">
  /// Character list that is written to at the same indices as the matches found in <paramref name="input"/>;
  /// it must be at least as long as <paramref name="input"/>.
  /// </param>
  /// <param name="words">The words to look for. Null or empty entries are ignored.</param>
  /// <param name="replaceWith">The character written over matched positions</param>
  /// <returns><paramref name="input"/>, unchanged</returns>
  private string CheckProfanity(string input, List<char> censored, string[] words, char replaceWith = '*')
  {
      foreach (var word in words)
      {
          // An empty word "matches" at the search start without ever advancing it, which would spin forever.
          if (string.IsNullOrEmpty(word))
          {
              continue;
          }

          var foundIndex = input.IndexOf(word, StringComparison.OrdinalIgnoreCase);

          while (foundIndex >= 0)
          {
              var endOfFoundWord = foundIndex + word.Length;

              for (var i = foundIndex; i < endOfFoundWord; i++)
              {
                  censored[i] = replaceWith;
              }

              // Resume after the match, so occurrences never overlap.
              foundIndex = input.IndexOf(word, endOfFoundWord, StringComparison.OrdinalIgnoreCase);
          }
      }

      return input;
  }
  /// <summary>
  /// Builds a mask of the false positives found in <paramref name="chars"/>.
  /// </summary>
  /// <param name="chars">The characters to search</param>
  /// <param name="replaceWith">The filler character for positions that are not part of a false positive</param>
  /// <returns>
  /// An array as long as <paramref name="chars"/>: positions covered by a case-insensitive match of a false
  /// positive hold the character from <paramref name="chars"/>, all other positions hold
  /// <paramref name="replaceWith"/>.
  /// </returns>
  private char[] FindFalsePositives(List<char> chars, char replaceWith = '*')
  {
      var input = string.Concat(chars);

      var output = new char[input.Length];
      Array.Fill(output, replaceWith);

      foreach (var word in _falsePositives)
      {
          // An empty word "matches" at the search start without ever advancing it, which would spin forever.
          if (string.IsNullOrEmpty(word))
          {
              continue;
          }

          var foundIndex = input.IndexOf(word, StringComparison.OrdinalIgnoreCase);

          while (foundIndex >= 0)
          {
              var endOfFoundWord = foundIndex + word.Length;

              for (var i = foundIndex; i < endOfFoundWord; i++)
              {
                  output[i] = input[i];
              }

              // Resume after the match, so occurrences never overlap.
              foundIndex = input.IndexOf(word, endOfFoundWord, StringComparison.OrdinalIgnoreCase);
          }
      }

      return output;
  }
  /// <summary>
  /// Produces the normalised text that the word lists are matched against.
  /// </summary>
  /// <remarks>
  /// Spaces are always dropped. With special character sanitization on, special characters are dropped too.
  /// With leetspeak sanitization on, mapped symbols are swapped for their letter. With either on, "()" is
  /// first collapsed to "o".
  /// </remarks>
  /// <param name="input">The text to sanitize</param>
  /// <returns>The sanitized text</returns>
  private string SanitizeInput(string input)
  {
      // "()" is a broad enough trick to beat censors that we should check for it broadly.
      var collapseParentheses = _shouldSanitizeLeetspeak || _shouldSanitizeSpecialCharacters;
      if (collapseParentheses)
      {
          input = input.Replace("()", "o");
      }

      var sanitized = new StringBuilder();

      foreach (var c in input)
      {
          var isSpecial = _shouldSanitizeSpecialCharacters && _specialCharacterReplacements.Contains(c);
          if (c == ' ' || isSpecial)
          {
              continue;
          }

          if (_shouldSanitizeLeetspeak && _leetspeakReplacements.TryGetValue(c, out var substitute))
          {
              sanitized.Append(substitute);
          }
          else
          {
              sanitized.Append(c);
          }
      }

      return sanitized.ToString();
  }
  /// <summary>
  /// Returns the input with every rune that is outside all of the allowed unicode ranges removed.
  /// If no ranges are configured the input is returned unchanged.
  /// </summary>
  private string SanitizeOutBlockedUnicode(string input)
  {
      if (_allowedUnicodeRanges.Length <= 0)
      {
          return input;
      }

      var kept = new StringBuilder();

      foreach (var rune in input.EnumerateRunes())
      {
          if (IsAllowed(rune.Value))
          {
              kept.Append(rune);
          }
      }

      return kept.ToString();

      // True when the code point lies inside at least one allowed range (the end of a range is exclusive).
      bool IsAllowed(int codePoint)
      {
          foreach (var range in _allowedUnicodeRanges)
          {
              var start = range.FirstCodePoint;
              if (codePoint >= start && codePoint < start + range.Length)
              {
                  return true;
              }
          }

          return false;
      }
  }
  /// <summary>
  /// (Re)builds the built-in replacement tables for whichever sanitizers are currently enabled.
  /// Tables that belong to a disabled sanitizer are left as they are.
  /// </summary>
  /// <returns>The censor for further configuration</returns>
  private SimpleCensor BuildCharacterReplacements()
  {
      if (_shouldSanitizeLeetspeak)
      {
          var leetspeak = new Dictionary<char, char>
          {
              { '4', 'a' },
              { '$', 's' },
              { '!', 'i' },
              { '+', 't' },
              { '#', 'h' },
              { '@', 'a' },
              { '0', 'o' },
              { '1', 'i' }, // also obviously can be l; gamer-words need i's more though.
              { '7', 'l' },
              { '3', 'e' },
              { '5', 's' },
              { '9', 'g' },
              { '<', 'c' },
          };

          _leetspeakReplacements = leetspeak.ToFrozenDictionary();
      }

      if (_shouldSanitizeSpecialCharacters)
      {
          _specialCharacterReplacements = new HashSet<char>
          {
              '-', '_', '|', '.', ',', '(', ')',
              '<', '>', '"', '`', '~', '*', '&',
              '%', '$', '#', '@', '!', '?', '+',
          };
      }

      return this;
  }
  273. }