From 6f4ea88e9570654d75153ab613ffae2176b495bd Mon Sep 17 00:00:00 2001
From: Arjun Satarkar
Date: Wed, 6 Mar 2024 19:51:30 -0500
Subject: markov: normalize message content better

We now do NFKC normalization and replace U+2019 with the normal ASCII single quote.
---
 markov/markov.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/markov/markov.py b/markov/markov.py
index ec5acbd..a8b749c 100644
--- a/markov/markov.py
+++ b/markov/markov.py
@@ -6,6 +6,7 @@ from redbot.core import commands
 import math
 import random
 import re
+import unicodedata
 from .errors import *
 
 MAX_BLACKLISTED_STRINGS_PER_GUILD = 50
@@ -50,6 +51,10 @@ class Markov(commands.Cog):
         )
 
     async def process_message(self, clean_content: str, guild_id: int, member_id: int):
+        # Normalize
+        clean_content = unicodedata.normalize("NFKC", clean_content)
+        clean_content = clean_content.replace("’", "'")
+
         # Ignore messages with blacklisted strings
         for blacklisted_string in await self.config.guild_from_id(
             guild_id
-- 
cgit v1.2.3-57-g22cb