From 80a31686aa0b3265c1d3299d4b5f6173a3bb7096 Mon Sep 17 00:00:00 2001 From: Arjun Satarkar Date: Wed, 6 Mar 2024 19:33:11 -0500 Subject: markov: tokenize custom emoji as one token --- markov/markov.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/markov/markov.py b/markov/markov.py index 2a37a6c..ec5acbd 100644 --- a/markov/markov.py +++ b/markov/markov.py @@ -10,7 +10,7 @@ from .errors import * MAX_BLACKLISTED_STRINGS_PER_GUILD = 50 MAX_TOKEN_GENERATION_ITERATIONS = 1000 -MAX_WORD_LENGTH = 50 +MAX_TOKEN_LENGTH = 70 class Markov(commands.Cog): @@ -60,14 +60,15 @@ class Markov(commands.Cog): # Strip out URL-esque patterns - a run of characters without spaces that contains '://' within it clean_content = re.sub(r"(?: |^)\w+:\/\/[^ ]+(?: |$)", " ", clean_content) - # Extract words and punctuation, normalize to lowercase, add sentinel (empty string) on either end - # NOTE: if changing the punctuation in the regex, also changing PUNCTUATION in generate() + # Extract words, punctuation, and custom emoji as individual + # tokens, then add sentinel (empty string) on either end. + # NOTE: if changing the punctuation in the regex, also change PUNCTUATION in generate() tokens = ( [""] + [ - word - for word in re.findall(r"[\w']+|[\.,!?\/]", clean_content) - if len(word) <= MAX_WORD_LENGTH + token + for token in re.findall(r"[\w']+|[\.,!?\/]|<:\w+:\d+>", clean_content) + if len(token) <= MAX_TOKEN_LENGTH ] + [""] ) -- cgit v1.2.3-57-g22cb