markov: tokenize custom emoji as one token

author: Arjun Satarkar <me@arjunsatarkar.net> 2024-03-07 00:33:11 +0000
committer: Arjun Satarkar <me@arjunsatarkar.net> 2024-03-07 00:33:11 +0000
commit: 80a31686aa0b3265c1d3299d4b5f6173a3bb7096 (patch)
tree: 5191cf572988a44e4e06b06cb840b3c56fc15e8e
parent: 2f110607f7108451e3fc2c1c9f7df58217bc1cb4 (diff)
download: aps-cogs-80a31686aa0b3265c1d3299d4b5f6173a3bb7096.tar aps-cogs-80a31686aa0b3265c1d3299d4b5f6173a3bb7096.tar.gz aps-cogs-80a31686aa0b3265c1d3299d4b5f6173a3bb7096.zip
1 file changed, 7 insertions, 6 deletions
diff --git a/markov/markov.py b/markov/markov.py
index 2a37a6c..ec5acbd 100644
--- a/markov/markov.py
+++ b/markov/markov.py
@@ -10,7 +10,7 @@ from .errors import *
 
 MAX_BLACKLISTED_STRINGS_PER_GUILD = 50
 MAX_TOKEN_GENERATION_ITERATIONS = 1000
-MAX_WORD_LENGTH = 50
+MAX_TOKEN_LENGTH = 70
 
 
 class Markov(commands.Cog):
@@ -60,14 +60,15 @@ class Markov(commands.Cog):
         # Strip out URL-esque patterns - a run of characters without spaces that contains '://' within it
         clean_content = re.sub(r"(?: |^)\w+:\/\/[^ ]+(?: |$)", " ", clean_content)
 
-        # Extract words and punctuation, normalize to lowercase, add sentinel (empty string) on either end
-        # NOTE: if changing the punctuation in the regex, also changing PUNCTUATION in generate()
+        # Extract words, punctuation, and custom emoji as individual
+        # tokens, then add sentinel (empty string) on either end.
+        # NOTE: if changing the punctuation in the regex, also change PUNCTUATION in generate()
         tokens = (
             [""]
             + [
-                word
-                for word in re.findall(r"[\w']+|[\.,!?\/]", clean_content)
-                if len(word) <= MAX_WORD_LENGTH
+                token
+                for token in re.findall(r"[\w']+|[\.,!?\/]|<:\w+:\d+>", clean_content)
+                if len(token) <= MAX_TOKEN_LENGTH
             ]
             + [""]
         )
author	Arjun Satarkar <me@arjunsatarkar.net>	2024-03-07 00:33:11 +0000
committer	Arjun Satarkar <me@arjunsatarkar.net>	2024-03-07 00:33:11 +0000
commit	80a31686aa0b3265c1d3299d4b5f6173a3bb7096 (patch)
tree	5191cf572988a44e4e06b06cb840b3c56fc15e8e
parent	2f110607f7108451e3fc2c1c9f7df58217bc1cb4 (diff)
download	aps-cogs-80a31686aa0b3265c1d3299d4b5f6173a3bb7096.tar aps-cogs-80a31686aa0b3265c1d3299d4b5f6173a3bb7096.tar.gz aps-cogs-80a31686aa0b3265c1d3299d4b5f6173a3bb7096.zip