about summary refs log tree commit diff
diff options
context:
space:
mode:
author Arjun Satarkar <me@arjunsatarkar.net> 2024-03-07 00:33:11 +0000
committer Arjun Satarkar <me@arjunsatarkar.net> 2024-03-07 00:33:11 +0000
commit 80a31686aa0b3265c1d3299d4b5f6173a3bb7096 (patch)
tree 5191cf572988a44e4e06b06cb840b3c56fc15e8e
parent 2f110607f7108451e3fc2c1c9f7df58217bc1cb4 (diff)
download aps-cogs-80a31686aa0b3265c1d3299d4b5f6173a3bb7096.tar
aps-cogs-80a31686aa0b3265c1d3299d4b5f6173a3bb7096.tar.gz
aps-cogs-80a31686aa0b3265c1d3299d4b5f6173a3bb7096.zip
markov: tokenize custom emoji as one token
-rw-r--r-- markov/markov.py 13
1 files changed, 7 insertions, 6 deletions
diff --git a/markov/markov.py b/markov/markov.py
index 2a37a6c..ec5acbd 100644
--- a/markov/markov.py
+++ b/markov/markov.py
@@ -10,7 +10,7 @@ from .errors import *
MAX_BLACKLISTED_STRINGS_PER_GUILD = 50
MAX_TOKEN_GENERATION_ITERATIONS = 1000
-MAX_WORD_LENGTH = 50
+MAX_TOKEN_LENGTH = 70
class Markov(commands.Cog):
@@ -60,14 +60,15 @@ class Markov(commands.Cog):
# Strip out URL-esque patterns - a run of characters without spaces that contains '://' within it
clean_content = re.sub(r"(?: |^)\w+:\/\/[^ ]+(?: |$)", " ", clean_content)
- # Extract words and punctuation, normalize to lowercase, add sentinel (empty string) on either end
- # NOTE: if changing the punctuation in the regex, also changing PUNCTUATION in generate()
+ # Extract words, punctuation, and custom emoji as individual
+ # tokens, then add sentinel (empty string) on either end.
+ # NOTE: if changing the punctuation in the regex, also change PUNCTUATION in generate()
tokens = (
[""]
+ [
- word
- for word in re.findall(r"[\w']+|[\.,!?\/]", clean_content)
- if len(word) <= MAX_WORD_LENGTH
+ token
+ for token in re.findall(r"[\w']+|[\.,!?\/]|<:\w+:\d+>", clean_content)
+ if len(token) <= MAX_TOKEN_LENGTH
]
+ [""]
)