diff options
author | Arjun Satarkar <me@arjunsatarkar.net> | 2024-03-07 00:33:11 +0000 |
---|---|---|
committer | Arjun Satarkar <me@arjunsatarkar.net> | 2024-03-07 00:33:11 +0000 |
commit | 80a31686aa0b3265c1d3299d4b5f6173a3bb7096 (patch) | |
tree | 5191cf572988a44e4e06b06cb840b3c56fc15e8e /markov | |
parent | 2f110607f7108451e3fc2c1c9f7df58217bc1cb4 (diff) | |
download | aps-cogs-80a31686aa0b3265c1d3299d4b5f6173a3bb7096.tar aps-cogs-80a31686aa0b3265c1d3299d4b5f6173a3bb7096.tar.gz aps-cogs-80a31686aa0b3265c1d3299d4b5f6173a3bb7096.zip |
markov: tokenize custom emoji as one token
Diffstat (limited to 'markov')
-rw-r--r-- | markov/markov.py | 13 |
1 files changed, 7 insertions, 6 deletions
diff --git a/markov/markov.py b/markov/markov.py
index 2a37a6c..ec5acbd 100644
--- a/markov/markov.py
+++ b/markov/markov.py
@@ -10,7 +10,7 @@ from .errors import *
 
 MAX_BLACKLISTED_STRINGS_PER_GUILD = 50
 MAX_TOKEN_GENERATION_ITERATIONS = 1000
-MAX_WORD_LENGTH = 50
+MAX_TOKEN_LENGTH = 70
 
 
 class Markov(commands.Cog):
@@ -60,14 +60,15 @@ class Markov(commands.Cog):
         # Strip out URL-esque patterns - a run of characters without spaces that contains '://' within it
         clean_content = re.sub(r"(?: |^)\w+:\/\/[^ ]+(?: |$)", " ", clean_content)
 
-        # Extract words and punctuation, normalize to lowercase, add sentinel (empty string) on either end
-        # NOTE: if changing the punctuation in the regex, also changing PUNCTUATION in generate()
+        # Extract words, punctuation, and custom emoji as individual
+        # tokens, then add sentinel (empty string) on either end.
+        # NOTE: if changing the punctuation in the regex, also change PUNCTUATION in generate()
         tokens = (
             [""]
             + [
-                word
-                for word in re.findall(r"[\w']+|[\.,!?\/]", clean_content)
-                if len(word) <= MAX_WORD_LENGTH
+                token
+                for token in re.findall(r"[\w']+|[\.,!?\/]|<:\w+:\d+>", clean_content)
+                if len(token) <= MAX_TOKEN_LENGTH
             ]
             + [""]
         )