Source code for fst_runtime.tokenize_input
"""
This module holds a tokenization function that splits an input string into its constituent parts,
while considering the set of provided multi-character symbols.
Attributes
----------
tokenize_input_string : function
Tokenizes the input string while respecting the multichar_symbols.
"""
from fst_runtime import logger
[docs]
def tokenize_input_string(input_string: str, multichar_symbols: set[str]) -> list[str]:
    """
    Returns a list containing the individual tokens that make up the ``input_string``.

    Parameters
    ----------
    input_string : str
        The input string to be tokenized.
    multichar_symbols : set[str]
        A set of multi-character symbols that need to be recognized as single tokens.
        Empty strings in this set are ignored.

    Returns
    -------
    list[str]
        The tokens, in order of appearance. Concatenating them reproduces ``input_string``.
        An empty ``input_string`` yields an empty list.

    Note
    -----
    The input is scanned left to right. At each position the longest symbol from
    ``multichar_symbols`` that matches the upcoming characters is emitted as one token;
    if no symbol matches, the single character at that position is emitted instead.
    """

    # Distinct symbol lengths, longest first, so that the longest matching symbol wins.
    # Empty symbols are skipped: a zero-length match would never consume any input and
    # the scan below would never terminate.
    symbol_lengths = sorted({len(symbol) for symbol in multichar_symbols if symbol}, reverse=True)

    tokens: list[str] = []
    input_length = len(input_string)

    # Walk the input with an index instead of repeatedly re-slicing the remaining string,
    # which would copy the tail of the input for every token produced.
    position = 0

    while position < input_length:
        remaining = input_length - position

        for symbol_length in symbol_lengths:
            # Not enough input left for a symbol of this length. Slicing never raises
            # on a short string, so this has to be checked explicitly.
            if symbol_length > remaining:
                continue

            candidate = input_string[position:position + symbol_length]

            if candidate in multichar_symbols:
                token = candidate
                break
        else:
            # No multi-character symbol starts here, so the token is a single character.
            token = input_string[position]

        tokens.append(token)
        position += len(token)  # Consume input characters.

    logger.debug('_tokenize_input_string.tokens: %s', tokens)

    return tokens