Source code for fst_runtime.tokenize_input

"""
This module holds a tokenization function that splits an input string into its constituent parts,
while considering the set of provided multi-character symbols.

Attributes
----------
tokenize_input_string : function
    Tokenizes the input string while respecting the ``multichar_symbols``.
"""

from fst_runtime import logger

def tokenize_input_string(input_string: str, multichar_symbols: set[str]) -> list[str]:
    """
    Returns a list containing the individual tokens that make up the ``input_string``.

    Parameters
    ----------
    input_string : str
        The input string to be tokenized.
    multichar_symbols : set[str]
        A set of multi-character symbols that need to be recognized as single tokens.

    Returns
    -------
    list[str]
        A list of individual tokens that make up the input string.

    Note
    ----
    This function tokenizes the input string into individual tokens, taking into account the
    multi-character symbols specified in the ``multichar_symbols`` set. It ensures that the
    multi-character symbols are recognized as single tokens rather than being split into
    multiple tokens.
    """

    # Collect the character lengths of all the multi-character symbols and sort them from
    # longest to shortest, so that longer symbols are matched before shorter ones.
    # Note that the lengths are distinct because a set comprehension is used.
    multichar_lengths = list({len(symbol) for symbol in multichar_symbols})
    multichar_lengths.sort(reverse=True)

    tokens = []

    # Loop until all input characters are consumed.
    while len(input_string) > 0:

        # This flag is used to continue the outer loop from inside the nested loop.
        should_continue_while = False

        # If any multi-character symbols exist in the FST, then loop over the lengths. For each
        # length, take a slice of the current input from the start up to that length, then check
        # whether that slice exists in the set of multi-character symbols for the FST. If it does,
        # add it as a token, consume the input characters, and continue the outer loop. If it
        # doesn't, continue looping through the remaining lengths. If nothing matches, the token
        # found is a single character. This continues until the whole input has been consumed.
        if multichar_lengths:
            for symbol_length in multichar_lengths:

                # Not enough input left for a symbol of this length.
                if symbol_length > len(input_string):
                    continue

                substring = input_string[:symbol_length]

                if substring in multichar_symbols:
                    tokens.append(substring)
                    input_string = input_string.removeprefix(substring)  # Consume input characters.
                    should_continue_while = True
                    break  # Continue the outer loop from the nested loop.

        if should_continue_while:
            continue

        tokens.append(input_string[0])
        input_string = input_string[1:]  # Consume input characters.

    logger.debug('tokenize_input_string.tokens: %s', tokens)

    return tokens
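
For illustration, here is a minimal usage sketch (not part of the module source). The
multi-character symbols below, such as the tag ``+PLURAL``, are hypothetical examples chosen
to show the longest-match-first behavior:

from fst_runtime.tokenize_input import tokenize_input_string

# Hypothetical symbol set: one morphological tag and one digraph.
multichar_symbols = {'+PLURAL', 'sh'}

tokens = tokenize_input_string('wish+PLURAL', multichar_symbols)

# 'sh' and '+PLURAL' are kept as single tokens rather than split apart:
# ['w', 'i', 'sh', '+PLURAL']
print(tokens)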