18 lines
547 B
Python
18 lines
547 B
Python
import tiktoken
|
|
from src.config import ENCODING_FOR_MODEL, ENCODING
|
|
|
|
# Look up the encoding tiktoken associates with ENCODING_FOR_MODEL. The return
# value is discarded, so this call only matters for whatever it does at import
# time.
# NOTE(review): presumably kept to validate the configured model name or to
# warm tiktoken's encoding cache — confirm before removing.
tiktoken.encoding_for_model(ENCODING_FOR_MODEL)

# Module-level encoding, built once at import and used by tiktoken_len below.
tokenizer = tiktoken.get_encoding(ENCODING)
|
|
|
|
def tiktoken_len(text: str) -> int:
    """Count the tokens in ``text`` using the module-level ``tokenizer``.

    Args:
        text: The string to tokenize.

    Returns:
        The number of tokens ``tokenizer.encode`` produces for ``text``.
    """
    # An empty ``disallowed_special`` turns off tiktoken's special-token check,
    # so special-token literals in ``text`` are tokenized as plain text.
    return len(tokenizer.encode(text, disallowed_special=()))
|
|
|
|
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Return the number of tokens in ``string`` under the named encoding.

    Args:
        string: The text to tokenize.
        encoding_name: Name of a tiktoken encoding; passed unchanged to
            ``tiktoken.get_encoding``.

    Returns:
        The length of the token sequence produced for ``string``.

    Any error raised by ``tiktoken.get_encoding`` for ``encoding_name``
    propagates to the caller.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    # Match tiktoken_len in this module: with the default
    # ``disallowed_special="all"``, encode() raises ValueError as soon as the
    # text contains a special-token literal (e.g. "<|endoftext|>"). Counting
    # tokens of arbitrary text should not fail on that, so the literal is
    # tokenized as ordinary text instead.
    return len(encoding.encode(string, disallowed_special=()))