2025-08-09 17:27:25 +08:00

18 lines
547 B
Python

import tiktoken
from src.config import ENCODING_FOR_MODEL, ENCODING
# The encoding returned by this call is discarded.
# NOTE(review): presumably this only validates ENCODING_FOR_MODEL or warms
# tiktoken's encoding cache at import time — confirm, or remove if unneeded.
tiktoken.encoding_for_model(ENCODING_FOR_MODEL)
# Module-level encoder created once at import time from the configured
# ENCODING name; tiktoken_len() reuses it on every call.
tokenizer = tiktoken.get_encoding(ENCODING)
def tiktoken_len(text: str) -> int:
    """Return how many tokens the module-level ``tokenizer`` produces for ``text``.

    The text is encoded with ``disallowed_special=()``, which turns off
    tiktoken's special-token check, so text that happens to contain a
    special-token string is tokenized as ordinary text instead of raising.

    Args:
        text: The string to tokenize.

    Returns:
        The number of tokens in the encoded text.
    """
    return len(tokenizer.encode(text, disallowed_special=()))
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens in ``string`` under the named tiktoken encoding.

    Args:
        string: The text to tokenize.
        encoding_name: Encoding name handed to ``tiktoken.get_encoding``.

    Returns:
        The length of the token sequence produced by encoding ``string``
        with the encoder's default settings.
    """
    return len(tiktoken.get_encoding(encoding_name).encode(string))