def chunk_by_token_count(
    content: str,
    overlap_token_size: int = 128,
    max_token_size=1024,
    encoder=None,
) -> List[Dict[str, Any]]:
    """
    Split ``content`` into overlapping chunks bounded by a token budget.

    The text is tokenized once with ``encoder``; a window of
    ``max_token_size`` tokens then slides over the token list, advancing by
    ``max_token_size - overlap_token_size`` tokens per step. Each window is
    decoded back to text.

    Args:
        content: The input text content to be chunked.
        overlap_token_size: Number of tokens shared between consecutive
            chunks. Must satisfy ``0 <= overlap_token_size < max_token_size``.
        max_token_size: Maximum number of tokens per chunk. When ``None``,
            the whole content is returned as a single chunk and
            ``overlap_token_size`` is ignored.
        encoder: Tokenizer object exposing ``encode(str)`` and
            ``decode(tokens)`` (e.g., a tiktoken encoder). Required, despite
            the ``None`` default kept for signature compatibility.

    Returns:
        List of chunk dicts with keys: 'num_tokens', 'content', 'chunk_order'.
        Empty content (zero tokens) yields an empty list unless
        ``max_token_size`` is ``None``.

    Raises:
        ValueError: If ``encoder`` is ``None``, if ``max_token_size`` is not
            positive, or if ``overlap_token_size`` is negative or not
            strictly smaller than ``max_token_size``.
    """
    if encoder is None:
        raise ValueError("encoder must be provided (got None)")

    tokens = encoder.encode(content)
    if max_token_size is None:
        return [{"num_tokens": len(tokens), "content": content, "chunk_order": 0}]

    if max_token_size <= 0:
        raise ValueError(
            f"max_token_size must be positive, got {max_token_size}"
        )
    # A zero stride makes range() raise, a negative stride silently yields no
    # chunks, and a stride larger than the window skips tokens. Reject all
    # three up front so no content is dropped without notice.
    if not 0 <= overlap_token_size < max_token_size:
        raise ValueError(
            "overlap_token_size must be >= 0 and smaller than max_token_size, "
            f"got overlap_token_size={overlap_token_size}, "
            f"max_token_size={max_token_size}"
        )

    stride = max_token_size - overlap_token_size
    results = []
    for index, start in enumerate(range(0, len(tokens), stride)):
        # Slicing clamps at the end of the list, so the final chunk may be
        # shorter than max_token_size.
        chunk_tokens = tokens[start : start + max_token_size]
        results.append({
            "num_tokens": len(chunk_tokens),
            "content": encoder.decode(chunk_tokens),
            "chunk_order": index,
        })
    return results