mirror of
https://github.com/docling-project/docling.git
synced 2026-05-17 13:10:38 +00:00
3a64f41af8
Signed-off-by: anish.raghavendra <anish.raghavendra@ibm.com> Co-authored-by: anish.raghavendra <anish.raghavendra@ibm.com>
642 lines
21 KiB
Plaintext
Vendored
642 lines
21 KiB
Plaintext
Vendored
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Line-Based Token Chunking\n",
|
|
"## Overview\n",
|
|
"The `LineBasedTokenChunker` is a tokenization-aware chunker that preserves line boundaries. It's particularly useful for structured content like tables, code, or logs where line boundaries are semantically important.\n",
|
|
"\n",
|
|
"Key features:\n",
|
|
"- **Line preservation**: Keeps entire lines within a single chunk when possible\n",
|
|
"- **Prefix support**: Add repeated context (e.g., table headers) to each chunk\n",
|
|
"- **Overflow handling**: Choose between splitting a line or omitting the prefix when the line does not fit in a chunk together with the prefix"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Setup"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Note: you may need to restart the kernel to use updated packages.\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"%pip install -qU pip docling transformers"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from docling_core.transforms.chunker.line_chunker import LineBasedTokenChunker\n",
|
|
"from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer\n",
|
|
"from transformers import AutoTokenizer"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Example 1: Basic Table Chunking with Prefix\n",
|
|
"\n",
|
|
"In this example, we'll chunk a table while repeating the header in each chunk."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Max tokens: 50\n",
|
|
"Prefix token count: 34\n",
|
|
"\n",
|
|
"Total chunks: 3\n",
|
|
"\n",
|
|
"=== Chunk 1 ===\n",
|
|
"| Name | Age | Department |\n",
|
|
"|------|-----|------------|\n",
|
|
"| Alice | 30 | Engineering |\n",
|
|
"| Bob | 25 | Marketing |\n",
|
|
"\n",
|
|
"Tokens: 48\n",
|
|
"\n",
|
|
"=== Chunk 2 ===\n",
|
|
"| Name | Age | Department |\n",
|
|
"|------|-----|------------|\n",
|
|
"| Charlie | 35 | Sales |\n",
|
|
"| Diana | 28 | HR |\n",
|
|
"\n",
|
|
"Tokens: 48\n",
|
|
"\n",
|
|
"=== Chunk 3 ===\n",
|
|
"| Name | Age | Department |\n",
|
|
"|------|-----|------------|\n",
|
|
"| Eve | 32 | Finance |\n",
|
|
"\n",
|
|
"Tokens: 41\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Setup tokenizer with a reasonable token limit\n",
|
|
"tokenizer = HuggingFaceTokenizer(\n",
|
|
" tokenizer=AutoTokenizer.from_pretrained(\"sentence-transformers/all-MiniLM-L6-v2\"),\n",
|
|
" max_tokens=50, # Small limit to demonstrate chunking\n",
|
|
")\n",
|
|
"\n",
|
|
"# Create chunker with table header prefix\n",
|
|
"chunker = LineBasedTokenChunker(\n",
|
|
" tokenizer=tokenizer,\n",
|
|
" prefix=\"| Name | Age | Department |\\n|------|-----|------------|\\n\",\n",
|
|
" omit_prefix_on_overflow=False, # Always include prefix (default)\n",
|
|
")\n",
|
|
"\n",
|
|
"# Sample table rows\n",
|
|
"lines = [\n",
|
|
" \"| Alice | 30 | Engineering |\\n\",\n",
|
|
" \"| Bob | 25 | Marketing |\\n\",\n",
|
|
" \"| Charlie | 35 | Sales |\\n\",\n",
|
|
" \"| Diana | 28 | HR |\\n\",\n",
|
|
" \"| Eve | 32 | Finance |\\n\",\n",
|
|
"]\n",
|
|
"\n",
|
|
"print(f\"Max tokens: {chunker.max_tokens}\")\n",
|
|
"print(f\"Prefix token count: {chunker.prefix_len}\\n\")\n",
|
|
"\n",
|
|
"chunks = chunker.chunk_text(lines)\n",
|
|
"\n",
|
|
"print(f\"Total chunks: {len(chunks)}\\n\")\n",
|
|
"for i, chunk in enumerate(chunks, 1):\n",
|
|
" print(f\"=== Chunk {i} ===\")\n",
|
|
" print(chunk)\n",
|
|
" print(f\"Tokens: {tokenizer.count_tokens(chunk)}\\n\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Example 2: Handling Wide Tables with `omit_prefix_on_overflow`\n",
|
|
"\n",
|
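|
"When working with wide tables, some rows might fit within the token limit on their own but not together with the header. The `omit_prefix_on_overflow` parameter controls what happens in these cases: with `False` (the default) the prefix is always included, which may require splitting long lines; with `True` the prefix is omitted for lines that would otherwise overflow, so the line stays intact."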
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Prefix token count: 47\n",
|
|
"Max tokens: 30\n",
|
|
"\n",
|
|
"Token counts:\n",
|
|
" Line 1: 11 tokens (with prefix: 58 tokens)\n",
|
|
" Line 2: 11 tokens (with prefix: 58 tokens)\n",
|
|
" Line 3: 17 tokens (with prefix: 64 tokens)\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Setup tokenizer with a very small token limit\n",
|
|
"tokenizer = HuggingFaceTokenizer(\n",
|
|
" tokenizer=AutoTokenizer.from_pretrained(\"sentence-transformers/all-MiniLM-L6-v2\"),\n",
|
|
" max_tokens=30, # Very small limit to force overflow\n",
|
|
")\n",
|
|
"\n",
|
|
"# Create chunker with a longer prefix\n",
|
|
"prefix = (\n",
|
|
" \"| Name | Age | Department | Location |\\n|------|-----|------------|----------|\\n\"\n",
|
|
")\n",
|
|
"\n",
|
|
"print(f\"Prefix token count: {tokenizer.count_tokens(prefix)}\")\n",
|
|
"print(f\"Max tokens: {tokenizer.get_max_tokens()}\\n\")\n",
|
|
"\n",
|
|
"# Sample lines - some will be too long with prefix\n",
|
|
"lines = [\n",
|
|
" \"| Alice Johnson | 30 | Engineering | San Francisco |\\n\",\n",
|
|
" \"| Bob Smith | 25 | Marketing | New York |\\n\",\n",
|
|
" \"| Charlie Brown with a very long name | 35 | Sales Department | Los Angeles |\\n\",\n",
|
|
"]\n",
|
|
"\n",
|
|
"# Check token counts for each line\n",
|
|
"print(\"Token counts:\")\n",
|
|
"for i, line in enumerate(lines, 1):\n",
|
|
" line_tokens = tokenizer.count_tokens(line)\n",
|
|
" with_prefix = line_tokens + tokenizer.count_tokens(prefix)\n",
|
|
" print(f\" Line {i}: {line_tokens} tokens (with prefix: {with_prefix} tokens)\")\n",
|
|
"print()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### With `omit_prefix_on_overflow=False` (default behavior)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"============================================================\n",
|
|
"WITHOUT omit_prefix_on_overflow (may split long lines)\n",
|
|
"============================================================\n",
|
|
"\n",
|
|
"Total chunks: 5\n",
|
|
"\n",
|
|
"--- Chunk 1 ---\n",
|
|
"\n",
|
|
"| Name | Age | Department | Location\n",
|
|
"Tokens: 8\n",
|
|
"Has prefix: False\n",
|
|
"\n",
|
|
"--- Chunk 2 ---\n",
|
|
"\n",
|
|
" |\n",
|
|
"|------|-----|------------|--\n",
|
|
"Tokens: 30\n",
|
|
"Has prefix: False\n",
|
|
"\n",
|
|
"--- Chunk 3 ---\n",
|
|
"--------|\n",
|
|
"\n",
|
|
"Tokens: 9\n",
|
|
"Has prefix: False\n",
|
|
"\n",
|
|
"--- Chunk 4 ---\n",
|
|
"| Alice Johnson | 30 | Engineering | San Francisco |\n",
|
|
"| Bob Smith | 25 | Marketing | New York |\n",
|
|
"\n",
|
|
"Tokens: 22\n",
|
|
"Has prefix: False\n",
|
|
"\n",
|
|
"--- Chunk 5 ---\n",
|
|
"| Charlie Brown with a very long name | 35 | Sales Department | Los Angeles |\n",
|
|
"\n",
|
|
"Tokens: 17\n",
|
|
"Has prefix: False\n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"/Users/anish/Desktop/Programs/docling/.venv/lib/python3.12/site-packages/docling_core/transforms/chunker/line_chunker.py:83: UserWarning: Chunks prefix is too long (47 tokens) for chunk size 30. It will be split into multiple chunks and only included in the first chunk(s). Consider increasing max_tokens to accommodate the full prefix in each chunk.\n",
|
|
" warnings.warn(\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"chunker_no_omit = LineBasedTokenChunker(\n",
|
|
" tokenizer=tokenizer,\n",
|
|
" prefix=prefix,\n",
|
|
" omit_prefix_on_overflow=False, # Default: always include prefix\n",
|
|
")\n",
|
|
"\n",
|
|
"chunks_no_omit = chunker_no_omit.chunk_text(lines)\n",
|
|
"\n",
|
|
"print(\"=\" * 60)\n",
|
|
"print(\"WITHOUT omit_prefix_on_overflow (may split long lines)\")\n",
|
|
"print(\"=\" * 60)\n",
|
|
"print(f\"\\nTotal chunks: {len(chunks_no_omit)}\\n\")\n",
|
|
"\n",
|
|
"for i, chunk in enumerate(chunks_no_omit, 1):\n",
|
|
" print(f\"--- Chunk {i} ---\")\n",
|
|
" print(chunk)\n",
|
|
" print(f\"Tokens: {tokenizer.count_tokens(chunk)}\")\n",
|
|
" print(f\"Has prefix: {chunk.startswith(prefix)}\\n\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
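|
"### With `omit_prefix_on_overflow=True`\n\nNote: in this example the prefix (47 tokens) is longer than `max_tokens` (30), so the prefix itself is split into separate leading chunks (see the warning in the previous cell's output) and the result below is identical to the `omit_prefix_on_overflow=False` run. To see the two settings diverge, try a `max_tokens` value large enough to hold the full prefix."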
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"============================================================\n",
|
|
"WITH omit_prefix_on_overflow (keeps lines intact)\n",
|
|
"============================================================\n",
|
|
"\n",
|
|
"Total chunks: 5\n",
|
|
"\n",
|
|
"--- Chunk 1 ---\n",
|
|
"\n",
|
|
"| Name | Age | Department | Location\n",
|
|
"Tokens: 8\n",
|
|
"Has prefix: False\n",
|
|
"\n",
|
|
"--- Chunk 2 ---\n",
|
|
"\n",
|
|
" |\n",
|
|
"|------|-----|------------|--\n",
|
|
"Tokens: 30\n",
|
|
"Has prefix: False\n",
|
|
"\n",
|
|
"--- Chunk 3 ---\n",
|
|
"--------|\n",
|
|
"\n",
|
|
"Tokens: 9\n",
|
|
"Has prefix: False\n",
|
|
"\n",
|
|
"--- Chunk 4 ---\n",
|
|
"| Alice Johnson | 30 | Engineering | San Francisco |\n",
|
|
"| Bob Smith | 25 | Marketing | New York |\n",
|
|
"\n",
|
|
"Tokens: 22\n",
|
|
"Has prefix: False\n",
|
|
"\n",
|
|
"--- Chunk 5 ---\n",
|
|
"| Charlie Brown with a very long name | 35 | Sales Department | Los Angeles |\n",
|
|
"\n",
|
|
"Tokens: 17\n",
|
|
"Has prefix: False\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"chunker_with_omit = LineBasedTokenChunker(\n",
|
|
" tokenizer=tokenizer,\n",
|
|
" prefix=prefix,\n",
|
|
" omit_prefix_on_overflow=True, # Omit prefix for lines that would overflow\n",
|
|
")\n",
|
|
"\n",
|
|
"chunks_with_omit = chunker_with_omit.chunk_text(lines)\n",
|
|
"\n",
|
|
"print(\"=\" * 60)\n",
|
|
"print(\"WITH omit_prefix_on_overflow (keeps lines intact)\")\n",
|
|
"print(\"=\" * 60)\n",
|
|
"print(f\"\\nTotal chunks: {len(chunks_with_omit)}\\n\")\n",
|
|
"\n",
|
|
"for i, chunk in enumerate(chunks_with_omit, 1):\n",
|
|
" print(f\"--- Chunk {i} ---\")\n",
|
|
" print(chunk)\n",
|
|
" print(f\"Tokens: {tokenizer.count_tokens(chunk)}\")\n",
|
|
" print(f\"Has prefix: {chunk.startswith(prefix)}\\n\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Example 3: Chunking a DoclingDocument\n",
|
|
"\n",
|
|
"The `LineBasedTokenChunker` can also be used directly with `DoclingDocument` objects via its `chunk()` method."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Total chunks: 11\n",
|
|
"\n",
|
|
"=== Chunk 1 ===\n",
|
|
"Text: # IBM\n",
|
|
"\n",
|
|
"International Business Machines Corporation (using the trademark IBM), nicknamed Big Blue, is an American multinational technology company headquartered in Armonk, New York and present in over ...\n",
|
|
"Tokens: 57\n",
|
|
"Doc items: 12\n",
|
|
"\n",
|
|
"=== Chunk 2 ===\n",
|
|
"Text: IBM is the largest industrial research organization in the world, with 19 research facilities across a dozen countries, having held the record for most annual U.S. patents generated by a business for ...\n",
|
|
"Tokens: 99\n",
|
|
"Doc items: 12\n",
|
|
"\n",
|
|
"=== Chunk 3 ===\n",
|
|
"Text: systems. During the 1960s and 1970s, the IBM mainframe, exemplified by the System/360, was the world's dominant computing platform, with the company producing 80 percent of computers in the U.S. and ...\n",
|
|
"Tokens: 100\n",
|
|
"Doc items: 12\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"from docling.document_converter import DocumentConverter\n",
|
|
"\n",
|
|
"# Convert a document\n",
|
|
"converter = DocumentConverter()\n",
|
|
"result = converter.convert(\"../../tests/data/md/wiki.md\")\n",
|
|
"doc = result.document\n",
|
|
"\n",
|
|
"# Create chunker\n",
|
|
"tokenizer = HuggingFaceTokenizer(\n",
|
|
" tokenizer=AutoTokenizer.from_pretrained(\"sentence-transformers/all-MiniLM-L6-v2\"),\n",
|
|
" max_tokens=100,\n",
|
|
")\n",
|
|
"\n",
|
|
"chunker = LineBasedTokenChunker(\n",
|
|
" tokenizer=tokenizer,\n",
|
|
" prefix=\"\", # No prefix for general documents\n",
|
|
")\n",
|
|
"\n",
|
|
"# Chunk the document\n",
|
|
"chunks = list(chunker.chunk(doc))\n",
|
|
"\n",
|
|
"print(f\"Total chunks: {len(chunks)}\\n\")\n",
|
|
"\n",
|
|
"# Display first few chunks\n",
|
|
"for i, chunk in enumerate(chunks[:3], 1):\n",
|
|
" print(f\"=== Chunk {i} ===\")\n",
|
|
" print(\n",
|
|
" f\"Text: {chunk.text[:200]}...\"\n",
|
|
" if len(chunk.text) > 200\n",
|
|
" else f\"Text: {chunk.text}\"\n",
|
|
" )\n",
|
|
" print(f\"Tokens: {tokenizer.count_tokens(chunk.text)}\")\n",
|
|
" print(f\"Doc items: {len(chunk.meta.doc_items)}\\n\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Example 4: Handling Large Prefixes\n",
|
|
"\n",
|
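|
"When a prefix exceeds the `max_tokens` limit, the chunker emits a warning, splits the prefix into multiple chunks, and includes those prefix chunks only once at the beginning of the output instead of repeating the prefix in every chunk."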
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Large prefix token count: 130 tokens\n",
|
|
"Max tokens allowed: 25 tokens\n",
|
|
"\n",
|
|
"⚠️ Warning issued:\n",
|
|
" Chunks prefix is too long (130 tokens) for chunk size 25. It will be split into multiple chunks and only included in the first chunk(s). Consider increasing max_tokens to accommodate the full prefix in each chunk.\n",
|
|
"\n",
|
|
"Number of prefix chunks: 6\n",
|
|
"Prefix len (for single chunk): 0\n",
|
|
"\n",
|
|
"Prefix chunks:\n",
|
|
" Chunk 1: 25 tokens\n",
|
|
" Content: \n",
|
|
"This is a very long table header that contains a lot of information This is a very long table heade...\n",
|
|
"\n",
|
|
" Chunk 2: 25 tokens\n",
|
|
" Content: \n",
|
|
" information This is a very long table header that contains a lot of information This is a very lon...\n",
|
|
"\n",
|
|
" Chunk 3: 25 tokens\n",
|
|
" Content: \n",
|
|
" of information This is a very long table header that contains a lot of information This is a very ...\n",
|
|
"\n",
|
|
" Chunk 4: 24 tokens\n",
|
|
" Content: \n",
|
|
" lot of information This is a very long table header that contains a lot of information This is a v...\n",
|
|
"\n",
|
|
" Chunk 5: 24 tokens\n",
|
|
" Content: \n",
|
|
" contains a lot of information This is a very long table header that contains a lot of information ...\n",
|
|
"\n",
|
|
" Chunk 6: 7 tokens\n",
|
|
" Content: header that contains a lot of information \n",
|
|
"\n",
|
|
"Total chunks (including prefix chunks): 7\n",
|
|
"Content chunks: 1\n",
|
|
"\n",
|
|
"Chunk 1 [PREFIX CHUNK]:\n",
|
|
" Content: \n",
|
|
"This is a very long table header that contains a lot of information This is a very long table heade...\n",
|
|
" Tokens: 25\n",
|
|
"\n",
|
|
"Chunk 2 [PREFIX CHUNK]:\n",
|
|
" Content: \n",
|
|
" information This is a very long table header that contains a lot of information This is a very lon...\n",
|
|
" Tokens: 25\n",
|
|
"\n",
|
|
"Chunk 3 [PREFIX CHUNK]:\n",
|
|
" Content: \n",
|
|
" of information This is a very long table header that contains a lot of information This is a very ...\n",
|
|
" Tokens: 25\n",
|
|
"\n",
|
|
"Chunk 4 [PREFIX CHUNK]:\n",
|
|
" Content: \n",
|
|
" lot of information This is a very long table header that contains a lot of information This is a v...\n",
|
|
" Tokens: 24\n",
|
|
"\n",
|
|
"Chunk 5 [PREFIX CHUNK]:\n",
|
|
" Content: \n",
|
|
" contains a lot of information This is a very long table header that contains a lot of information ...\n",
|
|
" Tokens: 24\n",
|
|
"\n",
|
|
"Chunk 6 [PREFIX CHUNK]:\n",
|
|
" Content: header that contains a lot of information \n",
|
|
" Tokens: 7\n",
|
|
"\n",
|
|
"Chunk 7 [CONTENT CHUNK]:\n",
|
|
" Content: Row 1: Some data here\n",
|
|
"Row 2: More data here\n",
|
|
"Row 3: Even more data\n",
|
|
"\n",
|
|
" Tokens: 18\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import warnings\n",
|
|
"\n",
|
|
"# Create a very long prefix that exceeds max_tokens\n",
|
|
"tokenizer = HuggingFaceTokenizer(\n",
|
|
" tokenizer=AutoTokenizer.from_pretrained(\"sentence-transformers/all-MiniLM-L6-v2\"),\n",
|
|
" max_tokens=25, # Small limit\n",
|
|
")\n",
|
|
"\n",
|
|
"large_prefix = (\n",
|
|
" \"This is a very long table header that contains a lot of information \" * 10\n",
|
|
")\n",
|
|
"\n",
|
|
"print(f\"Large prefix token count: {tokenizer.count_tokens(large_prefix)} tokens\")\n",
|
|
"print(f\"Max tokens allowed: {tokenizer.get_max_tokens()} tokens\\n\")\n",
|
|
"\n",
|
|
"# Create chunker with large prefix - will trigger warning\n",
|
|
"with warnings.catch_warnings(record=True) as w:\n",
|
|
" warnings.simplefilter(\"always\")\n",
|
|
"\n",
|
|
" chunker_large = LineBasedTokenChunker(\n",
|
|
" tokenizer=tokenizer,\n",
|
|
" prefix=large_prefix,\n",
|
|
" )\n",
|
|
"\n",
|
|
" if w:\n",
|
|
" print(\"⚠️ Warning issued:\")\n",
|
|
" print(f\" {w[0].message}\\n\")\n",
|
|
"\n",
|
|
"print(f\"Number of prefix chunks: {len(chunker_large.prefix_chunks)}\")\n",
|
|
"print(f\"Prefix len (for single chunk): {chunker_large.prefix_len}\\n\")\n",
|
|
"\n",
|
|
"# Show the prefix chunks\n",
|
|
"print(\"Prefix chunks:\")\n",
|
|
"for i, prefix_chunk in enumerate(chunker_large.prefix_chunks, 1):\n",
|
|
" preview = prefix_chunk[:100] + \"...\" if len(prefix_chunk) > 100 else prefix_chunk\n",
|
|
" print(f\" Chunk {i}: {tokenizer.count_tokens(prefix_chunk)} tokens\")\n",
|
|
" print(f\" Content: {preview}\\n\")\n",
|
|
"\n",
|
|
"# Test chunking with the large prefix\n",
|
|
"lines = [\n",
|
|
" \"Row 1: Some data here\\n\",\n",
|
|
" \"Row 2: More data here\\n\",\n",
|
|
" \"Row 3: Even more data\\n\",\n",
|
|
"]\n",
|
|
"\n",
|
|
"chunks_large = chunker_large.chunk_text(lines)\n",
|
|
"\n",
|
|
"print(f\"Total chunks (including prefix chunks): {len(chunks_large)}\")\n",
|
|
"print(f\"Content chunks: {len(chunks_large) - len(chunker_large.prefix_chunks)}\\n\")\n",
|
|
"\n",
|
|
"# Display all chunks\n",
|
|
"for i, chunk in enumerate(chunks_large, 1):\n",
|
|
" is_prefix_chunk = i <= len(chunker_large.prefix_chunks)\n",
|
|
" chunk_type = \"[PREFIX CHUNK]\" if is_prefix_chunk else \"[CONTENT CHUNK]\"\n",
|
|
"\n",
|
|
" print(f\"Chunk {i} {chunk_type}:\")\n",
|
|
" preview = chunk[:100] + \"...\" if len(chunk) > 100 else chunk\n",
|
|
" print(f\" Content: {preview}\")\n",
|
|
" print(f\" Tokens: {tokenizer.count_tokens(chunk)}\\n\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Summary\n",
|
|
"\n",
|
|
"### When to use `LineBasedTokenChunker`\n",
|
|
"\n",
|
|
"- You need to preserve line boundaries (tables, code, logs)\n",
|
|
"- You want to add context (headers, metadata) to each chunk\n",
|
|
"- You're working with structured text where lines have semantic meaning\n",
|
|
"- You need fine-grained control over how lines are split\n",
|
|
"\n",
|
|
"### When to use `omit_prefix_on_overflow=True`\n",
|
|
"\n",
|
|
"- Working with wide tables or long prefixes\n",
|
|
"- Token budget is limited\n",
|
|
"- Line integrity is more important than consistent formatting\n",
|
|
"- You can handle chunks without the prefix in downstream processing\n",
|
|
"\n",
|
|
"### When to use `omit_prefix_on_overflow=False` (default)\n",
|
|
"\n",
|
|
"- You need the prefix in every chunk for context\n",
|
|
"- Consistent formatting is critical\n",
|
|
"- Downstream processing requires the prefix to understand the content\n",
|
|
"- Working with narrow content where the prefix doesn't cause overflow"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": ".venv",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.12.11"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|