diff --git a/docs/examples/llm_table_extraction_example.py b/docs/examples/llm_table_extraction_example.py index 845f7eb9..b97d2bbe 100644 --- a/docs/examples/llm_table_extraction_example.py +++ b/docs/examples/llm_table_extraction_example.py @@ -17,7 +17,6 @@ __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file import asyncio -import json from crawl4ai import ( AsyncWebCrawler, CrawlerRunConfig, @@ -38,13 +37,19 @@ async def basic_llm_extraction(): provider="openai/gpt-4.1-mini", api_token="env:OPENAI_API_KEY", # Uses environment variable temperature=0.1, # Low temperature for consistency - max_tokens=2000 + max_tokens=32000 ) # Create LLM table extraction strategy table_strategy = LLMTableExtraction( llm_config=llm_config, - verbose=True + verbose=True, + # css_selector="div.mw-content-ltr", + max_tries=2, + enable_chunking=True, + chunk_token_threshold=5000, # Lower threshold to force chunking + min_rows_per_chunk=10, + max_parallel_chunks=3 ) # Configure crawler with the strategy @@ -56,7 +61,7 @@ async def basic_llm_extraction(): async with AsyncWebCrawler() as crawler: # Extract tables from a Wikipedia page result = await crawler.arun( - url="https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)", + url="https://en.wikipedia.org/wiki/List_of_chemical_elements", config=config ) @@ -264,70 +269,14 @@ async def compare_strategies(): print(f" Row {i+1}: {row}") print(f" Metadata: {table.get('metadata', {})}") - -# Example 4: Using Local Models (Ollama) -async def local_model_extraction(): - """Extract tables using local Ollama models for privacy/cost.""" - print("\n=== Example 4: Local Model Extraction with Ollama ===") - - # Configure for local Ollama - llm_config = LLMConfig( - provider="ollama/llama3.3", - api_token=None, # Not needed for Ollama - base_url="http://localhost:11434", - temperature=0.1 - ) - - table_strategy = LLMTableExtraction( - llm_config=llm_config, - verbose=True - ) - - config = CrawlerRunConfig( - cache_mode=CacheMode.BYPASS, - table_extraction=table_strategy - ) - - # Simple test HTML - test_html = """ - - - - - - - - - -
ProductPriceStock
Apple$1.50100
Banana$0.50200
Orange$2.0050
- """ - - async with AsyncWebCrawler() as crawler: - result = await crawler.arun( - url=f"raw:{test_html}", - config=config - ) - - if result.success and result.tables: - table = result.tables[0] - print(f"✓ Extracted with local model:") - - # Create DataFrame - df = pd.DataFrame(table['rows'], columns=table['headers']) - print(df.to_string()) - else: - print("✗ Make sure Ollama is running locally with llama3.3 model") - - -# Example 5: Batch Processing Multiple Pages +# Example 4: Batch Processing Multiple Pages async def batch_extraction(): """Extract tables from multiple pages efficiently.""" - print("\n=== Example 5: Batch Table Extraction ===") + print("\n=== Example 4: Batch Table Extraction ===") urls = [ - "https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)", - "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)", - "https://en.wikipedia.org/wiki/List_of_countries_by_Human_Development_Index" + "https://www.worldometers.info/geography/alphabetical-list-of-countries/", + # "https://en.wikipedia.org/wiki/List_of_chemical_elements", ] llm_config = LLMConfig( @@ -339,8 +288,12 @@ async def batch_extraction(): table_strategy = LLMTableExtraction( llm_config=llm_config, - css_selector="table.wikitable", # Wikipedia data tables - verbose=False + css_selector="div.datatable-container", # Wikipedia data tables + verbose=False, + enable_chunking=True, + chunk_token_threshold=5000, # Lower threshold to force chunking + min_rows_per_chunk=10, + max_parallel_chunks=3 ) config = CrawlerRunConfig( @@ -391,9 +344,6 @@ async def main(): # # Compare strategies # await compare_strategies() - # # Local model (requires Ollama) - # # await local_model_extraction() - # # Batch processing # await batch_extraction() diff --git a/tests/test_llm_simple_url.py b/tests/test_llm_simple_url.py index c5f4068a..bb31434c 100644 --- a/tests/test_llm_simple_url.py +++ b/tests/test_llm_simple_url.py @@ -23,90 +23,15 @@ async def test_controlled_html(): print("LLM TABLE EXTRACTION TEST") print("=" * 60) - # Create test HTML with complex tables - test_html = """ - - - Test Tables - -

Sales Data

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Q1 2024 Sales Report
ProductJanuaryFebruary
Week 1Week 2Week 3Week 1Week 2Week 3
Widget A100120110130140150
Widget B200180190210220230
Note: All values in thousands USD
- -
- - - - - - - - - - - - - - - - - -
CountryPopulationGDP
USA331M$21T
China1.4B$14T
- - - """ - - # url = "https://www.w3schools.com/html/html_tables.asp" url = "https://en.wikipedia.org/wiki/List_of_chemical_elements" # url = "https://en.wikipedia.org/wiki/List_of_prime_ministers_of_India" # Configure LLM llm_config = LLMConfig( - provider="openai/gpt-4.1-mini", - # provider="groq/llama-3.3-70b-versatile", - api_token=os.getenv("OPENAI_API_KEY"), - # api_token=os.getenv("GROQ_API_KEY"), - # api_token="os.getenv("GROQ_API_KEY")", + # provider="openai/gpt-4.1-mini", + # api_token=os.getenv("OPENAI_API_KEY"), + provider="groq/llama-3.3-70b-versatile", + api_token="GROQ_API_TOKEN", temperature=0.1, max_tokens=32000 )