Files
crawl4ai/tests/test_llm_simple_url.py

170 lines
5.9 KiB
Python

#!/usr/bin/env python3
"""
Test LLMTableExtraction against a live Wikipedia page (requires network access and an LLM API key).
"""
import os
import sys
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
import asyncio
from crawl4ai import (
AsyncWebCrawler,
CrawlerRunConfig,
LLMConfig,
LLMTableExtraction,
DefaultTableExtraction,
CacheMode
)
def _print_table_summary(index, table):
    """Print caption, headers, row count, sample rows and metadata of one table.

    Args:
        index: 1-based position of the table, used only for the heading.
        table: Mapping that must contain ``headers`` and ``rows``; ``caption``
            and ``metadata`` are optional and fall back to defaults.
    """
    print(f"\n Table {index}:")
    print(f" - Caption: {table.get('caption', 'No caption')}")
    print(f" - Headers: {table['headers']}")
    print(f" - Rows: {len(table['rows'])}")

    # Show how colspan/rowspan were handled (first two rows only).
    print(" - Sample rows:")
    for row_number, row in enumerate(table['rows'][:2], 1):
        print(f" Row {row_number}: {row}")

    metadata = table.get('metadata', {})
    print(" - Metadata:")
    print(f" • Has merged cells: {metadata.get('has_merged_cells', False)}")
    print(f" • Table type: {metadata.get('table_type', 'unknown')}")


async def test_controlled_html():
    """Crawl a Wikipedia page with LLMTableExtraction and print what it finds.

    Builds an ``LLMConfig`` for the Groq provider (API key read from the
    ``GROQ_API_KEY`` environment variable), wraps it in an
    ``LLMTableExtraction`` strategy with chunking enabled, crawls the
    "List of chemical elements" page with caching bypassed, and prints a
    summary of every extracted table.

    The success banner is printed only when the crawl reports success;
    otherwise a failure line is printed and the function returns early.
    """
    print("\n" + "=" * 60)
    print("LLM TABLE EXTRACTION TEST")
    print("=" * 60)

    url = "https://en.wikipedia.org/wiki/List_of_chemical_elements"
    # Alternative page: https://en.wikipedia.org/wiki/List_of_prime_ministers_of_India

    # Configure LLM. The key comes from the environment rather than being
    # hard-coded in the source.
    # Alternative: provider="openai/gpt-4.1-mini" with
    # api_token=os.getenv("OPENAI_API_KEY").
    llm_config = LLMConfig(
        provider="groq/llama-3.3-70b-versatile",
        api_token=os.getenv("GROQ_API_KEY"),
        temperature=0.1,
        max_tokens=32000,
    )

    print("\n1. Testing LLMTableExtraction:")

    # Create LLM extraction strategy.
    # Other selectors that were tried: "div.w3-example", "table.wikitable".
    llm_strategy = LLMTableExtraction(
        llm_config=llm_config,
        verbose=True,
        css_selector="div.mw-content-ltr",
        max_tries=2,
        enable_chunking=True,
        chunk_token_threshold=5000,  # Lower threshold to force chunking
        min_rows_per_chunk=10,
        max_parallel_chunks=3,
    )

    config_llm = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        table_extraction=llm_strategy,
    )

    async with AsyncWebCrawler() as crawler:
        result_llm = await crawler.arun(url=url, config=config_llm)

        if not result_llm.success:
            # Do not claim success when the crawl itself failed.
            reason = getattr(result_llm, "error_message", None) or "unknown error"
            print(f"\n ✗ LLM Extraction failed: {reason}")
            return

        print(f"\n ✓ LLM Extraction: Found {len(result_llm.tables)} table(s)")
        for index, table in enumerate(result_llm.tables, 1):
            _print_table_summary(index, table)

        print("\n✅ Test completed successfully!")
async def main():
    """Run the test if the API key for the configured provider is available.

    The extraction test is configured with a ``groq/...`` provider, so the
    guard checks ``GROQ_API_KEY``. When the variable is unset (or empty) a
    hint is printed and the test is skipped; otherwise
    ``test_controlled_html`` is awaited.
    """
    # Check for the API key of the provider actually used by the test.
    if not os.getenv("GROQ_API_KEY"):
        print("⚠️ GROQ_API_KEY not set. Please set it to test LLM extraction.")
        print(" You can set it with: export GROQ_API_KEY='your-key-here'")
        return

    await test_controlled_html()


# Run the async entry point only when executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())