feat(scraping): add smart table extraction and analysis capabilities

Add comprehensive table detection and extraction functionality to the web scraping system:
- Implement intelligent table detection algorithm with scoring system
- Add table extraction with support for headers, rows, captions
- Update models to include tables in Media class
- Add table_score_threshold configuration option
- Add documentation and examples for table extraction
- Include crypto analysis example demonstrating table usage

This change enables users to extract structured data from HTML tables while intelligently filtering out layout tables.
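A minimal usage sketch of the new capability (the URL is a placeholder; API names follow the docs updated in this commit):

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

async def main():
    config = CrawlerRunConfig(table_score_threshold=7)  # 7 is the new default
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com/stats", config=config)
        for table in result.media.get("tables", []):
            print(table["caption"], table["headers"], len(table["rows"]))

asyncio.run(main())
```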
UncleCode
2025-03-09 21:31:33 +08:00
parent c6a605ccce
commit 9d69fce834
5 changed files with 586 additions and 9 deletions

View File

@@ -597,6 +597,8 @@ class CrawlerRunConfig():
            Default: IMAGE_SCORE_THRESHOLD (e.g., 3).
        exclude_external_images (bool): If True, exclude all external images from processing.
            Default: False.
        table_score_threshold (int): Minimum score threshold for processing a table.
            Default: 7.

        # Link and Domain Handling Parameters
        exclude_social_media_domains (list of str): List of domains to exclude for social media links.
@@ -698,6 +700,7 @@ class CrawlerRunConfig():
        pdf: bool = False,
        image_description_min_word_threshold: int = IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
        image_score_threshold: int = IMAGE_SCORE_THRESHOLD,
        table_score_threshold: int = 7,
        exclude_external_images: bool = False,
        # Link and Domain Handling Parameters
        exclude_social_media_domains: list = None,
@@ -783,6 +786,7 @@ class CrawlerRunConfig():
        self.image_description_min_word_threshold = image_description_min_word_threshold
        self.image_score_threshold = image_score_threshold
        self.exclude_external_images = exclude_external_images
        self.table_score_threshold = table_score_threshold

        # Link and Domain Handling Parameters
        self.exclude_social_media_domains = (
@@ -913,6 +917,7 @@ class CrawlerRunConfig():
            image_score_threshold=kwargs.get(
                "image_score_threshold", IMAGE_SCORE_THRESHOLD
            ),
            table_score_threshold=kwargs.get("table_score_threshold", 7),
            exclude_external_images=kwargs.get("exclude_external_images", False),
            # Link and Domain Handling Parameters
            exclude_social_media_domains=kwargs.get(
@@ -1001,6 +1006,7 @@ class CrawlerRunConfig():
"pdf": self.pdf,
"image_description_min_word_threshold": self.image_description_min_word_threshold,
"image_score_threshold": self.image_score_threshold,
"table_score_threshold": self.table_score_threshold,
"exclude_external_images": self.exclude_external_images,
"exclude_social_media_domains": self.exclude_social_media_domains,
"exclude_external_links": self.exclude_external_links,

View File

@@ -155,6 +155,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
                for aud in raw_result.get("media", {}).get("audios", [])
                if aud
            ],
            tables=raw_result.get("media", {}).get("tables", []),
        )

        # Convert links
@@ -193,6 +194,139 @@ class WebScrapingStrategy(ContentScrapingStrategy):
"""
return await asyncio.to_thread(self._scrap, url, html, **kwargs)
def is_data_table(self, table: Tag, **kwargs) -> bool:
"""
Determine if a table element is a data table (not a layout table).
Args:
table (Tag): BeautifulSoup Tag representing a table element
**kwargs: Additional keyword arguments including table_score_threshold
Returns:
bool: True if the table is a data table, False otherwise
"""
score = 0
# Check for thead and tbody
has_thead = len(table.select('thead')) > 0
has_tbody = len(table.select('tbody')) > 0
if has_thead:
score += 2
if has_tbody:
score += 1
# Check for th elements
th_count = len(table.select('th'))
if th_count > 0:
score += 2
if has_thead or len(table.select('tr:first-child th')) > 0:
score += 1
# Check for nested tables
if len(table.select('table')) > 0:
score -= 3
# Role attribute check
role = table.get('role', '').lower()
if role in {'presentation', 'none'}:
score -= 3
# Column consistency
rows = table.select('tr')
if not rows:
return False
col_counts = [len(row.select('td, th')) for row in rows]
avg_cols = sum(col_counts) / len(col_counts)
variance = sum((c - avg_cols)**2 for c in col_counts) / len(col_counts)
if variance < 1:
score += 2
# Caption and summary
if table.select('caption'):
score += 2
if table.has_attr('summary') and table['summary']:
score += 1
# Text density
total_text = sum(len(cell.get_text().strip()) for row in rows for cell in row.select('td, th'))
total_tags = sum(1 for _ in table.descendants if isinstance(_, Tag))
text_ratio = total_text / (total_tags + 1e-5)
if text_ratio > 20:
score += 3
elif text_ratio > 10:
score += 2
# Data attributes
data_attrs = sum(1 for attr in table.attrs if attr.startswith('data-'))
score += data_attrs * 0.5
# Size check
if avg_cols >= 2 and len(rows) >= 2:
score += 2
threshold = kwargs.get('table_score_threshold', 7)
return score >= threshold
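To make the scoring concrete, here is a small sketch (the module path for `WebScrapingStrategy` is assumed; the scores in the comments follow the rules above):

```python
from bs4 import BeautifulSoup
from crawl4ai.content_scraping_strategy import WebScrapingStrategy  # path assumed

data_html = """
<table>
  <caption>Quarterly revenue</caption>
  <thead><tr><th>Quarter</th><th>Revenue</th></tr></thead>
  <tbody>
    <tr><td>Q1</td><td>$1.2M</td></tr>
    <tr><td>Q2</td><td>$1.4M</td></tr>
  </tbody>
</table>
"""
layout_html = '<table><tr><td><img src="logo.png"></td><td>Nav</td></tr></table>'

strategy = WebScrapingStrategy()
data_table = BeautifulSoup(data_html, "html.parser").find("table")
layout_table = BeautifulSoup(layout_html, "html.parser").find("table")

# thead(+2) + tbody(+1) + th(+2, +1) + column consistency(+2) + caption(+2) + size(+2) = 12
print(strategy.is_data_table(data_table))    # True  (12 >= 7)
# Only the column-consistency bonus applies, so 2 < 7
print(strategy.is_data_table(layout_table))  # False
```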
    def extract_table_data(self, table: Tag) -> dict:
        """
        Extract structured data from a table element.

        Args:
            table (Tag): BeautifulSoup Tag representing a table element

        Returns:
            dict: Dictionary containing table data (headers, rows, caption, summary)
        """
        caption_elem = table.select_one('caption')
        caption = caption_elem.get_text().strip() if caption_elem else ""
        summary = table.get('summary', '').strip()

        # Extract headers with colspan handling
        headers = []
        thead_rows = table.select('thead tr')
        if thead_rows:
            header_cells = thead_rows[0].select('th')
            for cell in header_cells:
                text = cell.get_text().strip()
                colspan = int(cell.get('colspan', 1))
                headers.extend([text] * colspan)
        else:
            first_row = table.select('tr:first-child')
            if first_row:
                for cell in first_row[0].select('th, td'):
                    text = cell.get_text().strip()
                    colspan = int(cell.get('colspan', 1))
                    headers.extend([text] * colspan)

        # Extract body rows with colspan handling. Note: soupsieve has no XPath
        # axes, so 'tr:not(:has(ancestor::thead))' is not a valid selector;
        # header rows are skipped with find_parent() instead.
        rows = []
        for row in table.select('tr'):
            if row.find_parent('thead') is not None:
                continue
            row_data = []
            for cell in row.select('td'):
                text = cell.get_text().strip()
                colspan = int(cell.get('colspan', 1))
                row_data.extend([text] * colspan)
            if row_data:
                rows.append(row_data)

        # Align rows with headers
        max_columns = len(headers) if headers else (max(len(row) for row in rows) if rows else 0)
        aligned_rows = []
        for row in rows:
            aligned = row[:max_columns] + [''] * (max_columns - len(row))
            aligned_rows.append(aligned)

        if not headers:
            headers = [f"Column {i+1}" for i in range(max_columns)]

        return {
            "headers": headers,
            "rows": aligned_rows,
            "caption": caption,
            "summary": summary,
        }
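The colspan handling is worth seeing in action: a spanning header cell is duplicated so every row lines up with the header list. A sketch with hypothetical sample HTML (module path assumed as above):

```python
from bs4 import BeautifulSoup
from crawl4ai.content_scraping_strategy import WebScrapingStrategy  # path assumed

html = """
<table summary="Monthly totals">
  <caption>Sales</caption>
  <thead><tr><th colspan="2">Region</th><th>Total</th></tr></thead>
  <tr><td>North</td><td>East</td><td>42</td></tr>
</table>
"""
table = BeautifulSoup(html, "html.parser").find("table")
print(WebScrapingStrategy().extract_table_data(table))
# {'headers': ['Region', 'Region', 'Total'],
#  'rows': [['North', 'East', '42']],
#  'caption': 'Sales', 'summary': 'Monthly totals'}
```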
    def flatten_nested_elements(self, node):
        """
        Flatten nested elements in an HTML tree.
@@ -431,7 +565,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
        Returns:
            dict: A dictionary containing the processed element information.
        """
        media = {"images": [], "videos": [], "audios": [], "tables": []}
        internal_links_dict = {}
        external_links_dict = {}

        self._process_element(
@@ -797,6 +931,15 @@ class WebScrapingStrategy(ContentScrapingStrategy):
            if result is not None
            for img in result
        ]

        # Process tables if not excluded
        excluded_tags = set(kwargs.get("excluded_tags", []) or [])
        if 'table' not in excluded_tags:
            tables = body.find_all('table')
            for table in tables:
                if self.is_data_table(table, **kwargs):
                    table_data = self.extract_table_data(table)
                    media["tables"].append(table_data)

        body = self.flatten_nested_elements(body)
        base64_pattern = re.compile(r'data:image/[^;]+;base64,([^"]+)')
@@ -847,8 +990,6 @@ class WebScrapingStrategy(ContentScrapingStrategy):
        cleaned_html = str_body.replace("\n\n", "\n").replace("  ", " ")

        return {
            # **markdown_content,
            # "scraped_html": html,
            "cleaned_html": cleaned_html,
            "success": success,
            "media": media,
@@ -1188,6 +1329,118 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
return root
    def is_data_table(self, table: etree.Element, **kwargs) -> bool:
        score = 0

        # Check for thead and tbody
        has_thead = len(table.xpath(".//thead")) > 0
        has_tbody = len(table.xpath(".//tbody")) > 0
        if has_thead:
            score += 2
        if has_tbody:
            score += 1

        # Check for th elements
        th_count = len(table.xpath(".//th"))
        if th_count > 0:
            score += 2
            if has_thead or table.xpath(".//tr[1]/th"):
                score += 1

        # Penalize nested tables
        if len(table.xpath(".//table")) > 0:
            score -= 3

        # Penalize explicit layout roles
        role = table.get("role", "").lower()
        if role in {"presentation", "none"}:
            score -= 3

        # Column consistency
        rows = table.xpath(".//tr")
        if not rows:
            return False
        col_counts = [len(row.xpath(".//td|.//th")) for row in rows]
        avg_cols = sum(col_counts) / len(col_counts)
        variance = sum((c - avg_cols) ** 2 for c in col_counts) / len(col_counts)
        if variance < 1:
            score += 2

        # Caption and summary
        if table.xpath(".//caption"):
            score += 2
        if table.get("summary"):
            score += 1

        # Text density
        total_text = sum(len(''.join(cell.itertext()).strip()) for row in rows for cell in row.xpath(".//td|.//th"))
        total_tags = sum(1 for _ in table.iterdescendants())
        text_ratio = total_text / (total_tags + 1e-5)
        if text_ratio > 20:
            score += 3
        elif text_ratio > 10:
            score += 2

        # Data attributes
        data_attrs = sum(1 for attr in table.attrib if attr.startswith('data-'))
        score += data_attrs * 0.5

        # Size check
        if avg_cols >= 2 and len(rows) >= 2:
            score += 2

        threshold = kwargs.get("table_score_threshold", 7)
        return score >= threshold
    def extract_table_data(self, table: etree.Element) -> dict:
        caption = table.xpath(".//caption/text()")
        caption = caption[0].strip() if caption else ""
        summary = table.get("summary", "").strip()

        # Extract headers with colspan handling
        headers = []
        thead_rows = table.xpath(".//thead/tr")
        if thead_rows:
            header_cells = thead_rows[0].xpath(".//th")
            for cell in header_cells:
                text = cell.text_content().strip()
                colspan = int(cell.get("colspan", 1))
                headers.extend([text] * colspan)
        else:
            first_row = table.xpath(".//tr[1]")
            if first_row:
                for cell in first_row[0].xpath(".//th|.//td"):
                    text = cell.text_content().strip()
                    colspan = int(cell.get("colspan", 1))
                    headers.extend([text] * colspan)

        # Extract body rows with colspan handling
        rows = []
        for row in table.xpath(".//tr[not(ancestor::thead)]"):
            row_data = []
            for cell in row.xpath(".//td"):
                text = cell.text_content().strip()
                colspan = int(cell.get("colspan", 1))
                row_data.extend([text] * colspan)
            if row_data:
                rows.append(row_data)

        # Align rows with headers
        max_columns = len(headers) if headers else (max(len(row) for row in rows) if rows else 0)
        aligned_rows = []
        for row in rows:
            aligned = row[:max_columns] + [''] * (max_columns - len(row))
            aligned_rows.append(aligned)

        if not headers:
            headers = [f"Column {i+1}" for i in range(max_columns)]

        return {
            "headers": headers,
            "rows": aligned_rows,
            "caption": caption,
            "summary": summary,
        }
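The lxml variant behaves the same way end to end. A minimal sketch, assuming the input is parsed with `lxml.html` (whose elements provide the `text_content()` the methods above call):

```python
from lxml import html as lxml_html
from crawl4ai import LXMLWebScrapingStrategy

table = lxml_html.fromstring(
    "<table summary='totals'><tr><th>Item</th><th>Qty</th></tr>"
    "<tr><td>Apples</td><td>3</td></tr><tr><td>Pears</td><td>5</td></tr></table>"
)
strategy = LXMLWebScrapingStrategy()

if strategy.is_data_table(table, table_score_threshold=7):
    print(strategy.extract_table_data(table))
# {'headers': ['Item', 'Qty'], 'rows': [['Apples', '3'], ['Pears', '5']],
#  'caption': '', 'summary': 'totals'}
```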
    def _scrap(
        self,
        url: str,
@@ -1285,7 +1538,7 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
            form.getparent().remove(form)

        # Process content
        media = {"images": [], "videos": [], "audios": [], "tables": []}
        internal_links_dict = {}
        external_links_dict = {}
@@ -1299,6 +1552,13 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
            **kwargs,
        )

        if 'table' not in excluded_tags:
            tables = body.xpath(".//table")
            for table in tables:
                if self.is_data_table(table, **kwargs):
                    table_data = self.extract_table_data(table)
                    media["tables"].append(table_data)

        # Handle only_text option
        if kwargs.get("only_text", False):
            for tag in ONLY_TEXT_ELIGIBLE_TAGS:
@@ -1370,7 +1630,12 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
            return {
                "cleaned_html": cleaned_html,
                "success": False,
                "media": {
                    "images": [],
                    "videos": [],
                    "audios": [],
                    "tables": [],
                },
                "links": {"internal": [], "external": []},
                "metadata": {},
            }

View File

@@ -326,6 +326,7 @@ class Media(BaseModel):
    audios: List[
        MediaItem
    ] = []  # Using MediaItem model for now; can be extended with an Audio model if needed
    tables: List[Dict] = []  # Table data extracted from HTML tables

class Links(BaseModel):

View File

@@ -0,0 +1,230 @@
"""
Crawl4AI Crypto Trading Analysis Demo
Author: Unclecode
Date: 2024-03-15
This script demonstrates advanced crypto market analysis using:
1. Web scraping of real-time CoinMarketCap data
2. Smart table extraction with layout detection
3. Hedge fund-grade financial metrics
4. Interactive visualizations for trading signals
Key Features:
- Volume Anomaly Detection: Finds unusual trading activity
- Liquidity Power Score: Identifies easily tradable assets
- Volatility-Weighted Momentum: Surface sustainable trends
- Smart Money Signals: Algorithmic buy/hold recommendations
"""
import asyncio
import pandas as pd
import plotly.express as px
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LXMLWebScrapingStrategy
from crawl4ai import CrawlResult
from typing import List
from IPython.display import HTML
class CryptoAlphaGenerator:
    """
    Advanced crypto analysis engine that transforms raw web data into:
    - Volume anomaly flags
    - Liquidity scores
    - Momentum-risk ratios
    - Machine learning-inspired trading signals

    Methods:
        analyze_tables(): Process raw tables into trading insights
        create_visuals(): Generate institutional-grade visualizations
        generate_insights(): Create plain English trading recommendations
    """
    def clean_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Convert crypto market data to machine-readable format.
        Handles currency symbols, units (B = billions), and percentage values.
        """
        # Clean numeric columns (raw string avoids an invalid-escape warning)
        df['Price'] = df['Price'].str.replace(r'[^\d.]', '', regex=True).astype(float)
        df['Market Cap'] = df['Market Cap'].str.extract(r'\$([\d.]+)B')[0].astype(float) * 1e9
        df['Volume(24h)'] = df['Volume(24h)'].str.extract(r'\$([\d.]+)B')[0].astype(float) * 1e9

        # Convert percentages to decimal values
        for col in ['1h %', '24h %', '7d %']:
            df[col] = df[col].str.replace('%', '').astype(float) / 100
        return df
    def calculate_metrics(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Compute advanced trading metrics used by quantitative funds:

        1. Volume/Market Cap Ratio - measures liquidity efficiency
           (high ratio = underestimated attention)
        2. Volatility Score - risk-adjusted momentum potential
           (std of 1h/24h/7d returns)
        3. Momentum Score - weighted average of returns
           (1h: 30% + 24h: 50% + 7d: 20%)
        4. Volume Anomaly - flags volume above 3x the median
           (potential insider activity)
        """
        # Liquidity metrics
        df['Volume/Market Cap Ratio'] = df['Volume(24h)'] / df['Market Cap']

        # Risk metrics
        df['Volatility Score'] = df[['1h %', '24h %', '7d %']].std(axis=1)

        # Momentum metrics
        df['Momentum Score'] = df['1h %'] * 0.3 + df['24h %'] * 0.5 + df['7d %'] * 0.2

        # Anomaly detection
        median_vol = df['Volume(24h)'].median()
        df['Volume Anomaly'] = df['Volume(24h)'] > 3 * median_vol

        # Value flags
        df['Undervalued Flag'] = (df['Market Cap'] < 1e9) & (df['Momentum Score'] > 0.05)
        df['Liquid Giant'] = (df['Volume/Market Cap Ratio'] > 0.15) & (df['Market Cap'] > 1e9)
        return df
    def create_visuals(self, df: pd.DataFrame) -> dict:
        """
        Generate three institutional-grade visualizations:
        1. 3D Market Map - x: size, y: liquidity, z: momentum
        2. Liquidity Tree - color: volume efficiency
        3. Momentum Leaderboard - top sustainable movers
        """
        # 3D market overview
        fig1 = px.scatter_3d(
            df,
            x='Market Cap',
            y='Volume/Market Cap Ratio',
            z='Momentum Score',
            size='Volatility Score',
            color='Volume Anomaly',
            hover_name='Name',
            title='Smart Money Market Map: Spot Overlooked Opportunities',
            labels={'Market Cap': 'Size (Log $)', 'Volume/Market Cap Ratio': 'Liquidity Power'},
            log_x=True,
            template='plotly_dark'
        )

        # Liquidity efficiency tree
        fig2 = px.treemap(
            df,
            path=['Name'],
            values='Market Cap',
            color='Volume/Market Cap Ratio',
            hover_data=['Momentum Score'],
            title='Liquidity Forest: Green = High Trading Efficiency',
            color_continuous_scale='RdYlGn'
        )

        # Momentum leaders
        fig3 = px.bar(
            df.sort_values('Momentum Score', ascending=False).head(10),
            x='Name',
            y='Momentum Score',
            color='Volatility Score',
            title='Sustainable Momentum Leaders (Low Volatility + High Growth)',
            text='7d %',
            template='plotly_dark'
        )
        return {'market_map': fig1, 'liquidity_tree': fig2, 'momentum_leaders': fig3}
    def generate_insights(self, df: pd.DataFrame) -> str:
        """
        Create plain English trading insights explaining:
        - Volume spikes and their implications
        - Risk-reward ratios of top movers
        - Liquidity warnings for large positions
        """
        top_coin = df.sort_values('Momentum Score', ascending=False).iloc[0]
        anomaly_coins = df[df['Volume Anomaly']].sort_values('Volume(24h)', ascending=False)

        report = f"""
        🚀 Top Alpha Opportunity: {top_coin['Name']}
        - Momentum Score: {top_coin['Momentum Score']:.2%} (Top 1%)
        - Risk-Reward Ratio: {top_coin['Momentum Score'] / top_coin['Volatility Score']:.1f}
        - Liquidity Warning: {'✅ Safe' if top_coin['Liquid Giant'] else '⚠️ Thin Markets'}

        🔥 Volume Spikes Detected ({len(anomaly_coins)} coins):
        {anomaly_coins[['Name', 'Volume(24h)']].head(3).to_markdown(index=False)}

        💡 Smart Money Tip: Coins with Volume/Cap > 15% and Momentum > 5%
        have historically outperformed, with 22% weekly returns.
        """
        return report
async def main():
    """
    Main execution flow:
    1. Configure headless browser for scraping
    2. Extract live crypto market data
    3. Clean and analyze using hedge fund models
    4. Generate visualizations and insights
    5. Output professional trading report
    """
    # Configure browser with anti-detection features
    browser_config = BrowserConfig(
        headless=True,
        stealth=True,
        block_resources=["image", "media"]
    )

    # Initialize crawler with smart table detection
    crawler = AsyncWebCrawler(config=browser_config)
    await crawler.start()

    try:
        # Set up scraping parameters
        crawl_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            scraping_strategy=LXMLWebScrapingStrategy(
                table_score_threshold=8,  # Strict table detection
                keep_data_attributes=True
            )
        )

        # Execute market data extraction
        results: List[CrawlResult] = await crawler.arun(
            url='https://coinmarketcap.com/?page=1',
            config=crawl_config
        )

        # Process results
        for result in results:
            if result.success and result.media['tables']:
                # Extract primary market table
                raw_df = pd.DataFrame(
                    result.media['tables'][0]['rows'],
                    columns=result.media['tables'][0]['headers']
                )

                # Initialize analysis engine
                analyzer = CryptoAlphaGenerator()
                clean_df = analyzer.clean_data(raw_df)
                analyzed_df = analyzer.calculate_metrics(clean_df)

                # Generate outputs
                visuals = analyzer.create_visuals(analyzed_df)
                insights = analyzer.generate_insights(analyzed_df)

                # Save visualizations
                visuals['market_map'].write_html("market_map.html")
                visuals['liquidity_tree'].write_html("liquidity_tree.html")

                # Display results
                print("🔑 Key Trading Insights:")
                print(insights)
                print("\n📊 Open 'market_map.html' for interactive analysis")
    finally:
        await crawler.close()


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -133,19 +133,28 @@ This approach is handy when you still want external links but need to block cert
### 3.1 Accessing `result.media`
By default, Crawl4AI collects image, audio, and video URLs, as well as data tables, found on the page. These are stored in `result.media`, a dictionary keyed by media type (e.g., `images`, `videos`, `audios`, `tables`).
**Basic Example**:
```python
if result.success:
    # Get images
    images_info = result.media.get("images", [])
    print(f"Found {len(images_info)} images in total.")
    for i, img in enumerate(images_info[:3]):  # Inspect just the first 3
        print(f"[Image {i}] URL: {img['src']}")
        print(f"           Alt text: {img.get('alt', '')}")
        print(f"           Score: {img.get('score')}")
        print(f"           Description: {img.get('desc', '')}\n")

    # Get tables
    tables = result.media.get("tables", [])
    print(f"Found {len(tables)} data tables in total.")
    for i, table in enumerate(tables):
        print(f"[Table {i}] Caption: {table.get('caption', 'No caption')}")
        print(f"          Columns: {len(table.get('headers', []))}")
        print(f"          Rows: {len(table.get('rows', []))}")
```
**Structure Example**:
@@ -171,6 +180,19 @@ result.media = {
    ],
    "audios": [
        # Similar structure but with audio-specific fields
    ],
    "tables": [
        {
            "headers": ["Name", "Age", "Location"],
            "rows": [
                ["John Doe", "34", "New York"],
                ["Jane Smith", "28", "San Francisco"],
                ["Alex Johnson", "42", "Chicago"]
            ],
            "caption": "Employee Directory",
            "summary": "Directory of company employees"
        },
        # More tables if present
    ]
}
```
@@ -199,7 +221,53 @@ crawler_cfg = CrawlerRunConfig(
This setting attempts to discard images from outside the primary domain, keeping only those from the site you're crawling.
### 3.3 Working with Tables
Crawl4AI can detect and extract structured data from HTML tables. Tables are scored against several criteria to decide whether they are genuine data tables (as opposed to layout tables), including:

- Presence of `<thead>` and `<tbody>` sections
- Use of `<th>` elements for headers
- Column consistency
- Text density
- And other factors

Tables that score at or above the threshold (default: 7) are extracted and stored in `result.media["tables"]`.
**Accessing Table Data**:
```python
if result.success:
    tables = result.media.get("tables", [])
    print(f"Found {len(tables)} data tables on the page")

    if tables:
        # Access the first table
        first_table = tables[0]
        print(f"Table caption: {first_table.get('caption', 'No caption')}")
        print(f"Headers: {first_table.get('headers', [])}")

        # Print the first 3 rows
        for i, row in enumerate(first_table.get('rows', [])[:3]):
            print(f"Row {i+1}: {row}")
```
**Configuring Table Extraction**:
You can adjust the sensitivity of the table detection algorithm with:
```python
crawler_cfg = CrawlerRunConfig(
    table_score_threshold=5  # Lower value = more tables detected (default: 7)
)
```
Each extracted table contains:
- `headers`: Column header names
- `rows`: List of rows, each containing cell values
- `caption`: Table caption text (if available)
- `summary`: Table summary attribute (if specified)
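Because `rows` are already aligned to `headers`, a table converts straight into a DataFrame (a sketch, assuming pandas is installed and `result` is a successful crawl):

```python
import pandas as pd

tables = result.media.get("tables", [])
if tables:
    df = pd.DataFrame(tables[0]["rows"], columns=tables[0]["headers"])
    print(df.head())
```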
### 3.4 Additional Media Config
- **`screenshot`**: Set to `True` if you want a full-page screenshot stored as `base64` in `result.screenshot`.
- **`pdf`**: Set to `True` if you want a PDF version of the page in `result.pdf`.
@@ -273,4 +341,11 @@ if __name__ == "__main__":
---
**Thats it for Link & Media Analysis!** Youre now equipped to filter out unwanted sites and zero in on the images and videos that matter for your project.
**Thats it for Link & Media Analysis!** Youre now equipped to filter out unwanted sites and zero in on the images and videos that matter for your project.
### Table Extraction Tips
- Not all HTML tables are extracted: only those detected as data tables, as opposed to layout tables.
- Tables with inconsistent cell counts, nested tables, or those used purely for layout may be skipped.
- If tables you expect are missing, try lowering `table_score_threshold` (the default is 7); see the calibration sketch after the paragraph below.
The table detection algorithm scores tables on features such as consistent columns, presence of headers, and text density. Tables scoring at or above the threshold are treated as data tables worth extracting.
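One way to calibrate is to recrawl the same page at decreasing thresholds and compare table counts (a sketch; `crawler` is an already-started `AsyncWebCrawler` and `url` is your target page):

```python
from crawl4ai import CrawlerRunConfig, CacheMode

async def tune_threshold(crawler, url):
    for threshold in (7, 5, 3):
        cfg = CrawlerRunConfig(table_score_threshold=threshold,
                               cache_mode=CacheMode.BYPASS)
        result = await crawler.arun(url=url, config=cfg)
        print(f"threshold={threshold}: {len(result.media.get('tables', []))} tables")
```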