Apply Ruff Corrections

Author: UncleCode
Date: 2025-01-13 19:19:58 +08:00
parent c3370ec5da
commit 8ec12d7d68
84 changed files with 6861 additions and 5076 deletions


@@ -10,18 +10,17 @@ import asyncio
import os
import json
import re
from typing import List, Optional, Dict, Any
from pydantic import BaseModel, Field
from typing import List
from crawl4ai import (
AsyncWebCrawler,
BrowserConfig,
CrawlerRunConfig,
CacheMode,
LLMExtractionStrategy,
JsonCssExtractionStrategy
JsonCssExtractionStrategy,
)
from crawl4ai.content_filter_strategy import RelevantContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from bs4 import BeautifulSoup
# Sample HTML for demonstrations
@@ -52,17 +51,18 @@ SAMPLE_HTML = """
</div>
"""
async def demo_ssl_features():
"""
Enhanced SSL & Security Features Demo
-----------------------------------
This example demonstrates the new SSL certificate handling and security features:
1. Custom certificate paths
2. SSL verification options
3. HTTPS error handling
4. Certificate validation configurations
These features are particularly useful when:
- Working with self-signed certificates
- Dealing with corporate proxies
@@ -76,14 +76,11 @@ async def demo_ssl_features():
run_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
fetch_ssl_certificate=True # Enable SSL certificate fetching
fetch_ssl_certificate=True, # Enable SSL certificate fetching
)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(
url="https://example.com",
config=run_config
)
result = await crawler.arun(url="https://example.com", config=run_config)
print(f"SSL Crawl Success: {result.success}")
result.ssl_certificate.to_json(
os.path.join(os.getcwd(), "ssl_certificate.json")
@@ -91,11 +88,12 @@ async def demo_ssl_features():
if not result.success:
print(f"SSL Error: {result.error_message}")
async def demo_content_filtering():
"""
Smart Content Filtering Demo
----------------------
Demonstrates advanced content filtering capabilities:
1. Custom filter to identify and extract specific content
2. Integration with markdown generation
@@ -110,87 +108,90 @@ async def demo_content_filtering():
super().__init__()
# Add news-specific patterns
self.negative_patterns = re.compile(
r'nav|footer|header|sidebar|ads|comment|share|related|recommended|popular|trending',
re.I
r"nav|footer|header|sidebar|ads|comment|share|related|recommended|popular|trending",
re.I,
)
self.min_word_count = 30 # Higher threshold for news content
def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]:
def filter_content(
self, html: str, min_word_threshold: int = None
) -> List[str]:
"""
Implements news-specific content filtering logic.
Args:
html (str): HTML content to be filtered
min_word_threshold (int, optional): Minimum word count threshold
Returns:
List[str]: List of filtered HTML content blocks
"""
if not html or not isinstance(html, str):
return []
soup = BeautifulSoup(html, 'lxml')
soup = BeautifulSoup(html, "lxml")
if not soup.body:
soup = BeautifulSoup(f'<body>{html}</body>', 'lxml')
body = soup.find('body')
soup = BeautifulSoup(f"<body>{html}</body>", "lxml")
body = soup.find("body")
# Extract chunks with metadata
chunks = self.extract_text_chunks(body, min_word_threshold or self.min_word_count)
chunks = self.extract_text_chunks(
body, min_word_threshold or self.min_word_count
)
# Filter chunks based on news-specific criteria
filtered_chunks = []
for _, text, tag_type, element in chunks:
# Skip if element has negative class/id
if self.is_excluded(element):
continue
# Headers are important in news articles
if tag_type == 'header':
if tag_type == "header":
filtered_chunks.append(self.clean_element(element))
continue
# For content, check word count and link density
text = element.get_text(strip=True)
if len(text.split()) >= (min_word_threshold or self.min_word_count):
# Calculate link density
links_text = ' '.join(a.get_text(strip=True) for a in element.find_all('a'))
links_text = " ".join(
a.get_text(strip=True) for a in element.find_all("a")
)
link_density = len(links_text) / len(text) if text else 1
# Accept if link density is reasonable
if link_density < 0.5:
filtered_chunks.append(self.clean_element(element))
return filtered_chunks
# Create markdown generator with custom filter
markdown_gen = DefaultMarkdownGenerator(
content_filter=CustomNewsFilter()
)
markdown_gen = DefaultMarkdownGenerator(content_filter=CustomNewsFilter())
run_config = CrawlerRunConfig(
markdown_generator=markdown_gen,
cache_mode=CacheMode.BYPASS
markdown_generator=markdown_gen, cache_mode=CacheMode.BYPASS
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://news.ycombinator.com",
config=run_config
url="https://news.ycombinator.com", config=run_config
)
print("Filtered Content Sample:")
print(result.markdown[:500]) # Show first 500 chars
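The same wiring works with much simpler filters than the news-specific one above. Here is a minimal sketch of a custom RelevantContentFilter that keeps only <p> blocks with enough words; the class name ParagraphFilter and the 20-word default are illustrative choices, and only filter_content is overridden, mirroring the subclass above:

from typing import List
from bs4 import BeautifulSoup
from crawl4ai import CrawlerRunConfig, CacheMode
from crawl4ai.content_filter_strategy import RelevantContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

class ParagraphFilter(RelevantContentFilter):
    """Keep only <p> elements with at least min_words words (illustrative filter)."""

    def __init__(self, min_words: int = 20):
        super().__init__()
        self.min_words = min_words

    def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]:
        if not html or not isinstance(html, str):
            return []
        soup = BeautifulSoup(html, "lxml")
        threshold = min_word_threshold or self.min_words
        # Retain paragraphs that clear the word-count threshold.
        return [
            str(p)
            for p in soup.find_all("p")
            if len(p.get_text(strip=True).split()) >= threshold
        ]

run_config = CrawlerRunConfig(
    markdown_generator=DefaultMarkdownGenerator(content_filter=ParagraphFilter()),
    cache_mode=CacheMode.BYPASS,
)

As in the demo above, passing this run_config to crawler.arun() yields result.markdown built only from the retained blocks.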
async def demo_json_extraction():
"""
Improved JSON Extraction Demo
---------------------------
Demonstrates the enhanced JSON extraction capabilities:
1. Base element attributes extraction
2. Complex nested structures
3. Multiple extraction patterns
Key features shown:
- Extracting attributes from base elements (href, data-* attributes)
- Processing repeated patterns
@@ -206,7 +207,7 @@ async def demo_json_extraction():
"baseSelector": "div.article-list",
"baseFields": [
{"name": "list_id", "type": "attribute", "attribute": "data-list-id"},
{"name": "category", "type": "attribute", "attribute": "data-category"}
{"name": "category", "type": "attribute", "attribute": "data-category"},
],
"fields": [
{
@@ -214,8 +215,16 @@ async def demo_json_extraction():
"selector": "article.post",
"type": "nested_list",
"baseFields": [
{"name": "post_id", "type": "attribute", "attribute": "data-post-id"},
{"name": "author_id", "type": "attribute", "attribute": "data-author"}
{
"name": "post_id",
"type": "attribute",
"attribute": "data-post-id",
},
{
"name": "author_id",
"type": "attribute",
"attribute": "data-author",
},
],
"fields": [
{
@@ -223,60 +232,68 @@ async def demo_json_extraction():
"selector": "h2.title a",
"type": "text",
"baseFields": [
{"name": "url", "type": "attribute", "attribute": "href"}
]
{
"name": "url",
"type": "attribute",
"attribute": "href",
}
],
},
{
"name": "author",
"selector": "div.meta a.author",
"type": "text",
"baseFields": [
{"name": "profile_url", "type": "attribute", "attribute": "href"}
]
},
{
"name": "date",
"selector": "span.date",
"type": "text"
{
"name": "profile_url",
"type": "attribute",
"attribute": "href",
}
],
},
{"name": "date", "selector": "span.date", "type": "text"},
{
"name": "read_more",
"selector": "a.read-more",
"type": "nested",
"fields": [
{"name": "text", "type": "text"},
{"name": "url", "type": "attribute", "attribute": "href"}
]
}
]
{
"name": "url",
"type": "attribute",
"attribute": "href",
},
],
},
],
}
]
],
}
)
# Demonstrate extraction from raw HTML
run_config = CrawlerRunConfig(
extraction_strategy=json_strategy,
cache_mode=CacheMode.BYPASS
extraction_strategy=json_strategy, cache_mode=CacheMode.BYPASS
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="raw:" + SAMPLE_HTML, # Use raw: prefix for raw HTML
config=run_config
config=run_config,
)
print("Extracted Content:")
print(result.extracted_content)
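For reference, a much smaller schema exercising the same baseSelector / baseFields / fields mechanics. The selectors, the inline HTML, and the top-level "name" key are made up for this sketch; everything else follows the calls shown above, including the raw: prefix for feeding HTML directly:

import asyncio
import json
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, JsonCssExtractionStrategy

MINI_HTML = """
<ul class="links" data-source="demo">
  <li><a href="/a">First</a></li>
  <li><a href="/b">Second</a></li>
</ul>
"""

mini_schema = {
    "name": "Link List",  # assumed top-level key
    "baseSelector": "ul.links",
    "baseFields": [
        {"name": "source", "type": "attribute", "attribute": "data-source"},
    ],
    "fields": [
        {
            "name": "links",
            "selector": "li a",
            "type": "nested_list",
            "fields": [
                {"name": "text", "type": "text"},
                {"name": "url", "type": "attribute", "attribute": "href"},
            ],
        },
    ],
}

async def extract_links():
    config = CrawlerRunConfig(
        extraction_strategy=JsonCssExtractionStrategy(mini_schema),
        cache_mode=CacheMode.BYPASS,
    )
    async with AsyncWebCrawler() as crawler:
        # The raw: prefix passes the HTML string straight to the crawler.
        result = await crawler.arun(url="raw:" + MINI_HTML, config=config)
        print(json.loads(result.extracted_content))

# asyncio.run(extract_links())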
async def demo_input_formats():
"""
Input Format Handling Demo
----------------------
Demonstrates how LLM extraction can work with different input formats:
1. Markdown (default) - Good for simple text extraction
2. HTML - Better when you need structure and attributes
This example shows how HTML input can be beneficial when:
- You need to understand the DOM structure
- You want to extract both visible text and HTML attributes
@@ -350,7 +367,7 @@ async def demo_input_formats():
</footer>
</div>
"""
# Use raw:// prefix to pass HTML content directly
url = f"raw://{dummy_html}"
@@ -359,18 +376,30 @@ async def demo_input_formats():
# Define our schema using Pydantic
class JobRequirement(BaseModel):
category: str = Field(description="Category of the requirement (e.g., Technical, Soft Skills)")
items: List[str] = Field(description="List of specific requirements in this category")
priority: str = Field(description="Priority level (Required/Preferred) based on the HTML class or context")
category: str = Field(
description="Category of the requirement (e.g., Technical, Soft Skills)"
)
items: List[str] = Field(
description="List of specific requirements in this category"
)
priority: str = Field(
description="Priority level (Required/Preferred) based on the HTML class or context"
)
class JobPosting(BaseModel):
title: str = Field(description="Job title")
department: str = Field(description="Department or team")
location: str = Field(description="Job location, including remote options")
salary_range: Optional[str] = Field(description="Salary range if specified")
requirements: List[JobRequirement] = Field(description="Categorized job requirements")
application_deadline: Optional[str] = Field(description="Application deadline if specified")
contact_info: Optional[dict] = Field(description="Contact information from footer or contact section")
requirements: List[JobRequirement] = Field(
description="Categorized job requirements"
)
application_deadline: Optional[str] = Field(
description="Application deadline if specified"
)
contact_info: Optional[dict] = Field(
description="Contact information from footer or contact section"
)
# First try with markdown (default)
markdown_strategy = LLMExtractionStrategy(
@@ -382,7 +411,7 @@ async def demo_input_formats():
Extract job posting details into structured data. Focus on the visible text content
and organize requirements into categories.
""",
input_format="markdown" # default
input_format="markdown", # default
)
# Then with HTML for better structure understanding
@@ -400,34 +429,25 @@ async def demo_input_formats():
Use HTML attributes and classes to enhance extraction accuracy.
""",
input_format="html" # explicitly use HTML
input_format="html", # explicitly use HTML
)
async with AsyncWebCrawler() as crawler:
# Try with markdown first
markdown_config = CrawlerRunConfig(
extraction_strategy=markdown_strategy
)
markdown_result = await crawler.arun(
url=url,
config=markdown_config
)
markdown_config = CrawlerRunConfig(extraction_strategy=markdown_strategy)
markdown_result = await crawler.arun(url=url, config=markdown_config)
print("\nMarkdown-based Extraction Result:")
items = json.loads(markdown_result.extracted_content)
print(json.dumps(items, indent=2))
# Then with HTML for better structure understanding
html_config = CrawlerRunConfig(
extraction_strategy=html_strategy
)
html_result = await crawler.arun(
url=url,
config=html_config
)
html_config = CrawlerRunConfig(extraction_strategy=html_strategy)
html_result = await crawler.arun(url=url, config=html_config)
print("\nHTML-based Extraction Result:")
items = json.loads(html_result.extracted_content)
print(json.dumps(items, indent=2))
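The markdown-versus-HTML switch itself is a single constructor argument. A minimal sketch: only instruction and input_format appear in the demo above, so the provider, api_token, schema, and extraction_type arguments here are assumptions about the usual LLMExtractionStrategy setup and may need adjusting:

import os
from pydantic import BaseModel, Field
from crawl4ai import LLMExtractionStrategy

class Contact(BaseModel):
    name: str = Field(description="Contact name")
    email: str = Field(description="Contact email address")

html_strategy = LLMExtractionStrategy(
    provider="openai/gpt-4o-mini",          # assumed provider string
    api_token=os.getenv("OPENAI_API_KEY"),  # assumed credential source
    schema=Contact.model_json_schema(),
    extraction_type="schema",
    instruction="Extract the contact's name and email, using HTML structure where helpful.",
    input_format="html",  # switch to "markdown" (the default) to compare results
)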
# Main execution
async def main():
print("Crawl4AI v0.4.24 Feature Walkthrough")
@@ -439,5 +459,6 @@ async def main():
await demo_json_extraction()
# await demo_input_formats()
if __name__ == "__main__":
asyncio.run(main())