diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py
index dfda9e28..73e5c025 100644
--- a/crawl4ai/__version__.py
+++ b/crawl4ai/__version__.py
@@ -1,2 +1,2 @@
 # crawl4ai/_version.py
-__version__ = "0.4.25"
+__version__ = "0.4.24"
diff --git a/docs/examples/v0_4_24_walkthrough.py b/docs/examples/v0_4_24_walkthrough.py
new file mode 100644
index 00000000..61477b78
--- /dev/null
+++ b/docs/examples/v0_4_24_walkthrough.py
@@ -0,0 +1,387 @@
+"""
+Crawl4AI v0.4.24 Feature Walkthrough
+====================================
+
+This script demonstrates the new features introduced in Crawl4AI v0.4.24.
+Each section includes detailed examples and explanations of the new capabilities.
+"""
+
+import asyncio
+import os
+import json
+from typing import List, Optional, Dict, Any
+from pydantic import BaseModel, Field
+from crawl4ai import (
+    AsyncWebCrawler,
+    BrowserConfig,
+    CrawlerRunConfig,
+    CacheMode,
+    LLMExtractionStrategy
+)
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+from crawl4ai.content_filter_strategy import PruningContentFilter
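+
+# Note: demo_input_formats() below performs LLM extraction and expects the
+# OPENAI_API_KEY environment variable to be set; the other demos run without it.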
+
+# Sample HTML for demonstrations
+SAMPLE_HTML = """
+<div class="article-list">
+    <article class="post" data-category="technology" data-author="101">
+        <h2 class="title"><a href="/blog/first-post">First Post</a></h2>
+        <div class="meta">
+            <a href="/authors/john-doe" class="author">John Doe</a>
+            <span class="date">2023-12-31</span>
+        </div>
+        <div class="content">
+            <p>First post content...</p>
+        </div>
+        <a href="/blog/first-post" class="read-more">Read More</a>
+    </article>
+    <article class="post" data-category="science" data-author="102">
+        <h2 class="title"><a href="/blog/second-post">Second Post</a></h2>
+        <div class="meta">
+            <a href="/authors/jane-smith" class="author">Jane Smith</a>
+            <span class="date">2023-12-30</span>
+        </div>
+        <div class="content">
+            <p>Second post content...</p>
+        </div>
+        <a href="/blog/second-post" class="read-more">Read More</a>
+    </article>
+</div>
+"""
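+
+# SAMPLE_HTML above is consumed by demo_json_extraction() below via the
+# "raw:" URL prefix; its structure mirrors the CSS selectors in that demo's schema.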
+
+async def demo_ssl_features():
+    """
+    Enhanced SSL & Security Features Demo
+    -------------------------------------
+
+    This example demonstrates the new SSL certificate handling and security features:
+    1. Custom certificate paths
+    2. SSL verification options
+    3. HTTPS error handling
+    4. Certificate validation configurations
+
+    These features are particularly useful when:
+    - Working with self-signed certificates
+    - Dealing with corporate proxies
+    - Handling mixed content websites
+    - Managing different SSL security levels
+    """
+    print("\n1. Enhanced SSL & Security Demo")
+    print("--------------------------------")
+
+    browser_config = BrowserConfig(
+        ignore_https_errors=True,
+        verbose=True
+    )
+
+    run_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        fetch_ssl_certificate=True  # Enable SSL certificate fetching
+    )
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(
+            url="https://example.com",
+            config=run_config
+        )
+        print(f"SSL Crawl Success: {result.success}")
+        if not result.success:
+            print(f"SSL Error: {result.error_message}")
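+
+# Going one step further: with fetch_ssl_certificate=True the certificate is
+# exposed on the crawl result. A minimal inspection sketch follows; the
+# attribute and method names (ssl_certificate, issuer, valid_until,
+# fingerprint, to_json) follow the v0.4.24 SSL docs and should be treated as
+# assumptions if your build differs.
+async def demo_ssl_certificate_details():
+    run_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        fetch_ssl_certificate=True
+    )
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(url="https://example.com", config=run_config)
+        cert = result.ssl_certificate
+        if result.success and cert:
+            print(f"Issuer: {cert.issuer}")            # issuing CA details
+            print(f"Valid until: {cert.valid_until}")  # expiry timestamp
+            print(f"Fingerprint: {cert.fingerprint}")  # certificate fingerprint
+            cert.to_json("certificate.json")           # also available: to_pem(), to_der()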
Improved JSON Extraction Demo") + print("--------------------------------") + + # Define the extraction schema with base element attributes + json_strategy = JsonCssExtractionStrategy( + schema={ + "name": "Blog Posts", + "baseSelector": "div.article-list", + "fields": [ + { + "name": "posts", + "selector": "article.post", + "type": "nested_list", + "baseFields": [ + {"name": "category", "type": "attribute", "attribute": "data-category"}, + {"name": "author_id", "type": "attribute", "attribute": "data-author"} + ], + "fields": [ + { + "name": "title", + "selector": "h2.title a", + "type": "text", + "baseFields": [ + {"name": "url", "type": "attribute", "attribute": "href"} + ] + }, + { + "name": "author", + "selector": "div.meta a.author", + "type": "text", + "baseFields": [ + {"name": "profile_url", "type": "attribute", "attribute": "href"} + ] + }, + { + "name": "date", + "selector": "span.date", + "type": "text" + }, + { + "name": "read_more", + "selector": "a.read-more", + "type": "nested", + "fields": [ + {"name": "text", "type": "text"}, + {"name": "url", "type": "attribute", "attribute": "href"} + ] + } + ] + } + ] + } + ) + + # Demonstrate extraction from raw HTML + run_config = CrawlerRunConfig( + extraction_strategy=json_strategy, + cache_mode=CacheMode.BYPASS + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="raw:" + SAMPLE_HTML, # Use raw: prefix for raw HTML + config=run_config + ) + print("Extracted Content:") + print(result.extracted_content) + +async def demo_input_formats(): + """ + Input Format Handling Demo + ---------------------- + + Demonstrates how LLM extraction can work with different input formats: + 1. Markdown (default) - Good for simple text extraction + 2. HTML - Better when you need structure and attributes + + This example shows how HTML input can be beneficial when: + - You need to understand the DOM structure + - You want to extract both visible text and HTML attributes + - The content has complex layouts like tables or forms + """ + print("\n4. Input Format Handling Demo") + print("---------------------------") + + # Create a dummy HTML with rich structure + dummy_html = """ +
+
+async def demo_json_extraction():
+    """
+    Advanced JSON Extraction Demo
+    -----------------------------
+
+    Demonstrates the enhanced JSON extraction capabilities:
+    1. Using different input formats (markdown, html)
+    2. Base element attributes extraction
+    3. Complex nested structures
+    4. Multiple extraction patterns
+
+    Key features shown:
+    - Extracting from different input formats (markdown vs html)
+    - Extracting attributes from base elements (href, data-* attributes)
+    - Processing repeated patterns
+    - Handling optional fields
+    - Computing derived values
+    """
+    print("\n3. Improved JSON Extraction Demo")
+    print("--------------------------------")
+
+    # Define the extraction schema with base element attributes
+    json_strategy = JsonCssExtractionStrategy(
+        schema={
+            "name": "Blog Posts",
+            "baseSelector": "div.article-list",
+            "fields": [
+                {
+                    "name": "posts",
+                    "selector": "article.post",
+                    "type": "nested_list",
+                    "baseFields": [
+                        {"name": "category", "type": "attribute", "attribute": "data-category"},
+                        {"name": "author_id", "type": "attribute", "attribute": "data-author"}
+                    ],
+                    "fields": [
+                        {
+                            "name": "title",
+                            "selector": "h2.title a",
+                            "type": "text",
+                            "baseFields": [
+                                {"name": "url", "type": "attribute", "attribute": "href"}
+                            ]
+                        },
+                        {
+                            "name": "author",
+                            "selector": "div.meta a.author",
+                            "type": "text",
+                            "baseFields": [
+                                {"name": "profile_url", "type": "attribute", "attribute": "href"}
+                            ]
+                        },
+                        {
+                            "name": "date",
+                            "selector": "span.date",
+                            "type": "text"
+                        },
+                        {
+                            "name": "read_more",
+                            "selector": "a.read-more",
+                            "type": "nested",
+                            "fields": [
+                                {"name": "text", "type": "text"},
+                                {"name": "url", "type": "attribute", "attribute": "href"}
+                            ]
+                        }
+                    ]
+                }
+            ]
+        }
+    )
+
+    # Demonstrate extraction from raw HTML
+    run_config = CrawlerRunConfig(
+        extraction_strategy=json_strategy,
+        cache_mode=CacheMode.BYPASS
+    )
+
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            url="raw:" + SAMPLE_HTML,  # Use the raw: prefix to crawl raw HTML directly
+            config=run_config
+        )
+        print("Extracted Content:")
+        print(result.extracted_content)
+
+async def demo_input_formats():
+    """
+    Input Format Handling Demo
+    --------------------------
+
+    Demonstrates how LLM extraction can work with different input formats:
+    1. Markdown (default) - Good for simple text extraction
+    2. HTML - Better when you need structure and attributes
+
+    This example shows how HTML input can be beneficial when:
+    - You need to understand the DOM structure
+    - You want to extract both visible text and HTML attributes
+    - The content has complex layouts like tables or forms
+    """
+    print("\n4. Input Format Handling Demo")
+    print("-----------------------------")
+
+    # Create a dummy HTML page with rich structure
+    dummy_html = """
+    <div class="job-posting">
+        <header class="job-header">
+            <h1 class="job-title">Senior AI/ML Engineer</h1>
+            <div class="job-meta">
+                <span class="department">AI Research Division</span>
+                <span class="location">San Francisco (Hybrid)</span>
+            </div>
+            <div class="salary-info">
+                <span class="salary-range">$150,000 - $220,000</span>
+                <span class="salary-period">per year</span>
+            </div>
+        </header>
+
+        <section class="requirements">
+            <h2>Technical Requirements</h2>
+            <ul class="technical-skills">
+                <li class="required">5+ years experience in Machine Learning</li>
+                <li class="required">Proficiency in Python and PyTorch/TensorFlow</li>
+                <li class="preferred">Experience with distributed training systems</li>
+            </ul>
+
+            <h2>Professional Skills</h2>
+            <ul class="soft-skills">
+                <li class="required">Strong problem-solving abilities</li>
+                <li class="preferred">Experience leading technical teams</li>
+            </ul>
+        </section>
+    </div>
+    """
+
+    # Use the raw:// prefix to pass HTML content directly
+    url = f"raw://{dummy_html}"
+
+    # Define our schema using Pydantic
+    class JobRequirement(BaseModel):
+        category: str = Field(description="Category of the requirement (e.g., Technical, Soft Skills)")
+        items: List[str] = Field(description="List of specific requirements in this category")
+        priority: str = Field(description="Priority level (Required/Preferred) based on the HTML class or context")
+
+    class JobPosting(BaseModel):
+        title: str = Field(description="Job title")
+        department: str = Field(description="Department or team")
+        location: str = Field(description="Job location, including remote options")
+        salary_range: Optional[str] = Field(description="Salary range if specified")
+        requirements: List[JobRequirement] = Field(description="Categorized job requirements")
+        application_deadline: Optional[str] = Field(description="Application deadline if specified")
+        contact_info: Optional[dict] = Field(description="Contact information from footer or contact section")
+
+    # First try with markdown (default)
+    markdown_strategy = LLMExtractionStrategy(
+        provider="openai/gpt-4o",
+        api_token=os.getenv("OPENAI_API_KEY"),
+        schema=JobPosting.model_json_schema(),
+        extraction_type="schema",
+        instruction="""
+        Extract job posting details into structured data. Focus on the visible text content
+        and organize requirements into categories.
+        """,
+        input_format="markdown"  # default
+    )
+
+    # Then with HTML for better structure understanding
+    html_strategy = LLMExtractionStrategy(
+        provider="openai/gpt-4o",
+        api_token=os.getenv("OPENAI_API_KEY"),
+        schema=JobPosting.model_json_schema(),
+        extraction_type="schema",
+        instruction="""
+        Extract job posting details, using the HTML structure to:
+        1. Identify requirement priorities from CSS classes (e.g., 'required' vs 'preferred')
+        2. Extract contact info from the page footer or a dedicated contact section
+        3. Parse salary information from specially formatted elements
+        4. Determine the application deadline from timestamp or date elements
+
+        Use HTML attributes and classes to enhance extraction accuracy.
+        """,
+        input_format="html"  # explicitly use HTML
+    )
+
+    async with AsyncWebCrawler() as crawler:
+        # Try with markdown first
+        markdown_config = CrawlerRunConfig(
+            extraction_strategy=markdown_strategy
+        )
+        markdown_result = await crawler.arun(
+            url=url,
+            config=markdown_config
+        )
+        print("\nMarkdown-based Extraction Result:")
+        items = json.loads(markdown_result.extracted_content)
+        print(json.dumps(items, indent=2))
+
+        # Then with HTML for better structure understanding
+        html_config = CrawlerRunConfig(
+            extraction_strategy=html_strategy
+        )
+        html_result = await crawler.arun(
+            url=url,
+            config=html_config
+        )
+        print("\nHTML-based Extraction Result:")
+        items = json.loads(html_result.extracted_content)
+        print(json.dumps(items, indent=2))
+
+# Main execution
+async def main():
+    print("Crawl4AI v0.4.24 Feature Walkthrough")
+    print("====================================")
+
+    # Run all demos
+    await demo_ssl_features()
+    await demo_content_filtering()
+    await demo_json_extraction()
+    await demo_input_formats()
+
+if __name__ == "__main__":
+    asyncio.run(main())