diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py
index dfda9e28..73e5c025 100644
--- a/crawl4ai/__version__.py
+++ b/crawl4ai/__version__.py
@@ -1,2 +1,2 @@
# crawl4ai/_version.py
-__version__ = "0.4.25"
+__version__ = "0.4.24"
diff --git a/docs/examples/v0_4_24_walkthrough.py b/docs/examples/v0_4_24_walkthrough.py
new file mode 100644
index 00000000..61477b78
--- /dev/null
+++ b/docs/examples/v0_4_24_walkthrough.py
@@ -0,0 +1,387 @@
+"""
+Crawl4AI v0.4.24 Feature Walkthrough
+===================================
+
+This script demonstrates the new features introduced in Crawl4AI v0.4.24.
+Each section includes detailed examples and explanations of the new capabilities.
+"""
+
+import asyncio
+import os
+import json
+from typing import List, Optional, Dict, Any
+from pydantic import BaseModel, Field
+from crawl4ai import (
+ AsyncWebCrawler,
+ BrowserConfig,
+ CrawlerRunConfig,
+ CacheMode,
+ LLMExtractionStrategy
+)
+from crawl4ai.content_filter_strategy import PruningContentFilter
+
+# Sample HTML for demonstrations
+SAMPLE_HTML = """
+
+
+
+
+
+
First post content...
+
Read More
+
+
+
+
+
+
+
Second post content...
+
Read More
+
+
+
+"""
+
+async def demo_ssl_features():
+ """
+ Enhanced SSL & Security Features Demo
+ -----------------------------------
+
+ This example demonstrates the new SSL certificate handling and security features:
+ 1. Custom certificate paths
+ 2. SSL verification options
+ 3. HTTPS error handling
+ 4. Certificate validation configurations
+
+ These features are particularly useful when:
+ - Working with self-signed certificates
+ - Dealing with corporate proxies
+ - Handling mixed content websites
+ - Managing different SSL security levels
+ """
+ print("\n1. Enhanced SSL & Security Demo")
+ print("--------------------------------")
+
+ browser_config = BrowserConfig(
+ ignore_https_errors=True,
+ verbose=True
+ )
+
+ run_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ fetch_ssl_certificate=True # Enable SSL certificate fetching
+ )
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(
+ url="https://example.com",
+ config=run_config
+ )
+ print(f"SSL Crawl Success: {result.success}")
+ if not result.success:
+ print(f"SSL Error: {result.error_message}")
+
+async def demo_content_filtering():
+ """
+ Smart Content Filtering Demo
+ --------------------------
+
+ Demonstrates the new content filtering system with:
+ 1. Regular expression pattern matching
+ 2. Length-based filtering
+ 3. Custom filtering rules
+ 4. Content chunking strategies
+
+ This is particularly useful for:
+ - Removing advertisements and boilerplate content
+ - Extracting meaningful paragraphs
+ - Filtering out irrelevant sections
+ - Processing content in manageable chunks
+ """
+ print("\n2. Smart Content Filtering Demo")
+ print("--------------------------------")
+
+ content_filter = PruningContentFilter(
+ min_word_threshold=50,
+ threshold_type='dynamic',
+ threshold=0.5
+ )
+
+ run_config = CrawlerRunConfig(
+ content_filter=content_filter,
+ cache_mode=CacheMode.BYPASS
+ )
+
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(
+ url="https://news.ycombinator.com",
+ config=run_config
+ )
+ print("Filtered Content Sample:")
+ print(result.markdown[:500] + "...\n")
+
+async def demo_json_extraction():
+ """
+ Advanced JSON Extraction Demo
+ ---------------------------
+
+ Demonstrates the enhanced JSON extraction capabilities:
+ 1. Using different input formats (markdown, html)
+ 2. Base element attributes extraction
+ 3. Complex nested structures
+ 4. Multiple extraction patterns
+
+ Key features shown:
+ - Extracting from different input formats (markdown vs html)
+ - Extracting attributes from base elements (href, data-* attributes)
+ - Processing repeated patterns
+ - Handling optional fields
+ - Computing derived values
+ """
+ print("\n3. Improved JSON Extraction Demo")
+ print("--------------------------------")
+
+ # Define the extraction schema with base element attributes
+ json_strategy = JsonCssExtractionStrategy(
+ schema={
+ "name": "Blog Posts",
+ "baseSelector": "div.article-list",
+ "fields": [
+ {
+ "name": "posts",
+ "selector": "article.post",
+ "type": "nested_list",
+ "baseFields": [
+ {"name": "category", "type": "attribute", "attribute": "data-category"},
+ {"name": "author_id", "type": "attribute", "attribute": "data-author"}
+ ],
+ "fields": [
+ {
+ "name": "title",
+ "selector": "h2.title a",
+ "type": "text",
+ "baseFields": [
+ {"name": "url", "type": "attribute", "attribute": "href"}
+ ]
+ },
+ {
+ "name": "author",
+ "selector": "div.meta a.author",
+ "type": "text",
+ "baseFields": [
+ {"name": "profile_url", "type": "attribute", "attribute": "href"}
+ ]
+ },
+ {
+ "name": "date",
+ "selector": "span.date",
+ "type": "text"
+ },
+ {
+ "name": "read_more",
+ "selector": "a.read-more",
+ "type": "nested",
+ "fields": [
+ {"name": "text", "type": "text"},
+ {"name": "url", "type": "attribute", "attribute": "href"}
+ ]
+ }
+ ]
+ }
+ ]
+ }
+ )
+
+ # Demonstrate extraction from raw HTML
+ run_config = CrawlerRunConfig(
+ extraction_strategy=json_strategy,
+ cache_mode=CacheMode.BYPASS
+ )
+
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(
+ url="raw:" + SAMPLE_HTML, # Use raw: prefix for raw HTML
+ config=run_config
+ )
+ print("Extracted Content:")
+ print(result.extracted_content)
+
+async def demo_input_formats():
+ """
+ Input Format Handling Demo
+ ----------------------
+
+ Demonstrates how LLM extraction can work with different input formats:
+ 1. Markdown (default) - Good for simple text extraction
+ 2. HTML - Better when you need structure and attributes
+
+ This example shows how HTML input can be beneficial when:
+ - You need to understand the DOM structure
+ - You want to extract both visible text and HTML attributes
+ - The content has complex layouts like tables or forms
+ """
+ print("\n4. Input Format Handling Demo")
+ print("---------------------------")
+
+ # Create a dummy HTML with rich structure
+ dummy_html = """
+
+
+
+
+
+
Technical Requirements
+
+ -
+ 5+ years experience in Machine Learning
+
+ -
+ Proficiency in Python and PyTorch/TensorFlow
+
+ -
+ Experience with distributed training systems
+
+
+
+
+
+
Professional Skills
+
+ -
+ Strong problem-solving abilities
+
+ -
+ Experience leading technical teams
+
+
+
+
+
+
+
+
+
+
+
+ """
+
+ # Use raw:// prefix to pass HTML content directly
+ url = f"raw://{dummy_html}"
+
+ from pydantic import BaseModel, Field
+ from typing import List, Optional
+
+ # Define our schema using Pydantic
+ class JobRequirement(BaseModel):
+ category: str = Field(description="Category of the requirement (e.g., Technical, Soft Skills)")
+ items: List[str] = Field(description="List of specific requirements in this category")
+ priority: str = Field(description="Priority level (Required/Preferred) based on the HTML class or context")
+
+ class JobPosting(BaseModel):
+ title: str = Field(description="Job title")
+ department: str = Field(description="Department or team")
+ location: str = Field(description="Job location, including remote options")
+ salary_range: Optional[str] = Field(description="Salary range if specified")
+ requirements: List[JobRequirement] = Field(description="Categorized job requirements")
+ application_deadline: Optional[str] = Field(description="Application deadline if specified")
+ contact_info: Optional[dict] = Field(description="Contact information from footer or contact section")
+
+ # First try with markdown (default)
+ markdown_strategy = LLMExtractionStrategy(
+ provider="openai/gpt-4o",
+ api_token=os.getenv("OPENAI_API_KEY"),
+ schema=JobPosting.model_json_schema(),
+ extraction_type="schema",
+ instruction="""
+ Extract job posting details into structured data. Focus on the visible text content
+ and organize requirements into categories.
+ """,
+ input_format="markdown" # default
+ )
+
+ # Then with HTML for better structure understanding
+ html_strategy = LLMExtractionStrategy(
+ provider="openai/gpt-4",
+ api_token=os.getenv("OPENAI_API_KEY"),
+ schema=JobPosting.model_json_schema(),
+ extraction_type="schema",
+ instruction="""
+ Extract job posting details, using HTML structure to:
+ 1. Identify requirement priorities from CSS classes (e.g., 'required' vs 'preferred')
+ 2. Extract contact info from the page footer or dedicated contact section
+ 3. Parse salary information from specially formatted elements
+ 4. Determine application deadline from timestamp or date elements
+
+ Use HTML attributes and classes to enhance extraction accuracy.
+ """,
+ input_format="html" # explicitly use HTML
+ )
+
+ async with AsyncWebCrawler() as crawler:
+ # Try with markdown first
+ markdown_config = CrawlerRunConfig(
+ extraction_strategy=markdown_strategy
+ )
+ markdown_result = await crawler.arun(
+ url=url,
+ config=markdown_config
+ )
+ print("\nMarkdown-based Extraction Result:")
+ items = json.loads(markdown_result.extracted_content)
+ print(json.dumps(items, indent=2))
+
+ # Then with HTML for better structure understanding
+ html_config = CrawlerRunConfig(
+ extraction_strategy=html_strategy
+ )
+ html_result = await crawler.arun(
+ url=url,
+ config=html_config
+ )
+ print("\nHTML-based Extraction Result:")
+ items = json.loads(html_result.extracted_content)
+ print(json.dumps(items, indent=2))
+
+# Main execution
+async def main():
+ print("Crawl4AI v0.4.24 Feature Walkthrough")
+ print("====================================")
+
+ # Run all demos
+ # await demo_ssl_features()
+ # await demo_content_filtering()
+ # await demo_json_extraction()
+ await demo_input_formats()
+
+if __name__ == "__main__":
+ asyncio.run(main())