feat(browser): add standalone CDP browser launch and lxml extraction strategy

Add new features to enhance browser automation and HTML extraction:
- Add CDP browser launch capability with customizable ports and profiles
- Implement JsonLxmlExtractionStrategy for faster HTML parsing
- Add CLI command 'crwl cdp' for launching standalone CDP browsers
- Support connecting to external CDP browsers via URL
- Optimize selector caching and context-sensitive queries

BREAKING CHANGE: LLMConfig import path changed from crawl4ai.types to crawl4ai
This commit is contained in:
UncleCode
2025-03-07 20:55:56 +08:00
parent f78c46446b
commit a68cbb232b
22 changed files with 745 additions and 29 deletions

View File

@@ -11,7 +11,7 @@ import asyncio
import os
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.types import LLMConfig
from crawl4ai import LLMConfig
from crawl4ai.extraction_strategy import (
LLMExtractionStrategy,
JsonCssExtractionStrategy,

View File

@@ -1,4 +1,4 @@
from crawl4ai.types import LLMConfig
from crawl4ai import LLMConfig
from crawl4ai import AsyncWebCrawler, LLMExtractionStrategy
import asyncio
import os

View File

@@ -1,7 +1,7 @@
import os
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.types import LLMConfig
from crawl4ai import LLMConfig
from crawl4ai.content_filter_strategy import LLMContentFilter
async def test_llm_filter():

View File

@@ -1,6 +1,6 @@
import os, sys
from crawl4ai.types import LLMConfig
from crawl4ai import LLMConfig
sys.path.append(
os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

View File

@@ -1,6 +1,6 @@
import os, sys
from crawl4ai.types import LLMConfig
from crawl4ai import LLMConfig
# append parent directory to system path
sys.path.append(

View File

@@ -1,6 +1,6 @@
import os
import time
from crawl4ai.types import LLMConfig
from crawl4ai import LLMConfig
from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import *
from crawl4ai.extraction_strategy import *

View File

@@ -17,7 +17,7 @@ from crawl4ai.configs import ProxyConfig
from crawl4ai import RoundRobinProxyStrategy
from crawl4ai.content_filter_strategy import LLMContentFilter
from crawl4ai import DefaultMarkdownGenerator
from crawl4ai.types import LLMConfig
from crawl4ai import LLMConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
from pprint import pprint

View File

@@ -131,7 +131,7 @@ OverlappingWindowChunking(
```python
from pydantic import BaseModel
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from crawl4ai.types import LLMConfig
from crawl4ai import LLMConfig
# Define schema
class Article(BaseModel):
@@ -198,7 +198,7 @@ result = await crawler.arun(
```python
from crawl4ai.chunking_strategy import OverlappingWindowChunking
from crawl4ai.types import LLMConfig
from crawl4ai import LLMConfig
# Create chunking strategy
chunker = OverlappingWindowChunking(

View File

@@ -305,7 +305,7 @@ asyncio.run(main())
```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import LLMContentFilter
from crawl4ai.types import LLMConfig
from crawl4ai import LLMConfig
import asyncio
llm_config = LLMConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY")
@@ -335,7 +335,7 @@ asyncio.run(main())
```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai.types import LLMConfig
from crawl4ai import LLMConfig
llm_config = LLMConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY")
@@ -401,7 +401,7 @@ print(schema)
experimentation between different LLM configurations.
```python
from crawl4ai.types import LLMConfig
from crawl4ai import LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

View File

@@ -128,7 +128,7 @@ Crawl4AI can also extract structured data (JSON) using CSS or XPath selectors. B
```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai.types import LLMConfig
from crawl4ai import LLMConfig
# Generate a schema (one-time cost)
html = "<div class='product'><h2>Gaming Laptop</h2><span class='price'>$999.99</span></div>"

View File

@@ -415,7 +415,7 @@ The schema generator is available as a static method on both `JsonCssExtractionS
```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
from crawl4ai.types import LLMConfig
from crawl4ai import LLMConfig
# Sample HTML with product information
html = """