feat(browser): add standalone CDP browser launch and lxml extraction strategy

Add new features to enhance browser automation and HTML extraction:
- Add CDP browser launch capability with customizable ports and profiles
- Implement JsonLxmlExtractionStrategy for faster HTML parsing
- Add CLI command 'crwl cdp' for launching standalone CDP browsers
- Support connecting to external CDP browsers via URL
- Optimize selector caching and context-sensitive queries

BREAKING CHANGE: LLMConfig import path changed from crawl4ai.types to crawl4ai
This commit is contained in:
UncleCode
2025-03-07 20:55:56 +08:00
parent f78c46446b
commit a68cbb232b
22 changed files with 745 additions and 29 deletions

View File

@@ -11,7 +11,7 @@ import asyncio
import os
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
-from crawl4ai.types import LLMConfig
+from crawl4ai import LLMConfig
from crawl4ai.extraction_strategy import (
LLMExtractionStrategy,
JsonCssExtractionStrategy,

View File

@@ -1,4 +1,4 @@
-from crawl4ai.types import LLMConfig
+from crawl4ai import LLMConfig
from crawl4ai import AsyncWebCrawler, LLMExtractionStrategy
import asyncio
import os

View File

@@ -1,7 +1,7 @@
import os
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
-from crawl4ai.types import LLMConfig
+from crawl4ai import LLMConfig
from crawl4ai.content_filter_strategy import LLMContentFilter
async def test_llm_filter():

View File

@@ -1,6 +1,6 @@
import os, sys
-from crawl4ai.types import LLMConfig
+from crawl4ai import LLMConfig
sys.path.append(
os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

View File

@@ -1,6 +1,6 @@
import os, sys
-from crawl4ai.types import LLMConfig
+from crawl4ai import LLMConfig
# append parent directory to system path
sys.path.append(

View File

@@ -1,6 +1,6 @@
import os
import time
-from crawl4ai.types import LLMConfig
+from crawl4ai import LLMConfig
from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import *
from crawl4ai.extraction_strategy import *

View File

@@ -17,7 +17,7 @@ from crawl4ai.configs import ProxyConfig
from crawl4ai import RoundRobinProxyStrategy
from crawl4ai.content_filter_strategy import LLMContentFilter
from crawl4ai import DefaultMarkdownGenerator
-from crawl4ai.types import LLMConfig
+from crawl4ai import LLMConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
from pprint import pprint