feat(browser): add standalone CDP browser launch and lxml extraction strategy
Add new features to enhance browser automation and HTML extraction: - Add CDP browser launch capability with customizable ports and profiles - Implement JsonLxmlExtractionStrategy for faster HTML parsing - Add CLI command 'crwl cdp' for launching standalone CDP browsers - Support connecting to external CDP browsers via URL - Optimize selector caching and context-sensitive queries BREAKING CHANGE: LLMConfig import path changed from crawl4ai.types to crawl4ai
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
import os
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
from crawl4ai.types import LLMConfig
|
||||
from crawl4ai import LLMConfig
|
||||
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||
|
||||
async def test_llm_filter():
|
||||
|
||||
@@ -7,7 +7,7 @@ import json
|
||||
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
sys.path.append(parent_dir)
|
||||
|
||||
from crawl4ai.types import LLMConfig
|
||||
from crawl4ai import LLMConfig
|
||||
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
||||
from crawl4ai.chunking_strategy import RegexChunking
|
||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||
|
||||
17
tests/browser/test_launch_standalone.py
Normal file
17
tests/browser/test_launch_standalone.py
Normal file
@@ -0,0 +1,17 @@
|
||||
from crawl4ai.browser_profiler import BrowserProfiler
|
||||
import asyncio
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Test launching a standalone browser
|
||||
async def test_standalone_browser():
|
||||
profiler = BrowserProfiler()
|
||||
cdp_url = await profiler.launch_standalone_browser(
|
||||
browser_type="chromium",
|
||||
user_data_dir="~/.crawl4ai/browser_profile/test-browser-data",
|
||||
debugging_port=9222,
|
||||
headless=False
|
||||
)
|
||||
print(f"CDP URL: {cdp_url}")
|
||||
|
||||
asyncio.run(test_standalone_browser())
|
||||
@@ -7,7 +7,7 @@ from crawl4ai import (
|
||||
BrowserConfig, CrawlerRunConfig, DefaultMarkdownGenerator,
|
||||
PruningContentFilter, JsonCssExtractionStrategy, LLMContentFilter, CacheMode
|
||||
)
|
||||
from crawl4ai.types import LLMConfig
|
||||
from crawl4ai import LLMConfig
|
||||
from crawl4ai.docker_client import Crawl4aiDockerClient
|
||||
|
||||
class Crawl4AiTester:
|
||||
|
||||
@@ -2,7 +2,7 @@ import inspect
|
||||
from typing import Any, Dict
|
||||
from enum import Enum
|
||||
|
||||
from crawl4ai.types import LLMConfig
|
||||
from crawl4ai import LLMConfig
|
||||
|
||||
def to_serializable_dict(obj: Any) -> Dict:
|
||||
"""
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
import unittest, os
|
||||
from crawl4ai.types import LLMConfig
|
||||
from crawl4ai import LLMConfig
|
||||
from crawl4ai.web_crawler import WebCrawler
|
||||
from crawl4ai.chunking_strategy import (
|
||||
RegexChunking,
|
||||
|
||||
Reference in New Issue
Block a user