feat(browser): add standalone CDP browser launch and lxml extraction strategy
Add new features to enhance browser automation and HTML extraction: - Add CDP browser launch capability with customizable ports and profiles - Implement JsonLxmlExtractionStrategy for faster HTML parsing - Add CLI command 'crwl cdp' for launching standalone CDP browsers - Support connecting to external CDP browsers via URL - Optimize selector caching and context-sensitive queries BREAKING CHANGE: LLMConfig import path changed from crawl4ai.types to crawl4ai
This commit is contained in:
@@ -11,7 +11,7 @@ import asyncio
|
||||
import os
|
||||
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
from crawl4ai.types import LLMConfig
|
||||
from crawl4ai import LLMConfig
|
||||
from crawl4ai.extraction_strategy import (
|
||||
LLMExtractionStrategy,
|
||||
JsonCssExtractionStrategy,
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from crawl4ai.types import LLMConfig
|
||||
from crawl4ai import LLMConfig
|
||||
from crawl4ai import AsyncWebCrawler, LLMExtractionStrategy
|
||||
import asyncio
|
||||
import os
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import os
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
from crawl4ai.types import LLMConfig
|
||||
from crawl4ai import LLMConfig
|
||||
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||
|
||||
async def test_llm_filter():
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import os, sys
|
||||
|
||||
from crawl4ai.types import LLMConfig
|
||||
from crawl4ai import LLMConfig
|
||||
|
||||
sys.path.append(
|
||||
os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import os, sys
|
||||
|
||||
from crawl4ai.types import LLMConfig
|
||||
from crawl4ai import LLMConfig
|
||||
|
||||
# append parent directory to system path
|
||||
sys.path.append(
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import os
|
||||
import time
|
||||
from crawl4ai.types import LLMConfig
|
||||
from crawl4ai import LLMConfig
|
||||
from crawl4ai.web_crawler import WebCrawler
|
||||
from crawl4ai.chunking_strategy import *
|
||||
from crawl4ai.extraction_strategy import *
|
||||
|
||||
@@ -17,7 +17,7 @@ from crawl4ai.configs import ProxyConfig
|
||||
from crawl4ai import RoundRobinProxyStrategy
|
||||
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||
from crawl4ai import DefaultMarkdownGenerator
|
||||
from crawl4ai.types import LLMConfig
|
||||
from crawl4ai import LLMConfig
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
|
||||
from pprint import pprint
|
||||
|
||||
@@ -131,7 +131,7 @@ OverlappingWindowChunking(
|
||||
```python
|
||||
from pydantic import BaseModel
|
||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||
from crawl4ai.types import LLMConfig
|
||||
from crawl4ai import LLMConfig
|
||||
|
||||
# Define schema
|
||||
class Article(BaseModel):
|
||||
@@ -198,7 +198,7 @@ result = await crawler.arun(
|
||||
|
||||
```python
|
||||
from crawl4ai.chunking_strategy import OverlappingWindowChunking
|
||||
from crawl4ai.types import LLMConfig
|
||||
from crawl4ai import LLMConfig
|
||||
|
||||
# Create chunking strategy
|
||||
chunker = OverlappingWindowChunking(
|
||||
|
||||
@@ -305,7 +305,7 @@ asyncio.run(main())
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator
|
||||
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||
from crawl4ai.types import LLMConfig
|
||||
from crawl4ai import LLMConfig
|
||||
import asyncio
|
||||
|
||||
llm_config = LLMConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY")
|
||||
@@ -335,7 +335,7 @@ asyncio.run(main())
|
||||
|
||||
```python
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
from crawl4ai.types import LLMConfig
|
||||
from crawl4ai import LLMConfig
|
||||
|
||||
llm_config = LLMConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY")
|
||||
|
||||
@@ -401,7 +401,7 @@ print(schema)
|
||||
experimentation between different LLM configurations.
|
||||
|
||||
```python
|
||||
from crawl4ai.types import LLMConfig
|
||||
from crawl4ai import LLMConfig
|
||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
|
||||
|
||||
@@ -128,7 +128,7 @@ Crawl4AI can also extract structured data (JSON) using CSS or XPath selectors. B
|
||||
|
||||
```python
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
from crawl4ai.types import LLMConfig
|
||||
from crawl4ai import LLMConfig
|
||||
|
||||
# Generate a schema (one-time cost)
|
||||
html = "<div class='product'><h2>Gaming Laptop</h2><span class='price'>$999.99</span></div>"
|
||||
|
||||
@@ -415,7 +415,7 @@ The schema generator is available as a static method on both `JsonCssExtractionS
|
||||
|
||||
```python
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
|
||||
from crawl4ai.types import LLMConfig
|
||||
from crawl4ai import LLMConfig
|
||||
|
||||
# Sample HTML with product information
|
||||
html = """
|
||||
|
||||
Reference in New Issue
Block a user