refactor(scraping): replace ScrapingMode enum with strategy pattern
Replace the ScrapingMode enum with a proper strategy pattern implementation for content scraping. This change introduces: - New ContentScrapingStrategy abstract base class - Concrete WebScrapingStrategy and LXMLWebScrapingStrategy implementations - New Pydantic models for structured scraping results - Updated documentation reflecting the new strategy-based approach BREAKING CHANGE: ScrapingMode enum has been removed. Users should now use ContentScrapingStrategy implementations instead.
This commit is contained in:
21
tests/test_scraping_strategy.py
Normal file
21
tests/test_scraping_strategy.py
Normal file
@@ -0,0 +1,21 @@
|
||||
import nest_asyncio
|
||||
nest_asyncio.apply()
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LXMLWebScrapingStrategy, CacheMode
|
||||
|
||||
async def main():
|
||||
config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
scraping_strategy=LXMLWebScrapingStrategy() # Faster alternative to default BeautifulSoup
|
||||
)
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
config=config
|
||||
)
|
||||
print(f"Success: {result.success}")
|
||||
print(f"Markdown length: {len(result.markdown_v2.raw_markdown)}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
Reference in New Issue
Block a user