Enhance crawler capabilities and documentation

- Add llm.txt generator.
- Add SSL certificate extraction in AsyncWebCrawler.
- Introduce new content filters and chunking strategies for more robust data extraction.
- Update documentation.
This commit is contained in:
UncleCode
2024-12-25 21:34:31 +08:00
parent 84b311760f
commit d5ed451299
59 changed files with 2208 additions and 1763 deletions

View File

@@ -23,11 +23,7 @@ from .config import SCREENSHOT_HEIGHT_TRESHOLD, DOWNLOAD_PAGE_TIMEOUT
from .async_configs import BrowserConfig, CrawlerRunConfig
from .async_logger import AsyncLogger
from playwright_stealth import StealthConfig, stealth_async
from io import BytesIO
import base64
from PIL import Image, ImageDraw, ImageFont
from .utilities.ssl_utils import get_ssl_certificate
stealth_config = StealthConfig(
webdriver=True,
@@ -566,18 +562,6 @@ class AsyncCrawlerStrategy(ABC):
async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]:
pass
@abstractmethod
async def take_screenshot(self, **kwargs) -> str:
pass
@abstractmethod
def update_user_agent(self, user_agent: str):
pass
@abstractmethod
def set_hook(self, hook_type: str, hook: Callable):
pass
class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
def __init__(
@@ -928,6 +912,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
page.on("pageerror", lambda e: log_consol(e, "error"))
try:
# Get SSL certificate information if requested and URL is HTTPS
ssl_certificate = None
if config.fetch_ssl_certificate and url.startswith('https://'):
ssl_certificate = get_ssl_certificate(url)
# Set up download handling
if self.browser_config.accept_downloads:
page.on(
@@ -1155,6 +1144,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
screenshot=screenshot_data,
pdf_data=pdf_data,
get_delayed_content=get_delayed_content,
ssl_certificate=ssl_certificate,
downloaded_files=(
self._downloaded_files if self._downloaded_files else None
),