Merge PR #899 into next, resolve conflicts in server.py and docs/browser-crawler-config.md

This commit is contained in:
unclecode
2025-04-22 14:56:47 +08:00
16 changed files with 132 additions and 140 deletions

View File

@@ -24,7 +24,7 @@ from .browser_manager import BrowserManager
import aiofiles import aiofiles
import aiohttp import aiohttp
import cchardet import chardet
from aiohttp.client import ClientTimeout from aiohttp.client import ClientTimeout
from urllib.parse import urlparse from urllib.parse import urlparse
from types import MappingProxyType from types import MappingProxyType
@@ -130,6 +130,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
Close the browser and clean up resources. Close the browser and clean up resources.
""" """
await self.browser_manager.close() await self.browser_manager.close()
# Explicitly reset the static Playwright instance
BrowserManager._playwright_instance = None
async def kill_session(self, session_id: str): async def kill_session(self, session_id: str):
""" """
@@ -679,14 +681,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
if console_log_type == "error": if console_log_type == "error":
self.logger.error( self.logger.error(
message=f"Console error: {msg}", # Use f-string for variable interpolation message=f"Console error: {msg}", # Use f-string for variable interpolation
tag="CONSOLE", tag="CONSOLE"
params={"msg": msg.text},
) )
elif console_log_type == "debug": elif console_log_type == "debug":
self.logger.debug( self.logger.debug(
message=f"Console: {msg}", # Use f-string for variable interpolation message=f"Console: {msg}", # Use f-string for variable interpolation
tag="CONSOLE", tag="CONSOLE"
params={"msg": msg.text},
) )
page.on("console", log_consol) page.on("console", log_consol)
@@ -967,7 +967,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
for selector in selectors: for selector in selectors:
try: try:
content = await page.evaluate(f"document.querySelector('{selector}')?.outerHTML || ''") content = await page.evaluate(
f"""Array.from(document.querySelectorAll("{selector}"))
.map(el => el.outerHTML)
.join('')"""
)
html_parts.append(content) html_parts.append(content)
except Error as e: except Error as e:
print(f"Warning: Could not get content for selector '{selector}': {str(e)}") print(f"Warning: Could not get content for selector '{selector}': {str(e)}")
@@ -1975,7 +1979,7 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy):
await self.start() await self.start()
yield self._session yield self._session
finally: finally:
await self.close() pass
def set_hook(self, hook_type: str, hook_func: Callable) -> None: def set_hook(self, hook_type: str, hook_func: Callable) -> None:
if hook_type in self.hooks: if hook_type in self.hooks:
@@ -2091,7 +2095,7 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy):
encoding = response.charset encoding = response.charset
if not encoding: if not encoding:
encoding = cchardet.detect(content.tobytes())['encoding'] or 'utf-8' encoding = chardet.detect(content.tobytes())['encoding'] or 'utf-8'
result = AsyncCrawlResponse( result = AsyncCrawlResponse(
html=content.tobytes().decode(encoding, errors='replace'), html=content.tobytes().decode(encoding, errors='replace'),

View File

@@ -4,6 +4,7 @@ from typing import Optional, Dict, Any
from colorama import Fore, Style, init from colorama import Fore, Style, init
import os import os
from datetime import datetime from datetime import datetime
from urllib.parse import unquote
class LogLevel(Enum): class LogLevel(Enum):
@@ -44,11 +45,11 @@ class AsyncLoggerBase(ABC):
pass pass
@abstractmethod @abstractmethod
def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50): def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 100):
pass pass
@abstractmethod @abstractmethod
def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50): def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 100):
pass pass
class AsyncLogger(AsyncLoggerBase): class AsyncLogger(AsyncLoggerBase):
@@ -130,6 +131,14 @@ class AsyncLogger(AsyncLoggerBase):
def _get_icon(self, tag: str) -> str: def _get_icon(self, tag: str) -> str:
"""Get the icon for a tag, defaulting to info icon if not found.""" """Get the icon for a tag, defaulting to info icon if not found."""
return self.icons.get(tag, self.icons["INFO"]) return self.icons.get(tag, self.icons["INFO"])
def _shorten(self, text, length, placeholder="..."):
"""Truncate text in the middle if longer than length, or pad if shorter."""
if len(text) <= length:
return text.ljust(length) # Pad with spaces to reach desired length
half = (length - len(placeholder)) // 2
shortened = text[:half] + placeholder + text[-half:]
return shortened.ljust(length) # Also pad shortened text to consistent length
def _write_to_file(self, message: str): def _write_to_file(self, message: str):
"""Write a message to the log file if configured.""" """Write a message to the log file if configured."""
@@ -259,7 +268,7 @@ class AsyncLogger(AsyncLoggerBase):
success: bool, success: bool,
timing: float, timing: float,
tag: str = "FETCH", tag: str = "FETCH",
url_length: int = 50, url_length: int = 100,
): ):
""" """
Convenience method for logging URL fetch status. Convenience method for logging URL fetch status.
@@ -271,14 +280,15 @@ class AsyncLogger(AsyncLoggerBase):
tag: Tag for the message tag: Tag for the message
url_length: Maximum length for URL in log url_length: Maximum length for URL in log
""" """
decoded_url = unquote(url)
readable_url = self._shorten(decoded_url, url_length)
self._log( self._log(
level=LogLevel.SUCCESS if success else LogLevel.ERROR, level=LogLevel.SUCCESS if success else LogLevel.ERROR,
message="{url:.{url_length}}... | Status: {status} | Time: {timing:.2f}s", message="{url} | {status} | : {timing:.2f}s",
tag=tag, tag=tag,
params={ params={
"url": url, "url": readable_url,
"url_length": url_length, "status": "" if success else "",
"status": success,
"timing": timing, "timing": timing,
}, },
colors={ colors={
@@ -299,11 +309,13 @@ class AsyncLogger(AsyncLoggerBase):
tag: Tag for the message tag: Tag for the message
url_length: Maximum length for URL in log url_length: Maximum length for URL in log
""" """
decoded_url = unquote(url)
readable_url = self._shorten(decoded_url, url_length)
self._log( self._log(
level=LogLevel.ERROR, level=LogLevel.ERROR,
message="{url:.{url_length}}... | Error: {error}", message="{url} | Error: {error}",
tag=tag, tag=tag,
params={"url": url, "url_length": url_length, "error": error}, params={"url": readable_url, "error": error},
) )
class AsyncFileLogger(AsyncLoggerBase): class AsyncFileLogger(AsyncLoggerBase):
@@ -347,13 +359,13 @@ class AsyncFileLogger(AsyncLoggerBase):
"""Log an error message to file.""" """Log an error message to file."""
self._write_to_file("ERROR", message, tag) self._write_to_file("ERROR", message, tag)
def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50): def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 100):
"""Log URL fetch status to file.""" """Log URL fetch status to file."""
status = "SUCCESS" if success else "FAILED" status = "SUCCESS" if success else "FAILED"
message = f"{url[:url_length]}... | Status: {status} | Time: {timing:.2f}s" message = f"{url[:url_length]}... | Status: {status} | Time: {timing:.2f}s"
self._write_to_file("URL_STATUS", message, tag) self._write_to_file("URL_STATUS", message, tag)
def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50): def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 100):
"""Log error status to file.""" """Log error status to file."""
message = f"{url[:url_length]}... | Error: {error}" message = f"{url[:url_length]}... | Error: {error}"
self._write_to_file("ERROR", message, tag) self._write_to_file("ERROR", message, tag)

View File

@@ -358,10 +358,11 @@ class AsyncWebCrawler:
html=html, html=html,
extracted_content=extracted_content, extracted_content=extracted_content,
config=config, # Pass the config object instead of individual parameters config=config, # Pass the config object instead of individual parameters
screenshot=screenshot_data, screenshot_data=screenshot_data,
pdf_data=pdf_data, pdf_data=pdf_data,
verbose=config.verbose, verbose=config.verbose,
is_raw_html=True if url.startswith("raw:") else False, is_raw_html=True if url.startswith("raw:") else False,
redirected_url=async_response.redirected_url,
**kwargs, **kwargs,
) )
@@ -380,18 +381,11 @@ class AsyncWebCrawler:
crawl_result.session_id = getattr( crawl_result.session_id = getattr(
config, "session_id", None) config, "session_id", None)
self.logger.success( self.logger.url_status(
message="{url:.50}... | Status: {status} | Total: {timing}", url=cache_context.display_url,
success=crawl_result.success,
timing=time.perf_counter() - start_time,
tag="COMPLETE", tag="COMPLETE",
params={
"url": cache_context.display_url,
"status": crawl_result.success,
"timing": f"{time.perf_counter() - start_time:.2f}s",
},
colors={
"status": Fore.GREEN if crawl_result.success else Fore.RED,
"timing": Fore.YELLOW,
},
) )
# Update cache if appropriate # Update cache if appropriate
@@ -401,17 +395,12 @@ class AsyncWebCrawler:
return CrawlResultContainer(crawl_result) return CrawlResultContainer(crawl_result)
else: else:
self.logger.success( self.logger.url_status(
message="{url:.50}... | Status: {status} | Total: {timing}", url=cache_context.display_url,
tag="COMPLETE", success=True,
params={ timing=time.perf_counter() - start_time,
"url": cache_context.display_url, tag="COMPLETE"
"status": True,
"timing": f"{time.perf_counter() - start_time:.2f}s",
},
colors={"status": Fore.GREEN, "timing": Fore.YELLOW},
) )
cached_result.success = bool(html) cached_result.success = bool(html)
cached_result.session_id = getattr( cached_result.session_id = getattr(
config, "session_id", None) config, "session_id", None)
@@ -446,7 +435,7 @@ class AsyncWebCrawler:
html: str, html: str,
extracted_content: str, extracted_content: str,
config: CrawlerRunConfig, config: CrawlerRunConfig,
screenshot: str, screenshot_data: str,
pdf_data: str, pdf_data: str,
verbose: bool, verbose: bool,
**kwargs, **kwargs,
@@ -459,7 +448,7 @@ class AsyncWebCrawler:
html: Raw HTML content html: Raw HTML content
extracted_content: Previously extracted content (if any) extracted_content: Previously extracted content (if any)
config: Configuration object controlling processing behavior config: Configuration object controlling processing behavior
screenshot: Screenshot data (if any) screenshot_data: Screenshot data (if any)
pdf_data: PDF data (if any) pdf_data: PDF data (if any)
verbose: Whether to enable verbose logging verbose: Whether to enable verbose logging
**kwargs: Additional parameters for backwards compatibility **kwargs: Additional parameters for backwards compatibility
@@ -564,20 +553,23 @@ class AsyncWebCrawler:
markdown_result: MarkdownGenerationResult = ( markdown_result: MarkdownGenerationResult = (
markdown_generator.generate_markdown( markdown_generator.generate_markdown(
input_html=markdown_input_html, input_html=markdown_input_html,
base_url=url, base_url=params.get("redirected_url", url)
# html2text_options=kwargs.get('html2text', {}) # html2text_options=kwargs.get('html2text', {})
) )
) )
# Log processing completion # Log processing completion
self.logger.info( self.logger.url_status(
message="{url:.50}... | Time: {timing}s", url=_url,
tag="SCRAPE", success=True,
params={ timing=int((time.perf_counter() - t1) * 1000) / 1000,
"url": _url, tag="SCRAPE"
"timing": int((time.perf_counter() - t1) * 1000) / 1000,
},
) )
# self.logger.info(
# message="{url:.50}... | Time: {timing}s",
# tag="SCRAPE",
# params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000) / 1000},
# )
################################ ################################
# Structured Content Extraction # # Structured Content Extraction #
@@ -624,10 +616,6 @@ class AsyncWebCrawler:
params={"url": _url, "timing": time.perf_counter() - t1}, params={"url": _url, "timing": time.perf_counter() - t1},
) )
# Handle screenshot and PDF data
screenshot_data = None if not screenshot else screenshot
pdf_data = None if not pdf_data else pdf_data
# Apply HTML formatting if requested # Apply HTML formatting if requested
if config.prettiify: if config.prettiify:
cleaned_html = fast_format_html(cleaned_html) cleaned_html = fast_format_html(cleaned_html)

View File

@@ -28,6 +28,7 @@ from lxml import etree
from lxml import html as lhtml from lxml import html as lhtml
from typing import List from typing import List
from .models import ScrapingResult, MediaItem, Link, Media, Links from .models import ScrapingResult, MediaItem, Link, Media, Links
import copy
# Pre-compile regular expressions for Open Graph and Twitter metadata # Pre-compile regular expressions for Open Graph and Twitter metadata
OG_REGEX = re.compile(r"^og:") OG_REGEX = re.compile(r"^og:")
@@ -48,7 +49,7 @@ def parse_srcset(s: str) -> List[Dict]:
if len(parts) >= 1: if len(parts) >= 1:
url = parts[0] url = parts[0]
width = ( width = (
parts[1].rstrip("w") parts[1].rstrip("w").split('.')[0]
if len(parts) > 1 and parts[1].endswith("w") if len(parts) > 1 and parts[1].endswith("w")
else None else None
) )
@@ -128,7 +129,8 @@ class WebScrapingStrategy(ContentScrapingStrategy):
Returns: Returns:
ScrapingResult: A structured result containing the scraped content. ScrapingResult: A structured result containing the scraped content.
""" """
raw_result = self._scrap(url, html, is_async=False, **kwargs) actual_url = kwargs.get("redirected_url", url)
raw_result = self._scrap(actual_url, html, is_async=False, **kwargs)
if raw_result is None: if raw_result is None:
return ScrapingResult( return ScrapingResult(
cleaned_html="", cleaned_html="",
@@ -619,6 +621,9 @@ class WebScrapingStrategy(ContentScrapingStrategy):
return False return False
keep_element = False keep_element = False
# Special case for table elements - always preserve structure
if element.name in ["tr", "td", "th"]:
keep_element = True
exclude_domains = kwargs.get("exclude_domains", []) exclude_domains = kwargs.get("exclude_domains", [])
# exclude_social_media_domains = kwargs.get('exclude_social_media_domains', set(SOCIAL_MEDIA_DOMAINS)) # exclude_social_media_domains = kwargs.get('exclude_social_media_domains', set(SOCIAL_MEDIA_DOMAINS))
@@ -859,6 +864,8 @@ class WebScrapingStrategy(ContentScrapingStrategy):
parser_type = kwargs.get("parser", "lxml") parser_type = kwargs.get("parser", "lxml")
soup = BeautifulSoup(html, parser_type) soup = BeautifulSoup(html, parser_type)
body = soup.body body = soup.body
if body is None:
raise Exception("'<body>' tag is not found in fetched html. Consider adding wait_for=\"css:body\" to wait for body tag to be loaded into DOM.")
base_domain = get_base_domain(url) base_domain = get_base_domain(url)
# Early removal of all images if exclude_all_images is set # Early removal of all images if exclude_all_images is set
@@ -897,23 +904,6 @@ class WebScrapingStrategy(ContentScrapingStrategy):
for element in body.select(excluded_selector): for element in body.select(excluded_selector):
element.extract() element.extract()
# if False and css_selector:
# selected_elements = body.select(css_selector)
# if not selected_elements:
# return {
# "markdown": "",
# "cleaned_html": "",
# "success": True,
# "media": {"images": [], "videos": [], "audios": []},
# "links": {"internal": [], "external": []},
# "metadata": {},
# "message": f"No elements found for CSS selector: {css_selector}",
# }
# # raise InvalidCSSSelectorError(f"Invalid CSS selector, No elements found for CSS selector: {css_selector}")
# body = soup.new_tag("div")
# for el in selected_elements:
# body.append(el)
content_element = None content_element = None
if target_elements: if target_elements:
try: try:
@@ -922,12 +912,12 @@ class WebScrapingStrategy(ContentScrapingStrategy):
for_content_targeted_element.extend(body.select(target_element)) for_content_targeted_element.extend(body.select(target_element))
content_element = soup.new_tag("div") content_element = soup.new_tag("div")
for el in for_content_targeted_element: for el in for_content_targeted_element:
content_element.append(el) content_element.append(copy.deepcopy(el))
except Exception as e: except Exception as e:
self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE")
return None return None
else: else:
content_element = body content_element = body
kwargs["exclude_social_media_domains"] = set( kwargs["exclude_social_media_domains"] = set(
kwargs.get("exclude_social_media_domains", []) + SOCIAL_MEDIA_DOMAINS kwargs.get("exclude_social_media_domains", []) + SOCIAL_MEDIA_DOMAINS
@@ -1308,6 +1298,9 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
"source", "source",
"track", "track",
"wbr", "wbr",
"tr",
"td",
"th",
} }
for el in reversed(list(root.iterdescendants())): for el in reversed(list(root.iterdescendants())):
@@ -1540,26 +1533,6 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
self._log("error", f"Error extracting metadata: {str(e)}", "SCRAPE") self._log("error", f"Error extracting metadata: {str(e)}", "SCRAPE")
meta = {} meta = {}
# Handle CSS selector targeting
# if css_selector:
# try:
# selected_elements = body.cssselect(css_selector)
# if not selected_elements:
# return {
# "markdown": "",
# "cleaned_html": "",
# "success": True,
# "media": {"images": [], "videos": [], "audios": []},
# "links": {"internal": [], "external": []},
# "metadata": meta,
# "message": f"No elements found for CSS selector: {css_selector}",
# }
# body = lhtml.Element("div")
# body.extend(selected_elements)
# except Exception as e:
# self._log("error", f"Error with CSS selector: {str(e)}", "SCRAPE")
# return None
content_element = None content_element = None
if target_elements: if target_elements:
try: try:
@@ -1567,7 +1540,7 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
for target_element in target_elements: for target_element in target_elements:
for_content_targeted_element.extend(body.cssselect(target_element)) for_content_targeted_element.extend(body.cssselect(target_element))
content_element = lhtml.Element("div") content_element = lhtml.Element("div")
content_element.extend(for_content_targeted_element) content_element.extend(copy.deepcopy(for_content_targeted_element))
except Exception as e: except Exception as e:
self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE")
return None return None
@@ -1636,7 +1609,7 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
# Remove empty elements # Remove empty elements
self.remove_empty_elements_fast(body, 1) self.remove_empty_elements_fast(body, 1)
# Remvoe unneeded attributes # Remove unneeded attributes
self.remove_unwanted_attributes_fast( self.remove_unwanted_attributes_fast(
body, keep_data_attributes=kwargs.get("keep_data_attributes", False) body, keep_data_attributes=kwargs.get("keep_data_attributes", False)
) )

View File

@@ -11,6 +11,7 @@ from .scorers import URLScorer
from . import DeepCrawlStrategy from . import DeepCrawlStrategy
from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult, RunManyReturn from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult, RunManyReturn
from ..utils import normalize_url_for_deep_crawl
from math import inf as infinity from math import inf as infinity
@@ -106,13 +107,14 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
valid_links = [] valid_links = []
for link in links: for link in links:
url = link.get("href") url = link.get("href")
if url in visited: base_url = normalize_url_for_deep_crawl(url, source_url)
if base_url in visited:
continue continue
if not await self.can_process_url(url, new_depth): if not await self.can_process_url(url, new_depth):
self.stats.urls_skipped += 1 self.stats.urls_skipped += 1
continue continue
valid_links.append(url) valid_links.append(base_url)
# If we have more valid links than capacity, limit them # If we have more valid links than capacity, limit them
if len(valid_links) > remaining_capacity: if len(valid_links) > remaining_capacity:

View File

@@ -117,7 +117,8 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
self.logger.debug(f"URL {url} skipped: score {score} below threshold {self.score_threshold}") self.logger.debug(f"URL {url} skipped: score {score} below threshold {self.score_threshold}")
self.stats.urls_skipped += 1 self.stats.urls_skipped += 1
continue continue
visited.add(base_url)
valid_links.append((base_url, score)) valid_links.append((base_url, score))
# If we have more valid links than capacity, sort by score and take the top ones # If we have more valid links than capacity, sort by score and take the top ones
@@ -158,7 +159,6 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
while current_level and not self._cancel_event.is_set(): while current_level and not self._cancel_event.is_set():
next_level: List[Tuple[str, Optional[str]]] = [] next_level: List[Tuple[str, Optional[str]]] = []
urls = [url for url, _ in current_level] urls = [url for url, _ in current_level]
visited.update(urls)
# Clone the config to disable deep crawling recursion and enforce batch mode. # Clone the config to disable deep crawling recursion and enforce batch mode.
batch_config = config.clone(deep_crawl_strategy=None, stream=False) batch_config = config.clone(deep_crawl_strategy=None, stream=False)

View File

@@ -115,5 +115,6 @@ async () => {
document.body.style.overflow = "auto"; document.body.style.overflow = "auto";
// Wait a bit for any animations to complete // Wait a bit for any animations to complete
await new Promise((resolve) => setTimeout(resolve, 100)); document.body.scrollIntoView(false);
await new Promise((resolve) => setTimeout(resolve, 50));
}; };

View File

@@ -2003,6 +2003,10 @@ def normalize_url(href, base_url):
if not parsed_base.scheme or not parsed_base.netloc: if not parsed_base.scheme or not parsed_base.netloc:
raise ValueError(f"Invalid base URL format: {base_url}") raise ValueError(f"Invalid base URL format: {base_url}")
# Ensure base_url ends with a trailing slash if it's a directory path
if not base_url.endswith('/'):
base_url = base_url + '/'
# Use urljoin to handle all cases # Use urljoin to handle all cases
normalized = urljoin(base_url, href.strip()) normalized = urljoin(base_url, href.strip())
return normalized return normalized
@@ -2047,7 +2051,7 @@ def normalize_url_for_deep_crawl(href, base_url):
normalized = urlunparse(( normalized = urlunparse((
parsed.scheme, parsed.scheme,
netloc, netloc,
parsed.path.rstrip('/') or '/', # Normalize trailing slash parsed.path.rstrip('/'), # Normalize trailing slash
parsed.params, parsed.params,
query, query,
fragment fragment
@@ -2075,7 +2079,7 @@ def efficient_normalize_url_for_deep_crawl(href, base_url):
normalized = urlunparse(( normalized = urlunparse((
parsed.scheme, parsed.scheme,
parsed.netloc.lower(), parsed.netloc.lower(),
parsed.path, parsed.path.rstrip('/'),
parsed.params, parsed.params,
parsed.query, parsed.query,
'' # Remove fragment '' # Remove fragment

View File

@@ -60,6 +60,8 @@ async def handle_llm_qa(
) -> str: ) -> str:
"""Process QA using LLM with crawled content as context.""" """Process QA using LLM with crawled content as context."""
try: try:
if not url.startswith(('http://', 'https://')):
url = 'https://' + url
# Extract base URL by finding last '?q=' occurrence # Extract base URL by finding last '?q=' occurrence
last_q_index = url.rfind('?q=') last_q_index = url.rfind('?q=')
if last_q_index != -1: if last_q_index != -1:
@@ -73,7 +75,7 @@ async def handle_llm_qa(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=result.error_message detail=result.error_message
) )
content = result.markdown.fit_markdown content = result.markdown.fit_markdown or result.markdown.raw_markdown
# Create prompt and get LLM response # Create prompt and get LLM response
prompt = f"""Use the following content as context to answer the question. prompt = f"""Use the following content as context to answer the question.
@@ -397,6 +399,7 @@ async def handle_crawl_request(
peak_mem_mb = start_mem_mb peak_mem_mb = start_mem_mb
try: try:
urls = [('https://' + url) if not url.startswith(('http://', 'https://')) else url for url in urls]
browser_config = BrowserConfig.load(browser_config) browser_config = BrowserConfig.load(browser_config)
crawler_config = CrawlerRunConfig.load(crawler_config) crawler_config = CrawlerRunConfig.load(crawler_config)

View File

@@ -432,7 +432,7 @@ async def execute_js(
async def llm_endpoint( async def llm_endpoint(
request: Request, request: Request,
url: str = Path(...), url: str = Path(...),
q: Optional[str] = Query(None), q: str = Query(...),
_td: Dict = Depends(token_dep), _td: Dict = Depends(token_dep),
): ):
if not q: if not q:

View File

@@ -12,9 +12,10 @@ Weve introduced a new feature that effortlessly handles even the biggest page
**Simple Example:** **Simple Example:**
```python ```python
import os, sys import os
import sys
import asyncio import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
# Adjust paths as needed # Adjust paths as needed
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@@ -26,9 +27,11 @@ async def main():
# Request both PDF and screenshot # Request both PDF and screenshot
result = await crawler.arun( result = await crawler.arun(
url='https://en.wikipedia.org/wiki/List_of_common_misconceptions', url='https://en.wikipedia.org/wiki/List_of_common_misconceptions',
cache_mode=CacheMode.BYPASS, config=CrawlerRunConfig(
pdf=True, cache_mode=CacheMode.BYPASS,
screenshot=True pdf=True,
screenshot=True
)
) )
if result.success: if result.success:
@@ -40,9 +43,8 @@ async def main():
# Save PDF # Save PDF
if result.pdf: if result.pdf:
pdf_bytes = b64decode(result.pdf)
with open(os.path.join(__location__, "page.pdf"), "wb") as f: with open(os.path.join(__location__, "page.pdf"), "wb") as f:
f.write(pdf_bytes) f.write(result.pdf)
if __name__ == "__main__": if __name__ == "__main__":
asyncio.run(main()) asyncio.run(main())

View File

@@ -232,6 +232,7 @@ async def main():
if __name__ == "__main__": if __name__ == "__main__":
asyncio.run(main()) asyncio.run(main())
```
## 2.4 Compliance & Ethics ## 2.4 Compliance & Ethics

View File

@@ -36,8 +36,6 @@ class BrowserConfig:
### Key Fields to Note ### Key Fields to Note
1. **`browser_type`** 1. **`browser_type`**
- Options: `"chromium"`, `"firefox"`, or `"webkit"`. - Options: `"chromium"`, `"firefox"`, or `"webkit"`.
- Defaults to `"chromium"`. - Defaults to `"chromium"`.
@@ -215,6 +213,7 @@ class CrawlerRunConfig:
- The display mode for progress information (`DETAILED`, `BRIEF`, etc.). - The display mode for progress information (`DETAILED`, `BRIEF`, etc.).
- Affects how much information is printed during the crawl. - Affects how much information is printed during the crawl.
### Helper Methods ### Helper Methods
The `clone()` method is particularly useful for creating variations of your crawler configuration: The `clone()` method is particularly useful for creating variations of your crawler configuration:
@@ -248,9 +247,6 @@ The `clone()` method:
--- ---
## 3. LLMConfig Essentials ## 3. LLMConfig Essentials
### Key fields to note ### Key fields to note

View File

@@ -2,7 +2,7 @@
In some cases, you need to extract **complex or unstructured** information from a webpage that a simple CSS/XPath schema cannot easily parse. Or you want **AI**-driven insights, classification, or summarization. For these scenarios, Crawl4AI provides an **LLM-based extraction strategy** that: In some cases, you need to extract **complex or unstructured** information from a webpage that a simple CSS/XPath schema cannot easily parse. Or you want **AI**-driven insights, classification, or summarization. For these scenarios, Crawl4AI provides an **LLM-based extraction strategy** that:
1. Works with **any** large language model supported by [LightLLM](https://github.com/LightLLM) (Ollama, OpenAI, Claude, and more). 1. Works with **any** large language model supported by [LiteLLM](https://github.com/BerriAI/litellm) (Ollama, OpenAI, Claude, and more).
2. Automatically splits content into chunks (if desired) to handle token limits, then combines results. 2. Automatically splits content into chunks (if desired) to handle token limits, then combines results.
3. Lets you define a **schema** (like a Pydantic model) or a simpler “block” extraction approach. 3. Lets you define a **schema** (like a Pydantic model) or a simpler “block” extraction approach.
@@ -18,13 +18,19 @@ In some cases, you need to extract **complex or unstructured** information from
--- ---
## 2. Provider-Agnostic via LightLLM ## 2. Provider-Agnostic via LiteLLM
Crawl4AI uses a “provider string” (e.g., `"openai/gpt-4o"`, `"ollama/llama2.0"`, `"aws/titan"`) to identify your LLM. **Any** model that LightLLM supports is fair game. You just provide: You can use LlmConfig, to quickly configure multiple variations of LLMs and experiment with them to find the optimal one for your use case. You can read more about LlmConfig [here](/api/parameters).
```python
llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
```
Crawl4AI uses a “provider string” (e.g., `"openai/gpt-4o"`, `"ollama/llama2.0"`, `"aws/titan"`) to identify your LLM. **Any** model that LiteLLM supports is fair game. You just provide:
- **`provider`**: The `<provider>/<model_name>` identifier (e.g., `"openai/gpt-4"`, `"ollama/llama2"`, `"huggingface/google-flan"`, etc.). - **`provider`**: The `<provider>/<model_name>` identifier (e.g., `"openai/gpt-4"`, `"ollama/llama2"`, `"huggingface/google-flan"`, etc.).
- **`api_token`**: If needed (for OpenAI, HuggingFace, etc.); local models or Ollama might not require it. - **`api_token`**: If needed (for OpenAI, HuggingFace, etc.); local models or Ollama might not require it.
- **`api_base`** (optional): If your provider has a custom endpoint. - **`base_url`** (optional): If your provider has a custom endpoint.
This means you **arent locked** into a single LLM vendor. Switch or experiment easily. This means you **arent locked** into a single LLM vendor. Switch or experiment easily.
@@ -52,20 +58,19 @@ For structured data, `"schema"` is recommended. You provide `schema=YourPydantic
Below is an overview of important LLM extraction parameters. All are typically set inside `LLMExtractionStrategy(...)`. You then put that strategy in your `CrawlerRunConfig(..., extraction_strategy=...)`.

1. **`llmConfig`** (LlmConfig): e.g., `"openai/gpt-4"`, `"ollama/llama2"`.
2. **`schema`** (dict): A JSON schema describing the fields you want. Usually generated by `YourModel.model_json_schema()`.
3. **`extraction_type`** (str): `"schema"` or `"block"`.
4. **`instruction`** (str): Prompt text telling the LLM what you want extracted. E.g., "Extract these fields as a JSON array."
5. **`chunk_token_threshold`** (int): Maximum tokens per chunk. If your content is huge, you can break it up for the LLM.
6. **`overlap_rate`** (float): Overlap ratio between adjacent chunks. E.g., `0.1` means 10% of each chunk is repeated to preserve context continuity.
7. **`apply_chunking`** (bool): Set `True` to chunk automatically. If you want a single pass, set `False`.
8. **`input_format`** (str): Determines **which** crawler result is passed to the LLM. Options include:
   - `"markdown"`: The raw markdown (default).
   - `"fit_markdown"`: The filtered "fit" markdown if you used a content filter.
   - `"html"`: The cleaned or raw HTML.
9. **`extra_args`** (dict): Additional LLM parameters like `temperature`, `max_tokens`, `top_p`, etc.
10. **`show_usage()`**: A method you can call to print out usage info (token usage per chunk, total cost if known).
**Example**:
@@ -233,8 +238,7 @@ class KnowledgeGraph(BaseModel):
async def main():
    # LLM extraction strategy
    llm_strat = LLMExtractionStrategy(
        llmConfig=LlmConfig(provider="openai/gpt-4", api_token=os.getenv('OPENAI_API_KEY')),
        schema=KnowledgeGraph.schema_json(),
        extraction_type="schema",
        instruction="Extract entities and relationships from the content. Return valid JSON.",
@@ -286,7 +290,7 @@ if __name__ == "__main__":
## 11. Conclusion

**LLM-based extraction** in Crawl4AI is **provider-agnostic**, letting you choose from hundreds of models via LiteLLM. It's perfect for **semantically complex** tasks or generating advanced structures like knowledge graphs. However, it's **slower** and potentially costlier than schema-based approaches. Keep these tips in mind:

- Put your LLM strategy **in `CrawlerRunConfig`**.
- Use **`input_format`** to pick which form (markdown, HTML, fit_markdown) the LLM sees.
@@ -317,4 +321,4 @@ If your sites data is consistent or repetitive, consider [`JsonCssExtractionS
---

That's it for **Extracting JSON (LLM)** — now you can harness AI to parse, classify, or reorganize data on the web. Happy crawling!

View File

@@ -40,8 +40,9 @@ dependencies = [
    "fake-useragent>=2.0.3",
    "click>=8.1.7",
    "pyperclip>=1.8.2",
    "chardet>=5.2.0",
    "aiohttp>=3.11.11",
    "brotli>=1.1.0",
    "humanize>=4.10.0",
]
classifiers = [

View File

@@ -21,4 +21,5 @@ psutil>=6.1.1
nltk>=3.9.1
rich>=13.9.4
cssselect>=1.2.0
chardet>=5.2.0
brotli>=1.1.0