Merge PR #899 into next, resolve conflicts in server.py and docs/browser-crawler-config.md
This commit is contained in:
@@ -24,7 +24,7 @@ from .browser_manager import BrowserManager
|
||||
|
||||
import aiofiles
|
||||
import aiohttp
|
||||
import cchardet
|
||||
import chardet
|
||||
from aiohttp.client import ClientTimeout
|
||||
from urllib.parse import urlparse
|
||||
from types import MappingProxyType
|
||||
@@ -130,6 +130,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
Close the browser and clean up resources.
|
||||
"""
|
||||
await self.browser_manager.close()
|
||||
# Explicitly reset the static Playwright instance
|
||||
BrowserManager._playwright_instance = None
|
||||
|
||||
async def kill_session(self, session_id: str):
|
||||
"""
|
||||
@@ -679,14 +681,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
if console_log_type == "error":
|
||||
self.logger.error(
|
||||
message=f"Console error: {msg}", # Use f-string for variable interpolation
|
||||
tag="CONSOLE",
|
||||
params={"msg": msg.text},
|
||||
tag="CONSOLE"
|
||||
)
|
||||
elif console_log_type == "debug":
|
||||
self.logger.debug(
|
||||
message=f"Console: {msg}", # Use f-string for variable interpolation
|
||||
tag="CONSOLE",
|
||||
params={"msg": msg.text},
|
||||
tag="CONSOLE"
|
||||
)
|
||||
|
||||
page.on("console", log_consol)
|
||||
@@ -967,7 +967,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
|
||||
for selector in selectors:
|
||||
try:
|
||||
content = await page.evaluate(f"document.querySelector('{selector}')?.outerHTML || ''")
|
||||
content = await page.evaluate(
|
||||
f"""Array.from(document.querySelectorAll("{selector}"))
|
||||
.map(el => el.outerHTML)
|
||||
.join('')"""
|
||||
)
|
||||
html_parts.append(content)
|
||||
except Error as e:
|
||||
print(f"Warning: Could not get content for selector '{selector}': {str(e)}")
|
||||
@@ -1975,7 +1979,7 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
await self.start()
|
||||
yield self._session
|
||||
finally:
|
||||
await self.close()
|
||||
pass
|
||||
|
||||
def set_hook(self, hook_type: str, hook_func: Callable) -> None:
|
||||
if hook_type in self.hooks:
|
||||
@@ -2091,7 +2095,7 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
|
||||
encoding = response.charset
|
||||
if not encoding:
|
||||
encoding = cchardet.detect(content.tobytes())['encoding'] or 'utf-8'
|
||||
encoding = chardet.detect(content.tobytes())['encoding'] or 'utf-8'
|
||||
|
||||
result = AsyncCrawlResponse(
|
||||
html=content.tobytes().decode(encoding, errors='replace'),
|
||||
|
||||
@@ -4,6 +4,7 @@ from typing import Optional, Dict, Any
|
||||
from colorama import Fore, Style, init
|
||||
import os
|
||||
from datetime import datetime
|
||||
from urllib.parse import unquote
|
||||
|
||||
|
||||
class LogLevel(Enum):
|
||||
@@ -44,11 +45,11 @@ class AsyncLoggerBase(ABC):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50):
|
||||
def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 100):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50):
|
||||
def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 100):
|
||||
pass
|
||||
|
||||
class AsyncLogger(AsyncLoggerBase):
|
||||
@@ -130,6 +131,14 @@ class AsyncLogger(AsyncLoggerBase):
|
||||
def _get_icon(self, tag: str) -> str:
|
||||
"""Get the icon for a tag, defaulting to info icon if not found."""
|
||||
return self.icons.get(tag, self.icons["INFO"])
|
||||
|
||||
def _shorten(self, text, length, placeholder="..."):
|
||||
"""Truncate text in the middle if longer than length, or pad if shorter."""
|
||||
if len(text) <= length:
|
||||
return text.ljust(length) # Pad with spaces to reach desired length
|
||||
half = (length - len(placeholder)) // 2
|
||||
shortened = text[:half] + placeholder + text[-half:]
|
||||
return shortened.ljust(length) # Also pad shortened text to consistent length
|
||||
|
||||
def _write_to_file(self, message: str):
|
||||
"""Write a message to the log file if configured."""
|
||||
@@ -259,7 +268,7 @@ class AsyncLogger(AsyncLoggerBase):
|
||||
success: bool,
|
||||
timing: float,
|
||||
tag: str = "FETCH",
|
||||
url_length: int = 50,
|
||||
url_length: int = 100,
|
||||
):
|
||||
"""
|
||||
Convenience method for logging URL fetch status.
|
||||
@@ -271,14 +280,15 @@ class AsyncLogger(AsyncLoggerBase):
|
||||
tag: Tag for the message
|
||||
url_length: Maximum length for URL in log
|
||||
"""
|
||||
decoded_url = unquote(url)
|
||||
readable_url = self._shorten(decoded_url, url_length)
|
||||
self._log(
|
||||
level=LogLevel.SUCCESS if success else LogLevel.ERROR,
|
||||
message="{url:.{url_length}}... | Status: {status} | Time: {timing:.2f}s",
|
||||
message="{url} | {status} | ⏱: {timing:.2f}s",
|
||||
tag=tag,
|
||||
params={
|
||||
"url": url,
|
||||
"url_length": url_length,
|
||||
"status": success,
|
||||
"url": readable_url,
|
||||
"status": "✓" if success else "✗",
|
||||
"timing": timing,
|
||||
},
|
||||
colors={
|
||||
@@ -299,11 +309,13 @@ class AsyncLogger(AsyncLoggerBase):
|
||||
tag: Tag for the message
|
||||
url_length: Maximum length for URL in log
|
||||
"""
|
||||
decoded_url = unquote(url)
|
||||
readable_url = self._shorten(decoded_url, url_length)
|
||||
self._log(
|
||||
level=LogLevel.ERROR,
|
||||
message="{url:.{url_length}}... | Error: {error}",
|
||||
message="{url} | Error: {error}",
|
||||
tag=tag,
|
||||
params={"url": url, "url_length": url_length, "error": error},
|
||||
params={"url": readable_url, "error": error},
|
||||
)
|
||||
|
||||
class AsyncFileLogger(AsyncLoggerBase):
|
||||
@@ -347,13 +359,13 @@ class AsyncFileLogger(AsyncLoggerBase):
|
||||
"""Log an error message to file."""
|
||||
self._write_to_file("ERROR", message, tag)
|
||||
|
||||
def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50):
|
||||
def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 100):
|
||||
"""Log URL fetch status to file."""
|
||||
status = "SUCCESS" if success else "FAILED"
|
||||
message = f"{url[:url_length]}... | Status: {status} | Time: {timing:.2f}s"
|
||||
self._write_to_file("URL_STATUS", message, tag)
|
||||
|
||||
def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50):
|
||||
def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 100):
|
||||
"""Log error status to file."""
|
||||
message = f"{url[:url_length]}... | Error: {error}"
|
||||
self._write_to_file("ERROR", message, tag)
|
||||
|
||||
@@ -358,10 +358,11 @@ class AsyncWebCrawler:
|
||||
html=html,
|
||||
extracted_content=extracted_content,
|
||||
config=config, # Pass the config object instead of individual parameters
|
||||
screenshot=screenshot_data,
|
||||
screenshot_data=screenshot_data,
|
||||
pdf_data=pdf_data,
|
||||
verbose=config.verbose,
|
||||
is_raw_html=True if url.startswith("raw:") else False,
|
||||
redirected_url=async_response.redirected_url,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@@ -380,18 +381,11 @@ class AsyncWebCrawler:
|
||||
crawl_result.session_id = getattr(
|
||||
config, "session_id", None)
|
||||
|
||||
self.logger.success(
|
||||
message="{url:.50}... | Status: {status} | Total: {timing}",
|
||||
self.logger.url_status(
|
||||
url=cache_context.display_url,
|
||||
success=crawl_result.success,
|
||||
timing=time.perf_counter() - start_time,
|
||||
tag="COMPLETE",
|
||||
params={
|
||||
"url": cache_context.display_url,
|
||||
"status": crawl_result.success,
|
||||
"timing": f"{time.perf_counter() - start_time:.2f}s",
|
||||
},
|
||||
colors={
|
||||
"status": Fore.GREEN if crawl_result.success else Fore.RED,
|
||||
"timing": Fore.YELLOW,
|
||||
},
|
||||
)
|
||||
|
||||
# Update cache if appropriate
|
||||
@@ -401,17 +395,12 @@ class AsyncWebCrawler:
|
||||
return CrawlResultContainer(crawl_result)
|
||||
|
||||
else:
|
||||
self.logger.success(
|
||||
message="{url:.50}... | Status: {status} | Total: {timing}",
|
||||
tag="COMPLETE",
|
||||
params={
|
||||
"url": cache_context.display_url,
|
||||
"status": True,
|
||||
"timing": f"{time.perf_counter() - start_time:.2f}s",
|
||||
},
|
||||
colors={"status": Fore.GREEN, "timing": Fore.YELLOW},
|
||||
self.logger.url_status(
|
||||
url=cache_context.display_url,
|
||||
success=True,
|
||||
timing=time.perf_counter() - start_time,
|
||||
tag="COMPLETE"
|
||||
)
|
||||
|
||||
cached_result.success = bool(html)
|
||||
cached_result.session_id = getattr(
|
||||
config, "session_id", None)
|
||||
@@ -446,7 +435,7 @@ class AsyncWebCrawler:
|
||||
html: str,
|
||||
extracted_content: str,
|
||||
config: CrawlerRunConfig,
|
||||
screenshot: str,
|
||||
screenshot_data: str,
|
||||
pdf_data: str,
|
||||
verbose: bool,
|
||||
**kwargs,
|
||||
@@ -459,7 +448,7 @@ class AsyncWebCrawler:
|
||||
html: Raw HTML content
|
||||
extracted_content: Previously extracted content (if any)
|
||||
config: Configuration object controlling processing behavior
|
||||
screenshot: Screenshot data (if any)
|
||||
screenshot_data: Screenshot data (if any)
|
||||
pdf_data: PDF data (if any)
|
||||
verbose: Whether to enable verbose logging
|
||||
**kwargs: Additional parameters for backwards compatibility
|
||||
@@ -564,20 +553,23 @@ class AsyncWebCrawler:
|
||||
markdown_result: MarkdownGenerationResult = (
|
||||
markdown_generator.generate_markdown(
|
||||
input_html=markdown_input_html,
|
||||
base_url=url,
|
||||
base_url=params.get("redirected_url", url)
|
||||
# html2text_options=kwargs.get('html2text', {})
|
||||
)
|
||||
)
|
||||
|
||||
# Log processing completion
|
||||
self.logger.info(
|
||||
message="{url:.50}... | Time: {timing}s",
|
||||
tag="SCRAPE",
|
||||
params={
|
||||
"url": _url,
|
||||
"timing": int((time.perf_counter() - t1) * 1000) / 1000,
|
||||
},
|
||||
self.logger.url_status(
|
||||
url=_url,
|
||||
success=True,
|
||||
timing=int((time.perf_counter() - t1) * 1000) / 1000,
|
||||
tag="SCRAPE"
|
||||
)
|
||||
# self.logger.info(
|
||||
# message="{url:.50}... | Time: {timing}s",
|
||||
# tag="SCRAPE",
|
||||
# params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000) / 1000},
|
||||
# )
|
||||
|
||||
################################
|
||||
# Structured Content Extraction #
|
||||
@@ -624,10 +616,6 @@ class AsyncWebCrawler:
|
||||
params={"url": _url, "timing": time.perf_counter() - t1},
|
||||
)
|
||||
|
||||
# Handle screenshot and PDF data
|
||||
screenshot_data = None if not screenshot else screenshot
|
||||
pdf_data = None if not pdf_data else pdf_data
|
||||
|
||||
# Apply HTML formatting if requested
|
||||
if config.prettiify:
|
||||
cleaned_html = fast_format_html(cleaned_html)
|
||||
|
||||
@@ -28,6 +28,7 @@ from lxml import etree
|
||||
from lxml import html as lhtml
|
||||
from typing import List
|
||||
from .models import ScrapingResult, MediaItem, Link, Media, Links
|
||||
import copy
|
||||
|
||||
# Pre-compile regular expressions for Open Graph and Twitter metadata
|
||||
OG_REGEX = re.compile(r"^og:")
|
||||
@@ -48,7 +49,7 @@ def parse_srcset(s: str) -> List[Dict]:
|
||||
if len(parts) >= 1:
|
||||
url = parts[0]
|
||||
width = (
|
||||
parts[1].rstrip("w")
|
||||
parts[1].rstrip("w").split('.')[0]
|
||||
if len(parts) > 1 and parts[1].endswith("w")
|
||||
else None
|
||||
)
|
||||
@@ -128,7 +129,8 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
Returns:
|
||||
ScrapingResult: A structured result containing the scraped content.
|
||||
"""
|
||||
raw_result = self._scrap(url, html, is_async=False, **kwargs)
|
||||
actual_url = kwargs.get("redirected_url", url)
|
||||
raw_result = self._scrap(actual_url, html, is_async=False, **kwargs)
|
||||
if raw_result is None:
|
||||
return ScrapingResult(
|
||||
cleaned_html="",
|
||||
@@ -619,6 +621,9 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
return False
|
||||
|
||||
keep_element = False
|
||||
# Special case for table elements - always preserve structure
|
||||
if element.name in ["tr", "td", "th"]:
|
||||
keep_element = True
|
||||
|
||||
exclude_domains = kwargs.get("exclude_domains", [])
|
||||
# exclude_social_media_domains = kwargs.get('exclude_social_media_domains', set(SOCIAL_MEDIA_DOMAINS))
|
||||
@@ -859,6 +864,8 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
parser_type = kwargs.get("parser", "lxml")
|
||||
soup = BeautifulSoup(html, parser_type)
|
||||
body = soup.body
|
||||
if body is None:
|
||||
raise Exception("'<body>' tag is not found in fetched html. Consider adding wait_for=\"css:body\" to wait for body tag to be loaded into DOM.")
|
||||
base_domain = get_base_domain(url)
|
||||
|
||||
# Early removal of all images if exclude_all_images is set
|
||||
@@ -897,23 +904,6 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
for element in body.select(excluded_selector):
|
||||
element.extract()
|
||||
|
||||
# if False and css_selector:
|
||||
# selected_elements = body.select(css_selector)
|
||||
# if not selected_elements:
|
||||
# return {
|
||||
# "markdown": "",
|
||||
# "cleaned_html": "",
|
||||
# "success": True,
|
||||
# "media": {"images": [], "videos": [], "audios": []},
|
||||
# "links": {"internal": [], "external": []},
|
||||
# "metadata": {},
|
||||
# "message": f"No elements found for CSS selector: {css_selector}",
|
||||
# }
|
||||
# # raise InvalidCSSSelectorError(f"Invalid CSS selector, No elements found for CSS selector: {css_selector}")
|
||||
# body = soup.new_tag("div")
|
||||
# for el in selected_elements:
|
||||
# body.append(el)
|
||||
|
||||
content_element = None
|
||||
if target_elements:
|
||||
try:
|
||||
@@ -922,12 +912,12 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
for_content_targeted_element.extend(body.select(target_element))
|
||||
content_element = soup.new_tag("div")
|
||||
for el in for_content_targeted_element:
|
||||
content_element.append(el)
|
||||
content_element.append(copy.deepcopy(el))
|
||||
except Exception as e:
|
||||
self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE")
|
||||
return None
|
||||
else:
|
||||
content_element = body
|
||||
content_element = body
|
||||
|
||||
kwargs["exclude_social_media_domains"] = set(
|
||||
kwargs.get("exclude_social_media_domains", []) + SOCIAL_MEDIA_DOMAINS
|
||||
@@ -1308,6 +1298,9 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
||||
"source",
|
||||
"track",
|
||||
"wbr",
|
||||
"tr",
|
||||
"td",
|
||||
"th",
|
||||
}
|
||||
|
||||
for el in reversed(list(root.iterdescendants())):
|
||||
@@ -1540,26 +1533,6 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
||||
self._log("error", f"Error extracting metadata: {str(e)}", "SCRAPE")
|
||||
meta = {}
|
||||
|
||||
# Handle CSS selector targeting
|
||||
# if css_selector:
|
||||
# try:
|
||||
# selected_elements = body.cssselect(css_selector)
|
||||
# if not selected_elements:
|
||||
# return {
|
||||
# "markdown": "",
|
||||
# "cleaned_html": "",
|
||||
# "success": True,
|
||||
# "media": {"images": [], "videos": [], "audios": []},
|
||||
# "links": {"internal": [], "external": []},
|
||||
# "metadata": meta,
|
||||
# "message": f"No elements found for CSS selector: {css_selector}",
|
||||
# }
|
||||
# body = lhtml.Element("div")
|
||||
# body.extend(selected_elements)
|
||||
# except Exception as e:
|
||||
# self._log("error", f"Error with CSS selector: {str(e)}", "SCRAPE")
|
||||
# return None
|
||||
|
||||
content_element = None
|
||||
if target_elements:
|
||||
try:
|
||||
@@ -1567,7 +1540,7 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
||||
for target_element in target_elements:
|
||||
for_content_targeted_element.extend(body.cssselect(target_element))
|
||||
content_element = lhtml.Element("div")
|
||||
content_element.extend(for_content_targeted_element)
|
||||
content_element.extend(copy.deepcopy(for_content_targeted_element))
|
||||
except Exception as e:
|
||||
self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE")
|
||||
return None
|
||||
@@ -1636,7 +1609,7 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
||||
# Remove empty elements
|
||||
self.remove_empty_elements_fast(body, 1)
|
||||
|
||||
# Remvoe unneeded attributes
|
||||
# Remove unneeded attributes
|
||||
self.remove_unwanted_attributes_fast(
|
||||
body, keep_data_attributes=kwargs.get("keep_data_attributes", False)
|
||||
)
|
||||
|
||||
@@ -11,6 +11,7 @@ from .scorers import URLScorer
|
||||
from . import DeepCrawlStrategy
|
||||
|
||||
from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult, RunManyReturn
|
||||
from ..utils import normalize_url_for_deep_crawl
|
||||
|
||||
from math import inf as infinity
|
||||
|
||||
@@ -106,13 +107,14 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
|
||||
valid_links = []
|
||||
for link in links:
|
||||
url = link.get("href")
|
||||
if url in visited:
|
||||
base_url = normalize_url_for_deep_crawl(url, source_url)
|
||||
if base_url in visited:
|
||||
continue
|
||||
if not await self.can_process_url(url, new_depth):
|
||||
self.stats.urls_skipped += 1
|
||||
continue
|
||||
|
||||
valid_links.append(url)
|
||||
valid_links.append(base_url)
|
||||
|
||||
# If we have more valid links than capacity, limit them
|
||||
if len(valid_links) > remaining_capacity:
|
||||
|
||||
@@ -117,7 +117,8 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
||||
self.logger.debug(f"URL {url} skipped: score {score} below threshold {self.score_threshold}")
|
||||
self.stats.urls_skipped += 1
|
||||
continue
|
||||
|
||||
|
||||
visited.add(base_url)
|
||||
valid_links.append((base_url, score))
|
||||
|
||||
# If we have more valid links than capacity, sort by score and take the top ones
|
||||
@@ -158,7 +159,6 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
||||
while current_level and not self._cancel_event.is_set():
|
||||
next_level: List[Tuple[str, Optional[str]]] = []
|
||||
urls = [url for url, _ in current_level]
|
||||
visited.update(urls)
|
||||
|
||||
# Clone the config to disable deep crawling recursion and enforce batch mode.
|
||||
batch_config = config.clone(deep_crawl_strategy=None, stream=False)
|
||||
|
||||
@@ -115,5 +115,6 @@ async () => {
|
||||
document.body.style.overflow = "auto";
|
||||
|
||||
// Wait a bit for any animations to complete
|
||||
await new Promise((resolve) => setTimeout(resolve, 100));
|
||||
document.body.scrollIntoView(false);
|
||||
await new Promise((resolve) => setTimeout(resolve, 50));
|
||||
};
|
||||
|
||||
@@ -2003,6 +2003,10 @@ def normalize_url(href, base_url):
|
||||
if not parsed_base.scheme or not parsed_base.netloc:
|
||||
raise ValueError(f"Invalid base URL format: {base_url}")
|
||||
|
||||
# Ensure base_url ends with a trailing slash if it's a directory path
|
||||
if not base_url.endswith('/'):
|
||||
base_url = base_url + '/'
|
||||
|
||||
# Use urljoin to handle all cases
|
||||
normalized = urljoin(base_url, href.strip())
|
||||
return normalized
|
||||
@@ -2047,7 +2051,7 @@ def normalize_url_for_deep_crawl(href, base_url):
|
||||
normalized = urlunparse((
|
||||
parsed.scheme,
|
||||
netloc,
|
||||
parsed.path.rstrip('/') or '/', # Normalize trailing slash
|
||||
parsed.path.rstrip('/'), # Normalize trailing slash
|
||||
parsed.params,
|
||||
query,
|
||||
fragment
|
||||
@@ -2075,7 +2079,7 @@ def efficient_normalize_url_for_deep_crawl(href, base_url):
|
||||
normalized = urlunparse((
|
||||
parsed.scheme,
|
||||
parsed.netloc.lower(),
|
||||
parsed.path,
|
||||
parsed.path.rstrip('/'),
|
||||
parsed.params,
|
||||
parsed.query,
|
||||
'' # Remove fragment
|
||||
|
||||
@@ -60,6 +60,8 @@ async def handle_llm_qa(
|
||||
) -> str:
|
||||
"""Process QA using LLM with crawled content as context."""
|
||||
try:
|
||||
if not url.startswith(('http://', 'https://')):
|
||||
url = 'https://' + url
|
||||
# Extract base URL by finding last '?q=' occurrence
|
||||
last_q_index = url.rfind('?q=')
|
||||
if last_q_index != -1:
|
||||
@@ -73,7 +75,7 @@ async def handle_llm_qa(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail=result.error_message
|
||||
)
|
||||
content = result.markdown.fit_markdown
|
||||
content = result.markdown.fit_markdown or result.markdown.raw_markdown
|
||||
|
||||
# Create prompt and get LLM response
|
||||
prompt = f"""Use the following content as context to answer the question.
|
||||
@@ -397,6 +399,7 @@ async def handle_crawl_request(
|
||||
peak_mem_mb = start_mem_mb
|
||||
|
||||
try:
|
||||
urls = [('https://' + url) if not url.startswith(('http://', 'https://')) else url for url in urls]
|
||||
browser_config = BrowserConfig.load(browser_config)
|
||||
crawler_config = CrawlerRunConfig.load(crawler_config)
|
||||
|
||||
|
||||
@@ -432,7 +432,7 @@ async def execute_js(
|
||||
async def llm_endpoint(
|
||||
request: Request,
|
||||
url: str = Path(...),
|
||||
q: Optional[str] = Query(None),
|
||||
q: str = Query(...),
|
||||
_td: Dict = Depends(token_dep),
|
||||
):
|
||||
if not q:
|
||||
|
||||
@@ -12,9 +12,10 @@ We’ve introduced a new feature that effortlessly handles even the biggest page
|
||||
|
||||
**Simple Example:**
|
||||
```python
|
||||
import os, sys
|
||||
import os
|
||||
import sys
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
|
||||
|
||||
# Adjust paths as needed
|
||||
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
@@ -26,9 +27,11 @@ async def main():
|
||||
# Request both PDF and screenshot
|
||||
result = await crawler.arun(
|
||||
url='https://en.wikipedia.org/wiki/List_of_common_misconceptions',
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
pdf=True,
|
||||
screenshot=True
|
||||
config=CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
pdf=True,
|
||||
screenshot=True
|
||||
)
|
||||
)
|
||||
|
||||
if result.success:
|
||||
@@ -40,9 +43,8 @@ async def main():
|
||||
|
||||
# Save PDF
|
||||
if result.pdf:
|
||||
pdf_bytes = b64decode(result.pdf)
|
||||
with open(os.path.join(__location__, "page.pdf"), "wb") as f:
|
||||
f.write(pdf_bytes)
|
||||
f.write(result.pdf)
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
|
||||
@@ -232,6 +232,7 @@ async def main():
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
## 2.4 Compliance & Ethics
|
||||
|
||||
|
||||
@@ -36,8 +36,6 @@ class BrowserConfig:
|
||||
|
||||
### Key Fields to Note
|
||||
|
||||
|
||||
|
||||
1. **`browser_type`**
|
||||
- Options: `"chromium"`, `"firefox"`, or `"webkit"`.
|
||||
- Defaults to `"chromium"`.
|
||||
@@ -215,6 +213,7 @@ class CrawlerRunConfig:
|
||||
- The display mode for progress information (`DETAILED`, `BRIEF`, etc.).
|
||||
- Affects how much information is printed during the crawl.
|
||||
|
||||
|
||||
### Helper Methods
|
||||
|
||||
The `clone()` method is particularly useful for creating variations of your crawler configuration:
|
||||
@@ -248,9 +247,6 @@ The `clone()` method:
|
||||
---
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## 3. LLMConfig Essentials
|
||||
|
||||
### Key fields to note
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
In some cases, you need to extract **complex or unstructured** information from a webpage that a simple CSS/XPath schema cannot easily parse. Or you want **AI**-driven insights, classification, or summarization. For these scenarios, Crawl4AI provides an **LLM-based extraction strategy** that:
|
||||
|
||||
1. Works with **any** large language model supported by [LightLLM](https://github.com/LightLLM) (Ollama, OpenAI, Claude, and more).
|
||||
1. Works with **any** large language model supported by [LiteLLM](https://github.com/BerriAI/litellm) (Ollama, OpenAI, Claude, and more).
|
||||
2. Automatically splits content into chunks (if desired) to handle token limits, then combines results.
|
||||
3. Lets you define a **schema** (like a Pydantic model) or a simpler “block” extraction approach.
|
||||
|
||||
@@ -18,13 +18,19 @@ In some cases, you need to extract **complex or unstructured** information from
|
||||
|
||||
---
|
||||
|
||||
## 2. Provider-Agnostic via LightLLM
|
||||
## 2. Provider-Agnostic via LiteLLM
|
||||
|
||||
Crawl4AI uses a “provider string” (e.g., `"openai/gpt-4o"`, `"ollama/llama2.0"`, `"aws/titan"`) to identify your LLM. **Any** model that LightLLM supports is fair game. You just provide:
|
||||
You can use LlmConfig, to quickly configure multiple variations of LLMs and experiment with them to find the optimal one for your use case. You can read more about LlmConfig [here](/api/parameters).
|
||||
|
||||
```python
|
||||
llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
|
||||
```
|
||||
|
||||
Crawl4AI uses a “provider string” (e.g., `"openai/gpt-4o"`, `"ollama/llama2.0"`, `"aws/titan"`) to identify your LLM. **Any** model that LiteLLM supports is fair game. You just provide:
|
||||
|
||||
- **`provider`**: The `<provider>/<model_name>` identifier (e.g., `"openai/gpt-4"`, `"ollama/llama2"`, `"huggingface/google-flan"`, etc.).
|
||||
- **`api_token`**: If needed (for OpenAI, HuggingFace, etc.); local models or Ollama might not require it.
|
||||
- **`api_base`** (optional): If your provider has a custom endpoint.
|
||||
- **`base_url`** (optional): If your provider has a custom endpoint.
|
||||
|
||||
This means you **aren’t locked** into a single LLM vendor. Switch or experiment easily.
|
||||
|
||||
@@ -52,20 +58,19 @@ For structured data, `"schema"` is recommended. You provide `schema=YourPydantic
|
||||
|
||||
Below is an overview of important LLM extraction parameters. All are typically set inside `LLMExtractionStrategy(...)`. You then put that strategy in your `CrawlerRunConfig(..., extraction_strategy=...)`.
|
||||
|
||||
1. **`provider`** (str): e.g., `"openai/gpt-4"`, `"ollama/llama2"`.
|
||||
2. **`api_token`** (str): The API key or token for that model. May not be needed for local models.
|
||||
3. **`schema`** (dict): A JSON schema describing the fields you want. Usually generated by `YourModel.model_json_schema()`.
|
||||
4. **`extraction_type`** (str): `"schema"` or `"block"`.
|
||||
5. **`instruction`** (str): Prompt text telling the LLM what you want extracted. E.g., “Extract these fields as a JSON array.”
|
||||
6. **`chunk_token_threshold`** (int): Maximum tokens per chunk. If your content is huge, you can break it up for the LLM.
|
||||
7. **`overlap_rate`** (float): Overlap ratio between adjacent chunks. E.g., `0.1` means 10% of each chunk is repeated to preserve context continuity.
|
||||
8. **`apply_chunking`** (bool): Set `True` to chunk automatically. If you want a single pass, set `False`.
|
||||
9. **`input_format`** (str): Determines **which** crawler result is passed to the LLM. Options include:
|
||||
1. **`llmConfig`** (LlmConfig): e.g., `"openai/gpt-4"`, `"ollama/llama2"`.
|
||||
2. **`schema`** (dict): A JSON schema describing the fields you want. Usually generated by `YourModel.model_json_schema()`.
|
||||
3. **`extraction_type`** (str): `"schema"` or `"block"`.
|
||||
4. **`instruction`** (str): Prompt text telling the LLM what you want extracted. E.g., “Extract these fields as a JSON array.”
|
||||
5. **`chunk_token_threshold`** (int): Maximum tokens per chunk. If your content is huge, you can break it up for the LLM.
|
||||
6. **`overlap_rate`** (float): Overlap ratio between adjacent chunks. E.g., `0.1` means 10% of each chunk is repeated to preserve context continuity.
|
||||
7. **`apply_chunking`** (bool): Set `True` to chunk automatically. If you want a single pass, set `False`.
|
||||
8. **`input_format`** (str): Determines **which** crawler result is passed to the LLM. Options include:
|
||||
- `"markdown"`: The raw markdown (default).
|
||||
- `"fit_markdown"`: The filtered “fit” markdown if you used a content filter.
|
||||
- `"html"`: The cleaned or raw HTML.
|
||||
10. **`extra_args`** (dict): Additional LLM parameters like `temperature`, `max_tokens`, `top_p`, etc.
|
||||
11. **`show_usage()`**: A method you can call to print out usage info (token usage per chunk, total cost if known).
|
||||
9. **`extra_args`** (dict): Additional LLM parameters like `temperature`, `max_tokens`, `top_p`, etc.
|
||||
10. **`show_usage()`**: A method you can call to print out usage info (token usage per chunk, total cost if known).
|
||||
|
||||
**Example**:
|
||||
|
||||
@@ -233,8 +238,7 @@ class KnowledgeGraph(BaseModel):
|
||||
async def main():
|
||||
# LLM extraction strategy
|
||||
llm_strat = LLMExtractionStrategy(
|
||||
provider="openai/gpt-4",
|
||||
api_token=os.getenv('OPENAI_API_KEY'),
|
||||
llmConfig = LlmConfig(provider="openai/gpt-4", api_token=os.getenv('OPENAI_API_KEY')),
|
||||
schema=KnowledgeGraph.schema_json(),
|
||||
extraction_type="schema",
|
||||
instruction="Extract entities and relationships from the content. Return valid JSON.",
|
||||
@@ -286,7 +290,7 @@ if __name__ == "__main__":
|
||||
|
||||
## 11. Conclusion
|
||||
|
||||
**LLM-based extraction** in Crawl4AI is **provider-agnostic**, letting you choose from hundreds of models via LightLLM. It’s perfect for **semantically complex** tasks or generating advanced structures like knowledge graphs. However, it’s **slower** and potentially costlier than schema-based approaches. Keep these tips in mind:
|
||||
**LLM-based extraction** in Crawl4AI is **provider-agnostic**, letting you choose from hundreds of models via LiteLLM. It’s perfect for **semantically complex** tasks or generating advanced structures like knowledge graphs. However, it’s **slower** and potentially costlier than schema-based approaches. Keep these tips in mind:
|
||||
|
||||
- Put your LLM strategy **in `CrawlerRunConfig`**.
|
||||
- Use **`input_format`** to pick which form (markdown, HTML, fit_markdown) the LLM sees.
|
||||
@@ -317,4 +321,4 @@ If your site’s data is consistent or repetitive, consider [`JsonCssExtractionS
|
||||
|
||||
---
|
||||
|
||||
That’s it for **Extracting JSON (LLM)**—now you can harness AI to parse, classify, or reorganize data on the web. Happy crawling!
|
||||
That’s it for **Extracting JSON (LLM)**—now you can harness AI to parse, classify, or reorganize data on the web. Happy crawling!
|
||||
|
||||
@@ -40,8 +40,9 @@ dependencies = [
|
||||
"fake-useragent>=2.0.3",
|
||||
"click>=8.1.7",
|
||||
"pyperclip>=1.8.2",
|
||||
"faust-cchardet>=2.1.19",
|
||||
"chardet>=5.2.0",
|
||||
"aiohttp>=3.11.11",
|
||||
"brotli>=1.1.0",
|
||||
"humanize>=4.10.0",
|
||||
]
|
||||
classifiers = [
|
||||
|
||||
@@ -21,4 +21,5 @@ psutil>=6.1.1
|
||||
nltk>=3.9.1
|
||||
rich>=13.9.4
|
||||
cssselect>=1.2.0
|
||||
faust-cchardet>=2.1.19
|
||||
chardet>=5.2.0
|
||||
brotli>=1.1.0
|
||||
Reference in New Issue
Block a user