Merge PR #899 into next, resolve conflicts in server.py and docs/browser-crawler-config.md
This commit is contained in:
@@ -24,7 +24,7 @@ from .browser_manager import BrowserManager
|
|||||||
|
|
||||||
import aiofiles
|
import aiofiles
|
||||||
import aiohttp
|
import aiohttp
|
||||||
import cchardet
|
import chardet
|
||||||
from aiohttp.client import ClientTimeout
|
from aiohttp.client import ClientTimeout
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
from types import MappingProxyType
|
from types import MappingProxyType
|
||||||
@@ -130,6 +130,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
Close the browser and clean up resources.
|
Close the browser and clean up resources.
|
||||||
"""
|
"""
|
||||||
await self.browser_manager.close()
|
await self.browser_manager.close()
|
||||||
|
# Explicitly reset the static Playwright instance
|
||||||
|
BrowserManager._playwright_instance = None
|
||||||
|
|
||||||
async def kill_session(self, session_id: str):
|
async def kill_session(self, session_id: str):
|
||||||
"""
|
"""
|
||||||
@@ -679,14 +681,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
if console_log_type == "error":
|
if console_log_type == "error":
|
||||||
self.logger.error(
|
self.logger.error(
|
||||||
message=f"Console error: {msg}", # Use f-string for variable interpolation
|
message=f"Console error: {msg}", # Use f-string for variable interpolation
|
||||||
tag="CONSOLE",
|
tag="CONSOLE"
|
||||||
params={"msg": msg.text},
|
|
||||||
)
|
)
|
||||||
elif console_log_type == "debug":
|
elif console_log_type == "debug":
|
||||||
self.logger.debug(
|
self.logger.debug(
|
||||||
message=f"Console: {msg}", # Use f-string for variable interpolation
|
message=f"Console: {msg}", # Use f-string for variable interpolation
|
||||||
tag="CONSOLE",
|
tag="CONSOLE"
|
||||||
params={"msg": msg.text},
|
|
||||||
)
|
)
|
||||||
|
|
||||||
page.on("console", log_consol)
|
page.on("console", log_consol)
|
||||||
@@ -967,7 +967,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
|
|
||||||
for selector in selectors:
|
for selector in selectors:
|
||||||
try:
|
try:
|
||||||
content = await page.evaluate(f"document.querySelector('{selector}')?.outerHTML || ''")
|
content = await page.evaluate(
|
||||||
|
f"""Array.from(document.querySelectorAll("{selector}"))
|
||||||
|
.map(el => el.outerHTML)
|
||||||
|
.join('')"""
|
||||||
|
)
|
||||||
html_parts.append(content)
|
html_parts.append(content)
|
||||||
except Error as e:
|
except Error as e:
|
||||||
print(f"Warning: Could not get content for selector '{selector}': {str(e)}")
|
print(f"Warning: Could not get content for selector '{selector}': {str(e)}")
|
||||||
@@ -1975,7 +1979,7 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
await self.start()
|
await self.start()
|
||||||
yield self._session
|
yield self._session
|
||||||
finally:
|
finally:
|
||||||
await self.close()
|
pass
|
||||||
|
|
||||||
def set_hook(self, hook_type: str, hook_func: Callable) -> None:
|
def set_hook(self, hook_type: str, hook_func: Callable) -> None:
|
||||||
if hook_type in self.hooks:
|
if hook_type in self.hooks:
|
||||||
@@ -2091,7 +2095,7 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
|
|
||||||
encoding = response.charset
|
encoding = response.charset
|
||||||
if not encoding:
|
if not encoding:
|
||||||
encoding = cchardet.detect(content.tobytes())['encoding'] or 'utf-8'
|
encoding = chardet.detect(content.tobytes())['encoding'] or 'utf-8'
|
||||||
|
|
||||||
result = AsyncCrawlResponse(
|
result = AsyncCrawlResponse(
|
||||||
html=content.tobytes().decode(encoding, errors='replace'),
|
html=content.tobytes().decode(encoding, errors='replace'),
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ from typing import Optional, Dict, Any
|
|||||||
from colorama import Fore, Style, init
|
from colorama import Fore, Style, init
|
||||||
import os
|
import os
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from urllib.parse import unquote
|
||||||
|
|
||||||
|
|
||||||
class LogLevel(Enum):
|
class LogLevel(Enum):
|
||||||
@@ -44,11 +45,11 @@ class AsyncLoggerBase(ABC):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50):
|
def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 100):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50):
|
def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 100):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
class AsyncLogger(AsyncLoggerBase):
|
class AsyncLogger(AsyncLoggerBase):
|
||||||
@@ -130,6 +131,14 @@ class AsyncLogger(AsyncLoggerBase):
|
|||||||
def _get_icon(self, tag: str) -> str:
|
def _get_icon(self, tag: str) -> str:
|
||||||
"""Get the icon for a tag, defaulting to info icon if not found."""
|
"""Get the icon for a tag, defaulting to info icon if not found."""
|
||||||
return self.icons.get(tag, self.icons["INFO"])
|
return self.icons.get(tag, self.icons["INFO"])
|
||||||
|
|
||||||
|
def _shorten(self, text, length, placeholder="..."):
|
||||||
|
"""Truncate text in the middle if longer than length, or pad if shorter."""
|
||||||
|
if len(text) <= length:
|
||||||
|
return text.ljust(length) # Pad with spaces to reach desired length
|
||||||
|
half = (length - len(placeholder)) // 2
|
||||||
|
shortened = text[:half] + placeholder + text[-half:]
|
||||||
|
return shortened.ljust(length) # Also pad shortened text to consistent length
|
||||||
|
|
||||||
def _write_to_file(self, message: str):
|
def _write_to_file(self, message: str):
|
||||||
"""Write a message to the log file if configured."""
|
"""Write a message to the log file if configured."""
|
||||||
@@ -259,7 +268,7 @@ class AsyncLogger(AsyncLoggerBase):
|
|||||||
success: bool,
|
success: bool,
|
||||||
timing: float,
|
timing: float,
|
||||||
tag: str = "FETCH",
|
tag: str = "FETCH",
|
||||||
url_length: int = 50,
|
url_length: int = 100,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Convenience method for logging URL fetch status.
|
Convenience method for logging URL fetch status.
|
||||||
@@ -271,14 +280,15 @@ class AsyncLogger(AsyncLoggerBase):
|
|||||||
tag: Tag for the message
|
tag: Tag for the message
|
||||||
url_length: Maximum length for URL in log
|
url_length: Maximum length for URL in log
|
||||||
"""
|
"""
|
||||||
|
decoded_url = unquote(url)
|
||||||
|
readable_url = self._shorten(decoded_url, url_length)
|
||||||
self._log(
|
self._log(
|
||||||
level=LogLevel.SUCCESS if success else LogLevel.ERROR,
|
level=LogLevel.SUCCESS if success else LogLevel.ERROR,
|
||||||
message="{url:.{url_length}}... | Status: {status} | Time: {timing:.2f}s",
|
message="{url} | {status} | ⏱: {timing:.2f}s",
|
||||||
tag=tag,
|
tag=tag,
|
||||||
params={
|
params={
|
||||||
"url": url,
|
"url": readable_url,
|
||||||
"url_length": url_length,
|
"status": "✓" if success else "✗",
|
||||||
"status": success,
|
|
||||||
"timing": timing,
|
"timing": timing,
|
||||||
},
|
},
|
||||||
colors={
|
colors={
|
||||||
@@ -299,11 +309,13 @@ class AsyncLogger(AsyncLoggerBase):
|
|||||||
tag: Tag for the message
|
tag: Tag for the message
|
||||||
url_length: Maximum length for URL in log
|
url_length: Maximum length for URL in log
|
||||||
"""
|
"""
|
||||||
|
decoded_url = unquote(url)
|
||||||
|
readable_url = self._shorten(decoded_url, url_length)
|
||||||
self._log(
|
self._log(
|
||||||
level=LogLevel.ERROR,
|
level=LogLevel.ERROR,
|
||||||
message="{url:.{url_length}}... | Error: {error}",
|
message="{url} | Error: {error}",
|
||||||
tag=tag,
|
tag=tag,
|
||||||
params={"url": url, "url_length": url_length, "error": error},
|
params={"url": readable_url, "error": error},
|
||||||
)
|
)
|
||||||
|
|
||||||
class AsyncFileLogger(AsyncLoggerBase):
|
class AsyncFileLogger(AsyncLoggerBase):
|
||||||
@@ -347,13 +359,13 @@ class AsyncFileLogger(AsyncLoggerBase):
|
|||||||
"""Log an error message to file."""
|
"""Log an error message to file."""
|
||||||
self._write_to_file("ERROR", message, tag)
|
self._write_to_file("ERROR", message, tag)
|
||||||
|
|
||||||
def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50):
|
def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 100):
|
||||||
"""Log URL fetch status to file."""
|
"""Log URL fetch status to file."""
|
||||||
status = "SUCCESS" if success else "FAILED"
|
status = "SUCCESS" if success else "FAILED"
|
||||||
message = f"{url[:url_length]}... | Status: {status} | Time: {timing:.2f}s"
|
message = f"{url[:url_length]}... | Status: {status} | Time: {timing:.2f}s"
|
||||||
self._write_to_file("URL_STATUS", message, tag)
|
self._write_to_file("URL_STATUS", message, tag)
|
||||||
|
|
||||||
def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50):
|
def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 100):
|
||||||
"""Log error status to file."""
|
"""Log error status to file."""
|
||||||
message = f"{url[:url_length]}... | Error: {error}"
|
message = f"{url[:url_length]}... | Error: {error}"
|
||||||
self._write_to_file("ERROR", message, tag)
|
self._write_to_file("ERROR", message, tag)
|
||||||
|
|||||||
@@ -358,10 +358,11 @@ class AsyncWebCrawler:
|
|||||||
html=html,
|
html=html,
|
||||||
extracted_content=extracted_content,
|
extracted_content=extracted_content,
|
||||||
config=config, # Pass the config object instead of individual parameters
|
config=config, # Pass the config object instead of individual parameters
|
||||||
screenshot=screenshot_data,
|
screenshot_data=screenshot_data,
|
||||||
pdf_data=pdf_data,
|
pdf_data=pdf_data,
|
||||||
verbose=config.verbose,
|
verbose=config.verbose,
|
||||||
is_raw_html=True if url.startswith("raw:") else False,
|
is_raw_html=True if url.startswith("raw:") else False,
|
||||||
|
redirected_url=async_response.redirected_url,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -380,18 +381,11 @@ class AsyncWebCrawler:
|
|||||||
crawl_result.session_id = getattr(
|
crawl_result.session_id = getattr(
|
||||||
config, "session_id", None)
|
config, "session_id", None)
|
||||||
|
|
||||||
self.logger.success(
|
self.logger.url_status(
|
||||||
message="{url:.50}... | Status: {status} | Total: {timing}",
|
url=cache_context.display_url,
|
||||||
|
success=crawl_result.success,
|
||||||
|
timing=time.perf_counter() - start_time,
|
||||||
tag="COMPLETE",
|
tag="COMPLETE",
|
||||||
params={
|
|
||||||
"url": cache_context.display_url,
|
|
||||||
"status": crawl_result.success,
|
|
||||||
"timing": f"{time.perf_counter() - start_time:.2f}s",
|
|
||||||
},
|
|
||||||
colors={
|
|
||||||
"status": Fore.GREEN if crawl_result.success else Fore.RED,
|
|
||||||
"timing": Fore.YELLOW,
|
|
||||||
},
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Update cache if appropriate
|
# Update cache if appropriate
|
||||||
@@ -401,17 +395,12 @@ class AsyncWebCrawler:
|
|||||||
return CrawlResultContainer(crawl_result)
|
return CrawlResultContainer(crawl_result)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
self.logger.success(
|
self.logger.url_status(
|
||||||
message="{url:.50}... | Status: {status} | Total: {timing}",
|
url=cache_context.display_url,
|
||||||
tag="COMPLETE",
|
success=True,
|
||||||
params={
|
timing=time.perf_counter() - start_time,
|
||||||
"url": cache_context.display_url,
|
tag="COMPLETE"
|
||||||
"status": True,
|
|
||||||
"timing": f"{time.perf_counter() - start_time:.2f}s",
|
|
||||||
},
|
|
||||||
colors={"status": Fore.GREEN, "timing": Fore.YELLOW},
|
|
||||||
)
|
)
|
||||||
|
|
||||||
cached_result.success = bool(html)
|
cached_result.success = bool(html)
|
||||||
cached_result.session_id = getattr(
|
cached_result.session_id = getattr(
|
||||||
config, "session_id", None)
|
config, "session_id", None)
|
||||||
@@ -446,7 +435,7 @@ class AsyncWebCrawler:
|
|||||||
html: str,
|
html: str,
|
||||||
extracted_content: str,
|
extracted_content: str,
|
||||||
config: CrawlerRunConfig,
|
config: CrawlerRunConfig,
|
||||||
screenshot: str,
|
screenshot_data: str,
|
||||||
pdf_data: str,
|
pdf_data: str,
|
||||||
verbose: bool,
|
verbose: bool,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
@@ -459,7 +448,7 @@ class AsyncWebCrawler:
|
|||||||
html: Raw HTML content
|
html: Raw HTML content
|
||||||
extracted_content: Previously extracted content (if any)
|
extracted_content: Previously extracted content (if any)
|
||||||
config: Configuration object controlling processing behavior
|
config: Configuration object controlling processing behavior
|
||||||
screenshot: Screenshot data (if any)
|
screenshot_data: Screenshot data (if any)
|
||||||
pdf_data: PDF data (if any)
|
pdf_data: PDF data (if any)
|
||||||
verbose: Whether to enable verbose logging
|
verbose: Whether to enable verbose logging
|
||||||
**kwargs: Additional parameters for backwards compatibility
|
**kwargs: Additional parameters for backwards compatibility
|
||||||
@@ -564,20 +553,23 @@ class AsyncWebCrawler:
|
|||||||
markdown_result: MarkdownGenerationResult = (
|
markdown_result: MarkdownGenerationResult = (
|
||||||
markdown_generator.generate_markdown(
|
markdown_generator.generate_markdown(
|
||||||
input_html=markdown_input_html,
|
input_html=markdown_input_html,
|
||||||
base_url=url,
|
base_url=params.get("redirected_url", url)
|
||||||
# html2text_options=kwargs.get('html2text', {})
|
# html2text_options=kwargs.get('html2text', {})
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
# Log processing completion
|
# Log processing completion
|
||||||
self.logger.info(
|
self.logger.url_status(
|
||||||
message="{url:.50}... | Time: {timing}s",
|
url=_url,
|
||||||
tag="SCRAPE",
|
success=True,
|
||||||
params={
|
timing=int((time.perf_counter() - t1) * 1000) / 1000,
|
||||||
"url": _url,
|
tag="SCRAPE"
|
||||||
"timing": int((time.perf_counter() - t1) * 1000) / 1000,
|
|
||||||
},
|
|
||||||
)
|
)
|
||||||
|
# self.logger.info(
|
||||||
|
# message="{url:.50}... | Time: {timing}s",
|
||||||
|
# tag="SCRAPE",
|
||||||
|
# params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000) / 1000},
|
||||||
|
# )
|
||||||
|
|
||||||
################################
|
################################
|
||||||
# Structured Content Extraction #
|
# Structured Content Extraction #
|
||||||
@@ -624,10 +616,6 @@ class AsyncWebCrawler:
|
|||||||
params={"url": _url, "timing": time.perf_counter() - t1},
|
params={"url": _url, "timing": time.perf_counter() - t1},
|
||||||
)
|
)
|
||||||
|
|
||||||
# Handle screenshot and PDF data
|
|
||||||
screenshot_data = None if not screenshot else screenshot
|
|
||||||
pdf_data = None if not pdf_data else pdf_data
|
|
||||||
|
|
||||||
# Apply HTML formatting if requested
|
# Apply HTML formatting if requested
|
||||||
if config.prettiify:
|
if config.prettiify:
|
||||||
cleaned_html = fast_format_html(cleaned_html)
|
cleaned_html = fast_format_html(cleaned_html)
|
||||||
|
|||||||
@@ -28,6 +28,7 @@ from lxml import etree
|
|||||||
from lxml import html as lhtml
|
from lxml import html as lhtml
|
||||||
from typing import List
|
from typing import List
|
||||||
from .models import ScrapingResult, MediaItem, Link, Media, Links
|
from .models import ScrapingResult, MediaItem, Link, Media, Links
|
||||||
|
import copy
|
||||||
|
|
||||||
# Pre-compile regular expressions for Open Graph and Twitter metadata
|
# Pre-compile regular expressions for Open Graph and Twitter metadata
|
||||||
OG_REGEX = re.compile(r"^og:")
|
OG_REGEX = re.compile(r"^og:")
|
||||||
@@ -48,7 +49,7 @@ def parse_srcset(s: str) -> List[Dict]:
|
|||||||
if len(parts) >= 1:
|
if len(parts) >= 1:
|
||||||
url = parts[0]
|
url = parts[0]
|
||||||
width = (
|
width = (
|
||||||
parts[1].rstrip("w")
|
parts[1].rstrip("w").split('.')[0]
|
||||||
if len(parts) > 1 and parts[1].endswith("w")
|
if len(parts) > 1 and parts[1].endswith("w")
|
||||||
else None
|
else None
|
||||||
)
|
)
|
||||||
@@ -128,7 +129,8 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
|||||||
Returns:
|
Returns:
|
||||||
ScrapingResult: A structured result containing the scraped content.
|
ScrapingResult: A structured result containing the scraped content.
|
||||||
"""
|
"""
|
||||||
raw_result = self._scrap(url, html, is_async=False, **kwargs)
|
actual_url = kwargs.get("redirected_url", url)
|
||||||
|
raw_result = self._scrap(actual_url, html, is_async=False, **kwargs)
|
||||||
if raw_result is None:
|
if raw_result is None:
|
||||||
return ScrapingResult(
|
return ScrapingResult(
|
||||||
cleaned_html="",
|
cleaned_html="",
|
||||||
@@ -619,6 +621,9 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
keep_element = False
|
keep_element = False
|
||||||
|
# Special case for table elements - always preserve structure
|
||||||
|
if element.name in ["tr", "td", "th"]:
|
||||||
|
keep_element = True
|
||||||
|
|
||||||
exclude_domains = kwargs.get("exclude_domains", [])
|
exclude_domains = kwargs.get("exclude_domains", [])
|
||||||
# exclude_social_media_domains = kwargs.get('exclude_social_media_domains', set(SOCIAL_MEDIA_DOMAINS))
|
# exclude_social_media_domains = kwargs.get('exclude_social_media_domains', set(SOCIAL_MEDIA_DOMAINS))
|
||||||
@@ -859,6 +864,8 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
|||||||
parser_type = kwargs.get("parser", "lxml")
|
parser_type = kwargs.get("parser", "lxml")
|
||||||
soup = BeautifulSoup(html, parser_type)
|
soup = BeautifulSoup(html, parser_type)
|
||||||
body = soup.body
|
body = soup.body
|
||||||
|
if body is None:
|
||||||
|
raise Exception("'<body>' tag is not found in fetched html. Consider adding wait_for=\"css:body\" to wait for body tag to be loaded into DOM.")
|
||||||
base_domain = get_base_domain(url)
|
base_domain = get_base_domain(url)
|
||||||
|
|
||||||
# Early removal of all images if exclude_all_images is set
|
# Early removal of all images if exclude_all_images is set
|
||||||
@@ -897,23 +904,6 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
|||||||
for element in body.select(excluded_selector):
|
for element in body.select(excluded_selector):
|
||||||
element.extract()
|
element.extract()
|
||||||
|
|
||||||
# if False and css_selector:
|
|
||||||
# selected_elements = body.select(css_selector)
|
|
||||||
# if not selected_elements:
|
|
||||||
# return {
|
|
||||||
# "markdown": "",
|
|
||||||
# "cleaned_html": "",
|
|
||||||
# "success": True,
|
|
||||||
# "media": {"images": [], "videos": [], "audios": []},
|
|
||||||
# "links": {"internal": [], "external": []},
|
|
||||||
# "metadata": {},
|
|
||||||
# "message": f"No elements found for CSS selector: {css_selector}",
|
|
||||||
# }
|
|
||||||
# # raise InvalidCSSSelectorError(f"Invalid CSS selector, No elements found for CSS selector: {css_selector}")
|
|
||||||
# body = soup.new_tag("div")
|
|
||||||
# for el in selected_elements:
|
|
||||||
# body.append(el)
|
|
||||||
|
|
||||||
content_element = None
|
content_element = None
|
||||||
if target_elements:
|
if target_elements:
|
||||||
try:
|
try:
|
||||||
@@ -922,12 +912,12 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
|||||||
for_content_targeted_element.extend(body.select(target_element))
|
for_content_targeted_element.extend(body.select(target_element))
|
||||||
content_element = soup.new_tag("div")
|
content_element = soup.new_tag("div")
|
||||||
for el in for_content_targeted_element:
|
for el in for_content_targeted_element:
|
||||||
content_element.append(el)
|
content_element.append(copy.deepcopy(el))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE")
|
self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE")
|
||||||
return None
|
return None
|
||||||
else:
|
else:
|
||||||
content_element = body
|
content_element = body
|
||||||
|
|
||||||
kwargs["exclude_social_media_domains"] = set(
|
kwargs["exclude_social_media_domains"] = set(
|
||||||
kwargs.get("exclude_social_media_domains", []) + SOCIAL_MEDIA_DOMAINS
|
kwargs.get("exclude_social_media_domains", []) + SOCIAL_MEDIA_DOMAINS
|
||||||
@@ -1308,6 +1298,9 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
|||||||
"source",
|
"source",
|
||||||
"track",
|
"track",
|
||||||
"wbr",
|
"wbr",
|
||||||
|
"tr",
|
||||||
|
"td",
|
||||||
|
"th",
|
||||||
}
|
}
|
||||||
|
|
||||||
for el in reversed(list(root.iterdescendants())):
|
for el in reversed(list(root.iterdescendants())):
|
||||||
@@ -1540,26 +1533,6 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
|||||||
self._log("error", f"Error extracting metadata: {str(e)}", "SCRAPE")
|
self._log("error", f"Error extracting metadata: {str(e)}", "SCRAPE")
|
||||||
meta = {}
|
meta = {}
|
||||||
|
|
||||||
# Handle CSS selector targeting
|
|
||||||
# if css_selector:
|
|
||||||
# try:
|
|
||||||
# selected_elements = body.cssselect(css_selector)
|
|
||||||
# if not selected_elements:
|
|
||||||
# return {
|
|
||||||
# "markdown": "",
|
|
||||||
# "cleaned_html": "",
|
|
||||||
# "success": True,
|
|
||||||
# "media": {"images": [], "videos": [], "audios": []},
|
|
||||||
# "links": {"internal": [], "external": []},
|
|
||||||
# "metadata": meta,
|
|
||||||
# "message": f"No elements found for CSS selector: {css_selector}",
|
|
||||||
# }
|
|
||||||
# body = lhtml.Element("div")
|
|
||||||
# body.extend(selected_elements)
|
|
||||||
# except Exception as e:
|
|
||||||
# self._log("error", f"Error with CSS selector: {str(e)}", "SCRAPE")
|
|
||||||
# return None
|
|
||||||
|
|
||||||
content_element = None
|
content_element = None
|
||||||
if target_elements:
|
if target_elements:
|
||||||
try:
|
try:
|
||||||
@@ -1567,7 +1540,7 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
|||||||
for target_element in target_elements:
|
for target_element in target_elements:
|
||||||
for_content_targeted_element.extend(body.cssselect(target_element))
|
for_content_targeted_element.extend(body.cssselect(target_element))
|
||||||
content_element = lhtml.Element("div")
|
content_element = lhtml.Element("div")
|
||||||
content_element.extend(for_content_targeted_element)
|
content_element.extend(copy.deepcopy(for_content_targeted_element))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE")
|
self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE")
|
||||||
return None
|
return None
|
||||||
@@ -1636,7 +1609,7 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
|||||||
# Remove empty elements
|
# Remove empty elements
|
||||||
self.remove_empty_elements_fast(body, 1)
|
self.remove_empty_elements_fast(body, 1)
|
||||||
|
|
||||||
# Remvoe unneeded attributes
|
# Remove unneeded attributes
|
||||||
self.remove_unwanted_attributes_fast(
|
self.remove_unwanted_attributes_fast(
|
||||||
body, keep_data_attributes=kwargs.get("keep_data_attributes", False)
|
body, keep_data_attributes=kwargs.get("keep_data_attributes", False)
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ from .scorers import URLScorer
|
|||||||
from . import DeepCrawlStrategy
|
from . import DeepCrawlStrategy
|
||||||
|
|
||||||
from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult, RunManyReturn
|
from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult, RunManyReturn
|
||||||
|
from ..utils import normalize_url_for_deep_crawl
|
||||||
|
|
||||||
from math import inf as infinity
|
from math import inf as infinity
|
||||||
|
|
||||||
@@ -106,13 +107,14 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
|
|||||||
valid_links = []
|
valid_links = []
|
||||||
for link in links:
|
for link in links:
|
||||||
url = link.get("href")
|
url = link.get("href")
|
||||||
if url in visited:
|
base_url = normalize_url_for_deep_crawl(url, source_url)
|
||||||
|
if base_url in visited:
|
||||||
continue
|
continue
|
||||||
if not await self.can_process_url(url, new_depth):
|
if not await self.can_process_url(url, new_depth):
|
||||||
self.stats.urls_skipped += 1
|
self.stats.urls_skipped += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
valid_links.append(url)
|
valid_links.append(base_url)
|
||||||
|
|
||||||
# If we have more valid links than capacity, limit them
|
# If we have more valid links than capacity, limit them
|
||||||
if len(valid_links) > remaining_capacity:
|
if len(valid_links) > remaining_capacity:
|
||||||
|
|||||||
@@ -117,7 +117,8 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
|||||||
self.logger.debug(f"URL {url} skipped: score {score} below threshold {self.score_threshold}")
|
self.logger.debug(f"URL {url} skipped: score {score} below threshold {self.score_threshold}")
|
||||||
self.stats.urls_skipped += 1
|
self.stats.urls_skipped += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
visited.add(base_url)
|
||||||
valid_links.append((base_url, score))
|
valid_links.append((base_url, score))
|
||||||
|
|
||||||
# If we have more valid links than capacity, sort by score and take the top ones
|
# If we have more valid links than capacity, sort by score and take the top ones
|
||||||
@@ -158,7 +159,6 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
|||||||
while current_level and not self._cancel_event.is_set():
|
while current_level and not self._cancel_event.is_set():
|
||||||
next_level: List[Tuple[str, Optional[str]]] = []
|
next_level: List[Tuple[str, Optional[str]]] = []
|
||||||
urls = [url for url, _ in current_level]
|
urls = [url for url, _ in current_level]
|
||||||
visited.update(urls)
|
|
||||||
|
|
||||||
# Clone the config to disable deep crawling recursion and enforce batch mode.
|
# Clone the config to disable deep crawling recursion and enforce batch mode.
|
||||||
batch_config = config.clone(deep_crawl_strategy=None, stream=False)
|
batch_config = config.clone(deep_crawl_strategy=None, stream=False)
|
||||||
|
|||||||
@@ -115,5 +115,6 @@ async () => {
|
|||||||
document.body.style.overflow = "auto";
|
document.body.style.overflow = "auto";
|
||||||
|
|
||||||
// Wait a bit for any animations to complete
|
// Wait a bit for any animations to complete
|
||||||
await new Promise((resolve) => setTimeout(resolve, 100));
|
document.body.scrollIntoView(false);
|
||||||
|
await new Promise((resolve) => setTimeout(resolve, 50));
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -2003,6 +2003,10 @@ def normalize_url(href, base_url):
|
|||||||
if not parsed_base.scheme or not parsed_base.netloc:
|
if not parsed_base.scheme or not parsed_base.netloc:
|
||||||
raise ValueError(f"Invalid base URL format: {base_url}")
|
raise ValueError(f"Invalid base URL format: {base_url}")
|
||||||
|
|
||||||
|
# Ensure base_url ends with a trailing slash if it's a directory path
|
||||||
|
if not base_url.endswith('/'):
|
||||||
|
base_url = base_url + '/'
|
||||||
|
|
||||||
# Use urljoin to handle all cases
|
# Use urljoin to handle all cases
|
||||||
normalized = urljoin(base_url, href.strip())
|
normalized = urljoin(base_url, href.strip())
|
||||||
return normalized
|
return normalized
|
||||||
@@ -2047,7 +2051,7 @@ def normalize_url_for_deep_crawl(href, base_url):
|
|||||||
normalized = urlunparse((
|
normalized = urlunparse((
|
||||||
parsed.scheme,
|
parsed.scheme,
|
||||||
netloc,
|
netloc,
|
||||||
parsed.path.rstrip('/') or '/', # Normalize trailing slash
|
parsed.path.rstrip('/'), # Normalize trailing slash
|
||||||
parsed.params,
|
parsed.params,
|
||||||
query,
|
query,
|
||||||
fragment
|
fragment
|
||||||
@@ -2075,7 +2079,7 @@ def efficient_normalize_url_for_deep_crawl(href, base_url):
|
|||||||
normalized = urlunparse((
|
normalized = urlunparse((
|
||||||
parsed.scheme,
|
parsed.scheme,
|
||||||
parsed.netloc.lower(),
|
parsed.netloc.lower(),
|
||||||
parsed.path,
|
parsed.path.rstrip('/'),
|
||||||
parsed.params,
|
parsed.params,
|
||||||
parsed.query,
|
parsed.query,
|
||||||
'' # Remove fragment
|
'' # Remove fragment
|
||||||
|
|||||||
@@ -60,6 +60,8 @@ async def handle_llm_qa(
|
|||||||
) -> str:
|
) -> str:
|
||||||
"""Process QA using LLM with crawled content as context."""
|
"""Process QA using LLM with crawled content as context."""
|
||||||
try:
|
try:
|
||||||
|
if not url.startswith(('http://', 'https://')):
|
||||||
|
url = 'https://' + url
|
||||||
# Extract base URL by finding last '?q=' occurrence
|
# Extract base URL by finding last '?q=' occurrence
|
||||||
last_q_index = url.rfind('?q=')
|
last_q_index = url.rfind('?q=')
|
||||||
if last_q_index != -1:
|
if last_q_index != -1:
|
||||||
@@ -73,7 +75,7 @@ async def handle_llm_qa(
|
|||||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||||
detail=result.error_message
|
detail=result.error_message
|
||||||
)
|
)
|
||||||
content = result.markdown.fit_markdown
|
content = result.markdown.fit_markdown or result.markdown.raw_markdown
|
||||||
|
|
||||||
# Create prompt and get LLM response
|
# Create prompt and get LLM response
|
||||||
prompt = f"""Use the following content as context to answer the question.
|
prompt = f"""Use the following content as context to answer the question.
|
||||||
@@ -397,6 +399,7 @@ async def handle_crawl_request(
|
|||||||
peak_mem_mb = start_mem_mb
|
peak_mem_mb = start_mem_mb
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
urls = [('https://' + url) if not url.startswith(('http://', 'https://')) else url for url in urls]
|
||||||
browser_config = BrowserConfig.load(browser_config)
|
browser_config = BrowserConfig.load(browser_config)
|
||||||
crawler_config = CrawlerRunConfig.load(crawler_config)
|
crawler_config = CrawlerRunConfig.load(crawler_config)
|
||||||
|
|
||||||
|
|||||||
@@ -432,7 +432,7 @@ async def execute_js(
|
|||||||
async def llm_endpoint(
|
async def llm_endpoint(
|
||||||
request: Request,
|
request: Request,
|
||||||
url: str = Path(...),
|
url: str = Path(...),
|
||||||
q: Optional[str] = Query(None),
|
q: str = Query(...),
|
||||||
_td: Dict = Depends(token_dep),
|
_td: Dict = Depends(token_dep),
|
||||||
):
|
):
|
||||||
if not q:
|
if not q:
|
||||||
|
|||||||
@@ -12,9 +12,10 @@ We’ve introduced a new feature that effortlessly handles even the biggest page
|
|||||||
|
|
||||||
**Simple Example:**
|
**Simple Example:**
|
||||||
```python
|
```python
|
||||||
import os, sys
|
import os
|
||||||
|
import sys
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, CacheMode
|
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
|
||||||
|
|
||||||
# Adjust paths as needed
|
# Adjust paths as needed
|
||||||
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||||
@@ -26,9 +27,11 @@ async def main():
|
|||||||
# Request both PDF and screenshot
|
# Request both PDF and screenshot
|
||||||
result = await crawler.arun(
|
result = await crawler.arun(
|
||||||
url='https://en.wikipedia.org/wiki/List_of_common_misconceptions',
|
url='https://en.wikipedia.org/wiki/List_of_common_misconceptions',
|
||||||
cache_mode=CacheMode.BYPASS,
|
config=CrawlerRunConfig(
|
||||||
pdf=True,
|
cache_mode=CacheMode.BYPASS,
|
||||||
screenshot=True
|
pdf=True,
|
||||||
|
screenshot=True
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
if result.success:
|
if result.success:
|
||||||
@@ -40,9 +43,8 @@ async def main():
|
|||||||
|
|
||||||
# Save PDF
|
# Save PDF
|
||||||
if result.pdf:
|
if result.pdf:
|
||||||
pdf_bytes = b64decode(result.pdf)
|
|
||||||
with open(os.path.join(__location__, "page.pdf"), "wb") as f:
|
with open(os.path.join(__location__, "page.pdf"), "wb") as f:
|
||||||
f.write(pdf_bytes)
|
f.write(result.pdf)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
asyncio.run(main())
|
asyncio.run(main())
|
||||||
|
|||||||
@@ -232,6 +232,7 @@ async def main():
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
asyncio.run(main())
|
asyncio.run(main())
|
||||||
|
```
|
||||||
|
|
||||||
## 2.4 Compliance & Ethics
|
## 2.4 Compliance & Ethics
|
||||||
|
|
||||||
|
|||||||
@@ -36,8 +36,6 @@ class BrowserConfig:
|
|||||||
|
|
||||||
### Key Fields to Note
|
### Key Fields to Note
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
1. **`browser_type`**
|
1. **`browser_type`**
|
||||||
- Options: `"chromium"`, `"firefox"`, or `"webkit"`.
|
- Options: `"chromium"`, `"firefox"`, or `"webkit"`.
|
||||||
- Defaults to `"chromium"`.
|
- Defaults to `"chromium"`.
|
||||||
@@ -215,6 +213,7 @@ class CrawlerRunConfig:
|
|||||||
- The display mode for progress information (`DETAILED`, `BRIEF`, etc.).
|
- The display mode for progress information (`DETAILED`, `BRIEF`, etc.).
|
||||||
- Affects how much information is printed during the crawl.
|
- Affects how much information is printed during the crawl.
|
||||||
|
|
||||||
|
|
||||||
### Helper Methods
|
### Helper Methods
|
||||||
|
|
||||||
The `clone()` method is particularly useful for creating variations of your crawler configuration:
|
The `clone()` method is particularly useful for creating variations of your crawler configuration:
|
||||||
@@ -248,9 +247,6 @@ The `clone()` method:
|
|||||||
---
|
---
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## 3. LLMConfig Essentials
|
## 3. LLMConfig Essentials
|
||||||
|
|
||||||
### Key fields to note
|
### Key fields to note
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
In some cases, you need to extract **complex or unstructured** information from a webpage that a simple CSS/XPath schema cannot easily parse. Or you want **AI**-driven insights, classification, or summarization. For these scenarios, Crawl4AI provides an **LLM-based extraction strategy** that:
|
In some cases, you need to extract **complex or unstructured** information from a webpage that a simple CSS/XPath schema cannot easily parse. Or you want **AI**-driven insights, classification, or summarization. For these scenarios, Crawl4AI provides an **LLM-based extraction strategy** that:
|
||||||
|
|
||||||
1. Works with **any** large language model supported by [LightLLM](https://github.com/LightLLM) (Ollama, OpenAI, Claude, and more).
|
1. Works with **any** large language model supported by [LiteLLM](https://github.com/BerriAI/litellm) (Ollama, OpenAI, Claude, and more).
|
||||||
2. Automatically splits content into chunks (if desired) to handle token limits, then combines results.
|
2. Automatically splits content into chunks (if desired) to handle token limits, then combines results.
|
||||||
3. Lets you define a **schema** (like a Pydantic model) or a simpler “block” extraction approach.
|
3. Lets you define a **schema** (like a Pydantic model) or a simpler “block” extraction approach.
|
||||||
|
|
||||||
@@ -18,13 +18,19 @@ In some cases, you need to extract **complex or unstructured** information from
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 2. Provider-Agnostic via LightLLM
|
## 2. Provider-Agnostic via LiteLLM
|
||||||
|
|
||||||
Crawl4AI uses a “provider string” (e.g., `"openai/gpt-4o"`, `"ollama/llama2.0"`, `"aws/titan"`) to identify your LLM. **Any** model that LightLLM supports is fair game. You just provide:
|
You can use `LlmConfig` to quickly configure multiple variations of LLMs and experiment with them to find the optimal one for your use case. You can read more about `LlmConfig` [here](/api/parameters).
|
||||||
|
|
||||||
|
```python
|
||||||
|
llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
|
||||||
|
```
|
||||||
|
|
||||||
|
Crawl4AI uses a “provider string” (e.g., `"openai/gpt-4o"`, `"ollama/llama2.0"`, `"aws/titan"`) to identify your LLM. **Any** model that LiteLLM supports is fair game. You just provide:
|
||||||
|
|
||||||
- **`provider`**: The `<provider>/<model_name>` identifier (e.g., `"openai/gpt-4"`, `"ollama/llama2"`, `"huggingface/google-flan"`, etc.).
|
- **`provider`**: The `<provider>/<model_name>` identifier (e.g., `"openai/gpt-4"`, `"ollama/llama2"`, `"huggingface/google-flan"`, etc.).
|
||||||
- **`api_token`**: If needed (for OpenAI, HuggingFace, etc.); local models or Ollama might not require it.
|
- **`api_token`**: If needed (for OpenAI, HuggingFace, etc.); local models or Ollama might not require it.
|
||||||
- **`api_base`** (optional): If your provider has a custom endpoint.
|
- **`base_url`** (optional): If your provider has a custom endpoint.
|
||||||
|
|
||||||
This means you **aren’t locked** into a single LLM vendor. Switch or experiment easily.
|
This means you **aren’t locked** into a single LLM vendor. Switch or experiment easily.
|
||||||
|
|
||||||
@@ -52,20 +58,19 @@ For structured data, `"schema"` is recommended. You provide `schema=YourPydantic
|
|||||||
|
|
||||||
Below is an overview of important LLM extraction parameters. All are typically set inside `LLMExtractionStrategy(...)`. You then put that strategy in your `CrawlerRunConfig(..., extraction_strategy=...)`.
|
Below is an overview of important LLM extraction parameters. All are typically set inside `LLMExtractionStrategy(...)`. You then put that strategy in your `CrawlerRunConfig(..., extraction_strategy=...)`.
|
||||||
|
|
||||||
1. **`provider`** (str): e.g., `"openai/gpt-4"`, `"ollama/llama2"`.
|
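1. **`llmConfig`** (LlmConfig): The LLM configuration object carrying the provider string and API token, e.g., `LlmConfig(provider="openai/gpt-4", api_token=os.getenv("OPENAI_API_KEY"))`.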
|
||||||
2. **`api_token`** (str): The API key or token for that model. May not be needed for local models.
|
2. **`schema`** (dict): A JSON schema describing the fields you want. Usually generated by `YourModel.model_json_schema()`.
|
||||||
3. **`schema`** (dict): A JSON schema describing the fields you want. Usually generated by `YourModel.model_json_schema()`.
|
3. **`extraction_type`** (str): `"schema"` or `"block"`.
|
||||||
4. **`extraction_type`** (str): `"schema"` or `"block"`.
|
4. **`instruction`** (str): Prompt text telling the LLM what you want extracted. E.g., “Extract these fields as a JSON array.”
|
||||||
5. **`instruction`** (str): Prompt text telling the LLM what you want extracted. E.g., “Extract these fields as a JSON array.”
|
5. **`chunk_token_threshold`** (int): Maximum tokens per chunk. If your content is huge, you can break it up for the LLM.
|
||||||
6. **`chunk_token_threshold`** (int): Maximum tokens per chunk. If your content is huge, you can break it up for the LLM.
|
6. **`overlap_rate`** (float): Overlap ratio between adjacent chunks. E.g., `0.1` means 10% of each chunk is repeated to preserve context continuity.
|
||||||
7. **`overlap_rate`** (float): Overlap ratio between adjacent chunks. E.g., `0.1` means 10% of each chunk is repeated to preserve context continuity.
|
7. **`apply_chunking`** (bool): Set `True` to chunk automatically. If you want a single pass, set `False`.
|
||||||
8. **`apply_chunking`** (bool): Set `True` to chunk automatically. If you want a single pass, set `False`.
|
8. **`input_format`** (str): Determines **which** crawler result is passed to the LLM. Options include:
|
||||||
9. **`input_format`** (str): Determines **which** crawler result is passed to the LLM. Options include:
|
|
||||||
- `"markdown"`: The raw markdown (default).
|
- `"markdown"`: The raw markdown (default).
|
||||||
- `"fit_markdown"`: The filtered “fit” markdown if you used a content filter.
|
- `"fit_markdown"`: The filtered “fit” markdown if you used a content filter.
|
||||||
- `"html"`: The cleaned or raw HTML.
|
- `"html"`: The cleaned or raw HTML.
|
||||||
10. **`extra_args`** (dict): Additional LLM parameters like `temperature`, `max_tokens`, `top_p`, etc.
|
9. **`extra_args`** (dict): Additional LLM parameters like `temperature`, `max_tokens`, `top_p`, etc.
|
||||||
11. **`show_usage()`**: A method you can call to print out usage info (token usage per chunk, total cost if known).
|
10. **`show_usage()`**: A method you can call to print out usage info (token usage per chunk, total cost if known).
|
||||||
|
|
||||||
**Example**:
|
**Example**:
|
||||||
|
|
||||||
@@ -233,8 +238,7 @@ class KnowledgeGraph(BaseModel):
|
|||||||
async def main():
|
async def main():
|
||||||
# LLM extraction strategy
|
# LLM extraction strategy
|
||||||
llm_strat = LLMExtractionStrategy(
|
llm_strat = LLMExtractionStrategy(
|
||||||
provider="openai/gpt-4",
|
llmConfig = LlmConfig(provider="openai/gpt-4", api_token=os.getenv('OPENAI_API_KEY')),
|
||||||
api_token=os.getenv('OPENAI_API_KEY'),
|
|
||||||
schema=KnowledgeGraph.schema_json(),
|
schema=KnowledgeGraph.schema_json(),
|
||||||
extraction_type="schema",
|
extraction_type="schema",
|
||||||
instruction="Extract entities and relationships from the content. Return valid JSON.",
|
instruction="Extract entities and relationships from the content. Return valid JSON.",
|
||||||
@@ -286,7 +290,7 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
## 11. Conclusion
|
## 11. Conclusion
|
||||||
|
|
||||||
**LLM-based extraction** in Crawl4AI is **provider-agnostic**, letting you choose from hundreds of models via LightLLM. It’s perfect for **semantically complex** tasks or generating advanced structures like knowledge graphs. However, it’s **slower** and potentially costlier than schema-based approaches. Keep these tips in mind:
|
**LLM-based extraction** in Crawl4AI is **provider-agnostic**, letting you choose from hundreds of models via LiteLLM. It’s perfect for **semantically complex** tasks or generating advanced structures like knowledge graphs. However, it’s **slower** and potentially costlier than schema-based approaches. Keep these tips in mind:
|
||||||
|
|
||||||
- Put your LLM strategy **in `CrawlerRunConfig`**.
|
- Put your LLM strategy **in `CrawlerRunConfig`**.
|
||||||
- Use **`input_format`** to pick which form (markdown, HTML, fit_markdown) the LLM sees.
|
- Use **`input_format`** to pick which form (markdown, HTML, fit_markdown) the LLM sees.
|
||||||
@@ -317,4 +321,4 @@ If your site’s data is consistent or repetitive, consider [`JsonCssExtractionS
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
That’s it for **Extracting JSON (LLM)**—now you can harness AI to parse, classify, or reorganize data on the web. Happy crawling!
|
That’s it for **Extracting JSON (LLM)**—now you can harness AI to parse, classify, or reorganize data on the web. Happy crawling!
|
||||||
|
|||||||
@@ -40,8 +40,9 @@ dependencies = [
|
|||||||
"fake-useragent>=2.0.3",
|
"fake-useragent>=2.0.3",
|
||||||
"click>=8.1.7",
|
"click>=8.1.7",
|
||||||
"pyperclip>=1.8.2",
|
"pyperclip>=1.8.2",
|
||||||
"faust-cchardet>=2.1.19",
|
"chardet>=5.2.0",
|
||||||
"aiohttp>=3.11.11",
|
"aiohttp>=3.11.11",
|
||||||
|
"brotli>=1.1.0",
|
||||||
"humanize>=4.10.0",
|
"humanize>=4.10.0",
|
||||||
]
|
]
|
||||||
classifiers = [
|
classifiers = [
|
||||||
|
|||||||
@@ -21,4 +21,5 @@ psutil>=6.1.1
|
|||||||
nltk>=3.9.1
|
nltk>=3.9.1
|
||||||
rich>=13.9.4
|
rich>=13.9.4
|
||||||
cssselect>=1.2.0
|
cssselect>=1.2.0
|
||||||
faust-cchardet>=2.1.19
|
chardet>=5.2.0
|
||||||
|
brotli>=1.1.0
|
||||||
Reference in New Issue
Block a user